diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..24b0494a9f15e592e3a494a41e71a1204821d026 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,33 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+diffusers/src/diffusers/models/__pycache__/attention_processor.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+diffusers/src/diffusers/models/unets/__pycache__/unet_2d_blocks.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+diffusers/src/diffusers/pipelines/__pycache__/pipeline_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+examples/line/example0/input.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example0/reference_image_0.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example0/reference_image_1.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example0/reference_image_2.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example1/input.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example1/reference_image_0.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example2/input.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example2/reference_image_0.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example2/reference_image_1.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example2/reference_image_2.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example2/reference_image_3.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example3/input.png filter=lfs diff=lfs merge=lfs -text
+examples/line/example3/reference_image_0.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example0/input.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example0/reference_image_0.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example0/reference_image_1.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example0/reference_image_2.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example0/reference_image_3.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example1/input.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example1/reference_image_0.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example1/reference_image_1.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example1/reference_image_2.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example1/reference_image_3.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example1/reference_image_4.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example1/reference_image_5.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example2/input.png filter=lfs diff=lfs merge=lfs -text
+examples/shadow/example2/reference_image_0.png filter=lfs diff=lfs merge=lfs -text
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..cedd8bd95b357ea19aad88212fe327a3d85e6b09
--- /dev/null
+++ b/app.py
@@ -0,0 +1,699 @@
+import contextlib
+import gc
+import json
+import logging
+import math
+import os
+import random
+import shutil
+import sys
+import time
+import itertools
+import copy
+from pathlib import Path
+
+import cv2
+import numpy as np
+from PIL import Image, ImageDraw
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+
+import accelerate
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from safetensors.torch import load_model
+from peft import LoraConfig
+import gradio as gr
+import pandas as pd
+
+import transformers
+from transformers import (
+    AutoTokenizer,
+    PretrainedConfig,
+    CLIPVisionModelWithProjection,
+    CLIPImageProcessor,
+    CLIPProcessor,
+)
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    PixArtTransformer2DModel,
+    CausalSparseDiTModel,
+    CausalSparseDiTControlModel,
+    CobraPixArtAlphaPipeline,
+    UniPCMultistepScheduler,
+)
+from cobra_utils.utils import *
+
+from huggingface_hub import snapshot_download
+
+# model_global_path = snapshot_download(repo_id="JunhaoZhuang/Cobra", cache_dir='./Cobra/', repo_type="model")
+model_global_path = './Cobra/models--JunhaoZhuang--Cobra/snapshots/e87145ca2cb71423753905ff10863ebba23ffc19'
+print(model_global_path)
+examples = [
+    [
+        "./examples/shadow/example0/input.png", 
+        ["./examples/shadow/example0/reference_image_0.png", 
+        "./examples/shadow/example0/reference_image_1.png", 
+        "./examples/shadow/example0/reference_image_2.png",
+        "./examples/shadow/example0/reference_image_3.png"], 
+        "line + shadow", # style
+        1, # seed
+        10, # step
+        20, # top k
+    ],
+    [
+        "./examples/shadow/example1/input.png", 
+        ["./examples/shadow/example1/reference_image_0.png", 
+        "./examples/shadow/example1/reference_image_1.png", 
+        "./examples/shadow/example1/reference_image_2.png",
+        "./examples/shadow/example1/reference_image_3.png",
+        "./examples/shadow/example1/reference_image_4.png",
+        "./examples/shadow/example1/reference_image_5.png"], 
+        "line + shadow", # style
+        1, # seed
+        10, # step
+        20, # top k
+    ],
+    [
+        "./examples/shadow/example2/input.png", 
+        ["./examples/shadow/example2/reference_image_0.png"], 
+        "line + shadow", # style
+        1, # seed
+        10, # step
+        3, # top k
+    ],
+    [
+        "./examples/line/example2/input.png", 
+        ["./examples/line/example2/reference_image_0.png", 
+        "./examples/line/example2/reference_image_1.png", 
+        "./examples/line/example2/reference_image_2.png",
+        "./examples/line/example2/reference_image_3.png"], 
+        "line", # style
+        1, # seed
+        10, # step
+        20, # top k
+    ],
+    [
+        "./examples/line/example0/input.png", 
+        ["./examples/line/example0/reference_image_0.png", 
+        "./examples/line/example0/reference_image_1.png", 
+        "./examples/line/example0/reference_image_2.png"],
+        "line", # style
+        0, # seed
+        10, # step
+        6, # top k
+    ],
+    [
+        "./examples/line/example1/input.png", 
+        ["./examples/line/example1/reference_image_0.png",],
+        "line", # style
+        0, # seed
+        10, # step
+        3, # top k
+    ],
+    [
+        "./examples/line/example3/input.png", 
+        ["./examples/line/example3/reference_image_0.png",],
+        "line", # style
+        0, # seed
+        10, # step
+        3, # top k
+    ],]
+
+ratio_list = [[800, 800], [768, 896], [704, 928], [672, 960], [640, 1024], [608, 1056], [576, 1088], [576, 1184]]
+ratio_list += [[896, 768], [928, 704], [960, 672], [1024, 640], [1056, 608], [1088, 576], [1184, 576]]
+
+def get_rate(image):
+    # 计算输入图片的长宽比
+    input_rate = image.size[0] / image.size[1]
+    # 计算与每个预设比例的差距
+    min_diff = float('inf')
+    best_idx = 0
+    
+    for i, ratio in enumerate(ratio_list):
+        ratio_rate = ratio[0] / ratio[1]
+        diff = abs(input_rate - ratio_rate)
+        if diff < min_diff:
+            min_diff = diff
+            best_idx = i
+            
+    return ratio_list[best_idx]
+
+
+transform = transforms.Compose([
+    transforms.ToTensor(),  
+    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  
+])
+weight_dtype = torch.float16
+
+# line model
+line_model_path = os.path.join(model_global_path, 'LE', 'erika.pth')
+line_model = res_skip()
+line_model.load_state_dict(torch.load(line_model_path))
+line_model.eval()
+line_model.cuda()
+
+
+# image encoder
+image_processor = CLIPImageProcessor()
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(os.path.join(model_global_path, 'image_encoder')).to('cuda')
+# os.path.join(model_global_path, 'image_encoder')
+
+
+
+# model_sketch = create_model_sketch('default').to('cuda')      # create a model given opt.model and other options
+# model_sketch.eval()
+
+
+global pipeline
+global MultiResNetModel
+
+def load_ckpt():
+    global pipeline
+    global MultiResNetModel
+    global causal_dit
+    global controlnet
+    weight_dtype = torch.float16
+
+    block_out_channels = [128, 128, 256, 512, 512]
+    MultiResNetModel = MultiHiddenResNetModel(block_out_channels, len(block_out_channels))
+    MultiResNetModel.load_state_dict(torch.load(os.path.join(model_global_path, 'shadow_GSRP', 'MultiResNetModel.bin'), map_location='cpu'), strict=True)
+    MultiResNetModel.to('cuda', dtype=weight_dtype)
+
+
+    # transformer
+    transform = transforms.Compose([
+                transforms.ToTensor(),  # 将 PIL 图像转换为 Tensor
+                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # 归一化
+            ])
+    # seed = 43
+    lora_rank = 128
+    pretrained_model_name_or_path = '/mnt/workspace/zhuangjunhao/ColorFlow/PixArt-XL-2-1024-MS/models--PixArt-alpha--PixArt-XL-2-1024-MS/snapshots/b89adadeccd9ead2adcb9fa2825d3fabec48d404'
+
+    transformer = PixArtTransformer2DModel.from_pretrained(
+            pretrained_model_name_or_path, subfolder="transformer", revision=None, variant=None
+        )
+    pixart_config = get_pixart_config()
+    causal_dit = CausalSparseDiTModel(num_attention_heads=pixart_config.get("num_attention_heads"),
+                        attention_head_dim=pixart_config.get("attention_head_dim"),
+                        in_channels=pixart_config.get("in_channels"),
+                        out_channels=pixart_config.get("out_channels"),
+                        num_layers=pixart_config.get("num_layers"),
+                        dropout=pixart_config.get("dropout"),
+                        norm_num_groups=pixart_config.get("norm_num_groups"),
+                        cross_attention_dim=pixart_config.get("cross_attention_dim"),
+                        attention_bias=pixart_config.get("attention_bias"),
+                        sample_size=pixart_config.get("sample_size"),
+                        patch_size=pixart_config.get("patch_size"),
+                        activation_fn=pixart_config.get("activation_fn"),
+                        num_embeds_ada_norm=pixart_config.get("num_embeds_ada_norm"),
+                        upcast_attention=pixart_config.get("upcast_attention"),
+                        norm_type=pixart_config.get("norm_type"),
+                        norm_elementwise_affine=pixart_config.get("norm_elementwise_affine"),
+                        norm_eps=pixart_config.get("norm_eps"),
+                        caption_channels=pixart_config.get("caption_channels"),
+                        attention_type=pixart_config.get("attention_type"))
+
+    causal_dit = init_causal_dit(causal_dit, transformer)
+    print('loaded causal_dit')
+    controlnet = CausalSparseDiTControlModel(num_attention_heads=pixart_config.get("num_attention_heads"),
+                                    attention_head_dim=pixart_config.get("attention_head_dim"),
+                                    in_channels=pixart_config.get("in_channels"),
+                                    cond_chanels = 9,
+                                    out_channels=pixart_config.get("out_channels"),
+                                    num_layers=pixart_config.get("num_layers"),
+                                    dropout=pixart_config.get("dropout"),
+                                    norm_num_groups=pixart_config.get("norm_num_groups"),
+                                    cross_attention_dim=pixart_config.get("cross_attention_dim"),
+                                    attention_bias=pixart_config.get("attention_bias"),
+                                    sample_size=pixart_config.get("sample_size"),
+                                    patch_size=pixart_config.get("patch_size"),
+                                    activation_fn=pixart_config.get("activation_fn"),
+                                    num_embeds_ada_norm=pixart_config.get("num_embeds_ada_norm"),
+                                    upcast_attention=pixart_config.get("upcast_attention"),
+                                    norm_type=pixart_config.get("norm_type"),
+                                    norm_elementwise_affine=pixart_config.get("norm_elementwise_affine"),
+                                    norm_eps=pixart_config.get("norm_eps"),
+                                    caption_channels=pixart_config.get("caption_channels"),
+                                    attention_type=pixart_config.get("attention_type")
+                                )
+    # controlnet = init_controlnet(controlnet, causal_dit)
+    del transformer
+    transformer_lora_config = LoraConfig(
+            r=lora_rank,
+            lora_alpha=lora_rank,
+            # use_dora=True,
+            init_lora_weights="gaussian",
+            target_modules=["to_k",
+                "to_q",
+                "to_v",
+                "to_out.0",
+                "proj_in",
+                "proj_out",
+                "ff.net.0.proj",
+                "ff.net.2",
+                "proj",
+                "linear",
+                "linear_1",
+                "linear_2"],#ff.net.0.proj ff.net.2
+        )
+    causal_dit.add_adapter(transformer_lora_config)
+
+    
+    lora_state_dict = torch.load(os.path.join(model_global_path, 'shadow_ckpt', 'transformer_lora_pos.bin'), map_location='cpu')
+    causal_dit.load_state_dict(lora_state_dict, strict=False)
+    controlnet_state_dict = torch.load(os.path.join(model_global_path, 'shadow_ckpt', 'controlnet.bin'), map_location='cpu')
+    controlnet.load_state_dict(controlnet_state_dict, strict=True)
+
+    causal_dit.to('cuda', dtype=weight_dtype)
+    controlnet.to('cuda', dtype=weight_dtype)
+
+    pipeline = CobraPixArtAlphaPipeline.from_pretrained(
+            pretrained_model_name_or_path,
+            transformer=causal_dit,
+            controlnet=controlnet,
+            safety_checker=None,    
+            revision=None,
+            variant=None,
+            torch_dtype=weight_dtype,
+        )
+
+    pipeline = pipeline.to("cuda")
+    
+global cur_style
+cur_style = 'line + shadow'
+def change_ckpt(style):
+    global pipeline
+    global MultiResNetModel
+    global cur_style
+    # global causal_dit
+    # global controlnet
+    weight_dtype = torch.float16
+
+    if style == 'line':
+        MultiResNetModel_path = os.path.join(model_global_path, 'line_GSRP', 'MultiResNetModel.bin')
+        causal_dit_lora_path = os.path.join(model_global_path, 'line_ckpt', 'transformer_lora_pos.bin')
+        controlnet_path = os.path.join(model_global_path, 'line_ckpt', 'controlnet.bin')
+    elif style == 'line + shadow':
+        MultiResNetModel_path = os.path.join(model_global_path, 'shadow_GSRP', 'MultiResNetModel.bin')
+        causal_dit_lora_path = os.path.join(model_global_path, 'shadow_ckpt', 'transformer_lora_pos.bin')
+        controlnet_path = os.path.join(model_global_path, 'shadow_ckpt', 'controlnet.bin')
+    else:
+        raise ValueError("Invalid style: {}".format(style))
+
+    cur_style = style
+
+    MultiResNetModel.load_state_dict(torch.load(MultiResNetModel_path, map_location='cpu'), strict=True)
+    MultiResNetModel.to('cuda', dtype=weight_dtype)
+
+
+    lora_state_dict = torch.load(causal_dit_lora_path, map_location='cpu')
+    pipeline.transformer.load_state_dict(lora_state_dict, strict=False)
+    controlnet_state_dict = torch.load(controlnet_path, map_location='cpu')
+    pipeline.controlnet.load_state_dict(controlnet_state_dict, strict=True)
+
+    pipeline.transformer.to('cuda', dtype=weight_dtype)
+    pipeline.controlnet.to('cuda', dtype=weight_dtype)
+
+    print('loaded {} ckpt'.format(style))
+
+    return style
+
+    
+  
+load_ckpt()
+
+def fix_random_seeds(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def process_multi_images(files):
+    images = [Image.open(file.name) for file in files]
+    imgs = []
+    for i, img in enumerate(images):
+        imgs.append(img)
+    return imgs 
+
+def extract_lines(image):
+    src = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
+
+    rows = int(np.ceil(src.shape[0] / 16)) * 16
+    cols = int(np.ceil(src.shape[1] / 16)) * 16
+
+    patch = np.ones((1, 1, rows, cols), dtype="float32")
+    patch[0, 0, 0:src.shape[0], 0:src.shape[1]] = src
+
+    tensor = torch.from_numpy(patch).cuda()
+
+    with torch.no_grad():
+        y = line_model(tensor)
+
+    yc = y.cpu().numpy()[0, 0, :, :]
+    yc[yc > 255] = 255
+    yc[yc < 0] = 0
+
+    outimg = yc[0:src.shape[0], 0:src.shape[1]]
+    outimg = outimg.astype(np.uint8)
+    outimg = Image.fromarray(outimg)
+    torch.cuda.empty_cache()
+    return outimg
+
+def extract_line_image(query_image_, resolution):
+    tar_width, tar_height = resolution
+    query_image = query_image_.resize((tar_width, tar_height))
+    # query_image.save('/mnt/workspace/zhuangjunhao/cobra_code/ColorFlow/examples/line/example3/input.png')
+    query_image = query_image.convert('L').convert('RGB')
+    extracted_line = extract_lines(query_image)
+    extracted_line = extracted_line.convert('L').convert('RGB')
+    torch.cuda.empty_cache()
+    return extracted_line, Image.new('RGB', (tar_width, tar_height), 'black')
+
+def extract_sketch_line_image(query_image_, input_style):
+    global cur_style
+    if input_style != cur_style:
+        change_ckpt(input_style)
+
+    resolution = get_rate(query_image_)
+    extracted_line, hint_mask = extract_line_image(query_image_, resolution)
+    extracted_sketch = extracted_line
+    extracted_sketch_line = Image.blend(extracted_sketch, extracted_line, 0.5)
+
+    extracted_sketch_line_ori = copy.deepcopy(extracted_sketch_line)
+
+    extracted_sketch_line_np = np.array(extracted_sketch_line)
+    # extracted_sketch_line_np[extracted_sketch_line_np < 236] = 0
+    # extracted_sketch_line_np[extracted_sketch_line_np >= 236] = 255
+    extracted_sketch_line = Image.fromarray(np.uint8(extracted_sketch_line_np))
+    if input_style == 'line + shadow':
+        print('line + shadow sketch')
+        black_rate = 74
+        black_value = 18
+        gary_rate = 155
+        up_bound = 145
+        ori_np = np.array(extracted_sketch_line_ori)
+        query_image_np = np.array(query_image_.resize(resolution).convert('L').convert('RGB'))
+        extracted_sketch_line_np = np.array(extracted_sketch_line.convert('L').convert('RGB'))
+        ori_np[query_image_np <= black_rate] = black_value
+        ori_np[(ori_np > gary_rate) & (query_image_np < up_bound) & (query_image_np > black_rate)] = gary_rate
+        extracted_sketch_line_ori = Image.fromarray(np.uint8(ori_np))
+
+        extracted_sketch_line_np[query_image_np <= black_rate] = black_value
+        extracted_sketch_line_np[(extracted_sketch_line_np > gary_rate) & (query_image_np < up_bound) & (query_image_np > black_rate)] = gary_rate
+        extracted_sketch_line = Image.fromarray(np.uint8(extracted_sketch_line_np))
+
+    return extracted_sketch_line.convert('RGB'), extracted_sketch_line.convert('RGB'), hint_mask, query_image_, extracted_sketch_line_ori.convert('RGB'), resolution
+
+def colorize_image(extracted_line, reference_images, resolution, seed, num_inference_steps, top_k, hint_mask=None, hint_color=None, query_image_origin=None, extracted_image_ori=None):
+    if extracted_line is None:
+        gr.Info("Please preprocess the image first")
+        raise ValueError("Please preprocess the image first")
+    global pipeline
+    global MultiResNetModel
+    reference_images = process_multi_images(reference_images)
+    fix_random_seeds(seed)
+
+    tar_width, tar_height = resolution
+
+    gr.Info("Image retrieval in progress...")
+
+    query_image_bw = extracted_line.resize((tar_width, tar_height))
+    query_image = query_image_bw.convert('RGB')
+
+    query_image_origin = query_image_origin.resize((tar_width, tar_height))
+
+    query_image_vae = extracted_image_ori.resize((int(tar_width*1.5), int(tar_height*1.5)))
+    reference_images = [process_image(ref_image, tar_width, tar_height) for ref_image in reference_images]
+    query_patches_pil = process_image_Q_varres(query_image_origin, tar_width, tar_height)
+    reference_patches_pil = []
+    # Save reference_images
+    # save_path = '/mnt/workspace/zhuangjunhao/cobra_code/ColorFlow/examples/line/example3'
+    # os.makedirs(save_path, exist_ok=True)
+    # for idx, ref_image in enumerate(reference_images):
+    #     ref_image.save(os.path.join(save_path, f'reference_image_{idx}.png'))
+
+    for reference_image in reference_images:
+        reference_patches_pil += process_image_ref_varres(reference_image, tar_width, tar_height)
+    with torch.no_grad():
+        clip_img = image_processor(images=query_patches_pil, return_tensors="pt").pixel_values.to(image_encoder.device, dtype=image_encoder.dtype)
+        query_embeddings = image_encoder(clip_img).image_embeds
+        reference_patches_pil_gray = [rimg.convert('RGB').convert('RGB') for rimg in reference_patches_pil]
+        clip_img = image_processor(images=reference_patches_pil_gray, return_tensors="pt").pixel_values.to(image_encoder.device, dtype=image_encoder.dtype)
+        reference_embeddings = image_encoder(clip_img).image_embeds
+        cosine_similarities = F.cosine_similarity(query_embeddings.unsqueeze(1), reference_embeddings.unsqueeze(0), dim=-1)
+        len_ref = len(reference_patches_pil)
+        # print(cosine_similarities)
+        sorted_indices = torch.argsort(cosine_similarities, descending=True, dim=1).tolist()
+
+        top_k_indices = [cur_sortlist[:top_k] for cur_sortlist in sorted_indices]
+        available_ref_patches = [[],[],[],[]]
+        for i in range(len(top_k_indices)):
+            for j in range(top_k):
+                available_ref_patches[i].append(reference_patches_pil[top_k_indices[i][j]].resize((tar_width//2, tar_height//2)).convert('RGB'))
+
+        flat_available_ref_patches = [item for sublist in available_ref_patches for item in sublist]
+
+    # 正方形拼接 flat_available_ref_patches 
+    grid_N = int(np.ceil(np.sqrt(len(flat_available_ref_patches))))
+    small_tar_width = tar_width//grid_N
+    small_tar_height = tar_height//grid_N
+    grid_img = Image.new('RGB', (grid_N*small_tar_width, grid_N*small_tar_height), 'black')
+    for i in range(len(flat_available_ref_patches)):
+        grid_img.paste(flat_available_ref_patches[i].resize((small_tar_width, small_tar_height)), (i%grid_N*small_tar_width, int(i/grid_N)*small_tar_height))
+
+    # grid_img 添加文字"Reference images"
+    draw = ImageDraw.Draw(grid_img)
+    draw.text((0, 0), "Reference Images", fill='red', font_size=50)
+
+    gr.Info("Model inference in progress...")
+    generator = torch.Generator(device='cuda').manual_seed(seed)
+    hint_mask = hint_mask.resize((tar_width//8, tar_height//8)).convert('RGB')
+    hint_color = hint_color.convert('RGB')
+    
+    colorized_image = pipeline(
+            cond_input=query_image_bw.convert('RGB'),
+            cond_refs=available_ref_patches,
+            hint_mask=hint_mask,
+            hint_color=hint_color,
+            num_inference_steps=num_inference_steps,
+            generator = generator,
+        )[0][0]
+    gr.Info("Post-processing image...")
+    with torch.no_grad():
+        up_img = colorized_image.resize(query_image_vae.size)
+        test_low_color = transform(up_img).unsqueeze(0).to('cuda', dtype=weight_dtype)
+        query_image_vae_ = transform(query_image_vae).unsqueeze(0).to('cuda', dtype=weight_dtype)
+
+        h_color, hidden_list_color = pipeline.vae._encode(test_low_color,return_dict = False, hidden_flag = True)
+        h_bw, hidden_list_bw = pipeline.vae._encode(query_image_vae_, return_dict = False, hidden_flag = True)
+
+        hidden_list_double = [torch.cat((hidden_list_color[hidden_idx], hidden_list_bw[hidden_idx]), dim = 1) for hidden_idx in range(len(hidden_list_color))]
+
+
+        hidden_list = MultiResNetModel(hidden_list_double)
+        output = pipeline.vae._decode(h_color.sample(),return_dict = False, hidden_list = hidden_list)[0]
+
+        output[output > 1] = 1
+        output[output < -1] = -1
+        high_res_image = Image.fromarray(((output[0] * 0.5 + 0.5).permute(1, 2, 0).detach().cpu().numpy() * 255).astype(np.uint8)).convert("RGB")
+    gr.Info("Colorization complete!")
+    torch.cuda.empty_cache()
+    
+    output_gallery = [high_res_image, query_image_bw, hint_mask, hint_color, grid_img] 
+    return output_gallery
+
+
+# Function to get color value from reference image
+def get_color_value(reference_image, evt: gr.SelectData):
+    if reference_image is None:
+        return "Please upload a reference image first."
+    x, y = evt.index
+    color_value = reference_image[y, x]
+    return f"Get Color value: {color_value}", color_value
+
+# Function to draw a square on the line drawing image
+def draw_square(line_drawing_image_pil, hint_mask, color_value, evt: gr.SelectData):
+    line_drawing_image = np.array(line_drawing_image_pil)
+    # line_drawing_image = np.array(Image.new('RGB', line_drawing_image_pil.size, 'black'))
+    hint_mask = np.array(hint_mask)
+    if line_drawing_image is None:
+        return "Please upload a line drawing image first."
+    if color_value is None:
+        return "Please pick a color from the reference image first."
+    x, y = evt.index
+    # Calculate square boundaries
+    start_x = max(0, x - 8)
+    start_y = max(0, y - 8)
+    end_x = min(line_drawing_image.shape[1], x + 8)
+    end_y = min(line_drawing_image.shape[0], y + 8)
+    # Draw the square
+    line_drawing_image[start_y:end_y, start_x:end_x] = color_value
+    line_drawing_image_pil = Image.fromarray(np.uint8(line_drawing_image))
+    hint_mask[start_y:end_y, start_x:end_x] = 255
+    hint_mask_pil = Image.fromarray(np.uint8(hint_mask))
+    return line_drawing_image_pil, hint_mask_pil
+
+
+with gr.Blocks() as demo:
+    gr.HTML(
+    """
+<div style="text-align: center;">
+    <h1 style="text-align: center; font-size: 3em;">🎨 Cobra:</h1>
+    <h3 style="text-align: center; font-size: 1.8em;">Efficient Line Art COlorization with BRoAder References</h3>
+    <p style="text-align: center; font-weight: bold;">
+        <a href="https://zhuang2002.github.io/Cobra/">Project Page</a> | 
+        <a href="https://arxiv.org">ArXiv Preprint</a> | 
+        <a href="https://github.com/Zhuang2002/Cobra">GitHub Repository</a>
+    </p>
+    <p style="text-align: center; font-weight: bold;">
+        NOTE: Each time you switch the input style, the corresponding model will be reloaded, which may take some time. Please be patient.
+    </p>
+    <p style="text-align: left; font-size: 1.1em;">
+        Welcome to the demo of <strong>Cobra</strong>. Follow the steps below to explore the capabilities of our model:
+    </p>
+</div>
+<div style="text-align: left; margin: 0 auto;">
+    <ol style="font-size: 1.1em;">
+        <li>Choose your input style: either line + shadow or line only.</li>
+        <li>Upload your image: Click the 'Upload' button to select the image you want to colorize.</li>
+        <li>Preprocess the image: Click the 'Preprocess' button to extract the line art from your image.</li>
+        <li>(Optional) Obtain color values and add color hints: Upload an image to the left area and click to get color values; then, add color hints to the line art on the right.</li>
+        <li>Upload reference images: Upload several reference images to help guide the colorization process.</li>
+        <li>(Optional) Set inference parameters: Adjust the inference settings as needed.</li>
+        <li>Run: Click the <b>Colorize</b> button to start the process.</li>
+    </ol>
+    <p>
+        ⏱️ <b>ZeroGPU Time Limit</b>: Hugging Face ZeroGPU has an inference time limit of 180 seconds. You may need to log in with a free account to use this demo. Large sampling steps might lead to timeout (GPU Abort). In that case, please consider logging in with a Pro account or running it on your local machine.
+    </p>
+</div>
+<div style="text-align: center;">
+    <p style="text-align: center; font-weight: bold;">
+        注意：每次切换输入样式时，相应的模型将被重新加载，可能需要一些时间。请耐心等待。
+    </p>
+    <p style="text-align: left; font-size: 1.1em;">
+        欢迎使用 <strong>Cobra</strong> 演示。请按照以下步骤探索我们模型的能力：
+    </p>
+</div>
+<div style="text-align: left; margin: 0 auto;">
+    <ol style="font-size: 1.1em;">
+        <li>选择输入样式：线条+阴影或仅线条。</li>
+        <li>上传您的图像：点击“上传”按钮选择您想要上色的图像。</li>
+        <li>预处理图像：点击“预处理”按钮从您的图像中提取线稿。</li>
+        <li>（可选）获取颜色值并添加颜色提示：上传一张图像到左侧区域，点击获取颜色值；然后，为右侧的线稿添加颜色提示。</li>
+        <li>上传参考图像：上传多个参考图像以帮助引导上色过程。</li>
+        <li>（可选）设置推理参数：根据需要调整推理设置。</li>
+        <li>运行：点击 <b>上色</b> 按钮开始处理。</li>
+    </ol>
+    <p>
+        ⏱️ <b>ZeroGPU时间限制</b>：Hugging Face ZeroGPU 的推理时间限制为 180 秒。您可能需要使用免费帐户登录以使用此演示。大采样步骤可能会导致超时（GPU 中止）。在这种情况下，请考虑使用专业帐户登录或在本地计算机上运行。
+    </p>
+</div>
+    """
+)
+    # extracted_line = gr.State()
+    # example_loading = gr.State(value=None)
+    hint_mask = gr.State()
+    hint_color = gr.State()
+    query_image_origin = gr.State()
+    resolution = gr.State()
+    extracted_image_ori = gr.State()
+    style = gr.State()
+    # updated_mask = gr.State()
+    # model_name = gr.Textbox(label="Model Name", value=None)
+
+    # style = gr.Dropdown(label="Model Name", choices=["line + shadow","line"], value="line + shadow")
+    
+    with gr.Column():
+        gr.Markdown("<h2 style='text-align: center;'>Load Model</h2>")
+        with gr.Row():
+            model_name = gr.Textbox(label="Model Name", value=None)
+            with gr.Column():
+                style = gr.Dropdown(label="Model List", choices=["line + shadow","line"], value="line + shadow")
+                change_ckpt_button = gr.Button("Load Model")
+                change_ckpt_button.click(change_ckpt, inputs=[style], outputs=[model_name])
+            # model_name = gr.Textbox(label="Model Name", value=None)
+
+        # 添加文字 英文 线稿提取
+        gr.Markdown("<h2 style='text-align: center;'>Line Drawing Extraction</h2>")
+        with gr.Row():
+            with gr.Column():
+                input_image = gr.Image(type="pil", label="Image to Colorize")
+                # resolution = gr.Radio(["800x800", "640x1024", "1024x640"], label="Select Resolution(Width*Height)", value="640x1024")
+                extract_button = gr.Button("Preprocess (Decolorize)")
+            extracted_image = gr.Image(type="pil", label="Decolorized Result")
+
+        
+        gr.Markdown("<h2 style='text-align: center;'>Color Selection 🎨 (Left) and Hint Placement 💡 (Right) - Click with Mouse 🖱️</h2>")
+
+        with gr.Row():
+            with gr.Column():
+                get_color_img = gr.Image(label="Upload an image to extract colors", type="numpy")
+                color_value_output = gr.Textbox(label="Color Value")
+                color_value_state = gr.State()
+                get_color_img.select(
+                    get_color_value,
+                    [get_color_img],
+                    [color_value_output, color_value_state]
+                )
+            with gr.Column():
+                hint_color = gr.Image(label="Line Drawing Image", type="pil")
+                # updated_image = gr.Image(label="Updated Image", type="pil")
+                hint_color.select(
+                    draw_square,
+                    [hint_color, hint_mask, color_value_state],
+                    [hint_color, hint_mask]
+                )
+
+        gr.Markdown("<h2 style='text-align: center;'>Retrieval and Colorization</h2>")
+        with gr.Row():
+            reference_images = gr.Files(label="Reference Images (Upload multiple)", file_count="multiple")
+            with gr.Column():
+                output_gallery = gr.Gallery(label="Colorization Results", type="pil")
+                seed = gr.Slider(label="Random Seed", minimum=0, maximum=100000, value=0, step=1)
+                num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=100, value=10, step=1)
+                colorize_button = gr.Button("Colorize")
+                top_k = gr.Slider(label="Top K (Total Reference Images: 4K) ", minimum=1, maximum=50, value=3, step=1)
+    
+
+    extract_button.click(
+        extract_sketch_line_image, 
+        inputs=[input_image, model_name], 
+        outputs=[extracted_image, 
+                    hint_color, 
+                    hint_mask, 
+                    query_image_origin, 
+                    extracted_image_ori,
+                    resolution
+                    ]
+    )
+    colorize_button.click(
+        colorize_image, 
+        inputs=[extracted_image, reference_images, resolution, seed, num_inference_steps, top_k, hint_mask, hint_color, query_image_origin, extracted_image_ori], 
+        outputs=output_gallery
+    )
+    with gr.Column():
+        gr.Markdown("### Quick Examples")
+        gr.Examples(
+            examples=examples,
+            inputs=[input_image, reference_images, model_name, seed, num_inference_steps, top_k],
+            label="Examples",
+            examples_per_page=8,
+        )
+
+
+demo.launch(server_name="0.0.0.0", server_port=52218)
\ No newline at end of file
diff --git a/cobra_utils/.DS_Store b/cobra_utils/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..a3d9c72411f907964c8c222c3674b8152b145eb8
Binary files /dev/null and b/cobra_utils/.DS_Store differ
diff --git a/cobra_utils/__pycache__/utils.cpython-311.pyc b/cobra_utils/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf60fa0942ca0a2b337d2297618c19caaa4e101b
Binary files /dev/null and b/cobra_utils/__pycache__/utils.cpython-311.pyc differ
diff --git a/cobra_utils/utils.py b/cobra_utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d650d2c1bc659c1587136e387d2e9b918c66910
--- /dev/null
+++ b/cobra_utils/utils.py
@@ -0,0 +1,603 @@
+import os
+import random
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset
+from PIL import Image
+from torchvision import transforms
+from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
+import matplotlib.pyplot as plt
+import cv2
+import torch.nn.functional as F
+
+
+
+class _bn_relu_conv(nn.Module):
+    def __init__(self, in_filters, nb_filters, fw, fh, subsample=1):
+        super(_bn_relu_conv, self).__init__()
+        self.model = nn.Sequential(
+            nn.BatchNorm2d(in_filters, eps=1e-3),
+            nn.LeakyReLU(0.2),
+            nn.Conv2d(in_filters, nb_filters, (fw, fh), stride=subsample, padding=(fw//2, fh//2), padding_mode='zeros')
+        )
+
+    def forward(self, x):
+        return self.model(x)
+
+        # the following are for debugs
+        print("****", np.max(x.cpu().numpy()), np.min(x.cpu().numpy()), np.mean(x.cpu().numpy()), np.std(x.cpu().numpy()), x.shape)
+        for i,layer in enumerate(self.model):
+            if i != 2:
+                x = layer(x)
+            else:
+                x = layer(x)
+                #x = nn.functional.pad(x, (1, 1, 1, 1), mode='constant', value=0)
+            print("____", np.max(x.cpu().numpy()), np.min(x.cpu().numpy()), np.mean(x.cpu().numpy()), np.std(x.cpu().numpy()), x.shape)
+            print(x[0])
+        return x
+
+
+class _u_bn_relu_conv(nn.Module):
+    def __init__(self, in_filters, nb_filters, fw, fh, subsample=1):
+        super(_u_bn_relu_conv, self).__init__()
+        self.model = nn.Sequential(
+            nn.BatchNorm2d(in_filters, eps=1e-3),
+            nn.LeakyReLU(0.2),
+            nn.Conv2d(in_filters, nb_filters, (fw, fh), stride=subsample, padding=(fw//2, fh//2)),
+            nn.Upsample(scale_factor=2, mode='nearest')
+        )
+
+    def forward(self, x):
+        return self.model(x)
+
+
+
+class _shortcut(nn.Module):
+    def __init__(self, in_filters, nb_filters, subsample=1):
+        super(_shortcut, self).__init__()
+        self.process = False
+        self.model = None
+        if in_filters != nb_filters or subsample != 1:
+            self.process = True
+            self.model = nn.Sequential(
+                    nn.Conv2d(in_filters, nb_filters, (1, 1), stride=subsample)
+                )
+
+    def forward(self, x, y):
+        #print(x.size(), y.size(), self.process)
+        if self.process:
+            y0 = self.model(x)
+            #print("merge+", torch.max(y0+y), torch.min(y0+y),torch.mean(y0+y), torch.std(y0+y), y0.shape)
+            return y0 + y
+        else:
+            #print("merge", torch.max(x+y), torch.min(x+y),torch.mean(x+y), torch.std(x+y), y.shape)
+            return x + y
+
+class _u_shortcut(nn.Module):
+    def __init__(self, in_filters, nb_filters, subsample):
+        super(_u_shortcut, self).__init__()
+        self.process = False
+        self.model = None
+        if in_filters != nb_filters:
+            self.process = True
+            self.model = nn.Sequential(
+                nn.Conv2d(in_filters, nb_filters, (1, 1), stride=subsample, padding_mode='zeros'),
+                nn.Upsample(scale_factor=2, mode='nearest')
+            )
+
+    def forward(self, x, y):
+        if self.process:
+            return self.model(x) + y
+        else:
+            return x + y
+
+
+class basic_block(nn.Module):
+    def __init__(self, in_filters, nb_filters, init_subsample=1):
+        super(basic_block, self).__init__()
+        self.conv1 = _bn_relu_conv(in_filters, nb_filters, 3, 3, subsample=init_subsample)
+        self.residual = _bn_relu_conv(nb_filters, nb_filters, 3, 3)
+        self.shortcut = _shortcut(in_filters, nb_filters, subsample=init_subsample)
+
+    def forward(self, x):
+        x1 = self.conv1(x)
+        x2 = self.residual(x1)
+        return self.shortcut(x, x2)
+
+class _u_basic_block(nn.Module):
+    def __init__(self, in_filters, nb_filters, init_subsample=1):
+        super(_u_basic_block, self).__init__()
+        self.conv1 = _u_bn_relu_conv(in_filters, nb_filters, 3, 3, subsample=init_subsample)
+        self.residual = _bn_relu_conv(nb_filters, nb_filters, 3, 3)
+        self.shortcut = _u_shortcut(in_filters, nb_filters, subsample=init_subsample)
+
+    def forward(self, x):
+        y = self.residual(self.conv1(x))
+        return self.shortcut(x, y)
+
+
+class _residual_block(nn.Module):
+    def __init__(self, in_filters, nb_filters, repetitions, is_first_layer=False):
+        super(_residual_block, self).__init__()
+        layers = []
+        for i in range(repetitions):
+            init_subsample = 1
+            if i == repetitions - 1 and not is_first_layer:
+                init_subsample = 2
+            if i == 0:
+                l = basic_block(in_filters=in_filters, nb_filters=nb_filters, init_subsample=init_subsample)
+            else:
+                l = basic_block(in_filters=nb_filters, nb_filters=nb_filters, init_subsample=init_subsample)
+            layers.append(l)
+
+        self.model = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class _upsampling_residual_block(nn.Module):
+    def __init__(self, in_filters, nb_filters, repetitions):
+        super(_upsampling_residual_block, self).__init__()
+        layers = []
+        for i in range(repetitions):
+            l = None
+            if i == 0: 
+                l = _u_basic_block(in_filters=in_filters, nb_filters=nb_filters)#(input)
+            else:
+                l = basic_block(in_filters=nb_filters, nb_filters=nb_filters)#(input)
+            layers.append(l)
+
+        self.model = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class res_skip(nn.Module):
+
+    def __init__(self):
+        super(res_skip, self).__init__()
+        self.block0 = _residual_block(in_filters=1, nb_filters=24, repetitions=2, is_first_layer=True)#(input)
+        self.block1 = _residual_block(in_filters=24, nb_filters=48, repetitions=3)#(block0)
+        self.block2 = _residual_block(in_filters=48, nb_filters=96, repetitions=5)#(block1)
+        self.block3 = _residual_block(in_filters=96, nb_filters=192, repetitions=7)#(block2)
+        self.block4 = _residual_block(in_filters=192, nb_filters=384, repetitions=12)#(block3)
+        
+        self.block5 = _upsampling_residual_block(in_filters=384, nb_filters=192, repetitions=7)#(block4)
+        self.res1 = _shortcut(in_filters=192, nb_filters=192)#(block3, block5, subsample=(1,1))
+
+        self.block6 = _upsampling_residual_block(in_filters=192, nb_filters=96, repetitions=5)#(res1)
+        self.res2 = _shortcut(in_filters=96, nb_filters=96)#(block2, block6, subsample=(1,1))
+
+        self.block7 = _upsampling_residual_block(in_filters=96, nb_filters=48, repetitions=3)#(res2)
+        self.res3 = _shortcut(in_filters=48, nb_filters=48)#(block1, block7, subsample=(1,1))
+
+        self.block8 = _upsampling_residual_block(in_filters=48, nb_filters=24, repetitions=2)#(res3)
+        self.res4 = _shortcut(in_filters=24, nb_filters=24)#(block0,block8, subsample=(1,1))
+
+        self.block9 = _residual_block(in_filters=24, nb_filters=16, repetitions=2, is_first_layer=True)#(res4)
+        self.conv15 = _bn_relu_conv(in_filters=16, nb_filters=1, fh=1, fw=1, subsample=1)#(block7)
+
+    def forward(self, x):
+        x0 = self.block0(x)
+        x1 = self.block1(x0)
+        x2 = self.block2(x1)
+        x3 = self.block3(x2)
+        x4 = self.block4(x3)
+
+        x5 = self.block5(x4)
+        res1 = self.res1(x3, x5)
+
+        x6 = self.block6(res1)
+        res2 = self.res2(x2, x6)
+
+        x7 = self.block7(res2)
+        res3 = self.res3(x1, x7)
+
+        x8 = self.block8(res3)
+        res4 = self.res4(x0, x8)
+
+        x9 = self.block9(res4)
+        y = self.conv15(x9)
+
+        return y
+
+class MyDataset(Dataset):
+    def __init__(self, image_paths, transform=None):
+        self.image_paths = image_paths
+        self.transform = transform
+        
+    def get_class_label(self, image_name):
+        # your method here
+        head, tail = os.path.split(image_name)
+        #print(tail)
+        return tail
+        
+    def __getitem__(self, index):
+        image_path = self.image_paths[index]
+        x = Image.open(image_path)
+        y = self.get_class_label(image_path.split('/')[-1])
+        if self.transform is not None:
+            x = self.transform(x)
+        return x, y
+    
+    def __len__(self):
+        return len(self.image_paths)
+
+def loadImages(folder):
+    imgs = []
+    matches = []
+    
+    for filename in os.listdir(folder):
+        file_path = os.path.join(folder, filename)
+        if os.path.isfile(file_path):
+            matches.append(file_path)
+    
+    return matches
+
+
+def crop_center_square(image):
+
+    width, height = image.size
+    
+
+    side_length = min(width, height)
+
+    left = (width - side_length) // 2
+    top = (height - side_length) // 2
+    right = left + side_length
+    bottom = top + side_length
+    
+    cropped_image = image.crop((left, top, right, bottom))
+    
+    return cropped_image
+
+def crop_image(image, crop_size, stride):
+
+    width, height = image.size
+    crop_width, crop_height = crop_size
+    cropped_images = []
+
+    for j in range(0, height - crop_height + 1, stride):
+        for i in range(0, width - crop_width + 1, stride):
+            crop_box = (i, j, i + crop_width, j + crop_height)
+            cropped_image = image.crop(crop_box)
+            cropped_images.append(cropped_image)
+
+    return cropped_images
+
+def process_image_ref(image):
+
+    resized_image_512 = image.resize((512, 512))
+
+
+    image_list = [resized_image_512]
+
+
+    crop_size_384 = (384, 384)
+    stride_384 = 128
+    image_list.extend(crop_image(resized_image_512, crop_size_384, stride_384))
+
+
+    return image_list
+
+
+def process_image_Q(image):
+
+    resized_image_512 = image.resize((512, 512)).convert("RGB").convert("RGB")
+
+    image_list = []
+
+    crop_size_384 = (384, 384)
+    stride_384 = 128
+    image_list.extend(crop_image(resized_image_512, crop_size_384, stride_384))
+
+    return image_list
+
+def process_image(image, target_width=512, target_height = 512):
+    img_width, img_height = image.size
+    img_ratio = img_width / img_height
+    
+    target_ratio = target_width / target_height
+    
+    ratio_error = abs(img_ratio - target_ratio) / target_ratio
+    
+    if ratio_error < 0.15:
+        resized_image = image.resize((target_width, target_height), Image.BICUBIC)
+    else:
+        if img_ratio > target_ratio:
+            new_width = int(img_height * target_ratio)
+            left = int((0 + img_width - new_width)/2)
+            top = 0
+            right = left + new_width
+            bottom = img_height
+        else:
+            new_height = int(img_width / target_ratio)
+            left = 0
+            top = int((0 + img_height - new_height)/2)
+            right = img_width
+            bottom = top + new_height
+        
+        cropped_image = image.crop((left, top, right, bottom))
+        resized_image = cropped_image.resize((target_width, target_height), Image.BICUBIC)
+    
+    return resized_image.convert('RGB')
+
+def crop_image_varres(image, crop_size, h_stride, w_stride):
+        width, height = image.size
+        crop_width, crop_height = crop_size
+        cropped_images = []
+
+        for j in range(0, height - crop_height + 1, h_stride):
+            for i in range(0, width - crop_width + 1, w_stride):
+                crop_box = (i, j, i + crop_width, j + crop_height)
+                cropped_image = image.crop(crop_box)
+                cropped_images.append(cropped_image)
+
+        return cropped_images
+
+def process_image_ref_varres(image, target_width=512, target_height = 512):
+    resized_image_512 = image.resize((target_width, target_height))
+
+    image_list = [resized_image_512]
+
+    crop_size_384 = (target_width//4*3, target_height//4*3)
+    w_stride_384 = target_width//4
+    h_stride_384 = target_height//4
+    image_list.extend(crop_image_varres(resized_image_512, crop_size_384, h_stride = h_stride_384, w_stride = w_stride_384))
+
+    return image_list
+
+
+def process_image_Q_varres(image, target_width=512, target_height = 512):
+
+    resized_image_512 = image.resize((target_width, target_height)).convert("RGB").convert("RGB")
+
+    image_list = []
+
+    crop_size_384 = (target_width//4*3, target_height//4*3)
+    w_stride_384 = target_width//4
+    h_stride_384 = target_height//4
+    image_list.extend(crop_image_varres(resized_image_512, crop_size_384, h_stride = h_stride_384, w_stride = w_stride_384))
+
+
+    return image_list
+
+
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class ResNetBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, stride=1):
+        super(ResNetBlock, self).__init__()
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(out_channels)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(out_channels)
+
+        self.shortcut = nn.Sequential()
+        if stride != 1 or in_channels != out_channels:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(out_channels)
+            )
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+        out += self.shortcut(x)  # 直接相加
+        out = F.relu(out)
+        return out
+
+class TwoLayerResNet(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(TwoLayerResNet, self).__init__()
+        self.block1 = ResNetBlock(in_channels, out_channels)
+        self.block2 = ResNetBlock(out_channels, out_channels)
+        self.block3 = ResNetBlock(out_channels, out_channels)
+        self.block4 = ResNetBlock(out_channels, out_channels)
+
+    def forward(self, x):
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        return x
+    
+
+class MultiHiddenResNetModel(nn.Module):
+    def __init__(self, channels_list, num_tensors):
+        super(MultiHiddenResNetModel, self).__init__()
+        self.two_layer_resnets = nn.ModuleList([TwoLayerResNet(channels_list[idx]*2, channels_list[min(len(channels_list)-1,idx+2)]) for idx in range(num_tensors)])
+
+    def forward(self, tensor_list):
+        processed_list = []
+        for i, tensor in enumerate(tensor_list):
+            tensor = self.two_layer_resnets[i](tensor)
+            processed_list.append(tensor)
+        
+        return processed_list
+    
+
+def calculate_target_size(h, w):
+    if random.random()>0.5:
+        target_h = (h // 8) * 8
+        target_w = (w // 8) * 8
+    elif random.random()>0.5:
+        target_h = (h // 8) * 8
+        target_w = (w // 8) * 8
+    else:
+        target_h = (h // 8) * 8
+        target_w = (w // 8) * 8
+    
+    if target_h == 0:
+        target_h = 8
+    if target_w == 0:
+        target_w = 8
+    
+    return target_h, target_w
+
+
+def downsample_tensor(tensor):
+    b, c, h, w = tensor.shape
+    
+    target_h, target_w = calculate_target_size(h, w)
+    
+    downsampled_tensor = F.interpolate(tensor, size=(target_h, target_w), mode='bilinear', align_corners=False)
+    
+    return downsampled_tensor
+
+
+
+def get_pixart_config():
+    pixart_config = {
+            "_class_name": "Transformer2DModel",
+            "_diffusers_version": "0.22.0.dev0",
+            "activation_fn": "gelu-approximate",
+            "attention_bias": True,
+            "attention_head_dim": 72,
+            "attention_type": "default",
+            "caption_channels": 4096,
+            "cross_attention_dim": 1152,
+            "double_self_attention": False,
+            "dropout": 0.0,
+            "in_channels": 4,
+            # "interpolation_scale": 2,
+            "norm_elementwise_affine": False,
+            "norm_eps": 1e-06,
+            "norm_num_groups": 32,
+            "norm_type": "ada_norm_single",
+            "num_attention_heads": 16,
+            "num_embeds_ada_norm": 1000,
+            "num_layers": 28,
+            "num_vector_embeds": None,
+            "only_cross_attention": False,
+            "out_channels": 8,
+            "patch_size": 2,
+            "sample_size": 128,
+            "upcast_attention": False,
+            # "use_additional_conditions": False,
+            "use_linear_projection": False
+            }
+    return pixart_config
+
+
+
+class DoubleConv(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        self.double_conv = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, 3, 1, 1),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
+            nn.Conv2d(out_channels, out_channels, 3, 1, 1),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU()
+        )
+
+    def forward(self, x):
+        return self.double_conv(x)
+
+
+class UNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # left
+        self.left_conv_1 = DoubleConv(6, 64)
+        self.down_1 = nn.MaxPool2d(2, 2)
+
+        self.left_conv_2 = DoubleConv(64, 128)
+        self.down_2 = nn.MaxPool2d(2, 2)
+
+        self.left_conv_3 = DoubleConv(128, 256)
+        self.down_3 = nn.MaxPool2d(2, 2)
+
+        self.left_conv_4 = DoubleConv(256, 512)
+        self.down_4 = nn.MaxPool2d(2, 2)
+
+        # center
+        self.center_conv = DoubleConv(512, 1024)
+
+        # right
+        self.up_1 = nn.ConvTranspose2d(1024, 512, 2, 2)
+        self.right_conv_1 = DoubleConv(1024, 512)
+
+        self.up_2 = nn.ConvTranspose2d(512, 256, 2, 2)
+        self.right_conv_2 = DoubleConv(512, 256)
+
+        self.up_3 = nn.ConvTranspose2d(256, 128, 2, 2)
+        self.right_conv_3 = DoubleConv(256, 128)
+
+        self.up_4 = nn.ConvTranspose2d(128, 64, 2, 2)
+        self.right_conv_4 = DoubleConv(128, 64)
+
+        # output
+        self.output = nn.Conv2d(64, 3, 1, 1, 0)
+
+    def forward(self, x):
+        # left
+        x1 = self.left_conv_1(x)
+        x1_down = self.down_1(x1)
+
+        x2 = self.left_conv_2(x1_down)
+        x2_down = self.down_2(x2)
+
+        x3 = self.left_conv_3(x2_down)
+        x3_down = self.down_3(x3)
+
+        x4 = self.left_conv_4(x3_down)
+        x4_down = self.down_4(x4)
+
+        # center
+        x5 = self.center_conv(x4_down)
+
+        # right
+        x6_up = self.up_1(x5)
+        temp = torch.cat((x6_up, x4), dim=1)
+        x6 = self.right_conv_1(temp)
+
+        x7_up = self.up_2(x6)
+        temp = torch.cat((x7_up, x3), dim=1)
+        x7 = self.right_conv_2(temp)
+
+        x8_up = self.up_3(x7)
+        temp = torch.cat((x8_up, x2), dim=1)
+        x8 = self.right_conv_3(temp)
+
+        x9_up = self.up_4(x8)
+        temp = torch.cat((x9_up, x1), dim=1)
+        x9 = self.right_conv_4(temp)
+
+        # output
+        output = self.output(x9)
+
+        return output
+
+
+
+from copy import deepcopy
+
+def init_causal_dit(model, base_model):
+    temp_ckpt = deepcopy(base_model)
+    checkpoint = temp_ckpt.state_dict()
+    # checkpoint['pos_embed_1d.weight'] = torch.zeros(3, model.config.num_attention_heads * model.config.attention_head_dim, device=model.pos_embed_1d.weight.device, dtype = model.pos_embed_1d.weight.dtype)
+    model.load_state_dict(checkpoint, strict=True)
+    del temp_ckpt
+    return model
+
+def init_controlnet(model, base_model):
+    temp_ckpt = deepcopy(base_model)
+    checkpoint = temp_ckpt.state_dict()
+    checkpoint_weight = checkpoint['pos_embed.proj.weight']
+    new_weight = torch.zeros(model.pos_embed.proj.weight.shape, device=model.pos_embed.proj.weight.device, dtype = model.pos_embed.proj.weight.dtype)
+    print('model.pos_embed.proj.weight.shape',model.pos_embed.proj.weight.shape)
+    new_weight[:, :4] = checkpoint_weight
+    checkpoint['pos_embed.proj.weight'] = new_weight
+    print('new_weight', new_weight.dtype)
+    model.load_state_dict(checkpoint, strict=False)
+    del temp_ckpt
+    return model
\ No newline at end of file
diff --git a/diffusers/.DS_Store b/diffusers/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..55e8b6050642f6a9af5511ace40cabc8b9db74fc
Binary files /dev/null and b/diffusers/.DS_Store differ
diff --git a/diffusers/.github/ISSUE_TEMPLATE/bug-report.yml b/diffusers/.github/ISSUE_TEMPLATE/bug-report.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a0517725284e6062287e08dad3495d94b0bd0481
--- /dev/null
+++ b/diffusers/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -0,0 +1,110 @@
+name: "\U0001F41B Bug Report"
+description: Report a bug on Diffusers
+labels: [ "bug" ]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks a lot for taking the time to file this issue 🤗.
+        Issues do not only help to improve the library, but also publicly document common problems, questions, workflows for the whole community!
+        Thus, issues are of the same importance as pull requests when contributing to this library ❤️.
+        In order to make your issue as **useful for the community as possible**, let's try to stick to some simple guidelines:
+        - 1. Please try to be as precise and concise as possible.
+             *Give your issue a fitting title. Assume that someone which very limited knowledge of Diffusers can understand your issue. Add links to the source code, documentation other issues, pull requests etc...*
+        - 2. If your issue is about something not working, **always** provide a reproducible code snippet. The reader should be able to reproduce your issue by **only copy-pasting your code snippet into a Python shell**.
+             *The community cannot solve your issue if it cannot reproduce it. If your bug is related to training, add your training script and make everything needed to train public. Otherwise, just add a simple Python code snippet.*
+        - 3. Add the **minimum** amount of code / context that is needed to understand, reproduce your issue.
+             *Make the life of maintainers easy. `diffusers` is getting many issues every day. Make sure your issue is about one bug and one bug only. Make sure you add only the context, code needed to understand your issues - nothing more. Generally, every issue is a way of documenting this library, try to make it a good documentation entry.*
+        - 4. For issues related to community pipelines (i.e., the pipelines located in the `examples/community` folder), please tag the author of the pipeline in your issue thread as those pipelines are not maintained.
+  - type: markdown
+    attributes:
+      value: |
+        For more in-detail information on how to write good issues you can have a look [here](https://huggingface.co/course/chapter8/5?fw=pt).
+  - type: textarea
+    id: bug-description
+    attributes:
+      label: Describe the bug
+      description: A clear and concise description of what the bug is. If you intend to submit a pull request for this issue, tell us in the description. Thanks!
+      placeholder: Bug description
+    validations:
+      required: true
+  - type: textarea
+    id: reproduction
+    attributes:
+      label: Reproduction
+      description: Please provide a minimal reproducible code which we can copy/paste and reproduce the issue.
+      placeholder: Reproduction
+    validations:
+      required: true
+  - type: textarea
+    id: logs
+    attributes:
+      label: Logs
+      description: "Please include the Python logs if you can."
+      render: shell
+  - type: textarea
+    id: system-info
+    attributes:
+      label: System Info
+      description: Please share your system info with us. You can run the command `diffusers-cli env` and copy-paste its output below.
+      placeholder: Diffusers version, platform, Python version, ...
+    validations:
+      required: true
+  - type: textarea
+    id: who-can-help
+    attributes:
+      label: Who can help?
+      description: |
+        Your issue will be replied to more quickly if you can figure out the right person to tag with @.
+        If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
+
+        All issues are read by one of the core maintainers, so if you don't know who to tag, just leave this blank and
+        a core maintainer will ping the right person.
+
+        Please tag a maximum of 2 people.
+
+        Questions on DiffusionPipeline (Saving, Loading, From pretrained, ...): @sayakpaul @DN6
+
+        Questions on pipelines:
+        - Stable Diffusion @yiyixuxu @asomoza
+        - Stable Diffusion XL @yiyixuxu @sayakpaul @DN6
+        - Stable Diffusion 3: @yiyixuxu @sayakpaul @DN6 @asomoza
+        - Kandinsky @yiyixuxu
+        - ControlNet @sayakpaul @yiyixuxu @DN6
+        - T2I Adapter @sayakpaul @yiyixuxu @DN6
+        - IF @DN6
+        - Text-to-Video / Video-to-Video @DN6 @a-r-r-o-w
+        - Wuerstchen @DN6
+        - Other: @yiyixuxu @DN6
+        - Improving generation quality: @asomoza
+
+        Questions on models:
+        - UNet @DN6 @yiyixuxu @sayakpaul
+        - VAE @sayakpaul @DN6 @yiyixuxu
+        - Transformers/Attention @DN6 @yiyixuxu @sayakpaul
+
+        Questions on single file checkpoints: @DN6
+
+        Questions on Schedulers: @yiyixuxu
+
+        Questions on LoRA: @sayakpaul
+
+        Questions on Textual Inversion: @sayakpaul
+
+        Questions on Training:
+        - DreamBooth @sayakpaul
+        - Text-to-Image Fine-tuning @sayakpaul
+        - Textual Inversion @sayakpaul
+        - ControlNet @sayakpaul
+
+        Questions on Tests: @DN6 @sayakpaul @yiyixuxu
+
+        Questions on Documentation: @stevhliu
+
+        Questions on JAX- and MPS-related things: @pcuenca
+
+        Questions on audio pipelines: @sanchit-gandhi
+
+
+
+      placeholder: "@Username ..."
diff --git a/diffusers/.github/ISSUE_TEMPLATE/config.yml b/diffusers/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e81992fe3c69b65f58f627252ffa6569d1cd67e2
--- /dev/null
+++ b/diffusers/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,4 @@
+contact_links:
+  - name: Questions / Discussions
+    url: https://github.com/huggingface/diffusers/discussions
+    about: General usage questions and community discussions
diff --git a/diffusers/.github/ISSUE_TEMPLATE/feature_request.md b/diffusers/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000000000000000000000000000000000..42f93232c1de7c73dcd90cdb6b0733bbb4461508
--- /dev/null
+++ b/diffusers/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: "\U0001F680 Feature Request"
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...].
+
+**Describe the solution you'd like.**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered.**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context.**
+Add any other context or screenshots about the feature request here.
diff --git a/diffusers/.github/ISSUE_TEMPLATE/feedback.md b/diffusers/.github/ISSUE_TEMPLATE/feedback.md
new file mode 100644
index 0000000000000000000000000000000000000000..25808b6575a405694f64dbf1b5a0ece8e0fcd2e2
--- /dev/null
+++ b/diffusers/.github/ISSUE_TEMPLATE/feedback.md
@@ -0,0 +1,12 @@
+---
+name: "💬 Feedback about API Design"
+about: Give feedback about the current API design
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**What API design would you like to have changed or added to the library? Why?**
+
+**What use case would this enable or better enable? Can you give us a code example?**
diff --git a/diffusers/.github/ISSUE_TEMPLATE/new-model-addition.yml b/diffusers/.github/ISSUE_TEMPLATE/new-model-addition.yml
new file mode 100644
index 0000000000000000000000000000000000000000..432e287dd3348965466a696ee5e01a187f179ee5
--- /dev/null
+++ b/diffusers/.github/ISSUE_TEMPLATE/new-model-addition.yml
@@ -0,0 +1,31 @@
+name: "\U0001F31F New Model/Pipeline/Scheduler Addition"
+description: Submit a proposal/request to implement a new diffusion model/pipeline/scheduler
+labels: [ "New model/pipeline/scheduler" ]
+
+body:
+  - type: textarea
+    id: description-request
+    validations:
+      required: true
+    attributes:
+      label: Model/Pipeline/Scheduler description
+      description: |
+        Put any and all important information relative to the model/pipeline/scheduler
+
+  - type: checkboxes
+    id: information-tasks
+    attributes:
+      label: Open source status
+      description: |
+          Please note that if the model implementation isn't available or if the weights aren't open-source, we are less likely to implement it in `diffusers`.
+      options:
+        - label: "The model implementation is available."
+        - label: "The model weights are available (Only relevant if addition is not a scheduler)."
+
+  - type: textarea
+    id: additional-info
+    attributes:
+      label: Provide useful links for the implementation
+      description: |
+        Please provide information regarding the implementation, the weights, and the authors.
+        Please mention the authors by @gh-username if you're aware of their usernames.
diff --git a/diffusers/.github/ISSUE_TEMPLATE/translate.md b/diffusers/.github/ISSUE_TEMPLATE/translate.md
new file mode 100644
index 0000000000000000000000000000000000000000..3471ec9640d727e7cdf223852d2012834660e88a
--- /dev/null
+++ b/diffusers/.github/ISSUE_TEMPLATE/translate.md
@@ -0,0 +1,29 @@
+---
+name: 🌐 Translating a New Language?
+about: Start a new translation effort in your language
+title: '[<languageCode>] Translating docs to <languageName>'
+labels: WIP
+assignees: ''
+
+---
+
+<!--
+Note: Please search to see if an issue already exists for the language you are trying to translate.
+-->
+
+Hi!
+
+Let's bring the documentation to all the <languageName>-speaking community 🌐.
+
+Who would want to translate? Please follow the 🤗 [TRANSLATING guide](https://github.com/huggingface/diffusers/blob/main/docs/TRANSLATING.md). Here is a list of the files ready for translation. Let us know in this issue if you'd like to translate any, and we'll add your name to the list.
+
+Some notes:
+
+* Please translate using an informal tone (imagine you are talking with a friend about Diffusers 🤗).
+* Please translate in a gender-neutral way.
+* Add your translations to the folder called `<languageCode>` inside the [source folder](https://github.com/huggingface/diffusers/tree/main/docs/source).
+* Register your translation in `<languageCode>/_toctree.yml`; please follow the order of the [English version](https://github.com/huggingface/diffusers/blob/main/docs/source/en/_toctree.yml).
+* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu for review.
+* 🙋 If you'd like others to help you with the translation, you can also post in the 🤗 [forums](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63).
+
+Thank you so much for your help! 🤗
diff --git a/diffusers/.github/PULL_REQUEST_TEMPLATE.md b/diffusers/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000000000000000000000000000000000000..e4b2b45a4ecde200e65e3a2e16ac0cdb0b87d7c3
--- /dev/null
+++ b/diffusers/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,61 @@
+# What does this PR do?
+
+<!--
+Congratulations! You've made it this far! You're not quite done yet though.
+
+Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution.
+
+Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.
+
+Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost.
+-->
+
+<!-- Remove if not applicable -->
+
+Fixes # (issue)
+
+
+## Before submitting
+- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
+- [ ] Did you read the [contributor guideline](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md)?
+- [ ] Did you read our [philosophy doc](https://github.com/huggingface/diffusers/blob/main/PHILOSOPHY.md) (important for complex PRs)?
+- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63)? Please add a link to it if that's the case.
+- [ ] Did you make sure to update the documentation with your changes? Here are the
+      [documentation guidelines](https://github.com/huggingface/diffusers/tree/main/docs), and
+      [here are tips on formatting docstrings](https://github.com/huggingface/diffusers/tree/main/docs#writing-source-documentation).
+- [ ] Did you write any new necessary tests?
+
+
+## Who can review?
+
+Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
+members/contributors who may be interested in your PR.
+
+<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @.
+
+ If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
+ Please tag fewer than 3 people.
+
+Core library:
+
+- Schedulers: @yiyixuxu
+- Pipelines and pipeline callbacks: @yiyixuxu and @asomoza
+- Training examples: @sayakpaul
+- Docs: @stevhliu and @sayakpaul
+- JAX and MPS: @pcuenca
+- Audio: @sanchit-gandhi
+- General functionalities: @sayakpaul @yiyixuxu @DN6
+
+Integrations:
+
+- deepspeed: HF Trainer/Accelerate: @SunMarc
+- PEFT: @sayakpaul @BenjaminBossan
+
+HF projects:
+
+- accelerate: [different repo](https://github.com/huggingface/accelerate)
+- datasets: [different repo](https://github.com/huggingface/datasets)
+- transformers: [different repo](https://github.com/huggingface/transformers)
+- safetensors: [different repo](https://github.com/huggingface/safetensors)
+
+-->
diff --git a/diffusers/.github/actions/setup-miniconda/action.yml b/diffusers/.github/actions/setup-miniconda/action.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b1f4f194bfe1fd14e03239269e466e7978e3d5c5
--- /dev/null
+++ b/diffusers/.github/actions/setup-miniconda/action.yml
@@ -0,0 +1,146 @@
+name: Set up conda environment for testing
+
+description: Sets up miniconda in your ${RUNNER_TEMP} environment and gives you the ${CONDA_RUN} environment variable so you don't have to worry about polluting non-empeheral runners anymore
+
+inputs:
+  python-version:
+    description: If set to any value, don't use sudo to clean the workspace
+    required: false
+    type: string
+    default: "3.9"
+  miniconda-version:
+    description: Miniconda version to install
+    required: false
+    type: string
+    default: "4.12.0"
+  environment-file:
+    description: Environment file to install dependencies from
+    required: false
+    type: string
+    default: ""
+
+runs:
+  using: composite
+  steps:
+      # Use the same trick from https://github.com/marketplace/actions/setup-miniconda
+      # to refresh the cache daily. This is kind of optional though
+      - name: Get date
+        id: get-date
+        shell: bash
+        run: echo "today=$(/bin/date -u '+%Y%m%d')d" >> $GITHUB_OUTPUT
+      - name: Setup miniconda cache
+        id: miniconda-cache
+        uses: actions/cache@v2
+        with:
+          path: ${{ runner.temp }}/miniconda
+          key: miniconda-${{ runner.os }}-${{ runner.arch }}-${{ inputs.python-version }}-${{ steps.get-date.outputs.today }}
+      - name: Install miniconda (${{ inputs.miniconda-version }})
+        if: steps.miniconda-cache.outputs.cache-hit != 'true'
+        env:
+          MINICONDA_VERSION: ${{ inputs.miniconda-version }}
+        shell: bash -l {0}
+        run: |
+          MINICONDA_INSTALL_PATH="${RUNNER_TEMP}/miniconda"
+          mkdir -p "${MINICONDA_INSTALL_PATH}"
+          case ${RUNNER_OS}-${RUNNER_ARCH} in
+            Linux-X64)
+              MINICONDA_ARCH="Linux-x86_64"
+              ;;
+            macOS-ARM64)
+              MINICONDA_ARCH="MacOSX-arm64"
+              ;;
+            macOS-X64)
+              MINICONDA_ARCH="MacOSX-x86_64"
+              ;;
+            *)
+            echo "::error::Platform ${RUNNER_OS}-${RUNNER_ARCH} currently unsupported using this action"
+              exit 1
+              ;;
+          esac
+          MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py39_${MINICONDA_VERSION}-${MINICONDA_ARCH}.sh"
+          curl -fsSL "${MINICONDA_URL}" -o "${MINICONDA_INSTALL_PATH}/miniconda.sh"
+          bash "${MINICONDA_INSTALL_PATH}/miniconda.sh" -b -u -p "${MINICONDA_INSTALL_PATH}"
+          rm -rf "${MINICONDA_INSTALL_PATH}/miniconda.sh"
+      - name: Update GitHub path to include miniconda install
+        shell: bash
+        run: |
+          MINICONDA_INSTALL_PATH="${RUNNER_TEMP}/miniconda"
+          echo "${MINICONDA_INSTALL_PATH}/bin" >> $GITHUB_PATH
+      - name: Setup miniconda env cache (with env file)
+        id: miniconda-env-cache-env-file
+        if: ${{ runner.os }} == 'macOS' && ${{ inputs.environment-file }} != ''
+        uses: actions/cache@v2
+        with:
+          path: ${{ runner.temp }}/conda-python-${{ inputs.python-version }}
+          key: miniconda-env-${{ runner.os }}-${{ runner.arch }}-${{ inputs.python-version }}-${{ steps.get-date.outputs.today }}-${{ hashFiles(inputs.environment-file) }}
+      - name: Setup miniconda env cache (without env file)
+        id: miniconda-env-cache
+        if: ${{ runner.os }} == 'macOS' && ${{ inputs.environment-file }} == ''
+        uses: actions/cache@v2
+        with:
+          path: ${{ runner.temp }}/conda-python-${{ inputs.python-version }}
+          key: miniconda-env-${{ runner.os }}-${{ runner.arch }}-${{ inputs.python-version }}-${{ steps.get-date.outputs.today }}
+      - name: Setup conda environment with python (v${{ inputs.python-version }})
+        if: steps.miniconda-env-cache-env-file.outputs.cache-hit != 'true' && steps.miniconda-env-cache.outputs.cache-hit != 'true'
+        shell: bash
+        env:
+          PYTHON_VERSION: ${{ inputs.python-version }}
+          ENV_FILE: ${{ inputs.environment-file }}
+        run: |
+          CONDA_BASE_ENV="${RUNNER_TEMP}/conda-python-${PYTHON_VERSION}"
+          ENV_FILE_FLAG=""
+          if [[ -f "${ENV_FILE}" ]]; then
+            ENV_FILE_FLAG="--file ${ENV_FILE}"
+          elif [[ -n "${ENV_FILE}" ]]; then
+            echo "::warning::Specified env file (${ENV_FILE}) not found, not going to include it"
+          fi
+          conda create \
+            --yes \
+            --prefix "${CONDA_BASE_ENV}" \
+            "python=${PYTHON_VERSION}" \
+            ${ENV_FILE_FLAG} \
+            cmake=3.22 \
+            conda-build=3.21 \
+            ninja=1.10 \
+            pkg-config=0.29 \
+            wheel=0.37
+      - name: Clone the base conda environment and update GitHub env
+        shell: bash
+        env:
+          PYTHON_VERSION: ${{ inputs.python-version }}
+          CONDA_BASE_ENV: ${{ runner.temp }}/conda-python-${{ inputs.python-version }}
+        run: |
+          CONDA_ENV="${RUNNER_TEMP}/conda_environment_${GITHUB_RUN_ID}"
+          conda create \
+            --yes \
+            --prefix "${CONDA_ENV}" \
+            --clone "${CONDA_BASE_ENV}"
+          # TODO: conda-build could not be cloned because it hardcodes the path, so it
+          # could not be cached
+          conda install --yes -p ${CONDA_ENV} conda-build=3.21
+          echo "CONDA_ENV=${CONDA_ENV}" >> "${GITHUB_ENV}"
+          echo "CONDA_RUN=conda run -p ${CONDA_ENV} --no-capture-output" >> "${GITHUB_ENV}"
+          echo "CONDA_BUILD=conda run -p ${CONDA_ENV} conda-build" >> "${GITHUB_ENV}"
+          echo "CONDA_INSTALL=conda install -p ${CONDA_ENV}" >> "${GITHUB_ENV}"
+      - name: Get disk space usage and throw an error for low disk space
+        shell: bash
+        run: |
+          echo "Print the available disk space for manual inspection"
+          df -h
+          # Set the minimum requirement space to 4GB
+          MINIMUM_AVAILABLE_SPACE_IN_GB=4
+          MINIMUM_AVAILABLE_SPACE_IN_KB=$(($MINIMUM_AVAILABLE_SPACE_IN_GB * 1024 * 1024))
+          # Use KB to avoid floating point warning like 3.1GB
+          df -k | tr -s ' ' | cut -d' ' -f 4,9 | while read -r LINE;
+          do
+            AVAIL=$(echo $LINE | cut -f1 -d' ')
+            MOUNT=$(echo $LINE | cut -f2 -d' ')
+            if [ "$MOUNT" = "/" ]; then
+              if [ "$AVAIL" -lt "$MINIMUM_AVAILABLE_SPACE_IN_KB" ]; then
+                echo "There is only ${AVAIL}KB free space left in $MOUNT, which is less than the minimum requirement of ${MINIMUM_AVAILABLE_SPACE_IN_KB}KB. Please help create an issue to PyTorch Release Engineering via https://github.com/pytorch/test-infra/issues and provide the link to the workflow run."
+                exit 1;
+              else
+                echo "There is ${AVAIL}KB free space left in $MOUNT, continue"
+              fi
+            fi
+          done
diff --git a/diffusers/.github/workflows/benchmark.yml b/diffusers/.github/workflows/benchmark.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d311c1c73f1105bfd7d942ba4a80c6d431ebf30a
--- /dev/null
+++ b/diffusers/.github/workflows/benchmark.yml
@@ -0,0 +1,67 @@
+name: Benchmarking tests
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM
+
+env:
+  DIFFUSERS_IS_CI: yes
+  HF_HUB_ENABLE_HF_TRANSFER: 1
+  HF_HOME: /mnt/cache
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+
+jobs:
+  torch_pipelines_cuda_benchmark_tests:
+    env:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_BENCHMARK }}
+    name: Torch Core Pipelines CUDA Benchmarking Tests
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    container:
+      image: diffusers/diffusers-pytorch-compile-cuda
+      options: --shm-size "16gb" --ipc host --gpus 0
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+          python -m uv pip install pandas peft
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Diffusers Benchmarking
+        env:
+            HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
+            BASE_PATH: benchmark_outputs
+        run: |
+          export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")
+          cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark_test_reports
+          path: benchmarks/benchmark_outputs
+
+      - name: Report success status
+        if: ${{ success() }}
+        run: |
+          pip install requests && python utils/notify_benchmarking_status.py --status=success
+
+      - name: Report failure status
+        if: ${{ failure() }}
+        run: |
+          pip install requests && python utils/notify_benchmarking_status.py --status=failure
\ No newline at end of file
diff --git a/diffusers/.github/workflows/build_docker_images.yml b/diffusers/.github/workflows/build_docker_images.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9f4776db4315d9418a8a5bd07877f0a08f9ec2dd
--- /dev/null
+++ b/diffusers/.github/workflows/build_docker_images.yml
@@ -0,0 +1,103 @@
+name: Test, build, and push Docker images
+
+on:
+  pull_request: # During PRs, we just check if the changes Dockerfiles can be successfully built
+    branches:
+      - main
+    paths:
+      - "docker/**"
+  workflow_dispatch:
+  schedule:
+    - cron: "0 0 * * *" # every day at midnight
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  REGISTRY: diffusers
+  CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }}
+
+jobs:
+  test-build-docker-images:
+    runs-on:
+      group: aws-general-8-plus
+    if: github.event_name == 'pull_request'
+    steps:
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: Check out code
+        uses: actions/checkout@v3
+
+      - name: Find Changed Dockerfiles
+        id: file_changes
+        uses: jitterbit/get-changed-files@v1
+        with:
+          format: 'space-delimited'
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build Changed Docker Images
+        run: |
+          CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
+          for FILE in $CHANGED_FILES; do
+            if [[ "$FILE" == docker/*Dockerfile ]]; then
+              DOCKER_PATH="${FILE%/Dockerfile}"
+              DOCKER_TAG=$(basename "$DOCKER_PATH")
+              echo "Building Docker image for $DOCKER_TAG"
+              docker build -t "$DOCKER_TAG" "$DOCKER_PATH"
+            fi
+          done
+        if: steps.file_changes.outputs.all != ''
+
+  build-and-push-docker-images:
+    runs-on:
+      group: aws-general-8-plus
+    if: github.event_name != 'pull_request'
+
+    permissions:
+      contents: read
+      packages: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        image-name:
+          - diffusers-pytorch-cpu
+          - diffusers-pytorch-cuda
+          - diffusers-pytorch-compile-cuda
+          - diffusers-pytorch-xformers-cuda
+          - diffusers-flax-cpu
+          - diffusers-flax-tpu
+          - diffusers-onnxruntime-cpu
+          - diffusers-onnxruntime-cuda
+          - diffusers-doc-builder
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ env.REGISTRY }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Build and push
+        uses: docker/build-push-action@v3
+        with:
+          no-cache: true
+          context: ./docker/${{ matrix.image-name }}
+          push: true
+          tags: ${{ env.REGISTRY }}/${{ matrix.image-name }}:latest
+
+      - name: Post to a Slack channel
+        id: slack
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
+          title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build"
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
diff --git a/diffusers/.github/workflows/build_documentation.yml b/diffusers/.github/workflows/build_documentation.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6d4193e3cccc42e6824e9f0881a6b3e50bfa7173
--- /dev/null
+++ b/diffusers/.github/workflows/build_documentation.yml
@@ -0,0 +1,27 @@
+name: Build documentation
+
+on:
+  push:
+    branches:
+      - main
+      - doc-builder*
+      - v*-release
+      - v*-patch
+    paths:
+      - "src/diffusers/**.py"
+      - "examples/**"
+      - "docs/**"
+
+jobs:
+  build:
+    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+    with:
+      commit_sha: ${{ github.sha }}
+      install_libgl1: true
+      package: diffusers
+      notebook_folder: diffusers_doc
+      languages: en ko zh ja pt
+      custom_container: diffusers/diffusers-doc-builder
+    secrets:
+      token: ${{ secrets.HUGGINGFACE_PUSH }}
+      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
diff --git a/diffusers/.github/workflows/build_pr_documentation.yml b/diffusers/.github/workflows/build_pr_documentation.yml
new file mode 100644
index 0000000000000000000000000000000000000000..52e0757331639c7335132766cfc2afb2d74e0368
--- /dev/null
+++ b/diffusers/.github/workflows/build_pr_documentation.yml
@@ -0,0 +1,23 @@
+name: Build PR Documentation
+
+on:
+  pull_request:
+    paths:
+      - "src/diffusers/**.py"
+      - "examples/**"
+      - "docs/**"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+    with:
+      commit_sha: ${{ github.event.pull_request.head.sha }}
+      pr_number: ${{ github.event.number }}
+      install_libgl1: true
+      package: diffusers
+      languages: en ko zh ja pt
+      custom_container: diffusers/diffusers-doc-builder
diff --git a/diffusers/.github/workflows/mirror_community_pipeline.yml b/diffusers/.github/workflows/mirror_community_pipeline.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a7a2a809bbebc5b32aec5d3743d746dd0d7c794b
--- /dev/null
+++ b/diffusers/.github/workflows/mirror_community_pipeline.yml
@@ -0,0 +1,102 @@
+name: Mirror Community Pipeline
+
+on:
+  # Push changes on the main branch
+  push:
+    branches:
+      - main
+    paths:
+      - 'examples/community/**.py'
+
+    # And on tag creation (e.g. `v0.28.1`)
+    tags:
+      - '*'
+
+  # Manual trigger with ref input
+  workflow_dispatch:
+    inputs:
+      ref:
+        description: "Either 'main' or a tag ref"
+        required: true
+        default: 'main'
+
+jobs:
+  mirror_community_pipeline:
+    env:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_COMMUNITY_MIRROR }}
+
+    runs-on: ubuntu-latest
+    steps:
+      # Checkout to correct ref
+      #   If workflow dispatch
+      #     If ref is 'main', set:
+      #       CHECKOUT_REF=refs/heads/main
+      #       PATH_IN_REPO=main
+      #     Else it must be a tag. Set:
+      #       CHECKOUT_REF=refs/tags/{tag}
+      #       PATH_IN_REPO={tag}
+      #   If not workflow dispatch
+      #     If ref is 'refs/heads/main' => set 'main'
+      #     Else it must be a tag => set {tag}
+      - name: Set checkout_ref and path_in_repo
+        run: |
+          if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
+            if [ -z "${{ github.event.inputs.ref }}" ]; then
+              echo "Error: Missing ref input"
+              exit 1
+            elif [ "${{ github.event.inputs.ref }}" == "main" ]; then
+              echo "CHECKOUT_REF=refs/heads/main" >> $GITHUB_ENV
+              echo "PATH_IN_REPO=main" >> $GITHUB_ENV
+            else
+              echo "CHECKOUT_REF=refs/tags/${{ github.event.inputs.ref }}" >> $GITHUB_ENV
+              echo "PATH_IN_REPO=${{ github.event.inputs.ref }}" >> $GITHUB_ENV
+            fi
+          elif [ "${{ github.ref }}" == "refs/heads/main" ]; then
+            echo "CHECKOUT_REF=${{ github.ref }}" >> $GITHUB_ENV
+            echo "PATH_IN_REPO=main" >> $GITHUB_ENV
+          else
+            # e.g. refs/tags/v0.28.1 -> v0.28.1
+            echo "CHECKOUT_REF=${{ github.ref }}" >> $GITHUB_ENV
+            echo "PATH_IN_REPO=$(echo ${{ github.ref }} | sed 's/^refs\/tags\///')" >> $GITHUB_ENV
+          fi
+      - name: Print env vars
+        run: |
+          echo "CHECKOUT_REF: ${{ env.CHECKOUT_REF }}"
+          echo "PATH_IN_REPO: ${{ env.PATH_IN_REPO }}"
+      - uses: actions/checkout@v3
+        with:
+          ref: ${{ env.CHECKOUT_REF }}
+
+      # Setup + install dependencies
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install --upgrade huggingface_hub
+
+      # Check secret is set
+      - name: whoami
+        run: huggingface-cli whoami
+        env:
+            HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}
+
+      # Push to HF! (under subfolder based on checkout ref)
+      # https://huggingface.co/datasets/diffusers/community-pipelines-mirror
+      - name: Mirror community pipeline to HF
+        run: huggingface-cli upload diffusers/community-pipelines-mirror ./examples/community ${PATH_IN_REPO} --repo-type dataset
+        env:
+            PATH_IN_REPO: ${{ env.PATH_IN_REPO }}
+            HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}
+
+      - name: Report success status
+        if: ${{ success() }}
+        run: |
+          pip install requests && python utils/notify_community_pipelines_mirror.py --status=success
+
+      - name: Report failure status
+        if: ${{ failure() }}
+        run: |
+          pip install requests && python utils/notify_community_pipelines_mirror.py --status=failure
\ No newline at end of file
diff --git a/diffusers/.github/workflows/nightly_tests.yml b/diffusers/.github/workflows/nightly_tests.yml
new file mode 100644
index 0000000000000000000000000000000000000000..142dbb0f1e8fcddc0d27951904990edd00c40d9c
--- /dev/null
+++ b/diffusers/.github/workflows/nightly_tests.yml
@@ -0,0 +1,408 @@
+name: Nightly and release tests on main/release branch
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 0 * * *" # every day at midnight
+
+env:
+  DIFFUSERS_IS_CI: yes
+  HF_HUB_ENABLE_HF_TRANSFER: 1
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  PYTEST_TIMEOUT: 600
+  RUN_SLOW: yes
+  RUN_NIGHTLY: yes
+  PIPELINE_USAGE_CUTOFF: 5000
+  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+jobs:
+  setup_torch_cuda_pipeline_matrix:
+    name: Setup Torch Pipelines CUDA Slow Tests Matrix
+    runs-on:
+      group: aws-general-8-plus
+    container:
+      image: diffusers/diffusers-pytorch-cpu
+    outputs:
+      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: Install dependencies
+        run: |
+          pip install -e .[test]
+          pip install huggingface_hub
+      - name: Fetch Pipeline Matrix
+        id: fetch_pipeline_matrix
+        run: |
+          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
+          echo $matrix
+          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
+
+      - name: Pipeline Tests Artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-pipelines.json
+          path: reports
+
+  run_nightly_tests_for_torch_pipelines:
+    name: Nightly Torch Pipelines CUDA Tests
+    needs: setup_torch_cuda_pipeline_matrix
+    strategy:
+      fail-fast: false
+      max-parallel: 8
+      matrix:
+        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
+    runs-on:
+      group: aws-g4dn-2xlarge
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host --gpus 0
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: nvidia-smi
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+          python -m uv pip install pytest-reportlog
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Pipeline CUDA Test
+        env:
+          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+          CUBLAS_WORKSPACE_CONFIG: :16:8
+        run: |
+          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+            -s -v -k "not Flax and not Onnx" \
+            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
+            --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
+            tests/pipelines/${{ matrix.module }}
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: |
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: pipeline_${{ matrix.module }}_test_reports
+          path: reports
+      - name: Generate Report and Notify Channel
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  run_nightly_tests_for_other_torch_modules:
+    name: Nightly Torch CUDA Tests
+    runs-on:
+      group: aws-g4dn-2xlarge
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host --gpus 0
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      fail-fast: false
+      max-parallel: 2
+      matrix:
+        module: [models, schedulers, lora, others, single_file, examples]
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install pytest-reportlog
+    - name: Environment
+      run: python utils/print_env.py
+
+    - name: Run nightly PyTorch CUDA tests for non-pipeline modules
+      if: ${{ matrix.module != 'examples'}}
+      env:
+        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_torch_${{ matrix.module }}_cuda \
+          --report-log=tests_torch_${{ matrix.module }}_cuda.log \
+          tests/${{ matrix.module }}
+
+    - name: Run nightly example tests with Torch
+      if: ${{ matrix.module == 'examples' }}
+      env:
+        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v --make-reports=examples_torch_cuda \
+          --report-log=examples_torch_cuda.log \
+          examples/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
+        cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: torch_${{ matrix.module }}_cuda_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  run_flax_tpu_tests:
+    name: Nightly Flax TPU Tests
+    runs-on: docker-tpu
+    if: github.event_name == 'schedule'
+
+    container:
+      image: diffusers/diffusers-flax-tpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install pytest-reportlog
+
+    - name: Environment
+      run: python utils/print_env.py
+
+    - name: Run nightly Flax TPU tests
+      env:
+        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+      run: |
+        python -m pytest -n 0 \
+          -s -v -k "Flax" \
+          --make-reports=tests_flax_tpu \
+          --report-log=tests_flax_tpu.log \
+          tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_flax_tpu_stats.txt
+        cat reports/tests_flax_tpu_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: flax_tpu_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  run_nightly_onnx_tests:
+    name: Nightly ONNXRuntime CUDA tests on Ubuntu
+    runs-on:
+      group: aws-g4dn-2xlarge
+    container:
+      image: diffusers/diffusers-onnxruntime-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: nvidia-smi
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install pytest-reportlog
+    - name: Environment
+      run: python utils/print_env.py
+
+    - name: Run Nightly ONNXRuntime CUDA tests
+      env:
+        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_onnx_cuda \
+          --report-log=tests_onnx_cuda.log \
+          tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_onnx_cuda_stats.txt
+        cat reports/tests_onnx_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: tests_onnx_cuda_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+# M1 runner currently not well supported
+# TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
+#  run_nightly_tests_apple_m1:
+#    name: Nightly PyTorch MPS tests on MacOS
+#    runs-on: [ self-hosted, apple-m1 ]
+#    if: github.event_name == 'schedule'
+#
+#    steps:
+#      - name: Checkout diffusers
+#        uses: actions/checkout@v3
+#        with:
+#          fetch-depth: 2
+#
+#      - name: Clean checkout
+#        shell: arch -arch arm64 bash {0}
+#        run: |
+#          git clean -fxd
+#      - name: Setup miniconda
+#        uses: ./.github/actions/setup-miniconda
+#        with:
+#          python-version: 3.9
+#
+#      - name: Install dependencies
+#        shell: arch -arch arm64 bash {0}
+#        run: |
+#          ${CONDA_RUN} python -m pip install --upgrade pip uv
+#          ${CONDA_RUN} python -m uv pip install -e [quality,test]
+#          ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+#          ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+#          ${CONDA_RUN} python -m uv pip install pytest-reportlog
+#      - name: Environment
+#        shell: arch -arch arm64 bash {0}
+#        run: |
+#          ${CONDA_RUN} python utils/print_env.py
+#      - name: Run nightly PyTorch tests on M1 (MPS)
+#        shell: arch -arch arm64 bash {0}
+#        env:
+#          HF_HOME: /System/Volumes/Data/mnt/cache
+#          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+#        run: |
+#          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
+#            --report-log=tests_torch_mps.log \
+#            tests/
+#      - name: Failure short reports
+#        if: ${{ failure() }}
+#        run: cat reports/tests_torch_mps_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v4
+#        with:
+#          name: torch_mps_test_reports
+#          path: reports
+#
+#      - name: Generate Report and Notify Channel
+#        if: always()
+#        run: |
+#          pip install slack_sdk tabulate
+#          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY  run_nightly_tests_apple_m1:
+#    name: Nightly PyTorch MPS tests on MacOS
+#    runs-on: [ self-hosted, apple-m1 ]
+#    if: github.event_name == 'schedule'
+#
+#    steps:
+#      - name: Checkout diffusers
+#        uses: actions/checkout@v3
+#        with:
+#          fetch-depth: 2
+#
+#      - name: Clean checkout
+#        shell: arch -arch arm64 bash {0}
+#        run: |
+#          git clean -fxd
+#      - name: Setup miniconda
+#        uses: ./.github/actions/setup-miniconda
+#        with:
+#          python-version: 3.9
+#
+#      - name: Install dependencies
+#        shell: arch -arch arm64 bash {0}
+#        run: |
+#          ${CONDA_RUN} python -m pip install --upgrade pip uv
+#          ${CONDA_RUN} python -m uv pip install -e [quality,test]
+#          ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+#          ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+#          ${CONDA_RUN} python -m uv pip install pytest-reportlog
+#      - name: Environment
+#        shell: arch -arch arm64 bash {0}
+#        run: |
+#          ${CONDA_RUN} python utils/print_env.py
+#      - name: Run nightly PyTorch tests on M1 (MPS)
+#        shell: arch -arch arm64 bash {0}
+#        env:
+#          HF_HOME: /System/Volumes/Data/mnt/cache
+#          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+#        run: |
+#          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
+#            --report-log=tests_torch_mps.log \
+#            tests/
+#      - name: Failure short reports
+#        if: ${{ failure() }}
+#        run: cat reports/tests_torch_mps_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v4
+#        with:
+#          name: torch_mps_test_reports
+#          path: reports
+#
+#      - name: Generate Report and Notify Channel
+#        if: always()
+#        run: |
+#          pip install slack_sdk tabulate
+#          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
\ No newline at end of file
diff --git a/diffusers/.github/workflows/notify_slack_about_release.yml b/diffusers/.github/workflows/notify_slack_about_release.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f33e917f095cf64f0b8b640d65e66fb54849517a
--- /dev/null
+++ b/diffusers/.github/workflows/notify_slack_about_release.yml
@@ -0,0 +1,23 @@
+name: Notify Slack about a release
+
+on:
+  workflow_dispatch:
+  release:
+    types: [published]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.8'
+
+    - name: Notify Slack about the release
+      env:
+        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
+      run: pip install requests && python utils/notify_slack_about_release.py
diff --git a/diffusers/.github/workflows/pr_dependency_test.yml b/diffusers/.github/workflows/pr_dependency_test.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c4bd86112fb88556315d25ae9824287e1b5e0ee4
--- /dev/null
+++ b/diffusers/.github/workflows/pr_dependency_test.yml
@@ -0,0 +1,35 @@
+name: Run dependency tests
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "src/diffusers/**.py"
+  push:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  check_dependencies:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m pip install --upgrade pip uv
+          python -m uv pip install -e .
+          python -m uv pip install pytest
+      - name: Check for soft dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          pytest tests/others/test_dependencies.py
diff --git a/diffusers/.github/workflows/pr_flax_dependency_test.yml b/diffusers/.github/workflows/pr_flax_dependency_test.yml
new file mode 100644
index 0000000000000000000000000000000000000000..bbad729299177146ab68cd417ba603f3016104dd
--- /dev/null
+++ b/diffusers/.github/workflows/pr_flax_dependency_test.yml
@@ -0,0 +1,38 @@
+name: Run Flax dependency tests
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "src/diffusers/**.py"
+  push:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  check_flax_dependencies:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m pip install --upgrade pip uv
+          python -m uv pip install -e .
+          python -m uv pip install "jax[cpu]>=0.2.16,!=0.3.2"
+          python -m uv pip install "flax>=0.4.1"
+          python -m uv pip install "jaxlib>=0.1.65"
+          python -m uv pip install pytest
+      - name: Check for soft dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          pytest tests/others/test_dependencies.py
diff --git a/diffusers/.github/workflows/pr_test_fetcher.yml b/diffusers/.github/workflows/pr_test_fetcher.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b032bb842786a73761b9ef73dfc1cc87a4ab0e26
--- /dev/null
+++ b/diffusers/.github/workflows/pr_test_fetcher.yml
@@ -0,0 +1,177 @@
+name: Fast tests for PRs - Test Fetcher
+
+on: workflow_dispatch
+
+env:
+  DIFFUSERS_IS_CI: yes
+  OMP_NUM_THREADS: 4
+  MKL_NUM_THREADS: 4
+  PYTEST_TIMEOUT: 60
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  setup_pr_tests:
+    name: Setup PR Tests
+    runs-on:
+      group: aws-general-8-plus
+    container:
+      image: diffusers/diffusers-pytorch-cpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+    defaults:
+      run:
+        shell: bash
+    outputs:
+      matrix: ${{ steps.set_matrix.outputs.matrix }}
+      test_map: ${{ steps.set_matrix.outputs.test_map }}
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 0
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+    - name: Environment
+      run: |
+        python utils/print_env.py
+        echo $(git --version)
+    - name: Fetch Tests
+      run: |
+        python utils/tests_fetcher.py | tee test_preparation.txt
+    - name: Report fetched tests
+      uses: actions/upload-artifact@v3
+      with:
+        name: test_fetched
+        path: test_preparation.txt
+    - id: set_matrix
+      name: Create Test Matrix
+      # The `keys` is used as GitHub actions matrix for jobs, i.e. `models`, `pipelines`, etc.
+      # The `test_map` is used to get the actual identified test files under each key.
+      # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail)
+      run: |
+        if [ -f test_map.json ]; then
+            keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(json.dumps(d))')
+            test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(json.dumps(test_map))')
+        else
+            keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
+            test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
+        fi
+        echo $keys
+        echo $test_map
+        echo "matrix=$keys" >> $GITHUB_OUTPUT
+        echo "test_map=$test_map" >> $GITHUB_OUTPUT
+
+  run_pr_tests:
+    name: Run PR Tests
+    needs: setup_pr_tests
+    if: contains(fromJson(needs.setup_pr_tests.outputs.matrix), 'dummy') != true
+    strategy:
+      fail-fast: false
+      max-parallel: 2
+      matrix:
+        modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
+    runs-on:
+      group: aws-general-8-plus
+    container:
+      image: diffusers/diffusers-pytorch-cpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pip install -e [quality,test]
+        python -m pip install accelerate
+
+    - name: Environment
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python utils/print_env.py
+
+    - name: Run all selected tests on CPU
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.modules }}_tests_cpu ${{ fromJson(needs.setup_pr_tests.outputs.test_map)[matrix.modules] }}
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      continue-on-error: true
+      run: |
+        cat reports/${{ matrix.modules }}_tests_cpu_stats.txt
+        cat reports/${{ matrix.modules }}_tests_cpu_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v3
+      with:
+          name: ${{ matrix.modules }}_test_reports
+          path: reports
+
+  run_staging_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: Hub tests for models, schedulers, and pipelines
+            framework: hub_tests_pytorch
+            runner: aws-general-8-plus
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_hub
+
+    name: ${{ matrix.config.name }}
+    runs-on:
+      group: ${{ matrix.config.runner }}
+    container:
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pip install -e [quality,test]
+
+    - name: Environment
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python utils/print_env.py
+
+    - name: Run Hub tests for models, schedulers, and pipelines on a staging env
+      if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        HUGGINGFACE_CO_STAGING=true python -m pytest \
+          -m "is_staging_test" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: pr_${{ matrix.config.report }}_test_reports
+        path: reports
diff --git a/diffusers/.github/workflows/pr_test_peft_backend.yml b/diffusers/.github/workflows/pr_test_peft_backend.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0433067e54bad832d798ed1434d3764154ce6753
--- /dev/null
+++ b/diffusers/.github/workflows/pr_test_peft_backend.yml
@@ -0,0 +1,132 @@
+name: Fast tests for PRs - PEFT backend
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "src/diffusers/**.py"
+      - "tests/**.py"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  DIFFUSERS_IS_CI: yes
+  OMP_NUM_THREADS: 4
+  MKL_NUM_THREADS: 4
+  PYTEST_TIMEOUT: 60
+
+jobs:
+  check_code_quality:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check quality
+        run: make quality
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
+
+  check_repository_consistency:
+    needs: check_code_quality
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check repo consistency
+        run: |
+          python utils/check_copies.py
+          python utils/check_dummies.py
+          make deps_table_check_updated
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
+
+  run_fast_tests:
+    needs: [check_code_quality, check_repository_consistency]
+    strategy:
+      fail-fast: false
+      matrix:
+        lib-versions: ["main", "latest"]
+
+
+    name: LoRA - ${{ matrix.lib-versions }}
+
+    runs-on:
+      group: aws-general-8-plus
+
+    container:
+      image: diffusers/diffusers-pytorch-cpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        if [ "${{ matrix.lib-versions }}" == "main" ]; then
+            python -m pip install -U peft@git+https://github.com/huggingface/peft.git
+            python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git
+            pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        else
+            python -m uv pip install -U peft transformers accelerate
+        fi
+
+    - name: Environment
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python utils/print_env.py
+
+    - name: Run fast PyTorch LoRA CPU tests with PEFT backend
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          -s -v \
+          --make-reports=tests_${{ matrix.lib-versions }} \
+          tests/lora/
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          -s -v \
+          --make-reports=tests_models_lora_${{ matrix.lib-versions }} \
+          tests/models/ -k "lora"
+
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_${{ matrix.lib-versions }}_failures_short.txt
+        cat reports/tests_models_lora_${{ matrix.lib-versions }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: pr_${{ matrix.lib-versions }}_test_reports
+        path: reports
diff --git a/diffusers/.github/workflows/pr_tests.yml b/diffusers/.github/workflows/pr_tests.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6bb67976b170cdaab0323508db37ee939e57c4af
--- /dev/null
+++ b/diffusers/.github/workflows/pr_tests.yml
@@ -0,0 +1,236 @@
+name: Fast tests for PRs
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "src/diffusers/**.py"
+      - "benchmarks/**.py"
+      - "examples/**.py"
+      - "scripts/**.py"
+      - "tests/**.py"
+      - ".github/**.yml"
+      - "utils/**.py"
+  push:
+    branches:
+      - ci-*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  DIFFUSERS_IS_CI: yes
+  HF_HUB_ENABLE_HF_TRANSFER: 1
+  OMP_NUM_THREADS: 4
+  MKL_NUM_THREADS: 4
+  PYTEST_TIMEOUT: 60
+
+jobs:
+  check_code_quality:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check quality
+        run: make quality
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
+
+  check_repository_consistency:
+    needs: check_code_quality
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check repo consistency
+        run: |
+          python utils/check_copies.py
+          python utils/check_dummies.py
+          make deps_table_check_updated
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
+
+  run_fast_tests:
+    needs: [check_code_quality, check_repository_consistency]
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: Fast PyTorch Pipeline CPU tests
+            framework: pytorch_pipelines
+            runner: aws-highmemory-32-plus
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_cpu_pipelines
+          - name: Fast PyTorch Models & Schedulers CPU tests
+            framework: pytorch_models
+            runner: aws-general-8-plus
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_cpu_models_schedulers
+          - name: Fast Flax CPU tests
+            framework: flax
+            runner: aws-general-8-plus
+            image: diffusers/diffusers-flax-cpu
+            report: flax_cpu
+          - name: PyTorch Example CPU tests
+            framework: pytorch_examples
+            runner: aws-general-8-plus
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_example_cpu
+
+    name: ${{ matrix.config.name }}
+
+    runs-on:
+      group: ${{ matrix.config.runner }}
+
+    container:
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate
+
+    - name: Environment
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python utils/print_env.py
+
+    - name: Run fast PyTorch Pipeline CPU tests
+      if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/pipelines
+
+    - name: Run fast PyTorch Model Scheduler CPU tests
+      if: ${{ matrix.config.framework == 'pytorch_models' }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx and not Dependency" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/models tests/schedulers tests/others
+
+    - name: Run fast Flax TPU tests
+      if: ${{ matrix.config.framework == 'flax' }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Flax" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests
+
+    - name: Run example PyTorch CPU tests
+      if: ${{ matrix.config.framework == 'pytorch_examples' }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install peft timm
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          --make-reports=tests_${{ matrix.config.report }} \
+          examples
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: pr_${{ matrix.config.framework }}_${{ matrix.config.report }}_test_reports
+        path: reports
+
+  run_staging_tests:
+    needs: [check_code_quality, check_repository_consistency]
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: Hub tests for models, schedulers, and pipelines
+            framework: hub_tests_pytorch
+            runner:
+              group: aws-general-8-plus
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_hub
+
+    name: ${{ matrix.config.name }}
+
+    runs-on: ${{ matrix.config.runner }}
+
+    container:
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+
+    - name: Environment
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python utils/print_env.py
+
+    - name: Run Hub tests for models, schedulers, and pipelines on a staging env
+      if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        HUGGINGFACE_CO_STAGING=true python -m pytest \
+          -m "is_staging_test" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: pr_${{ matrix.config.report }}_test_reports
+        path: reports
diff --git a/diffusers/.github/workflows/pr_torch_dependency_test.yml b/diffusers/.github/workflows/pr_torch_dependency_test.yml
new file mode 100644
index 0000000000000000000000000000000000000000..16a7724fe744beb2d0e173fa85ad7ab1075f49ab
--- /dev/null
+++ b/diffusers/.github/workflows/pr_torch_dependency_test.yml
@@ -0,0 +1,36 @@
+name: Run Torch dependency tests
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "src/diffusers/**.py"
+  push:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  check_torch_dependencies:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m pip install --upgrade pip uv
+          python -m uv pip install -e .
+          python -m uv pip install torch torchvision torchaudio
+          python -m uv pip install pytest
+      - name: Check for soft dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          pytest tests/others/test_dependencies.py
diff --git a/diffusers/.github/workflows/push_tests.yml b/diffusers/.github/workflows/push_tests.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f07e6cda0d59b8d73b4a0e28f543a283ab346f7f
--- /dev/null
+++ b/diffusers/.github/workflows/push_tests.yml
@@ -0,0 +1,391 @@
+name: Fast GPU Tests on main
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+    paths:
+      - "src/diffusers/**.py"
+      - "examples/**.py"
+      - "tests/**.py"
+
+env:
+  DIFFUSERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  HF_HUB_ENABLE_HF_TRANSFER: 1
+  PYTEST_TIMEOUT: 600
+  PIPELINE_USAGE_CUTOFF: 50000
+
+jobs:
+  setup_torch_cuda_pipeline_matrix:
+    name: Setup Torch Pipelines CUDA Slow Tests Matrix
+    runs-on:
+      group: aws-general-8-plus
+    container:
+      image: diffusers/diffusers-pytorch-cpu
+    outputs:
+      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Fetch Pipeline Matrix
+        id: fetch_pipeline_matrix
+        run: |
+          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
+          echo $matrix
+          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
+      - name: Pipeline Tests Artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-pipelines.json
+          path: reports
+
+  torch_pipelines_cuda_tests:
+    name: Torch Pipelines CUDA Tests
+    needs: setup_torch_cuda_pipeline_matrix
+    strategy:
+      fail-fast: false
+      max-parallel: 8
+      matrix:
+        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
+    runs-on:
+      group: aws-g4dn-2xlarge
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host --gpus 0
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Slow PyTorch CUDA checkpoint tests on Ubuntu
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+          CUBLAS_WORKSPACE_CONFIG: :16:8
+        run: |
+          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+            -s -v -k "not Flax and not Onnx" \
+            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
+            tests/pipelines/${{ matrix.module }}
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: |
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: pipeline_${{ matrix.module }}_test_reports
+          path: reports
+
+  torch_cuda_tests:
+    name: Torch CUDA Tests
+    runs-on:
+      group: aws-g4dn-2xlarge
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host --gpus 0
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      fail-fast: false
+      max-parallel: 2
+      matrix:
+        module: [models, schedulers, lora, others, single_file]
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run PyTorch CUDA tests
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_torch_cuda_${{ matrix.module }} \
+          tests/${{ matrix.module }}
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_torch_cuda_${{ matrix.module }}_stats.txt
+        cat reports/tests_torch_cuda_${{ matrix.module }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: torch_cuda_test_reports_${{ matrix.module }}
+        path: reports
+
+  flax_tpu_tests:
+    name: Flax TPU Tests
+    runs-on: docker-tpu
+    container:
+      image: diffusers/diffusers-flax-tpu
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run slow Flax TPU tests
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        python -m pytest -n 0 \
+          -s -v -k "Flax" \
+          --make-reports=tests_flax_tpu \
+          tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_flax_tpu_stats.txt
+        cat reports/tests_flax_tpu_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: flax_tpu_test_reports
+        path: reports
+
+  onnx_cuda_tests:
+    name: ONNX CUDA Tests
+    runs-on:
+      group: aws-g4dn-2xlarge
+    container:
+      image: diffusers/diffusers-onnxruntime-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run slow ONNXRuntime CUDA tests
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_onnx_cuda \
+          tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_onnx_cuda_stats.txt
+        cat reports/tests_onnx_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: onnx_cuda_test_reports
+        path: reports
+
+  run_torch_compile_tests:
+    name: PyTorch Compile CUDA tests
+
+    runs-on:
+      group: aws-g4dn-2xlarge
+
+    container:
+      image: diffusers/diffusers-pytorch-compile-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test,training]
+    - name: Environment
+      run: |
+        python utils/print_env.py
+    - name: Run example tests on GPU
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        RUN_COMPILE: yes
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_compile_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: torch_compile_test_reports
+        path: reports
+
+  run_xformers_tests:
+    name: PyTorch xformers CUDA tests
+
+    runs-on:
+      group: aws-g4dn-2xlarge
+
+    container:
+      image: diffusers/diffusers-pytorch-xformers-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test,training]
+    - name: Environment
+      run: |
+        python utils/print_env.py
+    - name: Run example tests on GPU
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_xformers_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: torch_xformers_test_reports
+        path: reports
+
+  run_examples_tests:
+    name: Examples PyTorch CUDA tests on Ubuntu
+
+    runs-on:
+      group: aws-g4dn-2xlarge
+
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test,training]
+
+    - name: Environment
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python utils/print_env.py
+
+    - name: Run example tests on GPU
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install timm
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/examples_torch_cuda_stats.txt
+        cat reports/examples_torch_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: examples_test_reports
+        path: reports
diff --git a/diffusers/.github/workflows/push_tests_fast.yml b/diffusers/.github/workflows/push_tests_fast.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e8a73446de737999a05b7d3e829e5a4832425bce
--- /dev/null
+++ b/diffusers/.github/workflows/push_tests_fast.yml
@@ -0,0 +1,126 @@
+name: Fast tests on main
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "src/diffusers/**.py"
+      - "examples/**.py"
+      - "tests/**.py"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  DIFFUSERS_IS_CI: yes
+  HF_HOME: /mnt/cache
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  HF_HUB_ENABLE_HF_TRANSFER: 1
+  PYTEST_TIMEOUT: 600
+  RUN_SLOW: no
+
+jobs:
+  run_fast_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: Fast PyTorch CPU tests on Ubuntu
+            framework: pytorch
+            runner: aws-general-8-plus
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_cpu
+          - name: Fast Flax CPU tests on Ubuntu
+            framework: flax
+            runner: aws-general-8-plus
+            image: diffusers/diffusers-flax-cpu
+            report: flax_cpu
+          - name: Fast ONNXRuntime CPU tests on Ubuntu
+            framework: onnxruntime
+            runner: aws-general-8-plus
+            image: diffusers/diffusers-onnxruntime-cpu
+            report: onnx_cpu
+          - name: PyTorch Example CPU tests on Ubuntu
+            framework: pytorch_examples
+            runner: aws-general-8-plus
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_example_cpu
+
+    name: ${{ matrix.config.name }}
+
+    runs-on:
+      group: ${{ matrix.config.runner }}
+
+    container:
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+
+    - name: Environment
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python utils/print_env.py
+
+    - name: Run fast PyTorch CPU tests
+      if: ${{ matrix.config.framework == 'pytorch' }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run fast Flax TPU tests
+      if: ${{ matrix.config.framework == 'flax' }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Flax" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run fast ONNXRuntime CPU tests
+      if: ${{ matrix.config.framework == 'onnxruntime' }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run example PyTorch CPU tests
+      if: ${{ matrix.config.framework == 'pytorch_examples' }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install peft timm
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          --make-reports=tests_${{ matrix.config.report }} \
+          examples
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: pr_${{ matrix.config.report }}_test_reports
+        path: reports
diff --git a/diffusers/.github/workflows/push_tests_mps.yml b/diffusers/.github/workflows/push_tests_mps.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8d521074a08fdf05c457479daa06f5f99bad9faa
--- /dev/null
+++ b/diffusers/.github/workflows/push_tests_mps.yml
@@ -0,0 +1,76 @@
+name: Fast mps tests on main
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "src/diffusers/**.py"
+      - "tests/**.py"
+
+env:
+  DIFFUSERS_IS_CI: yes
+  HF_HOME: /mnt/cache
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  HF_HUB_ENABLE_HF_TRANSFER: 1
+  PYTEST_TIMEOUT: 600
+  RUN_SLOW: no
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  run_fast_tests_apple_m1:
+    name: Fast PyTorch MPS tests on MacOS
+    runs-on: macos-13-xlarge
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Clean checkout
+      shell: arch -arch arm64 bash {0}
+      run: |
+        git clean -fxd
+
+    - name: Setup miniconda
+      uses: ./.github/actions/setup-miniconda
+      with:
+        python-version: 3.9
+
+    - name: Install dependencies
+      shell: arch -arch arm64 bash {0}
+      run: |
+        ${CONDA_RUN} python -m pip install --upgrade pip uv
+        ${CONDA_RUN} python -m uv pip install -e [quality,test]
+        ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio
+        ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        ${CONDA_RUN} python -m uv pip install transformers --upgrade
+
+    - name: Environment
+      shell: arch -arch arm64 bash {0}
+      run: |
+        ${CONDA_RUN} python utils/print_env.py
+
+    - name: Run fast PyTorch tests on M1 (MPS)
+      shell: arch -arch arm64 bash {0}
+      env:
+        HF_HOME: /System/Volumes/Data/mnt/cache
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_mps_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: pr_torch_mps_test_reports
+        path: reports
diff --git a/diffusers/.github/workflows/pypi_publish.yaml b/diffusers/.github/workflows/pypi_publish.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6b09f60a352324da6ee0ed72aeade2c4ff3d9aeb
--- /dev/null
+++ b/diffusers/.github/workflows/pypi_publish.yaml
@@ -0,0 +1,81 @@
+# Adapted from https://blog.deepjyoti30.dev/pypi-release-github-action
+
+name: PyPI release
+
+on:
+  workflow_dispatch:
+  push:
+    tags:
+      - "*"
+
+jobs:
+  find-and-checkout-latest-branch:
+    runs-on: ubuntu-latest
+    outputs:
+      latest_branch: ${{ steps.set_latest_branch.outputs.latest_branch }}
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.8'
+
+      - name: Fetch latest branch
+        id: fetch_latest_branch
+        run: |
+          pip install -U requests packaging
+          LATEST_BRANCH=$(python utils/fetch_latest_release_branch.py)
+          echo "Latest branch: $LATEST_BRANCH"
+          echo "latest_branch=$LATEST_BRANCH" >> $GITHUB_ENV
+
+      - name: Set latest branch output
+        id: set_latest_branch
+        run: echo "::set-output name=latest_branch::${{ env.latest_branch }}"
+
+  release:
+    needs: find-and-checkout-latest-branch
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ needs.find-and-checkout-latest-branch.outputs.latest_branch }}
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -U setuptools wheel twine
+          pip install -U torch --index-url https://download.pytorch.org/whl/cpu
+          pip install -U transformers
+
+      - name: Build the dist files
+        run: python setup.py bdist_wheel && python setup.py sdist
+
+      - name: Publish to the test PyPI
+        env:
+          TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
+        run: twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
+
+      - name: Test installing diffusers and importing
+        run: |
+          pip install diffusers && pip uninstall diffusers -y
+          pip install -i https://testpypi.python.org/pypi diffusers
+          python -c "from diffusers import __version__; print(__version__)"
+          python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('fusing/unet-ldm-dummy-update'); pipe()"
+          python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('hf-internal-testing/tiny-stable-diffusion-pipe', safety_checker=None); pipe('ah suh du')"
+          python -c "from diffusers import *"
+
+      - name: Publish to PyPI
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: twine upload dist/* -r pypi
diff --git a/diffusers/.github/workflows/release_tests_fast.yml b/diffusers/.github/workflows/release_tests_fast.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a8a6f2699dca8a2ae02dd5c8dc9546efdc1e0f5c
--- /dev/null
+++ b/diffusers/.github/workflows/release_tests_fast.yml
@@ -0,0 +1,389 @@
+# Duplicate workflow to push_tests.yml that is meant to run on release/patch branches as a final check
+# Creating a duplicate workflow here is simpler than adding complex path/branch parsing logic to push_tests.yml
+# Needs to be updated if push_tests.yml updated
+name: (Release) Fast GPU Tests on main
+
+on:
+  push:
+    branches:
+      - "v*.*.*-release"
+      - "v*.*.*-patch"
+
+env:
+  DIFFUSERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  PYTEST_TIMEOUT: 600
+  PIPELINE_USAGE_CUTOFF: 50000
+
+jobs:
+  setup_torch_cuda_pipeline_matrix:
+    name: Setup Torch Pipelines CUDA Slow Tests Matrix
+    runs-on:
+      group: aws-general-8-plus
+    container:
+      image: diffusers/diffusers-pytorch-cpu
+    outputs:
+      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Fetch Pipeline Matrix
+        id: fetch_pipeline_matrix
+        run: |
+          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
+          echo $matrix
+          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
+      - name: Pipeline Tests Artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-pipelines.json
+          path: reports
+
+  torch_pipelines_cuda_tests:
+    name: Torch Pipelines CUDA Tests
+    needs: setup_torch_cuda_pipeline_matrix
+    strategy:
+      fail-fast: false
+      max-parallel: 8
+      matrix:
+        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
+    runs-on:
+      group: aws-g4dn-2xlarge
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host --gpus 0
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Slow PyTorch CUDA checkpoint tests on Ubuntu
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+          CUBLAS_WORKSPACE_CONFIG: :16:8
+        run: |
+          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+            -s -v -k "not Flax and not Onnx" \
+            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
+            tests/pipelines/${{ matrix.module }}
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: |
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: pipeline_${{ matrix.module }}_test_reports
+          path: reports
+
+  torch_cuda_tests:
+    name: Torch CUDA Tests
+    runs-on:
+      group: aws-g4dn-2xlarge
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host --gpus 0
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      fail-fast: false
+      max-parallel: 2
+      matrix:
+        module: [models, schedulers, lora, others, single_file]
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run PyTorch CUDA tests
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_torch_${{ matrix.module }}_cuda \
+          tests/${{ matrix.module }}
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
+        cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: torch_cuda_${{ matrix.module }}_test_reports
+        path: reports
+
+  flax_tpu_tests:
+    name: Flax TPU Tests
+    runs-on: docker-tpu
+    container:
+      image: diffusers/diffusers-flax-tpu
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run slow Flax TPU tests
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        python -m pytest -n 0 \
+          -s -v -k "Flax" \
+          --make-reports=tests_flax_tpu \
+          tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_flax_tpu_stats.txt
+        cat reports/tests_flax_tpu_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: flax_tpu_test_reports
+        path: reports
+
+  onnx_cuda_tests:
+    name: ONNX CUDA Tests
+    runs-on:
+      group: aws-g4dn-2xlarge
+    container:
+      image: diffusers/diffusers-onnxruntime-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run slow ONNXRuntime CUDA tests
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_onnx_cuda \
+          tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_onnx_cuda_stats.txt
+        cat reports/tests_onnx_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: onnx_cuda_test_reports
+        path: reports
+
+  run_torch_compile_tests:
+    name: PyTorch Compile CUDA tests
+
+    runs-on:
+      group: aws-g4dn-2xlarge
+
+    container:
+      image: diffusers/diffusers-pytorch-compile-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test,training]
+    - name: Environment
+      run: |
+        python utils/print_env.py
+    - name: Run example tests on GPU
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        RUN_COMPILE: yes
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_compile_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: torch_compile_test_reports
+        path: reports
+
+  run_xformers_tests:
+    name: PyTorch xformers CUDA tests
+
+    runs-on:
+      group: aws-g4dn-2xlarge
+
+    container:
+      image: diffusers/diffusers-pytorch-xformers-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test,training]
+    - name: Environment
+      run: |
+        python utils/print_env.py
+    - name: Run example tests on GPU
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_xformers_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: torch_xformers_test_reports
+        path: reports
+
+  run_examples_tests:
+    name: Examples PyTorch CUDA tests on Ubuntu
+
+    runs-on:
+      group: aws-g4dn-2xlarge
+
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test,training]
+
+    - name: Environment
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python utils/print_env.py
+
+    - name: Run example tests on GPU
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install timm
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/examples_torch_cuda_stats.txt
+        cat reports/examples_torch_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: examples_test_reports
+        path: reports
diff --git a/diffusers/.github/workflows/run_tests_from_a_pr.yml b/diffusers/.github/workflows/run_tests_from_a_pr.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1e736e5430891ec5b1b7843ba2deadddadb9dca1
--- /dev/null
+++ b/diffusers/.github/workflows/run_tests_from_a_pr.yml
@@ -0,0 +1,74 @@
+name: Check running SLOW tests from a PR (only GPU)
+
+on:
+  workflow_dispatch:
+    inputs:
+      docker_image:
+        default: 'diffusers/diffusers-pytorch-cuda'
+        description: 'Name of the Docker image'
+        required: true
+      branch:
+        description: 'PR Branch to test on'
+        required: true
+      test:
+        description: 'Tests to run (e.g.: `tests/models`).'
+        required: true
+
+env:
+  DIFFUSERS_IS_CI: yes
+  IS_GITHUB_CI: "1"
+  HF_HOME: /mnt/cache
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  PYTEST_TIMEOUT: 600
+  RUN_SLOW: yes
+
+jobs:
+  run_tests:
+    name: "Run a test on our runner from a PR"
+    runs-on:
+      group: aws-g4dn-2xlarge
+    container:
+      image: ${{ github.event.inputs.docker_image }}
+      options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+
+    steps:
+      - name: Validate test files input
+        id: validate_test_files
+        env:
+          PY_TEST: ${{ github.event.inputs.test }}
+        run: |
+          if [[ ! "$PY_TEST" =~ ^tests/ ]]; then
+            echo "Error: The input string must start with 'tests/'."
+            exit 1
+          fi
+
+          if [[ ! "$PY_TEST" =~ ^tests/(models|pipelines) ]]; then
+            echo "Error: The input string must contain either 'models' or 'pipelines' after 'tests/'."
+            exit 1
+          fi
+
+          if [[ "$PY_TEST" == *";"* ]]; then
+            echo "Error: The input string must not contain ';'."
+            exit 1
+          fi
+          echo "$PY_TEST"
+
+      - name: Checkout PR branch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+
+
+      - name: Install pytest
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+          python -m uv pip install peft
+
+      - name: Run tests
+        env:
+            PY_TEST: ${{ github.event.inputs.test }}
+        run: |
+          pytest "$PY_TEST"
diff --git a/diffusers/.github/workflows/ssh-pr-runner.yml b/diffusers/.github/workflows/ssh-pr-runner.yml
new file mode 100644
index 0000000000000000000000000000000000000000..49fa9c0ad24d65634a891496ac93bc93e9e24378
--- /dev/null
+++ b/diffusers/.github/workflows/ssh-pr-runner.yml
@@ -0,0 +1,40 @@
+name: SSH into PR runners
+
+on:
+  workflow_dispatch:
+    inputs:
+      docker_image:
+        description: 'Name of the Docker image'
+        required: true
+
+env:
+  IS_GITHUB_CI: "1"
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  HF_HOME: /mnt/cache
+  DIFFUSERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: yes
+
+jobs:
+  ssh_runner:
+    name: "SSH"
+    runs-on:
+      group: aws-highmemory-32-plus
+    container:
+      image: ${{ github.event.inputs.docker_image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --privileged
+
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - name: Tailscale # In order to be able to SSH when a test fails
+        uses: huggingface/tailscale-action@main
+        with:
+          authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
+          slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
+          slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          waitForSSH: true
diff --git a/diffusers/.github/workflows/ssh-runner.yml b/diffusers/.github/workflows/ssh-runner.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0d4fe1578ba69d7668fc59546aa833267b569483
--- /dev/null
+++ b/diffusers/.github/workflows/ssh-runner.yml
@@ -0,0 +1,51 @@
+name: SSH into GPU runners
+
+on:
+  workflow_dispatch:
+    inputs:
+      runner_type:
+        description: 'Type of runner to test (aws-g6-4xlarge-plus: a10 or aws-g4dn-2xlarge: t4)'
+        type: choice
+        required: true
+        options:
+          - aws-g6-4xlarge-plus
+          - aws-g4dn-2xlarge
+      docker_image:
+        description: 'Name of the Docker image'
+        required: true
+
+env:
+  IS_GITHUB_CI: "1"
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  HF_HOME: /mnt/cache
+  DIFFUSERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: yes
+
+jobs:
+  ssh_runner:
+    name: "SSH"
+    runs-on:
+      group: "${{ github.event.inputs.runner_type }}"
+    container:
+      image: ${{ github.event.inputs.docker_image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
+
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Tailscale # In order to be able to SSH when a test fails
+        uses: huggingface/tailscale-action@main
+        with:
+          authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
+          slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
+          slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          waitForSSH: true
diff --git a/diffusers/.github/workflows/stale.yml b/diffusers/.github/workflows/stale.yml
new file mode 100644
index 0000000000000000000000000000000000000000..443f65404daf79dd9819daac44baa58e064b8c23
--- /dev/null
+++ b/diffusers/.github/workflows/stale.yml
@@ -0,0 +1,30 @@
+name: Stale Bot
+
+on:
+  schedule:
+    - cron: "0 15 * * *"
+
+jobs:
+  close_stale_issues:
+    name: Close Stale Issues
+    if: github.repository == 'huggingface/diffusers'
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    env:
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Setup Python
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.8
+
+    - name: Install requirements
+      run: |
+        pip install PyGithub
+    - name: Close stale issues
+      run: |
+        python utils/stale.py
diff --git a/diffusers/.github/workflows/trufflehog.yml b/diffusers/.github/workflows/trufflehog.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9cbbf6803724dacc4759b69d7002bb34831e5937
--- /dev/null
+++ b/diffusers/.github/workflows/trufflehog.yml
@@ -0,0 +1,15 @@
+on:
+  push:
+
+name: Secret Leaks
+
+jobs:
+  trufflehog:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+    - name: Secret Scanning
+      uses: trufflesecurity/trufflehog@main
diff --git a/diffusers/.github/workflows/typos.yml b/diffusers/.github/workflows/typos.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fbd051b4da0dc6c1ec9e15a3a7bad07b122d81cd
--- /dev/null
+++ b/diffusers/.github/workflows/typos.yml
@@ -0,0 +1,14 @@
+name: Check typos
+
+on:
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: typos-action
+        uses: crate-ci/typos@v1.12.4
diff --git a/diffusers/.github/workflows/update_metadata.yml b/diffusers/.github/workflows/update_metadata.yml
new file mode 100644
index 0000000000000000000000000000000000000000..92aea0369ba855237b39d4dee9e8daee0c81010d
--- /dev/null
+++ b/diffusers/.github/workflows/update_metadata.yml
@@ -0,0 +1,30 @@
+name: Update Diffusers metadata
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+      - update_diffusers_metadata*
+
+jobs:
+  update_metadata:
+    runs-on: ubuntu-22.04
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Setup environment
+        run: |
+          pip install --upgrade pip
+          pip install datasets pandas
+          pip install .[torch]
+
+      - name: Update metadata
+        env:
+          HF_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
+        run: |
+          python utils/update_metadata.py --commit_sha ${{ github.sha }}
diff --git a/diffusers/.github/workflows/upload_pr_documentation.yml b/diffusers/.github/workflows/upload_pr_documentation.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fc102df8103e48fb139a8bd47be05fc257d992c5
--- /dev/null
+++ b/diffusers/.github/workflows/upload_pr_documentation.yml
@@ -0,0 +1,16 @@
+name: Upload PR Documentation
+
+on:
+  workflow_run:
+    workflows: ["Build PR Documentation"]
+    types:
+      - completed
+
+jobs:
+  build:
+    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
+    with:
+      package_name: diffusers
+    secrets:
+      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
diff --git a/diffusers/.gitignore b/diffusers/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..15617d5fdc745f57271db05b5fe7fd83f440c3a7
--- /dev/null
+++ b/diffusers/.gitignore
@@ -0,0 +1,178 @@
+# Initially taken from GitHub's Python gitignore file
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# tests and logs
+tests/fixtures/cached_*_text.txt
+logs/
+lightning_logs/
+lang_code_data/
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a Python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# vscode
+.vs
+.vscode
+
+# Pycharm
+.idea
+
+# TF code
+tensorflow_code
+
+# Models
+proc_data
+
+# examples
+runs
+/runs_old
+/wandb
+/examples/runs
+/examples/**/*.args
+/examples/rag/sweep
+
+# data
+/data
+serialization_dir
+
+# emacs
+*.*~
+debug.env
+
+# vim
+.*.swp
+
+# ctags
+tags
+
+# pre-commit
+.pre-commit*
+
+# .lock
+*.lock
+
+# DS_Store (MacOS)
+.DS_Store
+
+# RL pipelines may produce mp4 outputs
+*.mp4
+
+# dependencies
+/transformers
+
+# ruff
+.ruff_cache
+
+# wandb
+wandb
\ No newline at end of file
diff --git a/diffusers/CITATION.cff b/diffusers/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..09fc6c744d06407fa6b4707af1afe3cc9529b4db
--- /dev/null
+++ b/diffusers/CITATION.cff
@@ -0,0 +1,52 @@
+cff-version: 1.2.0
+title: 'Diffusers: State-of-the-art diffusion models'
+message: >-
+  If you use this software, please cite it using the
+  metadata from this file.
+type: software
+authors:
+  - given-names: Patrick
+    family-names: von Platen
+  - given-names: Suraj
+    family-names: Patil
+  - given-names: Anton
+    family-names: Lozhkov
+  - given-names: Pedro
+    family-names: Cuenca
+  - given-names: Nathan
+    family-names: Lambert
+  - given-names: Kashif
+    family-names: Rasul
+  - given-names: Mishig
+    family-names: Davaadorj
+  - given-names: Dhruv
+    family-names: Nair
+  - given-names: Sayak
+    family-names: Paul
+  - given-names: Steven
+    family-names: Liu
+  - given-names: William
+    family-names: Berman
+  - given-names: Yiyi
+    family-names: Xu
+  - given-names: Thomas
+    family-names: Wolf
+repository-code: 'https://github.com/huggingface/diffusers'
+abstract: >-
+  Diffusers provides pretrained diffusion models across
+  multiple modalities, such as vision and audio, and serves
+  as a modular toolbox for inference and training of
+  diffusion models.
+keywords:
+  - deep-learning
+  - pytorch
+  - image-generation
+  - hacktoberfest
+  - diffusion
+  - text2image
+  - image2image
+  - score-based-generative-modeling
+  - stable-diffusion
+  - stable-diffusion-diffusers
+license: Apache-2.0
+version: 0.12.1
diff --git a/diffusers/CODE_OF_CONDUCT.md b/diffusers/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..2139079964fbd53692380985e60ef90e2fa05dad
--- /dev/null
+++ b/diffusers/CODE_OF_CONDUCT.md
@@ -0,0 +1,130 @@
+
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual identity
+and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+  overall Diffusers community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or
+  advances of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email
+  address, without their explicit permission
+* Spamming issues or PRs with links to projects unrelated to this library
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+feedback@huggingface.co.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior,  harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+https://www.contributor-covenant.org/version/2/1/code_of_conduct.html.
+
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see the FAQ at
+https://www.contributor-covenant.org/faq. Translations are available at
+https://www.contributor-covenant.org/translations.
diff --git a/diffusers/CONTRIBUTING.md b/diffusers/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..049d317599ad2292504acf3d139b332bc64cbe74
--- /dev/null
+++ b/diffusers/CONTRIBUTING.md
@@ -0,0 +1,506 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# How to contribute to Diffusers 🧨
+
+We ❤️ contributions from the open-source community! Everyone is welcome, and all types of participation –not just code– are valued and appreciated. Answering questions, helping others, reaching out, and improving the documentation are all immensely valuable to the community, so don't be afraid and get involved if you're up for it!
+
+Everyone is encouraged to start by saying 👋 in our public Discord channel. We discuss the latest trends in diffusion models, ask questions, show off personal projects, help each other with contributions, or just hang out ☕. <a href="https://discord.gg/G7tWnz98XR"><img alt="Join us on Discord" src="https://img.shields.io/discord/823813159592001537?color=5865F2&logo=Discord&logoColor=white"></a>
+
+Whichever way you choose to contribute, we strive to be part of an open, welcoming, and kind community. Please, read our [code of conduct](https://github.com/huggingface/diffusers/blob/main/CODE_OF_CONDUCT.md) and be mindful to respect it during your interactions. We also recommend you become familiar with the [ethical guidelines](https://huggingface.co/docs/diffusers/conceptual/ethical_guidelines) that guide our project and ask you to adhere to the same principles of transparency and responsibility.
+
+We enormously value feedback from the community, so please do not be afraid to speak up if you believe you have valuable feedback that can help improve the library - every message, comment, issue, and pull request (PR) is read and considered.
+
+## Overview
+
+You can contribute in many ways ranging from answering questions on issues to adding new diffusion models to
+the core library.
+
+In the following, we give an overview of different ways to contribute, ranked by difficulty in ascending order. All of them are valuable to the community.
+
+* 1. Asking and answering questions on [the Diffusers discussion forum](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers) or on [Discord](https://discord.gg/G7tWnz98XR).
+* 2. Opening new issues on [the GitHub Issues tab](https://github.com/huggingface/diffusers/issues/new/choose).
+* 3. Answering issues on [the GitHub Issues tab](https://github.com/huggingface/diffusers/issues).
+* 4. Fix a simple issue, marked by the "Good first issue" label, see [here](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22).
+* 5. Contribute to the [documentation](https://github.com/huggingface/diffusers/tree/main/docs/source).
+* 6. Contribute a [Community Pipeline](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3Acommunity-examples).
+* 7. Contribute to the [examples](https://github.com/huggingface/diffusers/tree/main/examples).
+* 8. Fix a more difficult issue, marked by the "Good second issue" label, see [here](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22Good+second+issue%22).
+* 9. Add a new pipeline, model, or scheduler, see ["New Pipeline/Model"](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22) and ["New scheduler"](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22) issues. For this contribution, please have a look at [Design Philosophy](https://github.com/huggingface/diffusers/blob/main/PHILOSOPHY.md).
+
+As said before, **all contributions are valuable to the community**.
+In the following, we will explain each contribution a bit more in detail.
+
+For all contributions 4-9, you will need to open a PR. It is explained in detail how to do so in [Opening a pull request](#how-to-open-a-pr).
+
+### 1. Asking and answering questions on the Diffusers discussion forum or on the Diffusers Discord
+
+Any question or comment related to the Diffusers library can be asked on the [discussion forum](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/) or on [Discord](https://discord.gg/G7tWnz98XR). Such questions and comments include (but are not limited to):
+- Reports of training or inference experiments in an attempt to share knowledge
+- Presentation of personal projects
+- Questions to non-official training examples
+- Project proposals
+- General feedback
+- Paper summaries
+- Asking for help on personal projects that build on top of the Diffusers library
+- General questions
+- Ethical questions regarding diffusion models
+- ...
+
+Every question that is asked on the forum or on Discord actively encourages the community to publicly
+share knowledge and might very well help a beginner in the future who has the same question you're
+having. Please do pose any questions you might have.
+In the same spirit, you are of immense help to the community by answering such questions because this way you are publicly documenting knowledge for everybody to learn from.
+
+**Please** keep in mind that the more effort you put into asking or answering a question, the higher
+the quality of the publicly documented knowledge. In the same way, well-posed and well-answered questions create a high-quality knowledge database accessible to everybody, while badly posed questions or answers reduce the overall quality of the public knowledge database.
+In short, a high quality question or answer is *precise*, *concise*, *relevant*, *easy-to-understand*, *accessible*, and *well-formatted/well-posed*. For more information, please have a look through the [How to write a good issue](#how-to-write-a-good-issue) section.
+
+**NOTE about channels**:
+[*The forum*](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) is much better indexed by search engines, such as Google. Posts are ranked by popularity rather than chronologically. Hence, it's easier to look up questions and answers that we posted some time ago.
+In addition, questions and answers posted in the forum can easily be linked to.
+In contrast, *Discord* has a chat-like format that invites fast back-and-forth communication.
+While it will most likely take less time for you to get an answer to your question on Discord, your
+question won't be visible anymore over time. Also, it's much harder to find information that was posted a while back on Discord. We therefore strongly recommend using the forum for high-quality questions and answers in an attempt to create long-lasting knowledge for the community. If discussions on Discord lead to very interesting answers and conclusions, we recommend posting the results on the forum to make the information more available for future readers.
+
+### 2. Opening new issues on the GitHub issues tab
+
+The 🧨 Diffusers library is robust and reliable thanks to the users who notify us of
+the problems they encounter. So thank you for reporting an issue.
+
+Remember, GitHub issues are reserved for technical questions directly related to the Diffusers library, bug reports, feature requests, or feedback on the library design.
+
+In a nutshell, this means that everything that is **not** related to the **code of the Diffusers library** (including the documentation) should **not** be asked on GitHub, but rather on either the [forum](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) or [Discord](https://discord.gg/G7tWnz98XR).
+
+**Please consider the following guidelines when opening a new issue**:
+- Make sure you have searched whether your issue has already been asked before (use the search bar on GitHub under Issues).
+- Please never report a new issue on another (related) issue. If another issue is highly related, please
+open a new issue nevertheless and link to the related issue.
+- Make sure your issue is written in English. Please use one of the great, free online translation services, such as [DeepL](https://www.deepl.com/translator) to translate from your native language to English if you are not comfortable in English.
+- Check whether your issue might be solved by updating to the newest Diffusers version. Before posting your issue, please make sure that `python -c "import diffusers; print(diffusers.__version__)"` is higher or matches the latest Diffusers version.
+- Remember that the more effort you put into opening a new issue, the higher the quality of your answer will be and the better the overall quality of the Diffusers issues.
+
+New issues usually include the following.
+
+#### 2.1. Reproducible, minimal bug reports
+
+A bug report should always have a reproducible code snippet and be as minimal and concise as possible.
+This means in more detail:
+- Narrow the bug down as much as you can, **do not just dump your whole code file**.
+- Format your code.
+- Do not include any external libraries except for Diffusers depending on them.
+- **Always** provide all necessary information about your environment; for this, you can run: `diffusers-cli env` in your shell and copy-paste the displayed information to the issue.
+- Explain the issue. If the reader doesn't know what the issue is and why it is an issue, she cannot solve it.
+- **Always** make sure the reader can reproduce your issue with as little effort as possible. If your code snippet cannot be run because of missing libraries or undefined variables, the reader cannot help you. Make sure your reproducible code snippet is as minimal as possible and can be copy-pasted into a simple Python shell.
+- If in order to reproduce your issue a model and/or dataset is required, make sure the reader has access to that model or dataset. You can always upload your model or dataset to the [Hub](https://huggingface.co) to make it easily downloadable. Try to keep your model and dataset as small as possible, to make the reproduction of your issue as effortless as possible.
+
+For more information, please have a look through the [How to write a good issue](#how-to-write-a-good-issue) section.
+
+You can open a bug report [here](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=bug&projects=&template=bug-report.yml).
+
+#### 2.2. Feature requests
+
+A world-class feature request addresses the following points:
+
+1. Motivation first:
+* Is it related to a problem/frustration with the library? If so, please explain
+why. Providing a code snippet that demonstrates the problem is best.
+* Is it related to something you would need for a project? We'd love to hear
+about it!
+* Is it something you worked on and think could benefit the community?
+Awesome! Tell us what problem it solved for you.
+2. Write a *full paragraph* describing the feature;
+3. Provide a **code snippet** that demonstrates its future use;
+4. In case this is related to a paper, please attach a link;
+5. Attach any additional information (drawings, screenshots, etc.) you think may help.
+
+You can open a feature request [here](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=).
+
+#### 2.3 Feedback
+
+Feedback about the library design and why it is good or not good helps the core maintainers immensely to build a user-friendly library. To understand the philosophy behind the current design philosophy, please have a look [here](https://huggingface.co/docs/diffusers/conceptual/philosophy). If you feel like a certain design choice does not fit with the current design philosophy, please explain why and how it should be changed. If a certain design choice follows the design philosophy too much, hence restricting use cases, explain why and how it should be changed.
+If a certain design choice is very useful for you, please also leave a note as this is great feedback for future design decisions.
+
+You can open an issue about feedback [here](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=).
+
+#### 2.4 Technical questions
+
+Technical questions are mainly about why certain code of the library was written in a certain way, or what a certain part of the code does. Please make sure to link to the code in question and please provide detail on
+why this part of the code is difficult to understand.
+
+You can open an issue about a technical question [here](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=bug&template=bug-report.yml).
+
+#### 2.5 Proposal to add a new model, scheduler, or pipeline
+
+If the diffusion model community released a new model, pipeline, or scheduler that you would like to see in the Diffusers library, please provide the following information:
+
+* Short description of the diffusion pipeline, model, or scheduler and link to the paper or public release.
+* Link to any of its open-source implementation.
+* Link to the model weights if they are available.
+
+If you are willing to contribute to the model yourself, let us know so we can best guide you. Also, don't forget
+to tag the original author of the component (model, scheduler, pipeline, etc.) by GitHub handle if you can find it.
+
+You can open a request for a model/pipeline/scheduler [here](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=New+model%2Fpipeline%2Fscheduler&template=new-model-addition.yml).
+
+### 3. Answering issues on the GitHub issues tab
+
+Answering issues on GitHub might require some technical knowledge of Diffusers, but we encourage everybody to give it a try even if you are not 100% certain that your answer is correct.
+Some tips to give a high-quality answer to an issue:
+- Be as concise and minimal as possible.
+- Stay on topic. An answer to the issue should concern the issue and only the issue.
+- Provide links to code, papers, or other sources that prove or encourage your point.
+- Answer in code. If a simple code snippet is the answer to the issue or shows how the issue can be solved, please provide a fully reproducible code snippet.
+
+Also, many issues tend to be simply off-topic, duplicates of other issues, or irrelevant. It is of great
+help to the maintainers if you can answer such issues, encouraging the author of the issue to be
+more precise, provide the link to a duplicated issue or redirect them to [the forum](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) or [Discord](https://discord.gg/G7tWnz98XR).
+
+If you have verified that the issued bug report is correct and requires a correction in the source code,
+please have a look at the next sections.
+
+For all of the following contributions, you will need to open a PR. It is explained in detail how to do so in the [Opening a pull request](#how-to-open-a-pr) section.
+
+### 4. Fixing a "Good first issue"
+
+*Good first issues* are marked by the [Good first issue](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) label. Usually, the issue already
+explains how a potential solution should look so that it is easier to fix.
+If the issue hasn't been closed and you would like to try to fix this issue, you can just leave a message "I would like to try this issue.". There are usually three scenarios:
+- a.) The issue description already proposes a fix. In this case and if the solution makes sense to you, you can open a PR or draft PR to fix it.
+- b.) The issue description does not propose a fix. In this case, you can ask what a proposed fix could look like and someone from the Diffusers team should answer shortly. If you have a good idea of how to fix it, feel free to directly open a PR.
+- c.) There is already an open PR to fix the issue, but the issue hasn't been closed yet. If the PR has gone stale, you can simply open a new PR and link to the stale PR. PRs often go stale if the original contributor who wanted to fix the issue suddenly cannot find the time anymore to proceed. This often happens in open-source and is very normal. In this case, the community will be very happy if you give it a new try and leverage the knowledge of the existing PR. If there is already a PR and it is active, you can help the author by giving suggestions, reviewing the PR or even asking whether you can contribute to the PR.
+
+
+### 5. Contribute to the documentation
+
+A good library **always** has good documentation! The official documentation is often one of the first points of contact for new users of the library, and therefore contributing to the documentation is a **highly
+valuable contribution**.
+
+Contributing to the library can have many forms:
+
+- Correcting spelling or grammatical errors.
+- Correct incorrect formatting of the docstring. If you see that the official documentation is weirdly displayed or a link is broken, we are very happy if you take some time to correct it.
+- Correct the shape or dimensions of a docstring input or output tensor.
+- Clarify documentation that is hard to understand or incorrect.
+- Update outdated code examples.
+- Translating the documentation to another language.
+
+Anything displayed on [the official Diffusers doc page](https://huggingface.co/docs/diffusers/index) is part of the official documentation and can be corrected, adjusted in the respective [documentation source](https://github.com/huggingface/diffusers/tree/main/docs/source).
+
+Please have a look at [this page](https://github.com/huggingface/diffusers/tree/main/docs) on how to verify changes made to the documentation locally.
+
+
+### 6. Contribute a community pipeline
+
+[Pipelines](https://huggingface.co/docs/diffusers/api/pipelines/overview) are usually the first point of contact between the Diffusers library and the user.
+Pipelines are examples of how to use Diffusers [models](https://huggingface.co/docs/diffusers/api/models/overview) and [schedulers](https://huggingface.co/docs/diffusers/api/schedulers/overview).
+We support two types of pipelines:
+
+- Official Pipelines
+- Community Pipelines
+
+Both official and community pipelines follow the same design and consist of the same type of components.
+
+Official pipelines are tested and maintained by the core maintainers of Diffusers. Their code
+resides in [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
+In contrast, community pipelines are contributed and maintained purely by the **community** and are **not** tested.
+They reside in [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and while they can be accessed via the [PyPI diffusers package](https://pypi.org/project/diffusers/), their code is not part of the PyPI distribution.
+
+The reason for the distinction is that the core maintainers of the Diffusers library cannot maintain and test all
+possible ways diffusion models can be used for inference, but some of them may be of interest to the community.
+Officially released diffusion pipelines,
+such as Stable Diffusion are added to the core src/diffusers/pipelines package which ensures
+high quality of maintenance, no backward-breaking code changes, and testing.
+More bleeding edge pipelines should be added as community pipelines. If usage for a community pipeline is high, the pipeline can be moved to the official pipelines upon request from the community. This is one of the ways we strive to be a community-driven library.
+
+To add a community pipeline, one should add a <name-of-the-community>.py file to [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and adapt the [examples/community/README.md](https://github.com/huggingface/diffusers/tree/main/examples/community/README.md) to include an example of the new pipeline.
+
+An example can be seen [here](https://github.com/huggingface/diffusers/pull/2400).
+
+Community pipeline PRs are only checked at a superficial level and ideally they should be maintained by their original authors.
+
+Contributing a community pipeline is a great way to understand how Diffusers models and schedulers work. Having contributed a community pipeline is usually the first stepping stone to contributing an official pipeline to the
+core package.
+
+### 7. Contribute to training examples
+
+Diffusers examples are a collection of training scripts that reside in [examples](https://github.com/huggingface/diffusers/tree/main/examples).
+
+We support two types of training examples:
+
+- Official training examples
+- Research training examples
+
+Research training examples are located in [examples/research_projects](https://github.com/huggingface/diffusers/tree/main/examples/research_projects) whereas official training examples include all folders under [examples](https://github.com/huggingface/diffusers/tree/main/examples) except the `research_projects` and `community` folders.
+The official training examples are maintained by the Diffusers' core maintainers whereas the research training examples are maintained by the community.
+This is because of the same reasons put forward in [6. Contribute a community pipeline](#6-contribute-a-community-pipeline) for official pipelines vs. community pipelines: It is not feasible for the core maintainers to maintain all possible training methods for diffusion models.
+If the Diffusers core maintainers and the community consider a certain training paradigm to be too experimental or not popular enough, the corresponding training code should be put in the `research_projects` folder and maintained by the author.
+
+Both official training and research examples consist of a directory that contains one or more training scripts, a `requirements.txt` file, and a `README.md` file. In order for the user to make use of the
+training examples, it is required to clone the repository:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+```
+
+as well as to install all additional dependencies required for training:
+
+```bash
+cd diffusers
+pip install -r examples/<your-example-folder>/requirements.txt
+```
+
+Therefore when adding an example, the `requirements.txt` file shall define all pip dependencies required for your training example so that once all those are installed, the user can run the example's training script. See, for example, the [DreamBooth `requirements.txt` file](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/requirements.txt).
+
+Training examples of the Diffusers library should adhere to the following philosophy:
+- All the code necessary to run the examples should be found in a single Python file.
+- One should be able to run the example from the command line with `python <your-example>.py --args`.
+- Examples should be kept simple and serve as **an example** on how to use Diffusers for training. The purpose of example scripts is **not** to create state-of-the-art diffusion models, but rather to reproduce known training schemes without adding too much custom logic. As a byproduct of this point, our examples also strive to serve as good educational materials.
+
+To contribute an example, it is highly recommended to look at already existing examples such as [dreambooth](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py) to get an idea of how they should look like.
+We strongly advise contributors to make use of the [Accelerate library](https://github.com/huggingface/accelerate) as it's tightly integrated
+with Diffusers.
+Once an example script works, please make sure to add a comprehensive `README.md` that states how to use the example exactly. This README should include:
+- An example command on how to run the example script as shown [here e.g.](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#running-locally-with-pytorch).
+- A link to some training results (logs, models, ...) that show what the user can expect as shown [here e.g.](https://api.wandb.ai/report/patrickvonplaten/xm6cd5q5).
+- If you are adding a non-official/research training example, **please don't forget** to add a sentence that you are maintaining this training example which includes your git handle as shown [here](https://github.com/huggingface/diffusers/tree/main/examples/research_projects/intel_opts#diffusers-examples-with-intel-optimizations).
+
+If you are contributing to the official training examples, please also make sure to add a test to [examples/test_examples.py](https://github.com/huggingface/diffusers/blob/main/examples/test_examples.py). This is not necessary for non-official training examples.
+
+### 8. Fixing a "Good second issue"
+
+*Good second issues* are marked by the [Good second issue](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22Good+second+issue%22) label. Good second issues are
+usually more complicated to solve than [Good first issues](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22).
+The issue description usually gives less guidance on how to fix the issue and requires
+a decent understanding of the library by the interested contributor.
+If you are interested in tackling a good second issue, feel free to open a PR to fix it and link the PR to the issue. If you see that a PR has already been opened for this issue but did not get merged, have a look to understand why it wasn't merged and try to open an improved PR.
+Good second issues are usually more difficult to get merged compared to good first issues, so don't hesitate to ask for help from the core maintainers. If your PR is almost finished the core maintainers can also jump into your PR and commit to it in order to get it merged.
+
+### 9. Adding pipelines, models, schedulers
+
+Pipelines, models, and schedulers are the most important pieces of the Diffusers library.
+They provide easy access to state-of-the-art diffusion technologies and thus allow the community to
+build powerful generative AI applications.
+
+By adding a new model, pipeline, or scheduler you might enable a new powerful use case for any of the user interfaces relying on Diffusers which can be of immense value for the whole generative AI ecosystem.
+
+Diffusers has a couple of open feature requests for all three components - feel free to gloss over them
+if you don't know yet what specific component you would like to add:
+- [Model or pipeline](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22)
+- [Scheduler](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22)
+
+Before adding any of the three components, it is strongly recommended that you give the [Philosophy guide](https://github.com/huggingface/diffusers/blob/main/PHILOSOPHY.md) a read to better understand the design of any of the three components. Please be aware that
+we cannot merge model, scheduler, or pipeline additions that strongly diverge from our design philosophy
+as it will lead to API inconsistencies. If you fundamentally disagree with a design choice, please
+open a [Feedback issue](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=) instead so that it can be discussed whether a certain design
+pattern/design choice shall be changed everywhere in the library and whether we shall update our design philosophy. Consistency across the library is very important for us.
+
+Please make sure to add links to the original codebase/paper to the PR and ideally also ping the
+original author directly on the PR so that they can follow the progress and potentially help with questions.
+
+If you are unsure or stuck in the PR, don't hesitate to leave a message to ask for a first review or help.
+
+## How to write a good issue
+
+**The better your issue is written, the higher the chances that it will be quickly resolved.**
+
+1. Make sure that you've used the correct template for your issue. You can pick between *Bug Report*, *Feature Request*, *Feedback about API Design*, *New model/pipeline/scheduler addition*, *Forum*, or a blank issue. Make sure to pick the correct one when opening [a new issue](https://github.com/huggingface/diffusers/issues/new/choose).
+2. **Be precise**: Give your issue a fitting title. Try to formulate your issue description as simple as possible. The more precise you are when submitting an issue, the less time it takes to understand the issue and potentially solve it. Make sure to open an issue for one issue only and not for multiple issues. If you found multiple issues, simply open multiple issues. If your issue is a bug, try to be as precise as possible about what bug it is - you should not just write "Error in diffusers".
+3. **Reproducibility**: No reproducible code snippet == no solution. If you encounter a bug, maintainers **have to be able to reproduce** it. Make sure that you include a code snippet that can be copy-pasted into a Python interpreter to reproduce the issue. Make sure that your code snippet works, *i.e.* that there are no missing imports or missing links to images, ... Your issue should contain an error message **and** a code snippet that can be copy-pasted without any changes to reproduce the exact same error message. If your issue is using local model weights or local data that cannot be accessed by the reader, the issue cannot be solved. If you cannot share your data or model, try to make a dummy model or dummy data.
+4. **Minimalistic**: Try to help the reader as much as you can to understand the issue as quickly as possible by staying as concise as possible. Remove all code / all information that is irrelevant to the issue. If you have found a bug, try to create the easiest code example you can to demonstrate your issue, do not just dump your whole workflow into the issue as soon as you have found a bug. E.g., if you train a model and get an error at some point during the training, you should first try to understand what part of the training code is responsible for the error and try to reproduce it with a couple of lines. Try to use dummy data instead of full datasets.
+5. Add links. If you are referring to a certain naming, method, or model make sure to provide a link so that the reader can better understand what you mean. If you are referring to a specific PR or issue, make sure to link it to your issue. Do not assume that the reader knows what you are talking about. The more links you add to your issue the better.
+6. Formatting. Make sure to nicely format your issue by formatting code into Python code syntax, and error messages into normal code syntax. See the [official GitHub formatting docs](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) for more information.
+7. Think of your issue not as a ticket to be solved, but rather as a beautiful entry to a well-written encyclopedia. Every added issue is a contribution to publicly available knowledge. By adding a nicely written issue you not only make it easier for maintainers to solve your issue, but you are helping the whole community to better understand a certain aspect of the library.
+
+## How to write a good PR
+
+1. Be a chameleon. Understand existing design patterns and syntax and make sure your code additions flow seamlessly into the existing code base. Pull requests that significantly diverge from existing design patterns or user interfaces will not be merged.
+2. Be laser focused. A pull request should solve one problem and one problem only. Make sure to not fall into the trap of "also fixing another problem while we're adding it". It is much more difficult to review pull requests that solve multiple, unrelated problems at once.
+3. If helpful, try to add a code snippet that displays an example of how your addition can be used.
+4. The title of your pull request should be a summary of its contribution.
+5. If your pull request addresses an issue, please mention the issue number in
+the pull request description to make sure they are linked (and people
+consulting the issue know you are working on it);
+6. To indicate a work in progress please prefix the title with `[WIP]`. These
+are useful to avoid duplicated work, and to differentiate it from PRs ready
+to be merged;
+7. Try to formulate and format your text as explained in [How to write a good issue](#how-to-write-a-good-issue).
+8. Make sure existing tests pass;
+9. Add high-coverage tests. No quality testing = no merge.
+- If you are adding new `@slow` tests, make sure they pass using
+`RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`.
+CircleCI does not run the slow tests, but GitHub Actions does every night!
+10. All public methods must have informative docstrings that work nicely with markdown. See [`pipeline_latent_diffusion.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py) for an example.
+11. Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos, and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
+[`hf-internal-testing`](https://huggingface.co/hf-internal-testing) or [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images) to place these files.
+If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
+to this dataset.
+
+## How to open a PR
+
+Before writing code, we strongly advise you to search through the existing PRs or
+issues to make sure that nobody is already working on the same thing. If you are
+unsure, it is always a good idea to open an issue to get some feedback.
+
+You will need basic `git` proficiency to be able to contribute to
+🧨 Diffusers. `git` is not the easiest tool to use but it has the greatest
+manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
+Git](https://git-scm.com/book/en/v2) is a very good reference.
+
+Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/42f25d601a910dceadaee6c44345896b4cfa9928/setup.py#L270)):
+
+1. Fork the [repository](https://github.com/huggingface/diffusers) by
+clicking on the 'Fork' button on the repository's page. This creates a copy of the code
+under your GitHub user account.
+
+2. Clone your fork to your local disk, and add the base repository as a remote:
+
+ ```bash
+ $ git clone git@github.com:<your GitHub handle>/diffusers.git
+ $ cd diffusers
+ $ git remote add upstream https://github.com/huggingface/diffusers.git
+ ```
+
+3. Create a new branch to hold your development changes:
+
+ ```bash
+ $ git checkout -b a-descriptive-name-for-my-changes
+ ```
+
+**Do not** work on the `main` branch.
+
+4. Set up a development environment by running the following command in a virtual environment:
+
+ ```bash
+ $ pip install -e ".[dev]"
+ ```
+
+If you have already cloned the repo, you might need to `git pull` to get the most recent changes in the
+library.
+
+5. Develop the features on your branch.
+
+As you work on the features, you should make sure that the test suite
+passes. You should run the tests impacted by your changes like this:
+
+ ```bash
+ $ pytest tests/<TEST_TO_RUN>.py
+ ```
+
+Before you run the tests, please make sure you install the dependencies required for testing. You can do so
+with this command:
+
+ ```bash
+ $ pip install -e ".[test]"
+ ```
+
+You can also run the full test suite with the following command, but it takes
+a beefy machine to produce a result in a decent amount of time now that
+Diffusers has grown a lot. Here is the command for it:
+
+ ```bash
+ $ make test
+ ```
+
+🧨 Diffusers relies on `ruff` and `isort` to format its source code
+consistently. After you make changes, apply automatic style corrections and code verifications
+that can't be automated in one go with:
+
+ ```bash
+ $ make style
+ ```
+
+🧨 Diffusers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality
+control runs in CI, however, you can also run the same checks with:
+
+ ```bash
+ $ make quality
+ ```
+
+Once you're happy with your changes, add changed files using `git add` and
+make a commit with `git commit` to record your changes locally:
+
+ ```bash
+ $ git add modified_file.py
+ $ git commit -m "A descriptive message about your changes."
+ ```
+
+It is a good idea to sync your copy of the code with the original
+repository regularly. This way you can quickly account for changes:
+
+ ```bash
+ $ git pull upstream main
+ ```
+
+Push the changes to your account using:
+
+ ```bash
+ $ git push -u origin a-descriptive-name-for-my-changes
+ ```
+
+6. Once you are satisfied, go to the
+webpage of your fork on GitHub. Click on 'Pull request' to send your changes
+to the project maintainers for review.
+
+7. It's ok if maintainers ask you for changes. It happens to core contributors
+too! So everyone can see the changes in the Pull request, work in your local
+branch and push the changes to your fork. They will automatically appear in
+the pull request.
+
+### Tests
+
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
+the [tests folder](https://github.com/huggingface/diffusers/tree/main/tests).
+
+We like `pytest` and `pytest-xdist` because it's faster. From the root of the
+repository, here's how to run tests with `pytest` for the library:
+
+```bash
+$ python -m pytest -n auto --dist=loadfile -s -v ./tests/
+```
+
+In fact, that's how `make test` is implemented!
+
+You can specify a smaller set of tests in order to test only the feature
+you're working on.
+
+By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to
+`yes` to run them. This will download many gigabytes of models — make sure you
+have enough disk space and a good Internet connection, or a lot of patience!
+
+```bash
+$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/
+```
+
+`unittest` is fully supported, here's how to run tests with it:
+
+```bash
+$ python -m unittest discover -s tests -t . -v
+$ python -m unittest discover -s examples -t examples -v
+```
+
+### Syncing forked main with upstream (HuggingFace) main
+
+To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs,
+when syncing the main branch of a forked repository, please, follow these steps:
+1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead, merge directly into the forked main.
+2. If a PR is absolutely necessary, use the following steps after checking out your branch:
+```bash
+$ git checkout -b your-branch-for-syncing
+$ git pull --squash --no-commit upstream main
+$ git commit -m '<your message without GitHub references>'
+$ git push --set-upstream origin your-branch-for-syncing
+```
+
+### Style guide
+
+For documentation strings, 🧨 Diffusers follows the [Google style](https://google.github.io/styleguide/pyguide.html).
diff --git a/diffusers/LICENSE b/diffusers/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/diffusers/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/diffusers/MANIFEST.in b/diffusers/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..b22fe1a28a1ef881fdb36af3c30b14c0a5d10aa5
--- /dev/null
+++ b/diffusers/MANIFEST.in
@@ -0,0 +1,2 @@
+include LICENSE
+include src/diffusers/utils/model_card_template.md
diff --git a/diffusers/Makefile b/diffusers/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..9af2e8b1a5c9993411ffa06e7e48f9cfec3bd164
--- /dev/null
+++ b/diffusers/Makefile
@@ -0,0 +1,96 @@
+.PHONY: deps_table_update modified_only_fixup extra_style_checks quality style fixup fix-copies test test-examples
+
+# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
+export PYTHONPATH = src
+
+check_dirs := examples scripts src tests utils benchmarks
+
+modified_only_fixup:
+	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
+	@if test -n "$(modified_py_files)"; then \
+		echo "Checking/fixing $(modified_py_files)"; \
+		ruff check $(modified_py_files) --fix; \
+		ruff format $(modified_py_files);\
+	else \
+		echo "No library .py files were modified"; \
+	fi
+
+# Update src/diffusers/dependency_versions_table.py
+
+deps_table_update:
+	@python setup.py deps_table_update
+
+deps_table_check_updated:
+	@md5sum src/diffusers/dependency_versions_table.py > md5sum.saved
+	@python setup.py deps_table_update
+	@md5sum -c --quiet md5sum.saved || (printf "\nError: the version dependency table is outdated.\nPlease run 'make fixup' or 'make style' and commit the changes.\n\n" && exit 1)
+	@rm md5sum.saved
+
+# autogenerating code
+
+autogenerate_code: deps_table_update
+
+# Check that the repo is in a good state
+
+repo-consistency:
+	python utils/check_dummies.py
+	python utils/check_repo.py
+	python utils/check_inits.py
+
+# this target runs checks on all files
+
+quality:
+	ruff check $(check_dirs) setup.py
+	ruff format --check $(check_dirs) setup.py
+	doc-builder style src/diffusers docs/source --max_len 119 --check_only
+	python utils/check_doc_toc.py
+
+# Format source code automatically and check is there are any problems left that need manual fixing
+
+extra_style_checks:
+	python utils/custom_init_isort.py
+	python utils/check_doc_toc.py --fix_and_overwrite
+
+# this target runs checks on all files and potentially modifies some of them
+
+style:
+	ruff check $(check_dirs) setup.py --fix
+	ruff format $(check_dirs) setup.py
+	doc-builder style src/diffusers docs/source --max_len 119
+	${MAKE} autogenerate_code
+	${MAKE} extra_style_checks
+
+# Super fast fix and check target that only works on relevant modified files since the branch was made
+
+fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency
+
+# Make marked copies of snippets of codes conform to the original
+
+fix-copies:
+	python utils/check_copies.py --fix_and_overwrite
+	python utils/check_dummies.py --fix_and_overwrite
+
+# Run tests for the library
+
+test:
+	python -m pytest -n auto --dist=loadfile -s -v ./tests/
+
+# Run tests for examples
+
+test-examples:
+	python -m pytest -n auto --dist=loadfile -s -v ./examples/
+
+
+# Release stuff
+
+pre-release:
+	python utils/release.py
+
+pre-patch:
+	python utils/release.py --patch
+
+post-release:
+	python utils/release.py --post_release
+
+post-patch:
+	python utils/release.py --post_release --patch
diff --git a/diffusers/PHILOSOPHY.md b/diffusers/PHILOSOPHY.md
new file mode 100644
index 0000000000000000000000000000000000000000..c646c61ec429020032cd51f0d17ac007b17edc48
--- /dev/null
+++ b/diffusers/PHILOSOPHY.md
@@ -0,0 +1,110 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Philosophy
+
+🧨 Diffusers provides **state-of-the-art** pretrained diffusion models across multiple modalities.
+Its purpose is to serve as a **modular toolbox** for both inference and training.
+
+We aim to build a library that stands the test of time and therefore take API design very seriously.
+
+In a nutshell, Diffusers is built to be a natural extension of PyTorch. Therefore, most of our design choices are based on [PyTorch's Design Principles](https://pytorch.org/docs/stable/community/design.html#pytorch-design-philosophy). Let's go over the most important ones:
+
+## Usability over Performance
+
+- While Diffusers has many built-in performance-enhancing features (see [Memory and Speed](https://huggingface.co/docs/diffusers/optimization/fp16)), models are always loaded with the highest precision and lowest optimization. Therefore, by default diffusion pipelines are always instantiated on CPU with float32 precision if not otherwise defined by the user. This ensures usability across different platforms and accelerators and means that no complex installations are required to run the library.
+- Diffusers aims to be a **light-weight** package and therefore has very few required dependencies, but many soft dependencies that can improve performance (such as `accelerate`, `safetensors`, `onnx`, etc...). We strive to keep the library as lightweight as possible so that it can be added without much concern as a dependency on other packages.
+- Diffusers prefers simple, self-explainable code over condensed, magic code. This means that short-hand code syntaxes such as lambda functions, and advanced PyTorch operators are often not desired.
+
+## Simple over easy
+
+As PyTorch states, **explicit is better than implicit** and **simple is better than complex**. This design philosophy is reflected in multiple parts of the library:
+- We follow PyTorch's API with methods like [`DiffusionPipeline.to`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.to) to let the user handle device management.
+- Raising concise error messages is preferred to silently correct erroneous input. Diffusers aims at teaching the user, rather than making the library as easy to use as possible.
+- Complex model vs. scheduler logic is exposed instead of magically handled inside. Schedulers/Samplers are separated from diffusion models with minimal dependencies on each other. This forces the user to write the unrolled denoising loop. However, the separation allows for easier debugging and gives the user more control over adapting the denoising process or switching out diffusion models or schedulers.
+- Separately trained components of the diffusion pipeline, *e.g.* the text encoder, the UNet, and the variational autoencoder, each has their own model class. This forces the user to handle the interaction between the different model components, and the serialization format separates the model components into different files. However, this allows for easier debugging and customization. DreamBooth or Textual Inversion training
+is very simple thanks to Diffusers' ability to separate single components of the diffusion pipeline.
+
+## Tweakable, contributor-friendly over abstraction
+
+For large parts of the library, Diffusers adopts an important design principle of the [Transformers library](https://github.com/huggingface/transformers), which is to prefer copy-pasted code over hasty abstractions. This design principle is very opinionated and stands in stark contrast to popular design principles such as [Don't repeat yourself (DRY)](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself).
+In short, just like Transformers does for modeling files, Diffusers prefers to keep an extremely low level of abstraction and very self-contained code for pipelines and schedulers.
+Functions, long code blocks, and even classes can be copied across multiple files which at first can look like a bad, sloppy design choice that makes the library unmaintainable.
+**However**, this design has proven to be extremely successful for Transformers and makes a lot of sense for community-driven, open-source machine learning libraries because:
+- Machine Learning is an extremely fast-moving field in which paradigms, model architectures, and algorithms are changing rapidly, which therefore makes it very difficult to define long-lasting code abstractions.
+- Machine Learning practitioners like to be able to quickly tweak existing code for ideation and research and therefore prefer self-contained code over one that contains many abstractions.
+- Open-source libraries rely on community contributions and therefore must build a library that is easy to contribute to. The more abstract the code, the more dependencies, the harder to read, and the harder to contribute to. Contributors simply stop contributing to very abstract libraries out of fear of breaking vital functionality. If contributing to a library cannot break other fundamental code, not only is it more inviting for potential new contributors, but it is also easier to review and contribute to multiple parts in parallel.
+
+At Hugging Face, we call this design the **single-file policy** which means that almost all of the code of a certain class should be written in a single, self-contained file. To read more about the philosophy, you can have a look
+at [this blog post](https://huggingface.co/blog/transformers-design-philosophy).
+
+In Diffusers, we follow this philosophy for both pipelines and schedulers, but only partly for diffusion models. The reason we don't follow this design fully for diffusion models is because almost all diffusion pipelines, such
+as [DDPM](https://huggingface.co/docs/diffusers/api/pipelines/ddpm), [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview#stable-diffusion-pipelines), [unCLIP (DALL·E 2)](https://huggingface.co/docs/diffusers/api/pipelines/unclip) and [Imagen](https://imagen.research.google/) all rely on the same diffusion model, the [UNet](https://huggingface.co/docs/diffusers/api/models/unet2d-cond).
+
+Great, now you should have generally understood why 🧨 Diffusers is designed the way it is 🤗.
+We try to apply these design principles consistently across the library. Nevertheless, there are some minor exceptions to the philosophy or some unlucky design choices. If you have feedback regarding the design, we would ❤️  to hear it [directly on GitHub](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=).
+
+## Design Philosophy in Details
+
+Now, let's look a bit into the nitty-gritty details of the design philosophy. Diffusers essentially consists of three major classes: [pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines), [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models), and [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers).
+Let's walk through more detailed design decisions for each class.
+
+### Pipelines
+
+Pipelines are designed to be easy to use (therefore do not follow [*Simple over easy*](#simple-over-easy) 100%), are not feature complete, and should loosely be seen as examples of how to use [models](#models) and [schedulers](#schedulers) for inference.
+
+The following design principles are followed:
+- Pipelines follow the single-file policy. All pipelines can be found in individual directories under src/diffusers/pipelines. One pipeline folder corresponds to one diffusion paper/project/release. Multiple pipeline files can be gathered in one pipeline folder, as it’s done for [`src/diffusers/pipelines/stable-diffusion`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion). If pipelines share similar functionality, one can make use of the [# Copied from mechanism](https://github.com/huggingface/diffusers/blob/125d783076e5bd9785beb05367a2d2566843a271/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L251).
+- Pipelines all inherit from [`DiffusionPipeline`].
+- Every pipeline consists of different model and scheduler components, that are documented in the [`model_index.json` file](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/model_index.json), are accessible under the same name as attributes of the pipeline and can be shared between pipelines with [`DiffusionPipeline.components`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.components) function.
+- Every pipeline should be loadable via the [`DiffusionPipeline.from_pretrained`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained) function.
+- Pipelines should be used **only** for inference.
+- Pipelines should be very readable, self-explanatory, and easy to tweak.
+- Pipelines should be designed to build on top of each other and be easy to integrate into higher-level APIs.
+- Pipelines are **not** intended to be feature-complete user interfaces. For feature-complete user interfaces one should rather have a look at [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), and [lama-cleaner](https://github.com/Sanster/lama-cleaner).
+- Every pipeline should have one and only one way to run it via a `__call__` method. The naming of the `__call__` arguments should be shared across all pipelines.
+- Pipelines should be named after the task they are intended to solve.
+- In almost all cases, novel diffusion pipelines shall be implemented in a new pipeline folder/file.
+
+### Models
+
+Models are designed as configurable toolboxes that are natural extensions of [PyTorch's Module class](https://pytorch.org/docs/stable/generated/torch.nn.Module.html). They only partly follow the **single-file policy**.
+
+The following design principles are followed:
+- Models correspond to **a type of model architecture**. *E.g.* the [`UNet2DConditionModel`] class is used for all UNet variations that expect 2D image inputs and are conditioned on some context.
+- All models can be found in [`src/diffusers/models`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and every model architecture shall be defined in its file, e.g. [`unets/unet_2d_condition.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unets/unet_2d_condition.py), [`transformers/transformer_2d.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformers/transformer_2d.py), etc...
+- Models **do not** follow the single-file policy and should make use of smaller model building blocks, such as [`attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py), [`resnet.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py), [`embeddings.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py), etc... **Note**: This is in stark contrast to Transformers' modeling files and shows that models do not really follow the single-file policy.
+- Models intend to expose complexity, just like PyTorch's `Module` class, and give clear error messages.
+- Models all inherit from `ModelMixin` and `ConfigMixin`.
+- Models can be optimized for performance when it doesn’t demand major code changes, keep backward compatibility, and give significant memory or compute gain.
+- Models should by default have the highest precision and lowest performance setting.
+- To integrate new model checkpoints whose general architecture can be classified as an architecture that already exists in Diffusers, the existing model architecture shall be adapted to make it work with the new checkpoint. One should only create a new file if the model architecture is fundamentally different.
+- Models should be designed to be easily extendable to future changes. This can be achieved by limiting public function arguments, configuration arguments, and "foreseeing" future changes, *e.g.* it is usually better to add `string` "...type" arguments that can easily be extended to new future types instead of boolean `is_..._type` arguments. Only the minimum amount of changes shall be made to existing architectures to make a new model checkpoint work.
+- The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and
+readable long-term, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unets/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+### Schedulers
+
+Schedulers are responsible to guide the denoising process for inference as well as to define a noise schedule for training. They are designed as individual classes with loadable configuration files and strongly follow the **single-file policy**.
+
+The following design principles are followed:
+- All schedulers are found in [`src/diffusers/schedulers`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers).
+- Schedulers are **not** allowed to import from large utils files and shall be kept very self-contained.
+- One scheduler Python file corresponds to one scheduler algorithm (as might be defined in a paper).
+- If schedulers share similar functionalities, we can make use of the `# Copied from` mechanism.
+- Schedulers all inherit from `SchedulerMixin` and `ConfigMixin`.
+- Schedulers can be easily swapped out with the [`ConfigMixin.from_config`](https://huggingface.co/docs/diffusers/main/en/api/configuration#diffusers.ConfigMixin.from_config) method as explained in detail [here](./docs/source/en/using-diffusers/schedulers.md).
+- Every scheduler has to have a `set_num_inference_steps`, and a `step` function. `set_num_inference_steps(...)` has to be called before every denoising process, *i.e.* before `step(...)` is called.
+- Every scheduler exposes the timesteps to be "looped over" via a `timesteps` attribute, which is an array of timesteps the model will be called upon.
+- The `step(...)` function takes a predicted model output and the "current" sample (x_t) and returns the "previous", slightly more denoised sample (x_t-1).
+- Given the complexity of diffusion schedulers, the `step` function does not expose all the complexity and can be a bit of a "black box".
+- In almost all cases, novel schedulers shall be implemented in a new scheduling file.
diff --git a/diffusers/README.md b/diffusers/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b99ca828e4d0dcf44a6954b9ae686435fc3127e9
--- /dev/null
+++ b/diffusers/README.md
@@ -0,0 +1,239 @@
+<!---
+Copyright 2022 - The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<p align="center">
+    <br>
+    <img src="https://raw.githubusercontent.com/huggingface/diffusers/main/docs/source/en/imgs/diffusers_library.jpg" width="400"/>
+    <br>
+<p>
+<p align="center">
+    <a href="https://github.com/huggingface/diffusers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/datasets.svg?color=blue"></a>
+    <a href="https://github.com/huggingface/diffusers/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/diffusers.svg"></a>
+    <a href="https://pepy.tech/project/diffusers"><img alt="GitHub release" src="https://static.pepy.tech/badge/diffusers/month"></a>
+    <a href="CODE_OF_CONDUCT.md"><img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg"></a>
+    <a href="https://twitter.com/diffuserslib"><img alt="X account" src="https://img.shields.io/twitter/url/https/twitter.com/diffuserslib.svg?style=social&label=Follow%20%40diffuserslib"></a>
+</p>
+
+🤗 Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you're looking for a simple inference solution or training your own diffusion models, 🤗 Diffusers is a modular toolbox that supports both. Our library is designed with a focus on [usability over performance](https://huggingface.co/docs/diffusers/conceptual/philosophy#usability-over-performance), [simple over easy](https://huggingface.co/docs/diffusers/conceptual/philosophy#simple-over-easy), and [customizability over abstractions](https://huggingface.co/docs/diffusers/conceptual/philosophy#tweakable-contributorfriendly-over-abstraction).
+
+🤗 Diffusers offers three core components:
+
+- State-of-the-art [diffusion pipelines](https://huggingface.co/docs/diffusers/api/pipelines/overview) that can be run in inference with just a few lines of code.
+- Interchangeable noise [schedulers](https://huggingface.co/docs/diffusers/api/schedulers/overview) for different diffusion speeds and output quality.
+- Pretrained [models](https://huggingface.co/docs/diffusers/api/models/overview) that can be used as building blocks, and combined with schedulers, for creating your own end-to-end diffusion systems.
+
+## Installation
+
+We recommend installing 🤗 Diffusers in a virtual environment from PyPI or Conda. For more details about installing [PyTorch](https://pytorch.org/get-started/locally/) and [Flax](https://flax.readthedocs.io/en/latest/#installation), please refer to their official documentation.
+
+### PyTorch
+
+With `pip` (official package):
+
+```bash
+pip install --upgrade diffusers[torch]
+```
+
+With `conda` (maintained by the community):
+
+```sh
+conda install -c conda-forge diffusers
+```
+
+### Flax
+
+With `pip` (official package):
+
+```bash
+pip install --upgrade diffusers[flax]
+```
+
+### Apple Silicon (M1/M2) support
+
+Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggingface.co/docs/diffusers/optimization/mps) guide.
+
+## Quickstart
+
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 30,000+ checkpoints):
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
+pipeline.to("cuda")
+pipeline("An image of a squirrel in Picasso style").images[0]
+```
+
+You can also dig into the models and schedulers toolbox to build your own diffusion system:
+
+```python
+from diffusers import DDPMScheduler, UNet2DModel
+from PIL import Image
+import torch
+
+scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256")
+model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda")
+scheduler.set_timesteps(50)
+
+sample_size = model.config.sample_size
+noise = torch.randn((1, 3, sample_size, sample_size), device="cuda")
+input = noise
+
+for t in scheduler.timesteps:
+    with torch.no_grad():
+        noisy_residual = model(input, t).sample
+        prev_noisy_sample = scheduler.step(noisy_residual, t, input).prev_sample
+        input = prev_noisy_sample
+
+image = (input / 2 + 0.5).clamp(0, 1)
+image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
+image = Image.fromarray((image * 255).round().astype("uint8"))
+image
+```
+
+Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to launch your diffusion journey today!
+
+## How to navigate the documentation
+
+| **Documentation**                                                   | **What can I learn?**                                                                                                                                                                           |
+|---------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [Tutorial](https://huggingface.co/docs/diffusers/tutorials/tutorial_overview)                                                            | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model.  |
+| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading_overview)                                                             | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers.                                         |
+| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/pipeline_overview)                                             | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library.               |
+| [Optimization](https://huggingface.co/docs/diffusers/optimization/opt_overview)                                                        | Guides for how to optimize your diffusion model to run faster and consume less memory.                                                                                                          |
+| [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques.                                                                                               |
+## Contribution
+
+We ❤️  contributions from the open-source community!
+If you want to contribute to this library, please check out our [Contribution guide](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md).
+You can look out for [issues](https://github.com/huggingface/diffusers/issues) you'd like to tackle to contribute to the library.
+- See [Good first issues](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) for general opportunities to contribute
+- See [New model/pipeline](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22) to contribute exciting new diffusion models / diffusion pipelines
+- See [New scheduler](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22)
+
+Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz98XR"><img alt="Join us on Discord" src="https://img.shields.io/discord/823813159592001537?color=5865F2&logo=discord&logoColor=white"></a>. We discuss the hottest trends about diffusion models, help each other with contributions, personal projects or just hang out ☕.
+
+
+## Popular Tasks & Pipelines
+
+<table>
+  <tr>
+    <th>Task</th>
+    <th>Pipeline</th>
+    <th>🤗 Hub</th>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Unconditional Image Generation</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/ddpm"> DDPM </a></td>
+    <td><a href="https://huggingface.co/google/ddpm-ema-church-256"> google/ddpm-ema-church-256 </a></td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Text-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img">Stable Diffusion Text-to-Image</a></td>
+      <td><a href="https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5"> stable-diffusion-v1-5/stable-diffusion-v1-5 </a></td>
+  </tr>
+  <tr>
+    <td>Text-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/unclip">unCLIP</a></td>
+      <td><a href="https://huggingface.co/kakaobrain/karlo-v1-alpha"> kakaobrain/karlo-v1-alpha </a></td>
+  </tr>
+  <tr>
+    <td>Text-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/deepfloyd_if">DeepFloyd IF</a></td>
+      <td><a href="https://huggingface.co/DeepFloyd/IF-I-XL-v1.0"> DeepFloyd/IF-I-XL-v1.0 </a></td>
+  </tr>
+  <tr>
+    <td>Text-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/kandinsky">Kandinsky</a></td>
+      <td><a href="https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder"> kandinsky-community/kandinsky-2-2-decoder </a></td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Text-guided Image-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/controlnet">ControlNet</a></td>
+      <td><a href="https://huggingface.co/lllyasviel/sd-controlnet-canny"> lllyasviel/sd-controlnet-canny </a></td>
+  </tr>
+  <tr>
+    <td>Text-guided Image-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/pix2pix">InstructPix2Pix</a></td>
+      <td><a href="https://huggingface.co/timbrooks/instruct-pix2pix"> timbrooks/instruct-pix2pix </a></td>
+  </tr>
+  <tr>
+    <td>Text-guided Image-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img">Stable Diffusion Image-to-Image</a></td>
+      <td><a href="https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5"> stable-diffusion-v1-5/stable-diffusion-v1-5 </a></td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Text-guided Image Inpainting</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint">Stable Diffusion Inpainting</a></td>
+      <td><a href="https://huggingface.co/runwayml/stable-diffusion-inpainting"> runwayml/stable-diffusion-inpainting </a></td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Image Variation</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/image_variation">Stable Diffusion Image Variation</a></td>
+      <td><a href="https://huggingface.co/lambdalabs/sd-image-variations-diffusers"> lambdalabs/sd-image-variations-diffusers </a></td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Super Resolution</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/upscale">Stable Diffusion Upscale</a></td>
+      <td><a href="https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler"> stabilityai/stable-diffusion-x4-upscaler </a></td>
+  </tr>
+  <tr>
+    <td>Super Resolution</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_upscale">Stable Diffusion Latent Upscale</a></td>
+      <td><a href="https://huggingface.co/stabilityai/sd-x2-latent-upscaler"> stabilityai/sd-x2-latent-upscaler </a></td>
+  </tr>
+</table>
+
+## Popular libraries using 🧨 Diffusers
+
+- https://github.com/microsoft/TaskMatrix
+- https://github.com/invoke-ai/InvokeAI
+- https://github.com/InstantID/InstantID
+- https://github.com/apple/ml-stable-diffusion
+- https://github.com/Sanster/lama-cleaner
+- https://github.com/IDEA-Research/Grounded-Segment-Anything
+- https://github.com/ashawkey/stable-dreamfusion
+- https://github.com/deep-floyd/IF
+- https://github.com/bentoml/BentoML
+- https://github.com/bmaltais/kohya_ss
+- +14,000 other amazing GitHub repositories 💪
+
+Thank you for using us ❤️.
+
+## Credits
+
+This library concretizes previous work by many different authors and would not have been possible without their great research and implementations. We'd like to thank, in particular, the following implementations which have helped us in our development and without which the API could not have been as polished today:
+
+- @CompVis' latent diffusion models library, available [here](https://github.com/CompVis/latent-diffusion)
+- @hojonathanho original DDPM implementation, available [here](https://github.com/hojonathanho/diffusion) as well as the extremely useful translation into PyTorch by @pesser, available [here](https://github.com/pesser/pytorch_diffusion)
+- @ermongroup's DDIM implementation, available [here](https://github.com/ermongroup/ddim)
+- @yang-song's Score-VE and Score-VP implementations, available [here](https://github.com/yang-song/score_sde_pytorch)
+
+We also want to thank @heejkoo for the very helpful overview of papers, code and resources on diffusion models, available [here](https://github.com/heejkoo/Awesome-Diffusion-Models) as well as @crowsonkb and @rromb for useful discussions and insights.
+
+## Citation
+
+```bibtex
+@misc{von-platen-etal-2022-diffusers,
+  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Dhruv Nair and Sayak Paul and William Berman and Yiyi Xu and Steven Liu and Thomas Wolf},
+  title = {Diffusers: State-of-the-art diffusion models},
+  year = {2022},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/huggingface/diffusers}}
+}
+```
diff --git a/diffusers/_typos.toml b/diffusers/_typos.toml
new file mode 100644
index 0000000000000000000000000000000000000000..551099f981e7885fbda9ed28e297bace0e13407b
--- /dev/null
+++ b/diffusers/_typos.toml
@@ -0,0 +1,13 @@
+# Files for typos
+# Instruction:  https://github.com/marketplace/actions/typos-action#getting-started
+
+[default.extend-identifiers]
+
+[default.extend-words]
+NIN="NIN" # NIN is used in scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
+nd="np" # nd may be np (numpy)
+parms="parms" # parms is used in scripts/convert_original_stable_diffusion_to_diffusers.py
+
+
+[files]
+extend-exclude = ["_typos.toml"]
diff --git a/diffusers/benchmarks/base_classes.py b/diffusers/benchmarks/base_classes.py
new file mode 100644
index 0000000000000000000000000000000000000000..45bf65c93c93a3a77d5b2e4b321b19a4cf9cd63f
--- /dev/null
+++ b/diffusers/benchmarks/base_classes.py
@@ -0,0 +1,346 @@
+import os
+import sys
+
+import torch
+
+from diffusers import (
+    AutoPipelineForImage2Image,
+    AutoPipelineForInpainting,
+    AutoPipelineForText2Image,
+    ControlNetModel,
+    LCMScheduler,
+    StableDiffusionAdapterPipeline,
+    StableDiffusionControlNetPipeline,
+    StableDiffusionXLAdapterPipeline,
+    StableDiffusionXLControlNetPipeline,
+    T2IAdapter,
+    WuerstchenCombinedPipeline,
+)
+from diffusers.utils import load_image
+
+
+sys.path.append(".")
+
+from utils import (  # noqa: E402
+    BASE_PATH,
+    PROMPT,
+    BenchmarkInfo,
+    benchmark_fn,
+    bytes_to_giga_bytes,
+    flush,
+    generate_csv_dict,
+    write_to_csv,
+)
+
+
+RESOLUTION_MAPPING = {
+    "Lykon/DreamShaper": (512, 512),
+    "lllyasviel/sd-controlnet-canny": (512, 512),
+    "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024),
+    "TencentARC/t2iadapter_canny_sd14v1": (512, 512),
+    "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024),
+    "stabilityai/stable-diffusion-2-1": (768, 768),
+    "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024),
+    "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024),
+    "stabilityai/sdxl-turbo": (512, 512),
+}
+
+
+class BaseBenchmak:
+    pipeline_class = None
+
+    def __init__(self, args):
+        super().__init__()
+
+    def run_inference(self, args):
+        raise NotImplementedError
+
+    def benchmark(self, args):
+        raise NotImplementedError
+
+    def get_result_filepath(self, args):
+        pipeline_class_name = str(self.pipe.__class__.__name__)
+        name = (
+            args.ckpt.replace("/", "_")
+            + "_"
+            + pipeline_class_name
+            + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
+        )
+        filepath = os.path.join(BASE_PATH, name)
+        return filepath
+
+
+class TextToImageBenchmark(BaseBenchmak):
+    pipeline_class = AutoPipelineForText2Image
+
+    def __init__(self, args):
+        pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
+        pipe = pipe.to("cuda")
+
+        if args.run_compile:
+            if not isinstance(pipe, WuerstchenCombinedPipeline):
+                pipe.unet.to(memory_format=torch.channels_last)
+                print("Run torch compile")
+                pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+                if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None:
+                    pipe.movq.to(memory_format=torch.channels_last)
+                    pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True)
+            else:
+                print("Run torch compile")
+                pipe.decoder = torch.compile(pipe.decoder, mode="reduce-overhead", fullgraph=True)
+                pipe.vqgan = torch.compile(pipe.vqgan, mode="reduce-overhead", fullgraph=True)
+
+        pipe.set_progress_bar_config(disable=True)
+        self.pipe = pipe
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+        )
+
+    def benchmark(self, args):
+        flush()
+
+        print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n")
+
+        time = benchmark_fn(self.run_inference, self.pipe, args)  # in seconds.
+        memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
+        benchmark_info = BenchmarkInfo(time=time, memory=memory)
+
+        pipeline_class_name = str(self.pipe.__class__.__name__)
+        flush()
+        csv_dict = generate_csv_dict(
+            pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info
+        )
+        filepath = self.get_result_filepath(args)
+        write_to_csv(filepath, csv_dict)
+        print(f"Logs written to: {filepath}")
+        flush()
+
+
+class TurboTextToImageBenchmark(TextToImageBenchmark):
+    def __init__(self, args):
+        super().__init__(args)
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+            guidance_scale=0.0,
+        )
+
+
+class LCMLoRATextToImageBenchmark(TextToImageBenchmark):
+    lora_id = "latent-consistency/lcm-lora-sdxl"
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.pipe.load_lora_weights(self.lora_id)
+        self.pipe.fuse_lora()
+        self.pipe.unload_lora_weights()
+        self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config)
+
+    def get_result_filepath(self, args):
+        pipeline_class_name = str(self.pipe.__class__.__name__)
+        name = (
+            self.lora_id.replace("/", "_")
+            + "_"
+            + pipeline_class_name
+            + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
+        )
+        filepath = os.path.join(BASE_PATH, name)
+        return filepath
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+            guidance_scale=1.0,
+        )
+
+    def benchmark(self, args):
+        flush()
+
+        print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n")
+
+        time = benchmark_fn(self.run_inference, self.pipe, args)  # in seconds.
+        memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
+        benchmark_info = BenchmarkInfo(time=time, memory=memory)
+
+        pipeline_class_name = str(self.pipe.__class__.__name__)
+        flush()
+        csv_dict = generate_csv_dict(
+            pipeline_cls=pipeline_class_name, ckpt=self.lora_id, args=args, benchmark_info=benchmark_info
+        )
+        filepath = self.get_result_filepath(args)
+        write_to_csv(filepath, csv_dict)
+        print(f"Logs written to: {filepath}")
+        flush()
+
+
+class ImageToImageBenchmark(TextToImageBenchmark):
+    pipeline_class = AutoPipelineForImage2Image
+    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg"
+    image = load_image(url).convert("RGB")
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            image=self.image,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+        )
+
+
+class TurboImageToImageBenchmark(ImageToImageBenchmark):
+    def __init__(self, args):
+        super().__init__(args)
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            image=self.image,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+            guidance_scale=0.0,
+            strength=0.5,
+        )
+
+
+class InpaintingBenchmark(ImageToImageBenchmark):
+    pipeline_class = AutoPipelineForInpainting
+    mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png"
+    mask = load_image(mask_url).convert("RGB")
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
+        self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt])
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            image=self.image,
+            mask_image=self.mask,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+        )
+
+
+class IPAdapterTextToImageBenchmark(TextToImageBenchmark):
+    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png"
+    image = load_image(url)
+
+    def __init__(self, args):
+        pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda")
+        pipe.load_ip_adapter(
+            args.ip_adapter_id[0],
+            subfolder="models" if "sdxl" not in args.ip_adapter_id[1] else "sdxl_models",
+            weight_name=args.ip_adapter_id[1],
+        )
+
+        if args.run_compile:
+            pipe.unet.to(memory_format=torch.channels_last)
+            print("Run torch compile")
+            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        pipe.set_progress_bar_config(disable=True)
+        self.pipe = pipe
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            ip_adapter_image=self.image,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+        )
+
+
+class ControlNetBenchmark(TextToImageBenchmark):
+    pipeline_class = StableDiffusionControlNetPipeline
+    aux_network_class = ControlNetModel
+    root_ckpt = "Lykon/DreamShaper"
+
+    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png"
+    image = load_image(url).convert("RGB")
+
+    def __init__(self, args):
+        aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
+        pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16)
+        pipe = pipe.to("cuda")
+
+        pipe.set_progress_bar_config(disable=True)
+        self.pipe = pipe
+
+        if args.run_compile:
+            pipe.unet.to(memory_format=torch.channels_last)
+            pipe.controlnet.to(memory_format=torch.channels_last)
+
+            print("Run torch compile")
+            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+            pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
+
+        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            image=self.image,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+        )
+
+
+class ControlNetSDXLBenchmark(ControlNetBenchmark):
+    pipeline_class = StableDiffusionXLControlNetPipeline
+    root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
+
+    def __init__(self, args):
+        super().__init__(args)
+
+
+class T2IAdapterBenchmark(ControlNetBenchmark):
+    pipeline_class = StableDiffusionAdapterPipeline
+    aux_network_class = T2IAdapter
+    root_ckpt = "Lykon/DreamShaper"
+
+    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png"
+    image = load_image(url).convert("L")
+
+    def __init__(self, args):
+        aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
+        pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16)
+        pipe = pipe.to("cuda")
+
+        pipe.set_progress_bar_config(disable=True)
+        self.pipe = pipe
+
+        if args.run_compile:
+            pipe.unet.to(memory_format=torch.channels_last)
+            pipe.adapter.to(memory_format=torch.channels_last)
+
+            print("Run torch compile")
+            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+            pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True)
+
+        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
+
+
+class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark):
+    pipeline_class = StableDiffusionXLAdapterPipeline
+    root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
+
+    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter_sdxl.png"
+    image = load_image(url)
+
+    def __init__(self, args):
+        super().__init__(args)
diff --git a/diffusers/benchmarks/benchmark_controlnet.py b/diffusers/benchmarks/benchmark_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..9217004461dc9352b1b9e6cda698dd866177eb67
--- /dev/null
+++ b/diffusers/benchmarks/benchmark_controlnet.py
@@ -0,0 +1,26 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark  # noqa: E402
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="lllyasviel/sd-controlnet-canny",
+        choices=["lllyasviel/sd-controlnet-canny", "diffusers/controlnet-canny-sdxl-1.0"],
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_pipe = (
+        ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args)
+    )
+    benchmark_pipe.benchmark(args)
diff --git a/diffusers/benchmarks/benchmark_ip_adapters.py b/diffusers/benchmarks/benchmark_ip_adapters.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a31a21fc60ddd2d041d7896ee4144d1766fcc01
--- /dev/null
+++ b/diffusers/benchmarks/benchmark_ip_adapters.py
@@ -0,0 +1,33 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import IPAdapterTextToImageBenchmark  # noqa: E402
+
+
+IP_ADAPTER_CKPTS = {
+    # because original SD v1.5 has been taken down.
+    "Lykon/DreamShaper": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
+    "stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"),
+}
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="rstabilityai/stable-diffusion-xl-base-1.0",
+        choices=list(IP_ADAPTER_CKPTS.keys()),
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    args.ip_adapter_id = IP_ADAPTER_CKPTS[args.ckpt]
+    benchmark_pipe = IPAdapterTextToImageBenchmark(args)
+    args.ckpt = f"{args.ckpt} (IP-Adapter)"
+    benchmark_pipe.benchmark(args)
diff --git a/diffusers/benchmarks/benchmark_sd_img.py b/diffusers/benchmarks/benchmark_sd_img.py
new file mode 100644
index 0000000000000000000000000000000000000000..772befe8795fb933fdb7831565eb28848e642af2
--- /dev/null
+++ b/diffusers/benchmarks/benchmark_sd_img.py
@@ -0,0 +1,29 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import ImageToImageBenchmark, TurboImageToImageBenchmark  # noqa: E402
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="Lykon/DreamShaper",
+        choices=[
+            "Lykon/DreamShaper",
+            "stabilityai/stable-diffusion-2-1",
+            "stabilityai/stable-diffusion-xl-refiner-1.0",
+            "stabilityai/sdxl-turbo",
+        ],
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_pipe = ImageToImageBenchmark(args) if "turbo" not in args.ckpt else TurboImageToImageBenchmark(args)
+    benchmark_pipe.benchmark(args)
diff --git a/diffusers/benchmarks/benchmark_sd_inpainting.py b/diffusers/benchmarks/benchmark_sd_inpainting.py
new file mode 100644
index 0000000000000000000000000000000000000000..143adcb0d87c7a9cf9c8ae0d19581fd48f9ac4c8
--- /dev/null
+++ b/diffusers/benchmarks/benchmark_sd_inpainting.py
@@ -0,0 +1,28 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import InpaintingBenchmark  # noqa: E402
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="Lykon/DreamShaper",
+        choices=[
+            "Lykon/DreamShaper",
+            "stabilityai/stable-diffusion-2-1",
+            "stabilityai/stable-diffusion-xl-base-1.0",
+        ],
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_pipe = InpaintingBenchmark(args)
+    benchmark_pipe.benchmark(args)
diff --git a/diffusers/benchmarks/benchmark_t2i_adapter.py b/diffusers/benchmarks/benchmark_t2i_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..44b04b470ea65d5f3318bee21bb107c7b4b2b2f9
--- /dev/null
+++ b/diffusers/benchmarks/benchmark_t2i_adapter.py
@@ -0,0 +1,28 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import T2IAdapterBenchmark, T2IAdapterSDXLBenchmark  # noqa: E402
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="TencentARC/t2iadapter_canny_sd14v1",
+        choices=["TencentARC/t2iadapter_canny_sd14v1", "TencentARC/t2i-adapter-canny-sdxl-1.0"],
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_pipe = (
+        T2IAdapterBenchmark(args)
+        if args.ckpt == "TencentARC/t2iadapter_canny_sd14v1"
+        else T2IAdapterSDXLBenchmark(args)
+    )
+    benchmark_pipe.benchmark(args)
diff --git a/diffusers/benchmarks/benchmark_t2i_lcm_lora.py b/diffusers/benchmarks/benchmark_t2i_lcm_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..957e0a463e28fccc51fe32cd975f3d5234cfd1f2
--- /dev/null
+++ b/diffusers/benchmarks/benchmark_t2i_lcm_lora.py
@@ -0,0 +1,23 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import LCMLoRATextToImageBenchmark  # noqa: E402
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="stabilityai/stable-diffusion-xl-base-1.0",
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=4)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_pipe = LCMLoRATextToImageBenchmark(args)
+    benchmark_pipe.benchmark(args)
diff --git a/diffusers/benchmarks/benchmark_text_to_image.py b/diffusers/benchmarks/benchmark_text_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddc7fb2676a5c6090ce08e4488f5ca697c4329aa
--- /dev/null
+++ b/diffusers/benchmarks/benchmark_text_to_image.py
@@ -0,0 +1,40 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark  # noqa: E402
+
+
+ALL_T2I_CKPTS = [
+    "Lykon/DreamShaper",
+    "segmind/SSD-1B",
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    "kandinsky-community/kandinsky-2-2-decoder",
+    "warp-ai/wuerstchen",
+    "stabilityai/sdxl-turbo",
+]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="Lykon/DreamShaper",
+        choices=ALL_T2I_CKPTS,
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_cls = None
+    if "turbo" in args.ckpt:
+        benchmark_cls = TurboTextToImageBenchmark
+    else:
+        benchmark_cls = TextToImageBenchmark
+
+    benchmark_pipe = benchmark_cls(args)
+    benchmark_pipe.benchmark(args)
diff --git a/diffusers/benchmarks/push_results.py b/diffusers/benchmarks/push_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..962e07c6d74cd9c9d0b18657e935d244a131a9de
--- /dev/null
+++ b/diffusers/benchmarks/push_results.py
@@ -0,0 +1,72 @@
+import glob
+import sys
+
+import pandas as pd
+from huggingface_hub import hf_hub_download, upload_file
+from huggingface_hub.utils._errors import EntryNotFoundError
+
+
+sys.path.append(".")
+from utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv  # noqa: E402
+
+
+def has_previous_benchmark() -> str:
+    csv_path = None
+    try:
+        csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE)
+    except EntryNotFoundError:
+        csv_path = None
+    return csv_path
+
+
+def filter_float(value):
+    if isinstance(value, str):
+        return float(value.split()[0])
+    return value
+
+
+def push_to_hf_dataset():
+    all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv"))
+    collate_csv(all_csvs, FINAL_CSV_FILE)
+
+    # If there's an existing benchmark file, we should report the changes.
+    csv_path = has_previous_benchmark()
+    if csv_path is not None:
+        current_results = pd.read_csv(FINAL_CSV_FILE)
+        previous_results = pd.read_csv(csv_path)
+
+        numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns
+        numeric_columns = [
+            c for c in numeric_columns if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)"]
+        ]
+
+        for column in numeric_columns:
+            previous_results[column] = previous_results[column].map(lambda x: filter_float(x))
+
+            # Calculate the percentage change
+            current_results[column] = current_results[column].astype(float)
+            previous_results[column] = previous_results[column].astype(float)
+            percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100
+
+            # Format the values with '+' or '-' sign and append to original values
+            current_results[column] = current_results[column].map(str) + percent_change.map(
+                lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)"
+            )
+            # There might be newly added rows. So, filter out the NaNs.
+            current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", ""))
+
+        # Overwrite the current result file.
+        current_results.to_csv(FINAL_CSV_FILE, index=False)
+
+    commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results"
+    upload_file(
+        repo_id=REPO_ID,
+        path_in_repo=FINAL_CSV_FILE,
+        path_or_fileobj=FINAL_CSV_FILE,
+        repo_type="dataset",
+        commit_message=commit_message,
+    )
+
+
+if __name__ == "__main__":
+    push_to_hf_dataset()
diff --git a/diffusers/benchmarks/run_all.py b/diffusers/benchmarks/run_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9932cc71c38513a301854a5ad227a2e3ec28d30
--- /dev/null
+++ b/diffusers/benchmarks/run_all.py
@@ -0,0 +1,101 @@
+import glob
+import subprocess
+import sys
+from typing import List
+
+
+sys.path.append(".")
+from benchmark_text_to_image import ALL_T2I_CKPTS  # noqa: E402
+
+
+PATTERN = "benchmark_*.py"
+
+
+class SubprocessCallException(Exception):
+    pass
+
+
+# Taken from `test_examples_utils.py`
+def run_command(command: List[str], return_stdout=False):
+    """
+    Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
+    if an error occurred while running `command`
+    """
+    try:
+        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
+        if return_stdout:
+            if hasattr(output, "decode"):
+                output = output.decode("utf-8")
+            return output
+    except subprocess.CalledProcessError as e:
+        raise SubprocessCallException(
+            f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
+        ) from e
+
+
+def main():
+    python_files = glob.glob(PATTERN)
+
+    for file in python_files:
+        print(f"****** Running file: {file} ******")
+
+        # Run with canonical settings.
+        if file != "benchmark_text_to_image.py" and file != "benchmark_ip_adapters.py":
+            command = f"python {file}"
+            run_command(command.split())
+
+            command += " --run_compile"
+            run_command(command.split())
+
+    # Run variants.
+    for file in python_files:
+        # See: https://github.com/pytorch/pytorch/issues/129637
+        if file == "benchmark_ip_adapters.py":
+            continue
+
+        if file == "benchmark_text_to_image.py":
+            for ckpt in ALL_T2I_CKPTS:
+                command = f"python {file} --ckpt {ckpt}"
+
+                if "turbo" in ckpt:
+                    command += " --num_inference_steps 1"
+
+                run_command(command.split())
+
+                command += " --run_compile"
+                run_command(command.split())
+
+        elif file == "benchmark_sd_img.py":
+            for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]:
+                command = f"python {file} --ckpt {ckpt}"
+
+                if ckpt == "stabilityai/sdxl-turbo":
+                    command += " --num_inference_steps 2"
+
+                run_command(command.split())
+                command += " --run_compile"
+                run_command(command.split())
+
+        elif file in ["benchmark_sd_inpainting.py", "benchmark_ip_adapters.py"]:
+            sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
+            command = f"python {file} --ckpt {sdxl_ckpt}"
+            run_command(command.split())
+
+            command += " --run_compile"
+            run_command(command.split())
+
+        elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]:
+            sdxl_ckpt = (
+                "diffusers/controlnet-canny-sdxl-1.0"
+                if "controlnet" in file
+                else "TencentARC/t2i-adapter-canny-sdxl-1.0"
+            )
+            command = f"python {file} --ckpt {sdxl_ckpt}"
+            run_command(command.split())
+
+            command += " --run_compile"
+            run_command(command.split())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/benchmarks/utils.py b/diffusers/benchmarks/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fce920ac6c3549e3654b1cfb2f0e79096aa019d
--- /dev/null
+++ b/diffusers/benchmarks/utils.py
@@ -0,0 +1,98 @@
+import argparse
+import csv
+import gc
+import os
+from dataclasses import dataclass
+from typing import Dict, List, Union
+
+import torch
+import torch.utils.benchmark as benchmark
+
+
+GITHUB_SHA = os.getenv("GITHUB_SHA", None)
+BENCHMARK_FIELDS = [
+    "pipeline_cls",
+    "ckpt_id",
+    "batch_size",
+    "num_inference_steps",
+    "model_cpu_offload",
+    "run_compile",
+    "time (secs)",
+    "memory (gbs)",
+    "actual_gpu_memory (gbs)",
+    "github_sha",
+]
+
+PROMPT = "ghibli style, a fantasy landscape with castles"
+BASE_PATH = os.getenv("BASE_PATH", ".")
+TOTAL_GPU_MEMORY = float(os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3)))
+
+REPO_ID = "diffusers/benchmarks"
+FINAL_CSV_FILE = "collated_results.csv"
+
+
+@dataclass
+class BenchmarkInfo:
+    time: float
+    memory: float
+
+
+def flush():
+    """Wipes off memory."""
+    gc.collect()
+    torch.cuda.empty_cache()
+    torch.cuda.reset_max_memory_allocated()
+    torch.cuda.reset_peak_memory_stats()
+
+
+def bytes_to_giga_bytes(bytes):
+    return f"{(bytes / 1024 / 1024 / 1024):.3f}"
+
+
+def benchmark_fn(f, *args, **kwargs):
+    t0 = benchmark.Timer(
+        stmt="f(*args, **kwargs)",
+        globals={"args": args, "kwargs": kwargs, "f": f},
+        num_threads=torch.get_num_threads(),
+    )
+    return f"{(t0.blocked_autorange().mean):.3f}"
+
+
+def generate_csv_dict(
+    pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo
+) -> Dict[str, Union[str, bool, float]]:
+    """Packs benchmarking data into a dictionary for latter serialization."""
+    data_dict = {
+        "pipeline_cls": pipeline_cls,
+        "ckpt_id": ckpt,
+        "batch_size": args.batch_size,
+        "num_inference_steps": args.num_inference_steps,
+        "model_cpu_offload": args.model_cpu_offload,
+        "run_compile": args.run_compile,
+        "time (secs)": benchmark_info.time,
+        "memory (gbs)": benchmark_info.memory,
+        "actual_gpu_memory (gbs)": f"{(TOTAL_GPU_MEMORY):.3f}",
+        "github_sha": GITHUB_SHA,
+    }
+    return data_dict
+
+
+def write_to_csv(file_name: str, data_dict: Dict[str, Union[str, bool, float]]):
+    """Serializes a dictionary into a CSV file."""
+    with open(file_name, mode="w", newline="") as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS)
+        writer.writeheader()
+        writer.writerow(data_dict)
+
+
+def collate_csv(input_files: List[str], output_file: str):
+    """Collates multiple identically structured CSVs into a single CSV file."""
+    with open(output_file, mode="w", newline="") as outfile:
+        writer = csv.DictWriter(outfile, fieldnames=BENCHMARK_FIELDS)
+        writer.writeheader()
+
+        for file in input_files:
+            with open(file, mode="r") as infile:
+                reader = csv.DictReader(infile)
+                for row in reader:
+                    writer.writerow(row)
diff --git a/diffusers/docker/diffusers-doc-builder/Dockerfile b/diffusers/docker/diffusers-doc-builder/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c9fc62707cb0dac3126c06ebc22420c1353146ff
--- /dev/null
+++ b/diffusers/docker/diffusers-doc-builder/Dockerfile
@@ -0,0 +1,52 @@
+FROM ubuntu:20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   libsndfile1-dev \
+                   python3.10 \
+                   python3-pip \
+                   libgl1 \
+                   zip \
+                   wget \
+                   python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3.10 -m uv pip install --no-cache-dir \
+        torch \
+        torchvision \
+        torchaudio \
+        invisible_watermark \
+        --extra-index-url https://download.pytorch.org/whl/cpu && \
+    python3.10 -m uv pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        Jinja2 \
+        librosa \
+        numpy==1.26.4 \
+        scipy \
+        tensorboard \
+        transformers \
+        matplotlib \
+        setuptools==69.5.1
+
+CMD ["/bin/bash"]
diff --git a/diffusers/docker/diffusers-flax-cpu/Dockerfile b/diffusers/docker/diffusers-flax-cpu/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..051008aa9a2ee057a56bfe55b7af34595c26c413
--- /dev/null
+++ b/diffusers/docker/diffusers-flax-cpu/Dockerfile
@@ -0,0 +1,49 @@
+FROM ubuntu:20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+        build-essential \
+        git \
+        git-lfs \
+        curl \
+        ca-certificates \
+        libsndfile1-dev \
+        libgl1 \
+        python3.10 \
+        python3-pip \
+        python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
+RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3 -m uv pip install --upgrade --no-cache-dir \
+        clu \
+        "jax[cpu]>=0.2.16,!=0.3.2" \
+        "flax>=0.4.1" \
+        "jaxlib>=0.1.65" && \
+    python3 -m uv pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        Jinja2 \
+        librosa \
+        numpy==1.26.4 \
+        scipy \
+        tensorboard \
+        transformers \
+        hf_transfer
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/diffusers/docker/diffusers-flax-tpu/Dockerfile b/diffusers/docker/diffusers-flax-tpu/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..405f068923b79136197daa8089340734d1435c60
--- /dev/null
+++ b/diffusers/docker/diffusers-flax-tpu/Dockerfile
@@ -0,0 +1,51 @@
+FROM ubuntu:20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   libsndfile1-dev \
+                   libgl1 \
+                   python3.10 \
+                   python3-pip \
+                   python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
+RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3 -m pip install --no-cache-dir \
+        "jax[tpu]>=0.2.16,!=0.3.2" \
+        -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
+    python3 -m uv pip install --upgrade --no-cache-dir \
+        clu \
+        "flax>=0.4.1" \
+        "jaxlib>=0.1.65" && \
+    python3 -m uv pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        Jinja2 \
+        librosa \
+        numpy==1.26.4 \
+        scipy \
+        tensorboard \
+        transformers \
+        hf_transfer
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/diffusers/docker/diffusers-onnxruntime-cpu/Dockerfile b/diffusers/docker/diffusers-onnxruntime-cpu/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6f4b13e8a9ba0368bcc55336e7574f326b9cef80
--- /dev/null
+++ b/diffusers/docker/diffusers-onnxruntime-cpu/Dockerfile
@@ -0,0 +1,49 @@
+FROM ubuntu:20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   libsndfile1-dev \
+                   libgl1 \
+                   python3.10 \
+                   python3-pip \
+                   python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3 -m uv pip install --no-cache-dir \
+        torch==2.1.2 \
+        torchvision==0.16.2 \
+        torchaudio==2.1.2 \
+        onnxruntime \
+        --extra-index-url https://download.pytorch.org/whl/cpu && \
+    python3 -m uv pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        Jinja2 \
+        librosa \
+        numpy==1.26.4 \
+        scipy \
+        tensorboard \
+        transformers \
+        hf_transfer
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/diffusers/docker/diffusers-onnxruntime-cuda/Dockerfile b/diffusers/docker/diffusers-onnxruntime-cuda/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..bd1d871033c96bd9eb0aa6157747c0fad3fa9093
--- /dev/null
+++ b/diffusers/docker/diffusers-onnxruntime-cuda/Dockerfile
@@ -0,0 +1,50 @@
+FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   libsndfile1-dev \
+                   libgl1 \
+                   python3.10 \
+                   python3-pip \
+                   python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3.10 -m uv pip install --no-cache-dir \
+        torch \
+        torchvision \
+        torchaudio \
+        "onnxruntime-gpu>=1.13.1" \
+        --extra-index-url https://download.pytorch.org/whl/cu117 && \
+    python3.10 -m uv pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        hf_transfer \
+        Jinja2 \
+        librosa \
+        numpy==1.26.4 \
+        scipy \
+        tensorboard \
+        transformers \
+        hf_transfer
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/diffusers/docker/diffusers-pytorch-compile-cuda/Dockerfile b/diffusers/docker/diffusers-pytorch-compile-cuda/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..cb4a9c0f9896b0a1e8c3f21ef7baef1035a70d44
--- /dev/null
+++ b/diffusers/docker/diffusers-pytorch-compile-cuda/Dockerfile
@@ -0,0 +1,50 @@
+FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+    build-essential \
+    git \
+    git-lfs \
+    curl \
+    ca-certificates \
+    libsndfile1-dev \
+    libgl1 \
+    python3.10 \
+    python3.10-dev \
+    python3-pip \
+    python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3.10 -m uv pip install --no-cache-dir \
+    torch \
+    torchvision \
+    torchaudio \
+    invisible_watermark && \
+    python3.10 -m pip install --no-cache-dir \
+    accelerate \
+    datasets \
+    hf-doc-builder \
+    huggingface-hub \
+    hf_transfer \
+    Jinja2 \
+    librosa \
+    numpy==1.26.4 \
+    scipy \
+    tensorboard \
+    transformers \
+    hf_transfer
+
+CMD ["/bin/bash"]
diff --git a/diffusers/docker/diffusers-pytorch-cpu/Dockerfile b/diffusers/docker/diffusers-pytorch-cpu/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..8d98c52598d2095a2842f97a2f68c56391e24431
--- /dev/null
+++ b/diffusers/docker/diffusers-pytorch-cpu/Dockerfile
@@ -0,0 +1,50 @@
+FROM ubuntu:20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   libsndfile1-dev \
+                   python3.10 \
+                   python3.10-dev \
+                   python3-pip \
+                   libgl1 \
+                   python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3.10 -m uv pip install --no-cache-dir \
+        torch \
+        torchvision \
+        torchaudio \
+        invisible_watermark \
+        --extra-index-url https://download.pytorch.org/whl/cpu && \
+    python3.10 -m uv pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        Jinja2 \
+        librosa \
+        numpy==1.26.4 \
+        scipy \
+        tensorboard \
+        transformers matplotlib  \
+        hf_transfer
+
+CMD ["/bin/bash"]
diff --git a/diffusers/docker/diffusers-pytorch-cuda/Dockerfile b/diffusers/docker/diffusers-pytorch-cuda/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..695f5ed08dc5b202b84f57f3bfab7fe8559c9d97
--- /dev/null
+++ b/diffusers/docker/diffusers-pytorch-cuda/Dockerfile
@@ -0,0 +1,51 @@
+FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+    build-essential \
+    git \
+    git-lfs \
+    curl \
+    ca-certificates \
+    libsndfile1-dev \
+    libgl1 \
+    python3.10 \
+    python3.10-dev \
+    python3-pip \
+    python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3.10 -m uv pip install --no-cache-dir \
+    torch \
+    torchvision \
+    torchaudio \
+    invisible_watermark && \
+    python3.10 -m pip install --no-cache-dir \
+    accelerate \
+    datasets \
+    hf-doc-builder \
+    huggingface-hub \
+    hf_transfer \
+    Jinja2 \
+    librosa \
+    numpy==1.26.4 \
+    scipy \
+    tensorboard \
+    transformers \
+    pytorch-lightning  \
+    hf_transfer
+
+CMD ["/bin/bash"]
diff --git a/diffusers/docker/diffusers-pytorch-xformers-cuda/Dockerfile b/diffusers/docker/diffusers-pytorch-xformers-cuda/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1693eb2930242d950f591aa461e7d527c7268fc6
--- /dev/null
+++ b/diffusers/docker/diffusers-pytorch-xformers-cuda/Dockerfile
@@ -0,0 +1,51 @@
+FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   libsndfile1-dev \
+                   libgl1 \
+                   python3.10 \
+                   python3.10-dev \
+                   python3-pip \
+                   python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3.10 -m pip install --no-cache-dir \
+        torch \
+        torchvision \
+        torchaudio \
+        invisible_watermark && \
+    python3.10 -m uv pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        hf_transfer \
+        Jinja2 \
+        librosa \
+        numpy==1.26.4 \
+        scipy \
+        tensorboard \
+        transformers \
+        xformers  \
+        hf_transfer
+
+CMD ["/bin/bash"]
diff --git a/diffusers/examples/README.md b/diffusers/examples/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c275070405456548a48fa63c73e473a9173f327e
--- /dev/null
+++ b/diffusers/examples/README.md
@@ -0,0 +1,70 @@
+<!---
+Copyright 2024 The HuggingFace Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# 🧨 Diffusers Examples
+
+Diffusers examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
+for a variety of use cases involving training or fine-tuning.
+
+**Note**: If you are looking for **official** examples on how to use `diffusers` for inference, please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
+
+Our examples aspire to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**.
+More specifically, this means:
+
+- **Self-contained**: An example script shall only depend on "pip-install-able" Python packages that can be found in a `requirements.txt` file. Example scripts shall **not** depend on any local files. This means that one can simply download an example script, *e.g.* [train_unconditional.py](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py), install the required dependencies, *e.g.* [requirements.txt](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/requirements.txt) and execute the example script.
+- **Easy-to-tweak**: While we strive to present as many use cases as possible, the example scripts are just that - examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, most of the examples fully expose the preprocessing of the data and the training loop to allow you to tweak and edit them as required.
+- **Beginner-friendly**: We do not aim for providing state-of-the-art training scripts for the newest models, but rather examples that can be used as a way to better understand diffusion models and how to use them with the `diffusers` library. We often purposefully leave out certain state-of-the-art methods if we consider them too complex for beginners.
+- **One-purpose-only**: Examples should show one task and one task only. Even if a task is from a modeling point of view very similar, *e.g.* image super-resolution and image modification tend to use the same model and training method, we want examples to showcase only one task to keep them as readable and easy-to-understand as possible.
+
+We provide **official** examples that cover the most popular tasks of diffusion models.
+*Official* examples are **actively** maintained by the `diffusers` maintainers and we try to rigorously follow our example philosophy as defined above.
+If you feel like another important example should exist, we are more than happy to welcome a [Feature Request](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=) or directly a [Pull Request](https://github.com/huggingface/diffusers/compare) from you!
+
+Training examples show how to pretrain or fine-tune diffusion models for a variety of tasks. Currently we support:
+
+| Task | 🤗 Accelerate | 🤗 Datasets | Colab
+|---|---|:---:|:---:|
+| [**Unconditional Image Generation**](./unconditional_image_generation) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [**Text-to-Image fine-tuning**](./text_to_image) | ✅ | ✅ |
+| [**Textual Inversion**](./textual_inversion) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+| [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
+| [**ControlNet**](./controlnet) | ✅ | ✅ | -
+| [**InstructPix2Pix**](./instruct_pix2pix) | ✅ | ✅ | -
+| [**Reinforcement Learning for Control**](./reinforcement_learning)                    | - | - | coming soon.
+
+## Community
+
+In addition, we provide **community** examples, which are examples added and maintained by our community.
+Community examples can consist of both *training* examples or *inference* pipelines.
+For such examples, we are more lenient regarding the philosophy defined above and also cannot guarantee to provide maintenance for every issue.
+Examples that are useful for the community, but are either not yet deemed popular or not yet following our above philosophy should go into the [community examples](https://github.com/huggingface/diffusers/tree/main/examples/community) folder. The community folder therefore includes training examples and inference pipelines.
+**Note**: Community examples can be a [great first contribution](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) to show to the community how you like to use `diffusers` 🪄.
+
+## Research Projects
+
+We also provide **research_projects** examples that are maintained by the community as defined in the respective research project folders. These examples are useful and offer the extended capabilities which are complementary to the official examples. You may refer to [research_projects](https://github.com/huggingface/diffusers/tree/main/examples/research_projects) for details.
+
+## Important note
+
+To make sure you can successfully run the latest versions of the example scripts, you have to **install the library from source** and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+Then cd in the example folder of your choice and run
+```bash
+pip install -r requirements.txt
+```
diff --git a/diffusers/examples/advanced_diffusion_training/README.md b/diffusers/examples/advanced_diffusion_training/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cd8c5feda9f038802590fe179e90873d64579ad1
--- /dev/null
+++ b/diffusers/examples/advanced_diffusion_training/README.md
@@ -0,0 +1,454 @@
+# Advanced diffusion training examples
+
+## Train Dreambooth LoRA with Stable Diffusion XL
+> [!TIP]
+> 💡 This example follows the techniques and recommended practices covered in the blog post: [LoRA training scripts of the world, unite!](https://huggingface.co/blog/sdxl_lora_advanced_script). Make sure to check it out before starting 🤗
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject.
+
+LoRA - Low-Rank Adaption of Large Language Models, was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*
+In a nutshell, LoRA allows to adapt pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
+- Previous pretrained weights are kept frozen so that the model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114)
+- Rank-decomposition matrices have significantly fewer parameters than the original model, which means that trained LoRA weights are easily portable.
+- LoRA attention layers allow to control to which extent the model is adapted towards new training images via a `scale` parameter.
+[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in
+the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
+
+The `train_dreambooth_lora_sdxl_advanced.py` script shows how to implement dreambooth-LoRA, combining the training process shown in `train_dreambooth_lora_sdxl.py`, with
+advanced features and techniques, inspired and built upon contributions by [Nataniel Ruiz](https://twitter.com/natanielruizg): [Dreambooth](https://dreambooth.github.io), [Rinon Gal](https://twitter.com/RinonGal): [Textual Inversion](https://textual-inversion.github.io), [Ron Mokady](https://twitter.com/MokadyRon): [Pivotal Tuning](https://arxiv.org/abs/2106.05744), [Simo Ryu](https://twitter.com/cloneofsimo): [cog-sdxl](https://github.com/replicate/cog-sdxl),
+[Kohya](https://twitter.com/kohya_tech/): [sd-scripts](https://github.com/kohya-ss/sd-scripts), [The Last Ben](https://twitter.com/__TheBen): [fast-stable-diffusion](https://github.com/TheLastBen/fast-stable-diffusion) ❤️
+
+> [!NOTE]
+> 💡If this is your first time training a Dreambooth LoRA, congrats!🥳
+> You might want to familiarize yourself more with the techniques: [Dreambooth blog](https://huggingface.co/blog/dreambooth), [Using LoRA for Efficient Stable Diffusion Fine-Tuning blog](https://huggingface.co/blog/lora)
+
+📚 Read more about the advanced features and best practices in this community derived blog post: [LoRA training scripts of the world, unite!](https://huggingface.co/blog/sdxl_lora_advanced_script)
+
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the `examples/advanced_diffusion_training` folder and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+### Pivotal Tuning
+**Training with text encoder(s)**
+
+Alongside the UNet, LoRA fine-tuning of the text encoders is also supported. In addition to the text encoder optimization
+available with `train_dreambooth_lora_sdxl_advanced.py`, in the advanced script **pivotal tuning** is also supported.
+[pivotal tuning](https://huggingface.co/blog/sdxl_lora_advanced_script#pivotal-tuning) combines Textual Inversion with regular diffusion fine-tuning -
+we insert new tokens into the text encoders of the model, instead of reusing existing ones.
+We then optimize the newly-inserted token embeddings to represent the new concept.
+
+To do so, just specify `--train_text_encoder_ti` while launching training (for regular text encoder optimizations, use `--train_text_encoder`).
+Please keep the following points in mind:
+
+* SDXL has two text encoders. So, we fine-tune both using LoRA.
+* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memory.
+
+### 3D icon example
+
+Now let's get our dataset. For this example we will use some cool images of 3d rendered icons: https://huggingface.co/datasets/linoyts/3d_icon.
+
+Let's first download it locally:
+
+```python
+from huggingface_hub import snapshot_download
+
+local_dir = "./3d_icon"
+snapshot_download(
+    "LinoyTsaban/3d_icon",
+    local_dir=local_dir, repo_type="dataset",
+    ignore_patterns=".gitattributes",
+)
+```
+
+Let's review some of the advanced features we're going to be using for this example:
+- **custom captions**:
+To use custom captioning, first ensure that you have the datasets library installed, otherwise you can install it by
+```bash
+pip install datasets
+```
+
+Now we'll simply specify the name of the dataset and caption column (in this case it's "prompt")
+
+```
+--dataset_name=./3d_icon
+--caption_column=prompt
+```
+
+You can also load a dataset straight from by specifying it's name in `dataset_name`.
+Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loadin your own caption dataset.
+
+- **optimizer**: for this example, we'll use [prodigy](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers) - an adaptive optimizer
+- **pivotal tuning**
+- **min SNR gamma**
+
+**Now, we can launch training:**
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export DATASET_NAME="./3d_icon"
+export OUTPUT_DIR="3d-icon-SDXL-LoRA"
+export VAE_PATH="madebyollin/sdxl-vae-fp16-fix"
+
+accelerate launch train_dreambooth_lora_sdxl_advanced.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --pretrained_vae_model_name_or_path=$VAE_PATH \
+  --dataset_name=$DATASET_NAME \
+  --instance_prompt="3d icon in the style of TOK" \
+  --validation_prompt="a TOK icon of an astronaut riding a horse, in the style of TOK" \
+  --output_dir=$OUTPUT_DIR \
+  --caption_column="prompt" \
+  --mixed_precision="bf16" \
+  --resolution=1024 \
+  --train_batch_size=3 \
+  --repeats=1 \
+  --report_to="wandb"\
+  --gradient_accumulation_steps=1 \
+  --gradient_checkpointing \
+  --learning_rate=1.0 \
+  --text_encoder_lr=1.0 \
+  --optimizer="prodigy"\
+  --train_text_encoder_ti\
+  --train_text_encoder_ti_frac=0.5\
+  --snr_gamma=5.0 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --rank=8 \
+  --max_train_steps=1000 \
+  --checkpointing_steps=2000 \
+  --seed="0" \
+  --push_to_hub
+```
+
+To better track our training experiments, we're using the following flags in the command above:
+
+* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
+* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+Our experiments were conducted on a single 40GB A100 GPU.
+
+
+### Inference
+
+Once training is done, we can perform inference like so:
+1. starting with loading the unet lora weights
+```python
+import torch
+from huggingface_hub import hf_hub_download, upload_file
+from diffusers import DiffusionPipeline
+from diffusers.models import AutoencoderKL
+from safetensors.torch import load_file
+
+username = "linoyts"
+repo_id = f"{username}/3d-icon-SDXL-LoRA"
+
+pipe = DiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-xl-base-1.0",
+        torch_dtype=torch.float16,
+        variant="fp16",
+).to("cuda")
+
+
+pipe.load_lora_weights(repo_id, weight_name="pytorch_lora_weights.safetensors")
+```
+2. now we load the pivotal tuning embeddings
+
+```python
+text_encoders = [pipe.text_encoder, pipe.text_encoder_2]
+tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
+
+embedding_path = hf_hub_download(repo_id=repo_id, filename="3d-icon-SDXL-LoRA_emb.safetensors", repo_type="model")
+
+state_dict = load_file(embedding_path)
+# load embeddings of text_encoder 1 (CLIP ViT-L/14)
+pipe.load_textual_inversion(state_dict["clip_l"], token=["<s0>", "<s1>"], text_encoder=pipe.text_encoder, tokenizer=pipe.tokenizer)
+# load embeddings of text_encoder 2 (CLIP ViT-G/14)
+pipe.load_textual_inversion(state_dict["clip_g"], token=["<s0>", "<s1>"], text_encoder=pipe.text_encoder_2, tokenizer=pipe.tokenizer_2)
+```
+
+3. let's generate images
+
+```python
+instance_token = "<s0><s1>"
+prompt = f"a {instance_token} icon of an orange llama eating ramen, in the style of {instance_token}"
+
+image = pipe(prompt=prompt, num_inference_steps=25, cross_attention_kwargs={"scale": 1.0}).images[0]
+image.save("llama.png")
+```
+
+### Comfy UI / AUTOMATIC1111 Inference
+The new script fully supports textual inversion loading with Comfy UI and AUTOMATIC1111 formats!
+
+**AUTOMATIC1111 / SD.Next** \
+In AUTOMATIC1111/SD.Next we will load a LoRA and a textual embedding at the same time.
+- *LoRA*: Besides the diffusers format, the script will also train a WebUI compatible LoRA. It is generated as `{your_lora_name}.safetensors`. You can then include it in your `models/Lora` directory.
+- *Embedding*: the embedding is the same for diffusers and WebUI. You can download your `{lora_name}_emb.safetensors` file from a trained model, and include it in your `embeddings` directory.
+
+You can then run inference by prompting `a y2k_emb webpage about the movie Mean Girls <lora:y2k:0.9>`. You can use the `y2k_emb` token normally, including increasing its weight by doing `(y2k_emb:1.2)`.
+
+**ComfyUI** \
+In ComfyUI we will load a LoRA and a textual embedding at the same time.
+- *LoRA*: Besides the diffusers format, the script will also train a ComfyUI compatible LoRA. It is generated as `{your_lora_name}.safetensors`. You can then include it in your `models/Lora` directory. Then you will load the LoRALoader node and hook that up with your model and CLIP. [Official guide for loading LoRAs](https://comfyanonymous.github.io/ComfyUI_examples/lora/)
+- *Embedding*: the embedding is the same for diffusers and WebUI. You can download your `{lora_name}_emb.safetensors` file from a trained model, and include it in your `models/embeddings` directory and use it in your prompts like `embedding:y2k_emb`. [Official guide for loading embeddings](https://comfyanonymous.github.io/ComfyUI_examples/textual_inversion_embeddings/).
+-
+### Specifying a better VAE
+
+SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
+
+### DoRA training
+The advanced script supports DoRA training too!
+> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353),
+**DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters.
+The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference.
+
+> [!NOTE]
+> 💡DoRA training is still _experimental_
+> and is likely to require different hyperparameter values to perform best compared to a LoRA.
+> Specifically, we've noticed 2 differences to take into account your training:
+> 1. **LoRA seem to converge faster than DoRA** (so a set of parameters that may lead to overfitting when training a LoRA may be working well for a DoRA)
+> 2. **DoRA quality superior to LoRA especially in lower ranks** the difference in quality of DoRA of rank 8 and LoRA of rank 8 appears to be more significant than when training ranks of 32 or 64 for example.
+> This is also aligned with some of the quantitative analysis shown in the paper.
+
+**Usage**
+1. To use DoRA you need to install `peft` from main:
+```bash
+pip install git+https://github.com/huggingface/peft.git
+```
+2. Enable DoRA training by adding this flag
+```bash
+--use_dora
+```
+**Inference**
+The inference is the same as if you train a regular LoRA 🤗
+
+## Conducting EDM-style training
+
+It's now possible to perform EDM-style training as proposed in [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364).
+
+simply set:
+
+```diff
++  --do_edm_style_training \
+```
+
+Other SDXL-like models that use the EDM formulation, such as [playgroundai/playground-v2.5-1024px-aesthetic](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic), can also be DreamBooth'd with the script. Below is an example command:
+
+```bash
+accelerate launch train_dreambooth_lora_sdxl_advanced.py \
+  --pretrained_model_name_or_path="playgroundai/playground-v2.5-1024px-aesthetic"  \
+  --dataset_name="linoyts/3d_icon" \
+  --instance_prompt="3d icon in the style of TOK" \
+  --validation_prompt="a TOK icon of an astronaut riding a horse, in the style of TOK" \
+  --output_dir="3d-icon-SDXL-LoRA" \
+  --do_edm_style_training \
+  --caption_column="prompt" \
+  --mixed_precision="bf16" \
+  --resolution=1024 \
+  --train_batch_size=3 \
+  --repeats=1 \
+  --report_to="wandb"\
+  --gradient_accumulation_steps=1 \
+  --gradient_checkpointing \
+  --learning_rate=1.0 \
+  --text_encoder_lr=1.0 \
+  --optimizer="prodigy"\
+  --train_text_encoder_ti\
+  --train_text_encoder_ti_frac=0.5\
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --rank=8 \
+  --max_train_steps=1000 \
+  --checkpointing_steps=2000 \
+  --seed="0" \
+  --push_to_hub
+```
+
+> [!CAUTION]
+> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".
+
+### B-LoRA training
+The advanced script now supports B-LoRA training too!
+> Proposed in [Implicit Style-Content Separation using B-LoRA](https://arxiv.org/abs/2403.14572),
+B-LoRA is a method that leverages LoRA to implicitly separate the style and content components of a **single** image.
+It was shown that learning the LoRA weights of two specific blocks (referred to as B-LoRAs)
+achieves style-content separation that cannot be achieved by training each B-LoRA independently.
+Once trained, the two B-LoRAs can be used as independent components to allow various image stylization tasks
+
+**Usage**
+Enable B-LoRA training by adding this flag
+```bash
+--use_blora
+```
+You can train a B-LoRA with as little as 1 image, and 1000 steps. Try this default configuration as a start:
+```bash
+!accelerate launch train_dreambooth_b-lora_sdxl.py \
+ --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
+ --instance_data_dir="linoyts/B-LoRA_teddy_bear" \
+ --output_dir="B-LoRA_teddy_bear" \
+ --instance_prompt="a [v18]" \
+ --resolution=1024 \
+ --rank=64 \
+ --train_batch_size=1 \
+ --learning_rate=5e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --max_train_steps=1000 \
+ --checkpointing_steps=2000 \
+ --seed="0" \
+ --gradient_checkpointing \
+ --mixed_precision="fp16"
+```
+**Inference**
+The inference is a bit different:
+1. we need load *specific* unet layers (as opposed to a regular LoRA/DoRA)
+2. the trained layers we load, changes based on our objective (e.g. style/content)
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, AutoencoderKL
+
+# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
+def is_belong_to_blocks(key, blocks):
+    try:
+        for g in blocks:
+            if g in key:
+                return True
+        return False
+    except Exception as e:
+        raise type(e)(f'failed to is_belong_to_block, due to: {e}')
+
+def lora_lora_unet_blocks(lora_path, alpha, target_blocks):
+  state_dict, _ = pipeline.lora_state_dict(lora_path)
+  filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)}
+  return filtered_state_dict
+
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    vae=vae,
+    torch_dtype=torch.float16,
+).to("cuda")
+
+# pick a blora for content/style (you can also set one to None)
+content_B_lora_path  = "lora-library/B-LoRA-teddybear"
+style_B_lora_path= "lora-library/B-LoRA-pen_sketch"
+
+
+content_B_LoRA = lora_lora_unet_blocks(content_B_lora_path,alpha=1,target_blocks=["unet.up_blocks.0.attentions.0"])
+style_B_LoRA = lora_lora_unet_blocks(style_B_lora_path,alpha=1.1,target_blocks=["unet.up_blocks.0.attentions.1"])
+combined_lora = {**content_B_LoRA, **style_B_LoRA}
+
+# Load both loras
+pipeline.load_lora_into_unet(combined_lora, None, pipeline.unet)
+
+#generate
+prompt = "a [v18] in [v30] style"
+pipeline(prompt, num_images_per_prompt=4).images
+```
+### LoRA training of Targeted U-net Blocks
+The advanced script now supports custom choice of U-net blocks to train during Dreambooth LoRA tuning.
+> [!NOTE]
+> This feature is still experimental
+
+> Recently, works like B-LoRA showed the potential advantages of learning the LoRA weights of specific U-net blocks, not only in speed & memory,
+> but also in reducing the amount of needed data, improving style manipulation and overcoming overfitting issues.
+> In light of this, we're introducing a new feature to the advanced script to allow for configurable U-net learned blocks.
+
+**Usage**
+Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma seperated string specifying the targeted blocks.
+e.g:
+```bash
+--lora_unet_blocks="unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1"
+```
+
+> [!NOTE]
+> if you specify both `--use_blora` and `--lora_unet_blocks`, values given in --lora_unet_blocks will be ignored.
+> When enabling --use_blora, targeted U-net blocks are automatically set to be "unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1" as discussed in the paper.
+> If you wish to experiment with different blocks, specify `--lora_unet_blocks` only.
+
+**Inference**
+Inference is the same as for B-LoRAs, except the input targeted blocks should be modified based on your training configuration.
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, AutoencoderKL
+
+# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
+def is_belong_to_blocks(key, blocks):
+    try:
+        for g in blocks:
+            if g in key:
+                return True
+        return False
+    except Exception as e:
+        raise type(e)(f'failed to is_belong_to_block, due to: {e}')
+
+def lora_lora_unet_blocks(lora_path, alpha, target_blocks):
+  state_dict, _ = pipeline.lora_state_dict(lora_path)
+  filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)}
+  return filtered_state_dict
+
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    vae=vae,
+    torch_dtype=torch.float16,
+).to("cuda")
+
+lora_path  = "lora-library/B-LoRA-pen_sketch"
+
+state_dict = lora_lora_unet_blocks(content_B_lora_path,alpha=1,target_blocks=["unet.up_blocks.0.attentions.0"])
+
+# Load trained lora layers into the unet
+pipeline.load_lora_into_unet(state_dict, None, pipeline.unet)
+
+#generate
+prompt = "a dog in [v30] style"
+pipeline(prompt, num_images_per_prompt=4).images
+```
+
+
+### Tips and Tricks
+Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
+
+## Running on Colab Notebook
+Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_Dreambooth_LoRA_advanced_example.ipynb).
+to train using the advanced features (including pivotal tuning), and [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb) to train on a free colab, using some of the advanced features (excluding pivotal tuning)
+
diff --git a/diffusers/examples/advanced_diffusion_training/requirements.txt b/diffusers/examples/advanced_diffusion_training/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3f86855e1d1e03c20874e9de4a6037a119509444
--- /dev/null
+++ b/diffusers/examples/advanced_diffusion_training/requirements.txt
@@ -0,0 +1,7 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
+peft==0.7.0
\ No newline at end of file
diff --git a/diffusers/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py b/diffusers/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e1a0298ba1d88716692910d0ec9c6740601e9e1
--- /dev/null
+++ b/diffusers/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py
@@ -0,0 +1,2013 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import gc
+import hashlib
+import itertools
+import logging
+import math
+import os
+import re
+import shutil
+import warnings
+from contextlib import nullcontext
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+# imports of the TokenEmbeddingsHandler class
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from safetensors.torch import load_file, save_file
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DPMSolverMultistepScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import compute_snr
+from diffusers.utils import (
+    check_min_version,
+    convert_all_state_dict_to_peft,
+    convert_state_dict_to_diffusers,
+    convert_state_dict_to_kohya,
+    is_wandb_available,
+)
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    use_dora: bool,
+    images=None,
+    base_model=str,
+    train_text_encoder=False,
+    train_text_encoder_ti=False,
+    token_abstraction_dict=None,
+    instance_prompt=str,
+    validation_prompt=str,
+    repo_folder=None,
+    vae_path=None,
+):
+    img_str = "widget:\n"
+    lora = "lora" if not use_dora else "dora"
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"""
+        - text: '{validation_prompt if validation_prompt else ' ' }'
+          output:
+            url:
+                "image_{i}.png"
+        """
+    if not images:
+        img_str += f"""
+        - text: '{instance_prompt}'
+        """
+    embeddings_filename = f"{repo_folder}_emb"
+    instance_prompt_webui = re.sub(r"<s\d+>", "", re.sub(r"<s\d+>", embeddings_filename, instance_prompt, count=1))
+    ti_keys = ", ".join(f'"{match}"' for match in re.findall(r"<s\d+>", instance_prompt))
+    if instance_prompt_webui != embeddings_filename:
+        instance_prompt_sentence = f"For example, `{instance_prompt_webui}`"
+    else:
+        instance_prompt_sentence = ""
+    trigger_str = f"You should use {instance_prompt} to trigger the image generation."
+    diffusers_imports_pivotal = ""
+    diffusers_example_pivotal = ""
+    webui_example_pivotal = ""
+    if train_text_encoder_ti:
+        trigger_str = (
+            "To trigger image generation of trained concept(or concepts) replace each concept identifier "
+            "in you prompt with the new inserted tokens:\n"
+        )
+        diffusers_imports_pivotal = """from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
+        """
+        diffusers_example_pivotal = f"""embedding_path = hf_hub_download(repo_id='{repo_id}', filename='{embeddings_filename}.safetensors', repo_type="model")
+state_dict = load_file(embedding_path)
+pipeline.load_textual_inversion(state_dict["clip_l"], token=[{ti_keys}], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer)
+        """
+        webui_example_pivotal = f"""- *Embeddings*: download **[`{embeddings_filename}.safetensors` here 💾](/{repo_id}/blob/main/{embeddings_filename}.safetensors)**.
+    - Place it on it on your `embeddings` folder
+    - Use it by adding `{embeddings_filename}` to your prompt. {instance_prompt_sentence}
+    (you need both the LoRA and the embeddings as they were trained together for this LoRA)
+    """
+        if token_abstraction_dict:
+            for key, value in token_abstraction_dict.items():
+                tokens = "".join(value)
+                trigger_str += f"""
+to trigger concept `{key}` → use `{tokens}` in your prompt \n
+"""
+
+    yaml = f"""---
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- diffusers-training
+- text-to-image
+- diffusers
+- {lora}
+- template:sd-lora
+{img_str}
+base_model: {base_model}
+instance_prompt: {instance_prompt}
+license: openrail++
+---
+"""
+
+    model_card = f"""
+# SD1.5 LoRA DreamBooth - {repo_id}
+
+<Gallery />
+
+## Model description
+
+### These are {repo_id} LoRA adaption weights for {base_model}.
+
+## Download model
+
+### Use it with UIs such as AUTOMATIC1111, Comfy UI, SD.Next, Invoke
+
+- **LoRA**: download **[`{repo_folder}.safetensors` here 💾](/{repo_id}/blob/main/{repo_folder}.safetensors)**.
+    - Place it on your `models/Lora` folder.
+    - On AUTOMATIC1111, load the LoRA by adding `<lora:{repo_folder}:1>` to your prompt. On ComfyUI just [load it as a regular LoRA](https://comfyanonymous.github.io/ComfyUI_examples/lora/).
+{webui_example_pivotal}
+
+## Use it with the [🧨 diffusers library](https://github.com/huggingface/diffusers)
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+{diffusers_imports_pivotal}
+pipeline = AutoPipelineForText2Image.from_pretrained('runwayml/stable-diffusion-v1-5', torch_dtype=torch.float16).to('cuda')
+pipeline.load_lora_weights('{repo_id}', weight_name='pytorch_lora_weights.safetensors')
+{diffusers_example_pivotal}
+image = pipeline('{validation_prompt if validation_prompt else instance_prompt}').images[0]
+```
+
+For more details, including weighting, merging and fusing LoRAs, check the [documentation on loading LoRAs in diffusers](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading_adapters)
+
+## Trigger words
+
+{trigger_str}
+
+## Details
+All [Files & versions](/{repo_id}/tree/main).
+
+The weights were trained using [🧨 diffusers Advanced Dreambooth Training Script](https://github.com/huggingface/diffusers/blob/main/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py).
+
+LoRA for the text encoder was enabled. {train_text_encoder}.
+
+Pivotal tuning was enabled: {train_text_encoder_ti}.
+
+Special VAE used for training: {vae_path}.
+
+"""
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand.To load the custom captions, the training set directory needs to follow the structure of a "
+            "datasets ImageFolder, containing both the images and the corresponding caption for each image. see: "
+            "https://huggingface.co/docs/datasets/image_dataset for more information"
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset. In some cases, a dataset may have more than one configuration (for example "
+        "if it contains different subsets of data within, and you only wish to load a specific subset - in that case specify the desired configuration using --dataset_config_name. Leave as "
+        "None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help="A path to local folder containing the training data of instance images. Specify this arg instead of "
+        "--dataset_name if you wish to train using a local folder without custom captions. If you wish to train with custom captions please specify "
+        "--dataset_name instead.",
+    )
+
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing the target image. By "
+        "default, the standard Image Dataset maps out 'file_name' "
+        "to 'image'.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default=None,
+        help="The column of the dataset containing the instance prompt for each image",
+    )
+
+    parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
+
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--token_abstraction",
+        type=str,
+        default="TOK",
+        help="identifier specifying the instance(or instances) as used in instance_prompt, validation prompt, "
+        "captions - e.g. TOK. To use multiple identifiers, please specify them in a comma separated string - e.g. "
+        "'TOK,TOK2,TOK3' etc.",
+    )
+
+    parser.add_argument(
+        "--num_new_tokens_per_abstraction",
+        type=int,
+        default=2,
+        help="number of new tokens inserted to the tokenizers per token_abstraction identifier when "
+        "--train_text_encoder_ti = True. By default, each --token_abstraction (e.g. TOK) is mapped to 2 new "
+        "tokens - <si><si+1> ",
+    )
+
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lora-dreambooth-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+
+    parser.add_argument(
+        "--text_encoder_lr",
+        type=float,
+        default=5e-6,
+        help="Text encoder learning rate to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+
+    parser.add_argument(
+        "--train_text_encoder_ti",
+        action="store_true",
+        help=("Whether to use textual inversion"),
+    )
+
+    parser.add_argument(
+        "--train_text_encoder_ti_frac",
+        type=float,
+        default=0.5,
+        help=("The percentage of epochs to perform textual inversion"),
+    )
+
+    parser.add_argument(
+        "--train_text_encoder_frac",
+        type=float,
+        default=1.0,
+        help=("The percentage of epochs to perform text encoder tuning"),
+    )
+
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="adamW",
+        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="coefficients for computing the Prodigy stepsize using running averages. If set to None, "
+        "uses the value of square root of beta2. Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_weight_decay_text_encoder", type=float, default=None, help="Weight decay to use for text_encoder"
+    )
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+
+    parser.add_argument(
+        "--prodigy_use_bias_correction",
+        type=bool,
+        default=True,
+        help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW",
+    )
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        type=bool,
+        default=True,
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. "
+        "Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--use_dora",
+        action="store_true",
+        default=False,
+        help=(
+            "Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
+            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
+        ),
+    )
+    parser.add_argument(
+        "--cache_latents",
+        action="store_true",
+        default=False,
+        help="Cache the VAE latents",
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.instance_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`")
+
+    if args.dataset_name is not None and args.instance_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`")
+
+    if args.train_text_encoder and args.train_text_encoder_ti:
+        raise ValueError(
+            "Specify only one of `--train_text_encoder` or `--train_text_encoder_ti. "
+            "For full LoRA text encoder training check --train_text_encoder, for textual "
+            "inversion training check `--train_text_encoder_ti`"
+        )
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+# Taken from https://github.com/replicate/cog-sdxl/blob/main/dataset_and_utils.py
+class TokenEmbeddingsHandler:
+    def __init__(self, text_encoders, tokenizers):
+        self.text_encoders = text_encoders
+        self.tokenizers = tokenizers
+
+        self.train_ids: Optional[torch.Tensor] = None
+        self.inserting_toks: Optional[List[str]] = None
+        self.embeddings_settings = {}
+
+    def initialize_new_tokens(self, inserting_toks: List[str]):
+        idx = 0
+        for tokenizer, text_encoder in zip(self.tokenizers, self.text_encoders):
+            assert isinstance(inserting_toks, list), "inserting_toks should be a list of strings."
+            assert all(
+                isinstance(tok, str) for tok in inserting_toks
+            ), "All elements in inserting_toks should be strings."
+
+            self.inserting_toks = inserting_toks
+            special_tokens_dict = {"additional_special_tokens": self.inserting_toks}
+            tokenizer.add_special_tokens(special_tokens_dict)
+            text_encoder.resize_token_embeddings(len(tokenizer))
+
+            self.train_ids = tokenizer.convert_tokens_to_ids(self.inserting_toks)
+
+            # random initialization of new tokens
+            std_token_embedding = text_encoder.text_model.embeddings.token_embedding.weight.data.std()
+
+            print(f"{idx} text encoder's std_token_embedding: {std_token_embedding}")
+
+            text_encoder.text_model.embeddings.token_embedding.weight.data[self.train_ids] = (
+                torch.randn(len(self.train_ids), text_encoder.text_model.config.hidden_size)
+                .to(device=self.device)
+                .to(dtype=self.dtype)
+                * std_token_embedding
+            )
+            self.embeddings_settings[
+                f"original_embeddings_{idx}"
+            ] = text_encoder.text_model.embeddings.token_embedding.weight.data.clone()
+            self.embeddings_settings[f"std_token_embedding_{idx}"] = std_token_embedding
+
+            inu = torch.ones((len(tokenizer),), dtype=torch.bool)
+            inu[self.train_ids] = False
+
+            self.embeddings_settings[f"index_no_updates_{idx}"] = inu
+
+            print(self.embeddings_settings[f"index_no_updates_{idx}"].shape)
+
+            idx += 1
+
+    # Copied from train_dreambooth_lora_sdxl_advanced.py
+    def save_embeddings(self, file_path: str):
+        assert self.train_ids is not None, "Initialize new tokens before saving embeddings."
+        tensors = {}
+        # text_encoder_0 - CLIP ViT-L/14, text_encoder_1 -  CLIP ViT-G/14 - TODO - change for sd
+        idx_to_text_encoder_name = {0: "clip_l", 1: "clip_g"}
+        for idx, text_encoder in enumerate(self.text_encoders):
+            assert text_encoder.text_model.embeddings.token_embedding.weight.data.shape[0] == len(
+                self.tokenizers[0]
+            ), "Tokenizers should be the same."
+            new_token_embeddings = text_encoder.text_model.embeddings.token_embedding.weight.data[self.train_ids]
+
+            # New tokens for each text encoder are saved under "clip_l" (for text_encoder 0), "clip_g" (for
+            # text_encoder 1) to keep compatible with the ecosystem.
+            # Note: When loading with diffusers, any name can work - simply specify in inference
+            tensors[idx_to_text_encoder_name[idx]] = new_token_embeddings
+            # tensors[f"text_encoders_{idx}"] = new_token_embeddings
+
+        save_file(tensors, file_path)
+
+    @property
+    def dtype(self):
+        return self.text_encoders[0].dtype
+
+    @property
+    def device(self):
+        return self.text_encoders[0].device
+
+    @torch.no_grad()
+    def retract_embeddings(self):
+        for idx, text_encoder in enumerate(self.text_encoders):
+            index_no_updates = self.embeddings_settings[f"index_no_updates_{idx}"]
+            text_encoder.text_model.embeddings.token_embedding.weight.data[index_no_updates] = (
+                self.embeddings_settings[f"original_embeddings_{idx}"][index_no_updates]
+                .to(device=text_encoder.device)
+                .to(dtype=text_encoder.dtype)
+            )
+
+            # for the parts that were updated, we need to normalize them
+            # to have the same std as before
+            std_token_embedding = self.embeddings_settings[f"std_token_embedding_{idx}"]
+
+            index_updates = ~index_no_updates
+            new_embeddings = text_encoder.text_model.embeddings.token_embedding.weight.data[index_updates]
+            off_ratio = std_token_embedding / new_embeddings.std()
+
+            new_embeddings = new_embeddings * (off_ratio**0.1)
+            text_encoder.text_model.embeddings.token_embedding.weight.data[index_updates] = new_embeddings
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        class_prompt,
+        dataset_name,
+        dataset_config_name,
+        cache_dir,
+        image_column,
+        caption_column,
+        train_text_encoder_ti,
+        class_data_root=None,
+        class_num=None,
+        token_abstraction_dict=None,  # token mapping for textual inversion
+        size=1024,
+        repeats=1,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.custom_instance_prompts = None
+        self.class_prompt = class_prompt
+        self.token_abstraction_dict = token_abstraction_dict
+        self.train_text_encoder_ti = train_text_encoder_ti
+        # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory,
+        # we load the training data using load_dataset
+        if dataset_name is not None:
+            try:
+                from datasets import load_dataset
+            except ImportError:
+                raise ImportError(
+                    "You are trying to load your data using the datasets library. If you wish to train using custom "
+                    "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                    "local folder containing images only, specify --instance_data_dir instead."
+                )
+            # Downloading and loading a dataset from the hub.
+            # See more about loading custom images at
+            # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+            dataset = load_dataset(
+                dataset_name,
+                dataset_config_name,
+                cache_dir=cache_dir,
+            )
+            # Preprocessing the datasets.
+            column_names = dataset["train"].column_names
+
+            # 6. Get the column names for input/target.
+            if image_column is None:
+                image_column = column_names[0]
+                logger.info(f"image column defaulting to {image_column}")
+            else:
+                if image_column not in column_names:
+                    raise ValueError(
+                        f"`--image_column` value '{image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+            instance_images = dataset["train"][image_column]
+
+            if caption_column is None:
+                logger.info(
+                    "No caption column provided, defaulting to instance_prompt for all images. If your dataset "
+                    "contains captions/prompts for the images, make sure to specify the "
+                    "column as --caption_column"
+                )
+                self.custom_instance_prompts = None
+            else:
+                if caption_column not in column_names:
+                    raise ValueError(
+                        f"`--caption_column` value '{caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+                custom_instance_prompts = dataset["train"][caption_column]
+                # create final list of captions according to --repeats
+                self.custom_instance_prompts = []
+                for caption in custom_instance_prompts:
+                    self.custom_instance_prompts.extend(itertools.repeat(caption, repeats))
+        else:
+            self.instance_data_root = Path(instance_data_root)
+            if not self.instance_data_root.exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+            self.custom_instance_prompts = None
+
+        self.instance_images = []
+        for img in instance_images:
+            self.instance_images.extend(itertools.repeat(img, repeats))
+        self.num_instance_images = len(self.instance_images)
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = self.instance_images[index % self.num_instance_images]
+        instance_image = exif_transpose(instance_image)
+
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        example["instance_images"] = self.image_transforms(instance_image)
+
+        if self.custom_instance_prompts:
+            caption = self.custom_instance_prompts[index % self.num_instance_images]
+            if caption:
+                if self.train_text_encoder_ti:
+                    # replace instances of --token_abstraction in caption with the new tokens: "<si><si+1>" etc.
+                    for token_abs, token_replacement in self.token_abstraction_dict.items():
+                        caption = caption.replace(token_abs, "".join(token_replacement))
+                example["instance_prompt"] = caption
+            else:
+                example["instance_prompt"] = self.instance_prompt
+
+        else:  # custom prompts were provided, but length does not match size of image dataset
+            example["instance_prompt"] = self.instance_prompt
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt"] = self.class_prompt
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompts = [example["instance_prompt"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        pixel_values += [example["class_images"] for example in examples]
+        prompts += [example["class_prompt"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    batch = {"pixel_values": pixel_values, "prompts": prompts}
+    return batch
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def tokenize_prompt(tokenizer, prompt, add_special_tokens=False):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        truncation=True,
+        add_special_tokens=add_special_tokens,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(text_encoders, tokenizers, prompt, text_input_ids_list=None):
+    for i, text_encoder in enumerate(text_encoders):
+        if tokenizers is not None:
+            tokenizer = tokenizers[i]
+            text_input_ids = tokenize_prompt(tokenizer, prompt)
+        else:
+            assert text_input_ids_list is not None
+            text_input_ids = text_input_ids_list[i]
+
+        prompt_embeds = text_encoder(
+            text_input_ids.to(text_encoder.device),
+            output_hidden_states=True,
+        )
+
+    return prompt_embeds[0]
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = StableDiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        model_id = args.hub_model_id or Path(args.output_dir).name
+        repo_id = None
+        if args.push_to_hub:
+            repo_id = create_repo(repo_id=model_id, exist_ok=True, token=args.hub_token).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        variant=args.variant,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    vae_scaling_factor = vae.config.scaling_factor
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    if args.train_text_encoder_ti:
+        # we parse the provided token identifier (or identifiers) into a list. s.t. - "TOK" -> ["TOK"], "TOK,
+        # TOK2" -> ["TOK", "TOK2"] etc.
+        token_abstraction_list = "".join(args.token_abstraction.split()).split(",")
+        logger.info(f"list of token identifiers: {token_abstraction_list}")
+
+        token_abstraction_dict = {}
+        token_idx = 0
+        for i, token in enumerate(token_abstraction_list):
+            token_abstraction_dict[token] = [
+                f"<s{token_idx + i + j}>" for j in range(args.num_new_tokens_per_abstraction)
+            ]
+            token_idx += args.num_new_tokens_per_abstraction - 1
+
+        # replace instances of --token_abstraction in --instance_prompt with the new tokens: "<si><si+1>" etc.
+        for token_abs, token_replacement in token_abstraction_dict.items():
+            args.instance_prompt = args.instance_prompt.replace(token_abs, "".join(token_replacement))
+            if args.with_prior_preservation:
+                args.class_prompt = args.class_prompt.replace(token_abs, "".join(token_replacement))
+
+        # initialize the new tokens for textual inversion
+        embedding_handler = TokenEmbeddingsHandler([text_encoder_one], [tokenizer_one])
+        inserting_toks = []
+        for new_tok in token_abstraction_dict.values():
+            inserting_toks.extend(new_tok)
+        embedding_handler.initialize_new_tokens(inserting_toks=inserting_toks)
+
+    # We only train the additional adapter LoRA layers
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+
+    # The VAE is always in float32 to avoid NaN losses.
+    vae.to(accelerator.device, dtype=torch.float32)
+
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, "
+                    "please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+
+    # now we will add new LoRA weights to the attention layers
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        use_dora=args.use_dora,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    unet.add_adapter(unet_lora_config)
+
+    # The text encoder comes from 🤗 transformers, so we cannot directly modify it.
+    # So, instead, we monkey-patch the forward calls of its attention-blocks.
+    if args.train_text_encoder:
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            lora_alpha=args.rank,
+            use_dora=args.use_dora,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder_one.add_adapter(text_lora_config)
+
+    # if we use textual inversion, we freeze all parameters except for the token embeddings
+    # in text encoder
+    elif args.train_text_encoder_ti:
+        text_lora_parameters_one = []
+        for name, param in text_encoder_one.named_parameters():
+            if "token_embedding" in name:
+                # ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16
+                param = param.to(dtype=torch.float32)
+                param.requires_grad = True
+                text_lora_parameters_one.append(param)
+            else:
+                param.requires_grad = False
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [unet]
+        if args.train_text_encoder:
+            models.extend([text_encoder_one])
+        for model in models:
+            for param in model.parameters():
+                # only upcast trainable parameters (LoRA) into fp32
+                if param.requires_grad:
+                    param.data = param.to(torch.float32)
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder atten layers
+            unet_lora_layers_to_save = None
+            text_encoder_one_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(accelerator.unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                elif isinstance(model, type(accelerator.unwrap_model(text_encoder_one))):
+                    if args.train_text_encoder:
+                        text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers(
+                            get_peft_model_state_dict(model)
+                        )
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionPipeline.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=text_encoder_one_lora_layers_to_save,
+            )
+        if args.train_text_encoder_ti:
+            embedding_handler.save_embeddings(f"{args.output_dir}/{Path(args.output_dir).name}_emb.safetensors")
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+        text_encoder_one_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(accelerator.unwrap_model(unet))):
+                unet_ = model
+            elif isinstance(model, type(accelerator.unwrap_model(text_encoder_one))):
+                text_encoder_one_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
+        StableDiffusionLoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_)
+
+        text_encoder_state_dict = {k: v for k, v in lora_state_dict.items() if "text_encoder." in k}
+        StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder(
+            text_encoder_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_one_
+        )
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters()))
+
+    if args.train_text_encoder:
+        text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
+
+    # If neither --train_text_encoder nor --train_text_encoder_ti, text_encoders remain frozen during training
+    freeze_text_encoder = not (args.train_text_encoder or args.train_text_encoder_ti)
+
+    # Optimization parameters
+    unet_lora_parameters_with_lr = {"params": unet_lora_parameters, "lr": args.learning_rate}
+    if not freeze_text_encoder:
+        # different learning rate for text encoder and unet
+        text_lora_parameters_one_with_lr = {
+            "params": text_lora_parameters_one,
+            "weight_decay": args.adam_weight_decay_text_encoder
+            if args.adam_weight_decay_text_encoder
+            else args.adam_weight_decay,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        params_to_optimize = [
+            unet_lora_parameters_with_lr,
+            text_lora_parameters_one_with_lr,
+        ]
+    else:
+        params_to_optimize = [unet_lora_parameters_with_lr]
+
+    # Optimizer creation
+    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    if args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+        if args.train_text_encoder and args.text_encoder_lr:
+            logger.warning(
+                f"Learning rates were provided both for the unet and the text encoder- e.g. text_encoder_lr:"
+                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
+                f"When using prodigy only learning_rate is used as the initial learning rate."
+            )
+            # changes the learning rate of text_encoder_parameters_one to be
+            # --learning_rate
+            params_to_optimize[1]["lr"] = args.learning_rate
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_prompt=args.class_prompt,
+        dataset_name=args.dataset_name,
+        dataset_config_name=args.dataset_config_name,
+        cache_dir=args.cache_dir,
+        image_column=args.image_column,
+        train_text_encoder_ti=args.train_text_encoder_ti,
+        caption_column=args.caption_column,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        token_abstraction_dict=token_abstraction_dict if args.train_text_encoder_ti else None,
+        class_num=args.num_class_images,
+        size=args.resolution,
+        repeats=args.repeats,
+        center_crop=args.center_crop,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    if not args.train_text_encoder:
+        tokenizers = [tokenizer_one]
+        text_encoders = [text_encoder_one]
+
+        def compute_text_embeddings(prompt, text_encoders, tokenizers):
+            with torch.no_grad():
+                prompt_embeds = encode_prompt(text_encoders, tokenizers, prompt)
+                prompt_embeds = prompt_embeds.to(accelerator.device)
+            return prompt_embeds
+
+    # If no type of tuning is done on the text_encoder and custom instance prompts are NOT
+    # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
+    # the redundant encoding.
+    if freeze_text_encoder and not train_dataset.custom_instance_prompts:
+        instance_prompt_hidden_states = compute_text_embeddings(args.instance_prompt, text_encoders, tokenizers)
+
+    # Handle class prompt for prior-preservation.
+    if args.with_prior_preservation:
+        if freeze_text_encoder:
+            class_prompt_hidden_states = compute_text_embeddings(args.class_prompt, text_encoders, tokenizers)
+
+    # Clear the memory here
+    if freeze_text_encoder and not train_dataset.custom_instance_prompts:
+        del tokenizers, text_encoders
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    # if --train_text_encoder_ti we need add_special_tokens to be True for textual inversion
+    add_special_tokens = True if args.train_text_encoder_ti else False
+
+    if not train_dataset.custom_instance_prompts:
+        if freeze_text_encoder:
+            prompt_embeds = instance_prompt_hidden_states
+            if args.with_prior_preservation:
+                prompt_embeds = torch.cat([prompt_embeds, class_prompt_hidden_states], dim=0)
+
+        # if we're optimizing the text encoder (both if instance prompt is used for all images or custom prompts) we need to tokenize and encode the
+        # batch prompts on all training steps
+        else:
+            tokens_one = tokenize_prompt(tokenizer_one, args.instance_prompt, add_special_tokens)
+            if args.with_prior_preservation:
+                class_tokens_one = tokenize_prompt(tokenizer_one, args.class_prompt, add_special_tokens)
+                tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0)
+
+    if args.train_text_encoder_ti and args.validation_prompt:
+        # replace instances of --token_abstraction in validation prompt with the new tokens: "<si><si+1>" etc.
+        for token_abs, token_replacement in train_dataset.token_abstraction_dict.items():
+            args.validation_prompt = args.validation_prompt.replace(token_abs, "".join(token_replacement))
+    print("validation prompt:", args.validation_prompt)
+
+    if args.cache_latents:
+        latents_cache = []
+        for batch in tqdm(train_dataloader, desc="Caching latents"):
+            with torch.no_grad():
+                batch["pixel_values"] = batch["pixel_values"].to(
+                    accelerator.device, non_blocking=True, dtype=torch.float32
+                )
+                latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
+
+        if args.validation_prompt is None:
+            del vae
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Scheduler and math around the number of training steps.
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
+    if args.max_train_steps is None:
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * num_update_steps_per_epoch * accelerator.num_processes
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if not freeze_text_encoder:
+        unet, text_encoder_one, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder_one, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if num_training_steps_for_scheduler != args.max_train_steps * accelerator.num_processes:
+        logger.warning(
+            f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+            f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+            f"This inconsistency may result in the learning rate scheduler not functioning properly."
+        )
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("dreambooth-lora-sd-15", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    if args.train_text_encoder:
+        num_train_epochs_text_encoder = int(args.train_text_encoder_frac * args.num_train_epochs)
+    elif args.train_text_encoder_ti:  # args.train_text_encoder_ti
+        num_train_epochs_text_encoder = int(args.train_text_encoder_ti_frac * args.num_train_epochs)
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        # if performing any kind of optimization of text_encoder params
+        if args.train_text_encoder or args.train_text_encoder_ti:
+            if epoch == num_train_epochs_text_encoder:
+                print("PIVOT HALFWAY", epoch)
+                # stopping optimization of text_encoder params
+                # re setting the optimizer to optimize only on unet params
+                optimizer.param_groups[1]["lr"] = 0.0
+
+            else:
+                # still optimizng the text encoder
+                text_encoder_one.train()
+                # set top parameter requires_grad = True for gradient checkpointing works
+                if args.train_text_encoder:
+                    text_encoder_one.text_model.embeddings.requires_grad_(True)
+
+        unet.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                prompts = batch["prompts"]
+                # encode batch prompts when custom prompts are provided for each image -
+                if train_dataset.custom_instance_prompts:
+                    if freeze_text_encoder:
+                        prompt_embeds = compute_text_embeddings(prompts, text_encoders, tokenizers)
+
+                    else:
+                        tokens_one = tokenize_prompt(tokenizer_one, prompts, add_special_tokens)
+
+                if args.cache_latents:
+                    model_input = latents_cache[step].sample()
+                else:
+                    pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                    model_input = vae.encode(pixel_values).latent_dist.sample()
+
+                model_input = model_input * vae_scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    model_input = model_input.to(weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (model_input.shape[0], model_input.shape[1], 1, 1), device=model_input.device
+                    )
+                bsz = model_input.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                )
+                timesteps = timesteps.long()
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+
+                # Calculate the elements to repeat depending on the use of prior-preservation and custom captions.
+                if not train_dataset.custom_instance_prompts:
+                    elems_to_repeat_text_embeds = bsz // 2 if args.with_prior_preservation else bsz
+
+                else:
+                    elems_to_repeat_text_embeds = 1
+
+                # Predict the noise residual
+                if freeze_text_encoder:
+                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
+                    model_pred = unet(noisy_model_input, timesteps, prompt_embeds_input).sample
+                else:
+                    prompt_embeds = encode_prompt(
+                        text_encoders=[text_encoder_one],
+                        tokenizers=None,
+                        prompt=None,
+                        text_input_ids_list=[tokens_one],
+                    )
+                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
+                    model_pred = unet(noisy_model_input, timesteps, prompt_embeds_input).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute prior loss
+                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+
+                    if args.with_prior_preservation:
+                        # if we're using prior preservation, we calc snr for instance loss only -
+                        # and hence only need timesteps corresponding to instance images
+                        snr_timesteps, _ = torch.chunk(timesteps, 2, dim=0)
+                    else:
+                        snr_timesteps = timesteps
+
+                    snr = compute_snr(noise_scheduler, snr_timesteps)
+                    base_weight = (
+                        torch.stack([snr, args.snr_gamma * torch.ones_like(snr_timesteps)], dim=1).min(dim=1)[0] / snr
+                    )
+
+                    if noise_scheduler.config.prediction_type == "v_prediction":
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(unet_lora_parameters, text_lora_parameters_one)
+                        if (args.train_text_encoder or args.train_text_encoder_ti)
+                        else unet_lora_parameters
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+                # every step, we reset the embeddings to the original embeddings.
+                if args.train_text_encoder_ti:
+                    for idx, text_encoder in enumerate(text_encoders):
+                        embedding_handler.retract_embeddings()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                # create pipeline
+                if freeze_text_encoder:
+                    text_encoder_one = text_encoder_cls_one.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        subfolder="text_encoder",
+                        revision=args.revision,
+                        variant=args.variant,
+                    )
+                pipeline = StableDiffusionPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    tokenizer=tokenizer_one,
+                    text_encoder=accelerator.unwrap_model(text_encoder_one),
+                    unet=accelerator.unwrap_model(unet),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+
+                # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+                scheduler_args = {}
+
+                if "variance_type" in pipeline.scheduler.config:
+                    variance_type = pipeline.scheduler.config.variance_type
+
+                    if variance_type in ["learned", "learned_range"]:
+                        variance_type = "fixed_small"
+
+                    scheduler_args["variance_type"] = variance_type
+
+                pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
+                    pipeline.scheduler.config, **scheduler_args
+                )
+
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)
+
+                # run inference
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+                pipeline_args = {"prompt": args.validation_prompt}
+
+                if torch.backends.mps.is_available():
+                    autocast_ctx = nullcontext()
+                else:
+                    autocast_ctx = torch.autocast(accelerator.device.type)
+
+                with autocast_ctx:
+                    images = [
+                        pipeline(**pipeline_args, generator=generator).images[0]
+                        for _ in range(args.num_validation_images)
+                    ]
+
+                for tracker in accelerator.trackers:
+                    if tracker.name == "tensorboard":
+                        np_images = np.stack([np.asarray(img) for img in images])
+                        tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                    if tracker.name == "wandb":
+                        tracker.log(
+                            {
+                                "validation": [
+                                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                    for i, image in enumerate(images)
+                                ]
+                            }
+                        )
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        unet = unet.to(torch.float32)
+        unet_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        if args.train_text_encoder:
+            text_encoder_one = accelerator.unwrap_model(text_encoder_one)
+            text_encoder_lora_layers = convert_state_dict_to_diffusers(
+                get_peft_model_state_dict(text_encoder_one.to(torch.float32))
+            )
+        else:
+            text_encoder_lora_layers = None
+
+        StableDiffusionPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_layers,
+            text_encoder_lora_layers=text_encoder_lora_layers,
+        )
+
+        if args.train_text_encoder_ti:
+            embeddings_path = f"{args.output_dir}/{args.output_dir}_emb.safetensors"
+            embedding_handler.save_embeddings(embeddings_path)
+
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            # Final inference
+            # Load previous pipeline
+            vae = AutoencoderKL.from_pretrained(
+                vae_path,
+                subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+                revision=args.revision,
+                variant=args.variant,
+                torch_dtype=weight_dtype,
+            )
+            pipeline = StableDiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                vae=vae,
+                revision=args.revision,
+                variant=args.variant,
+                torch_dtype=weight_dtype,
+            )
+
+            # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+            scheduler_args = {}
+
+            if "variance_type" in pipeline.scheduler.config:
+                variance_type = pipeline.scheduler.config.variance_type
+
+                if variance_type in ["learned", "learned_range"]:
+                    variance_type = "fixed_small"
+
+                scheduler_args["variance_type"] = variance_type
+
+            pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
+            # load attention processors
+            pipeline.load_lora_weights(args.output_dir)
+
+            # load new tokens
+            if args.train_text_encoder_ti:
+                state_dict = load_file(embeddings_path)
+                all_new_tokens = []
+                for key, value in token_abstraction_dict.items():
+                    all_new_tokens.extend(value)
+                pipeline.load_textual_inversion(
+                    state_dict["clip_l"],
+                    token=all_new_tokens,
+                    text_encoder=pipeline.text_encoder,
+                    tokenizer=pipeline.tokenizer,
+                )
+            # run inference
+            pipeline = pipeline.to(accelerator.device)
+            generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+            images = [
+                pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+                for _ in range(args.num_validation_images)
+            ]
+
+            for tracker in accelerator.trackers:
+                if tracker.name == "tensorboard":
+                    np_images = np.stack([np.asarray(img) for img in images])
+                    tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+                if tracker.name == "wandb":
+                    tracker.log(
+                        {
+                            "test": [
+                                wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                for i, image in enumerate(images)
+                            ]
+                        }
+                    )
+
+        # Convert to WebUI format
+        lora_state_dict = load_file(f"{args.output_dir}/pytorch_lora_weights.safetensors")
+        peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict)
+        kohya_state_dict = convert_state_dict_to_kohya(peft_state_dict)
+        save_file(kohya_state_dict, f"{args.output_dir}/{Path(args.output_dir).name}.safetensors")
+
+        save_model_card(
+            model_id if not args.push_to_hub else repo_id,
+            use_dora=args.use_dora,
+            images=images,
+            base_model=args.pretrained_model_name_or_path,
+            train_text_encoder=args.train_text_encoder,
+            train_text_encoder_ti=args.train_text_encoder_ti,
+            token_abstraction_dict=train_dataset.token_abstraction_dict,
+            instance_prompt=args.instance_prompt,
+            validation_prompt=args.validation_prompt,
+            repo_folder=args.output_dir,
+            vae_path=args.pretrained_vae_model_name_or_path,
+        )
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/diffusers/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
new file mode 100644
index 0000000000000000000000000000000000000000..5222c8afe6f1acdae1de0a61d77bd48a0a34cbab
--- /dev/null
+++ b/diffusers/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
@@ -0,0 +1,2459 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import gc
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import re
+import shutil
+import warnings
+from contextlib import nullcontext
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, hf_hub_download, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from packaging import version
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from safetensors.torch import load_file, save_file
+from torch.utils.data import Dataset
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DPMSolverMultistepScheduler,
+    EDMEulerScheduler,
+    EulerDiscreteScheduler,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr
+from diffusers.utils import (
+    check_min_version,
+    convert_all_state_dict_to_peft,
+    convert_state_dict_to_diffusers,
+    convert_state_dict_to_kohya,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def determine_scheduler_type(pretrained_model_name_or_path, revision):
+    model_index_filename = "model_index.json"
+    if os.path.isdir(pretrained_model_name_or_path):
+        model_index = os.path.join(pretrained_model_name_or_path, model_index_filename)
+    else:
+        model_index = hf_hub_download(
+            repo_id=pretrained_model_name_or_path, filename=model_index_filename, revision=revision
+        )
+
+    with open(model_index, "r") as f:
+        scheduler_type = json.load(f)["scheduler"][1]
+    return scheduler_type
+
+
+def save_model_card(
+    repo_id: str,
+    use_dora: bool,
+    images=None,
+    base_model: str = None,
+    train_text_encoder=False,
+    train_text_encoder_ti=False,
+    token_abstraction_dict=None,
+    instance_prompt: str = None,
+    validation_prompt: str = None,
+    repo_folder=None,
+    vae_path=None,
+):
+    img_str = "widget:\n"
+    lora = "lora" if not use_dora else "dora"
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"""
+        - text: '{validation_prompt if validation_prompt else ' ' }'
+          output:
+            url:
+                "image_{i}.png"
+        """
+    if not images:
+        img_str += f"""
+        - text: '{instance_prompt}'
+        """
+    embeddings_filename = f"{repo_folder}_emb"
+    instance_prompt_webui = re.sub(r"<s\d+>", "", re.sub(r"<s\d+>", embeddings_filename, instance_prompt, count=1))
+    ti_keys = ", ".join(f'"{match}"' for match in re.findall(r"<s\d+>", instance_prompt))
+    if instance_prompt_webui != embeddings_filename:
+        instance_prompt_sentence = f"For example, `{instance_prompt_webui}`"
+    else:
+        instance_prompt_sentence = ""
+    trigger_str = f"You should use {instance_prompt} to trigger the image generation."
+    diffusers_imports_pivotal = ""
+    diffusers_example_pivotal = ""
+    webui_example_pivotal = ""
+    license = ""
+    if "playground" in base_model:
+        license = """\n
+    ## License
+
+    Please adhere to the licensing terms as described [here](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic/blob/main/LICENSE.md).
+    """
+
+    if train_text_encoder_ti:
+        trigger_str = (
+            "To trigger image generation of trained concept(or concepts) replace each concept identifier "
+            "in you prompt with the new inserted tokens:\n"
+        )
+        diffusers_imports_pivotal = """from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
+        """
+        diffusers_example_pivotal = f"""embedding_path = hf_hub_download(repo_id='{repo_id}', filename='{embeddings_filename}.safetensors', repo_type="model")
+state_dict = load_file(embedding_path)
+pipeline.load_textual_inversion(state_dict["clip_l"], token=[{ti_keys}], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer)
+pipeline.load_textual_inversion(state_dict["clip_g"], token=[{ti_keys}], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2)
+        """
+        webui_example_pivotal = f"""- *Embeddings*: download **[`{embeddings_filename}.safetensors` here 💾](/{repo_id}/blob/main/{embeddings_filename}.safetensors)**.
+    - Place it on it on your `embeddings` folder
+    - Use it by adding `{embeddings_filename}` to your prompt. {instance_prompt_sentence}
+    (you need both the LoRA and the embeddings as they were trained together for this LoRA)
+    """
+        if token_abstraction_dict:
+            for key, value in token_abstraction_dict.items():
+                tokens = "".join(value)
+                trigger_str += f"""
+to trigger concept `{key}` → use `{tokens}` in your prompt \n
+"""
+
+    yaml = f"""---
+tags:
+- stable-diffusion-xl
+- stable-diffusion-xl-diffusers
+- diffusers-training
+- text-to-image
+- diffusers
+- {lora}
+- template:sd-lora
+{img_str}
+base_model: {base_model}
+instance_prompt: {instance_prompt}
+license: openrail++
+---
+"""
+
+    model_card = f"""
+# SDXL LoRA DreamBooth - {repo_id}
+
+<Gallery />
+
+## Model description
+
+### These are {repo_id} LoRA adaption weights for {base_model}.
+
+## Download model
+
+### Use it with UIs such as AUTOMATIC1111, Comfy UI, SD.Next, Invoke
+
+- **LoRA**: download **[`{repo_folder}.safetensors` here 💾](/{repo_id}/blob/main/{repo_folder}.safetensors)**.
+    - Place it on your `models/Lora` folder.
+    - On AUTOMATIC1111, load the LoRA by adding `<lora:{repo_folder}:1>` to your prompt. On ComfyUI just [load it as a regular LoRA](https://comfyanonymous.github.io/ComfyUI_examples/lora/).
+{webui_example_pivotal}
+
+## Use it with the [🧨 diffusers library](https://github.com/huggingface/diffusers)
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+{diffusers_imports_pivotal}
+pipeline = AutoPipelineForText2Image.from_pretrained('stabilityai/stable-diffusion-xl-base-1.0', torch_dtype=torch.float16).to('cuda')
+pipeline.load_lora_weights('{repo_id}', weight_name='pytorch_lora_weights.safetensors')
+{diffusers_example_pivotal}
+image = pipeline('{validation_prompt if validation_prompt else instance_prompt}').images[0]
+```
+
+For more details, including weighting, merging and fusing LoRAs, check the [documentation on loading LoRAs in diffusers](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading_adapters)
+
+## Trigger words
+
+{trigger_str}
+
+## Details
+All [Files & versions](/{repo_id}/tree/main).
+
+The weights were trained using [🧨 diffusers Advanced Dreambooth Training Script](https://github.com/huggingface/diffusers/blob/main/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py).
+
+LoRA for the text encoder was enabled. {train_text_encoder}.
+
+Pivotal tuning was enabled: {train_text_encoder_ti}.
+
+Special VAE used for training: {vae_path}.
+
+{license}
+"""
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+
+    # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+    scheduler_args = {}
+
+    if not args.do_edm_style_training:
+        if "variance_type" in pipeline.scheduler.config:
+            variance_type = pipeline.scheduler.config.variance_type
+
+            if variance_type in ["learned", "learned_range"]:
+                variance_type = "fixed_small"
+
+            scheduler_args["variance_type"] = variance_type
+
+        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    # Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better
+    # way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
+    if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    with autocast_ctx:
+        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand.To load the custom captions, the training set directory needs to follow the structure of a "
+            "datasets ImageFolder, containing both the images and the corresponding caption for each image. see: "
+            "https://huggingface.co/docs/datasets/image_dataset for more information"
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset. In some cases, a dataset may have more than one configuration (for example "
+        "if it contains different subsets of data within, and you only wish to load a specific subset - in that case specify the desired configuration using --dataset_config_name. Leave as "
+        "None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help="A path to local folder containing the training data of instance images. Specify this arg instead of "
+        "--dataset_name if you wish to train using a local folder without custom captions. If you wish to train with custom captions please specify "
+        "--dataset_name instead.",
+    )
+
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing the target image. By "
+        "default, the standard Image Dataset maps out 'file_name' "
+        "to 'image'.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default=None,
+        help="The column of the dataset containing the instance prompt for each image",
+    )
+
+    parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
+
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--token_abstraction",
+        type=str,
+        default="TOK",
+        help="identifier specifying the instance(or instances) as used in instance_prompt, validation prompt, "
+        "captions - e.g. TOK. To use multiple identifiers, please specify them in a comma separated string - e.g. "
+        "'TOK,TOK2,TOK3' etc.",
+    )
+
+    parser.add_argument(
+        "--num_new_tokens_per_abstraction",
+        type=int,
+        default=2,
+        help="number of new tokens inserted to the tokenizers per token_abstraction identifier when "
+        "--train_text_encoder_ti = True. By default, each --token_abstraction (e.g. TOK) is mapped to 2 new "
+        "tokens - <si><si+1> ",
+    )
+
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--do_edm_style_training",
+        default=False,
+        action="store_true",
+        help="Flag to conduct training using the EDM formulation as introduced in https://arxiv.org/abs/2206.00364.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lora-dreambooth-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--clip_skip",
+        type=int,
+        default=None,
+        help="Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that "
+        "the output of the pre-final layer will be used for computing the prompt embeddings.",
+    )
+
+    parser.add_argument(
+        "--text_encoder_lr",
+        type=float,
+        default=5e-6,
+        help="Text encoder learning rate to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+
+    parser.add_argument(
+        "--train_text_encoder_ti",
+        action="store_true",
+        help=("Whether to use textual inversion"),
+    )
+
+    parser.add_argument(
+        "--train_text_encoder_ti_frac",
+        type=float,
+        default=0.5,
+        help=("The percentage of epochs to perform textual inversion"),
+    )
+
+    parser.add_argument(
+        "--train_text_encoder_frac",
+        type=float,
+        default=1.0,
+        help=("The percentage of epochs to perform text encoder tuning"),
+    )
+
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="AdamW",
+        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="coefficients for computing the Prodigy stepsize using running averages. If set to None, "
+        "uses the value of square root of beta2. Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_weight_decay_text_encoder", type=float, default=None, help="Weight decay to use for text_encoder"
+    )
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+
+    parser.add_argument(
+        "--prodigy_use_bias_correction",
+        type=bool,
+        default=True,
+        help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW",
+    )
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        type=bool,
+        default=True,
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. "
+        "Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--use_dora",
+        action="store_true",
+        default=False,
+        help=(
+            "Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
+            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
+        ),
+    )
+    parser.add_argument(
+        "--lora_unet_blocks",
+        type=str,
+        default=None,
+        help=(
+            "the U-net blocks to tune during training. please specify them in a comma separated string, e.g. `unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1` etc."
+            "NOTE: By default (if not specified) - regular LoRA training is performed. "
+            "if --use_blora is enabled, this arg will be ignored, since in B-LoRA training, targeted U-net blocks are `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`"
+        ),
+    )
+    parser.add_argument(
+        "--use_blora",
+        action="store_true",
+        help=(
+            "Whether to train a B-LoRA as proposed in- Implicit Style-Content Separation using B-LoRA https://arxiv.org/abs/2403.14572. "
+        ),
+    )
+    parser.add_argument(
+        "--cache_latents",
+        action="store_true",
+        default=False,
+        help="Cache the VAE latents",
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.instance_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`")
+
+    if args.dataset_name is not None and args.instance_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`")
+
+    if args.train_text_encoder and args.train_text_encoder_ti:
+        raise ValueError(
+            "Specify only one of `--train_text_encoder` or `--train_text_encoder_ti. "
+            "For full LoRA text encoder training check --train_text_encoder, for textual "
+            "inversion training check `--train_text_encoder_ti`"
+        )
+    if args.use_blora and args.lora_unet_blocks:
+        warnings.warn(
+            "You specified both `--use_blora` and `--lora_unet_blocks`, for B-LoRA training, target unet blocks are: `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`. "
+            "If you wish to target different U-net blocks, don't enable `--use_blora`"
+        )
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+# Taken (and slightly modified) from B-LoRA repo https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
+def is_belong_to_blocks(key, blocks):
+    try:
+        for g in blocks:
+            if g in key:
+                return True
+        return False
+    except Exception as e:
+        raise type(e)(f"failed to is_belong_to_block, due to: {e}")
+
+
+def get_unet_lora_target_modules(unet, use_blora, target_blocks=None):
+    if use_blora:
+        content_b_lora_blocks = "unet.up_blocks.0.attentions.0"
+        style_b_lora_blocks = "unet.up_blocks.0.attentions.1"
+        target_blocks = [content_b_lora_blocks, style_b_lora_blocks]
+    try:
+        blocks = [(".").join(blk.split(".")[1:]) for blk in target_blocks]
+
+        attns = [
+            attn_processor_name.rsplit(".", 1)[0]
+            for attn_processor_name, _ in unet.attn_processors.items()
+            if is_belong_to_blocks(attn_processor_name, blocks)
+        ]
+
+        target_modules = [f"{attn}.{mat}" for mat in ["to_k", "to_q", "to_v", "to_out.0"] for attn in attns]
+        return target_modules
+    except Exception as e:
+        raise type(e)(
+            f"failed to get_target_modules, due to: {e}. "
+            f"Please check the modules specified in --lora_unet_blocks are correct"
+        )
+
+
+# Taken from https://github.com/replicate/cog-sdxl/blob/main/dataset_and_utils.py
+class TokenEmbeddingsHandler:
+    def __init__(self, text_encoders, tokenizers):
+        self.text_encoders = text_encoders
+        self.tokenizers = tokenizers
+
+        self.train_ids: Optional[torch.Tensor] = None
+        self.inserting_toks: Optional[List[str]] = None
+        self.embeddings_settings = {}
+
+    def initialize_new_tokens(self, inserting_toks: List[str]):
+        idx = 0
+        for tokenizer, text_encoder in zip(self.tokenizers, self.text_encoders):
+            assert isinstance(inserting_toks, list), "inserting_toks should be a list of strings."
+            assert all(
+                isinstance(tok, str) for tok in inserting_toks
+            ), "All elements in inserting_toks should be strings."
+
+            self.inserting_toks = inserting_toks
+            special_tokens_dict = {"additional_special_tokens": self.inserting_toks}
+            tokenizer.add_special_tokens(special_tokens_dict)
+            text_encoder.resize_token_embeddings(len(tokenizer))
+
+            self.train_ids = tokenizer.convert_tokens_to_ids(self.inserting_toks)
+
+            # random initialization of new tokens
+            std_token_embedding = text_encoder.text_model.embeddings.token_embedding.weight.data.std()
+
+            print(f"{idx} text encoder's std_token_embedding: {std_token_embedding}")
+
+            text_encoder.text_model.embeddings.token_embedding.weight.data[self.train_ids] = (
+                torch.randn(len(self.train_ids), text_encoder.text_model.config.hidden_size)
+                .to(device=self.device)
+                .to(dtype=self.dtype)
+                * std_token_embedding
+            )
+            self.embeddings_settings[
+                f"original_embeddings_{idx}"
+            ] = text_encoder.text_model.embeddings.token_embedding.weight.data.clone()
+            self.embeddings_settings[f"std_token_embedding_{idx}"] = std_token_embedding
+
+            inu = torch.ones((len(tokenizer),), dtype=torch.bool)
+            inu[self.train_ids] = False
+
+            self.embeddings_settings[f"index_no_updates_{idx}"] = inu
+
+            print(self.embeddings_settings[f"index_no_updates_{idx}"].shape)
+
+            idx += 1
+
+    def save_embeddings(self, file_path: str):
+        assert self.train_ids is not None, "Initialize new tokens before saving embeddings."
+        tensors = {}
+        # text_encoder_0 - CLIP ViT-L/14, text_encoder_1 -  CLIP ViT-G/14
+        idx_to_text_encoder_name = {0: "clip_l", 1: "clip_g"}
+        for idx, text_encoder in enumerate(self.text_encoders):
+            assert text_encoder.text_model.embeddings.token_embedding.weight.data.shape[0] == len(
+                self.tokenizers[0]
+            ), "Tokenizers should be the same."
+            new_token_embeddings = text_encoder.text_model.embeddings.token_embedding.weight.data[self.train_ids]
+
+            # New tokens for each text encoder are saved under "clip_l" (for text_encoder 0), "clip_g" (for
+            # text_encoder 1) to keep compatible with the ecosystem.
+            # Note: When loading with diffusers, any name can work - simply specify in inference
+            tensors[idx_to_text_encoder_name[idx]] = new_token_embeddings
+            # tensors[f"text_encoders_{idx}"] = new_token_embeddings
+
+        save_file(tensors, file_path)
+
+    @property
+    def dtype(self):
+        return self.text_encoders[0].dtype
+
+    @property
+    def device(self):
+        return self.text_encoders[0].device
+
+    @torch.no_grad()
+    def retract_embeddings(self):
+        for idx, text_encoder in enumerate(self.text_encoders):
+            index_no_updates = self.embeddings_settings[f"index_no_updates_{idx}"]
+            text_encoder.text_model.embeddings.token_embedding.weight.data[index_no_updates] = (
+                self.embeddings_settings[f"original_embeddings_{idx}"][index_no_updates]
+                .to(device=text_encoder.device)
+                .to(dtype=text_encoder.dtype)
+            )
+
+            # for the parts that were updated, we need to normalize them
+            # to have the same std as before
+            std_token_embedding = self.embeddings_settings[f"std_token_embedding_{idx}"]
+
+            index_updates = ~index_no_updates
+            new_embeddings = text_encoder.text_model.embeddings.token_embedding.weight.data[index_updates]
+            off_ratio = std_token_embedding / new_embeddings.std()
+
+            new_embeddings = new_embeddings * (off_ratio**0.1)
+            text_encoder.text_model.embeddings.token_embedding.weight.data[index_updates] = new_embeddings
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        class_prompt,
+        train_text_encoder_ti,
+        class_data_root=None,
+        class_num=None,
+        token_abstraction_dict=None,  # token mapping for textual inversion
+        size=1024,
+        repeats=1,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.custom_instance_prompts = None
+        self.class_prompt = class_prompt
+        self.token_abstraction_dict = token_abstraction_dict
+        self.train_text_encoder_ti = train_text_encoder_ti
+        # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory,
+        # we load the training data using load_dataset
+        if args.dataset_name is not None:
+            try:
+                from datasets import load_dataset
+            except ImportError:
+                raise ImportError(
+                    "You are trying to load your data using the datasets library. If you wish to train using custom "
+                    "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                    "local folder containing images only, specify --instance_data_dir instead."
+                )
+            # Downloading and loading a dataset from the hub.
+            # See more about loading custom images at
+            # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+            dataset = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                cache_dir=args.cache_dir,
+            )
+            # Preprocessing the datasets.
+            column_names = dataset["train"].column_names
+
+            # 6. Get the column names for input/target.
+            if args.image_column is None:
+                image_column = column_names[0]
+                logger.info(f"image column defaulting to {image_column}")
+            else:
+                image_column = args.image_column
+                if image_column not in column_names:
+                    raise ValueError(
+                        f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+            instance_images = dataset["train"][image_column]
+
+            if args.caption_column is None:
+                logger.info(
+                    "No caption column provided, defaulting to instance_prompt for all images. If your dataset "
+                    "contains captions/prompts for the images, make sure to specify the "
+                    "column as --caption_column"
+                )
+                self.custom_instance_prompts = None
+            else:
+                if args.caption_column not in column_names:
+                    raise ValueError(
+                        f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+                custom_instance_prompts = dataset["train"][args.caption_column]
+                # create final list of captions according to --repeats
+                self.custom_instance_prompts = []
+                for caption in custom_instance_prompts:
+                    self.custom_instance_prompts.extend(itertools.repeat(caption, repeats))
+        else:
+            self.instance_data_root = Path(instance_data_root)
+            if not self.instance_data_root.exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+            self.custom_instance_prompts = None
+
+        self.instance_images = []
+        for img in instance_images:
+            self.instance_images.extend(itertools.repeat(img, repeats))
+
+        # image processing to prepare for using SD-XL micro-conditioning
+        self.original_sizes = []
+        self.crop_top_lefts = []
+        self.pixel_values = []
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
+        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
+        train_flip = transforms.RandomHorizontalFlip(p=1.0)
+        train_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+        # if using B-LoRA for single image. do not use transformations
+        single_image = len(self.instance_images) < 2
+        for image in self.instance_images:
+            if not single_image:
+                image = exif_transpose(image)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+            self.original_sizes.append((image.height, image.width))
+            image = train_resize(image)
+
+            if not single_image and args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop or single_image:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            crop_top_left = (y1, x1)
+            self.crop_top_lefts.append(crop_top_left)
+            image = train_transforms(image)
+            self.pixel_values.append(image)
+
+        self.num_instance_images = len(self.instance_images)
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+
+            self.original_sizes_class_imgs = []
+            self.crop_top_lefts_class_imgs = []
+            self.pixel_values_class_imgs = []
+            self.class_images = [Image.open(path) for path in self.class_images_path]
+            for image in self.class_images:
+                image = exif_transpose(image)
+                if not image.mode == "RGB":
+                    image = image.convert("RGB")
+                self.original_sizes_class_imgs.append((image.height, image.width))
+                image = train_resize(image)
+                if args.random_flip and random.random() < 0.5:
+                    # flip
+                    image = train_flip(image)
+                if args.center_crop:
+                    y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                    x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                    image = train_crop(image)
+                else:
+                    y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                    image = crop(image, y1, x1, h, w)
+                crop_top_left = (y1, x1)
+                self.crop_top_lefts_class_imgs.append(crop_top_left)
+                image = train_transforms(image)
+                self.pixel_values_class_imgs.append(image)
+
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        example["instance_images"] = self.pixel_values[index % self.num_instance_images]
+        example["original_size"] = self.original_sizes[index % self.num_instance_images]
+        example["crop_top_left"] = self.crop_top_lefts[index % self.num_instance_images]
+
+        if self.custom_instance_prompts:
+            caption = self.custom_instance_prompts[index % self.num_instance_images]
+            if caption:
+                if self.train_text_encoder_ti:
+                    # replace instances of --token_abstraction in caption with the new tokens: "<si><si+1>" etc.
+                    for token_abs, token_replacement in self.token_abstraction_dict.items():
+                        caption = caption.replace(token_abs, "".join(token_replacement))
+                example["instance_prompt"] = caption
+            else:
+                example["instance_prompt"] = self.instance_prompt
+
+        else:  # custom prompts were provided, but length does not match size of image dataset
+            example["instance_prompt"] = self.instance_prompt
+
+        if self.class_data_root:
+            example["class_prompt"] = self.class_prompt
+            example["class_images"] = self.pixel_values_class_imgs[index % self.num_class_images]
+            example["class_original_size"] = self.original_sizes_class_imgs[index % self.num_class_images]
+            example["class_crop_top_left"] = self.crop_top_lefts_class_imgs[index % self.num_class_images]
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompts = [example["instance_prompt"] for example in examples]
+    original_sizes = [example["original_size"] for example in examples]
+    crop_top_lefts = [example["crop_top_left"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        pixel_values += [example["class_images"] for example in examples]
+        prompts += [example["class_prompt"] for example in examples]
+        original_sizes += [example["class_original_size"] for example in examples]
+        crop_top_lefts += [example["class_crop_top_left"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    batch = {
+        "pixel_values": pixel_values,
+        "prompts": prompts,
+        "original_sizes": original_sizes,
+        "crop_top_lefts": crop_top_lefts,
+    }
+    return batch
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def tokenize_prompt(tokenizer, prompt, add_special_tokens=False):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        truncation=True,
+        add_special_tokens=add_special_tokens,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(text_encoders, tokenizers, prompt, text_input_ids_list=None, clip_skip=None):
+    prompt_embeds_list = []
+
+    for i, text_encoder in enumerate(text_encoders):
+        if tokenizers is not None:
+            tokenizer = tokenizers[i]
+            text_input_ids = tokenize_prompt(tokenizer, prompt)
+        else:
+            assert text_input_ids_list is not None
+            text_input_ids = text_input_ids_list[i]
+
+        prompt_embeds = text_encoder(
+            text_input_ids.to(text_encoder.device), output_hidden_states=True, return_dict=False
+        )
+
+        # We are only ALWAYS interested in the pooled output of the final text encoder
+        pooled_prompt_embeds = prompt_embeds[0]
+        if clip_skip is None:
+            prompt_embeds = prompt_embeds[-1][-2]
+        else:
+            # "2" because SDXL always indexes from the penultimate layer.
+            prompt_embeds = prompt_embeds[-1][-(clip_skip + 2)]
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+        prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if args.do_edm_style_training and args.snr_gamma is not None:
+        raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
+            torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = StableDiffusionXLPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        model_id = args.hub_model_id or Path(args.output_dir).name
+        repo_id = None
+        if args.push_to_hub:
+            repo_id = create_repo(repo_id=model_id, exist_ok=True, token=args.hub_token).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        variant=args.variant,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        variant=args.variant,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    scheduler_type = determine_scheduler_type(args.pretrained_model_name_or_path, args.revision)
+    if "EDM" in scheduler_type:
+        args.do_edm_style_training = True
+        noise_scheduler = EDMEulerScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+        logger.info("Performing EDM-style training!")
+    elif args.do_edm_style_training:
+        noise_scheduler = EulerDiscreteScheduler.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="scheduler"
+        )
+        logger.info("Performing EDM-style training!")
+    else:
+        noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    latents_mean = latents_std = None
+    if hasattr(vae.config, "latents_mean") and vae.config.latents_mean is not None:
+        latents_mean = torch.tensor(vae.config.latents_mean).view(1, 4, 1, 1)
+    if hasattr(vae.config, "latents_std") and vae.config.latents_std is not None:
+        latents_std = torch.tensor(vae.config.latents_std).view(1, 4, 1, 1)
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    if args.train_text_encoder_ti:
+        # we parse the provided token identifier (or identifiers) into a list. s.t. - "TOK" -> ["TOK"], "TOK,
+        # TOK2" -> ["TOK", "TOK2"] etc.
+        token_abstraction_list = "".join(args.token_abstraction.split()).split(",")
+        logger.info(f"list of token identifiers: {token_abstraction_list}")
+
+        token_abstraction_dict = {}
+        token_idx = 0
+        for i, token in enumerate(token_abstraction_list):
+            token_abstraction_dict[token] = [
+                f"<s{token_idx + i + j}>" for j in range(args.num_new_tokens_per_abstraction)
+            ]
+            token_idx += args.num_new_tokens_per_abstraction - 1
+
+        # replace instances of --token_abstraction in --instance_prompt with the new tokens: "<si><si+1>" etc.
+        for token_abs, token_replacement in token_abstraction_dict.items():
+            args.instance_prompt = args.instance_prompt.replace(token_abs, "".join(token_replacement))
+            if args.with_prior_preservation:
+                args.class_prompt = args.class_prompt.replace(token_abs, "".join(token_replacement))
+            if args.validation_prompt:
+                args.validation_prompt = args.validation_prompt.replace(token_abs, "".join(token_replacement))
+                print("validation prompt:", args.validation_prompt)
+        # initialize the new tokens for textual inversion
+        embedding_handler = TokenEmbeddingsHandler(
+            [text_encoder_one, text_encoder_two], [tokenizer_one, tokenizer_two]
+        )
+        inserting_toks = []
+        for new_tok in token_abstraction_dict.values():
+            inserting_toks.extend(new_tok)
+        embedding_handler.initialize_new_tokens(inserting_toks=inserting_toks)
+
+    # We only train the additional adapter LoRA layers
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+
+    # The VAE is always in float32 to avoid NaN losses.
+    vae.to(accelerator.device, dtype=torch.float32)
+
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, "
+                    "please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+            text_encoder_two.gradient_checkpointing_enable()
+
+    # now we will add new LoRA weights to the attention layers
+
+    if args.use_blora:
+        # if using B-LoRA, the targeted blocks to train are automatically set
+        target_modules = get_unet_lora_target_modules(unet, use_blora=True)
+    elif args.lora_unet_blocks:
+        # if training specific unet blocks not in the B-LoRA scheme
+        target_blocks_list = "".join(args.lora_unet_blocks.split()).split(",")
+        logger.info(f"list of unet blocks to train: {target_blocks_list}")
+        target_modules = get_unet_lora_target_modules(unet, use_blora=False, target_blocks=target_blocks_list)
+    else:
+        target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
+
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        use_dora=args.use_dora,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=target_modules,
+    )
+    unet.add_adapter(unet_lora_config)
+
+    # The text encoder comes from 🤗 transformers, so we cannot directly modify it.
+    # So, instead, we monkey-patch the forward calls of its attention-blocks.
+    if args.train_text_encoder:
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            use_dora=args.use_dora,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder_one.add_adapter(text_lora_config)
+        text_encoder_two.add_adapter(text_lora_config)
+
+    # if we use textual inversion, we freeze all parameters except for the token embeddings
+    # in text encoder
+    elif args.train_text_encoder_ti:
+        text_lora_parameters_one = []
+        for name, param in text_encoder_one.named_parameters():
+            if "token_embedding" in name:
+                # ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16
+                param.data = param.to(dtype=torch.float32)
+                param.requires_grad = True
+                text_lora_parameters_one.append(param)
+            else:
+                param.requires_grad = False
+        text_lora_parameters_two = []
+        for name, param in text_encoder_two.named_parameters():
+            if "token_embedding" in name:
+                # ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16
+                param.data = param.to(dtype=torch.float32)
+                param.requires_grad = True
+                text_lora_parameters_two.append(param)
+            else:
+                param.requires_grad = False
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder atten layers
+            unet_lora_layers_to_save = None
+            text_encoder_one_lora_layers_to_save = None
+            text_encoder_two_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                    if args.train_text_encoder:
+                        text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers(
+                            get_peft_model_state_dict(model)
+                        )
+                elif isinstance(model, type(unwrap_model(text_encoder_two))):
+                    if args.train_text_encoder:
+                        text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers(
+                            get_peft_model_state_dict(model)
+                        )
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionXLPipeline.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=text_encoder_one_lora_layers_to_save,
+                text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save,
+            )
+        if args.train_text_encoder_ti:
+            embedding_handler.save_embeddings(f"{args.output_dir}/{Path(args.output_dir).name}_emb.safetensors")
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+        text_encoder_one_ = None
+        text_encoder_two_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(unet))):
+                unet_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                text_encoder_one_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_two))):
+                text_encoder_two_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
+
+        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        if args.train_text_encoder:
+            # Do we need to call `scale_lora_layers()` here?
+            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_)
+
+            _set_state_dict_into_text_encoder(
+                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_
+            )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [unet_]
+            if args.train_text_encoder:
+                models.extend([text_encoder_one_, text_encoder_two_])
+                # only upcast trainable parameters (LoRA) into fp32
+                cast_training_params(models)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [unet]
+        if args.train_text_encoder:
+            models.extend([text_encoder_one, text_encoder_two])
+
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(models, dtype=torch.float32)
+
+    unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters()))
+
+    if args.train_text_encoder:
+        text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
+        text_lora_parameters_two = list(filter(lambda p: p.requires_grad, text_encoder_two.parameters()))
+
+    # If neither --train_text_encoder nor --train_text_encoder_ti, text_encoders remain frozen during training
+    freeze_text_encoder = not (args.train_text_encoder or args.train_text_encoder_ti)
+
+    # Optimization parameters
+    unet_lora_parameters_with_lr = {"params": unet_lora_parameters, "lr": args.learning_rate}
+    if not freeze_text_encoder:
+        # different learning rate for text encoder and unet
+        text_lora_parameters_one_with_lr = {
+            "params": text_lora_parameters_one,
+            "weight_decay": args.adam_weight_decay_text_encoder
+            if args.adam_weight_decay_text_encoder
+            else args.adam_weight_decay,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        text_lora_parameters_two_with_lr = {
+            "params": text_lora_parameters_two,
+            "weight_decay": args.adam_weight_decay_text_encoder
+            if args.adam_weight_decay_text_encoder
+            else args.adam_weight_decay,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        params_to_optimize = [
+            unet_lora_parameters_with_lr,
+            text_lora_parameters_one_with_lr,
+            text_lora_parameters_two_with_lr,
+        ]
+    else:
+        params_to_optimize = [unet_lora_parameters_with_lr]
+
+    # Optimizer creation
+    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    if args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+        if args.train_text_encoder and args.text_encoder_lr:
+            logger.warning(
+                f"Learning rates were provided both for the unet and the text encoder- e.g. text_encoder_lr:"
+                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
+                f"When using prodigy only learning_rate is used as the initial learning rate."
+            )
+            # changes the learning rate of text_encoder_parameters_one and text_encoder_parameters_two to be
+            # --learning_rate
+            params_to_optimize[1]["lr"] = args.learning_rate
+            params_to_optimize[2]["lr"] = args.learning_rate
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_prompt=args.class_prompt,
+        train_text_encoder_ti=args.train_text_encoder_ti,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        token_abstraction_dict=token_abstraction_dict if args.train_text_encoder_ti else None,
+        class_num=args.num_class_images,
+        size=args.resolution,
+        repeats=args.repeats,
+        center_crop=args.center_crop,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Computes additional embeddings/ids required by the SDXL UNet.
+    # regular text embeddings (when `train_text_encoder` is not True)
+    # pooled text embeddings
+    # time ids
+
+    def compute_time_ids(crops_coords_top_left, original_size=None):
+        # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+        target_size = (args.resolution, args.resolution)
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+        add_time_ids = torch.tensor([add_time_ids])
+        add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+        return add_time_ids
+
+    if not args.train_text_encoder:
+        tokenizers = [tokenizer_one, tokenizer_two]
+        text_encoders = [text_encoder_one, text_encoder_two]
+
+        def compute_text_embeddings(prompt, text_encoders, tokenizers, clip_skip):
+            with torch.no_grad():
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(text_encoders, tokenizers, prompt, clip_skip)
+                prompt_embeds = prompt_embeds.to(accelerator.device)
+                pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
+            return prompt_embeds, pooled_prompt_embeds
+
+    # If no type of tuning is done on the text_encoder and custom instance prompts are NOT
+    # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
+    # the redundant encoding.
+    if freeze_text_encoder and not train_dataset.custom_instance_prompts:
+        instance_prompt_hidden_states, instance_pooled_prompt_embeds = compute_text_embeddings(
+            args.instance_prompt, text_encoders, tokenizers, args.clip_skip
+        )
+
+    # Handle class prompt for prior-preservation.
+    if args.with_prior_preservation:
+        if freeze_text_encoder:
+            class_prompt_hidden_states, class_pooled_prompt_embeds = compute_text_embeddings(
+                args.class_prompt, text_encoders, tokenizers
+            )
+
+    # Clear the memory here
+    if freeze_text_encoder and not train_dataset.custom_instance_prompts:
+        del tokenizers, text_encoders
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
+    # pack the statically computed variables appropriately here. This is so that we don't
+    # have to pass them to the dataloader.
+
+    # if --train_text_encoder_ti we need add_special_tokens to be True fo textual inversion
+    add_special_tokens = True if args.train_text_encoder_ti else False
+
+    if not train_dataset.custom_instance_prompts:
+        if freeze_text_encoder:
+            prompt_embeds = instance_prompt_hidden_states
+            unet_add_text_embeds = instance_pooled_prompt_embeds
+            if args.with_prior_preservation:
+                prompt_embeds = torch.cat([prompt_embeds, class_prompt_hidden_states], dim=0)
+                unet_add_text_embeds = torch.cat([unet_add_text_embeds, class_pooled_prompt_embeds], dim=0)
+        # if we're optimizing the text encoder (both if instance prompt is used for all images or custom prompts) we need to tokenize and encode the
+        # batch prompts on all training steps
+        else:
+            tokens_one = tokenize_prompt(tokenizer_one, args.instance_prompt, add_special_tokens)
+            tokens_two = tokenize_prompt(tokenizer_two, args.instance_prompt, add_special_tokens)
+            if args.with_prior_preservation:
+                class_tokens_one = tokenize_prompt(tokenizer_one, args.class_prompt, add_special_tokens)
+                class_tokens_two = tokenize_prompt(tokenizer_two, args.class_prompt, add_special_tokens)
+                tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0)
+                tokens_two = torch.cat([tokens_two, class_tokens_two], dim=0)
+
+    if args.cache_latents:
+        latents_cache = []
+        for batch in tqdm(train_dataloader, desc="Caching latents"):
+            with torch.no_grad():
+                batch["pixel_values"] = batch["pixel_values"].to(
+                    accelerator.device, non_blocking=True, dtype=torch.float32
+                )
+                latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
+
+        if args.validation_prompt is None:
+            del vae
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Scheduler and math around the number of training steps.
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
+    if args.max_train_steps is None:
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * num_update_steps_per_epoch * accelerator.num_processes
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if not freeze_text_encoder:
+        unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        if num_training_steps_for_scheduler != args.max_train_steps * accelerator.num_processes:
+            logger.warning(
+                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+                f"This inconsistency may result in the learning rate scheduler not functioning properly."
+            )
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = (
+            "dreambooth-lora-sd-xl"
+            if "playground" not in args.pretrained_model_name_or_path
+            else "dreambooth-lora-playground"
+        )
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    if args.train_text_encoder:
+        num_train_epochs_text_encoder = int(args.train_text_encoder_frac * args.num_train_epochs)
+    elif args.train_text_encoder_ti:  # args.train_text_encoder_ti
+        num_train_epochs_text_encoder = int(args.train_text_encoder_ti_frac * args.num_train_epochs)
+    # flag used for textual inversion
+    pivoted = False
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        # if performing any kind of optimization of text_encoder params
+        if args.train_text_encoder or args.train_text_encoder_ti:
+            if epoch == num_train_epochs_text_encoder:
+                print("PIVOT HALFWAY", epoch)
+                # stopping optimization of text_encoder params
+                # this flag is used to reset the optimizer to optimize only on unet params
+                pivoted = True
+
+            else:
+                # still optimizing the text encoder
+                text_encoder_one.train()
+                text_encoder_two.train()
+                # set top parameter requires_grad = True for gradient checkpointing works
+                if args.train_text_encoder:
+                    accelerator.unwrap_model(text_encoder_one).text_model.embeddings.requires_grad_(True)
+                    accelerator.unwrap_model(text_encoder_two).text_model.embeddings.requires_grad_(True)
+
+        for step, batch in enumerate(train_dataloader):
+            if pivoted:
+                # stopping optimization of text_encoder params
+                # re setting the optimizer to optimize only on unet params
+                optimizer.param_groups[1]["lr"] = 0.0
+                optimizer.param_groups[2]["lr"] = 0.0
+
+            with accelerator.accumulate(unet):
+                prompts = batch["prompts"]
+                # encode batch prompts when custom prompts are provided for each image -
+                if train_dataset.custom_instance_prompts:
+                    if freeze_text_encoder:
+                        prompt_embeds, unet_add_text_embeds = compute_text_embeddings(
+                            prompts, text_encoders, tokenizers, args.clip_skip
+                        )
+
+                    else:
+                        tokens_one = tokenize_prompt(tokenizer_one, prompts, add_special_tokens)
+                        tokens_two = tokenize_prompt(tokenizer_two, prompts, add_special_tokens)
+
+                if args.cache_latents:
+                    model_input = latents_cache[step].sample()
+                else:
+                    pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                    model_input = vae.encode(pixel_values).latent_dist.sample()
+
+                if latents_mean is None and latents_std is None:
+                    model_input = model_input * vae.config.scaling_factor
+                    if args.pretrained_vae_model_name_or_path is None:
+                        model_input = model_input.to(weight_dtype)
+                else:
+                    latents_mean = latents_mean.to(device=model_input.device, dtype=model_input.dtype)
+                    latents_std = latents_std.to(device=model_input.device, dtype=model_input.dtype)
+                    model_input = (model_input - latents_mean) * vae.config.scaling_factor / latents_std
+                    model_input = model_input.to(dtype=weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (model_input.shape[0], model_input.shape[1], 1, 1), device=model_input.device
+                    )
+
+                bsz = model_input.shape[0]
+
+                # Sample a random timestep for each image
+                if not args.do_edm_style_training:
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                    )
+                    timesteps = timesteps.long()
+                else:
+                    # in EDM formulation, the model is conditioned on the pre-conditioned noise levels
+                    # instead of discrete timesteps, so here we sample indices to get the noise levels
+                    # from `scheduler.timesteps`
+                    indices = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,))
+                    timesteps = noise_scheduler.timesteps[indices].to(device=model_input.device)
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+                # For EDM-style training, we first obtain the sigmas based on the continuous timesteps.
+                # We then precondition the final model inputs based on these sigmas instead of the timesteps.
+                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                if args.do_edm_style_training:
+                    sigmas = get_sigmas(timesteps, len(noisy_model_input.shape), noisy_model_input.dtype)
+                    if "EDM" in scheduler_type:
+                        inp_noisy_latents = noise_scheduler.precondition_inputs(noisy_model_input, sigmas)
+                    else:
+                        inp_noisy_latents = noisy_model_input / ((sigmas**2 + 1) ** 0.5)
+
+                # time ids
+                add_time_ids = torch.cat(
+                    [
+                        compute_time_ids(original_size=s, crops_coords_top_left=c)
+                        for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])
+                    ]
+                )
+
+                # Calculate the elements to repeat depending on the use of prior-preservation and custom captions.
+                if not train_dataset.custom_instance_prompts:
+                    elems_to_repeat_text_embeds = bsz // 2 if args.with_prior_preservation else bsz
+
+                else:
+                    elems_to_repeat_text_embeds = 1
+
+                # Predict the noise residual
+                if freeze_text_encoder:
+                    unet_added_conditions = {
+                        "time_ids": add_time_ids,
+                        "text_embeds": unet_add_text_embeds.repeat(elems_to_repeat_text_embeds, 1),
+                    }
+                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
+                    model_pred = unet(
+                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        timesteps,
+                        prompt_embeds_input,
+                        added_cond_kwargs=unet_added_conditions,
+                        return_dict=False,
+                    )[0]
+                else:
+                    unet_added_conditions = {"time_ids": add_time_ids}
+                    prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                        text_encoders=[text_encoder_one, text_encoder_two],
+                        tokenizers=None,
+                        prompt=None,
+                        text_input_ids_list=[tokens_one, tokens_two],
+                        clip_skip=args.clip_skip,
+                    )
+                    unet_added_conditions.update(
+                        {"text_embeds": pooled_prompt_embeds.repeat(elems_to_repeat_text_embeds, 1)}
+                    )
+                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
+                    model_pred = unet(
+                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        timesteps,
+                        prompt_embeds_input,
+                        added_cond_kwargs=unet_added_conditions,
+                        return_dict=False,
+                    )[0]
+
+                weighting = None
+                if args.do_edm_style_training:
+                    # Similar to the input preconditioning, the model predictions are also preconditioned
+                    # on noised model inputs (before preconditioning) and the sigmas.
+                    # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                    if "EDM" in scheduler_type:
+                        model_pred = noise_scheduler.precondition_outputs(noisy_model_input, model_pred, sigmas)
+                    else:
+                        if noise_scheduler.config.prediction_type == "epsilon":
+                            model_pred = model_pred * (-sigmas) + noisy_model_input
+                        elif noise_scheduler.config.prediction_type == "v_prediction":
+                            model_pred = model_pred * (-sigmas / (sigmas**2 + 1) ** 0.5) + (
+                                noisy_model_input / (sigmas**2 + 1)
+                            )
+                    # We are not doing weighting here because it tends result in numerical problems.
+                    # See: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
+                    # There might be other alternatives for weighting as well:
+                    # https://github.com/huggingface/diffusers/pull/7126#discussion_r1505404686
+                    if "EDM" not in scheduler_type:
+                        weighting = (sigmas**-2.0).float()
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = model_input if args.do_edm_style_training else noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = (
+                        model_input
+                        if args.do_edm_style_training
+                        else noise_scheduler.get_velocity(model_input, noise, timesteps)
+                    )
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute prior loss
+                    if weighting is not None:
+                        prior_loss = torch.mean(
+                            (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
+                                target_prior.shape[0], -1
+                            ),
+                            1,
+                        )
+                        prior_loss = prior_loss.mean()
+                    else:
+                        prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                if args.snr_gamma is None:
+                    if weighting is not None:
+                        loss = torch.mean(
+                            (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(
+                                target.shape[0], -1
+                            ),
+                            1,
+                        )
+                        loss = loss.mean()
+                    else:
+                        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+
+                    if args.with_prior_preservation:
+                        # if we're using prior preservation, we calc snr for instance loss only -
+                        # and hence only need timesteps corresponding to instance images
+                        snr_timesteps, _ = torch.chunk(timesteps, 2, dim=0)
+                    else:
+                        snr_timesteps = timesteps
+
+                    snr = compute_snr(noise_scheduler, snr_timesteps)
+                    base_weight = (
+                        torch.stack([snr, args.snr_gamma * torch.ones_like(snr_timesteps)], dim=1).min(dim=1)[0] / snr
+                    )
+
+                    if noise_scheduler.config.prediction_type == "v_prediction":
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(unet_lora_parameters, text_lora_parameters_one, text_lora_parameters_two)
+                        if (args.train_text_encoder or args.train_text_encoder_ti)
+                        else unet_lora_parameters
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+                # every step, we reset the embeddings to the original embeddings.
+                if args.train_text_encoder_ti:
+                    embedding_handler.retract_embeddings()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                if freeze_text_encoder:
+                    text_encoder_one = text_encoder_cls_one.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        subfolder="text_encoder",
+                        revision=args.revision,
+                        variant=args.variant,
+                    )
+                    text_encoder_two = text_encoder_cls_two.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        subfolder="text_encoder_2",
+                        revision=args.revision,
+                        variant=args.variant,
+                    )
+                pipeline = StableDiffusionXLPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    tokenizer=tokenizer_one,
+                    tokenizer_2=tokenizer_two,
+                    text_encoder=accelerator.unwrap_model(text_encoder_one),
+                    text_encoder_2=accelerator.unwrap_model(text_encoder_two),
+                    unet=accelerator.unwrap_model(unet),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline_args = {"prompt": args.validation_prompt}
+
+                images = log_validation(
+                    pipeline,
+                    args,
+                    accelerator,
+                    pipeline_args,
+                    epoch,
+                )
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        unet = unet.to(torch.float32)
+        unet_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            text_encoder_lora_layers = convert_state_dict_to_diffusers(
+                get_peft_model_state_dict(text_encoder_one.to(torch.float32))
+            )
+            text_encoder_two = unwrap_model(text_encoder_two)
+            text_encoder_2_lora_layers = convert_state_dict_to_diffusers(
+                get_peft_model_state_dict(text_encoder_two.to(torch.float32))
+            )
+        else:
+            text_encoder_lora_layers = None
+            text_encoder_2_lora_layers = None
+
+        StableDiffusionXLPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_layers,
+            text_encoder_lora_layers=text_encoder_lora_layers,
+            text_encoder_2_lora_layers=text_encoder_2_lora_layers,
+        )
+
+        if args.train_text_encoder_ti:
+            embeddings_path = f"{args.output_dir}/{args.output_dir}_emb.safetensors"
+            embedding_handler.save_embeddings(embeddings_path)
+
+        # Final inference
+        # Load previous pipeline
+        vae = AutoencoderKL.from_pretrained(
+            vae_path,
+            subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+
+        # load attention processors
+        pipeline.load_lora_weights(args.output_dir)
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline_args = {"prompt": args.validation_prompt, "num_inference_steps": 25}
+            images = log_validation(
+                pipeline,
+                args,
+                accelerator,
+                pipeline_args,
+                epoch,
+                is_final_validation=True,
+            )
+
+        # Convert to WebUI format
+        lora_state_dict = load_file(f"{args.output_dir}/pytorch_lora_weights.safetensors")
+        peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict)
+        kohya_state_dict = convert_state_dict_to_kohya(peft_state_dict)
+        save_file(kohya_state_dict, f"{args.output_dir}/{Path(args.output_dir).name}.safetensors")
+
+        save_model_card(
+            model_id if not args.push_to_hub else repo_id,
+            use_dora=args.use_dora,
+            images=images,
+            base_model=args.pretrained_model_name_or_path,
+            train_text_encoder=args.train_text_encoder,
+            train_text_encoder_ti=args.train_text_encoder_ti,
+            token_abstraction_dict=train_dataset.token_abstraction_dict,
+            instance_prompt=args.instance_prompt,
+            validation_prompt=args.validation_prompt,
+            repo_folder=args.output_dir,
+            vae_path=args.pretrained_vae_model_name_or_path,
+        )
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/amused/README.md b/diffusers/examples/amused/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1230bd8667578de596acdca110b534344e477210
--- /dev/null
+++ b/diffusers/examples/amused/README.md
@@ -0,0 +1,326 @@
+## Amused training
+
+Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8bit optimizers, lora, and gradient accumulation, amused can be finetuned with as little as 5.5 GB. Here are a set of examples for finetuning amused on some relatively simple datasets. These training recipies are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease learning rates.
+
+All training examples use fp16 mixed precision and gradient checkpointing. We don't show 8 bit adam + lora as its about the same memory use as just using lora (bitsandbytes uses full precision optimizer states for weights below a minimum size).
+
+### Finetuning the 256 checkpoint
+
+These examples finetune on this [nouns](https://huggingface.co/datasets/m1guelpf/nouns) dataset.
+
+Example results:
+
+![noun1](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/noun1.png) ![noun2](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/noun2.png) ![noun3](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/noun3.png)
+
+
+#### Full finetuning
+
+Batch size: 8, Learning rate: 1e-4, Gives decent results in 750-1000 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    8        |          1                   |     8             |      19.7 GB       |
+|    4        |          2                   |     8             |      18.3 GB       |
+|    1        |          8                   |     8             |      17.9 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 1e-4 \
+    --pretrained_model_name_or_path amused/amused-256 \
+    --instance_data_dataset  'm1guelpf/nouns' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 256 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'a pixel art character with square red glasses, a baseball-shaped head and a orange-colored body on a dark background' \
+        'a pixel art character with square orange glasses, a lips-shaped head and a red-colored body on a light background' \
+        'a pixel art character with square blue glasses, a microwave-shaped head and a purple-colored body on a sunny background' \
+        'a pixel art character with square red glasses, a baseball-shaped head and a blue-colored body on an orange background' \
+        'a pixel art character with square red glasses' \
+        'a pixel art character' \
+        'square red glasses on a pixel art character' \
+        'square red glasses on a pixel art character with a baseball-shaped head' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+#### Full finetuning + 8 bit adam
+
+Note that this training config keeps the batch size low and the learning rate high to get results fast with low resources. However, due to 8 bit adam, it will diverge eventually. If you want to train for longer, you will have to up the batch size and lower the learning rate.
+
+Batch size: 16, Learning rate: 2e-5, Gives decent results in ~750 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    16        |          1                   |     16             |      20.1 GB       |
+|    8        |          2                   |      16           |      15.6 GB       |
+|    1        |          16                   |     16            |      10.7 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 2e-5 \
+    --use_8bit_adam \
+    --pretrained_model_name_or_path amused/amused-256 \
+    --instance_data_dataset  'm1guelpf/nouns' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 256 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'a pixel art character with square red glasses, a baseball-shaped head and a orange-colored body on a dark background' \
+        'a pixel art character with square orange glasses, a lips-shaped head and a red-colored body on a light background' \
+        'a pixel art character with square blue glasses, a microwave-shaped head and a purple-colored body on a sunny background' \
+        'a pixel art character with square red glasses, a baseball-shaped head and a blue-colored body on an orange background' \
+        'a pixel art character with square red glasses' \
+        'a pixel art character' \
+        'square red glasses on a pixel art character' \
+        'square red glasses on a pixel art character with a baseball-shaped head' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+#### Full finetuning + lora
+
+Batch size: 16, Learning rate: 8e-4, Gives decent results in 1000-1250 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    16        |          1                   |     16             |      14.1 GB       |
+|    8        |          2                   |      16           |      10.1 GB       |
+|    1        |          16                   |     16            |      6.5 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 8e-4 \
+    --use_lora \
+    --pretrained_model_name_or_path amused/amused-256 \
+    --instance_data_dataset  'm1guelpf/nouns' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 256 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'a pixel art character with square red glasses, a baseball-shaped head and a orange-colored body on a dark background' \
+        'a pixel art character with square orange glasses, a lips-shaped head and a red-colored body on a light background' \
+        'a pixel art character with square blue glasses, a microwave-shaped head and a purple-colored body on a sunny background' \
+        'a pixel art character with square red glasses, a baseball-shaped head and a blue-colored body on an orange background' \
+        'a pixel art character with square red glasses' \
+        'a pixel art character' \
+        'square red glasses on a pixel art character' \
+        'square red glasses on a pixel art character with a baseball-shaped head' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+### Finetuning the 512 checkpoint
+
+These examples finetune on this [minecraft](https://huggingface.co/monadical-labs/minecraft-preview) dataset.
+
+Example results:
+
+![minecraft1](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/minecraft1.png) ![minecraft2](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/minecraft2.png) ![minecraft3](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/minecraft3.png)
+
+#### Full finetuning
+
+Batch size: 8, Learning rate: 8e-5, Gives decent results in 500-1000 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    8        |          1                   |     8             |      24.2 GB       |
+|    4        |          2                   |     8             |      19.7 GB       |
+|    1        |          8                   |     8             |      16.99 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 8e-5 \
+    --pretrained_model_name_or_path amused/amused-512 \
+    --instance_data_dataset  'monadical-labs/minecraft-preview' \
+    --prompt_prefix 'minecraft ' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 512 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'minecraft Avatar' \
+        'minecraft character' \
+        'minecraft' \
+        'minecraft president' \
+        'minecraft pig' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+#### Full finetuning + 8 bit adam
+
+Batch size: 8, Learning rate: 5e-6, Gives decent results in 500-1000 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    8        |          1                   |     8             |      21.2 GB       |
+|    4        |          2                   |     8             |      13.3 GB       |
+|    1        |          8                   |     8             |      9.9 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 5e-6 \
+    --pretrained_model_name_or_path amused/amused-512 \
+    --instance_data_dataset  'monadical-labs/minecraft-preview' \
+    --prompt_prefix 'minecraft ' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 512 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'minecraft Avatar' \
+        'minecraft character' \
+        'minecraft' \
+        'minecraft president' \
+        'minecraft pig' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+#### Full finetuning + lora
+
+Batch size: 8, Learning rate: 1e-4, Gives decent results in 500-1000 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    8        |          1                   |     8             |      12.7 GB       |
+|    4        |          2                   |     8             |      9.0 GB       |
+|    1        |          8                   |     8             |      5.6 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 1e-4 \
+    --use_lora \
+    --pretrained_model_name_or_path amused/amused-512 \
+    --instance_data_dataset  'monadical-labs/minecraft-preview' \
+    --prompt_prefix 'minecraft ' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 512 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'minecraft Avatar' \
+        'minecraft character' \
+        'minecraft' \
+        'minecraft president' \
+        'minecraft pig' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+### Styledrop
+
+[Styledrop](https://arxiv.org/abs/2306.00983) is an efficient finetuning method for learning a new style from just one or very few images. It has an optional first stage to generate human picked additional training samples. The additional training samples can be used to augment the initial images. Our examples exclude the optional additional image selection stage and instead we just finetune on a single image.
+
+This is our example style image:
+![example](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/A%20mushroom%20in%20%5BV%5D%20style.png)
+
+Download it to your local directory with
+```sh
+wget https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/A%20mushroom%20in%20%5BV%5D%20style.png
+```
+
+#### 256
+
+Example results:
+
+![glowing_256_1](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_256_1.png) ![glowing_256_2](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_256_2.png) ![glowing_256_3](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_256_3.png)
+
+Learning rate: 4e-4, Gives decent results in 1500-2000 steps
+
+Memory used: 6.5 GB
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --mixed_precision fp16 \
+    --report_to wandb \
+    --use_lora \
+    --pretrained_model_name_or_path amused/amused-256 \
+    --train_batch_size 1 \
+    --lr_scheduler constant \
+    --learning_rate 4e-4 \
+    --validation_prompts \
+        'A chihuahua walking on the street in [V] style' \
+        'A banana on the table in [V] style' \
+        'A church on the street in [V] style' \
+        'A tabby cat walking in the forest in [V] style' \
+    --instance_data_image 'A mushroom in [V] style.png' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 100 \
+    --resolution 256
+```
+
+#### 512
+
+Example results:
+
+![glowing_512_1](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_512_1.png) ![glowing_512_2](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_512_2.png) ![glowing_512_3](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_512_3.png)
+
+Learning rate: 1e-3, Lora alpha 1, Gives decent results in 1500-2000 steps
+
+Memory used: 5.6 GB
+
+```
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --mixed_precision fp16 \
+    --report_to wandb \
+    --use_lora \
+    --pretrained_model_name_or_path amused/amused-512 \
+    --train_batch_size 1 \
+    --lr_scheduler constant \
+    --learning_rate 1e-3 \
+    --validation_prompts \
+        'A chihuahua walking on the street in [V] style' \
+        'A banana on the table in [V] style' \
+        'A church on the street in [V] style' \
+        'A tabby cat walking in the forest in [V] style' \
+    --instance_data_image 'A mushroom in [V] style.png' \
+    --max_train_steps 100000 \
+    --checkpointing_steps 500 \
+    --validation_steps 100 \
+    --resolution 512 \
+    --lora_alpha 1
+```
\ No newline at end of file
diff --git a/diffusers/examples/amused/train_amused.py b/diffusers/examples/amused/train_amused.py
new file mode 100644
index 0000000000000000000000000000000000000000..ede51775dd8feb14f5670fb9e1e6b7e9cedc2bae
--- /dev/null
+++ b/diffusers/examples/amused/train_amused.py
@@ -0,0 +1,975 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import copy
+import logging
+import math
+import os
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import torch
+import torch.nn.functional as F
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import DataLoader, Dataset, default_collate
+from torchvision import transforms
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+)
+
+import diffusers.optimization
+from diffusers import AmusedPipeline, AmusedScheduler, EMAModel, UVit2DModel, VQModel
+from diffusers.loaders import AmusedLoraLoaderMixin
+from diffusers.utils import is_wandb_available
+
+
+if is_wandb_available():
+    import wandb
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--instance_data_dataset",
+        type=str,
+        default=None,
+        required=False,
+        help="A Hugging Face dataset containing the training images",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--instance_data_image", type=str, default=None, required=False, help="A single training image"
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument("--ema_decay", type=float, default=0.9999)
+    parser.add_argument("--ema_update_after_step", type=int, default=0)
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="muse_training",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
+            "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
+            "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
+            "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
+            "instructions."
+        ),
+    )
+    parser.add_argument(
+        "--logging_steps",
+        type=int,
+        default=50,
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more details"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=0.0003,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="wandb",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--validation_prompts", type=str, nargs="*")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument("--split_vae_encode", type=int, required=False, default=None)
+    parser.add_argument("--min_masking_rate", type=float, default=0.0)
+    parser.add_argument("--cond_dropout_prob", type=float, default=0.0)
+    parser.add_argument("--max_grad_norm", default=None, type=float, help="Max gradient norm.", required=False)
+    parser.add_argument("--use_lora", action="store_true", help="Fine tune the model using LoRa")
+    parser.add_argument("--text_encoder_use_lora", action="store_true", help="Fine tune the model using LoRa")
+    parser.add_argument("--lora_r", default=16, type=int)
+    parser.add_argument("--lora_alpha", default=32, type=int)
+    parser.add_argument("--lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+")
+    parser.add_argument("--text_encoder_lora_r", default=16, type=int)
+    parser.add_argument("--text_encoder_lora_alpha", default=32, type=int)
+    parser.add_argument("--text_encoder_lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+")
+    parser.add_argument("--train_text_encoder", action="store_true")
+    parser.add_argument("--image_key", type=str, required=False)
+    parser.add_argument("--prompt_key", type=str, required=False)
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument("--prompt_prefix", type=str, required=False, default=None)
+
+    args = parser.parse_args()
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    num_datasources = sum(
+        [x is not None for x in [args.instance_data_dir, args.instance_data_image, args.instance_data_dataset]]
+    )
+
+    if num_datasources != 1:
+        raise ValueError(
+            "provide one and only one of `--instance_data_dir`, `--instance_data_image`, or `--instance_data_dataset`"
+        )
+
+    if args.instance_data_dir is not None:
+        if not os.path.exists(args.instance_data_dir):
+            raise ValueError(f"Does not exist: `--args.instance_data_dir` {args.instance_data_dir}")
+
+    if args.instance_data_image is not None:
+        if not os.path.exists(args.instance_data_image):
+            raise ValueError(f"Does not exist: `--args.instance_data_image` {args.instance_data_image}")
+
+    if args.instance_data_dataset is not None and (args.image_key is None or args.prompt_key is None):
+        raise ValueError("`--instance_data_dataset` requires setting `--image_key` and `--prompt_key`")
+
+    return args
+
+
+class InstanceDataRootDataset(Dataset):
+    def __init__(
+        self,
+        instance_data_root,
+        tokenizer,
+        size=512,
+    ):
+        self.size = size
+        self.tokenizer = tokenizer
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+
+    def __len__(self):
+        return len(self.instance_images_path)
+
+    def __getitem__(self, index):
+        image_path = self.instance_images_path[index % len(self.instance_images_path)]
+        instance_image = Image.open(image_path)
+        rv = process_image(instance_image, self.size)
+
+        prompt = os.path.splitext(os.path.basename(image_path))[0]
+        rv["prompt_input_ids"] = tokenize_prompt(self.tokenizer, prompt)[0]
+        return rv
+
+
+class InstanceDataImageDataset(Dataset):
+    def __init__(
+        self,
+        instance_data_image,
+        train_batch_size,
+        size=512,
+    ):
+        self.value = process_image(Image.open(instance_data_image), size)
+        self.train_batch_size = train_batch_size
+
+    def __len__(self):
+        # Needed so a full batch of the data can be returned. Otherwise will return
+        # batches of size 1
+        return self.train_batch_size
+
+    def __getitem__(self, index):
+        return self.value
+
+
+class HuggingFaceDataset(Dataset):
+    def __init__(
+        self,
+        hf_dataset,
+        tokenizer,
+        image_key,
+        prompt_key,
+        prompt_prefix=None,
+        size=512,
+    ):
+        self.size = size
+        self.image_key = image_key
+        self.prompt_key = prompt_key
+        self.tokenizer = tokenizer
+        self.hf_dataset = hf_dataset
+        self.prompt_prefix = prompt_prefix
+
+    def __len__(self):
+        return len(self.hf_dataset)
+
+    def __getitem__(self, index):
+        item = self.hf_dataset[index]
+
+        rv = process_image(item[self.image_key], self.size)
+
+        prompt = item[self.prompt_key]
+
+        if self.prompt_prefix is not None:
+            prompt = self.prompt_prefix + prompt
+
+        rv["prompt_input_ids"] = tokenize_prompt(self.tokenizer, prompt)[0]
+
+        return rv
+
+
+def process_image(image, size):
+    image = exif_transpose(image)
+
+    if not image.mode == "RGB":
+        image = image.convert("RGB")
+
+    orig_height = image.height
+    orig_width = image.width
+
+    image = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)(image)
+
+    c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(size, size))
+    image = transforms.functional.crop(image, c_top, c_left, size, size)
+
+    image = transforms.ToTensor()(image)
+
+    micro_conds = torch.tensor(
+        [orig_width, orig_height, c_top, c_left, 6.0],
+    )
+
+    return {"image": image, "micro_conds": micro_conds}
+
+
+def tokenize_prompt(tokenizer, prompt):
+    return tokenizer(
+        prompt,
+        truncation=True,
+        padding="max_length",
+        max_length=77,
+        return_tensors="pt",
+    ).input_ids
+
+
+def encode_prompt(text_encoder, input_ids):
+    outputs = text_encoder(input_ids, return_dict=True, output_hidden_states=True)
+    encoder_hidden_states = outputs.hidden_states[-2]
+    cond_embeds = outputs[0]
+    return encoder_hidden_states, cond_embeds
+
+
+def main(args):
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if accelerator.is_main_process:
+        os.makedirs(args.output_dir, exist_ok=True)
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+
+    if accelerator.is_main_process:
+        accelerator.init_trackers("amused", config=vars(copy.deepcopy(args)))
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # TODO - will have to fix loading if training text encoder
+    text_encoder = CLIPTextModelWithProjection.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, variant=args.variant
+    )
+    vq_model = VQModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vqvae", revision=args.revision, variant=args.variant
+    )
+
+    if args.train_text_encoder:
+        if args.text_encoder_use_lora:
+            lora_config = LoraConfig(
+                r=args.text_encoder_lora_r,
+                lora_alpha=args.text_encoder_lora_alpha,
+                target_modules=args.text_encoder_lora_target_modules,
+            )
+            text_encoder.add_adapter(lora_config)
+        text_encoder.train()
+        text_encoder.requires_grad_(True)
+    else:
+        text_encoder.eval()
+        text_encoder.requires_grad_(False)
+
+    vq_model.requires_grad_(False)
+
+    model = UVit2DModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="transformer",
+        revision=args.revision,
+        variant=args.variant,
+    )
+
+    if args.use_lora:
+        lora_config = LoraConfig(
+            r=args.lora_r,
+            lora_alpha=args.lora_alpha,
+            target_modules=args.lora_target_modules,
+        )
+        model.add_adapter(lora_config)
+
+    model.train()
+
+    if args.gradient_checkpointing:
+        model.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
+
+    if args.use_ema:
+        ema = EMAModel(
+            model.parameters(),
+            decay=args.ema_decay,
+            update_after_step=args.ema_update_after_step,
+            model_cls=UVit2DModel,
+            model_config=model.config,
+        )
+
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            transformer_lora_layers_to_save = None
+            text_encoder_lora_layers_to_save = None
+
+            for model_ in models:
+                if isinstance(model_, type(accelerator.unwrap_model(model))):
+                    if args.use_lora:
+                        transformer_lora_layers_to_save = get_peft_model_state_dict(model_)
+                    else:
+                        model_.save_pretrained(os.path.join(output_dir, "transformer"))
+                elif isinstance(model_, type(accelerator.unwrap_model(text_encoder))):
+                    if args.text_encoder_use_lora:
+                        text_encoder_lora_layers_to_save = get_peft_model_state_dict(model_)
+                    else:
+                        model_.save_pretrained(os.path.join(output_dir, "text_encoder"))
+                else:
+                    raise ValueError(f"unexpected save model: {model_.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            if transformer_lora_layers_to_save is not None or text_encoder_lora_layers_to_save is not None:
+                AmusedLoraLoaderMixin.save_lora_weights(
+                    output_dir,
+                    transformer_lora_layers=transformer_lora_layers_to_save,
+                    text_encoder_lora_layers=text_encoder_lora_layers_to_save,
+                )
+
+            if args.use_ema:
+                ema.save_pretrained(os.path.join(output_dir, "ema_model"))
+
+    def load_model_hook(models, input_dir):
+        transformer = None
+        text_encoder_ = None
+
+        while len(models) > 0:
+            model_ = models.pop()
+
+            if isinstance(model_, type(accelerator.unwrap_model(model))):
+                if args.use_lora:
+                    transformer = model_
+                else:
+                    load_model = UVit2DModel.from_pretrained(os.path.join(input_dir, "transformer"))
+                    model_.load_state_dict(load_model.state_dict())
+                    del load_model
+            elif isinstance(model, type(accelerator.unwrap_model(text_encoder))):
+                if args.text_encoder_use_lora:
+                    text_encoder_ = model_
+                else:
+                    load_model = CLIPTextModelWithProjection.from_pretrained(os.path.join(input_dir, "text_encoder"))
+                    model_.load_state_dict(load_model.state_dict())
+                    del load_model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        if transformer is not None or text_encoder_ is not None:
+            lora_state_dict, network_alphas = AmusedLoraLoaderMixin.lora_state_dict(input_dir)
+            AmusedLoraLoaderMixin.load_lora_into_text_encoder(
+                lora_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_
+            )
+            AmusedLoraLoaderMixin.load_lora_into_transformer(
+                lora_state_dict, network_alphas=network_alphas, transformer=transformer
+            )
+
+        if args.use_ema:
+            load_from = EMAModel.from_pretrained(os.path.join(input_dir, "ema_model"), model_cls=UVit2DModel)
+            ema.load_state_dict(load_from.state_dict())
+            del load_from
+
+    accelerator.register_load_state_pre_hook(load_model_hook)
+    accelerator.register_save_state_pre_hook(save_model_hook)
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+        )
+
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    # no decay on bias and layernorm and embedding
+    no_decay = ["bias", "layer_norm.weight", "mlm_ln.weight", "embeddings.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.adam_weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+
+    if args.train_text_encoder:
+        optimizer_grouped_parameters.append(
+            {"params": text_encoder.parameters(), "weight_decay": args.adam_weight_decay}
+        )
+
+    optimizer = optimizer_cls(
+        optimizer_grouped_parameters,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    logger.info("Creating dataloaders and lr_scheduler")
+
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    if args.instance_data_dir is not None:
+        dataset = InstanceDataRootDataset(
+            instance_data_root=args.instance_data_dir,
+            tokenizer=tokenizer,
+            size=args.resolution,
+        )
+    elif args.instance_data_image is not None:
+        dataset = InstanceDataImageDataset(
+            instance_data_image=args.instance_data_image,
+            train_batch_size=args.train_batch_size,
+            size=args.resolution,
+        )
+    elif args.instance_data_dataset is not None:
+        dataset = HuggingFaceDataset(
+            hf_dataset=load_dataset(args.instance_data_dataset, split="train"),
+            tokenizer=tokenizer,
+            image_key=args.image_key,
+            prompt_key=args.prompt_key,
+            prompt_prefix=args.prompt_prefix,
+            size=args.resolution,
+        )
+    else:
+        assert False
+
+    train_dataloader = DataLoader(
+        dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        num_workers=args.dataloader_num_workers,
+        collate_fn=default_collate,
+    )
+    train_dataloader.num_batches = len(train_dataloader)
+
+    lr_scheduler = diffusers.optimization.get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+    )
+
+    logger.info("Preparing model, optimizer and dataloaders")
+
+    if args.train_text_encoder:
+        model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
+            model, optimizer, lr_scheduler, train_dataloader, text_encoder
+        )
+    else:
+        model, optimizer, lr_scheduler, train_dataloader = accelerator.prepare(
+            model, optimizer, lr_scheduler, train_dataloader
+        )
+
+    train_dataloader.num_batches = len(train_dataloader)
+
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    if not args.train_text_encoder:
+        text_encoder.to(device=accelerator.device, dtype=weight_dtype)
+
+    vq_model.to(device=accelerator.device)
+
+    if args.use_ema:
+        ema.to(accelerator.device)
+
+    with nullcontext() if args.train_text_encoder else torch.no_grad():
+        empty_embeds, empty_clip_embeds = encode_prompt(
+            text_encoder, tokenize_prompt(tokenizer, "").to(text_encoder.device, non_blocking=True)
+        )
+
+        # There is a single image, we can just pre-encode the single prompt
+        if args.instance_data_image is not None:
+            prompt = os.path.splitext(os.path.basename(args.instance_data_image))[0]
+            encoder_hidden_states, cond_embeds = encode_prompt(
+                text_encoder, tokenize_prompt(tokenizer, prompt).to(text_encoder.device, non_blocking=True)
+            )
+            encoder_hidden_states = encoder_hidden_states.repeat(args.train_batch_size, 1, 1)
+            cond_embeds = cond_embeds.repeat(args.train_batch_size, 1)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    # Afterwards we recalculate our number of training epochs.
+    # Note: We are not doing epoch based training here, but just using this for book keeping and being able to
+    # reuse the same training loop with other datasets/loaders.
+    num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info(f"  Num training steps = {args.max_train_steps}")
+    logger.info(f"  Instantaneous batch size per device = { args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+
+    resume_from_checkpoint = args.resume_from_checkpoint
+    if resume_from_checkpoint:
+        if resume_from_checkpoint == "latest":
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            if len(dirs) > 0:
+                resume_from_checkpoint = os.path.join(args.output_dir, dirs[-1])
+            else:
+                resume_from_checkpoint = None
+
+        if resume_from_checkpoint is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+        else:
+            accelerator.print(f"Resuming from checkpoint {resume_from_checkpoint}")
+
+    if resume_from_checkpoint is None:
+        global_step = 0
+        first_epoch = 0
+    else:
+        accelerator.load_state(resume_from_checkpoint)
+        global_step = int(os.path.basename(resume_from_checkpoint).split("-")[1])
+        first_epoch = global_step // num_update_steps_per_epoch
+
+    # As stated above, we are not doing epoch based training here, but just using this for book keeping and being able to
+    # reuse the same training loop with other datasets/loaders.
+    for epoch in range(first_epoch, num_train_epochs):
+        for batch in train_dataloader:
+            with torch.no_grad():
+                micro_conds = batch["micro_conds"].to(accelerator.device, non_blocking=True)
+                pixel_values = batch["image"].to(accelerator.device, non_blocking=True)
+
+                batch_size = pixel_values.shape[0]
+
+                split_batch_size = args.split_vae_encode if args.split_vae_encode is not None else batch_size
+                num_splits = math.ceil(batch_size / split_batch_size)
+                image_tokens = []
+                for i in range(num_splits):
+                    start_idx = i * split_batch_size
+                    end_idx = min((i + 1) * split_batch_size, batch_size)
+                    bs = pixel_values.shape[0]
+                    image_tokens.append(
+                        vq_model.quantize(vq_model.encode(pixel_values[start_idx:end_idx]).latents)[2][2].reshape(
+                            bs, -1
+                        )
+                    )
+                image_tokens = torch.cat(image_tokens, dim=0)
+
+                batch_size, seq_len = image_tokens.shape
+
+                timesteps = torch.rand(batch_size, device=image_tokens.device)
+                mask_prob = torch.cos(timesteps * math.pi * 0.5)
+                mask_prob = mask_prob.clip(args.min_masking_rate)
+
+                num_token_masked = (seq_len * mask_prob).round().clamp(min=1)
+                batch_randperm = torch.rand(batch_size, seq_len, device=image_tokens.device).argsort(dim=-1)
+                mask = batch_randperm < num_token_masked.unsqueeze(-1)
+
+                mask_id = accelerator.unwrap_model(model).config.vocab_size - 1
+                input_ids = torch.where(mask, mask_id, image_tokens)
+                labels = torch.where(mask, image_tokens, -100)
+
+                if args.cond_dropout_prob > 0.0:
+                    assert encoder_hidden_states is not None
+
+                    batch_size = encoder_hidden_states.shape[0]
+
+                    mask = (
+                        torch.zeros((batch_size, 1, 1), device=encoder_hidden_states.device).float().uniform_(0, 1)
+                        < args.cond_dropout_prob
+                    )
+
+                    empty_embeds_ = empty_embeds.expand(batch_size, -1, -1)
+                    encoder_hidden_states = torch.where(
+                        (encoder_hidden_states * mask).bool(), encoder_hidden_states, empty_embeds_
+                    )
+
+                    empty_clip_embeds_ = empty_clip_embeds.expand(batch_size, -1)
+                    cond_embeds = torch.where((cond_embeds * mask.squeeze(-1)).bool(), cond_embeds, empty_clip_embeds_)
+
+                bs = input_ids.shape[0]
+                vae_scale_factor = 2 ** (len(vq_model.config.block_out_channels) - 1)
+                resolution = args.resolution // vae_scale_factor
+                input_ids = input_ids.reshape(bs, resolution, resolution)
+
+            if "prompt_input_ids" in batch:
+                with nullcontext() if args.train_text_encoder else torch.no_grad():
+                    encoder_hidden_states, cond_embeds = encode_prompt(
+                        text_encoder, batch["prompt_input_ids"].to(accelerator.device, non_blocking=True)
+                    )
+
+            # Train Step
+            with accelerator.accumulate(model):
+                codebook_size = accelerator.unwrap_model(model).config.codebook_size
+
+                logits = (
+                    model(
+                        input_ids=input_ids,
+                        encoder_hidden_states=encoder_hidden_states,
+                        micro_conds=micro_conds,
+                        pooled_text_emb=cond_embeds,
+                    )
+                    .reshape(bs, codebook_size, -1)
+                    .permute(0, 2, 1)
+                    .reshape(-1, codebook_size)
+                )
+
+                loss = F.cross_entropy(
+                    logits,
+                    labels.view(-1),
+                    ignore_index=-100,
+                    reduction="mean",
+                )
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                avg_masking_rate = accelerator.gather(mask_prob.repeat(args.train_batch_size)).mean()
+
+                accelerator.backward(loss)
+
+                if args.max_grad_norm is not None and accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+
+                optimizer.zero_grad(set_to_none=True)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema.step(model.parameters())
+
+                if (global_step + 1) % args.logging_steps == 0:
+                    logs = {
+                        "step_loss": avg_loss.item(),
+                        "lr": lr_scheduler.get_last_lr()[0],
+                        "avg_masking_rate": avg_masking_rate.item(),
+                    }
+                    accelerator.log(logs, step=global_step + 1)
+
+                    logger.info(
+                        f"Step: {global_step + 1} "
+                        f"Loss: {avg_loss.item():0.4f} "
+                        f"LR: {lr_scheduler.get_last_lr()[0]:0.6f}"
+                    )
+
+                if (global_step + 1) % args.checkpointing_steps == 0:
+                    save_checkpoint(args, accelerator, global_step + 1)
+
+                if (global_step + 1) % args.validation_steps == 0 and accelerator.is_main_process:
+                    if args.use_ema:
+                        ema.store(model.parameters())
+                        ema.copy_to(model.parameters())
+
+                    with torch.no_grad():
+                        logger.info("Generating images...")
+
+                        model.eval()
+
+                        if args.train_text_encoder:
+                            text_encoder.eval()
+
+                        scheduler = AmusedScheduler.from_pretrained(
+                            args.pretrained_model_name_or_path,
+                            subfolder="scheduler",
+                            revision=args.revision,
+                            variant=args.variant,
+                        )
+
+                        pipe = AmusedPipeline(
+                            transformer=accelerator.unwrap_model(model),
+                            tokenizer=tokenizer,
+                            text_encoder=text_encoder,
+                            vqvae=vq_model,
+                            scheduler=scheduler,
+                        )
+
+                        pil_images = pipe(prompt=args.validation_prompts).images
+                        wandb_images = [
+                            wandb.Image(image, caption=args.validation_prompts[i])
+                            for i, image in enumerate(pil_images)
+                        ]
+
+                        wandb.log({"generated_images": wandb_images}, step=global_step + 1)
+
+                        model.train()
+
+                        if args.train_text_encoder:
+                            text_encoder.train()
+
+                    if args.use_ema:
+                        ema.restore(model.parameters())
+
+                global_step += 1
+
+            # Stop training if max steps is reached
+            if global_step >= args.max_train_steps:
+                break
+        # End for
+
+    accelerator.wait_for_everyone()
+
+    # Evaluate and save checkpoint at the end of training
+    save_checkpoint(args, accelerator, global_step)
+
+    # Save the final trained checkpoint
+    if accelerator.is_main_process:
+        model = accelerator.unwrap_model(model)
+        if args.use_ema:
+            ema.copy_to(model.parameters())
+        model.save_pretrained(args.output_dir)
+
+    accelerator.end_training()
+
+
+def save_checkpoint(args, accelerator, global_step):
+    output_dir = args.output_dir
+
+    # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+    if accelerator.is_main_process and args.checkpoints_total_limit is not None:
+        checkpoints = os.listdir(output_dir)
+        checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+        checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+        # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+        if len(checkpoints) >= args.checkpoints_total_limit:
+            num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+            removing_checkpoints = checkpoints[0:num_to_remove]
+
+            logger.info(
+                f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+            )
+            logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+            for removing_checkpoint in removing_checkpoints:
+                removing_checkpoint = os.path.join(output_dir, removing_checkpoint)
+                shutil.rmtree(removing_checkpoint)
+
+    save_path = Path(output_dir) / f"checkpoint-{global_step}"
+    accelerator.save_state(save_path)
+    logger.info(f"Saved state to {save_path}")
+
+
+if __name__ == "__main__":
+    main(parse_args())
diff --git a/diffusers/examples/cogvideo/README.md b/diffusers/examples/cogvideo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..398ae9543150493d8fc175d4dafa7eb8ba523cd5
--- /dev/null
+++ b/diffusers/examples/cogvideo/README.md
@@ -0,0 +1,228 @@
+# LoRA finetuning example for CogVideoX
+
+Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
+
+In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
+
+- Previous pretrained weights are kept frozen so that model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114).
+- Rank-decomposition matrices have significantly fewer parameters than original model, which means that trained LoRA weights are easily portable.
+- LoRA attention layers allow to control to which extent the model is adapted toward new training images via a `scale` parameter.
+
+At the moment, LoRA finetuning has only been tested for [CogVideoX-2b](https://huggingface.co/THUDM/CogVideoX-2b).
+
+## Data Preparation
+
+The training scripts accepts data in two formats.
+
+**First data format**
+
+Two files where one file contains line-separated prompts and another file contains line-separated paths to video data (the path to video files must be relative to the path you pass when specifying `--instance_data_root`). Let's take a look at an example to understand this better!
+
+Assume you've specified `--instance_data_root` as `/dataset`, and that this directory contains the files: `prompts.txt` and `videos.txt`.
+
+The `prompts.txt` file should contain line-separated prompts:
+
+```
+A black and white animated sequence featuring a rabbit, named Rabbity Ribfried, and an anthropomorphic goat in a musical, playful environment, showcasing their evolving interaction.
+A black and white animated sequence on a ship's deck features a bulldog character, named Bully Bulldoger, showcasing exaggerated facial expressions and body language. The character progresses from confident to focused, then to strained and distressed, displaying a range of emotions as it navigates challenges. The ship's interior remains static in the background, with minimalistic details such as a bell and open door. The character's dynamic movements and changing expressions drive the narrative, with no camera movement to distract from its evolving reactions and physical gestures.
+...
+```
+
+The `videos.txt` file should contain line-separate paths to video files. Note that the path should be _relative_ to the `--instance_data_root` directory.
+
+```
+videos/00000.mp4
+videos/00001.mp4
+...
+```
+
+Overall, this is how your dataset would look like if you ran the `tree` command on the dataset root directory:
+
+```
+/dataset
+├── prompts.txt
+├── videos.txt
+├── videos
+    ├── videos/00000.mp4
+    ├── videos/00001.mp4
+    ├── ...
+```
+
+When using this format, the `--caption_column` must be `prompts.txt` and `--video_column` must be `videos.txt`.
+
+**Second data format**
+
+You could use a single CSV file. For the sake of this example, assume you have a `metadata.csv` file. The expected format is:
+
+```
+<CAPTION_COLUMN>,<PATH_TO_VIDEO_COLUMN>
+"""A black and white animated sequence featuring a rabbit, named Rabbity Ribfried, and an anthropomorphic goat in a musical, playful environment, showcasing their evolving interaction.""","""00000.mp4"""
+"""A black and white animated sequence on a ship's deck features a bulldog character, named Bully Bulldoger, showcasing exaggerated facial expressions and body language. The character progresses from confident to focused, then to strained and distressed, displaying a range of emotions as it navigates challenges. The ship's interior remains static in the background, with minimalistic details such as a bell and open door. The character's dynamic movements and changing expressions drive the narrative, with no camera movement to distract from its evolving reactions and physical gestures.""","""00001.mp4"""
+...
+```
+
+In this case, the `--instance_data_root` should be the location where the videos are stored and `--dataset_name` should be either a path to local folder or `load_dataset` compatible hosted HF Dataset Repository or URL. Assuming you have videos of your Minecraft gameplay at `https://huggingface.co/datasets/my-awesome-username/minecraft-videos`, you would have to specify `my-awesome-username/minecraft-videos`.
+
+When using this format, the `--caption_column` must be `<CAPTION_COLUMN>` and `--video_column` must be `<PATH_TO_VIDEO_COLUMN>`.
+
+You are not strictly restricted to the CSV format. As long as the `load_dataset` method supports the file format to load a basic `<PATH_TO_VIDEO_COLUMN>` and `<CAPTION_COLUMN>`, you should be good to go. The reason for going through these dataset organization gymnastics for loading video data is because we found `load_dataset` from the datasets library to not fully support all kinds of video formats. This will undoubtedly be improved in the future.
+
+>![NOTE]
+> CogVideoX works best with long and descriptive LLM-augmented prompts for video generation. We recommend pre-processing your videos by first generating a summary using a VLM and then augmenting the prompts with an LLM. To generate the above captions, we use [MiniCPM-V-26](https://huggingface.co/openbmb/MiniCPM-V-2_6) and [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct). A very barebones and no-frills example for this is available [here](https://gist.github.com/a-r-r-o-w/4dee20250e82f4e44690a02351324a4a). The official recommendation for augmenting prompts is [ChatGLM](https://huggingface.co/THUDM?search_models=chatglm) and a length of 50-100 words is considered good.
+
+>![NOTE]
+> It is expected that your dataset is already pre-processed. If not, some basic pre-processing can be done by playing with the following parameters:
+> `--height`, `--width`, `--fps`, `--max_num_frames`, `--skip_frames_start` and `--skip_frames_end`.
+> Presently, all videos in your dataset should contain the same number of video frames when using a training batch size > 1.
+
+<!-- TODO: Implement frame packing in future to address above issue. -->
+
+## Training
+
+You need to setup your development environment by installing the necessary requirements. The following packages are required:
+- Torch 2.0 or above based on the training features you are utilizing (might require latest or nightly versions for quantized/deepspeed training)
+- `pip install diffusers transformers accelerate peft huggingface_hub` for all things modeling and training related
+- `pip install datasets decord` for loading video training data
+- `pip install bitsandbytes` for using 8-bit Adam or AdamW optimizers for memory-optimized training
+- `pip install wandb` optionally for monitoring training logs
+- `pip install deepspeed` optionally for [DeepSpeed](https://github.com/microsoft/DeepSpeed) training
+- `pip install prodigyopt` optionally if you would like to use the Prodigy optimizer for training
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+And initialize an [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell (e.g., a notebook)
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups. Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+If you would like to push your model to the HF Hub after training is completed with a neat model card, make sure you're logged in:
+
+```
+huggingface-cli login
+
+# Alternatively, you could upload your model manually using:
+# huggingface-cli upload my-cool-account-name/my-cool-lora-name /path/to/awesome/lora
+```
+
+Make sure your data is prepared as described in [Data Preparation](#data-preparation). When ready, you can begin training!
+
+Assuming you are training on 50 videos of a similar concept, we have found 1500-2000 steps to work well. The official recommendation, however, is 100 videos with a total of 4000 steps. Assuming you are training on a single GPU with a `--train_batch_size` of `1`:
+- 1500 steps on 50 videos would correspond to `30` training epochs
+- 4000 steps on 100 videos would correspond to `40` training epochs
+
+```bash
+#!/bin/bash
+
+GPU_IDS="0"
+
+accelerate launch --gpu_ids $GPU_IDS examples/cogvideo/train_cogvideox_lora.py \
+  --pretrained_model_name_or_path THUDM/CogVideoX-2b \
+  --cache_dir <CACHE_DIR> \
+  --instance_data_root <PATH_TO_WHERE_VIDEO_FILES_ARE_STORED> \
+  --dataset_name my-awesome-name/my-awesome-dataset \
+  --caption_column <CAPTION_COLUMN> \
+  --video_column <PATH_TO_VIDEO_COLUMN> \
+  --id_token <ID_TOKEN> \
+  --validation_prompt "<ID_TOKEN> Spiderman swinging over buildings:::A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance" \
+  --validation_prompt_separator ::: \
+  --num_validation_videos 1 \
+  --validation_epochs 10 \
+  --seed 42 \
+  --rank 64 \
+  --lora_alpha 64 \
+  --mixed_precision fp16 \
+  --output_dir /raid/aryan/cogvideox-lora \
+  --height 480 --width 720 --fps 8 --max_num_frames 49 --skip_frames_start 0 --skip_frames_end 0 \
+  --train_batch_size 1 \
+  --num_train_epochs 30 \
+  --checkpointing_steps 1000 \
+  --gradient_accumulation_steps 1 \
+  --learning_rate 1e-3 \
+  --lr_scheduler cosine_with_restarts \
+  --lr_warmup_steps 200 \
+  --lr_num_cycles 1 \
+  --enable_slicing \
+  --enable_tiling \
+  --optimizer Adam \
+  --adam_beta1 0.9 \
+  --adam_beta2 0.95 \
+  --max_grad_norm 1.0 \
+  --report_to wandb
+```
+
+To better track our training experiments, we're using the following flags in the command above:
+* `--report_to wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
+* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+Note that setting the `<ID_TOKEN>` is not necessary. From some limited experimentation, we found it to work better (as it resembles [Dreambooth](https://huggingface.co/docs/diffusers/en/training/dreambooth) like training) than without. When provided, the ID_TOKEN is appended to the beginning of each prompt. So, if your ID_TOKEN was `"DISNEY"` and your prompt was `"Spiderman swinging over buildings"`, the effective prompt used in training would be `"DISNEY Spiderman swinging over buildings"`. When not provided, you would either be training without any such additional token or could augment your dataset to apply the token where you wish before starting the training.
+
+> [!TIP]
+> You can pass `--use_8bit_adam` to reduce the memory requirements of training.
+
+> [!IMPORTANT]
+> The following settings have been tested at the time of adding CogVideoX LoRA training support:
+> - Our testing was primarily done on CogVideoX-2b. We will work on CogVideoX-5b and CogVideoX-5b-I2V soon
+> - One dataset comprised of 70 training videos of resolutions `200 x 480 x 720` (F x H x W). From this, by using frame skipping in data preprocessing, we created two smaller 49-frame and 16-frame datasets for faster experimentation and because the maximum limit recommended by the CogVideoX team is 49 frames. Out of the 70 videos, we created three groups of 10, 25 and 50 videos. All videos were similar in nature of the concept being trained.
+> - 25+ videos worked best for training new concepts and styles.
+> - We found that it is better to train with an identifier token that can be specified as `--id_token`. This is similar to Dreambooth-like training but normal finetuning without such a token works too.
+> - Trained concept seemed to work decently well when combined with completely unrelated prompts. We expect even better results if CogVideoX-5B is finetuned.
+> - The original repository uses a `lora_alpha` of `1`. We found this not suitable in many runs, possibly due to difference in modeling backends and training settings. Our recommendation is to set to the `lora_alpha` to either `rank` or `rank // 2`.
+> - If you're training on data whose captions generate bad results with the original model, a `rank` of 64 and above is good and also the recommendation by the team behind CogVideoX. If the generations are already moderately good on your training captions, a `rank` of 16/32 should work. We found that setting the rank too low, say `4`, is not ideal and doesn't produce promising results.
+> - The authors of CogVideoX recommend 4000 training steps and 100 training videos overall to achieve the best result. While that might yield the best results, we found from our limited experimentation that 2000 steps and 25 videos could also be sufficient.
+> - When using the Prodigy opitimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
+> - The recommended learning rate by the CogVideoX authors and from our experimentation with Adam/AdamW is between `1e-3` and `1e-4` for a dataset of 25+ videos.
+>
+> Note that our testing is not exhaustive due to limited time for exploration. Our recommendation would be to play around with the different knobs and dials to find the best settings for your data.
+
+<!-- TODO: Test finetuning with CogVideoX-5b and CogVideoX-5b-I2V and update scripts accordingly -->
+
+## Inference
+
+Once you have trained a lora model, the inference can be done simply loading the lora weights into the `CogVideoXPipeline`.
+
+```python
+import torch
+from diffusers import CogVideoXPipeline
+from diffusers.utils import export_to_video
+
+pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16)
+# pipe.load_lora_weights("/path/to/lora/weights", adapter_name="cogvideox-lora") # Or,
+pipe.load_lora_weights("my-awesome-hf-username/my-awesome-lora-name", adapter_name="cogvideox-lora") # If loading from the HF Hub
+pipe.to("cuda")
+
+# Assuming lora_alpha=32 and rank=64 for training. If different, set accordingly
+pipe.set_adapters(["cogvideox-lora"], [32 / 64])
+
+prompt = (
+    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The "
+    "panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
+    "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
+    "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
+    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
+    "atmosphere of this unique musical performance"
+)
+frames = pipe(prompt, guidance_scale=6, use_dynamic_cfg=True).frames[0]
+export_to_video(frames, "output.mp4", fps=8)
+```
diff --git a/diffusers/examples/cogvideo/requirements.txt b/diffusers/examples/cogvideo/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c2238804be9f4b41300ac2962fd76afca53f1e71
--- /dev/null
+++ b/diffusers/examples/cogvideo/requirements.txt
@@ -0,0 +1,10 @@
+accelerate>=0.31.0
+torchvision
+transformers>=4.41.2
+ftfy
+tensorboard
+Jinja2
+peft>=0.11.1
+sentencepiece
+decord>=0.6.0
+imageio-ffmpeg
\ No newline at end of file
diff --git a/diffusers/examples/cogvideo/train_cogvideox_lora.py b/diffusers/examples/cogvideo/train_cogvideox_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..137f3222f6d901990464388c478a9416d16f587e
--- /dev/null
+++ b/diffusers/examples/cogvideo/train_cogvideox_lora.py
@@ -0,0 +1,1544 @@
+# Copyright 2024 The HuggingFace Team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import math
+import os
+import shutil
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+
+import torch
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from peft import LoraConfig, get_peft_model_state_dict, set_peft_model_state_dict
+from torch.utils.data import DataLoader, Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, T5EncoderModel, T5Tokenizer
+
+import diffusers
+from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler, CogVideoXPipeline, CogVideoXTransformer3DModel
+from diffusers.models.embeddings import get_3d_rotary_pos_embed
+from diffusers.optimization import get_scheduler
+from diffusers.pipelines.cogvideo.pipeline_cogvideox import get_resize_crop_region_for_grid
+from diffusers.training_utils import (
+    cast_training_params,
+    clear_objs_and_retain_memory,
+)
+from diffusers.utils import check_min_version, convert_unet_state_dict_to_peft, export_to_video, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script for CogVideoX.")
+
+    # Model information
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    # Dataset information
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_root",
+        type=str,
+        default=None,
+        help=("A folder containing the training data."),
+    )
+    parser.add_argument(
+        "--video_column",
+        type=str,
+        default="video",
+        help="The column of the dataset containing videos. Or, the name of the file in `--instance_data_root` folder containing the line-separated path to video data.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing the instance prompt for each video. Or, the name of the file in `--instance_data_root` folder containing the line-separated instance prompts.",
+    )
+    parser.add_argument(
+        "--id_token", type=str, default=None, help="Identifier token appended to the start of each prompt if provided."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+
+    # Validation
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="One or more prompt(s) that is used during validation to verify that the model is learning. Multiple validation prompts should be separated by the '--validation_prompt_seperator' string.",
+    )
+    parser.add_argument(
+        "--validation_prompt_separator",
+        type=str,
+        default=":::",
+        help="String that separates multiple validation prompts",
+    )
+    parser.add_argument(
+        "--num_validation_videos",
+        type=int,
+        default=1,
+        help="Number of videos that should be generated during validation per `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run validation every X epochs. Validation consists of running the prompt `args.validation_prompt` multiple times: `args.num_validation_videos`."
+        ),
+    )
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=6,
+        help="The guidance scale to use while sampling validation videos.",
+    )
+    parser.add_argument(
+        "--use_dynamic_cfg",
+        action="store_true",
+        default=False,
+        help="Whether or not to use the default cosine dynamic guidance schedule when sampling validation videos.",
+    )
+
+    # Training information
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=128,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--lora_alpha",
+        type=float,
+        default=128,
+        help=("The scaling factor to scale LoRA weight update. The actual scaling factor is `lora_alpha / rank`"),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="cogvideox-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--height",
+        type=int,
+        default=480,
+        help="All input videos are resized to this height.",
+    )
+    parser.add_argument(
+        "--width",
+        type=int,
+        default=720,
+        help="All input videos are resized to this width.",
+    )
+    parser.add_argument("--fps", type=int, default=8, help="All input videos will be used at this FPS.")
+    parser.add_argument(
+        "--max_num_frames", type=int, default=49, help="All input videos will be truncated to these many frames."
+    )
+    parser.add_argument(
+        "--skip_frames_start",
+        type=int,
+        default=0,
+        help="Number of frames to skip from the beginning of each input video. Useful if training data contains intro sequences.",
+    )
+    parser.add_argument(
+        "--skip_frames_end",
+        type=int,
+        default=0,
+        help="Number of frames to skip from the end of each input video. Useful if training data contains outro sequences.",
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip videos horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform. If provided, overrides `--num_train_epochs`.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--enable_slicing",
+        action="store_true",
+        default=False,
+        help="Whether or not to use VAE slicing for saving memory.",
+    )
+    parser.add_argument(
+        "--enable_tiling",
+        action="store_true",
+        default=False,
+        help="Whether or not to use VAE tiling for saving memory.",
+    )
+
+    # Optimizer
+    parser.add_argument(
+        "--optimizer",
+        type=lambda s: s.lower(),
+        default="adam",
+        choices=["adam", "adamw", "prodigy"],
+        help=("The optimizer type to use."),
+    )
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.95, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="Coefficients for computing the Prodigy optimizer's stepsize using running averages. If set to None, uses the value of square root of beta2.",
+    )
+    parser.add_argument("--prodigy_decouple", action="store_true", help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--prodigy_use_bias_correction", action="store_true", help="Turn on Adam's bias correction.")
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        action="store_true",
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage.",
+    )
+
+    # Other information
+    parser.add_argument("--tracker_name", type=str, default=None, help="Project tracker name")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help="Directory where logs are stored.",
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default=None,
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+
+    return parser.parse_args()
+
+
+class VideoDataset(Dataset):
+    def __init__(
+        self,
+        instance_data_root: Optional[str] = None,
+        dataset_name: Optional[str] = None,
+        dataset_config_name: Optional[str] = None,
+        caption_column: str = "text",
+        video_column: str = "video",
+        height: int = 480,
+        width: int = 720,
+        fps: int = 8,
+        max_num_frames: int = 49,
+        skip_frames_start: int = 0,
+        skip_frames_end: int = 0,
+        cache_dir: Optional[str] = None,
+        id_token: Optional[str] = None,
+    ) -> None:
+        super().__init__()
+
+        self.instance_data_root = Path(instance_data_root) if instance_data_root is not None else None
+        self.dataset_name = dataset_name
+        self.dataset_config_name = dataset_config_name
+        self.caption_column = caption_column
+        self.video_column = video_column
+        self.height = height
+        self.width = width
+        self.fps = fps
+        self.max_num_frames = max_num_frames
+        self.skip_frames_start = skip_frames_start
+        self.skip_frames_end = skip_frames_end
+        self.cache_dir = cache_dir
+        self.id_token = id_token or ""
+
+        if dataset_name is not None:
+            self.instance_prompts, self.instance_video_paths = self._load_dataset_from_hub()
+        else:
+            self.instance_prompts, self.instance_video_paths = self._load_dataset_from_local_path()
+
+        self.num_instance_videos = len(self.instance_video_paths)
+        if self.num_instance_videos != len(self.instance_prompts):
+            raise ValueError(
+                f"Expected length of instance prompts and videos to be the same but found {len(self.instance_prompts)=} and {len(self.instance_video_paths)=}. Please ensure that the number of caption prompts and videos match in your dataset."
+            )
+
+        self.instance_videos = self._preprocess_data()
+
+    def __len__(self):
+        return self.num_instance_videos
+
+    def __getitem__(self, index):
+        return {
+            "instance_prompt": self.id_token + self.instance_prompts[index],
+            "instance_video": self.instance_videos[index],
+        }
+
+    def _load_dataset_from_hub(self):
+        try:
+            from datasets import load_dataset
+        except ImportError:
+            raise ImportError(
+                "You are trying to load your data using the datasets library. If you wish to train using custom "
+                "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                "local folder containing images only, specify --instance_data_root instead."
+            )
+
+        # Downloading and loading a dataset from the hub. See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+        dataset = load_dataset(
+            self.dataset_name,
+            self.dataset_config_name,
+            cache_dir=self.cache_dir,
+        )
+        column_names = dataset["train"].column_names
+
+        if self.video_column is None:
+            video_column = column_names[0]
+            logger.info(f"`video_column` defaulting to {video_column}")
+        else:
+            video_column = self.video_column
+            if video_column not in column_names:
+                raise ValueError(
+                    f"`--video_column` value '{video_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                )
+
+        if self.caption_column is None:
+            caption_column = column_names[1]
+            logger.info(f"`caption_column` defaulting to {caption_column}")
+        else:
+            caption_column = self.caption_column
+            if self.caption_column not in column_names:
+                raise ValueError(
+                    f"`--caption_column` value '{self.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                )
+
+        instance_prompts = dataset["train"][caption_column]
+        instance_videos = [Path(self.instance_data_root, filepath) for filepath in dataset["train"][video_column]]
+
+        return instance_prompts, instance_videos
+
+    def _load_dataset_from_local_path(self):
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance videos root folder does not exist")
+
+        prompt_path = self.instance_data_root.joinpath(self.caption_column)
+        video_path = self.instance_data_root.joinpath(self.video_column)
+
+        if not prompt_path.exists() or not prompt_path.is_file():
+            raise ValueError(
+                "Expected `--caption_column` to be path to a file in `--instance_data_root` containing line-separated text prompts."
+            )
+        if not video_path.exists() or not video_path.is_file():
+            raise ValueError(
+                "Expected `--video_column` to be path to a file in `--instance_data_root` containing line-separated paths to video data in the same directory."
+            )
+
+        with open(prompt_path, "r", encoding="utf-8") as file:
+            instance_prompts = [line.strip() for line in file.readlines() if len(line.strip()) > 0]
+        with open(video_path, "r", encoding="utf-8") as file:
+            instance_videos = [
+                self.instance_data_root.joinpath(line.strip()) for line in file.readlines() if len(line.strip()) > 0
+            ]
+
+        if any(not path.is_file() for path in instance_videos):
+            raise ValueError(
+                "Expected '--video_column' to be a path to a file in `--instance_data_root` containing line-separated paths to video data but found atleast one path that is not a valid file."
+            )
+
+        return instance_prompts, instance_videos
+
+    def _preprocess_data(self):
+        try:
+            import decord
+        except ImportError:
+            raise ImportError(
+                "The `decord` package is required for loading the video dataset. Install with `pip install decord`"
+            )
+
+        decord.bridge.set_bridge("torch")
+
+        videos = []
+        train_transforms = transforms.Compose(
+            [
+                transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0),
+            ]
+        )
+
+        for filename in self.instance_video_paths:
+            video_reader = decord.VideoReader(uri=filename.as_posix(), width=self.width, height=self.height)
+            video_num_frames = len(video_reader)
+
+            start_frame = min(self.skip_frames_start, video_num_frames)
+            end_frame = max(0, video_num_frames - self.skip_frames_end)
+            if end_frame <= start_frame:
+                frames = video_reader.get_batch([start_frame])
+            elif end_frame - start_frame <= self.max_num_frames:
+                frames = video_reader.get_batch(list(range(start_frame, end_frame)))
+            else:
+                indices = list(range(start_frame, end_frame, (end_frame - start_frame) // self.max_num_frames))
+                frames = video_reader.get_batch(indices)
+
+            # Ensure that we don't go over the limit
+            frames = frames[: self.max_num_frames]
+            selected_num_frames = frames.shape[0]
+
+            # Choose first (4k + 1) frames as this is how many is required by the VAE
+            remainder = (3 + (selected_num_frames % 4)) % 4
+            if remainder != 0:
+                frames = frames[:-remainder]
+            selected_num_frames = frames.shape[0]
+
+            assert (selected_num_frames - 1) % 4 == 0
+
+            # Training transforms
+            frames = frames.float()
+            frames = torch.stack([train_transforms(frame) for frame in frames], dim=0)
+            videos.append(frames.permute(0, 3, 1, 2).contiguous())  # [F, C, H, W]
+
+        return videos
+
+
+def save_model_card(
+    repo_id: str,
+    videos=None,
+    base_model: str = None,
+    validation_prompt=None,
+    repo_folder=None,
+    fps=8,
+):
+    widget_dict = []
+    if videos is not None:
+        for i, video in enumerate(videos):
+            export_to_video(video, os.path.join(repo_folder, f"final_video_{i}.mp4", fps=fps))
+            widget_dict.append(
+                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"video_{i}.mp4"}}
+            )
+
+    model_description = f"""
+# CogVideoX LoRA - {repo_id}
+
+<Gallery />
+
+## Model description
+
+These are {repo_id} LoRA weights for {base_model}.
+
+The weights were trained using the [CogVideoX Diffusers trainer](https://github.com/huggingface/diffusers/blob/main/examples/cogvideo/train_cogvideox_lora.py).
+
+Was LoRA for the text encoder enabled? No.
+
+## Download model
+
+[Download the *.safetensors LoRA]({repo_id}/tree/main) in the Files & versions tab.
+
+## Use it with the [🧨 diffusers library](https://github.com/huggingface/diffusers)
+
+```py
+from diffusers import CogVideoXPipeline
+import torch
+
+pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16).to("cuda")
+pipe.load_lora_weights("{repo_id}", weight_name="pytorch_lora_weights.safetensors", adapter_name=["cogvideox-lora"])
+
+# The LoRA adapter weights are determined by what was used for training.
+# In this case, we assume `--lora_alpha` is 32 and `--rank` is 64.
+# It can be made lower or higher from what was used in training to decrease or amplify the effect
+# of the LoRA upto a tolerance, beyond which one might notice no effect at all or overflows.
+pipe.set_adapters(["cogvideox-lora"], [32 / 64])
+
+video = pipe("{validation_prompt}", guidance_scale=6, use_dynamic_cfg=True).frames[0]
+```
+
+For more details, including weighting, merging and fusing LoRAs, check the [documentation on loading LoRAs in diffusers](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading_adapters)
+
+## License
+
+Please adhere to the licensing terms as described [here](https://huggingface.co/THUDM/CogVideoX-5b/blob/main/LICENSE) and [here](https://huggingface.co/THUDM/CogVideoX-2b/blob/main/LICENSE).
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="other",
+        base_model=base_model,
+        prompt=validation_prompt,
+        model_description=model_description,
+        widget=widget_dict,
+    )
+    tags = [
+        "text-to-video",
+        "diffusers-training",
+        "diffusers",
+        "lora",
+        "cogvideox",
+        "cogvideox-diffusers",
+        "template:sd-lora",
+    ]
+
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    pipe,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    is_final_validation: bool = False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_videos} videos with prompt: {pipeline_args['prompt']}."
+    )
+    # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+    scheduler_args = {}
+
+    if "variance_type" in pipe.scheduler.config:
+        variance_type = pipe.scheduler.config.variance_type
+
+        if variance_type in ["learned", "learned_range"]:
+            variance_type = "fixed_small"
+
+        scheduler_args["variance_type"] = variance_type
+
+    pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, **scheduler_args)
+    pipe = pipe.to(accelerator.device)
+    # pipe.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+
+    videos = []
+    for _ in range(args.num_validation_videos):
+        video = pipe(**pipeline_args, generator=generator, output_type="np").frames[0]
+        videos.append(video)
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "wandb":
+            video_filenames = []
+            for i, video in enumerate(videos):
+                prompt = (
+                    pipeline_args["prompt"][:25]
+                    .replace(" ", "_")
+                    .replace(" ", "_")
+                    .replace("'", "_")
+                    .replace('"', "_")
+                    .replace("/", "_")
+                )
+                filename = os.path.join(args.output_dir, f"{phase_name}_video_{i}_{prompt}.mp4")
+                export_to_video(video, filename, fps=8)
+                video_filenames.append(filename)
+
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Video(filename, caption=f"{i}: {pipeline_args['prompt']}")
+                        for i, filename in enumerate(video_filenames)
+                    ]
+                }
+            )
+
+    clear_objs_and_retain_memory([pipe])
+
+    return videos
+
+
+def _get_t5_prompt_embeds(
+    tokenizer: T5Tokenizer,
+    text_encoder: T5EncoderModel,
+    prompt: Union[str, List[str]],
+    num_videos_per_prompt: int = 1,
+    max_sequence_length: int = 226,
+    device: Optional[torch.device] = None,
+    dtype: Optional[torch.dtype] = None,
+    text_input_ids=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    if tokenizer is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("`text_input_ids` must be provided when the tokenizer is not specified.")
+
+    prompt_embeds = text_encoder(text_input_ids.to(device))[0]
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+    # duplicate text embeddings for each generation per prompt, using mps friendly method
+    _, seq_len, _ = prompt_embeds.shape
+    prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+    return prompt_embeds
+
+
+def encode_prompt(
+    tokenizer: T5Tokenizer,
+    text_encoder: T5EncoderModel,
+    prompt: Union[str, List[str]],
+    num_videos_per_prompt: int = 1,
+    max_sequence_length: int = 226,
+    device: Optional[torch.device] = None,
+    dtype: Optional[torch.dtype] = None,
+    text_input_ids=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    prompt_embeds = _get_t5_prompt_embeds(
+        tokenizer,
+        text_encoder,
+        prompt=prompt,
+        num_videos_per_prompt=num_videos_per_prompt,
+        max_sequence_length=max_sequence_length,
+        device=device,
+        dtype=dtype,
+        text_input_ids=text_input_ids,
+    )
+    return prompt_embeds
+
+
+def compute_prompt_embeddings(
+    tokenizer, text_encoder, prompt, max_sequence_length, device, dtype, requires_grad: bool = False
+):
+    if requires_grad:
+        prompt_embeds = encode_prompt(
+            tokenizer,
+            text_encoder,
+            prompt,
+            num_videos_per_prompt=1,
+            max_sequence_length=max_sequence_length,
+            device=device,
+            dtype=dtype,
+        )
+    else:
+        with torch.no_grad():
+            prompt_embeds = encode_prompt(
+                tokenizer,
+                text_encoder,
+                prompt,
+                num_videos_per_prompt=1,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+    return prompt_embeds
+
+
+def prepare_rotary_positional_embeddings(
+    height: int,
+    width: int,
+    num_frames: int,
+    vae_scale_factor_spatial: int = 8,
+    patch_size: int = 2,
+    attention_head_dim: int = 64,
+    device: Optional[torch.device] = None,
+    base_height: int = 480,
+    base_width: int = 720,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    grid_height = height // (vae_scale_factor_spatial * patch_size)
+    grid_width = width // (vae_scale_factor_spatial * patch_size)
+    base_size_width = base_width // (vae_scale_factor_spatial * patch_size)
+    base_size_height = base_height // (vae_scale_factor_spatial * patch_size)
+
+    grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height)
+    freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+        embed_dim=attention_head_dim,
+        crops_coords=grid_crops_coords,
+        grid_size=(grid_height, grid_width),
+        temporal_size=num_frames,
+    )
+
+    freqs_cos = freqs_cos.to(device=device)
+    freqs_sin = freqs_sin.to(device=device)
+    return freqs_cos, freqs_sin
+
+
+def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
+    # Use DeepSpeed optimzer
+    if use_deepspeed:
+        from accelerate.utils import DummyOptim
+
+        return DummyOptim(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            eps=args.adam_epsilon,
+            weight_decay=args.adam_weight_decay,
+        )
+
+    # Optimizer creation
+    supported_optimizers = ["adam", "adamw", "prodigy"]
+    if args.optimizer not in supported_optimizers:
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}. Supported optimizers include {supported_optimizers}. Defaulting to AdamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not (args.optimizer.lower() not in ["adam", "adamw"]):
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'Adam' or 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+    if args.optimizer.lower() == "adamw":
+        optimizer_class = bnb.optim.AdamW8bit if args.use_8bit_adam else torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            eps=args.adam_epsilon,
+            weight_decay=args.adam_weight_decay,
+        )
+    elif args.optimizer.lower() == "adam":
+        optimizer_class = bnb.optim.Adam8bit if args.use_8bit_adam else torch.optim.Adam
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            eps=args.adam_epsilon,
+            weight_decay=args.adam_weight_decay,
+        )
+    elif args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    return optimizer
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+            ).repo_id
+
+    # Prepare models and scheduler
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+    )
+
+    text_encoder = T5EncoderModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+
+    # CogVideoX-2b weights are stored in float16
+    # CogVideoX-5b and CogVideoX-5b-I2V weights are stored in bfloat16
+    load_dtype = torch.bfloat16 if "5b" in args.pretrained_model_name_or_path.lower() else torch.float16
+    transformer = CogVideoXTransformer3DModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="transformer",
+        torch_dtype=load_dtype,
+        revision=args.revision,
+        variant=args.variant,
+    )
+
+    vae = AutoencoderKLCogVideoX.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
+
+    scheduler = CogVideoXDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    if args.enable_slicing:
+        vae.enable_slicing()
+    if args.enable_tiling:
+        vae.enable_tiling()
+
+    # We only train the additional adapter LoRA layers
+    text_encoder.requires_grad_(False)
+    transformer.requires_grad_(False)
+    vae.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.state.deepspeed_plugin:
+        # DeepSpeed is handling precision, use what's in the DeepSpeed config
+        if (
+            "fp16" in accelerator.state.deepspeed_plugin.deepspeed_config
+            and accelerator.state.deepspeed_plugin.deepspeed_config["fp16"]["enabled"]
+        ):
+            weight_dtype = torch.float16
+        if (
+            "bf16" in accelerator.state.deepspeed_plugin.deepspeed_config
+            and accelerator.state.deepspeed_plugin.deepspeed_config["bf16"]["enabled"]
+        ):
+            weight_dtype = torch.float16
+    else:
+        if accelerator.mixed_precision == "fp16":
+            weight_dtype = torch.float16
+        elif accelerator.mixed_precision == "bf16":
+            weight_dtype = torch.bfloat16
+
+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+    transformer.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    if args.gradient_checkpointing:
+        transformer.enable_gradient_checkpointing()
+
+    # now we will add new LoRA weights to the attention layers
+    transformer_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.lora_alpha,
+        init_lora_weights=True,
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    transformer.add_adapter(transformer_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            transformer_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(unwrap_model(transformer))):
+                    transformer_lora_layers_to_save = get_peft_model_state_dict(model)
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            CogVideoXPipeline.save_lora_weights(
+                output_dir,
+                transformer_lora_layers=transformer_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        transformer_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(transformer))):
+                transformer_ = model
+            else:
+                raise ValueError(f"Unexpected save model: {model.__class__}")
+
+        lora_state_dict = CogVideoXPipeline.lora_state_dict(input_dir)
+
+        transformer_state_dict = {
+            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.")
+        }
+        transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
+        incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params([transformer_])
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params([transformer], dtype=torch.float32)
+
+    transformer_lora_parameters = list(filter(lambda p: p.requires_grad, transformer.parameters()))
+
+    # Optimization parameters
+    transformer_parameters_with_lr = {"params": transformer_lora_parameters, "lr": args.learning_rate}
+    params_to_optimize = [transformer_parameters_with_lr]
+
+    use_deepspeed_optimizer = (
+        accelerator.state.deepspeed_plugin is not None
+        and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
+    )
+    use_deepspeed_scheduler = (
+        accelerator.state.deepspeed_plugin is not None
+        and "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
+    )
+
+    optimizer = get_optimizer(args, params_to_optimize, use_deepspeed=use_deepspeed_optimizer)
+
+    # Dataset and DataLoader
+    train_dataset = VideoDataset(
+        instance_data_root=args.instance_data_root,
+        dataset_name=args.dataset_name,
+        dataset_config_name=args.dataset_config_name,
+        caption_column=args.caption_column,
+        video_column=args.video_column,
+        height=args.height,
+        width=args.width,
+        fps=args.fps,
+        max_num_frames=args.max_num_frames,
+        skip_frames_start=args.skip_frames_start,
+        skip_frames_end=args.skip_frames_end,
+        cache_dir=args.cache_dir,
+        id_token=args.id_token,
+    )
+
+    def encode_video(video):
+        video = video.to(accelerator.device, dtype=vae.dtype).unsqueeze(0)
+        video = video.permute(0, 2, 1, 3, 4)  # [B, C, F, H, W]
+        latent_dist = vae.encode(video).latent_dist
+        return latent_dist
+
+    train_dataset.instance_videos = [encode_video(video) for video in train_dataset.instance_videos]
+
+    def collate_fn(examples):
+        videos = [example["instance_video"].sample() * vae.config.scaling_factor for example in examples]
+        prompts = [example["instance_prompt"] for example in examples]
+
+        videos = torch.cat(videos)
+        videos = videos.to(memory_format=torch.contiguous_format).float()
+
+        return {
+            "videos": videos,
+            "prompts": prompts,
+        }
+
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=collate_fn,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    if use_deepspeed_scheduler:
+        from accelerate.utils import DummyScheduler
+
+        lr_scheduler = DummyScheduler(
+            name=args.lr_scheduler,
+            optimizer=optimizer,
+            total_num_steps=args.max_train_steps * accelerator.num_processes,
+            num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        )
+    else:
+        lr_scheduler = get_scheduler(
+            args.lr_scheduler,
+            optimizer=optimizer,
+            num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+            num_training_steps=args.max_train_steps * accelerator.num_processes,
+            num_cycles=args.lr_num_cycles,
+            power=args.lr_power,
+        )
+
+    # Prepare everything with our `accelerator`.
+    transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        transformer, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = args.tracker_name or "cogvideox-lora"
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+    num_trainable_parameters = sum(param.numel() for model in params_to_optimize for param in model["params"])
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num trainable parameters = {num_trainable_parameters}")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if not args.resume_from_checkpoint:
+        initial_global_step = 0
+    else:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+    vae_scale_factor_spatial = 2 ** (len(vae.config.block_out_channels) - 1)
+
+    # For DeepSpeed training
+    model_config = transformer.module.config if hasattr(transformer, "module") else transformer.config
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        transformer.train()
+
+        for step, batch in enumerate(train_dataloader):
+            models_to_accumulate = [transformer]
+
+            with accelerator.accumulate(models_to_accumulate):
+                model_input = batch["videos"].permute(0, 2, 1, 3, 4).to(dtype=weight_dtype)  # [B, F, C, H, W]
+                prompts = batch["prompts"]
+
+                # encode prompts
+                prompt_embeds = compute_prompt_embeddings(
+                    tokenizer,
+                    text_encoder,
+                    prompts,
+                    model_config.max_text_seq_length,
+                    accelerator.device,
+                    weight_dtype,
+                    requires_grad=False,
+                )
+
+                # Sample noise that will be added to the latents
+                noise = torch.randn_like(model_input)
+                batch_size, num_frames, num_channels, height, width = model_input.shape
+
+                # Sample a random timestep for each image
+                timesteps = torch.randint(
+                    0, scheduler.config.num_train_timesteps, (batch_size,), device=model_input.device
+                )
+                timesteps = timesteps.long()
+
+                # Prepare rotary embeds
+                image_rotary_emb = (
+                    prepare_rotary_positional_embeddings(
+                        height=args.height,
+                        width=args.width,
+                        num_frames=num_frames,
+                        vae_scale_factor_spatial=vae_scale_factor_spatial,
+                        patch_size=model_config.patch_size,
+                        attention_head_dim=model_config.attention_head_dim,
+                        device=accelerator.device,
+                    )
+                    if model_config.use_rotary_positional_embeddings
+                    else None
+                )
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = scheduler.add_noise(model_input, noise, timesteps)
+
+                # Predict the noise residual
+                model_output = transformer(
+                    hidden_states=noisy_model_input,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep=timesteps,
+                    image_rotary_emb=image_rotary_emb,
+                    return_dict=False,
+                )[0]
+                model_pred = scheduler.get_velocity(model_output, noisy_model_input, timesteps)
+
+                alphas_cumprod = scheduler.alphas_cumprod[timesteps]
+                weights = 1 / (1 - alphas_cumprod)
+                while len(weights.shape) < len(model_pred.shape):
+                    weights = weights.unsqueeze(-1)
+
+                target = model_input
+
+                loss = torch.mean((weights * (model_pred - target) ** 2).reshape(batch_size, -1), dim=1)
+                loss = loss.mean()
+                accelerator.backward(loss)
+
+                if accelerator.sync_gradients:
+                    params_to_clip = transformer.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                if accelerator.state.deepspeed_plugin is None:
+                    optimizer.step()
+                    optimizer.zero_grad()
+
+                lr_scheduler.step()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"Removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and (epoch + 1) % args.validation_epochs == 0:
+                # Create pipeline
+                pipe = CogVideoXPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    transformer=unwrap_model(transformer),
+                    text_encoder=unwrap_model(text_encoder),
+                    vae=unwrap_model(vae),
+                    scheduler=scheduler,
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+
+                validation_prompts = args.validation_prompt.split(args.validation_prompt_separator)
+                for validation_prompt in validation_prompts:
+                    pipeline_args = {
+                        "prompt": validation_prompt,
+                        "guidance_scale": args.guidance_scale,
+                        "use_dynamic_cfg": args.use_dynamic_cfg,
+                        "height": args.height,
+                        "width": args.width,
+                    }
+
+                    validation_outputs = log_validation(
+                        pipe=pipe,
+                        args=args,
+                        accelerator=accelerator,
+                        pipeline_args=pipeline_args,
+                        epoch=epoch,
+                    )
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        transformer = unwrap_model(transformer)
+        dtype = (
+            torch.float16
+            if args.mixed_precision == "fp16"
+            else torch.bfloat16
+            if args.mixed_precision == "bf16"
+            else torch.float32
+        )
+        transformer = transformer.to(dtype)
+        transformer_lora_layers = get_peft_model_state_dict(transformer)
+
+        CogVideoXPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            transformer_lora_layers=transformer_lora_layers,
+        )
+
+        # Final test inference
+        pipe = CogVideoXPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config)
+
+        if args.enable_slicing:
+            pipe.vae.enable_slicing()
+        if args.enable_tiling:
+            pipe.vae.enable_tiling()
+
+        # Load LoRA weights
+        lora_scaling = args.lora_alpha / args.rank
+        pipe.load_lora_weights(args.output_dir, adapter_name="cogvideox-lora")
+        pipe.set_adapters(["cogvideox-lora"], [lora_scaling])
+
+        # Run inference
+        validation_outputs = []
+        if args.validation_prompt and args.num_validation_videos > 0:
+            validation_prompts = args.validation_prompt.split(args.validation_prompt_separator)
+            for validation_prompt in validation_prompts:
+                pipeline_args = {
+                    "prompt": validation_prompt,
+                    "guidance_scale": args.guidance_scale,
+                    "use_dynamic_cfg": args.use_dynamic_cfg,
+                    "height": args.height,
+                    "width": args.width,
+                }
+
+                video = log_validation(
+                    pipe=pipe,
+                    args=args,
+                    accelerator=accelerator,
+                    pipeline_args=pipeline_args,
+                    epoch=epoch,
+                    is_final_validation=True,
+                )
+                validation_outputs.extend(video)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                videos=validation_outputs,
+                base_model=args.pretrained_model_name_or_path,
+                validation_prompt=args.validation_prompt,
+                repo_folder=args.output_dir,
+                fps=args.fps,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = get_args()
+    main(args)
diff --git a/diffusers/examples/community/README.md b/diffusers/examples/community/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e51124e759560abd6c829c90f597e45de31dec70
--- /dev/null
+++ b/diffusers/examples/community/README.md
@@ -0,0 +1,4402 @@
+# Community Pipeline Examples
+
+> **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).**
+
+**Community pipeline** examples consist pipelines that have been added by the community.
+Please have a look at the following tables to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
+If a community pipeline doesn't work as expected, please open an issue and ping the author on it.
+
+Please also check out our [Community Scripts](https://github.com/huggingface/diffusers/blob/main/examples/community/README_community_scripts.md) examples for tips and tricks that you can use with diffusers without having to run a community pipeline.
+
+| Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
+|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
+|Flux with CFG|[Flux with CFG](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md) provides an implementation of using CFG in [Flux](https://blackforestlabs.ai/announcing-black-forest-labs/).|[Flux with CFG](#flux-with-cfg)|NA|[Linoy Tsaban](https://github.com/linoytsaban), [Apolinário](https://github.com/apolinario), and [Sayak Paul](https://github.com/sayakpaul)|
+|Differential Diffusion|[Differential Diffusion](https://github.com/exx8/differential-diffusion) modifies an image according to a text prompt, and according to a map that specifies the amount of change in each region.|[Differential Diffusion](#differential-diffusion)|[![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/exx8/differential-diffusion) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/exx8/differential-diffusion/blob/main/examples/SD2.ipynb)|[Eran Levin](https://github.com/exx8) and [Ohad Fried](https://www.ohadf.com/)|
+| HD-Painter                                                                                                                            | [HD-Painter](https://github.com/Picsart-AI-Research/HD-Painter) enables prompt-faithfull and high resolution (up to 2k) image inpainting upon any diffusion-based image inpainting method.                                                                                                                                                                                                                                                                                                               | [HD-Painter](#hd-painter)                                                                 | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/PAIR/HD-Painter)                                                                              | [Manukyan Hayk](https://github.com/haikmanukyan) and [Sargsyan Andranik](https://github.com/AndranikSargsyan) |
+| Marigold Monocular Depth Estimation                                                                                                   | A universal monocular depth estimator, utilizing Stable Diffusion, delivering sharp predictions in the wild. (See the [project page](https://marigoldmonodepth.github.io) and [full codebase](https://github.com/prs-eth/marigold) for more details.)                                                                                                                                                                                                                                                        | [Marigold Depth Estimation](#marigold-depth-estimation)                                   | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/toshas/marigold) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/12G8reD13DdpMie5ZQlaFNo2WCGeNUH-u?usp=sharing) | [Bingxin Ke](https://github.com/markkua) and [Anton Obukhov](https://github.com/toshas) |
+| LLM-grounded Diffusion (LMD+)                                                                                                         | LMD greatly improves the prompt following ability of text-to-image generation models by introducing an LLM as a front-end prompt parser and layout planner. [Project page.](https://llm-grounded-diffusion.github.io/) [See our full codebase (also with diffusers).](https://github.com/TonyLianLong/LLM-groundedDiffusion)                                                                                                                                                                                                                                                                                                                                                                                                                                   | [LLM-grounded Diffusion (LMD+)](#llm-grounded-diffusion)                             | [Huggingface Demo](https://huggingface.co/spaces/longlian/llm-grounded-diffusion) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SXzMSeAB-LJYISb2yrUOdypLz4OYWUKj) |                [Long (Tony) Lian](https://tonylian.com/) |
+| CLIP Guided Stable Diffusion                                                                                                          | Doing CLIP guidance for text to image generation with Stable Diffusion                                                                                                                                                                                                                                                                                                                                                                                                                                   | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion)                             | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) |                [Suraj Patil](https://github.com/patil-suraj/) |
+| One Step U-Net (Dummy)                                                                                                                | Example showcasing of how to use Community Pipelines (see <https://github.com/huggingface/diffusers/issues/841>)                                                                                                                                                                                                                                                                                                                                                                                           | [One Step U-Net](#one-step-unet)                                                          | -                                                                                                                                                                                                                  |    [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Stable Diffusion Interpolation                                                                                                        | Interpolate the latent space of Stable Diffusion between different prompts/seeds                                                                                                                                                                                                                                                                                                                                                                                                                         | [Stable Diffusion Interpolation](#stable-diffusion-interpolation)                         | -                                                                                                                                                                                                                  |                       [Nate Raw](https://github.com/nateraw/) |
+| Stable Diffusion Mega                                                                                                                 | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega)                                           | -                                                                                                                                                                                                                  |    [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Long Prompt Weighting Stable Diffusion                                                                                                | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt.                                                                                                                                                                                                                                                                                                                                                                                                  | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion)         | -                                                                                                                                                                                                                  |                           [SkyTNT](https://github.com/SkyTNT) |
+| Speech to Image                                                                                                                       | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images                                                                                                                                                                                                                                                                                                                                                                                                            | [Speech to Image](#speech-to-image)                                                       | -                                                                                                                                                                                                                  |             [Mikail Duzenli](https://github.com/MikailINTech)
+| Wild Card Stable Diffusion                                                                                                            | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values                                                                                                                                                                                                                                                                            | [Wildcard Stable Diffusion](#wildcard-stable-diffusion)                                   | -                                                                                                                                                                                                                  |              [Shyam Sudhakaran](https://github.com/shyamsn97) |
+| [Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) | Stable Diffusion Pipeline that supports prompts that contain "&#124;" in prompts (as an AND condition) and weights (separated by "&#124;" as well) to positively / negatively weight prompts.                                                                                                                                                                                                                                                                                                            | [Composable Stable Diffusion](#composable-stable-diffusion)                               | -                                                                                                                                                                                                                  |                      [Mark Rich](https://github.com/MarkRich) |
+| Seed Resizing Stable Diffusion                                                                                                        | Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation.                                                                                                                                                                                                                                                                                                                                                                                       | [Seed Resizing](#seed-resizing)                                                           | -                                                                                                                                                                                                                  |                      [Mark Rich](https://github.com/MarkRich) |
+| Imagic Stable Diffusion                                                                                                               | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image                                                                                                                                                                                                                                                                                                                                                                                                                   | [Imagic Stable Diffusion](#imagic-stable-diffusion)                                       | -                                                                                                                                                                                                                  |                      [Mark Rich](https://github.com/MarkRich) |
+| Multilingual Stable Diffusion                                                                                                         | Stable Diffusion Pipeline that supports prompts in 50 different languages.                                                                                                                                                                                                                                                                                                                                                                                                                               | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline)                  | -                                                                                                                                                                                                                  |          [Juan Carlos Piñeros](https://github.com/juancopi81) |
+| GlueGen Stable Diffusion                                                                                                         | Stable Diffusion Pipeline that supports prompts in different languages using GlueGen adapter.                                                                                                                                                                                                                                                                                                                                                                                                                               | [GlueGen Stable Diffusion](#gluegen-stable-diffusion-pipeline)                  | -                                                                                                                                                                                                                  |          [Phạm Hồng Vinh](https://github.com/rootonchair) |
+| Image to Image Inpainting Stable Diffusion                                                                                            | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting                                                                                                                                                                                                                                                                                                                                                                                                            | [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion) | -                                                                                                                                                                                                                  |                    [Alex McKinney](https://github.com/vvvm23) |
+| Text Based Inpainting Stable Diffusion                                                                                                | Stable Diffusion Inpainting Pipeline that enables passing a text prompt to generate the mask for inpainting                                                                                                                                                                                                                                                                                                                                                                                              | [Text Based Inpainting Stable Diffusion](#text-based-inpainting-stable-diffusion)     | -                                                                                                                                                                                                                  |                   [Dhruv Karan](https://github.com/unography) |
+| Bit Diffusion                                                                                                                         | Diffusion on discrete data                                                                                                                                                                                                                                                                                                                                                                                                                                                                               | [Bit Diffusion](#bit-diffusion)                                                           | -  |                       [Stuti R.](https://github.com/kingstut) |
+| K-Diffusion Stable Diffusion                                                                                                          | Run Stable Diffusion with any of [K-Diffusion's samplers](https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py)                                                                                                                                                                                                                                                                                                                                                                  | [Stable Diffusion with K Diffusion](#stable-diffusion-with-k-diffusion)                   | -  |    [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Checkpoint Merger Pipeline                                                                                                            | Diffusion Pipeline that enables merging of saved model checkpoints                                                                                                                                                                                                                                                                                                                                                                                                                                       | [Checkpoint Merger Pipeline](#checkpoint-merger-pipeline)                                 | -                                                                                                                                                                                                                  | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |
+| Stable Diffusion v1.1-1.4 Comparison                                                                                                  | Run all 4 model checkpoints for Stable Diffusion and compare their results together                                                                                                                                                                                                                                                                                                                                                                                                                      | [Stable Diffusion Comparison](#stable-diffusion-comparisons)                              | - |        [Suvaditya Mukherjee](https://github.com/suvadityamuk) |
+| MagicMix                                                                                                                              | Diffusion Pipeline for semantic mixing of an image and a text prompt                                                                                                                                                                                                                                                                                                                                                                                                                                     | [MagicMix](#magic-mix)                                                                    | - |                    [Partho Das](https://github.com/daspartho) |
+| Stable UnCLIP                                                                                                                         | Diffusion Pipeline for combining prior model (generate clip image embedding from text, UnCLIPPipeline `"kakaobrain/karlo-v1-alpha"`) and decoder pipeline (decode clip image embedding to image, StableDiffusionImageVariationPipeline `"lambdalabs/sd-image-variations-diffusers"` ).                                                                                                                                                                                                                   | [Stable UnCLIP](#stable-unclip)                                                           | -  |                                [Ray Wang](https://wrong.wang) |
+| UnCLIP Text Interpolation Pipeline                                                                                                    | Diffusion Pipeline that allows passing two prompts and produces images while interpolating between the text-embeddings of the two prompts                                                                                                                                                                                                                                                                                                                                                                | [UnCLIP Text Interpolation Pipeline](#unclip-text-interpolation-pipeline)                 | -                                                                                                                                                                                                                  | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |
+| UnCLIP Image Interpolation Pipeline                                                                                                   | Diffusion Pipeline that allows passing two images/image_embeddings and produces images while interpolating between their image-embeddings                                                                                                                                                                                                                                                                                                                                                                | [UnCLIP Image Interpolation Pipeline](#unclip-image-interpolation-pipeline)               | -                                                                                                                                                                                                                  | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |
+| DDIM Noise Comparative Analysis Pipeline                                                                                              | Investigating how the diffusion models learn visual concepts from each noise level (which is a contribution of [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227))                                                                                                                                                                                                                                                                                                                             | [DDIM Noise Comparative Analysis Pipeline](#ddim-noise-comparative-analysis-pipeline)     | - |              [Aengus (Duc-Anh)](https://github.com/aengusng8) |
+| CLIP Guided Img2Img Stable Diffusion Pipeline                                                                                         | Doing CLIP guidance for image to image generation with Stable Diffusion                                                                                                                                                                                                                                                                                                                                                                                                                                  | [CLIP Guided Img2Img Stable Diffusion](#clip-guided-img2img-stable-diffusion)             | - |               [Nipun Jindal](https://github.com/nipunjindal/) |
+| TensorRT Stable Diffusion Text to Image Pipeline                                                                                                    | Accelerates the Stable Diffusion Text2Image Pipeline using TensorRT                                                                                                                                                                                                                                                                                                                                                                                                                                      | [TensorRT Stable Diffusion Text to Image Pipeline](#tensorrt-text2image-stable-diffusion-pipeline)      | - |              [Asfiya Baig](https://github.com/asfiyab-nvidia) |
+| EDICT Image Editing Pipeline                                                                                                          | Diffusion pipeline for text-guided image editing                                                                                                                                                                                                                                                                                                                                                                                                                                                         | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline)                             | - |                    [Joqsan Azocar](https://github.com/Joqsan) |
+| Stable Diffusion RePaint                                                                                                              | Stable Diffusion pipeline using [RePaint](https://arxiv.org/abs/2201.09865) for inpainting.                                                                                                                                                                                                                                                                                                                                                                                                               | [Stable Diffusion RePaint](#stable-diffusion-repaint )                                    | - |                  [Markus Pobitzer](https://github.com/Markus-Pobitzer) |
+| TensorRT Stable Diffusion Image to Image Pipeline                                                                                                    | Accelerates the Stable Diffusion Image2Image Pipeline using TensorRT                                                                                                                                                                                                                                                                                                                                                                                                                                      | [TensorRT Stable Diffusion Image to Image Pipeline](#tensorrt-image2image-stable-diffusion-pipeline)      | - |              [Asfiya Baig](https://github.com/asfiyab-nvidia) |
+| Stable Diffusion IPEX Pipeline | Accelerate Stable Diffusion inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion on IPEX](#stable-diffusion-on-ipex) | - | [Yingjie Han](https://github.com/yingjie-han/) |
+| CLIP Guided Images Mixing Stable Diffusion Pipeline | Сombine images using usual diffusion models. | [CLIP Guided Images Mixing Using Stable Diffusion](#clip-guided-images-mixing-with-stable-diffusion) | - | [Karachev Denis](https://github.com/TheDenk) |
+| TensorRT Stable Diffusion Inpainting Pipeline                                                                                                    | Accelerates the Stable Diffusion Inpainting Pipeline using TensorRT                                                                                                                                                                                                                                                                                                                                                                                                                                      | [TensorRT Stable Diffusion Inpainting Pipeline](#tensorrt-inpainting-stable-diffusion-pipeline)      | - |              [Asfiya Baig](https://github.com/asfiyab-nvidia) |
+|   IADB Pipeline                                                                                                    | Implementation of [Iterative α-(de)Blending: a Minimalist Deterministic Diffusion Model](https://arxiv.org/abs/2305.03486)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [IADB Pipeline](#iadb-pipeline)      | - |              [Thomas Chambon](https://github.com/tchambon)
+|   Zero1to3 Pipeline                                                                                                    | Implementation of [Zero-1-to-3: Zero-shot One Image to 3D Object](https://arxiv.org/abs/2303.11328)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Zero1to3 Pipeline](#zero1to3-pipeline)      | - |              [Xin Kong](https://github.com/kxhit) |
+| Stable Diffusion XL Long Weighted Prompt Pipeline | A pipeline support unlimited length of prompt and negative prompt, use A1111 style of prompt weighting | [Stable Diffusion XL Long Weighted Prompt Pipeline](#stable-diffusion-xl-long-weighted-prompt-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1LsqilswLR40XLLcp6XFOl5nKb_wOe26W?usp=sharing) | [Andrew Zhu](https://xhinker.medium.com/) |
+| FABRIC - Stable Diffusion with feedback Pipeline | pipeline supports feedback from liked and disliked images | [Stable Diffusion Fabric Pipeline](#stable-diffusion-fabric-pipeline) | - | [Shauray Singh](https://shauray8.github.io/about_shauray/) |
+| sketch inpaint - Inpainting with non-inpaint Stable Diffusion | sketch inpaint much like in automatic1111 | [Masked Im2Im Stable Diffusion Pipeline](#stable-diffusion-masked-im2im) | - | [Anatoly Belikov](https://github.com/noskill) |
+| sketch inpaint xl - Inpainting with non-inpaint Stable Diffusion | sketch inpaint much like in automatic1111 | [Masked Im2Im Stable Diffusion XL Pipeline](#stable-diffusion-xl-masked-im2im) | - | [Anatoly Belikov](https://github.com/noskill) |
+| prompt-to-prompt | change parts of a prompt and retain image structure (see [paper page](https://prompt-to-prompt.github.io/)) | [Prompt2Prompt Pipeline](#prompt2prompt-pipeline) | - | [Umer H. Adil](https://twitter.com/UmerHAdil) |
+|   Latent Consistency Pipeline                                                                                                    | Implementation of [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference](https://arxiv.org/abs/2310.04378)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Latent Consistency Pipeline](#latent-consistency-pipeline)      | - |              [Simian Luo](https://github.com/luosiallen) |
+|   Latent Consistency Img2img Pipeline                                                                                                    | Img2img pipeline for Latent Consistency Models                                                                                                                                                                                                                                                                                                                                                                                                                                    | [Latent Consistency Img2Img Pipeline](#latent-consistency-img2img-pipeline)      | - |              [Logan Zoellner](https://github.com/nagolinc) |
+|   Latent Consistency Interpolation Pipeline                                                                                                    | Interpolate the latent space of Latent Consistency Models with multiple prompts                                                                                                                                                                                                                                                                                                                                                                                                                                    | [Latent Consistency Interpolation Pipeline](#latent-consistency-interpolation-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1pK3NrLWJSiJsBynLns1K1-IDTW9zbPvl?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
+| SDE Drag Pipeline                                                                                                                         | The pipeline supports drag editing of images using stochastic differential equations                                                                                                                                                                                                                                                                                                                                                                                                                | [SDE Drag Pipeline](#sde-drag-pipeline)                                                     | - | [NieShen](https://github.com/NieShenRuc) [Fengqi Zhu](https://github.com/Monohydroxides) |
+|   Regional Prompting Pipeline                                                                                               | Assign multiple prompts for different regions                                                                                                                                                                                                                                                                                                                                                    |  [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) |
+| LDM3D-sr (LDM3D upscaler)                                                                                                             | Upscale low resolution RGB and depth inputs to high resolution                                                                                                                                                                                                                                                                                                                                                                                                                              | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline)                                                                             | -                                                                                                                                                                                                             |                                                        [Estelle Aflalo](https://github.com/estelleafl) |
+| AnimateDiff ControlNet Pipeline                                                                                                    | Combines AnimateDiff with precise motion control using ControlNets                                                                                                                                                                                                                                                                                                                                                                                                                                    | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) |
+|   DemoFusion Pipeline                                                                                                    | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [DemoFusion Pipeline](#demofusion)      | - |              [Ruoyi Du](https://github.com/RuoyiDu) |
+|   Instaflow Pipeline                                                                                                    | Implementation of [InstaFlow! One-Step Stable Diffusion with Rectified Flow](https://arxiv.org/abs/2309.06380)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Instaflow Pipeline](#instaflow-pipeline)      | - |              [Ayush Mangal](https://github.com/ayushtues) |
+|   Null-Text Inversion Pipeline  | Implement [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/abs/2211.09794) as a pipeline.                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Null-Text Inversion](https://github.com/google/prompt-to-prompt/)      | - |              [Junsheng Luan](https://github.com/Junsheng121) |
+|   Rerender A Video Pipeline                                                                                                    | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Rerender A Video Pipeline](#rerender-a-video)      | - |              [Yifan Zhou](https://github.com/SingleZombie) |
+| StyleAligned Pipeline                                                                                                    | Implementation of [Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133)                                                                                                                                                                                                                                                                                                                                                                                                                                   | [StyleAligned Pipeline](#stylealigned-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/15X2E0jFPTajUIjS0FzX50OaHsCbP2lQ0/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
+| AnimateDiff Image-To-Video Pipeline | Experimental Image-To-Video support for AnimateDiff (open to improvements) | [AnimateDiff Image To Video Pipeline](#animatediff-image-to-video-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/1TvzCDPHhfFtdcJZe4RLloAwyoLKuttWK/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
+|   IP Adapter FaceID Stable Diffusion                                                                                               | Stable Diffusion Pipeline that supports IP Adapter Face ID                                                                                                                                                                                                                                                                                                                                                  |  [IP Adapter Face ID](#ip-adapter-face-id) | - | [Fabio Rigano](https://github.com/fabiorigano) |
+|   InstantID Pipeline                                                                                               | Stable Diffusion XL Pipeline that supports InstantID                                                                                                                                                                                                                                                                                                                                                 |  [InstantID Pipeline](#instantid-pipeline) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/InstantX/InstantID) | [Haofan Wang](https://github.com/haofanwang) |
+|   UFOGen Scheduler                                                                                               | Scheduler for UFOGen Model (compatible with Stable Diffusion pipelines)                                                                                                                                                                                                                                                                                                                                                 |  [UFOGen Scheduler](#ufogen-scheduler) | - | [dg845](https://github.com/dg845) |
+| Stable Diffusion XL IPEX Pipeline | Accelerate Stable Diffusion XL inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion XL on IPEX](#stable-diffusion-xl-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
+| Stable Diffusion BoxDiff Pipeline | Training-free controlled generation with bounding boxes using [BoxDiff](https://github.com/showlab/BoxDiff) | [Stable Diffusion BoxDiff Pipeline](#stable-diffusion-boxdiff) | - | [Jingyang Zhang](https://github.com/zjysteven/) |
+|   FRESCO V2V Pipeline                                                                                                    | Implementation of [[CVPR 2024] FRESCO: Spatial-Temporal Correspondence for Zero-Shot Video Translation](https://arxiv.org/abs/2403.12962)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [FRESCO V2V Pipeline](#fresco)      | - |              [Yifan Zhou](https://github.com/SingleZombie) |
+| AnimateDiff IPEX Pipeline | Accelerate AnimateDiff inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [AnimateDiff on IPEX](#animatediff-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
+| HunyuanDiT Differential Diffusion Pipeline | Applies [Differential Diffsuion](https://github.com/exx8/differential-diffusion) to [HunyuanDiT](https://github.com/huggingface/diffusers/pull/8240). | [HunyuanDiT with Differential Diffusion](#hunyuandit-with-differential-diffusion) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1v44a5fpzyr4Ffr4v2XBQ7BajzG874N4P?usp=sharing) | [Monjoy Choudhury](https://github.com/MnCSSJ4x) |
+
+To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
+
+```py
+pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", custom_pipeline="filename_in_the_community_folder")
+```
+
+## Example usages
+
+### Flux with CFG
+
+Know more about Flux [here](https://blackforestlabs.ai/announcing-black-forest-labs/). Since Flux doesn't use CFG, this implementation provides one, inspired by the [PuLID Flux adaptation](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md).  
+
+Example usage:
+
+```py
+from diffusers import DiffusionPipeline
+import torch 
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-dev", 
+    torch_dtype=torch.bfloat16, 
+    custom_pipeline="pipeline_flux_with_cfg"
+)
+pipeline.enable_model_cpu_offload()
+prompt = "a watercolor painting of a unicorn"
+negative_prompt = "pink"
+
+img = pipeline(
+    prompt=prompt, 
+    negative_prompt=negative_prompt, 
+    true_cfg=1.5, 
+    guidance_scale=3.5, 
+    num_images_per_prompt=1,
+    generator=torch.manual_seed(0)
+).images[0]
+img.save("cfg_flux.png")
+```
+
+### Differential Diffusion
+
+**Eran Levin, Ohad Fried**
+
+**Tel Aviv University, Reichman University**
+
+Diffusion models have revolutionized image generation and editing, producing state-of-the-art results in conditioned and unconditioned image synthesis. While current techniques enable user control over the degree of change in an image edit, the controllability is limited to global changes over an entire edited region. This paper introduces a novel framework that enables customization of the amount of change per pixel or per image region. Our framework can be integrated into any existing diffusion model, enhancing it with this capability. Such granular control on the quantity of change opens up a diverse array of new editing capabilities, such as control of the extent to which individual objects are modified, or the ability to introduce gradual spatial changes. Furthermore, we showcase the framework's effectiveness in soft-inpainting---the completion of portions of an image while subtly adjusting the surrounding areas to ensure seamless integration. Additionally, we introduce a new tool for exploring the effects of different change quantities. Our framework operates solely during inference, requiring no model training or fine-tuning. We demonstrate our method with the current open state-of-the-art models, and validate it via both quantitative and qualitative comparisons, and a user study.
+
+![teaser-img](https://github.com/exx8/differential-diffusion/raw/main/assets/teaser.png)
+
+You can find additional information about Differential Diffusion in the [paper](https://differential-diffusion.github.io/paper.pdf) or in the [project website](https://differential-diffusion.github.io/).
+
+#### Usage example
+
+```python
+import torch
+from torchvision import transforms
+
+from diffusers import DPMSolverMultistepScheduler
+from diffusers.utils import load_image
+from examples.community.pipeline_stable_diffusion_xl_differential_img2img import (
+    StableDiffusionXLDifferentialImg2ImgPipeline,
+)
+
+
+pipeline = StableDiffusionXLDifferentialImg2ImgPipeline.from_pretrained(
+    "SG161222/RealVisXL_V4.0", torch_dtype=torch.float16, variant="fp16"
+).to("cuda")
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, use_karras_sigmas=True)
+
+
+def preprocess_image(image):
+    image = image.convert("RGB")
+    image = transforms.CenterCrop((image.size[1] // 64 * 64, image.size[0] // 64 * 64))(image)
+    image = transforms.ToTensor()(image)
+    image = image * 2 - 1
+    image = image.unsqueeze(0).to("cuda")
+    return image
+
+
+def preprocess_map(map):
+    map = map.convert("L")
+    map = transforms.CenterCrop((map.size[1] // 64 * 64, map.size[0] // 64 * 64))(map)
+    map = transforms.ToTensor()(map)
+    map = map.to("cuda")
+    return map
+
+
+image = preprocess_image(
+    load_image(
+        "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/20240329211129_4024911930.png?download=true"
+    )
+)
+
+mask = preprocess_map(
+    load_image(
+        "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/gradient_mask.png?download=true"
+    )
+)
+
+prompt = "a green pear"
+negative_prompt = "blurry"
+
+image = pipeline(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    guidance_scale=7.5,
+    num_inference_steps=25,
+    original_image=image,
+    image=image,
+    strength=1.0,
+    map=mask,
+).images[0]
+
+image.save("result.png")
+```
+
+### HD-Painter
+
+Implementation of [HD-Painter: High-Resolution and Prompt-Faithful Text-Guided Image Inpainting with Diffusion Models](https://arxiv.org/abs/2312.14091).
+
+![teaser-img](https://raw.githubusercontent.com/Picsart-AI-Research/HD-Painter/main/__assets__/github/teaser.jpg)
+
+The abstract from the paper is:
+
+Recent progress in text-guided image inpainting, based on the unprecedented success of text-to-image diffusion models, has led to exceptionally realistic and visually plausible results.
+However, there is still significant potential for improvement in current text-to-image inpainting models, particularly in better aligning the inpainted area with user prompts and performing high-resolution inpainting.
+Therefore, in this paper we introduce _HD-Painter_, a completely **training-free** approach that **accurately follows to prompts** and coherently **scales to high-resolution** image inpainting.
+To this end, we design the _Prompt-Aware Introverted Attention (PAIntA)_ layer enhancing self-attention scores by prompt information and resulting in better text alignment generations.
+To further improve the prompt coherence we introduce the _Reweighting Attention Score Guidance (RASG)_ mechanism seamlessly integrating a post-hoc sampling strategy into general form of DDIM to prevent out-of-distribution latent shifts.
+Moreover, HD-Painter allows extension to larger scales by introducing a specialized super-resolution technique customized for inpainting, enabling the completion of missing regions in images of up to 2K resolution.
+Our experiments demonstrate that HD-Painter surpasses existing state-of-the-art approaches qualitatively and quantitatively, achieving an impressive generation accuracy improvement of **61.4** vs **51.9**.
+We will make the codes publicly available.
+
+You can find additional information about Text2Video-Zero in the [paper](https://arxiv.org/abs/2312.14091) or the [original codebase](https://github.com/Picsart-AI-Research/HD-Painter).
+
+#### Usage example
+
+```python
+import torch
+from diffusers import DiffusionPipeline, DDIMScheduler
+from diffusers.utils import load_image, make_image_grid
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-inpainting",
+    custom_pipeline="hd_painter"
+)
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+
+prompt = "wooden boat"
+init_image = load_image("https://raw.githubusercontent.com/Picsart-AI-Research/HD-Painter/main/__assets__/samples/images/2.jpg")
+mask_image = load_image("https://raw.githubusercontent.com/Picsart-AI-Research/HD-Painter/main/__assets__/samples/masks/2.png")
+
+image = pipe(prompt, init_image, mask_image, use_rasg=True, use_painta=True, generator=torch.manual_seed(12345)).images[0]
+
+make_image_grid([init_image, mask_image, image], rows=1, cols=3)
+```
+
+### Marigold Depth Estimation
+
+Marigold is a universal monocular depth estimator that delivers accurate and sharp predictions in the wild. Based on Stable Diffusion, it is trained exclusively with synthetic depth data and excels in zero-shot adaptation to real-world imagery. This pipeline is an official implementation of the inference process. More details can be found on our [project page](https://marigoldmonodepth.github.io) and [full codebase](https://github.com/prs-eth/marigold) (also implemented with diffusers).
+
+![Marigold Teaser](https://marigoldmonodepth.github.io/images/teaser_collage_compressed.jpg)
+
+This depth estimation pipeline processes a single input image through multiple diffusion denoising stages to estimate depth maps. These maps are subsequently merged to produce the final output. Below is an example code snippet, including optional arguments:
+
+```python
+import numpy as np
+import torch
+from PIL import Image
+from diffusers import DiffusionPipeline
+from diffusers.utils import load_image
+
+# Original DDIM version (higher quality)
+pipe = DiffusionPipeline.from_pretrained(
+    "prs-eth/marigold-v1-0",
+    custom_pipeline="marigold_depth_estimation"
+    # torch_dtype=torch.float16,                # (optional) Run with half-precision (16-bit float).
+    # variant="fp16",                           # (optional) Use with `torch_dtype=torch.float16`, to directly load fp16 checkpoint
+)
+
+# (New) LCM version (faster speed)
+pipe = DiffusionPipeline.from_pretrained(
+    "prs-eth/marigold-depth-lcm-v1-0",
+    custom_pipeline="marigold_depth_estimation"
+    # torch_dtype=torch.float16,                # (optional) Run with half-precision (16-bit float).
+    # variant="fp16",                           # (optional) Use with `torch_dtype=torch.float16`, to directly load fp16 checkpoint
+)
+
+pipe.to("cuda")
+
+img_path_or_url = "https://share.phys.ethz.ch/~pf/bingkedata/marigold/pipeline_example.jpg"
+image: Image.Image = load_image(img_path_or_url)
+
+pipeline_output = pipe(
+    image,                    # Input image.
+    # ----- recommended setting for DDIM version -----
+    # denoising_steps=10,     # (optional) Number of denoising steps of each inference pass. Default: 10.
+    # ensemble_size=10,       # (optional) Number of inference passes in the ensemble. Default: 10.
+    # ------------------------------------------------
+
+    # ----- recommended setting for LCM version ------
+    # denoising_steps=4,
+    # ensemble_size=5,
+    # -------------------------------------------------
+
+    # processing_res=768,     # (optional) Maximum resolution of processing. If set to 0: will not resize at all. Defaults to 768.
+    # match_input_res=True,   # (optional) Resize depth prediction to match input resolution.
+    # batch_size=0,           # (optional) Inference batch size, no bigger than `num_ensemble`. If set to 0, the script will automatically decide the proper batch size. Defaults to 0.
+    # seed=2024,              # (optional) Random seed can be set to ensure additional reproducibility. Default: None (unseeded). Note: forcing --batch_size 1 helps to increase reproducibility. To ensure full reproducibility, deterministic mode needs to be used.
+    # color_map="Spectral",   # (optional) Colormap used to colorize the depth map. Defaults to "Spectral". Set to `None` to skip colormap generation.
+    # show_progress_bar=True, # (optional) If true, will show progress bars of the inference progress.
+)
+
+depth: np.ndarray = pipeline_output.depth_np                    # Predicted depth map
+depth_colored: Image.Image = pipeline_output.depth_colored      # Colorized prediction
+
+# Save as uint16 PNG
+depth_uint16 = (depth * 65535.0).astype(np.uint16)
+Image.fromarray(depth_uint16).save("./depth_map.png", mode="I;16")
+
+# Save colorized depth map
+depth_colored.save("./depth_colored.png")
+```
+
+### LLM-grounded Diffusion
+
+LMD and LMD+ greatly improves the prompt understanding ability of text-to-image generation models by introducing an LLM as a front-end prompt parser and layout planner. It improves spatial reasoning, the understanding of negation, attribute binding, generative numeracy, etc. in a unified manner without explicitly aiming for each. LMD is completely training-free (i.e., uses SD model off-the-shelf). LMD+ takes in additional adapters for better control. This is a reproduction of LMD+ model used in our work. [Project page.](https://llm-grounded-diffusion.github.io/) [See our full codebase (also with diffusers).](https://github.com/TonyLianLong/LLM-groundedDiffusion)
+
+![Main Image](https://llm-grounded-diffusion.github.io/main_figure.jpg)
+![Visualizations: Enhanced Prompt Understanding](https://llm-grounded-diffusion.github.io/visualizations.jpg)
+
+This pipeline can be used with an LLM or on its own. We provide a parser that parses LLM outputs to the layouts. You can obtain the prompt to input to the LLM for layout generation [here](https://github.com/TonyLianLong/LLM-groundedDiffusion/blob/main/prompt.py). After feeding the prompt to an LLM (e.g., GPT-4 on ChatGPT website), you can feed the LLM response into our pipeline.
+
+The following code has been tested on 1x RTX 4090, but it should also support GPUs with lower GPU memory.
+
+#### Use this pipeline with an LLM
+
+```python
+import torch
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "longlian/lmd_plus",
+    custom_pipeline="llm_grounded_diffusion",
+    custom_revision="main",
+    variant="fp16", torch_dtype=torch.float16
+)
+pipe.enable_model_cpu_offload()
+
+# Generate directly from a text prompt and an LLM response
+prompt = "a waterfall and a modern high speed train in a beautiful forest with fall foliage"
+phrases, boxes, bg_prompt, neg_prompt = pipe.parse_llm_response("""
+[('a waterfall', [71, 105, 148, 258]), ('a modern high speed train', [255, 223, 181, 149])]
+Background prompt: A beautiful forest with fall foliage
+Negative prompt:
+""")
+
+images = pipe(
+    prompt=prompt,
+    negative_prompt=neg_prompt,
+    phrases=phrases,
+    boxes=boxes,
+    gligen_scheduled_sampling_beta=0.4,
+    output_type="pil",
+    num_inference_steps=50,
+    lmd_guidance_kwargs={}
+).images
+
+images[0].save("./lmd_plus_generation.jpg")
+```
+
+#### Use this pipeline on its own for layout generation
+
+```python
+import torch
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "longlian/lmd_plus",
+    custom_pipeline="llm_grounded_diffusion",
+    variant="fp16", torch_dtype=torch.float16
+)
+pipe.enable_model_cpu_offload()
+
+# Generate an image described by the prompt and
+# insert objects described by text at the region defined by bounding boxes
+prompt = "a waterfall and a modern high speed train in a beautiful forest with fall foliage"
+boxes = [[0.1387, 0.2051, 0.4277, 0.7090], [0.4980, 0.4355, 0.8516, 0.7266]]
+phrases = ["a waterfall", "a modern high speed train"]
+
+images = pipe(
+    prompt=prompt,
+    phrases=phrases,
+    boxes=boxes,
+    gligen_scheduled_sampling_beta=0.4,
+    output_type="pil",
+    num_inference_steps=50,
+    lmd_guidance_kwargs={}
+).images
+
+images[0].save("./lmd_plus_generation.jpg")
+```
+
+### CLIP Guided Stable Diffusion
+
+CLIP guided stable diffusion can help to generate more realistic images
+by guiding stable diffusion at every denoising step with an additional CLIP model.
+
+The following code requires roughly 12GB of GPU RAM.
+
+```python
+from diffusers import DiffusionPipeline
+from transformers import CLIPImageProcessor, CLIPModel
+import torch
+
+
+feature_extractor = CLIPImageProcessor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
+clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16)
+
+
+guided_pipeline = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    custom_pipeline="clip_guided_stable_diffusion",
+    clip_model=clip_model,
+    feature_extractor=feature_extractor,
+    torch_dtype=torch.float16,
+)
+guided_pipeline.enable_attention_slicing()
+guided_pipeline = guided_pipeline.to("cuda")
+
+prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece"
+
+generator = torch.Generator(device="cuda").manual_seed(0)
+images = []
+for i in range(4):
+    image = guided_pipeline(
+        prompt,
+        num_inference_steps=50,
+        guidance_scale=7.5,
+        clip_guidance_scale=100,
+        num_cutouts=4,
+        use_cutouts=False,
+        generator=generator,
+    ).images[0]
+    images.append(image)
+
+# save images locally
+for i, img in enumerate(images):
+    img.save(f"./clip_guided_sd/image_{i}.png")
+```
+
+The `images` list contains a list of PIL images that can be saved locally or displayed directly in a google colab.
+Generated images tend to be of higher quality than natively using stable diffusion. E.g. the above script generates the following images:
+
+![clip_guidance](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/clip_guidance/merged_clip_guidance.jpg).
+
+### One Step Unet
+
+The dummy "one-step-unet" can be run as follows:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet")
+pipe()
+```
+
+**Note**: This community pipeline is not useful as a feature, but rather just serves as an example of how community pipelines can be added (see <https://github.com/huggingface/diffusers/issues/841>).
+
+### Stable Diffusion Interpolation
+
+The following code can be run on a GPU of at least 8GB VRAM and should take approximately 5 minutes.
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    variant='fp16',
+    torch_dtype=torch.float16,
+    safety_checker=None,  # Very important for videos...lots of false positives while interpolating
+    custom_pipeline="interpolate_stable_diffusion",
+).to('cuda')
+pipe.enable_attention_slicing()
+
+frame_filepaths = pipe.walk(
+    prompts=['a dog', 'a cat', 'a horse'],
+    seeds=[42, 1337, 1234],
+    num_interpolation_steps=16,
+    output_dir='./dreams',
+    batch_size=4,
+    height=512,
+    width=512,
+    guidance_scale=8.5,
+    num_inference_steps=50,
+)
+```
+
+The output of the `walk(...)` function returns a list of images saved under the folder as defined in `output_dir`. You can use these images to create videos of stable diffusion.
+
+> **Please have a look at <https://github.com/nateraw/stable-diffusion-videos> for more in-detail information on how to create videos using stable diffusion as well as more feature-complete functionality.**
+
+### Stable Diffusion Mega
+
+The Stable Diffusion Mega Pipeline lets you use the main use cases of the stable diffusion pipeline in a single class.
+
+```python
+#!/usr/bin/env python3
+from diffusers import DiffusionPipeline
+import PIL
+import requests
+from io import BytesIO
+import torch
+
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_mega", torch_dtype=torch.float16, variant="fp16")
+pipe.to("cuda")
+pipe.enable_attention_slicing()
+
+
+### Text-to-Image
+images = pipe.text2img("An astronaut riding a horse").images
+
+### Image-to-Image
+init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg")
+
+prompt = "A fantasy landscape, trending on artstation"
+
+images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+
+### Inpainting
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+prompt = "a cat sitting on a bench"
+images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
+```
+
+As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
+
+### Long Prompt Weighting Stable Diffusion
+
+Features of this custom pipeline:
+
+- Input a prompt without the 77 token length limit.
+- Includes tx2img, img2img, and inpainting pipelines.
+- Emphasize/weigh part of your prompt with parentheses as so: `a baby deer with (big eyes)`
+- De-emphasize part of your prompt as so: `a [baby] deer with big eyes`
+- Precisely weigh part of your prompt as so: `a baby deer with (big eyes:1.3)`
+
+Prompt weighting equivalents:
+
+- `a baby deer with` == `(a baby deer with:1.0)`
+- `(big eyes)` == `(big eyes:1.1)`
+- `((big eyes))` == `(big eyes:1.21)`
+- `[big eyes]` == `(big eyes:0.91)`
+
+You can run this custom pipeline as so:
+
+#### PyTorch
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    'hakurei/waifu-diffusion',
+    custom_pipeline="lpw_stable_diffusion",
+    torch_dtype=torch.float16
+)
+pipe = pipe.to("cuda")
+
+prompt = "best_quality (1girl:1.3) bow bride brown_hair closed_mouth frilled_bow frilled_hair_tubes frills (full_body:1.3) fox_ear hair_bow hair_tubes happy hood japanese_clothes kimono long_sleeves red_bow smile solo tabi uchikake white_kimono wide_sleeves cherry_blossoms"
+neg_prompt = "lowres, bad_anatomy, error_body, error_hair, error_arm, error_hands, bad_hands, error_fingers, bad_fingers, missing_fingers, error_legs, bad_legs, multiple_legs, missing_legs, error_lighting, error_shadow, error_reflection, text, error, extra_digit, fewer_digits, cropped, worst_quality, low_quality, normal_quality, jpeg_artifacts, signature, watermark, username, blurry"
+
+pipe.text2img(prompt, negative_prompt=neg_prompt, width=512, height=512, max_embeddings_multiples=3).images[0]
+```
+
+#### onnxruntime
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    'CompVis/stable-diffusion-v1-4',
+    custom_pipeline="lpw_stable_diffusion_onnx",
+    revision="onnx",
+    provider="CUDAExecutionProvider"
+)
+
+prompt = "a photo of an astronaut riding a horse on mars, best quality"
+neg_prompt = "lowres, bad anatomy, error body, error hair, error arm, error hands, bad hands, error fingers, bad fingers, missing fingers, error legs, bad legs, multiple legs, missing legs, error lighting, error shadow, error reflection, text, error, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
+
+pipe.text2img(prompt, negative_prompt=neg_prompt, width=512, height=512, max_embeddings_multiples=3).images[0]
+```
+
+If you see `Token indices sequence length is longer than the specified maximum sequence length for this model ( *** > 77 ) . Running this sequence through the model will result in indexing errors`. Do not worry, it is normal.
+
+### Speech to Image
+
+The following code can generate an image from an audio sample using pre-trained OpenAI whisper-small and Stable Diffusion.
+
+```Python
+import torch
+
+import matplotlib.pyplot as plt
+from datasets import load_dataset
+from diffusers import DiffusionPipeline
+from transformers import (
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+)
+
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+audio_sample = ds[3]
+
+text = audio_sample["text"].lower()
+speech_data = audio_sample["audio"]["array"]
+
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+
+diffuser_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="speech_to_image_diffusion",
+    speech_model=model,
+    speech_processor=processor,
+    torch_dtype=torch.float16,
+)
+
+diffuser_pipeline.enable_attention_slicing()
+diffuser_pipeline = diffuser_pipeline.to(device)
+
+output = diffuser_pipeline(speech_data)
+plt.imshow(output.images[0])
+```
+
+This example produces the following image:
+
+![image](https://user-images.githubusercontent.com/45072645/196901736-77d9c6fc-63ee-4072-90b0-dc8b903d63e3.png)
+
+### Wildcard Stable Diffusion
+
+Following the great examples from <https://github.com/jtkelm2/stable-diffusion-webui-1/blob/master/scripts/wildcards.py> and <https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Custom-Scripts#wildcards>, here's a minimal implementation that allows for users to add "wildcards", denoted by `__wildcard__` to prompts that are used as placeholders for randomly sampled values given by either a dictionary or a `.txt` file. For example:
+
+Say we have a prompt:
+
+```
+prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
+```
+
+We can then define possible values to be sampled for `animal`, `object`, and `clothing`. These can either be from a `.txt` with the same name as the category.
+
+The possible values can also be defined / combined by using a dictionary like: `{"animal":["dog", "cat", mouse"]}`.
+
+The actual pipeline works just like `StableDiffusionPipeline`, except the `__call__` method takes in:
+
+`wildcard_files`: list of file paths for wild card replacement
+`wildcard_option_dict`: dict with key as `wildcard` and values as a list of possible replacements
+`num_prompt_samples`: number of prompts to sample, uniformly sampling wildcards
+
+A full example:
+
+create `animal.txt`, with contents like:
+
+```
+dog
+cat
+mouse
+```
+
+create `object.txt`, with contents like:
+
+```
+chair
+sofa
+bench
+```
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="wildcard_stable_diffusion",
+    torch_dtype=torch.float16,
+)
+prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
+out = pipe(
+    prompt,
+    wildcard_option_dict={
+        "clothing":["hat", "shirt", "scarf", "beret"]
+    },
+    wildcard_files=["object.txt", "animal.txt"],
+    num_prompt_samples=1
+)
+```
+
+### Composable Stable diffusion
+
+[Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) proposes conjunction and negation (negative prompts) operators for compositional generation with conditional diffusion models.
+
+```python
+import torch as th
+import numpy as np
+import torchvision.utils as tvu
+
+from diffusers import DiffusionPipeline
+
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--prompt", type=str, default="mystical trees | A magical pond | dark",
+                    help="use '|' as the delimiter to compose separate sentences.")
+parser.add_argument("--steps", type=int, default=50)
+parser.add_argument("--scale", type=float, default=7.5)
+parser.add_argument("--weights", type=str, default="7.5 | 7.5 | -7.5")
+parser.add_argument("--seed", type=int, default=2)
+parser.add_argument("--model_path", type=str, default="CompVis/stable-diffusion-v1-4")
+parser.add_argument("--num_images", type=int, default=1)
+args = parser.parse_args()
+
+has_cuda = th.cuda.is_available()
+device = th.device('cpu' if not has_cuda else 'cuda')
+
+prompt = args.prompt
+scale = args.scale
+steps = args.steps
+
+pipe = DiffusionPipeline.from_pretrained(
+    args.model_path,
+    custom_pipeline="composable_stable_diffusion",
+).to(device)
+
+pipe.safety_checker = None
+
+images = []
+generator = th.Generator("cuda").manual_seed(args.seed)
+for i in range(args.num_images):
+    image = pipe(prompt, guidance_scale=scale, num_inference_steps=steps,
+                 weights=args.weights, generator=generator).images[0]
+    images.append(th.from_numpy(np.array(image)).permute(2, 0, 1) / 255.)
+grid = tvu.make_grid(th.stack(images, dim=0), nrow=4, padding=0)
+tvu.save_image(grid, f'{prompt}_{args.weights}' + '.png')
+```
+
+### Imagic Stable Diffusion
+
+Allows you to edit an image using stable diffusion.
+
+```python
+import requests
+from PIL import Image
+from io import BytesIO
+import torch
+import os
+from diffusers import DiffusionPipeline, DDIMScheduler
+
+has_cuda = torch.cuda.is_available()
+device = torch.device('cpu' if not has_cuda else 'cuda')
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    safety_checker=None,
+    custom_pipeline="imagic_stable_diffusion",
+    scheduler=DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
+).to(device)
+generator = torch.Generator("cuda").manual_seed(0)
+seed = 0
+prompt = "A photo of Barack Obama smiling with a big grin"
+url = 'https://www.dropbox.com/s/6tlwzr73jd1r9yk/obama.png?dl=1'
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+res = pipe.train(
+    prompt,
+    image=init_image,
+    generator=generator)
+res = pipe(alpha=1, guidance_scale=7.5, num_inference_steps=50)
+os.makedirs("imagic", exist_ok=True)
+image = res.images[0]
+image.save('./imagic/imagic_image_alpha_1.png')
+res = pipe(alpha=1.5, guidance_scale=7.5, num_inference_steps=50)
+image = res.images[0]
+image.save('./imagic/imagic_image_alpha_1_5.png')
+res = pipe(alpha=2, guidance_scale=7.5, num_inference_steps=50)
+image = res.images[0]
+image.save('./imagic/imagic_image_alpha_2.png')
+```
+
+### Seed Resizing
+
+Test seed resizing. Originally generate an image in 512 by 512, then generate image with same seed at 512 by 592 using seed resizing. Finally, generate 512 by 592 using original stable diffusion pipeline.
+
+```python
+import torch as th
+import numpy as np
+from diffusers import DiffusionPipeline
+
+has_cuda = th.cuda.is_available()
+device = th.device('cpu' if not has_cuda else 'cuda')
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="seed_resize_stable_diffusion"
+).to(device)
+
+def dummy(images, **kwargs):
+    return images, False
+
+pipe.safety_checker = dummy
+
+
+images = []
+th.manual_seed(0)
+generator = th.Generator("cuda").manual_seed(0)
+
+seed = 0
+prompt = "A painting of a futuristic cop"
+
+width = 512
+height = 512
+
+res = pipe(
+    prompt,
+    guidance_scale=7.5,
+    num_inference_steps=50,
+    height=height,
+    width=width,
+    generator=generator)
+image = res.images[0]
+image.save('./seed_resize/seed_resize_{w}_{h}_image.png'.format(w=width, h=height))
+
+
+th.manual_seed(0)
+generator = th.Generator("cuda").manual_seed(0)
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="/home/mark/open_source/diffusers/examples/community/"
+).to(device)
+
+width = 512
+height = 592
+
+res = pipe(
+    prompt,
+    guidance_scale=7.5,
+    num_inference_steps=50,
+    height=height,
+    width=width,
+    generator=generator)
+image = res.images[0]
+image.save('./seed_resize/seed_resize_{w}_{h}_image.png'.format(w=width, h=height))
+
+pipe_compare = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="/home/mark/open_source/diffusers/examples/community/"
+).to(device)
+
+res = pipe_compare(
+    prompt,
+    guidance_scale=7.5,
+    num_inference_steps=50,
+    height=height,
+    width=width,
+    generator=generator
+)
+
+image = res.images[0]
+image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, h=height))
+```
+
+### Multilingual Stable Diffusion Pipeline
+
+The following code can generate images from texts in different languages using the pre-trained [mBART-50 many-to-one multilingual machine translation model](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) and Stable Diffusion.
+
+```python
+from PIL import Image
+
+import torch
+
+from diffusers import DiffusionPipeline
+from transformers import (
+    pipeline,
+    MBart50TokenizerFast,
+    MBartForConditionalGeneration,
+)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+device_dict = {"cuda": 0, "cpu": -1}
+
+# helper function taken from: https://huggingface.co/blog/stable_diffusion
+def image_grid(imgs, rows, cols):
+    assert len(imgs) == rows*cols
+
+    w, h = imgs[0].size
+    grid = Image.new('RGB', size=(cols*w, rows*h))
+    grid_w, grid_h = grid.size
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i%cols*w, i//cols*h))
+    return grid
+
+# Add language detection pipeline
+language_detection_model_ckpt = "papluca/xlm-roberta-base-language-detection"
+language_detection_pipeline = pipeline("text-classification",
+                                       model=language_detection_model_ckpt,
+                                       device=device_dict[device])
+
+# Add model for language translation
+trans_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
+trans_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt").to(device)
+
+diffuser_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="multilingual_stable_diffusion",
+    detection_pipeline=language_detection_pipeline,
+    translation_model=trans_model,
+    translation_tokenizer=trans_tokenizer,
+    torch_dtype=torch.float16,
+)
+
+diffuser_pipeline.enable_attention_slicing()
+diffuser_pipeline = diffuser_pipeline.to(device)
+
+prompt = ["a photograph of an astronaut riding a horse",
+          "Una casa en la playa",
+          "Ein Hund, der Orange isst",
+          "Un restaurant parisien"]
+
+output = diffuser_pipeline(prompt)
+
+images = output.images
+
+grid = image_grid(images, rows=2, cols=2)
+```
+
+This example produces the following images:
+![image](https://user-images.githubusercontent.com/4313860/198328706-295824a4-9856-4ce5-8e66-278ceb42fd29.png)
+
+### GlueGen Stable Diffusion Pipeline
+
+GlueGen is a minimal adapter that allows alignment between any encoder (Text Encoder of different language, Multilingual Roberta, AudioClip) and CLIP text encoder used in standard Stable Diffusion model. This method allows easy language adaptation to available english Stable Diffusion checkpoints without the need of an image captioning dataset as well as long training hours.
+
+Make sure you downloaded `gluenet_French_clip_overnorm_over3_noln.ckpt` for French (there are also pre-trained weights for Chinese, Italian, Japanese, Spanish or train your own) at [GlueGen's official repo](https://github.com/salesforce/GlueGen/tree/main).
+
+```python
+from PIL import Image
+
+import torch
+
+from transformers import AutoModel, AutoTokenizer
+
+from diffusers import DiffusionPipeline
+
+if __name__ == "__main__":
+    device = "cuda"
+
+    lm_model_id = "xlm-roberta-large"
+    token_max_length = 77
+
+    text_encoder = AutoModel.from_pretrained(lm_model_id)
+    tokenizer = AutoTokenizer.from_pretrained(lm_model_id, model_max_length=token_max_length, use_fast=False)
+
+    tensor_norm = torch.Tensor([[43.8203],[28.3668],[27.9345],[28.0084],[28.2958],[28.2576],[28.3373],[28.2695],[28.4097],[28.2790],[28.2825],[28.2807],[28.2775],[28.2708],[28.2682],[28.2624],[28.2589],[28.2611],[28.2616],[28.2639],[28.2613],[28.2566],[28.2615],[28.2665],[28.2799],[28.2885],[28.2852],[28.2863],[28.2780],[28.2818],[28.2764],[28.2532],[28.2412],[28.2336],[28.2514],[28.2734],[28.2763],[28.2977],[28.2971],[28.2948],[28.2818],[28.2676],[28.2831],[28.2890],[28.2979],[28.2999],[28.3117],[28.3363],[28.3554],[28.3626],[28.3589],[28.3597],[28.3543],[28.3660],[28.3731],[28.3717],[28.3812],[28.3753],[28.3810],[28.3777],[28.3693],[28.3713],[28.3670],[28.3691],[28.3679],[28.3624],[28.3703],[28.3703],[28.3720],[28.3594],[28.3576],[28.3562],[28.3438],[28.3376],[28.3389],[28.3433],[28.3191]])
+
+    pipeline = DiffusionPipeline.from_pretrained(
+        "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        custom_pipeline="gluegen"
+    ).to(device)
+    pipeline.load_language_adapter("gluenet_French_clip_overnorm_over3_noln.ckpt", num_token=token_max_length, dim=1024, dim_out=768, tensor_norm=tensor_norm)
+
+    prompt = "une voiture sur la plage"
+
+    generator = torch.Generator(device=device).manual_seed(42)
+    image = pipeline(prompt, generator=generator).images[0]
+    image.save("gluegen_output_fr.png")
+```
+
+Which will produce:
+
+![output_image](https://github.com/rootonchair/diffusers/assets/23548268/db43ffb6-8667-47c1-8872-26f85dc0a57f)
+
+### Image to Image Inpainting Stable Diffusion
+
+Similar to the standard stable diffusion inpainting example, except with the addition of an `inner_image` argument.
+
+`image`, `inner_image`, and `mask` should have the same dimensions. `inner_image` should have an alpha (transparency) channel.
+
+The aim is to overlay two images, then mask out the boundary between `image` and `inner_image` to allow stable diffusion to make the connection more seamless.
+For example, this could be used to place a logo on a shirt and make it blend seamlessly.
+
+```python
+import PIL
+import torch
+
+from diffusers import DiffusionPipeline
+
+image_path = "./path-to-image.png"
+inner_image_path = "./path-to-inner-image.png"
+mask_path = "./path-to-mask.png"
+
+init_image = PIL.Image.open(image_path).convert("RGB").resize((512, 512))
+inner_image = PIL.Image.open(inner_image_path).convert("RGBA").resize((512, 512))
+mask_image = PIL.Image.open(mask_path).convert("RGB").resize((512, 512))
+
+pipe = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    custom_pipeline="img2img_inpainting",
+    torch_dtype=torch.float16
+)
+pipe = pipe.to("cuda")
+
+prompt = "Your prompt here!"
+image = pipe(prompt=prompt, image=init_image, inner_image=inner_image, mask_image=mask_image).images[0]
+```
+
+![2 by 2 grid demonstrating image to image inpainting.](https://user-images.githubusercontent.com/44398246/203506577-ec303be4-887e-4ebd-a773-c83fcb3dd01a.png)
+
+### Text Based Inpainting Stable Diffusion
+
+Use a text prompt to generate the mask for the area to be inpainted.
+Currently uses the CLIPSeg model for mask generation, then calls the standard Stable Diffusion Inpainting pipeline to perform the inpainting.
+
+```python
+from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation
+from diffusers import DiffusionPipeline
+
+from PIL import Image
+import requests
+
+processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
+model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")
+
+pipe = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    custom_pipeline="text_inpainting",
+    segmentation_model=model,
+    segmentation_processor=processor
+)
+pipe = pipe.to("cuda")
+
+
+url = "https://github.com/timojl/clipseg/blob/master/example_image.jpg?raw=true"
+image = Image.open(requests.get(url, stream=True).raw).resize((512, 512))
+text = "a glass"  # will mask out this text
+prompt = "a cup"  # the masked out region will be replaced with this
+
+image = pipe(image=image, text=text, prompt=prompt).images[0]
+```
+
+### Bit Diffusion
+
+Based <https://arxiv.org/abs/2208.04202>, this is used for diffusion on discrete data - eg, discrete image data, DNA sequence data. An unconditional discrete image can be generated like this:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="bit_diffusion")
+image = pipe().images[0]
+```
+
+### Stable Diffusion with K Diffusion
+
+Make sure you have @crowsonkb's <https://github.com/crowsonkb/k-diffusion> installed:
+
+```sh
+pip install k-diffusion
+```
+
+You can use the community pipeline as follows:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="sd_text2img_k_diffusion")
+pipe = pipe.to("cuda")
+
+prompt = "an astronaut riding a horse on mars"
+pipe.set_scheduler("sample_heun")
+generator = torch.Generator(device="cuda").manual_seed(seed)
+image = pipe(prompt, generator=generator, num_inference_steps=20).images[0]
+
+image.save("./astronaut_heun_k_diffusion.png")
+```
+
+To make sure that K Diffusion and `diffusers` yield the same results:
+
+**Diffusers**:
+
+```python
+from diffusers import DiffusionPipeline, EulerDiscreteScheduler
+
+seed = 33
+
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+
+generator = torch.Generator(device="cuda").manual_seed(seed)
+image = pipe(prompt, generator=generator, num_inference_steps=50).images[0]
+```
+
+![diffusers_euler](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/k_diffusion/astronaut_euler.png)
+
+**K Diffusion**:
+
+```python
+from diffusers import DiffusionPipeline, EulerDiscreteScheduler
+
+seed = 33
+
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="sd_text2img_k_diffusion")
+pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+
+pipe.set_scheduler("sample_euler")
+generator = torch.Generator(device="cuda").manual_seed(seed)
+image = pipe(prompt, generator=generator, num_inference_steps=50).images[0]
+```
+
+![diffusers_euler](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/k_diffusion/astronaut_euler_k_diffusion.png)
+
+### Checkpoint Merger Pipeline
+
+Based on the AUTOMATIC1111/webui for checkpoint merging. This is a custom pipeline that merges up to 3 pretrained model checkpoints as long as they are in the HuggingFace model_index.json format.
+
+The checkpoint merging is currently memory intensive as it modifies the weights of a DiffusionPipeline object in place. Expect at least 13GB RAM usage on Kaggle GPU kernels and
+on Colab you might run out of the 12GB memory even while merging two checkpoints.
+
+Usage:-
+
+```python
+from diffusers import DiffusionPipeline
+
+# Return a CheckpointMergerPipeline class that allows you to merge checkpoints.
+# The checkpoint passed here is ignored. But still pass one of the checkpoints you plan to
+# merge for convenience
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="checkpoint_merger")
+
+# There are multiple possible scenarios:
+# The pipeline with the merged checkpoints is returned in all the scenarios
+
+# Compatible checkpoints a.k.a matched model_index.json files. Ignores the meta attributes in model_index.json during comparison.( attrs with _ as prefix )
+merged_pipe = pipe.merge(["CompVis/stable-diffusion-v1-4"," CompVis/stable-diffusion-v1-2"], interp="sigmoid", alpha=0.4)
+
+# Incompatible checkpoints in model_index.json but merge might be possible. Use force=True to ignore model_index.json compatibility
+merged_pipe_1 = pipe.merge(["CompVis/stable-diffusion-v1-4", "hakurei/waifu-diffusion"], force=True, interp="sigmoid", alpha=0.4)
+
+# Three checkpoint merging. Only "add_difference" method actually works on all three checkpoints. Using any other options will ignore the 3rd checkpoint.
+merged_pipe_2 = pipe.merge(["CompVis/stable-diffusion-v1-4", "hakurei/waifu-diffusion", "prompthero/openjourney"], force=True, interp="add_difference", alpha=0.4)
+
+prompt = "An astronaut riding a horse on Mars"
+
+image = merged_pipe(prompt).images[0]
+```
+
+Some examples along with the merge details:
+
+1. "CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" ; Sigmoid interpolation; alpha = 0.8
+
+![Stable plus Waifu Sigmoid 0.8](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/stability_v1_4_waifu_sig_0.8.png)
+
+2. "hakurei/waifu-diffusion" + "prompthero/openjourney" ; Inverse Sigmoid interpolation; alpha = 0.8
+
+![Waifu plus openjourney Sigmoid 0.8](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/waifu_openjourney_inv_sig_0.8.png)
+
+3. "CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" + "prompthero/openjourney"; Add Difference interpolation; alpha = 0.5
+
+![Stable plus Waifu plus openjourney add_diff 0.5](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/stable_waifu_openjourney_add_diff_0.5.png)
+
+### Stable Diffusion Comparisons
+
+This Community Pipeline enables the comparison between the 4 checkpoints that exist for Stable Diffusion. They can be found through the following links:
+
+1. [Stable Diffusion v1.1](https://huggingface.co/CompVis/stable-diffusion-v1-1)
+2. [Stable Diffusion v1.2](https://huggingface.co/CompVis/stable-diffusion-v1-2)
+3. [Stable Diffusion v1.3](https://huggingface.co/CompVis/stable-diffusion-v1-3)
+4. [Stable Diffusion v1.4](https://huggingface.co/CompVis/stable-diffusion-v1-4)
+
+```python
+from diffusers import DiffusionPipeline
+import matplotlib.pyplot as plt
+
+pipe = DiffusionPipeline.from_pretrained('CompVis/stable-diffusion-v1-4', custom_pipeline='suvadityamuk/StableDiffusionComparison')
+pipe.enable_attention_slicing()
+pipe = pipe.to('cuda')
+prompt = "an astronaut riding a horse on mars"
+output = pipe(prompt)
+
+plt.subplots(2,2,1)
+plt.imshow(output.images[0])
+plt.title('Stable Diffusion v1.1')
+plt.axis('off')
+plt.subplots(2,2,2)
+plt.imshow(output.images[1])
+plt.title('Stable Diffusion v1.2')
+plt.axis('off')
+plt.subplots(2,2,3)
+plt.imshow(output.images[2])
+plt.title('Stable Diffusion v1.3')
+plt.axis('off')
+plt.subplots(2,2,4)
+plt.imshow(output.images[3])
+plt.title('Stable Diffusion v1.4')
+plt.axis('off')
+
+plt.show()
+```
+
+As a result, you can look at a grid of all 4 generated images being shown together, that captures a difference the advancement of the training between the 4 checkpoints.
+
+### Magic Mix
+
+Implementation of the [MagicMix: Semantic Mixing with Diffusion Models](https://arxiv.org/abs/2210.16056) paper. This is a Diffusion Pipeline for semantic mixing of an image and a text prompt to create a new concept while preserving the spatial layout and geometry of the subject in the image. The pipeline takes an image that provides the layout semantics and a prompt that provides the content semantics for the mixing process.
+
+There are 3 parameters for the method-
+
+- `mix_factor`: It is the interpolation constant used in the layout generation phase. The greater the value of `mix_factor`, the greater the influence of the prompt on the layout generation process.
+- `kmax` and `kmin`: These determine the range for the layout and content generation process. A higher value of kmax results in loss of more information about the layout of the original image and a higher value of kmin results in more steps for content generation process.
+
+Here is an example usage-
+
+```python
+from diffusers import DiffusionPipeline, DDIMScheduler
+from PIL import Image
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="magic_mix",
+    scheduler=DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
+).to('cuda')
+
+img = Image.open('phone.jpg')
+mix_img = pipe(
+    img,
+    prompt='bed',
+    kmin=0.3,
+    kmax=0.5,
+    mix_factor=0.5,
+    )
+mix_img.save('phone_bed_mix.jpg')
+```
+
+The `mix_img` is a PIL image that can be saved locally or displayed directly in a google colab. Generated image is a mix of the layout semantics of the given image and the content semantics of the prompt.
+
+E.g. the above script generates the following image:
+
+`phone.jpg`
+
+![206903102-34e79b9f-9ed2-4fac-bb38-82871343c655](https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg)
+
+`phone_bed_mix.jpg`
+
+![206903104-913a671d-ef53-4ae4-919d-64c3059c8f67](https://user-images.githubusercontent.com/59410571/209578602-70f323fa-05b7-4dd6-b055-e40683e37914.jpg)
+
+For more example generations check out this [demo notebook](https://github.com/daspartho/MagicMix/blob/main/demo.ipynb).
+
+### Stable UnCLIP
+
+UnCLIPPipeline("kakaobrain/karlo-v1-alpha") provides a prior model that can generate clip image embedding from text.
+StableDiffusionImageVariationPipeline("lambdalabs/sd-image-variations-diffusers") provides a decoder model than can generate images from clip image embedding.
+
+```python
+import torch
+from diffusers import DiffusionPipeline
+
+device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "kakaobrain/karlo-v1-alpha",
+    torch_dtype=torch.float16,
+    custom_pipeline="stable_unclip",
+    decoder_pipe_kwargs=dict(
+        image_encoder=None,
+    ),
+)
+pipeline.to(device)
+
+prompt = "a shiba inu wearing a beret and black turtleneck"
+random_generator = torch.Generator(device=device).manual_seed(1000)
+output = pipeline(
+    prompt=prompt,
+    width=512,
+    height=512,
+    generator=random_generator,
+    prior_guidance_scale=4,
+    prior_num_inference_steps=25,
+    decoder_guidance_scale=8,
+    decoder_num_inference_steps=50,
+)
+
+image = output.images[0]
+image.save("./shiba-inu.jpg")
+
+# debug
+
+# `pipeline.decoder_pipe` is a regular StableDiffusionImageVariationPipeline instance.
+# It is used to convert clip image embedding to latents, then fed into VAE decoder.
+print(pipeline.decoder_pipe.__class__)
+# <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline'>
+
+# this pipeline only uses prior module in "kakaobrain/karlo-v1-alpha"
+# It is used to convert clip text embedding to clip image embedding.
+print(pipeline)
+# StableUnCLIPPipeline {
+#   "_class_name": "StableUnCLIPPipeline",
+#   "_diffusers_version": "0.12.0.dev0",
+#   "prior": [
+#     "diffusers",
+#     "PriorTransformer"
+#   ],
+#   "prior_scheduler": [
+#     "diffusers",
+#     "UnCLIPScheduler"
+#   ],
+#   "text_encoder": [
+#     "transformers",
+#     "CLIPTextModelWithProjection"
+#   ],
+#   "tokenizer": [
+#     "transformers",
+#     "CLIPTokenizer"
+#   ]
+# }
+
+# pipeline.prior_scheduler is the scheduler used for prior in UnCLIP.
+print(pipeline.prior_scheduler)
+# UnCLIPScheduler {
+#   "_class_name": "UnCLIPScheduler",
+#   "_diffusers_version": "0.12.0.dev0",
+#   "clip_sample": true,
+#   "clip_sample_range": 5.0,
+#   "num_train_timesteps": 1000,
+#   "prediction_type": "sample",
+#   "variance_type": "fixed_small_log"
+# }
+```
+
+`shiba-inu.jpg`
+
+![shiba-inu](https://user-images.githubusercontent.com/16448529/209185639-6e5ec794-ce9d-4883-aa29-bd6852a2abad.jpg)
+
+### UnCLIP Text Interpolation Pipeline
+
+This Diffusion Pipeline takes two prompts and interpolates between the two input prompts using spherical interpolation ( slerp ). The input prompts are converted to text embeddings by the pipeline's text_encoder and the interpolation is done on the resulting text_embeddings over the number of steps specified. Defaults to 5 steps.
+
+```python
+import torch
+from diffusers import DiffusionPipeline
+
+device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
+
+pipe = DiffusionPipeline.from_pretrained(
+    "kakaobrain/karlo-v1-alpha",
+    torch_dtype=torch.float16,
+    custom_pipeline="unclip_text_interpolation"
+)
+pipe.to(device)
+
+start_prompt = "A photograph of an adult lion"
+end_prompt = "A photograph of a lion cub"
+# For best results keep the prompts close in length to each other. Of course, feel free to try out with differing lengths.
+generator = torch.Generator(device=device).manual_seed(42)
+
+output = pipe(start_prompt, end_prompt, steps=6, generator=generator, enable_sequential_cpu_offload=False)
+
+for i,image in enumerate(output.images):
+    img.save('result%s.jpg' % i)
+```
+
+The resulting images in order:-
+
+![result_0](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPTextInterpolationSamples/resolve/main/lion_to_cub_0.png)
+![result_1](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPTextInterpolationSamples/resolve/main/lion_to_cub_1.png)
+![result_2](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPTextInterpolationSamples/resolve/main/lion_to_cub_2.png)
+![result_3](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPTextInterpolationSamples/resolve/main/lion_to_cub_3.png)
+![result_4](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPTextInterpolationSamples/resolve/main/lion_to_cub_4.png)
+![result_5](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPTextInterpolationSamples/resolve/main/lion_to_cub_5.png)
+
+### UnCLIP Image Interpolation Pipeline
+
+This Diffusion Pipeline takes two images or an image_embeddings tensor of size 2 and interpolates between their embeddings using spherical interpolation ( slerp ). The input images/image_embeddings are converted to image embeddings by the pipeline's image_encoder and the interpolation is done on the resulting image_embeddings over the number of steps specified. Defaults to 5 steps.
+
+```python
+import torch
+from diffusers import DiffusionPipeline
+from PIL import Image
+
+device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
+dtype = torch.float16 if torch.cuda.is_available() else torch.bfloat16
+
+pipe = DiffusionPipeline.from_pretrained(
+    "kakaobrain/karlo-v1-alpha-image-variations",
+    torch_dtype=dtype,
+    custom_pipeline="unclip_image_interpolation"
+)
+pipe.to(device)
+
+images = [Image.open('./starry_night.jpg'), Image.open('./flowers.jpg')]
+# For best results keep the prompts close in length to each other. Of course, feel free to try out with differing lengths.
+generator = torch.Generator(device=device).manual_seed(42)
+
+output = pipe(image=images, steps=6, generator=generator)
+
+for i,image in enumerate(output.images):
+    image.save('starry_to_flowers_%s.jpg' % i)
+```
+
+The original images:-
+
+![starry](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_night.jpg)
+![flowers](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/flowers.jpg)
+
+The resulting images in order:-
+
+![result0](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_to_flowers_0.png)
+![result1](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_to_flowers_1.png)
+![result2](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_to_flowers_2.png)
+![result3](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_to_flowers_3.png)
+![result4](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_to_flowers_4.png)
+![result5](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_to_flowers_5.png)
+
+### DDIM Noise Comparative Analysis Pipeline
+
+#### **Research question: What visual concepts do the diffusion models learn from each noise level during training?**
+
+The [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227) paper proposed an approach to answer the above question, which is their second contribution.
+The approach consists of the following steps:
+
+1. The input is an image x0.
+2. Perturb it to xt using a diffusion process q(xt|x0).
+    - `strength` is a value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
+3. Reconstruct the image with the learned denoising process pθ(ˆx0|xt).
+4. Compare x0 and ˆx0 among various t to show how each step contributes to the sample.
+The authors used [openai/guided-diffusion](https://github.com/openai/guided-diffusion) model to denoise images in FFHQ dataset. This pipeline extends their second contribution by investigating DDIM on any input image.
+
+```python
+import torch
+from PIL import Image
+import numpy as np
+
+image_path = "path/to/your/image"  # images from CelebA-HQ might be better
+image_pil = Image.open(image_path)
+image_name = image_path.split("/")[-1].split(".")[0]
+
+device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
+pipe = DiffusionPipeline.from_pretrained(
+    "google/ddpm-ema-celebahq-256",
+    custom_pipeline="ddim_noise_comparative_analysis",
+)
+pipe = pipe.to(device)
+
+for strength in np.linspace(0.1, 1, 25):
+    denoised_image, latent_timestep = pipe(
+        image_pil, strength=strength, return_dict=False
+    )
+    denoised_image = denoised_image[0]
+    denoised_image.save(
+        f"noise_comparative_analysis_{image_name}_{latent_timestep}.png"
+    )
+```
+
+Here is the result of this pipeline (which is DDIM) on CelebA-HQ dataset.
+
+![noise-comparative-analysis](https://user-images.githubusercontent.com/67547213/224677066-4474b2ed-56ab-4c27-87c6-de3c0255eb9c.jpeg)
+
+### CLIP Guided Img2Img Stable Diffusion
+
+CLIP guided Img2Img stable diffusion can help to generate more realistic images with an initial image
+by guiding stable diffusion at every denoising step with an additional CLIP model.
+
+The following code requires roughly 12GB of GPU RAM.
+
+```python
+from io import BytesIO
+import requests
+import torch
+from diffusers import DiffusionPipeline
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPModel
+
+feature_extractor = CLIPImageProcessor.from_pretrained(
+    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
+)
+clip_model = CLIPModel.from_pretrained(
+    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16
+)
+guided_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    # custom_pipeline="clip_guided_stable_diffusion",
+    custom_pipeline="/home/njindal/diffusers/examples/community/clip_guided_stable_diffusion.py",
+    clip_model=clip_model,
+    feature_extractor=feature_extractor,
+    torch_dtype=torch.float16,
+)
+guided_pipeline.enable_attention_slicing()
+guided_pipeline = guided_pipeline.to("cuda")
+prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece"
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+image = guided_pipeline(
+    prompt=prompt,
+    num_inference_steps=30,
+    image=init_image,
+    strength=0.75,
+    guidance_scale=7.5,
+    clip_guidance_scale=100,
+    num_cutouts=4,
+    use_cutouts=False,
+).images[0]
+display(image)
+```
+
+Init Image
+
+![img2img_init_clip_guidance](https://huggingface.co/datasets/njindal/images/resolve/main/clip_guided_img2img_init.jpg)
+
+Output Image
+
+![img2img_clip_guidance](https://huggingface.co/datasets/njindal/images/resolve/main/clip_guided_img2img.jpg)
+
+### TensorRT Text2Image Stable Diffusion Pipeline
+
+The TensorRT Pipeline can be used to accelerate the Text2Image Stable Diffusion Inference run.
+
+NOTE: The ONNX conversions and TensorRT engine build may take up to 30 minutes.
+
+```python
+import torch
+from diffusers import DDIMScheduler
+from diffusers.pipelines import DiffusionPipeline
+
+# Use the DDIMScheduler scheduler here instead
+scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="scheduler")
+
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
+    custom_pipeline="stable_diffusion_tensorrt_txt2img",
+    variant='fp16',
+    torch_dtype=torch.float16,
+    scheduler=scheduler,)
+
+# re-use cached folder to save ONNX models and TensorRT Engines
+pipe.set_cached_folder("stabilityai/stable-diffusion-2-1", variant='fp16',)
+
+pipe = pipe.to("cuda")
+
+prompt = "a beautiful photograph of Mt. Fuji during cherry blossom"
+image = pipe(prompt).images[0]
+image.save('tensorrt_mt_fuji.png')
+```
+
+### EDICT Image Editing Pipeline
+
+This pipeline implements the text-guided image editing approach from the paper [EDICT: Exact Diffusion Inversion via Coupled Transformations](https://arxiv.org/abs/2211.12446). You have to pass:
+
+- (`PIL`) `image` you want to edit.
+- `base_prompt`: the text prompt describing the current image (before editing).
+- `target_prompt`: the text prompt describing with the edits.
+
+```python
+from diffusers import DiffusionPipeline, DDIMScheduler
+from transformers import CLIPTextModel
+import torch, PIL, requests
+from io import BytesIO
+from IPython.display import display
+
+def center_crop_and_resize(im):
+
+    width, height = im.size
+    d = min(width, height)
+    left = (width - d) / 2
+    upper = (height - d) / 2
+    right = (width + d) / 2
+    lower = (height + d) / 2
+
+    return im.crop((left, upper, right, lower)).resize((512, 512))
+
+torch_dtype = torch.float16
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+# scheduler and text_encoder param values as in the paper
+scheduler = DDIMScheduler(
+        num_train_timesteps=1000,
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        set_alpha_to_one=False,
+        clip_sample=False,
+)
+
+text_encoder = CLIPTextModel.from_pretrained(
+    pretrained_model_name_or_path="openai/clip-vit-large-patch14",
+    torch_dtype=torch_dtype,
+)
+
+# initialize pipeline
+pipeline = DiffusionPipeline.from_pretrained(
+    pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4",
+    custom_pipeline="edict_pipeline",
+    variant="fp16",
+    scheduler=scheduler,
+    text_encoder=text_encoder,
+    leapfrog_steps=True,
+    torch_dtype=torch_dtype,
+).to(device)
+
+# download image
+image_url = "https://huggingface.co/datasets/Joqsan/images/resolve/main/imagenet_dog_1.jpeg"
+response = requests.get(image_url)
+image = PIL.Image.open(BytesIO(response.content))
+
+# preprocess it
+cropped_image = center_crop_and_resize(image)
+
+# define the prompts
+base_prompt = "A dog"
+target_prompt = "A golden retriever"
+
+# run the pipeline
+result_image = pipeline(
+      base_prompt=base_prompt,
+      target_prompt=target_prompt,
+      image=cropped_image,
+)
+
+display(result_image)
+```
+
+Init Image
+
+![img2img_init_edict_text_editing](https://huggingface.co/datasets/Joqsan/images/resolve/main/imagenet_dog_1.jpeg)
+
+Output Image
+
+![img2img_edict_text_editing](https://huggingface.co/datasets/Joqsan/images/resolve/main/imagenet_dog_1_cropped_generated.png)
+
+### Stable Diffusion RePaint
+
+This pipeline uses the [RePaint](https://arxiv.org/abs/2201.09865) logic on the latent space of stable diffusion. It can
+be used similarly to other image inpainting pipelines but does not rely on a specific inpainting model. This means you can use
+models that are not specifically created for inpainting.
+
+Make sure to use the ```RePaintScheduler``` as shown in the example below.
+
+Disclaimer: The mask gets transferred into latent space, this may lead to unexpected changes on the edge of the masked part.
+The inference time is a lot slower.
+
+```py
+import PIL
+import requests
+import torch
+from io import BytesIO
+from diffusers import StableDiffusionPipeline, RePaintScheduler
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+mask_image = PIL.ImageOps.invert(mask_image)
+pipe = StableDiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, custom_pipeline="stable_diffusion_repaint",
+)
+pipe.scheduler = RePaintScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+```
+
+### TensorRT Image2Image Stable Diffusion Pipeline
+
+The TensorRT Pipeline can be used to accelerate the Image2Image Stable Diffusion Inference run.
+
+NOTE: The ONNX conversions and TensorRT engine build may take up to 30 minutes.
+
+```python
+import requests
+from io import BytesIO
+from PIL import Image
+import torch
+from diffusers import DDIMScheduler
+from diffusers import DiffusionPipeline
+
+# Use the DDIMScheduler scheduler here instead
+scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1",
+                                            subfolder="scheduler")
+
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
+                                            custom_pipeline="stable_diffusion_tensorrt_img2img",
+                                            variant='fp16',
+                                            torch_dtype=torch.float16,
+                                            scheduler=scheduler,)
+
+# re-use cached folder to save ONNX models and TensorRT Engines
+pipe.set_cached_folder("stabilityai/stable-diffusion-2-1", variant='fp16',)
+
+pipe = pipe.to("cuda")
+
+url = "https://pajoca.com/wp-content/uploads/2022/09/tekito-yamakawa-1.png"
+response = requests.get(url)
+input_image = Image.open(BytesIO(response.content)).convert("RGB")
+prompt = "photorealistic new zealand hills"
+image = pipe(prompt, image=input_image, strength=0.75,).images[0]
+image.save('tensorrt_img2img_new_zealand_hills.png')
+```
+
+### Stable Diffusion BoxDiff
+BoxDiff is a training-free method for controlled generation with bounding box coordinates. It should work with any Stable Diffusion model. Below shows an example with `stable-diffusion-2-1-base`.
+```py
+import torch
+from PIL import Image, ImageDraw
+from copy import deepcopy
+
+from examples.community.pipeline_stable_diffusion_boxdiff import StableDiffusionBoxDiffPipeline
+
+def draw_box_with_text(img, boxes, names):
+    colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"]
+    img_new = deepcopy(img)
+    draw = ImageDraw.Draw(img_new)
+
+    W, H = img.size
+    for bid, box in enumerate(boxes):
+        draw.rectangle([box[0] * W, box[1] * H, box[2] * W, box[3] * H], outline=colors[bid % len(colors)], width=4)
+        draw.text((box[0] * W, box[1] * H), names[bid], fill=colors[bid % len(colors)])
+    return img_new
+
+pipe = StableDiffusionBoxDiffPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1-base",
+    torch_dtype=torch.float16,
+)
+pipe.to("cuda")
+
+# example 1
+prompt = "as the aurora lights up the sky, a herd of reindeer leisurely wanders on the grassy meadow, admiring the breathtaking view, a serene lake quietly reflects the magnificent display, and in the distance, a snow-capped mountain stands majestically, fantasy, 8k, highly detailed"
+phrases = [
+    "aurora",
+    "reindeer",
+    "meadow",
+    "lake",
+    "mountain"
+]
+boxes = [[1,3,512,202], [75,344,421,495], [1,327,508,507], [2,217,507,341], [1,135,509,242]]
+
+# example 2
+# prompt = "A rabbit wearing sunglasses looks very proud"
+# phrases = ["rabbit", "sunglasses"]
+# boxes = [[67,87,366,512], [66,130,364,262]]
+
+boxes = [[x / 512 for x in box] for box in boxes]
+
+images = pipe(
+    prompt,
+    boxdiff_phrases=phrases,
+    boxdiff_boxes=boxes,
+    boxdiff_kwargs={
+        "attention_res": 16,
+        "normalize_eot": True
+    },
+    num_inference_steps=50,
+    guidance_scale=7.5,
+    generator=torch.manual_seed(42),
+    safety_checker=None
+).images
+
+draw_box_with_text(images[0], boxes, phrases).save("output.png")
+```
+
+
+### Stable Diffusion Reference
+
+This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).
+
+Based on [this issue](https://github.com/huggingface/diffusers/issues/3566),
+
+- `EulerAncestralDiscreteScheduler` got poor results.
+
+```py
+import torch
+from diffusers import UniPCMultistepScheduler
+from diffusers.utils import load_image
+
+input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
+pipe = StableDiffusionReferencePipeline.from_pretrained(
+       "stable-diffusion-v1-5/stable-diffusion-v1-5",
+       safety_checker=None,
+       torch_dtype=torch.float16
+       ).to('cuda:0')
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+result_img = pipe(ref_image=input_image,
+      prompt="1girl",
+      num_inference_steps=20,
+      reference_attn=True,
+      reference_adain=True).images[0]
+```
+
+Reference Image
+
+![reference_image](https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png)
+
+Output Image of `reference_attn=True` and `reference_adain=False`
+
+![output_image](https://github.com/huggingface/diffusers/assets/24734142/813b5c6a-6d89-46ba-b7a4-2624e240eea5)
+
+Output Image of `reference_attn=False` and `reference_adain=True`
+
+![output_image](https://github.com/huggingface/diffusers/assets/24734142/ffc90339-9ef0-4c4d-a544-135c3e5644da)
+
+Output Image of `reference_attn=True` and `reference_adain=True`
+
+![output_image](https://github.com/huggingface/diffusers/assets/24734142/3c5255d6-867d-4d35-b202-8dfd30cc6827)
+
+### Stable Diffusion ControlNet Reference
+
+This pipeline uses the Reference Control with ControlNet. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).
+
+Based on [this issue](https://github.com/huggingface/diffusers/issues/3566),
+
+- `EulerAncestralDiscreteScheduler` got poor results.
+- `guess_mode=True` works well for ControlNet v1.1
+
+```py
+import cv2
+import torch
+import numpy as np
+from PIL import Image
+from diffusers import UniPCMultistepScheduler
+from diffusers.utils import load_image
+
+input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
+# get canny image
+image = cv2.Canny(np.array(input_image), 100, 200)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+pipe = StableDiffusionControlNetReferencePipeline.from_pretrained(
+       "stable-diffusion-v1-5/stable-diffusion-v1-5",
+       controlnet=controlnet,
+       safety_checker=None,
+       torch_dtype=torch.float16
+       ).to('cuda:0')
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+result_img = pipe(ref_image=input_image,
+      prompt="1girl",
+      image=canny_image,
+      num_inference_steps=20,
+      reference_attn=True,
+      reference_adain=True).images[0]
+```
+
+Reference Image
+
+![reference_image](https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png)
+
+Output Image
+
+![output_image](https://github.com/huggingface/diffusers/assets/24734142/7b9a5830-f173-4b92-b0cf-73d0e9c01d60)
+
+### Stable Diffusion on IPEX
+
+This diffusion pipeline aims to accelerate the inference of Stable-Diffusion on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).
+
+To use this pipeline, you need to:
+
+1. Install [IPEX](https://github.com/intel/intel-extension-for-pytorch)
+
+**Note:** For each PyTorch release, there is a corresponding release of the IPEX. Here is the mapping relationship. It is recommended to install PyTorch/IPEX2.0 to get the best performance.
+
+|PyTorch Version|IPEX Version|
+|--|--|
+|[v2.0.\*](https://github.com/pytorch/pytorch/tree/v2.0.1 "v2.0.1")|[v2.0.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v2.0.100+cpu)|
+|[v1.13.\*](https://github.com/pytorch/pytorch/tree/v1.13.0 "v1.13.0")|[v1.13.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v1.13.100+cpu)|
+
+You can simply use pip to install IPEX with the latest version.
+
+```sh
+python -m pip install intel_extension_for_pytorch
+```
+
+**Note:** To install a specific version, run with the following command:
+
+```sh
+python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
+```
+
+2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16.
+
+**Note:** The setting of generated image height/width for `prepare_for_ipex()` should be same as the setting of pipeline inference.
+
+```python
+pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex")
+# For Float32
+pipe.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512) # value of image height/width should be consistent with the pipeline inference
+# For BFloat16
+pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512) # value of image height/width should be consistent with the pipeline inference
+```
+
+Then you can use the ipex pipeline in a similar way to the default stable diffusion pipeline.
+
+```python
+# For Float32
+image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] # value of image height/width should be consistent with 'prepare_for_ipex()'
+# For BFloat16
+with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+    image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] # value of image height/width should be consistent with 'prepare_for_ipex()'
+```
+
+The following code compares the performance of the original stable diffusion pipeline with the ipex-optimized pipeline.
+
+```python
+import torch
+import intel_extension_for_pytorch as ipex
+from diffusers import StableDiffusionPipeline
+import time
+
+prompt = "sailing ship in storm by Rembrandt"
+model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+# Helper function for time evaluation
+def elapsed_time(pipeline, nb_pass=3, num_inference_steps=20):
+    # warmup
+    for _ in range(2):
+        images = pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512).images
+    # time evaluation
+    start = time.time()
+    for _ in range(nb_pass):
+        pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512)
+    end = time.time()
+    return (end - start) / nb_pass
+
+##############     bf16 inference performance    ###############
+
+# 1. IPEX Pipeline initialization
+pipe = DiffusionPipeline.from_pretrained(model_id, custom_pipeline="stable_diffusion_ipex")
+pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512)
+
+# 2. Original Pipeline initialization
+pipe2 = StableDiffusionPipeline.from_pretrained(model_id)
+
+# 3. Compare performance between Original Pipeline and IPEX Pipeline
+with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+    latency = elapsed_time(pipe)
+    print("Latency of StableDiffusionIPEXPipeline--bf16", latency)
+    latency = elapsed_time(pipe2)
+    print("Latency of StableDiffusionPipeline--bf16", latency)
+
+##############     fp32 inference performance    ###############
+
+# 1. IPEX Pipeline initialization
+pipe3 = DiffusionPipeline.from_pretrained(model_id, custom_pipeline="stable_diffusion_ipex")
+pipe3.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512)
+
+# 2. Original Pipeline initialization
+pipe4 = StableDiffusionPipeline.from_pretrained(model_id)
+
+# 3. Compare performance between Original Pipeline and IPEX Pipeline
+latency = elapsed_time(pipe3)
+print("Latency of StableDiffusionIPEXPipeline--fp32", latency)
+latency = elapsed_time(pipe4)
+print("Latency of StableDiffusionPipeline--fp32", latency)
+```
+
+### Stable Diffusion XL on IPEX
+
+This diffusion pipeline aims to accelerate the inference of Stable-Diffusion XL on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).
+
+To use this pipeline, you need to:
+
+1. Install [IPEX](https://github.com/intel/intel-extension-for-pytorch)
+
+**Note:** For each PyTorch release, there is a corresponding release of IPEX. Here is the mapping relationship. It is recommended to install Pytorch/IPEX2.0 to get the best performance.
+
+|PyTorch Version|IPEX Version|
+|--|--|
+|[v2.0.\*](https://github.com/pytorch/pytorch/tree/v2.0.1 "v2.0.1")|[v2.0.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v2.0.100+cpu)|
+|[v1.13.\*](https://github.com/pytorch/pytorch/tree/v1.13.0 "v1.13.0")|[v1.13.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v1.13.100+cpu)|
+
+You can simply use pip to install IPEX with the latest version.
+
+```sh
+python -m pip install intel_extension_for_pytorch
+```
+
+**Note:** To install a specific version, run with the following command:
+
+```sh
+python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
+```
+
+2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16.
+
+**Note:** The values of `height` and `width` used during preparation with `prepare_for_ipex()` should be the same when running inference with the prepared pipeline.
+
+```python
+pipe = StableDiffusionXLPipelineIpex.from_pretrained("stabilityai/sdxl-turbo", low_cpu_mem_usage=True, use_safetensors=True)
+# value of image height/width should be consistent with the pipeline inference
+# For Float32
+pipe.prepare_for_ipex(torch.float32, prompt, height=512, width=512)
+# For BFloat16
+pipe.prepare_for_ipex(torch.bfloat16, prompt, height=512, width=512)
+```
+
+Then you can use the ipex pipeline in a similar way to the default stable diffusion xl pipeline.
+
+```python
+# value of image height/width should be consistent with 'prepare_for_ipex()'
+# For Float32
+image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=guidance_scale).images[0]
+# For BFloat16
+with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+    image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=guidance_scale).images[0]
+```
+
+The following code compares the performance of the original stable diffusion xl pipeline with the ipex-optimized pipeline.
+By using this optimized pipeline, we can get about 1.4-2 times performance boost with BFloat16 on fourth generation of Intel Xeon CPUs,
+code-named Sapphire Rapids.
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline
+from pipeline_stable_diffusion_xl_ipex import StableDiffusionXLPipelineIpex
+import time
+
+prompt = "sailing ship in storm by Rembrandt"
+model_id = "stabilityai/sdxl-turbo"
+steps = 4
+
+# Helper function for time evaluation
+def elapsed_time(pipeline, nb_pass=3, num_inference_steps=1):
+    # warmup
+    for _ in range(2):
+        images = pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=0.0).images
+    # time evaluation
+    start = time.time()
+    for _ in range(nb_pass):
+        pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=0.0)
+    end = time.time()
+    return (end - start) / nb_pass
+
+##############     bf16 inference performance    ###############
+
+# 1. IPEX Pipeline initialization
+pipe = StableDiffusionXLPipelineIpex.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
+pipe.prepare_for_ipex(torch.bfloat16, prompt, height=512, width=512)
+
+# 2. Original Pipeline initialization
+pipe2 = StableDiffusionXLPipeline.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
+
+# 3. Compare performance between Original Pipeline and IPEX Pipeline
+with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+    latency = elapsed_time(pipe, num_inference_steps=steps)
+    print("Latency of StableDiffusionXLPipelineIpex--bf16", latency, "s for total", steps, "steps")
+    latency = elapsed_time(pipe2, num_inference_steps=steps)
+    print("Latency of StableDiffusionXLPipeline--bf16", latency, "s for total", steps, "steps")
+
+##############     fp32 inference performance    ###############
+
+# 1. IPEX Pipeline initialization
+pipe3 = StableDiffusionXLPipelineIpex.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
+pipe3.prepare_for_ipex(torch.float32, prompt, height=512, width=512)
+
+# 2. Original Pipeline initialization
+pipe4 = StableDiffusionXLPipeline.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
+
+# 3. Compare performance between Original Pipeline and IPEX Pipeline
+latency = elapsed_time(pipe3, num_inference_steps=steps)
+print("Latency of StableDiffusionXLPipelineIpex--fp32", latency, "s for total", steps, "steps")
+latency = elapsed_time(pipe4, num_inference_steps=steps)
+print("Latency of StableDiffusionXLPipeline--fp32", latency, "s for total", steps, "steps")
+```
+
+### CLIP Guided Images Mixing With Stable Diffusion
+
+![clip_guided_images_mixing_examples](https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/main.png)
+
+CLIP guided stable diffusion images mixing pipeline allows to combine two images using standard diffusion models.
+This approach is using (optional) CoCa model to avoid writing image description.
+[More code examples](https://github.com/TheDenk/images_mixing)
+
+### Stable Diffusion XL Long Weighted Prompt Pipeline
+
+This SDXL pipeline supports unlimited length prompt and negative prompt, compatible with A1111 prompt weighted style.
+
+You can provide both `prompt` and `prompt_2`. If only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline.
+
+```python
+from diffusers import DiffusionPipeline
+from diffusers.utils import load_image
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0"
+    , torch_dtype       = torch.float16
+    , use_safetensors   = True
+    , variant           = "fp16"
+    , custom_pipeline   = "lpw_stable_diffusion_xl",
+)
+
+prompt = "photo of a cute (white) cat running on the grass" * 20
+prompt2 = "chasing (birds:1.5)" * 20
+prompt = f"{prompt},{prompt2}"
+neg_prompt = "blur, low quality, carton, animate"
+
+pipe.to("cuda")
+
+# text2img
+t2i_images = pipe(
+    prompt=prompt,
+    negative_prompt=neg_prompt,
+).images  # alternatively, you can call the .text2img() function
+
+# img2img
+input_image = load_image("/path/to/local/image.png")  # or URL to your input image
+i2i_images = pipe.img2img(
+  prompt=prompt,
+  negative_prompt=neg_prompt,
+  image=input_image,
+  strength=0.8,  # higher strength will result in more variation compared to original image
+).images
+
+# inpaint
+input_mask = load_image("/path/to/local/mask.png")  # or URL to your input inpainting mask
+inpaint_images = pipe.inpaint(
+  prompt="photo of a cute (black) cat running on the grass" * 20,
+  negative_prompt=neg_prompt,
+  image=input_image,
+  mask=input_mask,
+  strength=0.6,  # higher strength will result in more variation compared to original image
+).images
+
+pipe.to("cpu")
+torch.cuda.empty_cache()
+
+from IPython.display import display  # assuming you are using this code in a notebook
+display(t2i_images[0])
+display(i2i_images[0])
+display(inpaint_images[0])
+```
+
+In the above code, the `prompt2` is appended to the `prompt`, which is more than 77 tokens. "birds" are showing up in the result.
+![Stable Diffusion XL Long Weighted Prompt Pipeline sample](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_long_weighted_prompt.png)
+
+For more results, checkout [PR #6114](https://github.com/huggingface/diffusers/pull/6114).
+
+### Example Images Mixing (with CoCa)
+
+```python
+import requests
+from io import BytesIO
+
+import PIL
+import torch
+import open_clip
+from open_clip import SimpleTokenizer
+from diffusers import DiffusionPipeline
+from transformers import CLIPImageProcessor, CLIPModel
+
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+# Loading additional models
+feature_extractor = CLIPImageProcessor.from_pretrained(
+    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
+)
+clip_model = CLIPModel.from_pretrained(
+    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16
+)
+coca_model = open_clip.create_model('coca_ViT-L-14', pretrained='laion2B-s13B-b90k').to('cuda')
+coca_model.dtype = torch.float16
+coca_transform = open_clip.image_transform(
+    coca_model.visual.image_size,
+    is_train=False,
+    mean=getattr(coca_model.visual, 'image_mean', None),
+    std=getattr(coca_model.visual, 'image_std', None),
+)
+coca_tokenizer = SimpleTokenizer()
+
+# Pipeline creating
+mixing_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="clip_guided_images_mixing_stable_diffusion",
+    clip_model=clip_model,
+    feature_extractor=feature_extractor,
+    coca_model=coca_model,
+    coca_tokenizer=coca_tokenizer,
+    coca_transform=coca_transform,
+    torch_dtype=torch.float16,
+)
+mixing_pipeline.enable_attention_slicing()
+mixing_pipeline = mixing_pipeline.to("cuda")
+
+# Pipeline running
+generator = torch.Generator(device="cuda").manual_seed(17)
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+content_image = download_image("https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/boromir.jpg")
+style_image = download_image("https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/gigachad.jpg")
+
+pipe_images = mixing_pipeline(
+    num_inference_steps=50,
+    content_image=content_image,
+    style_image=style_image,
+    noise_strength=0.65,
+    slerp_latent_style_strength=0.9,
+    slerp_prompt_style_strength=0.1,
+    slerp_clip_image_style_strength=0.1,
+    guidance_scale=9.0,
+    batch_size=1,
+    clip_guidance_scale=100,
+    generator=generator,
+).images
+```
+
+![image_mixing_result](https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/boromir_gigachad.png)
+
+### Stable Diffusion Mixture Tiling
+
+This pipeline uses the Mixture. Refer to the [Mixture](https://arxiv.org/abs/2302.02412) paper for more details.
+
+```python
+from diffusers import LMSDiscreteScheduler, DiffusionPipeline
+
+# Create scheduler and model (similar to StableDiffusionPipeline)
+scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
+pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler, custom_pipeline="mixture_tiling")
+pipeline.to("cuda")
+
+# Mixture of Diffusers generation
+image = pipeline(
+    prompt=[[
+        "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
+        "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
+        "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece"
+    ]],
+    tile_height=640,
+    tile_width=640,
+    tile_row_overlap=0,
+    tile_col_overlap=256,
+    guidance_scale=8,
+    seed=7178915308,
+    num_inference_steps=50,
+)["images"][0]
+```
+
+![mixture_tiling_results](https://huggingface.co/datasets/kadirnar/diffusers_readme_images/resolve/main/mixture_tiling.png)
+
+### TensorRT Inpainting Stable Diffusion Pipeline
+
+The TensorRT Pipeline can be used to accelerate the Inpainting Stable Diffusion Inference run.
+
+NOTE: The ONNX conversions and TensorRT engine build may take up to 30 minutes.
+
+```python
+import requests
+from io import BytesIO
+from PIL import Image
+import torch
+from diffusers import PNDMScheduler
+from diffusers.pipelines import DiffusionPipeline
+
+# Use the PNDMScheduler scheduler here instead
+scheduler = PNDMScheduler.from_pretrained("stabilityai/stable-diffusion-2-inpainting", subfolder="scheduler")
+
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting",
+    custom_pipeline="stable_diffusion_tensorrt_inpaint",
+    variant='fp16',
+    torch_dtype=torch.float16,
+    scheduler=scheduler,
+    )
+
+# re-use cached folder to save ONNX models and TensorRT Engines
+pipe.set_cached_folder("stabilityai/stable-diffusion-2-inpainting", variant='fp16',)
+
+pipe = pipe.to("cuda")
+
+url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+response = requests.get(url)
+input_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+response = requests.get(mask_url)
+mask_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "a mecha robot sitting on a bench"
+image = pipe(prompt, image=input_image, mask_image=mask_image, strength=0.75,).images[0]
+image.save('tensorrt_inpaint_mecha_robot.png')
+```
+
+### Stable Diffusion Mixture Canvas
+
+This pipeline uses the Mixture. Refer to the [Mixture](https://arxiv.org/abs/2302.02412) paper for more details.
+
+```python
+from PIL import Image
+from diffusers import LMSDiscreteScheduler, DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import Image2ImageRegion, Text2ImageRegion, preprocess_image
+
+
+# Load and preprocess guide image
+iic_image = preprocess_image(Image.open("input_image.png").convert("RGB"))
+
+# Create scheduler and model (similar to StableDiffusionPipeline)
+scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
+pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler).to("cuda:0", custom_pipeline="mixture_canvas")
+pipeline.to("cuda")
+
+# Mixture of Diffusers generation
+output = pipeline(
+    canvas_height=800,
+    canvas_width=352,
+    regions=[
+        Text2ImageRegion(0, 800, 0, 352, guidance_scale=8,
+            prompt=f"best quality, masterpiece, WLOP, sakimichan, art contest winner on pixiv, 8K, intricate details, wet effects, rain drops, ethereal, mysterious, futuristic, UHD, HDR, cinematic lighting, in a beautiful forest, rainy day, award winning, trending on artstation, beautiful confident cheerful young woman, wearing a futuristic sleeveless dress, ultra beautiful detailed  eyes, hyper-detailed face, complex,  perfect, model,  textured,  chiaroscuro, professional make-up, realistic, figure in frame, "),
+        Image2ImageRegion(352-800, 352, 0, 352, reference_image=iic_image, strength=1.0),
+    ],
+    num_inference_steps=100,
+    seed=5525475061,
+)["images"][0]
+```
+
+![Input_Image](https://huggingface.co/datasets/kadirnar/diffusers_readme_images/resolve/main/input_image.png)
+![mixture_canvas_results](https://huggingface.co/datasets/kadirnar/diffusers_readme_images/resolve/main/canvas.png)
+
+### IADB pipeline
+
+This pipeline is the implementation of the [α-(de)Blending: a Minimalist Deterministic Diffusion Model](https://arxiv.org/abs/2305.03486) paper.
+It is a simple and minimalist diffusion model.
+
+The following code shows how to use the IADB pipeline to generate images using a pretrained celebahq-256 model.
+
+```python
+pipeline_iadb = DiffusionPipeline.from_pretrained("thomasc4/iadb-celebahq-256", custom_pipeline='iadb')
+
+pipeline_iadb = pipeline_iadb.to('cuda')
+
+output = pipeline_iadb(batch_size=4, num_inference_steps=128)
+for i in range(len(output[0])):
+    plt.imshow(output[0][i])
+    plt.show()
+```
+
+Sampling with the IADB formulation is easy, and can be done in a few lines (the pipeline already implements it):
+
+```python
+def sample_iadb(model, x0, nb_step):
+    x_alpha = x0
+    for t in range(nb_step):
+        alpha = (t/nb_step)
+        alpha_next =((t+1)/nb_step)
+
+        d = model(x_alpha, torch.tensor(alpha, device=x_alpha.device))['sample']
+        x_alpha = x_alpha + (alpha_next-alpha)*d
+
+    return x_alpha
+```
+
+The training loop is also straightforward:
+
+```python
+# Training loop
+while True:
+    x0 = sample_noise()
+    x1 = sample_dataset()
+
+    alpha = torch.rand(batch_size)
+
+    # Blend
+    x_alpha = (1-alpha) * x0 + alpha * x1
+
+    # Loss
+    loss = torch.sum((D(x_alpha, alpha)- (x1-x0))**2)
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+```
+
+### Zero1to3 pipeline
+
+This pipeline is the implementation of the [Zero-1-to-3: Zero-shot One Image to 3D Object](https://arxiv.org/abs/2303.11328) paper.
+The original pytorch-lightning [repo](https://github.com/cvlab-columbia/zero123) and a diffusers [repo](https://github.com/kxhit/zero123-hf).
+
+The following code shows how to use the Zero1to3 pipeline to generate novel view synthesis images using a pretrained stable diffusion model.
+
+```python
+import os
+import torch
+from pipeline_zero1to3 import Zero1to3StableDiffusionPipeline
+from diffusers.utils import load_image
+
+model_id = "kxic/zero123-165000"  # zero123-105000, zero123-165000, zero123-xl
+
+pipe = Zero1to3StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+
+pipe.enable_xformers_memory_efficient_attention()
+pipe.enable_vae_tiling()
+pipe.enable_attention_slicing()
+pipe = pipe.to("cuda")
+
+num_images_per_prompt = 4
+
+# test inference pipeline
+# x y z, Polar angle (vertical rotation in degrees)  Azimuth angle (horizontal rotation in degrees)  Zoom (relative distance from center)
+query_pose1 = [-75.0, 100.0, 0.0]
+query_pose2 = [-20.0, 125.0, 0.0]
+query_pose3 = [-55.0, 90.0, 0.0]
+
+# load image
+# H, W = (256, 256) # H, W = (512, 512)   # zero123 training is 256,256
+
+# for batch input
+input_image1 = load_image("./demo/4_blackarm.png")  # load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/4_blackarm.png")
+input_image2 = load_image("./demo/8_motor.png")  # load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/8_motor.png")
+input_image3 = load_image("./demo/7_london.png")  # load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/7_london.png")
+input_images = [input_image1, input_image2, input_image3]
+query_poses = [query_pose1, query_pose2, query_pose3]
+
+# # for single input
+# H, W = (256, 256)
+# input_images = [input_image2.resize((H, W), PIL.Image.NEAREST)]
+# query_poses = [query_pose2]
+
+
+# better do preprocessing
+from gradio_new import preprocess_image, create_carvekit_interface
+import numpy as np
+import PIL.Image as Image
+
+pre_images = []
+models = dict()
+print('Instantiating Carvekit HiInterface...')
+models['carvekit'] = create_carvekit_interface()
+if not isinstance(input_images, list):
+    input_images = [input_images]
+for raw_im in input_images:
+    input_im = preprocess_image(models, raw_im, True)
+    H, W = input_im.shape[:2]
+    pre_images.append(Image.fromarray((input_im * 255.0).astype(np.uint8)))
+input_images = pre_images
+
+# infer pipeline, in original zero123 num_inference_steps=76
+images = pipe(input_imgs=input_images, prompt_imgs=input_images, poses=query_poses, height=H, width=W,
+              guidance_scale=3.0, num_images_per_prompt=num_images_per_prompt, num_inference_steps=50).images
+
+# save imgs
+log_dir = "logs"
+os.makedirs(log_dir, exist_ok=True)
+bs = len(input_images)
+i = 0
+for obj in range(bs):
+    for idx in range(num_images_per_prompt):
+        images[i].save(os.path.join(log_dir,f"obj{obj}_{idx}.jpg"))
+        i += 1
+```
+
+### Stable Diffusion XL Reference
+
+This pipeline uses the Reference. Refer to the [stable_diffusion_reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-reference).
+
+```py
+import torch
+from PIL import Image
+from diffusers.utils import load_image
+from diffusers import DiffusionPipeline
+from diffusers.schedulers import UniPCMultistepScheduler
+
+input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
+# pipe = DiffusionPipeline.from_pretrained(
+#     "stabilityai/stable-diffusion-xl-base-1.0",
+#     custom_pipeline="stable_diffusion_xl_reference",
+#     torch_dtype=torch.float16,
+#     use_safetensors=True,
+#     variant="fp16").to('cuda:0')
+
+pipe = StableDiffusionXLReferencePipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+    variant="fp16").to('cuda:0')
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+result_img = pipe(ref_image=input_image,
+      prompt="1girl",
+      num_inference_steps=20,
+      reference_attn=True,
+      reference_adain=True).images[0]
+```
+
+Reference Image
+
+![reference_image](https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png)
+
+Output Image
+
+`prompt: 1 girl`
+
+`reference_attn=True, reference_adain=True, num_inference_steps=20`
+![Output_image](https://github.com/zideliu/diffusers/assets/34944964/743848da-a215-48f9-ae39-b5e2ae49fb13)
+
+Reference Image
+![reference_image](https://github.com/huggingface/diffusers/assets/34944964/449bdab6-e744-4fb2-9620-d4068d9a741b)
+
+Output Image
+
+`prompt: A dog`
+
+`reference_attn=True, reference_adain=False, num_inference_steps=20`
+![Output_image](https://github.com/huggingface/diffusers/assets/34944964/fff2f16f-6e91-434b-abcc-5259d866c31e)
+
+Reference Image
+![reference_image](https://github.com/huggingface/diffusers/assets/34944964/077ed4fe-2991-4b79-99a1-009f056227d1)
+
+Output Image
+
+`prompt: An astronaut riding a lion`
+
+`reference_attn=True, reference_adain=True, num_inference_steps=20`
+![output_image](https://github.com/huggingface/diffusers/assets/34944964/9b2f1aca-886f-49c3-89ec-d2031c8e3670)
+
+### Stable diffusion fabric pipeline
+
+FABRIC approach applicable to a wide range of popular diffusion models, which exploits
+the self-attention layer present in the most widely used architectures to condition
+the diffusion process on a set of feedback images.
+
+```python
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from diffusers import DiffusionPipeline
+
+# load the pipeline
+# make sure you're logged in with `huggingface-cli login`
+model_id_or_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+# can also be used with dreamlike-art/dreamlike-photoreal-2.0
+pipe = DiffusionPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16, custom_pipeline="pipeline_fabric").to("cuda")
+
+# let's specify a prompt
+prompt = "An astronaut riding an elephant"
+negative_prompt = "lowres, cropped"
+
+# call the pipeline
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    num_inference_steps=20,
+    generator=torch.manual_seed(12)
+).images[0]
+
+image.save("horse_to_elephant.jpg")
+
+# let's try another example with feedback
+url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "photo, A blue colored car, fish eye"
+liked = [init_image]
+## same goes with disliked
+
+# call the pipeline
+torch.manual_seed(0)
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    liked=liked,
+    num_inference_steps=20,
+).images[0]
+
+image.save("black_to_blue.png")
+```
+
+*With enough feedbacks you can create very similar high quality images.*
+
+The original codebase can be found at [sd-fabric/fabric](https://github.com/sd-fabric/fabric), and available checkpoints are [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0), [stable-diffusion-v1-5/stable-diffusion-v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5), and [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results).
+
+Let's have a look at the images (_512X512_)
+
+| Without Feedback            | With Feedback  (1st image)          |
+|---------------------|---------------------|
+| ![Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_wo_feedback.jpg) | ![Feedback Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_w_feedback.png) |
+
+### Masked Im2Im Stable Diffusion Pipeline
+
+This pipeline reimplements sketch inpaint feature from A1111 for non-inpaint models. The following code reads two images, original and one with mask painted over it. It computes mask as a difference of two images and does the inpainting in the area defined by the mask.
+
+```python
+img = PIL.Image.open("./mech.png")
+# read image with mask painted over
+img_paint = PIL.Image.open("./mech_painted.png")
+neq = numpy.any(numpy.array(img) != numpy.array(img_paint), axis=-1)
+mask = neq / neq.max()
+
+pipeline = MaskedStableDiffusionImg2ImgPipeline.from_pretrained("frankjoshua/icbinpICantBelieveIts_v8")
+
+# works best with EulerAncestralDiscreteScheduler
+pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
+generator = torch.Generator(device="cpu").manual_seed(4)
+
+prompt = "a man wearing a mask"
+result = pipeline(prompt=prompt, image=img_paint, mask=mask, strength=0.75,
+                  generator=generator)
+result.images[0].save("result.png")
+```
+
+original image mech.png
+
+<img src=https://github.com/noskill/diffusers/assets/733626/10ad972d-d655-43cb-8de1-039e3d79e849 width="25%" >
+
+image with mask mech_painted.png
+
+<img src=https://github.com/noskill/diffusers/assets/733626/c334466a-67fe-4377-9ff7-f46021b9c224 width="25%" >
+
+result:
+
+<img src=https://github.com/noskill/diffusers/assets/733626/23a0a71d-51db-471e-926a-107ac62512a8 width="25%" >
+
+### Masked Im2Im Stable Diffusion Pipeline XL
+
+This pipeline implements sketch inpaint feature from A1111 for non-inpaint models. The following code reads two images, original and one with mask painted over it. It computes mask as a difference of two images and does the inpainting in the area defined by the mask. Latent code is initialized from the image with the mask by default so the color of the mask affects the result.
+
+```
+img = PIL.Image.open("./mech.png")
+# read image with mask painted over
+img_paint = PIL.Image.open("./mech_painted.png")
+
+pipeline = MaskedStableDiffusionXLImg2ImgPipeline.from_pretrained("frankjoshua/juggernautXL_v8Rundiffusion", dtype=torch.float16)
+
+pipeline.to('cuda')
+pipeline.enable_xformers_memory_efficient_attention()
+
+prompt = "a mech warrior wearing a mask"
+seed = 8348273636437
+for i in range(10):
+    generator = torch.Generator(device="cuda").manual_seed(seed + i)
+    print(seed + i)
+    result = pipeline(prompt=prompt, blur=48, image=img_paint, original_image=img, strength=0.9,
+                          generator=generator, num_inference_steps=60, num_images_per_prompt=1)
+    im = result.images[0]
+    im.save(f"result{i}.png")
+```
+
+original image mech.png
+
+<img src=https://github.com/noskill/diffusers/assets/733626/10ad972d-d655-43cb-8de1-039e3d79e849 width="25%" >
+
+image with mask mech_painted.png
+
+<img src=https://github.com/noskill/diffusers/assets/733626/c334466a-67fe-4377-9ff7-f46021b9c224 width="25%" >
+
+result: 
+
+<img src=https://github.com/noskill/diffusers/assets/733626/5043fb57-a785-4606-a5ba-a36704f7cb42 width="25%" >
+
+### Prompt2Prompt Pipeline
+
+Prompt2Prompt allows the following edits:
+
+- ReplaceEdit (change words in prompt)
+- ReplaceEdit with local blend (change words in prompt, keep image part unrelated to changes constant)
+- RefineEdit (add words to prompt)
+- RefineEdit with local blend (add words to prompt, keep image part unrelated to changes constant)
+- ReweightEdit (modulate importance of words)
+
+Here's a full example for `ReplaceEdit``:
+
+```python
+import torch
+import numpy as np
+import matplotlib.pyplot as plt
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="pipeline_prompt2prompt").to("cuda")
+
+prompts = ["A turtle playing with a ball",
+           "A monkey playing with a ball"]
+
+cross_attention_kwargs = {
+    "edit_type": "replace",
+    "cross_replace_steps": 0.4,
+    "self_replace_steps": 0.4
+}
+
+outputs = pipe(prompt=prompts, height=512, width=512, num_inference_steps=50, cross_attention_kwargs=cross_attention_kwargs)
+```
+
+And abbreviated examples for the other edits:
+
+`ReplaceEdit with local blend`
+
+```python
+prompts = ["A turtle playing with a ball",
+           "A monkey playing with a ball"]
+
+cross_attention_kwargs = {
+    "edit_type": "replace",
+    "cross_replace_steps": 0.4,
+    "self_replace_steps": 0.4,
+    "local_blend_words": ["turtle", "monkey"]
+}
+```
+
+`RefineEdit`
+
+```python
+prompts = ["A turtle",
+           "A turtle in a forest"]
+
+cross_attention_kwargs = {
+    "edit_type": "refine",
+    "cross_replace_steps": 0.4,
+    "self_replace_steps": 0.4,
+}
+```
+
+`RefineEdit with local blend`
+
+```python
+prompts = ["A turtle",
+           "A turtle in a forest"]
+
+cross_attention_kwargs = {
+    "edit_type": "refine",
+    "cross_replace_steps": 0.4,
+    "self_replace_steps": 0.4,
+    "local_blend_words": ["in", "a" , "forest"]
+}
+```
+
+`ReweightEdit`
+
+```python
+prompts = ["A smiling turtle"] * 2
+
+edit_kcross_attention_kwargswargs = {
+    "edit_type": "reweight",
+    "cross_replace_steps": 0.4,
+    "self_replace_steps": 0.4,
+    "equalizer_words": ["smiling"],
+    "equalizer_strengths": [5]
+}
+```
+
+Side note: See [this GitHub gist](https://gist.github.com/UmerHA/b65bb5fb9626c9c73f3ade2869e36164) if you want to visualize the attention maps.
+
+### Latent Consistency Pipeline
+
+Latent Consistency Models was proposed in [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference](https://arxiv.org/abs/2310.04378) by _Simian Luo, Yiqin Tan, Longbo Huang, Jian Li, Hang Zhao_ from Tsinghua University.
+
+The abstract of the paper reads as follows:
+
+*Latent Diffusion models (LDMs) have achieved remarkable results in synthesizing high-resolution images. However, the iterative sampling process is computationally intensive and leads to slow generation. Inspired by Consistency Models (song et al.), we propose Latent Consistency Models (LCMs), enabling swift inference with minimal steps on any pre-trained LDMs, including Stable Diffusion (rombach et al). Viewing the guided reverse diffusion process as solving an augmented probability flow ODE (PF-ODE), LCMs are designed to directly predict the solution of such ODE in latent space, mitigating the need for numerous iterations and allowing rapid, high-fidelity sampling. Efficiently distilled from pre-trained classifier-free guided diffusion models, a high-quality 768 x 768 2~4-step LCM takes only 32 A100 GPU hours for training. Furthermore, we introduce Latent Consistency Fine-tuning (LCF), a novel method that is tailored for fine-tuning LCMs on customized image datasets. Evaluation on the LAION-5B-Aesthetics dataset demonstrates that LCMs achieve state-of-the-art text-to-image generation performance with few-step inference. Project Page: [this https URL](https://latent-consistency-models.github.io/)*
+
+The model can be used with `diffusers` as follows:
+
+- *1. Load the model from the community pipeline.*
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", custom_pipeline="latent_consistency_txt2img", custom_revision="main")
+
+# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
+pipe.to(torch_device="cuda", torch_dtype=torch.float32)
+```
+
+- 2. Run inference with as little as 4 steps:
+
+```py
+prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
+
+# Can be set to 1~50 steps. LCM supports fast inference even <= 4 steps. Recommend: 1~8 steps.
+num_inference_steps = 4
+
+images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images
+```
+
+For any questions or feedback, feel free to reach out to [Simian Luo](https://github.com/luosiallen).
+
+You can also try this pipeline directly in the [🚀 official spaces](https://huggingface.co/spaces/SimianLuo/Latent_Consistency_Model).
+
+### Latent Consistency Img2img Pipeline
+
+This pipeline extends the Latent Consistency Pipeline to allow it to take an input image.
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", custom_pipeline="latent_consistency_img2img")
+
+# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
+pipe.to(torch_device="cuda", torch_dtype=torch.float32)
+```
+
+- 2. Run inference with as little as 4 steps:
+
+```py
+prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
+
+
+input_image=Image.open("myimg.png")
+
+strength = 0.5  # strength =0 (no change) strength=1 (completely overwrite image)
+
+# Can be set to 1~50 steps. LCM supports fast inference even <= 4 steps. Recommend: 1~8 steps.
+num_inference_steps = 4
+
+images = pipe(prompt=prompt, image=input_image, strength=strength, num_inference_steps=num_inference_steps, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images
+```
+
+### Latent Consistency Interpolation Pipeline
+
+This pipeline extends the Latent Consistency Pipeline to allow for interpolation of the latent space between multiple prompts. It is similar to the [Stable Diffusion Interpolate](https://github.com/huggingface/diffusers/blob/main/examples/community/interpolate_stable_diffusion.py) and [unCLIP Interpolate](https://github.com/huggingface/diffusers/blob/main/examples/community/unclip_text_interpolation.py) community pipelines.
+
+```py
+import torch
+import numpy as np
+
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", custom_pipeline="latent_consistency_interpolate")
+
+# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
+pipe.to(torch_device="cuda", torch_dtype=torch.float32)
+
+prompts = [
+    "Self-portrait oil painting, a beautiful cyborg with golden hair, Margot Robbie, 8k",
+    "Self-portrait oil painting, an extremely strong man, body builder, Huge Jackman, 8k",
+    "An astronaut floating in space, renaissance art, realistic, high quality, 8k",
+    "Oil painting of a cat, cute, dream-like",
+    "Hugging face emoji, cute, realistic"
+]
+num_inference_steps = 4
+num_interpolation_steps = 60
+seed = 1337
+
+torch.manual_seed(seed)
+np.random.seed(seed)
+
+images = pipe(
+    prompt=prompts,
+    height=512,
+    width=512,
+    num_inference_steps=num_inference_steps,
+    num_interpolation_steps=num_interpolation_steps,
+    guidance_scale=8.0,
+    embedding_interpolation_type="lerp",
+    latent_interpolation_type="slerp",
+    process_batch_size=4,  # Make it higher or lower based on your GPU memory
+    generator=torch.Generator(seed),
+)
+
+assert len(images) == (len(prompts) - 1) * num_interpolation_steps
+```
+
+### StableDiffusionUpscaleLDM3D Pipeline
+
+[LDM3D-VR](https://arxiv.org/pdf/2311.03226.pdf) is an extended version of LDM3D.
+
+The abstract from the paper is:
+*Latent diffusion models have proven to be state-of-the-art in the creation and manipulation of visual outputs. However, as far as we know, the generation of depth maps jointly with RGB is still limited. We introduce LDM3D-VR, a suite of diffusion models targeting virtual reality development that includes LDM3D-pano and LDM3D-SR. These models enable the generation of panoramic RGBD based on textual prompts and the upscaling of low-resolution inputs to high-resolution RGBD, respectively. Our models are fine-tuned from existing pretrained models on datasets containing panoramic/high-resolution RGB images, depth maps and captions. Both models are evaluated in comparison to existing related methods*
+
+Two checkpoints are available for use:
+
+- [ldm3d-pano](https://huggingface.co/Intel/ldm3d-pano). This checkpoint enables the generation of panoramic images and requires the StableDiffusionLDM3DPipeline pipeline to be used.
+- [ldm3d-sr](https://huggingface.co/Intel/ldm3d-sr). This checkpoint enables the upscaling of RGB and depth images. Can be used in cascade after the original LDM3D pipeline using the StableDiffusionUpscaleLDM3DPipeline pipeline.
+
+```py
+from PIL import Image
+import os
+import torch
+from diffusers import StableDiffusionLDM3DPipeline, DiffusionPipeline
+
+# Generate a rgb/depth output from LDM3D
+
+pipe_ldm3d = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c")
+pipe_ldm3d.to("cuda")
+
+prompt = "A picture of some lemons on a table"
+output = pipe_ldm3d(prompt)
+rgb_image, depth_image = output.rgb, output.depth
+rgb_image[0].save("lemons_ldm3d_rgb.jpg")
+depth_image[0].save("lemons_ldm3d_depth.png")
+
+# Upscale the previous output to a resolution of (1024, 1024)
+
+pipe_ldm3d_upscale = DiffusionPipeline.from_pretrained("Intel/ldm3d-sr", custom_pipeline="pipeline_stable_diffusion_upscale_ldm3d")
+
+pipe_ldm3d_upscale.to("cuda")
+
+low_res_img = Image.open("lemons_ldm3d_rgb.jpg").convert("RGB")
+low_res_depth = Image.open("lemons_ldm3d_depth.png").convert("L")
+outputs = pipe_ldm3d_upscale(prompt="high quality high resolution uhd 4k image", rgb=low_res_img, depth=low_res_depth, num_inference_steps=50, target_res=[1024, 1024])
+
+upscaled_rgb, upscaled_depth = outputs.rgb[0], outputs.depth[0]
+upscaled_rgb.save("upscaled_lemons_rgb.png")
+upscaled_depth.save("upscaled_lemons_depth.png")
+```
+
+### ControlNet + T2I Adapter Pipeline
+
+This pipeline combines both ControlNet and T2IAdapter into a single pipeline, where the forward pass is executed once.
+It receives `control_image` and `adapter_image`, as well as `controlnet_conditioning_scale` and `adapter_conditioning_scale`, for the ControlNet and Adapter modules, respectively. Whenever `adapter_conditioning_scale=0` or `controlnet_conditioning_scale=0`, it will act as a full ControlNet module or as a full T2IAdapter module, respectively.
+
+```py
+import cv2
+import numpy as np
+import torch
+from controlnet_aux.midas import MidasDetector
+from PIL import Image
+
+from diffusers import AutoencoderKL, ControlNetModel, MultiAdapter, T2IAdapter
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.utils import load_image
+from examples.community.pipeline_stable_diffusion_xl_controlnet_adapter import (
+    StableDiffusionXLControlNetAdapterPipeline,
+)
+
+controlnet_depth = ControlNetModel.from_pretrained(
+    "diffusers/controlnet-depth-sdxl-1.0",
+    torch_dtype=torch.float16,
+    variant="fp16",
+    use_safetensors=True
+)
+adapter_depth = T2IAdapter.from_pretrained(
+  "TencentARC/t2i-adapter-depth-midas-sdxl-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
+
+pipe = StableDiffusionXLControlNetAdapterPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    controlnet=controlnet_depth,
+    adapter=adapter_depth,
+    vae=vae,
+    variant="fp16",
+    use_safetensors=True,
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+pipe.enable_xformers_memory_efficient_attention()
+# pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
+midas_depth = MidasDetector.from_pretrained(
+  "valhalla/t2iadapter-aux-models", filename="dpt_large_384.pt", model_type="dpt_large"
+).to("cuda")
+
+prompt = "a tiger sitting on a park bench"
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+
+image = load_image(img_url).resize((1024, 1024))
+
+depth_image = midas_depth(
+  image, detect_resolution=512, image_resolution=1024
+)
+
+strength = 0.5
+
+images = pipe(
+    prompt,
+    control_image=depth_image,
+    adapter_image=depth_image,
+    num_inference_steps=30,
+    controlnet_conditioning_scale=strength,
+    adapter_conditioning_scale=strength,
+).images
+images[0].save("controlnet_and_adapter.png")
+```
+
+### ControlNet + T2I Adapter + Inpainting Pipeline
+
+```py
+import cv2
+import numpy as np
+import torch
+from controlnet_aux.midas import MidasDetector
+from PIL import Image
+
+from diffusers import AutoencoderKL, ControlNetModel, MultiAdapter, T2IAdapter
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.utils import load_image
+from examples.community.pipeline_stable_diffusion_xl_controlnet_adapter_inpaint import (
+    StableDiffusionXLControlNetAdapterInpaintPipeline,
+)
+
+controlnet_depth = ControlNetModel.from_pretrained(
+    "diffusers/controlnet-depth-sdxl-1.0",
+    torch_dtype=torch.float16,
+    variant="fp16",
+    use_safetensors=True
+)
+adapter_depth = T2IAdapter.from_pretrained(
+  "TencentARC/t2i-adapter-depth-midas-sdxl-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
+
+pipe = StableDiffusionXLControlNetAdapterInpaintPipeline.from_pretrained(
+    "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+    controlnet=controlnet_depth,
+    adapter=adapter_depth,
+    vae=vae,
+    variant="fp16",
+    use_safetensors=True,
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+pipe.enable_xformers_memory_efficient_attention()
+# pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
+midas_depth = MidasDetector.from_pretrained(
+  "valhalla/t2iadapter-aux-models", filename="dpt_large_384.pt", model_type="dpt_large"
+).to("cuda")
+
+prompt = "a tiger sitting on a park bench"
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+image = load_image(img_url).resize((1024, 1024))
+mask_image = load_image(mask_url).resize((1024, 1024))
+
+depth_image = midas_depth(
+  image, detect_resolution=512, image_resolution=1024
+)
+
+strength = 0.4
+
+images = pipe(
+    prompt,
+    image=image,
+    mask_image=mask_image,
+    control_image=depth_image,
+    adapter_image=depth_image,
+    num_inference_steps=30,
+    controlnet_conditioning_scale=strength,
+    adapter_conditioning_scale=strength,
+    strength=0.7,
+).images
+images[0].save("controlnet_and_adapter_inpaint.png")
+```
+
+### Regional Prompting Pipeline
+
+This pipeline is a port of the [Regional Prompter extension](https://github.com/hako-mikan/sd-webui-regional-prompter) for [Stable Diffusion web UI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) to `diffusers`.
+This code implements a pipeline for the Stable Diffusion model, enabling the division of the canvas into multiple regions, with different prompts applicable to each region. Users can specify regions in two ways: using `Cols` and `Rows` modes for grid-like divisions, or the `Prompt` mode for regions calculated based on prompts.
+
+![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline1.png)
+
+### Usage
+
+### Sample Code
+
+```py
+from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline
+
+pipe = RegionalPromptingStableDiffusionPipeline.from_single_file(model_path, vae=vae)
+
+rp_args = {
+    "mode":"rows",
+    "div": "1;1;1"
+}
+
+prompt = """
+green hair twintail BREAK
+red blouse BREAK
+blue skirt
+"""
+
+images = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    guidance_scale=7.5,
+    height=768,
+    width=512,
+    num_inference_steps=20,
+    num_images_per_prompt=1,
+    rp_args=rp_args
+    ).images
+
+time = time.strftime(r"%Y%m%d%H%M%S")
+i = 1
+for image in images:
+    i += 1
+    fileName = f'img-{time}-{i+1}.png'
+    image.save(fileName)
+```
+
+### Cols, Rows mode
+
+In the Cols, Rows mode, you can split the screen vertically and horizontally and assign prompts to each region. The split ratio can be specified by 'div', and you can set the division ratio like '3;3;2' or '0.1;0.5'. Furthermore, as will be described later, you can also subdivide the split Cols, Rows to specify more complex regions.
+
+In this image, the image is divided into three parts, and a separate prompt is applied to each. The prompts are divided by 'BREAK', and each is applied to the respective region.
+![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline2.png)
+
+```
+green hair twintail BREAK
+red blouse BREAK
+blue skirt
+```
+
+### 2-Dimentional division
+
+The prompt consists of instructions separated by the term `BREAK` and is assigned to different regions of a two-dimensional space. The image is initially split in the main splitting direction, which in this case is rows, due to the presence of a single semicolon `;`, dividing the space into an upper and a lower section. Additional sub-splitting is then applied, indicated by commas. The upper row is split into ratios of `2:1:1`, while the lower row is split into a ratio of `4:6`. Rows themselves are split in a `1:2` ratio. According to the reference image, the blue sky is designated as the first region, green hair as the second, the bookshelf as the third, and so on, in a sequence based on their position from the top left. The terrarium is placed on the desk in the fourth region, and the orange dress and sofa are in the fifth region, conforming to their respective splits.
+
+```py
+rp_args = {
+    "mode":"rows",
+    "div": "1,2,1,1;2,4,6"
+}
+
+prompt = """
+blue sky BREAK
+green hair BREAK
+book shelf BREAK
+terrarium on the desk BREAK
+orange dress and sofa
+"""
+```
+
+![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline4.png)
+
+### Prompt Mode
+
+There are limitations to methods of specifying regions in advance. This is because specifying regions can be a hindrance when designating complex shapes or dynamic compositions. In the region specified by the prompt, the region is determined after the image generation has begun. This allows us to accommodate compositions and complex regions.
+For further infomagen, see [here](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/main/prompt_en.md).
+
+### Syntax
+
+```
+baseprompt target1 target2 BREAK
+effect1, target1 BREAK
+effect2 ,target2
+```
+
+First, write the base prompt. In the base prompt, write the words (target1, target2) for which you want to create a mask. Next, separate them with BREAK. Next, write the prompt corresponding to target1. Then enter a comma and write target1. The order of the targets in the base prompt and the order of the BREAK-separated targets can be back to back.
+
+```
+target2 baseprompt target1  BREAK
+effect1, target1 BREAK
+effect2 ,target2
+```
+
+is also effective.
+
+### Sample
+
+In this example, masks are calculated for shirt, tie, skirt, and color prompts are specified only for those regions.
+
+```py
+rp_args = {
+    "mode": "prompt-ex",
+    "save_mask": True,
+    "th": "0.4,0.6,0.6",
+}
+
+prompt = """
+a girl in street with shirt, tie, skirt BREAK
+red, shirt BREAK
+green, tie BREAK
+blue , skirt
+"""
+```
+
+![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline3.png)
+
+### Threshold
+
+The threshold used to determine the mask created by the prompt. This can be set as many times as there are masks, as the range varies widely depending on the target prompt. If multiple regions are used, enter them separated by commas. For example, hair tends to be ambiguous and requires a small value, while face tends to be large and requires a small value. These should be ordered by BREAK.
+
+```
+a lady ,hair, face  BREAK
+red, hair BREAK
+tanned ,face
+```
+
+`threshold : 0.4,0.6`
+If only one input is given for multiple regions, they are all assumed to be the same value.
+
+### Prompt and Prompt-EX
+
+The difference is that in Prompt, duplicate regions are added, whereas in Prompt-EX, duplicate regions are overwritten sequentially. Since they are processed in order, setting a TARGET with a large regions first makes it easier for the effect of small regions to remain unmuffled.
+
+### Accuracy
+
+In the case of a 512x512 image, Attention mode reduces the size of the region to about 8x8 pixels deep in the U-Net, so that small regions get mixed up; Latent mode calculates 64*64, so that the region is exact.
+
+```
+girl hair twintail frills,ribbons, dress, face BREAK
+girl, ,face
+```
+
+### Mask
+
+When an image is generated, the generated mask is displayed. It is generated at the same size as the image, but is actually used at a much smaller size.
+
+### Use common prompt
+
+You can attach the prompt up to ADDCOMM to all prompts by separating it first with ADDCOMM. This is useful when you want to include elements common to all regions. For example, when generating pictures of three people with different appearances, it's necessary to include the instruction of 'three people' in all regions. It's also useful when inserting quality tags and other things. "For example, if you write as follows:
+
+```
+best quality, 3persons in garden, ADDCOMM
+a girl white dress BREAK
+a boy blue shirt BREAK
+an old man red suit
+```
+
+If common is enabled, this prompt is converted to the following:
+
+```
+best quality, 3persons in garden, a girl white dress BREAK
+best quality, 3persons in garden, a boy blue shirt BREAK
+best quality, 3persons in garden, an old man red suit
+```
+
+### Negative prompt
+
+Negative prompts are equally effective across all regions, but it is possible to set region-specific prompts for negative prompts as well. The number of BREAKs must be the same as the number of prompts. If the number of prompts does not match, the negative prompts will be used without being divided into regions.
+
+### Parameters
+
+To activate Regional Prompter, it is necessary to enter settings in `rp_args`. The items that can be set are as follows. `rp_args` is a dictionary type.
+
+### Input Parameters
+
+Parameters are specified through the `rp_arg`(dictionary type).
+
+```py
+rp_args = {
+    "mode":"rows",
+    "div": "1;1;1"
+}
+
+pipe(prompt=prompt, rp_args=rp_args)
+```
+
+### Required Parameters
+
+- `mode`: Specifies the method for defining regions. Choose from `Cols`, `Rows`, `Prompt`, or `Prompt-Ex`. This parameter is case-insensitive.
+- `divide`: Used in `Cols` and `Rows` modes. Details on how to specify this are provided under the respective `Cols` and `Rows` sections.
+- `th`: Used in `Prompt` mode. The method of specification is detailed under the `Prompt` section.
+
+### Optional Parameters
+
+- `save_mask`: In `Prompt` mode, choose whether to output the generated mask along with the image. The default is `False`.
+
+The Pipeline supports `compel` syntax. Input prompts using the `compel` structure will be automatically applied and processed.
+
+### Diffusion Posterior Sampling Pipeline
+
+- Reference paper
+
+    ```bibtex
+    @article{chung2022diffusion,
+    title={Diffusion posterior sampling for general noisy inverse problems},
+    author={Chung, Hyungjin and Kim, Jeongsol and Mccann, Michael T and Klasky, Marc L and Ye, Jong Chul},
+    journal={arXiv preprint arXiv:2209.14687},
+    year={2022}
+    }
+    ```
+
+- This pipeline allows zero-shot conditional sampling from the posterior distribution $p(x|y)$, given observation on $y$, unconditional generative model $p(x)$ and differentiable operator $y=f(x)$.
+
+- For example, $f(.)$ can be downsample operator, then $y$ is a downsampled image, and the pipeline becomes a super-resolution pipeline.
+- To use this pipeline, you need to know your operator $f(.)$ and corrupted image $y$, and pass them during the call. For example, as in the main function of `dps_pipeline.py`, you need to first define the Gaussian blurring operator $f(.)$. The operator should be a callable `nn.Module`, with all the parameter gradient disabled:
+
+    ```python
+    import torch.nn.functional as F
+    import scipy
+    from torch import nn
+
+    # define the Gaussian blurring operator first
+    class GaussialBlurOperator(nn.Module):
+        def __init__(self, kernel_size, intensity):
+            super().__init__()
+
+            class Blurkernel(nn.Module):
+                def __init__(self, blur_type='gaussian', kernel_size=31, std=3.0):
+                    super().__init__()
+                    self.blur_type = blur_type
+                    self.kernel_size = kernel_size
+                    self.std = std
+                    self.seq = nn.Sequential(
+                        nn.ReflectionPad2d(self.kernel_size//2),
+                        nn.Conv2d(3, 3, self.kernel_size, stride=1, padding=0, bias=False, groups=3)
+                    )
+                    self.weights_init()
+
+                def forward(self, x):
+                    return self.seq(x)
+
+                def weights_init(self):
+                    if self.blur_type == "gaussian":
+                        n = np.zeros((self.kernel_size, self.kernel_size))
+                        n[self.kernel_size // 2, self.kernel_size // 2] = 1
+                        k = scipy.ndimage.gaussian_filter(n, sigma=self.std)
+                        k = torch.from_numpy(k)
+                        self.k = k
+                        for name, f in self.named_parameters():
+                            f.data.copy_(k)
+                    elif self.blur_type == "motion":
+                        k = Kernel(size=(self.kernel_size, self.kernel_size), intensity=self.std).kernelMatrix
+                        k = torch.from_numpy(k)
+                        self.k = k
+                        for name, f in self.named_parameters():
+                            f.data.copy_(k)
+
+                def update_weights(self, k):
+                    if not torch.is_tensor(k):
+                        k = torch.from_numpy(k)
+                    for name, f in self.named_parameters():
+                        f.data.copy_(k)
+
+                def get_kernel(self):
+                    return self.k
+
+            self.kernel_size = kernel_size
+            self.conv = Blurkernel(blur_type='gaussian',
+                                kernel_size=kernel_size,
+                                std=intensity)
+            self.kernel = self.conv.get_kernel()
+            self.conv.update_weights(self.kernel.type(torch.float32))
+
+            for param in self.parameters():
+                param.requires_grad = False
+
+        def forward(self, data, **kwargs):
+            return self.conv(data)
+
+        def transpose(self, data, **kwargs):
+            return data
+
+        def get_kernel(self):
+            return self.kernel.view(1, 1, self.kernel_size, self.kernel_size)
+    ```
+
+- Next, you should obtain the corrupted image $y$ by the operator. In this example, we generate $y$ from the source image $x$. However in practice, having the operator $f(.)$ and corrupted image $y$ is enough:
+
+    ```python
+    # set up source image
+    src = Image.open('sample.png')
+    # read image into [1,3,H,W]
+    src = torch.from_numpy(np.array(src, dtype=np.float32)).permute(2,0,1)[None]
+    # normalize image to [-1,1]
+    src = (src / 127.5) - 1.0
+    src = src.to("cuda")
+
+    # set up operator and measurement
+    operator = GaussialBlurOperator(kernel_size=61, intensity=3.0).to("cuda")
+    measurement = operator(src)
+
+    # save the source and corrupted images
+    save_image((src+1.0)/2.0, "dps_src.png")
+    save_image((measurement+1.0)/2.0, "dps_mea.png")
+    ```
+
+- We provide an example pair of saved source and corrupted images, using the Gaussian blur operator above
+  - Source image:
+  - ![sample](https://github.com/tongdaxu/Images/assets/22267548/4d2a1216-08d1-4aeb-9ce3-7a2d87561d65)
+  - Gaussian blurred image:
+  - ![ddpm_generated_image](https://github.com/tongdaxu/Images/assets/22267548/65076258-344b-4ed8-b704-a04edaade8ae)
+  - You can download those images to run the example on your own.
+
+- Next, we need to define a loss function used for diffusion posterior sample. For most of the cases, the RMSE is fine:
+
+    ```python
+    def RMSELoss(yhat, y):
+        return torch.sqrt(torch.sum((yhat-y)**2))
+    ```
+
+- And next, as any other diffusion models, we need the score estimator and scheduler. As we are working with $256x256$ face images, we use ddpm-celebahq-256:
+
+    ```python
+    # set up scheduler
+    scheduler = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256")
+    scheduler.set_timesteps(1000)
+
+    # set up model
+    model = UNet2DModel.from_pretrained("google/ddpm-celebahq-256").to("cuda")
+    ```
+
+- And finally, run the pipeline:
+
+    ```python
+    # finally, the pipeline
+    dpspipe = DPSPipeline(model, scheduler)
+    image = dpspipe(
+        measurement=measurement,
+        operator=operator,
+        loss_fn=RMSELoss,
+        zeta=1.0,
+    ).images[0]
+    image.save("dps_generated_image.png")
+    ```
+
+- The `zeta` is a hyperparameter that is in range of $[0,1]$. It needs to be tuned for best effect. By setting `zeta=1`, you should be able to have the reconstructed result:
+  - Reconstructed image:
+  - ![sample](https://github.com/tongdaxu/Images/assets/22267548/0ceb5575-d42e-4f0b-99c0-50e69c982209)
+
+- The reconstruction is perceptually similar to the source image, but different in details.
+- In `dps_pipeline.py`, we also provide a super-resolution example, which should produce:
+  - Downsampled image:
+  - ![dps_mea](https://github.com/tongdaxu/Images/assets/22267548/ff6a33d6-26f0-42aa-88ce-f8a76ba45a13)
+  - Reconstructed image:
+  - ![dps_generated_image](https://github.com/tongdaxu/Images/assets/22267548/b74f084d-93f4-4845-83d8-44c0fa758a5f)
+
+### AnimateDiff ControlNet Pipeline
+
+This pipeline combines AnimateDiff and ControlNet. Enjoy precise motion control for your videos! Refer to [this](https://github.com/huggingface/diffusers/issues/5866) issue for more details.
+
+```py
+import torch
+from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter, DiffusionPipeline, DPMSolverMultistepScheduler
+from diffusers.utils import export_to_gif
+from PIL import Image
+
+motion_id = "guoyww/animatediff-motion-adapter-v1-5-2"
+adapter = MotionAdapter.from_pretrained(motion_id)
+controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16)
+vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+
+model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
+pipe = DiffusionPipeline.from_pretrained(
+    model_id,
+    motion_adapter=adapter,
+    controlnet=controlnet,
+    vae=vae,
+    custom_pipeline="pipeline_animatediff_controlnet",
+    torch_dtype=torch.float16,
+).to(device="cuda")
+pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained(
+    model_id, subfolder="scheduler", beta_schedule="linear", clip_sample=False, timestep_spacing="linspace", steps_offset=1
+)
+pipe.enable_vae_slicing()
+
+conditioning_frames = []
+for i in range(1, 16 + 1):
+    conditioning_frames.append(Image.open(f"frame_{i}.png"))
+
+prompt = "astronaut in space, dancing"
+negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly"
+result = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    width=512,
+    height=768,
+    conditioning_frames=conditioning_frames,
+    num_inference_steps=20,
+).frames[0]
+
+export_to_gif(result.frames[0], "result.gif")
+```
+
+<table>
+  <tr><td colspan="2" align=center><b>Conditioning Frames</b></td></tr>
+  <tr align=center>
+    <td align=center><img src="https://user-images.githubusercontent.com/7365912/265043418-23291941-864d-495a-8ba8-d02e05756396.gif" alt="input-frames"></td>
+  </tr>
+  <tr><td colspan="2" align=center><b>AnimateDiff model: SG161222/Realistic_Vision_V5.1_noVAE</b></td></tr>
+  <tr>
+    <td align=center><img src="https://github.com/huggingface/diffusers/assets/72266394/baf301e2-d03c-4129-bd84-203a1de2b2be" alt="gif-1"></td>
+    <td align=center><img src="https://github.com/huggingface/diffusers/assets/72266394/9f923475-ecaf-452b-92c8-4e42171182d8" alt="gif-2"></td>
+  </tr>
+  <tr><td colspan="2" align=center><b>AnimateDiff model: CardosAnime</b></td></tr>
+  <tr>
+    <td align=center><img src="https://github.com/huggingface/diffusers/assets/72266394/b2c41028-38a0-45d6-86ed-fec7446b87f7" alt="gif-1"></td>
+    <td align=center><img src="https://github.com/huggingface/diffusers/assets/72266394/eb7d2952-72e4-44fa-b664-077c79b4fc70" alt="gif-2"></td>
+  </tr>
+</table>
+
+You can also use multiple controlnets at once!
+
+```python
+import torch
+from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter, DiffusionPipeline, DPMSolverMultistepScheduler
+from diffusers.utils import export_to_gif
+from PIL import Image
+
+motion_id = "guoyww/animatediff-motion-adapter-v1-5-2"
+adapter = MotionAdapter.from_pretrained(motion_id)
+controlnet1 = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16)
+controlnet2 = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+
+model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
+pipe = DiffusionPipeline.from_pretrained(
+    model_id,
+    motion_adapter=adapter,
+    controlnet=[controlnet1, controlnet2],
+    vae=vae,
+    custom_pipeline="pipeline_animatediff_controlnet",
+    torch_dtype=torch.float16,
+).to(device="cuda")
+pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained(
+    model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1, beta_schedule="linear",
+)
+pipe.enable_vae_slicing()
+
+def load_video(file_path: str):
+    images = []
+
+    if file_path.startswith(('http://', 'https://')):
+        # If the file_path is a URL
+        response = requests.get(file_path)
+        response.raise_for_status()
+        content = BytesIO(response.content)
+        vid = imageio.get_reader(content)
+    else:
+        # Assuming it's a local file path
+        vid = imageio.get_reader(file_path)
+
+    for frame in vid:
+        pil_image = Image.fromarray(frame)
+        images.append(pil_image)
+
+    return images
+
+video = load_video("dance.gif")
+
+# You need to install it using `pip install controlnet_aux`
+from controlnet_aux.processor import Processor
+
+p1 = Processor("openpose_full")
+cn1 = [p1(frame) for frame in video]
+
+p2 = Processor("canny")
+cn2 = [p2(frame) for frame in video]
+
+prompt = "astronaut in space, dancing"
+negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly"
+result = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    width=512,
+    height=768,
+    conditioning_frames=[cn1, cn2],
+    num_inference_steps=20,
+)
+
+export_to_gif(result.frames[0], "result.gif")
+```
+
+### DemoFusion
+
+This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973).
+The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion).
+
+- `view_batch_size` (`int`, defaults to 16):
+  The batch size for multiple denoising paths. Typically, a larger batch size can result in higher efficiency but comes with increased GPU memory requirements.
+
+- `stride` (`int`, defaults to 64):
+  The stride of moving local patches. A smaller stride is better for alleviating seam issues, but it also introduces additional computational overhead and inference time.
+
+- `cosine_scale_1` (`float`, defaults to 3):
+  Control the strength of skip-residual. For specific impacts, please refer to Appendix C in the DemoFusion paper.
+
+- `cosine_scale_2` (`float`, defaults to 1):
+  Control the strength of dilated sampling. For specific impacts, please refer to Appendix C in the DemoFusion paper.
+
+- `cosine_scale_3` (`float`, defaults to 1):
+  Control the strength of the Gaussian filter. For specific impacts, please refer to Appendix C in the DemoFusion paper.
+
+- `sigma` (`float`, defaults to 1):
+  The standard value of the Gaussian filter. Larger sigma promotes the global guidance of dilated sampling, but has the potential of over-smoothing.
+
+- `multi_decoder` (`bool`, defaults to True):
+  Determine whether to use a tiled decoder. Generally, when the resolution exceeds 3072x3072, a tiled decoder becomes necessary.
+
+- `show_image` (`bool`, defaults to False):
+  Determine whether to show intermediate results during generation.
+
+```py
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    custom_pipeline="pipeline_demofusion_sdxl",
+    custom_revision="main",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "Envision a portrait of an elderly woman, her face a canvas of time, framed by a headscarf with muted tones of rust and cream. Her eyes, blue like faded denim. Her attire, simple yet dignified."
+negative_prompt = "blurry, ugly, duplicate, poorly drawn, deformed, mosaic"
+
+images = pipe(
+    prompt,
+    negative_prompt=negative_prompt,
+    height=3072,
+    width=3072,
+    view_batch_size=16,
+    stride=64,
+    num_inference_steps=50,
+    guidance_scale=7.5,
+    cosine_scale_1=3,
+    cosine_scale_2=1,
+    cosine_scale_3=1,
+    sigma=0.8,
+    multi_decoder=True,
+    show_image=True
+)
+```
+
+You can display and save the generated images as:
+
+```py
+def image_grid(imgs, save_path=None):
+
+    w = 0
+    for i, img in enumerate(imgs):
+        h_, w_ = imgs[i].size
+        w += w_
+    h = h_
+    grid = Image.new('RGB', size=(w, h))
+    grid_w, grid_h = grid.size
+
+    w = 0
+    for i, img in enumerate(imgs):
+        h_, w_ = imgs[i].size
+        grid.paste(img, box=(w, h - h_))
+        if save_path != None:
+            img.save(save_path + "/img_{}.jpg".format((i + 1) * 1024))
+        w += w_
+
+    return grid
+
+image_grid(images, save_path="./outputs/")
+```
+
+ ![output_example](https://github.com/PRIS-CV/DemoFusion/blob/main/output_example.png)
+
+### SDE Drag pipeline
+
+This pipeline provides drag-and-drop image editing using stochastic differential equations. It enables image editing by inputting prompt, image, mask_image, source_points, and target_points.
+
+![SDE Drag Image](https://github.com/huggingface/diffusers/assets/75928535/bd54f52f-f002-4951-9934-b2a4592771a5)
+
+See [paper](https://arxiv.org/abs/2311.01410), [paper page](https://ml-gsai.github.io/SDE-Drag-demo/), [original repo](https://github.com/ML-GSAI/SDE-Drag) for more information.
+
+```py
+import PIL
+import torch
+from diffusers import DDIMScheduler, DiffusionPipeline
+
+# Load the pipeline
+model_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
+pipe = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, custom_pipeline="sde_drag")
+pipe.to('cuda')
+
+# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
+# If not training LoRA, please avoid using torch.float16
+# pipe.to(torch.float16)
+
+# Provide prompt, image, mask image, and the starting and target points for drag editing.
+prompt = "prompt of the image"
+image = PIL.Image.open('/path/to/image')
+mask_image = PIL.Image.open('/path/to/mask_image')
+source_points = [[123, 456]]
+target_points = [[234, 567]]
+
+# train_lora is optional, and in most cases, using train_lora can better preserve consistency with the original image.
+pipe.train_lora(prompt, image)
+
+output = pipe(prompt, image, mask_image, source_points, target_points)
+output_image = PIL.Image.fromarray(output)
+output_image.save("./output.png")
+```
+
+### Instaflow Pipeline
+
+InstaFlow is an ultra-fast, one-step image generator that achieves image quality close to Stable Diffusion, significantly reducing the demand of computational resources. This efficiency is made possible through a recent [Rectified Flow](https://github.com/gnobitab/RectifiedFlow) technique, which trains probability flows with straight trajectories, hence inherently requiring only a single step for fast inference.
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+
+pipe = DiffusionPipeline.from_pretrained("XCLIU/instaflow_0_9B_from_sd_1_5", torch_dtype=torch.float16, custom_pipeline="instaflow_one_step")
+pipe.to("cuda")  ### if GPU is not available, comment this line
+prompt = "A hyper-realistic photo of a cute cat."
+
+images = pipe(prompt=prompt,
+            num_inference_steps=1,
+            guidance_scale=0.0).images
+images[0].save("./image.png")
+```
+
+![image1](https://huggingface.co/datasets/ayushtues/instaflow_images/resolve/main/instaflow_cat.png)
+
+You can also combine it with LORA out of the box, like <https://huggingface.co/artificialguybr/logo-redmond-1-5v-logo-lora-for-liberteredmond-sd-1-5>, to unlock cool use cases in single step!
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+
+pipe = DiffusionPipeline.from_pretrained("XCLIU/instaflow_0_9B_from_sd_1_5", torch_dtype=torch.float16, custom_pipeline="instaflow_one_step")
+pipe.to("cuda")  ### if GPU is not available, comment this line
+pipe.load_lora_weights("artificialguybr/logo-redmond-1-5v-logo-lora-for-liberteredmond-sd-1-5")
+prompt = "logo, A logo for a fitness app, dynamic running figure, energetic colors (red, orange) ),LogoRedAF ,"
+images = pipe(prompt=prompt,
+            num_inference_steps=1,
+            guidance_scale=0.0).images
+images[0].save("./image.png")
+```
+
+![image0](https://huggingface.co/datasets/ayushtues/instaflow_images/resolve/main/instaflow_logo.png)
+
+### Null-Text Inversion pipeline
+
+This pipeline provides null-text inversion for editing real images. It enables null-text optimization, and DDIM reconstruction via w, w/o null-text optimization. No prompt-to-prompt code is implemented as there is a Prompt2PromptPipeline.
+
+- Reference paper
+
+    ```bibtex
+    @article{hertz2022prompt,
+    title={Prompt-to-prompt image editing with cross attention control},
+    author={Hertz, Amir and Mokady, Ron and Tenenbaum, Jay and Aberman, Kfir and Pritch, Yael and Cohen-Or, Daniel},
+    booktitle={arXiv preprint arXiv:2208.01626},
+    year={2022}
+    ```}
+
+```py
+from diffusers import DDIMScheduler
+from examples.community.pipeline_null_text_inversion import NullTextPipeline
+import torch
+
+device = "cuda"
+# Provide invert_prompt and the image for null-text optimization.
+invert_prompt = "A lying cat"
+input_image = "siamese.jpg"
+steps = 50
+
+# Provide prompt used for generation. Same if reconstruction
+prompt = "A lying cat"
+# or different if editing.
+prompt = "A lying dog"
+
+# Float32 is essential to a well optimization
+model_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+scheduler = DDIMScheduler(num_train_timesteps=1000, beta_start=0.00085, beta_end=0.0120, beta_schedule="scaled_linear")
+pipeline = NullTextPipeline.from_pretrained(model_path, scheduler=scheduler, torch_dtype=torch.float32).to(device)
+
+# Saves the inverted_latent to save time
+inverted_latent, uncond = pipeline.invert(input_image, invert_prompt, num_inner_steps=10, early_stop_epsilon=1e-5, num_inference_steps=steps)
+pipeline(prompt, uncond, inverted_latent, guidance_scale=7.5, num_inference_steps=steps).images[0].save(input_image+".output.jpg")
+```
+
+### Rerender A Video
+
+This is the Diffusers implementation of zero-shot video-to-video translation pipeline [Rerender A Video](https://github.com/williamyang1991/Rerender_A_Video) (without Ebsynth postprocessing). To run the code, please install gmflow. Then modify the path in `gmflow_dir`. After that, you can run the pipeline with:
+
+```py
+import sys
+gmflow_dir = "/path/to/gmflow"
+sys.path.insert(0, gmflow_dir)
+
+from diffusers import ControlNetModel, AutoencoderKL, DDIMScheduler
+from diffusers.utils import export_to_video
+import numpy as np
+import torch
+
+import cv2
+from PIL import Image
+
+def video_to_frame(video_path: str, interval: int):
+    vidcap = cv2.VideoCapture(video_path)
+    success = True
+
+    count = 0
+    res = []
+    while success:
+        count += 1
+        success, image = vidcap.read()
+        if count % interval != 1:
+            continue
+        if image is not None:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            res.append(image)
+
+    vidcap.release()
+    return res
+
+input_video_path = 'path/to/video'
+input_interval = 10
+frames = video_to_frame(
+    input_video_path, input_interval)
+
+control_frames = []
+# get canny image
+for frame in frames:
+    np_image = cv2.Canny(frame, 50, 100)
+    np_image = np_image[:, :, None]
+    np_image = np.concatenate([np_image, np_image, np_image], axis=2)
+    canny_image = Image.fromarray(np_image)
+    control_frames.append(canny_image)
+
+# You can use any ControlNet here
+controlnet = ControlNetModel.from_pretrained(
+    "lllyasviel/sd-controlnet-canny").to('cuda')
+
+# You can use any finetuned SD here
+pipe = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, custom_pipeline='rerender_a_video').to('cuda')
+
+# Optional: you can download vae-ft-mse-840000-ema-pruned.ckpt to enhance the results
+# pipe.vae = AutoencoderKL.from_single_file(
+#     "path/to/vae-ft-mse-840000-ema-pruned.ckpt").to('cuda')
+
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+
+generator = torch.manual_seed(0)
+frames = [Image.fromarray(frame) for frame in frames]
+output_frames = pipe(
+    "a beautiful woman in CG style, best quality, extremely detailed",
+    frames,
+    control_frames,
+    num_inference_steps=20,
+    strength=0.75,
+    controlnet_conditioning_scale=0.7,
+    generator=generator,
+    warp_start=0.0,
+    warp_end=0.1,
+    mask_start=0.5,
+    mask_end=0.8,
+    mask_strength=0.5,
+    negative_prompt='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+).frames[0]
+
+export_to_video(
+    output_frames, "/path/to/video.mp4", 5)
+```
+
+### StyleAligned Pipeline
+
+This pipeline is the implementation of [Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133). You can find more results [here](https://github.com/huggingface/diffusers/pull/6489#issuecomment-1881209354).
+
+> Large-scale Text-to-Image (T2I) models have rapidly gained prominence across creative fields, generating visually compelling outputs from textual prompts. However, controlling these models to ensure consistent style remains challenging, with existing methods necessitating fine-tuning and manual intervention to disentangle content and style. In this paper, we introduce StyleAligned, a novel technique designed to establish style alignment among a series of generated images. By employing minimal `attention sharing' during the diffusion process, our method maintains style consistency across images within T2I models. This approach allows for the creation of style-consistent images using a reference style through a straightforward inversion operation. Our method's evaluation across diverse styles and text prompts demonstrates high-quality synthesis and fidelity, underscoring its efficacy in achieving consistent style across various inputs.
+
+```python
+from typing import List
+
+import torch
+from diffusers import DiffusionPipeline
+from PIL import Image
+
+model_id = "a-r-r-o-w/dreamshaper-xl-turbo"
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16", custom_pipeline="pipeline_sdxl_style_aligned")
+pipe = pipe.to("cuda")
+
+# Enable memory saving techniques
+pipe.enable_vae_slicing()
+pipe.enable_vae_tiling()
+
+prompt = [
+  "a toy train. macro photo. 3d game asset",
+  "a toy airplane. macro photo. 3d game asset",
+  "a toy bicycle. macro photo. 3d game asset",
+  "a toy car. macro photo. 3d game asset",
+]
+negative_prompt = "low quality, worst quality, "
+
+# Enable StyleAligned
+pipe.enable_style_aligned(
+    share_group_norm=False,
+    share_layer_norm=False,
+    share_attention=True,
+    adain_queries=True,
+    adain_keys=True,
+    adain_values=False,
+    full_attention_share=False,
+    shared_score_scale=1.0,
+    shared_score_shift=0.0,
+    only_self_level=0.0,
+)
+
+# Run inference
+images = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    guidance_scale=2,
+    height=1024,
+    width=1024,
+    num_inference_steps=10,
+    generator=torch.Generator().manual_seed(42),
+).images
+
+# Disable StyleAligned if you do not wish to use it anymore
+pipe.disable_style_aligned()
+```
+
+### AnimateDiff Image-To-Video Pipeline
+
+This pipeline adds experimental support for the image-to-video task using AnimateDiff. Refer to [this](https://github.com/huggingface/diffusers/pull/6328) PR for more examples and results.
+
+This pipeline relies on a "hack" discovered by the community that allows the generation of videos given an input image with AnimateDiff. It works by creating a copy of the image `num_frames` times and progressively adding more noise to the image based on the strength and latent interpolation method.
+
+```py
+import torch
+from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler
+from diffusers.utils import export_to_gif, load_image
+
+model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
+adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
+pipe = DiffusionPipeline.from_pretrained(model_id, motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda")
+pipe.scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1)
+
+image = load_image("snail.png")
+output = pipe(
+  image=image,
+  prompt="A snail moving on the ground",
+  strength=0.8,
+  latent_interpolation_method="slerp",  # can be lerp, slerp, or your own callback
+)
+frames = output.frames[0]
+export_to_gif(frames, "animation.gif")
+```
+
+### IP Adapter Face ID
+
+IP Adapter FaceID is an experimental IP Adapter model that uses image embeddings generated by `insightface`, so no image encoder needs to be loaded.
+You need to install `insightface` and all its requirements to use this model.
+You must pass the image embedding tensor as `image_embeds` to the `DiffusionPipeline` instead of `ip_adapter_image`.
+You can find more results [here](https://github.com/huggingface/diffusers/pull/6276).
+
+```py
+import torch
+from diffusers.utils import load_image
+import cv2
+import numpy as np
+from diffusers import DiffusionPipeline, AutoencoderKL, DDIMScheduler
+from insightface.app import FaceAnalysis
+
+
+noise_scheduler = DDIMScheduler(
+    num_train_timesteps=1000,
+    beta_start=0.00085,
+    beta_end=0.012,
+    beta_schedule="scaled_linear",
+    clip_sample=False,
+    set_alpha_to_one=False,
+    steps_offset=1,
+)
+vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").to(dtype=torch.float16)
+pipeline = DiffusionPipeline.from_pretrained(
+    "SG161222/Realistic_Vision_V4.0_noVAE",
+    torch_dtype=torch.float16,
+    scheduler=noise_scheduler,
+    vae=vae,
+    custom_pipeline="ip_adapter_face_id"
+)
+pipeline.load_ip_adapter_face_id("h94/IP-Adapter-FaceID", "ip-adapter-faceid_sd15.bin")
+pipeline.to("cuda")
+
+generator = torch.Generator(device="cpu").manual_seed(42)
+num_images = 2
+
+image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png")
+
+app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
+app.prepare(ctx_id=0, det_size=(640, 640))
+image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
+faces = app.get(image)
+image = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
+images = pipeline(
+    prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
+    image_embeds=image,
+    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
+    num_inference_steps=20, num_images_per_prompt=num_images, width=512, height=704,
+    generator=generator
+).images
+
+for i in range(num_images):
+    images[i].save(f"c{i}.png")
+```
+
+### InstantID Pipeline
+
+InstantID is a new state-of-the-art tuning-free method to achieve ID-Preserving generation with only single image, supporting various downstream tasks. For any usage question, please refer to the [official implementation](https://github.com/InstantID/InstantID).
+
+```py
+# !pip install diffusers opencv-python transformers accelerate insightface
+import diffusers
+from diffusers.utils import load_image
+from diffusers import ControlNetModel
+
+import cv2
+import torch
+import numpy as np
+from PIL import Image
+
+from insightface.app import FaceAnalysis
+from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline, draw_kps
+
+# prepare 'antelopev2' under ./models
+# https://github.com/deepinsight/insightface/issues/1896#issuecomment-1023867304
+app = FaceAnalysis(name='antelopev2', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
+app.prepare(ctx_id=0, det_size=(640, 640))
+
+# prepare models under ./checkpoints
+# https://huggingface.co/InstantX/InstantID
+from huggingface_hub import hf_hub_download
+
+hf_hub_download(repo_id="InstantX/InstantID", filename="ControlNetModel/config.json", local_dir="./checkpoints")
+hf_hub_download(repo_id="InstantX/InstantID", filename="ControlNetModel/diffusion_pytorch_model.safetensors", local_dir="./checkpoints")
+hf_hub_download(repo_id="InstantX/InstantID", filename="ip-adapter.bin", local_dir="./checkpoints")
+
+face_adapter = './checkpoints/ip-adapter.bin'
+controlnet_path = './checkpoints/ControlNetModel'
+
+# load IdentityNet
+controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
+
+base_model = 'wangqixun/YamerMIX_v8'
+pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
+    base_model,
+    controlnet=controlnet,
+    torch_dtype=torch.float16
+)
+pipe.to("cuda")
+
+# load adapter
+pipe.load_ip_adapter_instantid(face_adapter)
+
+# load an image
+face_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png")
+
+# prepare face emb
+face_info = app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
+face_info = sorted(face_info, key=lambda x:(x['bbox'][2]-x['bbox'][0])*x['bbox'][3]-x['bbox'][1])[-1]  # only use the maximum face
+face_emb = face_info['embedding']
+face_kps = draw_kps(face_image, face_info['kps'])
+
+# prompt
+prompt = "film noir style, ink sketch|vector, male man, highly detailed, sharp focus, ultra sharpness, monochrome, high contrast, dramatic shadows, 1940s style, mysterious, cinematic"
+negative_prompt = "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, vibrant, colorful"
+
+# generate image
+pipe.set_ip_adapter_scale(0.8)
+image = pipe(
+    prompt,
+    image_embeds=face_emb,
+    image=face_kps,
+    controlnet_conditioning_scale=0.8,
+).images[0]
+```
+
+### UFOGen Scheduler
+
+[UFOGen](https://arxiv.org/abs/2311.09257) is a generative model designed for fast one-step text-to-image generation, trained via adversarial training starting from an initial pretrained diffusion model such as Stable Diffusion. `scheduling_ufogen.py` implements a onestep and multistep sampling algorithm for UFOGen models compatible with pipelines like `StableDiffusionPipeline`. A usage example is as follows:
+
+```py
+import torch
+from diffusers import StableDiffusionPipeline
+
+from scheduling_ufogen import UFOGenScheduler
+
+# NOTE: currently, I am not aware of any publicly available UFOGen model checkpoints trained from SD v1.5.
+ufogen_model_id_or_path = "/path/to/ufogen/model"
+pipe = StableDiffusionPipeline(
+    ufogen_model_id_or_path,
+    torch_dtype=torch.float16,
+)
+
+# You can initialize a UFOGenScheduler as follows:
+pipe.scheduler = UFOGenScheduler.from_config(pipe.scheduler.config)
+
+prompt = "Three cats having dinner at a table at new years eve, cinematic shot, 8k."
+
+# Onestep sampling
+onestep_image = pipe(prompt, num_inference_steps=1).images[0]
+
+# Multistep sampling
+multistep_image = pipe(prompt, num_inference_steps=4).images[0]
+```
+
+### FRESCO
+
+This is the Diffusers implementation of zero-shot video-to-video translation pipeline [FRESCO](https://github.com/williamyang1991/FRESCO) (without Ebsynth postprocessing and background smooth). To run the code, please install gmflow. Then modify the path in `gmflow_dir`. After that, you can run the pipeline with:
+
+```py
+from PIL import Image
+import cv2
+import torch
+import numpy as np
+
+from diffusers import ControlNetModel, DDIMScheduler, DiffusionPipeline
+import sys
+
+gmflow_dir = "/path/to/gmflow"
+sys.path.insert(0, gmflow_dir)
+
+def video_to_frame(video_path: str, interval: int):
+    vidcap = cv2.VideoCapture(video_path)
+    success = True
+
+    count = 0
+    res = []
+    while success:
+        count += 1
+        success, image = vidcap.read()
+        if count % interval != 1:
+            continue
+        if image is not None:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            res.append(image)
+            if len(res) >= 8:
+                break
+
+    vidcap.release()
+    return res
+
+
+input_video_path = 'https://github.com/williamyang1991/FRESCO/raw/main/data/car-turn.mp4'
+output_video_path = 'car.gif'
+
+# You can use any finetuned SD here
+model_path = 'SG161222/Realistic_Vision_V2.0'
+
+prompt = 'a red car turns in the winter'
+a_prompt = ', RAW photo, subject, (high detailed skin:1.2), 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3, '
+n_prompt = '(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers:1.4), (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation'
+
+input_interval = 5
+frames = video_to_frame(
+    input_video_path, input_interval)
+
+control_frames = []
+# get canny image
+for frame in frames:
+    image = cv2.Canny(frame, 50, 100)
+    np_image = np.array(image)
+    np_image = np_image[:, :, None]
+    np_image = np.concatenate([np_image, np_image, np_image], axis=2)
+    canny_image = Image.fromarray(np_image)
+    control_frames.append(canny_image)
+
+# You can use any ControlNet here
+controlnet = ControlNetModel.from_pretrained(
+    "lllyasviel/sd-controlnet-canny").to('cuda')
+
+pipe = DiffusionPipeline.from_pretrained(
+    model_path, controlnet=controlnet, custom_pipeline='fresco_v2v').to('cuda')
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+
+generator = torch.manual_seed(0)
+frames = [Image.fromarray(frame) for frame in frames]
+
+output_frames = pipe(
+    prompt + a_prompt,
+    frames,
+    control_frames,
+    num_inference_steps=20,
+    strength=0.75,
+    controlnet_conditioning_scale=0.7,
+    generator=generator,
+    negative_prompt=n_prompt
+).images
+
+output_frames[0].save(output_video_path, save_all=True,
+                 append_images=output_frames[1:], duration=100, loop=0)
+```
+
+### AnimateDiff on IPEX
+
+This diffusion pipeline aims to accelerate the inference of AnimateDiff on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).
+
+To use this pipeline, you need to:
+1. Install [IPEX](https://github.com/intel/intel-extension-for-pytorch)
+
+**Note:** For each PyTorch release, there is a corresponding release of IPEX. Here is the mapping relationship. It is recommended to install Pytorch/IPEX2.3 to get the best performance.
+
+|PyTorch Version|IPEX Version|
+|--|--|
+|[v2.3.\*](https://github.com/pytorch/pytorch/tree/v2.3.0 "v2.3.0")|[v2.3.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v2.3.0+cpu)|
+|[v1.13.\*](https://github.com/pytorch/pytorch/tree/v1.13.0 "v1.13.0")|[v1.13.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v1.13.100+cpu)|
+
+You can simply use pip to install IPEX with the latest version.
+```python
+python -m pip install intel_extension_for_pytorch
+```
+**Note:** To install a specific version, run with the following command:
+```
+python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
+```
+2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX accelaration. Supported inference datatypes are Float32 and BFloat16.
+
+```python
+pipe = AnimateDiffPipelineIpex.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
+# For Float32
+pipe.prepare_for_ipex(torch.float32, prompt="A girl smiling")
+# For BFloat16
+pipe.prepare_for_ipex(torch.bfloat16, prompt="A girl smiling")
+```
+
+Then you can use the ipex pipeline in a similar way to the default animatediff pipeline.
+```python
+# For Float32
+output = pipe(prompt="A girl smiling", guidance_scale=1.0, num_inference_steps=step)
+# For BFloat16
+with torch.cpu.amp.autocast(enabled = True, dtype = torch.bfloat16):
+    output = pipe(prompt="A girl smiling", guidance_scale=1.0, num_inference_steps=step)
+```
+
+The following code compares the performance of the original animatediff pipeline with the ipex-optimized pipeline.
+By using this optimized pipeline, we can get about 1.5-2.2 times performance boost with BFloat16 on the fifth generation of Intel Xeon CPUs, code-named Emerald Rapids.
+
+```python
+import torch
+from diffusers import MotionAdapter, AnimateDiffPipeline, EulerDiscreteScheduler
+from safetensors.torch import load_file
+from pipeline_animatediff_ipex import AnimateDiffPipelineIpex
+import time
+
+device = "cpu"
+dtype = torch.float32
+
+prompt = "A girl smiling"
+step = 8  # Options: [1,2,4,8]
+repo = "ByteDance/AnimateDiff-Lightning"
+ckpt = f"animatediff_lightning_{step}step_diffusers.safetensors"
+base = "emilianJR/epiCRealism"  # Choose to your favorite base model.
+
+adapter = MotionAdapter().to(device, dtype)
+adapter.load_state_dict(load_file(hf_hub_download(repo, ckpt), device=device))
+
+# Helper function for time evaluation
+def elapsed_time(pipeline, nb_pass=3, num_inference_steps=1):
+    # warmup
+    for _ in range(2):
+        output = pipeline(prompt = prompt, guidance_scale=1.0, num_inference_steps = num_inference_steps)
+    #time evaluation
+    start = time.time()
+    for _ in range(nb_pass):
+        pipeline(prompt = prompt, guidance_scale=1.0, num_inference_steps = num_inference_steps)
+    end = time.time()
+    return (end - start) / nb_pass
+
+##############     bf16 inference performance    ###############
+
+# 1. IPEX Pipeline initialization
+pipe = AnimateDiffPipelineIpex.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
+pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing", beta_schedule="linear")
+pipe.prepare_for_ipex(torch.bfloat16, prompt = prompt)
+
+# 2. Original Pipeline initialization
+pipe2 = AnimateDiffPipeline.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
+pipe2.scheduler = EulerDiscreteScheduler.from_config(pipe2.scheduler.config, timestep_spacing="trailing", beta_schedule="linear")
+
+# 3. Compare performance between Original Pipeline and IPEX Pipeline
+with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+    latency = elapsed_time(pipe, num_inference_steps=step)
+    print("Latency of AnimateDiffPipelineIpex--bf16", latency, "s for total", step, "steps")
+    latency = elapsed_time(pipe2, num_inference_steps=step)
+    print("Latency of AnimateDiffPipeline--bf16", latency, "s for total", step, "steps")
+
+##############     fp32 inference performance    ###############
+
+# 1. IPEX Pipeline initialization
+pipe3 = AnimateDiffPipelineIpex.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
+pipe3.scheduler = EulerDiscreteScheduler.from_config(pipe3.scheduler.config, timestep_spacing="trailing", beta_schedule="linear")
+pipe3.prepare_for_ipex(torch.float32, prompt = prompt)
+
+# 2. Original Pipeline initialization
+pipe4 = AnimateDiffPipeline.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
+pipe4.scheduler = EulerDiscreteScheduler.from_config(pipe4.scheduler.config, timestep_spacing="trailing", beta_schedule="linear")
+
+# 3. Compare performance between Original Pipeline and IPEX Pipeline
+latency = elapsed_time(pipe3, num_inference_steps=step)
+print("Latency of AnimateDiffPipelineIpex--fp32", latency, "s for total", step, "steps")
+latency = elapsed_time(pipe4, num_inference_steps=step)
+print("Latency of AnimateDiffPipeline--fp32",latency, "s for total", step, "steps")
+```
+### HunyuanDiT with Differential Diffusion
+
+#### Usage
+
+```python
+import torch
+from diffusers import FlowMatchEulerDiscreteScheduler
+from diffusers.utils import load_image
+from PIL import Image
+from torchvision import transforms
+
+from pipeline_hunyuandit_differential_img2img import (
+    HunyuanDiTDifferentialImg2ImgPipeline,
+)
+
+
+pipe = HunyuanDiTDifferentialImg2ImgPipeline.from_pretrained(
+    "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
+).to("cuda")
+
+
+source_image = load_image(
+    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/20240329211129_4024911930.png"
+)
+map = load_image(
+    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/gradient_mask_2.png"
+)
+prompt = "a green pear"
+negative_prompt = "blurry"
+
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    image=source_image,
+    num_inference_steps=28,
+    guidance_scale=4.5,
+    strength=1.0,
+    map=map,
+).images[0]
+```
+
+| ![Gradient](https://github.com/user-attachments/assets/e38ce4d5-1ae6-4df0-ab43-adc1b45716b5) | ![Input](https://github.com/user-attachments/assets/9c95679c-e9d7-4f5a-90d6-560203acd6b3) | ![Output](https://github.com/user-attachments/assets/5313ff64-a0c4-418b-8b55-a38f1a5e7532) |
+| ------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- |
+| Gradient                                                                                   | Input                                                                                   | Output                                                                                   |
+
+A colab notebook demonstrating all results can be found [here](https://colab.research.google.com/drive/1v44a5fpzyr4Ffr4v2XBQ7BajzG874N4P?usp=sharing). Depth Maps have also been added in the same colab.
+
+# Perturbed-Attention Guidance
+
+[Project](https://ku-cvlab.github.io/Perturbed-Attention-Guidance/) / [arXiv](https://arxiv.org/abs/2403.17377) / [GitHub](https://github.com/KU-CVLAB/Perturbed-Attention-Guidance)
+
+This implementation is based on [Diffusers](https://huggingface.co/docs/diffusers/index). `StableDiffusionPAGPipeline` is a modification of `StableDiffusionPipeline` to support Perturbed-Attention Guidance (PAG).
+
+## Example Usage
+
+```py
+import os
+import torch
+
+from accelerate.utils import set_seed
+
+from diffusers import StableDiffusionPipeline
+from diffusers.utils import load_image, make_image_grid
+from diffusers.utils.torch_utils import randn_tensor
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    custom_pipeline="hyoungwoncho/sd_perturbed_attention_guidance",
+    torch_dtype=torch.float16
+)
+
+device = "cuda"
+pipe = pipe.to(device)
+
+pag_scale = 5.0
+pag_applied_layers_index = ['m0']
+
+batch_size = 4
+seed = 10
+
+base_dir = "./results/"
+grid_dir = base_dir + "/pag" + str(pag_scale) + "/"
+
+if not os.path.exists(grid_dir):
+    os.makedirs(grid_dir)
+
+set_seed(seed)
+
+latent_input = randn_tensor(shape=(batch_size,4,64,64), generator=None, device=device, dtype=torch.float16)
+
+output_baseline = pipe(
+    "",
+    width=512,
+    height=512,
+    num_inference_steps=50,
+    guidance_scale=0.0,
+    pag_scale=0.0,
+    pag_applied_layers_index=pag_applied_layers_index,
+    num_images_per_prompt=batch_size,
+    latents=latent_input
+).images
+
+output_pag = pipe(
+    "",
+    width=512,
+    height=512,
+    num_inference_steps=50,
+    guidance_scale=0.0,
+    pag_scale=5.0,
+    pag_applied_layers_index=pag_applied_layers_index,
+    num_images_per_prompt=batch_size,
+    latents=latent_input
+).images
+
+grid_image = make_image_grid(output_baseline + output_pag, rows=2, cols=batch_size)
+grid_image.save(grid_dir + "sample.png")
+```
+
+## PAG Parameters
+
+`pag_scale` : guidance scale of PAG (ex: 5.0)
+
+`pag_applied_layers_index` : index of the layer to apply perturbation (ex: ['m0'])
diff --git a/diffusers/examples/community/README_community_scripts.md b/diffusers/examples/community/README_community_scripts.md
new file mode 100644
index 0000000000000000000000000000000000000000..8432b4e82c9f594449c3ce7b1d6f18711a5ce795
--- /dev/null
+++ b/diffusers/examples/community/README_community_scripts.md
@@ -0,0 +1,232 @@
+# Community Scripts
+
+**Community scripts** consist of inference examples using Diffusers pipelines that have been added by the community.
+Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste code example that you can try out.
+If a community script doesn't work as expected, please open an issue and ping the author on it.
+
+| Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
+|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
+| Using IP-Adapter with negative noise                                                                                                  | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details)                                                                                                                                                                                                                                                    | [IP-Adapter Negative Noise](#ip-adapter-negative-noise)                                   | | [Álvaro Somoza](https://github.com/asomoza)|
+| asymmetric tiling                                                                                                  |configure seamless image tiling independently for the X and Y axes                                                                                                                                                                                                      | [Asymmetric Tiling](#asymmetric-tiling )                                   | | [alexisrolland](https://github.com/alexisrolland)|
+
+
+## Example usages
+
+### IP Adapter Negative Noise
+
+Diffusers pipelines are fully integrated with IP-Adapter, which allows you to prompt the diffusion model with an image. However, it does not support negative image prompts (there is no `negative_ip_adapter_image` argument) the same way it supports negative text prompts. When you pass an `ip_adapter_image,` it will create a zero-filled tensor as a negative image. This script shows you how to create a negative noise from `ip_adapter_image` and use it to significantly improve the generation quality while preserving the composition of images.
+
+[cubiq](https://github.com/cubiq) initially developed this feature in his [repository](https://github.com/cubiq/ComfyUI_IPAdapter_plus). The community script was contributed by [asomoza](https://github.com/Somoza). You can find more details about this experimentation [this discussion](https://github.com/huggingface/diffusers/discussions/7167)
+
+IP-Adapter without negative noise
+|source|result|
+|---|---|
+|![20240229150812](https://github.com/huggingface/diffusers/assets/5442875/901d8bd8-7a59-4fe7-bda1-a0e0d6c7dffd)|![20240229163923_normal](https://github.com/huggingface/diffusers/assets/5442875/3432e25a-ece6-45f4-a3f4-fca354f40b5b)|
+
+IP-Adapter with negative noise
+|source|result|
+|---|---|
+|![20240229150812](https://github.com/huggingface/diffusers/assets/5442875/901d8bd8-7a59-4fe7-bda1-a0e0d6c7dffd)|![20240229163923](https://github.com/huggingface/diffusers/assets/5442875/736fd15a-36ba-40c0-a7d8-6ec1ac26f788)|
+
+```python
+import torch
+
+from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, StableDiffusionXLPipeline
+from diffusers.models import ImageProjection
+from diffusers.utils import load_image
+
+
+def encode_image(
+    image_encoder,
+    feature_extractor,
+    image,
+    device,
+    num_images_per_prompt,
+    output_hidden_states=None,
+    negative_image=None,
+):
+    dtype = next(image_encoder.parameters()).dtype
+
+    if not isinstance(image, torch.Tensor):
+        image = feature_extractor(image, return_tensors="pt").pixel_values
+
+    image = image.to(device=device, dtype=dtype)
+    if output_hidden_states:
+        image_enc_hidden_states = image_encoder(image, output_hidden_states=True).hidden_states[-2]
+        image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if negative_image is None:
+            uncond_image_enc_hidden_states = image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+        else:
+            if not isinstance(negative_image, torch.Tensor):
+                negative_image = feature_extractor(negative_image, return_tensors="pt").pixel_values
+            negative_image = negative_image.to(device=device, dtype=dtype)
+            uncond_image_enc_hidden_states = image_encoder(negative_image, output_hidden_states=True).hidden_states[-2]
+
+        uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        return image_enc_hidden_states, uncond_image_enc_hidden_states
+    else:
+        image_embeds = image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        uncond_image_embeds = torch.zeros_like(image_embeds)
+
+        return image_embeds, uncond_image_embeds
+
+
+@torch.no_grad()
+def prepare_ip_adapter_image_embeds(
+    unet,
+    image_encoder,
+    feature_extractor,
+    ip_adapter_image,
+    do_classifier_free_guidance,
+    device,
+    num_images_per_prompt,
+    ip_adapter_negative_image=None,
+):
+    if not isinstance(ip_adapter_image, list):
+        ip_adapter_image = [ip_adapter_image]
+
+    if len(ip_adapter_image) != len(unet.encoder_hid_proj.image_projection_layers):
+        raise ValueError(
+            f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+        )
+
+    image_embeds = []
+    for single_ip_adapter_image, image_proj_layer in zip(
+        ip_adapter_image, unet.encoder_hid_proj.image_projection_layers
+    ):
+        output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+        single_image_embeds, single_negative_image_embeds = encode_image(
+            image_encoder,
+            feature_extractor,
+            single_ip_adapter_image,
+            device,
+            1,
+            output_hidden_state,
+            negative_image=ip_adapter_negative_image,
+        )
+        single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+        single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+            single_image_embeds = single_image_embeds.to(device)
+
+        image_embeds.append(single_image_embeds)
+
+    return image_embeds
+
+
+vae = AutoencoderKL.from_pretrained(
+    "madebyollin/sdxl-vae-fp16-fix",
+    torch_dtype=torch.float16,
+).to("cuda")
+
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "RunDiffusion/Juggernaut-XL-v9",
+    torch_dtype=torch.float16,
+    vae=vae,
+    variant="fp16",
+).to("cuda")
+
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+pipeline.scheduler.config.use_karras_sigmas = True
+
+pipeline.load_ip_adapter(
+    "h94/IP-Adapter",
+    subfolder="sdxl_models",
+    weight_name="ip-adapter-plus_sdxl_vit-h.safetensors",
+    image_encoder_folder="models/image_encoder",
+)
+pipeline.set_ip_adapter_scale(0.7)
+
+ip_image = load_image("source.png")
+negative_ip_image = load_image("noise.png")
+
+image_embeds = prepare_ip_adapter_image_embeds(
+    unet=pipeline.unet,
+    image_encoder=pipeline.image_encoder,
+    feature_extractor=pipeline.feature_extractor,
+    ip_adapter_image=[[ip_image]],
+    do_classifier_free_guidance=True,
+    device="cuda",
+    num_images_per_prompt=1,
+    ip_adapter_negative_image=negative_ip_image,
+)
+
+
+prompt = "cinematic photo of a cyborg in the city, 4k, high quality, intricate, highly detailed"
+negative_prompt = "blurry, smooth, plastic"
+
+image = pipeline(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    ip_adapter_image_embeds=image_embeds,
+    guidance_scale=6.0,
+    num_inference_steps=25,
+    generator=torch.Generator(device="cpu").manual_seed(1556265306),
+).images[0]
+
+image.save("result.png")
+```
+
+### Asymmetric Tiling
+Stable Diffusion is not trained to generate seamless textures. However, you can use this simple script to add tiling to your generation. This script is contributed by [alexisrolland](https://github.com/alexisrolland). See more details in the [this issue](https://github.com/huggingface/diffusers/issues/556)
+
+
+|Generated|Tiled|
+|---|---|
+|![20240313003235_573631814](https://github.com/huggingface/diffusers/assets/5442875/eca174fb-06a4-464e-a3a7-00dbb024543e)|![wall](https://github.com/huggingface/diffusers/assets/5442875/b4aa774b-2a6a-4316-a8eb-8f30b5f4d024)|
+
+
+```py
+import torch
+from typing import Optional
+from diffusers import StableDiffusionPipeline
+from diffusers.models.lora import LoRACompatibleConv
+
+def seamless_tiling(pipeline, x_axis, y_axis):
+    def asymmetric_conv2d_convforward(self, input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
+        self.paddingX = (self._reversed_padding_repeated_twice[0], self._reversed_padding_repeated_twice[1], 0, 0)
+        self.paddingY = (0, 0, self._reversed_padding_repeated_twice[2], self._reversed_padding_repeated_twice[3])
+        working = torch.nn.functional.pad(input, self.paddingX, mode=x_mode)
+        working = torch.nn.functional.pad(working, self.paddingY, mode=y_mode)
+        return torch.nn.functional.conv2d(working, weight, bias, self.stride, torch.nn.modules.utils._pair(0), self.dilation, self.groups)
+    x_mode = 'circular' if x_axis else 'constant'
+    y_mode = 'circular' if y_axis else 'constant'
+    targets = [pipeline.vae, pipeline.text_encoder, pipeline.unet]
+    convolution_layers = []
+    for target in targets:
+        for module in target.modules():
+            if isinstance(module, torch.nn.Conv2d):
+                convolution_layers.append(module)
+    for layer in convolution_layers:
+        if isinstance(layer, LoRACompatibleConv) and layer.lora_layer is None:
+            layer.lora_layer = lambda * x: 0
+        layer._conv_forward = asymmetric_conv2d_convforward.__get__(layer, torch.nn.Conv2d)
+    return pipeline
+
+pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True)
+pipeline.enable_model_cpu_offload()
+prompt = ["texture of a red brick wall"]
+seed = 123456
+generator = torch.Generator(device='cuda').manual_seed(seed)
+
+pipeline = seamless_tiling(pipeline=pipeline, x_axis=True, y_axis=True)
+image = pipeline(
+    prompt=prompt,
+    width=512,
+    height=512,
+    num_inference_steps=20,
+    guidance_scale=7,
+    num_images_per_prompt=1,
+    generator=generator
+).images[0]
+seamless_tiling(pipeline=pipeline, x_axis=False, y_axis=False)
+
+torch.cuda.empty_cache()
+image.save('image.png')
+```
\ No newline at end of file
diff --git a/diffusers/examples/community/bit_diffusion.py b/diffusers/examples/community/bit_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d8f31163f2db719c418b5cbb5f54479c4b9033
--- /dev/null
+++ b/diffusers/examples/community/bit_diffusion.py
@@ -0,0 +1,264 @@
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange, reduce
+
+from diffusers import DDIMScheduler, DDPMScheduler, DiffusionPipeline, ImagePipelineOutput, UNet2DConditionModel
+from diffusers.schedulers.scheduling_ddim import DDIMSchedulerOutput
+from diffusers.schedulers.scheduling_ddpm import DDPMSchedulerOutput
+
+
+BITS = 8
+
+
+# convert to bit representations and back taken from https://github.com/lucidrains/bit-diffusion/blob/main/bit_diffusion/bit_diffusion.py
+def decimal_to_bits(x, bits=BITS):
+    """expects image tensor ranging from 0 to 1, outputs bit tensor ranging from -1 to 1"""
+    device = x.device
+
+    x = (x * 255).int().clamp(0, 255)
+
+    mask = 2 ** torch.arange(bits - 1, -1, -1, device=device)
+    mask = rearrange(mask, "d -> d 1 1")
+    x = rearrange(x, "b c h w -> b c 1 h w")
+
+    bits = ((x & mask) != 0).float()
+    bits = rearrange(bits, "b c d h w -> b (c d) h w")
+    bits = bits * 2 - 1
+    return bits
+
+
+def bits_to_decimal(x, bits=BITS):
+    """expects bits from -1 to 1, outputs image tensor from 0 to 1"""
+    device = x.device
+
+    x = (x > 0).int()
+    mask = 2 ** torch.arange(bits - 1, -1, -1, device=device, dtype=torch.int32)
+
+    mask = rearrange(mask, "d -> d 1 1")
+    x = rearrange(x, "b (c d) h w -> b c d h w", d=8)
+    dec = reduce(x * mask, "b c d h w -> b c h w", "sum")
+    return (dec / 255).clamp(0.0, 1.0)
+
+
+# modified scheduler step functions for clamping the predicted x_0 between -bit_scale and +bit_scale
+def ddim_bit_scheduler_step(
+    self,
+    model_output: torch.Tensor,
+    timestep: int,
+    sample: torch.Tensor,
+    eta: float = 0.0,
+    use_clipped_model_output: bool = True,
+    generator=None,
+    return_dict: bool = True,
+) -> Union[DDIMSchedulerOutput, Tuple]:
+    """
+    Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+    process from the learned model outputs (most often the predicted noise).
+    Args:
+        model_output (`torch.Tensor`): direct output from learned diffusion model.
+        timestep (`int`): current discrete timestep in the diffusion chain.
+        sample (`torch.Tensor`):
+            current instance of sample being created by diffusion process.
+        eta (`float`): weight of noise for added noise in diffusion step.
+        use_clipped_model_output (`bool`): TODO
+        generator: random number generator.
+        return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
+    Returns:
+        [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`:
+        [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+        returning a tuple, the first element is the sample tensor.
+    """
+    if self.num_inference_steps is None:
+        raise ValueError(
+            "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+        )
+
+    # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+    # Ideally, read DDIM paper in-detail understanding
+
+    # Notation (<variable name> -> <name in paper>
+    # - pred_noise_t -> e_theta(x_t, t)
+    # - pred_original_sample -> f_theta(x_t, t) or x_0
+    # - std_dev_t -> sigma_t
+    # - eta -> η
+    # - pred_sample_direction -> "direction pointing to x_t"
+    # - pred_prev_sample -> "x_t-1"
+
+    # 1. get previous step value (=t-1)
+    prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+    # 2. compute alphas, betas
+    alpha_prod_t = self.alphas_cumprod[timestep]
+    alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+    beta_prod_t = 1 - alpha_prod_t
+
+    # 3. compute predicted original sample from predicted noise also called
+    # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+
+    # 4. Clip "predicted x_0"
+    scale = self.bit_scale
+    if self.config.clip_sample:
+        pred_original_sample = torch.clamp(pred_original_sample, -scale, scale)
+
+    # 5. compute variance: "sigma_t(η)" -> see formula (16)
+    # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+    variance = self._get_variance(timestep, prev_timestep)
+    std_dev_t = eta * variance ** (0.5)
+
+    if use_clipped_model_output:
+        # the model_output is always re-derived from the clipped x_0 in Glide
+        model_output = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+
+    # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output
+
+    # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+
+    if eta > 0:
+        # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072
+        device = model_output.device if torch.is_tensor(model_output) else "cpu"
+        noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device)
+        variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * noise
+
+        prev_sample = prev_sample + variance
+
+    if not return_dict:
+        return (prev_sample,)
+
+    return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+
+def ddpm_bit_scheduler_step(
+    self,
+    model_output: torch.Tensor,
+    timestep: int,
+    sample: torch.Tensor,
+    prediction_type="epsilon",
+    generator=None,
+    return_dict: bool = True,
+) -> Union[DDPMSchedulerOutput, Tuple]:
+    """
+    Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+    process from the learned model outputs (most often the predicted noise).
+    Args:
+        model_output (`torch.Tensor`): direct output from learned diffusion model.
+        timestep (`int`): current discrete timestep in the diffusion chain.
+        sample (`torch.Tensor`):
+            current instance of sample being created by diffusion process.
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the samples (`sample`).
+        generator: random number generator.
+        return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class
+    Returns:
+        [`~schedulers.scheduling_utils.DDPMSchedulerOutput`] or `tuple`:
+        [`~schedulers.scheduling_utils.DDPMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+        returning a tuple, the first element is the sample tensor.
+    """
+    t = timestep
+
+    if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
+        model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
+    else:
+        predicted_variance = None
+
+    # 1. compute alphas, betas
+    alpha_prod_t = self.alphas_cumprod[t]
+    alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+    beta_prod_t = 1 - alpha_prod_t
+    beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+    # 2. compute predicted original sample from predicted noise also called
+    # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+    if prediction_type == "epsilon":
+        pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+    elif prediction_type == "sample":
+        pred_original_sample = model_output
+    else:
+        raise ValueError(f"Unsupported prediction_type {prediction_type}.")
+
+    # 3. Clip "predicted x_0"
+    scale = self.bit_scale
+    if self.config.clip_sample:
+        pred_original_sample = torch.clamp(pred_original_sample, -scale, scale)
+
+    # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
+    # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+    pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[t]) / beta_prod_t
+    current_sample_coeff = self.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t
+
+    # 5. Compute predicted previous sample µ_t
+    # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+    pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
+
+    # 6. Add noise
+    variance = 0
+    if t > 0:
+        noise = torch.randn(
+            model_output.size(), dtype=model_output.dtype, layout=model_output.layout, generator=generator
+        ).to(model_output.device)
+        variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * noise
+
+    pred_prev_sample = pred_prev_sample + variance
+
+    if not return_dict:
+        return (pred_prev_sample,)
+
+    return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+
+
+class BitDiffusion(DiffusionPipeline):
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, DDPMScheduler],
+        bit_scale: Optional[float] = 1.0,
+    ):
+        super().__init__()
+        self.bit_scale = bit_scale
+        self.scheduler.step = (
+            ddim_bit_scheduler_step if isinstance(scheduler, DDIMScheduler) else ddpm_bit_scheduler_step
+        )
+
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        height: Optional[int] = 256,
+        width: Optional[int] = 256,
+        num_inference_steps: Optional[int] = 50,
+        generator: Optional[torch.Generator] = None,
+        batch_size: Optional[int] = 1,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[Tuple, ImagePipelineOutput]:
+        latents = torch.randn(
+            (batch_size, self.unet.config.in_channels, height, width),
+            generator=generator,
+        )
+        latents = decimal_to_bits(latents) * self.bit_scale
+        latents = latents.to(self.device)
+
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # predict the noise residual
+            noise_pred = self.unet(latents, t).sample
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+
+        image = bits_to_decimal(latents)
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/examples/community/checkpoint_merger.py b/diffusers/examples/community/checkpoint_merger.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ba4b8c6e837964723aee3e7f573b84f2470493f
--- /dev/null
+++ b/diffusers/examples/community/checkpoint_merger.py
@@ -0,0 +1,284 @@
+import glob
+import os
+from typing import Dict, List, Union
+
+import safetensors.torch
+import torch
+from huggingface_hub import snapshot_download
+from huggingface_hub.utils import validate_hf_hub_args
+
+from diffusers import DiffusionPipeline, __version__
+from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
+from diffusers.utils import CONFIG_NAME, ONNX_WEIGHTS_NAME, WEIGHTS_NAME
+
+
+class CheckpointMergerPipeline(DiffusionPipeline):
+    """
+    A class that supports merging diffusion models based on the discussion here:
+    https://github.com/huggingface/diffusers/issues/877
+
+    Example usage:-
+
+    pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="checkpoint_merger.py")
+
+    merged_pipe = pipe.merge(["CompVis/stable-diffusion-v1-4","prompthero/openjourney"], interp = 'inv_sigmoid', alpha = 0.8, force = True)
+
+    merged_pipe.to('cuda')
+
+    prompt = "An astronaut riding a unicycle on Mars"
+
+    results = merged_pipe(prompt)
+
+    ## For more details, see the docstring for the merge method.
+
+    """
+
+    def __init__(self):
+        self.register_to_config()
+        super().__init__()
+
+    def _compare_model_configs(self, dict0, dict1):
+        if dict0 == dict1:
+            return True
+        else:
+            config0, meta_keys0 = self._remove_meta_keys(dict0)
+            config1, meta_keys1 = self._remove_meta_keys(dict1)
+            if config0 == config1:
+                print(f"Warning !: Mismatch in keys {meta_keys0} and {meta_keys1}.")
+                return True
+        return False
+
+    def _remove_meta_keys(self, config_dict: Dict):
+        meta_keys = []
+        temp_dict = config_dict.copy()
+        for key in config_dict.keys():
+            if key.startswith("_"):
+                temp_dict.pop(key)
+                meta_keys.append(key)
+        return (temp_dict, meta_keys)
+
+    @torch.no_grad()
+    @validate_hf_hub_args
+    def merge(self, pretrained_model_name_or_path_list: List[Union[str, os.PathLike]], **kwargs):
+        """
+        Returns a new pipeline object of the class 'DiffusionPipeline' with the merged checkpoints(weights) of the models passed
+        in the argument 'pretrained_model_name_or_path_list' as a list.
+
+        Parameters:
+        -----------
+            pretrained_model_name_or_path_list : A list of valid pretrained model names in the HuggingFace hub or paths to locally stored models in the HuggingFace format.
+
+            **kwargs:
+                Supports all the default DiffusionPipeline.get_config_dict kwargs viz..
+
+                cache_dir, force_download, proxies, local_files_only, token, revision, torch_dtype, device_map.
+
+                alpha - The interpolation parameter. Ranges from 0 to 1.  It affects the ratio in which the checkpoints are merged. A 0.8 alpha
+                    would mean that the first model checkpoints would affect the final result far less than an alpha of 0.2
+
+                interp - The interpolation method to use for the merging. Supports "sigmoid", "inv_sigmoid", "add_diff" and None.
+                    Passing None uses the default interpolation which is weighted sum interpolation. For merging three checkpoints, only "add_diff" is supported.
+
+                force - Whether to ignore mismatch in model_config.json for the current models. Defaults to False.
+
+                variant - which variant of a pretrained model to load, e.g. "fp16" (None)
+
+        """
+        # Default kwargs from DiffusionPipeline
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        token = kwargs.pop("token", None)
+        variant = kwargs.pop("variant", None)
+        revision = kwargs.pop("revision", None)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+        device_map = kwargs.pop("device_map", None)
+
+        alpha = kwargs.pop("alpha", 0.5)
+        interp = kwargs.pop("interp", None)
+
+        print("Received list", pretrained_model_name_or_path_list)
+        print(f"Combining with alpha={alpha}, interpolation mode={interp}")
+
+        checkpoint_count = len(pretrained_model_name_or_path_list)
+        # Ignore result from model_index_json comparison of the two checkpoints
+        force = kwargs.pop("force", False)
+
+        # If less than 2 checkpoints, nothing to merge. If more than 3, not supported for now.
+        if checkpoint_count > 3 or checkpoint_count < 2:
+            raise ValueError(
+                "Received incorrect number of checkpoints to merge. Ensure that either 2 or 3 checkpoints are being"
+                " passed."
+            )
+
+        print("Received the right number of checkpoints")
+        # chkpt0, chkpt1 = pretrained_model_name_or_path_list[0:2]
+        # chkpt2 = pretrained_model_name_or_path_list[2] if checkpoint_count == 3 else None
+
+        # Validate that the checkpoints can be merged
+        # Step 1: Load the model config and compare the checkpoints. We'll compare the model_index.json first while ignoring the keys starting with '_'
+        config_dicts = []
+        for pretrained_model_name_or_path in pretrained_model_name_or_path_list:
+            config_dict = DiffusionPipeline.load_config(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+            )
+            config_dicts.append(config_dict)
+
+        comparison_result = True
+        for idx in range(1, len(config_dicts)):
+            comparison_result &= self._compare_model_configs(config_dicts[idx - 1], config_dicts[idx])
+            if not force and comparison_result is False:
+                raise ValueError("Incompatible checkpoints. Please check model_index.json for the models.")
+        print("Compatible model_index.json files found")
+        # Step 2: Basic Validation has succeeded. Let's download the models and save them into our local files.
+        cached_folders = []
+        for pretrained_model_name_or_path, config_dict in zip(pretrained_model_name_or_path_list, config_dicts):
+            folder_names = [k for k in config_dict.keys() if not k.startswith("_")]
+            allow_patterns = [os.path.join(k, "*") for k in folder_names]
+            allow_patterns += [
+                WEIGHTS_NAME,
+                SCHEDULER_CONFIG_NAME,
+                CONFIG_NAME,
+                ONNX_WEIGHTS_NAME,
+                DiffusionPipeline.config_name,
+            ]
+            requested_pipeline_class = config_dict.get("_class_name")
+            user_agent = {"diffusers": __version__, "pipeline_class": requested_pipeline_class}
+
+            cached_folder = (
+                pretrained_model_name_or_path
+                if os.path.isdir(pretrained_model_name_or_path)
+                else snapshot_download(
+                    pretrained_model_name_or_path,
+                    cache_dir=cache_dir,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    allow_patterns=allow_patterns,
+                    user_agent=user_agent,
+                )
+            )
+            print("Cached Folder", cached_folder)
+            cached_folders.append(cached_folder)
+
+        # Step 3:-
+        # Load the first checkpoint as a diffusion pipeline and modify its module state_dict in place
+        final_pipe = DiffusionPipeline.from_pretrained(
+            cached_folders[0],
+            torch_dtype=torch_dtype,
+            device_map=device_map,
+            variant=variant,
+        )
+        final_pipe.to(self.device)
+
+        checkpoint_path_2 = None
+        if len(cached_folders) > 2:
+            checkpoint_path_2 = os.path.join(cached_folders[2])
+
+        if interp == "sigmoid":
+            theta_func = CheckpointMergerPipeline.sigmoid
+        elif interp == "inv_sigmoid":
+            theta_func = CheckpointMergerPipeline.inv_sigmoid
+        elif interp == "add_diff":
+            theta_func = CheckpointMergerPipeline.add_difference
+        else:
+            theta_func = CheckpointMergerPipeline.weighted_sum
+
+        # Find each module's state dict.
+        for attr in final_pipe.config.keys():
+            if not attr.startswith("_"):
+                checkpoint_path_1 = os.path.join(cached_folders[1], attr)
+                if os.path.exists(checkpoint_path_1):
+                    files = [
+                        *glob.glob(os.path.join(checkpoint_path_1, "*.safetensors")),
+                        *glob.glob(os.path.join(checkpoint_path_1, "*.bin")),
+                    ]
+                    checkpoint_path_1 = files[0] if len(files) > 0 else None
+                if len(cached_folders) < 3:
+                    checkpoint_path_2 = None
+                else:
+                    checkpoint_path_2 = os.path.join(cached_folders[2], attr)
+                    if os.path.exists(checkpoint_path_2):
+                        files = [
+                            *glob.glob(os.path.join(checkpoint_path_2, "*.safetensors")),
+                            *glob.glob(os.path.join(checkpoint_path_2, "*.bin")),
+                        ]
+                        checkpoint_path_2 = files[0] if len(files) > 0 else None
+                # For an attr if both checkpoint_path_1 and 2 are None, ignore.
+                # If at least one is present, deal with it according to interp method, of course only if the state_dict keys match.
+                if checkpoint_path_1 is None and checkpoint_path_2 is None:
+                    print(f"Skipping {attr}: not present in 2nd or 3d model")
+                    continue
+                try:
+                    module = getattr(final_pipe, attr)
+                    if isinstance(module, bool):  # ignore requires_safety_checker boolean
+                        continue
+                    theta_0 = getattr(module, "state_dict")
+                    theta_0 = theta_0()
+
+                    update_theta_0 = getattr(module, "load_state_dict")
+                    theta_1 = (
+                        safetensors.torch.load_file(checkpoint_path_1)
+                        if (checkpoint_path_1.endswith(".safetensors"))
+                        else torch.load(checkpoint_path_1, map_location="cpu")
+                    )
+                    theta_2 = None
+                    if checkpoint_path_2:
+                        theta_2 = (
+                            safetensors.torch.load_file(checkpoint_path_2)
+                            if (checkpoint_path_2.endswith(".safetensors"))
+                            else torch.load(checkpoint_path_2, map_location="cpu")
+                        )
+
+                    if not theta_0.keys() == theta_1.keys():
+                        print(f"Skipping {attr}: key mismatch")
+                        continue
+                    if theta_2 and not theta_1.keys() == theta_2.keys():
+                        print(f"Skipping {attr}:y mismatch")
+                except Exception as e:
+                    print(f"Skipping {attr} do to an unexpected error: {str(e)}")
+                    continue
+                print(f"MERGING {attr}")
+
+                for key in theta_0.keys():
+                    if theta_2:
+                        theta_0[key] = theta_func(theta_0[key], theta_1[key], theta_2[key], alpha)
+                    else:
+                        theta_0[key] = theta_func(theta_0[key], theta_1[key], None, alpha)
+
+                del theta_1
+                del theta_2
+                update_theta_0(theta_0)
+
+                del theta_0
+        return final_pipe
+
+    @staticmethod
+    def weighted_sum(theta0, theta1, theta2, alpha):
+        return ((1 - alpha) * theta0) + (alpha * theta1)
+
+    # Smoothstep (https://en.wikipedia.org/wiki/Smoothstep)
+    @staticmethod
+    def sigmoid(theta0, theta1, theta2, alpha):
+        alpha = alpha * alpha * (3 - (2 * alpha))
+        return theta0 + ((theta1 - theta0) * alpha)
+
+    # Inverse Smoothstep (https://en.wikipedia.org/wiki/Smoothstep)
+    @staticmethod
+    def inv_sigmoid(theta0, theta1, theta2, alpha):
+        import math
+
+        alpha = 0.5 - math.sin(math.asin(1.0 - 2.0 * alpha) / 3.0)
+        return theta0 + ((theta1 - theta0) * alpha)
+
+    @staticmethod
+    def add_difference(theta0, theta1, theta2, alpha):
+        return theta0 + (theta1 - theta2) * (1.0 - alpha)
diff --git a/diffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py b/diffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9a4b12ad20fa59ecc690cbc71be7007667edc71
--- /dev/null
+++ b/diffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py
@@ -0,0 +1,445 @@
+# -*- coding: utf-8 -*-
+import inspect
+from typing import Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from torch.nn import functional as F
+from torchvision import transforms
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.utils import PIL_INTERPOLATION
+from diffusers.utils.torch_utils import randn_tensor
+
+
+def preprocess(image, w, h):
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
+    if not isinstance(v0, np.ndarray):
+        inputs_are_torch = True
+        input_device = v0.device
+        v0 = v0.cpu().numpy()
+        v1 = v1.cpu().numpy()
+
+    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
+    if np.abs(dot) > DOT_THRESHOLD:
+        v2 = (1 - t) * v0 + t * v1
+    else:
+        theta_0 = np.arccos(dot)
+        sin_theta_0 = np.sin(theta_0)
+        theta_t = theta_0 * t
+        sin_theta_t = np.sin(theta_t)
+        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
+        s1 = sin_theta_t / sin_theta_0
+        v2 = s0 * v0 + s1 * v1
+
+    if inputs_are_torch:
+        v2 = torch.from_numpy(v2).to(input_device)
+
+    return v2
+
+
+def spherical_dist_loss(x, y):
+    x = F.normalize(x, dim=-1)
+    y = F.normalize(y, dim=-1)
+    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)
+
+
+def set_requires_grad(model, value):
+    for param in model.parameters():
+        param.requires_grad = value
+
+
+class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        clip_model: CLIPModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler],
+        feature_extractor: CLIPImageProcessor,
+        coca_model=None,
+        coca_tokenizer=None,
+        coca_transform=None,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            clip_model=clip_model,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            coca_model=coca_model,
+            coca_tokenizer=coca_tokenizer,
+            coca_transform=coca_transform,
+        )
+        self.feature_extractor_size = (
+            feature_extractor.size
+            if isinstance(feature_extractor.size, int)
+            else feature_extractor.size["shortest_edge"]
+        )
+        self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+        set_requires_grad(self.text_encoder, False)
+        set_requires_grad(self.clip_model, False)
+
+    def freeze_vae(self):
+        set_requires_grad(self.vae, False)
+
+    def unfreeze_vae(self):
+        set_requires_grad(self.vae, True)
+
+    def freeze_unet(self):
+        set_requires_grad(self.unet, False)
+
+    def unfreeze_unet(self):
+        set_requires_grad(self.unet, True)
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, dtype, device, generator=None):
+        if not isinstance(image, torch.Tensor):
+            raise ValueError(f"`image` has to be of type `torch.Tensor` but is {type(image)}")
+
+        image = image.to(device=device, dtype=dtype)
+
+        if isinstance(generator, list):
+            init_latents = [
+                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+            ]
+            init_latents = torch.cat(init_latents, dim=0)
+        else:
+            init_latents = self.vae.encode(image).latent_dist.sample(generator)
+
+        # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor
+        init_latents = 0.18215 * init_latents
+        init_latents = init_latents.repeat_interleave(batch_size, dim=0)
+
+        noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    def get_image_description(self, image):
+        transformed_image = self.coca_transform(image).unsqueeze(0)
+        with torch.no_grad(), torch.cuda.amp.autocast():
+            generated = self.coca_model.generate(transformed_image.to(device=self.device, dtype=self.coca_model.dtype))
+        generated = self.coca_tokenizer.decode(generated[0].cpu().numpy())
+        return generated.split("<end_of_text>")[0].replace("<start_of_text>", "").rstrip(" .,")
+
+    def get_clip_image_embeddings(self, image, batch_size):
+        clip_image_input = self.feature_extractor.preprocess(image)
+        clip_image_features = torch.from_numpy(clip_image_input["pixel_values"][0]).unsqueeze(0).to(self.device).half()
+        image_embeddings_clip = self.clip_model.get_image_features(clip_image_features)
+        image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
+        image_embeddings_clip = image_embeddings_clip.repeat_interleave(batch_size, dim=0)
+        return image_embeddings_clip
+
+    @torch.enable_grad()
+    def cond_fn(
+        self,
+        latents,
+        timestep,
+        index,
+        text_embeddings,
+        noise_pred_original,
+        original_image_embeddings_clip,
+        clip_guidance_scale,
+    ):
+        latents = latents.detach().requires_grad_()
+
+        latent_model_input = self.scheduler.scale_model_input(latents, timestep)
+
+        # predict the noise residual
+        noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample
+
+        if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)):
+            alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+            beta_prod_t = 1 - alpha_prod_t
+            # compute predicted original sample from predicted noise also called
+            # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+            pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+            fac = torch.sqrt(beta_prod_t)
+            sample = pred_original_sample * (fac) + latents * (1 - fac)
+        elif isinstance(self.scheduler, LMSDiscreteScheduler):
+            sigma = self.scheduler.sigmas[index]
+            sample = latents - sigma * noise_pred
+        else:
+            raise ValueError(f"scheduler type {type(self.scheduler)} not supported")
+
+        # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor
+        sample = 1 / 0.18215 * sample
+        image = self.vae.decode(sample).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        image = transforms.Resize(self.feature_extractor_size)(image)
+        image = self.normalize(image).to(latents.dtype)
+
+        image_embeddings_clip = self.clip_model.get_image_features(image)
+        image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
+
+        loss = spherical_dist_loss(image_embeddings_clip, original_image_embeddings_clip).mean() * clip_guidance_scale
+
+        grads = -torch.autograd.grad(loss, latents)[0]
+
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            latents = latents.detach() + grads * (sigma**2)
+            noise_pred = noise_pred_original
+        else:
+            noise_pred = noise_pred_original - torch.sqrt(beta_prod_t) * grads
+        return noise_pred, latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        style_image: Union[torch.Tensor, PIL.Image.Image],
+        content_image: Union[torch.Tensor, PIL.Image.Image],
+        style_prompt: Optional[str] = None,
+        content_prompt: Optional[str] = None,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        noise_strength: float = 0.6,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        batch_size: Optional[int] = 1,
+        eta: float = 0.0,
+        clip_guidance_scale: Optional[float] = 100,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        slerp_latent_style_strength: float = 0.8,
+        slerp_prompt_style_strength: float = 0.1,
+        slerp_clip_image_style_strength: float = 0.1,
+    ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(f"You have passed {batch_size} batch_size, but only {len(generator)} generators.")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if isinstance(generator, torch.Generator) and batch_size > 1:
+            generator = [generator] + [None] * (batch_size - 1)
+
+        coca_is_none = [
+            ("model", self.coca_model is None),
+            ("tokenizer", self.coca_tokenizer is None),
+            ("transform", self.coca_transform is None),
+        ]
+        coca_is_none = [x[0] for x in coca_is_none if x[1]]
+        coca_is_none_str = ", ".join(coca_is_none)
+        # generate prompts with coca model if prompt is None
+        if content_prompt is None:
+            if len(coca_is_none):
+                raise ValueError(
+                    f"Content prompt is None and CoCa [{coca_is_none_str}] is None."
+                    f"Set prompt or pass Coca [{coca_is_none_str}] to DiffusionPipeline."
+                )
+            content_prompt = self.get_image_description(content_image)
+        if style_prompt is None:
+            if len(coca_is_none):
+                raise ValueError(
+                    f"Style prompt is None and CoCa [{coca_is_none_str}] is None."
+                    f" Set prompt or pass Coca [{coca_is_none_str}] to DiffusionPipeline."
+                )
+            style_prompt = self.get_image_description(style_image)
+
+        # get prompt text embeddings for content and style
+        content_text_input = self.tokenizer(
+            content_prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        content_text_embeddings = self.text_encoder(content_text_input.input_ids.to(self.device))[0]
+
+        style_text_input = self.tokenizer(
+            style_prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        style_text_embeddings = self.text_encoder(style_text_input.input_ids.to(self.device))[0]
+
+        text_embeddings = slerp(slerp_prompt_style_strength, content_text_embeddings, style_text_embeddings)
+
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = text_embeddings.repeat_interleave(batch_size, dim=0)
+
+        # set timesteps
+        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        self.scheduler.timesteps.to(self.device)
+
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, noise_strength, self.device)
+        latent_timestep = timesteps[:1].repeat(batch_size)
+
+        # Preprocess image
+        preprocessed_content_image = preprocess(content_image, width, height)
+        content_latents = self.prepare_latents(
+            preprocessed_content_image, latent_timestep, batch_size, text_embeddings.dtype, self.device, generator
+        )
+
+        preprocessed_style_image = preprocess(style_image, width, height)
+        style_latents = self.prepare_latents(
+            preprocessed_style_image, latent_timestep, batch_size, text_embeddings.dtype, self.device, generator
+        )
+
+        latents = slerp(slerp_latent_style_strength, content_latents, style_latents)
+
+        if clip_guidance_scale > 0:
+            content_clip_image_embedding = self.get_clip_image_embeddings(content_image, batch_size)
+            style_clip_image_embedding = self.get_clip_image_embeddings(style_image, batch_size)
+            clip_image_embeddings = slerp(
+                slerp_clip_image_style_strength, content_clip_image_embedding, style_clip_image_embedding
+            )
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            max_length = content_text_input.input_ids.shape[-1]
+            uncond_input = self.tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt")
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            # duplicate unconditional embeddings for each generation per prompt
+            uncond_embeddings = uncond_embeddings.repeat_interleave(batch_size, dim=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+                # perform classifier free guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # perform clip guidance
+                if clip_guidance_scale > 0:
+                    text_embeddings_for_guidance = (
+                        text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings
+                    )
+                    noise_pred, latents = self.cond_fn(
+                        latents,
+                        t,
+                        i,
+                        text_embeddings_for_guidance,
+                        noise_pred,
+                        clip_image_embeddings,
+                        clip_guidance_scale,
+                    )
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                progress_bar.update()
+        # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, None)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
diff --git a/diffusers/examples/community/clip_guided_stable_diffusion.py b/diffusers/examples/community/clip_guided_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..1350650113dada84073f16d8cabc6a63eb9d0165
--- /dev/null
+++ b/diffusers/examples/community/clip_guided_stable_diffusion.py
@@ -0,0 +1,337 @@
+import inspect
+from typing import List, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torchvision import transforms
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
+
+
+class MakeCutouts(nn.Module):
+    def __init__(self, cut_size, cut_power=1.0):
+        super().__init__()
+
+        self.cut_size = cut_size
+        self.cut_power = cut_power
+
+    def forward(self, pixel_values, num_cutouts):
+        sideY, sideX = pixel_values.shape[2:4]
+        max_size = min(sideX, sideY)
+        min_size = min(sideX, sideY, self.cut_size)
+        cutouts = []
+        for _ in range(num_cutouts):
+            size = int(torch.rand([]) ** self.cut_power * (max_size - min_size) + min_size)
+            offsetx = torch.randint(0, sideX - size + 1, ())
+            offsety = torch.randint(0, sideY - size + 1, ())
+            cutout = pixel_values[:, :, offsety : offsety + size, offsetx : offsetx + size]
+            cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size))
+        return torch.cat(cutouts)
+
+
+def spherical_dist_loss(x, y):
+    x = F.normalize(x, dim=-1)
+    y = F.normalize(y, dim=-1)
+    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)
+
+
+def set_requires_grad(model, value):
+    for param in model.parameters():
+        param.requires_grad = value
+
+
+class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
+    """CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000
+    - https://github.com/Jack000/glid-3-xl
+    - https://github.dev/crowsonkb/k-diffusion
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        clip_model: CLIPModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler],
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            clip_model=clip_model,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+
+        self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+        self.cut_out_size = (
+            feature_extractor.size
+            if isinstance(feature_extractor.size, int)
+            else feature_extractor.size["shortest_edge"]
+        )
+        self.make_cutouts = MakeCutouts(self.cut_out_size)
+
+        set_requires_grad(self.text_encoder, False)
+        set_requires_grad(self.clip_model, False)
+
+    def freeze_vae(self):
+        set_requires_grad(self.vae, False)
+
+    def unfreeze_vae(self):
+        set_requires_grad(self.vae, True)
+
+    def freeze_unet(self):
+        set_requires_grad(self.unet, False)
+
+    def unfreeze_unet(self):
+        set_requires_grad(self.unet, True)
+
+    @torch.enable_grad()
+    def cond_fn(
+        self,
+        latents,
+        timestep,
+        index,
+        text_embeddings,
+        noise_pred_original,
+        text_embeddings_clip,
+        clip_guidance_scale,
+        num_cutouts,
+        use_cutouts=True,
+    ):
+        latents = latents.detach().requires_grad_()
+
+        latent_model_input = self.scheduler.scale_model_input(latents, timestep)
+
+        # predict the noise residual
+        noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample
+
+        if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)):
+            alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+            beta_prod_t = 1 - alpha_prod_t
+            # compute predicted original sample from predicted noise also called
+            # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+            pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+            fac = torch.sqrt(beta_prod_t)
+            sample = pred_original_sample * (fac) + latents * (1 - fac)
+        elif isinstance(self.scheduler, LMSDiscreteScheduler):
+            sigma = self.scheduler.sigmas[index]
+            sample = latents - sigma * noise_pred
+        else:
+            raise ValueError(f"scheduler type {type(self.scheduler)} not supported")
+
+        sample = 1 / self.vae.config.scaling_factor * sample
+        image = self.vae.decode(sample).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        if use_cutouts:
+            image = self.make_cutouts(image, num_cutouts)
+        else:
+            image = transforms.Resize(self.cut_out_size)(image)
+        image = self.normalize(image).to(latents.dtype)
+
+        image_embeddings_clip = self.clip_model.get_image_features(image)
+        image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
+
+        if use_cutouts:
+            dists = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip)
+            dists = dists.view([num_cutouts, sample.shape[0], -1])
+            loss = dists.sum(2).mean(0).sum() * clip_guidance_scale
+        else:
+            loss = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip).mean() * clip_guidance_scale
+
+        grads = -torch.autograd.grad(loss, latents)[0]
+
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            latents = latents.detach() + grads * (sigma**2)
+            noise_pred = noise_pred_original
+        else:
+            noise_pred = noise_pred_original - torch.sqrt(beta_prod_t) * grads
+        return noise_pred, latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        clip_guidance_scale: Optional[float] = 100,
+        clip_prompt: Optional[Union[str, List[str]]] = None,
+        num_cutouts: Optional[int] = 4,
+        use_cutouts: Optional[bool] = True,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if clip_guidance_scale > 0:
+            if clip_prompt is not None:
+                clip_text_input = self.tokenizer(
+                    clip_prompt,
+                    padding="max_length",
+                    max_length=self.tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                ).input_ids.to(self.device)
+            else:
+                clip_text_input = text_input.input_ids.to(self.device)
+            text_embeddings_clip = self.clip_model.get_text_features(clip_text_input)
+            text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
+            # duplicate text embeddings clip for each generation per prompt
+            text_embeddings_clip = text_embeddings_clip.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            max_length = text_input.input_ids.shape[-1]
+            uncond_input = self.tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt")
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            # duplicate unconditional embeddings for each generation per prompt
+            uncond_embeddings = uncond_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform classifier free guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # perform clip guidance
+            if clip_guidance_scale > 0:
+                text_embeddings_for_guidance = (
+                    text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings
+                )
+                noise_pred, latents = self.cond_fn(
+                    latents,
+                    t,
+                    i,
+                    text_embeddings_for_guidance,
+                    noise_pred,
+                    text_embeddings_clip,
+                    clip_guidance_scale,
+                    num_cutouts,
+                    use_cutouts,
+                )
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, None)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
diff --git a/diffusers/examples/community/clip_guided_stable_diffusion_img2img.py b/diffusers/examples/community/clip_guided_stable_diffusion_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..91c74b9ffa74ad415cf75b8c53506afed6ccca5f
--- /dev/null
+++ b/diffusers/examples/community/clip_guided_stable_diffusion_img2img.py
@@ -0,0 +1,490 @@
+import inspect
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torchvision import transforms
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.utils import PIL_INTERPOLATION, deprecate
+from diffusers.utils.torch_utils import randn_tensor
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        from io import BytesIO
+
+        import requests
+        import torch
+        from diffusers import DiffusionPipeline
+        from PIL import Image
+        from transformers import CLIPImageProcessor, CLIPModel
+
+        feature_extractor = CLIPImageProcessor.from_pretrained(
+            "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
+        )
+        clip_model = CLIPModel.from_pretrained(
+            "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16
+        )
+
+
+        guided_pipeline = DiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            # custom_pipeline="clip_guided_stable_diffusion",
+            custom_pipeline="/home/njindal/diffusers/examples/community/clip_guided_stable_diffusion.py",
+            clip_model=clip_model,
+            feature_extractor=feature_extractor,
+            torch_dtype=torch.float16,
+        )
+        guided_pipeline.enable_attention_slicing()
+        guided_pipeline = guided_pipeline.to("cuda")
+
+        prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece"
+
+        url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+        response = requests.get(url)
+        init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+        image = guided_pipeline(
+            prompt=prompt,
+            num_inference_steps=30,
+            image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            clip_guidance_scale=100,
+            num_cutouts=4,
+            use_cutouts=False,
+        ).images[0]
+        display(image)
+        ```
+"""
+
+
+def preprocess(image, w, h):
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+class MakeCutouts(nn.Module):
+    def __init__(self, cut_size, cut_power=1.0):
+        super().__init__()
+
+        self.cut_size = cut_size
+        self.cut_power = cut_power
+
+    def forward(self, pixel_values, num_cutouts):
+        sideY, sideX = pixel_values.shape[2:4]
+        max_size = min(sideX, sideY)
+        min_size = min(sideX, sideY, self.cut_size)
+        cutouts = []
+        for _ in range(num_cutouts):
+            size = int(torch.rand([]) ** self.cut_power * (max_size - min_size) + min_size)
+            offsetx = torch.randint(0, sideX - size + 1, ())
+            offsety = torch.randint(0, sideY - size + 1, ())
+            cutout = pixel_values[:, :, offsety : offsety + size, offsetx : offsetx + size]
+            cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size))
+        return torch.cat(cutouts)
+
+
+def spherical_dist_loss(x, y):
+    x = F.normalize(x, dim=-1)
+    y = F.normalize(y, dim=-1)
+    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)
+
+
+def set_requires_grad(model, value):
+    for param in model.parameters():
+        param.requires_grad = value
+
+
+class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
+    """CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000
+    - https://github.com/Jack000/glid-3-xl
+    - https://github.dev/crowsonkb/k-diffusion
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        clip_model: CLIPModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler],
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            clip_model=clip_model,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+
+        self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+        self.cut_out_size = (
+            feature_extractor.size
+            if isinstance(feature_extractor.size, int)
+            else feature_extractor.size["shortest_edge"]
+        )
+        self.make_cutouts = MakeCutouts(self.cut_out_size)
+
+        set_requires_grad(self.text_encoder, False)
+        set_requires_grad(self.clip_model, False)
+
+    def freeze_vae(self):
+        set_requires_grad(self.vae, False)
+
+    def unfreeze_vae(self):
+        set_requires_grad(self.vae, True)
+
+    def freeze_unet(self):
+        set_requires_grad(self.unet, False)
+
+    def unfreeze_unet(self):
+        set_requires_grad(self.unet, True)
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if isinstance(generator, list):
+            init_latents = [
+                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+            ]
+            init_latents = torch.cat(init_latents, dim=0)
+        else:
+            init_latents = self.vae.encode(image).latent_dist.sample(generator)
+
+        init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    @torch.enable_grad()
+    def cond_fn(
+        self,
+        latents,
+        timestep,
+        index,
+        text_embeddings,
+        noise_pred_original,
+        text_embeddings_clip,
+        clip_guidance_scale,
+        num_cutouts,
+        use_cutouts=True,
+    ):
+        latents = latents.detach().requires_grad_()
+
+        latent_model_input = self.scheduler.scale_model_input(latents, timestep)
+
+        # predict the noise residual
+        noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample
+
+        if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)):
+            alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+            beta_prod_t = 1 - alpha_prod_t
+            # compute predicted original sample from predicted noise also called
+            # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+            pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+            fac = torch.sqrt(beta_prod_t)
+            sample = pred_original_sample * (fac) + latents * (1 - fac)
+        elif isinstance(self.scheduler, LMSDiscreteScheduler):
+            sigma = self.scheduler.sigmas[index]
+            sample = latents - sigma * noise_pred
+        else:
+            raise ValueError(f"scheduler type {type(self.scheduler)} not supported")
+
+        sample = 1 / self.vae.config.scaling_factor * sample
+        image = self.vae.decode(sample).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        if use_cutouts:
+            image = self.make_cutouts(image, num_cutouts)
+        else:
+            image = transforms.Resize(self.cut_out_size)(image)
+        image = self.normalize(image).to(latents.dtype)
+
+        image_embeddings_clip = self.clip_model.get_image_features(image)
+        image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
+
+        if use_cutouts:
+            dists = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip)
+            dists = dists.view([num_cutouts, sample.shape[0], -1])
+            loss = dists.sum(2).mean(0).sum() * clip_guidance_scale
+        else:
+            loss = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip).mean() * clip_guidance_scale
+
+        grads = -torch.autograd.grad(loss, latents)[0]
+
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            latents = latents.detach() + grads * (sigma**2)
+            noise_pred = noise_pred_original
+        else:
+            noise_pred = noise_pred_original - torch.sqrt(beta_prod_t) * grads
+        return noise_pred, latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        clip_guidance_scale: Optional[float] = 100,
+        clip_prompt: Optional[Union[str, List[str]]] = None,
+        num_cutouts: Optional[int] = 4,
+        use_cutouts: Optional[bool] = True,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # set timesteps
+        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        self.scheduler.timesteps.to(self.device)
+
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, self.device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # Preprocess image
+        image = preprocess(image, width, height)
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                text_embeddings.dtype,
+                self.device,
+                generator,
+            )
+
+        if clip_guidance_scale > 0:
+            if clip_prompt is not None:
+                clip_text_input = self.tokenizer(
+                    clip_prompt,
+                    padding="max_length",
+                    max_length=self.tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                ).input_ids.to(self.device)
+            else:
+                clip_text_input = text_input.input_ids.to(self.device)
+            text_embeddings_clip = self.clip_model.get_text_features(clip_text_input)
+            text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
+            # duplicate text embeddings clip for each generation per prompt
+            text_embeddings_clip = text_embeddings_clip.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            max_length = text_input.input_ids.shape[-1]
+            uncond_input = self.tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt")
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            # duplicate unconditional embeddings for each generation per prompt
+            uncond_embeddings = uncond_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
+        with self.progress_bar(total=num_inference_steps):
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+                # perform classifier free guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # perform clip guidance
+                if clip_guidance_scale > 0:
+                    text_embeddings_for_guidance = (
+                        text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings
+                    )
+                    noise_pred, latents = self.cond_fn(
+                        latents,
+                        t,
+                        i,
+                        text_embeddings_for_guidance,
+                        noise_pred,
+                        text_embeddings_clip,
+                        clip_guidance_scale,
+                        num_cutouts,
+                        use_cutouts,
+                    )
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, None)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
diff --git a/diffusers/examples/community/composable_stable_diffusion.py b/diffusers/examples/community/composable_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..46d12ba1f2aa4222c99523917eec1274f50a41a0
--- /dev/null
+++ b/diffusers/examples/community/composable_stable_diffusion.py
@@ -0,0 +1,532 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from diffusers.utils import deprecate, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        weights: Optional[str] = "",
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if "|" in prompt:
+            prompt = [x.strip() for x in prompt.split("|")]
+            print(f"composing {prompt}...")
+
+            if not weights:
+                # specify weights for prompts (excluding the unconditional score)
+                print("using equal positive weights (conjunction) for all prompts...")
+                weights = torch.tensor([guidance_scale] * len(prompt), device=self.device).reshape(-1, 1, 1, 1)
+            else:
+                # set prompt weight for each
+                num_prompts = len(prompt) if isinstance(prompt, list) else 1
+                weights = [float(w.strip()) for w in weights.split("|")]
+                # guidance scale as the default
+                if len(weights) < num_prompts:
+                    weights.append(guidance_scale)
+                else:
+                    weights = weights[:num_prompts]
+                assert len(weights) == len(prompt), "weights specified are not equal to the number of prompts"
+                weights = torch.tensor(weights, device=self.device).reshape(-1, 1, 1, 1)
+        else:
+            weights = guidance_scale
+
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # composable diffusion
+        if isinstance(prompt, list) and batch_size == 1:
+            # remove extra unconditional embedding
+            # N = one unconditional embed + conditional embeds
+            text_embeddings = text_embeddings[len(prompt) - 1 :]
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = []
+                for j in range(text_embeddings.shape[0]):
+                    noise_pred.append(
+                        self.unet(latent_model_input[:1], t, encoder_hidden_states=text_embeddings[j : j + 1]).sample
+                    )
+                noise_pred = torch.cat(noise_pred, dim=0)
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred[:1], noise_pred[1:]
+                    noise_pred = noise_pred_uncond + (weights * (noise_pred_text - noise_pred_uncond)).sum(
+                        dim=0, keepdims=True
+                    )
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 8. Post-processing
+        image = self.decode_latents(latents)
+
+        # 9. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
+
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/ddim_noise_comparative_analysis.py b/diffusers/examples/community/ddim_noise_comparative_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..829106c47f65cb70746c031e9d4f9ef698035afa
--- /dev/null
+++ b/diffusers/examples/community/ddim_noise_comparative_analysis.py
@@ -0,0 +1,190 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, Tuple, Union
+
+import PIL.Image
+import torch
+from torchvision import transforms
+
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers import DDIMScheduler
+from diffusers.utils.torch_utils import randn_tensor
+
+
+trans = transforms.Compose(
+    [
+        transforms.Resize((256, 256)),
+        transforms.ToTensor(),
+        transforms.Normalize([0.5], [0.5]),
+    ]
+)
+
+
+def preprocess(image):
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    image = [trans(img.convert("RGB")) for img in image]
+    image = torch.stack(image)
+    return image
+
+
+class DDIMNoiseComparativeAnalysisPipeline(DiffusionPipeline):
+    r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`DDPMScheduler`], or [`DDIMScheduler`].
+    """
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+
+        # make sure scheduler can always be converted to DDIM
+        scheduler = DDIMScheduler.from_config(scheduler.config)
+
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    def check_inputs(self, strength):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        init_latents = image.to(device=device, dtype=dtype)
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        print("add noise to latents at timestep", timestep)
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        strength: float = 0.8,
+        batch_size: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        eta: float = 0.0,
+        num_inference_steps: int = 50,
+        use_clipped_model_output: Optional[bool] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Args:
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            eta (`float`, *optional*, defaults to 0.0):
+                The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            use_clipped_model_output (`bool`, *optional*, defaults to `None`):
+                if `True` or `False`, see documentation for `DDIMScheduler.step`. If `None`, nothing is passed
+                downstream to the scheduler. So use `None` for schedulers which don't support this argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
+            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+        """
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(strength)
+
+        # 2. Preprocess image
+        image = preprocess(image)
+
+        # 3. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, self.device)
+        latent_timestep = timesteps[:1].repeat(batch_size)
+
+        # 4. Prepare latent variables
+        latents = self.prepare_latents(image, latent_timestep, batch_size, self.unet.dtype, self.device, generator)
+        image = latents
+
+        # 5. Denoising loop
+        for t in self.progress_bar(timesteps):
+            # 1. predict noise model_output
+            model_output = self.unet(image, t).sample
+
+            # 2. predict previous mean of image x_t-1 and add variance depending on eta
+            # eta corresponds to η in paper and should be between [0, 1]
+            # do x_t -> x_t-1
+            image = self.scheduler.step(
+                model_output,
+                t,
+                image,
+                eta=eta,
+                use_clipped_model_output=use_clipped_model_output,
+                generator=generator,
+            ).prev_sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, latent_timestep.item())
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/examples/community/dps_pipeline.py b/diffusers/examples/community/dps_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0bf3e0ad33d0e09e2f1d30ad8a091cd17a2aad9
--- /dev/null
+++ b/diffusers/examples/community/dps_pipeline.py
@@ -0,0 +1,466 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from math import pi
+from typing import Callable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image
+
+from diffusers import DDPMScheduler, DiffusionPipeline, ImagePipelineOutput, UNet2DModel
+from diffusers.utils.torch_utils import randn_tensor
+
+
+class DPSPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for Diffusion Posterior Sampling.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`DDPMScheduler`], or [`DDIMScheduler`].
+    """
+
+    model_cpu_offload_seq = "unet"
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        measurement: torch.Tensor,
+        operator: torch.nn.Module,
+        loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+        batch_size: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        num_inference_steps: int = 1000,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        zeta: float = 0.3,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            measurement (`torch.Tensor`, *required*):
+                A 'torch.Tensor', the corrupted image
+            operator (`torch.nn.Module`, *required*):
+                A 'torch.nn.Module', the operator generating the corrupted image
+            loss_fn (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *required*):
+                A 'Callable[[torch.Tensor, torch.Tensor], torch.Tensor]', the loss function used
+                between the measurements, for most of the cases using RMSE is fine.
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            num_inference_steps (`int`, *optional*, defaults to 1000):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from diffusers import DDPMPipeline
+
+        >>> # load model and scheduler
+        >>> pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> image = pipe().images[0]
+
+        >>> # save image
+        >>> image.save("ddpm_generated_image.png")
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        # Sample gaussian noise to begin loop
+        if isinstance(self.unet.config.sample_size, int):
+            image_shape = (
+                batch_size,
+                self.unet.config.in_channels,
+                self.unet.config.sample_size,
+                self.unet.config.sample_size,
+            )
+        else:
+            image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size)
+
+        if self.device.type == "mps":
+            # randn does not work reproducibly on mps
+            image = randn_tensor(image_shape, generator=generator)
+            image = image.to(self.device)
+        else:
+            image = randn_tensor(image_shape, generator=generator, device=self.device)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            with torch.enable_grad():
+                # 1. predict noise model_output
+                image = image.requires_grad_()
+                model_output = self.unet(image, t).sample
+
+                # 2. compute previous image x'_{t-1} and original prediction x0_{t}
+                scheduler_out = self.scheduler.step(model_output, t, image, generator=generator)
+                image_pred, origi_pred = scheduler_out.prev_sample, scheduler_out.pred_original_sample
+
+                # 3. compute y'_t = f(x0_{t})
+                measurement_pred = operator(origi_pred)
+
+                # 4. compute loss = d(y, y'_t-1)
+                loss = loss_fn(measurement, measurement_pred)
+                loss.backward()
+
+                print("distance: {0:.4f}".format(loss.item()))
+
+                with torch.no_grad():
+                    image_pred = image_pred - zeta * image.grad
+                    image = image_pred.detach()
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
+
+
+if __name__ == "__main__":
+    import scipy
+    from torch import nn
+    from torchvision.utils import save_image
+
+    # defining the operators f(.) of y = f(x)
+    # super-resolution operator
+    class SuperResolutionOperator(nn.Module):
+        def __init__(self, in_shape, scale_factor):
+            super().__init__()
+
+            # Resizer local class, do not use outiside the SR operator class
+            class Resizer(nn.Module):
+                def __init__(self, in_shape, scale_factor=None, output_shape=None, kernel=None, antialiasing=True):
+                    super(Resizer, self).__init__()
+
+                    # First standardize values and fill missing arguments (if needed) by deriving scale from output shape or vice versa
+                    scale_factor, output_shape = self.fix_scale_and_size(in_shape, output_shape, scale_factor)
+
+                    # Choose interpolation method, each method has the matching kernel size
+                    def cubic(x):
+                        absx = np.abs(x)
+                        absx2 = absx**2
+                        absx3 = absx**3
+                        return (1.5 * absx3 - 2.5 * absx2 + 1) * (absx <= 1) + (
+                            -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2
+                        ) * ((1 < absx) & (absx <= 2))
+
+                    def lanczos2(x):
+                        return (
+                            (np.sin(pi * x) * np.sin(pi * x / 2) + np.finfo(np.float32).eps)
+                            / ((pi**2 * x**2 / 2) + np.finfo(np.float32).eps)
+                        ) * (abs(x) < 2)
+
+                    def box(x):
+                        return ((-0.5 <= x) & (x < 0.5)) * 1.0
+
+                    def lanczos3(x):
+                        return (
+                            (np.sin(pi * x) * np.sin(pi * x / 3) + np.finfo(np.float32).eps)
+                            / ((pi**2 * x**2 / 3) + np.finfo(np.float32).eps)
+                        ) * (abs(x) < 3)
+
+                    def linear(x):
+                        return (x + 1) * ((-1 <= x) & (x < 0)) + (1 - x) * ((0 <= x) & (x <= 1))
+
+                    method, kernel_width = {
+                        "cubic": (cubic, 4.0),
+                        "lanczos2": (lanczos2, 4.0),
+                        "lanczos3": (lanczos3, 6.0),
+                        "box": (box, 1.0),
+                        "linear": (linear, 2.0),
+                        None: (cubic, 4.0),  # set default interpolation method as cubic
+                    }.get(kernel)
+
+                    # Antialiasing is only used when downscaling
+                    antialiasing *= np.any(np.array(scale_factor) < 1)
+
+                    # Sort indices of dimensions according to scale of each dimension. since we are going dim by dim this is efficient
+                    sorted_dims = np.argsort(np.array(scale_factor))
+                    self.sorted_dims = [int(dim) for dim in sorted_dims if scale_factor[dim] != 1]
+
+                    # Iterate over dimensions to calculate local weights for resizing and resize each time in one direction
+                    field_of_view_list = []
+                    weights_list = []
+                    for dim in self.sorted_dims:
+                        # for each coordinate (along 1 dim), calculate which coordinates in the input image affect its result and the
+                        # weights that multiply the values there to get its result.
+                        weights, field_of_view = self.contributions(
+                            in_shape[dim], output_shape[dim], scale_factor[dim], method, kernel_width, antialiasing
+                        )
+
+                        # convert to torch tensor
+                        weights = torch.tensor(weights.T, dtype=torch.float32)
+
+                        # We add singleton dimensions to the weight matrix so we can multiply it with the big tensor we get for
+                        # tmp_im[field_of_view.T], (bsxfun style)
+                        weights_list.append(
+                            nn.Parameter(
+                                torch.reshape(weights, list(weights.shape) + (len(scale_factor) - 1) * [1]),
+                                requires_grad=False,
+                            )
+                        )
+                        field_of_view_list.append(
+                            nn.Parameter(
+                                torch.tensor(field_of_view.T.astype(np.int32), dtype=torch.long), requires_grad=False
+                            )
+                        )
+
+                    self.field_of_view = nn.ParameterList(field_of_view_list)
+                    self.weights = nn.ParameterList(weights_list)
+
+                def forward(self, in_tensor):
+                    x = in_tensor
+
+                    # Use the affecting position values and the set of weights to calculate the result of resizing along this 1 dim
+                    for dim, fov, w in zip(self.sorted_dims, self.field_of_view, self.weights):
+                        # To be able to act on each dim, we swap so that dim 0 is the wanted dim to resize
+                        x = torch.transpose(x, dim, 0)
+
+                        # This is a bit of a complicated multiplication: x[field_of_view.T] is a tensor of order image_dims+1.
+                        # for each pixel in the output-image it matches the positions the influence it from the input image (along 1 dim
+                        # only, this is why it only adds 1 dim to 5the shape). We then multiply, for each pixel, its set of positions with
+                        # the matching set of weights. we do this by this big tensor element-wise multiplication (MATLAB bsxfun style:
+                        # matching dims are multiplied element-wise while singletons mean that the matching dim is all multiplied by the
+                        # same number
+                        x = torch.sum(x[fov] * w, dim=0)
+
+                        # Finally we swap back the axes to the original order
+                        x = torch.transpose(x, dim, 0)
+
+                    return x
+
+                def fix_scale_and_size(self, input_shape, output_shape, scale_factor):
+                    # First fixing the scale-factor (if given) to be standardized the function expects (a list of scale factors in the
+                    # same size as the number of input dimensions)
+                    if scale_factor is not None:
+                        # By default, if scale-factor is a scalar we assume 2d resizing and duplicate it.
+                        if np.isscalar(scale_factor) and len(input_shape) > 1:
+                            scale_factor = [scale_factor, scale_factor]
+
+                        # We extend the size of scale-factor list to the size of the input by assigning 1 to all the unspecified scales
+                        scale_factor = list(scale_factor)
+                        scale_factor = [1] * (len(input_shape) - len(scale_factor)) + scale_factor
+
+                    # Fixing output-shape (if given): extending it to the size of the input-shape, by assigning the original input-size
+                    # to all the unspecified dimensions
+                    if output_shape is not None:
+                        output_shape = list(input_shape[len(output_shape) :]) + list(np.uint(np.array(output_shape)))
+
+                    # Dealing with the case of non-give scale-factor, calculating according to output-shape. note that this is
+                    # sub-optimal, because there can be different scales to the same output-shape.
+                    if scale_factor is None:
+                        scale_factor = 1.0 * np.array(output_shape) / np.array(input_shape)
+
+                    # Dealing with missing output-shape. calculating according to scale-factor
+                    if output_shape is None:
+                        output_shape = np.uint(np.ceil(np.array(input_shape) * np.array(scale_factor)))
+
+                    return scale_factor, output_shape
+
+                def contributions(self, in_length, out_length, scale, kernel, kernel_width, antialiasing):
+                    # This function calculates a set of 'filters' and a set of field_of_view that will later on be applied
+                    # such that each position from the field_of_view will be multiplied with a matching filter from the
+                    # 'weights' based on the interpolation method and the distance of the sub-pixel location from the pixel centers
+                    # around it. This is only done for one dimension of the image.
+
+                    # When anti-aliasing is activated (default and only for downscaling) the receptive field is stretched to size of
+                    # 1/sf. this means filtering is more 'low-pass filter'.
+                    fixed_kernel = (lambda arg: scale * kernel(scale * arg)) if antialiasing else kernel
+                    kernel_width *= 1.0 / scale if antialiasing else 1.0
+
+                    # These are the coordinates of the output image
+                    out_coordinates = np.arange(1, out_length + 1)
+
+                    # since both scale-factor and output size can be provided simulatneously, perserving the center of the image requires shifting
+                    # the output coordinates. the deviation is because out_length doesn't necesary equal in_length*scale.
+                    # to keep the center we need to subtract half of this deivation so that we get equal margins for boths sides and center is preserved.
+                    shifted_out_coordinates = out_coordinates - (out_length - in_length * scale) / 2
+
+                    # These are the matching positions of the output-coordinates on the input image coordinates.
+                    # Best explained by example: say we have 4 horizontal pixels for HR and we downscale by SF=2 and get 2 pixels:
+                    # [1,2,3,4] -> [1,2]. Remember each pixel number is the middle of the pixel.
+                    # The scaling is done between the distances and not pixel numbers (the right boundary of pixel 4 is transformed to
+                    # the right boundary of pixel 2. pixel 1 in the small image matches the boundary between pixels 1 and 2 in the big
+                    # one and not to pixel 2. This means the position is not just multiplication of the old pos by scale-factor).
+                    # So if we measure distance from the left border, middle of pixel 1 is at distance d=0.5, border between 1 and 2 is
+                    # at d=1, and so on (d = p - 0.5).  we calculate (d_new = d_old / sf) which means:
+                    # (p_new-0.5 = (p_old-0.5) / sf)     ->          p_new = p_old/sf + 0.5 * (1-1/sf)
+                    match_coordinates = shifted_out_coordinates / scale + 0.5 * (1 - 1 / scale)
+
+                    # This is the left boundary to start multiplying the filter from, it depends on the size of the filter
+                    left_boundary = np.floor(match_coordinates - kernel_width / 2)
+
+                    # Kernel width needs to be enlarged because when covering has sub-pixel borders, it must 'see' the pixel centers
+                    # of the pixels it only covered a part from. So we add one pixel at each side to consider (weights can zeroize them)
+                    expanded_kernel_width = np.ceil(kernel_width) + 2
+
+                    # Determine a set of field_of_view for each each output position, these are the pixels in the input image
+                    # that the pixel in the output image 'sees'. We get a matrix whos horizontal dim is the output pixels (big) and the
+                    # vertical dim is the pixels it 'sees' (kernel_size + 2)
+                    field_of_view = np.squeeze(
+                        np.int16(np.expand_dims(left_boundary, axis=1) + np.arange(expanded_kernel_width) - 1)
+                    )
+
+                    # Assign weight to each pixel in the field of view. A matrix whos horizontal dim is the output pixels and the
+                    # vertical dim is a list of weights matching to the pixel in the field of view (that are specified in
+                    # 'field_of_view')
+                    weights = fixed_kernel(1.0 * np.expand_dims(match_coordinates, axis=1) - field_of_view - 1)
+
+                    # Normalize weights to sum up to 1. be careful from dividing by 0
+                    sum_weights = np.sum(weights, axis=1)
+                    sum_weights[sum_weights == 0] = 1.0
+                    weights = 1.0 * weights / np.expand_dims(sum_weights, axis=1)
+
+                    # We use this mirror structure as a trick for reflection padding at the boundaries
+                    mirror = np.uint(np.concatenate((np.arange(in_length), np.arange(in_length - 1, -1, step=-1))))
+                    field_of_view = mirror[np.mod(field_of_view, mirror.shape[0])]
+
+                    # Get rid of  weights and pixel positions that are of zero weight
+                    non_zero_out_pixels = np.nonzero(np.any(weights, axis=0))
+                    weights = np.squeeze(weights[:, non_zero_out_pixels])
+                    field_of_view = np.squeeze(field_of_view[:, non_zero_out_pixels])
+
+                    # Final products are the relative positions and the matching weights, both are output_size X fixed_kernel_size
+                    return weights, field_of_view
+
+            self.down_sample = Resizer(in_shape, 1 / scale_factor)
+            for param in self.parameters():
+                param.requires_grad = False
+
+        def forward(self, data, **kwargs):
+            return self.down_sample(data)
+
+    # Gaussian blurring operator
+    class GaussialBlurOperator(nn.Module):
+        def __init__(self, kernel_size, intensity):
+            super().__init__()
+
+            class Blurkernel(nn.Module):
+                def __init__(self, blur_type="gaussian", kernel_size=31, std=3.0):
+                    super().__init__()
+                    self.blur_type = blur_type
+                    self.kernel_size = kernel_size
+                    self.std = std
+                    self.seq = nn.Sequential(
+                        nn.ReflectionPad2d(self.kernel_size // 2),
+                        nn.Conv2d(3, 3, self.kernel_size, stride=1, padding=0, bias=False, groups=3),
+                    )
+                    self.weights_init()
+
+                def forward(self, x):
+                    return self.seq(x)
+
+                def weights_init(self):
+                    if self.blur_type == "gaussian":
+                        n = np.zeros((self.kernel_size, self.kernel_size))
+                        n[self.kernel_size // 2, self.kernel_size // 2] = 1
+                        k = scipy.ndimage.gaussian_filter(n, sigma=self.std)
+                        k = torch.from_numpy(k)
+                        self.k = k
+                        for name, f in self.named_parameters():
+                            f.data.copy_(k)
+
+                def update_weights(self, k):
+                    if not torch.is_tensor(k):
+                        k = torch.from_numpy(k)
+                    for name, f in self.named_parameters():
+                        f.data.copy_(k)
+
+                def get_kernel(self):
+                    return self.k
+
+            self.kernel_size = kernel_size
+            self.conv = Blurkernel(blur_type="gaussian", kernel_size=kernel_size, std=intensity)
+            self.kernel = self.conv.get_kernel()
+            self.conv.update_weights(self.kernel.type(torch.float32))
+
+            for param in self.parameters():
+                param.requires_grad = False
+
+        def forward(self, data, **kwargs):
+            return self.conv(data)
+
+        def transpose(self, data, **kwargs):
+            return data
+
+        def get_kernel(self):
+            return self.kernel.view(1, 1, self.kernel_size, self.kernel_size)
+
+    # assuming the forward process y = f(x) is polluted by Gaussian noise, use l2 norm
+    def RMSELoss(yhat, y):
+        return torch.sqrt(torch.sum((yhat - y) ** 2))
+
+    # set up source image
+    src = Image.open("sample.png")
+    # read image into [1,3,H,W]
+    src = torch.from_numpy(np.array(src, dtype=np.float32)).permute(2, 0, 1)[None]
+    # normalize image to [-1,1]
+    src = (src / 127.5) - 1.0
+    src = src.to("cuda")
+
+    # set up operator and measurement
+    # operator = SuperResolutionOperator(in_shape=src.shape, scale_factor=4).to("cuda")
+    operator = GaussialBlurOperator(kernel_size=61, intensity=3.0).to("cuda")
+    measurement = operator(src)
+
+    # set up scheduler
+    scheduler = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256")
+    scheduler.set_timesteps(1000)
+
+    # set up model
+    model = UNet2DModel.from_pretrained("google/ddpm-celebahq-256").to("cuda")
+
+    save_image((src + 1.0) / 2.0, "dps_src.png")
+    save_image((measurement + 1.0) / 2.0, "dps_mea.png")
+
+    # finally, the pipeline
+    dpspipe = DPSPipeline(model, scheduler)
+    image = dpspipe(
+        measurement=measurement,
+        operator=operator,
+        loss_fn=RMSELoss,
+        zeta=1.0,
+    ).images[0]
+
+    image.save("dps_generated_image.png")
diff --git a/diffusers/examples/community/edict_pipeline.py b/diffusers/examples/community/edict_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac977f79abecd281c07e780c76023216afb1a5f6
--- /dev/null
+++ b/diffusers/examples/community/edict_pipeline.py
@@ -0,0 +1,264 @@
+from typing import Optional
+
+import torch
+from PIL import Image
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDIMScheduler, DiffusionPipeline, UNet2DConditionModel
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.utils import (
+    deprecate,
+)
+
+
+class EDICTPipeline(DiffusionPipeline):
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DDIMScheduler,
+        mixing_coeff: float = 0.93,
+        leapfrog_steps: bool = True,
+    ):
+        self.mixing_coeff = mixing_coeff
+        self.leapfrog_steps = leapfrog_steps
+
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    def _encode_prompt(
+        self, prompt: str, negative_prompt: Optional[str] = None, do_classifier_free_guidance: bool = False
+    ):
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        prompt_embeds = self.text_encoder(text_inputs.input_ids.to(self.device)).last_hidden_state
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=self.device)
+
+        if do_classifier_free_guidance:
+            uncond_tokens = "" if negative_prompt is None else negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            negative_prompt_embeds = self.text_encoder(uncond_input.input_ids.to(self.device)).last_hidden_state
+
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def denoise_mixing_layer(self, x: torch.Tensor, y: torch.Tensor):
+        x = self.mixing_coeff * x + (1 - self.mixing_coeff) * y
+        y = self.mixing_coeff * y + (1 - self.mixing_coeff) * x
+
+        return [x, y]
+
+    def noise_mixing_layer(self, x: torch.Tensor, y: torch.Tensor):
+        y = (y - (1 - self.mixing_coeff) * x) / self.mixing_coeff
+        x = (x - (1 - self.mixing_coeff) * y) / self.mixing_coeff
+
+        return [x, y]
+
+    def _get_alpha_and_beta(self, t: torch.Tensor):
+        # as self.alphas_cumprod is always in cpu
+        t = int(t)
+
+        alpha_prod = self.scheduler.alphas_cumprod[t] if t >= 0 else self.scheduler.final_alpha_cumprod
+
+        return alpha_prod, 1 - alpha_prod
+
+    def noise_step(
+        self,
+        base: torch.Tensor,
+        model_input: torch.Tensor,
+        model_output: torch.Tensor,
+        timestep: torch.Tensor,
+    ):
+        prev_timestep = timestep - self.scheduler.config.num_train_timesteps / self.scheduler.num_inference_steps
+
+        alpha_prod_t, beta_prod_t = self._get_alpha_and_beta(timestep)
+        alpha_prod_t_prev, beta_prod_t_prev = self._get_alpha_and_beta(prev_timestep)
+
+        a_t = (alpha_prod_t_prev / alpha_prod_t) ** 0.5
+        b_t = -a_t * (beta_prod_t**0.5) + beta_prod_t_prev**0.5
+
+        next_model_input = (base - b_t * model_output) / a_t
+
+        return model_input, next_model_input.to(base.dtype)
+
+    def denoise_step(
+        self,
+        base: torch.Tensor,
+        model_input: torch.Tensor,
+        model_output: torch.Tensor,
+        timestep: torch.Tensor,
+    ):
+        prev_timestep = timestep - self.scheduler.config.num_train_timesteps / self.scheduler.num_inference_steps
+
+        alpha_prod_t, beta_prod_t = self._get_alpha_and_beta(timestep)
+        alpha_prod_t_prev, beta_prod_t_prev = self._get_alpha_and_beta(prev_timestep)
+
+        a_t = (alpha_prod_t_prev / alpha_prod_t) ** 0.5
+        b_t = -a_t * (beta_prod_t**0.5) + beta_prod_t_prev**0.5
+        next_model_input = a_t * base + b_t * model_output
+
+        return model_input, next_model_input.to(base.dtype)
+
+    @torch.no_grad()
+    def decode_latents(self, latents: torch.Tensor):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        return image
+
+    @torch.no_grad()
+    def prepare_latents(
+        self,
+        image: Image.Image,
+        text_embeds: torch.Tensor,
+        timesteps: torch.Tensor,
+        guidance_scale: float,
+        generator: Optional[torch.Generator] = None,
+    ):
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        image = image.to(device=self.device, dtype=text_embeds.dtype)
+        latent = self.vae.encode(image).latent_dist.sample(generator)
+
+        latent = self.vae.config.scaling_factor * latent
+
+        coupled_latents = [latent.clone(), latent.clone()]
+
+        for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
+            coupled_latents = self.noise_mixing_layer(x=coupled_latents[0], y=coupled_latents[1])
+
+            # j - model_input index, k - base index
+            for j in range(2):
+                k = j ^ 1
+
+                if self.leapfrog_steps:
+                    if i % 2 == 0:
+                        k, j = j, k
+
+                model_input = coupled_latents[j]
+                base = coupled_latents[k]
+
+                latent_model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input
+
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeds).sample
+
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                base, model_input = self.noise_step(
+                    base=base,
+                    model_input=model_input,
+                    model_output=noise_pred,
+                    timestep=t,
+                )
+
+                coupled_latents[k] = model_input
+
+        return coupled_latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        base_prompt: str,
+        target_prompt: str,
+        image: Image.Image,
+        guidance_scale: float = 3.0,
+        num_inference_steps: int = 50,
+        strength: float = 0.8,
+        negative_prompt: Optional[str] = None,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+    ):
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        image = self.image_processor.preprocess(image)
+
+        base_embeds = self._encode_prompt(base_prompt, negative_prompt, do_classifier_free_guidance)
+        target_embeds = self._encode_prompt(target_prompt, negative_prompt, do_classifier_free_guidance)
+
+        self.scheduler.set_timesteps(num_inference_steps, self.device)
+
+        t_limit = num_inference_steps - int(num_inference_steps * strength)
+        fwd_timesteps = self.scheduler.timesteps[t_limit:]
+        bwd_timesteps = fwd_timesteps.flip(0)
+
+        coupled_latents = self.prepare_latents(image, base_embeds, bwd_timesteps, guidance_scale, generator)
+
+        for i, t in tqdm(enumerate(fwd_timesteps), total=len(fwd_timesteps)):
+            # j - model_input index, k - base index
+            for k in range(2):
+                j = k ^ 1
+
+                if self.leapfrog_steps:
+                    if i % 2 == 1:
+                        k, j = j, k
+
+                model_input = coupled_latents[j]
+                base = coupled_latents[k]
+
+                latent_model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input
+
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=target_embeds).sample
+
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                base, model_input = self.denoise_step(
+                    base=base,
+                    model_input=model_input,
+                    model_output=noise_pred,
+                    timestep=t,
+                )
+
+                coupled_latents[k] = model_input
+
+            coupled_latents = self.denoise_mixing_layer(x=coupled_latents[0], y=coupled_latents[1])
+
+        # either one is fine
+        final_latent = coupled_latents[0]
+
+        if output_type not in ["latent", "pt", "np", "pil"]:
+            deprecation_message = (
+                f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
+                "`pil`, `np`, `pt`, `latent`"
+            )
+            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
+            output_type = "np"
+
+        if output_type == "latent":
+            image = final_latent
+        else:
+            image = self.decode_latents(final_latent)
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        return image
diff --git a/diffusers/examples/community/fresco_v2v.py b/diffusers/examples/community/fresco_v2v.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab191ecf0d814217da7e204026ad4aed925abcd4
--- /dev/null
+++ b/diffusers/examples/community/fresco_v2v.py
@@ -0,0 +1,2511 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+import torch.utils.model_zoo
+from einops import rearrange, repeat
+from gmflow.gmflow import GMFlow
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from diffusers.models.attention_processor import AttnProcessor2_0
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.controlnet.pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def clear_cache():
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+def coords_grid(b, h, w, homogeneous=False, device=None):
+    y, x = torch.meshgrid(torch.arange(h), torch.arange(w))  # [H, W]
+
+    stacks = [x, y]
+
+    if homogeneous:
+        ones = torch.ones_like(x)  # [H, W]
+        stacks.append(ones)
+
+    grid = torch.stack(stacks, dim=0).float()  # [2, H, W] or [3, H, W]
+
+    grid = grid[None].repeat(b, 1, 1, 1)  # [B, 2, H, W] or [B, 3, H, W]
+
+    if device is not None:
+        grid = grid.to(device)
+
+    return grid
+
+
+def bilinear_sample(img, sample_coords, mode="bilinear", padding_mode="zeros", return_mask=False):
+    # img: [B, C, H, W]
+    # sample_coords: [B, 2, H, W] in image scale
+    if sample_coords.size(1) != 2:  # [B, H, W, 2]
+        sample_coords = sample_coords.permute(0, 3, 1, 2)
+
+    b, _, h, w = sample_coords.shape
+
+    # Normalize to [-1, 1]
+    x_grid = 2 * sample_coords[:, 0] / (w - 1) - 1
+    y_grid = 2 * sample_coords[:, 1] / (h - 1) - 1
+
+    grid = torch.stack([x_grid, y_grid], dim=-1)  # [B, H, W, 2]
+
+    img = F.grid_sample(img, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
+
+    if return_mask:
+        mask = (x_grid >= -1) & (y_grid >= -1) & (x_grid <= 1) & (y_grid <= 1)  # [B, H, W]
+
+        return img, mask
+
+    return img
+
+
+class Dilate:
+    def __init__(self, kernel_size=7, channels=1, device="cpu"):
+        self.kernel_size = kernel_size
+        self.channels = channels
+        gaussian_kernel = torch.ones(1, 1, self.kernel_size, self.kernel_size)
+        gaussian_kernel = gaussian_kernel.repeat(self.channels, 1, 1, 1)
+        self.mean = (self.kernel_size - 1) // 2
+        gaussian_kernel = gaussian_kernel.to(device)
+        self.gaussian_filter = gaussian_kernel
+
+    def __call__(self, x):
+        x = F.pad(x, (self.mean, self.mean, self.mean, self.mean), "replicate")
+        return torch.clamp(F.conv2d(x, self.gaussian_filter, bias=None), 0, 1)
+
+
+def flow_warp(feature, flow, mask=False, mode="bilinear", padding_mode="zeros"):
+    b, c, h, w = feature.size()
+    assert flow.size(1) == 2
+
+    grid = coords_grid(b, h, w).to(flow.device) + flow  # [B, 2, H, W]
+    grid = grid.to(feature.dtype)
+    return bilinear_sample(feature, grid, mode=mode, padding_mode=padding_mode, return_mask=mask)
+
+
+def forward_backward_consistency_check(fwd_flow, bwd_flow, alpha=0.01, beta=0.5):
+    # fwd_flow, bwd_flow: [B, 2, H, W]
+    # alpha and beta values are following UnFlow
+    # (https://arxiv.org/abs/1711.07837)
+    assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4
+    assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2
+    flow_mag = torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1)  # [B, H, W]
+
+    warped_bwd_flow = flow_warp(bwd_flow, fwd_flow)  # [B, 2, H, W]
+    warped_fwd_flow = flow_warp(fwd_flow, bwd_flow)  # [B, 2, H, W]
+
+    diff_fwd = torch.norm(fwd_flow + warped_bwd_flow, dim=1)  # [B, H, W]
+    diff_bwd = torch.norm(bwd_flow + warped_fwd_flow, dim=1)
+
+    threshold = alpha * flow_mag + beta
+
+    fwd_occ = (diff_fwd > threshold).float()  # [B, H, W]
+    bwd_occ = (diff_bwd > threshold).float()
+
+    return fwd_occ, bwd_occ
+
+
+def numpy2tensor(img):
+    x0 = torch.from_numpy(img.copy()).float().cuda() / 255.0 * 2.0 - 1.0
+    x0 = torch.stack([x0], dim=0)
+    # einops.rearrange(x0, 'b h w c -> b c h w').clone()
+    return x0.permute(0, 3, 1, 2)
+
+
+def calc_mean_std(feat, eps=1e-5, chunk=1):
+    size = feat.size()
+    assert len(size) == 4
+    if chunk == 2:
+        feat = torch.cat(feat.chunk(2), dim=3)
+    N, C = size[:2]
+    feat_var = feat.view(N // chunk, C, -1).var(dim=2) + eps
+    feat_std = feat_var.sqrt().view(N, C, 1, 1)
+    feat_mean = feat.view(N // chunk, C, -1).mean(dim=2).view(N // chunk, C, 1, 1)
+    return feat_mean.repeat(chunk, 1, 1, 1), feat_std.repeat(chunk, 1, 1, 1)
+
+
+def adaptive_instance_normalization(content_feat, style_feat, chunk=1):
+    assert content_feat.size()[:2] == style_feat.size()[:2]
+    size = content_feat.size()
+    style_mean, style_std = calc_mean_std(style_feat, chunk)
+    content_mean, content_std = calc_mean_std(content_feat)
+
+    normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size)
+    return normalized_feat * style_std.expand(size) + style_mean.expand(size)
+
+
+def optimize_feature(
+    sample, flows, occs, correlation_matrix=[], intra_weight=1e2, iters=20, unet_chunk_size=2, optimize_temporal=True
+):
+    """
+    FRESO-guided latent feature optimization
+    * optimize spatial correspondence (match correlation_matrix)
+    * optimize temporal correspondence (match warped_image)
+    """
+    if (flows is None or occs is None or (not optimize_temporal)) and (
+        intra_weight == 0 or len(correlation_matrix) == 0
+    ):
+        return sample
+    # flows=[fwd_flows, bwd_flows]: (N-1)*2*H1*W1
+    # occs=[fwd_occs, bwd_occs]: (N-1)*H1*W1
+    # sample: 2N*C*H*W
+    torch.cuda.empty_cache()
+    video_length = sample.shape[0] // unet_chunk_size
+    latent = rearrange(sample.to(torch.float32), "(b f) c h w -> b f c h w", f=video_length)
+
+    cs = torch.nn.Parameter((latent.detach().clone()))
+    optimizer = torch.optim.Adam([cs], lr=0.2)
+
+    # unify resolution
+    if flows is not None and occs is not None:
+        scale = sample.shape[2] * 1.0 / flows[0].shape[2]
+        kernel = int(1 / scale)
+        bwd_flow_ = F.interpolate(flows[1] * scale, scale_factor=scale, mode="bilinear").repeat(
+            unet_chunk_size, 1, 1, 1
+        )
+        bwd_occ_ = F.max_pool2d(occs[1].unsqueeze(1), kernel_size=kernel).repeat(
+            unet_chunk_size, 1, 1, 1
+        )  # 2(N-1)*1*H1*W1
+        fwd_flow_ = F.interpolate(flows[0] * scale, scale_factor=scale, mode="bilinear").repeat(
+            unet_chunk_size, 1, 1, 1
+        )
+        fwd_occ_ = F.max_pool2d(occs[0].unsqueeze(1), kernel_size=kernel).repeat(
+            unet_chunk_size, 1, 1, 1
+        )  # 2(N-1)*1*H1*W1
+        # match frame 0,1,2,3 and frame 1,2,3,0
+        reshuffle_list = list(range(1, video_length)) + [0]
+
+    # attention_probs is the GRAM matrix of the normalized feature
+    attention_probs = None
+    for tmp in correlation_matrix:
+        if sample.shape[2] * sample.shape[3] == tmp.shape[1]:
+            attention_probs = tmp  # 2N*HW*HW
+            break
+
+    n_iter = [0]
+    while n_iter[0] < iters:
+
+        def closure():
+            optimizer.zero_grad()
+
+            loss = 0
+
+            # temporal consistency loss
+            if optimize_temporal and flows is not None and occs is not None:
+                c1 = rearrange(cs[:, :], "b f c h w -> (b f) c h w")
+                c2 = rearrange(cs[:, reshuffle_list], "b f c h w -> (b f) c h w")
+                warped_image1 = flow_warp(c1, bwd_flow_)
+                warped_image2 = flow_warp(c2, fwd_flow_)
+                loss = (
+                    abs((c2 - warped_image1) * (1 - bwd_occ_)) + abs((c1 - warped_image2) * (1 - fwd_occ_))
+                ).mean() * 2
+
+            # spatial consistency loss
+            if attention_probs is not None and intra_weight > 0:
+                cs_vector = rearrange(cs, "b f c h w -> (b f) (h w) c")
+                # attention_scores = torch.bmm(cs_vector, cs_vector.transpose(-1, -2))
+                # cs_attention_probs = attention_scores.softmax(dim=-1)
+                cs_vector = cs_vector / ((cs_vector**2).sum(dim=2, keepdims=True) ** 0.5)
+                cs_attention_probs = torch.bmm(cs_vector, cs_vector.transpose(-1, -2))
+                tmp = F.l1_loss(cs_attention_probs, attention_probs) * intra_weight
+                loss = tmp + loss
+
+            loss.backward()
+            n_iter[0] += 1
+
+            return loss
+
+        optimizer.step(closure)
+
+    torch.cuda.empty_cache()
+    return adaptive_instance_normalization(rearrange(cs.data.to(sample.dtype), "b f c h w -> (b f) c h w"), sample)
+
+
+@torch.no_grad()
+def warp_tensor(sample, flows, occs, saliency, unet_chunk_size):
+    """
+    Warp images or features based on optical flow
+    Fuse the warped imges or features based on occusion masks and saliency map
+    """
+    scale = sample.shape[2] * 1.0 / flows[0].shape[2]
+    kernel = int(1 / scale)
+    bwd_flow_ = F.interpolate(flows[1] * scale, scale_factor=scale, mode="bilinear")
+    bwd_occ_ = F.max_pool2d(occs[1].unsqueeze(1), kernel_size=kernel)  # (N-1)*1*H1*W1
+    if scale == 1:
+        bwd_occ_ = Dilate(kernel_size=13, device=sample.device)(bwd_occ_)
+    fwd_flow_ = F.interpolate(flows[0] * scale, scale_factor=scale, mode="bilinear")
+    fwd_occ_ = F.max_pool2d(occs[0].unsqueeze(1), kernel_size=kernel)  # (N-1)*1*H1*W1
+    if scale == 1:
+        fwd_occ_ = Dilate(kernel_size=13, device=sample.device)(fwd_occ_)
+    scale2 = sample.shape[2] * 1.0 / saliency.shape[2]
+    saliency = F.interpolate(saliency, scale_factor=scale2, mode="bilinear")
+    latent = sample.to(torch.float32)
+    video_length = sample.shape[0] // unet_chunk_size
+    warp_saliency = flow_warp(saliency, bwd_flow_)
+    warp_saliency_ = flow_warp(saliency[0:1], fwd_flow_[video_length - 1 : video_length])
+
+    for j in range(unet_chunk_size):
+        for ii in range(video_length - 1):
+            i = video_length * j + ii
+            warped_image = flow_warp(latent[i : i + 1], bwd_flow_[ii : ii + 1])
+            mask = (1 - bwd_occ_[ii : ii + 1]) * saliency[ii + 1 : ii + 2] * warp_saliency[ii : ii + 1]
+            latent[i + 1 : i + 2] = latent[i + 1 : i + 2] * (1 - mask) + warped_image * mask
+        i = video_length * j
+        ii = video_length - 1
+        warped_image = flow_warp(latent[i : i + 1], fwd_flow_[ii : ii + 1])
+        mask = (1 - fwd_occ_[ii : ii + 1]) * saliency[ii : ii + 1] * warp_saliency_
+        latent[ii + i : ii + i + 1] = latent[ii + i : ii + i + 1] * (1 - mask) + warped_image * mask
+
+    return latent.to(sample.dtype)
+
+
+def my_forward(
+    self,
+    steps=[],
+    layers=[0, 1, 2, 3],
+    flows=None,
+    occs=None,
+    correlation_matrix=[],
+    intra_weight=1e2,
+    iters=20,
+    optimize_temporal=True,
+    saliency=None,
+):
+    """
+    Hacked pipe.unet.forward()
+    copied from https://github.com/huggingface/diffusers/blob/v0.19.3/src/diffusers/models/unet_2d_condition.py#L700
+    if you are using a new version of diffusers, please copy the source code and modify it accordingly (find [HACK] in the code)
+    * restore and return the decoder features
+    * optimize the decoder features
+    * perform background smoothing
+    """
+
+    def forward(
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        The [`UNet2DConditionModel`] forward method.
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
+            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.FloatTensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            encoder_attention_mask (`torch.Tensor`):
+                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
+                a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # there might be better ways to encapsulate this.
+                class_labels = class_labels.to(dtype=sample.dtype)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+
+            if self.config.class_embeddings_concat:
+                emb = torch.cat([emb, class_emb], dim=-1)
+            else:
+                emb = emb + class_emb
+
+        if self.config.addition_embed_type == "text":
+            aug_emb = self.add_embedding(encoder_hidden_states)
+        elif self.config.addition_embed_type == "text_image":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+
+            image_embs = added_cond_kwargs.get("image_embeds")
+            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+            aug_emb = self.add_embedding(text_embs, image_embs)
+        elif self.config.addition_embed_type == "text_time":
+            # SDXL - style
+            if "text_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if "time_ids" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            time_embeds = self.add_time_proj(time_ids.flatten())
+            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+
+            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+            add_embeds = add_embeds.to(emb.dtype)
+            aug_emb = self.add_embedding(add_embeds)
+        elif self.config.addition_embed_type == "image":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            aug_emb = self.add_embedding(image_embs)
+        elif self.config.addition_embed_type == "image_hint":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            hint = added_cond_kwargs.get("hint")
+            aug_emb, hint = self.add_embedding(image_embs, hint)
+            sample = torch.cat([sample, hint], dim=1)
+
+        emb = emb + aug_emb if aug_emb is not None else emb
+
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+            # Kadinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 3. down
+
+        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+        is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None
+
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                # For t2i-adapter CrossAttnDownBlock2D
+                additional_residuals = {}
+                if is_adapter and len(down_block_additional_residuals) > 0:
+                    additional_residuals["additional_residuals"] = down_block_additional_residuals.pop(0)
+
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    **additional_residuals,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+                if is_adapter and len(down_block_additional_residuals) > 0:
+                    sample += down_block_additional_residuals.pop(0)
+            down_block_res_samples += res_samples
+
+        if is_controlnet:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            sample = self.mid_block(
+                sample,
+                emb,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                cross_attention_kwargs=cross_attention_kwargs,
+                encoder_attention_mask=encoder_attention_mask,
+            )
+
+        if is_controlnet:
+            sample = sample + mid_block_additional_residual
+
+        # 5. up
+        """
+        [HACK] restore the decoder features in up_samples
+        """
+        up_samples = ()
+        # down_samples = ()
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            """
+            [HACK] restore the decoder features in up_samples
+            [HACK] optimize the decoder features
+            [HACK] perform background smoothing
+            """
+            if i in layers:
+                up_samples += (sample,)
+            if timestep in steps and i in layers:
+                sample = optimize_feature(
+                    sample, flows, occs, correlation_matrix, intra_weight, iters, optimize_temporal=optimize_temporal
+                )
+                if saliency is not None:
+                    sample = warp_tensor(sample, flows, occs, saliency, 2)
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+                )
+
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        """
+        [HACK] return the output feature as well as the decoder features
+        """
+        if not return_dict:
+            return (sample,) + up_samples
+
+        return UNet2DConditionOutput(sample=sample)
+
+    return forward
+
+
+@torch.no_grad()
+def get_single_mapping_ind(bwd_flow, bwd_occ, imgs, scale=1.0):
+    """
+    FLATTEN: Optical fLow-guided attention (Temoporal-guided attention)
+    Find the correspondence between every pixels in a pair of frames
+
+    [input]
+    bwd_flow: 1*2*H*W
+    bwd_occ: 1*H*W      i.e., f2 = warp(f1, bwd_flow) * bwd_occ
+    imgs: 2*3*H*W       i.e., [f1,f2]
+
+    [output]
+    mapping_ind: pixel index correspondence
+    unlinkedmask: indicate whether a pixel has no correspondence
+    i.e., f2 = f1[mapping_ind] * unlinkedmask
+    """
+    flows = F.interpolate(bwd_flow, scale_factor=1.0 / scale, mode="bilinear")[0][[1, 0]] / scale  # 2*H*W
+    _, H, W = flows.shape
+    masks = torch.logical_not(F.interpolate(bwd_occ[None], scale_factor=1.0 / scale, mode="bilinear") > 0.5)[
+        0
+    ]  # 1*H*W
+    frames = F.interpolate(imgs, scale_factor=1.0 / scale, mode="bilinear").view(2, 3, -1)  # 2*3*HW
+    grid = torch.stack(torch.meshgrid([torch.arange(H), torch.arange(W)]), dim=0).to(flows.device)  # 2*H*W
+    warp_grid = torch.round(grid + flows)
+    mask = torch.logical_and(
+        torch.logical_and(
+            torch.logical_and(torch.logical_and(warp_grid[0] >= 0, warp_grid[0] < H), warp_grid[1] >= 0),
+            warp_grid[1] < W,
+        ),
+        masks[0],
+    ).view(-1)  # HW
+    warp_grid = warp_grid.view(2, -1)  # 2*HW
+    warp_ind = (warp_grid[0] * W + warp_grid[1]).to(torch.long)  # HW
+    mapping_ind = torch.zeros_like(warp_ind) - 1  # HW
+
+    for f0ind, f1ind in enumerate(warp_ind):
+        if mask[f0ind]:
+            if mapping_ind[f1ind] == -1:
+                mapping_ind[f1ind] = f0ind
+            else:
+                targetv = frames[0, :, f1ind]
+                pref0ind = mapping_ind[f1ind]
+                prev = frames[1, :, pref0ind]
+                v = frames[1, :, f0ind]
+                if ((prev - targetv) ** 2).mean() > ((v - targetv) ** 2).mean():
+                    mask[pref0ind] = False
+                    mapping_ind[f1ind] = f0ind
+                else:
+                    mask[f0ind] = False
+
+    unusedind = torch.arange(len(mask)).to(mask.device)[~mask]
+    unlinkedmask = mapping_ind == -1
+    mapping_ind[unlinkedmask] = unusedind
+    return mapping_ind, unlinkedmask
+
+
+@torch.no_grad()
+def get_mapping_ind(bwd_flows, bwd_occs, imgs, scale=1.0):
+    """
+    FLATTEN: Optical fLow-guided attention (Temoporal-guided attention)
+    Find pixel correspondence between every consecutive frames in a batch
+
+    [input]
+    bwd_flow: (N-1)*2*H*W
+    bwd_occ: (N-1)*H*W
+    imgs: N*3*H*W
+
+    [output]
+    fwd_mappings: N*1*HW
+    bwd_mappings: N*1*HW
+    flattn_mask: HW*1*N*N
+    i.e., imgs[i,:,fwd_mappings[i]] corresponds to imgs[0]
+    i.e., imgs[i,:,fwd_mappings[i]][:,bwd_mappings[i]] restore the original imgs[i]
+    """
+    N, H, W = imgs.shape[0], int(imgs.shape[2] // scale), int(imgs.shape[3] // scale)
+    iterattn_mask = torch.ones(H * W, N, N, dtype=torch.bool).to(imgs.device)
+    for i in range(len(imgs) - 1):
+        one_mask = torch.ones(N, N, dtype=torch.bool).to(imgs.device)
+        one_mask[: i + 1, i + 1 :] = False
+        one_mask[i + 1 :, : i + 1] = False
+        mapping_ind, unlinkedmask = get_single_mapping_ind(
+            bwd_flows[i : i + 1], bwd_occs[i : i + 1], imgs[i : i + 2], scale
+        )
+        if i == 0:
+            fwd_mapping = [torch.arange(len(mapping_ind)).to(mapping_ind.device)]
+            bwd_mapping = [torch.arange(len(mapping_ind)).to(mapping_ind.device)]
+        iterattn_mask[unlinkedmask[fwd_mapping[-1]]] = torch.logical_and(
+            iterattn_mask[unlinkedmask[fwd_mapping[-1]]], one_mask
+        )
+        fwd_mapping += [mapping_ind[fwd_mapping[-1]]]
+        bwd_mapping += [torch.sort(fwd_mapping[-1])[1]]
+    fwd_mappings = torch.stack(fwd_mapping, dim=0).unsqueeze(1)
+    bwd_mappings = torch.stack(bwd_mapping, dim=0).unsqueeze(1)
+    return fwd_mappings, bwd_mappings, iterattn_mask.unsqueeze(1)
+
+
+def apply_FRESCO_opt(
+    pipe,
+    steps=[],
+    layers=[0, 1, 2, 3],
+    flows=None,
+    occs=None,
+    correlation_matrix=[],
+    intra_weight=1e2,
+    iters=20,
+    optimize_temporal=True,
+    saliency=None,
+):
+    """
+    Apply FRESCO-based optimization to a StableDiffusionPipeline
+    """
+    pipe.unet.forward = my_forward(
+        pipe.unet, steps, layers, flows, occs, correlation_matrix, intra_weight, iters, optimize_temporal, saliency
+    )
+
+
+@torch.no_grad()
+def get_intraframe_paras(pipe, imgs, frescoProc, prompt_embeds, do_classifier_free_guidance=True, generator=None):
+    """
+    Get parameters for spatial-guided attention and optimization
+    * perform one step denoising
+    * collect attention feature, stored in frescoProc.controller.stored_attn['decoder_attn']
+    * compute the gram matrix of the normalized feature for spatial consistency loss
+    """
+
+    noise_scheduler = pipe.scheduler
+    timestep = noise_scheduler.timesteps[-1]
+    device = pipe._execution_device
+    B, C, H, W = imgs.shape
+
+    frescoProc.controller.disable_controller()
+    apply_FRESCO_opt(pipe)
+    frescoProc.controller.clear_store()
+    frescoProc.controller.enable_store()
+
+    latents = pipe.prepare_latents(
+        imgs.to(pipe.unet.dtype), timestep, B, 1, prompt_embeds.dtype, device, generator=generator, repeat_noise=False
+    )
+
+    latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+    model_output = pipe.unet(
+        latent_model_input,
+        timestep,
+        encoder_hidden_states=prompt_embeds,
+        cross_attention_kwargs=None,
+        return_dict=False,
+    )
+
+    frescoProc.controller.disable_store()
+
+    # gram matrix of the normalized feature for spatial consistency loss
+    correlation_matrix = []
+    for tmp in model_output[1:]:
+        latent_vector = rearrange(tmp, "b c h w -> b (h w) c")
+        latent_vector = latent_vector / ((latent_vector**2).sum(dim=2, keepdims=True) ** 0.5)
+        attention_probs = torch.bmm(latent_vector, latent_vector.transpose(-1, -2))
+        correlation_matrix += [attention_probs.detach().clone().to(torch.float32)]
+        del attention_probs, latent_vector, tmp
+    del model_output
+
+    clear_cache()
+
+    return correlation_matrix
+
+
+@torch.no_grad()
+def get_flow_and_interframe_paras(flow_model, imgs):
+    """
+    Get parameters for temporal-guided attention and optimization
+    * predict optical flow and occlusion mask
+    * compute pixel index correspondence for FLATTEN
+    """
+    images = torch.stack([torch.from_numpy(img).permute(2, 0, 1).float() for img in imgs], dim=0).cuda()
+    imgs_torch = torch.cat([numpy2tensor(img) for img in imgs], dim=0)
+
+    reshuffle_list = list(range(1, len(images))) + [0]
+
+    results_dict = flow_model(
+        images,
+        images[reshuffle_list],
+        attn_splits_list=[2],
+        corr_radius_list=[-1],
+        prop_radius_list=[-1],
+        pred_bidir_flow=True,
+    )
+    flow_pr = results_dict["flow_preds"][-1]  # [2*B, 2, H, W]
+    fwd_flows, bwd_flows = flow_pr.chunk(2)  # [B, 2, H, W]
+    fwd_occs, bwd_occs = forward_backward_consistency_check(fwd_flows, bwd_flows)  # [B, H, W]
+
+    warped_image1 = flow_warp(images, bwd_flows)
+    bwd_occs = torch.clamp(
+        bwd_occs + (abs(images[reshuffle_list] - warped_image1).mean(dim=1) > 255 * 0.25).float(), 0, 1
+    )
+
+    warped_image2 = flow_warp(images[reshuffle_list], fwd_flows)
+    fwd_occs = torch.clamp(fwd_occs + (abs(images - warped_image2).mean(dim=1) > 255 * 0.25).float(), 0, 1)
+
+    attn_mask = []
+    for scale in [8.0, 16.0, 32.0]:
+        bwd_occs_ = F.interpolate(bwd_occs[:-1].unsqueeze(1), scale_factor=1.0 / scale, mode="bilinear")
+        attn_mask += [
+            torch.cat((bwd_occs_[0:1].reshape(1, -1) > -1, bwd_occs_.reshape(bwd_occs_.shape[0], -1) > 0.5), dim=0)
+        ]
+
+    fwd_mappings = []
+    bwd_mappings = []
+    interattn_masks = []
+    for scale in [8.0, 16.0]:
+        fwd_mapping, bwd_mapping, interattn_mask = get_mapping_ind(bwd_flows, bwd_occs, imgs_torch, scale=scale)
+        fwd_mappings += [fwd_mapping]
+        bwd_mappings += [bwd_mapping]
+        interattn_masks += [interattn_mask]
+
+    interattn_paras = {}
+    interattn_paras["fwd_mappings"] = fwd_mappings
+    interattn_paras["bwd_mappings"] = bwd_mappings
+    interattn_paras["interattn_masks"] = interattn_masks
+
+    clear_cache()
+
+    return [fwd_flows, bwd_flows], [fwd_occs, bwd_occs], attn_mask, interattn_paras
+
+
+class AttentionControl:
+    """
+    Control FRESCO-based attention
+    * enable/diable spatial-guided attention
+    * enable/diable temporal-guided attention
+    * enable/diable cross-frame attention
+    * collect intermediate attention feature (for spatial-guided attention)
+    """
+
+    def __init__(self):
+        self.stored_attn = self.get_empty_store()
+        self.store = False
+        self.index = 0
+        self.attn_mask = None
+        self.interattn_paras = None
+        self.use_interattn = False
+        self.use_cfattn = False
+        self.use_intraattn = False
+        self.intraattn_bias = 0
+        self.intraattn_scale_factor = 0.2
+        self.interattn_scale_factor = 0.2
+
+    @staticmethod
+    def get_empty_store():
+        return {
+            "decoder_attn": [],
+        }
+
+    def clear_store(self):
+        del self.stored_attn
+        torch.cuda.empty_cache()
+        gc.collect()
+        self.stored_attn = self.get_empty_store()
+        self.disable_intraattn()
+
+    # store attention feature of the input frame for spatial-guided attention
+    def enable_store(self):
+        self.store = True
+
+    def disable_store(self):
+        self.store = False
+
+    # spatial-guided attention
+    def enable_intraattn(self):
+        self.index = 0
+        self.use_intraattn = True
+        self.disable_store()
+        if len(self.stored_attn["decoder_attn"]) == 0:
+            self.use_intraattn = False
+
+    def disable_intraattn(self):
+        self.index = 0
+        self.use_intraattn = False
+        self.disable_store()
+
+    def disable_cfattn(self):
+        self.use_cfattn = False
+
+    # cross frame attention
+    def enable_cfattn(self, attn_mask=None):
+        if attn_mask:
+            if self.attn_mask:
+                del self.attn_mask
+                torch.cuda.empty_cache()
+            self.attn_mask = attn_mask
+            self.use_cfattn = True
+        else:
+            if self.attn_mask:
+                self.use_cfattn = True
+            else:
+                print("Warning: no valid cross-frame attention parameters available!")
+                self.disable_cfattn()
+
+    def disable_interattn(self):
+        self.use_interattn = False
+
+    # temporal-guided attention
+    def enable_interattn(self, interattn_paras=None):
+        if interattn_paras:
+            if self.interattn_paras:
+                del self.interattn_paras
+                torch.cuda.empty_cache()
+            self.interattn_paras = interattn_paras
+            self.use_interattn = True
+        else:
+            if self.interattn_paras:
+                self.use_interattn = True
+            else:
+                print("Warning: no valid temporal-guided attention parameters available!")
+                self.disable_interattn()
+
+    def disable_controller(self):
+        self.disable_intraattn()
+        self.disable_interattn()
+        self.disable_cfattn()
+
+    def enable_controller(self, interattn_paras=None, attn_mask=None):
+        self.enable_intraattn()
+        self.enable_interattn(interattn_paras)
+        self.enable_cfattn(attn_mask)
+
+    def forward(self, context):
+        if self.store:
+            self.stored_attn["decoder_attn"].append(context.detach())
+        if self.use_intraattn and len(self.stored_attn["decoder_attn"]) > 0:
+            tmp = self.stored_attn["decoder_attn"][self.index]
+            self.index = self.index + 1
+            if self.index >= len(self.stored_attn["decoder_attn"]):
+                self.index = 0
+                self.disable_store()
+            return tmp
+        return context
+
+    def __call__(self, context):
+        context = self.forward(context)
+        return context
+
+
+class FRESCOAttnProcessor2_0:
+    """
+    Hack self attention to FRESCO-based attention
+    * adding spatial-guided attention
+    * adding temporal-guided attention
+    * adding cross-frame attention
+
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    Usage
+    frescoProc = FRESCOAttnProcessor2_0(2, attn_mask)
+    attnProc = AttnProcessor2_0()
+
+    attn_processor_dict = {}
+    for k in pipe.unet.attn_processors.keys():
+        if k.startswith("up_blocks.2") or k.startswith("up_blocks.3"):
+            attn_processor_dict[k] = frescoProc
+        else:
+            attn_processor_dict[k] = attnProc
+    pipe.unet.set_attn_processor(attn_processor_dict)
+    """
+
+    def __init__(self, unet_chunk_size=2, controller=None):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        self.unet_chunk_size = unet_chunk_size
+        self.controller = controller
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        crossattn = False
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+            if self.controller and self.controller.store:
+                self.controller(hidden_states.detach().clone())
+        else:
+            crossattn = True
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        # BC * HW * 8D
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query_raw, key_raw = None, None
+        if self.controller and self.controller.use_interattn and (not crossattn):
+            query_raw, key_raw = query.clone(), key.clone()
+
+        inner_dim = key.shape[-1]  # 8D
+        head_dim = inner_dim // attn.heads  # D
+
+        """for efficient cross-frame attention"""
+        if self.controller and self.controller.use_cfattn and (not crossattn):
+            video_length = key.size()[0] // self.unet_chunk_size
+            former_frame_index = [0] * video_length
+            attn_mask = None
+            if self.controller.attn_mask is not None:
+                for m in self.controller.attn_mask:
+                    if m.shape[1] == key.shape[1]:
+                        attn_mask = m
+            # BC * HW * 8D --> B * C * HW * 8D
+            key = rearrange(key, "(b f) d c -> b f d c", f=video_length)
+            # B * C * HW * 8D --> B * C * HW * 8D
+            if attn_mask is None:
+                key = key[:, former_frame_index]
+            else:
+                key = repeat(key[:, attn_mask], "b d c -> b f d c", f=video_length)
+            # B * C * HW * 8D --> BC * HW * 8D
+            key = rearrange(key, "b f d c -> (b f) d c").detach()
+            value = rearrange(value, "(b f) d c -> b f d c", f=video_length)
+            if attn_mask is None:
+                value = value[:, former_frame_index]
+            else:
+                value = repeat(value[:, attn_mask], "b d c -> b f d c", f=video_length)
+            value = rearrange(value, "b f d c -> (b f) d c").detach()
+
+        # BC * HW * 8D --> BC * HW * 8 * D --> BC * 8 * HW * D
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # BC * 8 * HW2 * D
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # BC * 8 * HW2 * D2
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        """for spatial-guided intra-frame attention"""
+        if self.controller and self.controller.use_intraattn and (not crossattn):
+            ref_hidden_states = self.controller(None)
+            assert ref_hidden_states.shape == encoder_hidden_states.shape
+            query_ = attn.to_q(ref_hidden_states)
+            key_ = attn.to_k(ref_hidden_states)
+
+            # BC * 8 * HW * D
+            query_ = query_.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key_ = key_.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            query = F.scaled_dot_product_attention(
+                query_,
+                key_ * self.controller.intraattn_scale_factor,
+                query,
+                attn_mask=torch.eye(query_.size(-2), key_.size(-2), dtype=query.dtype, device=query.device)
+                * self.controller.intraattn_bias,
+            ).detach()
+
+            del query_, key_
+            torch.cuda.empty_cache()
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        # output: BC * 8 * HW * D2
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        """for temporal-guided inter-frame attention (FLATTEN)"""
+        if self.controller and self.controller.use_interattn and (not crossattn):
+            del query, key, value
+            torch.cuda.empty_cache()
+            bwd_mapping = None
+            fwd_mapping = None
+            for i, f in enumerate(self.controller.interattn_paras["fwd_mappings"]):
+                if f.shape[2] == hidden_states.shape[2]:
+                    fwd_mapping = f
+                    bwd_mapping = self.controller.interattn_paras["bwd_mappings"][i]
+                    interattn_mask = self.controller.interattn_paras["interattn_masks"][i]
+            video_length = key_raw.size()[0] // self.unet_chunk_size
+            # BC * HW * 8D --> C * 8BD * HW
+            key = rearrange(key_raw, "(b f) d c -> f (b c) d", f=video_length)
+            query = rearrange(query_raw, "(b f) d c -> f (b c) d", f=video_length)
+            # BC * 8 * HW * D --> C * 8BD * HW
+            # key = rearrange(hidden_states, "(b f) h d c -> f (b h c) d", f=video_length) ########
+            # query = rearrange(hidden_states, "(b f) h d c -> f (b h c) d", f=video_length) #######
+
+            value = rearrange(hidden_states, "(b f) h d c -> f (b h c) d", f=video_length)
+            key = torch.gather(key, 2, fwd_mapping.expand(-1, key.shape[1], -1))
+            query = torch.gather(query, 2, fwd_mapping.expand(-1, query.shape[1], -1))
+            value = torch.gather(value, 2, fwd_mapping.expand(-1, value.shape[1], -1))
+            # C * 8BD * HW --> BHW, C, 8D
+            key = rearrange(key, "f (b c) d -> (b d) f c", b=self.unet_chunk_size)
+            query = rearrange(query, "f (b c) d -> (b d) f c", b=self.unet_chunk_size)
+            value = rearrange(value, "f (b c) d -> (b d) f c", b=self.unet_chunk_size)
+            # BHW * C * 8D --> BHW * C * 8 * D--> BHW * 8 * C * D
+            query = query.view(-1, video_length, attn.heads, head_dim).transpose(1, 2).detach()
+            key = key.view(-1, video_length, attn.heads, head_dim).transpose(1, 2).detach()
+            value = value.view(-1, video_length, attn.heads, head_dim).transpose(1, 2).detach()
+            hidden_states_ = F.scaled_dot_product_attention(
+                query,
+                key * self.controller.interattn_scale_factor,
+                value,
+                # .to(query.dtype)-1.0) * 1e6 -
+                attn_mask=(interattn_mask.repeat(self.unet_chunk_size, 1, 1, 1)),
+                # torch.eye(interattn_mask.shape[2]).to(query.device).to(query.dtype) * 1e4,
+            )
+
+            # BHW * 8 * C * D --> C * 8BD * HW
+            hidden_states_ = rearrange(hidden_states_, "(b d) h f c -> f (b h c) d", b=self.unet_chunk_size)
+            hidden_states_ = torch.gather(
+                hidden_states_, 2, bwd_mapping.expand(-1, hidden_states_.shape[1], -1)
+            ).detach()
+            # C * 8BD * HW --> BC * 8 * HW * D
+            hidden_states = rearrange(
+                hidden_states_, "f (b h c) d -> (b f) h d c", b=self.unet_chunk_size, h=attn.heads
+            )
+
+        # BC * 8 * HW * D --> BC * HW * 8D
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+def apply_FRESCO_attn(pipe):
+    """
+    Apply FRESCO-guided attention to a StableDiffusionPipeline
+    """
+    frescoProc = FRESCOAttnProcessor2_0(2, AttentionControl())
+    attnProc = AttnProcessor2_0()
+    attn_processor_dict = {}
+    for k in pipe.unet.attn_processors.keys():
+        if k.startswith("up_blocks.2") or k.startswith("up_blocks.3"):
+            attn_processor_dict[k] = frescoProc
+        else:
+            attn_processor_dict[k] = attnProc
+    pipe.unet.set_attn_processor(attn_processor_dict)
+    return frescoProc
+
+
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+def prepare_image(image):
+    if isinstance(image, torch.Tensor):
+        # Batch single image
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        image = image.to(dtype=torch.float32)
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    return image
+
+
+class FrescoV2VPipeline(StableDiffusionControlNetImg2ImgPipeline):
+    r"""
+    Pipeline for video-to-video translation using Stable Diffusion with FRESCO Algorithm.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__(
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            controlnet,
+            scheduler,
+            safety_checker,
+            feature_extractor,
+            image_encoder,
+            requires_safety_checker,
+        )
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+        frescoProc = FRESCOAttnProcessor2_0(2, AttentionControl())
+        attnProc = AttnProcessor2_0()
+        attn_processor_dict = {}
+        for k in self.unet.attn_processors.keys():
+            if k.startswith("up_blocks.2") or k.startswith("up_blocks.3"):
+                attn_processor_dict[k] = frescoProc
+            else:
+                attn_processor_dict[k] = attnProc
+        self.unet.set_attn_processor(attn_processor_dict)
+        self.frescoProc = frescoProc
+
+        flow_model = GMFlow(
+            feature_channels=128,
+            num_scales=1,
+            upsample_factor=8,
+            num_head=1,
+            attention_type="swin",
+            ffn_dim_expansion=4,
+            num_transformer_layers=6,
+        ).to(self.device)
+
+        checkpoint = torch.utils.model_zoo.load_url(
+            "https://huggingface.co/Anonymous-sub/Rerender/resolve/main/models/gmflow_sintel-0c07dcb3.pth",
+            map_location=lambda storage, loc: storage,
+        )
+        weights = checkpoint["model"] if "model" in checkpoint else checkpoint
+        flow_model.load_state_dict(weights, strict=False)
+        flow_model.eval()
+        self.flow_model = flow_model
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            repeat_dims = [1]
+            image_embeds = []
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                    )
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                else:
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                image_embeds.append(single_image_embeds)
+
+        return image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents
+    def prepare_latents(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, repeat_noise, generator=None
+    ):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        if repeat_noise:
+            noise = randn_tensor((1, *shape[1:]), generator=generator, device=device, dtype=dtype)
+            one_tuple = (1,) * (len(shape) - 1)
+            noise = noise.repeat(batch_size, *one_tuple)
+        else:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        frames: Union[List[np.ndarray], torch.FloatTensor] = None,
+        control_frames: Union[List[np.ndarray], torch.FloatTensor] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        end_opt_step=15,
+        num_intraattn_steps=1,
+        step_interattn_end=350,
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            frames (`List[np.ndarray]` or `torch.FloatTensor`): The input images to be used as the starting point for the image generation process.
+            control_frames (`List[np.ndarray]` or `torch.FloatTensor`): The ControlNet input images condition to provide guidance to the `unet` for generation.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            end_opt_step:
+                The feature optimization is activated from strength * num_inference_step to end_opt_step.
+            num_intraattn_steps:
+                Apply num_interattn_steps steps of spatial-guided attention.
+            step_interattn_end:
+                Apply temporal-guided attention in [step_interattn_end, 1000] steps
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            control_frames[0],
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        batch_size = len(frames)
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        prompt_embeds = prompt_embeds.repeat(batch_size, 1, 1)
+        negative_prompt_embeds = negative_prompt_embeds.repeat(batch_size, 1, 1)
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare image
+        imgs_np = []
+        for frame in frames:
+            if isinstance(frame, PIL.Image.Image):
+                imgs_np.append(np.asarray(frame))
+            else:
+                # np.ndarray
+                imgs_np.append(frame)
+        images_pt = self.image_processor.preprocess(frames).to(dtype=torch.float32)
+
+        # 5. Prepare controlnet_conditioning_image
+        if isinstance(controlnet, ControlNetModel):
+            control_image = self.prepare_control_image(
+                image=control_frames,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif isinstance(controlnet, MultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_frames:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+        else:
+            assert False
+
+        self.flow_model.to(device)
+
+        flows, occs, attn_mask, interattn_paras = get_flow_and_interframe_paras(self.flow_model, imgs_np)
+        correlation_matrix = get_intraframe_paras(self, images_pt, self.frescoProc, prompt_embeds, generator)
+
+        """
+        Flexible settings for attention:
+        * Turn off FRESCO-guided attention: frescoProc.controller.disable_controller()
+        Then you can turn on one specific attention submodule
+        * Turn on Cross-frame attention: frescoProc.controller.enable_cfattn(attn_mask)
+        * Turn on Spatial-guided attention: frescoProc.controller.enable_intraattn()
+        * Turn on Temporal-guided attention: frescoProc.controller.enable_interattn(interattn_paras)
+
+        Flexible settings for optimization:
+        * Turn off Spatial-guided optimization: set optimize_temporal = False in apply_FRESCO_opt()
+        * Turn off Temporal-guided optimization: set correlation_matrix = [] in apply_FRESCO_opt()
+        * Turn off FRESCO-guided optimization: disable_FRESCO_opt(pipe)
+
+        Flexible settings for background smoothing:
+        * Turn off background smoothing: set saliency = None in apply_FRESCO_opt()
+        """
+
+        self.frescoProc.controller.enable_controller(interattn_paras=interattn_paras, attn_mask=attn_mask)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        apply_FRESCO_opt(
+            self,
+            steps=timesteps[:end_opt_step],
+            flows=flows,
+            occs=occs,
+            correlation_matrix=correlation_matrix,
+            saliency=None,
+            optimize_temporal=True,
+        )
+
+        clear_cache()
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            images_pt,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator=generator,
+            repeat_noise=True,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 7.2 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if i >= num_intraattn_steps:
+                    self.frescoProc.controller.disable_intraattn()
+                if t < step_interattn_end:
+                    self.frescoProc.controller.disable_interattn()
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/gluegen.py b/diffusers/examples/community/gluegen.py
new file mode 100644
index 0000000000000000000000000000000000000000..91026c5d966f824302a8e454e3016007171c4c43
--- /dev/null
+++ b/diffusers/examples/community/gluegen.py
@@ -0,0 +1,816 @@
+import inspect
+from typing import Any, Dict, List, Optional, Union
+
+import torch
+import torch.nn as nn
+from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor
+
+from diffusers import DiffusionPipeline
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class TranslatorBase(nn.Module):
+    def __init__(self, num_tok, dim, dim_out, mult=2):
+        super().__init__()
+
+        self.dim_in = dim
+        self.dim_out = dim_out
+
+        self.net_tok = nn.Sequential(
+            nn.Linear(num_tok, int(num_tok * mult)),
+            nn.LayerNorm(int(num_tok * mult)),
+            nn.GELU(),
+            nn.Linear(int(num_tok * mult), int(num_tok * mult)),
+            nn.LayerNorm(int(num_tok * mult)),
+            nn.GELU(),
+            nn.Linear(int(num_tok * mult), num_tok),
+            nn.LayerNorm(num_tok),
+        )
+
+        self.net_sen = nn.Sequential(
+            nn.Linear(dim, int(dim * mult)),
+            nn.LayerNorm(int(dim * mult)),
+            nn.GELU(),
+            nn.Linear(int(dim * mult), int(dim * mult)),
+            nn.LayerNorm(int(dim * mult)),
+            nn.GELU(),
+            nn.Linear(int(dim * mult), dim_out),
+            nn.LayerNorm(dim_out),
+        )
+
+    def forward(self, x):
+        if self.dim_in == self.dim_out:
+            indentity_0 = x
+            x = self.net_sen(x)
+            x += indentity_0
+            x = x.transpose(1, 2)
+
+            indentity_1 = x
+            x = self.net_tok(x)
+            x += indentity_1
+            x = x.transpose(1, 2)
+        else:
+            x = self.net_sen(x)
+            x = x.transpose(1, 2)
+
+            x = self.net_tok(x)
+            x = x.transpose(1, 2)
+        return x
+
+
+class TranslatorBaseNoLN(nn.Module):
+    def __init__(self, num_tok, dim, dim_out, mult=2):
+        super().__init__()
+
+        self.dim_in = dim
+        self.dim_out = dim_out
+
+        self.net_tok = nn.Sequential(
+            nn.Linear(num_tok, int(num_tok * mult)),
+            nn.GELU(),
+            nn.Linear(int(num_tok * mult), int(num_tok * mult)),
+            nn.GELU(),
+            nn.Linear(int(num_tok * mult), num_tok),
+        )
+
+        self.net_sen = nn.Sequential(
+            nn.Linear(dim, int(dim * mult)),
+            nn.GELU(),
+            nn.Linear(int(dim * mult), int(dim * mult)),
+            nn.GELU(),
+            nn.Linear(int(dim * mult), dim_out),
+        )
+
+    def forward(self, x):
+        if self.dim_in == self.dim_out:
+            indentity_0 = x
+            x = self.net_sen(x)
+            x += indentity_0
+            x = x.transpose(1, 2)
+
+            indentity_1 = x
+            x = self.net_tok(x)
+            x += indentity_1
+            x = x.transpose(1, 2)
+        else:
+            x = self.net_sen(x)
+            x = x.transpose(1, 2)
+
+            x = self.net_tok(x)
+            x = x.transpose(1, 2)
+        return x
+
+
+class TranslatorNoLN(nn.Module):
+    def __init__(self, num_tok, dim, dim_out, mult=2, depth=5):
+        super().__init__()
+
+        self.blocks = nn.ModuleList([TranslatorBase(num_tok, dim, dim, mult=2) for d in range(depth)])
+        self.gelu = nn.GELU()
+
+        self.tail = TranslatorBaseNoLN(num_tok, dim, dim_out, mult=2)
+
+    def forward(self, x):
+        for block in self.blocks:
+            x = block(x) + x
+            x = self.gelu(x)
+
+        x = self.tail(x)
+        return x
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionLoraLoaderMixin):
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: AutoModel,
+        tokenizer: AutoTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        language_adapter: TranslatorNoLN = None,
+        tensor_norm: torch.Tensor = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            language_adapter=language_adapter,
+            tensor_norm=tensor_norm,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def load_language_adapter(
+        self,
+        model_path: str,
+        num_token: int,
+        dim: int,
+        dim_out: int,
+        tensor_norm: torch.Tensor,
+        mult: int = 2,
+        depth: int = 5,
+    ):
+        device = self._execution_device
+        self.tensor_norm = tensor_norm.to(device)
+        self.language_adapter = TranslatorNoLN(num_tok=num_token, dim=dim, dim_out=dim_out, mult=mult, depth=depth).to(
+            device
+        )
+        self.language_adapter.load_state_dict(torch.load(model_path))
+
+    def _adapt_language(self, prompt_embeds: torch.Tensor):
+        prompt_embeds = prompt_embeds / 3
+        prompt_embeds = self.language_adapter(prompt_embeds) * (self.tensor_norm / 2)
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            elif self.language_adapter is not None:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+            # Run prompt language adapter
+            if self.language_adapter is not None:
+                prompt_embeds = self._adapt_language(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+            # Run negative prompt language adapter
+            if self.language_adapter is not None:
+                negative_prompt_embeds = self._adapt_language(negative_prompt_embeds)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/hd_painter.py b/diffusers/examples/community/hd_painter.py
new file mode 100644
index 0000000000000000000000000000000000000000..df41be9ef7b126e5bf60be2aca13555f478dc580
--- /dev/null
+++ b/diffusers/examples/community/hd_painter.py
@@ -0,0 +1,994 @@
+import math
+import numbers
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from diffusers.image_processor import PipelineImageInput
+from diffusers.models import AsymmetricAutoencoderKL, ImageProjection
+from diffusers.models.attention_processor import Attention, AttnProcessor
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import (
+    StableDiffusionInpaintPipeline,
+    retrieve_timesteps,
+)
+from diffusers.utils import deprecate
+
+
+class RASGAttnProcessor:
+    def __init__(self, mask, token_idx, scale_factor):
+        self.attention_scores = None  # Stores the last output of the similarity matrix here. Each layer will get its own RASGAttnProcessor assigned
+        self.mask = mask
+        self.token_idx = token_idx
+        self.scale_factor = scale_factor
+        self.mask_resoltuion = mask.shape[-1] * mask.shape[-2]  # 64 x 64 if the image is 512x512
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        scale: float = 1.0,
+    ) -> torch.Tensor:
+        # Same as the default AttnProcessor up untill the part where similarity matrix gets saved
+        downscale_factor = self.mask_resoltuion // hidden_states.shape[1]
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        # Automatically recognize the resolution and save the attention similarity values
+        # We need to use the values before the softmax function, hence the rewritten get_attention_scores function.
+        if downscale_factor == self.scale_factor**2:
+            self.attention_scores = get_attention_scores(attn, query, key, attention_mask)
+            attention_probs = self.attention_scores.softmax(dim=-1)
+            attention_probs = attention_probs.to(query.dtype)
+        else:
+            attention_probs = attn.get_attention_scores(query, key, attention_mask)  # Original code
+
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class PAIntAAttnProcessor:
+    def __init__(self, transformer_block, mask, token_idx, do_classifier_free_guidance, scale_factors):
+        self.transformer_block = transformer_block  # Stores the parent transformer block.
+        self.mask = mask
+        self.scale_factors = scale_factors
+        self.do_classifier_free_guidance = do_classifier_free_guidance
+        self.token_idx = token_idx
+        self.shape = mask.shape[2:]
+        self.mask_resoltuion = mask.shape[-1] * mask.shape[-2]  # 64 x 64
+        self.default_processor = AttnProcessor()
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        scale: float = 1.0,
+    ) -> torch.Tensor:
+        # Automatically recognize the resolution of the current attention layer and resize the masks accordingly
+        downscale_factor = self.mask_resoltuion // hidden_states.shape[1]
+
+        mask = None
+        for factor in self.scale_factors:
+            if downscale_factor == factor**2:
+                shape = (self.shape[0] // factor, self.shape[1] // factor)
+                mask = F.interpolate(self.mask, shape, mode="bicubic")  # B, 1, H, W
+                break
+        if mask is None:
+            return self.default_processor(attn, hidden_states, encoder_hidden_states, attention_mask, temb, scale)
+
+        # STARTS HERE
+        residual = hidden_states
+        # Save the input hidden_states for later use
+        input_hidden_states = hidden_states
+
+        # ================================================== #
+        # =============== SELF ATTENTION 1 ================= #
+        # ================================================== #
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        # self_attention_probs = attn.get_attention_scores(query, key, attention_mask) # We can't use post-softmax attention scores in this case
+        self_attention_scores = get_attention_scores(
+            attn, query, key, attention_mask
+        )  # The custom function returns pre-softmax probabilities
+        self_attention_probs = self_attention_scores.softmax(
+            dim=-1
+        )  # Manually compute the probabilities here, the scores will be reused in the second part of PAIntA
+        self_attention_probs = self_attention_probs.to(query.dtype)
+
+        hidden_states = torch.bmm(self_attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        # x = x + self.attn1(self.norm1(x))
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:  # So many residuals everywhere
+            hidden_states = hidden_states + residual
+
+        self_attention_output_hidden_states = hidden_states / attn.rescale_output_factor
+
+        # ================================================== #
+        # ============ BasicTransformerBlock =============== #
+        # ================================================== #
+        # We use a hack by running the code from the BasicTransformerBlock that is between Self and Cross attentions here
+        # The other option would've been modifying the BasicTransformerBlock and adding this functionality here.
+        # I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack isntead.
+
+        # The SelfAttention block recieves the normalized latents from the BasicTransformerBlock,
+        # But the residual of the output is the non-normalized version.
+        # Therefore we unnormalize the input hidden state here
+        unnormalized_input_hidden_states = (
+            input_hidden_states + self.transformer_block.norm1.bias
+        ) * self.transformer_block.norm1.weight
+
+        # TODO: return if neccessary
+        # if self.use_ada_layer_norm_zero:
+        #     attn_output = gate_msa.unsqueeze(1) * attn_output
+        # elif self.use_ada_layer_norm_single:
+        #     attn_output = gate_msa * attn_output
+
+        transformer_hidden_states = self_attention_output_hidden_states + unnormalized_input_hidden_states
+        if transformer_hidden_states.ndim == 4:
+            transformer_hidden_states = transformer_hidden_states.squeeze(1)
+
+        # TODO: return if neccessary
+        # 2.5 GLIGEN Control
+        # if gligen_kwargs is not None:
+        #     transformer_hidden_states = self.fuser(transformer_hidden_states, gligen_kwargs["objs"])
+        # NOTE: we experimented with using GLIGEN and HDPainter together, the results were not that great
+
+        # 3. Cross-Attention
+        if self.transformer_block.use_ada_layer_norm:
+            # transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states, timestep)
+            raise NotImplementedError()
+        elif self.transformer_block.use_ada_layer_norm_zero or self.transformer_block.use_layer_norm:
+            transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states)
+        elif self.transformer_block.use_ada_layer_norm_single:
+            # For PixArt norm2 isn't applied here:
+            # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+            transformer_norm_hidden_states = transformer_hidden_states
+        elif self.transformer_block.use_ada_layer_norm_continuous:
+            # transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states, added_cond_kwargs["pooled_text_emb"])
+            raise NotImplementedError()
+        else:
+            raise ValueError("Incorrect norm")
+
+        if self.transformer_block.pos_embed is not None and self.transformer_block.use_ada_layer_norm_single is False:
+            transformer_norm_hidden_states = self.transformer_block.pos_embed(transformer_norm_hidden_states)
+
+        # ================================================== #
+        # ================= CROSS ATTENTION ================ #
+        # ================================================== #
+
+        # We do an initial pass of the CrossAttention up to obtaining the similarity matrix here.
+        # The similarity matrix is used to obtain scaling coefficients for the attention matrix of the self attention
+        # We reuse the previously computed self-attention matrix, and only repeat the steps after the softmax
+
+        cross_attention_input_hidden_states = (
+            transformer_norm_hidden_states  # Renaming the variable for the sake of readability
+        )
+
+        # TODO: check if classifier_free_guidance is being used before splitting here
+        if self.do_classifier_free_guidance:
+            # Our scaling coefficients depend only on the conditional part, so we split the inputs
+            (
+                _cross_attention_input_hidden_states_unconditional,
+                cross_attention_input_hidden_states_conditional,
+            ) = cross_attention_input_hidden_states.chunk(2)
+
+            # Same split for the encoder_hidden_states i.e. the tokens
+            # Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the begining.
+            _encoder_hidden_states_unconditional, encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(
+                2
+            )
+        else:
+            cross_attention_input_hidden_states_conditional = cross_attention_input_hidden_states
+            encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(2)
+
+        # Rename the variables for the sake of readability
+        # The part below is the beginning of the __call__ function of the following CrossAttention layer
+        cross_attention_hidden_states = cross_attention_input_hidden_states_conditional
+        cross_attention_encoder_hidden_states = encoder_hidden_states_conditional
+
+        attn2 = self.transformer_block.attn2
+
+        if attn2.spatial_norm is not None:
+            cross_attention_hidden_states = attn2.spatial_norm(cross_attention_hidden_states, temb)
+
+        input_ndim = cross_attention_hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = cross_attention_hidden_states.shape
+            cross_attention_hidden_states = cross_attention_hidden_states.view(
+                batch_size, channel, height * width
+            ).transpose(1, 2)
+
+        (
+            batch_size,
+            sequence_length,
+            _,
+        ) = cross_attention_hidden_states.shape  # It is definitely a cross attention, so no need for an if block
+        # TODO: change the attention_mask here
+        attention_mask = attn2.prepare_attention_mask(
+            None, sequence_length, batch_size
+        )  # I assume the attention mask is the same...
+
+        if attn2.group_norm is not None:
+            cross_attention_hidden_states = attn2.group_norm(cross_attention_hidden_states.transpose(1, 2)).transpose(
+                1, 2
+            )
+
+        query2 = attn2.to_q(cross_attention_hidden_states)
+
+        if attn2.norm_cross:
+            cross_attention_encoder_hidden_states = attn2.norm_encoder_hidden_states(
+                cross_attention_encoder_hidden_states
+            )
+
+        key2 = attn2.to_k(cross_attention_encoder_hidden_states)
+        query2 = attn2.head_to_batch_dim(query2)
+        key2 = attn2.head_to_batch_dim(key2)
+
+        cross_attention_probs = attn2.get_attention_scores(query2, key2, attention_mask)
+
+        # CrossAttention ends here, the remaining part is not used
+
+        # ================================================== #
+        # ================ SELF ATTENTION 2 ================ #
+        # ================================================== #
+        # DEJA VU!
+
+        mask = (mask > 0.5).to(self_attention_output_hidden_states.dtype)
+        m = mask.to(self_attention_output_hidden_states.device)
+        # m = rearrange(m, 'b c h w -> b (h w) c').contiguous()
+        m = m.permute(0, 2, 3, 1).reshape((m.shape[0], -1, m.shape[1])).contiguous()  # B HW 1
+        m = torch.matmul(m, m.permute(0, 2, 1)) + (1 - m)
+
+        # # Compute scaling coefficients for the similarity matrix
+        # # Select the cross attention values for the correct tokens only!
+        # cross_attention_probs = cross_attention_probs.mean(dim = 0)
+        # cross_attention_probs = cross_attention_probs[:, self.token_idx].sum(dim=1)
+
+        # cross_attention_probs = cross_attention_probs.reshape(shape)
+        # gaussian_smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2).to(self_attention_output_hidden_states.device)
+        # cross_attention_probs = gaussian_smoothing(cross_attention_probs.unsqueeze(0))[0] # optional smoothing
+        # cross_attention_probs = cross_attention_probs.reshape(-1)
+        # cross_attention_probs = ((cross_attention_probs - torch.median(cross_attention_probs.ravel())) / torch.max(cross_attention_probs.ravel())).clip(0, 1)
+
+        # c = (1 - m) * cross_attention_probs.reshape(1, 1, -1) + m # PAIntA scaling coefficients
+
+        # Compute scaling coefficients for the similarity matrix
+        # Select the cross attention values for the correct tokens only!
+
+        batch_size, dims, channels = cross_attention_probs.shape
+        batch_size = batch_size // attn.heads
+        cross_attention_probs = cross_attention_probs.reshape((batch_size, attn.heads, dims, channels))  # B, D, HW, T
+
+        cross_attention_probs = cross_attention_probs.mean(dim=1)  # B, HW, T
+        cross_attention_probs = cross_attention_probs[..., self.token_idx].sum(dim=-1)  # B, HW
+        cross_attention_probs = cross_attention_probs.reshape((batch_size,) + shape)  # , B, H, W
+
+        gaussian_smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2).to(
+            self_attention_output_hidden_states.device
+        )
+        cross_attention_probs = gaussian_smoothing(cross_attention_probs[:, None])[:, 0]  # optional smoothing B, H, W
+
+        # Median normalization
+        cross_attention_probs = cross_attention_probs.reshape(batch_size, -1)  # B, HW
+        cross_attention_probs = (
+            cross_attention_probs - cross_attention_probs.median(dim=-1, keepdim=True).values
+        ) / cross_attention_probs.max(dim=-1, keepdim=True).values
+        cross_attention_probs = cross_attention_probs.clip(0, 1)
+
+        c = (1 - m) * cross_attention_probs.reshape(batch_size, 1, -1) + m
+        c = c.repeat_interleave(attn.heads, 0)  # BD, HW
+        if self.do_classifier_free_guidance:
+            c = torch.cat([c, c])  # 2BD, HW
+
+        # Rescaling the original self-attention matrix
+        self_attention_scores_rescaled = self_attention_scores * c
+        self_attention_probs_rescaled = self_attention_scores_rescaled.softmax(dim=-1)
+
+        # Continuing the self attention normally using the new matrix
+        hidden_states = torch.bmm(self_attention_probs_rescaled, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + input_hidden_states
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
+    def get_tokenized_prompt(self, prompt):
+        out = self.tokenizer(prompt)
+        return [self.tokenizer.decode(x) for x in out["input_ids"]]
+
+    def init_attn_processors(
+        self,
+        mask,
+        token_idx,
+        use_painta=True,
+        use_rasg=True,
+        painta_scale_factors=[2, 4],  # 64x64 -> [16x16, 32x32]
+        rasg_scale_factor=4,  # 64x64 -> 16x16
+        self_attention_layer_name="attn1",
+        cross_attention_layer_name="attn2",
+        list_of_painta_layer_names=None,
+        list_of_rasg_layer_names=None,
+    ):
+        default_processor = AttnProcessor()
+        width, height = mask.shape[-2:]
+        width, height = width // self.vae_scale_factor, height // self.vae_scale_factor
+
+        painta_scale_factors = [x * self.vae_scale_factor for x in painta_scale_factors]
+        rasg_scale_factor = self.vae_scale_factor * rasg_scale_factor
+
+        attn_processors = {}
+        for x in self.unet.attn_processors:
+            if (list_of_painta_layer_names is None and self_attention_layer_name in x) or (
+                list_of_painta_layer_names is not None and x in list_of_painta_layer_names
+            ):
+                if use_painta:
+                    transformer_block = self.unet.get_submodule(x.replace(".attn1.processor", ""))
+                    attn_processors[x] = PAIntAAttnProcessor(
+                        transformer_block, mask, token_idx, self.do_classifier_free_guidance, painta_scale_factors
+                    )
+                else:
+                    attn_processors[x] = default_processor
+            elif (list_of_rasg_layer_names is None and cross_attention_layer_name in x) or (
+                list_of_rasg_layer_names is not None and x in list_of_rasg_layer_names
+            ):
+                if use_rasg:
+                    attn_processors[x] = RASGAttnProcessor(mask, token_idx, rasg_scale_factor)
+                else:
+                    attn_processors[x] = default_processor
+
+        self.unet.set_attn_processor(attn_processors)
+        # import json
+        # with open('/home/hayk.manukyan/repos/diffusers/debug.txt', 'a')  as f:
+        #     json.dump({x:str(y) for x,y in self.unet.attn_processors.items()}, f, indent=4)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        masked_image_latents: torch.Tensor = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        padding_mask_crop: Optional[int] = None,
+        strength: float = 1.0,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.5,
+        positive_prompt: Optional[str] = "",
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.01,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: int = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        use_painta=True,
+        use_rasg=True,
+        self_attention_layer_name=".attn1",
+        cross_attention_layer_name=".attn2",
+        painta_scale_factors=[2, 4],  # 16 x 16 and 32 x 32
+        rasg_scale_factor=4,  # 16x16 by default
+        list_of_painta_layer_names=None,
+        list_of_rasg_layer_names=None,
+        **kwargs,
+    ):
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        #
+        prompt_no_positives = prompt
+        if isinstance(prompt, list):
+            prompt = [x + positive_prompt for x in prompt]
+        else:
+            prompt = prompt + positive_prompt
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            image,
+            mask_image,
+            height,
+            width,
+            strength,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+            padding_mask_crop,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # assert batch_size == 1, "Does not work with batch size > 1 currently"
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
+            )
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+        # 4. set timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps=num_inference_steps, strength=strength, device=device
+        )
+        # check that number of inference steps is not < 1 - as this doesn't make sense
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+
+        # 5. Preprocess mask and image
+
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 7. Prepare mask latent variables
+        mask_condition = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+
+        if masked_image_latents is None:
+            masked_image = init_image * (mask_condition < 0.5)
+        else:
+            masked_image = masked_image_latents
+
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask_condition,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            self.do_classifier_free_guidance,
+        )
+
+        # 7.5 Setting up HD-Painter
+
+        # Get the indices of the tokens to be modified by both RASG and PAIntA
+        token_idx = list(range(1, self.get_tokenized_prompt(prompt_no_positives).index("<|endoftext|>"))) + [
+            self.get_tokenized_prompt(prompt).index("<|endoftext|>")
+        ]
+
+        # Setting up the attention processors
+        self.init_attn_processors(
+            mask_condition,
+            token_idx,
+            use_painta,
+            use_rasg,
+            painta_scale_factors=painta_scale_factors,
+            rasg_scale_factor=rasg_scale_factor,
+            self_attention_layer_name=self_attention_layer_name,
+            cross_attention_layer_name=cross_attention_layer_name,
+            list_of_painta_layer_names=list_of_painta_layer_names,
+            list_of_rasg_layer_names=list_of_rasg_layer_names,
+        )
+
+        # 8. Check that sizes of mask, masked image and latents match
+        if num_channels_unet == 9:
+            # default case for runwayml/stable-diffusion-inpainting
+            num_channels_mask = mask.shape[1]
+            num_channels_masked_image = masked_image_latents.shape[1]
+            if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+                raise ValueError(
+                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    " `pipeline.unet` or your `mask_image` or `image` input."
+                )
+        elif num_channels_unet != 4:
+            raise ValueError(
+                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+            )
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        if use_rasg:
+            extra_step_kwargs["generator"] = None
+
+        # 9.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
+        # 9.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 10. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        painta_active = True
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                if t < 500 and painta_active:
+                    self.init_attn_processors(
+                        mask_condition,
+                        token_idx,
+                        False,
+                        use_rasg,
+                        painta_scale_factors=painta_scale_factors,
+                        rasg_scale_factor=rasg_scale_factor,
+                        self_attention_layer_name=self_attention_layer_name,
+                        cross_attention_layer_name=cross_attention_layer_name,
+                        list_of_painta_layer_names=list_of_painta_layer_names,
+                        list_of_rasg_layer_names=list_of_rasg_layer_names,
+                    )
+                    painta_active = False
+
+                with torch.enable_grad():
+                    self.unet.zero_grad()
+                    latents = latents.detach()
+                    latents.requires_grad = True
+
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                    # concat latents, mask, masked_image_latents in the channel dimension
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    if num_channels_unet == 9:
+                        latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                    self.scheduler.latents = latents
+                    self.encoder_hidden_states = prompt_embeds
+                    for attn_processor in self.unet.attn_processors.values():
+                        attn_processor.encoder_hidden_states = prompt_embeds
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep_cond=timestep_cond,
+                        cross_attention_kwargs=self.cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                        return_dict=False,
+                    )[0]
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    if use_rasg:
+                        # Perform RASG
+                        _, _, height, width = mask_condition.shape  # 512 x 512
+                        scale_factor = self.vae_scale_factor * rasg_scale_factor  # 8 * 4 = 32
+
+                        # TODO: Fix for > 1 batch_size
+                        rasg_mask = F.interpolate(
+                            mask_condition, (height // scale_factor, width // scale_factor), mode="bicubic"
+                        )[0, 0]  # mode is nearest by default, B, H, W
+
+                        # Aggregate the saved attention maps
+                        attn_map = []
+                        for processor in self.unet.attn_processors.values():
+                            if hasattr(processor, "attention_scores") and processor.attention_scores is not None:
+                                if self.do_classifier_free_guidance:
+                                    attn_map.append(processor.attention_scores.chunk(2)[1])  # (B/2) x H, 256, 77
+                                else:
+                                    attn_map.append(processor.attention_scores)  # B x H, 256, 77 ?
+
+                        attn_map = (
+                            torch.cat(attn_map)
+                            .mean(0)
+                            .permute(1, 0)
+                            .reshape((-1, height // scale_factor, width // scale_factor))
+                        )  # 77, 16, 16
+
+                        # Compute the attention score
+                        attn_score = -sum(
+                            [
+                                F.binary_cross_entropy_with_logits(x - 1.0, rasg_mask.to(device))
+                                for x in attn_map[token_idx]
+                            ]
+                        )
+
+                        # Backward the score and compute the gradients
+                        attn_score.backward()
+
+                        # Normalzie the gradients and compute the noise component
+                        variance_noise = latents.grad.detach()
+                        # print("VARIANCE SHAPE", variance_noise.shape)
+                        variance_noise -= torch.mean(variance_noise, [1, 2, 3], keepdim=True)
+                        variance_noise /= torch.std(variance_noise, [1, 2, 3], keepdim=True)
+                    else:
+                        variance_noise = None
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs, return_dict=False, variance_noise=variance_noise
+                )[0]
+
+                if num_channels_unet == 4:
+                    init_latents_proper = image_latents
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    mask = callback_outputs.pop("mask", mask)
+                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            condition_kwargs = {}
+            if isinstance(self.vae, AsymmetricAutoencoderKL):
+                init_image = init_image.to(device=device, dtype=masked_image_latents.dtype)
+                init_image_condition = init_image.clone()
+                init_image = self._encode_vae_image(init_image, generator=generator)
+                mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
+                condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
+            image = self.vae.decode(
+                latents / self.vae.config.scaling_factor, return_dict=False, generator=generator, **condition_kwargs
+            )[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if padding_mask_crop is not None:
+            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+# ============= Utility Functions ============== #
+
+
+class GaussianSmoothing(nn.Module):
+    """
+    Apply gaussian smoothing on a
+    1d, 2d or 3d tensor. Filtering is performed seperately for each channel
+    in the input using a depthwise convolution.
+    Arguments:
+        channels (int, sequence): Number of channels of the input tensors. Output will
+            have this number of channels as well.
+        kernel_size (int, sequence): Size of the gaussian kernel.
+        sigma (float, sequence): Standard deviation of the gaussian kernel.
+        dim (int, optional): The number of dimensions of the data.
+            Default value is 2 (spatial).
+    """
+
+    def __init__(self, channels, kernel_size, sigma, dim=2):
+        super(GaussianSmoothing, self).__init__()
+        if isinstance(kernel_size, numbers.Number):
+            kernel_size = [kernel_size] * dim
+        if isinstance(sigma, numbers.Number):
+            sigma = [sigma] * dim
+
+        # The gaussian kernel is the product of the
+        # gaussian function of each dimension.
+        kernel = 1
+        meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
+        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
+            mean = (size - 1) / 2
+            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
+
+        # Make sure sum of values in gaussian kernel equals 1.
+        kernel = kernel / torch.sum(kernel)
+
+        # Reshape to depthwise convolutional weight
+        kernel = kernel.view(1, 1, *kernel.size())
+        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))
+
+        self.register_buffer("weight", kernel)
+        self.groups = channels
+
+        if dim == 1:
+            self.conv = F.conv1d
+        elif dim == 2:
+            self.conv = F.conv2d
+        elif dim == 3:
+            self.conv = F.conv3d
+        else:
+            raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim))
+
+    def forward(self, input):
+        """
+        Apply gaussian filter to input.
+        Arguments:
+            input (torch.Tensor): Input to apply gaussian filter on.
+        Returns:
+            filtered (torch.Tensor): Filtered output.
+        """
+        return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups, padding="same")
+
+
+def get_attention_scores(
+    self, query: torch.Tensor, key: torch.Tensor, attention_mask: torch.Tensor = None
+) -> torch.Tensor:
+    r"""
+    Compute the attention scores.
+
+    Args:
+        query (`torch.Tensor`): The query tensor.
+        key (`torch.Tensor`): The key tensor.
+        attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
+
+    Returns:
+        `torch.Tensor`: The attention probabilities/scores.
+    """
+    if self.upcast_attention:
+        query = query.float()
+        key = key.float()
+
+    if attention_mask is None:
+        baddbmm_input = torch.empty(
+            query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
+        )
+        beta = 0
+    else:
+        baddbmm_input = attention_mask
+        beta = 1
+
+    attention_scores = torch.baddbmm(
+        baddbmm_input,
+        query,
+        key.transpose(-1, -2),
+        beta=beta,
+        alpha=self.scale,
+    )
+    del baddbmm_input
+
+    if self.upcast_softmax:
+        attention_scores = attention_scores.float()
+
+    return attention_scores
diff --git a/diffusers/examples/community/iadb.py b/diffusers/examples/community/iadb.py
new file mode 100644
index 0000000000000000000000000000000000000000..81e9e8d89dc6724fd9deb6e9d9f2b757d4cfb776
--- /dev/null
+++ b/diffusers/examples/community/iadb.py
@@ -0,0 +1,149 @@
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import ConfigMixin
+from diffusers.pipelines.pipeline_utils import ImagePipelineOutput
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+
+
+class IADBScheduler(SchedulerMixin, ConfigMixin):
+    """
+    IADBScheduler is a scheduler for the Iterative α-(de)Blending denoising method. It is simple and minimalist.
+
+    For more details, see the original paper: https://arxiv.org/abs/2305.03486 and the blog post: https://ggx-research.github.io/publication/2023/05/10/publication-iadb.html
+    """
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        x_alpha: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Predict the sample at the previous timestep by reversing the ODE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`): direct output from learned diffusion model. It is the direction from x0 to x1.
+            timestep (`float`): current timestep in the diffusion chain.
+            x_alpha (`torch.Tensor`): x_alpha sample for the current timestep
+
+        Returns:
+            `torch.Tensor`: the sample at the previous timestep
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        alpha = timestep / self.num_inference_steps
+        alpha_next = (timestep + 1) / self.num_inference_steps
+
+        d = model_output
+
+        x_alpha = x_alpha + (alpha_next - alpha) * d
+
+        return x_alpha
+
+    def set_timesteps(self, num_inference_steps: int):
+        self.num_inference_steps = num_inference_steps
+
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        alpha: torch.Tensor,
+    ) -> torch.Tensor:
+        return original_samples * alpha + noise * (1 - alpha)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
+
+
+class IADBPipeline(DiffusionPipeline):
+    r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`DDPMScheduler`], or [`DDIMScheduler`].
+    """
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        num_inference_steps: int = 50,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
+            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        # Sample gaussian noise to begin loop
+        if isinstance(self.unet.config.sample_size, int):
+            image_shape = (
+                batch_size,
+                self.unet.config.in_channels,
+                self.unet.config.sample_size,
+                self.unet.config.sample_size,
+            )
+        else:
+            image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size)
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        image = torch.randn(image_shape, generator=generator, device=self.device, dtype=self.unet.dtype)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+        x_alpha = image.clone()
+        for t in self.progress_bar(range(num_inference_steps)):
+            alpha = t / num_inference_steps
+
+            # 1. predict noise model_output
+            model_output = self.unet(x_alpha, torch.tensor(alpha, device=x_alpha.device)).sample
+
+            # 2. step
+            x_alpha = self.scheduler.step(model_output, t, x_alpha)
+
+        image = (x_alpha * 0.5 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/examples/community/imagic_stable_diffusion.py b/diffusers/examples/community/imagic_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..cea55dd3837bdac5bf973354af4d317ae7c16183
--- /dev/null
+++ b/diffusers/examples/community/imagic_stable_diffusion.py
@@ -0,0 +1,470 @@
+"""
+modeled after the textual_inversion.py / train_dreambooth.py and the work
+of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
+"""
+
+import inspect
+import warnings
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from accelerate import Accelerator
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+from tqdm.auto import tqdm
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import DiffusionPipeline
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import logging
+
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for imagic image editing.
+    See paper here: https://arxiv.org/pdf/2210.09276.pdf
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offsensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def train(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image],
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        generator: Optional[torch.Generator] = None,
+        embedding_learning_rate: float = 0.001,
+        diffusion_model_learning_rate: float = 2e-6,
+        text_embedding_optimization_steps: int = 500,
+        model_fine_tuning_optimization_steps: int = 1000,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        accelerator = Accelerator(
+            gradient_accumulation_steps=1,
+            mixed_precision="fp16",
+        )
+
+        if "torch_device" in kwargs:
+            device = kwargs.pop("torch_device")
+            warnings.warn(
+                "`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0."
+                " Consider using `pipe.to(torch_device)` instead."
+            )
+
+            if device is None:
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+            self.to(device)
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # Freeze vae and unet
+        self.vae.requires_grad_(False)
+        self.unet.requires_grad_(False)
+        self.text_encoder.requires_grad_(False)
+        self.unet.eval()
+        self.vae.eval()
+        self.text_encoder.eval()
+
+        if accelerator.is_main_process:
+            accelerator.init_trackers(
+                "imagic",
+                config={
+                    "embedding_learning_rate": embedding_learning_rate,
+                    "text_embedding_optimization_steps": text_embedding_optimization_steps,
+                },
+            )
+
+        # get text embeddings for prompt
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = torch.nn.Parameter(
+            self.text_encoder(text_input.input_ids.to(self.device))[0], requires_grad=True
+        )
+        text_embeddings = text_embeddings.detach()
+        text_embeddings.requires_grad_()
+        text_embeddings_orig = text_embeddings.clone()
+
+        # Initialize the optimizer
+        optimizer = torch.optim.Adam(
+            [text_embeddings],  # only optimize the embeddings
+            lr=embedding_learning_rate,
+        )
+
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)
+
+        latents_dtype = text_embeddings.dtype
+        image = image.to(device=self.device, dtype=latents_dtype)
+        init_latent_image_dist = self.vae.encode(image).latent_dist
+        image_latents = init_latent_image_dist.sample(generator=generator)
+        image_latents = 0.18215 * image_latents
+
+        progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process)
+        progress_bar.set_description("Steps")
+
+        global_step = 0
+
+        logger.info("First optimizing the text embedding to better reconstruct the init image")
+        for _ in range(text_embedding_optimization_steps):
+            with accelerator.accumulate(text_embeddings):
+                # Sample noise that we'll add to the latents
+                noise = torch.randn(image_latents.shape).to(image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=image_latents.device)
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
+
+                # Predict the noise residual
+                noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
+
+                loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
+                accelerator.backward(loss)
+
+                optimizer.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+            logs = {"loss": loss.detach().item()}  # , "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+        accelerator.wait_for_everyone()
+
+        text_embeddings.requires_grad_(False)
+
+        # Now we fine tune the unet to better reconstruct the image
+        self.unet.requires_grad_(True)
+        self.unet.train()
+        optimizer = torch.optim.Adam(
+            self.unet.parameters(),  # only optimize unet
+            lr=diffusion_model_learning_rate,
+        )
+        progress_bar = tqdm(range(model_fine_tuning_optimization_steps), disable=not accelerator.is_local_main_process)
+
+        logger.info("Next fine tuning the entire model to better reconstruct the init image")
+        for _ in range(model_fine_tuning_optimization_steps):
+            with accelerator.accumulate(self.unet.parameters()):
+                # Sample noise that we'll add to the latents
+                noise = torch.randn(image_latents.shape).to(image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=image_latents.device)
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
+
+                # Predict the noise residual
+                noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
+
+                loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
+                accelerator.backward(loss)
+
+                optimizer.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+            logs = {"loss": loss.detach().item()}  # , "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+        accelerator.wait_for_everyone()
+        self.text_embeddings_orig = text_embeddings_orig
+        self.text_embeddings = text_embeddings
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        alpha: float = 1.2,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        guidance_scale: float = 7.5,
+        eta: float = 0.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            alpha (`float`, *optional*, defaults to 1.2):
+                The interpolation factor between the original and optimized text embeddings. A value closer to 0
+                will resemble the original input image.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if self.text_embeddings is None:
+            raise ValueError("Please run the pipe.train() before trying to generate an image.")
+        if self.text_embeddings_orig is None:
+            raise ValueError("Please run the pipe.train() before trying to generate an image.")
+
+        text_embeddings = alpha * self.text_embeddings_orig + (1 - alpha) * self.text_embeddings
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens = [""]
+            max_length = self.tokenizer.model_max_length
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.view(1, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (1, self.unet.config.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if self.device.type == "mps":
+            # randn does not exist on mps
+            latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                self.device
+            )
+        else:
+            latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/img2img_inpainting.py b/diffusers/examples/community/img2img_inpainting.py
new file mode 100644
index 0000000000000000000000000000000000000000..4dfb7a39155fc2052312fa03c13d56541add13ea
--- /dev/null
+++ b/diffusers/examples/community/img2img_inpainting.py
@@ -0,0 +1,437 @@
+import inspect
+from typing import Callable, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def prepare_mask_and_masked_image(image, mask):
+    image = np.array(image.convert("RGB"))
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    mask = np.array(mask.convert("L"))
+    mask = mask.astype(np.float32) / 255.0
+    mask = mask[None, None]
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
+    mask = torch.from_numpy(mask)
+
+    masked_image = image * (mask < 0.5)
+
+    return mask, masked_image
+
+
+def check_size(image, height, width):
+    if isinstance(image, PIL.Image.Image):
+        w, h = image.size
+    elif isinstance(image, torch.Tensor):
+        *_, h, w = image.shape
+
+    if h != height or w != width:
+        raise ValueError(f"Image size should be {height}x{width}, but got {h}x{w}")
+
+
+def overlay_inner_image(image, inner_image, paste_offset: Tuple[int] = (0, 0)):
+    inner_image = inner_image.convert("RGBA")
+    image = image.convert("RGB")
+
+    image.paste(inner_image, paste_offset, inner_image)
+    image = image.convert("RGB")
+
+    return image
+
+
+class ImageToImageInpaintingPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image-to-image inpainting using Stable Diffusion. *This is an experimental feature*.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image],
+        inner_image: Union[torch.Tensor, PIL.Image.Image],
+        mask_image: Union[torch.Tensor, PIL.Image.Image],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            inner_image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be overlayed onto `image`. Non-transparent
+                regions of `inner_image` must fit inside white pixels in `mask_image`. Expects four channels, with
+                the last channel representing the alpha channel, which will be used to blend `inner_image` with
+                `image`. If not provided, it will be forcibly cast to RGBA.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # check if input sizes are correct
+        check_size(image, height, width)
+        check_size(inner_image, height, width)
+        check_size(mask_image, height, width)
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        num_channels_latents = self.vae.config.latent_channels
+        latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # overlay the inner image
+        image = overlay_inner_image(image, inner_image)
+
+        # prepare mask and masked_image
+        mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+        mask = mask.to(device=self.device, dtype=text_embeddings.dtype)
+        masked_image = masked_image.to(device=self.device, dtype=text_embeddings.dtype)
+
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        mask = torch.nn.functional.interpolate(mask, size=(height // 8, width // 8))
+
+        # encode the mask image into latents space so we can concatenate it to the latents
+        masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
+        masked_image_latents = 0.18215 * masked_image_latents
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        mask = mask.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
+        masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        num_channels_mask = mask.shape[1]
+        num_channels_masked_image = masked_image_latents.shape[1]
+
+        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                " `pipeline.unet` or your `mask_image` or `image` input."
+            )
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            # concat latents, mask, masked_image_latents in the channel dimension
+            latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/instaflow_one_step.py b/diffusers/examples/community/instaflow_one_step.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fef02287186d37d87384f4e235eb2efa14bea21
--- /dev/null
+++ b/diffusers/examples/community/instaflow_one_step.py
@@ -0,0 +1,689 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    deprecate,
+    logging,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class InstaFlowPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Rectified Flow and Euler discretization.
+    This customized pipeline is based on StableDiffusionPipeline from the official Diffusers library (0.21.4)
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def merge_dW_to_unet(pipe, dW_dict, alpha=1.0):
+        _tmp_sd = pipe.unet.state_dict()
+        for key in dW_dict.keys():
+            _tmp_sd[key] += dW_dict[key] * alpha
+        pipe.unet.load_state_dict(_tmp_sd, strict=False)
+        return pipe
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.7):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        timesteps = [(1.0 - i / num_inference_steps) * 1000.0 for i in range(num_inference_steps)]
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        dt = 1.0 / num_inference_steps
+
+        # 7. Denoising loop of Euler discretization from t = 0 to t = 1
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                vec_t = torch.ones((latent_model_input.shape[0],), device=latents.device) * t
+
+                v_pred = self.unet(latent_model_input, vec_t, encoder_hidden_states=prompt_embeds).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    v_pred_neg, v_pred_text = v_pred.chunk(2)
+                    v_pred = v_pred_neg + guidance_scale * (v_pred_text - v_pred_neg)
+
+                latents = latents + dt * v_pred
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/interpolate_stable_diffusion.py b/diffusers/examples/community/interpolate_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..52b2707f33f7eb7eddc7e6339b30d80b97cd13e5
--- /dev/null
+++ b/diffusers/examples/community/interpolate_stable_diffusion.py
@@ -0,0 +1,498 @@
+import inspect
+import time
+from pathlib import Path
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
+    """helper function to spherically interpolate two arrays v1 v2"""
+
+    if not isinstance(v0, np.ndarray):
+        inputs_are_torch = True
+        input_device = v0.device
+        v0 = v0.cpu().numpy()
+        v1 = v1.cpu().numpy()
+
+    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
+    if np.abs(dot) > DOT_THRESHOLD:
+        v2 = (1 - t) * v0 + t * v1
+    else:
+        theta_0 = np.arccos(dot)
+        sin_theta_0 = np.sin(theta_0)
+        theta_t = theta_0 * t
+        sin_theta_t = np.sin(theta_t)
+        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
+        s1 = sin_theta_t / sin_theta_0
+        v2 = s0 * v0 + s1 * v1
+
+    if inputs_are_torch:
+        v2 = torch.from_numpy(v2).to(input_device)
+
+    return v2
+
+
+class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        text_embeddings: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*, defaults to `None`):
+                The prompt or prompts to guide the image generation. If not provided, `text_embeddings` is required.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            text_embeddings (`torch.Tensor`, *optional*, defaults to `None`):
+                Pre-generated text embeddings to be used as inputs for image generation. Can be used in place of
+                `prompt` to avoid re-computing the embeddings. If not provided, the embeddings will be generated from
+                the supplied `prompt`.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if text_embeddings is None:
+            if isinstance(prompt, str):
+                batch_size = 1
+            elif isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+
+            if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+                removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+                print(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+            text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+        else:
+            batch_size = text_embeddings.shape[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = self.tokenizer.model_max_length
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def embed_text(self, text):
+        """takes in text and turns it into text embeddings"""
+        text_input = self.tokenizer(
+            text,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        with torch.no_grad():
+            embed = self.text_encoder(text_input.input_ids.to(self.device))[0]
+        return embed
+
+    def get_noise(self, seed, dtype=torch.float32, height=512, width=512):
+        """Takes in random seed and returns corresponding noise vector"""
+        return torch.randn(
+            (1, self.unet.config.in_channels, height // 8, width // 8),
+            generator=torch.Generator(device=self.device).manual_seed(seed),
+            device=self.device,
+            dtype=dtype,
+        )
+
+    def walk(
+        self,
+        prompts: List[str],
+        seeds: List[int],
+        num_interpolation_steps: Optional[int] = 6,
+        output_dir: Optional[str] = "./dreams",
+        name: Optional[str] = None,
+        batch_size: Optional[int] = 1,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        guidance_scale: Optional[float] = 7.5,
+        num_inference_steps: Optional[int] = 50,
+        eta: Optional[float] = 0.0,
+    ) -> List[str]:
+        """
+        Walks through a series of prompts and seeds, interpolating between them and saving the results to disk.
+
+        Args:
+            prompts (`List[str]`):
+                List of prompts to generate images for.
+            seeds (`List[int]`):
+                List of seeds corresponding to provided prompts. Must be the same length as prompts.
+            num_interpolation_steps (`int`, *optional*, defaults to 6):
+                Number of interpolation steps to take between prompts.
+            output_dir (`str`, *optional*, defaults to `./dreams`):
+                Directory to save the generated images to.
+            name (`str`, *optional*, defaults to `None`):
+                Subdirectory of `output_dir` to save the generated images to. If `None`, the name will
+                be the current time.
+            batch_size (`int`, *optional*, defaults to 1):
+                Number of images to generate at once.
+            height (`int`, *optional*, defaults to 512):
+                Height of the generated images.
+            width (`int`, *optional*, defaults to 512):
+                Width of the generated images.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+
+        Returns:
+            `List[str]`: List of paths to the generated images.
+        """
+        if not len(prompts) == len(seeds):
+            raise ValueError(
+                f"Number of prompts and seeds must be equalGot {len(prompts)} prompts and {len(seeds)} seeds"
+            )
+
+        name = name or time.strftime("%Y%m%d-%H%M%S")
+        save_path = Path(output_dir) / name
+        save_path.mkdir(exist_ok=True, parents=True)
+
+        frame_idx = 0
+        frame_filepaths = []
+        for prompt_a, prompt_b, seed_a, seed_b in zip(prompts, prompts[1:], seeds, seeds[1:]):
+            # Embed Text
+            embed_a = self.embed_text(prompt_a)
+            embed_b = self.embed_text(prompt_b)
+
+            # Get Noise
+            noise_dtype = embed_a.dtype
+            noise_a = self.get_noise(seed_a, noise_dtype, height, width)
+            noise_b = self.get_noise(seed_b, noise_dtype, height, width)
+
+            noise_batch, embeds_batch = None, None
+            T = np.linspace(0.0, 1.0, num_interpolation_steps)
+            for i, t in enumerate(T):
+                noise = slerp(float(t), noise_a, noise_b)
+                embed = torch.lerp(embed_a, embed_b, t)
+
+                noise_batch = noise if noise_batch is None else torch.cat([noise_batch, noise], dim=0)
+                embeds_batch = embed if embeds_batch is None else torch.cat([embeds_batch, embed], dim=0)
+
+                batch_is_ready = embeds_batch.shape[0] == batch_size or i + 1 == T.shape[0]
+                if batch_is_ready:
+                    outputs = self(
+                        latents=noise_batch,
+                        text_embeddings=embeds_batch,
+                        height=height,
+                        width=width,
+                        guidance_scale=guidance_scale,
+                        eta=eta,
+                        num_inference_steps=num_inference_steps,
+                    )
+                    noise_batch, embeds_batch = None, None
+
+                    for image in outputs["images"]:
+                        frame_filepath = str(save_path / f"frame_{frame_idx:06d}.png")
+                        image.save(frame_filepath)
+                        frame_filepaths.append(frame_filepath)
+                        frame_idx += 1
+        return frame_filepaths
diff --git a/diffusers/examples/community/ip_adapter_face_id.py b/diffusers/examples/community/ip_adapter_face_id.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7dc775eeee363ac7fcf10f04a5e4efb1ff2bf41
--- /dev/null
+++ b/diffusers/examples/community/ip_adapter_face_id.py
@@ -0,0 +1,1128 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+from safetensors import safe_open
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.attention_processor import (
+    AttnProcessor,
+    AttnProcessor2_0,
+    IPAdapterAttnProcessor,
+    IPAdapterAttnProcessor2_0,
+)
+from diffusers.models.embeddings import MultiIPAdapterImageProjection
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    _get_model_file,
+    deprecate,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class IPAdapterFullImageProjection(nn.Module):
+    def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_tokens=1):
+        super().__init__()
+        from diffusers.models.attention import FeedForward
+
+        self.num_tokens = num_tokens
+        self.cross_attention_dim = cross_attention_dim
+        self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu")
+        self.norm = nn.LayerNorm(cross_attention_dim)
+
+    def forward(self, image_embeds: torch.Tensor):
+        x = self.ff(image_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        return self.norm(x)
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class IPAdapterFaceIDStableDiffusionPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def load_ip_adapter_face_id(self, pretrained_model_name_or_path_or_dict, weight_name, **kwargs):
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+        model_file = _get_model_file(
+            pretrained_model_name_or_path_or_dict,
+            weights_name=weight_name,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+        )
+        if weight_name.endswith(".safetensors"):
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(model_file, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(model_file, map_location="cpu")
+        self._load_ip_adapter_weights(state_dict)
+
+    def convert_ip_adapter_image_proj_to_diffusers(self, state_dict):
+        updated_state_dict = {}
+        clip_embeddings_dim_in = state_dict["proj.0.weight"].shape[1]
+        clip_embeddings_dim_out = state_dict["proj.0.weight"].shape[0]
+        multiplier = clip_embeddings_dim_out // clip_embeddings_dim_in
+        norm_layer = "norm.weight"
+        cross_attention_dim = state_dict[norm_layer].shape[0]
+        num_tokens = state_dict["proj.2.weight"].shape[0] // cross_attention_dim
+
+        image_projection = IPAdapterFullImageProjection(
+            cross_attention_dim=cross_attention_dim,
+            image_embed_dim=clip_embeddings_dim_in,
+            mult=multiplier,
+            num_tokens=num_tokens,
+        )
+
+        for key, value in state_dict.items():
+            diffusers_name = key.replace("proj.0", "ff.net.0.proj")
+            diffusers_name = diffusers_name.replace("proj.2", "ff.net.2")
+            updated_state_dict[diffusers_name] = value
+
+        image_projection.load_state_dict(updated_state_dict)
+        return image_projection
+
+    def _load_ip_adapter_weights(self, state_dict):
+        num_image_text_embeds = 4
+
+        self.unet.encoder_hid_proj = None
+
+        # set ip-adapter cross-attention processors & load state_dict
+        attn_procs = {}
+        lora_dict = {}
+        key_id = 0
+        for name in self.unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = self.unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = self.unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None or "motion_modules" in name:
+                attn_processor_class = (
+                    AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
+                )
+                attn_procs[name] = attn_processor_class()
+
+                lora_dict.update(
+                    {f"unet.{name}.to_k_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.down.weight"]}
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_q_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"]}
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_v_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.down.weight"]}
+                )
+                lora_dict.update(
+                    {
+                        f"unet.{name}.to_out_lora.down.weight": state_dict["ip_adapter"][
+                            f"{key_id}.to_out_lora.down.weight"
+                        ]
+                    }
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_k_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.up.weight"]}
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_q_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.up.weight"]}
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_v_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.up.weight"]}
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_out_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_out_lora.up.weight"]}
+                )
+                key_id += 1
+            else:
+                attn_processor_class = (
+                    IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
+                )
+                attn_procs[name] = attn_processor_class(
+                    hidden_size=hidden_size,
+                    cross_attention_dim=cross_attention_dim,
+                    scale=1.0,
+                    num_tokens=num_image_text_embeds,
+                ).to(dtype=self.dtype, device=self.device)
+
+                lora_dict.update(
+                    {f"unet.{name}.to_k_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.down.weight"]}
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_q_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"]}
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_v_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.down.weight"]}
+                )
+                lora_dict.update(
+                    {
+                        f"unet.{name}.to_out_lora.down.weight": state_dict["ip_adapter"][
+                            f"{key_id}.to_out_lora.down.weight"
+                        ]
+                    }
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_k_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.up.weight"]}
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_q_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.up.weight"]}
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_v_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.up.weight"]}
+                )
+                lora_dict.update(
+                    {f"unet.{name}.to_out_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_out_lora.up.weight"]}
+                )
+
+                value_dict = {}
+                value_dict.update({"to_k_ip.0.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]})
+                value_dict.update({"to_v_ip.0.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]})
+                attn_procs[name].load_state_dict(value_dict)
+                key_id += 1
+
+        self.unet.set_attn_processor(attn_procs)
+
+        self.load_lora_weights(lora_dict, adapter_name="faceid")
+        self.set_adapters(["faceid"], adapter_weights=[1.0])
+
+        # convert IP-Adapter Image Projection layers to diffusers
+        image_projection = self.convert_ip_adapter_image_proj_to_diffusers(state_dict["image_proj"])
+        image_projection_layers = [image_projection.to(device=self.device, dtype=self.dtype)]
+
+        self.unet.encoder_hid_proj = MultiIPAdapterImageProjection(image_projection_layers)
+        self.unet.config.encoder_hid_dim_type = "ip_image_proj"
+
+    def set_ip_adapter_scale(self, scale):
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        for attn_processor in unet.attn_processors.values():
+            if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
+                attn_processor.scale = [scale]
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if image_embeds is not None:
+            image_embeds = torch.stack([image_embeds] * num_images_per_prompt, dim=0).to(
+                device=device, dtype=prompt_embeds.dtype
+            )
+            negative_image_embeds = torch.zeros_like(image_embeds)
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+        image_embeds = [image_embeds]
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if image_embeds is not None else {}
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/kohya_hires_fix.py b/diffusers/examples/community/kohya_hires_fix.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e36f32b19a3c421c7e75e3d67d002661563e89f
--- /dev/null
+++ b/diffusers/examples/community/kohya_hires_fix.py
@@ -0,0 +1,468 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.configuration_utils import register_to_config
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models.autoencoders import AutoencoderKL
+from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel, UNet2DConditionOutput
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class UNet2DConditionModelHighResFix(UNet2DConditionModel):
+    r"""
+    A conditional 2D UNet model that applies Kohya fix proposed for high resolution image generation.
+
+    This model inherits from [`UNet2DConditionModel`]. Check the superclass documentation for learning about all the parameters.
+
+    Parameters:
+        high_res_fix (`List[Dict]`, *optional*, defaults to `[{'timestep': 600, 'scale_factor': 0.5, 'block_num': 1}]`):
+            Enables Kohya fix for high resolution generation. The activation maps are scaled based on the scale_factor up to the timestep at specified block_num.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(self, high_res_fix: List[Dict] = [{"timestep": 600, "scale_factor": 0.5, "block_num": 1}], **kwargs):
+        super().__init__(**kwargs)
+        if high_res_fix:
+            self.config.high_res_fix = sorted(high_res_fix, key=lambda x: x["timestep"], reverse=True)
+
+    @classmethod
+    def _resize(cls, sample, target=None, scale_factor=1, mode="bicubic"):
+        dtype = sample.dtype
+        if dtype == torch.bfloat16:
+            sample = sample.to(torch.float32)
+
+        if target is not None:
+            if sample.shape[-2:] != target.shape[-2:]:
+                sample = nn.functional.interpolate(sample, size=target.shape[-2:], mode=mode, align_corners=False)
+        elif scale_factor != 1:
+            sample = nn.functional.interpolate(sample, scale_factor=scale_factor, mode=mode, align_corners=False)
+
+        return sample.to(dtype)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        The [`UNet2DConditionModel`] forward method.
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
+            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.FloatTensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
+            encoder_attention_mask (`torch.Tensor`):
+                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        for dim in sample.shape[-2:]:
+            if dim % default_overall_up_factor != 0:
+                # Forward upsample size to force interpolation output size.
+                forward_upsample_size = True
+                break
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
+        # 1. time
+        t_emb = self.get_time_embed(sample=sample, timestep=timestep)
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
+        if class_emb is not None:
+            if self.config.class_embeddings_concat:
+                emb = torch.cat([emb, class_emb], dim=-1)
+            else:
+                emb = emb + class_emb
+
+        aug_emb = self.get_aug_embed(
+            emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+        )
+        if self.config.addition_embed_type == "image_hint":
+            aug_emb, hint = aug_emb
+            sample = torch.cat([sample, hint], dim=1)
+
+        emb = emb + aug_emb if aug_emb is not None else emb
+
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+
+        encoder_hidden_states = self.process_encoder_hidden_states(
+            encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+        )
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 2.5 GLIGEN position net
+        if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+            cross_attention_kwargs = cross_attention_kwargs.copy()
+            gligen_args = cross_attention_kwargs.pop("gligen")
+            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+
+        # 3. down
+        # we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
+        # to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
+        if cross_attention_kwargs is not None:
+            cross_attention_kwargs = cross_attention_kwargs.copy()
+            lora_scale = cross_attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+
+        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+        # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+        is_adapter = down_intrablock_additional_residuals is not None
+        # maintain backward compatibility for legacy usage, where
+        #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+        #       but can only use one or the other
+        if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+            deprecate(
+                "T2I should not use down_block_additional_residuals",
+                "1.3.0",
+                "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                       and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                       for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                standard_warn=False,
+            )
+            down_intrablock_additional_residuals = down_block_additional_residuals
+            is_adapter = True
+
+        down_block_res_samples = (sample,)
+        for down_i, downsample_block in enumerate(self.down_blocks):
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                # For t2i-adapter CrossAttnDownBlock2D
+                additional_residuals = {}
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    **additional_residuals,
+                )
+
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    sample += down_intrablock_additional_residuals.pop(0)
+
+            down_block_res_samples += res_samples
+
+            # kohya high res fix
+            if self.config.high_res_fix:
+                for high_res_fix in self.config.high_res_fix:
+                    if timestep > high_res_fix["timestep"] and down_i == high_res_fix["block_num"]:
+                        sample = self.__class__._resize(sample, scale_factor=high_res_fix["scale_factor"])
+                        break
+
+        if is_controlnet:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                sample = self.mid_block(sample, emb)
+
+            # To support T2I-Adapter-XL
+            if (
+                is_adapter
+                and len(down_intrablock_additional_residuals) > 0
+                and sample.shape == down_intrablock_additional_residuals[0].shape
+            ):
+                sample += down_intrablock_additional_residuals.pop(0)
+
+        if is_controlnet:
+            sample = sample + mid_block_additional_residual
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # up scaling of kohya high res fix
+            if self.config.high_res_fix is not None:
+                if res_samples[0].shape[-2:] != sample.shape[-2:]:
+                    sample = self.__class__._resize(sample, target=res_samples[0])
+                    res_samples_up_sampled = (res_samples[0],)
+                    for res_sample in res_samples[1:]:
+                        res_samples_up_sampled += (self.__class__._resize(res_sample, target=res_samples[0]),)
+                    res_samples = res_samples_up_sampled
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                )
+
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet2DConditionOutput(sample=sample)
+
+    @classmethod
+    def from_unet(cls, unet: UNet2DConditionModel, high_res_fix: list):
+        config = dict((unet.config))
+        config["high_res_fix"] = high_res_fix
+        unet_high_res = cls(**config)
+        unet_high_res.load_state_dict(unet.state_dict())
+        unet_high_res.to(unet.dtype)
+        return unet_high_res
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
+
+        >>> pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4",
+                                         custom_pipeline="kohya_hires_fix",
+                                         torch_dtype=torch.float16,
+                                         high_res_fix=[{'timestep': 600,
+                                                        'scale_factor': 0.5,
+                                                        'block_num': 1}])
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt, height=1000, width=1600).images[0]
+        ```
+"""
+
+
+class StableDiffusionHighResFixPipeline(StableDiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion with Kohya fix for high resolution generation.
+
+    This model inherits from [`StableDiffusionPipeline`]. Check the superclass documentation for the generic methods.
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+        high_res_fix (`List[Dict]`, *optional*, defaults to `[{'timestep': 600, 'scale_factor': 0.5, 'block_num': 1}]`):
+            Enables Kohya fix for high resolution generation. The activation maps are scaled based on the scale_factor up to the timestep at specified block_num.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+        high_res_fix: List[Dict] = [{"timestep": 600, "scale_factor": 0.5, "block_num": 1}],
+    ):
+        super().__init__(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            requires_safety_checker=requires_safety_checker,
+        )
+
+        unet = UNet2DConditionModelHighResFix.from_unet(unet=unet, high_res_fix=high_res_fix)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
diff --git a/diffusers/examples/community/latent_consistency_img2img.py b/diffusers/examples/community/latent_consistency_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fe53ab6b8301ed2c7eb3a5cc503f97fc3141f51
--- /dev/null
+++ b/diffusers/examples/community/latent_consistency_img2img.py
@@ -0,0 +1,821 @@
+# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, ConfigMixin, DiffusionPipeline, SchedulerMixin, UNet2DConditionModel, logging
+from diffusers.configuration_utils import register_to_config
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline):
+    _optional_components = ["scheduler"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: "LCMSchedulerWithTimestamp",
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        scheduler = (
+            scheduler
+            if scheduler is not None
+            else LCMSchedulerWithTimestamp(
+                beta_start=0.00085, beta_end=0.0120, beta_schedule="scaled_linear", prediction_type="epsilon"
+            )
+        )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        prompt_embeds: None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+        """
+
+        if prompt is not None and isinstance(prompt, str):
+            pass
+        elif prompt is not None and isinstance(prompt, list):
+            len(prompt)
+        else:
+            prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # Don't need to get uncond prompt embedding because of LCM Guided Distillation
+        return prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def prepare_latents(
+        self,
+        image,
+        timestep,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        latents=None,
+        generator=None,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        # batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = self.vae.encode(image).latent_dist.sample(generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            # deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+        Args:
+        timesteps: torch.Tensor: generate embedding vectors at these timesteps
+        embedding_dim: int: dimension of the embeddings to generate
+        dtype: data type of the generated embeddings
+        Returns:
+        embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.8,
+        height: Optional[int] = 768,
+        width: Optional[int] = 768,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        latents: Optional[torch.Tensor] = None,
+        num_inference_steps: int = 4,
+        lcm_origin_steps: int = 50,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # do_classifier_free_guidance = guidance_scale > 0.0  # In LCM Implementation:  cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            prompt_embeds=prompt_embeds,
+        )
+
+        # 3.5 encode image
+        image = self.image_processor.preprocess(image)
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(strength, num_inference_steps, lcm_origin_steps)
+        # timesteps = self.scheduler.timesteps
+        # timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, 1.0, device)
+        timesteps = self.scheduler.timesteps
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        print("timesteps: ", timesteps)
+
+        # 5. Prepare latent variable
+        num_channels_latents = self.unet.config.in_channels
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                latents,
+            )
+        bs = batch_size * num_images_per_prompt
+
+        # 6. Get Guidance Scale Embedding
+        w = torch.tensor(guidance_scale).repeat(bs)
+        w_embedding = self.get_w_embedding(w, embedding_dim=256).to(device=device, dtype=latents.dtype)
+
+        # 7. LCM MultiStep Sampling Loop:
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                ts = torch.full((bs,), t, device=device, dtype=torch.long)
+                latents = latents.to(prompt_embeds.dtype)
+
+                # model prediction (v-prediction, eps, x)
+                model_pred = self.unet(
+                    latents,
+                    ts,
+                    timestep_cond=w_embedding,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents, denoised = self.scheduler.step(model_pred, i, t, latents, return_dict=False)
+
+                # # call the callback, if provided
+                # if i == len(timesteps) - 1:
+                progress_bar.update()
+
+        denoised = denoised.to(prompt_embeds.dtype)
+        if not output_type == "latent":
+            image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = denoised
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
+class LCMSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    denoised: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
+    """
+    This class modifies LCMScheduler to add a timestamp argument to set_timesteps
+
+
+    `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
+    non-Markovian guidance.
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        clip_sample: bool = True,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def _get_variance(self, timestep, prev_timestep):
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, height, width = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * height * width)
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, height, width)
+        sample = sample.to(dtype)
+
+        return sample
+
+    def set_timesteps(
+        self, stength, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+        """
+
+        if num_inference_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        self.num_inference_steps = num_inference_steps
+
+        # LCM Timesteps Setting:  # Linear Spacing
+        c = self.config.num_train_timesteps // lcm_origin_steps
+        lcm_origin_timesteps = (
+            np.asarray(list(range(1, int(lcm_origin_steps * stength) + 1))) * c - 1
+        )  # LCM Training  Steps Schedule
+        skipping_step = len(lcm_origin_timesteps) // num_inference_steps
+        timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]  # LCM Inference Steps Schedule
+
+        self.timesteps = torch.from_numpy(timesteps.copy()).to(device)
+
+    def get_scalings_for_boundary_condition_discrete(self, t):
+        self.sigma_data = 0.5  # Default: 0.5
+
+        # By dividing 0.1: This is almost a delta function at t=0.
+        c_skip = self.sigma_data**2 / ((t / 0.1) ** 2 + self.sigma_data**2)
+        c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5
+        return c_skip, c_out
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timeindex: int,
+        timestep: int,
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[LCMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                The weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`, defaults to `False`):
+                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
+                because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
+                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
+                `use_clipped_model_output` has no effect.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
+        Returns:
+            [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # 1. get previous step value
+        prev_timeindex = timeindex + 1
+        if prev_timeindex < len(self.timesteps):
+            prev_timestep = self.timesteps[prev_timeindex]
+        else:
+            prev_timestep = timestep
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        # 3. Get scalings for boundary conditions
+        c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
+
+        # 4. Different Parameterization:
+        parameterization = self.config.prediction_type
+
+        if parameterization == "epsilon":  # noise-prediction
+            pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
+
+        elif parameterization == "sample":  # x-prediction
+            pred_x0 = model_output
+
+        elif parameterization == "v_prediction":  # v-prediction
+            pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
+
+        # 4. Denoise model output using boundary conditions
+        denoised = c_out * pred_x0 + c_skip * sample
+
+        # 5. Sample z ~ N(0, I), For MultiStep Inference
+        # Noise is not used for one-step sampling.
+        if len(self.timesteps) > 1:
+            noise = torch.randn(model_output.shape).to(model_output.device)
+            prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
+        else:
+            prev_sample = denoised
+
+        if not return_dict:
+            return (prev_sample, denoised)
+
+        return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/examples/community/latent_consistency_interpolate.py b/diffusers/examples/community/latent_consistency_interpolate.py
new file mode 100644
index 0000000000000000000000000000000000000000..84adc125b1911981d6057ecade44578cd17fb817
--- /dev/null
+++ b/diffusers/examples/community/latent_consistency_interpolate.py
@@ -0,0 +1,999 @@
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+from diffusers.schedulers import LCMScheduler
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> import numpy as np
+
+        >>> from diffusers import DiffusionPipeline
+
+        >>> pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", custom_pipeline="latent_consistency_interpolate")
+        >>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality.
+        >>> pipe.to(torch_device="cuda", torch_dtype=torch.float32)
+
+        >>> prompts = ["A cat", "A dog", "A horse"]
+        >>> num_inference_steps = 4
+        >>> num_interpolation_steps = 24
+        >>> seed = 1337
+
+        >>> torch.manual_seed(seed)
+        >>> np.random.seed(seed)
+
+        >>> images = pipe(
+                prompt=prompts,
+                height=512,
+                width=512,
+                num_inference_steps=num_inference_steps,
+                num_interpolation_steps=num_interpolation_steps,
+                guidance_scale=8.0,
+                embedding_interpolation_type="lerp",
+                latent_interpolation_type="slerp",
+                process_batch_size=4, # Make it higher or lower based on your GPU memory
+                generator=torch.Generator(seed),
+            )
+
+        >>> # Save the images as a video
+        >>> import imageio
+        >>> from PIL import Image
+
+        >>> def pil_to_video(images: List[Image.Image], filename: str, fps: int = 60) -> None:
+                frames = [np.array(image) for image in images]
+                with imageio.get_writer(filename, fps=fps) as video_writer:
+                    for frame in frames:
+                        video_writer.append_data(frame)
+
+        >>> pil_to_video(images, "lcm_interpolate.mp4", fps=24)
+        ```
+"""
+
+
+def lerp(
+    v0: Union[torch.Tensor, np.ndarray],
+    v1: Union[torch.Tensor, np.ndarray],
+    t: Union[float, torch.Tensor, np.ndarray],
+) -> Union[torch.Tensor, np.ndarray]:
+    """
+    Linearly interpolate between two vectors/tensors.
+
+    Args:
+        v0 (`torch.Tensor` or `np.ndarray`): First vector/tensor.
+        v1 (`torch.Tensor` or `np.ndarray`): Second vector/tensor.
+        t: (`float`, `torch.Tensor`, or `np.ndarray`):
+            Interpolation factor. If float, must be between 0 and 1. If np.ndarray or
+            torch.Tensor, must be one dimensional with values between 0 and 1.
+
+    Returns:
+        Union[torch.Tensor, np.ndarray]
+            Interpolated vector/tensor between v0 and v1.
+    """
+    inputs_are_torch = False
+    t_is_float = False
+
+    if isinstance(v0, torch.Tensor):
+        inputs_are_torch = True
+        input_device = v0.device
+        v0 = v0.cpu().numpy()
+        v1 = v1.cpu().numpy()
+
+    if isinstance(t, torch.Tensor):
+        inputs_are_torch = True
+        input_device = t.device
+        t = t.cpu().numpy()
+    elif isinstance(t, float):
+        t_is_float = True
+        t = np.array([t])
+
+    t = t[..., None]
+    v0 = v0[None, ...]
+    v1 = v1[None, ...]
+    v2 = (1 - t) * v0 + t * v1
+
+    if t_is_float and v0.ndim > 1:
+        assert v2.shape[0] == 1
+        v2 = np.squeeze(v2, axis=0)
+    if inputs_are_torch:
+        v2 = torch.from_numpy(v2).to(input_device)
+
+    return v2
+
+
+def slerp(
+    v0: Union[torch.Tensor, np.ndarray],
+    v1: Union[torch.Tensor, np.ndarray],
+    t: Union[float, torch.Tensor, np.ndarray],
+    DOT_THRESHOLD=0.9995,
+) -> Union[torch.Tensor, np.ndarray]:
+    """
+    Spherical linear interpolation between two vectors/tensors.
+
+    Args:
+        v0 (`torch.Tensor` or `np.ndarray`): First vector/tensor.
+        v1 (`torch.Tensor` or `np.ndarray`): Second vector/tensor.
+        t: (`float`, `torch.Tensor`, or `np.ndarray`):
+            Interpolation factor. If float, must be between 0 and 1. If np.ndarray or
+            torch.Tensor, must be one dimensional with values between 0 and 1.
+        DOT_THRESHOLD (`float`, *optional*, default=0.9995):
+            Threshold for when to use linear interpolation instead of spherical interpolation.
+
+    Returns:
+        `torch.Tensor` or `np.ndarray`:
+            Interpolated vector/tensor between v0 and v1.
+    """
+    inputs_are_torch = False
+    t_is_float = False
+
+    if isinstance(v0, torch.Tensor):
+        inputs_are_torch = True
+        input_device = v0.device
+        v0 = v0.cpu().numpy()
+        v1 = v1.cpu().numpy()
+
+    if isinstance(t, torch.Tensor):
+        inputs_are_torch = True
+        input_device = t.device
+        t = t.cpu().numpy()
+    elif isinstance(t, float):
+        t_is_float = True
+        t = np.array([t], dtype=v0.dtype)
+
+    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
+    if np.abs(dot) > DOT_THRESHOLD:
+        # v1 and v2 are close to parallel
+        # Use linear interpolation instead
+        v2 = lerp(v0, v1, t)
+    else:
+        theta_0 = np.arccos(dot)
+        sin_theta_0 = np.sin(theta_0)
+        theta_t = theta_0 * t
+        sin_theta_t = np.sin(theta_t)
+        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
+        s1 = sin_theta_t / sin_theta_0
+        s0 = s0[..., None]
+        s1 = s1[..., None]
+        v0 = v0[None, ...]
+        v1 = v1[None, ...]
+        v2 = s0 * v0 + s1 * v1
+
+    if t_is_float and v0.ndim > 1:
+        assert v2.shape[0] == 1
+        v2 = np.squeeze(v2, axis=0)
+    if inputs_are_torch:
+        v2 = torch.from_numpy(v2).to(input_device)
+
+    return v2
+
+
+class LatentConsistencyModelWalkPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using a latent consistency model.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Currently only
+            supports [`LCMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+        requires_safety_checker (`bool`, *optional*, defaults to `True`):
+            Whether the pipeline requires a safety checker component.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: LCMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Currently StableDiffusionPipeline.check_inputs with negative prompt stuff removed
+    def check_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        height: int,
+        width: int,
+        callback_steps: int,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+    @torch.no_grad()
+    def interpolate_embedding(
+        self,
+        start_embedding: torch.Tensor,
+        end_embedding: torch.Tensor,
+        num_interpolation_steps: Union[int, List[int]],
+        interpolation_type: str,
+    ) -> torch.Tensor:
+        if interpolation_type == "lerp":
+            interpolation_fn = lerp
+        elif interpolation_type == "slerp":
+            interpolation_fn = slerp
+        else:
+            raise ValueError(
+                f"embedding_interpolation_type must be one of ['lerp', 'slerp'], got {interpolation_type}."
+            )
+
+        embedding = torch.cat([start_embedding, end_embedding])
+        steps = torch.linspace(0, 1, num_interpolation_steps, dtype=embedding.dtype).cpu().numpy()
+        steps = np.expand_dims(steps, axis=tuple(range(1, embedding.ndim)))
+        interpolations = []
+
+        # Interpolate between text embeddings
+        # TODO(aryan): Think of a better way of doing this
+        # See if it can be done parallelly instead
+        for i in range(embedding.shape[0] - 1):
+            interpolations.append(interpolation_fn(embedding[i], embedding[i + 1], steps).squeeze(dim=1))
+
+        interpolations = torch.cat(interpolations)
+        return interpolations
+
+    @torch.no_grad()
+    def interpolate_latent(
+        self,
+        start_latent: torch.Tensor,
+        end_latent: torch.Tensor,
+        num_interpolation_steps: Union[int, List[int]],
+        interpolation_type: str,
+    ) -> torch.Tensor:
+        if interpolation_type == "lerp":
+            interpolation_fn = lerp
+        elif interpolation_type == "slerp":
+            interpolation_fn = slerp
+
+        latent = torch.cat([start_latent, end_latent])
+        steps = torch.linspace(0, 1, num_interpolation_steps, dtype=latent.dtype).cpu().numpy()
+        steps = np.expand_dims(steps, axis=tuple(range(1, latent.ndim)))
+        interpolations = []
+
+        # Interpolate between latents
+        # TODO: Think of a better way of doing this
+        # See if it can be done parallelly instead
+        for i in range(latent.shape[0] - 1):
+            interpolations.append(interpolation_fn(latent[i], latent[i + 1], steps).squeeze(dim=1))
+
+        return torch.cat(interpolations)
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 4,
+        num_interpolation_steps: int = 8,
+        original_inference_steps: int = None,
+        guidance_scale: float = 8.5,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        embedding_interpolation_type: str = "lerp",
+        latent_interpolation_type: str = "slerp",
+        process_batch_size: int = 4,
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            original_inference_steps (`int`, *optional*):
+                The original number of inference steps use to generate a linearly-spaced timestep schedule, from which
+                we will draw `num_inference_steps` evenly spaced timesteps from as our final timestep schedule,
+                following the Skipping-Step method in the paper (see Section 4.3). If not set this will default to the
+                scheduler's `original_inference_steps` attribute.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+                Note that the original latent consistency models paper uses a different CFG formulation where the
+                guidance scales are decreased by 1 (so in the paper formulation CFG is enabled when `guidance_scale >
+                0`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            embedding_interpolation_type (`str`, *optional*, defaults to `"lerp"`):
+                The type of interpolation to use for interpolating between text embeddings. Choose between `"lerp"` and `"slerp"`.
+            latent_interpolation_type (`str`, *optional*, defaults to `"slerp"`):
+                The type of interpolation to use for interpolating between latents. Choose between `"lerp"` and `"slerp"`.
+            process_batch_size (`int`, *optional*, defaults to 4):
+                The batch size to use for processing the images. This is useful when generating a large number of images
+                and you want to avoid running out of memory.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps, prompt_embeds, callback_on_step_end_tensor_inputs)
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        if batch_size < 2:
+            raise ValueError(f"`prompt` must have length of at least 2 but found {batch_size}")
+        if num_images_per_prompt != 1:
+            raise ValueError("`num_images_per_prompt` must be `1` as no other value is supported yet")
+        if prompt_embeds is not None:
+            raise ValueError("`prompt_embeds` must be None since it is not supported yet")
+        if latents is not None:
+            raise ValueError("`latents` must be None since it is not supported yet")
+
+        device = self._execution_device
+        # do_classifier_free_guidance = guidance_scale > 1.0
+
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        self.scheduler.set_timesteps(num_inference_steps, device, original_inference_steps=original_inference_steps)
+        timesteps = self.scheduler.timesteps
+        num_channels_latents = self.unet.config.in_channels
+        # bs = batch_size * num_images_per_prompt
+
+        # 3. Encode initial input prompt
+        prompt_embeds_1, _ = self.encode_prompt(
+            prompt[:1],
+            device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=False,
+            negative_prompt=None,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=None,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Prepare initial latent variables
+        latents_1 = self.prepare_latents(
+            1,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds_1.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        images = []
+
+        # 5. Iterate over prompts and perform latent walk. Note that we do this two prompts at a time
+        #    otherwise the memory usage ends up being too high.
+        with self.progress_bar(total=batch_size - 1) as prompt_progress_bar:
+            for i in range(1, batch_size):
+                # 6. Encode current prompt
+                prompt_embeds_2, _ = self.encode_prompt(
+                    prompt[i : i + 1],
+                    device,
+                    num_images_per_prompt=num_images_per_prompt,
+                    do_classifier_free_guidance=False,
+                    negative_prompt=None,
+                    prompt_embeds=prompt_embeds,
+                    negative_prompt_embeds=None,
+                    lora_scale=lora_scale,
+                    clip_skip=self.clip_skip,
+                )
+
+                # 7. Prepare current latent variables
+                latents_2 = self.prepare_latents(
+                    1,
+                    num_channels_latents,
+                    height,
+                    width,
+                    prompt_embeds_2.dtype,
+                    device,
+                    generator,
+                    latents,
+                )
+
+                # 8. Interpolate between previous and current prompt embeddings and latents
+                inference_embeddings = self.interpolate_embedding(
+                    start_embedding=prompt_embeds_1,
+                    end_embedding=prompt_embeds_2,
+                    num_interpolation_steps=num_interpolation_steps,
+                    interpolation_type=embedding_interpolation_type,
+                )
+                inference_latents = self.interpolate_latent(
+                    start_latent=latents_1,
+                    end_latent=latents_2,
+                    num_interpolation_steps=num_interpolation_steps,
+                    interpolation_type=latent_interpolation_type,
+                )
+                next_prompt_embeds = inference_embeddings[-1:].detach().clone()
+                next_latents = inference_latents[-1:].detach().clone()
+                bs = num_interpolation_steps
+
+                # 9. Perform inference in batches. Note the use of `process_batch_size` to control the batch size
+                #    of the inference. This is useful for reducing memory usage and can be configured based on the
+                #    available GPU memory.
+                with self.progress_bar(
+                    total=(bs + process_batch_size - 1) // process_batch_size
+                ) as batch_progress_bar:
+                    for batch_index in range(0, bs, process_batch_size):
+                        batch_inference_latents = inference_latents[batch_index : batch_index + process_batch_size]
+                        batch_inference_embeddings = inference_embeddings[
+                            batch_index : batch_index + process_batch_size
+                        ]
+
+                        self.scheduler.set_timesteps(
+                            num_inference_steps, device, original_inference_steps=original_inference_steps
+                        )
+                        timesteps = self.scheduler.timesteps
+
+                        current_bs = batch_inference_embeddings.shape[0]
+                        w = torch.tensor(self.guidance_scale - 1).repeat(current_bs)
+                        w_embedding = self.get_guidance_scale_embedding(
+                            w, embedding_dim=self.unet.config.time_cond_proj_dim
+                        ).to(device=device, dtype=latents_1.dtype)
+
+                        # 10. Perform inference for current batch
+                        with self.progress_bar(total=num_inference_steps) as progress_bar:
+                            for index, t in enumerate(timesteps):
+                                batch_inference_latents = batch_inference_latents.to(batch_inference_embeddings.dtype)
+
+                                # model prediction (v-prediction, eps, x)
+                                model_pred = self.unet(
+                                    batch_inference_latents,
+                                    t,
+                                    timestep_cond=w_embedding,
+                                    encoder_hidden_states=batch_inference_embeddings,
+                                    cross_attention_kwargs=self.cross_attention_kwargs,
+                                    return_dict=False,
+                                )[0]
+
+                                # compute the previous noisy sample x_t -> x_t-1
+                                batch_inference_latents, denoised = self.scheduler.step(
+                                    model_pred, t, batch_inference_latents, **extra_step_kwargs, return_dict=False
+                                )
+                                if callback_on_step_end is not None:
+                                    callback_kwargs = {}
+                                    for k in callback_on_step_end_tensor_inputs:
+                                        callback_kwargs[k] = locals()[k]
+                                    callback_outputs = callback_on_step_end(self, index, t, callback_kwargs)
+
+                                    batch_inference_latents = callback_outputs.pop("latents", batch_inference_latents)
+                                    batch_inference_embeddings = callback_outputs.pop(
+                                        "prompt_embeds", batch_inference_embeddings
+                                    )
+                                    w_embedding = callback_outputs.pop("w_embedding", w_embedding)
+                                    denoised = callback_outputs.pop("denoised", denoised)
+
+                                # call the callback, if provided
+                                if index == len(timesteps) - 1 or (
+                                    (index + 1) > num_warmup_steps and (index + 1) % self.scheduler.order == 0
+                                ):
+                                    progress_bar.update()
+                                    if callback is not None and index % callback_steps == 0:
+                                        step_idx = index // getattr(self.scheduler, "order", 1)
+                                        callback(step_idx, t, batch_inference_latents)
+
+                        denoised = denoised.to(batch_inference_embeddings.dtype)
+
+                        # Note: This is not supported because you would get black images in your latent walk if
+                        #       NSFW concept is detected
+                        # if not output_type == "latent":
+                        #     image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
+                        #     image, has_nsfw_concept = self.run_safety_checker(image, device, inference_embeddings.dtype)
+                        # else:
+                        #     image = denoised
+                        #     has_nsfw_concept = None
+
+                        # if has_nsfw_concept is None:
+                        #     do_denormalize = [True] * image.shape[0]
+                        # else:
+                        #     do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+                        image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
+                        do_denormalize = [True] * image.shape[0]
+                        has_nsfw_concept = None
+
+                        image = self.image_processor.postprocess(
+                            image, output_type=output_type, do_denormalize=do_denormalize
+                        )
+                        images.append(image)
+
+                        batch_progress_bar.update()
+
+                prompt_embeds_1 = next_prompt_embeds
+                latents_1 = next_latents
+
+                prompt_progress_bar.update()
+
+        # 11. Determine what should be returned
+        if output_type == "pil":
+            images = [image for image_list in images for image in image_list]
+        elif output_type == "np":
+            images = np.concatenate(images)
+        elif output_type == "pt":
+            images = torch.cat(images)
+        else:
+            raise ValueError("`output_type` must be one of 'pil', 'np' or 'pt'.")
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (images, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/latent_consistency_txt2img.py b/diffusers/examples/community/latent_consistency_txt2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f25a6db2722e9923a5a269385b166880ff73dce
--- /dev/null
+++ b/diffusers/examples/community/latent_consistency_txt2img.py
@@ -0,0 +1,729 @@
+# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, ConfigMixin, DiffusionPipeline, SchedulerMixin, UNet2DConditionModel, logging
+from diffusers.configuration_utils import register_to_config
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.utils import BaseOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class LatentConsistencyModelPipeline(DiffusionPipeline):
+    _optional_components = ["scheduler"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: "LCMScheduler",
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        scheduler = (
+            scheduler
+            if scheduler is not None
+            else LCMScheduler(
+                beta_start=0.00085, beta_end=0.0120, beta_schedule="scaled_linear", prediction_type="epsilon"
+            )
+        )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        prompt_embeds: None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+        """
+
+        if prompt is not None and isinstance(prompt, str):
+            pass
+        elif prompt is not None and isinstance(prompt, list):
+            len(prompt)
+        else:
+            prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # Don't need to get uncond prompt embedding because of LCM Guided Distillation
+        return prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if latents is None:
+            latents = torch.randn(shape, dtype=dtype).to(device)
+        else:
+            latents = latents.to(device)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+        Args:
+        timesteps: torch.Tensor: generate embedding vectors at these timesteps
+        embedding_dim: int: dimension of the embeddings to generate
+        dtype: data type of the generated embeddings
+        Returns:
+        embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = 768,
+        width: Optional[int] = 768,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        latents: Optional[torch.Tensor] = None,
+        num_inference_steps: int = 4,
+        lcm_origin_steps: int = 50,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # do_classifier_free_guidance = guidance_scale > 0.0  # In LCM Implementation:  cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            prompt_embeds=prompt_embeds,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, lcm_origin_steps)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variable
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            latents,
+        )
+        bs = batch_size * num_images_per_prompt
+
+        # 6. Get Guidance Scale Embedding
+        w = torch.tensor(guidance_scale).repeat(bs)
+        w_embedding = self.get_w_embedding(w, embedding_dim=256).to(device=device, dtype=latents.dtype)
+
+        # 7. LCM MultiStep Sampling Loop:
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                ts = torch.full((bs,), t, device=device, dtype=torch.long)
+                latents = latents.to(prompt_embeds.dtype)
+
+                # model prediction (v-prediction, eps, x)
+                model_pred = self.unet(
+                    latents,
+                    ts,
+                    timestep_cond=w_embedding,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents, denoised = self.scheduler.step(model_pred, i, t, latents, return_dict=False)
+
+                # # call the callback, if provided
+                # if i == len(timesteps) - 1:
+                progress_bar.update()
+
+        denoised = denoised.to(prompt_embeds.dtype)
+        if not output_type == "latent":
+            image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = denoised
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
+class LCMSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    denoised: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class LCMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
+    non-Markovian guidance.
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        clip_sample: bool = True,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def _get_variance(self, timestep, prev_timestep):
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, height, width = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * height * width)
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, height, width)
+        sample = sample.to(dtype)
+
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+        """
+
+        if num_inference_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        self.num_inference_steps = num_inference_steps
+
+        # LCM Timesteps Setting:  # Linear Spacing
+        c = self.config.num_train_timesteps // lcm_origin_steps
+        lcm_origin_timesteps = np.asarray(list(range(1, lcm_origin_steps + 1))) * c - 1  # LCM Training  Steps Schedule
+        skipping_step = len(lcm_origin_timesteps) // num_inference_steps
+        timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]  # LCM Inference Steps Schedule
+
+        self.timesteps = torch.from_numpy(timesteps.copy()).to(device)
+
+    def get_scalings_for_boundary_condition_discrete(self, t):
+        self.sigma_data = 0.5  # Default: 0.5
+
+        # By dividing 0.1: This is almost a delta function at t=0.
+        c_skip = self.sigma_data**2 / ((t / 0.1) ** 2 + self.sigma_data**2)
+        c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5
+        return c_skip, c_out
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timeindex: int,
+        timestep: int,
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[LCMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                The weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`, defaults to `False`):
+                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
+                because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
+                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
+                `use_clipped_model_output` has no effect.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
+        Returns:
+            [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # 1. get previous step value
+        prev_timeindex = timeindex + 1
+        if prev_timeindex < len(self.timesteps):
+            prev_timestep = self.timesteps[prev_timeindex]
+        else:
+            prev_timestep = timestep
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        # 3. Get scalings for boundary conditions
+        c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
+
+        # 4. Different Parameterization:
+        parameterization = self.config.prediction_type
+
+        if parameterization == "epsilon":  # noise-prediction
+            pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
+
+        elif parameterization == "sample":  # x-prediction
+            pred_x0 = model_output
+
+        elif parameterization == "v_prediction":  # v-prediction
+            pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
+
+        # 4. Denoise model output using boundary conditions
+        denoised = c_out * pred_x0 + c_skip * sample
+
+        # 5. Sample z ~ N(0, I), For MultiStep Inference
+        # Noise is not used for one-step sampling.
+        if len(self.timesteps) > 1:
+            noise = torch.randn(model_output.shape).to(model_output.device)
+            prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
+        else:
+            prev_sample = denoised
+
+        if not return_dict:
+            return (prev_sample, denoised)
+
+        return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/examples/community/llm_grounded_diffusion.py b/diffusers/examples/community/llm_grounded_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..49c07491135496ee0737672da7278280f0b6b4af
--- /dev/null
+++ b/diffusers/examples/community/llm_grounded_diffusion.py
@@ -0,0 +1,1563 @@
+# Copyright 2024 Long Lian, the GLIGEN Authors, and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is a single file implementation of LMD+. See README.md for examples.
+
+import ast
+import gc
+import inspect
+import math
+import warnings
+from collections.abc import Iterable
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.attention import Attention, GatedSelfAttentionDense
+from diffusers.models.attention_processor import AttnProcessor2_0
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
+
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     "longlian/lmd_plus",
+        ...     custom_pipeline="llm_grounded_diffusion",
+        ...     custom_revision="main",
+        ...     variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # Generate an image described by the prompt and
+        >>> # insert objects described by text at the region defined by bounding boxes
+        >>> prompt = "a waterfall and a modern high speed train in a beautiful forest with fall foliage"
+        >>> boxes = [[0.1387, 0.2051, 0.4277, 0.7090], [0.4980, 0.4355, 0.8516, 0.7266]]
+        >>> phrases = ["a waterfall", "a modern high speed train"]
+
+        >>> images = pipe(
+        ...     prompt=prompt,
+        ...     phrases=phrases,
+        ...     boxes=boxes,
+        ...     gligen_scheduled_sampling_beta=0.4,
+        ...     output_type="pil",
+        ...     num_inference_steps=50,
+        ...     lmd_guidance_kwargs={}
+        ... ).images
+
+        >>> images[0].save("./lmd_plus_generation.jpg")
+
+        >>> # Generate directly from a text prompt and an LLM response
+        >>> prompt = "a waterfall and a modern high speed train in a beautiful forest with fall foliage"
+        >>> phrases, boxes, bg_prompt, neg_prompt = pipe.parse_llm_response(\"""
+        [('a waterfall', [71, 105, 148, 258]), ('a modern high speed train', [255, 223, 181, 149])]
+        Background prompt: A beautiful forest with fall foliage
+        Negative prompt:
+        \""")
+
+        >> images = pipe(
+        ...     prompt=prompt,
+        ...     negative_prompt=neg_prompt,
+        ...     phrases=phrases,
+        ...     boxes=boxes,
+        ...     gligen_scheduled_sampling_beta=0.4,
+        ...     output_type="pil",
+        ...     num_inference_steps=50,
+        ...     lmd_guidance_kwargs={}
+        ... ).images
+
+        >>> images[0].save("./lmd_plus_generation.jpg")
+
+images[0]
+
+        ```
+"""
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# All keys in Stable Diffusion models: [('down', 0, 0, 0), ('down', 0, 1, 0), ('down', 1, 0, 0), ('down', 1, 1, 0), ('down', 2, 0, 0), ('down', 2, 1, 0), ('mid', 0, 0, 0), ('up', 1, 0, 0), ('up', 1, 1, 0), ('up', 1, 2, 0), ('up', 2, 0, 0), ('up', 2, 1, 0), ('up', 2, 2, 0), ('up', 3, 0, 0), ('up', 3, 1, 0), ('up', 3, 2, 0)]
+# Note that the first up block is `UpBlock2D` rather than `CrossAttnUpBlock2D` and does not have attention. The last index is always 0 in our case since we have one `BasicTransformerBlock` in each `Transformer2DModel`.
+DEFAULT_GUIDANCE_ATTN_KEYS = [
+    ("mid", 0, 0, 0),
+    ("up", 1, 0, 0),
+    ("up", 1, 1, 0),
+    ("up", 1, 2, 0),
+]
+
+
+def convert_attn_keys(key):
+    """Convert the attention key from tuple format to the torch state format"""
+
+    if key[0] == "mid":
+        assert key[1] == 0, f"mid block only has one block but the index is {key[1]}"
+        return f"{key[0]}_block.attentions.{key[2]}.transformer_blocks.{key[3]}.attn2.processor"
+
+    return f"{key[0]}_blocks.{key[1]}.attentions.{key[2]}.transformer_blocks.{key[3]}.attn2.processor"
+
+
+DEFAULT_GUIDANCE_ATTN_KEYS = [convert_attn_keys(key) for key in DEFAULT_GUIDANCE_ATTN_KEYS]
+
+
+def scale_proportion(obj_box, H, W):
+    # Separately rounding box_w and box_h to allow shift invariant box sizes. Otherwise box sizes may change when both coordinates being rounded end with ".5".
+    x_min, y_min = round(obj_box[0] * W), round(obj_box[1] * H)
+    box_w, box_h = round((obj_box[2] - obj_box[0]) * W), round((obj_box[3] - obj_box[1]) * H)
+    x_max, y_max = x_min + box_w, y_min + box_h
+
+    x_min, y_min = max(x_min, 0), max(y_min, 0)
+    x_max, y_max = min(x_max, W), min(y_max, H)
+
+    return x_min, y_min, x_max, y_max
+
+
+# Adapted from the parent class `AttnProcessor2_0`
+class AttnProcessorWithHook(AttnProcessor2_0):
+    def __init__(
+        self,
+        attn_processor_key,
+        hidden_size,
+        cross_attention_dim,
+        hook=None,
+        fast_attn=True,
+        enabled=True,
+    ):
+        super().__init__()
+        self.attn_processor_key = attn_processor_key
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.hook = hook
+        self.fast_attn = fast_attn
+        self.enabled = enabled
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        scale: float = 1.0,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        args = () if USE_PEFT_BACKEND else (scale,)
+        query = attn.to_q(hidden_states, *args)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states, *args)
+        value = attn.to_v(encoder_hidden_states, *args)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        if (self.hook is not None and self.enabled) or not self.fast_attn:
+            query_batch_dim = attn.head_to_batch_dim(query)
+            key_batch_dim = attn.head_to_batch_dim(key)
+            value_batch_dim = attn.head_to_batch_dim(value)
+            attention_probs = attn.get_attention_scores(query_batch_dim, key_batch_dim, attention_mask)
+
+        if self.hook is not None and self.enabled:
+            # Call the hook with query, key, value, and attention maps
+            self.hook(
+                self.attn_processor_key,
+                query_batch_dim,
+                key_batch_dim,
+                value_batch_dim,
+                attention_probs,
+            )
+
+        if self.fast_attn:
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+            if attention_mask is not None:
+                # scaled_dot_product_attention expects attention_mask shape to be
+                # (batch, heads, source_length, target_length)
+                attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+            # the output of sdp = (batch, num_heads, seq_len, head_dim)
+            # TODO: add support for attn.scale when we move to Torch 2.1
+            hidden_states = F.scaled_dot_product_attention(
+                query,
+                key,
+                value,
+                attn_mask=attention_mask,
+                dropout_p=0.0,
+                is_causal=False,
+            )
+            hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+            hidden_states = hidden_states.to(query.dtype)
+        else:
+            hidden_states = torch.bmm(attention_probs, value)
+            hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states, *args)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class LLMGroundedDiffusionPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for layout-grounded text-to-image generation using LLM-grounded Diffusion (LMD+): https://arxiv.org/pdf/2305.13655.pdf.
+
+    This model inherits from [`StableDiffusionPipeline`] and aims at implementing the pipeline with minimal modifications. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    This is a simplified implementation that does not perform latent or attention transfer from single object generation to overall generation. The final image is generated directly with attention and adapters control.
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+        requires_safety_checker (bool):
+            Whether a safety checker is needed for this pipeline.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    objects_text = "Objects: "
+    bg_prompt_text = "Background prompt: "
+    bg_prompt_text_no_trailing_space = bg_prompt_text.rstrip()
+    neg_prompt_text = "Negative prompt: "
+    neg_prompt_text_no_trailing_space = neg_prompt_text.rstrip()
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        # This is copied from StableDiffusionPipeline, with hook initizations for LMD+.
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+        # Initialize the attention hooks for LLM-grounded Diffusion
+        self.register_attn_hooks(unet)
+        self._saved_attn = None
+
+    def attn_hook(self, name, query, key, value, attention_probs):
+        if name in DEFAULT_GUIDANCE_ATTN_KEYS:
+            self._saved_attn[name] = attention_probs
+
+    @classmethod
+    def convert_box(cls, box, height, width):
+        # box: x, y, w, h (in 512 format) -> x_min, y_min, x_max, y_max
+        x_min, y_min = box[0] / width, box[1] / height
+        w_box, h_box = box[2] / width, box[3] / height
+
+        x_max, y_max = x_min + w_box, y_min + h_box
+
+        return x_min, y_min, x_max, y_max
+
+    @classmethod
+    def _parse_response_with_negative(cls, text):
+        if not text:
+            raise ValueError("LLM response is empty")
+
+        if cls.objects_text in text:
+            text = text.split(cls.objects_text)[1]
+
+        text_split = text.split(cls.bg_prompt_text_no_trailing_space)
+        if len(text_split) == 2:
+            gen_boxes, text_rem = text_split
+        else:
+            raise ValueError(f"LLM response is incomplete: {text}")
+
+        text_split = text_rem.split(cls.neg_prompt_text_no_trailing_space)
+
+        if len(text_split) == 2:
+            bg_prompt, neg_prompt = text_split
+        else:
+            raise ValueError(f"LLM response is incomplete: {text}")
+
+        try:
+            gen_boxes = ast.literal_eval(gen_boxes)
+        except SyntaxError as e:
+            # Sometimes the response is in plain text
+            if "No objects" in gen_boxes or gen_boxes.strip() == "":
+                gen_boxes = []
+            else:
+                raise e
+        bg_prompt = bg_prompt.strip()
+        neg_prompt = neg_prompt.strip()
+
+        # LLM may return "None" to mean no negative prompt provided.
+        if neg_prompt == "None":
+            neg_prompt = ""
+
+        return gen_boxes, bg_prompt, neg_prompt
+
+    @classmethod
+    def parse_llm_response(cls, response, canvas_height=512, canvas_width=512):
+        # Infer from spec
+        gen_boxes, bg_prompt, neg_prompt = cls._parse_response_with_negative(text=response)
+
+        gen_boxes = sorted(gen_boxes, key=lambda gen_box: gen_box[0])
+
+        phrases = [name for name, _ in gen_boxes]
+        boxes = [cls.convert_box(box, height=canvas_height, width=canvas_width) for _, box in gen_boxes]
+
+        return phrases, boxes, bg_prompt, neg_prompt
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        phrases,
+        boxes,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        phrase_indices=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt is None and phrase_indices is None:
+            raise ValueError("If the prompt is None, the phrase_indices cannot be None")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if len(phrases) != len(boxes):
+            raise ValueError(
+                "length of `phrases` and `boxes` has to be same, but"
+                f" got: `phrases` {len(phrases)} != `boxes` {len(boxes)}"
+            )
+
+    def register_attn_hooks(self, unet):
+        """Registering hooks to obtain the attention maps for guidance"""
+
+        attn_procs = {}
+
+        for name in unet.attn_processors.keys():
+            # Only obtain the queries and keys from cross-attention
+            if name.endswith("attn1.processor") or name.endswith("fuser.attn.processor"):
+                # Keep the same attn_processors for self-attention (no hooks for self-attention)
+                attn_procs[name] = unet.attn_processors[name]
+                continue
+
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+
+            attn_procs[name] = AttnProcessorWithHook(
+                attn_processor_key=name,
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+                hook=self.attn_hook,
+                fast_attn=True,
+                # Not enabled by default
+                enabled=False,
+            )
+
+        unet.set_attn_processor(attn_procs)
+
+    def enable_fuser(self, enabled=True):
+        for module in self.unet.modules():
+            if isinstance(module, GatedSelfAttentionDense):
+                module.enabled = enabled
+
+    def enable_attn_hook(self, enabled=True):
+        for module in self.unet.attn_processors.values():
+            if isinstance(module, AttnProcessorWithHook):
+                module.enabled = enabled
+
+    def get_token_map(self, prompt, padding="do_not_pad", verbose=False):
+        """Get a list of mapping: prompt index to str (prompt in a list of token str)"""
+        fg_prompt_tokens = self.tokenizer([prompt], padding=padding, max_length=77, return_tensors="np")
+        input_ids = fg_prompt_tokens["input_ids"][0]
+
+        token_map = []
+        for ind, item in enumerate(input_ids.tolist()):
+            token = self.tokenizer._convert_id_to_token(item)
+
+            if verbose:
+                logger.info(f"{ind}, {token} ({item})")
+
+            token_map.append(token)
+
+        return token_map
+
+    def get_phrase_indices(
+        self,
+        prompt,
+        phrases,
+        token_map=None,
+        add_suffix_if_not_found=False,
+        verbose=False,
+    ):
+        for obj in phrases:
+            # Suffix the prompt with object name for attention guidance if object is not in the prompt, using "|" to separate the prompt and the suffix
+            if obj not in prompt:
+                prompt += "| " + obj
+
+        if token_map is None:
+            # We allow using a pre-computed token map.
+            token_map = self.get_token_map(prompt=prompt, padding="do_not_pad", verbose=verbose)
+        token_map_str = " ".join(token_map)
+
+        phrase_indices = []
+
+        for obj in phrases:
+            phrase_token_map = self.get_token_map(prompt=obj, padding="do_not_pad", verbose=verbose)
+            # Remove <bos> and <eos> in substr
+            phrase_token_map = phrase_token_map[1:-1]
+            phrase_token_map_len = len(phrase_token_map)
+            phrase_token_map_str = " ".join(phrase_token_map)
+
+            if verbose:
+                logger.info(
+                    "Full str:",
+                    token_map_str,
+                    "Substr:",
+                    phrase_token_map_str,
+                    "Phrase:",
+                    phrases,
+                )
+
+            # Count the number of token before substr
+            # The substring comes with a trailing space that needs to be removed by minus one in the index.
+            obj_first_index = len(token_map_str[: token_map_str.index(phrase_token_map_str) - 1].split(" "))
+
+            obj_position = list(range(obj_first_index, obj_first_index + phrase_token_map_len))
+            phrase_indices.append(obj_position)
+
+        if add_suffix_if_not_found:
+            return phrase_indices, prompt
+
+        return phrase_indices
+
+    def add_ca_loss_per_attn_map_to_loss(
+        self,
+        loss,
+        attn_map,
+        object_number,
+        bboxes,
+        phrase_indices,
+        fg_top_p=0.2,
+        bg_top_p=0.2,
+        fg_weight=1.0,
+        bg_weight=1.0,
+    ):
+        # b is the number of heads, not batch
+        b, i, j = attn_map.shape
+        H = W = int(math.sqrt(i))
+        for obj_idx in range(object_number):
+            obj_loss = 0
+            mask = torch.zeros(size=(H, W), device="cuda")
+            obj_boxes = bboxes[obj_idx]
+
+            # We support two level (one box per phrase) and three level (multiple boxes per phrase)
+            if not isinstance(obj_boxes[0], Iterable):
+                obj_boxes = [obj_boxes]
+
+            for obj_box in obj_boxes:
+                # x_min, y_min, x_max, y_max = int(obj_box[0] * W), int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)
+                x_min, y_min, x_max, y_max = scale_proportion(obj_box, H=H, W=W)
+                mask[y_min:y_max, x_min:x_max] = 1
+
+            for obj_position in phrase_indices[obj_idx]:
+                # Could potentially optimize to compute this for loop in batch.
+                # Could crop the ref cross attention before saving to save memory.
+
+                ca_map_obj = attn_map[:, :, obj_position].reshape(b, H, W)
+
+                # shape: (b, H * W)
+                ca_map_obj = attn_map[:, :, obj_position]  # .reshape(b, H, W)
+                k_fg = (mask.sum() * fg_top_p).long().clamp_(min=1)
+                k_bg = ((1 - mask).sum() * bg_top_p).long().clamp_(min=1)
+
+                mask_1d = mask.view(1, -1)
+
+                # Max-based loss function
+
+                # Take the topk over spatial dimension, and then take the sum over heads dim
+                # The mean is over k_fg and k_bg dimension, so we don't need to sum and divide on our own.
+                obj_loss += (1 - (ca_map_obj * mask_1d).topk(k=k_fg).values.mean(dim=1)).sum(dim=0) * fg_weight
+                obj_loss += ((ca_map_obj * (1 - mask_1d)).topk(k=k_bg).values.mean(dim=1)).sum(dim=0) * bg_weight
+
+            loss += obj_loss / len(phrase_indices[obj_idx])
+
+        return loss
+
+    def compute_ca_loss(
+        self,
+        saved_attn,
+        bboxes,
+        phrase_indices,
+        guidance_attn_keys,
+        verbose=False,
+        **kwargs,
+    ):
+        """
+        The `saved_attn` is supposed to be passed to `save_attn_to_dict` in `cross_attention_kwargs` prior to computing ths loss.
+        `AttnProcessor` will put attention maps into the `save_attn_to_dict`.
+
+        `index` is the timestep.
+        `ref_ca_word_token_only`: This has precedence over `ref_ca_last_token_only` (i.e., if both are enabled, we take the token from word rather than the last token).
+        `ref_ca_last_token_only`: `ref_ca_saved_attn` comes from the attention map of the last token of the phrase in single object generation, so we apply it only to the last token of the phrase in overall generation if this is set to True. If set to False, `ref_ca_saved_attn` will be applied to all the text tokens.
+        """
+        loss = torch.tensor(0).float().cuda()
+        object_number = len(bboxes)
+        if object_number == 0:
+            return loss
+
+        for attn_key in guidance_attn_keys:
+            # We only have 1 cross attention for mid.
+
+            attn_map_integrated = saved_attn[attn_key]
+            if not attn_map_integrated.is_cuda:
+                attn_map_integrated = attn_map_integrated.cuda()
+            # Example dimension: [20, 64, 77]
+            attn_map = attn_map_integrated.squeeze(dim=0)
+
+            loss = self.add_ca_loss_per_attn_map_to_loss(
+                loss, attn_map, object_number, bboxes, phrase_indices, **kwargs
+            )
+
+        num_attn = len(guidance_attn_keys)
+
+        if num_attn > 0:
+            loss = loss / (object_number * num_attn)
+
+        return loss
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        gligen_scheduled_sampling_beta: float = 0.3,
+        phrases: List[str] = None,
+        boxes: List[List[float]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        lmd_guidance_kwargs: Optional[Dict[str, Any]] = {},
+        phrase_indices: Optional[List[int]] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            phrases (`List[str]`):
+                The phrases to guide what to include in each of the regions defined by the corresponding
+                `boxes`. There should only be one phrase per bounding box.
+            boxes (`List[List[float]]`):
+                The bounding boxes that identify rectangular regions of the image that are going to be filled with the
+                content described by the corresponding `phrases`. Each rectangular box is defined as a
+                `List[float]` of 4 elements `[xmin, ymin, xmax, ymax]` where each value is between [0,1].
+            gligen_scheduled_sampling_beta (`float`, defaults to 0.3):
+                Scheduled Sampling factor from [GLIGEN: Open-Set Grounded Text-to-Image
+                Generation](https://arxiv.org/pdf/2301.07093.pdf). Scheduled Sampling factor is only varied for
+                scheduled sampling during inference for improved quality and controllability.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            lmd_guidance_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to `latent_lmd_guidance` function. Useful keys include `loss_scale` (the guidance strength), `loss_threshold` (when loss is lower than this value, the guidance is not applied anymore), `max_iter` (the number of iterations of guidance for each step), and `guidance_timesteps` (the number of diffusion timesteps to apply guidance on). See `latent_lmd_guidance` for implementation details.
+            phrase_indices (`list` of `list`, *optional*): The indices of the tokens of each phrase in the overall prompt. If omitted, the pipeline will match the first token subsequence. The pipeline will append the missing phrases to the end of the prompt by default.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            phrases,
+            boxes,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            phrase_indices,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+            if phrase_indices is None:
+                phrase_indices, prompt = self.get_phrase_indices(prompt, phrases, add_suffix_if_not_found=True)
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+            if phrase_indices is None:
+                phrase_indices = []
+                prompt_parsed = []
+                for prompt_item in prompt:
+                    (
+                        phrase_indices_parsed_item,
+                        prompt_parsed_item,
+                    ) = self.get_phrase_indices(prompt_item, add_suffix_if_not_found=True)
+                    phrase_indices.append(phrase_indices_parsed_item)
+                    prompt_parsed.append(prompt_parsed_item)
+                prompt = prompt_parsed
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+
+        cond_prompt_embeds = prompt_embeds
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None:
+            image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 5.1 Prepare GLIGEN variables
+        max_objs = 30
+        if len(boxes) > max_objs:
+            warnings.warn(
+                f"More that {max_objs} objects found. Only first {max_objs} objects will be processed.",
+                FutureWarning,
+            )
+            phrases = phrases[:max_objs]
+            boxes = boxes[:max_objs]
+
+        n_objs = len(boxes)
+        if n_objs:
+            # prepare batched input to the PositionNet (boxes, phrases, mask)
+            # Get tokens for phrases from pre-trained CLIPTokenizer
+            tokenizer_inputs = self.tokenizer(phrases, padding=True, return_tensors="pt").to(device)
+            # For the token, we use the same pre-trained text encoder
+            # to obtain its text feature
+            _text_embeddings = self.text_encoder(**tokenizer_inputs).pooler_output
+
+        # For each entity, described in phrases, is denoted with a bounding box,
+        # we represent the location information as (xmin,ymin,xmax,ymax)
+        cond_boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype)
+        if n_objs:
+            cond_boxes[:n_objs] = torch.tensor(boxes)
+        text_embeddings = torch.zeros(
+            max_objs,
+            self.unet.config.cross_attention_dim,
+            device=device,
+            dtype=self.text_encoder.dtype,
+        )
+        if n_objs:
+            text_embeddings[:n_objs] = _text_embeddings
+        # Generate a mask for each object that is entity described by phrases
+        masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
+        masks[:n_objs] = 1
+
+        repeat_batch = batch_size * num_images_per_prompt
+        cond_boxes = cond_boxes.unsqueeze(0).expand(repeat_batch, -1, -1).clone()
+        text_embeddings = text_embeddings.unsqueeze(0).expand(repeat_batch, -1, -1).clone()
+        masks = masks.unsqueeze(0).expand(repeat_batch, -1).clone()
+        if do_classifier_free_guidance:
+            repeat_batch = repeat_batch * 2
+            cond_boxes = torch.cat([cond_boxes] * 2)
+            text_embeddings = torch.cat([text_embeddings] * 2)
+            masks = torch.cat([masks] * 2)
+            masks[: repeat_batch // 2] = 0
+        if cross_attention_kwargs is None:
+            cross_attention_kwargs = {}
+        cross_attention_kwargs["gligen"] = {
+            "boxes": cond_boxes,
+            "positive_embeddings": text_embeddings,
+            "masks": masks,
+        }
+
+        num_grounding_steps = int(gligen_scheduled_sampling_beta * len(timesteps))
+        self.enable_fuser(True)
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
+        loss_attn = torch.tensor(10000.0)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Scheduled sampling
+                if i == num_grounding_steps:
+                    self.enable_fuser(False)
+
+                if latents.shape[1] != 4:
+                    latents = torch.randn_like(latents[:, :4])
+
+                # 7.1 Perform LMD guidance
+                if boxes:
+                    latents, loss_attn = self.latent_lmd_guidance(
+                        cond_prompt_embeds,
+                        index=i,
+                        boxes=boxes,
+                        phrase_indices=phrase_indices,
+                        t=t,
+                        latents=latents,
+                        loss=loss_attn,
+                        **lmd_guidance_kwargs,
+                    )
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    @torch.set_grad_enabled(True)
+    def latent_lmd_guidance(
+        self,
+        cond_embeddings,
+        index,
+        boxes,
+        phrase_indices,
+        t,
+        latents,
+        loss,
+        *,
+        loss_scale=20,
+        loss_threshold=5.0,
+        max_iter=[3] * 5 + [2] * 5 + [1] * 5,
+        guidance_timesteps=15,
+        cross_attention_kwargs=None,
+        guidance_attn_keys=DEFAULT_GUIDANCE_ATTN_KEYS,
+        verbose=False,
+        clear_cache=False,
+        unet_additional_kwargs={},
+        guidance_callback=None,
+        **kwargs,
+    ):
+        scheduler, unet = self.scheduler, self.unet
+
+        iteration = 0
+
+        if index < guidance_timesteps:
+            if isinstance(max_iter, list):
+                max_iter = max_iter[index]
+
+            if verbose:
+                logger.info(
+                    f"time index {index}, loss: {loss.item()/loss_scale:.3f} (de-scaled with scale {loss_scale:.1f}), loss threshold: {loss_threshold:.3f}"
+                )
+
+            try:
+                self.enable_attn_hook(enabled=True)
+
+                while (
+                    loss.item() / loss_scale > loss_threshold and iteration < max_iter and index < guidance_timesteps
+                ):
+                    self._saved_attn = {}
+
+                    latents.requires_grad_(True)
+                    latent_model_input = latents
+                    latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+
+                    unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=cond_embeddings,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        **unet_additional_kwargs,
+                    )
+
+                    # update latents with guidance
+                    loss = (
+                        self.compute_ca_loss(
+                            saved_attn=self._saved_attn,
+                            bboxes=boxes,
+                            phrase_indices=phrase_indices,
+                            guidance_attn_keys=guidance_attn_keys,
+                            verbose=verbose,
+                            **kwargs,
+                        )
+                        * loss_scale
+                    )
+
+                    if torch.isnan(loss):
+                        raise RuntimeError("**Loss is NaN**")
+
+                    # This callback allows visualizations.
+                    if guidance_callback is not None:
+                        guidance_callback(self, latents, loss, iteration, index)
+
+                    self._saved_attn = None
+
+                    grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents])[0]
+
+                    latents.requires_grad_(False)
+
+                    # Scaling with classifier guidance
+                    alpha_prod_t = scheduler.alphas_cumprod[t]
+                    # Classifier guidance: https://arxiv.org/pdf/2105.05233.pdf
+                    # DDIM: https://arxiv.org/pdf/2010.02502.pdf
+                    scale = (1 - alpha_prod_t) ** (0.5)
+                    latents = latents - scale * grad_cond
+
+                    iteration += 1
+
+                    if clear_cache:
+                        gc.collect()
+                        torch.cuda.empty_cache()
+
+                    if verbose:
+                        logger.info(
+                            f"time index {index}, loss: {loss.item()/loss_scale:.3f}, loss threshold: {loss_threshold:.3f}, iteration: {iteration}"
+                        )
+
+            finally:
+                self.enable_attn_hook(enabled=False)
+
+        return latents, loss
+
+    # Below are methods copied from StableDiffusionPipeline
+    # The design choice of not inheriting from StableDiffusionPipeline is discussed here: https://github.com/huggingface/diffusers/pull/5993#issuecomment-1834258517
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device),
+                    attention_mask=attention_mask,
+                    output_hidden_states=True,
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeds = self.image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+        uncond_image_embeds = torch.zeros_like(image_embeds)
+        return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_scale
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_rescale
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.clip_skip
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.do_classifier_free_guidance
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.cross_attention_kwargs
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.num_timesteps
+    def num_timesteps(self):
+        return self._num_timesteps
diff --git a/diffusers/examples/community/lpw_stable_diffusion.py b/diffusers/examples/community/lpw_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec27acdce3318fd8cd27363d8cf452a5100d24e3
--- /dev/null
+++ b/diffusers/examples/community/lpw_stable_diffusion.py
@@ -0,0 +1,1427 @@
+import inspect
+import re
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+# ------------------------------------------------------------------------------
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+re_attention = re.compile(
+    r"""
+\\\(|
+\\\)|
+\\\[|
+\\]|
+\\\\|
+\\|
+\(|
+\[|
+:([+-]?[.\d]+)\)|
+\)|
+]|
+[^\\()\[\]:]+|
+:
+""",
+    re.X,
+)
+
+
+def parse_prompt_attention(text):
+    """
+    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
+    Accepted tokens are:
+      (abc) - increases attention to abc by a multiplier of 1.1
+      (abc:3.12) - increases attention to abc by a multiplier of 3.12
+      [abc] - decreases attention to abc by a multiplier of 1.1
+      \\( - literal character '('
+      \\[ - literal character '['
+      \\) - literal character ')'
+      \\] - literal character ']'
+      \\ - literal character '\'
+      anything else - just text
+    >>> parse_prompt_attention('normal text')
+    [['normal text', 1.0]]
+    >>> parse_prompt_attention('an (important) word')
+    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
+    >>> parse_prompt_attention('(unbalanced')
+    [['unbalanced', 1.1]]
+    >>> parse_prompt_attention('\\(literal\\]')
+    [['(literal]', 1.0]]
+    >>> parse_prompt_attention('(unnecessary)(parens)')
+    [['unnecessaryparens', 1.1]]
+    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
+    [['a ', 1.0],
+     ['house', 1.5730000000000004],
+     [' ', 1.1],
+     ['on', 1.0],
+     [' a ', 1.1],
+     ['hill', 0.55],
+     [', sun, ', 1.1],
+     ['sky', 1.4641000000000006],
+     ['.', 1.1]]
+    """
+
+    res = []
+    round_brackets = []
+    square_brackets = []
+
+    round_bracket_multiplier = 1.1
+    square_bracket_multiplier = 1 / 1.1
+
+    def multiply_range(start_position, multiplier):
+        for p in range(start_position, len(res)):
+            res[p][1] *= multiplier
+
+    for m in re_attention.finditer(text):
+        text = m.group(0)
+        weight = m.group(1)
+
+        if text.startswith("\\"):
+            res.append([text[1:], 1.0])
+        elif text == "(":
+            round_brackets.append(len(res))
+        elif text == "[":
+            square_brackets.append(len(res))
+        elif weight is not None and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), float(weight))
+        elif text == ")" and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), round_bracket_multiplier)
+        elif text == "]" and len(square_brackets) > 0:
+            multiply_range(square_brackets.pop(), square_bracket_multiplier)
+        else:
+            res.append([text, 1.0])
+
+    for pos in round_brackets:
+        multiply_range(pos, round_bracket_multiplier)
+
+    for pos in square_brackets:
+        multiply_range(pos, square_bracket_multiplier)
+
+    if len(res) == 0:
+        res = [["", 1.0]]
+
+    # merge runs of identical weights
+    i = 0
+    while i + 1 < len(res):
+        if res[i][1] == res[i + 1][1]:
+            res[i][0] += res[i + 1][0]
+            res.pop(i + 1)
+        else:
+            i += 1
+
+    return res
+
+
+def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_length: int):
+    r"""
+    Tokenize a list of prompts and return its tokens with weights of each token.
+
+    No padding, starting or ending token is included.
+    """
+    tokens = []
+    weights = []
+    truncated = False
+    for text in prompt:
+        texts_and_weights = parse_prompt_attention(text)
+        text_token = []
+        text_weight = []
+        for word, weight in texts_and_weights:
+            # tokenize and discard the starting and the ending token
+            token = pipe.tokenizer(word).input_ids[1:-1]
+            text_token += token
+            # copy the weight by length of token
+            text_weight += [weight] * len(token)
+            # stop if the text is too long (longer than truncation limit)
+            if len(text_token) > max_length:
+                truncated = True
+                break
+        # truncate
+        if len(text_token) > max_length:
+            truncated = True
+            text_token = text_token[:max_length]
+            text_weight = text_weight[:max_length]
+        tokens.append(text_token)
+        weights.append(text_weight)
+    if truncated:
+        logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
+    return tokens, weights
+
+
+def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77):
+    r"""
+    Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
+    """
+    max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
+    weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
+    for i in range(len(tokens)):
+        tokens[i] = [bos] + tokens[i] + [pad] * (max_length - 1 - len(tokens[i]) - 1) + [eos]
+        if no_boseos_middle:
+            weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
+        else:
+            w = []
+            if len(weights[i]) == 0:
+                w = [1.0] * weights_length
+            else:
+                for j in range(max_embeddings_multiples):
+                    w.append(1.0)  # weight for starting token in this chunk
+                    w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
+                    w.append(1.0)  # weight for ending token in this chunk
+                w += [1.0] * (weights_length - len(w))
+            weights[i] = w[:]
+
+    return tokens, weights
+
+
+def get_unweighted_text_embeddings(
+    pipe: DiffusionPipeline,
+    text_input: torch.Tensor,
+    chunk_length: int,
+    no_boseos_middle: Optional[bool] = True,
+    clip_skip: Optional[int] = None,
+):
+    """
+    When the length of tokens is a multiple of the capacity of the text encoder,
+    it should be split into chunks and sent to the text encoder individually.
+    """
+    max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
+    if max_embeddings_multiples > 1:
+        text_embeddings = []
+        for i in range(max_embeddings_multiples):
+            # extract the i-th chunk
+            text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
+
+            # cover the head and the tail by the starting and the ending tokens
+            text_input_chunk[:, 0] = text_input[0, 0]
+            text_input_chunk[:, -1] = text_input[0, -1]
+            if clip_skip is None:
+                prompt_embeds = pipe.text_encoder(text_input_chunk.to(pipe.device))
+                text_embedding = prompt_embeds[0]
+            else:
+                prompt_embeds = pipe.text_encoder(text_input_chunk.to(pipe.device), output_hidden_states=True)
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                text_embedding = pipe.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+            if no_boseos_middle:
+                if i == 0:
+                    # discard the ending token
+                    text_embedding = text_embedding[:, :-1]
+                elif i == max_embeddings_multiples - 1:
+                    # discard the starting token
+                    text_embedding = text_embedding[:, 1:]
+                else:
+                    # discard both starting and ending tokens
+                    text_embedding = text_embedding[:, 1:-1]
+
+            text_embeddings.append(text_embedding)
+        text_embeddings = torch.concat(text_embeddings, axis=1)
+    else:
+        if clip_skip is None:
+            clip_skip = 0
+        prompt_embeds = pipe.text_encoder(text_input, output_hidden_states=True)[-1][-(clip_skip + 1)]
+        text_embeddings = pipe.text_encoder.text_model.final_layer_norm(prompt_embeds)
+    return text_embeddings
+
+
+def get_weighted_text_embeddings(
+    pipe: DiffusionPipeline,
+    prompt: Union[str, List[str]],
+    uncond_prompt: Optional[Union[str, List[str]]] = None,
+    max_embeddings_multiples: Optional[int] = 3,
+    no_boseos_middle: Optional[bool] = False,
+    skip_parsing: Optional[bool] = False,
+    skip_weighting: Optional[bool] = False,
+    clip_skip=None,
+    lora_scale=None,
+):
+    r"""
+    Prompts can be assigned with local weights using brackets. For example,
+    prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
+    and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
+
+    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean.
+
+    Args:
+        pipe (`DiffusionPipeline`):
+            Pipe to provide access to the tokenizer and the text encoder.
+        prompt (`str` or `List[str]`):
+            The prompt or prompts to guide the image generation.
+        uncond_prompt (`str` or `List[str]`):
+            The unconditional prompt or prompts for guide the image generation. If unconditional prompt
+            is provided, the embeddings of prompt and uncond_prompt are concatenated.
+        max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+            The max multiple length of prompt embeddings compared to the max output length of text encoder.
+        no_boseos_middle (`bool`, *optional*, defaults to `False`):
+            If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and
+            ending token in each of the chunk in the middle.
+        skip_parsing (`bool`, *optional*, defaults to `False`):
+            Skip the parsing of brackets.
+        skip_weighting (`bool`, *optional*, defaults to `False`):
+            Skip the weighting. When the parsing is skipped, it is forced True.
+    """
+    # set lora scale so that monkey patched LoRA
+    # function of text encoder can correctly access it
+    if lora_scale is not None and isinstance(pipe, StableDiffusionLoraLoaderMixin):
+        pipe._lora_scale = lora_scale
+
+        # dynamically adjust the LoRA scale
+        if not USE_PEFT_BACKEND:
+            adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale)
+        else:
+            scale_lora_layers(pipe.text_encoder, lora_scale)
+    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
+    if isinstance(prompt, str):
+        prompt = [prompt]
+
+    if not skip_parsing:
+        prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
+        if uncond_prompt is not None:
+            if isinstance(uncond_prompt, str):
+                uncond_prompt = [uncond_prompt]
+            uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
+    else:
+        prompt_tokens = [
+            token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids
+        ]
+        prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
+        if uncond_prompt is not None:
+            if isinstance(uncond_prompt, str):
+                uncond_prompt = [uncond_prompt]
+            uncond_tokens = [
+                token[1:-1]
+                for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids
+            ]
+            uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
+
+    # round up the longest length of tokens to a multiple of (model_max_length - 2)
+    max_length = max([len(token) for token in prompt_tokens])
+    if uncond_prompt is not None:
+        max_length = max(max_length, max([len(token) for token in uncond_tokens]))
+
+    max_embeddings_multiples = min(
+        max_embeddings_multiples,
+        (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
+    )
+    max_embeddings_multiples = max(1, max_embeddings_multiples)
+    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
+
+    # pad the length of tokens and weights
+    bos = pipe.tokenizer.bos_token_id
+    eos = pipe.tokenizer.eos_token_id
+    pad = getattr(pipe.tokenizer, "pad_token_id", eos)
+    prompt_tokens, prompt_weights = pad_tokens_and_weights(
+        prompt_tokens,
+        prompt_weights,
+        max_length,
+        bos,
+        eos,
+        pad,
+        no_boseos_middle=no_boseos_middle,
+        chunk_length=pipe.tokenizer.model_max_length,
+    )
+    prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device)
+    if uncond_prompt is not None:
+        uncond_tokens, uncond_weights = pad_tokens_and_weights(
+            uncond_tokens,
+            uncond_weights,
+            max_length,
+            bos,
+            eos,
+            pad,
+            no_boseos_middle=no_boseos_middle,
+            chunk_length=pipe.tokenizer.model_max_length,
+        )
+        uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device)
+
+    # get the embeddings
+    text_embeddings = get_unweighted_text_embeddings(
+        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle, clip_skip=clip_skip
+    )
+    prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=text_embeddings.device)
+    if uncond_prompt is not None:
+        uncond_embeddings = get_unweighted_text_embeddings(
+            pipe,
+            uncond_tokens,
+            pipe.tokenizer.model_max_length,
+            no_boseos_middle=no_boseos_middle,
+            clip_skip=clip_skip,
+        )
+        uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=uncond_embeddings.device)
+
+    # assign weights to the prompts and normalize in the sense of mean
+    # TODO: should we normalize by chunk or in a whole (current implementation)?
+    if (not skip_parsing) and (not skip_weighting):
+        previous_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
+        text_embeddings *= prompt_weights.unsqueeze(-1)
+        current_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
+        text_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
+        if uncond_prompt is not None:
+            previous_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
+            uncond_embeddings *= uncond_weights.unsqueeze(-1)
+            current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
+            uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
+
+    if pipe.text_encoder is not None:
+        if isinstance(pipe, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(pipe.text_encoder, lora_scale)
+
+    if uncond_prompt is not None:
+        return text_embeddings, uncond_embeddings
+    return text_embeddings, None
+
+
+def preprocess_image(image, batch_size):
+    w, h = image.size
+    w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+    image = np.array(image).astype(np.float32) / 255.0
+    image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask, batch_size, scale_factor=8):
+    if not isinstance(mask, torch.Tensor):
+        mask = mask.convert("L")
+        w, h = mask.size
+        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+        mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
+        mask = np.array(mask).astype(np.float32) / 255.0
+        mask = np.tile(mask, (4, 1, 1))
+        mask = np.vstack([mask[None]] * batch_size)
+        mask = 1 - mask  # repaint white, keep black
+        mask = torch.from_numpy(mask)
+        return mask
+
+    else:
+        valid_mask_channel_sizes = [1, 3]
+        # if mask channel is fourth tensor dimension, permute dimensions to pytorch standard (B, C, H, W)
+        if mask.shape[3] in valid_mask_channel_sizes:
+            mask = mask.permute(0, 3, 1, 2)
+        elif mask.shape[1] not in valid_mask_channel_sizes:
+            raise ValueError(
+                f"Mask channel dimension of size in {valid_mask_channel_sizes} should be second or fourth dimension,"
+                f" but received mask of shape {tuple(mask.shape)}"
+            )
+        # (potentially) reduce mask channel dimension from 3 to 1 for broadcasting to latent shape
+        mask = mask.mean(dim=1, keepdim=True)
+        h, w = mask.shape[-2:]
+        h, w = (x - x % 8 for x in (h, w))  # resize to integer multiple of 8
+        mask = torch.nn.functional.interpolate(mask, (h // scale_factor, w // scale_factor))
+        return mask
+
+
+class StableDiffusionLongPromptWeightingPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
+    weighting in prompt.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder-->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(
+            requires_safety_checker=requires_safety_checker,
+        )
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        max_embeddings_multiples=3,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        clip_skip: Optional[int] = None,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if negative_prompt_embeds is None:
+            if negative_prompt is None:
+                negative_prompt = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                negative_prompt = [negative_prompt] * batch_size
+            if batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+        if prompt_embeds is None or negative_prompt_embeds is None:
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+                if do_classifier_free_guidance and negative_prompt_embeds is None:
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, self.tokenizer)
+
+            prompt_embeds1, negative_prompt_embeds1 = get_weighted_text_embeddings(
+                pipe=self,
+                prompt=prompt,
+                uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
+                max_embeddings_multiples=max_embeddings_multiples,
+                clip_skip=clip_skip,
+                lora_scale=lora_scale,
+            )
+            if prompt_embeds is None:
+                prompt_embeds = prompt_embeds1
+            if negative_prompt_embeds is None:
+                negative_prompt_embeds = negative_prompt_embeds1
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            bs_embed, seq_len, _ = negative_prompt_embeds.shape
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        strength,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def get_timesteps(self, num_inference_steps, strength, device, is_text2img):
+        if is_text2img:
+            return self.scheduler.timesteps.to(device), num_inference_steps
+        else:
+            # get the original timestep using init_timestep
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+            t_start = max(num_inference_steps - init_timestep, 0)
+            timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+            return timesteps, num_inference_steps - t_start
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def prepare_latents(
+        self,
+        image,
+        timestep,
+        num_images_per_prompt,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if image is None:
+            batch_size = batch_size * num_images_per_prompt
+            shape = (
+                batch_size,
+                num_channels_latents,
+                int(height) // self.vae_scale_factor,
+                int(width) // self.vae_scale_factor,
+            )
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            if latents is None:
+                latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            else:
+                latents = latents.to(device)
+
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
+            return latents, None, None
+        else:
+            image = image.to(device=self.device, dtype=dtype)
+            init_latent_dist = self.vae.encode(image).latent_dist
+            init_latents = init_latent_dist.sample(generator=generator)
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+            # Expand init_latents for batch_size and num_images_per_prompt
+            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
+            init_latents_orig = init_latents
+
+            # add noise to latents using the timesteps
+            noise = randn_tensor(init_latents.shape, generator=generator, device=self.device, dtype=dtype)
+            init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+            latents = init_latents
+            return latents, init_latents_orig, noise
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        strength: float = 0.8,
+        num_images_per_prompt: Optional[int] = 1,
+        add_predicted_noise: Optional[bool] = False,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        is_cancelled_callback: Optional[Callable[[], bool]] = None,
+        clip_skip: Optional[int] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            mask_image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            add_predicted_noise (`bool`, *optional*, defaults to True):
+                Use predicted noise instead of random noise when constructing noisy versions of the original image in
+                the reverse diffusion process
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            is_cancelled_callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. If the function returns
+                `True`, the inference will be cancelled.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Returns:
+            `None` if cancelled by `is_cancelled_callback`,
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            max_embeddings_multiples,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+            lora_scale=lora_scale,
+        )
+        dtype = prompt_embeds.dtype
+
+        # 4. Preprocess image and mask
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess_image(image, batch_size)
+        if image is not None:
+            image = image.to(device=self.device, dtype=dtype)
+        if isinstance(mask_image, PIL.Image.Image):
+            mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor)
+        if mask_image is not None:
+            mask = mask_image.to(device=self.device, dtype=dtype)
+            mask = torch.cat([mask] * num_images_per_prompt)
+        else:
+            mask = None
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device, image is None)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        latents, init_latents_orig, noise = self.prepare_latents(
+            image,
+            latent_timestep,
+            num_images_per_prompt,
+            batch_size,
+            self.unet.config.in_channels,
+            height,
+            width,
+            dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                if mask is not None:
+                    # masking
+                    if add_predicted_noise:
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_orig, noise_pred_uncond, torch.tensor([t])
+                        )
+                    else:
+                        init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
+                    latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if i % callback_steps == 0:
+                        if callback is not None:
+                            step_idx = i // getattr(self.scheduler, "order", 1)
+                            callback(step_idx, t, latents)
+                        if is_cancelled_callback is not None and is_cancelled_callback():
+                            return None
+
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+        elif output_type == "pil":
+            # 9. Post-processing
+            image = self.decode_latents(latents)
+
+            # 10. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 11. Convert to PIL
+            image = self.numpy_to_pil(image)
+        else:
+            # 9. Post-processing
+            image = self.decode_latents(latents)
+
+            # 10. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return image, has_nsfw_concept
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def text2img(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        is_cancelled_callback: Optional[Callable[[], bool]] = None,
+        clip_skip=None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        Function for text-to-image generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            is_cancelled_callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. If the function returns
+                `True`, the inference will be cancelled.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Returns:
+            `None` if cancelled by `is_cancelled_callback`,
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            is_cancelled_callback=is_cancelled_callback,
+            clip_skip=clip_skip,
+            callback_steps=callback_steps,
+            cross_attention_kwargs=cross_attention_kwargs,
+        )
+
+    def img2img(
+        self,
+        image: Union[torch.Tensor, PIL.Image.Image],
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        is_cancelled_callback: Optional[Callable[[], bool]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        Function for image-to-image generation.
+        Args:
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            is_cancelled_callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. If the function returns
+                `True`, the inference will be cancelled.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Returns:
+            `None` if cancelled by `is_cancelled_callback`,
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            image=image,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            strength=strength,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            is_cancelled_callback=is_cancelled_callback,
+            callback_steps=callback_steps,
+            cross_attention_kwargs=cross_attention_kwargs,
+        )
+
+    def inpaint(
+        self,
+        image: Union[torch.Tensor, PIL.Image.Image],
+        mask_image: Union[torch.Tensor, PIL.Image.Image],
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        add_predicted_noise: Optional[bool] = False,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        is_cancelled_callback: Optional[Callable[[], bool]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        Function for inpaint.
+        Args:
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. This is the image whose masked region will be inpainted.
+            mask_image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
+                is 1, the denoising process will be run on the masked area for the full number of iterations specified
+                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
+                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
+                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            add_predicted_noise (`bool`, *optional*, defaults to True):
+                Use predicted noise instead of random noise when constructing noisy versions of the original image in
+                the reverse diffusion process
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            is_cancelled_callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. If the function returns
+                `True`, the inference will be cancelled.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Returns:
+            `None` if cancelled by `is_cancelled_callback`,
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            image=image,
+            mask_image=mask_image,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            strength=strength,
+            num_images_per_prompt=num_images_per_prompt,
+            add_predicted_noise=add_predicted_noise,
+            eta=eta,
+            generator=generator,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            is_cancelled_callback=is_cancelled_callback,
+            callback_steps=callback_steps,
+            cross_attention_kwargs=cross_attention_kwargs,
+        )
diff --git a/diffusers/examples/community/lpw_stable_diffusion_onnx.py b/diffusers/examples/community/lpw_stable_diffusion_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..87c2944dbc44a063b33c5a9be9ff7bbdbcb77544
--- /dev/null
+++ b/diffusers/examples/community/lpw_stable_diffusion_onnx.py
@@ -0,0 +1,1148 @@
+import inspect
+import re
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTokenizer
+
+import diffusers
+from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline, SchedulerMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.utils import logging
+
+
+try:
+    from diffusers.pipelines.onnx_utils import ORT_TO_NP_TYPE
+except ImportError:
+    ORT_TO_NP_TYPE = {
+        "tensor(bool)": np.bool_,
+        "tensor(int8)": np.int8,
+        "tensor(uint8)": np.uint8,
+        "tensor(int16)": np.int16,
+        "tensor(uint16)": np.uint16,
+        "tensor(int32)": np.int32,
+        "tensor(uint32)": np.uint32,
+        "tensor(int64)": np.int64,
+        "tensor(uint64)": np.uint64,
+        "tensor(float16)": np.float16,
+        "tensor(float)": np.float32,
+        "tensor(double)": np.float64,
+    }
+
+try:
+    from diffusers.utils import PIL_INTERPOLATION
+except ImportError:
+    if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+        PIL_INTERPOLATION = {
+            "linear": PIL.Image.Resampling.BILINEAR,
+            "bilinear": PIL.Image.Resampling.BILINEAR,
+            "bicubic": PIL.Image.Resampling.BICUBIC,
+            "lanczos": PIL.Image.Resampling.LANCZOS,
+            "nearest": PIL.Image.Resampling.NEAREST,
+        }
+    else:
+        PIL_INTERPOLATION = {
+            "linear": PIL.Image.LINEAR,
+            "bilinear": PIL.Image.BILINEAR,
+            "bicubic": PIL.Image.BICUBIC,
+            "lanczos": PIL.Image.LANCZOS,
+            "nearest": PIL.Image.NEAREST,
+        }
+# ------------------------------------------------------------------------------
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+re_attention = re.compile(
+    r"""
+\\\(|
+\\\)|
+\\\[|
+\\]|
+\\\\|
+\\|
+\(|
+\[|
+:([+-]?[.\d]+)\)|
+\)|
+]|
+[^\\()\[\]:]+|
+:
+""",
+    re.X,
+)
+
+
+def parse_prompt_attention(text):
+    """
+    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
+    Accepted tokens are:
+      (abc) - increases attention to abc by a multiplier of 1.1
+      (abc:3.12) - increases attention to abc by a multiplier of 3.12
+      [abc] - decreases attention to abc by a multiplier of 1.1
+      \\( - literal character '('
+      \\[ - literal character '['
+      \\) - literal character ')'
+      \\] - literal character ']'
+      \\ - literal character '\'
+      anything else - just text
+    >>> parse_prompt_attention('normal text')
+    [['normal text', 1.0]]
+    >>> parse_prompt_attention('an (important) word')
+    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
+    >>> parse_prompt_attention('(unbalanced')
+    [['unbalanced', 1.1]]
+    >>> parse_prompt_attention('\\(literal\\]')
+    [['(literal]', 1.0]]
+    >>> parse_prompt_attention('(unnecessary)(parens)')
+    [['unnecessaryparens', 1.1]]
+    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
+    [['a ', 1.0],
+     ['house', 1.5730000000000004],
+     [' ', 1.1],
+     ['on', 1.0],
+     [' a ', 1.1],
+     ['hill', 0.55],
+     [', sun, ', 1.1],
+     ['sky', 1.4641000000000006],
+     ['.', 1.1]]
+    """
+
+    res = []
+    round_brackets = []
+    square_brackets = []
+
+    round_bracket_multiplier = 1.1
+    square_bracket_multiplier = 1 / 1.1
+
+    def multiply_range(start_position, multiplier):
+        for p in range(start_position, len(res)):
+            res[p][1] *= multiplier
+
+    for m in re_attention.finditer(text):
+        text = m.group(0)
+        weight = m.group(1)
+
+        if text.startswith("\\"):
+            res.append([text[1:], 1.0])
+        elif text == "(":
+            round_brackets.append(len(res))
+        elif text == "[":
+            square_brackets.append(len(res))
+        elif weight is not None and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), float(weight))
+        elif text == ")" and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), round_bracket_multiplier)
+        elif text == "]" and len(square_brackets) > 0:
+            multiply_range(square_brackets.pop(), square_bracket_multiplier)
+        else:
+            res.append([text, 1.0])
+
+    for pos in round_brackets:
+        multiply_range(pos, round_bracket_multiplier)
+
+    for pos in square_brackets:
+        multiply_range(pos, square_bracket_multiplier)
+
+    if len(res) == 0:
+        res = [["", 1.0]]
+
+    # merge runs of identical weights
+    i = 0
+    while i + 1 < len(res):
+        if res[i][1] == res[i + 1][1]:
+            res[i][0] += res[i + 1][0]
+            res.pop(i + 1)
+        else:
+            i += 1
+
+    return res
+
+
+def get_prompts_with_weights(pipe, prompt: List[str], max_length: int):
+    r"""
+    Tokenize a list of prompts and return its tokens with weights of each token.
+
+    No padding, starting or ending token is included.
+    """
+    tokens = []
+    weights = []
+    truncated = False
+    for text in prompt:
+        texts_and_weights = parse_prompt_attention(text)
+        text_token = []
+        text_weight = []
+        for word, weight in texts_and_weights:
+            # tokenize and discard the starting and the ending token
+            token = pipe.tokenizer(word, return_tensors="np").input_ids[0, 1:-1]
+            text_token += list(token)
+            # copy the weight by length of token
+            text_weight += [weight] * len(token)
+            # stop if the text is too long (longer than truncation limit)
+            if len(text_token) > max_length:
+                truncated = True
+                break
+        # truncate
+        if len(text_token) > max_length:
+            truncated = True
+            text_token = text_token[:max_length]
+            text_weight = text_weight[:max_length]
+        tokens.append(text_token)
+        weights.append(text_weight)
+    if truncated:
+        logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
+    return tokens, weights
+
+
+def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77):
+    r"""
+    Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
+    """
+    max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
+    weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
+    for i in range(len(tokens)):
+        tokens[i] = [bos] + tokens[i] + [pad] * (max_length - 1 - len(tokens[i]) - 1) + [eos]
+        if no_boseos_middle:
+            weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
+        else:
+            w = []
+            if len(weights[i]) == 0:
+                w = [1.0] * weights_length
+            else:
+                for j in range(max_embeddings_multiples):
+                    w.append(1.0)  # weight for starting token in this chunk
+                    w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
+                    w.append(1.0)  # weight for ending token in this chunk
+                w += [1.0] * (weights_length - len(w))
+            weights[i] = w[:]
+
+    return tokens, weights
+
+
+def get_unweighted_text_embeddings(
+    pipe,
+    text_input: np.array,
+    chunk_length: int,
+    no_boseos_middle: Optional[bool] = True,
+):
+    """
+    When the length of tokens is a multiple of the capacity of the text encoder,
+    it should be split into chunks and sent to the text encoder individually.
+    """
+    max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
+    if max_embeddings_multiples > 1:
+        text_embeddings = []
+        for i in range(max_embeddings_multiples):
+            # extract the i-th chunk
+            text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].copy()
+
+            # cover the head and the tail by the starting and the ending tokens
+            text_input_chunk[:, 0] = text_input[0, 0]
+            text_input_chunk[:, -1] = text_input[0, -1]
+
+            text_embedding = pipe.text_encoder(input_ids=text_input_chunk)[0]
+
+            if no_boseos_middle:
+                if i == 0:
+                    # discard the ending token
+                    text_embedding = text_embedding[:, :-1]
+                elif i == max_embeddings_multiples - 1:
+                    # discard the starting token
+                    text_embedding = text_embedding[:, 1:]
+                else:
+                    # discard both starting and ending tokens
+                    text_embedding = text_embedding[:, 1:-1]
+
+            text_embeddings.append(text_embedding)
+        text_embeddings = np.concatenate(text_embeddings, axis=1)
+    else:
+        text_embeddings = pipe.text_encoder(input_ids=text_input)[0]
+    return text_embeddings
+
+
+def get_weighted_text_embeddings(
+    pipe,
+    prompt: Union[str, List[str]],
+    uncond_prompt: Optional[Union[str, List[str]]] = None,
+    max_embeddings_multiples: Optional[int] = 4,
+    no_boseos_middle: Optional[bool] = False,
+    skip_parsing: Optional[bool] = False,
+    skip_weighting: Optional[bool] = False,
+    **kwargs,
+):
+    r"""
+    Prompts can be assigned with local weights using brackets. For example,
+    prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
+    and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
+
+    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean.
+
+    Args:
+        pipe (`OnnxStableDiffusionPipeline`):
+            Pipe to provide access to the tokenizer and the text encoder.
+        prompt (`str` or `List[str]`):
+            The prompt or prompts to guide the image generation.
+        uncond_prompt (`str` or `List[str]`):
+            The unconditional prompt or prompts for guide the image generation. If unconditional prompt
+            is provided, the embeddings of prompt and uncond_prompt are concatenated.
+        max_embeddings_multiples (`int`, *optional*, defaults to `1`):
+            The max multiple length of prompt embeddings compared to the max output length of text encoder.
+        no_boseos_middle (`bool`, *optional*, defaults to `False`):
+            If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and
+            ending token in each of the chunk in the middle.
+        skip_parsing (`bool`, *optional*, defaults to `False`):
+            Skip the parsing of brackets.
+        skip_weighting (`bool`, *optional*, defaults to `False`):
+            Skip the weighting. When the parsing is skipped, it is forced True.
+    """
+    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
+    if isinstance(prompt, str):
+        prompt = [prompt]
+
+    if not skip_parsing:
+        prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
+        if uncond_prompt is not None:
+            if isinstance(uncond_prompt, str):
+                uncond_prompt = [uncond_prompt]
+            uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
+    else:
+        prompt_tokens = [
+            token[1:-1]
+            for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True, return_tensors="np").input_ids
+        ]
+        prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
+        if uncond_prompt is not None:
+            if isinstance(uncond_prompt, str):
+                uncond_prompt = [uncond_prompt]
+            uncond_tokens = [
+                token[1:-1]
+                for token in pipe.tokenizer(
+                    uncond_prompt,
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="np",
+                ).input_ids
+            ]
+            uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
+
+    # round up the longest length of tokens to a multiple of (model_max_length - 2)
+    max_length = max([len(token) for token in prompt_tokens])
+    if uncond_prompt is not None:
+        max_length = max(max_length, max([len(token) for token in uncond_tokens]))
+
+    max_embeddings_multiples = min(
+        max_embeddings_multiples,
+        (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
+    )
+    max_embeddings_multiples = max(1, max_embeddings_multiples)
+    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
+
+    # pad the length of tokens and weights
+    bos = pipe.tokenizer.bos_token_id
+    eos = pipe.tokenizer.eos_token_id
+    pad = getattr(pipe.tokenizer, "pad_token_id", eos)
+    prompt_tokens, prompt_weights = pad_tokens_and_weights(
+        prompt_tokens,
+        prompt_weights,
+        max_length,
+        bos,
+        eos,
+        pad,
+        no_boseos_middle=no_boseos_middle,
+        chunk_length=pipe.tokenizer.model_max_length,
+    )
+    prompt_tokens = np.array(prompt_tokens, dtype=np.int32)
+    if uncond_prompt is not None:
+        uncond_tokens, uncond_weights = pad_tokens_and_weights(
+            uncond_tokens,
+            uncond_weights,
+            max_length,
+            bos,
+            eos,
+            pad,
+            no_boseos_middle=no_boseos_middle,
+            chunk_length=pipe.tokenizer.model_max_length,
+        )
+        uncond_tokens = np.array(uncond_tokens, dtype=np.int32)
+
+    # get the embeddings
+    text_embeddings = get_unweighted_text_embeddings(
+        pipe,
+        prompt_tokens,
+        pipe.tokenizer.model_max_length,
+        no_boseos_middle=no_boseos_middle,
+    )
+    prompt_weights = np.array(prompt_weights, dtype=text_embeddings.dtype)
+    if uncond_prompt is not None:
+        uncond_embeddings = get_unweighted_text_embeddings(
+            pipe,
+            uncond_tokens,
+            pipe.tokenizer.model_max_length,
+            no_boseos_middle=no_boseos_middle,
+        )
+        uncond_weights = np.array(uncond_weights, dtype=uncond_embeddings.dtype)
+
+    # assign weights to the prompts and normalize in the sense of mean
+    # TODO: should we normalize by chunk or in a whole (current implementation)?
+    if (not skip_parsing) and (not skip_weighting):
+        previous_mean = text_embeddings.mean(axis=(-2, -1))
+        text_embeddings *= prompt_weights[:, :, None]
+        text_embeddings *= (previous_mean / text_embeddings.mean(axis=(-2, -1)))[:, None, None]
+        if uncond_prompt is not None:
+            previous_mean = uncond_embeddings.mean(axis=(-2, -1))
+            uncond_embeddings *= uncond_weights[:, :, None]
+            uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=(-2, -1)))[:, None, None]
+
+    # For classifier free guidance, we need to do two forward passes.
+    # Here we concatenate the unconditional and text embeddings into a single batch
+    # to avoid doing two forward passes
+    if uncond_prompt is not None:
+        return text_embeddings, uncond_embeddings
+
+    return text_embeddings
+
+
+def preprocess_image(image):
+    w, h = image.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask, scale_factor=8):
+    mask = mask.convert("L")
+    w, h = mask.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
+    mask = np.array(mask).astype(np.float32) / 255.0
+    mask = np.tile(mask, (4, 1, 1))
+    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
+    mask = 1 - mask  # repaint white, keep black
+    return mask
+
+
+class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
+    weighting in prompt.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    """
+
+    if version.parse(version.parse(diffusers.__version__).base_version) >= version.parse("0.9.0"):
+
+        def __init__(
+            self,
+            vae_encoder: OnnxRuntimeModel,
+            vae_decoder: OnnxRuntimeModel,
+            text_encoder: OnnxRuntimeModel,
+            tokenizer: CLIPTokenizer,
+            unet: OnnxRuntimeModel,
+            scheduler: SchedulerMixin,
+            safety_checker: OnnxRuntimeModel,
+            feature_extractor: CLIPImageProcessor,
+            requires_safety_checker: bool = True,
+        ):
+            super().__init__(
+                vae_encoder=vae_encoder,
+                vae_decoder=vae_decoder,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                unet=unet,
+                scheduler=scheduler,
+                safety_checker=safety_checker,
+                feature_extractor=feature_extractor,
+                requires_safety_checker=requires_safety_checker,
+            )
+            self.__init__additional__()
+
+    else:
+
+        def __init__(
+            self,
+            vae_encoder: OnnxRuntimeModel,
+            vae_decoder: OnnxRuntimeModel,
+            text_encoder: OnnxRuntimeModel,
+            tokenizer: CLIPTokenizer,
+            unet: OnnxRuntimeModel,
+            scheduler: SchedulerMixin,
+            safety_checker: OnnxRuntimeModel,
+            feature_extractor: CLIPImageProcessor,
+        ):
+            super().__init__(
+                vae_encoder=vae_encoder,
+                vae_decoder=vae_decoder,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                unet=unet,
+                scheduler=scheduler,
+                safety_checker=safety_checker,
+                feature_extractor=feature_extractor,
+            )
+            self.__init__additional__()
+
+    def __init__additional__(self):
+        self.unet.config.in_channels = 4
+        self.vae_scale_factor = 8
+
+    def _encode_prompt(
+        self,
+        prompt,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt,
+        max_embeddings_multiples,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+        elif isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt] * batch_size
+        if batch_size != len(negative_prompt):
+            raise ValueError(
+                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                " the batch size of `prompt`."
+            )
+
+        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
+            pipe=self,
+            prompt=prompt,
+            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
+            max_embeddings_multiples=max_embeddings_multiples,
+        )
+
+        text_embeddings = text_embeddings.repeat(num_images_per_prompt, 0)
+        if do_classifier_free_guidance:
+            uncond_embeddings = uncond_embeddings.repeat(num_images_per_prompt, 0)
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def check_inputs(self, prompt, height, width, strength, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def get_timesteps(self, num_inference_steps, strength, is_text2img):
+        if is_text2img:
+            return self.scheduler.timesteps, num_inference_steps
+        else:
+            # get the original timestep using init_timestep
+            offset = self.scheduler.config.get("steps_offset", 0)
+            init_timestep = int(num_inference_steps * strength) + offset
+            init_timestep = min(init_timestep, num_inference_steps)
+
+            t_start = max(num_inference_steps - init_timestep + offset, 0)
+            timesteps = self.scheduler.timesteps[t_start:]
+            return timesteps, num_inference_steps - t_start
+
+    def run_safety_checker(self, image):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # There will throw an error if use safety_checker directly and batchsize>1
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
+        image = np.concatenate(
+            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+        )
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def prepare_latents(self, image, timestep, batch_size, height, width, dtype, generator, latents=None):
+        if image is None:
+            shape = (
+                batch_size,
+                self.unet.config.in_channels,
+                height // self.vae_scale_factor,
+                width // self.vae_scale_factor,
+            )
+
+            if latents is None:
+                latents = torch.randn(shape, generator=generator, device="cpu").numpy().astype(dtype)
+            else:
+                if latents.shape != shape:
+                    raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = (torch.from_numpy(latents) * self.scheduler.init_noise_sigma).numpy()
+            return latents, None, None
+        else:
+            init_latents = self.vae_encoder(sample=image)[0]
+            init_latents = 0.18215 * init_latents
+            init_latents = np.concatenate([init_latents] * batch_size, axis=0)
+            init_latents_orig = init_latents
+            shape = init_latents.shape
+
+            # add noise to latents using the timesteps
+            noise = torch.randn(shape, generator=generator, device="cpu").numpy().astype(dtype)
+            latents = self.scheduler.add_noise(
+                torch.from_numpy(init_latents), torch.from_numpy(noise), timestep
+            ).numpy()
+            return latents, init_latents_orig, noise
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        image: Union[np.ndarray, PIL.Image.Image] = None,
+        mask_image: Union[np.ndarray, PIL.Image.Image] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        strength: float = 0.8,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[np.ndarray] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        is_cancelled_callback: Optional[Callable[[], bool]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            mask_image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`np.ndarray`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            is_cancelled_callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. If the function returns
+                `True`, the inference will be cancelled.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            `None` if cancelled by `is_cancelled_callback`,
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, strength, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            max_embeddings_multiples,
+        )
+        dtype = text_embeddings.dtype
+
+        # 4. Preprocess image and mask
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess_image(image)
+        if image is not None:
+            image = image.astype(dtype)
+        if isinstance(mask_image, PIL.Image.Image):
+            mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
+        if mask_image is not None:
+            mask = mask_image.astype(dtype)
+            mask = np.concatenate([mask] * batch_size * num_images_per_prompt)
+        else:
+            mask = None
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+        timestep_dtype = next(
+            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, image is None)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        latents, init_latents_orig, noise = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            dtype,
+            generator,
+            latents,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+            latent_model_input = latent_model_input.numpy()
+
+            # predict the noise residual
+            noise_pred = self.unet(
+                sample=latent_model_input,
+                timestep=np.array([t], dtype=timestep_dtype),
+                encoder_hidden_states=text_embeddings,
+            )
+            noise_pred = noise_pred[0]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            scheduler_output = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            )
+            latents = scheduler_output.prev_sample.numpy()
+
+            if mask is not None:
+                # masking
+                init_latents_proper = self.scheduler.add_noise(
+                    torch.from_numpy(init_latents_orig),
+                    torch.from_numpy(noise),
+                    t,
+                ).numpy()
+                latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+            # call the callback, if provided
+            if i % callback_steps == 0:
+                if callback is not None:
+                    step_idx = i // getattr(self.scheduler, "order", 1)
+                    callback(step_idx, t, latents)
+                if is_cancelled_callback is not None and is_cancelled_callback():
+                    return None
+
+        # 9. Post-processing
+        image = self.decode_latents(latents)
+
+        # 10. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image)
+
+        # 11. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return image, has_nsfw_concept
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def text2img(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[np.ndarray] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        Function for text-to-image generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`np.ndarray`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+    def img2img(
+        self,
+        image: Union[np.ndarray, PIL.Image.Image],
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        Function for image-to-image generation.
+        Args:
+            image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or ndarray representing an image batch, that will be used as the starting point for the
+                process.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            image=image,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            strength=strength,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+    def inpaint(
+        self,
+        image: Union[np.ndarray, PIL.Image.Image],
+        mask_image: Union[np.ndarray, PIL.Image.Image],
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        Function for inpaint.
+        Args:
+            image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. This is the image whose masked region will be inpainted.
+            mask_image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
+                is 1, the denoising process will be run on the masked area for the full number of iterations specified
+                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
+                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
+                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            image=image,
+            mask_image=mask_image,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            strength=strength,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
diff --git a/diffusers/examples/community/lpw_stable_diffusion_xl.py b/diffusers/examples/community/lpw_stable_diffusion_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..13d1e2a1156aed7a877cdaf77e00e97d97f9454e
--- /dev/null
+++ b/diffusers/examples/community/lpw_stable_diffusion_xl.py
@@ -0,0 +1,2243 @@
+## ----------------------------------------------------------
+# A SDXL pipeline can take unlimited weighted prompt
+#
+# Author: Andrew Zhu
+# GitHub: https://github.com/xhinker
+# Medium: https://medium.com/@xhinker
+## -----------------------------------------------------------
+
+import inspect
+import os
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from PIL import Image
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import DiffusionPipeline, StableDiffusionXLPipeline
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_accelerate_available,
+    is_accelerate_version,
+    is_invisible_watermark_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+if is_invisible_watermark_available():
+    from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+
+def parse_prompt_attention(text):
+    """
+    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
+    Accepted tokens are:
+      (abc) - increases attention to abc by a multiplier of 1.1
+      (abc:3.12) - increases attention to abc by a multiplier of 3.12
+      [abc] - decreases attention to abc by a multiplier of 1.1
+      \\( - literal character '('
+      \\[ - literal character '['
+      \\) - literal character ')'
+      \\] - literal character ']'
+      \\ - literal character '\'
+      anything else - just text
+
+    >>> parse_prompt_attention('normal text')
+    [['normal text', 1.0]]
+    >>> parse_prompt_attention('an (important) word')
+    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
+    >>> parse_prompt_attention('(unbalanced')
+    [['unbalanced', 1.1]]
+    >>> parse_prompt_attention('\\(literal\\]')
+    [['(literal]', 1.0]]
+    >>> parse_prompt_attention('(unnecessary)(parens)')
+    [['unnecessaryparens', 1.1]]
+    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
+    [['a ', 1.0],
+     ['house', 1.5730000000000004],
+     [' ', 1.1],
+     ['on', 1.0],
+     [' a ', 1.1],
+     ['hill', 0.55],
+     [', sun, ', 1.1],
+     ['sky', 1.4641000000000006],
+     ['.', 1.1]]
+    """
+    import re
+
+    re_attention = re.compile(
+        r"""
+            \\\(|\\\)|\\\[|\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|
+            \)|]|[^\\()\[\]:]+|:
+        """,
+        re.X,
+    )
+
+    re_break = re.compile(r"\s*\bBREAK\b\s*", re.S)
+
+    res = []
+    round_brackets = []
+    square_brackets = []
+
+    round_bracket_multiplier = 1.1
+    square_bracket_multiplier = 1 / 1.1
+
+    def multiply_range(start_position, multiplier):
+        for p in range(start_position, len(res)):
+            res[p][1] *= multiplier
+
+    for m in re_attention.finditer(text):
+        text = m.group(0)
+        weight = m.group(1)
+
+        if text.startswith("\\"):
+            res.append([text[1:], 1.0])
+        elif text == "(":
+            round_brackets.append(len(res))
+        elif text == "[":
+            square_brackets.append(len(res))
+        elif weight is not None and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), float(weight))
+        elif text == ")" and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), round_bracket_multiplier)
+        elif text == "]" and len(square_brackets) > 0:
+            multiply_range(square_brackets.pop(), square_bracket_multiplier)
+        else:
+            parts = re.split(re_break, text)
+            for i, part in enumerate(parts):
+                if i > 0:
+                    res.append(["BREAK", -1])
+                res.append([part, 1.0])
+
+    for pos in round_brackets:
+        multiply_range(pos, round_bracket_multiplier)
+
+    for pos in square_brackets:
+        multiply_range(pos, square_bracket_multiplier)
+
+    if len(res) == 0:
+        res = [["", 1.0]]
+
+    # merge runs of identical weights
+    i = 0
+    while i + 1 < len(res):
+        if res[i][1] == res[i + 1][1]:
+            res[i][0] += res[i + 1][0]
+            res.pop(i + 1)
+        else:
+            i += 1
+
+    return res
+
+
+def get_prompts_tokens_with_weights(clip_tokenizer: CLIPTokenizer, prompt: str):
+    """
+    Get prompt token ids and weights, this function works for both prompt and negative prompt
+
+    Args:
+        pipe (CLIPTokenizer)
+            A CLIPTokenizer
+        prompt (str)
+            A prompt string with weights
+
+    Returns:
+        text_tokens (list)
+            A list contains token ids
+        text_weight (list)
+            A list contains the correspondent weight of token ids
+
+    Example:
+        import torch
+        from transformers import CLIPTokenizer
+
+        clip_tokenizer = CLIPTokenizer.from_pretrained(
+            "stablediffusionapi/deliberate-v2"
+            , subfolder = "tokenizer"
+            , dtype = torch.float16
+        )
+
+        token_id_list, token_weight_list = get_prompts_tokens_with_weights(
+            clip_tokenizer = clip_tokenizer
+            ,prompt = "a (red:1.5) cat"*70
+        )
+    """
+    texts_and_weights = parse_prompt_attention(prompt)
+    text_tokens, text_weights = [], []
+    for word, weight in texts_and_weights:
+        # tokenize and discard the starting and the ending token
+        token = clip_tokenizer(word, truncation=False).input_ids[1:-1]  # so that tokenize whatever length prompt
+        # the returned token is a 1d list: [320, 1125, 539, 320]
+
+        # merge the new tokens to the all tokens holder: text_tokens
+        text_tokens = [*text_tokens, *token]
+
+        # each token chunk will come with one weight, like ['red cat', 2.0]
+        # need to expand weight for each token.
+        chunk_weights = [weight] * len(token)
+
+        # append the weight back to the weight holder: text_weights
+        text_weights = [*text_weights, *chunk_weights]
+    return text_tokens, text_weights
+
+
+def group_tokens_and_weights(token_ids: list, weights: list, pad_last_block=False):
+    """
+    Produce tokens and weights in groups and pad the missing tokens
+
+    Args:
+        token_ids (list)
+            The token ids from tokenizer
+        weights (list)
+            The weights list from function get_prompts_tokens_with_weights
+        pad_last_block (bool)
+            Control if fill the last token list to 75 tokens with eos
+    Returns:
+        new_token_ids (2d list)
+        new_weights (2d list)
+
+    Example:
+        token_groups,weight_groups = group_tokens_and_weights(
+            token_ids = token_id_list
+            , weights = token_weight_list
+        )
+    """
+    bos, eos = 49406, 49407
+
+    # this will be a 2d list
+    new_token_ids = []
+    new_weights = []
+    while len(token_ids) >= 75:
+        # get the first 75 tokens
+        head_75_tokens = [token_ids.pop(0) for _ in range(75)]
+        head_75_weights = [weights.pop(0) for _ in range(75)]
+
+        # extract token ids and weights
+        temp_77_token_ids = [bos] + head_75_tokens + [eos]
+        temp_77_weights = [1.0] + head_75_weights + [1.0]
+
+        # add 77 token and weights chunk to the holder list
+        new_token_ids.append(temp_77_token_ids)
+        new_weights.append(temp_77_weights)
+
+    # padding the left
+    if len(token_ids) > 0:
+        padding_len = 75 - len(token_ids) if pad_last_block else 0
+
+        temp_77_token_ids = [bos] + token_ids + [eos] * padding_len + [eos]
+        new_token_ids.append(temp_77_token_ids)
+
+        temp_77_weights = [1.0] + weights + [1.0] * padding_len + [1.0]
+        new_weights.append(temp_77_weights)
+
+    return new_token_ids, new_weights
+
+
+def get_weighted_text_embeddings_sdxl(
+    pipe: StableDiffusionXLPipeline,
+    prompt: str = "",
+    prompt_2: str = None,
+    neg_prompt: str = "",
+    neg_prompt_2: str = None,
+    num_images_per_prompt: int = 1,
+    device: Optional[torch.device] = None,
+    clip_skip: Optional[int] = None,
+    lora_scale: Optional[int] = None,
+):
+    """
+    This function can process long prompt with weights, no length limitation
+    for Stable Diffusion XL
+
+    Args:
+        pipe (StableDiffusionPipeline)
+        prompt (str)
+        prompt_2 (str)
+        neg_prompt (str)
+        neg_prompt_2 (str)
+        num_images_per_prompt (int)
+        device (torch.device)
+        clip_skip (int)
+    Returns:
+        prompt_embeds (torch.Tensor)
+        neg_prompt_embeds (torch.Tensor)
+    """
+    device = device or pipe._execution_device
+
+    # set lora scale so that monkey patched LoRA
+    # function of text encoder can correctly access it
+    if lora_scale is not None and isinstance(pipe, StableDiffusionXLLoraLoaderMixin):
+        pipe._lora_scale = lora_scale
+
+        # dynamically adjust the LoRA scale
+        if pipe.text_encoder is not None:
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(pipe.text_encoder, lora_scale)
+
+        if pipe.text_encoder_2 is not None:
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(pipe.text_encoder_2, lora_scale)
+            else:
+                scale_lora_layers(pipe.text_encoder_2, lora_scale)
+
+    if prompt_2:
+        prompt = f"{prompt} {prompt_2}"
+
+    if neg_prompt_2:
+        neg_prompt = f"{neg_prompt} {neg_prompt_2}"
+
+    prompt_t1 = prompt_t2 = prompt
+    neg_prompt_t1 = neg_prompt_t2 = neg_prompt
+
+    if isinstance(pipe, TextualInversionLoaderMixin):
+        prompt_t1 = pipe.maybe_convert_prompt(prompt_t1, pipe.tokenizer)
+        neg_prompt_t1 = pipe.maybe_convert_prompt(neg_prompt_t1, pipe.tokenizer)
+        prompt_t2 = pipe.maybe_convert_prompt(prompt_t2, pipe.tokenizer_2)
+        neg_prompt_t2 = pipe.maybe_convert_prompt(neg_prompt_t2, pipe.tokenizer_2)
+
+    eos = pipe.tokenizer.eos_token_id
+
+    # tokenizer 1
+    prompt_tokens, prompt_weights = get_prompts_tokens_with_weights(pipe.tokenizer, prompt_t1)
+    neg_prompt_tokens, neg_prompt_weights = get_prompts_tokens_with_weights(pipe.tokenizer, neg_prompt_t1)
+
+    # tokenizer 2
+    prompt_tokens_2, prompt_weights_2 = get_prompts_tokens_with_weights(pipe.tokenizer_2, prompt_t2)
+    neg_prompt_tokens_2, neg_prompt_weights_2 = get_prompts_tokens_with_weights(pipe.tokenizer_2, neg_prompt_t2)
+
+    # padding the shorter one for prompt set 1
+    prompt_token_len = len(prompt_tokens)
+    neg_prompt_token_len = len(neg_prompt_tokens)
+
+    if prompt_token_len > neg_prompt_token_len:
+        # padding the neg_prompt with eos token
+        neg_prompt_tokens = neg_prompt_tokens + [eos] * abs(prompt_token_len - neg_prompt_token_len)
+        neg_prompt_weights = neg_prompt_weights + [1.0] * abs(prompt_token_len - neg_prompt_token_len)
+    else:
+        # padding the prompt
+        prompt_tokens = prompt_tokens + [eos] * abs(prompt_token_len - neg_prompt_token_len)
+        prompt_weights = prompt_weights + [1.0] * abs(prompt_token_len - neg_prompt_token_len)
+
+    # padding the shorter one for token set 2
+    prompt_token_len_2 = len(prompt_tokens_2)
+    neg_prompt_token_len_2 = len(neg_prompt_tokens_2)
+
+    if prompt_token_len_2 > neg_prompt_token_len_2:
+        # padding the neg_prompt with eos token
+        neg_prompt_tokens_2 = neg_prompt_tokens_2 + [eos] * abs(prompt_token_len_2 - neg_prompt_token_len_2)
+        neg_prompt_weights_2 = neg_prompt_weights_2 + [1.0] * abs(prompt_token_len_2 - neg_prompt_token_len_2)
+    else:
+        # padding the prompt
+        prompt_tokens_2 = prompt_tokens_2 + [eos] * abs(prompt_token_len_2 - neg_prompt_token_len_2)
+        prompt_weights_2 = prompt_weights + [1.0] * abs(prompt_token_len_2 - neg_prompt_token_len_2)
+
+    embeds = []
+    neg_embeds = []
+
+    prompt_token_groups, prompt_weight_groups = group_tokens_and_weights(prompt_tokens.copy(), prompt_weights.copy())
+
+    neg_prompt_token_groups, neg_prompt_weight_groups = group_tokens_and_weights(
+        neg_prompt_tokens.copy(), neg_prompt_weights.copy()
+    )
+
+    prompt_token_groups_2, prompt_weight_groups_2 = group_tokens_and_weights(
+        prompt_tokens_2.copy(), prompt_weights_2.copy()
+    )
+
+    neg_prompt_token_groups_2, neg_prompt_weight_groups_2 = group_tokens_and_weights(
+        neg_prompt_tokens_2.copy(), neg_prompt_weights_2.copy()
+    )
+
+    # get prompt embeddings one by one is not working.
+    for i in range(len(prompt_token_groups)):
+        # get positive prompt embeddings with weights
+        token_tensor = torch.tensor([prompt_token_groups[i]], dtype=torch.long, device=device)
+        weight_tensor = torch.tensor(prompt_weight_groups[i], dtype=torch.float16, device=device)
+
+        token_tensor_2 = torch.tensor([prompt_token_groups_2[i]], dtype=torch.long, device=device)
+
+        # use first text encoder
+        prompt_embeds_1 = pipe.text_encoder(token_tensor.to(device), output_hidden_states=True)
+
+        # use second text encoder
+        prompt_embeds_2 = pipe.text_encoder_2(token_tensor_2.to(device), output_hidden_states=True)
+        pooled_prompt_embeds = prompt_embeds_2[0]
+
+        if clip_skip is None:
+            prompt_embeds_1_hidden_states = prompt_embeds_1.hidden_states[-2]
+            prompt_embeds_2_hidden_states = prompt_embeds_2.hidden_states[-2]
+        else:
+            # "2" because SDXL always indexes from the penultimate layer.
+            prompt_embeds_1_hidden_states = prompt_embeds_1.hidden_states[-(clip_skip + 2)]
+            prompt_embeds_2_hidden_states = prompt_embeds_2.hidden_states[-(clip_skip + 2)]
+
+        prompt_embeds_list = [prompt_embeds_1_hidden_states, prompt_embeds_2_hidden_states]
+        token_embedding = torch.concat(prompt_embeds_list, dim=-1).squeeze(0)
+
+        for j in range(len(weight_tensor)):
+            if weight_tensor[j] != 1.0:
+                token_embedding[j] = (
+                    token_embedding[-1] + (token_embedding[j] - token_embedding[-1]) * weight_tensor[j]
+                )
+
+        token_embedding = token_embedding.unsqueeze(0)
+        embeds.append(token_embedding)
+
+        # get negative prompt embeddings with weights
+        neg_token_tensor = torch.tensor([neg_prompt_token_groups[i]], dtype=torch.long, device=device)
+        neg_token_tensor_2 = torch.tensor([neg_prompt_token_groups_2[i]], dtype=torch.long, device=device)
+        neg_weight_tensor = torch.tensor(neg_prompt_weight_groups[i], dtype=torch.float16, device=device)
+
+        # use first text encoder
+        neg_prompt_embeds_1 = pipe.text_encoder(neg_token_tensor.to(device), output_hidden_states=True)
+        neg_prompt_embeds_1_hidden_states = neg_prompt_embeds_1.hidden_states[-2]
+
+        # use second text encoder
+        neg_prompt_embeds_2 = pipe.text_encoder_2(neg_token_tensor_2.to(device), output_hidden_states=True)
+        neg_prompt_embeds_2_hidden_states = neg_prompt_embeds_2.hidden_states[-2]
+        negative_pooled_prompt_embeds = neg_prompt_embeds_2[0]
+
+        neg_prompt_embeds_list = [neg_prompt_embeds_1_hidden_states, neg_prompt_embeds_2_hidden_states]
+        neg_token_embedding = torch.concat(neg_prompt_embeds_list, dim=-1).squeeze(0)
+
+        for z in range(len(neg_weight_tensor)):
+            if neg_weight_tensor[z] != 1.0:
+                neg_token_embedding[z] = (
+                    neg_token_embedding[-1] + (neg_token_embedding[z] - neg_token_embedding[-1]) * neg_weight_tensor[z]
+                )
+
+        neg_token_embedding = neg_token_embedding.unsqueeze(0)
+        neg_embeds.append(neg_token_embedding)
+
+    prompt_embeds = torch.cat(embeds, dim=1)
+    negative_prompt_embeds = torch.cat(neg_embeds, dim=1)
+
+    bs_embed, seq_len, _ = prompt_embeds.shape
+    # duplicate text embeddings for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+    seq_len = negative_prompt_embeds.shape[1]
+    negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+    pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1).view(
+        bs_embed * num_images_per_prompt, -1
+    )
+    negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1).view(
+        bs_embed * num_images_per_prompt, -1
+    )
+
+    if pipe.text_encoder is not None:
+        if isinstance(pipe, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(pipe.text_encoder, lora_scale)
+
+    if pipe.text_encoder_2 is not None:
+        if isinstance(pipe, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(pipe.text_encoder_2, lora_scale)
+
+    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+
+# -------------------------------------------------------------------------------------------------------------------------------
+# reuse the backbone code from StableDiffusionXLPipeline
+# -------------------------------------------------------------------------------------------------------------------------------
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipe = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0"
+            , torch_dtype       = torch.float16
+            , use_safetensors   = True
+            , variant           = "fp16"
+            , custom_pipeline   = "lpw_stable_diffusion_xl",
+        )
+
+        prompt = "a white cat running on the grass"*20
+        prompt2 = "play a football"*20
+        prompt = f"{prompt},{prompt2}"
+        neg_prompt = "blur, low quality"
+
+        pipe.to("cuda")
+        images = pipe(
+            prompt                  = prompt
+            , negative_prompt       = neg_prompt
+        ).images[0]
+
+        pipe.to("cpu")
+        torch.cuda.empty_cache()
+        images
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class SDXLLongPromptWeightingPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        feature_extractor: Optional[CLIPImageProcessor] = None,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+        )
+        self.default_sample_size = self.unet.config.sample_size
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    def enable_model_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        """
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu", silence_dtype_warnings=True)
+            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        model_sequence = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+        model_sequence.extend([self.unet, self.vae])
+
+        hook = None
+        for cpu_offloaded_model in model_sequence:
+            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+        # We'll offload the last model manually.
+        self.final_offload_hook = hook
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(
+                    text_input_ids.to(device),
+                    output_hidden_states=True,
+                )
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                prompt_embeds = prompt_embeds.hidden_states[-2]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        strength,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+        else:
+            t_start = 0
+
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        # Strength is irrelevant if we directly request a timestep to start at;
+        # that is, strength is determined by the denoising_start instead.
+        if denoising_start is not None:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            timesteps = timesteps[-num_inference_steps:]
+            return timesteps, num_inference_steps
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(
+        self,
+        image,
+        mask,
+        width,
+        height,
+        num_channels_latents,
+        timestep,
+        batch_size,
+        num_images_per_prompt,
+        dtype,
+        device,
+        generator=None,
+        add_noise=True,
+        latents=None,
+        is_strength_max=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        batch_size *= num_images_per_prompt
+
+        if image is None:
+            shape = (
+                batch_size,
+                num_channels_latents,
+                int(height) // self.vae_scale_factor,
+                int(width) // self.vae_scale_factor,
+            )
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            if latents is None:
+                latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            else:
+                latents = latents.to(device)
+
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
+            return latents
+
+        elif mask is None:
+            if not isinstance(image, (torch.Tensor, Image.Image, list)):
+                raise ValueError(
+                    f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+                )
+
+            # Offload text encoder if `enable_model_cpu_offload` was enabled
+            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+                self.text_encoder_2.to("cpu")
+                torch.cuda.empty_cache()
+
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 4:
+                init_latents = image
+
+            else:
+                # make sure the VAE is in float32 mode, as it overflows in float16
+                if self.vae.config.force_upcast:
+                    image = image.float()
+                    self.vae.to(dtype=torch.float32)
+
+                if isinstance(generator, list) and len(generator) != batch_size:
+                    raise ValueError(
+                        f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                        f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                    )
+
+                elif isinstance(generator, list):
+                    init_latents = [
+                        retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                        for i in range(batch_size)
+                    ]
+                    init_latents = torch.cat(init_latents, dim=0)
+                else:
+                    init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+                if self.vae.config.force_upcast:
+                    self.vae.to(dtype)
+
+                init_latents = init_latents.to(dtype)
+                init_latents = self.vae.config.scaling_factor * init_latents
+
+            if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+                # expand init_latents for batch_size
+                additional_image_per_prompt = batch_size // init_latents.shape[0]
+                init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+            elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+                )
+            else:
+                init_latents = torch.cat([init_latents], dim=0)
+
+            if add_noise:
+                shape = init_latents.shape
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                # get latents
+                init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+            latents = init_latents
+            return latents
+
+        else:
+            shape = (
+                batch_size,
+                num_channels_latents,
+                int(height) // self.vae_scale_factor,
+                int(width) // self.vae_scale_factor,
+            )
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            if (image is None or timestep is None) and not is_strength_max:
+                raise ValueError(
+                    "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                    "However, either the image or the noise timestep has not been provided."
+                )
+
+            if image.shape[1] == 4:
+                image_latents = image.to(device=device, dtype=dtype)
+                image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+            elif return_image_latents or (latents is None and not is_strength_max):
+                image = image.to(device=device, dtype=dtype)
+                image_latents = self._encode_vae_image(image=image, generator=generator)
+                image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+            if latents is None and add_noise:
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                # if strength is 1. then initialise the latents to noise, else initial to image + noise
+                latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+                # if pure noise then scale the initial latents by the  Scheduler's init sigma
+                latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+            elif add_noise:
+                noise = latents.to(device)
+                latents = noise * self.scheduler.init_noise_sigma
+            else:
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                latents = image_latents.to(device)
+
+            outputs = (latents,)
+
+            if return_noise:
+                outputs += (noise,)
+
+            if return_image_latents:
+                outputs += (image_latents,)
+
+            return outputs
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        dtype = image.dtype
+        if self.vae.config.force_upcast:
+            image = image.float()
+            self.vae.to(dtype=torch.float32)
+
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        if self.vae.config.force_upcast:
+            self.vae.to(dtype)
+
+        image_latents = image_latents.to(dtype)
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+
+        if masked_image is not None and masked_image.shape[1] == 4:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = None
+
+        if masked_image is not None:
+            if masked_image_latents is None:
+                masked_image = masked_image.to(device=device, dtype=dtype)
+                masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+
+            if masked_image_latents.shape[0] < batch_size:
+                if not batch_size % masked_image_latents.shape[0] == 0:
+                    raise ValueError(
+                        "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                        f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                        " Make sure the number of images that you pass is divisible by the total requested batch size."
+                    )
+                masked_image_latents = masked_image_latents.repeat(
+                    batch_size // masked_image_latents.shape[0], 1, 1, 1
+                )
+
+            masked_image_latents = (
+                torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+            )
+
+            # aligning device to prevent device errors when concating it with the latent model input
+            masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        return mask, masked_image_latents
+
+    def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (AttnProcessor2_0, XFormersAttnProcessor),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def denoising_start(self):
+        return self._denoising_start
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: str = None,
+        prompt_2: Optional[str] = None,
+        image: Optional[PipelineImageInput] = None,
+        mask_image: Optional[PipelineImageInput] = None,
+        masked_image_latents: Optional[torch.Tensor] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str`):
+                The prompt  to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str`):
+                The prompt to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`PipelineImageInput`, *optional*):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            mask_image (`PipelineImageInput`, *optional*):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refine Image
+                Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality).
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
+                denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
+                final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
+                forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refine Image
+                Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality).
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str`):
+                The prompt not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str`):
+                The prompt not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            strength,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
+            )
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self._cross_attention_kwargs.get("scale", None) if self._cross_attention_kwargs is not None else None
+        )
+
+        negative_prompt = negative_prompt if negative_prompt is not None else ""
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = get_weighted_text_embeddings_sdxl(
+            pipe=self,
+            prompt=prompt,
+            neg_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            clip_skip=clip_skip,
+            lora_scale=lora_scale,
+        )
+        dtype = prompt_embeds.dtype
+
+        if isinstance(image, Image.Image):
+            image = self.image_processor.preprocess(image, height=height, width=width)
+        if image is not None:
+            image = image.to(device=self.device, dtype=dtype)
+
+        if isinstance(mask_image, Image.Image):
+            mask = self.mask_processor.preprocess(mask_image, height=height, width=width)
+        else:
+            mask = mask_image
+        if mask_image is not None:
+            mask = mask.to(device=self.device, dtype=dtype)
+
+            if masked_image_latents is not None:
+                masked_image = masked_image_latents
+            elif image.shape[1] == 4:
+                # if image is in latent space, we can't mask it
+                masked_image = None
+            else:
+                masked_image = image * (mask < 0.5)
+        else:
+            mask = None
+
+        # 4. Prepare timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        if image is not None:
+            timesteps, num_inference_steps = self.get_timesteps(
+                num_inference_steps,
+                strength,
+                device,
+                denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
+            )
+
+            # check that number of inference steps is not < 1 - as this doesn't make sense
+            if num_inference_steps < 1:
+                raise ValueError(
+                    f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                    f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+                )
+
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        is_strength_max = strength == 1.0
+        add_noise = True if self.denoising_start is None else False
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
+        latents = self.prepare_latents(
+            image=image,
+            mask=mask,
+            width=width,
+            height=height,
+            num_channels_latents=num_channels_unet,
+            timestep=latent_timestep,
+            batch_size=batch_size,
+            num_images_per_prompt=num_images_per_prompt,
+            dtype=prompt_embeds.dtype,
+            device=device,
+            generator=generator,
+            add_noise=add_noise,
+            latents=latents,
+            is_strength_max=is_strength_max,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if mask is not None:
+            if return_image_latents:
+                latents, noise, image_latents = latents
+            else:
+                latents, noise = latents
+
+        # 5.1 Prepare mask latent variables
+        if mask is not None:
+            mask, masked_image_latents = self.prepare_mask_latents(
+                mask=mask,
+                masked_image=masked_image,
+                batch_size=batch_size * num_images_per_prompt,
+                height=height,
+                width=width,
+                dtype=prompt_embeds.dtype,
+                device=device,
+                generator=generator,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+            # Check that sizes of mask, masked image and latents match
+            if num_channels_unet == 9:
+                # default case for runwayml/stable-diffusion-inpainting
+                num_channels_mask = mask.shape[1]
+                num_channels_masked_image = masked_image_latents.shape[1]
+                if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet:
+                    raise ValueError(
+                        f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                        f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                        f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                        f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                        " `pipeline.unet` or your `mask_image` or `image` input."
+                    )
+            elif num_channels_unet != 4:
+                raise ValueError(
+                    f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+                )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else {}
+
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        add_time_ids = self._get_add_time_ids(
+            original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
+        )
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 7.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and self.denoising_start is not None
+            and denoising_value_valid(self.denoising_end)
+            and denoising_value_valid(self.denoising_start)
+            and self.denoising_start >= self.denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {self.denoising_end} when using type float."
+            )
+        elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 8. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+
+        # 9. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if mask is not None and num_channels_unet == 9:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                # predict the noise residual
+                added_cond_kwargs.update({"text_embeds": add_text_embeds, "time_ids": add_time_ids})
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if mask is not None and num_channels_unet == 4:
+                    init_latents_proper = image_latents
+
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+            return StableDiffusionXLPipelineOutput(images=image)
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
+
+    def text2img(
+        self,
+        prompt: str = None,
+        prompt_2: Optional[str] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling pipeline for text-to-image.
+
+        Refer to the documentation of the `__call__` method for parameter descriptions.
+        """
+        return self.__call__(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            denoising_start=denoising_start,
+            denoising_end=denoising_end,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            ip_adapter_image=ip_adapter_image,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            output_type=output_type,
+            return_dict=return_dict,
+            cross_attention_kwargs=cross_attention_kwargs,
+            guidance_rescale=guidance_rescale,
+            original_size=original_size,
+            crops_coords_top_left=crops_coords_top_left,
+            target_size=target_size,
+            clip_skip=clip_skip,
+            callback_on_step_end=callback_on_step_end,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            **kwargs,
+        )
+
+    def img2img(
+        self,
+        prompt: str = None,
+        prompt_2: Optional[str] = None,
+        image: Optional[PipelineImageInput] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling pipeline for image-to-image.
+
+        Refer to the documentation of the `__call__` method for parameter descriptions.
+        """
+        return self.__call__(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            image=image,
+            height=height,
+            width=width,
+            strength=strength,
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            denoising_start=denoising_start,
+            denoising_end=denoising_end,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            ip_adapter_image=ip_adapter_image,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            output_type=output_type,
+            return_dict=return_dict,
+            cross_attention_kwargs=cross_attention_kwargs,
+            guidance_rescale=guidance_rescale,
+            original_size=original_size,
+            crops_coords_top_left=crops_coords_top_left,
+            target_size=target_size,
+            clip_skip=clip_skip,
+            callback_on_step_end=callback_on_step_end,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            **kwargs,
+        )
+
+    def inpaint(
+        self,
+        prompt: str = None,
+        prompt_2: Optional[str] = None,
+        image: Optional[PipelineImageInput] = None,
+        mask_image: Optional[PipelineImageInput] = None,
+        masked_image_latents: Optional[torch.Tensor] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling pipeline for inpainting.
+
+        Refer to the documentation of the `__call__` method for parameter descriptions.
+        """
+        return self.__call__(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            image=image,
+            mask_image=mask_image,
+            masked_image_latents=masked_image_latents,
+            height=height,
+            width=width,
+            strength=strength,
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            denoising_start=denoising_start,
+            denoising_end=denoising_end,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            ip_adapter_image=ip_adapter_image,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            output_type=output_type,
+            return_dict=return_dict,
+            cross_attention_kwargs=cross_attention_kwargs,
+            guidance_rescale=guidance_rescale,
+            original_size=original_size,
+            crops_coords_top_left=crops_coords_top_left,
+            target_size=target_size,
+            clip_skip=clip_skip,
+            callback_on_step_end=callback_on_step_end,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            **kwargs,
+        )
+
+    # Override to properly handle the loading and unloading of the additional text encoder.
+    def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
+        # We could have accessed the unet config from `lora_state_dict()` too. We pass
+        # it here explicitly to be able to tell that it's coming from an SDXL
+        # pipeline.
+        state_dict, network_alphas = self.lora_state_dict(
+            pretrained_model_name_or_path_or_dict,
+            unet_config=self.unet.config,
+            **kwargs,
+        )
+        self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet)
+
+        text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k}
+        if len(text_encoder_state_dict) > 0:
+            self.load_lora_into_text_encoder(
+                text_encoder_state_dict,
+                network_alphas=network_alphas,
+                text_encoder=self.text_encoder,
+                prefix="text_encoder",
+                lora_scale=self.lora_scale,
+            )
+
+        text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k}
+        if len(text_encoder_2_state_dict) > 0:
+            self.load_lora_into_text_encoder(
+                text_encoder_2_state_dict,
+                network_alphas=network_alphas,
+                text_encoder=self.text_encoder_2,
+                prefix="text_encoder_2",
+                lora_scale=self.lora_scale,
+            )
+
+    @classmethod
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = False,
+    ):
+        state_dict = {}
+
+        def pack_weights(layers, prefix):
+            layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers
+            layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()}
+            return layers_state_dict
+
+        state_dict.update(pack_weights(unet_lora_layers, "unet"))
+
+        if text_encoder_lora_layers and text_encoder_2_lora_layers:
+            state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder"))
+            state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2"))
+
+        cls.write_lora_layers(
+            state_dict=state_dict,
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+
+    def _remove_text_encoder_monkey_patch(self):
+        self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder)
+        self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2)
diff --git a/diffusers/examples/community/magic_mix.py b/diffusers/examples/community/magic_mix.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3d118f84bfcdcdabad76a074515efec42f20bb7
--- /dev/null
+++ b/diffusers/examples/community/magic_mix.py
@@ -0,0 +1,152 @@
+from typing import Union
+
+import torch
+from PIL import Image
+from torchvision import transforms as tfms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+
+
+class MagicMixPipeline(DiffusionPipeline):
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler],
+    ):
+        super().__init__()
+
+        self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+    # convert PIL image to latents
+    def encode(self, img):
+        with torch.no_grad():
+            latent = self.vae.encode(tfms.ToTensor()(img).unsqueeze(0).to(self.device) * 2 - 1)
+            latent = 0.18215 * latent.latent_dist.sample()
+        return latent
+
+    # convert latents to PIL image
+    def decode(self, latent):
+        latent = (1 / 0.18215) * latent
+        with torch.no_grad():
+            img = self.vae.decode(latent).sample
+        img = (img / 2 + 0.5).clamp(0, 1)
+        img = img.detach().cpu().permute(0, 2, 3, 1).numpy()
+        img = (img * 255).round().astype("uint8")
+        return Image.fromarray(img[0])
+
+    # convert prompt into text embeddings, also unconditional embeddings
+    def prep_text(self, prompt):
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_embedding = self.text_encoder(text_input.input_ids.to(self.device))[0]
+
+        uncond_input = self.tokenizer(
+            "",
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        uncond_embedding = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+        return torch.cat([uncond_embedding, text_embedding])
+
+    def __call__(
+        self,
+        img: Image.Image,
+        prompt: str,
+        kmin: float = 0.3,
+        kmax: float = 0.6,
+        mix_factor: float = 0.5,
+        seed: int = 42,
+        steps: int = 50,
+        guidance_scale: float = 7.5,
+    ) -> Image.Image:
+        tmin = steps - int(kmin * steps)
+        tmax = steps - int(kmax * steps)
+
+        text_embeddings = self.prep_text(prompt)
+
+        self.scheduler.set_timesteps(steps)
+
+        width, height = img.size
+        encoded = self.encode(img)
+
+        torch.manual_seed(seed)
+        noise = torch.randn(
+            (1, self.unet.config.in_channels, height // 8, width // 8),
+        ).to(self.device)
+
+        latents = self.scheduler.add_noise(
+            encoded,
+            noise,
+            timesteps=self.scheduler.timesteps[tmax],
+        )
+
+        input = torch.cat([latents] * 2)
+
+        input = self.scheduler.scale_model_input(input, self.scheduler.timesteps[tmax])
+
+        with torch.no_grad():
+            pred = self.unet(
+                input,
+                self.scheduler.timesteps[tmax],
+                encoder_hidden_states=text_embeddings,
+            ).sample
+
+        pred_uncond, pred_text = pred.chunk(2)
+        pred = pred_uncond + guidance_scale * (pred_text - pred_uncond)
+
+        latents = self.scheduler.step(pred, self.scheduler.timesteps[tmax], latents).prev_sample
+
+        for i, t in enumerate(tqdm(self.scheduler.timesteps)):
+            if i > tmax:
+                if i < tmin:  # layout generation phase
+                    orig_latents = self.scheduler.add_noise(
+                        encoded,
+                        noise,
+                        timesteps=t,
+                    )
+
+                    input = (
+                        (mix_factor * latents) + (1 - mix_factor) * orig_latents
+                    )  # interpolating between layout noise and conditionally generated noise to preserve layout sematics
+                    input = torch.cat([input] * 2)
+
+                else:  # content generation phase
+                    input = torch.cat([latents] * 2)
+
+                input = self.scheduler.scale_model_input(input, t)
+
+                with torch.no_grad():
+                    pred = self.unet(
+                        input,
+                        t,
+                        encoder_hidden_states=text_embeddings,
+                    ).sample
+
+                pred_uncond, pred_text = pred.chunk(2)
+                pred = pred_uncond + guidance_scale * (pred_text - pred_uncond)
+
+                latents = self.scheduler.step(pred, t, latents).prev_sample
+
+        return self.decode(latents)
diff --git a/diffusers/examples/community/marigold_depth_estimation.py b/diffusers/examples/community/marigold_depth_estimation.py
new file mode 100644
index 0000000000000000000000000000000000000000..92f01d046ef927edc497d284ee9b8248c8c2c705
--- /dev/null
+++ b/diffusers/examples/community/marigold_depth_estimation.py
@@ -0,0 +1,674 @@
+# Copyright 2024 Bingxin Ke, ETH Zurich and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
+# More information about the method can be found at https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+
+
+import logging
+import math
+from typing import Dict, Union
+
+import matplotlib
+import numpy as np
+import torch
+from PIL import Image
+from PIL.Image import Resampling
+from scipy.optimize import minimize
+from torch.utils.data import DataLoader, TensorDataset
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    LCMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.utils import BaseOutput, check_min_version
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+
+class MarigoldDepthOutput(BaseOutput):
+    """
+    Output class for Marigold monocular depth prediction pipeline.
+
+    Args:
+        depth_np (`np.ndarray`):
+            Predicted depth map, with depth values in the range of [0, 1].
+        depth_colored (`None` or `PIL.Image.Image`):
+            Colorized depth map, with the shape of [3, H, W] and values in [0, 1].
+        uncertainty (`None` or `np.ndarray`):
+            Uncalibrated uncertainty(MAD, median absolute deviation) coming from ensembling.
+    """
+
+    depth_np: np.ndarray
+    depth_colored: Union[None, Image.Image]
+    uncertainty: Union[None, np.ndarray]
+
+
+def get_pil_resample_method(method_str: str) -> Resampling:
+    resample_method_dic = {
+        "bilinear": Resampling.BILINEAR,
+        "bicubic": Resampling.BICUBIC,
+        "nearest": Resampling.NEAREST,
+    }
+    resample_method = resample_method_dic.get(method_str, None)
+    if resample_method is None:
+        raise ValueError(f"Unknown resampling method: {resample_method}")
+    else:
+        return resample_method
+
+
+class MarigoldPipeline(DiffusionPipeline):
+    """
+    Pipeline for monocular depth estimation using Marigold: https://marigoldmonodepth.github.io.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        unet (`UNet2DConditionModel`):
+            Conditional U-Net to denoise the depth latent, conditioned on image latent.
+        vae (`AutoencoderKL`):
+            Variational Auto-Encoder (VAE) Model to encode and decode images and depth maps
+            to and from latent representations.
+        scheduler (`DDIMScheduler`):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+        text_encoder (`CLIPTextModel`):
+            Text-encoder, for empty text embedding.
+        tokenizer (`CLIPTokenizer`):
+            CLIP tokenizer.
+    """
+
+    rgb_latent_scale_factor = 0.18215
+    depth_latent_scale_factor = 0.18215
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        scheduler: DDIMScheduler,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            vae=vae,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+
+        self.empty_text_embed = None
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        input_image: Image,
+        denoising_steps: int = 10,
+        ensemble_size: int = 10,
+        processing_res: int = 768,
+        match_input_res: bool = True,
+        resample_method: str = "bilinear",
+        batch_size: int = 0,
+        seed: Union[int, None] = None,
+        color_map: str = "Spectral",
+        show_progress_bar: bool = True,
+        ensemble_kwargs: Dict = None,
+    ) -> MarigoldDepthOutput:
+        """
+        Function invoked when calling the pipeline.
+
+        Args:
+            input_image (`Image`):
+                Input RGB (or gray-scale) image.
+            processing_res (`int`, *optional*, defaults to `768`):
+                Maximum resolution of processing.
+                If set to 0: will not resize at all.
+            match_input_res (`bool`, *optional*, defaults to `True`):
+                Resize depth prediction to match input resolution.
+                Only valid if `processing_res` > 0.
+            resample_method: (`str`, *optional*, defaults to `bilinear`):
+                Resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`.
+            denoising_steps (`int`, *optional*, defaults to `10`):
+                Number of diffusion denoising steps (DDIM) during inference.
+            ensemble_size (`int`, *optional*, defaults to `10`):
+                Number of predictions to be ensembled.
+            batch_size (`int`, *optional*, defaults to `0`):
+                Inference batch size, no bigger than `num_ensemble`.
+                If set to 0, the script will automatically decide the proper batch size.
+            seed (`int`, *optional*, defaults to `None`)
+                Reproducibility seed.
+            show_progress_bar (`bool`, *optional*, defaults to `True`):
+                Display a progress bar of diffusion denoising.
+            color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized depth map generation):
+                Colormap used to colorize the depth map.
+            ensemble_kwargs (`dict`, *optional*, defaults to `None`):
+                Arguments for detailed ensembling settings.
+        Returns:
+            `MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including:
+            - **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1]
+            - **depth_colored** (`PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and values in [0, 1], None if `color_map` is `None`
+            - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation)
+                    coming from ensembling. None if `ensemble_size = 1`
+        """
+
+        device = self.device
+        input_size = input_image.size
+
+        if not match_input_res:
+            assert processing_res is not None, "Value error: `resize_output_back` is only valid with "
+        assert processing_res >= 0
+        assert ensemble_size >= 1
+
+        # Check if denoising step is reasonable
+        self._check_inference_step(denoising_steps)
+
+        resample_method: Resampling = get_pil_resample_method(resample_method)
+
+        # ----------------- Image Preprocess -----------------
+        # Resize image
+        if processing_res > 0:
+            input_image = self.resize_max_res(
+                input_image,
+                max_edge_resolution=processing_res,
+                resample_method=resample_method,
+            )
+        # Convert the image to RGB, to 1.remove the alpha channel 2.convert B&W to 3-channel
+        input_image = input_image.convert("RGB")
+        image = np.asarray(input_image)
+
+        # Normalize rgb values
+        rgb = np.transpose(image, (2, 0, 1))  # [H, W, rgb] -> [rgb, H, W]
+        rgb_norm = rgb / 255.0 * 2.0 - 1.0  #  [0, 255] -> [-1, 1]
+        rgb_norm = torch.from_numpy(rgb_norm).to(self.dtype)
+        rgb_norm = rgb_norm.to(device)
+        assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
+
+        # ----------------- Predicting depth -----------------
+        # Batch repeated input image
+        duplicated_rgb = torch.stack([rgb_norm] * ensemble_size)
+        single_rgb_dataset = TensorDataset(duplicated_rgb)
+        if batch_size > 0:
+            _bs = batch_size
+        else:
+            _bs = self._find_batch_size(
+                ensemble_size=ensemble_size,
+                input_res=max(rgb_norm.shape[1:]),
+                dtype=self.dtype,
+            )
+
+        single_rgb_loader = DataLoader(single_rgb_dataset, batch_size=_bs, shuffle=False)
+
+        # Predict depth maps (batched)
+        depth_pred_ls = []
+        if show_progress_bar:
+            iterable = tqdm(single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False)
+        else:
+            iterable = single_rgb_loader
+        for batch in iterable:
+            (batched_img,) = batch
+            depth_pred_raw = self.single_infer(
+                rgb_in=batched_img,
+                num_inference_steps=denoising_steps,
+                show_pbar=show_progress_bar,
+                seed=seed,
+            )
+            depth_pred_ls.append(depth_pred_raw.detach())
+        depth_preds = torch.concat(depth_pred_ls, dim=0).squeeze()
+        torch.cuda.empty_cache()  # clear vram cache for ensembling
+
+        # ----------------- Test-time ensembling -----------------
+        if ensemble_size > 1:
+            depth_pred, pred_uncert = self.ensemble_depths(depth_preds, **(ensemble_kwargs or {}))
+        else:
+            depth_pred = depth_preds
+            pred_uncert = None
+
+        # ----------------- Post processing -----------------
+        # Scale prediction to [0, 1]
+        min_d = torch.min(depth_pred)
+        max_d = torch.max(depth_pred)
+        depth_pred = (depth_pred - min_d) / (max_d - min_d)
+
+        # Convert to numpy
+        depth_pred = depth_pred.cpu().numpy().astype(np.float32)
+
+        # Resize back to original resolution
+        if match_input_res:
+            pred_img = Image.fromarray(depth_pred)
+            pred_img = pred_img.resize(input_size, resample=resample_method)
+            depth_pred = np.asarray(pred_img)
+
+        # Clip output range
+        depth_pred = depth_pred.clip(0, 1)
+
+        # Colorize
+        if color_map is not None:
+            depth_colored = self.colorize_depth_maps(
+                depth_pred, 0, 1, cmap=color_map
+            ).squeeze()  # [3, H, W], value in (0, 1)
+            depth_colored = (depth_colored * 255).astype(np.uint8)
+            depth_colored_hwc = self.chw2hwc(depth_colored)
+            depth_colored_img = Image.fromarray(depth_colored_hwc)
+        else:
+            depth_colored_img = None
+
+        return MarigoldDepthOutput(
+            depth_np=depth_pred,
+            depth_colored=depth_colored_img,
+            uncertainty=pred_uncert,
+        )
+
+    def _check_inference_step(self, n_step: int):
+        """
+        Check if denoising step is reasonable
+        Args:
+            n_step (`int`): denoising steps
+        """
+        assert n_step >= 1
+
+        if isinstance(self.scheduler, DDIMScheduler):
+            if n_step < 10:
+                logging.warning(
+                    f"Too few denoising steps: {n_step}. Recommended to use the LCM checkpoint for few-step inference."
+                )
+        elif isinstance(self.scheduler, LCMScheduler):
+            if not 1 <= n_step <= 4:
+                logging.warning(f"Non-optimal setting of denoising steps: {n_step}. Recommended setting is 1-4 steps.")
+        else:
+            raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")
+
+    def _encode_empty_text(self):
+        """
+        Encode text embedding for empty prompt.
+        """
+        prompt = ""
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="do_not_pad",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids.to(self.text_encoder.device)
+        self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)
+
+    @torch.no_grad()
+    def single_infer(
+        self,
+        rgb_in: torch.Tensor,
+        num_inference_steps: int,
+        seed: Union[int, None],
+        show_pbar: bool,
+    ) -> torch.Tensor:
+        """
+        Perform an individual depth prediction without ensembling.
+
+        Args:
+            rgb_in (`torch.Tensor`):
+                Input RGB image.
+            num_inference_steps (`int`):
+                Number of diffusion denoisign steps (DDIM) during inference.
+            show_pbar (`bool`):
+                Display a progress bar of diffusion denoising.
+        Returns:
+            `torch.Tensor`: Predicted depth map.
+        """
+        device = rgb_in.device
+
+        # Set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps  # [T]
+
+        # Encode image
+        rgb_latent = self.encode_rgb(rgb_in)
+
+        # Initial depth map (noise)
+        if seed is None:
+            rand_num_generator = None
+        else:
+            rand_num_generator = torch.Generator(device=device)
+            rand_num_generator.manual_seed(seed)
+        depth_latent = torch.randn(
+            rgb_latent.shape,
+            device=device,
+            dtype=self.dtype,
+            generator=rand_num_generator,
+        )  # [B, 4, h, w]
+
+        # Batched empty text embedding
+        if self.empty_text_embed is None:
+            self._encode_empty_text()
+        batch_empty_text_embed = self.empty_text_embed.repeat((rgb_latent.shape[0], 1, 1))  # [B, 2, 1024]
+
+        # Denoising loop
+        if show_pbar:
+            iterable = tqdm(
+                enumerate(timesteps),
+                total=len(timesteps),
+                leave=False,
+                desc=" " * 4 + "Diffusion denoising",
+            )
+        else:
+            iterable = enumerate(timesteps)
+
+        for i, t in iterable:
+            unet_input = torch.cat([rgb_latent, depth_latent], dim=1)  # this order is important
+
+            # predict the noise residual
+            noise_pred = self.unet(unet_input, t, encoder_hidden_states=batch_empty_text_embed).sample  # [B, 4, h, w]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            depth_latent = self.scheduler.step(noise_pred, t, depth_latent, generator=rand_num_generator).prev_sample
+
+        depth = self.decode_depth(depth_latent)
+
+        # clip prediction
+        depth = torch.clip(depth, -1.0, 1.0)
+        # shift to [0, 1]
+        depth = (depth + 1.0) / 2.0
+
+        return depth
+
+    def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
+        """
+        Encode RGB image into latent.
+
+        Args:
+            rgb_in (`torch.Tensor`):
+                Input RGB image to be encoded.
+
+        Returns:
+            `torch.Tensor`: Image latent.
+        """
+        # encode
+        h = self.vae.encoder(rgb_in)
+        moments = self.vae.quant_conv(h)
+        mean, logvar = torch.chunk(moments, 2, dim=1)
+        # scale latent
+        rgb_latent = mean * self.rgb_latent_scale_factor
+        return rgb_latent
+
+    def decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
+        """
+        Decode depth latent into depth map.
+
+        Args:
+            depth_latent (`torch.Tensor`):
+                Depth latent to be decoded.
+
+        Returns:
+            `torch.Tensor`: Decoded depth map.
+        """
+        # scale latent
+        depth_latent = depth_latent / self.depth_latent_scale_factor
+        # decode
+        z = self.vae.post_quant_conv(depth_latent)
+        stacked = self.vae.decoder(z)
+        # mean of output channels
+        depth_mean = stacked.mean(dim=1, keepdim=True)
+        return depth_mean
+
+    @staticmethod
+    def resize_max_res(img: Image.Image, max_edge_resolution: int, resample_method=Resampling.BILINEAR) -> Image.Image:
+        """
+        Resize image to limit maximum edge length while keeping aspect ratio.
+
+        Args:
+            img (`Image.Image`):
+                Image to be resized.
+            max_edge_resolution (`int`):
+                Maximum edge length (pixel).
+            resample_method (`PIL.Image.Resampling`):
+                Resampling method used to resize images.
+
+        Returns:
+            `Image.Image`: Resized image.
+        """
+        original_width, original_height = img.size
+        downscale_factor = min(max_edge_resolution / original_width, max_edge_resolution / original_height)
+
+        new_width = int(original_width * downscale_factor)
+        new_height = int(original_height * downscale_factor)
+
+        resized_img = img.resize((new_width, new_height), resample=resample_method)
+        return resized_img
+
+    @staticmethod
+    def colorize_depth_maps(depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None):
+        """
+        Colorize depth maps.
+        """
+        assert len(depth_map.shape) >= 2, "Invalid dimension"
+
+        if isinstance(depth_map, torch.Tensor):
+            depth = depth_map.detach().clone().squeeze().numpy()
+        elif isinstance(depth_map, np.ndarray):
+            depth = depth_map.copy().squeeze()
+        # reshape to [ (B,) H, W ]
+        if depth.ndim < 3:
+            depth = depth[np.newaxis, :, :]
+
+        # colorize
+        cm = matplotlib.colormaps[cmap]
+        depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1)
+        img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3]  # value from 0 to 1
+        img_colored_np = np.rollaxis(img_colored_np, 3, 1)
+
+        if valid_mask is not None:
+            if isinstance(depth_map, torch.Tensor):
+                valid_mask = valid_mask.detach().numpy()
+            valid_mask = valid_mask.squeeze()  # [H, W] or [B, H, W]
+            if valid_mask.ndim < 3:
+                valid_mask = valid_mask[np.newaxis, np.newaxis, :, :]
+            else:
+                valid_mask = valid_mask[:, np.newaxis, :, :]
+            valid_mask = np.repeat(valid_mask, 3, axis=1)
+            img_colored_np[~valid_mask] = 0
+
+        if isinstance(depth_map, torch.Tensor):
+            img_colored = torch.from_numpy(img_colored_np).float()
+        elif isinstance(depth_map, np.ndarray):
+            img_colored = img_colored_np
+
+        return img_colored
+
+    @staticmethod
+    def chw2hwc(chw):
+        assert 3 == len(chw.shape)
+        if isinstance(chw, torch.Tensor):
+            hwc = torch.permute(chw, (1, 2, 0))
+        elif isinstance(chw, np.ndarray):
+            hwc = np.moveaxis(chw, 0, -1)
+        return hwc
+
+    @staticmethod
+    def _find_batch_size(ensemble_size: int, input_res: int, dtype: torch.dtype) -> int:
+        """
+        Automatically search for suitable operating batch size.
+
+        Args:
+            ensemble_size (`int`):
+                Number of predictions to be ensembled.
+            input_res (`int`):
+                Operating resolution of the input image.
+
+        Returns:
+            `int`: Operating batch size.
+        """
+        # Search table for suggested max. inference batch size
+        bs_search_table = [
+            # tested on A100-PCIE-80GB
+            {"res": 768, "total_vram": 79, "bs": 35, "dtype": torch.float32},
+            {"res": 1024, "total_vram": 79, "bs": 20, "dtype": torch.float32},
+            # tested on A100-PCIE-40GB
+            {"res": 768, "total_vram": 39, "bs": 15, "dtype": torch.float32},
+            {"res": 1024, "total_vram": 39, "bs": 8, "dtype": torch.float32},
+            {"res": 768, "total_vram": 39, "bs": 30, "dtype": torch.float16},
+            {"res": 1024, "total_vram": 39, "bs": 15, "dtype": torch.float16},
+            # tested on RTX3090, RTX4090
+            {"res": 512, "total_vram": 23, "bs": 20, "dtype": torch.float32},
+            {"res": 768, "total_vram": 23, "bs": 7, "dtype": torch.float32},
+            {"res": 1024, "total_vram": 23, "bs": 3, "dtype": torch.float32},
+            {"res": 512, "total_vram": 23, "bs": 40, "dtype": torch.float16},
+            {"res": 768, "total_vram": 23, "bs": 18, "dtype": torch.float16},
+            {"res": 1024, "total_vram": 23, "bs": 10, "dtype": torch.float16},
+            # tested on GTX1080Ti
+            {"res": 512, "total_vram": 10, "bs": 5, "dtype": torch.float32},
+            {"res": 768, "total_vram": 10, "bs": 2, "dtype": torch.float32},
+            {"res": 512, "total_vram": 10, "bs": 10, "dtype": torch.float16},
+            {"res": 768, "total_vram": 10, "bs": 5, "dtype": torch.float16},
+            {"res": 1024, "total_vram": 10, "bs": 3, "dtype": torch.float16},
+        ]
+
+        if not torch.cuda.is_available():
+            return 1
+
+        total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3
+        filtered_bs_search_table = [s for s in bs_search_table if s["dtype"] == dtype]
+        for settings in sorted(
+            filtered_bs_search_table,
+            key=lambda k: (k["res"], -k["total_vram"]),
+        ):
+            if input_res <= settings["res"] and total_vram >= settings["total_vram"]:
+                bs = settings["bs"]
+                if bs > ensemble_size:
+                    bs = ensemble_size
+                elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size:
+                    bs = math.ceil(ensemble_size / 2)
+                return bs
+
+        return 1
+
+    @staticmethod
+    def ensemble_depths(
+        input_images: torch.Tensor,
+        regularizer_strength: float = 0.02,
+        max_iter: int = 2,
+        tol: float = 1e-3,
+        reduction: str = "median",
+        max_res: int = None,
+    ):
+        """
+        To ensemble multiple affine-invariant depth images (up to scale and shift),
+            by aligning estimating the scale and shift
+        """
+
+        def inter_distances(tensors: torch.Tensor):
+            """
+            To calculate the distance between each two depth maps.
+            """
+            distances = []
+            for i, j in torch.combinations(torch.arange(tensors.shape[0])):
+                arr1 = tensors[i : i + 1]
+                arr2 = tensors[j : j + 1]
+                distances.append(arr1 - arr2)
+            dist = torch.concatenate(distances, dim=0)
+            return dist
+
+        device = input_images.device
+        dtype = input_images.dtype
+        np_dtype = np.float32
+
+        original_input = input_images.clone()
+        n_img = input_images.shape[0]
+        ori_shape = input_images.shape
+
+        if max_res is not None:
+            scale_factor = torch.min(max_res / torch.tensor(ori_shape[-2:]))
+            if scale_factor < 1:
+                downscaler = torch.nn.Upsample(scale_factor=scale_factor, mode="nearest")
+                input_images = downscaler(torch.from_numpy(input_images)).numpy()
+
+        # init guess
+        _min = np.min(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1)
+        _max = np.max(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1)
+        s_init = 1.0 / (_max - _min).reshape((-1, 1, 1))
+        t_init = (-1 * s_init.flatten() * _min.flatten()).reshape((-1, 1, 1))
+        x = np.concatenate([s_init, t_init]).reshape(-1).astype(np_dtype)
+
+        input_images = input_images.to(device)
+
+        # objective function
+        def closure(x):
+            l = len(x)
+            s = x[: int(l / 2)]
+            t = x[int(l / 2) :]
+            s = torch.from_numpy(s).to(dtype=dtype).to(device)
+            t = torch.from_numpy(t).to(dtype=dtype).to(device)
+
+            transformed_arrays = input_images * s.view((-1, 1, 1)) + t.view((-1, 1, 1))
+            dists = inter_distances(transformed_arrays)
+            sqrt_dist = torch.sqrt(torch.mean(dists**2))
+
+            if "mean" == reduction:
+                pred = torch.mean(transformed_arrays, dim=0)
+            elif "median" == reduction:
+                pred = torch.median(transformed_arrays, dim=0).values
+            else:
+                raise ValueError
+
+            near_err = torch.sqrt((0 - torch.min(pred)) ** 2)
+            far_err = torch.sqrt((1 - torch.max(pred)) ** 2)
+
+            err = sqrt_dist + (near_err + far_err) * regularizer_strength
+            err = err.detach().cpu().numpy().astype(np_dtype)
+            return err
+
+        res = minimize(
+            closure,
+            x,
+            method="BFGS",
+            tol=tol,
+            options={"maxiter": max_iter, "disp": False},
+        )
+        x = res.x
+        l = len(x)
+        s = x[: int(l / 2)]
+        t = x[int(l / 2) :]
+
+        # Prediction
+        s = torch.from_numpy(s).to(dtype=dtype).to(device)
+        t = torch.from_numpy(t).to(dtype=dtype).to(device)
+        transformed_arrays = original_input * s.view(-1, 1, 1) + t.view(-1, 1, 1)
+        if "mean" == reduction:
+            aligned_images = torch.mean(transformed_arrays, dim=0)
+            std = torch.std(transformed_arrays, dim=0)
+            uncertainty = std
+        elif "median" == reduction:
+            aligned_images = torch.median(transformed_arrays, dim=0).values
+            # MAD (median absolute deviation) as uncertainty indicator
+            abs_dev = torch.abs(transformed_arrays - aligned_images)
+            mad = torch.median(abs_dev, dim=0).values
+            uncertainty = mad
+        else:
+            raise ValueError(f"Unknown reduction method: {reduction}")
+
+        # Scale and shift to [0, 1]
+        _min = torch.min(aligned_images)
+        _max = torch.max(aligned_images)
+        aligned_images = (aligned_images - _min) / (_max - _min)
+        uncertainty /= _max - _min
+
+        return aligned_images, uncertainty
diff --git a/diffusers/examples/community/masked_stable_diffusion_img2img.py b/diffusers/examples/community/masked_stable_diffusion_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..a210c167a2f3574ad0f730441377bc17167d4281
--- /dev/null
+++ b/diffusers/examples/community/masked_stable_diffusion_img2img.py
@@ -0,0 +1,262 @@
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+
+from diffusers import StableDiffusionImg2ImgPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+
+
+class MaskedStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
+    debug_save = False
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[
+            torch.Tensor,
+            PIL.Image.Image,
+            np.ndarray,
+            List[torch.Tensor],
+            List[PIL.Image.Image],
+            List[np.ndarray],
+        ] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        mask: Union[
+            torch.Tensor,
+            PIL.Image.Image,
+            np.ndarray,
+            List[torch.Tensor],
+            List[PIL.Image.Image],
+            List[np.ndarray],
+        ] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            mask (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`, *optional*):
+                A mask with non-zero elements for the area to be inpainted. If not specified, no mask is applied.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # code adapted from parent class StableDiffusionImg2ImgPipeline
+
+        # 0. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+        # 1. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 2. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+
+        # 3. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 4. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 5. Prepare latent variables
+        # it is sampled from the latent distribution of the VAE
+        latents = self.prepare_latents(
+            image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+        )
+
+        # mean of the latent distribution
+        init_latents = [
+            self.vae.encode(image.to(device=device, dtype=prompt_embeds.dtype)[i : i + 1]).latent_dist.mean
+            for i in range(batch_size)
+        ]
+        init_latents = torch.cat(init_latents, dim=0)
+
+        # 6. create latent mask
+        latent_mask = self._make_latent_mask(latents, mask)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if latent_mask is not None:
+                    latents = torch.lerp(init_latents * self.vae.config.scaling_factor, latents, latent_mask)
+                    noise_pred = torch.lerp(torch.zeros_like(noise_pred), noise_pred, latent_mask)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            scaled = latents / self.vae.config.scaling_factor
+            if latent_mask is not None:
+                # scaled = latents / self.vae.config.scaling_factor * latent_mask + init_latents * (1 - latent_mask)
+                scaled = torch.lerp(init_latents, scaled, latent_mask)
+            image = self.vae.decode(scaled, return_dict=False)[0]
+            if self.debug_save:
+                image_gen = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+                image_gen = self.image_processor.postprocess(image_gen, output_type=output_type, do_denormalize=[True])
+                image_gen[0].save("from_latent.png")
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def _make_latent_mask(self, latents, mask):
+        if mask is not None:
+            latent_mask = []
+            if not isinstance(mask, list):
+                tmp_mask = [mask]
+            else:
+                tmp_mask = mask
+            _, l_channels, l_height, l_width = latents.shape
+            for m in tmp_mask:
+                if not isinstance(m, PIL.Image.Image):
+                    if len(m.shape) == 2:
+                        m = m[..., np.newaxis]
+                    if m.max() > 1:
+                        m = m / 255.0
+                    m = self.image_processor.numpy_to_pil(m)[0]
+                if m.mode != "L":
+                    m = m.convert("L")
+                resized = self.image_processor.resize(m, l_height, l_width)
+                if self.debug_save:
+                    resized.save("latent_mask.png")
+                latent_mask.append(np.repeat(np.array(resized)[np.newaxis, :, :], l_channels, axis=0))
+            latent_mask = torch.as_tensor(np.stack(latent_mask)).to(latents)
+            latent_mask = latent_mask / latent_mask.max()
+        return latent_mask
diff --git a/diffusers/examples/community/masked_stable_diffusion_xl_img2img.py b/diffusers/examples/community/masked_stable_diffusion_xl_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6b0ced527b5ebdfadc3aa6de648a47f9a2ed11f
--- /dev/null
+++ b/diffusers/examples/community/masked_stable_diffusion_xl_img2img.py
@@ -0,0 +1,682 @@
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image, ImageFilter
+
+from diffusers.image_processor import PipelineImageInput
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img import (
+    StableDiffusionXLImg2ImgPipeline,
+    rescale_noise_cfg,
+    retrieve_latents,
+    retrieve_timesteps,
+)
+from diffusers.utils import (
+    deprecate,
+    is_torch_xla_available,
+    logging,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class MaskedStableDiffusionXLImg2ImgPipeline(StableDiffusionXLImg2ImgPipeline):
+    debug_save = 0
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        original_image: PipelineImageInput = None,
+        strength: float = 0.3,
+        num_inference_steps: Optional[int] = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: Optional[float] = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        mask: Union[
+            torch.FloatTensor,
+            Image.Image,
+            np.ndarray,
+            List[torch.FloatTensor],
+            List[Image.Image],
+            List[np.ndarray],
+        ] = None,
+        blur=24,
+        blur_compose=4,
+        sample_mode="sample",
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`PipelineImageInput`):
+                `Image` or tensor representing an image batch to be used as the starting point. This image might have mask painted on it.
+            original_image (`PipelineImageInput`, *optional*):
+                `Image` or tensor representing an image batch to be used for blending with the result.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                ,`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            blur (`int`, *optional*):
+                blur to apply to mask
+            blur_compose (`int`, *optional*):
+                blur to apply for composition of original a
+            mask (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`, *optional*):
+                A mask with non-zero elements for the area to be inpainted. If not specified, no mask is applied.
+            sample_mode (`str`, *optional*):
+                control latents initialisation for the inpaint area, can be one of sample, argmax, random
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # code adapted from parent class StableDiffusionXLImg2ImgPipeline
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 0. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            strength,
+            num_inference_steps,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+        self._interrupt = False
+
+        # 1. Define call parameters
+        # mask is computed from difference between image and original_image
+        if image is not None:
+            neq = np.any(np.array(original_image) != np.array(image), axis=-1)
+            mask = neq.astype(np.uint8) * 255
+        else:
+            assert mask is not None
+
+        if not isinstance(mask, Image.Image):
+            pil_mask = Image.fromarray(mask)
+            if pil_mask.mode != "L":
+                pil_mask = pil_mask.convert("L")
+        mask_blur = self.blur_mask(pil_mask, blur)
+        mask_compose = self.blur_mask(pil_mask, blur_compose)
+        if original_image is None:
+            original_image = image
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 2. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 3. Preprocess image
+        input_image = image if image is not None else original_image
+        image = self.image_processor.preprocess(input_image)
+        original_image = self.image_processor.preprocess(original_image)
+
+        # 4. set timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
+        )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        add_noise = True if self.denoising_start is None else False
+
+        # 5. Prepare latent variables
+        # It is sampled from the latent distribution of the VAE
+        # that's what we repaint
+        latents = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            add_noise,
+            sample_mode=sample_mode,
+        )
+
+        # mean of the latent distribution
+        # it is multiplied by self.vae.config.scaling_factor
+        non_paint_latents = self.prepare_latents(
+            original_image,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            add_noise=False,
+            sample_mode="argmax",
+        )
+
+        if self.debug_save:
+            init_img_from_latents = self.latents_to_img(non_paint_latents)
+            init_img_from_latents[0].save("non_paint_latents.png")
+        # 6. create latent mask
+        latent_mask = self._make_latent_mask(latents, mask)
+
+        # 7. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 8. Prepare added time ids & embeddings
+        if negative_original_size is None:
+            negative_original_size = original_size
+        if negative_target_size is None:
+            negative_target_size = target_size
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            negative_original_size,
+            negative_crops_coords_top_left,
+            negative_target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 10. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 10.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and self.denoising_start is not None
+            and denoising_value_valid(self.denoising_end)
+            and denoising_value_valid(self.denoising_start)
+            and self.denoising_start >= self.denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {self.denoising_end} when using type float."
+            )
+        elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 10.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                shape = non_paint_latents.shape
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=latents.dtype)
+                # noisy latent code of input image at current step
+                orig_latents_t = non_paint_latents
+                orig_latents_t = self.scheduler.add_noise(non_paint_latents, noise, t.unsqueeze(0))
+
+                # orig_latents_t (1 - latent_mask) + latents * latent_mask
+                latents = torch.lerp(orig_latents_t, latents, latent_mask)
+
+                if self.debug_save:
+                    img1 = self.latents_to_img(latents)
+                    t_str = str(t.int().item())
+                    for i in range(3 - len(t_str)):
+                        t_str = "0" + t_str
+                    img1[0].save(f"step{t_str}.png")
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            if self.debug_save:
+                image_gen = self.latents_to_img(latents)
+                image_gen[0].save("from_latent.png")
+
+            if latent_mask is not None:
+                # interpolate with latent mask
+                latents = torch.lerp(non_paint_latents, latents, latent_mask)
+
+            latents = self.denormalize(latents)
+            image = self.vae.decode(latents, return_dict=False)[0]
+            m = mask_compose.permute(2, 0, 1).unsqueeze(0).to(image)
+            img_compose = m * image + (1 - m) * original_image.to(image)
+            image = img_compose
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
+
+    def _make_latent_mask(self, latents, mask):
+        if mask is not None:
+            latent_mask = []
+            if not isinstance(mask, list):
+                tmp_mask = [mask]
+            else:
+                tmp_mask = mask
+            _, l_channels, l_height, l_width = latents.shape
+            for m in tmp_mask:
+                if not isinstance(m, Image.Image):
+                    if len(m.shape) == 2:
+                        m = m[..., np.newaxis]
+                    if m.max() > 1:
+                        m = m / 255.0
+                    m = self.image_processor.numpy_to_pil(m)[0]
+                if m.mode != "L":
+                    m = m.convert("L")
+                resized = self.image_processor.resize(m, l_height, l_width)
+                if self.debug_save:
+                    resized.save("latent_mask.png")
+                latent_mask.append(np.repeat(np.array(resized)[np.newaxis, :, :], l_channels, axis=0))
+            latent_mask = torch.as_tensor(np.stack(latent_mask)).to(latents)
+            latent_mask = latent_mask / max(latent_mask.max(), 1)
+        return latent_mask
+
+    def prepare_latents(
+        self,
+        image,
+        timestep,
+        batch_size,
+        num_images_per_prompt,
+        dtype,
+        device,
+        generator=None,
+        add_noise=True,
+        sample_mode: str = "sample",
+    ):
+        if not isinstance(image, (torch.Tensor, Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        # Offload text encoder if `enable_model_cpu_offload` was enabled
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.text_encoder_2.to("cpu")
+            torch.cuda.empty_cache()
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+        elif sample_mode == "random":
+            height, width = image.shape[-2:]
+            num_channels_latents = self.unet.config.in_channels
+            latents = self.random_latents(
+                batch_size,
+                num_channels_latents,
+                height,
+                width,
+                dtype,
+                device,
+                generator,
+            )
+            return self.vae.config.scaling_factor * latents
+        else:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                image = image.float()
+                self.vae.to(dtype=torch.float32)
+
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    retrieve_latents(
+                        self.vae.encode(image[i : i + 1]), generator=generator[i], sample_mode=sample_mode
+                    )
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode=sample_mode)
+
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+
+            init_latents = init_latents.to(dtype)
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        if add_noise:
+            shape = init_latents.shape
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # get latents
+            init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def random_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def denormalize(self, latents):
+        # unscale/denormalize the latents
+        # denormalize with the mean and std if available and not None
+        has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+        has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+        if has_latents_mean and has_latents_std:
+            latents_mean = (
+                torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+            )
+            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+            latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+        else:
+            latents = latents / self.vae.config.scaling_factor
+
+        return latents
+
+    def latents_to_img(self, latents):
+        l1 = self.denormalize(latents)
+        img1 = self.vae.decode(l1, return_dict=False)[0]
+        img1 = self.image_processor.postprocess(img1, output_type="pil", do_denormalize=[True])
+        return img1
+
+    def blur_mask(self, pil_mask, blur):
+        mask_blur = pil_mask.filter(ImageFilter.GaussianBlur(radius=blur))
+        mask_blur = np.array(mask_blur)
+        return torch.from_numpy(np.tile(mask_blur / mask_blur.max(), (3, 1, 1)).transpose(1, 2, 0))
diff --git a/diffusers/examples/community/mixture_canvas.py b/diffusers/examples/community/mixture_canvas.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bb054a123d055f3b00ed72e58507f82e687f407
--- /dev/null
+++ b/diffusers/examples/community/mixture_canvas.py
@@ -0,0 +1,501 @@
+import re
+from copy import deepcopy
+from dataclasses import asdict, dataclass
+from enum import Enum
+from typing import List, Optional, Union
+
+import numpy as np
+import torch
+from numpy import exp, pi, sqrt
+from torchvision.transforms.functional import resize
+from tqdm.auto import tqdm
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+
+
+def preprocess_image(image):
+    from PIL import Image
+
+    """Preprocess an input image
+
+    Same as
+    https://github.com/huggingface/diffusers/blob/1138d63b519e37f0ce04e027b9f4a3261d27c628/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L44
+    """
+    w, h = image.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+@dataclass
+class CanvasRegion:
+    """Class defining a rectangular region in the canvas"""
+
+    row_init: int  # Region starting row in pixel space (included)
+    row_end: int  # Region end row in pixel space (not included)
+    col_init: int  # Region starting column in pixel space (included)
+    col_end: int  # Region end column in pixel space (not included)
+    region_seed: int = None  # Seed for random operations in this region
+    noise_eps: float = 0.0  # Deviation of a zero-mean gaussian noise to be applied over the latents in this region. Useful for slightly "rerolling" latents
+
+    def __post_init__(self):
+        # Initialize arguments if not specified
+        if self.region_seed is None:
+            self.region_seed = np.random.randint(9999999999)
+        # Check coordinates are non-negative
+        for coord in [self.row_init, self.row_end, self.col_init, self.col_end]:
+            if coord < 0:
+                raise ValueError(
+                    f"A CanvasRegion must be defined with non-negative indices, found ({self.row_init}, {self.row_end}, {self.col_init}, {self.col_end})"
+                )
+        # Check coordinates are divisible by 8, else we end up with nasty rounding error when mapping to latent space
+        for coord in [self.row_init, self.row_end, self.col_init, self.col_end]:
+            if coord // 8 != coord / 8:
+                raise ValueError(
+                    f"A CanvasRegion must be defined with locations divisible by 8, found ({self.row_init}-{self.row_end}, {self.col_init}-{self.col_end})"
+                )
+        # Check noise eps is non-negative
+        if self.noise_eps < 0:
+            raise ValueError(f"A CanvasRegion must be defined noises eps non-negative, found {self.noise_eps}")
+        # Compute coordinates for this region in latent space
+        self.latent_row_init = self.row_init // 8
+        self.latent_row_end = self.row_end // 8
+        self.latent_col_init = self.col_init // 8
+        self.latent_col_end = self.col_end // 8
+
+    @property
+    def width(self):
+        return self.col_end - self.col_init
+
+    @property
+    def height(self):
+        return self.row_end - self.row_init
+
+    def get_region_generator(self, device="cpu"):
+        """Creates a torch.Generator based on the random seed of this region"""
+        # Initialize region generator
+        return torch.Generator(device).manual_seed(self.region_seed)
+
+    @property
+    def __dict__(self):
+        return asdict(self)
+
+
+class MaskModes(Enum):
+    """Modes in which the influence of diffuser is masked"""
+
+    CONSTANT = "constant"
+    GAUSSIAN = "gaussian"
+    QUARTIC = "quartic"  # See https://en.wikipedia.org/wiki/Kernel_(statistics)
+
+
+@dataclass
+class DiffusionRegion(CanvasRegion):
+    """Abstract class defining a region where some class of diffusion process is acting"""
+
+    pass
+
+
+@dataclass
+class Text2ImageRegion(DiffusionRegion):
+    """Class defining a region where a text guided diffusion process is acting"""
+
+    prompt: str = ""  # Text prompt guiding the diffuser in this region
+    guidance_scale: float = 7.5  # Guidance scale of the diffuser in this region. If None, randomize
+    mask_type: MaskModes = MaskModes.GAUSSIAN.value  # Kind of weight mask applied to this region
+    mask_weight: float = 1.0  # Global weights multiplier of the mask
+    tokenized_prompt = None  # Tokenized prompt
+    encoded_prompt = None  # Encoded prompt
+
+    def __post_init__(self):
+        super().__post_init__()
+        # Mask weight cannot be negative
+        if self.mask_weight < 0:
+            raise ValueError(
+                f"A Text2ImageRegion must be defined with non-negative mask weight, found {self.mask_weight}"
+            )
+        # Mask type must be an actual known mask
+        if self.mask_type not in [e.value for e in MaskModes]:
+            raise ValueError(
+                f"A Text2ImageRegion was defined with mask {self.mask_type}, which is not an accepted mask ({[e.value for e in MaskModes]})"
+            )
+        # Randomize arguments if given as None
+        if self.guidance_scale is None:
+            self.guidance_scale = np.random.randint(5, 30)
+        # Clean prompt
+        self.prompt = re.sub(" +", " ", self.prompt).replace("\n", " ")
+
+    def tokenize_prompt(self, tokenizer):
+        """Tokenizes the prompt for this diffusion region using a given tokenizer"""
+        self.tokenized_prompt = tokenizer(
+            self.prompt,
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+    def encode_prompt(self, text_encoder, device):
+        """Encodes the previously tokenized prompt for this diffusion region using a given encoder"""
+        assert self.tokenized_prompt is not None, ValueError(
+            "Prompt in diffusion region must be tokenized before encoding"
+        )
+        self.encoded_prompt = text_encoder(self.tokenized_prompt.input_ids.to(device))[0]
+
+
+@dataclass
+class Image2ImageRegion(DiffusionRegion):
+    """Class defining a region where an image guided diffusion process is acting"""
+
+    reference_image: torch.Tensor = None
+    strength: float = 0.8  # Strength of the image
+
+    def __post_init__(self):
+        super().__post_init__()
+        if self.reference_image is None:
+            raise ValueError("Must provide a reference image when creating an Image2ImageRegion")
+        if self.strength < 0 or self.strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {self.strength}")
+        # Rescale image to region shape
+        self.reference_image = resize(self.reference_image, size=[self.height, self.width])
+
+    def encode_reference_image(self, encoder, device, generator, cpu_vae=False):
+        """Encodes the reference image for this Image2Image region into the latent space"""
+        # Place encoder in CPU or not following the parameter cpu_vae
+        if cpu_vae:
+            # Note here we use mean instead of sample, to avoid moving also generator to CPU, which is troublesome
+            self.reference_latents = encoder.cpu().encode(self.reference_image).latent_dist.mean.to(device)
+        else:
+            self.reference_latents = encoder.encode(self.reference_image.to(device)).latent_dist.sample(
+                generator=generator
+            )
+        self.reference_latents = 0.18215 * self.reference_latents
+
+    @property
+    def __dict__(self):
+        # This class requires special casting to dict because of the reference_image tensor. Otherwise it cannot be casted to JSON
+
+        # Get all basic fields from parent class
+        super_fields = {key: getattr(self, key) for key in DiffusionRegion.__dataclass_fields__.keys()}
+        # Pack other fields
+        return {**super_fields, "reference_image": self.reference_image.cpu().tolist(), "strength": self.strength}
+
+
+class RerollModes(Enum):
+    """Modes in which the reroll regions operate"""
+
+    RESET = "reset"  # Completely reset the random noise in the region
+    EPSILON = "epsilon"  # Alter slightly the latents in the region
+
+
+@dataclass
+class RerollRegion(CanvasRegion):
+    """Class defining a rectangular canvas region in which initial latent noise will be rerolled"""
+
+    reroll_mode: RerollModes = RerollModes.RESET.value
+
+
+@dataclass
+class MaskWeightsBuilder:
+    """Auxiliary class to compute a tensor of weights for a given diffusion region"""
+
+    latent_space_dim: int  # Size of the U-net latent space
+    nbatch: int = 1  # Batch size in the U-net
+
+    def compute_mask_weights(self, region: DiffusionRegion) -> torch.tensor:
+        """Computes a tensor of weights for a given diffusion region"""
+        MASK_BUILDERS = {
+            MaskModes.CONSTANT.value: self._constant_weights,
+            MaskModes.GAUSSIAN.value: self._gaussian_weights,
+            MaskModes.QUARTIC.value: self._quartic_weights,
+        }
+        return MASK_BUILDERS[region.mask_type](region)
+
+    def _constant_weights(self, region: DiffusionRegion) -> torch.tensor:
+        """Computes a tensor of constant for a given diffusion region"""
+        latent_width = region.latent_col_end - region.latent_col_init
+        latent_height = region.latent_row_end - region.latent_row_init
+        return torch.ones(self.nbatch, self.latent_space_dim, latent_height, latent_width) * region.mask_weight
+
+    def _gaussian_weights(self, region: DiffusionRegion) -> torch.tensor:
+        """Generates a gaussian mask of weights for tile contributions"""
+        latent_width = region.latent_col_end - region.latent_col_init
+        latent_height = region.latent_row_end - region.latent_row_init
+
+        var = 0.01
+        midpoint = (latent_width - 1) / 2  # -1 because index goes from 0 to latent_width - 1
+        x_probs = [
+            exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var)
+            for x in range(latent_width)
+        ]
+        midpoint = (latent_height - 1) / 2
+        y_probs = [
+            exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var)
+            for y in range(latent_height)
+        ]
+
+        weights = np.outer(y_probs, x_probs) * region.mask_weight
+        return torch.tile(torch.tensor(weights), (self.nbatch, self.latent_space_dim, 1, 1))
+
+    def _quartic_weights(self, region: DiffusionRegion) -> torch.tensor:
+        """Generates a quartic mask of weights for tile contributions
+
+        The quartic kernel has bounded support over the diffusion region, and a smooth decay to the region limits.
+        """
+        quartic_constant = 15.0 / 16.0
+
+        support = (np.array(range(region.latent_col_init, region.latent_col_end)) - region.latent_col_init) / (
+            region.latent_col_end - region.latent_col_init - 1
+        ) * 1.99 - (1.99 / 2.0)
+        x_probs = quartic_constant * np.square(1 - np.square(support))
+        support = (np.array(range(region.latent_row_init, region.latent_row_end)) - region.latent_row_init) / (
+            region.latent_row_end - region.latent_row_init - 1
+        ) * 1.99 - (1.99 / 2.0)
+        y_probs = quartic_constant * np.square(1 - np.square(support))
+
+        weights = np.outer(y_probs, x_probs) * region.mask_weight
+        return torch.tile(torch.tensor(weights), (self.nbatch, self.latent_space_dim, 1, 1))
+
+
+class StableDiffusionCanvasPipeline(DiffusionPipeline, StableDiffusionMixin):
+    """Stable Diffusion pipeline that mixes several diffusers in the same canvas"""
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def decode_latents(self, latents, cpu_vae=False):
+        """Decodes a given array of latents into pixel space"""
+        # scale and decode the image latents with vae
+        if cpu_vae:
+            lat = deepcopy(latents).cpu()
+            vae = deepcopy(self.vae).cpu()
+        else:
+            lat = latents
+            vae = self.vae
+
+        lat = 1 / 0.18215 * lat
+        image = vae.decode(lat).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        return self.numpy_to_pil(image)
+
+    def get_latest_timestep_img2img(self, num_inference_steps, strength):
+        """Finds the latest timesteps where an img2img strength does not impose latents anymore"""
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * (1 - strength)) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        t_start = min(max(num_inference_steps - init_timestep + offset, 0), num_inference_steps - 1)
+        latest_timestep = self.scheduler.timesteps[t_start]
+
+        return latest_timestep
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        canvas_height: int,
+        canvas_width: int,
+        regions: List[DiffusionRegion],
+        num_inference_steps: Optional[int] = 50,
+        seed: Optional[int] = 12345,
+        reroll_regions: Optional[List[RerollRegion]] = None,
+        cpu_vae: Optional[bool] = False,
+        decode_steps: Optional[bool] = False,
+    ):
+        if reroll_regions is None:
+            reroll_regions = []
+        batch_size = 1
+
+        if decode_steps:
+            steps_images = []
+
+        # Prepare scheduler
+        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+
+        # Split diffusion regions by their kind
+        text2image_regions = [region for region in regions if isinstance(region, Text2ImageRegion)]
+        image2image_regions = [region for region in regions if isinstance(region, Image2ImageRegion)]
+
+        # Prepare text embeddings
+        for region in text2image_regions:
+            region.tokenize_prompt(self.tokenizer)
+            region.encode_prompt(self.text_encoder, self.device)
+
+        # Create original noisy latents using the timesteps
+        latents_shape = (batch_size, self.unet.config.in_channels, canvas_height // 8, canvas_width // 8)
+        generator = torch.Generator(self.device).manual_seed(seed)
+        init_noise = torch.randn(latents_shape, generator=generator, device=self.device)
+
+        # Reset latents in seed reroll regions, if requested
+        for region in reroll_regions:
+            if region.reroll_mode == RerollModes.RESET.value:
+                region_shape = (
+                    latents_shape[0],
+                    latents_shape[1],
+                    region.latent_row_end - region.latent_row_init,
+                    region.latent_col_end - region.latent_col_init,
+                )
+                init_noise[
+                    :,
+                    :,
+                    region.latent_row_init : region.latent_row_end,
+                    region.latent_col_init : region.latent_col_end,
+                ] = torch.randn(region_shape, generator=region.get_region_generator(self.device), device=self.device)
+
+        # Apply epsilon noise to regions: first diffusion regions, then reroll regions
+        all_eps_rerolls = regions + [r for r in reroll_regions if r.reroll_mode == RerollModes.EPSILON.value]
+        for region in all_eps_rerolls:
+            if region.noise_eps > 0:
+                region_noise = init_noise[
+                    :,
+                    :,
+                    region.latent_row_init : region.latent_row_end,
+                    region.latent_col_init : region.latent_col_end,
+                ]
+                eps_noise = (
+                    torch.randn(
+                        region_noise.shape, generator=region.get_region_generator(self.device), device=self.device
+                    )
+                    * region.noise_eps
+                )
+                init_noise[
+                    :,
+                    :,
+                    region.latent_row_init : region.latent_row_end,
+                    region.latent_col_init : region.latent_col_end,
+                ] += eps_noise
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = init_noise * self.scheduler.init_noise_sigma
+
+        # Get unconditional embeddings for classifier free guidance in text2image regions
+        for region in text2image_regions:
+            max_length = region.tokenized_prompt.input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            region.encoded_prompt = torch.cat([uncond_embeddings, region.encoded_prompt])
+
+        # Prepare image latents
+        for region in image2image_regions:
+            region.encode_reference_image(self.vae, device=self.device, generator=generator)
+
+        # Prepare mask of weights for each region
+        mask_builder = MaskWeightsBuilder(latent_space_dim=self.unet.config.in_channels, nbatch=batch_size)
+        mask_weights = [mask_builder.compute_mask_weights(region).to(self.device) for region in text2image_regions]
+
+        # Diffusion timesteps
+        for i, t in tqdm(enumerate(self.scheduler.timesteps)):
+            # Diffuse each region
+            noise_preds_regions = []
+
+            # text2image regions
+            for region in text2image_regions:
+                region_latents = latents[
+                    :,
+                    :,
+                    region.latent_row_init : region.latent_row_end,
+                    region.latent_col_init : region.latent_col_end,
+                ]
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([region_latents] * 2)
+                # scale model input following scheduler rules
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=region.encoded_prompt)["sample"]
+                # perform guidance
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred_region = noise_pred_uncond + region.guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_preds_regions.append(noise_pred_region)
+
+            # Merge noise predictions for all tiles
+            noise_pred = torch.zeros(latents.shape, device=self.device)
+            contributors = torch.zeros(latents.shape, device=self.device)
+            # Add each tile contribution to overall latents
+            for region, noise_pred_region, mask_weights_region in zip(
+                text2image_regions, noise_preds_regions, mask_weights
+            ):
+                noise_pred[
+                    :,
+                    :,
+                    region.latent_row_init : region.latent_row_end,
+                    region.latent_col_init : region.latent_col_end,
+                ] += noise_pred_region * mask_weights_region
+                contributors[
+                    :,
+                    :,
+                    region.latent_row_init : region.latent_row_end,
+                    region.latent_col_init : region.latent_col_end,
+                ] += mask_weights_region
+            # Average overlapping areas with more than 1 contributor
+            noise_pred /= contributors
+            noise_pred = torch.nan_to_num(
+                noise_pred
+            )  # Replace NaNs by zeros: NaN can appear if a position is not covered by any DiffusionRegion
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+
+            # Image2Image regions: override latents generated by the scheduler
+            for region in image2image_regions:
+                influence_step = self.get_latest_timestep_img2img(num_inference_steps, region.strength)
+                # Only override in the timesteps before the last influence step of the image (given by its strength)
+                if t > influence_step:
+                    timestep = t.repeat(batch_size)
+                    region_init_noise = init_noise[
+                        :,
+                        :,
+                        region.latent_row_init : region.latent_row_end,
+                        region.latent_col_init : region.latent_col_end,
+                    ]
+                    region_latents = self.scheduler.add_noise(region.reference_latents, region_init_noise, timestep)
+                    latents[
+                        :,
+                        :,
+                        region.latent_row_init : region.latent_row_end,
+                        region.latent_col_init : region.latent_col_end,
+                    ] = region_latents
+
+            if decode_steps:
+                steps_images.append(self.decode_latents(latents, cpu_vae))
+
+        # scale and decode the image latents with vae
+        image = self.decode_latents(latents, cpu_vae)
+
+        output = {"images": image}
+        if decode_steps:
+            output = {**output, "steps_images": steps_images}
+        return output
diff --git a/diffusers/examples/community/mixture_tiling.py b/diffusers/examples/community/mixture_tiling.py
new file mode 100644
index 0000000000000000000000000000000000000000..867bce0d9eb8c9bb7d191144267fcc9a09980dee
--- /dev/null
+++ b/diffusers/examples/community/mixture_tiling.py
@@ -0,0 +1,405 @@
+import inspect
+from copy import deepcopy
+from enum import Enum
+from typing import List, Optional, Tuple, Union
+
+import torch
+from tqdm.auto import tqdm
+
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import logging
+
+
+try:
+    from ligo.segments import segment
+    from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+except ImportError:
+    raise ImportError("Please install transformers and ligo-segments to use the mixture pipeline")
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import LMSDiscreteScheduler, DiffusionPipeline
+
+        >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
+        >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler, custom_pipeline="mixture_tiling")
+        >>> pipeline.to("cuda")
+
+        >>> image = pipeline(
+        >>>     prompt=[[
+        >>>         "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
+        >>>         "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
+        >>>         "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece"
+        >>>     ]],
+        >>>     tile_height=640,
+        >>>     tile_width=640,
+        >>>     tile_row_overlap=0,
+        >>>     tile_col_overlap=256,
+        >>>     guidance_scale=8,
+        >>>     seed=7178915308,
+        >>>     num_inference_steps=50,
+    >>> )["images"][0]
+        ```
+"""
+
+
+def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap):
+    """Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image
+
+    Returns a tuple with:
+        - Starting coordinates of rows in pixel space
+        - Ending coordinates of rows in pixel space
+        - Starting coordinates of columns in pixel space
+        - Ending coordinates of columns in pixel space
+    """
+    px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap)
+    px_row_end = px_row_init + tile_height
+    px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap)
+    px_col_end = px_col_init + tile_width
+    return px_row_init, px_row_end, px_col_init, px_col_end
+
+
+def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end):
+    """Translates coordinates in pixel space to coordinates in latent space"""
+    return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8
+
+
+def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap):
+    """Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image
+
+    Returns a tuple with:
+        - Starting coordinates of rows in latent space
+        - Ending coordinates of rows in latent space
+        - Starting coordinates of columns in latent space
+        - Ending coordinates of columns in latent space
+    """
+    px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices(
+        tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+    )
+    return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end)
+
+
+def _tile2latent_exclusive_indices(
+    tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap, rows, columns
+):
+    """Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image
+
+    Returns a tuple with:
+        - Starting coordinates of rows in latent space
+        - Ending coordinates of rows in latent space
+        - Starting coordinates of columns in latent space
+        - Ending coordinates of columns in latent space
+    """
+    row_init, row_end, col_init, col_end = _tile2latent_indices(
+        tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+    )
+    row_segment = segment(row_init, row_end)
+    col_segment = segment(col_init, col_end)
+    # Iterate over the rest of tiles, clipping the region for the current tile
+    for row in range(rows):
+        for column in range(columns):
+            if row != tile_row and column != tile_col:
+                clip_row_init, clip_row_end, clip_col_init, clip_col_end = _tile2latent_indices(
+                    row, column, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+                )
+                row_segment = row_segment - segment(clip_row_init, clip_row_end)
+                col_segment = col_segment - segment(clip_col_init, clip_col_end)
+    # return row_init, row_end, col_init, col_end
+    return row_segment[0], row_segment[1], col_segment[0], col_segment[1]
+
+
+class StableDiffusionExtrasMixin:
+    """Mixin providing additional convenience method to Stable Diffusion pipelines"""
+
+    def decode_latents(self, latents, cpu_vae=False):
+        """Decodes a given array of latents into pixel space"""
+        # scale and decode the image latents with vae
+        if cpu_vae:
+            lat = deepcopy(latents).cpu()
+            vae = deepcopy(self.vae).cpu()
+        else:
+            lat = latents
+            vae = self.vae
+
+        lat = 1 / 0.18215 * lat
+        image = vae.decode(lat).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        return self.numpy_to_pil(image)
+
+
+class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixin):
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    class SeedTilesMode(Enum):
+        """Modes in which the latents of a particular tile can be re-seeded"""
+
+        FULL = "full"
+        EXCLUSIVE = "exclusive"
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[List[str]]],
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        eta: Optional[float] = 0.0,
+        seed: Optional[int] = None,
+        tile_height: Optional[int] = 512,
+        tile_width: Optional[int] = 512,
+        tile_row_overlap: Optional[int] = 256,
+        tile_col_overlap: Optional[int] = 256,
+        guidance_scale_tiles: Optional[List[List[float]]] = None,
+        seed_tiles: Optional[List[List[int]]] = None,
+        seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full",
+        seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None,
+        cpu_vae: Optional[bool] = False,
+    ):
+        r"""
+        Function to run the diffusion pipeline with tiling support.
+
+        Args:
+            prompt: either a single string (no tiling) or a list of lists with all the prompts to use (one list for each row of tiles). This will also define the tiling structure.
+            num_inference_steps: number of diffusions steps.
+            guidance_scale: classifier-free guidance.
+            seed: general random seed to initialize latents.
+            tile_height: height in pixels of each grid tile.
+            tile_width: width in pixels of each grid tile.
+            tile_row_overlap: number of overlap pixels between tiles in consecutive rows.
+            tile_col_overlap: number of overlap pixels between tiles in consecutive columns.
+            guidance_scale_tiles: specific weights for classifier-free guidance in each tile.
+            guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used.
+            seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter.
+            seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overriden.
+            seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overriden using the given seed. Takes priority over seed_tiles.
+            cpu_vae: the decoder from latent space to pixel space can require too mucho GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.
+
+        Examples:
+
+        Returns:
+            A PIL image with the generated image.
+
+        """
+        if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt):
+            raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}")
+        grid_rows = len(prompt)
+        grid_cols = len(prompt[0])
+        if not all(len(row) == grid_cols for row in prompt):
+            raise ValueError("All prompt rows must have the same number of prompt columns")
+        if not isinstance(seed_tiles_mode, str) and (
+            not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode)
+        ):
+            raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}")
+        if isinstance(seed_tiles_mode, str):
+            seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt]
+
+        modes = [mode.value for mode in self.SeedTilesMode]
+        if any(mode not in modes for row in seed_tiles_mode for mode in row):
+            raise ValueError(f"Seed tiles mode must be one of {modes}")
+        if seed_reroll_regions is None:
+            seed_reroll_regions = []
+        batch_size = 1
+
+        # create original noisy latents using the timesteps
+        height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap)
+        width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap)
+        latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8)
+        generator = torch.Generator("cuda").manual_seed(seed)
+        latents = torch.randn(latents_shape, generator=generator, device=self.device)
+
+        # overwrite latents for specific tiles if provided
+        if seed_tiles is not None:
+            for row in range(grid_rows):
+                for col in range(grid_cols):
+                    if (seed_tile := seed_tiles[row][col]) is not None:
+                        mode = seed_tiles_mode[row][col]
+                        if mode == self.SeedTilesMode.FULL.value:
+                            row_init, row_end, col_init, col_end = _tile2latent_indices(
+                                row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+                            )
+                        else:
+                            row_init, row_end, col_init, col_end = _tile2latent_exclusive_indices(
+                                row,
+                                col,
+                                tile_width,
+                                tile_height,
+                                tile_row_overlap,
+                                tile_col_overlap,
+                                grid_rows,
+                                grid_cols,
+                            )
+                        tile_generator = torch.Generator("cuda").manual_seed(seed_tile)
+                        tile_shape = (latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init)
+                        latents[:, :, row_init:row_end, col_init:col_end] = torch.randn(
+                            tile_shape, generator=tile_generator, device=self.device
+                        )
+
+        # overwrite again for seed reroll regions
+        for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions:
+            row_init, row_end, col_init, col_end = _pixel2latent_indices(
+                row_init, row_end, col_init, col_end
+            )  # to latent space coordinates
+            reroll_generator = torch.Generator("cuda").manual_seed(seed_reroll)
+            region_shape = (latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init)
+            latents[:, :, row_init:row_end, col_init:col_end] = torch.randn(
+                region_shape, generator=reroll_generator, device=self.device
+            )
+
+        # Prepare scheduler
+        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+        # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            latents = latents * self.scheduler.sigmas[0]
+
+        # get prompts text embeddings
+        text_input = [
+            [
+                self.tokenizer(
+                    col,
+                    padding="max_length",
+                    max_length=self.tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                for col in row
+            ]
+            for row in prompt
+        ]
+        text_embeddings = [[self.text_encoder(col.input_ids.to(self.device))[0] for col in row] for row in text_input]
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0  # TODO: also active if any tile has guidance scale
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            for i in range(grid_rows):
+                for j in range(grid_cols):
+                    max_length = text_input[i][j].input_ids.shape[-1]
+                    uncond_input = self.tokenizer(
+                        [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
+                    )
+                    uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+                    # For classifier free guidance, we need to do two forward passes.
+                    # Here we concatenate the unconditional and text embeddings into a single batch
+                    # to avoid doing two forward passes
+                    text_embeddings[i][j] = torch.cat([uncond_embeddings, text_embeddings[i][j]])
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # Mask for tile weights strength
+        tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size)
+
+        # Diffusion timesteps
+        for i, t in tqdm(enumerate(self.scheduler.timesteps)):
+            # Diffuse each tile
+            noise_preds = []
+            for row in range(grid_rows):
+                noise_preds_row = []
+                for col in range(grid_cols):
+                    px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices(
+                        row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+                    )
+                    tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end]
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([tile_latents] * 2) if do_classifier_free_guidance else tile_latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                    # predict the noise residual
+                    noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings[row][col])[
+                        "sample"
+                    ]
+                    # perform guidance
+                    if do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        guidance = (
+                            guidance_scale
+                            if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None
+                            else guidance_scale_tiles[row][col]
+                        )
+                        noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond)
+                        noise_preds_row.append(noise_pred_tile)
+                noise_preds.append(noise_preds_row)
+            # Stitch noise predictions for all tiles
+            noise_pred = torch.zeros(latents.shape, device=self.device)
+            contributors = torch.zeros(latents.shape, device=self.device)
+            # Add each tile contribution to overall latents
+            for row in range(grid_rows):
+                for col in range(grid_cols):
+                    px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices(
+                        row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+                    )
+                    noise_pred[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += (
+                        noise_preds[row][col] * tile_weights
+                    )
+                    contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights
+            # Average overlapping areas with more than 1 contributor
+            noise_pred /= contributors
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+
+        # scale and decode the image latents with vae
+        image = self.decode_latents(latents, cpu_vae)
+
+        return {"images": image}
+
+    def _gaussian_weights(self, tile_width, tile_height, nbatches):
+        """Generates a gaussian mask of weights for tile contributions"""
+        import numpy as np
+        from numpy import exp, pi, sqrt
+
+        latent_width = tile_width // 8
+        latent_height = tile_height // 8
+
+        var = 0.01
+        midpoint = (latent_width - 1) / 2  # -1 because index goes from 0 to latent_width - 1
+        x_probs = [
+            exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var)
+            for x in range(latent_width)
+        ]
+        midpoint = latent_height / 2
+        y_probs = [
+            exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var)
+            for y in range(latent_height)
+        ]
+
+        weights = np.outer(y_probs, x_probs)
+        return torch.tile(torch.tensor(weights, device=self.device), (nbatches, self.unet.config.in_channels, 1, 1))
diff --git a/diffusers/examples/community/multilingual_stable_diffusion.py b/diffusers/examples/community/multilingual_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc335e0b585e13158e1bfd9b513294a83193ec79
--- /dev/null
+++ b/diffusers/examples/community/multilingual_stable_diffusion.py
@@ -0,0 +1,410 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    MBart50TokenizerFast,
+    MBartForConditionalGeneration,
+    pipeline,
+)
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def detect_language(pipe, prompt, batch_size):
+    """helper function to detect language(s) of prompt"""
+
+    if batch_size == 1:
+        preds = pipe(prompt, top_k=1, truncation=True, max_length=128)
+        return preds[0]["label"]
+    else:
+        detected_languages = []
+        for p in prompt:
+            preds = pipe(p, top_k=1, truncation=True, max_length=128)
+            detected_languages.append(preds[0]["label"])
+
+        return detected_languages
+
+
+def translate_prompt(prompt, translation_tokenizer, translation_model, device):
+    """helper function to translate prompt to English"""
+
+    encoded_prompt = translation_tokenizer(prompt, return_tensors="pt").to(device)
+    generated_tokens = translation_model.generate(**encoded_prompt, max_new_tokens=1000)
+    en_trans = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+
+    return en_trans[0]
+
+
+class MultilingualStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion in different languages.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        detection_pipeline ([`pipeline`]):
+            Transformers pipeline to detect prompt's language.
+        translation_model ([`MBartForConditionalGeneration`]):
+            Model to translate prompt to English, if necessary. Please refer to the
+            [model card](https://huggingface.co/docs/transformers/model_doc/mbart) for details.
+        translation_tokenizer ([`MBart50TokenizerFast`]):
+            Tokenizer of the translation model.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        detection_pipeline: pipeline,
+        translation_model: MBartForConditionalGeneration,
+        translation_tokenizer: MBart50TokenizerFast,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            detection_pipeline=detection_pipeline,
+            translation_model=translation_model,
+            translation_tokenizer=translation_tokenizer,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation. Can be in different languages.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # detect language and translate if necessary
+        prompt_language = detect_language(self.detection_pipeline, prompt, batch_size)
+        if batch_size == 1 and prompt_language != "en":
+            prompt = translate_prompt(prompt, self.translation_tokenizer, self.translation_model, self.device)
+
+        if isinstance(prompt, list):
+            for index in range(batch_size):
+                if prompt_language[index] != "en":
+                    p = translate_prompt(
+                        prompt[index], self.translation_tokenizer, self.translation_model, self.device
+                    )
+                    prompt[index] = p
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                # detect language and translate it if necessary
+                negative_prompt_language = detect_language(self.detection_pipeline, negative_prompt, batch_size)
+                if negative_prompt_language != "en":
+                    negative_prompt = translate_prompt(
+                        negative_prompt, self.translation_tokenizer, self.translation_model, self.device
+                    )
+                if isinstance(negative_prompt, str):
+                    uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                # detect language and translate it if necessary
+                if isinstance(negative_prompt, list):
+                    negative_prompt_languages = detect_language(self.detection_pipeline, negative_prompt, batch_size)
+                    for index in range(batch_size):
+                        if negative_prompt_languages[index] != "en":
+                            p = translate_prompt(
+                                negative_prompt[index], self.translation_tokenizer, self.translation_model, self.device
+                            )
+                            negative_prompt[index] = p
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/one_step_unet.py b/diffusers/examples/community/one_step_unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d34bfd83191d63483bc562cb54cc887660cdffa
--- /dev/null
+++ b/diffusers/examples/community/one_step_unet.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+import torch
+
+from diffusers import DiffusionPipeline
+
+
+class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+    def __init__(self, unet, scheduler):
+        super().__init__()
+
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    def __call__(self):
+        image = torch.randn(
+            (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
+        )
+        timestep = 1
+
+        model_output = self.unet(image, timestep).sample
+        scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
+
+        result = scheduler_output - scheduler_output + torch.ones_like(scheduler_output)
+
+        return result
diff --git a/diffusers/examples/community/pipeline_animatediff_controlnet.py b/diffusers/examples/community/pipeline_animatediff_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..bedf002d024c575d2ffb3aa8839b9b63559d2cf4
--- /dev/null
+++ b/diffusers/examples/community/pipeline_animatediff_controlnet.py
@@ -0,0 +1,1129 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.models.unets.unet_motion_model import MotionAdapter
+from diffusers.pipelines.animatediff.pipeline_output import AnimateDiffPipelineOutput
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter
+        >>> from diffusers.pipelines import DiffusionPipeline
+        >>> from diffusers.schedulers import DPMSolverMultistepScheduler
+        >>> from PIL import Image
+
+        >>> motion_id = "guoyww/animatediff-motion-adapter-v1-5-2"
+        >>> adapter = MotionAdapter.from_pretrained(motion_id)
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16)
+        >>> vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+
+        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     model_id,
+        ...     motion_adapter=adapter,
+        ...     controlnet=controlnet,
+        ...     vae=vae,
+        ...     custom_pipeline="pipeline_animatediff_controlnet",
+        ... ).to(device="cuda", dtype=torch.float16)
+        >>> pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained(
+        ...     model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1, beta_schedule="linear",
+        ... )
+        >>> pipe.enable_vae_slicing()
+
+        >>> conditioning_frames = []
+        >>> for i in range(1, 16 + 1):
+        ...     conditioning_frames.append(Image.open(f"frame_{i}.png"))
+
+        >>> prompt = "astronaut in space, dancing"
+        >>> negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly"
+        >>> result = pipe(
+        ...     prompt=prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     width=512,
+        ...     height=768,
+        ...     conditioning_frames=conditioning_frames,
+        ...     num_inference_steps=12,
+        ... )
+
+        >>> from diffusers.utils import export_to_gif
+        >>> export_to_gif(result.frames[0], "result.gif")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
+def tensor2vid(video: torch.Tensor, processor, output_type="np"):
+    batch_size, channels, num_frames, height, width = video.shape
+    outputs = []
+    for batch_idx in range(batch_size):
+        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
+        batch_output = processor.postprocess(batch_vid, output_type)
+
+        outputs.append(batch_output)
+
+    if output_type == "np":
+        outputs = np.stack(outputs)
+
+    elif output_type == "pt":
+        outputs = torch.stack(outputs)
+
+    elif not output_type == "pil":
+        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
+
+    return outputs
+
+
+class AnimateDiffControlNetPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+):
+    r"""
+    Pipeline for text-to-video generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
+        motion_adapter ([`MotionAdapter`]):
+            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        motion_adapter: MotionAdapter,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        feature_extractor: Optional[CLIPImageProcessor] = None,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+    ):
+        super().__init__()
+        unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if self.do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            image_embeds = ip_adapter_image_embeds
+        return image_embeds
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        image = self.vae.decode(latents).sample
+        video = (
+            image[None, :]
+            .reshape(
+                (
+                    batch_size,
+                    num_frames,
+                    -1,
+                )
+                + image.shape[2:]
+            )
+            .permute(0, 2, 1, 3, 4)
+        )
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        num_frames,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        image=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError(f"For single controlnet, `image` must be of type `list` but got {type(image)}")
+            if len(image) != num_frames:
+                raise ValueError(f"Excepted image to have length {num_frames} but got {len(image)=}")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list) or not isinstance(image[0], list):
+                raise TypeError(f"For multiple controlnets: `image` must be type list of lists but got {type(image)=}")
+            if len(image[0]) != num_frames:
+                raise ValueError(f"Expected length of image sublist as {num_frames} but got {len(image[0])=}")
+            if any(len(img) != len(image[0]) for img in image):
+                raise ValueError("All conditioning frame batches for multicontrolnet must be same size")
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_frames: Optional[int] = 16,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[PipelineImageInput] = None,
+        conditioning_frames: Optional[List[PipelineImageInput]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_frames (`int`, *optional*, defaults to 16):
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+                amounts to 2 seconds of video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
+                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
+                if `do_classifier_free_guidance` is set to `True`.
+                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            conditioning_frames (`List[PipelineImageInput]`, *optional*):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If multiple ControlNets
+                are specified, images must be passed as a list such that each element of the list can be correctly
+                batched for input to a single ControlNet.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or
+                `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_frames=num_frames,
+            callback_steps=callback_steps,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            image=conditioning_frames,
+            controlnet_conditioning_scale=controlnet_conditioning_scale,
+            control_guidance_start=control_guidance_start,
+            control_guidance_end=control_guidance_end,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_videos_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt
+            )
+
+        if isinstance(controlnet, ControlNetModel):
+            conditioning_frames = self.prepare_image(
+                image=conditioning_frames,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_videos_per_prompt * num_frames,
+                num_images_per_prompt=num_videos_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif isinstance(controlnet, MultiControlNetModel):
+            cond_prepared_frames = []
+            for frame_ in conditioning_frames:
+                prepared_frame = self.prepare_image(
+                    image=frame_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_videos_per_prompt * num_frames,
+                    num_images_per_prompt=num_videos_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+                cond_prepared_frames.append(prepared_frame)
+            conditioning_frames = cond_prepared_frames
+        else:
+            assert False
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 7.1 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+                controlnet_prompt_embeds = controlnet_prompt_embeds.repeat_interleave(num_frames, dim=0)
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                control_model_input = torch.transpose(control_model_input, 1, 2)
+                control_model_input = control_model_input.reshape(
+                    (-1, control_model_input.shape[2], control_model_input.shape[3], control_model_input.shape[4])
+                )
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=conditioning_frames,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                ).sample
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+
+        # 9. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents)
+            video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+
+        # 10. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
diff --git a/diffusers/examples/community/pipeline_animatediff_img2video.py b/diffusers/examples/community/pipeline_animatediff_img2video.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a578d4b8ef6d251c327f2e2d3cd9d1b2845e6a3
--- /dev/null
+++ b/diffusers/examples/community/pipeline_animatediff_img2video.py
@@ -0,0 +1,984 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Note:
+# This pipeline relies on a "hack" discovered by the community that allows
+# the generation of videos given an input image with AnimateDiff. It works
+# by creating a copy of the image `num_frames` times and progressively adding
+# more noise to the image based on the strength and latent interpolation method.
+
+import inspect
+from types import FunctionType
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.models.unet_motion_model import MotionAdapter
+from diffusers.pipelines.animatediff.pipeline_output import AnimateDiffPipelineOutput
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler
+        >>> from diffusers.utils import export_to_gif, load_image
+
+        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
+        >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
+        >>> pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda")
+        >>> pipe.scheduler = pipe.scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1)
+
+        >>> image = load_image("snail.png")
+        >>> output = pipe(image=image, prompt="A snail moving on the ground", strength=0.8, latent_interpolation_method="slerp")
+        >>> frames = output.frames[0]
+        >>> export_to_gif(frames, "animation.gif")
+        ```
+"""
+
+
+def lerp(
+    v0: torch.Tensor,
+    v1: torch.Tensor,
+    t: Union[float, torch.Tensor],
+) -> torch.Tensor:
+    r"""
+    Linear Interpolation between two tensors.
+
+    Args:
+        v0 (`torch.Tensor`): First tensor.
+        v1 (`torch.Tensor`): Second tensor.
+        t: (`float` or `torch.Tensor`): Interpolation factor.
+    """
+    t_is_float = False
+    input_device = v0.device
+    v0 = v0.cpu().numpy()
+    v1 = v1.cpu().numpy()
+
+    if isinstance(t, torch.Tensor):
+        t = t.cpu().numpy()
+    else:
+        t_is_float = True
+        t = np.array([t], dtype=v0.dtype)
+
+    t = t[..., None]
+    v0 = v0[None, ...]
+    v1 = v1[None, ...]
+    v2 = (1 - t) * v0 + t * v1
+
+    if t_is_float and v0.ndim > 1:
+        assert v2.shape[0] == 1
+        v2 = np.squeeze(v2, axis=0)
+
+    v2 = torch.from_numpy(v2).to(input_device)
+    return v2
+
+
+def slerp(
+    v0: torch.Tensor,
+    v1: torch.Tensor,
+    t: Union[float, torch.Tensor],
+    DOT_THRESHOLD: float = 0.9995,
+) -> torch.Tensor:
+    r"""
+    Spherical Linear Interpolation between two tensors.
+
+    Args:
+        v0 (`torch.Tensor`): First tensor.
+        v1 (`torch.Tensor`): Second tensor.
+        t: (`float` or `torch.Tensor`): Interpolation factor.
+        DOT_THRESHOLD (`float`):
+            Dot product threshold exceeding which linear interpolation will be used
+            because input tensors are close to parallel.
+    """
+    t_is_float = False
+    input_device = v0.device
+    v0 = v0.cpu().numpy()
+    v1 = v1.cpu().numpy()
+
+    if isinstance(t, torch.Tensor):
+        t = t.cpu().numpy()
+    else:
+        t_is_float = True
+        t = np.array([t], dtype=v0.dtype)
+
+    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
+
+    if np.abs(dot) > DOT_THRESHOLD:
+        # v0 and v1 are close to parallel, so use linear interpolation instead
+        v2 = lerp(v0, v1, t)
+    else:
+        theta_0 = np.arccos(dot)
+        sin_theta_0 = np.sin(theta_0)
+        theta_t = theta_0 * t
+        sin_theta_t = np.sin(theta_t)
+        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
+        s1 = sin_theta_t / sin_theta_0
+        s0 = s0[..., None]
+        s1 = s1[..., None]
+        v0 = v0[None, ...]
+        v1 = v1[None, ...]
+        v2 = s0 * v0 + s1 * v1
+
+    if t_is_float and v0.ndim > 1:
+        assert v2.shape[0] == 1
+        v2 = np.squeeze(v2, axis=0)
+
+    v2 = torch.from_numpy(v2).to(input_device)
+    return v2
+
+
+# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
+def tensor2vid(video: torch.Tensor, processor, output_type="np"):
+    batch_size, channels, num_frames, height, width = video.shape
+    outputs = []
+    for batch_idx in range(batch_size):
+        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
+        batch_output = processor.postprocess(batch_vid, output_type)
+
+        outputs.append(batch_output)
+
+    if output_type == "np":
+        outputs = np.stack(outputs)
+
+    elif output_type == "pt":
+        outputs = torch.stack(outputs)
+
+    elif not output_type == "pil":
+        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
+
+    return outputs
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class AnimateDiffImgToVideoPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+):
+    r"""
+    Pipeline for image-to-video generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
+        motion_adapter ([`MotionAdapter`]):
+            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        motion_adapter: MotionAdapter,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+    ):
+        super().__init__()
+        unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if self.do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            image_embeds = ip_adapter_image_embeds
+        return image_embeds
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        image = self.vae.decode(latents).sample
+        video = (
+            image[None, :]
+            .reshape(
+                (
+                    batch_size,
+                    num_frames,
+                    -1,
+                )
+                + image.shape[2:]
+            )
+            .permute(0, 2, 1, 3, 4)
+        )
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        latent_interpolation_method=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if latent_interpolation_method is not None:
+            if latent_interpolation_method not in ["lerp", "slerp"] and not isinstance(
+                latent_interpolation_method, FunctionType
+            ):
+                raise ValueError(
+                    "`latent_interpolation_method` must be one of `lerp`, `slerp` or a Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]"
+                )
+
+    def prepare_latents(
+        self,
+        image,
+        strength,
+        batch_size,
+        num_channels_latents,
+        num_frames,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        latent_interpolation_method="slerp",
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+
+        if latents is None:
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 4:
+                latents = image
+            else:
+                # make sure the VAE is in float32 mode, as it overflows in float16
+                if self.vae.config.force_upcast:
+                    image = image.float()
+                    self.vae.to(dtype=torch.float32)
+
+                if isinstance(generator, list):
+                    if len(generator) != batch_size:
+                        raise ValueError(
+                            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                        )
+
+                    init_latents = [
+                        retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                        for i in range(batch_size)
+                    ]
+                    init_latents = torch.cat(init_latents, dim=0)
+                else:
+                    init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+                if self.vae.config.force_upcast:
+                    self.vae.to(dtype)
+
+                init_latents = init_latents.to(dtype)
+                init_latents = self.vae.config.scaling_factor * init_latents
+                latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                latents = latents * self.scheduler.init_noise_sigma
+
+                if latent_interpolation_method == "lerp":
+
+                    def latent_cls(v0, v1, index):
+                        return lerp(v0, v1, index / num_frames * (1 - strength))
+                elif latent_interpolation_method == "slerp":
+
+                    def latent_cls(v0, v1, index):
+                        return slerp(v0, v1, index / num_frames * (1 - strength))
+                else:
+                    latent_cls = latent_interpolation_method
+
+                for i in range(num_frames):
+                    latents[:, :, i, :, :] = latent_cls(latents[:, :, i, :, :], init_latents, i)
+        else:
+            if shape != latents.shape:
+                # [B, C, F, H, W]
+                raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}")
+            latents = latents.to(device, dtype=dtype)
+
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_frames: int = 16,
+        num_inference_steps: int = 50,
+        timesteps: Optional[List[int]] = None,
+        guidance_scale: float = 7.5,
+        strength: float = 0.8,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        latent_interpolation_method: Union[str, Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]] = "slerp",
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`PipelineImageInput`):
+                The input image to condition the generation on.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_frames (`int`, *optional*, defaults to 16):
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+                amounts to 2 seconds of video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            strength (`float`, *optional*, defaults to 0.8):
+                Higher strength leads to more differences between original image and generated video.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
+                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
+                if `do_classifier_free_guidance` is set to `True`.
+                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or
+                `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`AnimateDiffImgToVideoPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            latent_interpolation_method (`str` or `Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]]`, *optional*):
+                Must be one of "lerp", "slerp" or a callable that takes in a random noisy latent, image latent and a frame index
+                as input and returns an initial latent for sampling.
+        Examples:
+
+        Returns:
+            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            height=height,
+            width=width,
+            callback_steps=callback_steps,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            latent_interpolation_method=latent_interpolation_method,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_videos_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt
+            )
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image, height=height, width=width)
+
+        # 5. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            image=image,
+            strength=strength,
+            batch_size=batch_size * num_videos_per_prompt,
+            num_channels_latents=num_channels_latents,
+            num_frames=num_frames,
+            height=height,
+            width=width,
+            dtype=prompt_embeds.dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+            latent_interpolation_method=latent_interpolation_method,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+
+        if output_type == "latent":
+            return AnimateDiffPipelineOutput(frames=latents)
+
+        # 10. Post-processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents)
+            video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+
+        # 11. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
diff --git a/diffusers/examples/community/pipeline_animatediff_ipex.py b/diffusers/examples/community/pipeline_animatediff_ipex.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc65e76bc43b03f7f8315a924e1a7137c5487cfd
--- /dev/null
+++ b/diffusers/examples/community/pipeline_animatediff_ipex.py
@@ -0,0 +1,1002 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import intel_extension_for_pytorch as ipex
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.image_processor import PipelineImageInput
+from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.models.unets.unet_motion_model import MotionAdapter
+from diffusers.pipelines.animatediff.pipeline_output import AnimateDiffPipelineOutput
+from diffusers.pipelines.free_init_utils import FreeInitMixin
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.video_processor import VideoProcessor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import MotionAdapter, AnimateDiffPipelineIpex, EulerDiscreteScheduler
+        >>> from diffusers.utils import export_to_gif
+        >>> from safetensors.torch import load_file
+
+        >>> device = "cpu"
+        >>> dtype = torch.float32
+
+        >>> # ByteDance/AnimateDiff-Lightning, a distilled version of AnimateDiff SD1.5 v2,
+        >>> # a lightning-fast text-to-video generation model which can generate videos
+        >>> # more than ten times faster than the original AnimateDiff.
+        >>> step = 8  # Options: [1,2,4,8]
+        >>> repo = "ByteDance/AnimateDiff-Lightning"
+        >>> ckpt = f"animatediff_lightning_{step}step_diffusers.safetensors"
+        >>> base = "emilianJR/epiCRealism"  # Choose to your favorite base model.
+
+        >>> adapter = MotionAdapter().to(device, dtype)
+        >>> adapter.load_state_dict(load_file(hf_hub_download(repo ,ckpt), device=device))
+
+        >>> pipe = AnimateDiffPipelineIpex.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
+        >>> pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing", beta_schedule="linear")
+
+        >>> # For Float32
+        >>> pipe.prepare_for_ipex(torch.float32, prompt = "A girl smiling")
+        >>> # For BFloat16
+        >>> pipe.prepare_for_ipex(torch.bfloat16, prompt = "A girl smiling")
+
+        >>> # For Float32
+        >>> output = pipe(prompt = "A girl smiling", guidance_scale=1.0, num_inference_steps = step)
+        >>> # For BFloat16
+        >>> with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+        >>>     output = pipe(prompt = "A girl smiling", guidance_scale=1.0, num_inference_steps = step)
+
+        >>> frames = output.frames[0]
+        >>> export_to_gif(frames, "animation.gif")
+        ```
+"""
+
+
+class AnimateDiffPipelineIpex(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    LoraLoaderMixin,
+    FreeInitMixin,
+):
+    r"""
+    Pipeline for text-to-video generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
+        motion_adapter ([`MotionAdapter`]):
+            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: Union[UNet2DConditionModel, UNetMotionModel],
+        motion_adapter: MotionAdapter,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+    ):
+        super().__init__()
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            repeat_dims = [1]
+            image_embeds = []
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                    )
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                else:
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                image_embeds.append(single_image_embeds)
+
+        return image_embeds
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        image = self.vae.decode(latents).sample
+        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=torch.float32)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_frames: Optional[int] = 16,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_frames (`int`, *optional*, defaults to 16):
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+                amounts to 2 seconds of video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_videos_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+        for free_init_iter in range(num_free_init_iters):
+            if self.free_init_enabled:
+                latents, timesteps = self._apply_free_init(
+                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                )
+
+            self._num_timesteps = len(timesteps)
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+            # 8. Denoising loop
+            with self.progress_bar(total=self._num_timesteps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        # cross_attention_kwargs=cross_attention_kwargs,
+                        # added_cond_kwargs=added_cond_kwargs,
+                        # ).sample
+                    )["sample"]
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+
+        # 9. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 10. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
+
+    @torch.no_grad()
+    def prepare_for_ipex(
+        self,
+        dtype=torch.float32,
+        prompt: Union[str, List[str]] = None,
+        num_frames: Optional[int] = 16,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_videos_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+        for free_init_iter in range(num_free_init_iters):
+            if self.free_init_enabled:
+                latents, timesteps = self._apply_free_init(
+                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                )
+
+        self._num_timesteps = len(timesteps)
+
+        dummy = timesteps[0]
+        latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+        latent_model_input = self.scheduler.scale_model_input(latent_model_input, dummy)
+
+        self.unet = self.unet.to(memory_format=torch.channels_last)
+        self.vae.decoder = self.vae.decoder.to(memory_format=torch.channels_last)
+        self.text_encoder = self.text_encoder.to(memory_format=torch.channels_last)
+
+        unet_input_example = {
+            "sample": latent_model_input,
+            "timestep": dummy,
+            "encoder_hidden_states": prompt_embeds,
+        }
+
+        fake_latents = 1 / self.vae.config.scaling_factor * latents
+        batch_size, channels, num_frames, height, width = fake_latents.shape
+        fake_latents = fake_latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+        vae_decoder_input_example = fake_latents
+
+        # optimize with ipex
+        if dtype == torch.bfloat16:
+            self.unet = ipex.optimize(self.unet.eval(), dtype=torch.bfloat16, inplace=True)
+            self.vae.decoder = ipex.optimize(self.vae.decoder.eval(), dtype=torch.bfloat16, inplace=True)
+            self.text_encoder = ipex.optimize(self.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
+        elif dtype == torch.float32:
+            self.unet = ipex.optimize(
+                self.unet.eval(),
+                dtype=torch.float32,
+                inplace=True,
+                # sample_input=unet_input_example,
+                level="O1",
+                weights_prepack=True,
+                auto_kernel_selection=False,
+            )
+            self.vae.decoder = ipex.optimize(
+                self.vae.decoder.eval(),
+                dtype=torch.float32,
+                inplace=True,
+                level="O1",
+                weights_prepack=True,
+                auto_kernel_selection=False,
+            )
+            self.text_encoder = ipex.optimize(
+                self.text_encoder.eval(),
+                dtype=torch.float32,
+                inplace=True,
+                level="O1",
+                weights_prepack=True,
+                auto_kernel_selection=False,
+            )
+        else:
+            raise ValueError(" The value of 'dtype' should be 'torch.bfloat16' or 'torch.float32' !")
+
+        # trace unet model to get better performance on IPEX
+        with torch.cpu.amp.autocast(enabled=dtype == torch.bfloat16), torch.no_grad():
+            unet_trace_model = torch.jit.trace(
+                self.unet, example_kwarg_inputs=unet_input_example, check_trace=False, strict=False
+            )
+            unet_trace_model = torch.jit.freeze(unet_trace_model)
+            self.unet.forward = unet_trace_model.forward
+
+        # trace vae.decoder model to get better performance on IPEX
+        with torch.cpu.amp.autocast(enabled=dtype == torch.bfloat16), torch.no_grad():
+            vae_decoder_trace_model = torch.jit.trace(
+                self.vae.decoder, vae_decoder_input_example, check_trace=False, strict=False
+            )
+            vae_decoder_trace_model = torch.jit.freeze(vae_decoder_trace_model)
+            self.vae.decoder.forward = vae_decoder_trace_model.forward
diff --git a/diffusers/examples/community/pipeline_demofusion_sdxl.py b/diffusers/examples/community/pipeline_demofusion_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..f83d1b4014200681d8e96d920b588c8fc228f9e3
--- /dev/null
+++ b/diffusers/examples/community/pipeline_demofusion_sdxl.py
@@ -0,0 +1,1386 @@
+import inspect
+import os
+import random
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import matplotlib.pyplot as plt
+import torch
+import torch.nn.functional as F
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    is_accelerate_available,
+    is_accelerate_version,
+    is_invisible_watermark_available,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+if is_invisible_watermark_available():
+    from diffusers.pipelines.stable_diffusion_xl.watermark import (
+        StableDiffusionXLWatermarker,
+    )
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionXLPipeline
+
+        >>> pipe = StableDiffusionXLPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+def gaussian_kernel(kernel_size=3, sigma=1.0, channels=3):
+    x_coord = torch.arange(kernel_size)
+    gaussian_1d = torch.exp(-((x_coord - (kernel_size - 1) / 2) ** 2) / (2 * sigma**2))
+    gaussian_1d = gaussian_1d / gaussian_1d.sum()
+    gaussian_2d = gaussian_1d[:, None] * gaussian_1d[None, :]
+    kernel = gaussian_2d[None, None, :, :].repeat(channels, 1, 1, 1)
+
+    return kernel
+
+
+def gaussian_filter(latents, kernel_size=3, sigma=1.0):
+    channels = latents.shape[1]
+    kernel = gaussian_kernel(kernel_size, sigma, channels).to(latents.device, latents.dtype)
+    blurred_latents = F.conv2d(latents, kernel, padding=kernel_size // 2, groups=channels)
+
+    return blurred_latents
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class DemoFusionSDXLPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    FromSingleFileMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    In addition the pipeline inherits the following loading methods:
+        - *LoRA*: [`StableDiffusionXLPipeline.load_lora_weights`]
+        - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
+
+    as well as the following saving methods:
+        - *LoRA*: [`loaders.StableDiffusionXLPipeline.save_lora_weights`]
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.default_sample_size = self.unet.config.sample_size
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(
+                    text_input_ids.to(device),
+                    output_hidden_states=True,
+                )
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                prompt_embeds = prompt_embeds.hidden_states[-2]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        num_images_per_prompt=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        # DemoFusion specific checks
+        if max(height, width) % 1024 != 0:
+            raise ValueError(
+                f"the larger one of `height` and `width` has to be divisible by 1024 but are {height} and {width}."
+            )
+
+        if num_images_per_prompt != 1:
+            warnings.warn("num_images_per_prompt != 1 is not supported by DemoFusion and will be ignored.")
+            num_images_per_prompt = 1
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    def get_views(self, height, width, window_size=128, stride=64, random_jitter=False):
+        height //= self.vae_scale_factor
+        width //= self.vae_scale_factor
+        num_blocks_height = int((height - window_size) / stride - 1e-6) + 2 if height > window_size else 1
+        num_blocks_width = int((width - window_size) / stride - 1e-6) + 2 if width > window_size else 1
+        total_num_blocks = int(num_blocks_height * num_blocks_width)
+        views = []
+        for i in range(total_num_blocks):
+            h_start = int((i // num_blocks_width) * stride)
+            h_end = h_start + window_size
+            w_start = int((i % num_blocks_width) * stride)
+            w_end = w_start + window_size
+
+            if h_end > height:
+                h_start = int(h_start + height - h_end)
+                h_end = int(height)
+            if w_end > width:
+                w_start = int(w_start + width - w_end)
+                w_end = int(width)
+            if h_start < 0:
+                h_end = int(h_end - h_start)
+                h_start = 0
+            if w_start < 0:
+                w_end = int(w_end - w_start)
+                w_start = 0
+
+            if random_jitter:
+                jitter_range = (window_size - stride) // 4
+                w_jitter = 0
+                h_jitter = 0
+                if (w_start != 0) and (w_end != width):
+                    w_jitter = random.randint(-jitter_range, jitter_range)
+                elif (w_start == 0) and (w_end != width):
+                    w_jitter = random.randint(-jitter_range, 0)
+                elif (w_start != 0) and (w_end == width):
+                    w_jitter = random.randint(0, jitter_range)
+                if (h_start != 0) and (h_end != height):
+                    h_jitter = random.randint(-jitter_range, jitter_range)
+                elif (h_start == 0) and (h_end != height):
+                    h_jitter = random.randint(-jitter_range, 0)
+                elif (h_start != 0) and (h_end == height):
+                    h_jitter = random.randint(0, jitter_range)
+                h_start += h_jitter + jitter_range
+                h_end += h_jitter + jitter_range
+                w_start += w_jitter + jitter_range
+                w_end += w_jitter + jitter_range
+
+            views.append((h_start, h_end, w_start, w_end))
+        return views
+
+    def tiled_decode(self, latents, current_height, current_width):
+        core_size = self.unet.config.sample_size // 4
+        core_stride = core_size
+        pad_size = self.unet.config.sample_size // 4 * 3
+        decoder_view_batch_size = 1
+
+        views = self.get_views(current_height, current_width, stride=core_stride, window_size=core_size)
+        views_batch = [views[i : i + decoder_view_batch_size] for i in range(0, len(views), decoder_view_batch_size)]
+        latents_ = F.pad(latents, (pad_size, pad_size, pad_size, pad_size), "constant", 0)
+        image = torch.zeros(latents.size(0), 3, current_height, current_width).to(latents.device)
+        count = torch.zeros_like(image).to(latents.device)
+        # get the latents corresponding to the current view coordinates
+        with self.progress_bar(total=len(views_batch)) as progress_bar:
+            for j, batch_view in enumerate(views_batch):
+                len(batch_view)
+                latents_for_view = torch.cat(
+                    [
+                        latents_[:, :, h_start : h_end + pad_size * 2, w_start : w_end + pad_size * 2]
+                        for h_start, h_end, w_start, w_end in batch_view
+                    ]
+                )
+                image_patch = self.vae.decode(latents_for_view / self.vae.config.scaling_factor, return_dict=False)[0]
+                h_start, h_end, w_start, w_end = views[j]
+                h_start, h_end, w_start, w_end = (
+                    h_start * self.vae_scale_factor,
+                    h_end * self.vae_scale_factor,
+                    w_start * self.vae_scale_factor,
+                    w_end * self.vae_scale_factor,
+                )
+                p_h_start, p_h_end, p_w_start, p_w_end = (
+                    pad_size * self.vae_scale_factor,
+                    image_patch.size(2) - pad_size * self.vae_scale_factor,
+                    pad_size * self.vae_scale_factor,
+                    image_patch.size(3) - pad_size * self.vae_scale_factor,
+                )
+                image[:, :, h_start:h_end, w_start:w_end] += image_patch[:, :, p_h_start:p_h_end, p_w_start:p_w_end]
+                count[:, :, h_start:h_end, w_start:w_end] += 1
+                progress_bar.update()
+        image = image / count
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (AttnProcessor2_0, XFormersAttnProcessor),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = False,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        ################### DemoFusion specific parameters ####################
+        view_batch_size: int = 16,
+        multi_decoder: bool = True,
+        stride: Optional[int] = 64,
+        cosine_scale_1: Optional[float] = 3.0,
+        cosine_scale_2: Optional[float] = 1.0,
+        cosine_scale_3: Optional[float] = 1.0,
+        sigma: Optional[float] = 0.8,
+        show_image: bool = False,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.7):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            ################### DemoFusion specific parameters ####################
+            view_batch_size (`int`, defaults to 16):
+                The batch size for multiple denoising paths. Typically, a larger batch size can result in higher
+                efficiency but comes with increased GPU memory requirements.
+            multi_decoder (`bool`, defaults to True):
+                Determine whether to use a tiled decoder. Generally, when the resolution exceeds 3072x3072,
+                a tiled decoder becomes necessary.
+            stride (`int`, defaults to 64):
+                The stride of moving local patches. A smaller stride is better for alleviating seam issues,
+                but it also introduces additional computational overhead and inference time.
+            cosine_scale_1 (`float`, defaults to 3):
+                Control the strength of skip-residual. For specific impacts, please refer to Appendix C
+                in the DemoFusion paper.
+            cosine_scale_2 (`float`, defaults to 1):
+                Control the strength of dilated sampling. For specific impacts, please refer to Appendix C
+                in the DemoFusion paper.
+            cosine_scale_3 (`float`, defaults to 1):
+                Control the strength of the gaussian filter. For specific impacts, please refer to Appendix C
+                in the DemoFusion paper.
+            sigma (`float`, defaults to 1):
+                The standard value of the gaussian filter.
+            show_image (`bool`, defaults to False):
+                Determine whether to show intermediate results during generation.
+
+        Examples:
+
+        Returns:
+            a `list` with the generated images at each phase.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        x1_size = self.default_sample_size * self.vae_scale_factor
+
+        height_scale = height / x1_size
+        width_scale = width / x1_size
+        scale_num = int(max(height_scale, width_scale))
+        aspect_ratio = min(height_scale, width_scale) / max(height_scale, width_scale)
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            num_images_per_prompt,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height // scale_num,
+            width // scale_num,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        add_time_ids = self._get_add_time_ids(
+            original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 7.1 Apply denoising_end
+        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        output_images = []
+
+        ############################################################### Phase 1 #################################################################
+
+        print("### Phase 1 Denoising ###")
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                latents_for_view = latents
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = latents.repeat_interleave(2, dim=0) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2]
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+            anchor_mean = latents.mean()
+            anchor_std = latents.std()
+            if not output_type == "latent":
+                # make sure the VAE is in float32 mode, as it overflows in float16
+                needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+                if needs_upcasting:
+                    self.upcast_vae()
+                    latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+                print("### Phase 1 Decoding ###")
+                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+                # cast back to fp16 if needed
+                if needs_upcasting:
+                    self.vae.to(dtype=torch.float16)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+            if show_image:
+                plt.figure(figsize=(10, 10))
+                plt.imshow(image[0])
+                plt.axis("off")  # Turn off axis numbers and ticks
+                plt.show()
+            output_images.append(image[0])
+
+        ####################################################### Phase 2+ #####################################################
+
+        for current_scale_num in range(2, scale_num + 1):
+            print("### Phase {} Denoising ###".format(current_scale_num))
+            current_height = self.unet.config.sample_size * self.vae_scale_factor * current_scale_num
+            current_width = self.unet.config.sample_size * self.vae_scale_factor * current_scale_num
+            if height > width:
+                current_width = int(current_width * aspect_ratio)
+            else:
+                current_height = int(current_height * aspect_ratio)
+
+            latents = F.interpolate(
+                latents,
+                size=(int(current_height / self.vae_scale_factor), int(current_width / self.vae_scale_factor)),
+                mode="bicubic",
+            )
+
+            noise_latents = []
+            noise = torch.randn_like(latents)
+            for timestep in timesteps:
+                noise_latent = self.scheduler.add_noise(latents, noise, timestep.unsqueeze(0))
+                noise_latents.append(noise_latent)
+            latents = noise_latents[0]
+
+            with self.progress_bar(total=num_inference_steps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    count = torch.zeros_like(latents)
+                    value = torch.zeros_like(latents)
+                    cosine_factor = (
+                        0.5
+                        * (
+                            1
+                            + torch.cos(
+                                torch.pi
+                                * (self.scheduler.config.num_train_timesteps - t)
+                                / self.scheduler.config.num_train_timesteps
+                            )
+                        ).cpu()
+                    )
+
+                    c1 = cosine_factor**cosine_scale_1
+                    latents = latents * (1 - c1) + noise_latents[i] * c1
+
+                    ############################################# MultiDiffusion #############################################
+
+                    views = self.get_views(
+                        current_height,
+                        current_width,
+                        stride=stride,
+                        window_size=self.unet.config.sample_size,
+                        random_jitter=True,
+                    )
+                    views_batch = [views[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)]
+
+                    jitter_range = (self.unet.config.sample_size - stride) // 4
+                    latents_ = F.pad(latents, (jitter_range, jitter_range, jitter_range, jitter_range), "constant", 0)
+
+                    count_local = torch.zeros_like(latents_)
+                    value_local = torch.zeros_like(latents_)
+
+                    for j, batch_view in enumerate(views_batch):
+                        vb_size = len(batch_view)
+
+                        # get the latents corresponding to the current view coordinates
+                        latents_for_view = torch.cat(
+                            [
+                                latents_[:, :, h_start:h_end, w_start:w_end]
+                                for h_start, h_end, w_start, w_end in batch_view
+                            ]
+                        )
+
+                        # expand the latents if we are doing classifier free guidance
+                        latent_model_input = latents_for_view
+                        latent_model_input = (
+                            latent_model_input.repeat_interleave(2, dim=0)
+                            if do_classifier_free_guidance
+                            else latent_model_input
+                        )
+                        latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                        prompt_embeds_input = torch.cat([prompt_embeds] * vb_size)
+                        add_text_embeds_input = torch.cat([add_text_embeds] * vb_size)
+                        add_time_ids_input = []
+                        for h_start, h_end, w_start, w_end in batch_view:
+                            add_time_ids_ = add_time_ids.clone()
+                            add_time_ids_[:, 2] = h_start * self.vae_scale_factor
+                            add_time_ids_[:, 3] = w_start * self.vae_scale_factor
+                            add_time_ids_input.append(add_time_ids_)
+                        add_time_ids_input = torch.cat(add_time_ids_input)
+
+                        # predict the noise residual
+                        added_cond_kwargs = {"text_embeds": add_text_embeds_input, "time_ids": add_time_ids_input}
+                        noise_pred = self.unet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=prompt_embeds_input,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            added_cond_kwargs=added_cond_kwargs,
+                            return_dict=False,
+                        )[0]
+
+                        if do_classifier_free_guidance:
+                            noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2]
+                            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                        if do_classifier_free_guidance and guidance_rescale > 0.0:
+                            # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                            noise_pred = rescale_noise_cfg(
+                                noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
+                            )
+
+                        # compute the previous noisy sample x_t -> x_t-1
+                        self.scheduler._init_step_index(t)
+                        latents_denoised_batch = self.scheduler.step(
+                            noise_pred, t, latents_for_view, **extra_step_kwargs, return_dict=False
+                        )[0]
+
+                        # extract value from batch
+                        for latents_view_denoised, (h_start, h_end, w_start, w_end) in zip(
+                            latents_denoised_batch.chunk(vb_size), batch_view
+                        ):
+                            value_local[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised
+                            count_local[:, :, h_start:h_end, w_start:w_end] += 1
+
+                    value_local = value_local[
+                        :,
+                        :,
+                        jitter_range : jitter_range + current_height // self.vae_scale_factor,
+                        jitter_range : jitter_range + current_width // self.vae_scale_factor,
+                    ]
+                    count_local = count_local[
+                        :,
+                        :,
+                        jitter_range : jitter_range + current_height // self.vae_scale_factor,
+                        jitter_range : jitter_range + current_width // self.vae_scale_factor,
+                    ]
+
+                    c2 = cosine_factor**cosine_scale_2
+
+                    value += value_local / count_local * (1 - c2)
+                    count += torch.ones_like(value_local) * (1 - c2)
+
+                    ############################################# Dilated Sampling #############################################
+
+                    views = [[h, w] for h in range(current_scale_num) for w in range(current_scale_num)]
+                    views_batch = [views[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)]
+
+                    h_pad = (current_scale_num - (latents.size(2) % current_scale_num)) % current_scale_num
+                    w_pad = (current_scale_num - (latents.size(3) % current_scale_num)) % current_scale_num
+                    latents_ = F.pad(latents, (w_pad, 0, h_pad, 0), "constant", 0)
+
+                    count_global = torch.zeros_like(latents_)
+                    value_global = torch.zeros_like(latents_)
+
+                    c3 = 0.99 * cosine_factor**cosine_scale_3 + 1e-2
+                    std_, mean_ = latents_.std(), latents_.mean()
+                    latents_gaussian = gaussian_filter(
+                        latents_, kernel_size=(2 * current_scale_num - 1), sigma=sigma * c3
+                    )
+                    latents_gaussian = (
+                        latents_gaussian - latents_gaussian.mean()
+                    ) / latents_gaussian.std() * std_ + mean_
+
+                    for j, batch_view in enumerate(views_batch):
+                        latents_for_view = torch.cat(
+                            [latents_[:, :, h::current_scale_num, w::current_scale_num] for h, w in batch_view]
+                        )
+                        latents_for_view_gaussian = torch.cat(
+                            [latents_gaussian[:, :, h::current_scale_num, w::current_scale_num] for h, w in batch_view]
+                        )
+
+                        vb_size = latents_for_view.size(0)
+
+                        # expand the latents if we are doing classifier free guidance
+                        latent_model_input = latents_for_view_gaussian
+                        latent_model_input = (
+                            latent_model_input.repeat_interleave(2, dim=0)
+                            if do_classifier_free_guidance
+                            else latent_model_input
+                        )
+                        latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                        prompt_embeds_input = torch.cat([prompt_embeds] * vb_size)
+                        add_text_embeds_input = torch.cat([add_text_embeds] * vb_size)
+                        add_time_ids_input = torch.cat([add_time_ids] * vb_size)
+
+                        # predict the noise residual
+                        added_cond_kwargs = {"text_embeds": add_text_embeds_input, "time_ids": add_time_ids_input}
+                        noise_pred = self.unet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=prompt_embeds_input,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            added_cond_kwargs=added_cond_kwargs,
+                            return_dict=False,
+                        )[0]
+
+                        if do_classifier_free_guidance:
+                            noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2]
+                            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                        if do_classifier_free_guidance and guidance_rescale > 0.0:
+                            # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                            noise_pred = rescale_noise_cfg(
+                                noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
+                            )
+
+                        # compute the previous noisy sample x_t -> x_t-1
+                        self.scheduler._init_step_index(t)
+                        latents_denoised_batch = self.scheduler.step(
+                            noise_pred, t, latents_for_view, **extra_step_kwargs, return_dict=False
+                        )[0]
+
+                        # extract value from batch
+                        for latents_view_denoised, (h, w) in zip(latents_denoised_batch.chunk(vb_size), batch_view):
+                            value_global[:, :, h::current_scale_num, w::current_scale_num] += latents_view_denoised
+                            count_global[:, :, h::current_scale_num, w::current_scale_num] += 1
+
+                    c2 = cosine_factor**cosine_scale_2
+
+                    value_global = value_global[:, :, h_pad:, w_pad:]
+
+                    value += value_global * c2
+                    count += torch.ones_like(value_global) * c2
+
+                    ###########################################################
+
+                    latents = torch.where(count > 0, value / count, value)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        if callback is not None and i % callback_steps == 0:
+                            step_idx = i // getattr(self.scheduler, "order", 1)
+                            callback(step_idx, t, latents)
+
+                #########################################################################################################################################
+
+                latents = (latents - latents.mean()) / latents.std() * anchor_std + anchor_mean
+                if not output_type == "latent":
+                    # make sure the VAE is in float32 mode, as it overflows in float16
+                    needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+                    if needs_upcasting:
+                        self.upcast_vae()
+                        latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+                    print("### Phase {} Decoding ###".format(current_scale_num))
+                    if multi_decoder:
+                        image = self.tiled_decode(latents, current_height, current_width)
+                    else:
+                        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+                    # cast back to fp16 if needed
+                    if needs_upcasting:
+                        self.vae.to(dtype=torch.float16)
+                else:
+                    image = latents
+
+                if not output_type == "latent":
+                    image = self.image_processor.postprocess(image, output_type=output_type)
+                    if show_image:
+                        plt.figure(figsize=(10, 10))
+                        plt.imshow(image[0])
+                        plt.axis("off")  # Turn off axis numbers and ticks
+                        plt.show()
+                    output_images.append(image[0])
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        return output_images
+
+    # Override to properly handle the loading and unloading of the additional text encoder.
+    def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
+        # We could have accessed the unet config from `lora_state_dict()` too. We pass
+        # it here explicitly to be able to tell that it's coming from an SDXL
+        # pipeline.
+
+        # Remove any existing hooks.
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module
+        else:
+            raise ImportError("Offloading requires `accelerate v0.17.0` or higher.")
+
+        is_model_cpu_offload = False
+        is_sequential_cpu_offload = False
+        recursive = False
+        for _, component in self.components.items():
+            if isinstance(component, torch.nn.Module):
+                if hasattr(component, "_hf_hook"):
+                    is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
+                    is_sequential_cpu_offload = (
+                        isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                        or hasattr(component._hf_hook, "hooks")
+                        and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+                    )
+                    logger.info(
+                        "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
+                    )
+                    recursive = is_sequential_cpu_offload
+                    remove_hook_from_module(component, recurse=recursive)
+        state_dict, network_alphas = self.lora_state_dict(
+            pretrained_model_name_or_path_or_dict,
+            unet_config=self.unet.config,
+            **kwargs,
+        )
+        self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet)
+
+        text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k}
+        if len(text_encoder_state_dict) > 0:
+            self.load_lora_into_text_encoder(
+                text_encoder_state_dict,
+                network_alphas=network_alphas,
+                text_encoder=self.text_encoder,
+                prefix="text_encoder",
+                lora_scale=self.lora_scale,
+            )
+
+        text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k}
+        if len(text_encoder_2_state_dict) > 0:
+            self.load_lora_into_text_encoder(
+                text_encoder_2_state_dict,
+                network_alphas=network_alphas,
+                text_encoder=self.text_encoder_2,
+                prefix="text_encoder_2",
+                lora_scale=self.lora_scale,
+            )
+
+        # Offload back.
+        if is_model_cpu_offload:
+            self.enable_model_cpu_offload()
+        elif is_sequential_cpu_offload:
+            self.enable_sequential_cpu_offload()
+
+    @classmethod
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+    ):
+        state_dict = {}
+
+        def pack_weights(layers, prefix):
+            layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers
+            layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()}
+            return layers_state_dict
+
+        if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers):
+            raise ValueError(
+                "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`."
+            )
+
+        if unet_lora_layers:
+            state_dict.update(pack_weights(unet_lora_layers, "unet"))
+
+        if text_encoder_lora_layers and text_encoder_2_lora_layers:
+            state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder"))
+            state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2"))
+
+        cls.write_lora_layers(
+            state_dict=state_dict,
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+
+    def _remove_text_encoder_monkey_patch(self):
+        self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder)
+        self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2)
diff --git a/diffusers/examples/community/pipeline_fabric.py b/diffusers/examples/community/pipeline_fabric.py
new file mode 100644
index 0000000000000000000000000000000000000000..02fdcd04c1038f8f5f24ea2dfc8fe97f9eb8b8d8
--- /dev/null
+++ b/diffusers/examples/community/pipeline_fabric.py
@@ -0,0 +1,751 @@
+# Copyright 2024 FABRIC authors and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional, Union
+
+import torch
+from packaging import version
+from PIL import Image
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, UNet2DConditionModel
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models.attention import BasicTransformerBlock
+from diffusers.models.attention_processor import LoRAAttnProcessor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.schedulers import EulerAncestralDiscreteScheduler, KarrasDiffusionSchedulers
+from diffusers.utils import (
+    deprecate,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import DiffusionPipeline
+        >>> import torch
+
+        >>> model_id = "dreamlike-art/dreamlike-photoreal-2.0"
+        >>> pipe = DiffusionPipeline(model_id, torch_dtype=torch.float16, custom_pipeline="pipeline_fabric")
+        >>> pipe = pipe.to("cuda")
+        >>> prompt = "a giant standing in a fantasy landscape best quality"
+        >>> liked = []  # list of images for positive feedback
+        >>> disliked = []  # list of images for negative feedback
+        >>> image = pipe(prompt, num_images=4, liked=liked, disliked=disliked).images[0]
+        ```
+"""
+
+
+class FabricCrossAttnProcessor:
+    def __init__(self):
+        self.attntion_probs = None
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        weights=None,
+        lora_scale=1.0,
+    ):
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if isinstance(attn.processor, LoRAAttnProcessor):
+            query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora(hidden_states)
+        else:
+            query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        if isinstance(attn.processor, LoRAAttnProcessor):
+            key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora(encoder_hidden_states)
+            value = attn.to_v(encoder_hidden_states) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states)
+        else:
+            key = attn.to_k(encoder_hidden_states)
+            value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+
+        if weights is not None:
+            if weights.shape[0] != 1:
+                weights = weights.repeat_interleave(attn.heads, dim=0)
+            attention_probs = attention_probs * weights[:, None]
+            attention_probs = attention_probs / attention_probs.sum(dim=-1, keepdim=True)
+
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        if isinstance(attn.processor, LoRAAttnProcessor):
+            hidden_states = attn.to_out[0](hidden_states) + lora_scale * attn.processor.to_out_lora(hidden_states)
+        else:
+            hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+class FabricPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion and conditioning the results using feedback images.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`EulerAncestralDiscreteScheduler`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            unet=unet,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def get_unet_hidden_states(self, z_all, t, prompt_embd):
+        cached_hidden_states = []
+        for module in self.unet.modules():
+            if isinstance(module, BasicTransformerBlock):
+
+                def new_forward(self, hidden_states, *args, **kwargs):
+                    cached_hidden_states.append(hidden_states.clone().detach().cpu())
+                    return self.old_forward(hidden_states, *args, **kwargs)
+
+                module.attn1.old_forward = module.attn1.forward
+                module.attn1.forward = new_forward.__get__(module.attn1)
+
+        # run forward pass to cache hidden states, output can be discarded
+        _ = self.unet(z_all, t, encoder_hidden_states=prompt_embd)
+
+        # restore original forward pass
+        for module in self.unet.modules():
+            if isinstance(module, BasicTransformerBlock):
+                module.attn1.forward = module.attn1.old_forward
+                del module.attn1.old_forward
+
+        return cached_hidden_states
+
+    def unet_forward_with_cached_hidden_states(
+        self,
+        z_all,
+        t,
+        prompt_embd,
+        cached_pos_hiddens: Optional[List[torch.Tensor]] = None,
+        cached_neg_hiddens: Optional[List[torch.Tensor]] = None,
+        pos_weights=(0.8, 0.8),
+        neg_weights=(0.5, 0.5),
+    ):
+        if cached_pos_hiddens is None and cached_neg_hiddens is None:
+            return self.unet(z_all, t, encoder_hidden_states=prompt_embd)
+
+        local_pos_weights = torch.linspace(*pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist()
+        local_neg_weights = torch.linspace(*neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist()
+        for block, pos_weight, neg_weight in zip(
+            self.unet.down_blocks + [self.unet.mid_block] + self.unet.up_blocks,
+            local_pos_weights + [pos_weights[1]] + local_pos_weights[::-1],
+            local_neg_weights + [neg_weights[1]] + local_neg_weights[::-1],
+        ):
+            for module in block.modules():
+                if isinstance(module, BasicTransformerBlock):
+
+                    def new_forward(
+                        self,
+                        hidden_states,
+                        pos_weight=pos_weight,
+                        neg_weight=neg_weight,
+                        **kwargs,
+                    ):
+                        cond_hiddens, uncond_hiddens = hidden_states.chunk(2, dim=0)
+                        batch_size, d_model = cond_hiddens.shape[:2]
+                        device, dtype = hidden_states.device, hidden_states.dtype
+
+                        weights = torch.ones(batch_size, d_model, device=device, dtype=dtype)
+                        out_pos = self.old_forward(hidden_states)
+                        out_neg = self.old_forward(hidden_states)
+
+                        if cached_pos_hiddens is not None:
+                            cached_pos_hs = cached_pos_hiddens.pop(0).to(hidden_states.device)
+                            cond_pos_hs = torch.cat([cond_hiddens, cached_pos_hs], dim=1)
+                            pos_weights = weights.clone().repeat(1, 1 + cached_pos_hs.shape[1] // d_model)
+                            pos_weights[:, d_model:] = pos_weight
+                            attn_with_weights = FabricCrossAttnProcessor()
+                            out_pos = attn_with_weights(
+                                self,
+                                cond_hiddens,
+                                encoder_hidden_states=cond_pos_hs,
+                                weights=pos_weights,
+                            )
+                        else:
+                            out_pos = self.old_forward(cond_hiddens)
+
+                        if cached_neg_hiddens is not None:
+                            cached_neg_hs = cached_neg_hiddens.pop(0).to(hidden_states.device)
+                            uncond_neg_hs = torch.cat([uncond_hiddens, cached_neg_hs], dim=1)
+                            neg_weights = weights.clone().repeat(1, 1 + cached_neg_hs.shape[1] // d_model)
+                            neg_weights[:, d_model:] = neg_weight
+                            attn_with_weights = FabricCrossAttnProcessor()
+                            out_neg = attn_with_weights(
+                                self,
+                                uncond_hiddens,
+                                encoder_hidden_states=uncond_neg_hs,
+                                weights=neg_weights,
+                            )
+                        else:
+                            out_neg = self.old_forward(uncond_hiddens)
+
+                        out = torch.cat([out_pos, out_neg], dim=0)
+                        return out
+
+                    module.attn1.old_forward = module.attn1.forward
+                    module.attn1.forward = new_forward.__get__(module.attn1)
+
+        out = self.unet(z_all, t, encoder_hidden_states=prompt_embd)
+
+        # restore original forward pass
+        for module in self.unet.modules():
+            if isinstance(module, BasicTransformerBlock):
+                module.attn1.forward = module.attn1.old_forward
+                del module.attn1.old_forward
+
+        return out
+
+    def preprocess_feedback_images(self, images, vae, dim, device, dtype, generator) -> torch.tensor:
+        images_t = [self.image_to_tensor(img, dim, dtype) for img in images]
+        images_t = torch.stack(images_t).to(device)
+        latents = vae.config.scaling_factor * vae.encode(images_t).latent_dist.sample(generator)
+
+        return torch.cat([latents], dim=0)
+
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt=None,
+        liked=None,
+        disliked=None,
+        height=None,
+        width=None,
+    ):
+        if prompt is None:
+            raise ValueError("Provide `prompt`. Cannot leave both `prompt` undefined.")
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and (
+            not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
+        ):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+        if liked is not None and not isinstance(liked, list):
+            raise ValueError(f"`liked` has to be of type `list` but is {type(liked)}")
+
+        if disliked is not None and not isinstance(disliked, list):
+            raise ValueError(f"`disliked` has to be of type `list` but is {type(disliked)}")
+
+        if height is not None and not isinstance(height, int):
+            raise ValueError(f"`height` has to be of type `int` but is {type(height)}")
+
+        if width is not None and not isinstance(width, int):
+            raise ValueError(f"`width` has to be of type `int` but is {type(width)}")
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = "",
+        negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality",
+        liked: Optional[Union[List[str], List[Image.Image]]] = [],
+        disliked: Optional[Union[List[str], List[Image.Image]]] = [],
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        height: int = 512,
+        width: int = 512,
+        return_dict: bool = True,
+        num_images: int = 4,
+        guidance_scale: float = 7.0,
+        num_inference_steps: int = 20,
+        output_type: Optional[str] = "pil",
+        feedback_start_ratio: float = 0.33,
+        feedback_end_ratio: float = 0.66,
+        min_weight: float = 0.05,
+        max_weight: float = 0.8,
+        neg_scale: float = 0.5,
+        pos_bottleneck_scale: float = 1.0,
+        neg_bottleneck_scale: float = 1.0,
+        latents: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The
+        feedback can be given as a list of liked and disliked images.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            liked (`List[Image.Image]` or `List[str]`, *optional*):
+                Encourages images with liked features.
+            disliked (`List[Image.Image]` or `List[str]`, *optional*):
+                Discourages images with disliked features.
+            generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to
+                make generation deterministic.
+            height (`int`, *optional*, defaults to 512):
+                Height of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                Width of the generated image.
+            num_images (`int`, *optional*, defaults to 4):
+                The number of images to generate per prompt.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            num_inference_steps (`int`, *optional*, defaults to 20):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            feedback_start_ratio (`float`, *optional*, defaults to `.33`):
+                Start point for providing feedback (between 0 and 1).
+            feedback_end_ratio (`float`, *optional*, defaults to `.66`):
+                End point for providing feedback (between 0 and 1).
+            min_weight (`float`, *optional*, defaults to `.05`):
+                Minimum weight for feedback.
+            max_weight (`float`, *optional*, defults tp `1.0`):
+                Maximum weight for feedback.
+            neg_scale (`float`, *optional*, defaults to `.5`):
+                Scale factor for negative feedback.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+
+        """
+
+        self.check_inputs(prompt, negative_prompt, liked, disliked)
+
+        device = self._execution_device
+        dtype = self.unet.dtype
+
+        if isinstance(prompt, str) and prompt is not None:
+            batch_size = 1
+        elif isinstance(prompt, list) and prompt is not None:
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if isinstance(negative_prompt, str):
+            negative_prompt = negative_prompt
+        elif isinstance(negative_prompt, list):
+            negative_prompt = negative_prompt
+        else:
+            assert len(negative_prompt) == batch_size
+
+        shape = (
+            batch_size * num_images,
+            self.unet.config.in_channels,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        latent_noise = randn_tensor(
+            shape,
+            device=device,
+            dtype=dtype,
+            generator=generator,
+        )
+
+        positive_latents = (
+            self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator)
+            if liked and len(liked) > 0
+            else torch.tensor(
+                [],
+                device=device,
+                dtype=dtype,
+            )
+        )
+        negative_latents = (
+            self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator)
+            if disliked and len(disliked) > 0
+            else torch.tensor(
+                [],
+                device=device,
+                dtype=dtype,
+            )
+        )
+
+        do_classifier_free_guidance = guidance_scale > 0.1
+
+        (prompt_neg_embs, prompt_pos_embs) = self._encode_prompt(
+            prompt,
+            device,
+            num_images,
+            do_classifier_free_guidance,
+            negative_prompt,
+        ).split([num_images * batch_size, num_images * batch_size])
+
+        batched_prompt_embd = torch.cat([prompt_pos_embs, prompt_neg_embs], dim=0)
+
+        null_tokens = self.tokenizer(
+            [""],
+            return_tensors="pt",
+            max_length=self.tokenizer.model_max_length,
+            padding="max_length",
+            truncation=True,
+        )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = null_tokens.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        null_prompt_emb = self.text_encoder(
+            input_ids=null_tokens.input_ids.to(device),
+            attention_mask=attention_mask,
+        ).last_hidden_state
+
+        null_prompt_emb = null_prompt_emb.to(device=device, dtype=dtype)
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        latent_noise = latent_noise * self.scheduler.init_noise_sigma
+
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+        ref_start_idx = round(len(timesteps) * feedback_start_ratio)
+        ref_end_idx = round(len(timesteps) * feedback_end_ratio)
+
+        with self.progress_bar(total=num_inference_steps) as pbar:
+            for i, t in enumerate(timesteps):
+                sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0
+                if hasattr(self.scheduler, "sigmas"):
+                    sigma = self.scheduler.sigmas[i]
+
+                alpha_hat = 1 / (sigma**2 + 1)
+
+                z_single = self.scheduler.scale_model_input(latent_noise, t)
+                z_all = torch.cat([z_single] * 2, dim=0)
+                z_ref = torch.cat([positive_latents, negative_latents], dim=0)
+
+                if i >= ref_start_idx and i <= ref_end_idx:
+                    weight_factor = max_weight
+                else:
+                    weight_factor = min_weight
+
+                pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale)
+                neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale)
+
+                if z_ref.size(0) > 0 and weight_factor > 0:
+                    noise = torch.randn_like(z_ref)
+                    if isinstance(self.scheduler, EulerAncestralDiscreteScheduler):
+                        z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype)
+                    else:
+                        z_ref_noised = self.scheduler.add_noise(z_ref, noise, t)
+
+                    ref_prompt_embd = torch.cat(
+                        [null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0
+                    )
+                    cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd)
+
+                    n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0]
+                    cached_pos_hs, cached_neg_hs = [], []
+                    for hs in cached_hidden_states:
+                        cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0)
+                        cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1)
+                        cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1)
+                        cached_pos_hs.append(cached_pos)
+                        cached_neg_hs.append(cached_neg)
+
+                    if n_pos == 0:
+                        cached_pos_hs = None
+                    if n_neg == 0:
+                        cached_neg_hs = None
+                else:
+                    cached_pos_hs, cached_neg_hs = None, None
+                unet_out = self.unet_forward_with_cached_hidden_states(
+                    z_all,
+                    t,
+                    prompt_embd=batched_prompt_embd,
+                    cached_pos_hiddens=cached_pos_hs,
+                    cached_neg_hiddens=cached_neg_hs,
+                    pos_weights=pos_ws,
+                    neg_weights=neg_ws,
+                )[0]
+
+                noise_cond, noise_uncond = unet_out.chunk(2)
+                guidance = noise_cond - noise_uncond
+                noise_pred = noise_uncond + guidance_scale * guidance
+                latent_noise = self.scheduler.step(noise_pred, t, latent_noise)[0]
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    pbar.update()
+
+        y = self.vae.decode(latent_noise / self.vae.config.scaling_factor, return_dict=False)[0]
+        imgs = self.image_processor.postprocess(
+            y,
+            output_type=output_type,
+        )
+
+        if not return_dict:
+            return imgs
+
+        return StableDiffusionPipelineOutput(imgs, False)
+
+    def image_to_tensor(self, image: Union[str, Image.Image], dim: tuple, dtype):
+        """
+        Convert latent PIL image to a torch tensor for further processing.
+        """
+        if isinstance(image, str):
+            image = Image.open(image)
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+        image = self.image_processor.preprocess(image, height=dim[0], width=dim[1])[0]
+        return image.type(dtype)
diff --git a/diffusers/examples/community/pipeline_flux_with_cfg.py b/diffusers/examples/community/pipeline_flux_with_cfg.py
new file mode 100644
index 0000000000000000000000000000000000000000..06da6da899cd57925ebcf04baf356f0c20ecb95b
--- /dev/null
+++ b/diffusers/examples/community/pipeline_flux_with_cfg.py
@@ -0,0 +1,850 @@
+# Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin
+from diffusers.models.autoencoders import AutoencoderKL
+from diffusers.models.transformers import FluxTransformer2DModel
+from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import FluxPipeline
+
+        >>> pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+        >>> prompt = "A cat holding a sign that says hello world"
+        >>> # Depending on the variant being used, the pipeline call will slightly vary.
+        >>> # Refer to the pipeline documentation for more details.
+        >>> image = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0]
+        >>> image.save("flux.png")
+        ```
+"""
+
+
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class FluxCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    The Flux pipeline for text-to-image generation.
+
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
+
+    Args:
+        transformer ([`FluxTransformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`T5TokenizerFast`):
+            Second Tokenizer of class
+            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: T5EncoderModel,
+        tokenizer_2: T5TokenizerFast,
+        transformer: FluxTransformer2DModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = 64
+
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
+
+        dtype = self.text_encoder_2.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
+
+        # Use pooled output of CLIPTextModel
+        prompt_embeds = prompt_embeds.pooler_output
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        negative_prompt: Union[str, List[str]] = None,
+        negative_prompt_2: Union[str, List[str]] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+        lora_scale: Optional[float] = None,
+        do_true_cfg: bool = False,
+    ):
+        device = device or self._execution_device
+
+        # Set LoRA scale if applicable
+        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        if do_true_cfg and negative_prompt is not None:
+            negative_prompt = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_batch_size = len(negative_prompt)
+
+            if negative_batch_size != batch_size:
+                raise ValueError(
+                    f"Negative prompt batch size ({negative_batch_size}) does not match prompt batch size ({batch_size})"
+                )
+
+            # Concatenate prompts
+            prompts = prompt + negative_prompt
+            prompts_2 = (
+                prompt_2 + negative_prompt_2 if prompt_2 is not None and negative_prompt_2 is not None else None
+            )
+        else:
+            prompts = prompt
+            prompts_2 = prompt_2
+
+        if prompt_embeds is None:
+            if prompts_2 is None:
+                prompts_2 = prompts
+
+            # Get pooled prompt embeddings from CLIPTextModel
+            pooled_prompt_embeds = self._get_clip_prompt_embeds(
+                prompt=prompts,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompts_2,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            if do_true_cfg and negative_prompt is not None:
+                # Split embeddings back into positive and negative parts
+                total_batch_size = batch_size * num_images_per_prompt
+                positive_indices = slice(0, total_batch_size)
+                negative_indices = slice(total_batch_size, 2 * total_batch_size)
+
+                positive_pooled_prompt_embeds = pooled_prompt_embeds[positive_indices]
+                negative_pooled_prompt_embeds = pooled_prompt_embeds[negative_indices]
+
+                positive_prompt_embeds = prompt_embeds[positive_indices]
+                negative_prompt_embeds = prompt_embeds[negative_indices]
+
+                pooled_prompt_embeds = positive_pooled_prompt_embeds
+                prompt_embeds = positive_prompt_embeds
+
+        # Unscale LoRA layers
+        if self.text_encoder is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+
+        if do_true_cfg and negative_prompt is not None:
+            return (
+                prompt_embeds,
+                pooled_prompt_embeds,
+                text_ids,
+                negative_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            )
+        else:
+            return prompt_embeds, pooled_prompt_embeds, text_ids, None, None
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    @staticmethod
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
+
+    @staticmethod
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+
+        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+
+        return latents
+
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+
+        shape = (batch_size, num_channels_latents, height, width)
+
+        if latents is not None:
+            latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+            return latents.to(device=device, dtype=dtype), latent_image_ids
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+
+        return latents, latent_image_ids
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Union[str, List[str]] = None,  #
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        true_cfg: float = 1.0,  #
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 28,
+        timesteps: List[int] = None,
+        guidance_scale: float = 3.5,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
+            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
+            images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        do_true_cfg = true_cfg > 1 and negative_prompt is not None
+        (
+            prompt_embeds,
+            pooled_prompt_embeds,
+            text_ids,
+            negative_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+            do_true_cfg=do_true_cfg,
+        )
+
+        if do_true_cfg:
+            # Concatenate embeddings
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+        # 4. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels // 4
+        latents, latent_image_ids = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 5. Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        image_seq_len = latents.shape[1]
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                latent_model_input = torch.cat([latents] * 2) if do_true_cfg else latents
+
+                # handle guidance
+                if self.transformer.config.guidance_embeds:
+                    guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+                    guidance = guidance.expand(latent_model_input.shape[0])
+                else:
+                    guidance = None
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                if do_true_cfg:
+                    neg_noise_pred, noise_pred = noise_pred.chunk(2)
+                    noise_pred = neg_noise_pred + true_cfg * (noise_pred - neg_noise_pred)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return FluxPipelineOutput(images=image)
diff --git a/diffusers/examples/community/pipeline_hunyuandit_differential_img2img.py b/diffusers/examples/community/pipeline_hunyuandit_differential_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ece670e5bde8e822f0411a8e41bc168eb94153b
--- /dev/null
+++ b/diffusers/examples/community/pipeline_hunyuandit_differential_img2img.py
@@ -0,0 +1,1144 @@
+# Copyright 2024 HunyuanDiT Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import (
+    BertModel,
+    BertTokenizer,
+    CLIPImageProcessor,
+    MT5Tokenizer,
+    T5EncoderModel,
+)
+
+from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.models import AutoencoderKL, HunyuanDiT2DModel
+from diffusers.models.embeddings import get_2d_rotary_pos_embed
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import (
+    StableDiffusionSafetyChecker,
+)
+from diffusers.schedulers import DDPMScheduler
+from diffusers.utils import (
+    deprecate,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import FlowMatchEulerDiscreteScheduler
+        >>> from diffusers.utils import load_image
+        >>> from PIL import Image
+        >>> from torchvision import transforms
+        >>> from pipeline_hunyuandit_differential_img2img import HunyuanDiTDifferentialImg2ImgPipeline
+        >>> pipe = HunyuanDiTDifferentialImg2ImgPipeline.from_pretrained(
+        >>>     "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
+        >>> ).to("cuda")
+        >>> source_image = load_image(
+        >>>     "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/20240329211129_4024911930.png"
+        >>> )
+        >>> map = load_image(
+        >>>     "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/gradient_mask_2.png"
+        >>> )
+        >>> prompt = "a green pear"
+        >>> negative_prompt = "blurry"
+        >>> image = pipe(
+        >>>     prompt=prompt,
+        >>>     negative_prompt=negative_prompt,
+        >>>     image=source_image,
+        >>>     num_inference_steps=28,
+        >>>     guidance_scale=4.5,
+        >>>     strength=1.0,
+        >>>     map=map,
+        >>> ).images[0]
+
+        ```
+"""
+
+STANDARD_RATIO = np.array(
+    [
+        1.0,  # 1:1
+        4.0 / 3.0,  # 4:3
+        3.0 / 4.0,  # 3:4
+        16.0 / 9.0,  # 16:9
+        9.0 / 16.0,  # 9:16
+    ]
+)
+STANDARD_SHAPE = [
+    [(1024, 1024), (1280, 1280)],  # 1:1
+    [(1024, 768), (1152, 864), (1280, 960)],  # 4:3
+    [(768, 1024), (864, 1152), (960, 1280)],  # 3:4
+    [(1280, 768)],  # 16:9
+    [(768, 1280)],  # 9:16
+]
+STANDARD_AREA = [np.array([w * h for w, h in shapes]) for shapes in STANDARD_SHAPE]
+SUPPORTED_SHAPE = [
+    (1024, 1024),
+    (1280, 1280),  # 1:1
+    (1024, 768),
+    (1152, 864),
+    (1280, 960),  # 4:3
+    (768, 1024),
+    (864, 1152),
+    (960, 1280),  # 3:4
+    (1280, 768),  # 16:9
+    (768, 1280),  # 9:16
+]
+
+
+def map_to_standard_shapes(target_width, target_height):
+    target_ratio = target_width / target_height
+    closest_ratio_idx = np.argmin(np.abs(STANDARD_RATIO - target_ratio))
+    closest_area_idx = np.argmin(np.abs(STANDARD_AREA[closest_ratio_idx] - target_width * target_height))
+    width, height = STANDARD_SHAPE[closest_ratio_idx][closest_area_idx]
+    return width, height
+
+
+def get_resize_crop_region_for_grid(src, tgt_size):
+    th = tw = tgt_size
+    h, w = src
+
+    r = h / w
+
+    # resize
+    if r > 1:
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor,
+    generator: Optional[torch.Generator] = None,
+    sample_mode: str = "sample",
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class HunyuanDiTDifferentialImg2ImgPipeline(DiffusionPipeline):
+    r"""
+    Differential Pipeline for English/Chinese-to-image generation using HunyuanDiT.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    HunyuanDiT uses two text encoders: [mT5](https://huggingface.co/google/mt5-base) and [bilingual CLIP](fine-tuned by
+    ourselves)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. We use
+            `sdxl-vae-fp16-fix`.
+        text_encoder (Optional[`~transformers.BertModel`, `~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+            HunyuanDiT uses a fine-tuned [bilingual CLIP].
+        tokenizer (Optional[`~transformers.BertTokenizer`, `~transformers.CLIPTokenizer`]):
+            A `BertTokenizer` or `CLIPTokenizer` to tokenize text.
+        transformer ([`HunyuanDiT2DModel`]):
+            The HunyuanDiT model designed by Tencent Hunyuan.
+        text_encoder_2 (`T5EncoderModel`):
+            The mT5 embedder. Specifically, it is 't5-v1_1-xxl'.
+        tokenizer_2 (`MT5Tokenizer`):
+            The tokenizer for the mT5 embedder.
+        scheduler ([`DDPMScheduler`]):
+            A scheduler to be used in combination with HunyuanDiT to denoise the encoded image latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = [
+        "safety_checker",
+        "feature_extractor",
+        "text_encoder_2",
+        "tokenizer_2",
+        "text_encoder",
+        "tokenizer",
+    ]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "prompt_embeds_2",
+        "negative_prompt_embeds_2",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: BertModel,
+        tokenizer: BertTokenizer,
+        transformer: HunyuanDiT2DModel,
+        scheduler: DDPMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+        text_encoder_2=T5EncoderModel,
+        tokenizer_2=MT5Tokenizer,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            text_encoder_2=text_encoder_2,
+        )
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor,
+            do_normalize=False,
+            do_convert_grayscale=True,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )
+
+    # copied from diffusers.pipelines.huanyuandit.pipeline_huanyuandit.HunyuanDiTPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        device: torch.device = None,
+        dtype: torch.dtype = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        max_sequence_length: Optional[int] = None,
+        text_encoder_index: int = 0,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            dtype (`torch.dtype`):
+                torch dtype
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
+            max_sequence_length (`int`, *optional*): maximum sequence length to use for the prompt.
+            text_encoder_index (`int`, *optional*):
+                Index of the text encoder to use. `0` for clip and `1` for T5.
+        """
+        if dtype is None:
+            if self.text_encoder_2 is not None:
+                dtype = self.text_encoder_2.dtype
+            elif self.transformer is not None:
+                dtype = self.transformer.dtype
+            else:
+                dtype = None
+
+        if device is None:
+            device = self._execution_device
+
+        tokenizers = [self.tokenizer, self.tokenizer_2]
+        text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = tokenizers[text_encoder_index]
+        text_encoder = text_encoders[text_encoder_index]
+
+        if max_sequence_length is None:
+            if text_encoder_index == 0:
+                max_length = 77
+            if text_encoder_index == 1:
+                max_length = 256
+        else:
+            max_length = max_sequence_length
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_attention_mask = text_inputs.attention_mask.to(device)
+            prompt_embeds = text_encoder(
+                text_input_ids.to(device),
+                attention_mask=prompt_attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+            prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            negative_prompt_attention_mask = uncond_input.attention_mask.to(device)
+            negative_prompt_embeds = text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=negative_prompt_attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+        prompt_embeds_2=None,
+        negative_prompt_embeds_2=None,
+        prompt_attention_mask_2=None,
+        negative_prompt_attention_mask_2=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is None and prompt_embeds_2 is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds_2`. Cannot leave both `prompt` and `prompt_embeds_2` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if prompt_embeds_2 is not None and prompt_attention_mask_2 is None:
+            raise ValueError("Must provide `prompt_attention_mask_2` when specifying `prompt_embeds_2`.")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if negative_prompt_embeds_2 is not None and negative_prompt_attention_mask_2 is None:
+            raise ValueError(
+                "Must provide `negative_prompt_attention_mask_2` when specifying `negative_prompt_embeds_2`."
+            )
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if prompt_embeds_2 is not None and negative_prompt_embeds_2 is not None:
+            if prompt_embeds_2.shape != negative_prompt_embeds_2.shape:
+                raise ValueError(
+                    "`prompt_embeds_2` and `negative_prompt_embeds_2` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds_2` {prompt_embeds_2.shape} != `negative_prompt_embeds_2`"
+                    f" {negative_prompt_embeds_2.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        image,
+        timestep,
+        dtype,
+        device,
+        generator=None,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+
+        image = image.to(device=device, dtype=dtype)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        elif isinstance(generator, list):
+            init_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size)
+            ]
+            init_latents = torch.cat(init_latents, dim=0)
+
+        else:
+            init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        init_latents = init_latents * self.vae.config.scaling_factor
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate(
+                "len(prompt) != len(image)",
+                "1.0.0",
+                deprecation_message,
+                standard_warn=False,
+            )
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.8,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Optional[int] = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: Optional[float] = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_2: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_2: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        prompt_attention_mask_2: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask_2: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[
+            Union[
+                Callable[[int, int, Dict], None],
+                PipelineCallback,
+                MultiPipelineCallbacks,
+            ]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = (1024, 1024),
+        target_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        use_resolution_binning: bool = True,
+        map: PipelineImageInput = None,
+        denoising_start: Optional[float] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation with HunyuanDiT.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            height (`int`):
+                The height in pixels of the generated image.
+            width (`int`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            negative_prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
+            prompt_attention_mask_2 (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds_2` is passed directly.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
+            negative_prompt_attention_mask_2 (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds_2` is passed directly.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback_on_step_end (`Callable[[int, int, Dict], None]`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A callback function or a list of callback functions to be called at the end of each denoising step.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                A list of tensor inputs that should be passed to the callback function. If not defined, all tensor
+                inputs will be passed.
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Rescale the noise_cfg according to `guidance_rescale`. Based on findings of [Common Diffusion Noise
+                Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+            original_size (`Tuple[int, int]`, *optional*, defaults to `(1024, 1024)`):
+                The original size of the image. Used to calculate the time ids.
+            target_size (`Tuple[int, int]`, *optional*):
+                The target size of the image. Used to calculate the time ids.
+            crops_coords_top_left (`Tuple[int, int]`, *optional*, defaults to `(0, 0)`):
+                The top left coordinates of the crop. Used to calculate the time ids.
+            use_resolution_binning (`bool`, *optional*, defaults to `True`):
+                Whether to use resolution binning or not. If `True`, the input resolution will be mapped to the closest
+                standard resolution. Supported resolutions are 1024x1024, 1280x1280, 1024x768, 1152x864, 1280x960,
+                768x1024, 864x1152, 960x1280, 1280x768, and 768x1280. It is recommended to set this to `True`.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. default height and width
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        height = int((height // 16) * 16)
+        width = int((width // 16) * 16)
+
+        if use_resolution_binning and (height, width) not in SUPPORTED_SHAPE:
+            width, height = map_to_standard_shapes(width, height)
+            height = int(height)
+            width = int(width)
+            logger.warning(f"Reshaped to (height, width)=({height}, {width}), Supported shapes are {SUPPORTED_SHAPE}")
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+            prompt_embeds_2,
+            negative_prompt_embeds_2,
+            prompt_attention_mask_2,
+            negative_prompt_attention_mask_2,
+            callback_on_step_end_tensor_inputs,
+        )
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            max_sequence_length=77,
+            text_encoder_index=0,
+        )
+        (
+            prompt_embeds_2,
+            negative_prompt_embeds_2,
+            prompt_attention_mask_2,
+            negative_prompt_attention_mask_2,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds_2,
+            negative_prompt_embeds=negative_prompt_embeds_2,
+            prompt_attention_mask=prompt_attention_mask_2,
+            negative_prompt_attention_mask=negative_prompt_attention_mask_2,
+            max_sequence_length=256,
+            text_encoder_index=1,
+        )
+
+        # 4. Preprocess image
+        init_image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        map = self.mask_processor.preprocess(
+            map,
+            height=height // self.vae_scale_factor,
+            width=width // self.vae_scale_factor,
+        ).to(device)
+
+        # 5. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # begin diff diff change
+        total_time_steps = num_inference_steps
+        # end diff diff change
+
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            init_image,
+            latent_timestep,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. create image_rotary_emb, style embedding & time ids
+        grid_height = height // 8 // self.transformer.config.patch_size
+        grid_width = width // 8 // self.transformer.config.patch_size
+        base_size = 512 // 8 // self.transformer.config.patch_size
+        grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size)
+        image_rotary_emb = get_2d_rotary_pos_embed(
+            self.transformer.inner_dim // self.transformer.num_heads,
+            grid_crops_coords,
+            (grid_height, grid_width),
+        )
+
+        style = torch.tensor([0], device=device)
+
+        target_size = target_size or (height, width)
+        add_time_ids = list(original_size + target_size + crops_coords_top_left)
+        add_time_ids = torch.tensor([add_time_ids], dtype=prompt_embeds.dtype)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask])
+            prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
+            prompt_attention_mask_2 = torch.cat([negative_prompt_attention_mask_2, prompt_attention_mask_2])
+            add_time_ids = torch.cat([add_time_ids] * 2, dim=0)
+            style = torch.cat([style] * 2, dim=0)
+
+        prompt_embeds = prompt_embeds.to(device=device)
+        prompt_attention_mask = prompt_attention_mask.to(device=device)
+        prompt_embeds_2 = prompt_embeds_2.to(device=device)
+        prompt_attention_mask_2 = prompt_attention_mask_2.to(device=device)
+        add_time_ids = add_time_ids.to(dtype=prompt_embeds.dtype, device=device).repeat(
+            batch_size * num_images_per_prompt, 1
+        )
+        style = style.to(device=device).repeat(batch_size * num_images_per_prompt)
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        # preparations for diff diff
+        original_with_noise = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            init_image,
+            timesteps,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+        thresholds = torch.arange(total_time_steps, dtype=map.dtype) / total_time_steps
+        thresholds = thresholds.unsqueeze(1).unsqueeze(1).to(device)
+        masks = map.squeeze() > (thresholds + (denoising_start or 0))
+        # end diff diff preparations
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                # diff diff
+                if i == 0 and denoising_start is None:
+                    latents = original_with_noise[:1]
+                else:
+                    mask = masks[i].unsqueeze(0).to(latents.dtype)
+                    mask = mask.unsqueeze(1)  # fit shape
+                    latents = original_with_noise[i] * mask + latents * (1 - mask)
+                # end diff diff
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # expand scalar t to 1-D tensor to match the 1st dim of latent_model_input
+                t_expand = torch.tensor([t] * latent_model_input.shape[0], device=device).to(
+                    dtype=latent_model_input.dtype
+                )
+
+                # predict the noise residual
+                noise_pred = self.transformer(
+                    latent_model_input,
+                    t_expand,
+                    encoder_hidden_states=prompt_embeds,
+                    text_embedding_mask=prompt_attention_mask,
+                    encoder_hidden_states_t5=prompt_embeds_2,
+                    text_embedding_mask_t5=prompt_attention_mask_2,
+                    image_meta_size=add_time_ids,
+                    style=style,
+                    image_rotary_emb=image_rotary_emb,
+                    return_dict=False,
+                )[0]
+
+                noise_pred, _ = noise_pred.chunk(2, dim=1)
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    prompt_embeds_2 = callback_outputs.pop("prompt_embeds_2", prompt_embeds_2)
+                    negative_prompt_embeds_2 = callback_outputs.pop(
+                        "negative_prompt_embeds_2", negative_prompt_embeds_2
+                    )
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/pipeline_null_text_inversion.py b/diffusers/examples/community/pipeline_null_text_inversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e27b4647bc9fbc7e7db03ff3c32f2a1441e9719
--- /dev/null
+++ b/diffusers/examples/community/pipeline_null_text_inversion.py
@@ -0,0 +1,260 @@
+import inspect
+import os
+
+import numpy as np
+import torch
+import torch.nn.functional as nnf
+from PIL import Image
+from torch.optim.adam import Adam
+from tqdm import tqdm
+
+from diffusers import StableDiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+
+
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps=None,
+    device=None,
+    timesteps=None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class NullTextPipeline(StableDiffusionPipeline):
+    def get_noise_pred(self, latents, t, context):
+        latents_input = torch.cat([latents] * 2)
+        guidance_scale = 7.5
+        noise_pred = self.unet(latents_input, t, encoder_hidden_states=context)["sample"]
+        noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+        latents = self.prev_step(noise_pred, t, latents)
+        return latents
+
+    def get_noise_pred_single(self, latents, t, context):
+        noise_pred = self.unet(latents, t, encoder_hidden_states=context)["sample"]
+        return noise_pred
+
+    @torch.no_grad()
+    def image2latent(self, image_path):
+        image = Image.open(image_path).convert("RGB")
+        image = np.array(image)
+        image = torch.from_numpy(image).float() / 127.5 - 1
+        image = image.permute(2, 0, 1).unsqueeze(0).to(self.device)
+        latents = self.vae.encode(image)["latent_dist"].mean
+        latents = latents * 0.18215
+        return latents
+
+    @torch.no_grad()
+    def latent2image(self, latents):
+        latents = 1 / 0.18215 * latents.detach()
+        image = self.vae.decode(latents)["sample"].detach()
+        image = self.processor.postprocess(image, output_type="pil")[0]
+        return image
+
+    def prev_step(self, model_output, timestep, sample):
+        prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
+        alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+        alpha_prod_t_prev = (
+            self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.scheduler.final_alpha_cumprod
+        )
+        beta_prod_t = 1 - alpha_prod_t
+        pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
+        pred_sample_direction = (1 - alpha_prod_t_prev) ** 0.5 * model_output
+        prev_sample = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction
+        return prev_sample
+
+    def next_step(self, model_output, timestep, sample):
+        timestep, next_timestep = (
+            min(timestep - self.scheduler.config.num_train_timesteps // self.num_inference_steps, 999),
+            timestep,
+        )
+        alpha_prod_t = self.scheduler.alphas_cumprod[timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
+        alpha_prod_t_next = self.scheduler.alphas_cumprod[next_timestep]
+        beta_prod_t = 1 - alpha_prod_t
+        next_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
+        next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output
+        next_sample = alpha_prod_t_next**0.5 * next_original_sample + next_sample_direction
+        return next_sample
+
+    def null_optimization(self, latents, context, num_inner_steps, epsilon):
+        uncond_embeddings, cond_embeddings = context.chunk(2)
+        uncond_embeddings_list = []
+        latent_cur = latents[-1]
+        bar = tqdm(total=num_inner_steps * self.num_inference_steps)
+        for i in range(self.num_inference_steps):
+            uncond_embeddings = uncond_embeddings.clone().detach()
+            uncond_embeddings.requires_grad = True
+            optimizer = Adam([uncond_embeddings], lr=1e-2 * (1.0 - i / 100.0))
+            latent_prev = latents[len(latents) - i - 2]
+            t = self.scheduler.timesteps[i]
+            with torch.no_grad():
+                noise_pred_cond = self.get_noise_pred_single(latent_cur, t, cond_embeddings)
+            for j in range(num_inner_steps):
+                noise_pred_uncond = self.get_noise_pred_single(latent_cur, t, uncond_embeddings)
+                noise_pred = noise_pred_uncond + 7.5 * (noise_pred_cond - noise_pred_uncond)
+                latents_prev_rec = self.prev_step(noise_pred, t, latent_cur)
+                loss = nnf.mse_loss(latents_prev_rec, latent_prev)
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+                loss_item = loss.item()
+                bar.update()
+                if loss_item < epsilon + i * 2e-5:
+                    break
+            for j in range(j + 1, num_inner_steps):
+                bar.update()
+            uncond_embeddings_list.append(uncond_embeddings[:1].detach())
+            with torch.no_grad():
+                context = torch.cat([uncond_embeddings, cond_embeddings])
+                latent_cur = self.get_noise_pred(latent_cur, t, context)
+        bar.close()
+        return uncond_embeddings_list
+
+    @torch.no_grad()
+    def ddim_inversion_loop(self, latent, context):
+        self.scheduler.set_timesteps(self.num_inference_steps)
+        _, cond_embeddings = context.chunk(2)
+        all_latent = [latent]
+        latent = latent.clone().detach()
+        with torch.no_grad():
+            for i in range(0, self.num_inference_steps):
+                t = self.scheduler.timesteps[len(self.scheduler.timesteps) - i - 1]
+                noise_pred = self.unet(latent, t, encoder_hidden_states=cond_embeddings)["sample"]
+                latent = self.next_step(noise_pred, t, latent)
+                all_latent.append(latent)
+        return all_latent
+
+    def get_context(self, prompt):
+        uncond_input = self.tokenizer(
+            [""], padding="max_length", max_length=self.tokenizer.model_max_length, return_tensors="pt"
+        )
+        uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+        text_input = self.tokenizer(
+            [prompt],
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+        context = torch.cat([uncond_embeddings, text_embeddings])
+        return context
+
+    def invert(
+        self, image_path: str, prompt: str, num_inner_steps=10, early_stop_epsilon=1e-6, num_inference_steps=50
+    ):
+        self.num_inference_steps = num_inference_steps
+        context = self.get_context(prompt)
+        latent = self.image2latent(image_path)
+        ddim_latents = self.ddim_inversion_loop(latent, context)
+        if os.path.exists(image_path + ".pt"):
+            uncond_embeddings = torch.load(image_path + ".pt")
+        else:
+            uncond_embeddings = self.null_optimization(ddim_latents, context, num_inner_steps, early_stop_epsilon)
+            uncond_embeddings = torch.stack(uncond_embeddings, 0)
+            torch.save(uncond_embeddings, image_path + ".pt")
+        return ddim_latents[-1], uncond_embeddings
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt,
+        uncond_embeddings,
+        inverted_latent,
+        num_inference_steps: int = 50,
+        timesteps=None,
+        guidance_scale=7.5,
+        negative_prompt=None,
+        num_images_per_prompt=1,
+        generator=None,
+        latents=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        output_type="pil",
+    ):
+        self._guidance_scale = guidance_scale
+        # 0. Default height and width to unet
+        height = self.unet.config.sample_size * self.vae_scale_factor
+        width = self.unet.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hook
+        callback_steps = None
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+        # 2. Define call parameter
+        device = self._execution_device
+        # 3. Encode input prompt
+        prompt_embeds, _ = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        latents = inverted_latent
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                noise_pred_uncond = self.unet(latents, t, encoder_hidden_states=uncond_embeddings[i])["sample"]
+                noise_pred = self.unet(latents, t, encoder_hidden_states=prompt_embeds)["sample"]
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                progress_bar.update()
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+        else:
+            image = latents
+        image = self.image_processor.postprocess(
+            image, output_type=output_type, do_denormalize=[True] * image.shape[0]
+        )
+        # Offload all models
+        self.maybe_free_model_hooks()
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=False)
diff --git a/diffusers/examples/community/pipeline_prompt2prompt.py b/diffusers/examples/community/pipeline_prompt2prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..508e84177928a86d537da98579b93407c698da21
--- /dev/null
+++ b/diffusers/examples/community/pipeline_prompt2prompt.py
@@ -0,0 +1,1422 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import abc
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from packaging import version
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
+from diffusers.configuration_utils import FrozenDict, deprecate
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models.attention import Attention
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import (
+    StableDiffusionSafetyChecker,
+)
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class Prompt2PromptPipeline(
+    DiffusionPipeline,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device),
+                    attention_mask=attention_mask,
+                    output_hidden_states=True,
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+                The keyword arguments to configure the edit are:
+                - edit_type (`str`). The edit type to apply. Can be either of `replace`, `refine`, `reweight`.
+                - n_cross_replace (`int`): Number of diffusion steps in which cross attention should be replaced
+                - n_self_replace (`int`): Number of diffusion steps in which self attention should be replaced
+                - local_blend_words(`List[str]`, *optional*, default to `None`): Determines which area should be
+                  changed. If None, then the whole image can be changed.
+                - equalizer_words(`List[str]`, *optional*, default to `None`): Required for edit type `reweight`.
+                  Determines which words should be enhanced.
+                - equalizer_strengths (`List[float]`, *optional*, default to `None`) Required for edit type `reweight`.
+                  Determines which how much the words in `equalizer_words` should be enhanced.
+
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        self.controller = create_controller(
+            prompt,
+            cross_attention_kwargs,
+            num_inference_steps,
+            tokenizer=self.tokenizer,
+            device=self.device,
+        )
+        self.register_attention_control(self.controller)  # add attention controller
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # step callback
+                latents = self.controller.step_callback(latents)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 8. Post-processing
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        # 9. Run safety checker
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def register_attention_control(self, controller):
+        attn_procs = {}
+        cross_att_count = 0
+        for name in self.unet.attn_processors.keys():
+            (None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim)
+            if name.startswith("mid_block"):
+                self.unet.config.block_out_channels[-1]
+                place_in_unet = "mid"
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                list(reversed(self.unet.config.block_out_channels))[block_id]
+                place_in_unet = "up"
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                self.unet.config.block_out_channels[block_id]
+                place_in_unet = "down"
+            else:
+                continue
+            cross_att_count += 1
+            attn_procs[name] = P2PCrossAttnProcessor(controller=controller, place_in_unet=place_in_unet)
+
+        self.unet.set_attn_processor(attn_procs)
+        controller.num_att_layers = cross_att_count
+
+
+class P2PCrossAttnProcessor:
+    def __init__(self, controller, place_in_unet):
+        super().__init__()
+        self.controller = controller
+        self.place_in_unet = place_in_unet
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+    ):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        query = attn.to_q(hidden_states)
+
+        is_cross = encoder_hidden_states is not None
+        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+
+        # one line change
+        self.controller(attention_probs, is_cross, self.place_in_unet)
+
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+def create_controller(
+    prompts: List[str],
+    cross_attention_kwargs: Dict,
+    num_inference_steps: int,
+    tokenizer,
+    device,
+) -> AttentionControl:
+    edit_type = cross_attention_kwargs.get("edit_type", None)
+    local_blend_words = cross_attention_kwargs.get("local_blend_words", None)
+    equalizer_words = cross_attention_kwargs.get("equalizer_words", None)
+    equalizer_strengths = cross_attention_kwargs.get("equalizer_strengths", None)
+    n_cross_replace = cross_attention_kwargs.get("n_cross_replace", 0.4)
+    n_self_replace = cross_attention_kwargs.get("n_self_replace", 0.4)
+
+    # only replace
+    if edit_type == "replace" and local_blend_words is None:
+        return AttentionReplace(
+            prompts,
+            num_inference_steps,
+            n_cross_replace,
+            n_self_replace,
+            tokenizer=tokenizer,
+            device=device,
+        )
+
+    # replace + localblend
+    if edit_type == "replace" and local_blend_words is not None:
+        lb = LocalBlend(prompts, local_blend_words, tokenizer=tokenizer, device=device)
+        return AttentionReplace(
+            prompts,
+            num_inference_steps,
+            n_cross_replace,
+            n_self_replace,
+            lb,
+            tokenizer=tokenizer,
+            device=device,
+        )
+
+    # only refine
+    if edit_type == "refine" and local_blend_words is None:
+        return AttentionRefine(
+            prompts,
+            num_inference_steps,
+            n_cross_replace,
+            n_self_replace,
+            tokenizer=tokenizer,
+            device=device,
+        )
+
+    # refine + localblend
+    if edit_type == "refine" and local_blend_words is not None:
+        lb = LocalBlend(prompts, local_blend_words, tokenizer=tokenizer, device=device)
+        return AttentionRefine(
+            prompts,
+            num_inference_steps,
+            n_cross_replace,
+            n_self_replace,
+            lb,
+            tokenizer=tokenizer,
+            device=device,
+        )
+
+    # reweight
+    if edit_type == "reweight":
+        assert (
+            equalizer_words is not None and equalizer_strengths is not None
+        ), "To use reweight edit, please specify equalizer_words and equalizer_strengths."
+        assert len(equalizer_words) == len(
+            equalizer_strengths
+        ), "equalizer_words and equalizer_strengths must be of same length."
+        equalizer = get_equalizer(prompts[1], equalizer_words, equalizer_strengths, tokenizer=tokenizer)
+        return AttentionReweight(
+            prompts,
+            num_inference_steps,
+            n_cross_replace,
+            n_self_replace,
+            tokenizer=tokenizer,
+            device=device,
+            equalizer=equalizer,
+        )
+
+    raise ValueError(f"Edit type {edit_type} not recognized. Use one of: replace, refine, reweight.")
+
+
+class AttentionControl(abc.ABC):
+    def step_callback(self, x_t):
+        return x_t
+
+    def between_steps(self):
+        return
+
+    @property
+    def num_uncond_att_layers(self):
+        return 0
+
+    @abc.abstractmethod
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        raise NotImplementedError
+
+    def __call__(self, attn, is_cross: bool, place_in_unet: str):
+        if self.cur_att_layer >= self.num_uncond_att_layers:
+            h = attn.shape[0]
+            attn[h // 2 :] = self.forward(attn[h // 2 :], is_cross, place_in_unet)
+        self.cur_att_layer += 1
+        if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
+            self.cur_att_layer = 0
+            self.cur_step += 1
+            self.between_steps()
+        return attn
+
+    def reset(self):
+        self.cur_step = 0
+        self.cur_att_layer = 0
+
+    def __init__(self):
+        self.cur_step = 0
+        self.num_att_layers = -1
+        self.cur_att_layer = 0
+
+
+class EmptyControl(AttentionControl):
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        return attn
+
+
+class AttentionStore(AttentionControl):
+    @staticmethod
+    def get_empty_store():
+        return {
+            "down_cross": [],
+            "mid_cross": [],
+            "up_cross": [],
+            "down_self": [],
+            "mid_self": [],
+            "up_self": [],
+        }
+
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
+        if attn.shape[1] <= 32**2:  # avoid memory overhead
+            self.step_store[key].append(attn)
+        return attn
+
+    def between_steps(self):
+        if len(self.attention_store) == 0:
+            self.attention_store = self.step_store
+        else:
+            for key in self.attention_store:
+                for i in range(len(self.attention_store[key])):
+                    self.attention_store[key][i] += self.step_store[key][i]
+        self.step_store = self.get_empty_store()
+
+    def get_average_attention(self):
+        average_attention = {
+            key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store
+        }
+        return average_attention
+
+    def reset(self):
+        super(AttentionStore, self).reset()
+        self.step_store = self.get_empty_store()
+        self.attention_store = {}
+
+    def __init__(self):
+        super(AttentionStore, self).__init__()
+        self.step_store = self.get_empty_store()
+        self.attention_store = {}
+
+
+class LocalBlend:
+    def __call__(self, x_t, attention_store):
+        k = 1
+        maps = attention_store["down_cross"][2:4] + attention_store["up_cross"][:3]
+        maps = [item.reshape(self.alpha_layers.shape[0], -1, 1, 16, 16, self.max_num_words) for item in maps]
+        maps = torch.cat(maps, dim=1)
+        maps = (maps * self.alpha_layers).sum(-1).mean(1)
+        mask = F.max_pool2d(maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k))
+        mask = F.interpolate(mask, size=(x_t.shape[2:]))
+        mask = mask / mask.max(2, keepdims=True)[0].max(3, keepdims=True)[0]
+        mask = mask.gt(self.threshold)
+        mask = (mask[:1] + mask[1:]).float()
+        x_t = x_t[:1] + mask * (x_t - x_t[:1])
+        return x_t
+
+    def __init__(
+        self,
+        prompts: List[str],
+        words: [List[List[str]]],
+        tokenizer,
+        device,
+        threshold=0.3,
+        max_num_words=77,
+    ):
+        self.max_num_words = 77
+
+        alpha_layers = torch.zeros(len(prompts), 1, 1, 1, 1, self.max_num_words)
+        for i, (prompt, words_) in enumerate(zip(prompts, words)):
+            if isinstance(words_, str):
+                words_ = [words_]
+            for word in words_:
+                ind = get_word_inds(prompt, word, tokenizer)
+                alpha_layers[i, :, :, :, :, ind] = 1
+        self.alpha_layers = alpha_layers.to(device)
+        self.threshold = threshold
+
+
+class AttentionControlEdit(AttentionStore, abc.ABC):
+    def step_callback(self, x_t):
+        if self.local_blend is not None:
+            x_t = self.local_blend(x_t, self.attention_store)
+        return x_t
+
+    def replace_self_attention(self, attn_base, att_replace):
+        if att_replace.shape[2] <= 16**2:
+            return attn_base.unsqueeze(0).expand(att_replace.shape[0], *attn_base.shape)
+        else:
+            return att_replace
+
+    @abc.abstractmethod
+    def replace_cross_attention(self, attn_base, att_replace):
+        raise NotImplementedError
+
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet)
+        # FIXME not replace correctly
+        if is_cross or (self.num_self_replace[0] <= self.cur_step < self.num_self_replace[1]):
+            h = attn.shape[0] // (self.batch_size)
+            attn = attn.reshape(self.batch_size, h, *attn.shape[1:])
+            attn_base, attn_repalce = attn[0], attn[1:]
+            if is_cross:
+                alpha_words = self.cross_replace_alpha[self.cur_step]
+                attn_repalce_new = (
+                    self.replace_cross_attention(attn_base, attn_repalce) * alpha_words
+                    + (1 - alpha_words) * attn_repalce
+                )
+                attn[1:] = attn_repalce_new
+            else:
+                attn[1:] = self.replace_self_attention(attn_base, attn_repalce)
+            attn = attn.reshape(self.batch_size * h, *attn.shape[2:])
+        return attn
+
+    def __init__(
+        self,
+        prompts,
+        num_steps: int,
+        cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
+        self_replace_steps: Union[float, Tuple[float, float]],
+        local_blend: Optional[LocalBlend],
+        tokenizer,
+        device,
+    ):
+        super(AttentionControlEdit, self).__init__()
+        # add tokenizer and device here
+
+        self.tokenizer = tokenizer
+        self.device = device
+
+        self.batch_size = len(prompts)
+        self.cross_replace_alpha = get_time_words_attention_alpha(
+            prompts, num_steps, cross_replace_steps, self.tokenizer
+        ).to(self.device)
+        if isinstance(self_replace_steps, float):
+            self_replace_steps = 0, self_replace_steps
+        self.num_self_replace = int(num_steps * self_replace_steps[0]), int(num_steps * self_replace_steps[1])
+        self.local_blend = local_blend  # 在外面定义后传进来
+
+
+class AttentionReplace(AttentionControlEdit):
+    def replace_cross_attention(self, attn_base, att_replace):
+        return torch.einsum("hpw,bwn->bhpn", attn_base, self.mapper)
+
+    def __init__(
+        self,
+        prompts,
+        num_steps: int,
+        cross_replace_steps: float,
+        self_replace_steps: float,
+        local_blend: Optional[LocalBlend] = None,
+        tokenizer=None,
+        device=None,
+    ):
+        super(AttentionReplace, self).__init__(
+            prompts,
+            num_steps,
+            cross_replace_steps,
+            self_replace_steps,
+            local_blend,
+            tokenizer,
+            device,
+        )
+        self.mapper = get_replacement_mapper(prompts, self.tokenizer).to(self.device)
+
+
+class AttentionRefine(AttentionControlEdit):
+    def replace_cross_attention(self, attn_base, att_replace):
+        attn_base_replace = attn_base[:, :, self.mapper].permute(2, 0, 1, 3)
+        attn_replace = attn_base_replace * self.alphas + att_replace * (1 - self.alphas)
+        return attn_replace
+
+    def __init__(
+        self,
+        prompts,
+        num_steps: int,
+        cross_replace_steps: float,
+        self_replace_steps: float,
+        local_blend: Optional[LocalBlend] = None,
+        tokenizer=None,
+        device=None,
+    ):
+        super(AttentionRefine, self).__init__(
+            prompts,
+            num_steps,
+            cross_replace_steps,
+            self_replace_steps,
+            local_blend,
+            tokenizer,
+            device,
+        )
+        self.mapper, alphas = get_refinement_mapper(prompts, self.tokenizer)
+        self.mapper, alphas = self.mapper.to(self.device), alphas.to(self.device)
+        self.alphas = alphas.reshape(alphas.shape[0], 1, 1, alphas.shape[1])
+
+
+class AttentionReweight(AttentionControlEdit):
+    def replace_cross_attention(self, attn_base, att_replace):
+        if self.prev_controller is not None:
+            attn_base = self.prev_controller.replace_cross_attention(attn_base, att_replace)
+        attn_replace = attn_base[None, :, :, :] * self.equalizer[:, None, None, :]
+        return attn_replace
+
+    def __init__(
+        self,
+        prompts,
+        num_steps: int,
+        cross_replace_steps: float,
+        self_replace_steps: float,
+        equalizer,
+        local_blend: Optional[LocalBlend] = None,
+        controller: Optional[AttentionControlEdit] = None,
+        tokenizer=None,
+        device=None,
+    ):
+        super(AttentionReweight, self).__init__(
+            prompts,
+            num_steps,
+            cross_replace_steps,
+            self_replace_steps,
+            local_blend,
+            tokenizer,
+            device,
+        )
+        self.equalizer = equalizer.to(self.device)
+        self.prev_controller = controller
+
+
+### util functions for all Edits
+def update_alpha_time_word(
+    alpha,
+    bounds: Union[float, Tuple[float, float]],
+    prompt_ind: int,
+    word_inds: Optional[torch.Tensor] = None,
+):
+    if isinstance(bounds, float):
+        bounds = 0, bounds
+    start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0])
+    if word_inds is None:
+        word_inds = torch.arange(alpha.shape[2])
+    alpha[:start, prompt_ind, word_inds] = 0
+    alpha[start:end, prompt_ind, word_inds] = 1
+    alpha[end:, prompt_ind, word_inds] = 0
+    return alpha
+
+
+def get_time_words_attention_alpha(
+    prompts,
+    num_steps,
+    cross_replace_steps: Union[float, Dict[str, Tuple[float, float]]],
+    tokenizer,
+    max_num_words=77,
+):
+    if not isinstance(cross_replace_steps, dict):
+        cross_replace_steps = {"default_": cross_replace_steps}
+    if "default_" not in cross_replace_steps:
+        cross_replace_steps["default_"] = (0.0, 1.0)
+    alpha_time_words = torch.zeros(num_steps + 1, len(prompts) - 1, max_num_words)
+    for i in range(len(prompts) - 1):
+        alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"], i)
+    for key, item in cross_replace_steps.items():
+        if key != "default_":
+            inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))]
+            for i, ind in enumerate(inds):
+                if len(ind) > 0:
+                    alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind)
+    alpha_time_words = alpha_time_words.reshape(num_steps + 1, len(prompts) - 1, 1, 1, max_num_words)
+    return alpha_time_words
+
+
+### util functions for LocalBlend and ReplacementEdit
+def get_word_inds(text: str, word_place: int, tokenizer):
+    split_text = text.split(" ")
+    if isinstance(word_place, str):
+        word_place = [i for i, word in enumerate(split_text) if word_place == word]
+    elif isinstance(word_place, int):
+        word_place = [word_place]
+    out = []
+    if len(word_place) > 0:
+        words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
+        cur_len, ptr = 0, 0
+
+        for i in range(len(words_encode)):
+            cur_len += len(words_encode[i])
+            if ptr in word_place:
+                out.append(i + 1)
+            if cur_len >= len(split_text[ptr]):
+                ptr += 1
+                cur_len = 0
+    return np.array(out)
+
+
+### util functions for ReplacementEdit
+def get_replacement_mapper_(x: str, y: str, tokenizer, max_len=77):
+    words_x = x.split(" ")
+    words_y = y.split(" ")
+    if len(words_x) != len(words_y):
+        raise ValueError(
+            f"attention replacement edit can only be applied on prompts with the same length"
+            f" but prompt A has {len(words_x)} words and prompt B has {len(words_y)} words."
+        )
+    inds_replace = [i for i in range(len(words_y)) if words_y[i] != words_x[i]]
+    inds_source = [get_word_inds(x, i, tokenizer) for i in inds_replace]
+    inds_target = [get_word_inds(y, i, tokenizer) for i in inds_replace]
+    mapper = np.zeros((max_len, max_len))
+    i = j = 0
+    cur_inds = 0
+    while i < max_len and j < max_len:
+        if cur_inds < len(inds_source) and inds_source[cur_inds][0] == i:
+            inds_source_, inds_target_ = inds_source[cur_inds], inds_target[cur_inds]
+            if len(inds_source_) == len(inds_target_):
+                mapper[inds_source_, inds_target_] = 1
+            else:
+                ratio = 1 / len(inds_target_)
+                for i_t in inds_target_:
+                    mapper[inds_source_, i_t] = ratio
+            cur_inds += 1
+            i += len(inds_source_)
+            j += len(inds_target_)
+        elif cur_inds < len(inds_source):
+            mapper[i, j] = 1
+            i += 1
+            j += 1
+        else:
+            mapper[j, j] = 1
+            i += 1
+            j += 1
+
+    return torch.from_numpy(mapper).float()
+
+
+def get_replacement_mapper(prompts, tokenizer, max_len=77):
+    x_seq = prompts[0]
+    mappers = []
+    for i in range(1, len(prompts)):
+        mapper = get_replacement_mapper_(x_seq, prompts[i], tokenizer, max_len)
+        mappers.append(mapper)
+    return torch.stack(mappers)
+
+
+### util functions for ReweightEdit
+def get_equalizer(
+    text: str,
+    word_select: Union[int, Tuple[int, ...]],
+    values: Union[List[float], Tuple[float, ...]],
+    tokenizer,
+):
+    if isinstance(word_select, (int, str)):
+        word_select = (word_select,)
+    equalizer = torch.ones(len(values), 77)
+    values = torch.tensor(values, dtype=torch.float32)
+    for word in word_select:
+        inds = get_word_inds(text, word, tokenizer)
+        equalizer[:, inds] = values
+    return equalizer
+
+
+### util functions for RefinementEdit
+class ScoreParams:
+    def __init__(self, gap, match, mismatch):
+        self.gap = gap
+        self.match = match
+        self.mismatch = mismatch
+
+    def mis_match_char(self, x, y):
+        if x != y:
+            return self.mismatch
+        else:
+            return self.match
+
+
+def get_matrix(size_x, size_y, gap):
+    matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
+    matrix[0, 1:] = (np.arange(size_y) + 1) * gap
+    matrix[1:, 0] = (np.arange(size_x) + 1) * gap
+    return matrix
+
+
+def get_traceback_matrix(size_x, size_y):
+    matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
+    matrix[0, 1:] = 1
+    matrix[1:, 0] = 2
+    matrix[0, 0] = 4
+    return matrix
+
+
+def global_align(x, y, score):
+    matrix = get_matrix(len(x), len(y), score.gap)
+    trace_back = get_traceback_matrix(len(x), len(y))
+    for i in range(1, len(x) + 1):
+        for j in range(1, len(y) + 1):
+            left = matrix[i, j - 1] + score.gap
+            up = matrix[i - 1, j] + score.gap
+            diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], y[j - 1])
+            matrix[i, j] = max(left, up, diag)
+            if matrix[i, j] == left:
+                trace_back[i, j] = 1
+            elif matrix[i, j] == up:
+                trace_back[i, j] = 2
+            else:
+                trace_back[i, j] = 3
+    return matrix, trace_back
+
+
+def get_aligned_sequences(x, y, trace_back):
+    x_seq = []
+    y_seq = []
+    i = len(x)
+    j = len(y)
+    mapper_y_to_x = []
+    while i > 0 or j > 0:
+        if trace_back[i, j] == 3:
+            x_seq.append(x[i - 1])
+            y_seq.append(y[j - 1])
+            i = i - 1
+            j = j - 1
+            mapper_y_to_x.append((j, i))
+        elif trace_back[i][j] == 1:
+            x_seq.append("-")
+            y_seq.append(y[j - 1])
+            j = j - 1
+            mapper_y_to_x.append((j, -1))
+        elif trace_back[i][j] == 2:
+            x_seq.append(x[i - 1])
+            y_seq.append("-")
+            i = i - 1
+        elif trace_back[i][j] == 4:
+            break
+    mapper_y_to_x.reverse()
+    return x_seq, y_seq, torch.tensor(mapper_y_to_x, dtype=torch.int64)
+
+
+def get_mapper(x: str, y: str, tokenizer, max_len=77):
+    x_seq = tokenizer.encode(x)
+    y_seq = tokenizer.encode(y)
+    score = ScoreParams(0, 1, -1)
+    matrix, trace_back = global_align(x_seq, y_seq, score)
+    mapper_base = get_aligned_sequences(x_seq, y_seq, trace_back)[-1]
+    alphas = torch.ones(max_len)
+    alphas[: mapper_base.shape[0]] = mapper_base[:, 1].ne(-1).float()
+    mapper = torch.zeros(max_len, dtype=torch.int64)
+    mapper[: mapper_base.shape[0]] = mapper_base[:, 1]
+    mapper[mapper_base.shape[0] :] = len(y_seq) + torch.arange(max_len - len(y_seq))
+    return mapper, alphas
+
+
+def get_refinement_mapper(prompts, tokenizer, max_len=77):
+    x_seq = prompts[0]
+    mappers, alphas = [], []
+    for i in range(1, len(prompts)):
+        mapper, alpha = get_mapper(x_seq, prompts[i], tokenizer, max_len)
+        mappers.append(mapper)
+        alphas.append(alpha)
+    return torch.stack(mappers), torch.stack(alphas)
diff --git a/diffusers/examples/community/pipeline_sdxl_style_aligned.py b/diffusers/examples/community/pipeline_sdxl_style_aligned.py
new file mode 100644
index 0000000000000000000000000000000000000000..8328bc2caed92f02d21aea7ce386da3dfc6d94a9
--- /dev/null
+++ b/diffusers/examples/community/pipeline_sdxl_style_aligned.py
@@ -0,0 +1,1912 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Based on [Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133).
+# Authors: Amir Hertz, Andrey Voynov, Shlomi Fruchter, Daniel Cohen-Or
+# Project Page: https://style-aligned-gen.github.io/
+# Code: https://github.com/google/style-aligned
+#
+# Adapted to Diffusers by [Aryan V S](https://github.com/a-r-r-o-w/).
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from diffusers.models.attention_processor import (
+    Attention,
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+if is_invisible_watermark_available():
+    from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from typing import List
+
+        >>> import torch
+        >>> from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+        >>> from PIL import Image
+
+        >>> model_id = "a-r-r-o-w/dreamshaper-xl-turbo"
+        >>> pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16", custom_pipeline="pipeline_sdxl_style_aligned")
+        >>> pipe = pipe.to("cuda")
+
+        # Enable memory saving techniques
+        >>> pipe.enable_vae_slicing()
+        >>> pipe.enable_vae_tiling()
+
+        >>> prompt = [
+        ...     "a toy train. macro photo. 3d game asset",
+        ...     "a toy airplane. macro photo. 3d game asset",
+        ...     "a toy bicycle. macro photo. 3d game asset",
+        ...     "a toy car. macro photo. 3d game asset",
+        ... ]
+        >>> negative_prompt = "low quality, worst quality, "
+
+        >>> # Enable StyleAligned
+        >>> pipe.enable_style_aligned(
+        ...     share_group_norm=False,
+        ...     share_layer_norm=False,
+        ...     share_attention=True,
+        ...     adain_queries=True,
+        ...     adain_keys=True,
+        ...     adain_values=False,
+        ...     full_attention_share=False,
+        ...     shared_score_scale=1.0,
+        ...     shared_score_shift=0.0,
+        ...     only_self_level=0.0,
+        >>> )
+
+        >>> # Run inference
+        >>> images = pipe(
+        ...     prompt=prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     guidance_scale=2,
+        ...     height=1024,
+        ...     width=1024,
+        ...     num_inference_steps=10,
+        ...     generator=torch.Generator().manual_seed(42),
+        >>> ).images
+
+        >>> # Disable StyleAligned if you do not wish to use it anymore
+        >>> pipe.disable_style_aligned()
+        ```
+"""
+
+
+def expand_first(feat: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+    b = feat.shape[0]
+    feat_style = torch.stack((feat[0], feat[b // 2])).unsqueeze(1)
+    if scale == 1:
+        feat_style = feat_style.expand(2, b // 2, *feat.shape[1:])
+    else:
+        feat_style = feat_style.repeat(1, b // 2, 1, 1, 1)
+        feat_style = torch.cat([feat_style[:, :1], scale * feat_style[:, 1:]], dim=1)
+    return feat_style.reshape(*feat.shape)
+
+
+def concat_first(feat: torch.Tensor, dim: int = 2, scale: float = 1.0) -> torch.Tensor:
+    feat_style = expand_first(feat, scale=scale)
+    return torch.cat((feat, feat_style), dim=dim)
+
+
+def calc_mean_std(feat: torch.Tensor, eps: float = 1e-5) -> Tuple[torch.Tensor, torch.Tensor]:
+    feat_std = (feat.var(dim=-2, keepdims=True) + eps).sqrt()
+    feat_mean = feat.mean(dim=-2, keepdims=True)
+    return feat_mean, feat_std
+
+
+def adain(feat: torch.Tensor) -> torch.Tensor:
+    feat_mean, feat_std = calc_mean_std(feat)
+    feat_style_mean = expand_first(feat_mean)
+    feat_style_std = expand_first(feat_std)
+    feat = (feat - feat_mean) / feat_std
+    feat = feat * feat_style_std + feat_style_mean
+    return feat
+
+
+def get_switch_vec(total_num_layers, level):
+    if level == 0:
+        return torch.zeros(total_num_layers, dtype=torch.bool)
+    if level == 1:
+        return torch.ones(total_num_layers, dtype=torch.bool)
+    to_flip = level > 0.5
+    if to_flip:
+        level = 1 - level
+    num_switch = int(level * total_num_layers)
+    vec = torch.arange(total_num_layers)
+    vec = vec % (total_num_layers // num_switch)
+    vec = vec == 0
+    if to_flip:
+        vec = ~vec
+    return vec
+
+
+class SharedAttentionProcessor(AttnProcessor2_0):
+    def __init__(
+        self,
+        share_attention: bool = True,
+        adain_queries: bool = True,
+        adain_keys: bool = True,
+        adain_values: bool = False,
+        full_attention_share: bool = False,
+        shared_score_scale: float = 1.0,
+        shared_score_shift: float = 0.0,
+    ):
+        r"""Shared Attention Processor as proposed in the StyleAligned paper."""
+        super().__init__()
+        self.share_attention = share_attention
+        self.adain_queries = adain_queries
+        self.adain_keys = adain_keys
+        self.adain_values = adain_values
+        self.full_attention_share = full_attention_share
+        self.shared_score_scale = shared_score_scale
+        self.shared_score_shift = shared_score_shift
+
+    def shifted_scaled_dot_product_attention(
+        self, attn: Attention, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
+    ) -> torch.Tensor:
+        logits = torch.einsum("bhqd,bhkd->bhqk", query, key) * attn.scale
+        logits[:, :, :, query.shape[2] :] += self.shared_score_shift
+        probs = logits.softmax(-1)
+        return torch.einsum("bhqk,bhkd->bhqd", probs, value)
+
+    def shared_call(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        residual = hidden_states
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if self.adain_queries:
+            query = adain(query)
+        if self.adain_keys:
+            key = adain(key)
+        if self.adain_values:
+            value = adain(value)
+        if self.share_attention:
+            key = concat_first(key, -2, scale=self.shared_score_scale)
+            value = concat_first(value, -2)
+            if self.shared_score_shift != 0:
+                hidden_states = self.shifted_scaled_dot_product_attention(attn, query, key, value)
+            else:
+                hidden_states = F.scaled_dot_product_attention(
+                    query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+                )
+        else:
+            hidden_states = F.scaled_dot_product_attention(
+                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+            )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        if self.full_attention_share:
+            b, n, d = hidden_states.shape
+            k = 2
+            hidden_states = hidden_states.view(k, b, n, d).permute(0, 1, 3, 2).contiguous().view(-1, n, d)
+            # hidden_states = einops.rearrange(hidden_states, "(k b) n d -> k (b n) d", k=2)
+            hidden_states = super().__call__(
+                attn,
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
+            hidden_states = hidden_states.view(k, b, n, d).permute(0, 1, 3, 2).contiguous().view(-1, n, d)
+            # hidden_states = einops.rearrange(hidden_states, "k (b n) d -> (k b) n d", n=n)
+        else:
+            hidden_states = self.shared_call(attn, hidden_states, hidden_states, attention_mask, **kwargs)
+
+        return hidden_states
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class StyleAlignedSDXLPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This pipeline also adds experimental support for [StyleAligned](https://arxiv.org/abs/2312.02133). It can
+    be enabled/disabled using `.enable_style_aligned()` or `.disable_style_aligned()` respectively.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+        )
+
+        self.default_sample_size = self.unet.config.sample_size
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+        else:
+            t_start = 0
+
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        # Strength is irrelevant if we directly request a timestep to start at;
+        # that is, strength is determined by the denoising_start instead.
+        if denoising_start is not None:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            timesteps = timesteps[-num_inference_steps:]
+            return timesteps, num_inference_steps
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(
+        self,
+        image,
+        mask,
+        width,
+        height,
+        num_channels_latents,
+        timestep,
+        batch_size,
+        num_images_per_prompt,
+        dtype,
+        device,
+        generator=None,
+        add_noise=True,
+        latents=None,
+        is_strength_max=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        batch_size *= num_images_per_prompt
+
+        if image is None:
+            shape = (
+                batch_size,
+                num_channels_latents,
+                int(height) // self.vae_scale_factor,
+                int(width) // self.vae_scale_factor,
+            )
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            if latents is None:
+                latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            else:
+                latents = latents.to(device)
+
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
+            return latents
+
+        elif mask is None:
+            if not isinstance(image, (torch.Tensor, Image.Image, list)):
+                raise ValueError(
+                    f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+                )
+
+            # Offload text encoder if `enable_model_cpu_offload` was enabled
+            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+                self.text_encoder_2.to("cpu")
+                torch.cuda.empty_cache()
+
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 4:
+                init_latents = image
+
+            else:
+                # make sure the VAE is in float32 mode, as it overflows in float16
+                if self.vae.config.force_upcast:
+                    image = image.float()
+                    self.vae.to(dtype=torch.float32)
+
+                if isinstance(generator, list) and len(generator) != batch_size:
+                    raise ValueError(
+                        f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                        f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                    )
+
+                elif isinstance(generator, list):
+                    init_latents = [
+                        retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                        for i in range(batch_size)
+                    ]
+                    init_latents = torch.cat(init_latents, dim=0)
+                else:
+                    init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+                if self.vae.config.force_upcast:
+                    self.vae.to(dtype)
+
+                init_latents = init_latents.to(dtype)
+                init_latents = self.vae.config.scaling_factor * init_latents
+
+            if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+                # expand init_latents for batch_size
+                additional_image_per_prompt = batch_size // init_latents.shape[0]
+                init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+            elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+                )
+            else:
+                init_latents = torch.cat([init_latents], dim=0)
+
+            if add_noise:
+                shape = init_latents.shape
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                # get latents
+                init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+            latents = init_latents
+            return latents
+
+        else:
+            shape = (
+                batch_size,
+                num_channels_latents,
+                int(height) // self.vae_scale_factor,
+                int(width) // self.vae_scale_factor,
+            )
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            if (image is None or timestep is None) and not is_strength_max:
+                raise ValueError(
+                    "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                    "However, either the image or the noise timestep has not been provided."
+                )
+
+            if image.shape[1] == 4:
+                image_latents = image.to(device=device, dtype=dtype)
+                image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+            elif return_image_latents or (latents is None and not is_strength_max):
+                image = image.to(device=device, dtype=dtype)
+                image_latents = self._encode_vae_image(image=image, generator=generator)
+                image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+            if latents is None and add_noise:
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                # if strength is 1. then initialise the latents to noise, else initial to image + noise
+                latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+                # if pure noise then scale the initial latents by the  Scheduler's init sigma
+                latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+            elif add_noise:
+                noise = latents.to(device)
+                latents = noise * self.scheduler.init_noise_sigma
+            else:
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                latents = image_latents.to(device)
+
+            outputs = (latents,)
+
+            if return_noise:
+                outputs += (noise,)
+
+            if return_image_latents:
+                outputs += (image_latents,)
+
+            return outputs
+
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+
+        if masked_image is not None and masked_image.shape[1] == 4:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = None
+
+        if masked_image is not None:
+            if masked_image_latents is None:
+                masked_image = masked_image.to(device=device, dtype=dtype)
+                masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+
+            if masked_image_latents.shape[0] < batch_size:
+                if not batch_size % masked_image_latents.shape[0] == 0:
+                    raise ValueError(
+                        "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                        f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                        " Make sure the number of images that you pass is divisible by the total requested batch size."
+                    )
+                masked_image_latents = masked_image_latents.repeat(
+                    batch_size // masked_image_latents.shape[0], 1, 1, 1
+                )
+
+            masked_image_latents = (
+                torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+            )
+
+            # aligning device to prevent device errors when concating it with the latent model input
+            masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        return mask, masked_image_latents
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        dtype = image.dtype
+        if self.vae.config.force_upcast:
+            image = image.float()
+            self.vae.to(dtype=torch.float32)
+
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        if self.vae.config.force_upcast:
+            self.vae.to(dtype)
+
+        image_latents = image_latents.to(dtype)
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    def _enable_shared_attention_processors(
+        self,
+        share_attention: bool,
+        adain_queries: bool,
+        adain_keys: bool,
+        adain_values: bool,
+        full_attention_share: bool,
+        shared_score_scale: float,
+        shared_score_shift: float,
+        only_self_level: float,
+    ):
+        r"""Helper method to enable usage of Shared Attention Processor."""
+        attn_procs = {}
+        num_self_layers = len([name for name in self.unet.attn_processors.keys() if "attn1" in name])
+
+        only_self_vec = get_switch_vec(num_self_layers, only_self_level)
+
+        for i, name in enumerate(self.unet.attn_processors.keys()):
+            is_self_attention = "attn1" in name
+            if is_self_attention:
+                if only_self_vec[i // 2]:
+                    attn_procs[name] = AttnProcessor2_0()
+                else:
+                    attn_procs[name] = SharedAttentionProcessor(
+                        share_attention=share_attention,
+                        adain_queries=adain_queries,
+                        adain_keys=adain_keys,
+                        adain_values=adain_values,
+                        full_attention_share=full_attention_share,
+                        shared_score_scale=shared_score_scale,
+                        shared_score_shift=shared_score_shift,
+                    )
+            else:
+                attn_procs[name] = AttnProcessor2_0()
+
+        self.unet.set_attn_processor(attn_procs)
+
+    def _disable_shared_attention_processors(self):
+        r"""
+        Helper method to disable usage of the Shared Attention Processor. All processors
+        are reset to the default Attention Processor for pytorch versions above 2.0.
+        """
+        attn_procs = {}
+
+        for i, name in enumerate(self.unet.attn_processors.keys()):
+            attn_procs[name] = AttnProcessor2_0()
+
+        self.unet.set_attn_processor(attn_procs)
+
+    def _register_shared_norm(self, share_group_norm: bool = True, share_layer_norm: bool = True):
+        r"""Helper method to register shared group/layer normalization layers."""
+
+        def register_norm_forward(norm_layer: Union[nn.GroupNorm, nn.LayerNorm]) -> Union[nn.GroupNorm, nn.LayerNorm]:
+            if not hasattr(norm_layer, "orig_forward"):
+                setattr(norm_layer, "orig_forward", norm_layer.forward)
+            orig_forward = norm_layer.orig_forward
+
+            def forward_(hidden_states: torch.Tensor) -> torch.Tensor:
+                n = hidden_states.shape[-2]
+                hidden_states = concat_first(hidden_states, dim=-2)
+                hidden_states = orig_forward(hidden_states)
+                return hidden_states[..., :n, :]
+
+            norm_layer.forward = forward_
+            return norm_layer
+
+        def get_norm_layers(pipeline_, norm_layers_: Dict[str, List[Union[nn.GroupNorm, nn.LayerNorm]]]):
+            if isinstance(pipeline_, nn.LayerNorm) and share_layer_norm:
+                norm_layers_["layer"].append(pipeline_)
+            if isinstance(pipeline_, nn.GroupNorm) and share_group_norm:
+                norm_layers_["group"].append(pipeline_)
+            else:
+                for layer in pipeline_.children():
+                    get_norm_layers(layer, norm_layers_)
+
+        norm_layers = {"group": [], "layer": []}
+        get_norm_layers(self.unet, norm_layers)
+
+        norm_layers_list = []
+        for key in ["group", "layer"]:
+            for layer in norm_layers[key]:
+                norm_layers_list.append(register_norm_forward(layer))
+
+        return norm_layers_list
+
+    @property
+    def style_aligned_enabled(self):
+        r"""Returns whether StyleAligned has been enabled in the pipeline or not."""
+        return hasattr(self, "_style_aligned_norm_layers") and self._style_aligned_norm_layers is not None
+
+    def enable_style_aligned(
+        self,
+        share_group_norm: bool = True,
+        share_layer_norm: bool = True,
+        share_attention: bool = True,
+        adain_queries: bool = True,
+        adain_keys: bool = True,
+        adain_values: bool = False,
+        full_attention_share: bool = False,
+        shared_score_scale: float = 1.0,
+        shared_score_shift: float = 0.0,
+        only_self_level: float = 0.0,
+    ):
+        r"""
+        Enables the StyleAligned mechanism as in https://arxiv.org/abs/2312.02133.
+
+        Args:
+            share_group_norm (`bool`, defaults to `True`):
+                Whether or not to use shared group normalization layers.
+            share_layer_norm (`bool`, defaults to `True`):
+                Whether or not to use shared layer normalization layers.
+            share_attention (`bool`, defaults to `True`):
+                Whether or not to use attention sharing between batch images.
+            adain_queries (`bool`, defaults to `True`):
+                Whether or not to apply the AdaIn operation on attention queries.
+            adain_keys (`bool`, defaults to `True`):
+                Whether or not to apply the AdaIn operation on attention keys.
+            adain_values (`bool`, defaults to `False`):
+                Whether or not to apply the AdaIn operation on attention values.
+            full_attention_share (`bool`, defaults to `False`):
+                Whether or not to use full attention sharing between all images in a batch. Can
+                lead to content leakage within each batch and some loss in diversity.
+            shared_score_scale (`float`, defaults to `1.0`):
+                Scale for shared attention.
+        """
+        self._style_aligned_norm_layers = self._register_shared_norm(share_group_norm, share_layer_norm)
+        self._enable_shared_attention_processors(
+            share_attention=share_attention,
+            adain_queries=adain_queries,
+            adain_keys=adain_keys,
+            adain_values=adain_values,
+            full_attention_share=full_attention_share,
+            shared_score_scale=shared_score_scale,
+            shared_score_shift=shared_score_shift,
+            only_self_level=only_self_level,
+        )
+
+    def disable_style_aligned(self):
+        r"""Disables the StyleAligned mechanism if it had been previously enabled."""
+        if self.style_aligned_enabled:
+            for layer in self._style_aligned_norm_layers:
+                layer.forward = layer.orig_forward
+
+            self._style_aligned_norm_layers = None
+            self._disable_shared_attention_processors()
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def denoising_start(self):
+        return self._denoising_start
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: Optional[PipelineImageInput] = None,
+        mask_image: Optional[PipelineImageInput] = None,
+        masked_image_latents: Optional[torch.Tensor] = None,
+        strength: float = 0.3,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            height=height,
+            width=width,
+            callback_steps=callback_steps,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Preprocess image and mask_image
+        if image is not None:
+            image = self.image_processor.preprocess(image, height=height, width=width)
+            image = image.to(device=self.device, dtype=prompt_embeds.dtype)
+
+        if mask_image is not None:
+            mask = self.mask_processor.preprocess(mask_image, height=height, width=width)
+            mask = mask.to(device=self.device, dtype=prompt_embeds.dtype)
+
+            if masked_image_latents is not None:
+                masked_image = masked_image_latents
+            elif image.shape[1] == 4:
+                # if image is in latent space, we can't mask it
+                masked_image = None
+            else:
+                masked_image = image * (mask < 0.5)
+        else:
+            mask = None
+
+        # 4. Prepare timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        if image is not None:
+            timesteps, num_inference_steps = self.get_timesteps(
+                num_inference_steps,
+                strength,
+                device,
+                denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
+            )
+
+            # check that number of inference steps is not < 1 - as this doesn't make sense
+            if num_inference_steps < 1:
+                raise ValueError(
+                    f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                    f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+                )
+
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        is_strength_max = strength == 1.0
+        add_noise = True if self.denoising_start is None else False
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
+        latents = self.prepare_latents(
+            image=image,
+            mask=mask,
+            width=width,
+            height=height,
+            num_channels_latents=num_channels_latents,
+            timestep=latent_timestep,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            dtype=prompt_embeds.dtype,
+            device=device,
+            generator=generator,
+            add_noise=add_noise,
+            latents=latents,
+            is_strength_max=is_strength_max,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if mask is not None:
+            if return_image_latents:
+                latents, noise, image_latents = latents
+            else:
+                latents, noise = latents
+
+            mask, masked_image_latents = self.prepare_mask_latents(
+                mask=mask,
+                masked_image=masked_image,
+                batch_size=batch_size * num_images_per_prompt,
+                height=height,
+                width=width,
+                dtype=prompt_embeds.dtype,
+                device=device,
+                generator=generator,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+            # Check that sizes of mask, masked image and latents match
+            if num_channels_unet == 9:
+                # default case for runwayml/stable-diffusion-inpainting
+                num_channels_mask = mask.shape[1]
+                num_channels_masked_image = masked_image_latents.shape[1]
+                if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet:
+                    raise ValueError(
+                        f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                        f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                        f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                        f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                        " `pipeline.unet` or your `mask_image` or `image` input."
+                    )
+            elif num_channels_unet != 4:
+                raise ValueError(
+                    f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+                )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        add_time_ids = self._get_add_time_ids(
+            original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
+        )
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
+            )
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+                image_embeds = image_embeds.to(device)
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 8.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 9. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if mask is not None and num_channels_unet == 4:
+                    init_latents_proper = image_latents
+
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/examples/community/pipeline_stable_diffusion_3_differential_img2img.py b/diffusers/examples/community/pipeline_stable_diffusion_3_differential_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cee5ecbc1411bd763c5fe778c2f508c3074088a
--- /dev/null
+++ b/diffusers/examples/community/pipeline_stable_diffusion_3_differential_img2img.py
@@ -0,0 +1,981 @@
+# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.models.autoencoders import AutoencoderKL
+from diffusers.models.transformers import SD3Transformer2DModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion_3.pipeline_output import StableDiffusion3PipelineOutput
+from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+from diffusers.utils import (
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+
+        >>> from diffusers import AutoPipelineForImage2Image
+        >>> from diffusers.utils import load_image
+
+        >>> device = "cuda"
+        >>> model_id_or_path = "stabilityai/stable-diffusion-3-medium-diffusers"
+        >>> pipe = AutoPipelineForImage2Image.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
+        >>> pipe = pipe.to(device)
+
+        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+        >>> init_image = load_image(url).resize((512, 512))
+
+        >>> prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
+
+        >>> images = pipe(prompt=prompt, image=init_image, strength=0.95, guidance_scale=7.5).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusion3DifferentialImg2ImgPipeline(DiffusionPipeline):
+    r"""
+    Args:
+        transformer ([`SD3Transformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
+            with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
+            as its dimension.
+        text_encoder_2 ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        text_encoder_3 ([`T5EncoderModel`]):
+            Frozen text-encoder. Stable Diffusion 3 uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_3 (`T5TokenizerFast`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
+
+    def __init__(
+        self,
+        transformer: SD3Transformer2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer_2: CLIPTokenizer,
+        text_encoder_3: T5EncoderModel,
+        tokenizer_3: T5TokenizerFast,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            text_encoder_3=text_encoder_3,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            tokenizer_3=tokenizer_3,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, vae_latent_channels=self.vae.config.latent_channels
+        )
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_convert_grayscale=True
+        )
+
+        self.tokenizer_max_length = self.tokenizer.model_max_length
+        self.default_sample_size = self.transformer.config.sample_size
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 256,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        if self.text_encoder_3 is None:
+            return torch.zeros(
+                (
+                    batch_size * num_images_per_prompt,
+                    self.tokenizer_max_length,
+                    self.transformer.config.joint_attention_dim,
+                ),
+                device=device,
+                dtype=dtype,
+            )
+
+        text_inputs = self.tokenizer_3(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
+
+        dtype = self.text_encoder_3.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        clip_skip: Optional[int] = None,
+        clip_model_index: int = 0,
+    ):
+        device = device or self._execution_device
+
+        clip_tokenizers = [self.tokenizer, self.tokenizer_2]
+        clip_text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = clip_tokenizers[clip_model_index]
+        text_encoder = clip_text_encoders[clip_model_index]
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+        pooled_prompt_embeds = prompt_embeds[0]
+
+        if clip_skip is None:
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+        else:
+            prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds, pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        prompt_3: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        clip_skip: Optional[int] = None,
+        max_sequence_length: int = 256,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            prompt_3 = prompt_3 or prompt
+            prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3
+
+            prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=0,
+            )
+            prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                prompt=prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=1,
+            )
+            clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)
+
+            t5_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            clip_prompt_embeds = torch.nn.functional.pad(
+                clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+            )
+
+            prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+            pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+            negative_prompt_3 = negative_prompt_3 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+            negative_prompt_3 = (
+                batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3
+            )
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds(
+                negative_prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=0,
+            )
+            negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                negative_prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=1,
+            )
+            negative_clip_prompt_embeds = torch.cat([negative_prompt_embed, negative_prompt_2_embed], dim=-1)
+
+            t5_negative_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=negative_prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            negative_clip_prompt_embeds = torch.nn.functional.pad(
+                negative_clip_prompt_embeds,
+                (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]),
+            )
+
+            negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2)
+            negative_pooled_prompt_embeds = torch.cat(
+                [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
+            )
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        prompt_3,
+        strength,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        negative_prompt_3=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_3 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_3`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        elif prompt_3 is not None and (not isinstance(prompt_3, str) and not isinstance(prompt_3, list)):
+            raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_3 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(
+        self, batch_size, num_channels_latents, height, width, image, timestep, dtype, device, generator=None
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+
+        image = image.to(device=device, dtype=dtype)
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        elif isinstance(generator, list):
+            init_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size)
+            ]
+            init_latents = torch.cat(init_latents, dim=0)
+        else:
+            init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        init_latents = (init_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        init_latents = self.scheduler.scale_noise(init_latents, timestep, noise)
+        latents = init_latents.to(device=device, dtype=dtype)
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        prompt_3: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.6,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+        map: PipelineImageInput = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                will be used instead
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used instead
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used instead
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        # 0. Default height and width
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            prompt_3,
+            strength,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_3=prompt_3,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            device=device,
+            clip_skip=self.clip_skip,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+        )
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+        # 3. Preprocess image
+        init_image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+
+        map = self.mask_processor.preprocess(
+            map, height=height // self.vae_scale_factor, width=width // self.vae_scale_factor
+        ).to(device)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        # begin diff diff change
+        total_time_steps = num_inference_steps
+        # end diff diff change
+
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        if latents is None:
+            latents = self.prepare_latents(
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height,
+                width,
+                init_image,
+                latent_timestep,
+                prompt_embeds.dtype,
+                device,
+                generator,
+            )
+
+        # 6. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # preparations for diff diff
+        original_with_noise = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            init_image,
+            timesteps,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+        thresholds = torch.arange(total_time_steps, dtype=map.dtype) / total_time_steps
+        thresholds = thresholds.unsqueeze(1).unsqueeze(1).to(device)
+        masks = map.squeeze() > thresholds
+        # end diff diff preparations
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # diff diff
+                if i == 0:
+                    latents = original_with_noise[:1]
+                else:
+                    mask = masks[i].unsqueeze(0).to(latents.dtype)
+                    mask = mask.unsqueeze(1)  # fit shape
+                    latents = original_with_noise[i] * mask + latents * (1 - mask)
+                # end diff diff
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusion3PipelineOutput(images=image)
diff --git a/diffusers/examples/community/pipeline_stable_diffusion_boxdiff.py b/diffusers/examples/community/pipeline_stable_diffusion_boxdiff.py
new file mode 100644
index 0000000000000000000000000000000000000000..6490c1400138c09c7ab85b5e0ea79951586df281
--- /dev/null
+++ b/diffusers/examples/community/pipeline_stable_diffusion_boxdiff.py
@@ -0,0 +1,1705 @@
+# Copyright 2024 Jingyang Zhang and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import abc
+import inspect
+import math
+import numbers
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from diffusers.models.attention_processor import Attention, FusedAttnProcessor2_0
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionPipeline
+
+        >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+class GaussianSmoothing(nn.Module):
+    """
+    Copied from official repo: https://github.com/showlab/BoxDiff/blob/master/utils/gaussian_smoothing.py
+    Apply gaussian smoothing on a
+    1d, 2d or 3d tensor. Filtering is performed seperately for each channel
+    in the input using a depthwise convolution.
+    Arguments:
+        channels (int, sequence): Number of channels of the input tensors. Output will
+            have this number of channels as well.
+        kernel_size (int, sequence): Size of the gaussian kernel.
+        sigma (float, sequence): Standard deviation of the gaussian kernel.
+        dim (int, optional): The number of dimensions of the data.
+            Default value is 2 (spatial).
+    """
+
+    def __init__(self, channels, kernel_size, sigma, dim=2):
+        super(GaussianSmoothing, self).__init__()
+        if isinstance(kernel_size, numbers.Number):
+            kernel_size = [kernel_size] * dim
+        if isinstance(sigma, numbers.Number):
+            sigma = [sigma] * dim
+
+        # The gaussian kernel is the product of the
+        # gaussian function of each dimension.
+        kernel = 1
+        meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
+        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
+            mean = (size - 1) / 2
+            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
+
+        # Make sure sum of values in gaussian kernel equals 1.
+        kernel = kernel / torch.sum(kernel)
+
+        # Reshape to depthwise convolutional weight
+        kernel = kernel.view(1, 1, *kernel.size())
+        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))
+
+        self.register_buffer("weight", kernel)
+        self.groups = channels
+
+        if dim == 1:
+            self.conv = F.conv1d
+        elif dim == 2:
+            self.conv = F.conv2d
+        elif dim == 3:
+            self.conv = F.conv3d
+        else:
+            raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim))
+
+    def forward(self, input):
+        """
+        Apply gaussian filter to input.
+        Arguments:
+            input (torch.Tensor): Input to apply gaussian filter on.
+        Returns:
+            filtered (torch.Tensor): Filtered output.
+        """
+        return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups)
+
+
+class AttendExciteCrossAttnProcessor:
+    def __init__(self, attnstore, place_in_unet):
+        super().__init__()
+        self.attnstore = attnstore
+        self.place_in_unet = place_in_unet
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size=1)
+        query = attn.to_q(hidden_states)
+
+        is_cross = encoder_hidden_states is not None
+        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        self.attnstore(attention_probs, is_cross, self.place_in_unet)
+
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+class AttentionControl(abc.ABC):
+    def step_callback(self, x_t):
+        return x_t
+
+    def between_steps(self):
+        return
+
+    # @property
+    # def num_uncond_att_layers(self):
+    #     return 0
+
+    @abc.abstractmethod
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        raise NotImplementedError
+
+    def __call__(self, attn, is_cross: bool, place_in_unet: str):
+        if self.cur_att_layer >= self.num_uncond_att_layers:
+            self.forward(attn, is_cross, place_in_unet)
+        self.cur_att_layer += 1
+        if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
+            self.cur_att_layer = 0
+            self.cur_step += 1
+            self.between_steps()
+
+    def reset(self):
+        self.cur_step = 0
+        self.cur_att_layer = 0
+
+    def __init__(self):
+        self.cur_step = 0
+        self.num_att_layers = -1
+        self.cur_att_layer = 0
+
+
+class AttentionStore(AttentionControl):
+    @staticmethod
+    def get_empty_store():
+        return {"down_cross": [], "mid_cross": [], "up_cross": [], "down_self": [], "mid_self": [], "up_self": []}
+
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
+        if attn.shape[1] <= 32**2:  # avoid memory overhead
+            self.step_store[key].append(attn)
+        return attn
+
+    def between_steps(self):
+        self.attention_store = self.step_store
+        if self.save_global_store:
+            with torch.no_grad():
+                if len(self.global_store) == 0:
+                    self.global_store = self.step_store
+                else:
+                    for key in self.global_store:
+                        for i in range(len(self.global_store[key])):
+                            self.global_store[key][i] += self.step_store[key][i].detach()
+        self.step_store = self.get_empty_store()
+        self.step_store = self.get_empty_store()
+
+    def get_average_attention(self):
+        average_attention = self.attention_store
+        return average_attention
+
+    def get_average_global_attention(self):
+        average_attention = {
+            key: [item / self.cur_step for item in self.global_store[key]] for key in self.attention_store
+        }
+        return average_attention
+
+    def reset(self):
+        super(AttentionStore, self).reset()
+        self.step_store = self.get_empty_store()
+        self.attention_store = {}
+        self.global_store = {}
+
+    def __init__(self, save_global_store=False):
+        """
+        Initialize an empty AttentionStore
+        :param step_index: used to visualize only a specific step in the diffusion process
+        """
+        super(AttentionStore, self).__init__()
+        self.save_global_store = save_global_store
+        self.step_store = self.get_empty_store()
+        self.attention_store = {}
+        self.global_store = {}
+        self.curr_step_index = 0
+        self.num_uncond_att_layers = 0
+
+
+def aggregate_attention(
+    attention_store: AttentionStore, res: int, from_where: List[str], is_cross: bool, select: int
+) -> torch.Tensor:
+    """Aggregates the attention across the different layers and heads at the specified resolution."""
+    out = []
+    attention_maps = attention_store.get_average_attention()
+
+    # for k, v in attention_maps.items():
+    #     for vv in v:
+    #         print(vv.shape)
+    # exit()
+
+    num_pixels = res**2
+    for location in from_where:
+        for item in attention_maps[f"{location}_{'cross' if is_cross else 'self'}"]:
+            if item.shape[1] == num_pixels:
+                cross_maps = item.reshape(1, -1, res, res, item.shape[-1])[select]
+                out.append(cross_maps)
+    out = torch.cat(out, dim=0)
+    out = out.sum(0) / out.shape[0]
+    return out
+
+
+def register_attention_control(model, controller):
+    attn_procs = {}
+    cross_att_count = 0
+    for name in model.unet.attn_processors.keys():
+        # cross_attention_dim = None if name.endswith("attn1.processor") else model.unet.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            # hidden_size = model.unet.config.block_out_channels[-1]
+            place_in_unet = "mid"
+        elif name.startswith("up_blocks"):
+            # block_id = int(name[len("up_blocks.")])
+            # hidden_size = list(reversed(model.unet.config.block_out_channels))[block_id]
+            place_in_unet = "up"
+        elif name.startswith("down_blocks"):
+            # block_id = int(name[len("down_blocks.")])
+            # hidden_size = model.unet.config.block_out_channels[block_id]
+            place_in_unet = "down"
+        else:
+            continue
+
+        cross_att_count += 1
+        attn_procs[name] = AttendExciteCrossAttnProcessor(attnstore=controller, place_in_unet=place_in_unet)
+    model.unet.set_attn_processor(attn_procs)
+    controller.num_att_layers = cross_att_count
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionBoxDiffPipeline(
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion with BoxDiff.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return text_inputs, prompt_embeds, negative_prompt_embeds
+
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        boxdiff_phrases,
+        boxdiff_boxes,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if boxdiff_phrases is not None or boxdiff_boxes is not None:
+            if not (boxdiff_phrases is not None and boxdiff_boxes is not None):
+                raise ValueError("Either both `boxdiff_phrases` and `boxdiff_boxes` must be passed or none of them.")
+
+            if not isinstance(boxdiff_phrases, list) or not isinstance(boxdiff_boxes, list):
+                raise ValueError("`boxdiff_phrases` and `boxdiff_boxes` must be lists.")
+
+            if len(boxdiff_phrases) != len(boxdiff_boxes):
+                raise ValueError(
+                    "`boxdiff_phrases` and `boxdiff_boxes` must have the same length,"
+                    f" got: `boxdiff_phrases` {len(boxdiff_phrases)} != `boxdiff_boxes`"
+                    f" {len(boxdiff_boxes)}."
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stages where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
+        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        if not hasattr(self, "unet"):
+            raise ValueError("The pipeline must have `unet` for using FreeU.")
+        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
+
+    def disable_freeu(self):
+        """Disables the FreeU mechanism if enabled."""
+        self.unet.disable_freeu()
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
+    def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        Args:
+            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
+            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
+        """
+        self.fusing_unet = False
+        self.fusing_vae = False
+
+        if unet:
+            self.fusing_unet = True
+            self.unet.fuse_qkv_projections()
+            self.unet.set_attn_processor(FusedAttnProcessor2_0())
+
+        if vae:
+            if not isinstance(self.vae, AutoencoderKL):
+                raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
+
+            self.fusing_vae = True
+            self.vae.fuse_qkv_projections()
+            self.vae.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
+    def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+        """Disable QKV projection fusion if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        Args:
+            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
+            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
+
+        """
+        if unet:
+            if not self.fusing_unet:
+                logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
+            else:
+                self.unet.unfuse_qkv_projections()
+                self.fusing_unet = False
+
+        if vae:
+            if not self.fusing_vae:
+                logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
+            else:
+                self.vae.unfuse_qkv_projections()
+                self.fusing_vae = False
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    def _compute_max_attention_per_index(
+        self,
+        attention_maps: torch.Tensor,
+        indices_to_alter: List[int],
+        smooth_attentions: bool = False,
+        sigma: float = 0.5,
+        kernel_size: int = 3,
+        normalize_eot: bool = False,
+        bboxes: List[int] = None,
+        L: int = 1,
+        P: float = 0.2,
+    ) -> List[torch.Tensor]:
+        """Computes the maximum attention value for each of the tokens we wish to alter."""
+        last_idx = -1
+        if normalize_eot:
+            prompt = self.prompt
+            if isinstance(self.prompt, list):
+                prompt = self.prompt[0]
+            last_idx = len(self.tokenizer(prompt)["input_ids"]) - 1
+        attention_for_text = attention_maps[:, :, 1:last_idx]
+        attention_for_text *= 100
+        attention_for_text = torch.nn.functional.softmax(attention_for_text, dim=-1)
+
+        # Shift indices since we removed the first token "1:last_idx"
+        indices_to_alter = [index - 1 for index in indices_to_alter]
+
+        # Extract the maximum values
+        max_indices_list_fg = []
+        max_indices_list_bg = []
+        dist_x = []
+        dist_y = []
+
+        cnt = 0
+        for i in indices_to_alter:
+            image = attention_for_text[:, :, i]
+
+            # TODO
+            # box = [max(round(b / (512 / image.shape[0])), 0) for b in bboxes[cnt]]
+            # x1, y1, x2, y2 = box
+            H, W = image.shape
+            x1 = min(max(round(bboxes[cnt][0] * W), 0), W)
+            y1 = min(max(round(bboxes[cnt][1] * H), 0), H)
+            x2 = min(max(round(bboxes[cnt][2] * W), 0), W)
+            y2 = min(max(round(bboxes[cnt][3] * H), 0), H)
+            box = [x1, y1, x2, y2]
+            cnt += 1
+
+            # coordinates to masks
+            obj_mask = torch.zeros_like(image)
+            ones_mask = torch.ones([y2 - y1, x2 - x1], dtype=obj_mask.dtype).to(obj_mask.device)
+            obj_mask[y1:y2, x1:x2] = ones_mask
+            bg_mask = 1 - obj_mask
+
+            if smooth_attentions:
+                smoothing = GaussianSmoothing(channels=1, kernel_size=kernel_size, sigma=sigma, dim=2).to(image.device)
+                input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode="reflect")
+                image = smoothing(input).squeeze(0).squeeze(0)
+
+            # Inner-Box constraint
+            k = (obj_mask.sum() * P).long()
+            max_indices_list_fg.append((image * obj_mask).reshape(-1).topk(k)[0].mean())
+
+            # Outer-Box constraint
+            k = (bg_mask.sum() * P).long()
+            max_indices_list_bg.append((image * bg_mask).reshape(-1).topk(k)[0].mean())
+
+            # Corner Constraint
+            gt_proj_x = torch.max(obj_mask, dim=0)[0]
+            gt_proj_y = torch.max(obj_mask, dim=1)[0]
+            corner_mask_x = torch.zeros_like(gt_proj_x)
+            corner_mask_y = torch.zeros_like(gt_proj_y)
+
+            # create gt according to the number config.L
+            N = gt_proj_x.shape[0]
+            corner_mask_x[max(box[0] - L, 0) : min(box[0] + L + 1, N)] = 1.0
+            corner_mask_x[max(box[2] - L, 0) : min(box[2] + L + 1, N)] = 1.0
+            corner_mask_y[max(box[1] - L, 0) : min(box[1] + L + 1, N)] = 1.0
+            corner_mask_y[max(box[3] - L, 0) : min(box[3] + L + 1, N)] = 1.0
+            dist_x.append((F.l1_loss(image.max(dim=0)[0], gt_proj_x, reduction="none") * corner_mask_x).mean())
+            dist_y.append((F.l1_loss(image.max(dim=1)[0], gt_proj_y, reduction="none") * corner_mask_y).mean())
+
+        return max_indices_list_fg, max_indices_list_bg, dist_x, dist_y
+
+    def _aggregate_and_get_max_attention_per_token(
+        self,
+        attention_store: AttentionStore,
+        indices_to_alter: List[int],
+        attention_res: int = 16,
+        smooth_attentions: bool = False,
+        sigma: float = 0.5,
+        kernel_size: int = 3,
+        normalize_eot: bool = False,
+        bboxes: List[int] = None,
+        L: int = 1,
+        P: float = 0.2,
+    ):
+        """Aggregates the attention for each token and computes the max activation value for each token to alter."""
+        attention_maps = aggregate_attention(
+            attention_store=attention_store,
+            res=attention_res,
+            from_where=("up", "down", "mid"),
+            is_cross=True,
+            select=0,
+        )
+        max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y = self._compute_max_attention_per_index(
+            attention_maps=attention_maps,
+            indices_to_alter=indices_to_alter,
+            smooth_attentions=smooth_attentions,
+            sigma=sigma,
+            kernel_size=kernel_size,
+            normalize_eot=normalize_eot,
+            bboxes=bboxes,
+            L=L,
+            P=P,
+        )
+        return max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y
+
+    @staticmethod
+    def _compute_loss(
+        max_attention_per_index_fg: List[torch.Tensor],
+        max_attention_per_index_bg: List[torch.Tensor],
+        dist_x: List[torch.Tensor],
+        dist_y: List[torch.Tensor],
+        return_losses: bool = False,
+    ) -> torch.Tensor:
+        """Computes the attend-and-excite loss using the maximum attention value for each token."""
+        losses_fg = [max(0, 1.0 - curr_max) for curr_max in max_attention_per_index_fg]
+        losses_bg = [max(0, curr_max) for curr_max in max_attention_per_index_bg]
+        loss = sum(losses_fg) + sum(losses_bg) + sum(dist_x) + sum(dist_y)
+        if return_losses:
+            return max(losses_fg), losses_fg
+        else:
+            return max(losses_fg), loss
+
+    @staticmethod
+    def _update_latent(latents: torch.Tensor, loss: torch.Tensor, step_size: float) -> torch.Tensor:
+        """Update the latent according to the computed loss."""
+        grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents], retain_graph=True)[0]
+        latents = latents - step_size * grad_cond
+        return latents
+
+    def _perform_iterative_refinement_step(
+        self,
+        latents: torch.Tensor,
+        indices_to_alter: List[int],
+        loss_fg: torch.Tensor,
+        threshold: float,
+        text_embeddings: torch.Tensor,
+        text_input,
+        attention_store: AttentionStore,
+        step_size: float,
+        t: int,
+        attention_res: int = 16,
+        smooth_attentions: bool = True,
+        sigma: float = 0.5,
+        kernel_size: int = 3,
+        max_refinement_steps: int = 20,
+        normalize_eot: bool = False,
+        bboxes: List[int] = None,
+        L: int = 1,
+        P: float = 0.2,
+    ):
+        """
+        Performs the iterative latent refinement introduced in the paper. Here, we continuously update the latent
+        code according to our loss objective until the given threshold is reached for all tokens.
+        """
+        iteration = 0
+        target_loss = max(0, 1.0 - threshold)
+
+        while loss_fg > target_loss:
+            iteration += 1
+
+            latents = latents.clone().detach().requires_grad_(True)
+            # noise_pred_text = self.unet(latents, t, encoder_hidden_states=text_embeddings[1].unsqueeze(0)).sample
+            self.unet.zero_grad()
+
+            # Get max activation value for each subject token
+            (
+                max_attention_per_index_fg,
+                max_attention_per_index_bg,
+                dist_x,
+                dist_y,
+            ) = self._aggregate_and_get_max_attention_per_token(
+                attention_store=attention_store,
+                indices_to_alter=indices_to_alter,
+                attention_res=attention_res,
+                smooth_attentions=smooth_attentions,
+                sigma=sigma,
+                kernel_size=kernel_size,
+                normalize_eot=normalize_eot,
+                bboxes=bboxes,
+                L=L,
+                P=P,
+            )
+
+            loss_fg, losses_fg = self._compute_loss(
+                max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y, return_losses=True
+            )
+
+            if loss_fg != 0:
+                latents = self._update_latent(latents, loss_fg, step_size)
+
+            # with torch.no_grad():
+            #     noise_pred_uncond = self.unet(latents, t, encoder_hidden_states=text_embeddings[0].unsqueeze(0)).sample
+            #     noise_pred_text = self.unet(latents, t, encoder_hidden_states=text_embeddings[1].unsqueeze(0)).sample
+
+            # try:
+            #     low_token = np.argmax([l.item() if not isinstance(l, int) else l for l in losses_fg])
+            # except Exception as e:
+            #     print(e)  # catch edge case :)
+            #     low_token = np.argmax(losses_fg)
+
+            # low_word = self.tokenizer.decode(text_input.input_ids[0][indices_to_alter[low_token]])
+            # print(f'\t Try {iteration}. {low_word} has a max attention of {max_attention_per_index_fg[low_token]}')
+
+            if iteration >= max_refinement_steps:
+                # print(f'\t Exceeded max number of iterations ({max_refinement_steps})! '
+                #       f'Finished with a max attention of {max_attention_per_index_fg[low_token]}')
+                break
+
+        # Run one more time but don't compute gradients and update the latents.
+        # We just need to compute the new loss - the grad update will occur below
+        latents = latents.clone().detach().requires_grad_(True)
+        # noise_pred_text = self.unet(latents, t, encoder_hidden_states=text_embeddings[1].unsqueeze(0)).sample
+        self.unet.zero_grad()
+
+        # Get max activation value for each subject token
+        (
+            max_attention_per_index_fg,
+            max_attention_per_index_bg,
+            dist_x,
+            dist_y,
+        ) = self._aggregate_and_get_max_attention_per_token(
+            attention_store=attention_store,
+            indices_to_alter=indices_to_alter,
+            attention_res=attention_res,
+            smooth_attentions=smooth_attentions,
+            sigma=sigma,
+            kernel_size=kernel_size,
+            normalize_eot=normalize_eot,
+            bboxes=bboxes,
+            L=L,
+            P=P,
+        )
+        loss_fg, losses_fg = self._compute_loss(
+            max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y, return_losses=True
+        )
+        # print(f"\t Finished with loss of: {loss_fg}")
+        return loss_fg, latents, max_attention_per_index_fg
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        boxdiff_phrases: List[str] = None,
+        boxdiff_boxes: List[List[float]] = None,  # TODO
+        boxdiff_kwargs: Optional[Dict[str, Any]] = {
+            "attention_res": 16,
+            "P": 0.2,
+            "L": 1,
+            "max_iter_to_alter": 25,
+            "loss_thresholds": {0: 0.05, 10: 0.5, 20: 0.8},
+            "scale_factor": 20,
+            "scale_range": (1.0, 0.5),
+            "smooth_attentions": True,
+            "sigma": 0.5,
+            "kernel_size": 3,
+            "refine": False,
+            "normalize_eot": True,
+        },
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+
+            boxdiff_attention_res (`int`, *optional*, defaults to 16):
+                The resolution of the attention maps used for computing the BoxDiff loss.
+            boxdiff_P (`float`, *optional*, defaults to 0.2):
+
+            boxdiff_L (`int`, *optional*, defaults to 1):
+                The number of pixels around the corner to be selected in BoxDiff loss.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        # -1. Register attention control (for BoxDiff)
+        attention_store = AttentionStore()
+        register_attention_control(self, attention_store)
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            boxdiff_phrases,
+            boxdiff_boxes,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+        self.prompt = prompt
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        text_inputs, prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
+            )
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 6.3 Prepare BoxDiff inputs
+        # a) Indices to alter
+        input_ids = self.tokenizer(prompt)["input_ids"]
+        decoded = [self.tokenizer.decode([t]) for t in input_ids]
+        indices_to_alter = []
+        bboxes = []
+        for phrase, box in zip(boxdiff_phrases, boxdiff_boxes):
+            # it could happen that phrase does not correspond a single token?
+            if phrase not in decoded:
+                continue
+            indices_to_alter.append(decoded.index(phrase))
+            bboxes.append(box)
+
+        # b) A bunch of hyperparameters
+        attention_res = boxdiff_kwargs.get("attention_res", 16)
+        smooth_attentions = boxdiff_kwargs.get("smooth_attentions", True)
+        sigma = boxdiff_kwargs.get("sigma", 0.5)
+        kernel_size = boxdiff_kwargs.get("kernel_size", 3)
+        L = boxdiff_kwargs.get("L", 1)
+        P = boxdiff_kwargs.get("P", 0.2)
+        thresholds = boxdiff_kwargs.get("loss_thresholds", {0: 0.05, 10: 0.5, 20: 0.8})
+        max_iter_to_alter = boxdiff_kwargs.get("max_iter_to_alter", len(self.scheduler.timesteps) + 1)
+        scale_factor = boxdiff_kwargs.get("scale_factor", 20)
+        refine = boxdiff_kwargs.get("refine", False)
+        normalize_eot = boxdiff_kwargs.get("normalize_eot", True)
+
+        scale_range = boxdiff_kwargs.get("scale_range", (1.0, 0.5))
+        scale_range = np.linspace(scale_range[0], scale_range[1], len(self.scheduler.timesteps))
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # BoxDiff optimization
+                with torch.enable_grad():
+                    latents = latents.clone().detach().requires_grad_(True)
+
+                    # Forward pass of denoising with text conditioning
+                    noise_pred_text = self.unet(
+                        latents,
+                        t,
+                        encoder_hidden_states=prompt_embeds[1].unsqueeze(0),
+                        cross_attention_kwargs=cross_attention_kwargs,
+                    ).sample
+                    self.unet.zero_grad()
+
+                    # Get max activation value for each subject token
+                    (
+                        max_attention_per_index_fg,
+                        max_attention_per_index_bg,
+                        dist_x,
+                        dist_y,
+                    ) = self._aggregate_and_get_max_attention_per_token(
+                        attention_store=attention_store,
+                        indices_to_alter=indices_to_alter,
+                        attention_res=attention_res,
+                        smooth_attentions=smooth_attentions,
+                        sigma=sigma,
+                        kernel_size=kernel_size,
+                        normalize_eot=normalize_eot,
+                        bboxes=bboxes,
+                        L=L,
+                        P=P,
+                    )
+
+                    loss_fg, loss = self._compute_loss(
+                        max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y
+                    )
+
+                    # Refinement from attend-and-excite (not necessary)
+                    if refine and i in thresholds.keys() and loss_fg > 1.0 - thresholds[i]:
+                        del noise_pred_text
+                        torch.cuda.empty_cache()
+                        loss_fg, latents, max_attention_per_index_fg = self._perform_iterative_refinement_step(
+                            latents=latents,
+                            indices_to_alter=indices_to_alter,
+                            loss_fg=loss_fg,
+                            threshold=thresholds[i],
+                            text_embeddings=prompt_embeds,
+                            text_input=text_inputs,
+                            attention_store=attention_store,
+                            step_size=scale_factor * np.sqrt(scale_range[i]),
+                            t=t,
+                            attention_res=attention_res,
+                            smooth_attentions=smooth_attentions,
+                            sigma=sigma,
+                            kernel_size=kernel_size,
+                            normalize_eot=normalize_eot,
+                            bboxes=bboxes,
+                            L=L,
+                            P=P,
+                        )
+
+                    # Perform gradient update
+                    if i < max_iter_to_alter:
+                        _, loss = self._compute_loss(
+                            max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y
+                        )
+                        if loss != 0:
+                            latents = self._update_latent(
+                                latents=latents, loss=loss, step_size=scale_factor * np.sqrt(scale_range[i])
+                            )
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/pipeline_stable_diffusion_pag.py b/diffusers/examples/community/pipeline_stable_diffusion_pag.py
new file mode 100644
index 0000000000000000000000000000000000000000..cea2c9735747e51c2f415e18e51aaefff2712ca3
--- /dev/null
+++ b/diffusers/examples/community/pipeline_stable_diffusion_pag.py
@@ -0,0 +1,1476 @@
+# Implementation of StableDiffusionPipeline with PAG
+# https://ku-cvlab.github.io/Perturbed-Attention-Guidance
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from diffusers.models.attention_processor import Attention, AttnProcessor2_0, FusedAttnProcessor2_0
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionPipeline
+        >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+class PAGIdentitySelfAttnProcessor:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        # chunk
+        hidden_states_org, hidden_states_ptb = hidden_states.chunk(2)
+
+        # original path
+        batch_size, sequence_length, _ = hidden_states_org.shape
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states_org = attn.group_norm(hidden_states_org.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states_org)
+        key = attn.to_k(hidden_states_org)
+        value = attn.to_v(hidden_states_org)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states_org = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states_org = hidden_states_org.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states_org = hidden_states_org.to(query.dtype)
+
+        # linear proj
+        hidden_states_org = attn.to_out[0](hidden_states_org)
+        # dropout
+        hidden_states_org = attn.to_out[1](hidden_states_org)
+
+        if input_ndim == 4:
+            hidden_states_org = hidden_states_org.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # perturbed path (identity attention)
+        batch_size, sequence_length, _ = hidden_states_ptb.shape
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)
+
+        value = attn.to_v(hidden_states_ptb)
+
+        # hidden_states_ptb = torch.zeros(value.shape).to(value.get_device())
+        hidden_states_ptb = value
+
+        hidden_states_ptb = hidden_states_ptb.to(query.dtype)
+
+        # linear proj
+        hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
+        # dropout
+        hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
+
+        if input_ndim == 4:
+            hidden_states_ptb = hidden_states_ptb.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # cat
+        hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class PAGCFGIdentitySelfAttnProcessor:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        # chunk
+        hidden_states_uncond, hidden_states_org, hidden_states_ptb = hidden_states.chunk(3)
+        hidden_states_org = torch.cat([hidden_states_uncond, hidden_states_org])
+
+        # original path
+        batch_size, sequence_length, _ = hidden_states_org.shape
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states_org = attn.group_norm(hidden_states_org.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states_org)
+        key = attn.to_k(hidden_states_org)
+        value = attn.to_v(hidden_states_org)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states_org = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states_org = hidden_states_org.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states_org = hidden_states_org.to(query.dtype)
+
+        # linear proj
+        hidden_states_org = attn.to_out[0](hidden_states_org)
+        # dropout
+        hidden_states_org = attn.to_out[1](hidden_states_org)
+
+        if input_ndim == 4:
+            hidden_states_org = hidden_states_org.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # perturbed path (identity attention)
+        batch_size, sequence_length, _ = hidden_states_ptb.shape
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)
+
+        value = attn.to_v(hidden_states_ptb)
+        hidden_states_ptb = value
+        hidden_states_ptb = hidden_states_ptb.to(query.dtype)
+
+        # linear proj
+        hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
+        # dropout
+        hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
+
+        if input_ndim == 4:
+            hidden_states_ptb = hidden_states_ptb.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # cat
+        hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionPAGPipeline(
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if self.do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            image_embeds = ip_adapter_image_embeds
+        return image_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+        The suffixes after the scaling factors represent the stages where they are being applied.
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
+        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        if not hasattr(self, "unet"):
+            raise ValueError("The pipeline must have `unet` for using FreeU.")
+        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
+
+    def disable_freeu(self):
+        """Disables the FreeU mechanism if enabled."""
+        self.unet.disable_freeu()
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
+    def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+        <Tip warning={true}>
+        This API is 🧪 experimental.
+        </Tip>
+        Args:
+            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
+            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
+        """
+        self.fusing_unet = False
+        self.fusing_vae = False
+
+        if unet:
+            self.fusing_unet = True
+            self.unet.fuse_qkv_projections()
+            self.unet.set_attn_processor(FusedAttnProcessor2_0())
+
+        if vae:
+            if not isinstance(self.vae, AutoencoderKL):
+                raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
+
+            self.fusing_vae = True
+            self.vae.fuse_qkv_projections()
+            self.vae.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
+    def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+        """Disable QKV projection fusion if enabled.
+        <Tip warning={true}>
+        This API is 🧪 experimental.
+        </Tip>
+        Args:
+            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
+            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
+        """
+        if unet:
+            if not self.fusing_unet:
+                logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
+            else:
+                self.unet.unfuse_qkv_projections()
+                self.fusing_unet = False
+
+        if vae:
+            if not self.fusing_vae:
+                logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
+            else:
+                self.vae.unfuse_qkv_projections()
+                self.fusing_vae = False
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    def pred_z0(self, sample, model_output, timestep):
+        alpha_prod_t = self.scheduler.alphas_cumprod[timestep].to(sample.device)
+
+        beta_prod_t = 1 - alpha_prod_t
+        if self.scheduler.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.scheduler.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.scheduler.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            # predict V
+            model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`,"
+                " or `v_prediction`"
+            )
+
+        return pred_original_sample
+
+    def pred_x0(self, latents, noise_pred, t, generator, device, prompt_embeds, output_type):
+        pred_z0 = self.pred_z0(latents, noise_pred, t)
+        pred_x0 = self.vae.decode(pred_z0 / self.vae.config.scaling_factor, return_dict=False, generator=generator)[0]
+        pred_x0, ____ = self.run_safety_checker(pred_x0, device, prompt_embeds.dtype)
+        do_denormalize = [True] * pred_x0.shape[0]
+        pred_x0 = self.image_processor.postprocess(pred_x0, output_type=output_type, do_denormalize=do_denormalize)
+
+        return pred_x0
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @property
+    def pag_scale(self):
+        return self._pag_scale
+
+    @property
+    def do_perturbed_attention_guidance(self):
+        return self._pag_scale > 0
+
+    @property
+    def pag_adaptive_scaling(self):
+        return self._pag_adaptive_scaling
+
+    @property
+    def do_pag_adaptive_scaling(self):
+        return self._pag_adaptive_scaling > 0
+
+    @property
+    def pag_applied_layers_index(self):
+        return self._pag_applied_layers_index
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.5,
+        pag_scale: float = 0.0,
+        pag_adaptive_scaling: float = 0.0,
+        pag_applied_layers_index: List[str] = ["d4"],  # ['d4', 'd5', 'm0']
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+        Examples:
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scaling = pag_adaptive_scaling
+        self._pag_applied_layers_index = pag_applied_layers_index
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+
+        # cfg
+        if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+        # pag
+        elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
+            prompt_embeds = torch.cat([prompt_embeds, prompt_embeds])
+        # both
+        elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt
+            )
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
+            else None
+        )
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Denoising loop
+        if self.do_perturbed_attention_guidance:
+            down_layers = []
+            mid_layers = []
+            up_layers = []
+            for name, module in self.unet.named_modules():
+                if "attn1" in name and "to" not in name:
+                    layer_type = name.split(".")[0].split("_")[0]
+                    if layer_type == "down":
+                        down_layers.append(module)
+                    elif layer_type == "mid":
+                        mid_layers.append(module)
+                    elif layer_type == "up":
+                        up_layers.append(module)
+                    else:
+                        raise ValueError(f"Invalid layer type: {layer_type}")
+
+        # change attention layer in UNet if use PAG
+        if self.do_perturbed_attention_guidance:
+            if self.do_classifier_free_guidance:
+                replace_processor = PAGCFGIdentitySelfAttnProcessor()
+            else:
+                replace_processor = PAGIdentitySelfAttnProcessor()
+
+            drop_layers = self.pag_applied_layers_index
+            for drop_layer in drop_layers:
+                try:
+                    if drop_layer[0] == "d":
+                        down_layers[int(drop_layer[1])].processor = replace_processor
+                    elif drop_layer[0] == "m":
+                        mid_layers[int(drop_layer[1])].processor = replace_processor
+                    elif drop_layer[0] == "u":
+                        up_layers[int(drop_layer[1])].processor = replace_processor
+                    else:
+                        raise ValueError(f"Invalid layer type: {drop_layer[0]}")
+                except IndexError:
+                    raise ValueError(
+                        f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
+                    )
+
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # cfg
+                if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
+                    latent_model_input = torch.cat([latents] * 2)
+                # pag
+                elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
+                    latent_model_input = torch.cat([latents] * 2)
+                # both
+                elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
+                    latent_model_input = torch.cat([latents] * 3)
+                # no
+                else:
+                    latent_model_input = latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+
+                # cfg
+                if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+
+                    delta = noise_pred_text - noise_pred_uncond
+                    noise_pred = noise_pred_uncond + self.guidance_scale * delta
+
+                # pag
+                elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
+                    noise_pred_original, noise_pred_perturb = noise_pred.chunk(2)
+
+                    signal_scale = self.pag_scale
+                    if self.do_pag_adaptive_scaling:
+                        signal_scale = self.pag_scale - self.pag_adaptive_scaling * (1000 - t)
+                        if signal_scale < 0:
+                            signal_scale = 0
+
+                    noise_pred = noise_pred_original + signal_scale * (noise_pred_original - noise_pred_perturb)
+
+                # both
+                elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
+                    noise_pred_uncond, noise_pred_text, noise_pred_text_perturb = noise_pred.chunk(3)
+
+                    signal_scale = self.pag_scale
+                    if self.do_pag_adaptive_scaling:
+                        signal_scale = self.pag_scale - self.pag_adaptive_scaling * (1000 - t)
+                        if signal_scale < 0:
+                            signal_scale = 0
+
+                    noise_pred = (
+                        noise_pred_text
+                        + (self.guidance_scale - 1.0) * (noise_pred_text - noise_pred_uncond)
+                        + signal_scale * (noise_pred_text - noise_pred_text_perturb)
+                    )
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        # change attention layer in UNet if use PAG
+        if self.do_perturbed_attention_guidance:
+            drop_layers = self.pag_applied_layers_index
+            for drop_layer in drop_layers:
+                try:
+                    if drop_layer[0] == "d":
+                        down_layers[int(drop_layer[1])].processor = AttnProcessor2_0()
+                    elif drop_layer[0] == "m":
+                        mid_layers[int(drop_layer[1])].processor = AttnProcessor2_0()
+                    elif drop_layer[0] == "u":
+                        up_layers[int(drop_layer[1])].processor = AttnProcessor2_0()
+                    else:
+                        raise ValueError(f"Invalid layer type: {drop_layer[0]}")
+                except IndexError:
+                    raise ValueError(
+                        f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
+                    )
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py b/diffusers/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ac651a1fe609ee474046ab7421e9184bc29ee13
--- /dev/null
+++ b/diffusers/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py
@@ -0,0 +1,772 @@
+# Copyright 2024 The Intel Labs Team Authors and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import DiffusionPipeline
+from diffusers.image_processor import PipelineDepthInput, PipelineImageInput, VaeImageProcessorLDM3D
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from diffusers.pipelines.stable_diffusion_ldm3d.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
+from diffusers.schedulers import DDPMScheduler, KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> from diffusers import StableDiffusionUpscaleLDM3DPipeline
+        >>> from PIL import Image
+        >>> from io import BytesIO
+        >>> import requests
+
+        >>> pipe = StableDiffusionUpscaleLDM3DPipeline.from_pretrained("Intel/ldm3d-sr")
+        >>> pipe = pipe.to("cuda")
+        >>> rgb_path = "https://huggingface.co/Intel/ldm3d-sr/resolve/main/lemons_ldm3d_rgb.jpg"
+        >>> depth_path = "https://huggingface.co/Intel/ldm3d-sr/resolve/main/lemons_ldm3d_depth.png"
+        >>> low_res_rgb = Image.open(BytesIO(requests.get(rgb_path).content)).convert("RGB")
+        >>> low_res_depth = Image.open(BytesIO(requests.get(depth_path).content)).convert("L")
+        >>> output = pipe(
+        ...     prompt="high quality high resolution uhd 4k image",
+        ...     rgb=low_res_rgb,
+        ...     depth=low_res_depth,
+        ...     num_inference_steps=50,
+        ...     target_res=[1024, 1024],
+        ... )
+        >>> rgb_image, depth_image = output.rgb, output.depth
+        >>> rgb_image[0].save("hr_ldm3d_rgb.jpg")
+        >>> depth_image[0].save("hr_ldm3d_depth.png")
+        ```
+"""
+
+
+class StableDiffusionUpscaleLDM3DPipeline(
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, FromSingleFileMixin
+):
+    r"""
+    Pipeline for text-to-image and 3D generation using LDM3D.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        low_res_scheduler ([`SchedulerMixin`]):
+            A scheduler used to add initial noise to the low resolution conditioning image. It must be an instance of
+            [`DDPMScheduler`].
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        low_res_scheduler: DDPMScheduler,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+        watermarker: Optional[Any] = None,
+        max_noise_level: int = 350,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            watermarker=watermarker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor, resample="bilinear")
+        # self.register_to_config(requires_safety_checker=requires_safety_checker)
+        self.register_to_config(max_noise_level=max_noise_level)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d.StableDiffusionLDM3DPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d.StableDiffusionLDM3DPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            rgb_feature_extractor_input = feature_extractor_input[0]
+            safety_checker_input = self.feature_extractor(rgb_feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        noise_level,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        target_res=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, np.ndarray)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `np.ndarray`, `PIL.Image.Image` or `list` but is {type(image)}"
+            )
+
+        # verify batch size of prompt and image are same if image is a list or tensor or numpy array
+        if isinstance(image, (list, np.ndarray, torch.Tensor)):
+            if prompt is not None and isinstance(prompt, str):
+                batch_size = 1
+            elif prompt is not None and isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                batch_size = prompt_embeds.shape[0]
+
+            if isinstance(image, list):
+                image_batch_size = len(image)
+            else:
+                image_batch_size = image.shape[0]
+            if batch_size != image_batch_size:
+                raise ValueError(
+                    f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
+                    " Please make sure that passed `prompt` matches the batch size of `image`."
+                )
+
+        # check noise level
+        if noise_level > self.config.max_noise_level:
+            raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height, width)
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # def upcast_vae(self):
+    #     dtype = self.vae.dtype
+    #     self.vae.to(dtype=torch.float32)
+    #     use_torch_2_0_or_xformers = isinstance(
+    #         self.vae.decoder.mid_block.attentions[0].processor,
+    #         (
+    #             AttnProcessor2_0,
+    #             XFormersAttnProcessor,
+    #             LoRAXFormersAttnProcessor,
+    #             LoRAAttnProcessor2_0,
+    #         ),
+    #     )
+    #     # if xformers or torch_2_0 is used attention block does not need
+    #     # to be in float32 which can save lots of memory
+    #     if use_torch_2_0_or_xformers:
+    #         self.vae.post_quant_conv.to(dtype)
+    #         self.vae.decoder.conv_in.to(dtype)
+    #         self.vae.decoder.mid_block.to(dtype)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        rgb: PipelineImageInput = None,
+        depth: PipelineDepthInput = None,
+        num_inference_steps: int = 75,
+        guidance_scale: float = 9.0,
+        noise_level: int = 20,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        target_res: Optional[List[int]] = [1024, 1024],
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image` or tensor representing an image batch to be upscaled.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            rgb,
+            noise_level,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Preprocess image
+        rgb, depth = self.image_processor.preprocess(rgb, depth, target_res=target_res)
+        rgb = rgb.to(dtype=prompt_embeds.dtype, device=device)
+        depth = depth.to(dtype=prompt_embeds.dtype, device=device)
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 6. Encode low resolutiom image to latent space
+        image = torch.cat([rgb, depth], axis=1)
+        latent_space_image = self.vae.encode(image).latent_dist.sample(generator)
+        latent_space_image *= self.vae.scaling_factor
+        noise_level = torch.tensor([noise_level], dtype=torch.long, device=device)
+        # noise_rgb = randn_tensor(rgb.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
+        # rgb = self.low_res_scheduler.add_noise(rgb, noise_rgb, noise_level)
+        # noise_depth = randn_tensor(depth.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
+        # depth = self.low_res_scheduler.add_noise(depth, noise_depth, noise_level)
+
+        batch_multiplier = 2 if do_classifier_free_guidance else 1
+        latent_space_image = torch.cat([latent_space_image] * batch_multiplier * num_images_per_prompt)
+        noise_level = torch.cat([noise_level] * latent_space_image.shape[0])
+
+        # 7. Prepare latent variables
+        height, width = latent_space_image.shape[2:]
+        num_channels_latents = self.vae.config.latent_channels
+
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 8. Check that sizes of image and latents match
+        num_channels_image = latent_space_image.shape[1]
+        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_image`: {num_channels_image} "
+                f" = {num_channels_latents+num_channels_image}. Please verify the config of"
+                " `pipeline.unet` or your `image` input."
+            )
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 10. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                latent_model_input = torch.cat([latent_model_input, latent_space_image], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=noise_level,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        rgb, depth = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # 11. Apply watermark
+        if output_type == "pil" and self.watermarker is not None:
+            rgb = self.watermarker.apply_watermark(rgb)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return ((rgb, depth), has_nsfw_concept)
+
+        return LDM3DPipelineOutput(rgb=rgb, depth=depth, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py b/diffusers/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae495979f3664cc5fceff0c07df2b18978db3ecc
--- /dev/null
+++ b/diffusers/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
@@ -0,0 +1,1401 @@
+# Copyright 2024 TencentARC and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ControlNetModel, MultiAdapter, T2IAdapter, UNet2DConditionModel
+from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import T2IAdapter, StableDiffusionXLAdapterPipeline, DDPMScheduler
+        >>> from diffusers.utils import load_image
+        >>> from controlnet_aux.midas import MidasDetector
+
+        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+        >>> image = load_image(img_url).resize((1024, 1024))
+        >>> mask_image = load_image(mask_url).resize((1024, 1024))
+
+        >>> midas_depth = MidasDetector.from_pretrained(
+        ...    "valhalla/t2iadapter-aux-models", filename="dpt_large_384.pt", model_type="dpt_large"
+        ... ).to("cuda")
+
+        >>> depth_image = midas_depth(
+        ...    image, detect_resolution=512, image_resolution=1024
+        ... )
+
+        >>> model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+
+        >>> adapter = T2IAdapter.from_pretrained(
+        ...     "Adapter/t2iadapter",
+        ...     subfolder="sketch_sdxl_1.0",
+        ...     torch_dtype=torch.float16,
+        ...     adapter_type="full_adapter_xl",
+        ... )
+
+        >>> controlnet = ControlNetModel.from_pretrained(
+        ...    "diffusers/controlnet-depth-sdxl-1.0",
+        ...    torch_dtype=torch.float16,
+        ...    variant="fp16",
+        ...    use_safetensors=True
+        ... ).to("cuda")
+
+        >>> scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")
+
+        >>> pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
+        ...     model_id,
+        ...     adapter=adapter,
+        ...     controlnet=controlnet,
+        ...     torch_dtype=torch.float16,
+        ...     variant="fp16",
+        ...     scheduler=scheduler
+        ... ).to("cuda")
+
+        >>> strength = 0.5
+
+        >>> generator = torch.manual_seed(42)
+        >>> sketch_image_out = pipe(
+        ...     prompt="a photo of a tiger sitting on a park bench",
+        ...     negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality",
+        ...     adapter_image=depth_image,
+        ...     control_image=mask_image,
+        ...     adapter_conditioning_scale=strength,
+        ...     controlnet_conditioning_scale=strength,
+        ...     generator=generator,
+        ...     guidance_scale=7.5,
+        ... ).images[0]
+        ```
+"""
+
+
+def _preprocess_adapter_image(image, height, width):
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])) for i in image]
+        image = [
+            i[None, ..., None] if i.ndim == 2 else i[None, ...] for i in image
+        ]  # expand [h, w] or [h, w, c] to [b, h, w, c]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        if image[0].ndim == 3:
+            image = torch.stack(image, dim=0)
+        elif image[0].ndim == 4:
+            image = torch.cat(image, dim=0)
+        else:
+            raise ValueError(
+                f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
+            )
+    return image
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class StableDiffusionXLControlNetAdapterPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
+    https://arxiv.org/abs/2302.08453
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        adapter ([`T2IAdapter`] or [`MultiAdapter`] or `List[T2IAdapter]`):
+            Provides additional conditioning to the unet during the denoising process. If you set multiple Adapter as a
+            list, the outputs from each Adapter are added together to create one combined additional conditioning.
+        adapter_weights (`List[float]`, *optional*, defaults to None):
+            List of floats representing the weight which will be multiply to each adapter's output before adding them
+            together.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+    _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]],
+        controlnet: Union[ControlNetModel, MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        force_zeros_for_empty_prompt: bool = True,
+    ):
+        super().__init__()
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            adapter=adapter,
+            controlnet=controlnet,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        self.default_sample_size = self.unet.config.sample_size
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+    def check_conditions(
+        self,
+        prompt,
+        prompt_embeds,
+        adapter_image,
+        control_image,
+        adapter_conditioning_scale,
+        controlnet_conditioning_scale,
+        control_guidance_start,
+        control_guidance_end,
+    ):
+        # controlnet checks
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        # Check controlnet `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(control_image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(control_image, list):
+                raise TypeError("For multiple controlnets: `control_image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in control_image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(control_image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(control_image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in control_image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        # adapter checks
+        if isinstance(self.adapter, T2IAdapter) or is_compiled and isinstance(self.adapter._orig_mod, T2IAdapter):
+            self.check_image(adapter_image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.adapter, MultiAdapter) or is_compiled and isinstance(self.adapter._orig_mod, MultiAdapter)
+        ):
+            if not isinstance(adapter_image, list):
+                raise TypeError("For multiple adapters: `adapter_image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in adapter_image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(adapter_image) != len(self.adapter.adapters):
+                raise ValueError(
+                    f"For multiple adapters: `image` must have the same length as the number of adapters, but got {len(adapter_image)} images and {len(self.adapters.nets)} Adapters."
+                )
+
+            for image_ in adapter_image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `adapter_conditioning_scale`
+        if isinstance(self.adapter, T2IAdapter) or is_compiled and isinstance(self.adapter._orig_mod, T2IAdapter):
+            if not isinstance(adapter_conditioning_scale, float):
+                raise TypeError("For single adapter: `adapter_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.adapter, MultiAdapter) or is_compiled and isinstance(self.adapter._orig_mod, MultiAdapter)
+        ):
+            if isinstance(adapter_conditioning_scale, list):
+                if any(isinstance(i, list) for i in adapter_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(adapter_conditioning_scale, list) and len(adapter_conditioning_scale) != len(
+                self.adapter.adapters
+            ):
+                raise ValueError(
+                    "For multiple adapters: When `adapter_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of adapters"
+                )
+        else:
+            assert False
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (AttnProcessor2_0, XFormersAttnProcessor),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_adapter.StableDiffusionAdapterPipeline._default_height_width
+    def _default_height_width(self, height, width, image):
+        # NOTE: It is possible that a list of images have different
+        # dimensions for each image, so just checking the first image
+        # is not _exactly_ correct, but it is simple.
+        while isinstance(image, list):
+            image = image[0]
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[-2]
+
+            # round down to nearest multiple of `self.adapter.downscale_factor`
+            height = (height // self.adapter.downscale_factor) * self.adapter.downscale_factor
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[-1]
+
+            # round down to nearest multiple of `self.adapter.downscale_factor`
+            width = (width // self.adapter.downscale_factor) * self.adapter.downscale_factor
+
+        return height, width
+
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        adapter_image: PipelineImageInput = None,
+        control_image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        adapter_conditioning_scale: Union[float, List[float]] = 1.0,
+        adapter_conditioning_factor: float = 1.0,
+        clip_skip: Optional[int] = None,
+        controlnet_conditioning_scale=1.0,
+        guess_mode: bool = False,
+        control_guidance_start: float = 0.0,
+        control_guidance_end: float = 1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            adapter_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
+                The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
+                type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be
+                accepted as an image. The control image is automatically resized to fit the output image.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
+                accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
+                and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
+                `init`, images must be passed as a list such that each element of the list can be correctly batched for
+                input to a single ControlNet.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionAdapterPipelineOutput`]
+                instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added to the
+                residual in the original unet. If multiple adapters are specified in init, you can set the
+                corresponding scale as a list.
+            adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the
+                residual in the original unet. If multiple adapters are specified in init, you can set the
+                corresponding scale as a list.
+            adapter_conditioning_factor (`float`, *optional*, defaults to 1.0):
+                The fraction of timesteps for which adapter should be applied. If `adapter_conditioning_factor` is
+                `0.0`, adapter is not applied at all. If `adapter_conditioning_factor` is `1.0`, adapter is applied for
+                all timesteps. If `adapter_conditioning_factor` is `0.5`, adapter is applied for half of the timesteps.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+        adapter = self.adapter._orig_mod if is_compiled_module(self.adapter) else self.adapter
+
+        # 0. Default height and width to unet
+
+        height, width = self._default_height_width(height, width, adapter_image)
+        device = self._execution_device
+
+        if isinstance(adapter, MultiAdapter):
+            adapter_input = []
+
+            for one_image in adapter_image:
+                one_image = _preprocess_adapter_image(one_image, height, width)
+                one_image = one_image.to(device=device, dtype=adapter.dtype)
+                adapter_input.append(one_image)
+        else:
+            adapter_input = _preprocess_adapter_image(adapter_image, height, width)
+            adapter_input = adapter_input.to(device=device, dtype=adapter.dtype)
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 0.1 align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+        if isinstance(adapter, MultiAdapter) and isinstance(adapter_conditioning_scale, float):
+            adapter_conditioning_scale = [adapter_conditioning_scale] * len(adapter.adapters)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+
+        self.check_conditions(
+            prompt,
+            prompt_embeds,
+            adapter_image,
+            control_image,
+            adapter_conditioning_scale,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings & adapter features
+        if isinstance(adapter, MultiAdapter):
+            adapter_state = adapter(adapter_input, adapter_conditioning_scale)
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v
+        else:
+            adapter_state = adapter(adapter_input)
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v * adapter_conditioning_scale
+        if num_images_per_prompt > 1:
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v.repeat(num_images_per_prompt, 1, 1, 1)
+        if do_classifier_free_guidance:
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = torch.cat([v] * 2, dim=0)
+
+        # 7.2 Prepare control images
+        if isinstance(controlnet, ControlNetModel):
+            control_image = self.prepare_control_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif isinstance(controlnet, MultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+        else:
+            raise ValueError(f"{controlnet.__class__} is not supported.")
+
+        # 8.2 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            if isinstance(self.controlnet, MultiControlNetModel):
+                controlnet_keep.append(keeps)
+            else:
+                controlnet_keep.append(keeps[0])
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 7.1 Apply denoising_end
+        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                if i < int(num_inference_steps * adapter_conditioning_factor):
+                    down_intrablock_additional_residuals = [state.clone() for state in adapter_state]
+                else:
+                    down_intrablock_additional_residuals = None
+
+                # ----------- ControlNet
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input_controlnet = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input_controlnet = self.scheduler.scale_model_input(latent_model_input_controlnet, t)
+
+                # controlnet(s) inference
+                if guess_mode and do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    controlnet_added_cond_kwargs = {
+                        "text_embeds": add_text_embeds.chunk(2)[1],
+                        "time_ids": add_time_ids.chunk(2)[1],
+                    }
+                else:
+                    control_model_input = latent_model_input_controlnet
+                    controlnet_prompt_embeds = prompt_embeds
+                    controlnet_added_cond_kwargs = added_cond_kwargs
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    added_cond_kwargs=controlnet_added_cond_kwargs,
+                    return_dict=False,
+                )
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                    down_intrablock_additional_residuals=down_intrablock_additional_residuals,  # t2iadapter
+                    down_block_additional_residuals=down_block_res_samples,  # controlnet
+                    mid_block_additional_residual=mid_block_res_sample,  # controlnet
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+            return StableDiffusionXLPipelineOutput(images=image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py b/diffusers/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..94ca71cf7b1bf61fb465c5ac134405286ff386c9
--- /dev/null
+++ b/diffusers/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
@@ -0,0 +1,1840 @@
+# Copyright 2024 Jake Babbidge, TencentARC and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ignore the entire file for precommit
+# type: ignore
+
+import inspect
+from collections.abc import Callable
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+import torch.nn.functional as F
+from transformers import (
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+)
+
+from diffusers import DiffusionPipeline
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    StableDiffusionLoraLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import (
+    AutoencoderKL,
+    ControlNetModel,
+    MultiAdapter,
+    T2IAdapter,
+    UNet2DConditionModel,
+)
+from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline, T2IAdapter
+        >>> from diffusers.utils import load_image
+        >>> from PIL import Image
+        >>> from controlnet_aux.midas import MidasDetector
+
+        >>> adapter = T2IAdapter.from_pretrained(
+        ...     "TencentARC/t2i-adapter-sketch-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
+        ... ).to("cuda")
+
+        >>> controlnet = ControlNetModel.from_pretrained(
+        ...    "diffusers/controlnet-depth-sdxl-1.0",
+        ...    torch_dtype=torch.float16,
+        ...    variant="fp16",
+        ...    use_safetensors=True
+        ... ).to("cuda")
+
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+        ...     torch_dtype=torch.float16,
+        ...     variant="fp16",
+        ...     use_safetensors=True,
+        ...     custom_pipeline="stable_diffusion_xl_adapter_controlnet_inpaint",
+        ...     adapter=adapter,
+        ...     controlnet=controlnet,
+        ... ).to("cuda")
+
+        >>> prompt = "a tiger sitting on a park bench"
+        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+        >>> image = load_image(img_url).resize((1024, 1024))
+        >>> mask_image = load_image(mask_url).resize((1024, 1024))
+
+        >>> midas_depth = MidasDetector.from_pretrained(
+        ...    "valhalla/t2iadapter-aux-models", filename="dpt_large_384.pt", model_type="dpt_large"
+        ... ).to("cuda")
+
+        >>> depth_image = midas_depth(
+        ...    image, detect_resolution=512, image_resolution=1024
+        ... )
+
+        >>> strength = 0.4
+
+        >>> generator = torch.manual_seed(42)
+
+        >>> result_image = pipe(
+        ...     image=image,
+        ...     mask_image=mask,
+        ...     adapter_image=depth_image,
+        ...     control_image=depth_image,
+        ...     controlnet_conditioning_scale=strength,
+        ...     adapter_conditioning_scale=strength,
+        ...     strength=0.7,
+        ...     generator=generator,
+        ...     prompt=prompt,
+        ...     negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality",
+        ...        num_inference_steps=50
+        ... ).images[0]
+        ```
+"""
+
+
+def _preprocess_adapter_image(image, height, width):
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])) for i in image]
+        image = [
+            i[None, ..., None] if i.ndim == 2 else i[None, ...] for i in image
+        ]  # expand [h, w] or [h, w, c] to [b, h, w, c]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        if image[0].ndim == 3:
+            image = torch.stack(image, dim=0)
+        elif image[0].ndim == 4:
+            image = torch.cat(image, dim=0)
+        else:
+            raise ValueError(
+                f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
+            )
+    return image
+
+
+def mask_pil_to_torch(mask, height, width):
+    # preprocess mask
+    if isinstance(mask, Union[PIL.Image.Image, np.ndarray]):
+        mask = [mask]
+
+    if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+        mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
+        mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+        mask = mask.astype(np.float32) / 255.0
+    elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+        mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+
+    mask = torch.from_numpy(mask)
+    return mask
+
+
+def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool = False):
+    """
+    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
+    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
+    ``image`` and ``1`` for the ``mask``.
+
+    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
+
+    Args:
+        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
+        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
+
+
+    Raises:
+        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+            (ot the other way around).
+
+    Returns:
+        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
+            dimensions: ``batch x channels x height x width``.
+    """
+
+    # checkpoint. #TODO(Yiyi) - need to clean this up later
+    if image is None:
+        raise ValueError("`image` input cannot be undefined.")
+
+    if mask is None:
+        raise ValueError("`mask_image` input cannot be undefined.")
+
+    if isinstance(image, torch.Tensor):
+        if not isinstance(mask, torch.Tensor):
+            mask = mask_pil_to_torch(mask, height, width)
+
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        # Batch and add channel dim for single mask
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0).unsqueeze(0)
+
+        # Batch single mask or add channel dim
+        if mask.ndim == 3:
+            # Single batched mask, no channel dim or single mask not batched but channel dim
+            if mask.shape[0] == 1:
+                mask = mask.unsqueeze(0)
+
+            # Batched masks no channel dim
+            else:
+                mask = mask.unsqueeze(1)
+
+        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+        # assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
+
+        # Check image is in [-1, 1]
+        # if image.min() < -1 or image.max() > 1:
+        #    raise ValueError("Image should be in [-1, 1] range")
+
+        # Check mask is in [0, 1]
+        if mask.min() < 0 or mask.max() > 1:
+            raise ValueError("Mask should be in [0, 1] range")
+
+        # Binarize mask
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+
+        # Image as float32
+        image = image.to(dtype=torch.float32)
+    elif isinstance(mask, torch.Tensor):
+        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
+    else:
+        # preprocess image
+        if isinstance(image, Union[PIL.Image.Image, np.ndarray]):
+            image = [image]
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            # resize all images w.r.t passed height an width
+            image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+        mask = mask_pil_to_torch(mask, height, width)
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+
+    if image.shape[1] == 4:
+        # images are in latent space and thus can't
+        # be masked set masked_image to None
+        # we assume that the checkpoint is not an inpainting
+        # checkpoint. #TODO(Yiyi) - need to clean this up later
+        masked_image = None
+    else:
+        masked_image = image * (mask < 0.5)
+
+    # n.b. ensure backwards compatibility as old function does not return image
+    if return_image:
+        return mask, masked_image, image
+
+    return mask, masked_image
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class StableDiffusionXLControlNetAdapterInpaintPipeline(
+    DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, StableDiffusionLoraLoaderMixin
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
+    https://arxiv.org/abs/2302.08453
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        adapter ([`T2IAdapter`] or [`MultiAdapter`] or `List[T2IAdapter]`):
+            Provides additional conditioning to the unet during the denoising process. If you set multiple Adapter as a
+            list, the outputs from each Adapter are added together to create one combined additional conditioning.
+        adapter_weights (`List[float]`, *optional*, defaults to None):
+            List of floats representing the weight which will be multiply to each adapter's output before adding them
+            together.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
+            Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config
+            of `stabilityai/stable-diffusion-xl-refiner-1-0`.
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        adapter: Union[T2IAdapter, MultiAdapter],
+        controlnet: Union[ControlNetModel, MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        requires_aesthetics_score: bool = False,
+        force_zeros_for_empty_prompt: bool = True,
+    ):
+        super().__init__()
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            adapter=adapter,
+            controlnet=controlnet,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        self.default_sample_size = self.unet.config.sample_size
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+    def check_conditions(
+        self,
+        prompt,
+        prompt_embeds,
+        adapter_image,
+        control_image,
+        adapter_conditioning_scale,
+        controlnet_conditioning_scale,
+        control_guidance_start,
+        control_guidance_end,
+    ):
+        # controlnet checks
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        # Check controlnet `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(control_image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(control_image, list):
+                raise TypeError("For multiple controlnets: `control_image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in control_image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(control_image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(control_image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in control_image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        # adapter checks
+        if isinstance(self.adapter, T2IAdapter) or is_compiled and isinstance(self.adapter._orig_mod, T2IAdapter):
+            self.check_image(adapter_image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.adapter, MultiAdapter) or is_compiled and isinstance(self.adapter._orig_mod, MultiAdapter)
+        ):
+            if not isinstance(adapter_image, list):
+                raise TypeError("For multiple adapters: `adapter_image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in adapter_image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(adapter_image) != len(self.adapter.adapters):
+                raise ValueError(
+                    f"For multiple adapters: `image` must have the same length as the number of adapters, but got {len(adapter_image)} images and {len(self.adapters.nets)} Adapters."
+                )
+
+            for image_ in adapter_image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `adapter_conditioning_scale`
+        if isinstance(self.adapter, T2IAdapter) or is_compiled and isinstance(self.adapter._orig_mod, T2IAdapter):
+            if not isinstance(adapter_conditioning_scale, float):
+                raise TypeError("For single adapter: `adapter_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.adapter, MultiAdapter) or is_compiled and isinstance(self.adapter._orig_mod, MultiAdapter)
+        ):
+            if isinstance(adapter_conditioning_scale, list):
+                if any(isinstance(i, list) for i in adapter_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(adapter_conditioning_scale, list) and len(adapter_conditioning_scale) != len(
+                self.adapter.adapters
+            ):
+                raise ValueError(
+                    "For multiple adapters: When `adapter_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of adapters"
+                )
+        else:
+            assert False
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        add_noise=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if image.shape[1] == 4:
+            image_latents = image.to(device=device, dtype=dtype)
+        elif return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+            image_latents = self._encode_vae_image(image=image, generator=generator)
+
+        image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None and add_noise:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+            # if pure noise then scale the initial latents by the  Scheduler's init sigma
+            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+        elif add_noise:
+            noise = latents.to(device)
+            latents = noise * self.scheduler.init_noise_sigma
+        else:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = image_latents.to(device)
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        dtype = image.dtype
+        if self.vae.config.force_upcast:
+            image = image.float()
+            self.vae.to(dtype=torch.float32)
+
+        if isinstance(generator, list):
+            image_latents = [
+                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = self.vae.encode(image).latent_dist.sample(generator=generator)
+
+        if self.vae.config.force_upcast:
+            self.vae.to(dtype)
+
+        image_latents = image_latents.to(dtype)
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def prepare_mask_latents(
+        self,
+        mask,
+        masked_image,
+        batch_size,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        do_classifier_free_guidance,
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask,
+            size=(
+                height // self.vae_scale_factor,
+                width // self.vae_scale_factor,
+            ),
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+
+        masked_image_latents = None
+        if masked_image is not None:
+            masked_image = masked_image.to(device=device, dtype=dtype)
+            masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+            if masked_image_latents.shape[0] < batch_size:
+                if not batch_size % masked_image_latents.shape[0] == 0:
+                    raise ValueError(
+                        "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                        f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                        " Make sure the number of images that you pass is divisible by the total requested batch size."
+                    )
+                masked_image_latents = masked_image_latents.repeat(
+                    batch_size // masked_image_latents.shape[0], 1, 1, 1
+                )
+
+            masked_image_latents = (
+                torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+            )
+
+            # aligning device to prevent device errors when concating it with the latent model input
+            masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        return mask, masked_image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+        else:
+            t_start = 0
+
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        # Strength is irrelevant if we directly request a timestep to start at;
+        # that is, strength is determined by the denoising_start instead.
+        if denoising_start is not None:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            timesteps = timesteps[-num_inference_steps:]
+            return timesteps, num_inference_steps
+
+        return timesteps, num_inference_steps - t_start
+
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(original_size + crops_coords_top_left + (negative_aesthetic_score,))
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if (
+            expected_add_embed_dim > passed_add_embed_dim
+            and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+            )
+        elif (
+            expected_add_embed_dim < passed_add_embed_dim
+            and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+            )
+        elif expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+        return add_time_ids, add_neg_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (AttnProcessor2_0, XFormersAttnProcessor),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_adapter.StableDiffusionAdapterPipeline._default_height_width
+    def _default_height_width(self, height, width, image):
+        # NOTE: It is possible that a list of images have different
+        # dimensions for each image, so just checking the first image
+        # is not _exactly_ correct, but it is simple.
+        while isinstance(image, list):
+            image = image[0]
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[-2]
+
+            # round down to nearest multiple of `self.adapter.downscale_factor`
+            height = (height // self.adapter.downscale_factor) * self.adapter.downscale_factor
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[-1]
+
+            # round down to nearest multiple of `self.adapter.downscale_factor`
+            width = (width // self.adapter.downscale_factor) * self.adapter.downscale_factor
+
+        return height, width
+
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: Optional[Union[torch.Tensor, PIL.Image.Image]] = None,
+        mask_image: Optional[Union[torch.Tensor, PIL.Image.Image]] = None,
+        adapter_image: PipelineImageInput = None,
+        control_image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.9999,
+        num_inference_steps: int = 50,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[Union[torch.Tensor]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Optional[Tuple[int, int]] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        adapter_conditioning_scale: Optional[Union[float, List[float]]] = 1.0,
+        cond_tau: float = 1.0,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        controlnet_conditioning_scale=1.0,
+        guess_mode: bool = False,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            adapter_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
+                The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
+                type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be
+                accepted as an image. The control image is automatically resized to fit the output image.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
+                accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
+                and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
+                `init`, images must be passed as a list such that each element of the list can be correctly batched for
+                input to a single ControlNet.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionAdapterPipelineOutput`]
+                instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.7):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added to the
+                residual in the original unet. If multiple adapters are specified in init, you can set the
+                corresponding scale as a list.
+            adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the
+                residual in the original unet. If multiple adapters are specified in init, you can set the
+                corresponding scale as a list.
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+        adapter = self.adapter._orig_mod if is_compiled_module(self.adapter) else self.adapter
+        height, width = self._default_height_width(height, width, adapter_image)
+        device = self._execution_device
+
+        if isinstance(adapter, MultiAdapter):
+            adapter_input = []
+            for one_image in adapter_image:
+                one_image = _preprocess_adapter_image(one_image, height, width)
+                one_image = one_image.to(device=device, dtype=adapter.dtype)
+                adapter_input.append(one_image)
+        else:
+            adapter_input = _preprocess_adapter_image(adapter_image, height, width)
+            adapter_input = adapter_input.to(device=device, dtype=adapter.dtype)
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 0.1 align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+        if isinstance(adapter, MultiAdapter) and isinstance(adapter_conditioning_scale, float):
+            adapter_conditioning_scale = [adapter_conditioning_scale] * len(adapter.nets)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+
+        self.check_conditions(
+            prompt,
+            prompt_embeds,
+            adapter_image,
+            control_image,
+            adapter_conditioning_scale,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+
+        # 4. set timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=denoising_start if denoising_value_valid(denoising_start) else None,
+        )
+        # check that number of inference steps is not < 1 - as this doesn't make sense
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+
+        # 5. Preprocess mask and image - resizes image and mask w.r.t height and width
+        mask, masked_image, init_image = prepare_mask_and_masked_image(
+            image, mask_image, height, width, return_image=True
+        )
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
+        add_noise = denoising_start is None
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            add_noise=add_noise,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 7. Prepare mask latent variables
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance,
+        )
+
+        # 8. Check that sizes of mask, masked image and latents match
+        if num_channels_unet == 9:
+            # default case for runwayml/stable-diffusion-inpainting
+            num_channels_mask = mask.shape[1]
+            num_channels_masked_image = masked_image_latents.shape[1]
+            if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+                raise ValueError(
+                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    " `pipeline.unet` or your `mask_image` or `image` input."
+                )
+        elif num_channels_unet != 4:
+            raise ValueError(
+                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+            )
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 10. Prepare added time ids & embeddings & adapter features
+        if isinstance(adapter, MultiAdapter):
+            adapter_state = adapter(adapter_input, adapter_conditioning_scale)
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v
+        else:
+            adapter_state = adapter(adapter_input)
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v * adapter_conditioning_scale
+        if num_images_per_prompt > 1:
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v.repeat(num_images_per_prompt, 1, 1, 1)
+        if do_classifier_free_guidance:
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = torch.cat([v] * 2, dim=0)
+
+        # 10.2 Prepare control images
+        if isinstance(controlnet, ControlNetModel):
+            control_image = self.prepare_control_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif isinstance(controlnet, MultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+        else:
+            raise ValueError(f"{controlnet.__class__} is not supported.")
+
+        # 8.2 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            if isinstance(self.controlnet, MultiControlNetModel):
+                controlnet_keep.append(keeps)
+            else:
+                controlnet_keep.append(keeps[0])
+        # ----------------------------------------------------------------
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+
+        # 11. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 11.1 Apply denoising_end
+        if (
+            denoising_end is not None
+            and denoising_start is not None
+            and denoising_value_valid(denoising_end)
+            and denoising_value_valid(denoising_start)
+            and denoising_start >= denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {denoising_end} when using type float."
+            )
+        elif denoising_end is not None and denoising_value_valid(denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if num_channels_unet == 9:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                # predict the noise residual
+                added_cond_kwargs = {
+                    "text_embeds": add_text_embeds,
+                    "time_ids": add_time_ids,
+                }
+
+                if i < int(num_inference_steps * cond_tau):
+                    down_block_additional_residuals = [state.clone() for state in adapter_state]
+                else:
+                    down_block_additional_residuals = None
+
+                # ----------- ControlNet
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input_controlnet = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input_controlnet = self.scheduler.scale_model_input(latent_model_input_controlnet, t)
+
+                # controlnet(s) inference
+                if guess_mode and do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    controlnet_added_cond_kwargs = {
+                        "text_embeds": add_text_embeds.chunk(2)[1],
+                        "time_ids": add_time_ids.chunk(2)[1],
+                    }
+                else:
+                    control_model_input = latent_model_input_controlnet
+                    controlnet_prompt_embeds = prompt_embeds
+                    controlnet_added_cond_kwargs = added_cond_kwargs
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    added_cond_kwargs=controlnet_added_cond_kwargs,
+                    return_dict=False,
+                )
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                    down_intrablock_additional_residuals=down_block_additional_residuals,  # t2iadapter
+                    down_block_additional_residuals=down_block_res_samples,  # controlnet
+                    mid_block_additional_residual=mid_block_res_sample,  # controlnet
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(
+                        noise_pred,
+                        noise_pred_text,
+                        guidance_rescale=guidance_rescale,
+                    )
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred,
+                    t,
+                    latents,
+                    **extra_step_kwargs,
+                    return_dict=False,
+                )[0]
+
+                if num_channels_unet == 4:
+                    init_latents_proper = image_latents
+                    if do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper,
+                            noise,
+                            torch.tensor([noise_timestep]),
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+
+        # make sure the VAE is in float32 mode, as it overflows in float16
+        if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
+            self.upcast_vae()
+            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+        if output_type != "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+            return StableDiffusionXLPipelineOutput(images=image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py b/diffusers/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..584820e86254a65ee0da26669c0eafacdb98729b
--- /dev/null
+++ b/diffusers/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py
@@ -0,0 +1,1466 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torchvision
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from diffusers.models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+if is_invisible_watermark_available():
+    from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionXLImg2ImgPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
+
+        >>> init_image = load_image(url).convert("RGB")
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt, image=init_image).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLDifferentialImg2ImgPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    IPAdapterMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    In addition the pipeline inherits the following loading methods:
+        - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
+        - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]
+        - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
+
+    as well as the following saving methods:
+        - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`]
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        requires_aesthetics_score: bool = False,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        strength,
+        num_inference_steps,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+        if num_inference_steps is None:
+            raise ValueError("`num_inference_steps` cannot be None.")
+        elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
+            raise ValueError(
+                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
+                f" {type(num_inference_steps)}."
+            )
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+        else:
+            t_start = 0
+
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        # Strength is irrelevant if we directly request a timestep to start at;
+        # that is, strength is determined by the denoising_start instead.
+        if denoising_start is not None:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            timesteps = timesteps[-num_inference_steps:]
+            return timesteps, num_inference_steps
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
+    ):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        # Offload text encoder if `enable_model_cpu_offload` was enabled
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.text_encoder_2.to("cpu")
+            torch.cuda.empty_cache()
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                image = image.float()
+                self.vae.to(dtype=torch.float32)
+
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+
+            init_latents = init_latents.to(dtype)
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        if add_noise:
+            shape = init_latents.shape
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # get latents
+            init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            repeat_dims = [1]
+            image_embeds = []
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                    )
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                else:
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                image_embeds.append(single_image_embeds)
+
+        return image_embeds
+
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        negative_original_size,
+        negative_crops_coords_top_left,
+        negative_target_size,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(
+                negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+            )
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if (
+            expected_add_embed_dim > passed_add_embed_dim
+            and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+            )
+        elif (
+            expected_add_embed_dim < passed_add_embed_dim
+            and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+            )
+        elif expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+        return add_time_ids, add_neg_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def denoising_start(self):
+        return self._denoising_start
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: Union[
+            torch.Tensor,
+            PIL.Image.Image,
+            np.ndarray,
+            List[torch.Tensor],
+            List[PIL.Image.Image],
+            List[np.ndarray],
+        ] = None,
+        strength: float = 0.3,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        map: torch.Tensor = None,
+        original_image: Union[
+            torch.Tensor,
+            PIL.Image.Image,
+            np.ndarray,
+            List[torch.Tensor],
+            List[PIL.Image.Image],
+            List[np.ndarray],
+        ] = None,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
+                The image(s) to modify with the pipeline.
+            strength (`float`, *optional*, defaults to 0.3):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. Note that in the case of
+                `denoising_start` being declared as an integer, the value of `strength` will be ignored.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
+                denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
+                final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
+                forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
+                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
+                if `do_classifier_free_guidance` is set to `True`.
+                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.7):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            strength,
+            num_inference_steps,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+
+        # 4. Preprocess image
+        # image = self.image_processor.preprocess(image) #ideally we would have preprocess the image with diffusers, but for this POC we won't --- it throws a deprecated warning
+        map = torchvision.transforms.Resize(
+            tuple(s // self.vae_scale_factor for s in original_image.shape[2:]), antialias=None
+        )(map)
+
+        # 5. Prepare timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        # begin diff diff change
+        total_time_steps = num_inference_steps
+        # end diff diff change
+
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
+        )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        add_noise = True if denoising_start is None else False
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            add_noise,
+        )
+        # 7. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 8. Prepare added time ids & embeddings
+        if negative_original_size is None:
+            negative_original_size = original_size
+        if negative_target_size is None:
+            negative_target_size = target_size
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            negative_original_size,
+            negative_crops_coords_top_left,
+            negative_target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 9. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 9.1 Apply denoising_end
+        if (
+            denoising_end is not None
+            and denoising_start is not None
+            and denoising_value_valid(denoising_end)
+            and denoising_value_valid(denoising_start)
+            and denoising_start >= denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {denoising_end} when using type float."
+            )
+        elif denoising_end is not None and denoising_value_valid(denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # preparations for diff diff
+        original_with_noise = self.prepare_latents(
+            original_image, timesteps, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+        )
+        thresholds = torch.arange(total_time_steps, dtype=map.dtype) / total_time_steps
+        thresholds = thresholds.unsqueeze(1).unsqueeze(1).to(device)
+        masks = map > (thresholds + (denoising_start or 0))
+        # end diff diff preparations
+
+        # 9.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # diff diff
+                if i == 0 and denoising_start is None:
+                    latents = original_with_noise[:1]
+                else:
+                    mask = masks[i].unsqueeze(0)
+                    # cast mask to the same type as latents etc
+                    mask = mask.to(latents.dtype)
+                    mask = mask.unsqueeze(1)  # fit shape
+                    latents = original_with_noise[i] * mask + latents * (1 - mask)
+                # end diff diff
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+                    else:
+                        raise ValueError(
+                            "For the given accelerator, there seems to be an unexpected problem in type-casting. Please file an issue on the PyTorch GitHub repository. See also: https://github.com/huggingface/diffusers/pull/7446/."
+                        )
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+                else:
+                    raise ValueError(
+                        "For the given accelerator, there seems to be an unexpected problem in type-casting. Please file an issue on the PyTorch GitHub repository. See also: https://github.com/huggingface/diffusers/pull/7446/."
+                    )
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py b/diffusers/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aeba79ae930ad6ba7d2b1550cf04c6852ec7f33
--- /dev/null
+++ b/diffusers/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py
@@ -0,0 +1,1077 @@
+# Copyright 2024 The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import cv2
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn as nn
+
+from diffusers import StableDiffusionXLControlNetImg2ImgPipeline
+from diffusers.image_processor import PipelineImageInput
+from diffusers.models import ControlNetModel
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from diffusers.utils import (
+    deprecate,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module, is_torch_version
+
+
+try:
+    import xformers
+    import xformers.ops
+
+    xformers_available = True
+except Exception:
+    xformers_available = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+logger.warning(
+    "To use instant id pipelines, please make sure you have the `insightface` library installed: `pip install insightface`."
+    "Please refer to: https://huggingface.co/InstantX/InstantID for further instructions regarding inference"
+)
+
+
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+
+
+def reshape_tensor(x, heads):
+    bs, length, width = x.shape
+    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+    x = x.view(bs, length, heads, -1)
+    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+    x = x.transpose(1, 2)
+    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+    x = x.reshape(bs, heads, length, -1)
+    return x
+
+
+class PerceiverAttention(nn.Module):
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, n2, D)
+        """
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+
+        b, l, _ = latents.shape
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+
+        q = reshape_tensor(q, self.heads)
+        k = reshape_tensor(k, self.heads)
+        v = reshape_tensor(v, self.heads)
+
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+
+        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+
+        return self.to_out(out)
+
+
+class Resampler(nn.Module):
+    def __init__(
+        self,
+        dim=1024,
+        depth=8,
+        dim_head=64,
+        heads=16,
+        num_queries=8,
+        embedding_dim=768,
+        output_dim=1024,
+        ff_mult=4,
+    ):
+        super().__init__()
+
+        self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+
+        self.proj_in = nn.Linear(embedding_dim, dim)
+
+        self.proj_out = nn.Linear(dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+
+    def forward(self, x):
+        latents = self.latents.repeat(x.size(0), 1, 1)
+        x = self.proj_in(x)
+
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+
+
+class AttnProcessor(nn.Module):
+    r"""
+    Default processor for performing attention-related computations.
+    """
+
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+    ):
+        super().__init__()
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class IPAttnProcessor(nn.Module):
+    r"""
+    Attention processor for IP-Adapater.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        scale (`float`, defaults to 1.0):
+            the weight scale of image prompt.
+        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
+            The context length of the image features.
+    """
+
+    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        if xformers_available:
+            hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
+        else:
+            attention_probs = attn.get_attention_scores(query, key, attention_mask)
+            hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # for ip-adapter
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+
+        ip_key = attn.head_to_batch_dim(ip_key)
+        ip_value = attn.head_to_batch_dim(ip_value)
+
+        if xformers_available:
+            ip_hidden_states = self._memory_efficient_attention_xformers(query, ip_key, ip_value, None)
+        else:
+            ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
+            ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
+        ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
+
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+    def _memory_efficient_attention_xformers(self, query, key, value, attention_mask):
+        # TODO attention_mask
+        query = query.contiguous()
+        key = key.contiguous()
+        value = value.contiguous()
+        hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
+        return hidden_states
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate insightface
+        >>> import diffusers
+        >>> from diffusers.utils import load_image
+        >>> from diffusers.models import ControlNetModel
+
+        >>> import cv2
+        >>> import torch
+        >>> import numpy as np
+        >>> from PIL import Image
+
+        >>> from insightface.app import FaceAnalysis
+        >>> from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline, draw_kps
+
+        >>> # download 'antelopev2' under ./models
+        >>> app = FaceAnalysis(name='antelopev2', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
+        >>> app.prepare(ctx_id=0, det_size=(640, 640))
+
+        >>> # download models under ./checkpoints
+        >>> face_adapter = f'./checkpoints/ip-adapter.bin'
+        >>> controlnet_path = f'./checkpoints/ControlNetModel'
+
+        >>> # load IdentityNet
+        >>> controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
+
+        >>> pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+        >>> pipe.cuda()
+
+        >>> # load adapter
+        >>> pipe.load_ip_adapter_instantid(face_adapter)
+
+        >>> prompt = "analog film photo of a man. faded film, desaturated, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage, masterpiece, best quality"
+        >>> negative_prompt = "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured (lowres, low quality, worst quality:1.2), (text:1.2), watermark, painting, drawing, illustration, glitch,deformed, mutated, cross-eyed, ugly, disfigured"
+
+        >>> # load an image
+        >>> image = load_image("your-example.jpg")
+
+        >>> face_info = app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))[-1]
+        >>> face_emb = face_info['embedding']
+        >>> face_kps = draw_kps(face_image, face_info['kps'])
+
+        >>> pipe.set_ip_adapter_scale(0.8)
+
+        >>> # generate image
+        >>> image = pipe(
+        ...     prompt, image_embeds=face_emb, image=face_kps, controlnet_conditioning_scale=0.8
+        ... ).images[0]
+        ```
+"""
+
+
+def draw_kps(image_pil, kps, color_list=[(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]):
+    stickwidth = 4
+    limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
+    kps = np.array(kps)
+
+    w, h = image_pil.size
+    out_img = np.zeros([h, w, 3])
+
+    for i in range(len(limbSeq)):
+        index = limbSeq[i]
+        color = color_list[index[0]]
+
+        x = kps[index][:, 0]
+        y = kps[index][:, 1]
+        length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
+        angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
+        polygon = cv2.ellipse2Poly(
+            (int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1
+        )
+        out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
+    out_img = (out_img * 0.6).astype(np.uint8)
+
+    for idx_kp, kp in enumerate(kps):
+        color = color_list[idx_kp]
+        x, y = kp
+        out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1)
+
+    out_img_pil = PIL.Image.fromarray(out_img.astype(np.uint8))
+    return out_img_pil
+
+
+class StableDiffusionXLInstantIDImg2ImgPipeline(StableDiffusionXLControlNetImg2ImgPipeline):
+    def cuda(self, dtype=torch.float16, use_xformers=False):
+        self.to("cuda", dtype)
+
+        if hasattr(self, "image_proj_model"):
+            self.image_proj_model.to(self.unet.device).to(self.unet.dtype)
+
+        if use_xformers:
+            if is_xformers_available():
+                import xformers
+                from packaging import version
+
+                xformers_version = version.parse(xformers.__version__)
+                if xformers_version == version.parse("0.0.16"):
+                    logger.warning(
+                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                    )
+                self.enable_xformers_memory_efficient_attention()
+            else:
+                raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    def load_ip_adapter_instantid(self, model_ckpt, image_emb_dim=512, num_tokens=16, scale=0.5):
+        self.set_image_proj_model(model_ckpt, image_emb_dim, num_tokens)
+        self.set_ip_adapter(model_ckpt, num_tokens, scale)
+
+    def set_image_proj_model(self, model_ckpt, image_emb_dim=512, num_tokens=16):
+        image_proj_model = Resampler(
+            dim=1280,
+            depth=4,
+            dim_head=64,
+            heads=20,
+            num_queries=num_tokens,
+            embedding_dim=image_emb_dim,
+            output_dim=self.unet.config.cross_attention_dim,
+            ff_mult=4,
+        )
+
+        image_proj_model.eval()
+
+        self.image_proj_model = image_proj_model.to(self.device, dtype=self.dtype)
+        state_dict = torch.load(model_ckpt, map_location="cpu")
+        if "image_proj" in state_dict:
+            state_dict = state_dict["image_proj"]
+        self.image_proj_model.load_state_dict(state_dict)
+
+        self.image_proj_model_in_features = image_emb_dim
+
+    def set_ip_adapter(self, model_ckpt, num_tokens, scale):
+        unet = self.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = AttnProcessor().to(unet.device, dtype=unet.dtype)
+            else:
+                attn_procs[name] = IPAttnProcessor(
+                    hidden_size=hidden_size,
+                    cross_attention_dim=cross_attention_dim,
+                    scale=scale,
+                    num_tokens=num_tokens,
+                ).to(unet.device, dtype=unet.dtype)
+        unet.set_attn_processor(attn_procs)
+
+        state_dict = torch.load(model_ckpt, map_location="cpu")
+        ip_layers = torch.nn.ModuleList(self.unet.attn_processors.values())
+        if "ip_adapter" in state_dict:
+            state_dict = state_dict["ip_adapter"]
+        ip_layers.load_state_dict(state_dict)
+
+    def set_ip_adapter_scale(self, scale):
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        for attn_processor in unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor):
+                attn_processor.scale = scale
+
+    def _encode_prompt_image_emb(self, prompt_image_emb, device, dtype, do_classifier_free_guidance):
+        if isinstance(prompt_image_emb, torch.Tensor):
+            prompt_image_emb = prompt_image_emb.clone().detach()
+        else:
+            prompt_image_emb = torch.tensor(prompt_image_emb)
+
+        prompt_image_emb = prompt_image_emb.to(device=device, dtype=dtype)
+        prompt_image_emb = prompt_image_emb.reshape([1, -1, self.image_proj_model_in_features])
+
+        if do_classifier_free_guidance:
+            prompt_image_emb = torch.cat([torch.zeros_like(prompt_image_emb), prompt_image_emb], dim=0)
+        else:
+            prompt_image_emb = torch.cat([prompt_image_emb], dim=0)
+
+        prompt_image_emb = self.image_proj_model(prompt_image_emb)
+        return prompt_image_emb
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        control_image: PipelineImageInput = None,
+        strength: float = 0.8,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
+                accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
+                and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
+                `init`, images must be passed as a list such that each element of the list can be correctly batched for
+                input to a single ControlNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2`
+                and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, pooled text embeddings are generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt
+                weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input
+                argument.
+            image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned containing the output images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            control_image,
+            strength,
+            num_inference_steps,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            None,
+            None,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3.1 Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt,
+            prompt_2,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 3.2 Encode image prompt
+        prompt_image_emb = self._encode_prompt_image_emb(
+            image_embeds, device, self.unet.dtype, self.do_classifier_free_guidance
+        )
+        bs_embed, seq_len, _ = prompt_image_emb.shape
+        prompt_image_emb = prompt_image_emb.repeat(1, num_images_per_prompt, 1)
+        prompt_image_emb = prompt_image_emb.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # 4. Prepare image and controlnet_conditioning_image
+        image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+
+        if isinstance(controlnet, ControlNetModel):
+            control_image = self.prepare_control_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+            height, width = control_image.shape[-2:]
+        elif isinstance(controlnet, MultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+            height, width = control_image[0].shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            True,
+        )
+
+        # # 6.5 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 7.2 Prepare added time ids & embeddings
+        if isinstance(control_image, list):
+            original_size = original_size or control_image[0].shape[-2:]
+        else:
+            original_size = original_size or control_image.shape[-2:]
+        target_size = target_size or (height, width)
+
+        if negative_original_size is None:
+            negative_original_size = original_size
+        if negative_target_size is None:
+            negative_target_size = target_size
+        add_text_embeds = pooled_prompt_embeds
+
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            negative_original_size,
+            negative_crops_coords_top_left,
+            negative_target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+        encoder_hidden_states = torch.cat([prompt_embeds, prompt_image_emb], dim=1)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        is_unet_compiled = is_compiled_module(self.unet)
+        is_controlnet_compiled = is_compiled_module(self.controlnet)
+        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Relevant thread:
+                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
+                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
+                    torch._inductor.cudagraph_mark_step_begin()
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                # controlnet(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    controlnet_added_cond_kwargs = {
+                        "text_embeds": add_text_embeds.chunk(2)[1],
+                        "time_ids": add_time_ids.chunk(2)[1],
+                    }
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+                    controlnet_added_cond_kwargs = added_cond_kwargs
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=prompt_image_emb,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    added_cond_kwargs=controlnet_added_cond_kwargs,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=encoder_hidden_states,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/examples/community/pipeline_stable_diffusion_xl_instantid.py b/diffusers/examples/community/pipeline_stable_diffusion_xl_instantid.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eead8861e4d760dfe136f495570ed2c0781ecf5
--- /dev/null
+++ b/diffusers/examples/community/pipeline_stable_diffusion_xl_instantid.py
@@ -0,0 +1,1066 @@
+# Copyright 2024 The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import cv2
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn as nn
+
+from diffusers import StableDiffusionXLControlNetPipeline
+from diffusers.image_processor import PipelineImageInput
+from diffusers.models import ControlNetModel
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from diffusers.utils import (
+    deprecate,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module, is_torch_version
+
+
+try:
+    import xformers
+    import xformers.ops
+
+    xformers_available = True
+except Exception:
+    xformers_available = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+logger.warning(
+    "To use instant id pipelines, please make sure you have the `insightface` library installed: `pip install insightface`."
+    "Please refer to: https://huggingface.co/InstantX/InstantID for further instructions regarding inference"
+)
+
+
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+
+
+def reshape_tensor(x, heads):
+    bs, length, width = x.shape
+    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+    x = x.view(bs, length, heads, -1)
+    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+    x = x.transpose(1, 2)
+    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+    x = x.reshape(bs, heads, length, -1)
+    return x
+
+
+class PerceiverAttention(nn.Module):
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, n2, D)
+        """
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+
+        b, l, _ = latents.shape
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+
+        q = reshape_tensor(q, self.heads)
+        k = reshape_tensor(k, self.heads)
+        v = reshape_tensor(v, self.heads)
+
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+
+        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+
+        return self.to_out(out)
+
+
+class Resampler(nn.Module):
+    def __init__(
+        self,
+        dim=1024,
+        depth=8,
+        dim_head=64,
+        heads=16,
+        num_queries=8,
+        embedding_dim=768,
+        output_dim=1024,
+        ff_mult=4,
+    ):
+        super().__init__()
+
+        self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+
+        self.proj_in = nn.Linear(embedding_dim, dim)
+
+        self.proj_out = nn.Linear(dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+
+    def forward(self, x):
+        latents = self.latents.repeat(x.size(0), 1, 1)
+        x = self.proj_in(x)
+
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+
+
+class AttnProcessor(nn.Module):
+    r"""
+    Default processor for performing attention-related computations.
+    """
+
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+    ):
+        super().__init__()
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class IPAttnProcessor(nn.Module):
+    r"""
+    Attention processor for IP-Adapater.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        scale (`float`, defaults to 1.0):
+            the weight scale of image prompt.
+        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
+            The context length of the image features.
+    """
+
+    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        if xformers_available:
+            hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
+        else:
+            attention_probs = attn.get_attention_scores(query, key, attention_mask)
+            hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # for ip-adapter
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+
+        ip_key = attn.head_to_batch_dim(ip_key)
+        ip_value = attn.head_to_batch_dim(ip_value)
+
+        if xformers_available:
+            ip_hidden_states = self._memory_efficient_attention_xformers(query, ip_key, ip_value, None)
+        else:
+            ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
+            ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
+        ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
+
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+    def _memory_efficient_attention_xformers(self, query, key, value, attention_mask):
+        # TODO attention_mask
+        query = query.contiguous()
+        key = key.contiguous()
+        value = value.contiguous()
+        hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
+        return hidden_states
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate insightface
+        >>> import diffusers
+        >>> from diffusers.utils import load_image
+        >>> from diffusers.models import ControlNetModel
+
+        >>> import cv2
+        >>> import torch
+        >>> import numpy as np
+        >>> from PIL import Image
+
+        >>> from insightface.app import FaceAnalysis
+        >>> from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline, draw_kps
+
+        >>> # download 'antelopev2' under ./models
+        >>> app = FaceAnalysis(name='antelopev2', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
+        >>> app.prepare(ctx_id=0, det_size=(640, 640))
+
+        >>> # download models under ./checkpoints
+        >>> face_adapter = f'./checkpoints/ip-adapter.bin'
+        >>> controlnet_path = f'./checkpoints/ControlNetModel'
+
+        >>> # load IdentityNet
+        >>> controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
+
+        >>> pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+        >>> pipe.cuda()
+
+        >>> # load adapter
+        >>> pipe.load_ip_adapter_instantid(face_adapter)
+
+        >>> prompt = "analog film photo of a man. faded film, desaturated, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage, masterpiece, best quality"
+        >>> negative_prompt = "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured (lowres, low quality, worst quality:1.2), (text:1.2), watermark, painting, drawing, illustration, glitch,deformed, mutated, cross-eyed, ugly, disfigured"
+
+        >>> # load an image
+        >>> image = load_image("your-example.jpg")
+
+        >>> face_info = app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))[-1]
+        >>> face_emb = face_info['embedding']
+        >>> face_kps = draw_kps(face_image, face_info['kps'])
+
+        >>> pipe.set_ip_adapter_scale(0.8)
+
+        >>> # generate image
+        >>> image = pipe(
+        ...     prompt, image_embeds=face_emb, image=face_kps, controlnet_conditioning_scale=0.8
+        ... ).images[0]
+        ```
+"""
+
+
+def draw_kps(image_pil, kps, color_list=[(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]):
+    stickwidth = 4
+    limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
+    kps = np.array(kps)
+
+    w, h = image_pil.size
+    out_img = np.zeros([h, w, 3])
+
+    for i in range(len(limbSeq)):
+        index = limbSeq[i]
+        color = color_list[index[0]]
+
+        x = kps[index][:, 0]
+        y = kps[index][:, 1]
+        length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
+        angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
+        polygon = cv2.ellipse2Poly(
+            (int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1
+        )
+        out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
+    out_img = (out_img * 0.6).astype(np.uint8)
+
+    for idx_kp, kp in enumerate(kps):
+        color = color_list[idx_kp]
+        x, y = kp
+        out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1)
+
+    out_img_pil = PIL.Image.fromarray(out_img.astype(np.uint8))
+    return out_img_pil
+
+
+class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline):
+    def cuda(self, dtype=torch.float16, use_xformers=False):
+        self.to("cuda", dtype)
+
+        if hasattr(self, "image_proj_model"):
+            self.image_proj_model.to(self.unet.device).to(self.unet.dtype)
+
+        if use_xformers:
+            if is_xformers_available():
+                import xformers
+                from packaging import version
+
+                xformers_version = version.parse(xformers.__version__)
+                if xformers_version == version.parse("0.0.16"):
+                    logger.warning(
+                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                    )
+                self.enable_xformers_memory_efficient_attention()
+            else:
+                raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    def load_ip_adapter_instantid(self, model_ckpt, image_emb_dim=512, num_tokens=16, scale=0.5):
+        self.set_image_proj_model(model_ckpt, image_emb_dim, num_tokens)
+        self.set_ip_adapter(model_ckpt, num_tokens, scale)
+
+    def set_image_proj_model(self, model_ckpt, image_emb_dim=512, num_tokens=16):
+        image_proj_model = Resampler(
+            dim=1280,
+            depth=4,
+            dim_head=64,
+            heads=20,
+            num_queries=num_tokens,
+            embedding_dim=image_emb_dim,
+            output_dim=self.unet.config.cross_attention_dim,
+            ff_mult=4,
+        )
+
+        image_proj_model.eval()
+
+        self.image_proj_model = image_proj_model.to(self.device, dtype=self.dtype)
+        state_dict = torch.load(model_ckpt, map_location="cpu")
+        if "image_proj" in state_dict:
+            state_dict = state_dict["image_proj"]
+        self.image_proj_model.load_state_dict(state_dict)
+
+        self.image_proj_model_in_features = image_emb_dim
+
+    def set_ip_adapter(self, model_ckpt, num_tokens, scale):
+        unet = self.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = AttnProcessor().to(unet.device, dtype=unet.dtype)
+            else:
+                attn_procs[name] = IPAttnProcessor(
+                    hidden_size=hidden_size,
+                    cross_attention_dim=cross_attention_dim,
+                    scale=scale,
+                    num_tokens=num_tokens,
+                ).to(unet.device, dtype=unet.dtype)
+        unet.set_attn_processor(attn_procs)
+
+        state_dict = torch.load(model_ckpt, map_location="cpu")
+        ip_layers = torch.nn.ModuleList(self.unet.attn_processors.values())
+        if "ip_adapter" in state_dict:
+            state_dict = state_dict["ip_adapter"]
+        ip_layers.load_state_dict(state_dict)
+
+    def set_ip_adapter_scale(self, scale):
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        for attn_processor in unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor):
+                attn_processor.scale = scale
+
+    def _encode_prompt_image_emb(self, prompt_image_emb, device, dtype, do_classifier_free_guidance):
+        if isinstance(prompt_image_emb, torch.Tensor):
+            prompt_image_emb = prompt_image_emb.clone().detach()
+        else:
+            prompt_image_emb = torch.tensor(prompt_image_emb)
+
+        prompt_image_emb = prompt_image_emb.to(device=device, dtype=dtype)
+        prompt_image_emb = prompt_image_emb.reshape([1, -1, self.image_proj_model_in_features])
+
+        if do_classifier_free_guidance:
+            prompt_image_emb = torch.cat([torch.zeros_like(prompt_image_emb), prompt_image_emb], dim=0)
+        else:
+            prompt_image_emb = torch.cat([prompt_image_emb], dim=0)
+
+        prompt_image_emb = self.image_proj_model(prompt_image_emb)
+        return prompt_image_emb
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
+                accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
+                and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
+                `init`, images must be passed as a list such that each element of the list can be correctly batched for
+                input to a single ControlNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2`
+                and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, pooled text embeddings are generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt
+                weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input
+                argument.
+            image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned containing the output images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            image,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3.1 Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt,
+            prompt_2,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 3.2 Encode image prompt
+        prompt_image_emb = self._encode_prompt_image_emb(
+            image_embeds, device, self.unet.dtype, self.do_classifier_free_guidance
+        )
+        bs_embed, seq_len, _ = prompt_image_emb.shape
+        prompt_image_emb = prompt_image_emb.repeat(1, num_images_per_prompt, 1)
+        prompt_image_emb = prompt_image_emb.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # 4. Prepare image
+        if isinstance(controlnet, ControlNetModel):
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+            height, width = image.shape[-2:]
+        elif isinstance(controlnet, MultiControlNetModel):
+            images = []
+
+            for image_ in image:
+                image_ = self.prepare_image(
+                    image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                images.append(image_)
+
+            image = images
+            height, width = image[0].shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6.5 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 7.2 Prepare added time ids & embeddings
+        if isinstance(image, list):
+            original_size = original_size or image[0].shape[-2:]
+        else:
+            original_size = original_size or image.shape[-2:]
+        target_size = target_size or (height, width)
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+        encoder_hidden_states = torch.cat([prompt_embeds, prompt_image_emb], dim=1)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        is_unet_compiled = is_compiled_module(self.unet)
+        is_controlnet_compiled = is_compiled_module(self.controlnet)
+        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Relevant thread:
+                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
+                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
+                    torch._inductor.cudagraph_mark_step_begin()
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                # controlnet(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    controlnet_added_cond_kwargs = {
+                        "text_embeds": add_text_embeds.chunk(2)[1],
+                        "time_ids": add_time_ids.chunk(2)[1],
+                    }
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+                    controlnet_added_cond_kwargs = added_cond_kwargs
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=prompt_image_emb,
+                    controlnet_cond=image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    added_cond_kwargs=controlnet_added_cond_kwargs,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=encoder_hidden_states,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/examples/community/pipeline_stable_diffusion_xl_ipex.py b/diffusers/examples/community/pipeline_stable_diffusion_xl_ipex.py
new file mode 100644
index 0000000000000000000000000000000000000000..022dfb1abf8247b7fcceca9ab2c301702b16ccef
--- /dev/null
+++ b/diffusers/examples/community/pipeline_stable_diffusion_xl_ipex.py
@@ -0,0 +1,1430 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import intel_extension_for_pytorch as ipex
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import StableDiffusionXLPipeline
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import (
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+if is_invisible_watermark_available():
+    from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionXLPipelineIpex
+
+        >>> # SDXL-Turbo, a distilled version of SDXL 1.0, trained for real-time synthesis
+        >>> pipe = StableDiffusionXLPipelineIpex.from_pretrained(
+        ...     "stabilityai/sdxl-turbo", low_cpu_mem_usage=True, use_safetensors=True
+        ... )
+
+        >>> num_inference_steps = 1
+        >>> guidance_scale = 0.0
+        >>> use_bf16 = True
+        >>> data_type = torch.bfloat16 if use_bf16 else torch.float32
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+
+        >>> # value of image height/width should be consistent with the pipeline inference
+        >>> # For Float32
+        >>> pipe.prepare_for_ipex(torch.float32, prompt, height=512, width=512)
+        >>> # For BFloat16
+        >>> pipe.prepare_for_ipex(torch.bfloat16, prompt, height=512, width=512)
+
+        >>> # value of image height/width should be consistent with 'prepare_for_ipex()'
+        >>> # For Float32
+        >>> image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=guidance_scale).images[0]
+        >>> # For BFloat16
+        >>> with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+        >>>     image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=guidance_scale).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLPipelineIpex(
+    StableDiffusionXLPipeline,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL on IPEX.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    In addition the pipeline inherits the following loading methods:
+        - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`]
+        - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
+
+    as well as the following saving methods:
+        - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`]
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        # super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.default_sample_size = self.unet.config.sample_size
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: procecss multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeds = self.image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+        uncond_image_embeds = torch.zeros_like(image_embeds)
+        return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=torch.float32)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None:
+            image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+                image_embeds = image_embeds.to(device)
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 8.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 9. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                # noise_pred = self.unet(
+                #     latent_model_input,
+                #     t,
+                #     encoder_hidden_states=prompt_embeds,
+                #     timestep_cond=timestep_cond,
+                #     cross_attention_kwargs=self.cross_attention_kwargs,
+                #     added_cond_kwargs=added_cond_kwargs,
+                #     return_dict=False,
+                # )[0]
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    added_cond_kwargs=added_cond_kwargs,
+                )["sample"]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
+
+    @torch.no_grad()
+    def prepare_for_ipex(
+        self,
+        dtype=torch.float32,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = "cpu"
+        do_classifier_free_guidance = self.do_classifier_free_guidance
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None:
+            image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+                image_embeds = image_embeds.to(device)
+
+        dummy = torch.ones(1, dtype=torch.int32)
+        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+        latent_model_input = self.scheduler.scale_model_input(latent_model_input, dummy)
+
+        # predict the noise residual
+        added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+        if ip_adapter_image is not None:
+            added_cond_kwargs["image_embeds"] = image_embeds
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+
+        self.unet = self.unet.to(memory_format=torch.channels_last)
+        self.vae.decoder = self.vae.decoder.to(memory_format=torch.channels_last)
+        self.text_encoder = self.text_encoder.to(memory_format=torch.channels_last)
+
+        unet_input_example = {
+            "sample": latent_model_input,
+            "timestep": dummy,
+            "encoder_hidden_states": prompt_embeds,
+            "added_cond_kwargs": added_cond_kwargs,
+        }
+
+        vae_decoder_input_example = latents
+
+        # optimize with ipex
+        if dtype == torch.bfloat16:
+            self.unet = ipex.optimize(
+                self.unet.eval(),
+                dtype=torch.bfloat16,
+                inplace=True,
+            )
+            self.vae.decoder = ipex.optimize(self.vae.decoder.eval(), dtype=torch.bfloat16, inplace=True)
+            self.text_encoder = ipex.optimize(self.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
+        elif dtype == torch.float32:
+            self.unet = ipex.optimize(
+                self.unet.eval(),
+                dtype=torch.float32,
+                inplace=True,
+                level="O1",
+                weights_prepack=True,
+                auto_kernel_selection=False,
+            )
+            self.vae.decoder = ipex.optimize(
+                self.vae.decoder.eval(),
+                dtype=torch.float32,
+                inplace=True,
+                level="O1",
+                weights_prepack=True,
+                auto_kernel_selection=False,
+            )
+            self.text_encoder = ipex.optimize(
+                self.text_encoder.eval(),
+                dtype=torch.float32,
+                inplace=True,
+                level="O1",
+                weights_prepack=True,
+                auto_kernel_selection=False,
+            )
+        else:
+            raise ValueError(" The value of 'dtype' should be 'torch.bfloat16' or 'torch.float32' !")
+
+        # trace unet model to get better performance on IPEX
+        with torch.cpu.amp.autocast(enabled=dtype == torch.bfloat16), torch.no_grad():
+            unet_trace_model = torch.jit.trace(
+                self.unet, example_kwarg_inputs=unet_input_example, check_trace=False, strict=False
+            )
+            unet_trace_model = torch.jit.freeze(unet_trace_model)
+            self.unet.forward = unet_trace_model.forward
+
+        # trace vae.decoder model to get better performance on IPEX
+        with torch.cpu.amp.autocast(enabled=dtype == torch.bfloat16), torch.no_grad():
+            vae_decoder_trace_model = torch.jit.trace(
+                self.vae.decoder, vae_decoder_input_example, check_trace=False, strict=False
+            )
+            vae_decoder_trace_model = torch.jit.freeze(vae_decoder_trace_model)
+            self.vae.decoder.forward = vae_decoder_trace_model.forward
diff --git a/diffusers/examples/community/pipeline_zero1to3.py b/diffusers/examples/community/pipeline_zero1to3.py
new file mode 100644
index 0000000000000000000000000000000000000000..95bb37ce02b701611f2d1c265fd0fece302846af
--- /dev/null
+++ b/diffusers/examples/community/pipeline_zero1to3.py
@@ -0,0 +1,793 @@
+# A diffuser version implementation of Zero1to3 (https://github.com/cvlab-columbia/zero123), ICCV 2023
+# by Xin Kong
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import kornia
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+# from ...configuration_utils import FrozenDict
+# from ...models import AutoencoderKL, UNet2DConditionModel
+# from ...schedulers import KarrasDiffusionSchedulers
+# from ...utils import (
+#     deprecate,
+#     is_accelerate_available,
+#     is_accelerate_version,
+#     logging,
+#     randn_tensor,
+#     replace_example_docstring,
+# )
+# from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+# from . import StableDiffusionPipelineOutput
+# from .safety_checker import StableDiffusionSafetyChecker
+from diffusers import AutoencoderKL, DiffusionPipeline, StableDiffusionMixin, UNet2DConditionModel
+from diffusers.configuration_utils import ConfigMixin, FrozenDict
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    deprecate,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# todo
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionPipeline
+
+        >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+class CCProjection(ModelMixin, ConfigMixin):
+    def __init__(self, in_channel=772, out_channel=768):
+        super().__init__()
+        self.in_channel = in_channel
+        self.out_channel = out_channel
+        self.projection = torch.nn.Linear(in_channel, out_channel)
+
+    def forward(self, x):
+        return self.projection(x)
+
+
+class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for single view conditioned novel view generation using Zero1to3.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen CLIP image-encoder. Stable Diffusion Image Variation uses the vision portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+        cc_projection ([`CCProjection`]):
+            Projection layer to project the concated CLIP features and pose embeddings to the original CLIP feature size.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        image_encoder: CLIPVisionModelWithProjection,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        cc_projection: CCProjection,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            image_encoder=image_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            cc_projection=cc_projection,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+        # self.model_mode = None
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def CLIP_preprocess(self, x):
+        dtype = x.dtype
+        # following openai's implementation
+        # TODO HF OpenAI CLIP preprocessing issue https://github.com/huggingface/transformers/issues/22505#issuecomment-1650170741
+        # follow openai preprocessing to keep exact same, input tensor [-1, 1], otherwise the preprocessing will be different, https://github.com/huggingface/transformers/pull/22608
+        if isinstance(x, torch.Tensor):
+            if x.min() < -1.0 or x.max() > 1.0:
+                raise ValueError("Expected input tensor to have values in the range [-1, 1]")
+        x = kornia.geometry.resize(
+            x.to(torch.float32), (224, 224), interpolation="bicubic", align_corners=True, antialias=False
+        ).to(dtype=dtype)
+        x = (x + 1.0) / 2.0
+        # renormalize according to clip
+        x = kornia.enhance.normalize(
+            x, torch.Tensor([0.48145466, 0.4578275, 0.40821073]), torch.Tensor([0.26862954, 0.26130258, 0.27577711])
+        )
+        return x
+
+    # from image_variation
+    def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
+        dtype = next(self.image_encoder.parameters()).dtype
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        if isinstance(image, torch.Tensor):
+            # Batch single image
+            if image.ndim == 3:
+                assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
+                image = image.unsqueeze(0)
+
+            assert image.ndim == 4, "Image must have 4 dimensions"
+
+            # Check image is in [-1, 1]
+            if image.min() < -1 or image.max() > 1:
+                raise ValueError("Image should be in [-1, 1] range")
+        else:
+            # preprocess image
+            if isinstance(image, (PIL.Image.Image, np.ndarray)):
+                image = [image]
+
+            if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+                image = [np.array(i.convert("RGB"))[None, :] for i in image]
+                image = np.concatenate(image, axis=0)
+            elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+                image = np.concatenate([i[None, :] for i in image], axis=0)
+
+            image = image.transpose(0, 3, 1, 2)
+            image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+        image = image.to(device=device, dtype=dtype)
+
+        image = self.CLIP_preprocess(image)
+        # if not isinstance(image, torch.Tensor):
+        #     # 0-255
+        #     print("Warning: image is processed by hf's preprocess, which is different from openai original's.")
+        #     image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
+        image_embeddings = self.image_encoder(image).image_embeds.to(dtype=dtype)
+        image_embeddings = image_embeddings.unsqueeze(1)
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = image_embeddings.shape
+        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
+        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = torch.zeros_like(image_embeddings)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
+
+        return image_embeddings
+
+    def _encode_pose(self, pose, device, num_images_per_prompt, do_classifier_free_guidance):
+        dtype = next(self.cc_projection.parameters()).dtype
+        if isinstance(pose, torch.Tensor):
+            pose_embeddings = pose.unsqueeze(1).to(device=device, dtype=dtype)
+        else:
+            if isinstance(pose[0], list):
+                pose = torch.Tensor(pose)
+            else:
+                pose = torch.Tensor([pose])
+            x, y, z = pose[:, 0].unsqueeze(1), pose[:, 1].unsqueeze(1), pose[:, 2].unsqueeze(1)
+            pose_embeddings = (
+                torch.cat([torch.deg2rad(x), torch.sin(torch.deg2rad(y)), torch.cos(torch.deg2rad(y)), z], dim=-1)
+                .unsqueeze(1)
+                .to(device=device, dtype=dtype)
+            )  # B, 1, 4
+        # duplicate pose embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = pose_embeddings.shape
+        pose_embeddings = pose_embeddings.repeat(1, num_images_per_prompt, 1)
+        pose_embeddings = pose_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = torch.zeros_like(pose_embeddings)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            pose_embeddings = torch.cat([negative_prompt_embeds, pose_embeddings])
+        return pose_embeddings
+
+    def _encode_image_with_pose(self, image, pose, device, num_images_per_prompt, do_classifier_free_guidance):
+        img_prompt_embeds = self._encode_image(image, device, num_images_per_prompt, False)
+        pose_prompt_embeds = self._encode_pose(pose, device, num_images_per_prompt, False)
+        prompt_embeds = torch.cat([img_prompt_embeds, pose_prompt_embeds], dim=-1)
+        prompt_embeds = self.cc_projection(prompt_embeds)
+        # prompt_embeds = img_prompt_embeds
+        # follow 0123, add negative prompt, after projection
+        if do_classifier_free_guidance:
+            negative_prompt = torch.zeros_like(prompt_embeds)
+            prompt_embeds = torch.cat([negative_prompt, prompt_embeds])
+        return prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, image, height, width, callback_steps):
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_img_latents(self, image, batch_size, dtype, device, generator=None, do_classifier_free_guidance=False):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        if isinstance(image, torch.Tensor):
+            # Batch single image
+            if image.ndim == 3:
+                assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
+                image = image.unsqueeze(0)
+
+            assert image.ndim == 4, "Image must have 4 dimensions"
+
+            # Check image is in [-1, 1]
+            if image.min() < -1 or image.max() > 1:
+                raise ValueError("Image should be in [-1, 1] range")
+        else:
+            # preprocess image
+            if isinstance(image, (PIL.Image.Image, np.ndarray)):
+                image = [image]
+
+            if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+                image = [np.array(i.convert("RGB"))[None, :] for i in image]
+                image = np.concatenate(image, axis=0)
+            elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+                image = np.concatenate([i[None, :] for i in image], axis=0)
+
+            image = image.transpose(0, 3, 1, 2)
+            image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+        image = image.to(device=device, dtype=dtype)
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if isinstance(generator, list):
+            init_latents = [
+                self.vae.encode(image[i : i + 1]).latent_dist.mode(generator[i])
+                for i in range(batch_size)  # sample
+            ]
+            init_latents = torch.cat(init_latents, dim=0)
+        else:
+            init_latents = self.vae.encode(image).latent_dist.mode()
+
+        # init_latents = self.vae.config.scaling_factor * init_latents  # todo in original zero123's inference gradio_new.py, model.encode_first_stage() is not scaled by scaling_factor
+        if batch_size > init_latents.shape[0]:
+            # init_latents = init_latents.repeat(batch_size // init_latents.shape[0], 1, 1, 1)
+            num_images_per_prompt = batch_size // init_latents.shape[0]
+            # duplicate image latents for each generation per prompt, using mps friendly method
+            bs_embed, emb_c, emb_h, emb_w = init_latents.shape
+            init_latents = init_latents.unsqueeze(1)
+            init_latents = init_latents.repeat(1, num_images_per_prompt, 1, 1, 1)
+            init_latents = init_latents.view(bs_embed * num_images_per_prompt, emb_c, emb_h, emb_w)
+
+        # init_latents = torch.cat([init_latents]*2) if do_classifier_free_guidance else init_latents   # follow zero123
+        init_latents = (
+            torch.cat([torch.zeros_like(init_latents), init_latents]) if do_classifier_free_guidance else init_latents
+        )
+
+        init_latents = init_latents.to(device=device, dtype=dtype)
+        return init_latents
+
+    # def load_cc_projection(self, pretrained_weights=None):
+    #     self.cc_projection = torch.nn.Linear(772, 768)
+    #     torch.nn.init.eye_(list(self.cc_projection.parameters())[0][:768, :768])
+    #     torch.nn.init.zeros_(list(self.cc_projection.parameters())[1])
+    #     if pretrained_weights is not None:
+    #         self.cc_projection.load_state_dict(pretrained_weights)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        input_imgs: Union[torch.Tensor, PIL.Image.Image] = None,
+        prompt_imgs: Union[torch.Tensor, PIL.Image.Image] = None,
+        poses: Union[List[float], List[List[float]]] = None,
+        torch_dtype=torch.float32,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 3.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: float = 1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            input_imgs (`PIL` or `List[PIL]`, *optional*):
+                The single input image for each 3D object
+            prompt_imgs (`PIL` or `List[PIL]`, *optional*):
+                Same as input_imgs, but will be used later as an image prompt condition, encoded by CLIP feature
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        # input_image = hint_imgs
+        self.check_inputs(input_imgs, height, width, callback_steps)
+
+        # 2. Define call parameters
+        if isinstance(input_imgs, PIL.Image.Image):
+            batch_size = 1
+        elif isinstance(input_imgs, list):
+            batch_size = len(input_imgs)
+        else:
+            batch_size = input_imgs.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input image with pose as prompt
+        prompt_embeds = self._encode_image_with_pose(
+            prompt_imgs, poses, device, num_images_per_prompt, do_classifier_free_guidance
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            4,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare image latents
+        img_latents = self.prepare_img_latents(
+            input_imgs,
+            batch_size * num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                latent_model_input = torch.cat([latent_model_input, img_latents], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                # latents = self.scheduler.step(noise_pred.to(dtype=torch.float32), t, latents.to(dtype=torch.float32)).prev_sample.to(prompt_embeds.dtype)
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 8. Post-processing
+        has_nsfw_concept = None
+        if output_type == "latent":
+            image = latents
+        elif output_type == "pil":
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+        else:
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/regional_prompting_stable_diffusion.py b/diffusers/examples/community/regional_prompting_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a022987ba9d379982461056619401aaf44e0341
--- /dev/null
+++ b/diffusers/examples/community/regional_prompting_stable_diffusion.py
@@ -0,0 +1,618 @@
+import math
+from typing import Dict, Optional
+
+import torch
+import torchvision.transforms.functional as FF
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import StableDiffusionPipeline
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import USE_PEFT_BACKEND
+
+
+try:
+    from compel import Compel
+except ImportError:
+    Compel = None
+
+KCOMM = "ADDCOMM"
+KBRK = "BREAK"
+
+
+class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
+    r"""
+    Args for Regional Prompting Pipeline:
+        rp_args:dict
+        Required
+            rp_args["mode"]: cols, rows, prompt, prompt-ex
+        for cols, rows mode
+            rp_args["div"]: ex) 1;1;1(Divide into 3 regions)
+        for prompt, prompt-ex mode
+            rp_args["th"]: ex) 0.5,0.5,0.6 (threshold for prompt mode)
+
+        Optional
+            rp_args["save_mask"]: True/False (save masks in prompt mode)
+
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__(
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            scheduler,
+            safety_checker,
+            feature_extractor,
+            requires_safety_checker,
+        )
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: str,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: str = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        rp_args: Dict[str, str] = None,
+    ):
+        active = KBRK in prompt[0] if isinstance(prompt, list) else KBRK in prompt
+        if negative_prompt is None:
+            negative_prompt = "" if isinstance(prompt, str) else [""] * len(prompt)
+
+        device = self._execution_device
+        regions = 0
+
+        self.power = int(rp_args["power"]) if "power" in rp_args else 1
+
+        prompts = prompt if isinstance(prompt, list) else [prompt]
+        n_prompts = negative_prompt if isinstance(prompt, str) else [negative_prompt]
+        self.batch = batch = num_images_per_prompt * len(prompts)
+        all_prompts_cn, all_prompts_p = promptsmaker(prompts, num_images_per_prompt)
+        all_n_prompts_cn, _ = promptsmaker(n_prompts, num_images_per_prompt)
+
+        equal = len(all_prompts_cn) == len(all_n_prompts_cn)
+
+        if Compel:
+            compel = Compel(tokenizer=self.tokenizer, text_encoder=self.text_encoder)
+
+            def getcompelembs(prps):
+                embl = []
+                for prp in prps:
+                    embl.append(compel.build_conditioning_tensor(prp))
+                return torch.cat(embl)
+
+            conds = getcompelembs(all_prompts_cn)
+            unconds = getcompelembs(all_n_prompts_cn)
+            embs = getcompelembs(prompts)
+            n_embs = getcompelembs(n_prompts)
+            prompt = negative_prompt = None
+        else:
+            conds = self.encode_prompt(prompts, device, 1, True)[0]
+            unconds = (
+                self.encode_prompt(n_prompts, device, 1, True)[0]
+                if equal
+                else self.encode_prompt(all_n_prompts_cn, device, 1, True)[0]
+            )
+            embs = n_embs = None
+
+        if not active:
+            pcallback = None
+            mode = None
+        else:
+            if any(x in rp_args["mode"].upper() for x in ["COL", "ROW"]):
+                mode = "COL" if "COL" in rp_args["mode"].upper() else "ROW"
+                ocells, icells, regions = make_cells(rp_args["div"])
+
+            elif "PRO" in rp_args["mode"].upper():
+                regions = len(all_prompts_p[0])
+                mode = "PROMPT"
+                reset_attnmaps(self)
+                self.ex = "EX" in rp_args["mode"].upper()
+                self.target_tokens = target_tokens = tokendealer(self, all_prompts_p)
+                thresholds = [float(x) for x in rp_args["th"].split(",")]
+
+            orig_hw = (height, width)
+            revers = True
+
+            def pcallback(s_self, step: int, timestep: int, latents: torch.Tensor, selfs=None):
+                if "PRO" in mode:  # in Prompt mode, make masks from sum of attension maps
+                    self.step = step
+
+                    if len(self.attnmaps_sizes) > 3:
+                        self.history[step] = self.attnmaps.copy()
+                        for hw in self.attnmaps_sizes:
+                            allmasks = []
+                            basemasks = [None] * batch
+                            for tt, th in zip(target_tokens, thresholds):
+                                for b in range(batch):
+                                    key = f"{tt}-{b}"
+                                    _, mask, _ = makepmask(self, self.attnmaps[key], hw[0], hw[1], th, step)
+                                    mask = mask.unsqueeze(0).unsqueeze(-1)
+                                    if self.ex:
+                                        allmasks[b::batch] = [x - mask for x in allmasks[b::batch]]
+                                        allmasks[b::batch] = [torch.where(x > 0, 1, 0) for x in allmasks[b::batch]]
+                                    allmasks.append(mask)
+                                    basemasks[b] = mask if basemasks[b] is None else basemasks[b] + mask
+                            basemasks = [1 - mask for mask in basemasks]
+                            basemasks = [torch.where(x > 0, 1, 0) for x in basemasks]
+                            allmasks = basemasks + allmasks
+
+                            self.attnmasks[hw] = torch.cat(allmasks)
+                        self.maskready = True
+                return latents
+
+            def hook_forward(module):
+                # diffusers==0.23.2
+                def forward(
+                    hidden_states: torch.Tensor,
+                    encoder_hidden_states: Optional[torch.Tensor] = None,
+                    attention_mask: Optional[torch.Tensor] = None,
+                    temb: Optional[torch.Tensor] = None,
+                    scale: float = 1.0,
+                ) -> torch.Tensor:
+                    attn = module
+                    xshape = hidden_states.shape
+                    self.hw = (h, w) = split_dims(xshape[1], *orig_hw)
+
+                    if revers:
+                        nx, px = hidden_states.chunk(2)
+                    else:
+                        px, nx = hidden_states.chunk(2)
+
+                    if equal:
+                        hidden_states = torch.cat(
+                            [px for i in range(regions)] + [nx for i in range(regions)],
+                            0,
+                        )
+                        encoder_hidden_states = torch.cat([conds] + [unconds])
+                    else:
+                        hidden_states = torch.cat([px for i in range(regions)] + [nx], 0)
+                        encoder_hidden_states = torch.cat([conds] + [unconds])
+
+                    residual = hidden_states
+
+                    args = () if USE_PEFT_BACKEND else (scale,)
+
+                    if attn.spatial_norm is not None:
+                        hidden_states = attn.spatial_norm(hidden_states, temb)
+
+                    input_ndim = hidden_states.ndim
+
+                    if input_ndim == 4:
+                        batch_size, channel, height, width = hidden_states.shape
+                        hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+                    batch_size, sequence_length, _ = (
+                        hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+                    )
+
+                    if attention_mask is not None:
+                        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+                        attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+                    if attn.group_norm is not None:
+                        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+                    args = () if USE_PEFT_BACKEND else (scale,)
+                    query = attn.to_q(hidden_states, *args)
+
+                    if encoder_hidden_states is None:
+                        encoder_hidden_states = hidden_states
+                    elif attn.norm_cross:
+                        encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+                    key = attn.to_k(encoder_hidden_states, *args)
+                    value = attn.to_v(encoder_hidden_states, *args)
+
+                    inner_dim = key.shape[-1]
+                    head_dim = inner_dim // attn.heads
+
+                    query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+                    key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+                    value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+                    # the output of sdp = (batch, num_heads, seq_len, head_dim)
+                    # TODO: add support for attn.scale when we move to Torch 2.1
+                    hidden_states = scaled_dot_product_attention(
+                        self,
+                        query,
+                        key,
+                        value,
+                        attn_mask=attention_mask,
+                        dropout_p=0.0,
+                        is_causal=False,
+                        getattn="PRO" in mode,
+                    )
+
+                    hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+                    hidden_states = hidden_states.to(query.dtype)
+
+                    # linear proj
+                    hidden_states = attn.to_out[0](hidden_states, *args)
+                    # dropout
+                    hidden_states = attn.to_out[1](hidden_states)
+
+                    if input_ndim == 4:
+                        hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+                    if attn.residual_connection:
+                        hidden_states = hidden_states + residual
+
+                    hidden_states = hidden_states / attn.rescale_output_factor
+
+                    #### Regional Prompting Col/Row mode
+                    if any(x in mode for x in ["COL", "ROW"]):
+                        reshaped = hidden_states.reshape(hidden_states.size()[0], h, w, hidden_states.size()[2])
+                        center = reshaped.shape[0] // 2
+                        px = reshaped[0:center] if equal else reshaped[0:-batch]
+                        nx = reshaped[center:] if equal else reshaped[-batch:]
+                        outs = [px, nx] if equal else [px]
+                        for out in outs:
+                            c = 0
+                            for i, ocell in enumerate(ocells):
+                                for icell in icells[i]:
+                                    if "ROW" in mode:
+                                        out[
+                                            0:batch,
+                                            int(h * ocell[0]) : int(h * ocell[1]),
+                                            int(w * icell[0]) : int(w * icell[1]),
+                                            :,
+                                        ] = out[
+                                            c * batch : (c + 1) * batch,
+                                            int(h * ocell[0]) : int(h * ocell[1]),
+                                            int(w * icell[0]) : int(w * icell[1]),
+                                            :,
+                                        ]
+                                    else:
+                                        out[
+                                            0:batch,
+                                            int(h * icell[0]) : int(h * icell[1]),
+                                            int(w * ocell[0]) : int(w * ocell[1]),
+                                            :,
+                                        ] = out[
+                                            c * batch : (c + 1) * batch,
+                                            int(h * icell[0]) : int(h * icell[1]),
+                                            int(w * ocell[0]) : int(w * ocell[1]),
+                                            :,
+                                        ]
+                                    c += 1
+                        px, nx = (px[0:batch], nx[0:batch]) if equal else (px[0:batch], nx)
+                        hidden_states = torch.cat([nx, px], 0) if revers else torch.cat([px, nx], 0)
+                        hidden_states = hidden_states.reshape(xshape)
+
+                    #### Regional Prompting Prompt mode
+                    elif "PRO" in mode:
+                        px, nx = (
+                            torch.chunk(hidden_states) if equal else hidden_states[0:-batch],
+                            hidden_states[-batch:],
+                        )
+
+                        if (h, w) in self.attnmasks and self.maskready:
+
+                            def mask(input):
+                                out = torch.multiply(input, self.attnmasks[(h, w)])
+                                for b in range(batch):
+                                    for r in range(1, regions):
+                                        out[b] = out[b] + out[r * batch + b]
+                                return out
+
+                            px, nx = (mask(px), mask(nx)) if equal else (mask(px), nx)
+                        px, nx = (px[0:batch], nx[0:batch]) if equal else (px[0:batch], nx)
+                        hidden_states = torch.cat([nx, px], 0) if revers else torch.cat([px, nx], 0)
+                    return hidden_states
+
+                return forward
+
+            def hook_forwards(root_module: torch.nn.Module):
+                for name, module in root_module.named_modules():
+                    if "attn2" in name and module.__class__.__name__ == "Attention":
+                        module.forward = hook_forward(module)
+
+            hook_forwards(self.unet)
+
+        output = StableDiffusionPipeline(**self.components)(
+            prompt=prompt,
+            prompt_embeds=embs,
+            negative_prompt=negative_prompt,
+            negative_prompt_embeds=n_embs,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback_on_step_end=pcallback,
+        )
+
+        if "save_mask" in rp_args:
+            save_mask = rp_args["save_mask"]
+        else:
+            save_mask = False
+
+        if mode == "PROMPT" and save_mask:
+            saveattnmaps(
+                self,
+                output,
+                height,
+                width,
+                thresholds,
+                num_inference_steps // 2,
+                regions,
+            )
+
+        return output
+
+
+### Make prompt list for each regions
+def promptsmaker(prompts, batch):
+    out_p = []
+    plen = len(prompts)
+    for prompt in prompts:
+        add = ""
+        if KCOMM in prompt:
+            add, prompt = prompt.split(KCOMM)
+            add = add + " "
+        prompts = prompt.split(KBRK)
+        out_p.append([add + p for p in prompts])
+    out = [None] * batch * len(out_p[0]) * len(out_p)
+    for p, prs in enumerate(out_p):  # inputs prompts
+        for r, pr in enumerate(prs):  # prompts for regions
+            start = (p + r * plen) * batch
+            out[start : start + batch] = [pr] * batch  # P1R1B1,P1R1B2...,P1R2B1,P1R2B2...,P2R1B1...
+    return out, out_p
+
+
+### make regions from ratios
+### ";" makes outercells, "," makes inner cells
+def make_cells(ratios):
+    if ";" not in ratios and "," in ratios:
+        ratios = ratios.replace(",", ";")
+    ratios = ratios.split(";")
+    ratios = [inratios.split(",") for inratios in ratios]
+
+    icells = []
+    ocells = []
+
+    def startend(cells, array):
+        current_start = 0
+        array = [float(x) for x in array]
+        for value in array:
+            end = current_start + (value / sum(array))
+            cells.append([current_start, end])
+            current_start = end
+
+    startend(ocells, [r[0] for r in ratios])
+
+    for inratios in ratios:
+        if 2 > len(inratios):
+            icells.append([[0, 1]])
+        else:
+            add = []
+            startend(add, inratios[1:])
+            icells.append(add)
+
+    return ocells, icells, sum(len(cell) for cell in icells)
+
+
+def make_emblist(self, prompts):
+    with torch.no_grad():
+        tokens = self.tokenizer(
+            prompts,
+            max_length=self.tokenizer.model_max_length,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+        ).input_ids.to(self.device)
+        embs = self.text_encoder(tokens, output_hidden_states=True).last_hidden_state.to(self.device, dtype=self.dtype)
+    return embs
+
+
+def split_dims(xs, height, width):
+    def repeat_div(x, y):
+        while y > 0:
+            x = math.ceil(x / 2)
+            y = y - 1
+        return x
+
+    scale = math.ceil(math.log2(math.sqrt(height * width / xs)))
+    dsh = repeat_div(height, scale)
+    dsw = repeat_div(width, scale)
+    return dsh, dsw
+
+
+##### for prompt mode
+def get_attn_maps(self, attn):
+    height, width = self.hw
+    target_tokens = self.target_tokens
+    if (height, width) not in self.attnmaps_sizes:
+        self.attnmaps_sizes.append((height, width))
+
+    for b in range(self.batch):
+        for t in target_tokens:
+            power = self.power
+            add = attn[b, :, :, t[0] : t[0] + len(t)] ** (power) * (self.attnmaps_sizes.index((height, width)) + 1)
+            add = torch.sum(add, dim=2)
+            key = f"{t}-{b}"
+            if key not in self.attnmaps:
+                self.attnmaps[key] = add
+            else:
+                if self.attnmaps[key].shape[1] != add.shape[1]:
+                    add = add.view(8, height, width)
+                    add = FF.resize(add, self.attnmaps_sizes[0], antialias=None)
+                    add = add.reshape_as(self.attnmaps[key])
+
+                self.attnmaps[key] = self.attnmaps[key] + add
+
+
+def reset_attnmaps(self):  # init parameters in every batch
+    self.step = 0
+    self.attnmaps = {}  # maked from attention maps
+    self.attnmaps_sizes = []  # height,width set of u-net blocks
+    self.attnmasks = {}  # maked from attnmaps for regions
+    self.maskready = False
+    self.history = {}
+
+
+def saveattnmaps(self, output, h, w, th, step, regions):
+    masks = []
+    for i, mask in enumerate(self.history[step].values()):
+        img, _, mask = makepmask(self, mask, h, w, th[i % len(th)], step)
+        if self.ex:
+            masks = [x - mask for x in masks]
+            masks.append(mask)
+            if len(masks) == regions - 1:
+                output.images.extend([FF.to_pil_image(mask) for mask in masks])
+                masks = []
+        else:
+            output.images.append(img)
+
+
+def makepmask(
+    self, mask, h, w, th, step
+):  # make masks from attention cache return [for preview, for attention, for Latent]
+    th = th - step * 0.005
+    if 0.05 >= th:
+        th = 0.05
+    mask = torch.mean(mask, dim=0)
+    mask = mask / mask.max().item()
+    mask = torch.where(mask > th, 1, 0)
+    mask = mask.float()
+    mask = mask.view(1, *self.attnmaps_sizes[0])
+    img = FF.to_pil_image(mask)
+    img = img.resize((w, h))
+    mask = FF.resize(mask, (h, w), interpolation=FF.InterpolationMode.NEAREST, antialias=None)
+    lmask = mask
+    mask = mask.reshape(h * w)
+    mask = torch.where(mask > 0.1, 1, 0)
+    return img, mask, lmask
+
+
+def tokendealer(self, all_prompts):
+    for prompts in all_prompts:
+        targets = [p.split(",")[-1] for p in prompts[1:]]
+        tt = []
+
+        for target in targets:
+            ptokens = (
+                self.tokenizer(
+                    prompts,
+                    max_length=self.tokenizer.model_max_length,
+                    padding=True,
+                    truncation=True,
+                    return_tensors="pt",
+                ).input_ids
+            )[0]
+            ttokens = (
+                self.tokenizer(
+                    target,
+                    max_length=self.tokenizer.model_max_length,
+                    padding=True,
+                    truncation=True,
+                    return_tensors="pt",
+                ).input_ids
+            )[0]
+
+            tlist = []
+
+            for t in range(ttokens.shape[0] - 2):
+                for p in range(ptokens.shape[0]):
+                    if ttokens[t + 1] == ptokens[p]:
+                        tlist.append(p)
+            if tlist != []:
+                tt.append(tlist)
+
+    return tt
+
+
+def scaled_dot_product_attention(
+    self,
+    query,
+    key,
+    value,
+    attn_mask=None,
+    dropout_p=0.0,
+    is_causal=False,
+    scale=None,
+    getattn=False,
+) -> torch.Tensor:
+    # Efficient implementation equivalent to the following:
+    L, S = query.size(-2), key.size(-2)
+    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=self.device)
+    if is_causal:
+        assert attn_mask is None
+        temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
+        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+        attn_bias.to(query.dtype)
+
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf"))
+        else:
+            attn_bias += attn_mask
+    attn_weight = query @ key.transpose(-2, -1) * scale_factor
+    attn_weight += attn_bias
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    if getattn:
+        get_attn_maps(self, attn_weight)
+    attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+    return attn_weight @ value
diff --git a/diffusers/examples/community/rerender_a_video.py b/diffusers/examples/community/rerender_a_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9c616ab5ebc9aa6c504e809e21c5c83f87b57b2
--- /dev/null
+++ b/diffusers/examples/community/rerender_a_video.py
@@ -0,0 +1,1194 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+import torchvision.transforms as T
+from gmflow.gmflow import GMFlow
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
+from diffusers.models.attention_processor import Attention, AttnProcessor
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.controlnet.pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import BaseOutput, deprecate, logging
+from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def coords_grid(b, h, w, homogeneous=False, device=None):
+    y, x = torch.meshgrid(torch.arange(h), torch.arange(w))  # [H, W]
+
+    stacks = [x, y]
+
+    if homogeneous:
+        ones = torch.ones_like(x)  # [H, W]
+        stacks.append(ones)
+
+    grid = torch.stack(stacks, dim=0).float()  # [2, H, W] or [3, H, W]
+
+    grid = grid[None].repeat(b, 1, 1, 1)  # [B, 2, H, W] or [B, 3, H, W]
+
+    if device is not None:
+        grid = grid.to(device)
+
+    return grid
+
+
+def bilinear_sample(img, sample_coords, mode="bilinear", padding_mode="zeros", return_mask=False):
+    # img: [B, C, H, W]
+    # sample_coords: [B, 2, H, W] in image scale
+    if sample_coords.size(1) != 2:  # [B, H, W, 2]
+        sample_coords = sample_coords.permute(0, 3, 1, 2)
+
+    b, _, h, w = sample_coords.shape
+
+    # Normalize to [-1, 1]
+    x_grid = 2 * sample_coords[:, 0] / (w - 1) - 1
+    y_grid = 2 * sample_coords[:, 1] / (h - 1) - 1
+
+    grid = torch.stack([x_grid, y_grid], dim=-1)  # [B, H, W, 2]
+
+    img = F.grid_sample(img, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
+
+    if return_mask:
+        mask = (x_grid >= -1) & (y_grid >= -1) & (x_grid <= 1) & (y_grid <= 1)  # [B, H, W]
+
+        return img, mask
+
+    return img
+
+
+def flow_warp(feature, flow, mask=False, mode="bilinear", padding_mode="zeros"):
+    b, c, h, w = feature.size()
+    assert flow.size(1) == 2
+
+    grid = coords_grid(b, h, w).to(flow.device) + flow  # [B, 2, H, W]
+    grid = grid.to(feature.dtype)
+    return bilinear_sample(feature, grid, mode=mode, padding_mode=padding_mode, return_mask=mask)
+
+
+def forward_backward_consistency_check(fwd_flow, bwd_flow, alpha=0.01, beta=0.5):
+    # fwd_flow, bwd_flow: [B, 2, H, W]
+    # alpha and beta values are following UnFlow
+    # (https://arxiv.org/abs/1711.07837)
+    assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4
+    assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2
+    flow_mag = torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1)  # [B, H, W]
+
+    warped_bwd_flow = flow_warp(bwd_flow, fwd_flow)  # [B, 2, H, W]
+    warped_fwd_flow = flow_warp(fwd_flow, bwd_flow)  # [B, 2, H, W]
+
+    diff_fwd = torch.norm(fwd_flow + warped_bwd_flow, dim=1)  # [B, H, W]
+    diff_bwd = torch.norm(bwd_flow + warped_fwd_flow, dim=1)
+
+    threshold = alpha * flow_mag + beta
+
+    fwd_occ = (diff_fwd > threshold).float()  # [B, H, W]
+    bwd_occ = (diff_bwd > threshold).float()
+
+    return fwd_occ, bwd_occ
+
+
+@torch.no_grad()
+def get_warped_and_mask(flow_model, image1, image2, image3=None, pixel_consistency=False, device=None):
+    if image3 is None:
+        image3 = image1
+    padder = InputPadder(image1.shape, padding_factor=8)
+    image1, image2 = padder.pad(image1[None].to(device), image2[None].to(device))
+    results_dict = flow_model(
+        image1, image2, attn_splits_list=[2], corr_radius_list=[-1], prop_radius_list=[-1], pred_bidir_flow=True
+    )
+    flow_pr = results_dict["flow_preds"][-1]  # [B, 2, H, W]
+    fwd_flow = padder.unpad(flow_pr[0]).unsqueeze(0)  # [1, 2, H, W]
+    bwd_flow = padder.unpad(flow_pr[1]).unsqueeze(0)  # [1, 2, H, W]
+    fwd_occ, bwd_occ = forward_backward_consistency_check(fwd_flow, bwd_flow)  # [1, H, W] float
+    if pixel_consistency:
+        warped_image1 = flow_warp(image1, bwd_flow)
+        bwd_occ = torch.clamp(
+            bwd_occ + (abs(image2 - warped_image1).mean(dim=1) > 255 * 0.25).float(), 0, 1
+        ).unsqueeze(0)
+    warped_results = flow_warp(image3, bwd_flow)
+    return warped_results, bwd_occ, bwd_flow
+
+
+blur = T.GaussianBlur(kernel_size=(9, 9), sigma=(18, 18))
+
+
+@dataclass
+class TextToVideoSDPipelineOutput(BaseOutput):
+    """
+    Output class for text-to-video pipelines.
+
+    Args:
+        frames (`List[np.ndarray]` or `torch.Tensor`)
+            List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as
+            a `torch` tensor. The length of the list denotes the video length (the number of frames).
+    """
+
+    frames: Union[List[np.ndarray], torch.Tensor]
+
+
+@torch.no_grad()
+def find_flat_region(mask):
+    device = mask.device
+    kernel_x = torch.Tensor([[-1, 0, 1], [-1, 0, 1], [-1, 0, 1]]).unsqueeze(0).unsqueeze(0).to(device)
+    kernel_y = torch.Tensor([[-1, -1, -1], [0, 0, 0], [1, 1, 1]]).unsqueeze(0).unsqueeze(0).to(device)
+    mask_ = F.pad(mask.unsqueeze(0), (1, 1, 1, 1), mode="replicate")
+
+    grad_x = torch.nn.functional.conv2d(mask_, kernel_x)
+    grad_y = torch.nn.functional.conv2d(mask_, kernel_y)
+    return ((abs(grad_x) + abs(grad_y)) == 0).float()[0]
+
+
+class AttnState:
+    STORE = 0
+    LOAD = 1
+    LOAD_AND_STORE_PREV = 2
+
+    def __init__(self):
+        self.reset()
+
+    @property
+    def state(self):
+        return self.__state
+
+    @property
+    def timestep(self):
+        return self.__timestep
+
+    def set_timestep(self, t):
+        self.__timestep = t
+
+    def reset(self):
+        self.__state = AttnState.STORE
+        self.__timestep = 0
+
+    def to_load(self):
+        self.__state = AttnState.LOAD
+
+    def to_load_and_store_prev(self):
+        self.__state = AttnState.LOAD_AND_STORE_PREV
+
+
+class CrossFrameAttnProcessor(AttnProcessor):
+    """
+    Cross frame attention processor. Each frame attends the first frame and previous frame.
+
+    Args:
+        attn_state: Whether the model is processing the first frame or an intermediate frame
+    """
+
+    def __init__(self, attn_state: AttnState):
+        super().__init__()
+        self.attn_state = attn_state
+        self.first_maps = {}
+        self.prev_maps = {}
+
+    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None):
+        # Is self attention
+        if encoder_hidden_states is None:
+            t = self.attn_state.timestep
+            if self.attn_state.state == AttnState.STORE:
+                self.first_maps[t] = hidden_states.detach()
+                self.prev_maps[t] = hidden_states.detach()
+                res = super().__call__(attn, hidden_states, encoder_hidden_states, attention_mask, temb)
+            else:
+                if self.attn_state.state == AttnState.LOAD_AND_STORE_PREV:
+                    tmp = hidden_states.detach()
+                cross_map = torch.cat((self.first_maps[t], self.prev_maps[t]), dim=1)
+                res = super().__call__(attn, hidden_states, cross_map, attention_mask, temb)
+                if self.attn_state.state == AttnState.LOAD_AND_STORE_PREV:
+                    self.prev_maps[t] = tmp
+        else:
+            res = super().__call__(attn, hidden_states, encoder_hidden_states, attention_mask, temb)
+
+        return res
+
+
+def prepare_image(image):
+    if isinstance(image, torch.Tensor):
+        # Batch single image
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        image = image.to(dtype=torch.float32)
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    return image
+
+
+class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline):
+    r"""
+    Pipeline for video-to-video translation using Stable Diffusion with Rerender Algorithm.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    In addition the pipeline inherits the following loading methods:
+        - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
+            as a list, the outputs from each ControlNet are added together to create one combined additional
+            conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder=None,
+        requires_safety_checker: bool = True,
+        device=None,
+    ):
+        super().__init__(
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            controlnet,
+            scheduler,
+            safety_checker,
+            feature_extractor,
+            image_encoder,
+            requires_safety_checker,
+        )
+        self.to(device)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+        self.attn_state = AttnState()
+        attn_processor_dict = {}
+        for k in unet.attn_processors.keys():
+            if k.startswith("up"):
+                attn_processor_dict[k] = CrossFrameAttnProcessor(self.attn_state)
+            else:
+                attn_processor_dict[k] = AttnProcessor()
+
+        self.unet.set_attn_processor(attn_processor_dict)
+
+        flow_model = GMFlow(
+            feature_channels=128,
+            num_scales=1,
+            upsample_factor=8,
+            num_head=1,
+            attention_type="swin",
+            ffn_dim_expansion=4,
+            num_transformer_layers=6,
+        ).to(self.device)
+
+        checkpoint = torch.utils.model_zoo.load_url(
+            "https://huggingface.co/Anonymous-sub/Rerender/resolve/main/models/gmflow_sintel-0c07dcb3.pth",
+            map_location=lambda storage, loc: storage,
+        )
+        weights = checkpoint["model"] if "model" in checkpoint else checkpoint
+        flow_model.load_state_dict(weights, strict=False)
+        flow_model.eval()
+        self.flow_model = flow_model
+
+    # Modified from src/diffusers/pipelines/controlnet/pipeline_controlnet.StableDiffusionControlNetImg2ImgPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = self.vae.encode(image).latent_dist.sample(generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        frames: Union[List[np.ndarray], torch.Tensor] = None,
+        control_frames: Union[List[np.ndarray], torch.Tensor] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        warp_start: Union[float, List[float]] = 0.0,
+        warp_end: Union[float, List[float]] = 0.3,
+        mask_start: Union[float, List[float]] = 0.5,
+        mask_end: Union[float, List[float]] = 0.8,
+        smooth_boundary: bool = True,
+        mask_strength: Union[float, List[float]] = 0.5,
+        inner_strength: Union[float, List[float]] = 0.9,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            frames (`List[np.ndarray]` or `torch.Tensor`): The input images to be used as the starting point for the image generation process.
+            control_frames (`List[np.ndarray]` or `torch.Tensor`): The ControlNet input images condition to provide guidance to the `unet` for generation.
+            strength ('float'): SDEdit strength.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+                corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting
+                than for [`~StableDiffusionControlNetPipeline.__call__`].
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
+                you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the controlnet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the controlnet stops applying.
+            warp_start (`float`): Shape-aware fusion start timestep.
+            warp_end (`float`): Shape-aware fusion end timestep.
+            mask_start (`float`): Pixel-aware fusion start timestep.
+            mask_end (`float`):Pixel-aware fusion end timestep.
+            smooth_boundary (`bool`): Smooth fusion boundary. Set `True` to prevent artifacts at boundary.
+            mask_strength (`float`): Pixel-aware fusion strength.
+            inner_strength (`float`): Pixel-aware fusion detail level.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+        )
+
+        # 2. Define call parameters
+        # Currently we only support 1 prompt
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            assert False
+        else:
+            assert False
+        num_images_per_prompt = 1
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+
+        # 4. Process the first frame
+        height, width = None, None
+        output_frames = []
+        self.attn_state.reset()
+
+        # 4.1 prepare frames
+        image = self.image_processor.preprocess(frames[0]).to(dtype=torch.float32)
+        first_image = image[0]  # C, H, W
+
+        # 4.2 Prepare controlnet_conditioning_image
+        # Currently we only support single control
+        if isinstance(controlnet, ControlNetModel):
+            control_image = self.prepare_control_image(
+                image=control_frames[0],
+                width=width,
+                height=height,
+                batch_size=batch_size,
+                num_images_per_prompt=1,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        else:
+            assert False
+
+        # 4.3 Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, cur_num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size)
+
+        # 4.4 Prepare latent variables
+        latents = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        # 4.5 Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 4.6 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        first_x0_list = []
+
+        # 4.7 Denoising loop
+        num_warmup_steps = len(timesteps) - cur_num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=cur_num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                self.attn_state.set_timestep(t.item())
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                if guess_mode and do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+
+                if guess_mode and do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                alpha_prod_t = self.scheduler.alphas_cumprod[t]
+                beta_prod_t = 1 - alpha_prod_t
+                pred_x0 = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+                first_x0 = pred_x0.detach()
+                first_x0_list.append(first_x0)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+
+        first_result = image
+        prev_result = image
+        do_denormalize = [True] * image.shape[0]
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        output_frames.append(image[0])
+
+        # 5. Process each frame
+        for idx in range(1, len(frames)):
+            image = frames[idx]
+            prev_image = frames[idx - 1]
+            control_image = control_frames[idx]
+            # 5.1 prepare frames
+            image = self.image_processor.preprocess(image).to(dtype=torch.float32)
+            prev_image = self.image_processor.preprocess(prev_image).to(dtype=torch.float32)
+
+            warped_0, bwd_occ_0, bwd_flow_0 = get_warped_and_mask(
+                self.flow_model, first_image, image[0], first_result, False, self.device
+            )
+            blend_mask_0 = blur(F.max_pool2d(bwd_occ_0, kernel_size=9, stride=1, padding=4))
+            blend_mask_0 = torch.clamp(blend_mask_0 + bwd_occ_0, 0, 1)
+
+            warped_pre, bwd_occ_pre, bwd_flow_pre = get_warped_and_mask(
+                self.flow_model, prev_image[0], image[0], prev_result, False, self.device
+            )
+            blend_mask_pre = blur(F.max_pool2d(bwd_occ_pre, kernel_size=9, stride=1, padding=4))
+            blend_mask_pre = torch.clamp(blend_mask_pre + bwd_occ_pre, 0, 1)
+
+            warp_mask = 1 - F.max_pool2d(blend_mask_0, kernel_size=8)
+            warp_flow = F.interpolate(bwd_flow_0 / 8.0, scale_factor=1.0 / 8, mode="bilinear")
+
+            # 5.2 Prepare controlnet_conditioning_image
+            # Currently we only support single control
+            if isinstance(controlnet, ControlNetModel):
+                control_image = self.prepare_control_image(
+                    image=control_image,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size,
+                    num_images_per_prompt=1,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+            else:
+                assert False
+
+            # 5.3 Prepare timesteps
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps, cur_num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+            latent_timestep = timesteps[:1].repeat(batch_size)
+
+            skip_t = int(num_inference_steps * (1 - strength))
+            warp_start_t = int(warp_start * num_inference_steps)
+            warp_end_t = int(warp_end * num_inference_steps)
+            mask_start_t = int(mask_start * num_inference_steps)
+            mask_end_t = int(mask_end * num_inference_steps)
+
+            # 5.4 Prepare latent variables
+            init_latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                device,
+                generator,
+            )
+
+            # 5.5 Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+            # 5.6 Create tensor stating which controlnets to keep
+            controlnet_keep = []
+            for i in range(len(timesteps)):
+                keeps = [
+                    1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                    for s, e in zip(control_guidance_start, control_guidance_end)
+                ]
+                controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+            # 5.7 Denoising loop
+            num_warmup_steps = len(timesteps) - cur_num_inference_steps * self.scheduler.order
+
+            def denoising_loop(latents, mask=None, xtrg=None, noise_rescale=None):
+                dir_xt = 0
+                latents_dtype = latents.dtype
+                with self.progress_bar(total=cur_num_inference_steps) as progress_bar:
+                    for i, t in enumerate(timesteps):
+                        self.attn_state.set_timestep(t.item())
+                        if i + skip_t >= mask_start_t and i + skip_t <= mask_end_t and xtrg is not None:
+                            rescale = torch.maximum(1.0 - mask, (1 - mask**2) ** 0.5 * inner_strength)
+                            if noise_rescale is not None:
+                                rescale = (1.0 - mask) * (1 - noise_rescale) + rescale * noise_rescale
+                            noise = randn_tensor(xtrg.shape, generator=generator, device=device, dtype=xtrg.dtype)
+                            latents_ref = self.scheduler.add_noise(xtrg, noise, t)
+                            latents = latents_ref * mask + (1.0 - mask) * (latents - dir_xt) + rescale * dir_xt
+                            latents = latents.to(latents_dtype)
+
+                        # expand the latents if we are doing classifier free guidance
+                        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                        latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                        # controlnet(s) inference
+                        if guess_mode and do_classifier_free_guidance:
+                            # Infer ControlNet only for the conditional batch.
+                            control_model_input = latents
+                            control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                            controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                        else:
+                            control_model_input = latent_model_input
+                            controlnet_prompt_embeds = prompt_embeds
+
+                        if isinstance(controlnet_keep[i], list):
+                            cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                        else:
+                            controlnet_cond_scale = controlnet_conditioning_scale
+                            if isinstance(controlnet_cond_scale, list):
+                                controlnet_cond_scale = controlnet_cond_scale[0]
+                            cond_scale = controlnet_cond_scale * controlnet_keep[i]
+                        down_block_res_samples, mid_block_res_sample = self.controlnet(
+                            control_model_input,
+                            t,
+                            encoder_hidden_states=controlnet_prompt_embeds,
+                            controlnet_cond=control_image,
+                            conditioning_scale=cond_scale,
+                            guess_mode=guess_mode,
+                            return_dict=False,
+                        )
+
+                        if guess_mode and do_classifier_free_guidance:
+                            # Inferred ControlNet only for the conditional batch.
+                            # To apply the output of ControlNet to both the unconditional and conditional batches,
+                            # add 0 to the unconditional batch to keep it unchanged.
+                            down_block_res_samples = [
+                                torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples
+                            ]
+                            mid_block_res_sample = torch.cat(
+                                [torch.zeros_like(mid_block_res_sample), mid_block_res_sample]
+                            )
+
+                        # predict the noise residual
+                        noise_pred = self.unet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=prompt_embeds,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            down_block_additional_residuals=down_block_res_samples,
+                            mid_block_additional_residual=mid_block_res_sample,
+                            return_dict=False,
+                        )[0]
+
+                        # perform guidance
+                        if do_classifier_free_guidance:
+                            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                        # Get pred_x0 from scheduler
+                        alpha_prod_t = self.scheduler.alphas_cumprod[t]
+                        beta_prod_t = 1 - alpha_prod_t
+                        pred_x0 = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+                        if i + skip_t >= warp_start_t and i + skip_t <= warp_end_t:
+                            # warp x_0
+                            pred_x0 = (
+                                flow_warp(first_x0_list[i], warp_flow, mode="nearest") * warp_mask
+                                + (1 - warp_mask) * pred_x0
+                            )
+
+                            # get x_t from x_0
+                            latents = self.scheduler.add_noise(pred_x0, noise_pred, t).to(latents_dtype)
+
+                        prev_t = t - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
+                        if i == len(timesteps) - 1:
+                            alpha_t_prev = 1.0
+                        else:
+                            alpha_t_prev = self.scheduler.alphas_cumprod[prev_t]
+
+                        dir_xt = (1.0 - alpha_t_prev) ** 0.5 * noise_pred
+
+                        # compute the previous noisy sample x_t -> x_t-1
+                        latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[
+                            0
+                        ]
+
+                        # call the callback, if provided
+                        if i == len(timesteps) - 1 or (
+                            (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                        ):
+                            progress_bar.update()
+                            if callback is not None and i % callback_steps == 0:
+                                callback(i, t, latents)
+
+                    return latents
+
+            if mask_start_t <= mask_end_t:
+                self.attn_state.to_load()
+            else:
+                self.attn_state.to_load_and_store_prev()
+            latents = denoising_loop(init_latents)
+
+            if mask_start_t <= mask_end_t:
+                direct_result = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+                blend_results = (1 - blend_mask_pre) * warped_pre + blend_mask_pre * direct_result
+                blend_results = (1 - blend_mask_0) * warped_0 + blend_mask_0 * blend_results
+
+                bwd_occ = 1 - torch.clamp(1 - bwd_occ_pre + 1 - bwd_occ_0, 0, 1)
+                blend_mask = blur(F.max_pool2d(bwd_occ, kernel_size=9, stride=1, padding=4))
+                blend_mask = 1 - torch.clamp(blend_mask + bwd_occ, 0, 1)
+
+                blend_results = blend_results.to(latents.dtype)
+                xtrg = self.vae.encode(blend_results).latent_dist.sample(generator)
+                xtrg = self.vae.config.scaling_factor * xtrg
+                blend_results_rec = self.vae.decode(xtrg / self.vae.config.scaling_factor, return_dict=False)[0]
+                xtrg_rec = self.vae.encode(blend_results_rec).latent_dist.sample(generator)
+                xtrg_rec = self.vae.config.scaling_factor * xtrg_rec
+                xtrg_ = xtrg + (xtrg - xtrg_rec)
+                blend_results_rec_new = self.vae.decode(xtrg_ / self.vae.config.scaling_factor, return_dict=False)[0]
+                tmp = (abs(blend_results_rec_new - blend_results).mean(dim=1, keepdims=True) > 0.25).float()
+
+                mask_x = F.max_pool2d(
+                    (F.interpolate(tmp, scale_factor=1 / 8.0, mode="bilinear") > 0).float(),
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                )
+
+                mask = 1 - F.max_pool2d(1 - blend_mask, kernel_size=8)  # * (1-mask_x)
+
+                if smooth_boundary:
+                    noise_rescale = find_flat_region(mask)
+                else:
+                    noise_rescale = torch.ones_like(mask)
+
+                xtrg = (xtrg + (1 - mask_x) * (xtrg - xtrg_rec)) * mask
+                xtrg = xtrg.to(latents.dtype)
+
+                self.scheduler.set_timesteps(num_inference_steps, device=device)
+                timesteps, cur_num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+
+                self.attn_state.to_load_and_store_prev()
+                latents = denoising_loop(init_latents, mask * mask_strength, xtrg, noise_rescale)
+
+            if not output_type == "latent":
+                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            else:
+                image = latents
+
+            prev_result = image
+
+            do_denormalize = [True] * image.shape[0]
+            image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+            output_frames.append(image[0])
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return output_frames
+
+        return TextToVideoSDPipelineOutput(frames=output_frames)
+
+
+class InputPadder:
+    """Pads images such that dimensions are divisible by 8"""
+
+    def __init__(self, dims, mode="sintel", padding_factor=8):
+        self.ht, self.wd = dims[-2:]
+        pad_ht = (((self.ht // padding_factor) + 1) * padding_factor - self.ht) % padding_factor
+        pad_wd = (((self.wd // padding_factor) + 1) * padding_factor - self.wd) % padding_factor
+        if mode == "sintel":
+            self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2]
+        else:
+            self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht]
+
+    def pad(self, *inputs):
+        return [F.pad(x, self._pad, mode="replicate") for x in inputs]
+
+    def unpad(self, x):
+        ht, wd = x.shape[-2:]
+        c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]]
+        return x[..., c[0] : c[1], c[2] : c[3]]
diff --git a/diffusers/examples/community/run_onnx_controlnet.py b/diffusers/examples/community/run_onnx_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..af2672c17e594ab1b03c63cdc16ffbefef947a1c
--- /dev/null
+++ b/diffusers/examples/community/run_onnx_controlnet.py
@@ -0,0 +1,911 @@
+import argparse
+import inspect
+import os
+import time
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from PIL import Image
+from transformers import CLIPTokenizer
+
+from diffusers import OnnxRuntimeModel, StableDiffusionImg2ImgPipeline, UniPCMultistepScheduler
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    deprecate,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> import cv2
+        >>> from PIL import Image
+
+        >>> # download an image
+        >>> image = load_image(
+        ...     "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
+        ... )
+        >>> np_image = np.array(image)
+
+        >>> # get canny image
+        >>> np_image = cv2.Canny(np_image, 100, 200)
+        >>> np_image = np_image[:, :, None]
+        >>> np_image = np.concatenate([np_image, np_image, np_image], axis=2)
+        >>> canny_image = Image.fromarray(np_image)
+
+        >>> # load control net and stable diffusion v1-5
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+
+        >>> # speed up diffusion process with faster scheduler and memory optimization
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # generate image
+        >>> generator = torch.manual_seed(0)
+        >>> image = pipe(
+        ...     "futuristic-looking woman",
+        ...     num_inference_steps=20,
+        ...     generator=generator,
+        ...     image=image,
+        ...     control_image=canny_image,
+        ... ).images[0]
+        ```
+"""
+
+
+def prepare_image(image):
+    if isinstance(image, torch.Tensor):
+        # Batch single image
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        image = image.to(dtype=torch.float32)
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    return image
+
+
+class OnnxStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline):
+    vae_encoder: OnnxRuntimeModel
+    vae_decoder: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: OnnxRuntimeModel
+    scheduler: KarrasDiffusionSchedulers
+
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (4 - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: Optional[int],
+        do_classifier_free_guidance: bool,
+        negative_prompt: Optional[str],
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
+
+            if not np.array_equal(text_input_ids, untruncated_ids):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+
+        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        warnings.warn(
+            "The decode_latents method is deprecated and will be removed in a future version. Please"
+            " use VaeImageProcessor instead",
+            FutureWarning,
+        )
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        num_controlnet,
+        prompt,
+        image,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # Check `image`
+        if num_controlnet == 1:
+            self.check_image(image, prompt, prompt_embeds)
+        elif num_controlnet > 1:
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(image) != num_controlnet:
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {num_controlnet} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if num_controlnet == 1:
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif num_controlnet > 1:
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif (
+                isinstance(controlnet_conditioning_scale, list)
+                and len(controlnet_conditioning_scale) != num_controlnet
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if num_controlnet > 1:
+            if len(control_guidance_start) != num_controlnet:
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {num_controlnet} controlnets available. Make sure to provide {num_controlnet}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            _image = image.cpu().detach().numpy()
+            init_latents = self.vae_encoder(sample=_image)[0]
+            init_latents = torch.from_numpy(init_latents).to(device=device, dtype=dtype)
+            init_latents = 0.18215 * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        num_controlnet: int,
+        fp16: bool = True,
+        prompt: Union[str, List[str]] = None,
+        image: Union[
+            torch.Tensor,
+            PIL.Image.Image,
+            np.ndarray,
+            List[torch.Tensor],
+            List[PIL.Image.Image],
+            List[np.ndarray],
+        ] = None,
+        control_image: Union[
+            torch.Tensor,
+            PIL.Image.Image,
+            np.ndarray,
+            List[torch.Tensor],
+            List[PIL.Image.Image],
+            List[np.ndarray],
+        ] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The initial image will be used as the starting point for the image generation process. Can also accept
+                image latents as `image`, if passing latents directly, it will not be encoded again.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
+                also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
+                height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
+                specified in init, images must be passed as a list such that each element of the list can be correctly
+                batched for input to a single controlnet.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+                corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting
+                than for [`~StableDiffusionControlNetPipeline.__call__`].
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
+                you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the controlnet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the controlnet stops applying.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if fp16:
+            torch_dtype = torch.float16
+            np_dtype = np.float16
+        else:
+            torch_dtype = torch.float32
+            np_dtype = np.float32
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = num_controlnet
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            num_controlnet,
+            prompt,
+            control_image,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if num_controlnet > 1 and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * num_controlnet
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+        # 4. Prepare image
+        image = self.image_processor.preprocess(image).to(dtype=torch.float32)
+
+        # 5. Prepare controlnet_conditioning_image
+        if num_controlnet == 1:
+            control_image = self.prepare_control_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=torch_dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif num_controlnet > 1:
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=torch_dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            torch_dtype,
+            device,
+            generator,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if num_controlnet == 1 else keeps)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                # predict the noise residual
+                _latent_model_input = latent_model_input.cpu().detach().numpy()
+                _prompt_embeds = np.array(prompt_embeds, dtype=np_dtype)
+                _t = np.array([t.cpu().detach().numpy()], dtype=np_dtype)
+
+                if num_controlnet == 1:
+                    control_images = np.array([control_image], dtype=np_dtype)
+                else:
+                    control_images = []
+                    for _control_img in control_image:
+                        _control_img = _control_img.cpu().detach().numpy()
+                        control_images.append(_control_img)
+                    control_images = np.array(control_images, dtype=np_dtype)
+
+                control_scales = np.array(cond_scale, dtype=np_dtype)
+                control_scales = np.resize(control_scales, (num_controlnet, 1))
+
+                noise_pred = self.unet(
+                    sample=_latent_model_input,
+                    timestep=_t,
+                    encoder_hidden_states=_prompt_embeds,
+                    controlnet_conds=control_images,
+                    conditioning_scales=control_scales,
+                )[0]
+                noise_pred = torch.from_numpy(noise_pred).to(device)
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            _latents = latents.cpu().detach().numpy() / 0.18215
+            _latents = np.array(_latents, dtype=np_dtype)
+            image = self.vae_decoder(latent_sample=_latents)[0]
+            image = torch.from_numpy(image).to(device, dtype=torch.float32)
+            has_nsfw_concept = None
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--sd_model",
+        type=str,
+        required=True,
+        help="Path to the `diffusers` checkpoint to convert (either a local directory or on the Hub).",
+    )
+
+    parser.add_argument(
+        "--onnx_model_dir",
+        type=str,
+        required=True,
+        help="Path to the ONNX directory",
+    )
+
+    parser.add_argument("--qr_img_path", type=str, required=True, help="Path to the qr code image")
+
+    args = parser.parse_args()
+
+    qr_image = Image.open(args.qr_img_path)
+    qr_image = qr_image.resize((512, 512))
+
+    # init stable diffusion pipeline
+    pipeline = StableDiffusionImg2ImgPipeline.from_pretrained(args.sd_model)
+    pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
+
+    provider = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+    onnx_pipeline = OnnxStableDiffusionControlNetImg2ImgPipeline(
+        vae_encoder=OnnxRuntimeModel.from_pretrained(
+            os.path.join(args.onnx_model_dir, "vae_encoder"), provider=provider
+        ),
+        vae_decoder=OnnxRuntimeModel.from_pretrained(
+            os.path.join(args.onnx_model_dir, "vae_decoder"), provider=provider
+        ),
+        text_encoder=OnnxRuntimeModel.from_pretrained(
+            os.path.join(args.onnx_model_dir, "text_encoder"), provider=provider
+        ),
+        tokenizer=pipeline.tokenizer,
+        unet=OnnxRuntimeModel.from_pretrained(os.path.join(args.onnx_model_dir, "unet"), provider=provider),
+        scheduler=pipeline.scheduler,
+    )
+    onnx_pipeline = onnx_pipeline.to("cuda")
+
+    prompt = "a cute cat fly to the moon"
+    negative_prompt = "paintings, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, age spot, glans, nsfw, nipples, necklace, worst quality, low quality, watermark, username, signature, multiple breasts, lowres, bad anatomy, bad hands, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, bad feet, single color, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, disfigured, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, bad body perspect"
+
+    for i in range(10):
+        start_time = time.time()
+        image = onnx_pipeline(
+            num_controlnet=2,
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            image=qr_image,
+            control_image=[qr_image, qr_image],
+            width=512,
+            height=512,
+            strength=0.75,
+            num_inference_steps=20,
+            num_images_per_prompt=1,
+            controlnet_conditioning_scale=[0.8, 0.8],
+            control_guidance_start=[0.3, 0.3],
+            control_guidance_end=[0.9, 0.9],
+        ).images[0]
+        print(time.time() - start_time)
+        image.save("output_qr_code.png")
diff --git a/diffusers/examples/community/run_tensorrt_controlnet.py b/diffusers/examples/community/run_tensorrt_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..873195fa31bae1d8272cdac7c18f6bd12e8b782f
--- /dev/null
+++ b/diffusers/examples/community/run_tensorrt_controlnet.py
@@ -0,0 +1,1022 @@
+import argparse
+import atexit
+import inspect
+import os
+import time
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import pycuda.driver as cuda
+import tensorrt as trt
+import torch
+from PIL import Image
+from pycuda.tools import make_default_context
+from transformers import CLIPTokenizer
+
+from diffusers import OnnxRuntimeModel, StableDiffusionImg2ImgPipeline, UniPCMultistepScheduler
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    deprecate,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+# Initialize CUDA
+cuda.init()
+context = make_default_context()
+device = context.get_device()
+atexit.register(context.pop)
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def load_engine(trt_runtime, engine_path):
+    with open(engine_path, "rb") as f:
+        engine_data = f.read()
+    engine = trt_runtime.deserialize_cuda_engine(engine_data)
+    return engine
+
+
+class TensorRTModel:
+    def __init__(
+        self,
+        trt_engine_path,
+        **kwargs,
+    ):
+        cuda.init()
+        stream = cuda.Stream()
+        TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
+        trt.init_libnvinfer_plugins(TRT_LOGGER, "")
+        trt_runtime = trt.Runtime(TRT_LOGGER)
+        engine = load_engine(trt_runtime, trt_engine_path)
+        context = engine.create_execution_context()
+
+        # allocates memory for network inputs/outputs on both CPU and GPU
+        host_inputs = []
+        cuda_inputs = []
+        host_outputs = []
+        cuda_outputs = []
+        bindings = []
+        input_names = []
+        output_names = []
+
+        for binding in engine:
+            datatype = engine.get_binding_dtype(binding)
+            if datatype == trt.DataType.HALF:
+                dtype = np.float16
+            else:
+                dtype = np.float32
+
+            shape = tuple(engine.get_binding_shape(binding))
+            host_mem = cuda.pagelocked_empty(shape, dtype)
+            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
+            bindings.append(int(cuda_mem))
+
+            if engine.binding_is_input(binding):
+                host_inputs.append(host_mem)
+                cuda_inputs.append(cuda_mem)
+                input_names.append(binding)
+            else:
+                host_outputs.append(host_mem)
+                cuda_outputs.append(cuda_mem)
+                output_names.append(binding)
+
+        self.stream = stream
+        self.context = context
+        self.engine = engine
+
+        self.host_inputs = host_inputs
+        self.cuda_inputs = cuda_inputs
+        self.host_outputs = host_outputs
+        self.cuda_outputs = cuda_outputs
+        self.bindings = bindings
+        self.batch_size = engine.max_batch_size
+
+        self.input_names = input_names
+        self.output_names = output_names
+
+    def __call__(self, **kwargs):
+        context = self.context
+        stream = self.stream
+        bindings = self.bindings
+
+        host_inputs = self.host_inputs
+        cuda_inputs = self.cuda_inputs
+        host_outputs = self.host_outputs
+        cuda_outputs = self.cuda_outputs
+
+        for idx, input_name in enumerate(self.input_names):
+            _input = kwargs[input_name]
+            np.copyto(host_inputs[idx], _input)
+            # transfer input data to the GPU
+            cuda.memcpy_htod_async(cuda_inputs[idx], host_inputs[idx], stream)
+
+        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
+
+        result = {}
+        for idx, output_name in enumerate(self.output_names):
+            # transfer predictions back from the GPU
+            cuda.memcpy_dtoh_async(host_outputs[idx], cuda_outputs[idx], stream)
+            result[output_name] = host_outputs[idx]
+
+        stream.synchronize()
+
+        return result
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> import cv2
+        >>> from PIL import Image
+
+        >>> # download an image
+        >>> image = load_image(
+        ...     "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
+        ... )
+        >>> np_image = np.array(image)
+
+        >>> # get canny image
+        >>> np_image = cv2.Canny(np_image, 100, 200)
+        >>> np_image = np_image[:, :, None]
+        >>> np_image = np.concatenate([np_image, np_image, np_image], axis=2)
+        >>> canny_image = Image.fromarray(np_image)
+
+        >>> # load control net and stable diffusion v1-5
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+
+        >>> # speed up diffusion process with faster scheduler and memory optimization
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # generate image
+        >>> generator = torch.manual_seed(0)
+        >>> image = pipe(
+        ...     "futuristic-looking woman",
+        ...     num_inference_steps=20,
+        ...     generator=generator,
+        ...     image=image,
+        ...     control_image=canny_image,
+        ... ).images[0]
+        ```
+"""
+
+
+def prepare_image(image):
+    if isinstance(image, torch.Tensor):
+        # Batch single image
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        image = image.to(dtype=torch.float32)
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    return image
+
+
+class TensorRTStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline):
+    vae_encoder: OnnxRuntimeModel
+    vae_decoder: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: TensorRTModel
+    scheduler: KarrasDiffusionSchedulers
+
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: TensorRTModel,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (4 - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: Optional[int],
+        do_classifier_free_guidance: bool,
+        negative_prompt: Optional[str],
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
+
+            if not np.array_equal(text_input_ids, untruncated_ids):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+
+        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        warnings.warn(
+            "The decode_latents method is deprecated and will be removed in a future version. Please"
+            " use VaeImageProcessor instead",
+            FutureWarning,
+        )
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        num_controlnet,
+        prompt,
+        image,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # Check `image`
+        if num_controlnet == 1:
+            self.check_image(image, prompt, prompt_embeds)
+        elif num_controlnet > 1:
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(image) != num_controlnet:
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {num_controlnet} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if num_controlnet == 1:
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif num_controlnet > 1:
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif (
+                isinstance(controlnet_conditioning_scale, list)
+                and len(controlnet_conditioning_scale) != num_controlnet
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if num_controlnet > 1:
+            if len(control_guidance_start) != num_controlnet:
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {num_controlnet} controlnets available. Make sure to provide {num_controlnet}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            _image = image.cpu().detach().numpy()
+            init_latents = self.vae_encoder(sample=_image)[0]
+            init_latents = torch.from_numpy(init_latents).to(device=device, dtype=dtype)
+            init_latents = 0.18215 * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        num_controlnet: int,
+        fp16: bool = True,
+        prompt: Union[str, List[str]] = None,
+        image: Union[
+            torch.Tensor,
+            PIL.Image.Image,
+            np.ndarray,
+            List[torch.Tensor],
+            List[PIL.Image.Image],
+            List[np.ndarray],
+        ] = None,
+        control_image: Union[
+            torch.Tensor,
+            PIL.Image.Image,
+            np.ndarray,
+            List[torch.Tensor],
+            List[PIL.Image.Image],
+            List[np.ndarray],
+        ] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The initial image will be used as the starting point for the image generation process. Can also accept
+                image latents as `image`, if passing latents directly, it will not be encoded again.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
+                also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
+                height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
+                specified in init, images must be passed as a list such that each element of the list can be correctly
+                batched for input to a single controlnet.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+                corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting
+                than for [`~StableDiffusionControlNetPipeline.__call__`].
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
+                you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the controlnet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the controlnet stops applying.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if fp16:
+            torch_dtype = torch.float16
+            np_dtype = np.float16
+        else:
+            torch_dtype = torch.float32
+            np_dtype = np.float32
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = num_controlnet
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            num_controlnet,
+            prompt,
+            control_image,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if num_controlnet > 1 and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * num_controlnet
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+        # 4. Prepare image
+        image = self.image_processor.preprocess(image).to(dtype=torch.float32)
+
+        # 5. Prepare controlnet_conditioning_image
+        if num_controlnet == 1:
+            control_image = self.prepare_control_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=torch_dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif num_controlnet > 1:
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=torch_dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            torch_dtype,
+            device,
+            generator,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if num_controlnet == 1 else keeps)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                # predict the noise residual
+                _latent_model_input = latent_model_input.cpu().detach().numpy()
+                _prompt_embeds = np.array(prompt_embeds, dtype=np_dtype)
+                _t = np.array([t.cpu().detach().numpy()], dtype=np_dtype)
+
+                if num_controlnet == 1:
+                    control_images = np.array([control_image], dtype=np_dtype)
+                else:
+                    control_images = []
+                    for _control_img in control_image:
+                        _control_img = _control_img.cpu().detach().numpy()
+                        control_images.append(_control_img)
+                    control_images = np.array(control_images, dtype=np_dtype)
+
+                control_scales = np.array(cond_scale, dtype=np_dtype)
+                control_scales = np.resize(control_scales, (num_controlnet, 1))
+
+                noise_pred = self.unet(
+                    sample=_latent_model_input,
+                    timestep=_t,
+                    encoder_hidden_states=_prompt_embeds,
+                    controlnet_conds=control_images,
+                    conditioning_scales=control_scales,
+                )["noise_pred"]
+                noise_pred = torch.from_numpy(noise_pred).to(device)
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            _latents = latents.cpu().detach().numpy() / 0.18215
+            _latents = np.array(_latents, dtype=np_dtype)
+            image = self.vae_decoder(latent_sample=_latents)[0]
+            image = torch.from_numpy(image).to(device, dtype=torch.float32)
+            has_nsfw_concept = None
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--sd_model",
+        type=str,
+        required=True,
+        help="Path to the `diffusers` checkpoint to convert (either a local directory or on the Hub).",
+    )
+
+    parser.add_argument(
+        "--onnx_model_dir",
+        type=str,
+        required=True,
+        help="Path to the ONNX directory",
+    )
+
+    parser.add_argument(
+        "--unet_engine_path",
+        type=str,
+        required=True,
+        help="Path to the unet + controlnet tensorrt model",
+    )
+
+    parser.add_argument("--qr_img_path", type=str, required=True, help="Path to the qr code image")
+
+    args = parser.parse_args()
+
+    qr_image = Image.open(args.qr_img_path)
+    qr_image = qr_image.resize((512, 512))
+
+    # init stable diffusion pipeline
+    pipeline = StableDiffusionImg2ImgPipeline.from_pretrained(args.sd_model)
+    pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
+
+    provider = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+    onnx_pipeline = TensorRTStableDiffusionControlNetImg2ImgPipeline(
+        vae_encoder=OnnxRuntimeModel.from_pretrained(
+            os.path.join(args.onnx_model_dir, "vae_encoder"), provider=provider
+        ),
+        vae_decoder=OnnxRuntimeModel.from_pretrained(
+            os.path.join(args.onnx_model_dir, "vae_decoder"), provider=provider
+        ),
+        text_encoder=OnnxRuntimeModel.from_pretrained(
+            os.path.join(args.onnx_model_dir, "text_encoder"), provider=provider
+        ),
+        tokenizer=pipeline.tokenizer,
+        unet=TensorRTModel(args.unet_engine_path),
+        scheduler=pipeline.scheduler,
+    )
+    onnx_pipeline = onnx_pipeline.to("cuda")
+
+    prompt = "a cute cat fly to the moon"
+    negative_prompt = "paintings, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, age spot, glans, nsfw, nipples, necklace, worst quality, low quality, watermark, username, signature, multiple breasts, lowres, bad anatomy, bad hands, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, bad feet, single color, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, disfigured, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, bad body perspect"
+
+    for i in range(10):
+        start_time = time.time()
+        image = onnx_pipeline(
+            num_controlnet=2,
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            image=qr_image,
+            control_image=[qr_image, qr_image],
+            width=512,
+            height=512,
+            strength=0.75,
+            num_inference_steps=20,
+            num_images_per_prompt=1,
+            controlnet_conditioning_scale=[0.8, 0.8],
+            control_guidance_start=[0.3, 0.3],
+            control_guidance_end=[0.9, 0.9],
+        ).images[0]
+        print(time.time() - start_time)
+        image.save("output_qr_code.png")
diff --git a/diffusers/examples/community/scheduling_ufogen.py b/diffusers/examples/community/scheduling_ufogen.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b1b92ff183ac16c6c2a2b6a28d9da8d16f192da
--- /dev/null
+++ b/diffusers/examples/community/scheduling_ufogen.py
@@ -0,0 +1,521 @@
+# Copyright 2024 UC Berkeley Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->UFOGen
+class UFOGenSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class UFOGenScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `UFOGenScheduler` implements multistep and onestep sampling for a UFOGen model, introduced in
+    [UFOGen: You Forward Once Large Scale Text-to-Image Generation via Diffusion GANs](https://arxiv.org/abs/2311.09257)
+    by Yanwu Xu, Yang Zhao, Zhisheng Xiao, and Tingbo Hou. UFOGen is a varianet of the denoising diffusion GAN (DDGAN)
+    model designed for one-step sampling.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+        denoising_step_size (`int`, defaults to 250):
+            The denoising step size parameter from the UFOGen paper. The number of steps used for training is roughly
+            `math.ceil(num_train_timesteps / denoising_step_size)`.
+    """
+
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        clip_sample: bool = True,
+        set_alpha_to_one: bool = True,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        steps_offset: int = 0,
+        rescale_betas_zero_snr: bool = False,
+        denoising_step_size: int = 250,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        elif beta_schedule == "sigmoid":
+            # GeoDiff sigmoid schedule
+            betas = torch.linspace(-6, 6, num_train_timesteps)
+            self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.custom_timesteps = False
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Union[str, torch.device] = None,
+        timesteps: Optional[List[int]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model. If used,
+                `timesteps` must be `None`.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed,
+                `num_inference_steps` must be `None`.
+
+        """
+        if num_inference_steps is not None and timesteps is not None:
+            raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
+
+        if timesteps is not None:
+            for i in range(1, len(timesteps)):
+                if timesteps[i] >= timesteps[i - 1]:
+                    raise ValueError("`custom_timesteps` must be in descending order.")
+
+            if timesteps[0] >= self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`timesteps` must start before `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps}."
+                )
+
+            timesteps = np.array(timesteps, dtype=np.int64)
+            self.custom_timesteps = True
+        else:
+            if num_inference_steps > self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                    f" maximal {self.config.num_train_timesteps} timesteps."
+                )
+
+            self.num_inference_steps = num_inference_steps
+            self.custom_timesteps = False
+
+            # TODO: For now, handle special case when num_inference_steps == 1 separately
+            if num_inference_steps == 1:
+                # Set the timestep schedule to num_train_timesteps - 1 rather than 0
+                # (that is, the one-step timestep schedule is always trailing rather than leading or linspace)
+                timesteps = np.array([self.config.num_train_timesteps - 1], dtype=np.int64)
+            else:
+                # TODO: For now, retain the DDPM timestep spacing logic
+                # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+                if self.config.timestep_spacing == "linspace":
+                    timesteps = (
+                        np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
+                        .round()[::-1]
+                        .copy()
+                        .astype(np.int64)
+                    )
+                elif self.config.timestep_spacing == "leading":
+                    step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+                    # creates integer timesteps by multiplying by ratio
+                    # casting to int to avoid issues when num_inference_step is power of 3
+                    timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+                    timesteps += self.config.steps_offset
+                elif self.config.timestep_spacing == "trailing":
+                    step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+                    # creates integer timesteps by multiplying by ratio
+                    # casting to int to avoid issues when num_inference_step is power of 3
+                    timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
+                    timesteps -= 1
+                else:
+                    raise ValueError(
+                        f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+                    )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[UFOGenSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_ufogen.UFOGenSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_ddpm.UFOGenSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ufogen.UFOGenSchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        # 0. Resolve timesteps
+        t = timestep
+        prev_t = self.previous_timestep(t)
+
+        # 1. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        # beta_prod_t_prev = 1 - alpha_prod_t_prev
+        # current_alpha_t = alpha_prod_t / alpha_prod_t_prev
+        # current_beta_t = 1 - current_alpha_t
+
+        # 2. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
+                " `v_prediction`  for UFOGenScheduler."
+            )
+
+        # 3. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            pred_original_sample = self._threshold_sample(pred_original_sample)
+        elif self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 4. Single-step or multi-step sampling
+        # Noise is not used on the final timestep of the timestep schedule.
+        # This also means that noise is not used for one-step sampling.
+        if t != self.timesteps[-1]:
+            # TODO: is this correct?
+            # Sample prev sample x_{t - 1} ~ q(x_{t - 1} | x_0 =  G(x_t, t))
+            device = model_output.device
+            noise = randn_tensor(model_output.shape, generator=generator, device=device, dtype=model_output.dtype)
+            sqrt_alpha_prod_t_prev = alpha_prod_t_prev**0.5
+            sqrt_one_minus_alpha_prod_t_prev = (1 - alpha_prod_t_prev) ** 0.5
+            pred_prev_sample = sqrt_alpha_prod_t_prev * pred_original_sample + sqrt_one_minus_alpha_prod_t_prev * noise
+        else:
+            # Simply return the pred_original_sample. If `prediction_type == "sample"`, this is equivalent to returning
+            # the output of the GAN generator U-Net on the initial noisy latents x_T ~ N(0, I).
+            pred_prev_sample = pred_original_sample
+
+        if not return_dict:
+            return (pred_prev_sample,)
+
+        return UFOGenSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
+    def previous_timestep(self, timestep):
+        if self.custom_timesteps:
+            index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
+            if index == self.timesteps.shape[0] - 1:
+                prev_t = torch.tensor(-1)
+            else:
+                prev_t = self.timesteps[index + 1]
+        else:
+            num_inference_steps = (
+                self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps
+            )
+            prev_t = timestep - self.config.num_train_timesteps // num_inference_steps
+
+        return prev_t
diff --git a/diffusers/examples/community/sd_text2img_k_diffusion.py b/diffusers/examples/community/sd_text2img_k_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f83973aba918dc7a166df128d7c27e6dfc8f6d7
--- /dev/null
+++ b/diffusers/examples/community/sd_text2img_k_diffusion.py
@@ -0,0 +1,414 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import warnings
+from typing import Callable, List, Optional, Union
+
+import torch
+from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
+
+from diffusers import DiffusionPipeline, LMSDiscreteScheduler, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ModelWrapper:
+    def __init__(self, model, alphas_cumprod):
+        self.model = model
+        self.alphas_cumprod = alphas_cumprod
+
+    def apply_model(self, *args, **kwargs):
+        if len(args) == 3:
+            encoder_hidden_states = args[-1]
+            args = args[:2]
+        if kwargs.get("cond", None) is not None:
+            encoder_hidden_states = kwargs.pop("cond")
+        return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample
+
+
+class StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae,
+        text_encoder,
+        tokenizer,
+        unet,
+        scheduler,
+        safety_checker,
+        feature_extractor,
+    ):
+        super().__init__()
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        # get correct sigmas from LMS
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+        model = ModelWrapper(unet, scheduler.alphas_cumprod)
+        if scheduler.config.prediction_type == "v_prediction":
+            self.k_diffusion_model = CompVisVDenoiser(model)
+        else:
+            self.k_diffusion_model = CompVisDenoiser(model)
+
+    def set_sampler(self, scheduler_type: str):
+        warnings.warn("The `set_sampler` method is deprecated, please use `set_scheduler` instead.")
+        return self.set_scheduler(scheduler_type)
+
+    def set_scheduler(self, scheduler_type: str):
+        library = importlib.import_module("k_diffusion")
+        sampling = getattr(library, "sampling")
+        self.sampler = getattr(sampling, scheduler_type)
+
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = True
+        if guidance_scale <= 1.0:
+            raise ValueError("has to use guidance_scale")
+
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=text_embeddings.device)
+        sigmas = self.scheduler.sigmas
+        sigmas = sigmas.to(text_embeddings.dtype)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+        latents = latents * sigmas[0]
+        self.k_diffusion_model.sigmas = self.k_diffusion_model.sigmas.to(latents.device)
+        self.k_diffusion_model.log_sigmas = self.k_diffusion_model.log_sigmas.to(latents.device)
+
+        def model_fn(x, t):
+            latent_model_input = torch.cat([x] * 2)
+
+            noise_pred = self.k_diffusion_model(latent_model_input, t, cond=text_embeddings)
+
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            return noise_pred
+
+        latents = self.sampler(model_fn, latents, sigmas)
+
+        # 8. Post-processing
+        image = self.decode_latents(latents)
+
+        # 9. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
+
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/sde_drag.py b/diffusers/examples/community/sde_drag.py
new file mode 100644
index 0000000000000000000000000000000000000000..902eaa99f417431b3ffe4b9c96da9b2953828d93
--- /dev/null
+++ b/diffusers/examples/community/sde_drag.py
@@ -0,0 +1,594 @@
+import math
+import tempfile
+from typing import List, Optional
+
+import numpy as np
+import PIL.Image
+import torch
+from accelerate import Accelerator
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DiffusionPipeline, DPMSolverMultistepScheduler, UNet2DConditionModel
+from diffusers.loaders import AttnProcsLayers, StableDiffusionLoraLoaderMixin
+from diffusers.models.attention_processor import (
+    AttnAddedKVProcessor,
+    AttnAddedKVProcessor2_0,
+    LoRAAttnAddedKVProcessor,
+    LoRAAttnProcessor,
+    LoRAAttnProcessor2_0,
+    SlicedAttnAddedKVProcessor,
+)
+from diffusers.optimization import get_scheduler
+
+
+class SdeDragPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image drag-and-drop editing using stochastic differential equations: https://arxiv.org/abs/2311.01410.
+    Please refer to the [official repository](https://github.com/ML-GSAI/SDE-Drag) for more information.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Please use
+            [`DDIMScheduler`].
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DPMSolverMultistepScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: str,
+        image: PIL.Image.Image,
+        mask_image: PIL.Image.Image,
+        source_points: List[List[int]],
+        target_points: List[List[int]],
+        t0: Optional[float] = 0.6,
+        steps: Optional[int] = 200,
+        step_size: Optional[int] = 2,
+        image_scale: Optional[float] = 0.3,
+        adapt_radius: Optional[int] = 5,
+        min_lora_scale: Optional[float] = 0.5,
+        generator: Optional[torch.Generator] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for image editing.
+        Args:
+            prompt (`str`, *required*):
+                The prompt to guide the image editing.
+            image (`PIL.Image.Image`, *required*):
+                Which will be edited, parts of the image will be masked out with `mask_image` and edited
+                according to `prompt`.
+            mask_image (`PIL.Image.Image`, *required*):
+                To mask `image`. White pixels in the mask will be edited, while black pixels will be preserved.
+            source_points (`List[List[int]]`, *required*):
+                Used to mark the starting positions of drag editing in the image, with each pixel represented as a
+                `List[int]` of length 2.
+            target_points (`List[List[int]]`, *required*):
+                Used to mark the target positions of drag editing in the image, with each pixel represented as a
+                `List[int]` of length 2.
+            t0 (`float`, *optional*, defaults to 0.6):
+                The time parameter. Higher t0 improves the fidelity while lowering the faithfulness of the edited images
+                and vice versa.
+            steps (`int`, *optional*, defaults to 200):
+                The number of sampling iterations.
+            step_size (`int`, *optional*, defaults to 2):
+                The drag diatance of each drag step.
+            image_scale (`float`, *optional*, defaults to 0.3):
+                To avoid duplicating the content, use image_scale to perturbs the source.
+            adapt_radius (`int`, *optional*, defaults to 5):
+                The size of the region for copy and paste operations during each step of the drag process.
+            min_lora_scale (`float`, *optional*, defaults to 0.5):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+                min_lora_scale specifies the minimum LoRA scale during the image drag-editing process.
+            generator ('torch.Generator', *optional*, defaults to None):
+                To make generation deterministic(https://pytorch.org/docs/stable/generated/torch.Generator.html).
+        Examples:
+        ```py
+        >>> import PIL
+        >>> import torch
+        >>> from diffusers import DDIMScheduler, DiffusionPipeline
+
+        >>> # Load the pipeline
+        >>> model_path = "runwayml/stable-diffusion-v1-5"
+        >>> scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
+        >>> pipe = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, custom_pipeline="sde_drag")
+        >>> pipe.to('cuda')
+
+        >>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality.
+        >>> # If not training LoRA, please avoid using torch.float16
+        >>> # pipe.to(torch.float16)
+
+        >>> # Provide prompt, image, mask image, and the starting and target points for drag editing.
+        >>> prompt = "prompt of the image"
+        >>> image = PIL.Image.open('/path/to/image')
+        >>> mask_image = PIL.Image.open('/path/to/mask_image')
+        >>> source_points = [[123, 456]]
+        >>> target_points = [[234, 567]]
+
+        >>> # train_lora is optional, and in most cases, using train_lora can better preserve consistency with the original image.
+        >>> pipe.train_lora(prompt, image)
+
+        >>> output = pipe(prompt, image, mask_image, source_points, target_points)
+        >>> output_image = PIL.Image.fromarray(output)
+        >>> output_image.save("./output.png")
+        ```
+        """
+
+        self.scheduler.set_timesteps(steps)
+
+        noise_scale = (1 - image_scale**2) ** (0.5)
+
+        text_embeddings = self._get_text_embed(prompt)
+        uncond_embeddings = self._get_text_embed([""])
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        latent = self._get_img_latent(image)
+
+        mask = mask_image.resize((latent.shape[3], latent.shape[2]))
+        mask = torch.tensor(np.array(mask))
+        mask = mask.unsqueeze(0).expand_as(latent).to(self.device)
+
+        source_points = torch.tensor(source_points).div(torch.tensor([8]), rounding_mode="trunc")
+        target_points = torch.tensor(target_points).div(torch.tensor([8]), rounding_mode="trunc")
+
+        distance = target_points - source_points
+        distance_norm_max = torch.norm(distance.float(), dim=1, keepdim=True).max()
+
+        if distance_norm_max <= step_size:
+            drag_num = 1
+        else:
+            drag_num = distance_norm_max.div(torch.tensor([step_size]), rounding_mode="trunc")
+            if (distance_norm_max / drag_num - step_size).abs() > (
+                distance_norm_max / (drag_num + 1) - step_size
+            ).abs():
+                drag_num += 1
+
+        latents = []
+        for i in tqdm(range(int(drag_num)), desc="SDE Drag"):
+            source_new = source_points + (i / drag_num * distance).to(torch.int)
+            target_new = source_points + ((i + 1) / drag_num * distance).to(torch.int)
+
+            latent, noises, hook_latents, lora_scales, cfg_scales = self._forward(
+                latent, steps, t0, min_lora_scale, text_embeddings, generator
+            )
+            latent = self._copy_and_paste(
+                latent,
+                source_new,
+                target_new,
+                adapt_radius,
+                latent.shape[2] - 1,
+                latent.shape[3] - 1,
+                image_scale,
+                noise_scale,
+                generator,
+            )
+            latent = self._backward(
+                latent, mask, steps, t0, noises, hook_latents, lora_scales, cfg_scales, text_embeddings, generator
+            )
+
+            latents.append(latent)
+
+        result_image = 1 / 0.18215 * latents[-1]
+
+        with torch.no_grad():
+            result_image = self.vae.decode(result_image).sample
+
+        result_image = (result_image / 2 + 0.5).clamp(0, 1)
+        result_image = result_image.cpu().permute(0, 2, 3, 1).numpy()[0]
+        result_image = (result_image * 255).astype(np.uint8)
+
+        return result_image
+
+    def train_lora(self, prompt, image, lora_step=100, lora_rank=16, generator=None):
+        accelerator = Accelerator(gradient_accumulation_steps=1, mixed_precision="fp16")
+
+        self.vae.requires_grad_(False)
+        self.text_encoder.requires_grad_(False)
+        self.unet.requires_grad_(False)
+
+        unet_lora_attn_procs = {}
+        for name, attn_processor in self.unet.attn_processors.items():
+            cross_attention_dim = None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = self.unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = self.unet.config.block_out_channels[block_id]
+            else:
+                raise NotImplementedError("name must start with up_blocks, mid_blocks, or down_blocks")
+
+            if isinstance(attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0)):
+                lora_attn_processor_class = LoRAAttnAddedKVProcessor
+            else:
+                lora_attn_processor_class = (
+                    LoRAAttnProcessor2_0
+                    if hasattr(torch.nn.functional, "scaled_dot_product_attention")
+                    else LoRAAttnProcessor
+                )
+            unet_lora_attn_procs[name] = lora_attn_processor_class(
+                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=lora_rank
+            )
+
+        self.unet.set_attn_processor(unet_lora_attn_procs)
+        unet_lora_layers = AttnProcsLayers(self.unet.attn_processors)
+        params_to_optimize = unet_lora_layers.parameters()
+
+        optimizer = torch.optim.AdamW(
+            params_to_optimize,
+            lr=2e-4,
+            betas=(0.9, 0.999),
+            weight_decay=1e-2,
+            eps=1e-08,
+        )
+
+        lr_scheduler = get_scheduler(
+            "constant",
+            optimizer=optimizer,
+            num_warmup_steps=0,
+            num_training_steps=lora_step,
+            num_cycles=1,
+            power=1.0,
+        )
+
+        unet_lora_layers = accelerator.prepare_model(unet_lora_layers)
+        optimizer = accelerator.prepare_optimizer(optimizer)
+        lr_scheduler = accelerator.prepare_scheduler(lr_scheduler)
+
+        with torch.no_grad():
+            text_inputs = self._tokenize_prompt(prompt, tokenizer_max_length=None)
+            text_embedding = self._encode_prompt(
+                text_inputs.input_ids, text_inputs.attention_mask, text_encoder_use_attention_mask=False
+            )
+
+        image_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+        image = image_transforms(image).to(self.device, dtype=self.vae.dtype)
+        image = image.unsqueeze(dim=0)
+        latents_dist = self.vae.encode(image).latent_dist
+
+        for _ in tqdm(range(lora_step), desc="Train LoRA"):
+            self.unet.train()
+            model_input = latents_dist.sample() * self.vae.config.scaling_factor
+
+            # Sample noise that we'll add to the latents
+            noise = torch.randn(
+                model_input.size(),
+                dtype=model_input.dtype,
+                layout=model_input.layout,
+                device=model_input.device,
+                generator=generator,
+            )
+            bsz, channels, height, width = model_input.shape
+
+            # Sample a random timestep for each image
+            timesteps = torch.randint(
+                0, self.scheduler.config.num_train_timesteps, (bsz,), device=model_input.device, generator=generator
+            )
+            timesteps = timesteps.long()
+
+            # Add noise to the model input according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_model_input = self.scheduler.add_noise(model_input, noise, timesteps)
+
+            # Predict the noise residual
+            model_pred = self.unet(noisy_model_input, timesteps, text_embedding).sample
+
+            # Get the target for loss depending on the prediction type
+            if self.scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif self.scheduler.config.prediction_type == "v_prediction":
+                target = self.scheduler.get_velocity(model_input, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {self.scheduler.config.prediction_type}")
+
+            loss = torch.nn.functional.mse_loss(model_pred.float(), target.float(), reduction="mean")
+            accelerator.backward(loss)
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
+        with tempfile.TemporaryDirectory() as save_lora_dir:
+            StableDiffusionLoraLoaderMixin.save_lora_weights(
+                save_directory=save_lora_dir,
+                unet_lora_layers=unet_lora_layers,
+                text_encoder_lora_layers=None,
+            )
+
+            self.unet.load_attn_procs(save_lora_dir)
+
+    def _tokenize_prompt(self, prompt, tokenizer_max_length=None):
+        if tokenizer_max_length is not None:
+            max_length = tokenizer_max_length
+        else:
+            max_length = self.tokenizer.model_max_length
+
+        text_inputs = self.tokenizer(
+            prompt,
+            truncation=True,
+            padding="max_length",
+            max_length=max_length,
+            return_tensors="pt",
+        )
+
+        return text_inputs
+
+    def _encode_prompt(self, input_ids, attention_mask, text_encoder_use_attention_mask=False):
+        text_input_ids = input_ids.to(self.device)
+
+        if text_encoder_use_attention_mask:
+            attention_mask = attention_mask.to(self.device)
+        else:
+            attention_mask = None
+
+        prompt_embeds = self.text_encoder(
+            text_input_ids,
+            attention_mask=attention_mask,
+        )
+        prompt_embeds = prompt_embeds[0]
+
+        return prompt_embeds
+
+    @torch.no_grad()
+    def _get_text_embed(self, prompt):
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+        return text_embeddings
+
+    def _copy_and_paste(
+        self, latent, source_new, target_new, adapt_radius, max_height, max_width, image_scale, noise_scale, generator
+    ):
+        def adaption_r(source, target, adapt_radius, max_height, max_width):
+            r_x_lower = min(adapt_radius, source[0], target[0])
+            r_x_upper = min(adapt_radius, max_width - source[0], max_width - target[0])
+            r_y_lower = min(adapt_radius, source[1], target[1])
+            r_y_upper = min(adapt_radius, max_height - source[1], max_height - target[1])
+            return r_x_lower, r_x_upper, r_y_lower, r_y_upper
+
+        for source_, target_ in zip(source_new, target_new):
+            r_x_lower, r_x_upper, r_y_lower, r_y_upper = adaption_r(
+                source_, target_, adapt_radius, max_height, max_width
+            )
+
+            source_feature = latent[
+                :, :, source_[1] - r_y_lower : source_[1] + r_y_upper, source_[0] - r_x_lower : source_[0] + r_x_upper
+            ].clone()
+
+            latent[
+                :, :, source_[1] - r_y_lower : source_[1] + r_y_upper, source_[0] - r_x_lower : source_[0] + r_x_upper
+            ] = image_scale * source_feature + noise_scale * torch.randn(
+                latent.shape[0],
+                4,
+                r_y_lower + r_y_upper,
+                r_x_lower + r_x_upper,
+                device=self.device,
+                generator=generator,
+            )
+
+            latent[
+                :, :, target_[1] - r_y_lower : target_[1] + r_y_upper, target_[0] - r_x_lower : target_[0] + r_x_upper
+            ] = source_feature * 1.1
+        return latent
+
+    @torch.no_grad()
+    def _get_img_latent(self, image, height=None, weight=None):
+        data = image.convert("RGB")
+        if height is not None:
+            data = data.resize((weight, height))
+        transform = transforms.ToTensor()
+        data = transform(data).unsqueeze(0)
+        data = (data * 2.0) - 1.0
+        data = data.to(self.device, dtype=self.vae.dtype)
+        latent = self.vae.encode(data).latent_dist.sample()
+        latent = 0.18215 * latent
+        return latent
+
+    @torch.no_grad()
+    def _get_eps(self, latent, timestep, guidance_scale, text_embeddings, lora_scale=None):
+        latent_model_input = torch.cat([latent] * 2) if guidance_scale > 1.0 else latent
+        text_embeddings = text_embeddings if guidance_scale > 1.0 else text_embeddings.chunk(2)[1]
+
+        cross_attention_kwargs = None if lora_scale is None else {"scale": lora_scale}
+
+        with torch.no_grad():
+            noise_pred = self.unet(
+                latent_model_input,
+                timestep,
+                encoder_hidden_states=text_embeddings,
+                cross_attention_kwargs=cross_attention_kwargs,
+            ).sample
+
+        if guidance_scale > 1.0:
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        elif guidance_scale == 1.0:
+            noise_pred_text = noise_pred
+            noise_pred_uncond = 0.0
+        else:
+            raise NotImplementedError(guidance_scale)
+        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+        return noise_pred
+
+    def _forward_sde(
+        self, timestep, sample, guidance_scale, text_embeddings, steps, eta=1.0, lora_scale=None, generator=None
+    ):
+        num_train_timesteps = len(self.scheduler)
+        alphas_cumprod = self.scheduler.alphas_cumprod
+        initial_alpha_cumprod = torch.tensor(1.0)
+
+        prev_timestep = timestep + num_train_timesteps // steps
+
+        alpha_prod_t = alphas_cumprod[timestep] if timestep >= 0 else initial_alpha_cumprod
+        alpha_prod_t_prev = alphas_cumprod[prev_timestep]
+
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        x_prev = (alpha_prod_t_prev / alpha_prod_t) ** (0.5) * sample + (1 - alpha_prod_t_prev / alpha_prod_t) ** (
+            0.5
+        ) * torch.randn(
+            sample.size(), dtype=sample.dtype, layout=sample.layout, device=self.device, generator=generator
+        )
+        eps = self._get_eps(x_prev, prev_timestep, guidance_scale, text_embeddings, lora_scale)
+
+        sigma_t_prev = (
+            eta
+            * (1 - alpha_prod_t) ** (0.5)
+            * (1 - alpha_prod_t_prev / (1 - alpha_prod_t_prev) * (1 - alpha_prod_t) / alpha_prod_t) ** (0.5)
+        )
+
+        pred_original_sample = (x_prev - beta_prod_t_prev ** (0.5) * eps) / alpha_prod_t_prev ** (0.5)
+        pred_sample_direction_coeff = (1 - alpha_prod_t - sigma_t_prev**2) ** (0.5)
+
+        noise = (
+            sample - alpha_prod_t ** (0.5) * pred_original_sample - pred_sample_direction_coeff * eps
+        ) / sigma_t_prev
+
+        return x_prev, noise
+
+    def _sample(
+        self,
+        timestep,
+        sample,
+        guidance_scale,
+        text_embeddings,
+        steps,
+        sde=False,
+        noise=None,
+        eta=1.0,
+        lora_scale=None,
+        generator=None,
+    ):
+        num_train_timesteps = len(self.scheduler)
+        alphas_cumprod = self.scheduler.alphas_cumprod
+        final_alpha_cumprod = torch.tensor(1.0)
+
+        eps = self._get_eps(sample, timestep, guidance_scale, text_embeddings, lora_scale)
+
+        prev_timestep = timestep - num_train_timesteps // steps
+
+        alpha_prod_t = alphas_cumprod[timestep]
+        alpha_prod_t_prev = alphas_cumprod[prev_timestep] if prev_timestep >= 0 else final_alpha_cumprod
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        sigma_t = (
+            eta
+            * ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** (0.5)
+            * (1 - alpha_prod_t / alpha_prod_t_prev) ** (0.5)
+            if sde
+            else 0
+        )
+
+        pred_original_sample = (sample - beta_prod_t ** (0.5) * eps) / alpha_prod_t ** (0.5)
+        pred_sample_direction_coeff = (1 - alpha_prod_t_prev - sigma_t**2) ** (0.5)
+
+        noise = (
+            torch.randn(
+                sample.size(), dtype=sample.dtype, layout=sample.layout, device=self.device, generator=generator
+            )
+            if noise is None
+            else noise
+        )
+        latent = (
+            alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction_coeff * eps + sigma_t * noise
+        )
+
+        return latent
+
+    def _forward(self, latent, steps, t0, lora_scale_min, text_embeddings, generator):
+        def scale_schedule(begin, end, n, length, type="linear"):
+            if type == "constant":
+                return end
+            elif type == "linear":
+                return begin + (end - begin) * n / length
+            elif type == "cos":
+                factor = (1 - math.cos(n * math.pi / length)) / 2
+                return (1 - factor) * begin + factor * end
+            else:
+                raise NotImplementedError(type)
+
+        noises = []
+        latents = []
+        lora_scales = []
+        cfg_scales = []
+        latents.append(latent)
+        t0 = int(t0 * steps)
+        t_begin = steps - t0
+
+        length = len(self.scheduler.timesteps[t_begin - 1 : -1]) - 1
+        index = 1
+        for t in self.scheduler.timesteps[t_begin:].flip(dims=[0]):
+            lora_scale = scale_schedule(1, lora_scale_min, index, length, type="cos")
+            cfg_scale = scale_schedule(1, 3.0, index, length, type="linear")
+            latent, noise = self._forward_sde(
+                t, latent, cfg_scale, text_embeddings, steps, lora_scale=lora_scale, generator=generator
+            )
+
+            noises.append(noise)
+            latents.append(latent)
+            lora_scales.append(lora_scale)
+            cfg_scales.append(cfg_scale)
+            index += 1
+        return latent, noises, latents, lora_scales, cfg_scales
+
+    def _backward(
+        self, latent, mask, steps, t0, noises, hook_latents, lora_scales, cfg_scales, text_embeddings, generator
+    ):
+        t0 = int(t0 * steps)
+        t_begin = steps - t0
+
+        hook_latent = hook_latents.pop()
+        latent = torch.where(mask > 128, latent, hook_latent)
+        for t in self.scheduler.timesteps[t_begin - 1 : -1]:
+            latent = self._sample(
+                t,
+                latent,
+                cfg_scales.pop(),
+                text_embeddings,
+                steps,
+                sde=True,
+                noise=noises.pop(),
+                lora_scale=lora_scales.pop(),
+                generator=generator,
+            )
+            hook_latent = hook_latents.pop()
+            latent = torch.where(mask > 128, latent, hook_latent)
+        return latent
diff --git a/diffusers/examples/community/seed_resize_stable_diffusion.py b/diffusers/examples/community/seed_resize_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae2d8a53b2984ad74606cae2a4293e578f56b939
--- /dev/null
+++ b/diffusers/examples/community/seed_resize_stable_diffusion.py
@@ -0,0 +1,342 @@
+"""
+modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+"""
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import DiffusionPipeline
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class SeedResizeStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        text_embeddings: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
+        if text_embeddings is None:
+            text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
+        latents_shape_reference = (batch_size * num_images_per_prompt, self.unet.config.in_channels, 64, 64)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents_reference = torch.randn(
+                    latents_shape_reference, generator=generator, device="cpu", dtype=latents_dtype
+                ).to(self.device)
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents_reference = torch.randn(
+                    latents_shape_reference, generator=generator, device=self.device, dtype=latents_dtype
+                )
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents_reference.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents_reference = latents_reference.to(self.device)
+            latents = latents.to(self.device)
+
+        # This is the key part of the pipeline where we
+        # try to ensure that the generated images w/ the same seed
+        # but different sizes actually result in similar images
+        dx = (latents_shape[3] - latents_shape_reference[3]) // 2
+        dy = (latents_shape[2] - latents_shape_reference[2]) // 2
+        w = latents_shape_reference[3] if dx >= 0 else latents_shape_reference[3] + 2 * dx
+        h = latents_shape_reference[2] if dy >= 0 else latents_shape_reference[2] + 2 * dy
+        tx = 0 if dx < 0 else dx
+        ty = 0 if dy < 0 else dy
+        dx = max(-dx, 0)
+        dy = max(-dy, 0)
+        # import pdb
+        # pdb.set_trace()
+        latents[:, :, ty : ty + h, tx : tx + w] = latents_reference[:, :, dy : dy + h, dx : dx + w]
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/speech_to_image_diffusion.py b/diffusers/examples/community/speech_to_image_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cb5a2a8c7f96858850c47b03549e579274750d6
--- /dev/null
+++ b/diffusers/examples/community/speech_to_image_diffusion.py
@@ -0,0 +1,255 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class SpeechToImagePipeline(DiffusionPipeline, StableDiffusionMixin):
+    def __init__(
+        self,
+        speech_model: WhisperForConditionalGeneration,
+        speech_processor: WhisperProcessor,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            speech_model=speech_model,
+            speech_processor=speech_processor,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        audio,
+        sampling_rate=16_000,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        inputs = self.speech_processor.feature_extractor(
+            audio, return_tensors="pt", sampling_rate=sampling_rate
+        ).input_features.to(self.device)
+        predicted_ids = self.speech_model.generate(inputs, max_length=480_000)
+
+        prompt = self.speech_processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[
+            0
+        ]
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return image
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
diff --git a/diffusers/examples/community/stable_diffusion_comparison.py b/diffusers/examples/community/stable_diffusion_comparison.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b510a64f854ab37c8ab47bc49f37fe4d48c013f
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_comparison.py
@@ -0,0 +1,381 @@
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+pipe1_model_id = "CompVis/stable-diffusion-v1-1"
+pipe2_model_id = "CompVis/stable-diffusion-v1-2"
+pipe3_model_id = "CompVis/stable-diffusion-v1-3"
+pipe4_model_id = "CompVis/stable-diffusion-v1-4"
+
+
+class StableDiffusionComparisonPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for parallel comparison of Stable Diffusion v1-v4
+    This pipeline inherits from DiffusionPipeline and depends on the use of an Auth Token for
+    downloading pre-trained checkpoints from Hugging Face Hub.
+    If using Hugging Face Hub, pass the Model ID for Stable Diffusion v1.4 as the previous 3 checkpoints will be loaded
+    automatically.
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionMegaSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super()._init_()
+
+        self.pipe1 = StableDiffusionPipeline.from_pretrained(pipe1_model_id)
+        self.pipe2 = StableDiffusionPipeline.from_pretrained(pipe2_model_id)
+        self.pipe3 = StableDiffusionPipeline.from_pretrained(pipe3_model_id)
+        self.pipe4 = StableDiffusionPipeline(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            requires_safety_checker=requires_safety_checker,
+        )
+
+        self.register_modules(pipeline1=self.pipe1, pipeline2=self.pipe2, pipeline3=self.pipe3, pipeline4=self.pipe4)
+
+    @property
+    def layers(self) -> Dict[str, Any]:
+        return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}
+
+    @torch.no_grad()
+    def text2img_sd1_1(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        return self.pipe1(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+    @torch.no_grad()
+    def text2img_sd1_2(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        return self.pipe2(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+    @torch.no_grad()
+    def text2img_sd1_3(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        return self.pipe3(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+    @torch.no_grad()
+    def text2img_sd1_4(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        return self.pipe4(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+    @torch.no_grad()
+    def _call_(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation. This function will generate 4 results as part
+        of running all the 4 pipelines for SD1.1-1.4 together in a serial-processing, parallel-invocation fashion.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, optional, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, optional, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, optional, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, optional, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            eta (`float`, optional, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, optional):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, optional):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, optional, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, optional, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.to(device)
+
+        # Checks if the height and width are divisible by 8 or not
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` must be divisible by 8 but are {height} and {width}.")
+
+        # Get first result from Stable Diffusion Checkpoint v1.1
+        res1 = self.text2img_sd1_1(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+        # Get first result from Stable Diffusion Checkpoint v1.2
+        res2 = self.text2img_sd1_2(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+        # Get first result from Stable Diffusion Checkpoint v1.3
+        res3 = self.text2img_sd1_3(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+        # Get first result from Stable Diffusion Checkpoint v1.4
+        res4 = self.text2img_sd1_4(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+        # Get all result images into a single list and pass it via StableDiffusionPipelineOutput for final result
+        return StableDiffusionPipelineOutput([res1[0], res2[0], res3[0], res4[0]])
diff --git a/diffusers/examples/community/stable_diffusion_controlnet_img2img.py b/diffusers/examples/community/stable_diffusion_controlnet_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7c88d6fdcc72a9c39a1f784fa15698ee2d9d1ed
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_controlnet_img2img.py
@@ -0,0 +1,907 @@
+# Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, ControlNetModel, UNet2DConditionModel, logging
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    PIL_INTERPOLATION,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import numpy as np
+        >>> import torch
+        >>> from PIL import Image
+        >>> from diffusers import ControlNetModel, UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+
+        >>> input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+
+        >>> pipe_controlnet = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+                "runwayml/stable-diffusion-v1-5",
+                controlnet=controlnet,
+                safety_checker=None,
+                torch_dtype=torch.float16
+                )
+
+        >>> pipe_controlnet.scheduler = UniPCMultistepScheduler.from_config(pipe_controlnet.scheduler.config)
+        >>> pipe_controlnet.enable_xformers_memory_efficient_attention()
+        >>> pipe_controlnet.enable_model_cpu_offload()
+
+        # using image with edges for our canny controlnet
+        >>> control_image = load_image(
+            "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png")
+
+
+        >>> result_img = pipe_controlnet(controlnet_conditioning_image=control_image,
+                        image=input_image,
+                        prompt="an android robot, cyberpank, digitl art masterpiece",
+                        num_inference_steps=20).images[0]
+
+        >>> result_img.show()
+        ```
+"""
+
+
+def prepare_image(image):
+    if isinstance(image, torch.Tensor):
+        # Batch single image
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        image = image.to(dtype=torch.float32)
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    return image
+
+
+def prepare_controlnet_conditioning_image(
+    controlnet_conditioning_image,
+    width,
+    height,
+    batch_size,
+    num_images_per_prompt,
+    device,
+    dtype,
+    do_classifier_free_guidance,
+):
+    if not isinstance(controlnet_conditioning_image, torch.Tensor):
+        if isinstance(controlnet_conditioning_image, PIL.Image.Image):
+            controlnet_conditioning_image = [controlnet_conditioning_image]
+
+        if isinstance(controlnet_conditioning_image[0], PIL.Image.Image):
+            controlnet_conditioning_image = [
+                np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
+                for i in controlnet_conditioning_image
+            ]
+            controlnet_conditioning_image = np.concatenate(controlnet_conditioning_image, axis=0)
+            controlnet_conditioning_image = np.array(controlnet_conditioning_image).astype(np.float32) / 255.0
+            controlnet_conditioning_image = controlnet_conditioning_image.transpose(0, 3, 1, 2)
+            controlnet_conditioning_image = torch.from_numpy(controlnet_conditioning_image)
+        elif isinstance(controlnet_conditioning_image[0], torch.Tensor):
+            controlnet_conditioning_image = torch.cat(controlnet_conditioning_image, dim=0)
+
+    image_batch_size = controlnet_conditioning_image.shape[0]
+
+    if image_batch_size == 1:
+        repeat_by = batch_size
+    else:
+        # image batch size is the same as prompt batch size
+        repeat_by = num_images_per_prompt
+
+    controlnet_conditioning_image = controlnet_conditioning_image.repeat_interleave(repeat_by, dim=0)
+
+    controlnet_conditioning_image = controlnet_conditioning_image.to(device=device, dtype=dtype)
+
+    if do_classifier_free_guidance:
+        controlnet_conditioning_image = torch.cat([controlnet_conditioning_image] * 2)
+
+    return controlnet_conditioning_image
+
+
+class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin):
+    """
+    Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_controlnet_conditioning_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+
+        if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list:
+            raise TypeError(
+                "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        elif image_is_tensor:
+            image_batch_size = image.shape[0]
+        elif image_is_pil_list:
+            image_batch_size = len(image)
+        elif image_is_tensor_list:
+            image_batch_size = len(image)
+        else:
+            raise ValueError("controlnet condition image is not valid")
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+        else:
+            raise ValueError("prompt or prompt_embeds are not valid")
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        controlnet_conditioning_image,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        strength=None,
+        controlnet_guidance_start=None,
+        controlnet_guidance_end=None,
+        controlnet_conditioning_scale=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # check controlnet condition image
+
+        if isinstance(self.controlnet, ControlNetModel):
+            self.check_controlnet_conditioning_image(controlnet_conditioning_image, prompt, prompt_embeds)
+        elif isinstance(self.controlnet, MultiControlNetModel):
+            if not isinstance(controlnet_conditioning_image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            if len(controlnet_conditioning_image) != len(self.controlnet.nets):
+                raise ValueError(
+                    "For multiple controlnets: `image` must have the same length as the number of controlnets."
+                )
+
+            for image_ in controlnet_conditioning_image:
+                self.check_controlnet_conditioning_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+
+        if isinstance(self.controlnet, ControlNetModel):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if isinstance(image, torch.Tensor):
+            if image.ndim != 3 and image.ndim != 4:
+                raise ValueError("`image` must have 3 or 4 dimensions")
+
+            if image.ndim == 3:
+                image_batch_size = 1
+                image_channels, image_height, image_width = image.shape
+            elif image.ndim == 4:
+                image_batch_size, image_channels, image_height, image_width = image.shape
+            else:
+                assert False
+
+            if image_channels != 3:
+                raise ValueError("`image` must have 3 channels")
+
+            if image.min() < -1 or image.max() > 1:
+                raise ValueError("`image` should be in range [-1, 1]")
+
+        if self.vae.config.latent_channels != self.unet.config.in_channels:
+            raise ValueError(
+                f"The config of `pipeline.unet` expects {self.unet.config.in_channels} but received"
+                f" latent channels: {self.vae.config.latent_channels},"
+                f" Please verify the config of `pipeline.unet` and the `pipeline.vae`"
+            )
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of `strength` should in [0.0, 1.0] but is {strength}")
+
+        if controlnet_guidance_start < 0 or controlnet_guidance_start > 1:
+            raise ValueError(
+                f"The value of `controlnet_guidance_start` should in [0.0, 1.0] but is {controlnet_guidance_start}"
+            )
+
+        if controlnet_guidance_end < 0 or controlnet_guidance_end > 1:
+            raise ValueError(
+                f"The value of `controlnet_guidance_end` should in [0.0, 1.0] but is {controlnet_guidance_end}"
+            )
+
+        if controlnet_guidance_start > controlnet_guidance_end:
+            raise ValueError(
+                "The value of `controlnet_guidance_start` should be less than `controlnet_guidance_end`, but got"
+                f" `controlnet_guidance_start` {controlnet_guidance_start} >= `controlnet_guidance_end` {controlnet_guidance_end}"
+            )
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if isinstance(generator, list):
+            init_latents = [
+                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+            ]
+            init_latents = torch.cat(init_latents, dim=0)
+        else:
+            init_latents = self.vae.encode(image).latent_dist.sample(generator)
+
+        init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    def _default_height_width(self, height, width, image):
+        if isinstance(image, list):
+            image = image[0]
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[3]
+
+            height = (height // 8) * 8  # round down to nearest multiple of 8
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[2]
+
+            width = (width // 8) * 8  # round down to nearest multiple of 8
+
+        return height, width
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        controlnet_conditioning_image: Union[
+            torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]
+        ] = None,
+        strength: float = 0.8,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        controlnet_guidance_start: float = 0.0,
+        controlnet_guidance_end: float = 1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`):
+                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                the type is specified as `torch.Tensor`, it is passed to ControlNet as is. PIL.Image.Image` can
+                also be accepted as an image. The control image is automatically resized to fit the output image.
+            strength (`float`, *optional*):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet.
+            controlnet_guidance_start ('float', *optional*, defaults to 0.0):
+                The percentage of total steps the controlnet starts applying. Must be between 0 and 1.
+            controlnet_guidance_end ('float', *optional*, defaults to 1.0):
+                The percentage of total steps the controlnet ends applying. Must be between 0 and 1. Must be greater
+                than `controlnet_guidance_start`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height, width = self._default_height_width(height, width, controlnet_conditioning_image)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            image,
+            controlnet_conditioning_image,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            strength,
+            controlnet_guidance_start,
+            controlnet_guidance_end,
+            controlnet_conditioning_scale,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 4. Prepare image, and controlnet_conditioning_image
+        image = prepare_image(image)
+
+        # condition image(s)
+        if isinstance(self.controlnet, ControlNetModel):
+            controlnet_conditioning_image = prepare_controlnet_conditioning_image(
+                controlnet_conditioning_image=controlnet_conditioning_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=self.controlnet.dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+            )
+        elif isinstance(self.controlnet, MultiControlNetModel):
+            controlnet_conditioning_images = []
+
+            for image_ in controlnet_conditioning_image:
+                image_ = prepare_controlnet_conditioning_image(
+                    controlnet_conditioning_image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=self.controlnet.dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                )
+
+                controlnet_conditioning_images.append(image_)
+
+            controlnet_conditioning_image = controlnet_conditioning_images
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                device,
+                generator,
+            )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # compute the percentage of total steps we are at
+                current_sampling_percent = i / len(timesteps)
+
+                if (
+                    current_sampling_percent < controlnet_guidance_start
+                    or current_sampling_percent > controlnet_guidance_end
+                ):
+                    # do not apply the controlnet
+                    down_block_res_samples = None
+                    mid_block_res_sample = None
+                else:
+                    # apply the controlnet
+                    down_block_res_samples, mid_block_res_sample = self.controlnet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        controlnet_cond=controlnet_conditioning_image,
+                        conditioning_scale=controlnet_conditioning_scale,
+                        return_dict=False,
+                    )
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+        elif output_type == "pil":
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+        else:
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/stable_diffusion_controlnet_inpaint.py b/diffusers/examples/community/stable_diffusion_controlnet_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..b473ffe79933d8ed9fb161359e34d78db90441e7
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_controlnet_inpaint.py
@@ -0,0 +1,1060 @@
+# Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, ControlNetModel, UNet2DConditionModel, logging
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    PIL_INTERPOLATION,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import numpy as np
+        >>> import torch
+        >>> from PIL import Image
+        >>> from stable_diffusion_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline
+
+        >>> from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
+        >>> from diffusers import ControlNetModel, UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+
+        >>> def ade_palette():
+                return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
+                        [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
+                        [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
+                        [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
+                        [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
+                        [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
+                        [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
+                        [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
+                        [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
+                        [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
+                        [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
+                        [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
+                        [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
+                        [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
+                        [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
+                        [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
+                        [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
+                        [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
+                        [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
+                        [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
+                        [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
+                        [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
+                        [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
+                        [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
+                        [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
+                        [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
+                        [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
+                        [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
+                        [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
+                        [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
+                        [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
+                        [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
+                        [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
+                        [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
+                        [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
+                        [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
+                        [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
+                        [102, 255, 0], [92, 0, 255]]
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-small")
+        >>> image_segmentor = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-small")
+
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16)
+
+        >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+                "runwayml/stable-diffusion-inpainting", controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
+            )
+
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> pipe.enable_xformers_memory_efficient_attention()
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> def image_to_seg(image):
+                pixel_values = image_processor(image, return_tensors="pt").pixel_values
+                with torch.no_grad():
+                    outputs = image_segmentor(pixel_values)
+                seg = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
+                color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)  # height, width, 3
+                palette = np.array(ade_palette())
+                for label, color in enumerate(palette):
+                    color_seg[seg == label, :] = color
+                color_seg = color_seg.astype(np.uint8)
+                seg_image = Image.fromarray(color_seg)
+                return seg_image
+
+        >>> image = load_image(
+                "https://github.com/CompVis/latent-diffusion/raw/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+            )
+
+        >>> mask_image = load_image(
+                "https://github.com/CompVis/latent-diffusion/raw/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+            )
+
+        >>> controlnet_conditioning_image = image_to_seg(image)
+
+        >>> image = pipe(
+                "Face of a yellow cat, high resolution, sitting on a park bench",
+                image,
+                mask_image,
+                controlnet_conditioning_image,
+                num_inference_steps=20,
+            ).images[0]
+
+        >>> image.save("out.png")
+        ```
+"""
+
+
+def prepare_image(image):
+    if isinstance(image, torch.Tensor):
+        # Batch single image
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        image = image.to(dtype=torch.float32)
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    return image
+
+
+def prepare_mask_image(mask_image):
+    if isinstance(mask_image, torch.Tensor):
+        if mask_image.ndim == 2:
+            # Batch and add channel dim for single mask
+            mask_image = mask_image.unsqueeze(0).unsqueeze(0)
+        elif mask_image.ndim == 3 and mask_image.shape[0] == 1:
+            # Single mask, the 0'th dimension is considered to be
+            # the existing batch size of 1
+            mask_image = mask_image.unsqueeze(0)
+        elif mask_image.ndim == 3 and mask_image.shape[0] != 1:
+            # Batch of mask, the 0'th dimension is considered to be
+            # the batching dimension
+            mask_image = mask_image.unsqueeze(1)
+
+        # Binarize mask
+        mask_image[mask_image < 0.5] = 0
+        mask_image[mask_image >= 0.5] = 1
+    else:
+        # preprocess mask
+        if isinstance(mask_image, (PIL.Image.Image, np.ndarray)):
+            mask_image = [mask_image]
+
+        if isinstance(mask_image, list) and isinstance(mask_image[0], PIL.Image.Image):
+            mask_image = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask_image], axis=0)
+            mask_image = mask_image.astype(np.float32) / 255.0
+        elif isinstance(mask_image, list) and isinstance(mask_image[0], np.ndarray):
+            mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0)
+
+        mask_image[mask_image < 0.5] = 0
+        mask_image[mask_image >= 0.5] = 1
+        mask_image = torch.from_numpy(mask_image)
+
+    return mask_image
+
+
+def prepare_controlnet_conditioning_image(
+    controlnet_conditioning_image,
+    width,
+    height,
+    batch_size,
+    num_images_per_prompt,
+    device,
+    dtype,
+    do_classifier_free_guidance,
+):
+    if not isinstance(controlnet_conditioning_image, torch.Tensor):
+        if isinstance(controlnet_conditioning_image, PIL.Image.Image):
+            controlnet_conditioning_image = [controlnet_conditioning_image]
+
+        if isinstance(controlnet_conditioning_image[0], PIL.Image.Image):
+            controlnet_conditioning_image = [
+                np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
+                for i in controlnet_conditioning_image
+            ]
+            controlnet_conditioning_image = np.concatenate(controlnet_conditioning_image, axis=0)
+            controlnet_conditioning_image = np.array(controlnet_conditioning_image).astype(np.float32) / 255.0
+            controlnet_conditioning_image = controlnet_conditioning_image.transpose(0, 3, 1, 2)
+            controlnet_conditioning_image = torch.from_numpy(controlnet_conditioning_image)
+        elif isinstance(controlnet_conditioning_image[0], torch.Tensor):
+            controlnet_conditioning_image = torch.cat(controlnet_conditioning_image, dim=0)
+
+    image_batch_size = controlnet_conditioning_image.shape[0]
+
+    if image_batch_size == 1:
+        repeat_by = batch_size
+    else:
+        # image batch size is the same as prompt batch size
+        repeat_by = num_images_per_prompt
+
+    controlnet_conditioning_image = controlnet_conditioning_image.repeat_interleave(repeat_by, dim=0)
+
+    controlnet_conditioning_image = controlnet_conditioning_image.to(device=device, dtype=dtype)
+
+    if do_classifier_free_guidance:
+        controlnet_conditioning_image = torch.cat([controlnet_conditioning_image] * 2)
+
+    return controlnet_conditioning_image
+
+
+class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusionMixin):
+    """
+    Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_controlnet_conditioning_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+
+        if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list:
+            raise TypeError(
+                "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        elif image_is_tensor:
+            image_batch_size = image.shape[0]
+        elif image_is_pil_list:
+            image_batch_size = len(image)
+        elif image_is_tensor_list:
+            image_batch_size = len(image)
+        else:
+            raise ValueError("controlnet condition image is not valid")
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+        else:
+            raise ValueError("prompt or prompt_embeds are not valid")
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        mask_image,
+        controlnet_conditioning_image,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        controlnet_conditioning_scale=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # check controlnet condition image
+        if isinstance(self.controlnet, ControlNetModel):
+            self.check_controlnet_conditioning_image(controlnet_conditioning_image, prompt, prompt_embeds)
+        elif isinstance(self.controlnet, MultiControlNetModel):
+            if not isinstance(controlnet_conditioning_image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+            if len(controlnet_conditioning_image) != len(self.controlnet.nets):
+                raise ValueError(
+                    "For multiple controlnets: `image` must have the same length as the number of controlnets."
+                )
+            for image_ in controlnet_conditioning_image:
+                self.check_controlnet_conditioning_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if isinstance(self.controlnet, ControlNetModel):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if isinstance(image, torch.Tensor) and not isinstance(mask_image, torch.Tensor):
+            raise TypeError("if `image` is a tensor, `mask_image` must also be a tensor")
+
+        if isinstance(image, PIL.Image.Image) and not isinstance(mask_image, PIL.Image.Image):
+            raise TypeError("if `image` is a PIL image, `mask_image` must also be a PIL image")
+
+        if isinstance(image, torch.Tensor):
+            if image.ndim != 3 and image.ndim != 4:
+                raise ValueError("`image` must have 3 or 4 dimensions")
+
+            if mask_image.ndim != 2 and mask_image.ndim != 3 and mask_image.ndim != 4:
+                raise ValueError("`mask_image` must have 2, 3, or 4 dimensions")
+
+            if image.ndim == 3:
+                image_batch_size = 1
+                image_channels, image_height, image_width = image.shape
+            elif image.ndim == 4:
+                image_batch_size, image_channels, image_height, image_width = image.shape
+            else:
+                assert False
+
+            if mask_image.ndim == 2:
+                mask_image_batch_size = 1
+                mask_image_channels = 1
+                mask_image_height, mask_image_width = mask_image.shape
+            elif mask_image.ndim == 3:
+                mask_image_channels = 1
+                mask_image_batch_size, mask_image_height, mask_image_width = mask_image.shape
+            elif mask_image.ndim == 4:
+                mask_image_batch_size, mask_image_channels, mask_image_height, mask_image_width = mask_image.shape
+
+            if image_channels != 3:
+                raise ValueError("`image` must have 3 channels")
+
+            if mask_image_channels != 1:
+                raise ValueError("`mask_image` must have 1 channel")
+
+            if image_batch_size != mask_image_batch_size:
+                raise ValueError("`image` and `mask_image` mush have the same batch sizes")
+
+            if image_height != mask_image_height or image_width != mask_image_width:
+                raise ValueError("`image` and `mask_image` must have the same height and width dimensions")
+
+            if image.min() < -1 or image.max() > 1:
+                raise ValueError("`image` should be in range [-1, 1]")
+
+            if mask_image.min() < 0 or mask_image.max() > 1:
+                raise ValueError("`mask_image` should be in range [0, 1]")
+        else:
+            mask_image_channels = 1
+            image_channels = 3
+
+        single_image_latent_channels = self.vae.config.latent_channels
+
+        total_latent_channels = single_image_latent_channels * 2 + mask_image_channels
+
+        if total_latent_channels != self.unet.config.in_channels:
+            raise ValueError(
+                f"The config of `pipeline.unet` expects {self.unet.config.in_channels} but received"
+                f" non inpainting latent channels: {single_image_latent_channels},"
+                f" mask channels: {mask_image_channels}, and masked image channels: {single_image_latent_channels}."
+                f" Please verify the config of `pipeline.unet` and the `mask_image` and `image` inputs."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        return latents
+
+    def prepare_mask_latents(self, mask_image, batch_size, height, width, dtype, device, do_classifier_free_guidance):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask_image = F.interpolate(mask_image, size=(height // self.vae_scale_factor, width // self.vae_scale_factor))
+        mask_image = mask_image.to(device=device, dtype=dtype)
+
+        # duplicate mask for each generation per prompt, using mps friendly method
+        if mask_image.shape[0] < batch_size:
+            if not batch_size % mask_image.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask_image.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask_image = mask_image.repeat(batch_size // mask_image.shape[0], 1, 1, 1)
+
+        mask_image = torch.cat([mask_image] * 2) if do_classifier_free_guidance else mask_image
+
+        mask_image_latents = mask_image
+
+        return mask_image_latents
+
+    def prepare_masked_image_latents(
+        self, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        # encode the mask image into latents space so we can concatenate it to the latents
+        if isinstance(generator, list):
+            masked_image_latents = [
+                self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i])
+                for i in range(batch_size)
+            ]
+            masked_image_latents = torch.cat(masked_image_latents, dim=0)
+        else:
+            masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
+        masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
+
+        # duplicate masked_image_latents for each generation per prompt, using mps friendly method
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+        return masked_image_latents
+
+    def _default_height_width(self, height, width, image):
+        if isinstance(image, list):
+            image = image[0]
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[3]
+
+            height = (height // 8) * 8  # round down to nearest multiple of 8
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[2]
+
+            width = (width // 8) * 8  # round down to nearest multiple of 8
+
+        return height, width
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
+        controlnet_conditioning_image: Union[
+            torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]
+        ] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`):
+                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                the type is specified as `torch.Tensor`, it is passed to ControlNet as is. PIL.Image.Image` can
+                also be accepted as an image. The control image is automatically resized to fit the output image.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height, width = self._default_height_width(height, width, controlnet_conditioning_image)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            image,
+            mask_image,
+            controlnet_conditioning_image,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            controlnet_conditioning_scale,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 4. Prepare mask, image, and controlnet_conditioning_image
+        image = prepare_image(image)
+
+        mask_image = prepare_mask_image(mask_image)
+
+        # condition image(s)
+        if isinstance(self.controlnet, ControlNetModel):
+            controlnet_conditioning_image = prepare_controlnet_conditioning_image(
+                controlnet_conditioning_image=controlnet_conditioning_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=self.controlnet.dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+            )
+        elif isinstance(self.controlnet, MultiControlNetModel):
+            controlnet_conditioning_images = []
+
+            for image_ in controlnet_conditioning_image:
+                image_ = prepare_controlnet_conditioning_image(
+                    controlnet_conditioning_image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=self.controlnet.dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                )
+                controlnet_conditioning_images.append(image_)
+
+            controlnet_conditioning_image = controlnet_conditioning_images
+        else:
+            assert False
+
+        masked_image = image * (mask_image < 0.5)
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        mask_image_latents = self.prepare_mask_latents(
+            mask_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            do_classifier_free_guidance,
+        )
+
+        masked_image_latents = self.prepare_masked_image_latents(
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                non_inpainting_latent_model_input = (
+                    torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                )
+
+                non_inpainting_latent_model_input = self.scheduler.scale_model_input(
+                    non_inpainting_latent_model_input, t
+                )
+
+                inpainting_latent_model_input = torch.cat(
+                    [non_inpainting_latent_model_input, mask_image_latents, masked_image_latents], dim=1
+                )
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    non_inpainting_latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    controlnet_cond=controlnet_conditioning_image,
+                    conditioning_scale=controlnet_conditioning_scale,
+                    return_dict=False,
+                )
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    inpainting_latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+        elif output_type == "pil":
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+        else:
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/stable_diffusion_controlnet_inpaint_img2img.py b/diffusers/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..8928f34239e3a32d7675fdf8590a17f0f0421451
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
@@ -0,0 +1,1037 @@
+# Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, ControlNetModel, UNet2DConditionModel, logging
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    PIL_INTERPOLATION,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import numpy as np
+        >>> import torch
+        >>> from PIL import Image
+        >>> from stable_diffusion_controlnet_inpaint_img2img import StableDiffusionControlNetInpaintImg2ImgPipeline
+
+        >>> from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
+        >>> from diffusers import ControlNetModel, UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+
+        >>> def ade_palette():
+                return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
+                        [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
+                        [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
+                        [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
+                        [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
+                        [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
+                        [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
+                        [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
+                        [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
+                        [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
+                        [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
+                        [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
+                        [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
+                        [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
+                        [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
+                        [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
+                        [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
+                        [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
+                        [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
+                        [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
+                        [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
+                        [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
+                        [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
+                        [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
+                        [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
+                        [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
+                        [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
+                        [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
+                        [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
+                        [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
+                        [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
+                        [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
+                        [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
+                        [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
+                        [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
+                        [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
+                        [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
+                        [102, 255, 0], [92, 0, 255]]
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-small")
+        >>> image_segmentor = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-small")
+
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16)
+
+        >>> pipe = StableDiffusionControlNetInpaintImg2ImgPipeline.from_pretrained(
+                "runwayml/stable-diffusion-inpainting", controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
+            )
+
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> pipe.enable_xformers_memory_efficient_attention()
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> def image_to_seg(image):
+                pixel_values = image_processor(image, return_tensors="pt").pixel_values
+                with torch.no_grad():
+                    outputs = image_segmentor(pixel_values)
+                seg = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
+                color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)  # height, width, 3
+                palette = np.array(ade_palette())
+                for label, color in enumerate(palette):
+                    color_seg[seg == label, :] = color
+                color_seg = color_seg.astype(np.uint8)
+                seg_image = Image.fromarray(color_seg)
+                return seg_image
+
+        >>> image = load_image(
+                "https://github.com/CompVis/latent-diffusion/raw/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+            )
+
+        >>> mask_image = load_image(
+                "https://github.com/CompVis/latent-diffusion/raw/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+            )
+
+        >>> controlnet_conditioning_image = image_to_seg(image)
+
+        >>> image = pipe(
+                "Face of a yellow cat, high resolution, sitting on a park bench",
+                image,
+                mask_image,
+                controlnet_conditioning_image,
+                num_inference_steps=20,
+            ).images[0]
+
+        >>> image.save("out.png")
+        ```
+"""
+
+
+def prepare_image(image):
+    if isinstance(image, torch.Tensor):
+        # Batch single image
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        image = image.to(dtype=torch.float32)
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    return image
+
+
+def prepare_mask_image(mask_image):
+    if isinstance(mask_image, torch.Tensor):
+        if mask_image.ndim == 2:
+            # Batch and add channel dim for single mask
+            mask_image = mask_image.unsqueeze(0).unsqueeze(0)
+        elif mask_image.ndim == 3 and mask_image.shape[0] == 1:
+            # Single mask, the 0'th dimension is considered to be
+            # the existing batch size of 1
+            mask_image = mask_image.unsqueeze(0)
+        elif mask_image.ndim == 3 and mask_image.shape[0] != 1:
+            # Batch of mask, the 0'th dimension is considered to be
+            # the batching dimension
+            mask_image = mask_image.unsqueeze(1)
+
+        # Binarize mask
+        mask_image[mask_image < 0.5] = 0
+        mask_image[mask_image >= 0.5] = 1
+    else:
+        # preprocess mask
+        if isinstance(mask_image, (PIL.Image.Image, np.ndarray)):
+            mask_image = [mask_image]
+
+        if isinstance(mask_image, list) and isinstance(mask_image[0], PIL.Image.Image):
+            mask_image = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask_image], axis=0)
+            mask_image = mask_image.astype(np.float32) / 255.0
+        elif isinstance(mask_image, list) and isinstance(mask_image[0], np.ndarray):
+            mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0)
+
+        mask_image[mask_image < 0.5] = 0
+        mask_image[mask_image >= 0.5] = 1
+        mask_image = torch.from_numpy(mask_image)
+
+    return mask_image
+
+
+def prepare_controlnet_conditioning_image(
+    controlnet_conditioning_image, width, height, batch_size, num_images_per_prompt, device, dtype
+):
+    if not isinstance(controlnet_conditioning_image, torch.Tensor):
+        if isinstance(controlnet_conditioning_image, PIL.Image.Image):
+            controlnet_conditioning_image = [controlnet_conditioning_image]
+
+        if isinstance(controlnet_conditioning_image[0], PIL.Image.Image):
+            controlnet_conditioning_image = [
+                np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
+                for i in controlnet_conditioning_image
+            ]
+            controlnet_conditioning_image = np.concatenate(controlnet_conditioning_image, axis=0)
+            controlnet_conditioning_image = np.array(controlnet_conditioning_image).astype(np.float32) / 255.0
+            controlnet_conditioning_image = controlnet_conditioning_image.transpose(0, 3, 1, 2)
+            controlnet_conditioning_image = torch.from_numpy(controlnet_conditioning_image)
+        elif isinstance(controlnet_conditioning_image[0], torch.Tensor):
+            controlnet_conditioning_image = torch.cat(controlnet_conditioning_image, dim=0)
+
+    image_batch_size = controlnet_conditioning_image.shape[0]
+
+    if image_batch_size == 1:
+        repeat_by = batch_size
+    else:
+        # image batch size is the same as prompt batch size
+        repeat_by = num_images_per_prompt
+
+    controlnet_conditioning_image = controlnet_conditioning_image.repeat_interleave(repeat_by, dim=0)
+
+    controlnet_conditioning_image = controlnet_conditioning_image.to(device=device, dtype=dtype)
+
+    return controlnet_conditioning_image
+
+
+class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin):
+    """
+    Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: ControlNetModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        mask_image,
+        controlnet_conditioning_image,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        strength=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        controlnet_cond_image_is_pil = isinstance(controlnet_conditioning_image, PIL.Image.Image)
+        controlnet_cond_image_is_tensor = isinstance(controlnet_conditioning_image, torch.Tensor)
+        controlnet_cond_image_is_pil_list = isinstance(controlnet_conditioning_image, list) and isinstance(
+            controlnet_conditioning_image[0], PIL.Image.Image
+        )
+        controlnet_cond_image_is_tensor_list = isinstance(controlnet_conditioning_image, list) and isinstance(
+            controlnet_conditioning_image[0], torch.Tensor
+        )
+
+        if (
+            not controlnet_cond_image_is_pil
+            and not controlnet_cond_image_is_tensor
+            and not controlnet_cond_image_is_pil_list
+            and not controlnet_cond_image_is_tensor_list
+        ):
+            raise TypeError(
+                "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
+            )
+
+        if controlnet_cond_image_is_pil:
+            controlnet_cond_image_batch_size = 1
+        elif controlnet_cond_image_is_tensor:
+            controlnet_cond_image_batch_size = controlnet_conditioning_image.shape[0]
+        elif controlnet_cond_image_is_pil_list:
+            controlnet_cond_image_batch_size = len(controlnet_conditioning_image)
+        elif controlnet_cond_image_is_tensor_list:
+            controlnet_cond_image_batch_size = len(controlnet_conditioning_image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if controlnet_cond_image_batch_size != 1 and controlnet_cond_image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {controlnet_cond_image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+        if isinstance(image, torch.Tensor) and not isinstance(mask_image, torch.Tensor):
+            raise TypeError("if `image` is a tensor, `mask_image` must also be a tensor")
+
+        if isinstance(image, PIL.Image.Image) and not isinstance(mask_image, PIL.Image.Image):
+            raise TypeError("if `image` is a PIL image, `mask_image` must also be a PIL image")
+
+        if isinstance(image, torch.Tensor):
+            if image.ndim != 3 and image.ndim != 4:
+                raise ValueError("`image` must have 3 or 4 dimensions")
+
+            if mask_image.ndim != 2 and mask_image.ndim != 3 and mask_image.ndim != 4:
+                raise ValueError("`mask_image` must have 2, 3, or 4 dimensions")
+
+            if image.ndim == 3:
+                image_batch_size = 1
+                image_channels, image_height, image_width = image.shape
+            elif image.ndim == 4:
+                image_batch_size, image_channels, image_height, image_width = image.shape
+
+            if mask_image.ndim == 2:
+                mask_image_batch_size = 1
+                mask_image_channels = 1
+                mask_image_height, mask_image_width = mask_image.shape
+            elif mask_image.ndim == 3:
+                mask_image_channels = 1
+                mask_image_batch_size, mask_image_height, mask_image_width = mask_image.shape
+            elif mask_image.ndim == 4:
+                mask_image_batch_size, mask_image_channels, mask_image_height, mask_image_width = mask_image.shape
+
+            if image_channels != 3:
+                raise ValueError("`image` must have 3 channels")
+
+            if mask_image_channels != 1:
+                raise ValueError("`mask_image` must have 1 channel")
+
+            if image_batch_size != mask_image_batch_size:
+                raise ValueError("`image` and `mask_image` mush have the same batch sizes")
+
+            if image_height != mask_image_height or image_width != mask_image_width:
+                raise ValueError("`image` and `mask_image` must have the same height and width dimensions")
+
+            if image.min() < -1 or image.max() > 1:
+                raise ValueError("`image` should be in range [-1, 1]")
+
+            if mask_image.min() < 0 or mask_image.max() > 1:
+                raise ValueError("`mask_image` should be in range [0, 1]")
+        else:
+            mask_image_channels = 1
+            image_channels = 3
+
+        single_image_latent_channels = self.vae.config.latent_channels
+
+        total_latent_channels = single_image_latent_channels * 2 + mask_image_channels
+
+        if total_latent_channels != self.unet.config.in_channels:
+            raise ValueError(
+                f"The config of `pipeline.unet` expects {self.unet.config.in_channels} but received"
+                f" non inpainting latent channels: {single_image_latent_channels},"
+                f" mask channels: {mask_image_channels}, and masked image channels: {single_image_latent_channels}."
+                f" Please verify the config of `pipeline.unet` and the `mask_image` and `image` inputs."
+            )
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if isinstance(generator, list):
+            init_latents = [
+                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+            ]
+            init_latents = torch.cat(init_latents, dim=0)
+        else:
+            init_latents = self.vae.encode(image).latent_dist.sample(generator)
+
+        init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    def prepare_mask_latents(self, mask_image, batch_size, height, width, dtype, device, do_classifier_free_guidance):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask_image = F.interpolate(mask_image, size=(height // self.vae_scale_factor, width // self.vae_scale_factor))
+        mask_image = mask_image.to(device=device, dtype=dtype)
+
+        # duplicate mask for each generation per prompt, using mps friendly method
+        if mask_image.shape[0] < batch_size:
+            if not batch_size % mask_image.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask_image.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask_image = mask_image.repeat(batch_size // mask_image.shape[0], 1, 1, 1)
+
+        mask_image = torch.cat([mask_image] * 2) if do_classifier_free_guidance else mask_image
+
+        mask_image_latents = mask_image
+
+        return mask_image_latents
+
+    def prepare_masked_image_latents(
+        self, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        # encode the mask image into latents space so we can concatenate it to the latents
+        if isinstance(generator, list):
+            masked_image_latents = [
+                self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i])
+                for i in range(batch_size)
+            ]
+            masked_image_latents = torch.cat(masked_image_latents, dim=0)
+        else:
+            masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
+        masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
+
+        # duplicate masked_image_latents for each generation per prompt, using mps friendly method
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+        return masked_image_latents
+
+    def _default_height_width(self, height, width, image):
+        if isinstance(image, list):
+            image = image[0]
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[3]
+
+            height = (height // 8) * 8  # round down to nearest multiple of 8
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[2]
+
+            width = (width // 8) * 8  # round down to nearest multiple of 8
+
+        return height, width
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
+        controlnet_conditioning_image: Union[
+            torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]
+        ] = None,
+        strength: float = 0.8,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: float = 1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`):
+                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                the type is specified as `torch.Tensor`, it is passed to ControlNet as is. PIL.Image.Image` can
+                also be accepted as an image. The control image is automatically resized to fit the output image.
+            strength (`float`, *optional*):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height, width = self._default_height_width(height, width, controlnet_conditioning_image)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            image,
+            mask_image,
+            controlnet_conditioning_image,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            strength,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 4. Prepare mask, image, and controlnet_conditioning_image
+        image = prepare_image(image)
+
+        mask_image = prepare_mask_image(mask_image)
+
+        controlnet_conditioning_image = prepare_controlnet_conditioning_image(
+            controlnet_conditioning_image,
+            width,
+            height,
+            batch_size * num_images_per_prompt,
+            num_images_per_prompt,
+            device,
+            self.controlnet.dtype,
+        )
+
+        masked_image = image * (mask_image < 0.5)
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                device,
+                generator,
+            )
+
+        mask_image_latents = self.prepare_mask_latents(
+            mask_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            do_classifier_free_guidance,
+        )
+
+        masked_image_latents = self.prepare_masked_image_latents(
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance,
+        )
+
+        if do_classifier_free_guidance:
+            controlnet_conditioning_image = torch.cat([controlnet_conditioning_image] * 2)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                non_inpainting_latent_model_input = (
+                    torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                )
+
+                non_inpainting_latent_model_input = self.scheduler.scale_model_input(
+                    non_inpainting_latent_model_input, t
+                )
+
+                inpainting_latent_model_input = torch.cat(
+                    [non_inpainting_latent_model_input, mask_image_latents, masked_image_latents], dim=1
+                )
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    non_inpainting_latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    controlnet_cond=controlnet_conditioning_image,
+                    return_dict=False,
+                )
+
+                down_block_res_samples = [
+                    down_block_res_sample * controlnet_conditioning_scale
+                    for down_block_res_sample in down_block_res_samples
+                ]
+                mid_block_res_sample *= controlnet_conditioning_scale
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    inpainting_latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+        elif output_type == "pil":
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+        else:
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/stable_diffusion_controlnet_reference.py b/diffusers/examples/community/stable_diffusion_controlnet_reference.py
new file mode 100644
index 0000000000000000000000000000000000000000..577c7712e7714598f611a7fe07de04238dc4fe57
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_controlnet_reference.py
@@ -0,0 +1,838 @@
+# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236 and https://github.com/Mikubill/sd-webui-controlnet/discussions/1280
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+
+from diffusers import StableDiffusionControlNetPipeline
+from diffusers.models import ControlNetModel
+from diffusers.models.attention import BasicTransformerBlock
+from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.utils import logging
+from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import cv2
+        >>> import torch
+        >>> import numpy as np
+        >>> from PIL import Image
+        >>> from diffusers import UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+
+        >>> input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
+        >>> # get canny image
+        >>> image = cv2.Canny(np.array(input_image), 100, 200)
+        >>> image = image[:, :, None]
+        >>> image = np.concatenate([image, image, image], axis=2)
+        >>> canny_image = Image.fromarray(image)
+
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionControlNetReferencePipeline.from_pretrained(
+                "runwayml/stable-diffusion-v1-5",
+                controlnet=controlnet,
+                safety_checker=None,
+                torch_dtype=torch.float16
+                ).to('cuda:0')
+
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe_controlnet.scheduler.config)
+
+        >>> result_img = pipe(ref_image=input_image,
+                        prompt="1girl",
+                        image=canny_image,
+                        num_inference_steps=20,
+                        reference_attn=True,
+                        reference_adain=True).images[0]
+
+        >>> result_img.show()
+        ```
+"""
+
+
+def torch_dfs(model: torch.nn.Module):
+    result = [model]
+    for child in model.children():
+        result += torch_dfs(child)
+    return result
+
+
+class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeline):
+    def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
+        refimage = refimage.to(device=device, dtype=dtype)
+
+        # encode the mask image into latents space so we can concatenate it to the latents
+        if isinstance(generator, list):
+            ref_image_latents = [
+                self.vae.encode(refimage[i : i + 1]).latent_dist.sample(generator=generator[i])
+                for i in range(batch_size)
+            ]
+            ref_image_latents = torch.cat(ref_image_latents, dim=0)
+        else:
+            ref_image_latents = self.vae.encode(refimage).latent_dist.sample(generator=generator)
+        ref_image_latents = self.vae.config.scaling_factor * ref_image_latents
+
+        # duplicate mask and ref_image_latents for each generation per prompt, using mps friendly method
+        if ref_image_latents.shape[0] < batch_size:
+            if not batch_size % ref_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {ref_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            ref_image_latents = ref_image_latents.repeat(batch_size // ref_image_latents.shape[0], 1, 1, 1)
+
+        ref_image_latents = torch.cat([ref_image_latents] * 2) if do_classifier_free_guidance else ref_image_latents
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
+        return ref_image_latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[
+            torch.Tensor,
+            PIL.Image.Image,
+            np.ndarray,
+            List[torch.Tensor],
+            List[PIL.Image.Image],
+            List[np.ndarray],
+        ] = None,
+        ref_image: Union[torch.Tensor, PIL.Image.Image] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        attention_auto_machine_weight: float = 1.0,
+        gn_auto_machine_weight: float = 1.0,
+        style_fidelity: float = 0.5,
+        reference_attn: bool = True,
+        reference_adain: bool = True,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
+                also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
+                height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
+                specified in init, images must be passed as a list such that each element of the list can be correctly
+                batched for input to a single controlnet.
+            ref_image (`torch.Tensor`, `PIL.Image.Image`):
+                The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If
+                the type is specified as `torch.Tensor`, it is passed to Reference Control as is. `PIL.Image.Image` can
+                also be accepted as an image.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+                corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
+                you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
+            attention_auto_machine_weight (`float`):
+                Weight of using reference query for self attention's context.
+                If attention_auto_machine_weight=1.0, use reference query for all self attention's context.
+            gn_auto_machine_weight (`float`):
+                Weight of using reference adain. If gn_auto_machine_weight=2.0, use all reference adain plugins.
+            style_fidelity (`float`):
+                style fidelity of ref_uncond_xt. If style_fidelity=1.0, control more important,
+                elif style_fidelity=0.0, prompt more important, else balanced.
+            reference_attn (`bool`):
+                Whether to use reference query for self attention's context.
+            reference_adain (`bool`):
+                Whether to use reference adain.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        assert reference_attn or reference_adain, "`reference_attn` or `reference_adain` must be True."
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            image,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            controlnet_conditioning_scale,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+
+        # 4. Prepare image
+        if isinstance(controlnet, ControlNetModel):
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+            height, width = image.shape[-2:]
+        elif isinstance(controlnet, MultiControlNetModel):
+            images = []
+
+            for image_ in image:
+                image_ = self.prepare_image(
+                    image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                images.append(image_)
+
+            image = images
+            height, width = image[0].shape[-2:]
+        else:
+            assert False
+
+        # 5. Preprocess reference image
+        ref_image = self.prepare_image(
+            image=ref_image,
+            width=width,
+            height=height,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=prompt_embeds.dtype,
+        )
+
+        # 6. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 7. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 8. Prepare reference latent variables
+        ref_image_latents = self.prepare_ref_latents(
+            ref_image,
+            batch_size * num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance,
+        )
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Modify self attention and group norm
+        MODE = "write"
+        uc_mask = (
+            torch.Tensor([1] * batch_size * num_images_per_prompt + [0] * batch_size * num_images_per_prompt)
+            .type_as(ref_image_latents)
+            .bool()
+        )
+
+        def hacked_basic_transformer_inner_forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.Tensor] = None,
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+            timestep: Optional[torch.LongTensor] = None,
+            cross_attention_kwargs: Dict[str, Any] = None,
+            class_labels: Optional[torch.LongTensor] = None,
+        ):
+            if self.use_ada_layer_norm:
+                norm_hidden_states = self.norm1(hidden_states, timestep)
+            elif self.use_ada_layer_norm_zero:
+                norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                    hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+                )
+            else:
+                norm_hidden_states = self.norm1(hidden_states)
+
+            # 1. Self-Attention
+            cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+            if self.only_cross_attention:
+                attn_output = self.attn1(
+                    norm_hidden_states,
+                    encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                    attention_mask=attention_mask,
+                    **cross_attention_kwargs,
+                )
+            else:
+                if MODE == "write":
+                    self.bank.append(norm_hidden_states.detach().clone())
+                    attn_output = self.attn1(
+                        norm_hidden_states,
+                        encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                        attention_mask=attention_mask,
+                        **cross_attention_kwargs,
+                    )
+                if MODE == "read":
+                    if attention_auto_machine_weight > self.attn_weight:
+                        attn_output_uc = self.attn1(
+                            norm_hidden_states,
+                            encoder_hidden_states=torch.cat([norm_hidden_states] + self.bank, dim=1),
+                            # attention_mask=attention_mask,
+                            **cross_attention_kwargs,
+                        )
+                        attn_output_c = attn_output_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            attn_output_c[uc_mask] = self.attn1(
+                                norm_hidden_states[uc_mask],
+                                encoder_hidden_states=norm_hidden_states[uc_mask],
+                                **cross_attention_kwargs,
+                            )
+                        attn_output = style_fidelity * attn_output_c + (1.0 - style_fidelity) * attn_output_uc
+                        self.bank.clear()
+                    else:
+                        attn_output = self.attn1(
+                            norm_hidden_states,
+                            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                            attention_mask=attention_mask,
+                            **cross_attention_kwargs,
+                        )
+            if self.use_ada_layer_norm_zero:
+                attn_output = gate_msa.unsqueeze(1) * attn_output
+            hidden_states = attn_output + hidden_states
+
+            if self.attn2 is not None:
+                norm_hidden_states = (
+                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+                )
+
+                # 2. Cross-Attention
+                attn_output = self.attn2(
+                    norm_hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=encoder_attention_mask,
+                    **cross_attention_kwargs,
+                )
+                hidden_states = attn_output + hidden_states
+
+            # 3. Feed-forward
+            norm_hidden_states = self.norm3(hidden_states)
+
+            if self.use_ada_layer_norm_zero:
+                norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+            ff_output = self.ff(norm_hidden_states)
+
+            if self.use_ada_layer_norm_zero:
+                ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+            hidden_states = ff_output + hidden_states
+
+            return hidden_states
+
+        def hacked_mid_forward(self, *args, **kwargs):
+            eps = 1e-6
+            x = self.original_forward(*args, **kwargs)
+            if MODE == "write":
+                if gn_auto_machine_weight >= self.gn_weight:
+                    var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+                    self.mean_bank.append(mean)
+                    self.var_bank.append(var)
+            if MODE == "read":
+                if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                    var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+                    std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                    mean_acc = sum(self.mean_bank) / float(len(self.mean_bank))
+                    var_acc = sum(self.var_bank) / float(len(self.var_bank))
+                    std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                    x_uc = (((x - mean) / std) * std_acc) + mean_acc
+                    x_c = x_uc.clone()
+                    if do_classifier_free_guidance and style_fidelity > 0:
+                        x_c[uc_mask] = x[uc_mask]
+                    x = style_fidelity * x_c + (1.0 - style_fidelity) * x_uc
+                self.mean_bank = []
+                self.var_bank = []
+            return x
+
+        def hack_CrossAttnDownBlock2D_forward(
+            self,
+            hidden_states: torch.Tensor,
+            temb: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+        ):
+            eps = 1e-6
+
+            # TODO(Patrick, William) - attention mask is not used
+            output_states = ()
+
+            for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+                output_states = output_states + (hidden_states,)
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.downsamplers is not None:
+                for downsampler in self.downsamplers:
+                    hidden_states = downsampler(hidden_states)
+
+                output_states = output_states + (hidden_states,)
+
+            return hidden_states, output_states
+
+        def hacked_DownBlock2D_forward(self, hidden_states, temb=None, *args, **kwargs):
+            eps = 1e-6
+
+            output_states = ()
+
+            for i, resnet in enumerate(self.resnets):
+                hidden_states = resnet(hidden_states, temb)
+
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+                output_states = output_states + (hidden_states,)
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.downsamplers is not None:
+                for downsampler in self.downsamplers:
+                    hidden_states = downsampler(hidden_states)
+
+                output_states = output_states + (hidden_states,)
+
+            return hidden_states, output_states
+
+        def hacked_CrossAttnUpBlock2D_forward(
+            self,
+            hidden_states: torch.Tensor,
+            res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+            temb: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.Tensor] = None,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            upsample_size: Optional[int] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+        ):
+            eps = 1e-6
+            # TODO(Patrick, William) - attention mask is not used
+            for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+                # pop res hidden states
+                res_hidden_states = res_hidden_states_tuple[-1]
+                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.upsamplers is not None:
+                for upsampler in self.upsamplers:
+                    hidden_states = upsampler(hidden_states, upsample_size)
+
+            return hidden_states
+
+        def hacked_UpBlock2D_forward(
+            self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, *args, **kwargs
+        ):
+            eps = 1e-6
+            for i, resnet in enumerate(self.resnets):
+                # pop res hidden states
+                res_hidden_states = res_hidden_states_tuple[-1]
+                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+                hidden_states = resnet(hidden_states, temb)
+
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.upsamplers is not None:
+                for upsampler in self.upsamplers:
+                    hidden_states = upsampler(hidden_states, upsample_size)
+
+            return hidden_states
+
+        if reference_attn:
+            attn_modules = [module for module in torch_dfs(self.unet) if isinstance(module, BasicTransformerBlock)]
+            attn_modules = sorted(attn_modules, key=lambda x: -x.norm1.normalized_shape[0])
+
+            for i, module in enumerate(attn_modules):
+                module._original_inner_forward = module.forward
+                module.forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock)
+                module.bank = []
+                module.attn_weight = float(i) / float(len(attn_modules))
+
+        if reference_adain:
+            gn_modules = [self.unet.mid_block]
+            self.unet.mid_block.gn_weight = 0
+
+            down_blocks = self.unet.down_blocks
+            for w, module in enumerate(down_blocks):
+                module.gn_weight = 1.0 - float(w) / float(len(down_blocks))
+                gn_modules.append(module)
+
+            up_blocks = self.unet.up_blocks
+            for w, module in enumerate(up_blocks):
+                module.gn_weight = float(w) / float(len(up_blocks))
+                gn_modules.append(module)
+
+            for i, module in enumerate(gn_modules):
+                if getattr(module, "original_forward", None) is None:
+                    module.original_forward = module.forward
+                if i == 0:
+                    # mid_block
+                    module.forward = hacked_mid_forward.__get__(module, torch.nn.Module)
+                elif isinstance(module, CrossAttnDownBlock2D):
+                    module.forward = hack_CrossAttnDownBlock2D_forward.__get__(module, CrossAttnDownBlock2D)
+                elif isinstance(module, DownBlock2D):
+                    module.forward = hacked_DownBlock2D_forward.__get__(module, DownBlock2D)
+                elif isinstance(module, CrossAttnUpBlock2D):
+                    module.forward = hacked_CrossAttnUpBlock2D_forward.__get__(module, CrossAttnUpBlock2D)
+                elif isinstance(module, UpBlock2D):
+                    module.forward = hacked_UpBlock2D_forward.__get__(module, UpBlock2D)
+                module.mean_bank = []
+                module.var_bank = []
+                module.gn_weight *= 2
+
+        # 11. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                if guess_mode and do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=image,
+                    conditioning_scale=controlnet_conditioning_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+
+                if guess_mode and do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # ref only part
+                noise = randn_tensor(
+                    ref_image_latents.shape, generator=generator, device=device, dtype=ref_image_latents.dtype
+                )
+                ref_xt = self.scheduler.add_noise(
+                    ref_image_latents,
+                    noise,
+                    t.reshape(
+                        1,
+                    ),
+                )
+                ref_xt = self.scheduler.scale_model_input(ref_xt, t)
+
+                MODE = "write"
+                self.unet(
+                    ref_xt,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )
+
+                # predict the noise residual
+                MODE = "read"
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/stable_diffusion_ipex.py b/diffusers/examples/community/stable_diffusion_ipex.py
new file mode 100644
index 0000000000000000000000000000000000000000..123892f6229a34da16a87708c8014e80f3ee3385
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_ipex.py
@@ -0,0 +1,754 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import intel_extension_for_pytorch as ipex
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    deprecate,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionPipeline
+
+        >>> pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex")
+
+        >>> # For Float32
+        >>> pipe.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512) #value of image height/width should be consistent with the pipeline inference
+        >>> # For BFloat16
+        >>> pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512) #value of image height/width should be consistent with the pipeline inference
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> # For Float32
+        >>> image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()'
+        >>> # For BFloat16
+        >>> with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+        >>>     image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()'
+        ```
+"""
+
+
+class StableDiffusionIPEXPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion on IPEX.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def get_input_example(self, prompt, height=None, width=None, guidance_scale=7.5, num_images_per_prompt=1):
+        prompt_embeds = None
+        negative_prompt_embeds = None
+        negative_prompt = None
+        callback_steps = 1
+        generator = None
+        latents = None
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+
+        device = "cpu"
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 5. Prepare latent variables
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            self.unet.config.in_channels,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        dummy = torch.ones(1, dtype=torch.int32)
+        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+        latent_model_input = self.scheduler.scale_model_input(latent_model_input, dummy)
+
+        unet_input_example = (latent_model_input, dummy, prompt_embeds)
+        vae_decoder_input_example = latents
+
+        return unet_input_example, vae_decoder_input_example
+
+    def prepare_for_ipex(self, promt, dtype=torch.float32, height=None, width=None, guidance_scale=7.5):
+        self.unet = self.unet.to(memory_format=torch.channels_last)
+        self.vae.decoder = self.vae.decoder.to(memory_format=torch.channels_last)
+        self.text_encoder = self.text_encoder.to(memory_format=torch.channels_last)
+        if self.safety_checker is not None:
+            self.safety_checker = self.safety_checker.to(memory_format=torch.channels_last)
+
+        unet_input_example, vae_decoder_input_example = self.get_input_example(promt, height, width, guidance_scale)
+
+        # optimize with ipex
+        if dtype == torch.bfloat16:
+            self.unet = ipex.optimize(self.unet.eval(), dtype=torch.bfloat16, inplace=True)
+            self.vae.decoder = ipex.optimize(self.vae.decoder.eval(), dtype=torch.bfloat16, inplace=True)
+            self.text_encoder = ipex.optimize(self.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
+            if self.safety_checker is not None:
+                self.safety_checker = ipex.optimize(self.safety_checker.eval(), dtype=torch.bfloat16, inplace=True)
+        elif dtype == torch.float32:
+            self.unet = ipex.optimize(
+                self.unet.eval(),
+                dtype=torch.float32,
+                inplace=True,
+                weights_prepack=True,
+                auto_kernel_selection=False,
+            )
+            self.vae.decoder = ipex.optimize(
+                self.vae.decoder.eval(),
+                dtype=torch.float32,
+                inplace=True,
+                weights_prepack=True,
+                auto_kernel_selection=False,
+            )
+            self.text_encoder = ipex.optimize(
+                self.text_encoder.eval(),
+                dtype=torch.float32,
+                inplace=True,
+                weights_prepack=True,
+                auto_kernel_selection=False,
+            )
+            if self.safety_checker is not None:
+                self.safety_checker = ipex.optimize(
+                    self.safety_checker.eval(),
+                    dtype=torch.float32,
+                    inplace=True,
+                    weights_prepack=True,
+                    auto_kernel_selection=False,
+                )
+        else:
+            raise ValueError(" The value of 'dtype' should be 'torch.bfloat16' or 'torch.float32' !")
+
+        # trace unet model to get better performance on IPEX
+        with torch.cpu.amp.autocast(enabled=dtype == torch.bfloat16), torch.no_grad():
+            unet_trace_model = torch.jit.trace(self.unet, unet_input_example, check_trace=False, strict=False)
+            unet_trace_model = torch.jit.freeze(unet_trace_model)
+        self.unet.forward = unet_trace_model.forward
+
+        # trace vae.decoder model to get better performance on IPEX
+        with torch.cpu.amp.autocast(enabled=dtype == torch.bfloat16), torch.no_grad():
+            ave_decoder_trace_model = torch.jit.trace(
+                self.vae.decoder, vae_decoder_input_example, check_trace=False, strict=False
+            )
+            ave_decoder_trace_model = torch.jit.freeze(ave_decoder_trace_model)
+        self.vae.decoder.forward = ave_decoder_trace_model.forward
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds)["sample"]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+        elif output_type == "pil":
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+        else:
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/stable_diffusion_mega.py b/diffusers/examples/community/stable_diffusion_mega.py
new file mode 100644
index 0000000000000000000000000000000000000000..95b4b03e4de174c48ab9a687b20b7baee740879f
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_mega.py
@@ -0,0 +1,202 @@
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionImg2ImgPipeline,
+    StableDiffusionInpaintPipelineLegacy,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.configuration_utils import FrozenDict
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.utils import deprecate, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class StableDiffusionMegaPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionMegaSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    @property
+    def components(self) -> Dict[str, Any]:
+        return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}
+
+    @torch.no_grad()
+    def inpaint(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image],
+        mask_image: Union[torch.Tensor, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
+        return StableDiffusionInpaintPipelineLegacy(**self.components)(
+            prompt=prompt,
+            image=image,
+            mask_image=mask_image,
+            strength=strength,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+        )
+
+    @torch.no_grad()
+    def img2img(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
+        return StableDiffusionImg2ImgPipeline(**self.components)(
+            prompt=prompt,
+            image=image,
+            strength=strength,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
+
+    @torch.no_grad()
+    def text2img(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        # For more information on how this function https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline
+        return StableDiffusionPipeline(**self.components)(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
diff --git a/diffusers/examples/community/stable_diffusion_reference.py b/diffusers/examples/community/stable_diffusion_reference.py
new file mode 100644
index 0000000000000000000000000000000000000000..efb0fa89dbfc2e2e410de0ffb95594cd54300836
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_reference.py
@@ -0,0 +1,1465 @@
+# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236 and https://github.com/Mikubill/sd-webui-controlnet/discussions/1280
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
+from diffusers.configuration_utils import FrozenDict, deprecate
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models.attention import BasicTransformerBlock
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+
+        >>> input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
+        >>> pipe = StableDiffusionReferencePipeline.from_pretrained(
+                "runwayml/stable-diffusion-v1-5",
+                safety_checker=None,
+                torch_dtype=torch.float16
+                ).to('cuda:0')
+
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+        >>> result_img = pipe(ref_image=input_image,
+                        prompt="1girl",
+                        num_inference_steps=20,
+                        reference_attn=True,
+                        reference_adain=True).images[0]
+
+        >>> result_img.show()
+        ```
+"""
+
+
+def torch_dfs(model: torch.nn.Module):
+    r"""
+    Performs a depth-first search on the given PyTorch model and returns a list of all its child modules.
+
+    Args:
+        model (torch.nn.Module): The PyTorch model to perform the depth-first search on.
+
+    Returns:
+        list: A list of all child modules of the given model.
+    """
+    result = [model]
+    for child in model.children():
+        result += torch_dfs(child)
+    return result
+
+
+class StableDiffusionReferencePipeline(
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+):
+    r"""
+    Pipeline for Stable Diffusion Reference.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+    - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+    - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+    - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+    - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+    - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration"
+                " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
+                " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
+                " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
+                " Hub, it would be very nice if you could open a Pull request for the"
+                " `scheduler/scheduler_config.json` file"
+            )
+            deprecate(
+                "skip_prk_steps not set",
+                "1.0.0",
+                deprecation_message,
+                standard_warn=False,
+            )
+            new_config = dict(scheduler.config)
+            new_config["skip_prk_steps"] = True
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+        # Check shapes, assume num_channels_latents == 4, num_channels_mask == 1, num_channels_masked == 4
+        if unet.config.in_channels != 4:
+            logger.warning(
+                f"You have loaded a UNet with {unet.config.in_channels} input channels, whereas by default,"
+                f" {self.__class__} assumes that `pipeline.unet` has 4 input channels: 4 for `num_channels_latents`,"
+                ". If you did not intend to modify"
+                " this behavior, please check whether you have loaded the right checkpoint."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _default_height_width(
+        self,
+        height: Optional[int],
+        width: Optional[int],
+        image: Union[PIL.Image.Image, torch.Tensor, List[PIL.Image.Image]],
+    ) -> Tuple[int, int]:
+        r"""
+        Calculate the default height and width for the given image.
+
+        Args:
+            height (int or None): The desired height of the image. If None, the height will be determined based on the input image.
+            width (int or None): The desired width of the image. If None, the width will be determined based on the input image.
+            image (PIL.Image.Image or torch.Tensor or list[PIL.Image.Image]): The input image or a list of images.
+
+        Returns:
+            Tuple[int, int]: A tuple containing the calculated height and width.
+
+        """
+        # NOTE: It is possible that a list of images have different
+        # dimensions for each image, so just checking the first image
+        # is not _exactly_ correct, but it is simple.
+        while isinstance(image, list):
+            image = image[0]
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[2]
+
+            height = (height // 8) * 8  # round down to nearest multiple of 8
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[3]
+
+            width = (width // 8) * 8  # round down to nearest multiple of 8
+
+        return height, width
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt: Optional[Union[str, List[str]]],
+        height: int,
+        width: int,
+        callback_steps: Optional[int],
+        negative_prompt: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[torch.Tensor] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        callback_on_step_end_tensor_inputs: Optional[List[str]] = None,
+    ) -> None:
+        """
+        Check the validity of the input arguments for the diffusion model.
+
+        Args:
+            prompt (Optional[Union[str, List[str]]]): The prompt text or list of prompt texts.
+            height (int): The height of the input image.
+            width (int): The width of the input image.
+            callback_steps (Optional[int]): The number of steps to perform the callback on.
+            negative_prompt (Optional[str]): The negative prompt text.
+            prompt_embeds (Optional[torch.Tensor]): The prompt embeddings.
+            negative_prompt_embeds (Optional[torch.Tensor]): The negative prompt embeddings.
+            ip_adapter_image (Optional[torch.Tensor]): The input adapter image.
+            ip_adapter_image_embeds (Optional[torch.Tensor]): The input adapter image embeddings.
+            callback_on_step_end_tensor_inputs (Optional[List[str]]): The list of tensor inputs to perform the callback on.
+
+        Raises:
+            ValueError: If `height` or `width` is not divisible by 8.
+            ValueError: If `callback_steps` is not a positive integer.
+            ValueError: If `callback_on_step_end_tensor_inputs` contains invalid tensor inputs.
+            ValueError: If both `prompt` and `prompt_embeds` are provided.
+            ValueError: If neither `prompt` nor `prompt_embeds` are provided.
+            ValueError: If `prompt` is not of type `str` or `list`.
+            ValueError: If both `negative_prompt` and `negative_prompt_embeds` are provided.
+            ValueError: If both `prompt_embeds` and `negative_prompt_embeds` are provided and have different shapes.
+            ValueError: If both `ip_adapter_image` and `ip_adapter_image_embeds` are provided.
+
+        Returns:
+            None
+        """
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        num_images_per_prompt: int,
+        do_classifier_free_guidance: bool,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Encodes the prompt into embeddings.
+
+        Args:
+            prompt (Union[str, List[str]]): The prompt text or a list of prompt texts.
+            device (torch.device): The device to use for encoding.
+            num_images_per_prompt (int): The number of images per prompt.
+            do_classifier_free_guidance (bool): Whether to use classifier-free guidance.
+            negative_prompt (Optional[Union[str, List[str]]], optional): The negative prompt text or a list of negative prompt texts. Defaults to None.
+            prompt_embeds (Optional[torch.Tensor], optional): The prompt embeddings. Defaults to None.
+            negative_prompt_embeds (Optional[torch.Tensor], optional): The negative prompt embeddings. Defaults to None.
+            lora_scale (Optional[float], optional): The LoRA scale. Defaults to None.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            torch.Tensor: The encoded prompt embeddings.
+        """
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Optional[str],
+        device: torch.device,
+        num_images_per_prompt: int,
+        do_classifier_free_guidance: bool,
+        negative_prompt: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ) -> torch.Tensor:
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int,
+        height: int,
+        width: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        generator: Union[torch.Generator, List[torch.Generator]],
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        r"""
+        Prepare the latent vectors for diffusion.
+
+        Args:
+            batch_size (int): The number of samples in the batch.
+            num_channels_latents (int): The number of channels in the latent vectors.
+            height (int): The height of the latent vectors.
+            width (int): The width of the latent vectors.
+            dtype (torch.dtype): The data type of the latent vectors.
+            device (torch.device): The device to place the latent vectors on.
+            generator (Union[torch.Generator, List[torch.Generator]]): The generator(s) to use for random number generation.
+            latents (Optional[torch.Tensor]): The pre-existing latent vectors. If None, new latent vectors will be generated.
+
+        Returns:
+            torch.Tensor: The prepared latent vectors.
+        """
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(
+        self, generator: Union[torch.Generator, List[torch.Generator]], eta: float
+    ) -> Dict[str, Any]:
+        r"""
+        Prepare extra keyword arguments for the scheduler step.
+
+        Args:
+            generator (Union[torch.Generator, List[torch.Generator]]): The generator used for sampling.
+            eta (float): The value of eta (η) used with the DDIMScheduler. Should be between 0 and 1.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the extra keyword arguments for the scheduler step.
+        """
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def prepare_image(
+        self,
+        image: Union[torch.Tensor, PIL.Image.Image, List[Union[torch.Tensor, PIL.Image.Image]]],
+        width: int,
+        height: int,
+        batch_size: int,
+        num_images_per_prompt: int,
+        device: torch.device,
+        dtype: torch.dtype,
+        do_classifier_free_guidance: bool = False,
+        guess_mode: bool = False,
+    ) -> torch.Tensor:
+        r"""
+        Prepares the input image for processing.
+
+        Args:
+            image (torch.Tensor or PIL.Image.Image or list): The input image(s).
+            width (int): The desired width of the image.
+            height (int): The desired height of the image.
+            batch_size (int): The batch size for processing.
+            num_images_per_prompt (int): The number of images per prompt.
+            device (torch.device): The device to use for processing.
+            dtype (torch.dtype): The data type of the image.
+            do_classifier_free_guidance (bool, optional): Whether to perform classifier-free guidance. Defaults to False.
+            guess_mode (bool, optional): Whether to use guess mode. Defaults to False.
+
+        Returns:
+            torch.Tensor: The prepared image for processing.
+        """
+        if not isinstance(image, torch.Tensor):
+            if isinstance(image, PIL.Image.Image):
+                image = [image]
+
+            if isinstance(image[0], PIL.Image.Image):
+                images = []
+
+                for image_ in image:
+                    image_ = image_.convert("RGB")
+                    image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
+                    image_ = np.array(image_)
+                    image_ = image_[None, :]
+                    images.append(image_)
+
+                image = images
+
+                image = np.concatenate(image, axis=0)
+                image = np.array(image).astype(np.float32) / 255.0
+                image = (image - 0.5) / 0.5
+                image = image.transpose(0, 3, 1, 2)
+                image = torch.from_numpy(image)
+            elif isinstance(image[0], torch.Tensor):
+                image = torch.cat(image, dim=0)
+
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    def prepare_ref_latents(
+        self,
+        refimage: torch.Tensor,
+        batch_size: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        generator: Union[int, List[int]],
+        do_classifier_free_guidance: bool,
+    ) -> torch.Tensor:
+        r"""
+        Prepares reference latents for generating images.
+
+        Args:
+            refimage (torch.Tensor): The reference image.
+            batch_size (int): The desired batch size.
+            dtype (torch.dtype): The data type of the tensors.
+            device (torch.device): The device to perform computations on.
+            generator (int or list): The generator index or a list of generator indices.
+            do_classifier_free_guidance (bool): Whether to use classifier-free guidance.
+
+        Returns:
+            torch.Tensor: The prepared reference latents.
+        """
+        refimage = refimage.to(device=device, dtype=dtype)
+
+        # encode the mask image into latents space so we can concatenate it to the latents
+        if isinstance(generator, list):
+            ref_image_latents = [
+                self.vae.encode(refimage[i : i + 1]).latent_dist.sample(generator=generator[i])
+                for i in range(batch_size)
+            ]
+            ref_image_latents = torch.cat(ref_image_latents, dim=0)
+        else:
+            ref_image_latents = self.vae.encode(refimage).latent_dist.sample(generator=generator)
+        ref_image_latents = self.vae.config.scaling_factor * ref_image_latents
+
+        # duplicate mask and ref_image_latents for each generation per prompt, using mps friendly method
+        if ref_image_latents.shape[0] < batch_size:
+            if not batch_size % ref_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {ref_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            ref_image_latents = ref_image_latents.repeat(batch_size // ref_image_latents.shape[0], 1, 1, 1)
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
+        return ref_image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(
+        self, image: Union[torch.Tensor, PIL.Image.Image], device: torch.device, dtype: torch.dtype
+    ) -> Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]:
+        r"""
+        Runs the safety checker on the given image.
+
+        Args:
+            image (Union[torch.Tensor, PIL.Image.Image]): The input image to be checked.
+            device (torch.device): The device to run the safety checker on.
+            dtype (torch.dtype): The data type of the input image.
+
+        Returns:
+            (image, has_nsfw_concept) Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]: A tuple containing the processed image and
+            a boolean indicating whether the image has a NSFW (Not Safe for Work) concept.
+        """
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        ref_image: Union[torch.Tensor, PIL.Image.Image] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        attention_auto_machine_weight: float = 1.0,
+        gn_auto_machine_weight: float = 1.0,
+        style_fidelity: float = 0.5,
+        reference_attn: bool = True,
+        reference_adain: bool = True,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            ref_image (`torch.Tensor`, `PIL.Image.Image`):
+                The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If
+                the type is specified as `torch.Tensor`, it is passed to Reference Control as is. `PIL.Image.Image` can
+                also be accepted as an image.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            attention_auto_machine_weight (`float`):
+                Weight of using reference query for self attention's context.
+                If attention_auto_machine_weight=1.0, use reference query for all self attention's context.
+            gn_auto_machine_weight (`float`):
+                Weight of using reference adain. If gn_auto_machine_weight=2.0, use all reference adain plugins.
+            style_fidelity (`float`):
+                style fidelity of ref_uncond_xt. If style_fidelity=1.0, control more important,
+                elif style_fidelity=0.0, prompt more important, else balanced.
+            reference_attn (`bool`):
+                Whether to use reference query for self attention's context.
+            reference_adain (`bool`):
+                Whether to use reference adain.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        assert reference_attn or reference_adain, "`reference_attn` or `reference_adain` must be True."
+
+        # 0. Default height and width to unet
+        height, width = self._default_height_width(height, width, ref_image)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+
+        # 4. Preprocess reference image
+        ref_image = self.prepare_image(
+            image=ref_image,
+            width=width,
+            height=height,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=prompt_embeds.dtype,
+        )
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Prepare reference latent variables
+        ref_image_latents = self.prepare_ref_latents(
+            ref_image,
+            batch_size * num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance,
+        )
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Modify self attention and group norm
+        MODE = "write"
+        uc_mask = (
+            torch.Tensor([1] * batch_size * num_images_per_prompt + [0] * batch_size * num_images_per_prompt)
+            .type_as(ref_image_latents)
+            .bool()
+        )
+
+        def hacked_basic_transformer_inner_forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.Tensor] = None,
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+            timestep: Optional[torch.LongTensor] = None,
+            cross_attention_kwargs: Dict[str, Any] = None,
+            class_labels: Optional[torch.LongTensor] = None,
+        ):
+            if self.use_ada_layer_norm:
+                norm_hidden_states = self.norm1(hidden_states, timestep)
+            elif self.use_ada_layer_norm_zero:
+                norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                    hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+                )
+            else:
+                norm_hidden_states = self.norm1(hidden_states)
+
+            # 1. Self-Attention
+            cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+            if self.only_cross_attention:
+                attn_output = self.attn1(
+                    norm_hidden_states,
+                    encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                    attention_mask=attention_mask,
+                    **cross_attention_kwargs,
+                )
+            else:
+                if MODE == "write":
+                    self.bank.append(norm_hidden_states.detach().clone())
+                    attn_output = self.attn1(
+                        norm_hidden_states,
+                        encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                        attention_mask=attention_mask,
+                        **cross_attention_kwargs,
+                    )
+                if MODE == "read":
+                    if attention_auto_machine_weight > self.attn_weight:
+                        attn_output_uc = self.attn1(
+                            norm_hidden_states,
+                            encoder_hidden_states=torch.cat([norm_hidden_states] + self.bank, dim=1),
+                            # attention_mask=attention_mask,
+                            **cross_attention_kwargs,
+                        )
+                        attn_output_c = attn_output_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            attn_output_c[uc_mask] = self.attn1(
+                                norm_hidden_states[uc_mask],
+                                encoder_hidden_states=norm_hidden_states[uc_mask],
+                                **cross_attention_kwargs,
+                            )
+                        attn_output = style_fidelity * attn_output_c + (1.0 - style_fidelity) * attn_output_uc
+                        self.bank.clear()
+                    else:
+                        attn_output = self.attn1(
+                            norm_hidden_states,
+                            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                            attention_mask=attention_mask,
+                            **cross_attention_kwargs,
+                        )
+            if self.use_ada_layer_norm_zero:
+                attn_output = gate_msa.unsqueeze(1) * attn_output
+            hidden_states = attn_output + hidden_states
+
+            if self.attn2 is not None:
+                norm_hidden_states = (
+                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+                )
+
+                # 2. Cross-Attention
+                attn_output = self.attn2(
+                    norm_hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=encoder_attention_mask,
+                    **cross_attention_kwargs,
+                )
+                hidden_states = attn_output + hidden_states
+
+            # 3. Feed-forward
+            norm_hidden_states = self.norm3(hidden_states)
+
+            if self.use_ada_layer_norm_zero:
+                norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+            ff_output = self.ff(norm_hidden_states)
+
+            if self.use_ada_layer_norm_zero:
+                ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+            hidden_states = ff_output + hidden_states
+
+            return hidden_states
+
+        def hacked_mid_forward(self, *args, **kwargs):
+            eps = 1e-6
+            x = self.original_forward(*args, **kwargs)
+            if MODE == "write":
+                if gn_auto_machine_weight >= self.gn_weight:
+                    var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+                    self.mean_bank.append(mean)
+                    self.var_bank.append(var)
+            if MODE == "read":
+                if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                    var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+                    std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                    mean_acc = sum(self.mean_bank) / float(len(self.mean_bank))
+                    var_acc = sum(self.var_bank) / float(len(self.var_bank))
+                    std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                    x_uc = (((x - mean) / std) * std_acc) + mean_acc
+                    x_c = x_uc.clone()
+                    if do_classifier_free_guidance and style_fidelity > 0:
+                        x_c[uc_mask] = x[uc_mask]
+                    x = style_fidelity * x_c + (1.0 - style_fidelity) * x_uc
+                self.mean_bank = []
+                self.var_bank = []
+            return x
+
+        def hack_CrossAttnDownBlock2D_forward(
+            self,
+            hidden_states: torch.Tensor,
+            temb: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+        ):
+            eps = 1e-6
+
+            # TODO(Patrick, William) - attention mask is not used
+            output_states = ()
+
+            for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+                output_states = output_states + (hidden_states,)
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.downsamplers is not None:
+                for downsampler in self.downsamplers:
+                    hidden_states = downsampler(hidden_states)
+
+                output_states = output_states + (hidden_states,)
+
+            return hidden_states, output_states
+
+        def hacked_DownBlock2D_forward(
+            self,
+            hidden_states: torch.Tensor,
+            temb: Optional[torch.Tensor] = None,
+            **kwargs: Any,
+        ) -> Tuple[torch.Tensor, ...]:
+            eps = 1e-6
+
+            output_states = ()
+
+            for i, resnet in enumerate(self.resnets):
+                hidden_states = resnet(hidden_states, temb)
+
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+                output_states = output_states + (hidden_states,)
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.downsamplers is not None:
+                for downsampler in self.downsamplers:
+                    hidden_states = downsampler(hidden_states)
+
+                output_states = output_states + (hidden_states,)
+
+            return hidden_states, output_states
+
+        def hacked_CrossAttnUpBlock2D_forward(
+            self,
+            hidden_states: torch.Tensor,
+            res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+            temb: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.Tensor] = None,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            upsample_size: Optional[int] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+        ) -> torch.Tensor:
+            eps = 1e-6
+            # TODO(Patrick, William) - attention mask is not used
+            for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+                # pop res hidden states
+                res_hidden_states = res_hidden_states_tuple[-1]
+                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.upsamplers is not None:
+                for upsampler in self.upsamplers:
+                    hidden_states = upsampler(hidden_states, upsample_size)
+
+            return hidden_states
+
+        def hacked_UpBlock2D_forward(
+            self,
+            hidden_states: torch.Tensor,
+            res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+            temb: Optional[torch.Tensor] = None,
+            upsample_size: Optional[int] = None,
+            **kwargs: Any,
+        ) -> torch.Tensor:
+            eps = 1e-6
+            for i, resnet in enumerate(self.resnets):
+                # pop res hidden states
+                res_hidden_states = res_hidden_states_tuple[-1]
+                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+                hidden_states = resnet(hidden_states, temb)
+
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.upsamplers is not None:
+                for upsampler in self.upsamplers:
+                    hidden_states = upsampler(hidden_states, upsample_size)
+
+            return hidden_states
+
+        if reference_attn:
+            attn_modules = [module for module in torch_dfs(self.unet) if isinstance(module, BasicTransformerBlock)]
+            attn_modules = sorted(attn_modules, key=lambda x: -x.norm1.normalized_shape[0])
+
+            for i, module in enumerate(attn_modules):
+                module._original_inner_forward = module.forward
+                module.forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock)
+                module.bank = []
+                module.attn_weight = float(i) / float(len(attn_modules))
+
+        if reference_adain:
+            gn_modules = [self.unet.mid_block]
+            self.unet.mid_block.gn_weight = 0
+
+            down_blocks = self.unet.down_blocks
+            for w, module in enumerate(down_blocks):
+                module.gn_weight = 1.0 - float(w) / float(len(down_blocks))
+                gn_modules.append(module)
+
+            up_blocks = self.unet.up_blocks
+            for w, module in enumerate(up_blocks):
+                module.gn_weight = float(w) / float(len(up_blocks))
+                gn_modules.append(module)
+
+            for i, module in enumerate(gn_modules):
+                if getattr(module, "original_forward", None) is None:
+                    module.original_forward = module.forward
+                if i == 0:
+                    # mid_block
+                    module.forward = hacked_mid_forward.__get__(module, torch.nn.Module)
+                elif isinstance(module, CrossAttnDownBlock2D):
+                    module.forward = hack_CrossAttnDownBlock2D_forward.__get__(module, CrossAttnDownBlock2D)
+                elif isinstance(module, DownBlock2D):
+                    module.forward = hacked_DownBlock2D_forward.__get__(module, DownBlock2D)
+                elif isinstance(module, CrossAttnUpBlock2D):
+                    module.forward = hacked_CrossAttnUpBlock2D_forward.__get__(module, CrossAttnUpBlock2D)
+                elif isinstance(module, UpBlock2D):
+                    module.forward = hacked_UpBlock2D_forward.__get__(module, UpBlock2D)
+                module.mean_bank = []
+                module.var_bank = []
+                module.gn_weight *= 2
+
+        # 10. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # ref only part
+                noise = randn_tensor(
+                    ref_image_latents.shape, generator=generator, device=device, dtype=ref_image_latents.dtype
+                )
+                ref_xt = self.scheduler.add_noise(
+                    ref_image_latents,
+                    noise,
+                    t.reshape(
+                        1,
+                    ),
+                )
+                ref_xt = torch.cat([ref_xt] * 2) if do_classifier_free_guidance else ref_xt
+                ref_xt = self.scheduler.scale_model_input(ref_xt, t)
+
+                MODE = "write"
+                self.unet(
+                    ref_xt,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )
+
+                # predict the noise residual
+                MODE = "read"
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/stable_diffusion_repaint.py b/diffusers/examples/community/stable_diffusion_repaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..980e9a1559970ae71bb1e00e180151d1f5e19b39
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_repaint.py
@@ -0,0 +1,885 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
+from diffusers.configuration_utils import FrozenDict, deprecate
+from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import (
+    StableDiffusionSafetyChecker,
+)
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    logging,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def prepare_mask_and_masked_image(image, mask):
+    """
+    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
+    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
+    ``image`` and ``1`` for the ``mask``.
+    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
+    Args:
+        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
+        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
+    Raises:
+        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+            (ot the other way around).
+    Returns:
+        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
+            dimensions: ``batch x channels x height x width``.
+    """
+    if isinstance(image, torch.Tensor):
+        if not isinstance(mask, torch.Tensor):
+            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
+
+        # Batch single image
+        if image.ndim == 3:
+            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
+            image = image.unsqueeze(0)
+
+        # Batch and add channel dim for single mask
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0).unsqueeze(0)
+
+        # Batch single mask or add channel dim
+        if mask.ndim == 3:
+            # Single batched mask, no channel dim or single mask not batched but channel dim
+            if mask.shape[0] == 1:
+                mask = mask.unsqueeze(0)
+
+            # Batched masks no channel dim
+            else:
+                mask = mask.unsqueeze(1)
+
+        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
+
+        # Check image is in [-1, 1]
+        if image.min() < -1 or image.max() > 1:
+            raise ValueError("Image should be in [-1, 1] range")
+
+        # Check mask is in [0, 1]
+        if mask.min() < 0 or mask.max() > 1:
+            raise ValueError("Mask should be in [0, 1] range")
+
+        # Binarize mask
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+
+        # Image as float32
+        image = image.to(dtype=torch.float32)
+    elif isinstance(mask, torch.Tensor):
+        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+        # preprocess mask
+        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
+            mask = [mask]
+
+        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+            mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+            mask = mask.astype(np.float32) / 255.0
+        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+        mask = torch.from_numpy(mask)
+
+    # masked_image = image * (mask >= 0.5)
+    masked_image = image
+
+    return mask, masked_image
+
+
+class StableDiffusionRepaintPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
+    r"""
+    Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    In addition the pipeline inherits the following loading methods:
+        - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
+        - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]
+    as well as the following saving methods:
+        - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`]
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration"
+                " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
+                " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
+                " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
+                " Hub, it would be very nice if you could open a Pull request for the"
+                " `scheduler/scheduler_config.json` file"
+            )
+            deprecate(
+                "skip_prk_steps not set",
+                "1.0.0",
+                deprecation_message,
+                standard_warn=False,
+            )
+            new_config = dict(scheduler.config)
+            new_config["skip_prk_steps"] = True
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+        # Check shapes, assume num_channels_latents == 4, num_channels_mask == 1, num_channels_masked == 4
+        if unet.config.in_channels != 4:
+            logger.warning(
+                f"You have loaded a UNet with {unet.config.in_channels} input channels, whereas by default,"
+                f" {self.__class__} assumes that `pipeline.unet` has 4 input channels: 4 for `num_channels_latents`,"
+                ". If you did not intend to modify"
+                " this behavior, please check whether you have loaded the right checkpoint."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_mask_latents(
+        self,
+        mask,
+        masked_image,
+        batch_size,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        do_classifier_free_guidance,
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        # encode the mask image into latents space so we can concatenate it to the latents
+        if isinstance(generator, list):
+            masked_image_latents = [
+                self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i])
+                for i in range(batch_size)
+            ]
+            masked_image_latents = torch.cat(masked_image_latents, dim=0)
+        else:
+            masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
+        masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+        return mask, masked_image_latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        jump_length: Optional[int] = 10,
+        jump_n_sample: Optional[int] = 10,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            jump_length (`int`, *optional*, defaults to 10):
+                The number of steps taken forward in time before going backward in time for a single jump ("j" in
+                RePaint paper). Take a look at Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
+            jump_n_sample (`int`, *optional*, defaults to 10):
+                The number of times we will make forward time jump for a given chosen time sample. Take a look at
+                Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Examples:
+        ```py
+        >>> import PIL
+        >>> import requests
+        >>> import torch
+        >>> from io import BytesIO
+        >>> from diffusers import StableDiffusionPipeline, RePaintScheduler
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+        >>> base_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/"
+        >>> img_url = base_url + "overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = base_url + "overture-creations-5sI6fQgYIuo_mask.png "
+        >>> init_image = download_image(img_url).resize((512, 512))
+        >>> mask_image = download_image(mask_url).resize((512, 512))
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, custom_pipeline="stable_diffusion_repaint",
+        ... )
+        >>> pipe.scheduler = RePaintScheduler.from_config(pipe.scheduler.config)
+        >>> pipe = pipe.to("cuda")
+        >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+        >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+        ```
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        if image is None:
+            raise ValueError("`image` input cannot be undefined.")
+
+        if mask_image is None:
+            raise ValueError("`mask_image` input cannot be undefined.")
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 4. Preprocess mask and image
+        mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample, device)
+        self.scheduler.eta = eta
+
+        timesteps = self.scheduler.timesteps
+        # latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Prepare mask latent variables
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance=False,  # We do not need duplicate mask and image
+        )
+
+        # 8. Check that sizes of mask, masked image and latents match
+        # num_channels_mask = mask.shape[1]
+        # num_channels_masked_image = masked_image_latents.shape[1]
+        if num_channels_latents != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} "
+                f" = Please verify the config of"
+                " `pipeline.unet` or your `mask_image` or `image` input."
+            )
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        t_last = timesteps[0] + 1
+
+        # 10. Denoising loop
+        with self.progress_bar(total=len(timesteps)) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if t >= t_last:
+                    # compute the reverse: x_t-1 -> x_t
+                    latents = self.scheduler.undo_step(latents, t_last, generator)
+                    progress_bar.update()
+                    t_last = t
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred,
+                    t,
+                    latents,
+                    masked_image_latents,
+                    mask,
+                    **extra_step_kwargs,
+                ).prev_sample
+
+                # call the callback, if provided
+                progress_bar.update()
+                if callback is not None and i % callback_steps == 0:
+                    step_idx = i // getattr(self.scheduler, "order", 1)
+                    callback(step_idx, t, latents)
+
+                t_last = t
+
+        # 11. Post-processing
+        image = self.decode_latents(latents)
+
+        # 12. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # 13. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/stable_diffusion_tensorrt_img2img.py b/diffusers/examples/community/stable_diffusion_tensorrt_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..91540d1f4159d7389f91da4c3ed80d45905970df
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_tensorrt_img2img.py
@@ -0,0 +1,1134 @@
+#
+# Copyright 2024 The HuggingFace Inc. team.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+from collections import OrderedDict
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import onnx
+import onnx_graphsurgeon as gs
+import PIL.Image
+import tensorrt as trt
+import torch
+from cuda import cudart
+from huggingface_hub import snapshot_download
+from huggingface_hub.utils import validate_hf_hub_args
+from onnx import shape_inference
+from packaging import version
+from polygraphy import cuda
+from polygraphy.backend.common import bytes_from_path
+from polygraphy.backend.onnx.loader import fold_constants
+from polygraphy.backend.trt import (
+    CreateConfig,
+    Profile,
+    engine_from_bytes,
+    engine_from_network,
+    network_from_onnx_path,
+    save_engine,
+)
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict, deprecate
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion import (
+    StableDiffusionPipelineOutput,
+    StableDiffusionSafetyChecker,
+)
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_latents
+from diffusers.schedulers import DDIMScheduler
+from diffusers.utils import logging
+
+
+"""
+Installation instructions
+python3 -m pip install --upgrade transformers diffusers>=0.16.0
+python3 -m pip install --upgrade tensorrt~=10.2.0
+python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
+python3 -m pip install onnxruntime
+"""
+
+TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Map of numpy dtype -> torch dtype
+numpy_to_torch_dtype_dict = {
+    np.uint8: torch.uint8,
+    np.int8: torch.int8,
+    np.int16: torch.int16,
+    np.int32: torch.int32,
+    np.int64: torch.int64,
+    np.float16: torch.float16,
+    np.float32: torch.float32,
+    np.float64: torch.float64,
+    np.complex64: torch.complex64,
+    np.complex128: torch.complex128,
+}
+if np.version.full_version >= "1.24.0":
+    numpy_to_torch_dtype_dict[np.bool_] = torch.bool
+else:
+    numpy_to_torch_dtype_dict[np.bool] = torch.bool
+
+# Map of torch dtype -> numpy dtype
+torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()}
+
+
+def preprocess_image(image):
+    """
+    image: torch.Tensor
+    """
+    w, h = image.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h))
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image).contiguous()
+    return 2.0 * image - 1.0
+
+
+class Engine:
+    def __init__(self, engine_path):
+        self.engine_path = engine_path
+        self.engine = None
+        self.context = None
+        self.buffers = OrderedDict()
+        self.tensors = OrderedDict()
+
+    def __del__(self):
+        [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)]
+        del self.engine
+        del self.context
+        del self.buffers
+        del self.tensors
+
+    def build(
+        self,
+        onnx_path,
+        fp16,
+        input_profile=None,
+        enable_all_tactics=False,
+        timing_cache=None,
+    ):
+        logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
+        p = Profile()
+        if input_profile:
+            for name, dims in input_profile.items():
+                assert len(dims) == 3
+                p.add(name, min=dims[0], opt=dims[1], max=dims[2])
+
+        extra_build_args = {}
+        if not enable_all_tactics:
+            extra_build_args["tactic_sources"] = []
+
+        engine = engine_from_network(
+            network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
+            config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **extra_build_args),
+            save_timing_cache=timing_cache,
+        )
+        save_engine(engine, path=self.engine_path)
+
+    def load(self):
+        logger.warning(f"Loading TensorRT engine: {self.engine_path}")
+        self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
+
+    def activate(self):
+        self.context = self.engine.create_execution_context()
+
+    def allocate_buffers(self, shape_dict=None, device="cuda"):
+        for binding in range(self.engine.num_io_tensors):
+            name = self.engine.get_tensor_name(binding)
+            if shape_dict and name in shape_dict:
+                shape = shape_dict[name]
+            else:
+                shape = self.engine.get_tensor_shape(name)
+            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
+                self.context.set_input_shape(name, shape)
+            tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device)
+            self.tensors[name] = tensor
+
+    def infer(self, feed_dict, stream):
+        for name, buf in feed_dict.items():
+            self.tensors[name].copy_(buf)
+        for name, tensor in self.tensors.items():
+            self.context.set_tensor_address(name, tensor.data_ptr())
+        noerror = self.context.execute_async_v3(stream)
+        if not noerror:
+            raise ValueError("ERROR: inference failed.")
+
+        return self.tensors
+
+
+class Optimizer:
+    def __init__(self, onnx_graph):
+        self.graph = gs.import_onnx(onnx_graph)
+
+    def cleanup(self, return_onnx=False):
+        self.graph.cleanup().toposort()
+        if return_onnx:
+            return gs.export_onnx(self.graph)
+
+    def select_outputs(self, keep, names=None):
+        self.graph.outputs = [self.graph.outputs[o] for o in keep]
+        if names:
+            for i, name in enumerate(names):
+                self.graph.outputs[i].name = name
+
+    def fold_constants(self, return_onnx=False):
+        onnx_graph = fold_constants(gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=True)
+        self.graph = gs.import_onnx(onnx_graph)
+        if return_onnx:
+            return onnx_graph
+
+    def infer_shapes(self, return_onnx=False):
+        onnx_graph = gs.export_onnx(self.graph)
+        if onnx_graph.ByteSize() > 2147483648:
+            raise TypeError("ERROR: model size exceeds supported 2GB limit")
+        else:
+            onnx_graph = shape_inference.infer_shapes(onnx_graph)
+
+        self.graph = gs.import_onnx(onnx_graph)
+        if return_onnx:
+            return onnx_graph
+
+
+class BaseModel:
+    def __init__(self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77):
+        self.model = model
+        self.name = "SD Model"
+        self.fp16 = fp16
+        self.device = device
+
+        self.min_batch = 1
+        self.max_batch = max_batch_size
+        self.min_image_shape = 256  # min image resolution: 256x256
+        self.max_image_shape = 1024  # max image resolution: 1024x1024
+        self.min_latent_shape = self.min_image_shape // 8
+        self.max_latent_shape = self.max_image_shape // 8
+
+        self.embedding_dim = embedding_dim
+        self.text_maxlen = text_maxlen
+
+    def get_model(self):
+        return self.model
+
+    def get_input_names(self):
+        pass
+
+    def get_output_names(self):
+        pass
+
+    def get_dynamic_axes(self):
+        return None
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        pass
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        return None
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        return None
+
+    def optimize(self, onnx_graph):
+        opt = Optimizer(onnx_graph)
+        opt.cleanup()
+        opt.fold_constants()
+        opt.infer_shapes()
+        onnx_opt_graph = opt.cleanup(return_onnx=True)
+        return onnx_opt_graph
+
+    def check_dims(self, batch_size, image_height, image_width):
+        assert batch_size >= self.min_batch and batch_size <= self.max_batch
+        assert image_height % 8 == 0 or image_width % 8 == 0
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape
+        assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape
+        return (latent_height, latent_width)
+
+    def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
+        min_batch = batch_size if static_batch else self.min_batch
+        max_batch = batch_size if static_batch else self.max_batch
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        min_image_height = image_height if static_shape else self.min_image_shape
+        max_image_height = image_height if static_shape else self.max_image_shape
+        min_image_width = image_width if static_shape else self.min_image_shape
+        max_image_width = image_width if static_shape else self.max_image_shape
+        min_latent_height = latent_height if static_shape else self.min_latent_shape
+        max_latent_height = latent_height if static_shape else self.max_latent_shape
+        min_latent_width = latent_width if static_shape else self.min_latent_shape
+        max_latent_width = latent_width if static_shape else self.max_latent_shape
+        return (
+            min_batch,
+            max_batch,
+            min_image_height,
+            max_image_height,
+            min_image_width,
+            max_image_width,
+            min_latent_height,
+            max_latent_height,
+            min_latent_width,
+            max_latent_width,
+        )
+
+
+def getOnnxPath(model_name, onnx_dir, opt=True):
+    return os.path.join(onnx_dir, model_name + (".opt" if opt else "") + ".onnx")
+
+
+def getEnginePath(model_name, engine_dir):
+    return os.path.join(engine_dir, model_name + ".plan")
+
+
+def build_engines(
+    models: dict,
+    engine_dir,
+    onnx_dir,
+    onnx_opset,
+    opt_image_height,
+    opt_image_width,
+    opt_batch_size=1,
+    force_engine_rebuild=False,
+    static_batch=False,
+    static_shape=True,
+    enable_all_tactics=False,
+    timing_cache=None,
+):
+    built_engines = {}
+    if not os.path.isdir(onnx_dir):
+        os.makedirs(onnx_dir)
+    if not os.path.isdir(engine_dir):
+        os.makedirs(engine_dir)
+
+    # Export models to ONNX
+    for model_name, model_obj in models.items():
+        engine_path = getEnginePath(model_name, engine_dir)
+        if force_engine_rebuild or not os.path.exists(engine_path):
+            logger.warning("Building Engines...")
+            logger.warning("Engine build can take a while to complete")
+            onnx_path = getOnnxPath(model_name, onnx_dir, opt=False)
+            onnx_opt_path = getOnnxPath(model_name, onnx_dir)
+            if force_engine_rebuild or not os.path.exists(onnx_opt_path):
+                if force_engine_rebuild or not os.path.exists(onnx_path):
+                    logger.warning(f"Exporting model: {onnx_path}")
+                    model = model_obj.get_model()
+                    with torch.inference_mode(), torch.autocast("cuda"):
+                        inputs = model_obj.get_sample_input(opt_batch_size, opt_image_height, opt_image_width)
+                        torch.onnx.export(
+                            model,
+                            inputs,
+                            onnx_path,
+                            export_params=True,
+                            opset_version=onnx_opset,
+                            do_constant_folding=True,
+                            input_names=model_obj.get_input_names(),
+                            output_names=model_obj.get_output_names(),
+                            dynamic_axes=model_obj.get_dynamic_axes(),
+                        )
+                    del model
+                    torch.cuda.empty_cache()
+                    gc.collect()
+                else:
+                    logger.warning(f"Found cached model: {onnx_path}")
+
+                # Optimize onnx
+                if force_engine_rebuild or not os.path.exists(onnx_opt_path):
+                    logger.warning(f"Generating optimizing model: {onnx_opt_path}")
+                    onnx_opt_graph = model_obj.optimize(onnx.load(onnx_path))
+                    onnx.save(onnx_opt_graph, onnx_opt_path)
+                else:
+                    logger.warning(f"Found cached optimized model: {onnx_opt_path} ")
+
+    # Build TensorRT engines
+    for model_name, model_obj in models.items():
+        engine_path = getEnginePath(model_name, engine_dir)
+        engine = Engine(engine_path)
+        onnx_path = getOnnxPath(model_name, onnx_dir, opt=False)
+        onnx_opt_path = getOnnxPath(model_name, onnx_dir)
+
+        if force_engine_rebuild or not os.path.exists(engine.engine_path):
+            engine.build(
+                onnx_opt_path,
+                fp16=True,
+                input_profile=model_obj.get_input_profile(
+                    opt_batch_size,
+                    opt_image_height,
+                    opt_image_width,
+                    static_batch=static_batch,
+                    static_shape=static_shape,
+                ),
+                timing_cache=timing_cache,
+            )
+        built_engines[model_name] = engine
+
+    # Load and activate TensorRT engines
+    for model_name, model_obj in models.items():
+        engine = built_engines[model_name]
+        engine.load()
+        engine.activate()
+
+    return built_engines
+
+
+def runEngine(engine, feed_dict, stream):
+    return engine.infer(feed_dict, stream)
+
+
+class CLIP(BaseModel):
+    def __init__(self, model, device, max_batch_size, embedding_dim):
+        super(CLIP, self).__init__(
+            model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+        )
+        self.name = "CLIP"
+
+    def get_input_names(self):
+        return ["input_ids"]
+
+    def get_output_names(self):
+        return ["text_embeddings", "pooler_output"]
+
+    def get_dynamic_axes(self):
+        return {"input_ids": {0: "B"}, "text_embeddings": {0: "B"}}
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        self.check_dims(batch_size, image_height, image_width)
+        min_batch, max_batch, _, _, _, _, _, _, _, _ = self.get_minmax_dims(
+            batch_size, image_height, image_width, static_batch, static_shape
+        )
+        return {
+            "input_ids": [(min_batch, self.text_maxlen), (batch_size, self.text_maxlen), (max_batch, self.text_maxlen)]
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return {
+            "input_ids": (batch_size, self.text_maxlen),
+            "text_embeddings": (batch_size, self.text_maxlen, self.embedding_dim),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return torch.zeros(batch_size, self.text_maxlen, dtype=torch.int32, device=self.device)
+
+    def optimize(self, onnx_graph):
+        opt = Optimizer(onnx_graph)
+        opt.select_outputs([0])  # delete graph output#1
+        opt.cleanup()
+        opt.fold_constants()
+        opt.infer_shapes()
+        opt.select_outputs([0], names=["text_embeddings"])  # rename network output
+        opt_onnx_graph = opt.cleanup(return_onnx=True)
+        return opt_onnx_graph
+
+
+def make_CLIP(model, device, max_batch_size, embedding_dim, inpaint=False):
+    return CLIP(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class UNet(BaseModel):
+    def __init__(
+        self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77, unet_dim=4
+    ):
+        super(UNet, self).__init__(
+            model=model,
+            fp16=fp16,
+            device=device,
+            max_batch_size=max_batch_size,
+            embedding_dim=embedding_dim,
+            text_maxlen=text_maxlen,
+        )
+        self.unet_dim = unet_dim
+        self.name = "UNet"
+
+    def get_input_names(self):
+        return ["sample", "timestep", "encoder_hidden_states"]
+
+    def get_output_names(self):
+        return ["latent"]
+
+    def get_dynamic_axes(self):
+        return {
+            "sample": {0: "2B", 2: "H", 3: "W"},
+            "encoder_hidden_states": {0: "2B"},
+            "latent": {0: "2B", 2: "H", 3: "W"},
+        }
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        (
+            min_batch,
+            max_batch,
+            _,
+            _,
+            _,
+            _,
+            min_latent_height,
+            max_latent_height,
+            min_latent_width,
+            max_latent_width,
+        ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+        return {
+            "sample": [
+                (2 * min_batch, self.unet_dim, min_latent_height, min_latent_width),
+                (2 * batch_size, self.unet_dim, latent_height, latent_width),
+                (2 * max_batch, self.unet_dim, max_latent_height, max_latent_width),
+            ],
+            "encoder_hidden_states": [
+                (2 * min_batch, self.text_maxlen, self.embedding_dim),
+                (2 * batch_size, self.text_maxlen, self.embedding_dim),
+                (2 * max_batch, self.text_maxlen, self.embedding_dim),
+            ],
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            "sample": (2 * batch_size, self.unet_dim, latent_height, latent_width),
+            "encoder_hidden_states": (2 * batch_size, self.text_maxlen, self.embedding_dim),
+            "latent": (2 * batch_size, 4, latent_height, latent_width),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        dtype = torch.float16 if self.fp16 else torch.float32
+        return (
+            torch.randn(
+                2 * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device
+            ),
+            torch.tensor([1.0], dtype=torch.float32, device=self.device),
+            torch.randn(2 * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device),
+        )
+
+
+def make_UNet(model, device, max_batch_size, embedding_dim, inpaint=False):
+    return UNet(
+        model,
+        fp16=True,
+        device=device,
+        max_batch_size=max_batch_size,
+        embedding_dim=embedding_dim,
+        unet_dim=(9 if inpaint else 4),
+    )
+
+
+class VAE(BaseModel):
+    def __init__(self, model, device, max_batch_size, embedding_dim):
+        super(VAE, self).__init__(
+            model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+        )
+        self.name = "VAE decoder"
+
+    def get_input_names(self):
+        return ["latent"]
+
+    def get_output_names(self):
+        return ["images"]
+
+    def get_dynamic_axes(self):
+        return {"latent": {0: "B", 2: "H", 3: "W"}, "images": {0: "B", 2: "8H", 3: "8W"}}
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        (
+            min_batch,
+            max_batch,
+            _,
+            _,
+            _,
+            _,
+            min_latent_height,
+            max_latent_height,
+            min_latent_width,
+            max_latent_width,
+        ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+        return {
+            "latent": [
+                (min_batch, 4, min_latent_height, min_latent_width),
+                (batch_size, 4, latent_height, latent_width),
+                (max_batch, 4, max_latent_height, max_latent_width),
+            ]
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            "latent": (batch_size, 4, latent_height, latent_width),
+            "images": (batch_size, 3, image_height, image_width),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return torch.randn(batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device)
+
+
+def make_VAE(model, device, max_batch_size, embedding_dim, inpaint=False):
+    return VAE(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class TorchVAEEncoder(torch.nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.vae_encoder = model
+
+    def forward(self, x):
+        return retrieve_latents(self.vae_encoder.encode(x))
+
+
+class VAEEncoder(BaseModel):
+    def __init__(self, model, device, max_batch_size, embedding_dim):
+        super(VAEEncoder, self).__init__(
+            model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+        )
+        self.name = "VAE encoder"
+
+    def get_model(self):
+        vae_encoder = TorchVAEEncoder(self.model)
+        return vae_encoder
+
+    def get_input_names(self):
+        return ["images"]
+
+    def get_output_names(self):
+        return ["latent"]
+
+    def get_dynamic_axes(self):
+        return {"images": {0: "B", 2: "8H", 3: "8W"}, "latent": {0: "B", 2: "H", 3: "W"}}
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        assert batch_size >= self.min_batch and batch_size <= self.max_batch
+        min_batch = batch_size if static_batch else self.min_batch
+        max_batch = batch_size if static_batch else self.max_batch
+        self.check_dims(batch_size, image_height, image_width)
+        (
+            min_batch,
+            max_batch,
+            min_image_height,
+            max_image_height,
+            min_image_width,
+            max_image_width,
+            _,
+            _,
+            _,
+            _,
+        ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+
+        return {
+            "images": [
+                (min_batch, 3, min_image_height, min_image_width),
+                (batch_size, 3, image_height, image_width),
+                (max_batch, 3, max_image_height, max_image_width),
+            ]
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            "images": (batch_size, 3, image_height, image_width),
+            "latent": (batch_size, 4, latent_height, latent_width),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return torch.randn(batch_size, 3, image_height, image_width, dtype=torch.float32, device=self.device)
+
+
+def make_VAEEncoder(model, device, max_batch_size, embedding_dim, inpaint=False):
+    return VAEEncoder(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image-to-image generation using TensorRT accelerated Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DDIMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+        stages=["clip", "unet", "vae", "vae_encoder"],
+        image_height: int = 512,
+        image_width: int = 512,
+        max_batch_size: int = 16,
+        # ONNX export parameters
+        onnx_opset: int = 17,
+        onnx_dir: str = "onnx",
+        # TensorRT engine build parameters
+        engine_dir: str = "engine",
+        force_engine_rebuild: bool = False,
+        timing_cache: str = "timing_cache",
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+
+        self.stages = stages
+        self.image_height, self.image_width = image_height, image_width
+        self.inpaint = False
+        self.onnx_opset = onnx_opset
+        self.onnx_dir = onnx_dir
+        self.engine_dir = engine_dir
+        self.force_engine_rebuild = force_engine_rebuild
+        self.timing_cache = timing_cache
+        self.build_static_batch = False
+        self.build_dynamic_shape = False
+
+        self.max_batch_size = max_batch_size
+        # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation.
+        if self.build_dynamic_shape or self.image_height > 512 or self.image_width > 512:
+            self.max_batch_size = 4
+
+        self.stream = None  # loaded in loadResources()
+        self.models = {}  # loaded in __loadModels()
+        self.engine = {}  # loaded in build_engines()
+
+        self.vae.forward = self.vae.decode
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def __loadModels(self):
+        # Load pipeline models
+        self.embedding_dim = self.text_encoder.config.hidden_size
+        models_args = {
+            "device": self.torch_device,
+            "max_batch_size": self.max_batch_size,
+            "embedding_dim": self.embedding_dim,
+            "inpaint": self.inpaint,
+        }
+        if "clip" in self.stages:
+            self.models["clip"] = make_CLIP(self.text_encoder, **models_args)
+        if "unet" in self.stages:
+            self.models["unet"] = make_UNet(self.unet, **models_args)
+        if "vae" in self.stages:
+            self.models["vae"] = make_VAE(self.vae, **models_args)
+        if "vae_encoder" in self.stages:
+            self.models["vae_encoder"] = make_VAEEncoder(self.vae, **models_args)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(
+        self, image: Union[torch.Tensor, PIL.Image.Image], device: torch.device, dtype: torch.dtype
+    ) -> Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]:
+        r"""
+        Runs the safety checker on the given image.
+        Args:
+            image (Union[torch.Tensor, PIL.Image.Image]): The input image to be checked.
+            device (torch.device): The device to run the safety checker on.
+            dtype (torch.dtype): The data type of the input image.
+        Returns:
+            (image, has_nsfw_concept) Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]: A tuple containing the processed image and
+            a boolean indicating whether the image has a NSFW (Not Safe for Work) concept.
+        """
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    @classmethod
+    @validate_hf_hub_args
+    def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+        cache_dir = kwargs.pop("cache_dir", None)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+
+        cls.cached_folder = (
+            pretrained_model_name_or_path
+            if os.path.isdir(pretrained_model_name_or_path)
+            else snapshot_download(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+            )
+        )
+
+    def to(self, torch_device: Optional[Union[str, torch.device]] = None, silence_dtype_warnings: bool = False):
+        super().to(torch_device, silence_dtype_warnings=silence_dtype_warnings)
+
+        self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir)
+        self.engine_dir = os.path.join(self.cached_folder, self.engine_dir)
+        self.timing_cache = os.path.join(self.cached_folder, self.timing_cache)
+
+        # set device
+        self.torch_device = self._execution_device
+        logger.warning(f"Running inference on device: {self.torch_device}")
+
+        # load models
+        self.__loadModels()
+
+        # build engines
+        self.engine = build_engines(
+            self.models,
+            self.engine_dir,
+            self.onnx_dir,
+            self.onnx_opset,
+            opt_image_height=self.image_height,
+            opt_image_width=self.image_width,
+            force_engine_rebuild=self.force_engine_rebuild,
+            static_batch=self.build_static_batch,
+            static_shape=not self.build_dynamic_shape,
+            timing_cache=self.timing_cache,
+        )
+
+        return self
+
+    def __initialize_timesteps(self, timesteps, strength):
+        self.scheduler.set_timesteps(timesteps)
+        offset = self.scheduler.steps_offset if hasattr(self.scheduler, "steps_offset") else 0
+        init_timestep = int(timesteps * strength) + offset
+        init_timestep = min(init_timestep, timesteps)
+        t_start = max(timesteps - init_timestep + offset, 0)
+        timesteps = self.scheduler.timesteps[t_start:].to(self.torch_device)
+        return timesteps, t_start
+
+    def __preprocess_images(self, batch_size, images=()):
+        init_images = []
+        for image in images:
+            image = image.to(self.torch_device).float()
+            image = image.repeat(batch_size, 1, 1, 1)
+            init_images.append(image)
+        return tuple(init_images)
+
+    def __encode_image(self, init_image):
+        init_latents = runEngine(self.engine["vae_encoder"], {"images": init_image}, self.stream)["latent"]
+        init_latents = 0.18215 * init_latents
+        return init_latents
+
+    def __encode_prompt(self, prompt, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+        """
+        # Tokenize prompt
+        text_input_ids = (
+            self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            .input_ids.type(torch.int32)
+            .to(self.torch_device)
+        )
+
+        # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt
+        text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids}, self.stream)[
+            "text_embeddings"
+        ].clone()
+
+        # Tokenize negative prompt
+        uncond_input_ids = (
+            self.tokenizer(
+                negative_prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            .input_ids.type(torch.int32)
+            .to(self.torch_device)
+        )
+        uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids}, self.stream)[
+            "text_embeddings"
+        ]
+
+        # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings]).to(dtype=torch.float16)
+
+        return text_embeddings
+
+    def __denoise_latent(
+        self, latents, text_embeddings, timesteps=None, step_offset=0, mask=None, masked_image_latents=None
+    ):
+        if not isinstance(timesteps, torch.Tensor):
+            timesteps = self.scheduler.timesteps
+        for step_index, timestep in enumerate(timesteps):
+            # Expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2)
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep)
+            if isinstance(mask, torch.Tensor):
+                latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+            # Predict the noise residual
+            timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep
+
+            noise_pred = runEngine(
+                self.engine["unet"],
+                {"sample": latent_model_input, "timestep": timestep_float, "encoder_hidden_states": text_embeddings},
+                self.stream,
+            )["latent"]
+
+            # Perform guidance
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample
+
+        latents = 1.0 / 0.18215 * latents
+        return latents
+
+    def __decode_latent(self, latents):
+        images = runEngine(self.engine["vae"], {"latent": latents}, self.stream)["images"]
+        images = (images / 2 + 0.5).clamp(0, 1)
+        return images.cpu().permute(0, 2, 3, 1).float().numpy()
+
+    def __loadResources(self, image_height, image_width, batch_size):
+        self.stream = cudart.cudaStreamCreate()[1]
+
+        # Allocate buffers for TensorRT engine bindings
+        for model_name, obj in self.models.items():
+            self.engine[model_name].allocate_buffers(
+                shape_dict=obj.get_shape_dict(batch_size, image_height, image_width), device=self.torch_device
+            )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+
+        """
+        self.generator = generator
+        self.denoising_steps = num_inference_steps
+        self._guidance_scale = guidance_scale
+
+        # Pre-compute latent input scales and linear multistep coefficients
+        self.scheduler.set_timesteps(self.denoising_steps, device=self.torch_device)
+
+        # Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+            prompt = [prompt]
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"Expected prompt to be of type list or str but got {type(prompt)}")
+
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+
+        if negative_prompt is not None and isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt]
+
+        assert len(prompt) == len(negative_prompt)
+
+        if batch_size > self.max_batch_size:
+            raise ValueError(
+                f"Batch size {len(prompt)} is larger than allowed {self.max_batch_size}. If dynamic shape is used, then maximum batch size is 4"
+            )
+
+        # load resources
+        self.__loadResources(self.image_height, self.image_width, batch_size)
+
+        with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER):
+            # Initialize timesteps
+            timesteps, t_start = self.__initialize_timesteps(self.denoising_steps, strength)
+            latent_timestep = timesteps[:1].repeat(batch_size)
+
+            # Pre-process input image
+            if isinstance(image, PIL.Image.Image):
+                image = preprocess_image(image)
+            init_image = self.__preprocess_images(batch_size, (image,))[0]
+
+            # VAE encode init image
+            init_latents = self.__encode_image(init_image)
+
+            # Add noise to latents using timesteps
+            noise = torch.randn(
+                init_latents.shape, generator=self.generator, device=self.torch_device, dtype=torch.float32
+            )
+            latents = self.scheduler.add_noise(init_latents, noise, latent_timestep)
+
+            # CLIP text encoder
+            text_embeddings = self.__encode_prompt(prompt, negative_prompt)
+
+            # UNet denoiser
+            latents = self.__denoise_latent(latents, text_embeddings, timesteps=timesteps, step_offset=t_start)
+
+            # VAE decode latent
+            images = self.__decode_latent(latents)
+
+        images, has_nsfw_concept = self.run_safety_checker(images, self.torch_device, text_embeddings.dtype)
+        images = self.numpy_to_pil(images)
+        return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/stable_diffusion_tensorrt_inpaint.py b/diffusers/examples/community/stable_diffusion_tensorrt_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6f6711a53e7c416100deadaa8d4f6a3169e2d39
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_tensorrt_inpaint.py
@@ -0,0 +1,1268 @@
+#
+# Copyright 2024 The HuggingFace Inc. team.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+from collections import OrderedDict
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import onnx
+import onnx_graphsurgeon as gs
+import PIL.Image
+import tensorrt as trt
+import torch
+from cuda import cudart
+from huggingface_hub import snapshot_download
+from huggingface_hub.utils import validate_hf_hub_args
+from onnx import shape_inference
+from packaging import version
+from polygraphy import cuda
+from polygraphy.backend.common import bytes_from_path
+from polygraphy.backend.onnx.loader import fold_constants
+from polygraphy.backend.trt import (
+    CreateConfig,
+    Profile,
+    engine_from_bytes,
+    engine_from_network,
+    network_from_onnx_path,
+    save_engine,
+)
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict, deprecate
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion import (
+    StableDiffusionPipelineOutput,
+    StableDiffusionSafetyChecker,
+)
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import (
+    prepare_mask_and_masked_image,
+    retrieve_latents,
+)
+from diffusers.schedulers import DDIMScheduler
+from diffusers.utils import logging
+from diffusers.utils.torch_utils import randn_tensor
+
+
+"""
+Installation instructions
+python3 -m pip install --upgrade transformers diffusers>=0.16.0
+python3 -m pip install --upgrade tensorrt~=10.2.0
+python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
+python3 -m pip install onnxruntime
+"""
+
+TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Map of numpy dtype -> torch dtype
+numpy_to_torch_dtype_dict = {
+    np.uint8: torch.uint8,
+    np.int8: torch.int8,
+    np.int16: torch.int16,
+    np.int32: torch.int32,
+    np.int64: torch.int64,
+    np.float16: torch.float16,
+    np.float32: torch.float32,
+    np.float64: torch.float64,
+    np.complex64: torch.complex64,
+    np.complex128: torch.complex128,
+}
+if np.version.full_version >= "1.24.0":
+    numpy_to_torch_dtype_dict[np.bool_] = torch.bool
+else:
+    numpy_to_torch_dtype_dict[np.bool] = torch.bool
+
+# Map of torch dtype -> numpy dtype
+torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()}
+
+
+def preprocess_image(image):
+    """
+    image: torch.Tensor
+    """
+    w, h = image.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h))
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image).contiguous()
+    return 2.0 * image - 1.0
+
+
+class Engine:
+    def __init__(self, engine_path):
+        self.engine_path = engine_path
+        self.engine = None
+        self.context = None
+        self.buffers = OrderedDict()
+        self.tensors = OrderedDict()
+
+    def __del__(self):
+        [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)]
+        del self.engine
+        del self.context
+        del self.buffers
+        del self.tensors
+
+    def build(
+        self,
+        onnx_path,
+        fp16,
+        input_profile=None,
+        enable_all_tactics=False,
+        timing_cache=None,
+    ):
+        logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
+        p = Profile()
+        if input_profile:
+            for name, dims in input_profile.items():
+                assert len(dims) == 3
+                p.add(name, min=dims[0], opt=dims[1], max=dims[2])
+
+        extra_build_args = {}
+        if not enable_all_tactics:
+            extra_build_args["tactic_sources"] = []
+
+        engine = engine_from_network(
+            network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
+            config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **extra_build_args),
+            save_timing_cache=timing_cache,
+        )
+        save_engine(engine, path=self.engine_path)
+
+    def load(self):
+        logger.warning(f"Loading TensorRT engine: {self.engine_path}")
+        self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
+
+    def activate(self):
+        self.context = self.engine.create_execution_context()
+
+    def allocate_buffers(self, shape_dict=None, device="cuda"):
+        for binding in range(self.engine.num_io_tensors):
+            name = self.engine.get_tensor_name(binding)
+            if shape_dict and name in shape_dict:
+                shape = shape_dict[name]
+            else:
+                shape = self.engine.get_tensor_shape(name)
+            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
+                self.context.set_input_shape(name, shape)
+            tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device)
+            self.tensors[name] = tensor
+
+    def infer(self, feed_dict, stream):
+        for name, buf in feed_dict.items():
+            self.tensors[name].copy_(buf)
+        for name, tensor in self.tensors.items():
+            self.context.set_tensor_address(name, tensor.data_ptr())
+        noerror = self.context.execute_async_v3(stream)
+        if not noerror:
+            raise ValueError("ERROR: inference failed.")
+
+        return self.tensors
+
+
+class Optimizer:
+    def __init__(self, onnx_graph):
+        self.graph = gs.import_onnx(onnx_graph)
+
+    def cleanup(self, return_onnx=False):
+        self.graph.cleanup().toposort()
+        if return_onnx:
+            return gs.export_onnx(self.graph)
+
+    def select_outputs(self, keep, names=None):
+        self.graph.outputs = [self.graph.outputs[o] for o in keep]
+        if names:
+            for i, name in enumerate(names):
+                self.graph.outputs[i].name = name
+
+    def fold_constants(self, return_onnx=False):
+        onnx_graph = fold_constants(gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=True)
+        self.graph = gs.import_onnx(onnx_graph)
+        if return_onnx:
+            return onnx_graph
+
+    def infer_shapes(self, return_onnx=False):
+        onnx_graph = gs.export_onnx(self.graph)
+        if onnx_graph.ByteSize() > 2147483648:
+            raise TypeError("ERROR: model size exceeds supported 2GB limit")
+        else:
+            onnx_graph = shape_inference.infer_shapes(onnx_graph)
+
+        self.graph = gs.import_onnx(onnx_graph)
+        if return_onnx:
+            return onnx_graph
+
+
+class BaseModel:
+    def __init__(self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77):
+        self.model = model
+        self.name = "SD Model"
+        self.fp16 = fp16
+        self.device = device
+
+        self.min_batch = 1
+        self.max_batch = max_batch_size
+        self.min_image_shape = 256  # min image resolution: 256x256
+        self.max_image_shape = 1024  # max image resolution: 1024x1024
+        self.min_latent_shape = self.min_image_shape // 8
+        self.max_latent_shape = self.max_image_shape // 8
+
+        self.embedding_dim = embedding_dim
+        self.text_maxlen = text_maxlen
+
+    def get_model(self):
+        return self.model
+
+    def get_input_names(self):
+        pass
+
+    def get_output_names(self):
+        pass
+
+    def get_dynamic_axes(self):
+        return None
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        pass
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        return None
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        return None
+
+    def optimize(self, onnx_graph):
+        opt = Optimizer(onnx_graph)
+        opt.cleanup()
+        opt.fold_constants()
+        opt.infer_shapes()
+        onnx_opt_graph = opt.cleanup(return_onnx=True)
+        return onnx_opt_graph
+
+    def check_dims(self, batch_size, image_height, image_width):
+        assert batch_size >= self.min_batch and batch_size <= self.max_batch
+        assert image_height % 8 == 0 or image_width % 8 == 0
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape
+        assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape
+        return (latent_height, latent_width)
+
+    def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
+        min_batch = batch_size if static_batch else self.min_batch
+        max_batch = batch_size if static_batch else self.max_batch
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        min_image_height = image_height if static_shape else self.min_image_shape
+        max_image_height = image_height if static_shape else self.max_image_shape
+        min_image_width = image_width if static_shape else self.min_image_shape
+        max_image_width = image_width if static_shape else self.max_image_shape
+        min_latent_height = latent_height if static_shape else self.min_latent_shape
+        max_latent_height = latent_height if static_shape else self.max_latent_shape
+        min_latent_width = latent_width if static_shape else self.min_latent_shape
+        max_latent_width = latent_width if static_shape else self.max_latent_shape
+        return (
+            min_batch,
+            max_batch,
+            min_image_height,
+            max_image_height,
+            min_image_width,
+            max_image_width,
+            min_latent_height,
+            max_latent_height,
+            min_latent_width,
+            max_latent_width,
+        )
+
+
+def getOnnxPath(model_name, onnx_dir, opt=True):
+    return os.path.join(onnx_dir, model_name + (".opt" if opt else "") + ".onnx")
+
+
+def getEnginePath(model_name, engine_dir):
+    return os.path.join(engine_dir, model_name + ".plan")
+
+
+def build_engines(
+    models: dict,
+    engine_dir,
+    onnx_dir,
+    onnx_opset,
+    opt_image_height,
+    opt_image_width,
+    opt_batch_size=1,
+    force_engine_rebuild=False,
+    static_batch=False,
+    static_shape=True,
+    enable_all_tactics=False,
+    timing_cache=None,
+):
+    built_engines = {}
+    if not os.path.isdir(onnx_dir):
+        os.makedirs(onnx_dir)
+    if not os.path.isdir(engine_dir):
+        os.makedirs(engine_dir)
+
+    # Export models to ONNX
+    for model_name, model_obj in models.items():
+        engine_path = getEnginePath(model_name, engine_dir)
+        if force_engine_rebuild or not os.path.exists(engine_path):
+            logger.warning("Building Engines...")
+            logger.warning("Engine build can take a while to complete")
+            onnx_path = getOnnxPath(model_name, onnx_dir, opt=False)
+            onnx_opt_path = getOnnxPath(model_name, onnx_dir)
+            if force_engine_rebuild or not os.path.exists(onnx_opt_path):
+                if force_engine_rebuild or not os.path.exists(onnx_path):
+                    logger.warning(f"Exporting model: {onnx_path}")
+                    model = model_obj.get_model()
+                    with torch.inference_mode(), torch.autocast("cuda"):
+                        inputs = model_obj.get_sample_input(opt_batch_size, opt_image_height, opt_image_width)
+                        torch.onnx.export(
+                            model,
+                            inputs,
+                            onnx_path,
+                            export_params=True,
+                            opset_version=onnx_opset,
+                            do_constant_folding=True,
+                            input_names=model_obj.get_input_names(),
+                            output_names=model_obj.get_output_names(),
+                            dynamic_axes=model_obj.get_dynamic_axes(),
+                        )
+                    del model
+                    torch.cuda.empty_cache()
+                    gc.collect()
+                else:
+                    logger.warning(f"Found cached model: {onnx_path}")
+
+                # Optimize onnx
+                if force_engine_rebuild or not os.path.exists(onnx_opt_path):
+                    logger.warning(f"Generating optimizing model: {onnx_opt_path}")
+                    onnx_opt_graph = model_obj.optimize(onnx.load(onnx_path))
+                    onnx.save(onnx_opt_graph, onnx_opt_path)
+                else:
+                    logger.warning(f"Found cached optimized model: {onnx_opt_path} ")
+
+    # Build TensorRT engines
+    for model_name, model_obj in models.items():
+        engine_path = getEnginePath(model_name, engine_dir)
+        engine = Engine(engine_path)
+        onnx_path = getOnnxPath(model_name, onnx_dir, opt=False)
+        onnx_opt_path = getOnnxPath(model_name, onnx_dir)
+
+        if force_engine_rebuild or not os.path.exists(engine.engine_path):
+            engine.build(
+                onnx_opt_path,
+                fp16=True,
+                input_profile=model_obj.get_input_profile(
+                    opt_batch_size,
+                    opt_image_height,
+                    opt_image_width,
+                    static_batch=static_batch,
+                    static_shape=static_shape,
+                ),
+                timing_cache=timing_cache,
+            )
+        built_engines[model_name] = engine
+
+    # Load and activate TensorRT engines
+    for model_name, model_obj in models.items():
+        engine = built_engines[model_name]
+        engine.load()
+        engine.activate()
+
+    return built_engines
+
+
+def runEngine(engine, feed_dict, stream):
+    return engine.infer(feed_dict, stream)
+
+
+class CLIP(BaseModel):
+    def __init__(self, model, device, max_batch_size, embedding_dim):
+        super(CLIP, self).__init__(
+            model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+        )
+        self.name = "CLIP"
+
+    def get_input_names(self):
+        return ["input_ids"]
+
+    def get_output_names(self):
+        return ["text_embeddings", "pooler_output"]
+
+    def get_dynamic_axes(self):
+        return {"input_ids": {0: "B"}, "text_embeddings": {0: "B"}}
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        self.check_dims(batch_size, image_height, image_width)
+        min_batch, max_batch, _, _, _, _, _, _, _, _ = self.get_minmax_dims(
+            batch_size, image_height, image_width, static_batch, static_shape
+        )
+        return {
+            "input_ids": [(min_batch, self.text_maxlen), (batch_size, self.text_maxlen), (max_batch, self.text_maxlen)]
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return {
+            "input_ids": (batch_size, self.text_maxlen),
+            "text_embeddings": (batch_size, self.text_maxlen, self.embedding_dim),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return torch.zeros(batch_size, self.text_maxlen, dtype=torch.int32, device=self.device)
+
+    def optimize(self, onnx_graph):
+        opt = Optimizer(onnx_graph)
+        opt.select_outputs([0])  # delete graph output#1
+        opt.cleanup()
+        opt.fold_constants()
+        opt.infer_shapes()
+        opt.select_outputs([0], names=["text_embeddings"])  # rename network output
+        opt_onnx_graph = opt.cleanup(return_onnx=True)
+        return opt_onnx_graph
+
+
+def make_CLIP(model, device, max_batch_size, embedding_dim, inpaint=False):
+    return CLIP(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class UNet(BaseModel):
+    def __init__(
+        self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77, unet_dim=4
+    ):
+        super(UNet, self).__init__(
+            model=model,
+            fp16=fp16,
+            device=device,
+            max_batch_size=max_batch_size,
+            embedding_dim=embedding_dim,
+            text_maxlen=text_maxlen,
+        )
+        self.unet_dim = unet_dim
+        self.name = "UNet"
+
+    def get_input_names(self):
+        return ["sample", "timestep", "encoder_hidden_states"]
+
+    def get_output_names(self):
+        return ["latent"]
+
+    def get_dynamic_axes(self):
+        return {
+            "sample": {0: "2B", 2: "H", 3: "W"},
+            "encoder_hidden_states": {0: "2B"},
+            "latent": {0: "2B", 2: "H", 3: "W"},
+        }
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        (
+            min_batch,
+            max_batch,
+            _,
+            _,
+            _,
+            _,
+            min_latent_height,
+            max_latent_height,
+            min_latent_width,
+            max_latent_width,
+        ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+        return {
+            "sample": [
+                (2 * min_batch, self.unet_dim, min_latent_height, min_latent_width),
+                (2 * batch_size, self.unet_dim, latent_height, latent_width),
+                (2 * max_batch, self.unet_dim, max_latent_height, max_latent_width),
+            ],
+            "encoder_hidden_states": [
+                (2 * min_batch, self.text_maxlen, self.embedding_dim),
+                (2 * batch_size, self.text_maxlen, self.embedding_dim),
+                (2 * max_batch, self.text_maxlen, self.embedding_dim),
+            ],
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            "sample": (2 * batch_size, self.unet_dim, latent_height, latent_width),
+            "encoder_hidden_states": (2 * batch_size, self.text_maxlen, self.embedding_dim),
+            "latent": (2 * batch_size, 4, latent_height, latent_width),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        dtype = torch.float16 if self.fp16 else torch.float32
+        return (
+            torch.randn(
+                2 * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device
+            ),
+            torch.tensor([1.0], dtype=torch.float32, device=self.device),
+            torch.randn(2 * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device),
+        )
+
+
+def make_UNet(model, device, max_batch_size, embedding_dim, inpaint=False, unet_dim=4):
+    return UNet(
+        model,
+        fp16=True,
+        device=device,
+        max_batch_size=max_batch_size,
+        embedding_dim=embedding_dim,
+        unet_dim=unet_dim,
+    )
+
+
+class VAE(BaseModel):
+    def __init__(self, model, device, max_batch_size, embedding_dim):
+        super(VAE, self).__init__(
+            model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+        )
+        self.name = "VAE decoder"
+
+    def get_input_names(self):
+        return ["latent"]
+
+    def get_output_names(self):
+        return ["images"]
+
+    def get_dynamic_axes(self):
+        return {"latent": {0: "B", 2: "H", 3: "W"}, "images": {0: "B", 2: "8H", 3: "8W"}}
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        (
+            min_batch,
+            max_batch,
+            _,
+            _,
+            _,
+            _,
+            min_latent_height,
+            max_latent_height,
+            min_latent_width,
+            max_latent_width,
+        ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+        return {
+            "latent": [
+                (min_batch, 4, min_latent_height, min_latent_width),
+                (batch_size, 4, latent_height, latent_width),
+                (max_batch, 4, max_latent_height, max_latent_width),
+            ]
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            "latent": (batch_size, 4, latent_height, latent_width),
+            "images": (batch_size, 3, image_height, image_width),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return torch.randn(batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device)
+
+
+def make_VAE(model, device, max_batch_size, embedding_dim, inpaint=False):
+    return VAE(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class TorchVAEEncoder(torch.nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.vae_encoder = model
+
+    def forward(self, x):
+        return self.vae_encoder.encode(x).latent_dist.sample()
+
+
+class VAEEncoder(BaseModel):
+    def __init__(self, model, device, max_batch_size, embedding_dim):
+        super(VAEEncoder, self).__init__(
+            model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+        )
+        self.name = "VAE encoder"
+
+    def get_model(self):
+        vae_encoder = TorchVAEEncoder(self.model)
+        return vae_encoder
+
+    def get_input_names(self):
+        return ["images"]
+
+    def get_output_names(self):
+        return ["latent"]
+
+    def get_dynamic_axes(self):
+        return {"images": {0: "B", 2: "8H", 3: "8W"}, "latent": {0: "B", 2: "H", 3: "W"}}
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        assert batch_size >= self.min_batch and batch_size <= self.max_batch
+        min_batch = batch_size if static_batch else self.min_batch
+        max_batch = batch_size if static_batch else self.max_batch
+        self.check_dims(batch_size, image_height, image_width)
+        (
+            min_batch,
+            max_batch,
+            min_image_height,
+            max_image_height,
+            min_image_width,
+            max_image_width,
+            _,
+            _,
+            _,
+            _,
+        ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+
+        return {
+            "images": [
+                (min_batch, 3, min_image_height, min_image_width),
+                (batch_size, 3, image_height, image_width),
+                (max_batch, 3, max_image_height, max_image_width),
+            ]
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            "images": (batch_size, 3, image_height, image_width),
+            "latent": (batch_size, 4, latent_height, latent_width),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return torch.randn(batch_size, 3, image_height, image_width, dtype=torch.float32, device=self.device)
+
+
+def make_VAEEncoder(model, device, max_batch_size, embedding_dim, inpaint=False):
+    return VAEEncoder(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class TensorRTStableDiffusionInpaintPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for inpainting using TensorRT accelerated Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DDIMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+        stages=["clip", "unet", "vae", "vae_encoder"],
+        image_height: int = 512,
+        image_width: int = 512,
+        max_batch_size: int = 16,
+        # ONNX export parameters
+        onnx_opset: int = 17,
+        onnx_dir: str = "onnx",
+        # TensorRT engine build parameters
+        engine_dir: str = "engine",
+        force_engine_rebuild: bool = False,
+        timing_cache: str = "timing_cache",
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+
+        self.stages = stages
+        self.image_height, self.image_width = image_height, image_width
+        self.inpaint = True
+        self.onnx_opset = onnx_opset
+        self.onnx_dir = onnx_dir
+        self.engine_dir = engine_dir
+        self.force_engine_rebuild = force_engine_rebuild
+        self.timing_cache = timing_cache
+        self.build_static_batch = False
+        self.build_dynamic_shape = False
+
+        self.max_batch_size = max_batch_size
+        # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation.
+        if self.build_dynamic_shape or self.image_height > 512 or self.image_width > 512:
+            self.max_batch_size = 4
+
+        self.stream = None  # loaded in loadResources()
+        self.models = {}  # loaded in __loadModels()
+        self.engine = {}  # loaded in build_engines()
+
+        self.vae.forward = self.vae.decode
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def __loadModels(self):
+        # Load pipeline models
+        self.embedding_dim = self.text_encoder.config.hidden_size
+        models_args = {
+            "device": self.torch_device,
+            "max_batch_size": self.max_batch_size,
+            "embedding_dim": self.embedding_dim,
+            "inpaint": self.inpaint,
+        }
+        if "clip" in self.stages:
+            self.models["clip"] = make_CLIP(self.text_encoder, **models_args)
+        if "unet" in self.stages:
+            self.models["unet"] = make_UNet(self.unet, **models_args, unet_dim=self.unet.config.in_channels)
+        if "vae" in self.stages:
+            self.models["vae"] = make_VAE(self.vae, **models_args)
+        if "vae_encoder" in self.stages:
+            self.models["vae_encoder"] = make_VAEEncoder(self.vae, **models_args)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 4:
+                image_latents = image
+            else:
+                image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+            # if pure noise then scale the initial latents by the  Scheduler's init sigma
+            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+        else:
+            noise = latents.to(device)
+            latents = noise * self.scheduler.init_noise_sigma
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(
+        self, image: Union[torch.Tensor, PIL.Image.Image], device: torch.device, dtype: torch.dtype
+    ) -> Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]:
+        r"""
+        Runs the safety checker on the given image.
+        Args:
+            image (Union[torch.Tensor, PIL.Image.Image]): The input image to be checked.
+            device (torch.device): The device to run the safety checker on.
+            dtype (torch.dtype): The data type of the input image.
+        Returns:
+            (image, has_nsfw_concept) Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]: A tuple containing the processed image and
+            a boolean indicating whether the image has a NSFW (Not Safe for Work) concept.
+        """
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    @classmethod
+    @validate_hf_hub_args
+    def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+        cache_dir = kwargs.pop("cache_dir", None)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+
+        cls.cached_folder = (
+            pretrained_model_name_or_path
+            if os.path.isdir(pretrained_model_name_or_path)
+            else snapshot_download(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+            )
+        )
+
+    def to(self, torch_device: Optional[Union[str, torch.device]] = None, silence_dtype_warnings: bool = False):
+        super().to(torch_device, silence_dtype_warnings=silence_dtype_warnings)
+
+        self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir)
+        self.engine_dir = os.path.join(self.cached_folder, self.engine_dir)
+        self.timing_cache = os.path.join(self.cached_folder, self.timing_cache)
+
+        # set device
+        self.torch_device = self._execution_device
+        logger.warning(f"Running inference on device: {self.torch_device}")
+
+        # load models
+        self.__loadModels()
+
+        # build engines
+        self.engine = build_engines(
+            self.models,
+            self.engine_dir,
+            self.onnx_dir,
+            self.onnx_opset,
+            opt_image_height=self.image_height,
+            opt_image_width=self.image_width,
+            force_engine_rebuild=self.force_engine_rebuild,
+            static_batch=self.build_static_batch,
+            static_shape=not self.build_dynamic_shape,
+            timing_cache=self.timing_cache,
+        )
+
+        return self
+
+    def __initialize_timesteps(self, num_inference_steps, strength):
+        self.scheduler.set_timesteps(num_inference_steps)
+        offset = self.scheduler.config.steps_offset if hasattr(self.scheduler, "steps_offset") else 0
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].to(self.torch_device)
+        return timesteps, num_inference_steps - t_start
+
+    def __preprocess_images(self, batch_size, images=()):
+        init_images = []
+        for image in images:
+            image = image.to(self.torch_device).float()
+            image = image.repeat(batch_size, 1, 1, 1)
+            init_images.append(image)
+        return tuple(init_images)
+
+    def __encode_image(self, init_image):
+        init_latents = runEngine(self.engine["vae_encoder"], {"images": init_image}, self.stream)["latent"]
+        init_latents = 0.18215 * init_latents
+        return init_latents
+
+    def __encode_prompt(self, prompt, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+        """
+        # Tokenize prompt
+        text_input_ids = (
+            self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            .input_ids.type(torch.int32)
+            .to(self.torch_device)
+        )
+
+        # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt
+        text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids}, self.stream)[
+            "text_embeddings"
+        ].clone()
+
+        # Tokenize negative prompt
+        uncond_input_ids = (
+            self.tokenizer(
+                negative_prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            .input_ids.type(torch.int32)
+            .to(self.torch_device)
+        )
+        uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids}, self.stream)[
+            "text_embeddings"
+        ]
+
+        # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings]).to(dtype=torch.float16)
+
+        return text_embeddings
+
+    def __denoise_latent(
+        self, latents, text_embeddings, timesteps=None, step_offset=0, mask=None, masked_image_latents=None
+    ):
+        if not isinstance(timesteps, torch.Tensor):
+            timesteps = self.scheduler.timesteps
+        for step_index, timestep in enumerate(timesteps):
+            # Expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2)
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep)
+            if isinstance(mask, torch.Tensor):
+                latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+            # Predict the noise residual
+            timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep
+
+            noise_pred = runEngine(
+                self.engine["unet"],
+                {"sample": latent_model_input, "timestep": timestep_float, "encoder_hidden_states": text_embeddings},
+                self.stream,
+            )["latent"]
+
+            # Perform guidance
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample
+
+        latents = 1.0 / 0.18215 * latents
+        return latents
+
+    def __decode_latent(self, latents):
+        images = runEngine(self.engine["vae"], {"latent": latents}, self.stream)["images"]
+        images = (images / 2 + 0.5).clamp(0, 1)
+        return images.cpu().permute(0, 2, 3, 1).float().numpy()
+
+    def __loadResources(self, image_height, image_width, batch_size):
+        self.stream = cudart.cudaStreamCreate()[1]
+
+        # Allocate buffers for TensorRT engine bindings
+        for model_name, obj in self.models.items():
+            self.engine[model_name].allocate_buffers(
+                shape_dict=obj.get_shape_dict(batch_size, image_height, image_width), device=self.torch_device
+            )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
+        strength: float = 1.0,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+
+        """
+        self.generator = generator
+        self.denoising_steps = num_inference_steps
+        self._guidance_scale = guidance_scale
+
+        # Pre-compute latent input scales and linear multistep coefficients
+        self.scheduler.set_timesteps(self.denoising_steps, device=self.torch_device)
+
+        # Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+            prompt = [prompt]
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"Expected prompt to be of type list or str but got {type(prompt)}")
+
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+
+        if negative_prompt is not None and isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt]
+
+        assert len(prompt) == len(negative_prompt)
+
+        if batch_size > self.max_batch_size:
+            raise ValueError(
+                f"Batch size {len(prompt)} is larger than allowed {self.max_batch_size}. If dynamic shape is used, then maximum batch size is 4"
+            )
+
+        # Validate image dimensions
+        mask_width, mask_height = mask_image.size
+        if mask_height != self.image_height or mask_width != self.image_width:
+            raise ValueError(
+                f"Input image height and width {self.image_height} and {self.image_width} are not equal to "
+                f"the respective dimensions of the mask image {mask_height} and {mask_width}"
+            )
+
+        # load resources
+        self.__loadResources(self.image_height, self.image_width, batch_size)
+
+        with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER):
+            # Spatial dimensions of latent tensor
+            latent_height = self.image_height // 8
+            latent_width = self.image_width // 8
+
+            # Pre-process input images
+            mask, masked_image, init_image = self.__preprocess_images(
+                batch_size,
+                prepare_mask_and_masked_image(
+                    image,
+                    mask_image,
+                    self.image_height,
+                    self.image_width,
+                    return_image=True,
+                ),
+            )
+
+            mask = torch.nn.functional.interpolate(mask, size=(latent_height, latent_width))
+            mask = torch.cat([mask] * 2)
+
+            # Initialize timesteps
+            timesteps, t_start = self.__initialize_timesteps(self.denoising_steps, strength)
+
+            # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+            latent_timestep = timesteps[:1].repeat(batch_size)
+            # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+            is_strength_max = strength == 1.0
+
+            # Pre-initialize latents
+            num_channels_latents = self.vae.config.latent_channels
+            latents_outputs = self.prepare_latents(
+                batch_size,
+                num_channels_latents,
+                self.image_height,
+                self.image_width,
+                torch.float32,
+                self.torch_device,
+                generator,
+                image=init_image,
+                timestep=latent_timestep,
+                is_strength_max=is_strength_max,
+            )
+
+            latents = latents_outputs[0]
+
+            # VAE encode masked image
+            masked_latents = self.__encode_image(masked_image)
+            masked_latents = torch.cat([masked_latents] * 2)
+
+            # CLIP text encoder
+            text_embeddings = self.__encode_prompt(prompt, negative_prompt)
+
+            # UNet denoiser
+            latents = self.__denoise_latent(
+                latents,
+                text_embeddings,
+                timesteps=timesteps,
+                step_offset=t_start,
+                mask=mask,
+                masked_image_latents=masked_latents,
+            )
+
+            # VAE decode latent
+            images = self.__decode_latent(latents)
+
+        images, has_nsfw_concept = self.run_safety_checker(images, self.torch_device, text_embeddings.dtype)
+        images = self.numpy_to_pil(images)
+        return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/stable_diffusion_tensorrt_txt2img.py b/diffusers/examples/community/stable_diffusion_tensorrt_txt2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8761053ed1aae7144ce74b916ac1b63ef2c0f0d
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_tensorrt_txt2img.py
@@ -0,0 +1,1051 @@
+#
+# Copyright 2024 The HuggingFace Inc. team.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+from collections import OrderedDict
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import onnx
+import onnx_graphsurgeon as gs
+import PIL.Image
+import tensorrt as trt
+import torch
+from cuda import cudart
+from huggingface_hub import snapshot_download
+from huggingface_hub.utils import validate_hf_hub_args
+from onnx import shape_inference
+from packaging import version
+from polygraphy import cuda
+from polygraphy.backend.common import bytes_from_path
+from polygraphy.backend.onnx.loader import fold_constants
+from polygraphy.backend.trt import (
+    CreateConfig,
+    Profile,
+    engine_from_bytes,
+    engine_from_network,
+    network_from_onnx_path,
+    save_engine,
+)
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict, deprecate
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion import (
+    StableDiffusionPipelineOutput,
+    StableDiffusionSafetyChecker,
+)
+from diffusers.schedulers import DDIMScheduler
+from diffusers.utils import logging
+from diffusers.utils.torch_utils import randn_tensor
+
+
+"""
+Installation instructions
+python3 -m pip install --upgrade transformers diffusers>=0.16.0
+python3 -m pip install --upgrade tensorrt~=10.2.0
+python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
+python3 -m pip install onnxruntime
+"""
+
+TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Map of numpy dtype -> torch dtype
+numpy_to_torch_dtype_dict = {
+    np.uint8: torch.uint8,
+    np.int8: torch.int8,
+    np.int16: torch.int16,
+    np.int32: torch.int32,
+    np.int64: torch.int64,
+    np.float16: torch.float16,
+    np.float32: torch.float32,
+    np.float64: torch.float64,
+    np.complex64: torch.complex64,
+    np.complex128: torch.complex128,
+}
+if np.version.full_version >= "1.24.0":
+    numpy_to_torch_dtype_dict[np.bool_] = torch.bool
+else:
+    numpy_to_torch_dtype_dict[np.bool] = torch.bool
+
+# Map of torch dtype -> numpy dtype
+torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()}
+
+
+class Engine:
+    def __init__(self, engine_path):
+        self.engine_path = engine_path
+        self.engine = None
+        self.context = None
+        self.buffers = OrderedDict()
+        self.tensors = OrderedDict()
+
+    def __del__(self):
+        [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)]
+        del self.engine
+        del self.context
+        del self.buffers
+        del self.tensors
+
+    def build(
+        self,
+        onnx_path,
+        fp16,
+        input_profile=None,
+        enable_all_tactics=False,
+        timing_cache=None,
+    ):
+        logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
+        p = Profile()
+        if input_profile:
+            for name, dims in input_profile.items():
+                assert len(dims) == 3
+                p.add(name, min=dims[0], opt=dims[1], max=dims[2])
+
+        extra_build_args = {}
+        if not enable_all_tactics:
+            extra_build_args["tactic_sources"] = []
+
+        engine = engine_from_network(
+            network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
+            config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **extra_build_args),
+            save_timing_cache=timing_cache,
+        )
+        save_engine(engine, path=self.engine_path)
+
+    def load(self):
+        logger.warning(f"Loading TensorRT engine: {self.engine_path}")
+        self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
+
+    def activate(self):
+        self.context = self.engine.create_execution_context()
+
+    def allocate_buffers(self, shape_dict=None, device="cuda"):
+        for binding in range(self.engine.num_io_tensors):
+            name = self.engine.get_tensor_name(binding)
+            if shape_dict and name in shape_dict:
+                shape = shape_dict[name]
+            else:
+                shape = self.engine.get_tensor_shape(name)
+            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
+                self.context.set_input_shape(name, shape)
+            tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device)
+            self.tensors[name] = tensor
+
+    def infer(self, feed_dict, stream):
+        for name, buf in feed_dict.items():
+            self.tensors[name].copy_(buf)
+        for name, tensor in self.tensors.items():
+            self.context.set_tensor_address(name, tensor.data_ptr())
+        noerror = self.context.execute_async_v3(stream)
+        if not noerror:
+            raise ValueError("ERROR: inference failed.")
+
+        return self.tensors
+
+
+class Optimizer:
+    def __init__(self, onnx_graph):
+        self.graph = gs.import_onnx(onnx_graph)
+
+    def cleanup(self, return_onnx=False):
+        self.graph.cleanup().toposort()
+        if return_onnx:
+            return gs.export_onnx(self.graph)
+
+    def select_outputs(self, keep, names=None):
+        self.graph.outputs = [self.graph.outputs[o] for o in keep]
+        if names:
+            for i, name in enumerate(names):
+                self.graph.outputs[i].name = name
+
+    def fold_constants(self, return_onnx=False):
+        onnx_graph = fold_constants(gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=True)
+        self.graph = gs.import_onnx(onnx_graph)
+        if return_onnx:
+            return onnx_graph
+
+    def infer_shapes(self, return_onnx=False):
+        onnx_graph = gs.export_onnx(self.graph)
+        if onnx_graph.ByteSize() > 2147483648:
+            raise TypeError("ERROR: model size exceeds supported 2GB limit")
+        else:
+            onnx_graph = shape_inference.infer_shapes(onnx_graph)
+
+        self.graph = gs.import_onnx(onnx_graph)
+        if return_onnx:
+            return onnx_graph
+
+
+class BaseModel:
+    def __init__(self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77):
+        self.model = model
+        self.name = "SD Model"
+        self.fp16 = fp16
+        self.device = device
+
+        self.min_batch = 1
+        self.max_batch = max_batch_size
+        self.min_image_shape = 256  # min image resolution: 256x256
+        self.max_image_shape = 1024  # max image resolution: 1024x1024
+        self.min_latent_shape = self.min_image_shape // 8
+        self.max_latent_shape = self.max_image_shape // 8
+
+        self.embedding_dim = embedding_dim
+        self.text_maxlen = text_maxlen
+
+    def get_model(self):
+        return self.model
+
+    def get_input_names(self):
+        pass
+
+    def get_output_names(self):
+        pass
+
+    def get_dynamic_axes(self):
+        return None
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        pass
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        return None
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        return None
+
+    def optimize(self, onnx_graph):
+        opt = Optimizer(onnx_graph)
+        opt.cleanup()
+        opt.fold_constants()
+        opt.infer_shapes()
+        onnx_opt_graph = opt.cleanup(return_onnx=True)
+        return onnx_opt_graph
+
+    def check_dims(self, batch_size, image_height, image_width):
+        assert batch_size >= self.min_batch and batch_size <= self.max_batch
+        assert image_height % 8 == 0 or image_width % 8 == 0
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape
+        assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape
+        return (latent_height, latent_width)
+
+    def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
+        min_batch = batch_size if static_batch else self.min_batch
+        max_batch = batch_size if static_batch else self.max_batch
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        min_image_height = image_height if static_shape else self.min_image_shape
+        max_image_height = image_height if static_shape else self.max_image_shape
+        min_image_width = image_width if static_shape else self.min_image_shape
+        max_image_width = image_width if static_shape else self.max_image_shape
+        min_latent_height = latent_height if static_shape else self.min_latent_shape
+        max_latent_height = latent_height if static_shape else self.max_latent_shape
+        min_latent_width = latent_width if static_shape else self.min_latent_shape
+        max_latent_width = latent_width if static_shape else self.max_latent_shape
+        return (
+            min_batch,
+            max_batch,
+            min_image_height,
+            max_image_height,
+            min_image_width,
+            max_image_width,
+            min_latent_height,
+            max_latent_height,
+            min_latent_width,
+            max_latent_width,
+        )
+
+
+def getOnnxPath(model_name, onnx_dir, opt=True):
+    return os.path.join(onnx_dir, model_name + (".opt" if opt else "") + ".onnx")
+
+
+def getEnginePath(model_name, engine_dir):
+    return os.path.join(engine_dir, model_name + ".plan")
+
+
+def build_engines(
+    models: dict,
+    engine_dir,
+    onnx_dir,
+    onnx_opset,
+    opt_image_height,
+    opt_image_width,
+    opt_batch_size=1,
+    force_engine_rebuild=False,
+    static_batch=False,
+    static_shape=True,
+    enable_all_tactics=False,
+    timing_cache=None,
+):
+    built_engines = {}
+    if not os.path.isdir(onnx_dir):
+        os.makedirs(onnx_dir)
+    if not os.path.isdir(engine_dir):
+        os.makedirs(engine_dir)
+
+    # Export models to ONNX
+    for model_name, model_obj in models.items():
+        engine_path = getEnginePath(model_name, engine_dir)
+        if force_engine_rebuild or not os.path.exists(engine_path):
+            logger.warning("Building Engines...")
+            logger.warning("Engine build can take a while to complete")
+            onnx_path = getOnnxPath(model_name, onnx_dir, opt=False)
+            onnx_opt_path = getOnnxPath(model_name, onnx_dir)
+            if force_engine_rebuild or not os.path.exists(onnx_opt_path):
+                if force_engine_rebuild or not os.path.exists(onnx_path):
+                    logger.warning(f"Exporting model: {onnx_path}")
+                    model = model_obj.get_model()
+                    with torch.inference_mode(), torch.autocast("cuda"):
+                        inputs = model_obj.get_sample_input(opt_batch_size, opt_image_height, opt_image_width)
+                        torch.onnx.export(
+                            model,
+                            inputs,
+                            onnx_path,
+                            export_params=True,
+                            opset_version=onnx_opset,
+                            do_constant_folding=True,
+                            input_names=model_obj.get_input_names(),
+                            output_names=model_obj.get_output_names(),
+                            dynamic_axes=model_obj.get_dynamic_axes(),
+                        )
+                    del model
+                    torch.cuda.empty_cache()
+                    gc.collect()
+                else:
+                    logger.warning(f"Found cached model: {onnx_path}")
+
+                # Optimize onnx
+                if force_engine_rebuild or not os.path.exists(onnx_opt_path):
+                    logger.warning(f"Generating optimizing model: {onnx_opt_path}")
+                    onnx_opt_graph = model_obj.optimize(onnx.load(onnx_path))
+                    onnx.save(onnx_opt_graph, onnx_opt_path)
+                else:
+                    logger.warning(f"Found cached optimized model: {onnx_opt_path} ")
+
+    # Build TensorRT engines
+    for model_name, model_obj in models.items():
+        engine_path = getEnginePath(model_name, engine_dir)
+        engine = Engine(engine_path)
+        onnx_path = getOnnxPath(model_name, onnx_dir, opt=False)
+        onnx_opt_path = getOnnxPath(model_name, onnx_dir)
+
+        if force_engine_rebuild or not os.path.exists(engine.engine_path):
+            engine.build(
+                onnx_opt_path,
+                fp16=True,
+                input_profile=model_obj.get_input_profile(
+                    opt_batch_size,
+                    opt_image_height,
+                    opt_image_width,
+                    static_batch=static_batch,
+                    static_shape=static_shape,
+                ),
+                timing_cache=timing_cache,
+            )
+        built_engines[model_name] = engine
+
+    # Load and activate TensorRT engines
+    for model_name, model_obj in models.items():
+        engine = built_engines[model_name]
+        engine.load()
+        engine.activate()
+
+    return built_engines
+
+
+def runEngine(engine, feed_dict, stream):
+    return engine.infer(feed_dict, stream)
+
+
+class CLIP(BaseModel):
+    def __init__(self, model, device, max_batch_size, embedding_dim):
+        super(CLIP, self).__init__(
+            model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+        )
+        self.name = "CLIP"
+
+    def get_input_names(self):
+        return ["input_ids"]
+
+    def get_output_names(self):
+        return ["text_embeddings", "pooler_output"]
+
+    def get_dynamic_axes(self):
+        return {"input_ids": {0: "B"}, "text_embeddings": {0: "B"}}
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        self.check_dims(batch_size, image_height, image_width)
+        min_batch, max_batch, _, _, _, _, _, _, _, _ = self.get_minmax_dims(
+            batch_size, image_height, image_width, static_batch, static_shape
+        )
+        return {
+            "input_ids": [(min_batch, self.text_maxlen), (batch_size, self.text_maxlen), (max_batch, self.text_maxlen)]
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return {
+            "input_ids": (batch_size, self.text_maxlen),
+            "text_embeddings": (batch_size, self.text_maxlen, self.embedding_dim),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return torch.zeros(batch_size, self.text_maxlen, dtype=torch.int32, device=self.device)
+
+    def optimize(self, onnx_graph):
+        opt = Optimizer(onnx_graph)
+        opt.select_outputs([0])  # delete graph output#1
+        opt.cleanup()
+        opt.fold_constants()
+        opt.infer_shapes()
+        opt.select_outputs([0], names=["text_embeddings"])  # rename network output
+        opt_onnx_graph = opt.cleanup(return_onnx=True)
+        return opt_onnx_graph
+
+
+def make_CLIP(model, device, max_batch_size, embedding_dim, inpaint=False):
+    return CLIP(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class UNet(BaseModel):
+    def __init__(
+        self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77, unet_dim=4
+    ):
+        super(UNet, self).__init__(
+            model=model,
+            fp16=fp16,
+            device=device,
+            max_batch_size=max_batch_size,
+            embedding_dim=embedding_dim,
+            text_maxlen=text_maxlen,
+        )
+        self.unet_dim = unet_dim
+        self.name = "UNet"
+
+    def get_input_names(self):
+        return ["sample", "timestep", "encoder_hidden_states"]
+
+    def get_output_names(self):
+        return ["latent"]
+
+    def get_dynamic_axes(self):
+        return {
+            "sample": {0: "2B", 2: "H", 3: "W"},
+            "encoder_hidden_states": {0: "2B"},
+            "latent": {0: "2B", 2: "H", 3: "W"},
+        }
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        (
+            min_batch,
+            max_batch,
+            _,
+            _,
+            _,
+            _,
+            min_latent_height,
+            max_latent_height,
+            min_latent_width,
+            max_latent_width,
+        ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+        return {
+            "sample": [
+                (2 * min_batch, self.unet_dim, min_latent_height, min_latent_width),
+                (2 * batch_size, self.unet_dim, latent_height, latent_width),
+                (2 * max_batch, self.unet_dim, max_latent_height, max_latent_width),
+            ],
+            "encoder_hidden_states": [
+                (2 * min_batch, self.text_maxlen, self.embedding_dim),
+                (2 * batch_size, self.text_maxlen, self.embedding_dim),
+                (2 * max_batch, self.text_maxlen, self.embedding_dim),
+            ],
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            "sample": (2 * batch_size, self.unet_dim, latent_height, latent_width),
+            "encoder_hidden_states": (2 * batch_size, self.text_maxlen, self.embedding_dim),
+            "latent": (2 * batch_size, 4, latent_height, latent_width),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        dtype = torch.float16 if self.fp16 else torch.float32
+        return (
+            torch.randn(
+                2 * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device
+            ),
+            torch.tensor([1.0], dtype=torch.float32, device=self.device),
+            torch.randn(2 * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device),
+        )
+
+
+def make_UNet(model, device, max_batch_size, embedding_dim, inpaint=False):
+    return UNet(
+        model,
+        fp16=True,
+        device=device,
+        max_batch_size=max_batch_size,
+        embedding_dim=embedding_dim,
+        unet_dim=(9 if inpaint else 4),
+    )
+
+
+class VAE(BaseModel):
+    def __init__(self, model, device, max_batch_size, embedding_dim):
+        super(VAE, self).__init__(
+            model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+        )
+        self.name = "VAE decoder"
+
+    def get_input_names(self):
+        return ["latent"]
+
+    def get_output_names(self):
+        return ["images"]
+
+    def get_dynamic_axes(self):
+        return {"latent": {0: "B", 2: "H", 3: "W"}, "images": {0: "B", 2: "8H", 3: "8W"}}
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        (
+            min_batch,
+            max_batch,
+            _,
+            _,
+            _,
+            _,
+            min_latent_height,
+            max_latent_height,
+            min_latent_width,
+            max_latent_width,
+        ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+        return {
+            "latent": [
+                (min_batch, 4, min_latent_height, min_latent_width),
+                (batch_size, 4, latent_height, latent_width),
+                (max_batch, 4, max_latent_height, max_latent_width),
+            ]
+        }
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            "latent": (batch_size, 4, latent_height, latent_width),
+            "images": (batch_size, 3, image_height, image_width),
+        }
+
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return torch.randn(batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device)
+
+
+def make_VAE(model, device, max_batch_size, embedding_dim, inpaint=False):
+    return VAE(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class TensorRTStableDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using TensorRT accelerated Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DDIMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+        stages=["clip", "unet", "vae"],
+        image_height: int = 768,
+        image_width: int = 768,
+        max_batch_size: int = 16,
+        # ONNX export parameters
+        onnx_opset: int = 18,
+        onnx_dir: str = "onnx",
+        # TensorRT engine build parameters
+        engine_dir: str = "engine",
+        force_engine_rebuild: bool = False,
+        timing_cache: str = "timing_cache",
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+
+        self.stages = stages
+        self.image_height, self.image_width = image_height, image_width
+        self.inpaint = False
+        self.onnx_opset = onnx_opset
+        self.onnx_dir = onnx_dir
+        self.engine_dir = engine_dir
+        self.force_engine_rebuild = force_engine_rebuild
+        self.timing_cache = timing_cache
+        self.build_static_batch = False
+        self.build_dynamic_shape = False
+
+        self.max_batch_size = max_batch_size
+        # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation.
+        if self.build_dynamic_shape or self.image_height > 512 or self.image_width > 512:
+            self.max_batch_size = 4
+
+        self.stream = None  # loaded in loadResources()
+        self.models = {}  # loaded in __loadModels()
+        self.engine = {}  # loaded in build_engines()
+
+        self.vae.forward = self.vae.decode
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def __loadModels(self):
+        # Load pipeline models
+        self.embedding_dim = self.text_encoder.config.hidden_size
+        models_args = {
+            "device": self.torch_device,
+            "max_batch_size": self.max_batch_size,
+            "embedding_dim": self.embedding_dim,
+            "inpaint": self.inpaint,
+        }
+        if "clip" in self.stages:
+            self.models["clip"] = make_CLIP(self.text_encoder, **models_args)
+        if "unet" in self.stages:
+            self.models["unet"] = make_UNet(self.unet, **models_args)
+        if "vae" in self.stages:
+            self.models["vae"] = make_VAE(self.vae, **models_args)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int,
+        height: int,
+        width: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        generator: Union[torch.Generator, List[torch.Generator]],
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        r"""
+        Prepare the latent vectors for diffusion.
+        Args:
+            batch_size (int): The number of samples in the batch.
+            num_channels_latents (int): The number of channels in the latent vectors.
+            height (int): The height of the latent vectors.
+            width (int): The width of the latent vectors.
+            dtype (torch.dtype): The data type of the latent vectors.
+            device (torch.device): The device to place the latent vectors on.
+            generator (Union[torch.Generator, List[torch.Generator]]): The generator(s) to use for random number generation.
+            latents (Optional[torch.Tensor]): The pre-existing latent vectors. If None, new latent vectors will be generated.
+        Returns:
+            torch.Tensor: The prepared latent vectors.
+        """
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(
+        self, image: Union[torch.Tensor, PIL.Image.Image], device: torch.device, dtype: torch.dtype
+    ) -> Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]:
+        r"""
+        Runs the safety checker on the given image.
+        Args:
+            image (Union[torch.Tensor, PIL.Image.Image]): The input image to be checked.
+            device (torch.device): The device to run the safety checker on.
+            dtype (torch.dtype): The data type of the input image.
+        Returns:
+            (image, has_nsfw_concept) Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]: A tuple containing the processed image and
+            a boolean indicating whether the image has a NSFW (Not Safe for Work) concept.
+        """
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    @classmethod
+    @validate_hf_hub_args
+    def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+        cache_dir = kwargs.pop("cache_dir", None)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+
+        cls.cached_folder = (
+            pretrained_model_name_or_path
+            if os.path.isdir(pretrained_model_name_or_path)
+            else snapshot_download(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+            )
+        )
+
+    def to(self, torch_device: Optional[Union[str, torch.device]] = None, silence_dtype_warnings: bool = False):
+        super().to(torch_device, silence_dtype_warnings=silence_dtype_warnings)
+
+        self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir)
+        self.engine_dir = os.path.join(self.cached_folder, self.engine_dir)
+        self.timing_cache = os.path.join(self.cached_folder, self.timing_cache)
+
+        # set device
+        self.torch_device = self._execution_device
+        logger.warning(f"Running inference on device: {self.torch_device}")
+
+        # load models
+        self.__loadModels()
+
+        # build engines
+        self.engine = build_engines(
+            self.models,
+            self.engine_dir,
+            self.onnx_dir,
+            self.onnx_opset,
+            opt_image_height=self.image_height,
+            opt_image_width=self.image_width,
+            force_engine_rebuild=self.force_engine_rebuild,
+            static_batch=self.build_static_batch,
+            static_shape=not self.build_dynamic_shape,
+            timing_cache=self.timing_cache,
+        )
+
+        return self
+
+    def __encode_prompt(self, prompt, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+        """
+        # Tokenize prompt
+        text_input_ids = (
+            self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            .input_ids.type(torch.int32)
+            .to(self.torch_device)
+        )
+
+        # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt
+        text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids}, self.stream)[
+            "text_embeddings"
+        ].clone()
+
+        # Tokenize negative prompt
+        uncond_input_ids = (
+            self.tokenizer(
+                negative_prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            .input_ids.type(torch.int32)
+            .to(self.torch_device)
+        )
+        uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids}, self.stream)[
+            "text_embeddings"
+        ]
+
+        # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings]).to(dtype=torch.float16)
+
+        return text_embeddings
+
+    def __denoise_latent(
+        self, latents, text_embeddings, timesteps=None, step_offset=0, mask=None, masked_image_latents=None
+    ):
+        if not isinstance(timesteps, torch.Tensor):
+            timesteps = self.scheduler.timesteps
+        for step_index, timestep in enumerate(timesteps):
+            # Expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2)
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep)
+            if isinstance(mask, torch.Tensor):
+                latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+            # Predict the noise residual
+            timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep
+
+            noise_pred = runEngine(
+                self.engine["unet"],
+                {"sample": latent_model_input, "timestep": timestep_float, "encoder_hidden_states": text_embeddings},
+                self.stream,
+            )["latent"]
+
+            # Perform guidance
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample
+
+        latents = 1.0 / 0.18215 * latents
+        return latents
+
+    def __decode_latent(self, latents):
+        images = runEngine(self.engine["vae"], {"latent": latents}, self.stream)["images"]
+        images = (images / 2 + 0.5).clamp(0, 1)
+        return images.cpu().permute(0, 2, 3, 1).float().numpy()
+
+    def __loadResources(self, image_height, image_width, batch_size):
+        self.stream = cudart.cudaStreamCreate()[1]
+
+        # Allocate buffers for TensorRT engine bindings
+        for model_name, obj in self.models.items():
+            self.engine[model_name].allocate_buffers(
+                shape_dict=obj.get_shape_dict(batch_size, image_height, image_width), device=self.torch_device
+            )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+
+        """
+        self.generator = generator
+        self.denoising_steps = num_inference_steps
+        self._guidance_scale = guidance_scale
+
+        # Pre-compute latent input scales and linear multistep coefficients
+        self.scheduler.set_timesteps(self.denoising_steps, device=self.torch_device)
+
+        # Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+            prompt = [prompt]
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"Expected prompt to be of type list or str but got {type(prompt)}")
+
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+
+        if negative_prompt is not None and isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt]
+
+        assert len(prompt) == len(negative_prompt)
+
+        if batch_size > self.max_batch_size:
+            raise ValueError(
+                f"Batch size {len(prompt)} is larger than allowed {self.max_batch_size}. If dynamic shape is used, then maximum batch size is 4"
+            )
+
+        # load resources
+        self.__loadResources(self.image_height, self.image_width, batch_size)
+
+        with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER):
+            # CLIP text encoder
+            text_embeddings = self.__encode_prompt(prompt, negative_prompt)
+
+            # Pre-initialize latents
+            num_channels_latents = self.unet.config.in_channels
+            latents = self.prepare_latents(
+                batch_size,
+                num_channels_latents,
+                self.image_height,
+                self.image_width,
+                torch.float32,
+                self.torch_device,
+                generator,
+            )
+
+            # UNet denoiser
+            latents = self.__denoise_latent(latents, text_embeddings)
+
+            # VAE decode latent
+            images = self.__decode_latent(latents)
+
+        images, has_nsfw_concept = self.run_safety_checker(images, self.torch_device, text_embeddings.dtype)
+        images = self.numpy_to_pil(images)
+        return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/community/stable_diffusion_xl_reference.py b/diffusers/examples/community/stable_diffusion_xl_reference.py
new file mode 100644
index 0000000000000000000000000000000000000000..107afc1f8b7aadc77738910af1b92939d4f8f914
--- /dev/null
+++ b/diffusers/examples/community/stable_diffusion_xl_reference.py
@@ -0,0 +1,818 @@
+# Based on stable_diffusion_reference.py
+
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+
+from diffusers import StableDiffusionXLPipeline
+from diffusers.models.attention import BasicTransformerBlock
+from diffusers.models.unets.unet_2d_blocks import (
+    CrossAttnDownBlock2D,
+    CrossAttnUpBlock2D,
+    DownBlock2D,
+    UpBlock2D,
+)
+from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from diffusers.utils import PIL_INTERPOLATION, logging
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+
+        >>> input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
+        >>> pipe = StableDiffusionXLReferencePipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            torch_dtype=torch.float16,
+            use_safetensors=True,
+            variant="fp16").to('cuda:0')
+
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> result_img = pipe(ref_image=input_image,
+                        prompt="1girl",
+                        num_inference_steps=20,
+                        reference_attn=True,
+                        reference_adain=True).images[0]
+
+        >>> result_img.show()
+        ```
+"""
+
+
+def torch_dfs(model: torch.nn.Module):
+    result = [model]
+    for child in model.children():
+        result += torch_dfs(child)
+    return result
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
+    def _default_height_width(self, height, width, image):
+        # NOTE: It is possible that a list of images have different
+        # dimensions for each image, so just checking the first image
+        # is not _exactly_ correct, but it is simple.
+        while isinstance(image, list):
+            image = image[0]
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[2]
+
+            height = (height // 8) * 8  # round down to nearest multiple of 8
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[3]
+
+            width = (width // 8) * 8
+
+        return height, width
+
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        if not isinstance(image, torch.Tensor):
+            if isinstance(image, PIL.Image.Image):
+                image = [image]
+
+            if isinstance(image[0], PIL.Image.Image):
+                images = []
+
+                for image_ in image:
+                    image_ = image_.convert("RGB")
+                    image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
+                    image_ = np.array(image_)
+                    image_ = image_[None, :]
+                    images.append(image_)
+
+                image = images
+
+                image = np.concatenate(image, axis=0)
+                image = np.array(image).astype(np.float32) / 255.0
+                image = (image - 0.5) / 0.5
+                image = image.transpose(0, 3, 1, 2)
+                image = torch.from_numpy(image)
+
+            elif isinstance(image[0], torch.Tensor):
+                image = torch.stack(image, dim=0)
+
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
+        refimage = refimage.to(device=device)
+        if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
+            self.upcast_vae()
+            refimage = refimage.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+        if refimage.dtype != self.vae.dtype:
+            refimage = refimage.to(dtype=self.vae.dtype)
+        # encode the mask image into latents space so we can concatenate it to the latents
+        if isinstance(generator, list):
+            ref_image_latents = [
+                self.vae.encode(refimage[i : i + 1]).latent_dist.sample(generator=generator[i])
+                for i in range(batch_size)
+            ]
+            ref_image_latents = torch.cat(ref_image_latents, dim=0)
+        else:
+            ref_image_latents = self.vae.encode(refimage).latent_dist.sample(generator=generator)
+        ref_image_latents = self.vae.config.scaling_factor * ref_image_latents
+
+        # duplicate mask and ref_image_latents for each generation per prompt, using mps friendly method
+        if ref_image_latents.shape[0] < batch_size:
+            if not batch_size % ref_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {ref_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            ref_image_latents = ref_image_latents.repeat(batch_size // ref_image_latents.shape[0], 1, 1, 1)
+
+        ref_image_latents = torch.cat([ref_image_latents] * 2) if do_classifier_free_guidance else ref_image_latents
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
+        return ref_image_latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        ref_image: Union[torch.Tensor, PIL.Image.Image] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        attention_auto_machine_weight: float = 1.0,
+        gn_auto_machine_weight: float = 1.0,
+        style_fidelity: float = 0.5,
+        reference_attn: bool = True,
+        reference_adain: bool = True,
+    ):
+        assert reference_attn or reference_adain, "`reference_attn` or `reference_adain` must be True."
+
+        # 0. Default height and width to unet
+        # height, width = self._default_height_width(height, width, ref_image)
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+        # 4. Preprocess reference image
+        ref_image = self.prepare_image(
+            image=ref_image,
+            width=width,
+            height=height,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=prompt_embeds.dtype,
+        )
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+        timesteps = self.scheduler.timesteps
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # 7. Prepare reference latent variables
+        ref_image_latents = self.prepare_ref_latents(
+            ref_image,
+            batch_size * num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance,
+        )
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Modify self attebtion and group norm
+        MODE = "write"
+        uc_mask = (
+            torch.Tensor([1] * batch_size * num_images_per_prompt + [0] * batch_size * num_images_per_prompt)
+            .type_as(ref_image_latents)
+            .bool()
+        )
+
+        def hacked_basic_transformer_inner_forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.Tensor] = None,
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+            timestep: Optional[torch.LongTensor] = None,
+            cross_attention_kwargs: Dict[str, Any] = None,
+            class_labels: Optional[torch.LongTensor] = None,
+        ):
+            if self.use_ada_layer_norm:
+                norm_hidden_states = self.norm1(hidden_states, timestep)
+            elif self.use_ada_layer_norm_zero:
+                norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                    hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+                )
+            else:
+                norm_hidden_states = self.norm1(hidden_states)
+
+            # 1. Self-Attention
+            cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+            if self.only_cross_attention:
+                attn_output = self.attn1(
+                    norm_hidden_states,
+                    encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                    attention_mask=attention_mask,
+                    **cross_attention_kwargs,
+                )
+            else:
+                if MODE == "write":
+                    self.bank.append(norm_hidden_states.detach().clone())
+                    attn_output = self.attn1(
+                        norm_hidden_states,
+                        encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                        attention_mask=attention_mask,
+                        **cross_attention_kwargs,
+                    )
+                if MODE == "read":
+                    if attention_auto_machine_weight > self.attn_weight:
+                        attn_output_uc = self.attn1(
+                            norm_hidden_states,
+                            encoder_hidden_states=torch.cat([norm_hidden_states] + self.bank, dim=1),
+                            # attention_mask=attention_mask,
+                            **cross_attention_kwargs,
+                        )
+                        attn_output_c = attn_output_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            attn_output_c[uc_mask] = self.attn1(
+                                norm_hidden_states[uc_mask],
+                                encoder_hidden_states=norm_hidden_states[uc_mask],
+                                **cross_attention_kwargs,
+                            )
+                        attn_output = style_fidelity * attn_output_c + (1.0 - style_fidelity) * attn_output_uc
+                        self.bank.clear()
+                    else:
+                        attn_output = self.attn1(
+                            norm_hidden_states,
+                            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                            attention_mask=attention_mask,
+                            **cross_attention_kwargs,
+                        )
+            if self.use_ada_layer_norm_zero:
+                attn_output = gate_msa.unsqueeze(1) * attn_output
+            hidden_states = attn_output + hidden_states
+
+            if self.attn2 is not None:
+                norm_hidden_states = (
+                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+                )
+
+                # 2. Cross-Attention
+                attn_output = self.attn2(
+                    norm_hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=encoder_attention_mask,
+                    **cross_attention_kwargs,
+                )
+                hidden_states = attn_output + hidden_states
+
+            # 3. Feed-forward
+            norm_hidden_states = self.norm3(hidden_states)
+
+            if self.use_ada_layer_norm_zero:
+                norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+            ff_output = self.ff(norm_hidden_states)
+
+            if self.use_ada_layer_norm_zero:
+                ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+            hidden_states = ff_output + hidden_states
+
+            return hidden_states
+
+        def hacked_mid_forward(self, *args, **kwargs):
+            eps = 1e-6
+            x = self.original_forward(*args, **kwargs)
+            if MODE == "write":
+                if gn_auto_machine_weight >= self.gn_weight:
+                    var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+                    self.mean_bank.append(mean)
+                    self.var_bank.append(var)
+            if MODE == "read":
+                if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                    var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+                    std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                    mean_acc = sum(self.mean_bank) / float(len(self.mean_bank))
+                    var_acc = sum(self.var_bank) / float(len(self.var_bank))
+                    std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                    x_uc = (((x - mean) / std) * std_acc) + mean_acc
+                    x_c = x_uc.clone()
+                    if do_classifier_free_guidance and style_fidelity > 0:
+                        x_c[uc_mask] = x[uc_mask]
+                    x = style_fidelity * x_c + (1.0 - style_fidelity) * x_uc
+                self.mean_bank = []
+                self.var_bank = []
+            return x
+
+        def hack_CrossAttnDownBlock2D_forward(
+            self,
+            hidden_states: torch.Tensor,
+            temb: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+        ):
+            eps = 1e-6
+
+            # TODO(Patrick, William) - attention mask is not used
+            output_states = ()
+
+            for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+                output_states = output_states + (hidden_states,)
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.downsamplers is not None:
+                for downsampler in self.downsamplers:
+                    hidden_states = downsampler(hidden_states)
+
+                output_states = output_states + (hidden_states,)
+
+            return hidden_states, output_states
+
+        def hacked_DownBlock2D_forward(self, hidden_states, temb=None, *args, **kwargs):
+            eps = 1e-6
+
+            output_states = ()
+
+            for i, resnet in enumerate(self.resnets):
+                hidden_states = resnet(hidden_states, temb)
+
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+                output_states = output_states + (hidden_states,)
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.downsamplers is not None:
+                for downsampler in self.downsamplers:
+                    hidden_states = downsampler(hidden_states)
+
+                output_states = output_states + (hidden_states,)
+
+            return hidden_states, output_states
+
+        def hacked_CrossAttnUpBlock2D_forward(
+            self,
+            hidden_states: torch.Tensor,
+            res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+            temb: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.Tensor] = None,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            upsample_size: Optional[int] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+        ):
+            eps = 1e-6
+            # TODO(Patrick, William) - attention mask is not used
+            for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+                # pop res hidden states
+                res_hidden_states = res_hidden_states_tuple[-1]
+                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.upsamplers is not None:
+                for upsampler in self.upsamplers:
+                    hidden_states = upsampler(hidden_states, upsample_size)
+
+            return hidden_states
+
+        def hacked_UpBlock2D_forward(
+            self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, **kwargs
+        ):
+            eps = 1e-6
+            for i, resnet in enumerate(self.resnets):
+                # pop res hidden states
+                res_hidden_states = res_hidden_states_tuple[-1]
+                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+                hidden_states = resnet(hidden_states, temb)
+
+                if MODE == "write":
+                    if gn_auto_machine_weight >= self.gn_weight:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        self.mean_bank.append([mean])
+                        self.var_bank.append([var])
+                if MODE == "read":
+                    if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+                        var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+                        std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+                        mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+                        var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+                        std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+                        hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+                        hidden_states_c = hidden_states_uc.clone()
+                        if do_classifier_free_guidance and style_fidelity > 0:
+                            hidden_states_c[uc_mask] = hidden_states[uc_mask]
+                        hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+            if MODE == "read":
+                self.mean_bank = []
+                self.var_bank = []
+
+            if self.upsamplers is not None:
+                for upsampler in self.upsamplers:
+                    hidden_states = upsampler(hidden_states, upsample_size)
+
+            return hidden_states
+
+        if reference_attn:
+            attn_modules = [module for module in torch_dfs(self.unet) if isinstance(module, BasicTransformerBlock)]
+            attn_modules = sorted(attn_modules, key=lambda x: -x.norm1.normalized_shape[0])
+
+            for i, module in enumerate(attn_modules):
+                module._original_inner_forward = module.forward
+                module.forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock)
+                module.bank = []
+                module.attn_weight = float(i) / float(len(attn_modules))
+
+        if reference_adain:
+            gn_modules = [self.unet.mid_block]
+            self.unet.mid_block.gn_weight = 0
+
+            down_blocks = self.unet.down_blocks
+            for w, module in enumerate(down_blocks):
+                module.gn_weight = 1.0 - float(w) / float(len(down_blocks))
+                gn_modules.append(module)
+
+            up_blocks = self.unet.up_blocks
+            for w, module in enumerate(up_blocks):
+                module.gn_weight = float(w) / float(len(up_blocks))
+                gn_modules.append(module)
+
+            for i, module in enumerate(gn_modules):
+                if getattr(module, "original_forward", None) is None:
+                    module.original_forward = module.forward
+                if i == 0:
+                    # mid_block
+                    module.forward = hacked_mid_forward.__get__(module, torch.nn.Module)
+                elif isinstance(module, CrossAttnDownBlock2D):
+                    module.forward = hack_CrossAttnDownBlock2D_forward.__get__(module, CrossAttnDownBlock2D)
+                elif isinstance(module, DownBlock2D):
+                    module.forward = hacked_DownBlock2D_forward.__get__(module, DownBlock2D)
+                elif isinstance(module, CrossAttnUpBlock2D):
+                    module.forward = hacked_CrossAttnUpBlock2D_forward.__get__(module, CrossAttnUpBlock2D)
+                elif isinstance(module, UpBlock2D):
+                    module.forward = hacked_UpBlock2D_forward.__get__(module, UpBlock2D)
+                module.mean_bank = []
+                module.var_bank = []
+                module.gn_weight *= 2
+
+        # 10. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        # 11. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 10.1 Apply denoising_end
+        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                # ref only part
+                noise = randn_tensor(
+                    ref_image_latents.shape, generator=generator, device=device, dtype=ref_image_latents.dtype
+                )
+                ref_xt = self.scheduler.add_noise(
+                    ref_image_latents,
+                    noise,
+                    t.reshape(
+                        1,
+                    ),
+                )
+                ref_xt = self.scheduler.scale_model_input(ref_xt, t)
+
+                MODE = "write"
+
+                self.unet(
+                    ref_xt,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )
+
+                # predict the noise residual
+                MODE = "read"
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+            return StableDiffusionXLPipelineOutput(images=image)
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/examples/community/stable_unclip.py b/diffusers/examples/community/stable_unclip.py
new file mode 100644
index 0000000000000000000000000000000000000000..f13c4e0a490bdc114eb2dbc3cb5ed9253912f965
--- /dev/null
+++ b/diffusers/examples/community/stable_unclip.py
@@ -0,0 +1,288 @@
+import types
+from typing import List, Optional, Tuple, Union
+
+import torch
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+from transformers.models.clip.modeling_clip import CLIPTextModelOutput
+
+from diffusers.models import PriorTransformer
+from diffusers.pipelines import DiffusionPipeline, StableDiffusionImageVariationPipeline
+from diffusers.schedulers import UnCLIPScheduler
+from diffusers.utils import logging
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
+    image = image.to(device=device)
+    image_embeddings = image  # take image as image_embeddings
+    image_embeddings = image_embeddings.unsqueeze(1)
+
+    # duplicate image embeddings for each generation per prompt, using mps friendly method
+    bs_embed, seq_len, _ = image_embeddings.shape
+    image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
+    image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+    if do_classifier_free_guidance:
+        uncond_embeddings = torch.zeros_like(image_embeddings)
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        image_embeddings = torch.cat([uncond_embeddings, image_embeddings])
+
+    return image_embeddings
+
+
+class StableUnCLIPPipeline(DiffusionPipeline):
+    def __init__(
+        self,
+        prior: PriorTransformer,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModelWithProjection,
+        prior_scheduler: UnCLIPScheduler,
+        decoder_pipe_kwargs: Optional[dict] = None,
+    ):
+        super().__init__()
+
+        decoder_pipe_kwargs = {"image_encoder": None} if decoder_pipe_kwargs is None else decoder_pipe_kwargs
+
+        decoder_pipe_kwargs["torch_dtype"] = decoder_pipe_kwargs.get("torch_dtype", None) or prior.dtype
+
+        self.decoder_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
+            "lambdalabs/sd-image-variations-diffusers", **decoder_pipe_kwargs
+        )
+
+        # replace `_encode_image` method
+        self.decoder_pipe._encode_image = types.MethodType(_encode_image, self.decoder_pipe)
+
+        self.register_modules(
+            prior=prior,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            prior_scheduler=prior_scheduler,
+        )
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
+        text_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        if text_model_output is None:
+            batch_size = len(prompt) if isinstance(prompt, list) else 1
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            text_mask = text_inputs.attention_mask.bool().to(device)
+
+            if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+                removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
+            text_encoder_output = self.text_encoder(text_input_ids.to(device))
+
+            text_embeddings = text_encoder_output.text_embeds
+            text_encoder_hidden_states = text_encoder_output.last_hidden_state
+
+        else:
+            batch_size = text_model_output[0].shape[0]
+            text_embeddings, text_encoder_hidden_states = text_model_output[0], text_model_output[1]
+            text_mask = text_attention_mask
+
+        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens = [""] * batch_size
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            uncond_embeddings_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
+
+            uncond_embeddings = uncond_embeddings_text_encoder_output.text_embeds
+            uncond_text_encoder_hidden_states = uncond_embeddings_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return text_embeddings, text_encoder_hidden_states, text_mask
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.prior, "_hf_hook"):
+            return self.device
+        for module in self.prior.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def to(self, torch_device: Optional[Union[str, torch.device]] = None):
+        self.decoder_pipe.to(torch_device)
+        super().to(torch_device)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_images_per_prompt: int = 1,
+        prior_num_inference_steps: int = 25,
+        generator: Optional[torch.Generator] = None,
+        prior_latents: Optional[torch.Tensor] = None,
+        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
+        text_attention_mask: Optional[torch.Tensor] = None,
+        prior_guidance_scale: float = 4.0,
+        decoder_guidance_scale: float = 8.0,
+        decoder_num_inference_steps: int = 50,
+        decoder_num_images_per_prompt: Optional[int] = 1,
+        decoder_eta: float = 0.0,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        if prompt is not None:
+            if isinstance(prompt, str):
+                batch_size = 1
+            elif isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        else:
+            batch_size = text_model_output[0].shape[0]
+
+        device = self._execution_device
+
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0
+
+        text_embeddings, text_encoder_hidden_states, text_mask = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, text_model_output, text_attention_mask
+        )
+
+        # prior
+
+        self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device)
+        prior_timesteps_tensor = self.prior_scheduler.timesteps
+
+        embedding_dim = self.prior.config.embedding_dim
+
+        prior_latents = self.prepare_latents(
+            (batch_size, embedding_dim),
+            text_embeddings.dtype,
+            device,
+            generator,
+            prior_latents,
+            self.prior_scheduler,
+        )
+
+        for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents
+
+            predicted_image_embedding = self.prior(
+                latent_model_input,
+                timestep=t,
+                proj_embedding=text_embeddings,
+                encoder_hidden_states=text_encoder_hidden_states,
+                attention_mask=text_mask,
+            ).predicted_image_embedding
+
+            if do_classifier_free_guidance:
+                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * (
+                    predicted_image_embedding_text - predicted_image_embedding_uncond
+                )
+
+            if i + 1 == prior_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = prior_timesteps_tensor[i + 1]
+
+            prior_latents = self.prior_scheduler.step(
+                predicted_image_embedding,
+                timestep=t,
+                sample=prior_latents,
+                generator=generator,
+                prev_timestep=prev_timestep,
+            ).prev_sample
+
+        prior_latents = self.prior.post_process_latents(prior_latents)
+
+        image_embeddings = prior_latents
+
+        output = self.decoder_pipe(
+            image=image_embeddings,
+            height=height,
+            width=width,
+            num_inference_steps=decoder_num_inference_steps,
+            guidance_scale=decoder_guidance_scale,
+            generator=generator,
+            output_type=output_type,
+            return_dict=return_dict,
+            num_images_per_prompt=decoder_num_images_per_prompt,
+            eta=decoder_eta,
+        )
+        return output
diff --git a/diffusers/examples/community/text_inpainting.py b/diffusers/examples/community/text_inpainting.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4378ab96f28972988ce419e7370d582621cd69d
--- /dev/null
+++ b/diffusers/examples/community/text_inpainting.py
@@ -0,0 +1,240 @@
+from typing import Callable, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPSegForImageSegmentation,
+    CLIPSegProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+)
+
+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class TextInpainting(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text based inpainting using Stable Diffusion.
+    Uses CLIPSeg to get a mask from the given text, then calls the Inpainting pipeline with the generated mask
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        segmentation_model ([`CLIPSegForImageSegmentation`]):
+            CLIPSeg Model to generate mask from the given text. Please refer to the [model card]() for details.
+        segmentation_processor ([`CLIPSegProcessor`]):
+            CLIPSeg processor to get image, text features to translate prompt to English, if necessary. Please refer to the
+            [model card](https://huggingface.co/docs/transformers/model_doc/clipseg) for details.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        segmentation_model: CLIPSegForImageSegmentation,
+        segmentation_processor: CLIPSegProcessor,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration"
+                " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
+                " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
+                " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
+                " Hub, it would be very nice if you could open a Pull request for the"
+                " `scheduler/scheduler_config.json` file"
+            )
+            deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["skip_prk_steps"] = True
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            segmentation_model=segmentation_model,
+            segmentation_processor=segmentation_processor,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image],
+        text: str,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            text (`str``):
+                The text to use to generate the mask.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # We use the input text to generate the mask
+        inputs = self.segmentation_processor(
+            text=[text], images=[image], padding="max_length", return_tensors="pt"
+        ).to(self.device)
+        outputs = self.segmentation_model(**inputs)
+        mask = torch.sigmoid(outputs.logits).cpu().detach().unsqueeze(-1).numpy()
+        mask_pil = self.numpy_to_pil(mask)[0].resize(image.size)
+
+        # Run inpainting pipeline with the generated mask
+        inpainting_pipeline = StableDiffusionInpaintPipeline(
+            vae=self.vae,
+            text_encoder=self.text_encoder,
+            tokenizer=self.tokenizer,
+            unet=self.unet,
+            scheduler=self.scheduler,
+            safety_checker=self.safety_checker,
+            feature_extractor=self.feature_extractor,
+        )
+        return inpainting_pipeline(
+            prompt=prompt,
+            image=image,
+            mask_image=mask_pil,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
diff --git a/diffusers/examples/community/tiled_upscaling.py b/diffusers/examples/community/tiled_upscaling.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fe7399c8aeddf9693fe8e2d07ac5bd696679dc2
--- /dev/null
+++ b/diffusers/examples/community/tiled_upscaling.py
@@ -0,0 +1,298 @@
+# Copyright 2024 Peter Willemsen <peter@codebuffet.co>. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from PIL import Image
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline
+from diffusers.schedulers import DDIMScheduler, DDPMScheduler, LMSDiscreteScheduler, PNDMScheduler
+
+
+def make_transparency_mask(size, overlap_pixels, remove_borders=[]):
+    size_x = size[0] - overlap_pixels * 2
+    size_y = size[1] - overlap_pixels * 2
+    for letter in ["l", "r"]:
+        if letter in remove_borders:
+            size_x += overlap_pixels
+    for letter in ["t", "b"]:
+        if letter in remove_borders:
+            size_y += overlap_pixels
+    mask = np.ones((size_y, size_x), dtype=np.uint8) * 255
+    mask = np.pad(mask, mode="linear_ramp", pad_width=overlap_pixels, end_values=0)
+
+    if "l" in remove_borders:
+        mask = mask[:, overlap_pixels : mask.shape[1]]
+    if "r" in remove_borders:
+        mask = mask[:, 0 : mask.shape[1] - overlap_pixels]
+    if "t" in remove_borders:
+        mask = mask[overlap_pixels : mask.shape[0], :]
+    if "b" in remove_borders:
+        mask = mask[0 : mask.shape[0] - overlap_pixels, :]
+    return mask
+
+
+def clamp(n, smallest, largest):
+    return max(smallest, min(n, largest))
+
+
+def clamp_rect(rect: [int], min: [int], max: [int]):
+    return (
+        clamp(rect[0], min[0], max[0]),
+        clamp(rect[1], min[1], max[1]),
+        clamp(rect[2], min[0], max[0]),
+        clamp(rect[3], min[1], max[1]),
+    )
+
+
+def add_overlap_rect(rect: [int], overlap: int, image_size: [int]):
+    rect = list(rect)
+    rect[0] -= overlap
+    rect[1] -= overlap
+    rect[2] += overlap
+    rect[3] += overlap
+    rect = clamp_rect(rect, [0, 0], [image_size[0], image_size[1]])
+    return rect
+
+
+def squeeze_tile(tile, original_image, original_slice, slice_x):
+    result = Image.new("RGB", (tile.size[0] + original_slice, tile.size[1]))
+    result.paste(
+        original_image.resize((tile.size[0], tile.size[1]), Image.BICUBIC).crop(
+            (slice_x, 0, slice_x + original_slice, tile.size[1])
+        ),
+        (0, 0),
+    )
+    result.paste(tile, (original_slice, 0))
+    return result
+
+
+def unsqueeze_tile(tile, original_image_slice):
+    crop_rect = (original_image_slice * 4, 0, tile.size[0], tile.size[1])
+    tile = tile.crop(crop_rect)
+    return tile
+
+
+def next_divisible(n, d):
+    divisor = n % d
+    return n - divisor
+
+
+class StableDiffusionTiledUpscalePipeline(StableDiffusionUpscalePipeline):
+    r"""
+    Pipeline for tile-based text-guided image super-resolution using Stable Diffusion 2, trading memory for compute
+    to create gigantic images.
+
+    This model inherits from [`StableDiffusionUpscalePipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        low_res_scheduler ([`SchedulerMixin`]):
+            A scheduler used to add initial noise to the low res conditioning image. It must be an instance of
+            [`DDPMScheduler`].
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        low_res_scheduler: DDPMScheduler,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        max_noise_level: int = 350,
+    ):
+        super().__init__(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            max_noise_level=max_noise_level,
+        )
+
+    def _process_tile(self, original_image_slice, x, y, tile_size, tile_border, image, final_image, **kwargs):
+        torch.manual_seed(0)
+        crop_rect = (
+            min(image.size[0] - (tile_size + original_image_slice), x * tile_size),
+            min(image.size[1] - (tile_size + original_image_slice), y * tile_size),
+            min(image.size[0], (x + 1) * tile_size),
+            min(image.size[1], (y + 1) * tile_size),
+        )
+        crop_rect_with_overlap = add_overlap_rect(crop_rect, tile_border, image.size)
+        tile = image.crop(crop_rect_with_overlap)
+        translated_slice_x = ((crop_rect[0] + ((crop_rect[2] - crop_rect[0]) / 2)) / image.size[0]) * tile.size[0]
+        translated_slice_x = translated_slice_x - (original_image_slice / 2)
+        translated_slice_x = max(0, translated_slice_x)
+        to_input = squeeze_tile(tile, image, original_image_slice, translated_slice_x)
+        orig_input_size = to_input.size
+        to_input = to_input.resize((tile_size, tile_size), Image.BICUBIC)
+        upscaled_tile = super(StableDiffusionTiledUpscalePipeline, self).__call__(image=to_input, **kwargs).images[0]
+        upscaled_tile = upscaled_tile.resize((orig_input_size[0] * 4, orig_input_size[1] * 4), Image.BICUBIC)
+        upscaled_tile = unsqueeze_tile(upscaled_tile, original_image_slice)
+        upscaled_tile = upscaled_tile.resize((tile.size[0] * 4, tile.size[1] * 4), Image.BICUBIC)
+        remove_borders = []
+        if x == 0:
+            remove_borders.append("l")
+        elif crop_rect[2] == image.size[0]:
+            remove_borders.append("r")
+        if y == 0:
+            remove_borders.append("t")
+        elif crop_rect[3] == image.size[1]:
+            remove_borders.append("b")
+        transparency_mask = Image.fromarray(
+            make_transparency_mask(
+                (upscaled_tile.size[0], upscaled_tile.size[1]), tile_border * 4, remove_borders=remove_borders
+            ),
+            mode="L",
+        )
+        final_image.paste(
+            upscaled_tile, (crop_rect_with_overlap[0] * 4, crop_rect_with_overlap[1] * 4), transparency_mask
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[PIL.Image.Image, List[PIL.Image.Image]],
+        num_inference_steps: int = 75,
+        guidance_scale: float = 9.0,
+        noise_level: int = 50,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        tile_size: int = 128,
+        tile_border: int = 32,
+        original_image_slice: int = 32,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.Tensor`):
+                `Image`, or tensor representing an image batch which will be upscaled. *
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            tile_size (`int`, *optional*):
+                The size of the tiles. Too big can result in an OOM-error.
+            tile_border (`int`, *optional*):
+                The number of pixels around a tile to consider (bigger means less seams, too big can lead to an OOM-error).
+            original_image_slice (`int`, *optional*):
+                The amount of pixels of the original image to calculate with the current tile (bigger means more depth
+                is preserved, less blur occurs in the final image, too big can lead to an OOM-error or loss in detail).
+            callback (`Callable`, *optional*):
+                A function that take a callback function with a single argument, a dict,
+                that contains the (partially) processed image under "image",
+                as well as the progress (0 to 1, where 1 is completed) under "progress".
+
+        Returns: A PIL.Image that is 4 times larger than the original input image.
+
+        """
+
+        final_image = Image.new("RGB", (image.size[0] * 4, image.size[1] * 4))
+        tcx = math.ceil(image.size[0] / tile_size)
+        tcy = math.ceil(image.size[1] / tile_size)
+        total_tile_count = tcx * tcy
+        current_count = 0
+        for y in range(tcy):
+            for x in range(tcx):
+                self._process_tile(
+                    original_image_slice,
+                    x,
+                    y,
+                    tile_size,
+                    tile_border,
+                    image,
+                    final_image,
+                    prompt=prompt,
+                    num_inference_steps=num_inference_steps,
+                    guidance_scale=guidance_scale,
+                    noise_level=noise_level,
+                    negative_prompt=negative_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    eta=eta,
+                    generator=generator,
+                    latents=latents,
+                )
+                current_count += 1
+                if callback is not None:
+                    callback({"progress": current_count / total_tile_count, "image": final_image})
+        return final_image
+
+
+def main():
+    # Run a demo
+    model_id = "stabilityai/stable-diffusion-x4-upscaler"
+    pipe = StableDiffusionTiledUpscalePipeline.from_pretrained(model_id, variant="fp16", torch_dtype=torch.float16)
+    pipe = pipe.to("cuda")
+    image = Image.open("../../docs/source/imgs/diffusers_library.jpg")
+
+    def callback(obj):
+        print(f"progress: {obj['progress']:.4f}")
+        obj["image"].save("diffusers_library_progress.jpg")
+
+    final_image = pipe(image=image, prompt="Black font, white background, vector", noise_level=40, callback=callback)
+    final_image.save("diffusers_library.jpg")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/community/unclip_image_interpolation.py b/diffusers/examples/community/unclip_image_interpolation.py
new file mode 100644
index 0000000000000000000000000000000000000000..210bd61ecd1d46523e3c48061703133e5a585fc0
--- /dev/null
+++ b/diffusers/examples/community/unclip_image_interpolation.py
@@ -0,0 +1,452 @@
+import inspect
+from typing import List, Optional, Union
+
+import PIL.Image
+import torch
+from torch.nn import functional as F
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    DiffusionPipeline,
+    ImagePipelineOutput,
+    UnCLIPScheduler,
+    UNet2DConditionModel,
+    UNet2DModel,
+)
+from diffusers.pipelines.unclip import UnCLIPTextProjModel
+from diffusers.utils import logging
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def slerp(val, low, high):
+    """
+    Find the interpolation point between the 'low' and 'high' values for the given 'val'. See https://en.wikipedia.org/wiki/Slerp for more details on the topic.
+    """
+    low_norm = low / torch.norm(low)
+    high_norm = high / torch.norm(high)
+    omega = torch.acos((low_norm * high_norm))
+    so = torch.sin(omega)
+    res = (torch.sin((1.0 - val) * omega) / so) * low + (torch.sin(val * omega) / so) * high
+    return res
+
+
+class UnCLIPImageInterpolationPipeline(DiffusionPipeline):
+    """
+    Pipeline to generate variations from an input image using unCLIP
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen CLIP image-encoder. unCLIP Image Variation uses the vision portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_proj ([`UnCLIPTextProjModel`]):
+            Utility class to prepare and combine the embeddings before they are passed to the decoder.
+        decoder ([`UNet2DConditionModel`]):
+            The decoder to invert the image embedding into an image.
+        super_res_first ([`UNet2DModel`]):
+            Super resolution unet. Used in all but the last step of the super resolution diffusion process.
+        super_res_last ([`UNet2DModel`]):
+            Super resolution unet. Used in the last step of the super resolution diffusion process.
+        decoder_scheduler ([`UnCLIPScheduler`]):
+            Scheduler used in the decoder denoising process. Just a modified DDPMScheduler.
+        super_res_scheduler ([`UnCLIPScheduler`]):
+            Scheduler used in the super resolution denoising process. Just a modified DDPMScheduler.
+
+    """
+
+    decoder: UNet2DConditionModel
+    text_proj: UnCLIPTextProjModel
+    text_encoder: CLIPTextModelWithProjection
+    tokenizer: CLIPTokenizer
+    feature_extractor: CLIPImageProcessor
+    image_encoder: CLIPVisionModelWithProjection
+    super_res_first: UNet2DModel
+    super_res_last: UNet2DModel
+
+    decoder_scheduler: UnCLIPScheduler
+    super_res_scheduler: UnCLIPScheduler
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip_image_variation.UnCLIPImageVariationPipeline.__init__
+    def __init__(
+        self,
+        decoder: UNet2DConditionModel,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_proj: UnCLIPTextProjModel,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection,
+        super_res_first: UNet2DModel,
+        super_res_last: UNet2DModel,
+        decoder_scheduler: UnCLIPScheduler,
+        super_res_scheduler: UnCLIPScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            decoder=decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_proj=text_proj,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            super_res_first=super_res_first,
+            super_res_last=super_res_last,
+            decoder_scheduler=decoder_scheduler,
+            super_res_scheduler=super_res_scheduler,
+        )
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip_image_variation.UnCLIPImageVariationPipeline._encode_prompt
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        text_mask = text_inputs.attention_mask.bool().to(device)
+        text_encoder_output = self.text_encoder(text_input_ids.to(device))
+
+        prompt_embeds = text_encoder_output.text_embeds
+        text_encoder_hidden_states = text_encoder_output.last_hidden_state
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens = [""] * batch_size
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+            uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip_image_variation.UnCLIPImageVariationPipeline._encode_image
+    def _encode_image(self, image, device, num_images_per_prompt, image_embeddings: Optional[torch.Tensor] = None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if image_embeddings is None:
+            if not isinstance(image, torch.Tensor):
+                image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
+
+            image = image.to(device=device, dtype=dtype)
+            image_embeddings = self.image_encoder(image).image_embeds
+
+        image_embeddings = image_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        return image_embeddings
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Optional[Union[List[PIL.Image.Image], torch.Tensor]] = None,
+        steps: int = 5,
+        decoder_num_inference_steps: int = 25,
+        super_res_num_inference_steps: int = 7,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        image_embeddings: Optional[torch.Tensor] = None,
+        decoder_latents: Optional[torch.Tensor] = None,
+        super_res_latents: Optional[torch.Tensor] = None,
+        decoder_guidance_scale: float = 8.0,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image (`List[PIL.Image.Image]` or `torch.Tensor`):
+                The images to use for the image interpolation. Only accepts a list of two PIL Images or If you provide a tensor, it needs to comply with the
+                configuration of
+                [this](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json)
+                `CLIPImageProcessor` while still having a shape of two in the 0th dimension. Can be left to `None` only when `image_embeddings` are passed.
+            steps (`int`, *optional*, defaults to 5):
+                The number of interpolation images to generate.
+            decoder_num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
+                image at the expense of slower inference.
+            super_res_num_inference_steps (`int`, *optional*, defaults to 7):
+                The number of denoising steps for super resolution. More denoising steps usually lead to a higher
+                quality image at the expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            image_embeddings (`torch.Tensor`, *optional*):
+                Pre-defined image embeddings that can be derived from the image encoder. Pre-defined image embeddings
+                can be passed for tasks like image interpolations. `image` can the be left to `None`.
+            decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*):
+                Pre-generated noisy latents to be used as inputs for the decoder.
+            super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*):
+                Pre-generated noisy latents to be used as inputs for the decoder.
+            decoder_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+        """
+
+        batch_size = steps
+
+        device = self._execution_device
+
+        if isinstance(image, List):
+            if len(image) != 2:
+                raise AssertionError(
+                    f"Expected 'image' List to be of size 2, but passed 'image' length is {len(image)}"
+                )
+            elif not (isinstance(image[0], PIL.Image.Image) and isinstance(image[0], PIL.Image.Image)):
+                raise AssertionError(
+                    f"Expected 'image' List to contain PIL.Image.Image, but passed 'image' contents are {type(image[0])} and {type(image[1])}"
+                )
+        elif isinstance(image, torch.Tensor):
+            if image.shape[0] != 2:
+                raise AssertionError(
+                    f"Expected 'image' to be torch.Tensor of shape 2 in 0th dimension, but passed 'image' size is {image.shape[0]}"
+                )
+        elif isinstance(image_embeddings, torch.Tensor):
+            if image_embeddings.shape[0] != 2:
+                raise AssertionError(
+                    f"Expected 'image_embeddings' to be torch.Tensor of shape 2 in 0th dimension, but passed 'image_embeddings' shape is {image_embeddings.shape[0]}"
+                )
+        else:
+            raise AssertionError(
+                f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} repsectively"
+            )
+
+        original_image_embeddings = self._encode_image(
+            image=image, device=device, num_images_per_prompt=1, image_embeddings=image_embeddings
+        )
+
+        image_embeddings = []
+
+        for interp_step in torch.linspace(0, 1, steps):
+            temp_image_embeddings = slerp(
+                interp_step, original_image_embeddings[0], original_image_embeddings[1]
+            ).unsqueeze(0)
+            image_embeddings.append(temp_image_embeddings)
+
+        image_embeddings = torch.cat(image_embeddings).to(device)
+
+        do_classifier_free_guidance = decoder_guidance_scale > 1.0
+
+        prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
+            prompt=["" for i in range(steps)],
+            device=device,
+            num_images_per_prompt=1,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+
+        text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj(
+            image_embeddings=image_embeddings,
+            prompt_embeds=prompt_embeds,
+            text_encoder_hidden_states=text_encoder_hidden_states,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+
+        if device.type == "mps":
+            # HACK: MPS: There is a panic when padding bool tensors,
+            # so cast to int tensor for the pad and back to bool afterwards
+            text_mask = text_mask.type(torch.int)
+            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=1)
+            decoder_text_mask = decoder_text_mask.type(torch.bool)
+        else:
+            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=True)
+
+        self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device)
+        decoder_timesteps_tensor = self.decoder_scheduler.timesteps
+
+        num_channels_latents = self.decoder.config.in_channels
+        height = self.decoder.config.sample_size
+        width = self.decoder.config.sample_size
+
+        # Get the decoder latents for 1 step and then repeat the same tensor for the entire batch to keep same noise across all interpolation steps.
+        decoder_latents = self.prepare_latents(
+            (1, num_channels_latents, height, width),
+            text_encoder_hidden_states.dtype,
+            device,
+            generator,
+            decoder_latents,
+            self.decoder_scheduler,
+        )
+        decoder_latents = decoder_latents.repeat((batch_size, 1, 1, 1))
+
+        for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents
+
+            noise_pred = self.decoder(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=text_encoder_hidden_states,
+                class_labels=additive_clip_time_embeddings,
+                attention_mask=decoder_text_mask,
+            ).sample
+
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred_uncond, _ = noise_pred_uncond.split(latent_model_input.shape[1], dim=1)
+                noise_pred_text, predicted_variance = noise_pred_text.split(latent_model_input.shape[1], dim=1)
+                noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+
+            if i + 1 == decoder_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = decoder_timesteps_tensor[i + 1]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            decoder_latents = self.decoder_scheduler.step(
+                noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator
+            ).prev_sample
+
+        decoder_latents = decoder_latents.clamp(-1, 1)
+
+        image_small = decoder_latents
+
+        # done decoder
+
+        # super res
+
+        self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device)
+        super_res_timesteps_tensor = self.super_res_scheduler.timesteps
+
+        channels = self.super_res_first.config.in_channels // 2
+        height = self.super_res_first.config.sample_size
+        width = self.super_res_first.config.sample_size
+
+        super_res_latents = self.prepare_latents(
+            (batch_size, channels, height, width),
+            image_small.dtype,
+            device,
+            generator,
+            super_res_latents,
+            self.super_res_scheduler,
+        )
+
+        if device.type == "mps":
+            # MPS does not support many interpolations
+            image_upscaled = F.interpolate(image_small, size=[height, width])
+        else:
+            interpolate_antialias = {}
+            if "antialias" in inspect.signature(F.interpolate).parameters:
+                interpolate_antialias["antialias"] = True
+
+            image_upscaled = F.interpolate(
+                image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias
+            )
+
+        for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)):
+            # no classifier free guidance
+
+            if i == super_res_timesteps_tensor.shape[0] - 1:
+                unet = self.super_res_last
+            else:
+                unet = self.super_res_first
+
+            latent_model_input = torch.cat([super_res_latents, image_upscaled], dim=1)
+
+            noise_pred = unet(
+                sample=latent_model_input,
+                timestep=t,
+            ).sample
+
+            if i + 1 == super_res_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = super_res_timesteps_tensor[i + 1]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            super_res_latents = self.super_res_scheduler.step(
+                noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator
+            ).prev_sample
+
+        image = super_res_latents
+        # done super res
+
+        # post processing
+
+        image = image * 0.5 + 0.5
+        image = image.clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/examples/community/unclip_text_interpolation.py b/diffusers/examples/community/unclip_text_interpolation.py
new file mode 100644
index 0000000000000000000000000000000000000000..84f1c5a21f7a2c0ceb434815fe4e2b6272e83b01
--- /dev/null
+++ b/diffusers/examples/community/unclip_text_interpolation.py
@@ -0,0 +1,528 @@
+import inspect
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch.nn import functional as F
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+from transformers.models.clip.modeling_clip import CLIPTextModelOutput
+
+from diffusers import (
+    DiffusionPipeline,
+    ImagePipelineOutput,
+    PriorTransformer,
+    UnCLIPScheduler,
+    UNet2DConditionModel,
+    UNet2DModel,
+)
+from diffusers.pipelines.unclip import UnCLIPTextProjModel
+from diffusers.utils import logging
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def slerp(val, low, high):
+    """
+    Find the interpolation point between the 'low' and 'high' values for the given 'val'. See https://en.wikipedia.org/wiki/Slerp for more details on the topic.
+    """
+    low_norm = low / torch.norm(low)
+    high_norm = high / torch.norm(high)
+    omega = torch.acos((low_norm * high_norm))
+    so = torch.sin(omega)
+    res = (torch.sin((1.0 - val) * omega) / so) * low + (torch.sin(val * omega) / so) * high
+    return res
+
+
+class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
+    """
+    Pipeline for prompt-to-prompt interpolation on CLIP text embeddings and using the UnCLIP / Dall-E to decode them to images.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        text_proj ([`UnCLIPTextProjModel`]):
+            Utility class to prepare and combine the embeddings before they are passed to the decoder.
+        decoder ([`UNet2DConditionModel`]):
+            The decoder to invert the image embedding into an image.
+        super_res_first ([`UNet2DModel`]):
+            Super resolution unet. Used in all but the last step of the super resolution diffusion process.
+        super_res_last ([`UNet2DModel`]):
+            Super resolution unet. Used in the last step of the super resolution diffusion process.
+        prior_scheduler ([`UnCLIPScheduler`]):
+            Scheduler used in the prior denoising process. Just a modified DDPMScheduler.
+        decoder_scheduler ([`UnCLIPScheduler`]):
+            Scheduler used in the decoder denoising process. Just a modified DDPMScheduler.
+        super_res_scheduler ([`UnCLIPScheduler`]):
+            Scheduler used in the super resolution denoising process. Just a modified DDPMScheduler.
+
+    """
+
+    prior: PriorTransformer
+    decoder: UNet2DConditionModel
+    text_proj: UnCLIPTextProjModel
+    text_encoder: CLIPTextModelWithProjection
+    tokenizer: CLIPTokenizer
+    super_res_first: UNet2DModel
+    super_res_last: UNet2DModel
+
+    prior_scheduler: UnCLIPScheduler
+    decoder_scheduler: UnCLIPScheduler
+    super_res_scheduler: UnCLIPScheduler
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.__init__
+    def __init__(
+        self,
+        prior: PriorTransformer,
+        decoder: UNet2DConditionModel,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_proj: UnCLIPTextProjModel,
+        super_res_first: UNet2DModel,
+        super_res_last: UNet2DModel,
+        prior_scheduler: UnCLIPScheduler,
+        decoder_scheduler: UnCLIPScheduler,
+        super_res_scheduler: UnCLIPScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            prior=prior,
+            decoder=decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_proj=text_proj,
+            super_res_first=super_res_first,
+            super_res_last=super_res_last,
+            prior_scheduler=prior_scheduler,
+            decoder_scheduler=decoder_scheduler,
+            super_res_scheduler=super_res_scheduler,
+        )
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
+        text_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        if text_model_output is None:
+            batch_size = len(prompt) if isinstance(prompt, list) else 1
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            text_mask = text_inputs.attention_mask.bool().to(device)
+
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
+            text_encoder_output = self.text_encoder(text_input_ids.to(device))
+
+            prompt_embeds = text_encoder_output.text_embeds
+            text_encoder_hidden_states = text_encoder_output.last_hidden_state
+
+        else:
+            batch_size = text_model_output[0].shape[0]
+            prompt_embeds, text_encoder_hidden_states = text_model_output[0], text_model_output[1]
+            text_mask = text_attention_mask
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens = [""] * batch_size
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+            uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        start_prompt: str,
+        end_prompt: str,
+        steps: int = 5,
+        prior_num_inference_steps: int = 25,
+        decoder_num_inference_steps: int = 25,
+        super_res_num_inference_steps: int = 7,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prior_guidance_scale: float = 4.0,
+        decoder_guidance_scale: float = 8.0,
+        enable_sequential_cpu_offload=True,
+        gpu_id=0,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            start_prompt (`str`):
+                The prompt to start the image generation interpolation from.
+            end_prompt (`str`):
+                The prompt to end the image generation interpolation at.
+            steps (`int`, *optional*, defaults to 5):
+                The number of steps over which to interpolate from start_prompt to end_prompt. The pipeline returns
+                the same number of images as this value.
+            prior_num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps for the prior. More denoising steps usually lead to a higher quality
+                image at the expense of slower inference.
+            decoder_num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
+                image at the expense of slower inference.
+            super_res_num_inference_steps (`int`, *optional*, defaults to 7):
+                The number of denoising steps for super resolution. More denoising steps usually lead to a higher
+                quality image at the expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            decoder_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            enable_sequential_cpu_offload (`bool`, *optional*, defaults to `True`):
+                If True, offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
+                models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
+                when their specific submodule has its `forward` method called.
+            gpu_id (`int`, *optional*, defaults to `0`):
+                The gpu_id to be passed to enable_sequential_cpu_offload. Only works when enable_sequential_cpu_offload is set to True.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+        """
+
+        if not isinstance(start_prompt, str) or not isinstance(end_prompt, str):
+            raise ValueError(
+                f"`start_prompt` and `end_prompt` should be of type `str` but got {type(start_prompt)} and"
+                f" {type(end_prompt)} instead"
+            )
+
+        if enable_sequential_cpu_offload:
+            self.enable_sequential_cpu_offload(gpu_id=gpu_id)
+
+        device = self._execution_device
+
+        # Turn the prompts into embeddings.
+        inputs = self.tokenizer(
+            [start_prompt, end_prompt],
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        inputs.to(device)
+        text_model_output = self.text_encoder(**inputs)
+
+        text_attention_mask = torch.max(inputs.attention_mask[0], inputs.attention_mask[1])
+        text_attention_mask = torch.cat([text_attention_mask.unsqueeze(0)] * steps).to(device)
+
+        # Interpolate from the start to end prompt using slerp and add the generated images to an image output pipeline
+        batch_text_embeds = []
+        batch_last_hidden_state = []
+
+        for interp_val in torch.linspace(0, 1, steps):
+            text_embeds = slerp(interp_val, text_model_output.text_embeds[0], text_model_output.text_embeds[1])
+            last_hidden_state = slerp(
+                interp_val, text_model_output.last_hidden_state[0], text_model_output.last_hidden_state[1]
+            )
+            batch_text_embeds.append(text_embeds.unsqueeze(0))
+            batch_last_hidden_state.append(last_hidden_state.unsqueeze(0))
+
+        batch_text_embeds = torch.cat(batch_text_embeds)
+        batch_last_hidden_state = torch.cat(batch_last_hidden_state)
+
+        text_model_output = CLIPTextModelOutput(
+            text_embeds=batch_text_embeds, last_hidden_state=batch_last_hidden_state
+        )
+
+        batch_size = text_model_output[0].shape[0]
+
+        do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0
+
+        prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
+            prompt=None,
+            device=device,
+            num_images_per_prompt=1,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            text_model_output=text_model_output,
+            text_attention_mask=text_attention_mask,
+        )
+
+        # prior
+
+        self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device)
+        prior_timesteps_tensor = self.prior_scheduler.timesteps
+
+        embedding_dim = self.prior.config.embedding_dim
+
+        prior_latents = self.prepare_latents(
+            (batch_size, embedding_dim),
+            prompt_embeds.dtype,
+            device,
+            generator,
+            None,
+            self.prior_scheduler,
+        )
+
+        for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents
+
+            predicted_image_embedding = self.prior(
+                latent_model_input,
+                timestep=t,
+                proj_embedding=prompt_embeds,
+                encoder_hidden_states=text_encoder_hidden_states,
+                attention_mask=text_mask,
+            ).predicted_image_embedding
+
+            if do_classifier_free_guidance:
+                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * (
+                    predicted_image_embedding_text - predicted_image_embedding_uncond
+                )
+
+            if i + 1 == prior_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = prior_timesteps_tensor[i + 1]
+
+            prior_latents = self.prior_scheduler.step(
+                predicted_image_embedding,
+                timestep=t,
+                sample=prior_latents,
+                generator=generator,
+                prev_timestep=prev_timestep,
+            ).prev_sample
+
+        prior_latents = self.prior.post_process_latents(prior_latents)
+
+        image_embeddings = prior_latents
+
+        # done prior
+
+        # decoder
+
+        text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj(
+            image_embeddings=image_embeddings,
+            prompt_embeds=prompt_embeds,
+            text_encoder_hidden_states=text_encoder_hidden_states,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+
+        if device.type == "mps":
+            # HACK: MPS: There is a panic when padding bool tensors,
+            # so cast to int tensor for the pad and back to bool afterwards
+            text_mask = text_mask.type(torch.int)
+            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=1)
+            decoder_text_mask = decoder_text_mask.type(torch.bool)
+        else:
+            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=True)
+
+        self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device)
+        decoder_timesteps_tensor = self.decoder_scheduler.timesteps
+
+        num_channels_latents = self.decoder.config.in_channels
+        height = self.decoder.config.sample_size
+        width = self.decoder.config.sample_size
+
+        decoder_latents = self.prepare_latents(
+            (batch_size, num_channels_latents, height, width),
+            text_encoder_hidden_states.dtype,
+            device,
+            generator,
+            None,
+            self.decoder_scheduler,
+        )
+
+        for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents
+
+            noise_pred = self.decoder(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=text_encoder_hidden_states,
+                class_labels=additive_clip_time_embeddings,
+                attention_mask=decoder_text_mask,
+            ).sample
+
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred_uncond, _ = noise_pred_uncond.split(latent_model_input.shape[1], dim=1)
+                noise_pred_text, predicted_variance = noise_pred_text.split(latent_model_input.shape[1], dim=1)
+                noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+
+            if i + 1 == decoder_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = decoder_timesteps_tensor[i + 1]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            decoder_latents = self.decoder_scheduler.step(
+                noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator
+            ).prev_sample
+
+        decoder_latents = decoder_latents.clamp(-1, 1)
+
+        image_small = decoder_latents
+
+        # done decoder
+
+        # super res
+
+        self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device)
+        super_res_timesteps_tensor = self.super_res_scheduler.timesteps
+
+        channels = self.super_res_first.config.in_channels // 2
+        height = self.super_res_first.config.sample_size
+        width = self.super_res_first.config.sample_size
+
+        super_res_latents = self.prepare_latents(
+            (batch_size, channels, height, width),
+            image_small.dtype,
+            device,
+            generator,
+            None,
+            self.super_res_scheduler,
+        )
+
+        if device.type == "mps":
+            # MPS does not support many interpolations
+            image_upscaled = F.interpolate(image_small, size=[height, width])
+        else:
+            interpolate_antialias = {}
+            if "antialias" in inspect.signature(F.interpolate).parameters:
+                interpolate_antialias["antialias"] = True
+
+            image_upscaled = F.interpolate(
+                image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias
+            )
+
+        for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)):
+            # no classifier free guidance
+
+            if i == super_res_timesteps_tensor.shape[0] - 1:
+                unet = self.super_res_last
+            else:
+                unet = self.super_res_first
+
+            latent_model_input = torch.cat([super_res_latents, image_upscaled], dim=1)
+
+            noise_pred = unet(
+                sample=latent_model_input,
+                timestep=t,
+            ).sample
+
+            if i + 1 == super_res_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = super_res_timesteps_tensor[i + 1]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            super_res_latents = self.super_res_scheduler.step(
+                noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator
+            ).prev_sample
+
+        image = super_res_latents
+        # done super res
+
+        # post processing
+
+        image = image * 0.5 + 0.5
+        image = image.clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/examples/community/wildcard_stable_diffusion.py b/diffusers/examples/community/wildcard_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..c866ce2ae9042e8e3ae33b98732330e95aa42962
--- /dev/null
+++ b/diffusers/examples/community/wildcard_stable_diffusion.py
@@ -0,0 +1,419 @@
+import inspect
+import os
+import random
+import re
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+global_re_wildcard = re.compile(r"__([^_]*)__")
+
+
+def get_filename(path: str):
+    # this doesn't work on Windows
+    return os.path.basename(path).split(".txt")[0]
+
+
+def read_wildcard_values(path: str):
+    with open(path, encoding="utf8") as f:
+        return f.read().splitlines()
+
+
+def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = []):
+    for wildcard_file in wildcard_files:
+        filename = get_filename(wildcard_file)
+        read_values = read_wildcard_values(wildcard_file)
+        if filename not in wildcard_option_dict:
+            wildcard_option_dict[filename] = []
+        wildcard_option_dict[filename].extend(read_values)
+    return wildcard_option_dict
+
+
+def replace_prompt_with_wildcards(
+    prompt: str, wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = []
+):
+    new_prompt = prompt
+
+    # get wildcard options
+    wildcard_option_dict = grab_wildcard_values(wildcard_option_dict, wildcard_files)
+
+    for m in global_re_wildcard.finditer(new_prompt):
+        wildcard_value = m.group()
+        replace_value = random.choice(wildcard_option_dict[wildcard_value.strip("__")])
+        new_prompt = new_prompt.replace(wildcard_value, replace_value, 1)
+
+    return new_prompt
+
+
+@dataclass
+class WildcardStableDiffusionOutput(StableDiffusionPipelineOutput):
+    prompts: List[str]
+
+
+class WildcardStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Example Usage:
+        pipe = WildcardStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+
+            torch_dtype=torch.float16,
+        )
+        prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
+        out = pipe(
+            prompt,
+            wildcard_option_dict={
+                "clothing":["hat", "shirt", "scarf", "beret"]
+            },
+            wildcard_files=["object.txt", "animal.txt"],
+            num_prompt_samples=1
+        )
+
+
+    Pipeline for text-to-image generation with wild cards using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        wildcard_option_dict: Dict[str, List[str]] = {},
+        wildcard_files: List[str] = [],
+        num_prompt_samples: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            wildcard_option_dict (Dict[str, List[str]]):
+                dict with key as `wildcard` and values as a list of possible replacements. For example if a prompt, "A __animal__ sitting on a chair". A wildcard_option_dict can provide possible values for "animal" like this: {"animal":["dog", "cat", "fox"]}
+            wildcard_files: (List[str])
+               List of filenames of txt files for wildcard replacements. For example if a prompt, "A __animal__ sitting on a chair". A file can be provided ["animal.txt"]
+            num_prompt_samples: int
+                Number of times to sample wildcards for each prompt provided
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if isinstance(prompt, str):
+            prompt = [
+                replace_prompt_with_wildcards(prompt, wildcard_option_dict, wildcard_files)
+                for i in range(num_prompt_samples)
+            ]
+            batch_size = len(prompt)
+        elif isinstance(prompt, list):
+            prompt_list = []
+            for p in prompt:
+                for i in range(num_prompt_samples):
+                    prompt_list.append(replace_prompt_with_wildcards(p, wildcard_option_dict, wildcard_files))
+            prompt = prompt_list
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return WildcardStableDiffusionOutput(images=image, nsfw_content_detected=has_nsfw_concept, prompts=prompt)
diff --git a/diffusers/examples/conftest.py b/diffusers/examples/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..57d77d18a915095f27965da79463c4698749042e
--- /dev/null
+++ b/diffusers/examples/conftest.py
@@ -0,0 +1,45 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# tests directory-specific settings - this file is run automatically
+# by pytest before any tests are run
+
+import sys
+import warnings
+from os.path import abspath, dirname, join
+
+
+# allow having multiple repository checkouts and not needing to remember to rerun
+# 'pip install -e .[dev]' when switching between checkouts and running tests.
+git_repo_path = abspath(join(dirname(dirname(dirname(__file__))), "src"))
+sys.path.insert(1, git_repo_path)
+
+
+# silence FutureWarning warnings in tests since often we can't act on them until
+# they become normal warnings - i.e. the tests still need to test the current functionality
+warnings.simplefilter(action="ignore", category=FutureWarning)
+
+
+def pytest_addoption(parser):
+    from diffusers.utils.testing_utils import pytest_addoption_shared
+
+    pytest_addoption_shared(parser)
+
+
+def pytest_terminal_summary(terminalreporter):
+    from diffusers.utils.testing_utils import pytest_terminal_summary_main
+
+    make_reports = terminalreporter.config.getoption("--make-reports")
+    if make_reports:
+        pytest_terminal_summary_main(terminalreporter, id=make_reports)
diff --git a/diffusers/examples/consistency_distillation/README.md b/diffusers/examples/consistency_distillation/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f5cb72fa8682040a61e7372a02ddfae934222b08
--- /dev/null
+++ b/diffusers/examples/consistency_distillation/README.md
@@ -0,0 +1,112 @@
+# Latent Consistency Distillation Example:
+
+[Latent Consistency Models (LCMs)](https://arxiv.org/abs/2310.04378) is a method to distill a latent diffusion model to enable swift inference with minimal steps. This example demonstrates how to use latent consistency distillation to distill stable-diffusion-v1.5 for inference with few timesteps.
+
+## Full model distillation
+
+### Running locally with PyTorch
+
+#### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the example folder and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+
+
+#### Example
+
+The following uses the [Conceptual Captions 12M (CC12M) dataset](https://github.com/google-research-datasets/conceptual-12m) as an example, and for illustrative purposes only. For best results you may consider large and high-quality text-image datasets such as [LAION](https://laion.ai/blog/laion-400-open-dataset/). You may also need to search the hyperparameter space according to the dataset you use.
+
+```bash
+export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export OUTPUT_DIR="path/to/saved/model"
+
+accelerate launch train_lcm_distill_sd_wds.py \
+    --pretrained_teacher_model=$MODEL_NAME \
+    --output_dir=$OUTPUT_DIR \
+    --mixed_precision=fp16 \
+    --resolution=512 \
+    --learning_rate=1e-6 --loss_type="huber" --ema_decay=0.95 --adam_weight_decay=0.0 \
+    --max_train_steps=1000 \
+    --max_train_samples=4000000 \
+    --dataloader_num_workers=8 \
+    --train_shards_path_or_url="pipe:curl -L -s https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset/resolve/main/data/{00000..01099}.tar?download=true" \
+    --validation_steps=200 \
+    --checkpointing_steps=200 --checkpoints_total_limit=10 \
+    --train_batch_size=12 \
+    --gradient_checkpointing --enable_xformers_memory_efficient_attention \
+    --gradient_accumulation_steps=1 \
+    --use_8bit_adam \
+    --resume_from_checkpoint=latest \
+    --report_to=wandb \
+    --seed=453645634 \
+    --push_to_hub
+```
+
+## LCM-LoRA
+
+Instead of fine-tuning the full model, we can also just train a LoRA that can be injected into any SDXL model.
+
+### Example
+
+The following uses the [Conceptual Captions 12M (CC12M) dataset](https://github.com/google-research-datasets/conceptual-12m) as an example. For best results you may consider large and high-quality text-image datasets such as [LAION](https://laion.ai/blog/laion-400-open-dataset/).
+
+```bash
+export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export OUTPUT_DIR="path/to/saved/model"
+
+accelerate launch train_lcm_distill_lora_sd_wds.py \
+    --pretrained_teacher_model=$MODEL_NAME \
+    --output_dir=$OUTPUT_DIR \
+    --mixed_precision=fp16 \
+    --resolution=512 \
+    --lora_rank=64 \
+    --learning_rate=1e-4 --loss_type="huber" --adam_weight_decay=0.0 \
+    --max_train_steps=1000 \
+    --max_train_samples=4000000 \
+    --dataloader_num_workers=8 \
+    --train_shards_path_or_url="pipe:curl -L -s https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset/resolve/main/data/{00000..01099}.tar?download=true" \
+    --validation_steps=200 \
+    --checkpointing_steps=200 --checkpoints_total_limit=10 \
+    --train_batch_size=12 \
+    --gradient_checkpointing --enable_xformers_memory_efficient_attention \
+    --gradient_accumulation_steps=1 \
+    --use_8bit_adam \
+    --resume_from_checkpoint=latest \
+    --report_to=wandb \
+    --seed=453645634 \
+    --push_to_hub \
+```
\ No newline at end of file
diff --git a/diffusers/examples/consistency_distillation/README_sdxl.md b/diffusers/examples/consistency_distillation/README_sdxl.md
new file mode 100644
index 0000000000000000000000000000000000000000..edb4bd57f291a597fa01dc2ecbe60c4c69b7f9cd
--- /dev/null
+++ b/diffusers/examples/consistency_distillation/README_sdxl.md
@@ -0,0 +1,148 @@
+# Latent Consistency Distillation Example:
+
+[Latent Consistency Models (LCMs)](https://arxiv.org/abs/2310.04378) is a method to distill a latent diffusion model to enable swift inference with minimal steps. This example demonstrates how to use latent consistency distillation to distill SDXL for inference with few timesteps.
+
+## Full model distillation
+
+### Running locally with PyTorch
+
+#### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the example folder and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+
+
+#### Example
+
+The following uses the [Conceptual Captions 12M (CC12M) dataset](https://github.com/google-research-datasets/conceptual-12m) as an example, and for illustrative purposes only. For best results you may consider large and high-quality text-image datasets such as [LAION](https://laion.ai/blog/laion-400-open-dataset/). You may also need to search the hyperparameter space according to the dataset you use.
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export OUTPUT_DIR="path/to/saved/model"
+
+accelerate launch train_lcm_distill_sdxl_wds.py \
+    --pretrained_teacher_model=$MODEL_NAME \
+    --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
+    --output_dir=$OUTPUT_DIR \
+    --mixed_precision=fp16 \
+    --resolution=1024 \
+    --learning_rate=1e-6 --loss_type="huber" --use_fix_crop_and_size --ema_decay=0.95 --adam_weight_decay=0.0 \
+    --max_train_steps=1000 \
+    --max_train_samples=4000000 \
+    --dataloader_num_workers=8 \
+    --train_shards_path_or_url="pipe:curl -L -s https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset/resolve/main/data/{00000..01099}.tar?download=true" \
+    --validation_steps=200 \
+    --checkpointing_steps=200 --checkpoints_total_limit=10 \
+    --train_batch_size=12 \
+    --gradient_checkpointing --enable_xformers_memory_efficient_attention \
+    --gradient_accumulation_steps=1 \
+    --use_8bit_adam \
+    --resume_from_checkpoint=latest \
+    --report_to=wandb \
+    --seed=453645634 \
+    --push_to_hub \
+```
+
+## LCM-LoRA
+
+Instead of fine-tuning the full model, we can also just train a LoRA that can be injected into any SDXL model.
+
+### Example
+
+The following uses the [Conceptual Captions 12M (CC12M) dataset](https://github.com/google-research-datasets/conceptual-12m) as an example. For best results you may consider large and high-quality text-image datasets such as [LAION](https://laion.ai/blog/laion-400-open-dataset/).
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export OUTPUT_DIR="path/to/saved/model"
+
+accelerate launch train_lcm_distill_lora_sdxl_wds.py \
+    --pretrained_teacher_model=$MODEL_DIR \
+    --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
+    --output_dir=$OUTPUT_DIR \
+    --mixed_precision=fp16 \
+    --resolution=1024 \
+    --lora_rank=64 \
+    --learning_rate=1e-4 --loss_type="huber" --use_fix_crop_and_size --adam_weight_decay=0.0 \
+    --max_train_steps=1000 \
+    --max_train_samples=4000000 \
+    --dataloader_num_workers=8 \
+    --train_shards_path_or_url="pipe:curl -L -s https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset/resolve/main/data/{00000..01099}.tar?download=true" \
+    --validation_steps=200 \
+    --checkpointing_steps=200 --checkpoints_total_limit=10 \
+    --train_batch_size=12 \
+    --gradient_checkpointing --enable_xformers_memory_efficient_attention \
+    --gradient_accumulation_steps=1 \
+    --use_8bit_adam \
+    --resume_from_checkpoint=latest \
+    --report_to=wandb \
+    --seed=453645634 \
+    --push_to_hub \
+```
+
+We provide another version for LCM LoRA SDXL that follows best practices of `peft` and leverages the `datasets` library for quick experimentation. The script doesn't load two UNets unlike `train_lcm_distill_lora_sdxl_wds.py` which reduces the memory requirements quite a bit.
+
+Below is an example training command that trains an LCM LoRA on the [Narutos dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions):
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+export VAE_PATH="madebyollin/sdxl-vae-fp16-fix"
+
+accelerate launch train_lcm_distill_lora_sdxl.py \
+  --pretrained_teacher_model=${MODEL_NAME}  \
+  --pretrained_vae_model_name_or_path=${VAE_PATH} \
+  --output_dir="narutos-lora-lcm-sdxl" \
+  --mixed_precision="fp16" \
+  --dataset_name=$DATASET_NAME \
+  --resolution=1024 \
+  --train_batch_size=24 \
+  --gradient_accumulation_steps=1 \
+  --gradient_checkpointing \
+  --use_8bit_adam \
+  --lora_rank=64 \
+  --learning_rate=1e-4 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=3000 \
+  --checkpointing_steps=500 \
+  --validation_steps=50 \
+  --seed="0" \
+  --report_to="wandb" \
+  --push_to_hub
+```
+
diff --git a/diffusers/examples/consistency_distillation/requirements.txt b/diffusers/examples/consistency_distillation/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..09fb84270a8a897c2082748fff25220b94d4532e
--- /dev/null
+++ b/diffusers/examples/consistency_distillation/requirements.txt
@@ -0,0 +1,7 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
+webdataset
\ No newline at end of file
diff --git a/diffusers/examples/consistency_distillation/test_lcm_lora.py b/diffusers/examples/consistency_distillation/test_lcm_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ca66909a2061dd5918bb657ea8cd1d26c099a3a
--- /dev/null
+++ b/diffusers/examples/consistency_distillation/test_lcm_lora.py
@@ -0,0 +1,112 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+import safetensors
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class TextToImageLCM(ExamplesTestsAccelerate):
+    def test_text_to_image_lcm_lora_sdxl(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
+                --pretrained_teacher_model hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --lora_rank 4
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+    def test_text_to_image_lcm_lora_sdxl_checkpointing(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
+                --pretrained_teacher_model hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --lora_rank 4
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 7
+                --checkpointing_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4", "checkpoint-6"},
+            )
+
+            test_args = f"""
+                examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
+                --pretrained_teacher_model hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --lora_rank 4
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 9
+                --checkpointing_steps 2
+                --resume_from_checkpoint latest
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
+            )
diff --git a/diffusers/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py b/diffusers/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py
new file mode 100644
index 0000000000000000000000000000000000000000..611026675daf619b4cd91250740d0f76426ac640
--- /dev/null
+++ b/diffusers/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py
@@ -0,0 +1,1467 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import functools
+import gc
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+from typing import List, Union
+
+import accelerate
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import torchvision.transforms.functional as TF
+import transformers
+import webdataset as wds
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from braceexpand import braceexpand
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig, get_peft_model, get_peft_model_state_dict
+from torch.utils.data import default_collate
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, CLIPTextModel, PretrainedConfig
+from webdataset.tariterators import (
+    base_plus_ext,
+    tar_file_expander,
+    url_opener,
+    valid_sample,
+)
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    LCMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import resolve_interpolation_mode
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+MAX_SEQ_LENGTH = 77
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def get_module_kohya_state_dict(module, prefix: str, dtype: torch.dtype, adapter_name: str = "default"):
+    kohya_ss_state_dict = {}
+    for peft_key, weight in get_peft_model_state_dict(module, adapter_name=adapter_name).items():
+        kohya_key = peft_key.replace("base_model.model", prefix)
+        kohya_key = kohya_key.replace("lora_A", "lora_down")
+        kohya_key = kohya_key.replace("lora_B", "lora_up")
+        kohya_key = kohya_key.replace(".", "_", kohya_key.count(".") - 2)
+        kohya_ss_state_dict[kohya_key] = weight.to(dtype)
+
+        # Set alpha parameter
+        if "lora_down" in kohya_key:
+            alpha_key = f'{kohya_key.split(".")[0]}.alpha'
+            kohya_ss_state_dict[alpha_key] = torch.tensor(module.peft_config[adapter_name].lora_alpha).to(dtype)
+
+    return kohya_ss_state_dict
+
+
+def filter_keys(key_set):
+    def _f(dictionary):
+        return {k: v for k, v in dictionary.items() if k in key_set}
+
+    return _f
+
+
+def group_by_keys_nothrow(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None):
+    """Return function over iterator that groups key, value pairs into samples.
+
+    :param keys: function that splits the key into key and extension (base_plus_ext) :param lcase: convert suffixes to
+    lower case (Default value = True)
+    """
+    current_sample = None
+    for filesample in data:
+        assert isinstance(filesample, dict)
+        fname, value = filesample["fname"], filesample["data"]
+        prefix, suffix = keys(fname)
+        if prefix is None:
+            continue
+        if lcase:
+            suffix = suffix.lower()
+        # FIXME webdataset version throws if suffix in current_sample, but we have a potential for
+        #  this happening in the current LAION400m dataset if a tar ends with same prefix as the next
+        #  begins, rare, but can happen since prefix aren't unique across tar files in that dataset
+        if current_sample is None or prefix != current_sample["__key__"] or suffix in current_sample:
+            if valid_sample(current_sample):
+                yield current_sample
+            current_sample = {"__key__": prefix, "__url__": filesample["__url__"]}
+        if suffixes is None or suffix in suffixes:
+            current_sample[suffix] = value
+    if valid_sample(current_sample):
+        yield current_sample
+
+
+def tarfile_to_samples_nothrow(src, handler=wds.warn_and_continue):
+    # NOTE this is a re-impl of the webdataset impl with group_by_keys that doesn't throw
+    streams = url_opener(src, handler=handler)
+    files = tar_file_expander(streams, handler=handler)
+    samples = group_by_keys_nothrow(files, handler=handler)
+    return samples
+
+
+class WebdatasetFilter:
+    def __init__(self, min_size=1024, max_pwatermark=0.5):
+        self.min_size = min_size
+        self.max_pwatermark = max_pwatermark
+
+    def __call__(self, x):
+        try:
+            if "json" in x:
+                x_json = json.loads(x["json"])
+                filter_size = (x_json.get("original_width", 0.0) or 0.0) >= self.min_size and x_json.get(
+                    "original_height", 0
+                ) >= self.min_size
+                filter_watermark = (x_json.get("pwatermark", 1.0) or 1.0) <= self.max_pwatermark
+                return filter_size and filter_watermark
+            else:
+                return False
+        except Exception:
+            return False
+
+
+class SDText2ImageDataset:
+    def __init__(
+        self,
+        train_shards_path_or_url: Union[str, List[str]],
+        num_train_examples: int,
+        per_gpu_batch_size: int,
+        global_batch_size: int,
+        num_workers: int,
+        resolution: int = 512,
+        interpolation_type: str = "bilinear",
+        shuffle_buffer_size: int = 1000,
+        pin_memory: bool = False,
+        persistent_workers: bool = False,
+    ):
+        if not isinstance(train_shards_path_or_url, str):
+            train_shards_path_or_url = [list(braceexpand(urls)) for urls in train_shards_path_or_url]
+            # flatten list using itertools
+            train_shards_path_or_url = list(itertools.chain.from_iterable(train_shards_path_or_url))
+
+        interpolation_mode = resolve_interpolation_mode(interpolation_type)
+
+        def transform(example):
+            # resize image
+            image = example["image"]
+            image = TF.resize(image, resolution, interpolation=interpolation_mode)
+
+            # get crop coordinates and crop image
+            c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(resolution, resolution))
+            image = TF.crop(image, c_top, c_left, resolution, resolution)
+            image = TF.to_tensor(image)
+            image = TF.normalize(image, [0.5], [0.5])
+
+            example["image"] = image
+            return example
+
+        processing_pipeline = [
+            wds.decode("pil", handler=wds.ignore_and_continue),
+            wds.rename(image="jpg;png;jpeg;webp", text="text;txt;caption", handler=wds.warn_and_continue),
+            wds.map(filter_keys({"image", "text"})),
+            wds.map(transform),
+            wds.to_tuple("image", "text"),
+        ]
+
+        # Create train dataset and loader
+        pipeline = [
+            wds.ResampledShards(train_shards_path_or_url),
+            tarfile_to_samples_nothrow,
+            wds.shuffle(shuffle_buffer_size),
+            *processing_pipeline,
+            wds.batched(per_gpu_batch_size, partial=False, collation_fn=default_collate),
+        ]
+
+        num_worker_batches = math.ceil(num_train_examples / (global_batch_size * num_workers))  # per dataloader worker
+        num_batches = num_worker_batches * num_workers
+        num_samples = num_batches * global_batch_size
+
+        # each worker is iterating over this
+        self._train_dataset = wds.DataPipeline(*pipeline).with_epoch(num_worker_batches)
+        self._train_dataloader = wds.WebLoader(
+            self._train_dataset,
+            batch_size=None,
+            shuffle=False,
+            num_workers=num_workers,
+            pin_memory=pin_memory,
+            persistent_workers=persistent_workers,
+        )
+        # add meta-data to dataloader instance for convenience
+        self._train_dataloader.num_batches = num_batches
+        self._train_dataloader.num_samples = num_samples
+
+    @property
+    def train_dataset(self):
+        return self._train_dataset
+
+    @property
+    def train_dataloader(self):
+        return self._train_dataloader
+
+
+def log_validation(vae, unet, args, accelerator, weight_dtype, step):
+    logger.info("Running validation... ")
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
+
+    unet = accelerator.unwrap_model(unet)
+    pipeline = StableDiffusionPipeline.from_pretrained(
+        args.pretrained_teacher_model,
+        vae=vae,
+        scheduler=LCMScheduler.from_pretrained(args.pretrained_teacher_model, subfolder="scheduler"),
+        revision=args.revision,
+        torch_dtype=weight_dtype,
+        safety_checker=None,
+    )
+    pipeline.set_progress_bar_config(disable=True)
+
+    lora_state_dict = get_module_kohya_state_dict(unet, "lora_unet", weight_dtype)
+    pipeline.load_lora_weights(lora_state_dict)
+    pipeline.fuse_lora()
+
+    pipeline = pipeline.to(accelerator.device, dtype=weight_dtype)
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    validation_prompts = [
+        "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography",
+        "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+        "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
+    ]
+
+    image_logs = []
+
+    for _, prompt in enumerate(validation_prompts):
+        images = []
+        with autocast_ctx:
+            images = pipeline(
+                prompt=prompt,
+                num_inference_steps=4,
+                num_images_per_prompt=4,
+                generator=generator,
+                guidance_scale=1.0,
+            ).images
+        image_logs.append({"validation_prompt": prompt, "images": images})
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                formatted_images = []
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                for image in images:
+                    image = wandb.Image(image, caption=validation_prompt)
+                    formatted_images.append(image)
+
+            tracker.log({"validation": formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+        del pipeline
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        return image_logs
+
+
+# From LatentConsistencyModel.get_guidance_scale_embedding
+def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32):
+    """
+    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+    Args:
+        timesteps (`torch.Tensor`):
+            generate embedding vectors at these timesteps
+        embedding_dim (`int`, *optional*, defaults to 512):
+            dimension of the embeddings to generate
+        dtype:
+            data type of the generated embeddings
+
+    Returns:
+        `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+    """
+    assert len(w.shape) == 1
+    w = w * 1000.0
+
+    half_dim = embedding_dim // 2
+    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+    emb = w.to(dtype)[:, None] * emb[None, :]
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+    if embedding_dim % 2 == 1:  # zero pad
+        emb = torch.nn.functional.pad(emb, (0, 1))
+    assert emb.shape == (w.shape[0], embedding_dim)
+    return emb
+
+
+def append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
+    return x[(...,) + (None,) * dims_to_append]
+
+
+# From LCMScheduler.get_scalings_for_boundary_condition_discrete
+def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0):
+    scaled_timestep = timestep_scaling * timestep
+    c_skip = sigma_data**2 / (scaled_timestep**2 + sigma_data**2)
+    c_out = scaled_timestep / (scaled_timestep**2 + sigma_data**2) ** 0.5
+    return c_skip, c_out
+
+
+# Compare LCMScheduler.step, Step 4
+def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_x_0 = (sample - sigmas * model_output) / alphas
+    elif prediction_type == "sample":
+        pred_x_0 = model_output
+    elif prediction_type == "v_prediction":
+        pred_x_0 = alphas * sample - sigmas * model_output
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_x_0
+
+
+# Based on step 4 in DDIMScheduler.step
+def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_epsilon = model_output
+    elif prediction_type == "sample":
+        pred_epsilon = (sample - alphas * model_output) / sigmas
+    elif prediction_type == "v_prediction":
+        pred_epsilon = alphas * model_output + sigmas * sample
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_epsilon
+
+
+def extract_into_tensor(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+class DDIMSolver:
+    def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50):
+        # DDIM sampling parameters
+        step_ratio = timesteps // ddim_timesteps
+        self.ddim_timesteps = (np.arange(1, ddim_timesteps + 1) * step_ratio).round().astype(np.int64) - 1
+        self.ddim_alpha_cumprods = alpha_cumprods[self.ddim_timesteps]
+        self.ddim_alpha_cumprods_prev = np.asarray(
+            [alpha_cumprods[0]] + alpha_cumprods[self.ddim_timesteps[:-1]].tolist()
+        )
+        # convert to torch tensors
+        self.ddim_timesteps = torch.from_numpy(self.ddim_timesteps).long()
+        self.ddim_alpha_cumprods = torch.from_numpy(self.ddim_alpha_cumprods)
+        self.ddim_alpha_cumprods_prev = torch.from_numpy(self.ddim_alpha_cumprods_prev)
+
+    def to(self, device):
+        self.ddim_timesteps = self.ddim_timesteps.to(device)
+        self.ddim_alpha_cumprods = self.ddim_alpha_cumprods.to(device)
+        self.ddim_alpha_cumprods_prev = self.ddim_alpha_cumprods_prev.to(device)
+        return self
+
+    def ddim_step(self, pred_x0, pred_noise, timestep_index):
+        alpha_cumprod_prev = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape)
+        dir_xt = (1.0 - alpha_cumprod_prev).sqrt() * pred_noise
+        x_prev = alpha_cumprod_prev.sqrt() * pred_x0 + dir_xt
+        return x_prev
+
+
+@torch.no_grad()
+def update_ema(target_params, source_params, rate=0.99):
+    """
+    Update target parameters to be closer to those of source parameters using
+    an exponential moving average.
+
+    :param target_params: the target parameter sequence.
+    :param source_params: the source parameter sequence.
+    :param rate: the EMA rate (closer to 1 means slower).
+    """
+    for targ, src in zip(target_params, source_params):
+        targ.detach().mul_(rate).add_(src, alpha=1 - rate)
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    # ----------Model Checkpoint Loading Arguments----------
+    parser.add_argument(
+        "--pretrained_teacher_model",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained LDM teacher model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--teacher_revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained LDM teacher model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained LDM model identifier from huggingface.co/models.",
+    )
+    # ----------Training Arguments----------
+    # ----General Training Arguments----
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lcm-xl-distilled",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    # ----Logging----
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    # ----Checkpointing----
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    # ----Image Processing----
+    parser.add_argument(
+        "--train_shards_path_or_url",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--interpolation_type",
+        type=str,
+        default="bilinear",
+        help=(
+            "The interpolation function used when resizing images to the desired resolution. Choose between `bilinear`,"
+            " `bicubic`, `box`, `nearest`, `nearest_exact`, `hamming`, and `lanczos`."
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    # ----Dataloader----
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    # ----Batch Size and Training Steps----
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    # ----Learning Rate----
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    # ----Optimizer (Adam)----
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    # ----Diffusion Training Arguments----
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    # ----Latent Consistency Distillation (LCD) Specific Arguments----
+    parser.add_argument(
+        "--w_min",
+        type=float,
+        default=5.0,
+        required=False,
+        help=(
+            "The minimum guidance scale value for guidance scale sampling. Note that we are using the Imagen CFG"
+            " formulation rather than the LCM formulation, which means all guidance scales have 1 added to them as"
+            " compared to the original paper."
+        ),
+    )
+    parser.add_argument(
+        "--w_max",
+        type=float,
+        default=15.0,
+        required=False,
+        help=(
+            "The maximum guidance scale value for guidance scale sampling. Note that we are using the Imagen CFG"
+            " formulation rather than the LCM formulation, which means all guidance scales have 1 added to them as"
+            " compared to the original paper."
+        ),
+    )
+    parser.add_argument(
+        "--num_ddim_timesteps",
+        type=int,
+        default=50,
+        help="The number of timesteps to use for DDIM sampling.",
+    )
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber"],
+        help="The type of loss to use for the LCD loss.",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.001,
+        help="The huber loss parameter. Only used if `--loss_type=huber`.",
+    )
+    parser.add_argument(
+        "--lora_rank",
+        type=int,
+        default=64,
+        help="The rank of the LoRA projection matrix.",
+    )
+    parser.add_argument(
+        "--lora_alpha",
+        type=int,
+        default=64,
+        help=(
+            "The value of the LoRA alpha parameter, which controls the scaling factor in front of the LoRA weight"
+            " update delta_W. No scaling will be performed if this value is equal to `lora_rank`."
+        ),
+    )
+    parser.add_argument(
+        "--lora_dropout",
+        type=float,
+        default=0.0,
+        help="The dropout probability for the dropout layer added before applying the LoRA to each layer input.",
+    )
+    parser.add_argument(
+        "--lora_target_modules",
+        type=str,
+        default=None,
+        help=(
+            "A comma-separated string of target module keys to add LoRA to. If not set, a default list of modules will"
+            " be used. By default, LoRA will be applied to all conv and linear layers."
+        ),
+    )
+    parser.add_argument(
+        "--vae_encode_batch_size",
+        type=int,
+        default=32,
+        required=False,
+        help=(
+            "The batch size used when encoding (and decoding) images to latents (and vice versa) using the VAE."
+            " Encoding or decoding the whole batch at once may run into OOM issues."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_scaling_factor",
+        type=float,
+        default=10.0,
+        help=(
+            "The multiplicative timestep scaling factor used when calculating the boundary scalings for LCM. The"
+            " higher the scaling is, the lower the approximation error, but the default value of 10.0 should typically"
+            " suffice."
+        ),
+    )
+    # ----Mixed Precision----
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--cast_teacher_unet",
+        action="store_true",
+        help="Whether to cast the teacher U-Net to the precision specified by `--mixed_precision`.",
+    )
+    # ----Training Optimizations----
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    # ----Distributed Training----
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    # ----------Validation Arguments----------
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=200,
+        help="Run validation every X steps.",
+    )
+    # ----------Huggingface Hub Arguments-----------
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    # ----------Accelerate Arguments----------
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    return args
+
+
+# Adapted from pipelines.StableDiffusionPipeline.encode_prompt
+def encode_prompt(prompt_batch, text_encoder, tokenizer, proportion_empty_prompts, is_train=True):
+    captions = []
+    for caption in prompt_batch:
+        if random.random() < proportion_empty_prompts:
+            captions.append("")
+        elif isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        text_inputs = tokenizer(
+            captions,
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        prompt_embeds = text_encoder(text_input_ids.to(text_encoder.device))[0]
+
+    return prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        split_batches=True,  # It's important to set this to True when using webdataset to get the right number of steps for lr scheduling. If set to False, the number of steps will be devide by the number of processes assuming batches are multiplied by the number of processes
+    )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+                token=args.hub_token,
+                private=True,
+            ).repo_id
+
+    # 1. Create the noise scheduler and the desired noise schedule.
+    noise_scheduler = DDPMScheduler.from_pretrained(
+        args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision
+    )
+
+    # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us
+    alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod)
+    sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod)
+    # Initialize the DDIM ODE solver for distillation.
+    solver = DDIMSolver(
+        noise_scheduler.alphas_cumprod.numpy(),
+        timesteps=noise_scheduler.config.num_train_timesteps,
+        ddim_timesteps=args.num_ddim_timesteps,
+    )
+
+    # 2. Load tokenizers from SD 1.X/2.X checkpoint.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.pretrained_teacher_model, subfolder="tokenizer", revision=args.teacher_revision, use_fast=False
+    )
+
+    # 3. Load text encoders from SD 1.X/2.X checkpoint.
+    # import correct text encoder classes
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_teacher_model, subfolder="text_encoder", revision=args.teacher_revision
+    )
+
+    # 4. Load VAE from SD 1.X/2.X checkpoint
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_teacher_model,
+        subfolder="vae",
+        revision=args.teacher_revision,
+    )
+
+    # 5. Load teacher U-Net from SD 1.X/2.X checkpoint
+    teacher_unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
+    )
+
+    # 6. Freeze teacher vae, text_encoder, and teacher_unet
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    teacher_unet.requires_grad_(False)
+
+    # 7. Create online student U-Net.
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
+    )
+    unet.train()
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if accelerator.unwrap_model(unet).dtype != torch.float32:
+        raise ValueError(
+            f"Controlnet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}"
+        )
+
+    # 8. Add LoRA to the student U-Net, only the LoRA projection matrix will be updated by the optimizer.
+    if args.lora_target_modules is not None:
+        lora_target_modules = [module_key.strip() for module_key in args.lora_target_modules.split(",")]
+    else:
+        lora_target_modules = [
+            "to_q",
+            "to_k",
+            "to_v",
+            "to_out.0",
+            "proj_in",
+            "proj_out",
+            "ff.net.0.proj",
+            "ff.net.2",
+            "conv1",
+            "conv2",
+            "conv_shortcut",
+            "downsamplers.0.conv",
+            "upsamplers.0.conv",
+            "time_emb_proj",
+        ]
+    lora_config = LoraConfig(
+        r=args.lora_rank,
+        target_modules=lora_target_modules,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+    )
+    unet = get_peft_model(unet, lora_config)
+
+    # 9. Handle mixed precision and device placement
+    # For mixed precision training we cast all non-trainable weigths to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    vae.to(accelerator.device)
+    if args.pretrained_vae_model_name_or_path is not None:
+        vae.to(dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # Move teacher_unet to device, optionally cast to weight_dtype
+    teacher_unet.to(accelerator.device)
+    if args.cast_teacher_unet:
+        teacher_unet.to(dtype=weight_dtype)
+
+    # Also move the alpha and sigma noise schedules to accelerator.device.
+    alpha_schedule = alpha_schedule.to(accelerator.device)
+    sigma_schedule = sigma_schedule.to(accelerator.device)
+    # Move the ODE solver to accelerator.device.
+    solver = solver.to(accelerator.device)
+
+    # 10. Handle saving and loading of checkpoints
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                unet_ = accelerator.unwrap_model(unet)
+                lora_state_dict = get_peft_model_state_dict(unet_, adapter_name="default")
+                StableDiffusionPipeline.save_lora_weights(os.path.join(output_dir, "unet_lora"), lora_state_dict)
+                # save weights in peft format to be able to load them back
+                unet_.save_pretrained(output_dir)
+
+                for _, model in enumerate(models):
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            # load the LoRA into the model
+            unet_ = accelerator.unwrap_model(unet)
+            unet_.load_adapter(input_dir, "default", is_trainable=True)
+
+            for _ in range(len(models)):
+                # pop models so that they are not loaded again
+                models.pop()
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # 11. Enable optimizations
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+            teacher_unet.enable_xformers_memory_efficient_attention()
+            # target_unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # 12. Optimizer creation
+    optimizer = optimizer_class(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # 13. Dataset creation and data processing
+    # Here, we compute not just the text embeddings but also the additional embeddings
+    # needed for the SD XL UNet to operate.
+    def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tokenizer, is_train=True):
+        prompt_embeds = encode_prompt(prompt_batch, text_encoder, tokenizer, proportion_empty_prompts, is_train)
+        return {"prompt_embeds": prompt_embeds}
+
+    dataset = SDText2ImageDataset(
+        train_shards_path_or_url=args.train_shards_path_or_url,
+        num_train_examples=args.max_train_samples,
+        per_gpu_batch_size=args.train_batch_size,
+        global_batch_size=args.train_batch_size * accelerator.num_processes,
+        num_workers=args.dataloader_num_workers,
+        resolution=args.resolution,
+        interpolation_type=args.interpolation_type,
+        shuffle_buffer_size=1000,
+        pin_memory=True,
+        persistent_workers=True,
+    )
+    train_dataloader = dataset.train_dataloader
+
+    compute_embeddings_fn = functools.partial(
+        compute_embeddings,
+        proportion_empty_prompts=0,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+    )
+
+    # 14. LR Scheduler creation
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # 15. Prepare for training
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    uncond_input_ids = tokenizer(
+        [""] * args.train_batch_size, return_tensors="pt", padding="max_length", max_length=77
+    ).input_ids.to(accelerator.device)
+    uncond_prompt_embeds = text_encoder(uncond_input_ids)[0]
+
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    # 16. Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num batches each epoch = {train_dataloader.num_batches}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # 1. Load and process the image and text conditioning
+                image, text = batch
+
+                image = image.to(accelerator.device, non_blocking=True)
+                encoded_text = compute_embeddings_fn(text)
+
+                pixel_values = image.to(dtype=weight_dtype)
+                if vae.dtype != weight_dtype:
+                    vae.to(dtype=weight_dtype)
+
+                # encode pixel values with batch size of at most args.vae_encode_batch_size
+                latents = []
+                for i in range(0, pixel_values.shape[0], args.vae_encode_batch_size):
+                    latents.append(vae.encode(pixel_values[i : i + args.vae_encode_batch_size]).latent_dist.sample())
+                latents = torch.cat(latents, dim=0)
+
+                latents = latents * vae.config.scaling_factor
+                latents = latents.to(weight_dtype)
+                bsz = latents.shape[0]
+
+                # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias.
+                # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...]
+                topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps
+                index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long()
+                start_timesteps = solver.ddim_timesteps[index]
+                timesteps = start_timesteps - topk
+                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)
+
+                # 3. Get boundary scalings for start_timesteps and (end) timesteps.
+                c_skip_start, c_out_start = scalings_for_boundary_conditions(
+                    start_timesteps, timestep_scaling=args.timestep_scaling_factor
+                )
+                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
+                c_skip, c_out = scalings_for_boundary_conditions(
+                    timesteps, timestep_scaling=args.timestep_scaling_factor
+                )
+                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]
+
+                # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each
+                # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                noise = torch.randn_like(latents)
+                noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps)
+
+                # 5. Sample a random guidance scale w from U[w_min, w_max]
+                # Note that for LCM-LoRA distillation it is not necessary to use a guidance scale embedding
+                w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min
+                w = w.reshape(bsz, 1, 1, 1)
+                w = w.to(device=latents.device, dtype=latents.dtype)
+
+                # 6. Prepare prompt embeds and unet_added_conditions
+                prompt_embeds = encoded_text.pop("prompt_embeds")
+
+                # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps)
+                noise_pred = unet(
+                    noisy_model_input,
+                    start_timesteps,
+                    timestep_cond=None,
+                    encoder_hidden_states=prompt_embeds.float(),
+                    added_cond_kwargs=encoded_text,
+                ).sample
+
+                pred_x_0 = get_predicted_original_sample(
+                    noise_pred,
+                    start_timesteps,
+                    noisy_model_input,
+                    noise_scheduler.config.prediction_type,
+                    alpha_schedule,
+                    sigma_schedule,
+                )
+
+                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0
+
+                # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the
+                # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these
+                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
+                # solver timestep.
+                with torch.no_grad():
+                    with autocast_ctx:
+                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
+                        cond_teacher_output = teacher_unet(
+                            noisy_model_input.to(weight_dtype),
+                            start_timesteps,
+                            encoder_hidden_states=prompt_embeds.to(weight_dtype),
+                        ).sample
+                        cond_pred_x0 = get_predicted_original_sample(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        cond_pred_noise = get_predicted_noise(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+
+                        # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0
+                        uncond_teacher_output = teacher_unet(
+                            noisy_model_input.to(weight_dtype),
+                            start_timesteps,
+                            encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype),
+                        ).sample
+                        uncond_pred_x0 = get_predicted_original_sample(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        uncond_pred_noise = get_predicted_noise(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+
+                        # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise)
+                        # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation
+                        pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
+                        pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise)
+                        # 4. Run one step of the ODE solver to estimate the next point x_prev on the
+                        # augmented PF-ODE trajectory (solving backward in time)
+                        # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0.
+                        x_prev = solver.ddim_step(pred_x0, pred_noise, index)
+
+                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
+                # Note that we do not use a separate target network for LCM-LoRA distillation.
+                with torch.no_grad():
+                    with autocast_ctx:
+                        target_noise_pred = unet(
+                            x_prev.float(),
+                            timesteps,
+                            timestep_cond=None,
+                            encoder_hidden_states=prompt_embeds.float(),
+                        ).sample
+                    pred_x_0 = get_predicted_original_sample(
+                        target_noise_pred,
+                        timesteps,
+                        x_prev,
+                        noise_scheduler.config.prediction_type,
+                        alpha_schedule,
+                        sigma_schedule,
+                    )
+                    target = c_skip * x_prev + c_out * pred_x_0
+
+                # 10. Calculate loss
+                if args.loss_type == "l2":
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                elif args.loss_type == "huber":
+                    loss = torch.mean(
+                        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
+                    )
+
+                # 11. Backpropagate on the online student model (`unet`)
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=True)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if global_step % args.validation_steps == 0:
+                        log_validation(vae, unet, args, accelerator, weight_dtype, global_step)
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        unet.save_pretrained(args.output_dir)
+        lora_state_dict = get_peft_model_state_dict(unet, adapter_name="default")
+        StableDiffusionPipeline.save_lora_weights(os.path.join(args.output_dir, "unet_lora"), lora_state_dict)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py b/diffusers/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..8090926974c44463d1475c94ac8e0bb055d0d756
--- /dev/null
+++ b/diffusers/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
@@ -0,0 +1,1467 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The LCM team and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import functools
+import gc
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import accelerate
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig, get_peft_model_state_dict, set_peft_model_state_dict
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    LCMScheduler,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import cast_training_params, resolve_interpolation_mode
+from diffusers.utils import (
+    check_min_version,
+    convert_state_dict_to_diffusers,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.import_utils import is_xformers_available
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+class DDIMSolver:
+    def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50):
+        # DDIM sampling parameters
+        step_ratio = timesteps // ddim_timesteps
+
+        self.ddim_timesteps = (np.arange(1, ddim_timesteps + 1) * step_ratio).round().astype(np.int64) - 1
+        self.ddim_alpha_cumprods = alpha_cumprods[self.ddim_timesteps]
+        self.ddim_alpha_cumprods_prev = np.asarray(
+            [alpha_cumprods[0]] + alpha_cumprods[self.ddim_timesteps[:-1]].tolist()
+        )
+        # convert to torch tensors
+        self.ddim_timesteps = torch.from_numpy(self.ddim_timesteps).long()
+        self.ddim_alpha_cumprods = torch.from_numpy(self.ddim_alpha_cumprods)
+        self.ddim_alpha_cumprods_prev = torch.from_numpy(self.ddim_alpha_cumprods_prev)
+
+    def to(self, device):
+        self.ddim_timesteps = self.ddim_timesteps.to(device)
+        self.ddim_alpha_cumprods = self.ddim_alpha_cumprods.to(device)
+        self.ddim_alpha_cumprods_prev = self.ddim_alpha_cumprods_prev.to(device)
+        return self
+
+    def ddim_step(self, pred_x0, pred_noise, timestep_index):
+        alpha_cumprod_prev = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape)
+        dir_xt = (1.0 - alpha_cumprod_prev).sqrt() * pred_noise
+        x_prev = alpha_cumprod_prev.sqrt() * pred_x0 + dir_xt
+        return x_prev
+
+
+def log_validation(vae, args, accelerator, weight_dtype, step, unet=None, is_final_validation=False):
+    logger.info("Running validation... ")
+
+    pipeline = StableDiffusionXLPipeline.from_pretrained(
+        args.pretrained_teacher_model,
+        vae=vae,
+        scheduler=LCMScheduler.from_pretrained(args.pretrained_teacher_model, subfolder="scheduler"),
+        revision=args.revision,
+        torch_dtype=weight_dtype,
+    ).to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    to_load = None
+    if not is_final_validation:
+        if unet is None:
+            raise ValueError("Must provide a `unet` when doing intermediate validation.")
+        unet = accelerator.unwrap_model(unet)
+        state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+        to_load = state_dict
+    else:
+        to_load = args.output_dir
+
+    pipeline.load_lora_weights(to_load)
+    pipeline.fuse_lora()
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    validation_prompts = [
+        "cute sundar pichai character",
+        "robotic cat with wings",
+        "a photo of yoda",
+        "a cute creature with blue eyes",
+    ]
+
+    image_logs = []
+
+    for _, prompt in enumerate(validation_prompts):
+        images = []
+        if torch.backends.mps.is_available():
+            autocast_ctx = nullcontext()
+        else:
+            autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
+
+        with autocast_ctx:
+            images = pipeline(
+                prompt=prompt,
+                num_inference_steps=4,
+                num_images_per_prompt=4,
+                generator=generator,
+                guidance_scale=0.0,
+            ).images
+        image_logs.append({"validation_prompt": prompt, "images": images})
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                formatted_images = []
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                for image in images:
+                    image = wandb.Image(image, caption=validation_prompt)
+                    formatted_images.append(image)
+            logger_name = "test" if is_final_validation else "validation"
+            tracker.log({logger_name: formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+        del pipeline
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        return image_logs
+
+
+def append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
+    return x[(...,) + (None,) * dims_to_append]
+
+
+# From LCMScheduler.get_scalings_for_boundary_condition_discrete
+def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0):
+    scaled_timestep = timestep_scaling * timestep
+    c_skip = sigma_data**2 / (scaled_timestep**2 + sigma_data**2)
+    c_out = scaled_timestep / (scaled_timestep**2 + sigma_data**2) ** 0.5
+    return c_skip, c_out
+
+
+# Compare LCMScheduler.step, Step 4
+def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_x_0 = (sample - sigmas * model_output) / alphas
+    elif prediction_type == "sample":
+        pred_x_0 = model_output
+    elif prediction_type == "v_prediction":
+        pred_x_0 = alphas * sample - sigmas * model_output
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_x_0
+
+
+# Based on step 4 in DDIMScheduler.step
+def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_epsilon = model_output
+    elif prediction_type == "sample":
+        pred_epsilon = (sample - alphas * model_output) / sigmas
+    elif prediction_type == "v_prediction":
+        pred_epsilon = alphas * model_output + sigmas * sample
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_epsilon
+
+
+def extract_into_tensor(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    # ----------Model Checkpoint Loading Arguments----------
+    parser.add_argument(
+        "--pretrained_teacher_model",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained LDM teacher model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--teacher_revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained LDM teacher model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained LDM model identifier from huggingface.co/models.",
+    )
+    # ----------Training Arguments----------
+    # ----General Training Arguments----
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lcm-xl-distilled",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    # ----Logging----
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    # ----Checkpointing----
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    # ----Image Processing----
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--interpolation_type",
+        type=str,
+        default="bilinear",
+        help=(
+            "The interpolation function used when resizing images to the desired resolution. Choose between `bilinear`,"
+            " `bicubic`, `box`, `nearest`, `nearest_exact`, `hamming`, and `lanczos`."
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--encode_batch_size",
+        type=int,
+        default=8,
+        help="Batch size to use for VAE encoding of the images for efficient processing.",
+    )
+    # ----Dataloader----
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    # ----Batch Size and Training Steps----
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    # ----Learning Rate----
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    # ----Optimizer (Adam)----
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    # ----Diffusion Training Arguments----
+    # ----Latent Consistency Distillation (LCD) Specific Arguments----
+    parser.add_argument(
+        "--w_min",
+        type=float,
+        default=3.0,
+        required=False,
+        help=(
+            "The minimum guidance scale value for guidance scale sampling. Note that we are using the Imagen CFG"
+            " formulation rather than the LCM formulation, which means all guidance scales have 1 added to them as"
+            " compared to the original paper."
+        ),
+    )
+    parser.add_argument(
+        "--w_max",
+        type=float,
+        default=15.0,
+        required=False,
+        help=(
+            "The maximum guidance scale value for guidance scale sampling. Note that we are using the Imagen CFG"
+            " formulation rather than the LCM formulation, which means all guidance scales have 1 added to them as"
+            " compared to the original paper."
+        ),
+    )
+    parser.add_argument(
+        "--num_ddim_timesteps",
+        type=int,
+        default=50,
+        help="The number of timesteps to use for DDIM sampling.",
+    )
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber"],
+        help="The type of loss to use for the LCD loss.",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.001,
+        help="The huber loss parameter. Only used if `--loss_type=huber`.",
+    )
+    parser.add_argument(
+        "--lora_rank",
+        type=int,
+        default=64,
+        help="The rank of the LoRA projection matrix.",
+    )
+    parser.add_argument(
+        "--lora_alpha",
+        type=int,
+        default=64,
+        help=(
+            "The value of the LoRA alpha parameter, which controls the scaling factor in front of the LoRA weight"
+            " update delta_W. No scaling will be performed if this value is equal to `lora_rank`."
+        ),
+    )
+    parser.add_argument(
+        "--lora_dropout",
+        type=float,
+        default=0.0,
+        help="The dropout probability for the dropout layer added before applying the LoRA to each layer input.",
+    )
+    parser.add_argument(
+        "--lora_target_modules",
+        type=str,
+        default=None,
+        help=(
+            "A comma-separated string of target module keys to add LoRA to. If not set, a default list of modules will"
+            " be used. By default, LoRA will be applied to all conv and linear layers."
+        ),
+    )
+    parser.add_argument(
+        "--vae_encode_batch_size",
+        type=int,
+        default=8,
+        required=False,
+        help=(
+            "The batch size used when encoding (and decoding) images to latents (and vice versa) using the VAE."
+            " Encoding or decoding the whole batch at once may run into OOM issues."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_scaling_factor",
+        type=float,
+        default=10.0,
+        help=(
+            "The multiplicative timestep scaling factor used when calculating the boundary scalings for LCM. The"
+            " higher the scaling is, the lower the approximation error, but the default value of 10.0 should typically"
+            " suffice."
+        ),
+    )
+    # ----Mixed Precision----
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    # ----Training Optimizations----
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    # ----Distributed Training----
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    # ----------Validation Arguments----------
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=200,
+        help="Run validation every X steps.",
+    )
+    # ----------Huggingface Hub Arguments-----------
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    # ----------Accelerate Arguments----------
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    return args
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(prompt_batch, text_encoders, tokenizers, is_train=True):
+    prompt_embeds_list = []
+
+    captions = []
+    for caption in prompt_batch:
+        if isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                captions,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(
+                text_input_ids.to(text_encoder.device),
+                output_hidden_states=True,
+            )
+
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+            prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        split_batches=True,  # It's important to set this to True when using webdataset to get the right number of steps for lr scheduling. If set to False, the number of steps will be devide by the number of processes assuming batches are multiplied by the number of processes
+    )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+                token=args.hub_token,
+                private=True,
+            ).repo_id
+
+    # 1. Create the noise scheduler and the desired noise schedule.
+    noise_scheduler = DDPMScheduler.from_pretrained(
+        args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision
+    )
+
+    # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us
+    alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod)
+    sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod)
+    # Initialize the DDIM ODE solver for distillation.
+    solver = DDIMSolver(
+        noise_scheduler.alphas_cumprod.numpy(),
+        timesteps=noise_scheduler.config.num_train_timesteps,
+        ddim_timesteps=args.num_ddim_timesteps,
+    )
+
+    # 2. Load tokenizers from SDXL checkpoint.
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_teacher_model, subfolder="tokenizer", revision=args.teacher_revision, use_fast=False
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_teacher_model, subfolder="tokenizer_2", revision=args.teacher_revision, use_fast=False
+    )
+
+    # 3. Load text encoders from SDXL checkpoint.
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_teacher_model, args.teacher_revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_teacher_model, args.teacher_revision, subfolder="text_encoder_2"
+    )
+
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_teacher_model, subfolder="text_encoder", revision=args.teacher_revision
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_teacher_model, subfolder="text_encoder_2", revision=args.teacher_revision
+    )
+
+    # 4. Load VAE from SDXL checkpoint (or more stable VAE)
+    vae_path = (
+        args.pretrained_teacher_model
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.teacher_revision,
+    )
+
+    # 6. Freeze teacher vae, text_encoders.
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+
+    # 7. Create online student U-Net.
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
+    )
+    unet.requires_grad_(False)
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if accelerator.unwrap_model(unet).dtype != torch.float32:
+        raise ValueError(
+            f"Controlnet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}"
+        )
+
+    # 8. Handle mixed precision and device placement
+    # For mixed precision training we cast all non-trainable weigths to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    unet.to(accelerator.device, dtype=weight_dtype)
+    if args.pretrained_vae_model_name_or_path is None:
+        vae.to(accelerator.device, dtype=torch.float32)
+    else:
+        vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    # 9. Add LoRA to the student U-Net, only the LoRA projection matrix will be updated by the optimizer.
+    if args.lora_target_modules is not None:
+        lora_target_modules = [module_key.strip() for module_key in args.lora_target_modules.split(",")]
+    else:
+        lora_target_modules = [
+            "to_q",
+            "to_k",
+            "to_v",
+            "to_out.0",
+            "proj_in",
+            "proj_out",
+            "ff.net.0.proj",
+            "ff.net.2",
+            "conv1",
+            "conv2",
+            "conv_shortcut",
+            "downsamplers.0.conv",
+            "upsamplers.0.conv",
+            "time_emb_proj",
+        ]
+    lora_config = LoraConfig(
+        r=args.lora_rank,
+        target_modules=lora_target_modules,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+    )
+    unet.add_adapter(lora_config)
+
+    # Also move the alpha and sigma noise schedules to accelerator.device.
+    alpha_schedule = alpha_schedule.to(accelerator.device)
+    sigma_schedule = sigma_schedule.to(accelerator.device)
+    solver = solver.to(accelerator.device)
+
+    # 10. Handle saving and loading of checkpoints
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                unet_ = accelerator.unwrap_model(unet)
+                # also save the checkpoints in native `diffusers` format so that it can be easily
+                # be independently loaded via `load_lora_weights()`.
+                state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet_))
+                StableDiffusionXLPipeline.save_lora_weights(output_dir, unet_lora_layers=state_dict)
+
+                for _, model in enumerate(models):
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            # load the LoRA into the model
+            unet_ = accelerator.unwrap_model(unet)
+            lora_state_dict, _ = StableDiffusionXLPipeline.lora_state_dict(input_dir)
+            unet_state_dict = {
+                f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")
+            }
+            unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+            incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+            if incompatible_keys is not None:
+                # check only for unexpected keys
+                unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+                if unexpected_keys:
+                    logger.warning(
+                        f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                        f" {unexpected_keys}. "
+                    )
+
+            for _ in range(len(models)):
+                # pop models so that they are not loaded again
+                models.pop()
+
+            # Make sure the trainable params are in float32. This is again needed since the base models
+            # are in `weight_dtype`. More details:
+            # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+            if args.mixed_precision == "fp16":
+                cast_training_params(unet_, dtype=torch.float32)
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # 11. Enable optimizations
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # 12. Optimizer creation
+    params_to_optimize = filter(lambda p: p.requires_grad, unet.parameters())
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # 13. Dataset creation and data processing
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    column_names = dataset["train"].column_names
+
+    # Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    interpolation_mode = resolve_interpolation_mode(args.interpolation_type)
+    train_resize = transforms.Resize(args.resolution, interpolation=interpolation_mode)
+    train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
+    train_flip = transforms.RandomHorizontalFlip(p=1.0)
+    train_transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        # image aug
+        original_sizes = []
+        all_images = []
+        crop_top_lefts = []
+        for image in images:
+            original_sizes.append((image.height, image.width))
+            image = train_resize(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                x1 = image.width - x1
+                image = train_flip(image)
+            crop_top_left = (y1, x1)
+            crop_top_lefts.append(crop_top_left)
+            image = train_transforms(image)
+            all_images.append(image)
+
+        examples["original_sizes"] = original_sizes
+        examples["crop_top_lefts"] = crop_top_lefts
+        examples["pixel_values"] = all_images
+        examples["captions"] = list(examples[caption_column])
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        original_sizes = [example["original_sizes"] for example in examples]
+        crop_top_lefts = [example["crop_top_lefts"] for example in examples]
+        captions = [example["captions"] for example in examples]
+
+        return {
+            "pixel_values": pixel_values,
+            "captions": captions,
+            "original_sizes": original_sizes,
+            "crop_top_lefts": crop_top_lefts,
+        }
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # 14. Embeddings for the UNet.
+    # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+    def compute_embeddings(prompt_batch, original_sizes, crop_coords, text_encoders, tokenizers, is_train=True):
+        def compute_time_ids(original_size, crops_coords_top_left):
+            target_size = (args.resolution, args.resolution)
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_time_ids = torch.tensor([add_time_ids])
+            add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+            return add_time_ids
+
+        prompt_embeds, pooled_prompt_embeds = encode_prompt(prompt_batch, text_encoders, tokenizers, is_train)
+        add_text_embeds = pooled_prompt_embeds
+
+        add_time_ids = torch.cat([compute_time_ids(s, c) for s, c in zip(original_sizes, crop_coords)])
+
+        prompt_embeds = prompt_embeds.to(accelerator.device)
+        add_text_embeds = add_text_embeds.to(accelerator.device)
+        unet_added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+        return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs}
+
+    text_encoders = [text_encoder_one, text_encoder_two]
+    tokenizers = [tokenizer_one, tokenizer_two]
+
+    compute_embeddings_fn = functools.partial(compute_embeddings, text_encoders=text_encoders, tokenizers=tokenizers)
+
+    # 15. LR Scheduler creation
+    # Scheduler and math around the number of training steps.
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
+    if args.max_train_steps is None:
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * num_update_steps_per_epoch * accelerator.num_processes
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(unet, dtype=torch.float32)
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
+    )
+
+    # 16. Prepare for training
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        if num_training_steps_for_scheduler != args.max_train_steps * accelerator.num_processes:
+            logger.warning(
+                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+                f"This inconsistency may result in the learning rate scheduler not functioning properly."
+            )
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    # 17. Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    unet.train()
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # 1. Load and process the image and text conditioning
+                pixel_values, text, orig_size, crop_coords = (
+                    batch["pixel_values"],
+                    batch["captions"],
+                    batch["original_sizes"],
+                    batch["crop_top_lefts"],
+                )
+
+                encoded_text = compute_embeddings_fn(text, orig_size, crop_coords)
+
+                # encode pixel values with batch size of at most args.vae_encode_batch_size
+                pixel_values = pixel_values.to(dtype=vae.dtype)
+                latents = []
+                for i in range(0, pixel_values.shape[0], args.vae_encode_batch_size):
+                    latents.append(vae.encode(pixel_values[i : i + args.vae_encode_batch_size]).latent_dist.sample())
+                latents = torch.cat(latents, dim=0)
+
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+
+                # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias.
+                # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...]
+                bsz = latents.shape[0]
+                topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps
+                index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long()
+                start_timesteps = solver.ddim_timesteps[index]
+                timesteps = start_timesteps - topk
+                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)
+
+                # 3. Get boundary scalings for start_timesteps and (end) timesteps.
+                c_skip_start, c_out_start = scalings_for_boundary_conditions(
+                    start_timesteps, timestep_scaling=args.timestep_scaling_factor
+                )
+                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
+                c_skip, c_out = scalings_for_boundary_conditions(
+                    timesteps, timestep_scaling=args.timestep_scaling_factor
+                )
+                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]
+
+                # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each
+                # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                noise = torch.randn_like(latents)
+                noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps)
+
+                # 5. Sample a random guidance scale w from U[w_min, w_max]
+                # Note that for LCM-LoRA distillation it is not necessary to use a guidance scale embedding
+                w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min
+                w = w.reshape(bsz, 1, 1, 1)
+                w = w.to(device=latents.device, dtype=latents.dtype)
+
+                # 6. Prepare prompt embeds and unet_added_conditions
+                prompt_embeds = encoded_text.pop("prompt_embeds")
+
+                # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps)
+                noise_pred = unet(
+                    noisy_model_input,
+                    start_timesteps,
+                    encoder_hidden_states=prompt_embeds,
+                    added_cond_kwargs=encoded_text,
+                ).sample
+                pred_x_0 = get_predicted_original_sample(
+                    noise_pred,
+                    start_timesteps,
+                    noisy_model_input,
+                    noise_scheduler.config.prediction_type,
+                    alpha_schedule,
+                    sigma_schedule,
+                )
+                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0
+
+                # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the
+                # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these
+                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
+                # solver timestep.
+
+                # With the adapters disabled, the `unet` is the regular teacher model.
+                accelerator.unwrap_model(unet).disable_adapters()
+                with torch.no_grad():
+                    # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
+                    cond_teacher_output = unet(
+                        noisy_model_input,
+                        start_timesteps,
+                        encoder_hidden_states=prompt_embeds,
+                        added_cond_kwargs={k: v.to(weight_dtype) for k, v in encoded_text.items()},
+                    ).sample
+                    cond_pred_x0 = get_predicted_original_sample(
+                        cond_teacher_output,
+                        start_timesteps,
+                        noisy_model_input,
+                        noise_scheduler.config.prediction_type,
+                        alpha_schedule,
+                        sigma_schedule,
+                    )
+                    cond_pred_noise = get_predicted_noise(
+                        cond_teacher_output,
+                        start_timesteps,
+                        noisy_model_input,
+                        noise_scheduler.config.prediction_type,
+                        alpha_schedule,
+                        sigma_schedule,
+                    )
+
+                    # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0
+                    uncond_prompt_embeds = torch.zeros_like(prompt_embeds)
+                    uncond_pooled_prompt_embeds = torch.zeros_like(encoded_text["text_embeds"])
+                    uncond_added_conditions = copy.deepcopy(encoded_text)
+                    uncond_added_conditions["text_embeds"] = uncond_pooled_prompt_embeds
+                    uncond_teacher_output = unet(
+                        noisy_model_input,
+                        start_timesteps,
+                        encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype),
+                        added_cond_kwargs={k: v.to(weight_dtype) for k, v in uncond_added_conditions.items()},
+                    ).sample
+                    uncond_pred_x0 = get_predicted_original_sample(
+                        uncond_teacher_output,
+                        start_timesteps,
+                        noisy_model_input,
+                        noise_scheduler.config.prediction_type,
+                        alpha_schedule,
+                        sigma_schedule,
+                    )
+                    uncond_pred_noise = get_predicted_noise(
+                        uncond_teacher_output,
+                        start_timesteps,
+                        noisy_model_input,
+                        noise_scheduler.config.prediction_type,
+                        alpha_schedule,
+                        sigma_schedule,
+                    )
+
+                    # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise)
+                    # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation
+                    pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
+                    pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise)
+                    # 4. Run one step of the ODE solver to estimate the next point x_prev on the
+                    # augmented PF-ODE trajectory (solving backward in time)
+                    # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0.
+                    x_prev = solver.ddim_step(pred_x0, pred_noise, index).to(unet.dtype)
+
+                # re-enable unet adapters to turn the `unet` into a student unet.
+                accelerator.unwrap_model(unet).enable_adapters()
+
+                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
+                # Note that we do not use a separate target network for LCM-LoRA distillation.
+                with torch.no_grad():
+                    target_noise_pred = unet(
+                        x_prev,
+                        timesteps,
+                        encoder_hidden_states=prompt_embeds,
+                        added_cond_kwargs={k: v.to(weight_dtype) for k, v in encoded_text.items()},
+                    ).sample
+                    pred_x_0 = get_predicted_original_sample(
+                        target_noise_pred,
+                        timesteps,
+                        x_prev,
+                        noise_scheduler.config.prediction_type,
+                        alpha_schedule,
+                        sigma_schedule,
+                    )
+                    target = c_skip * x_prev + c_out * pred_x_0
+
+                # 10. Calculate loss
+                if args.loss_type == "l2":
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                elif args.loss_type == "huber":
+                    loss = torch.mean(
+                        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
+                    )
+
+                # 11. Backpropagate on the online student model (`unet`) (only LoRA)
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=True)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if global_step % args.validation_steps == 0:
+                        log_validation(
+                            vae, args, accelerator, weight_dtype, global_step, unet=unet, is_final_validation=False
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+        StableDiffusionXLPipeline.save_lora_weights(args.output_dir, unet_lora_layers=unet_lora_state_dict)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+        del unet
+        torch.cuda.empty_cache()
+
+        # Final inference.
+        if args.validation_steps is not None:
+            log_validation(vae, args, accelerator, weight_dtype, step=global_step, unet=None, is_final_validation=True)
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py b/diffusers/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa7e7f1febee8a57c0fc6bb767deb72366df0281
--- /dev/null
+++ b/diffusers/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py
@@ -0,0 +1,1530 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import functools
+import gc
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+from typing import List, Union
+
+import accelerate
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import torchvision.transforms.functional as TF
+import transformers
+import webdataset as wds
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from braceexpand import braceexpand
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig, get_peft_model, get_peft_model_state_dict
+from torch.utils.data import default_collate
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+from webdataset.tariterators import (
+    base_plus_ext,
+    tar_file_expander,
+    url_opener,
+    valid_sample,
+)
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    LCMScheduler,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import resolve_interpolation_mode
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+MAX_SEQ_LENGTH = 77
+
+# Adjust for your dataset
+WDS_JSON_WIDTH = "width"  # original_width for LAION
+WDS_JSON_HEIGHT = "height"  # original_height for LAION
+MIN_SIZE = 700  # ~960 for LAION, ideal: 1024 if the dataset contains large images
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def get_module_kohya_state_dict(module, prefix: str, dtype: torch.dtype, adapter_name: str = "default"):
+    kohya_ss_state_dict = {}
+    for peft_key, weight in get_peft_model_state_dict(module, adapter_name=adapter_name).items():
+        kohya_key = peft_key.replace("base_model.model", prefix)
+        kohya_key = kohya_key.replace("lora_A", "lora_down")
+        kohya_key = kohya_key.replace("lora_B", "lora_up")
+        kohya_key = kohya_key.replace(".", "_", kohya_key.count(".") - 2)
+        kohya_ss_state_dict[kohya_key] = weight.to(dtype)
+
+        # Set alpha parameter
+        if "lora_down" in kohya_key:
+            alpha_key = f'{kohya_key.split(".")[0]}.alpha'
+            kohya_ss_state_dict[alpha_key] = torch.tensor(module.peft_config[adapter_name].lora_alpha).to(dtype)
+
+    return kohya_ss_state_dict
+
+
+def filter_keys(key_set):
+    def _f(dictionary):
+        return {k: v for k, v in dictionary.items() if k in key_set}
+
+    return _f
+
+
+def group_by_keys_nothrow(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None):
+    """Return function over iterator that groups key, value pairs into samples.
+
+    :param keys: function that splits the key into key and extension (base_plus_ext) :param lcase: convert suffixes to
+    lower case (Default value = True)
+    """
+    current_sample = None
+    for filesample in data:
+        assert isinstance(filesample, dict)
+        fname, value = filesample["fname"], filesample["data"]
+        prefix, suffix = keys(fname)
+        if prefix is None:
+            continue
+        if lcase:
+            suffix = suffix.lower()
+        # FIXME webdataset version throws if suffix in current_sample, but we have a potential for
+        #  this happening in the current LAION400m dataset if a tar ends with same prefix as the next
+        #  begins, rare, but can happen since prefix aren't unique across tar files in that dataset
+        if current_sample is None or prefix != current_sample["__key__"] or suffix in current_sample:
+            if valid_sample(current_sample):
+                yield current_sample
+            current_sample = {"__key__": prefix, "__url__": filesample["__url__"]}
+        if suffixes is None or suffix in suffixes:
+            current_sample[suffix] = value
+    if valid_sample(current_sample):
+        yield current_sample
+
+
+def tarfile_to_samples_nothrow(src, handler=wds.warn_and_continue):
+    # NOTE this is a re-impl of the webdataset impl with group_by_keys that doesn't throw
+    streams = url_opener(src, handler=handler)
+    files = tar_file_expander(streams, handler=handler)
+    samples = group_by_keys_nothrow(files, handler=handler)
+    return samples
+
+
+class WebdatasetFilter:
+    def __init__(self, min_size=1024, max_pwatermark=0.5):
+        self.min_size = min_size
+        self.max_pwatermark = max_pwatermark
+
+    def __call__(self, x):
+        try:
+            if "json" in x:
+                x_json = json.loads(x["json"])
+                filter_size = (x_json.get(WDS_JSON_WIDTH, 0.0) or 0.0) >= self.min_size and x_json.get(
+                    WDS_JSON_HEIGHT, 0
+                ) >= self.min_size
+                filter_watermark = (x_json.get("pwatermark", 0.0) or 0.0) <= self.max_pwatermark
+                return filter_size and filter_watermark
+            else:
+                return False
+        except Exception:
+            return False
+
+
+class SDXLText2ImageDataset:
+    def __init__(
+        self,
+        train_shards_path_or_url: Union[str, List[str]],
+        num_train_examples: int,
+        per_gpu_batch_size: int,
+        global_batch_size: int,
+        num_workers: int,
+        resolution: int = 1024,
+        interpolation_type: str = "bilinear",
+        shuffle_buffer_size: int = 1000,
+        pin_memory: bool = False,
+        persistent_workers: bool = False,
+        use_fix_crop_and_size: bool = False,
+    ):
+        if not isinstance(train_shards_path_or_url, str):
+            train_shards_path_or_url = [list(braceexpand(urls)) for urls in train_shards_path_or_url]
+            # flatten list using itertools
+            train_shards_path_or_url = list(itertools.chain.from_iterable(train_shards_path_or_url))
+
+        def get_orig_size(json):
+            if use_fix_crop_and_size:
+                return (resolution, resolution)
+            else:
+                return (int(json.get(WDS_JSON_WIDTH, 0.0)), int(json.get(WDS_JSON_HEIGHT, 0.0)))
+
+        interpolation_mode = resolve_interpolation_mode(interpolation_type)
+
+        def transform(example):
+            # resize image
+            image = example["image"]
+            image = TF.resize(image, resolution, interpolation=interpolation_mode)
+
+            # get crop coordinates and crop image
+            c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(resolution, resolution))
+            image = TF.crop(image, c_top, c_left, resolution, resolution)
+            image = TF.to_tensor(image)
+            image = TF.normalize(image, [0.5], [0.5])
+
+            example["image"] = image
+            example["crop_coords"] = (c_top, c_left) if not use_fix_crop_and_size else (0, 0)
+            return example
+
+        processing_pipeline = [
+            wds.decode("pil", handler=wds.ignore_and_continue),
+            wds.rename(
+                image="jpg;png;jpeg;webp", text="text;txt;caption", orig_size="json", handler=wds.warn_and_continue
+            ),
+            wds.map(filter_keys({"image", "text", "orig_size"})),
+            wds.map_dict(orig_size=get_orig_size),
+            wds.map(transform),
+            wds.to_tuple("image", "text", "orig_size", "crop_coords"),
+        ]
+
+        # Create train dataset and loader
+        pipeline = [
+            wds.ResampledShards(train_shards_path_or_url),
+            tarfile_to_samples_nothrow,
+            wds.select(WebdatasetFilter(min_size=MIN_SIZE)),
+            wds.shuffle(shuffle_buffer_size),
+            *processing_pipeline,
+            wds.batched(per_gpu_batch_size, partial=False, collation_fn=default_collate),
+        ]
+
+        num_worker_batches = math.ceil(num_train_examples / (global_batch_size * num_workers))  # per dataloader worker
+        num_batches = num_worker_batches * num_workers
+        num_samples = num_batches * global_batch_size
+
+        # each worker is iterating over this
+        self._train_dataset = wds.DataPipeline(*pipeline).with_epoch(num_worker_batches)
+        self._train_dataloader = wds.WebLoader(
+            self._train_dataset,
+            batch_size=None,
+            shuffle=False,
+            num_workers=num_workers,
+            pin_memory=pin_memory,
+            persistent_workers=persistent_workers,
+        )
+        # add meta-data to dataloader instance for convenience
+        self._train_dataloader.num_batches = num_batches
+        self._train_dataloader.num_samples = num_samples
+
+    @property
+    def train_dataset(self):
+        return self._train_dataset
+
+    @property
+    def train_dataloader(self):
+        return self._train_dataloader
+
+
+def log_validation(vae, unet, args, accelerator, weight_dtype, step):
+    logger.info("Running validation... ")
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
+
+    unet = accelerator.unwrap_model(unet)
+    pipeline = StableDiffusionXLPipeline.from_pretrained(
+        args.pretrained_teacher_model,
+        vae=vae,
+        scheduler=LCMScheduler.from_pretrained(args.pretrained_teacher_model, subfolder="scheduler"),
+        revision=args.revision,
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    lora_state_dict = get_module_kohya_state_dict(unet, "lora_unet", weight_dtype)
+    pipeline.load_lora_weights(lora_state_dict)
+    pipeline.fuse_lora()
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    validation_prompts = [
+        "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography",
+        "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+        "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
+    ]
+
+    image_logs = []
+
+    for _, prompt in enumerate(validation_prompts):
+        images = []
+        with autocast_ctx:
+            images = pipeline(
+                prompt=prompt,
+                num_inference_steps=4,
+                num_images_per_prompt=4,
+                generator=generator,
+                guidance_scale=0.0,
+            ).images
+        image_logs.append({"validation_prompt": prompt, "images": images})
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                formatted_images = []
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                for image in images:
+                    image = wandb.Image(image, caption=validation_prompt)
+                    formatted_images.append(image)
+
+            tracker.log({"validation": formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+        del pipeline
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        return image_logs
+
+
+def append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
+    return x[(...,) + (None,) * dims_to_append]
+
+
+# From LCMScheduler.get_scalings_for_boundary_condition_discrete
+def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0):
+    scaled_timestep = timestep_scaling * timestep
+    c_skip = sigma_data**2 / (scaled_timestep**2 + sigma_data**2)
+    c_out = scaled_timestep / (scaled_timestep**2 + sigma_data**2) ** 0.5
+    return c_skip, c_out
+
+
+# Compare LCMScheduler.step, Step 4
+def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_x_0 = (sample - sigmas * model_output) / alphas
+    elif prediction_type == "sample":
+        pred_x_0 = model_output
+    elif prediction_type == "v_prediction":
+        pred_x_0 = alphas * sample - sigmas * model_output
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_x_0
+
+
+# Based on step 4 in DDIMScheduler.step
+def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_epsilon = model_output
+    elif prediction_type == "sample":
+        pred_epsilon = (sample - alphas * model_output) / sigmas
+    elif prediction_type == "v_prediction":
+        pred_epsilon = alphas * model_output + sigmas * sample
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_epsilon
+
+
+def extract_into_tensor(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+class DDIMSolver:
+    def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50):
+        # DDIM sampling parameters
+        step_ratio = timesteps // ddim_timesteps
+
+        self.ddim_timesteps = (np.arange(1, ddim_timesteps + 1) * step_ratio).round().astype(np.int64) - 1
+        self.ddim_alpha_cumprods = alpha_cumprods[self.ddim_timesteps]
+        self.ddim_alpha_cumprods_prev = np.asarray(
+            [alpha_cumprods[0]] + alpha_cumprods[self.ddim_timesteps[:-1]].tolist()
+        )
+        # convert to torch tensors
+        self.ddim_timesteps = torch.from_numpy(self.ddim_timesteps).long()
+        self.ddim_alpha_cumprods = torch.from_numpy(self.ddim_alpha_cumprods)
+        self.ddim_alpha_cumprods_prev = torch.from_numpy(self.ddim_alpha_cumprods_prev)
+
+    def to(self, device):
+        self.ddim_timesteps = self.ddim_timesteps.to(device)
+        self.ddim_alpha_cumprods = self.ddim_alpha_cumprods.to(device)
+        self.ddim_alpha_cumprods_prev = self.ddim_alpha_cumprods_prev.to(device)
+        return self
+
+    def ddim_step(self, pred_x0, pred_noise, timestep_index):
+        alpha_cumprod_prev = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape)
+        dir_xt = (1.0 - alpha_cumprod_prev).sqrt() * pred_noise
+        x_prev = alpha_cumprod_prev.sqrt() * pred_x0 + dir_xt
+        return x_prev
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    # ----------Model Checkpoint Loading Arguments----------
+    parser.add_argument(
+        "--pretrained_teacher_model",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained LDM teacher model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--teacher_revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained LDM teacher model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained LDM model identifier from huggingface.co/models.",
+    )
+    # ----------Training Arguments----------
+    # ----General Training Arguments----
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lcm-xl-distilled",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    # ----Logging----
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    # ----Checkpointing----
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    # ----Image Processing----
+    parser.add_argument(
+        "--train_shards_path_or_url",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--interpolation_type",
+        type=str,
+        default="bilinear",
+        help=(
+            "The interpolation function used when resizing images to the desired resolution. Choose between `bilinear`,"
+            " `bicubic`, `box`, `nearest`, `nearest_exact`, `hamming`, and `lanczos`."
+        ),
+    )
+    parser.add_argument(
+        "--use_fix_crop_and_size",
+        action="store_true",
+        help="Whether or not to use the fixed crop and size for the teacher model.",
+        default=False,
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    # ----Dataloader----
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    # ----Batch Size and Training Steps----
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    # ----Learning Rate----
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    # ----Optimizer (Adam)----
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    # ----Diffusion Training Arguments----
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    # ----Latent Consistency Distillation (LCD) Specific Arguments----
+    parser.add_argument(
+        "--w_min",
+        type=float,
+        default=3.0,
+        required=False,
+        help=(
+            "The minimum guidance scale value for guidance scale sampling. Note that we are using the Imagen CFG"
+            " formulation rather than the LCM formulation, which means all guidance scales have 1 added to them as"
+            " compared to the original paper."
+        ),
+    )
+    parser.add_argument(
+        "--w_max",
+        type=float,
+        default=15.0,
+        required=False,
+        help=(
+            "The maximum guidance scale value for guidance scale sampling. Note that we are using the Imagen CFG"
+            " formulation rather than the LCM formulation, which means all guidance scales have 1 added to them as"
+            " compared to the original paper."
+        ),
+    )
+    parser.add_argument(
+        "--num_ddim_timesteps",
+        type=int,
+        default=50,
+        help="The number of timesteps to use for DDIM sampling.",
+    )
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber"],
+        help="The type of loss to use for the LCD loss.",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.001,
+        help="The huber loss parameter. Only used if `--loss_type=huber`.",
+    )
+    parser.add_argument(
+        "--lora_rank",
+        type=int,
+        default=64,
+        help="The rank of the LoRA projection matrix.",
+    )
+    parser.add_argument(
+        "--lora_alpha",
+        type=int,
+        default=64,
+        help=(
+            "The value of the LoRA alpha parameter, which controls the scaling factor in front of the LoRA weight"
+            " update delta_W. No scaling will be performed if this value is equal to `lora_rank`."
+        ),
+    )
+    parser.add_argument(
+        "--lora_dropout",
+        type=float,
+        default=0.0,
+        help="The dropout probability for the dropout layer added before applying the LoRA to each layer input.",
+    )
+    parser.add_argument(
+        "--lora_target_modules",
+        type=str,
+        default=None,
+        help=(
+            "A comma-separated string of target module keys to add LoRA to. If not set, a default list of modules will"
+            " be used. By default, LoRA will be applied to all conv and linear layers."
+        ),
+    )
+    parser.add_argument(
+        "--vae_encode_batch_size",
+        type=int,
+        default=8,
+        required=False,
+        help=(
+            "The batch size used when encoding (and decoding) images to latents (and vice versa) using the VAE."
+            " Encoding or decoding the whole batch at once may run into OOM issues."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_scaling_factor",
+        type=float,
+        default=10.0,
+        help=(
+            "The multiplicative timestep scaling factor used when calculating the boundary scalings for LCM. The"
+            " higher the scaling is, the lower the approximation error, but the default value of 10.0 should typically"
+            " suffice."
+        ),
+    )
+    # ----Mixed Precision----
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--cast_teacher_unet",
+        action="store_true",
+        help="Whether to cast the teacher U-Net to the precision specified by `--mixed_precision`.",
+    )
+    # ----Training Optimizations----
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    # ----Distributed Training----
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    # ----------Validation Arguments----------
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=200,
+        help="Run validation every X steps.",
+    )
+    # ----------Huggingface Hub Arguments-----------
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    # ----------Accelerate Arguments----------
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    return args
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train=True):
+    prompt_embeds_list = []
+
+    captions = []
+    for caption in prompt_batch:
+        if random.random() < proportion_empty_prompts:
+            captions.append("")
+        elif isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                captions,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(
+                text_input_ids.to(text_encoder.device),
+                output_hidden_states=True,
+            )
+
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+            prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        split_batches=True,  # It's important to set this to True when using webdataset to get the right number of steps for lr scheduling. If set to False, the number of steps will be devide by the number of processes assuming batches are multiplied by the number of processes
+    )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+                token=args.hub_token,
+                private=True,
+            ).repo_id
+
+    # 1. Create the noise scheduler and the desired noise schedule.
+    noise_scheduler = DDPMScheduler.from_pretrained(
+        args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision
+    )
+
+    # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us
+    alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod)
+    sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod)
+    # Initialize the DDIM ODE solver for distillation.
+    solver = DDIMSolver(
+        noise_scheduler.alphas_cumprod.numpy(),
+        timesteps=noise_scheduler.config.num_train_timesteps,
+        ddim_timesteps=args.num_ddim_timesteps,
+    )
+
+    # 2. Load tokenizers from SD-XL checkpoint.
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_teacher_model, subfolder="tokenizer", revision=args.teacher_revision, use_fast=False
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_teacher_model, subfolder="tokenizer_2", revision=args.teacher_revision, use_fast=False
+    )
+
+    # 3. Load text encoders from SD-XL checkpoint.
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_teacher_model, args.teacher_revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_teacher_model, args.teacher_revision, subfolder="text_encoder_2"
+    )
+
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_teacher_model, subfolder="text_encoder", revision=args.teacher_revision
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_teacher_model, subfolder="text_encoder_2", revision=args.teacher_revision
+    )
+
+    # 4. Load VAE from SD-XL checkpoint (or more stable VAE)
+    vae_path = (
+        args.pretrained_teacher_model
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.teacher_revision,
+    )
+
+    # 5. Load teacher U-Net from SD-XL checkpoint
+    teacher_unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
+    )
+
+    # 6. Freeze teacher vae, text_encoders, and teacher_unet
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    teacher_unet.requires_grad_(False)
+
+    # 7. Create online student U-Net.
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
+    )
+    unet.train()
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if accelerator.unwrap_model(unet).dtype != torch.float32:
+        raise ValueError(
+            f"Controlnet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}"
+        )
+
+    # 8. Add LoRA to the student U-Net, only the LoRA projection matrix will be updated by the optimizer.
+    if args.lora_target_modules is not None:
+        lora_target_modules = [module_key.strip() for module_key in args.lora_target_modules.split(",")]
+    else:
+        lora_target_modules = [
+            "to_q",
+            "to_k",
+            "to_v",
+            "to_out.0",
+            "proj_in",
+            "proj_out",
+            "ff.net.0.proj",
+            "ff.net.2",
+            "conv1",
+            "conv2",
+            "conv_shortcut",
+            "downsamplers.0.conv",
+            "upsamplers.0.conv",
+            "time_emb_proj",
+        ]
+    lora_config = LoraConfig(
+        r=args.lora_rank,
+        target_modules=lora_target_modules,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+    )
+    unet = get_peft_model(unet, lora_config)
+
+    # 9. Handle mixed precision and device placement
+    # For mixed precision training we cast all non-trainable weigths to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    vae.to(accelerator.device)
+    if args.pretrained_vae_model_name_or_path is not None:
+        vae.to(dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    # Move teacher_unet to device, optionally cast to weight_dtype
+    teacher_unet.to(accelerator.device)
+    if args.cast_teacher_unet:
+        teacher_unet.to(dtype=weight_dtype)
+
+    # Also move the alpha and sigma noise schedules to accelerator.device.
+    alpha_schedule = alpha_schedule.to(accelerator.device)
+    sigma_schedule = sigma_schedule.to(accelerator.device)
+    # Move the ODE solver to accelerator.device.
+    solver = solver.to(accelerator.device)
+
+    # 10. Handle saving and loading of checkpoints
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                unet_ = accelerator.unwrap_model(unet)
+                lora_state_dict = get_peft_model_state_dict(unet_, adapter_name="default")
+                StableDiffusionXLPipeline.save_lora_weights(os.path.join(output_dir, "unet_lora"), lora_state_dict)
+                # save weights in peft format to be able to load them back
+                unet_.save_pretrained(output_dir)
+
+                for _, model in enumerate(models):
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            # load the LoRA into the model
+            unet_ = accelerator.unwrap_model(unet)
+            unet_.load_adapter(input_dir, "default", is_trainable=True)
+
+            for _ in range(len(models)):
+                # pop models so that they are not loaded again
+                models.pop()
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # 11. Enable optimizations
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+            teacher_unet.enable_xformers_memory_efficient_attention()
+            # target_unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # 12. Optimizer creation
+    optimizer = optimizer_class(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # 13. Dataset creation and data processing
+    # Here, we compute not just the text embeddings but also the additional embeddings
+    # needed for the SD XL UNet to operate.
+    def compute_embeddings(
+        prompt_batch, original_sizes, crop_coords, proportion_empty_prompts, text_encoders, tokenizers, is_train=True
+    ):
+        target_size = (args.resolution, args.resolution)
+        original_sizes = list(map(list, zip(*original_sizes)))
+        crops_coords_top_left = list(map(list, zip(*crop_coords)))
+
+        original_sizes = torch.tensor(original_sizes, dtype=torch.long)
+        crops_coords_top_left = torch.tensor(crops_coords_top_left, dtype=torch.long)
+
+        prompt_embeds, pooled_prompt_embeds = encode_prompt(
+            prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train
+        )
+        add_text_embeds = pooled_prompt_embeds
+
+        # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+        add_time_ids = list(target_size)
+        add_time_ids = torch.tensor([add_time_ids])
+        add_time_ids = add_time_ids.repeat(len(prompt_batch), 1)
+        add_time_ids = torch.cat([original_sizes, crops_coords_top_left, add_time_ids], dim=-1)
+        add_time_ids = add_time_ids.to(accelerator.device, dtype=prompt_embeds.dtype)
+
+        prompt_embeds = prompt_embeds.to(accelerator.device)
+        add_text_embeds = add_text_embeds.to(accelerator.device)
+        unet_added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+        return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs}
+
+    dataset = SDXLText2ImageDataset(
+        train_shards_path_or_url=args.train_shards_path_or_url,
+        num_train_examples=args.max_train_samples,
+        per_gpu_batch_size=args.train_batch_size,
+        global_batch_size=args.train_batch_size * accelerator.num_processes,
+        num_workers=args.dataloader_num_workers,
+        resolution=args.resolution,
+        interpolation_type=args.interpolation_type,
+        shuffle_buffer_size=1000,
+        pin_memory=True,
+        persistent_workers=True,
+        use_fix_crop_and_size=args.use_fix_crop_and_size,
+    )
+    train_dataloader = dataset.train_dataloader
+
+    # Let's first compute all the embeddings so that we can free up the text encoders
+    # from memory.
+    text_encoders = [text_encoder_one, text_encoder_two]
+    tokenizers = [tokenizer_one, tokenizer_two]
+
+    compute_embeddings_fn = functools.partial(
+        compute_embeddings,
+        proportion_empty_prompts=0,
+        text_encoders=text_encoders,
+        tokenizers=tokenizers,
+    )
+
+    # 14. LR Scheduler creation
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # 15. Prepare for training
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    # Create uncond embeds for classifier free guidance
+    uncond_prompt_embeds = torch.zeros(args.train_batch_size, 77, 2048).to(accelerator.device)
+    uncond_pooled_prompt_embeds = torch.zeros(args.train_batch_size, 1280).to(accelerator.device)
+
+    # 16. Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num batches each epoch = {train_dataloader.num_batches}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # 1. Load and process the image, text, and micro-conditioning (original image size, crop coordinates)
+                image, text, orig_size, crop_coords = batch
+
+                image = image.to(accelerator.device, non_blocking=True)
+                encoded_text = compute_embeddings_fn(text, orig_size, crop_coords)
+
+                if args.pretrained_vae_model_name_or_path is not None:
+                    pixel_values = image.to(dtype=weight_dtype)
+                    if vae.dtype != weight_dtype:
+                        vae.to(dtype=weight_dtype)
+                else:
+                    pixel_values = image
+
+                # encode pixel values with batch size of at most args.vae_encode_batch_size
+                latents = []
+                for i in range(0, pixel_values.shape[0], args.vae_encode_batch_size):
+                    latents.append(vae.encode(pixel_values[i : i + args.vae_encode_batch_size]).latent_dist.sample())
+                latents = torch.cat(latents, dim=0)
+
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+                bsz = latents.shape[0]
+
+                # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias.
+                # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...]
+                topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps
+                index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long()
+                start_timesteps = solver.ddim_timesteps[index]
+                timesteps = start_timesteps - topk
+                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)
+
+                # 3. Get boundary scalings for start_timesteps and (end) timesteps.
+                c_skip_start, c_out_start = scalings_for_boundary_conditions(
+                    start_timesteps, timestep_scaling=args.timestep_scaling_factor
+                )
+                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
+                c_skip, c_out = scalings_for_boundary_conditions(
+                    timesteps, timestep_scaling=args.timestep_scaling_factor
+                )
+                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]
+
+                # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each
+                # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                noise = torch.randn_like(latents)
+                noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps)
+
+                # 5. Sample a random guidance scale w from U[w_min, w_max]
+                # Note that for LCM-LoRA distillation it is not necessary to use a guidance scale embedding
+                w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min
+                w = w.reshape(bsz, 1, 1, 1)
+                w = w.to(device=latents.device, dtype=latents.dtype)
+
+                # 6. Prepare prompt embeds and unet_added_conditions
+                prompt_embeds = encoded_text.pop("prompt_embeds")
+
+                # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps)
+                noise_pred = unet(
+                    noisy_model_input,
+                    start_timesteps,
+                    timestep_cond=None,
+                    encoder_hidden_states=prompt_embeds.float(),
+                    added_cond_kwargs=encoded_text,
+                ).sample
+
+                pred_x_0 = get_predicted_original_sample(
+                    noise_pred,
+                    start_timesteps,
+                    noisy_model_input,
+                    noise_scheduler.config.prediction_type,
+                    alpha_schedule,
+                    sigma_schedule,
+                )
+
+                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0
+
+                # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the
+                # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these
+                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
+                # solver timestep.
+                with torch.no_grad():
+                    if torch.backends.mps.is_available() or "playground" in args.pretrained_teacher_model:
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type)
+
+                    with autocast_ctx:
+                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
+                        cond_teacher_output = teacher_unet(
+                            noisy_model_input.to(weight_dtype),
+                            start_timesteps,
+                            encoder_hidden_states=prompt_embeds.to(weight_dtype),
+                            added_cond_kwargs={k: v.to(weight_dtype) for k, v in encoded_text.items()},
+                        ).sample
+                        cond_pred_x0 = get_predicted_original_sample(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        cond_pred_noise = get_predicted_noise(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+
+                        # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0
+                        uncond_added_conditions = copy.deepcopy(encoded_text)
+                        uncond_added_conditions["text_embeds"] = uncond_pooled_prompt_embeds
+                        uncond_teacher_output = teacher_unet(
+                            noisy_model_input.to(weight_dtype),
+                            start_timesteps,
+                            encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype),
+                            added_cond_kwargs={k: v.to(weight_dtype) for k, v in uncond_added_conditions.items()},
+                        ).sample
+                        uncond_pred_x0 = get_predicted_original_sample(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        uncond_pred_noise = get_predicted_noise(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+
+                        # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise)
+                        # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation
+                        pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
+                        pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise)
+                        # 4. Run one step of the ODE solver to estimate the next point x_prev on the
+                        # augmented PF-ODE trajectory (solving backward in time)
+                        # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0.
+                        x_prev = solver.ddim_step(pred_x0, pred_noise, index)
+
+                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
+                # Note that we do not use a separate target network for LCM-LoRA distillation.
+                with torch.no_grad():
+                    if torch.backends.mps.is_available():
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
+
+                    with autocast_ctx:
+                        target_noise_pred = unet(
+                            x_prev.float(),
+                            timesteps,
+                            timestep_cond=None,
+                            encoder_hidden_states=prompt_embeds.float(),
+                            added_cond_kwargs=encoded_text,
+                        ).sample
+                    pred_x_0 = get_predicted_original_sample(
+                        target_noise_pred,
+                        timesteps,
+                        x_prev,
+                        noise_scheduler.config.prediction_type,
+                        alpha_schedule,
+                        sigma_schedule,
+                    )
+                    target = c_skip * x_prev + c_out * pred_x_0
+
+                # 10. Calculate loss
+                if args.loss_type == "l2":
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                elif args.loss_type == "huber":
+                    loss = torch.mean(
+                        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
+                    )
+
+                # 11. Backpropagate on the online student model (`unet`)
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=True)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if global_step % args.validation_steps == 0:
+                        log_validation(vae, unet, args, accelerator, weight_dtype, global_step)
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        unet.save_pretrained(args.output_dir)
+        lora_state_dict = get_peft_model_state_dict(unet, adapter_name="default")
+        StableDiffusionXLPipeline.save_lora_weights(os.path.join(args.output_dir, "unet_lora"), lora_state_dict)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/consistency_distillation/train_lcm_distill_sd_wds.py b/diffusers/examples/consistency_distillation/train_lcm_distill_sd_wds.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d7db09a361a59516bbe319c67731c345091f4d
--- /dev/null
+++ b/diffusers/examples/consistency_distillation/train_lcm_distill_sd_wds.py
@@ -0,0 +1,1433 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import functools
+import gc
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+from typing import List, Union
+
+import accelerate
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import torchvision.transforms.functional as TF
+import transformers
+import webdataset as wds
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from braceexpand import braceexpand
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torch.utils.data import default_collate
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, CLIPTextModel, PretrainedConfig
+from webdataset.tariterators import (
+    base_plus_ext,
+    tar_file_expander,
+    url_opener,
+    valid_sample,
+)
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    LCMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import resolve_interpolation_mode
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+MAX_SEQ_LENGTH = 77
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def filter_keys(key_set):
+    def _f(dictionary):
+        return {k: v for k, v in dictionary.items() if k in key_set}
+
+    return _f
+
+
+def group_by_keys_nothrow(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None):
+    """Return function over iterator that groups key, value pairs into samples.
+
+    :param keys: function that splits the key into key and extension (base_plus_ext) :param lcase: convert suffixes to
+    lower case (Default value = True)
+    """
+    current_sample = None
+    for filesample in data:
+        assert isinstance(filesample, dict)
+        fname, value = filesample["fname"], filesample["data"]
+        prefix, suffix = keys(fname)
+        if prefix is None:
+            continue
+        if lcase:
+            suffix = suffix.lower()
+        # FIXME webdataset version throws if suffix in current_sample, but we have a potential for
+        #  this happening in the current LAION400m dataset if a tar ends with same prefix as the next
+        #  begins, rare, but can happen since prefix aren't unique across tar files in that dataset
+        if current_sample is None or prefix != current_sample["__key__"] or suffix in current_sample:
+            if valid_sample(current_sample):
+                yield current_sample
+            current_sample = {"__key__": prefix, "__url__": filesample["__url__"]}
+        if suffixes is None or suffix in suffixes:
+            current_sample[suffix] = value
+    if valid_sample(current_sample):
+        yield current_sample
+
+
+def tarfile_to_samples_nothrow(src, handler=wds.warn_and_continue):
+    # NOTE this is a re-impl of the webdataset impl with group_by_keys that doesn't throw
+    streams = url_opener(src, handler=handler)
+    files = tar_file_expander(streams, handler=handler)
+    samples = group_by_keys_nothrow(files, handler=handler)
+    return samples
+
+
+class WebdatasetFilter:
+    def __init__(self, min_size=1024, max_pwatermark=0.5):
+        self.min_size = min_size
+        self.max_pwatermark = max_pwatermark
+
+    def __call__(self, x):
+        try:
+            if "json" in x:
+                x_json = json.loads(x["json"])
+                filter_size = (x_json.get("original_width", 0.0) or 0.0) >= self.min_size and x_json.get(
+                    "original_height", 0
+                ) >= self.min_size
+                filter_watermark = (x_json.get("pwatermark", 1.0) or 1.0) <= self.max_pwatermark
+                return filter_size and filter_watermark
+            else:
+                return False
+        except Exception:
+            return False
+
+
+class SDText2ImageDataset:
+    def __init__(
+        self,
+        train_shards_path_or_url: Union[str, List[str]],
+        num_train_examples: int,
+        per_gpu_batch_size: int,
+        global_batch_size: int,
+        num_workers: int,
+        resolution: int = 512,
+        interpolation_type: str = "bilinear",
+        shuffle_buffer_size: int = 1000,
+        pin_memory: bool = False,
+        persistent_workers: bool = False,
+    ):
+        if not isinstance(train_shards_path_or_url, str):
+            train_shards_path_or_url = [list(braceexpand(urls)) for urls in train_shards_path_or_url]
+            # flatten list using itertools
+            train_shards_path_or_url = list(itertools.chain.from_iterable(train_shards_path_or_url))
+
+        interpolation_mode = resolve_interpolation_mode(interpolation_type)
+
+        def transform(example):
+            # resize image
+            image = example["image"]
+            image = TF.resize(image, resolution, interpolation=interpolation_mode)
+
+            # get crop coordinates and crop image
+            c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(resolution, resolution))
+            image = TF.crop(image, c_top, c_left, resolution, resolution)
+            image = TF.to_tensor(image)
+            image = TF.normalize(image, [0.5], [0.5])
+
+            example["image"] = image
+            return example
+
+        processing_pipeline = [
+            wds.decode("pil", handler=wds.ignore_and_continue),
+            wds.rename(image="jpg;png;jpeg;webp", text="text;txt;caption", handler=wds.warn_and_continue),
+            wds.map(filter_keys({"image", "text"})),
+            wds.map(transform),
+            wds.to_tuple("image", "text"),
+        ]
+
+        # Create train dataset and loader
+        pipeline = [
+            wds.ResampledShards(train_shards_path_or_url),
+            tarfile_to_samples_nothrow,
+            wds.shuffle(shuffle_buffer_size),
+            *processing_pipeline,
+            wds.batched(per_gpu_batch_size, partial=False, collation_fn=default_collate),
+        ]
+
+        num_worker_batches = math.ceil(num_train_examples / (global_batch_size * num_workers))  # per dataloader worker
+        num_batches = num_worker_batches * num_workers
+        num_samples = num_batches * global_batch_size
+
+        # each worker is iterating over this
+        self._train_dataset = wds.DataPipeline(*pipeline).with_epoch(num_worker_batches)
+        self._train_dataloader = wds.WebLoader(
+            self._train_dataset,
+            batch_size=None,
+            shuffle=False,
+            num_workers=num_workers,
+            pin_memory=pin_memory,
+            persistent_workers=persistent_workers,
+        )
+        # add meta-data to dataloader instance for convenience
+        self._train_dataloader.num_batches = num_batches
+        self._train_dataloader.num_samples = num_samples
+
+    @property
+    def train_dataset(self):
+        return self._train_dataset
+
+    @property
+    def train_dataloader(self):
+        return self._train_dataloader
+
+
+def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="target"):
+    logger.info("Running validation... ")
+
+    unet = accelerator.unwrap_model(unet)
+    pipeline = StableDiffusionPipeline.from_pretrained(
+        args.pretrained_teacher_model,
+        vae=vae,
+        unet=unet,
+        scheduler=LCMScheduler.from_pretrained(args.pretrained_teacher_model, subfolder="scheduler"),
+        revision=args.revision,
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    validation_prompts = [
+        "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography",
+        "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+        "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
+    ]
+
+    image_logs = []
+
+    for _, prompt in enumerate(validation_prompts):
+        images = []
+        if torch.backends.mps.is_available():
+            autocast_ctx = nullcontext()
+        else:
+            autocast_ctx = torch.autocast(accelerator.device.type)
+
+        with autocast_ctx:
+            images = pipeline(
+                prompt=prompt,
+                num_inference_steps=4,
+                num_images_per_prompt=4,
+                generator=generator,
+            ).images
+        image_logs.append({"validation_prompt": prompt, "images": images})
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                formatted_images = []
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                for image in images:
+                    image = wandb.Image(image, caption=validation_prompt)
+                    formatted_images.append(image)
+
+            tracker.log({f"validation/{name}": formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+        del pipeline
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        return image_logs
+
+
+# From LatentConsistencyModel.get_guidance_scale_embedding
+def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32):
+    """
+    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+    Args:
+        timesteps (`torch.Tensor`):
+            generate embedding vectors at these timesteps
+        embedding_dim (`int`, *optional*, defaults to 512):
+            dimension of the embeddings to generate
+        dtype:
+            data type of the generated embeddings
+
+    Returns:
+        `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+    """
+    assert len(w.shape) == 1
+    w = w * 1000.0
+
+    half_dim = embedding_dim // 2
+    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+    emb = w.to(dtype)[:, None] * emb[None, :]
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+    if embedding_dim % 2 == 1:  # zero pad
+        emb = torch.nn.functional.pad(emb, (0, 1))
+    assert emb.shape == (w.shape[0], embedding_dim)
+    return emb
+
+
+def append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
+    return x[(...,) + (None,) * dims_to_append]
+
+
+# From LCMScheduler.get_scalings_for_boundary_condition_discrete
+def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0):
+    scaled_timestep = timestep_scaling * timestep
+    c_skip = sigma_data**2 / (scaled_timestep**2 + sigma_data**2)
+    c_out = scaled_timestep / (scaled_timestep**2 + sigma_data**2) ** 0.5
+    return c_skip, c_out
+
+
+# Compare LCMScheduler.step, Step 4
+def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_x_0 = (sample - sigmas * model_output) / alphas
+    elif prediction_type == "sample":
+        pred_x_0 = model_output
+    elif prediction_type == "v_prediction":
+        pred_x_0 = alphas * sample - sigmas * model_output
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_x_0
+
+
+# Based on step 4 in DDIMScheduler.step
+def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_epsilon = model_output
+    elif prediction_type == "sample":
+        pred_epsilon = (sample - alphas * model_output) / sigmas
+    elif prediction_type == "v_prediction":
+        pred_epsilon = alphas * model_output + sigmas * sample
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_epsilon
+
+
+def extract_into_tensor(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+class DDIMSolver:
+    def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50):
+        # DDIM sampling parameters
+        step_ratio = timesteps // ddim_timesteps
+        self.ddim_timesteps = (np.arange(1, ddim_timesteps + 1) * step_ratio).round().astype(np.int64) - 1
+        self.ddim_alpha_cumprods = alpha_cumprods[self.ddim_timesteps]
+        self.ddim_alpha_cumprods_prev = np.asarray(
+            [alpha_cumprods[0]] + alpha_cumprods[self.ddim_timesteps[:-1]].tolist()
+        )
+        # convert to torch tensors
+        self.ddim_timesteps = torch.from_numpy(self.ddim_timesteps).long()
+        self.ddim_alpha_cumprods = torch.from_numpy(self.ddim_alpha_cumprods)
+        self.ddim_alpha_cumprods_prev = torch.from_numpy(self.ddim_alpha_cumprods_prev)
+
+    def to(self, device):
+        self.ddim_timesteps = self.ddim_timesteps.to(device)
+        self.ddim_alpha_cumprods = self.ddim_alpha_cumprods.to(device)
+        self.ddim_alpha_cumprods_prev = self.ddim_alpha_cumprods_prev.to(device)
+        return self
+
+    def ddim_step(self, pred_x0, pred_noise, timestep_index):
+        alpha_cumprod_prev = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape)
+        dir_xt = (1.0 - alpha_cumprod_prev).sqrt() * pred_noise
+        x_prev = alpha_cumprod_prev.sqrt() * pred_x0 + dir_xt
+        return x_prev
+
+
+@torch.no_grad()
+def update_ema(target_params, source_params, rate=0.99):
+    """
+    Update target parameters to be closer to those of source parameters using
+    an exponential moving average.
+
+    :param target_params: the target parameter sequence.
+    :param source_params: the source parameter sequence.
+    :param rate: the EMA rate (closer to 1 means slower).
+    """
+    for targ, src in zip(target_params, source_params):
+        targ.detach().mul_(rate).add_(src, alpha=1 - rate)
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    # ----------Model Checkpoint Loading Arguments----------
+    parser.add_argument(
+        "--pretrained_teacher_model",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained LDM teacher model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--teacher_revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained LDM teacher model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained LDM model identifier from huggingface.co/models.",
+    )
+    # ----------Training Arguments----------
+    # ----General Training Arguments----
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lcm-xl-distilled",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    # ----Logging----
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    # ----Checkpointing----
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    # ----Image Processing----
+    parser.add_argument(
+        "--train_shards_path_or_url",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--interpolation_type",
+        type=str,
+        default="bilinear",
+        help=(
+            "The interpolation function used when resizing images to the desired resolution. Choose between `bilinear`,"
+            " `bicubic`, `box`, `nearest`, `nearest_exact`, `hamming`, and `lanczos`."
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    # ----Dataloader----
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    # ----Batch Size and Training Steps----
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    # ----Learning Rate----
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    # ----Optimizer (Adam)----
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    # ----Diffusion Training Arguments----
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    # ----Latent Consistency Distillation (LCD) Specific Arguments----
+    parser.add_argument(
+        "--w_min",
+        type=float,
+        default=5.0,
+        required=False,
+        help=(
+            "The minimum guidance scale value for guidance scale sampling. Note that we are using the Imagen CFG"
+            " formulation rather than the LCM formulation, which means all guidance scales have 1 added to them as"
+            " compared to the original paper."
+        ),
+    )
+    parser.add_argument(
+        "--w_max",
+        type=float,
+        default=15.0,
+        required=False,
+        help=(
+            "The maximum guidance scale value for guidance scale sampling. Note that we are using the Imagen CFG"
+            " formulation rather than the LCM formulation, which means all guidance scales have 1 added to them as"
+            " compared to the original paper."
+        ),
+    )
+    parser.add_argument(
+        "--num_ddim_timesteps",
+        type=int,
+        default=50,
+        help="The number of timesteps to use for DDIM sampling.",
+    )
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber"],
+        help="The type of loss to use for the LCD loss.",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.001,
+        help="The huber loss parameter. Only used if `--loss_type=huber`.",
+    )
+    parser.add_argument(
+        "--unet_time_cond_proj_dim",
+        type=int,
+        default=256,
+        help=(
+            "The dimension of the guidance scale embedding in the U-Net, which will be used if the teacher U-Net"
+            " does not have `time_cond_proj_dim` set."
+        ),
+    )
+    parser.add_argument(
+        "--vae_encode_batch_size",
+        type=int,
+        default=32,
+        required=False,
+        help=(
+            "The batch size used when encoding (and decoding) images to latents (and vice versa) using the VAE."
+            " Encoding or decoding the whole batch at once may run into OOM issues."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_scaling_factor",
+        type=float,
+        default=10.0,
+        help=(
+            "The multiplicative timestep scaling factor used when calculating the boundary scalings for LCM. The"
+            " higher the scaling is, the lower the approximation error, but the default value of 10.0 should typically"
+            " suffice."
+        ),
+    )
+    # ----Exponential Moving Average (EMA)----
+    parser.add_argument(
+        "--ema_decay",
+        type=float,
+        default=0.95,
+        required=False,
+        help="The exponential moving average (EMA) rate or decay factor.",
+    )
+    # ----Mixed Precision----
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--cast_teacher_unet",
+        action="store_true",
+        help="Whether to cast the teacher U-Net to the precision specified by `--mixed_precision`.",
+    )
+    # ----Training Optimizations----
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    # ----Distributed Training----
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    # ----------Validation Arguments----------
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=200,
+        help="Run validation every X steps.",
+    )
+    # ----------Huggingface Hub Arguments-----------
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    # ----------Accelerate Arguments----------
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    return args
+
+
+# Adapted from pipelines.StableDiffusionPipeline.encode_prompt
+def encode_prompt(prompt_batch, text_encoder, tokenizer, proportion_empty_prompts, is_train=True):
+    captions = []
+    for caption in prompt_batch:
+        if random.random() < proportion_empty_prompts:
+            captions.append("")
+        elif isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        text_inputs = tokenizer(
+            captions,
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        prompt_embeds = text_encoder(text_input_ids.to(text_encoder.device))[0]
+
+    return prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        split_batches=True,  # It's important to set this to True when using webdataset to get the right number of steps for lr scheduling. If set to False, the number of steps will be devide by the number of processes assuming batches are multiplied by the number of processes
+    )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+                token=args.hub_token,
+                private=True,
+            ).repo_id
+
+    # 1. Create the noise scheduler and the desired noise schedule.
+    noise_scheduler = DDPMScheduler.from_pretrained(
+        args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision
+    )
+
+    # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us
+    alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod)
+    sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod)
+    # Initialize the DDIM ODE solver for distillation.
+    solver = DDIMSolver(
+        noise_scheduler.alphas_cumprod.numpy(),
+        timesteps=noise_scheduler.config.num_train_timesteps,
+        ddim_timesteps=args.num_ddim_timesteps,
+    )
+
+    # 2. Load tokenizers from SD 1.X/2.X checkpoint.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.pretrained_teacher_model, subfolder="tokenizer", revision=args.teacher_revision, use_fast=False
+    )
+
+    # 3. Load text encoders from SD 1.X/2.X checkpoint.
+    # import correct text encoder classes
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_teacher_model, subfolder="text_encoder", revision=args.teacher_revision
+    )
+
+    # 4. Load VAE from SD 1.X/2.X checkpoint
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_teacher_model,
+        subfolder="vae",
+        revision=args.teacher_revision,
+    )
+
+    # 5. Load teacher U-Net from SD 1.X/2.X checkpoint
+    teacher_unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
+    )
+
+    # 6. Freeze teacher vae, text_encoder, and teacher_unet
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    teacher_unet.requires_grad_(False)
+
+    # 7. Create online student U-Net. This will be updated by the optimizer (e.g. via backpropagation.)
+    # Add `time_cond_proj_dim` to the student U-Net if `teacher_unet.config.time_cond_proj_dim` is None
+    time_cond_proj_dim = (
+        teacher_unet.config.time_cond_proj_dim
+        if teacher_unet.config.time_cond_proj_dim is not None
+        else args.unet_time_cond_proj_dim
+    )
+    unet = UNet2DConditionModel.from_config(teacher_unet.config, time_cond_proj_dim=time_cond_proj_dim)
+    # load teacher_unet weights into unet
+    unet.load_state_dict(teacher_unet.state_dict(), strict=False)
+    unet.train()
+
+    # 8. Create target student U-Net. This will be updated via EMA updates (polyak averaging).
+    # Initialize from (online) unet
+    target_unet = UNet2DConditionModel.from_config(unet.config)
+    target_unet.load_state_dict(unet.state_dict())
+    target_unet.train()
+    target_unet.requires_grad_(False)
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if accelerator.unwrap_model(unet).dtype != torch.float32:
+        raise ValueError(
+            f"Controlnet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}"
+        )
+
+    # 9. Handle mixed precision and device placement
+    # For mixed precision training we cast all non-trainable weigths to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    vae.to(accelerator.device)
+    if args.pretrained_vae_model_name_or_path is not None:
+        vae.to(dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # Move teacher_unet to device, optionally cast to weight_dtype
+    target_unet.to(accelerator.device)
+    teacher_unet.to(accelerator.device)
+    if args.cast_teacher_unet:
+        teacher_unet.to(dtype=weight_dtype)
+
+    # Also move the alpha and sigma noise schedules to accelerator.device.
+    alpha_schedule = alpha_schedule.to(accelerator.device)
+    sigma_schedule = sigma_schedule.to(accelerator.device)
+    solver = solver.to(accelerator.device)
+
+    # 10. Handle saving and loading of checkpoints
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                target_unet.save_pretrained(os.path.join(output_dir, "unet_target"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            load_model = UNet2DConditionModel.from_pretrained(os.path.join(input_dir, "unet_target"))
+            target_unet.load_state_dict(load_model.state_dict())
+            target_unet.to(accelerator.device)
+            del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # 11. Enable optimizations
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+            teacher_unet.enable_xformers_memory_efficient_attention()
+            target_unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # 12. Optimizer creation
+    optimizer = optimizer_class(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # 13. Dataset creation and data processing
+    # Here, we compute not just the text embeddings but also the additional embeddings
+    # needed for the SD XL UNet to operate.
+    def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tokenizer, is_train=True):
+        prompt_embeds = encode_prompt(prompt_batch, text_encoder, tokenizer, proportion_empty_prompts, is_train)
+        return {"prompt_embeds": prompt_embeds}
+
+    dataset = SDText2ImageDataset(
+        train_shards_path_or_url=args.train_shards_path_or_url,
+        num_train_examples=args.max_train_samples,
+        per_gpu_batch_size=args.train_batch_size,
+        global_batch_size=args.train_batch_size * accelerator.num_processes,
+        num_workers=args.dataloader_num_workers,
+        resolution=args.resolution,
+        interpolation_type=args.interpolation_type,
+        shuffle_buffer_size=1000,
+        pin_memory=True,
+        persistent_workers=True,
+    )
+    train_dataloader = dataset.train_dataloader
+
+    compute_embeddings_fn = functools.partial(
+        compute_embeddings,
+        proportion_empty_prompts=0,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+    )
+
+    # 14. LR Scheduler creation
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # 15. Prepare for training
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    uncond_input_ids = tokenizer(
+        [""] * args.train_batch_size, return_tensors="pt", padding="max_length", max_length=77
+    ).input_ids.to(accelerator.device)
+    uncond_prompt_embeds = text_encoder(uncond_input_ids)[0]
+
+    # 16. Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num batches each epoch = {train_dataloader.num_batches}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # 1. Load and process the image and text conditioning
+                image, text = batch
+
+                image = image.to(accelerator.device, non_blocking=True)
+                encoded_text = compute_embeddings_fn(text)
+
+                pixel_values = image.to(dtype=weight_dtype)
+                if vae.dtype != weight_dtype:
+                    vae.to(dtype=weight_dtype)
+
+                # encode pixel values with batch size of at most args.vae_encode_batch_size
+                latents = []
+                for i in range(0, pixel_values.shape[0], args.vae_encode_batch_size):
+                    latents.append(vae.encode(pixel_values[i : i + args.vae_encode_batch_size]).latent_dist.sample())
+                latents = torch.cat(latents, dim=0)
+
+                latents = latents * vae.config.scaling_factor
+                latents = latents.to(weight_dtype)
+                bsz = latents.shape[0]
+
+                # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias.
+                # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...]
+                topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps
+                index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long()
+                start_timesteps = solver.ddim_timesteps[index]
+                timesteps = start_timesteps - topk
+                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)
+
+                # 3. Get boundary scalings for start_timesteps and (end) timesteps.
+                c_skip_start, c_out_start = scalings_for_boundary_conditions(
+                    start_timesteps, timestep_scaling=args.timestep_scaling_factor
+                )
+                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
+                c_skip, c_out = scalings_for_boundary_conditions(
+                    timesteps, timestep_scaling=args.timestep_scaling_factor
+                )
+                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]
+
+                # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each
+                # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                noise = torch.randn_like(latents)
+                noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps)
+
+                # 5. Sample a random guidance scale w from U[w_min, w_max] and embed it
+                w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min
+                w_embedding = guidance_scale_embedding(w, embedding_dim=time_cond_proj_dim)
+                w = w.reshape(bsz, 1, 1, 1)
+                # Move to U-Net device and dtype
+                w = w.to(device=latents.device, dtype=latents.dtype)
+                w_embedding = w_embedding.to(device=latents.device, dtype=latents.dtype)
+
+                # 6. Prepare prompt embeds and unet_added_conditions
+                prompt_embeds = encoded_text.pop("prompt_embeds")
+
+                # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps)
+                noise_pred = unet(
+                    noisy_model_input,
+                    start_timesteps,
+                    timestep_cond=w_embedding,
+                    encoder_hidden_states=prompt_embeds.float(),
+                    added_cond_kwargs=encoded_text,
+                ).sample
+
+                pred_x_0 = get_predicted_original_sample(
+                    noise_pred,
+                    start_timesteps,
+                    noisy_model_input,
+                    noise_scheduler.config.prediction_type,
+                    alpha_schedule,
+                    sigma_schedule,
+                )
+
+                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0
+
+                # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the
+                # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these
+                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
+                # solver timestep.
+                with torch.no_grad():
+                    if torch.backends.mps.is_available():
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type)
+
+                    with autocast_ctx:
+                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
+                        cond_teacher_output = teacher_unet(
+                            noisy_model_input.to(weight_dtype),
+                            start_timesteps,
+                            encoder_hidden_states=prompt_embeds.to(weight_dtype),
+                        ).sample
+                        cond_pred_x0 = get_predicted_original_sample(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        cond_pred_noise = get_predicted_noise(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+
+                        # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0
+                        uncond_teacher_output = teacher_unet(
+                            noisy_model_input.to(weight_dtype),
+                            start_timesteps,
+                            encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype),
+                        ).sample
+                        uncond_pred_x0 = get_predicted_original_sample(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        uncond_pred_noise = get_predicted_noise(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+
+                        # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise)
+                        # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation
+                        pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
+                        pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise)
+                        # 4. Run one step of the ODE solver to estimate the next point x_prev on the
+                        # augmented PF-ODE trajectory (solving backward in time)
+                        # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0.
+                        x_prev = solver.ddim_step(pred_x0, pred_noise, index)
+
+                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
+                with torch.no_grad():
+                    if torch.backends.mps.is_available():
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
+
+                    with autocast_ctx:
+                        target_noise_pred = target_unet(
+                            x_prev.float(),
+                            timesteps,
+                            timestep_cond=w_embedding,
+                            encoder_hidden_states=prompt_embeds.float(),
+                        ).sample
+                    pred_x_0 = get_predicted_original_sample(
+                        target_noise_pred,
+                        timesteps,
+                        x_prev,
+                        noise_scheduler.config.prediction_type,
+                        alpha_schedule,
+                        sigma_schedule,
+                    )
+                    target = c_skip * x_prev + c_out * pred_x_0
+
+                # 10. Calculate loss
+                if args.loss_type == "l2":
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                elif args.loss_type == "huber":
+                    loss = torch.mean(
+                        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
+                    )
+
+                # 11. Backpropagate on the online student model (`unet`)
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=True)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                # 12. Make EMA update to target student model parameters (`target_unet`)
+                update_ema(target_unet.parameters(), unet.parameters(), args.ema_decay)
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if global_step % args.validation_steps == 0:
+                        log_validation(vae, target_unet, args, accelerator, weight_dtype, global_step, "target")
+                        log_validation(vae, unet, args, accelerator, weight_dtype, global_step, "online")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        unet.save_pretrained(os.path.join(args.output_dir, "unet"))
+
+        target_unet = accelerator.unwrap_model(target_unet)
+        target_unet.save_pretrained(os.path.join(args.output_dir, "unet_target"))
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py b/diffusers/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc5e6812127e1cb1101f31ef3016ee8e8718aaa1
--- /dev/null
+++ b/diffusers/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py
@@ -0,0 +1,1536 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import functools
+import gc
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+from typing import List, Union
+
+import accelerate
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import torchvision.transforms.functional as TF
+import transformers
+import webdataset as wds
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from braceexpand import braceexpand
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torch.utils.data import default_collate
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+from webdataset.tariterators import (
+    base_plus_ext,
+    tar_file_expander,
+    url_opener,
+    valid_sample,
+)
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    LCMScheduler,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import resolve_interpolation_mode
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+MAX_SEQ_LENGTH = 77
+
+# Adjust for your dataset
+WDS_JSON_WIDTH = "width"  # original_width for LAION
+WDS_JSON_HEIGHT = "height"  # original_height for LAION
+MIN_SIZE = 700  # ~960 for LAION, ideal: 1024 if the dataset contains large images
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def filter_keys(key_set):
+    def _f(dictionary):
+        return {k: v for k, v in dictionary.items() if k in key_set}
+
+    return _f
+
+
+def group_by_keys_nothrow(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None):
+    """Return function over iterator that groups key, value pairs into samples.
+
+    :param keys: function that splits the key into key and extension (base_plus_ext) :param lcase: convert suffixes to
+    lower case (Default value = True)
+    """
+    current_sample = None
+    for filesample in data:
+        assert isinstance(filesample, dict)
+        fname, value = filesample["fname"], filesample["data"]
+        prefix, suffix = keys(fname)
+        if prefix is None:
+            continue
+        if lcase:
+            suffix = suffix.lower()
+        # FIXME webdataset version throws if suffix in current_sample, but we have a potential for
+        #  this happening in the current LAION400m dataset if a tar ends with same prefix as the next
+        #  begins, rare, but can happen since prefix aren't unique across tar files in that dataset
+        if current_sample is None or prefix != current_sample["__key__"] or suffix in current_sample:
+            if valid_sample(current_sample):
+                yield current_sample
+            current_sample = {"__key__": prefix, "__url__": filesample["__url__"]}
+        if suffixes is None or suffix in suffixes:
+            current_sample[suffix] = value
+    if valid_sample(current_sample):
+        yield current_sample
+
+
+def tarfile_to_samples_nothrow(src, handler=wds.warn_and_continue):
+    # NOTE this is a re-impl of the webdataset impl with group_by_keys that doesn't throw
+    streams = url_opener(src, handler=handler)
+    files = tar_file_expander(streams, handler=handler)
+    samples = group_by_keys_nothrow(files, handler=handler)
+    return samples
+
+
+class WebdatasetFilter:
+    def __init__(self, min_size=1024, max_pwatermark=0.5):
+        self.min_size = min_size
+        self.max_pwatermark = max_pwatermark
+
+    def __call__(self, x):
+        try:
+            if "json" in x:
+                x_json = json.loads(x["json"])
+                filter_size = (x_json.get(WDS_JSON_WIDTH, 0.0) or 0.0) >= self.min_size and x_json.get(
+                    WDS_JSON_HEIGHT, 0
+                ) >= self.min_size
+                filter_watermark = (x_json.get("pwatermark", 0.0) or 0.0) <= self.max_pwatermark
+                return filter_size and filter_watermark
+            else:
+                return False
+        except Exception:
+            return False
+
+
+class SDXLText2ImageDataset:
+    def __init__(
+        self,
+        train_shards_path_or_url: Union[str, List[str]],
+        num_train_examples: int,
+        per_gpu_batch_size: int,
+        global_batch_size: int,
+        num_workers: int,
+        resolution: int = 1024,
+        interpolation_type: str = "bilinear",
+        shuffle_buffer_size: int = 1000,
+        pin_memory: bool = False,
+        persistent_workers: bool = False,
+        use_fix_crop_and_size: bool = False,
+    ):
+        if not isinstance(train_shards_path_or_url, str):
+            train_shards_path_or_url = [list(braceexpand(urls)) for urls in train_shards_path_or_url]
+            # flatten list using itertools
+            train_shards_path_or_url = list(itertools.chain.from_iterable(train_shards_path_or_url))
+
+        def get_orig_size(json):
+            if use_fix_crop_and_size:
+                return (resolution, resolution)
+            else:
+                return (int(json.get(WDS_JSON_WIDTH, 0.0)), int(json.get(WDS_JSON_HEIGHT, 0.0)))
+
+        interpolation_mode = resolve_interpolation_mode(interpolation_type)
+
+        def transform(example):
+            # resize image
+            image = example["image"]
+            image = TF.resize(image, resolution, interpolation=interpolation_mode)
+
+            # get crop coordinates and crop image
+            c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(resolution, resolution))
+            image = TF.crop(image, c_top, c_left, resolution, resolution)
+            image = TF.to_tensor(image)
+            image = TF.normalize(image, [0.5], [0.5])
+
+            example["image"] = image
+            example["crop_coords"] = (c_top, c_left) if not use_fix_crop_and_size else (0, 0)
+            return example
+
+        processing_pipeline = [
+            wds.decode("pil", handler=wds.ignore_and_continue),
+            wds.rename(
+                image="jpg;png;jpeg;webp", text="text;txt;caption", orig_size="json", handler=wds.warn_and_continue
+            ),
+            wds.map(filter_keys({"image", "text", "orig_size"})),
+            wds.map_dict(orig_size=get_orig_size),
+            wds.map(transform),
+            wds.to_tuple("image", "text", "orig_size", "crop_coords"),
+        ]
+
+        # Create train dataset and loader
+        pipeline = [
+            wds.ResampledShards(train_shards_path_or_url),
+            tarfile_to_samples_nothrow,
+            wds.select(WebdatasetFilter(min_size=MIN_SIZE)),
+            wds.shuffle(shuffle_buffer_size),
+            *processing_pipeline,
+            wds.batched(per_gpu_batch_size, partial=False, collation_fn=default_collate),
+        ]
+
+        num_worker_batches = math.ceil(num_train_examples / (global_batch_size * num_workers))  # per dataloader worker
+        num_batches = num_worker_batches * num_workers
+        num_samples = num_batches * global_batch_size
+
+        # each worker is iterating over this
+        self._train_dataset = wds.DataPipeline(*pipeline).with_epoch(num_worker_batches)
+        self._train_dataloader = wds.WebLoader(
+            self._train_dataset,
+            batch_size=None,
+            shuffle=False,
+            num_workers=num_workers,
+            pin_memory=pin_memory,
+            persistent_workers=persistent_workers,
+        )
+        # add meta-data to dataloader instance for convenience
+        self._train_dataloader.num_batches = num_batches
+        self._train_dataloader.num_samples = num_samples
+
+    @property
+    def train_dataset(self):
+        return self._train_dataset
+
+    @property
+    def train_dataloader(self):
+        return self._train_dataloader
+
+
+def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="target"):
+    logger.info("Running validation... ")
+
+    unet = accelerator.unwrap_model(unet)
+    pipeline = StableDiffusionXLPipeline.from_pretrained(
+        args.pretrained_teacher_model,
+        vae=vae,
+        unet=unet,
+        scheduler=LCMScheduler.from_pretrained(args.pretrained_teacher_model, subfolder="scheduler"),
+        revision=args.revision,
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    validation_prompts = [
+        "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography",
+        "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+        "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
+    ]
+
+    image_logs = []
+
+    for _, prompt in enumerate(validation_prompts):
+        images = []
+        if torch.backends.mps.is_available():
+            autocast_ctx = nullcontext()
+        else:
+            autocast_ctx = torch.autocast(accelerator.device.type)
+
+        with autocast_ctx:
+            images = pipeline(
+                prompt=prompt,
+                num_inference_steps=4,
+                num_images_per_prompt=4,
+                generator=generator,
+            ).images
+        image_logs.append({"validation_prompt": prompt, "images": images})
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                formatted_images = []
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                for image in images:
+                    image = wandb.Image(image, caption=validation_prompt)
+                    formatted_images.append(image)
+
+            tracker.log({f"validation/{name}": formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+        del pipeline
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        return image_logs
+
+
+def append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
+    return x[(...,) + (None,) * dims_to_append]
+
+
+# From LCMScheduler.get_scalings_for_boundary_condition_discrete
+def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0):
+    scaled_timestep = timestep_scaling * timestep
+    c_skip = sigma_data**2 / (scaled_timestep**2 + sigma_data**2)
+    c_out = scaled_timestep / (scaled_timestep**2 + sigma_data**2) ** 0.5
+    return c_skip, c_out
+
+
+# Compare LCMScheduler.step, Step 4
+def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_x_0 = (sample - sigmas * model_output) / alphas
+    elif prediction_type == "sample":
+        pred_x_0 = model_output
+    elif prediction_type == "v_prediction":
+        pred_x_0 = alphas * sample - sigmas * model_output
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_x_0
+
+
+# Based on step 4 in DDIMScheduler.step
+def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_epsilon = model_output
+    elif prediction_type == "sample":
+        pred_epsilon = (sample - alphas * model_output) / sigmas
+    elif prediction_type == "v_prediction":
+        pred_epsilon = alphas * model_output + sigmas * sample
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_epsilon
+
+
+def extract_into_tensor(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+@torch.no_grad()
+def update_ema(target_params, source_params, rate=0.99):
+    """
+    Update target parameters to be closer to those of source parameters using
+    an exponential moving average.
+
+    :param target_params: the target parameter sequence.
+    :param source_params: the source parameter sequence.
+    :param rate: the EMA rate (closer to 1 means slower).
+    """
+    for targ, src in zip(target_params, source_params):
+        targ.detach().mul_(rate).add_(src, alpha=1 - rate)
+
+
+# From LatentConsistencyModel.get_guidance_scale_embedding
+def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32):
+    """
+    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+    Args:
+        timesteps (`torch.Tensor`):
+            generate embedding vectors at these timesteps
+        embedding_dim (`int`, *optional*, defaults to 512):
+            dimension of the embeddings to generate
+        dtype:
+            data type of the generated embeddings
+
+    Returns:
+        `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+    """
+    assert len(w.shape) == 1
+    w = w * 1000.0
+
+    half_dim = embedding_dim // 2
+    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+    emb = w.to(dtype)[:, None] * emb[None, :]
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+    if embedding_dim % 2 == 1:  # zero pad
+        emb = torch.nn.functional.pad(emb, (0, 1))
+    assert emb.shape == (w.shape[0], embedding_dim)
+    return emb
+
+
+class DDIMSolver:
+    def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50):
+        # DDIM sampling parameters
+        step_ratio = timesteps // ddim_timesteps
+
+        self.ddim_timesteps = (np.arange(1, ddim_timesteps + 1) * step_ratio).round().astype(np.int64) - 1
+        self.ddim_alpha_cumprods = alpha_cumprods[self.ddim_timesteps]
+        self.ddim_alpha_cumprods_prev = np.asarray(
+            [alpha_cumprods[0]] + alpha_cumprods[self.ddim_timesteps[:-1]].tolist()
+        )
+        # convert to torch tensors
+        self.ddim_timesteps = torch.from_numpy(self.ddim_timesteps).long()
+        self.ddim_alpha_cumprods = torch.from_numpy(self.ddim_alpha_cumprods)
+        self.ddim_alpha_cumprods_prev = torch.from_numpy(self.ddim_alpha_cumprods_prev)
+
+    def to(self, device):
+        self.ddim_timesteps = self.ddim_timesteps.to(device)
+        self.ddim_alpha_cumprods = self.ddim_alpha_cumprods.to(device)
+        self.ddim_alpha_cumprods_prev = self.ddim_alpha_cumprods_prev.to(device)
+        return self
+
+    def ddim_step(self, pred_x0, pred_noise, timestep_index):
+        alpha_cumprod_prev = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape)
+        dir_xt = (1.0 - alpha_cumprod_prev).sqrt() * pred_noise
+        x_prev = alpha_cumprod_prev.sqrt() * pred_x0 + dir_xt
+        return x_prev
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    # ----------Model Checkpoint Loading Arguments----------
+    parser.add_argument(
+        "--pretrained_teacher_model",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained LDM teacher model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--teacher_revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained LDM teacher model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained LDM model identifier from huggingface.co/models.",
+    )
+    # ----------Training Arguments----------
+    # ----General Training Arguments----
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lcm-xl-distilled",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    # ----Logging----
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    # ----Checkpointing----
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    # ----Image Processing----
+    parser.add_argument(
+        "--train_shards_path_or_url",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--interpolation_type",
+        type=str,
+        default="bilinear",
+        help=(
+            "The interpolation function used when resizing images to the desired resolution. Choose between `bilinear`,"
+            " `bicubic`, `box`, `nearest`, `nearest_exact`, `hamming`, and `lanczos`."
+        ),
+    )
+    parser.add_argument(
+        "--use_fix_crop_and_size",
+        action="store_true",
+        help="Whether or not to use the fixed crop and size for the teacher model.",
+        default=False,
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    # ----Dataloader----
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    # ----Batch Size and Training Steps----
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    # ----Learning Rate----
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    # ----Optimizer (Adam)----
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    # ----Diffusion Training Arguments----
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    # ----Latent Consistency Distillation (LCD) Specific Arguments----
+    parser.add_argument(
+        "--w_min",
+        type=float,
+        default=3.0,
+        required=False,
+        help=(
+            "The minimum guidance scale value for guidance scale sampling. Note that we are using the Imagen CFG"
+            " formulation rather than the LCM formulation, which means all guidance scales have 1 added to them as"
+            " compared to the original paper."
+        ),
+    )
+    parser.add_argument(
+        "--w_max",
+        type=float,
+        default=15.0,
+        required=False,
+        help=(
+            "The maximum guidance scale value for guidance scale sampling. Note that we are using the Imagen CFG"
+            " formulation rather than the LCM formulation, which means all guidance scales have 1 added to them as"
+            " compared to the original paper."
+        ),
+    )
+    parser.add_argument(
+        "--num_ddim_timesteps",
+        type=int,
+        default=50,
+        help="The number of timesteps to use for DDIM sampling.",
+    )
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber"],
+        help="The type of loss to use for the LCD loss.",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.001,
+        help="The huber loss parameter. Only used if `--loss_type=huber`.",
+    )
+    parser.add_argument(
+        "--unet_time_cond_proj_dim",
+        type=int,
+        default=256,
+        help=(
+            "The dimension of the guidance scale embedding in the U-Net, which will be used if the teacher U-Net"
+            " does not have `time_cond_proj_dim` set."
+        ),
+    )
+    parser.add_argument(
+        "--vae_encode_batch_size",
+        type=int,
+        default=8,
+        required=False,
+        help=(
+            "The batch size used when encoding (and decoding) images to latents (and vice versa) using the VAE."
+            " Encoding or decoding the whole batch at once may run into OOM issues."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_scaling_factor",
+        type=float,
+        default=10.0,
+        help=(
+            "The multiplicative timestep scaling factor used when calculating the boundary scalings for LCM. The"
+            " higher the scaling is, the lower the approximation error, but the default value of 10.0 should typically"
+            " suffice."
+        ),
+    )
+    # ----Exponential Moving Average (EMA)----
+    parser.add_argument(
+        "--ema_decay",
+        type=float,
+        default=0.95,
+        required=False,
+        help="The exponential moving average (EMA) rate or decay factor.",
+    )
+    # ----Mixed Precision----
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--cast_teacher_unet",
+        action="store_true",
+        help="Whether to cast the teacher U-Net to the precision specified by `--mixed_precision`.",
+    )
+    # ----Training Optimizations----
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    # ----Distributed Training----
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    # ----------Validation Arguments----------
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=200,
+        help="Run validation every X steps.",
+    )
+    # ----------Huggingface Hub Arguments-----------
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    # ----------Accelerate Arguments----------
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    return args
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train=True):
+    prompt_embeds_list = []
+
+    captions = []
+    for caption in prompt_batch:
+        if random.random() < proportion_empty_prompts:
+            captions.append("")
+        elif isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                captions,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(
+                text_input_ids.to(text_encoder.device),
+                output_hidden_states=True,
+            )
+
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+            prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        split_batches=True,  # It's important to set this to True when using webdataset to get the right number of steps for lr scheduling. If set to False, the number of steps will be devide by the number of processes assuming batches are multiplied by the number of processes
+    )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+                token=args.hub_token,
+                private=True,
+            ).repo_id
+
+    # 1. Create the noise scheduler and the desired noise schedule.
+    noise_scheduler = DDPMScheduler.from_pretrained(
+        args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision
+    )
+
+    # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us
+    alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod)
+    sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod)
+    # Initialize the DDIM ODE solver for distillation.
+    solver = DDIMSolver(
+        noise_scheduler.alphas_cumprod.numpy(),
+        timesteps=noise_scheduler.config.num_train_timesteps,
+        ddim_timesteps=args.num_ddim_timesteps,
+    )
+
+    # 2. Load tokenizers from SD-XL checkpoint.
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_teacher_model, subfolder="tokenizer", revision=args.teacher_revision, use_fast=False
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_teacher_model, subfolder="tokenizer_2", revision=args.teacher_revision, use_fast=False
+    )
+
+    # 3. Load text encoders from SD-XL checkpoint.
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_teacher_model, args.teacher_revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_teacher_model, args.teacher_revision, subfolder="text_encoder_2"
+    )
+
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_teacher_model, subfolder="text_encoder", revision=args.teacher_revision
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_teacher_model, subfolder="text_encoder_2", revision=args.teacher_revision
+    )
+
+    # 4. Load VAE from SD-XL checkpoint (or more stable VAE)
+    vae_path = (
+        args.pretrained_teacher_model
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.teacher_revision,
+    )
+
+    # 5. Load teacher U-Net from SD-XL checkpoint
+    teacher_unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
+    )
+
+    # 6. Freeze teacher vae, text_encoders, and teacher_unet
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    teacher_unet.requires_grad_(False)
+
+    # 7. Create online student U-Net. This will be updated by the optimizer (e.g. via backpropagation.)
+    # Add `time_cond_proj_dim` to the student U-Net if `teacher_unet.config.time_cond_proj_dim` is None
+    time_cond_proj_dim = (
+        teacher_unet.config.time_cond_proj_dim
+        if teacher_unet.config.time_cond_proj_dim is not None
+        else args.unet_time_cond_proj_dim
+    )
+    unet = UNet2DConditionModel.from_config(teacher_unet.config, time_cond_proj_dim=time_cond_proj_dim)
+    # load teacher_unet weights into unet
+    unet.load_state_dict(teacher_unet.state_dict(), strict=False)
+    unet.train()
+
+    # 8. Create target student U-Net. This will be updated via EMA updates (polyak averaging).
+    # Initialize from (online) unet
+    target_unet = UNet2DConditionModel.from_config(unet.config)
+    target_unet.load_state_dict(unet.state_dict())
+    target_unet.train()
+    target_unet.requires_grad_(False)
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if accelerator.unwrap_model(unet).dtype != torch.float32:
+        raise ValueError(
+            f"Controlnet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}"
+        )
+
+    # 9. Handle mixed precision and device placement
+    # For mixed precision training we cast all non-trainable weigths to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    vae.to(accelerator.device)
+    if args.pretrained_vae_model_name_or_path is not None:
+        vae.to(dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+    target_unet.to(accelerator.device)
+    # Move teacher_unet to device, optionally cast to weight_dtype
+    teacher_unet.to(accelerator.device)
+    if args.cast_teacher_unet:
+        teacher_unet.to(dtype=weight_dtype)
+
+    # Also move the alpha and sigma noise schedules to accelerator.device.
+    alpha_schedule = alpha_schedule.to(accelerator.device)
+    sigma_schedule = sigma_schedule.to(accelerator.device)
+    # Move the ODE solver to accelerator.device.
+    solver = solver.to(accelerator.device)
+
+    # 10. Handle saving and loading of checkpoints
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                target_unet.save_pretrained(os.path.join(output_dir, "unet_target"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            load_model = UNet2DConditionModel.from_pretrained(os.path.join(input_dir, "unet_target"))
+            target_unet.load_state_dict(load_model.state_dict())
+            target_unet.to(accelerator.device)
+            del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # 11. Enable optimizations
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+            teacher_unet.enable_xformers_memory_efficient_attention()
+            target_unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # 12. Optimizer creation
+    optimizer = optimizer_class(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # 13. Dataset creation and data processing
+    # Here, we compute not just the text embeddings but also the additional embeddings
+    # needed for the SD XL UNet to operate.
+    def compute_embeddings(
+        prompt_batch, original_sizes, crop_coords, proportion_empty_prompts, text_encoders, tokenizers, is_train=True
+    ):
+        target_size = (args.resolution, args.resolution)
+        original_sizes = list(map(list, zip(*original_sizes)))
+        crops_coords_top_left = list(map(list, zip(*crop_coords)))
+
+        original_sizes = torch.tensor(original_sizes, dtype=torch.long)
+        crops_coords_top_left = torch.tensor(crops_coords_top_left, dtype=torch.long)
+
+        prompt_embeds, pooled_prompt_embeds = encode_prompt(
+            prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train
+        )
+        add_text_embeds = pooled_prompt_embeds
+
+        # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+        add_time_ids = list(target_size)
+        add_time_ids = torch.tensor([add_time_ids])
+        add_time_ids = add_time_ids.repeat(len(prompt_batch), 1)
+        add_time_ids = torch.cat([original_sizes, crops_coords_top_left, add_time_ids], dim=-1)
+        add_time_ids = add_time_ids.to(accelerator.device, dtype=prompt_embeds.dtype)
+
+        prompt_embeds = prompt_embeds.to(accelerator.device)
+        add_text_embeds = add_text_embeds.to(accelerator.device)
+        unet_added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+        return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs}
+
+    dataset = SDXLText2ImageDataset(
+        train_shards_path_or_url=args.train_shards_path_or_url,
+        num_train_examples=args.max_train_samples,
+        per_gpu_batch_size=args.train_batch_size,
+        global_batch_size=args.train_batch_size * accelerator.num_processes,
+        num_workers=args.dataloader_num_workers,
+        resolution=args.resolution,
+        interpolation_type=args.interpolation_type,
+        shuffle_buffer_size=1000,
+        pin_memory=True,
+        persistent_workers=True,
+        use_fix_crop_and_size=args.use_fix_crop_and_size,
+    )
+    train_dataloader = dataset.train_dataloader
+
+    # Let's first compute all the embeddings so that we can free up the text encoders
+    # from memory.
+    text_encoders = [text_encoder_one, text_encoder_two]
+    tokenizers = [tokenizer_one, tokenizer_two]
+
+    compute_embeddings_fn = functools.partial(
+        compute_embeddings,
+        proportion_empty_prompts=0,
+        text_encoders=text_encoders,
+        tokenizers=tokenizers,
+    )
+
+    # 14. LR Scheduler creation
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # 15. Prepare for training
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    # Create uncond embeds for classifier free guidance
+    uncond_prompt_embeds = torch.zeros(args.train_batch_size, 77, 2048).to(accelerator.device)
+    uncond_pooled_prompt_embeds = torch.zeros(args.train_batch_size, 1280).to(accelerator.device)
+
+    # 16. Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num batches each epoch = {train_dataloader.num_batches}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # 1. Load and process the image, text, and micro-conditioning (original image size, crop coordinates)
+                image, text, orig_size, crop_coords = batch
+
+                image = image.to(accelerator.device, non_blocking=True)
+                encoded_text = compute_embeddings_fn(text, orig_size, crop_coords)
+
+                if args.pretrained_vae_model_name_or_path is not None:
+                    pixel_values = image.to(dtype=weight_dtype)
+                    if vae.dtype != weight_dtype:
+                        vae.to(dtype=weight_dtype)
+                else:
+                    pixel_values = image
+
+                # encode pixel values with batch size of at most args.vae_encode_batch_size
+                latents = []
+                for i in range(0, pixel_values.shape[0], args.vae_encode_batch_size):
+                    latents.append(vae.encode(pixel_values[i : i + args.vae_encode_batch_size]).latent_dist.sample())
+                latents = torch.cat(latents, dim=0)
+
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+                bsz = latents.shape[0]
+
+                # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias.
+                # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...]
+                topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps
+                index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long()
+                start_timesteps = solver.ddim_timesteps[index]
+                timesteps = start_timesteps - topk
+                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)
+
+                # 3. Get boundary scalings for start_timesteps and (end) timesteps.
+                c_skip_start, c_out_start = scalings_for_boundary_conditions(
+                    start_timesteps, timestep_scaling=args.timestep_scaling_factor
+                )
+                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
+                c_skip, c_out = scalings_for_boundary_conditions(
+                    timesteps, timestep_scaling=args.timestep_scaling_factor
+                )
+                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]
+
+                # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each
+                # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                noise = torch.randn_like(latents)
+                noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps)
+
+                # 5. Sample a random guidance scale w from U[w_min, w_max] and embed it
+                w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min
+                w_embedding = guidance_scale_embedding(w, embedding_dim=time_cond_proj_dim)
+                w = w.reshape(bsz, 1, 1, 1)
+                # Move to U-Net device and dtype
+                w = w.to(device=latents.device, dtype=latents.dtype)
+                w_embedding = w_embedding.to(device=latents.device, dtype=latents.dtype)
+
+                # 6. Prepare prompt embeds and unet_added_conditions
+                prompt_embeds = encoded_text.pop("prompt_embeds")
+
+                # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps)
+                noise_pred = unet(
+                    noisy_model_input,
+                    start_timesteps,
+                    timestep_cond=w_embedding,
+                    encoder_hidden_states=prompt_embeds.float(),
+                    added_cond_kwargs=encoded_text,
+                ).sample
+
+                pred_x_0 = get_predicted_original_sample(
+                    noise_pred,
+                    start_timesteps,
+                    noisy_model_input,
+                    noise_scheduler.config.prediction_type,
+                    alpha_schedule,
+                    sigma_schedule,
+                )
+
+                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0
+
+                # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the
+                # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these
+                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
+                # solver timestep.
+                with torch.no_grad():
+                    if torch.backends.mps.is_available():
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type)
+
+                    with autocast_ctx:
+                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
+                        cond_teacher_output = teacher_unet(
+                            noisy_model_input.to(weight_dtype),
+                            start_timesteps,
+                            encoder_hidden_states=prompt_embeds.to(weight_dtype),
+                            added_cond_kwargs={k: v.to(weight_dtype) for k, v in encoded_text.items()},
+                        ).sample
+                        cond_pred_x0 = get_predicted_original_sample(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        cond_pred_noise = get_predicted_noise(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+
+                        # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0
+                        uncond_added_conditions = copy.deepcopy(encoded_text)
+                        uncond_added_conditions["text_embeds"] = uncond_pooled_prompt_embeds
+                        uncond_teacher_output = teacher_unet(
+                            noisy_model_input.to(weight_dtype),
+                            start_timesteps,
+                            encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype),
+                            added_cond_kwargs={k: v.to(weight_dtype) for k, v in uncond_added_conditions.items()},
+                        ).sample
+                        uncond_pred_x0 = get_predicted_original_sample(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        uncond_pred_noise = get_predicted_noise(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+
+                        # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise)
+                        # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation
+                        pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
+                        pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise)
+                        # 4. Run one step of the ODE solver to estimate the next point x_prev on the
+                        # augmented PF-ODE trajectory (solving backward in time)
+                        # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0.
+                        x_prev = solver.ddim_step(pred_x0, pred_noise, index)
+
+                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
+                with torch.no_grad():
+                    if torch.backends.mps.is_available():
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
+
+                    with autocast_ctx:
+                        target_noise_pred = target_unet(
+                            x_prev.float(),
+                            timesteps,
+                            timestep_cond=w_embedding,
+                            encoder_hidden_states=prompt_embeds.float(),
+                            added_cond_kwargs=encoded_text,
+                        ).sample
+                    pred_x_0 = get_predicted_original_sample(
+                        target_noise_pred,
+                        timesteps,
+                        x_prev,
+                        noise_scheduler.config.prediction_type,
+                        alpha_schedule,
+                        sigma_schedule,
+                    )
+                    target = c_skip * x_prev + c_out * pred_x_0
+
+                # 10. Calculate loss
+                if args.loss_type == "l2":
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                elif args.loss_type == "huber":
+                    loss = torch.mean(
+                        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
+                    )
+
+                # 11. Backpropagate on the online student model (`unet`)
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=True)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                # 12. Make EMA update to target student model parameters (`target_unet`)
+                update_ema(target_unet.parameters(), unet.parameters(), args.ema_decay)
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if global_step % args.validation_steps == 0:
+                        log_validation(vae, target_unet, args, accelerator, weight_dtype, global_step, "target")
+                        log_validation(vae, unet, args, accelerator, weight_dtype, global_step, "online")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        unet.save_pretrained(os.path.join(args.output_dir, "unet"))
+
+        target_unet = accelerator.unwrap_model(target_unet)
+        target_unet.save_pretrained(os.path.join(args.output_dir, "unet_target"))
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/controlnet/README.md b/diffusers/examples/controlnet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0555857b773834a14462d3a09e3e17866b713663
--- /dev/null
+++ b/diffusers/examples/controlnet/README.md
@@ -0,0 +1,465 @@
+# ControlNet training example
+
+[Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) by Lvmin Zhang and Maneesh Agrawala.
+
+This example is based on the [training example in the original ControlNet repository](https://github.com/lllyasviel/ControlNet/blob/main/docs/train.md). It trains a ControlNet to fill circles using a [small synthetic dataset](https://huggingface.co/datasets/fusing/fill50k).
+
+## Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the example folder and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+## Circle filling dataset
+
+The original dataset is hosted in the [ControlNet repo](https://huggingface.co/lllyasviel/ControlNet/blob/main/training/fill50k.zip). We re-uploaded it to be compatible with `datasets` [here](https://huggingface.co/datasets/fusing/fill50k). Note that `datasets` handles dataloading within the training script.
+
+Our training examples use [Stable Diffusion 1.5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) as the original set of ControlNet models were trained from it. However, ControlNet can be trained to augment any Stable Diffusion compatible model (such as [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4)) or [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1).
+
+## Training
+
+Our training examples use two test conditioning images. They can be downloaded by running
+
+```sh
+wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png
+
+wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png
+```
+
+
+```bash
+export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export OUTPUT_DIR="path to save model"
+
+accelerate launch train_controlnet.py \
+ --pretrained_model_name_or_path=$MODEL_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --dataset_name=fusing/fill50k \
+ --resolution=512 \
+ --learning_rate=1e-5 \
+ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
+ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
+ --train_batch_size=4
+```
+
+This default configuration requires ~38GB VRAM.
+
+By default, the training script logs outputs to tensorboard. Pass `--report_to wandb` to use weights and
+biases.
+
+Gradient accumulation with a smaller batch size can be used to reduce training requirements to ~20 GB VRAM.
+
+```bash
+export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export OUTPUT_DIR="path to save model"
+
+accelerate launch train_controlnet.py \
+ --pretrained_model_name_or_path=$MODEL_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --dataset_name=fusing/fill50k \
+ --resolution=512 \
+ --learning_rate=1e-5 \
+ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
+ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=4
+```
+
+## Training with multiple GPUs
+
+`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch)
+for running distributed training with `accelerate`. Here is an example command:
+
+```bash
+export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export OUTPUT_DIR="path to save model"
+
+accelerate launch --mixed_precision="fp16" --multi_gpu train_controlnet.py \
+ --pretrained_model_name_or_path=$MODEL_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --dataset_name=fusing/fill50k \
+ --resolution=512 \
+ --learning_rate=1e-5 \
+ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
+ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
+ --train_batch_size=4 \
+ --mixed_precision="fp16" \
+ --tracker_project_name="controlnet-demo" \
+ --report_to=wandb
+```
+
+## Example results
+
+#### After 300 steps with batch size 8
+
+| |  |
+|-------------------|:-------------------------:|
+| | red circle with blue background  |
+![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png) | ![red circle with blue background](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/red_circle_with_blue_background_300_steps.png) |
+| | cyan circle with brown floral background |
+![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png) | ![cyan circle with brown floral background](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/cyan_circle_with_brown_floral_background_300_steps.png) |
+
+
+#### After 6000 steps with batch size 8:
+
+| |  |
+|-------------------|:-------------------------:|
+| | red circle with blue background  |
+![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png) | ![red circle with blue background](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/red_circle_with_blue_background_6000_steps.png) |
+| | cyan circle with brown floral background |
+![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png) | ![cyan circle with brown floral background](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/cyan_circle_with_brown_floral_background_6000_steps.png) |
+
+## Training on a 16 GB GPU
+
+Optimizations:
+- Gradient checkpointing
+- bitsandbyte's 8-bit optimizer
+
+[bitandbytes install instructions](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
+
+```bash
+export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export OUTPUT_DIR="path to save model"
+
+accelerate launch train_controlnet.py \
+ --pretrained_model_name_or_path=$MODEL_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --dataset_name=fusing/fill50k \
+ --resolution=512 \
+ --learning_rate=1e-5 \
+ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
+ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=4 \
+ --gradient_checkpointing \
+ --use_8bit_adam
+```
+
+## Training on a 12 GB GPU
+
+Optimizations:
+- Gradient checkpointing
+- bitsandbyte's 8-bit optimizer
+- xformers
+- set grads to none
+
+```bash
+export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export OUTPUT_DIR="path to save model"
+
+accelerate launch train_controlnet.py \
+ --pretrained_model_name_or_path=$MODEL_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --dataset_name=fusing/fill50k \
+ --resolution=512 \
+ --learning_rate=1e-5 \
+ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
+ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=4 \
+ --gradient_checkpointing \
+ --use_8bit_adam \
+ --enable_xformers_memory_efficient_attention \
+ --set_grads_to_none
+```
+
+When using `enable_xformers_memory_efficient_attention`, please make sure to install `xformers` by `pip install xformers`.
+
+## Training on an 8 GB GPU
+
+We have not exhaustively tested DeepSpeed support for ControlNet. While the configuration does
+save memory, we have not confirmed the configuration to train successfully. You will very likely
+have to make changes to the config to have a successful training run.
+
+Optimizations:
+- Gradient checkpointing
+- xformers
+- set grads to none
+- DeepSpeed stage 2 with parameter and optimizer offloading
+- fp16 mixed precision
+
+[DeepSpeed](https://www.deepspeed.ai/) can offload tensors from VRAM to either
+CPU or NVME. This requires significantly more RAM (about 25 GB).
+
+Use `accelerate config` to enable DeepSpeed stage 2.
+
+The relevant parts of the resulting accelerate config file are
+
+```yaml
+compute_environment: LOCAL_MACHINE
+deepspeed_config:
+  gradient_accumulation_steps: 4
+  offload_optimizer_device: cpu
+  offload_param_device: cpu
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+```
+
+See [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more DeepSpeed configuration options.
+
+Changing the default Adam optimizer to DeepSpeed's Adam
+`deepspeed.ops.adam.DeepSpeedCPUAdam` gives a substantial speedup but
+it requires CUDA toolchain with the same version as pytorch. 8-bit optimizer
+does not seem to be compatible with DeepSpeed at the moment.
+
+```bash
+export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export OUTPUT_DIR="path to save model"
+
+accelerate launch train_controlnet.py \
+ --pretrained_model_name_or_path=$MODEL_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --dataset_name=fusing/fill50k \
+ --resolution=512 \
+ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
+ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=4 \
+ --gradient_checkpointing \
+ --enable_xformers_memory_efficient_attention \
+ --set_grads_to_none \
+ --mixed_precision fp16
+```
+
+## Performing inference with the trained ControlNet
+
+The trained model can be run the same as the original ControlNet pipeline with the newly trained ControlNet.
+Set `base_model_path` and `controlnet_path` to the values `--pretrained_model_name_or_path` and
+`--output_dir` were respectively set to in the training script.
+
+```py
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
+from diffusers.utils import load_image
+import torch
+
+base_model_path = "path to model"
+controlnet_path = "path to controlnet"
+
+controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
+pipe = StableDiffusionControlNetPipeline.from_pretrained(
+    base_model_path, controlnet=controlnet, torch_dtype=torch.float16
+)
+
+# speed up diffusion process with faster scheduler and memory optimization
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+# remove following line if xformers is not installed or when using Torch 2.0.
+pipe.enable_xformers_memory_efficient_attention()
+# memory optimization.
+pipe.enable_model_cpu_offload()
+
+control_image = load_image("./conditioning_image_1.png")
+prompt = "pale golden rod circle with old lace background"
+
+# generate image
+generator = torch.manual_seed(0)
+image = pipe(
+    prompt, num_inference_steps=20, generator=generator, image=control_image
+).images[0]
+image.save("./output.png")
+```
+
+## Training with Flax/JAX
+
+For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
+
+### Running on Google Cloud TPU
+
+See below for commands to set up a TPU VM(`--accelerator-type v4-8`). For more details about how to set up and use TPUs, refer to [Cloud docs for single VM setup](https://cloud.google.com/tpu/docs/run-calculation-jax).
+
+First create a single TPUv4-8 VM and connect to it:
+
+```
+ZONE=us-central2-b
+TPU_TYPE=v4-8
+VM_NAME=hg_flax
+
+gcloud alpha compute tpus tpu-vm create $VM_NAME \
+ --zone $ZONE \
+ --accelerator-type $TPU_TYPE \
+ --version  tpu-vm-v4-base
+
+gcloud alpha compute tpus tpu-vm ssh $VM_NAME --zone $ZONE -- \
+```
+
+When connected install JAX `0.4.5`:
+
+```sh
+pip install "jax[tpu]==0.4.5" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+```
+
+To verify that JAX was correctly installed, you can run the following command:
+
+```py
+import jax
+jax.device_count()
+```
+
+This should display the number of TPU cores, which should be 4 on a TPUv4-8 VM.
+
+Then install Diffusers and the library's training dependencies:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd in the example folder and run
+
+```bash
+pip install -U -r requirements_flax.txt
+```
+
+If you want to use Weights and Biases logging, you should also install `wandb` now
+
+```bash
+pip install wandb
+```
+
+
+Now let's downloading two conditioning images that we will use to run validation during the training in order to track our progress
+
+```sh
+wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png
+wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png
+```
+
+We encourage you to store or share your model with the community. To use huggingface hub, please login to your Hugging Face account, or ([create one](https://huggingface.co/docs/diffusers/main/en/training/hf.co/join) if you don’t have one already):
+
+```sh
+huggingface-cli login
+```
+
+Make sure you have the `MODEL_DIR`,`OUTPUT_DIR` and `HUB_MODEL_ID` environment variables set. The `OUTPUT_DIR` and `HUB_MODEL_ID` variables specify where to save the model to on the Hub:
+
+```bash
+export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export OUTPUT_DIR="runs/fill-circle-{timestamp}"
+export HUB_MODEL_ID="controlnet-fill-circle"
+```
+
+And finally start the training
+
+```bash
+python3 train_controlnet_flax.py \
+ --pretrained_model_name_or_path=$MODEL_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --dataset_name=fusing/fill50k \
+ --resolution=512 \
+ --learning_rate=1e-5 \
+ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
+ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
+ --validation_steps=1000 \
+ --train_batch_size=2 \
+ --revision="non-ema" \
+ --from_pt \
+ --report_to="wandb" \
+ --tracker_project_name=$HUB_MODEL_ID \
+ --num_train_epochs=11 \
+ --push_to_hub \
+ --hub_model_id=$HUB_MODEL_ID
+ ```
+
+Since we passed the `--push_to_hub` flag, it will automatically create a model repo under your huggingface account based on `$HUB_MODEL_ID`. By the end of training, the final checkpoint will be automatically stored on the hub. You can find an example model repo [here](https://huggingface.co/YiYiXu/fill-circle-controlnet).
+
+Our training script also provides limited support for streaming large datasets from the Hugging Face Hub. In order to enable streaming, one must also set `--max_train_samples`.  Here is an example command (from [this blog article](https://huggingface.co/blog/train-your-controlnet)):
+
+```bash
+export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export OUTPUT_DIR="runs/uncanny-faces-{timestamp}"
+export HUB_MODEL_ID="controlnet-uncanny-faces"
+
+python3 train_controlnet_flax.py \
+ --pretrained_model_name_or_path=$MODEL_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --dataset_name=multimodalart/facesyntheticsspigacaptioned \
+ --streaming \
+ --conditioning_image_column=spiga_seg \
+ --image_column=image \
+ --caption_column=image_caption \
+ --resolution=512 \
+ --max_train_samples 100000 \
+ --learning_rate=1e-5 \
+ --train_batch_size=1 \
+ --revision="flax" \
+ --report_to="wandb" \
+ --tracker_project_name=$HUB_MODEL_ID
+```
+
+Note, however, that the performance of the TPUs might get bottlenecked as streaming with `datasets` is not optimized for images. For ensuring maximum throughput, we encourage you to explore the following options:
+
+* [Webdataset](https://webdataset.github.io/webdataset/)
+* [TorchData](https://github.com/pytorch/data)
+* [TensorFlow Datasets](https://www.tensorflow.org/datasets/tfless_tfds)
+
+When work with a larger dataset, you may need to run training process for a long time and it’s useful to save regular checkpoints during the process. You can use the following argument to enable intermediate checkpointing:
+
+```bash
+ --checkpointing_steps=500
+```
+This will save the trained model in subfolders of your output_dir. Subfolder names is the number of steps performed so far; for example: a checkpoint saved after 500 training steps would be saved in a subfolder named 500
+
+You can then start your training from this saved checkpoint with
+
+```bash
+ --controlnet_model_name_or_path="./control_out/500"
+```
+
+We support training with the Min-SNR weighting strategy proposed in [Efficient Diffusion Training via Min-SNR Weighting Strategy](https://arxiv.org/abs/2303.09556) which helps to achieve faster convergence by rebalancing the loss. To use it, one needs to set the `--snr_gamma` argument. The recommended value when using it is `5.0`.
+
+We also support gradient accumulation - it is a technique that lets you use a bigger batch size than your machine would normally be able to fit into memory. You can use `gradient_accumulation_steps` argument to set gradient accumulation steps. The ControlNet author recommends using gradient accumulation to achieve better convergence. Read more [here](https://github.com/lllyasviel/ControlNet/blob/main/docs/train.md#more-consideration-sudden-converge-phenomenon-and-gradient-accumulation).
+
+You can **profile your code** with:
+
+```bash
+ --profile_steps==5
+```
+
+Refer to the [JAX documentation on profiling](https://jax.readthedocs.io/en/latest/profiling.html). To inspect the profile trace, you'll have to install and start Tensorboard with the profile plugin:
+
+```bash
+pip install tensorflow tensorboard-plugin-profile
+tensorboard --logdir runs/fill-circle-100steps-20230411_165612/
+```
+
+The profile can then be inspected at http://localhost:6006/#profile
+
+Sometimes you'll get version conflicts (error messages like `Duplicate plugins for name projector`), which means that you have to uninstall and reinstall all versions of Tensorflow/Tensorboard (e.g. with `pip uninstall tensorflow tf-nightly tensorboard tb-nightly tensorboard-plugin-profile && pip install tf-nightly tbp-nightly tensorboard-plugin-profile`).
+
+Note that the debugging functionality of the Tensorboard `profile` plugin is still under active development. Not all views are fully functional, and for example the `trace_viewer` cuts off events after 1M (which can result in all your device traces getting lost if you for example profile the compilation step by accident).
+
+## Support for Stable Diffusion XL
+
+We provide a training script for training a ControlNet with [Stable Diffusion XL](https://huggingface.co/papers/2307.01952). Please refer to [README_sdxl.md](./README_sdxl.md) for more details.
diff --git a/diffusers/examples/controlnet/README_sd3.md b/diffusers/examples/controlnet/README_sd3.md
new file mode 100644
index 0000000000000000000000000000000000000000..1788e07a21d69297f12f2f3dae6645305940cf19
--- /dev/null
+++ b/diffusers/examples/controlnet/README_sd3.md
@@ -0,0 +1,152 @@
+# ControlNet training example for Stable Diffusion 3 (SD3)
+
+The `train_controlnet_sd3.py` script shows how to implement the ControlNet training procedure and adapt it for [Stable Diffusion 3](https://arxiv.org/abs/2403.03206).
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the `examples/controlnet` folder and run
+```bash
+pip install -r requirements_sd3.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell (e.g., a notebook)
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+
+## Circle filling dataset
+
+The original dataset is hosted in the [ControlNet repo](https://huggingface.co/lllyasviel/ControlNet/blob/main/training/fill50k.zip). We re-uploaded it to be compatible with `datasets` [here](https://huggingface.co/datasets/fusing/fill50k). Note that `datasets` handles dataloading within the training script.
+Please download the dataset and unzip it in the directory `fill50k` in the `examples/controlnet` folder.
+
+## Training
+
+First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium). We will use it as a base model for the ControlNet training.
+> [!NOTE]
+> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
+
+```bash
+huggingface-cli login
+```
+
+This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
+
+
+Our training examples use two test conditioning images. They can be downloaded by running
+
+```sh
+wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png
+
+wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png
+```
+
+Then run the following commands to train a ControlNet model.
+
+```bash
+export MODEL_DIR="stabilityai/stable-diffusion-3-medium-diffusers"
+export OUTPUT_DIR="sd3-controlnet-out"
+
+accelerate launch train_controlnet_sd3.py \
+    --pretrained_model_name_or_path=$MODEL_DIR \
+    --output_dir=$OUTPUT_DIR \
+    --train_data_dir="fill50k" \
+    --resolution=1024 \
+    --learning_rate=1e-5 \
+    --max_train_steps=15000 \
+    --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
+    --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
+    --validation_steps=100 \
+    --train_batch_size=1 \
+    --gradient_accumulation_steps=4
+```
+
+To better track our training experiments, we're using flags `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+Our experiments were conducted on a single 40GB A100 GPU.
+
+### Inference
+
+Once training is done, we can perform inference like so:
+
+```python
+from diffusers import StableDiffusion3ControlNetPipeline, SD3ControlNetModel
+from diffusers.utils import load_image
+import torch
+
+base_model_path = "stabilityai/stable-diffusion-3-medium-diffusers"
+controlnet_path = "sd3-controlnet-out/checkpoint-6500/controlnet"
+
+controlnet = SD3ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
+pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
+    base_model_path, controlnet=controlnet
+)
+pipe.to("cuda", torch.float16)
+
+
+control_image = load_image("./conditioning_image_1.png").resize((1024, 1024))
+prompt = "pale golden rod circle with old lace background"
+
+# generate image
+generator = torch.manual_seed(0)
+image = pipe(
+    prompt, num_inference_steps=20, generator=generator, control_image=control_image
+).images[0]
+image.save("./output.png")
+```
+
+## Notes
+
+### GPU usage
+
+SD3 is a large model and requires a lot of GPU memory. 
+We recommend using one GPU with at least 80GB of memory.
+Make sure to use the right GPU when configuring the [accelerator](https://huggingface.co/docs/transformers/en/accelerate).
+
+
+## Example results
+
+#### After 500 steps with batch size 8
+
+| |  |
+|-------------------|:-------------------------:|
+|| pale golden rod circle with old lace background |
+ ![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png) | ![pale golden rod circle with old lace background](https://huggingface.co/datasets/DavyMorgan/sd3-controlnet-results/resolve/main/step-500.png) |
+
+
+#### After 6500 steps with batch size 8:
+
+| |  |
+|-------------------|:-------------------------:|
+|| pale golden rod circle with old lace background |
+ ![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png) | ![pale golden rod circle with old lace background](https://huggingface.co/datasets/DavyMorgan/sd3-controlnet-results/resolve/main/step-6500.png) |
+
diff --git a/diffusers/examples/controlnet/README_sdxl.md b/diffusers/examples/controlnet/README_sdxl.md
new file mode 100644
index 0000000000000000000000000000000000000000..75511385ff37d5dc198a22c68731dcbf224f8fe0
--- /dev/null
+++ b/diffusers/examples/controlnet/README_sdxl.md
@@ -0,0 +1,141 @@
+# ControlNet training example for Stable Diffusion XL (SDXL)
+
+The `train_controlnet_sdxl.py` script shows how to implement the ControlNet training procedure and adapt it for [Stable Diffusion XL](https://huggingface.co/papers/2307.01952).
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the `examples/controlnet` folder and run
+```bash
+pip install -r requirements_sdxl.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell (e.g., a notebook)
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+
+## Circle filling dataset
+
+The original dataset is hosted in the [ControlNet repo](https://huggingface.co/lllyasviel/ControlNet/blob/main/training/fill50k.zip). We re-uploaded it to be compatible with `datasets` [here](https://huggingface.co/datasets/fusing/fill50k). Note that `datasets` handles dataloading within the training script.
+
+## Training
+
+Our training examples use two test conditioning images. They can be downloaded by running
+
+```sh
+wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png
+
+wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png
+```
+
+Then run `huggingface-cli login` to log into your Hugging Face account. This is needed to be able to push the trained ControlNet parameters to Hugging Face Hub.
+
+```bash
+export MODEL_DIR="stabilityai/stable-diffusion-xl-base-1.0"
+export OUTPUT_DIR="path to save model"
+
+accelerate launch train_controlnet_sdxl.py \
+ --pretrained_model_name_or_path=$MODEL_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --dataset_name=fusing/fill50k \
+ --mixed_precision="fp16" \
+ --resolution=1024 \
+ --learning_rate=1e-5 \
+ --max_train_steps=15000 \
+ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
+ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
+ --validation_steps=100 \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=4 \
+ --report_to="wandb" \
+ --seed=42 \
+ --push_to_hub
+```
+
+To better track our training experiments, we're using the following flags in the command above:
+
+* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
+* `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+Our experiments were conducted on a single 40GB A100 GPU.
+
+### Inference
+
+Once training is done, we can perform inference like so:
+
+```python
+from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
+from diffusers.utils import load_image
+import torch
+
+base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
+controlnet_path = "path to controlnet"
+
+controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    base_model_path, controlnet=controlnet, torch_dtype=torch.float16
+)
+
+# speed up diffusion process with faster scheduler and memory optimization
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+# remove following line if xformers is not installed or when using Torch 2.0.
+pipe.enable_xformers_memory_efficient_attention()
+# memory optimization.
+pipe.enable_model_cpu_offload()
+
+control_image = load_image("./conditioning_image_1.png").resize((1024, 1024))
+prompt = "pale golden rod circle with old lace background"
+
+# generate image
+generator = torch.manual_seed(0)
+image = pipe(
+    prompt, num_inference_steps=20, generator=generator, image=control_image
+).images[0]
+image.save("./output.png")
+```
+
+## Notes
+
+### Specifying a better VAE
+
+SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of an alternative VAE (such as [`madebyollin/sdxl-vae-fp16-fix`](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
+
+If you're using this VAE during training, you need to ensure you're using it during inference too. You do so by:
+
+```diff
++ vae = AutoencoderKL.from_pretrained(vae_path_or_repo_id, torch_dtype=torch.float16)
+controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    base_model_path, controlnet=controlnet, torch_dtype=torch.float16,
++   vae=vae,
+)
diff --git a/diffusers/examples/controlnet/requirements.txt b/diffusers/examples/controlnet/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d19c62296702868c768596bdd866dd5b504e4180
--- /dev/null
+++ b/diffusers/examples/controlnet/requirements.txt
@@ -0,0 +1,6 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+datasets
diff --git a/diffusers/examples/controlnet/requirements_flax.txt b/diffusers/examples/controlnet/requirements_flax.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b6eb64e254625ee8eff2ef126d67adfd5b6994dc
--- /dev/null
+++ b/diffusers/examples/controlnet/requirements_flax.txt
@@ -0,0 +1,9 @@
+transformers>=4.25.1
+datasets
+flax
+optax
+torch
+torchvision
+ftfy
+tensorboard
+Jinja2
diff --git a/diffusers/examples/controlnet/requirements_sd3.txt b/diffusers/examples/controlnet/requirements_sd3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5ab6e9932e10a1e5337f3bc3faa8a192f4f60a52
--- /dev/null
+++ b/diffusers/examples/controlnet/requirements_sd3.txt
@@ -0,0 +1,8 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
+datasets
+wandb
diff --git a/diffusers/examples/controlnet/requirements_sdxl.txt b/diffusers/examples/controlnet/requirements_sdxl.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5ab6e9932e10a1e5337f3bc3faa8a192f4f60a52
--- /dev/null
+++ b/diffusers/examples/controlnet/requirements_sdxl.txt
@@ -0,0 +1,8 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
+datasets
+wandb
diff --git a/diffusers/examples/controlnet/test_controlnet.py b/diffusers/examples/controlnet/test_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..77b5614c7fb0362447529d95c05b948510215f09
--- /dev/null
+++ b/diffusers/examples/controlnet/test_controlnet.py
@@ -0,0 +1,138 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class ControlNet(ExamplesTestsAccelerate):
+    def test_controlnet_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/controlnet/train_controlnet.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_controlnet_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/controlnet/train_controlnet.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
+            --max_train_steps=6
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4", "checkpoint-6"},
+            )
+
+            resume_run_args = f"""
+            examples/controlnet/train_controlnet.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-6
+            --checkpoints_total_limit=2
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
+
+
+class ControlNetSDXL(ExamplesTestsAccelerate):
+    def test_controlnet_sdxl(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/controlnet/train_controlnet_sdxl.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-xl-pipe
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet-sdxl
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
+
+
+class ControlNetSD3(ExamplesTestsAccelerate):
+    def test_controlnet_sd3(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/controlnet/train_controlnet_sd3.py
+            --pretrained_model_name_or_path=DavyMorgan/tiny-sd3-pipe
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --controlnet_model_name_or_path=DavyMorgan/tiny-controlnet-sd3
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
diff --git a/diffusers/examples/controlnet/train_controlnet.py b/diffusers/examples/controlnet/train_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e902db7ffc7e64a7c29977b93370be2afe7b9fb
--- /dev/null
+++ b/diffusers/examples/controlnet/train_controlnet.py
@@ -0,0 +1,1187 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import contextlib
+import gc
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import accelerate
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    DDPMScheduler,
+    StableDiffusionControlNetPipeline,
+    UNet2DConditionModel,
+    UniPCMultistepScheduler,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def image_grid(imgs, rows, cols):
+    assert len(imgs) == rows * cols
+
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+
+
+def log_validation(
+    vae, text_encoder, tokenizer, unet, controlnet, args, accelerator, weight_dtype, step, is_final_validation=False
+):
+    logger.info("Running validation... ")
+
+    if not is_final_validation:
+        controlnet = accelerator.unwrap_model(controlnet)
+    else:
+        controlnet = ControlNetModel.from_pretrained(args.output_dir, torch_dtype=weight_dtype)
+
+    pipeline = StableDiffusionControlNetPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        vae=vae,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        unet=unet,
+        controlnet=controlnet,
+        safety_checker=None,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    if len(args.validation_image) == len(args.validation_prompt):
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_image) == 1:
+        validation_images = args.validation_image * len(args.validation_prompt)
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_prompt) == 1:
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt * len(args.validation_image)
+    else:
+        raise ValueError(
+            "number of `args.validation_image` and `args.validation_prompt` should be checked in `parse_args`"
+        )
+
+    image_logs = []
+    inference_ctx = contextlib.nullcontext() if is_final_validation else torch.autocast("cuda")
+
+    for validation_prompt, validation_image in zip(validation_prompts, validation_images):
+        validation_image = Image.open(validation_image).convert("RGB")
+
+        images = []
+
+        for _ in range(args.num_validation_images):
+            with inference_ctx:
+                image = pipeline(
+                    validation_prompt, validation_image, num_inference_steps=20, generator=generator
+                ).images[0]
+
+            images.append(image)
+
+        image_logs.append(
+            {"validation_image": validation_image, "images": images, "validation_prompt": validation_prompt}
+        )
+
+    tracker_key = "test" if is_final_validation else "validation"
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                validation_image = log["validation_image"]
+
+                formatted_images = []
+
+                formatted_images.append(np.asarray(validation_image))
+
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                validation_image = log["validation_image"]
+
+                formatted_images.append(wandb.Image(validation_image, caption="Controlnet conditioning"))
+
+                for image in images:
+                    image = wandb.Image(image, caption=validation_prompt)
+                    formatted_images.append(image)
+
+            tracker.log({tracker_key: formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+        del pipeline
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        return image_logs
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+
+        return RobertaSeriesModelWithTransformation
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=None):
+    img_str = ""
+    if image_logs is not None:
+        img_str = "You can find some example images below.\n\n"
+        for i, log in enumerate(image_logs):
+            images = log["images"]
+            validation_prompt = log["validation_prompt"]
+            validation_image = log["validation_image"]
+            validation_image.save(os.path.join(repo_folder, "image_control.png"))
+            img_str += f"prompt: {validation_prompt}\n"
+            images = [validation_image] + images
+            image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
+            img_str += f"![images_{i})](./images_{i}.png)\n"
+
+    model_description = f"""
+# controlnet-{repo_id}
+
+These are controlnet weights trained on {base_model} with new type of conditioning.
+{img_str}
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion",
+        "stable-diffusion-diffusers",
+        "text-to-image",
+        "diffusers",
+        "controlnet",
+        "diffusers-training",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--controlnet_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
+        " If not specified controlnet weights are initialized from unet.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="controlnet-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
+            "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
+            "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
+            "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
+            "instructions."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing the target image."
+    )
+    parser.add_argument(
+        "--conditioning_image_column",
+        type=str,
+        default="conditioning_image",
+        help="The column of the dataset containing the controlnet conditioning image.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of prompts evaluated every `--validation_steps` and logged to `--report_to`."
+            " Provide either a matching number of `--validation_image`s, a single `--validation_image`"
+            " to be used with all prompts, or a single prompt that will be used with all `--validation_image`s."
+        ),
+    )
+    parser.add_argument(
+        "--validation_image",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of paths to the controlnet conditioning image be evaluated every `--validation_steps`"
+            " and logged to `--report_to`. Provide either a matching number of `--validation_prompt`s, a"
+            " a single `--validation_prompt` to be used with all `--validation_image`s, or a single"
+            " `--validation_image` that will be used with all `--validation_prompt`s."
+        ),
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images to be generated for each `--validation_image`, `--validation_prompt` pair",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="train_controlnet",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--train_data_dir`")
+
+    if args.dataset_name is not None and args.train_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`")
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    if args.validation_prompt is not None and args.validation_image is None:
+        raise ValueError("`--validation_image` must be set if `--validation_prompt` is set")
+
+    if args.validation_prompt is None and args.validation_image is not None:
+        raise ValueError("`--validation_prompt` must be set if `--validation_image` is set")
+
+    if (
+        args.validation_image is not None
+        and args.validation_prompt is not None
+        and len(args.validation_image) != 1
+        and len(args.validation_prompt) != 1
+        and len(args.validation_image) != len(args.validation_prompt)
+    ):
+        raise ValueError(
+            "Must provide either 1 `--validation_image`, 1 `--validation_prompt`,"
+            " or the same number of `--validation_prompt`s and `--validation_image`s"
+        )
+
+    if args.resolution % 8 != 0:
+        raise ValueError(
+            "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder."
+        )
+
+    return args
+
+
+def make_train_dataset(args, tokenizer, accelerator):
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        if args.train_data_dir is not None:
+            dataset = load_dataset(
+                args.train_data_dir,
+                cache_dir=args.cache_dir,
+            )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    if args.image_column is None:
+        image_column = column_names[0]
+        logger.info(f"image column defaulting to {image_column}")
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    if args.caption_column is None:
+        caption_column = column_names[1]
+        logger.info(f"caption column defaulting to {caption_column}")
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    if args.conditioning_image_column is None:
+        conditioning_image_column = column_names[2]
+        logger.info(f"conditioning image column defaulting to {conditioning_image_column}")
+    else:
+        conditioning_image_column = args.conditioning_image_column
+        if conditioning_image_column not in column_names:
+            raise ValueError(
+                f"`--conditioning_image_column` value '{args.conditioning_image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if random.random() < args.proportion_empty_prompts:
+                captions.append("")
+            elif isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids
+
+    image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    conditioning_image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        images = [image_transforms(image) for image in images]
+
+        conditioning_images = [image.convert("RGB") for image in examples[conditioning_image_column]]
+        conditioning_images = [conditioning_image_transforms(image) for image in conditioning_images]
+
+        examples["pixel_values"] = images
+        examples["conditioning_pixel_values"] = conditioning_images
+        examples["input_ids"] = tokenize_captions(examples)
+
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    return train_dataset
+
+
+def collate_fn(examples):
+    pixel_values = torch.stack([example["pixel_values"] for example in examples])
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    conditioning_pixel_values = torch.stack([example["conditioning_pixel_values"] for example in examples])
+    conditioning_pixel_values = conditioning_pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    input_ids = torch.stack([example["input_ids"] for example in examples])
+
+    return {
+        "pixel_values": pixel_values,
+        "conditioning_pixel_values": conditioning_pixel_values,
+        "input_ids": input_ids,
+    }
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+
+    # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    if args.controlnet_model_name_or_path:
+        logger.info("Loading existing controlnet weights")
+        controlnet = ControlNetModel.from_pretrained(args.controlnet_model_name_or_path)
+    else:
+        logger.info("Initializing controlnet weights from unet")
+        controlnet = ControlNetModel.from_unet(unet)
+
+    # Taken from [Sayak Paul's Diffusers PR #6511](https://github.com/huggingface/diffusers/pull/6511/files)
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                i = len(weights) - 1
+
+                while len(weights) > 0:
+                    weights.pop()
+                    model = models[i]
+
+                    sub_dir = "controlnet"
+                    model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+                    i -= 1
+
+        def load_model_hook(models, input_dir):
+            while len(models) > 0:
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = ControlNetModel.from_pretrained(input_dir, subfolder="controlnet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    vae.requires_grad_(False)
+    unet.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    controlnet.train()
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+            controlnet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        controlnet.enable_gradient_checkpointing()
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if unwrap_model(controlnet).dtype != torch.float32:
+        raise ValueError(
+            f"Controlnet loaded as datatype {unwrap_model(controlnet).dtype}. {low_precision_error_string}"
+        )
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = controlnet.parameters()
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    train_dataset = make_train_dataset(args, tokenizer, accelerator)
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        controlnet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae, unet and text_encoder to device and cast to weight_dtype
+    vae.to(accelerator.device, dtype=weight_dtype)
+    unet.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+
+        # tensorboard cannot handle list types for config
+        tracker_config.pop("validation_prompt")
+        tracker_config.pop("validation_image")
+
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    image_logs = None
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(controlnet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0]
+
+                controlnet_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype)
+
+                down_block_res_samples, mid_block_res_sample = controlnet(
+                    noisy_latents,
+                    timesteps,
+                    encoder_hidden_states=encoder_hidden_states,
+                    controlnet_cond=controlnet_image,
+                    return_dict=False,
+                )
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_latents,
+                    timesteps,
+                    encoder_hidden_states=encoder_hidden_states,
+                    down_block_additional_residuals=[
+                        sample.to(dtype=weight_dtype) for sample in down_block_res_samples
+                    ],
+                    mid_block_additional_residual=mid_block_res_sample.to(dtype=weight_dtype),
+                    return_dict=False,
+                )[0]
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = controlnet.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        image_logs = log_validation(
+                            vae,
+                            text_encoder,
+                            tokenizer,
+                            unet,
+                            controlnet,
+                            args,
+                            accelerator,
+                            weight_dtype,
+                            global_step,
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        controlnet = unwrap_model(controlnet)
+        controlnet.save_pretrained(args.output_dir)
+
+        # Run a final round of validation.
+        image_logs = None
+        if args.validation_prompt is not None:
+            image_logs = log_validation(
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                unet=unet,
+                controlnet=None,
+                args=args,
+                accelerator=accelerator,
+                weight_dtype=weight_dtype,
+                step=global_step,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                image_logs=image_logs,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/controlnet/train_controlnet_flax.py b/diffusers/examples/controlnet/train_controlnet_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aa9e881fca53f87c1901747c08032adcb27cb2d
--- /dev/null
+++ b/diffusers/examples/controlnet/train_controlnet_flax.py
@@ -0,0 +1,1152 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import logging
+import math
+import os
+import random
+import time
+from pathlib import Path
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+import optax
+import torch
+import torch.utils.checkpoint
+import transformers
+from datasets import load_dataset, load_from_disk
+from flax import jax_utils
+from flax.core.frozen_dict import unfreeze
+from flax.training import train_state
+from flax.training.common_utils import shard
+from huggingface_hub import create_repo, upload_folder
+from PIL import Image, PngImagePlugin
+from torch.utils.data import IterableDataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTokenizer, FlaxCLIPTextModel, set_seed
+
+from diffusers import (
+    FlaxAutoencoderKL,
+    FlaxControlNetModel,
+    FlaxDDPMScheduler,
+    FlaxStableDiffusionControlNetPipeline,
+    FlaxUNet2DConditionModel,
+)
+from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+
+
+# To prevent an error that occurs when there are abnormally large compressed data chunk in the png image
+# see more https://github.com/python-pillow/Pillow/issues/5610
+LARGE_ENOUGH_NUMBER = 100
+PngImagePlugin.MAX_TEXT_CHUNK = LARGE_ENOUGH_NUMBER * (1024**2)
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = logging.getLogger(__name__)
+
+
+def log_validation(pipeline, pipeline_params, controlnet_params, tokenizer, args, rng, weight_dtype):
+    logger.info("Running validation...")
+
+    pipeline_params = pipeline_params.copy()
+    pipeline_params["controlnet"] = controlnet_params
+
+    num_samples = jax.device_count()
+    prng_seed = jax.random.split(rng, jax.device_count())
+
+    if len(args.validation_image) == len(args.validation_prompt):
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_image) == 1:
+        validation_images = args.validation_image * len(args.validation_prompt)
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_prompt) == 1:
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt * len(args.validation_image)
+    else:
+        raise ValueError(
+            "number of `args.validation_image` and `args.validation_prompt` should be checked in `parse_args`"
+        )
+
+    image_logs = []
+
+    for validation_prompt, validation_image in zip(validation_prompts, validation_images):
+        prompts = num_samples * [validation_prompt]
+        prompt_ids = pipeline.prepare_text_inputs(prompts)
+        prompt_ids = shard(prompt_ids)
+
+        validation_image = Image.open(validation_image).convert("RGB")
+        processed_image = pipeline.prepare_image_inputs(num_samples * [validation_image])
+        processed_image = shard(processed_image)
+        images = pipeline(
+            prompt_ids=prompt_ids,
+            image=processed_image,
+            params=pipeline_params,
+            prng_seed=prng_seed,
+            num_inference_steps=50,
+            jit=True,
+        ).images
+
+        images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
+        images = pipeline.numpy_to_pil(images)
+
+        image_logs.append(
+            {"validation_image": validation_image, "images": images, "validation_prompt": validation_prompt}
+        )
+
+    if args.report_to == "wandb":
+        formatted_images = []
+        for log in image_logs:
+            images = log["images"]
+            validation_prompt = log["validation_prompt"]
+            validation_image = log["validation_image"]
+
+            formatted_images.append(wandb.Image(validation_image, caption="Controlnet conditioning"))
+            for image in images:
+                image = wandb.Image(image, caption=validation_prompt)
+                formatted_images.append(image)
+
+        wandb.log({"validation": formatted_images})
+    else:
+        logger.warning(f"image logging not implemented for {args.report_to}")
+
+    return image_logs
+
+
+def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=None):
+    img_str = ""
+    if image_logs is not None:
+        for i, log in enumerate(image_logs):
+            images = log["images"]
+            validation_prompt = log["validation_prompt"]
+            validation_image = log["validation_image"]
+            validation_image.save(os.path.join(repo_folder, "image_control.png"))
+            img_str += f"prompt: {validation_prompt}\n"
+            images = [validation_image] + images
+            make_image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
+            img_str += f"![images_{i})](./images_{i}.png)\n"
+
+    model_description = f"""
+# controlnet- {repo_id}
+
+These are controlnet weights trained on {base_model} with new type of conditioning. You can find some example images in the following. \n
+{img_str}
+"""
+
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion",
+        "stable-diffusion-diffusers",
+        "text-to-image",
+        "diffusers",
+        "controlnet",
+        "jax-diffusers-event",
+        "diffusers-training",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--controlnet_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
+        " If not specified controlnet weights are initialized from unet.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--from_pt",
+        action="store_true",
+        help="Load the pretrained model from a PyTorch checkpoint.",
+    )
+    parser.add_argument(
+        "--controlnet_revision",
+        type=str,
+        default=None,
+        help="Revision of controlnet model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--profile_steps",
+        type=int,
+        default=0,
+        help="How many training steps to profile in the beginning.",
+    )
+    parser.add_argument(
+        "--profile_validation",
+        action="store_true",
+        help="Whether to profile the (last) validation.",
+    )
+    parser.add_argument(
+        "--profile_memory",
+        action="store_true",
+        help="Whether to dump an initial (before training loop) and a final (at program end) memory profile.",
+    )
+    parser.add_argument(
+        "--ccache",
+        type=str,
+        default=None,
+        help="Enables compilation cache.",
+    )
+    parser.add_argument(
+        "--controlnet_from_pt",
+        action="store_true",
+        help="Load the controlnet model from a PyTorch checkpoint.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="runs/{timestamp}",
+        help="The output directory where the model predictions and checkpoints will be written. "
+        "Can contain placeholders: {timestamp}.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=0, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=1, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=5000,
+        help=("Save a checkpoint of the training state every X updates."),
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_steps",
+        type=int,
+        default=100,
+        help=("log training metric every X steps to `--report_t`"),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="wandb",
+        help=('The integration to report the results and logs to. Currently only supported platforms are `"wandb"`'),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument("--streaming", action="store_true", help="To stream a large dataset from Hub.")
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training dataset. By default it will use `load_dataset` method to load a custom dataset from the folder."
+            "Folder must contain a dataset script as described here https://huggingface.co/docs/datasets/dataset_script) ."
+            "If `--load_from_disk` flag is passed, it will use `load_from_disk` method instead. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--load_from_disk",
+        action="store_true",
+        help=(
+            "If True, will load a dataset that was previously saved using `save_to_disk` from `--train_data_dir`"
+            "See more https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.load_from_disk"
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing the target image."
+    )
+    parser.add_argument(
+        "--conditioning_image_column",
+        type=str,
+        default="conditioning_image",
+        help="The column of the dataset containing the controlnet conditioning image.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set. Needed if `streaming` is set to True."
+        ),
+    )
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of prompts evaluated every `--validation_steps` and logged to `--report_to`."
+            " Provide either a matching number of `--validation_image`s, a single `--validation_image`"
+            " to be used with all prompts, or a single prompt that will be used with all `--validation_image`s."
+        ),
+    )
+    parser.add_argument(
+        "--validation_image",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of paths to the controlnet conditioning image be evaluated every `--validation_steps`"
+            " and logged to `--report_to`. Provide either a matching number of `--validation_prompt`s, a"
+            " a single `--validation_prompt` to be used with all `--validation_image`s, or a single"
+            " `--validation_image` that will be used with all `--validation_prompt`s."
+        ),
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` and logging the images."
+        ),
+    )
+    parser.add_argument("--wandb_entity", type=str, default=None, help=("The wandb entity to use (for teams)."))
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="train_controlnet_flax",
+        help=("The `project` argument passed to wandb"),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps", type=int, default=1, help="Number of steps to accumulate gradients over"
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    args.output_dir = args.output_dir.replace("{timestamp}", time.strftime("%Y%m%d_%H%M%S"))
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+    if args.dataset_name is not None and args.train_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`")
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    if args.validation_prompt is not None and args.validation_image is None:
+        raise ValueError("`--validation_image` must be set if `--validation_prompt` is set")
+
+    if args.validation_prompt is None and args.validation_image is not None:
+        raise ValueError("`--validation_prompt` must be set if `--validation_image` is set")
+
+    if (
+        args.validation_image is not None
+        and args.validation_prompt is not None
+        and len(args.validation_image) != 1
+        and len(args.validation_prompt) != 1
+        and len(args.validation_image) != len(args.validation_prompt)
+    ):
+        raise ValueError(
+            "Must provide either 1 `--validation_image`, 1 `--validation_prompt`,"
+            " or the same number of `--validation_prompt`s and `--validation_image`s"
+        )
+
+    # This idea comes from
+    # https://github.com/borisdayma/dalle-mini/blob/d2be512d4a6a9cda2d63ba04afc33038f98f705f/src/dalle_mini/data.py#L370
+    if args.streaming and args.max_train_samples is None:
+        raise ValueError("You must specify `max_train_samples` when using dataset streaming.")
+
+    return args
+
+
+def make_train_dataset(args, tokenizer, batch_size=None):
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            streaming=args.streaming,
+        )
+    else:
+        if args.train_data_dir is not None:
+            if args.load_from_disk:
+                dataset = load_from_disk(
+                    args.train_data_dir,
+                )
+            else:
+                dataset = load_dataset(
+                    args.train_data_dir,
+                    cache_dir=args.cache_dir,
+                )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    if isinstance(dataset["train"], IterableDataset):
+        column_names = next(iter(dataset["train"])).keys()
+    else:
+        column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    if args.image_column is None:
+        image_column = column_names[0]
+        logger.info(f"image column defaulting to {image_column}")
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    if args.caption_column is None:
+        caption_column = column_names[1]
+        logger.info(f"caption column defaulting to {caption_column}")
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    if args.conditioning_image_column is None:
+        conditioning_image_column = column_names[2]
+        logger.info(f"conditioning image column defaulting to {caption_column}")
+    else:
+        conditioning_image_column = args.conditioning_image_column
+        if conditioning_image_column not in column_names:
+            raise ValueError(
+                f"`--conditioning_image_column` value '{args.conditioning_image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if random.random() < args.proportion_empty_prompts:
+                captions.append("")
+            elif isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids
+
+    image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    conditioning_image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        images = [image_transforms(image) for image in images]
+
+        conditioning_images = [image.convert("RGB") for image in examples[conditioning_image_column]]
+        conditioning_images = [conditioning_image_transforms(image) for image in conditioning_images]
+
+        examples["pixel_values"] = images
+        examples["conditioning_pixel_values"] = conditioning_images
+        examples["input_ids"] = tokenize_captions(examples)
+
+        return examples
+
+    if jax.process_index() == 0:
+        if args.max_train_samples is not None:
+            if args.streaming:
+                dataset["train"] = dataset["train"].shuffle(seed=args.seed).take(args.max_train_samples)
+            else:
+                dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        if args.streaming:
+            train_dataset = dataset["train"].map(
+                preprocess_train,
+                batched=True,
+                batch_size=batch_size,
+                remove_columns=list(dataset["train"].features.keys()),
+            )
+        else:
+            train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    return train_dataset
+
+
+def collate_fn(examples):
+    pixel_values = torch.stack([example["pixel_values"] for example in examples])
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    conditioning_pixel_values = torch.stack([example["conditioning_pixel_values"] for example in examples])
+    conditioning_pixel_values = conditioning_pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    input_ids = torch.stack([example["input_ids"] for example in examples])
+
+    batch = {
+        "pixel_values": pixel_values,
+        "conditioning_pixel_values": conditioning_pixel_values,
+        "input_ids": input_ids,
+    }
+    batch = {k: v.numpy() for k, v in batch.items()}
+    return batch
+
+
+def get_params_to_save(params):
+    return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    # wandb init
+    if jax.process_index() == 0 and args.report_to == "wandb":
+        wandb.init(
+            entity=args.wandb_entity,
+            project=args.tracker_project_name,
+            job_type="train",
+            config=args,
+        )
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    rng = jax.random.PRNGKey(0)
+
+    # Handle the repository creation
+    if jax.process_index() == 0:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer and add the placeholder token as a additional special token
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+        )
+    else:
+        raise NotImplementedError("No tokenizer specified!")
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    total_train_batch_size = args.train_batch_size * jax.local_device_count() * args.gradient_accumulation_steps
+    train_dataset = make_train_dataset(args, tokenizer, batch_size=total_train_batch_size)
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=not args.streaming,
+        collate_fn=collate_fn,
+        batch_size=total_train_batch_size,
+        num_workers=args.dataloader_num_workers,
+        drop_last=True,
+    )
+
+    weight_dtype = jnp.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = jnp.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = jnp.bfloat16
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = FlaxCLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        dtype=weight_dtype,
+        revision=args.revision,
+        from_pt=args.from_pt,
+    )
+    vae, vae_params = FlaxAutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        revision=args.revision,
+        subfolder="vae",
+        dtype=weight_dtype,
+        from_pt=args.from_pt,
+    )
+    unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="unet",
+        dtype=weight_dtype,
+        revision=args.revision,
+        from_pt=args.from_pt,
+    )
+
+    if args.controlnet_model_name_or_path:
+        logger.info("Loading existing controlnet weights")
+        controlnet, controlnet_params = FlaxControlNetModel.from_pretrained(
+            args.controlnet_model_name_or_path,
+            revision=args.controlnet_revision,
+            from_pt=args.controlnet_from_pt,
+            dtype=jnp.float32,
+        )
+    else:
+        logger.info("Initializing controlnet weights from unet")
+        rng, rng_params = jax.random.split(rng)
+
+        controlnet = FlaxControlNetModel(
+            in_channels=unet.config.in_channels,
+            down_block_types=unet.config.down_block_types,
+            only_cross_attention=unet.config.only_cross_attention,
+            block_out_channels=unet.config.block_out_channels,
+            layers_per_block=unet.config.layers_per_block,
+            attention_head_dim=unet.config.attention_head_dim,
+            cross_attention_dim=unet.config.cross_attention_dim,
+            use_linear_projection=unet.config.use_linear_projection,
+            flip_sin_to_cos=unet.config.flip_sin_to_cos,
+            freq_shift=unet.config.freq_shift,
+        )
+        controlnet_params = controlnet.init_weights(rng=rng_params)
+        controlnet_params = unfreeze(controlnet_params)
+        for key in [
+            "conv_in",
+            "time_embedding",
+            "down_blocks_0",
+            "down_blocks_1",
+            "down_blocks_2",
+            "down_blocks_3",
+            "mid_block",
+        ]:
+            controlnet_params[key] = unet_params[key]
+
+    pipeline, pipeline_params = FlaxStableDiffusionControlNetPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        tokenizer=tokenizer,
+        controlnet=controlnet,
+        safety_checker=None,
+        dtype=weight_dtype,
+        revision=args.revision,
+        from_pt=args.from_pt,
+    )
+    pipeline_params = jax_utils.replicate(pipeline_params)
+
+    # Optimization
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * total_train_batch_size
+
+    constant_scheduler = optax.constant_schedule(args.learning_rate)
+
+    adamw = optax.adamw(
+        learning_rate=constant_scheduler,
+        b1=args.adam_beta1,
+        b2=args.adam_beta2,
+        eps=args.adam_epsilon,
+        weight_decay=args.adam_weight_decay,
+    )
+
+    optimizer = optax.chain(
+        optax.clip_by_global_norm(args.max_grad_norm),
+        adamw,
+    )
+
+    state = train_state.TrainState.create(apply_fn=controlnet.__call__, params=controlnet_params, tx=optimizer)
+
+    noise_scheduler, noise_scheduler_state = FlaxDDPMScheduler.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="scheduler"
+    )
+
+    # Initialize our training
+    validation_rng, train_rngs = jax.random.split(rng)
+    train_rngs = jax.random.split(train_rngs, jax.local_device_count())
+
+    def compute_snr(timesteps):
+        """
+        Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
+        """
+        alphas_cumprod = noise_scheduler_state.common.alphas_cumprod
+        sqrt_alphas_cumprod = alphas_cumprod**0.5
+        sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
+
+        alpha = sqrt_alphas_cumprod[timesteps]
+        sigma = sqrt_one_minus_alphas_cumprod[timesteps]
+        # Compute SNR.
+        snr = (alpha / sigma) ** 2
+        return snr
+
+    def train_step(state, unet_params, text_encoder_params, vae_params, batch, train_rng):
+        # reshape batch, add grad_step_dim if gradient_accumulation_steps > 1
+        if args.gradient_accumulation_steps > 1:
+            grad_steps = args.gradient_accumulation_steps
+            batch = jax.tree_map(lambda x: x.reshape((grad_steps, x.shape[0] // grad_steps) + x.shape[1:]), batch)
+
+        def compute_loss(params, minibatch, sample_rng):
+            # Convert images to latent space
+            vae_outputs = vae.apply(
+                {"params": vae_params}, minibatch["pixel_values"], deterministic=True, method=vae.encode
+            )
+            latents = vae_outputs.latent_dist.sample(sample_rng)
+            # (NHWC) -> (NCHW)
+            latents = jnp.transpose(latents, (0, 3, 1, 2))
+            latents = latents * vae.config.scaling_factor
+
+            # Sample noise that we'll add to the latents
+            noise_rng, timestep_rng = jax.random.split(sample_rng)
+            noise = jax.random.normal(noise_rng, latents.shape)
+            # Sample a random timestep for each image
+            bsz = latents.shape[0]
+            timesteps = jax.random.randint(
+                timestep_rng,
+                (bsz,),
+                0,
+                noise_scheduler.config.num_train_timesteps,
+            )
+
+            # Add noise to the latents according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)
+
+            # Get the text embedding for conditioning
+            encoder_hidden_states = text_encoder(
+                minibatch["input_ids"],
+                params=text_encoder_params,
+                train=False,
+            )[0]
+
+            controlnet_cond = minibatch["conditioning_pixel_values"]
+
+            # Predict the noise residual and compute loss
+            down_block_res_samples, mid_block_res_sample = controlnet.apply(
+                {"params": params},
+                noisy_latents,
+                timesteps,
+                encoder_hidden_states,
+                controlnet_cond,
+                train=True,
+                return_dict=False,
+            )
+
+            model_pred = unet.apply(
+                {"params": unet_params},
+                noisy_latents,
+                timesteps,
+                encoder_hidden_states,
+                down_block_additional_residuals=down_block_res_samples,
+                mid_block_additional_residual=mid_block_res_sample,
+            ).sample
+
+            # Get the target for loss depending on the prediction type
+            if noise_scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif noise_scheduler.config.prediction_type == "v_prediction":
+                target = noise_scheduler.get_velocity(noise_scheduler_state, latents, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+            loss = (target - model_pred) ** 2
+
+            if args.snr_gamma is not None:
+                snr = jnp.array(compute_snr(timesteps))
+                snr_loss_weights = jnp.where(snr < args.snr_gamma, snr, jnp.ones_like(snr) * args.snr_gamma)
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    snr_loss_weights = snr_loss_weights / snr
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    snr_loss_weights = snr_loss_weights / (snr + 1)
+
+                loss = loss * snr_loss_weights
+
+            loss = loss.mean()
+
+            return loss
+
+        grad_fn = jax.value_and_grad(compute_loss)
+
+        # get a minibatch (one gradient accumulation slice)
+        def get_minibatch(batch, grad_idx):
+            return jax.tree_util.tree_map(
+                lambda x: jax.lax.dynamic_index_in_dim(x, grad_idx, keepdims=False),
+                batch,
+            )
+
+        def loss_and_grad(grad_idx, train_rng):
+            # create minibatch for the grad step
+            minibatch = get_minibatch(batch, grad_idx) if grad_idx is not None else batch
+            sample_rng, train_rng = jax.random.split(train_rng, 2)
+            loss, grad = grad_fn(state.params, minibatch, sample_rng)
+            return loss, grad, train_rng
+
+        if args.gradient_accumulation_steps == 1:
+            loss, grad, new_train_rng = loss_and_grad(None, train_rng)
+        else:
+            init_loss_grad_rng = (
+                0.0,  # initial value for cumul_loss
+                jax.tree_map(jnp.zeros_like, state.params),  # initial value for cumul_grad
+                train_rng,  # initial value for train_rng
+            )
+
+            def cumul_grad_step(grad_idx, loss_grad_rng):
+                cumul_loss, cumul_grad, train_rng = loss_grad_rng
+                loss, grad, new_train_rng = loss_and_grad(grad_idx, train_rng)
+                cumul_loss, cumul_grad = jax.tree_map(jnp.add, (cumul_loss, cumul_grad), (loss, grad))
+                return cumul_loss, cumul_grad, new_train_rng
+
+            loss, grad, new_train_rng = jax.lax.fori_loop(
+                0,
+                args.gradient_accumulation_steps,
+                cumul_grad_step,
+                init_loss_grad_rng,
+            )
+            loss, grad = jax.tree_map(lambda x: x / args.gradient_accumulation_steps, (loss, grad))
+
+        grad = jax.lax.pmean(grad, "batch")
+
+        new_state = state.apply_gradients(grads=grad)
+
+        metrics = {"loss": loss}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+
+        def l2(xs):
+            return jnp.sqrt(sum([jnp.vdot(x, x) for x in jax.tree_util.tree_leaves(xs)]))
+
+        metrics["l2_grads"] = l2(jax.tree_util.tree_leaves(grad))
+
+        return new_state, metrics, new_train_rng
+
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    unet_params = jax_utils.replicate(unet_params)
+    text_encoder_params = jax_utils.replicate(text_encoder.params)
+    vae_params = jax_utils.replicate(vae_params)
+
+    # Train!
+    if args.streaming:
+        dataset_length = args.max_train_samples
+    else:
+        dataset_length = len(train_dataloader)
+    num_update_steps_per_epoch = math.ceil(dataset_length / args.gradient_accumulation_steps)
+
+    # Scheduler and math around the number of training steps.
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {args.max_train_samples if args.streaming else len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
+    logger.info(f"  Total optimization steps = {args.num_train_epochs * num_update_steps_per_epoch}")
+
+    if jax.process_index() == 0 and args.report_to == "wandb":
+        wandb.define_metric("*", step_metric="train/step")
+        wandb.define_metric("train/step", step_metric="walltime")
+        wandb.config.update(
+            {
+                "num_train_examples": args.max_train_samples if args.streaming else len(train_dataset),
+                "total_train_batch_size": total_train_batch_size,
+                "total_optimization_step": args.num_train_epochs * num_update_steps_per_epoch,
+                "num_devices": jax.device_count(),
+                "controlnet_params": sum(np.prod(x.shape) for x in jax.tree_util.tree_leaves(state.params)),
+            }
+        )
+
+    global_step = step0 = 0
+    epochs = tqdm(
+        range(args.num_train_epochs),
+        desc="Epoch ... ",
+        position=0,
+        disable=jax.process_index() > 0,
+    )
+    if args.profile_memory:
+        jax.profiler.save_device_memory_profile(os.path.join(args.output_dir, "memory_initial.prof"))
+    t00 = t0 = time.monotonic()
+    for epoch in epochs:
+        # ======================== Training ================================
+
+        train_metrics = []
+        train_metric = None
+
+        steps_per_epoch = (
+            args.max_train_samples // total_train_batch_size
+            if args.streaming or args.max_train_samples
+            else len(train_dataset) // total_train_batch_size
+        )
+        train_step_progress_bar = tqdm(
+            total=steps_per_epoch,
+            desc="Training...",
+            position=1,
+            leave=False,
+            disable=jax.process_index() > 0,
+        )
+        # train
+        for batch in train_dataloader:
+            if args.profile_steps and global_step == 1:
+                train_metric["loss"].block_until_ready()
+                jax.profiler.start_trace(args.output_dir)
+            if args.profile_steps and global_step == 1 + args.profile_steps:
+                train_metric["loss"].block_until_ready()
+                jax.profiler.stop_trace()
+
+            batch = shard(batch)
+            with jax.profiler.StepTraceAnnotation("train", step_num=global_step):
+                state, train_metric, train_rngs = p_train_step(
+                    state, unet_params, text_encoder_params, vae_params, batch, train_rngs
+                )
+            train_metrics.append(train_metric)
+
+            train_step_progress_bar.update(1)
+
+            global_step += 1
+            if global_step >= args.max_train_steps:
+                break
+
+            if (
+                args.validation_prompt is not None
+                and global_step % args.validation_steps == 0
+                and jax.process_index() == 0
+            ):
+                _ = log_validation(
+                    pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype
+                )
+
+            if global_step % args.logging_steps == 0 and jax.process_index() == 0:
+                if args.report_to == "wandb":
+                    train_metrics = jax_utils.unreplicate(train_metrics)
+                    train_metrics = jax.tree_util.tree_map(lambda *m: jnp.array(m).mean(), *train_metrics)
+                    wandb.log(
+                        {
+                            "walltime": time.monotonic() - t00,
+                            "train/step": global_step,
+                            "train/epoch": global_step / dataset_length,
+                            "train/steps_per_sec": (global_step - step0) / (time.monotonic() - t0),
+                            **{f"train/{k}": v for k, v in train_metrics.items()},
+                        }
+                    )
+                t0, step0 = time.monotonic(), global_step
+                train_metrics = []
+            if global_step % args.checkpointing_steps == 0 and jax.process_index() == 0:
+                controlnet.save_pretrained(
+                    f"{args.output_dir}/{global_step}",
+                    params=get_params_to_save(state.params),
+                )
+
+        train_metric = jax_utils.unreplicate(train_metric)
+        train_step_progress_bar.close()
+        epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
+
+    # Final validation & store model.
+    if jax.process_index() == 0:
+        if args.validation_prompt is not None:
+            if args.profile_validation:
+                jax.profiler.start_trace(args.output_dir)
+            image_logs = log_validation(
+                pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype
+            )
+            if args.profile_validation:
+                jax.profiler.stop_trace()
+        else:
+            image_logs = None
+
+        controlnet.save_pretrained(
+            args.output_dir,
+            params=get_params_to_save(state.params),
+        )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                image_logs=image_logs,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    if args.profile_memory:
+        jax.profiler.save_device_memory_profile(os.path.join(args.output_dir, "memory_final.prof"))
+    logger.info("Finished training.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/controlnet/train_controlnet_sd3.py b/diffusers/examples/controlnet/train_controlnet_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b255c501d99aadd39015d99056966c7bfa18a6f
--- /dev/null
+++ b/diffusers/examples/controlnet/train_controlnet_sd3.py
@@ -0,0 +1,1416 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import contextlib
+import copy
+import functools
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import accelerate
+import numpy as np
+import torch
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTokenizer, PretrainedConfig, T5TokenizerFast
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3ControlNetModel,
+    SD3Transformer2DModel,
+    StableDiffusion3ControlNetPipeline,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import (
+    clear_objs_and_retain_memory,
+    compute_density_for_timestep_sampling,
+    compute_loss_weighting_for_sd3,
+)
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.30.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def image_grid(imgs, rows, cols):
+    assert len(imgs) == rows * cols
+
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+
+
+def log_validation(controlnet, args, accelerator, weight_dtype, step, is_final_validation=False):
+    logger.info("Running validation... ")
+
+    if not is_final_validation:
+        controlnet = accelerator.unwrap_model(controlnet)
+    else:
+        controlnet = SD3ControlNetModel.from_pretrained(args.output_dir, torch_dtype=weight_dtype)
+
+    pipeline = StableDiffusion3ControlNetPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        controlnet=controlnet,
+        safety_checker=None,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(torch.device(accelerator.device))
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.manual_seed(args.seed)
+
+    if len(args.validation_image) == len(args.validation_prompt):
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_image) == 1:
+        validation_images = args.validation_image * len(args.validation_prompt)
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_prompt) == 1:
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt * len(args.validation_image)
+    else:
+        raise ValueError(
+            "number of `args.validation_image` and `args.validation_prompt` should be checked in `parse_args`"
+        )
+
+    image_logs = []
+    inference_ctx = contextlib.nullcontext() if is_final_validation else torch.autocast(accelerator.device.type)
+
+    for validation_prompt, validation_image in zip(validation_prompts, validation_images):
+        validation_image = Image.open(validation_image).convert("RGB")
+
+        images = []
+
+        for _ in range(args.num_validation_images):
+            with inference_ctx:
+                image = pipeline(
+                    validation_prompt, control_image=validation_image, num_inference_steps=20, generator=generator
+                ).images[0]
+
+            images.append(image)
+
+        image_logs.append(
+            {"validation_image": validation_image, "images": images, "validation_prompt": validation_prompt}
+        )
+
+    tracker_key = "test" if is_final_validation else "validation"
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                validation_image = log["validation_image"]
+
+                tracker.writer.add_image(
+                    "Controlnet conditioning", np.asarray([validation_image]), step, dataformats="NHWC"
+                )
+
+                formatted_images = []
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                validation_image = log["validation_image"]
+
+                formatted_images.append(wandb.Image(validation_image, caption="Controlnet conditioning"))
+
+                for image in images:
+                    image = wandb.Image(image, caption=validation_prompt)
+                    formatted_images.append(image)
+
+            tracker.log({tracker_key: formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+        clear_objs_and_retain_memory(pipeline)
+
+        if not is_final_validation:
+            controlnet.to(accelerator.device)
+
+        return image_logs
+
+
+# Copied from dreambooth sd3 example
+def load_text_encoders(class_one, class_two, class_three):
+    text_encoder_one = class_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = class_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    text_encoder_three = class_three.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_3", revision=args.revision, variant=args.variant
+    )
+    return text_encoder_one, text_encoder_two, text_encoder_three
+
+
+# Copied from dreambooth sd3 example
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+    if model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=None):
+    img_str = ""
+    if image_logs is not None:
+        img_str = "You can find some example images below.\n\n"
+        for i, log in enumerate(image_logs):
+            images = log["images"]
+            validation_prompt = log["validation_prompt"]
+            validation_image = log["validation_image"]
+            validation_image.save(os.path.join(repo_folder, "image_control.png"))
+            img_str += f"prompt: {validation_prompt}\n"
+            images = [validation_image] + images
+            image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
+            img_str += f"![images_{i})](./images_{i}.png)\n"
+
+    model_description = f"""
+# SD3 controlnet-{repo_id}
+
+These are controlnet weights trained on {base_model} with new type of conditioning.
+The weights were trained using [ControlNet](https://github.com/lllyasviel/ControlNet) with the [SD3 diffusers trainer](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/README_sd3.md).
+{img_str}
+
+Please adhere to the licensing terms as described `[here](https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE)`.
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="openrail++",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+        "sd3",
+        "sd3-diffusers",
+        "controlnet",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--controlnet_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
+        " If not specified controlnet weights are initialized from unet.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="controlnet-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
+            "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
+            "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
+            "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
+            "instructions."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="logit_normal",
+        choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"],
+    )
+    parser.add_argument(
+        "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
+    )
+    parser.add_argument(
+        "--precondition_outputs",
+        type=int,
+        default=1,
+        help="Flag indicating if we are preconditioning the model outputs or not as done in EDM. This affects how "
+        "model `target` is calculated.",
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing the target image."
+    )
+    parser.add_argument(
+        "--conditioning_image_column",
+        type=str,
+        default="conditioning_image",
+        help="The column of the dataset containing the controlnet conditioning image.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=77,
+        help="Maximum sequence length to use with with the T5 text encoder",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of prompts evaluated every `--validation_steps` and logged to `--report_to`."
+            " Provide either a matching number of `--validation_image`s, a single `--validation_image`"
+            " to be used with all prompts, or a single prompt that will be used with all `--validation_image`s."
+        ),
+    )
+    parser.add_argument(
+        "--validation_image",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of paths to the controlnet conditioning image be evaluated every `--validation_steps`"
+            " and logged to `--report_to`. Provide either a matching number of `--validation_prompt`s, a"
+            " a single `--validation_prompt` to be used with all `--validation_image`s, or a single"
+            " `--validation_image` that will be used with all `--validation_prompt`s."
+        ),
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images to be generated for each `--validation_image`, `--validation_prompt` pair",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="train_controlnet",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--train_data_dir`")
+
+    if args.dataset_name is not None and args.train_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`")
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    if args.validation_prompt is not None and args.validation_image is None:
+        raise ValueError("`--validation_image` must be set if `--validation_prompt` is set")
+
+    if args.validation_prompt is None and args.validation_image is not None:
+        raise ValueError("`--validation_prompt` must be set if `--validation_image` is set")
+
+    if (
+        args.validation_image is not None
+        and args.validation_prompt is not None
+        and len(args.validation_image) != 1
+        and len(args.validation_prompt) != 1
+        and len(args.validation_image) != len(args.validation_prompt)
+    ):
+        raise ValueError(
+            "Must provide either 1 `--validation_image`, 1 `--validation_prompt`,"
+            " or the same number of `--validation_prompt`s and `--validation_image`s"
+        )
+
+    if args.resolution % 8 != 0:
+        raise ValueError(
+            "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder."
+        )
+
+    return args
+
+
+def make_train_dataset(args, tokenizer_one, tokenizer_two, tokenizer_three, accelerator):
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        if args.train_data_dir is not None:
+            dataset = load_dataset(
+                args.train_data_dir,
+                cache_dir=args.cache_dir,
+            )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    if args.image_column is None:
+        image_column = column_names[0]
+        logger.info(f"image column defaulting to {image_column}")
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    if args.caption_column is None:
+        caption_column = column_names[1]
+        logger.info(f"caption column defaulting to {caption_column}")
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    if args.conditioning_image_column is None:
+        conditioning_image_column = column_names[2]
+        logger.info(f"conditioning image column defaulting to {conditioning_image_column}")
+    else:
+        conditioning_image_column = args.conditioning_image_column
+        if conditioning_image_column not in column_names:
+            raise ValueError(
+                f"`--conditioning_image_column` value '{args.conditioning_image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    def process_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if random.random() < args.proportion_empty_prompts:
+                captions.append("")
+            elif isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        return captions
+
+    image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    conditioning_image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        images = [image_transforms(image) for image in images]
+
+        conditioning_images = [image.convert("RGB") for image in examples[conditioning_image_column]]
+        conditioning_images = [conditioning_image_transforms(image) for image in conditioning_images]
+
+        examples["pixel_values"] = images
+        examples["conditioning_pixel_values"] = conditioning_images
+        examples["prompts"] = process_captions(examples)
+
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    return train_dataset
+
+
+def collate_fn(examples):
+    pixel_values = torch.stack([example["pixel_values"] for example in examples])
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    conditioning_pixel_values = torch.stack([example["conditioning_pixel_values"] for example in examples])
+    conditioning_pixel_values = conditioning_pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    prompt_embeds = torch.stack([torch.tensor(example["prompt_embeds"]) for example in examples])
+    pooled_prompt_embeds = torch.stack([torch.tensor(example["pooled_prompt_embeds"]) for example in examples])
+
+    return {
+        "pixel_values": pixel_values,
+        "conditioning_pixel_values": conditioning_pixel_values,
+        "prompt_embeds": prompt_embeds,
+        "pooled_prompt_embeds": pooled_prompt_embeds,
+    }
+
+
+# Copied from dreambooth sd3 example
+def _encode_prompt_with_t5(
+    text_encoder,
+    tokenizer,
+    max_sequence_length,
+    prompt=None,
+    num_images_per_prompt=1,
+    device=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=max_sequence_length,
+        truncation=True,
+        add_special_tokens=True,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    prompt_embeds = text_encoder(text_input_ids.to(device))[0]
+
+    dtype = text_encoder.dtype
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+    _, seq_len, _ = prompt_embeds.shape
+
+    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+    return prompt_embeds
+
+
+# Copied from dreambooth sd3 example
+def _encode_prompt_with_clip(
+    text_encoder,
+    tokenizer,
+    prompt: str,
+    device=None,
+    num_images_per_prompt: int = 1,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=77,
+        truncation=True,
+        return_tensors="pt",
+    )
+
+    text_input_ids = text_inputs.input_ids
+    prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+    pooled_prompt_embeds = prompt_embeds[0]
+    prompt_embeds = prompt_embeds.hidden_states[-2]
+    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+
+    _, seq_len, _ = prompt_embeds.shape
+    # duplicate text embeddings for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+    return prompt_embeds, pooled_prompt_embeds
+
+
+# Copied from dreambooth sd3 example
+def encode_prompt(
+    text_encoders,
+    tokenizers,
+    prompt: str,
+    max_sequence_length,
+    device=None,
+    num_images_per_prompt: int = 1,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+
+    clip_tokenizers = tokenizers[:2]
+    clip_text_encoders = text_encoders[:2]
+
+    clip_prompt_embeds_list = []
+    clip_pooled_prompt_embeds_list = []
+    for tokenizer, text_encoder in zip(clip_tokenizers, clip_text_encoders):
+        prompt_embeds, pooled_prompt_embeds = _encode_prompt_with_clip(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            prompt=prompt,
+            device=device if device is not None else text_encoder.device,
+            num_images_per_prompt=num_images_per_prompt,
+        )
+        clip_prompt_embeds_list.append(prompt_embeds)
+        clip_pooled_prompt_embeds_list.append(pooled_prompt_embeds)
+
+    clip_prompt_embeds = torch.cat(clip_prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = torch.cat(clip_pooled_prompt_embeds_list, dim=-1)
+
+    t5_prompt_embed = _encode_prompt_with_t5(
+        text_encoders[-1],
+        tokenizers[-1],
+        max_sequence_length,
+        prompt=prompt,
+        num_images_per_prompt=num_images_per_prompt,
+        device=device if device is not None else text_encoders[-1].device,
+    )
+
+    clip_prompt_embeds = torch.nn.functional.pad(
+        clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+    )
+    prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    tokenizer_one = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+    )
+    tokenizer_two = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+    )
+    tokenizer_three = T5TokenizerFast.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_3",
+        revision=args.revision,
+    )
+
+    # import correct text encoder class
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+    text_encoder_cls_three = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_3"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="scheduler"
+    )
+    noise_scheduler_copy = copy.deepcopy(noise_scheduler)
+    text_encoder_one, text_encoder_two, text_encoder_three = load_text_encoders(
+        text_encoder_cls_one, text_encoder_cls_two, text_encoder_cls_three
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    transformer = SD3Transformer2DModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant
+    )
+
+    if args.controlnet_model_name_or_path:
+        logger.info("Loading existing controlnet weights")
+        controlnet = SD3ControlNetModel.from_pretrained(args.controlnet_model_name_or_path)
+    else:
+        logger.info("Initializing controlnet weights from transformer")
+        controlnet = SD3ControlNetModel.from_transformer(transformer)
+
+    transformer.requires_grad_(False)
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    text_encoder_three.requires_grad_(False)
+    controlnet.train()
+
+    # Taken from [Sayak Paul's Diffusers PR #6511](https://github.com/huggingface/diffusers/pull/6511/files)
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                i = len(weights) - 1
+
+                while len(weights) > 0:
+                    weights.pop()
+                    model = models[i]
+
+                    sub_dir = "controlnet"
+                    model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+                    i -= 1
+
+        def load_model_hook(models, input_dir):
+            while len(models) > 0:
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = SD3ControlNetModel.from_pretrained(input_dir, subfolder="controlnet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        controlnet.enable_gradient_checkpointing()
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if unwrap_model(controlnet).dtype != torch.float32:
+        raise ValueError(
+            f"Controlnet loaded as datatype {unwrap_model(controlnet).dtype}. {low_precision_error_string}"
+        )
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = controlnet.parameters()
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae, transformer and text_encoder to device and cast to weight_dtype
+    vae.to(accelerator.device, dtype=torch.float32)
+    transformer.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_three.to(accelerator.device, dtype=weight_dtype)
+
+    train_dataset = make_train_dataset(args, tokenizer_one, tokenizer_two, tokenizer_three, accelerator)
+
+    tokenizers = [tokenizer_one, tokenizer_two, tokenizer_three]
+    text_encoders = [text_encoder_one, text_encoder_two, text_encoder_three]
+
+    def compute_text_embeddings(batch, text_encoders, tokenizers):
+        with torch.no_grad():
+            prompt = batch["prompts"]
+            prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                text_encoders, tokenizers, prompt, args.max_sequence_length
+            )
+            prompt_embeds = prompt_embeds.to(accelerator.device)
+            pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
+        return {"prompt_embeds": prompt_embeds, "pooled_prompt_embeds": pooled_prompt_embeds}
+
+    compute_embeddings_fn = functools.partial(
+        compute_text_embeddings,
+        text_encoders=text_encoders,
+        tokenizers=tokenizers,
+    )
+    with accelerator.main_process_first():
+        from datasets.fingerprint import Hasher
+
+        # fingerprint used by the cache for the other processes to load the result
+        # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401
+        new_fingerprint = Hasher.hash(args)
+        train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint)
+
+    clear_objs_and_retain_memory(text_encoders + tokenizers)
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        controlnet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+
+        # tensorboard cannot handle list types for config
+        tracker_config.pop("validation_prompt")
+        tracker_config.pop("validation_image")
+
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    image_logs = None
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(controlnet):
+                # Convert images to latent space
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+                model_input = (model_input - vae.config.shift_factor) * vae.config.scaling_factor
+                model_input = model_input.to(dtype=weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+                # Sample a random timestep for each image
+                # for weighting schemes where we sample timesteps non-uniformly
+                u = compute_density_for_timestep_sampling(
+                    weighting_scheme=args.weighting_scheme,
+                    batch_size=bsz,
+                    logit_mean=args.logit_mean,
+                    logit_std=args.logit_std,
+                    mode_scale=args.mode_scale,
+                )
+                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
+                timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+
+                # Add noise according to flow matching.
+                # zt = (1 - texp) * x + texp * z1
+                sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
+                noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
+
+                # Get the text embedding for conditioning
+                prompt_embeds = batch["prompt_embeds"]
+                pooled_prompt_embeds = batch["pooled_prompt_embeds"]
+
+                # controlnet(s) inference
+                controlnet_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype)
+                controlnet_image = vae.encode(controlnet_image).latent_dist.sample()
+                controlnet_image = controlnet_image * vae.config.scaling_factor
+
+                control_block_res_samples = controlnet(
+                    hidden_states=noisy_model_input,
+                    timestep=timesteps,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    controlnet_cond=controlnet_image,
+                    return_dict=False,
+                )[0]
+                control_block_res_samples = [sample.to(dtype=weight_dtype) for sample in control_block_res_samples]
+
+                # Predict the noise residual
+                model_pred = transformer(
+                    hidden_states=noisy_model_input,
+                    timestep=timesteps,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    block_controlnet_hidden_states=control_block_res_samples,
+                    return_dict=False,
+                )[0]
+
+                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                # Preconditioning of the model outputs.
+                if args.precondition_outputs:
+                    model_pred = model_pred * (-sigmas) + noisy_model_input
+
+                # these weighting schemes use a uniform timestep sampling
+                # and instead post-weight the loss
+                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+
+                # flow matching loss
+                if args.precondition_outputs:
+                    target = model_input
+                else:
+                    target = noise - model_input
+
+                # Compute regular loss.
+                loss = torch.mean(
+                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
+                )
+                loss = loss.mean()
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = controlnet.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        image_logs = log_validation(
+                            controlnet,
+                            args,
+                            accelerator,
+                            weight_dtype,
+                            global_step,
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        controlnet = unwrap_model(controlnet)
+        controlnet.save_pretrained(args.output_dir)
+
+        # Run a final round of validation.
+        image_logs = None
+        if args.validation_prompt is not None:
+            image_logs = log_validation(
+                controlnet=None,
+                args=args,
+                accelerator=accelerator,
+                weight_dtype=weight_dtype,
+                step=global_step,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                image_logs=image_logs,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/controlnet/train_controlnet_sdxl.py b/diffusers/examples/controlnet/train_controlnet_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..877ca61358499168a633480e86f349648e4e0e89
--- /dev/null
+++ b/diffusers/examples/controlnet/train_controlnet_sdxl.py
@@ -0,0 +1,1346 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import functools
+import gc
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import accelerate
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedType, ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    DDPMScheduler,
+    StableDiffusionXLControlNetPipeline,
+    UNet2DConditionModel,
+    UniPCMultistepScheduler,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+if is_torch_npu_available():
+    torch.npu.config.allow_internal_format = False
+
+
+def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step, is_final_validation=False):
+    logger.info("Running validation... ")
+
+    if not is_final_validation:
+        controlnet = accelerator.unwrap_model(controlnet)
+        pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            unet=unet,
+            controlnet=controlnet,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+    else:
+        controlnet = ControlNetModel.from_pretrained(args.output_dir, torch_dtype=weight_dtype)
+        if args.pretrained_vae_model_name_or_path is not None:
+            vae = AutoencoderKL.from_pretrained(args.pretrained_vae_model_name_or_path, torch_dtype=weight_dtype)
+        else:
+            vae = AutoencoderKL.from_pretrained(
+                args.pretrained_model_name_or_path, subfolder="vae", torch_dtype=weight_dtype
+            )
+
+        pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            controlnet=controlnet,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+
+    pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    if len(args.validation_image) == len(args.validation_prompt):
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_image) == 1:
+        validation_images = args.validation_image * len(args.validation_prompt)
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_prompt) == 1:
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt * len(args.validation_image)
+    else:
+        raise ValueError(
+            "number of `args.validation_image` and `args.validation_prompt` should be checked in `parse_args`"
+        )
+
+    image_logs = []
+    if is_final_validation or torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    for validation_prompt, validation_image in zip(validation_prompts, validation_images):
+        validation_image = Image.open(validation_image).convert("RGB")
+        validation_image = validation_image.resize((args.resolution, args.resolution))
+
+        images = []
+
+        for _ in range(args.num_validation_images):
+            with autocast_ctx:
+                image = pipeline(
+                    prompt=validation_prompt, image=validation_image, num_inference_steps=20, generator=generator
+                ).images[0]
+            images.append(image)
+
+        image_logs.append(
+            {"validation_image": validation_image, "images": images, "validation_prompt": validation_prompt}
+        )
+
+    tracker_key = "test" if is_final_validation else "validation"
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                validation_image = log["validation_image"]
+
+                formatted_images = []
+
+                formatted_images.append(np.asarray(validation_image))
+
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                validation_image = log["validation_image"]
+
+                formatted_images.append(wandb.Image(validation_image, caption="Controlnet conditioning"))
+
+                for image in images:
+                    image = wandb.Image(image, caption=validation_prompt)
+                    formatted_images.append(image)
+
+            tracker.log({tracker_key: formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+        del pipeline
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        return image_logs
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=None):
+    img_str = ""
+    if image_logs is not None:
+        img_str = "You can find some example images below.\n\n"
+        for i, log in enumerate(image_logs):
+            images = log["images"]
+            validation_prompt = log["validation_prompt"]
+            validation_image = log["validation_image"]
+            validation_image.save(os.path.join(repo_folder, "image_control.png"))
+            img_str += f"prompt: {validation_prompt}\n"
+            images = [validation_image] + images
+            make_image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
+            img_str += f"![images_{i})](./images_{i}.png)\n"
+
+    model_description = f"""
+# controlnet-{repo_id}
+
+These are controlnet weights trained on {base_model} with new type of conditioning.
+{img_str}
+"""
+
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="openrail++",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion-xl",
+        "stable-diffusion-xl-diffusers",
+        "text-to-image",
+        "diffusers",
+        "controlnet",
+        "diffusers-training",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to an improved VAE to stabilize training. For more details check out: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--controlnet_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
+        " If not specified controlnet weights are initialized from unet.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="controlnet-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--crops_coords_top_left_h",
+        type=int,
+        default=0,
+        help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."),
+    )
+    parser.add_argument(
+        "--crops_coords_top_left_w",
+        type=int,
+        default=0,
+        help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
+            "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
+            "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
+            "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
+            "instructions."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing the target image."
+    )
+    parser.add_argument(
+        "--conditioning_image_column",
+        type=str,
+        default="conditioning_image",
+        help="The column of the dataset containing the controlnet conditioning image.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of prompts evaluated every `--validation_steps` and logged to `--report_to`."
+            " Provide either a matching number of `--validation_image`s, a single `--validation_image`"
+            " to be used with all prompts, or a single prompt that will be used with all `--validation_image`s."
+        ),
+    )
+    parser.add_argument(
+        "--validation_image",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of paths to the controlnet conditioning image be evaluated every `--validation_steps`"
+            " and logged to `--report_to`. Provide either a matching number of `--validation_prompt`s, a"
+            " a single `--validation_prompt` to be used with all `--validation_image`s, or a single"
+            " `--validation_image` that will be used with all `--validation_prompt`s."
+        ),
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images to be generated for each `--validation_image`, `--validation_prompt` pair",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="sd_xl_train_controlnet",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--train_data_dir`")
+
+    if args.dataset_name is not None and args.train_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`")
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    if args.validation_prompt is not None and args.validation_image is None:
+        raise ValueError("`--validation_image` must be set if `--validation_prompt` is set")
+
+    if args.validation_prompt is None and args.validation_image is not None:
+        raise ValueError("`--validation_prompt` must be set if `--validation_image` is set")
+
+    if (
+        args.validation_image is not None
+        and args.validation_prompt is not None
+        and len(args.validation_image) != 1
+        and len(args.validation_prompt) != 1
+        and len(args.validation_image) != len(args.validation_prompt)
+    ):
+        raise ValueError(
+            "Must provide either 1 `--validation_image`, 1 `--validation_prompt`,"
+            " or the same number of `--validation_prompt`s and `--validation_image`s"
+        )
+
+    if args.resolution % 8 != 0:
+        raise ValueError(
+            "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder."
+        )
+
+    return args
+
+
+def get_train_dataset(args, accelerator):
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        if args.train_data_dir is not None:
+            dataset = load_dataset(
+                args.train_data_dir,
+                cache_dir=args.cache_dir,
+            )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    if args.image_column is None:
+        image_column = column_names[0]
+        logger.info(f"image column defaulting to {image_column}")
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    if args.caption_column is None:
+        caption_column = column_names[1]
+        logger.info(f"caption column defaulting to {caption_column}")
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    if args.conditioning_image_column is None:
+        conditioning_image_column = column_names[2]
+        logger.info(f"conditioning image column defaulting to {conditioning_image_column}")
+    else:
+        conditioning_image_column = args.conditioning_image_column
+        if conditioning_image_column not in column_names:
+            raise ValueError(
+                f"`--conditioning_image_column` value '{args.conditioning_image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    with accelerator.main_process_first():
+        train_dataset = dataset["train"].shuffle(seed=args.seed)
+        if args.max_train_samples is not None:
+            train_dataset = train_dataset.select(range(args.max_train_samples))
+    return train_dataset
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train=True):
+    prompt_embeds_list = []
+
+    captions = []
+    for caption in prompt_batch:
+        if random.random() < proportion_empty_prompts:
+            captions.append("")
+        elif isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                captions,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(
+                text_input_ids.to(text_encoder.device),
+                output_hidden_states=True,
+            )
+
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+            prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def prepare_train_dataset(dataset, accelerator):
+    image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    conditioning_image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[args.image_column]]
+        images = [image_transforms(image) for image in images]
+
+        conditioning_images = [image.convert("RGB") for image in examples[args.conditioning_image_column]]
+        conditioning_images = [conditioning_image_transforms(image) for image in conditioning_images]
+
+        examples["pixel_values"] = images
+        examples["conditioning_pixel_values"] = conditioning_images
+
+        return examples
+
+    with accelerator.main_process_first():
+        dataset = dataset.with_transform(preprocess_train)
+
+    return dataset
+
+
+def collate_fn(examples):
+    pixel_values = torch.stack([example["pixel_values"] for example in examples])
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    conditioning_pixel_values = torch.stack([example["conditioning_pixel_values"] for example in examples])
+    conditioning_pixel_values = conditioning_pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    prompt_ids = torch.stack([torch.tensor(example["prompt_embeds"]) for example in examples])
+
+    add_text_embeds = torch.stack([torch.tensor(example["text_embeds"]) for example in examples])
+    add_time_ids = torch.stack([torch.tensor(example["time_ids"]) for example in examples])
+
+    return {
+        "pixel_values": pixel_values,
+        "conditioning_pixel_values": conditioning_pixel_values,
+        "prompt_ids": prompt_ids,
+        "unet_added_conditions": {"text_embeds": add_text_embeds, "time_ids": add_time_ids},
+    }
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    if args.controlnet_model_name_or_path:
+        logger.info("Loading existing controlnet weights")
+        controlnet = ControlNetModel.from_pretrained(args.controlnet_model_name_or_path)
+    else:
+        logger.info("Initializing controlnet weights from unet")
+        controlnet = ControlNetModel.from_unet(unet)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                i = len(weights) - 1
+
+                while len(weights) > 0:
+                    weights.pop()
+                    model = models[i]
+
+                    sub_dir = "controlnet"
+                    model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+                    i -= 1
+
+        def load_model_hook(models, input_dir):
+            while len(models) > 0:
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = ControlNetModel.from_pretrained(input_dir, subfolder="controlnet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    vae.requires_grad_(False)
+    unet.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    controlnet.train()
+
+    if args.enable_npu_flash_attention:
+        if is_torch_npu_available():
+            logger.info("npu flash attention enabled.")
+            unet.enable_npu_flash_attention()
+        else:
+            raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+            controlnet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        controlnet.enable_gradient_checkpointing()
+        unet.enable_gradient_checkpointing()
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if unwrap_model(controlnet).dtype != torch.float32:
+        raise ValueError(
+            f"Controlnet loaded as datatype {unwrap_model(controlnet).dtype}. {low_precision_error_string}"
+        )
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = controlnet.parameters()
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae, unet and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    if args.pretrained_vae_model_name_or_path is not None:
+        vae.to(accelerator.device, dtype=weight_dtype)
+    else:
+        vae.to(accelerator.device, dtype=torch.float32)
+    unet.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    # Here, we compute not just the text embeddings but also the additional embeddings
+    # needed for the SD XL UNet to operate.
+    def compute_embeddings(batch, proportion_empty_prompts, text_encoders, tokenizers, is_train=True):
+        original_size = (args.resolution, args.resolution)
+        target_size = (args.resolution, args.resolution)
+        crops_coords_top_left = (args.crops_coords_top_left_h, args.crops_coords_top_left_w)
+        prompt_batch = batch[args.caption_column]
+
+        prompt_embeds, pooled_prompt_embeds = encode_prompt(
+            prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train
+        )
+        add_text_embeds = pooled_prompt_embeds
+
+        # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+        add_time_ids = torch.tensor([add_time_ids])
+
+        prompt_embeds = prompt_embeds.to(accelerator.device)
+        add_text_embeds = add_text_embeds.to(accelerator.device)
+        add_time_ids = add_time_ids.repeat(len(prompt_batch), 1)
+        add_time_ids = add_time_ids.to(accelerator.device, dtype=prompt_embeds.dtype)
+        unet_added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+        return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs}
+
+    # Let's first compute all the embeddings so that we can free up the text encoders
+    # from memory.
+    text_encoders = [text_encoder_one, text_encoder_two]
+    tokenizers = [tokenizer_one, tokenizer_two]
+    train_dataset = get_train_dataset(args, accelerator)
+    compute_embeddings_fn = functools.partial(
+        compute_embeddings,
+        text_encoders=text_encoders,
+        tokenizers=tokenizers,
+        proportion_empty_prompts=args.proportion_empty_prompts,
+    )
+    with accelerator.main_process_first():
+        from datasets.fingerprint import Hasher
+
+        # fingerprint used by the cache for the other processes to load the result
+        # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401
+        new_fingerprint = Hasher.hash(args)
+        train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint)
+
+    del text_encoders, tokenizers
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # Then get the training dataset ready to be passed to the dataloader.
+    train_dataset = prepare_train_dataset(train_dataset, accelerator)
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
+    if args.max_train_steps is None:
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * num_update_steps_per_epoch * accelerator.num_processes
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        controlnet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        if num_training_steps_for_scheduler != args.max_train_steps * accelerator.num_processes:
+            logger.warning(
+                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+                f"This inconsistency may result in the learning rate scheduler not functioning properly."
+            )
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+
+        # tensorboard cannot handle list types for config
+        tracker_config.pop("validation_prompt")
+        tracker_config.pop("validation_image")
+
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    image_logs = None
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(controlnet):
+                # Convert images to latent space
+                if args.pretrained_vae_model_name_or_path is not None:
+                    pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+                else:
+                    pixel_values = batch["pixel_values"]
+                latents = vae.encode(pixel_values).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # ControlNet conditioning.
+                controlnet_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype)
+                down_block_res_samples, mid_block_res_sample = controlnet(
+                    noisy_latents,
+                    timesteps,
+                    encoder_hidden_states=batch["prompt_ids"],
+                    added_cond_kwargs=batch["unet_added_conditions"],
+                    controlnet_cond=controlnet_image,
+                    return_dict=False,
+                )
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_latents,
+                    timesteps,
+                    encoder_hidden_states=batch["prompt_ids"],
+                    added_cond_kwargs=batch["unet_added_conditions"],
+                    down_block_additional_residuals=[
+                        sample.to(dtype=weight_dtype) for sample in down_block_res_samples
+                    ],
+                    mid_block_additional_residual=mid_block_res_sample.to(dtype=weight_dtype),
+                    return_dict=False,
+                )[0]
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = controlnet.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+                if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        image_logs = log_validation(
+                            vae=vae,
+                            unet=unet,
+                            controlnet=controlnet,
+                            args=args,
+                            accelerator=accelerator,
+                            weight_dtype=weight_dtype,
+                            step=global_step,
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        controlnet = unwrap_model(controlnet)
+        controlnet.save_pretrained(args.output_dir)
+
+        # Run a final round of validation.
+        # Setting `vae`, `unet`, and `controlnet` to None to load automatically from `args.output_dir`.
+        image_logs = None
+        if args.validation_prompt is not None:
+            image_logs = log_validation(
+                vae=None,
+                unet=None,
+                controlnet=None,
+                args=args,
+                accelerator=accelerator,
+                weight_dtype=weight_dtype,
+                step=global_step,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                image_logs=image_logs,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/custom_diffusion/README.md b/diffusers/examples/custom_diffusion/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9b0b52e5ae7e18e4ea8b0215c8f6cba5058b56ce
--- /dev/null
+++ b/diffusers/examples/custom_diffusion/README.md
@@ -0,0 +1,280 @@
+# Custom Diffusion training example
+
+[Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text-to-image models like Stable Diffusion given just a few (4~5) images of a subject.
+The `train_custom_diffusion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the example folder and run
+
+```bash
+pip install -r requirements.txt
+pip install clip-retrieval
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+### Cat example 😺
+
+Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it.
+
+We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset as a regularization. This prevents overfitting to the given target image. The following flags enable the regularization `with_prior_preservation`, `real_prior` with `prior_loss_weight=1.`.
+The `class_prompt` should be the category name same as target image. The collected real images are with text captions similar to the `class_prompt`. The retrieved image are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization. To collect the real images use this command first before training.
+
+```bash
+pip install clip-retrieval
+python retrieve.py --class_prompt cat --class_data_dir real_reg/samples_cat --num_class_images 200
+```
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export OUTPUT_DIR="path-to-save-model"
+export INSTANCE_DIR="./data/cat"
+
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --class_data_dir=./real_reg/samples_cat/ \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --class_prompt="cat" --num_class_images=200 \
+  --instance_prompt="photo of a <new1> cat"  \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=1e-5  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=250 \
+  --scale_lr --hflip  \
+  --modifier_token "<new1>"
+```
+
+**Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU). Follow [this guide](https://github.com/facebookresearch/xformers) for installation instructions.**
+
+To track your experiments using Weights and Biases (`wandb`) and to save intermediate results (which we HIGHLY recommend), follow these steps:
+
+* Install `wandb`: `pip install wandb`.
+* Authorize: `wandb login`.
+* Then specify a `validation_prompt` and set `report_to` to `wandb` while launching training. You can also configure the following related arguments:
+    * `num_validation_images`
+    * `validation_steps`
+
+Here is an example command:
+
+```bash
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --class_data_dir=./real_reg/samples_cat/ \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --class_prompt="cat" --num_class_images=200 \
+  --instance_prompt="photo of a <new1> cat"  \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=1e-5  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=250 \
+  --scale_lr --hflip  \
+  --modifier_token "<new1>" \
+  --validation_prompt="<new1> cat sitting in a bucket" \
+  --report_to="wandb"
+```
+
+Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/26ghrcau) where you can check out the intermediate results along with other training details.
+
+If you specify `--push_to_hub`, the learned parameters will be pushed to a repository on the Hugging Face Hub. Here is an [example repository](https://huggingface.co/sayakpaul/custom-diffusion-cat).
+
+### Training on multiple concepts 🐱🪵
+
+Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/assets/concept_list.json) file with the info about each concept, similar to [this](https://github.com/ShivamShrirao/diffusers/blob/main/examples/dreambooth/train_dreambooth.py).
+
+To collect the real images run this command for each concept in the json file.
+
+```bash
+pip install clip-retrieval
+python retrieve.py --class_prompt {} --class_data_dir {} --num_class_images 200
+```
+
+And then we're ready to start training!
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --output_dir=$OUTPUT_DIR \
+  --concepts_list=./concept_list.json \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=1e-5  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --num_class_images=200 \
+  --scale_lr --hflip  \
+  --modifier_token "<new1>+<new2>"
+```
+
+Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/3990tzkg) where you can check out the intermediate results along with other training details.
+
+### Training on human faces
+
+For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with at least 15-20 images.
+
+To collect the real images use this command first before training.
+
+```bash
+pip install clip-retrieval
+python retrieve.py --class_prompt person --class_data_dir real_reg/samples_person --num_class_images 200
+```
+
+Then start training!
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export OUTPUT_DIR="path-to-save-model"
+export INSTANCE_DIR="path-to-images"
+
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --class_data_dir=./real_reg/samples_person/ \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --class_prompt="person" --num_class_images=200 \
+  --instance_prompt="photo of a <new1> person"  \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=5e-6  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=1000 \
+  --scale_lr --hflip --noaug \
+  --freeze_model crossattn \
+  --modifier_token "<new1>" \
+  --enable_xformers_memory_efficient_attention
+```
+
+## Inference
+
+Once you have trained a model using the above command, you can run inference using the below command. Make sure to include the `modifier token` (e.g. \<new1\> in above example) in your prompt.
+
+```python
+import torch
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
+).to("cuda")
+pipe.unet.load_attn_procs(
+    "path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin"
+)
+pipe.load_textual_inversion("path-to-save-model", weight_name="<new1>.bin")
+
+image = pipe(
+    "<new1> cat sitting in a bucket",
+    num_inference_steps=100,
+    guidance_scale=6.0,
+    eta=1.0,
+).images[0]
+image.save("cat.png")
+```
+
+It's possible to directly load these parameters from a Hub repository:
+
+```python
+import torch
+from huggingface_hub.repocard import RepoCard
+from diffusers import DiffusionPipeline
+
+model_id = "sayakpaul/custom-diffusion-cat"
+card = RepoCard.load(model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to(
+"cuda")
+pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin")
+pipe.load_textual_inversion(model_id, weight_name="<new1>.bin")
+
+image = pipe(
+    "<new1> cat sitting in a bucket",
+    num_inference_steps=100,
+    guidance_scale=6.0,
+    eta=1.0,
+).images[0]
+image.save("cat.png")
+```
+
+Here is an example of performing inference with multiple concepts:
+
+```python
+import torch
+from huggingface_hub.repocard import RepoCard
+from diffusers import DiffusionPipeline
+
+model_id = "sayakpaul/custom-diffusion-cat-wooden-pot"
+card = RepoCard.load(model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to(
+"cuda")
+pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin")
+pipe.load_textual_inversion(model_id, weight_name="<new1>.bin")
+pipe.load_textual_inversion(model_id, weight_name="<new2>.bin")
+
+image = pipe(
+    "the <new1> cat sculpture in the style of a <new2> wooden pot",
+    num_inference_steps=100,
+    guidance_scale=6.0,
+    eta=1.0,
+).images[0]
+image.save("multi-subject.png")
+```
+
+Here, `cat` and `wooden pot` refer to the multiple concepts.
+
+### Inference from a training checkpoint
+
+You can also perform inference from one of the complete checkpoint saved during the training process, if you used the `--checkpointing_steps` argument.
+
+TODO.
+
+## Set grads to none
+To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument.
+
+More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html
+
+## Experimental results
+You can refer to [our webpage](https://www.cs.cmu.edu/~custom-diffusion/) that discusses our experiments in detail. We also released a more extensive dataset of 101 concepts for evaluating model customization methods. For more details please refer to our [dataset webpage](https://www.cs.cmu.edu/~custom-diffusion/dataset.html).
\ No newline at end of file
diff --git a/diffusers/examples/custom_diffusion/requirements.txt b/diffusers/examples/custom_diffusion/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7d93f3d03bd8eba09b8cab5e570d15380456b66a
--- /dev/null
+++ b/diffusers/examples/custom_diffusion/requirements.txt
@@ -0,0 +1,6 @@
+accelerate
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
diff --git a/diffusers/examples/custom_diffusion/retrieve.py b/diffusers/examples/custom_diffusion/retrieve.py
new file mode 100644
index 0000000000000000000000000000000000000000..a28fe344d93b469f414909cd8c4d61d7ff16e50f
--- /dev/null
+++ b/diffusers/examples/custom_diffusion/retrieve.py
@@ -0,0 +1,87 @@
+#  Copyright 2024 Custom Diffusion authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from io import BytesIO
+from pathlib import Path
+
+import requests
+from clip_retrieval.clip_client import ClipClient
+from PIL import Image
+from tqdm import tqdm
+
+
+def retrieve(class_prompt, class_data_dir, num_class_images):
+    factor = 1.5
+    num_images = int(factor * num_class_images)
+    client = ClipClient(
+        url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1
+    )
+
+    os.makedirs(f"{class_data_dir}/images", exist_ok=True)
+    if len(list(Path(f"{class_data_dir}/images").iterdir())) >= num_class_images:
+        return
+
+    while True:
+        class_images = client.query(text=class_prompt)
+        if len(class_images) >= factor * num_class_images or num_images > 1e4:
+            break
+        else:
+            num_images = int(factor * num_images)
+            client = ClipClient(
+                url="https://knn.laion.ai/knn-service",
+                indice_name="laion_400m",
+                num_images=num_images,
+                aesthetic_weight=0.1,
+            )
+
+    count = 0
+    total = 0
+    pbar = tqdm(desc="downloading real regularization images", total=num_class_images)
+
+    with open(f"{class_data_dir}/caption.txt", "w") as f1, open(f"{class_data_dir}/urls.txt", "w") as f2, open(
+        f"{class_data_dir}/images.txt", "w"
+    ) as f3:
+        while total < num_class_images:
+            images = class_images[count]
+            count += 1
+            try:
+                img = requests.get(images["url"], timeout=30)
+                if img.status_code == 200:
+                    _ = Image.open(BytesIO(img.content))
+                    with open(f"{class_data_dir}/images/{total}.jpg", "wb") as f:
+                        f.write(img.content)
+                    f1.write(images["caption"] + "\n")
+                    f2.write(images["url"] + "\n")
+                    f3.write(f"{class_data_dir}/images/{total}.jpg" + "\n")
+                    total += 1
+                    pbar.update(1)
+                else:
+                    continue
+            except Exception:
+                continue
+    return
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("", add_help=False)
+    parser.add_argument("--class_prompt", help="text prompt to retrieve images", required=True, type=str)
+    parser.add_argument("--class_data_dir", help="path to save images", required=True, type=str)
+    parser.add_argument("--num_class_images", help="number of images to download", default=200, type=int)
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    retrieve(args.class_prompt, args.class_data_dir, args.num_class_images)
diff --git a/diffusers/examples/custom_diffusion/test_custom_diffusion.py b/diffusers/examples/custom_diffusion/test_custom_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d7e895b3bb838d2a432baae340e3eb2c44f14db
--- /dev/null
+++ b/diffusers/examples/custom_diffusion/test_custom_diffusion.py
@@ -0,0 +1,124 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class CustomDiffusion(ExamplesTestsAccelerate):
+    def test_custom_diffusion(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/custom_diffusion/train_custom_diffusion.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt <new1>
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 1.0e-05
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --modifier_token <new1>
+                --no_safe_serialization
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_custom_diffusion_weights.bin")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "<new1>.bin")))
+
+    def test_custom_diffusion_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/custom_diffusion/train_custom_diffusion.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --instance_data_dir=docs/source/en/imgs
+            --output_dir={tmpdir}
+            --instance_prompt=<new1>
+            --resolution=64
+            --train_batch_size=1
+            --modifier_token=<new1>
+            --dataloader_num_workers=0
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            --no_safe_serialization
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
+
+    def test_custom_diffusion_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/custom_diffusion/train_custom_diffusion.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --instance_data_dir=docs/source/en/imgs
+            --output_dir={tmpdir}
+            --instance_prompt=<new1>
+            --resolution=64
+            --train_batch_size=1
+            --modifier_token=<new1>
+            --dataloader_num_workers=0
+            --max_train_steps=4
+            --checkpointing_steps=2
+            --no_safe_serialization
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            resume_run_args = f"""
+            examples/custom_diffusion/train_custom_diffusion.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --instance_data_dir=docs/source/en/imgs
+            --output_dir={tmpdir}
+            --instance_prompt=<new1>
+            --resolution=64
+            --train_batch_size=1
+            --modifier_token=<new1>
+            --dataloader_num_workers=0
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-4
+            --checkpoints_total_limit=2
+            --no_safe_serialization
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
diff --git a/diffusers/examples/custom_diffusion/train_custom_diffusion.py b/diffusers/examples/custom_diffusion/train_custom_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..e498ca98b1c76abed63adba20e7e6118bb2339f5
--- /dev/null
+++ b/diffusers/examples/custom_diffusion/train_custom_diffusion.py
@@ -0,0 +1,1378 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 Custom Diffusion authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import shutil
+import warnings
+from pathlib import Path
+
+import numpy as np
+import safetensors
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import HfApi, create_repo
+from huggingface_hub.utils import insecure_hashlib
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import AttnProcsLayers
+from diffusers.models.attention_processor import (
+    CustomDiffusionAttnProcessor,
+    CustomDiffusionAttnProcessor2_0,
+    CustomDiffusionXFormersAttnProcessor,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def freeze_params(params):
+    for param in params:
+        param.requires_grad = False
+
+
+def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_folder=None):
+    img_str = ""
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# Custom Diffusion - {repo_id}
+
+These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n
+{img_str}
+
+\nFor more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion).
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        prompt=prompt,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "text-to-image",
+        "diffusers",
+        "stable-diffusion",
+        "stable-diffusion-diffusers",
+        "custom-diffusion",
+        "diffusers-training",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+
+        return RobertaSeriesModelWithTransformation
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def collate_fn(examples, with_prior_preservation):
+    input_ids = [example["instance_prompt_ids"] for example in examples]
+    pixel_values = [example["instance_images"] for example in examples]
+    mask = [example["mask"] for example in examples]
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        input_ids += [example["class_prompt_ids"] for example in examples]
+        pixel_values += [example["class_images"] for example in examples]
+        mask += [example["class_mask"] for example in examples]
+
+    input_ids = torch.cat(input_ids, dim=0)
+    pixel_values = torch.stack(pixel_values)
+    mask = torch.stack(mask)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+    mask = mask.to(memory_format=torch.contiguous_format).float()
+
+    batch = {"input_ids": input_ids, "pixel_values": pixel_values, "mask": mask.unsqueeze(1)}
+    return batch
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+class CustomDiffusionDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        concepts_list,
+        tokenizer,
+        size=512,
+        mask_size=64,
+        center_crop=False,
+        with_prior_preservation=False,
+        num_class_images=200,
+        hflip=False,
+        aug=True,
+    ):
+        self.size = size
+        self.mask_size = mask_size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+        self.interpolation = Image.BILINEAR
+        self.aug = aug
+
+        self.instance_images_path = []
+        self.class_images_path = []
+        self.with_prior_preservation = with_prior_preservation
+        for concept in concepts_list:
+            inst_img_path = [
+                (x, concept["instance_prompt"]) for x in Path(concept["instance_data_dir"]).iterdir() if x.is_file()
+            ]
+            self.instance_images_path.extend(inst_img_path)
+
+            if with_prior_preservation:
+                class_data_root = Path(concept["class_data_dir"])
+                if os.path.isdir(class_data_root):
+                    class_images_path = list(class_data_root.iterdir())
+                    class_prompt = [concept["class_prompt"] for _ in range(len(class_images_path))]
+                else:
+                    with open(class_data_root, "r") as f:
+                        class_images_path = f.read().splitlines()
+                    with open(concept["class_prompt"], "r") as f:
+                        class_prompt = f.read().splitlines()
+
+                class_img_path = list(zip(class_images_path, class_prompt))
+                self.class_images_path.extend(class_img_path[:num_class_images])
+
+        random.shuffle(self.instance_images_path)
+        self.num_instance_images = len(self.instance_images_path)
+        self.num_class_images = len(self.class_images_path)
+        self._length = max(self.num_class_images, self.num_instance_images)
+        self.flip = transforms.RandomHorizontalFlip(0.5 * hflip)
+
+        self.image_transforms = transforms.Compose(
+            [
+                self.flip,
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def preprocess(self, image, scale, resample):
+        outer, inner = self.size, scale
+        factor = self.size // self.mask_size
+        if scale > self.size:
+            outer, inner = scale, self.size
+        top, left = np.random.randint(0, outer - inner + 1), np.random.randint(0, outer - inner + 1)
+        image = image.resize((scale, scale), resample=resample)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+        instance_image = np.zeros((self.size, self.size, 3), dtype=np.float32)
+        mask = np.zeros((self.size // factor, self.size // factor))
+        if scale > self.size:
+            instance_image = image[top : top + inner, left : left + inner, :]
+            mask = np.ones((self.size // factor, self.size // factor))
+        else:
+            instance_image[top : top + inner, left : left + inner, :] = image
+            mask[
+                top // factor + 1 : (top + scale) // factor - 1, left // factor + 1 : (left + scale) // factor - 1
+            ] = 1.0
+        return instance_image, mask
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image, instance_prompt = self.instance_images_path[index % self.num_instance_images]
+        instance_image = Image.open(instance_image)
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        instance_image = self.flip(instance_image)
+
+        # apply resize augmentation and create a valid image region mask
+        random_scale = self.size
+        if self.aug:
+            random_scale = (
+                np.random.randint(self.size // 3, self.size + 1)
+                if np.random.uniform() < 0.66
+                else np.random.randint(int(1.2 * self.size), int(1.4 * self.size))
+            )
+        instance_image, mask = self.preprocess(instance_image, random_scale, self.interpolation)
+
+        if random_scale < 0.6 * self.size:
+            instance_prompt = np.random.choice(["a far away ", "very small "]) + instance_prompt
+        elif random_scale > self.size:
+            instance_prompt = np.random.choice(["zoomed in ", "close up "]) + instance_prompt
+
+        example["instance_images"] = torch.from_numpy(instance_image).permute(2, 0, 1)
+        example["mask"] = torch.from_numpy(mask)
+        example["instance_prompt_ids"] = self.tokenizer(
+            instance_prompt,
+            truncation=True,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids
+
+        if self.with_prior_preservation:
+            class_image, class_prompt = self.class_images_path[index % self.num_class_images]
+            class_image = Image.open(class_image)
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_mask"] = torch.ones_like(example["mask"])
+            example["class_prompt_ids"] = self.tokenizer(
+                class_prompt,
+                truncation=True,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                return_tensors="pt",
+            ).input_ids
+
+        return example
+
+
+def save_new_embed(text_encoder, modifier_token_id, accelerator, args, output_dir, safe_serialization=True):
+    """Saves the new token embeddings from the text encoder."""
+    logger.info("Saving embeddings")
+    learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight
+    for x, y in zip(modifier_token_id, args.modifier_token):
+        learned_embeds_dict = {}
+        learned_embeds_dict[y] = learned_embeds[x]
+
+        if safe_serialization:
+            filename = f"{output_dir}/{y}.safetensors"
+            safetensors.torch.save_file(learned_embeds_dict, filename, metadata={"format": "pt"})
+        else:
+            filename = f"{output_dir}/{y}.bin"
+            torch.save(learned_embeds_dict, filename)
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Custom Diffusion training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=2,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument(
+        "--real_prior",
+        default=False,
+        action="store_true",
+        help="real images as prior.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=200,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="custom-diffusion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=250,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-5,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=2,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--freeze_model",
+        type=str,
+        default="crossattn_kv",
+        choices=["crossattn_kv", "crossattn"],
+        help="crossattn to enable fine-tuning of all params in the cross attention",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument(
+        "--concepts_list",
+        type=str,
+        default=None,
+        help="Path to json containing multiple concepts, will overwrite parameters like instance_prompt, class_prompt, etc.",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+    parser.add_argument(
+        "--modifier_token",
+        type=str,
+        default=None,
+        help="A token to use as a modifier for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default="ktn+pll+ucd", help="A token to use as initializer word."
+    )
+    parser.add_argument("--hflip", action="store_true", help="Apply horizontal flip data augmentation.")
+    parser.add_argument(
+        "--noaug",
+        action="store_true",
+        help="Dont apply augmentation during data augmentation when this flag is enabled.",
+    )
+    parser.add_argument(
+        "--no_safe_serialization",
+        action="store_true",
+        help="If specified save the checkpoint not in `safetensors` format, but in original PyTorch format instead.",
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.concepts_list is None:
+            if args.class_data_dir is None:
+                raise ValueError("You must specify a data directory for class images.")
+            if args.class_prompt is None:
+                raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("custom-diffusion", config=vars(args))
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+    if args.concepts_list is None:
+        args.concepts_list = [
+            {
+                "instance_prompt": args.instance_prompt,
+                "class_prompt": args.class_prompt,
+                "instance_data_dir": args.instance_data_dir,
+                "class_data_dir": args.class_data_dir,
+            }
+        ]
+    else:
+        with open(args.concepts_list, "r") as f:
+            args.concepts_list = json.load(f)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        for i, concept in enumerate(args.concepts_list):
+            class_images_dir = Path(concept["class_data_dir"])
+            if not class_images_dir.exists():
+                class_images_dir.mkdir(parents=True, exist_ok=True)
+            if args.real_prior:
+                assert (
+                    class_images_dir / "images"
+                ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                assert (
+                    len(list((class_images_dir / "images").iterdir())) == args.num_class_images
+                ), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                assert (
+                    class_images_dir / "caption.txt"
+                ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                assert (
+                    class_images_dir / "images.txt"
+                ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                concept["class_prompt"] = os.path.join(class_images_dir, "caption.txt")
+                concept["class_data_dir"] = os.path.join(class_images_dir, "images.txt")
+                args.concepts_list[i] = concept
+                accelerator.wait_for_everyone()
+            else:
+                cur_class_images = len(list(class_images_dir.iterdir()))
+
+                if cur_class_images < args.num_class_images:
+                    torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+                    if args.prior_generation_precision == "fp32":
+                        torch_dtype = torch.float32
+                    elif args.prior_generation_precision == "fp16":
+                        torch_dtype = torch.float16
+                    elif args.prior_generation_precision == "bf16":
+                        torch_dtype = torch.bfloat16
+                    pipeline = DiffusionPipeline.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        torch_dtype=torch_dtype,
+                        safety_checker=None,
+                        revision=args.revision,
+                        variant=args.variant,
+                    )
+                    pipeline.set_progress_bar_config(disable=True)
+
+                    num_new_images = args.num_class_images - cur_class_images
+                    logger.info(f"Number of class images to sample: {num_new_images}.")
+
+                    sample_dataset = PromptDataset(concept["class_prompt"], num_new_images)
+                    sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+                    sample_dataloader = accelerator.prepare(sample_dataloader)
+                    pipeline.to(accelerator.device)
+
+                    for example in tqdm(
+                        sample_dataloader,
+                        desc="Generating class images",
+                        disable=not accelerator.is_local_main_process,
+                    ):
+                        images = pipeline(example["prompt"]).images
+
+                        for i, image in enumerate(images):
+                            hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                            image_filename = (
+                                class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                            )
+                            image.save(image_filename)
+
+                    del pipeline
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.tokenizer_name,
+            revision=args.revision,
+            use_fast=False,
+        )
+    elif args.pretrained_model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+
+    # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # Adding a modifier token which is optimized ####
+    # Code taken from https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py
+    modifier_token_id = []
+    initializer_token_id = []
+    if args.modifier_token is not None:
+        args.modifier_token = args.modifier_token.split("+")
+        args.initializer_token = args.initializer_token.split("+")
+        if len(args.modifier_token) > len(args.initializer_token):
+            raise ValueError("You must specify + separated initializer token for each modifier token.")
+        for modifier_token, initializer_token in zip(
+            args.modifier_token, args.initializer_token[: len(args.modifier_token)]
+        ):
+            # Add the placeholder token in tokenizer
+            num_added_tokens = tokenizer.add_tokens(modifier_token)
+            if num_added_tokens == 0:
+                raise ValueError(
+                    f"The tokenizer already contains the token {modifier_token}. Please pass a different"
+                    " `modifier_token` that is not already in the tokenizer."
+                )
+
+            # Convert the initializer_token, placeholder_token to ids
+            token_ids = tokenizer.encode([initializer_token], add_special_tokens=False)
+            print(token_ids)
+            # Check if initializer_token is a single token or a sequence of tokens
+            if len(token_ids) > 1:
+                raise ValueError("The initializer token must be a single token.")
+
+            initializer_token_id.append(token_ids[0])
+            modifier_token_id.append(tokenizer.convert_tokens_to_ids(modifier_token))
+
+        # Resize the token embeddings as we are adding new special tokens to the tokenizer
+        text_encoder.resize_token_embeddings(len(tokenizer))
+
+        # Initialise the newly added placeholder token with the embeddings of the initializer token
+        token_embeds = text_encoder.get_input_embeddings().weight.data
+        for x, y in zip(modifier_token_id, initializer_token_id):
+            token_embeds[x] = token_embeds[y]
+
+        # Freeze all parameters except for the token embeddings in text encoder
+        params_to_freeze = itertools.chain(
+            text_encoder.text_model.encoder.parameters(),
+            text_encoder.text_model.final_layer_norm.parameters(),
+            text_encoder.text_model.embeddings.position_embedding.parameters(),
+        )
+        freeze_params(params_to_freeze)
+    ########################################################
+    ########################################################
+
+    vae.requires_grad_(False)
+    if args.modifier_token is None:
+        text_encoder.requires_grad_(False)
+    unet.requires_grad_(False)
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    if accelerator.mixed_precision != "fp16" and args.modifier_token is not None:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    attention_class = (
+        CustomDiffusionAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else CustomDiffusionAttnProcessor
+    )
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            attention_class = CustomDiffusionXFormersAttnProcessor
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # now we will add new Custom Diffusion weights to the attention layers
+    # It's important to realize here how many attention weights will be added and of which sizes
+    # The sizes of the attention layers consist only of two different variables:
+    # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`.
+    # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`.
+
+    # Let's first see how many attention processors we will have to set.
+    # For Stable Diffusion, it should be equal to:
+    # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12
+    # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2
+    # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18
+    # => 32 layers
+
+    # Only train key, value projection layers if freeze_model = 'crossattn_kv' else train all params in the cross attention layer
+    train_kv = True
+    train_q_out = False if args.freeze_model == "crossattn_kv" else True
+    custom_diffusion_attn_procs = {}
+
+    st = unet.state_dict()
+    for name, _ in unet.attn_processors.items():
+        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = unet.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = unet.config.block_out_channels[block_id]
+        layer_name = name.split(".processor")[0]
+        weights = {
+            "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"],
+            "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"],
+        }
+        if train_q_out:
+            weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"]
+            weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"]
+            weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"]
+        if cross_attention_dim is not None:
+            custom_diffusion_attn_procs[name] = attention_class(
+                train_kv=train_kv,
+                train_q_out=train_q_out,
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+            ).to(unet.device)
+            custom_diffusion_attn_procs[name].load_state_dict(weights)
+        else:
+            custom_diffusion_attn_procs[name] = attention_class(
+                train_kv=False,
+                train_q_out=False,
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+            )
+    del st
+    unet.set_attn_processor(custom_diffusion_attn_procs)
+    custom_diffusion_layers = AttnProcsLayers(unet.attn_processors)
+
+    accelerator.register_for_checkpointing(custom_diffusion_layers)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.modifier_token is not None:
+            text_encoder.gradient_checkpointing_enable()
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+        if args.with_prior_preservation:
+            args.learning_rate = args.learning_rate * 2.0
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    optimizer = optimizer_class(
+        itertools.chain(text_encoder.get_input_embeddings().parameters(), custom_diffusion_layers.parameters())
+        if args.modifier_token is not None
+        else custom_diffusion_layers.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = CustomDiffusionDataset(
+        concepts_list=args.concepts_list,
+        tokenizer=tokenizer,
+        with_prior_preservation=args.with_prior_preservation,
+        size=args.resolution,
+        mask_size=vae.encode(
+            torch.randn(1, 3, args.resolution, args.resolution).to(dtype=weight_dtype).to(accelerator.device)
+        )
+        .latent_dist.sample()
+        .size()[-1],
+        center_crop=args.center_crop,
+        num_class_images=args.num_class_images,
+        hflip=args.hflip,
+        aug=not args.noaug,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
+    if args.max_train_steps is None:
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * num_update_steps_per_epoch * accelerator.num_processes
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.modifier_token is not None:
+        custom_diffusion_layers, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            custom_diffusion_layers, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        custom_diffusion_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            custom_diffusion_layers, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        if num_training_steps_for_scheduler != args.max_train_steps * accelerator.num_processes:
+            logger.warning(
+                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+                f"This inconsistency may result in the learning rate scheduler not functioning properly."
+            )
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.modifier_token is not None:
+            text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet), accelerator.accumulate(text_encoder):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Predict the noise residual
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+                    mask = torch.chunk(batch["mask"], 2, dim=0)[0]
+                    # Compute instance loss
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = ((loss * mask).sum([1, 2, 3]) / mask.sum([1, 2, 3])).mean()
+
+                    # Compute prior loss
+                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+                else:
+                    mask = batch["mask"]
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = ((loss * mask).sum([1, 2, 3]) / mask.sum([1, 2, 3])).mean()
+                accelerator.backward(loss)
+                # Zero out the gradients for all token embeddings except the newly added
+                # embeddings for the concept, as we only want to optimize the concept embeddings
+                if args.modifier_token is not None:
+                    if accelerator.num_processes > 1:
+                        grads_text_encoder = text_encoder.module.get_input_embeddings().weight.grad
+                    else:
+                        grads_text_encoder = text_encoder.get_input_embeddings().weight.grad
+                    # Get the index for tokens that we want to zero the grads for
+                    index_grads_to_zero = torch.arange(len(tokenizer)) != modifier_token_id[0]
+                    for i in range(1, len(modifier_token_id)):
+                        index_grads_to_zero = index_grads_to_zero & (
+                            torch.arange(len(tokenizer)) != modifier_token_id[i]
+                        )
+                    grads_text_encoder.data[index_grads_to_zero, :] = grads_text_encoder.data[
+                        index_grads_to_zero, :
+                    ].fill_(0)
+
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(text_encoder.parameters(), custom_diffusion_layers.parameters())
+                        if args.modifier_token is not None
+                        else custom_diffusion_layers.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+            if accelerator.is_main_process:
+                images = []
+
+                if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                    logger.info(
+                        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                        f" {args.validation_prompt}."
+                    )
+                    # create pipeline
+                    pipeline = DiffusionPipeline.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        unet=accelerator.unwrap_model(unet),
+                        text_encoder=accelerator.unwrap_model(text_encoder),
+                        tokenizer=tokenizer,
+                        revision=args.revision,
+                        variant=args.variant,
+                        torch_dtype=weight_dtype,
+                    )
+                    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+                    pipeline = pipeline.to(accelerator.device)
+                    pipeline.set_progress_bar_config(disable=True)
+
+                    # run inference
+                    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+                    images = [
+                        pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.0).images[
+                            0
+                        ]
+                        for _ in range(args.num_validation_images)
+                    ]
+
+                    for tracker in accelerator.trackers:
+                        if tracker.name == "tensorboard":
+                            np_images = np.stack([np.asarray(img) for img in images])
+                            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                        if tracker.name == "wandb":
+                            tracker.log(
+                                {
+                                    "validation": [
+                                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                        for i, image in enumerate(images)
+                                    ]
+                                }
+                            )
+
+                    del pipeline
+                    torch.cuda.empty_cache()
+
+    # Save the custom diffusion layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unet.to(torch.float32)
+        unet.save_attn_procs(args.output_dir, safe_serialization=not args.no_safe_serialization)
+        save_new_embed(
+            text_encoder,
+            modifier_token_id,
+            accelerator,
+            args,
+            args.output_dir,
+            safe_serialization=not args.no_safe_serialization,
+        )
+
+        # Final inference
+        # Load previous pipeline
+        pipeline = DiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path, revision=args.revision, variant=args.variant, torch_dtype=weight_dtype
+        )
+        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+        pipeline = pipeline.to(accelerator.device)
+
+        # load attention processors
+        weight_name = (
+            "pytorch_custom_diffusion_weights.safetensors"
+            if not args.no_safe_serialization
+            else "pytorch_custom_diffusion_weights.bin"
+        )
+        pipeline.unet.load_attn_procs(args.output_dir, weight_name=weight_name)
+        for token in args.modifier_token:
+            token_weight_name = f"{token}.safetensors" if not args.no_safe_serialization else f"{token}.bin"
+            pipeline.load_textual_inversion(args.output_dir, weight_name=token_weight_name)
+
+        # run inference
+        if args.validation_prompt and args.num_validation_images > 0:
+            generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+            images = [
+                pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.0).images[0]
+                for _ in range(args.num_validation_images)
+            ]
+
+            for tracker in accelerator.trackers:
+                if tracker.name == "tensorboard":
+                    np_images = np.stack([np.asarray(img) for img in images])
+                    tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+                if tracker.name == "wandb":
+                    tracker.log(
+                        {
+                            "test": [
+                                wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                for i, image in enumerate(images)
+                            ]
+                        }
+                    )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                prompt=args.instance_prompt,
+                repo_folder=args.output_dir,
+            )
+            api = HfApi(token=args.hub_token)
+            api.upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/dreambooth/README.md b/diffusers/examples/dreambooth/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f97a4d0cd0f4fb31b09e0a1e537ab532ee5d2837
--- /dev/null
+++ b/diffusers/examples/dreambooth/README.md
@@ -0,0 +1,744 @@
+# DreamBooth training example
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject.
+The `train_dreambooth.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the example folder and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+### Dog toy example
+
+Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example.
+
+Let's first download it locally:
+
+```python
+from huggingface_hub import snapshot_download
+
+local_dir = "./dog"
+snapshot_download(
+    "diffusers/dog-example",
+    local_dir=local_dir, repo_type="dataset",
+    ignore_patterns=".gitattributes",
+)
+```
+
+And launch the training using:
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=400 \
+  --push_to_hub
+```
+
+### Training with prior-preservation loss
+
+Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="dog"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800 \
+  --push_to_hub
+```
+
+
+### Training on a 16GB GPU:
+
+With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.
+
+To install `bitsandbytes` please refer to this [readme](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="dog"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=2 --gradient_checkpointing \
+  --use_8bit_adam \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800 \
+  --push_to_hub
+```
+
+
+### Training on a 12GB GPU:
+
+It is possible to run dreambooth on a 12GB GPU by using the following optimizations:
+- [gradient checkpointing and the 8-bit optimizer](#training-on-a-16gb-gpu)
+- [xformers](#training-with-xformers)
+- [setting grads to none](#set-grads-to-none)
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="dog"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 --gradient_checkpointing \
+  --use_8bit_adam \
+  --enable_xformers_memory_efficient_attention \
+  --set_grads_to_none \
+  --learning_rate=2e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800 \
+  --push_to_hub
+```
+
+
+### Training on a 8 GB GPU:
+
+By using [DeepSpeed](https://www.deepspeed.ai/) it's possible to offload some
+tensors from VRAM to either CPU or NVME allowing to train with less VRAM.
+
+DeepSpeed needs to be enabled with `accelerate config`. During configuration
+answer yes to "Do you want to use DeepSpeed?". With DeepSpeed stage 2, fp16
+mixed precision and offloading both parameters and optimizer state to cpu it's
+possible to train on under 8 GB VRAM with a drawback of requiring significantly
+more RAM (about 25 GB). See [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more DeepSpeed configuration options.
+
+Changing the default Adam optimizer to DeepSpeed's special version of Adam
+`deepspeed.ops.adam.DeepSpeedCPUAdam` gives a substantial speedup but enabling
+it requires CUDA toolchain with the same version as pytorch. 8-bit optimizer
+does not seem to be compatible with DeepSpeed at the moment.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="dog"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch --mixed_precision="fp16" train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --sample_batch_size=1 \
+  --gradient_accumulation_steps=1 --gradient_checkpointing \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800 \
+  --push_to_hub
+```
+
+### Fine-tune text encoder with the UNet.
+
+The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces.
+Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
+
+___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="dog"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --train_text_encoder \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --use_8bit_adam \
+  --gradient_checkpointing \
+  --learning_rate=2e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800 \
+  --push_to_hub
+```
+
+### Using DreamBooth for pipelines other than Stable Diffusion
+
+The [AltDiffusion pipeline](https://huggingface.co/docs/diffusers/api/pipelines/alt_diffusion) also supports dreambooth fine-tuning. The process is the same as above, all you need to do is replace the `MODEL_NAME` like this:
+
+```
+export MODEL_NAME="CompVis/stable-diffusion-v1-4" --> export MODEL_NAME="BAAI/AltDiffusion-m9"
+or
+export MODEL_NAME="CompVis/stable-diffusion-v1-4" --> export MODEL_NAME="BAAI/AltDiffusion"
+```
+
+### Inference
+
+Once you have trained a model using the above command, you can run inference simply using the `StableDiffusionPipeline`. Make sure to include the `identifier` (e.g. sks in above example) in your prompt.
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_id = "path-to-your-trained-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "A photo of sks dog in a bucket"
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("dog-bucket.png")
+```
+
+### Inference from a training checkpoint
+
+You can also perform inference from one of the checkpoints saved during the training process, if you used the `--checkpointing_steps` argument. Please, refer to [the documentation](https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint) to see how to do it.
+
+## Training with Low-Rank Adaptation of Large Language Models (LoRA)
+
+Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*
+
+In a nutshell, LoRA allows to adapt pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
+- Previous pretrained weights are kept frozen so that the model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114)
+- Rank-decomposition matrices have significantly fewer parameters than the original model, which means that trained LoRA weights are easily portable.
+- LoRA attention layers allow to control to which extent the model is adapted towards new training images via a `scale` parameter.
+
+[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in
+the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
+
+### Training
+
+Let's get started with a simple example. We will re-use the dog example of the [previous section](#dog-toy-example).
+
+First, you need to set-up your dreambooth training example as is explained in the [installation section](#Installing-the-dependencies).
+Next, let's download the dog dataset. Download images from [here](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) and save them in a directory. Make sure to set `INSTANCE_DIR` to the name of your directory further below. This will be our training data.
+
+Now, you can launch the training. Here we will use [Stable Diffusion 1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5).
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+**___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [wandb](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training and pass `--report_to="wandb"` to automatically log images.___**
+
+
+```bash
+export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="path-to-save-model"
+```
+
+For this example we want to directly store the trained LoRA embeddings on the Hub, so
+we need to be logged in and add the `--push_to_hub` flag.
+
+```bash
+huggingface-cli login
+```
+
+Now we can start training!
+
+```bash
+accelerate launch train_dreambooth_lora.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --checkpointing_steps=100 \
+  --learning_rate=1e-4 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=50 \
+  --seed="0" \
+  --push_to_hub
+```
+
+**___Note: When using LoRA we can use a much higher learning rate compared to vanilla dreambooth. Here we
+use *1e-4* instead of the usual *2e-6*.___**
+
+The final LoRA embedding weights have been uploaded to [patrickvonplaten/lora_dreambooth_dog_example](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example). **___Note: [The final weights](https://huggingface.co/patrickvonplaten/lora/blob/main/pytorch_attn_procs.bin) are only 3 MB in size which is orders of magnitudes smaller than the original model.**
+
+The training results are summarized [here](https://api.wandb.ai/report/patrickvonplaten/xm6cd5q5).
+You can use the `Step` slider to see how the model learned the features of our subject while the model trained.
+
+Optionally, we can also train additional LoRA layers for the text encoder. Specify the `--train_text_encoder` argument above for that. If you're interested to know more about how we
+enable this support, check out this [PR](https://github.com/huggingface/diffusers/pull/2918).
+
+With the default hyperparameters from the above, the training seems to go in a positive direction. Check out [this panel](https://wandb.ai/sayakpaul/dreambooth-lora/reports/test-23-04-17-17-00-13---Vmlldzo0MDkwNjMy). The trained LoRA layers are available [here](https://huggingface.co/sayakpaul/dreambooth).
+
+
+### Inference
+
+After training, LoRA weights can be loaded very easily into the original pipeline. First, you need to
+load the original pipeline:
+
+```python
+from diffusers import DiffusionPipeline
+pipe = DiffusionPipeline.from_pretrained("base-model-name").to("cuda")
+```
+
+Next, we can load the adapter layers into the pipeline with the [`load_lora_weights` function](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading_adapters#lora).
+
+```python
+pipe.load_lora_weights("path-to-the-lora-checkpoint")
+```
+
+Finally, we can run the model in inference.
+
+```python
+image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0]
+```
+
+If you are loading the LoRA parameters from the Hub and if the Hub repository has
+a `base_model` tag (such as [this](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example/blob/main/README.md?code=true#L4)), then
+you can do:
+
+```py
+from huggingface_hub.repocard import RepoCard
+
+lora_model_id = "patrickvonplaten/lora_dreambooth_dog_example"
+card = RepoCard.load(lora_model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
+...
+```
+
+If you used `--train_text_encoder` during training, then use `pipe.load_lora_weights()` to load the LoRA
+weights. For example:
+
+```python
+from huggingface_hub.repocard import RepoCard
+from diffusers import StableDiffusionPipeline
+import torch
+
+lora_model_id = "sayakpaul/dreambooth-text-encoder-test"
+card = RepoCard.load(lora_model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+pipe.load_lora_weights(lora_model_id)
+image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0]
+```
+
+Note that the use of [`StableDiffusionLoraLoaderMixin.load_lora_weights`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.StableDiffusionLoraLoaderMixin.load_lora_weights) is preferred to [`UNet2DConditionLoadersMixin.load_attn_procs`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs) for loading LoRA parameters. This is because
+`StableDiffusionLoraLoaderMixin.load_lora_weights` can handle the following situations:
+
+* LoRA parameters that don't have separate identifiers for the UNet and the text encoder (such as [`"patrickvonplaten/lora_dreambooth_dog_example"`](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example)). So, you can just do:
+
+  ```py
+  pipe.load_lora_weights(lora_model_path)
+  ```
+
+* LoRA parameters that have separate identifiers for the UNet and the text encoder such as: [`"sayakpaul/dreambooth"`](https://huggingface.co/sayakpaul/dreambooth).
+
+## Training with Flax/JAX
+
+For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
+
+____Note: The flax example don't yet support features like gradient checkpoint, gradient accumulation etc, so to use flax for faster training we will need >30GB cards.___
+
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install -U -r requirements_flax.txt
+```
+
+
+### Training without prior preservation loss
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="path-to-save-model"
+
+python train_dreambooth_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --max_train_steps=400
+```
+
+
+### Training with prior preservation loss
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="dog"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+python train_dreambooth_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+
+### Fine-tune text encoder with the UNet.
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="dog"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+python train_dreambooth_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --train_text_encoder \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=2e-6 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Training with xformers:
+You can enable memory efficient attention by [installing xFormers](https://github.com/facebookresearch/xformers#installing-xformers) and padding the `--enable_xformers_memory_efficient_attention` argument to the script. This is not available with the Flax/JAX implementation.
+
+You can also use Dreambooth to train the specialized in-painting model. See [the script in the research folder for details](https://github.com/huggingface/diffusers/tree/main/examples/research_projects/dreambooth_inpaint).
+
+### Set grads to none
+
+To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument.
+
+More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html
+
+### Experimental results
+You can refer to [this blog post](https://huggingface.co/blog/dreambooth) that discusses some of DreamBooth experiments in detail. Specifically, it recommends a set of DreamBooth-specific tips and tricks that we have found to work well for a variety of subjects.
+
+## IF
+
+You can use the lora and full dreambooth scripts to train the text to image [IF model](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0) and the stage II upscaler
+[IF model](https://huggingface.co/DeepFloyd/IF-II-L-v1.0).
+
+Note that IF has a predicted variance, and our finetuning scripts only train the models predicted error, so for finetuned IF models we switch to a fixed
+variance schedule. The full finetuning scripts will update the scheduler config for the full saved model. However, when loading saved LoRA weights, you
+must also update the pipeline's scheduler config.
+
+```py
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0")
+
+pipe.load_lora_weights("<lora weights path>")
+
+# Update scheduler config to fixed variance schedule
+pipe.scheduler = pipe.scheduler.__class__.from_config(pipe.scheduler.config, variance_type="fixed_small")
+```
+
+Additionally, a few alternative cli flags are needed for IF.
+
+`--resolution=64`: IF is a pixel space diffusion model. In order to operate on un-compressed pixels, the input images are of a much smaller resolution.
+
+`--pre_compute_text_embeddings`: IF uses [T5](https://huggingface.co/docs/transformers/model_doc/t5) for its text encoder. In order to save GPU memory, we pre compute all text embeddings and then de-allocate
+T5.
+
+`--tokenizer_max_length=77`: T5 has a longer default text length, but the default IF encoding procedure uses a smaller number.
+
+`--text_encoder_use_attention_mask`: T5 passes the attention mask to the text encoder.
+
+### Tips and Tricks
+We find LoRA to be sufficient for finetuning the stage I model as the low resolution of the model makes representing finegrained detail hard regardless.
+
+For common and/or not-visually complex object concepts, you can get away with not-finetuning the upscaler. Just be sure to adjust the prompt passed to the
+upscaler to remove the new token from the instance prompt. I.e. if your stage I prompt is "a sks dog", use "a dog" for your stage II prompt.
+
+For finegrained detail like faces that aren't present in the original training set, we find that full finetuning of the stage II upscaler is better than
+LoRA finetuning stage II.
+
+For finegrained detail like faces, we find that lower learning rates along with larger batch sizes work best.
+
+For stage II, we find that lower learning rates are also needed.
+
+We found experimentally that the DDPM scheduler with the default larger number of denoising steps to sometimes work better than the DPM Solver scheduler
+used in the training scripts.
+
+### Stage II additional validation images
+
+The stage II validation requires images to upscale, we can download a downsized version of the training set:
+
+```py
+from huggingface_hub import snapshot_download
+
+local_dir = "./dog_downsized"
+snapshot_download(
+    "diffusers/dog-example-downsized",
+    local_dir=local_dir,
+    repo_type="dataset",
+    ignore_patterns=".gitattributes",
+)
+```
+
+### IF stage I LoRA Dreambooth
+This training configuration requires ~28 GB VRAM.
+
+```sh
+export MODEL_NAME="DeepFloyd/IF-I-XL-v1.0"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="dreambooth_dog_lora"
+
+accelerate launch train_dreambooth_lora.py \
+  --report_to wandb \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a sks dog" \
+  --resolution=64 \
+  --train_batch_size=4 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --scale_lr \
+  --max_train_steps=1200 \
+  --validation_prompt="a sks dog" \
+  --validation_epochs=25 \
+  --checkpointing_steps=100 \
+  --pre_compute_text_embeddings \
+  --tokenizer_max_length=77 \
+  --text_encoder_use_attention_mask
+```
+
+### IF stage II LoRA Dreambooth
+
+`--validation_images`: These images are upscaled during validation steps.
+
+`--class_labels_conditioning=timesteps`: Pass additional conditioning to the UNet needed for stage II.
+
+`--learning_rate=1e-6`: Lower learning rate than stage I.
+
+`--resolution=256`: The upscaler expects higher resolution inputs
+
+```sh
+export MODEL_NAME="DeepFloyd/IF-II-L-v1.0"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="dreambooth_dog_upscale"
+export VALIDATION_IMAGES="dog_downsized/image_1.png dog_downsized/image_2.png dog_downsized/image_3.png dog_downsized/image_4.png"
+
+python train_dreambooth_lora.py \
+    --report_to wandb \
+    --pretrained_model_name_or_path=$MODEL_NAME \
+    --instance_data_dir=$INSTANCE_DIR \
+    --output_dir=$OUTPUT_DIR \
+    --instance_prompt="a sks dog" \
+    --resolution=256 \
+    --train_batch_size=4 \
+    --gradient_accumulation_steps=1 \
+    --learning_rate=1e-6 \
+    --max_train_steps=2000 \
+    --validation_prompt="a sks dog" \
+    --validation_epochs=100 \
+    --checkpointing_steps=500 \
+    --pre_compute_text_embeddings \
+    --tokenizer_max_length=77 \
+    --text_encoder_use_attention_mask \
+    --validation_images $VALIDATION_IMAGES \
+    --class_labels_conditioning=timesteps
+```
+
+### IF Stage I Full Dreambooth
+`--skip_save_text_encoder`: When training the full model, this will skip saving the entire T5 with the finetuned model. You can still load the pipeline
+with a T5 loaded from the original model.
+
+`use_8bit_adam`: Due to the size of the optimizer states, we recommend training the full XL IF model with 8bit adam.
+
+`--learning_rate=1e-7`: For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. Note that it is
+likely the learning rate can be increased with larger batch sizes.
+
+Using 8bit adam and a batch size of 4, the model can be trained in ~48 GB VRAM.
+
+`--validation_scheduler`: Set a particular scheduler via a string. We found that it is better to use the DDPMScheduler for validation when training DeepFloyd IF.
+
+```sh
+export MODEL_NAME="DeepFloyd/IF-I-XL-v1.0"
+
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="dreambooth_if"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=64 \
+  --train_batch_size=4 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-7 \
+  --max_train_steps=150 \
+  --validation_prompt "a photo of sks dog" \
+  --validation_steps 25 \
+  --text_encoder_use_attention_mask \
+  --tokenizer_max_length 77 \
+  --pre_compute_text_embeddings \
+  --use_8bit_adam \
+  --set_grads_to_none \
+  --skip_save_text_encoder \
+  --validation_scheduler DDPMScheduler \
+  --push_to_hub
+```
+
+### IF Stage II Full Dreambooth
+
+`--learning_rate=5e-6`: With a smaller effective batch size of 4, we found that we required learning rates as low as
+1e-8.
+
+`--resolution=256`: The upscaler expects higher resolution inputs
+
+`--train_batch_size=2` and `--gradient_accumulation_steps=6`: We found that full training of stage II particularly with
+faces required large effective batch sizes.
+
+```sh
+export MODEL_NAME="DeepFloyd/IF-II-L-v1.0"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="dreambooth_dog_upscale"
+export VALIDATION_IMAGES="dog_downsized/image_1.png dog_downsized/image_2.png dog_downsized/image_3.png dog_downsized/image_4.png"
+
+accelerate launch train_dreambooth.py \
+  --report_to wandb \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a sks dog" \
+  --resolution=256 \
+  --train_batch_size=2 \
+  --gradient_accumulation_steps=6 \
+  --learning_rate=5e-6 \
+  --max_train_steps=2000 \
+  --validation_prompt="a sks dog" \
+  --validation_steps=150 \
+  --checkpointing_steps=500 \
+  --pre_compute_text_embeddings \
+  --tokenizer_max_length=77 \
+  --text_encoder_use_attention_mask \
+  --validation_images $VALIDATION_IMAGES \
+  --class_labels_conditioning timesteps \
+  --validation_scheduler DDPMScheduler\
+  --push_to_hub
+```
+
+## Stable Diffusion XL
+
+We support fine-tuning of the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with DreamBooth and LoRA via the `train_dreambooth_lora_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md).
diff --git a/diffusers/examples/dreambooth/README_flux.md b/diffusers/examples/dreambooth/README_flux.md
new file mode 100644
index 0000000000000000000000000000000000000000..69dfd241395b8779ff037ea7d7da4902a9560962
--- /dev/null
+++ b/diffusers/examples/dreambooth/README_flux.md
@@ -0,0 +1,232 @@
+# DreamBooth training example for FLUX.1 [dev]
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
+
+The `train_dreambooth_flux.py` script shows how to implement the training procedure and adapt it for [FLUX.1 [dev]](https://blackforestlabs.ai/announcing-black-forest-labs/). We also provide a LoRA implementation in the `train_dreambooth_lora_flux.py` script.
+> [!NOTE]
+> **Memory consumption**
+>
+> Flux can be quite expensive to run on consumer hardware devices and as a result finetuning it comes with high memory requirements -
+> a LoRA with a rank of 16 (w/ all components trained) can exceed 40GB of VRAM for training.
+
+> For more tips & guidance on training on a resource-constrained device and general good practices please check out these great guides and trainers for FLUX: 
+> 1) [`@bghira`'s guide](https://github.com/bghira/SimpleTuner/blob/main/documentation/quickstart/FLUX.md)
+> 2) [`ostris`'s guide](https://github.com/ostris/ai-toolkit?tab=readme-ov-file#flux1-training)
+
+> [!NOTE]
+> **Gated model**
+>
+> As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
+
+```bash
+huggingface-cli login
+```
+
+This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the `examples/dreambooth` folder and run
+```bash
+pip install -r requirements_flux.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell (e.g., a notebook)
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+
+### Dog toy example
+
+Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example.
+
+Let's first download it locally:
+
+```python
+from huggingface_hub import snapshot_download
+
+local_dir = "./dog"
+snapshot_download(
+    "diffusers/dog-example",
+    local_dir=local_dir, repo_type="dataset",
+    ignore_patterns=".gitattributes",
+)
+```
+
+This will also allow us to push the trained LoRA parameters to the Hugging Face Hub platform.
+
+Now, we can launch training using:
+
+```bash
+export MODEL_NAME="black-forest-labs/FLUX.1-dev"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="trained-flux"
+
+accelerate launch train_dreambooth_flux.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --mixed_precision="bf16" \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=1024 \
+  --train_batch_size=1 \
+  --guidance_scale=1 \
+  --gradient_accumulation_steps=4 \
+  --optimizer="prodigy" \
+  --learning_rate=1. \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=25 \
+  --seed="0" \
+  --push_to_hub
+```
+
+To better track our training experiments, we're using the following flags in the command above:
+
+* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
+* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+> [!NOTE]
+> If you want to train using long prompts with the T5 text encoder, you can use `--max_sequence_length` to set the token limit. The default is 77, but it can be increased to as high as 512. Note that this will use more resources and may slow down the training in some cases.
+
+## LoRA + DreamBooth
+
+[LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a popular parameter-efficient fine-tuning technique that allows you to achieve full-finetuning like performance but with a fraction of learnable parameters.
+
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+### Prodigy Optimizer
+Prodigy is an adaptive optimizer that dynamically adjusts the learning rate learned parameters based on past gradients, allowing for more efficient convergence. 
+By using prodigy we can "eliminate" the need for manual learning rate tuning. read more [here](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers).
+
+to use prodigy, specify
+```bash
+--optimizer="prodigy"
+```
+> [!TIP]
+> When using prodigy it's generally good practice to set- `--learning_rate=1.0`
+
+To perform DreamBooth with LoRA, run:
+
+```bash
+export MODEL_NAME="black-forest-labs/FLUX.1-dev"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="trained-flux-lora"
+
+accelerate launch train_dreambooth_lora_flux.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --mixed_precision="bf16" \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --guidance_scale=1 \
+  --gradient_accumulation_steps=4 \
+  --optimizer="prodigy" \
+  --learning_rate=1. \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=25 \
+  --seed="0" \
+  --push_to_hub
+```
+
+### Text Encoder Training
+
+Alongside the transformer, fine-tuning of the CLIP text encoder is also supported.
+To do so, just specify `--train_text_encoder` while launching training. Please keep the following points in mind:
+
+> [!NOTE]
+> This is still an experimental feature. 
+> FLUX.1 has 2 text encoders (CLIP L/14 and T5-v1.1-XXL).
+By enabling `--train_text_encoder`, fine-tuning of the **CLIP encoder** is performed.
+> At the moment, T5 fine-tuning is not supported and weights remain frozen when text encoder training is enabled.
+
+To perform DreamBooth LoRA with text-encoder training, run:
+```bash
+export MODEL_NAME="black-forest-labs/FLUX.1-dev"
+export OUTPUT_DIR="trained-flux-dev-dreambooth-lora"
+
+accelerate launch train_dreambooth_lora_flux.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --mixed_precision="bf16" \
+  --train_text_encoder\
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --guidance_scale=1 \
+  --gradient_accumulation_steps=4 \
+  --optimizer="prodigy" \
+  --learning_rate=1. \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --seed="0" \
+  --push_to_hub
+```
+
+## Memory Optimizations
+As mentioned, Flux Dreambooth LoRA training is very memory intensive Here are some options (some still experimental) for a more memory efficient training.
+### Image Resolution
+An easy way to mitigate some of the memory requirements is through `--resolution`. `--resolution` refers to the resolution for input images, all the images in the train/validation dataset are resized to this.
+Note that by default, images are resized to resolution of 512, but it's good to keep in mind in case you're accustomed to training on higher resolutions. 
+### Gradient Checkpointing and Accumulation
+* `--gradient accumulation` refers to the number of updates steps to accumulate before performing a backward/update pass.
+by passing a value > 1 you can reduce the amount of backward/update passes and hence also memory reqs.
+* with `--gradient checkpointing` we can save memory by not storing all intermediate activations during the forward pass.
+Instead, only a subset of these activations (the checkpoints) are stored and the rest is recomputed as needed during the backward pass. Note that this comes at the expanse of a slower backward pass.
+### 8-bit-Adam Optimizer
+When training with `AdamW`(doesn't apply to `prodigy`) You can pass `--use_8bit_adam` to reduce the memory requirements of training. 
+Make sure to install `bitsandbytes` if you want to do so.
+### Latent caching
+When training w/o validation runs, we can pre-encode the training images with the vae, and then delete it to free up some memory. 
+to enable `latent_caching` simply pass `--cache_latents`.
+### Precision of saved LoRA layers
+By default, trained transformer layers are saved in the precision dtype in which training was performed. E.g. when training in mixed precision is enabled with `--mixed_precision="bf16"`, final finetuned layers will be saved in `torch.bfloat16` as well. 
+This reduces memory requirements significantly w/o a significant quality loss. Note that if you do wish to save the final layers in float32 at the expanse of more memory usage, you can do so by passing `--upcast_before_saving`.
+
+## Other notes
+Thanks to `bghira` and `ostris` for their help with reviewing & insight sharing ♥️
\ No newline at end of file
diff --git a/diffusers/examples/dreambooth/README_sd3.md b/diffusers/examples/dreambooth/README_sd3.md
new file mode 100644
index 0000000000000000000000000000000000000000..6f41c395629ac96307807448f02ab8e85a66ea5a
--- /dev/null
+++ b/diffusers/examples/dreambooth/README_sd3.md
@@ -0,0 +1,188 @@
+# DreamBooth training example for Stable Diffusion 3 (SD3)
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
+
+The `train_dreambooth_sd3.py` script shows how to implement the training procedure and adapt it for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206). We also provide a LoRA implementation in the `train_dreambooth_lora_sd3.py` script.
+
+> [!NOTE]
+> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
+
+```bash
+huggingface-cli login
+```
+
+This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the `examples/dreambooth` folder and run
+```bash
+pip install -r requirements_sd3.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell (e.g., a notebook)
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+
+### Dog toy example
+
+Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example.
+
+Let's first download it locally:
+
+```python
+from huggingface_hub import snapshot_download
+
+local_dir = "./dog"
+snapshot_download(
+    "diffusers/dog-example",
+    local_dir=local_dir, repo_type="dataset",
+    ignore_patterns=".gitattributes",
+)
+```
+
+This will also allow us to push the trained LoRA parameters to the Hugging Face Hub platform.
+
+Now, we can launch training using:
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="trained-sd3"
+
+accelerate launch train_dreambooth_sd3.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --mixed_precision="fp16" \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=1024 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --learning_rate=1e-4 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=25 \
+  --seed="0" \
+  --push_to_hub
+```
+
+To better track our training experiments, we're using the following flags in the command above:
+
+* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
+* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+> [!NOTE]
+> If you want to train using long prompts with the T5 text encoder, you can use `--max_sequence_length` to set the token limit. The default is 77, but it can be increased to as high as 512. Note that this will use more resources and may slow down the training in some cases.
+
+> [!TIP]
+> You can pass `--use_8bit_adam` to reduce the memory requirements of training. Make sure to install `bitsandbytes` if you want to do so.
+
+## LoRA + DreamBooth
+
+[LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a popular parameter-efficient fine-tuning technique that allows you to achieve full-finetuning like performance but with a fraction of learnable parameters.
+
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+To perform DreamBooth with LoRA, run:
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="trained-sd3-lora"
+
+accelerate launch train_dreambooth_lora_sd3.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --mixed_precision="fp16" \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --learning_rate=1e-5 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=25 \
+  --seed="0" \
+  --push_to_hub
+```
+
+### Text Encoder Training
+Alongside the transformer, LoRA fine-tuning of the CLIP text encoders is now also supported.
+To do so, just specify `--train_text_encoder` while launching training. Please keep the following points in mind:
+
+> [!NOTE]
+> SD3 has three text encoders (CLIP L/14, OpenCLIP bigG/14, and T5-v1.1-XXL).
+By enabling `--train_text_encoder`, LoRA fine-tuning of both **CLIP encoders** is performed. At the moment, T5 fine-tuning is not supported and weights remain frozen when text encoder training is enabled.
+
+To perform DreamBooth LoRA with text-encoder training, run:
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"
+export OUTPUT_DIR="trained-sd3-lora"
+
+accelerate launch train_dreambooth_lora_sd3.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --output_dir=$OUTPUT_DIR \
+  --dataset_name="Norod78/Yarn-art-style" \
+  --instance_prompt="a photo of TOK yarn art dog" \
+  --resolution=1024 \
+  --train_batch_size=1 \
+  --train_text_encoder\
+  --gradient_accumulation_steps=1 \
+  --optimizer="prodigy"\
+  --learning_rate=1.0 \
+  --text_encoder_lr=1.0 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=1500 \
+  --rank=32 \
+  --seed="0" \
+  --push_to_hub
+```
+
+## Other notes
+
+1. We default to the "logit_normal" weighting scheme for the loss following the SD3 paper. Thanks to @bghira for helping us discover that for other weighting schemes supported from the training script, training may incur numerical instabilities.
+2. Thanks to `bghira`, `JinxuXiang`, and `bendanzzc` for helping us discover a bug in how VAE encoding was being done previously. This has been fixed in [#8917](https://github.com/huggingface/diffusers/pull/8917).
+3. Additionally, we now have the option to control if we want to apply preconditioning to the model outputs via a `--precondition_outputs` CLI arg. It affects how the model `target` is calculated as well.
\ No newline at end of file
diff --git a/diffusers/examples/dreambooth/README_sdxl.md b/diffusers/examples/dreambooth/README_sdxl.md
new file mode 100644
index 0000000000000000000000000000000000000000..7a42bf8fffd8bf1db76f6db76302480be6dacf8c
--- /dev/null
+++ b/diffusers/examples/dreambooth/README_sdxl.md
@@ -0,0 +1,275 @@
+# DreamBooth training example for Stable Diffusion XL (SDXL)
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
+
+The `train_dreambooth_lora_sdxl.py` script shows how to implement the training procedure and adapt it for [Stable Diffusion XL](https://huggingface.co/papers/2307.01952).
+
+> 💡 **Note**: For now, we only allow DreamBooth fine-tuning of the SDXL UNet via LoRA. LoRA is a parameter-efficient fine-tuning technique introduced in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the `examples/dreambooth` folder and run
+```bash
+pip install -r requirements_sdxl.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell (e.g., a notebook)
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+### Dog toy example
+
+Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example.
+
+Let's first download it locally:
+
+```python
+from huggingface_hub import snapshot_download
+
+local_dir = "./dog"
+snapshot_download(
+    "diffusers/dog-example",
+    local_dir=local_dir, repo_type="dataset",
+    ignore_patterns=".gitattributes",
+)
+```
+
+This will also allow us to push the trained LoRA parameters to the Hugging Face Hub platform.
+
+Now, we can launch training using:
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="lora-trained-xl"
+export VAE_PATH="madebyollin/sdxl-vae-fp16-fix"
+
+accelerate launch train_dreambooth_lora_sdxl.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --pretrained_vae_model_name_or_path=$VAE_PATH \
+  --output_dir=$OUTPUT_DIR \
+  --mixed_precision="fp16" \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=1024 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --learning_rate=1e-4 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=25 \
+  --seed="0" \
+  --push_to_hub
+```
+
+To better track our training experiments, we're using the following flags in the command above:
+
+* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
+* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+Our experiments were conducted on a single 40GB A100 GPU.
+
+### Dog toy example with < 16GB VRAM
+
+By making use of [`gradient_checkpointing`](https://pytorch.org/docs/stable/checkpoint.html) (which is natively supported in Diffusers), [`xformers`](https://github.com/facebookresearch/xformers), and [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) libraries, you can train SDXL LoRAs with less than 16GB of VRAM by adding the following flags to your accelerate launch command:
+
+```diff
++  --enable_xformers_memory_efficient_attention \
++  --gradient_checkpointing \
++  --use_8bit_adam \
++  --mixed_precision="fp16" \
+```
+
+and making sure that you have the following libraries installed:
+
+```
+bitsandbytes>=0.40.0
+xformers>=0.0.20
+```
+
+### Inference
+
+Once training is done, we can perform inference like so:
+
+```python
+from huggingface_hub.repocard import RepoCard
+from diffusers import DiffusionPipeline
+import torch
+
+lora_model_id = <"lora-sdxl-dreambooth-id">
+card = RepoCard.load(lora_model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+pipe.load_lora_weights(lora_model_id)
+image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0]
+image.save("sks_dog.png")
+```
+
+We can further refine the outputs with the [Refiner](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0):
+
+```python
+from huggingface_hub.repocard import RepoCard
+from diffusers import DiffusionPipeline, StableDiffusionXLImg2ImgPipeline
+import torch
+
+lora_model_id = <"lora-sdxl-dreambooth-id">
+card = RepoCard.load(lora_model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+# Load the base pipeline and load the LoRA parameters into it.
+pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+pipe.load_lora_weights(lora_model_id)
+
+# Load the refiner.
+refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
+)
+refiner.to("cuda")
+
+prompt = "A picture of a sks dog in a bucket"
+generator = torch.Generator("cuda").manual_seed(0)
+
+# Run inference.
+image = pipe(prompt=prompt, output_type="latent", generator=generator).images[0]
+image = refiner(prompt=prompt, image=image[None, :], generator=generator).images[0]
+image.save("refined_sks_dog.png")
+```
+
+Here's a side-by-side comparison of the with and without Refiner pipeline outputs:
+
+| Without Refiner | With Refiner |
+|---|---|
+| ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/sks_dog.png) | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/refined_sks_dog.png) |
+
+### Training with text encoder(s)
+
+Alongside the UNet, LoRA fine-tuning of the text encoders is also supported. To do so, just specify `--train_text_encoder` while launching training. Please keep the following points in mind:
+
+* SDXL has two text encoders. So, we fine-tune both using LoRA.
+* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memory.
+
+### Specifying a better VAE
+
+SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
+
+## Notes
+
+In our experiments, we found that SDXL yields good initial results without extensive hyperparameter tuning. For example, without fine-tuning the text encoders and without using prior-preservation, we observed decent results. We didn't explore further hyper-parameter tuning experiments, but we do encourage the community to explore this avenue further and share their results with us 🤗
+
+## Results
+
+You can explore the results from a couple of our internal experiments by checking out this link: [https://wandb.ai/sayakpaul/dreambooth-lora-sd-xl](https://wandb.ai/sayakpaul/dreambooth-lora-sd-xl). Specifically, we used the same script with the exact same hyperparameters on the following datasets:
+
+* [Dogs](https://huggingface.co/datasets/diffusers/dog-example)
+* [Starbucks logo](https://huggingface.co/datasets/diffusers/starbucks-example)
+* [Mr. Potato Head](https://huggingface.co/datasets/diffusers/potato-head-example)
+* [Keramer face](https://huggingface.co/datasets/diffusers/keramer-face-example)
+
+## Running on a free-tier Colab Notebook
+
+Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb).
+
+## Conducting EDM-style training
+
+It's now possible to perform EDM-style training as proposed in [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364).
+
+For the SDXL model, simple set:
+
+```diff
++  --do_edm_style_training \
+```
+
+Other SDXL-like models that use the EDM formulation, such as [playgroundai/playground-v2.5-1024px-aesthetic](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic), can also be DreamBooth'd with the script. Below is an example command:
+
+```bash
+accelerate launch train_dreambooth_lora_sdxl.py \
+  --pretrained_model_name_or_path="playgroundai/playground-v2.5-1024px-aesthetic"  \
+  --instance_data_dir="dog" \
+  --output_dir="dog-playground-lora" \
+  --mixed_precision="fp16" \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=1024 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --learning_rate=1e-4 \
+  --use_8bit_adam \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=25 \
+  --seed="0" \
+  --push_to_hub
+```
+
+> [!CAUTION]
+> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".
+
+### DoRA training
+The script now supports DoRA training too!
+> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353),
+**DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters.
+The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference.
+
+> [!NOTE]
+> 💡DoRA training is still _experimental_
+> and is likely to require different hyperparameter values to perform best compared to a LoRA.
+> Specifically, we've noticed 2 differences to take into account your training:
+> 1. **LoRA seem to converge faster than DoRA** (so a set of parameters that may lead to overfitting when training a LoRA may be working well for a DoRA)
+> 2. **DoRA quality superior to LoRA especially in lower ranks** the difference in quality of DoRA of rank 8 and LoRA of rank 8 appears to be more significant than when training ranks of 32 or 64 for example.
+> This is also aligned with some of the quantitative analysis shown in the paper.
+
+**Usage**
+1. To use DoRA you need to upgrade the installation of `peft`:
+```bash
+pip install -U peft
+```
+2. Enable DoRA training by adding this flag
+```bash
+--use_dora
+```
+**Inference**
+The inference is the same as if you train a regular LoRA 🤗
+
+## Format compatibility
+
+You can pass `--output_kohya_format` to additionally generate a state dictionary which should be compatible with other platforms and tools such as Automatic 1111, Comfy, Kohya, etc. The `output_dir` will contain a file named "pytorch_lora_weights_kohya.safetensors".
\ No newline at end of file
diff --git a/diffusers/examples/dreambooth/requirements.txt b/diffusers/examples/dreambooth/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3f86855e1d1e03c20874e9de4a6037a119509444
--- /dev/null
+++ b/diffusers/examples/dreambooth/requirements.txt
@@ -0,0 +1,7 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
+peft==0.7.0
\ No newline at end of file
diff --git a/diffusers/examples/dreambooth/requirements_flax.txt b/diffusers/examples/dreambooth/requirements_flax.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8f85ad523a3b46b65abf0138c05ecdd656e6845c
--- /dev/null
+++ b/diffusers/examples/dreambooth/requirements_flax.txt
@@ -0,0 +1,8 @@
+transformers>=4.25.1
+flax
+optax
+torch
+torchvision
+ftfy
+tensorboard
+Jinja2
diff --git a/diffusers/examples/dreambooth/requirements_flux.txt b/diffusers/examples/dreambooth/requirements_flux.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dbc124ff6526c07b0357c42f3635e75cf484105f
--- /dev/null
+++ b/diffusers/examples/dreambooth/requirements_flux.txt
@@ -0,0 +1,8 @@
+accelerate>=0.31.0
+torchvision
+transformers>=4.41.2
+ftfy
+tensorboard
+Jinja2
+peft>=0.11.1
+sentencepiece
\ No newline at end of file
diff --git a/diffusers/examples/dreambooth/requirements_sd3.txt b/diffusers/examples/dreambooth/requirements_sd3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1c1ad1f3a7380ba5ed685d8ce0279939968dffdd
--- /dev/null
+++ b/diffusers/examples/dreambooth/requirements_sd3.txt
@@ -0,0 +1,8 @@
+accelerate>=0.31.0
+torchvision
+transformers>=4.41.2
+ftfy
+tensorboard
+Jinja2
+peft==0.11.1
+sentencepiece
\ No newline at end of file
diff --git a/diffusers/examples/dreambooth/requirements_sdxl.txt b/diffusers/examples/dreambooth/requirements_sdxl.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3f86855e1d1e03c20874e9de4a6037a119509444
--- /dev/null
+++ b/diffusers/examples/dreambooth/requirements_sdxl.txt
@@ -0,0 +1,7 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
+peft==0.7.0
\ No newline at end of file
diff --git a/diffusers/examples/dreambooth/test_dreambooth.py b/diffusers/examples/dreambooth/test_dreambooth.py
new file mode 100644
index 0000000000000000000000000000000000000000..86d0438fc25f4f0d8da4a98d9c1a6a9632dccb6c
--- /dev/null
+++ b/diffusers/examples/dreambooth/test_dreambooth.py
@@ -0,0 +1,227 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import shutil
+import sys
+import tempfile
+
+from diffusers import DiffusionPipeline, UNet2DConditionModel
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class DreamBooth(ExamplesTestsAccelerate):
+    def test_dreambooth(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
+
+    def test_dreambooth_if(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-if-pipe
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --pre_compute_text_embeddings
+                --tokenizer_max_length=77
+                --text_encoder_use_attention_mask
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
+
+    def test_dreambooth_checkpointing(self):
+        instance_prompt = "photo"
+        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/dreambooth/train_dreambooth.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt {instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            # check can run the original fully trained output pipeline
+            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
+            pipe(instance_prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
+
+            # check can run an intermediate checkpoint
+            unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
+            pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
+            pipe(instance_prompt, num_inference_steps=1)
+
+            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
+            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
+
+            # Run training script for 7 total steps resuming from checkpoint 4
+
+            resume_run_args = f"""
+                examples/dreambooth/train_dreambooth.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt {instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --resume_from_checkpoint=checkpoint-4
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check can run new fully trained pipeline
+            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
+            pipe(instance_prompt, num_inference_steps=1)
+
+            # check old checkpoints do not exist
+            self.assertFalse(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
+
+            # check new checkpoints exist
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6")))
+
+    def test_dreambooth_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/dreambooth/train_dreambooth.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --instance_data_dir=docs/source/en/imgs
+            --output_dir={tmpdir}
+            --instance_prompt=prompt
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_dreambooth_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/dreambooth/train_dreambooth.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --instance_data_dir=docs/source/en/imgs
+            --output_dir={tmpdir}
+            --instance_prompt=prompt
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            resume_run_args = f"""
+            examples/dreambooth/train_dreambooth.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --instance_data_dir=docs/source/en/imgs
+            --output_dir={tmpdir}
+            --instance_prompt=prompt
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-4
+            --checkpoints_total_limit=2
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
diff --git a/diffusers/examples/dreambooth/test_dreambooth_flux.py b/diffusers/examples/dreambooth/test_dreambooth_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d5703d2a24aad78dc1fe7c3b1a91187ed3f8614
--- /dev/null
+++ b/diffusers/examples/dreambooth/test_dreambooth_flux.py
@@ -0,0 +1,203 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import shutil
+import sys
+import tempfile
+
+from diffusers import DiffusionPipeline, FluxTransformer2DModel
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class DreamBoothFlux(ExamplesTestsAccelerate):
+    instance_data_dir = "docs/source/en/imgs"
+    instance_prompt = "photo"
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-pipe"
+    script_path = "examples/dreambooth/train_dreambooth_flux.py"
+
+    def test_dreambooth(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "transformer", "diffusion_pytorch_model.safetensors")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
+
+    def test_dreambooth_checkpointing(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            # check can run the original fully trained output pipeline
+            pipe = DiffusionPipeline.from_pretrained(tmpdir)
+            pipe(self.instance_prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
+
+            # check can run an intermediate checkpoint
+            transformer = FluxTransformer2DModel.from_pretrained(tmpdir, subfolder="checkpoint-2/transformer")
+            pipe = DiffusionPipeline.from_pretrained(self.pretrained_model_name_or_path, transformer=transformer)
+            pipe(self.instance_prompt, num_inference_steps=1)
+
+            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
+            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
+
+            # Run training script for 7 total steps resuming from checkpoint 4
+
+            resume_run_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --resume_from_checkpoint=checkpoint-4
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check can run new fully trained pipeline
+            pipe = DiffusionPipeline.from_pretrained(tmpdir)
+            pipe(self.instance_prompt, num_inference_steps=1)
+
+            # check old checkpoints do not exist
+            self.assertFalse(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
+
+            # check new checkpoints exist
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6")))
+
+    def test_dreambooth_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_dreambooth_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            resume_run_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-4
+            --checkpoints_total_limit=2
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
diff --git a/diffusers/examples/dreambooth/test_dreambooth_lora.py b/diffusers/examples/dreambooth/test_dreambooth_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..9df104e071af9fc02d573678f86a651e80aa16d6
--- /dev/null
+++ b/diffusers/examples/dreambooth/test_dreambooth_lora.py
@@ -0,0 +1,379 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+import safetensors
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+from diffusers import DiffusionPipeline  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class DreamBoothLoRA(ExamplesTestsAccelerate):
+    def test_dreambooth_lora(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"unet"` in their names.
+            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_unet)
+
+    def test_dreambooth_lora_with_text_encoder(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --train_text_encoder
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # check `text_encoder` is present at all.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            keys = lora_state_dict.keys()
+            is_text_encoder_present = any(k.startswith("text_encoder") for k in keys)
+            self.assertTrue(is_text_encoder_present)
+
+            # the names of the keys of the state dict should either start with `unet`
+            # or `text_encoder`.
+            is_correct_naming = all(k.startswith("unet") or k.startswith("text_encoder") for k in keys)
+            self.assertTrue(is_correct_naming)
+
+    def test_dreambooth_lora_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/dreambooth/train_dreambooth_lora.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --instance_data_dir=docs/source/en/imgs
+            --output_dir={tmpdir}
+            --instance_prompt=prompt
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_dreambooth_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/dreambooth/train_dreambooth_lora.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --instance_data_dir=docs/source/en/imgs
+            --output_dir={tmpdir}
+            --instance_prompt=prompt
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-2", "checkpoint-4"})
+
+            resume_run_args = f"""
+            examples/dreambooth/train_dreambooth_lora.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --instance_data_dir=docs/source/en/imgs
+            --output_dir={tmpdir}
+            --instance_prompt=prompt
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-4
+            --checkpoints_total_limit=2
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
+
+    def test_dreambooth_lora_if_model(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-if-pipe
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --pre_compute_text_embeddings
+                --tokenizer_max_length=77
+                --text_encoder_use_attention_mask
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"unet"` in their names.
+            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_unet)
+
+
+class DreamBoothLoRASDXL(ExamplesTestsAccelerate):
+    def test_dreambooth_lora_sdxl(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"unet"` in their names.
+            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_unet)
+
+    def test_dreambooth_lora_sdxl_with_text_encoder(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --train_text_encoder
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"unet"` or `"text_encoder"` or `"text_encoder_2"` in their names.
+            keys = lora_state_dict.keys()
+            starts_with_unet = all(
+                k.startswith("unet") or k.startswith("text_encoder") or k.startswith("text_encoder_2") for k in keys
+            )
+            self.assertTrue(starts_with_unet)
+
+    def test_dreambooth_lora_sdxl_custom_captions(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --caption_column text
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+    def test_dreambooth_lora_sdxl_text_encoder_custom_captions(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --caption_column text
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --train_text_encoder
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+    def test_dreambooth_lora_sdxl_checkpointing_checkpoints_total_limit(self):
+        pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora_sdxl.py
+                --pretrained_model_name_or_path {pipeline_path}
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            pipe = DiffusionPipeline.from_pretrained(pipeline_path)
+            pipe.load_lora_weights(tmpdir)
+            pipe("a prompt", num_inference_steps=1)
+
+            # check checkpoint directories exist
+            # checkpoint-2 should have been deleted
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
+
+    def test_dreambooth_lora_sdxl_text_encoder_checkpointing_checkpoints_total_limit(self):
+        pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora_sdxl.py
+                --pretrained_model_name_or_path {pipeline_path}
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 7
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                --train_text_encoder
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            pipe = DiffusionPipeline.from_pretrained(pipeline_path)
+            pipe.load_lora_weights(tmpdir)
+            pipe("a prompt", num_inference_steps=2)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                # checkpoint-2 should have been deleted
+                {"checkpoint-4", "checkpoint-6"},
+            )
diff --git a/diffusers/examples/dreambooth/test_dreambooth_lora_edm.py b/diffusers/examples/dreambooth/test_dreambooth_lora_edm.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f6b3674b8b35a53ed1145c5415886f89f134726
--- /dev/null
+++ b/diffusers/examples/dreambooth/test_dreambooth_lora_edm.py
@@ -0,0 +1,99 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+import safetensors
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class DreamBoothLoRASDXLWithEDM(ExamplesTestsAccelerate):
+    def test_dreambooth_lora_sdxl_with_edm(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --do_edm_style_training
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"unet"` in their names.
+            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_unet)
+
+    def test_dreambooth_lora_playground(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/dreambooth/train_dreambooth_lora_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-playground-v2-5-pipe
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt photo
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"unet"` in their names.
+            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_unet)
diff --git a/diffusers/examples/dreambooth/test_dreambooth_lora_flux.py b/diffusers/examples/dreambooth/test_dreambooth_lora_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..d197c8187b8745fc44a412474ae338fb16ab6414
--- /dev/null
+++ b/diffusers/examples/dreambooth/test_dreambooth_lora_flux.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+import safetensors
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class DreamBoothLoRAFlux(ExamplesTestsAccelerate):
+    instance_data_dir = "docs/source/en/imgs"
+    instance_prompt = "photo"
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-pipe"
+    script_path = "examples/dreambooth/train_dreambooth_lora_flux.py"
+
+    def test_dreambooth_lora_flux(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"transformer"` in their names.
+            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_transformer)
+
+    def test_dreambooth_lora_text_encoder_flux(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --train_text_encoder
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            starts_with_expected_prefix = all(
+                (key.startswith("transformer") or key.startswith("text_encoder")) for key in lora_state_dict.keys()
+            )
+            self.assertTrue(starts_with_expected_prefix)
+
+    def test_dreambooth_lora_latent_caching(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --cache_latents
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"transformer"` in their names.
+            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_transformer)
+
+    def test_dreambooth_lora_flux_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_dreambooth_lora_flux_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-2", "checkpoint-4"})
+
+            resume_run_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-4
+            --checkpoints_total_limit=2
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
diff --git a/diffusers/examples/dreambooth/test_dreambooth_lora_sd3.py b/diffusers/examples/dreambooth/test_dreambooth_lora_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..518738b78246b3f1d97c5a79d6f378a36d3e96d3
--- /dev/null
+++ b/diffusers/examples/dreambooth/test_dreambooth_lora_sd3.py
@@ -0,0 +1,165 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+import safetensors
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class DreamBoothLoRASD3(ExamplesTestsAccelerate):
+    instance_data_dir = "docs/source/en/imgs"
+    instance_prompt = "photo"
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-sd3-pipe"
+    script_path = "examples/dreambooth/train_dreambooth_lora_sd3.py"
+
+    def test_dreambooth_lora_sd3(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"transformer"` in their names.
+            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_transformer)
+
+    def test_dreambooth_lora_text_encoder_sd3(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --train_text_encoder
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            starts_with_expected_prefix = all(
+                (key.startswith("transformer") or key.startswith("text_encoder")) for key in lora_state_dict.keys()
+            )
+            self.assertTrue(starts_with_expected_prefix)
+
+    def test_dreambooth_lora_sd3_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_dreambooth_lora_sd3_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-2", "checkpoint-4"})
+
+            resume_run_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-4
+            --checkpoints_total_limit=2
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
diff --git a/diffusers/examples/dreambooth/test_dreambooth_sd3.py b/diffusers/examples/dreambooth/test_dreambooth_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..19fb7243cefdb8fefd2b6fc0b3d36e0c1522c1bf
--- /dev/null
+++ b/diffusers/examples/dreambooth/test_dreambooth_sd3.py
@@ -0,0 +1,203 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import shutil
+import sys
+import tempfile
+
+from diffusers import DiffusionPipeline, SD3Transformer2DModel
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class DreamBoothSD3(ExamplesTestsAccelerate):
+    instance_data_dir = "docs/source/en/imgs"
+    instance_prompt = "photo"
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-sd3-pipe"
+    script_path = "examples/dreambooth/train_dreambooth_sd3.py"
+
+    def test_dreambooth(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "transformer", "diffusion_pytorch_model.safetensors")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
+
+    def test_dreambooth_checkpointing(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            # check can run the original fully trained output pipeline
+            pipe = DiffusionPipeline.from_pretrained(tmpdir)
+            pipe(self.instance_prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
+
+            # check can run an intermediate checkpoint
+            transformer = SD3Transformer2DModel.from_pretrained(tmpdir, subfolder="checkpoint-2/transformer")
+            pipe = DiffusionPipeline.from_pretrained(self.pretrained_model_name_or_path, transformer=transformer)
+            pipe(self.instance_prompt, num_inference_steps=1)
+
+            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
+            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
+
+            # Run training script for 7 total steps resuming from checkpoint 4
+
+            resume_run_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --resume_from_checkpoint=checkpoint-4
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check can run new fully trained pipeline
+            pipe = DiffusionPipeline.from_pretrained(tmpdir)
+            pipe(self.instance_prompt, num_inference_steps=1)
+
+            # check old checkpoints do not exist
+            self.assertFalse(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
+
+            # check new checkpoints exist
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6")))
+
+    def test_dreambooth_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_dreambooth_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            resume_run_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-4
+            --checkpoints_total_limit=2
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
diff --git a/diffusers/examples/dreambooth/train_dreambooth.py b/diffusers/examples/dreambooth/train_dreambooth.py
new file mode 100644
index 0000000000000000000000000000000000000000..5099107118e4e10767d5e9ecb0babfac7a437e5f
--- /dev/null
+++ b/diffusers/examples/dreambooth/train_dreambooth.py
@@ -0,0 +1,1443 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import gc
+import importlib
+import itertools
+import logging
+import math
+import os
+import shutil
+import warnings
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, model_info, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from packaging import version
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import compute_snr
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images: list = None,
+    base_model: str = None,
+    train_text_encoder=False,
+    prompt: str = None,
+    repo_folder: str = None,
+    pipeline: DiffusionPipeline = None,
+):
+    img_str = ""
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# DreamBooth - {repo_id}
+
+This is a dreambooth model derived from {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/).
+You can find some example images in the following. \n
+{img_str}
+
+DreamBooth for the text encoder was enabled: {train_text_encoder}.
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        prompt=prompt,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = ["text-to-image", "dreambooth", "diffusers-training"]
+    if isinstance(pipeline, StableDiffusionPipeline):
+        tags.extend(["stable-diffusion", "stable-diffusion-diffusers"])
+    else:
+        tags.extend(["if", "if-diffusers"])
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    text_encoder,
+    tokenizer,
+    unet,
+    vae,
+    args,
+    accelerator,
+    weight_dtype,
+    global_step,
+    prompt_embeds,
+    negative_prompt_embeds,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+
+    pipeline_args = {}
+
+    if vae is not None:
+        pipeline_args["vae"] = vae
+
+    # create pipeline (note: unet and vae are loaded again in float32)
+    pipeline = DiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        unet=unet,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+        **pipeline_args,
+    )
+
+    # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+    scheduler_args = {}
+
+    if "variance_type" in pipeline.scheduler.config:
+        variance_type = pipeline.scheduler.config.variance_type
+
+        if variance_type in ["learned", "learned_range"]:
+            variance_type = "fixed_small"
+
+        scheduler_args["variance_type"] = variance_type
+
+    module = importlib.import_module("diffusers")
+    scheduler_class = getattr(module, args.validation_scheduler)
+    pipeline.scheduler = scheduler_class.from_config(pipeline.scheduler.config, **scheduler_args)
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.pre_compute_text_embeddings:
+        pipeline_args = {
+            "prompt_embeds": prompt_embeds,
+            "negative_prompt_embeds": negative_prompt_embeds,
+        }
+    else:
+        pipeline_args = {"prompt": args.validation_prompt}
+
+    # run inference
+    generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed)
+    images = []
+    if args.validation_images is None:
+        for _ in range(args.num_validation_images):
+            with torch.autocast("cuda"):
+                image = pipeline(**pipeline_args, num_inference_steps=25, generator=generator).images[0]
+            images.append(image)
+    else:
+        for image in args.validation_images:
+            image = Image.open(image)
+            image = pipeline(**pipeline_args, image=image, generator=generator).images[0]
+            images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, global_step, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+
+        return RobertaSeriesModelWithTransformation
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="dreambooth-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
+            "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
+            "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
+            "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
+            "instructions."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more details"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+
+    parser.add_argument(
+        "--offset_noise",
+        action="store_true",
+        default=False,
+        help=(
+            "Fine-tuning against a modified noise"
+            " See: https://www.crosslabs.org//blog/diffusion-with-offset-noise for more information."
+        ),
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--pre_compute_text_embeddings",
+        action="store_true",
+        help="Whether or not to pre-compute text embeddings. If text embeddings are pre-computed, the text encoder will not be kept in memory during training and will leave more GPU memory available for training the rest of the model. This is not compatible with `--train_text_encoder`.",
+    )
+    parser.add_argument(
+        "--tokenizer_max_length",
+        type=int,
+        default=None,
+        required=False,
+        help="The maximum length of the tokenizer. If not set, will default to the tokenizer's max length.",
+    )
+    parser.add_argument(
+        "--text_encoder_use_attention_mask",
+        action="store_true",
+        required=False,
+        help="Whether to use attention mask for the text encoder",
+    )
+    parser.add_argument(
+        "--skip_save_text_encoder", action="store_true", required=False, help="Set to not save text encoder"
+    )
+    parser.add_argument(
+        "--validation_images",
+        required=False,
+        default=None,
+        nargs="+",
+        help="Optional set of images to use for validation. Used when the target pipeline takes an initial image as input such as when training image variation or superresolution.",
+    )
+    parser.add_argument(
+        "--class_labels_conditioning",
+        required=False,
+        default=None,
+        help="The optional `class_label` conditioning to pass to the unet, available values are `timesteps`.",
+    )
+    parser.add_argument(
+        "--validation_scheduler",
+        type=str,
+        default="DPMSolverMultistepScheduler",
+        choices=["DPMSolverMultistepScheduler", "DDPMScheduler"],
+        help="Select which scheduler to use for validation. DDPMScheduler is recommended for DeepFloyd IF.",
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    if args.train_text_encoder and args.pre_compute_text_embeddings:
+        raise ValueError("`--train_text_encoder` cannot be used with `--pre_compute_text_embeddings`")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        class_num=None,
+        size=512,
+        center_crop=False,
+        encoder_hidden_states=None,
+        class_prompt_encoder_hidden_states=None,
+        tokenizer_max_length=None,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+        self.encoder_hidden_states = encoder_hidden_states
+        self.class_prompt_encoder_hidden_states = class_prompt_encoder_hidden_states
+        self.tokenizer_max_length = tokenizer_max_length
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError(f"Instance {self.instance_data_root} images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        instance_image = exif_transpose(instance_image)
+
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        example["instance_images"] = self.image_transforms(instance_image)
+
+        if self.encoder_hidden_states is not None:
+            example["instance_prompt_ids"] = self.encoder_hidden_states
+        else:
+            text_inputs = tokenize_prompt(
+                self.tokenizer, self.instance_prompt, tokenizer_max_length=self.tokenizer_max_length
+            )
+            example["instance_prompt_ids"] = text_inputs.input_ids
+            example["instance_attention_mask"] = text_inputs.attention_mask
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+
+            if self.class_prompt_encoder_hidden_states is not None:
+                example["class_prompt_ids"] = self.class_prompt_encoder_hidden_states
+            else:
+                class_text_inputs = tokenize_prompt(
+                    self.tokenizer, self.class_prompt, tokenizer_max_length=self.tokenizer_max_length
+                )
+                example["class_prompt_ids"] = class_text_inputs.input_ids
+                example["class_attention_mask"] = class_text_inputs.attention_mask
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    has_attention_mask = "instance_attention_mask" in examples[0]
+
+    input_ids = [example["instance_prompt_ids"] for example in examples]
+    pixel_values = [example["instance_images"] for example in examples]
+
+    if has_attention_mask:
+        attention_mask = [example["instance_attention_mask"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        input_ids += [example["class_prompt_ids"] for example in examples]
+        pixel_values += [example["class_images"] for example in examples]
+
+        if has_attention_mask:
+            attention_mask += [example["class_attention_mask"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    input_ids = torch.cat(input_ids, dim=0)
+
+    batch = {
+        "input_ids": input_ids,
+        "pixel_values": pixel_values,
+    }
+
+    if has_attention_mask:
+        attention_mask = torch.cat(attention_mask, dim=0)
+        batch["attention_mask"] = attention_mask
+
+    return batch
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def model_has_vae(args):
+    config_file_name = Path("vae", AutoencoderKL.config_name).as_posix()
+    if os.path.isdir(args.pretrained_model_name_or_path):
+        config_file_name = os.path.join(args.pretrained_model_name_or_path, config_file_name)
+        return os.path.isfile(config_file_name)
+    else:
+        files_in_repo = model_info(args.pretrained_model_name_or_path, revision=args.revision).siblings
+        return any(file.rfilename == config_file_name for file in files_in_repo)
+
+
+def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None):
+    if tokenizer_max_length is not None:
+        max_length = tokenizer_max_length
+    else:
+        max_length = tokenizer.model_max_length
+
+    text_inputs = tokenizer(
+        prompt,
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt",
+    )
+
+    return text_inputs
+
+
+def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_attention_mask=None):
+    text_input_ids = input_ids.to(text_encoder.device)
+
+    if text_encoder_use_attention_mask:
+        attention_mask = attention_mask.to(text_encoder.device)
+    else:
+        attention_mask = None
+
+    prompt_embeds = text_encoder(
+        text_input_ids,
+        attention_mask=attention_mask,
+        return_dict=False,
+    )
+    prompt_embeds = prompt_embeds[0]
+
+    return prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                safety_checker=None,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+
+    # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+
+    if model_has_vae(args):
+        vae = AutoencoderKL.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+        )
+    else:
+        vae = None
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            for model in models:
+                sub_dir = "unet" if isinstance(model, type(unwrap_model(unet))) else "text_encoder"
+                model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+    def load_model_hook(models, input_dir):
+        while len(models) > 0:
+            # pop models so that they are not loaded again
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(text_encoder))):
+                # load transformers style into model
+                load_model = text_encoder_cls.from_pretrained(input_dir, subfolder="text_encoder")
+                model.config = load_model.config
+            else:
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+            model.load_state_dict(load_model.state_dict())
+            del load_model
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if vae is not None:
+        vae.requires_grad_(False)
+
+    if not args.train_text_encoder:
+        text_encoder.requires_grad_(False)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        "Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training. copy of the weights should still be float32."
+    )
+
+    if unwrap_model(unet).dtype != torch.float32:
+        raise ValueError(f"Unet loaded as datatype {unwrap_model(unet).dtype}. {low_precision_error_string}")
+
+    if args.train_text_encoder and unwrap_model(text_encoder).dtype != torch.float32:
+        raise ValueError(
+            f"Text encoder loaded as datatype {unwrap_model(text_encoder).dtype}." f" {low_precision_error_string}"
+        )
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = (
+        itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
+    )
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    if args.pre_compute_text_embeddings:
+
+        def compute_text_embeddings(prompt):
+            with torch.no_grad():
+                text_inputs = tokenize_prompt(tokenizer, prompt, tokenizer_max_length=args.tokenizer_max_length)
+                prompt_embeds = encode_prompt(
+                    text_encoder,
+                    text_inputs.input_ids,
+                    text_inputs.attention_mask,
+                    text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+                )
+
+            return prompt_embeds
+
+        pre_computed_encoder_hidden_states = compute_text_embeddings(args.instance_prompt)
+        validation_prompt_negative_prompt_embeds = compute_text_embeddings("")
+
+        if args.validation_prompt is not None:
+            validation_prompt_encoder_hidden_states = compute_text_embeddings(args.validation_prompt)
+        else:
+            validation_prompt_encoder_hidden_states = None
+
+        if args.class_prompt is not None:
+            pre_computed_class_prompt_encoder_hidden_states = compute_text_embeddings(args.class_prompt)
+        else:
+            pre_computed_class_prompt_encoder_hidden_states = None
+
+        text_encoder = None
+        tokenizer = None
+
+        gc.collect()
+        torch.cuda.empty_cache()
+    else:
+        pre_computed_encoder_hidden_states = None
+        validation_prompt_encoder_hidden_states = None
+        validation_prompt_negative_prompt_embeds = None
+        pre_computed_class_prompt_encoder_hidden_states = None
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        class_num=args.num_class_images,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+        encoder_hidden_states=pre_computed_encoder_hidden_states,
+        class_prompt_encoder_hidden_states=pre_computed_class_prompt_encoder_hidden_states,
+        tokenizer_max_length=args.tokenizer_max_length,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae and text_encoder to device and cast to weight_dtype
+    if vae is not None:
+        vae.to(accelerator.device, dtype=weight_dtype)
+
+    if not args.train_text_encoder and text_encoder is not None:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = vars(copy.deepcopy(args))
+        tracker_config.pop("validation_images")
+        accelerator.init_trackers("dreambooth", config=tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.train_text_encoder:
+            text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+
+                if vae is not None:
+                    # Convert images to latent space
+                    model_input = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                    model_input = model_input * vae.config.scaling_factor
+                else:
+                    model_input = pixel_values
+
+                # Sample noise that we'll add to the model input
+                if args.offset_noise:
+                    noise = torch.randn_like(model_input) + 0.1 * torch.randn(
+                        model_input.shape[0], model_input.shape[1], 1, 1, device=model_input.device
+                    )
+                else:
+                    noise = torch.randn_like(model_input)
+                bsz, channels, height, width = model_input.shape
+                # Sample a random timestep for each image
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                )
+                timesteps = timesteps.long()
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                if args.pre_compute_text_embeddings:
+                    encoder_hidden_states = batch["input_ids"]
+                else:
+                    encoder_hidden_states = encode_prompt(
+                        text_encoder,
+                        batch["input_ids"],
+                        batch["attention_mask"],
+                        text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+                    )
+
+                if unwrap_model(unet).config.in_channels == channels * 2:
+                    noisy_model_input = torch.cat([noisy_model_input, noisy_model_input], dim=1)
+
+                if args.class_labels_conditioning == "timesteps":
+                    class_labels = timesteps
+                else:
+                    class_labels = None
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_model_input, timesteps, encoder_hidden_states, class_labels=class_labels, return_dict=False
+                )[0]
+
+                if model_pred.shape[1] == 6:
+                    model_pred, _ = torch.chunk(model_pred, 2, dim=1)
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+                    # Compute prior loss
+                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                # Compute instance loss
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    base_weight = (
+                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
+                    )
+
+                    if noise_scheduler.config.prediction_type == "v_prediction":
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(unet.parameters(), text_encoder.parameters())
+                        if args.train_text_encoder
+                        else unet.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    images = []
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        images = log_validation(
+                            unwrap_model(text_encoder) if text_encoder is not None else text_encoder,
+                            tokenizer,
+                            unwrap_model(unet),
+                            vae,
+                            args,
+                            accelerator,
+                            weight_dtype,
+                            global_step,
+                            validation_prompt_encoder_hidden_states,
+                            validation_prompt_negative_prompt_embeds,
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        pipeline_args = {}
+
+        if text_encoder is not None:
+            pipeline_args["text_encoder"] = unwrap_model(text_encoder)
+
+        if args.skip_save_text_encoder:
+            pipeline_args["text_encoder"] = None
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            unet=unwrap_model(unet),
+            revision=args.revision,
+            variant=args.variant,
+            **pipeline_args,
+        )
+
+        # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+        scheduler_args = {}
+
+        if "variance_type" in pipeline.scheduler.config:
+            variance_type = pipeline.scheduler.config.variance_type
+
+            if variance_type in ["learned", "learned_range"]:
+                variance_type = "fixed_small"
+
+            scheduler_args["variance_type"] = variance_type
+
+        pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
+        pipeline.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                train_text_encoder=args.train_text_encoder,
+                prompt=args.instance_prompt,
+                repo_folder=args.output_dir,
+                pipeline=pipeline,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/dreambooth/train_dreambooth_flax.py b/diffusers/examples/dreambooth/train_dreambooth_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..29fd5e78535de36097f6adc52dec00d475b78aba
--- /dev/null
+++ b/diffusers/examples/dreambooth/train_dreambooth_flax.py
@@ -0,0 +1,701 @@
+import argparse
+import logging
+import math
+import os
+from pathlib import Path
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+import optax
+import torch
+import torch.utils.checkpoint
+import transformers
+from flax import jax_utils
+from flax.training import train_state
+from flax.training.common_utils import shard
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from jax.experimental.compilation_cache import compilation_cache as cc
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
+
+from diffusers import (
+    FlaxAutoencoderKL,
+    FlaxDDPMScheduler,
+    FlaxPNDMScheduler,
+    FlaxStableDiffusionPipeline,
+    FlaxUNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
+from diffusers.utils import check_min_version
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+# Cache compiled models across invocations of this script.
+cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
+
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained vae or vae identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--save_steps", type=int, default=None, help="Save a checkpoint every X steps.")
+    parser.add_argument("--seed", type=int, default=0, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.instance_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        class_num=None,
+        size=512,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        example["instance_images"] = self.image_transforms(instance_image)
+        example["instance_prompt_ids"] = self.tokenizer(
+            self.instance_prompt,
+            padding="do_not_pad",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+        ).input_ids
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt_ids"] = self.tokenizer(
+                self.class_prompt,
+                padding="do_not_pad",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids
+
+        return example
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def get_params_to_save(params):
+    return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
+
+
+def main():
+    args = parse_args()
+
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    rng = jax.random.PRNGKey(args.seed)
+
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path, safety_checker=None, revision=args.revision
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            total_sample_batch_size = args.sample_batch_size * jax.local_device_count()
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=total_sample_batch_size)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not jax.process_index() == 0
+            ):
+                prompt_ids = pipeline.prepare_inputs(example["prompt"])
+                prompt_ids = shard(prompt_ids)
+                p_params = jax_utils.replicate(params)
+                rng = jax.random.split(rng)[0]
+                sample_rng = jax.random.split(rng, jax.device_count())
+                images = pipeline(prompt_ids, p_params, sample_rng, jit=True).images
+                images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
+                images = pipeline.numpy_to_pil(np.array(images))
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+
+    # Handle the repository creation
+    if jax.process_index() == 0:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer and add the placeholder token as a additional special token
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+        )
+    else:
+        raise NotImplementedError("No tokenizer specified!")
+
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        class_num=args.num_class_images,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+    )
+
+    def collate_fn(examples):
+        input_ids = [example["instance_prompt_ids"] for example in examples]
+        pixel_values = [example["instance_images"] for example in examples]
+
+        # Concat class and instance examples for prior preservation.
+        # We do this to avoid doing two forward passes.
+        if args.with_prior_preservation:
+            input_ids += [example["class_prompt_ids"] for example in examples]
+            pixel_values += [example["class_images"] for example in examples]
+
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+        input_ids = tokenizer.pad(
+            {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
+        ).input_ids
+
+        batch = {
+            "input_ids": input_ids,
+            "pixel_values": pixel_values,
+        }
+        batch = {k: v.numpy() for k, v in batch.items()}
+        return batch
+
+    total_train_batch_size = args.train_batch_size * jax.local_device_count()
+    if len(train_dataset) < total_train_batch_size:
+        raise ValueError(
+            f"Training batch size is {total_train_batch_size}, but your dataset only contains"
+            f" {len(train_dataset)} images. Please, use a larger dataset or reduce the effective batch size. Note that"
+            f" there are {jax.local_device_count()} parallel devices, so your batch size can't be smaller than that."
+        )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=total_train_batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True
+    )
+
+    weight_dtype = jnp.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = jnp.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = jnp.bfloat16
+
+    if args.pretrained_vae_name_or_path:
+        # TODO(patil-suraj): Upload flax weights for the VAE
+        vae_arg, vae_kwargs = (args.pretrained_vae_name_or_path, {"from_pt": True})
+    else:
+        vae_arg, vae_kwargs = (args.pretrained_model_name_or_path, {"subfolder": "vae", "revision": args.revision})
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = FlaxCLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        dtype=weight_dtype,
+        revision=args.revision,
+    )
+    vae, vae_params = FlaxAutoencoderKL.from_pretrained(
+        vae_arg,
+        dtype=weight_dtype,
+        **vae_kwargs,
+    )
+    unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="unet",
+        dtype=weight_dtype,
+        revision=args.revision,
+    )
+
+    # Optimization
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * total_train_batch_size
+
+    constant_scheduler = optax.constant_schedule(args.learning_rate)
+
+    adamw = optax.adamw(
+        learning_rate=constant_scheduler,
+        b1=args.adam_beta1,
+        b2=args.adam_beta2,
+        eps=args.adam_epsilon,
+        weight_decay=args.adam_weight_decay,
+    )
+
+    optimizer = optax.chain(
+        optax.clip_by_global_norm(args.max_grad_norm),
+        adamw,
+    )
+
+    unet_state = train_state.TrainState.create(apply_fn=unet.__call__, params=unet_params, tx=optimizer)
+    text_encoder_state = train_state.TrainState.create(
+        apply_fn=text_encoder.__call__, params=text_encoder.params, tx=optimizer
+    )
+
+    noise_scheduler = FlaxDDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
+    )
+    noise_scheduler_state = noise_scheduler.create_state()
+
+    # Initialize our training
+    train_rngs = jax.random.split(rng, jax.local_device_count())
+
+    def train_step(unet_state, text_encoder_state, vae_params, batch, train_rng):
+        dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3)
+
+        if args.train_text_encoder:
+            params = {"text_encoder": text_encoder_state.params, "unet": unet_state.params}
+        else:
+            params = {"unet": unet_state.params}
+
+        def compute_loss(params):
+            # Convert images to latent space
+            vae_outputs = vae.apply(
+                {"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode
+            )
+            latents = vae_outputs.latent_dist.sample(sample_rng)
+            # (NHWC) -> (NCHW)
+            latents = jnp.transpose(latents, (0, 3, 1, 2))
+            latents = latents * vae.config.scaling_factor
+
+            # Sample noise that we'll add to the latents
+            noise_rng, timestep_rng = jax.random.split(sample_rng)
+            noise = jax.random.normal(noise_rng, latents.shape)
+            # Sample a random timestep for each image
+            bsz = latents.shape[0]
+            timesteps = jax.random.randint(
+                timestep_rng,
+                (bsz,),
+                0,
+                noise_scheduler.config.num_train_timesteps,
+            )
+
+            # Add noise to the latents according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)
+
+            # Get the text embedding for conditioning
+            if args.train_text_encoder:
+                encoder_hidden_states = text_encoder_state.apply_fn(
+                    batch["input_ids"], params=params["text_encoder"], dropout_rng=dropout_rng, train=True
+                )[0]
+            else:
+                encoder_hidden_states = text_encoder(
+                    batch["input_ids"], params=text_encoder_state.params, train=False
+                )[0]
+
+            # Predict the noise residual
+            model_pred = unet.apply(
+                {"params": params["unet"]}, noisy_latents, timesteps, encoder_hidden_states, train=True
+            ).sample
+
+            # Get the target for loss depending on the prediction type
+            if noise_scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif noise_scheduler.config.prediction_type == "v_prediction":
+                target = noise_scheduler.get_velocity(noise_scheduler_state, latents, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+            if args.with_prior_preservation:
+                # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
+                model_pred, model_pred_prior = jnp.split(model_pred, 2, axis=0)
+                target, target_prior = jnp.split(target, 2, axis=0)
+
+                # Compute instance loss
+                loss = (target - model_pred) ** 2
+                loss = loss.mean()
+
+                # Compute prior loss
+                prior_loss = (target_prior - model_pred_prior) ** 2
+                prior_loss = prior_loss.mean()
+
+                # Add the prior loss to the instance loss.
+                loss = loss + args.prior_loss_weight * prior_loss
+            else:
+                loss = (target - model_pred) ** 2
+                loss = loss.mean()
+
+            return loss
+
+        grad_fn = jax.value_and_grad(compute_loss)
+        loss, grad = grad_fn(params)
+        grad = jax.lax.pmean(grad, "batch")
+
+        new_unet_state = unet_state.apply_gradients(grads=grad["unet"])
+        if args.train_text_encoder:
+            new_text_encoder_state = text_encoder_state.apply_gradients(grads=grad["text_encoder"])
+        else:
+            new_text_encoder_state = text_encoder_state
+
+        metrics = {"loss": loss}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+
+        return new_unet_state, new_text_encoder_state, metrics, new_train_rng
+
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0, 1))
+
+    # Replicate the train state on each device
+    unet_state = jax_utils.replicate(unet_state)
+    text_encoder_state = jax_utils.replicate(text_encoder_state)
+    vae_params = jax_utils.replicate(vae_params)
+
+    # Train!
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+
+    # Scheduler and math around the number of training steps.
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    def checkpoint(step=None):
+        # Create the pipeline using the trained modules and save it.
+        scheduler, _ = FlaxPNDMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+        safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained(
+            "CompVis/stable-diffusion-safety-checker", from_pt=True
+        )
+        pipeline = FlaxStableDiffusionPipeline(
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32"),
+        )
+
+        outdir = os.path.join(args.output_dir, str(step)) if step else args.output_dir
+        pipeline.save_pretrained(
+            outdir,
+            params={
+                "text_encoder": get_params_to_save(text_encoder_state.params),
+                "vae": get_params_to_save(vae_params),
+                "unet": get_params_to_save(unet_state.params),
+                "safety_checker": safety_checker.params,
+            },
+        )
+
+        if args.push_to_hub:
+            message = f"checkpoint-{step}" if step is not None else "End of training"
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message=message,
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    global_step = 0
+
+    epochs = tqdm(range(args.num_train_epochs), desc="Epoch ... ", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+
+        train_metrics = []
+
+        steps_per_epoch = len(train_dataset) // total_train_batch_size
+        train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
+        # train
+        for batch in train_dataloader:
+            batch = shard(batch)
+            unet_state, text_encoder_state, train_metric, train_rngs = p_train_step(
+                unet_state, text_encoder_state, vae_params, batch, train_rngs
+            )
+            train_metrics.append(train_metric)
+
+            train_step_progress_bar.update(jax.local_device_count())
+
+            global_step += 1
+            if jax.process_index() == 0 and args.save_steps and global_step % args.save_steps == 0:
+                checkpoint(global_step)
+            if global_step >= args.max_train_steps:
+                break
+
+        train_metric = jax_utils.unreplicate(train_metric)
+
+        train_step_progress_bar.close()
+        epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
+
+    if jax.process_index() == 0:
+        checkpoint()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/dreambooth/train_dreambooth_flux.py b/diffusers/examples/dreambooth/train_dreambooth_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e0f4e09a461b219af169b5b77b84121692b369a
--- /dev/null
+++ b/diffusers/examples/dreambooth/train_dreambooth_flux.py
@@ -0,0 +1,1791 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import gc
+import itertools
+import logging
+import math
+import os
+import random
+import shutil
+import warnings
+from contextlib import nullcontext
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer, PretrainedConfig, T5EncoderModel, T5TokenizerFast
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    FluxPipeline,
+    FluxTransformer2DModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3
+from diffusers.utils import (
+    check_min_version,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images=None,
+    base_model: str = None,
+    train_text_encoder=False,
+    instance_prompt=None,
+    validation_prompt=None,
+    repo_folder=None,
+):
+    widget_dict = []
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            widget_dict.append(
+                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
+            )
+
+    model_description = f"""
+# Flux [dev] DreamBooth - {repo_id}
+
+<Gallery />
+
+## Model description
+
+These are {repo_id} DreamBooth weights for {base_model}.
+
+The weights were trained using [DreamBooth](https://dreambooth.github.io/) with the [Flux diffusers trainer](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_flux.md).
+
+Was the text encoder fine-tuned? {train_text_encoder}.
+
+## Trigger words
+
+You should use `{instance_prompt}` to trigger the image generation.
+
+## Use it with the [🧨 diffusers library](https://github.com/huggingface/diffusers)
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+pipeline = AutoPipelineForText2Image.from_pretrained('{repo_id}', torch_dtype=torch.bfloat16).to('cuda')
+image = pipeline('{validation_prompt if validation_prompt else instance_prompt}').images[0]
+```
+
+## License
+
+Please adhere to the licensing terms as described [here](https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/LICENSE.md).
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="other",
+        base_model=base_model,
+        prompt=instance_prompt,
+        model_description=model_description,
+        widget=widget_dict,
+    )
+    tags = [
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+        "flux",
+        "flux-diffusers",
+        "template:sd-lora",
+    ]
+
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def load_text_encoders(class_one, class_two):
+    text_encoder_one = class_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = class_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    return text_encoder_one, text_encoder_two
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    torch_dtype,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    # autocast_ctx = torch.autocast(accelerator.device.type) if not is_final_validation else nullcontext()
+    autocast_ctx = nullcontext()
+
+    with autocast_ctx:
+        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help=("A folder containing the training data. "),
+    )
+
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing the target image. By "
+        "default, the standard Image Dataset maps out 'file_name' "
+        "to 'image'.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default=None,
+        help="The column of the dataset containing the instance prompt for each image",
+    )
+
+    parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
+
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=77,
+        help="Maximum sequence length to use with with the T5 text encoder",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="flux-dreambooth",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=3.5,
+        help="the FLUX.1 dev variant is a guidance distilled model",
+    )
+
+    parser.add_argument(
+        "--text_encoder_lr",
+        type=float,
+        default=5e-6,
+        help="Text encoder learning rate to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="none",
+        choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"],
+        help=('We default to the "none" weighting scheme for uniform sampling and uniform loss'),
+    )
+    parser.add_argument(
+        "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
+    )
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="AdamW",
+        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="coefficients for computing the Prodigy stepsize using running averages. If set to None, "
+        "uses the value of square root of beta2. Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
+    )
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+
+    parser.add_argument(
+        "--prodigy_use_bias_correction",
+        type=bool,
+        default=True,
+        help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW",
+    )
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        type=bool,
+        default=True,
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. "
+        "Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.instance_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`")
+
+    if args.dataset_name is not None and args.instance_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        class_prompt,
+        class_data_root=None,
+        class_num=None,
+        size=1024,
+        repeats=1,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.custom_instance_prompts = None
+        self.class_prompt = class_prompt
+
+        # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory,
+        # we load the training data using load_dataset
+        if args.dataset_name is not None:
+            try:
+                from datasets import load_dataset
+            except ImportError:
+                raise ImportError(
+                    "You are trying to load your data using the datasets library. If you wish to train using custom "
+                    "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                    "local folder containing images only, specify --instance_data_dir instead."
+                )
+            # Downloading and loading a dataset from the hub.
+            # See more about loading custom images at
+            # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+            dataset = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                cache_dir=args.cache_dir,
+            )
+            # Preprocessing the datasets.
+            column_names = dataset["train"].column_names
+
+            # 6. Get the column names for input/target.
+            if args.image_column is None:
+                image_column = column_names[0]
+                logger.info(f"image column defaulting to {image_column}")
+            else:
+                image_column = args.image_column
+                if image_column not in column_names:
+                    raise ValueError(
+                        f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+            instance_images = dataset["train"][image_column]
+
+            if args.caption_column is None:
+                logger.info(
+                    "No caption column provided, defaulting to instance_prompt for all images. If your dataset "
+                    "contains captions/prompts for the images, make sure to specify the "
+                    "column as --caption_column"
+                )
+                self.custom_instance_prompts = None
+            else:
+                if args.caption_column not in column_names:
+                    raise ValueError(
+                        f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+                custom_instance_prompts = dataset["train"][args.caption_column]
+                # create final list of captions according to --repeats
+                self.custom_instance_prompts = []
+                for caption in custom_instance_prompts:
+                    self.custom_instance_prompts.extend(itertools.repeat(caption, repeats))
+        else:
+            self.instance_data_root = Path(instance_data_root)
+            if not self.instance_data_root.exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+            self.custom_instance_prompts = None
+
+        self.instance_images = []
+        for img in instance_images:
+            self.instance_images.extend(itertools.repeat(img, repeats))
+
+        self.pixel_values = []
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
+        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
+        train_flip = transforms.RandomHorizontalFlip(p=1.0)
+        train_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+        for image in self.instance_images:
+            image = exif_transpose(image)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            image = train_transforms(image)
+            self.pixel_values.append(image)
+
+        self.num_instance_images = len(self.instance_images)
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = self.pixel_values[index % self.num_instance_images]
+        example["instance_images"] = instance_image
+
+        if self.custom_instance_prompts:
+            caption = self.custom_instance_prompts[index % self.num_instance_images]
+            if caption:
+                example["instance_prompt"] = caption
+            else:
+                example["instance_prompt"] = self.instance_prompt
+
+        else:  # custom prompts were provided, but length does not match size of image dataset
+            example["instance_prompt"] = self.instance_prompt
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt"] = self.class_prompt
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompts = [example["instance_prompt"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        pixel_values += [example["class_images"] for example in examples]
+        prompts += [example["class_prompt"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    batch = {"pixel_values": pixel_values, "prompts": prompts}
+    return batch
+
+
+class PromptDataset(Dataset):
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def tokenize_prompt(tokenizer, prompt, max_sequence_length):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=max_sequence_length,
+        truncation=True,
+        return_length=False,
+        return_overflowing_tokens=False,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+
+
+def _encode_prompt_with_t5(
+    text_encoder,
+    tokenizer,
+    max_sequence_length=512,
+    prompt=None,
+    num_images_per_prompt=1,
+    device=None,
+    text_input_ids=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    if tokenizer is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
+
+    prompt_embeds = text_encoder(text_input_ids.to(device))[0]
+
+    dtype = text_encoder.dtype
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+    _, seq_len, _ = prompt_embeds.shape
+
+    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+    return prompt_embeds
+
+
+def _encode_prompt_with_clip(
+    text_encoder,
+    tokenizer,
+    prompt: str,
+    device=None,
+    text_input_ids=None,
+    num_images_per_prompt: int = 1,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    if tokenizer is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
+
+    prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
+
+    # Use pooled output of CLIPTextModel
+    prompt_embeds = prompt_embeds.pooler_output
+    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+
+    # duplicate text embeddings for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+    return prompt_embeds
+
+
+def encode_prompt(
+    text_encoders,
+    tokenizers,
+    prompt: str,
+    max_sequence_length,
+    device=None,
+    num_images_per_prompt: int = 1,
+    text_input_ids_list=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+    dtype = text_encoders[0].dtype
+    device = device if device is not None else text_encoders[1].device
+    pooled_prompt_embeds = _encode_prompt_with_clip(
+        text_encoder=text_encoders[0],
+        tokenizer=tokenizers[0],
+        prompt=prompt,
+        device=device,
+        num_images_per_prompt=num_images_per_prompt,
+        text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
+    )
+
+    prompt_embeds = _encode_prompt_with_t5(
+        text_encoder=text_encoders[1],
+        tokenizer=tokenizers[1],
+        max_sequence_length=max_sequence_length,
+        prompt=prompt,
+        num_images_per_prompt=num_images_per_prompt,
+        device=device,
+        text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
+    )
+
+    text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+    text_ids = text_ids.repeat(num_images_per_prompt, 1, 1)
+
+    return prompt_embeds, pooled_prompt_embeds, text_ids
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
+            torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = FluxPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+    )
+    tokenizer_two = T5TokenizerFast.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="scheduler"
+    )
+    noise_scheduler_copy = copy.deepcopy(noise_scheduler)
+    text_encoder_one, text_encoder_two = load_text_encoders(text_encoder_cls_one, text_encoder_cls_two)
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    transformer = FluxTransformer2DModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant
+    )
+
+    transformer.requires_grad_(True)
+    vae.requires_grad_(False)
+    if args.train_text_encoder:
+        text_encoder_one.requires_grad_(True)
+        text_encoder_two.requires_grad_(False)
+    else:
+        text_encoder_one.requires_grad_(False)
+        text_encoder_two.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    vae.to(accelerator.device, dtype=weight_dtype)
+    if not args.train_text_encoder:
+        text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+        text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    if args.gradient_checkpointing:
+        transformer.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            for i, model in enumerate(models):
+                if isinstance(unwrap_model(model), FluxTransformer2DModel):
+                    unwrap_model(model).save_pretrained(os.path.join(output_dir, "transformer"))
+                elif isinstance(unwrap_model(model), (CLIPTextModelWithProjection, T5EncoderModel)):
+                    if isinstance(unwrap_model(model), CLIPTextModelWithProjection):
+                        unwrap_model(model).save_pretrained(os.path.join(output_dir, "text_encoder"))
+                    else:
+                        unwrap_model(model).save_pretrained(os.path.join(output_dir, "text_encoder_2"))
+                else:
+                    raise ValueError(f"Wrong model supplied: {type(model)=}.")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+    def load_model_hook(models, input_dir):
+        for _ in range(len(models)):
+            # pop models so that they are not loaded again
+            model = models.pop()
+
+            # load diffusers style into model
+            if isinstance(unwrap_model(model), FluxTransformer2DModel):
+                load_model = FluxTransformer2DModel.from_pretrained(input_dir, subfolder="transformer")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+            elif isinstance(unwrap_model(model), (CLIPTextModelWithProjection, T5EncoderModel)):
+                try:
+                    load_model = CLIPTextModelWithProjection.from_pretrained(input_dir, subfolder="text_encoder")
+                    model(**load_model.config)
+                    model.load_state_dict(load_model.state_dict())
+                except Exception:
+                    try:
+                        load_model = T5EncoderModel.from_pretrained(input_dir, subfolder="text_encoder_2")
+                        model(**load_model.config)
+                        model.load_state_dict(load_model.state_dict())
+                    except Exception:
+                        raise ValueError(f"Couldn't load the model of type: ({type(model)}).")
+            else:
+                raise ValueError(f"Unsupported model found: {type(model)=}")
+
+            del load_model
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Optimization parameters
+    transformer_parameters_with_lr = {"params": transformer.parameters(), "lr": args.learning_rate}
+    if args.train_text_encoder:
+        # different learning rate for text encoder and unet
+        text_parameters_one_with_lr = {
+            "params": text_encoder_one.parameters(),
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        params_to_optimize = [
+            transformer_parameters_with_lr,
+            text_parameters_one_with_lr,
+        ]
+    else:
+        params_to_optimize = [transformer_parameters_with_lr]
+
+    # Optimizer creation
+    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    if args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+        if args.train_text_encoder and args.text_encoder_lr:
+            logger.warning(
+                f"Learning rates were provided both for the transformer and the text encoder- e.g. text_encoder_lr:"
+                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
+                f"When using prodigy only learning_rate is used as the initial learning rate."
+            )
+            # changes the learning rate of text_encoder_parameters_one and text_encoder_parameters_two to be
+            # --learning_rate
+            params_to_optimize[1]["lr"] = args.learning_rate
+            params_to_optimize[2]["lr"] = args.learning_rate
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_prompt=args.class_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_num=args.num_class_images,
+        size=args.resolution,
+        repeats=args.repeats,
+        center_crop=args.center_crop,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    if not args.train_text_encoder:
+        tokenizers = [tokenizer_one, tokenizer_two]
+        text_encoders = [text_encoder_one, text_encoder_two]
+
+        def compute_text_embeddings(prompt, text_encoders, tokenizers):
+            with torch.no_grad():
+                prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
+                    text_encoders, tokenizers, prompt, args.max_sequence_length
+                )
+                prompt_embeds = prompt_embeds.to(accelerator.device)
+                pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
+                text_ids = text_ids.to(accelerator.device)
+            return prompt_embeds, pooled_prompt_embeds, text_ids
+
+    # If no type of tuning is done on the text_encoder and custom instance prompts are NOT
+    # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
+    # the redundant encoding.
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        instance_prompt_hidden_states, instance_pooled_prompt_embeds, instance_text_ids = compute_text_embeddings(
+            args.instance_prompt, text_encoders, tokenizers
+        )
+
+    # Handle class prompt for prior-preservation.
+    if args.with_prior_preservation:
+        if not args.train_text_encoder:
+            class_prompt_hidden_states, class_pooled_prompt_embeds, class_text_ids = compute_text_embeddings(
+                args.class_prompt, text_encoders, tokenizers
+            )
+
+    # Clear the memory here
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        del tokenizers, text_encoders
+        # Explicitly delete the objects as well, otherwise only the lists are deleted and the original references remain, preventing garbage collection
+        del text_encoder_one, text_encoder_two
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
+    # pack the statically computed variables appropriately here. This is so that we don't
+    # have to pass them to the dataloader.
+
+    if not train_dataset.custom_instance_prompts:
+        if not args.train_text_encoder:
+            prompt_embeds = instance_prompt_hidden_states
+            pooled_prompt_embeds = instance_pooled_prompt_embeds
+            text_ids = instance_text_ids
+            if args.with_prior_preservation:
+                prompt_embeds = torch.cat([prompt_embeds, class_prompt_hidden_states], dim=0)
+                pooled_prompt_embeds = torch.cat([pooled_prompt_embeds, class_pooled_prompt_embeds], dim=0)
+                text_ids = torch.cat([text_ids, class_text_ids], dim=0)
+        # if we're optimizing the text encoder (both if instance prompt is used for all images or custom prompts) we need to tokenize and encode the
+        # batch prompts on all training steps
+        else:
+            tokens_one = tokenize_prompt(tokenizer_one, args.instance_prompt, max_sequence_length=77)
+            tokens_two = tokenize_prompt(tokenizer_two, args.instance_prompt, max_sequence_length=512)
+            if args.with_prior_preservation:
+                class_tokens_one = tokenize_prompt(tokenizer_one, args.class_prompt, max_sequence_length=77)
+                class_tokens_two = tokenize_prompt(tokenizer_two, args.class_prompt, max_sequence_length=512)
+                tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0)
+                tokens_two = torch.cat([tokens_two, class_tokens_two], dim=0)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        (
+            transformer,
+            text_encoder_one,
+            optimizer,
+            train_dataloader,
+            lr_scheduler,
+        ) = accelerator.prepare(
+            transformer,
+            text_encoder_one,
+            optimizer,
+            train_dataloader,
+            lr_scheduler,
+        )
+    else:
+        transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            transformer, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = "dreambooth-flux-dev-lora"
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        transformer.train()
+        if args.train_text_encoder:
+            text_encoder_one.train()
+
+        for step, batch in enumerate(train_dataloader):
+            models_to_accumulate = [transformer]
+            if args.train_text_encoder:
+                models_to_accumulate.extend([text_encoder_one])
+            with accelerator.accumulate(models_to_accumulate):
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                prompts = batch["prompts"]
+
+                # encode batch prompts when custom prompts are provided for each image -
+                if train_dataset.custom_instance_prompts:
+                    if not args.train_text_encoder:
+                        prompt_embeds, pooled_prompt_embeds, text_ids = compute_text_embeddings(
+                            prompts, text_encoders, tokenizers
+                        )
+                    else:
+                        tokens_one = tokenize_prompt(tokenizer_one, prompts, max_sequence_length=77)
+                        tokens_two = tokenize_prompt(
+                            tokenizer_two, prompts, max_sequence_length=args.max_sequence_length
+                        )
+                        prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
+                            text_encoders=[text_encoder_one, text_encoder_two],
+                            tokenizers=[None, None],
+                            text_input_ids_list=[tokens_one, tokens_two],
+                            max_sequence_length=args.max_sequence_length,
+                            prompt=prompts,
+                        )
+                else:
+                    if args.train_text_encoder:
+                        prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
+                            text_encoders=[text_encoder_one, text_encoder_two],
+                            tokenizers=[None, None],
+                            text_input_ids_list=[tokens_one, tokens_two],
+                            max_sequence_length=args.max_sequence_length,
+                            prompt=args.instance_prompt,
+                        )
+
+                # Convert images to latent space
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+                model_input = (model_input - vae.config.shift_factor) * vae.config.scaling_factor
+                model_input = model_input.to(dtype=weight_dtype)
+
+                vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
+
+                latent_image_ids = FluxPipeline._prepare_latent_image_ids(
+                    model_input.shape[0],
+                    model_input.shape[2],
+                    model_input.shape[3],
+                    accelerator.device,
+                    weight_dtype,
+                )
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+
+                # Sample a random timestep for each image
+                # for weighting schemes where we sample timesteps non-uniformly
+                u = compute_density_for_timestep_sampling(
+                    weighting_scheme=args.weighting_scheme,
+                    batch_size=bsz,
+                    logit_mean=args.logit_mean,
+                    logit_std=args.logit_std,
+                    mode_scale=args.mode_scale,
+                )
+                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
+                timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+
+                # Add noise according to flow matching.
+                # zt = (1 - texp) * x + texp * z1
+                sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
+                noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
+
+                packed_noisy_model_input = FluxPipeline._pack_latents(
+                    noisy_model_input,
+                    batch_size=model_input.shape[0],
+                    num_channels_latents=model_input.shape[1],
+                    height=model_input.shape[2],
+                    width=model_input.shape[3],
+                )
+
+                # handle guidance
+                if transformer.config.guidance_embeds:
+                    guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
+                    guidance = guidance.expand(model_input.shape[0])
+                else:
+                    guidance = None
+
+                # Predict the noise residual
+                model_pred = transformer(
+                    hidden_states=packed_noisy_model_input,
+                    # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
+                    timestep=timesteps / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    return_dict=False,
+                )[0]
+                # upscaling height & width as discussed in https://github.com/huggingface/diffusers/pull/9257#discussion_r1731108042
+                model_pred = FluxPipeline._unpack_latents(
+                    model_pred,
+                    height=int(model_input.shape[2] * vae_scale_factor / 2),
+                    width=int(model_input.shape[3] * vae_scale_factor / 2),
+                    vae_scale_factor=vae_scale_factor,
+                )
+
+                # these weighting schemes use a uniform timestep sampling
+                # and instead post-weight the loss
+                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+
+                # flow matching loss
+                target = noise - model_input
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute prior loss
+                    prior_loss = torch.mean(
+                        (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
+                            target_prior.shape[0], -1
+                        ),
+                        1,
+                    )
+                    prior_loss = prior_loss.mean()
+
+                # Compute regular loss.
+                loss = torch.mean(
+                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
+                )
+                loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(transformer.parameters(), text_encoder_one.parameters())
+                        if args.train_text_encoder
+                        else transformer.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                if not args.train_text_encoder:
+                    text_encoder_one, text_encoder_two = load_text_encoders(text_encoder_cls_one, text_encoder_cls_two)
+                else:  # even when training the text encoder we're only training text encoder one
+                    text_encoder_two = text_encoder_cls_two.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        subfolder="text_encoder_2",
+                        revision=args.revision,
+                        variant=args.variant,
+                    )
+                pipeline = FluxPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    text_encoder=accelerator.unwrap_model(text_encoder_one),
+                    text_encoder_2=accelerator.unwrap_model(text_encoder_two),
+                    transformer=accelerator.unwrap_model(transformer),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline_args = {"prompt": args.validation_prompt}
+                images = log_validation(
+                    pipeline=pipeline,
+                    args=args,
+                    accelerator=accelerator,
+                    pipeline_args=pipeline_args,
+                    epoch=epoch,
+                    torch_dtype=weight_dtype,
+                )
+                if not args.train_text_encoder:
+                    del text_encoder_one, text_encoder_two
+                    torch.cuda.empty_cache()
+                    gc.collect()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        transformer = unwrap_model(transformer)
+
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            pipeline = FluxPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                transformer=transformer,
+                text_encoder=text_encoder_one,
+            )
+        else:
+            pipeline = FluxPipeline.from_pretrained(args.pretrained_model_name_or_path, transformer=transformer)
+
+        # save the pipeline
+        pipeline.save_pretrained(args.output_dir)
+
+        # Final inference
+        # Load previous pipeline
+        pipeline = FluxPipeline.from_pretrained(
+            args.output_dir,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline_args = {"prompt": args.validation_prompt}
+            images = log_validation(
+                pipeline=pipeline,
+                args=args,
+                accelerator=accelerator,
+                pipeline_args=pipeline_args,
+                epoch=epoch,
+                is_final_validation=True,
+                torch_dtype=weight_dtype,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                train_text_encoder=args.train_text_encoder,
+                instance_prompt=args.instance_prompt,
+                validation_prompt=args.validation_prompt,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/dreambooth/train_dreambooth_lora.py b/diffusers/examples/dreambooth/train_dreambooth_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d7d697bb21de4629b5376d600a2be0c9d7429a1
--- /dev/null
+++ b/diffusers/examples/dreambooth/train_dreambooth_lora.py
@@ -0,0 +1,1432 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import gc
+import logging
+import math
+import os
+import shutil
+import warnings
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from packaging import version
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params
+from diffusers.utils import (
+    check_min_version,
+    convert_state_dict_to_diffusers,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images=None,
+    base_model=str,
+    train_text_encoder=False,
+    prompt=str,
+    repo_folder=None,
+    pipeline: DiffusionPipeline = None,
+):
+    img_str = ""
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# LoRA DreamBooth - {repo_id}
+
+These are LoRA adaption weights for {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/). You can find some example images in the following. \n
+{img_str}
+
+LoRA for the text encoder was enabled: {train_text_encoder}.
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        prompt=prompt,
+        model_description=model_description,
+        inference=True,
+    )
+    tags = ["text-to-image", "diffusers", "lora", "diffusers-training"]
+    if isinstance(pipeline, StableDiffusionPipeline):
+        tags.extend(["stable-diffusion", "stable-diffusion-diffusers"])
+    else:
+        tags.extend(["if", "if-diffusers"])
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    torch_dtype,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+    scheduler_args = {}
+
+    if "variance_type" in pipeline.scheduler.config:
+        variance_type = pipeline.scheduler.config.variance_type
+
+        if variance_type in ["learned", "learned_range"]:
+            variance_type = "fixed_small"
+
+        scheduler_args["variance_type"] = variance_type
+
+    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
+    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+
+    if args.validation_images is None:
+        images = []
+        for _ in range(args.num_validation_images):
+            with torch.cuda.amp.autocast():
+                image = pipeline(**pipeline_args, generator=generator).images[0]
+                images.append(image)
+    else:
+        images = []
+        for image in args.validation_images:
+            image = Image.open(image)
+            with torch.cuda.amp.autocast():
+                image = pipeline(**pipeline_args, image=image, generator=generator).images[0]
+            images.append(image)
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+
+        return RobertaSeriesModelWithTransformation
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lora-dreambooth-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--pre_compute_text_embeddings",
+        action="store_true",
+        help="Whether or not to pre-compute text embeddings. If text embeddings are pre-computed, the text encoder will not be kept in memory during training and will leave more GPU memory available for training the rest of the model. This is not compatible with `--train_text_encoder`.",
+    )
+    parser.add_argument(
+        "--tokenizer_max_length",
+        type=int,
+        default=None,
+        required=False,
+        help="The maximum length of the tokenizer. If not set, will default to the tokenizer's max length.",
+    )
+    parser.add_argument(
+        "--text_encoder_use_attention_mask",
+        action="store_true",
+        required=False,
+        help="Whether to use attention mask for the text encoder",
+    )
+    parser.add_argument(
+        "--validation_images",
+        required=False,
+        default=None,
+        nargs="+",
+        help="Optional set of images to use for validation. Used when the target pipeline takes an initial image as input such as when training image variation or superresolution.",
+    )
+    parser.add_argument(
+        "--class_labels_conditioning",
+        required=False,
+        default=None,
+        help="The optional `class_label` conditioning to pass to the unet, available values are `timesteps`.",
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    if args.train_text_encoder and args.pre_compute_text_embeddings:
+        raise ValueError("`--train_text_encoder` cannot be used with `--pre_compute_text_embeddings`")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        class_num=None,
+        size=512,
+        center_crop=False,
+        encoder_hidden_states=None,
+        class_prompt_encoder_hidden_states=None,
+        tokenizer_max_length=None,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+        self.encoder_hidden_states = encoder_hidden_states
+        self.class_prompt_encoder_hidden_states = class_prompt_encoder_hidden_states
+        self.tokenizer_max_length = tokenizer_max_length
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        instance_image = exif_transpose(instance_image)
+
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        example["instance_images"] = self.image_transforms(instance_image)
+
+        if self.encoder_hidden_states is not None:
+            example["instance_prompt_ids"] = self.encoder_hidden_states
+        else:
+            text_inputs = tokenize_prompt(
+                self.tokenizer, self.instance_prompt, tokenizer_max_length=self.tokenizer_max_length
+            )
+            example["instance_prompt_ids"] = text_inputs.input_ids
+            example["instance_attention_mask"] = text_inputs.attention_mask
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+
+            if self.class_prompt_encoder_hidden_states is not None:
+                example["class_prompt_ids"] = self.class_prompt_encoder_hidden_states
+            else:
+                class_text_inputs = tokenize_prompt(
+                    self.tokenizer, self.class_prompt, tokenizer_max_length=self.tokenizer_max_length
+                )
+                example["class_prompt_ids"] = class_text_inputs.input_ids
+                example["class_attention_mask"] = class_text_inputs.attention_mask
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    has_attention_mask = "instance_attention_mask" in examples[0]
+
+    input_ids = [example["instance_prompt_ids"] for example in examples]
+    pixel_values = [example["instance_images"] for example in examples]
+
+    if has_attention_mask:
+        attention_mask = [example["instance_attention_mask"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        input_ids += [example["class_prompt_ids"] for example in examples]
+        pixel_values += [example["class_images"] for example in examples]
+        if has_attention_mask:
+            attention_mask += [example["class_attention_mask"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    input_ids = torch.cat(input_ids, dim=0)
+
+    batch = {
+        "input_ids": input_ids,
+        "pixel_values": pixel_values,
+    }
+
+    if has_attention_mask:
+        batch["attention_mask"] = attention_mask
+
+    return batch
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None):
+    if tokenizer_max_length is not None:
+        max_length = tokenizer_max_length
+    else:
+        max_length = tokenizer.model_max_length
+
+    text_inputs = tokenizer(
+        prompt,
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt",
+    )
+
+    return text_inputs
+
+
+def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_attention_mask=None):
+    text_input_ids = input_ids.to(text_encoder.device)
+
+    if text_encoder_use_attention_mask:
+        attention_mask = attention_mask.to(text_encoder.device)
+    else:
+        attention_mask = None
+
+    prompt_embeds = text_encoder(
+        text_input_ids,
+        attention_mask=attention_mask,
+        return_dict=False,
+    )
+    prompt_embeds = prompt_embeds[0]
+
+    return prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (sayakpaul): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                safety_checker=None,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+
+    # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    try:
+        vae = AutoencoderKL.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+        )
+    except OSError:
+        # IF does not have a VAE so let's just set it to None
+        # We don't have to error out here
+        vae = None
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # We only train the additional adapter LoRA layers
+    if vae is not None:
+        vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    if vae is not None:
+        vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
+
+    # now we will add new LoRA weights to the attention layers
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0", "add_k_proj", "add_v_proj"],
+    )
+    unet.add_adapter(unet_lora_config)
+
+    # The text encoder comes from 🤗 transformers, we will also attach adapters to it.
+    if args.train_text_encoder:
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder.add_adapter(text_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder atten layers
+            unet_lora_layers_to_save = None
+            text_encoder_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                elif isinstance(model, type(unwrap_model(text_encoder))):
+                    text_encoder_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionLoraLoaderMixin.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=text_encoder_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+        text_encoder_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(unet))):
+                unet_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder))):
+                text_encoder_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
+
+        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        if args.train_text_encoder:
+            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_)
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [unet_]
+            if args.train_text_encoder:
+                models.append(text_encoder_)
+
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params(models, dtype=torch.float32)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [unet]
+        if args.train_text_encoder:
+            models.append(text_encoder)
+
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(models, dtype=torch.float32)
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = list(filter(lambda p: p.requires_grad, unet.parameters()))
+    if args.train_text_encoder:
+        params_to_optimize = params_to_optimize + list(filter(lambda p: p.requires_grad, text_encoder.parameters()))
+
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    if args.pre_compute_text_embeddings:
+
+        def compute_text_embeddings(prompt):
+            with torch.no_grad():
+                text_inputs = tokenize_prompt(tokenizer, prompt, tokenizer_max_length=args.tokenizer_max_length)
+                prompt_embeds = encode_prompt(
+                    text_encoder,
+                    text_inputs.input_ids,
+                    text_inputs.attention_mask,
+                    text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+                )
+
+            return prompt_embeds
+
+        pre_computed_encoder_hidden_states = compute_text_embeddings(args.instance_prompt)
+        validation_prompt_negative_prompt_embeds = compute_text_embeddings("")
+
+        if args.validation_prompt is not None:
+            validation_prompt_encoder_hidden_states = compute_text_embeddings(args.validation_prompt)
+        else:
+            validation_prompt_encoder_hidden_states = None
+
+        if args.class_prompt is not None:
+            pre_computed_class_prompt_encoder_hidden_states = compute_text_embeddings(args.class_prompt)
+        else:
+            pre_computed_class_prompt_encoder_hidden_states = None
+
+        text_encoder = None
+        tokenizer = None
+
+        gc.collect()
+        torch.cuda.empty_cache()
+    else:
+        pre_computed_encoder_hidden_states = None
+        validation_prompt_encoder_hidden_states = None
+        validation_prompt_negative_prompt_embeds = None
+        pre_computed_class_prompt_encoder_hidden_states = None
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        class_num=args.num_class_images,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+        encoder_hidden_states=pre_computed_encoder_hidden_states,
+        class_prompt_encoder_hidden_states=pre_computed_class_prompt_encoder_hidden_states,
+        tokenizer_max_length=args.tokenizer_max_length,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = vars(copy.deepcopy(args))
+        tracker_config.pop("validation_images")
+        accelerator.init_trackers("dreambooth-lora", config=tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.train_text_encoder:
+            text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+
+                if vae is not None:
+                    # Convert images to latent space
+                    model_input = vae.encode(pixel_values).latent_dist.sample()
+                    model_input = model_input * vae.config.scaling_factor
+                else:
+                    model_input = pixel_values
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz, channels, height, width = model_input.shape
+                # Sample a random timestep for each image
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                )
+                timesteps = timesteps.long()
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                if args.pre_compute_text_embeddings:
+                    encoder_hidden_states = batch["input_ids"]
+                else:
+                    encoder_hidden_states = encode_prompt(
+                        text_encoder,
+                        batch["input_ids"],
+                        batch["attention_mask"],
+                        text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+                    )
+
+                if unwrap_model(unet).config.in_channels == channels * 2:
+                    noisy_model_input = torch.cat([noisy_model_input, noisy_model_input], dim=1)
+
+                if args.class_labels_conditioning == "timesteps":
+                    class_labels = timesteps
+                else:
+                    class_labels = None
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_model_input,
+                    timesteps,
+                    encoder_hidden_states,
+                    class_labels=class_labels,
+                    return_dict=False,
+                )[0]
+
+                # if model predicts variance, throw away the prediction. we will only train on the
+                # simplified training objective. This means that all schedulers using the fine tuned
+                # model must be configured to use one of the fixed variance variance types.
+                if model_pred.shape[1] == 6:
+                    model_pred, _ = torch.chunk(model_pred, 2, dim=1)
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute instance loss
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                    # Compute prior loss
+                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+                else:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                pipeline = DiffusionPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    unet=unwrap_model(unet),
+                    text_encoder=None if args.pre_compute_text_embeddings else unwrap_model(text_encoder),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+
+                if args.pre_compute_text_embeddings:
+                    pipeline_args = {
+                        "prompt_embeds": validation_prompt_encoder_hidden_states,
+                        "negative_prompt_embeds": validation_prompt_negative_prompt_embeds,
+                    }
+                else:
+                    pipeline_args = {"prompt": args.validation_prompt}
+
+                images = log_validation(
+                    pipeline,
+                    args,
+                    accelerator,
+                    pipeline_args,
+                    epoch,
+                    torch_dtype=weight_dtype,
+                )
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        unet = unet.to(torch.float32)
+
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        if args.train_text_encoder:
+            text_encoder = unwrap_model(text_encoder)
+            text_encoder_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(text_encoder))
+        else:
+            text_encoder_state_dict = None
+
+        StableDiffusionLoraLoaderMixin.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_state_dict,
+            text_encoder_lora_layers=text_encoder_state_dict,
+        )
+
+        # Final inference
+        # Load previous pipeline
+        pipeline = DiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path, revision=args.revision, variant=args.variant, torch_dtype=weight_dtype
+        )
+
+        # load attention processors
+        pipeline.load_lora_weights(args.output_dir, weight_name="pytorch_lora_weights.safetensors")
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline_args = {"prompt": args.validation_prompt, "num_inference_steps": 25}
+            images = log_validation(
+                pipeline,
+                args,
+                accelerator,
+                pipeline_args,
+                epoch,
+                is_final_validation=True,
+                torch_dtype=weight_dtype,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                train_text_encoder=args.train_text_encoder,
+                prompt=args.instance_prompt,
+                repo_folder=args.output_dir,
+                pipeline=pipeline,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/dreambooth/train_dreambooth_lora_flux.py b/diffusers/examples/dreambooth/train_dreambooth_lora_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..6091622719eedf6b477dc8f73bdf0fccde9422d7
--- /dev/null
+++ b/diffusers/examples/dreambooth/train_dreambooth_lora_flux.py
@@ -0,0 +1,1891 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import itertools
+import logging
+import math
+import os
+import random
+import shutil
+import warnings
+from contextlib import nullcontext
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import CLIPTokenizer, PretrainedConfig, T5TokenizerFast
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    FluxPipeline,
+    FluxTransformer2DModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import (
+    _set_state_dict_into_text_encoder,
+    cast_training_params,
+    clear_objs_and_retain_memory,
+    compute_density_for_timestep_sampling,
+    compute_loss_weighting_for_sd3,
+)
+from diffusers.utils import (
+    check_min_version,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images=None,
+    base_model: str = None,
+    train_text_encoder=False,
+    instance_prompt=None,
+    validation_prompt=None,
+    repo_folder=None,
+):
+    widget_dict = []
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            widget_dict.append(
+                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
+            )
+
+    model_description = f"""
+# Flux DreamBooth LoRA - {repo_id}
+
+<Gallery />
+
+## Model description
+
+These are {repo_id} DreamBooth LoRA weights for {base_model}.
+
+The weights were trained using [DreamBooth](https://dreambooth.github.io/) with the [Flux diffusers trainer](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_flux.md).
+
+Was LoRA for the text encoder enabled? {train_text_encoder}.
+
+## Trigger words
+
+You should use `{instance_prompt}` to trigger the image generation.
+
+## Download model
+
+[Download the *.safetensors LoRA]({repo_id}/tree/main) in the Files & versions tab.
+
+## Use it with the [🧨 diffusers library](https://github.com/huggingface/diffusers)
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+pipeline = AutoPipelineForText2Image.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to('cuda')
+pipeline.load_lora_weights('{repo_id}', weight_name='pytorch_lora_weights.safetensors')
+image = pipeline('{validation_prompt if validation_prompt else instance_prompt}').images[0]
+```
+
+For more details, including weighting, merging and fusing LoRAs, check the [documentation on loading LoRAs in diffusers](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading_adapters)
+
+## License
+
+Please adhere to the licensing terms as described [here](https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/LICENSE.md).
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="other",
+        base_model=base_model,
+        prompt=instance_prompt,
+        model_description=model_description,
+        widget=widget_dict,
+    )
+    tags = [
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+        "lora",
+        "flux",
+        "flux-diffusers",
+        "template:sd-lora",
+    ]
+
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def load_text_encoders(class_one, class_two):
+    text_encoder_one = class_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = class_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    return text_encoder_one, text_encoder_two
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    torch_dtype,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    # autocast_ctx = torch.autocast(accelerator.device.type) if not is_final_validation else nullcontext()
+    autocast_ctx = nullcontext()
+
+    with autocast_ctx:
+        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help=("A folder containing the training data. "),
+    )
+
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing the target image. By "
+        "default, the standard Image Dataset maps out 'file_name' "
+        "to 'image'.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default=None,
+        help="The column of the dataset containing the instance prompt for each image",
+    )
+
+    parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
+
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=512,
+        help="Maximum sequence length to use with with the T5 text encoder",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="flux-dreambooth-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=3.5,
+        help="the FLUX.1 dev variant is a guidance distilled model",
+    )
+
+    parser.add_argument(
+        "--text_encoder_lr",
+        type=float,
+        default=5e-6,
+        help="Text encoder learning rate to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="none",
+        choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"],
+        help=('We default to the "none" weighting scheme for uniform sampling and uniform loss'),
+    )
+    parser.add_argument(
+        "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
+    )
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="AdamW",
+        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="coefficients for computing the Prodigy stepsize using running averages. If set to None, "
+        "uses the value of square root of beta2. Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
+    )
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+
+    parser.add_argument(
+        "--prodigy_use_bias_correction",
+        type=bool,
+        default=True,
+        help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW",
+    )
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        type=bool,
+        default=True,
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. "
+        "Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--cache_latents",
+        action="store_true",
+        default=False,
+        help="Cache the VAE latents",
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--upcast_before_saving",
+        action="store_true",
+        default=False,
+        help=(
+            "Whether to upcast the trained transformer layers to float32 before saving (at the end of training). "
+            "Defaults to precision dtype used for training to save memory"
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.instance_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`")
+
+    if args.dataset_name is not None and args.instance_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        class_prompt,
+        class_data_root=None,
+        class_num=None,
+        size=1024,
+        repeats=1,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.custom_instance_prompts = None
+        self.class_prompt = class_prompt
+
+        # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory,
+        # we load the training data using load_dataset
+        if args.dataset_name is not None:
+            try:
+                from datasets import load_dataset
+            except ImportError:
+                raise ImportError(
+                    "You are trying to load your data using the datasets library. If you wish to train using custom "
+                    "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                    "local folder containing images only, specify --instance_data_dir instead."
+                )
+            # Downloading and loading a dataset from the hub.
+            # See more about loading custom images at
+            # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+            dataset = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                cache_dir=args.cache_dir,
+            )
+            # Preprocessing the datasets.
+            column_names = dataset["train"].column_names
+
+            # 6. Get the column names for input/target.
+            if args.image_column is None:
+                image_column = column_names[0]
+                logger.info(f"image column defaulting to {image_column}")
+            else:
+                image_column = args.image_column
+                if image_column not in column_names:
+                    raise ValueError(
+                        f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+            instance_images = dataset["train"][image_column]
+
+            if args.caption_column is None:
+                logger.info(
+                    "No caption column provided, defaulting to instance_prompt for all images. If your dataset "
+                    "contains captions/prompts for the images, make sure to specify the "
+                    "column as --caption_column"
+                )
+                self.custom_instance_prompts = None
+            else:
+                if args.caption_column not in column_names:
+                    raise ValueError(
+                        f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+                custom_instance_prompts = dataset["train"][args.caption_column]
+                # create final list of captions according to --repeats
+                self.custom_instance_prompts = []
+                for caption in custom_instance_prompts:
+                    self.custom_instance_prompts.extend(itertools.repeat(caption, repeats))
+        else:
+            self.instance_data_root = Path(instance_data_root)
+            if not self.instance_data_root.exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+            self.custom_instance_prompts = None
+
+        self.instance_images = []
+        for img in instance_images:
+            self.instance_images.extend(itertools.repeat(img, repeats))
+
+        self.pixel_values = []
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
+        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
+        train_flip = transforms.RandomHorizontalFlip(p=1.0)
+        train_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+        for image in self.instance_images:
+            image = exif_transpose(image)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            image = train_transforms(image)
+            self.pixel_values.append(image)
+
+        self.num_instance_images = len(self.instance_images)
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = self.pixel_values[index % self.num_instance_images]
+        example["instance_images"] = instance_image
+
+        if self.custom_instance_prompts:
+            caption = self.custom_instance_prompts[index % self.num_instance_images]
+            if caption:
+                example["instance_prompt"] = caption
+            else:
+                example["instance_prompt"] = self.instance_prompt
+
+        else:  # custom prompts were provided, but length does not match size of image dataset
+            example["instance_prompt"] = self.instance_prompt
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt"] = self.class_prompt
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompts = [example["instance_prompt"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        pixel_values += [example["class_images"] for example in examples]
+        prompts += [example["class_prompt"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    batch = {"pixel_values": pixel_values, "prompts": prompts}
+    return batch
+
+
+class PromptDataset(Dataset):
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def tokenize_prompt(tokenizer, prompt, max_sequence_length):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=max_sequence_length,
+        truncation=True,
+        return_length=False,
+        return_overflowing_tokens=False,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+
+
+def _encode_prompt_with_t5(
+    text_encoder,
+    tokenizer,
+    max_sequence_length=512,
+    prompt=None,
+    num_images_per_prompt=1,
+    device=None,
+    text_input_ids=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    if tokenizer is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
+
+    prompt_embeds = text_encoder(text_input_ids.to(device))[0]
+
+    dtype = text_encoder.dtype
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+    _, seq_len, _ = prompt_embeds.shape
+
+    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+    return prompt_embeds
+
+
+def _encode_prompt_with_clip(
+    text_encoder,
+    tokenizer,
+    prompt: str,
+    device=None,
+    text_input_ids=None,
+    num_images_per_prompt: int = 1,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    if tokenizer is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
+
+    prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
+
+    # Use pooled output of CLIPTextModel
+    prompt_embeds = prompt_embeds.pooler_output
+    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+
+    # duplicate text embeddings for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+    return prompt_embeds
+
+
+def encode_prompt(
+    text_encoders,
+    tokenizers,
+    prompt: str,
+    max_sequence_length,
+    device=None,
+    num_images_per_prompt: int = 1,
+    text_input_ids_list=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+    dtype = text_encoders[0].dtype
+
+    pooled_prompt_embeds = _encode_prompt_with_clip(
+        text_encoder=text_encoders[0],
+        tokenizer=tokenizers[0],
+        prompt=prompt,
+        device=device if device is not None else text_encoders[0].device,
+        num_images_per_prompt=num_images_per_prompt,
+        text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
+    )
+
+    prompt_embeds = _encode_prompt_with_t5(
+        text_encoder=text_encoders[1],
+        tokenizer=tokenizers[1],
+        max_sequence_length=max_sequence_length,
+        prompt=prompt,
+        num_images_per_prompt=num_images_per_prompt,
+        device=device if device is not None else text_encoders[1].device,
+        text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
+    )
+
+    text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+    text_ids = text_ids.repeat(num_images_per_prompt, 1, 1)
+
+    return prompt_embeds, pooled_prompt_embeds, text_ids
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
+            torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = FluxPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+    )
+    tokenizer_two = T5TokenizerFast.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="scheduler"
+    )
+    noise_scheduler_copy = copy.deepcopy(noise_scheduler)
+    text_encoder_one, text_encoder_two = load_text_encoders(text_encoder_cls_one, text_encoder_cls_two)
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    transformer = FluxTransformer2DModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant
+    )
+
+    # We only train the additional adapter LoRA layers
+    transformer.requires_grad_(False)
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    vae.to(accelerator.device, dtype=weight_dtype)
+    transformer.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    if args.gradient_checkpointing:
+        transformer.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+
+    # now we will add new LoRA weights to the attention layers
+    transformer_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    transformer.add_adapter(transformer_lora_config)
+    if args.train_text_encoder:
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder_one.add_adapter(text_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            transformer_lora_layers_to_save = None
+            text_encoder_one_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(unwrap_model(transformer))):
+                    transformer_lora_layers_to_save = get_peft_model_state_dict(model)
+                elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                    text_encoder_one_lora_layers_to_save = get_peft_model_state_dict(model)
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            FluxPipeline.save_lora_weights(
+                output_dir,
+                transformer_lora_layers=transformer_lora_layers_to_save,
+                text_encoder_lora_layers=text_encoder_one_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        transformer_ = None
+        text_encoder_one_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(transformer))):
+                transformer_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                text_encoder_one_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict = FluxPipeline.lora_state_dict(input_dir)
+
+        transformer_state_dict = {
+            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.")
+        }
+        transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
+        incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+        if args.train_text_encoder:
+            # Do we need to call `scale_lora_layers()` here?
+            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_)
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [transformer_]
+            if args.train_text_encoder:
+                models.extend([text_encoder_one_])
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params(models)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [transformer]
+        if args.train_text_encoder:
+            models.extend([text_encoder_one])
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(models, dtype=torch.float32)
+
+    transformer_lora_parameters = list(filter(lambda p: p.requires_grad, transformer.parameters()))
+    if args.train_text_encoder:
+        text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
+
+    # Optimization parameters
+    transformer_parameters_with_lr = {"params": transformer_lora_parameters, "lr": args.learning_rate}
+    if args.train_text_encoder:
+        # different learning rate for text encoder and unet
+        text_parameters_one_with_lr = {
+            "params": text_lora_parameters_one,
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        params_to_optimize = [
+            transformer_parameters_with_lr,
+            text_parameters_one_with_lr,
+        ]
+    else:
+        params_to_optimize = [transformer_parameters_with_lr]
+
+    # Optimizer creation
+    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    if args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+        if args.train_text_encoder and args.text_encoder_lr:
+            logger.warning(
+                f"Learning rates were provided both for the transformer and the text encoder- e.g. text_encoder_lr:"
+                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
+                f"When using prodigy only learning_rate is used as the initial learning rate."
+            )
+            # changes the learning rate of text_encoder_parameters_one and text_encoder_parameters_two to be
+            # --learning_rate
+            params_to_optimize[1]["lr"] = args.learning_rate
+            params_to_optimize[2]["lr"] = args.learning_rate
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_prompt=args.class_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_num=args.num_class_images,
+        size=args.resolution,
+        repeats=args.repeats,
+        center_crop=args.center_crop,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    if not args.train_text_encoder:
+        tokenizers = [tokenizer_one, tokenizer_two]
+        text_encoders = [text_encoder_one, text_encoder_two]
+
+        def compute_text_embeddings(prompt, text_encoders, tokenizers):
+            with torch.no_grad():
+                prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
+                    text_encoders, tokenizers, prompt, args.max_sequence_length
+                )
+                prompt_embeds = prompt_embeds.to(accelerator.device)
+                pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
+                text_ids = text_ids.to(accelerator.device)
+            return prompt_embeds, pooled_prompt_embeds, text_ids
+
+    # If no type of tuning is done on the text_encoder and custom instance prompts are NOT
+    # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
+    # the redundant encoding.
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        instance_prompt_hidden_states, instance_pooled_prompt_embeds, instance_text_ids = compute_text_embeddings(
+            args.instance_prompt, text_encoders, tokenizers
+        )
+
+    # Handle class prompt for prior-preservation.
+    if args.with_prior_preservation:
+        if not args.train_text_encoder:
+            class_prompt_hidden_states, class_pooled_prompt_embeds, class_text_ids = compute_text_embeddings(
+                args.class_prompt, text_encoders, tokenizers
+            )
+
+    # Clear the memory here
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        clear_objs_and_retain_memory([tokenizers, text_encoders, text_encoder_one, text_encoder_two])
+
+    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
+    # pack the statically computed variables appropriately here. This is so that we don't
+    # have to pass them to the dataloader.
+
+    if not train_dataset.custom_instance_prompts:
+        if not args.train_text_encoder:
+            prompt_embeds = instance_prompt_hidden_states
+            pooled_prompt_embeds = instance_pooled_prompt_embeds
+            text_ids = instance_text_ids
+            if args.with_prior_preservation:
+                prompt_embeds = torch.cat([prompt_embeds, class_prompt_hidden_states], dim=0)
+                pooled_prompt_embeds = torch.cat([pooled_prompt_embeds, class_pooled_prompt_embeds], dim=0)
+                text_ids = torch.cat([text_ids, class_text_ids], dim=0)
+        # if we're optimizing the text encoder (both if instance prompt is used for all images or custom prompts)
+        # we need to tokenize and encode the batch prompts on all training steps
+        else:
+            tokens_one = tokenize_prompt(tokenizer_one, args.instance_prompt, max_sequence_length=77)
+            tokens_two = tokenize_prompt(
+                tokenizer_two, args.instance_prompt, max_sequence_length=args.max_sequence_length
+            )
+            if args.with_prior_preservation:
+                class_tokens_one = tokenize_prompt(tokenizer_one, args.class_prompt, max_sequence_length=77)
+                class_tokens_two = tokenize_prompt(
+                    tokenizer_two, args.class_prompt, max_sequence_length=args.max_sequence_length
+                )
+                tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0)
+                tokens_two = torch.cat([tokens_two, class_tokens_two], dim=0)
+
+    vae_config_shift_factor = vae.config.shift_factor
+    vae_config_scaling_factor = vae.config.scaling_factor
+    vae_config_block_out_channels = vae.config.block_out_channels
+    if args.cache_latents:
+        latents_cache = []
+        for batch in tqdm(train_dataloader, desc="Caching latents"):
+            with torch.no_grad():
+                batch["pixel_values"] = batch["pixel_values"].to(
+                    accelerator.device, non_blocking=True, dtype=weight_dtype
+                )
+                latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
+
+        if args.validation_prompt is None:
+            clear_objs_and_retain_memory([vae])
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        (
+            transformer,
+            text_encoder_one,
+            optimizer,
+            train_dataloader,
+            lr_scheduler,
+        ) = accelerator.prepare(
+            transformer,
+            text_encoder_one,
+            optimizer,
+            train_dataloader,
+            lr_scheduler,
+        )
+    else:
+        transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            transformer, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = "dreambooth-flux-dev-lora"
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        transformer.train()
+        if args.train_text_encoder:
+            text_encoder_one.train()
+            # set top parameter requires_grad = True for gradient checkpointing works
+            accelerator.unwrap_model(text_encoder_one).text_model.embeddings.requires_grad_(True)
+
+        for step, batch in enumerate(train_dataloader):
+            models_to_accumulate = [transformer]
+            if args.train_text_encoder:
+                models_to_accumulate.extend([text_encoder_one])
+            with accelerator.accumulate(models_to_accumulate):
+                prompts = batch["prompts"]
+
+                # encode batch prompts when custom prompts are provided for each image -
+                if train_dataset.custom_instance_prompts:
+                    if not args.train_text_encoder:
+                        prompt_embeds, pooled_prompt_embeds, text_ids = compute_text_embeddings(
+                            prompts, text_encoders, tokenizers
+                        )
+                    else:
+                        tokens_one = tokenize_prompt(tokenizer_one, prompts, max_sequence_length=77)
+                        tokens_two = tokenize_prompt(
+                            tokenizer_two, prompts, max_sequence_length=args.max_sequence_length
+                        )
+                        prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
+                            text_encoders=[text_encoder_one, text_encoder_two],
+                            tokenizers=[None, None],
+                            text_input_ids_list=[tokens_one, tokens_two],
+                            max_sequence_length=args.max_sequence_length,
+                            device=accelerator.device,
+                            prompt=prompts,
+                        )
+                else:
+                    if args.train_text_encoder:
+                        prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
+                            text_encoders=[text_encoder_one, text_encoder_two],
+                            tokenizers=[None, None],
+                            text_input_ids_list=[tokens_one, tokens_two],
+                            max_sequence_length=args.max_sequence_length,
+                            device=accelerator.device,
+                            prompt=args.instance_prompt,
+                        )
+
+                # Convert images to latent space
+                if args.cache_latents:
+                    model_input = latents_cache[step].sample()
+                else:
+                    pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                    model_input = vae.encode(pixel_values).latent_dist.sample()
+                model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
+                model_input = model_input.to(dtype=weight_dtype)
+
+                vae_scale_factor = 2 ** (len(vae_config_block_out_channels))
+
+                latent_image_ids = FluxPipeline._prepare_latent_image_ids(
+                    model_input.shape[0],
+                    model_input.shape[2],
+                    model_input.shape[3],
+                    accelerator.device,
+                    weight_dtype,
+                )
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+
+                # Sample a random timestep for each image
+                # for weighting schemes where we sample timesteps non-uniformly
+                u = compute_density_for_timestep_sampling(
+                    weighting_scheme=args.weighting_scheme,
+                    batch_size=bsz,
+                    logit_mean=args.logit_mean,
+                    logit_std=args.logit_std,
+                    mode_scale=args.mode_scale,
+                )
+                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
+                timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+
+                # Add noise according to flow matching.
+                # zt = (1 - texp) * x + texp * z1
+                sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
+                noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
+
+                packed_noisy_model_input = FluxPipeline._pack_latents(
+                    noisy_model_input,
+                    batch_size=model_input.shape[0],
+                    num_channels_latents=model_input.shape[1],
+                    height=model_input.shape[2],
+                    width=model_input.shape[3],
+                )
+
+                # handle guidance
+                if transformer.config.guidance_embeds:
+                    guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
+                    guidance = guidance.expand(model_input.shape[0])
+                else:
+                    guidance = None
+
+                # Predict the noise residual
+                model_pred = transformer(
+                    hidden_states=packed_noisy_model_input,
+                    # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
+                    timestep=timesteps / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    return_dict=False,
+                )[0]
+                model_pred = FluxPipeline._unpack_latents(
+                    model_pred,
+                    height=int(model_input.shape[2] * vae_scale_factor / 2),
+                    width=int(model_input.shape[3] * vae_scale_factor / 2),
+                    vae_scale_factor=vae_scale_factor,
+                )
+
+                # these weighting schemes use a uniform timestep sampling
+                # and instead post-weight the loss
+                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+
+                # flow matching loss
+                target = noise - model_input
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute prior loss
+                    prior_loss = torch.mean(
+                        (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
+                            target_prior.shape[0], -1
+                        ),
+                        1,
+                    )
+                    prior_loss = prior_loss.mean()
+
+                # Compute regular loss.
+                loss = torch.mean(
+                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
+                )
+                loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(transformer.parameters(), text_encoder_one.parameters())
+                        if args.train_text_encoder
+                        else transformer.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                if not args.train_text_encoder:
+                    text_encoder_one, text_encoder_two = load_text_encoders(text_encoder_cls_one, text_encoder_cls_two)
+                pipeline = FluxPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    text_encoder=accelerator.unwrap_model(text_encoder_one),
+                    text_encoder_2=accelerator.unwrap_model(text_encoder_two),
+                    transformer=accelerator.unwrap_model(transformer),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline_args = {"prompt": args.validation_prompt}
+                images = log_validation(
+                    pipeline=pipeline,
+                    args=args,
+                    accelerator=accelerator,
+                    pipeline_args=pipeline_args,
+                    epoch=epoch,
+                    torch_dtype=weight_dtype,
+                )
+                if not args.train_text_encoder:
+                    clear_objs_and_retain_memory([text_encoder_one, text_encoder_two])
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        transformer = unwrap_model(transformer)
+        if args.upcast_before_saving:
+            transformer.to(torch.float32)
+        else:
+            transformer = transformer.to(weight_dtype)
+        transformer_lora_layers = get_peft_model_state_dict(transformer)
+
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            text_encoder_lora_layers = get_peft_model_state_dict(text_encoder_one.to(torch.float32))
+        else:
+            text_encoder_lora_layers = None
+
+        FluxPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            transformer_lora_layers=transformer_lora_layers,
+            text_encoder_lora_layers=text_encoder_lora_layers,
+        )
+
+        # Final inference
+        # Load previous pipeline
+        pipeline = FluxPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        # load attention processors
+        pipeline.load_lora_weights(args.output_dir)
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline_args = {"prompt": args.validation_prompt}
+            images = log_validation(
+                pipeline=pipeline,
+                args=args,
+                accelerator=accelerator,
+                pipeline_args=pipeline_args,
+                epoch=epoch,
+                is_final_validation=True,
+                torch_dtype=weight_dtype,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                train_text_encoder=args.train_text_encoder,
+                instance_prompt=args.instance_prompt,
+                validation_prompt=args.validation_prompt,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/dreambooth/train_dreambooth_lora_sd3.py b/diffusers/examples/dreambooth/train_dreambooth_lora_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..3060813bbbdcbed164c1de1e9fd433463e30d2fe
--- /dev/null
+++ b/diffusers/examples/dreambooth/train_dreambooth_lora_sd3.py
@@ -0,0 +1,1870 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import itertools
+import logging
+import math
+import os
+import random
+import shutil
+import warnings
+from contextlib import nullcontext
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import CLIPTokenizer, PretrainedConfig, T5TokenizerFast
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+    StableDiffusion3Pipeline,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import (
+    _set_state_dict_into_text_encoder,
+    cast_training_params,
+    clear_objs_and_retain_memory,
+    compute_density_for_timestep_sampling,
+    compute_loss_weighting_for_sd3,
+)
+from diffusers.utils import (
+    check_min_version,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images=None,
+    base_model: str = None,
+    train_text_encoder=False,
+    instance_prompt=None,
+    validation_prompt=None,
+    repo_folder=None,
+):
+    widget_dict = []
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            widget_dict.append(
+                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
+            )
+
+    model_description = f"""
+# SD3 DreamBooth LoRA - {repo_id}
+
+<Gallery />
+
+## Model description
+
+These are {repo_id} DreamBooth LoRA weights for {base_model}.
+
+The weights were trained using [DreamBooth](https://dreambooth.github.io/) with the [SD3 diffusers trainer](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_sd3.md).
+
+Was LoRA for the text encoder enabled? {train_text_encoder}.
+
+## Trigger words
+
+You should use `{instance_prompt}` to trigger the image generation.
+
+## Download model
+
+[Download the *.safetensors LoRA]({repo_id}/tree/main) in the Files & versions tab.
+
+## Use it with the [🧨 diffusers library](https://github.com/huggingface/diffusers)
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+pipeline = AutoPipelineForText2Image.from_pretrained('stabilityai/stable-diffusion-3-medium-diffusers', torch_dtype=torch.float16).to('cuda')
+pipeline.load_lora_weights('{repo_id}', weight_name='pytorch_lora_weights.safetensors')
+image = pipeline('{validation_prompt if validation_prompt else instance_prompt}').images[0]
+```
+
+### Use it with UIs such as AUTOMATIC1111, Comfy UI, SD.Next, Invoke
+
+- **LoRA**: download **[`diffusers_lora_weights.safetensors` here 💾](/{repo_id}/blob/main/diffusers_lora_weights.safetensors)**.
+    - Rename it and place it on your `models/Lora` folder.
+    - On AUTOMATIC1111, load the LoRA by adding `<lora:your_new_name:1>` to your prompt. On ComfyUI just [load it as a regular LoRA](https://comfyanonymous.github.io/ComfyUI_examples/lora/).
+
+For more details, including weighting, merging and fusing LoRAs, check the [documentation on loading LoRAs in diffusers](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading_adapters)
+
+## License
+
+Please adhere to the licensing terms as described [here](https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE).
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="openrail++",
+        base_model=base_model,
+        prompt=instance_prompt,
+        model_description=model_description,
+        widget=widget_dict,
+    )
+    tags = [
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+        "lora",
+        "sd3",
+        "sd3-diffusers",
+        "template:sd-lora",
+    ]
+
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def load_text_encoders(class_one, class_two, class_three):
+    text_encoder_one = class_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = class_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    text_encoder_three = class_three.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_3", revision=args.revision, variant=args.variant
+    )
+    return text_encoder_one, text_encoder_two, text_encoder_three
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    torch_dtype,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    # autocast_ctx = torch.autocast(accelerator.device.type) if not is_final_validation else nullcontext()
+    autocast_ctx = nullcontext()
+
+    with autocast_ctx:
+        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    clear_objs_and_retain_memory(objs=[pipeline])
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+    if model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help=("A folder containing the training data. "),
+    )
+
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing the target image. By "
+        "default, the standard Image Dataset maps out 'file_name' "
+        "to 'image'.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default=None,
+        help="The column of the dataset containing the instance prompt for each image",
+    )
+
+    parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
+
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=77,
+        help="Maximum sequence length to use with with the T5 text encoder",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd3-dreambooth",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder (clip text encoders only). If set, the text encoder should be float32 precision.",
+    )
+
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+
+    parser.add_argument(
+        "--text_encoder_lr",
+        type=float,
+        default=5e-6,
+        help="Text encoder learning rate to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="logit_normal",
+        choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"],
+    )
+    parser.add_argument(
+        "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
+    )
+    parser.add_argument(
+        "--precondition_outputs",
+        type=int,
+        default=1,
+        help="Flag indicating if we are preconditioning the model outputs or not as done in EDM. This affects how "
+        "model `target` is calculated.",
+    )
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="AdamW",
+        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="coefficients for computing the Prodigy stepsize using running averages. If set to None, "
+        "uses the value of square root of beta2. Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
+    )
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+
+    parser.add_argument(
+        "--prodigy_use_bias_correction",
+        type=bool,
+        default=True,
+        help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW",
+    )
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        type=bool,
+        default=True,
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. "
+        "Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.instance_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`")
+
+    if args.dataset_name is not None and args.instance_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        class_prompt,
+        class_data_root=None,
+        class_num=None,
+        size=1024,
+        repeats=1,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.custom_instance_prompts = None
+        self.class_prompt = class_prompt
+
+        # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory,
+        # we load the training data using load_dataset
+        if args.dataset_name is not None:
+            try:
+                from datasets import load_dataset
+            except ImportError:
+                raise ImportError(
+                    "You are trying to load your data using the datasets library. If you wish to train using custom "
+                    "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                    "local folder containing images only, specify --instance_data_dir instead."
+                )
+            # Downloading and loading a dataset from the hub.
+            # See more about loading custom images at
+            # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+            dataset = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                cache_dir=args.cache_dir,
+            )
+            # Preprocessing the datasets.
+            column_names = dataset["train"].column_names
+
+            # 6. Get the column names for input/target.
+            if args.image_column is None:
+                image_column = column_names[0]
+                logger.info(f"image column defaulting to {image_column}")
+            else:
+                image_column = args.image_column
+                if image_column not in column_names:
+                    raise ValueError(
+                        f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+            instance_images = dataset["train"][image_column]
+
+            if args.caption_column is None:
+                logger.info(
+                    "No caption column provided, defaulting to instance_prompt for all images. If your dataset "
+                    "contains captions/prompts for the images, make sure to specify the "
+                    "column as --caption_column"
+                )
+                self.custom_instance_prompts = None
+            else:
+                if args.caption_column not in column_names:
+                    raise ValueError(
+                        f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+                custom_instance_prompts = dataset["train"][args.caption_column]
+                # create final list of captions according to --repeats
+                self.custom_instance_prompts = []
+                for caption in custom_instance_prompts:
+                    self.custom_instance_prompts.extend(itertools.repeat(caption, repeats))
+        else:
+            self.instance_data_root = Path(instance_data_root)
+            if not self.instance_data_root.exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+            self.custom_instance_prompts = None
+
+        self.instance_images = []
+        for img in instance_images:
+            self.instance_images.extend(itertools.repeat(img, repeats))
+
+        self.pixel_values = []
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
+        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
+        train_flip = transforms.RandomHorizontalFlip(p=1.0)
+        train_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+        for image in self.instance_images:
+            image = exif_transpose(image)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            image = train_transforms(image)
+            self.pixel_values.append(image)
+
+        self.num_instance_images = len(self.instance_images)
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = self.pixel_values[index % self.num_instance_images]
+        example["instance_images"] = instance_image
+
+        if self.custom_instance_prompts:
+            caption = self.custom_instance_prompts[index % self.num_instance_images]
+            if caption:
+                example["instance_prompt"] = caption
+            else:
+                example["instance_prompt"] = self.instance_prompt
+
+        else:  # custom prompts were provided, but length does not match size of image dataset
+            example["instance_prompt"] = self.instance_prompt
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt"] = self.class_prompt
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompts = [example["instance_prompt"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        pixel_values += [example["class_images"] for example in examples]
+        prompts += [example["class_prompt"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    batch = {"pixel_values": pixel_values, "prompts": prompts}
+    return batch
+
+
+class PromptDataset(Dataset):
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def tokenize_prompt(tokenizer, prompt):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=77,
+        truncation=True,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+
+
+def _encode_prompt_with_t5(
+    text_encoder,
+    tokenizer,
+    max_sequence_length,
+    prompt=None,
+    num_images_per_prompt=1,
+    device=None,
+    text_input_ids=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    if tokenizer is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
+
+    prompt_embeds = text_encoder(text_input_ids.to(device))[0]
+
+    dtype = text_encoder.dtype
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+    _, seq_len, _ = prompt_embeds.shape
+
+    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+    return prompt_embeds
+
+
+def _encode_prompt_with_clip(
+    text_encoder,
+    tokenizer,
+    prompt: str,
+    device=None,
+    text_input_ids=None,
+    num_images_per_prompt: int = 1,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    if tokenizer is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
+
+    prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+    pooled_prompt_embeds = prompt_embeds[0]
+    prompt_embeds = prompt_embeds.hidden_states[-2]
+    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+
+    _, seq_len, _ = prompt_embeds.shape
+    # duplicate text embeddings for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def encode_prompt(
+    text_encoders,
+    tokenizers,
+    prompt: str,
+    max_sequence_length,
+    device=None,
+    num_images_per_prompt: int = 1,
+    text_input_ids_list=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+
+    clip_tokenizers = tokenizers[:2]
+    clip_text_encoders = text_encoders[:2]
+
+    clip_prompt_embeds_list = []
+    clip_pooled_prompt_embeds_list = []
+    for i, (tokenizer, text_encoder) in enumerate(zip(clip_tokenizers, clip_text_encoders)):
+        prompt_embeds, pooled_prompt_embeds = _encode_prompt_with_clip(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            prompt=prompt,
+            device=device if device is not None else text_encoder.device,
+            num_images_per_prompt=num_images_per_prompt,
+            text_input_ids=text_input_ids_list[i] if text_input_ids_list else None,
+        )
+        clip_prompt_embeds_list.append(prompt_embeds)
+        clip_pooled_prompt_embeds_list.append(pooled_prompt_embeds)
+
+    clip_prompt_embeds = torch.cat(clip_prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = torch.cat(clip_pooled_prompt_embeds_list, dim=-1)
+
+    t5_prompt_embed = _encode_prompt_with_t5(
+        text_encoders[-1],
+        tokenizers[-1],
+        max_sequence_length,
+        prompt=prompt,
+        num_images_per_prompt=num_images_per_prompt,
+        text_input_ids=text_input_ids_list[-1] if text_input_ids_list else None,
+        device=device if device is not None else text_encoders[-1].device,
+    )
+
+    clip_prompt_embeds = torch.nn.functional.pad(
+        clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+    )
+    prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
+            torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = StableDiffusion3Pipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            clear_objs_and_retain_memory(objs=[pipeline])
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+    )
+    tokenizer_two = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+    )
+    tokenizer_three = T5TokenizerFast.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_3",
+        revision=args.revision,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+    text_encoder_cls_three = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_3"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="scheduler"
+    )
+    noise_scheduler_copy = copy.deepcopy(noise_scheduler)
+    text_encoder_one, text_encoder_two, text_encoder_three = load_text_encoders(
+        text_encoder_cls_one, text_encoder_cls_two, text_encoder_cls_three
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    transformer = SD3Transformer2DModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant
+    )
+
+    transformer.requires_grad_(False)
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    text_encoder_three.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora transformer) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    vae.to(accelerator.device, dtype=torch.float32)
+    transformer.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_three.to(accelerator.device, dtype=weight_dtype)
+
+    if args.gradient_checkpointing:
+        transformer.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+            text_encoder_two.gradient_checkpointing_enable()
+
+    # now we will add new LoRA weights to the attention layers
+    transformer_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    transformer.add_adapter(transformer_lora_config)
+
+    if args.train_text_encoder:
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder_one.add_adapter(text_lora_config)
+        text_encoder_two.add_adapter(text_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            transformer_lora_layers_to_save = None
+            text_encoder_one_lora_layers_to_save = None
+            text_encoder_two_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(unwrap_model(transformer))):
+                    transformer_lora_layers_to_save = get_peft_model_state_dict(model)
+                elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                    text_encoder_one_lora_layers_to_save = get_peft_model_state_dict(model)
+                elif isinstance(model, type(unwrap_model(text_encoder_two))):
+                    text_encoder_two_lora_layers_to_save = get_peft_model_state_dict(model)
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusion3Pipeline.save_lora_weights(
+                output_dir,
+                transformer_lora_layers=transformer_lora_layers_to_save,
+                text_encoder_lora_layers=text_encoder_one_lora_layers_to_save,
+                text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        transformer_ = None
+        text_encoder_one_ = None
+        text_encoder_two_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(transformer))):
+                transformer_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                text_encoder_one_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_two))):
+                text_encoder_two_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict = StableDiffusion3Pipeline.lora_state_dict(input_dir)
+
+        transformer_state_dict = {
+            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.")
+        }
+        transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
+        incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+        if args.train_text_encoder:
+            # Do we need to call `scale_lora_layers()` here?
+            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_)
+
+            _set_state_dict_into_text_encoder(
+                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_
+            )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [transformer_]
+            if args.train_text_encoder:
+                models.extend([text_encoder_one_, text_encoder_two_])
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params(models)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [transformer]
+        if args.train_text_encoder:
+            models.extend([text_encoder_one, text_encoder_two])
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(models, dtype=torch.float32)
+
+    transformer_lora_parameters = list(filter(lambda p: p.requires_grad, transformer.parameters()))
+    if args.train_text_encoder:
+        text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
+        text_lora_parameters_two = list(filter(lambda p: p.requires_grad, text_encoder_two.parameters()))
+
+    # Optimization parameters
+    transformer_parameters_with_lr = {"params": transformer_lora_parameters, "lr": args.learning_rate}
+    if args.train_text_encoder:
+        # different learning rate for text encoder and unet
+        text_lora_parameters_one_with_lr = {
+            "params": text_lora_parameters_one,
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        text_lora_parameters_two_with_lr = {
+            "params": text_lora_parameters_two,
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        params_to_optimize = [
+            transformer_parameters_with_lr,
+            text_lora_parameters_one_with_lr,
+            text_lora_parameters_two_with_lr,
+        ]
+    else:
+        params_to_optimize = [transformer_parameters_with_lr]
+
+    # Optimizer creation
+    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    if args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_prompt=args.class_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_num=args.num_class_images,
+        size=args.resolution,
+        repeats=args.repeats,
+        center_crop=args.center_crop,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    if not args.train_text_encoder:
+        tokenizers = [tokenizer_one, tokenizer_two, tokenizer_three]
+        text_encoders = [text_encoder_one, text_encoder_two, text_encoder_three]
+
+        def compute_text_embeddings(prompt, text_encoders, tokenizers):
+            with torch.no_grad():
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                    text_encoders, tokenizers, prompt, args.max_sequence_length
+                )
+                prompt_embeds = prompt_embeds.to(accelerator.device)
+                pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
+            return prompt_embeds, pooled_prompt_embeds
+
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        instance_prompt_hidden_states, instance_pooled_prompt_embeds = compute_text_embeddings(
+            args.instance_prompt, text_encoders, tokenizers
+        )
+
+    # Handle class prompt for prior-preservation.
+    if args.with_prior_preservation:
+        if not args.train_text_encoder:
+            class_prompt_hidden_states, class_pooled_prompt_embeds = compute_text_embeddings(
+                args.class_prompt, text_encoders, tokenizers
+            )
+
+    # Clear the memory here
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        # Explicitly delete the objects as well, otherwise only the lists are deleted and the original references remain, preventing garbage collection
+        clear_objs_and_retain_memory(
+            objs=[tokenizers, text_encoders, text_encoder_one, text_encoder_two, text_encoder_three]
+        )
+
+    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
+    # pack the statically computed variables appropriately here. This is so that we don't
+    # have to pass them to the dataloader.
+
+    if not train_dataset.custom_instance_prompts:
+        if not args.train_text_encoder:
+            prompt_embeds = instance_prompt_hidden_states
+            pooled_prompt_embeds = instance_pooled_prompt_embeds
+            if args.with_prior_preservation:
+                prompt_embeds = torch.cat([prompt_embeds, class_prompt_hidden_states], dim=0)
+                pooled_prompt_embeds = torch.cat([pooled_prompt_embeds, class_pooled_prompt_embeds], dim=0)
+            # if we're optimizing the text encoder (both if instance prompt is used for all images or custom prompts) we need to tokenize and encode the
+        # batch prompts on all training steps
+        else:
+            tokens_one = tokenize_prompt(tokenizer_one, args.instance_prompt)
+            tokens_two = tokenize_prompt(tokenizer_two, args.instance_prompt)
+            tokens_three = tokenize_prompt(tokenizer_three, args.instance_prompt)
+            if args.with_prior_preservation:
+                class_tokens_one = tokenize_prompt(tokenizer_one, args.class_prompt)
+                class_tokens_two = tokenize_prompt(tokenizer_two, args.class_prompt)
+                class_tokens_three = tokenize_prompt(tokenizer_three, args.class_prompt)
+                tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0)
+                tokens_two = torch.cat([tokens_two, class_tokens_two], dim=0)
+                tokens_three = torch.cat([tokens_three, class_tokens_three], dim=0)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        (
+            transformer,
+            text_encoder_one,
+            text_encoder_two,
+            optimizer,
+            train_dataloader,
+            lr_scheduler,
+        ) = accelerator.prepare(
+            transformer, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler
+        )
+        assert text_encoder_one is not None
+        assert text_encoder_two is not None
+        assert text_encoder_three is not None
+    else:
+        transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            transformer, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = "dreambooth-sd3-lora"
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        transformer.train()
+        if args.train_text_encoder:
+            text_encoder_one.train()
+            text_encoder_two.train()
+
+            # set top parameter requires_grad = True for gradient checkpointing works
+            accelerator.unwrap_model(text_encoder_one).text_model.embeddings.requires_grad_(True)
+            accelerator.unwrap_model(text_encoder_two).text_model.embeddings.requires_grad_(True)
+
+        for step, batch in enumerate(train_dataloader):
+            models_to_accumulate = [transformer]
+            with accelerator.accumulate(models_to_accumulate):
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                prompts = batch["prompts"]
+
+                # encode batch prompts when custom prompts are provided for each image -
+                if train_dataset.custom_instance_prompts:
+                    if not args.train_text_encoder:
+                        prompt_embeds, pooled_prompt_embeds = compute_text_embeddings(
+                            prompts, text_encoders, tokenizers
+                        )
+                    else:
+                        tokens_one = tokenize_prompt(tokenizer_one, prompts)
+                        tokens_two = tokenize_prompt(tokenizer_two, prompts)
+                        tokens_three = tokenize_prompt(tokenizer_three, prompts)
+                        prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                            text_encoders=[text_encoder_one, text_encoder_two, text_encoder_three],
+                            tokenizers=[None, None, None],
+                            prompt=prompts,
+                            max_sequence_length=args.max_sequence_length,
+                            text_input_ids_list=[tokens_one, tokens_two, tokens_three],
+                        )
+                else:
+                    if args.train_text_encoder:
+                        prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                            text_encoders=[text_encoder_one, text_encoder_two, text_encoder_three],
+                            tokenizers=[None, None, tokenizer_three],
+                            prompt=args.instance_prompt,
+                            max_sequence_length=args.max_sequence_length,
+                            text_input_ids_list=[tokens_one, tokens_two, tokens_three],
+                        )
+
+                # Convert images to latent space
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+                model_input = (model_input - vae.config.shift_factor) * vae.config.scaling_factor
+                model_input = model_input.to(dtype=weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+
+                # Sample a random timestep for each image
+                # for weighting schemes where we sample timesteps non-uniformly
+                u = compute_density_for_timestep_sampling(
+                    weighting_scheme=args.weighting_scheme,
+                    batch_size=bsz,
+                    logit_mean=args.logit_mean,
+                    logit_std=args.logit_std,
+                    mode_scale=args.mode_scale,
+                )
+                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
+                timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+
+                # Add noise according to flow matching.
+                # zt = (1 - texp) * x + texp * z1
+                sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
+                noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
+
+                # Predict the noise residual
+                model_pred = transformer(
+                    hidden_states=noisy_model_input,
+                    timestep=timesteps,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    return_dict=False,
+                )[0]
+
+                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                # Preconditioning of the model outputs.
+                if args.precondition_outputs:
+                    model_pred = model_pred * (-sigmas) + noisy_model_input
+
+                # these weighting schemes use a uniform timestep sampling
+                # and instead post-weight the loss
+                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+
+                # flow matching loss
+                if args.precondition_outputs:
+                    target = model_input
+                else:
+                    target = noise - model_input
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute prior loss
+                    prior_loss = torch.mean(
+                        (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
+                            target_prior.shape[0], -1
+                        ),
+                        1,
+                    )
+                    prior_loss = prior_loss.mean()
+
+                # Compute regular loss.
+                loss = torch.mean(
+                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
+                )
+                loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(
+                            transformer_lora_parameters, text_lora_parameters_one, text_lora_parameters_two
+                        )
+                        if args.train_text_encoder
+                        else transformer_lora_parameters
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                if not args.train_text_encoder:
+                    # create pipeline
+                    text_encoder_one, text_encoder_two, text_encoder_three = load_text_encoders(
+                        text_encoder_cls_one, text_encoder_cls_two, text_encoder_cls_three
+                    )
+                pipeline = StableDiffusion3Pipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    text_encoder=accelerator.unwrap_model(text_encoder_one),
+                    text_encoder_2=accelerator.unwrap_model(text_encoder_two),
+                    text_encoder_3=accelerator.unwrap_model(text_encoder_three),
+                    transformer=accelerator.unwrap_model(transformer),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline_args = {"prompt": args.validation_prompt}
+                images = log_validation(
+                    pipeline=pipeline,
+                    args=args,
+                    accelerator=accelerator,
+                    pipeline_args=pipeline_args,
+                    epoch=epoch,
+                    torch_dtype=weight_dtype,
+                )
+                objs = []
+                if not args.train_text_encoder:
+                    objs.extend([text_encoder_one, text_encoder_two, text_encoder_three])
+
+                clear_objs_and_retain_memory(objs=objs)
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        transformer = unwrap_model(transformer)
+        transformer = transformer.to(torch.float32)
+        transformer_lora_layers = get_peft_model_state_dict(transformer)
+
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            text_encoder_lora_layers = get_peft_model_state_dict(text_encoder_one.to(torch.float32))
+            text_encoder_two = unwrap_model(text_encoder_two)
+            text_encoder_2_lora_layers = get_peft_model_state_dict(text_encoder_two.to(torch.float32))
+        else:
+            text_encoder_lora_layers = None
+            text_encoder_2_lora_layers = None
+
+        StableDiffusion3Pipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            transformer_lora_layers=transformer_lora_layers,
+            text_encoder_lora_layers=text_encoder_lora_layers,
+            text_encoder_2_lora_layers=text_encoder_2_lora_layers,
+        )
+
+        # Final inference
+        # Load previous pipeline
+        pipeline = StableDiffusion3Pipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        # load attention processors
+        pipeline.load_lora_weights(args.output_dir)
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline_args = {"prompt": args.validation_prompt}
+            images = log_validation(
+                pipeline=pipeline,
+                args=args,
+                accelerator=accelerator,
+                pipeline_args=pipeline_args,
+                epoch=epoch,
+                is_final_validation=True,
+                torch_dtype=weight_dtype,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                instance_prompt=args.instance_prompt,
+                validation_prompt=args.validation_prompt,
+                train_text_encoder=args.train_text_encoder,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/dreambooth/train_dreambooth_lora_sdxl.py b/diffusers/examples/dreambooth/train_dreambooth_lora_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..016464165c440506d25a754e151d7761808740df
--- /dev/null
+++ b/diffusers/examples/dreambooth/train_dreambooth_lora_sdxl.py
@@ -0,0 +1,1987 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import gc
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import shutil
+import warnings
+from contextlib import nullcontext
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, hf_hub_download, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from packaging import version
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from safetensors.torch import load_file, save_file
+from torch.utils.data import Dataset
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DPMSolverMultistepScheduler,
+    EDMEulerScheduler,
+    EulerDiscreteScheduler,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr
+from diffusers.utils import (
+    check_min_version,
+    convert_all_state_dict_to_peft,
+    convert_state_dict_to_diffusers,
+    convert_state_dict_to_kohya,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def determine_scheduler_type(pretrained_model_name_or_path, revision):
+    model_index_filename = "model_index.json"
+    if os.path.isdir(pretrained_model_name_or_path):
+        model_index = os.path.join(pretrained_model_name_or_path, model_index_filename)
+    else:
+        model_index = hf_hub_download(
+            repo_id=pretrained_model_name_or_path, filename=model_index_filename, revision=revision
+        )
+
+    with open(model_index, "r") as f:
+        scheduler_type = json.load(f)["scheduler"][1]
+    return scheduler_type
+
+
+def save_model_card(
+    repo_id: str,
+    use_dora: bool,
+    images=None,
+    base_model: str = None,
+    train_text_encoder=False,
+    instance_prompt=None,
+    validation_prompt=None,
+    repo_folder=None,
+    vae_path=None,
+):
+    widget_dict = []
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            widget_dict.append(
+                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
+            )
+
+    model_description = f"""
+# {'SDXL' if 'playground' not in base_model else 'Playground'} LoRA DreamBooth - {repo_id}
+
+<Gallery />
+
+## Model description
+
+These are {repo_id} LoRA adaption weights for {base_model}.
+
+The weights were trained  using [DreamBooth](https://dreambooth.github.io/).
+
+LoRA for the text encoder was enabled: {train_text_encoder}.
+
+Special VAE used for training: {vae_path}.
+
+## Trigger words
+
+You should use {instance_prompt} to trigger the image generation.
+
+## Download model
+
+Weights for this model are available in Safetensors format.
+
+[Download]({repo_id}/tree/main) them in the Files & versions tab.
+
+"""
+    if "playground" in base_model:
+        model_description += """\n
+## License
+
+Please adhere to the licensing terms as described [here](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic/blob/main/LICENSE.md).
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="openrail++" if "playground" not in base_model else "playground-v2dot5-community",
+        base_model=base_model,
+        prompt=instance_prompt,
+        model_description=model_description,
+        widget=widget_dict,
+    )
+    tags = [
+        "text-to-image",
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+        "lora" if not use_dora else "dora",
+        "template:sd-lora",
+    ]
+    if "playground" in base_model:
+        tags.extend(["playground", "playground-diffusers"])
+    else:
+        tags.extend(["stable-diffusion-xl", "stable-diffusion-xl-diffusers"])
+
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    torch_dtype,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+
+    # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+    scheduler_args = {}
+
+    if not args.do_edm_style_training:
+        if "variance_type" in pipeline.scheduler.config:
+            variance_type = pipeline.scheduler.config.variance_type
+
+            if variance_type in ["learned", "learned_range"]:
+                variance_type = "fixed_small"
+
+            scheduler_args["variance_type"] = variance_type
+
+        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
+    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    # Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better
+    # way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
+    if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    with autocast_ctx:
+        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help=("A folder containing the training data. "),
+    )
+
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing the target image. By "
+        "default, the standard Image Dataset maps out 'file_name' "
+        "to 'image'.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default=None,
+        help="The column of the dataset containing the instance prompt for each image",
+    )
+
+    parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
+
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--do_edm_style_training",
+        default=False,
+        action="store_true",
+        help="Flag to conduct training using the EDM formulation as introduced in https://arxiv.org/abs/2206.00364.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lora-dreambooth-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--output_kohya_format",
+        action="store_true",
+        help="Flag to additionally generate final state dict in the Kohya format so that it becomes compatible with A111, Comfy, Kohya, etc.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+
+    parser.add_argument(
+        "--text_encoder_lr",
+        type=float,
+        default=5e-6,
+        help="Text encoder learning rate to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="AdamW",
+        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="coefficients for computing the Prodigy stepsize using running averages. If set to None, "
+        "uses the value of square root of beta2. Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
+    )
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+
+    parser.add_argument(
+        "--prodigy_use_bias_correction",
+        type=bool,
+        default=True,
+        help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW",
+    )
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        type=bool,
+        default=True,
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. "
+        "Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--use_dora",
+        action="store_true",
+        default=False,
+        help=(
+            "Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
+            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
+        ),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.instance_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`")
+
+    if args.dataset_name is not None and args.instance_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        class_prompt,
+        class_data_root=None,
+        class_num=None,
+        size=1024,
+        repeats=1,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.custom_instance_prompts = None
+        self.class_prompt = class_prompt
+
+        # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory,
+        # we load the training data using load_dataset
+        if args.dataset_name is not None:
+            try:
+                from datasets import load_dataset
+            except ImportError:
+                raise ImportError(
+                    "You are trying to load your data using the datasets library. If you wish to train using custom "
+                    "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                    "local folder containing images only, specify --instance_data_dir instead."
+                )
+            # Downloading and loading a dataset from the hub.
+            # See more about loading custom images at
+            # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+            dataset = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                cache_dir=args.cache_dir,
+            )
+            # Preprocessing the datasets.
+            column_names = dataset["train"].column_names
+
+            # 6. Get the column names for input/target.
+            if args.image_column is None:
+                image_column = column_names[0]
+                logger.info(f"image column defaulting to {image_column}")
+            else:
+                image_column = args.image_column
+                if image_column not in column_names:
+                    raise ValueError(
+                        f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+            instance_images = dataset["train"][image_column]
+
+            if args.caption_column is None:
+                logger.info(
+                    "No caption column provided, defaulting to instance_prompt for all images. If your dataset "
+                    "contains captions/prompts for the images, make sure to specify the "
+                    "column as --caption_column"
+                )
+                self.custom_instance_prompts = None
+            else:
+                if args.caption_column not in column_names:
+                    raise ValueError(
+                        f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+                custom_instance_prompts = dataset["train"][args.caption_column]
+                # create final list of captions according to --repeats
+                self.custom_instance_prompts = []
+                for caption in custom_instance_prompts:
+                    self.custom_instance_prompts.extend(itertools.repeat(caption, repeats))
+        else:
+            self.instance_data_root = Path(instance_data_root)
+            if not self.instance_data_root.exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+            self.custom_instance_prompts = None
+
+        self.instance_images = []
+        for img in instance_images:
+            self.instance_images.extend(itertools.repeat(img, repeats))
+
+        # image processing to prepare for using SD-XL micro-conditioning
+        self.original_sizes = []
+        self.crop_top_lefts = []
+        self.pixel_values = []
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
+        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
+        train_flip = transforms.RandomHorizontalFlip(p=1.0)
+        train_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+        for image in self.instance_images:
+            image = exif_transpose(image)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+            self.original_sizes.append((image.height, image.width))
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            crop_top_left = (y1, x1)
+            self.crop_top_lefts.append(crop_top_left)
+            image = train_transforms(image)
+            self.pixel_values.append(image)
+
+        self.num_instance_images = len(self.instance_images)
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = self.pixel_values[index % self.num_instance_images]
+        original_size = self.original_sizes[index % self.num_instance_images]
+        crop_top_left = self.crop_top_lefts[index % self.num_instance_images]
+        example["instance_images"] = instance_image
+        example["original_size"] = original_size
+        example["crop_top_left"] = crop_top_left
+
+        if self.custom_instance_prompts:
+            caption = self.custom_instance_prompts[index % self.num_instance_images]
+            if caption:
+                example["instance_prompt"] = caption
+            else:
+                example["instance_prompt"] = self.instance_prompt
+
+        else:  # custom prompts were provided, but length does not match size of image dataset
+            example["instance_prompt"] = self.instance_prompt
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt"] = self.class_prompt
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompts = [example["instance_prompt"] for example in examples]
+    original_sizes = [example["original_size"] for example in examples]
+    crop_top_lefts = [example["crop_top_left"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        pixel_values += [example["class_images"] for example in examples]
+        prompts += [example["class_prompt"] for example in examples]
+        original_sizes += [example["original_size"] for example in examples]
+        crop_top_lefts += [example["crop_top_left"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    batch = {
+        "pixel_values": pixel_values,
+        "prompts": prompts,
+        "original_sizes": original_sizes,
+        "crop_top_lefts": crop_top_lefts,
+    }
+    return batch
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def tokenize_prompt(tokenizer, prompt):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(text_encoders, tokenizers, prompt, text_input_ids_list=None):
+    prompt_embeds_list = []
+
+    for i, text_encoder in enumerate(text_encoders):
+        if tokenizers is not None:
+            tokenizer = tokenizers[i]
+            text_input_ids = tokenize_prompt(tokenizer, prompt)
+        else:
+            assert text_input_ids_list is not None
+            text_input_ids = text_input_ids_list[i]
+
+        prompt_embeds = text_encoder(
+            text_input_ids.to(text_encoder.device), output_hidden_states=True, return_dict=False
+        )
+
+        # We are only ALWAYS interested in the pooled output of the final text encoder
+        pooled_prompt_embeds = prompt_embeds[0]
+        prompt_embeds = prompt_embeds[-1][-2]
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+        prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if args.do_edm_style_training and args.snr_gamma is not None:
+        raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
+            torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = StableDiffusionXLPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    scheduler_type = determine_scheduler_type(args.pretrained_model_name_or_path, args.revision)
+    if "EDM" in scheduler_type:
+        args.do_edm_style_training = True
+        noise_scheduler = EDMEulerScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+        logger.info("Performing EDM-style training!")
+    elif args.do_edm_style_training:
+        noise_scheduler = EulerDiscreteScheduler.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="scheduler"
+        )
+        logger.info("Performing EDM-style training!")
+    else:
+        noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    latents_mean = latents_std = None
+    if hasattr(vae.config, "latents_mean") and vae.config.latents_mean is not None:
+        latents_mean = torch.tensor(vae.config.latents_mean).view(1, 4, 1, 1)
+    if hasattr(vae.config, "latents_std") and vae.config.latents_std is not None:
+        latents_std = torch.tensor(vae.config.latents_std).view(1, 4, 1, 1)
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # We only train the additional adapter LoRA layers
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+
+    # The VAE is always in float32 to avoid NaN losses.
+    vae.to(accelerator.device, dtype=torch.float32)
+
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, "
+                    "please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+            text_encoder_two.gradient_checkpointing_enable()
+
+    # now we will add new LoRA weights to the attention layers
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        use_dora=args.use_dora,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    unet.add_adapter(unet_lora_config)
+
+    # The text encoder comes from 🤗 transformers, so we cannot directly modify it.
+    # So, instead, we monkey-patch the forward calls of its attention-blocks.
+    if args.train_text_encoder:
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            use_dora=args.use_dora,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder_one.add_adapter(text_lora_config)
+        text_encoder_two.add_adapter(text_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder atten layers
+            unet_lora_layers_to_save = None
+            text_encoder_one_lora_layers_to_save = None
+            text_encoder_two_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                    text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
+                elif isinstance(model, type(unwrap_model(text_encoder_two))):
+                    text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionXLPipeline.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=text_encoder_one_lora_layers_to_save,
+                text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+        text_encoder_one_ = None
+        text_encoder_two_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(unet))):
+                unet_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                text_encoder_one_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_two))):
+                text_encoder_two_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
+
+        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        if args.train_text_encoder:
+            # Do we need to call `scale_lora_layers()` here?
+            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_)
+
+            _set_state_dict_into_text_encoder(
+                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_
+            )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [unet_]
+            if args.train_text_encoder:
+                models.extend([text_encoder_one_, text_encoder_two_])
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params(models)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [unet]
+        if args.train_text_encoder:
+            models.extend([text_encoder_one, text_encoder_two])
+
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(models, dtype=torch.float32)
+
+    unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters()))
+
+    if args.train_text_encoder:
+        text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
+        text_lora_parameters_two = list(filter(lambda p: p.requires_grad, text_encoder_two.parameters()))
+
+    # Optimization parameters
+    unet_lora_parameters_with_lr = {"params": unet_lora_parameters, "lr": args.learning_rate}
+    if args.train_text_encoder:
+        # different learning rate for text encoder and unet
+        text_lora_parameters_one_with_lr = {
+            "params": text_lora_parameters_one,
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        text_lora_parameters_two_with_lr = {
+            "params": text_lora_parameters_two,
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        params_to_optimize = [
+            unet_lora_parameters_with_lr,
+            text_lora_parameters_one_with_lr,
+            text_lora_parameters_two_with_lr,
+        ]
+    else:
+        params_to_optimize = [unet_lora_parameters_with_lr]
+
+    # Optimizer creation
+    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    if args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+        if args.train_text_encoder and args.text_encoder_lr:
+            logger.warning(
+                f"Learning rates were provided both for the unet and the text encoder- e.g. text_encoder_lr:"
+                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
+                f"When using prodigy only learning_rate is used as the initial learning rate."
+            )
+            # changes the learning rate of text_encoder_parameters_one and text_encoder_parameters_two to be
+            # --learning_rate
+            params_to_optimize[1]["lr"] = args.learning_rate
+            params_to_optimize[2]["lr"] = args.learning_rate
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_prompt=args.class_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_num=args.num_class_images,
+        size=args.resolution,
+        repeats=args.repeats,
+        center_crop=args.center_crop,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Computes additional embeddings/ids required by the SDXL UNet.
+    # regular text embeddings (when `train_text_encoder` is not True)
+    # pooled text embeddings
+    # time ids
+
+    def compute_time_ids(original_size, crops_coords_top_left):
+        # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+        target_size = (args.resolution, args.resolution)
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+        add_time_ids = torch.tensor([add_time_ids])
+        add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+        return add_time_ids
+
+    if not args.train_text_encoder:
+        tokenizers = [tokenizer_one, tokenizer_two]
+        text_encoders = [text_encoder_one, text_encoder_two]
+
+        def compute_text_embeddings(prompt, text_encoders, tokenizers):
+            with torch.no_grad():
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(text_encoders, tokenizers, prompt)
+                prompt_embeds = prompt_embeds.to(accelerator.device)
+                pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
+            return prompt_embeds, pooled_prompt_embeds
+
+    # If no type of tuning is done on the text_encoder and custom instance prompts are NOT
+    # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
+    # the redundant encoding.
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        instance_prompt_hidden_states, instance_pooled_prompt_embeds = compute_text_embeddings(
+            args.instance_prompt, text_encoders, tokenizers
+        )
+
+    # Handle class prompt for prior-preservation.
+    if args.with_prior_preservation:
+        if not args.train_text_encoder:
+            class_prompt_hidden_states, class_pooled_prompt_embeds = compute_text_embeddings(
+                args.class_prompt, text_encoders, tokenizers
+            )
+
+    # Clear the memory here
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        del tokenizers, text_encoders
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
+    # pack the statically computed variables appropriately here. This is so that we don't
+    # have to pass them to the dataloader.
+
+    if not train_dataset.custom_instance_prompts:
+        if not args.train_text_encoder:
+            prompt_embeds = instance_prompt_hidden_states
+            unet_add_text_embeds = instance_pooled_prompt_embeds
+            if args.with_prior_preservation:
+                prompt_embeds = torch.cat([prompt_embeds, class_prompt_hidden_states], dim=0)
+                unet_add_text_embeds = torch.cat([unet_add_text_embeds, class_pooled_prompt_embeds], dim=0)
+        # if we're optimizing the text encoder (both if instance prompt is used for all images or custom prompts) we need to tokenize and encode the
+        # batch prompts on all training steps
+        else:
+            tokens_one = tokenize_prompt(tokenizer_one, args.instance_prompt)
+            tokens_two = tokenize_prompt(tokenizer_two, args.instance_prompt)
+            if args.with_prior_preservation:
+                class_tokens_one = tokenize_prompt(tokenizer_one, args.class_prompt)
+                class_tokens_two = tokenize_prompt(tokenizer_two, args.class_prompt)
+                tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0)
+                tokens_two = torch.cat([tokens_two, class_tokens_two], dim=0)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = (
+            "dreambooth-lora-sd-xl"
+            if "playground" not in args.pretrained_model_name_or_path
+            else "dreambooth-lora-playground"
+        )
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.train_text_encoder:
+            text_encoder_one.train()
+            text_encoder_two.train()
+
+            # set top parameter requires_grad = True for gradient checkpointing works
+            accelerator.unwrap_model(text_encoder_one).text_model.embeddings.requires_grad_(True)
+            accelerator.unwrap_model(text_encoder_two).text_model.embeddings.requires_grad_(True)
+
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                prompts = batch["prompts"]
+
+                # encode batch prompts when custom prompts are provided for each image -
+                if train_dataset.custom_instance_prompts:
+                    if not args.train_text_encoder:
+                        prompt_embeds, unet_add_text_embeds = compute_text_embeddings(
+                            prompts, text_encoders, tokenizers
+                        )
+                    else:
+                        tokens_one = tokenize_prompt(tokenizer_one, prompts)
+                        tokens_two = tokenize_prompt(tokenizer_two, prompts)
+
+                # Convert images to latent space
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+
+                if latents_mean is None and latents_std is None:
+                    model_input = model_input * vae.config.scaling_factor
+                    if args.pretrained_vae_model_name_or_path is None:
+                        model_input = model_input.to(weight_dtype)
+                else:
+                    latents_mean = latents_mean.to(device=model_input.device, dtype=model_input.dtype)
+                    latents_std = latents_std.to(device=model_input.device, dtype=model_input.dtype)
+                    model_input = (model_input - latents_mean) * vae.config.scaling_factor / latents_std
+                    model_input = model_input.to(dtype=weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+
+                # Sample a random timestep for each image
+                if not args.do_edm_style_training:
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                    )
+                    timesteps = timesteps.long()
+                else:
+                    # in EDM formulation, the model is conditioned on the pre-conditioned noise levels
+                    # instead of discrete timesteps, so here we sample indices to get the noise levels
+                    # from `scheduler.timesteps`
+                    indices = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,))
+                    timesteps = noise_scheduler.timesteps[indices].to(device=model_input.device)
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+                # For EDM-style training, we first obtain the sigmas based on the continuous timesteps.
+                # We then precondition the final model inputs based on these sigmas instead of the timesteps.
+                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                if args.do_edm_style_training:
+                    sigmas = get_sigmas(timesteps, len(noisy_model_input.shape), noisy_model_input.dtype)
+                    if "EDM" in scheduler_type:
+                        inp_noisy_latents = noise_scheduler.precondition_inputs(noisy_model_input, sigmas)
+                    else:
+                        inp_noisy_latents = noisy_model_input / ((sigmas**2 + 1) ** 0.5)
+
+                # time ids
+                add_time_ids = torch.cat(
+                    [
+                        compute_time_ids(original_size=s, crops_coords_top_left=c)
+                        for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])
+                    ]
+                )
+
+                # Calculate the elements to repeat depending on the use of prior-preservation and custom captions.
+                if not train_dataset.custom_instance_prompts:
+                    elems_to_repeat_text_embeds = bsz // 2 if args.with_prior_preservation else bsz
+                else:
+                    elems_to_repeat_text_embeds = 1
+
+                # Predict the noise residual
+                if not args.train_text_encoder:
+                    unet_added_conditions = {
+                        "time_ids": add_time_ids,
+                        "text_embeds": unet_add_text_embeds.repeat(elems_to_repeat_text_embeds, 1),
+                    }
+                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
+                    model_pred = unet(
+                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        timesteps,
+                        prompt_embeds_input,
+                        added_cond_kwargs=unet_added_conditions,
+                        return_dict=False,
+                    )[0]
+                else:
+                    unet_added_conditions = {"time_ids": add_time_ids}
+                    prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                        text_encoders=[text_encoder_one, text_encoder_two],
+                        tokenizers=None,
+                        prompt=None,
+                        text_input_ids_list=[tokens_one, tokens_two],
+                    )
+                    unet_added_conditions.update(
+                        {"text_embeds": pooled_prompt_embeds.repeat(elems_to_repeat_text_embeds, 1)}
+                    )
+                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
+                    model_pred = unet(
+                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        timesteps,
+                        prompt_embeds_input,
+                        added_cond_kwargs=unet_added_conditions,
+                        return_dict=False,
+                    )[0]
+
+                weighting = None
+                if args.do_edm_style_training:
+                    # Similar to the input preconditioning, the model predictions are also preconditioned
+                    # on noised model inputs (before preconditioning) and the sigmas.
+                    # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                    if "EDM" in scheduler_type:
+                        model_pred = noise_scheduler.precondition_outputs(noisy_model_input, model_pred, sigmas)
+                    else:
+                        if noise_scheduler.config.prediction_type == "epsilon":
+                            model_pred = model_pred * (-sigmas) + noisy_model_input
+                        elif noise_scheduler.config.prediction_type == "v_prediction":
+                            model_pred = model_pred * (-sigmas / (sigmas**2 + 1) ** 0.5) + (
+                                noisy_model_input / (sigmas**2 + 1)
+                            )
+                    # We are not doing weighting here because it tends result in numerical problems.
+                    # See: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
+                    # There might be other alternatives for weighting as well:
+                    # https://github.com/huggingface/diffusers/pull/7126#discussion_r1505404686
+                    if "EDM" not in scheduler_type:
+                        weighting = (sigmas**-2.0).float()
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = model_input if args.do_edm_style_training else noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = (
+                        model_input
+                        if args.do_edm_style_training
+                        else noise_scheduler.get_velocity(model_input, noise, timesteps)
+                    )
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute prior loss
+                    if weighting is not None:
+                        prior_loss = torch.mean(
+                            (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
+                                target_prior.shape[0], -1
+                            ),
+                            1,
+                        )
+                        prior_loss = prior_loss.mean()
+                    else:
+                        prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                if args.snr_gamma is None:
+                    if weighting is not None:
+                        loss = torch.mean(
+                            (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(
+                                target.shape[0], -1
+                            ),
+                            1,
+                        )
+                        loss = loss.mean()
+                    else:
+                        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    base_weight = (
+                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
+                    )
+
+                    if noise_scheduler.config.prediction_type == "v_prediction":
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(unet_lora_parameters, text_lora_parameters_one, text_lora_parameters_two)
+                        if args.train_text_encoder
+                        else unet_lora_parameters
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                if not args.train_text_encoder:
+                    text_encoder_one = text_encoder_cls_one.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        subfolder="text_encoder",
+                        revision=args.revision,
+                        variant=args.variant,
+                    )
+                    text_encoder_two = text_encoder_cls_two.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        subfolder="text_encoder_2",
+                        revision=args.revision,
+                        variant=args.variant,
+                    )
+                pipeline = StableDiffusionXLPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    text_encoder=accelerator.unwrap_model(text_encoder_one),
+                    text_encoder_2=accelerator.unwrap_model(text_encoder_two),
+                    unet=accelerator.unwrap_model(unet),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline_args = {"prompt": args.validation_prompt}
+
+                images = log_validation(
+                    pipeline,
+                    args,
+                    accelerator,
+                    pipeline_args,
+                    epoch,
+                    torch_dtype=weight_dtype,
+                )
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        unet = unet.to(torch.float32)
+        unet_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            text_encoder_lora_layers = convert_state_dict_to_diffusers(
+                get_peft_model_state_dict(text_encoder_one.to(torch.float32))
+            )
+            text_encoder_two = unwrap_model(text_encoder_two)
+            text_encoder_2_lora_layers = convert_state_dict_to_diffusers(
+                get_peft_model_state_dict(text_encoder_two.to(torch.float32))
+            )
+        else:
+            text_encoder_lora_layers = None
+            text_encoder_2_lora_layers = None
+
+        StableDiffusionXLPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_layers,
+            text_encoder_lora_layers=text_encoder_lora_layers,
+            text_encoder_2_lora_layers=text_encoder_2_lora_layers,
+        )
+        if args.output_kohya_format:
+            lora_state_dict = load_file(f"{args.output_dir}/pytorch_lora_weights.safetensors")
+            peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict)
+            kohya_state_dict = convert_state_dict_to_kohya(peft_state_dict)
+            save_file(kohya_state_dict, f"{args.output_dir}/pytorch_lora_weights_kohya.safetensors")
+
+        # Final inference
+        # Load previous pipeline
+        vae = AutoencoderKL.from_pretrained(
+            vae_path,
+            subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+
+        # load attention processors
+        pipeline.load_lora_weights(args.output_dir)
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline_args = {"prompt": args.validation_prompt, "num_inference_steps": 25}
+            images = log_validation(
+                pipeline,
+                args,
+                accelerator,
+                pipeline_args,
+                epoch,
+                is_final_validation=True,
+                torch_dtype=weight_dtype,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                use_dora=args.use_dora,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                train_text_encoder=args.train_text_encoder,
+                instance_prompt=args.instance_prompt,
+                validation_prompt=args.validation_prompt,
+                repo_folder=args.output_dir,
+                vae_path=args.pretrained_vae_model_name_or_path,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/dreambooth/train_dreambooth_sd3.py b/diffusers/examples/dreambooth/train_dreambooth_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..c34024f478c163b9f130a39aa3648a5c45cae918
--- /dev/null
+++ b/diffusers/examples/dreambooth/train_dreambooth_sd3.py
@@ -0,0 +1,1805 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import gc
+import itertools
+import logging
+import math
+import os
+import random
+import shutil
+import warnings
+from contextlib import nullcontext
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer, PretrainedConfig, T5EncoderModel, T5TokenizerFast
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+    StableDiffusion3Pipeline,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3
+from diffusers.utils import (
+    check_min_version,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images=None,
+    base_model: str = None,
+    train_text_encoder=False,
+    instance_prompt=None,
+    validation_prompt=None,
+    repo_folder=None,
+):
+    widget_dict = []
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            widget_dict.append(
+                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
+            )
+
+    model_description = f"""
+# SD3 DreamBooth - {repo_id}
+
+<Gallery />
+
+## Model description
+
+These are {repo_id} DreamBooth weights for {base_model}.
+
+The weights were trained using [DreamBooth](https://dreambooth.github.io/) with the [SD3 diffusers trainer](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_sd3.md).
+
+Was the text encoder fine-tuned? {train_text_encoder}.
+
+## Trigger words
+
+You should use `{instance_prompt}` to trigger the image generation.
+
+## Use it with the [🧨 diffusers library](https://github.com/huggingface/diffusers)
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+pipeline = AutoPipelineForText2Image.from_pretrained('{repo_id}', torch_dtype=torch.float16).to('cuda')
+image = pipeline('{validation_prompt if validation_prompt else instance_prompt}').images[0]
+```
+
+## License
+
+Please adhere to the licensing terms as described `[here](https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE)`.
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="openrail++",
+        base_model=base_model,
+        prompt=instance_prompt,
+        model_description=model_description,
+        widget=widget_dict,
+    )
+    tags = [
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+        "sd3",
+        "sd3-diffusers",
+        "template:sd-lora",
+    ]
+
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def load_text_encoders(class_one, class_two, class_three):
+    text_encoder_one = class_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = class_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    text_encoder_three = class_three.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_3", revision=args.revision, variant=args.variant
+    )
+    return text_encoder_one, text_encoder_two, text_encoder_three
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    torch_dtype,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    # autocast_ctx = torch.autocast(accelerator.device.type) if not is_final_validation else nullcontext()
+    autocast_ctx = nullcontext()
+
+    with autocast_ctx:
+        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+    if model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help=("A folder containing the training data. "),
+    )
+
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing the target image. By "
+        "default, the standard Image Dataset maps out 'file_name' "
+        "to 'image'.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default=None,
+        help="The column of the dataset containing the instance prompt for each image",
+    )
+
+    parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
+
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=77,
+        help="Maximum sequence length to use with with the T5 text encoder",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd3-dreambooth",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+
+    parser.add_argument(
+        "--text_encoder_lr",
+        type=float,
+        default=5e-6,
+        help="Text encoder learning rate to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="logit_normal",
+        choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"],
+    )
+    parser.add_argument(
+        "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
+    )
+    parser.add_argument(
+        "--precondition_outputs",
+        type=int,
+        default=1,
+        help="Flag indicating if we are preconditioning the model outputs or not as done in EDM. This affects how "
+        "model `target` is calculated.",
+    )
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="AdamW",
+        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="coefficients for computing the Prodigy stepsize using running averages. If set to None, "
+        "uses the value of square root of beta2. Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
+    )
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+
+    parser.add_argument(
+        "--prodigy_use_bias_correction",
+        type=bool,
+        default=True,
+        help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW",
+    )
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        type=bool,
+        default=True,
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. "
+        "Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.instance_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`")
+
+    if args.dataset_name is not None and args.instance_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        class_prompt,
+        class_data_root=None,
+        class_num=None,
+        size=1024,
+        repeats=1,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.custom_instance_prompts = None
+        self.class_prompt = class_prompt
+
+        # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory,
+        # we load the training data using load_dataset
+        if args.dataset_name is not None:
+            try:
+                from datasets import load_dataset
+            except ImportError:
+                raise ImportError(
+                    "You are trying to load your data using the datasets library. If you wish to train using custom "
+                    "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                    "local folder containing images only, specify --instance_data_dir instead."
+                )
+            # Downloading and loading a dataset from the hub.
+            # See more about loading custom images at
+            # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+            dataset = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                cache_dir=args.cache_dir,
+            )
+            # Preprocessing the datasets.
+            column_names = dataset["train"].column_names
+
+            # 6. Get the column names for input/target.
+            if args.image_column is None:
+                image_column = column_names[0]
+                logger.info(f"image column defaulting to {image_column}")
+            else:
+                image_column = args.image_column
+                if image_column not in column_names:
+                    raise ValueError(
+                        f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+            instance_images = dataset["train"][image_column]
+
+            if args.caption_column is None:
+                logger.info(
+                    "No caption column provided, defaulting to instance_prompt for all images. If your dataset "
+                    "contains captions/prompts for the images, make sure to specify the "
+                    "column as --caption_column"
+                )
+                self.custom_instance_prompts = None
+            else:
+                if args.caption_column not in column_names:
+                    raise ValueError(
+                        f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+                custom_instance_prompts = dataset["train"][args.caption_column]
+                # create final list of captions according to --repeats
+                self.custom_instance_prompts = []
+                for caption in custom_instance_prompts:
+                    self.custom_instance_prompts.extend(itertools.repeat(caption, repeats))
+        else:
+            self.instance_data_root = Path(instance_data_root)
+            if not self.instance_data_root.exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+            self.custom_instance_prompts = None
+
+        self.instance_images = []
+        for img in instance_images:
+            self.instance_images.extend(itertools.repeat(img, repeats))
+
+        self.pixel_values = []
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
+        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
+        train_flip = transforms.RandomHorizontalFlip(p=1.0)
+        train_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+        for image in self.instance_images:
+            image = exif_transpose(image)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            image = train_transforms(image)
+            self.pixel_values.append(image)
+
+        self.num_instance_images = len(self.instance_images)
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = self.pixel_values[index % self.num_instance_images]
+        example["instance_images"] = instance_image
+
+        if self.custom_instance_prompts:
+            caption = self.custom_instance_prompts[index % self.num_instance_images]
+            if caption:
+                example["instance_prompt"] = caption
+            else:
+                example["instance_prompt"] = self.instance_prompt
+
+        else:  # custom prompts were provided, but length does not match size of image dataset
+            example["instance_prompt"] = self.instance_prompt
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt"] = self.class_prompt
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompts = [example["instance_prompt"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        pixel_values += [example["class_images"] for example in examples]
+        prompts += [example["class_prompt"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    batch = {"pixel_values": pixel_values, "prompts": prompts}
+    return batch
+
+
+class PromptDataset(Dataset):
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def tokenize_prompt(tokenizer, prompt):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=77,
+        truncation=True,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+
+
+def _encode_prompt_with_t5(
+    text_encoder,
+    tokenizer,
+    max_sequence_length,
+    prompt=None,
+    num_images_per_prompt=1,
+    device=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=max_sequence_length,
+        truncation=True,
+        add_special_tokens=True,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    prompt_embeds = text_encoder(text_input_ids.to(device))[0]
+
+    dtype = text_encoder.dtype
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+    _, seq_len, _ = prompt_embeds.shape
+
+    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+    return prompt_embeds
+
+
+def _encode_prompt_with_clip(
+    text_encoder,
+    tokenizer,
+    prompt: str,
+    device=None,
+    num_images_per_prompt: int = 1,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=77,
+        truncation=True,
+        return_tensors="pt",
+    )
+
+    text_input_ids = text_inputs.input_ids
+    prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+    pooled_prompt_embeds = prompt_embeds[0]
+    prompt_embeds = prompt_embeds.hidden_states[-2]
+    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+
+    _, seq_len, _ = prompt_embeds.shape
+    # duplicate text embeddings for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def encode_prompt(
+    text_encoders,
+    tokenizers,
+    prompt: str,
+    max_sequence_length,
+    device=None,
+    num_images_per_prompt: int = 1,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+
+    clip_tokenizers = tokenizers[:2]
+    clip_text_encoders = text_encoders[:2]
+
+    clip_prompt_embeds_list = []
+    clip_pooled_prompt_embeds_list = []
+    for tokenizer, text_encoder in zip(clip_tokenizers, clip_text_encoders):
+        prompt_embeds, pooled_prompt_embeds = _encode_prompt_with_clip(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            prompt=prompt,
+            device=device if device is not None else text_encoder.device,
+            num_images_per_prompt=num_images_per_prompt,
+        )
+        clip_prompt_embeds_list.append(prompt_embeds)
+        clip_pooled_prompt_embeds_list.append(pooled_prompt_embeds)
+
+    clip_prompt_embeds = torch.cat(clip_prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = torch.cat(clip_pooled_prompt_embeds_list, dim=-1)
+
+    t5_prompt_embed = _encode_prompt_with_t5(
+        text_encoders[-1],
+        tokenizers[-1],
+        max_sequence_length,
+        prompt=prompt,
+        num_images_per_prompt=num_images_per_prompt,
+        device=device if device is not None else text_encoders[-1].device,
+    )
+
+    clip_prompt_embeds = torch.nn.functional.pad(
+        clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+    )
+    prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
+            torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = StableDiffusion3Pipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+    )
+    tokenizer_two = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+    )
+    tokenizer_three = T5TokenizerFast.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_3",
+        revision=args.revision,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+    text_encoder_cls_three = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_3"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="scheduler"
+    )
+    noise_scheduler_copy = copy.deepcopy(noise_scheduler)
+    text_encoder_one, text_encoder_two, text_encoder_three = load_text_encoders(
+        text_encoder_cls_one, text_encoder_cls_two, text_encoder_cls_three
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    transformer = SD3Transformer2DModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant
+    )
+
+    transformer.requires_grad_(True)
+    vae.requires_grad_(False)
+    if args.train_text_encoder:
+        text_encoder_one.requires_grad_(True)
+        text_encoder_two.requires_grad_(True)
+        text_encoder_three.requires_grad_(True)
+    else:
+        text_encoder_one.requires_grad_(False)
+        text_encoder_two.requires_grad_(False)
+        text_encoder_three.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora transformer) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    vae.to(accelerator.device, dtype=torch.float32)
+    if not args.train_text_encoder:
+        text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+        text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+        text_encoder_three.to(accelerator.device, dtype=weight_dtype)
+
+    if args.gradient_checkpointing:
+        transformer.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+            text_encoder_two.gradient_checkpointing_enable()
+            text_encoder_three.gradient_checkpointing_enable()
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            for i, model in enumerate(models):
+                if isinstance(unwrap_model(model), SD3Transformer2DModel):
+                    unwrap_model(model).save_pretrained(os.path.join(output_dir, "transformer"))
+                elif isinstance(unwrap_model(model), (CLIPTextModelWithProjection, T5EncoderModel)):
+                    if isinstance(unwrap_model(model), CLIPTextModelWithProjection):
+                        hidden_size = unwrap_model(model).config.hidden_size
+                        if hidden_size == 768:
+                            unwrap_model(model).save_pretrained(os.path.join(output_dir, "text_encoder"))
+                        elif hidden_size == 1280:
+                            unwrap_model(model).save_pretrained(os.path.join(output_dir, "text_encoder_2"))
+                    else:
+                        unwrap_model(model).save_pretrained(os.path.join(output_dir, "text_encoder_3"))
+                else:
+                    raise ValueError(f"Wrong model supplied: {type(model)=}.")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+    def load_model_hook(models, input_dir):
+        for _ in range(len(models)):
+            # pop models so that they are not loaded again
+            model = models.pop()
+
+            # load diffusers style into model
+            if isinstance(unwrap_model(model), SD3Transformer2DModel):
+                load_model = SD3Transformer2DModel.from_pretrained(input_dir, subfolder="transformer")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+            elif isinstance(unwrap_model(model), (CLIPTextModelWithProjection, T5EncoderModel)):
+                try:
+                    load_model = CLIPTextModelWithProjection.from_pretrained(input_dir, subfolder="text_encoder")
+                    model(**load_model.config)
+                    model.load_state_dict(load_model.state_dict())
+                except Exception:
+                    try:
+                        load_model = CLIPTextModelWithProjection.from_pretrained(input_dir, subfolder="text_encoder_2")
+                        model(**load_model.config)
+                        model.load_state_dict(load_model.state_dict())
+                    except Exception:
+                        try:
+                            load_model = T5EncoderModel.from_pretrained(input_dir, subfolder="text_encoder_3")
+                            model(**load_model.config)
+                            model.load_state_dict(load_model.state_dict())
+                        except Exception:
+                            raise ValueError(f"Couldn't load the model of type: ({type(model)}).")
+            else:
+                raise ValueError(f"Unsupported model found: {type(model)=}")
+
+            del load_model
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Optimization parameters
+    transformer_parameters_with_lr = {"params": transformer.parameters(), "lr": args.learning_rate}
+    if args.train_text_encoder:
+        # different learning rate for text encoder and unet
+        text_parameters_one_with_lr = {
+            "params": text_encoder_one.parameters(),
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        text_parameters_two_with_lr = {
+            "params": text_encoder_two.parameters(),
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        text_parameters_three_with_lr = {
+            "params": text_encoder_three.parameters(),
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        params_to_optimize = [
+            transformer_parameters_with_lr,
+            text_parameters_one_with_lr,
+            text_parameters_two_with_lr,
+            text_parameters_three_with_lr,
+        ]
+    else:
+        params_to_optimize = [transformer_parameters_with_lr]
+
+    # Optimizer creation
+    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    if args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+        if args.train_text_encoder and args.text_encoder_lr:
+            logger.warning(
+                f"Learning rates were provided both for the transformer and the text encoder- e.g. text_encoder_lr:"
+                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
+                f"When using prodigy only learning_rate is used as the initial learning rate."
+            )
+            # changes the learning rate of text_encoder_parameters_one and text_encoder_parameters_two to be
+            # --learning_rate
+            params_to_optimize[1]["lr"] = args.learning_rate
+            params_to_optimize[2]["lr"] = args.learning_rate
+            params_to_optimize[3]["lr"] = args.learning_rate
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_prompt=args.class_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_num=args.num_class_images,
+        size=args.resolution,
+        repeats=args.repeats,
+        center_crop=args.center_crop,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    if not args.train_text_encoder:
+        tokenizers = [tokenizer_one, tokenizer_two, tokenizer_three]
+        text_encoders = [text_encoder_one, text_encoder_two, text_encoder_three]
+
+        def compute_text_embeddings(prompt, text_encoders, tokenizers):
+            with torch.no_grad():
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                    text_encoders, tokenizers, prompt, args.max_sequence_length
+                )
+                prompt_embeds = prompt_embeds.to(accelerator.device)
+                pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
+            return prompt_embeds, pooled_prompt_embeds
+
+    # If no type of tuning is done on the text_encoder and custom instance prompts are NOT
+    # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
+    # the redundant encoding.
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        instance_prompt_hidden_states, instance_pooled_prompt_embeds = compute_text_embeddings(
+            args.instance_prompt, text_encoders, tokenizers
+        )
+
+    # Handle class prompt for prior-preservation.
+    if args.with_prior_preservation:
+        if not args.train_text_encoder:
+            class_prompt_hidden_states, class_pooled_prompt_embeds = compute_text_embeddings(
+                args.class_prompt, text_encoders, tokenizers
+            )
+
+    # Clear the memory here
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        del tokenizers, text_encoders
+        # Explicitly delete the objects as well, otherwise only the lists are deleted and the original references remain, preventing garbage collection
+        del text_encoder_one, text_encoder_two, text_encoder_three
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
+    # pack the statically computed variables appropriately here. This is so that we don't
+    # have to pass them to the dataloader.
+
+    if not train_dataset.custom_instance_prompts:
+        if not args.train_text_encoder:
+            prompt_embeds = instance_prompt_hidden_states
+            pooled_prompt_embeds = instance_pooled_prompt_embeds
+            if args.with_prior_preservation:
+                prompt_embeds = torch.cat([prompt_embeds, class_prompt_hidden_states], dim=0)
+                pooled_prompt_embeds = torch.cat([pooled_prompt_embeds, class_pooled_prompt_embeds], dim=0)
+        # if we're optimizing the text encoder (both if instance prompt is used for all images or custom prompts) we need to tokenize and encode the
+        # batch prompts on all training steps
+        else:
+            tokens_one = tokenize_prompt(tokenizer_one, args.instance_prompt)
+            tokens_two = tokenize_prompt(tokenizer_two, args.instance_prompt)
+            tokens_three = tokenize_prompt(tokenizer_three, args.instance_prompt)
+            if args.with_prior_preservation:
+                class_tokens_one = tokenize_prompt(tokenizer_one, args.class_prompt)
+                class_tokens_two = tokenize_prompt(tokenizer_two, args.class_prompt)
+                class_tokens_three = tokenize_prompt(tokenizer_three, args.class_prompt)
+                tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0)
+                tokens_two = torch.cat([tokens_two, class_tokens_two], dim=0)
+                tokens_three = torch.cat([tokens_three, class_tokens_three], dim=0)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        (
+            transformer,
+            text_encoder_one,
+            text_encoder_two,
+            text_encoder_three,
+            optimizer,
+            train_dataloader,
+            lr_scheduler,
+        ) = accelerator.prepare(
+            transformer,
+            text_encoder_one,
+            text_encoder_two,
+            text_encoder_three,
+            optimizer,
+            train_dataloader,
+            lr_scheduler,
+        )
+    else:
+        transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            transformer, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = "dreambooth-sd3"
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        transformer.train()
+        if args.train_text_encoder:
+            text_encoder_one.train()
+            text_encoder_two.train()
+            text_encoder_three.train()
+
+        for step, batch in enumerate(train_dataloader):
+            models_to_accumulate = [transformer]
+            if args.train_text_encoder:
+                models_to_accumulate.extend([text_encoder_one, text_encoder_two, text_encoder_three])
+            with accelerator.accumulate(models_to_accumulate):
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                prompts = batch["prompts"]
+
+                # encode batch prompts when custom prompts are provided for each image -
+                if train_dataset.custom_instance_prompts:
+                    if not args.train_text_encoder:
+                        prompt_embeds, pooled_prompt_embeds = compute_text_embeddings(
+                            prompts, text_encoders, tokenizers
+                        )
+                    else:
+                        tokens_one = tokenize_prompt(tokenizer_one, prompts)
+                        tokens_two = tokenize_prompt(tokenizer_two, prompts)
+                        tokens_three = tokenize_prompt(tokenizer_three, prompts)
+
+                # Convert images to latent space
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+                model_input = (model_input - vae.config.shift_factor) * vae.config.scaling_factor
+                model_input = model_input.to(dtype=weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+
+                # Sample a random timestep for each image
+                # for weighting schemes where we sample timesteps non-uniformly
+                u = compute_density_for_timestep_sampling(
+                    weighting_scheme=args.weighting_scheme,
+                    batch_size=bsz,
+                    logit_mean=args.logit_mean,
+                    logit_std=args.logit_std,
+                    mode_scale=args.mode_scale,
+                )
+                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
+                timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+
+                # Add noise according to flow matching.
+                # zt = (1 - texp) * x + texp * z1
+                sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
+                noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
+
+                # Predict the noise residual
+                if not args.train_text_encoder:
+                    model_pred = transformer(
+                        hidden_states=noisy_model_input,
+                        timestep=timesteps,
+                        encoder_hidden_states=prompt_embeds,
+                        pooled_projections=pooled_prompt_embeds,
+                        return_dict=False,
+                    )[0]
+                else:
+                    prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                        text_encoders=[text_encoder_one, text_encoder_two, text_encoder_three],
+                        tokenizers=None,
+                        prompt=None,
+                        text_input_ids_list=[tokens_one, tokens_two, tokens_three],
+                    )
+                    model_pred = transformer(
+                        hidden_states=noisy_model_input,
+                        timestep=timesteps,
+                        encoder_hidden_states=prompt_embeds,
+                        pooled_projections=pooled_prompt_embeds,
+                        return_dict=False,
+                    )[0]
+
+                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                # Preconditioning of the model outputs.
+                if args.precondition_outputs:
+                    model_pred = model_pred * (-sigmas) + noisy_model_input
+
+                # these weighting schemes use a uniform timestep sampling
+                # and instead post-weight the loss
+                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+
+                # flow matching loss
+                if args.precondition_outputs:
+                    target = model_input
+                else:
+                    target = noise - model_input
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute prior loss
+                    prior_loss = torch.mean(
+                        (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
+                            target_prior.shape[0], -1
+                        ),
+                        1,
+                    )
+                    prior_loss = prior_loss.mean()
+
+                # Compute regular loss.
+                loss = torch.mean(
+                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
+                )
+                loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(
+                            transformer.parameters(),
+                            text_encoder_one.parameters(),
+                            text_encoder_two.parameters(),
+                            text_encoder_three.parameters(),
+                        )
+                        if args.train_text_encoder
+                        else transformer.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                if not args.train_text_encoder:
+                    text_encoder_one, text_encoder_two, text_encoder_three = load_text_encoders(
+                        text_encoder_cls_one, text_encoder_cls_two, text_encoder_cls_three
+                    )
+                pipeline = StableDiffusion3Pipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    text_encoder=accelerator.unwrap_model(text_encoder_one),
+                    text_encoder_2=accelerator.unwrap_model(text_encoder_two),
+                    text_encoder_3=accelerator.unwrap_model(text_encoder_three),
+                    transformer=accelerator.unwrap_model(transformer),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline_args = {"prompt": args.validation_prompt}
+                images = log_validation(
+                    pipeline=pipeline,
+                    args=args,
+                    accelerator=accelerator,
+                    pipeline_args=pipeline_args,
+                    epoch=epoch,
+                    torch_dtype=weight_dtype,
+                )
+                if not args.train_text_encoder:
+                    del text_encoder_one, text_encoder_two, text_encoder_three
+                    torch.cuda.empty_cache()
+                    gc.collect()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        transformer = unwrap_model(transformer)
+
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            text_encoder_two = unwrap_model(text_encoder_two)
+            text_encoder_three = unwrap_model(text_encoder_three)
+            pipeline = StableDiffusion3Pipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                transformer=transformer,
+                text_encoder=text_encoder_one,
+                text_encoder_2=text_encoder_two,
+                text_encoder_3=text_encoder_three,
+            )
+        else:
+            pipeline = StableDiffusion3Pipeline.from_pretrained(
+                args.pretrained_model_name_or_path, transformer=transformer
+            )
+
+        # save the pipeline
+        pipeline.save_pretrained(args.output_dir)
+
+        # Final inference
+        # Load previous pipeline
+        pipeline = StableDiffusion3Pipeline.from_pretrained(
+            args.output_dir,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline_args = {"prompt": args.validation_prompt}
+            images = log_validation(
+                pipeline=pipeline,
+                args=args,
+                accelerator=accelerator,
+                pipeline_args=pipeline_args,
+                epoch=epoch,
+                is_final_validation=True,
+                torch_dtype=weight_dtype,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                train_text_encoder=args.train_text_encoder,
+                instance_prompt=args.instance_prompt,
+                validation_prompt=args.validation_prompt,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/inference/README.md b/diffusers/examples/inference/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..52d66be8e228d312f1d079e6c8123448b6fa86fd
--- /dev/null
+++ b/diffusers/examples/inference/README.md
@@ -0,0 +1,8 @@
+# Inference Examples
+
+**The inference examples folder is deprecated and will be removed in a future version**.
+**Officially supported inference examples can be found in the [Pipelines folder](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines)**.
+
+- For `Image-to-Image text-guided generation with Stable Diffusion`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
+- For `In-painting using Stable Diffusion`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
+- For `Tweak prompts reusing seeds and latents`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
diff --git a/diffusers/examples/inference/image_to_image.py b/diffusers/examples/inference/image_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..86b46c4e606e039cb2ad80b341b2685694f883b4
--- /dev/null
+++ b/diffusers/examples/inference/image_to_image.py
@@ -0,0 +1,9 @@
+import warnings
+
+from diffusers import StableDiffusionImg2ImgPipeline  # noqa F401
+
+
+warnings.warn(
+    "The `image_to_image.py` script is outdated. Please use directly `from diffusers import"
+    " StableDiffusionImg2ImgPipeline` instead."
+)
diff --git a/diffusers/examples/inference/inpainting.py b/diffusers/examples/inference/inpainting.py
new file mode 100644
index 0000000000000000000000000000000000000000..8aad208ff34eb4d4ba1c6acfdfe0f97ac9afc4bc
--- /dev/null
+++ b/diffusers/examples/inference/inpainting.py
@@ -0,0 +1,9 @@
+import warnings
+
+from diffusers import StableDiffusionInpaintPipeline as StableDiffusionInpaintPipeline  # noqa F401
+
+
+warnings.warn(
+    "The `inpainting.py` script is outdated. Please use directly `from diffusers import"
+    " StableDiffusionInpaintPipeline` instead."
+)
diff --git a/diffusers/examples/instruct_pix2pix/README.md b/diffusers/examples/instruct_pix2pix/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a5e91a5356257ad9e41c23e8acc49b097e2aee2e
--- /dev/null
+++ b/diffusers/examples/instruct_pix2pix/README.md
@@ -0,0 +1,196 @@
+# InstructPix2Pix training example
+
+[InstructPix2Pix](https://arxiv.org/abs/2211.09800) is a method to fine-tune text-conditioned diffusion models such that they can follow an edit instruction for an input image. Models fine-tuned using this method take the following as inputs:
+
+<p align="center">
+    <img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/evaluation_diffusion_models/edit-instruction.png" alt="instructpix2pix-inputs" width=600/>
+</p>
+
+The output is an "edited" image that reflects the edit instruction applied on the input image:
+
+<p align="center">
+    <img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/output-gs%407-igs%401-steps%4050.png" alt="instructpix2pix-output" width=600/>
+</p>
+
+The `train_instruct_pix2pix.py` script shows how to implement the training procedure and adapt it for Stable Diffusion.
+
+***Disclaimer: Even though `train_instruct_pix2pix.py` implements the InstructPix2Pix
+training procedure while being faithful to the [original implementation](https://github.com/timothybrooks/instruct-pix2pix) we have only tested it on a [small-scale dataset](https://huggingface.co/datasets/fusing/instructpix2pix-1000-samples). This can impact the end results. For better results, we recommend longer training runs with a larger dataset. [Here](https://huggingface.co/datasets/timbrooks/instructpix2pix-clip-filtered) you can find a large dataset for InstructPix2Pix training.***
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the example folder and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+### Toy example
+
+As mentioned before, we'll use a [small toy dataset](https://huggingface.co/datasets/fusing/instructpix2pix-1000-samples) for training. The dataset
+is a smaller version of the [original dataset](https://huggingface.co/datasets/timbrooks/instructpix2pix-clip-filtered) used in the InstructPix2Pix paper.
+
+Configure environment variables such as the dataset identifier and the Stable Diffusion
+checkpoint:
+
+```bash
+export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export DATASET_ID="fusing/instructpix2pix-1000-samples"
+```
+
+Now, we can launch training:
+
+```bash
+accelerate launch --mixed_precision="fp16" train_instruct_pix2pix.py \
+    --pretrained_model_name_or_path=$MODEL_NAME \
+    --dataset_name=$DATASET_ID \
+    --enable_xformers_memory_efficient_attention \
+    --resolution=256 --random_flip \
+    --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \
+    --max_train_steps=15000 \
+    --checkpointing_steps=5000 --checkpoints_total_limit=1 \
+    --learning_rate=5e-05 --max_grad_norm=1 --lr_warmup_steps=0 \
+    --conditioning_dropout_prob=0.05 \
+    --mixed_precision=fp16 \
+    --seed=42 \
+    --push_to_hub
+```
+
+Additionally, we support performing validation inference to monitor training progress
+with Weights and Biases. You can enable this feature with `report_to="wandb"`:
+
+```bash
+accelerate launch --mixed_precision="fp16" train_instruct_pix2pix.py \
+    --pretrained_model_name_or_path=$MODEL_NAME \
+    --dataset_name=$DATASET_ID \
+    --enable_xformers_memory_efficient_attention \
+    --resolution=256 --random_flip \
+    --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \
+    --max_train_steps=15000 \
+    --checkpointing_steps=5000 --checkpoints_total_limit=1 \
+    --learning_rate=5e-05 --max_grad_norm=1 --lr_warmup_steps=0 \
+    --conditioning_dropout_prob=0.05 \
+    --mixed_precision=fp16 \
+    --val_image_url="https://hf.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png" \
+    --validation_prompt="make the mountains snowy" \
+    --seed=42 \
+    --report_to=wandb \
+    --push_to_hub
+ ```
+
+ We recommend this type of validation as it can be useful for model debugging. Note that you need `wandb` installed to use this. You can install `wandb` by running `pip install wandb`.
+
+ [Here](https://wandb.ai/sayakpaul/instruct-pix2pix/runs/ctr3kovq), you can find an example training run that includes some validation samples and the training hyperparameters.
+
+ ***Note: In the original paper, the authors observed that even when the model is trained with an image resolution of 256x256, it generalizes well to bigger resolutions such as 512x512. This is likely because of the larger dataset they used during training.***
+
+ ## Training with multiple GPUs
+
+`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch)
+for running distributed training with `accelerate`. Here is an example command:
+
+```bash
+accelerate launch --mixed_precision="fp16" --multi_gpu train_instruct_pix2pix.py \
+ --pretrained_model_name_or_path=stable-diffusion-v1-5/stable-diffusion-v1-5 \
+ --dataset_name=sayakpaul/instructpix2pix-1000-samples \
+ --use_ema \
+ --enable_xformers_memory_efficient_attention \
+ --resolution=512 --random_flip \
+ --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \
+ --max_train_steps=15000 \
+ --checkpointing_steps=5000 --checkpoints_total_limit=1 \
+ --learning_rate=5e-05 --lr_warmup_steps=0 \
+ --conditioning_dropout_prob=0.05 \
+ --mixed_precision=fp16 \
+ --seed=42 \
+ --push_to_hub
+```
+
+ ## Inference
+
+ Once training is complete, we can perform inference:
+
+ ```python
+import PIL
+import requests
+import torch
+from diffusers import StableDiffusionInstructPix2PixPipeline
+
+model_id = "your_model_id" # <- replace this
+pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+generator = torch.Generator("cuda").manual_seed(0)
+
+url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/test_pix2pix_4.png"
+
+
+def download_image(url):
+    image = PIL.Image.open(requests.get(url, stream=True).raw)
+    image = PIL.ImageOps.exif_transpose(image)
+    image = image.convert("RGB")
+    return image
+
+image = download_image(url)
+prompt = "wipe out the lake"
+num_inference_steps = 20
+image_guidance_scale = 1.5
+guidance_scale = 10
+
+edited_image = pipe(prompt,
+    image=image,
+    num_inference_steps=num_inference_steps,
+    image_guidance_scale=image_guidance_scale,
+    guidance_scale=guidance_scale,
+    generator=generator,
+).images[0]
+edited_image.save("edited_image.png")
+```
+
+An example model repo obtained using this training script can be found
+here - [sayakpaul/instruct-pix2pix](https://huggingface.co/sayakpaul/instruct-pix2pix).
+
+We encourage you to play with the following three parameters to control
+speed and quality during performance:
+
+* `num_inference_steps`
+* `image_guidance_scale`
+* `guidance_scale`
+
+Particularly, `image_guidance_scale` and `guidance_scale` can have a profound impact
+on the generated ("edited") image (see [here](https://twitter.com/RisingSayak/status/1628392199196151808?s=20) for an example).
+
+If you're looking for some interesting ways to use the InstructPix2Pix training methodology, we welcome you to check out this blog post: [Instruction-tuning Stable Diffusion with InstructPix2Pix](https://huggingface.co/blog/instruction-tuning-sd).
+
+## Stable Diffusion XL
+
+There's an equivalent `train_instruct_pix2pix_sdxl.py` script for [Stable Diffusion XL](https://huggingface.co/papers/2307.01952). Please refer to the docs [here](./README_sdxl.md) to learn more.
diff --git a/diffusers/examples/instruct_pix2pix/README_sdxl.md b/diffusers/examples/instruct_pix2pix/README_sdxl.md
new file mode 100644
index 0000000000000000000000000000000000000000..dcf828e80ba8c163b444b1b4d7b819c1e25dacc1
--- /dev/null
+++ b/diffusers/examples/instruct_pix2pix/README_sdxl.md
@@ -0,0 +1,197 @@
+# InstructPix2Pix SDXL training example
+
+***This is based on the original InstructPix2Pix training example.***
+
+[Stable Diffusion XL](https://huggingface.co/papers/2307.01952) (or SDXL) is the latest image generation model that is tailored towards more photorealistic outputs with more detailed imagery and composition compared to previous SD models. It leverages a three times larger UNet backbone. The increase of model parameters is mainly due to more attention blocks and a larger cross-attention context as SDXL uses a second text encoder.
+
+The `train_instruct_pix2pix_sdxl.py` script shows how to implement the training procedure and adapt it for Stable Diffusion XL.
+
+***Disclaimer: Even though `train_instruct_pix2pix_sdxl.py` implements the InstructPix2Pix
+training procedure while being faithful to the [original implementation](https://github.com/timothybrooks/instruct-pix2pix) we have only tested it on a [small-scale dataset](https://huggingface.co/datasets/fusing/instructpix2pix-1000-samples). This can impact the end results. For better results, we recommend longer training runs with a larger dataset. [Here](https://huggingface.co/datasets/timbrooks/instructpix2pix-clip-filtered) you can find a large dataset for InstructPix2Pix training.***
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Refer to the original InstructPix2Pix training example for installing the dependencies.
+
+You will also need to get access of SDXL by filling the [form](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0).
+
+### Toy example
+
+As mentioned before, we'll use a [small toy dataset](https://huggingface.co/datasets/fusing/instructpix2pix-1000-samples) for training. The dataset
+is a smaller version of the [original dataset](https://huggingface.co/datasets/timbrooks/instructpix2pix-clip-filtered) used in the InstructPix2Pix paper.
+
+Configure environment variables such as the dataset identifier and the Stable Diffusion
+checkpoint:
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export DATASET_ID="fusing/instructpix2pix-1000-samples"
+```
+
+Now, we can launch training:
+
+```bash
+accelerate launch train_instruct_pix2pix_sdxl.py \
+    --pretrained_model_name_or_path=$MODEL_NAME \
+    --dataset_name=$DATASET_ID \
+    --enable_xformers_memory_efficient_attention \
+    --resolution=256 --random_flip \
+    --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \
+    --max_train_steps=15000 \
+    --checkpointing_steps=5000 --checkpoints_total_limit=1 \
+    --learning_rate=5e-05 --max_grad_norm=1 --lr_warmup_steps=0 \
+    --conditioning_dropout_prob=0.05 \
+    --seed=42 \
+    --push_to_hub
+```
+
+Additionally, we support performing validation inference to monitor training progress
+with Weights and Biases. You can enable this feature with `report_to="wandb"`:
+
+```bash
+accelerate launch train_instruct_pix2pix_sdxl.py \
+    --pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0 \
+    --dataset_name=$DATASET_ID \
+    --use_ema \
+    --enable_xformers_memory_efficient_attention \
+    --resolution=512 --random_flip \
+    --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \
+    --max_train_steps=15000 \
+    --checkpointing_steps=5000 --checkpoints_total_limit=1 \
+    --learning_rate=5e-05 --lr_warmup_steps=0 \
+    --conditioning_dropout_prob=0.05 \
+    --seed=42 \
+    --val_image_url_or_path="https://datasets-server.huggingface.co/assets/fusing/instructpix2pix-1000-samples/--/fusing--instructpix2pix-1000-samples/train/23/input_image/image.jpg" \
+    --validation_prompt="make it in japan" \
+    --report_to=wandb \
+    --push_to_hub
+ ```
+
+ We recommend this type of validation as it can be useful for model debugging. Note that you need `wandb` installed to use this. You can install `wandb` by running `pip install wandb`.
+
+ [Here](https://wandb.ai/sayakpaul/instruct-pix2pix-sdxl-new/runs/sw53gxmc), you can find an example training run that includes some validation samples and the training hyperparameters.
+
+ ***Note: In the original paper, the authors observed that even when the model is trained with an image resolution of 256x256, it generalizes well to bigger resolutions such as 512x512. This is likely because of the larger dataset they used during training.***
+
+ ## Training with multiple GPUs
+
+`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch)
+for running distributed training with `accelerate`. Here is an example command:
+
+```bash
+accelerate launch --mixed_precision="fp16" --multi_gpu train_instruct_pix2pix_sdxl.py \
+    --pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0 \
+    --dataset_name=$DATASET_ID \
+    --use_ema \
+    --enable_xformers_memory_efficient_attention \
+    --resolution=512 --random_flip \
+    --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \
+    --max_train_steps=15000 \
+    --checkpointing_steps=5000 --checkpoints_total_limit=1 \
+    --learning_rate=5e-05 --lr_warmup_steps=0 \
+    --conditioning_dropout_prob=0.05 \
+    --seed=42 \
+    --val_image_url_or_path="https://datasets-server.huggingface.co/assets/fusing/instructpix2pix-1000-samples/--/fusing--instructpix2pix-1000-samples/train/23/input_image/image.jpg" \
+    --validation_prompt="make it in japan" \
+    --report_to=wandb \
+    --push_to_hub
+```
+
+ ## Inference
+
+ Once training is complete, we can perform inference:
+
+ ```python
+import PIL
+import requests
+import torch
+from diffusers import StableDiffusionXLInstructPix2PixPipeline
+
+model_id = "your_model_id" # <- replace this
+pipe = StableDiffusionXLInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+generator = torch.Generator("cuda").manual_seed(0)
+
+url = "https://datasets-server.huggingface.co/assets/fusing/instructpix2pix-1000-samples/--/fusing--instructpix2pix-1000-samples/train/23/input_image/image.jpg"
+
+
+def download_image(url):
+    image = PIL.Image.open(requests.get(url, stream=True).raw)
+    image = PIL.ImageOps.exif_transpose(image)
+    image = image.convert("RGB")
+    return image
+
+image = download_image(url)
+prompt = "make it Japan"
+num_inference_steps = 20
+image_guidance_scale = 1.5
+guidance_scale = 10
+
+edited_image = pipe(prompt,
+    image=image,
+    num_inference_steps=num_inference_steps,
+    image_guidance_scale=image_guidance_scale,
+    guidance_scale=guidance_scale,
+    generator=generator,
+).images[0]
+edited_image.save("edited_image.png")
+```
+
+We encourage you to play with the following three parameters to control
+speed and quality during performance:
+
+* `num_inference_steps`
+* `image_guidance_scale`
+* `guidance_scale`
+
+Particularly, `image_guidance_scale` and `guidance_scale` can have a profound impact
+on the generated ("edited") image (see [here](https://twitter.com/RisingSayak/status/1628392199196151808?s=20) for an example).
+
+If you're looking for some interesting ways to use the InstructPix2Pix training methodology, we welcome you to check out this blog post: [Instruction-tuning Stable Diffusion with InstructPix2Pix](https://huggingface.co/blog/instruction-tuning-sd).
+
+## Compare between SD and SDXL
+
+We aim to understand the differences resulting from the use of SD-1.5 and SDXL-0.9 as pretrained models. To achieve this, we trained on the [small toy dataset](https://huggingface.co/datasets/fusing/instructpix2pix-1000-samples) using both of these pretrained models. The training script is as follows:
+
+```bash
+export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5" or "stabilityai/stable-diffusion-xl-base-0.9"
+export DATASET_ID="fusing/instructpix2pix-1000-samples"
+
+accelerate launch train_instruct_pix2pix.py \
+    --pretrained_model_name_or_path=$MODEL_NAME \
+    --dataset_name=$DATASET_ID \
+    --use_ema \
+    --enable_xformers_memory_efficient_attention \
+    --resolution=512 --random_flip \
+    --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \
+    --max_train_steps=15000 \
+    --checkpointing_steps=5000 --checkpoints_total_limit=1 \
+    --learning_rate=5e-05 --lr_warmup_steps=0 \
+    --conditioning_dropout_prob=0.05 \
+    --seed=42 \
+    --val_image_url="https://datasets-server.huggingface.co/assets/fusing/instructpix2pix-1000-samples/--/fusing--instructpix2pix-1000-samples/train/23/input_image/image.jpg" \
+    --validation_prompt="make it in Japan" \
+    --report_to=wandb \
+    --push_to_hub
+```
+
+We discovered that compared to training with SD-1.5 as the pretrained model, SDXL-0.9 results in a lower training loss value (SD-1.5 yields 0.0599, SDXL scores 0.0254). Moreover, from a visual perspective, the results obtained using SDXL demonstrated fewer artifacts and a richer detail. Notably, SDXL starts to preserve the structure of the original image earlier on.
+
+The following two GIFs provide intuitive visual results. We observed, for each step, what kind of results could be achieved using the image
+<p align="center">
+    <img src="https://datasets-server.huggingface.co/assets/fusing/instructpix2pix-1000-samples/--/fusing--instructpix2pix-1000-samples/train/23/input_image/image.jpg" alt="input for make it Japan" width=600/>
+</p>
+with "make it in Japan” as the prompt. It can be seen that SDXL starts preserving the details of the original image earlier, resulting in higher fidelity outcomes sooner.
+
+* SD-1.5: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_ip2p_training_val_img_progress.gif
+
+<p align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_ip2p_training_val_img_progress.gif" alt="input for make it Japan" width=600/>
+</p>
+
+* SDXL: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_ip2p_training_val_img_progress.gif
+
+<p align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_ip2p_training_val_img_progress.gif" alt="input for make it Japan" width=600/>
+</p>
diff --git a/diffusers/examples/instruct_pix2pix/requirements.txt b/diffusers/examples/instruct_pix2pix/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e18cc9e4215eaa760c8d29c946396dba9ff2c9ac
--- /dev/null
+++ b/diffusers/examples/instruct_pix2pix/requirements.txt
@@ -0,0 +1,6 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+datasets
+ftfy
+tensorboard
\ No newline at end of file
diff --git a/diffusers/examples/instruct_pix2pix/test_instruct_pix2pix.py b/diffusers/examples/instruct_pix2pix/test_instruct_pix2pix.py
new file mode 100644
index 0000000000000000000000000000000000000000..f72f95db6afd46716d8a6e1c910879ca354e042c
--- /dev/null
+++ b/diffusers/examples/instruct_pix2pix/test_instruct_pix2pix.py
@@ -0,0 +1,101 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class InstructPix2Pix(ExamplesTestsAccelerate):
+    def test_instruct_pix2pix_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/instruct_pix2pix/train_instruct_pix2pix.py
+                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+                --dataset_name=hf-internal-testing/instructpix2pix-10-samples
+                --resolution=64
+                --random_flip
+                --train_batch_size=1
+                --max_train_steps=6
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                --output_dir {tmpdir}
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_instruct_pix2pix_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/instruct_pix2pix/train_instruct_pix2pix.py
+                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+                --dataset_name=hf-internal-testing/instructpix2pix-10-samples
+                --resolution=64
+                --random_flip
+                --train_batch_size=1
+                --max_train_steps=4
+                --checkpointing_steps=2
+                --output_dir {tmpdir}
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            resume_run_args = f"""
+                examples/instruct_pix2pix/train_instruct_pix2pix.py
+                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+                --dataset_name=hf-internal-testing/instructpix2pix-10-samples
+                --resolution=64
+                --random_flip
+                --train_batch_size=1
+                --max_train_steps=8
+                --checkpointing_steps=2
+                --output_dir {tmpdir}
+                --seed=0
+                --resume_from_checkpoint=checkpoint-4
+                --checkpoints_total_limit=2
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-6", "checkpoint-8"},
+            )
diff --git a/diffusers/examples/instruct_pix2pix/train_instruct_pix2pix.py b/diffusers/examples/instruct_pix2pix/train_instruct_pix2pix.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5b7eaac4a1fadbec03d84822121f6fcfb6f4646
--- /dev/null
+++ b/diffusers/examples/instruct_pix2pix/train_instruct_pix2pix.py
@@ -0,0 +1,1031 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script to fine-tune Stable Diffusion for InstructPix2Pix."""
+
+import argparse
+import logging
+import math
+import os
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import PIL
+import requests
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionInstructPix2PixPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel
+from diffusers.utils import check_min_version, deprecate, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+DATASET_NAME_MAPPING = {
+    "fusing/instructpix2pix-1000-samples": ("input_image", "edit_prompt", "edited_image"),
+}
+WANDB_TABLE_COL_NAMES = ["original_image", "edited_image", "edit_prompt"]
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    generator,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    original_image = download_image(args.val_image_url)
+    edited_images = []
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    with autocast_ctx:
+        for _ in range(args.num_validation_images):
+            edited_images.append(
+                pipeline(
+                    args.validation_prompt,
+                    image=original_image,
+                    num_inference_steps=20,
+                    image_guidance_scale=1.5,
+                    guidance_scale=7,
+                    generator=generator,
+                ).images[0]
+            )
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "wandb":
+            wandb_table = wandb.Table(columns=WANDB_TABLE_COL_NAMES)
+            for edited_image in edited_images:
+                wandb_table.add_data(wandb.Image(original_image), wandb.Image(edited_image), args.validation_prompt)
+            tracker.log({"validation": wandb_table})
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script for InstructPix2Pix.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--original_image_column",
+        type=str,
+        default="input_image",
+        help="The column of the dataset containing the original image on which edits where made.",
+    )
+    parser.add_argument(
+        "--edited_image_column",
+        type=str,
+        default="edited_image",
+        help="The column of the dataset containing the edited image.",
+    )
+    parser.add_argument(
+        "--edit_prompt_column",
+        type=str,
+        default="edit_prompt",
+        help="The column of the dataset containing the edit instruction.",
+    )
+    parser.add_argument(
+        "--val_image_url",
+        type=str,
+        default=None,
+        help="URL to the original image that you would like to edit (used during inference for debugging purposes).",
+    )
+    parser.add_argument(
+        "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="instruct-pix2pix-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=256,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--conditioning_dropout_prob",
+        type=float,
+        default=None,
+        help="Conditioning dropout probability. Drops out the conditionings (image and edit prompt) used in training InstructPix2Pix. See section 3.2.1 in the paper: https://arxiv.org/abs/2211.09800.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--non_ema_revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or"
+            " remote repository specified with --pretrained_model_name_or_path."
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    # default to using the same revision for the non-ema model if not specified
+    if args.non_ema_revision is None:
+        args.non_ema_revision = args.revision
+
+    return args
+
+
+def convert_to_np(image, resolution):
+    image = image.convert("RGB").resize((resolution, resolution))
+    return np.array(image).transpose(2, 0, 1)
+
+
+def download_image(url):
+    image = PIL.Image.open(requests.get(url, stream=True).raw)
+    image = PIL.ImageOps.exif_transpose(image)
+    image = image.convert("RGB")
+    return image
+
+
+def main():
+    args = parse_args()
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if args.non_ema_revision is not None:
+        deprecate(
+            "non_ema_revision!=None",
+            "0.15.0",
+            message=(
+                "Downloading 'non_ema' weights from revision branches of the Hub is deprecated. Please make sure to"
+                " use `--variant=non_ema` instead."
+            ),
+        )
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load scheduler, tokenizer and models.
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+    )
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.non_ema_revision
+    )
+
+    # InstructPix2Pix uses an additional image for conditioning. To accommodate that,
+    # it uses 8 channels (instead of 4) in the first (conv) layer of the UNet. This UNet is
+    # then fine-tuned on the custom InstructPix2Pix dataset. This modified UNet is initialized
+    # from the pre-trained checkpoints. For the extra channels added to the first layer, they are
+    # initialized to zero.
+    logger.info("Initializing the InstructPix2Pix UNet from the pretrained UNet.")
+    in_channels = 8
+    out_channels = unet.conv_in.out_channels
+    unet.register_to_config(in_channels=in_channels)
+
+    with torch.no_grad():
+        new_conv_in = nn.Conv2d(
+            in_channels, out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding
+        )
+        new_conv_in.weight.zero_()
+        new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
+        unet.conv_in = new_conv_in
+
+    # Freeze vae and text_encoder
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+
+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = EMAModel(unet.parameters(), model_cls=UNet2DConditionModel, model_config=unet.config)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    if weights:
+                        weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel)
+                ema_unet.load_state_dict(load_model.state_dict())
+                ema_unet.to(accelerator.device)
+                del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/main/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.original_image_column is None:
+        original_image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        original_image_column = args.original_image_column
+        if original_image_column not in column_names:
+            raise ValueError(
+                f"--original_image_column' value '{args.original_image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.edit_prompt_column is None:
+        edit_prompt_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        edit_prompt_column = args.edit_prompt_column
+        if edit_prompt_column not in column_names:
+            raise ValueError(
+                f"--edit_prompt_column' value '{args.edit_prompt_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.edited_image_column is None:
+        edited_image_column = dataset_columns[2] if dataset_columns is not None else column_names[2]
+    else:
+        edited_image_column = args.edited_image_column
+        if edited_image_column not in column_names:
+            raise ValueError(
+                f"--edited_image_column' value '{args.edited_image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(captions):
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids
+
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+        ]
+    )
+
+    def preprocess_images(examples):
+        original_images = np.concatenate(
+            [convert_to_np(image, args.resolution) for image in examples[original_image_column]]
+        )
+        edited_images = np.concatenate(
+            [convert_to_np(image, args.resolution) for image in examples[edited_image_column]]
+        )
+        # We need to ensure that the original and the edited images undergo the same
+        # augmentation transforms.
+        images = np.concatenate([original_images, edited_images])
+        images = torch.tensor(images)
+        images = 2 * (images / 255) - 1
+        return train_transforms(images)
+
+    def preprocess_train(examples):
+        # Preprocess images.
+        preprocessed_images = preprocess_images(examples)
+        # Since the original and edited images were concatenated before
+        # applying the transformations, we need to separate them and reshape
+        # them accordingly.
+        original_images, edited_images = preprocessed_images.chunk(2)
+        original_images = original_images.reshape(-1, 3, args.resolution, args.resolution)
+        edited_images = edited_images.reshape(-1, 3, args.resolution, args.resolution)
+
+        # Collate the preprocessed images into the `examples`.
+        examples["original_pixel_values"] = original_images
+        examples["edited_pixel_values"] = edited_images
+
+        # Preprocess the captions.
+        captions = list(examples[edit_prompt_column])
+        examples["input_ids"] = tokenize_captions(captions)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        original_pixel_values = torch.stack([example["original_pixel_values"] for example in examples])
+        original_pixel_values = original_pixel_values.to(memory_format=torch.contiguous_format).float()
+        edited_pixel_values = torch.stack([example["edited_pixel_values"] for example in examples])
+        edited_pixel_values = edited_pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {
+            "original_pixel_values": original_pixel_values,
+            "edited_pixel_values": edited_pixel_values,
+            "input_ids": input_ids,
+        }
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.use_ema:
+        ema_unet.to(accelerator.device)
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move text_encode and vae to gpu and cast to weight_dtype
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("instruct-pix2pix", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(unet):
+                # We want to learn the denoising process w.r.t the edited images which
+                # are conditioned on the original image (which was edited) and the edit instruction.
+                # So, first, convert images to latent space.
+                latents = vae.encode(batch["edited_pixel_values"].to(weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning.
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Get the additional image embedding for conditioning.
+                # Instead of getting a diagonal Gaussian here, we simply take the mode.
+                original_image_embeds = vae.encode(batch["original_pixel_values"].to(weight_dtype)).latent_dist.mode()
+
+                # Conditioning dropout to support classifier-free guidance during inference. For more details
+                # check out the section 3.2.1 of the original paper https://arxiv.org/abs/2211.09800.
+                if args.conditioning_dropout_prob is not None:
+                    random_p = torch.rand(bsz, device=latents.device, generator=generator)
+                    # Sample masks for the edit prompts.
+                    prompt_mask = random_p < 2 * args.conditioning_dropout_prob
+                    prompt_mask = prompt_mask.reshape(bsz, 1, 1)
+                    # Final text conditioning.
+                    null_conditioning = text_encoder(tokenize_captions([""]).to(accelerator.device))[0]
+                    encoder_hidden_states = torch.where(prompt_mask, null_conditioning, encoder_hidden_states)
+
+                    # Sample masks for the original images.
+                    image_mask_dtype = original_image_embeds.dtype
+                    image_mask = 1 - (
+                        (random_p >= args.conditioning_dropout_prob).to(image_mask_dtype)
+                        * (random_p < 3 * args.conditioning_dropout_prob).to(image_mask_dtype)
+                    )
+                    image_mask = image_mask.reshape(bsz, 1, 1, 1)
+                    # Final image conditioning.
+                    original_image_embeds = image_mask * original_image_embeds
+
+                # Concatenate the `original_image_embeds` with the `noisy_latents`.
+                concatenated_noisy_latents = torch.cat([noisy_latents, original_image_embeds], dim=1)
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # Predict the noise residual and compute loss
+                model_pred = unet(concatenated_noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_unet.step(unet.parameters())
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if (
+                (args.val_image_url is not None)
+                and (args.validation_prompt is not None)
+                and (epoch % args.validation_epochs == 0)
+            ):
+                if args.use_ema:
+                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                    ema_unet.store(unet.parameters())
+                    ema_unet.copy_to(unet.parameters())
+                # The models need unwrapping because for compatibility in distributed training mode.
+                pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    unet=unwrap_model(unet),
+                    text_encoder=unwrap_model(text_encoder),
+                    vae=unwrap_model(vae),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+
+                log_validation(
+                    pipeline,
+                    args,
+                    accelerator,
+                    generator,
+                )
+
+                if args.use_ema:
+                    # Switch back to the original UNet parameters.
+                    ema_unet.restore(unet.parameters())
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+        pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            text_encoder=unwrap_model(text_encoder),
+            vae=unwrap_model(vae),
+            unet=unwrap_model(unet),
+            revision=args.revision,
+            variant=args.variant,
+        )
+        pipeline.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+        if (args.val_image_url is not None) and (args.validation_prompt is not None):
+            log_validation(
+                pipeline,
+                args,
+                accelerator,
+                generator,
+            )
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py b/diffusers/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..c88be6d16d88d57276f149522741afaf65e4c026
--- /dev/null
+++ b/diffusers/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py
@@ -0,0 +1,1261 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 Harutatsu Akiyama and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import math
+import os
+import shutil
+import warnings
+from contextlib import nullcontext
+from pathlib import Path
+from urllib.parse import urlparse
+
+import accelerate
+import datasets
+import numpy as np
+import PIL
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_instruct_pix2pix import (
+    StableDiffusionXLInstructPix2PixPipeline,
+)
+from diffusers.training_utils import EMAModel
+from diffusers.utils import check_min_version, deprecate, is_wandb_available, load_image
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+DATASET_NAME_MAPPING = {
+    "fusing/instructpix2pix-1000-samples": ("file_name", "edited_image", "edit_prompt"),
+}
+WANDB_TABLE_COL_NAMES = ["file_name", "edited_image", "edit_prompt"]
+TORCH_DTYPE_MAPPING = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
+
+
+def log_validation(pipeline, args, accelerator, generator, global_step, is_final_validation=False):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    val_save_dir = os.path.join(args.output_dir, "validation_images")
+    if not os.path.exists(val_save_dir):
+        os.makedirs(val_save_dir)
+
+    original_image = (
+        lambda image_url_or_path: load_image(image_url_or_path)
+        if urlparse(image_url_or_path).scheme
+        else Image.open(image_url_or_path).convert("RGB")
+    )(args.val_image_url_or_path)
+
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    with autocast_ctx:
+        edited_images = []
+        # Run inference
+        for val_img_idx in range(args.num_validation_images):
+            a_val_img = pipeline(
+                args.validation_prompt,
+                image=original_image,
+                num_inference_steps=20,
+                image_guidance_scale=1.5,
+                guidance_scale=7,
+                generator=generator,
+            ).images[0]
+            edited_images.append(a_val_img)
+            # Save validation images
+            a_val_img.save(os.path.join(val_save_dir, f"step_{global_step}_val_img_{val_img_idx}.png"))
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "wandb":
+            wandb_table = wandb.Table(columns=WANDB_TABLE_COL_NAMES)
+            for edited_image in edited_images:
+                wandb_table.add_data(wandb.Image(original_image), wandb.Image(edited_image), args.validation_prompt)
+            logger_name = "test" if is_final_validation else "validation"
+            tracker.log({logger_name: wandb_table})
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Script to train Stable Diffusion XL for InstructPix2Pix.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to an improved VAE to stabilize training. For more details check out: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--vae_precision",
+        type=str,
+        choices=["fp32", "fp16", "bf16"],
+        default="fp32",
+        help=(
+            "The vanilla SDXL 1.0 VAE can cause NaNs due to large activation values. Some custom models might already have a solution"
+            " to this problem, and this flag allows you to use mixed precision to stabilize training."
+        ),
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--original_image_column",
+        type=str,
+        default="input_image",
+        help="The column of the dataset containing the original image on which edits where made.",
+    )
+    parser.add_argument(
+        "--edited_image_column",
+        type=str,
+        default="edited_image",
+        help="The column of the dataset containing the edited image.",
+    )
+    parser.add_argument(
+        "--edit_prompt_column",
+        type=str,
+        default="edit_prompt",
+        help="The column of the dataset containing the edit instruction.",
+    )
+    parser.add_argument(
+        "--val_image_url_or_path",
+        type=str,
+        default=None,
+        help="URL to the original image that you would like to edit (used during inference for debugging purposes).",
+    )
+    parser.add_argument(
+        "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run fine-tuning validation every X steps. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="instruct-pix2pix-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=256,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
+        ),
+    )
+    parser.add_argument(
+        "--crops_coords_top_left_h",
+        type=int,
+        default=0,
+        help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."),
+    )
+    parser.add_argument(
+        "--crops_coords_top_left_w",
+        type=int,
+        default=0,
+        help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--conditioning_dropout_prob",
+        type=float,
+        default=None,
+        help="Conditioning dropout probability. Drops out the conditionings (image and edit prompt) used in training InstructPix2Pix. See section 3.2.1 in the paper: https://arxiv.org/abs/2211.09800.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--non_ema_revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or"
+            " remote repository specified with --pretrained_model_name_or_path."
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    # default to using the same revision for the non-ema model if not specified
+    if args.non_ema_revision is None:
+        args.non_ema_revision = args.revision
+
+    return args
+
+
+def convert_to_np(image, resolution):
+    if isinstance(image, str):
+        image = PIL.Image.open(image)
+    image = image.convert("RGB").resize((resolution, resolution))
+    return np.array(image).transpose(2, 0, 1)
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if args.non_ema_revision is not None:
+        deprecate(
+            "non_ema_revision!=None",
+            "0.15.0",
+            message=(
+                "Downloading 'non_ema' weights from revision branches of the Hub is deprecated. Please make sure to"
+                " use `--variant=non_ema` instead."
+            ),
+        )
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # InstructPix2Pix uses an additional image for conditioning. To accommodate that,
+    # it uses 8 channels (instead of 4) in the first (conv) layer of the UNet. This UNet is
+    # then fine-tuned on the custom InstructPix2Pix dataset. This modified UNet is initialized
+    # from the pre-trained checkpoints. For the extra channels added to the first layer, they are
+    # initialized to zero.
+    logger.info("Initializing the XL InstructPix2Pix UNet from the pretrained UNet.")
+    in_channels = 8
+    out_channels = unet.conv_in.out_channels
+    unet.register_to_config(in_channels=in_channels)
+
+    with torch.no_grad():
+        new_conv_in = nn.Conv2d(
+            in_channels, out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding
+        )
+        new_conv_in.weight.zero_()
+        new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
+        unet.conv_in = new_conv_in
+
+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = EMAModel(unet.parameters(), model_cls=UNet2DConditionModel, model_config=unet.config)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel)
+                ema_unet.load_state_dict(load_model.state_dict())
+                ema_unet.to(accelerator.device)
+                del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/main/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.original_image_column is None:
+        original_image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        original_image_column = args.original_image_column
+        if original_image_column not in column_names:
+            raise ValueError(
+                f"--original_image_column' value '{args.original_image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.edit_prompt_column is None:
+        edit_prompt_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        edit_prompt_column = args.edit_prompt_column
+        if edit_prompt_column not in column_names:
+            raise ValueError(
+                f"--edit_prompt_column' value '{args.edit_prompt_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.edited_image_column is None:
+        edited_image_column = dataset_columns[2] if dataset_columns is not None else column_names[2]
+    else:
+        edited_image_column = args.edited_image_column
+        if edited_image_column not in column_names:
+            raise ValueError(
+                f"--edited_image_column' value '{args.edited_image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+        warnings.warn(f"weight_dtype {weight_dtype} may cause nan during vae encoding", UserWarning)
+
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+        warnings.warn(f"weight_dtype {weight_dtype} may cause nan during vae encoding", UserWarning)
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(captions, tokenizer):
+        inputs = tokenizer(
+            captions,
+            max_length=tokenizer.model_max_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        )
+        return inputs.input_ids
+
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+        ]
+    )
+
+    def preprocess_images(examples):
+        original_images = np.concatenate(
+            [convert_to_np(image, args.resolution) for image in examples[original_image_column]]
+        )
+        edited_images = np.concatenate(
+            [convert_to_np(image, args.resolution) for image in examples[edited_image_column]]
+        )
+        # We need to ensure that the original and the edited images undergo the same
+        # augmentation transforms.
+        images = np.concatenate([original_images, edited_images])
+        images = torch.tensor(images)
+        images = 2 * (images / 255) - 1
+        return train_transforms(images)
+
+    # Load scheduler, tokenizer and models.
+    tokenizer_1 = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_2 = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+    text_encoder_cls_1 = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+    text_encoder_cls_2 = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder_1 = text_encoder_cls_1.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_2 = text_encoder_cls_2.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+
+    # We ALWAYS pre-compute the additional condition embeddings needed for SDXL
+    # UNet as the model is already big and it uses two text encoders.
+    text_encoder_1.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_2.to(accelerator.device, dtype=weight_dtype)
+    tokenizers = [tokenizer_1, tokenizer_2]
+    text_encoders = [text_encoder_1, text_encoder_2]
+
+    # Freeze vae and text_encoders
+    vae.requires_grad_(False)
+    text_encoder_1.requires_grad_(False)
+    text_encoder_2.requires_grad_(False)
+
+    # Set UNet to trainable.
+    unet.train()
+
+    # Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(text_encoders, tokenizers, prompt):
+        prompt_embeds_list = []
+
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_embeds = text_encoder(
+                text_input_ids.to(text_encoder.device),
+                output_hidden_states=True,
+            )
+
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+            prompt_embeds_list.append(prompt_embeds)
+
+        prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+        pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+        return prompt_embeds, pooled_prompt_embeds
+
+    # Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompts(text_encoders, tokenizers, prompts):
+        prompt_embeds_all = []
+        pooled_prompt_embeds_all = []
+
+        for prompt in prompts:
+            prompt_embeds, pooled_prompt_embeds = encode_prompt(text_encoders, tokenizers, prompt)
+            prompt_embeds_all.append(prompt_embeds)
+            pooled_prompt_embeds_all.append(pooled_prompt_embeds)
+
+        return torch.stack(prompt_embeds_all), torch.stack(pooled_prompt_embeds_all)
+
+    # Adapted from examples.dreambooth.train_dreambooth_lora_sdxl
+    # Here, we compute not just the text embeddings but also the additional embeddings
+    # needed for the SD XL UNet to operate.
+    def compute_embeddings_for_prompts(prompts, text_encoders, tokenizers):
+        with torch.no_grad():
+            prompt_embeds_all, pooled_prompt_embeds_all = encode_prompts(text_encoders, tokenizers, prompts)
+            add_text_embeds_all = pooled_prompt_embeds_all
+
+            prompt_embeds_all = prompt_embeds_all.to(accelerator.device)
+            add_text_embeds_all = add_text_embeds_all.to(accelerator.device)
+        return prompt_embeds_all, add_text_embeds_all
+
+    # Get null conditioning
+    def compute_null_conditioning():
+        null_conditioning_list = []
+        for a_tokenizer, a_text_encoder in zip(tokenizers, text_encoders):
+            null_conditioning_list.append(
+                a_text_encoder(
+                    tokenize_captions([""], tokenizer=a_tokenizer).to(accelerator.device),
+                    output_hidden_states=True,
+                ).hidden_states[-2]
+            )
+        return torch.concat(null_conditioning_list, dim=-1)
+
+    null_conditioning = compute_null_conditioning()
+
+    def compute_time_ids():
+        crops_coords_top_left = (args.crops_coords_top_left_h, args.crops_coords_top_left_w)
+        original_size = target_size = (args.resolution, args.resolution)
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+        add_time_ids = torch.tensor([add_time_ids], dtype=weight_dtype)
+        return add_time_ids.to(accelerator.device).repeat(args.train_batch_size, 1)
+
+    add_time_ids = compute_time_ids()
+
+    def preprocess_train(examples):
+        # Preprocess images.
+        preprocessed_images = preprocess_images(examples)
+        # Since the original and edited images were concatenated before
+        # applying the transformations, we need to separate them and reshape
+        # them accordingly.
+        original_images, edited_images = preprocessed_images.chunk(2)
+        original_images = original_images.reshape(-1, 3, args.resolution, args.resolution)
+        edited_images = edited_images.reshape(-1, 3, args.resolution, args.resolution)
+
+        # Collate the preprocessed images into the `examples`.
+        examples["original_pixel_values"] = original_images
+        examples["edited_pixel_values"] = edited_images
+
+        # Preprocess the captions.
+        captions = list(examples[edit_prompt_column])
+        prompt_embeds_all, add_text_embeds_all = compute_embeddings_for_prompts(captions, text_encoders, tokenizers)
+        examples["prompt_embeds"] = prompt_embeds_all
+        examples["add_text_embeds"] = add_text_embeds_all
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        original_pixel_values = torch.stack([example["original_pixel_values"] for example in examples])
+        original_pixel_values = original_pixel_values.to(memory_format=torch.contiguous_format).float()
+        edited_pixel_values = torch.stack([example["edited_pixel_values"] for example in examples])
+        edited_pixel_values = edited_pixel_values.to(memory_format=torch.contiguous_format).float()
+        prompt_embeds = torch.concat([example["prompt_embeds"] for example in examples], dim=0)
+        add_text_embeds = torch.concat([example["add_text_embeds"] for example in examples], dim=0)
+        return {
+            "original_pixel_values": original_pixel_values,
+            "edited_pixel_values": edited_pixel_values,
+            "prompt_embeds": prompt_embeds,
+            "add_text_embeds": add_text_embeds,
+        }
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.use_ema:
+        ema_unet.to(accelerator.device)
+
+    # Move vae, unet and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    if args.pretrained_vae_model_name_or_path is not None:
+        vae.to(accelerator.device, dtype=weight_dtype)
+    else:
+        vae.to(accelerator.device, dtype=TORCH_DTYPE_MAPPING[args.vae_precision])
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("instruct-pix2pix-xl", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # We want to learn the denoising process w.r.t the edited images which
+                # are conditioned on the original image (which was edited) and the edit instruction.
+                # So, first, convert images to latent space.
+                if args.pretrained_vae_model_name_or_path is not None:
+                    edited_pixel_values = batch["edited_pixel_values"].to(dtype=weight_dtype)
+                else:
+                    edited_pixel_values = batch["edited_pixel_values"]
+                latents = vae.encode(edited_pixel_values).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # SDXL additional inputs
+                encoder_hidden_states = batch["prompt_embeds"]
+                add_text_embeds = batch["add_text_embeds"]
+
+                # Get the additional image embedding for conditioning.
+                # Instead of getting a diagonal Gaussian here, we simply take the mode.
+                if args.pretrained_vae_model_name_or_path is not None:
+                    original_pixel_values = batch["original_pixel_values"].to(dtype=weight_dtype)
+                else:
+                    original_pixel_values = batch["original_pixel_values"]
+                original_image_embeds = vae.encode(original_pixel_values).latent_dist.sample()
+                if args.pretrained_vae_model_name_or_path is None:
+                    original_image_embeds = original_image_embeds.to(weight_dtype)
+
+                # Conditioning dropout to support classifier-free guidance during inference. For more details
+                # check out the section 3.2.1 of the original paper https://arxiv.org/abs/2211.09800.
+                if args.conditioning_dropout_prob is not None:
+                    random_p = torch.rand(bsz, device=latents.device, generator=generator)
+                    # Sample masks for the edit prompts.
+                    prompt_mask = random_p < 2 * args.conditioning_dropout_prob
+                    prompt_mask = prompt_mask.reshape(bsz, 1, 1)
+                    # Final text conditioning.
+                    encoder_hidden_states = torch.where(prompt_mask, null_conditioning, encoder_hidden_states)
+
+                    # Sample masks for the original images.
+                    image_mask_dtype = original_image_embeds.dtype
+                    image_mask = 1 - (
+                        (random_p >= args.conditioning_dropout_prob).to(image_mask_dtype)
+                        * (random_p < 3 * args.conditioning_dropout_prob).to(image_mask_dtype)
+                    )
+                    image_mask = image_mask.reshape(bsz, 1, 1, 1)
+                    # Final image conditioning.
+                    original_image_embeds = image_mask * original_image_embeds
+
+                # Concatenate the `original_image_embeds` with the `noisy_latents`.
+                concatenated_noisy_latents = torch.cat([noisy_latents, original_image_embeds], dim=1)
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # Predict the noise residual and compute loss
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                model_pred = unet(
+                    concatenated_noisy_latents,
+                    timesteps,
+                    encoder_hidden_states,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_unet.step(unet.parameters())
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            ### BEGIN: Perform validation every `validation_epochs` steps
+            if global_step % args.validation_steps == 0:
+                if (args.val_image_url_or_path is not None) and (args.validation_prompt is not None):
+                    # create pipeline
+                    if args.use_ema:
+                        # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                        ema_unet.store(unet.parameters())
+                        ema_unet.copy_to(unet.parameters())
+
+                    # The models need unwrapping because for compatibility in distributed training mode.
+                    pipeline = StableDiffusionXLInstructPix2PixPipeline.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        unet=unwrap_model(unet),
+                        text_encoder=text_encoder_1,
+                        text_encoder_2=text_encoder_2,
+                        tokenizer=tokenizer_1,
+                        tokenizer_2=tokenizer_2,
+                        vae=vae,
+                        revision=args.revision,
+                        variant=args.variant,
+                        torch_dtype=weight_dtype,
+                    )
+
+                    log_validation(
+                        pipeline,
+                        args,
+                        accelerator,
+                        generator,
+                        global_step,
+                        is_final_validation=False,
+                    )
+
+                    if args.use_ema:
+                        # Switch back to the original UNet parameters.
+                        ema_unet.restore(unet.parameters())
+
+                    del pipeline
+                    torch.cuda.empty_cache()
+            ### END: Perform validation every `validation_epochs` steps
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+        pipeline = StableDiffusionXLInstructPix2PixPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            text_encoder=text_encoder_1,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer_1,
+            tokenizer_2=tokenizer_2,
+            vae=vae,
+            unet=unwrap_model(unet),
+            revision=args.revision,
+            variant=args.variant,
+        )
+
+        pipeline.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+        if (args.val_image_url_or_path is not None) and (args.validation_prompt is not None):
+            log_validation(
+                pipeline,
+                args,
+                accelerator,
+                generator,
+                global_step,
+                is_final_validation=True,
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/kandinsky2_2/text_to_image/README.md b/diffusers/examples/kandinsky2_2/text_to_image/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6daf06dcb8c076173d576a0ed7a27effe794a61b
--- /dev/null
+++ b/diffusers/examples/kandinsky2_2/text_to_image/README.md
@@ -0,0 +1,317 @@
+# Kandinsky2.2 text-to-image fine-tuning
+
+Kandinsky 2.2 includes a prior pipeline that generates image embeddings from text prompts, and a decoder pipeline that generates the output image based on the image embeddings. We provide `train_text_to_image_prior.py` and `train_text_to_image_decoder.py` scripts to show you how to fine-tune the Kandinsky prior and decoder models separately based on your own dataset. To achieve the best results, you should fine-tune **_both_** your prior and decoder models.
+
+___Note___:
+
+___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset.___
+
+
+## Running locally with PyTorch
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd in the example folder  and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+For this example we want to directly store the trained LoRA embeddings on the Hub, so we need to be logged in and add the --push_to_hub flag.
+
+___
+
+### Naruto example
+
+For all our examples, we will directly store the trained weights on the Hub, so we need to be logged in and add the `--push_to_hub` flag. In order to do that, you have to be a registered user on the 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to the [User Access Tokens](https://huggingface.co/docs/hub/security-tokens) guide.
+
+Run the following command to authenticate your token
+
+```bash
+huggingface-cli login
+```
+
+We also use [Weights and Biases](https://docs.wandb.ai/quickstart) logging by default, because it is really useful to monitor the training progress by regularly generating sample images during training. To install wandb, run
+
+```bash
+pip install wandb
+```
+
+To disable wandb logging, remove the `--report_to=="wandb"` and `--validation_prompts="A robot naruto, 4k photo"` flags from below examples
+
+#### Fine-tune decoder
+<br>
+
+<!-- accelerate_snippet_start -->
+```bash
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+accelerate launch --mixed_precision="fp16"  train_text_to_image_decoder.py \
+  --dataset_name=$DATASET_NAME \
+  --resolution=768 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --checkpoints_total_limit=3 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --validation_prompts="A robot naruto, 4k photo" \
+  --report_to="wandb" \
+  --push_to_hub \
+  --output_dir="kandi2-decoder-naruto-model"
+```
+<!-- accelerate_snippet_end -->
+
+
+To train on your own training files, prepare the dataset according to the format required by `datasets`. You can find the instructions for how to do that in the [ImageFolder with metadata](https://huggingface.co/docs/datasets/en/image_load#imagefolder-with-metadata) guide.
+If you wish to use custom loading logic, you should modify the script and we have left pointers for that in the training script.
+
+```bash
+export TRAIN_DIR="path_to_your_dataset"
+
+accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
+  --train_data_dir=$TRAIN_DIR \
+  --resolution=768 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --checkpoints_total_limit=3 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --validation_prompts="A robot naruto, 4k photo" \
+  --report_to="wandb" \
+  --push_to_hub \
+  --output_dir="kandi22-decoder-naruto-model"
+```
+
+
+Once the training is finished the model will be saved in the `output_dir` specified in the command. In this example it's `kandi22-decoder-naruto-model`. To load the fine-tuned model for inference just pass that path to `AutoPipelineForText2Image`
+
+```python
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipe = AutoPipelineForText2Image.from_pretrained(output_dir, torch_dtype=torch.float16)
+pipe.enable_model_cpu_offload()
+
+prompt='A robot naruto, 4k photo'
+images = pipe(prompt=prompt).images
+images[0].save("robot-naruto.png")
+```
+
+Checkpoints only save the unet, so to run inference from a checkpoint, just load the unet
+```python
+from diffusers import AutoPipelineForText2Image, UNet2DConditionModel
+
+model_path = "path_to_saved_model"
+
+unet = UNet2DConditionModel.from_pretrained(model_path + "/checkpoint-<N>/unet")
+
+pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", unet=unet, torch_dtype=torch.float16)
+pipe.enable_model_cpu_offload()
+
+image = pipe(prompt="A robot naruto, 4k photo").images[0]
+image.save("robot-naruto.png")
+```
+
+#### Fine-tune prior
+
+You can fine-tune the Kandinsky prior model with `train_text_to_image_prior.py` script. Note that we currently do not support `--gradient_checkpointing` for prior model fine-tuning.
+
+<br>
+
+<!-- accelerate_snippet_start -->
+```bash
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+accelerate launch --mixed_precision="fp16"  train_text_to_image_prior.py \
+  --dataset_name=$DATASET_NAME \
+  --resolution=768 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --checkpoints_total_limit=3 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --validation_prompts="A robot naruto, 4k photo" \
+  --report_to="wandb" \
+  --push_to_hub \
+  --output_dir="kandi2-prior-naruto-model"
+```
+<!-- accelerate_snippet_end -->
+
+
+To perform inference with the fine-tuned prior model, you will need to first create a prior pipeline by passing the `output_dir` to `DiffusionPipeline`. Then create a `KandinskyV22CombinedPipeline` from a pretrained or fine-tuned decoder checkpoint along with all the modules of the prior pipeline you just created.
+
+```python
+from diffusers import AutoPipelineForText2Image, DiffusionPipeline
+import torch
+
+pipe_prior = DiffusionPipeline.from_pretrained(output_dir, torch_dtype=torch.float16)
+prior_components = {"prior_" + k: v for k,v in pipe_prior.components.items()}
+pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", **prior_components, torch_dtype=torch.float16)
+
+pipe.enable_model_cpu_offload()
+prompt='A robot naruto, 4k photo'
+images = pipe(prompt=prompt, negative_prompt=negative_prompt).images
+images[0]
+```
+
+If you want to use a fine-tuned decoder checkpoint along with your fine-tuned prior checkpoint, you can simply replace the "kandinsky-community/kandinsky-2-2-decoder" in above code with your custom model repo name. Note that in order to be able to create a `KandinskyV22CombinedPipeline`, your model repository need to have a prior tag. If you have created your model repo using our training script, the prior tag is automatically included.
+
+#### Training with multiple GPUs
+
+`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch)
+for running distributed training with `accelerate`. Here is an example command:
+
+```bash
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+accelerate launch --mixed_precision="fp16" --multi_gpu  train_text_to_image_decoder.py \
+  --dataset_name=$DATASET_NAME \
+  --resolution=768 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --checkpoints_total_limit=3 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --validation_prompts="A robot naruto, 4k photo" \
+  --report_to="wandb" \
+  --push_to_hub \
+  --output_dir="kandi2-decoder-naruto-model"
+```
+
+
+#### Training with Min-SNR weighting
+
+We support training with the Min-SNR weighting strategy proposed in [Efficient Diffusion Training via Min-SNR Weighting Strategy](https://arxiv.org/abs/2303.09556) which helps achieve faster convergence
+by rebalancing the loss. Enable the `--snr_gamma` argument and set it to the recommended
+value of 5.0.
+
+
+## Training with LoRA
+
+Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
+
+In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
+
+- Previous pretrained weights are kept frozen so that model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114).
+- Rank-decomposition matrices have significantly fewer parameters than original model, which means that trained LoRA weights are easily portable.
+- LoRA attention layers allow to control to which extent the model is adapted toward new training images via a `scale` parameter.
+
+[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
+
+With LoRA, it's possible to fine-tune Kandinsky 2.2 on a custom image-caption pair dataset
+on consumer GPUs like Tesla T4, Tesla V100.
+
+### Training
+
+First, you need to set up your development environment as explained in the [installation](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder) and the [Narutos dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
+
+
+#### Train decoder
+
+```bash
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+accelerate launch --mixed_precision="fp16" train_text_to_image_decoder_lora.py \
+  --dataset_name=$DATASET_NAME --caption_column="text" \
+  --resolution=768 \
+  --train_batch_size=1 \
+  --num_train_epochs=100 --checkpointing_steps=5000 \
+  --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --seed=42 \
+  --rank=4 \
+  --gradient_checkpointing \
+  --output_dir="kandi22-decoder-naruto-lora" \
+  --validation_prompt="cute dragon creature" --report_to="wandb" \
+  --push_to_hub \
+```
+
+#### Train prior
+
+```bash
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+accelerate launch --mixed_precision="fp16" train_text_to_image_prior_lora.py \
+  --dataset_name=$DATASET_NAME --caption_column="text" \
+  --resolution=768 \
+  --train_batch_size=1 \
+  --num_train_epochs=100 --checkpointing_steps=5000 \
+  --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --seed=42 \
+  --rank=4 \
+  --output_dir="kandi22-prior-naruto-lora" \
+  --validation_prompt="cute dragon creature" --report_to="wandb" \
+  --push_to_hub \
+```
+
+**___Note: When using LoRA we can use a much higher learning rate compared to non-LoRA fine-tuning. Here we use *1e-4* instead of the usual *1e-5*. Also, by using LoRA, it's possible to run above scripts in consumer GPUs like T4 or V100.___**
+
+
+### Inference
+
+#### Inference using fine-tuned LoRA checkpoint for decoder
+
+Once you have trained a Kandinsky decoder model using the above command, inference can be done with the `AutoPipelineForText2Image` after loading the trained LoRA weights.  You need to pass the `output_dir` for loading the LoRA weights, which in this case is `kandi22-decoder-naruto-lora`.
+
+
+```python
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16)
+pipe.unet.load_attn_procs(output_dir)
+pipe.enable_model_cpu_offload()
+
+prompt='A robot naruto, 4k photo'
+image = pipe(prompt=prompt).images[0]
+image.save("robot_naruto.png")
+```
+
+#### Inference using fine-tuned LoRA checkpoint for prior
+
+```python
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16)
+pipe.prior_prior.load_attn_procs(output_dir)
+pipe.enable_model_cpu_offload()
+
+prompt='A robot naruto, 4k photo'
+image = pipe(prompt=prompt).images[0]
+image.save("robot_naruto.png")
+image
+```
+
+### Training with xFormers:
+
+You can enable memory efficient attention by [installing xFormers](https://huggingface.co/docs/diffusers/main/en/optimization/xformers) and passing the `--enable_xformers_memory_efficient_attention` argument to the script.
+
+xFormers training is not available for fine-tuning the prior model.
+
+**Note**:
+
+According to [this issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training in some GPUs. If you observe that problem, please install a development version as indicated in that comment.
\ No newline at end of file
diff --git a/diffusers/examples/kandinsky2_2/text_to_image/requirements.txt b/diffusers/examples/kandinsky2_2/text_to_image/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..31b9026efdc2799b1d02e2e3f4d8dfc463737fdc
--- /dev/null
+++ b/diffusers/examples/kandinsky2_2/text_to_image/requirements.txt
@@ -0,0 +1,7 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+datasets
+ftfy
+tensorboard
+Jinja2
diff --git a/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py b/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9caa3694d636733cc179e1b4372198e4fa415988
--- /dev/null
+++ b/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py
@@ -0,0 +1,929 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import logging
+import math
+import os
+import shutil
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.state import AcceleratorState
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from tqdm import tqdm
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+from transformers.utils import ContextManagers
+
+import diffusers
+from diffusers import AutoPipelineForText2Image, DDPMScheduler, UNet2DConditionModel, VQModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
+from diffusers.utils.import_utils import is_xformers_available
+
+
+if is_wandb_available():
+    import wandb
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def save_model_card(
+    args,
+    repo_id: str,
+    images=None,
+    repo_folder=None,
+):
+    img_str = ""
+    if len(images) > 0:
+        image_grid = make_image_grid(images, 1, len(args.validation_prompts))
+        image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png"))
+        img_str += "![val_imgs_grid](./val_imgs_grid.png)\n"
+
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {args.pretrained_decoder_model_name_or_path}
+datasets:
+- {args.dataset_name}
+prior:
+- {args.pretrained_prior_model_name_or_path}
+tags:
+- kandinsky
+- text-to-image
+- diffusers
+- diffusers-training
+inference: true
+---
+    """
+    model_card = f"""
+# Finetuning - {repo_id}
+
+This pipeline was finetuned from **{args.pretrained_decoder_model_name_or_path}** on the **{args.dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompts: {args.validation_prompts}: \n
+{img_str}
+
+## Pipeline usage
+
+You can use the pipeline like so:
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained("{repo_id}", torch_dtype=torch.float16)
+prompt = "{args.validation_prompts[0]}"
+image = pipeline(prompt).images[0]
+image.save("my_image.png")
+```
+
+## Training info
+
+These are the key hyperparameters used during training:
+
+* Epochs: {args.num_train_epochs}
+* Learning rate: {args.learning_rate}
+* Batch size: {args.train_batch_size}
+* Gradient accumulation steps: {args.gradient_accumulation_steps}
+* Image resolution: {args.resolution}
+* Mixed-precision: {args.mixed_precision}
+
+"""
+    wandb_info = ""
+    if is_wandb_available():
+        wandb_run_url = None
+        if wandb.run is not None:
+            wandb_run_url = wandb.run.url
+
+    if wandb_run_url is not None:
+        wandb_info = f"""
+More information on all the CLI arguments and the environment are available on your [`wandb` run page]({wandb_run_url}).
+"""
+
+    model_card += wandb_info
+
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def log_validation(vae, image_encoder, image_processor, unet, args, accelerator, weight_dtype, epoch):
+    logger.info("Running validation... ")
+
+    pipeline = AutoPipelineForText2Image.from_pretrained(
+        args.pretrained_decoder_model_name_or_path,
+        vae=accelerator.unwrap_model(vae),
+        prior_image_encoder=accelerator.unwrap_model(image_encoder),
+        prior_image_processor=image_processor,
+        unet=accelerator.unwrap_model(unet),
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    images = []
+    for i in range(len(args.validation_prompts)):
+        with torch.autocast("cuda"):
+            image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0]
+
+        images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompts[i]}")
+                        for i, image in enumerate(images)
+                    ]
+                }
+            )
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of finetuning Kandinsky 2.2.")
+    parser.add_argument(
+        "--pretrained_decoder_model_name_or_path",
+        type=str,
+        default="kandinsky-community/kandinsky-2-2-decoder",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_prior_model_name_or_path",
+        type=str,
+        default="kandinsky-community/kandinsky-2-2-prior",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompts",
+        type=str,
+        default=None,
+        nargs="+",
+        help=("A set of prompts evaluated every `--validation_epochs` and logged to `--report_to`."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="kandi_2_2-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=1, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="learning rate",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument(
+        "--adam_weight_decay",
+        type=float,
+        default=0.0,
+        required=False,
+        help="weight decay_to_use",
+    )
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=5,
+        help="Run validation every X epochs.",
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_decoder_model_name_or_path, subfolder="scheduler")
+    image_processor = CLIPImageProcessor.from_pretrained(
+        args.pretrained_prior_model_name_or_path, subfolder="image_processor"
+    )
+
+    def deepspeed_zero_init_disabled_context_manager():
+        """
+        returns either a context list that includes one that will disable zero.Init or an empty context list
+        """
+        deepspeed_plugin = AcceleratorState().deepspeed_plugin if accelerate.state.is_initialized() else None
+        if deepspeed_plugin is None:
+            return []
+
+        return [deepspeed_plugin.zero3_init_context_manager(enable=False)]
+
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+    with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+        vae = VQModel.from_pretrained(
+            args.pretrained_decoder_model_name_or_path, subfolder="movq", torch_dtype=weight_dtype
+        ).eval()
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            args.pretrained_prior_model_name_or_path, subfolder="image_encoder", torch_dtype=weight_dtype
+        ).eval()
+    unet = UNet2DConditionModel.from_pretrained(args.pretrained_decoder_model_name_or_path, subfolder="unet")
+
+    # Freeze vae and image_encoder
+    vae.requires_grad_(False)
+    image_encoder.requires_grad_(False)
+
+    # Set unet to trainable.
+    unet.train()
+
+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = UNet2DConditionModel.from_pretrained(args.pretrained_decoder_model_name_or_path, subfolder="unet")
+        ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config)
+        ema_unet.to(accelerator.device)
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if args.use_ema:
+                ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+            for i, model in enumerate(models):
+                model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel)
+                ema_unet.load_state_dict(load_model.state_dict())
+                ema_unet.to(accelerator.device)
+                del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    image_column = args.image_column
+    if image_column not in column_names:
+        raise ValueError(f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}")
+
+    def center_crop(image):
+        width, height = image.size
+        new_size = min(width, height)
+        left = (width - new_size) / 2
+        top = (height - new_size) / 2
+        right = (width + new_size) / 2
+        bottom = (height + new_size) / 2
+        return image.crop((left, top, right, bottom))
+
+    def train_transforms(img):
+        img = center_crop(img)
+        img = img.resize((args.resolution, args.resolution), resample=Image.BICUBIC, reducing_gap=1)
+        img = np.array(img).astype(np.float32) / 127.5 - 1
+        img = torch.from_numpy(np.transpose(img, [2, 0, 1]))
+        return img
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["clip_pixel_values"] = image_processor(images, return_tensors="pt").pixel_values
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        clip_pixel_values = torch.stack([example["clip_pixel_values"] for example in examples])
+        clip_pixel_values = clip_pixel_values.to(memory_format=torch.contiguous_format).float()
+        return {"pixel_values": pixel_values, "clip_pixel_values": clip_pixel_values}
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+    # Move image_encode and vae to gpu and cast to weight_dtype
+    image_encoder.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        tracker_config.pop("validation_prompts")
+        accelerator.init_trackers(args.tracker_project_name, tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                images = batch["pixel_values"].to(weight_dtype)
+                clip_images = batch["clip_pixel_values"].to(weight_dtype)
+                latents = vae.encode(images).latents
+                image_embeds = image_encoder(clip_images).image_embeds
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                target = noise
+
+                # Predict the noise residual and compute loss
+                added_cond_kwargs = {"image_embeds": image_embeds}
+
+                model_pred = unet(noisy_latents, timesteps, None, added_cond_kwargs=added_cond_kwargs).sample[:, :4]
+
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_unet.step(unet.parameters())
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompts is not None and epoch % args.validation_epochs == 0:
+                if args.use_ema:
+                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                    ema_unet.store(unet.parameters())
+                    ema_unet.copy_to(unet.parameters())
+                log_validation(
+                    vae,
+                    image_encoder,
+                    image_processor,
+                    unet,
+                    args,
+                    accelerator,
+                    weight_dtype,
+                    global_step,
+                )
+                if args.use_ema:
+                    # Switch back to the original UNet parameters.
+                    ema_unet.restore(unet.parameters())
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            args.pretrained_decoder_model_name_or_path,
+            vae=vae,
+            unet=unet,
+        )
+        pipeline.decoder_pipe.save_pretrained(args.output_dir)
+
+        # Run a final round of inference.
+        images = []
+        if args.validation_prompts is not None:
+            logger.info("Running inference for collecting generated images...")
+            pipeline.torch_dtype = weight_dtype
+            pipeline.set_progress_bar_config(disable=True)
+            pipeline.enable_model_cpu_offload()
+
+            if args.enable_xformers_memory_efficient_attention:
+                pipeline.enable_xformers_memory_efficient_attention()
+
+            if args.seed is None:
+                generator = None
+            else:
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+            for i in range(len(args.validation_prompts)):
+                with torch.autocast("cuda"):
+                    image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0]
+                images.append(image)
+
+        if args.push_to_hub:
+            save_model_card(args, repo_id, images, repo_folder=args.output_dir)
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py b/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..23f5d342b396b81146ee6a8f1ada3829047abf0c
--- /dev/null
+++ b/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py
@@ -0,0 +1,812 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning script for Kandinsky with support for LoRA."""
+
+import argparse
+import logging
+import math
+import os
+import shutil
+from pathlib import Path
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from PIL import Image
+from tqdm import tqdm
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+import diffusers
+from diffusers import AutoPipelineForText2Image, DDPMScheduler, UNet2DConditionModel, VQModel
+from diffusers.loaders import AttnProcsLayers
+from diffusers.models.attention_processor import LoRAAttnAddedKVProcessor
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import compute_snr
+from diffusers.utils import check_min_version, is_wandb_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None):
+    img_str = ""
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+tags:
+- kandinsky
+- text-to-image
+- diffusers
+- diffusers-training
+- lora
+inference: true
+---
+    """
+    model_card = f"""
+# LoRA text2image fine-tuning - {repo_id}
+These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
+{img_str}
+"""
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of finetuning Kandinsky 2.2 with LoRA.")
+    parser.add_argument(
+        "--pretrained_decoder_model_name_or_path",
+        type=str,
+        default="kandinsky-community/kandinsky-2-2-decoder",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_prior_model_name_or_path",
+        type=str,
+        default="kandinsky-community/kandinsky-2-2-prior",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="kandi_2_2-model-finetuned-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=1, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=0.0, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+    # Load scheduler, tokenizer and models.
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_decoder_model_name_or_path, subfolder="scheduler")
+    image_processor = CLIPImageProcessor.from_pretrained(
+        args.pretrained_prior_model_name_or_path, subfolder="image_processor"
+    )
+    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+        args.pretrained_prior_model_name_or_path, subfolder="image_encoder"
+    )
+
+    vae = VQModel.from_pretrained(args.pretrained_decoder_model_name_or_path, subfolder="movq")
+
+    unet = UNet2DConditionModel.from_pretrained(args.pretrained_decoder_model_name_or_path, subfolder="unet")
+    # freeze parameters of models to save more memory
+    unet.requires_grad_(False)
+    vae.requires_grad_(False)
+
+    image_encoder.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+    image_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    lora_attn_procs = {}
+    for name in unet.attn_processors.keys():
+        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = unet.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = unet.config.block_out_channels[block_id]
+
+        lora_attn_procs[name] = LoRAAttnAddedKVProcessor(
+            hidden_size=hidden_size,
+            cross_attention_dim=cross_attention_dim,
+            rank=args.rank,
+        )
+
+    unet.set_attn_processor(lora_attn_procs)
+
+    lora_layers = AttnProcsLayers(unet.attn_processors)
+
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        lora_layers.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    image_column = args.image_column
+    if image_column not in column_names:
+        raise ValueError(f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}")
+
+    def center_crop(image):
+        width, height = image.size
+        new_size = min(width, height)
+        left = (width - new_size) / 2
+        top = (height - new_size) / 2
+        right = (width + new_size) / 2
+        bottom = (height + new_size) / 2
+        return image.crop((left, top, right, bottom))
+
+    def train_transforms(img):
+        img = center_crop(img)
+        img = img.resize((args.resolution, args.resolution), resample=Image.BICUBIC, reducing_gap=1)
+        img = np.array(img).astype(np.float32) / 127.5 - 1
+        img = torch.from_numpy(np.transpose(img, [2, 0, 1]))
+        return img
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["clip_pixel_values"] = image_processor(images, return_tensors="pt").pixel_values
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        clip_pixel_values = torch.stack([example["clip_pixel_values"] for example in examples])
+        clip_pixel_values = clip_pixel_values.to(memory_format=torch.contiguous_format).float()
+        return {"pixel_values": pixel_values, "clip_pixel_values": clip_pixel_values}
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+    # Prepare everything with our `accelerator`.
+    lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        lora_layers, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                images = batch["pixel_values"].to(weight_dtype)
+                clip_images = batch["clip_pixel_values"].to(weight_dtype)
+                latents = vae.encode(images).latents
+                image_embeds = image_encoder(clip_images).image_embeds
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                target = noise
+
+                # Predict the noise residual and compute loss
+                added_cond_kwargs = {"image_embeds": image_embeds}
+
+                model_pred = unet(noisy_latents, timesteps, None, added_cond_kwargs=added_cond_kwargs).sample[:, :4]
+
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = lora_layers.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                # create pipeline
+                pipeline = AutoPipelineForText2Image.from_pretrained(
+                    args.pretrained_decoder_model_name_or_path,
+                    unet=accelerator.unwrap_model(unet),
+                    torch_dtype=weight_dtype,
+                )
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)
+
+                # run inference
+                generator = torch.Generator(device=accelerator.device)
+                if args.seed is not None:
+                    generator = generator.manual_seed(args.seed)
+                images = []
+                for _ in range(args.num_validation_images):
+                    images.append(
+                        pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]
+                    )
+
+                for tracker in accelerator.trackers:
+                    if tracker.name == "tensorboard":
+                        np_images = np.stack([np.asarray(img) for img in images])
+                        tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                    if tracker.name == "wandb":
+                        tracker.log(
+                            {
+                                "validation": [
+                                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                    for i, image in enumerate(images)
+                                ]
+                            }
+                        )
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unet.to(torch.float32)
+        unet.save_attn_procs(args.output_dir)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_decoder_model_name_or_path,
+                dataset_name=args.dataset_name,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    # Final inference
+    # Load previous pipeline
+    pipeline = AutoPipelineForText2Image.from_pretrained(
+        args.pretrained_decoder_model_name_or_path, torch_dtype=weight_dtype
+    )
+    pipeline = pipeline.to(accelerator.device)
+
+    # load attention processors
+    pipeline.unet.load_attn_procs(args.output_dir)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device)
+    if args.seed is not None:
+        generator = generator.manual_seed(args.seed)
+    images = []
+    for _ in range(args.num_validation_images):
+        images.append(pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0])
+
+    if accelerator.is_main_process:
+        for tracker in accelerator.trackers:
+            if len(images) != 0:
+                if tracker.name == "tensorboard":
+                    np_images = np.stack([np.asarray(img) for img in images])
+                    tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+                if tracker.name == "wandb":
+                    tracker.log(
+                        {
+                            "test": [
+                                wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                for i, image in enumerate(images)
+                            ]
+                        }
+                    )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py b/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ed3377db131a5c35219a644488a8a2900813e36
--- /dev/null
+++ b/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
@@ -0,0 +1,844 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning script for Stable Diffusion for text2image with support for LoRA."""
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from tqdm import tqdm
+from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
+
+import diffusers
+from diffusers import AutoPipelineForText2Image, DDPMScheduler, PriorTransformer
+from diffusers.loaders import AttnProcsLayers
+from diffusers.models.attention_processor import LoRAAttnProcessor
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import compute_snr
+from diffusers.utils import check_min_version, is_wandb_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None):
+    img_str = ""
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+tags:
+- kandinsky
+- text-to-image
+- diffusers
+- diffusers-training
+- lora
+inference: true
+---
+    """
+    model_card = f"""
+# LoRA text2image fine-tuning - {repo_id}
+These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
+{img_str}
+"""
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of finetuning Kandinsky 2.2.")
+    parser.add_argument(
+        "--pretrained_decoder_model_name_or_path",
+        type=str,
+        default="kandinsky-community/kandinsky-2-2-decoder",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_prior_model_name_or_path",
+        type=str,
+        default="kandinsky-community/kandinsky-2-2-prior",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="kandi_2_2-model-finetuned-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=1, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="learning rate",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument(
+        "--adam_weight_decay",
+        type=float,
+        default=0.0,
+        required=False,
+        help="weight decay_to_use",
+    )
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+    # Load scheduler, image_processor, tokenizer and models.
+    noise_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", prediction_type="sample")
+    image_processor = CLIPImageProcessor.from_pretrained(
+        args.pretrained_prior_model_name_or_path, subfolder="image_processor"
+    )
+    tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_prior_model_name_or_path, subfolder="tokenizer")
+    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+        args.pretrained_prior_model_name_or_path, subfolder="image_encoder"
+    )
+    text_encoder = CLIPTextModelWithProjection.from_pretrained(
+        args.pretrained_prior_model_name_or_path, subfolder="text_encoder"
+    )
+    prior = PriorTransformer.from_pretrained(args.pretrained_prior_model_name_or_path, subfolder="prior")
+    # freeze parameters of models to save more memory
+    image_encoder.requires_grad_(False)
+    prior.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move image_encoder, text_encoder and prior to device and cast to weight_dtype
+    prior.to(accelerator.device, dtype=weight_dtype)
+    image_encoder.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+    lora_attn_procs = {}
+    for name in prior.attn_processors.keys():
+        lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=2048, rank=args.rank)
+
+    prior.set_attn_processor(lora_attn_procs)
+    lora_layers = AttnProcsLayers(prior.attn_processors)
+
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        lora_layers.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        text_input_ids = inputs.input_ids
+        text_mask = inputs.attention_mask.bool()
+        return text_input_ids, text_mask
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["clip_pixel_values"] = image_processor(images, return_tensors="pt").pixel_values
+        examples["text_input_ids"], examples["text_mask"] = tokenize_captions(examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        clip_pixel_values = torch.stack([example["clip_pixel_values"] for example in examples])
+        clip_pixel_values = clip_pixel_values.to(memory_format=torch.contiguous_format).float()
+        text_input_ids = torch.stack([example["text_input_ids"] for example in examples])
+        text_mask = torch.stack([example["text_mask"] for example in examples])
+        return {"clip_pixel_values": clip_pixel_values, "text_input_ids": text_input_ids, "text_mask": text_mask}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+    clip_mean = prior.clip_mean.clone()
+    clip_std = prior.clip_std.clone()
+    lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        lora_layers, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    clip_mean = clip_mean.to(weight_dtype).to(accelerator.device)
+    clip_std = clip_std.to(weight_dtype).to(accelerator.device)
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        prior.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(prior):
+                # Convert images to latent space
+                text_input_ids, text_mask, clip_images = (
+                    batch["text_input_ids"],
+                    batch["text_mask"],
+                    batch["clip_pixel_values"].to(weight_dtype),
+                )
+                with torch.no_grad():
+                    text_encoder_output = text_encoder(text_input_ids)
+                    prompt_embeds = text_encoder_output.text_embeds
+                    text_encoder_hidden_states = text_encoder_output.last_hidden_state
+
+                    image_embeds = image_encoder(clip_images).image_embeds
+                    # Sample noise that we'll add to the image_embeds
+                    noise = torch.randn_like(image_embeds)
+                    bsz = image_embeds.shape[0]
+                    # Sample a random timestep for each image
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=image_embeds.device
+                    )
+                    timesteps = timesteps.long()
+                    image_embeds = (image_embeds - clip_mean) / clip_std
+                    noisy_latents = noise_scheduler.add_noise(image_embeds, noise, timesteps)
+
+                    target = image_embeds
+
+                # Predict the noise residual and compute loss
+                model_pred = prior(
+                    noisy_latents,
+                    timestep=timesteps,
+                    proj_embedding=prompt_embeds,
+                    encoder_hidden_states=text_encoder_hidden_states,
+                    attention_mask=text_mask,
+                ).predicted_image_embedding
+
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(lora_layers.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                # create pipeline
+                pipeline = AutoPipelineForText2Image.from_pretrained(
+                    args.pretrained_decoder_model_name_or_path,
+                    prior_prior=accelerator.unwrap_model(prior),
+                    torch_dtype=weight_dtype,
+                )
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)
+
+                # run inference
+                generator = torch.Generator(device=accelerator.device)
+                if args.seed is not None:
+                    generator = generator.manual_seed(args.seed)
+                images = []
+                for _ in range(args.num_validation_images):
+                    images.append(
+                        pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]
+                    )
+
+                for tracker in accelerator.trackers:
+                    if tracker.name == "tensorboard":
+                        np_images = np.stack([np.asarray(img) for img in images])
+                        tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                    if tracker.name == "wandb":
+                        tracker.log(
+                            {
+                                "validation": [
+                                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                    for i, image in enumerate(images)
+                                ]
+                            }
+                        )
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        prior = prior.to(torch.float32)
+        prior.save_attn_procs(args.output_dir)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_prior_model_name_or_path,
+                dataset_name=args.dataset_name,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    # Final inference
+    # Load previous pipeline
+    pipeline = AutoPipelineForText2Image.from_pretrained(
+        args.pretrained_decoder_model_name_or_path, torch_dtype=weight_dtype
+    )
+    pipeline = pipeline.to(accelerator.device)
+
+    # load attention processors
+    pipeline.prior_prior.load_attn_procs(args.output_dir)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device)
+    if args.seed is not None:
+        generator = generator.manual_seed(args.seed)
+    images = []
+    for _ in range(args.num_validation_images):
+        images.append(pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0])
+
+    if accelerator.is_main_process:
+        for tracker in accelerator.trackers:
+            if len(images) != 0:
+                if tracker.name == "tensorboard":
+                    np_images = np.stack([np.asarray(img) for img in images])
+                    tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+                if tracker.name == "wandb":
+                    tracker.log(
+                        {
+                            "test": [
+                                wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                for i, image in enumerate(images)
+                            ]
+                        }
+                    )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py b/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..448429444448820e18c646f08857bf1da7bdcb95
--- /dev/null
+++ b/diffusers/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
@@ -0,0 +1,958 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.state import AcceleratorState
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from tqdm import tqdm
+from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers.utils import ContextManagers
+
+import diffusers
+from diffusers import AutoPipelineForText2Image, DDPMScheduler, PriorTransformer
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
+
+
+if is_wandb_available():
+    import wandb
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def save_model_card(
+    args,
+    repo_id: str,
+    images=None,
+    repo_folder=None,
+):
+    img_str = ""
+    if len(images) > 0:
+        image_grid = make_image_grid(images, 1, len(args.validation_prompts))
+        image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png"))
+        img_str += "![val_imgs_grid](./val_imgs_grid.png)\n"
+
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {args.pretrained_prior_model_name_or_path}
+datasets:
+- {args.dataset_name}
+tags:
+- kandinsky
+- text-to-image
+- diffusers
+- diffusers-training
+inference: true
+---
+    """
+    model_card = f"""
+# Finetuning - {repo_id}
+
+This pipeline was finetuned from **{args.pretrained_prior_model_name_or_path}** on the **{args.dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompts: {args.validation_prompts}: \n
+{img_str}
+
+## Pipeline usage
+
+You can use the pipeline like so:
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe_prior = DiffusionPipeline.from_pretrained("{repo_id}", torch_dtype=torch.float16)
+pipe_t2i = DiffusionPipeline.from_pretrained("{args.pretrained_decoder_model_name_or_path}", torch_dtype=torch.float16)
+prompt = "{args.validation_prompts[0]}"
+image_embeds, negative_image_embeds = pipe_prior(prompt, guidance_scale=1.0).to_tuple()
+image = pipe_t2i(image_embeds=image_embeds, negative_image_embeds=negative_image_embeds).images[0]
+image.save("my_image.png")
+```
+
+## Training info
+
+These are the key hyperparameters used during training:
+
+* Epochs: {args.num_train_epochs}
+* Learning rate: {args.learning_rate}
+* Batch size: {args.train_batch_size}
+* Gradient accumulation steps: {args.gradient_accumulation_steps}
+* Image resolution: {args.resolution}
+* Mixed-precision: {args.mixed_precision}
+
+"""
+    wandb_info = ""
+    if is_wandb_available():
+        wandb_run_url = None
+        if wandb.run is not None:
+            wandb_run_url = wandb.run.url
+
+    if wandb_run_url is not None:
+        wandb_info = f"""
+More information on all the CLI arguments and the environment are available on your [`wandb` run page]({wandb_run_url}).
+"""
+
+    model_card += wandb_info
+
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def log_validation(
+    image_encoder, image_processor, text_encoder, tokenizer, prior, args, accelerator, weight_dtype, epoch
+):
+    logger.info("Running validation... ")
+
+    pipeline = AutoPipelineForText2Image.from_pretrained(
+        args.pretrained_decoder_model_name_or_path,
+        prior_image_encoder=accelerator.unwrap_model(image_encoder),
+        prior_image_processor=image_processor,
+        prior_text_encoder=accelerator.unwrap_model(text_encoder),
+        prior_tokenizer=tokenizer,
+        prior_prior=accelerator.unwrap_model(prior),
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    images = []
+    for i in range(len(args.validation_prompts)):
+        with torch.autocast("cuda"):
+            image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0]
+
+        images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompts[i]}")
+                        for i, image in enumerate(images)
+                    ]
+                }
+            )
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of finetuning Kandinsky 2.2.")
+    parser.add_argument(
+        "--pretrained_decoder_model_name_or_path",
+        type=str,
+        default="kandinsky-community/kandinsky-2-2-decoder",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_prior_model_name_or_path",
+        type=str,
+        default="kandinsky-community/kandinsky-2-2-prior",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompts",
+        type=str,
+        default=None,
+        nargs="+",
+        help=("A set of prompts evaluated every `--validation_epochs` and logged to `--report_to`."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="kandi_2_2-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=1, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="learning rate",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument(
+        "--adam_weight_decay",
+        type=float,
+        default=0.0,
+        required=False,
+        help="weight decay_to_use",
+    )
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=5,
+        help="Run validation every X epochs.",
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load scheduler, image_processor, tokenizer and models.
+    noise_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", prediction_type="sample")
+    image_processor = CLIPImageProcessor.from_pretrained(
+        args.pretrained_prior_model_name_or_path, subfolder="image_processor"
+    )
+    tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_prior_model_name_or_path, subfolder="tokenizer")
+
+    def deepspeed_zero_init_disabled_context_manager():
+        """
+        returns either a context list that includes one that will disable zero.Init or an empty context list
+        """
+        deepspeed_plugin = AcceleratorState().deepspeed_plugin if accelerate.state.is_initialized() else None
+        if deepspeed_plugin is None:
+            return []
+
+        return [deepspeed_plugin.zero3_init_context_manager(enable=False)]
+
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+    with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            args.pretrained_prior_model_name_or_path, subfolder="image_encoder", torch_dtype=weight_dtype
+        ).eval()
+        text_encoder = CLIPTextModelWithProjection.from_pretrained(
+            args.pretrained_prior_model_name_or_path, subfolder="text_encoder", torch_dtype=weight_dtype
+        ).eval()
+
+    prior = PriorTransformer.from_pretrained(args.pretrained_prior_model_name_or_path, subfolder="prior")
+
+    # Freeze text_encoder and image_encoder
+    text_encoder.requires_grad_(False)
+    image_encoder.requires_grad_(False)
+
+    # Set prior to trainable.
+    prior.train()
+
+    # Create EMA for the prior.
+    if args.use_ema:
+        ema_prior = PriorTransformer.from_pretrained(args.pretrained_prior_model_name_or_path, subfolder="prior")
+        ema_prior = EMAModel(ema_prior.parameters(), model_cls=PriorTransformer, model_config=ema_prior.config)
+        ema_prior.to(accelerator.device)
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if args.use_ema:
+                ema_prior.save_pretrained(os.path.join(output_dir, "prior_ema"))
+
+            for i, model in enumerate(models):
+                model.save_pretrained(os.path.join(output_dir, "prior"))
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "prior_ema"), PriorTransformer)
+                ema_prior.load_state_dict(load_model.state_dict())
+                ema_prior.to(accelerator.device)
+                del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = PriorTransformer.from_pretrained(input_dir, subfolder="prior")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+    optimizer = optimizer_cls(
+        prior.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        text_input_ids = inputs.input_ids
+        text_mask = inputs.attention_mask.bool()
+        return text_input_ids, text_mask
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["clip_pixel_values"] = image_processor(images, return_tensors="pt").pixel_values
+        examples["text_input_ids"], examples["text_mask"] = tokenize_captions(examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        clip_pixel_values = torch.stack([example["clip_pixel_values"] for example in examples])
+        clip_pixel_values = clip_pixel_values.to(memory_format=torch.contiguous_format).float()
+        text_input_ids = torch.stack([example["text_input_ids"] for example in examples])
+        text_mask = torch.stack([example["text_mask"] for example in examples])
+        return {"clip_pixel_values": clip_pixel_values, "text_input_ids": text_input_ids, "text_mask": text_mask}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    clip_mean = prior.clip_mean.clone()
+    clip_std = prior.clip_std.clone()
+
+    prior, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        prior, optimizer, train_dataloader, lr_scheduler
+    )
+
+    image_encoder.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        tracker_config.pop("validation_prompts")
+        accelerator.init_trackers(args.tracker_project_name, tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    clip_mean = clip_mean.to(weight_dtype).to(accelerator.device)
+    clip_std = clip_std.to(weight_dtype).to(accelerator.device)
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(prior):
+                # Convert images to latent space
+                text_input_ids, text_mask, clip_images = (
+                    batch["text_input_ids"],
+                    batch["text_mask"],
+                    batch["clip_pixel_values"].to(weight_dtype),
+                )
+                with torch.no_grad():
+                    text_encoder_output = text_encoder(text_input_ids)
+                    prompt_embeds = text_encoder_output.text_embeds
+                    text_encoder_hidden_states = text_encoder_output.last_hidden_state
+
+                    image_embeds = image_encoder(clip_images).image_embeds
+                    # Sample noise that we'll add to the image_embeds
+                    noise = torch.randn_like(image_embeds)
+                    bsz = image_embeds.shape[0]
+                    # Sample a random timestep for each image
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=image_embeds.device
+                    )
+                    timesteps = timesteps.long()
+                    image_embeds = (image_embeds - clip_mean) / clip_std
+                    noisy_latents = noise_scheduler.add_noise(image_embeds, noise, timesteps)
+
+                    target = image_embeds
+
+                # Predict the noise residual and compute loss
+                model_pred = prior(
+                    noisy_latents,
+                    timestep=timesteps,
+                    proj_embedding=prompt_embeds,
+                    encoder_hidden_states=text_encoder_hidden_states,
+                    attention_mask=text_mask,
+                ).predicted_image_embedding
+
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(prior.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_prior.step(prior.parameters())
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompts is not None and epoch % args.validation_epochs == 0:
+                if args.use_ema:
+                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                    ema_prior.store(prior.parameters())
+                    ema_prior.copy_to(prior.parameters())
+                log_validation(
+                    image_encoder,
+                    image_processor,
+                    text_encoder,
+                    tokenizer,
+                    prior,
+                    args,
+                    accelerator,
+                    weight_dtype,
+                    global_step,
+                )
+                if args.use_ema:
+                    # Switch back to the original UNet parameters.
+                    ema_prior.restore(prior.parameters())
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        prior = accelerator.unwrap_model(prior)
+        if args.use_ema:
+            ema_prior.copy_to(prior.parameters())
+
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            args.pretrained_decoder_model_name_or_path,
+            prior_image_encoder=image_encoder,
+            prior_text_encoder=text_encoder,
+            prior_prior=prior,
+        )
+        pipeline.prior_pipe.save_pretrained(args.output_dir)
+
+        # Run a final round of inference.
+        images = []
+        if args.validation_prompts is not None:
+            logger.info("Running inference for collecting generated images...")
+            pipeline = pipeline.to(accelerator.device)
+            pipeline.torch_dtype = weight_dtype
+            pipeline.set_progress_bar_config(disable=True)
+
+            if args.seed is None:
+                generator = None
+            else:
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+            for i in range(len(args.validation_prompts)):
+                with torch.autocast("cuda"):
+                    image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0]
+                images.append(image)
+
+        if args.push_to_hub:
+            save_model_card(args, repo_id, images, repo_folder=args.output_dir)
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/reinforcement_learning/README.md b/diffusers/examples/reinforcement_learning/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3c3ada2031cf76678f7cdea0a53db27532951025
--- /dev/null
+++ b/diffusers/examples/reinforcement_learning/README.md
@@ -0,0 +1,22 @@
+# Overview
+
+These examples show how to run [Diffuser](https://arxiv.org/abs/2205.09991) in Diffusers.
+There are two ways to use the script, `run_diffuser_locomotion.py`.
+
+The key option is a change of the variable `n_guide_steps`.
+When `n_guide_steps=0`, the trajectories are sampled from the diffusion model, but not fine-tuned to maximize reward in the environment.
+By default, `n_guide_steps=2` to match the original implementation.
+
+
+You will need some RL specific requirements to run the examples:
+
+```sh
+pip install -f https://download.pytorch.org/whl/torch_stable.html \
+                free-mujoco-py \
+                einops \
+                gym==0.24.1 \
+                protobuf==3.20.1 \
+                git+https://github.com/rail-berkeley/d4rl.git \
+                mediapy \
+                Pillow==9.0.0
+```
diff --git a/diffusers/examples/reinforcement_learning/run_diffuser_locomotion.py b/diffusers/examples/reinforcement_learning/run_diffuser_locomotion.py
new file mode 100644
index 0000000000000000000000000000000000000000..adf6d1443d1c2e7caca7bdc1a26da1f2f186b8f9
--- /dev/null
+++ b/diffusers/examples/reinforcement_learning/run_diffuser_locomotion.py
@@ -0,0 +1,59 @@
+import d4rl  # noqa
+import gym
+import tqdm
+from diffusers.experimental import ValueGuidedRLPipeline
+
+
+config = {
+    "n_samples": 64,
+    "horizon": 32,
+    "num_inference_steps": 20,
+    "n_guide_steps": 2,  # can set to 0 for faster sampling, does not use value network
+    "scale_grad_by_std": True,
+    "scale": 0.1,
+    "eta": 0.0,
+    "t_grad_cutoff": 2,
+    "device": "cpu",
+}
+
+
+if __name__ == "__main__":
+    env_name = "hopper-medium-v2"
+    env = gym.make(env_name)
+
+    pipeline = ValueGuidedRLPipeline.from_pretrained(
+        "bglick13/hopper-medium-v2-value-function-hor32",
+        env=env,
+    )
+
+    env.seed(0)
+    obs = env.reset()
+    total_reward = 0
+    total_score = 0
+    T = 1000
+    rollout = [obs.copy()]
+    try:
+        for t in tqdm.tqdm(range(T)):
+            # call the policy
+            denorm_actions = pipeline(obs, planning_horizon=32)
+
+            # execute action in environment
+            next_observation, reward, terminal, _ = env.step(denorm_actions)
+            score = env.get_normalized_score(total_reward)
+
+            # update return
+            total_reward += reward
+            total_score += score
+            print(
+                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
+                f" {total_score}"
+            )
+
+            # save observations for rendering
+            rollout.append(next_observation.copy())
+
+            obs = next_observation
+    except KeyboardInterrupt:
+        pass
+
+    print(f"Total reward: {total_reward}")
diff --git a/diffusers/examples/research_projects/README.md b/diffusers/examples/research_projects/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bd0fec9690e6be01b989861510ddc4d7f7d4a099
--- /dev/null
+++ b/diffusers/examples/research_projects/README.md
@@ -0,0 +1,14 @@
+# Research projects
+
+This folder contains various research projects using 🧨 Diffusers.
+They are not really maintained by the core maintainers of this library and often require a specific version of Diffusers that is indicated in the requirements file of each folder.
+Updating them to the most recent version of the library will require some work.
+
+To use any of them, just run the command
+
+```sh
+pip install -r requirements.txt
+```
+inside the folder of your choice.
+
+If you need help with any of those, please open an issue where you directly ping the author(s), as indicated at the top of the README of each folder.
diff --git a/diffusers/examples/research_projects/colossalai/README.md b/diffusers/examples/research_projects/colossalai/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..be94950b772eaaac994104f0787d9ffbfc769f63
--- /dev/null
+++ b/diffusers/examples/research_projects/colossalai/README.md
@@ -0,0 +1,111 @@
+# [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) by [colossalai](https://github.com/hpcaitech/ColossalAI.git)
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject.
+The `train_dreambooth_colossalai.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+By accommodating model data in CPU and GPU and moving the data to the computing device when necessary, [Gemini](https://www.colossalai.org/docs/advanced_tutorials/meet_gemini), the Heterogeneous Memory Manager of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) can breakthrough the GPU memory wall by using GPU and CPU memory (composed of CPU DRAM or nvme SSD memory) together at the same time. Moreover, the model scale can be further improved by combining heterogeneous training with the other parallel approaches, such as data parallel, tensor parallel and pipeline parallel.
+
+## Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Install [ColossalAI](https://github.com/hpcaitech/ColossalAI.git)
+
+**From PyPI**
+```bash
+pip install colossalai
+```
+
+**From source**
+
+```bash
+git clone https://github.com/hpcaitech/ColossalAI.git
+cd ColossalAI
+
+# install colossalai
+pip install .
+```
+
+## Dataset for Teyvat BLIP captions
+Dataset used to train [Teyvat characters text to image model](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion).
+
+BLIP generated captions for characters images from [genshin-impact fandom wiki](https://genshin-impact.fandom.com/wiki/Character#Playable_Characters)and [biligame wiki for genshin impact](https://wiki.biligame.com/ys/%E8%A7%92%E8%89%B2).
+
+For each row the dataset contains `image` and `text` keys. `image` is a varying size PIL png, and `text` is the accompanying text caption. Only a train split is provided.
+
+The `text` include the tag `Teyvat`, `Name`,`Element`, `Weapon`, `Region`, `Model type`, and `Description`, the `Description` is captioned with the [pre-trained BLIP model](https://github.com/salesforce/BLIP).
+
+## Training
+
+The argument `placement` can be `cpu`, `auto`, `cuda`, with `cpu` the GPU RAM required can be minimized to 4GB but will deceleration, with `cuda` you can also reduce GPU memory by half but accelerated training， with `auto` a more balanced solution for speed and memory can be obtained。
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export OUTPUT_DIR="path-to-save-model"
+
+torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=400 \
+  --placement="cuda"
+```
+
+
+### Training with prior-preservation loss
+
+Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=800 \
+  --placement="cuda"
+```
+
+## Inference
+
+Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_id = "path-to-save-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "A photo of sks dog in a bucket"
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("dog-bucket.png")
+```
diff --git a/diffusers/examples/research_projects/colossalai/inference.py b/diffusers/examples/research_projects/colossalai/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b115c2d2b8f5bcdb3a0c053a6c71b91a965c573
--- /dev/null
+++ b/diffusers/examples/research_projects/colossalai/inference.py
@@ -0,0 +1,12 @@
+import torch
+
+from diffusers import StableDiffusionPipeline
+
+
+model_id = "path-to-your-trained-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "A photo of sks dog in a bucket"
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("dog-bucket.png")
diff --git a/diffusers/examples/research_projects/colossalai/requirement.txt b/diffusers/examples/research_projects/colossalai/requirement.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f80467dcff521bfed1fa72109e1e01e92ab05646
--- /dev/null
+++ b/diffusers/examples/research_projects/colossalai/requirement.txt
@@ -0,0 +1,7 @@
+diffusers
+torch
+torchvision
+ftfy
+tensorboard
+Jinja2
+transformers
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/colossalai/train_dreambooth_colossalai.py b/diffusers/examples/research_projects/colossalai/train_dreambooth_colossalai.py
new file mode 100644
index 0000000000000000000000000000000000000000..10c8e095a69652eb47e01ed7d1368dbdf9ed735c
--- /dev/null
+++ b/diffusers/examples/research_projects/colossalai/train_dreambooth_colossalai.py
@@ -0,0 +1,671 @@
+import argparse
+import math
+import os
+from pathlib import Path
+
+import colossalai
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
+from colossalai.nn.parallel.utils import get_static_torch_model
+from colossalai.utils import get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+
+
+disable_existing_loggers()
+logger = get_dist_logger()
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+
+        return RobertaSeriesModelWithTransformation
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default="a photo of sks dog",
+        required=False,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--placement",
+        type=str,
+        default="cpu",
+        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        if args.class_data_dir is not None:
+            logger.warning("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            logger.warning("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        size=512,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        example["instance_images"] = self.image_transforms(instance_image)
+        example["instance_prompt_ids"] = self.tokenizer(
+            self.instance_prompt,
+            padding="do_not_pad",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+        ).input_ids
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt_ids"] = self.tokenizer(
+                self.class_prompt,
+                padding="do_not_pad",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids
+
+        return example
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+# Gemini + ZeRO DDP
+def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"):
+    from colossalai.nn.parallel import GeminiDDP
+
+    model = GeminiDDP(
+        model, device=get_current_device(), placement_policy=placememt_policy, pin_memory=True, search_range_mb=64
+    )
+    return model
+
+
+def main(args):
+    if args.seed is None:
+        colossalai.launch_from_torch(config={})
+    else:
+        colossalai.launch_from_torch(config={}, seed=args.seed)
+
+    local_rank = gpc.get_local_rank(ParallelMode.DATA)
+    world_size = gpc.get_world_size(ParallelMode.DATA)
+
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                safety_checker=None,
+                revision=args.revision,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            pipeline.to(get_current_device())
+
+            for example in tqdm(
+                sample_dataloader,
+                desc="Generating class images",
+                disable=not local_rank == 0,
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+
+    # Handle the repository creation
+    if local_rank == 0:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        logger.info(f"Loading tokenizer from {args.tokenizer_name}", ranks=[0])
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.tokenizer_name,
+            revision=args.revision,
+            use_fast=False,
+        )
+    elif args.pretrained_model_name_or_path:
+        logger.info("Loading tokenizer from pretrained model", ranks=[0])
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+        # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path)
+
+    # Load models and create wrapper for stable diffusion
+
+    logger.info(f"Loading text_encoder from {args.pretrained_model_name_or_path}", ranks=[0])
+
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+    )
+
+    logger.info(f"Loading AutoencoderKL from {args.pretrained_model_name_or_path}", ranks=[0])
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+    )
+
+    logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0])
+    with ColoInitContext(device=get_current_device()):
+        unet = UNet2DConditionModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, low_cpu_mem_usage=False
+        )
+
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * args.train_batch_size * world_size
+
+    unet = gemini_zero_dpp(unet, args.placement)
+
+    # config optimizer for colossalai zero
+    optimizer = GeminiAdamOptimizer(unet, lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm)
+
+    # load noise_scheduler
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    # prepare dataset
+    logger.info(f"Prepare dataset from {args.instance_data_dir}", ranks=[0])
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+    )
+
+    def collate_fn(examples):
+        input_ids = [example["instance_prompt_ids"] for example in examples]
+        pixel_values = [example["instance_images"] for example in examples]
+
+        # Concat class and instance examples for prior preservation.
+        # We do this to avoid doing two forward passes.
+        if args.with_prior_preservation:
+            input_ids += [example["class_prompt_ids"] for example in examples]
+            pixel_values += [example["class_images"] for example in examples]
+
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+        input_ids = tokenizer.pad(
+            {"input_ids": input_ids},
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids
+
+        batch = {
+            "input_ids": input_ids,
+            "pixel_values": pixel_values,
+        }
+        return batch
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn, num_workers=1
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+    weight_dtype = torch.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move text_encode and vae to gpu.
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    vae.to(get_current_device(), dtype=weight_dtype)
+    text_encoder.to(get_current_device(), dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # Train!
+    total_batch_size = args.train_batch_size * world_size
+
+    logger.info("***** Running training *****", ranks=[0])
+    logger.info(f"  Num examples = {len(train_dataset)}", ranks=[0])
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}", ranks=[0])
+    logger.info(f"  Num Epochs = {args.num_train_epochs}", ranks=[0])
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}", ranks=[0])
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}", ranks=[0])
+    logger.info(f"  Total optimization steps = {args.max_train_steps}", ranks=[0])
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not local_rank == 0)
+    progress_bar.set_description("Steps")
+    global_step = 0
+
+    torch.cuda.synchronize()
+    for epoch in range(args.num_train_epochs):
+        unet.train()
+        for step, batch in enumerate(train_dataloader):
+            torch.cuda.reset_peak_memory_stats()
+            # Move batch to gpu
+            for key, value in batch.items():
+                batch[key] = value.to(get_current_device(), non_blocking=True)
+
+            # Convert images to latent space
+            optimizer.zero_grad()
+
+            latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+            latents = latents * 0.18215
+
+            # Sample noise that we'll add to the latents
+            noise = torch.randn_like(latents)
+            bsz = latents.shape[0]
+            # Sample a random timestep for each image
+            timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+            timesteps = timesteps.long()
+
+            # Add noise to the latents according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+            # Get the text embedding for conditioning
+            encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+            # Predict the noise residual
+            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+            # Get the target for loss depending on the prediction type
+            if noise_scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif noise_scheduler.config.prediction_type == "v_prediction":
+                target = noise_scheduler.get_velocity(latents, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+            if args.with_prior_preservation:
+                # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                target, target_prior = torch.chunk(target, 2, dim=0)
+
+                # Compute instance loss
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
+
+                # Compute prior loss
+                prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                # Add the prior loss to the instance loss.
+                loss = loss + args.prior_loss_weight * prior_loss
+            else:
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+            optimizer.backward(loss)
+
+            optimizer.step()
+            lr_scheduler.step()
+            logger.info(f"max GPU_mem cost is {torch.cuda.max_memory_allocated()/2**20} MB", ranks=[0])
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            progress_bar.update(1)
+            global_step += 1
+            logs = {
+                "loss": loss.detach().item(),
+                "lr": optimizer.param_groups[0]["lr"],
+            }  # lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step % args.save_steps == 0:
+                torch.cuda.synchronize()
+                torch_unet = get_static_torch_model(unet)
+                if local_rank == 0:
+                    pipeline = DiffusionPipeline.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        unet=torch_unet,
+                        revision=args.revision,
+                    )
+                    save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                    pipeline.save_pretrained(save_path)
+                    logger.info(f"Saving model checkpoint to {save_path}", ranks=[0])
+            if global_step >= args.max_train_steps:
+                break
+
+    torch.cuda.synchronize()
+    unet = get_static_torch_model(unet)
+
+    if local_rank == 0:
+        pipeline = DiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            unet=unet,
+            revision=args.revision,
+        )
+
+        pipeline.save_pretrained(args.output_dir)
+        logger.info(f"Saving model checkpoint to {args.output_dir}", ranks=[0])
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/consistency_training/README.md b/diffusers/examples/research_projects/consistency_training/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..67eb83514192cf33fc33bebd01559efcf6744387
--- /dev/null
+++ b/diffusers/examples/research_projects/consistency_training/README.md
@@ -0,0 +1,24 @@
+# Consistency Training
+
+`train_cm_ct_unconditional.py` trains a consistency model (CM) from scratch following the consistency training (CT) algorithm introduced in [Consistency Models](https://arxiv.org/abs/2303.01469) and refined in [Improved Techniques for Training Consistency Models](https://arxiv.org/abs/2310.14189). Both unconditional and class-conditional training are supported.
+
+A usage example is as follows:
+
+```bash
+accelerate launch examples/research_projects/consistency_training/train_cm_ct_unconditional.py \
+    --dataset_name="cifar10" \
+    --dataset_image_column_name="img" \
+    --output_dir="/path/to/output/dir" \
+    --mixed_precision=fp16 \
+    --resolution=32 \
+    --max_train_steps=1000 --max_train_samples=10000 \
+    --dataloader_num_workers=8 \
+    --noise_precond_type="cm" --input_precond_type="cm" \
+    --train_batch_size=4 \
+    --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
+    --use_8bit_adam \
+    --use_ema \
+    --validation_steps=100 --eval_batch_size=4 \
+    --checkpointing_steps=100 --checkpoints_total_limit=10 \
+    --class_conditional --num_classes=10 \
+```
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/consistency_training/requirements.txt b/diffusers/examples/research_projects/consistency_training/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e19b0ce60bf407bec21a9b85f9232cad957bfa6f
--- /dev/null
+++ b/diffusers/examples/research_projects/consistency_training/requirements.txt
@@ -0,0 +1,6 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/consistency_training/train_cm_ct_unconditional.py b/diffusers/examples/research_projects/consistency_training/train_cm_ct_unconditional.py
new file mode 100644
index 0000000000000000000000000000000000000000..eccc539f230c672ce309751cb277ecb315c22639
--- /dev/null
+++ b/diffusers/examples/research_projects/consistency_training/train_cm_ct_unconditional.py
@@ -0,0 +1,1438 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+"""Script to train a consistency model from scratch via (improved) consistency training."""
+
+import argparse
+import gc
+import logging
+import math
+import os
+import shutil
+from datetime import timedelta
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import torch
+from accelerate import Accelerator, InitProcessGroupKwargs
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torchvision import transforms
+from tqdm.auto import tqdm
+
+import diffusers
+from diffusers import (
+    CMStochasticIterativeScheduler,
+    ConsistencyModelPipeline,
+    UNet2DModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, resolve_interpolation_mode
+from diffusers.utils import is_tensorboard_available, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    if not isinstance(arr, torch.Tensor):
+        arr = torch.from_numpy(arr)
+    res = arr[timesteps].float().to(timesteps.device)
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res.expand(broadcast_shape)
+
+
+def append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
+    return x[(...,) + (None,) * dims_to_append]
+
+
+def extract_into_tensor(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+def get_discretization_steps(global_step: int, max_train_steps: int, s_0: int = 10, s_1: int = 1280, constant=False):
+    """
+    Calculates the current discretization steps at global step k using the discretization curriculum N(k).
+    """
+    if constant:
+        return s_0 + 1
+
+    k_prime = math.floor(max_train_steps / (math.log2(math.floor(s_1 / s_0)) + 1))
+    num_discretization_steps = min(s_0 * 2 ** math.floor(global_step / k_prime), s_1) + 1
+
+    return num_discretization_steps
+
+
+def get_skip_steps(global_step, initial_skip: int = 1):
+    # Currently only support constant skip curriculum.
+    return initial_skip
+
+
+def get_karras_sigmas(
+    num_discretization_steps: int,
+    sigma_min: float = 0.002,
+    sigma_max: float = 80.0,
+    rho: float = 7.0,
+    dtype=torch.float32,
+):
+    """
+    Calculates the Karras sigmas timestep discretization of [sigma_min, sigma_max].
+    """
+    ramp = np.linspace(0, 1, num_discretization_steps)
+    min_inv_rho = sigma_min ** (1 / rho)
+    max_inv_rho = sigma_max ** (1 / rho)
+    sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+    # Make sure sigmas are in increasing rather than decreasing order (see section 2 of the iCT paper)
+    sigmas = sigmas[::-1].copy()
+    sigmas = torch.from_numpy(sigmas).to(dtype=dtype)
+    return sigmas
+
+
+def get_discretized_lognormal_weights(noise_levels: torch.Tensor, p_mean: float = -1.1, p_std: float = 2.0):
+    """
+    Calculates the unnormalized weights for a 1D array of noise level sigma_i based on the discretized lognormal"
+    " distribution used in the iCT paper (given in Equation 10).
+    """
+    upper_prob = torch.special.erf((torch.log(noise_levels[1:]) - p_mean) / (math.sqrt(2) * p_std))
+    lower_prob = torch.special.erf((torch.log(noise_levels[:-1]) - p_mean) / (math.sqrt(2) * p_std))
+    weights = upper_prob - lower_prob
+    return weights
+
+
+def get_loss_weighting_schedule(noise_levels: torch.Tensor):
+    """
+    Calculates the loss weighting schedule lambda given a set of noise levels.
+    """
+    return 1.0 / (noise_levels[1:] - noise_levels[:-1])
+
+
+def add_noise(original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor):
+    # Make sure timesteps (Karras sigmas) have the same device and dtype as original_samples
+    sigmas = timesteps.to(device=original_samples.device, dtype=original_samples.dtype)
+    while len(sigmas.shape) < len(original_samples.shape):
+        sigmas = sigmas.unsqueeze(-1)
+
+    noisy_samples = original_samples + noise * sigmas
+
+    return noisy_samples
+
+
+def get_noise_preconditioning(sigmas, noise_precond_type: str = "cm"):
+    """
+    Calculates the noise preconditioning function c_noise, which is used to transform the raw Karras sigmas into the
+    timestep input for the U-Net.
+    """
+    if noise_precond_type == "none":
+        return sigmas
+    elif noise_precond_type == "edm":
+        return 0.25 * torch.log(sigmas)
+    elif noise_precond_type == "cm":
+        return 1000 * 0.25 * torch.log(sigmas + 1e-44)
+    else:
+        raise ValueError(
+            f"Noise preconditioning type {noise_precond_type} is not current supported. Currently supported noise"
+            f" preconditioning types are `none` (which uses the sigmas as is), `edm`, and `cm`."
+        )
+
+
+def get_input_preconditioning(sigmas, sigma_data=0.5, input_precond_type: str = "cm"):
+    """
+    Calculates the input preconditioning factor c_in, which is used to scale the U-Net image input.
+    """
+    if input_precond_type == "none":
+        return 1
+    elif input_precond_type == "cm":
+        return 1.0 / (sigmas**2 + sigma_data**2)
+    else:
+        raise ValueError(
+            f"Input preconditioning type {input_precond_type} is not current supported. Currently supported input"
+            f" preconditioning types are `none` (which uses a scaling factor of 1.0) and `cm`."
+        )
+
+
+def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=1.0):
+    scaled_timestep = timestep_scaling * timestep
+    c_skip = sigma_data**2 / (scaled_timestep**2 + sigma_data**2)
+    c_out = scaled_timestep / (scaled_timestep**2 + sigma_data**2) ** 0.5
+    return c_skip, c_out
+
+
+def log_validation(unet, scheduler, args, accelerator, weight_dtype, step, name="teacher"):
+    logger.info("Running validation... ")
+
+    unet = accelerator.unwrap_model(unet)
+    pipeline = ConsistencyModelPipeline(
+        unet=unet,
+        scheduler=scheduler,
+    )
+    pipeline = pipeline.to(device=accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    class_labels = [None]
+    if args.class_conditional:
+        if args.num_classes is not None:
+            class_labels = list(range(args.num_classes))
+        else:
+            logger.warning(
+                "The model is class-conditional but the number of classes is not set. The generated images will be"
+                " unconditional rather than class-conditional."
+            )
+
+    image_logs = []
+
+    for class_label in class_labels:
+        images = []
+        with torch.autocast("cuda"):
+            images = pipeline(
+                num_inference_steps=1,
+                batch_size=args.eval_batch_size,
+                class_labels=[class_label] * args.eval_batch_size,
+                generator=generator,
+            ).images
+        log = {"images": images}
+        if args.class_conditional and class_label is not None:
+            log["class_label"] = str(class_label)
+        else:
+            log["class_label"] = "images"
+        image_logs.append(log)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                class_label = log["class_label"]
+                formatted_images = []
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(class_label, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                class_label = log["class_label"]
+                for image in images:
+                    image = wandb.Image(image, caption=class_label)
+                    formatted_images.append(image)
+
+            tracker.log({f"validation/{name}": formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+    del pipeline
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    return image_logs
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    # ------------Model Arguments-----------
+    parser.add_argument(
+        "--model_config_name_or_path",
+        type=str,
+        default=None,
+        help="The config of the UNet model to train, leave as None to use standard DDPM configuration.",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        help=(
+            "If initializing the weights from a pretrained model, the path to the pretrained model or model identifier"
+            " from huggingface.co/models."
+        ),
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help=(
+            "Variant of the model files of the pretrained model identifier from huggingface.co/models, e.g. `fp16`,"
+            " `non_ema`, etc.",
+        ),
+    )
+    # ------------Dataset Arguments-----------
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that HF Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--dataset_image_column_name",
+        type=str,
+        default="image",
+        help="The name of the image column in the dataset to use for training.",
+    )
+    parser.add_argument(
+        "--dataset_class_label_column_name",
+        type=str,
+        default="label",
+        help="If doing class-conditional training, the name of the class label column in the dataset to use.",
+    )
+    # ------------Image Processing Arguments-----------
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=64,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--interpolation_type",
+        type=str,
+        default="bilinear",
+        help=(
+            "The interpolation function used when resizing images to the desired resolution. Choose between `bilinear`,"
+            " `bicubic`, `box`, `nearest`, `nearest_exact`, `hamming`, and `lanczos`."
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        default=False,
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--class_conditional",
+        action="store_true",
+        help=(
+            "Whether to train a class-conditional model. If set, the class labels will be taken from the `label`"
+            " column of the provided dataset."
+        ),
+    )
+    parser.add_argument(
+        "--num_classes",
+        type=int,
+        default=None,
+        help="The number of classes in the training data, if training a class-conditional model.",
+    )
+    parser.add_argument(
+        "--class_embed_type",
+        type=str,
+        default=None,
+        help=(
+            "The class embedding type to use. Choose from `None`, `identity`, and `timestep`. If `class_conditional`"
+            " and `num_classes` and set, but `class_embed_type` is `None`, a embedding matrix will be used."
+        ),
+    )
+    # ------------Dataloader Arguments-----------
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "The number of subprocesses to use for data loading. 0 means that the data will be loaded in the main"
+            " process."
+        ),
+    )
+    # ------------Training Arguments-----------
+    # ----General Training Arguments----
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="ddpm-model-64",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--overwrite_output_dir", action="store_true")
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    # ----Batch Size and Training Length----
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    # ----Learning Rate----
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="cosine",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    # ----Optimizer (Adam) Arguments----
+    parser.add_argument(
+        "--optimizer_type",
+        type=str,
+        default="adamw",
+        help=(
+            "The optimizer algorithm to use for training. Choose between `radam` and `adamw`. The iCT paper uses"
+            " RAdam."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.95, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument(
+        "--adam_weight_decay", type=float, default=1e-6, help="Weight decay magnitude for the Adam optimizer."
+    )
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    # ----Consistency Training (CT) Specific Arguments----
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default="sample",
+        choices=["sample"],
+        help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.",
+    )
+    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
+    parser.add_argument("--ddpm_num_inference_steps", type=int, default=1000)
+    parser.add_argument("--ddpm_beta_schedule", type=str, default="linear")
+    parser.add_argument(
+        "--sigma_min",
+        type=float,
+        default=0.002,
+        help=(
+            "The lower boundary for the timestep discretization, which should be set to a small positive value close"
+            " to zero to avoid numerical issues when solving the PF-ODE backwards in time."
+        ),
+    )
+    parser.add_argument(
+        "--sigma_max",
+        type=float,
+        default=80.0,
+        help=(
+            "The upper boundary for the timestep discretization, which also determines the variance of the Gaussian"
+            " prior."
+        ),
+    )
+    parser.add_argument(
+        "--rho",
+        type=float,
+        default=7.0,
+        help="The rho parameter for the Karras sigmas timestep dicretization.",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=None,
+        help=(
+            "The Pseudo-Huber loss parameter c. If not set, this will default to the value recommended in the Improved"
+            " Consistency Training (iCT) paper of 0.00054 * sqrt(d), where d is the data dimensionality."
+        ),
+    )
+    parser.add_argument(
+        "--discretization_s_0",
+        type=int,
+        default=10,
+        help=(
+            "The s_0 parameter in the discretization curriculum N(k). This controls the number of training steps after"
+            " which the number of discretization steps N will be doubled."
+        ),
+    )
+    parser.add_argument(
+        "--discretization_s_1",
+        type=int,
+        default=1280,
+        help=(
+            "The s_1 parameter in the discretization curriculum N(k). This controls the upper limit to the number of"
+            " discretization steps used. Increasing this value will reduce the bias at the cost of higher variance."
+        ),
+    )
+    parser.add_argument(
+        "--constant_discretization_steps",
+        action="store_true",
+        help=(
+            "Whether to set the discretization curriculum N(k) to be the constant value `discretization_s_0 + 1`. This"
+            " is useful for testing when `max_number_steps` is small, when `k_prime` would otherwise be 0, causing"
+            " a divide-by-zero error."
+        ),
+    )
+    parser.add_argument(
+        "--p_mean",
+        type=float,
+        default=-1.1,
+        help=(
+            "The mean parameter P_mean for the (discretized) lognormal noise schedule, which controls the probability"
+            " of sampling a (discrete) noise level sigma_i."
+        ),
+    )
+    parser.add_argument(
+        "--p_std",
+        type=float,
+        default=2.0,
+        help=(
+            "The standard deviation parameter P_std for the (discretized) noise schedule, which controls the"
+            " probability of sampling a (discrete) noise level sigma_i."
+        ),
+    )
+    parser.add_argument(
+        "--noise_precond_type",
+        type=str,
+        default="cm",
+        help=(
+            "The noise preconditioning function to use for transforming the raw Karras sigmas into the timestep"
+            " argument of the U-Net. Choose between `none` (the identity function), `edm`, and `cm`."
+        ),
+    )
+    parser.add_argument(
+        "--input_precond_type",
+        type=str,
+        default="cm",
+        help=(
+            "The input preconditioning function to use for scaling the image input of the U-Net. Choose between `none`"
+            " (a scaling factor of 1) and `cm`."
+        ),
+    )
+    parser.add_argument(
+        "--skip_steps",
+        type=int,
+        default=1,
+        help=(
+            "The gap in indices between the student and teacher noise levels. In the iCT paper this is always set to"
+            " 1, but theoretically this could be greater than 1 and/or altered according to a curriculum throughout"
+            " training, much like the number of discretization steps is."
+        ),
+    )
+    parser.add_argument(
+        "--cast_teacher",
+        action="store_true",
+        help="Whether to cast the teacher U-Net model to `weight_dtype` or leave it in full precision.",
+    )
+    # ----Exponential Moving Average (EMA) Arguments----
+    parser.add_argument(
+        "--use_ema",
+        action="store_true",
+        help="Whether to use Exponential Moving Average for the final model weights.",
+    )
+    parser.add_argument(
+        "--ema_min_decay",
+        type=float,
+        default=None,
+        help=(
+            "The minimum decay magnitude for EMA. If not set, this will default to the value of `ema_max_decay`,"
+            " resulting in a constant EMA decay rate."
+        ),
+    )
+    parser.add_argument(
+        "--ema_max_decay",
+        type=float,
+        default=0.99993,
+        help=(
+            "The maximum decay magnitude for EMA. Setting `ema_min_decay` equal to this value will result in a"
+            " constant decay rate."
+        ),
+    )
+    parser.add_argument(
+        "--use_ema_warmup",
+        action="store_true",
+        help="Whether to use EMA warmup.",
+    )
+    parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.")
+    parser.add_argument("--ema_power", type=float, default=3 / 4, help="The power value for the EMA decay.")
+    # ----Training Optimization Arguments----
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    # ----Distributed Training Arguments----
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    # ------------Validation Arguments-----------
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=200,
+        help="Run validation every X steps.",
+    )
+    parser.add_argument(
+        "--eval_batch_size",
+        type=int,
+        default=16,
+        help=(
+            "The number of images to generate for evaluation. Note that if `class_conditional` and `num_classes` is"
+            " set the effective number of images generated per evaluation step is `eval_batch_size * num_classes`."
+        ),
+    )
+    parser.add_argument("--save_images_epochs", type=int, default=10, help="How often to save images during training.")
+    # ------------Validation Arguments-----------
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--save_model_epochs", type=int, default=10, help="How often to save the model during training."
+    )
+    # ------------Logging Arguments-----------
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    # ------------HuggingFace Hub Arguments-----------
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--hub_private_repo", action="store_true", help="Whether or not to create a private repository."
+    )
+    # ------------Accelerate Arguments-----------
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="consistency-training",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
+
+    return args
+
+
+def main(args):
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=7200))  # a big number for high resolution or big dataset
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    if args.report_to == "tensorboard":
+        if not is_tensorboard_available():
+            raise ImportError("Make sure to install tensorboard if you want to use it for logging during training.")
+
+    elif args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # 1. Initialize the noise scheduler.
+    initial_discretization_steps = get_discretization_steps(
+        0,
+        args.max_train_steps,
+        s_0=args.discretization_s_0,
+        s_1=args.discretization_s_1,
+        constant=args.constant_discretization_steps,
+    )
+    noise_scheduler = CMStochasticIterativeScheduler(
+        num_train_timesteps=initial_discretization_steps,
+        sigma_min=args.sigma_min,
+        sigma_max=args.sigma_max,
+        rho=args.rho,
+    )
+
+    # 2. Initialize the student U-Net model.
+    if args.pretrained_model_name_or_path is not None:
+        logger.info(f"Loading pretrained U-Net weights from {args.pretrained_model_name_or_path}... ")
+        unet = UNet2DModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+        )
+    elif args.model_config_name_or_path is None:
+        # TODO: use default architectures from iCT paper
+        if not args.class_conditional and (args.num_classes is not None or args.class_embed_type is not None):
+            logger.warning(
+                f"`--class_conditional` is set to `False` but `--num_classes` is set to {args.num_classes} and"
+                f" `--class_embed_type` is set to {args.class_embed_type}. These values will be overridden to `None`."
+            )
+            args.num_classes = None
+            args.class_embed_type = None
+        elif args.class_conditional and args.num_classes is None and args.class_embed_type is None:
+            logger.warning(
+                "`--class_conditional` is set to `True` but neither `--num_classes` nor `--class_embed_type` is set."
+                "`class_conditional` will be overridden to `False`."
+            )
+            args.class_conditional = False
+        unet = UNet2DModel(
+            sample_size=args.resolution,
+            in_channels=3,
+            out_channels=3,
+            layers_per_block=2,
+            block_out_channels=(128, 128, 256, 256, 512, 512),
+            down_block_types=(
+                "DownBlock2D",
+                "DownBlock2D",
+                "DownBlock2D",
+                "DownBlock2D",
+                "AttnDownBlock2D",
+                "DownBlock2D",
+            ),
+            up_block_types=(
+                "UpBlock2D",
+                "AttnUpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+            ),
+            class_embed_type=args.class_embed_type,
+            num_class_embeds=args.num_classes,
+        )
+    else:
+        config = UNet2DModel.load_config(args.model_config_name_or_path)
+        unet = UNet2DModel.from_config(config)
+    unet.train()
+
+    # Create EMA for the student U-Net model.
+    if args.use_ema:
+        if args.ema_min_decay is None:
+            args.ema_min_decay = args.ema_max_decay
+        ema_unet = EMAModel(
+            unet.parameters(),
+            decay=args.ema_max_decay,
+            min_decay=args.ema_min_decay,
+            use_ema_warmup=args.use_ema_warmup,
+            inv_gamma=args.ema_inv_gamma,
+            power=args.ema_power,
+            model_cls=UNet2DModel,
+            model_config=unet.config,
+        )
+
+    # 3. Initialize the teacher U-Net model from the student U-Net model.
+    # Note that following the improved Consistency Training paper, the teacher U-Net is not updated via EMA (e.g. the
+    # EMA decay rate is 0.)
+    teacher_unet = UNet2DModel.from_config(unet.config)
+    teacher_unet.load_state_dict(unet.state_dict())
+    teacher_unet.train()
+    teacher_unet.requires_grad_(False)
+
+    # 4. Handle mixed precision and device placement
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+        args.mixed_precision = accelerator.mixed_precision
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+        args.mixed_precision = accelerator.mixed_precision
+
+    # Cast teacher_unet to weight_dtype if cast_teacher is set.
+    if args.cast_teacher:
+        teacher_dtype = weight_dtype
+    else:
+        teacher_dtype = torch.float32
+
+    teacher_unet.to(accelerator.device)
+    if args.use_ema:
+        ema_unet.to(accelerator.device)
+
+    # 5. Handle saving and loading of checkpoints.
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                teacher_unet.save_pretrained(os.path.join(output_dir, "unet_teacher"))
+                if args.use_ema:
+                    ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            load_model = UNet2DModel.from_pretrained(os.path.join(input_dir, "unet_teacher"))
+            teacher_unet.load_state_dict(load_model.state_dict())
+            teacher_unet.to(accelerator.device)
+            del load_model
+
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel)
+                ema_unet.load_state_dict(load_model.state_dict())
+                ema_unet.to(accelerator.device)
+                del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # 6. Enable optimizations
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+            teacher_unet.enable_xformers_memory_efficient_attention()
+            if args.use_ema:
+                ema_unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    if args.optimizer_type == "radam":
+        optimizer_class = torch.optim.RAdam
+    elif args.optimizer_type == "adamw":
+        # Use 8-bit Adam for lower memory usage or to fine-tune the model for 16GB GPUs
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+    else:
+        raise ValueError(
+            f"Optimizer type {args.optimizer_type} is not supported. Currently supported optimizer types are `radam`"
+            f" and `adamw`."
+        )
+
+    # 7. Initialize the optimizer
+    optimizer = optimizer_class(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # 8. Dataset creation and data preprocessing
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            split="train",
+        )
+    else:
+        dataset = load_dataset("imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, split="train")
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets and DataLoaders creation.
+    interpolation_mode = resolve_interpolation_mode(args.interpolation_type)
+    augmentations = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=interpolation_mode),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def transform_images(examples):
+        images = [augmentations(image.convert("RGB")) for image in examples[args.dataset_image_column_name]]
+        batch_dict = {"images": images}
+        if args.class_conditional:
+            batch_dict["class_labels"] = examples[args.dataset_class_label_column_name]
+        return batch_dict
+
+    logger.info(f"Dataset size: {len(dataset)}")
+
+    dataset.set_transform(transform_images)
+    train_dataloader = torch.utils.data.DataLoader(
+        dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
+    )
+
+    # 9. Initialize the learning rate scheduler
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # 10. Prepare for training
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    def recalculate_num_discretization_step_values(discretization_steps, skip_steps):
+        """
+        Recalculates all quantities depending on the number of discretization steps N.
+        """
+        noise_scheduler = CMStochasticIterativeScheduler(
+            num_train_timesteps=discretization_steps,
+            sigma_min=args.sigma_min,
+            sigma_max=args.sigma_max,
+            rho=args.rho,
+        )
+        current_timesteps = get_karras_sigmas(discretization_steps, args.sigma_min, args.sigma_max, args.rho)
+        valid_teacher_timesteps_plus_one = current_timesteps[: len(current_timesteps) - skip_steps + 1]
+        # timestep_weights are the unnormalized probabilities of sampling the timestep/noise level at each index
+        timestep_weights = get_discretized_lognormal_weights(
+            valid_teacher_timesteps_plus_one, p_mean=args.p_mean, p_std=args.p_std
+        )
+        # timestep_loss_weights is the timestep-dependent loss weighting schedule lambda(sigma_i)
+        timestep_loss_weights = get_loss_weighting_schedule(valid_teacher_timesteps_plus_one)
+
+        current_timesteps = current_timesteps.to(accelerator.device)
+        timestep_weights = timestep_weights.to(accelerator.device)
+        timestep_loss_weights = timestep_loss_weights.to(accelerator.device)
+
+        return noise_scheduler, current_timesteps, timestep_weights, timestep_loss_weights
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    # Function for unwraping if torch.compile() was used in accelerate.
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    # Resolve the c parameter for the Pseudo-Huber loss
+    if args.huber_c is None:
+        args.huber_c = 0.00054 * args.resolution * math.sqrt(unwrap_model(unet).config.in_channels)
+
+    # Get current number of discretization steps N according to our discretization curriculum
+    current_discretization_steps = get_discretization_steps(
+        initial_global_step,
+        args.max_train_steps,
+        s_0=args.discretization_s_0,
+        s_1=args.discretization_s_1,
+        constant=args.constant_discretization_steps,
+    )
+    current_skip_steps = get_skip_steps(initial_global_step, initial_skip=args.skip_steps)
+    if current_skip_steps >= current_discretization_steps:
+        raise ValueError(
+            f"The current skip steps is {current_skip_steps}, but should be smaller than the current number of"
+            f" discretization steps {current_discretization_steps}"
+        )
+    # Recalculate all quantities depending on the number of discretization steps N
+    (
+        noise_scheduler,
+        current_timesteps,
+        timestep_weights,
+        timestep_loss_weights,
+    ) = recalculate_num_discretization_step_values(current_discretization_steps, current_skip_steps)
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    # 11. Train!
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        for step, batch in enumerate(train_dataloader):
+            # 1. Get batch of images from dataloader (sample x ~ p_data(x))
+            clean_images = batch["images"].to(weight_dtype)
+            if args.class_conditional:
+                class_labels = batch["class_labels"]
+            else:
+                class_labels = None
+            bsz = clean_images.shape[0]
+
+            # 2. Sample a random timestep for each image according to the noise schedule.
+            # Sample random indices i ~ p(i), where p(i) is the dicretized lognormal distribution in the iCT paper
+            # NOTE: timestep_indices should be in the range [0, len(current_timesteps) - k - 1] inclusive
+            timestep_indices = torch.multinomial(timestep_weights, bsz, replacement=True).long()
+            teacher_timesteps = current_timesteps[timestep_indices]
+            student_timesteps = current_timesteps[timestep_indices + current_skip_steps]
+
+            # 3. Sample noise and add it to the clean images for both teacher and student unets
+            # Sample noise z ~ N(0, I) that we'll add to the images
+            noise = torch.randn(clean_images.shape, dtype=weight_dtype, device=clean_images.device)
+            # Add noise to the clean images according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            teacher_noisy_images = add_noise(clean_images, noise, teacher_timesteps)
+            student_noisy_images = add_noise(clean_images, noise, student_timesteps)
+
+            # 4. Calculate preconditioning and scalings for boundary conditions for the consistency model.
+            teacher_rescaled_timesteps = get_noise_preconditioning(teacher_timesteps, args.noise_precond_type)
+            student_rescaled_timesteps = get_noise_preconditioning(student_timesteps, args.noise_precond_type)
+
+            c_in_teacher = get_input_preconditioning(teacher_timesteps, input_precond_type=args.input_precond_type)
+            c_in_student = get_input_preconditioning(student_timesteps, input_precond_type=args.input_precond_type)
+
+            c_skip_teacher, c_out_teacher = scalings_for_boundary_conditions(teacher_timesteps)
+            c_skip_student, c_out_student = scalings_for_boundary_conditions(student_timesteps)
+
+            c_skip_teacher, c_out_teacher, c_in_teacher = [
+                append_dims(x, clean_images.ndim) for x in [c_skip_teacher, c_out_teacher, c_in_teacher]
+            ]
+            c_skip_student, c_out_student, c_in_student = [
+                append_dims(x, clean_images.ndim) for x in [c_skip_student, c_out_student, c_in_student]
+            ]
+
+            with accelerator.accumulate(unet):
+                # 5. Get the student unet denoising prediction on the student timesteps
+                # Get rng state now to ensure that dropout is synced between the student and teacher models.
+                dropout_state = torch.get_rng_state()
+                student_model_output = unet(
+                    c_in_student * student_noisy_images, student_rescaled_timesteps, class_labels=class_labels
+                ).sample
+                # NOTE: currently only support prediction_type == sample, so no need to convert model_output
+                student_denoise_output = c_skip_student * student_noisy_images + c_out_student * student_model_output
+
+                # 6. Get the teacher unet denoising prediction on the teacher timesteps
+                with torch.no_grad(), torch.autocast("cuda", dtype=teacher_dtype):
+                    torch.set_rng_state(dropout_state)
+                    teacher_model_output = teacher_unet(
+                        c_in_teacher * teacher_noisy_images, teacher_rescaled_timesteps, class_labels=class_labels
+                    ).sample
+                    # NOTE: currently only support prediction_type == sample, so no need to convert model_output
+                    teacher_denoise_output = (
+                        c_skip_teacher * teacher_noisy_images + c_out_teacher * teacher_model_output
+                    )
+
+                # 7. Calculate the weighted Pseudo-Huber loss
+                if args.prediction_type == "sample":
+                    # Note that the loss weights should be those at the (teacher) timestep indices.
+                    lambda_t = _extract_into_tensor(
+                        timestep_loss_weights, timestep_indices, (bsz,) + (1,) * (clean_images.ndim - 1)
+                    )
+                    loss = lambda_t * (
+                        torch.sqrt(
+                            (student_denoise_output.float() - teacher_denoise_output.float()) ** 2 + args.huber_c**2
+                        )
+                        - args.huber_c
+                    )
+                    loss = loss.mean()
+                else:
+                    raise ValueError(
+                        f"Unsupported prediction type: {args.prediction_type}. Currently, only `sample` is supported."
+                    )
+
+                # 8. Backpropagate on the consistency training loss
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                # 9. Update teacher_unet and ema_unet parameters using unet's parameters.
+                teacher_unet.load_state_dict(unet.state_dict())
+                if args.use_ema:
+                    ema_unet.step(unet.parameters())
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    # 10. Recalculate quantities depending on the global step, if necessary.
+                    new_discretization_steps = get_discretization_steps(
+                        global_step,
+                        args.max_train_steps,
+                        s_0=args.discretization_s_0,
+                        s_1=args.discretization_s_1,
+                        constant=args.constant_discretization_steps,
+                    )
+                    current_skip_steps = get_skip_steps(global_step, initial_skip=args.skip_steps)
+                    if current_skip_steps >= new_discretization_steps:
+                        raise ValueError(
+                            f"The current skip steps is {current_skip_steps}, but should be smaller than the current"
+                            f" number of discretization steps {new_discretization_steps}."
+                        )
+                    if new_discretization_steps != current_discretization_steps:
+                        (
+                            noise_scheduler,
+                            current_timesteps,
+                            timestep_weights,
+                            timestep_loss_weights,
+                        ) = recalculate_num_discretization_step_values(new_discretization_steps, current_skip_steps)
+                        current_discretization_steps = new_discretization_steps
+
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if global_step % args.validation_steps == 0:
+                        # NOTE: since we do not use EMA for the teacher model, the teacher parameters and student
+                        # parameters are the same at this point in time
+                        log_validation(unet, noise_scheduler, args, accelerator, weight_dtype, global_step, "teacher")
+                        # teacher_unet.to(dtype=teacher_dtype)
+
+                        if args.use_ema:
+                            # Store the student unet weights and load the EMA weights.
+                            ema_unet.store(unet.parameters())
+                            ema_unet.copy_to(unet.parameters())
+
+                            log_validation(
+                                unet,
+                                noise_scheduler,
+                                args,
+                                accelerator,
+                                weight_dtype,
+                                global_step,
+                                "ema_student",
+                            )
+
+                            # Restore student unet weights
+                            ema_unet.restore(unet.parameters())
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
+            if args.use_ema:
+                logs["ema_decay"] = ema_unet.cur_decay_value
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+        # progress_bar.close()
+
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        pipeline = ConsistencyModelPipeline(unet=unet, scheduler=noise_scheduler)
+        pipeline.save_pretrained(args.output_dir)
+
+        # If using EMA, save EMA weights as well.
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+            unet.save_pretrained(os.path.join(args.output_dir, "ema_unet"))
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/controlnet/train_controlnet_webdataset.py b/diffusers/examples/research_projects/controlnet/train_controlnet_webdataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..88a5d93d8edfc3044d3f325d52b6352d8d7b2dbf
--- /dev/null
+++ b/diffusers/examples/research_projects/controlnet/train_controlnet_webdataset.py
@@ -0,0 +1,1472 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import functools
+import gc
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+from typing import List, Optional, Union
+
+import accelerate
+import cv2
+import numpy as np
+import torch
+import torch.utils.checkpoint
+import transformers
+import webdataset as wds
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from braceexpand import braceexpand
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from torch.utils.data import default_collate
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, DPTForDepthEstimation, DPTImageProcessor, PretrainedConfig
+from webdataset.tariterators import (
+    base_plus_ext,
+    tar_file_expander,
+    url_opener,
+    valid_sample,
+)
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    EulerDiscreteScheduler,
+    StableDiffusionXLControlNetPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+MAX_SEQ_LENGTH = 77
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.18.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def filter_keys(key_set):
+    def _f(dictionary):
+        return {k: v for k, v in dictionary.items() if k in key_set}
+
+    return _f
+
+
+def group_by_keys_nothrow(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None):
+    """Return function over iterator that groups key, value pairs into samples.
+
+    :param keys: function that splits the key into key and extension (base_plus_ext)
+    :param lcase: convert suffixes to lower case (Default value = True)
+    """
+    current_sample = None
+    for filesample in data:
+        assert isinstance(filesample, dict)
+        fname, value = filesample["fname"], filesample["data"]
+        prefix, suffix = keys(fname)
+        if prefix is None:
+            continue
+        if lcase:
+            suffix = suffix.lower()
+        # FIXME webdataset version throws if suffix in current_sample, but we have a potential for
+        #  this happening in the current LAION400m dataset if a tar ends with same prefix as the next
+        #  begins, rare, but can happen since prefix aren't unique across tar files in that dataset
+        if current_sample is None or prefix != current_sample["__key__"] or suffix in current_sample:
+            if valid_sample(current_sample):
+                yield current_sample
+            current_sample = {"__key__": prefix, "__url__": filesample["__url__"]}
+        if suffixes is None or suffix in suffixes:
+            current_sample[suffix] = value
+    if valid_sample(current_sample):
+        yield current_sample
+
+
+def tarfile_to_samples_nothrow(src, handler=wds.warn_and_continue):
+    # NOTE this is a re-impl of the webdataset impl with group_by_keys that doesn't throw
+    streams = url_opener(src, handler=handler)
+    files = tar_file_expander(streams, handler=handler)
+    samples = group_by_keys_nothrow(files, handler=handler)
+    return samples
+
+
+def control_transform(image):
+    image = np.array(image)
+
+    low_threshold = 100
+    high_threshold = 200
+
+    image = cv2.Canny(image, low_threshold, high_threshold)
+    image = image[:, :, None]
+    image = np.concatenate([image, image, image], axis=2)
+    control_image = Image.fromarray(image)
+    return control_image
+
+
+def canny_image_transform(example, resolution=1024):
+    image = example["image"]
+    image = transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR)(image)
+    # get crop coordinates
+    c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(resolution, resolution))
+    image = transforms.functional.crop(image, c_top, c_left, resolution, resolution)
+    control_image = control_transform(image)
+
+    image = transforms.ToTensor()(image)
+    image = transforms.Normalize([0.5], [0.5])(image)
+    control_image = transforms.ToTensor()(control_image)
+
+    example["image"] = image
+    example["control_image"] = control_image
+    example["crop_coords"] = (c_top, c_left)
+
+    return example
+
+
+def depth_image_transform(example, feature_extractor, resolution=1024):
+    image = example["image"]
+    image = transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR)(image)
+    # get crop coordinates
+    c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(resolution, resolution))
+    image = transforms.functional.crop(image, c_top, c_left, resolution, resolution)
+
+    control_image = feature_extractor(images=image, return_tensors="pt").pixel_values.squeeze(0)
+
+    image = transforms.ToTensor()(image)
+    image = transforms.Normalize([0.5], [0.5])(image)
+
+    example["image"] = image
+    example["control_image"] = control_image
+    example["crop_coords"] = (c_top, c_left)
+
+    return example
+
+
+class WebdatasetFilter:
+    def __init__(self, min_size=1024, max_pwatermark=0.5):
+        self.min_size = min_size
+        self.max_pwatermark = max_pwatermark
+
+    def __call__(self, x):
+        try:
+            if "json" in x:
+                x_json = json.loads(x["json"])
+                filter_size = (x_json.get("original_width", 0.0) or 0.0) >= self.min_size and x_json.get(
+                    "original_height", 0
+                ) >= self.min_size
+                filter_watermark = (x_json.get("pwatermark", 1.0) or 1.0) <= self.max_pwatermark
+                return filter_size and filter_watermark
+            else:
+                return False
+        except Exception:
+            return False
+
+
+class Text2ImageDataset:
+    def __init__(
+        self,
+        train_shards_path_or_url: Union[str, List[str]],
+        eval_shards_path_or_url: Union[str, List[str]],
+        num_train_examples: int,
+        per_gpu_batch_size: int,
+        global_batch_size: int,
+        num_workers: int,
+        resolution: int = 256,
+        center_crop: bool = True,
+        random_flip: bool = False,
+        shuffle_buffer_size: int = 1000,
+        pin_memory: bool = False,
+        persistent_workers: bool = False,
+        control_type: str = "canny",
+        feature_extractor: Optional[DPTImageProcessor] = None,
+    ):
+        if not isinstance(train_shards_path_or_url, str):
+            train_shards_path_or_url = [list(braceexpand(urls)) for urls in train_shards_path_or_url]
+            # flatten list using itertools
+            train_shards_path_or_url = list(itertools.chain.from_iterable(train_shards_path_or_url))
+
+        if not isinstance(eval_shards_path_or_url, str):
+            eval_shards_path_or_url = [list(braceexpand(urls)) for urls in eval_shards_path_or_url]
+            # flatten list using itertools
+            eval_shards_path_or_url = list(itertools.chain.from_iterable(eval_shards_path_or_url))
+
+        def get_orig_size(json):
+            return (int(json.get("original_width", 0.0)), int(json.get("original_height", 0.0)))
+
+        if control_type == "canny":
+            image_transform = functools.partial(canny_image_transform, resolution=resolution)
+        elif control_type == "depth":
+            image_transform = functools.partial(
+                depth_image_transform, feature_extractor=feature_extractor, resolution=resolution
+            )
+
+        processing_pipeline = [
+            wds.decode("pil", handler=wds.ignore_and_continue),
+            wds.rename(
+                image="jpg;png;jpeg;webp",
+                control_image="jpg;png;jpeg;webp",
+                text="text;txt;caption",
+                orig_size="json",
+                handler=wds.warn_and_continue,
+            ),
+            wds.map(filter_keys({"image", "control_image", "text", "orig_size"})),
+            wds.map_dict(orig_size=get_orig_size),
+            wds.map(image_transform),
+            wds.to_tuple("image", "control_image", "text", "orig_size", "crop_coords"),
+        ]
+
+        # Create train dataset and loader
+        pipeline = [
+            wds.ResampledShards(train_shards_path_or_url),
+            tarfile_to_samples_nothrow,
+            wds.select(WebdatasetFilter(min_size=512)),
+            wds.shuffle(shuffle_buffer_size),
+            *processing_pipeline,
+            wds.batched(per_gpu_batch_size, partial=False, collation_fn=default_collate),
+        ]
+
+        num_worker_batches = math.ceil(num_train_examples / (global_batch_size * num_workers))  # per dataloader worker
+        num_batches = num_worker_batches * num_workers
+        num_samples = num_batches * global_batch_size
+
+        # each worker is iterating over this
+        self._train_dataset = wds.DataPipeline(*pipeline).with_epoch(num_worker_batches)
+        self._train_dataloader = wds.WebLoader(
+            self._train_dataset,
+            batch_size=None,
+            shuffle=False,
+            num_workers=num_workers,
+            pin_memory=pin_memory,
+            persistent_workers=persistent_workers,
+        )
+        # add meta-data to dataloader instance for convenience
+        self._train_dataloader.num_batches = num_batches
+        self._train_dataloader.num_samples = num_samples
+
+        # Create eval dataset and loader
+        pipeline = [
+            wds.SimpleShardList(eval_shards_path_or_url),
+            wds.split_by_worker,
+            wds.tarfile_to_samples(handler=wds.ignore_and_continue),
+            *processing_pipeline,
+            wds.batched(per_gpu_batch_size, partial=False, collation_fn=default_collate),
+        ]
+        self._eval_dataset = wds.DataPipeline(*pipeline)
+        self._eval_dataloader = wds.WebLoader(
+            self._eval_dataset,
+            batch_size=None,
+            shuffle=False,
+            num_workers=num_workers,
+            pin_memory=pin_memory,
+            persistent_workers=persistent_workers,
+        )
+
+    @property
+    def train_dataset(self):
+        return self._train_dataset
+
+    @property
+    def train_dataloader(self):
+        return self._train_dataloader
+
+    @property
+    def eval_dataset(self):
+        return self._eval_dataset
+
+    @property
+    def eval_dataloader(self):
+        return self._eval_dataloader
+
+
+def image_grid(imgs, rows, cols):
+    assert len(imgs) == rows * cols
+
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+
+
+def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step):
+    logger.info("Running validation... ")
+
+    controlnet = accelerator.unwrap_model(controlnet)
+
+    pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        vae=vae,
+        unet=unet,
+        controlnet=controlnet,
+        revision=args.revision,
+        torch_dtype=weight_dtype,
+    )
+    # pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    if len(args.validation_image) == len(args.validation_prompt):
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_image) == 1:
+        validation_images = args.validation_image * len(args.validation_prompt)
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_prompt) == 1:
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt * len(args.validation_image)
+    else:
+        raise ValueError(
+            "number of `args.validation_image` and `args.validation_prompt` should be checked in `parse_args`"
+        )
+
+    image_logs = []
+
+    for validation_prompt, validation_image in zip(validation_prompts, validation_images):
+        validation_image = Image.open(validation_image).convert("RGB")
+        validation_image = validation_image.resize((args.resolution, args.resolution))
+
+        images = []
+
+        for _ in range(args.num_validation_images):
+            with torch.autocast("cuda"):
+                image = pipeline(
+                    validation_prompt, image=validation_image, num_inference_steps=20, generator=generator
+                ).images[0]
+            images.append(image)
+
+        image_logs.append(
+            {"validation_image": validation_image, "images": images, "validation_prompt": validation_prompt}
+        )
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                validation_image = log["validation_image"]
+
+                formatted_images = []
+
+                formatted_images.append(np.asarray(validation_image))
+
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                validation_image = log["validation_image"]
+
+                formatted_images.append(wandb.Image(validation_image, caption="Controlnet conditioning"))
+
+                for image in images:
+                    image = wandb.Image(image, caption=validation_prompt)
+                    formatted_images.append(image)
+
+            tracker.log({"validation": formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+        del pipeline
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        return image_logs
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=None):
+    img_str = ""
+    if image_logs is not None:
+        img_str = "You can find some example images below.\n"
+        for i, log in enumerate(image_logs):
+            images = log["images"]
+            validation_prompt = log["validation_prompt"]
+            validation_image = log["validation_image"]
+            validation_image.save(os.path.join(repo_folder, "image_control.png"))
+            img_str += f"prompt: {validation_prompt}\n"
+            images = [validation_image] + images
+            image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
+            img_str += f"![images_{i})](./images_{i}.png)\n"
+
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+tags:
+- stable-diffusion-xl
+- stable-diffusion-xl-diffusers
+- text-to-image
+- diffusers
+- controlnet
+- diffusers-training
+- webdataset
+inference: true
+---
+    """
+    model_card = f"""
+# controlnet-{repo_id}
+
+These are controlnet weights trained on {base_model} with new type of conditioning.
+{img_str}
+"""
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to an improved VAE to stabilize training. For more details check out: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--controlnet_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
+        " If not specified controlnet weights are initialized from unet.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be"
+            " float32 precision."
+        ),
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="controlnet-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--crops_coords_top_left_h",
+        type=int,
+        default=0,
+        help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."),
+    )
+    parser.add_argument(
+        "--crops_coords_top_left_w",
+        type=int,
+        default=0,
+        help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
+            "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
+            "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
+            "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
+            "instructions."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=3,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=1,
+        help=("Number of subprocesses to use for data loading."),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+    parser.add_argument(
+        "--train_shards_path_or_url",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--eval_shards_path_or_url",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing the target image."
+    )
+    parser.add_argument(
+        "--conditioning_image_column",
+        type=str,
+        default="conditioning_image",
+        help="The column of the dataset containing the controlnet conditioning image.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of prompts evaluated every `--validation_steps` and logged to `--report_to`."
+            " Provide either a matching number of `--validation_image`s, a single `--validation_image`"
+            " to be used with all prompts, or a single prompt that will be used with all `--validation_image`s."
+        ),
+    )
+    parser.add_argument(
+        "--validation_image",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of paths to the controlnet conditioning image be evaluated every `--validation_steps`"
+            " and logged to `--report_to`. Provide either a matching number of `--validation_prompt`s, a"
+            " a single `--validation_prompt` to be used with all `--validation_image`s, or a single"
+            " `--validation_image` that will be used with all `--validation_prompt`s."
+        ),
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images to be generated for each `--validation_image`, `--validation_prompt` pair",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="sd_xl_train_controlnet",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+    parser.add_argument(
+        "--control_type",
+        type=str,
+        default="canny",
+        help=("The type of controlnet conditioning image to use. One of `canny`, `depth`" " Defaults to `canny`."),
+    )
+    parser.add_argument(
+        "--transformer_layers_per_block",
+        type=str,
+        default=None,
+        help=("The number of layers per block in the transformer. If None, defaults to" " `args.transformer_layers`."),
+    )
+    parser.add_argument(
+        "--old_style_controlnet",
+        action="store_true",
+        default=False,
+        help=(
+            "Use the old style controlnet, which is a single transformer layer with"
+            " a single head. Defaults to False."
+        ),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    if args.validation_prompt is not None and args.validation_image is None:
+        raise ValueError("`--validation_image` must be set if `--validation_prompt` is set")
+
+    if args.validation_prompt is None and args.validation_image is not None:
+        raise ValueError("`--validation_prompt` must be set if `--validation_image` is set")
+
+    if (
+        args.validation_image is not None
+        and args.validation_prompt is not None
+        and len(args.validation_image) != 1
+        and len(args.validation_prompt) != 1
+        and len(args.validation_image) != len(args.validation_prompt)
+    ):
+        raise ValueError(
+            "Must provide either 1 `--validation_image`, 1 `--validation_prompt`,"
+            " or the same number of `--validation_prompt`s and `--validation_image`s"
+        )
+
+    if args.resolution % 8 != 0:
+        raise ValueError(
+            "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder."
+        )
+
+    return args
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train=True):
+    prompt_embeds_list = []
+
+    captions = []
+    for caption in prompt_batch:
+        if random.random() < proportion_empty_prompts:
+            captions.append("")
+        elif isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                captions,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(
+                text_input_ids.to(text_encoder.device),
+                output_hidden_states=True,
+            )
+
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+            prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+                token=args.hub_token,
+                private=True,
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision, use_fast=False
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    # noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    noise_scheduler = EulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+    )
+
+    if args.controlnet_model_name_or_path:
+        logger.info("Loading existing controlnet weights")
+        pre_controlnet = ControlNetModel.from_pretrained(args.controlnet_model_name_or_path)
+    else:
+        logger.info("Initializing controlnet weights from unet")
+        pre_controlnet = ControlNetModel.from_unet(unet)
+
+    if args.transformer_layers_per_block is not None:
+        transformer_layers_per_block = [int(x) for x in args.transformer_layers_per_block.split(",")]
+        down_block_types = ["DownBlock2D" if l == 0 else "CrossAttnDownBlock2D" for l in transformer_layers_per_block]
+        controlnet = ControlNetModel.from_config(
+            pre_controlnet.config,
+            down_block_types=down_block_types,
+            transformer_layers_per_block=transformer_layers_per_block,
+        )
+        controlnet.load_state_dict(pre_controlnet.state_dict(), strict=False)
+        del pre_controlnet
+    else:
+        controlnet = pre_controlnet
+
+    if args.control_type == "depth":
+        feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
+        depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas")
+        depth_model.requires_grad_(False)
+    else:
+        feature_extractor = None
+        depth_model = None
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                i = len(weights) - 1
+
+                while len(weights) > 0:
+                    weights.pop()
+                    model = models[i]
+
+                    sub_dir = "controlnet"
+                    model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+                    i -= 1
+
+        def load_model_hook(models, input_dir):
+            while len(models) > 0:
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = ControlNetModel.from_pretrained(input_dir, subfolder="controlnet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    vae.requires_grad_(False)
+    unet.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    controlnet.train()
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+            controlnet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        controlnet.enable_gradient_checkpointing()
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if accelerator.unwrap_model(controlnet).dtype != torch.float32:
+        raise ValueError(
+            f"Controlnet loaded as datatype {accelerator.unwrap_model(controlnet).dtype}. {low_precision_error_string}"
+        )
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = controlnet.parameters()
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae, unet and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    if args.pretrained_vae_model_name_or_path is not None:
+        vae.to(accelerator.device, dtype=weight_dtype)
+    else:
+        vae.to(accelerator.device, dtype=torch.float32)
+    unet.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+    if args.control_type == "depth":
+        depth_model.to(accelerator.device, dtype=weight_dtype)
+
+    # Here, we compute not just the text embeddings but also the additional embeddings
+    # needed for the SD XL UNet to operate.
+    def compute_embeddings(
+        prompt_batch, original_sizes, crop_coords, proportion_empty_prompts, text_encoders, tokenizers, is_train=True
+    ):
+        target_size = (args.resolution, args.resolution)
+        original_sizes = list(map(list, zip(*original_sizes)))
+        crops_coords_top_left = list(map(list, zip(*crop_coords)))
+
+        original_sizes = torch.tensor(original_sizes, dtype=torch.long)
+        crops_coords_top_left = torch.tensor(crops_coords_top_left, dtype=torch.long)
+
+        # crops_coords_top_left = (args.crops_coords_top_left_h, args.crops_coords_top_left_w)
+        prompt_embeds, pooled_prompt_embeds = encode_prompt(
+            prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train
+        )
+        add_text_embeds = pooled_prompt_embeds
+
+        # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+        # add_time_ids = list(crops_coords_top_left + target_size)
+        add_time_ids = list(target_size)
+        add_time_ids = torch.tensor([add_time_ids])
+        add_time_ids = add_time_ids.repeat(len(prompt_batch), 1)
+        # add_time_ids = torch.cat([torch.tensor(original_sizes, dtype=torch.long), add_time_ids], dim=-1)
+        add_time_ids = torch.cat([original_sizes, crops_coords_top_left, add_time_ids], dim=-1)
+        add_time_ids = add_time_ids.to(accelerator.device, dtype=prompt_embeds.dtype)
+
+        prompt_embeds = prompt_embeds.to(accelerator.device)
+        add_text_embeds = add_text_embeds.to(accelerator.device)
+        unet_added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+        return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs}
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    dataset = Text2ImageDataset(
+        train_shards_path_or_url=args.train_shards_path_or_url,
+        eval_shards_path_or_url=args.eval_shards_path_or_url,
+        num_train_examples=args.max_train_samples,
+        per_gpu_batch_size=args.train_batch_size,
+        global_batch_size=args.train_batch_size * accelerator.num_processes,
+        num_workers=args.dataloader_num_workers,
+        resolution=args.resolution,
+        center_crop=False,
+        random_flip=False,
+        shuffle_buffer_size=1000,
+        pin_memory=True,
+        persistent_workers=True,
+        control_type=args.control_type,
+        feature_extractor=feature_extractor,
+    )
+    train_dataloader = dataset.train_dataloader
+
+    # Let's first compute all the embeddings so that we can free up the text encoders
+    # from memory.
+    text_encoders = [text_encoder_one, text_encoder_two]
+    tokenizers = [tokenizer_one, tokenizer_two]
+
+    compute_embeddings_fn = functools.partial(
+        compute_embeddings,
+        proportion_empty_prompts=args.proportion_empty_prompts,
+        text_encoders=text_encoders,
+        tokenizers=tokenizers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    controlnet, optimizer, lr_scheduler = accelerator.prepare(controlnet, optimizer, lr_scheduler)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+
+        # tensorboard cannot handle list types for config
+        tracker_config.pop("validation_prompt")
+        tracker_config.pop("validation_image")
+
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num batches each epoch = {train_dataloader.num_batches}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    image_logs = None
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(controlnet):
+                image, control_image, text, orig_size, crop_coords = batch
+
+                encoded_text = compute_embeddings_fn(text, orig_size, crop_coords)
+                image = image.to(accelerator.device, non_blocking=True)
+                control_image = control_image.to(accelerator.device, non_blocking=True)
+
+                if args.pretrained_vae_model_name_or_path is not None:
+                    pixel_values = image.to(dtype=weight_dtype)
+                    if vae.dtype != weight_dtype:
+                        vae.to(dtype=weight_dtype)
+                else:
+                    pixel_values = image
+
+                # latents = vae.encode(pixel_values).latent_dist.sample()
+                # encode pixel values with batch size of at most 8
+                latents = []
+                for i in range(0, pixel_values.shape[0], 8):
+                    latents.append(vae.encode(pixel_values[i : i + 8]).latent_dist.sample())
+                latents = torch.cat(latents, dim=0)
+
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+
+                if args.control_type == "depth":
+                    control_image = control_image.to(weight_dtype)
+                    with torch.autocast("cuda"):
+                        depth_map = depth_model(control_image).predicted_depth
+                    depth_map = torch.nn.functional.interpolate(
+                        depth_map.unsqueeze(1),
+                        size=image.shape[2:],
+                        mode="bicubic",
+                        align_corners=False,
+                    )
+                    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
+                    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
+                    depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+                    control_image = (depth_map * 255.0).to(torch.uint8).float() / 255.0  # hack to match inference
+                    control_image = torch.cat([control_image] * 3, dim=1)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+                sigmas = get_sigmas(timesteps, len(noisy_latents.shape), noisy_latents.dtype)
+                inp_noisy_latents = noisy_latents / ((sigmas**2 + 1) ** 0.5)
+
+                # ControlNet conditioning.
+                controlnet_image = control_image.to(dtype=weight_dtype)
+                prompt_embeds = encoded_text.pop("prompt_embeds")
+                down_block_res_samples, mid_block_res_sample = controlnet(
+                    inp_noisy_latents,
+                    timesteps,
+                    encoder_hidden_states=prompt_embeds,
+                    added_cond_kwargs=encoded_text,
+                    controlnet_cond=controlnet_image,
+                    return_dict=False,
+                )
+
+                # Predict the noise residual
+                model_pred = unet(
+                    inp_noisy_latents,
+                    timesteps,
+                    encoder_hidden_states=prompt_embeds,
+                    added_cond_kwargs=encoded_text,
+                    down_block_additional_residuals=[
+                        sample.to(dtype=weight_dtype) for sample in down_block_res_samples
+                    ],
+                    mid_block_additional_residual=mid_block_res_sample.to(dtype=weight_dtype),
+                ).sample
+
+                model_pred = model_pred * (-sigmas) + noisy_latents
+                weighing = sigmas**-2.0
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = latents  # compute loss against the denoised latents
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+                loss = torch.mean(
+                    (weighing.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1), 1
+                )
+                loss = loss.mean()
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = controlnet.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        image_logs = log_validation(
+                            vae, unet, controlnet, args, accelerator, weight_dtype, global_step
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        controlnet = accelerator.unwrap_model(controlnet)
+        controlnet.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                image_logs=image_logs,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/diffusion_dpo/README.md b/diffusers/examples/research_projects/diffusion_dpo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..32704b6f772bc2cdf9a24e61ebe59ae78458598a
--- /dev/null
+++ b/diffusers/examples/research_projects/diffusion_dpo/README.md
@@ -0,0 +1,94 @@
+# Diffusion Model Alignment Using Direct Preference Optimization
+
+This directory provides LoRA implementations of Diffusion DPO proposed in [DiffusionModel Alignment Using Direct Preference Optimization](https://arxiv.org/abs/2311.12908) by Bram Wallace, Meihua Dang, Rafael Rafailov, Linqi Zhou, Aaron Lou, Senthil Purushwalkam, Stefano Ermon, Caiming Xiong, Shafiq Joty, and Nikhil Naik.
+
+We provide implementations for both Stable Diffusion (SD) and Stable Diffusion XL (SDXL). The original checkpoints are available at the URLs below:
+
+* [mhdang/dpo-sd1.5-text2image-v1](https://huggingface.co/mhdang/dpo-sd1.5-text2image-v1)
+* [mhdang/dpo-sdxl-text2image-v1](https://huggingface.co/mhdang/dpo-sdxl-text2image-v1)
+
+> 💡 Note: The scripts are highly experimental and were only tested on low-data regimes. Proceed with caution. Feel free to let us know about your findings via GitHub issues.
+
+## SD training command
+
+```bash
+accelerate launch train_diffusion_dpo.py \
+  --pretrained_model_name_or_path=stable-diffusion-v1-5/stable-diffusion-v1-5  \
+  --output_dir="diffusion-dpo" \
+  --mixed_precision="fp16" \
+  --dataset_name=kashif/pickascore \
+  --resolution=512 \
+  --train_batch_size=16 \
+  --gradient_accumulation_steps=2 \
+  --gradient_checkpointing \
+  --use_8bit_adam \
+  --rank=8 \
+  --learning_rate=1e-5 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=10000 \
+  --checkpointing_steps=2000 \
+  --run_validation --validation_steps=200 \
+  --seed="0" \
+  --report_to="wandb" \
+  --push_to_hub
+```
+
+## SDXL training command
+
+```bash
+accelerate launch train_diffusion_dpo_sdxl.py \
+  --pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0  \
+  --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
+  --output_dir="diffusion-sdxl-dpo" \
+  --mixed_precision="fp16" \
+  --dataset_name=kashif/pickascore \
+  --train_batch_size=8 \
+  --gradient_accumulation_steps=2 \
+  --gradient_checkpointing \
+  --use_8bit_adam \
+  --rank=8 \
+  --learning_rate=1e-5 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=2000 \
+  --checkpointing_steps=500 \
+  --run_validation --validation_steps=50 \
+  --seed="0" \
+  --report_to="wandb" \
+  --push_to_hub
+```
+
+## SDXL Turbo training command
+
+```bash
+accelerate launch train_diffusion_dpo_sdxl.py \
+  --pretrained_model_name_or_path=stabilityai/sdxl-turbo \
+  --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
+  --output_dir="diffusion-sdxl-turbo-dpo" \
+  --mixed_precision="fp16" \
+  --dataset_name=kashif/pickascore \
+  --train_batch_size=8 \
+  --gradient_accumulation_steps=2 \
+  --gradient_checkpointing \
+  --use_8bit_adam \
+  --rank=8 \
+  --learning_rate=1e-5 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=2000 \
+  --checkpointing_steps=500 \
+  --run_validation --validation_steps=50 \
+  --seed="0" \
+  --report_to="wandb" \
+  --is_turbo --resolution 512 \
+  --push_to_hub
+```
+
+
+## Acknowledgements
+
+This is based on the amazing work done by [Bram](https://github.com/bram-w) here for Diffusion DPO: https://github.com/bram-w/trl/blob/dpo/.
diff --git a/diffusers/examples/research_projects/diffusion_dpo/requirements.txt b/diffusers/examples/research_projects/diffusion_dpo/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..11a5b9df6dad38b5807823afbdc2b0bf66fabc76
--- /dev/null
+++ b/diffusers/examples/research_projects/diffusion_dpo/requirements.txt
@@ -0,0 +1,8 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
+peft
+wandb
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/diffusion_dpo/train_diffusion_dpo.py b/diffusers/examples/research_projects/diffusion_dpo/train_diffusion_dpo.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab88d49677664732f25ebe542da3d95f2113208c
--- /dev/null
+++ b/diffusers/examples/research_projects/diffusion_dpo/train_diffusion_dpo.py
@@ -0,0 +1,982 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 bram-w, The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import contextlib
+import io
+import logging
+import math
+import os
+import shutil
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+import wandb
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, convert_state_dict_to_diffusers
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.25.0.dev0")
+
+logger = get_logger(__name__)
+
+
+VALIDATION_PROMPTS = [
+    "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography",
+    "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
+    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+    "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
+]
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def log_validation(args, unet, accelerator, weight_dtype, epoch, is_final_validation=False):
+    logger.info(f"Running validation... \n Generating images with prompts:\n" f" {VALIDATION_PROMPTS}.")
+
+    # create pipeline
+    pipeline = DiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    if not is_final_validation:
+        pipeline.unet = accelerator.unwrap_model(unet)
+    else:
+        pipeline.load_lora_weights(args.output_dir, weight_name="pytorch_lora_weights.safetensors")
+
+    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    images = []
+    context = contextlib.nullcontext() if is_final_validation else torch.cuda.amp.autocast()
+
+    for prompt in VALIDATION_PROMPTS:
+        with context:
+            image = pipeline(prompt, num_inference_steps=25, generator=generator).images[0]
+            images.append(image)
+
+    tracker_key = "test" if is_final_validation else "validation"
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(tracker_key, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    tracker_key: [
+                        wandb.Image(image, caption=f"{i}: {VALIDATION_PROMPTS[i]}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    # Also log images without the LoRA params for comparison.
+    if is_final_validation:
+        pipeline.disable_lora()
+        no_lora_images = [
+            pipeline(prompt, num_inference_steps=25, generator=generator).images[0] for prompt in VALIDATION_PROMPTS
+        ]
+
+        for tracker in accelerator.trackers:
+            if tracker.name == "tensorboard":
+                np_images = np.stack([np.asarray(img) for img in no_lora_images])
+                tracker.writer.add_images("test_without_lora", np_images, epoch, dataformats="NHWC")
+            if tracker.name == "wandb":
+                tracker.log(
+                    {
+                        "test_without_lora": [
+                            wandb.Image(image, caption=f"{i}: {VALIDATION_PROMPTS[i]}")
+                            for i, image in enumerate(no_lora_images)
+                        ]
+                    }
+                )
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_split_name",
+        type=str,
+        default="validation",
+        help="Dataset split to be used during training. Helpful to specify for conducting experimental runs.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--run_validation",
+        default=False,
+        action="store_true",
+        help="Whether to run validation inference in between training and also after training. Helps to track progress.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=200,
+        help="Run validation every X steps.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="diffusion-dpo-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--vae_encode_batch_size",
+        type=int,
+        default=8,
+        help="Batch size to use for VAE encoding of the images for efficient processing.",
+    )
+    parser.add_argument(
+        "--no_hflip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--random_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to random crop the input images to the resolution. If not set, the images will be center-cropped."
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--beta_dpo",
+        type=int,
+        default=2500,
+        help="DPO KL Divergence penalty.",
+    )
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="sigmoid",
+        help="DPO loss type. Can be one of 'sigmoid' (default), 'ipo', or 'cpo'",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--tracker_name",
+        type=str,
+        default="diffusion-dpo-lora",
+        help=("The name of the tracker to report results to."),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None:
+        raise ValueError("Must provide a `dataset_name`.")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    return args
+
+
+def tokenize_captions(tokenizer, examples):
+    max_length = tokenizer.model_max_length
+    captions = []
+    for caption in examples["caption"]:
+        captions.append(caption)
+
+    text_inputs = tokenizer(
+        captions, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt"
+    )
+
+    return text_inputs.input_ids
+
+
+@torch.no_grad()
+def encode_prompt(text_encoder, input_ids):
+    text_input_ids = input_ids.to(text_encoder.device)
+    attention_mask = None
+
+    prompt_embeds = text_encoder(text_input_ids, attention_mask=attention_mask)
+    prompt_embeds = prompt_embeds[0]
+
+    return prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # Set up LoRA.
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    # Add adapter and make sure the trainable params are in float32.
+    unet.add_adapter(unet_lora_config)
+    if args.mixed_precision == "fp16":
+        for param in unet.parameters():
+            # only upcast trainable parameters (LoRA) into fp32
+            if param.requires_grad:
+                param.data = param.to(torch.float32)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder atten layers
+            unet_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(accelerator.unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionLoraLoaderMixin.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=None,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(accelerator.unwrap_model(unet))):
+                unet_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
+        StableDiffusionLoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = list(filter(lambda p: p.requires_grad, unet.parameters()))
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = load_dataset(
+        args.dataset_name,
+        cache_dir=args.cache_dir,
+        split=args.dataset_split_name,
+    )
+
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(int(args.resolution), interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.RandomCrop(args.resolution) if args.random_crop else transforms.CenterCrop(args.resolution),
+            transforms.Lambda(lambda x: x) if args.no_hflip else transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        all_pixel_values = []
+        for col_name in ["jpg_0", "jpg_1"]:
+            images = [Image.open(io.BytesIO(im_bytes)).convert("RGB") for im_bytes in examples[col_name]]
+            pixel_values = [train_transforms(image) for image in images]
+            all_pixel_values.append(pixel_values)
+
+        # Double on channel dim, jpg_y then jpg_w
+        im_tup_iterator = zip(*all_pixel_values)
+        combined_pixel_values = []
+        for im_tup, label_0 in zip(im_tup_iterator, examples["label_0"]):
+            if label_0 == 0:
+                im_tup = im_tup[::-1]
+            combined_im = torch.cat(im_tup, dim=0)  # no batch dim
+            combined_pixel_values.append(combined_im)
+        examples["pixel_values"] = combined_pixel_values
+
+        examples["input_ids"] = tokenize_captions(tokenizer, examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            train_dataset = train_dataset.shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = train_dataset.with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        final_dict = {"pixel_values": pixel_values}
+        final_dict["input_ids"] = torch.stack([example["input_ids"] for example in examples])
+        return final_dict
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=collate_fn,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers(args.tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    unet.train()
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # (batch_size, 2*channels, h, w) -> (2*batch_size, channels, h, w)
+                pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+                feed_pixel_values = torch.cat(pixel_values.chunk(2, dim=1))
+
+                latents = []
+                for i in range(0, feed_pixel_values.shape[0], args.vae_encode_batch_size):
+                    latents.append(
+                        vae.encode(feed_pixel_values[i : i + args.vae_encode_batch_size]).latent_dist.sample()
+                    )
+                latents = torch.cat(latents, dim=0)
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents).chunk(2)[0].repeat(2, 1, 1, 1)
+
+                # Sample a random timestep for each image
+                bsz = latents.shape[0] // 2
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device, dtype=torch.long
+                ).repeat(2)
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = encode_prompt(text_encoder, batch["input_ids"]).repeat(2, 1, 1)
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_model_input,
+                    timesteps,
+                    encoder_hidden_states,
+                ).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # Compute losses.
+                model_losses = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                model_losses = model_losses.mean(dim=list(range(1, len(model_losses.shape))))
+                model_losses_w, model_losses_l = model_losses.chunk(2)
+
+                # For logging
+                raw_model_loss = 0.5 * (model_losses_w.mean() + model_losses_l.mean())
+                model_diff = model_losses_w - model_losses_l  # These are both LBS (as is t)
+
+                # Reference model predictions.
+                accelerator.unwrap_model(unet).disable_adapters()
+                with torch.no_grad():
+                    ref_preds = unet(
+                        noisy_model_input,
+                        timesteps,
+                        encoder_hidden_states,
+                    ).sample.detach()
+                    ref_loss = F.mse_loss(ref_preds.float(), target.float(), reduction="none")
+                    ref_loss = ref_loss.mean(dim=list(range(1, len(ref_loss.shape))))
+
+                    ref_losses_w, ref_losses_l = ref_loss.chunk(2)
+                    ref_diff = ref_losses_w - ref_losses_l
+                    raw_ref_loss = ref_loss.mean()
+
+                # Re-enable adapters.
+                accelerator.unwrap_model(unet).enable_adapters()
+
+                # Final loss.
+                logits = ref_diff - model_diff
+                if args.loss_type == "sigmoid":
+                    loss = -1 * F.logsigmoid(args.beta_dpo * logits).mean()
+                elif args.loss_type == "hinge":
+                    loss = torch.relu(1 - args.beta_dpo * logits).mean()
+                elif args.loss_type == "ipo":
+                    losses = (logits - 1 / (2 * args.beta)) ** 2
+                    loss = losses.mean()
+                else:
+                    raise ValueError(f"Unknown loss type {args.loss_type}")
+
+                implicit_acc = (logits > 0).sum().float() / logits.size(0)
+                implicit_acc += 0.5 * (logits == 0).sum().float() / logits.size(0)
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.run_validation and global_step % args.validation_steps == 0:
+                        log_validation(
+                            args, unet=unet, accelerator=accelerator, weight_dtype=weight_dtype, epoch=epoch
+                        )
+
+            logs = {
+                "loss": loss.detach().item(),
+                "raw_model_loss": raw_model_loss.detach().item(),
+                "ref_loss": raw_ref_loss.detach().item(),
+                "implicit_acc": implicit_acc.detach().item(),
+                "lr": lr_scheduler.get_last_lr()[0],
+            }
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        unet = unet.to(torch.float32)
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        StableDiffusionLoraLoaderMixin.save_lora_weights(
+            save_directory=args.output_dir, unet_lora_layers=unet_lora_state_dict, text_encoder_lora_layers=None
+        )
+
+        # Final validation?
+        if args.run_validation:
+            log_validation(
+                args,
+                unet=None,
+                accelerator=accelerator,
+                weight_dtype=weight_dtype,
+                epoch=epoch,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/diffusion_dpo/train_diffusion_dpo_sdxl.py b/diffusers/examples/research_projects/diffusion_dpo/train_diffusion_dpo_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..0297a06f5b2ce18dcee2c438a06dc0cdc33ac12b
--- /dev/null
+++ b/diffusers/examples/research_projects/diffusion_dpo/train_diffusion_dpo_sdxl.py
@@ -0,0 +1,1140 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 bram-w, The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import contextlib
+import io
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+import wandb
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionXLLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, convert_state_dict_to_diffusers
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.25.0.dev0")
+
+logger = get_logger(__name__)
+
+
+VALIDATION_PROMPTS = [
+    "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography",
+    "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
+    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+    "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
+]
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def log_validation(args, unet, vae, accelerator, weight_dtype, epoch, is_final_validation=False):
+    logger.info(f"Running validation... \n Generating images with prompts:\n" f" {VALIDATION_PROMPTS}.")
+
+    if is_final_validation:
+        if args.mixed_precision == "fp16":
+            vae.to(weight_dtype)
+
+    # create pipeline
+    pipeline = DiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        vae=vae,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    if not is_final_validation:
+        pipeline.unet = accelerator.unwrap_model(unet)
+    else:
+        pipeline.load_lora_weights(args.output_dir, weight_name="pytorch_lora_weights.safetensors")
+
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    images = []
+    context = contextlib.nullcontext() if is_final_validation else torch.cuda.amp.autocast()
+
+    guidance_scale = 5.0
+    num_inference_steps = 25
+    if args.is_turbo:
+        guidance_scale = 0.0
+        num_inference_steps = 4
+    for prompt in VALIDATION_PROMPTS:
+        with context:
+            image = pipeline(
+                prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, generator=generator
+            ).images[0]
+            images.append(image)
+
+    tracker_key = "test" if is_final_validation else "validation"
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(tracker_key, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    tracker_key: [
+                        wandb.Image(image, caption=f"{i}: {VALIDATION_PROMPTS[i]}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    # Also log images without the LoRA params for comparison.
+    if is_final_validation:
+        pipeline.disable_lora()
+        no_lora_images = [
+            pipeline(
+                prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, generator=generator
+            ).images[0]
+            for prompt in VALIDATION_PROMPTS
+        ]
+
+        for tracker in accelerator.trackers:
+            if tracker.name == "tensorboard":
+                np_images = np.stack([np.asarray(img) for img in no_lora_images])
+                tracker.writer.add_images("test_without_lora", np_images, epoch, dataformats="NHWC")
+            if tracker.name == "wandb":
+                tracker.log(
+                    {
+                        "test_without_lora": [
+                            wandb.Image(image, caption=f"{i}: {VALIDATION_PROMPTS[i]}")
+                            for i, image in enumerate(no_lora_images)
+                        ]
+                    }
+                )
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_split_name",
+        type=str,
+        default="validation",
+        help="Dataset split to be used during training. Helpful to specify for conducting experimental runs.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--run_validation",
+        default=False,
+        action="store_true",
+        help="Whether to run validation inference in between training and also after training. Helps to track progress.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=200,
+        help="Run validation every X steps.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="diffusion-dpo-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--vae_encode_batch_size",
+        type=int,
+        default=8,
+        help="Batch size to use for VAE encoding of the images for efficient processing.",
+    )
+    parser.add_argument(
+        "--no_hflip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--random_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to random crop the input images to the resolution. If not set, the images will be center-cropped."
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--beta_dpo",
+        type=int,
+        default=5000,
+        help="DPO KL Divergence penalty.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--is_turbo",
+        action="store_true",
+        help=("Use if tuning SDXL Turbo instead of SDXL"),
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--tracker_name",
+        type=str,
+        default="diffusion-dpo-lora-sdxl",
+        help=("The name of the tracker to report results to."),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None:
+        raise ValueError("Must provide a `dataset_name`.")
+
+    if args.is_turbo:
+        assert "turbo" in args.pretrained_model_name_or_path
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    return args
+
+
+def tokenize_captions(tokenizers, examples):
+    captions = []
+    for caption in examples["caption"]:
+        captions.append(caption)
+
+    tokens_one = tokenizers[0](
+        captions, truncation=True, padding="max_length", max_length=tokenizers[0].model_max_length, return_tensors="pt"
+    ).input_ids
+    tokens_two = tokenizers[1](
+        captions, truncation=True, padding="max_length", max_length=tokenizers[1].model_max_length, return_tensors="pt"
+    ).input_ids
+
+    return tokens_one, tokens_two
+
+
+@torch.no_grad()
+def encode_prompt(text_encoders, text_input_ids_list):
+    prompt_embeds_list = []
+
+    for i, text_encoder in enumerate(text_encoders):
+        text_input_ids = text_input_ids_list[i]
+
+        prompt_embeds = text_encoder(
+            text_input_ids.to(text_encoder.device),
+            output_hidden_states=True,
+        )
+
+        # We are only ALWAYS interested in the pooled output of the final text encoder
+        pooled_prompt_embeds = prompt_embeds[0]
+        prompt_embeds = prompt_embeds.hidden_states[-2]
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+        prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    def enforce_zero_terminal_snr(scheduler):
+        # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_ddpm.py#L93
+        # Original implementation https://arxiv.org/pdf/2305.08891.pdf
+        # Turbo needs zero terminal SNR
+        # Turbo: https://static1.squarespace.com/static/6213c340453c3f502425776e/t/65663480a92fba51d0e1023f/1701197769659/adversarial_diffusion_distillation.pdf
+        # Convert betas to alphas_bar_sqrt
+        alphas = 1 - scheduler.betas
+        alphas_bar = alphas.cumprod(0)
+        alphas_bar_sqrt = alphas_bar.sqrt()
+
+        # Store old values.
+        alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+        alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+        # Shift so last timestep is zero.
+        alphas_bar_sqrt -= alphas_bar_sqrt_T
+        # Scale so first timestep is back to old value.
+        alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+        alphas_bar = alphas_bar_sqrt**2
+        alphas = alphas_bar[1:] / alphas_bar[:-1]
+        alphas = torch.cat([alphas_bar[0:1], alphas])
+
+        alphas_cumprod = torch.cumprod(alphas, dim=0)
+        scheduler.alphas_cumprod = alphas_cumprod
+        return
+
+    if args.is_turbo:
+        enforce_zero_terminal_snr(noise_scheduler)
+
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # We only train the additional adapter LoRA layers
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet and text_encoders to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    # The VAE is always in float32 to avoid NaN losses.
+    vae.to(accelerator.device, dtype=torch.float32)
+
+    # Set up LoRA.
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    # Add adapter and make sure the trainable params are in float32.
+    unet.add_adapter(unet_lora_config)
+    if args.mixed_precision == "fp16":
+        for param in unet.parameters():
+            # only upcast trainable parameters (LoRA) into fp32
+            if param.requires_grad:
+                param.data = param.to(torch.float32)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder atten layers
+            unet_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(accelerator.unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionXLLoraLoaderMixin.save_lora_weights(output_dir, unet_lora_layers=unet_lora_layers_to_save)
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(accelerator.unwrap_model(unet))):
+                unet_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = StableDiffusionXLLoraLoaderMixin.lora_state_dict(input_dir)
+        StableDiffusionXLLoraLoaderMixin.load_lora_into_unet(
+            lora_state_dict, network_alphas=network_alphas, unet=unet_
+        )
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = list(filter(lambda p: p.requires_grad, unet.parameters()))
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = load_dataset(
+        args.dataset_name,
+        cache_dir=args.cache_dir,
+        split=args.dataset_split_name,
+    )
+
+    # Preprocessing the datasets.
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
+    train_crop = transforms.RandomCrop(args.resolution) if args.random_crop else transforms.CenterCrop(args.resolution)
+    train_flip = transforms.RandomHorizontalFlip(p=1.0)
+    to_tensor = transforms.ToTensor()
+    normalize = transforms.Normalize([0.5], [0.5])
+
+    def preprocess_train(examples):
+        all_pixel_values = []
+        images = [Image.open(io.BytesIO(im_bytes)).convert("RGB") for im_bytes in examples["jpg_0"]]
+        original_sizes = [(image.height, image.width) for image in images]
+        crop_top_lefts = []
+
+        for col_name in ["jpg_0", "jpg_1"]:
+            images = [Image.open(io.BytesIO(im_bytes)).convert("RGB") for im_bytes in examples[col_name]]
+            if col_name == "jpg_1":
+                # Need to bring down the image to the same resolution.
+                # This seems like the simplest reasonable approach.
+                # "::-1" because PIL resize takes (width, height).
+                images = [image.resize(original_sizes[i][::-1]) for i, image in enumerate(images)]
+            pixel_values = [to_tensor(image) for image in images]
+            all_pixel_values.append(pixel_values)
+
+        # Double on channel dim, jpg_y then jpg_w
+        im_tup_iterator = zip(*all_pixel_values)
+        combined_pixel_values = []
+        for im_tup, label_0 in zip(im_tup_iterator, examples["label_0"]):
+            if label_0 == 0:
+                im_tup = im_tup[::-1]
+
+            combined_im = torch.cat(im_tup, dim=0)  # no batch dim
+
+            # Resize.
+            combined_im = train_resize(combined_im)
+
+            # Flipping.
+            if not args.no_hflip and random.random() < 0.5:
+                combined_im = train_flip(combined_im)
+
+            # Cropping.
+            if not args.random_crop:
+                y1 = max(0, int(round((combined_im.shape[1] - args.resolution) / 2.0)))
+                x1 = max(0, int(round((combined_im.shape[2] - args.resolution) / 2.0)))
+                combined_im = train_crop(combined_im)
+            else:
+                y1, x1, h, w = train_crop.get_params(combined_im, (args.resolution, args.resolution))
+                combined_im = crop(combined_im, y1, x1, h, w)
+
+            crop_top_left = (y1, x1)
+            crop_top_lefts.append(crop_top_left)
+            combined_im = normalize(combined_im)
+            combined_pixel_values.append(combined_im)
+
+        examples["pixel_values"] = combined_pixel_values
+        examples["original_sizes"] = original_sizes
+        examples["crop_top_lefts"] = crop_top_lefts
+        tokens_one, tokens_two = tokenize_captions([tokenizer_one, tokenizer_two], examples)
+        examples["input_ids_one"] = tokens_one
+        examples["input_ids_two"] = tokens_two
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            train_dataset = train_dataset.shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = train_dataset.with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        original_sizes = [example["original_sizes"] for example in examples]
+        crop_top_lefts = [example["crop_top_lefts"] for example in examples]
+        input_ids_one = torch.stack([example["input_ids_one"] for example in examples])
+        input_ids_two = torch.stack([example["input_ids_two"] for example in examples])
+
+        return {
+            "pixel_values": pixel_values,
+            "input_ids_one": input_ids_one,
+            "input_ids_two": input_ids_two,
+            "original_sizes": original_sizes,
+            "crop_top_lefts": crop_top_lefts,
+        }
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=collate_fn,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers(args.tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    unet.train()
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # (batch_size, 2*channels, h, w) -> (2*batch_size, channels, h, w)
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                feed_pixel_values = torch.cat(pixel_values.chunk(2, dim=1))
+
+                latents = []
+                for i in range(0, feed_pixel_values.shape[0], args.vae_encode_batch_size):
+                    latents.append(
+                        vae.encode(feed_pixel_values[i : i + args.vae_encode_batch_size]).latent_dist.sample()
+                    )
+                latents = torch.cat(latents, dim=0)
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents).chunk(2)[0].repeat(2, 1, 1, 1)
+
+                # Sample a random timestep for each image
+                bsz = latents.shape[0] // 2
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device, dtype=torch.long
+                ).repeat(2)
+                if args.is_turbo:
+                    # Learn a 4 timestep schedule
+                    timesteps_0_to_3 = timesteps % 4
+                    timesteps = 250 * timesteps_0_to_3 + 249
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # time ids
+                def compute_time_ids(original_size, crops_coords_top_left):
+                    # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+                    target_size = (args.resolution, args.resolution)
+                    add_time_ids = list(original_size + crops_coords_top_left + target_size)
+                    add_time_ids = torch.tensor([add_time_ids])
+                    add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+                    return add_time_ids
+
+                add_time_ids = torch.cat(
+                    [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])]
+                ).repeat(2, 1)
+
+                # Get the text embedding for conditioning
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                    [text_encoder_one, text_encoder_two], [batch["input_ids_one"], batch["input_ids_two"]]
+                )
+                prompt_embeds = prompt_embeds.repeat(2, 1, 1)
+                pooled_prompt_embeds = pooled_prompt_embeds.repeat(2, 1)
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_model_input,
+                    timesteps,
+                    prompt_embeds,
+                    added_cond_kwargs={"time_ids": add_time_ids, "text_embeds": pooled_prompt_embeds},
+                ).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # Compute losses.
+                model_losses = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                model_losses = model_losses.mean(dim=list(range(1, len(model_losses.shape))))
+                model_losses_w, model_losses_l = model_losses.chunk(2)
+
+                # For logging
+                raw_model_loss = 0.5 * (model_losses_w.mean() + model_losses_l.mean())
+                model_diff = model_losses_w - model_losses_l  # These are both LBS (as is t)
+
+                # Reference model predictions.
+                accelerator.unwrap_model(unet).disable_adapters()
+                with torch.no_grad():
+                    ref_preds = unet(
+                        noisy_model_input,
+                        timesteps,
+                        prompt_embeds,
+                        added_cond_kwargs={"time_ids": add_time_ids, "text_embeds": pooled_prompt_embeds},
+                    ).sample
+                    ref_loss = F.mse_loss(ref_preds.float(), target.float(), reduction="none")
+                    ref_loss = ref_loss.mean(dim=list(range(1, len(ref_loss.shape))))
+
+                    ref_losses_w, ref_losses_l = ref_loss.chunk(2)
+                    ref_diff = ref_losses_w - ref_losses_l
+                    raw_ref_loss = ref_loss.mean()
+
+                # Re-enable adapters.
+                accelerator.unwrap_model(unet).enable_adapters()
+
+                # Final loss.
+                scale_term = -0.5 * args.beta_dpo
+                inside_term = scale_term * (model_diff - ref_diff)
+                loss = -1 * F.logsigmoid(inside_term).mean()
+
+                implicit_acc = (inside_term > 0).sum().float() / inside_term.size(0)
+                implicit_acc += 0.5 * (inside_term == 0).sum().float() / inside_term.size(0)
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.run_validation and global_step % args.validation_steps == 0:
+                        log_validation(
+                            args, unet=unet, vae=vae, accelerator=accelerator, weight_dtype=weight_dtype, epoch=epoch
+                        )
+
+            logs = {
+                "loss": loss.detach().item(),
+                "raw_model_loss": raw_model_loss.detach().item(),
+                "ref_loss": raw_ref_loss.detach().item(),
+                "implicit_acc": implicit_acc.detach().item(),
+                "lr": lr_scheduler.get_last_lr()[0],
+            }
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        unet = unet.to(torch.float32)
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        StableDiffusionXLLoraLoaderMixin.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_state_dict,
+            text_encoder_lora_layers=None,
+            text_encoder_2_lora_layers=None,
+        )
+
+        # Final validation?
+        if args.run_validation:
+            log_validation(
+                args,
+                unet=None,
+                vae=vae,
+                accelerator=accelerator,
+                weight_dtype=weight_dtype,
+                epoch=epoch,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/diffusion_orpo/README.md b/diffusers/examples/research_projects/diffusion_orpo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3f1ee0413e33e451b5778e3aff72c9cdd0650772
--- /dev/null
+++ b/diffusers/examples/research_projects/diffusion_orpo/README.md
@@ -0,0 +1 @@
+This project has a new home now: [https://mapo-t2i.github.io/](https://mapo-t2i.github.io/). We formally studied the use of ORPO in the context of diffusion models and open-sourced our codebase, models, and datasets. We released our paper too!
diff --git a/diffusers/examples/research_projects/diffusion_orpo/requirements.txt b/diffusers/examples/research_projects/diffusion_orpo/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9c8c6600b824c9e1f811577e9fcc6bd26a53f0e2
--- /dev/null
+++ b/diffusers/examples/research_projects/diffusion_orpo/requirements.txt
@@ -0,0 +1,7 @@
+datasets
+accelerate
+transformers
+torchvision
+wandb
+peft
+webdataset
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/diffusion_orpo/train_diffusion_orpo_sdxl_lora.py b/diffusers/examples/research_projects/diffusion_orpo/train_diffusion_orpo_sdxl_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdc096190f0873aa85d9d839c0d252282e26ab37
--- /dev/null
+++ b/diffusers/examples/research_projects/diffusion_orpo/train_diffusion_orpo_sdxl_lora.py
@@ -0,0 +1,1092 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import contextlib
+import io
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+import wandb
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionXLLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, convert_unet_state_dict_to_peft
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.25.0.dev0")
+
+logger = get_logger(__name__)
+
+
+VALIDATION_PROMPTS = [
+    "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography",
+    "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
+    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+    "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
+]
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def log_validation(args, unet, vae, accelerator, weight_dtype, epoch, is_final_validation=False):
+    logger.info(f"Running validation... \n Generating images with prompts:\n" f" {VALIDATION_PROMPTS}.")
+
+    if is_final_validation:
+        if args.mixed_precision == "fp16":
+            vae.to(weight_dtype)
+
+    # create pipeline
+    pipeline = DiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        vae=vae,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    if not is_final_validation:
+        pipeline.unet = accelerator.unwrap_model(unet)
+    else:
+        pipeline.load_lora_weights(args.output_dir, weight_name="pytorch_lora_weights.safetensors")
+
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    images = []
+    context = contextlib.nullcontext() if is_final_validation else torch.cuda.amp.autocast()
+
+    guidance_scale = 5.0
+    num_inference_steps = 25
+    for prompt in VALIDATION_PROMPTS:
+        with context:
+            image = pipeline(
+                prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, generator=generator
+            ).images[0]
+            images.append(image)
+
+    tracker_key = "test" if is_final_validation else "validation"
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(tracker_key, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    tracker_key: [
+                        wandb.Image(image, caption=f"{i}: {VALIDATION_PROMPTS[i]}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    # Also log images without the LoRA params for comparison.
+    if is_final_validation:
+        pipeline.disable_lora()
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+        no_lora_images = [
+            pipeline(
+                prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, generator=generator
+            ).images[0]
+            for prompt in VALIDATION_PROMPTS
+        ]
+
+        for tracker in accelerator.trackers:
+            if tracker.name == "tensorboard":
+                np_images = np.stack([np.asarray(img) for img in no_lora_images])
+                tracker.writer.add_images("test_without_lora", np_images, epoch, dataformats="NHWC")
+            if tracker.name == "wandb":
+                tracker.log(
+                    {
+                        "test_without_lora": [
+                            wandb.Image(image, caption=f"{i}: {VALIDATION_PROMPTS[i]}")
+                            for i, image in enumerate(no_lora_images)
+                        ]
+                    }
+                )
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_split_name",
+        type=str,
+        default="validation",
+        help="Dataset split to be used during training. Helpful to specify for conducting experimental runs.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--run_validation",
+        default=False,
+        action="store_true",
+        help="Whether to run validation inference in between training and also after training. Helps to track progress.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=200,
+        help="Run validation every X steps.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="diffusion-orpo-lora-sdxl",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--vae_encode_batch_size",
+        type=int,
+        default=8,
+        help="Batch size to use for VAE encoding of the images for efficient processing.",
+    )
+    parser.add_argument(
+        "--no_hflip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--random_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to random crop the input images to the resolution. If not set, the images will be center-cropped."
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--beta_orpo",
+        type=float,
+        default=0.1,
+        help="ORPO contribution factor.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--tracker_name",
+        type=str,
+        default="diffusion-orpo-lora-sdxl",
+        help=("The name of the tracker to report results to."),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None:
+        raise ValueError("Must provide a `dataset_name`.")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    return args
+
+
+def tokenize_captions(tokenizers, examples):
+    captions = []
+    for caption in examples["caption"]:
+        captions.append(caption)
+
+    tokens_one = tokenizers[0](
+        captions, truncation=True, padding="max_length", max_length=tokenizers[0].model_max_length, return_tensors="pt"
+    ).input_ids
+    tokens_two = tokenizers[1](
+        captions, truncation=True, padding="max_length", max_length=tokenizers[1].model_max_length, return_tensors="pt"
+    ).input_ids
+
+    return tokens_one, tokens_two
+
+
+@torch.no_grad()
+def encode_prompt(text_encoders, text_input_ids_list):
+    prompt_embeds_list = []
+
+    for i, text_encoder in enumerate(text_encoders):
+        text_input_ids = text_input_ids_list[i]
+
+        prompt_embeds = text_encoder(
+            text_input_ids.to(text_encoder.device),
+            output_hidden_states=True,
+        )
+
+        # We are only ALWAYS interested in the pooled output of the final text encoder
+        pooled_prompt_embeds = prompt_embeds[0]
+        prompt_embeds = prompt_embeds.hidden_states[-2]
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+        prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # We only train the additional adapter LoRA layers
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet and text_encoders to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    # The VAE is always in float32 to avoid NaN losses.
+    vae.to(accelerator.device, dtype=torch.float32)
+
+    # Set up LoRA.
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    # Add adapter and make sure the trainable params are in float32.
+    unet.add_adapter(unet_lora_config)
+    if args.mixed_precision == "fp16":
+        for param in unet.parameters():
+            # only upcast trainable parameters (LoRA) into fp32
+            if param.requires_grad:
+                param.data = param.to(torch.float32)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder atten layers
+            unet_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(accelerator.unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionXLLoraLoaderMixin.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=None,
+                text_encoder_2_lora_layers=None,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(accelerator.unwrap_model(unet))):
+                unet_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = StableDiffusionXLLoraLoaderMixin.lora_state_dict(input_dir)
+
+        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = list(filter(lambda p: p.requires_grad, unet.parameters()))
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = load_dataset(
+        args.dataset_name,
+        cache_dir=args.cache_dir,
+        split=args.dataset_split_name,
+    )
+
+    # Preprocessing the datasets.
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
+    train_crop = transforms.RandomCrop(args.resolution) if args.random_crop else transforms.CenterCrop(args.resolution)
+    train_flip = transforms.RandomHorizontalFlip(p=1.0)
+    to_tensor = transforms.ToTensor()
+    normalize = transforms.Normalize([0.5], [0.5])
+
+    def preprocess_train(examples):
+        all_pixel_values = []
+        images = [Image.open(io.BytesIO(im_bytes)).convert("RGB") for im_bytes in examples["jpg_0"]]
+        original_sizes = [(image.height, image.width) for image in images]
+        crop_top_lefts = []
+
+        for col_name in ["jpg_0", "jpg_1"]:
+            images = [Image.open(io.BytesIO(im_bytes)).convert("RGB") for im_bytes in examples[col_name]]
+            if col_name == "jpg_1":
+                # Need to bring down the image to the same resolution.
+                # This seems like the simplest reasonable approach.
+                # "::-1" because PIL resize takes (width, height).
+                images = [image.resize(original_sizes[i][::-1]) for i, image in enumerate(images)]
+            pixel_values = [to_tensor(image) for image in images]
+            all_pixel_values.append(pixel_values)
+
+        # Double on channel dim, jpg_y then jpg_w
+        im_tup_iterator = zip(*all_pixel_values)
+        combined_pixel_values = []
+        for im_tup, label_0 in zip(im_tup_iterator, examples["label_0"]):
+            # We randomize selection and rejection.
+            if label_0 == 0.5:
+                if random.random() < 0.5:
+                    label_0 = 0
+                else:
+                    label_0 = 1
+
+            if label_0 == 0:
+                im_tup = im_tup[::-1]
+
+            combined_im = torch.cat(im_tup, dim=0)  # no batch dim
+
+            # Resize.
+            combined_im = train_resize(combined_im)
+
+            # Flipping.
+            if not args.no_hflip and random.random() < 0.5:
+                combined_im = train_flip(combined_im)
+
+            # Cropping.
+            if not args.random_crop:
+                y1 = max(0, int(round((combined_im.shape[1] - args.resolution) / 2.0)))
+                x1 = max(0, int(round((combined_im.shape[2] - args.resolution) / 2.0)))
+                combined_im = train_crop(combined_im)
+            else:
+                y1, x1, h, w = train_crop.get_params(combined_im, (args.resolution, args.resolution))
+                combined_im = crop(combined_im, y1, x1, h, w)
+
+            crop_top_left = (y1, x1)
+            crop_top_lefts.append(crop_top_left)
+            combined_im = normalize(combined_im)
+            combined_pixel_values.append(combined_im)
+
+        examples["pixel_values"] = combined_pixel_values
+        examples["original_sizes"] = original_sizes
+        examples["crop_top_lefts"] = crop_top_lefts
+        tokens_one, tokens_two = tokenize_captions([tokenizer_one, tokenizer_two], examples)
+        examples["input_ids_one"] = tokens_one
+        examples["input_ids_two"] = tokens_two
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            train_dataset = train_dataset.shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = train_dataset.with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        original_sizes = [example["original_sizes"] for example in examples]
+        crop_top_lefts = [example["crop_top_lefts"] for example in examples]
+        input_ids_one = torch.stack([example["input_ids_one"] for example in examples])
+        input_ids_two = torch.stack([example["input_ids_two"] for example in examples])
+
+        return {
+            "pixel_values": pixel_values,
+            "input_ids_one": input_ids_one,
+            "input_ids_two": input_ids_two,
+            "original_sizes": original_sizes,
+            "crop_top_lefts": crop_top_lefts,
+        }
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=collate_fn,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers(args.tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    unet.train()
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # (batch_size, 2*channels, h, w) -> (2*batch_size, channels, h, w)
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                feed_pixel_values = torch.cat(pixel_values.chunk(2, dim=1))
+
+                latents = []
+                for i in range(0, feed_pixel_values.shape[0], args.vae_encode_batch_size):
+                    latents.append(
+                        vae.encode(feed_pixel_values[i : i + args.vae_encode_batch_size]).latent_dist.sample()
+                    )
+                latents = torch.cat(latents, dim=0)
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents).chunk(2)[0].repeat(2, 1, 1, 1)
+
+                # Sample a random timestep for each image
+                bsz = latents.shape[0] // 2
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device, dtype=torch.long
+                ).repeat(2)
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # time ids
+                def compute_time_ids(original_size, crops_coords_top_left):
+                    # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+                    target_size = (args.resolution, args.resolution)
+                    add_time_ids = list(original_size + crops_coords_top_left + target_size)
+                    add_time_ids = torch.tensor([add_time_ids])
+                    add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+                    return add_time_ids
+
+                add_time_ids = torch.cat(
+                    [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])]
+                ).repeat(2, 1)
+
+                # Get the text embedding for conditioning
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                    [text_encoder_one, text_encoder_two], [batch["input_ids_one"], batch["input_ids_two"]]
+                )
+                prompt_embeds = prompt_embeds.repeat(2, 1, 1)
+                pooled_prompt_embeds = pooled_prompt_embeds.repeat(2, 1)
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_model_input,
+                    timesteps,
+                    prompt_embeds,
+                    added_cond_kwargs={"time_ids": add_time_ids, "text_embeds": pooled_prompt_embeds},
+                ).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # ODDS ratio loss.
+                # In the diffusion formulation, we're assuming that the MSE loss
+                # approximates the logp.
+                model_losses = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                model_losses = model_losses.mean(dim=list(range(1, len(model_losses.shape))))
+                model_losses_w, model_losses_l = model_losses.chunk(2)
+                log_odds = model_losses_w - model_losses_l
+
+                # Ratio loss.
+                ratio = F.logsigmoid(log_odds)
+                ratio_losses = args.beta_orpo * ratio
+
+                # Full ORPO loss
+                loss = model_losses_w.mean() - ratio_losses.mean()
+
+                # Backprop.
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.run_validation and global_step % args.validation_steps == 0:
+                        log_validation(
+                            args, unet=unet, vae=vae, accelerator=accelerator, weight_dtype=weight_dtype, epoch=epoch
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        unet = unet.to(torch.float32)
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        StableDiffusionXLLoraLoaderMixin.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_state_dict,
+            text_encoder_lora_layers=None,
+            text_encoder_2_lora_layers=None,
+        )
+
+        # Final validation?
+        if args.run_validation:
+            log_validation(
+                args,
+                unet=None,
+                vae=vae,
+                accelerator=accelerator,
+                weight_dtype=weight_dtype,
+                epoch=epoch,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/diffusion_orpo/train_diffusion_orpo_sdxl_lora_wds.py b/diffusers/examples/research_projects/diffusion_orpo/train_diffusion_orpo_sdxl_lora_wds.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd1ef265d23eb3518847c00db1ea111afc6f1ba5
--- /dev/null
+++ b/diffusers/examples/research_projects/diffusion_orpo/train_diffusion_orpo_sdxl_lora_wds.py
@@ -0,0 +1,1095 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import contextlib
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+import wandb
+import webdataset as wds
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionXLLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, convert_unet_state_dict_to_peft
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.25.0.dev0")
+
+logger = get_logger(__name__)
+
+
+VALIDATION_PROMPTS = [
+    "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography",
+    "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
+    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+    "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
+]
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def log_validation(args, unet, vae, accelerator, weight_dtype, epoch, is_final_validation=False):
+    logger.info(f"Running validation... \n Generating images with prompts:\n" f" {VALIDATION_PROMPTS}.")
+
+    if is_final_validation:
+        if args.mixed_precision == "fp16":
+            vae.to(weight_dtype)
+
+    # create pipeline
+    pipeline = DiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        vae=vae,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    if not is_final_validation:
+        pipeline.unet = accelerator.unwrap_model(unet)
+    else:
+        pipeline.load_lora_weights(args.output_dir, weight_name="pytorch_lora_weights.safetensors")
+
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    images = []
+    context = contextlib.nullcontext() if is_final_validation else torch.cuda.amp.autocast()
+
+    guidance_scale = 5.0
+    num_inference_steps = 25
+    for prompt in VALIDATION_PROMPTS:
+        with context:
+            image = pipeline(
+                prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, generator=generator
+            ).images[0]
+            images.append(image)
+
+    tracker_key = "test" if is_final_validation else "validation"
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(tracker_key, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    tracker_key: [
+                        wandb.Image(image, caption=f"{i}: {VALIDATION_PROMPTS[i]}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    # Also log images without the LoRA params for comparison.
+    if is_final_validation:
+        pipeline.disable_lora()
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+        no_lora_images = [
+            pipeline(
+                prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, generator=generator
+            ).images[0]
+            for prompt in VALIDATION_PROMPTS
+        ]
+
+        for tracker in accelerator.trackers:
+            if tracker.name == "tensorboard":
+                np_images = np.stack([np.asarray(img) for img in no_lora_images])
+                tracker.writer.add_images("test_without_lora", np_images, epoch, dataformats="NHWC")
+            if tracker.name == "wandb":
+                tracker.log(
+                    {
+                        "test_without_lora": [
+                            wandb.Image(image, caption=f"{i}: {VALIDATION_PROMPTS[i]}")
+                            for i, image in enumerate(no_lora_images)
+                        ]
+                    }
+                )
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_path",
+        type=str,
+        default="pipe:aws s3 cp s3://diffusion-preference-opt/{00000..00644}.tar -",
+    )
+    parser.add_argument(
+        "--num_train_examples",
+        type=int,
+        default=1001352,
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--run_validation",
+        default=False,
+        action="store_true",
+        help="Whether to run validation inference in between training and also after training. Helps to track progress.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=200,
+        help="Run validation every X steps.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="diffusion-orpo-lora-sdxl",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--vae_encode_batch_size",
+        type=int,
+        default=8,
+        help="Batch size to use for VAE encoding of the images for efficient processing.",
+    )
+    parser.add_argument(
+        "--no_hflip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--random_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to random crop the input images to the resolution. If not set, the images will be center-cropped."
+        ),
+    )
+    parser.add_argument("--global_batch_size", type=int, default=64, help="Total batch size.")
+    parser.add_argument(
+        "--per_gpu_batch_size", type=int, default=8, help="Number of samples in a batch for a single GPU."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--beta_orpo",
+        type=float,
+        default=0.1,
+        help="ORPO contribution factor.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--tracker_name",
+        type=str,
+        default="diffusion-orpo-lora-sdxl",
+        help=("The name of the tracker to report results to."),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    return args
+
+
+def tokenize_captions(tokenizers, sample):
+    tokens_one = tokenizers[0](
+        sample["original_prompt"],
+        truncation=True,
+        padding="max_length",
+        max_length=tokenizers[0].model_max_length,
+        return_tensors="pt",
+    ).input_ids
+    tokens_two = tokenizers[1](
+        sample["original_prompt"],
+        truncation=True,
+        padding="max_length",
+        max_length=tokenizers[1].model_max_length,
+        return_tensors="pt",
+    ).input_ids
+
+    return tokens_one, tokens_two
+
+
+@torch.no_grad()
+def encode_prompt(text_encoders, text_input_ids_list):
+    prompt_embeds_list = []
+
+    for i, text_encoder in enumerate(text_encoders):
+        text_input_ids = text_input_ids_list[i]
+
+        prompt_embeds = text_encoder(
+            text_input_ids.to(text_encoder.device),
+            output_hidden_states=True,
+        )
+
+        # We are only ALWAYS interested in the pooled output of the final text encoder
+        pooled_prompt_embeds = prompt_embeds[0]
+        prompt_embeds = prompt_embeds.hidden_states[-2]
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+        prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def get_dataset(args):
+    dataset = (
+        wds.WebDataset(args.dataset_path, resampled=True, handler=wds.warn_and_continue)
+        .shuffle(690, handler=wds.warn_and_continue)
+        .decode("pil", handler=wds.warn_and_continue)
+        .rename(
+            original_prompt="original_prompt.txt",
+            jpg_0="jpg_0.jpg",
+            jpg_1="jpg_1.jpg",
+            label_0="label_0.txt",
+            label_1="label_1.txt",
+            handler=wds.warn_and_continue,
+        )
+    )
+    return dataset
+
+
+def get_loader(args, tokenizer_one, tokenizer_two):
+    # 1,001,352
+    num_batches = math.ceil(args.num_train_examples / args.global_batch_size)
+    num_worker_batches = math.ceil(
+        args.num_train_examples / (args.global_batch_size * args.dataloader_num_workers)
+    )  # per dataloader worker
+    num_batches = num_worker_batches * args.dataloader_num_workers
+    num_samples = num_batches * args.global_batch_size
+
+    dataset = get_dataset(args)
+
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
+    train_crop = transforms.RandomCrop(args.resolution) if args.random_crop else transforms.CenterCrop(args.resolution)
+    train_flip = transforms.RandomHorizontalFlip(p=1.0)
+    to_tensor = transforms.ToTensor()
+    normalize = transforms.Normalize([0.5], [0.5])
+
+    def preprocess_images(sample):
+        jpg_0_image = sample["jpg_0"]
+        original_size = (jpg_0_image.height, jpg_0_image.width)
+        crop_top_left = []
+
+        jpg_1_image = sample["jpg_1"]
+        # Need to bring down the image to the same resolution.
+        # This seems like the simplest reasonable approach.
+        # "::-1" because PIL resize takes (width, height).
+        jpg_1_image = jpg_1_image.resize(original_size[::-1])
+
+        # We randomize selection and rejection.
+        label_0 = sample["label_0"]
+        if sample["label_0"] == 0.5:
+            if random.random() < 0.5:
+                label_0 = 0
+            else:
+                label_0 = 1
+
+        # Double on channel dim, jpg_y then jpg_w
+        if label_0 == 0:
+            pixel_values = torch.cat([to_tensor(image) for image in [jpg_1_image, jpg_0_image]])
+        else:
+            pixel_values = torch.cat([to_tensor(image) for image in [jpg_0_image, jpg_1_image]])
+
+        # Resize.
+        combined_im = train_resize(pixel_values)
+
+        # Flipping.
+        if not args.no_hflip and random.random() < 0.5:
+            combined_im = train_flip(combined_im)
+
+        # Cropping.
+        if not args.random_crop:
+            y1 = max(0, int(round((combined_im.shape[1] - args.resolution) / 2.0)))
+            x1 = max(0, int(round((combined_im.shape[2] - args.resolution) / 2.0)))
+            combined_im = train_crop(combined_im)
+        else:
+            y1, x1, h, w = train_crop.get_params(combined_im, (args.resolution, args.resolution))
+            combined_im = crop(combined_im, y1, x1, h, w)
+
+        crop_top_left = (y1, x1)
+        combined_im = normalize(combined_im)
+        tokens_one, tokens_two = tokenize_captions([tokenizer_one, tokenizer_two], sample)
+
+        return {
+            "pixel_values": combined_im,
+            "original_size": original_size,
+            "crop_top_left": crop_top_left,
+            "tokens_one": tokens_one,
+            "tokens_two": tokens_two,
+        }
+
+    def collate_fn(samples):
+        pixel_values = torch.stack([sample["pixel_values"] for sample in samples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+        original_sizes = [example["original_size"] for example in samples]
+        crop_top_lefts = [example["crop_top_left"] for example in samples]
+        input_ids_one = torch.stack([example["tokens_one"] for example in samples])
+        input_ids_two = torch.stack([example["tokens_two"] for example in samples])
+
+        return {
+            "pixel_values": pixel_values,
+            "input_ids_one": input_ids_one,
+            "input_ids_two": input_ids_two,
+            "original_sizes": original_sizes,
+            "crop_top_lefts": crop_top_lefts,
+        }
+
+    dataset = dataset.map(preprocess_images, handler=wds.warn_and_continue)
+    dataset = dataset.batched(args.per_gpu_batch_size, partial=False, collation_fn=collate_fn)
+    dataset = dataset.with_epoch(num_worker_batches)
+
+    dataloader = wds.WebLoader(
+        dataset,
+        batch_size=None,
+        shuffle=False,
+        num_workers=args.dataloader_num_workers,
+        pin_memory=True,
+        persistent_workers=True,
+    )
+    # add meta-data to dataloader instance for convenience
+    dataloader.num_batches = num_batches
+    dataloader.num_samples = num_samples
+    return dataloader
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # We only train the additional adapter LoRA layers
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet and text_encoders to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    # The VAE is always in float32 to avoid NaN losses.
+    vae.to(accelerator.device, dtype=torch.float32)
+
+    # Set up LoRA.
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    # Add adapter and make sure the trainable params are in float32.
+    unet.add_adapter(unet_lora_config)
+    if args.mixed_precision == "fp16":
+        for param in unet.parameters():
+            # only upcast trainable parameters (LoRA) into fp32
+            if param.requires_grad:
+                param.data = param.to(torch.float32)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder atten layers
+            unet_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(accelerator.unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionXLLoraLoaderMixin.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=None,
+                text_encoder_2_lora_layers=None,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(accelerator.unwrap_model(unet))):
+                unet_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = StableDiffusionXLLoraLoaderMixin.lora_state_dict(input_dir)
+
+        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rat * args.gradient_accumulation_steps * args.per_gpu_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = list(filter(lambda p: p.requires_grad, unet.parameters()))
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Dataset and DataLoaders creation:
+    args.global_batch_size = args.per_gpu_batch_size * accelerator.num_processes
+    train_dataloader = get_loader(args, tokenizer_one=tokenizer_one, tokenizer_two=tokenizer_two)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers(args.tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.per_gpu_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {train_dataloader.num_samples}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.per_gpu_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    unet.train()
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # (batch_size, 2*channels, h, w) -> (2*batch_size, channels, h, w)
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype, device=accelerator.device, non_blocking=True)
+                feed_pixel_values = torch.cat(pixel_values.chunk(2, dim=1))
+
+                latents = []
+                for i in range(0, feed_pixel_values.shape[0], args.vae_encode_batch_size):
+                    latents.append(
+                        vae.encode(feed_pixel_values[i : i + args.vae_encode_batch_size]).latent_dist.sample()
+                    )
+                latents = torch.cat(latents, dim=0)
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents).chunk(2)[0].repeat(2, 1, 1, 1)
+
+                # Sample a random timestep for each image
+                bsz = latents.shape[0] // 2
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device, dtype=torch.long
+                ).repeat(2)
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # time ids
+                def compute_time_ids(original_size, crops_coords_top_left):
+                    # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+                    target_size = (args.resolution, args.resolution)
+                    add_time_ids = list(tuple(original_size) + tuple(crops_coords_top_left) + target_size)
+                    add_time_ids = torch.tensor([add_time_ids])
+                    add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+                    return add_time_ids
+
+                add_time_ids = torch.cat(
+                    [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])]
+                ).repeat(2, 1)
+
+                # Get the text embedding for conditioning
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                    [text_encoder_one, text_encoder_two], [batch["input_ids_one"], batch["input_ids_two"]]
+                )
+                prompt_embeds = prompt_embeds.repeat(2, 1, 1)
+                pooled_prompt_embeds = pooled_prompt_embeds.repeat(2, 1)
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_model_input,
+                    timesteps,
+                    prompt_embeds,
+                    added_cond_kwargs={"time_ids": add_time_ids, "text_embeds": pooled_prompt_embeds},
+                ).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # ODDS ratio loss.
+                # In the diffusion formulation, we're assuming that the MSE loss
+                # approximates the logp.
+                model_losses = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                model_losses = model_losses.mean(dim=list(range(1, len(model_losses.shape))))
+                model_losses_w, model_losses_l = model_losses.chunk(2)
+                log_odds = model_losses_w - model_losses_l
+
+                # Ratio loss.
+                ratio = F.logsigmoid(log_odds)
+                ratio_losses = args.beta_orpo * ratio
+
+                # Full ORPO loss
+                loss = model_losses_w.mean() - ratio_losses.mean()
+
+                # Backprop.
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.run_validation and global_step % args.validation_steps == 0:
+                        log_validation(
+                            args, unet=unet, vae=vae, accelerator=accelerator, weight_dtype=weight_dtype, epoch=epoch
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        unet = unet.to(torch.float32)
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        StableDiffusionXLLoraLoaderMixin.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_state_dict,
+            text_encoder_lora_layers=None,
+            text_encoder_2_lora_layers=None,
+        )
+
+        # Final validation?
+        if args.run_validation:
+            log_validation(
+                args,
+                unet=None,
+                vae=vae,
+                accelerator=accelerator,
+                weight_dtype=weight_dtype,
+                epoch=epoch,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/dreambooth_inpaint/README.md b/diffusers/examples/research_projects/dreambooth_inpaint/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..46703fa982e50ecaea2c71e07c564e5509215dcb
--- /dev/null
+++ b/diffusers/examples/research_projects/dreambooth_inpaint/README.md
@@ -0,0 +1,118 @@
+# Dreambooth for the inpainting model
+
+This script was added by @thedarkzeno .
+
+Please note that this script is not actively maintained, you can open an issue and tag @thedarkzeno or @patil-suraj though.
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-inpainting"
+export INSTANCE_DIR="path-to-instance-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth_inpaint.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=400
+```
+
+### Training with prior-preservation loss
+
+Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases.
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-inpainting"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth_inpaint.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+
+### Training with gradient checkpointing and 8-bit optimizer:
+
+With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.
+
+To install `bitandbytes` please refer to this [readme](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-inpainting"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth_inpaint.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=2 --gradient_checkpointing \
+  --use_8bit_adam \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Fine-tune text encoder with the UNet.
+
+The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces.
+Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
+
+___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-inpainting"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth_inpaint.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --train_text_encoder \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --use_8bit_adam \
+  --gradient_checkpointing \
+  --learning_rate=2e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
diff --git a/diffusers/examples/research_projects/dreambooth_inpaint/requirements.txt b/diffusers/examples/research_projects/dreambooth_inpaint/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..aad6387026f181053d1872fd1961c7b56e86f1df
--- /dev/null
+++ b/diffusers/examples/research_projects/dreambooth_inpaint/requirements.txt
@@ -0,0 +1,7 @@
+diffusers==0.9.0
+accelerate>=0.16.0
+torchvision
+transformers>=4.21.0
+ftfy
+tensorboard
+Jinja2
diff --git a/diffusers/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py b/diffusers/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b484c89fdbd87720084675ea106fdf1223ce231
--- /dev/null
+++ b/diffusers/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py
@@ -0,0 +1,812 @@
+import argparse
+import itertools
+import math
+import os
+import random
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from PIL import Image, ImageDraw
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    StableDiffusionInpaintPipeline,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.13.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def prepare_mask_and_masked_image(image, mask):
+    image = np.array(image.convert("RGB"))
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    mask = np.array(mask.convert("L"))
+    mask = mask.astype(np.float32) / 255.0
+    mask = mask[None, None]
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
+    mask = torch.from_numpy(mask)
+
+    masked_image = image * (mask < 0.5)
+
+    return mask, masked_image
+
+
+# generate random masks
+def random_mask(im_shape, ratio=1, mask_full_image=False):
+    mask = Image.new("L", im_shape, 0)
+    draw = ImageDraw.Draw(mask)
+    size = (random.randint(0, int(im_shape[0] * ratio)), random.randint(0, int(im_shape[1] * ratio)))
+    # use this to always mask the whole image
+    if mask_full_image:
+        size = (int(im_shape[0] * ratio), int(im_shape[1] * ratio))
+    limits = (im_shape[0] - size[0] // 2, im_shape[1] - size[1] // 2)
+    center = (random.randint(size[0] // 2, limits[0]), random.randint(size[1] // 2, limits[1]))
+    draw_type = random.randint(0, 1)
+    if draw_type == 0 or mask_full_image:
+        draw.rectangle(
+            (center[0] - size[0] // 2, center[1] - size[1] // 2, center[0] + size[0] // 2, center[1] + size[1] // 2),
+            fill=255,
+        )
+    else:
+        draw.ellipse(
+            (center[0] - size[0] // 2, center[1] - size[1] // 2, center[0] + size[0] // 2, center[1] + size[1] // 2),
+            fill=255,
+        )
+
+    return mask
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
+            " sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint and are suitable for resuming training"
+            " using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.instance_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        size=512,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms_resize_and_crop = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+            ]
+        )
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        instance_image = self.image_transforms_resize_and_crop(instance_image)
+
+        example["PIL_images"] = instance_image
+        example["instance_images"] = self.image_transforms(instance_image)
+
+        example["instance_prompt_ids"] = self.tokenizer(
+            self.instance_prompt,
+            padding="do_not_pad",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+        ).input_ids
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            class_image = self.image_transforms_resize_and_crop(class_image)
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_PIL_images"] = class_image
+            example["class_prompt_ids"] = self.tokenizer(
+                self.class_prompt,
+                padding="do_not_pad",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids
+
+        return example
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def main():
+    args = parse_args()
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with="tensorboard",
+        project_config=project_config,
+    )
+
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            pipeline = StableDiffusionInpaintPipeline.from_pretrained(
+                args.pretrained_model_name_or_path, torch_dtype=torch_dtype, safety_checker=None
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(
+                sample_dataset, batch_size=args.sample_batch_size, num_workers=1
+            )
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+            transform_to_pil = transforms.ToPILImage()
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                bsz = len(example["prompt"])
+                fake_images = torch.rand((3, args.resolution, args.resolution))
+                transform_to_pil = transforms.ToPILImage()
+                fake_pil_images = transform_to_pil(fake_images)
+
+                fake_mask = random_mask((args.resolution, args.resolution), ratio=1, mask_full_image=True)
+
+                images = pipeline(prompt=example["prompt"], mask_image=fake_mask, image=fake_pil_images).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+
+    vae.requires_grad_(False)
+    if not args.train_text_encoder:
+        text_encoder.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    params_to_optimize = (
+        itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
+    )
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+    )
+
+    def collate_fn(examples):
+        input_ids = [example["instance_prompt_ids"] for example in examples]
+        pixel_values = [example["instance_images"] for example in examples]
+
+        # Concat class and instance examples for prior preservation.
+        # We do this to avoid doing two forward passes.
+        if args.with_prior_preservation:
+            input_ids += [example["class_prompt_ids"] for example in examples]
+            pixel_values += [example["class_images"] for example in examples]
+            pior_pil = [example["class_PIL_images"] for example in examples]
+
+        masks = []
+        masked_images = []
+        for example in examples:
+            pil_image = example["PIL_images"]
+            # generate a random mask
+            mask = random_mask(pil_image.size, 1, False)
+            # prepare mask and masked image
+            mask, masked_image = prepare_mask_and_masked_image(pil_image, mask)
+
+            masks.append(mask)
+            masked_images.append(masked_image)
+
+        if args.with_prior_preservation:
+            for pil_image in pior_pil:
+                # generate a random mask
+                mask = random_mask(pil_image.size, 1, False)
+                # prepare mask and masked image
+                mask, masked_image = prepare_mask_and_masked_image(pil_image, mask)
+
+                masks.append(mask)
+                masked_images.append(masked_image)
+
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+        input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids
+        masks = torch.stack(masks)
+        masked_images = torch.stack(masked_images)
+        batch = {"input_ids": input_ids, "pixel_values": pixel_values, "masks": masks, "masked_images": masked_images}
+        return batch
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    if args.train_text_encoder:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+    accelerator.register_for_checkpointing(lr_scheduler)
+
+    weight_dtype = torch.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move text_encode and vae to gpu.
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    vae.to(accelerator.device, dtype=weight_dtype)
+    if not args.train_text_encoder:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("dreambooth", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Convert masked images to latent space
+                masked_latents = vae.encode(
+                    batch["masked_images"].reshape(batch["pixel_values"].shape).to(dtype=weight_dtype)
+                ).latent_dist.sample()
+                masked_latents = masked_latents * vae.config.scaling_factor
+
+                masks = batch["masks"]
+                # resize the mask to latents shape as we concatenate the mask to the latents
+                mask = torch.stack(
+                    [
+                        torch.nn.functional.interpolate(mask, size=(args.resolution // 8, args.resolution // 8))
+                        for mask in masks
+                    ]
+                )
+                mask = mask.reshape(-1, 1, args.resolution // 8, args.resolution // 8)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # concatenate the noised latents with the mask and the masked latents
+                latent_model_input = torch.cat([noisy_latents, mask, masked_latents], dim=1)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Predict the noise residual
+                noise_pred = unet(latent_model_input, timesteps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
+                    noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute instance loss
+                    loss = F.mse_loss(noise_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
+
+                    # Compute prior loss
+                    prior_loss = F.mse_loss(noise_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+                else:
+                    loss = F.mse_loss(noise_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(unet.parameters(), text_encoder.parameters())
+                        if args.train_text_encoder
+                        else unet.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        accelerator.wait_for_everyone()
+
+    # Create the pipeline using using the trained modules and save it.
+    if accelerator.is_main_process:
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            unet=accelerator.unwrap_model(unet),
+            text_encoder=accelerator.unwrap_model(text_encoder),
+        )
+        pipeline.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py b/diffusers/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..720a69b4e7917d858a97c8855229efcf8a239c99
--- /dev/null
+++ b/diffusers/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py
@@ -0,0 +1,831 @@
+import argparse
+import math
+import os
+import random
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from PIL import Image, ImageDraw
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel
+from diffusers.loaders import AttnProcsLayers
+from diffusers.models.attention_processor import LoRAAttnProcessor
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.13.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def prepare_mask_and_masked_image(image, mask):
+    image = np.array(image.convert("RGB"))
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    mask = np.array(mask.convert("L"))
+    mask = mask.astype(np.float32) / 255.0
+    mask = mask[None, None]
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
+    mask = torch.from_numpy(mask)
+
+    masked_image = image * (mask < 0.5)
+
+    return mask, masked_image
+
+
+# generate random masks
+def random_mask(im_shape, ratio=1, mask_full_image=False):
+    mask = Image.new("L", im_shape, 0)
+    draw = ImageDraw.Draw(mask)
+    size = (random.randint(0, int(im_shape[0] * ratio)), random.randint(0, int(im_shape[1] * ratio)))
+    # use this to always mask the whole image
+    if mask_full_image:
+        size = (int(im_shape[0] * ratio), int(im_shape[1] * ratio))
+    limits = (im_shape[0] - size[0] // 2, im_shape[1] - size[1] // 2)
+    center = (random.randint(size[0] // 2, limits[0]), random.randint(size[1] // 2, limits[1]))
+    draw_type = random.randint(0, 1)
+    if draw_type == 0 or mask_full_image:
+        draw.rectangle(
+            (center[0] - size[0] // 2, center[1] - size[1] // 2, center[0] + size[0] // 2, center[1] + size[1] // 2),
+            fill=255,
+        )
+    else:
+        draw.ellipse(
+            (center[0] - size[0] // 2, center[1] - size[1] // 2, center[0] + size[0] // 2, center[1] + size[1] // 2),
+            fill=255,
+        )
+
+    return mask
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
+            " sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="dreambooth-inpaint-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint and are suitable for resuming training"
+            " using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.instance_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        size=512,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms_resize_and_crop = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+            ]
+        )
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        instance_image = self.image_transforms_resize_and_crop(instance_image)
+
+        example["PIL_images"] = instance_image
+        example["instance_images"] = self.image_transforms(instance_image)
+
+        example["instance_prompt_ids"] = self.tokenizer(
+            self.instance_prompt,
+            padding="do_not_pad",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+        ).input_ids
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            class_image = self.image_transforms_resize_and_crop(class_image)
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_PIL_images"] = class_image
+            example["class_prompt_ids"] = self.tokenizer(
+                self.class_prompt,
+                padding="do_not_pad",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids
+
+        return example
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def main():
+    args = parse_args()
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with="tensorboard",
+        project_config=accelerator_project_config,
+    )
+
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            pipeline = StableDiffusionInpaintPipeline.from_pretrained(
+                args.pretrained_model_name_or_path, torch_dtype=torch_dtype, safety_checker=None
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(
+                sample_dataset, batch_size=args.sample_batch_size, num_workers=1
+            )
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+            transform_to_pil = transforms.ToPILImage()
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                bsz = len(example["prompt"])
+                fake_images = torch.rand((3, args.resolution, args.resolution))
+                transform_to_pil = transforms.ToPILImage()
+                fake_pil_images = transform_to_pil(fake_images)
+
+                fake_mask = random_mask((args.resolution, args.resolution), ratio=1, mask_full_image=True)
+
+                images = pipeline(prompt=example["prompt"], mask_image=fake_mask, image=fake_pil_images).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+
+    # We only train the additional adapter LoRA layers
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    weight_dtype = torch.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move text_encode and vae to gpu.
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # now we will add new LoRA weights to the attention layers
+    # It's important to realize here how many attention weights will be added and of which sizes
+    # The sizes of the attention layers consist only of two different variables:
+    # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`.
+    # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`.
+
+    # Let's first see how many attention processors we will have to set.
+    # For Stable Diffusion, it should be equal to:
+    # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12
+    # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2
+    # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18
+    # => 32 layers
+
+    # Set correct lora layers
+    lora_attn_procs = {}
+    for name in unet.attn_processors.keys():
+        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = unet.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = unet.config.block_out_channels[block_id]
+
+        lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+
+    unet.set_attn_processor(lora_attn_procs)
+    lora_layers = AttnProcsLayers(unet.attn_processors)
+
+    accelerator.register_for_checkpointing(lora_layers)
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    optimizer = optimizer_class(
+        lora_layers.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+    )
+
+    def collate_fn(examples):
+        input_ids = [example["instance_prompt_ids"] for example in examples]
+        pixel_values = [example["instance_images"] for example in examples]
+
+        # Concat class and instance examples for prior preservation.
+        # We do this to avoid doing two forward passes.
+        if args.with_prior_preservation:
+            input_ids += [example["class_prompt_ids"] for example in examples]
+            pixel_values += [example["class_images"] for example in examples]
+            pior_pil = [example["class_PIL_images"] for example in examples]
+
+        masks = []
+        masked_images = []
+        for example in examples:
+            pil_image = example["PIL_images"]
+            # generate a random mask
+            mask = random_mask(pil_image.size, 1, False)
+            # prepare mask and masked image
+            mask, masked_image = prepare_mask_and_masked_image(pil_image, mask)
+
+            masks.append(mask)
+            masked_images.append(masked_image)
+
+        if args.with_prior_preservation:
+            for pil_image in pior_pil:
+                # generate a random mask
+                mask = random_mask(pil_image.size, 1, False)
+                # prepare mask and masked image
+                mask, masked_image = prepare_mask_and_masked_image(pil_image, mask)
+
+                masks.append(mask)
+                masked_images.append(masked_image)
+
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+        input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids
+        masks = torch.stack(masks)
+        masked_images = torch.stack(masked_images)
+        batch = {"input_ids": input_ids, "pixel_values": pixel_values, "masks": masks, "masked_images": masked_images}
+        return batch
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    # Prepare everything with our `accelerator`.
+    lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        lora_layers, optimizer, train_dataloader, lr_scheduler
+    )
+    # accelerator.register_for_checkpointing(lr_scheduler)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("dreambooth-inpaint-lora", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Convert masked images to latent space
+                masked_latents = vae.encode(
+                    batch["masked_images"].reshape(batch["pixel_values"].shape).to(dtype=weight_dtype)
+                ).latent_dist.sample()
+                masked_latents = masked_latents * vae.config.scaling_factor
+
+                masks = batch["masks"]
+                # resize the mask to latents shape as we concatenate the mask to the latents
+                mask = torch.stack(
+                    [
+                        torch.nn.functional.interpolate(mask, size=(args.resolution // 8, args.resolution // 8))
+                        for mask in masks
+                    ]
+                ).to(dtype=weight_dtype)
+                mask = mask.reshape(-1, 1, args.resolution // 8, args.resolution // 8)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # concatenate the noised latents with the mask and the masked latents
+                latent_model_input = torch.cat([noisy_latents, mask, masked_latents], dim=1)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Predict the noise residual
+                noise_pred = unet(latent_model_input, timesteps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
+                    noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute instance loss
+                    loss = F.mse_loss(noise_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
+
+                    # Compute prior loss
+                    prior_loss = F.mse_loss(noise_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+                else:
+                    loss = F.mse_loss(noise_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = lora_layers.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        accelerator.wait_for_everyone()
+
+    # Save the lora layers
+    if accelerator.is_main_process:
+        unet = unet.to(torch.float32)
+        unet.save_attn_procs(args.output_dir)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/geodiff/README.md b/diffusers/examples/research_projects/geodiff/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c85bbdf792cf7b7185f5df0b4ad24863e367eb3a
--- /dev/null
+++ b/diffusers/examples/research_projects/geodiff/README.md
@@ -0,0 +1,6 @@
+# GeoDiff
+
+> [!TIP]
+> This notebook is not actively maintained by the Diffusers team. For any questions or comments, please contact [natolambert](https://twitter.com/natolambert).
+
+This is an experimental research notebook demonstrating how to generate stable 3D structures of molecules with [GeoDiff](https://github.com/MinkaiXu/GeoDiff) and Diffusers.
diff --git a/diffusers/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb b/diffusers/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..bde093802a5d057570ac7f891bae4fbb9bb16602
--- /dev/null
+++ b/diffusers/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb
@@ -0,0 +1,3652 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "F88mignPnalS"
+      },
+      "source": [
+        "# Introduction\n",
+        "\n",
+        "This colab is design to run the pretrained models from [GeoDiff](https://github.com/MinkaiXu/GeoDiff).\n",
+        "The visualization code is inspired by this PyMol [colab](https://colab.research.google.com/gist/iwatobipen/2ec7faeafe5974501e69fcc98c122922/pymol.ipynb#scrollTo=Hm4kY7CaZSlw).\n",
+        "\n",
+        "The goal is to generate physically accurate molecules. Given the input of a molecule graph (atom and bond structures with their connectivity -- in the form of a 2d graph). What we want to generate is a stable 3d structure of the molecule.\n",
+        "\n",
+        "This colab uses GEOM datasets that have multiple 3d targets per configuration, which provide more compelling targets for generative methods.\n",
+        "\n",
+        "> Colab made by [natolambert](https://twitter.com/natolambert).\n",
+        "\n",
+        "![diffusers_library](https://github.com/huggingface/diffusers/raw/main/docs/source/imgs/diffusers_library.jpg)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "7cnwXMocnuzB"
+      },
+      "source": [
+        "## Installations\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Install Conda"
+      ],
+      "metadata": {
+        "id": "ff9SxWnaNId9"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "1g_6zOabItDk"
+      },
+      "source": [
+        "Here we check the `cuda` version of colab. When this was built, the version was always 11.1, which impacts some installation decisions below."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "K0ofXobG5Y-X",
+        "outputId": "572c3d25-6f19-4c1e-83f5-a1d084a3207f"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "nvcc: NVIDIA (R) Cuda compiler driver\n",
+            "Copyright (c) 2005-2021 NVIDIA Corporation\n",
+            "Built on Sun_Feb_14_21:12:58_PST_2021\n",
+            "Cuda compilation tools, release 11.2, V11.2.152\n",
+            "Build cuda_11.2.r11.2/compiler.29618528_0\n"
+          ]
+        }
+      ],
+      "source": [
+        "!nvcc --version"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "VfthW90vI0nw"
+      },
+      "source": [
+        "Install Conda for some more complex dependencies for geometric networks."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "2WNFzSnbiE0k",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "690d0d4d-9d0a-4ead-c6dc-086f113f532f"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+            "\u001b[0m"
+          ]
+        }
+      ],
+      "source": [
+        "!pip install -q condacolab"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "NUsbWYCUI7Km"
+      },
+      "source": [
+        "Setup Conda"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "FZelreINdmd0",
+        "outputId": "635f0cb8-0af4-499f-e0a4-b3790cb12e9f"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "✨🍰✨ Everything looks OK!\n"
+          ]
+        }
+      ],
+      "source": [
+        "import condacolab\n",
+        "condacolab.install()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "JzDHaPU7I9Sn"
+      },
+      "source": [
+        "Install pytorch requirements (this takes a few minutes, go grab yourself a coffee 🤗)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "JMxRjHhL7w8V",
+        "outputId": "6ed511b3-9262-49e8-b340-08e76b05ebd8"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Collecting package metadata (current_repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\bdone\n",
+            "Solving environment: \\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\bdone\n",
+            "\n",
+            "## Package Plan ##\n",
+            "\n",
+            "  environment location: /usr/local\n",
+            "\n",
+            "  added / updated specs:\n",
+            "    - cudatoolkit=11.1\n",
+            "    - pytorch\n",
+            "    - torchaudio\n",
+            "    - torchvision\n",
+            "\n",
+            "\n",
+            "The following packages will be downloaded:\n",
+            "\n",
+            "    package                    |            build\n",
+            "    ---------------------------|-----------------\n",
+            "    conda-22.9.0               |   py37h89c1867_1         960 KB  conda-forge\n",
+            "    ------------------------------------------------------------\n",
+            "                                           Total:         960 KB\n",
+            "\n",
+            "The following packages will be UPDATED:\n",
+            "\n",
+            "  conda                               4.14.0-py37h89c1867_0 --> 22.9.0-py37h89c1867_1\n",
+            "\n",
+            "\n",
+            "\n",
+            "Downloading and Extracting Packages\n",
+            "conda-22.9.0         | 960 KB    | : 100% 1.0/1 [00:00<00:00,  4.15it/s]\n",
+            "Preparing transaction: / \b\bdone\n",
+            "Verifying transaction: \\ \b\bdone\n",
+            "Executing transaction: / \b\bdone\n",
+            "Retrieving notices: ...working... done\n"
+          ]
+        }
+      ],
+      "source": [
+        "!conda install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch-lts -c nvidia\n",
+        "# !conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=11.1 -c pytorch -c conda-forge"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Need to remove a pathspec for colab that specifies the incorrect cuda version."
+      ],
+      "metadata": {
+        "id": "QDS6FPZ0Tu5b"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!rm /usr/local/conda-meta/pinned"
+      ],
+      "metadata": {
+        "id": "dq1lxR10TtrR",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "ed9c5a71-b449-418f-abb7-072b74e7f6c8"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "rm: cannot remove '/usr/local/conda-meta/pinned': No such file or directory\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Z1L3DdZOJB30"
+      },
+      "source": [
+        "Install torch geometric (used in the model later)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "D5ukfCOWfjzK",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "8437485a-5aa6-4d53-8f7f-23517ac1ace6"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Collecting package metadata (current_repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n",
+            "Solving environment: | \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\bdone\n",
+            "\n",
+            "## Package Plan ##\n",
+            "\n",
+            "  environment location: /usr/local\n",
+            "\n",
+            "  added / updated specs:\n",
+            "    - pytorch-geometric=1.7.2\n",
+            "\n",
+            "\n",
+            "The following packages will be downloaded:\n",
+            "\n",
+            "    package                    |            build\n",
+            "    ---------------------------|-----------------\n",
+            "    decorator-4.4.2            |             py_0          11 KB  conda-forge\n",
+            "    googledrivedownloader-0.4  |     pyhd3deb0d_1           7 KB  conda-forge\n",
+            "    jinja2-3.1.2               |     pyhd8ed1ab_1          99 KB  conda-forge\n",
+            "    joblib-1.2.0               |     pyhd8ed1ab_0         205 KB  conda-forge\n",
+            "    markupsafe-2.1.1           |   py37h540881e_1          22 KB  conda-forge\n",
+            "    networkx-2.5.1             |     pyhd8ed1ab_0         1.2 MB  conda-forge\n",
+            "    pandas-1.2.3               |   py37hdc94413_0        11.8 MB  conda-forge\n",
+            "    pyparsing-3.0.9            |     pyhd8ed1ab_0          79 KB  conda-forge\n",
+            "    python-dateutil-2.8.2      |     pyhd8ed1ab_0         240 KB  conda-forge\n",
+            "    python-louvain-0.15        |     pyhd8ed1ab_1          13 KB  conda-forge\n",
+            "    pytorch-cluster-1.5.9      |py37_torch_1.8.0_cu111         1.2 MB  rusty1s\n",
+            "    pytorch-geometric-1.7.2    |py37_torch_1.8.0_cu111         445 KB  rusty1s\n",
+            "    pytorch-scatter-2.0.8      |py37_torch_1.8.0_cu111         6.1 MB  rusty1s\n",
+            "    pytorch-sparse-0.6.12      |py37_torch_1.8.0_cu111         2.9 MB  rusty1s\n",
+            "    pytorch-spline-conv-1.2.1  |py37_torch_1.8.0_cu111         736 KB  rusty1s\n",
+            "    pytz-2022.4                |     pyhd8ed1ab_0         232 KB  conda-forge\n",
+            "    scikit-learn-1.0.2         |   py37hf9e9bfc_0         7.8 MB  conda-forge\n",
+            "    scipy-1.7.3                |   py37hf2a6cf1_0        21.8 MB  conda-forge\n",
+            "    setuptools-59.8.0          |   py37h89c1867_1         1.0 MB  conda-forge\n",
+            "    threadpoolctl-3.1.0        |     pyh8a188c0_0          18 KB  conda-forge\n",
+            "    ------------------------------------------------------------\n",
+            "                                           Total:        55.9 MB\n",
+            "\n",
+            "The following NEW packages will be INSTALLED:\n",
+            "\n",
+            "  decorator          conda-forge/noarch::decorator-4.4.2-py_0 None\n",
+            "  googledrivedownlo~ conda-forge/noarch::googledrivedownloader-0.4-pyhd3deb0d_1 None\n",
+            "  jinja2             conda-forge/noarch::jinja2-3.1.2-pyhd8ed1ab_1 None\n",
+            "  joblib             conda-forge/noarch::joblib-1.2.0-pyhd8ed1ab_0 None\n",
+            "  markupsafe         conda-forge/linux-64::markupsafe-2.1.1-py37h540881e_1 None\n",
+            "  networkx           conda-forge/noarch::networkx-2.5.1-pyhd8ed1ab_0 None\n",
+            "  pandas             conda-forge/linux-64::pandas-1.2.3-py37hdc94413_0 None\n",
+            "  pyparsing          conda-forge/noarch::pyparsing-3.0.9-pyhd8ed1ab_0 None\n",
+            "  python-dateutil    conda-forge/noarch::python-dateutil-2.8.2-pyhd8ed1ab_0 None\n",
+            "  python-louvain     conda-forge/noarch::python-louvain-0.15-pyhd8ed1ab_1 None\n",
+            "  pytorch-cluster    rusty1s/linux-64::pytorch-cluster-1.5.9-py37_torch_1.8.0_cu111 None\n",
+            "  pytorch-geometric  rusty1s/linux-64::pytorch-geometric-1.7.2-py37_torch_1.8.0_cu111 None\n",
+            "  pytorch-scatter    rusty1s/linux-64::pytorch-scatter-2.0.8-py37_torch_1.8.0_cu111 None\n",
+            "  pytorch-sparse     rusty1s/linux-64::pytorch-sparse-0.6.12-py37_torch_1.8.0_cu111 None\n",
+            "  pytorch-spline-co~ rusty1s/linux-64::pytorch-spline-conv-1.2.1-py37_torch_1.8.0_cu111 None\n",
+            "  pytz               conda-forge/noarch::pytz-2022.4-pyhd8ed1ab_0 None\n",
+            "  scikit-learn       conda-forge/linux-64::scikit-learn-1.0.2-py37hf9e9bfc_0 None\n",
+            "  scipy              conda-forge/linux-64::scipy-1.7.3-py37hf2a6cf1_0 None\n",
+            "  threadpoolctl      conda-forge/noarch::threadpoolctl-3.1.0-pyh8a188c0_0 None\n",
+            "\n",
+            "The following packages will be DOWNGRADED:\n",
+            "\n",
+            "  setuptools                          65.3.0-py37h89c1867_0 --> 59.8.0-py37h89c1867_1 None\n",
+            "\n",
+            "\n",
+            "\n",
+            "Downloading and Extracting Packages\n",
+            "scikit-learn-1.0.2   | 7.8 MB    | : 100% 1.0/1 [00:01<00:00,  1.37s/it]              \n",
+            "pytorch-scatter-2.0. | 6.1 MB    | : 100% 1.0/1 [00:06<00:00,  6.18s/it]\n",
+            "pytorch-geometric-1. | 445 KB    | : 100% 1.0/1 [00:02<00:00,  2.53s/it]\n",
+            "scipy-1.7.3          | 21.8 MB   | : 100% 1.0/1 [00:03<00:00,  3.06s/it]\n",
+            "python-dateutil-2.8. | 240 KB    | : 100% 1.0/1 [00:00<00:00, 21.48it/s]\n",
+            "pytorch-spline-conv- | 736 KB    | : 100% 1.0/1 [00:01<00:00,  1.00s/it]\n",
+            "pytorch-sparse-0.6.1 | 2.9 MB    | : 100% 1.0/1 [00:07<00:00,  7.51s/it]\n",
+            "pyparsing-3.0.9      | 79 KB     | : 100% 1.0/1 [00:00<00:00, 26.32it/s]\n",
+            "pytorch-cluster-1.5. | 1.2 MB    | : 100% 1.0/1 [00:02<00:00,  2.78s/it]\n",
+            "jinja2-3.1.2         | 99 KB     | : 100% 1.0/1 [00:00<00:00, 20.28it/s]\n",
+            "decorator-4.4.2      | 11 KB     | : 100% 1.0/1 [00:00<00:00, 21.57it/s]\n",
+            "joblib-1.2.0         | 205 KB    | : 100% 1.0/1 [00:00<00:00, 15.04it/s]\n",
+            "pytz-2022.4          | 232 KB    | : 100% 1.0/1 [00:00<00:00, 10.21it/s]\n",
+            "python-louvain-0.15  | 13 KB     | : 100% 1.0/1 [00:00<00:00,  3.34it/s]\n",
+            "googledrivedownloade | 7 KB      | : 100% 1.0/1 [00:00<00:00,  3.33it/s]\n",
+            "threadpoolctl-3.1.0  | 18 KB     | : 100% 1.0/1 [00:00<00:00, 29.40it/s]\n",
+            "markupsafe-2.1.1     | 22 KB     | : 100% 1.0/1 [00:00<00:00, 28.62it/s]\n",
+            "pandas-1.2.3         | 11.8 MB   | : 100% 1.0/1 [00:02<00:00,  2.08s/it]               \n",
+            "networkx-2.5.1       | 1.2 MB    | : 100% 1.0/1 [00:01<00:00,  1.39s/it]\n",
+            "setuptools-59.8.0    | 1.0 MB    | : 100% 1.0/1 [00:00<00:00,  4.25it/s]\n",
+            "Preparing transaction: / \b\b- \b\b\\ \b\bdone\n",
+            "Verifying transaction: / \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\bdone\n",
+            "Executing transaction: / \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n",
+            "Retrieving notices: ...working... done\n"
+          ]
+        }
+      ],
+      "source": [
+        "!conda install -c rusty1s pytorch-geometric=1.7.2"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ppxv6Mdkalbc"
+      },
+      "source": [
+        "### Install Diffusers"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "mgQA_XN-XGY2",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "85392615-b6a4-4052-9d2a-79604be62c94"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "/content\n",
+            "Cloning into 'diffusers'...\n",
+            "remote: Enumerating objects: 9298, done.\u001b[K\n",
+            "remote: Counting objects: 100% (40/40), done.\u001b[K\n",
+            "remote: Compressing objects: 100% (23/23), done.\u001b[K\n",
+            "remote: Total 9298 (delta 17), reused 23 (delta 11), pack-reused 9258\u001b[K\n",
+            "Receiving objects: 100% (9298/9298), 7.38 MiB | 5.28 MiB/s, done.\n",
+            "Resolving deltas: 100% (6168/6168), done.\n",
+            "  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
+            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
+            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m757.0/757.0 kB\u001b[0m \u001b[31m52.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.5/163.5 kB\u001b[0m \u001b[31m21.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.8/40.8 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m596.3/596.3 kB\u001b[0m \u001b[31m51.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h  Building wheel for diffusers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+            "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m432.7/432.7 kB\u001b[0m \u001b[31m36.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.3/5.3 MB\u001b[0m \u001b[31m90.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m35.3/35.3 MB\u001b[0m \u001b[31m39.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.1/115.1 kB\u001b[0m \u001b[31m16.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m948.0/948.0 kB\u001b[0m \u001b[31m63.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.2/212.2 kB\u001b[0m \u001b[31m21.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.8/95.8 kB\u001b[0m \u001b[31m12.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m140.8/140.8 kB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m104.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m148.0/148.0 kB\u001b[0m \u001b[31m20.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m231.3/231.3 kB\u001b[0m \u001b[31m30.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.8/58.8 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+            "\u001b[0m"
+          ]
+        }
+      ],
+      "source": [
+        "%cd /content\n",
+        "\n",
+        "# install latest HF diffusers (will update to the release once added)\n",
+        "!git clone https://github.com/huggingface/diffusers.git\n",
+        "!pip install -q /content/diffusers\n",
+        "\n",
+        "# dependencies for diffusers\n",
+        "!pip install -q datasets transformers"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "LZO6AJKuJKO8"
+      },
+      "source": [
+        "Check that torch is installed correctly and utilizing the GPU in the colab"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "gZt7BNi1e1PA",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "outputId": "a0e1832c-9c02-49aa-cff8-1339e6cdc889"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "True\n"
+          ]
+        },
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "'1.8.2'"
+            ],
+            "application/vnd.google.colaboratory.intrinsic+json": {
+              "type": "string"
+            }
+          },
+          "metadata": {},
+          "execution_count": 8
+        }
+      ],
+      "source": [
+        "import torch\n",
+        "print(torch.cuda.is_available())\n",
+        "torch.__version__"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "KLE7CqlfJNUO"
+      },
+      "source": [
+        "### Install Chemistry-specific Dependencies\n",
+        "\n",
+        "Install RDKit, a tool for working with and visualizing chemsitry in python (you use this to visualize the generate models later)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "0CPv_NvehRz3",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "6ee0ae4e-4511-4816-de29-22b1c21d49bc"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+            "Collecting rdkit\n",
+            "  Downloading rdkit-2022.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.8 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.8/36.8 MB\u001b[0m \u001b[31m34.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: Pillow in /usr/local/lib/python3.7/site-packages (from rdkit) (9.2.0)\n",
+            "Requirement already satisfied: numpy in /usr/local/lib/python3.7/site-packages (from rdkit) (1.21.6)\n",
+            "Installing collected packages: rdkit\n",
+            "Successfully installed rdkit-2022.3.5\n",
+            "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+            "\u001b[0m"
+          ]
+        }
+      ],
+      "source": [
+        "!pip install rdkit"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "88GaDbDPxJ5I"
+      },
+      "source": [
+        "### Get viewer from nglview\n",
+        "\n",
+        "The model you will use outputs a position matrix tensor. This pytorch geometric data object will have many features (positions, known features, edge features -- all tensors).\n",
+        "The data we give to the model will also have a rdmol object (which can extract features to geometric if needed).\n",
+        "The rdmol in this object is a source of ground truth for the generated molecules.\n",
+        "\n",
+        "You will use one rendering function from nglviewer later!\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "jcl8GCS2mz6t",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 1000
+        },
+        "outputId": "99b5cc40-67bb-4d8e-faa0-47d7cb33e98f"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+            "Collecting nglview\n",
+            "  Downloading nglview-3.0.3.tar.gz (5.7 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.7/5.7 MB\u001b[0m \u001b[31m91.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
+            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
+            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+            "Requirement already satisfied: numpy in /usr/local/lib/python3.7/site-packages (from nglview) (1.21.6)\n",
+            "Collecting jupyterlab-widgets\n",
+            "  Downloading jupyterlab_widgets-3.0.3-py3-none-any.whl (384 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m384.1/384.1 kB\u001b[0m \u001b[31m40.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting ipywidgets>=7\n",
+            "  Downloading ipywidgets-8.0.2-py3-none-any.whl (134 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.4/134.4 kB\u001b[0m \u001b[31m21.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting widgetsnbextension~=4.0\n",
+            "  Downloading widgetsnbextension-4.0.3-py3-none-any.whl (2.0 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m84.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting ipython>=6.1.0\n",
+            "  Downloading ipython-7.34.0-py3-none-any.whl (793 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m793.8/793.8 kB\u001b[0m \u001b[31m60.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting ipykernel>=4.5.1\n",
+            "  Downloading ipykernel-6.16.0-py3-none-any.whl (138 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.4/138.4 kB\u001b[0m \u001b[31m20.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting traitlets>=4.3.1\n",
+            "  Downloading traitlets-5.4.0-py3-none-any.whl (107 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m107.1/107.1 kB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/site-packages (from ipykernel>=4.5.1->ipywidgets>=7->nglview) (21.3)\n",
+            "Collecting pyzmq>=17\n",
+            "  Downloading pyzmq-24.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.1 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m68.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting matplotlib-inline>=0.1\n",
+            "  Downloading matplotlib_inline-0.1.6-py3-none-any.whl (9.4 kB)\n",
+            "Collecting tornado>=6.1\n",
+            "  Downloading tornado-6.2-cp37-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (423 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m424.0/424.0 kB\u001b[0m \u001b[31m41.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting nest-asyncio\n",
+            "  Downloading nest_asyncio-1.5.6-py3-none-any.whl (5.2 kB)\n",
+            "Collecting debugpy>=1.0\n",
+            "  Downloading debugpy-1.6.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.8 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m83.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting psutil\n",
+            "  Downloading psutil-5.9.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (281 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.3/281.3 kB\u001b[0m \u001b[31m33.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting jupyter-client>=6.1.12\n",
+            "  Downloading jupyter_client-7.4.2-py3-none-any.whl (132 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.2/132.2 kB\u001b[0m \u001b[31m19.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting pickleshare\n",
+            "  Downloading pickleshare-0.7.5-py2.py3-none-any.whl (6.9 kB)\n",
+            "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets>=7->nglview) (59.8.0)\n",
+            "Collecting backcall\n",
+            "  Downloading backcall-0.2.0-py2.py3-none-any.whl (11 kB)\n",
+            "Collecting pexpect>4.3\n",
+            "  Downloading pexpect-4.8.0-py2.py3-none-any.whl (59 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.0/59.0 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting pygments\n",
+            "  Downloading Pygments-2.13.0-py3-none-any.whl (1.1 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m70.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting jedi>=0.16\n",
+            "  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m83.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0\n",
+            "  Downloading prompt_toolkit-3.0.31-py3-none-any.whl (382 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m382.3/382.3 kB\u001b[0m \u001b[31m40.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: decorator in /usr/local/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets>=7->nglview) (4.4.2)\n",
+            "Collecting parso<0.9.0,>=0.8.0\n",
+            "  Downloading parso-0.8.3-py2.py3-none-any.whl (100 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m100.8/100.8 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.7/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets>=7->nglview) (2.8.2)\n",
+            "Collecting entrypoints\n",
+            "  Downloading entrypoints-0.4-py3-none-any.whl (5.3 kB)\n",
+            "Collecting jupyter-core>=4.9.2\n",
+            "  Downloading jupyter_core-4.11.1-py3-none-any.whl (88 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.4/88.4 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting ptyprocess>=0.5\n",
+            "  Downloading ptyprocess-0.7.0-py2.py3-none-any.whl (13 kB)\n",
+            "Collecting wcwidth\n",
+            "  Downloading wcwidth-0.2.5-py2.py3-none-any.whl (30 kB)\n",
+            "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/site-packages (from packaging->ipykernel>=4.5.1->ipywidgets>=7->nglview) (3.0.9)\n",
+            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil>=2.8.2->jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets>=7->nglview) (1.16.0)\n",
+            "Building wheels for collected packages: nglview\n",
+            "  Building wheel for nglview (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+            "  Created wheel for nglview: filename=nglview-3.0.3-py3-none-any.whl size=8057538 sha256=b7e1071bb91822e48515bf27f4e6b197c6e85e06b90912b3439edc8be1e29514\n",
+            "  Stored in directory: /root/.cache/pip/wheels/01/0c/49/c6f79d8edba8fe89752bf20de2d99040bfa57db0548975c5d5\n",
+            "Successfully built nglview\n",
+            "Installing collected packages: wcwidth, ptyprocess, pickleshare, backcall, widgetsnbextension, traitlets, tornado, pyzmq, pygments, psutil, prompt-toolkit, pexpect, parso, nest-asyncio, jupyterlab-widgets, entrypoints, debugpy, matplotlib-inline, jupyter-core, jedi, jupyter-client, ipython, ipykernel, ipywidgets, nglview\n",
+            "Successfully installed backcall-0.2.0 debugpy-1.6.3 entrypoints-0.4 ipykernel-6.16.0 ipython-7.34.0 ipywidgets-8.0.2 jedi-0.18.1 jupyter-client-7.4.2 jupyter-core-4.11.1 jupyterlab-widgets-3.0.3 matplotlib-inline-0.1.6 nest-asyncio-1.5.6 nglview-3.0.3 parso-0.8.3 pexpect-4.8.0 pickleshare-0.7.5 prompt-toolkit-3.0.31 psutil-5.9.2 ptyprocess-0.7.0 pygments-2.13.0 pyzmq-24.0.1 tornado-6.2 traitlets-5.4.0 wcwidth-0.2.5 widgetsnbextension-4.0.3\n",
+            "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+            "\u001b[0m"
+          ]
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/vnd.colab-display-data+json": {
+              "pip_warning": {
+                "packages": [
+                  "pexpect",
+                  "pickleshare",
+                  "wcwidth"
+                ]
+              }
+            }
+          },
+          "metadata": {}
+        }
+      ],
+      "source": [
+        "!pip install nglview"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Create a diffusion model"
+      ],
+      "metadata": {
+        "id": "8t8_e_uVLdKB"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Model class(es)"
+      ],
+      "metadata": {
+        "id": "G0rMncVtNSqU"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Imports"
+      ],
+      "metadata": {
+        "id": "L5FEXz5oXkzt"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Model adapted from GeoDiff https://github.com/MinkaiXu/GeoDiff\n",
+        "# Model inspired by https://github.com/DeepGraphLearning/torchdrug/tree/master/torchdrug/models\n",
+        "from dataclasses import dataclass\n",
+        "from typing import Callable, Tuple, Union\n",
+        "\n",
+        "import numpy as np\n",
+        "import torch\n",
+        "import torch.nn.functional as F\n",
+        "from torch import Tensor, nn\n",
+        "from torch.nn import Embedding, Linear, Module, ModuleList, Sequential\n",
+        "\n",
+        "from torch_geometric.nn import MessagePassing, radius, radius_graph\n",
+        "from torch_geometric.typing import Adj, OptPairTensor, OptTensor, Size\n",
+        "from torch_geometric.utils import dense_to_sparse, to_dense_adj\n",
+        "from torch_scatter import scatter_add\n",
+        "from torch_sparse import SparseTensor, coalesce\n",
+        "\n",
+        "from diffusers.configuration_utils import ConfigMixin, register_to_config\n",
+        "from diffusers.modeling_utils import ModelMixin\n",
+        "from diffusers.utils import BaseOutput\n"
+      ],
+      "metadata": {
+        "id": "-3-P4w5sXkRU"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Helper classes"
+      ],
+      "metadata": {
+        "id": "EzJQXPN_XrMX"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "@dataclass\n",
+        "class MoleculeGNNOutput(BaseOutput):\n",
+        "    \"\"\"\n",
+        "    Args:\n",
+        "        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):\n",
+        "            Hidden states output. Output of last layer of model.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    sample: torch.Tensor\n",
+        "\n",
+        "\n",
+        "class MultiLayerPerceptron(nn.Module):\n",
+        "    \"\"\"\n",
+        "    Multi-layer Perceptron. Note there is no activation or dropout in the last layer.\n",
+        "    Args:\n",
+        "        input_dim (int): input dimension\n",
+        "        hidden_dim (list of int): hidden dimensions\n",
+        "        activation (str or function, optional): activation function\n",
+        "        dropout (float, optional): dropout rate\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(self, input_dim, hidden_dims, activation=\"relu\", dropout=0):\n",
+        "        super(MultiLayerPerceptron, self).__init__()\n",
+        "\n",
+        "        self.dims = [input_dim] + hidden_dims\n",
+        "        if isinstance(activation, str):\n",
+        "            self.activation = getattr(F, activation)\n",
+        "        else:\n",
+        "            print(f\"Warning, activation passed {activation} is not string and ignored\")\n",
+        "            self.activation = None\n",
+        "        if dropout > 0:\n",
+        "            self.dropout = nn.Dropout(dropout)\n",
+        "        else:\n",
+        "            self.dropout = None\n",
+        "\n",
+        "        self.layers = nn.ModuleList()\n",
+        "        for i in range(len(self.dims) - 1):\n",
+        "            self.layers.append(nn.Linear(self.dims[i], self.dims[i + 1]))\n",
+        "\n",
+        "    def forward(self, x):\n",
+        "        \"\"\"\"\"\"\n",
+        "        for i, layer in enumerate(self.layers):\n",
+        "            x = layer(x)\n",
+        "            if i < len(self.layers) - 1:\n",
+        "                if self.activation:\n",
+        "                    x = self.activation(x)\n",
+        "                if self.dropout:\n",
+        "                    x = self.dropout(x)\n",
+        "        return x\n",
+        "\n",
+        "\n",
+        "class ShiftedSoftplus(torch.nn.Module):\n",
+        "    def __init__(self):\n",
+        "        super(ShiftedSoftplus, self).__init__()\n",
+        "        self.shift = torch.log(torch.tensor(2.0)).item()\n",
+        "\n",
+        "    def forward(self, x):\n",
+        "        return F.softplus(x) - self.shift\n",
+        "\n",
+        "\n",
+        "class CFConv(MessagePassing):\n",
+        "    def __init__(self, in_channels, out_channels, num_filters, mlp, cutoff, smooth):\n",
+        "        super(CFConv, self).__init__(aggr=\"add\")\n",
+        "        self.lin1 = Linear(in_channels, num_filters, bias=False)\n",
+        "        self.lin2 = Linear(num_filters, out_channels)\n",
+        "        self.nn = mlp\n",
+        "        self.cutoff = cutoff\n",
+        "        self.smooth = smooth\n",
+        "\n",
+        "        self.reset_parameters()\n",
+        "\n",
+        "    def reset_parameters(self):\n",
+        "        torch.nn.init.xavier_uniform_(self.lin1.weight)\n",
+        "        torch.nn.init.xavier_uniform_(self.lin2.weight)\n",
+        "        self.lin2.bias.data.fill_(0)\n",
+        "\n",
+        "    def forward(self, x, edge_index, edge_length, edge_attr):\n",
+        "        if self.smooth:\n",
+        "            C = 0.5 * (torch.cos(edge_length * np.pi / self.cutoff) + 1.0)\n",
+        "            C = C * (edge_length <= self.cutoff) * (edge_length >= 0.0)  # Modification: cutoff\n",
+        "        else:\n",
+        "            C = (edge_length <= self.cutoff).float()\n",
+        "        W = self.nn(edge_attr) * C.view(-1, 1)\n",
+        "\n",
+        "        x = self.lin1(x)\n",
+        "        x = self.propagate(edge_index, x=x, W=W)\n",
+        "        x = self.lin2(x)\n",
+        "        return x\n",
+        "\n",
+        "    def message(self, x_j: torch.Tensor, W) -> torch.Tensor:\n",
+        "        return x_j * W\n",
+        "\n",
+        "\n",
+        "class InteractionBlock(torch.nn.Module):\n",
+        "    def __init__(self, hidden_channels, num_gaussians, num_filters, cutoff, smooth):\n",
+        "        super(InteractionBlock, self).__init__()\n",
+        "        mlp = Sequential(\n",
+        "            Linear(num_gaussians, num_filters),\n",
+        "            ShiftedSoftplus(),\n",
+        "            Linear(num_filters, num_filters),\n",
+        "        )\n",
+        "        self.conv = CFConv(hidden_channels, hidden_channels, num_filters, mlp, cutoff, smooth)\n",
+        "        self.act = ShiftedSoftplus()\n",
+        "        self.lin = Linear(hidden_channels, hidden_channels)\n",
+        "\n",
+        "    def forward(self, x, edge_index, edge_length, edge_attr):\n",
+        "        x = self.conv(x, edge_index, edge_length, edge_attr)\n",
+        "        x = self.act(x)\n",
+        "        x = self.lin(x)\n",
+        "        return x\n",
+        "\n",
+        "\n",
+        "class SchNetEncoder(Module):\n",
+        "    def __init__(\n",
+        "        self, hidden_channels=128, num_filters=128, num_interactions=6, edge_channels=100, cutoff=10.0, smooth=False\n",
+        "    ):\n",
+        "        super().__init__()\n",
+        "\n",
+        "        self.hidden_channels = hidden_channels\n",
+        "        self.num_filters = num_filters\n",
+        "        self.num_interactions = num_interactions\n",
+        "        self.cutoff = cutoff\n",
+        "\n",
+        "        self.embedding = Embedding(100, hidden_channels, max_norm=10.0)\n",
+        "\n",
+        "        self.interactions = ModuleList()\n",
+        "        for _ in range(num_interactions):\n",
+        "            block = InteractionBlock(hidden_channels, edge_channels, num_filters, cutoff, smooth)\n",
+        "            self.interactions.append(block)\n",
+        "\n",
+        "    def forward(self, z, edge_index, edge_length, edge_attr, embed_node=True):\n",
+        "        if embed_node:\n",
+        "            assert z.dim() == 1 and z.dtype == torch.long\n",
+        "            h = self.embedding(z)\n",
+        "        else:\n",
+        "            h = z\n",
+        "        for interaction in self.interactions:\n",
+        "            h = h + interaction(h, edge_index, edge_length, edge_attr)\n",
+        "\n",
+        "        return h\n",
+        "\n",
+        "\n",
+        "class GINEConv(MessagePassing):\n",
+        "    \"\"\"\n",
+        "    Custom class of the graph isomorphism operator from the \"How Powerful are Graph Neural Networks?\n",
+        "    https://arxiv.org/abs/1810.00826 paper. Note that this implementation has the added option of a custom activation.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(self, mlp: Callable, eps: float = 0.0, train_eps: bool = False, activation=\"softplus\", **kwargs):\n",
+        "        super(GINEConv, self).__init__(aggr=\"add\", **kwargs)\n",
+        "        self.nn = mlp\n",
+        "        self.initial_eps = eps\n",
+        "\n",
+        "        if isinstance(activation, str):\n",
+        "            self.activation = getattr(F, activation)\n",
+        "        else:\n",
+        "            self.activation = None\n",
+        "\n",
+        "        if train_eps:\n",
+        "            self.eps = torch.nn.Parameter(torch.Tensor([eps]))\n",
+        "        else:\n",
+        "            self.register_buffer(\"eps\", torch.Tensor([eps]))\n",
+        "\n",
+        "    def forward(\n",
+        "        self, x: Union[Tensor, OptPairTensor], edge_index: Adj, edge_attr: OptTensor = None, size: Size = None\n",
+        "    ) -> torch.Tensor:\n",
+        "        \"\"\"\"\"\"\n",
+        "        if isinstance(x, torch.Tensor):\n",
+        "            x: OptPairTensor = (x, x)\n",
+        "\n",
+        "        # Node and edge feature dimensionalites need to match.\n",
+        "        if isinstance(edge_index, torch.Tensor):\n",
+        "            assert edge_attr is not None\n",
+        "            assert x[0].size(-1) == edge_attr.size(-1)\n",
+        "        elif isinstance(edge_index, SparseTensor):\n",
+        "            assert x[0].size(-1) == edge_index.size(-1)\n",
+        "\n",
+        "        # propagate_type: (x: OptPairTensor, edge_attr: OptTensor)\n",
+        "        out = self.propagate(edge_index, x=x, edge_attr=edge_attr, size=size)\n",
+        "\n",
+        "        x_r = x[1]\n",
+        "        if x_r is not None:\n",
+        "            out += (1 + self.eps) * x_r\n",
+        "\n",
+        "        return self.nn(out)\n",
+        "\n",
+        "    def message(self, x_j: torch.Tensor, edge_attr: torch.Tensor) -> torch.Tensor:\n",
+        "        if self.activation:\n",
+        "            return self.activation(x_j + edge_attr)\n",
+        "        else:\n",
+        "            return x_j + edge_attr\n",
+        "\n",
+        "    def __repr__(self):\n",
+        "        return \"{}(nn={})\".format(self.__class__.__name__, self.nn)\n",
+        "\n",
+        "\n",
+        "class GINEncoder(torch.nn.Module):\n",
+        "    def __init__(self, hidden_dim, num_convs=3, activation=\"relu\", short_cut=True, concat_hidden=False):\n",
+        "        super().__init__()\n",
+        "\n",
+        "        self.hidden_dim = hidden_dim\n",
+        "        self.num_convs = num_convs\n",
+        "        self.short_cut = short_cut\n",
+        "        self.concat_hidden = concat_hidden\n",
+        "        self.node_emb = nn.Embedding(100, hidden_dim)\n",
+        "\n",
+        "        if isinstance(activation, str):\n",
+        "            self.activation = getattr(F, activation)\n",
+        "        else:\n",
+        "            self.activation = None\n",
+        "\n",
+        "        self.convs = nn.ModuleList()\n",
+        "        for i in range(self.num_convs):\n",
+        "            self.convs.append(\n",
+        "                GINEConv(\n",
+        "                    MultiLayerPerceptron(hidden_dim, [hidden_dim, hidden_dim], activation=activation),\n",
+        "                    activation=activation,\n",
+        "                )\n",
+        "            )\n",
+        "\n",
+        "    def forward(self, z, edge_index, edge_attr):\n",
+        "        \"\"\"\n",
+        "        Input:\n",
+        "            data: (torch_geometric.data.Data): batched graph edge_index: bond indices of the original graph (num_node,\n",
+        "            hidden) edge_attr: edge feature tensor with shape (num_edge, hidden)\n",
+        "        Output:\n",
+        "            node_feature: graph feature\n",
+        "        \"\"\"\n",
+        "\n",
+        "        node_attr = self.node_emb(z)  # (num_node, hidden)\n",
+        "\n",
+        "        hiddens = []\n",
+        "        conv_input = node_attr  # (num_node, hidden)\n",
+        "\n",
+        "        for conv_idx, conv in enumerate(self.convs):\n",
+        "            hidden = conv(conv_input, edge_index, edge_attr)\n",
+        "            if conv_idx < len(self.convs) - 1 and self.activation is not None:\n",
+        "                hidden = self.activation(hidden)\n",
+        "            assert hidden.shape == conv_input.shape\n",
+        "            if self.short_cut and hidden.shape == conv_input.shape:\n",
+        "                hidden += conv_input\n",
+        "\n",
+        "            hiddens.append(hidden)\n",
+        "            conv_input = hidden\n",
+        "\n",
+        "        if self.concat_hidden:\n",
+        "            node_feature = torch.cat(hiddens, dim=-1)\n",
+        "        else:\n",
+        "            node_feature = hiddens[-1]\n",
+        "\n",
+        "        return node_feature\n",
+        "\n",
+        "\n",
+        "class MLPEdgeEncoder(Module):\n",
+        "    def __init__(self, hidden_dim=100, activation=\"relu\"):\n",
+        "        super().__init__()\n",
+        "        self.hidden_dim = hidden_dim\n",
+        "        self.bond_emb = Embedding(100, embedding_dim=self.hidden_dim)\n",
+        "        self.mlp = MultiLayerPerceptron(1, [self.hidden_dim, self.hidden_dim], activation=activation)\n",
+        "\n",
+        "    @property\n",
+        "    def out_channels(self):\n",
+        "        return self.hidden_dim\n",
+        "\n",
+        "    def forward(self, edge_length, edge_type):\n",
+        "        \"\"\"\n",
+        "        Input:\n",
+        "            edge_length: The length of edges, shape=(E, 1). edge_type: The type pf edges, shape=(E,)\n",
+        "        Returns:\n",
+        "            edge_attr: The representation of edges. (E, 2 * num_gaussians)\n",
+        "        \"\"\"\n",
+        "        d_emb = self.mlp(edge_length)  # (num_edge, hidden_dim)\n",
+        "        edge_attr = self.bond_emb(edge_type)  # (num_edge, hidden_dim)\n",
+        "        return d_emb * edge_attr  # (num_edge, hidden)\n",
+        "\n",
+        "\n",
+        "def assemble_atom_pair_feature(node_attr, edge_index, edge_attr):\n",
+        "    h_row, h_col = node_attr[edge_index[0]], node_attr[edge_index[1]]\n",
+        "    h_pair = torch.cat([h_row * h_col, edge_attr], dim=-1)  # (E, 2H)\n",
+        "    return h_pair\n",
+        "\n",
+        "\n",
+        "def _extend_graph_order(num_nodes, edge_index, edge_type, order=3):\n",
+        "    \"\"\"\n",
+        "    Args:\n",
+        "        num_nodes:  Number of atoms.\n",
+        "        edge_index: Bond indices of the original graph.\n",
+        "        edge_type:  Bond types of the original graph.\n",
+        "        order:  Extension order.\n",
+        "    Returns:\n",
+        "        new_edge_index: Extended edge indices. new_edge_type: Extended edge types.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def binarize(x):\n",
+        "        return torch.where(x > 0, torch.ones_like(x), torch.zeros_like(x))\n",
+        "\n",
+        "    def get_higher_order_adj_matrix(adj, order):\n",
+        "        \"\"\"\n",
+        "        Args:\n",
+        "            adj:        (N, N)\n",
+        "            type_mat:   (N, N)\n",
+        "        Returns:\n",
+        "            Following attributes will be updated:\n",
+        "              - edge_index\n",
+        "              - edge_type\n",
+        "            Following attributes will be added to the data object:\n",
+        "              - bond_edge_index: Original edge_index.\n",
+        "        \"\"\"\n",
+        "        adj_mats = [\n",
+        "            torch.eye(adj.size(0), dtype=torch.long, device=adj.device),\n",
+        "            binarize(adj + torch.eye(adj.size(0), dtype=torch.long, device=adj.device)),\n",
+        "        ]\n",
+        "\n",
+        "        for i in range(2, order + 1):\n",
+        "            adj_mats.append(binarize(adj_mats[i - 1] @ adj_mats[1]))\n",
+        "        order_mat = torch.zeros_like(adj)\n",
+        "\n",
+        "        for i in range(1, order + 1):\n",
+        "            order_mat += (adj_mats[i] - adj_mats[i - 1]) * i\n",
+        "\n",
+        "        return order_mat\n",
+        "\n",
+        "    num_types = 22\n",
+        "    # given from len(BOND_TYPES), where BOND_TYPES = {t: i for i, t in enumerate(BT.names.values())}\n",
+        "    # from rdkit.Chem.rdchem import BondType as BT\n",
+        "    N = num_nodes\n",
+        "    adj = to_dense_adj(edge_index).squeeze(0)\n",
+        "    adj_order = get_higher_order_adj_matrix(adj, order)  # (N, N)\n",
+        "\n",
+        "    type_mat = to_dense_adj(edge_index, edge_attr=edge_type).squeeze(0)  # (N, N)\n",
+        "    type_highorder = torch.where(adj_order > 1, num_types + adj_order - 1, torch.zeros_like(adj_order))\n",
+        "    assert (type_mat * type_highorder == 0).all()\n",
+        "    type_new = type_mat + type_highorder\n",
+        "\n",
+        "    new_edge_index, new_edge_type = dense_to_sparse(type_new)\n",
+        "    _, edge_order = dense_to_sparse(adj_order)\n",
+        "\n",
+        "    # data.bond_edge_index = data.edge_index  # Save original edges\n",
+        "    new_edge_index, new_edge_type = coalesce(new_edge_index, new_edge_type.long(), N, N)  # modify data\n",
+        "\n",
+        "    return new_edge_index, new_edge_type\n",
+        "\n",
+        "\n",
+        "def _extend_to_radius_graph(pos, edge_index, edge_type, cutoff, batch, unspecified_type_number=0, is_sidechain=None):\n",
+        "    assert edge_type.dim() == 1\n",
+        "    N = pos.size(0)\n",
+        "\n",
+        "    bgraph_adj = torch.sparse.LongTensor(edge_index, edge_type, torch.Size([N, N]))\n",
+        "\n",
+        "    if is_sidechain is None:\n",
+        "        rgraph_edge_index = radius_graph(pos, r=cutoff, batch=batch)  # (2, E_r)\n",
+        "    else:\n",
+        "        # fetch sidechain and its batch index\n",
+        "        is_sidechain = is_sidechain.bool()\n",
+        "        dummy_index = torch.arange(pos.size(0), device=pos.device)\n",
+        "        sidechain_pos = pos[is_sidechain]\n",
+        "        sidechain_index = dummy_index[is_sidechain]\n",
+        "        sidechain_batch = batch[is_sidechain]\n",
+        "\n",
+        "        assign_index = radius(x=pos, y=sidechain_pos, r=cutoff, batch_x=batch, batch_y=sidechain_batch)\n",
+        "        r_edge_index_x = assign_index[1]\n",
+        "        r_edge_index_y = assign_index[0]\n",
+        "        r_edge_index_y = sidechain_index[r_edge_index_y]\n",
+        "\n",
+        "        rgraph_edge_index1 = torch.stack((r_edge_index_x, r_edge_index_y))  # (2, E)\n",
+        "        rgraph_edge_index2 = torch.stack((r_edge_index_y, r_edge_index_x))  # (2, E)\n",
+        "        rgraph_edge_index = torch.cat((rgraph_edge_index1, rgraph_edge_index2), dim=-1)  # (2, 2E)\n",
+        "        # delete self loop\n",
+        "        rgraph_edge_index = rgraph_edge_index[:, (rgraph_edge_index[0] != rgraph_edge_index[1])]\n",
+        "\n",
+        "    rgraph_adj = torch.sparse.LongTensor(\n",
+        "        rgraph_edge_index,\n",
+        "        torch.ones(rgraph_edge_index.size(1)).long().to(pos.device) * unspecified_type_number,\n",
+        "        torch.Size([N, N]),\n",
+        "    )\n",
+        "\n",
+        "    composed_adj = (bgraph_adj + rgraph_adj).coalesce()  # Sparse (N, N, T)\n",
+        "\n",
+        "    new_edge_index = composed_adj.indices()\n",
+        "    new_edge_type = composed_adj.values().long()\n",
+        "\n",
+        "    return new_edge_index, new_edge_type\n",
+        "\n",
+        "\n",
+        "def extend_graph_order_radius(\n",
+        "    num_nodes,\n",
+        "    pos,\n",
+        "    edge_index,\n",
+        "    edge_type,\n",
+        "    batch,\n",
+        "    order=3,\n",
+        "    cutoff=10.0,\n",
+        "    extend_order=True,\n",
+        "    extend_radius=True,\n",
+        "    is_sidechain=None,\n",
+        "):\n",
+        "    if extend_order:\n",
+        "        edge_index, edge_type = _extend_graph_order(\n",
+        "            num_nodes=num_nodes, edge_index=edge_index, edge_type=edge_type, order=order\n",
+        "        )\n",
+        "\n",
+        "    if extend_radius:\n",
+        "        edge_index, edge_type = _extend_to_radius_graph(\n",
+        "            pos=pos, edge_index=edge_index, edge_type=edge_type, cutoff=cutoff, batch=batch, is_sidechain=is_sidechain\n",
+        "        )\n",
+        "\n",
+        "    return edge_index, edge_type\n",
+        "\n",
+        "\n",
+        "def get_distance(pos, edge_index):\n",
+        "    return (pos[edge_index[0]] - pos[edge_index[1]]).norm(dim=-1)\n",
+        "\n",
+        "\n",
+        "def graph_field_network(score_d, pos, edge_index, edge_length):\n",
+        "    \"\"\"\n",
+        "    Transformation to make the epsilon predicted from the diffusion model roto-translational equivariant. See equations\n",
+        "    5-7 of the GeoDiff Paper https://arxiv.org/pdf/2203.02923.pdf\n",
+        "    \"\"\"\n",
+        "    N = pos.size(0)\n",
+        "    dd_dr = (1.0 / edge_length) * (pos[edge_index[0]] - pos[edge_index[1]])  # (E, 3)\n",
+        "    score_pos = scatter_add(dd_dr * score_d, edge_index[0], dim=0, dim_size=N) + scatter_add(\n",
+        "        -dd_dr * score_d, edge_index[1], dim=0, dim_size=N\n",
+        "    )  # (N, 3)\n",
+        "    return score_pos\n",
+        "\n",
+        "\n",
+        "def clip_norm(vec, limit, p=2):\n",
+        "    norm = torch.norm(vec, dim=-1, p=2, keepdim=True)\n",
+        "    denom = torch.where(norm > limit, limit / norm, torch.ones_like(norm))\n",
+        "    return vec * denom\n",
+        "\n",
+        "\n",
+        "def is_local_edge(edge_type):\n",
+        "    return edge_type > 0\n"
+      ],
+      "metadata": {
+        "id": "oR1Y56QiLY90"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Main model class!"
+      ],
+      "metadata": {
+        "id": "QWrHJFcYXyUB"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "class MoleculeGNN(ModelMixin, ConfigMixin):\n",
+        "    @register_to_config\n",
+        "    def __init__(\n",
+        "        self,\n",
+        "        hidden_dim=128,\n",
+        "        num_convs=6,\n",
+        "        num_convs_local=4,\n",
+        "        cutoff=10.0,\n",
+        "        mlp_act=\"relu\",\n",
+        "        edge_order=3,\n",
+        "        edge_encoder=\"mlp\",\n",
+        "        smooth_conv=True,\n",
+        "    ):\n",
+        "        super().__init__()\n",
+        "        self.cutoff = cutoff\n",
+        "        self.edge_encoder = edge_encoder\n",
+        "        self.edge_order = edge_order\n",
+        "\n",
+        "        \"\"\"\n",
+        "        edge_encoder: Takes both edge type and edge length as input and outputs a vector [Note]: node embedding is done\n",
+        "        in SchNetEncoder\n",
+        "        \"\"\"\n",
+        "        self.edge_encoder_global = MLPEdgeEncoder(hidden_dim, mlp_act)  # get_edge_encoder(config)\n",
+        "        self.edge_encoder_local = MLPEdgeEncoder(hidden_dim, mlp_act)  # get_edge_encoder(config)\n",
+        "\n",
+        "        \"\"\"\n",
+        "        The graph neural network that extracts node-wise features.\n",
+        "        \"\"\"\n",
+        "        self.encoder_global = SchNetEncoder(\n",
+        "            hidden_channels=hidden_dim,\n",
+        "            num_filters=hidden_dim,\n",
+        "            num_interactions=num_convs,\n",
+        "            edge_channels=self.edge_encoder_global.out_channels,\n",
+        "            cutoff=cutoff,\n",
+        "            smooth=smooth_conv,\n",
+        "        )\n",
+        "        self.encoder_local = GINEncoder(\n",
+        "            hidden_dim=hidden_dim,\n",
+        "            num_convs=num_convs_local,\n",
+        "        )\n",
+        "\n",
+        "        \"\"\"\n",
+        "        `output_mlp` takes a mixture of two nodewise features and edge features as input and outputs\n",
+        "            gradients w.r.t. edge_length (out_dim = 1).\n",
+        "        \"\"\"\n",
+        "        self.grad_global_dist_mlp = MultiLayerPerceptron(\n",
+        "            2 * hidden_dim, [hidden_dim, hidden_dim // 2, 1], activation=mlp_act\n",
+        "        )\n",
+        "\n",
+        "        self.grad_local_dist_mlp = MultiLayerPerceptron(\n",
+        "            2 * hidden_dim, [hidden_dim, hidden_dim // 2, 1], activation=mlp_act\n",
+        "        )\n",
+        "\n",
+        "        \"\"\"\n",
+        "        Incorporate parameters together\n",
+        "        \"\"\"\n",
+        "        self.model_global = nn.ModuleList([self.edge_encoder_global, self.encoder_global, self.grad_global_dist_mlp])\n",
+        "        self.model_local = nn.ModuleList([self.edge_encoder_local, self.encoder_local, self.grad_local_dist_mlp])\n",
+        "\n",
+        "    def _forward(\n",
+        "        self,\n",
+        "        atom_type,\n",
+        "        pos,\n",
+        "        bond_index,\n",
+        "        bond_type,\n",
+        "        batch,\n",
+        "        time_step,  # NOTE, model trained without timestep performed best\n",
+        "        edge_index=None,\n",
+        "        edge_type=None,\n",
+        "        edge_length=None,\n",
+        "        return_edges=False,\n",
+        "        extend_order=True,\n",
+        "        extend_radius=True,\n",
+        "        is_sidechain=None,\n",
+        "    ):\n",
+        "        \"\"\"\n",
+        "        Args:\n",
+        "            atom_type:  Types of atoms, (N, ).\n",
+        "            bond_index: Indices of bonds (not extended, not radius-graph), (2, E).\n",
+        "            bond_type:  Bond types, (E, ).\n",
+        "            batch:      Node index to graph index, (N, ).\n",
+        "        \"\"\"\n",
+        "        N = atom_type.size(0)\n",
+        "        if edge_index is None or edge_type is None or edge_length is None:\n",
+        "            edge_index, edge_type = extend_graph_order_radius(\n",
+        "                num_nodes=N,\n",
+        "                pos=pos,\n",
+        "                edge_index=bond_index,\n",
+        "                edge_type=bond_type,\n",
+        "                batch=batch,\n",
+        "                order=self.edge_order,\n",
+        "                cutoff=self.cutoff,\n",
+        "                extend_order=extend_order,\n",
+        "                extend_radius=extend_radius,\n",
+        "                is_sidechain=is_sidechain,\n",
+        "            )\n",
+        "            edge_length = get_distance(pos, edge_index).unsqueeze(-1)  # (E, 1)\n",
+        "        local_edge_mask = is_local_edge(edge_type)  # (E, )\n",
+        "\n",
+        "        # with the parameterization of NCSNv2\n",
+        "        # DDPM loss implicit handle the noise variance scale conditioning\n",
+        "        sigma_edge = torch.ones(size=(edge_index.size(1), 1), device=pos.device)  # (E, 1)\n",
+        "\n",
+        "        # Encoding global\n",
+        "        edge_attr_global = self.edge_encoder_global(edge_length=edge_length, edge_type=edge_type)  # Embed edges\n",
+        "\n",
+        "        # Global\n",
+        "        node_attr_global = self.encoder_global(\n",
+        "            z=atom_type,\n",
+        "            edge_index=edge_index,\n",
+        "            edge_length=edge_length,\n",
+        "            edge_attr=edge_attr_global,\n",
+        "        )\n",
+        "        # Assemble pairwise features\n",
+        "        h_pair_global = assemble_atom_pair_feature(\n",
+        "            node_attr=node_attr_global,\n",
+        "            edge_index=edge_index,\n",
+        "            edge_attr=edge_attr_global,\n",
+        "        )  # (E_global, 2H)\n",
+        "        # Invariant features of edges (radius graph, global)\n",
+        "        edge_inv_global = self.grad_global_dist_mlp(h_pair_global) * (1.0 / sigma_edge)  # (E_global, 1)\n",
+        "\n",
+        "        # Encoding local\n",
+        "        edge_attr_local = self.edge_encoder_global(edge_length=edge_length, edge_type=edge_type)  # Embed edges\n",
+        "        # edge_attr += temb_edge\n",
+        "\n",
+        "        # Local\n",
+        "        node_attr_local = self.encoder_local(\n",
+        "            z=atom_type,\n",
+        "            edge_index=edge_index[:, local_edge_mask],\n",
+        "            edge_attr=edge_attr_local[local_edge_mask],\n",
+        "        )\n",
+        "        # Assemble pairwise features\n",
+        "        h_pair_local = assemble_atom_pair_feature(\n",
+        "            node_attr=node_attr_local,\n",
+        "            edge_index=edge_index[:, local_edge_mask],\n",
+        "            edge_attr=edge_attr_local[local_edge_mask],\n",
+        "        )  # (E_local, 2H)\n",
+        "\n",
+        "        # Invariant features of edges (bond graph, local)\n",
+        "        if isinstance(sigma_edge, torch.Tensor):\n",
+        "            edge_inv_local = self.grad_local_dist_mlp(h_pair_local) * (\n",
+        "                1.0 / sigma_edge[local_edge_mask]\n",
+        "            )  # (E_local, 1)\n",
+        "        else:\n",
+        "            edge_inv_local = self.grad_local_dist_mlp(h_pair_local) * (1.0 / sigma_edge)  # (E_local, 1)\n",
+        "\n",
+        "        if return_edges:\n",
+        "            return edge_inv_global, edge_inv_local, edge_index, edge_type, edge_length, local_edge_mask\n",
+        "        else:\n",
+        "            return edge_inv_global, edge_inv_local\n",
+        "\n",
+        "    def forward(\n",
+        "        self,\n",
+        "        sample,\n",
+        "        timestep: Union[torch.Tensor, float, int],\n",
+        "        return_dict: bool = True,\n",
+        "        sigma=1.0,\n",
+        "        global_start_sigma=0.5,\n",
+        "        w_global=1.0,\n",
+        "        extend_order=False,\n",
+        "        extend_radius=True,\n",
+        "        clip_local=None,\n",
+        "        clip_global=1000.0,\n",
+        "    ) -> Union[MoleculeGNNOutput, Tuple]:\n",
+        "        r\"\"\"\n",
+        "        Args:\n",
+        "            sample: packed torch geometric object\n",
+        "            timestep (`torch.Tensor` or `float` or `int): TODO verify type and shape (batch) timesteps\n",
+        "            return_dict (`bool`, *optional*, defaults to `True`):\n",
+        "                Whether or not to return a [`~models.molecule_gnn.MoleculeGNNOutput`] instead of a plain tuple.\n",
+        "        Returns:\n",
+        "            [`~models.molecule_gnn.MoleculeGNNOutput`] or `tuple`: [`~models.molecule_gnn.MoleculeGNNOutput`] if\n",
+        "            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.\n",
+        "        \"\"\"\n",
+        "\n",
+        "        # unpack sample\n",
+        "        atom_type = sample.atom_type\n",
+        "        bond_index = sample.edge_index\n",
+        "        bond_type = sample.edge_type\n",
+        "        num_graphs = sample.num_graphs\n",
+        "        pos = sample.pos\n",
+        "\n",
+        "        timesteps = torch.full(size=(num_graphs,), fill_value=timestep, dtype=torch.long, device=pos.device)\n",
+        "\n",
+        "        edge_inv_global, edge_inv_local, edge_index, edge_type, edge_length, local_edge_mask = self._forward(\n",
+        "            atom_type=atom_type,\n",
+        "            pos=sample.pos,\n",
+        "            bond_index=bond_index,\n",
+        "            bond_type=bond_type,\n",
+        "            batch=sample.batch,\n",
+        "            time_step=timesteps,\n",
+        "            return_edges=True,\n",
+        "            extend_order=extend_order,\n",
+        "            extend_radius=extend_radius,\n",
+        "        )  # (E_global, 1), (E_local, 1)\n",
+        "\n",
+        "        # Important equation in the paper for equivariant features - eqns 5-7 of GeoDiff\n",
+        "        node_eq_local = graph_field_network(\n",
+        "            edge_inv_local, pos, edge_index[:, local_edge_mask], edge_length[local_edge_mask]\n",
+        "        )\n",
+        "        if clip_local is not None:\n",
+        "            node_eq_local = clip_norm(node_eq_local, limit=clip_local)\n",
+        "\n",
+        "        # Global\n",
+        "        if sigma < global_start_sigma:\n",
+        "            edge_inv_global = edge_inv_global * (1 - local_edge_mask.view(-1, 1).float())\n",
+        "            node_eq_global = graph_field_network(edge_inv_global, pos, edge_index, edge_length)\n",
+        "            node_eq_global = clip_norm(node_eq_global, limit=clip_global)\n",
+        "        else:\n",
+        "            node_eq_global = 0\n",
+        "\n",
+        "        # Sum\n",
+        "        eps_pos = node_eq_local + node_eq_global * w_global\n",
+        "\n",
+        "        if not return_dict:\n",
+        "            return (-eps_pos,)\n",
+        "\n",
+        "        return MoleculeGNNOutput(sample=torch.Tensor(-eps_pos).to(pos.device))"
+      ],
+      "metadata": {
+        "id": "MCeZA1qQXzoK"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "CCIrPYSJj9wd"
+      },
+      "source": [
+        "### Load pretrained model"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "YdrAr6Ch--Ab"
+      },
+      "source": [
+        "#### Load a model\n",
+        "The model used is a design an\n",
+        "equivariant convolutional layer, named graph field network (GFN).\n",
+        "\n",
+        "The warning about `betas` and `alphas` can be ignored, those were moved to the scheduler."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "DyCo0nsqjbml",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 172,
+          "referenced_widgets": [
+            "d90f304e9560472eacfbdd11e46765eb",
+            "1c6246f15b654f4daa11c9bcf997b78c",
+            "c2321b3bff6f490ca12040a20308f555",
+            "b7feb522161f4cf4b7cc7c1a078ff12d",
+            "e2d368556e494ae7ae4e2e992af2cd4f",
+            "bbef741e76ec41b7ab7187b487a383df",
+            "561f742d418d4721b0670cc8dd62e22c",
+            "872915dd1bb84f538c44e26badabafdd",
+            "d022575f1fa2446d891650897f187b4d",
+            "fdc393f3468c432aa0ada05e238a5436",
+            "2c9362906e4b40189f16d14aa9a348da",
+            "6010fc8daa7a44d5aec4b830ec2ebaa1",
+            "7e0bb1b8d65249d3974200686b193be2",
+            "ba98aa6d6a884e4ab8bbb5dfb5e4cf7a",
+            "6526646be5ed415c84d1245b040e629b",
+            "24d31fc3576e43dd9f8301d2ef3a37ab",
+            "2918bfaadc8d4b1a9832522c40dfefb8",
+            "a4bfdca35cc54dae8812720f1b276a08",
+            "e4901541199b45c6a18824627692fc39",
+            "f915cf874246446595206221e900b2fe",
+            "a9e388f22a9742aaaf538e22575c9433",
+            "42f6c3db29d7484ba6b4f73590abd2f4"
+          ]
+        },
+        "outputId": "d6bce9d5-c51e-43a4-e680-e1e81bdfaf45"
+      },
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "Downloading:   0%|          | 0.00/3.27M [00:00<?, ?B/s]"
+            ],
+            "application/vnd.jupyter.widget-view+json": {
+              "version_major": 2,
+              "version_minor": 0,
+              "model_id": "d90f304e9560472eacfbdd11e46765eb"
+            }
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "Downloading:   0%|          | 0.00/401 [00:00<?, ?B/s]"
+            ],
+            "application/vnd.jupyter.widget-view+json": {
+              "version_major": 2,
+              "version_minor": 0,
+              "model_id": "6010fc8daa7a44d5aec4b830ec2ebaa1"
+            }
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "The config attributes {'type': 'diffusion', 'network': 'dualenc', 'beta_schedule': 'sigmoid', 'beta_start': 1e-07, 'beta_end': 0.002, 'num_diffusion_timesteps': 5000} were passed to MoleculeGNN, but are not expected and will be ignored. Please verify your config.json configuration file.\n",
+            "Some weights of the model checkpoint at fusing/gfn-molecule-gen-drugs were not used when initializing MoleculeGNN: ['betas', 'alphas']\n",
+            "- This IS expected if you are initializing MoleculeGNN from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing MoleculeGNN from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+          ]
+        }
+      ],
+      "source": [
+        "DEVICE = 'cuda'\n",
+        "model = MoleculeGNN.from_pretrained(\"fusing/gfn-molecule-gen-drugs\").to(DEVICE)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "The warnings above are because the pre-trained model was uploaded before cleaning the code!"
+      ],
+      "metadata": {
+        "id": "HdclRaqoUWUD"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "PlOkPySoJ1m9"
+      },
+      "source": [
+        "#### Create scheduler\n",
+        "Note, other schedulers are used in the paper for slightly improved performance over DDPM."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "nNHnIk9CkAb2"
+      },
+      "outputs": [],
+      "source": [
+        "from diffusers import DDPMScheduler"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "RnDJdDBztjFF"
+      },
+      "outputs": [],
+      "source": [
+        "num_timesteps = 1000\n",
+        "scheduler = DDPMScheduler(num_train_timesteps=num_timesteps,beta_schedule=\"sigmoid\",beta_start=1e-7, beta_end=2e-3, clip_sample=False)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "1vh3fpSAflkL"
+      },
+      "source": [
+        "### Get a dataset"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "B6qzaGjVKFVk"
+      },
+      "source": [
+        "Grab a google tool so we can upload our data directly. Note you need to download the data from ***this [file](https://huggingface.co/datasets/fusing/geodiff-example-data/blob/main/data/molecules.pkl)***\n",
+        "\n",
+        "(direct downloading from the hub does not yet work for this datatype)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "jbLl3EJdgj3x"
+      },
+      "outputs": [],
+      "source": [
+        "# from google.colab import files"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "E591lVuTgxPE"
+      },
+      "outputs": [],
+      "source": [
+        "# uploaded = files.upload()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "KUNxfK3ln98Q"
+      },
+      "source": [
+        "Load the dataset with torch."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "7L4iOShTpcQX",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "7f2dcd29-493e-44de-98d1-3ad50f109a4a"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "--2022-10-12 18:32:19--  https://huggingface.co/datasets/fusing/geodiff-example-data/resolve/main/data/molecules.pkl\n",
+            "Resolving huggingface.co (huggingface.co)... 44.195.102.200, 52.5.54.249, 54.210.225.113, ...\n",
+            "Connecting to huggingface.co (huggingface.co)|44.195.102.200|:443... connected.\n",
+            "HTTP request sent, awaiting response... 200 OK\n",
+            "Length: 127774 (125K) [application/octet-stream]\n",
+            "Saving to: ‘molecules.pkl’\n",
+            "\n",
+            "molecules.pkl       100%[===================>] 124.78K   180KB/s    in 0.7s    \n",
+            "\n",
+            "2022-10-12 18:32:20 (180 KB/s) - ‘molecules.pkl’ saved [127774/127774]\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "import torch\n",
+        "import numpy as np\n",
+        "\n",
+        "!wget https://huggingface.co/datasets/fusing/geodiff-example-data/resolve/main/data/molecules.pkl\n",
+        "dataset = torch.load('/content/molecules.pkl')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "QZcmy1EvKQRk"
+      },
+      "source": [
+        "Print out one entry of the dataset, it contains molecular formulas, atom types, positions, and more."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "JVjz6iH_H6Eh",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "898cb0cf-a0b3-411b-fd4c-bea1fbfd17fe"
+      },
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "Data(atom_type=[51], bond_edge_index=[2, 108], edge_index=[2, 598], edge_order=[598], edge_type=[598], idx=[1], is_bond=[598], num_nodes_per_graph=[1], num_pos_ref=[1], nx=, pos=[51, 3], pos_ref=[255, 3], rdmol=<rdkit.Chem.rdchem.Mol object at 0x7f707d2cb130>, smiles=\"CC1CCCN(C(=O)C2CCN(S(=O)(=O)c3cccc4nonc34)CC2)C1\")"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 20
+        }
+      ],
+      "source": [
+        "dataset[0]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Run the diffusion process"
+      ],
+      "metadata": {
+        "id": "vHNiZAUxNgoy"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "jZ1KZrxKqENg"
+      },
+      "source": [
+        "#### Helper Functions"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "s240tYueqKKf"
+      },
+      "outputs": [],
+      "source": [
+        "from torch_geometric.data import Data, Batch\n",
+        "from torch_scatter import scatter_add, scatter_mean\n",
+        "from tqdm import tqdm\n",
+        "import copy\n",
+        "import os\n",
+        "\n",
+        "def repeat_data(data: Data, num_repeat) -> Batch:\n",
+        "    datas = [copy.deepcopy(data) for i in range(num_repeat)]\n",
+        "    return Batch.from_data_list(datas)\n",
+        "\n",
+        "def repeat_batch(batch: Batch, num_repeat) -> Batch:\n",
+        "    datas = batch.to_data_list()\n",
+        "    new_data = []\n",
+        "    for i in range(num_repeat):\n",
+        "        new_data += copy.deepcopy(datas)\n",
+        "    return Batch.from_data_list(new_data)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "AMnQTk0eqT7Z"
+      },
+      "source": [
+        "#### Constants"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "WYGkzqgzrHmF"
+      },
+      "outputs": [],
+      "source": [
+        "num_samples = 1 # solutions per molecule\n",
+        "num_molecules = 3\n",
+        "\n",
+        "DEVICE = 'cuda'\n",
+        "sampling_type = 'ddpm_noisy' #'' # paper also uses \"generalize\" and \"ld\"\n",
+        "# constants for inference\n",
+        "w_global = 0.5 #0,.3 for qm9\n",
+        "global_start_sigma = 0.5\n",
+        "eta = 1.0\n",
+        "clip_local = None\n",
+        "clip_pos = None\n",
+        "\n",
+        "# constands for data handling\n",
+        "save_traj = False\n",
+        "save_data = False\n",
+        "output_dir = '/content/'"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-xD5bJ3SqM7t"
+      },
+      "source": [
+        "#### Generate samples!\n",
+        "Note that the 3d representation of a molecule is referred to as the **conformation**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "x9xuLUNg26z1",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "236d2a60-09ed-4c4d-97c1-6e3c0f2d26c4"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:4: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
+            "  after removing the cwd from sys.path.\n",
+            "100%|██████████| 5/5 [00:55<00:00, 11.06s/it]\n"
+          ]
+        }
+      ],
+      "source": [
+        "results = []\n",
+        "\n",
+        "# define sigmas\n",
+        "sigmas = torch.tensor(1.0 - scheduler.alphas_cumprod).sqrt() / torch.tensor(scheduler.alphas_cumprod).sqrt()\n",
+        "sigmas = sigmas.to(DEVICE)\n",
+        "\n",
+        "for count, data in enumerate(tqdm(dataset)):\n",
+        "    num_samples = max(data.pos_ref.size(0) // data.num_nodes, 1)\n",
+        "\n",
+        "    data_input = data.clone()\n",
+        "    data_input['pos_ref'] = None\n",
+        "    batch = repeat_data(data_input, num_samples).to(DEVICE)\n",
+        "\n",
+        "    # initial configuration\n",
+        "    pos_init = torch.randn(batch.num_nodes, 3).to(DEVICE)\n",
+        "\n",
+        "    # for logging animation of denoising\n",
+        "    pos_traj = []\n",
+        "    with torch.no_grad():\n",
+        "\n",
+        "        # scale initial sample\n",
+        "        pos = pos_init * sigmas[-1]\n",
+        "        for t in scheduler.timesteps:\n",
+        "            batch.pos = pos\n",
+        "\n",
+        "            # generate geometry with model, then filter it\n",
+        "            epsilon = model.forward(batch, t, sigma=sigmas[t], return_dict=False)[0]\n",
+        "\n",
+        "            # Update\n",
+        "            reconstructed_pos = scheduler.step(epsilon, t, pos)[\"prev_sample\"].to(DEVICE)\n",
+        "\n",
+        "            pos = reconstructed_pos\n",
+        "\n",
+        "            if torch.isnan(pos).any():\n",
+        "                print(\"NaN detected. Please restart.\")\n",
+        "                raise FloatingPointError()\n",
+        "\n",
+        "            # recenter graph of positions for next iteration\n",
+        "            pos = pos - scatter_mean(pos, batch.batch, dim=0)[batch.batch]\n",
+        "\n",
+        "            # optional clipping\n",
+        "            if clip_pos is not None:\n",
+        "                pos = torch.clamp(pos, min=-clip_pos, max=clip_pos)\n",
+        "            pos_traj.append(pos.clone().cpu())\n",
+        "\n",
+        "    pos_gen = pos.cpu()\n",
+        "    if save_traj:\n",
+        "        pos_gen_traj = pos_traj.cpu()\n",
+        "        data.pos_gen = torch.stack(pos_gen_traj)\n",
+        "    else:\n",
+        "        data.pos_gen = pos_gen\n",
+        "    results.append(data)\n",
+        "\n",
+        "\n",
+        "if save_data:\n",
+        "  save_path = os.path.join(output_dir, 'samples_all.pkl')\n",
+        "\n",
+        "  with open(save_path, 'wb') as f:\n",
+        "      pickle.dump(results, f)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Render the results!"
+      ],
+      "metadata": {
+        "id": "fSApwSaZNndW"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "d47Zxo2OKdgZ"
+      },
+      "source": [
+        "This function allows us to render 3d in colab."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "e9Cd0kCAv9b8"
+      },
+      "outputs": [],
+      "source": [
+        "from google.colab import output\n",
+        "output.enable_custom_widget_manager()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Helper functions"
+      ],
+      "metadata": {
+        "id": "RjaVuR15NqzF"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "28rBYa9NKhlz"
+      },
+      "source": [
+        "Here is a helper function for copying the generated tensors into a format used by RDKit & NGLViewer."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "LKdKdwxcyTQ6"
+      },
+      "outputs": [],
+      "source": [
+        "from copy import deepcopy\n",
+        "def set_rdmol_positions(rdkit_mol, pos):\n",
+        "    \"\"\"\n",
+        "    Args:\n",
+        "        rdkit_mol:  An `rdkit.Chem.rdchem.Mol` object.\n",
+        "        pos: (N_atoms, 3)\n",
+        "    \"\"\"\n",
+        "    mol = deepcopy(rdkit_mol)\n",
+        "    set_rdmol_positions_(mol, pos)\n",
+        "    return mol\n",
+        "\n",
+        "def set_rdmol_positions_(mol, pos):\n",
+        "    \"\"\"\n",
+        "    Args:\n",
+        "        rdkit_mol:  An `rdkit.Chem.rdchem.Mol` object.\n",
+        "        pos: (N_atoms, 3)\n",
+        "    \"\"\"\n",
+        "    for i in range(pos.shape[0]):\n",
+        "        mol.GetConformer(0).SetAtomPosition(i, pos[i].tolist())\n",
+        "    return mol\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "NuE10hcpKmzK"
+      },
+      "source": [
+        "Process the generated data to make it easy to view."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "KieVE1vc0_Vs",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "6faa185d-b1bc-47e8-be18-30d1e557e7c8"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "collect 5 generated molecules in `mols`\n"
+          ]
+        }
+      ],
+      "source": [
+        "# the model can generate multiple conformations per 2d geometry\n",
+        "num_gen = results[0]['pos_gen'].shape[0]\n",
+        "\n",
+        "# init storage objects\n",
+        "mols_gen = []\n",
+        "mols_orig = []\n",
+        "for to_process in results:\n",
+        "\n",
+        "    # store the reference 3d position\n",
+        "    to_process['pos_ref'] = to_process['pos_ref'].reshape(-1, to_process['rdmol'].GetNumAtoms(), 3)\n",
+        "\n",
+        "    # store the generated 3d position\n",
+        "    to_process['pos_gen'] = to_process['pos_gen'].reshape(-1, to_process['rdmol'].GetNumAtoms(), 3)\n",
+        "\n",
+        "    # copy data to new object\n",
+        "    new_mol = set_rdmol_positions(to_process.rdmol, to_process['pos_gen'][0])\n",
+        "\n",
+        "    # append results\n",
+        "    mols_gen.append(new_mol)\n",
+        "    mols_orig.append(to_process.rdmol)\n",
+        "\n",
+        "print(f\"collect {len(mols_gen)} generated molecules in `mols`\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "tin89JwMKp4v"
+      },
+      "source": [
+        "Import tools to visualize the 2d chemical diagram of the molecule."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "yqV6gllSZn38"
+      },
+      "outputs": [],
+      "source": [
+        "from rdkit.Chem import AllChem\n",
+        "from rdkit import Chem\n",
+        "from rdkit.Chem.Draw import rdMolDraw2D as MD2\n",
+        "from IPython.display import SVG, display"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "TFNKmGddVoOk"
+      },
+      "source": [
+        "Select molecule to visualize"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "KzuwLlrrVaGc"
+      },
+      "outputs": [],
+      "source": [
+        "idx = 0\n",
+        "assert idx < len(results), \"selected molecule that was not generated\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Viewing"
+      ],
+      "metadata": {
+        "id": "hkb8w0_SNtU8"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "I3R4QBQeKttN"
+      },
+      "source": [
+        "This 2D rendering is the equivalent of the **input to the model**!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "gkQRWjraaKex",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 321
+        },
+        "outputId": "9c3d1a91-a51d-475d-9e34-2be2459abc47"
+      },
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.SVG object>"
+            ],
+            "image/svg+xml": "<svg baseProfile=\"full\" height=\"300px\" version=\"1.1\" viewBox=\"0 0 450 300\" width=\"450px\" xml:space=\"preserve\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:rdkit=\"http://www.rdkit.org/xml\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n<!-- END OF HEADER -->\n<rect height=\"300.0\" style=\"opacity:1.0;fill:#FFFFFF;stroke:none\" width=\"450.0\" x=\"0.0\" y=\"0.0\"> </rect>\n<path class=\"bond-0 atom-0 atom-1\" d=\"M 20.5,147.6 L 57.8,136.7\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-1 atom-1 atom-2\" d=\"M 57.8,136.7 L 67.1,98.9\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-2 atom-2 atom-3\" d=\"M 67.1,98.9 L 104.4,88.1\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-3 atom-3 atom-4\" d=\"M 104.4,88.1 L 132.5,115.0\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-4 atom-4 atom-5\" d=\"M 132.5,115.0 L 128.7,130.5\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-4 atom-4 atom-5\" d=\"M 128.7,130.5 L 124.9,146.0\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-5 atom-5 atom-6\" d=\"M 128.7,158.0 L 140.0,168.8\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-5 atom-5 atom-6\" d=\"M 140.0,168.8 L 151.3,179.7\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-6 atom-6 atom-7\" d=\"M 155.1,180.6 L 151.3,196.1\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-6 atom-6 atom-7\" d=\"M 151.3,196.1 L 147.5,211.5\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-6 atom-6 atom-7\" d=\"M 147.5,178.8 L 143.7,194.2\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-6 atom-6 atom-7\" d=\"M 143.7,194.2 L 139.9,209.7\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-7 atom-6 atom-8\" d=\"M 151.3,179.7 L 188.7,168.8\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-8 atom-8 atom-9\" d=\"M 188.7,168.8 L 216.7,195.8\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-9 atom-9 atom-10\" d=\"M 216.7,195.8 L 254.1,184.9\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-10 atom-10 atom-11\" d=\"M 254.1,184.9 L 257.9,169.4\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-10 atom-10 atom-11\" d=\"M 257.9,169.4 L 261.7,153.9\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-11 atom-11 atom-12\" d=\"M 268.8,145.5 L 282.4,141.6\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-11 atom-11 atom-12\" d=\"M 282.4,141.6 L 295.9,137.7\" style=\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-12 atom-12 atom-13\" d=\"M 295.0,130.6 L 291.6,118.8\" style=\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-12 atom-12 atom-13\" d=\"M 291.6,118.8 L 288.2,107.0\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-12 atom-12 atom-13\" d=\"M 302.5,128.4 L 299.1,116.6\" style=\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-12 atom-12 atom-13\" d=\"M 299.1,116.6 L 295.6,104.9\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-13 atom-12 atom-14\" d=\"M 306.5,142.3 L 309.9,154.0\" style=\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-13 atom-12 atom-14\" d=\"M 309.9,154.0 L 313.3,165.7\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-13 atom-12 atom-14\" d=\"M 299.0,144.4 L 302.4,156.1\" style=\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-13 atom-12 atom-14\" d=\"M 302.4,156.1 L 305.8,167.9\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-14 atom-12 atom-15\" d=\"M 305.5,134.9 L 321.8,130.1\" style=\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-14 atom-12 atom-15\" d=\"M 321.8,130.1 L 338.1,125.4\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-15 atom-15 atom-16\" d=\"M 338.1,125.4 L 347.4,87.6\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-15 atom-15 atom-16\" d=\"M 347.0,121.6 L 353.5,95.2\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-16 atom-16 atom-17\" d=\"M 347.4,87.6 L 384.7,76.8\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-17 atom-17 atom-18\" d=\"M 384.7,76.8 L 412.8,103.7\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-17 atom-17 atom-18\" d=\"M 383.5,86.4 L 403.2,105.3\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-18 atom-18 atom-19\" d=\"M 412.8,103.7 L 403.5,141.5\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-19 atom-19 atom-20\" d=\"M 403.5,141.5 L 412.1,154.2\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-19 atom-19 atom-20\" d=\"M 412.1,154.2 L 420.8,166.9\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-19 atom-19 atom-20\" d=\"M 399.7,149.7 L 405.7,158.6\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-19 atom-19 atom-20\" d=\"M 405.7,158.6 L 411.7,167.4\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-20 atom-20 atom-21\" d=\"M 420.1,180.5 L 413.5,189.0\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-20 atom-20 atom-21\" d=\"M 413.5,189.0 L 406.8,197.5\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-21 atom-21 atom-22\" d=\"M 395.2,202.1 L 382.8,197.7\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-21 atom-21 atom-22\" d=\"M 382.8,197.7 L 370.4,193.2\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-22 atom-22 atom-23\" d=\"M 365.1,184.4 L 365.6,168.4\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-22 atom-22 atom-23\" d=\"M 365.6,168.4 L 366.2,152.3\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-22 atom-22 atom-23\" d=\"M 373.1,179.9 L 373.4,168.6\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-22 atom-22 atom-23\" d=\"M 373.4,168.6 L 373.8,157.4\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-23 atom-11 atom-24\" d=\"M 257.9,141.9 L 246.6,131.1\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-23 atom-11 atom-24\" d=\"M 246.6,131.1 L 235.3,120.2\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-24 atom-24 atom-25\" d=\"M 235.3,120.2 L 197.9,131.1\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-25 atom-5 atom-26\" d=\"M 117.8,154.4 L 101.8,159.0\" style=\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-25 atom-5 atom-26\" d=\"M 101.8,159.0 L 85.9,163.6\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-26 atom-26 atom-1\" d=\"M 85.9,163.6 L 57.8,136.7\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-27 atom-25 atom-8\" d=\"M 197.9,131.1 L 188.7,168.8\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-28 atom-23 atom-15\" d=\"M 366.2,152.3 L 338.1,125.4\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"bond-29 atom-23 atom-19\" d=\"M 366.2,152.3 L 403.5,141.5\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n<path class=\"atom-5\" d=\"M 120.8 147.3 L 124.4 153.1 Q 124.8 153.7, 125.3 154.7 Q 125.9 155.8, 126.0 155.8 L 126.0 147.3 L 127.4 147.3 L 127.4 158.3 L 125.9 158.3 L 122.0 151.9 Q 121.6 151.2, 121.1 150.3 Q 120.6 149.4, 120.5 149.2 L 120.5 158.3 L 119.1 158.3 L 119.1 147.3 L 120.8 147.3 \" fill=\"#0000FF\"/>\n<path class=\"atom-7\" d=\"M 137.0 217.5 Q 137.0 214.9, 138.3 213.4 Q 139.6 211.9, 142.0 211.9 Q 144.5 211.9, 145.8 213.4 Q 147.1 214.9, 147.1 217.5 Q 147.1 220.2, 145.8 221.7 Q 144.4 223.2, 142.0 223.2 Q 139.6 223.2, 138.3 221.7 Q 137.0 220.2, 137.0 217.5 M 142.0 222.0 Q 143.7 222.0, 144.6 220.8 Q 145.5 219.7, 145.5 217.5 Q 145.5 215.3, 144.6 214.2 Q 143.7 213.1, 142.0 213.1 Q 140.4 213.1, 139.4 214.2 Q 138.5 215.3, 138.5 217.5 Q 138.5 219.7, 139.4 220.8 Q 140.4 222.0, 142.0 222.0 \" fill=\"#FF0000\"/>\n<path class=\"atom-11\" d=\"M 260.9 141.6 L 264.5 147.5 Q 264.9 148.0, 265.5 149.1 Q 266.1 150.1, 266.1 150.2 L 266.1 141.6 L 267.5 141.6 L 267.5 152.6 L 266.0 152.6 L 262.2 146.3 Q 261.7 145.5, 261.2 144.7 Q 260.8 143.8, 260.6 143.5 L 260.6 152.6 L 259.2 152.6 L 259.2 141.6 L 260.9 141.6 \" fill=\"#0000FF\"/>\n<path class=\"atom-12\" d=\"M 297.6 140.1 Q 297.7 140.1, 298.2 140.3 Q 298.8 140.5, 299.3 140.7 Q 299.9 140.8, 300.5 140.8 Q 301.5 140.8, 302.1 140.3 Q 302.7 139.8, 302.7 138.9 Q 302.7 138.3, 302.4 137.9 Q 302.1 137.6, 301.6 137.3 Q 301.2 137.1, 300.4 136.9 Q 299.4 136.6, 298.8 136.3 Q 298.2 136.1, 297.8 135.5 Q 297.4 134.9, 297.4 133.9 Q 297.4 132.5, 298.4 131.6 Q 299.3 130.8, 301.2 130.8 Q 302.4 130.8, 303.9 131.4 L 303.5 132.6 Q 302.2 132.0, 301.2 132.0 Q 300.1 132.0, 299.6 132.5 Q 299.0 132.9, 299.0 133.7 Q 299.0 134.3, 299.3 134.6 Q 299.6 135.0, 300.0 135.2 Q 300.5 135.4, 301.2 135.6 Q 302.2 135.9, 302.8 136.3 Q 303.4 136.6, 303.8 137.2 Q 304.3 137.8, 304.3 138.9 Q 304.3 140.4, 303.2 141.3 Q 302.2 142.1, 300.5 142.1 Q 299.5 142.1, 298.8 141.8 Q 298.1 141.6, 297.2 141.3 L 297.6 140.1 \" fill=\"#CCCC00\"/>\n<path class=\"atom-13\" d=\"M 284.8 99.0 Q 284.8 96.3, 286.1 94.8 Q 287.4 93.4, 289.9 93.4 Q 292.3 93.4, 293.6 94.8 Q 294.9 96.3, 294.9 99.0 Q 294.9 101.6, 293.6 103.2 Q 292.3 104.7, 289.9 104.7 Q 287.4 104.7, 286.1 103.2 Q 284.8 101.6, 284.8 99.0 M 289.9 103.4 Q 291.5 103.4, 292.5 102.3 Q 293.4 101.2, 293.4 99.0 Q 293.4 96.8, 292.5 95.7 Q 291.5 94.6, 289.9 94.6 Q 288.2 94.6, 287.3 95.7 Q 286.4 96.8, 286.4 99.0 Q 286.4 101.2, 287.3 102.3 Q 288.2 103.4, 289.9 103.4 \" fill=\"#FF0000\"/>\n<path class=\"atom-14\" d=\"M 306.5 173.7 Q 306.5 171.0, 307.8 169.5 Q 309.1 168.1, 311.6 168.1 Q 314.0 168.1, 315.3 169.5 Q 316.6 171.0, 316.6 173.7 Q 316.6 176.3, 315.3 177.9 Q 314.0 179.4, 311.6 179.4 Q 309.1 179.4, 307.8 177.9 Q 306.5 176.4, 306.5 173.7 M 311.6 178.1 Q 313.3 178.1, 314.2 177.0 Q 315.1 175.9, 315.1 173.7 Q 315.1 171.5, 314.2 170.4 Q 313.3 169.3, 311.6 169.3 Q 309.9 169.3, 309.0 170.4 Q 308.1 171.5, 308.1 173.7 Q 308.1 175.9, 309.0 177.0 Q 309.9 178.1, 311.6 178.1 \" fill=\"#FF0000\"/>\n<path class=\"atom-20\" d=\"M 422.9 168.2 L 426.5 174.0 Q 426.9 174.6, 427.5 175.6 Q 428.1 176.6, 428.1 176.7 L 428.1 168.2 L 429.5 168.2 L 429.5 179.2 L 428.0 179.2 L 424.2 172.8 Q 423.7 172.0, 423.2 171.2 Q 422.8 170.3, 422.6 170.1 L 422.6 179.2 L 421.2 179.2 L 421.2 168.2 L 422.9 168.2 \" fill=\"#0000FF\"/>\n<path class=\"atom-21\" d=\"M 396.5 204.4 Q 396.5 201.8, 397.8 200.3 Q 399.1 198.8, 401.5 198.8 Q 404.0 198.8, 405.3 200.3 Q 406.6 201.8, 406.6 204.4 Q 406.6 207.1, 405.3 208.6 Q 403.9 210.1, 401.5 210.1 Q 399.1 210.1, 397.8 208.6 Q 396.5 207.1, 396.5 204.4 M 401.5 208.9 Q 403.2 208.9, 404.1 207.8 Q 405.0 206.6, 405.0 204.4 Q 405.0 202.3, 404.1 201.2 Q 403.2 200.1, 401.5 200.1 Q 399.8 200.1, 398.9 201.2 Q 398.0 202.2, 398.0 204.4 Q 398.0 206.7, 398.9 207.8 Q 399.8 208.9, 401.5 208.9 \" fill=\"#FF0000\"/>\n<path class=\"atom-22\" d=\"M 362.5 185.7 L 366.1 191.5 Q 366.5 192.1, 367.0 193.2 Q 367.6 194.2, 367.6 194.3 L 367.6 185.7 L 369.1 185.7 L 369.1 196.7 L 367.6 196.7 L 363.7 190.4 Q 363.3 189.6, 362.8 188.7 Q 362.3 187.9, 362.2 187.6 L 362.2 196.7 L 360.8 196.7 L 360.8 185.7 L 362.5 185.7 \" fill=\"#0000FF\"/>\n</svg>"
+          },
+          "metadata": {}
+        }
+      ],
+      "source": [
+        "mc = Chem.MolFromSmiles(dataset[0]['smiles'])\n",
+        "molSize=(450,300)\n",
+        "drawer = MD2.MolDraw2DSVG(molSize[0],molSize[1])\n",
+        "drawer.DrawMolecule(mc)\n",
+        "drawer.FinishDrawing()\n",
+        "svg = drawer.GetDrawingText()\n",
+        "display(SVG(svg.replace('svg:','')))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "z4FDMYMxKw2I"
+      },
+      "source": [
+        "Generate the 3d molecule!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "aT1Bkb8YxJfV",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 17,
+          "referenced_widgets": [
+            "695ab5bbf30a4ab19df1f9f33469f314",
+            "eac6a8dcdc9d4335a2e51031793ead29"
+          ]
+        },
+        "outputId": "b98870ae-049d-4386-b676-166e9526bda2"
+      },
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [],
+            "application/vnd.jupyter.widget-view+json": {
+              "version_major": 2,
+              "version_minor": 0,
+              "model_id": "695ab5bbf30a4ab19df1f9f33469f314"
+            }
+          },
+          "metadata": {
+            "application/vnd.jupyter.widget-view+json": {
+              "colab": {
+                "custom_widget_manager": {
+                  "url": "https://ssl.gstatic.com/colaboratory-static/widgets/colab-cdn-widget-manager/d2e234f7cc04bf79/manager.min.js"
+                }
+              }
+            }
+          }
+        }
+      ],
+      "source": [
+        "from nglview import show_rdkit as show"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "pxtq8I-I18C-",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 337,
+          "referenced_widgets": [
+            "be446195da2b4ff2aec21ec5ff963a54",
+            "c6596896148b4a8a9c57963b67c7782f",
+            "2489b5e5648541fbbdceadb05632a050",
+            "01e0ba4e5da04914b4652b8d58565d7b",
+            "c30e6c2f3e2a44dbbb3d63bd519acaa4",
+            "f31c6e40e9b2466a9064a2669933ecd5",
+            "19308ccac642498ab8b58462e3f1b0bb",
+            "4a081cdc2ec3421ca79dd933b7e2b0c4",
+            "e5c0d75eb5e1447abd560c8f2c6017e1",
+            "5146907ef6764654ad7d598baebc8b58",
+            "144ec959b7604a2cabb5ca46ae5e5379",
+            "abce2a80e6304df3899109c6d6cac199",
+            "65195cb7a4134f4887e9dd19f3676462"
+          ]
+        },
+        "outputId": "72ed63ac-d2ec-4f5c-a0b1-4e7c1840a4e7"
+      },
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "NGLWidget()"
+            ],
+            "application/vnd.jupyter.widget-view+json": {
+              "version_major": 2,
+              "version_minor": 0,
+              "model_id": "be446195da2b4ff2aec21ec5ff963a54"
+            }
+          },
+          "metadata": {
+            "application/vnd.jupyter.widget-view+json": {
+              "colab": {
+                "custom_widget_manager": {
+                  "url": "https://ssl.gstatic.com/colaboratory-static/widgets/colab-cdn-widget-manager/d2e234f7cc04bf79/manager.min.js"
+                }
+              }
+            }
+          }
+        }
+      ],
+      "source": [
+        "# new molecule\n",
+        "show(mols_gen[idx])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "KJr4h2mwXeTo"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "provenance": []
+    },
+    "gpuClass": "standard",
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "widgets": {
+      "application/vnd.jupyter.widget-state+json": {
+        "d90f304e9560472eacfbdd11e46765eb": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_1c6246f15b654f4daa11c9bcf997b78c",
+              "IPY_MODEL_c2321b3bff6f490ca12040a20308f555",
+              "IPY_MODEL_b7feb522161f4cf4b7cc7c1a078ff12d"
+            ],
+            "layout": "IPY_MODEL_e2d368556e494ae7ae4e2e992af2cd4f"
+          }
+        },
+        "1c6246f15b654f4daa11c9bcf997b78c": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_bbef741e76ec41b7ab7187b487a383df",
+            "placeholder": "​",
+            "style": "IPY_MODEL_561f742d418d4721b0670cc8dd62e22c",
+            "value": "Downloading: 100%"
+          }
+        },
+        "c2321b3bff6f490ca12040a20308f555": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_872915dd1bb84f538c44e26badabafdd",
+            "max": 3271865,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_d022575f1fa2446d891650897f187b4d",
+            "value": 3271865
+          }
+        },
+        "b7feb522161f4cf4b7cc7c1a078ff12d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_fdc393f3468c432aa0ada05e238a5436",
+            "placeholder": "​",
+            "style": "IPY_MODEL_2c9362906e4b40189f16d14aa9a348da",
+            "value": " 3.27M/3.27M [00:01&lt;00:00, 3.25MB/s]"
+          }
+        },
+        "e2d368556e494ae7ae4e2e992af2cd4f": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "bbef741e76ec41b7ab7187b487a383df": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "561f742d418d4721b0670cc8dd62e22c": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "872915dd1bb84f538c44e26badabafdd": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "d022575f1fa2446d891650897f187b4d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "fdc393f3468c432aa0ada05e238a5436": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "2c9362906e4b40189f16d14aa9a348da": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "6010fc8daa7a44d5aec4b830ec2ebaa1": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_7e0bb1b8d65249d3974200686b193be2",
+              "IPY_MODEL_ba98aa6d6a884e4ab8bbb5dfb5e4cf7a",
+              "IPY_MODEL_6526646be5ed415c84d1245b040e629b"
+            ],
+            "layout": "IPY_MODEL_24d31fc3576e43dd9f8301d2ef3a37ab"
+          }
+        },
+        "7e0bb1b8d65249d3974200686b193be2": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_2918bfaadc8d4b1a9832522c40dfefb8",
+            "placeholder": "​",
+            "style": "IPY_MODEL_a4bfdca35cc54dae8812720f1b276a08",
+            "value": "Downloading: 100%"
+          }
+        },
+        "ba98aa6d6a884e4ab8bbb5dfb5e4cf7a": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_e4901541199b45c6a18824627692fc39",
+            "max": 401,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_f915cf874246446595206221e900b2fe",
+            "value": 401
+          }
+        },
+        "6526646be5ed415c84d1245b040e629b": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_a9e388f22a9742aaaf538e22575c9433",
+            "placeholder": "​",
+            "style": "IPY_MODEL_42f6c3db29d7484ba6b4f73590abd2f4",
+            "value": " 401/401 [00:00&lt;00:00, 13.5kB/s]"
+          }
+        },
+        "24d31fc3576e43dd9f8301d2ef3a37ab": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "2918bfaadc8d4b1a9832522c40dfefb8": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "a4bfdca35cc54dae8812720f1b276a08": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "e4901541199b45c6a18824627692fc39": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "f915cf874246446595206221e900b2fe": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "a9e388f22a9742aaaf538e22575c9433": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "42f6c3db29d7484ba6b4f73590abd2f4": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "695ab5bbf30a4ab19df1f9f33469f314": {
+          "model_module": "nglview-js-widgets",
+          "model_name": "ColormakerRegistryModel",
+          "model_module_version": "3.0.1",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "nglview-js-widgets",
+            "_model_module_version": "3.0.1",
+            "_model_name": "ColormakerRegistryModel",
+            "_msg_ar": [],
+            "_msg_q": [],
+            "_ready": false,
+            "_view_count": null,
+            "_view_module": "nglview-js-widgets",
+            "_view_module_version": "3.0.1",
+            "_view_name": "ColormakerRegistryView",
+            "layout": "IPY_MODEL_eac6a8dcdc9d4335a2e51031793ead29"
+          }
+        },
+        "eac6a8dcdc9d4335a2e51031793ead29": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "be446195da2b4ff2aec21ec5ff963a54": {
+          "model_module": "nglview-js-widgets",
+          "model_name": "NGLModel",
+          "model_module_version": "3.0.1",
+          "state": {
+            "_camera_orientation": [
+              -15.519693580202304,
+              -14.065056548036177,
+              -23.53197484807691,
+              0,
+              -23.357853515109753,
+              20.94055073042662,
+              2.888695042134944,
+              0,
+              14.352363398292777,
+              18.870825741878015,
+              -20.744689572909344,
+              0,
+              0.2724999189376831,
+              0.6940000057220459,
+              -0.3734999895095825,
+              1
+            ],
+            "_camera_str": "orthographic",
+            "_dom_classes": [],
+            "_gui_theme": null,
+            "_ibtn_fullscreen": "IPY_MODEL_2489b5e5648541fbbdceadb05632a050",
+            "_igui": null,
+            "_iplayer": "IPY_MODEL_01e0ba4e5da04914b4652b8d58565d7b",
+            "_model_module": "nglview-js-widgets",
+            "_model_module_version": "3.0.1",
+            "_model_name": "NGLModel",
+            "_ngl_color_dict": {},
+            "_ngl_coordinate_resource": {},
+            "_ngl_full_stage_parameters": {
+              "impostor": true,
+              "quality": "medium",
+              "workerDefault": true,
+              "sampleLevel": 0,
+              "backgroundColor": "white",
+              "rotateSpeed": 2,
+              "zoomSpeed": 1.2,
+              "panSpeed": 1,
+              "clipNear": 0,
+              "clipFar": 100,
+              "clipDist": 10,
+              "fogNear": 50,
+              "fogFar": 100,
+              "cameraFov": 40,
+              "cameraEyeSep": 0.3,
+              "cameraType": "perspective",
+              "lightColor": 14540253,
+              "lightIntensity": 1,
+              "ambientColor": 14540253,
+              "ambientIntensity": 0.2,
+              "hoverTimeout": 0,
+              "tooltip": true,
+              "mousePreset": "default"
+            },
+            "_ngl_msg_archive": [
+              {
+                "target": "Stage",
+                "type": "call_method",
+                "methodName": "loadFile",
+                "reconstruc_color_scheme": false,
+                "args": [
+                  {
+                    "type": "blob",
+                    "data": "HETATM    1  C1  UNL     1      -0.025   3.128   2.316  1.00  0.00           C  \nHETATM    2  H1  UNL     1       0.183   3.657   2.823  1.00  0.00           H  \nHETATM    3  C2  UNL     1       0.590   3.559   0.963  1.00  0.00           C  \nHETATM    4  C3  UNL     1       0.056   4.479   0.406  1.00  0.00           C  \nHETATM    5  C4  UNL     1      -0.219   4.802  -1.065  1.00  0.00           C  \nHETATM    6  H2  UNL     1       0.686   4.431  -1.575  1.00  0.00           H  \nHETATM    7  H3  UNL     1      -0.524   5.217  -1.274  1.00  0.00           H  \nHETATM    8  C5  UNL     1      -1.284   3.766  -1.342  1.00  0.00           C  \nHETATM    9  N1  UNL     1      -1.073   2.494  -0.580  1.00  0.00           N  \nHETATM   10  C6  UNL     1      -1.909   1.494  -0.964  1.00  0.00           C  \nHETATM   11  O1  UNL     1      -2.487   1.531  -2.092  1.00  0.00           O  \nHETATM   12  C7  UNL     1      -2.232   0.242  -0.130  1.00  0.00           C  \nHETATM   13  C8  UNL     1      -2.161  -1.057  -1.037  1.00  0.00           C  \nHETATM   14  C9  UNL     1      -0.744  -1.111  -1.610  1.00  0.00           C  \nHETATM   15  N2  UNL     1       0.290  -0.917  -0.628  1.00  0.00           N  \nHETATM   16  S1  UNL     1       1.717  -1.597  -0.914  1.00  0.00           S  \nHETATM   17  O2  UNL     1       1.960  -1.671  -2.338  1.00  0.00           O  \nHETATM   18  O3  UNL     1       2.713  -0.968  -0.082  1.00  0.00           O  \nHETATM   19  C10 UNL     1       1.425  -3.170  -0.345  1.00  0.00           C  \nHETATM   20  C11 UNL     1       1.225  -4.400  -1.271  1.00  0.00           C  \nHETATM   21  C12 UNL     1       1.314  -5.913  -0.895  1.00  0.00           C  \nHETATM   22  C13 UNL     1       1.823  -6.229   0.386  1.00  0.00           C  \nHETATM   23  C14 UNL     1       2.031  -5.110   1.365  1.00  0.00           C  \nHETATM   24  N3  UNL     1       1.850  -5.267   2.712  1.00  0.00           N  \nHETATM   25  O4  UNL     1       1.382  -4.029   3.126  1.00  0.00           O  \nHETATM   26  N4  UNL     1       1.300  -3.023   2.154  1.00  0.00           N  \nHETATM   27  C15 UNL     1       1.731  -3.672   1.032  1.00  0.00           C  \nHETATM   28  H4  UNL     1       2.380  -6.874   0.436  1.00  0.00           H  \nHETATM   29  H5  UNL     1       0.704  -6.526  -1.420  1.00  0.00           H  \nHETATM   30  H6  UNL     1       1.144  -4.035  -2.291  1.00  0.00           H  \nHETATM   31  C16 UNL     1       0.044  -0.371   0.685  1.00  0.00           C  \nHETATM   32  C17 UNL     1      -1.352  -0.045   1.077  1.00  0.00           C  \nHETATM   33  H7  UNL     1      -1.395   0.770   1.768  1.00  0.00           H  \nHETATM   34  H8  UNL     1      -1.792  -0.941   1.582  1.00  0.00           H  \nHETATM   35  H9  UNL     1       0.583  -1.035   1.393  1.00  0.00           H  \nHETATM   36  H10 UNL     1       0.664   0.613   0.663  1.00  0.00           H  \nHETATM   37  H11 UNL     1      -0.631  -0.267  -2.335  1.00  0.00           H  \nHETATM   38  H12 UNL     1      -0.571  -2.046  -2.098  1.00  0.00           H  \nHETATM   39  H13 UNL     1      -2.872  -0.992  -1.826  1.00  0.00           H  \nHETATM   40  H14 UNL     1      -2.370  -1.924  -0.444  1.00  0.00           H  \nHETATM   41  H15 UNL     1      -3.258   0.364   0.197  1.00  0.00           H  \nHETATM   42  C18 UNL     1       0.276   2.337  -0.078  1.00  0.00           C  \nHETATM   43  H16 UNL     1       0.514   1.371   0.252  1.00  0.00           H  \nHETATM   44  H17 UNL     1       0.988   2.413  -0.949  1.00  0.00           H  \nHETATM   45  H18 UNL     1      -1.349   3.451  -2.379  1.00  0.00           H  \nHETATM   46  H19 UNL     1      -2.224   4.055  -0.958  1.00  0.00           H  \nHETATM   47  H20 UNL     1       0.793   5.486   0.669  1.00  0.00           H  \nHETATM   48  H21 UNL     1      -0.849   4.974   0.937  1.00  0.00           H  \nHETATM   49  H22 UNL     1       1.667   3.431   1.070  1.00  0.00           H  \nHETATM   50  H23 UNL     1       0.379   2.143   2.689  1.00  0.00           H  \nHETATM   51  H24 UNL     1      -1.094   2.983   2.223  1.00  0.00           H  \nCONECT    1    2    3   50   51\nCONECT    3    4   42   49\nCONECT    4    5   47   48\nCONECT    5    6    7    8\nCONECT    8    9   45   46\nCONECT    9   10   42\nCONECT   10   11   11   12\nCONECT   12   13   32   41\nCONECT   13   14   39   40\nCONECT   14   15   37   38\nCONECT   15   16   31\nCONECT   16   17   17   18   18\nCONECT   16   19\nCONECT   19   20   20   27\nCONECT   20   21   30\nCONECT   21   22   22   29\nCONECT   22   23   28\nCONECT   23   24   24   27\nCONECT   24   25\nCONECT   25   26\nCONECT   26   27   27\nCONECT   31   32   35   36\nCONECT   32   33   34\nCONECT   42   43   44\nEND\n",
+                    "binary": false
+                  }
+                ],
+                "kwargs": {
+                  "defaultRepresentation": true,
+                  "ext": "pdb"
+                }
+              }
+            ],
+            "_ngl_original_stage_parameters": {
+              "impostor": true,
+              "quality": "medium",
+              "workerDefault": true,
+              "sampleLevel": 0,
+              "backgroundColor": "white",
+              "rotateSpeed": 2,
+              "zoomSpeed": 1.2,
+              "panSpeed": 1,
+              "clipNear": 0,
+              "clipFar": 100,
+              "clipDist": 10,
+              "fogNear": 50,
+              "fogFar": 100,
+              "cameraFov": 40,
+              "cameraEyeSep": 0.3,
+              "cameraType": "perspective",
+              "lightColor": 14540253,
+              "lightIntensity": 1,
+              "ambientColor": 14540253,
+              "ambientIntensity": 0.2,
+              "hoverTimeout": 0,
+              "tooltip": true,
+              "mousePreset": "default"
+            },
+            "_ngl_repr_dict": {
+              "0": {
+                "0": {
+                  "type": "ball+stick",
+                  "params": {
+                    "lazy": false,
+                    "visible": true,
+                    "quality": "high",
+                    "sphereDetail": 2,
+                    "radialSegments": 20,
+                    "openEnded": true,
+                    "disableImpostor": false,
+                    "aspectRatio": 1.5,
+                    "lineOnly": false,
+                    "cylinderOnly": false,
+                    "multipleBond": "off",
+                    "bondScale": 0.3,
+                    "bondSpacing": 0.75,
+                    "linewidth": 2,
+                    "radiusType": "size",
+                    "radiusData": {},
+                    "radiusSize": 0.15,
+                    "radiusScale": 2,
+                    "assembly": "default",
+                    "defaultAssembly": "",
+                    "clipNear": 0,
+                    "clipRadius": 0,
+                    "clipCenter": {
+                      "x": 0,
+                      "y": 0,
+                      "z": 0
+                    },
+                    "flatShaded": false,
+                    "opacity": 1,
+                    "depthWrite": true,
+                    "side": "double",
+                    "wireframe": false,
+                    "colorScheme": "element",
+                    "colorScale": "",
+                    "colorReverse": false,
+                    "colorValue": 9474192,
+                    "colorMode": "hcl",
+                    "roughness": 0.4,
+                    "metalness": 0,
+                    "diffuse": 16777215,
+                    "diffuseInterior": false,
+                    "useInteriorColor": true,
+                    "interiorColor": 2236962,
+                    "interiorDarkening": 0,
+                    "matrix": {
+                      "elements": [
+                        1,
+                        0,
+                        0,
+                        0,
+                        0,
+                        1,
+                        0,
+                        0,
+                        0,
+                        0,
+                        1,
+                        0,
+                        0,
+                        0,
+                        0,
+                        1
+                      ]
+                    },
+                    "disablePicking": false,
+                    "sele": ""
+                  }
+                }
+              },
+              "1": {
+                "0": {
+                  "type": "ball+stick",
+                  "params": {
+                    "lazy": false,
+                    "visible": true,
+                    "quality": "high",
+                    "sphereDetail": 2,
+                    "radialSegments": 20,
+                    "openEnded": true,
+                    "disableImpostor": false,
+                    "aspectRatio": 1.5,
+                    "lineOnly": false,
+                    "cylinderOnly": false,
+                    "multipleBond": "off",
+                    "bondScale": 0.3,
+                    "bondSpacing": 0.75,
+                    "linewidth": 2,
+                    "radiusType": "size",
+                    "radiusData": {},
+                    "radiusSize": 0.15,
+                    "radiusScale": 2,
+                    "assembly": "default",
+                    "defaultAssembly": "",
+                    "clipNear": 0,
+                    "clipRadius": 0,
+                    "clipCenter": {
+                      "x": 0,
+                      "y": 0,
+                      "z": 0
+                    },
+                    "flatShaded": false,
+                    "opacity": 1,
+                    "depthWrite": true,
+                    "side": "double",
+                    "wireframe": false,
+                    "colorScheme": "element",
+                    "colorScale": "",
+                    "colorReverse": false,
+                    "colorValue": 9474192,
+                    "colorMode": "hcl",
+                    "roughness": 0.4,
+                    "metalness": 0,
+                    "diffuse": 16777215,
+                    "diffuseInterior": false,
+                    "useInteriorColor": true,
+                    "interiorColor": 2236962,
+                    "interiorDarkening": 0,
+                    "matrix": {
+                      "elements": [
+                        1,
+                        0,
+                        0,
+                        0,
+                        0,
+                        1,
+                        0,
+                        0,
+                        0,
+                        0,
+                        1,
+                        0,
+                        0,
+                        0,
+                        0,
+                        1
+                      ]
+                    },
+                    "disablePicking": false,
+                    "sele": ""
+                  }
+                }
+              }
+            },
+            "_ngl_serialize": false,
+            "_ngl_version": "",
+            "_ngl_view_id": [
+              "FB989FD1-5B9C-446B-8914-6B58AF85446D"
+            ],
+            "_player_dict": {},
+            "_scene_position": {},
+            "_scene_rotation": {},
+            "_synced_model_ids": [],
+            "_synced_repr_model_ids": [],
+            "_view_count": null,
+            "_view_height": "",
+            "_view_module": "nglview-js-widgets",
+            "_view_module_version": "3.0.1",
+            "_view_name": "NGLView",
+            "_view_width": "",
+            "background": "white",
+            "frame": 0,
+            "gui_style": null,
+            "layout": "IPY_MODEL_c6596896148b4a8a9c57963b67c7782f",
+            "max_frame": 0,
+            "n_components": 2,
+            "picked": {}
+          }
+        },
+        "c6596896148b4a8a9c57963b67c7782f": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "2489b5e5648541fbbdceadb05632a050": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ButtonModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ButtonModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ButtonView",
+            "button_style": "",
+            "description": "",
+            "disabled": false,
+            "icon": "compress",
+            "layout": "IPY_MODEL_abce2a80e6304df3899109c6d6cac199",
+            "style": "IPY_MODEL_65195cb7a4134f4887e9dd19f3676462",
+            "tooltip": ""
+          }
+        },
+        "01e0ba4e5da04914b4652b8d58565d7b": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_e5c0d75eb5e1447abd560c8f2c6017e1",
+              "IPY_MODEL_5146907ef6764654ad7d598baebc8b58"
+            ],
+            "layout": "IPY_MODEL_144ec959b7604a2cabb5ca46ae5e5379"
+          }
+        },
+        "c30e6c2f3e2a44dbbb3d63bd519acaa4": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "f31c6e40e9b2466a9064a2669933ecd5": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "19308ccac642498ab8b58462e3f1b0bb": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "4a081cdc2ec3421ca79dd933b7e2b0c4": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "SliderStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "SliderStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": "",
+            "handle_color": null
+          }
+        },
+        "e5c0d75eb5e1447abd560c8f2c6017e1": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "PlayModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "PlayModel",
+            "_playing": false,
+            "_repeat": false,
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "PlayView",
+            "description": "",
+            "description_tooltip": null,
+            "disabled": false,
+            "interval": 100,
+            "layout": "IPY_MODEL_c30e6c2f3e2a44dbbb3d63bd519acaa4",
+            "max": 0,
+            "min": 0,
+            "show_repeat": true,
+            "step": 1,
+            "style": "IPY_MODEL_f31c6e40e9b2466a9064a2669933ecd5",
+            "value": 0
+          }
+        },
+        "5146907ef6764654ad7d598baebc8b58": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "IntSliderModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "IntSliderModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "IntSliderView",
+            "continuous_update": true,
+            "description": "",
+            "description_tooltip": null,
+            "disabled": false,
+            "layout": "IPY_MODEL_19308ccac642498ab8b58462e3f1b0bb",
+            "max": 0,
+            "min": 0,
+            "orientation": "horizontal",
+            "readout": true,
+            "readout_format": "d",
+            "step": 1,
+            "style": "IPY_MODEL_4a081cdc2ec3421ca79dd933b7e2b0c4",
+            "value": 0
+          }
+        },
+        "144ec959b7604a2cabb5ca46ae5e5379": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "abce2a80e6304df3899109c6d6cac199": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": "34px"
+          }
+        },
+        "65195cb7a4134f4887e9dd19f3676462": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ButtonStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ButtonStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "button_color": null,
+            "font_weight": ""
+          }
+        }
+      }
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/gligen/README.md b/diffusers/examples/research_projects/gligen/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fa922617d984ff1f22c7d8d9f35f396adb85c33a
--- /dev/null
+++ b/diffusers/examples/research_projects/gligen/README.md
@@ -0,0 +1,156 @@
+# GLIGEN: Open-Set Grounded Text-to-Image Generation
+
+These scripts contain the code to prepare the grounding data and train the GLIGEN model on COCO dataset.
+
+### Install the requirements
+
+```bash
+conda create -n diffusers python==3.10
+conda activate diffusers
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+
+write_basic_config()
+```
+
+### Prepare the training data
+
+If you want to make your own grounding data, you need to install the requirements.
+
+I used [RAM](https://github.com/xinyu1205/recognize-anything) to tag
+images, [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO/issues?q=refer) to detect objects,
+and [BLIP2](https://huggingface.co/docs/transformers/en/model_doc/blip-2) to caption instances.
+
+Only RAM needs to be installed manually:
+
+```bash
+pip install git+https://github.com/xinyu1205/recognize-anything.git --no-deps
+```
+
+Download the pre-trained model:
+
+```bash
+huggingface-cli download --resume-download xinyu1205/recognize_anything_model ram_swin_large_14m.pth
+huggingface-cli download --resume-download IDEA-Research/grounding-dino-base
+huggingface-cli download --resume-download Salesforce/blip2-flan-t5-xxl
+huggingface-cli download --resume-download clip-vit-large-patch14
+huggingface-cli download --resume-download masterful/gligen-1-4-generation-text-box
+```
+
+Make the training data on 8 GPUs:
+
+```bash
+torchrun --master_port 17673 --nproc_per_node=8 make_datasets.py \
+    --data_root /mnt/workspace/workgroup/zhizhonghuang/dataset/COCO/train2017 \
+    --save_root /root/gligen_data \
+    --ram_checkpoint /root/.cache/huggingface/hub/models--xinyu1205--recognize_anything_model/snapshots/ebc52dc741e86466202a5ab8ab22eae6e7d48bf1/ram_swin_large_14m.pth
+```
+
+You can download the COCO training data from
+
+```bash
+huggingface-cli download --resume-download Hzzone/GLIGEN_COCO coco_train2017.pth
+```
+
+It's in the format of
+
+```json
+[
+  ...
+  {
+    'file_path': Path,
+    'annos': [
+      {
+        'caption': Instance
+        Caption,
+        'bbox': bbox
+        in
+        xyxy,
+        'text_embeddings_before_projection': CLIP
+        text
+        embedding
+        before
+        linear
+        projection
+      }
+    ]
+  }
+  ...
+]
+```
+
+### Training commands
+
+The training script is heavily based
+on https://github.com/huggingface/diffusers/blob/main/examples/controlnet/train_controlnet.py
+
+```bash
+accelerate launch train_gligen_text.py \
+    --data_path /root/data/zhizhonghuang/coco_train2017.pth \
+    --image_path /mnt/workspace/workgroup/zhizhonghuang/dataset/COCO/train2017 \
+    --train_batch_size 8 \
+    --max_train_steps 100000 \
+    --checkpointing_steps 1000 \
+    --checkpoints_total_limit 10 \
+    --learning_rate 5e-5 \
+    --dataloader_num_workers 16 \
+    --mixed_precision fp16 \
+    --report_to wandb \
+    --tracker_project_name gligen \
+    --output_dir /root/data/zhizhonghuang/ckpt/GLIGEN_Text_Retrain_COCO
+```
+
+I trained the model on 8 A100 GPUs for about 11 hours (at least 24GB GPU memory). The generated images will follow the
+layout possibly at 50k iterations.
+
+Note that although the pre-trained GLIGEN model has been loaded, the parameters of `fuser` and `position_net` have been reset (see line 420 in `train_gligen_text.py`)
+
+The trained model can be downloaded from
+
+```bash
+huggingface-cli download --resume-download Hzzone/GLIGEN_COCO config.json diffusion_pytorch_model.safetensors
+```
+
+You can run `demo.ipynb` to visualize the generated images.
+
+Example prompts:
+
+```python
+prompt = 'A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky'
+boxes = [[0.041015625, 0.548828125, 0.453125, 0.859375],
+         [0.525390625, 0.552734375, 0.93359375, 0.865234375],
+         [0.12890625, 0.015625, 0.412109375, 0.279296875],
+         [0.578125, 0.08203125, 0.857421875, 0.27734375]]
+gligen_phrases = ['a green car', 'a blue truck', 'a red air balloon', 'a bird']
+```
+
+Example images:
+![alt text](generated-images-100000-00.png)
+
+### Citation
+
+```
+@article{li2023gligen,
+  title={GLIGEN: Open-Set Grounded Text-to-Image Generation},
+  author={Li, Yuheng and Liu, Haotian and Wu, Qingyang and Mu, Fangzhou and Yang, Jianwei and Gao, Jianfeng and Li, Chunyuan and Lee, Yong Jae},
+  journal={CVPR},
+  year={2023}
+}
+```
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/gligen/dataset.py b/diffusers/examples/research_projects/gligen/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b851bc6ded7d068c2920a66b3f1fd0e6916541d
--- /dev/null
+++ b/diffusers/examples/research_projects/gligen/dataset.py
@@ -0,0 +1,110 @@
+import os
+import random
+
+import torch
+import torchvision.transforms as transforms
+from PIL import Image
+
+
+def recalculate_box_and_verify_if_valid(x, y, w, h, image_size, original_image_size, min_box_size):
+    scale = image_size / min(original_image_size)
+    crop_y = (original_image_size[1] * scale - image_size) // 2
+    crop_x = (original_image_size[0] * scale - image_size) // 2
+    x0 = max(x * scale - crop_x, 0)
+    y0 = max(y * scale - crop_y, 0)
+    x1 = min((x + w) * scale - crop_x, image_size)
+    y1 = min((y + h) * scale - crop_y, image_size)
+    if (x1 - x0) * (y1 - y0) / (image_size * image_size) < min_box_size:
+        return False, (None, None, None, None)
+    return True, (x0, y0, x1, y1)
+
+
+class COCODataset(torch.utils.data.Dataset):
+    def __init__(
+        self,
+        data_path,
+        image_path,
+        image_size=512,
+        min_box_size=0.01,
+        max_boxes_per_data=8,
+        tokenizer=None,
+    ):
+        super().__init__()
+        self.min_box_size = min_box_size
+        self.max_boxes_per_data = max_boxes_per_data
+        self.image_size = image_size
+        self.image_path = image_path
+        self.tokenizer = tokenizer
+        self.transforms = transforms.Compose(
+            [
+                transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(image_size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+        self.data_list = torch.load(data_path, map_location="cpu")
+
+    def __getitem__(self, index):
+        if self.max_boxes_per_data > 99:
+            assert False, "Are you sure setting such large number of boxes per image?"
+
+        out = {}
+
+        data = self.data_list[index]
+        image = Image.open(os.path.join(self.image_path, data["file_path"])).convert("RGB")
+        original_image_size = image.size
+        out["pixel_values"] = self.transforms(image)
+
+        annos = data["annos"]
+
+        areas, valid_annos = [], []
+        for anno in annos:
+            # x, y, w, h = anno['bbox']
+            x0, y0, x1, y1 = anno["bbox"]
+            x, y, w, h = x0, y0, x1 - x0, y1 - y0
+            valid, (x0, y0, x1, y1) = recalculate_box_and_verify_if_valid(
+                x, y, w, h, self.image_size, original_image_size, self.min_box_size
+            )
+            if valid:
+                anno["bbox"] = [x0, y0, x1, y1]
+                areas.append((x1 - x0) * (y1 - y0))
+                valid_annos.append(anno)
+
+        # Sort according to area and choose the largest N objects
+        wanted_idxs = torch.tensor(areas).sort(descending=True)[1]
+        wanted_idxs = wanted_idxs[: self.max_boxes_per_data]
+        valid_annos = [valid_annos[i] for i in wanted_idxs]
+
+        out["boxes"] = torch.zeros(self.max_boxes_per_data, 4)
+        out["masks"] = torch.zeros(self.max_boxes_per_data)
+        out["text_embeddings_before_projection"] = torch.zeros(self.max_boxes_per_data, 768)
+
+        for i, anno in enumerate(valid_annos):
+            out["boxes"][i] = torch.tensor(anno["bbox"]) / self.image_size
+            out["masks"][i] = 1
+            out["text_embeddings_before_projection"][i] = anno["text_embeddings_before_projection"]
+
+        prob_drop_boxes = 0.1
+        if random.random() < prob_drop_boxes:
+            out["masks"][:] = 0
+
+        caption = random.choice(data["captions"])
+
+        prob_drop_captions = 0.5
+        if random.random() < prob_drop_captions:
+            caption = ""
+        caption = self.tokenizer(
+            caption,
+            max_length=self.tokenizer.model_max_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        )
+        out["caption"] = caption
+
+        return out
+
+    def __len__(self):
+        return len(self.data_list)
diff --git a/diffusers/examples/research_projects/gligen/demo.ipynb b/diffusers/examples/research_projects/gligen/demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..571f1a0323a2bb99c662f5d6955836e04d91677b
--- /dev/null
+++ b/diffusers/examples/research_projects/gligen/demo.ipynb
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/root/miniconda/envs/densecaption/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import torch\n",
+    "from diffusers import StableDiffusionGLIGENTextImagePipeline, StableDiffusionGLIGENPipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import diffusers\n",
+    "from diffusers import (\n",
+    "    AutoencoderKL,\n",
+    "    DDPMScheduler,\n",
+    "    UNet2DConditionModel,\n",
+    "    UniPCMultistepScheduler,\n",
+    "    EulerDiscreteScheduler,\n",
+    ")\n",
+    "from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer\n",
+    "# pretrained_model_name_or_path = 'masterful/gligen-1-4-generation-text-box'\n",
+    "\n",
+    "pretrained_model_name_or_path = '/root/data/zhizhonghuang/checkpoints/models--masterful--gligen-1-4-generation-text-box/snapshots/d2820dc1e9ba6ca082051ce79cfd3eb468ae2c83'\n",
+    "\n",
+    "tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder=\"tokenizer\")\n",
+    "noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder=\"scheduler\")\n",
+    "text_encoder = CLIPTextModel.from_pretrained(\n",
+    "    pretrained_model_name_or_path, subfolder=\"text_encoder\"\n",
+    ")\n",
+    "vae = AutoencoderKL.from_pretrained(\n",
+    "    pretrained_model_name_or_path, subfolder=\"vae\"\n",
+    ")\n",
+    "# unet = UNet2DConditionModel.from_pretrained(\n",
+    "#     pretrained_model_name_or_path, subfolder=\"unet\"\n",
+    "# )\n",
+    "\n",
+    "noise_scheduler = EulerDiscreteScheduler.from_config(noise_scheduler.config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unet = UNet2DConditionModel.from_pretrained(\n",
+    "    '/root/data/zhizhonghuang/ckpt/GLIGEN_Text_Retrain_COCO'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion_gligen.pipeline_stable_diffusion_gligen.StableDiffusionGLIGENPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .\n"
+     ]
+    }
+   ],
+   "source": [
+    "pipe = StableDiffusionGLIGENPipeline(\n",
+    "    vae,\n",
+    "    text_encoder,\n",
+    "    tokenizer,\n",
+    "    unet,\n",
+    "    noise_scheduler,\n",
+    "    safety_checker=None,\n",
+    "    feature_extractor=None,\n",
+    ")\n",
+    "pipe = pipe.to(\"cuda\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# prompt = 'A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky'\n",
+    "# gen_boxes = [('a green car', [21, 281, 211, 159]), ('a blue truck', [269, 283, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]\n",
+    "\n",
+    "# prompt = 'A realistic top-down view of a wooden table with two apples on it'\n",
+    "# gen_boxes = [('a wooden table', [20, 148, 472, 216]), ('an apple', [150, 226, 100, 100]), ('an apple', [280, 226, 100, 100])]\n",
+    "\n",
+    "# prompt = 'A realistic scene of three skiers standing in a line on the snow near a palm tree'\n",
+    "# gen_boxes = [('a skier', [5, 152, 139, 168]), ('a skier', [278, 192, 121, 158]), ('a skier', [148, 173, 124, 155]), ('a palm tree', [404, 105, 103, 251])]\n",
+    "\n",
+    "prompt = 'An oil painting of a pink dolphin jumping on the left of a steam boat on the sea'\n",
+    "gen_boxes = [('a steam boat', [232, 225, 257, 149]), ('a jumping pink dolphin', [21, 249, 189, 123])]\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "boxes = np.array([x[1] for x in gen_boxes])\n",
+    "boxes = boxes / 512\n",
+    "boxes[:, 2] = boxes[:, 0] + boxes[:, 2]\n",
+    "boxes[:, 3] = boxes[:, 1] + boxes[:, 3]\n",
+    "boxes = boxes.tolist()\n",
+    "gligen_phrases = [x[0] for x in gen_boxes]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/root/miniconda/envs/densecaption/lib/python3.11/site-packages/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py:683: FutureWarning: Accessing config attribute `in_channels` directly via 'UNet2DConditionModel' object attribute is deprecated. Please access 'in_channels' over 'UNet2DConditionModel's config object instead, e.g. 'unet.config.in_channels'.\n",
+      "  num_channels_latents = self.unet.in_channels\n",
+      "/root/miniconda/envs/densecaption/lib/python3.11/site-packages/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py:716: FutureWarning: Accessing config attribute `cross_attention_dim` directly via 'UNet2DConditionModel' object attribute is deprecated. Please access 'cross_attention_dim' over 'UNet2DConditionModel's config object instead, e.g. 'unet.config.cross_attention_dim'.\n",
+      "  max_objs, self.unet.cross_attention_dim, device=device, dtype=self.text_encoder.dtype\n",
+      "100%|██████████| 50/50 [01:21<00:00,  1.64s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "images = pipe(\n",
+    "    prompt=prompt,\n",
+    "    gligen_phrases=gligen_phrases,\n",
+    "    gligen_boxes=boxes,\n",
+    "    gligen_scheduled_sampling_beta=1.0,\n",
+    "    output_type=\"pil\",\n",
+    "    num_inference_steps=50,\n",
+    "    negative_prompt=\"artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate\",\n",
+    "    num_images_per_prompt=16,\n",
+    ").images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "diffusers.utils.make_image_grid(images, 4, len(images)//4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "densecaption",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/diffusers/examples/research_projects/gligen/make_datasets.py b/diffusers/examples/research_projects/gligen/make_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..37950a7b73ca2229b8e130035380d0c6a1e69129
--- /dev/null
+++ b/diffusers/examples/research_projects/gligen/make_datasets.py
@@ -0,0 +1,119 @@
+import argparse
+import os
+import random
+
+import torch
+import torchvision
+import torchvision.transforms as TS
+from PIL import Image
+from ram import inference_ram
+from ram.models import ram
+from tqdm import tqdm
+from transformers import (
+    AutoModelForZeroShotObjectDetection,
+    AutoProcessor,
+    Blip2ForConditionalGeneration,
+    Blip2Processor,
+    CLIPTextModel,
+    CLIPTokenizer,
+)
+
+
+torch.autograd.set_grad_enabled(False)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("Caption Generation script", add_help=False)
+    parser.add_argument("--data_root", type=str, required=True, help="path to COCO")
+    parser.add_argument("--save_root", type=str, required=True, help="path to save")
+    parser.add_argument("--ram_checkpoint", type=str, required=True, help="path to save")
+    args = parser.parse_args()
+
+    # ram_checkpoint = '/root/.cache/huggingface/hub/models--xinyu1205--recognize_anything_model/snapshots/ebc52dc741e86466202a5ab8ab22eae6e7d48bf1/ram_swin_large_14m.pth'
+    # data_root = '/mnt/workspace/workgroup/zhizhonghuang/dataset/COCO/train2017'
+    # save_root = '/root/gligen_data'
+    box_threshold = 0.25
+    text_threshold = 0.2
+
+    import torch.distributed as dist
+
+    dist.init_process_group(backend="nccl", init_method="env://")
+    local_rank = torch.distributed.get_rank() % torch.cuda.device_count()
+    device = f"cuda:{local_rank}"
+    torch.cuda.set_device(local_rank)
+
+    ram_model = ram(pretrained=args.ram_checkpoint, image_size=384, vit="swin_l").cuda().eval()
+    ram_processor = TS.Compose(
+        [TS.Resize((384, 384)), TS.ToTensor(), TS.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
+    )
+
+    grounding_dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
+    grounding_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
+        "IDEA-Research/grounding-dino-base"
+    ).cuda()
+
+    blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xxl")
+    blip2_model = Blip2ForConditionalGeneration.from_pretrained(
+        "Salesforce/blip2-flan-t5-xxl", torch_dtype=torch.float16
+    ).cuda()
+
+    clip_text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").cuda()
+    clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+
+    image_paths = [os.path.join(args.data_root, x) for x in os.listdir(args.data_root)]
+    random.shuffle(image_paths)
+
+    for image_path in tqdm.tqdm(image_paths):
+        pth_path = os.path.join(args.save_root, os.path.basename(image_path))
+        if os.path.exists(pth_path):
+            continue
+
+        sample = {"file_path": os.path.basename(image_path), "annos": []}
+
+        raw_image = Image.open(image_path).convert("RGB")
+
+        res = inference_ram(ram_processor(raw_image).unsqueeze(0).cuda(), ram_model)
+
+        text = res[0].replace(" |", ".")
+
+        inputs = grounding_dino_processor(images=raw_image, text=text, return_tensors="pt")
+        inputs = {k: v.cuda() for k, v in inputs.items()}
+        outputs = grounding_dino_model(**inputs)
+
+        results = grounding_dino_processor.post_process_grounded_object_detection(
+            outputs,
+            inputs["input_ids"],
+            box_threshold=box_threshold,
+            text_threshold=text_threshold,
+            target_sizes=[raw_image.size[::-1]],
+        )
+        boxes = results[0]["boxes"]
+        labels = results[0]["labels"]
+        scores = results[0]["scores"]
+        indices = torchvision.ops.nms(boxes, scores, 0.5)
+        boxes = boxes[indices]
+        category_names = [labels[i] for i in indices]
+
+        for i, bbox in enumerate(boxes):
+            bbox = bbox.tolist()
+            inputs = blip2_processor(images=raw_image.crop(bbox), return_tensors="pt")
+            inputs = {k: v.cuda().to(torch.float16) for k, v in inputs.items()}
+            outputs = blip2_model.generate(**inputs)
+            caption = blip2_processor.decode(outputs[0], skip_special_tokens=True)
+            inputs = clip_tokenizer(
+                caption,
+                padding="max_length",
+                max_length=clip_tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            inputs = {k: v.cuda() for k, v in inputs.items()}
+            text_embeddings_before_projection = clip_text_encoder(**inputs).pooler_output.squeeze(0)
+
+            sample["annos"].append(
+                {
+                    "caption": caption,
+                    "bbox": bbox,
+                    "text_embeddings_before_projection": text_embeddings_before_projection,
+                }
+            )
+        torch.save(sample, pth_path)
diff --git a/diffusers/examples/research_projects/gligen/requirements.txt b/diffusers/examples/research_projects/gligen/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b0dc16b7578cf84956329750d1f2146f3655f6e7
--- /dev/null
+++ b/diffusers/examples/research_projects/gligen/requirements.txt
@@ -0,0 +1,11 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
+diffusers
+scipy
+timm
+fairscale
+wandb
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/gligen/train_gligen_text.py b/diffusers/examples/research_projects/gligen/train_gligen_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..6645d19b73e6d761b9e752454380aa4bcfcd9ecd
--- /dev/null
+++ b/diffusers/examples/research_projects/gligen/train_gligen_text.py
@@ -0,0 +1,715 @@
+# from accelerate.utils import write_basic_config
+#
+# write_basic_config()
+import argparse
+import logging
+import math
+import os
+import shutil
+from pathlib import Path
+
+import accelerate
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from packaging import version
+from tqdm.auto import tqdm
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    EulerDiscreteScheduler,
+    StableDiffusionGLIGENPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import is_wandb_available, make_image_grid
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    pass
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+# check_min_version("0.28.0.dev0")
+
+logger = get_logger(__name__)
+
+
+@torch.no_grad()
+def log_validation(vae, text_encoder, tokenizer, unet, noise_scheduler, args, accelerator, step, weight_dtype):
+    if accelerator.is_main_process:
+        print("generate test images...")
+    unet = accelerator.unwrap_model(unet)
+    vae.to(accelerator.device, dtype=torch.float32)
+
+    pipeline = StableDiffusionGLIGENPipeline(
+        vae,
+        text_encoder,
+        tokenizer,
+        unet,
+        EulerDiscreteScheduler.from_config(noise_scheduler.config),
+        safety_checker=None,
+        feature_extractor=None,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=not accelerator.is_main_process)
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    prompt = "A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky"
+    boxes = [
+        [0.041015625, 0.548828125, 0.453125, 0.859375],
+        [0.525390625, 0.552734375, 0.93359375, 0.865234375],
+        [0.12890625, 0.015625, 0.412109375, 0.279296875],
+        [0.578125, 0.08203125, 0.857421875, 0.27734375],
+    ]
+    gligen_phrases = ["a green car", "a blue truck", "a red air balloon", "a bird"]
+    images = pipeline(
+        prompt=prompt,
+        gligen_phrases=gligen_phrases,
+        gligen_boxes=boxes,
+        gligen_scheduled_sampling_beta=1.0,
+        output_type="pil",
+        num_inference_steps=50,
+        negative_prompt="artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate",
+        num_images_per_prompt=4,
+        generator=generator,
+    ).images
+    os.makedirs(os.path.join(args.output_dir, "images"), exist_ok=True)
+    make_image_grid(images, 1, 4).save(
+        os.path.join(args.output_dir, "images", f"generated-images-{step:06d}-{accelerator.process_index:02d}.png")
+    )
+
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.")
+    parser.add_argument(
+        "--data_path",
+        type=str,
+        default="coco_train2017.pth",
+        help="Path to training dataset.",
+    )
+    parser.add_argument(
+        "--image_path",
+        type=str,
+        default="coco_train2017.pth",
+        help="Path to training images.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="controlnet-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=0, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
+            "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
+            "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
+            "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
+            "instructions."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="train_controlnet",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main(args):
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    # import correct text encoder class
+    # text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+    # Load scheduler and models
+    from transformers import CLIPTextModel, CLIPTokenizer
+
+    pretrained_model_name_or_path = "masterful/gligen-1-4-generation-text-box"
+    tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
+    noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_name_or_path, subfolder="text_encoder")
+
+    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
+    unet = UNet2DConditionModel.from_pretrained(pretrained_model_name_or_path, subfolder="unet")
+
+    # Taken from [Sayak Paul's Diffusers PR #6511](https://github.com/huggingface/diffusers/pull/6511/files)
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                i = len(weights) - 1
+
+                while len(weights) > 0:
+                    weights.pop()
+                    model = models[i]
+
+                    sub_dir = "unet"
+                    model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+                    i -= 1
+
+        def load_model_hook(models, input_dir):
+            while len(models) > 0:
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = unet.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    vae.requires_grad_(False)
+    unet.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+            # controlnet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # if args.gradient_checkpointing:
+    #     controlnet.enable_gradient_checkpointing()
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if unwrap_model(unet).dtype != torch.float32:
+        raise ValueError(f"Controlnet loaded as datatype {unwrap_model(unet).dtype}. {low_precision_error_string}")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    optimizer_class = torch.optim.AdamW
+    # Optimizer creation
+    for n, m in unet.named_modules():
+        if ("fuser" in n) or ("position_net" in n):
+            import torch.nn as nn
+
+            if isinstance(m, (nn.Linear, nn.LayerNorm)):
+                m.reset_parameters()
+    params_to_optimize = []
+    for n, p in unet.named_parameters():
+        if ("fuser" in n) or ("position_net" in n):
+            p.requires_grad = True
+            params_to_optimize.append(p)
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    from dataset import COCODataset
+
+    train_dataset = COCODataset(
+        data_path=args.data_path,
+        image_path=args.image_path,
+        tokenizer=tokenizer,
+        image_size=args.resolution,
+        max_boxes_per_data=30,
+    )
+
+    print("num samples: ", len(train_dataset))
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        # collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae, unet and text_encoder to device and cast to weight_dtype
+    vae.to(accelerator.device, dtype=weight_dtype)
+    # unet.to(accelerator.device, dtype=weight_dtype)
+    unet.to(accelerator.device, dtype=torch.float32)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+
+        # tensorboard cannot handle list types for config
+        # tracker_config.pop("validation_prompt")
+        # tracker_config.pop("validation_image")
+
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    # Train!
+    # total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    # logger.info("***** Running training *****")
+    # logger.info(f"  Num examples = {len(train_dataset)}")
+    # logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    # logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    # logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    # logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    # logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    # logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    log_validation(
+        vae,
+        text_encoder,
+        tokenizer,
+        unet,
+        noise_scheduler,
+        args,
+        accelerator,
+        global_step,
+        weight_dtype,
+    )
+
+    # image_logs = None
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                with torch.no_grad():
+                    # Get the text embedding for conditioning
+                    encoder_hidden_states = text_encoder(
+                        batch["caption"]["input_ids"].squeeze(1),
+                        # batch['caption']['attention_mask'].squeeze(1),
+                        return_dict=False,
+                    )[0]
+
+                cross_attention_kwargs = {}
+                cross_attention_kwargs["gligen"] = {
+                    "boxes": batch["boxes"],
+                    "positive_embeddings": batch["text_embeddings_before_projection"],
+                    "masks": batch["masks"],
+                }
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_latents,
+                    timesteps,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step:06d}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    # if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                    log_validation(
+                        vae,
+                        text_encoder,
+                        tokenizer,
+                        unet,
+                        noise_scheduler,
+                        args,
+                        accelerator,
+                        global_step,
+                        weight_dtype,
+                    )
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        unet.save_pretrained(args.output_dir)
+    #
+    #     # Run a final round of validation.
+    #     image_logs = None
+    #     if args.validation_prompt is not None:
+    #         image_logs = log_validation(
+    #             vae=vae,
+    #             text_encoder=text_encoder,
+    #             tokenizer=tokenizer,
+    #             unet=unet,
+    #             controlnet=None,
+    #             args=args,
+    #             accelerator=accelerator,
+    #             weight_dtype=weight_dtype,
+    #             step=global_step,
+    #             is_final_validation=True,
+    #         )
+    #
+    #     if args.push_to_hub:
+    #         save_model_card(
+    #             repo_id,
+    #             image_logs=image_logs,
+    #             base_model=args.pretrained_model_name_or_path,
+    #             repo_folder=args.output_dir,
+    #         )
+    #         upload_folder(
+    #             repo_id=repo_id,
+    #             folder_path=args.output_dir,
+    #             commit_message="End of training",
+    #             ignore_patterns=["step_*", "epoch_*"],
+    #         )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/instructpix2pix_lora/README.md b/diffusers/examples/research_projects/instructpix2pix_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cfcd98926c074f9d1e32a73f81b0a46a6bf5c6f2
--- /dev/null
+++ b/diffusers/examples/research_projects/instructpix2pix_lora/README.md
@@ -0,0 +1,75 @@
+# InstructPix2Pix text-to-edit-image fine-tuning
+This extended LoRA training script was authored by [Aiden-Frost](https://github.com/Aiden-Frost).
+This is an experimental LoRA extension of [this example](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py). This script provides further support add LoRA layers for unet model.
+
+## Training script example
+
+```bash
+export MODEL_ID="timbrooks/instruct-pix2pix"
+export DATASET_ID="instruction-tuning-sd/cartoonization"
+export OUTPUT_DIR="instructPix2Pix-cartoonization"
+
+accelerate launch finetune_instruct_pix2pix.py \
+  --pretrained_model_name_or_path=$MODEL_ID \
+  --dataset_name=$DATASET_ID \
+  --enable_xformers_memory_efficient_attention \
+  --resolution=256 --random_flip \
+  --train_batch_size=2 --gradient_accumulation_steps=4 --gradient_checkpointing \
+  --max_train_steps=15000 \
+  --checkpointing_steps=5000 --checkpoints_total_limit=1 \
+  --learning_rate=5e-05 --lr_warmup_steps=0 \
+  --val_image_url="https://hf.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png" \
+  --validation_prompt="Generate a cartoonized version of the natural image" \
+  --seed=42 \
+  --rank=4 \
+  --output_dir=$OUTPUT_DIR \
+  --report_to=wandb \
+  --push_to_hub
+```
+
+## Inference
+After training the model and the lora weight of the model is stored in the ```$OUTPUT_DIR```.
+
+```py
+# load the base model pipeline
+pipe_lora = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix")
+
+# Load LoRA weights from the provided path
+output_dir = "path/to/lora_weight_directory"
+pipe_lora.unet.load_attn_procs(output_dir)
+
+input_image_path = "/path/to/input_image"
+input_image = Image.open(input_image_path)
+edited_images = pipe_lora(num_images_per_prompt=1, prompt=args.edit_prompt, image=input_image, num_inference_steps=1000).images
+edited_images[0].show()
+```
+
+## Results
+
+Here is an example of using the script to train a instructpix2pix model.
+Trained on google colab T4 GPU
+
+```bash
+MODEL_ID="timbrooks/instruct-pix2pix"
+DATASET_ID="instruction-tuning-sd/cartoonization"
+TRAIN_EPOCHS=100
+```
+
+Below are few examples for given the input image, edit_prompt and the edited_image (output of the model)
+
+<p align="center">
+    <img src="https://github.com/Aiden-Frost/Efficiently-teaching-counting-and-cartoonization-to-InstructPix2Pix.-/blob/main/diffusers_result_assets/edited_image_results.png?raw=true" alt="instructpix2pix-inputs" width=600/>
+</p>
+
+
+Here are some rough statistics about the training model using this script
+
+<p align="center">
+    <img src="https://github.com/Aiden-Frost/Efficiently-teaching-counting-and-cartoonization-to-InstructPix2Pix.-/blob/main/diffusers_result_assets/results.png?raw=true" alt="instructpix2pix-inputs" width=600/>
+</p>
+
+## References
+
+* InstructPix2Pix - https://github.com/timothybrooks/instruct-pix2pix
+* Dataset and example training script - https://huggingface.co/blog/instruction-tuning-sd
+* For more information about the project - https://github.com/Aiden-Frost/Efficiently-teaching-counting-and-cartoonization-to-InstructPix2Pix.-
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/instructpix2pix_lora/train_instruct_pix2pix_lora.py b/diffusers/examples/research_projects/instructpix2pix_lora/train_instruct_pix2pix_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..997d448fa281bc0c5ba807d8efd176344eb0a6c4
--- /dev/null
+++ b/diffusers/examples/research_projects/instructpix2pix_lora/train_instruct_pix2pix_lora.py
@@ -0,0 +1,1063 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script to fine-tune Stable Diffusion for InstructPix2Pix."""
+
+import argparse
+import logging
+import math
+import os
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import PIL
+import requests
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionInstructPix2PixPipeline, UNet2DConditionModel
+from diffusers.models.lora import LoRALinearLayer
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel
+from diffusers.utils import check_min_version, deprecate, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.26.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+DATASET_NAME_MAPPING = {
+    "fusing/instructpix2pix-1000-samples": ("input_image", "edit_prompt", "edited_image"),
+}
+WANDB_TABLE_COL_NAMES = ["original_image", "edited_image", "edit_prompt"]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script for InstructPix2Pix.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--original_image_column",
+        type=str,
+        default="input_image",
+        help="The column of the dataset containing the original image on which edits where made.",
+    )
+    parser.add_argument(
+        "--edited_image_column",
+        type=str,
+        default="edited_image",
+        help="The column of the dataset containing the edited image.",
+    )
+    parser.add_argument(
+        "--edit_prompt_column",
+        type=str,
+        default="edit_prompt",
+        help="The column of the dataset containing the edit instruction.",
+    )
+    parser.add_argument(
+        "--val_image_url",
+        type=str,
+        default=None,
+        help="URL to the original image that you would like to edit (used during inference for debugging purposes).",
+    )
+    parser.add_argument(
+        "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="instruct-pix2pix-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=256,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--conditioning_dropout_prob",
+        type=float,
+        default=None,
+        help="Conditioning dropout probability. Drops out the conditionings (image and edit prompt) used in training InstructPix2Pix. See section 3.2.1 in the paper: https://arxiv.org/abs/2211.09800.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--non_ema_revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or"
+            " remote repository specified with --pretrained_model_name_or_path."
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    # default to using the same revision for the non-ema model if not specified
+    if args.non_ema_revision is None:
+        args.non_ema_revision = args.revision
+
+    return args
+
+
+def convert_to_np(image, resolution):
+    image = image.convert("RGB").resize((resolution, resolution))
+    return np.array(image).transpose(2, 0, 1)
+
+
+def download_image(url):
+    image = PIL.Image.open(requests.get(url, stream=True).raw)
+    image = PIL.ImageOps.exif_transpose(image)
+    image = image.convert("RGB")
+    return image
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if args.non_ema_revision is not None:
+        deprecate(
+            "non_ema_revision!=None",
+            "0.15.0",
+            message=(
+                "Downloading 'non_ema' weights from revision branches of the Hub is deprecated. Please make sure to"
+                " use `--variant=non_ema` instead."
+            ),
+        )
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load scheduler, tokenizer and models.
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+    )
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.non_ema_revision
+    )
+
+    # Freeze vae, text_encoder and unet
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # referred to https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py
+    unet_lora_parameters = []
+    for attn_processor_name, attn_processor in unet.attn_processors.items():
+        # Parse the attention module.
+        attn_module = unet
+        for n in attn_processor_name.split(".")[:-1]:
+            attn_module = getattr(attn_module, n)
+
+        # Set the `lora_layer` attribute of the attention-related matrices.
+        attn_module.to_q.set_lora_layer(
+            LoRALinearLayer(
+                in_features=attn_module.to_q.in_features, out_features=attn_module.to_q.out_features, rank=args.rank
+            )
+        )
+        attn_module.to_k.set_lora_layer(
+            LoRALinearLayer(
+                in_features=attn_module.to_k.in_features, out_features=attn_module.to_k.out_features, rank=args.rank
+            )
+        )
+
+        attn_module.to_v.set_lora_layer(
+            LoRALinearLayer(
+                in_features=attn_module.to_v.in_features, out_features=attn_module.to_v.out_features, rank=args.rank
+            )
+        )
+        attn_module.to_out[0].set_lora_layer(
+            LoRALinearLayer(
+                in_features=attn_module.to_out[0].in_features,
+                out_features=attn_module.to_out[0].out_features,
+                rank=args.rank,
+            )
+        )
+
+        # Accumulate the LoRA params to optimize.
+        unet_lora_parameters.extend(attn_module.to_q.lora_layer.parameters())
+        unet_lora_parameters.extend(attn_module.to_k.lora_layer.parameters())
+        unet_lora_parameters.extend(attn_module.to_v.lora_layer.parameters())
+        unet_lora_parameters.extend(attn_module.to_out[0].lora_layer.parameters())
+
+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = EMAModel(unet.parameters(), model_cls=UNet2DConditionModel, model_config=unet.config)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel)
+                ema_unet.load_state_dict(load_model.state_dict())
+                ema_unet.to(accelerator.device)
+                del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    # train on only unet_lora_parameters
+    optimizer = optimizer_cls(
+        unet_lora_parameters,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/main/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.original_image_column is None:
+        original_image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        original_image_column = args.original_image_column
+        if original_image_column not in column_names:
+            raise ValueError(
+                f"--original_image_column' value '{args.original_image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.edit_prompt_column is None:
+        edit_prompt_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        edit_prompt_column = args.edit_prompt_column
+        if edit_prompt_column not in column_names:
+            raise ValueError(
+                f"--edit_prompt_column' value '{args.edit_prompt_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.edited_image_column is None:
+        edited_image_column = dataset_columns[2] if dataset_columns is not None else column_names[2]
+    else:
+        edited_image_column = args.edited_image_column
+        if edited_image_column not in column_names:
+            raise ValueError(
+                f"--edited_image_column' value '{args.edited_image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(captions):
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids
+
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+        ]
+    )
+
+    def preprocess_images(examples):
+        original_images = np.concatenate(
+            [convert_to_np(image, args.resolution) for image in examples[original_image_column]]
+        )
+        edited_images = np.concatenate(
+            [convert_to_np(image, args.resolution) for image in examples[edited_image_column]]
+        )
+        # We need to ensure that the original and the edited images undergo the same
+        # augmentation transforms.
+        images = np.concatenate([original_images, edited_images])
+        images = torch.tensor(images)
+        images = 2 * (images / 255) - 1
+        return train_transforms(images)
+
+    def preprocess_train(examples):
+        # Preprocess images.
+        preprocessed_images = preprocess_images(examples)
+        # Since the original and edited images were concatenated before
+        # applying the transformations, we need to separate them and reshape
+        # them accordingly.
+        original_images, edited_images = preprocessed_images.chunk(2)
+        original_images = original_images.reshape(-1, 3, args.resolution, args.resolution)
+        edited_images = edited_images.reshape(-1, 3, args.resolution, args.resolution)
+
+        # Collate the preprocessed images into the `examples`.
+        examples["original_pixel_values"] = original_images
+        examples["edited_pixel_values"] = edited_images
+
+        # Preprocess the captions.
+        captions = list(examples[edit_prompt_column])
+        examples["input_ids"] = tokenize_captions(captions)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        original_pixel_values = torch.stack([example["original_pixel_values"] for example in examples])
+        original_pixel_values = original_pixel_values.to(memory_format=torch.contiguous_format).float()
+        edited_pixel_values = torch.stack([example["edited_pixel_values"] for example in examples])
+        edited_pixel_values = edited_pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {
+            "original_pixel_values": original_pixel_values,
+            "edited_pixel_values": edited_pixel_values,
+            "input_ids": input_ids,
+        }
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, unet_lora_parameters, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, unet_lora_parameters, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.use_ema:
+        ema_unet.to(accelerator.device)
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move text_encode and vae to gpu and cast to weight_dtype
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("instruct-pix2pix", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(unet):
+                # We want to learn the denoising process w.r.t the edited images which
+                # are conditioned on the original image (which was edited) and the edit instruction.
+                # So, first, convert images to latent space.
+                latents = vae.encode(batch["edited_pixel_values"].to(weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning.
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Get the additional image embedding for conditioning.
+                # Instead of getting a diagonal Gaussian here, we simply take the mode.
+                original_image_embeds = vae.encode(batch["original_pixel_values"].to(weight_dtype)).latent_dist.mode()
+
+                # Conditioning dropout to support classifier-free guidance during inference. For more details
+                # check out the section 3.2.1 of the original paper https://arxiv.org/abs/2211.09800.
+                if args.conditioning_dropout_prob is not None:
+                    random_p = torch.rand(bsz, device=latents.device, generator=generator)
+                    # Sample masks for the edit prompts.
+                    prompt_mask = random_p < 2 * args.conditioning_dropout_prob
+                    prompt_mask = prompt_mask.reshape(bsz, 1, 1)
+                    # Final text conditioning.
+                    null_conditioning = text_encoder(tokenize_captions([""]).to(accelerator.device))[0]
+                    encoder_hidden_states = torch.where(prompt_mask, null_conditioning, encoder_hidden_states)
+
+                    # Sample masks for the original images.
+                    image_mask_dtype = original_image_embeds.dtype
+                    image_mask = 1 - (
+                        (random_p >= args.conditioning_dropout_prob).to(image_mask_dtype)
+                        * (random_p < 3 * args.conditioning_dropout_prob).to(image_mask_dtype)
+                    )
+                    image_mask = image_mask.reshape(bsz, 1, 1, 1)
+                    # Final image conditioning.
+                    original_image_embeds = image_mask * original_image_embeds
+
+                # Concatenate the `original_image_embeds` with the `noisy_latents`.
+                concatenated_noisy_latents = torch.cat([noisy_latents, original_image_embeds], dim=1)
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # Predict the noise residual and compute loss
+                model_pred = unet(concatenated_noisy_latents, timesteps, encoder_hidden_states).sample
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet_lora_parameters, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_unet.step(unet_lora_parameters)
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if (
+                (args.val_image_url is not None)
+                and (args.validation_prompt is not None)
+                and (epoch % args.validation_epochs == 0)
+            ):
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                # create pipeline
+                if args.use_ema:
+                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                    ema_unet.store(unet.parameters())
+                    ema_unet.copy_to(unet.parameters())
+                # The models need unwrapping because for compatibility in distributed training mode.
+                pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    unet=accelerator.unwrap_model(unet),
+                    text_encoder=accelerator.unwrap_model(text_encoder),
+                    vae=accelerator.unwrap_model(vae),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)
+
+                # run inference
+                original_image = download_image(args.val_image_url)
+                edited_images = []
+                if torch.backends.mps.is_available():
+                    autocast_ctx = nullcontext()
+                else:
+                    autocast_ctx = torch.autocast(accelerator.device.type)
+
+                with autocast_ctx:
+                    for _ in range(args.num_validation_images):
+                        edited_images.append(
+                            pipeline(
+                                args.validation_prompt,
+                                image=original_image,
+                                num_inference_steps=20,
+                                image_guidance_scale=1.5,
+                                guidance_scale=7,
+                                generator=generator,
+                            ).images[0]
+                        )
+
+                for tracker in accelerator.trackers:
+                    if tracker.name == "wandb":
+                        wandb_table = wandb.Table(columns=WANDB_TABLE_COL_NAMES)
+                        for edited_image in edited_images:
+                            wandb_table.add_data(
+                                wandb.Image(original_image), wandb.Image(edited_image), args.validation_prompt
+                            )
+                        tracker.log({"validation": wandb_table})
+                if args.use_ema:
+                    # Switch back to the original UNet parameters.
+                    ema_unet.restore(unet.parameters())
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+        pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            text_encoder=accelerator.unwrap_model(text_encoder),
+            vae=accelerator.unwrap_model(vae),
+            unet=unet,
+            revision=args.revision,
+            variant=args.variant,
+        )
+        # store only LORA layers
+        unet.save_attn_procs(args.output_dir)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+        if args.validation_prompt is not None:
+            edited_images = []
+            pipeline = pipeline.to(accelerator.device)
+            with torch.autocast(str(accelerator.device).replace(":0", "")):
+                for _ in range(args.num_validation_images):
+                    edited_images.append(
+                        pipeline(
+                            args.validation_prompt,
+                            image=original_image,
+                            num_inference_steps=20,
+                            image_guidance_scale=1.5,
+                            guidance_scale=7,
+                            generator=generator,
+                        ).images[0]
+                    )
+
+            for tracker in accelerator.trackers:
+                if tracker.name == "wandb":
+                    wandb_table = wandb.Table(columns=WANDB_TABLE_COL_NAMES)
+                    for edited_image in edited_images:
+                        wandb_table.add_data(
+                            wandb.Image(original_image), wandb.Image(edited_image), args.validation_prompt
+                        )
+                    tracker.log({"test": wandb_table})
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/intel_opts/README.md b/diffusers/examples/research_projects/intel_opts/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2f75b3b0e3c4e7c90f51e55980a5b6044073f37
--- /dev/null
+++ b/diffusers/examples/research_projects/intel_opts/README.md
@@ -0,0 +1,36 @@
+## Diffusers examples with Intel optimizations
+
+**This research project is not actively maintained by the diffusers team. For any questions or comments, please make sure to tag @hshen14 .**
+
+This aims to provide diffusers examples with Intel optimizations such as Bfloat16 for training/fine-tuning acceleration and 8-bit integer (INT8) for inference acceleration on Intel platforms.
+
+## Accelerating the fine-tuning for textual inversion
+
+We accelerate the fine-tuning for textual inversion with Intel Extension for PyTorch. The [examples](textual_inversion) enable both single node and multi-node distributed training with Bfloat16 support on Intel Xeon Scalable Processor.
+
+## Accelerating the inference for Stable Diffusion using Bfloat16
+
+We start the inference acceleration with Bfloat16 using Intel Extension for PyTorch. The [script](inference_bf16.py) is generally designed to support standard Stable Diffusion models with Bfloat16 support.
+```bash
+pip install diffusers transformers accelerate scipy safetensors
+
+export KMP_BLOCKTIME=1
+export KMP_SETTINGS=1
+export KMP_AFFINITY=granularity=fine,compact,1,0
+
+# Intel OpenMP
+export OMP_NUM_THREADS=< Cores to use >
+export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libiomp5.so
+# Jemalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support.
+export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libjemalloc.so
+export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:9000000000"
+
+# Launch with default DDIM
+numactl --membind <node N> -C <cpu list> python python inference_bf16.py
+# Launch with DPMSolverMultistepScheduler
+numactl --membind <node N> -C <cpu list> python python inference_bf16.py --dpm
+```
+
+## Accelerating the inference for Stable Diffusion using INT8
+
+Coming soon ...
diff --git a/diffusers/examples/research_projects/intel_opts/inference_bf16.py b/diffusers/examples/research_projects/intel_opts/inference_bf16.py
new file mode 100644
index 0000000000000000000000000000000000000000..96ec709f433cd13dad0b93d5368d61e169b9df28
--- /dev/null
+++ b/diffusers/examples/research_projects/intel_opts/inference_bf16.py
@@ -0,0 +1,56 @@
+import argparse
+
+import intel_extension_for_pytorch as ipex
+import torch
+
+from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
+
+
+parser = argparse.ArgumentParser("Stable Diffusion script with intel optimization", add_help=False)
+parser.add_argument("--dpm", action="store_true", help="Enable DPMSolver or not")
+parser.add_argument("--steps", default=None, type=int, help="Num inference steps")
+args = parser.parse_args()
+
+
+device = "cpu"
+prompt = "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brighly buildings"
+
+model_id = "path-to-your-trained-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id)
+if args.dpm:
+    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to(device)
+
+# to channels last
+pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
+pipe.vae = pipe.vae.to(memory_format=torch.channels_last)
+pipe.text_encoder = pipe.text_encoder.to(memory_format=torch.channels_last)
+if pipe.requires_safety_checker:
+    pipe.safety_checker = pipe.safety_checker.to(memory_format=torch.channels_last)
+
+# optimize with ipex
+sample = torch.randn(2, 4, 64, 64)
+timestep = torch.rand(1) * 999
+encoder_hidden_status = torch.randn(2, 77, 768)
+input_example = (sample, timestep, encoder_hidden_status)
+try:
+    pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)
+except Exception:
+    pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True)
+pipe.vae = ipex.optimize(pipe.vae.eval(), dtype=torch.bfloat16, inplace=True)
+pipe.text_encoder = ipex.optimize(pipe.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
+if pipe.requires_safety_checker:
+    pipe.safety_checker = ipex.optimize(pipe.safety_checker.eval(), dtype=torch.bfloat16, inplace=True)
+
+# compute
+seed = 666
+generator = torch.Generator(device).manual_seed(seed)
+generate_kwargs = {"generator": generator}
+if args.steps is not None:
+    generate_kwargs["num_inference_steps"] = args.steps
+
+with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+    image = pipe(prompt, **generate_kwargs).images[0]
+
+# save image
+image.save("generated.png")
diff --git a/diffusers/examples/research_projects/intel_opts/textual_inversion/README.md b/diffusers/examples/research_projects/intel_opts/textual_inversion/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3339b8e2cb63674481baf23de1bbe26772120ba7
--- /dev/null
+++ b/diffusers/examples/research_projects/intel_opts/textual_inversion/README.md
@@ -0,0 +1,68 @@
+## Textual Inversion fine-tuning example
+
+[Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images using just 3-5 examples.
+The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+## Training with Intel Extension for PyTorch
+
+Intel Extension for PyTorch provides the optimizations for faster training and inference on CPUs. You can leverage the training example "textual_inversion.py". Follow the [instructions](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) to get the model and [dataset](https://huggingface.co/sd-concepts-library/dicoo2) before running the script.
+
+The example supports both single node and multi-node distributed training:
+
+### Single node training
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATA_DIR="path-to-dir-containing-dicoo-images"
+
+python textual_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<dicoo>" --initializer_token="toy" \
+  --seed=7 \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --max_train_steps=3000 \
+  --learning_rate=2.5e-03 --scale_lr \
+  --output_dir="textual_inversion_dicoo"
+```
+
+Note: Bfloat16 is available on Intel Xeon Scalable Processors Cooper Lake or Sapphire Rapids. You may not get performance speedup without Bfloat16 support.
+
+### Multi-node distributed training
+
+Before running the scripts, make sure to install the library's training dependencies successfully:
+
+```bash
+python -m pip install oneccl_bind_pt==1.13 -f https://developer.intel.com/ipex-whl-stable-cpu
+```
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATA_DIR="path-to-dir-containing-dicoo-images"
+
+oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)")
+source $oneccl_bindings_for_pytorch_path/env/setvars.sh
+
+python -m intel_extension_for_pytorch.cpu.launch --distributed \
+  --hostfile hostfile --nnodes 2 --nproc_per_node 2 textual_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<dicoo>" --initializer_token="toy" \
+  --seed=7 \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --max_train_steps=750 \
+  --learning_rate=2.5e-03 --scale_lr \
+  --output_dir="textual_inversion_dicoo"
+```
+The above is a simple distributed training usage on 2 nodes with 2 processes on each node. Add the right hostname or ip address in the "hostfile" and make sure these 2 nodes are reachable from each other. For more details, please refer to the [user guide](https://github.com/intel/torch-ccl).
+
+
+### Reference
+
+We publish a [Medium blog](https://medium.com/intel-analytics-software/personalized-stable-diffusion-with-few-shot-fine-tuning-on-a-single-cpu-f01a3316b13) on how to create your own Stable Diffusion model on CPUs using textual inversion. Try it out now, if you have interests.
diff --git a/diffusers/examples/research_projects/intel_opts/textual_inversion/requirements.txt b/diffusers/examples/research_projects/intel_opts/textual_inversion/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..af7ed6b21f6fb4518930a37786199643b1c60ece
--- /dev/null
+++ b/diffusers/examples/research_projects/intel_opts/textual_inversion/requirements.txt
@@ -0,0 +1,7 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.21.0
+ftfy
+tensorboard
+Jinja2
+intel_extension_for_pytorch>=1.13
diff --git a/diffusers/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py b/diffusers/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea4a0d255b680ffce9c143cacfbcb711e114fdd7
--- /dev/null
+++ b/diffusers/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py
@@ -0,0 +1,646 @@
+import argparse
+import itertools
+import math
+import os
+import random
+from pathlib import Path
+
+import intel_extension_for_pytorch as ipex
+import numpy as np
+import PIL
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from diffusers.utils import check_min_version
+
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.13.0.dev0")
+
+
+logger = get_logger(__name__)
+
+
+def save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path):
+    logger.info("Saving embeddings")
+    learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[placeholder_token_id]
+    learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()}
+    torch.save(learned_embeds_dict, save_path)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save learned_embeds.bin every X updates steps.",
+    )
+    parser.add_argument(
+        "--only_save_embeds",
+        action="store_true",
+        default=False,
+        help="Save only the embeddings for the new concept.",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution."
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=True,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+class TextualInversionDataset(Dataset):
+    def __init__(
+        self,
+        data_root,
+        tokenizer,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer = tokenizer
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        self.interpolation = {
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["input_ids"] = self.tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            (
+                h,
+                w,
+            ) = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
+
+        image = Image.fromarray(img)
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def freeze_params(params):
+    for param in params:
+        param.requires_grad = False
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer and add the placeholder token as a additional special token
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Add the placeholder token in tokenizer
+    num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
+    if num_added_tokens == 0:
+        raise ValueError(
+            f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+            " `placeholder_token` that is not already in the tokenizer."
+        )
+
+    # Convert the initializer_token, placeholder_token to ids
+    token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
+    # Check if initializer_token is a single token or a sequence of tokens
+    if len(token_ids) > 1:
+        raise ValueError("The initializer token must be a single token.")
+
+    initializer_token_id = token_ids[0]
+    placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="unet",
+        revision=args.revision,
+    )
+
+    # Resize the token embeddings as we are adding new special tokens to the tokenizer
+    text_encoder.resize_token_embeddings(len(tokenizer))
+
+    # Initialise the newly added placeholder token with the embeddings of the initializer token
+    token_embeds = text_encoder.get_input_embeddings().weight.data
+    token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
+
+    # Freeze vae and unet
+    freeze_params(vae.parameters())
+    freeze_params(unet.parameters())
+    # Freeze all parameters except for the token embeddings in text encoder
+    params_to_freeze = itertools.chain(
+        text_encoder.text_model.encoder.parameters(),
+        text_encoder.text_model.final_layer_norm.parameters(),
+        text_encoder.text_model.embeddings.position_embedding.parameters(),
+    )
+    freeze_params(params_to_freeze)
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    optimizer = torch.optim.AdamW(
+        text_encoder.get_input_embeddings().parameters(),  # only optimize the embeddings
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        placeholder_token=args.placeholder_token,
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        text_encoder, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # Move vae and unet to device
+    vae.to(accelerator.device)
+    unet.to(accelerator.device)
+
+    # Keep vae and unet in eval model as we don't train these
+    vae.eval()
+    unet.eval()
+
+    unet = ipex.optimize(unet, dtype=torch.bfloat16, inplace=True)
+    vae = ipex.optimize(vae, dtype=torch.bfloat16, inplace=True)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("textual_inversion", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+    global_step = 0
+
+    text_encoder.train()
+    text_encoder, optimizer = ipex.optimize(text_encoder, optimizer=optimizer, dtype=torch.bfloat16)
+
+    for epoch in range(args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+                with accelerator.accumulate(text_encoder):
+                    # Convert images to latent space
+                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach()
+                    latents = latents * vae.config.scaling_factor
+
+                    # Sample noise that we'll add to the latents
+                    noise = torch.randn(latents.shape).to(latents.device)
+                    bsz = latents.shape[0]
+                    # Sample a random timestep for each image
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
+                    ).long()
+
+                    # Add noise to the latents according to the noise magnitude at each timestep
+                    # (this is the forward diffusion process)
+                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                    # Get the text embedding for conditioning
+                    encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                    # Predict the noise residual
+                    model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                    # Get the target for loss depending on the prediction type
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        target = noise
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                    else:
+                        raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                    loss = F.mse_loss(model_pred, target, reduction="none").mean([1, 2, 3]).mean()
+                    accelerator.backward(loss)
+
+                    # Zero out the gradients for all token embeddings except the newly added
+                    # embeddings for the concept, as we only want to optimize the concept embeddings
+                    if accelerator.num_processes > 1:
+                        grads = text_encoder.module.get_input_embeddings().weight.grad
+                    else:
+                        grads = text_encoder.get_input_embeddings().weight.grad
+                    # Get the index for tokens that we want to zero the grads for
+                    index_grads_to_zero = torch.arange(len(tokenizer)) != placeholder_token_id
+                    grads.data[index_grads_to_zero, :] = grads.data[index_grads_to_zero, :].fill_(0)
+
+                    optimizer.step()
+                    lr_scheduler.step()
+                    optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                if global_step % args.save_steps == 0:
+                    save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.bin")
+                    save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        accelerator.wait_for_everyone()
+
+    # Create the pipeline using using the trained modules and save it.
+    if accelerator.is_main_process:
+        if args.push_to_hub and args.only_save_embeds:
+            logger.warning("Enabling full model saving because --push_to_hub=True was specified.")
+            save_full_model = True
+        else:
+            save_full_model = not args.only_save_embeds
+        if save_full_model:
+            pipeline = StableDiffusionPipeline(
+                text_encoder=accelerator.unwrap_model(text_encoder),
+                vae=vae,
+                unet=unet,
+                tokenizer=tokenizer,
+                scheduler=PNDMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler"),
+                safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
+                feature_extractor=CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32"),
+            )
+            pipeline.save_pretrained(args.output_dir)
+        # Save the newly trained embeddings
+        save_path = os.path.join(args.output_dir, "learned_embeds.bin")
+        save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/README.md b/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4a227cdb4d63585cc0f0ab76424be8a0b2c5b604
--- /dev/null
+++ b/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/README.md
@@ -0,0 +1,93 @@
+# Distillation for quantization on Textual Inversion models to personalize text2image
+
+[Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images._By using just 3-5 images new concepts can be taught to Stable Diffusion and the model personalized on your own images_
+The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+We have enabled distillation for quantization in `textual_inversion.py` to do quantization aware training as well as distillation on the model generated by Textual Inversion method.
+
+## Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Prepare Datasets
+
+One picture which is from the huggingface datasets [sd-concepts-library/dicoo2](https://huggingface.co/sd-concepts-library/dicoo2) is needed, and save it to the `./dicoo` directory. The picture is shown below:
+
+<a href="https://huggingface.co/sd-concepts-library/dicoo2/blob/main/concept_images/1.jpeg">
+    <img src="https://huggingface.co/sd-concepts-library/dicoo2/resolve/main/concept_images/1.jpeg" width = "300" height="300">
+</a>
+
+## Get a FP32 Textual Inversion model
+
+Use the following command to fine-tune the Stable Diffusion model on the above dataset to obtain the FP32 Textual Inversion model.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATA_DIR="./dicoo"
+
+accelerate launch textual_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<dicoo>" --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 --scale_lr \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --output_dir="dicoo_model"
+```
+
+## Do distillation for quantization
+
+Distillation for quantization is a method that combines [intermediate layer knowledge distillation](https://github.com/intel/neural-compressor/blob/master/docs/source/distillation.md#intermediate-layer-knowledge-distillation) and [quantization aware training](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization.md#quantization-aware-training) in the same training process to improve the performance of the quantized model. Provided a FP32 model, the distillation for quantization approach will take this model itself as the teacher model and transfer the knowledges of the specified layers to the student model, i.e. quantized version of the FP32 model, during the quantization aware training process.
+
+Once you have the FP32 Textual Inversion model, the following command will take the FP32 Textual Inversion model as input to do distillation for quantization and generate the INT8 Textual Inversion model.
+
+```bash
+export FP32_MODEL_NAME="./dicoo_model"
+export DATA_DIR="./dicoo"
+
+accelerate launch textual_inversion.py \
+  --pretrained_model_name_or_path=$FP32_MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --use_ema --learnable_property="object" \
+  --placeholder_token="<dicoo>" --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --max_train_steps=300 \
+  --learning_rate=5.0e-04 --max_grad_norm=3 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --output_dir="int8_model" \
+  --do_quantization --do_distillation --verify_loading
+```
+
+After the distillation for quantization process, the quantized UNet would be 4 times smaller (3279MB -> 827MB).
+
+## Inference
+
+Once you have trained a INT8 model with the above command, the inference can be done simply using the `text2images.py` script. Make sure to include the `placeholder_token` in your prompt.
+
+```bash
+export INT8_MODEL_NAME="./int8_model"
+
+python text2images.py \
+  --pretrained_model_name_or_path=$INT8_MODEL_NAME \
+  --caption "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brighly buildings." \
+  --images_num 4
+```
+
+Here is the comparison of images generated by the FP32 model (left) and INT8 model (right) respectively:
+
+<p float="left">
+  <img src="https://huggingface.co/datasets/Intel/textual_inversion_dicoo_dfq/resolve/main/FP32.png" width = "300" height = "300" alt="FP32" align=center />
+  <img src="https://huggingface.co/datasets/Intel/textual_inversion_dicoo_dfq/resolve/main/INT8.png" width = "300" height = "300" alt="INT8" align=center />
+</p>
+
diff --git a/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt b/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cbd4c957be441a1aaf9a52e7ff02d772cb9d302b
--- /dev/null
+++ b/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt
@@ -0,0 +1,7 @@
+accelerate
+torchvision
+transformers>=4.25.0
+ftfy
+tensorboard
+modelcards
+neural-compressor
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py b/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py
new file mode 100644
index 0000000000000000000000000000000000000000..a99d727712eb44b875576443837c81a442c72a6f
--- /dev/null
+++ b/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py
@@ -0,0 +1,112 @@
+import argparse
+import math
+import os
+
+import torch
+from neural_compressor.utils.pytorch import load
+from PIL import Image
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, StableDiffusionPipeline, UNet2DConditionModel
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m",
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "-c",
+        "--caption",
+        type=str,
+        default="robotic cat with wings",
+        help="Text used to generate images.",
+    )
+    parser.add_argument(
+        "-n",
+        "--images_num",
+        type=int,
+        default=4,
+        help="How much images to generate.",
+    )
+    parser.add_argument(
+        "-s",
+        "--seed",
+        type=int,
+        default=42,
+        help="Seed for random process.",
+    )
+    parser.add_argument(
+        "-ci",
+        "--cuda_id",
+        type=int,
+        default=0,
+        help="cuda_id.",
+    )
+    args = parser.parse_args()
+    return args
+
+
+def image_grid(imgs, rows, cols):
+    if not len(imgs) == rows * cols:
+        raise ValueError("The specified number of rows and columns are not correct.")
+
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+    grid_w, grid_h = grid.size
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+
+
+def generate_images(
+    pipeline,
+    prompt="robotic cat with wings",
+    guidance_scale=7.5,
+    num_inference_steps=50,
+    num_images_per_prompt=1,
+    seed=42,
+):
+    generator = torch.Generator(pipeline.device).manual_seed(seed)
+    images = pipeline(
+        prompt,
+        guidance_scale=guidance_scale,
+        num_inference_steps=num_inference_steps,
+        generator=generator,
+        num_images_per_prompt=num_images_per_prompt,
+    ).images
+    _rows = int(math.sqrt(num_images_per_prompt))
+    grid = image_grid(images, rows=_rows, cols=num_images_per_prompt // _rows)
+    return grid, images
+
+
+args = parse_args()
+# Load models and create wrapper for stable diffusion
+tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
+vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+
+pipeline = StableDiffusionPipeline.from_pretrained(
+    args.pretrained_model_name_or_path, text_encoder=text_encoder, vae=vae, unet=unet, tokenizer=tokenizer
+)
+pipeline.safety_checker = lambda images, clip_input: (images, False)
+if os.path.exists(os.path.join(args.pretrained_model_name_or_path, "best_model.pt")):
+    unet = load(args.pretrained_model_name_or_path, model=unet)
+    unet.eval()
+    setattr(pipeline, "unet", unet)
+else:
+    unet = unet.to(torch.device("cuda", args.cuda_id))
+pipeline = pipeline.to(unet.device)
+grid, images = generate_images(pipeline, prompt=args.caption, num_images_per_prompt=args.images_num, seed=args.seed)
+grid.save(os.path.join(args.pretrained_model_name_or_path, "{}.png".format("_".join(args.caption.split()))))
+dirname = os.path.join(args.pretrained_model_name_or_path, "_".join(args.caption.split()))
+os.makedirs(dirname, exist_ok=True)
+for idx, image in enumerate(images):
+    image.save(os.path.join(dirname, "{}.png".format(idx + 1)))
diff --git a/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py b/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..43667187596ef9b038d2ad03cbbdb02c0a6e40cf
--- /dev/null
+++ b/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py
@@ -0,0 +1,996 @@
+import argparse
+import itertools
+import math
+import os
+import random
+from pathlib import Path
+from typing import Iterable
+
+import numpy as np
+import PIL
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from accelerate import Accelerator
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from neural_compressor.utils import logger
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.utils import make_image_grid
+
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
+
+def save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path):
+    logger.info("Saving embeddings")
+    learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[placeholder_token_id]
+    learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()}
+    torch.save(learned_embeds_dict, save_path)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Example of distillation for quantization on Textual Inversion.")
+    parser.add_argument(
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save learned_embeds.bin every X updates steps.",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--do_quantization", action="store_true", help="Whether or not to do quantization.")
+    parser.add_argument("--do_distillation", action="store_true", help="Whether or not to do distillation.")
+    parser.add_argument(
+        "--verify_loading", action="store_true", help="Whether or not to verify the loading of the quantized model."
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14
+class EMAModel:
+    """
+    Exponential Moving Average of models weights
+    """
+
+    def __init__(self, parameters: Iterable[torch.nn.Parameter], decay=0.9999):
+        parameters = list(parameters)
+        self.shadow_params = [p.clone().detach() for p in parameters]
+
+        self.decay = decay
+        self.optimization_step = 0
+
+    def get_decay(self, optimization_step):
+        """
+        Compute the decay factor for the exponential moving average.
+        """
+        value = (1 + optimization_step) / (10 + optimization_step)
+        return 1 - min(self.decay, value)
+
+    @torch.no_grad()
+    def step(self, parameters):
+        parameters = list(parameters)
+
+        self.optimization_step += 1
+        self.decay = self.get_decay(self.optimization_step)
+
+        for s_param, param in zip(self.shadow_params, parameters):
+            if param.requires_grad:
+                tmp = self.decay * (s_param - param)
+                s_param.sub_(tmp)
+            else:
+                s_param.copy_(param)
+
+        torch.cuda.empty_cache()
+
+    def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None:
+        """
+        Copy current averaged parameters into given collection of parameters.
+        Args:
+            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+                updated with the stored moving averages. If `None`, the
+                parameters with which this `ExponentialMovingAverage` was
+                initialized will be used.
+        """
+        parameters = list(parameters)
+        for s_param, param in zip(self.shadow_params, parameters):
+            param.data.copy_(s_param.data)
+
+    def to(self, device=None, dtype=None) -> None:
+        r"""Move internal buffers of the ExponentialMovingAverage to `device`.
+        Args:
+            device: like `device` argument to `torch.Tensor.to`
+        """
+        # .to() on the tensors handles None correctly
+        self.shadow_params = [
+            p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device)
+            for p in self.shadow_params
+        ]
+
+
+class TextualInversionDataset(Dataset):
+    def __init__(
+        self,
+        data_root,
+        tokenizer,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer = tokenizer
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        self.interpolation = {
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["input_ids"] = self.tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            (
+                h,
+                w,
+            ) = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
+
+        image = Image.fromarray(img)
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def freeze_params(params):
+    for param in params:
+        param.requires_grad = False
+
+
+def generate_images(pipeline, prompt="", guidance_scale=7.5, num_inference_steps=50, num_images_per_prompt=1, seed=42):
+    generator = torch.Generator(pipeline.device).manual_seed(seed)
+    images = pipeline(
+        prompt,
+        guidance_scale=guidance_scale,
+        num_inference_steps=num_inference_steps,
+        generator=generator,
+        num_images_per_prompt=num_images_per_prompt,
+    ).images
+    _rows = int(math.sqrt(num_images_per_prompt))
+    grid = make_image_grid(images, rows=_rows, cols=num_images_per_prompt // _rows)
+    return grid
+
+
+def main():
+    args = parse_args()
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with="tensorboard",
+        project_config=accelerator_project_config,
+    )
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer and add the placeholder token as a additional special token
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Load models and create wrapper for stable diffusion
+    noise_scheduler = DDPMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="unet",
+        revision=args.revision,
+    )
+
+    train_unet = False
+    # Freeze vae and unet
+    freeze_params(vae.parameters())
+    if not args.do_quantization and not args.do_distillation:
+        # Add the placeholder token in tokenizer
+        num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
+        if num_added_tokens == 0:
+            raise ValueError(
+                f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+                " `placeholder_token` that is not already in the tokenizer."
+            )
+
+        # Convert the initializer_token, placeholder_token to ids
+        token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
+        # Check if initializer_token is a single token or a sequence of tokens
+        if len(token_ids) > 1:
+            raise ValueError("The initializer token must be a single token.")
+
+        initializer_token_id = token_ids[0]
+        placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
+        # Resize the token embeddings as we are adding new special tokens to the tokenizer
+        text_encoder.resize_token_embeddings(len(tokenizer))
+
+        # Initialise the newly added placeholder token with the embeddings of the initializer token
+        token_embeds = text_encoder.get_input_embeddings().weight.data
+        token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
+
+        freeze_params(unet.parameters())
+        # Freeze all parameters except for the token embeddings in text encoder
+        params_to_freeze = itertools.chain(
+            text_encoder.text_model.encoder.parameters(),
+            text_encoder.text_model.final_layer_norm.parameters(),
+            text_encoder.text_model.embeddings.position_embedding.parameters(),
+        )
+        freeze_params(params_to_freeze)
+    else:
+        train_unet = True
+        freeze_params(text_encoder.parameters())
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    optimizer = torch.optim.AdamW(
+        # only optimize the unet or embeddings of text_encoder
+        unet.parameters() if train_unet else text_encoder.get_input_embeddings().parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        placeholder_token=args.placeholder_token,
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    if not train_unet:
+        text_encoder = accelerator.prepare(text_encoder)
+        unet.to(accelerator.device)
+        unet.eval()
+    else:
+        unet = accelerator.prepare(unet)
+        text_encoder.to(accelerator.device)
+        text_encoder.eval()
+    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
+
+    # Move vae to device
+    vae.to(accelerator.device)
+
+    # Keep vae in eval model as we don't train these
+    vae.eval()
+
+    compression_manager = None
+
+    def train_func(model):
+        if train_unet:
+            unet_ = model
+            text_encoder_ = text_encoder
+        else:
+            unet_ = unet
+            text_encoder_ = model
+        # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+        num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+        if overrode_max_train_steps:
+            args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        # Afterwards we recalculate our number of training epochs
+        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+        # We need to initialize the trackers we use, and also store our configuration.
+        # The trackers initializes automatically on the main process.
+        if accelerator.is_main_process:
+            accelerator.init_trackers("textual_inversion", config=vars(args))
+
+        # Train!
+        total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+        logger.info("***** Running training *****")
+        logger.info(f"  Num examples = {len(train_dataset)}")
+        logger.info(f"  Num Epochs = {args.num_train_epochs}")
+        logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+        logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+        logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+        logger.info(f"  Total optimization steps = {args.max_train_steps}")
+        # Only show the progress bar once on each machine.
+        progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+        progress_bar.set_description("Steps")
+        global_step = 0
+
+        if train_unet and args.use_ema:
+            ema_unet = EMAModel(unet_.parameters())
+
+        for epoch in range(args.num_train_epochs):
+            model.train()
+            train_loss = 0.0
+            for step, batch in enumerate(train_dataloader):
+                with accelerator.accumulate(model):
+                    # Convert images to latent space
+                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach()
+                    latents = latents * 0.18215
+
+                    # Sample noise that we'll add to the latents
+                    noise = torch.randn(latents.shape).to(latents.device)
+                    bsz = latents.shape[0]
+                    # Sample a random timestep for each image
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
+                    ).long()
+
+                    # Add noise to the latents according to the noise magnitude at each timestep
+                    # (this is the forward diffusion process)
+                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                    # Get the text embedding for conditioning
+                    encoder_hidden_states = text_encoder_(batch["input_ids"])[0]
+
+                    # Predict the noise residual
+                    model_pred = unet_(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                    loss = F.mse_loss(model_pred, noise, reduction="none").mean([1, 2, 3]).mean()
+                    if train_unet and compression_manager:
+                        unet_inputs = {
+                            "sample": noisy_latents,
+                            "timestep": timesteps,
+                            "encoder_hidden_states": encoder_hidden_states,
+                        }
+                        loss = compression_manager.callbacks.on_after_compute_loss(unet_inputs, model_pred, loss)
+
+                    # Gather the losses across all processes for logging (if we use distributed training).
+                    avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                    train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                    # Backpropagate
+                    accelerator.backward(loss)
+
+                    if train_unet:
+                        if accelerator.sync_gradients:
+                            accelerator.clip_grad_norm_(unet_.parameters(), args.max_grad_norm)
+                    else:
+                        # Zero out the gradients for all token embeddings except the newly added
+                        # embeddings for the concept, as we only want to optimize the concept embeddings
+                        if accelerator.num_processes > 1:
+                            grads = text_encoder_.module.get_input_embeddings().weight.grad
+                        else:
+                            grads = text_encoder_.get_input_embeddings().weight.grad
+                        # Get the index for tokens that we want to zero the grads for
+                        index_grads_to_zero = torch.arange(len(tokenizer)) != placeholder_token_id
+                        grads.data[index_grads_to_zero, :] = grads.data[index_grads_to_zero, :].fill_(0)
+
+                    optimizer.step()
+                    lr_scheduler.step()
+                    optimizer.zero_grad()
+
+                # Checks if the accelerator has performed an optimization step behind the scenes
+                if accelerator.sync_gradients:
+                    if train_unet and args.use_ema:
+                        ema_unet.step(unet_.parameters())
+                    progress_bar.update(1)
+                    global_step += 1
+                    accelerator.log({"train_loss": train_loss}, step=global_step)
+                    train_loss = 0.0
+                    if not train_unet and global_step % args.save_steps == 0:
+                        save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.bin")
+                        save_progress(text_encoder_, placeholder_token_id, accelerator, args, save_path)
+
+                logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+                progress_bar.set_postfix(**logs)
+                accelerator.log(logs, step=global_step)
+
+                if global_step >= args.max_train_steps:
+                    break
+            accelerator.wait_for_everyone()
+
+        if train_unet and args.use_ema:
+            ema_unet.copy_to(unet_.parameters())
+
+        if not train_unet:
+            return text_encoder_
+
+    if not train_unet:
+        text_encoder = train_func(text_encoder)
+    else:
+        import copy
+
+        model = copy.deepcopy(unet)
+        confs = []
+        if args.do_quantization:
+            from neural_compressor import QuantizationAwareTrainingConfig
+
+            q_conf = QuantizationAwareTrainingConfig()
+            confs.append(q_conf)
+
+        if args.do_distillation:
+            teacher_model = copy.deepcopy(model)
+
+            def attention_fetcher(x):
+                return x.sample
+
+            layer_mappings = [
+                [
+                    [
+                        "conv_in",
+                    ]
+                ],
+                [
+                    [
+                        "time_embedding",
+                    ]
+                ],
+                [["down_blocks.0.attentions.0", attention_fetcher]],
+                [["down_blocks.0.attentions.1", attention_fetcher]],
+                [
+                    [
+                        "down_blocks.0.resnets.0",
+                    ]
+                ],
+                [
+                    [
+                        "down_blocks.0.resnets.1",
+                    ]
+                ],
+                [
+                    [
+                        "down_blocks.0.downsamplers.0",
+                    ]
+                ],
+                [["down_blocks.1.attentions.0", attention_fetcher]],
+                [["down_blocks.1.attentions.1", attention_fetcher]],
+                [
+                    [
+                        "down_blocks.1.resnets.0",
+                    ]
+                ],
+                [
+                    [
+                        "down_blocks.1.resnets.1",
+                    ]
+                ],
+                [
+                    [
+                        "down_blocks.1.downsamplers.0",
+                    ]
+                ],
+                [["down_blocks.2.attentions.0", attention_fetcher]],
+                [["down_blocks.2.attentions.1", attention_fetcher]],
+                [
+                    [
+                        "down_blocks.2.resnets.0",
+                    ]
+                ],
+                [
+                    [
+                        "down_blocks.2.resnets.1",
+                    ]
+                ],
+                [
+                    [
+                        "down_blocks.2.downsamplers.0",
+                    ]
+                ],
+                [
+                    [
+                        "down_blocks.3.resnets.0",
+                    ]
+                ],
+                [
+                    [
+                        "down_blocks.3.resnets.1",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.0.resnets.0",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.0.resnets.1",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.0.resnets.2",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.0.upsamplers.0",
+                    ]
+                ],
+                [["up_blocks.1.attentions.0", attention_fetcher]],
+                [["up_blocks.1.attentions.1", attention_fetcher]],
+                [["up_blocks.1.attentions.2", attention_fetcher]],
+                [
+                    [
+                        "up_blocks.1.resnets.0",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.1.resnets.1",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.1.resnets.2",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.1.upsamplers.0",
+                    ]
+                ],
+                [["up_blocks.2.attentions.0", attention_fetcher]],
+                [["up_blocks.2.attentions.1", attention_fetcher]],
+                [["up_blocks.2.attentions.2", attention_fetcher]],
+                [
+                    [
+                        "up_blocks.2.resnets.0",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.2.resnets.1",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.2.resnets.2",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.2.upsamplers.0",
+                    ]
+                ],
+                [["up_blocks.3.attentions.0", attention_fetcher]],
+                [["up_blocks.3.attentions.1", attention_fetcher]],
+                [["up_blocks.3.attentions.2", attention_fetcher]],
+                [
+                    [
+                        "up_blocks.3.resnets.0",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.3.resnets.1",
+                    ]
+                ],
+                [
+                    [
+                        "up_blocks.3.resnets.2",
+                    ]
+                ],
+                [["mid_block.attentions.0", attention_fetcher]],
+                [
+                    [
+                        "mid_block.resnets.0",
+                    ]
+                ],
+                [
+                    [
+                        "mid_block.resnets.1",
+                    ]
+                ],
+                [
+                    [
+                        "conv_out",
+                    ]
+                ],
+            ]
+            layer_names = [layer_mapping[0][0] for layer_mapping in layer_mappings]
+            if not set(layer_names).issubset([n[0] for n in model.named_modules()]):
+                raise ValueError(
+                    "Provided model is not compatible with the default layer_mappings, "
+                    'please use the model fine-tuned from "CompVis/stable-diffusion-v1-4", '
+                    "or modify the layer_mappings variable to fit your model."
+                    f"\nDefault layer_mappings are as such:\n{layer_mappings}"
+                )
+            from neural_compressor.config import DistillationConfig, IntermediateLayersKnowledgeDistillationLossConfig
+
+            distillation_criterion = IntermediateLayersKnowledgeDistillationLossConfig(
+                layer_mappings=layer_mappings,
+                loss_types=["MSE"] * len(layer_mappings),
+                loss_weights=[1.0 / len(layer_mappings)] * len(layer_mappings),
+                add_origin_loss=True,
+            )
+            d_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion)
+            confs.append(d_conf)
+
+        from neural_compressor.training import prepare_compression
+
+        compression_manager = prepare_compression(model, confs)
+        compression_manager.callbacks.on_train_begin()
+        model = compression_manager.model
+        train_func(model)
+        compression_manager.callbacks.on_train_end()
+
+        # Save the resulting model and its corresponding configuration in the given directory
+        model.save(args.output_dir)
+
+        logger.info(f"Optimized model saved to: {args.output_dir}.")
+
+        # change to framework model for further use
+        model = model.model
+
+    # Create the pipeline using using the trained modules and save it.
+    templates = imagenet_style_templates_small if args.learnable_property == "style" else imagenet_templates_small
+    prompt = templates[0].format(args.placeholder_token)
+    if accelerator.is_main_process:
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            text_encoder=accelerator.unwrap_model(text_encoder),
+            vae=vae,
+            unet=accelerator.unwrap_model(unet),
+            tokenizer=tokenizer,
+        )
+        pipeline.save_pretrained(args.output_dir)
+        pipeline = pipeline.to(unet.device)
+        baseline_model_images = generate_images(pipeline, prompt=prompt, seed=args.seed)
+        baseline_model_images.save(
+            os.path.join(args.output_dir, "{}_baseline_model.png".format("_".join(prompt.split())))
+        )
+
+        if not train_unet:
+            # Also save the newly trained embeddings
+            save_path = os.path.join(args.output_dir, "learned_embeds.bin")
+            save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)
+        else:
+            setattr(pipeline, "unet", accelerator.unwrap_model(model))
+            if args.do_quantization:
+                pipeline = pipeline.to(torch.device("cpu"))
+
+            optimized_model_images = generate_images(pipeline, prompt=prompt, seed=args.seed)
+            optimized_model_images.save(
+                os.path.join(args.output_dir, "{}_optimized_model.png".format("_".join(prompt.split())))
+            )
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+    if args.do_quantization and args.verify_loading:
+        # Load the model obtained after Intel Neural Compressor quantization
+        from neural_compressor.utils.pytorch import load
+
+        loaded_model = load(args.output_dir, model=unet)
+        loaded_model.eval()
+
+        setattr(pipeline, "unet", loaded_model)
+        if args.do_quantization:
+            pipeline = pipeline.to(torch.device("cpu"))
+
+        loaded_model_images = generate_images(pipeline, prompt=prompt, seed=args.seed)
+        if loaded_model_images != optimized_model_images:
+            logger.info("The quantized model was not successfully loaded.")
+        else:
+            logger.info("The quantized model was successfully loaded.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/lora/README.md b/diffusers/examples/research_projects/lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..643f664ce1eba50a8871c9b9ba75f914acdd5f5f
--- /dev/null
+++ b/diffusers/examples/research_projects/lora/README.md
@@ -0,0 +1,83 @@
+# Stable Diffusion text-to-image fine-tuning
+This extended LoRA training script was authored by [haofanwang](https://github.com/haofanwang).
+This is an experimental LoRA extension of [this example](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py). We further support add LoRA layers for text encoder.
+
+## Training with LoRA
+
+Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
+
+In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
+
+- Previous pretrained weights are kept frozen so that model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114).
+- Rank-decomposition matrices have significantly fewer parameters than original model, which means that trained LoRA weights are easily portable.
+- LoRA attention layers allow to control to which extent the model is adapted toward new training images via a `scale` parameter.
+
+[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
+
+With LoRA, it's possible to fine-tune Stable Diffusion on a custom image-caption pair dataset
+on consumer GPUs like Tesla T4, Tesla V100.
+
+### Training
+
+First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Narutos dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+**___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [Weights and Biases](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training to automatically log images.___**
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+```
+
+For this example we want to directly store the trained LoRA embeddings on the Hub, so
+we need to be logged in and add the `--push_to_hub` flag.
+
+```bash
+huggingface-cli login
+```
+
+Now we can start training!
+
+```bash
+accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$DATASET_NAME --caption_column="text" \
+  --resolution=512 --random_flip \
+  --train_batch_size=1 \
+  --num_train_epochs=100 --checkpointing_steps=5000 \
+  --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --seed=42 \
+  --output_dir="sd-naruto-model-lora" \
+  --validation_prompt="cute dragon creature" --report_to="wandb"
+  --use_peft \
+  --lora_r=4 --lora_alpha=32 \
+  --lora_text_encoder_r=4 --lora_text_encoder_alpha=32
+```
+
+The above command will also run inference as fine-tuning progresses and log the results to Weights and Biases.
+
+**___Note: When using LoRA we can use a much higher learning rate compared to non-LoRA fine-tuning. Here we use *1e-4* instead of the usual *1e-5*. Also, by using LoRA, it's possible to run `train_text_to_image_lora.py` in consumer GPUs like T4 or V100.___**
+
+The final LoRA embedding weights have been uploaded to [sayakpaul/sd-model-finetuned-lora-t4](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4). **___Note: [The final weights](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/pytorch_lora_weights.bin) are only 3 MB in size, which is orders of magnitudes smaller than the original model.___**
+
+You can check some inference samples that were logged during the course of the fine-tuning process [here](https://wandb.ai/sayakpaul/text2image-fine-tune/runs/q4lc0xsw).
+
+### Inference
+
+Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline` after loading the trained LoRA weights.  You
+need to pass the `output_dir` for loading the LoRA weights which, in this case, is `sd-naruto-model-lora`.
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_path = "sayakpaul/sd-model-finetuned-lora-t4"
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
+pipe.unet.load_attn_procs(model_path)
+pipe.to("cuda")
+
+prompt = "A naruto with green eyes and red legs."
+image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+image.save("naruto.png")
+```
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/lora/requirements.txt b/diffusers/examples/research_projects/lora/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..89a1b73e70728b2395ca5f121f22def70f2076f9
--- /dev/null
+++ b/diffusers/examples/research_projects/lora/requirements.txt
@@ -0,0 +1,8 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+datasets
+ftfy
+tensorboard
+Jinja2
+git+https://github.com/huggingface/peft.git
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/lora/train_text_to_image_lora.py b/diffusers/examples/research_projects/lora/train_text_to_image_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ebc1422b06424077dee45760a61a7727b182b3e
--- /dev/null
+++ b/diffusers/examples/research_projects/lora/train_text_to_image_lora.py
@@ -0,0 +1,1026 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning script for Stable Diffusion for text2image with support for LoRA."""
+
+import argparse
+import itertools
+import json
+import logging
+import math
+import os
+import random
+from pathlib import Path
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
+from diffusers.loaders import AttnProcsLayers
+from diffusers.models.attention_processor import LoRAAttnProcessor
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.14.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None):
+    img_str = ""
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- diffusers
+- diffusers-training
+- lora
+inference: true
+---
+    """
+    model_card = f"""
+# LoRA text2image fine-tuning - {repo_id}
+These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
+{img_str}
+"""
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
+
+    # lora args
+    parser.add_argument("--use_peft", action="store_true", help="Whether to use peft to support lora")
+    parser.add_argument("--lora_r", type=int, default=4, help="Lora rank, only used if use_lora is True")
+    parser.add_argument("--lora_alpha", type=int, default=32, help="Lora alpha, only used if lora is True")
+    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Lora dropout, only used if use_lora is True")
+    parser.add_argument(
+        "--lora_bias",
+        type=str,
+        default="none",
+        help="Bias type for Lora. Can be 'none', 'all' or 'lora_only', only used if use_lora is True",
+    )
+    parser.add_argument(
+        "--lora_text_encoder_r",
+        type=int,
+        default=4,
+        help="Lora rank for text encoder, only used if `use_lora` and `train_text_encoder` are True",
+    )
+    parser.add_argument(
+        "--lora_text_encoder_alpha",
+        type=int,
+        default=32,
+        help="Lora alpha for text encoder, only used if `use_lora` and `train_text_encoder` are True",
+    )
+    parser.add_argument(
+        "--lora_text_encoder_dropout",
+        type=float,
+        default=0.0,
+        help="Lora dropout for text encoder, only used if `use_lora` and `train_text_encoder` are True",
+    )
+    parser.add_argument(
+        "--lora_text_encoder_bias",
+        type=str,
+        default="none",
+        help="Bias type for Lora. Can be 'none', 'all' or 'lora_only', only used if use_lora and `train_text_encoder` are True",
+    )
+
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def main():
+    args = parse_args()
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load scheduler, tokenizer and models.
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+    )
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+    )
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    if args.use_peft:
+        from peft import LoraConfig, LoraModel, get_peft_model_state_dict, set_peft_model_state_dict
+
+        UNET_TARGET_MODULES = ["to_q", "to_v", "query", "value"]
+        TEXT_ENCODER_TARGET_MODULES = ["q_proj", "v_proj"]
+
+        config = LoraConfig(
+            r=args.lora_r,
+            lora_alpha=args.lora_alpha,
+            target_modules=UNET_TARGET_MODULES,
+            lora_dropout=args.lora_dropout,
+            bias=args.lora_bias,
+        )
+        unet = LoraModel(config, unet)
+
+        vae.requires_grad_(False)
+        if args.train_text_encoder:
+            config = LoraConfig(
+                r=args.lora_text_encoder_r,
+                lora_alpha=args.lora_text_encoder_alpha,
+                target_modules=TEXT_ENCODER_TARGET_MODULES,
+                lora_dropout=args.lora_text_encoder_dropout,
+                bias=args.lora_text_encoder_bias,
+            )
+            text_encoder = LoraModel(config, text_encoder)
+    else:
+        # freeze parameters of models to save more memory
+        unet.requires_grad_(False)
+        vae.requires_grad_(False)
+
+        text_encoder.requires_grad_(False)
+
+        # now we will add new LoRA weights to the attention layers
+        # It's important to realize here how many attention weights will be added and of which sizes
+        # The sizes of the attention layers consist only of two different variables:
+        # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`.
+        # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`.
+
+        # Let's first see how many attention processors we will have to set.
+        # For Stable Diffusion, it should be equal to:
+        # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12
+        # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2
+        # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18
+        # => 32 layers
+
+        # Set correct lora layers
+        lora_attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+
+            lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+
+        unet.set_attn_processor(lora_attn_procs)
+        lora_layers = AttnProcsLayers(unet.attn_processors)
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    vae.to(accelerator.device, dtype=weight_dtype)
+    if not args.train_text_encoder:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    if args.use_peft:
+        # Optimizer creation
+        params_to_optimize = (
+            itertools.chain(unet.parameters(), text_encoder.parameters())
+            if args.train_text_encoder
+            else unet.parameters()
+        )
+        optimizer = optimizer_cls(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+    else:
+        optimizer = optimizer_cls(
+            lora_layers.parameters(),
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids
+
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {"pixel_values": pixel_values, "input_ids": input_ids}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.use_peft:
+        if args.train_text_encoder:
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+                unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+            )
+        else:
+            unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+                unet, optimizer, train_dataloader, lr_scheduler
+            )
+    else:
+        lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            lora_layers, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.train_text_encoder:
+            text_encoder.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # Predict the noise residual and compute loss
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    if args.use_peft:
+                        params_to_clip = (
+                            itertools.chain(unet.parameters(), text_encoder.parameters())
+                            if args.train_text_encoder
+                            else unet.parameters()
+                        )
+                    else:
+                        params_to_clip = lora_layers.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                # create pipeline
+                pipeline = DiffusionPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    unet=accelerator.unwrap_model(unet),
+                    text_encoder=accelerator.unwrap_model(text_encoder),
+                    revision=args.revision,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)
+
+                # run inference
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+                images = []
+                for _ in range(args.num_validation_images):
+                    images.append(
+                        pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]
+                    )
+
+                if accelerator.is_main_process:
+                    for tracker in accelerator.trackers:
+                        if tracker.name == "tensorboard":
+                            np_images = np.stack([np.asarray(img) for img in images])
+                            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                        if tracker.name == "wandb":
+                            tracker.log(
+                                {
+                                    "validation": [
+                                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                        for i, image in enumerate(images)
+                                    ]
+                                }
+                            )
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        if args.use_peft:
+            lora_config = {}
+            unwarpped_unet = accelerator.unwrap_model(unet)
+            state_dict = get_peft_model_state_dict(unwarpped_unet, state_dict=accelerator.get_state_dict(unet))
+            lora_config["peft_config"] = unwarpped_unet.get_peft_config_as_dict(inference=True)
+            if args.train_text_encoder:
+                unwarpped_text_encoder = accelerator.unwrap_model(text_encoder)
+                text_encoder_state_dict = get_peft_model_state_dict(
+                    unwarpped_text_encoder, state_dict=accelerator.get_state_dict(text_encoder)
+                )
+                text_encoder_state_dict = {f"text_encoder_{k}": v for k, v in text_encoder_state_dict.items()}
+                state_dict.update(text_encoder_state_dict)
+                lora_config["text_encoder_peft_config"] = unwarpped_text_encoder.get_peft_config_as_dict(
+                    inference=True
+                )
+
+            accelerator.save(state_dict, os.path.join(args.output_dir, f"{global_step}_lora.pt"))
+            with open(os.path.join(args.output_dir, f"{global_step}_lora_config.json"), "w") as f:
+                json.dump(lora_config, f)
+        else:
+            unet = unet.to(torch.float32)
+            unet.save_attn_procs(args.output_dir)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                dataset_name=args.dataset_name,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    # Final inference
+    # Load previous pipeline
+    pipeline = DiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype
+    )
+
+    if args.use_peft:
+
+        def load_and_set_lora_ckpt(pipe, ckpt_dir, global_step, device, dtype):
+            with open(os.path.join(args.output_dir, f"{global_step}_lora_config.json"), "r") as f:
+                lora_config = json.load(f)
+            print(lora_config)
+
+            checkpoint = os.path.join(args.output_dir, f"{global_step}_lora.pt")
+            lora_checkpoint_sd = torch.load(checkpoint)
+            unet_lora_ds = {k: v for k, v in lora_checkpoint_sd.items() if "text_encoder_" not in k}
+            text_encoder_lora_ds = {
+                k.replace("text_encoder_", ""): v for k, v in lora_checkpoint_sd.items() if "text_encoder_" in k
+            }
+
+            unet_config = LoraConfig(**lora_config["peft_config"])
+            pipe.unet = LoraModel(unet_config, pipe.unet)
+            set_peft_model_state_dict(pipe.unet, unet_lora_ds)
+
+            if "text_encoder_peft_config" in lora_config:
+                text_encoder_config = LoraConfig(**lora_config["text_encoder_peft_config"])
+                pipe.text_encoder = LoraModel(text_encoder_config, pipe.text_encoder)
+                set_peft_model_state_dict(pipe.text_encoder, text_encoder_lora_ds)
+
+            if dtype in (torch.float16, torch.bfloat16):
+                pipe.unet.half()
+                pipe.text_encoder.half()
+
+            pipe.to(device)
+            return pipe
+
+        pipeline = load_and_set_lora_ckpt(pipeline, args.output_dir, global_step, accelerator.device, weight_dtype)
+
+    else:
+        pipeline = pipeline.to(accelerator.device)
+        # load attention processors
+        pipeline.unet.load_attn_procs(args.output_dir)
+
+    # run inference
+    if args.seed is not None:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+    else:
+        generator = None
+    images = []
+    for _ in range(args.num_validation_images):
+        images.append(pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0])
+
+    if accelerator.is_main_process:
+        for tracker in accelerator.trackers:
+            if tracker.name == "tensorboard":
+                np_images = np.stack([np.asarray(img) for img in images])
+                tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+            if tracker.name == "wandb":
+                tracker.log(
+                    {
+                        "test": [
+                            wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                            for i, image in enumerate(images)
+                        ]
+                    }
+                )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/multi_subject_dreambooth/README.md b/diffusers/examples/research_projects/multi_subject_dreambooth/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7c2e6f400935613391863b2cc73544baea1c144e
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_subject_dreambooth/README.md
@@ -0,0 +1,338 @@
+# Multi Subject DreamBooth training
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject.
+This `train_multi_subject_dreambooth.py` script shows how to implement the training procedure for one or more subjects and adapt it for stable diffusion. Note that this code is based off of the `examples/dreambooth/train_dreambooth.py` script as of 01/06/2022.
+
+This script was added by @kopsahlong, and is not actively maintained. However, if you come across anything that could use fixing, feel free to open an issue and tag @kopsahlong.
+
+## Running locally with PyTorch
+### Installing the dependencies
+
+Before running the script, make sure to install the library's training dependencies:
+
+To start, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd into the folder `diffusers/examples/research_projects/multi_subject_dreambooth` and run the following:
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+### Multi Subject Training Example
+In order to have your model learn multiple concepts at once, we simply add in the additional data directories and prompts to our `instance_data_dir` and `instance_prompt` (as well as `class_data_dir` and `class_prompt` if `--with_prior_preservation` is specified) as one comma separated string.
+
+See an example with 2 subjects below, which learns a model for one dog subject and one human subject:
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export OUTPUT_DIR="path-to-save-model"
+
+# Subject 1
+export INSTANCE_DIR_1="path-to-instance-images-concept-1"
+export INSTANCE_PROMPT_1="a photo of a sks dog"
+export CLASS_DIR_1="path-to-class-images-dog"
+export CLASS_PROMPT_1="a photo of a dog"
+
+# Subject 2
+export INSTANCE_DIR_2="path-to-instance-images-concept-2"
+export INSTANCE_PROMPT_2="a photo of a t@y person"
+export CLASS_DIR_2="path-to-class-images-person"
+export CLASS_PROMPT_2="a photo of a person"
+
+accelerate launch train_multi_subject_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir="$INSTANCE_DIR_1,$INSTANCE_DIR_2" \
+  --output_dir=$OUTPUT_DIR \
+  --train_text_encoder \
+  --instance_prompt="$INSTANCE_PROMPT_1,$INSTANCE_PROMPT_2" \
+  --with_prior_preservation \
+  --prior_loss_weight=1.0 \
+  --class_data_dir="$CLASS_DIR_1,$CLASS_DIR_2" \
+  --class_prompt="$CLASS_PROMPT_1,$CLASS_PROMPT_2"\
+  --num_class_images=50 \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=1500
+```
+
+This example shows training for 2 subjects, but please note that the model can be trained on any number of new concepts. This can be done by continuing to add in the corresponding directories and prompts to the corresponding comma separated string.
+
+Note also that in this script, `sks` and `t@y` were used as tokens to learn the new subjects ([this thread](https://github.com/XavierXiao/Dreambooth-Stable-Diffusion/issues/71) inspired the use of `t@y` as our second identifier). However, there may be better rare tokens to experiment with, and results also seemed to be good when more intuitive words are used.
+
+**Important**: New parameters are added to the script, making possible to validate the progress of the training by
+generating images at specified steps. Taking also into account that a comma separated list in a text field for a prompt
+it's never a good idea (simply because it is very common in prompts to have them as part of a regular text) we
+introduce the `concept_list` parameter: allowing to specify a json-like file where you can define the different
+configuration for each subject that you want to train.
+
+An example of how to generate the file:
+```python
+import json
+
+# here we are using parameters for prior-preservation and validation as well.
+concepts_list = [
+    {
+        "instance_prompt":      "drawing of a t@y meme",
+        "class_prompt":         "drawing of a meme",
+        "instance_data_dir":    "/some_folder/meme_toy",
+        "class_data_dir":       "/data/meme",
+        "validation_prompt":    "drawing of a t@y meme about football in Uruguay",
+        "validation_negative_prompt": "black and white"
+    },
+    {
+        "instance_prompt":      "drawing of a sks sir",
+        "class_prompt":         "drawing of a sir",
+        "instance_data_dir":    "/some_other_folder/sir_sks",
+        "class_data_dir":       "/data/sir",
+        "validation_prompt":    "drawing of a sks sir with the Uruguayan sun in his chest",
+        "validation_negative_prompt": "an old man",
+        "validation_guidance_scale": 20,
+        "validation_number_images": 3,
+        "validation_inference_steps": 10
+    }
+]
+
+with open("concepts_list.json", "w") as f:
+    json.dump(concepts_list, f, indent=4)
+```
+And then just point to the file when executing the script:
+
+```bash
+# exports...
+accelerate launch train_multi_subject_dreambooth.py \
+# more parameters...
+--concepts_list="concepts_list.json"
+```
+
+You can use the helper from the script to get a better sense of each parameter.
+
+### Inference
+
+Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_id = "path-to-your-trained-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "A photo of a t@y person petting an sks dog"
+image = pipe(prompt, num_inference_steps=200, guidance_scale=7.5).images[0]
+
+image.save("person-petting-dog.png")
+```
+
+### Inference from a training checkpoint
+
+You can also perform inference from one of the checkpoints saved during the training process, if you used the `--checkpointing_steps` argument. Please, refer to [the documentation](https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint) to see how to do it.
+
+## Additional Dreambooth documentation
+Because the `train_multi_subject_dreambooth.py` script here was forked from an original version of `train_dreambooth.py` in the `examples/dreambooth` folder, I've included the original applicable training documentation for single subject examples below.
+
+This should explain how to play with training variables such as prior preservation, fine tuning the text encoder, etc. which is still applicable to our multi subject training code. Note also that the examples below, which are single subject examples, also work with `train_multi_subject_dreambooth.py`, as this script supports 1 (or more) subjects.
+
+### Single subject dog toy example
+
+Let's get our dataset. Download images from [here](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) and save them in a directory. This will be our training data.
+
+And launch the training using
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=400
+```
+
+### Training with prior-preservation loss
+
+Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+
+### Training on a 16GB GPU:
+
+With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.
+
+To install `bitandbytes` please refer to this [readme](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=2 --gradient_checkpointing \
+  --use_8bit_adam \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Training on a 8 GB GPU:
+
+By using [DeepSpeed](https://www.deepspeed.ai/) it's possible to offload some
+tensors from VRAM to either CPU or NVME allowing to train with less VRAM.
+
+DeepSpeed needs to be enabled with `accelerate config`. During configuration
+answer yes to "Do you want to use DeepSpeed?". With DeepSpeed stage 2, fp16
+mixed precision and offloading both parameters and optimizer state to cpu it's
+possible to train on under 8 GB VRAM with a drawback of requiring significantly
+more RAM (about 25 GB). See [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more DeepSpeed configuration options.
+
+Changing the default Adam optimizer to DeepSpeed's special version of Adam
+`deepspeed.ops.adam.DeepSpeedCPUAdam` gives a substantial speedup but enabling
+it requires CUDA toolchain with the same version as pytorch. 8-bit optimizer
+does not seem to be compatible with DeepSpeed at the moment.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch --mixed_precision="fp16" train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --sample_batch_size=1 \
+  --gradient_accumulation_steps=1 --gradient_checkpointing \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Fine-tune text encoder with the UNet.
+
+The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces.
+Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
+
+___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --train_text_encoder \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --use_8bit_adam \
+  --gradient_checkpointing \
+  --learning_rate=2e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Using DreamBooth for other pipelines than Stable Diffusion
+
+Altdiffusion also supports dreambooth now, the running command is basically the same as above, all you need to do is replace the `MODEL_NAME` like this:
+One can now simply change the `pretrained_model_name_or_path` to another architecture such as [`AltDiffusion`](https://huggingface.co/docs/diffusers/api/pipelines/alt_diffusion).
+
+```
+export MODEL_NAME="CompVis/stable-diffusion-v1-4" --> export MODEL_NAME="BAAI/AltDiffusion-m9"
+or
+export MODEL_NAME="CompVis/stable-diffusion-v1-4" --> export MODEL_NAME="BAAI/AltDiffusion"
+```
+
+### Training with xformers:
+You can enable memory efficient attention by [installing xFormers](https://github.com/facebookresearch/xformers#installing-xformers) and padding the `--enable_xformers_memory_efficient_attention` argument to the script. This is not available with the Flax/JAX implementation.
+
+You can also use Dreambooth to train the specialized in-painting model. See [the script in the research folder for details](https://github.com/huggingface/diffusers/tree/main/examples/research_projects/dreambooth_inpaint).
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/multi_subject_dreambooth/requirements.txt b/diffusers/examples/research_projects/multi_subject_dreambooth/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e19b0ce60bf407bec21a9b85f9232cad957bfa6f
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_subject_dreambooth/requirements.txt
@@ -0,0 +1,6 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py b/diffusers/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f507b26d6a81cf64e7c699e6d5e2c47d9d175b7
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py
@@ -0,0 +1,1195 @@
+import argparse
+import itertools
+import json
+import logging
+import math
+import uuid
+import warnings
+from os import environ, listdir, makedirs
+from os.path import basename, join
+from pathlib import Path
+from typing import List
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from PIL import Image
+from torch import dtype
+from torch.nn import Module
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.13.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def log_validation_images_to_tracker(
+    images: List[np.array], label: str, validation_prompt: str, accelerator: Accelerator, epoch: int
+):
+    logger.info(f"Logging images to tracker for validation prompt: {validation_prompt}.")
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{label}_{epoch}_{i}: {validation_prompt}")
+                        for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+
+# TODO: Add `prompt_embeds` and `negative_prompt_embeds` parameters to the function when `pre_compute_text_embeddings`
+#  argument is implemented.
+def generate_validation_images(
+    text_encoder: Module,
+    tokenizer: Module,
+    unet: Module,
+    vae: Module,
+    arguments: argparse.Namespace,
+    accelerator: Accelerator,
+    weight_dtype: dtype,
+):
+    logger.info("Running validation images.")
+
+    pipeline_args = {}
+
+    if text_encoder is not None:
+        pipeline_args["text_encoder"] = accelerator.unwrap_model(text_encoder)
+
+    if vae is not None:
+        pipeline_args["vae"] = vae
+
+    # create pipeline (note: unet and vae are loaded again in float32)
+    pipeline = DiffusionPipeline.from_pretrained(
+        arguments.pretrained_model_name_or_path,
+        tokenizer=tokenizer,
+        unet=accelerator.unwrap_model(unet),
+        revision=arguments.revision,
+        torch_dtype=weight_dtype,
+        **pipeline_args,
+    )
+
+    # We train on the simplified learning objective. If we were previously predicting a variance, we need the
+    # scheduler to ignore it
+    scheduler_args = {}
+
+    if "variance_type" in pipeline.scheduler.config:
+        variance_type = pipeline.scheduler.config.variance_type
+
+        if variance_type in ["learned", "learned_range"]:
+            variance_type = "fixed_small"
+
+        scheduler_args["variance_type"] = variance_type
+
+    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    generator = (
+        None if arguments.seed is None else torch.Generator(device=accelerator.device).manual_seed(arguments.seed)
+    )
+
+    images_sets = []
+    for vp, nvi, vnp, vis, vgs in zip(
+        arguments.validation_prompt,
+        arguments.validation_number_images,
+        arguments.validation_negative_prompt,
+        arguments.validation_inference_steps,
+        arguments.validation_guidance_scale,
+    ):
+        images = []
+        if vp is not None:
+            logger.info(
+                f"Generating {nvi} images with prompt: '{vp}', negative prompt: '{vnp}', inference steps: {vis}, "
+                f"guidance scale: {vgs}."
+            )
+
+            pipeline_args = {"prompt": vp, "negative_prompt": vnp, "num_inference_steps": vis, "guidance_scale": vgs}
+
+            # run inference
+            # TODO: it would be good to measure whether it's faster to run inference on all images at once, one at a
+            #  time or in small batches
+            for _ in range(nvi):
+                with torch.autocast("cuda"):
+                    image = pipeline(**pipeline_args, num_images_per_prompt=1, generator=generator).images[0]
+                images.append(image)
+
+        images_sets.append(images)
+
+    del pipeline
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    return images_sets
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+
+        return RobertaSeriesModelWithTransformation
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=False,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=None,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt(s) `validation_prompt` "
+            "multiple times (`validation_number_images`) and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning. You can use commas to "
+        "define multiple negative prompts. This parameter can be defined also within the file given by "
+        "`concepts_list` parameter in the respective subject.",
+    )
+    parser.add_argument(
+        "--validation_number_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with the validation parameters given. This "
+        "can be defined within the file given by `concepts_list` parameter in the respective subject.",
+    )
+    parser.add_argument(
+        "--validation_negative_prompt",
+        type=str,
+        default=None,
+        help="A negative prompt that is used during validation to verify that the model is learning. You can use commas"
+        " to define multiple negative prompts, each one corresponding to a validation prompt. This parameter can "
+        "be defined also within the file given by `concepts_list` parameter in the respective subject.",
+    )
+    parser.add_argument(
+        "--validation_inference_steps",
+        type=int,
+        default=25,
+        help="Number of inference steps (denoising steps) to run during validation. This can be defined within the "
+        "file given by `concepts_list` parameter in the respective subject.",
+    )
+    parser.add_argument(
+        "--validation_guidance_scale",
+        type=float,
+        default=7.5,
+        help="To control how much the image generation process follows the text prompt. This can be defined within the "
+        "file given by `concepts_list` parameter in the respective subject.",
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+    parser.add_argument(
+        "--concepts_list",
+        type=str,
+        default=None,
+        help="Path to json file containing a list of multiple concepts, will overwrite parameters like instance_prompt,"
+        " class_prompt, etc.",
+    )
+
+    if input_args:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if not args.concepts_list and (not args.instance_data_dir or not args.instance_prompt):
+        raise ValueError(
+            "You must specify either instance parameters (data directory, prompt, etc.) or use "
+            "the `concept_list` parameter and specify them within the file."
+        )
+
+    if args.concepts_list:
+        if args.instance_prompt:
+            raise ValueError("If you are using `concepts_list` parameter, define the instance prompt within the file.")
+        if args.instance_data_dir:
+            raise ValueError(
+                "If you are using `concepts_list` parameter, define the instance data directory within the file."
+            )
+        if args.validation_steps and (args.validation_prompt or args.validation_negative_prompt):
+            raise ValueError(
+                "If you are using `concepts_list` parameter, define validation parameters for "
+                "each subject within the file:\n - `validation_prompt`."
+                "\n - `validation_negative_prompt`.\n - `validation_guidance_scale`."
+                "\n - `validation_number_images`.\n - `validation_prompt`."
+                "\n - `validation_inference_steps`.\nThe `validation_steps` parameter is the only one "
+                "that needs to be defined outside the file."
+            )
+
+    env_local_rank = int(environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if not args.concepts_list:
+            if not args.class_data_dir:
+                raise ValueError("You must specify a data directory for class images.")
+            if not args.class_prompt:
+                raise ValueError("You must specify prompt for class images.")
+        else:
+            if args.class_data_dir:
+                raise ValueError(
+                    "If you are using `concepts_list` parameter, define the class data directory within the file."
+                )
+            if args.class_prompt:
+                raise ValueError(
+                    "If you are using `concepts_list` parameter, define the class prompt within the file."
+                )
+    else:
+        # logger is not available yet
+        if not args.class_data_dir:
+            warnings.warn(
+                "Ignoring `class_data_dir` parameter, you need to use it together with `with_prior_preservation`."
+            )
+        if not args.class_prompt:
+            warnings.warn(
+                "Ignoring `class_prompt` parameter, you need to use it together with `with_prior_preservation`."
+            )
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and then tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        size=512,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+
+        self.instance_data_root = []
+        self.instance_images_path = []
+        self.num_instance_images = []
+        self.instance_prompt = []
+        self.class_data_root = [] if class_data_root is not None else None
+        self.class_images_path = []
+        self.num_class_images = []
+        self.class_prompt = []
+        self._length = 0
+
+        for i in range(len(instance_data_root)):
+            self.instance_data_root.append(Path(instance_data_root[i]))
+            if not self.instance_data_root[i].exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            self.instance_images_path.append(list(Path(instance_data_root[i]).iterdir()))
+            self.num_instance_images.append(len(self.instance_images_path[i]))
+            self.instance_prompt.append(instance_prompt[i])
+            self._length += self.num_instance_images[i]
+
+            if class_data_root is not None:
+                self.class_data_root.append(Path(class_data_root[i]))
+                self.class_data_root[i].mkdir(parents=True, exist_ok=True)
+                self.class_images_path.append(list(self.class_data_root[i].iterdir()))
+                self.num_class_images.append(len(self.class_images_path))
+                if self.num_class_images[i] > self.num_instance_images[i]:
+                    self._length -= self.num_instance_images[i]
+                    self._length += self.num_class_images[i]
+                self.class_prompt.append(class_prompt[i])
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        for i in range(len(self.instance_images_path)):
+            instance_image = Image.open(self.instance_images_path[i][index % self.num_instance_images[i]])
+            if not instance_image.mode == "RGB":
+                instance_image = instance_image.convert("RGB")
+            example[f"instance_images_{i}"] = self.image_transforms(instance_image)
+            example[f"instance_prompt_ids_{i}"] = self.tokenizer(
+                self.instance_prompt[i],
+                truncation=True,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                return_tensors="pt",
+            ).input_ids
+
+        if self.class_data_root:
+            for i in range(len(self.class_data_root)):
+                class_image = Image.open(self.class_images_path[i][index % self.num_class_images[i]])
+                if not class_image.mode == "RGB":
+                    class_image = class_image.convert("RGB")
+                example[f"class_images_{i}"] = self.image_transforms(class_image)
+                example[f"class_prompt_ids_{i}"] = self.tokenizer(
+                    self.class_prompt[i],
+                    truncation=True,
+                    padding="max_length",
+                    max_length=self.tokenizer.model_max_length,
+                    return_tensors="pt",
+                ).input_ids
+
+        return example
+
+
+def collate_fn(num_instances, examples, with_prior_preservation=False):
+    input_ids = []
+    pixel_values = []
+
+    for i in range(num_instances):
+        input_ids += [example[f"instance_prompt_ids_{i}"] for example in examples]
+        pixel_values += [example[f"instance_images_{i}"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        for i in range(num_instances):
+            input_ids += [example[f"class_prompt_ids_{i}"] for example in examples]
+            pixel_values += [example[f"class_images_{i}"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    input_ids = torch.cat(input_ids, dim=0)
+
+    batch = {
+        "input_ids": input_ids,
+        "pixel_values": pixel_values,
+    }
+    return batch
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
+
+    instance_data_dir = []
+    instance_prompt = []
+    class_data_dir = [] if args.with_prior_preservation else None
+    class_prompt = [] if args.with_prior_preservation else None
+    if args.concepts_list:
+        with open(args.concepts_list, "r") as f:
+            concepts_list = json.load(f)
+
+        if args.validation_steps:
+            args.validation_prompt = []
+            args.validation_number_images = []
+            args.validation_negative_prompt = []
+            args.validation_inference_steps = []
+            args.validation_guidance_scale = []
+
+        for concept in concepts_list:
+            instance_data_dir.append(concept["instance_data_dir"])
+            instance_prompt.append(concept["instance_prompt"])
+
+            if args.with_prior_preservation:
+                try:
+                    class_data_dir.append(concept["class_data_dir"])
+                    class_prompt.append(concept["class_prompt"])
+                except KeyError:
+                    raise KeyError(
+                        "`class_data_dir` or `class_prompt` not found in concepts_list while using "
+                        "`with_prior_preservation`."
+                    )
+            else:
+                if "class_data_dir" in concept:
+                    warnings.warn(
+                        "Ignoring `class_data_dir` key, to use it you need to enable `with_prior_preservation`."
+                    )
+                if "class_prompt" in concept:
+                    warnings.warn(
+                        "Ignoring `class_prompt` key, to use it you need to enable `with_prior_preservation`."
+                    )
+
+            if args.validation_steps:
+                args.validation_prompt.append(concept.get("validation_prompt", None))
+                args.validation_number_images.append(concept.get("validation_number_images", 4))
+                args.validation_negative_prompt.append(concept.get("validation_negative_prompt", None))
+                args.validation_inference_steps.append(concept.get("validation_inference_steps", 25))
+                args.validation_guidance_scale.append(concept.get("validation_guidance_scale", 7.5))
+    else:
+        # Parse instance and class inputs, and double check that lengths match
+        instance_data_dir = args.instance_data_dir.split(",")
+        instance_prompt = args.instance_prompt.split(",")
+        assert all(
+            x == len(instance_data_dir) for x in [len(instance_data_dir), len(instance_prompt)]
+        ), "Instance data dir and prompt inputs are not of the same length."
+
+        if args.with_prior_preservation:
+            class_data_dir = args.class_data_dir.split(",")
+            class_prompt = args.class_prompt.split(",")
+            assert all(
+                x == len(instance_data_dir)
+                for x in [len(instance_data_dir), len(instance_prompt), len(class_data_dir), len(class_prompt)]
+            ), "Instance & class data dir or prompt inputs are not of the same length."
+
+        if args.validation_steps:
+            validation_prompts = args.validation_prompt.split(",")
+            num_of_validation_prompts = len(validation_prompts)
+            args.validation_prompt = validation_prompts
+            args.validation_number_images = [args.validation_number_images] * num_of_validation_prompts
+
+            negative_validation_prompts = [None] * num_of_validation_prompts
+            if args.validation_negative_prompt:
+                negative_validation_prompts = args.validation_negative_prompt.split(",")
+                while len(negative_validation_prompts) < num_of_validation_prompts:
+                    negative_validation_prompts.append(None)
+            args.validation_negative_prompt = negative_validation_prompts
+
+            assert num_of_validation_prompts == len(
+                negative_validation_prompts
+            ), "The length of negative prompts for validation is greater than the number of validation prompts."
+            args.validation_inference_steps = [args.validation_inference_steps] * num_of_validation_prompts
+            args.validation_guidance_scale = [args.validation_guidance_scale] * num_of_validation_prompts
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        for i in range(len(class_data_dir)):
+            class_images_dir = Path(class_data_dir[i])
+            if not class_images_dir.exists():
+                class_images_dir.mkdir(parents=True)
+            cur_class_images = len(list(class_images_dir.iterdir()))
+
+            if cur_class_images < args.num_class_images:
+                torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+                if args.prior_generation_precision == "fp32":
+                    torch_dtype = torch.float32
+                elif args.prior_generation_precision == "fp16":
+                    torch_dtype = torch.float16
+                elif args.prior_generation_precision == "bf16":
+                    torch_dtype = torch.bfloat16
+                pipeline = DiffusionPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    torch_dtype=torch_dtype,
+                    safety_checker=None,
+                    revision=args.revision,
+                )
+                pipeline.set_progress_bar_config(disable=True)
+
+                num_new_images = args.num_class_images - cur_class_images
+                logger.info(f"Number of class images to sample: {num_new_images}.")
+
+                sample_dataset = PromptDataset(class_prompt[i], num_new_images)
+                sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+                sample_dataloader = accelerator.prepare(sample_dataloader)
+                pipeline.to(accelerator.device)
+
+                for example in tqdm(
+                    sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+                ):
+                    images = pipeline(example["prompt"]).images
+
+                    for ii, image in enumerate(images):
+                        hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                        image_filename = (
+                            class_images_dir / f"{example['index'][ii] + cur_class_images}-{hash_image}.jpg"
+                        )
+                        image.save(image_filename)
+
+                # Clean up the memory deleting one-time-use variables.
+                del pipeline
+                del sample_dataloader
+                del sample_dataset
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    tokenizer = None
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+
+    # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+    )
+
+    vae.requires_grad_(False)
+    if not args.train_text_encoder:
+        text_encoder.requires_grad_(False)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = (
+        itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
+    )
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=instance_data_dir,
+        instance_prompt=instance_prompt,
+        class_data_root=class_data_dir,
+        class_prompt=class_prompt,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(len(instance_data_dir), examples, args.with_prior_preservation),
+        num_workers=1,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae and text_encoder to device and cast to weight_dtype
+    vae.to(accelerator.device, dtype=weight_dtype)
+    if not args.train_text_encoder:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initialize automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("dreambooth", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.train_text_encoder:
+            text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                time_steps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
+                )
+                time_steps = time_steps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, time_steps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Predict the noise residual
+                model_pred = unet(noisy_latents, time_steps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, time_steps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute instance loss
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                    # Compute prior loss
+                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+                else:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(unet.parameters(), text_encoder.parameters())
+                        if args.train_text_encoder
+                        else unet.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        save_path = join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if (
+                        args.validation_steps
+                        and any(args.validation_prompt)
+                        and global_step % args.validation_steps == 0
+                    ):
+                        images_set = generate_validation_images(
+                            text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype
+                        )
+                        for images, validation_prompt in zip(images_set, args.validation_prompt):
+                            if len(images) > 0:
+                                label = str(uuid.uuid1())[:8]  # generate an id for different set of images
+                                log_validation_images_to_tracker(
+                                    images, label, validation_prompt, accelerator, global_step
+                                )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        pipeline = DiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            unet=accelerator.unwrap_model(unet),
+            text_encoder=accelerator.unwrap_model(text_encoder),
+            revision=args.revision,
+        )
+        pipeline.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/multi_subject_dreambooth_inpainting/README.md b/diffusers/examples/research_projects/multi_subject_dreambooth_inpainting/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ffd8e304efceac7e5f921c9f3f45dcf9957939f8
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_subject_dreambooth_inpainting/README.md
@@ -0,0 +1,93 @@
+# Multi Subject Dreambooth for Inpainting Models
+
+Please note that this project is not actively maintained. However, you can open an issue and tag @gzguevara.
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject. This project consists of **two parts**. Training Stable Diffusion for inpainting requieres prompt-image-mask pairs. The Unet of inpainiting models have 5 additional input channels (4 for the encoded masked-image and 1 for the mask itself).
+
+**The first part**, the `multi_inpaint_dataset.ipynb` notebook, demonstrates how make a 🤗 dataset of prompt-image-mask pairs. You can, however, skip the first part and move straight to the second part with the example datasets in this project. ([cat toy dataset masked](https://huggingface.co/datasets/gzguevara/cat_toy_masked), [mr. potato head dataset masked](https://huggingface.co/datasets/gzguevara/mr_potato_head_masked))
+
+**The second part**, the `train_multi_subject_inpainting.py` training script, demonstrates how to implement a training procedure for one or more subjects and adapt it for stable diffusion for inpainting.
+
+## 1. Data Collection: Make Prompt-Image-Mask Pairs
+
+ Earlier training scripts have provided approaches like random masking for the training images. This project provides a notebook for more precise mask setting.
+
+The notebook can be found here: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1JNEASI_B7pLW1srxhgln6nM0HoGAQT32?usp=sharing)
+
+The `multi_inpaint_dataset.ipynb` notebook, takes training & validation images, on which the user draws masks and provides prompts to make a prompt-image-mask pairs. This ensures that during training, the loss is computed on the area masking the object of interest, rather than on random areas. Moreover, the `multi_inpaint_dataset.ipynb` notebook allows you to build a validation dataset with corresponding masks for monitoring the training process. Example below:
+
+![train_val_pairs](https://drive.google.com/uc?id=1PzwH8E3icl_ubVmA19G0HZGLImFX3x5I)
+
+You can build multiple datasets for every subject and upload them to the 🤗 hub. Later, when launching the training script you can indicate the paths of the datasets, on which you would like to finetune Stable Diffusion for inpaining.
+
+## 2. Train Multi Subject Dreambooth for Inpainting
+
+### 2.1. Setting The Training Configuration
+
+Before launching the training script, make sure to select the inpainting the target model, the output directory and the 🤗 datasets.
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-inpainting"
+export OUTPUT_DIR="path-to-save-model"
+
+export DATASET_1="gzguevara/mr_potato_head_masked"
+export DATASET_2="gzguevara/cat_toy_masked"
+... # Further paths to 🤗 datasets
+```
+
+### 2.2. Launching The Training Script
+
+```bash
+accelerate launch train_multi_subject_dreambooth_inpaint.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir $DATASET_1 $DATASET_2 \
+  --output_dir=$OUTPUT_DIR \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=2 \
+  --learning_rate=3e-6 \
+  --max_train_steps=500 \
+  --report_to_wandb
+```
+
+### 2.3. Fine-tune text encoder with the UNet.
+
+The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces.
+Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
+
+___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
+
+```bash
+accelerate launch train_multi_subject_dreambooth_inpaint.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir $DATASET_1 $DATASET_2 \
+  --output_dir=$OUTPUT_DIR \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=2 \
+  --learning_rate=2e-6 \
+  --max_train_steps=500 \
+  --report_to_wandb \
+  --train_text_encoder
+```
+
+## 3. Results
+
+A [![Weights & Biases](https://img.shields.io/badge/Weights%20&%20Biases-Report-blue)](https://wandb.ai/gzguevara/uncategorized/reports/Multi-Subject-Dreambooth-for-Inpainting--Vmlldzo2MzY5NDQ4?accessToken=y0nya2d7baguhbryxaikbfr1203amvn1jsmyl07vk122mrs7tnph037u1nqgse8t) is provided showing the training progress by every 50 steps. Note, the reported  weights & baises run was performed on a A100 GPU with the following stetting:
+
+```bash
+accelerate launch train_multi_subject_dreambooth_inpaint.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir $DATASET_1 $DATASET_2 \
+  --output_dir=$OUTPUT_DIR \
+  --resolution=512 \
+  --train_batch_size=10 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-6 \
+  --max_train_steps=500 \
+  --report_to_wandb \
+  --train_text_encoder
+```
+Here you can see the target objects on my desk and next to my plant:
+
+![Results](https://drive.google.com/uc?id=1kQisOiiF5cj4rOYjdq8SCZenNsUP2aK0)
diff --git a/diffusers/examples/research_projects/multi_subject_dreambooth_inpainting/requirements.txt b/diffusers/examples/research_projects/multi_subject_dreambooth_inpainting/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..351287c77cbe1c6ca8917dc1ff6f184ddbb1378a
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_subject_dreambooth_inpainting/requirements.txt
@@ -0,0 +1,8 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+datasets>=2.16.0
+wandb>=0.16.1
+ftfy
+tensorboard
+Jinja2
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/multi_subject_dreambooth_inpainting/train_multi_subject_dreambooth_inpainting.py b/diffusers/examples/research_projects/multi_subject_dreambooth_inpainting/train_multi_subject_dreambooth_inpainting.py
new file mode 100644
index 0000000000000000000000000000000000000000..567e5e3caaa36896b278aa816ffe32638b7b8734
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_subject_dreambooth_inpainting/train_multi_subject_dreambooth_inpainting.py
@@ -0,0 +1,661 @@
+import argparse
+import copy
+import itertools
+import logging
+import math
+import os
+import random
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import concatenate_datasets, load_dataset
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    StableDiffusionInpaintPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.13.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument("--instance_data_dir", nargs="+", help="Instance data directories")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_text_encoder", default=False, action="store_true", help="Whether to train the text encoder"
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=1000,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint and are suitable for resuming training"
+            " using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpointing_from",
+        type=int,
+        default=1000,
+        help=("Start to checkpoint from step"),
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=50,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--validation_from",
+        type=int,
+        default=0,
+        help=("Start to validate from step"),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_project_name",
+        type=str,
+        default=None,
+        help="The w&b name.",
+    )
+    parser.add_argument(
+        "--report_to_wandb", default=False, action="store_true", help="Whether to report to weights and biases"
+    )
+
+    args = parser.parse_args()
+
+    return args
+
+
+def prepare_mask_and_masked_image(image, mask):
+    image = np.array(image.convert("RGB"))
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    mask = np.array(mask.convert("L"))
+    mask = mask.astype(np.float32) / 255.0
+    mask = mask[None, None]
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
+    mask = torch.from_numpy(mask)
+
+    masked_image = image * (mask < 0.5)
+
+    return mask, masked_image
+
+
+class DreamBoothDataset(Dataset):
+    def __init__(
+        self,
+        tokenizer,
+        datasets_paths,
+    ):
+        self.tokenizer = tokenizer
+        self.datasets_paths = (datasets_paths,)
+        self.datasets = [load_dataset(dataset_path) for dataset_path in self.datasets_paths[0]]
+        self.train_data = concatenate_datasets([dataset["train"] for dataset in self.datasets])
+        self.test_data = concatenate_datasets([dataset["test"] for dataset in self.datasets])
+
+        self.image_normalize = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def set_image(self, img, switch):
+        if img.mode not in ["RGB", "L"]:
+            img = img.convert("RGB")
+
+        if switch:
+            img = img.transpose(Image.FLIP_LEFT_RIGHT)
+
+        img = img.resize((512, 512), Image.BILINEAR)
+
+        return img
+
+    def __len__(self):
+        return len(self.train_data)
+
+    def __getitem__(self, index):
+        # Lettings
+        example = {}
+        img_idx = index % len(self.train_data)
+        switch = random.choice([True, False])
+
+        # Load image
+        image = self.set_image(self.train_data[img_idx]["image"], switch)
+
+        # Normalize image
+        image_norm = self.image_normalize(image)
+
+        # Tokenise prompt
+        tokenized_prompt = self.tokenizer(
+            self.train_data[img_idx]["prompt"],
+            padding="do_not_pad",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+        ).input_ids
+
+        # Load masks for image
+        masks = [
+            self.set_image(self.train_data[img_idx][key], switch) for key in self.train_data[img_idx] if "mask" in key
+        ]
+
+        # Build example
+        example["PIL_image"] = image
+        example["instance_image"] = image_norm
+        example["instance_prompt_id"] = tokenized_prompt
+        example["instance_masks"] = masks
+
+        return example
+
+
+def weighted_mask(masks):
+    # Convert each mask to a NumPy array and ensure it's binary
+    mask_arrays = [np.array(mask) / 255 for mask in masks]  # Normalizing to 0-1 range
+
+    # Generate random weights and apply them to each mask
+    weights = [random.random() for _ in masks]
+    weights = [weight / sum(weights) for weight in weights]
+    weighted_masks = [mask * weight for mask, weight in zip(mask_arrays, weights)]
+
+    # Sum the weighted masks
+    summed_mask = np.sum(weighted_masks, axis=0)
+
+    # Apply a threshold to create the final mask
+    threshold = 0.5  # This threshold can be adjusted
+    result_mask = summed_mask >= threshold
+
+    # Convert the result back to a PIL image
+    return Image.fromarray(result_mask.astype(np.uint8) * 255)
+
+
+def collate_fn(examples, tokenizer):
+    input_ids = [example["instance_prompt_id"] for example in examples]
+    pixel_values = [example["instance_image"] for example in examples]
+
+    masks, masked_images = [], []
+
+    for example in examples:
+        # generate a random mask
+        mask = weighted_mask(example["instance_masks"])
+
+        # prepare mask and masked image
+        mask, masked_image = prepare_mask_and_masked_image(example["PIL_image"], mask)
+
+        masks.append(mask)
+        masked_images.append(masked_image)
+
+    pixel_values = torch.stack(pixel_values).to(memory_format=torch.contiguous_format).float()
+    masks = torch.stack(masks)
+    masked_images = torch.stack(masked_images)
+    input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids
+
+    batch = {"input_ids": input_ids, "pixel_values": pixel_values, "masks": masks, "masked_images": masked_images}
+
+    return batch
+
+
+def log_validation(pipeline, text_encoder, unet, val_pairs, accelerator):
+    # update pipeline (note: unet and vae are loaded again in float32)
+    pipeline.text_encoder = accelerator.unwrap_model(text_encoder)
+    pipeline.unet = accelerator.unwrap_model(unet)
+
+    with torch.autocast("cuda"):
+        val_results = [{"data_or_path": pipeline(**pair).images[0], "caption": pair["prompt"]} for pair in val_pairs]
+
+    torch.cuda.empty_cache()
+
+    wandb.log({"validation": [wandb.Image(**val_result) for val_result in val_results]})
+
+
+def checkpoint(args, global_step, accelerator):
+    save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+    accelerator.save_state(save_path)
+    logger.info(f"Saved state to {save_path}")
+
+
+def main():
+    args = parse_args()
+
+    project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit,
+        project_dir=args.output_dir,
+        logging_dir=Path(args.output_dir, args.logging_dir),
+    )
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        project_config=project_config,
+        log_with="wandb" if args.report_to_wandb else None,
+    )
+
+    if args.report_to_wandb and not is_wandb_available():
+        raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+
+    # Load the tokenizer & models and create wrapper for stable diffusion
+    tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder"
+    ).requires_grad_(args.train_text_encoder)
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae").requires_grad_(False)
+    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    optimizer = torch.optim.AdamW(
+        params=itertools.chain(unet.parameters(), text_encoder.parameters())
+        if args.train_text_encoder
+        else unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    train_dataset = DreamBoothDataset(
+        tokenizer=tokenizer,
+        datasets_paths=args.instance_data_dir,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, tokenizer),
+    )
+
+    # Scheduler and math around the number of training steps.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    if args.train_text_encoder:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    accelerator.register_for_checkpointing(lr_scheduler)
+
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+    else:
+        weight_dtype = torch.float32
+
+    # Move text_encode and vae to gpu.
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    vae.to(accelerator.device, dtype=weight_dtype)
+    if not args.train_text_encoder:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+
+    # Afterwards we calculate our number of training epochs
+    num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = vars(copy.deepcopy(args))
+        accelerator.init_trackers(args.validation_project_name, config=tracker_config)
+
+    # create validation pipeline (note: unet and vae are loaded again in float32)
+    val_pipeline = StableDiffusionInpaintPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        unet=unet,
+        vae=vae,
+        torch_dtype=weight_dtype,
+        safety_checker=None,
+    )
+    val_pipeline.set_progress_bar_config(disable=True)
+
+    # prepare validation dataset
+    val_pairs = [
+        {
+            "image": example["image"],
+            "mask_image": mask,
+            "prompt": example["prompt"],
+        }
+        for example in train_dataset.test_data
+        for mask in [example[key] for key in example if "mask" in key]
+    ]
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            for model in models:
+                sub_dir = "unet" if isinstance(model, type(accelerator.unwrap_model(unet))) else "text_encoder"
+                model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+
+    print()
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    global_step = 0
+    first_epoch = 0
+
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, num_train_epochs):
+        unet.train()
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Convert masked images to latent space
+                masked_latents = vae.encode(
+                    batch["masked_images"].reshape(batch["pixel_values"].shape).to(dtype=weight_dtype)
+                ).latent_dist.sample()
+                masked_latents = masked_latents * vae.config.scaling_factor
+
+                masks = batch["masks"]
+                # resize the mask to latents shape as we concatenate the mask to the latents
+                mask = torch.stack(
+                    [
+                        torch.nn.functional.interpolate(mask, size=(args.resolution // 8, args.resolution // 8))
+                        for mask in masks
+                    ]
+                )
+                mask = mask.reshape(-1, 1, args.resolution // 8, args.resolution // 8)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # concatenate the noised latents with the mask and the masked latents
+                latent_model_input = torch.cat([noisy_latents, mask, masked_latents], dim=1)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Predict the noise residual
+                noise_pred = unet(latent_model_input, timesteps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                loss = F.mse_loss(noise_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(unet.parameters(), text_encoder.parameters())
+                        if args.train_text_encoder
+                        else unet.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if (
+                        global_step % args.validation_steps == 0
+                        and global_step >= args.validation_from
+                        and args.report_to_wandb
+                    ):
+                        log_validation(
+                            val_pipeline,
+                            text_encoder,
+                            unet,
+                            val_pairs,
+                            accelerator,
+                        )
+
+                    if global_step % args.checkpointing_steps == 0 and global_step >= args.checkpointing_from:
+                        checkpoint(
+                            args,
+                            global_step,
+                            accelerator,
+                        )
+
+            # Step logging
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        accelerator.wait_for_everyone()
+
+    # Terminate training
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/multi_token_textual_inversion/README.md b/diffusers/examples/research_projects/multi_token_textual_inversion/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e0aaf2c0575b5f404dbf9c4b4ec9ac5e4e00a3b
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_token_textual_inversion/README.md
@@ -0,0 +1,143 @@
+## [Deprecated] Multi Token Textual Inversion
+
+**IMPORTART: This research project is deprecated. Multi Token Textual Inversion is now supported natively in [the official textual inversion example](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion#running-locally-with-pytorch).**
+
+The author of this project is [Isamu Isozaki](https://github.com/isamu-isozaki) - please make sure to tag the author for issue and PRs as well as @patrickvonplaten.
+
+We add multi token support to textual inversion. I added
+1. num_vec_per_token for the number of used to reference that token
+2. progressive_tokens for progressively training the token from 1 token to 2 token etc
+3. progressive_tokens_max_steps for the max number of steps until we start full training
+4. vector_shuffle to shuffle vectors
+
+Feel free to add these options to your training! In practice num_vec_per_token around 10+vector shuffle works great!
+
+## Textual Inversion fine-tuning example
+
+[Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images using just 3-5 examples.
+The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+## Running on Colab
+
+Colab for training
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+
+Colab for inference
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_conceptualizer_inference.ipynb)
+
+## Running locally with PyTorch
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd in the example folder  and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+
+### Cat toy example
+
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5), read the license and tick the checkbox if you agree.
+
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+
+Run the following command to authenticate your token
+
+```bash
+huggingface-cli login
+```
+
+If you have already cloned the repo, then you won't need to go through these steps.
+
+<br>
+
+Now let's get our dataset.Download 3-4 images from [here](https://drive.google.com/drive/folders/1fmJMs25nxS_rSNqS5hTcRdLem_YQXbq5) and save them in a directory. This will be our training data.
+
+And launch the training using
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+```bash
+export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export DATA_DIR="path-to-dir-containing-images"
+
+accelerate launch textual_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<cat-toy>" --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 --scale_lr \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --output_dir="textual_inversion_cat"
+```
+
+A full training run takes ~1 hour on one V100 GPU.
+
+### Inference
+
+Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `placeholder_token` in your prompt.
+
+```python
+from diffusers import StableDiffusionPipeline
+
+model_id = "path-to-your-trained-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id,torch_dtype=torch.float16).to("cuda")
+
+prompt = "A <cat-toy> backpack"
+
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("cat-backpack.png")
+```
+
+
+## Training with Flax/JAX
+
+For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install -U -r requirements_flax.txt
+```
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export DATA_DIR="path-to-dir-containing-images"
+
+python textual_inversion_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<cat-toy>" --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 --scale_lr \
+  --output_dir="textual_inversion_cat"
+```
+It should be at least 70% faster than the PyTorch script with the same configuration.
+
+### Training with xformers:
+You can enable memory efficient attention by [installing xFormers](https://github.com/facebookresearch/xformers#installing-xformers) and padding the `--enable_xformers_memory_efficient_attention` argument to the script. This is not available with the Flax/JAX implementation.
diff --git a/diffusers/examples/research_projects/multi_token_textual_inversion/multi_token_clip.py b/diffusers/examples/research_projects/multi_token_textual_inversion/multi_token_clip.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdc344376ada63da590729cb82bc521c6fd2b5d1
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_token_textual_inversion/multi_token_clip.py
@@ -0,0 +1,104 @@
+"""
+The main idea for this code is to provide a way for users to not need to bother with the hassle of multiple tokens for a concept by typing
+a photo of <concept>_0 <concept>_1 ... and so on
+and instead just do
+a photo of <concept>
+which gets translated to the above. This needs to work for both inference and training.
+For inference,
+the tokenizer encodes the text. So, we would want logic for our tokenizer to replace the placeholder token with
+it's underlying vectors
+For training,
+we would want to abstract away some logic like
+1. Adding tokens
+2. Updating gradient mask
+3. Saving embeddings
+to our Util class here.
+so
+TODO:
+1. have tokenizer keep track of concept, multiconcept pairs and replace during encode call x
+2. have mechanism for adding tokens x
+3. have mech for saving emebeddings x
+4. get mask to update x
+5. Loading tokens from embedding x
+6. Integrate to training x
+7. Test
+"""
+
+import copy
+import random
+
+from transformers import CLIPTokenizer
+
+
+class MultiTokenCLIPTokenizer(CLIPTokenizer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.token_map = {}
+
+    def try_adding_tokens(self, placeholder_token, *args, **kwargs):
+        num_added_tokens = super().add_tokens(placeholder_token, *args, **kwargs)
+        if num_added_tokens == 0:
+            raise ValueError(
+                f"The tokenizer already contains the token {placeholder_token}. Please pass a different"
+                " `placeholder_token` that is not already in the tokenizer."
+            )
+
+    def add_placeholder_tokens(self, placeholder_token, *args, num_vec_per_token=1, **kwargs):
+        output = []
+        if num_vec_per_token == 1:
+            self.try_adding_tokens(placeholder_token, *args, **kwargs)
+            output.append(placeholder_token)
+        else:
+            output = []
+            for i in range(num_vec_per_token):
+                ith_token = placeholder_token + f"_{i}"
+                self.try_adding_tokens(ith_token, *args, **kwargs)
+                output.append(ith_token)
+        # handle cases where there is a new placeholder token that contains the current placeholder token but is larger
+        for token in self.token_map:
+            if token in placeholder_token:
+                raise ValueError(
+                    f"The tokenizer already has placeholder token {token} that can get confused with"
+                    f" {placeholder_token}keep placeholder tokens independent"
+                )
+        self.token_map[placeholder_token] = output
+
+    def replace_placeholder_tokens_in_text(self, text, vector_shuffle=False, prop_tokens_to_load=1.0):
+        """
+        Here, we replace the placeholder tokens in text recorded in token_map so that the text_encoder
+        can encode them
+        vector_shuffle was inspired by https://github.com/rinongal/textual_inversion/pull/119
+        where shuffling tokens were found to force the model to learn the concepts more descriptively.
+        """
+        if isinstance(text, list):
+            output = []
+            for i in range(len(text)):
+                output.append(self.replace_placeholder_tokens_in_text(text[i], vector_shuffle=vector_shuffle))
+            return output
+        for placeholder_token in self.token_map:
+            if placeholder_token in text:
+                tokens = self.token_map[placeholder_token]
+                tokens = tokens[: 1 + int(len(tokens) * prop_tokens_to_load)]
+                if vector_shuffle:
+                    tokens = copy.copy(tokens)
+                    random.shuffle(tokens)
+                text = text.replace(placeholder_token, " ".join(tokens))
+        return text
+
+    def __call__(self, text, *args, vector_shuffle=False, prop_tokens_to_load=1.0, **kwargs):
+        return super().__call__(
+            self.replace_placeholder_tokens_in_text(
+                text, vector_shuffle=vector_shuffle, prop_tokens_to_load=prop_tokens_to_load
+            ),
+            *args,
+            **kwargs,
+        )
+
+    def encode(self, text, *args, vector_shuffle=False, prop_tokens_to_load=1.0, **kwargs):
+        return super().encode(
+            self.replace_placeholder_tokens_in_text(
+                text, vector_shuffle=vector_shuffle, prop_tokens_to_load=prop_tokens_to_load
+            ),
+            *args,
+            **kwargs,
+        )
diff --git a/diffusers/examples/research_projects/multi_token_textual_inversion/requirements.txt b/diffusers/examples/research_projects/multi_token_textual_inversion/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7a612982f4abbaa64f83db52e411a1235a372259
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_token_textual_inversion/requirements.txt
@@ -0,0 +1,6 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
diff --git a/diffusers/examples/research_projects/multi_token_textual_inversion/requirements_flax.txt b/diffusers/examples/research_projects/multi_token_textual_inversion/requirements_flax.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8f85ad523a3b46b65abf0138c05ecdd656e6845c
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_token_textual_inversion/requirements_flax.txt
@@ -0,0 +1,8 @@
+transformers>=4.25.1
+flax
+optax
+torch
+torchvision
+ftfy
+tensorboard
+Jinja2
diff --git a/diffusers/examples/research_projects/multi_token_textual_inversion/textual_inversion.py b/diffusers/examples/research_projects/multi_token_textual_inversion/textual_inversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..57ad77477b0d491044a1fc852a7ac3abe9563b1a
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_token_textual_inversion/textual_inversion.py
@@ -0,0 +1,937 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import logging
+import math
+import os
+import random
+from pathlib import Path
+
+import numpy as np
+import PIL
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from multi_token_clip import MultiTokenCLIPTokenizer
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.14.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def add_tokens(tokenizer, text_encoder, placeholder_token, num_vec_per_token=1, initializer_token=None):
+    """
+    Add tokens to the tokenizer and set the initial value of token embeddings
+    """
+    tokenizer.add_placeholder_tokens(placeholder_token, num_vec_per_token=num_vec_per_token)
+    text_encoder.resize_token_embeddings(len(tokenizer))
+    token_embeds = text_encoder.get_input_embeddings().weight.data
+    placeholder_token_ids = tokenizer.encode(placeholder_token, add_special_tokens=False)
+    if initializer_token:
+        token_ids = tokenizer.encode(initializer_token, add_special_tokens=False)
+        for i, placeholder_token_id in enumerate(placeholder_token_ids):
+            token_embeds[placeholder_token_id] = token_embeds[token_ids[i * len(token_ids) // num_vec_per_token]]
+    else:
+        for i, placeholder_token_id in enumerate(placeholder_token_ids):
+            token_embeds[placeholder_token_id] = torch.randn_like(token_embeds[placeholder_token_id])
+    return placeholder_token
+
+
+def save_progress(tokenizer, text_encoder, accelerator, save_path):
+    for placeholder_token in tokenizer.token_map:
+        placeholder_token_ids = tokenizer.encode(placeholder_token, add_special_tokens=False)
+        learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[placeholder_token_ids]
+        if len(placeholder_token_ids) == 1:
+            learned_embeds = learned_embeds[None]
+        learned_embeds_dict = {placeholder_token: learned_embeds.detach().cpu()}
+        torch.save(learned_embeds_dict, save_path)
+
+
+def load_multitoken_tokenizer(tokenizer, text_encoder, learned_embeds_dict):
+    for placeholder_token in learned_embeds_dict:
+        placeholder_embeds = learned_embeds_dict[placeholder_token]
+        num_vec_per_token = placeholder_embeds.shape[0]
+        placeholder_embeds = placeholder_embeds.to(dtype=text_encoder.dtype)
+        add_tokens(tokenizer, text_encoder, placeholder_token, num_vec_per_token=num_vec_per_token)
+        placeholder_token_ids = tokenizer.encode(placeholder_token, add_special_tokens=False)
+        token_embeds = text_encoder.get_input_embeddings().weight.data
+        for i, placeholder_token_id in enumerate(placeholder_token_ids):
+            token_embeds[placeholder_token_id] = placeholder_embeds[i]
+
+
+def load_multitoken_tokenizer_from_automatic(tokenizer, text_encoder, automatic_dict, placeholder_token):
+    """
+    Automatic1111's tokens have format
+    {'string_to_token': {'*': 265}, 'string_to_param': {'*': tensor([[ 0.0833,  0.0030,  0.0057,  ..., -0.0264, -0.0616, -0.0529],
+        [ 0.0058, -0.0190, -0.0584,  ..., -0.0025, -0.0945, -0.0490],
+        [ 0.0916,  0.0025,  0.0365,  ..., -0.0685, -0.0124,  0.0728],
+        [ 0.0812, -0.0199, -0.0100,  ..., -0.0581, -0.0780,  0.0254]],
+       requires_grad=True)}, 'name': 'FloralMarble-400', 'step': 399, 'sd_checkpoint': '4bdfc29c', 'sd_checkpoint_name': 'SD2.1-768'}
+    """
+    learned_embeds_dict = {}
+    learned_embeds_dict[placeholder_token] = automatic_dict["string_to_param"]["*"]
+    load_multitoken_tokenizer(tokenizer, text_encoder, learned_embeds_dict)
+
+
+def get_mask(tokenizer, accelerator):
+    # Get the mask of the weights that won't change
+    mask = torch.ones(len(tokenizer)).to(accelerator.device, dtype=torch.bool)
+    for placeholder_token in tokenizer.token_map:
+        placeholder_token_ids = tokenizer.encode(placeholder_token, add_special_tokens=False)
+        for i in range(len(placeholder_token_ids)):
+            mask = mask & (torch.arange(len(tokenizer)) != placeholder_token_ids[i]).to(accelerator.device)
+    return mask
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--progressive_tokens_max_steps",
+        type=int,
+        default=2000,
+        help="The number of steps until all tokens will be used.",
+    )
+    parser.add_argument(
+        "--progressive_tokens",
+        action="store_true",
+        help="Progressively train the tokens. For example, first train for 1 token, then 2 tokens and so on.",
+    )
+    parser.add_argument("--vector_shuffle", action="store_true", help="Shuffling tokens durint training")
+    parser.add_argument(
+        "--num_vec_per_token",
+        type=int,
+        default=1,
+        help=(
+            "The number of vectors used to represent the placeholder token. The higher the number, the better the"
+            " result at the cost of editability. This can be fixed by prompt editing."
+        ),
+    )
+    parser.add_argument(
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save learned_embeds.bin every X updates steps.",
+    )
+    parser.add_argument(
+        "--only_save_embeds",
+        action="store_true",
+        default=False,
+        help="Save only the embeddings for the new concept.",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution."
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run validation every X epochs. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+class TextualInversionDataset(Dataset):
+    def __init__(
+        self,
+        data_root,
+        tokenizer,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+        vector_shuffle=False,
+        progressive_tokens=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer = tokenizer
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+        self.vector_shuffle = vector_shuffle
+        self.progressive_tokens = progressive_tokens
+        self.prop_tokens_to_load = 0
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        self.interpolation = {
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["input_ids"] = self.tokenizer.encode(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+            vector_shuffle=self.vector_shuffle,
+            prop_tokens_to_load=self.prop_tokens_to_load if self.progressive_tokens else 1.0,
+        )[0]
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            (
+                h,
+                w,
+            ) = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
+
+        image = Image.fromarray(img)
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def main():
+    args = parse_args()
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load tokenizer
+    if args.tokenizer_name:
+        tokenizer = MultiTokenCLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = MultiTokenCLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+    )
+    if is_xformers_available():
+        try:
+            unet.enable_xformers_memory_efficient_attention()
+        except Exception as e:
+            logger.warning(
+                "Could not enable memory efficient attention. Make sure xformers is installed"
+                f" correctly and a GPU is available: {e}"
+            )
+    add_tokens(tokenizer, text_encoder, args.placeholder_token, args.num_vec_per_token, args.initializer_token)
+
+    # Freeze vae and unet
+    vae.requires_grad_(False)
+    unet.requires_grad_(False)
+    # Freeze all parameters except for the token embeddings in text encoder
+    text_encoder.text_model.encoder.requires_grad_(False)
+    text_encoder.text_model.final_layer_norm.requires_grad_(False)
+    text_encoder.text_model.embeddings.position_embedding.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        # Keep unet in train mode if we are using gradient checkpointing to save memory.
+        # The dropout cannot be != 0 so it doesn't matter if we are in eval or train mode.
+        unet.train()
+        text_encoder.gradient_checkpointing_enable()
+        unet.enable_gradient_checkpointing()
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    optimizer = torch.optim.AdamW(
+        text_encoder.get_input_embeddings().parameters(),  # only optimize the embeddings
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        placeholder_token=args.placeholder_token,
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    # Prepare everything with our `accelerator`.
+    text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        text_encoder, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # For mixed precision training we cast the unet and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae and unet to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("textual_inversion", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    # keep original embeddings as reference
+    orig_embeds_params = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight.data.clone()
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+            if args.progressive_tokens:
+                train_dataset.prop_tokens_to_load = float(global_step) / args.progressive_tokens_max_steps
+
+            with accelerator.accumulate(text_encoder):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample().detach()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0].to(dtype=weight_dtype)
+
+                # Predict the noise residual
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+                # Let's make sure we don't update any embedding weights besides the newly added token
+                index_no_updates = get_mask(tokenizer, accelerator)
+                with torch.no_grad():
+                    accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[
+                        index_no_updates
+                    ] = orig_embeds_params[index_no_updates]
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                if global_step % args.save_steps == 0:
+                    save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.bin")
+                    save_progress(tokenizer, text_encoder, accelerator, save_path)
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process and args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+            logger.info(
+                f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                f" {args.validation_prompt}."
+            )
+            # create pipeline (note: unet and vae are loaded again in float32)
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                text_encoder=accelerator.unwrap_model(text_encoder),
+                tokenizer=tokenizer,
+                unet=unet,
+                vae=vae,
+                revision=args.revision,
+                torch_dtype=weight_dtype,
+            )
+            pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+            pipeline = pipeline.to(accelerator.device)
+            pipeline.set_progress_bar_config(disable=True)
+
+            # run inference
+            generator = (
+                None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed)
+            )
+            images = []
+            for _ in range(args.num_validation_images):
+                with torch.autocast("cuda"):
+                    image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+                images.append(image)
+
+            for tracker in accelerator.trackers:
+                if tracker.name == "tensorboard":
+                    np_images = np.stack([np.asarray(img) for img in images])
+                    tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                if tracker.name == "wandb":
+                    tracker.log(
+                        {
+                            "validation": [
+                                wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                for i, image in enumerate(images)
+                            ]
+                        }
+                    )
+
+            del pipeline
+            torch.cuda.empty_cache()
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        if args.push_to_hub and args.only_save_embeds:
+            logger.warning("Enabling full model saving because --push_to_hub=True was specified.")
+            save_full_model = True
+        else:
+            save_full_model = not args.only_save_embeds
+        if save_full_model:
+            pipeline = StableDiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                text_encoder=accelerator.unwrap_model(text_encoder),
+                vae=vae,
+                unet=unet,
+                tokenizer=tokenizer,
+            )
+            pipeline.save_pretrained(args.output_dir)
+        # Save the newly trained embeddings
+        save_path = os.path.join(args.output_dir, "learned_embeds.bin")
+        save_progress(tokenizer, text_encoder, accelerator, save_path)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/multi_token_textual_inversion/textual_inversion_flax.py b/diffusers/examples/research_projects/multi_token_textual_inversion/textual_inversion_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecc89f98298e3e4205581fee1689761c519bc4e4
--- /dev/null
+++ b/diffusers/examples/research_projects/multi_token_textual_inversion/textual_inversion_flax.py
@@ -0,0 +1,654 @@
+import argparse
+import logging
+import math
+import os
+import random
+from pathlib import Path
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+import optax
+import PIL
+import torch
+import torch.utils.checkpoint
+import transformers
+from flax import jax_utils
+from flax.training import train_state
+from flax.training.common_utils import shard
+from huggingface_hub import create_repo, upload_folder
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
+
+from diffusers import (
+    FlaxAutoencoderKL,
+    FlaxDDPMScheduler,
+    FlaxPNDMScheduler,
+    FlaxStableDiffusionPipeline,
+    FlaxUNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
+from diffusers.utils import check_min_version
+
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.14.0.dev0")
+
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution."
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=True,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument(
+        "--use_auth_token",
+        action="store_true",
+        help=(
+            "Will use the token generated when running `huggingface-cli login` (necessary to use this script with"
+            " private models)."
+        ),
+    )
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+class TextualInversionDataset(Dataset):
+    def __init__(
+        self,
+        data_root,
+        tokenizer,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer = tokenizer
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        self.interpolation = {
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["input_ids"] = self.tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            (
+                h,
+                w,
+            ) = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
+
+        image = Image.fromarray(img)
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def resize_token_embeddings(model, new_num_tokens, initializer_token_id, placeholder_token_id, rng):
+    if model.config.vocab_size == new_num_tokens or new_num_tokens is None:
+        return
+    model.config.vocab_size = new_num_tokens
+
+    params = model.params
+    old_embeddings = params["text_model"]["embeddings"]["token_embedding"]["embedding"]
+    old_num_tokens, emb_dim = old_embeddings.shape
+
+    initializer = jax.nn.initializers.normal()
+
+    new_embeddings = initializer(rng, (new_num_tokens, emb_dim))
+    new_embeddings = new_embeddings.at[:old_num_tokens].set(old_embeddings)
+    new_embeddings = new_embeddings.at[placeholder_token_id].set(new_embeddings[initializer_token_id])
+    params["text_model"]["embeddings"]["token_embedding"]["embedding"] = new_embeddings
+
+    model.params = params
+    return model
+
+
+def get_params_to_save(params):
+    return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
+
+
+def main():
+    args = parse_args()
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    if jax.process_index() == 0:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    # Load the tokenizer and add the placeholder token as a additional special token
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Add the placeholder token in tokenizer
+    num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
+    if num_added_tokens == 0:
+        raise ValueError(
+            f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+            " `placeholder_token` that is not already in the tokenizer."
+        )
+
+    # Convert the initializer_token, placeholder_token to ids
+    token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
+    # Check if initializer_token is a single token or a sequence of tokens
+    if len(token_ids) > 1:
+        raise ValueError("The initializer token must be a single token.")
+
+    initializer_token_id = token_ids[0]
+    placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = FlaxCLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
+    vae, vae_params = FlaxAutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+    unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+
+    # Create sampling rng
+    rng = jax.random.PRNGKey(args.seed)
+    rng, _ = jax.random.split(rng)
+    # Resize the token embeddings as we are adding new special tokens to the tokenizer
+    text_encoder = resize_token_embeddings(
+        text_encoder, len(tokenizer), initializer_token_id, placeholder_token_id, rng
+    )
+    original_token_embeds = text_encoder.params["text_model"]["embeddings"]["token_embedding"]["embedding"]
+
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        placeholder_token=args.placeholder_token,
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+
+        batch = {"pixel_values": pixel_values, "input_ids": input_ids}
+        batch = {k: v.numpy() for k, v in batch.items()}
+
+        return batch
+
+    total_train_batch_size = args.train_batch_size * jax.local_device_count()
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=total_train_batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn
+    )
+
+    # Optimization
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * total_train_batch_size
+
+    constant_scheduler = optax.constant_schedule(args.learning_rate)
+
+    optimizer = optax.adamw(
+        learning_rate=constant_scheduler,
+        b1=args.adam_beta1,
+        b2=args.adam_beta2,
+        eps=args.adam_epsilon,
+        weight_decay=args.adam_weight_decay,
+    )
+
+    def create_mask(params, label_fn):
+        def _map(params, mask, label_fn):
+            for k in params:
+                if label_fn(k):
+                    mask[k] = "token_embedding"
+                else:
+                    if isinstance(params[k], dict):
+                        mask[k] = {}
+                        _map(params[k], mask[k], label_fn)
+                    else:
+                        mask[k] = "zero"
+
+        mask = {}
+        _map(params, mask, label_fn)
+        return mask
+
+    def zero_grads():
+        # from https://github.com/deepmind/optax/issues/159#issuecomment-896459491
+        def init_fn(_):
+            return ()
+
+        def update_fn(updates, state, params=None):
+            return jax.tree_util.tree_map(jnp.zeros_like, updates), ()
+
+        return optax.GradientTransformation(init_fn, update_fn)
+
+    # Zero out gradients of layers other than the token embedding layer
+    tx = optax.multi_transform(
+        {"token_embedding": optimizer, "zero": zero_grads()},
+        create_mask(text_encoder.params, lambda s: s == "token_embedding"),
+    )
+
+    state = train_state.TrainState.create(apply_fn=text_encoder.__call__, params=text_encoder.params, tx=tx)
+
+    noise_scheduler = FlaxDDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
+    )
+    noise_scheduler_state = noise_scheduler.create_state()
+
+    # Initialize our training
+    train_rngs = jax.random.split(rng, jax.local_device_count())
+
+    # Define gradient train step fn
+    def train_step(state, vae_params, unet_params, batch, train_rng):
+        dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3)
+
+        def compute_loss(params):
+            vae_outputs = vae.apply(
+                {"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode
+            )
+            latents = vae_outputs.latent_dist.sample(sample_rng)
+            # (NHWC) -> (NCHW)
+            latents = jnp.transpose(latents, (0, 3, 1, 2))
+            latents = latents * vae.config.scaling_factor
+
+            noise_rng, timestep_rng = jax.random.split(sample_rng)
+            noise = jax.random.normal(noise_rng, latents.shape)
+            bsz = latents.shape[0]
+            timesteps = jax.random.randint(
+                timestep_rng,
+                (bsz,),
+                0,
+                noise_scheduler.config.num_train_timesteps,
+            )
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)
+            encoder_hidden_states = state.apply_fn(
+                batch["input_ids"], params=params, dropout_rng=dropout_rng, train=True
+            )[0]
+            # Predict the noise residual and compute loss
+            model_pred = unet.apply(
+                {"params": unet_params}, noisy_latents, timesteps, encoder_hidden_states, train=False
+            ).sample
+
+            # Get the target for loss depending on the prediction type
+            if noise_scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif noise_scheduler.config.prediction_type == "v_prediction":
+                target = noise_scheduler.get_velocity(noise_scheduler_state, latents, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+            loss = (target - model_pred) ** 2
+            loss = loss.mean()
+
+            return loss
+
+        grad_fn = jax.value_and_grad(compute_loss)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+
+        # Keep the token embeddings fixed except the newly added embeddings for the concept,
+        # as we only want to optimize the concept embeddings
+        token_embeds = original_token_embeds.at[placeholder_token_id].set(
+            new_state.params["text_model"]["embeddings"]["token_embedding"]["embedding"][placeholder_token_id]
+        )
+        new_state.params["text_model"]["embeddings"]["token_embedding"]["embedding"] = token_embeds
+
+        metrics = {"loss": loss}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+        return new_state, metrics, new_train_rng
+
+    # Create parallel version of the train and eval step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    vae_params = jax_utils.replicate(vae_params)
+    unet_params = jax_utils.replicate(unet_params)
+
+    # Train!
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+
+    # Scheduler and math around the number of training steps.
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    global_step = 0
+
+    epochs = tqdm(range(args.num_train_epochs), desc=f"Epoch ... (1/{args.num_train_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+
+        train_metrics = []
+
+        steps_per_epoch = len(train_dataset) // total_train_batch_size
+        train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
+        # train
+        for batch in train_dataloader:
+            batch = shard(batch)
+            state, train_metric, train_rngs = p_train_step(state, vae_params, unet_params, batch, train_rngs)
+            train_metrics.append(train_metric)
+
+            train_step_progress_bar.update(1)
+            global_step += 1
+
+            if global_step >= args.max_train_steps:
+                break
+
+        train_metric = jax_utils.unreplicate(train_metric)
+
+        train_step_progress_bar.close()
+        epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
+
+    # Create the pipeline using using the trained modules and save it.
+    if jax.process_index() == 0:
+        scheduler = FlaxPNDMScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+        )
+        safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained(
+            "CompVis/stable-diffusion-safety-checker", from_pt=True
+        )
+        pipeline = FlaxStableDiffusionPipeline(
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32"),
+        )
+
+        pipeline.save_pretrained(
+            args.output_dir,
+            params={
+                "text_encoder": get_params_to_save(state.params),
+                "vae": get_params_to_save(vae_params),
+                "unet": get_params_to_save(unet_params),
+                "safety_checker": safety_checker.params,
+            },
+        )
+
+        # Also save the newly trained embeddings
+        learned_embeds = get_params_to_save(state.params)["text_model"]["embeddings"]["token_embedding"]["embedding"][
+            placeholder_token_id
+        ]
+        learned_embeds_dict = {args.placeholder_token: learned_embeds}
+        jnp.save(os.path.join(args.output_dir, "learned_embeds.npy"), learned_embeds_dict)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/onnxruntime/README.md b/diffusers/examples/research_projects/onnxruntime/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a7935189f61b3078b0926661cd1b3ee91fc2295a
--- /dev/null
+++ b/diffusers/examples/research_projects/onnxruntime/README.md
@@ -0,0 +1,5 @@
+## Diffusers examples with ONNXRuntime optimizations
+
+**This research project is not actively maintained by the diffusers team. For any questions or comments, please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on github with any questions.**
+
+This aims to provide diffusers examples with ONNXRuntime optimizations for training/fine-tuning unconditional image generation, text to image, and textual inversion. Please see individual directories for more details on how to run each task using ONNXRuntime.
diff --git a/diffusers/examples/research_projects/onnxruntime/text_to_image/README.md b/diffusers/examples/research_projects/onnxruntime/text_to_image/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f1f134c576b26aeddb637c803b9a70b4c8c1a97a
--- /dev/null
+++ b/diffusers/examples/research_projects/onnxruntime/text_to_image/README.md
@@ -0,0 +1,74 @@
+# Stable Diffusion text-to-image fine-tuning
+
+The `train_text_to_image.py` script shows how to fine-tune stable diffusion model on your own dataset.
+
+___Note___:
+
+___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparamters to get the best result on your dataset.___
+
+
+## Running locally with PyTorch
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd in the example folder  and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+### Naruto example
+
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree.
+
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+
+Run the following command to authenticate your token
+
+```bash
+huggingface-cli login
+```
+
+If you have already cloned the repo, then you won't need to go through these steps.
+
+<br>
+
+## Use ONNXRuntime to accelerate training
+In order to leverage onnxruntime to accelerate training, please use train_text_to_image.py
+
+The command to train a DDPM UNetCondition model on the Naruto dataset with onnxruntime:
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export dataset_name="lambdalabs/naruto-blip-captions"
+accelerate launch --mixed_precision="fp16"  train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$dataset_name \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir="sd-naruto-model"
+```
+
+Please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on github with any questions.
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/onnxruntime/text_to_image/requirements.txt b/diffusers/examples/research_projects/onnxruntime/text_to_image/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2dbadea4474aac24b501e61a4b05f24168ac85be
--- /dev/null
+++ b/diffusers/examples/research_projects/onnxruntime/text_to_image/requirements.txt
@@ -0,0 +1,7 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+datasets
+ftfy
+tensorboard
+modelcards
diff --git a/diffusers/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/diffusers/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..126a10b4f9e90e0716e2784f3e3673dbda37a601
--- /dev/null
+++ b/diffusers/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
@@ -0,0 +1,954 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import logging
+import math
+import os
+import random
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.state import AcceleratorState
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from onnxruntime.training.optim.fp16_optimizer import FP16_Optimizer as ORT_FP16_Optimizer
+from onnxruntime.training.ortmodule import ORTModule
+from packaging import version
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+from transformers.utils import ContextManagers
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.utils import check_min_version, deprecate, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+if is_wandb_available():
+    import wandb
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.17.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight_dtype, epoch):
+    logger.info("Running validation... ")
+
+    pipeline = StableDiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        vae=accelerator.unwrap_model(vae),
+        text_encoder=accelerator.unwrap_model(text_encoder),
+        tokenizer=tokenizer,
+        unet=accelerator.unwrap_model(unet),
+        safety_checker=None,
+        revision=args.revision,
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    images = []
+    for i in range(len(args.validation_prompts)):
+        with torch.autocast("cuda"):
+            image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0]
+
+        images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompts[i]}")
+                        for i, image in enumerate(images)
+                    ]
+                }
+            )
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--input_pertubation", type=float, default=0, help="The scale of input pretubation. Recommended 0.1."
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompts",
+        type=str,
+        default=None,
+        nargs="+",
+        help=("A set of prompts evaluated every `--validation_epochs` and logged to `--report_to`."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--non_ema_revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or"
+            " remote repository specified with --pretrained_model_name_or_path."
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=5,
+        help="Run validation every X epochs.",
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    # default to using the same revision for the non-ema model if not specified
+    if args.non_ema_revision is None:
+        args.non_ema_revision = args.revision
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if args.non_ema_revision is not None:
+        deprecate(
+            "non_ema_revision!=None",
+            "0.15.0",
+            message=(
+                "Downloading 'non_ema' weights from revision branches of the Hub is deprecated. Please make sure to"
+                " use `--variant=non_ema` instead."
+            ),
+        )
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load scheduler, tokenizer and models.
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+    )
+
+    def deepspeed_zero_init_disabled_context_manager():
+        """
+        returns either a context list that includes one that will disable zero.Init or an empty context list
+        """
+        deepspeed_plugin = AcceleratorState().deepspeed_plugin if accelerate.state.is_initialized() else None
+        if deepspeed_plugin is None:
+            return []
+
+        return [deepspeed_plugin.zero3_init_context_manager(enable=False)]
+
+    # Currently Accelerate doesn't know how to handle multiple models under Deepspeed ZeRO stage 3.
+    # For this to work properly all models must be run through `accelerate.prepare`. But accelerate
+    # will try to assign the same optimizer with the same weights to all models during
+    # `deepspeed.initialize`, which of course doesn't work.
+    #
+    # For now the following workaround will partially support Deepspeed ZeRO-3, by excluding the 2
+    # frozen models from being partitioned during `zero.Init` which gets called during
+    # `from_pretrained` So CLIPTextModel and AutoencoderKL will not enjoy the parameter sharding
+    # across multiple gpus and only UNet2DConditionModel will get ZeRO sharded.
+    with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+        text_encoder = CLIPTextModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        )
+        vae = AutoencoderKL.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
+        )
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.non_ema_revision
+    )
+
+    # Freeze vae and text_encoder
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+
+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = UNet2DConditionModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        )
+        ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel)
+                ema_unet.load_state_dict(load_model.state_dict())
+                ema_unet.to(accelerator.device)
+                del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    optimizer = ORT_FP16_Optimizer(optimizer)
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids
+
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {"pixel_values": pixel_values, "input_ids": input_ids}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.use_ema:
+        ema_unet.to(accelerator.device)
+
+    unet = ORTModule(unet)
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move text_encode and vae to gpu and cast to weight_dtype
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        tracker_config.pop("validation_prompts")
+        accelerator.init_trackers(args.tracker_project_name, tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
+                    )
+                if args.input_pertubation:
+                    new_noise = noise + args.input_pertubation * torch.randn_like(noise)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                if args.input_pertubation:
+                    noisy_latents = noise_scheduler.add_noise(latents, new_noise, timesteps)
+                else:
+                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # Predict the noise residual and compute loss
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_unet.step(unet.parameters())
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompts is not None and epoch % args.validation_epochs == 0:
+                if args.use_ema:
+                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                    ema_unet.store(unet.parameters())
+                    ema_unet.copy_to(unet.parameters())
+                log_validation(
+                    vae,
+                    text_encoder,
+                    tokenizer,
+                    unet,
+                    args,
+                    accelerator,
+                    weight_dtype,
+                    global_step,
+                )
+                if args.use_ema:
+                    # Switch back to the original UNet parameters.
+                    ema_unet.restore(unet.parameters())
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            revision=args.revision,
+        )
+        pipeline.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/onnxruntime/textual_inversion/README.md b/diffusers/examples/research_projects/onnxruntime/textual_inversion/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0f6ec7f51186539379df53e4b1b49c6332789cb1
--- /dev/null
+++ b/diffusers/examples/research_projects/onnxruntime/textual_inversion/README.md
@@ -0,0 +1,94 @@
+## Textual Inversion fine-tuning example
+
+[Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images using just 3-5 examples.
+The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+## Running on Colab
+
+Colab for training
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+
+Colab for inference
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_conceptualizer_inference.ipynb)
+
+## Running locally with PyTorch
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd in the example folder  and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+
+### Cat toy example
+
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5), read the license and tick the checkbox if you agree.
+
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+
+Run the following command to authenticate your token
+
+```bash
+huggingface-cli login
+```
+
+If you have already cloned the repo, then you won't need to go through these steps.
+
+<br>
+
+Now let's get our dataset. For this example we will use some cat images: https://huggingface.co/datasets/diffusers/cat_toy_example .
+
+Let's first download it locally:
+
+```py
+from huggingface_hub import snapshot_download
+
+local_dir = "./cat"
+snapshot_download("diffusers/cat_toy_example", local_dir=local_dir, repo_type="dataset", ignore_patterns=".gitattributes")
+```
+
+This will be our training data.
+Now we can launch the training using
+
+## Use ONNXRuntime to accelerate training
+In order to leverage onnxruntime to accelerate training, please use textual_inversion.py
+
+The command to train on custom data with onnxruntime:
+
+```bash
+export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export DATA_DIR="path-to-dir-containing-images"
+
+accelerate launch textual_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<cat-toy>" --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 --scale_lr \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --output_dir="textual_inversion_cat"
+```
+
+Please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on github with any questions.
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/onnxruntime/textual_inversion/requirements.txt b/diffusers/examples/research_projects/onnxruntime/textual_inversion/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c1a94eac83e6eb9a3a2dd11672b5d73f794ca3d1
--- /dev/null
+++ b/diffusers/examples/research_projects/onnxruntime/textual_inversion/requirements.txt
@@ -0,0 +1,6 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+modelcards
diff --git a/diffusers/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py b/diffusers/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..e10564fa59efb191ea37fef6c2fa15d9203c7f86
--- /dev/null
+++ b/diffusers/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py
@@ -0,0 +1,958 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import logging
+import math
+import os
+import random
+import warnings
+from pathlib import Path
+
+import numpy as np
+import PIL
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from onnxruntime.training.optim.fp16_optimizer import FP16_Optimizer as ORT_FP16_Optimizer
+from onnxruntime.training.ortmodule import ORTModule
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+if is_wandb_available():
+    import wandb
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.17.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(repo_id: str, images=None, base_model=str, repo_folder=None):
+    img_str = ""
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- diffusers
+- textual_inversion
+- diffusers-training
+- onxruntime
+inference: true
+---
+    """
+    model_card = f"""
+# Textual inversion text2image fine-tuning - {repo_id}
+These are textual inversion adaption weights for {base_model}. You can find some example images in the following. \n
+{img_str}
+"""
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    # create pipeline (note: unet and vae are loaded again in float32)
+    pipeline = DiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        text_encoder=accelerator.unwrap_model(text_encoder),
+        tokenizer=tokenizer,
+        unet=unet,
+        vae=vae,
+        safety_checker=None,
+        revision=args.revision,
+        torch_dtype=weight_dtype,
+    )
+    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed)
+    images = []
+    for _ in range(args.num_validation_images):
+        with torch.autocast("cuda"):
+            image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+        images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    torch.cuda.empty_cache()
+    return images
+
+
+def save_progress(text_encoder, placeholder_token_ids, accelerator, args, save_path):
+    logger.info("Saving embeddings")
+    learned_embeds = (
+        accelerator.unwrap_model(text_encoder)
+        .get_input_embeddings()
+        .weight[min(placeholder_token_ids) : max(placeholder_token_ids) + 1]
+    )
+    learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()}
+    torch.save(learned_embeds_dict, save_path)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save learned_embeds.bin every X updates steps.",
+    )
+    parser.add_argument(
+        "--save_as_full_pipeline",
+        action="store_true",
+        help="Save the complete stable diffusion pipeline.",
+    )
+    parser.add_argument(
+        "--num_vectors",
+        type=int,
+        default=1,
+        help="How many textual inversion vectors shall be used to learn the concept.",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution."
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=None,
+        help=(
+            "Deprecated in favor of validation_steps. Run validation every X epochs. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+class TextualInversionDataset(Dataset):
+    def __init__(
+        self,
+        data_root,
+        tokenizer,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer = tokenizer
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        self.interpolation = {
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["input_ids"] = self.tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            (
+                h,
+                w,
+            ) = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
+
+        image = Image.fromarray(img)
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def main():
+    args = parse_args()
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load tokenizer
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+    )
+
+    # Add the placeholder token in tokenizer
+    placeholder_tokens = [args.placeholder_token]
+
+    if args.num_vectors < 1:
+        raise ValueError(f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}")
+
+    # add dummy tokens for multi-vector
+    additional_tokens = []
+    for i in range(1, args.num_vectors):
+        additional_tokens.append(f"{args.placeholder_token}_{i}")
+    placeholder_tokens += additional_tokens
+
+    num_added_tokens = tokenizer.add_tokens(placeholder_tokens)
+    if num_added_tokens != args.num_vectors:
+        raise ValueError(
+            f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+            " `placeholder_token` that is not already in the tokenizer."
+        )
+
+    # Convert the initializer_token, placeholder_token to ids
+    token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
+    # Check if initializer_token is a single token or a sequence of tokens
+    if len(token_ids) > 1:
+        raise ValueError("The initializer token must be a single token.")
+
+    initializer_token_id = token_ids[0]
+    placeholder_token_ids = tokenizer.convert_tokens_to_ids(placeholder_tokens)
+
+    # Resize the token embeddings as we are adding new special tokens to the tokenizer
+    text_encoder.resize_token_embeddings(len(tokenizer))
+
+    # Initialise the newly added placeholder token with the embeddings of the initializer token
+    token_embeds = text_encoder.get_input_embeddings().weight.data
+    with torch.no_grad():
+        for token_id in placeholder_token_ids:
+            token_embeds[token_id] = token_embeds[initializer_token_id].clone()
+
+    # Freeze vae and unet
+    vae.requires_grad_(False)
+    unet.requires_grad_(False)
+    # Freeze all parameters except for the token embeddings in text encoder
+    text_encoder.text_model.encoder.requires_grad_(False)
+    text_encoder.text_model.final_layer_norm.requires_grad_(False)
+    text_encoder.text_model.embeddings.position_embedding.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        # Keep unet in train mode if we are using gradient checkpointing to save memory.
+        # The dropout cannot be != 0 so it doesn't matter if we are in eval or train mode.
+        unet.train()
+        text_encoder.gradient_checkpointing_enable()
+        unet.enable_gradient_checkpointing()
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    optimizer = torch.optim.AdamW(
+        text_encoder.get_input_embeddings().parameters(),  # only optimize the embeddings
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    optimizer = ORT_FP16_Optimizer(optimizer)
+
+    # Dataset and DataLoaders creation:
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        placeholder_token=args.placeholder_token,
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
+    )
+    if args.validation_epochs is not None:
+        warnings.warn(
+            f"FutureWarning: You are doing logging with validation_epochs={args.validation_epochs}."
+            " Deprecated validation_epochs in favor of `validation_steps`"
+            f"Setting `args.validation_steps` to {args.validation_epochs * len(train_dataset)}",
+            FutureWarning,
+            stacklevel=2,
+        )
+        args.validation_steps = args.validation_epochs * len(train_dataset)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    # Prepare everything with our `accelerator`.
+    text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        text_encoder, optimizer, train_dataloader, lr_scheduler
+    )
+
+    text_encoder = ORTModule(text_encoder)
+    unet = ORTModule(unet)
+    vae = ORTModule(vae)
+
+    # For mixed precision training we cast the unet and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae and unet to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("textual_inversion", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    # keep original embeddings as reference
+    orig_embeds_params = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight.data.clone()
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(text_encoder):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample().detach()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0].to(dtype=weight_dtype)
+
+                # Predict the noise residual
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+                # Let's make sure we don't update any embedding weights besides the newly added token
+                index_no_updates = torch.ones((len(tokenizer),), dtype=torch.bool)
+                index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False
+
+                with torch.no_grad():
+                    accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[
+                        index_no_updates
+                    ] = orig_embeds_params[index_no_updates]
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                images = []
+                progress_bar.update(1)
+                global_step += 1
+                if global_step % args.save_steps == 0:
+                    save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.bin")
+                    save_progress(text_encoder, placeholder_token_ids, accelerator, args, save_path)
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        images = log_validation(
+                            text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        if args.push_to_hub and not args.save_as_full_pipeline:
+            logger.warning("Enabling full model saving because --push_to_hub=True was specified.")
+            save_full_model = True
+        else:
+            save_full_model = args.save_as_full_pipeline
+        if save_full_model:
+            pipeline = StableDiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                text_encoder=accelerator.unwrap_model(text_encoder),
+                vae=vae,
+                unet=unet,
+                tokenizer=tokenizer,
+            )
+            pipeline.save_pretrained(args.output_dir)
+        # Save the newly trained embeddings
+        save_path = os.path.join(args.output_dir, "learned_embeds.bin")
+        save_progress(text_encoder, placeholder_token_ids, accelerator, args, save_path)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/README.md b/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c28ecefc9a3002b2f6c6d3d97e53047e82ab2733
--- /dev/null
+++ b/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/README.md
@@ -0,0 +1,50 @@
+## Training examples
+
+Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets).
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd in the example folder  and run
+```bash
+pip install -r requirements.txt
+```
+
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+#### Use ONNXRuntime to accelerate training
+
+In order to leverage onnxruntime to accelerate training, please use train_unconditional_ort.py
+
+The command to train a DDPM UNet model on the Oxford Flowers dataset with onnxruntime:
+
+```bash
+accelerate launch train_unconditional.py \
+  --dataset_name="huggan/flowers-102-categories" \
+  --resolution=64 --center_crop --random_flip \
+  --output_dir="ddpm-ema-flowers-64" \
+  --use_ema \
+  --train_batch_size=16 \
+  --num_epochs=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-4 \
+  --lr_warmup_steps=500 \
+  --mixed_precision=fp16
+  ```
+
+Please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on github with any questions.
diff --git a/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/requirements.txt b/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ca21143c42d9c08bf693fcc8cd11fed53acb895f
--- /dev/null
+++ b/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/requirements.txt
@@ -0,0 +1,4 @@
+accelerate>=0.16.0
+torchvision
+datasets
+tensorboard
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py b/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a00f7cc4a9adf7cde0cbd17b9aa3b2153927cdf
--- /dev/null
+++ b/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py
@@ -0,0 +1,697 @@
+import argparse
+import inspect
+import logging
+import math
+import os
+from pathlib import Path
+
+import accelerate
+import datasets
+import torch
+import torch.nn.functional as F
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from onnxruntime.training.optim.fp16_optimizer import FP16_Optimizer as ORT_FP16_Optimizer
+from onnxruntime.training.ortmodule import ORTModule
+from packaging import version
+from torchvision import transforms
+from tqdm.auto import tqdm
+
+import diffusers
+from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel
+from diffusers.utils import check_min_version, is_accelerate_version, is_tensorboard_available, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.17.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    if not isinstance(arr, torch.Tensor):
+        arr = torch.from_numpy(arr)
+    res = arr[timesteps].float().to(timesteps.device)
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res.expand(broadcast_shape)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that HF Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--model_config_name_or_path",
+        type=str,
+        default=None,
+        help="The config of the UNet model to train, leave as None to use standard DDPM configuration.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="ddpm-model-64",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--overwrite_output_dir", action="store_true")
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=64,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        default=False,
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--eval_batch_size", type=int, default=16, help="The number of images to generate for evaluation."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "The number of subprocesses to use for data loading. 0 means that the data will be loaded in the main"
+            " process."
+        ),
+    )
+    parser.add_argument("--num_epochs", type=int, default=100)
+    parser.add_argument("--save_images_epochs", type=int, default=10, help="How often to save images during training.")
+    parser.add_argument(
+        "--save_model_epochs", type=int, default=10, help="How often to save the model during training."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="cosine",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.95, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument(
+        "--adam_weight_decay", type=float, default=1e-6, help="Weight decay magnitude for the Adam optimizer."
+    )
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.")
+    parser.add_argument(
+        "--use_ema",
+        action="store_true",
+        help="Whether to use Exponential Moving Average for the final model weights.",
+    )
+    parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.")
+    parser.add_argument("--ema_power", type=float, default=3 / 4, help="The power value for the EMA decay.")
+    parser.add_argument("--ema_max_decay", type=float, default=0.9999, help="The maximum decay magnitude for EMA.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--hub_private_repo", action="store_true", help="Whether or not to create a private repository."
+    )
+    parser.add_argument(
+        "--logger",
+        type=str,
+        default="tensorboard",
+        choices=["tensorboard", "wandb"],
+        help=(
+            "Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)"
+            " for experiment tracking and logging of model metrics and model checkpoints"
+        ),
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default="epsilon",
+        choices=["epsilon", "sample"],
+        help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.",
+    )
+    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
+    parser.add_argument("--ddpm_num_inference_steps", type=int, default=1000)
+    parser.add_argument("--ddpm_beta_schedule", type=str, default="linear")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
+
+    return args
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.logger == "tensorboard":
+        if not is_tensorboard_available():
+            raise ImportError("Make sure to install tensorboard if you want to use it for logging during training.")
+
+    elif args.logger == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_model.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel)
+                ema_model.load_state_dict(load_model.state_dict())
+                ema_model.to(accelerator.device)
+                del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Initialize the model
+    if args.model_config_name_or_path is None:
+        model = UNet2DModel(
+            sample_size=args.resolution,
+            in_channels=3,
+            out_channels=3,
+            layers_per_block=2,
+            block_out_channels=(128, 128, 256, 256, 512, 512),
+            down_block_types=(
+                "DownBlock2D",
+                "DownBlock2D",
+                "DownBlock2D",
+                "DownBlock2D",
+                "AttnDownBlock2D",
+                "DownBlock2D",
+            ),
+            up_block_types=(
+                "UpBlock2D",
+                "AttnUpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+            ),
+        )
+    else:
+        config = UNet2DModel.load_config(args.model_config_name_or_path)
+        model = UNet2DModel.from_config(config)
+
+    # Create EMA for the model.
+    if args.use_ema:
+        ema_model = EMAModel(
+            model.parameters(),
+            decay=args.ema_max_decay,
+            use_ema_warmup=True,
+            inv_gamma=args.ema_inv_gamma,
+            power=args.ema_power,
+            model_cls=UNet2DModel,
+            model_config=model.config,
+        )
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            model.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Initialize the scheduler
+    accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys())
+    if accepts_prediction_type:
+        noise_scheduler = DDPMScheduler(
+            num_train_timesteps=args.ddpm_num_steps,
+            beta_schedule=args.ddpm_beta_schedule,
+            prediction_type=args.prediction_type,
+        )
+    else:
+        noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule)
+
+    # Initialize the optimizer
+    optimizer = torch.optim.AdamW(
+        model.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    optimizer = ORT_FP16_Optimizer(optimizer)
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            split="train",
+        )
+    else:
+        dataset = load_dataset("imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, split="train")
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets and DataLoaders creation.
+    augmentations = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def transform_images(examples):
+        images = [augmentations(image.convert("RGB")) for image in examples["image"]]
+        return {"input": images}
+
+    logger.info(f"Dataset size: {len(dataset)}")
+
+    dataset.set_transform(transform_images)
+    train_dataloader = torch.utils.data.DataLoader(
+        dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
+    )
+
+    # Initialize the learning rate scheduler
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=(len(train_dataloader) * args.num_epochs),
+    )
+
+    # Prepare everything with our `accelerator`.
+    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        model, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.use_ema:
+        ema_model.to(accelerator.device)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        run = os.path.split(__file__)[-1].split(".")[0]
+        accelerator.init_trackers(run)
+
+    model = ORTModule(model)
+
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    max_train_steps = args.num_epochs * num_update_steps_per_epoch
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(dataset)}")
+    logger.info(f"  Num Epochs = {args.num_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {max_train_steps}")
+
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Train!
+    for epoch in range(first_epoch, args.num_epochs):
+        model.train()
+        progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not accelerator.is_local_main_process)
+        progress_bar.set_description(f"Epoch {epoch}")
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            clean_images = batch["input"]
+            # Sample noise that we'll add to the images
+            noise = torch.randn(
+                clean_images.shape, dtype=(torch.float32 if args.mixed_precision == "no" else torch.float16)
+            ).to(clean_images.device)
+            bsz = clean_images.shape[0]
+            # Sample a random timestep for each image
+            timesteps = torch.randint(
+                0, noise_scheduler.config.num_train_timesteps, (bsz,), device=clean_images.device
+            ).long()
+
+            # Add noise to the clean images according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)
+
+            with accelerator.accumulate(model):
+                # Predict the noise residual
+                model_output = model(noisy_images, timesteps, return_dict=False)[0]
+
+                if args.prediction_type == "epsilon":
+                    loss = F.mse_loss(model_output, noise)  # this could have different weights!
+                elif args.prediction_type == "sample":
+                    alpha_t = _extract_into_tensor(
+                        noise_scheduler.alphas_cumprod, timesteps, (clean_images.shape[0], 1, 1, 1)
+                    )
+                    snr_weights = alpha_t / (1 - alpha_t)
+                    loss = snr_weights * F.mse_loss(
+                        model_output, clean_images, reduction="none"
+                    )  # use SNR weighting from distillation paper
+                    loss = loss.mean()
+                else:
+                    raise ValueError(f"Unsupported prediction type: {args.prediction_type}")
+
+                accelerator.backward(loss)
+
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(model.parameters(), 1.0)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_model.step(model.parameters())
+                progress_bar.update(1)
+                global_step += 1
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
+            if args.use_ema:
+                logs["ema_decay"] = ema_model.cur_decay_value
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+        progress_bar.close()
+
+        accelerator.wait_for_everyone()
+
+        # Generate sample images for visual inspection
+        if accelerator.is_main_process:
+            if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1:
+                unet = accelerator.unwrap_model(model)
+
+                if args.use_ema:
+                    ema_model.store(unet.parameters())
+                    ema_model.copy_to(unet.parameters())
+
+                pipeline = DDPMPipeline(
+                    unet=unet,
+                    scheduler=noise_scheduler,
+                )
+
+                generator = torch.Generator(device=pipeline.device).manual_seed(0)
+                # run pipeline in inference (sample random noise and denoise)
+                images = pipeline(
+                    generator=generator,
+                    batch_size=args.eval_batch_size,
+                    num_inference_steps=args.ddpm_num_inference_steps,
+                    output_type="np",
+                ).images
+
+                if args.use_ema:
+                    ema_model.restore(unet.parameters())
+
+                # denormalize the images and save to tensorboard
+                images_processed = (images * 255).round().astype("uint8")
+
+                if args.logger == "tensorboard":
+                    if is_accelerate_version(">=", "0.17.0.dev0"):
+                        tracker = accelerator.get_tracker("tensorboard", unwrap=True)
+                    else:
+                        tracker = accelerator.get_tracker("tensorboard")
+                    tracker.add_images("test_samples", images_processed.transpose(0, 3, 1, 2), epoch)
+                elif args.logger == "wandb":
+                    # Upcoming `log_images` helper coming in https://github.com/huggingface/accelerate/pull/962/files
+                    accelerator.get_tracker("wandb").log(
+                        {"test_samples": [wandb.Image(img) for img in images_processed], "epoch": epoch},
+                        step=global_step,
+                    )
+
+            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
+                # save the model
+                unet = accelerator.unwrap_model(model)
+
+                if args.use_ema:
+                    ema_model.store(unet.parameters())
+                    ema_model.copy_to(unet.parameters())
+
+                pipeline = DDPMPipeline(
+                    unet=unet,
+                    scheduler=noise_scheduler,
+                )
+
+                pipeline.save_pretrained(args.output_dir)
+
+                if args.use_ema:
+                    ema_model.restore(unet.parameters())
+
+                if args.push_to_hub:
+                    upload_folder(
+                        repo_id=repo_id,
+                        folder_path=args.output_dir,
+                        commit_message=f"Epoch {epoch}",
+                        ignore_patterns=["step_*", "epoch_*"],
+                    )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/promptdiffusion/README.md b/diffusers/examples/research_projects/promptdiffusion/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..33ffec312501f90bbe832829f5723c50b1d8899d
--- /dev/null
+++ b/diffusers/examples/research_projects/promptdiffusion/README.md
@@ -0,0 +1,49 @@
+# PromptDiffusion Pipeline
+
+From the project [page](https://zhendong-wang.github.io/prompt-diffusion.github.io/)
+
+"With a prompt consisting of a task-specific example pair of images and text guidance, and a new query image, Prompt Diffusion can comprehend the desired task and generate the corresponding output image on both seen (trained) and unseen (new) task types."
+
+For any usage questions, please refer to the [paper](https://arxiv.org/abs/2305.01115).
+
+Prepare models by converting them from the [checkpoint](https://huggingface.co/zhendongw/prompt-diffusion)
+
+To convert the controlnet, use cldm_v15.yaml from the [repository](https://github.com/Zhendong-Wang/Prompt-Diffusion/tree/main/models/):
+
+```bash
+python convert_original_promptdiffusion_to_diffusers.py --checkpoint_path path-to-network-step04999.ckpt --original_config_file path-to-cldm_v15.yaml --dump_path path-to-output-directory
+```
+
+To learn about how to convert the fine-tuned stable diffusion model, see the [Load different Stable Diffusion formats guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/other-formats).
+
+
+```py
+import torch
+from diffusers import UniPCMultistepScheduler
+from diffusers.utils import load_image
+from promptdiffusioncontrolnet import PromptDiffusionControlNetModel
+from pipeline_prompt_diffusion import PromptDiffusionPipeline
+
+
+from PIL import ImageOps
+
+image_a = ImageOps.invert(load_image("https://github.com/Zhendong-Wang/Prompt-Diffusion/blob/main/images_to_try/house_line.png?raw=true"))
+
+image_b = load_image("https://github.com/Zhendong-Wang/Prompt-Diffusion/blob/main/images_to_try/house.png?raw=true")
+query = ImageOps.invert(load_image("https://github.com/Zhendong-Wang/Prompt-Diffusion/blob/main/images_to_try/new_01.png?raw=true"))
+
+# load prompt diffusion controlnet and prompt diffusion
+
+controlnet = PromptDiffusionControlNetModel.from_pretrained("iczaw/prompt-diffusion-diffusers", subfolder="controlnet", torch_dtype=torch.float16)
+model_id = "path-to-model"
+pipe = PromptDiffusionPipeline.from_pretrained("iczaw/prompt-diffusion-diffusers", subfolder="base", controlnet=controlnet, torch_dtype=torch.float16, variant="fp16")
+
+# speed up diffusion process with faster scheduler and memory optimization
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+# remove following line if xformers is not installed
+pipe.enable_xformers_memory_efficient_attention()
+pipe.enable_model_cpu_offload()
+# generate image
+generator = torch.manual_seed(0)
+image = pipe("a tortoise", num_inference_steps=20, generator=generator, image_pair=[image_a,image_b], image=query).images[0]
+```
diff --git a/diffusers/examples/research_projects/promptdiffusion/convert_original_promptdiffusion_to_diffusers.py b/diffusers/examples/research_projects/promptdiffusion/convert_original_promptdiffusion_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..26b56a21e8658b1e68826e807d748d69a696fe7f
--- /dev/null
+++ b/diffusers/examples/research_projects/promptdiffusion/convert_original_promptdiffusion_to_diffusers.py
@@ -0,0 +1,2118 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for stable diffusion checkpoints which _only_ contain a controlnet."""
+
+import argparse
+import re
+from contextlib import nullcontext
+from io import BytesIO
+from typing import Dict, Optional, Union
+
+import requests
+import torch
+import yaml
+from promptdiffusioncontrolnet import PromptDiffusionControlNetModel
+from transformers import (
+    AutoFeatureExtractor,
+    BertTokenizerFast,
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers.models import (
+    AutoencoderKL,
+    ControlNetModel,
+    PriorTransformer,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
+from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+from diffusers.schedulers import (
+    DDIMScheduler,
+    DDPMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UnCLIPScheduler,
+)
+from diffusers.utils import is_accelerate_available, logging
+
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+    from accelerate.utils import set_module_tensor_to_device
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "to_q.weight")
+        new_item = new_item.replace("q.bias", "to_q.bias")
+
+        new_item = new_item.replace("k.weight", "to_k.weight")
+        new_item = new_item.replace("k.bias", "to_k.bias")
+
+        new_item = new_item.replace("v.weight", "to_v.weight")
+        new_item = new_item.replace("v.bias", "to_v.bias")
+
+        new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
+        new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        is_attn_weight = "proj_attn.weight" in new_path or ("attentions" in new_path and "to_" in new_path)
+        shape = old_checkpoint[path["old"]].shape
+        if is_attn_weight and len(shape) == 3:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        elif is_attn_weight and len(shape) == 4:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    if controlnet:
+        unet_params = original_config["model"]["params"]["control_stage_config"]["params"]
+    else:
+        if (
+            "unet_config" in original_config["model"]["params"]
+            and original_config["model"]["params"]["unet_config"] is not None
+        ):
+            unet_params = original_config["model"]["params"]["unet_config"]["params"]
+        else:
+            unet_params = original_config["model"]["params"]["network_config"]["params"]
+
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+
+    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    if unet_params["transformer_depth"] is not None:
+        transformer_layers_per_block = (
+            unet_params["transformer_depth"]
+            if isinstance(unet_params["transformer_depth"], int)
+            else list(unet_params["transformer_depth"])
+        )
+    else:
+        transformer_layers_per_block = 1
+
+    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)
+
+    head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None
+    use_linear_projection = (
+        unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False
+    )
+    if use_linear_projection:
+        # stable diffusion 2-base-512 and 2-768
+        if head_dim is None:
+            head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"]
+            head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])]
+
+    class_embed_type = None
+    addition_embed_type = None
+    addition_time_embed_dim = None
+    projection_class_embeddings_input_dim = None
+    context_dim = None
+
+    if unet_params["context_dim"] is not None:
+        context_dim = (
+            unet_params["context_dim"]
+            if isinstance(unet_params["context_dim"], int)
+            else unet_params["context_dim"][0]
+        )
+
+    if "num_classes" in unet_params:
+        if unet_params["num_classes"] == "sequential":
+            if context_dim in [2048, 1280]:
+                # SDXL
+                addition_embed_type = "text_time"
+                addition_time_embed_dim = 256
+            else:
+                class_embed_type = "projection"
+            assert "adm_in_channels" in unet_params
+            projection_class_embeddings_input_dim = unet_params["adm_in_channels"]
+
+    config = {
+        "sample_size": image_size // vae_scale_factor,
+        "in_channels": unet_params["in_channels"],
+        "down_block_types": tuple(down_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params["num_res_blocks"],
+        "cross_attention_dim": context_dim,
+        "attention_head_dim": head_dim,
+        "use_linear_projection": use_linear_projection,
+        "class_embed_type": class_embed_type,
+        "addition_embed_type": addition_embed_type,
+        "addition_time_embed_dim": addition_time_embed_dim,
+        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
+        "transformer_layers_per_block": transformer_layers_per_block,
+    }
+
+    if "disable_self_attentions" in unet_params:
+        config["only_cross_attention"] = unet_params["disable_self_attentions"]
+
+    if "num_classes" in unet_params and isinstance(unet_params["num_classes"], int):
+        config["num_class_embeds"] = unet_params["num_classes"]
+
+    if controlnet:
+        config["conditioning_channels"] = unet_params["hint_channels"]
+    else:
+        config["out_channels"] = unet_params["out_channels"]
+        config["up_block_types"] = tuple(up_block_types)
+
+    return config
+
+
+def create_vae_diffusers_config(original_config, image_size: int):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+    _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"]
+
+    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    config = {
+        "sample_size": image_size,
+        "in_channels": vae_params["in_channels"],
+        "out_channels": vae_params["out_ch"],
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "latent_channels": vae_params["z_channels"],
+        "layers_per_block": vae_params["num_res_blocks"],
+    }
+    return config
+
+
+def create_diffusers_schedular(original_config):
+    schedular = DDIMScheduler(
+        num_train_timesteps=original_config["model"]["params"]["timesteps"],
+        beta_start=original_config["model"]["params"]["linear_start"],
+        beta_end=original_config["model"]["params"]["linear_end"],
+        beta_schedule="scaled_linear",
+    )
+    return schedular
+
+
+def create_ldm_bert_config(original_config):
+    bert_params = original_config["model"]["params"]["cond_stage_config"]["params"]
+    config = LDMBertConfig(
+        d_model=bert_params.n_embed,
+        encoder_layers=bert_params.n_layer,
+        encoder_ffn_dim=bert_params.n_embed * 4,
+    )
+    return config
+
+
+def convert_ldm_unet_checkpoint(
+    checkpoint,
+    config,
+    path=None,
+    extract_ema=False,
+    controlnet=False,
+    skip_extract_state_dict=False,
+    promptdiffusion=False,
+):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+
+    if skip_extract_state_dict:
+        unet_state_dict = checkpoint
+    else:
+        # extract state_dict for UNet
+        unet_state_dict = {}
+        keys = list(checkpoint.keys())
+
+        if controlnet:
+            unet_key = "control_model."
+        else:
+            unet_key = "model.diffusion_model."
+
+        # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+        if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+            logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
+            logger.warning(
+                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+            )
+            for key in keys:
+                if key.startswith("model.diffusion_model"):
+                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+        else:
+            if sum(k.startswith("model_ema") for k in keys) > 100:
+                logger.warning(
+                    "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                    " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+                )
+
+            for key in keys:
+                if key.startswith(unet_key):
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    if config["class_embed_type"] is None:
+        # No parameters to port
+        ...
+    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+    else:
+        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+
+    if config["addition_embed_type"] == "text_time":
+        new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+
+    # Relevant to StableDiffusionUpscalePipeline
+    if "num_class_embeds" in config:
+        if (config["num_class_embeds"] is not None) and ("label_emb.weight" in unet_state_dict):
+            new_checkpoint["class_embedding.weight"] = unet_state_dict["label_emb.weight"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    if not controlnet:
+        new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+        new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+        new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+        new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    if controlnet and not promptdiffusion:
+        # conditioning embedding
+
+        orig_index = 0
+
+        new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+
+        orig_index += 2
+
+        diffusers_index = 0
+
+        while diffusers_index < 6:
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.weight"
+            )
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.bias"
+            )
+            diffusers_index += 1
+            orig_index += 2
+
+        new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+
+        # down blocks
+        for i in range(num_input_blocks):
+            new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
+            new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
+
+        # mid block
+        new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
+        new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
+
+    if promptdiffusion:
+        # conditioning embedding
+
+        orig_index = 0
+
+        new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+
+        new_checkpoint["controlnet_query_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
+            f"input_cond_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_query_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
+            f"input_cond_block.{orig_index}.bias"
+        )
+        orig_index += 2
+
+        diffusers_index = 0
+
+        while diffusers_index < 6:
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.weight"
+            )
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.bias"
+            )
+            new_checkpoint[f"controlnet_query_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+                f"input_cond_block.{orig_index}.weight"
+            )
+            new_checkpoint[f"controlnet_query_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+                f"input_cond_block.{orig_index}.bias"
+            )
+            diffusers_index += 1
+            orig_index += 2
+
+        new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+
+        new_checkpoint["controlnet_query_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+            f"input_cond_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_query_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+            f"input_cond_block.{orig_index}.bias"
+        )
+        # down blocks
+        for i in range(num_input_blocks):
+            new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
+            new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
+
+        # mid block
+        new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
+        new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
+
+    return new_checkpoint
+
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    keys = list(checkpoint.keys())
+    vae_key = "first_stage_model." if any(k.startswith("first_stage_model.") for k in keys) else ""
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+def convert_ldm_bert_checkpoint(checkpoint, config):
+    def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
+        hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
+        hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
+        hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
+
+        hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
+        hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
+
+    def _copy_linear(hf_linear, pt_linear):
+        hf_linear.weight = pt_linear.weight
+        hf_linear.bias = pt_linear.bias
+
+    def _copy_layer(hf_layer, pt_layer):
+        # copy layer norms
+        _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
+        _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
+
+        # copy attn
+        _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
+
+        # copy MLP
+        pt_mlp = pt_layer[1][1]
+        _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
+        _copy_linear(hf_layer.fc2, pt_mlp.net[2])
+
+    def _copy_layers(hf_layers, pt_layers):
+        for i, hf_layer in enumerate(hf_layers):
+            if i != 0:
+                i += i
+            pt_layer = pt_layers[i : i + 2]
+            _copy_layer(hf_layer, pt_layer)
+
+    hf_model = LDMBertModel(config).eval()
+
+    # copy  embeds
+    hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
+    hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight
+
+    # copy layer norm
+    _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)
+
+    # copy hidden layers
+    _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)
+
+    _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)
+
+    return hf_model
+
+
+def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
+    if text_encoder is None:
+        config_name = "openai/clip-vit-large-patch14"
+        try:
+            config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'."
+            )
+
+        ctx = init_empty_weights if is_accelerate_available() else nullcontext
+        with ctx():
+            text_model = CLIPTextModel(config)
+    else:
+        text_model = text_encoder
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    remove_prefixes = ["cond_stage_model.transformer", "conditioner.embedders.0.transformer"]
+
+    for key in keys:
+        for prefix in remove_prefixes:
+            if key.startswith(prefix):
+                text_model_dict[key[len(prefix + ".") :]] = checkpoint[key]
+
+    if is_accelerate_available():
+        for param_name, param in text_model_dict.items():
+            set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+    else:
+        if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)):
+            text_model_dict.pop("text_model.embeddings.position_ids", None)
+
+        text_model.load_state_dict(text_model_dict)
+
+    return text_model
+
+
+textenc_conversion_lst = [
+    ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
+    ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
+    ("ln_final.weight", "text_model.final_layer_norm.weight"),
+    ("ln_final.bias", "text_model.final_layer_norm.bias"),
+    ("text_projection", "text_projection.weight"),
+]
+textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
+
+textenc_transformer_conversion_lst = [
+    # (stable-diffusion, HF Diffusers)
+    ("resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "transformer.text_model.final_layer_norm."),
+    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
+    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
+]
+protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
+textenc_pattern = re.compile("|".join(protected.keys()))
+
+
+def convert_paint_by_example_checkpoint(checkpoint, local_files_only=False):
+    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+    model = PaintByExampleImageEncoder(config)
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    for key in keys:
+        if key.startswith("cond_stage_model.transformer"):
+            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
+
+    # load clip vision
+    model.model.load_state_dict(text_model_dict)
+
+    # load mapper
+    keys_mapper = {
+        k[len("cond_stage_model.mapper.res") :]: v
+        for k, v in checkpoint.items()
+        if k.startswith("cond_stage_model.mapper")
+    }
+
+    MAPPING = {
+        "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"],
+        "attn.c_proj": ["attn1.to_out.0"],
+        "ln_1": ["norm1"],
+        "ln_2": ["norm3"],
+        "mlp.c_fc": ["ff.net.0.proj"],
+        "mlp.c_proj": ["ff.net.2"],
+    }
+
+    mapped_weights = {}
+    for key, value in keys_mapper.items():
+        prefix = key[: len("blocks.i")]
+        suffix = key.split(prefix)[-1].split(".")[-1]
+        name = key.split(prefix)[-1].split(suffix)[0][1:-1]
+        mapped_names = MAPPING[name]
+
+        num_splits = len(mapped_names)
+        for i, mapped_name in enumerate(mapped_names):
+            new_name = ".".join([prefix, mapped_name, suffix])
+            shape = value.shape[0] // num_splits
+            mapped_weights[new_name] = value[i * shape : (i + 1) * shape]
+
+    model.mapper.load_state_dict(mapped_weights)
+
+    # load final layer norm
+    model.final_layer_norm.load_state_dict(
+        {
+            "bias": checkpoint["cond_stage_model.final_ln.bias"],
+            "weight": checkpoint["cond_stage_model.final_ln.weight"],
+        }
+    )
+
+    # load final proj
+    model.proj_out.load_state_dict(
+        {
+            "bias": checkpoint["proj_out.bias"],
+            "weight": checkpoint["proj_out.weight"],
+        }
+    )
+
+    # load uncond vector
+    model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"])
+    return model
+
+
+def convert_open_clip_checkpoint(
+    checkpoint,
+    config_name,
+    prefix="cond_stage_model.model.",
+    has_projection=False,
+    local_files_only=False,
+    **config_kwargs,
+):
+    # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
+    # text_model = CLIPTextModelWithProjection.from_pretrained(
+    #    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
+    # )
+    try:
+        config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only)
+    except Exception:
+        raise ValueError(
+            f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: '{config_name}'."
+        )
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config)
+
+    keys = list(checkpoint.keys())
+
+    keys_to_ignore = []
+    if config_name == "stabilityai/stable-diffusion-2" and config.num_hidden_layers == 23:
+        # make sure to remove all keys > 22
+        keys_to_ignore += [k for k in keys if k.startswith("cond_stage_model.model.transformer.resblocks.23")]
+        keys_to_ignore += ["cond_stage_model.model.text_projection"]
+
+    text_model_dict = {}
+
+    if prefix + "text_projection" in checkpoint:
+        d_model = int(checkpoint[prefix + "text_projection"].shape[0])
+    else:
+        d_model = 1024
+
+    text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
+
+    for key in keys:
+        if key in keys_to_ignore:
+            continue
+        if key[len(prefix) :] in textenc_conversion_map:
+            if key.endswith("text_projection"):
+                value = checkpoint[key].T.contiguous()
+            else:
+                value = checkpoint[key]
+
+            text_model_dict[textenc_conversion_map[key[len(prefix) :]]] = value
+
+        if key.startswith(prefix + "transformer."):
+            new_key = key[len(prefix + "transformer.") :]
+            if new_key.endswith(".in_proj_weight"):
+                new_key = new_key[: -len(".in_proj_weight")]
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
+                text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
+                text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
+            elif new_key.endswith(".in_proj_bias"):
+                new_key = new_key[: -len(".in_proj_bias")]
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
+                text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
+                text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
+            else:
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+
+                text_model_dict[new_key] = checkpoint[key]
+
+    if is_accelerate_available():
+        for param_name, param in text_model_dict.items():
+            set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+    else:
+        if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)):
+            text_model_dict.pop("text_model.embeddings.position_ids", None)
+
+        text_model.load_state_dict(text_model_dict)
+
+    return text_model
+
+
+def stable_unclip_image_encoder(original_config, local_files_only=False):
+    """
+    Returns the image processor and clip image encoder for the img2img unclip pipeline.
+
+    We currently know of two types of stable unclip models which separately use the clip and the openclip image
+    encoders.
+    """
+
+    image_embedder_config = original_config["model"]["params"]["embedder_config"]
+
+    sd_clip_image_embedder_class = image_embedder_config["target"]
+    sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1]
+
+    if sd_clip_image_embedder_class == "ClipImageEmbedder":
+        clip_model_name = image_embedder_config.params.model
+
+        if clip_model_name == "ViT-L/14":
+            feature_extractor = CLIPImageProcessor()
+            image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+                "openai/clip-vit-large-patch14", local_files_only=local_files_only
+            )
+        else:
+            raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}")
+
+    elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder":
+        feature_extractor = CLIPImageProcessor()
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", local_files_only=local_files_only
+        )
+    else:
+        raise NotImplementedError(
+            f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}"
+        )
+
+    return feature_extractor, image_encoder
+
+
+def stable_unclip_image_noising_components(
+    original_config, clip_stats_path: Optional[str] = None, device: Optional[str] = None
+):
+    """
+    Returns the noising components for the img2img and txt2img unclip pipelines.
+
+    Converts the stability noise augmentor into
+    1. a `StableUnCLIPImageNormalizer` for holding the CLIP stats
+    2. a `DDPMScheduler` for holding the noise schedule
+
+    If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided.
+    """
+    noise_aug_config = original_config["model"]["params"]["noise_aug_config"]
+    noise_aug_class = noise_aug_config["target"]
+    noise_aug_class = noise_aug_class.split(".")[-1]
+
+    if noise_aug_class == "CLIPEmbeddingNoiseAugmentation":
+        noise_aug_config = noise_aug_config.params
+        embedding_dim = noise_aug_config.timestep_dim
+        max_noise_level = noise_aug_config.noise_schedule_config.timesteps
+        beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule
+
+        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim)
+        image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule)
+
+        if "clip_stats_path" in noise_aug_config:
+            if clip_stats_path is None:
+                raise ValueError("This stable unclip config requires a `clip_stats_path`")
+
+            clip_mean, clip_std = torch.load(clip_stats_path, map_location=device)
+            clip_mean = clip_mean[None, :]
+            clip_std = clip_std[None, :]
+
+            clip_stats_state_dict = {
+                "mean": clip_mean,
+                "std": clip_std,
+            }
+
+            image_normalizer.load_state_dict(clip_stats_state_dict)
+    else:
+        raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}")
+
+    return image_normalizer, image_noising_scheduler
+
+
+def convert_controlnet_checkpoint(
+    checkpoint,
+    original_config,
+    checkpoint_path,
+    image_size,
+    upcast_attention,
+    extract_ema,
+    use_linear_projection=None,
+    cross_attention_dim=None,
+):
+    ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)
+    ctrlnet_config["upcast_attention"] = upcast_attention
+
+    ctrlnet_config.pop("sample_size")
+
+    if use_linear_projection is not None:
+        ctrlnet_config["use_linear_projection"] = use_linear_projection
+
+    if cross_attention_dim is not None:
+        ctrlnet_config["cross_attention_dim"] = cross_attention_dim
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        controlnet = ControlNetModel(**ctrlnet_config)
+
+    # Some controlnet ckpt files are distributed independently from the rest of the
+    # model components i.e. https://huggingface.co/thibaud/controlnet-sd21/
+    if "time_embed.0.weight" in checkpoint:
+        skip_extract_state_dict = True
+    else:
+        skip_extract_state_dict = False
+
+    converted_ctrl_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint,
+        ctrlnet_config,
+        path=checkpoint_path,
+        extract_ema=extract_ema,
+        controlnet=True,
+        skip_extract_state_dict=skip_extract_state_dict,
+    )
+
+    if is_accelerate_available():
+        for param_name, param in converted_ctrl_checkpoint.items():
+            set_module_tensor_to_device(controlnet, param_name, "cpu", value=param)
+    else:
+        controlnet.load_state_dict(converted_ctrl_checkpoint)
+
+    return controlnet
+
+
+def convert_promptdiffusion_checkpoint(
+    checkpoint,
+    original_config,
+    checkpoint_path,
+    image_size,
+    upcast_attention,
+    extract_ema,
+    use_linear_projection=None,
+    cross_attention_dim=None,
+):
+    ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)
+    ctrlnet_config["upcast_attention"] = upcast_attention
+
+    ctrlnet_config.pop("sample_size")
+
+    if use_linear_projection is not None:
+        ctrlnet_config["use_linear_projection"] = use_linear_projection
+
+    if cross_attention_dim is not None:
+        ctrlnet_config["cross_attention_dim"] = cross_attention_dim
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        controlnet = PromptDiffusionControlNetModel(**ctrlnet_config)
+
+    # Some controlnet ckpt files are distributed independently from the rest of the
+    # model components i.e. https://huggingface.co/thibaud/controlnet-sd21/
+    if "time_embed.0.weight" in checkpoint:
+        skip_extract_state_dict = True
+    else:
+        skip_extract_state_dict = False
+
+    converted_ctrl_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint,
+        ctrlnet_config,
+        path=checkpoint_path,
+        extract_ema=extract_ema,
+        promptdiffusion=True,
+        controlnet=True,
+        skip_extract_state_dict=skip_extract_state_dict,
+    )
+
+    if is_accelerate_available():
+        for param_name, param in converted_ctrl_checkpoint.items():
+            set_module_tensor_to_device(controlnet, param_name, "cpu", value=param)
+    else:
+        controlnet.load_state_dict(converted_ctrl_checkpoint)
+
+    return controlnet
+
+
+def download_from_original_stable_diffusion_ckpt(
+    checkpoint_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+    original_config_file: str = None,
+    image_size: Optional[int] = None,
+    prediction_type: str = None,
+    model_type: str = None,
+    extract_ema: bool = False,
+    scheduler_type: str = "pndm",
+    num_in_channels: Optional[int] = None,
+    upcast_attention: Optional[bool] = None,
+    device: str = None,
+    from_safetensors: bool = False,
+    stable_unclip: Optional[str] = None,
+    stable_unclip_prior: Optional[str] = None,
+    clip_stats_path: Optional[str] = None,
+    controlnet: Optional[bool] = None,
+    adapter: Optional[bool] = None,
+    load_safety_checker: bool = True,
+    pipeline_class: DiffusionPipeline = None,
+    local_files_only=False,
+    vae_path=None,
+    vae=None,
+    text_encoder=None,
+    text_encoder_2=None,
+    tokenizer=None,
+    tokenizer_2=None,
+    config_files=None,
+) -> DiffusionPipeline:
+    """
+    Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
+    config file.
+
+    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
+    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
+    recommended that you override the default values and/or supply an `original_config_file` wherever possible.
+
+    Args:
+        checkpoint_path_or_dict (`str` or `dict`): Path to `.ckpt` file, or the state dict.
+        original_config_file (`str`):
+            Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
+            inferred by looking for a key that only exists in SD2.0 models.
+        image_size (`int`, *optional*, defaults to 512):
+            The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2
+            Base. Use 768 for Stable Diffusion v2.
+        prediction_type (`str`, *optional*):
+            The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable
+            Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2.
+        num_in_channels (`int`, *optional*, defaults to None):
+            The number of input channels. If `None`, it will be automatically inferred.
+        scheduler_type (`str`, *optional*, defaults to 'pndm'):
+            Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
+            "ddim"]`.
+        model_type (`str`, *optional*, defaults to `None`):
+            The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder",
+            "FrozenCLIPEmbedder", "PaintByExample"]`.
+        is_img2img (`bool`, *optional*, defaults to `False`):
+            Whether the model should be loaded as an img2img pipeline.
+        extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
+            checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
+            `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
+            inference. Non-EMA weights are usually better to continue fine-tuning.
+        upcast_attention (`bool`, *optional*, defaults to `None`):
+            Whether the attention computation should always be upcasted. This is necessary when running stable
+            diffusion 2.1.
+        device (`str`, *optional*, defaults to `None`):
+            The device to use. Pass `None` to determine automatically.
+        from_safetensors (`str`, *optional*, defaults to `False`):
+            If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
+        load_safety_checker (`bool`, *optional*, defaults to `True`):
+            Whether to load the safety checker or not. Defaults to `True`.
+        pipeline_class (`str`, *optional*, defaults to `None`):
+            The pipeline class to use. Pass `None` to determine automatically.
+        local_files_only (`bool`, *optional*, defaults to `False`):
+            Whether or not to only look at local files (i.e., do not try to download the model).
+        vae (`AutoencoderKL`, *optional*, defaults to `None`):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. If
+            this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed.
+        text_encoder (`CLIPTextModel`, *optional*, defaults to `None`):
+            An instance of [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)
+            to use, specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)
+            variant. If this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed.
+        tokenizer (`CLIPTokenizer`, *optional*, defaults to `None`):
+            An instance of
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
+            to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by itself, if
+            needed.
+        config_files (`Dict[str, str]`, *optional*, defaults to `None`):
+            A dictionary mapping from config file names to their contents. If this parameter is `None`, the function
+            will load the config files by itself, if needed. Valid keys are:
+                - `v1`: Config file for Stable Diffusion v1
+                - `v2`: Config file for Stable Diffusion v2
+                - `xl`: Config file for Stable Diffusion XL
+                - `xl_refiner`: Config file for Stable Diffusion XL Refiner
+        return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
+    """
+
+    # import pipelines here to avoid circular import error when using from_single_file method
+    from diffusers import (
+        LDMTextToImagePipeline,
+        PaintByExamplePipeline,
+        StableDiffusionControlNetPipeline,
+        StableDiffusionInpaintPipeline,
+        StableDiffusionPipeline,
+        StableDiffusionUpscalePipeline,
+        StableDiffusionXLControlNetInpaintPipeline,
+        StableDiffusionXLImg2ImgPipeline,
+        StableDiffusionXLInpaintPipeline,
+        StableDiffusionXLPipeline,
+        StableUnCLIPImg2ImgPipeline,
+        StableUnCLIPPipeline,
+    )
+
+    if prediction_type == "v-prediction":
+        prediction_type = "v_prediction"
+
+    if isinstance(checkpoint_path_or_dict, str):
+        if from_safetensors:
+            from safetensors.torch import load_file as safe_load
+
+            checkpoint = safe_load(checkpoint_path_or_dict, device="cpu")
+        else:
+            if device is None:
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                checkpoint = torch.load(checkpoint_path_or_dict, map_location=device)
+            else:
+                checkpoint = torch.load(checkpoint_path_or_dict, map_location=device)
+    elif isinstance(checkpoint_path_or_dict, dict):
+        checkpoint = checkpoint_path_or_dict
+
+    # Sometimes models don't have the global_step item
+    if "global_step" in checkpoint:
+        global_step = checkpoint["global_step"]
+    else:
+        logger.debug("global_step key not found in model")
+        global_step = None
+
+    # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
+    # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
+    while "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    if original_config_file is None:
+        key_name_v2_1 = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
+        key_name_sd_xl_base = "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias"
+        key_name_sd_xl_refiner = "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias"
+        is_upscale = pipeline_class == StableDiffusionUpscalePipeline
+
+        config_url = None
+
+        # model_type = "v1"
+        if config_files is not None and "v1" in config_files:
+            original_config_file = config_files["v1"]
+        else:
+            config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+
+        if key_name_v2_1 in checkpoint and checkpoint[key_name_v2_1].shape[-1] == 1024:
+            # model_type = "v2"
+            if config_files is not None and "v2" in config_files:
+                original_config_file = config_files["v2"]
+            else:
+                config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"
+            if global_step == 110000:
+                # v2.1 needs to upcast attention
+                upcast_attention = True
+        elif key_name_sd_xl_base in checkpoint:
+            # only base xl has two text embedders
+            if config_files is not None and "xl" in config_files:
+                original_config_file = config_files["xl"]
+            else:
+                config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
+        elif key_name_sd_xl_refiner in checkpoint:
+            # only refiner xl has embedder and one text embedders
+            if config_files is not None and "xl_refiner" in config_files:
+                original_config_file = config_files["xl_refiner"]
+            else:
+                config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml"
+
+        if is_upscale:
+            config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml"
+
+        if config_url is not None:
+            original_config_file = BytesIO(requests.get(config_url).content)
+        else:
+            with open(original_config_file, "r") as f:
+                original_config_file = f.read()
+
+    original_config = yaml.safe_load(original_config_file)
+
+    # Convert the text model.
+    if (
+        model_type is None
+        and "cond_stage_config" in original_config["model"]["params"]
+        and original_config["model"]["params"]["cond_stage_config"] is not None
+    ):
+        model_type = original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1]
+        logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}")
+    elif model_type is None and original_config["model"]["params"]["network_config"] is not None:
+        if original_config["model"]["params"]["network_config"]["params"]["context_dim"] == 2048:
+            model_type = "SDXL"
+        else:
+            model_type = "SDXL-Refiner"
+        if image_size is None:
+            image_size = 1024
+
+    if pipeline_class is None:
+        # Check if we have a SDXL or SD model and initialize default pipeline
+        if model_type not in ["SDXL", "SDXL-Refiner"]:
+            pipeline_class = StableDiffusionPipeline if not controlnet else StableDiffusionControlNetPipeline
+        else:
+            pipeline_class = StableDiffusionXLPipeline if model_type == "SDXL" else StableDiffusionXLImg2ImgPipeline
+
+    if num_in_channels is None and pipeline_class in [
+        StableDiffusionInpaintPipeline,
+        StableDiffusionXLInpaintPipeline,
+        StableDiffusionXLControlNetInpaintPipeline,
+    ]:
+        num_in_channels = 9
+    if num_in_channels is None and pipeline_class == StableDiffusionUpscalePipeline:
+        num_in_channels = 7
+    elif num_in_channels is None:
+        num_in_channels = 4
+
+    if "unet_config" in original_config["model"]["params"]:
+        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+
+    if (
+        "parameterization" in original_config["model"]["params"]
+        and original_config["model"]["params"]["parameterization"] == "v"
+    ):
+        if prediction_type is None:
+            # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
+            # as it relies on a brittle global step parameter here
+            prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
+        if image_size is None:
+            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
+            # as it relies on a brittle global step parameter here
+            image_size = 512 if global_step == 875000 else 768
+    else:
+        if prediction_type is None:
+            prediction_type = "epsilon"
+        if image_size is None:
+            image_size = 512
+
+    if controlnet is None and "control_stage_config" in original_config["model"]["params"]:
+        path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
+        controlnet = convert_controlnet_checkpoint(
+            checkpoint, original_config, path, image_size, upcast_attention, extract_ema
+        )
+
+    if "timesteps" in original_config["model"]["params"]:
+        num_train_timesteps = original_config["model"]["params"]["timesteps"]
+    else:
+        num_train_timesteps = 1000
+
+    if model_type in ["SDXL", "SDXL-Refiner"]:
+        scheduler_dict = {
+            "beta_schedule": "scaled_linear",
+            "beta_start": 0.00085,
+            "beta_end": 0.012,
+            "interpolation_type": "linear",
+            "num_train_timesteps": num_train_timesteps,
+            "prediction_type": "epsilon",
+            "sample_max_value": 1.0,
+            "set_alpha_to_one": False,
+            "skip_prk_steps": True,
+            "steps_offset": 1,
+            "timestep_spacing": "leading",
+        }
+        scheduler = EulerDiscreteScheduler.from_config(scheduler_dict)
+        scheduler_type = "euler"
+    else:
+        if "linear_start" in original_config["model"]["params"]:
+            beta_start = original_config["model"]["params"]["linear_start"]
+        else:
+            beta_start = 0.02
+
+        if "linear_end" in original_config["model"]["params"]:
+            beta_end = original_config["model"]["params"]["linear_end"]
+        else:
+            beta_end = 0.085
+        scheduler = DDIMScheduler(
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            beta_start=beta_start,
+            num_train_timesteps=num_train_timesteps,
+            steps_offset=1,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            prediction_type=prediction_type,
+        )
+    # make sure scheduler works correctly with DDIM
+    scheduler.register_to_config(clip_sample=False)
+
+    if scheduler_type == "pndm":
+        config = dict(scheduler.config)
+        config["skip_prk_steps"] = True
+        scheduler = PNDMScheduler.from_config(config)
+    elif scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "heun":
+        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+    elif scheduler_type == "ddim":
+        scheduler = scheduler
+    else:
+        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+
+    if pipeline_class == StableDiffusionUpscalePipeline:
+        image_size = original_config["model"]["params"]["unet_config"]["params"]["image_size"]
+
+    # Convert the UNet2DConditionModel model.
+    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    unet_config["upcast_attention"] = upcast_attention
+
+    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=path, extract_ema=extract_ema
+    )
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        unet = UNet2DConditionModel(**unet_config)
+
+    if is_accelerate_available():
+        if model_type not in ["SDXL", "SDXL-Refiner"]:  # SBM Delay this.
+            for param_name, param in converted_unet_checkpoint.items():
+                set_module_tensor_to_device(unet, param_name, "cpu", value=param)
+    else:
+        unet.load_state_dict(converted_unet_checkpoint)
+
+    # Convert the VAE model.
+    if vae_path is None and vae is None:
+        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+        if (
+            "model" in original_config
+            and "params" in original_config["model"]
+            and "scale_factor" in original_config["model"]["params"]
+        ):
+            vae_scaling_factor = original_config["model"]["params"]["scale_factor"]
+        else:
+            vae_scaling_factor = 0.18215  # default SD scaling factor
+
+        vae_config["scaling_factor"] = vae_scaling_factor
+
+        ctx = init_empty_weights if is_accelerate_available() else nullcontext
+        with ctx():
+            vae = AutoencoderKL(**vae_config)
+
+        if is_accelerate_available():
+            for param_name, param in converted_vae_checkpoint.items():
+                set_module_tensor_to_device(vae, param_name, "cpu", value=param)
+        else:
+            vae.load_state_dict(converted_vae_checkpoint)
+    elif vae is None:
+        vae = AutoencoderKL.from_pretrained(vae_path, local_files_only=local_files_only)
+
+    if model_type == "FrozenOpenCLIPEmbedder":
+        config_name = "stabilityai/stable-diffusion-2"
+        config_kwargs = {"subfolder": "text_encoder"}
+
+        if text_encoder is None:
+            text_model = convert_open_clip_checkpoint(
+                checkpoint, config_name, local_files_only=local_files_only, **config_kwargs
+            )
+        else:
+            text_model = text_encoder
+
+        try:
+            tokenizer = CLIPTokenizer.from_pretrained(
+                "stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'stabilityai/stable-diffusion-2'."
+            )
+
+        if stable_unclip is None:
+            if controlnet:
+                pipe = pipeline_class(
+                    vae=vae,
+                    text_encoder=text_model,
+                    tokenizer=tokenizer,
+                    unet=unet,
+                    scheduler=scheduler,
+                    controlnet=controlnet,
+                    safety_checker=None,
+                    feature_extractor=None,
+                )
+                if hasattr(pipe, "requires_safety_checker"):
+                    pipe.requires_safety_checker = False
+
+            elif pipeline_class == StableDiffusionUpscalePipeline:
+                scheduler = DDIMScheduler.from_pretrained(
+                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler"
+                )
+                low_res_scheduler = DDPMScheduler.from_pretrained(
+                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler"
+                )
+
+                pipe = pipeline_class(
+                    vae=vae,
+                    text_encoder=text_model,
+                    tokenizer=tokenizer,
+                    unet=unet,
+                    scheduler=scheduler,
+                    low_res_scheduler=low_res_scheduler,
+                    safety_checker=None,
+                    feature_extractor=None,
+                )
+
+            else:
+                pipe = pipeline_class(
+                    vae=vae,
+                    text_encoder=text_model,
+                    tokenizer=tokenizer,
+                    unet=unet,
+                    scheduler=scheduler,
+                    safety_checker=None,
+                    feature_extractor=None,
+                )
+                if hasattr(pipe, "requires_safety_checker"):
+                    pipe.requires_safety_checker = False
+
+        else:
+            image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components(
+                original_config, clip_stats_path=clip_stats_path, device=device
+            )
+
+            if stable_unclip == "img2img":
+                feature_extractor, image_encoder = stable_unclip_image_encoder(original_config)
+
+                pipe = StableUnCLIPImg2ImgPipeline(
+                    # image encoding components
+                    feature_extractor=feature_extractor,
+                    image_encoder=image_encoder,
+                    # image noising components
+                    image_normalizer=image_normalizer,
+                    image_noising_scheduler=image_noising_scheduler,
+                    # regular denoising components
+                    tokenizer=tokenizer,
+                    text_encoder=text_model,
+                    unet=unet,
+                    scheduler=scheduler,
+                    # vae
+                    vae=vae,
+                )
+            elif stable_unclip == "txt2img":
+                if stable_unclip_prior is None or stable_unclip_prior == "karlo":
+                    karlo_model = "kakaobrain/karlo-v1-alpha"
+                    prior = PriorTransformer.from_pretrained(
+                        karlo_model, subfolder="prior", local_files_only=local_files_only
+                    )
+
+                    try:
+                        prior_tokenizer = CLIPTokenizer.from_pretrained(
+                            "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                        )
+                    except Exception:
+                        raise ValueError(
+                            f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+                        )
+                    prior_text_model = CLIPTextModelWithProjection.from_pretrained(
+                        "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                    )
+
+                    prior_scheduler = UnCLIPScheduler.from_pretrained(
+                        karlo_model, subfolder="prior_scheduler", local_files_only=local_files_only
+                    )
+                    prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
+                else:
+                    raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}")
+
+                pipe = StableUnCLIPPipeline(
+                    # prior components
+                    prior_tokenizer=prior_tokenizer,
+                    prior_text_encoder=prior_text_model,
+                    prior=prior,
+                    prior_scheduler=prior_scheduler,
+                    # image noising components
+                    image_normalizer=image_normalizer,
+                    image_noising_scheduler=image_noising_scheduler,
+                    # regular denoising components
+                    tokenizer=tokenizer,
+                    text_encoder=text_model,
+                    unet=unet,
+                    scheduler=scheduler,
+                    # vae
+                    vae=vae,
+                )
+            else:
+                raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}")
+    elif model_type == "PaintByExample":
+        vision_model = convert_paint_by_example_checkpoint(checkpoint)
+        try:
+            tokenizer = CLIPTokenizer.from_pretrained(
+                "openai/clip-vit-large-patch14", local_files_only=local_files_only
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+            )
+        try:
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the feature_extractor in the following path: 'CompVis/stable-diffusion-safety-checker'."
+            )
+        pipe = PaintByExamplePipeline(
+            vae=vae,
+            image_encoder=vision_model,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=None,
+            feature_extractor=feature_extractor,
+        )
+    elif model_type == "FrozenCLIPEmbedder":
+        text_model = convert_ldm_clip_checkpoint(
+            checkpoint, local_files_only=local_files_only, text_encoder=text_encoder
+        )
+        try:
+            tokenizer = (
+                CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+                if tokenizer is None
+                else tokenizer
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+            )
+
+        if load_safety_checker:
+            safety_checker = StableDiffusionSafetyChecker.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+        else:
+            safety_checker = None
+            feature_extractor = None
+
+        if controlnet:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_model,
+                tokenizer=tokenizer,
+                unet=unet,
+                controlnet=controlnet,
+                scheduler=scheduler,
+                safety_checker=safety_checker,
+                feature_extractor=feature_extractor,
+            )
+        else:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_model,
+                tokenizer=tokenizer,
+                unet=unet,
+                scheduler=scheduler,
+                safety_checker=safety_checker,
+                feature_extractor=feature_extractor,
+            )
+    elif model_type in ["SDXL", "SDXL-Refiner"]:
+        is_refiner = model_type == "SDXL-Refiner"
+
+        if (is_refiner is False) and (tokenizer is None):
+            try:
+                tokenizer = CLIPTokenizer.from_pretrained(
+                    "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                )
+            except Exception:
+                raise ValueError(
+                    f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+                )
+
+        if (is_refiner is False) and (text_encoder is None):
+            text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
+
+        if tokenizer_2 is None:
+            try:
+                tokenizer_2 = CLIPTokenizer.from_pretrained(
+                    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only
+                )
+            except Exception:
+                raise ValueError(
+                    f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'."
+                )
+
+        if text_encoder_2 is None:
+            config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+            config_kwargs = {"projection_dim": 1280}
+            prefix = "conditioner.embedders.0.model." if is_refiner else "conditioner.embedders.1.model."
+
+            text_encoder_2 = convert_open_clip_checkpoint(
+                checkpoint,
+                config_name,
+                prefix=prefix,
+                has_projection=True,
+                local_files_only=local_files_only,
+                **config_kwargs,
+            )
+
+        if is_accelerate_available():  # SBM Now move model to cpu.
+            for param_name, param in converted_unet_checkpoint.items():
+                set_module_tensor_to_device(unet, param_name, "cpu", value=param)
+
+        if controlnet:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                text_encoder_2=text_encoder_2,
+                tokenizer_2=tokenizer_2,
+                unet=unet,
+                controlnet=controlnet,
+                scheduler=scheduler,
+                force_zeros_for_empty_prompt=True,
+            )
+        elif adapter:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                text_encoder_2=text_encoder_2,
+                tokenizer_2=tokenizer_2,
+                unet=unet,
+                adapter=adapter,
+                scheduler=scheduler,
+                force_zeros_for_empty_prompt=True,
+            )
+
+        else:
+            pipeline_kwargs = {
+                "vae": vae,
+                "text_encoder": text_encoder,
+                "tokenizer": tokenizer,
+                "text_encoder_2": text_encoder_2,
+                "tokenizer_2": tokenizer_2,
+                "unet": unet,
+                "scheduler": scheduler,
+            }
+
+            if (pipeline_class == StableDiffusionXLImg2ImgPipeline) or (
+                pipeline_class == StableDiffusionXLInpaintPipeline
+            ):
+                pipeline_kwargs.update({"requires_aesthetics_score": is_refiner})
+
+            if is_refiner:
+                pipeline_kwargs.update({"force_zeros_for_empty_prompt": False})
+
+            pipe = pipeline_class(**pipeline_kwargs)
+    else:
+        text_config = create_ldm_bert_config(original_config)
+        text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only)
+        pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+    return pipe
+
+
+def download_controlnet_from_original_ckpt(
+    checkpoint_path: str,
+    original_config_file: str,
+    image_size: int = 512,
+    extract_ema: bool = False,
+    num_in_channels: Optional[int] = None,
+    upcast_attention: Optional[bool] = None,
+    device: str = None,
+    from_safetensors: bool = False,
+    use_linear_projection: Optional[bool] = None,
+    cross_attention_dim: Optional[bool] = None,
+) -> DiffusionPipeline:
+    if from_safetensors:
+        from safetensors import safe_open
+
+        checkpoint = {}
+        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                checkpoint[key] = f.get_tensor(key)
+    else:
+        if device is None:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+        else:
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+
+    # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
+    # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
+    while "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    original_config = yaml.safe_load(original_config_file)
+
+    if num_in_channels is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+
+    if "control_stage_config" not in original_config["model"]["params"]:
+        raise ValueError("`control_stage_config` not present in original config")
+
+    controlnet = convert_controlnet_checkpoint(
+        checkpoint,
+        original_config,
+        checkpoint_path,
+        image_size,
+        upcast_attention,
+        extract_ema,
+        use_linear_projection=use_linear_projection,
+        cross_attention_dim=cross_attention_dim,
+    )
+
+    return controlnet
+
+
+def download_promptdiffusion_from_original_ckpt(
+    checkpoint_path: str,
+    original_config_file: str,
+    image_size: int = 512,
+    extract_ema: bool = False,
+    num_in_channels: Optional[int] = None,
+    upcast_attention: Optional[bool] = None,
+    device: str = None,
+    from_safetensors: bool = False,
+    use_linear_projection: Optional[bool] = None,
+    cross_attention_dim: Optional[bool] = None,
+) -> DiffusionPipeline:
+    if from_safetensors:
+        from safetensors import safe_open
+
+        checkpoint = {}
+        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                checkpoint[key] = f.get_tensor(key)
+    else:
+        if device is None:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+        else:
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+
+    # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
+    # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
+    while "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    original_config = yaml.safe_load(open(original_config_file))
+
+    if num_in_channels is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+    if "control_stage_config" not in original_config["model"]["params"]:
+        raise ValueError("`control_stage_config` not present in original config")
+
+    controlnet = convert_promptdiffusion_checkpoint(
+        checkpoint,
+        original_config,
+        checkpoint_path,
+        image_size,
+        upcast_attention,
+        extract_ema,
+        use_linear_projection=use_linear_projection,
+        cross_attention_dim=cross_attention_dim,
+    )
+
+    return controlnet
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--original_config_file",
+        type=str,
+        required=True,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+    parser.add_argument(
+        "--num_in_channels",
+        default=None,
+        type=int,
+        help="The number of input channels. If `None` number of input channels will be automatically inferred.",
+    )
+    parser.add_argument(
+        "--image_size",
+        default=512,
+        type=int,
+        help=(
+            "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2"
+            " Base. Use 768 for Stable Diffusion v2."
+        ),
+    )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
+    parser.add_argument(
+        "--upcast_attention",
+        action="store_true",
+        help=(
+            "Whether the attention computation should always be upcasted. This is necessary when running stable"
+            " diffusion 2.1."
+        ),
+    )
+    parser.add_argument(
+        "--from_safetensors",
+        action="store_true",
+        help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
+    )
+    parser.add_argument(
+        "--to_safetensors",
+        action="store_true",
+        help="Whether to store pipeline in safetensors format or not.",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
+
+    # small workaround to get argparser to parse a boolean input as either true _or_ false
+    def parse_bool(string):
+        if string == "True":
+            return True
+        elif string == "False":
+            return False
+        else:
+            raise ValueError(f"could not parse string as bool {string}")
+
+    parser.add_argument(
+        "--use_linear_projection", help="Override for use linear projection", required=False, type=parse_bool
+    )
+
+    parser.add_argument("--cross_attention_dim", help="Override for cross attention_dim", required=False, type=int)
+
+    args = parser.parse_args()
+
+    controlnet = download_promptdiffusion_from_original_ckpt(
+        checkpoint_path=args.checkpoint_path,
+        original_config_file=args.original_config_file,
+        image_size=args.image_size,
+        extract_ema=args.extract_ema,
+        num_in_channels=args.num_in_channels,
+        upcast_attention=args.upcast_attention,
+        from_safetensors=args.from_safetensors,
+        device=args.device,
+        use_linear_projection=args.use_linear_projection,
+        cross_attention_dim=args.cross_attention_dim,
+    )
+
+    controlnet.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
diff --git a/diffusers/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py b/diffusers/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb4260d4653f98ad65ec6e30b17569c9603db4bf
--- /dev/null
+++ b/diffusers/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py
@@ -0,0 +1,1328 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Based on [In-Context Learning Unlocked for Diffusion Models](https://arxiv.org/abs/2305.01115)
+# Authors: Zhendong Wang, Yifan Jiang, Yadong Lu, Yelong Shen, Pengcheng He, Weizhu Chen, Zhangyang Wang, Mingyuan Zhou
+# Project Page: https://zhendong-wang.github.io/prompt-diffusion.github.io/
+# Code: https://github.com/Zhendong-Wang/Prompt-Diffusion
+#
+# Adapted to Diffusers by [iczaw](https://github.com/iczaw).
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from promptdiffusioncontrolnet import PromptDiffusionControlNetModel
+        >>> from diffusers.utils import load_image
+        >>> import torch
+
+        >>> from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+        >>> from diffusers import UniPCMultistepScheduler
+        >>> from PIL import ImageOps
+
+        >>> # download an image
+        >>> image_a = ImageOps.invert(load_image(
+        ...     "https://github.com/Zhendong-Wang/Prompt-Diffusion/blob/main/images_to_try/house_line.png?raw=true"
+        ... ))
+
+        >>> # download an image
+        >>> image_b = load_image(
+        ...     "https://github.com/Zhendong-Wang/Prompt-Diffusion/blob/main/images_to_try/house.png?raw=true"
+        ... )
+
+        >>> # download an image
+        >>> query = ImageOps.invert(load_image(
+        ...     "https://github.com/Zhendong-Wang/Prompt-Diffusion/blob/main/images_to_try/new_01.png?raw=true"
+        ... ))
+
+        >>> # load prompt diffusion control net and prompt diffusion
+        >>> controlnet = PromptDiffusionControlNetModel.from_pretrained("path-to-converted-promptdiffusion-controlnet", torch_dtype=torch.float16)
+        >>> pipe = DiffusionPipeline.from_pretrained(model_id, controlnet=controlnet, torch_dtype=torch.float16, variant="fp16", custom_pipeline="pipeline_prompt_diffusion")
+
+        >>> # speed up diffusion process with faster scheduler and memory optimization
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> # remove following line if xformers is not installed
+        >>> pipe.enable_xformers_memory_efficient_attention()
+
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # generate image
+        >>> generator = torch.manual_seed(0)
+        >>> image = pipe(
+        ...     "a tortoise", num_inference_steps=20, generator=generator, image_pair=[image_a,image_b], image=query
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class PromptDiffusionPipeline(
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, FromSingleFileMixin
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
+
+    This pipeline also adds experimental support for [Prompt Diffusion](https://arxiv.org/abs/2305.01115).
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        image_pair,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings is not supported at the moment.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `image_pair`
+        if len(image_pair) == 2:
+            for image in image_pair:
+                if (
+                    isinstance(self.controlnet, ControlNetModel)
+                    or is_compiled
+                    and isinstance(self.controlnet._orig_mod, ControlNetModel)
+                ):
+                    self.check_image(image, prompt, prompt_embeds)
+        else:
+            raise ValueError(
+                f"You have passed a list of images of length {len(image_pair)}."
+                f"Make sure the list size equals to two."
+            )
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings is not supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stages where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
+        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        if not hasattr(self, "unet"):
+            raise ValueError("The pipeline must have `unet` for using FreeU.")
+        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
+    def disable_freeu(self):
+        """Disables the FreeU mechanism if enabled."""
+        self.unet.disable_freeu()
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        image_pair: List[PipelineImageInput] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
+                accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
+                and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
+                `init`, images must be passed as a list such that each element of the list can be correctly batched for
+                input to a single ControlNet.
+            image_pair `List[PIL.Image.Image]`:
+                a pair of task-specific example images
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            image,
+            image_pair,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 3.1 Prepare image pair
+
+        if isinstance(controlnet, ControlNetModel):
+            image_pair = torch.cat(
+                [
+                    self.prepare_image(
+                        image=im,
+                        width=width,
+                        height=height,
+                        batch_size=batch_size * num_images_per_prompt,
+                        num_images_per_prompt=num_images_per_prompt,
+                        device=device,
+                        dtype=controlnet.dtype,
+                        do_classifier_free_guidance=self.do_classifier_free_guidance,
+                        guess_mode=guess_mode,
+                    )
+                    for im in image_pair
+                ],
+                1,
+            )
+        # 4. Prepare image
+        if isinstance(controlnet, ControlNetModel):
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+            height, width = image.shape[-2:]
+        elif isinstance(controlnet, MultiControlNetModel):
+            images = []
+
+            for image_ in image:
+                image_ = self.prepare_image(
+                    image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                images.append(image_)
+
+            image = images
+            height, width = image[0].shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6.5 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.2 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        is_unet_compiled = is_compiled_module(self.unet)
+        is_controlnet_compiled = is_compiled_module(self.controlnet)
+        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Relevant thread:
+                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
+                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
+                    torch._inductor.cudagraph_mark_step_begin()
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_query_cond=image,
+                    controlnet_cond=image_pair,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py b/diffusers/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..46cabd863dfaca444ac9b819943ef0843ef505fd
--- /dev/null
+++ b/diffusers/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py
@@ -0,0 +1,385 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+
+from diffusers.configuration_utils import register_to_config
+from diffusers.models.controlnet import (
+    ControlNetConditioningEmbedding,
+    ControlNetModel,
+    ControlNetOutput,
+)
+from diffusers.utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class PromptDiffusionControlNetModel(ControlNetModel):
+    """
+    A PromptDiffusionControlNet model.
+
+    Args:
+        in_channels (`int`, defaults to 4):
+            The number of channels in the input sample.
+        flip_sin_to_cos (`bool`, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, defaults to 0):
+            The frequency shift to apply to the time embedding.
+        down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
+        block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, defaults to 2):
+            The number of layers per block.
+        downsample_padding (`int`, defaults to 1):
+            The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, defaults to 1):
+            The scale factor to use for the mid block.
+        act_fn (`str`, defaults to "silu"):
+            The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use for the normalization. If None, normalization and activation layers is skipped
+            in post-processing.
+        norm_eps (`float`, defaults to 1e-5):
+            The epsilon to use for the normalization.
+        cross_attention_dim (`int`, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+        attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
+            The dimension of the attention heads.
+        use_linear_projection (`bool`, defaults to `False`):
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to `None`):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+            "text". "text" will use the `TextTimeEmbedding` layer.
+        num_class_embeds (`int`, *optional*, defaults to 0):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        upcast_attention (`bool`, defaults to `False`):
+        resnet_time_scale_shift (`str`, defaults to `"default"`):
+            Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
+        projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
+            The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
+            `class_embed_type="projection"`.
+        controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
+            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
+        conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
+            The tuple of output channel for each block in the `conditioning_embedding` layer.
+        global_pool_conditions (`bool`, defaults to `False`):
+            TODO(Patrick) - unused parameter.
+        addition_embed_type_num_heads (`int`, defaults to 64):
+            The number of heads to use for the `TextTimeEmbedding` layer.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 4,
+        conditioning_channels: int = 3,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str, ...] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int, ...]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        controlnet_conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
+        global_pool_conditions: bool = False,
+        addition_embed_type_num_heads: int = 64,
+    ):
+        super().__init__(
+            in_channels,
+            conditioning_channels,
+            flip_sin_to_cos,
+            freq_shift,
+            down_block_types,
+            mid_block_type,
+            only_cross_attention,
+            block_out_channels,
+            layers_per_block,
+            downsample_padding,
+            mid_block_scale_factor,
+            act_fn,
+            norm_num_groups,
+            norm_eps,
+            cross_attention_dim,
+            transformer_layers_per_block,
+            encoder_hid_dim,
+            encoder_hid_dim_type,
+            attention_head_dim,
+            num_attention_heads,
+            use_linear_projection,
+            class_embed_type,
+            addition_embed_type,
+            addition_time_embed_dim,
+            num_class_embeds,
+            upcast_attention,
+            resnet_time_scale_shift,
+            projection_class_embeddings_input_dim,
+            controlnet_conditioning_channel_order,
+            conditioning_embedding_out_channels,
+            global_pool_conditions,
+            addition_embed_type_num_heads,
+        )
+        self.controlnet_query_cond_embedding = ControlNetConditioningEmbedding(
+            conditioning_embedding_channels=block_out_channels[0],
+            block_out_channels=conditioning_embedding_out_channels,
+            conditioning_channels=3,
+        )
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        controlnet_cond: torch.Tensor,
+        controlnet_query_cond: torch.Tensor,
+        conditioning_scale: float = 1.0,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guess_mode: bool = False,
+        return_dict: bool = True,
+    ) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]:
+        """
+        The [`~PromptDiffusionControlNetModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor.
+            timestep (`Union[torch.Tensor, float, int]`):
+                The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states.
+            controlnet_cond (`torch.Tensor`):
+                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
+            controlnet_query_cond (`torch.Tensor`):
+                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
+            conditioning_scale (`float`, defaults to `1.0`):
+                The scale factor for ControlNet outputs.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
+                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
+                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
+                embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            added_cond_kwargs (`dict`):
+                Additional conditions for the Stable Diffusion XL UNet.
+            cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
+                A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
+            guess_mode (`bool`, defaults to `False`):
+                In this mode, the ControlNet encoder tries its best to recognize the input content of the input even if
+                you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
+            return_dict (`bool`, defaults to `True`):
+                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
+                If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
+                returned where the first element is the sample tensor.
+        """
+        # check channel order
+        channel_order = self.config.controlnet_conditioning_channel_order
+
+        if channel_order == "rgb":
+            # in rgb order by default
+            ...
+        elif channel_order == "bgr":
+            controlnet_cond = torch.flip(controlnet_cond, dims=[1])
+        else:
+            raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
+
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+            emb = emb + class_emb
+
+        if self.config.addition_embed_type is not None:
+            if self.config.addition_embed_type == "text":
+                aug_emb = self.add_embedding(encoder_hidden_states)
+
+            elif self.config.addition_embed_type == "text_time":
+                if "text_embeds" not in added_cond_kwargs:
+                    raise ValueError(
+                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                    )
+                text_embeds = added_cond_kwargs.get("text_embeds")
+                if "time_ids" not in added_cond_kwargs:
+                    raise ValueError(
+                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                    )
+                time_ids = added_cond_kwargs.get("time_ids")
+                time_embeds = self.add_time_proj(time_ids.flatten())
+                time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+
+                add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+                add_embeds = add_embeds.to(emb.dtype)
+                aug_emb = self.add_embedding(add_embeds)
+
+        emb = emb + aug_emb if aug_emb is not None else emb
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
+        controlnet_query_cond = self.controlnet_query_cond_embedding(controlnet_query_cond)
+        sample = sample + controlnet_cond + controlnet_query_cond
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = self.mid_block(sample, emb)
+
+        # 5. Control net blocks
+
+        controlnet_down_block_res_samples = ()
+
+        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
+            down_block_res_sample = controlnet_block(down_block_res_sample)
+            controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
+
+        down_block_res_samples = controlnet_down_block_res_samples
+
+        mid_block_res_sample = self.controlnet_mid_block(sample)
+
+        # 6. scaling
+        if guess_mode and not self.config.global_pool_conditions:
+            scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device)  # 0.1 to 1.0
+            scales = scales * conditioning_scale
+            down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
+            mid_block_res_sample = mid_block_res_sample * scales[-1]  # last one
+        else:
+            down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
+            mid_block_res_sample = mid_block_res_sample * conditioning_scale
+
+        if self.config.global_pool_conditions:
+            down_block_res_samples = [
+                torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
+            ]
+            mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)
+
+        if not return_dict:
+            return (down_block_res_samples, mid_block_res_sample)
+
+        return ControlNetOutput(
+            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
+        )
diff --git a/diffusers/examples/research_projects/pytorch_xla/README.md b/diffusers/examples/research_projects/pytorch_xla/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a6901d5ada9d6d3d1d7ed95bd3b74e6ac2ba5822
--- /dev/null
+++ b/diffusers/examples/research_projects/pytorch_xla/README.md
@@ -0,0 +1,167 @@
+# Stable Diffusion text-to-image fine-tuning using PyTorch/XLA
+
+The `train_text_to_image_xla.py` script shows how to fine-tune stable diffusion model on TPU devices using PyTorch/XLA.
+
+It has been tested on v4 and v5p TPU versions. Training code has been tested on multi-host. 
+
+This script implements Distributed Data Parallel using GSPMD feature in XLA compiler
+where we shard the input batches over the TPU devices. 
+
+As of 9-11-2024, these are some expected step times.
+
+| accelerator | global batch size | step time (seconds) |
+| ----------- | ----------------- | --------- |
+| v5p-128 | 1024 | 0.245 |
+| v5p-256 | 2048 | 0.234 |
+| v5p-512 | 4096 | 0.2498 |
+
+## Create TPU
+
+To create a TPU on Google Cloud first set these environment variables:
+
+```bash
+export TPU_NAME=<tpu-name>
+export PROJECT_ID=<project-id>
+export ZONE=<google-cloud-zone>
+export ACCELERATOR_TYPE=<accelerator type like v5p-8>
+export RUNTIME_VERSION=<runtime version like v2-alpha-tpuv5 for v5p>
+```
+
+Then run the create TPU command:
+```bash
+gcloud alpha compute tpus tpu-vm create ${TPU_NAME} --project ${PROJECT_ID} 
+--zone ${ZONE} --accelerator-type ${ACCELERATOR_TYPE} --version ${RUNTIME_VERSION} 
+--reserved
+```
+
+You can also use other ways to reserve TPUs like GKE or queued resources.
+
+## Setup TPU environment
+
+Install PyTorch and PyTorch/XLA nightly versions:
+```bash
+gcloud compute tpus tpu-vm ssh ${TPU_NAME} \
+--project=${PROJECT_ID} --zone=${ZONE} --worker=all \
+--command='
+pip3 install --pre torch==2.5.0.dev20240905+cpu torchvision==0.20.0.dev20240905+cpu --index-url https://download.pytorch.org/whl/nightly/cpu
+pip3 install "torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.5.0.dev20240905-cp310-cp310-linux_x86_64.whl" -f https://storage.googleapis.com/libtpu-releases/index.html
+'
+```
+
+Verify that PyTorch and PyTorch/XLA were installed correctly:
+
+```bash
+gcloud compute tpus tpu-vm ssh ${TPU_NAME} \
+--project ${PROJECT_ID} --zone ${ZONE} --worker=all \
+--command='python3 -c "import torch; import torch_xla;"'
+```
+
+Install dependencies:
+```bash
+gcloud compute tpus tpu-vm ssh ${TPU_NAME} \
+--project=${PROJECT_ID} --zone=${ZONE} --worker=all \
+--command='
+git clone https://github.com/huggingface/diffusers.git
+cd diffusers
+git checkout main
+cd examples/research_projects/pytorch_xla
+pip3 install -r requirements.txt
+pip3 install pillow --upgrade
+cd ../../..
+pip3 install .'
+```
+
+## Run the training job
+
+### Authenticate
+
+Run the following command to authenticate your token.
+
+```bash
+huggingface-cli login
+```
+
+This script only trains the unet part of the network. The VAE and text encoder
+are fixed.
+
+```bash
+gcloud compute tpus tpu-vm ssh ${TPU_NAME} \
+--project=${PROJECT_ID} --zone=${ZONE} --worker=all \
+--command='
+export XLA_DISABLE_FUNCTIONALIZATION=1 
+export PROFILE_DIR=/tmp/
+export CACHE_DIR=/tmp/
+export DATASET_NAME=lambdalabs/naruto-blip-captions
+export PER_HOST_BATCH_SIZE=32 # This is known to work on TPU v4. Can set this to 64 for TPU v5p
+export TRAIN_STEPS=50
+export OUTPUT_DIR=/tmp/trained-model/
+python diffusers/examples/research_projects/pytorch_xla/train_text_to_image_xla.py --pretrained_model_name_or_path=stabilityai/stable-diffusion-2-base --dataset_name=$DATASET_NAME --resolution=512 --center_crop --random_flip --train_batch_size=$PER_HOST_BATCH_SIZE  --max_train_steps=$TRAIN_STEPS --learning_rate=1e-06 --mixed_precision=bf16 --profile_duration=80000 --output_dir=$OUTPUT_DIR --dataloader_num_workers=4 --loader_prefetch_size=4 --device_prefetch_size=4'
+   
+```
+
+### Environment Envs Explained
+
+*   `XLA_DISABLE_FUNCTIONALIZATION`: To optimize the performance for AdamW optimizer.
+*   `PROFILE_DIR`: Specify where to put the profiling results.
+*   `CACHE_DIR`: Directory to store XLA compiled graphs for persistent caching.
+*   `DATASET_NAME`: Dataset to train the model. 
+*   `PER_HOST_BATCH_SIZE`: Size of the batch to load per CPU host. For e.g. for a v5p-16 with 2 CPU hosts, the global batch size will be 2xPER_HOST_BATCH_SIZE. The input batch is sharded along the batch axis.
+*    `TRAIN_STEPS`: Total number of training steps to run the training for.
+*    `OUTPUT_DIR`: Directory to store the fine-tuned model.
+
+## Run inference using the output model
+
+To run inference using the output, you can simply load the model and pass it
+input prompts. The first pass will compile the graph and takes longer with the following passes running much faster.
+
+```bash
+export CACHE_DIR=/tmp/
+```
+
+```python
+import torch
+import os
+import sys
+import  numpy as np
+
+import torch_xla.core.xla_model as xm
+from time import time
+from diffusers import StableDiffusionPipeline
+import torch_xla.runtime as xr
+
+CACHE_DIR = os.environ.get("CACHE_DIR", None)
+if CACHE_DIR:
+    xr.initialize_cache(CACHE_DIR, readonly=False)
+
+def main():
+    device = xm.xla_device()
+    model_path = "jffacevedo/pxla_trained_model"
+    pipe = StableDiffusionPipeline.from_pretrained(
+        model_path, 
+        torch_dtype=torch.bfloat16
+    )
+    pipe.to(device)
+    prompt = ["A naruto with green eyes and red legs."]
+    start = time()
+    print("compiling...")
+    image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+    print(f"compile time: {time() - start}")
+    print("generate...")
+    start = time()
+    image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+    print(f"generation time (after compile) : {time() - start}")
+    image.save("naruto.png")
+
+if __name__ == '__main__':
+    main()
+```
+
+Expected Results:
+
+```bash
+compiling...
+100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [10:03<00:00, 20.10s/it]
+compile time: 720.656970500946
+generate...
+100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 17.65it/s]
+generation time (after compile) : 1.8461642265319824
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/pytorch_xla/requirements.txt b/diffusers/examples/research_projects/pytorch_xla/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c3ffa42f0edc9862345ab82b69ffb5cdfaeac45e
--- /dev/null
+++ b/diffusers/examples/research_projects/pytorch_xla/requirements.txt
@@ -0,0 +1,8 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+datasets>=2.19.1
+ftfy
+tensorboard
+Jinja2
+peft==0.7.0
diff --git a/diffusers/examples/research_projects/pytorch_xla/train_text_to_image_xla.py b/diffusers/examples/research_projects/pytorch_xla/train_text_to_image_xla.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d9d8c540f11e500e14c98575d75ab5c20d3327b
--- /dev/null
+++ b/diffusers/examples/research_projects/pytorch_xla/train_text_to_image_xla.py
@@ -0,0 +1,669 @@
+import argparse
+import os
+import random
+import time
+from pathlib import Path
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import torch_xla.core.xla_model as xm
+import torch_xla.debug.profiler as xp
+import torch_xla.distributed.parallel_loader as pl
+import torch_xla.distributed.spmd as xs
+import torch_xla.runtime as xr
+from huggingface_hub import create_repo, upload_folder
+from torchvision import transforms
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.training_utils import compute_snr
+from diffusers.utils import is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+
+
+if is_wandb_available():
+    pass
+
+PROFILE_DIR = os.environ.get("PROFILE_DIR", None)
+CACHE_DIR = os.environ.get("CACHE_DIR", None)
+if CACHE_DIR:
+    xr.initialize_cache(CACHE_DIR, readonly=False)
+xr.use_spmd()
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+PORT = 9012
+
+
+def save_model_card(
+    args,
+    repo_id: str,
+    repo_folder: str = None,
+):
+    model_description = f"""
+# Text-to-image finetuning - {repo_id}
+
+This pipeline was finetuned from **{args.pretrained_model_name_or_path}** on the **{args.dataset_name}** dataset. \n
+
+## Pipeline usage
+
+You can use the pipeline like so:
+
+```python
+import torch
+import os
+import sys
+import  numpy as np
+
+import torch_xla.core.xla_model as xm
+from time import time
+from typing import Tuple
+from diffusers import StableDiffusionPipeline
+
+def main(args):
+    device = xm.xla_device()
+    model_path = <output_dir>
+    pipe = StableDiffusionPipeline.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16
+    )
+    pipe.to(device)
+    prompt = ["A naruto with green eyes and red legs."]
+    image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+    image.save("naruto.png")
+
+if __name__ == '__main__':
+    main()
+```
+
+## Training info
+
+These are the key hyperparameters used during training:
+
+* Steps: {args.max_train_steps}
+* Learning rate: {args.learning_rate}
+* Batch size: {args.train_batch_size}
+* Image resolution: {args.resolution}
+* Mixed-precision: {args.mixed_precision}
+
+"""
+
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=args.pretrained_model_name_or_path,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = ["stable-diffusion", "stable-diffusion-diffusers", "text-to-image", "diffusers", "diffusers-training"]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+class TrainSD:
+    def __init__(
+        self,
+        vae,
+        weight_dtype,
+        device,
+        noise_scheduler,
+        unet,
+        optimizer,
+        text_encoder,
+        dataloader,
+        args,
+    ):
+        self.vae = vae
+        self.weight_dtype = weight_dtype
+        self.device = device
+        self.noise_scheduler = noise_scheduler
+        self.unet = unet
+        self.optimizer = optimizer
+        self.text_encoder = text_encoder
+        self.args = args
+        self.mesh = xs.get_global_mesh()
+        self.dataloader = iter(dataloader)
+        self.global_step = 0
+
+    def run_optimizer(self):
+        self.optimizer.step()
+
+    def start_training(self):
+        times = []
+        last_time = time.time()
+        step = 0
+        while True:
+            if self.global_step >= self.args.max_train_steps:
+                xm.mark_step()
+                break
+            if step == 4 and PROFILE_DIR is not None:
+                xm.wait_device_ops()
+                xp.trace_detached(f"localhost:{PORT}", PROFILE_DIR, duration_ms=args.profile_duration)
+            try:
+                batch = next(self.dataloader)
+            except Exception as e:
+                print(e)
+                break
+            loss = self.step_fn(batch["pixel_values"], batch["input_ids"])
+            step_time = time.time() - last_time
+            if step >= 10:
+                times.append(step_time)
+            print(f"step: {step}, step_time: {step_time}")
+            if step % 5 == 0:
+                print(f"step: {step}, loss: {loss}")
+            last_time = time.time()
+            self.global_step += 1
+            step += 1
+        # print(f"Average step time: {sum(times)/len(times)}")
+        xm.wait_device_ops()
+
+    def step_fn(
+        self,
+        pixel_values,
+        input_ids,
+    ):
+        with xp.Trace("model.forward"):
+            self.optimizer.zero_grad()
+            latents = self.vae.encode(pixel_values).latent_dist.sample()
+            latents = latents * self.vae.config.scaling_factor
+            noise = torch.randn_like(latents).to(self.device, dtype=self.weight_dtype)
+            bsz = latents.shape[0]
+            timesteps = torch.randint(
+                0, self.noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
+            )
+            timesteps = timesteps.long()
+
+            noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
+            encoder_hidden_states = self.text_encoder(input_ids, return_dict=False)[0]
+            if self.args.prediction_type is not None:
+                # set prediction_type of scheduler if defined
+                self.noise_scheduler.register_to_config(prediction_type=self.args.prediction_type)
+
+            if self.noise_scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif self.noise_scheduler.config.prediction_type == "v_prediction":
+                target = self.noise_scheduler.get_velocity(latents, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {self.noise_scheduler.config.prediction_type}")
+            model_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
+        with xp.Trace("model.backward"):
+            if self.args.snr_gamma is None:
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+            else:
+                # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                # This is discussed in Section 4.2 of the same paper.
+                snr = compute_snr(self.noise_scheduler, timesteps)
+                mse_loss_weights = torch.stack([snr, self.args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                    dim=1
+                )[0]
+                if self.noise_scheduler.config.prediction_type == "epsilon":
+                    mse_loss_weights = mse_loss_weights / snr
+                elif self.noise_scheduler.config.prediction_type == "v_prediction":
+                    mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                loss = loss.mean()
+            loss.backward()
+        with xp.Trace("optimizer_step"):
+            self.run_optimizer()
+        return loss
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--input_perturbation", type=float, default=0, help="The scale of input perturbation. Recommended 0.1."
+    )
+    parser.add_argument("--profile_duration", type=int, default=10000, help="Profile duration in ms")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--non_ema_revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or"
+            " remote repository specified with --pretrained_model_name_or_path."
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--loader_prefetch_size",
+        type=int,
+        default=1,
+        help=("Number of subprocesses to use for data loading to cpu."),
+    )
+    parser.add_argument(
+        "--device_prefetch_size",
+        type=int,
+        default=1,
+        help=("Number of subprocesses to use for data loading to tpu from cpu. "),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+
+    args = parser.parse_args()
+
+    # default to using the same revision for the non-ema model if not specified
+    if args.non_ema_revision is None:
+        args.non_ema_revision = args.revision
+
+    return args
+
+
+def setup_optimizer(unet, args):
+    optimizer_cls = torch.optim.AdamW
+    return optimizer_cls(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+        foreach=True,
+    )
+
+
+def load_dataset(args):
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = datasets.load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            data_dir=args.train_data_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = datasets.load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+    return dataset
+
+
+def get_column_names(dataset, args):
+    column_names = dataset["train"].column_names
+
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    return image_column, caption_column
+
+
+def main(args):
+    args = parse_args()
+
+    _ = xp.start_server(PORT)
+
+    num_devices = xr.global_runtime_device_count()
+    device_ids = np.arange(num_devices)
+    mesh_shape = (num_devices, 1)
+    mesh = xs.Mesh(device_ids, mesh_shape, ("x", "y"))
+    xs.set_global_mesh(mesh)
+
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="unet",
+        revision=args.non_ema_revision,
+    )
+
+    if xm.is_master_ordinal() and args.push_to_hub:
+        repo_id = create_repo(
+            repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+        ).repo_id
+
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+    )
+
+    from torch_xla.distributed.fsdp.utils import apply_xla_patch_to_nn_linear
+
+    unet = apply_xla_patch_to_nn_linear(unet, xs.xla_patched_nn_linear_forward)
+
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    unet.train()
+
+    # For mixed precision training we cast all non-trainable weights (vae,
+    # non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full
+    # precision is not required.
+    weight_dtype = torch.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    device = xm.xla_device()
+    print("device: ", device)
+    print("weight_dtype: ", weight_dtype)
+
+    text_encoder = text_encoder.to(device, dtype=weight_dtype)
+    vae = vae.to(device, dtype=weight_dtype)
+    unet = unet.to(device, dtype=weight_dtype)
+    optimizer = setup_optimizer(unet, args)
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    unet.train()
+
+    dataset = load_dataset(args)
+    image_column, caption_column = get_column_names(dataset, args)
+
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions,
+            max_length=tokenizer.model_max_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        )
+        return inputs.input_ids
+
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            (transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)),
+            (transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x)),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+        return examples
+
+    train_dataset = dataset["train"]
+    train_dataset.set_format("torch")
+    train_dataset.set_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).to(weight_dtype)
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {"pixel_values": pixel_values, "input_ids": input_ids}
+
+    g = torch.Generator()
+    g.manual_seed(xr.host_index())
+    sampler = torch.utils.data.RandomSampler(train_dataset, replacement=True, num_samples=int(1e10), generator=g)
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        sampler=sampler,
+        collate_fn=collate_fn,
+        num_workers=args.dataloader_num_workers,
+        batch_size=args.train_batch_size,
+    )
+
+    train_dataloader = pl.MpDeviceLoader(
+        train_dataloader,
+        device,
+        input_sharding={
+            "pixel_values": xs.ShardingSpec(mesh, ("x", None, None, None), minibatch=True),
+            "input_ids": xs.ShardingSpec(mesh, ("x", None), minibatch=True),
+        },
+        loader_prefetch_size=args.loader_prefetch_size,
+        device_prefetch_size=args.device_prefetch_size,
+    )
+
+    if xm.is_master_ordinal():
+        print("***** Running training *****")
+        print(f"Instantaneous batch size per device = {args.train_batch_size}")
+        print(
+            f"Total train batch size (w. parallel, distributed & accumulation) = {args.train_batch_size * num_devices}"
+        )
+        print(f"  Total optimization steps = {args.max_train_steps}")
+
+    trainer = TrainSD(
+        vae=vae,
+        weight_dtype=weight_dtype,
+        device=device,
+        noise_scheduler=noise_scheduler,
+        unet=unet,
+        optimizer=optimizer,
+        text_encoder=text_encoder,
+        dataloader=train_dataloader,
+        args=args,
+    )
+
+    trainer.start_training()
+    unet = trainer.unet.to("cpu")
+    vae = trainer.vae.to("cpu")
+    text_encoder = trainer.text_encoder.to("cpu")
+
+    pipeline = StableDiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        text_encoder=text_encoder,
+        vae=vae,
+        unet=unet,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    pipeline.save_pretrained(args.output_dir)
+
+    if xm.is_master_ordinal() and args.push_to_hub:
+        save_model_card(args, repo_id, repo_folder=args.output_dir)
+        upload_folder(
+            repo_id=repo_id,
+            folder_path=args.output_dir,
+            commit_message="End of training",
+            ignore_patterns=["step_*", "epoch_*"],
+        )
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/rdm/README.md b/diffusers/examples/research_projects/rdm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cfd755e9c9c3d030953fdc0c40834c0d5f51c04d
--- /dev/null
+++ b/diffusers/examples/research_projects/rdm/README.md
@@ -0,0 +1,5 @@
+## Diffusers examples with ONNXRuntime optimizations
+
+**This research project is not actively maintained by the diffusers team. For any questions or comments, please contact Isamu Isozaki(isamu-isozaki) on github with any questions.**
+
+The aim of this project is to provide retrieval augmented diffusion models to diffusers!
\ No newline at end of file
diff --git a/diffusers/examples/research_projects/rdm/pipeline_rdm.py b/diffusers/examples/research_projects/rdm/pipeline_rdm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8093a3f217dd4e5b9e0e80da40c199fd461c8fa
--- /dev/null
+++ b/diffusers/examples/research_projects/rdm/pipeline_rdm.py
@@ -0,0 +1,344 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+from PIL import Image
+from retriever import Retriever, normalize_images, preprocess_images
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    ImagePipelineOutput,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.utils import logging
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class RDMPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Retrieval Augmented Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        clip ([`CLIPModel`]):
+            Frozen CLIP model. Retrieval Augmented Diffusion uses the CLIP model, specifically the
+            [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        clip: CLIPModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        feature_extractor: CLIPImageProcessor,
+        retriever: Optional[Retriever] = None,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            clip=clip,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+        # Copy from statement here and all the methods we take from stable_diffusion_pipeline
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.retriever = retriever
+
+    def _encode_prompt(self, prompt):
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        prompt_embeds = self.clip.get_text_features(text_input_ids.to(self.device))
+        prompt_embeds = prompt_embeds / torch.linalg.norm(prompt_embeds, dim=-1, keepdim=True)
+        prompt_embeds = prompt_embeds[:, None, :]
+        return prompt_embeds
+
+    def _encode_image(self, retrieved_images, batch_size):
+        if len(retrieved_images[0]) == 0:
+            return None
+        for i in range(len(retrieved_images)):
+            retrieved_images[i] = normalize_images(retrieved_images[i])
+            retrieved_images[i] = preprocess_images(retrieved_images[i], self.feature_extractor).to(
+                self.clip.device, dtype=self.clip.dtype
+            )
+        _, c, h, w = retrieved_images[0].shape
+
+        retrieved_images = torch.reshape(torch.cat(retrieved_images, dim=0), (-1, c, h, w))
+        image_embeddings = self.clip.get_image_features(retrieved_images)
+        image_embeddings = image_embeddings / torch.linalg.norm(image_embeddings, dim=-1, keepdim=True)
+        _, d = image_embeddings.shape
+        image_embeddings = torch.reshape(image_embeddings, (batch_size, -1, d))
+        return image_embeddings
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def retrieve_images(self, retrieved_images, prompt_embeds, knn=10):
+        if self.retriever is not None:
+            additional_images = self.retriever.retrieve_imgs_batch(prompt_embeds[:, 0].cpu(), knn).total_examples
+            for i in range(len(retrieved_images)):
+                retrieved_images[i] += additional_images[i][self.retriever.config.image_column]
+        return retrieved_images
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        retrieved_images: Optional[List[Image.Image]] = None,
+        height: int = 768,
+        width: int = 768,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        knn: Optional[int] = 10,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        if retrieved_images is not None:
+            retrieved_images = [retrieved_images for _ in range(batch_size)]
+        else:
+            retrieved_images = [[] for _ in range(batch_size)]
+        device = self._execution_device
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if prompt_embeds is None:
+            prompt_embeds = self._encode_prompt(prompt)
+        retrieved_images = self.retrieve_images(retrieved_images, prompt_embeds, knn=knn)
+        image_embeddings = self._encode_image(retrieved_images, batch_size)
+        if image_embeddings is not None:
+            prompt_embeds = torch.cat([prompt_embeds, image_embeddings], dim=1)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_embeddings = torch.zeros_like(prompt_embeds).to(prompt_embeds.device)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([uncond_embeddings, prompt_embeds])
+        # get the initial random noise unless the user supplied it
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+
+        image = self.image_processor.postprocess(
+            image, output_type=output_type, do_denormalize=[True] * image.shape[0]
+        )
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/examples/research_projects/rdm/retriever.py b/diffusers/examples/research_projects/rdm/retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ae4989bd8bb26c47622ae33c950241c8b49c10c
--- /dev/null
+++ b/diffusers/examples/research_projects/rdm/retriever.py
@@ -0,0 +1,248 @@
+import os
+from typing import List
+
+import faiss
+import numpy as np
+import torch
+from datasets import Dataset, load_dataset
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPModel, PretrainedConfig
+
+from diffusers import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def normalize_images(images: List[Image.Image]):
+    images = [np.array(image) for image in images]
+    images = [image / 127.5 - 1 for image in images]
+    return images
+
+
+def preprocess_images(images: List[np.array], feature_extractor: CLIPImageProcessor) -> torch.Tensor:
+    """
+    Preprocesses a list of images into a batch of tensors.
+
+    Args:
+        images (:obj:`List[Image.Image]`):
+            A list of images to preprocess.
+
+    Returns:
+        :obj:`torch.Tensor`: A batch of tensors.
+    """
+    images = [np.array(image) for image in images]
+    images = [(image + 1.0) / 2.0 for image in images]
+    images = feature_extractor(images, return_tensors="pt").pixel_values
+    return images
+
+
+class IndexConfig(PretrainedConfig):
+    def __init__(
+        self,
+        clip_name_or_path="openai/clip-vit-large-patch14",
+        dataset_name="Isamu136/oxford_pets_with_l14_emb",
+        image_column="image",
+        index_name="embeddings",
+        index_path=None,
+        dataset_set="train",
+        metric_type=faiss.METRIC_L2,
+        faiss_device=-1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.clip_name_or_path = clip_name_or_path
+        self.dataset_name = dataset_name
+        self.image_column = image_column
+        self.index_name = index_name
+        self.index_path = index_path
+        self.dataset_set = dataset_set
+        self.metric_type = metric_type
+        self.faiss_device = faiss_device
+
+
+class Index:
+    """
+    Each index for a retrieval model is specific to the clip model used and the dataset used.
+    """
+
+    def __init__(self, config: IndexConfig, dataset: Dataset):
+        self.config = config
+        self.dataset = dataset
+        self.index_initialized = False
+        self.index_name = config.index_name
+        self.index_path = config.index_path
+        self.init_index()
+
+    def set_index_name(self, index_name: str):
+        self.index_name = index_name
+
+    def init_index(self):
+        if not self.index_initialized:
+            if self.index_path and self.index_name:
+                try:
+                    self.dataset.add_faiss_index(
+                        column=self.index_name, metric_type=self.config.metric_type, device=self.config.faiss_device
+                    )
+                    self.index_initialized = True
+                except Exception as e:
+                    print(e)
+                    logger.info("Index not initialized")
+            if self.index_name in self.dataset.features:
+                self.dataset.add_faiss_index(column=self.index_name)
+                self.index_initialized = True
+
+    def build_index(
+        self,
+        model=None,
+        feature_extractor: CLIPImageProcessor = None,
+        torch_dtype=torch.float32,
+    ):
+        if not self.index_initialized:
+            model = model or CLIPModel.from_pretrained(self.config.clip_name_or_path).to(dtype=torch_dtype)
+            feature_extractor = feature_extractor or CLIPImageProcessor.from_pretrained(self.config.clip_name_or_path)
+            self.dataset = get_dataset_with_emb_from_clip_model(
+                self.dataset,
+                model,
+                feature_extractor,
+                image_column=self.config.image_column,
+                index_name=self.config.index_name,
+            )
+            self.init_index()
+
+    def retrieve_imgs(self, vec, k: int = 20):
+        vec = np.array(vec).astype(np.float32)
+        return self.dataset.get_nearest_examples(self.index_name, vec, k=k)
+
+    def retrieve_imgs_batch(self, vec, k: int = 20):
+        vec = np.array(vec).astype(np.float32)
+        return self.dataset.get_nearest_examples_batch(self.index_name, vec, k=k)
+
+    def retrieve_indices(self, vec, k: int = 20):
+        vec = np.array(vec).astype(np.float32)
+        return self.dataset.search(self.index_name, vec, k=k)
+
+    def retrieve_indices_batch(self, vec, k: int = 20):
+        vec = np.array(vec).astype(np.float32)
+        return self.dataset.search_batch(self.index_name, vec, k=k)
+
+
+class Retriever:
+    def __init__(
+        self,
+        config: IndexConfig,
+        index: Index = None,
+        dataset: Dataset = None,
+        model=None,
+        feature_extractor: CLIPImageProcessor = None,
+    ):
+        self.config = config
+        self.index = index or self._build_index(config, dataset, model=model, feature_extractor=feature_extractor)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        retriever_name_or_path: str,
+        index: Index = None,
+        dataset: Dataset = None,
+        model=None,
+        feature_extractor: CLIPImageProcessor = None,
+        **kwargs,
+    ):
+        config = kwargs.pop("config", None) or IndexConfig.from_pretrained(retriever_name_or_path, **kwargs)
+        return cls(config, index=index, dataset=dataset, model=model, feature_extractor=feature_extractor)
+
+    @staticmethod
+    def _build_index(
+        config: IndexConfig, dataset: Dataset = None, model=None, feature_extractor: CLIPImageProcessor = None
+    ):
+        dataset = dataset or load_dataset(config.dataset_name)
+        dataset = dataset[config.dataset_set]
+        index = Index(config, dataset)
+        index.build_index(model=model, feature_extractor=feature_extractor)
+        return index
+
+    def save_pretrained(self, save_directory):
+        os.makedirs(save_directory, exist_ok=True)
+        if self.config.index_path is None:
+            index_path = os.path.join(save_directory, "hf_dataset_index.faiss")
+            self.index.dataset.get_index(self.config.index_name).save(index_path)
+            self.config.index_path = index_path
+        self.config.save_pretrained(save_directory)
+
+    def init_retrieval(self):
+        logger.info("initializing retrieval")
+        self.index.init_index()
+
+    def retrieve_imgs(self, embeddings: np.ndarray, k: int):
+        return self.index.retrieve_imgs(embeddings, k)
+
+    def retrieve_imgs_batch(self, embeddings: np.ndarray, k: int):
+        return self.index.retrieve_imgs_batch(embeddings, k)
+
+    def retrieve_indices(self, embeddings: np.ndarray, k: int):
+        return self.index.retrieve_indices(embeddings, k)
+
+    def retrieve_indices_batch(self, embeddings: np.ndarray, k: int):
+        return self.index.retrieve_indices_batch(embeddings, k)
+
+    def __call__(
+        self,
+        embeddings,
+        k: int = 20,
+    ):
+        return self.index.retrieve_imgs(embeddings, k)
+
+
+def map_txt_to_clip_feature(clip_model, tokenizer, prompt):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+
+    if text_input_ids.shape[-1] > tokenizer.model_max_length:
+        removed_text = tokenizer.batch_decode(text_input_ids[:, tokenizer.model_max_length :])
+        logger.warning(
+            "The following part of your input was truncated because CLIP can only handle sequences up to"
+            f" {tokenizer.model_max_length} tokens: {removed_text}"
+        )
+        text_input_ids = text_input_ids[:, : tokenizer.model_max_length]
+    text_embeddings = clip_model.get_text_features(text_input_ids.to(clip_model.device))
+    text_embeddings = text_embeddings / torch.linalg.norm(text_embeddings, dim=-1, keepdim=True)
+    text_embeddings = text_embeddings[:, None, :]
+    return text_embeddings[0][0].cpu().detach().numpy()
+
+
+def map_img_to_model_feature(model, feature_extractor, imgs, device):
+    for i, image in enumerate(imgs):
+        if not image.mode == "RGB":
+            imgs[i] = image.convert("RGB")
+    imgs = normalize_images(imgs)
+    retrieved_images = preprocess_images(imgs, feature_extractor).to(device)
+    image_embeddings = model(retrieved_images)
+    image_embeddings = image_embeddings / torch.linalg.norm(image_embeddings, dim=-1, keepdim=True)
+    image_embeddings = image_embeddings[None, ...]
+    return image_embeddings.cpu().detach().numpy()[0][0]
+
+
+def get_dataset_with_emb_from_model(dataset, model, feature_extractor, image_column="image", index_name="embeddings"):
+    return dataset.map(
+        lambda example: {
+            index_name: map_img_to_model_feature(model, feature_extractor, [example[image_column]], model.device)
+        }
+    )
+
+
+def get_dataset_with_emb_from_clip_model(
+    dataset, clip_model, feature_extractor, image_column="image", index_name="embeddings"
+):
+    return dataset.map(
+        lambda example: {
+            index_name: map_img_to_model_feature(
+                clip_model.get_image_features, feature_extractor, [example[image_column]], clip_model.device
+            )
+        }
+    )
diff --git a/diffusers/examples/research_projects/realfill/README.md b/diffusers/examples/research_projects/realfill/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..91821031d2e0d1fe3f217aed44d235f64531d43b
--- /dev/null
+++ b/diffusers/examples/research_projects/realfill/README.md
@@ -0,0 +1,118 @@
+# RealFill
+
+[RealFill](https://arxiv.org/abs/2309.16668) is a method to personalize text2image inpainting models like stable diffusion inpainting given just a few(1~5) images of a scene.
+The `train_realfill.py` script shows how to implement the training procedure for stable diffusion inpainting.
+
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+cd to the realfill folder and run
+```bash
+cd realfill
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+
+### Toy example
+
+Now let's fill the real. For this example, we will use some images of the flower girl example from the paper.
+
+We already provide some images for testing in [this link](https://github.com/thuanz123/realfill/tree/main/data/flowerwoman)
+
+You only have to launch the training using:
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-2-inpainting"
+export TRAIN_DIR="data/flowerwoman"
+export OUTPUT_DIR="flowerwoman-model"
+
+accelerate launch train_realfill.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --resolution=512 \
+  --train_batch_size=16 \
+  --gradient_accumulation_steps=1 \
+  --unet_learning_rate=2e-4 \
+  --text_encoder_learning_rate=4e-5 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=100 \
+  --max_train_steps=2000 \
+  --lora_rank=8 \
+  --lora_dropout=0.1 \
+  --lora_alpha=16 \
+```
+
+### Training on a low-memory GPU:
+
+It is possible to run realfill on a low-memory GPU by using the following optimizations:
+- [gradient checkpointing and the 8-bit optimizer](#training-with-gradient-checkpointing-and-8-bit-optimizers)
+- [xformers](#training-with-xformers)
+- [setting grads to none](#set-grads-to-none)
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-2-inpainting"
+export TRAIN_DIR="data/flowerwoman"
+export OUTPUT_DIR="flowerwoman-model"
+
+accelerate launch train_realfill.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --resolution=512 \
+  --train_batch_size=16 \
+  --gradient_accumulation_steps=1 --gradient_checkpointing \
+  --use_8bit_adam \
+  --enable_xformers_memory_efficient_attention \
+  --set_grads_to_none \
+  --unet_learning_rate=2e-4 \
+  --text_encoder_learning_rate=4e-5 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=100 \
+  --max_train_steps=2000 \
+  --lora_rank=8 \
+  --lora_dropout=0.1 \
+  --lora_alpha=16 \
+```
+
+### Training with gradient checkpointing and 8-bit optimizers:
+
+With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train realfill on a 16GB GPU.
+
+To install `bitsandbytes` please refer to this [readme](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
+
+### Training with xformers:
+You can enable memory efficient attention by [installing xFormers](https://github.com/facebookresearch/xformers#installing-xformers) and padding the `--enable_xformers_memory_efficient_attention` argument to the script.
+
+### Set grads to none
+
+To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument.
+
+More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html
+
+## Acknowledge
+This repo is built upon the code of DreamBooth from diffusers and we thank the developers for their great works and efforts to release source code. Furthermore, a special "thank you" to RealFill's authors for publishing such an amazing work.
diff --git a/diffusers/examples/research_projects/realfill/infer.py b/diffusers/examples/research_projects/realfill/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3153307c4ad31892df7607d3ecf9b2af79b65187
--- /dev/null
+++ b/diffusers/examples/research_projects/realfill/infer.py
@@ -0,0 +1,91 @@
+import argparse
+import os
+
+import torch
+from PIL import Image, ImageFilter
+from transformers import CLIPTextModel
+
+from diffusers import DPMSolverMultistepScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel
+
+
+parser = argparse.ArgumentParser(description="Inference")
+parser.add_argument(
+    "--model_path",
+    type=str,
+    default=None,
+    required=True,
+    help="Path to pretrained model or model identifier from huggingface.co/models.",
+)
+parser.add_argument(
+    "--validation_image",
+    type=str,
+    default=None,
+    required=True,
+    help="The directory of the validation image",
+)
+parser.add_argument(
+    "--validation_mask",
+    type=str,
+    default=None,
+    required=True,
+    help="The directory of the validation mask",
+)
+parser.add_argument(
+    "--output_dir",
+    type=str,
+    default="./test-infer/",
+    help="The output directory where predictions are saved",
+)
+parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible inference.")
+
+args = parser.parse_args()
+
+if __name__ == "__main__":
+    os.makedirs(args.output_dir, exist_ok=True)
+    generator = None
+
+    # create & load model
+    pipe = StableDiffusionInpaintPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float32, revision=None
+    )
+
+    pipe.unet = UNet2DConditionModel.from_pretrained(
+        args.model_path,
+        subfolder="unet",
+        revision=None,
+    )
+    pipe.text_encoder = CLIPTextModel.from_pretrained(
+        args.model_path,
+        subfolder="text_encoder",
+        revision=None,
+    )
+    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+    pipe = pipe.to("cuda")
+
+    if args.seed is not None:
+        generator = torch.Generator(device="cuda").manual_seed(args.seed)
+
+    image = Image.open(args.validation_image)
+    mask_image = Image.open(args.validation_mask)
+
+    results = pipe(
+        ["a photo of sks"] * 16,
+        image=image,
+        mask_image=mask_image,
+        num_inference_steps=25,
+        guidance_scale=5,
+        generator=generator,
+    ).images
+
+    erode_kernel = ImageFilter.MaxFilter(3)
+    mask_image = mask_image.filter(erode_kernel)
+
+    blur_kernel = ImageFilter.BoxBlur(1)
+    mask_image = mask_image.filter(blur_kernel)
+
+    for idx, result in enumerate(results):
+        result = Image.composite(result, image, mask_image)
+        result.save(f"{args.output_dir}/{idx}.png")
+
+    del pipe
+    torch.cuda.empty_cache()
diff --git a/diffusers/examples/research_projects/realfill/requirements.txt b/diffusers/examples/research_projects/realfill/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8fbaf908a2c887e77b58d8f339f0e0643892f238
--- /dev/null
+++ b/diffusers/examples/research_projects/realfill/requirements.txt
@@ -0,0 +1,9 @@
+diffusers==0.20.1
+accelerate==0.23.0
+transformers==4.38.0
+peft==0.5.0
+torch==2.2.0
+torchvision>=0.16
+ftfy==6.1.1
+tensorboard==2.14.0
+Jinja2==3.1.4
diff --git a/diffusers/examples/research_projects/realfill/train_realfill.py b/diffusers/examples/research_projects/realfill/train_realfill.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7cc25df02b97c5cbd5705defa5c3d706a3d51c4
--- /dev/null
+++ b/diffusers/examples/research_projects/realfill/train_realfill.py
@@ -0,0 +1,984 @@
+import argparse
+import copy
+import itertools
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import torchvision.transforms.v2 as transforms_v2
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig, PeftModel, get_peft_model
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, CLIPTextModel
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DPMSolverMultistepScheduler,
+    StableDiffusionInpaintPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.20.1")
+
+logger = get_logger(__name__)
+
+
+def make_mask(images, resolution, times=30):
+    mask, times = torch.ones_like(images[0:1, :, :]), np.random.randint(1, times)
+    min_size, max_size, margin = np.array([0.03, 0.25, 0.01]) * resolution
+    max_size = min(max_size, resolution - margin * 2)
+
+    for _ in range(times):
+        width = np.random.randint(int(min_size), int(max_size))
+        height = np.random.randint(int(min_size), int(max_size))
+
+        x_start = np.random.randint(int(margin), resolution - int(margin) - width + 1)
+        y_start = np.random.randint(int(margin), resolution - int(margin) - height + 1)
+        mask[:, y_start : y_start + height, x_start : x_start + width] = 0
+
+    mask = 1 - mask if random.random() < 0.5 else mask
+    return mask
+
+
+def save_model_card(
+    repo_id: str,
+    images=None,
+    base_model=str,
+    repo_folder=None,
+):
+    img_str = ""
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+prompt: "a photo of sks"
+tags:
+- stable-diffusion-inpainting
+- stable-diffusion-inpainting-diffusers
+- text-to-image
+- diffusers
+- realfill
+- diffusers-training
+inference: true
+---
+    """
+    model_card = f"""
+# RealFill - {repo_id}
+
+This is a realfill model derived from {base_model}. The weights were trained using [RealFill](https://realfill.github.io/).
+You can find some example images in the following. \n
+{img_str}
+"""
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def log_validation(
+    text_encoder,
+    tokenizer,
+    unet,
+    args,
+    accelerator,
+    weight_dtype,
+    epoch,
+):
+    logger.info(f"Running validation... \nGenerating {args.num_validation_images} images")
+
+    # create pipeline (note: unet and vae are loaded again in float32)
+    pipeline = StableDiffusionInpaintPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        tokenizer=tokenizer,
+        revision=args.revision,
+        torch_dtype=weight_dtype,
+    )
+
+    # set `keep_fp32_wrapper` to True because we do not want to remove
+    # mixed precision hooks while we are still training
+    pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
+    pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
+    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    target_dir = Path(args.train_data_dir) / "target"
+    target_image, target_mask = target_dir / "target.png", target_dir / "mask.png"
+    image, mask_image = Image.open(target_image), Image.open(target_mask)
+
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+
+    images = []
+    for _ in range(args.num_validation_images):
+        image = pipeline(
+            prompt="a photo of sks",
+            image=image,
+            mask_image=mask_image,
+            num_inference_steps=25,
+            guidance_scale=5,
+            generator=generator,
+        ).images[0]
+        images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log({"validation": [wandb.Image(image, caption=str(i)) for i, image in enumerate(images)]})
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of images.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_conditioning`.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run realfill validation every X steps. RealFill validation consists of running the conditioning"
+            " `args.validation_conditioning` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="realfill-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--unet_learning_rate",
+        type=float,
+        default=2e-4,
+        help="Learning rate to use for unet.",
+    )
+    parser.add_argument(
+        "--text_encoder_learning_rate",
+        type=float,
+        default=4e-5,
+        help="Learning rate to use for text encoder.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--wandb_key",
+        type=str,
+        default=None,
+        help=("If report to option is set to wandb, api-key for wandb used for login to wandb "),
+    )
+    parser.add_argument(
+        "--wandb_project_name",
+        type=str,
+        default=None,
+        help=("If report to option is set to wandb, project name in wandb for log tracking  "),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+    parser.add_argument(
+        "--lora_rank",
+        type=int,
+        default=16,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--lora_alpha",
+        type=int,
+        default=27,
+        help=("The alpha constant of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--lora_dropout",
+        type=float,
+        default=0.0,
+        help="The dropout rate of the LoRA update matrices.",
+    )
+    parser.add_argument(
+        "--lora_bias",
+        type=str,
+        default="none",
+        help="The bias type of the Lora update matrices. Must be 'none', 'all' or 'lora_only'.",
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    return args
+
+
+class RealFillDataset(Dataset):
+    """
+    A dataset to prepare the training and conditioning images and
+    the masks with the dummy prompt for fine-tuning the model.
+    It pre-processes the images, masks and tokenizes the prompts.
+    """
+
+    def __init__(
+        self,
+        train_data_root,
+        tokenizer,
+        size=512,
+    ):
+        self.size = size
+        self.tokenizer = tokenizer
+
+        self.ref_data_root = Path(train_data_root) / "ref"
+        self.target_image = Path(train_data_root) / "target" / "target.png"
+        self.target_mask = Path(train_data_root) / "target" / "mask.png"
+        if not (self.ref_data_root.exists() and self.target_image.exists() and self.target_mask.exists()):
+            raise ValueError("Train images root doesn't exists.")
+
+        self.train_images_path = list(self.ref_data_root.iterdir()) + [self.target_image]
+        self.num_train_images = len(self.train_images_path)
+        self.train_prompt = "a photo of sks"
+
+        self.transform = transforms_v2.Compose(
+            [
+                transforms_v2.ToImage(),
+                transforms_v2.RandomResize(size, int(1.125 * size)),
+                transforms_v2.RandomCrop(size),
+                transforms_v2.ToDtype(torch.float32, scale=True),
+                transforms_v2.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self.num_train_images
+
+    def __getitem__(self, index):
+        example = {}
+
+        image = Image.open(self.train_images_path[index])
+        image = exif_transpose(image)
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        if index < len(self) - 1:
+            weighting = Image.new("L", image.size)
+        else:
+            weighting = Image.open(self.target_mask)
+            weighting = exif_transpose(weighting)
+
+        image, weighting = self.transform(image, weighting)
+        example["images"], example["weightings"] = image, weighting < 0
+
+        if random.random() < 0.1:
+            example["masks"] = torch.ones_like(example["images"][0:1, :, :])
+        else:
+            example["masks"] = make_mask(example["images"], self.size)
+
+        example["conditioning_images"] = example["images"] * (example["masks"] < 0.5)
+
+        train_prompt = "" if random.random() < 0.1 else self.train_prompt
+        example["prompt_ids"] = self.tokenizer(
+            train_prompt,
+            truncation=True,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids
+
+        return example
+
+
+def collate_fn(examples):
+    input_ids = [example["prompt_ids"] for example in examples]
+    images = [example["images"] for example in examples]
+
+    masks = [example["masks"] for example in examples]
+    weightings = [example["weightings"] for example in examples]
+    conditioning_images = [example["conditioning_images"] for example in examples]
+
+    images = torch.stack(images)
+    images = images.to(memory_format=torch.contiguous_format).float()
+
+    masks = torch.stack(masks)
+    masks = masks.to(memory_format=torch.contiguous_format).float()
+
+    weightings = torch.stack(weightings)
+    weightings = weightings.to(memory_format=torch.contiguous_format).float()
+
+    conditioning_images = torch.stack(conditioning_images)
+    conditioning_images = conditioning_images.to(memory_format=torch.contiguous_format).float()
+
+    input_ids = torch.cat(input_ids, dim=0)
+
+    batch = {
+        "input_ids": input_ids,
+        "images": images,
+        "masks": masks,
+        "weightings": weightings,
+        "conditioning_images": conditioning_images,
+    }
+    return batch
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_dir=logging_dir,
+    )
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+        wandb.login(key=args.wandb_key)
+        wandb.init(project=args.wandb_project_name)
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+    )
+
+    config = LoraConfig(
+        r=args.lora_rank,
+        lora_alpha=args.lora_alpha,
+        target_modules=["to_k", "to_q", "to_v", "key", "query", "value"],
+        lora_dropout=args.lora_dropout,
+        bias=args.lora_bias,
+    )
+    unet = get_peft_model(unet, config)
+
+    config = LoraConfig(
+        r=args.lora_rank,
+        lora_alpha=args.lora_alpha,
+        target_modules=["k_proj", "q_proj", "v_proj"],
+        lora_dropout=args.lora_dropout,
+        bias=args.lora_bias,
+    )
+    text_encoder = get_peft_model(text_encoder, config)
+
+    vae.requires_grad_(False)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        text_encoder.gradient_checkpointing_enable()
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            for model in models:
+                sub_dir = (
+                    "unet"
+                    if isinstance(model.base_model.model, type(accelerator.unwrap_model(unet).base_model.model))
+                    else "text_encoder"
+                )
+                model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+    def load_model_hook(models, input_dir):
+        while len(models) > 0:
+            # pop models so that they are not loaded again
+            model = models.pop()
+
+            sub_dir = (
+                "unet"
+                if isinstance(model.base_model.model, type(accelerator.unwrap_model(unet).base_model.model))
+                else "text_encoder"
+            )
+            model_cls = (
+                UNet2DConditionModel
+                if isinstance(model.base_model.model, type(accelerator.unwrap_model(unet).base_model.model))
+                else CLIPTextModel
+            )
+
+            load_model = model_cls.from_pretrained(args.pretrained_model_name_or_path, subfolder=sub_dir)
+            load_model = PeftModel.from_pretrained(load_model, input_dir, subfolder=sub_dir)
+
+            model.load_state_dict(load_model.state_dict())
+            del load_model
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.unet_learning_rate = (
+            args.unet_learning_rate
+            * args.gradient_accumulation_steps
+            * args.train_batch_size
+            * accelerator.num_processes
+        )
+
+        args.text_encoder_learning_rate = (
+            args.text_encoder_learning_rate
+            * args.gradient_accumulation_steps
+            * args.train_batch_size
+            * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    optimizer = optimizer_class(
+        [
+            {"params": unet.parameters(), "lr": args.unet_learning_rate},
+            {"params": text_encoder.parameters(), "lr": args.text_encoder_learning_rate},
+        ],
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = RealFillDataset(
+        train_data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=collate_fn,
+        num_workers=1,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, text_encoder, optimizer, train_dataloader = accelerator.prepare(
+        unet, text_encoder, optimizer, train_dataloader
+    )
+
+    # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae to device and cast to weight_dtype
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = vars(copy.deepcopy(args))
+        accelerator.init_trackers("realfill", config=tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        text_encoder.train()
+
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet, text_encoder):
+                # Convert images to latent space
+                latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * 0.18215
+
+                # Convert masked images to latent space
+                conditionings = vae.encode(batch["conditioning_images"].to(dtype=weight_dtype)).latent_dist.sample()
+                conditionings = conditionings * 0.18215
+
+                # Downsample mask and weighting so that they match with the latents
+                masks, size = batch["masks"].to(dtype=weight_dtype), latents.shape[2:]
+                masks = F.interpolate(masks, size=size)
+
+                weightings = batch["weightings"].to(dtype=weight_dtype)
+                weightings = F.interpolate(weightings, size=size)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Concatenate noisy latents, masks and conditionings to get inputs to unet
+                inputs = torch.cat([noisy_latents, masks, conditionings], dim=1)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Predict the noise residual
+                model_pred = unet(inputs, timesteps, encoder_hidden_states).sample
+
+                # Compute the diffusion loss
+                assert noise_scheduler.config.prediction_type == "epsilon"
+                loss = (weightings * F.mse_loss(model_pred.float(), noise.float(), reduction="none")).mean()
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = itertools.chain(unet.parameters(), text_encoder.parameters())
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                if args.report_to == "wandb":
+                    accelerator.print(progress_bar)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if global_step % args.validation_steps == 0:
+                        log_validation(
+                            text_encoder,
+                            tokenizer,
+                            unet,
+                            args,
+                            accelerator,
+                            weight_dtype,
+                            global_step,
+                        )
+
+            logs = {"loss": loss.detach().item()}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        pipeline = StableDiffusionInpaintPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            unet=accelerator.unwrap_model(unet, keep_fp32_wrapper=True).merge_and_unload(),
+            text_encoder=accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True).merge_and_unload(),
+            revision=args.revision,
+        )
+
+        pipeline.save_pretrained(args.output_dir)
+
+        # Final inference
+        images = log_validation(
+            text_encoder,
+            tokenizer,
+            unet,
+            args,
+            accelerator,
+            weight_dtype,
+            global_step,
+        )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/scheduled_huber_loss_training/README.md b/diffusers/examples/research_projects/scheduled_huber_loss_training/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..239f94ba10057b20b193606a0678bc4853934ee7
--- /dev/null
+++ b/diffusers/examples/research_projects/scheduled_huber_loss_training/README.md
@@ -0,0 +1,15 @@
+# Scheduled Pseudo-Huber Loss for Diffusers
+
+These are the modifications of to include the possibility of training text2image models with Scheduled Pseudo Huber loss, introduced in https://arxiv.org/abs/2403.16728. (https://github.com/kabachuha/SPHL-for-stable-diffusion)
+
+## Why this might be useful?
+
+- If you suspect that the part of the training dataset might be corrupted, and you don't want these outliers to distort the model's supposed output
+
+- If you want to improve the aesthetic quality of pictures by helping the model disentangle concepts and be less influenced by another sorts of pictures.
+
+See https://github.com/huggingface/diffusers/issues/7488 for the detailed description.
+
+## Instructions
+
+The same usage as in the case of the corresponding vanilla Diffusers scripts https://github.com/huggingface/diffusers/tree/main/examples
diff --git a/diffusers/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth.py b/diffusers/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f7ca2262dccfaa19cfc3750c2a2f317f4f6c4f5
--- /dev/null
+++ b/diffusers/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth.py
@@ -0,0 +1,1518 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import gc
+import importlib
+import itertools
+import logging
+import math
+import os
+import shutil
+import warnings
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, model_info, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from packaging import version
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import compute_snr
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.28.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images: list = None,
+    base_model: str = None,
+    train_text_encoder=False,
+    prompt: str = None,
+    repo_folder: str = None,
+    pipeline: DiffusionPipeline = None,
+):
+    img_str = ""
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# DreamBooth - {repo_id}
+
+This is a dreambooth model derived from {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/).
+You can find some example images in the following. \n
+{img_str}
+
+DreamBooth for the text encoder was enabled: {train_text_encoder}.
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        prompt=prompt,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = ["text-to-image", "dreambooth", "diffusers-training"]
+    if isinstance(pipeline, StableDiffusionPipeline):
+        tags.extend(["stable-diffusion", "stable-diffusion-diffusers"])
+    else:
+        tags.extend(["if", "if-diffusers"])
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    text_encoder,
+    tokenizer,
+    unet,
+    vae,
+    args,
+    accelerator,
+    weight_dtype,
+    global_step,
+    prompt_embeds,
+    negative_prompt_embeds,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+
+    pipeline_args = {}
+
+    if vae is not None:
+        pipeline_args["vae"] = vae
+
+    # create pipeline (note: unet and vae are loaded again in float32)
+    pipeline = DiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        unet=unet,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+        **pipeline_args,
+    )
+
+    # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+    scheduler_args = {}
+
+    if "variance_type" in pipeline.scheduler.config:
+        variance_type = pipeline.scheduler.config.variance_type
+
+        if variance_type in ["learned", "learned_range"]:
+            variance_type = "fixed_small"
+
+        scheduler_args["variance_type"] = variance_type
+
+    module = importlib.import_module("diffusers")
+    scheduler_class = getattr(module, args.validation_scheduler)
+    pipeline.scheduler = scheduler_class.from_config(pipeline.scheduler.config, **scheduler_args)
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.pre_compute_text_embeddings:
+        pipeline_args = {
+            "prompt_embeds": prompt_embeds,
+            "negative_prompt_embeds": negative_prompt_embeds,
+        }
+    else:
+        pipeline_args = {"prompt": args.validation_prompt}
+
+    # run inference
+    generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed)
+    images = []
+    if args.validation_images is None:
+        for _ in range(args.num_validation_images):
+            with torch.autocast("cuda"):
+                image = pipeline(**pipeline_args, num_inference_steps=25, generator=generator).images[0]
+            images.append(image)
+    else:
+        for image in args.validation_images:
+            image = Image.open(image)
+            image = pipeline(**pipeline_args, image=image, generator=generator).images[0]
+            images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, global_step, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+
+        return RobertaSeriesModelWithTransformation
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="dreambooth-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
+            "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
+            "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
+            "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
+            "instructions."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more details"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+
+    parser.add_argument(
+        "--offset_noise",
+        action="store_true",
+        default=False,
+        help=(
+            "Fine-tuning against a modified noise"
+            " See: https://www.crosslabs.org//blog/diffusion-with-offset-noise for more information."
+        ),
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--pre_compute_text_embeddings",
+        action="store_true",
+        help="Whether or not to pre-compute text embeddings. If text embeddings are pre-computed, the text encoder will not be kept in memory during training and will leave more GPU memory available for training the rest of the model. This is not compatible with `--train_text_encoder`.",
+    )
+    parser.add_argument(
+        "--tokenizer_max_length",
+        type=int,
+        default=None,
+        required=False,
+        help="The maximum length of the tokenizer. If not set, will default to the tokenizer's max length.",
+    )
+    parser.add_argument(
+        "--text_encoder_use_attention_mask",
+        action="store_true",
+        required=False,
+        help="Whether to use attention mask for the text encoder",
+    )
+    parser.add_argument(
+        "--skip_save_text_encoder", action="store_true", required=False, help="Set to not save text encoder"
+    )
+    parser.add_argument(
+        "--validation_images",
+        required=False,
+        default=None,
+        nargs="+",
+        help="Optional set of images to use for validation. Used when the target pipeline takes an initial image as input such as when training image variation or superresolution.",
+    )
+    parser.add_argument(
+        "--class_labels_conditioning",
+        required=False,
+        default=None,
+        help="The optional `class_label` conditioning to pass to the unet, available values are `timesteps`.",
+    )
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber", "smooth_l1"],
+        help="The type of loss to use and whether it's timestep-scheduled. See Issue #7488 for more info.",
+    )
+    parser.add_argument(
+        "--huber_schedule",
+        type=str,
+        default="snr",
+        choices=["constant", "exponential", "snr"],
+        help="The schedule to use for the huber losses parameter",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.1,
+        help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type.",
+    )
+    parser.add_argument(
+        "--validation_scheduler",
+        type=str,
+        default="DPMSolverMultistepScheduler",
+        choices=["DPMSolverMultistepScheduler", "DDPMScheduler"],
+        help="Select which scheduler to use for validation. DDPMScheduler is recommended for DeepFloyd IF.",
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    if args.train_text_encoder and args.pre_compute_text_embeddings:
+        raise ValueError("`--train_text_encoder` cannot be used with `--pre_compute_text_embeddings`")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        class_num=None,
+        size=512,
+        center_crop=False,
+        encoder_hidden_states=None,
+        class_prompt_encoder_hidden_states=None,
+        tokenizer_max_length=None,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+        self.encoder_hidden_states = encoder_hidden_states
+        self.class_prompt_encoder_hidden_states = class_prompt_encoder_hidden_states
+        self.tokenizer_max_length = tokenizer_max_length
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError(f"Instance {self.instance_data_root} images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        instance_image = exif_transpose(instance_image)
+
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        example["instance_images"] = self.image_transforms(instance_image)
+
+        if self.encoder_hidden_states is not None:
+            example["instance_prompt_ids"] = self.encoder_hidden_states
+        else:
+            text_inputs = tokenize_prompt(
+                self.tokenizer, self.instance_prompt, tokenizer_max_length=self.tokenizer_max_length
+            )
+            example["instance_prompt_ids"] = text_inputs.input_ids
+            example["instance_attention_mask"] = text_inputs.attention_mask
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+
+            if self.class_prompt_encoder_hidden_states is not None:
+                example["class_prompt_ids"] = self.class_prompt_encoder_hidden_states
+            else:
+                class_text_inputs = tokenize_prompt(
+                    self.tokenizer, self.class_prompt, tokenizer_max_length=self.tokenizer_max_length
+                )
+                example["class_prompt_ids"] = class_text_inputs.input_ids
+                example["class_attention_mask"] = class_text_inputs.attention_mask
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    has_attention_mask = "instance_attention_mask" in examples[0]
+
+    input_ids = [example["instance_prompt_ids"] for example in examples]
+    pixel_values = [example["instance_images"] for example in examples]
+
+    if has_attention_mask:
+        attention_mask = [example["instance_attention_mask"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        input_ids += [example["class_prompt_ids"] for example in examples]
+        pixel_values += [example["class_images"] for example in examples]
+
+        if has_attention_mask:
+            attention_mask += [example["class_attention_mask"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    input_ids = torch.cat(input_ids, dim=0)
+
+    batch = {
+        "input_ids": input_ids,
+        "pixel_values": pixel_values,
+    }
+
+    if has_attention_mask:
+        attention_mask = torch.cat(attention_mask, dim=0)
+        batch["attention_mask"] = attention_mask
+
+    return batch
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def model_has_vae(args):
+    config_file_name = os.path.join("vae", AutoencoderKL.config_name)
+    if os.path.isdir(args.pretrained_model_name_or_path):
+        config_file_name = os.path.join(args.pretrained_model_name_or_path, config_file_name)
+        return os.path.isfile(config_file_name)
+    else:
+        files_in_repo = model_info(args.pretrained_model_name_or_path, revision=args.revision).siblings
+        return any(file.rfilename == config_file_name for file in files_in_repo)
+
+
+def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None):
+    if tokenizer_max_length is not None:
+        max_length = tokenizer_max_length
+    else:
+        max_length = tokenizer.model_max_length
+
+    text_inputs = tokenizer(
+        prompt,
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt",
+    )
+
+    return text_inputs
+
+
+def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_attention_mask=None):
+    text_input_ids = input_ids.to(text_encoder.device)
+
+    if text_encoder_use_attention_mask:
+        attention_mask = attention_mask.to(text_encoder.device)
+    else:
+        attention_mask = None
+
+    prompt_embeds = text_encoder(
+        text_input_ids,
+        attention_mask=attention_mask,
+        return_dict=False,
+    )
+    prompt_embeds = prompt_embeds[0]
+
+    return prompt_embeds
+
+
+# NOTE: if you're using the scheduled version, huber_c has to depend on the timesteps already
+def conditional_loss(
+    model_pred: torch.Tensor,
+    target: torch.Tensor,
+    reduction: str = "mean",
+    loss_type: str = "l2",
+    huber_c: float = 0.1,
+):
+    if loss_type == "l2":
+        loss = F.mse_loss(model_pred, target, reduction=reduction)
+    elif loss_type == "huber":
+        loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    elif loss_type == "smooth_l1":
+        loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    else:
+        raise NotImplementedError(f"Unsupported Loss Type {loss_type}")
+    return loss
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                safety_checker=None,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+
+    # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+
+    if model_has_vae(args):
+        vae = AutoencoderKL.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+        )
+    else:
+        vae = None
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            for model in models:
+                sub_dir = "unet" if isinstance(model, type(unwrap_model(unet))) else "text_encoder"
+                model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+    def load_model_hook(models, input_dir):
+        while len(models) > 0:
+            # pop models so that they are not loaded again
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(text_encoder))):
+                # load transformers style into model
+                load_model = text_encoder_cls.from_pretrained(input_dir, subfolder="text_encoder")
+                model.config = load_model.config
+            else:
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+            model.load_state_dict(load_model.state_dict())
+            del load_model
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if vae is not None:
+        vae.requires_grad_(False)
+
+    if not args.train_text_encoder:
+        text_encoder.requires_grad_(False)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        "Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training. copy of the weights should still be float32."
+    )
+
+    if unwrap_model(unet).dtype != torch.float32:
+        raise ValueError(f"Unet loaded as datatype {unwrap_model(unet).dtype}. {low_precision_error_string}")
+
+    if args.train_text_encoder and unwrap_model(text_encoder).dtype != torch.float32:
+        raise ValueError(
+            f"Text encoder loaded as datatype {unwrap_model(text_encoder).dtype}." f" {low_precision_error_string}"
+        )
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = (
+        itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
+    )
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    if args.pre_compute_text_embeddings:
+
+        def compute_text_embeddings(prompt):
+            with torch.no_grad():
+                text_inputs = tokenize_prompt(tokenizer, prompt, tokenizer_max_length=args.tokenizer_max_length)
+                prompt_embeds = encode_prompt(
+                    text_encoder,
+                    text_inputs.input_ids,
+                    text_inputs.attention_mask,
+                    text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+                )
+
+            return prompt_embeds
+
+        pre_computed_encoder_hidden_states = compute_text_embeddings(args.instance_prompt)
+        validation_prompt_negative_prompt_embeds = compute_text_embeddings("")
+
+        if args.validation_prompt is not None:
+            validation_prompt_encoder_hidden_states = compute_text_embeddings(args.validation_prompt)
+        else:
+            validation_prompt_encoder_hidden_states = None
+
+        if args.class_prompt is not None:
+            pre_computed_class_prompt_encoder_hidden_states = compute_text_embeddings(args.class_prompt)
+        else:
+            pre_computed_class_prompt_encoder_hidden_states = None
+
+        text_encoder = None
+        tokenizer = None
+
+        gc.collect()
+        torch.cuda.empty_cache()
+    else:
+        pre_computed_encoder_hidden_states = None
+        validation_prompt_encoder_hidden_states = None
+        validation_prompt_negative_prompt_embeds = None
+        pre_computed_class_prompt_encoder_hidden_states = None
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        class_num=args.num_class_images,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+        encoder_hidden_states=pre_computed_encoder_hidden_states,
+        class_prompt_encoder_hidden_states=pre_computed_class_prompt_encoder_hidden_states,
+        tokenizer_max_length=args.tokenizer_max_length,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae and text_encoder to device and cast to weight_dtype
+    if vae is not None:
+        vae.to(accelerator.device, dtype=weight_dtype)
+
+    if not args.train_text_encoder and text_encoder is not None:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = vars(copy.deepcopy(args))
+        tracker_config.pop("validation_images")
+        accelerator.init_trackers("dreambooth", config=tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.train_text_encoder:
+            text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+
+                if vae is not None:
+                    # Convert images to latent space
+                    model_input = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                    model_input = model_input * vae.config.scaling_factor
+                else:
+                    model_input = pixel_values
+
+                # Sample noise that we'll add to the model input
+                if args.offset_noise:
+                    noise = torch.randn_like(model_input) + 0.1 * torch.randn(
+                        model_input.shape[0], model_input.shape[1], 1, 1, device=model_input.device
+                    )
+                else:
+                    noise = torch.randn_like(model_input)
+                bsz, channels, height, width = model_input.shape
+                # Sample a random timestep for each image
+                if args.loss_type == "huber" or args.loss_type == "smooth_l1":
+                    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (1,), device="cpu")
+                    timestep = timesteps.item()
+
+                    if args.huber_schedule == "exponential":
+                        alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps
+                        huber_c = math.exp(-alpha * timestep)
+                    elif args.huber_schedule == "snr":
+                        alphas_cumprod = noise_scheduler.alphas_cumprod[timestep]
+                        sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
+                        huber_c = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c
+                    elif args.huber_schedule == "constant":
+                        huber_c = args.huber_c
+                    else:
+                        raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!")
+
+                    timesteps = timesteps.repeat(bsz).to(model_input.device)
+                elif args.loss_type == "l2":
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                    )
+                    huber_c = 1  # may be anything, as it's not used
+                else:
+                    raise NotImplementedError(f"Unknown loss type {args.loss_type}")
+
+                timesteps = timesteps.long()
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                if args.pre_compute_text_embeddings:
+                    encoder_hidden_states = batch["input_ids"]
+                else:
+                    encoder_hidden_states = encode_prompt(
+                        text_encoder,
+                        batch["input_ids"],
+                        batch["attention_mask"],
+                        text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+                    )
+
+                if unwrap_model(unet).config.in_channels == channels * 2:
+                    noisy_model_input = torch.cat([noisy_model_input, noisy_model_input], dim=1)
+
+                if args.class_labels_conditioning == "timesteps":
+                    class_labels = timesteps
+                else:
+                    class_labels = None
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_model_input, timesteps, encoder_hidden_states, class_labels=class_labels, return_dict=False
+                )[0]
+
+                if model_pred.shape[1] == 6:
+                    model_pred, _ = torch.chunk(model_pred, 2, dim=1)
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+                    # Compute prior loss
+                    prior_loss = conditional_loss(
+                        model_pred_prior.float(),
+                        target_prior.float(),
+                        reduction="mean",
+                        loss_type=args.loss_type,
+                        huber_c=huber_c,
+                    )
+
+                # Compute instance loss
+                if args.snr_gamma is None:
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c
+                    )
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    base_weight = (
+                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
+                    )
+
+                    if noise_scheduler.config.prediction_type == "v_prediction":
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c
+                    )
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(unet.parameters(), text_encoder.parameters())
+                        if args.train_text_encoder
+                        else unet.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    images = []
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        images = log_validation(
+                            unwrap_model(text_encoder) if text_encoder is not None else text_encoder,
+                            tokenizer,
+                            unwrap_model(unet),
+                            vae,
+                            args,
+                            accelerator,
+                            weight_dtype,
+                            global_step,
+                            validation_prompt_encoder_hidden_states,
+                            validation_prompt_negative_prompt_embeds,
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        pipeline_args = {}
+
+        if text_encoder is not None:
+            pipeline_args["text_encoder"] = unwrap_model(text_encoder)
+
+        if args.skip_save_text_encoder:
+            pipeline_args["text_encoder"] = None
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            unet=unwrap_model(unet),
+            revision=args.revision,
+            variant=args.variant,
+            **pipeline_args,
+        )
+
+        # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+        scheduler_args = {}
+
+        if "variance_type" in pipeline.scheduler.config:
+            variance_type = pipeline.scheduler.config.variance_type
+
+            if variance_type in ["learned", "learned_range"]:
+                variance_type = "fixed_small"
+
+            scheduler_args["variance_type"] = variance_type
+
+        pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
+        pipeline.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                train_text_encoder=args.train_text_encoder,
+                prompt=args.instance_prompt,
+                repo_folder=args.output_dir,
+                pipeline=pipeline,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py b/diffusers/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..663dbbf99473fab9e4e02849f7ca55ad88d30152
--- /dev/null
+++ b/diffusers/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py
@@ -0,0 +1,1504 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import copy
+import gc
+import logging
+import math
+import os
+import shutil
+import warnings
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from packaging import version
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params
+from diffusers.utils import (
+    check_min_version,
+    convert_state_dict_to_diffusers,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.28.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images=None,
+    base_model=str,
+    train_text_encoder=False,
+    prompt=str,
+    repo_folder=None,
+    pipeline: DiffusionPipeline = None,
+):
+    img_str = ""
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# LoRA DreamBooth - {repo_id}
+
+These are LoRA adaption weights for {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/). You can find some example images in the following. \n
+{img_str}
+
+LoRA for the text encoder was enabled: {train_text_encoder}.
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        prompt=prompt,
+        model_description=model_description,
+        inference=True,
+    )
+    tags = ["text-to-image", "diffusers", "lora", "diffusers-training"]
+    if isinstance(pipeline, StableDiffusionPipeline):
+        tags.extend(["stable-diffusion", "stable-diffusion-diffusers"])
+    else:
+        tags.extend(["if", "if-diffusers"])
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+    scheduler_args = {}
+
+    if "variance_type" in pipeline.scheduler.config:
+        variance_type = pipeline.scheduler.config.variance_type
+
+        if variance_type in ["learned", "learned_range"]:
+            variance_type = "fixed_small"
+
+        scheduler_args["variance_type"] = variance_type
+
+    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+
+    if args.validation_images is None:
+        images = []
+        for _ in range(args.num_validation_images):
+            with torch.cuda.amp.autocast():
+                image = pipeline(**pipeline_args, generator=generator).images[0]
+                images.append(image)
+    else:
+        images = []
+        for image in args.validation_images:
+            image = Image.open(image)
+            with torch.cuda.amp.autocast():
+                image = pipeline(**pipeline_args, image=image, generator=generator).images[0]
+            images.append(image)
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+
+        return RobertaSeriesModelWithTransformation
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lora-dreambooth-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--pre_compute_text_embeddings",
+        action="store_true",
+        help="Whether or not to pre-compute text embeddings. If text embeddings are pre-computed, the text encoder will not be kept in memory during training and will leave more GPU memory available for training the rest of the model. This is not compatible with `--train_text_encoder`.",
+    )
+    parser.add_argument(
+        "--tokenizer_max_length",
+        type=int,
+        default=None,
+        required=False,
+        help="The maximum length of the tokenizer. If not set, will default to the tokenizer's max length.",
+    )
+    parser.add_argument(
+        "--text_encoder_use_attention_mask",
+        action="store_true",
+        required=False,
+        help="Whether to use attention mask for the text encoder",
+    )
+    parser.add_argument(
+        "--validation_images",
+        required=False,
+        default=None,
+        nargs="+",
+        help="Optional set of images to use for validation. Used when the target pipeline takes an initial image as input such as when training image variation or superresolution.",
+    )
+    parser.add_argument(
+        "--class_labels_conditioning",
+        required=False,
+        default=None,
+        help="The optional `class_label` conditioning to pass to the unet, available values are `timesteps`.",
+    )
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber", "smooth_l1"],
+        help="The type of loss to use and whether it's timestep-scheduled. See Issue #7488 for more info.",
+    )
+    parser.add_argument(
+        "--huber_schedule",
+        type=str,
+        default="snr",
+        choices=["constant", "exponential", "snr"],
+        help="The schedule to use for the huber losses parameter",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.1,
+        help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type.",
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    if args.train_text_encoder and args.pre_compute_text_embeddings:
+        raise ValueError("`--train_text_encoder` cannot be used with `--pre_compute_text_embeddings`")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        class_num=None,
+        size=512,
+        center_crop=False,
+        encoder_hidden_states=None,
+        class_prompt_encoder_hidden_states=None,
+        tokenizer_max_length=None,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+        self.encoder_hidden_states = encoder_hidden_states
+        self.class_prompt_encoder_hidden_states = class_prompt_encoder_hidden_states
+        self.tokenizer_max_length = tokenizer_max_length
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        instance_image = exif_transpose(instance_image)
+
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        example["instance_images"] = self.image_transforms(instance_image)
+
+        if self.encoder_hidden_states is not None:
+            example["instance_prompt_ids"] = self.encoder_hidden_states
+        else:
+            text_inputs = tokenize_prompt(
+                self.tokenizer, self.instance_prompt, tokenizer_max_length=self.tokenizer_max_length
+            )
+            example["instance_prompt_ids"] = text_inputs.input_ids
+            example["instance_attention_mask"] = text_inputs.attention_mask
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+
+            if self.class_prompt_encoder_hidden_states is not None:
+                example["class_prompt_ids"] = self.class_prompt_encoder_hidden_states
+            else:
+                class_text_inputs = tokenize_prompt(
+                    self.tokenizer, self.class_prompt, tokenizer_max_length=self.tokenizer_max_length
+                )
+                example["class_prompt_ids"] = class_text_inputs.input_ids
+                example["class_attention_mask"] = class_text_inputs.attention_mask
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    has_attention_mask = "instance_attention_mask" in examples[0]
+
+    input_ids = [example["instance_prompt_ids"] for example in examples]
+    pixel_values = [example["instance_images"] for example in examples]
+
+    if has_attention_mask:
+        attention_mask = [example["instance_attention_mask"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        input_ids += [example["class_prompt_ids"] for example in examples]
+        pixel_values += [example["class_images"] for example in examples]
+        if has_attention_mask:
+            attention_mask += [example["class_attention_mask"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    input_ids = torch.cat(input_ids, dim=0)
+
+    batch = {
+        "input_ids": input_ids,
+        "pixel_values": pixel_values,
+    }
+
+    if has_attention_mask:
+        batch["attention_mask"] = attention_mask
+
+    return batch
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None):
+    if tokenizer_max_length is not None:
+        max_length = tokenizer_max_length
+    else:
+        max_length = tokenizer.model_max_length
+
+    text_inputs = tokenizer(
+        prompt,
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt",
+    )
+
+    return text_inputs
+
+
+def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_attention_mask=None):
+    text_input_ids = input_ids.to(text_encoder.device)
+
+    if text_encoder_use_attention_mask:
+        attention_mask = attention_mask.to(text_encoder.device)
+    else:
+        attention_mask = None
+
+    prompt_embeds = text_encoder(
+        text_input_ids,
+        attention_mask=attention_mask,
+        return_dict=False,
+    )
+    prompt_embeds = prompt_embeds[0]
+
+    return prompt_embeds
+
+
+# NOTE: if you're using the scheduled version, huber_c has to depend on the timesteps already
+def conditional_loss(
+    model_pred: torch.Tensor,
+    target: torch.Tensor,
+    reduction: str = "mean",
+    loss_type: str = "l2",
+    huber_c: float = 0.1,
+):
+    if loss_type == "l2":
+        loss = F.mse_loss(model_pred, target, reduction=reduction)
+    elif loss_type == "huber":
+        loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    elif loss_type == "smooth_l1":
+        loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    else:
+        raise NotImplementedError(f"Unsupported Loss Type {loss_type}")
+    return loss
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (sayakpaul): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                safety_checker=None,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+
+    # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    try:
+        vae = AutoencoderKL.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+        )
+    except OSError:
+        # IF does not have a VAE so let's just set it to None
+        # We don't have to error out here
+        vae = None
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # We only train the additional adapter LoRA layers
+    if vae is not None:
+        vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    if vae is not None:
+        vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
+
+    # now we will add new LoRA weights to the attention layers
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0", "add_k_proj", "add_v_proj"],
+    )
+    unet.add_adapter(unet_lora_config)
+
+    # The text encoder comes from 🤗 transformers, we will also attach adapters to it.
+    if args.train_text_encoder:
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder.add_adapter(text_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder atten layers
+            unet_lora_layers_to_save = None
+            text_encoder_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                elif isinstance(model, type(unwrap_model(text_encoder))):
+                    text_encoder_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionLoraLoaderMixin.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=text_encoder_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+        text_encoder_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(unet))):
+                unet_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder))):
+                text_encoder_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
+
+        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        if args.train_text_encoder:
+            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_)
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [unet_]
+            if args.train_text_encoder:
+                models.append(text_encoder_)
+
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params(models, dtype=torch.float32)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [unet]
+        if args.train_text_encoder:
+            models.append(text_encoder)
+
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(models, dtype=torch.float32)
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = list(filter(lambda p: p.requires_grad, unet.parameters()))
+    if args.train_text_encoder:
+        params_to_optimize = params_to_optimize + list(filter(lambda p: p.requires_grad, text_encoder.parameters()))
+
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    if args.pre_compute_text_embeddings:
+
+        def compute_text_embeddings(prompt):
+            with torch.no_grad():
+                text_inputs = tokenize_prompt(tokenizer, prompt, tokenizer_max_length=args.tokenizer_max_length)
+                prompt_embeds = encode_prompt(
+                    text_encoder,
+                    text_inputs.input_ids,
+                    text_inputs.attention_mask,
+                    text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+                )
+
+            return prompt_embeds
+
+        pre_computed_encoder_hidden_states = compute_text_embeddings(args.instance_prompt)
+        validation_prompt_negative_prompt_embeds = compute_text_embeddings("")
+
+        if args.validation_prompt is not None:
+            validation_prompt_encoder_hidden_states = compute_text_embeddings(args.validation_prompt)
+        else:
+            validation_prompt_encoder_hidden_states = None
+
+        if args.class_prompt is not None:
+            pre_computed_class_prompt_encoder_hidden_states = compute_text_embeddings(args.class_prompt)
+        else:
+            pre_computed_class_prompt_encoder_hidden_states = None
+
+        text_encoder = None
+        tokenizer = None
+
+        gc.collect()
+        torch.cuda.empty_cache()
+    else:
+        pre_computed_encoder_hidden_states = None
+        validation_prompt_encoder_hidden_states = None
+        validation_prompt_negative_prompt_embeds = None
+        pre_computed_class_prompt_encoder_hidden_states = None
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        class_num=args.num_class_images,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+        encoder_hidden_states=pre_computed_encoder_hidden_states,
+        class_prompt_encoder_hidden_states=pre_computed_class_prompt_encoder_hidden_states,
+        tokenizer_max_length=args.tokenizer_max_length,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = vars(copy.deepcopy(args))
+        tracker_config.pop("validation_images")
+        accelerator.init_trackers("dreambooth-lora", config=tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.train_text_encoder:
+            text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+
+                if vae is not None:
+                    # Convert images to latent space
+                    model_input = vae.encode(pixel_values).latent_dist.sample()
+                    model_input = model_input * vae.config.scaling_factor
+                else:
+                    model_input = pixel_values
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz, channels, height, width = model_input.shape
+                # Sample a random timestep for each image
+                if args.loss_type == "huber" or args.loss_type == "smooth_l1":
+                    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (1,), device="cpu")
+                    timestep = timesteps.item()
+
+                    if args.huber_schedule == "exponential":
+                        alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps
+                        huber_c = math.exp(-alpha * timestep)
+                    elif args.huber_schedule == "snr":
+                        alphas_cumprod = noise_scheduler.alphas_cumprod[timestep]
+                        sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
+                        huber_c = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c
+                    elif args.huber_schedule == "constant":
+                        huber_c = args.huber_c
+                    else:
+                        raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!")
+
+                    timesteps = timesteps.repeat(bsz).to(model_input.device)
+                elif args.loss_type == "l2":
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                    )
+                    huber_c = 1  # may be anything, as it's not used
+                else:
+                    raise NotImplementedError(f"Unknown loss type {args.loss_type}")
+
+                timesteps = timesteps.long()
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                if args.pre_compute_text_embeddings:
+                    encoder_hidden_states = batch["input_ids"]
+                else:
+                    encoder_hidden_states = encode_prompt(
+                        text_encoder,
+                        batch["input_ids"],
+                        batch["attention_mask"],
+                        text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+                    )
+
+                if unwrap_model(unet).config.in_channels == channels * 2:
+                    noisy_model_input = torch.cat([noisy_model_input, noisy_model_input], dim=1)
+
+                if args.class_labels_conditioning == "timesteps":
+                    class_labels = timesteps
+                else:
+                    class_labels = None
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_model_input,
+                    timesteps,
+                    encoder_hidden_states,
+                    class_labels=class_labels,
+                    return_dict=False,
+                )[0]
+
+                # if model predicts variance, throw away the prediction. we will only train on the
+                # simplified training objective. This means that all schedulers using the fine tuned
+                # model must be configured to use one of the fixed variance variance types.
+                if model_pred.shape[1] == 6:
+                    model_pred, _ = torch.chunk(model_pred, 2, dim=1)
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute instance loss
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c
+                    )
+
+                    # Compute prior loss
+                    prior_loss = conditional_loss(
+                        model_pred_prior.float(),
+                        target_prior.float(),
+                        reduction="mean",
+                        loss_type=args.loss_type,
+                        huber_c=huber_c,
+                    )
+
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+                else:
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c
+                    )
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                pipeline = DiffusionPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    unet=unwrap_model(unet),
+                    text_encoder=None if args.pre_compute_text_embeddings else unwrap_model(text_encoder),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+
+                if args.pre_compute_text_embeddings:
+                    pipeline_args = {
+                        "prompt_embeds": validation_prompt_encoder_hidden_states,
+                        "negative_prompt_embeds": validation_prompt_negative_prompt_embeds,
+                    }
+                else:
+                    pipeline_args = {"prompt": args.validation_prompt}
+
+                images = log_validation(
+                    pipeline,
+                    args,
+                    accelerator,
+                    pipeline_args,
+                    epoch,
+                )
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        unet = unet.to(torch.float32)
+
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        if args.train_text_encoder:
+            text_encoder = unwrap_model(text_encoder)
+            text_encoder_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(text_encoder))
+        else:
+            text_encoder_state_dict = None
+
+        StableDiffusionLoraLoaderMixin.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_state_dict,
+            text_encoder_lora_layers=text_encoder_state_dict,
+        )
+
+        # Final inference
+        # Load previous pipeline
+        pipeline = DiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path, revision=args.revision, variant=args.variant, torch_dtype=weight_dtype
+        )
+
+        # load attention processors
+        pipeline.load_lora_weights(args.output_dir, weight_name="pytorch_lora_weights.safetensors")
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline_args = {"prompt": args.validation_prompt, "num_inference_steps": 25}
+            images = log_validation(
+                pipeline,
+                args,
+                accelerator,
+                pipeline_args,
+                epoch,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                train_text_encoder=args.train_text_encoder,
+                prompt=args.instance_prompt,
+                repo_folder=args.output_dir,
+                pipeline=pipeline,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py b/diffusers/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..d16780131139123a61cff6398a6bd45d26678993
--- /dev/null
+++ b/diffusers/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py
@@ -0,0 +1,2078 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import contextlib
+import gc
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import shutil
+import warnings
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, hf_hub_download, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from packaging import version
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from safetensors.torch import load_file, save_file
+from torch.utils.data import Dataset
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DPMSolverMultistepScheduler,
+    EDMEulerScheduler,
+    EulerDiscreteScheduler,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr
+from diffusers.utils import (
+    check_min_version,
+    convert_all_state_dict_to_peft,
+    convert_state_dict_to_diffusers,
+    convert_state_dict_to_kohya,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.28.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def determine_scheduler_type(pretrained_model_name_or_path, revision):
+    model_index_filename = "model_index.json"
+    if os.path.isdir(pretrained_model_name_or_path):
+        model_index = os.path.join(pretrained_model_name_or_path, model_index_filename)
+    else:
+        model_index = hf_hub_download(
+            repo_id=pretrained_model_name_or_path, filename=model_index_filename, revision=revision
+        )
+
+    with open(model_index, "r") as f:
+        scheduler_type = json.load(f)["scheduler"][1]
+    return scheduler_type
+
+
+def save_model_card(
+    repo_id: str,
+    use_dora: bool,
+    images=None,
+    base_model: str = None,
+    train_text_encoder=False,
+    instance_prompt=None,
+    validation_prompt=None,
+    repo_folder=None,
+    vae_path=None,
+):
+    widget_dict = []
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            widget_dict.append(
+                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
+            )
+
+    model_description = f"""
+# {'SDXL' if 'playground' not in base_model else 'Playground'} LoRA DreamBooth - {repo_id}
+
+<Gallery />
+
+## Model description
+
+These are {repo_id} LoRA adaption weights for {base_model}.
+
+The weights were trained  using [DreamBooth](https://dreambooth.github.io/).
+
+LoRA for the text encoder was enabled: {train_text_encoder}.
+
+Special VAE used for training: {vae_path}.
+
+## Trigger words
+
+You should use {instance_prompt} to trigger the image generation.
+
+## Download model
+
+Weights for this model are available in Safetensors format.
+
+[Download]({repo_id}/tree/main) them in the Files & versions tab.
+
+"""
+    if "playground" in base_model:
+        model_description += """\n
+## License
+
+Please adhere to the licensing terms as described [here](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic/blob/main/LICENSE.md).
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="openrail++" if "playground" not in base_model else "playground-v2dot5-community",
+        base_model=base_model,
+        prompt=instance_prompt,
+        model_description=model_description,
+        widget=widget_dict,
+    )
+    tags = [
+        "text-to-image",
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+        "lora" if not use_dora else "dora",
+        "template:sd-lora",
+    ]
+    if "playground" in base_model:
+        tags.extend(["playground", "playground-diffusers"])
+    else:
+        tags.extend(["stable-diffusion-xl", "stable-diffusion-xl-diffusers"])
+
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+
+    # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+    scheduler_args = {}
+
+    if not args.do_edm_style_training:
+        if "variance_type" in pipeline.scheduler.config:
+            variance_type = pipeline.scheduler.config.variance_type
+
+            if variance_type in ["learned", "learned_range"]:
+                variance_type = "fixed_small"
+
+            scheduler_args["variance_type"] = variance_type
+
+        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    # Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better
+    # way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
+    inference_ctx = (
+        contextlib.nullcontext() if "playground" in args.pretrained_model_name_or_path else torch.cuda.amp.autocast()
+    )
+
+    with inference_ctx:
+        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help=("A folder containing the training data. "),
+    )
+
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing the target image. By "
+        "default, the standard Image Dataset maps out 'file_name' "
+        "to 'image'.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default=None,
+        help="The column of the dataset containing the instance prompt for each image",
+    )
+
+    parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
+
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--do_edm_style_training",
+        default=False,
+        action="store_true",
+        help="Flag to conduct training using the EDM formulation as introduced in https://arxiv.org/abs/2206.00364.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="lora-dreambooth-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--output_kohya_format",
+        action="store_true",
+        help="Flag to additionally generate final state dict in the Kohya format so that it becomes compatible with A111, Comfy, Kohya, etc.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+
+    parser.add_argument(
+        "--text_encoder_lr",
+        type=float,
+        default=5e-6,
+        help="Text encoder learning rate to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="AdamW",
+        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="coefficients for computing the Prodigy stepsize using running averages. If set to None, "
+        "uses the value of square root of beta2. Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
+    )
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+
+    parser.add_argument(
+        "--prodigy_use_bias_correction",
+        type=bool,
+        default=True,
+        help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW",
+    )
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        type=bool,
+        default=True,
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. "
+        "Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--use_dora",
+        action="store_true",
+        default=False,
+        help=(
+            "Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
+            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
+        ),
+    )
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber", "smooth_l1"],
+        help="The type of loss to use and whether it's timestep-scheduled. See Issue #7488 for more info.",
+    )
+    parser.add_argument(
+        "--huber_schedule",
+        type=str,
+        default="snr",
+        choices=["constant", "exponential", "snr"],
+        help="The schedule to use for the huber losses parameter",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.1,
+        help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type.",
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.instance_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`")
+
+    if args.dataset_name is not None and args.instance_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        class_prompt,
+        class_data_root=None,
+        class_num=None,
+        size=1024,
+        repeats=1,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.custom_instance_prompts = None
+        self.class_prompt = class_prompt
+
+        # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory,
+        # we load the training data using load_dataset
+        if args.dataset_name is not None:
+            try:
+                from datasets import load_dataset
+            except ImportError:
+                raise ImportError(
+                    "You are trying to load your data using the datasets library. If you wish to train using custom "
+                    "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                    "local folder containing images only, specify --instance_data_dir instead."
+                )
+            # Downloading and loading a dataset from the hub.
+            # See more about loading custom images at
+            # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+            dataset = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                cache_dir=args.cache_dir,
+            )
+            # Preprocessing the datasets.
+            column_names = dataset["train"].column_names
+
+            # 6. Get the column names for input/target.
+            if args.image_column is None:
+                image_column = column_names[0]
+                logger.info(f"image column defaulting to {image_column}")
+            else:
+                image_column = args.image_column
+                if image_column not in column_names:
+                    raise ValueError(
+                        f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+            instance_images = dataset["train"][image_column]
+
+            if args.caption_column is None:
+                logger.info(
+                    "No caption column provided, defaulting to instance_prompt for all images. If your dataset "
+                    "contains captions/prompts for the images, make sure to specify the "
+                    "column as --caption_column"
+                )
+                self.custom_instance_prompts = None
+            else:
+                if args.caption_column not in column_names:
+                    raise ValueError(
+                        f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+                custom_instance_prompts = dataset["train"][args.caption_column]
+                # create final list of captions according to --repeats
+                self.custom_instance_prompts = []
+                for caption in custom_instance_prompts:
+                    self.custom_instance_prompts.extend(itertools.repeat(caption, repeats))
+        else:
+            self.instance_data_root = Path(instance_data_root)
+            if not self.instance_data_root.exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+            self.custom_instance_prompts = None
+
+        self.instance_images = []
+        for img in instance_images:
+            self.instance_images.extend(itertools.repeat(img, repeats))
+
+        # image processing to prepare for using SD-XL micro-conditioning
+        self.original_sizes = []
+        self.crop_top_lefts = []
+        self.pixel_values = []
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
+        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
+        train_flip = transforms.RandomHorizontalFlip(p=1.0)
+        train_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+        for image in self.instance_images:
+            image = exif_transpose(image)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+            self.original_sizes.append((image.height, image.width))
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            crop_top_left = (y1, x1)
+            self.crop_top_lefts.append(crop_top_left)
+            image = train_transforms(image)
+            self.pixel_values.append(image)
+
+        self.num_instance_images = len(self.instance_images)
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = self.pixel_values[index % self.num_instance_images]
+        original_size = self.original_sizes[index % self.num_instance_images]
+        crop_top_left = self.crop_top_lefts[index % self.num_instance_images]
+        example["instance_images"] = instance_image
+        example["original_size"] = original_size
+        example["crop_top_left"] = crop_top_left
+
+        if self.custom_instance_prompts:
+            caption = self.custom_instance_prompts[index % self.num_instance_images]
+            if caption:
+                example["instance_prompt"] = caption
+            else:
+                example["instance_prompt"] = self.instance_prompt
+
+        else:  # custom prompts were provided, but length does not match size of image dataset
+            example["instance_prompt"] = self.instance_prompt
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt"] = self.class_prompt
+
+        return example
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompts = [example["instance_prompt"] for example in examples]
+    original_sizes = [example["original_size"] for example in examples]
+    crop_top_lefts = [example["crop_top_left"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        pixel_values += [example["class_images"] for example in examples]
+        prompts += [example["class_prompt"] for example in examples]
+        original_sizes += [example["original_size"] for example in examples]
+        crop_top_lefts += [example["crop_top_left"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    batch = {
+        "pixel_values": pixel_values,
+        "prompts": prompts,
+        "original_sizes": original_sizes,
+        "crop_top_lefts": crop_top_lefts,
+    }
+    return batch
+
+
+class PromptDataset(Dataset):
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def tokenize_prompt(tokenizer, prompt):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(text_encoders, tokenizers, prompt, text_input_ids_list=None):
+    prompt_embeds_list = []
+
+    for i, text_encoder in enumerate(text_encoders):
+        if tokenizers is not None:
+            tokenizer = tokenizers[i]
+            text_input_ids = tokenize_prompt(tokenizer, prompt)
+        else:
+            assert text_input_ids_list is not None
+            text_input_ids = text_input_ids_list[i]
+
+        prompt_embeds = text_encoder(
+            text_input_ids.to(text_encoder.device), output_hidden_states=True, return_dict=False
+        )
+
+        # We are only ALWAYS interested in the pooled output of the final text encoder
+        pooled_prompt_embeds = prompt_embeds[0]
+        prompt_embeds = prompt_embeds[-1][-2]
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+        prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+# NOTE: if you're using the scheduled version, huber_c has to depend on the timesteps already
+def conditional_loss(
+    model_pred: torch.Tensor,
+    target: torch.Tensor,
+    reduction: str = "mean",
+    loss_type: str = "l2",
+    huber_c: float = 0.1,
+    weighting: Optional[torch.Tensor] = None,
+):
+    if loss_type == "l2":
+        if weighting is not None:
+            loss = torch.mean(
+                (weighting * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                1,
+            )
+            if reduction == "mean":
+                loss = torch.mean(loss)
+            elif reduction == "sum":
+                loss = torch.sum(loss)
+        else:
+            loss = F.mse_loss(model_pred.float(), target.float(), reduction=reduction)
+
+    elif loss_type == "huber":
+        if weighting is not None:
+            loss = torch.mean(
+                (
+                    2
+                    * huber_c
+                    * (
+                        torch.sqrt(weighting.float() * (model_pred.float() - target.float()) ** 2 + huber_c**2)
+                        - huber_c
+                    )
+                ).reshape(target.shape[0], -1),
+                1,
+            )
+            if reduction == "mean":
+                loss = torch.mean(loss)
+            elif reduction == "sum":
+                loss = torch.sum(loss)
+        else:
+            loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+            if reduction == "mean":
+                loss = torch.mean(loss)
+            elif reduction == "sum":
+                loss = torch.sum(loss)
+    elif loss_type == "smooth_l1":
+        if weighting is not None:
+            loss = torch.mean(
+                (
+                    2
+                    * (
+                        torch.sqrt(weighting.float() * (model_pred.float() - target.float()) ** 2 + huber_c**2)
+                        - huber_c
+                    )
+                ).reshape(target.shape[0], -1),
+                1,
+            )
+            if reduction == "mean":
+                loss = torch.mean(loss)
+            elif reduction == "sum":
+                loss = torch.sum(loss)
+        else:
+            loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+            if reduction == "mean":
+                loss = torch.mean(loss)
+            elif reduction == "sum":
+                loss = torch.sum(loss)
+    else:
+        raise NotImplementedError(f"Unsupported Loss Type {loss_type}")
+    return loss
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if args.do_edm_style_training and args.snr_gamma is not None:
+        raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+            pipeline = StableDiffusionXLPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    scheduler_type = determine_scheduler_type(args.pretrained_model_name_or_path, args.revision)
+    if "EDM" in scheduler_type:
+        args.do_edm_style_training = True
+        noise_scheduler = EDMEulerScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+        logger.info("Performing EDM-style training!")
+    elif args.do_edm_style_training:
+        noise_scheduler = EulerDiscreteScheduler.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="scheduler"
+        )
+        logger.info("Performing EDM-style training!")
+    else:
+        noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    latents_mean = latents_std = None
+    if hasattr(vae.config, "latents_mean") and vae.config.latents_mean is not None:
+        latents_mean = torch.tensor(vae.config.latents_mean).view(1, 4, 1, 1)
+    if hasattr(vae.config, "latents_std") and vae.config.latents_std is not None:
+        latents_std = torch.tensor(vae.config.latents_std).view(1, 4, 1, 1)
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # We only train the additional adapter LoRA layers
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+
+    # The VAE is always in float32 to avoid NaN losses.
+    vae.to(accelerator.device, dtype=torch.float32)
+
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, "
+                    "please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+            text_encoder_two.gradient_checkpointing_enable()
+
+    # now we will add new LoRA weights to the attention layers
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        use_dora=args.use_dora,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    unet.add_adapter(unet_lora_config)
+
+    # The text encoder comes from 🤗 transformers, so we cannot directly modify it.
+    # So, instead, we monkey-patch the forward calls of its attention-blocks.
+    if args.train_text_encoder:
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            use_dora=args.use_dora,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder_one.add_adapter(text_lora_config)
+        text_encoder_two.add_adapter(text_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder atten layers
+            unet_lora_layers_to_save = None
+            text_encoder_one_lora_layers_to_save = None
+            text_encoder_two_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                    text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
+                elif isinstance(model, type(unwrap_model(text_encoder_two))):
+                    text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionXLPipeline.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=text_encoder_one_lora_layers_to_save,
+                text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+        text_encoder_one_ = None
+        text_encoder_two_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(unet))):
+                unet_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                text_encoder_one_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_two))):
+                text_encoder_two_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
+
+        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        if args.train_text_encoder:
+            # Do we need to call `scale_lora_layers()` here?
+            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_)
+
+            _set_state_dict_into_text_encoder(
+                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_
+            )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [unet_]
+            if args.train_text_encoder:
+                models.extend([text_encoder_one_, text_encoder_two_])
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params(models)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [unet]
+        if args.train_text_encoder:
+            models.extend([text_encoder_one, text_encoder_two])
+
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(models, dtype=torch.float32)
+
+    unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters()))
+
+    if args.train_text_encoder:
+        text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
+        text_lora_parameters_two = list(filter(lambda p: p.requires_grad, text_encoder_two.parameters()))
+
+    # Optimization parameters
+    unet_lora_parameters_with_lr = {"params": unet_lora_parameters, "lr": args.learning_rate}
+    if args.train_text_encoder:
+        # different learning rate for text encoder and unet
+        text_lora_parameters_one_with_lr = {
+            "params": text_lora_parameters_one,
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        text_lora_parameters_two_with_lr = {
+            "params": text_lora_parameters_two,
+            "weight_decay": args.adam_weight_decay_text_encoder,
+            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
+        }
+        params_to_optimize = [
+            unet_lora_parameters_with_lr,
+            text_lora_parameters_one_with_lr,
+            text_lora_parameters_two_with_lr,
+        ]
+    else:
+        params_to_optimize = [unet_lora_parameters_with_lr]
+
+    # Optimizer creation
+    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    if args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+        if args.train_text_encoder and args.text_encoder_lr:
+            logger.warning(
+                f"Learning rates were provided both for the unet and the text encoder- e.g. text_encoder_lr:"
+                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
+                f"When using prodigy only learning_rate is used as the initial learning rate."
+            )
+            # changes the learning rate of text_encoder_parameters_one and text_encoder_parameters_two to be
+            # --learning_rate
+            params_to_optimize[1]["lr"] = args.learning_rate
+            params_to_optimize[2]["lr"] = args.learning_rate
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            lr=args.learning_rate,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_prompt=args.class_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_num=args.num_class_images,
+        size=args.resolution,
+        repeats=args.repeats,
+        center_crop=args.center_crop,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Computes additional embeddings/ids required by the SDXL UNet.
+    # regular text embeddings (when `train_text_encoder` is not True)
+    # pooled text embeddings
+    # time ids
+
+    def compute_time_ids(original_size, crops_coords_top_left):
+        # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+        target_size = (args.resolution, args.resolution)
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+        add_time_ids = torch.tensor([add_time_ids])
+        add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+        return add_time_ids
+
+    if not args.train_text_encoder:
+        tokenizers = [tokenizer_one, tokenizer_two]
+        text_encoders = [text_encoder_one, text_encoder_two]
+
+        def compute_text_embeddings(prompt, text_encoders, tokenizers):
+            with torch.no_grad():
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(text_encoders, tokenizers, prompt)
+                prompt_embeds = prompt_embeds.to(accelerator.device)
+                pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
+            return prompt_embeds, pooled_prompt_embeds
+
+    # If no type of tuning is done on the text_encoder and custom instance prompts are NOT
+    # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
+    # the redundant encoding.
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        instance_prompt_hidden_states, instance_pooled_prompt_embeds = compute_text_embeddings(
+            args.instance_prompt, text_encoders, tokenizers
+        )
+
+    # Handle class prompt for prior-preservation.
+    if args.with_prior_preservation:
+        if not args.train_text_encoder:
+            class_prompt_hidden_states, class_pooled_prompt_embeds = compute_text_embeddings(
+                args.class_prompt, text_encoders, tokenizers
+            )
+
+    # Clear the memory here
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
+        del tokenizers, text_encoders
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
+    # pack the statically computed variables appropriately here. This is so that we don't
+    # have to pass them to the dataloader.
+
+    if not train_dataset.custom_instance_prompts:
+        if not args.train_text_encoder:
+            prompt_embeds = instance_prompt_hidden_states
+            unet_add_text_embeds = instance_pooled_prompt_embeds
+            if args.with_prior_preservation:
+                prompt_embeds = torch.cat([prompt_embeds, class_prompt_hidden_states], dim=0)
+                unet_add_text_embeds = torch.cat([unet_add_text_embeds, class_pooled_prompt_embeds], dim=0)
+        # if we're optimizing the text encoder (both if instance prompt is used for all images or custom prompts) we need to tokenize and encode the
+        # batch prompts on all training steps
+        else:
+            tokens_one = tokenize_prompt(tokenizer_one, args.instance_prompt)
+            tokens_two = tokenize_prompt(tokenizer_two, args.instance_prompt)
+            if args.with_prior_preservation:
+                class_tokens_one = tokenize_prompt(tokenizer_one, args.class_prompt)
+                class_tokens_two = tokenize_prompt(tokenizer_two, args.class_prompt)
+                tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0)
+                tokens_two = torch.cat([tokens_two, class_tokens_two], dim=0)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = (
+            "dreambooth-lora-sd-xl"
+            if "playground" not in args.pretrained_model_name_or_path
+            else "dreambooth-lora-playground"
+        )
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.train_text_encoder:
+            text_encoder_one.train()
+            text_encoder_two.train()
+
+            # set top parameter requires_grad = True for gradient checkpointing works
+            accelerator.unwrap_model(text_encoder_one).text_model.embeddings.requires_grad_(True)
+            accelerator.unwrap_model(text_encoder_two).text_model.embeddings.requires_grad_(True)
+
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                prompts = batch["prompts"]
+
+                # encode batch prompts when custom prompts are provided for each image -
+                if train_dataset.custom_instance_prompts:
+                    if not args.train_text_encoder:
+                        prompt_embeds, unet_add_text_embeds = compute_text_embeddings(
+                            prompts, text_encoders, tokenizers
+                        )
+                    else:
+                        tokens_one = tokenize_prompt(tokenizer_one, prompts)
+                        tokens_two = tokenize_prompt(tokenizer_two, prompts)
+
+                # Convert images to latent space
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+
+                if latents_mean is None and latents_std is None:
+                    model_input = model_input * vae.config.scaling_factor
+                    if args.pretrained_vae_model_name_or_path is None:
+                        model_input = model_input.to(weight_dtype)
+                else:
+                    latents_mean = latents_mean.to(device=model_input.device, dtype=model_input.dtype)
+                    latents_std = latents_std.to(device=model_input.device, dtype=model_input.dtype)
+                    model_input = (model_input - latents_mean) * vae.config.scaling_factor / latents_std
+                    model_input = model_input.to(dtype=weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+
+                # Sample a random timestep for each image
+                if not args.do_edm_style_training:
+                    if args.loss_type == "huber" or args.loss_type == "smooth_l1":
+                        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (1,), device="cpu")
+                        timestep = timesteps.item()
+
+                        if args.huber_schedule == "exponential":
+                            alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps
+                            huber_c = math.exp(-alpha * timestep)
+                        elif args.huber_schedule == "snr":
+                            alphas_cumprod = noise_scheduler.alphas_cumprod[timestep]
+                            sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
+                            huber_c = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c
+                        elif args.huber_schedule == "constant":
+                            huber_c = args.huber_c
+                        else:
+                            raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!")
+
+                        timesteps = timesteps.repeat(bsz).to(model_input.device)
+                    elif args.loss_type == "l2":
+                        timesteps = torch.randint(
+                            0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                        )
+                        huber_c = 1  # may be anything, as it's not used
+                    else:
+                        raise NotImplementedError(f"Unknown loss type {args.loss_type}")
+                    timesteps = timesteps.long()
+                else:
+                    if "huber" in args.loss_type or "l1" in args.loss_type:
+                        raise NotImplementedError("Huber loss is not implemented for EDM training yet!")
+                    # in EDM formulation, the model is conditioned on the pre-conditioned noise levels
+                    # instead of discrete timesteps, so here we sample indices to get the noise levels
+                    # from `scheduler.timesteps`
+                    indices = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,))
+                    timesteps = noise_scheduler.timesteps[indices].to(device=model_input.device)
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+                # For EDM-style training, we first obtain the sigmas based on the continuous timesteps.
+                # We then precondition the final model inputs based on these sigmas instead of the timesteps.
+                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                if args.do_edm_style_training:
+                    sigmas = get_sigmas(timesteps, len(noisy_model_input.shape), noisy_model_input.dtype)
+                    if "EDM" in scheduler_type:
+                        inp_noisy_latents = noise_scheduler.precondition_inputs(noisy_model_input, sigmas)
+                    else:
+                        inp_noisy_latents = noisy_model_input / ((sigmas**2 + 1) ** 0.5)
+
+                # time ids
+                add_time_ids = torch.cat(
+                    [
+                        compute_time_ids(original_size=s, crops_coords_top_left=c)
+                        for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])
+                    ]
+                )
+
+                # Calculate the elements to repeat depending on the use of prior-preservation and custom captions.
+                if not train_dataset.custom_instance_prompts:
+                    elems_to_repeat_text_embeds = bsz // 2 if args.with_prior_preservation else bsz
+                else:
+                    elems_to_repeat_text_embeds = 1
+
+                # Predict the noise residual
+                if not args.train_text_encoder:
+                    unet_added_conditions = {
+                        "time_ids": add_time_ids,
+                        "text_embeds": unet_add_text_embeds.repeat(elems_to_repeat_text_embeds, 1),
+                    }
+                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
+                    model_pred = unet(
+                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        timesteps,
+                        prompt_embeds_input,
+                        added_cond_kwargs=unet_added_conditions,
+                        return_dict=False,
+                    )[0]
+                else:
+                    unet_added_conditions = {"time_ids": add_time_ids}
+                    prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                        text_encoders=[text_encoder_one, text_encoder_two],
+                        tokenizers=None,
+                        prompt=None,
+                        text_input_ids_list=[tokens_one, tokens_two],
+                    )
+                    unet_added_conditions.update(
+                        {"text_embeds": pooled_prompt_embeds.repeat(elems_to_repeat_text_embeds, 1)}
+                    )
+                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
+                    model_pred = unet(
+                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        timesteps,
+                        prompt_embeds_input,
+                        added_cond_kwargs=unet_added_conditions,
+                        return_dict=False,
+                    )[0]
+
+                weighting = None
+                if args.do_edm_style_training:
+                    # Similar to the input preconditioning, the model predictions are also preconditioned
+                    # on noised model inputs (before preconditioning) and the sigmas.
+                    # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                    if "EDM" in scheduler_type:
+                        model_pred = noise_scheduler.precondition_outputs(noisy_model_input, model_pred, sigmas)
+                    else:
+                        if noise_scheduler.config.prediction_type == "epsilon":
+                            model_pred = model_pred * (-sigmas) + noisy_model_input
+                        elif noise_scheduler.config.prediction_type == "v_prediction":
+                            model_pred = model_pred * (-sigmas / (sigmas**2 + 1) ** 0.5) + (
+                                noisy_model_input / (sigmas**2 + 1)
+                            )
+                    # We are not doing weighting here because it tends result in numerical problems.
+                    # See: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
+                    # There might be other alternatives for weighting as well:
+                    # https://github.com/huggingface/diffusers/pull/7126#discussion_r1505404686
+                    if "EDM" not in scheduler_type:
+                        weighting = (sigmas**-2.0).float()
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = model_input if args.do_edm_style_training else noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = (
+                        model_input
+                        if args.do_edm_style_training
+                        else noise_scheduler.get_velocity(model_input, noise, timesteps)
+                    )
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute prior loss
+                    prior_loss = conditional_loss(
+                        model_pred_prior,
+                        target_prior,
+                        reduction="mean",
+                        loss_type=args.loss_type,
+                        huber_c=huber_c,
+                        weighting=weighting,
+                    )
+
+                if args.snr_gamma is None:
+                    loss = conditional_loss(
+                        model_pred,
+                        target,
+                        reduction="mean",
+                        loss_type=args.loss_type,
+                        huber_c=huber_c,
+                        weighting=weighting,
+                    )
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    base_weight = (
+                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
+                    )
+
+                    if noise_scheduler.config.prediction_type == "v_prediction":
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    loss = conditional_loss(
+                        model_pred, target, reduction="none", loss_type=args.loss_type, huber_c=huber_c, weighting=None
+                    )
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(unet_lora_parameters, text_lora_parameters_one, text_lora_parameters_two)
+                        if args.train_text_encoder
+                        else unet_lora_parameters
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                if not args.train_text_encoder:
+                    text_encoder_one = text_encoder_cls_one.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        subfolder="text_encoder",
+                        revision=args.revision,
+                        variant=args.variant,
+                    )
+                    text_encoder_two = text_encoder_cls_two.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        subfolder="text_encoder_2",
+                        revision=args.revision,
+                        variant=args.variant,
+                    )
+                pipeline = StableDiffusionXLPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    text_encoder=accelerator.unwrap_model(text_encoder_one),
+                    text_encoder_2=accelerator.unwrap_model(text_encoder_two),
+                    unet=accelerator.unwrap_model(unet),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline_args = {"prompt": args.validation_prompt}
+
+                images = log_validation(
+                    pipeline,
+                    args,
+                    accelerator,
+                    pipeline_args,
+                    epoch,
+                )
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        unet = unet.to(torch.float32)
+        unet_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            text_encoder_lora_layers = convert_state_dict_to_diffusers(
+                get_peft_model_state_dict(text_encoder_one.to(torch.float32))
+            )
+            text_encoder_two = unwrap_model(text_encoder_two)
+            text_encoder_2_lora_layers = convert_state_dict_to_diffusers(
+                get_peft_model_state_dict(text_encoder_two.to(torch.float32))
+            )
+        else:
+            text_encoder_lora_layers = None
+            text_encoder_2_lora_layers = None
+
+        StableDiffusionXLPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_layers,
+            text_encoder_lora_layers=text_encoder_lora_layers,
+            text_encoder_2_lora_layers=text_encoder_2_lora_layers,
+        )
+        if args.output_kohya_format:
+            lora_state_dict = load_file(f"{args.output_dir}/pytorch_lora_weights.safetensors")
+            peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict)
+            kohya_state_dict = convert_state_dict_to_kohya(peft_state_dict)
+            save_file(kohya_state_dict, f"{args.output_dir}/pytorch_lora_weights_kohya.safetensors")
+
+        # Final inference
+        # Load previous pipeline
+        vae = AutoencoderKL.from_pretrained(
+            vae_path,
+            subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+
+        # load attention processors
+        pipeline.load_lora_weights(args.output_dir)
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline_args = {"prompt": args.validation_prompt, "num_inference_steps": 25}
+            images = log_validation(
+                pipeline,
+                args,
+                accelerator,
+                pipeline_args,
+                epoch,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                use_dora=args.use_dora,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                train_text_encoder=args.train_text_encoder,
+                instance_prompt=args.instance_prompt,
+                validation_prompt=args.validation_prompt,
+                repo_folder=args.output_dir,
+                vae_path=args.pretrained_vae_model_name_or_path,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py b/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3bf95305dad758e28b71bf2264122ed241b4372
--- /dev/null
+++ b/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py
@@ -0,0 +1,1162 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.state import AcceleratorState
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+from transformers.utils import ContextManagers
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.utils import check_min_version, deprecate, is_wandb_available, make_image_grid
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.28.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def save_model_card(
+    args,
+    repo_id: str,
+    images: list = None,
+    repo_folder: str = None,
+):
+    img_str = ""
+    if len(images) > 0:
+        image_grid = make_image_grid(images, 1, len(args.validation_prompts))
+        image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png"))
+        img_str += "![val_imgs_grid](./val_imgs_grid.png)\n"
+
+    model_description = f"""
+# Text-to-image finetuning - {repo_id}
+
+This pipeline was finetuned from **{args.pretrained_model_name_or_path}** on the **{args.dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompts: {args.validation_prompts}: \n
+{img_str}
+
+## Pipeline usage
+
+You can use the pipeline like so:
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained("{repo_id}", torch_dtype=torch.float16)
+prompt = "{args.validation_prompts[0]}"
+image = pipeline(prompt).images[0]
+image.save("my_image.png")
+```
+
+## Training info
+
+These are the key hyperparameters used during training:
+
+* Epochs: {args.num_train_epochs}
+* Learning rate: {args.learning_rate}
+* Batch size: {args.train_batch_size}
+* Gradient accumulation steps: {args.gradient_accumulation_steps}
+* Image resolution: {args.resolution}
+* Mixed-precision: {args.mixed_precision}
+
+"""
+    wandb_info = ""
+    if is_wandb_available():
+        wandb_run_url = None
+        if wandb.run is not None:
+            wandb_run_url = wandb.run.url
+
+    if wandb_run_url is not None:
+        wandb_info = f"""
+More information on all the CLI arguments and the environment are available on your [`wandb` run page]({wandb_run_url}).
+"""
+
+    model_description += wandb_info
+
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=args.pretrained_model_name_or_path,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = ["stable-diffusion", "stable-diffusion-diffusers", "text-to-image", "diffusers", "diffusers-training"]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight_dtype, epoch):
+    logger.info("Running validation... ")
+
+    pipeline = StableDiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        vae=accelerator.unwrap_model(vae),
+        text_encoder=accelerator.unwrap_model(text_encoder),
+        tokenizer=tokenizer,
+        unet=accelerator.unwrap_model(unet),
+        safety_checker=None,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    images = []
+    for i in range(len(args.validation_prompts)):
+        with torch.autocast("cuda"):
+            image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0]
+
+        images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompts[i]}")
+                        for i, image in enumerate(images)
+                    ]
+                }
+            )
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--input_perturbation", type=float, default=0, help="The scale of input perturbation. Recommended 0.1."
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompts",
+        type=str,
+        default=None,
+        nargs="+",
+        help=("A set of prompts evaluated every `--validation_epochs` and logged to `--report_to`."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--non_ema_revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or"
+            " remote repository specified with --pretrained_model_name_or_path."
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+    )
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber", "smooth_l1"],
+        help="The type of loss to use and whether it's timestep-scheduled. See Issue #7488 for more info.",
+    )
+    parser.add_argument(
+        "--huber_schedule",
+        type=str,
+        default="snr",
+        choices=["constant", "exponential", "snr"],
+        help="The schedule to use for the huber losses parameter",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.1,
+        help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=5,
+        help="Run validation every X epochs.",
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    # default to using the same revision for the non-ema model if not specified
+    if args.non_ema_revision is None:
+        args.non_ema_revision = args.revision
+
+    return args
+
+
+# NOTE: if you're using the scheduled version, huber_c has to depend on the timesteps already
+def conditional_loss(
+    model_pred: torch.Tensor,
+    target: torch.Tensor,
+    reduction: str = "mean",
+    loss_type: str = "l2",
+    huber_c: float = 0.1,
+):
+    if loss_type == "l2":
+        loss = F.mse_loss(model_pred, target, reduction=reduction)
+    elif loss_type == "huber":
+        loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    elif loss_type == "smooth_l1":
+        loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    else:
+        raise NotImplementedError(f"Unsupported Loss Type {loss_type}")
+    return loss
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if args.non_ema_revision is not None:
+        deprecate(
+            "non_ema_revision!=None",
+            "0.15.0",
+            message=(
+                "Downloading 'non_ema' weights from revision branches of the Hub is deprecated. Please make sure to"
+                " use `--variant=non_ema` instead."
+            ),
+        )
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load scheduler, tokenizer and models.
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+    )
+
+    def deepspeed_zero_init_disabled_context_manager():
+        """
+        returns either a context list that includes one that will disable zero.Init or an empty context list
+        """
+        deepspeed_plugin = AcceleratorState().deepspeed_plugin if accelerate.state.is_initialized() else None
+        if deepspeed_plugin is None:
+            return []
+
+        return [deepspeed_plugin.zero3_init_context_manager(enable=False)]
+
+    # Currently Accelerate doesn't know how to handle multiple models under Deepspeed ZeRO stage 3.
+    # For this to work properly all models must be run through `accelerate.prepare`. But accelerate
+    # will try to assign the same optimizer with the same weights to all models during
+    # `deepspeed.initialize`, which of course doesn't work.
+    #
+    # For now the following workaround will partially support Deepspeed ZeRO-3, by excluding the 2
+    # frozen models from being partitioned during `zero.Init` which gets called during
+    # `from_pretrained` So CLIPTextModel and AutoencoderKL will not enjoy the parameter sharding
+    # across multiple gpus and only UNet2DConditionModel will get ZeRO sharded.
+    with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+        text_encoder = CLIPTextModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+        )
+        vae = AutoencoderKL.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+        )
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.non_ema_revision
+    )
+
+    # Freeze vae and text_encoder and set unet to trainable
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    unet.train()
+
+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = UNet2DConditionModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+        )
+        ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel)
+                ema_unet.load_state_dict(load_model.state_dict())
+                ema_unet.to(accelerator.device)
+                del load_model
+
+            for _ in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            data_dir=args.train_data_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids
+
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {"pixel_values": pixel_values, "input_ids": input_ids}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.use_ema:
+        ema_unet.to(accelerator.device)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+        args.mixed_precision = accelerator.mixed_precision
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+        args.mixed_precision = accelerator.mixed_precision
+
+    # Move text_encode and vae to gpu and cast to weight_dtype
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        tracker_config.pop("validation_prompts")
+        accelerator.init_trackers(args.tracker_project_name, tracker_config)
+
+    # Function for unwrapping if model was compiled with `torch.compile`.
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
+                    )
+                if args.input_perturbation:
+                    new_noise = noise + args.input_perturbation * torch.randn_like(noise)
+                bsz = latents.shape[0]
+
+                # Sample a random timestep for each image
+                if args.loss_type == "huber" or args.loss_type == "smooth_l1":
+                    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (1,), device="cpu")
+                    timestep = timesteps.item()
+
+                    if args.huber_schedule == "exponential":
+                        alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps
+                        huber_c = math.exp(-alpha * timestep)
+                    elif args.huber_schedule == "snr":
+                        alphas_cumprod = noise_scheduler.alphas_cumprod[timestep]
+                        sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
+                        huber_c = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c
+                    elif args.huber_schedule == "constant":
+                        huber_c = args.huber_c
+                    else:
+                        raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!")
+
+                    timesteps = timesteps.repeat(bsz).to(latents.device)
+                elif args.loss_type == "l2":
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
+                    )
+                    huber_c = 1  # may be anything, as it's not used
+                else:
+                    raise NotImplementedError(f"Unknown loss type {args.loss_type}")
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                if args.input_perturbation:
+                    noisy_latents = noise_scheduler.add_noise(latents, new_noise, timesteps)
+                else:
+                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0]
+
+                # Get the target for loss depending on the prediction type
+                if args.prediction_type is not None:
+                    # set prediction_type of scheduler if defined
+                    noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # Predict the noise residual and compute loss
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
+
+                if args.snr_gamma is None:
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c
+                    )
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c
+                    )
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_unet.step(unet.parameters())
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompts is not None and epoch % args.validation_epochs == 0:
+                if args.use_ema:
+                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                    ema_unet.store(unet.parameters())
+                    ema_unet.copy_to(unet.parameters())
+                log_validation(
+                    vae,
+                    text_encoder,
+                    tokenizer,
+                    unet,
+                    args,
+                    accelerator,
+                    weight_dtype,
+                    global_step,
+                )
+                if args.use_ema:
+                    # Switch back to the original UNet parameters.
+                    ema_unet.restore(unet.parameters())
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            revision=args.revision,
+            variant=args.variant,
+        )
+        pipeline.save_pretrained(args.output_dir)
+
+        # Run a final round of inference.
+        images = []
+        if args.validation_prompts is not None:
+            logger.info("Running inference for collecting generated images...")
+            pipeline = pipeline.to(accelerator.device)
+            pipeline.torch_dtype = weight_dtype
+            pipeline.set_progress_bar_config(disable=True)
+
+            if args.enable_xformers_memory_efficient_attention:
+                pipeline.enable_xformers_memory_efficient_attention()
+
+            if args.seed is None:
+                generator = None
+            else:
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+            for i in range(len(args.validation_prompts)):
+                with torch.autocast("cuda"):
+                    image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0]
+                images.append(image)
+
+        if args.push_to_hub:
+            save_model_card(args, repo_id, images, repo_folder=args.output_dir)
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py b/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4b4d69bb8923976c26b04bb29763555b1ff876e
--- /dev/null
+++ b/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py
@@ -0,0 +1,1051 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning script for Stable Diffusion for text2image with support for LoRA."""
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import cast_training_params, compute_snr
+from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.28.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def save_model_card(
+    repo_id: str,
+    images: list = None,
+    base_model: str = None,
+    dataset_name: str = None,
+    repo_folder: str = None,
+):
+    img_str = ""
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# LoRA text2image fine-tuning - {repo_id}
+These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
+{img_str}
+"""
+
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion",
+        "stable-diffusion-diffusers",
+        "text-to-image",
+        "diffusers",
+        "diffusers-training",
+        "lora",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+    )
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber", "smooth_l1"],
+        help="The type of loss to use and whether it's timestep-scheduled. See Issue #7488 for more info.",
+    )
+    parser.add_argument(
+        "--huber_schedule",
+        type=str,
+        default="snr",
+        choices=["constant", "exponential", "snr"],
+        help="The schedule to use for the huber losses parameter",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.1,
+        help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type.",
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+# NOTE: if you're using the scheduled version, huber_c has to depend on the timesteps already
+def conditional_loss(
+    model_pred: torch.Tensor,
+    target: torch.Tensor,
+    reduction: str = "mean",
+    loss_type: str = "l2",
+    huber_c: float = 0.1,
+):
+    if loss_type == "l2":
+        loss = F.mse_loss(model_pred, target, reduction=reduction)
+    elif loss_type == "huber":
+        loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    elif loss_type == "smooth_l1":
+        loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    else:
+        raise NotImplementedError(f"Unsupported Loss Type {loss_type}")
+    return loss
+
+
+def main():
+    args = parse_args()
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+    # Load scheduler, tokenizer and models.
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+    )
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+    # freeze parameters of models to save more memory
+    unet.requires_grad_(False)
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Freeze the unet parameters before adding adapters
+    for param in unet.parameters():
+        param.requires_grad_(False)
+
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # Add adapter and make sure the trainable params are in float32.
+    unet.add_adapter(unet_lora_config)
+    if args.mixed_precision == "fp16":
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(unet, dtype=torch.float32)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    lora_layers = filter(lambda p: p.requires_grad, unet.parameters())
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        lora_layers,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            data_dir=args.train_data_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids
+
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {"pixel_values": pixel_values, "input_ids": input_ids}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
+                    )
+
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                if args.loss_type == "huber" or args.loss_type == "smooth_l1":
+                    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (1,), device="cpu")
+                    timestep = timesteps.item()
+
+                    if args.huber_schedule == "exponential":
+                        alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps
+                        huber_c = math.exp(-alpha * timestep)
+                    elif args.huber_schedule == "snr":
+                        alphas_cumprod = noise_scheduler.alphas_cumprod[timestep]
+                        sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
+                        huber_c = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c
+                    elif args.huber_schedule == "constant":
+                        huber_c = args.huber_c
+                    else:
+                        raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!")
+
+                    timesteps = timesteps.repeat(bsz).to(latents.device)
+                elif args.loss_type == "l2":
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
+                    )
+                    huber_c = 1  # may be anything, as it's not used
+                else:
+                    raise NotImplementedError(f"Unknown loss type {args.loss_type}")
+
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0]
+
+                # Get the target for loss depending on the prediction type
+                if args.prediction_type is not None:
+                    # set prediction_type of scheduler if defined
+                    noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # Predict the noise residual and compute loss
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
+
+                if args.snr_gamma is None:
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c
+                    )
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c
+                    )
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = lora_layers
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+
+                        unwrapped_unet = unwrap_model(unet)
+                        unet_lora_state_dict = convert_state_dict_to_diffusers(
+                            get_peft_model_state_dict(unwrapped_unet)
+                        )
+
+                        StableDiffusionPipeline.save_lora_weights(
+                            save_directory=save_path,
+                            unet_lora_layers=unet_lora_state_dict,
+                            safe_serialization=True,
+                        )
+
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                # create pipeline
+                pipeline = DiffusionPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    unet=unwrap_model(unet),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)
+
+                # run inference
+                generator = torch.Generator(device=accelerator.device)
+                if args.seed is not None:
+                    generator = generator.manual_seed(args.seed)
+                images = []
+                with torch.cuda.amp.autocast():
+                    for _ in range(args.num_validation_images):
+                        images.append(
+                            pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]
+                        )
+
+                for tracker in accelerator.trackers:
+                    if tracker.name == "tensorboard":
+                        np_images = np.stack([np.asarray(img) for img in images])
+                        tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                    if tracker.name == "wandb":
+                        tracker.log(
+                            {
+                                "validation": [
+                                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                    for i, image in enumerate(images)
+                                ]
+                            }
+                        )
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unet.to(torch.float32)
+
+        unwrapped_unet = unwrap_model(unet)
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unwrapped_unet))
+        StableDiffusionPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_state_dict,
+            safe_serialization=True,
+        )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                dataset_name=args.dataset_name,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+        # Final inference
+        # Load previous pipeline
+        if args.validation_prompt is not None:
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                revision=args.revision,
+                variant=args.variant,
+                torch_dtype=weight_dtype,
+            )
+            pipeline = pipeline.to(accelerator.device)
+
+            # load attention processors
+            pipeline.load_lora_weights(args.output_dir)
+
+            # run inference
+            generator = torch.Generator(device=accelerator.device)
+            if args.seed is not None:
+                generator = generator.manual_seed(args.seed)
+            images = []
+            with torch.cuda.amp.autocast():
+                for _ in range(args.num_validation_images):
+                    images.append(
+                        pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]
+                    )
+
+            for tracker in accelerator.trackers:
+                if len(images) != 0:
+                    if tracker.name == "tensorboard":
+                        np_images = np.stack([np.asarray(img) for img in images])
+                        tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+                    if tracker.name == "wandb":
+                        tracker.log(
+                            {
+                                "test": [
+                                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                    for i, image in enumerate(images)
+                                ]
+                            }
+                        )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py b/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..bab86bf21a765c4fd20354c3e3ca298e13e2ba8b
--- /dev/null
+++ b/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py
@@ -0,0 +1,1384 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning script for Stable Diffusion XL for text2image with support for LoRA."""
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr
+from diffusers.utils import (
+    check_min_version,
+    convert_state_dict_to_diffusers,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.28.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images: list = None,
+    base_model: str = None,
+    dataset_name: str = None,
+    train_text_encoder: bool = False,
+    repo_folder: str = None,
+    vae_path: str = None,
+):
+    img_str = ""
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# LoRA text2image fine-tuning - {repo_id}
+
+These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
+{img_str}
+
+LoRA for the text encoder was enabled: {train_text_encoder}.
+
+Special VAE used for training: {vae_path}.
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion-xl",
+        "stable-diffusion-xl-diffusers",
+        "text-to-image",
+        "diffusers",
+        "diffusers-training",
+        "lora",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+    )
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber", "smooth_l1"],
+        help="The type of loss to use and whether it's timestep-scheduled. See Issue #7488 for more info.",
+    )
+    parser.add_argument(
+        "--huber_schedule",
+        type=str,
+        default="snr",
+        choices=["constant", "exponential", "snr"],
+        help="The schedule to use for the huber losses parameter",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.1,
+        help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type.",
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--debug_loss",
+        action="store_true",
+        help="debug loss for each image, if filenames are awailable in the dataset",
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def tokenize_prompt(tokenizer, prompt):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(text_encoders, tokenizers, prompt, text_input_ids_list=None):
+    prompt_embeds_list = []
+
+    for i, text_encoder in enumerate(text_encoders):
+        if tokenizers is not None:
+            tokenizer = tokenizers[i]
+            text_input_ids = tokenize_prompt(tokenizer, prompt)
+        else:
+            assert text_input_ids_list is not None
+            text_input_ids = text_input_ids_list[i]
+
+        prompt_embeds = text_encoder(
+            text_input_ids.to(text_encoder.device), output_hidden_states=True, return_dict=False
+        )
+
+        # We are only ALWAYS interested in the pooled output of the final text encoder
+        pooled_prompt_embeds = prompt_embeds[0]
+        prompt_embeds = prompt_embeds[-1][-2]
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+        prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+# NOTE: if you're using the scheduled version, huber_c has to depend on the timesteps already
+def conditional_loss(
+    model_pred: torch.Tensor,
+    target: torch.Tensor,
+    reduction: str = "mean",
+    loss_type: str = "l2",
+    huber_c: float = 0.1,
+):
+    if loss_type == "l2":
+        loss = F.mse_loss(model_pred, target, reduction=reduction)
+    elif loss_type == "huber" or loss_type == "huber_scheduled":
+        loss = huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    else:
+        raise NotImplementedError(f"Unsupported Loss Type {loss_type}")
+    return loss
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # We only train the additional adapter LoRA layers
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    unet.to(accelerator.device, dtype=weight_dtype)
+
+    if args.pretrained_vae_model_name_or_path is None:
+        vae.to(accelerator.device, dtype=torch.float32)
+    else:
+        vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # now we will add new LoRA weights to the attention layers
+    # Set correct lora layers
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+
+    unet.add_adapter(unet_lora_config)
+
+    # The text encoder comes from 🤗 transformers, we will also attach adapters to it.
+    if args.train_text_encoder:
+        # ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder_one.add_adapter(text_lora_config)
+        text_encoder_two.add_adapter(text_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder attn layers
+            unet_lora_layers_to_save = None
+            text_encoder_one_lora_layers_to_save = None
+            text_encoder_two_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(unwrap_model(model), type(unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_one))):
+                    text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
+                elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_two))):
+                    text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                if weights:
+                    weights.pop()
+
+            StableDiffusionXLPipeline.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=text_encoder_one_lora_layers_to_save,
+                text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+        text_encoder_one_ = None
+        text_encoder_two_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(unet))):
+                unet_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                text_encoder_one_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_two))):
+                text_encoder_two_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, _ = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
+        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        if args.train_text_encoder:
+            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_)
+
+            _set_state_dict_into_text_encoder(
+                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_
+            )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [unet_]
+            if args.train_text_encoder:
+                models.extend([text_encoder_one_, text_encoder_two_])
+            cast_training_params(models, dtype=torch.float32)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+            text_encoder_two.gradient_checkpointing_enable()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [unet]
+        if args.train_text_encoder:
+            models.extend([text_encoder_one, text_encoder_two])
+        cast_training_params(models, dtype=torch.float32)
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = list(filter(lambda p: p.requires_grad, unet.parameters()))
+    if args.train_text_encoder:
+        params_to_optimize = (
+            params_to_optimize
+            + list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
+            + list(filter(lambda p: p.requires_grad, text_encoder_two.parameters()))
+        )
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        tokens_one = tokenize_prompt(tokenizer_one, captions)
+        tokens_two = tokenize_prompt(tokenizer_two, captions)
+        return tokens_one, tokens_two
+
+    # Preprocessing the datasets.
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
+    train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
+    train_flip = transforms.RandomHorizontalFlip(p=1.0)
+    train_transforms = transforms.Compose(
+        [
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        # image aug
+        original_sizes = []
+        all_images = []
+        crop_top_lefts = []
+        for image in images:
+            original_sizes.append((image.height, image.width))
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            crop_top_left = (y1, x1)
+            crop_top_lefts.append(crop_top_left)
+            image = train_transforms(image)
+            all_images.append(image)
+
+        examples["original_sizes"] = original_sizes
+        examples["crop_top_lefts"] = crop_top_lefts
+        examples["pixel_values"] = all_images
+        tokens_one, tokens_two = tokenize_captions(examples)
+        examples["input_ids_one"] = tokens_one
+        examples["input_ids_two"] = tokens_two
+        if args.debug_loss:
+            fnames = [os.path.basename(image.filename) for image in examples[image_column] if image.filename]
+            if fnames:
+                examples["filenames"] = fnames
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train, output_all_columns=True)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        original_sizes = [example["original_sizes"] for example in examples]
+        crop_top_lefts = [example["crop_top_lefts"] for example in examples]
+        input_ids_one = torch.stack([example["input_ids_one"] for example in examples])
+        input_ids_two = torch.stack([example["input_ids_two"] for example in examples])
+        result = {
+            "pixel_values": pixel_values,
+            "input_ids_one": input_ids_one,
+            "input_ids_two": input_ids_two,
+            "original_sizes": original_sizes,
+            "crop_top_lefts": crop_top_lefts,
+        }
+
+        filenames = [example["filenames"] for example in examples if "filenames" in example]
+        if filenames:
+            result["filenames"] = filenames
+        return result
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.train_text_encoder:
+            text_encoder_one.train()
+            text_encoder_two.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                if args.pretrained_vae_model_name_or_path is not None:
+                    pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+                else:
+                    pixel_values = batch["pixel_values"]
+
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+                model_input = model_input * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    model_input = model_input.to(weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (model_input.shape[0], model_input.shape[1], 1, 1), device=model_input.device
+                    )
+
+                bsz = model_input.shape[0]
+                # Sample a random timestep for each image
+                if args.loss_type == "huber" or args.loss_type == "smooth_l1":
+                    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (1,), device="cpu")
+                    timestep = timesteps.item()
+
+                    if args.huber_schedule == "exponential":
+                        alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps
+                        huber_c = math.exp(-alpha * timestep)
+                    elif args.huber_schedule == "snr":
+                        alphas_cumprod = noise_scheduler.alphas_cumprod[timestep]
+                        sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
+                        huber_c = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c
+                    elif args.huber_schedule == "constant":
+                        huber_c = args.huber_c
+                    else:
+                        raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!")
+
+                    timesteps = timesteps.repeat(bsz).to(model_input.device)
+                elif args.loss_type == "l2":
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                    )
+                    huber_c = 1  # may be anything, as it's not used
+                else:
+                    raise NotImplementedError(f"Unknown loss type {args.loss_type}")
+
+                timesteps = timesteps.long()
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+
+                # time ids
+                def compute_time_ids(original_size, crops_coords_top_left):
+                    # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+                    target_size = (args.resolution, args.resolution)
+                    add_time_ids = list(original_size + crops_coords_top_left + target_size)
+                    add_time_ids = torch.tensor([add_time_ids])
+                    add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+                    return add_time_ids
+
+                add_time_ids = torch.cat(
+                    [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])]
+                )
+
+                # Predict the noise residual
+                unet_added_conditions = {"time_ids": add_time_ids}
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                    text_encoders=[text_encoder_one, text_encoder_two],
+                    tokenizers=None,
+                    prompt=None,
+                    text_input_ids_list=[batch["input_ids_one"], batch["input_ids_two"]],
+                )
+                unet_added_conditions.update({"text_embeds": pooled_prompt_embeds})
+                model_pred = unet(
+                    noisy_model_input,
+                    timesteps,
+                    prompt_embeds,
+                    added_cond_kwargs=unet_added_conditions,
+                    return_dict=False,
+                )[0]
+
+                # Get the target for loss depending on the prediction type
+                if args.prediction_type is not None:
+                    # set prediction_type of scheduler if defined
+                    noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.snr_gamma is None:
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c
+                    )
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c
+                    )
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+                if args.debug_loss and "filenames" in batch:
+                    for fname in batch["filenames"]:
+                        accelerator.log({"loss_for_" + fname: loss}, step=global_step)
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                # create pipeline
+                pipeline = StableDiffusionXLPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    text_encoder=unwrap_model(text_encoder_one),
+                    text_encoder_2=unwrap_model(text_encoder_two),
+                    unet=unwrap_model(unet),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)
+
+                # run inference
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+                pipeline_args = {"prompt": args.validation_prompt}
+
+                with torch.cuda.amp.autocast():
+                    images = [
+                        pipeline(**pipeline_args, generator=generator).images[0]
+                        for _ in range(args.num_validation_images)
+                    ]
+
+                for tracker in accelerator.trackers:
+                    if tracker.name == "tensorboard":
+                        np_images = np.stack([np.asarray(img) for img in images])
+                        tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                    if tracker.name == "wandb":
+                        tracker.log(
+                            {
+                                "validation": [
+                                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                    for i, image in enumerate(images)
+                                ]
+                            }
+                        )
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            text_encoder_two = unwrap_model(text_encoder_two)
+
+            text_encoder_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(text_encoder_one))
+            text_encoder_2_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(text_encoder_two))
+        else:
+            text_encoder_lora_layers = None
+            text_encoder_2_lora_layers = None
+
+        StableDiffusionXLPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_state_dict,
+            text_encoder_lora_layers=text_encoder_lora_layers,
+            text_encoder_2_lora_layers=text_encoder_2_lora_layers,
+        )
+
+        del unet
+        del text_encoder_one
+        del text_encoder_two
+        del text_encoder_lora_layers
+        del text_encoder_2_lora_layers
+        torch.cuda.empty_cache()
+
+        # Final inference
+        # Make sure vae.dtype is consistent with the unet.dtype
+        if args.mixed_precision == "fp16":
+            vae.to(weight_dtype)
+        # Load previous pipeline
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = pipeline.to(accelerator.device)
+
+        # load attention processors
+        pipeline.load_lora_weights(args.output_dir)
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+            images = [
+                pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+                for _ in range(args.num_validation_images)
+            ]
+
+            for tracker in accelerator.trackers:
+                if tracker.name == "tensorboard":
+                    np_images = np.stack([np.asarray(img) for img in images])
+                    tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+                if tracker.name == "wandb":
+                    tracker.log(
+                        {
+                            "test": [
+                                wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                for i, image in enumerate(images)
+                            ]
+                        }
+                    )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                dataset_name=args.dataset_name,
+                train_text_encoder=args.train_text_encoder,
+                repo_folder=args.output_dir,
+                vae_path=args.pretrained_vae_model_name_or_path,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py b/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..a056bcfc8cb10e33b409377792111065d53de14e
--- /dev/null
+++ b/diffusers/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py
@@ -0,0 +1,1394 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning script for Stable Diffusion XL for text2image."""
+
+import argparse
+import functools
+import gc
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import concatenate_datasets, load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionXLPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.28.0.dev0")
+
+logger = get_logger(__name__)
+
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def save_model_card(
+    repo_id: str,
+    images: list = None,
+    validation_prompt: str = None,
+    base_model: str = None,
+    dataset_name: str = None,
+    repo_folder: str = None,
+    vae_path: str = None,
+):
+    img_str = ""
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# Text-to-image finetuning - {repo_id}
+
+This pipeline was finetuned from **{base_model}** on the **{dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompt: {validation_prompt}: \n
+{img_str}
+
+Special VAE used for training: {vae_path}.
+"""
+
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion-xl",
+        "stable-diffusion-xl-diffusers",
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sdxl-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--timestep_bias_strategy",
+        type=str,
+        default="none",
+        choices=["earlier", "later", "range", "none"],
+        help=(
+            "The timestep bias strategy, which may help direct the model toward learning low or high frequency details."
+            " Choices: ['earlier', 'later', 'range', 'none']."
+            " The default is 'none', which means no bias is applied, and training proceeds normally."
+            " The value of 'later' will increase the frequency of the model's final training timesteps."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_multiplier",
+        type=float,
+        default=1.0,
+        help=(
+            "The multiplier for the bias. Defaults to 1.0, which means no bias is applied."
+            " A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_begin",
+        type=int,
+        default=0,
+        help=(
+            "When using `--timestep_bias_strategy=range`, the beginning (inclusive) timestep to bias."
+            " Defaults to zero, which equates to having no specific bias."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_end",
+        type=int,
+        default=1000,
+        help=(
+            "When using `--timestep_bias_strategy=range`, the final timestep (inclusive) to bias."
+            " Defaults to 1000, which is the number of timesteps that Stable Diffusion is trained on."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_portion",
+        type=float,
+        default=0.25,
+        help=(
+            "The portion of timesteps to bias. Defaults to 0.25, which 25% of timesteps will be biased."
+            " A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines"
+            " whether the biased portions are in the earlier or later timesteps."
+        ),
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+    )
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber", "smooth_l1"],
+        help="The type of loss to use and whether it's timestep-scheduled. See Issue #7488 for more info.",
+    )
+    parser.add_argument(
+        "--huber_schedule",
+        type=str,
+        default="snr",
+        choices=["constant", "exponential", "snr"],
+        help="The schedule to use for the huber losses parameter",
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.1,
+        help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type.",
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    return args
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(batch, text_encoders, tokenizers, proportion_empty_prompts, caption_column, is_train=True):
+    prompt_embeds_list = []
+    prompt_batch = batch[caption_column]
+
+    captions = []
+    for caption in prompt_batch:
+        if random.random() < proportion_empty_prompts:
+            captions.append("")
+        elif isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                captions,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(
+                text_input_ids.to(text_encoder.device),
+                output_hidden_states=True,
+                return_dict=False,
+            )
+
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds[-1][-2]
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+            prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return {"prompt_embeds": prompt_embeds.cpu(), "pooled_prompt_embeds": pooled_prompt_embeds.cpu()}
+
+
+def compute_vae_encodings(batch, vae):
+    images = batch.pop("pixel_values")
+    pixel_values = torch.stack(list(images))
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+    pixel_values = pixel_values.to(vae.device, dtype=vae.dtype)
+
+    with torch.no_grad():
+        model_input = vae.encode(pixel_values).latent_dist.sample()
+    model_input = model_input * vae.config.scaling_factor
+    return {"model_input": model_input.cpu()}
+
+
+def generate_timestep_weights(args, num_timesteps):
+    weights = torch.ones(num_timesteps)
+
+    # Determine the indices to bias
+    num_to_bias = int(args.timestep_bias_portion * num_timesteps)
+
+    if args.timestep_bias_strategy == "later":
+        bias_indices = slice(-num_to_bias, None)
+    elif args.timestep_bias_strategy == "earlier":
+        bias_indices = slice(0, num_to_bias)
+    elif args.timestep_bias_strategy == "range":
+        # Out of the possible 1000 timesteps, we might want to focus on eg. 200-500.
+        range_begin = args.timestep_bias_begin
+        range_end = args.timestep_bias_end
+        if range_begin < 0:
+            raise ValueError(
+                "When using the range strategy for timestep bias, you must provide a beginning timestep greater or equal to zero."
+            )
+        if range_end > num_timesteps:
+            raise ValueError(
+                "When using the range strategy for timestep bias, you must provide an ending timestep smaller than the number of timesteps."
+            )
+        bias_indices = slice(range_begin, range_end)
+    else:  # 'none' or any other string
+        return weights
+    if args.timestep_bias_multiplier <= 0:
+        return ValueError(
+            "The parameter --timestep_bias_multiplier is not intended to be used to disable the training of specific timesteps."
+            " If it was intended to disable timestep bias, use `--timestep_bias_strategy none` instead."
+            " A timestep bias multiplier less than or equal to 0 is not allowed."
+        )
+
+    # Apply the bias
+    weights[bias_indices] *= args.timestep_bias_multiplier
+
+    # Normalize
+    weights /= weights.sum()
+
+    return weights
+
+
+# NOTE: if you're using the scheduled version, huber_c has to depend on the timesteps already
+def conditional_loss(
+    model_pred: torch.Tensor,
+    target: torch.Tensor,
+    reduction: str = "mean",
+    loss_type: str = "l2",
+    huber_c: float = 0.1,
+):
+    if loss_type == "l2":
+        loss = F.mse_loss(model_pred, target, reduction=reduction)
+    elif loss_type == "huber":
+        loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    elif loss_type == "smooth_l1":
+        loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    else:
+        raise NotImplementedError(f"Unsupported Loss Type {loss_type}")
+    return loss
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    # Check for terminal SNR in combination with SNR Gamma
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # Freeze vae and text encoders.
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    # Set unet as trainable.
+    unet.train()
+
+    # For mixed precision training we cast all non-trainable weights to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    vae.to(accelerator.device, dtype=torch.float32)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = UNet2DConditionModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+        )
+        ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel)
+                ema_unet.load_state_dict(load_model.state_dict())
+                ema_unet.to(accelerator.device)
+                del load_model
+
+            for _ in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = unet.parameters()
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
+    train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
+    train_flip = transforms.RandomHorizontalFlip(p=1.0)
+    train_transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        # image aug
+        original_sizes = []
+        all_images = []
+        crop_top_lefts = []
+        for image in images:
+            original_sizes.append((image.height, image.width))
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            crop_top_left = (y1, x1)
+            crop_top_lefts.append(crop_top_left)
+            image = train_transforms(image)
+            all_images.append(image)
+
+        examples["original_sizes"] = original_sizes
+        examples["crop_top_lefts"] = crop_top_lefts
+        examples["pixel_values"] = all_images
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    # Let's first compute all the embeddings so that we can free up the text encoders
+    # from memory. We will pre-compute the VAE encodings too.
+    text_encoders = [text_encoder_one, text_encoder_two]
+    tokenizers = [tokenizer_one, tokenizer_two]
+    compute_embeddings_fn = functools.partial(
+        encode_prompt,
+        text_encoders=text_encoders,
+        tokenizers=tokenizers,
+        proportion_empty_prompts=args.proportion_empty_prompts,
+        caption_column=args.caption_column,
+    )
+    compute_vae_encodings_fn = functools.partial(compute_vae_encodings, vae=vae)
+    with accelerator.main_process_first():
+        from datasets.fingerprint import Hasher
+
+        # fingerprint used by the cache for the other processes to load the result
+        # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401
+        new_fingerprint = Hasher.hash(args)
+        new_fingerprint_for_vae = Hasher.hash(vae_path)
+        train_dataset_with_embeddings = train_dataset.map(
+            compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint
+        )
+        train_dataset_with_vae = train_dataset.map(
+            compute_vae_encodings_fn,
+            batched=True,
+            batch_size=args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps,
+            new_fingerprint=new_fingerprint_for_vae,
+        )
+        precomputed_dataset = concatenate_datasets(
+            [train_dataset_with_embeddings, train_dataset_with_vae.remove_columns(["image", "text"])], axis=1
+        )
+        precomputed_dataset = precomputed_dataset.with_transform(preprocess_train)
+
+    del compute_vae_encodings_fn, compute_embeddings_fn, text_encoder_one, text_encoder_two
+    del text_encoders, tokenizers, vae
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    def collate_fn(examples):
+        model_input = torch.stack([torch.tensor(example["model_input"]) for example in examples])
+        original_sizes = [example["original_sizes"] for example in examples]
+        crop_top_lefts = [example["crop_top_lefts"] for example in examples]
+        prompt_embeds = torch.stack([torch.tensor(example["prompt_embeds"]) for example in examples])
+        pooled_prompt_embeds = torch.stack([torch.tensor(example["pooled_prompt_embeds"]) for example in examples])
+
+        return {
+            "model_input": model_input,
+            "prompt_embeds": prompt_embeds,
+            "pooled_prompt_embeds": pooled_prompt_embeds,
+            "original_sizes": original_sizes,
+            "crop_top_lefts": crop_top_lefts,
+        }
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        precomputed_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.use_ema:
+        ema_unet.to(accelerator.device)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("text2image-fine-tune-sdxl", config=vars(args))
+
+    # Function for unwrapping if torch.compile() was used in accelerate.
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(precomputed_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Sample noise that we'll add to the latents
+                model_input = batch["model_input"].to(accelerator.device)
+                noise = torch.randn_like(model_input)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (model_input.shape[0], model_input.shape[1], 1, 1), device=model_input.device
+                    )
+
+                bsz = model_input.shape[0]
+                if args.timestep_bias_strategy == "none":
+                    # Sample a random timestep for each image
+                    if args.loss_type == "huber" or args.loss_type == "smooth_l1":
+                        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (1,), device="cpu")
+                        timestep = timesteps.item()
+
+                        if args.huber_schedule == "exponential":
+                            alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps
+                            huber_c = math.exp(-alpha * timestep)
+                        elif args.huber_schedule == "snr":
+                            alphas_cumprod = noise_scheduler.alphas_cumprod[timestep]
+                            sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
+                            huber_c = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c
+                        elif args.huber_schedule == "constant":
+                            huber_c = args.huber_c
+                        else:
+                            raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!")
+
+                        timesteps = timesteps.repeat(bsz).to(model_input.device)
+                    elif args.loss_type == "l2":
+                        timesteps = torch.randint(
+                            0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                        )
+                        huber_c = 1  # may be anything, as it's not used
+                    else:
+                        raise NotImplementedError(f"Unknown loss type {args.loss_type}")
+
+                    timesteps = timesteps.long()
+
+                else:
+                    if "huber_scheduled" in args.loss_type:
+                        raise NotImplementedError(
+                            "Randomly weighted timesteps not implemented yet for scheduled huber loss!"
+                        )
+                    else:
+                        huber_c = args.huber_c
+                    # Sample a random timestep for each image, potentially biased by the timestep weights.
+                    # Biasing the timestep weights allows us to spend less time training irrelevant timesteps.
+                    weights = generate_timestep_weights(args, noise_scheduler.config.num_train_timesteps).to(
+                        model_input.device
+                    )
+                    timesteps = torch.multinomial(weights, bsz, replacement=True).long()
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+
+                # time ids
+                def compute_time_ids(original_size, crops_coords_top_left):
+                    # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+                    target_size = (args.resolution, args.resolution)
+                    add_time_ids = list(original_size + crops_coords_top_left + target_size)
+                    add_time_ids = torch.tensor([add_time_ids])
+                    add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+                    return add_time_ids
+
+                add_time_ids = torch.cat(
+                    [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])]
+                )
+
+                # Predict the noise residual
+                unet_added_conditions = {"time_ids": add_time_ids}
+                prompt_embeds = batch["prompt_embeds"].to(accelerator.device)
+                pooled_prompt_embeds = batch["pooled_prompt_embeds"].to(accelerator.device)
+                unet_added_conditions.update({"text_embeds": pooled_prompt_embeds})
+                model_pred = unet(
+                    noisy_model_input,
+                    timesteps,
+                    prompt_embeds,
+                    added_cond_kwargs=unet_added_conditions,
+                    return_dict=False,
+                )[0]
+
+                # Get the target for loss depending on the prediction type
+                if args.prediction_type is not None:
+                    # set prediction_type of scheduler if defined
+                    noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                elif noise_scheduler.config.prediction_type == "sample":
+                    # We set the target to latents here, but the model_pred will return the noise sample prediction.
+                    target = model_input
+                    # We will have to subtract the noise residual from the prediction to get the target sample.
+                    model_pred = model_pred - noise
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.snr_gamma is None:
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c
+                    )
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = conditional_loss(
+                        model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c
+                    )
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = unet.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_unet.step(unet.parameters())
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                if args.use_ema:
+                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                    ema_unet.store(unet.parameters())
+                    ema_unet.copy_to(unet.parameters())
+
+                # create pipeline
+                vae = AutoencoderKL.from_pretrained(
+                    vae_path,
+                    subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+                    revision=args.revision,
+                    variant=args.variant,
+                )
+                pipeline = StableDiffusionXLPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    unet=accelerator.unwrap_model(unet),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                if args.prediction_type is not None:
+                    scheduler_args = {"prediction_type": args.prediction_type}
+                    pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)
+
+                # run inference
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+                pipeline_args = {"prompt": args.validation_prompt}
+
+                with torch.cuda.amp.autocast():
+                    images = [
+                        pipeline(**pipeline_args, generator=generator, num_inference_steps=25).images[0]
+                        for _ in range(args.num_validation_images)
+                    ]
+
+                for tracker in accelerator.trackers:
+                    if tracker.name == "tensorboard":
+                        np_images = np.stack([np.asarray(img) for img in images])
+                        tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                    if tracker.name == "wandb":
+                        tracker.log(
+                            {
+                                "validation": [
+                                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                    for i, image in enumerate(images)
+                                ]
+                            }
+                        )
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+        # Serialize pipeline.
+        vae = AutoencoderKL.from_pretrained(
+            vae_path,
+            subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            unet=unet,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        if args.prediction_type is not None:
+            scheduler_args = {"prediction_type": args.prediction_type}
+            pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+        pipeline.save_pretrained(args.output_dir)
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline = pipeline.to(accelerator.device)
+            generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+            with torch.cuda.amp.autocast():
+                images = [
+                    pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+                    for _ in range(args.num_validation_images)
+                ]
+
+            for tracker in accelerator.trackers:
+                if tracker.name == "tensorboard":
+                    np_images = np.stack([np.asarray(img) for img in images])
+                    tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+                if tracker.name == "wandb":
+                    tracker.log(
+                        {
+                            "test": [
+                                wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                for i, image in enumerate(images)
+                            ]
+                        }
+                    )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id=repo_id,
+                images=images,
+                validation_prompt=args.validation_prompt,
+                base_model=args.pretrained_model_name_or_path,
+                dataset_name=args.dataset_name,
+                repo_folder=args.output_dir,
+                vae_path=args.pretrained_vae_model_name_or_path,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/sd3_lora_colab/README.md b/diffusers/examples/research_projects/sd3_lora_colab/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b7d7eedfb5dcfaee925b972fcad66b63d9ea9a12
--- /dev/null
+++ b/diffusers/examples/research_projects/sd3_lora_colab/README.md
@@ -0,0 +1,38 @@
+# Running Stable Diffusion 3 DreamBooth LoRA training under 16GB
+
+This is an **EDUCATIONAL** project that provides utilities for DreamBooth LoRA training for [Stable Diffusion 3 (SD3)](ttps://huggingface.co/papers/2403.03206) under 16GB GPU VRAM. This means you can successfully try out this project using a [free-tier Colab Notebook](https://colab.research.google.com/github/huggingface/diffusers/blob/main/examples/research_projects/sd3_lora_colab/sd3_dreambooth_lora_16gb.ipynb) instance. 🤗
+
+> [!NOTE]
+> SD3 is gated, so you need to make sure you agree to [share your contact info](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) to access the model before using it with Diffusers. Once you have access, you need to log in so your system knows you’re authorized. Use the command below to log in:
+
+```bash
+huggingface-cli login
+```
+
+This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
+
+For setup, inference code, and details on how to run the code, please follow the Colab Notebook provided above.
+
+## How
+
+We make use of several techniques to make this possible:
+
+* Compute the embeddings from the instance prompt and serialize them for later reuse. This is implemented in the [`compute_embeddings.py`](./compute_embeddings.py) script. We use an 8bit (as introduced in [`LLM.int8()`](https://arxiv.org/abs/2208.07339)) T5 to reduce memory requirements to ~10.5GB.
+* In the `train_dreambooth_sd3_lora_miniature.py` script, we make use of:
+  * 8bit Adam for optimization through the `bitsandbytes` library.
+  * Gradient checkpointing and gradient accumulation.
+  * FP16 precision.
+  * Flash attention through `F.scaled_dot_product_attention()`.
+
+Computing the text embeddings is arguably the most memory-intensive part in the pipeline as SD3 employs three text encoders. If we run them in FP32, it will take about 20GB of VRAM. With FP16, we are down to 12GB.
+
+
+## Gotchas
+
+This project is educational. It exists to showcase the possibility of fine-tuning a big diffusion system on consumer GPUs. But additional components might have to be added to obtain state-of-the-art performance. Below are some commonly known gotchas that users should be aware of:
+
+* Training of text encoders is purposefully disabled.
+* Techniques such as prior-preservation is unsupported.
+* Custom instance captions for instance images are unsupported, but this should be relatively easy to integrate.
+
+Hopefully, this project gives you a template to extend it further to suit your needs.
diff --git a/diffusers/examples/research_projects/sd3_lora_colab/compute_embeddings.py b/diffusers/examples/research_projects/sd3_lora_colab/compute_embeddings.py
new file mode 100644
index 0000000000000000000000000000000000000000..5014752ffe349863d4d90fd95445ebcebd730c24
--- /dev/null
+++ b/diffusers/examples/research_projects/sd3_lora_colab/compute_embeddings.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import glob
+import hashlib
+
+import pandas as pd
+import torch
+from transformers import T5EncoderModel
+
+from diffusers import StableDiffusion3Pipeline
+
+
+PROMPT = "a photo of sks dog"
+MAX_SEQ_LENGTH = 77
+LOCAL_DATA_DIR = "dog"
+OUTPUT_PATH = "sample_embeddings.parquet"
+
+
+def bytes_to_giga_bytes(bytes):
+    return bytes / 1024 / 1024 / 1024
+
+
+def generate_image_hash(image_path):
+    with open(image_path, "rb") as f:
+        img_data = f.read()
+    return hashlib.sha256(img_data).hexdigest()
+
+
+def load_sd3_pipeline():
+    id = "stabilityai/stable-diffusion-3-medium-diffusers"
+    text_encoder = T5EncoderModel.from_pretrained(id, subfolder="text_encoder_3", load_in_8bit=True, device_map="auto")
+    pipeline = StableDiffusion3Pipeline.from_pretrained(
+        id, text_encoder_3=text_encoder, transformer=None, vae=None, device_map="balanced"
+    )
+    return pipeline
+
+
+@torch.no_grad()
+def compute_embeddings(pipeline, prompt, max_sequence_length):
+    (
+        prompt_embeds,
+        negative_prompt_embeds,
+        pooled_prompt_embeds,
+        negative_pooled_prompt_embeds,
+    ) = pipeline.encode_prompt(prompt=prompt, prompt_2=None, prompt_3=None, max_sequence_length=max_sequence_length)
+
+    print(
+        f"{prompt_embeds.shape=}, {negative_prompt_embeds.shape=}, {pooled_prompt_embeds.shape=}, {negative_pooled_prompt_embeds.shape}"
+    )
+
+    max_memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
+    print(f"Max memory allocated: {max_memory:.3f} GB")
+    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+
+def run(args):
+    pipeline = load_sd3_pipeline()
+    prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds = compute_embeddings(
+        pipeline, args.prompt, args.max_sequence_length
+    )
+
+    # Assumes that the images within `args.local_image_dir` have a JPEG extension. Change
+    # as needed.
+    image_paths = glob.glob(f"{args.local_data_dir}/*.jpeg")
+    data = []
+    for image_path in image_paths:
+        img_hash = generate_image_hash(image_path)
+        data.append(
+            (img_hash, prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds)
+        )
+
+    # Create a DataFrame
+    embedding_cols = [
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "pooled_prompt_embeds",
+        "negative_pooled_prompt_embeds",
+    ]
+    df = pd.DataFrame(
+        data,
+        columns=["image_hash"] + embedding_cols,
+    )
+
+    # Convert embedding lists to arrays (for proper storage in parquet)
+    for col in embedding_cols:
+        df[col] = df[col].apply(lambda x: x.cpu().numpy().flatten().tolist())
+
+    # Save the dataframe to a parquet file
+    df.to_parquet(args.output_path)
+    print(f"Data successfully serialized to {args.output_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--prompt", type=str, default=PROMPT, help="The instance prompt.")
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=MAX_SEQ_LENGTH,
+        help="Maximum sequence length to use for computing the embeddings. The more the higher computational costs.",
+    )
+    parser.add_argument(
+        "--local_data_dir", type=str, default=LOCAL_DATA_DIR, help="Path to the directory containing instance images."
+    )
+    parser.add_argument("--output_path", type=str, default=OUTPUT_PATH, help="Path to serialize the parquet file.")
+    args = parser.parse_args()
+
+    run(args)
diff --git a/diffusers/examples/research_projects/sd3_lora_colab/sd3_dreambooth_lora_16gb.ipynb b/diffusers/examples/research_projects/sd3_lora_colab/sd3_dreambooth_lora_16gb.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..25fcb36a47d56214a5ed5cffbe60368d7202a041
--- /dev/null
+++ b/diffusers/examples/research_projects/sd3_lora_colab/sd3_dreambooth_lora_16gb.ipynb
@@ -0,0 +1,2428 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "a6xLZDgOajbd"
+      },
+      "source": [
+        "# Running Stable Diffusion 3 (SD3) DreamBooth LoRA training under 16GB GPU VRAM"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "0jPZpMTwafua"
+      },
+      "source": [
+        "## Install Dependencies"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "lIYdn1woOS1n",
+        "outputId": "6d4a6332-d1f5-46e2-ad2b-c9e51b9f279a"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -q -U git+https://github.com/huggingface/diffusers\n",
+        "!pip install -q -U \\\n",
+        "    transformers \\\n",
+        "    accelerate \\\n",
+        "    wandb \\\n",
+        "    bitsandbytes \\\n",
+        "    peft"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "5qUNciw6aov2"
+      },
+      "source": [
+        "As SD3 is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Bpk5FleeK1NR",
+        "outputId": "54d8e774-514e-46fe-b9a7-0185e0bcf211"
+      },
+      "outputs": [],
+      "source": [
+        "!huggingface-cli login"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "tcF7gl4FasJV"
+      },
+      "source": [
+        "## Clone `diffusers`"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "QgSOJYglJKiM",
+        "outputId": "be51f30f-8848-4a79-ae91-c4fb89c244ba"
+      },
+      "outputs": [],
+      "source": [
+        "!git clone https://github.com/huggingface/diffusers\n",
+        "%cd diffusers/examples/research_projects/sd3_lora_colab"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "X9dBawr6ayRY"
+      },
+      "source": [
+        "## Download instance data images"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 351,
+          "referenced_widgets": [
+            "8720a1f0a3b043dba02b6aab0afb861a",
+            "0e70a30146ef4b30b014179bd4bfd131",
+            "c39072b8cfff4a11ba283a9ae3155e52",
+            "1e834badd9c74b95bda30456a585fc06",
+            "d7c7c83b341b4471ad8a0ca1fe76d9ff",
+            "5ab639bd765f4824818a53ab84f690a8",
+            "cd94205b05d54e4c96c9c475e13abe83",
+            "be260274fdb04798af6fce6169646ff2",
+            "b9912757b9f9477186c171ecb2551d3a",
+            "a1f88f8e27894cdfab54cad04871f74e",
+            "19026c269dce47d585506f734fa2981a",
+            "50237341e55e4da0ba5cdbf652f30115",
+            "1d006f25b17e4cd8aaa5f66d58940dc7",
+            "4c673aa247ff4d65b82c4c64ca2e72da",
+            "92698388b667476ea1daf5cacb2fdb07",
+            "f6b3aa0f980e450289ee15cea7cb3ed7",
+            "0690a95eb8c3403e90d5b023aaadb22c",
+            "4a20ceca22724ab082d013c20c758d31",
+            "c80f3825981646a8a3e178192e338962",
+            "5673d0ca1f1247dd924874355eadecd4",
+            "7774ac850ab2451ea380bf80f3be5a86",
+            "22e57b8c83fa48489d6a327f1bbb756b",
+            "dd2debcf1c774181bef97efab0f3d6e1",
+            "633d7df9a17e4bf6951249aca83a9e96",
+            "6469e9991d7b41a0b83a7b443b9eebe5",
+            "0b9c72fa39c241ba9dd22dd67c2436fe",
+            "99e707cfe1454757aad4014230f6dae8",
+            "5a4ec2d031fa438eb4d0492329b28f00",
+            "6c0d4d6d84704f88b46a9b5cf94e3836",
+            "e1fb8dec23c04d6f8d1217242f8a495c",
+            "4b35f9d8d6444d0397a8bafcf3d73e8f",
+            "0f3279a4e6a247a7b69ff73bc06acfe0",
+            "b5ac4ab9256e4d5092ba6e449bc3cdd3",
+            "2840e90c518d4666b3f5a935c90569a7",
+            "adb012e95d7d442a820680e61e615e3c",
+            "be4fd10d940d49cf8e916904da8192ab",
+            "fd93adba791f46c1b0a25ff692426149",
+            "cdee3b61ca6a487c8ec8e7e884eb8b07",
+            "190a7fbe2b554104a6d5b2caa3b0a08e",
+            "975922b877e143edb09cdb888cb7cae8",
+            "d7365b62df59406dbd38677299cce1c8",
+            "67f0f5f1179140b4bdaa74c5583e3958",
+            "e560f25c3e334cf2a4c748981ac38da6",
+            "65173381a80b40748b7b2800fdb89151",
+            "7a4c5c0acd2d400e91da611e91ff5306",
+            "2a02c69a19a741b4a032dedbc21ad088",
+            "c6211ddb71e64f9e92c70158da2f7ef1",
+            "c219a1e791894469aa1452045b0f74b5",
+            "8e881fd3a17e4a5d95e71f6411ed8167",
+            "5350f001bf774b5fb7f3e362f912bec3",
+            "a893c93bcbc444a4931d1ddc6e342786",
+            "03047a13f06744fcac17c77cb03bca62",
+            "4d77a9c44d1c47b18022f8a51b29e20d",
+            "0b5bb94394fc447282d2c44780303f15",
+            "01bfa49325a8403b808ad1662465996e",
+            "3c0f67144f974aea85c7088f482e830d",
+            "0996e9f698dc4d6ab3c4319c96186619",
+            "9c663933ece544b193531725f4dc873d",
+            "b48dc06ca1654fe39bf8a77352a921f2",
+            "641f5a361a584cc0b71fd71d5f786958",
+            "66a952451b4c43cab54312fe886df5e6",
+            "42757924240c4abeb39add0c26687ab3",
+            "7f26ae5417cf4c80921cce830e60f72b",
+            "b77093ec9ffd40e0b2c1d9d1bdc063f5",
+            "7d8e3510c1e34524993849b8fce52758",
+            "b15011804460483ab84904a52da754b7"
+          ]
+        },
+        "id": "La1rBYWFNjEP",
+        "outputId": "e8567843-193e-4653-86b8-be26390700df"
+      },
+      "outputs": [],
+      "source": [
+        "from huggingface_hub import snapshot_download\n",
+        "\n",
+        "local_dir = \"./dog\"\n",
+        "snapshot_download(\n",
+        "    \"diffusers/dog-example\",\n",
+        "    local_dir=local_dir, repo_type=\"dataset\",\n",
+        "    ignore_patterns=\".gitattributes\",\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "hbsIzdjbOzgi"
+      },
+      "outputs": [],
+      "source": [
+        "!rm -rf dog/.huggingface"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "88sOTn2ga07q"
+      },
+      "source": [
+        "## Compute embeddings\n",
+        "\n",
+        "Here we are using the default instance prompt \"a photo of sks dog\". But you can configure this. Refer to the `compute_embeddings.py` script for details on other supported arguments."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ha6hPLpHLM8c",
+        "outputId": "82843eb0-473e-4d6b-d11d-1f79bcd1d11a"
+      },
+      "outputs": [],
+      "source": [
+        "!python compute_embeddings.py"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "10iMo-RUa_yv"
+      },
+      "source": [
+        "## Clear memory"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "-YltRmPgMuNa"
+      },
+      "outputs": [],
+      "source": [
+        "import torch\n",
+        "import gc\n",
+        "\n",
+        "\n",
+        "def flush():\n",
+        "    torch.cuda.empty_cache()\n",
+        "    gc.collect()\n",
+        "\n",
+        "flush()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "UO5oEtOJbBS9"
+      },
+      "source": [
+        "## Train!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "HuJ6hdm2M4Aw",
+        "outputId": "0b2d8ca3-c65f-4bb4-af9a-809b77116510"
+      },
+      "outputs": [],
+      "source": [
+        "!accelerate launch train_dreambooth_lora_sd3_miniature.py \\\n",
+        "  --pretrained_model_name_or_path=\"stabilityai/stable-diffusion-3-medium-diffusers\"  \\\n",
+        "  --instance_data_dir=\"dog\" \\\n",
+        "  --data_df_path=\"sample_embeddings.parquet\" \\\n",
+        "  --output_dir=\"trained-sd3-lora-miniature\" \\\n",
+        "  --mixed_precision=\"fp16\" \\\n",
+        "  --instance_prompt=\"a photo of sks dog\" \\\n",
+        "  --resolution=1024 \\\n",
+        "  --train_batch_size=1 \\\n",
+        "  --gradient_accumulation_steps=4 --gradient_checkpointing \\\n",
+        "  --use_8bit_adam \\\n",
+        "  --learning_rate=1e-4 \\\n",
+        "  --report_to=\"wandb\" \\\n",
+        "  --lr_scheduler=\"constant\" \\\n",
+        "  --lr_warmup_steps=0 \\\n",
+        "  --max_train_steps=500 \\\n",
+        "  --seed=\"0\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "itS-dsJ0gjy3"
+      },
+      "source": [
+        "Training will take about an hour to complete depending on the length of your dataset."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "BpOuL7S1bI6j"
+      },
+      "source": [
+        "## Inference"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "clfMv4jKfQzb"
+      },
+      "outputs": [],
+      "source": [
+        "flush()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "np03SXHkbKpG"
+      },
+      "outputs": [],
+      "source": [
+        "from diffusers import DiffusionPipeline\n",
+        "import torch\n",
+        "\n",
+        "pipeline = DiffusionPipeline.from_pretrained(\n",
+        "    \"stabilityai/stable-diffusion-3-medium-diffusers\",\n",
+        "    torch_dtype=torch.float16\n",
+        ")\n",
+        "lora_output_path = \"trained-sd3-lora-miniature\"\n",
+        "pipeline.load_lora_weights(\"trained-sd3-lora-miniature\")\n",
+        "\n",
+        "pipeline.enable_sequential_cpu_offload()\n",
+        "\n",
+        "image = pipeline(\"a photo of sks dog in a bucket\").images[0]\n",
+        "image.save(\"bucket_dog.png\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "HDfrY2opjGjD"
+      },
+      "source": [
+        "Note that inference will be very slow in this case because we're loading and unloading individual components of the models and that introduces significant data movement overhead. Refer to [this resource](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more memory optimization related techniques."
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "widgets": {
+      "application/vnd.jupyter.widget-state+json": {
+        "01bfa49325a8403b808ad1662465996e": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "03047a13f06744fcac17c77cb03bca62": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "0690a95eb8c3403e90d5b023aaadb22c": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "0996e9f698dc4d6ab3c4319c96186619": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_66a952451b4c43cab54312fe886df5e6",
+            "placeholder": "​",
+            "style": "IPY_MODEL_42757924240c4abeb39add0c26687ab3",
+            "value": "alvan-nee-bQaAJCbNq3g-unsplash.jpeg: 100%"
+          }
+        },
+        "0b5bb94394fc447282d2c44780303f15": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "0b9c72fa39c241ba9dd22dd67c2436fe": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_0f3279a4e6a247a7b69ff73bc06acfe0",
+            "placeholder": "​",
+            "style": "IPY_MODEL_b5ac4ab9256e4d5092ba6e449bc3cdd3",
+            "value": " 1.16M/1.16M [00:00&lt;00:00, 10.3MB/s]"
+          }
+        },
+        "0e70a30146ef4b30b014179bd4bfd131": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_5ab639bd765f4824818a53ab84f690a8",
+            "placeholder": "​",
+            "style": "IPY_MODEL_cd94205b05d54e4c96c9c475e13abe83",
+            "value": "Fetching 5 files: 100%"
+          }
+        },
+        "0f3279a4e6a247a7b69ff73bc06acfe0": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "19026c269dce47d585506f734fa2981a": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "190a7fbe2b554104a6d5b2caa3b0a08e": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "1d006f25b17e4cd8aaa5f66d58940dc7": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_0690a95eb8c3403e90d5b023aaadb22c",
+            "placeholder": "​",
+            "style": "IPY_MODEL_4a20ceca22724ab082d013c20c758d31",
+            "value": "alvan-nee-9M0tSjb-cpA-unsplash.jpeg: 100%"
+          }
+        },
+        "1e834badd9c74b95bda30456a585fc06": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_a1f88f8e27894cdfab54cad04871f74e",
+            "placeholder": "​",
+            "style": "IPY_MODEL_19026c269dce47d585506f734fa2981a",
+            "value": " 5/5 [00:01&lt;00:00,  3.44it/s]"
+          }
+        },
+        "22e57b8c83fa48489d6a327f1bbb756b": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "2840e90c518d4666b3f5a935c90569a7": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_adb012e95d7d442a820680e61e615e3c",
+              "IPY_MODEL_be4fd10d940d49cf8e916904da8192ab",
+              "IPY_MODEL_fd93adba791f46c1b0a25ff692426149"
+            ],
+            "layout": "IPY_MODEL_cdee3b61ca6a487c8ec8e7e884eb8b07"
+          }
+        },
+        "2a02c69a19a741b4a032dedbc21ad088": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_5350f001bf774b5fb7f3e362f912bec3",
+            "placeholder": "​",
+            "style": "IPY_MODEL_a893c93bcbc444a4931d1ddc6e342786",
+            "value": "alvan-nee-eoqnr8ikwFE-unsplash.jpeg: 100%"
+          }
+        },
+        "3c0f67144f974aea85c7088f482e830d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_0996e9f698dc4d6ab3c4319c96186619",
+              "IPY_MODEL_9c663933ece544b193531725f4dc873d",
+              "IPY_MODEL_b48dc06ca1654fe39bf8a77352a921f2"
+            ],
+            "layout": "IPY_MODEL_641f5a361a584cc0b71fd71d5f786958"
+          }
+        },
+        "42757924240c4abeb39add0c26687ab3": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "4a20ceca22724ab082d013c20c758d31": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "4b35f9d8d6444d0397a8bafcf3d73e8f": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "4c673aa247ff4d65b82c4c64ca2e72da": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_c80f3825981646a8a3e178192e338962",
+            "max": 677407,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_5673d0ca1f1247dd924874355eadecd4",
+            "value": 677407
+          }
+        },
+        "4d77a9c44d1c47b18022f8a51b29e20d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "50237341e55e4da0ba5cdbf652f30115": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_1d006f25b17e4cd8aaa5f66d58940dc7",
+              "IPY_MODEL_4c673aa247ff4d65b82c4c64ca2e72da",
+              "IPY_MODEL_92698388b667476ea1daf5cacb2fdb07"
+            ],
+            "layout": "IPY_MODEL_f6b3aa0f980e450289ee15cea7cb3ed7"
+          }
+        },
+        "5350f001bf774b5fb7f3e362f912bec3": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "5673d0ca1f1247dd924874355eadecd4": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "5a4ec2d031fa438eb4d0492329b28f00": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "5ab639bd765f4824818a53ab84f690a8": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "633d7df9a17e4bf6951249aca83a9e96": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_5a4ec2d031fa438eb4d0492329b28f00",
+            "placeholder": "​",
+            "style": "IPY_MODEL_6c0d4d6d84704f88b46a9b5cf94e3836",
+            "value": "alvan-nee-Id1DBHv4fbg-unsplash.jpeg: 100%"
+          }
+        },
+        "641f5a361a584cc0b71fd71d5f786958": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "6469e9991d7b41a0b83a7b443b9eebe5": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_e1fb8dec23c04d6f8d1217242f8a495c",
+            "max": 1163467,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_4b35f9d8d6444d0397a8bafcf3d73e8f",
+            "value": 1163467
+          }
+        },
+        "65173381a80b40748b7b2800fdb89151": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "66a952451b4c43cab54312fe886df5e6": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "67f0f5f1179140b4bdaa74c5583e3958": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "6c0d4d6d84704f88b46a9b5cf94e3836": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "7774ac850ab2451ea380bf80f3be5a86": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "7a4c5c0acd2d400e91da611e91ff5306": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_2a02c69a19a741b4a032dedbc21ad088",
+              "IPY_MODEL_c6211ddb71e64f9e92c70158da2f7ef1",
+              "IPY_MODEL_c219a1e791894469aa1452045b0f74b5"
+            ],
+            "layout": "IPY_MODEL_8e881fd3a17e4a5d95e71f6411ed8167"
+          }
+        },
+        "7d8e3510c1e34524993849b8fce52758": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "7f26ae5417cf4c80921cce830e60f72b": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "8720a1f0a3b043dba02b6aab0afb861a": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_0e70a30146ef4b30b014179bd4bfd131",
+              "IPY_MODEL_c39072b8cfff4a11ba283a9ae3155e52",
+              "IPY_MODEL_1e834badd9c74b95bda30456a585fc06"
+            ],
+            "layout": "IPY_MODEL_d7c7c83b341b4471ad8a0ca1fe76d9ff"
+          }
+        },
+        "8e881fd3a17e4a5d95e71f6411ed8167": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "92698388b667476ea1daf5cacb2fdb07": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_7774ac850ab2451ea380bf80f3be5a86",
+            "placeholder": "​",
+            "style": "IPY_MODEL_22e57b8c83fa48489d6a327f1bbb756b",
+            "value": " 677k/677k [00:00&lt;00:00, 9.50MB/s]"
+          }
+        },
+        "975922b877e143edb09cdb888cb7cae8": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "99e707cfe1454757aad4014230f6dae8": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "9c663933ece544b193531725f4dc873d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_7f26ae5417cf4c80921cce830e60f72b",
+            "max": 1396297,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_b77093ec9ffd40e0b2c1d9d1bdc063f5",
+            "value": 1396297
+          }
+        },
+        "a1f88f8e27894cdfab54cad04871f74e": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "a893c93bcbc444a4931d1ddc6e342786": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "adb012e95d7d442a820680e61e615e3c": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_190a7fbe2b554104a6d5b2caa3b0a08e",
+            "placeholder": "​",
+            "style": "IPY_MODEL_975922b877e143edb09cdb888cb7cae8",
+            "value": "alvan-nee-brFsZ7qszSY-unsplash.jpeg: 100%"
+          }
+        },
+        "b15011804460483ab84904a52da754b7": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "b48dc06ca1654fe39bf8a77352a921f2": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_7d8e3510c1e34524993849b8fce52758",
+            "placeholder": "​",
+            "style": "IPY_MODEL_b15011804460483ab84904a52da754b7",
+            "value": " 1.40M/1.40M [00:00&lt;00:00, 10.5MB/s]"
+          }
+        },
+        "b5ac4ab9256e4d5092ba6e449bc3cdd3": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "b77093ec9ffd40e0b2c1d9d1bdc063f5": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "b9912757b9f9477186c171ecb2551d3a": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "be260274fdb04798af6fce6169646ff2": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "be4fd10d940d49cf8e916904da8192ab": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_d7365b62df59406dbd38677299cce1c8",
+            "max": 1186464,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_67f0f5f1179140b4bdaa74c5583e3958",
+            "value": 1186464
+          }
+        },
+        "c219a1e791894469aa1452045b0f74b5": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_0b5bb94394fc447282d2c44780303f15",
+            "placeholder": "​",
+            "style": "IPY_MODEL_01bfa49325a8403b808ad1662465996e",
+            "value": " 1.17M/1.17M [00:00&lt;00:00, 15.7MB/s]"
+          }
+        },
+        "c39072b8cfff4a11ba283a9ae3155e52": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_be260274fdb04798af6fce6169646ff2",
+            "max": 5,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_b9912757b9f9477186c171ecb2551d3a",
+            "value": 5
+          }
+        },
+        "c6211ddb71e64f9e92c70158da2f7ef1": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_03047a13f06744fcac17c77cb03bca62",
+            "max": 1167042,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_4d77a9c44d1c47b18022f8a51b29e20d",
+            "value": 1167042
+          }
+        },
+        "c80f3825981646a8a3e178192e338962": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "cd94205b05d54e4c96c9c475e13abe83": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "cdee3b61ca6a487c8ec8e7e884eb8b07": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "d7365b62df59406dbd38677299cce1c8": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "d7c7c83b341b4471ad8a0ca1fe76d9ff": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "dd2debcf1c774181bef97efab0f3d6e1": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_633d7df9a17e4bf6951249aca83a9e96",
+              "IPY_MODEL_6469e9991d7b41a0b83a7b443b9eebe5",
+              "IPY_MODEL_0b9c72fa39c241ba9dd22dd67c2436fe"
+            ],
+            "layout": "IPY_MODEL_99e707cfe1454757aad4014230f6dae8"
+          }
+        },
+        "e1fb8dec23c04d6f8d1217242f8a495c": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "e560f25c3e334cf2a4c748981ac38da6": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "f6b3aa0f980e450289ee15cea7cb3ed7": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "fd93adba791f46c1b0a25ff692426149": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_e560f25c3e334cf2a4c748981ac38da6",
+            "placeholder": "​",
+            "style": "IPY_MODEL_65173381a80b40748b7b2800fdb89151",
+            "value": " 1.19M/1.19M [00:00&lt;00:00, 12.7MB/s]"
+          }
+        }
+      }
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/diffusers/examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py b/diffusers/examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py
new file mode 100644
index 0000000000000000000000000000000000000000..163ff8f08931ab3eaffa9e23049c8060d3455507
--- /dev/null
+++ b/diffusers/examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py
@@ -0,0 +1,1147 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import copy
+import gc
+import hashlib
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+    StableDiffusion3Pipeline,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import (
+    cast_training_params,
+    compute_density_for_timestep_sampling,
+    compute_loss_weighting_for_sd3,
+)
+from diffusers.utils import (
+    check_min_version,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.30.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images=None,
+    base_model: str = None,
+    train_text_encoder=False,
+    instance_prompt=None,
+    validation_prompt=None,
+    repo_folder=None,
+):
+    widget_dict = []
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            widget_dict.append(
+                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
+            )
+
+    model_description = f"""
+# SD3 DreamBooth LoRA - {repo_id}
+
+<Gallery />
+
+## Model description
+
+These are {repo_id} DreamBooth weights for {base_model}.
+
+The weights were trained  using [DreamBooth](https://dreambooth.github.io/).
+
+LoRA for the text encoder was enabled: {train_text_encoder}.
+
+## Trigger words
+
+You should use {instance_prompt} to trigger the image generation.
+
+## Download model
+
+[Download]({repo_id}/tree/main) them in the Files & versions tab.
+
+## License
+
+Please adhere to the licensing terms as described [here](https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE).
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="openrail++",
+        base_model=base_model,
+        prompt=instance_prompt,
+        model_description=model_description,
+        widget=widget_dict,
+    )
+    tags = [
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+        "lora",
+        "sd3",
+        "sd3-diffusers",
+        "template:sd-lora",
+    ]
+
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline.enable_model_cpu_offload()
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    # autocast_ctx = torch.autocast(accelerator.device.type) if not is_final_validation else nullcontext()
+    autocast_ctx = nullcontext()
+
+    with autocast_ctx:
+        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    return images
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help=("A folder containing the training data. "),
+    )
+    parser.add_argument(
+        "--data_df_path",
+        type=str,
+        default=None,
+        help=("Path to the parquet file serialized with compute_embeddings.py."),
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=77,
+        help="Maximum sequence length to use with with the T5 text encoder",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd3-dreambooth-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="logit_normal",
+        choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"],
+    )
+    parser.add_argument(
+        "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
+    )
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="AdamW",
+        help=('The optimizer type to use. Choose between ["AdamW"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer.",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.instance_data_dir is None:
+        raise ValueError("Specify `instance_data_dir`.")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        data_df_path,
+        instance_data_root,
+        instance_prompt,
+        size=1024,
+        center_crop=False,
+    ):
+        # Logistics
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+
+        # Load images.
+        instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+        image_hashes = [self.generate_image_hash(path) for path in list(Path(instance_data_root).iterdir())]
+        self.instance_images = instance_images
+        self.image_hashes = image_hashes
+
+        # Image transformations
+        self.pixel_values = self.apply_image_transformations(
+            instance_images=instance_images, size=size, center_crop=center_crop
+        )
+
+        # Map hashes to embeddings.
+        self.data_dict = self.map_image_hash_embedding(data_df_path=data_df_path)
+
+        self.num_instance_images = len(instance_images)
+        self._length = self.num_instance_images
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = self.pixel_values[index % self.num_instance_images]
+        image_hash = self.image_hashes[index % self.num_instance_images]
+        prompt_embeds, pooled_prompt_embeds = self.data_dict[image_hash]
+        example["instance_images"] = instance_image
+        example["prompt_embeds"] = prompt_embeds
+        example["pooled_prompt_embeds"] = pooled_prompt_embeds
+        return example
+
+    def apply_image_transformations(self, instance_images, size, center_crop):
+        pixel_values = []
+
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
+        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
+        train_flip = transforms.RandomHorizontalFlip(p=1.0)
+        train_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+        for image in instance_images:
+            image = exif_transpose(image)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            image = train_transforms(image)
+            pixel_values.append(image)
+
+        return pixel_values
+
+    def convert_to_torch_tensor(self, embeddings: list):
+        prompt_embeds = embeddings[0]
+        pooled_prompt_embeds = embeddings[1]
+        prompt_embeds = np.array(prompt_embeds).reshape(154, 4096)
+        pooled_prompt_embeds = np.array(pooled_prompt_embeds).reshape(2048)
+        return torch.from_numpy(prompt_embeds), torch.from_numpy(pooled_prompt_embeds)
+
+    def map_image_hash_embedding(self, data_df_path):
+        hashes_df = pd.read_parquet(data_df_path)
+        data_dict = {}
+        for i, row in hashes_df.iterrows():
+            embeddings = [row["prompt_embeds"], row["pooled_prompt_embeds"]]
+            prompt_embeds, pooled_prompt_embeds = self.convert_to_torch_tensor(embeddings=embeddings)
+            data_dict.update({row["image_hash"]: (prompt_embeds, pooled_prompt_embeds)})
+        return data_dict
+
+    def generate_image_hash(self, image_path):
+        with open(image_path, "rb") as f:
+            img_data = f.read()
+        return hashlib.sha256(img_data).hexdigest()
+
+
+def collate_fn(examples):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompt_embeds = [example["prompt_embeds"] for example in examples]
+    pooled_prompt_embeds = [example["pooled_prompt_embeds"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+    prompt_embeds = torch.stack(prompt_embeds)
+    pooled_prompt_embeds = torch.stack(pooled_prompt_embeds)
+
+    batch = {
+        "pixel_values": pixel_values,
+        "prompt_embeds": prompt_embeds,
+        "pooled_prompt_embeds": pooled_prompt_embeds,
+    }
+    return batch
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+            ).repo_id
+
+    # Load scheduler and models
+    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="scheduler"
+    )
+    noise_scheduler_copy = copy.deepcopy(noise_scheduler)
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    transformer = SD3Transformer2DModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant
+    )
+
+    transformer.requires_grad_(False)
+    vae.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora transformer) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    vae.to(accelerator.device, dtype=torch.float32)
+    transformer.to(accelerator.device, dtype=weight_dtype)
+
+    if args.gradient_checkpointing:
+        transformer.enable_gradient_checkpointing()
+
+    # now we will add new LoRA weights to the attention layers
+    transformer_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+    transformer.add_adapter(transformer_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            transformer_lora_layers_to_save = None
+            for model in models:
+                if isinstance(model, type(unwrap_model(transformer))):
+                    transformer_lora_layers_to_save = get_peft_model_state_dict(model)
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusion3Pipeline.save_lora_weights(
+                output_dir,
+                transformer_lora_layers=transformer_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        transformer_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(transformer))):
+                transformer_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict = StableDiffusion3Pipeline.lora_state_dict(input_dir)
+
+        transformer_state_dict = {
+            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")
+        }
+        transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
+        incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [transformer_]
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params(models)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [transformer]
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(models, dtype=torch.float32)
+
+    # Optimization parameters
+    transformer_lora_parameters = list(filter(lambda p: p.requires_grad, transformer.parameters()))
+    transformer_parameters_with_lr = {"params": transformer_lora_parameters, "lr": args.learning_rate}
+    params_to_optimize = [transformer_parameters_with_lr]
+
+    # Optimizer creation
+    if not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}. Supported optimizers include [adamW]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        data_df_path=args.data_df_path,
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        size=args.resolution,
+        center_crop=args.center_crop,
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        transformer, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = "dreambooth-sd3-lora-miniature"
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        transformer.train()
+
+        for step, batch in enumerate(train_dataloader):
+            models_to_accumulate = [transformer]
+            with accelerator.accumulate(models_to_accumulate):
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+
+                # Convert images to latent space
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+                model_input = model_input * vae.config.scaling_factor
+                model_input = model_input.to(dtype=weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+
+                # Sample a random timestep for each image
+                # for weighting schemes where we sample timesteps non-uniformly
+                u = compute_density_for_timestep_sampling(
+                    weighting_scheme=args.weighting_scheme,
+                    batch_size=bsz,
+                    logit_mean=args.logit_mean,
+                    logit_std=args.logit_std,
+                    mode_scale=args.mode_scale,
+                )
+                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
+                timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+
+                # Add noise according to flow matching.
+                sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
+                noisy_model_input = sigmas * noise + (1.0 - sigmas) * model_input
+
+                # Predict the noise residual
+                prompt_embeds, pooled_prompt_embeds = batch["prompt_embeds"], batch["pooled_prompt_embeds"]
+                prompt_embeds = prompt_embeds.to(device=accelerator.device, dtype=weight_dtype)
+                pooled_prompt_embeds = pooled_prompt_embeds.to(device=accelerator.device, dtype=weight_dtype)
+                model_pred = transformer(
+                    hidden_states=noisy_model_input,
+                    timestep=timesteps,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    return_dict=False,
+                )[0]
+
+                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                # Preconditioning of the model outputs.
+                model_pred = model_pred * (-sigmas) + noisy_model_input
+
+                # these weighting schemes use a uniform timestep sampling
+                # and instead post-weight the loss
+                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+
+                # flow matching loss
+                target = model_input
+
+                # Compute regular loss.
+                loss = torch.mean(
+                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
+                )
+                loss = loss.mean()
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = transformer_lora_parameters
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                pipeline = StableDiffusion3Pipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    transformer=accelerator.unwrap_model(transformer),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                pipeline_args = {"prompt": args.validation_prompt}
+                images = log_validation(
+                    pipeline=pipeline,
+                    args=args,
+                    accelerator=accelerator,
+                    pipeline_args=pipeline_args,
+                    epoch=epoch,
+                )
+                torch.cuda.empty_cache()
+                gc.collect()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        transformer = unwrap_model(transformer)
+        transformer = transformer.to(torch.float32)
+        transformer_lora_layers = get_peft_model_state_dict(transformer)
+
+        StableDiffusion3Pipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            transformer_lora_layers=transformer_lora_layers,
+        )
+
+        # Final inference
+        # Load previous pipeline
+        pipeline = StableDiffusion3Pipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        # load attention processors
+        pipeline.load_lora_weights(args.output_dir)
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline_args = {"prompt": args.validation_prompt}
+            images = log_validation(
+                pipeline=pipeline,
+                args=args,
+                accelerator=accelerator,
+                pipeline_args=pipeline_args,
+                epoch=epoch,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                instance_prompt=args.instance_prompt,
+                validation_prompt=args.validation_prompt,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/research_projects/sdxl_flax/README.md b/diffusers/examples/research_projects/sdxl_flax/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dfbe90e63bde910eb40535e655c9b9845a89da4c
--- /dev/null
+++ b/diffusers/examples/research_projects/sdxl_flax/README.md
@@ -0,0 +1,243 @@
+# Stable Diffusion XL for JAX + TPUv5e
+
+[TPU v5e](https://cloud.google.com/blog/products/compute/how-cloud-tpu-v5e-accelerates-large-scale-ai-inference) is a new generation of TPUs from Google Cloud. It is the most cost-effective, versatile, and scalable Cloud TPU to date. This makes them ideal for serving and scaling large diffusion models.
+
+[JAX](https://github.com/google/jax) is a high-performance numerical computation library that is well-suited to develop and deploy diffusion models:
+
+- **High performance**. All JAX operations are implemented in terms of operations in [XLA](https://www.tensorflow.org/xla/) - the Accelerated Linear Algebra compiler
+
+- **Compilation**. JAX uses just-in-time (jit) compilation of JAX Python functions so it can be executed efficiently in XLA. In order to get the best performance, we must use static shapes for jitted functions, this is because JAX transforms work by tracing a function and to determine its effect on inputs of a specific shape and type. When a new shape is introduced to an already compiled function, it retriggers compilation on the new shape, which can greatly reduce performance. **Note**: JIT compilation is particularly well-suited for text-to-image generation because all inputs and outputs (image input / output sizes) are static.
+
+- **Parallelization**. Workloads can be scaled across multiple devices using JAX's [pmap](https://jax.readthedocs.io/en/latest/_autosummary/jax.pmap.html), which expresses single-program multiple-data (SPMD) programs. Applying pmap to a function will compile a function with XLA, then execute in parallel on XLA devices. For text-to-image generation workloads this means that increasing the number of images rendered simultaneously is straightforward to implement and doesn't compromise performance.
+
+👉 Try it out for yourself:
+
+[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/google/sdxl)
+
+## Stable Diffusion XL pipeline in JAX
+
+Upon having access to a TPU VM (TPUs higher than version 3), you should first install
+a TPU-compatible version of JAX:
+```sh
+pip install jax[tpu] -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+```
+
+Next, we can install [flax](https://github.com/google/flax) and the diffusers library:
+
+```sh
+pip install flax diffusers transformers
+```
+
+In [sdxl_single.py](./sdxl_single.py) we give a simple example of how to write a text-to-image generation pipeline in JAX using [StabilityAI's Stable Diffusion XL](stabilityai/stable-diffusion-xl-base-1.0).
+
+Let's explain it step-by-step:
+
+**Imports and Setup**
+
+```python
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.jax_utils import replicate
+from diffusers import FlaxStableDiffusionXLPipeline
+
+from jax.experimental.compilation_cache import compilation_cache as cc
+cc.initialize_cache("/tmp/sdxl_cache")
+import time
+
+NUM_DEVICES = jax.device_count()
+```
+
+First, we import the necessary libraries:
+- `jax` is provides the primitives for TPU operations
+- `flax.jax_utils` contains some useful utility functions for `Flax`, a neural network library built on top of JAX
+- `diffusers` has all the code that is relevant for SDXL.
+- We also initialize a cache to speed up the JAX model compilation.
+- We automatically determine the number of available TPU devices.
+
+**1. Downloading Model and Loading Pipeline**
+
+```python
+pipeline, params = FlaxStableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", revision="refs/pr/95", split_head_dim=True
+)
+```
+Here, a pre-trained model `stable-diffusion-xl-base-1.0` from the namespace `stabilityai` is loaded. It returns a pipeline for inference and its parameters.
+
+**2. Casting Parameter Types**
+
+```python
+scheduler_state = params.pop("scheduler")
+params = jax.tree_util.tree_map(lambda x: x.astype(jnp.bfloat16), params)
+params["scheduler"] = scheduler_state
+```
+This section adjusts the data types of the model parameters.
+We convert all parameters to `bfloat16` to speed-up the computation with model weights.
+**Note** that the scheduler parameters are **not** converted to `blfoat16` as the loss
+in precision is degrading the pipeline's performance too significantly.
+
+**3. Define Inputs to Pipeline**
+
+```python
+default_prompt = ...
+default_neg_prompt = ...
+default_seed = 33
+default_guidance_scale = 5.0
+default_num_steps = 25
+```
+Here, various default inputs for the pipeline are set, including the prompt, negative prompt, random seed, guidance scale, and the number of inference steps.
+
+**4. Tokenizing Inputs**
+
+```python
+def tokenize_prompt(prompt, neg_prompt):
+    prompt_ids = pipeline.prepare_inputs(prompt)
+    neg_prompt_ids = pipeline.prepare_inputs(neg_prompt)
+    return prompt_ids, neg_prompt_ids
+```
+This function tokenizes the given prompts. It's essential because the text encoders of SDXL don't understand raw text; they work with numbers. Tokenization converts text to numbers.
+
+**5. Parallelization and Replication**
+
+```python
+p_params = replicate(params)
+
+def replicate_all(prompt_ids, neg_prompt_ids, seed):
+    ...
+```
+To utilize JAX's parallel capabilities, the parameters and input tensors are duplicated across devices. The `replicate_all` function also ensures that every device produces a different image by creating a unique random seed for each device.
+
+**6. Putting Everything Together**
+
+```python
+def generate(...):
+    ...
+```
+This function integrates all the steps to produce the desired outputs from the model. It takes in prompts, tokenizes them, replicates them across devices, runs them through the pipeline, and converts the images to a format that's more interpretable (PIL format).
+
+**7. Compilation Step**
+
+```python
+start = time.time()
+print(f"Compiling ...")
+generate(default_prompt, default_neg_prompt)
+print(f"Compiled in {time.time() - start}")
+```
+The initial run of the `generate` function will be slow because JAX compiles the function during this call. By running it once here, subsequent calls will be much faster. This section measures and prints the compilation time.
+
+**8. Fast Inference**
+
+```python
+start = time.time()
+prompt = ...
+neg_prompt = ...
+images = generate(prompt, neg_prompt)
+print(f"Inference in {time.time() - start}")
+```
+Now that the function is compiled, this section shows how to use it for fast inference. It measures and prints the inference time.
+
+In summary, the code demonstrates how to load a pre-trained model using Flax and JAX, prepare it for inference, and run it efficiently using JAX's capabilities.
+
+## Ahead of Time (AOT) Compilation
+
+FlaxStableDiffusionXLPipeline takes care of parallelization across multiple devices using jit. Now let's build parallelization ourselves.
+
+For this we will be using a JAX feature called [Ahead of Time](https://jax.readthedocs.io/en/latest/aot.html) (AOT) lowering and compilation. AOT allows to fully compile prior to execution time and have control over different parts of the compilation process.
+
+In [sdxl_single_aot.py](./sdxl_single_aot.py) we give a simple example of how to write our own parallelization logic for text-to-image generation pipeline in JAX using [StabilityAI's Stable Diffusion XL](stabilityai/stable-diffusion-xl-base-1.0)
+
+We add a `aot_compile` function that compiles the `pipeline._generate` function
+telling JAX which input arguments are static, that is, arguments that
+are known at compile time and won't change. In our case, it is num_inference_steps,
+height, width and return_latents.
+
+Once the function is compiled, these parameters are omitted from future calls and
+cannot be changed without modifying the code and recompiling.
+
+```python
+def aot_compile(
+        prompt=default_prompt,
+        negative_prompt=default_neg_prompt,
+        seed=default_seed,
+        guidance_scale=default_guidance_scale,
+        num_inference_steps=default_num_steps
+):
+    prompt_ids, neg_prompt_ids = tokenize_prompt(prompt, negative_prompt)
+    prompt_ids, neg_prompt_ids, rng = replicate_all(prompt_ids, neg_prompt_ids, seed)
+    g = jnp.array([guidance_scale] * prompt_ids.shape[0], dtype=jnp.float32)
+    g = g[:, None]
+
+    return pmap(
+        pipeline._generate,static_broadcasted_argnums=[3, 4, 5, 9]
+        ).lower(
+            prompt_ids,
+            p_params,
+            rng,
+            num_inference_steps, # num_inference_steps
+            height, # height
+            width, # width
+            g,
+            None,
+            neg_prompt_ids,
+            False # return_latents
+            ).compile()
+````
+
+Next we can compile the generate function by executing `aot_compile`.
+
+```python
+start = time.time()
+print("Compiling ...")
+p_generate = aot_compile()
+print(f"Compiled in {time.time() - start}")
+```
+And again we put everything together in a `generate` function.
+
+```python
+def generate(
+    prompt,
+    negative_prompt,
+    seed=default_seed,
+    guidance_scale=default_guidance_scale
+):
+    prompt_ids, neg_prompt_ids = tokenize_prompt(prompt, negative_prompt)
+    prompt_ids, neg_prompt_ids, rng = replicate_all(prompt_ids, neg_prompt_ids, seed)
+    g = jnp.array([guidance_scale] * prompt_ids.shape[0], dtype=jnp.float32)
+    g = g[:, None]
+    images = p_generate(
+        prompt_ids,
+        p_params,
+        rng,
+        g,
+        None,
+        neg_prompt_ids)
+
+    # convert the images to PIL
+    images = images.reshape((images.shape[0] * images.shape[1], ) + images.shape[-3:])
+    return pipeline.numpy_to_pil(np.array(images))
+```
+
+The first forward pass after AOT compilation still takes a while longer than
+subsequent passes, this is because on the first pass, JAX uses Python dispatch, which
+Fills the C++ dispatch cache.
+When using jit, this extra step is done automatically, but when using AOT compilation,
+it doesn't happen until the function call is made.
+
+```python
+start = time.time()
+prompt = "photo of a rhino dressed suit and tie sitting at a table in a bar with a bar stools, award winning photography, Elke vogelsang"
+neg_prompt = "cartoon, illustration, animation. face. male, female"
+images = generate(prompt, neg_prompt)
+print(f"First inference in {time.time() - start}")
+```
+
+From this point forward, any calls to generate should result in a faster inference
+time and it won't change.
+
+```python
+start = time.time()
+prompt = "photo of a rhino dressed suit and tie sitting at a table in a bar with a bar stools, award winning photography, Elke vogelsang"
+neg_prompt = "cartoon, illustration, animation. face. male, female"
+images = generate(prompt, neg_prompt)
+print(f"Inference in {time.time() - start}")
+```
diff --git a/diffusers/examples/research_projects/sdxl_flax/sdxl_single.py b/diffusers/examples/research_projects/sdxl_flax/sdxl_single.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b9b862d99b5c2d583d28395ccb2191e50bef7a4
--- /dev/null
+++ b/diffusers/examples/research_projects/sdxl_flax/sdxl_single.py
@@ -0,0 +1,106 @@
+# Show best practices for SDXL JAX
+import time
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.jax_utils import replicate
+
+# Let's cache the model compilation, so that it doesn't take as long the next time around.
+from jax.experimental.compilation_cache import compilation_cache as cc
+
+from diffusers import FlaxStableDiffusionXLPipeline
+
+
+cc.initialize_cache("/tmp/sdxl_cache")
+
+
+NUM_DEVICES = jax.device_count()
+
+# 1. Let's start by downloading the model and loading it into our pipeline class
+# Adhering to JAX's functional approach, the model's parameters are returned seperatetely and
+# will have to be passed to the pipeline during inference
+pipeline, params = FlaxStableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", revision="refs/pr/95", split_head_dim=True
+)
+
+# 2. We cast all parameters to bfloat16 EXCEPT the scheduler which we leave in
+# float32 to keep maximal precision
+scheduler_state = params.pop("scheduler")
+params = jax.tree_util.tree_map(lambda x: x.astype(jnp.bfloat16), params)
+params["scheduler"] = scheduler_state
+
+# 3. Next, we define the different inputs to the pipeline
+default_prompt = "a colorful photo of a castle in the middle of a forest with trees and bushes, by Ismail Inceoglu, shadows, high contrast, dynamic shading, hdr, detailed vegetation, digital painting, digital drawing, detailed painting, a detailed digital painting, gothic art, featured on deviantart"
+default_neg_prompt = "fog, grainy, purple"
+default_seed = 33
+default_guidance_scale = 5.0
+default_num_steps = 25
+
+
+# 4. In order to be able to compile the pipeline
+# all inputs have to be tensors or strings
+# Let's tokenize the prompt and negative prompt
+def tokenize_prompt(prompt, neg_prompt):
+    prompt_ids = pipeline.prepare_inputs(prompt)
+    neg_prompt_ids = pipeline.prepare_inputs(neg_prompt)
+    return prompt_ids, neg_prompt_ids
+
+
+# 5. To make full use of JAX's parallelization capabilities
+# the parameters and input tensors are duplicated across devices
+# To make sure every device generates a different image, we create
+# different seeds for each image. The model parameters won't change
+# during inference so we do not wrap them into a function
+p_params = replicate(params)
+
+
+def replicate_all(prompt_ids, neg_prompt_ids, seed):
+    p_prompt_ids = replicate(prompt_ids)
+    p_neg_prompt_ids = replicate(neg_prompt_ids)
+    rng = jax.random.PRNGKey(seed)
+    rng = jax.random.split(rng, NUM_DEVICES)
+    return p_prompt_ids, p_neg_prompt_ids, rng
+
+
+# 6. Let's now put it all together in a generate function
+def generate(
+    prompt,
+    negative_prompt,
+    seed=default_seed,
+    guidance_scale=default_guidance_scale,
+    num_inference_steps=default_num_steps,
+):
+    prompt_ids, neg_prompt_ids = tokenize_prompt(prompt, negative_prompt)
+    prompt_ids, neg_prompt_ids, rng = replicate_all(prompt_ids, neg_prompt_ids, seed)
+    images = pipeline(
+        prompt_ids,
+        p_params,
+        rng,
+        num_inference_steps=num_inference_steps,
+        neg_prompt_ids=neg_prompt_ids,
+        guidance_scale=guidance_scale,
+        jit=True,
+    ).images
+
+    # convert the images to PIL
+    images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
+    return pipeline.numpy_to_pil(np.array(images))
+
+
+# 7. Remember that the first call will compile the function and hence be very slow. Let's run generate once
+# so that the pipeline call is compiled
+start = time.time()
+print("Compiling ...")
+generate(default_prompt, default_neg_prompt)
+print(f"Compiled in {time.time() - start}")
+
+# 8. Now the model forward pass will run very quickly, let's try it again
+start = time.time()
+prompt = "photo of a rhino dressed suit and tie sitting at a table in a bar with a bar stools, award winning photography, Elke vogelsang"
+neg_prompt = "cartoon, illustration, animation. face. male, female"
+images = generate(prompt, neg_prompt)
+print(f"Inference in {time.time() - start}")
+
+for i, image in enumerate(images):
+    image.save(f"castle_{i}.png")
diff --git a/diffusers/examples/research_projects/sdxl_flax/sdxl_single_aot.py b/diffusers/examples/research_projects/sdxl_flax/sdxl_single_aot.py
new file mode 100644
index 0000000000000000000000000000000000000000..08bd13902aa9b3caa8553de12b6702a07d6a0936
--- /dev/null
+++ b/diffusers/examples/research_projects/sdxl_flax/sdxl_single_aot.py
@@ -0,0 +1,143 @@
+import time
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.jax_utils import replicate
+from jax import pmap
+
+# Let's cache the model compilation, so that it doesn't take as long the next time around.
+from jax.experimental.compilation_cache import compilation_cache as cc
+
+from diffusers import FlaxStableDiffusionXLPipeline
+
+
+cc.initialize_cache("/tmp/sdxl_cache")
+
+
+NUM_DEVICES = jax.device_count()
+
+# 1. Let's start by downloading the model and loading it into our pipeline class
+# Adhering to JAX's functional approach, the model's parameters are returned separately and
+# will have to be passed to the pipeline during inference
+pipeline, params = FlaxStableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", revision="refs/pr/95", split_head_dim=True
+)
+
+# 2. We cast all parameters to bfloat16 EXCEPT the scheduler which we leave in
+# float32 to keep maximal precision
+scheduler_state = params.pop("scheduler")
+params = jax.tree_util.tree_map(lambda x: x.astype(jnp.bfloat16), params)
+params["scheduler"] = scheduler_state
+
+# 3. Next, we define the different inputs to the pipeline
+default_prompt = "a colorful photo of a castle in the middle of a forest with trees and bushes, by Ismail Inceoglu, shadows, high contrast, dynamic shading, hdr, detailed vegetation, digital painting, digital drawing, detailed painting, a detailed digital painting, gothic art, featured on deviantart"
+default_neg_prompt = "fog, grainy, purple"
+default_seed = 33
+default_guidance_scale = 5.0
+default_num_steps = 25
+width = 1024
+height = 1024
+
+
+# 4. In order to be able to compile the pipeline
+# all inputs have to be tensors or strings
+# Let's tokenize the prompt and negative prompt
+def tokenize_prompt(prompt, neg_prompt):
+    prompt_ids = pipeline.prepare_inputs(prompt)
+    neg_prompt_ids = pipeline.prepare_inputs(neg_prompt)
+    return prompt_ids, neg_prompt_ids
+
+
+# 5. To make full use of JAX's parallelization capabilities
+# the parameters and input tensors are duplicated across devices
+# To make sure every device generates a different image, we create
+# different seeds for each image. The model parameters won't change
+# during inference so we do not wrap them into a function
+p_params = replicate(params)
+
+
+def replicate_all(prompt_ids, neg_prompt_ids, seed):
+    p_prompt_ids = replicate(prompt_ids)
+    p_neg_prompt_ids = replicate(neg_prompt_ids)
+    rng = jax.random.PRNGKey(seed)
+    rng = jax.random.split(rng, NUM_DEVICES)
+    return p_prompt_ids, p_neg_prompt_ids, rng
+
+
+# 6. To compile the pipeline._generate function, we must pass all parameters
+# to the function and tell JAX which are static arguments, that is, arguments that
+# are known at compile time and won't change. In our case, it is num_inference_steps,
+# height, width and return_latents.
+# Once the function is compiled, these parameters are omitted from future calls and
+# cannot be changed without modifying the code and recompiling.
+def aot_compile(
+    prompt=default_prompt,
+    negative_prompt=default_neg_prompt,
+    seed=default_seed,
+    guidance_scale=default_guidance_scale,
+    num_inference_steps=default_num_steps,
+):
+    prompt_ids, neg_prompt_ids = tokenize_prompt(prompt, negative_prompt)
+    prompt_ids, neg_prompt_ids, rng = replicate_all(prompt_ids, neg_prompt_ids, seed)
+    g = jnp.array([guidance_scale] * prompt_ids.shape[0], dtype=jnp.float32)
+    g = g[:, None]
+
+    return (
+        pmap(pipeline._generate, static_broadcasted_argnums=[3, 4, 5, 9])
+        .lower(
+            prompt_ids,
+            p_params,
+            rng,
+            num_inference_steps,  # num_inference_steps
+            height,  # height
+            width,  # width
+            g,
+            None,
+            neg_prompt_ids,
+            False,  # return_latents
+        )
+        .compile()
+    )
+
+
+start = time.time()
+print("Compiling ...")
+p_generate = aot_compile()
+print(f"Compiled in {time.time() - start}")
+
+
+# 7. Let's now put it all together in a generate function.
+def generate(prompt, negative_prompt, seed=default_seed, guidance_scale=default_guidance_scale):
+    prompt_ids, neg_prompt_ids = tokenize_prompt(prompt, negative_prompt)
+    prompt_ids, neg_prompt_ids, rng = replicate_all(prompt_ids, neg_prompt_ids, seed)
+    g = jnp.array([guidance_scale] * prompt_ids.shape[0], dtype=jnp.float32)
+    g = g[:, None]
+    images = p_generate(prompt_ids, p_params, rng, g, None, neg_prompt_ids)
+
+    # convert the images to PIL
+    images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
+    return pipeline.numpy_to_pil(np.array(images))
+
+
+# 8. The first forward pass after AOT compilation still takes a while longer than
+# subsequent passes, this is because on the first pass, JAX uses Python dispatch, which
+# Fills the C++ dispatch cache.
+# When using jit, this extra step is done automatically, but when using AOT compilation,
+# it doesn't happen until the function call is made.
+start = time.time()
+prompt = "photo of a rhino dressed suit and tie sitting at a table in a bar with a bar stools, award winning photography, Elke vogelsang"
+neg_prompt = "cartoon, illustration, animation. face. male, female"
+images = generate(prompt, neg_prompt)
+print(f"First inference in {time.time() - start}")
+
+# 9. From this point forward, any calls to generate should result in a faster inference
+# time and it won't change.
+start = time.time()
+prompt = "photo of a rhino dressed suit and tie sitting at a table in a bar with a bar stools, award winning photography, Elke vogelsang"
+neg_prompt = "cartoon, illustration, animation. face. male, female"
+images = generate(prompt, neg_prompt)
+print(f"Inference in {time.time() - start}")
+
+for i, image in enumerate(images):
+    image.save(f"castle_{i}.png")
diff --git a/diffusers/examples/research_projects/vae/README.md b/diffusers/examples/research_projects/vae/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5bb8a5ffd2caccee3052e7f2aacc25a540fbf98c
--- /dev/null
+++ b/diffusers/examples/research_projects/vae/README.md
@@ -0,0 +1,11 @@
+# VAE
+
+`vae_roundtrip.py` Demonstrates the use of a VAE by roundtripping an image through the encoder and decoder. Original and reconstructed images are displayed side by side.
+
+```
+cd examples/research_projects/vae
+python vae_roundtrip.py \
+    --pretrained_model_name_or_path="stable-diffusion-v1-5/stable-diffusion-v1-5" \
+    --subfolder="vae" \
+    --input_image="/path/to/your/input.png"
+```
diff --git a/diffusers/examples/research_projects/vae/vae_roundtrip.py b/diffusers/examples/research_projects/vae/vae_roundtrip.py
new file mode 100644
index 0000000000000000000000000000000000000000..65c2b43a9bde606d2fd78910e837559d3bdc60c8
--- /dev/null
+++ b/diffusers/examples/research_projects/vae/vae_roundtrip.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import typing
+from typing import Optional, Union
+
+import torch
+from PIL import Image
+from torchvision import transforms  # type: ignore
+
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models.autoencoders.autoencoder_kl import (
+    AutoencoderKL,
+    AutoencoderKLOutput,
+)
+from diffusers.models.autoencoders.autoencoder_tiny import (
+    AutoencoderTiny,
+    AutoencoderTinyOutput,
+)
+from diffusers.models.autoencoders.vae import DecoderOutput
+
+
+SupportedAutoencoder = Union[AutoencoderKL, AutoencoderTiny]
+
+
+def load_vae_model(
+    *,
+    device: torch.device,
+    model_name_or_path: str,
+    revision: Optional[str],
+    variant: Optional[str],
+    # NOTE: use subfolder="vae" if the pointed model is for stable diffusion as a whole instead of just the VAE
+    subfolder: Optional[str],
+    use_tiny_nn: bool,
+) -> SupportedAutoencoder:
+    if use_tiny_nn:
+        # NOTE: These scaling factors don't have to be the same as each other.
+        down_scale = 2
+        up_scale = 2
+        vae = AutoencoderTiny.from_pretrained(  # type: ignore
+            model_name_or_path,
+            subfolder=subfolder,
+            revision=revision,
+            variant=variant,
+            downscaling_scaling_factor=down_scale,
+            upsampling_scaling_factor=up_scale,
+        )
+        assert isinstance(vae, AutoencoderTiny)
+    else:
+        vae = AutoencoderKL.from_pretrained(  # type: ignore
+            model_name_or_path,
+            subfolder=subfolder,
+            revision=revision,
+            variant=variant,
+        )
+        assert isinstance(vae, AutoencoderKL)
+    vae = vae.to(device)
+    vae.eval()  # Set the model to inference mode
+    return vae
+
+
+def pil_to_nhwc(
+    *,
+    device: torch.device,
+    image: Image.Image,
+) -> torch.Tensor:
+    assert image.mode == "RGB"
+    transform = transforms.ToTensor()
+    nhwc = transform(image).unsqueeze(0).to(device)  # type: ignore
+    assert isinstance(nhwc, torch.Tensor)
+    return nhwc
+
+
+def nhwc_to_pil(
+    *,
+    nhwc: torch.Tensor,
+) -> Image.Image:
+    assert nhwc.shape[0] == 1
+    hwc = nhwc.squeeze(0).cpu()
+    return transforms.ToPILImage()(hwc)  # type: ignore
+
+
+def concatenate_images(
+    *,
+    left: Image.Image,
+    right: Image.Image,
+    vertical: bool = False,
+) -> Image.Image:
+    width1, height1 = left.size
+    width2, height2 = right.size
+    if vertical:
+        total_height = height1 + height2
+        max_width = max(width1, width2)
+        new_image = Image.new("RGB", (max_width, total_height))
+        new_image.paste(left, (0, 0))
+        new_image.paste(right, (0, height1))
+    else:
+        total_width = width1 + width2
+        max_height = max(height1, height2)
+        new_image = Image.new("RGB", (total_width, max_height))
+        new_image.paste(left, (0, 0))
+        new_image.paste(right, (width1, 0))
+    return new_image
+
+
+def to_latent(
+    *,
+    rgb_nchw: torch.Tensor,
+    vae: SupportedAutoencoder,
+) -> torch.Tensor:
+    rgb_nchw = VaeImageProcessor.normalize(rgb_nchw)  # type: ignore
+    encoding_nchw = vae.encode(typing.cast(torch.FloatTensor, rgb_nchw))
+    if isinstance(encoding_nchw, AutoencoderKLOutput):
+        latent = encoding_nchw.latent_dist.sample()  # type: ignore
+        assert isinstance(latent, torch.Tensor)
+    elif isinstance(encoding_nchw, AutoencoderTinyOutput):
+        latent = encoding_nchw.latents
+        do_internal_vae_scaling = False  # Is this needed?
+        if do_internal_vae_scaling:
+            latent = vae.scale_latents(latent).mul(255).round().byte()  # type: ignore
+            latent = vae.unscale_latents(latent / 255.0)  # type: ignore
+            assert isinstance(latent, torch.Tensor)
+    else:
+        assert False, f"Unknown encoding type: {type(encoding_nchw)}"
+    return latent
+
+
+def from_latent(
+    *,
+    latent_nchw: torch.Tensor,
+    vae: SupportedAutoencoder,
+) -> torch.Tensor:
+    decoding_nchw = vae.decode(latent_nchw)  # type: ignore
+    assert isinstance(decoding_nchw, DecoderOutput)
+    rgb_nchw = VaeImageProcessor.denormalize(decoding_nchw.sample)  # type: ignore
+    assert isinstance(rgb_nchw, torch.Tensor)
+    return rgb_nchw
+
+
+def main_kwargs(
+    *,
+    device: torch.device,
+    input_image_path: str,
+    pretrained_model_name_or_path: str,
+    revision: Optional[str],
+    variant: Optional[str],
+    subfolder: Optional[str],
+    use_tiny_nn: bool,
+) -> None:
+    vae = load_vae_model(
+        device=device,
+        model_name_or_path=pretrained_model_name_or_path,
+        revision=revision,
+        variant=variant,
+        subfolder=subfolder,
+        use_tiny_nn=use_tiny_nn,
+    )
+    original_pil = Image.open(input_image_path).convert("RGB")
+    original_image = pil_to_nhwc(
+        device=device,
+        image=original_pil,
+    )
+    print(f"Original image shape: {original_image.shape}")
+    reconstructed_image: Optional[torch.Tensor] = None
+
+    with torch.no_grad():
+        latent_image = to_latent(rgb_nchw=original_image, vae=vae)
+        print(f"Latent shape: {latent_image.shape}")
+        reconstructed_image = from_latent(latent_nchw=latent_image, vae=vae)
+        reconstructed_pil = nhwc_to_pil(nhwc=reconstructed_image)
+    combined_image = concatenate_images(
+        left=original_pil,
+        right=reconstructed_pil,
+        vertical=False,
+    )
+    combined_image.show("Original | Reconstruction")
+    print(f"Reconstructed image shape: {reconstructed_image.shape}")
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Inference with VAE")
+    parser.add_argument(
+        "--input_image",
+        type=str,
+        required=True,
+        help="Path to the input image for inference.",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        required=True,
+        help="Path to pretrained VAE model.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        help="Model version.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Model file variant, e.g., 'fp16'.",
+    )
+    parser.add_argument(
+        "--subfolder",
+        type=str,
+        default=None,
+        help="Subfolder in the model file.",
+    )
+    parser.add_argument(
+        "--use_cuda",
+        action="store_true",
+        help="Use CUDA if available.",
+    )
+    parser.add_argument(
+        "--use_tiny_nn",
+        action="store_true",
+        help="Use tiny neural network.",
+    )
+    return parser.parse_args()
+
+
+# EXAMPLE USAGE:
+#
+# python vae_roundtrip.py --use_cuda --pretrained_model_name_or_path "runwayml/stable-diffusion-v1-5" --subfolder "vae" --input_image "foo.png"
+#
+# python vae_roundtrip.py --use_cuda --pretrained_model_name_or_path "madebyollin/taesd" --use_tiny_nn --input_image "foo.png"
+#
+def main_cli() -> None:
+    args = parse_args()
+
+    input_image_path = args.input_image
+    assert isinstance(input_image_path, str)
+
+    pretrained_model_name_or_path = args.pretrained_model_name_or_path
+    assert isinstance(pretrained_model_name_or_path, str)
+
+    revision = args.revision
+    assert isinstance(revision, (str, type(None)))
+
+    variant = args.variant
+    assert isinstance(variant, (str, type(None)))
+
+    subfolder = args.subfolder
+    assert isinstance(subfolder, (str, type(None)))
+
+    use_cuda = args.use_cuda
+    assert isinstance(use_cuda, bool)
+
+    use_tiny_nn = args.use_tiny_nn
+    assert isinstance(use_tiny_nn, bool)
+
+    device = torch.device("cuda" if use_cuda else "cpu")
+
+    main_kwargs(
+        device=device,
+        input_image_path=input_image_path,
+        pretrained_model_name_or_path=pretrained_model_name_or_path,
+        revision=revision,
+        variant=variant,
+        subfolder=subfolder,
+        use_tiny_nn=use_tiny_nn,
+    )
+
+
+if __name__ == "__main__":
+    main_cli()
diff --git a/diffusers/examples/t2i_adapter/README.md b/diffusers/examples/t2i_adapter/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7d7491950d0ec22ed6f9badd8307efee73789786
--- /dev/null
+++ b/diffusers/examples/t2i_adapter/README.md
@@ -0,0 +1 @@
+We don't yet support training T2I-Adapters on Stable Diffusion yet. For training T2I-Adapters on Stable Diffusion XL, refer [here](./README_sdxl.md).
\ No newline at end of file
diff --git a/diffusers/examples/t2i_adapter/README_sdxl.md b/diffusers/examples/t2i_adapter/README_sdxl.md
new file mode 100644
index 0000000000000000000000000000000000000000..1e5a19fedad1b1e94de1d8a8ad3202fa7a96c53c
--- /dev/null
+++ b/diffusers/examples/t2i_adapter/README_sdxl.md
@@ -0,0 +1,131 @@
+# T2I-Adapter training example for Stable Diffusion XL (SDXL)
+
+The `train_t2i_adapter_sdxl.py` script shows how to implement the [T2I-Adapter training procedure](https://hf.co/papers/2302.08453) for [Stable Diffusion XL](https://huggingface.co/papers/2307.01952).
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the `examples/t2i_adapter` folder and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell (e.g., a notebook)
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+
+## Circle filling dataset
+
+The original dataset is hosted in the [ControlNet repo](https://huggingface.co/lllyasviel/ControlNet/blob/main/training/fill50k.zip). We re-uploaded it to be compatible with `datasets` [here](https://huggingface.co/datasets/fusing/fill50k). Note that `datasets` handles dataloading within the training script.
+
+## Training
+
+Our training examples use two test conditioning images. They can be downloaded by running
+
+```sh
+wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png
+
+wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png
+```
+
+Then run `huggingface-cli login` to log into your Hugging Face account. This is needed to be able to push the trained T2IAdapter parameters to Hugging Face Hub.
+
+```bash
+export MODEL_DIR="stabilityai/stable-diffusion-xl-base-1.0"
+export OUTPUT_DIR="path to save model"
+
+accelerate launch train_t2i_adapter_sdxl.py \
+ --pretrained_model_name_or_path=$MODEL_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --dataset_name=fusing/fill50k \
+ --mixed_precision="fp16" \
+ --resolution=1024 \
+ --learning_rate=1e-5 \
+ --max_train_steps=15000 \
+ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
+ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
+ --validation_steps=100 \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=4 \
+ --report_to="wandb" \
+ --seed=42 \
+ --push_to_hub
+```
+
+To better track our training experiments, we're using the following flags in the command above:
+
+* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
+* `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+Our experiments were conducted on a single 40GB A100 GPU.
+
+### Inference
+
+Once training is done, we can perform inference like so:
+
+```python
+from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, EulerAncestralDiscreteSchedulerTest
+from diffusers.utils import load_image
+import torch
+
+base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
+adapter_path = "path to adapter"
+
+adapter = T2IAdapter.from_pretrained(adapter_path, torch_dtype=torch.float16)
+pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
+    base_model_path, adapter=adapter, torch_dtype=torch.float16
+)
+
+# speed up diffusion process with faster scheduler and memory optimization
+pipe.scheduler = EulerAncestralDiscreteSchedulerTest.from_config(pipe.scheduler.config)
+# remove following line if xformers is not installed or when using Torch 2.0.
+pipe.enable_xformers_memory_efficient_attention()
+# memory optimization.
+pipe.enable_model_cpu_offload()
+
+control_image = load_image("./conditioning_image_1.png")
+prompt = "pale golden rod circle with old lace background"
+
+# generate image
+generator = torch.manual_seed(0)
+image = pipe(
+    prompt, num_inference_steps=20, generator=generator, image=control_image
+).images[0]
+image.save("./output.png")
+```
+
+## Notes
+
+### Specifying a better VAE
+
+SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
diff --git a/diffusers/examples/t2i_adapter/requirements.txt b/diffusers/examples/t2i_adapter/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2955535b192796e2618393012560a7b534fbea23
--- /dev/null
+++ b/diffusers/examples/t2i_adapter/requirements.txt
@@ -0,0 +1,8 @@
+transformers>=4.25.1
+accelerate>=0.16.0
+safetensors
+datasets
+torchvision
+ftfy
+tensorboard
+wandb
\ No newline at end of file
diff --git a/diffusers/examples/t2i_adapter/test_t2i_adapter.py b/diffusers/examples/t2i_adapter/test_t2i_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdf124cdd9321a37540e5de6645d905eba3900d7
--- /dev/null
+++ b/diffusers/examples/t2i_adapter/test_t2i_adapter.py
@@ -0,0 +1,51 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class T2IAdapter(ExamplesTestsAccelerate):
+    def test_t2i_adapter_sdxl(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/t2i_adapter/train_t2i_adapter_sdxl.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-xl-pipe
+            --adapter_model_name_or_path=hf-internal-testing/tiny-adapter
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=9
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
diff --git a/diffusers/examples/t2i_adapter/train_t2i_adapter_sdxl.py b/diffusers/examples/t2i_adapter/train_t2i_adapter_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..02a064fa81ed30ea8f04331fcfbdc285b885c0a2
--- /dev/null
+++ b/diffusers/examples/t2i_adapter/train_t2i_adapter_sdxl.py
@@ -0,0 +1,1327 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import functools
+import gc
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import accelerate
+import numpy as np
+import torch
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    EulerDiscreteScheduler,
+    StableDiffusionXLAdapterPipeline,
+    T2IAdapter,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+MAX_SEQ_LENGTH = 77
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def image_grid(imgs, rows, cols):
+    assert len(imgs) == rows * cols
+
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+
+
+def log_validation(vae, unet, adapter, args, accelerator, weight_dtype, step):
+    logger.info("Running validation... ")
+
+    adapter = accelerator.unwrap_model(adapter)
+
+    pipeline = StableDiffusionXLAdapterPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        vae=vae,
+        unet=unet,
+        adapter=adapter,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    if len(args.validation_image) == len(args.validation_prompt):
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_image) == 1:
+        validation_images = args.validation_image * len(args.validation_prompt)
+        validation_prompts = args.validation_prompt
+    elif len(args.validation_prompt) == 1:
+        validation_images = args.validation_image
+        validation_prompts = args.validation_prompt * len(args.validation_image)
+    else:
+        raise ValueError(
+            "number of `args.validation_image` and `args.validation_prompt` should be checked in `parse_args`"
+        )
+
+    image_logs = []
+
+    for validation_prompt, validation_image in zip(validation_prompts, validation_images):
+        validation_image = Image.open(validation_image).convert("RGB")
+        validation_image = validation_image.resize((args.resolution, args.resolution))
+
+        images = []
+
+        for _ in range(args.num_validation_images):
+            with torch.autocast("cuda"):
+                image = pipeline(
+                    prompt=validation_prompt, image=validation_image, num_inference_steps=20, generator=generator
+                ).images[0]
+            images.append(image)
+
+        image_logs.append(
+            {"validation_image": validation_image, "images": images, "validation_prompt": validation_prompt}
+        )
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                validation_image = log["validation_image"]
+
+                formatted_images = []
+
+                formatted_images.append(np.asarray(validation_image))
+
+                for image in images:
+                    formatted_images.append(np.asarray(image))
+
+                formatted_images = np.stack(formatted_images)
+
+                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            formatted_images = []
+
+            for log in image_logs:
+                images = log["images"]
+                validation_prompt = log["validation_prompt"]
+                validation_image = log["validation_image"]
+
+                formatted_images.append(wandb.Image(validation_image, caption="adapter conditioning"))
+
+                for image in images:
+                    image = wandb.Image(image, caption=validation_prompt)
+                    formatted_images.append(image)
+
+            tracker.log({"validation": formatted_images})
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+        del pipeline
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        return image_logs
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def save_model_card(repo_id: str, image_logs: dict = None, base_model: str = None, repo_folder: str = None):
+    img_str = ""
+    if image_logs is not None:
+        img_str = "You can find some example images below.\n"
+        for i, log in enumerate(image_logs):
+            images = log["images"]
+            validation_prompt = log["validation_prompt"]
+            validation_image = log["validation_image"]
+            validation_image.save(os.path.join(repo_folder, "image_control.png"))
+            img_str += f"prompt: {validation_prompt}\n"
+            images = [validation_image] + images
+            image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
+            img_str += f"![images_{i})](./images_{i}.png)\n"
+
+    model_description = f"""
+# t2iadapter-{repo_id}
+
+These are t2iadapter weights trained on {base_model} with new type of conditioning.
+{img_str}
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion-xl",
+        "stable-diffusion-xl-diffusers",
+        "text-to-image",
+        "diffusers",
+        "t2iadapter",
+        "diffusers-training",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to an improved VAE to stabilize training. For more details check out: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--adapter_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained adapter model or model identifier from huggingface.co/models."
+        " If not specified adapter weights are initialized w.r.t the configurations of SDXL.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be"
+            " float32 precision."
+        ),
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="t2iadapter-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--detection_resolution",
+        type=int,
+        default=None,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--crops_coords_top_left_h",
+        type=int,
+        default=0,
+        help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."),
+    )
+    parser.add_argument(
+        "--crops_coords_top_left_w",
+        type=int,
+        default=0,
+        help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
+            "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
+            "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
+            "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
+            "instructions."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=3,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=1,
+        help=("Number of subprocesses to use for data loading."),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing the target image."
+    )
+    parser.add_argument(
+        "--conditioning_image_column",
+        type=str,
+        default="conditioning_image",
+        help="The column of the dataset containing the adapter conditioning image.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of prompts evaluated every `--validation_steps` and logged to `--report_to`."
+            " Provide either a matching number of `--validation_image`s, a single `--validation_image`"
+            " to be used with all prompts, or a single prompt that will be used with all `--validation_image`s."
+        ),
+    )
+    parser.add_argument(
+        "--validation_image",
+        type=str,
+        default=None,
+        nargs="+",
+        help=(
+            "A set of paths to the t2iadapter conditioning image be evaluated every `--validation_steps`"
+            " and logged to `--report_to`. Provide either a matching number of `--validation_prompt`s, a"
+            " a single `--validation_prompt` to be used with all `--validation_image`s, or a single"
+            " `--validation_image` that will be used with all `--validation_prompt`s."
+        ),
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images to be generated for each `--validation_image`, `--validation_prompt` pair",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="sd_xl_train_t2iadapter",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--train_data_dir`")
+
+    if args.dataset_name is not None and args.train_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`")
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    if args.validation_prompt is not None and args.validation_image is None:
+        raise ValueError("`--validation_image` must be set if `--validation_prompt` is set")
+
+    if args.validation_prompt is None and args.validation_image is not None:
+        raise ValueError("`--validation_prompt` must be set if `--validation_image` is set")
+
+    if (
+        args.validation_image is not None
+        and args.validation_prompt is not None
+        and len(args.validation_image) != 1
+        and len(args.validation_prompt) != 1
+        and len(args.validation_image) != len(args.validation_prompt)
+    ):
+        raise ValueError(
+            "Must provide either 1 `--validation_image`, 1 `--validation_prompt`,"
+            " or the same number of `--validation_prompt`s and `--validation_image`s"
+        )
+
+    if args.resolution % 8 != 0:
+        raise ValueError(
+            "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the t2iadapter encoder."
+        )
+
+    return args
+
+
+def get_train_dataset(args, accelerator):
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        if args.train_data_dir is not None:
+            dataset = load_dataset(
+                args.train_data_dir,
+                cache_dir=args.cache_dir,
+            )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    if args.image_column is None:
+        image_column = column_names[0]
+        logger.info(f"image column defaulting to {image_column}")
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    if args.caption_column is None:
+        caption_column = column_names[1]
+        logger.info(f"caption column defaulting to {caption_column}")
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    if args.conditioning_image_column is None:
+        conditioning_image_column = column_names[2]
+        logger.info(f"conditioning image column defaulting to {conditioning_image_column}")
+    else:
+        conditioning_image_column = args.conditioning_image_column
+        if conditioning_image_column not in column_names:
+            raise ValueError(
+                f"`--conditioning_image_column` value '{args.conditioning_image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+            )
+
+    with accelerator.main_process_first():
+        train_dataset = dataset["train"].shuffle(seed=args.seed)
+        if args.max_train_samples is not None:
+            train_dataset = train_dataset.select(range(args.max_train_samples))
+    return train_dataset
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train=True):
+    prompt_embeds_list = []
+
+    captions = []
+    for caption in prompt_batch:
+        if random.random() < proportion_empty_prompts:
+            captions.append("")
+        elif isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                captions,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(
+                text_input_ids.to(text_encoder.device),
+                output_hidden_states=True,
+            )
+
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+            prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def prepare_train_dataset(dataset, accelerator):
+    image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    conditioning_image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[args.image_column]]
+        images = [image_transforms(image) for image in images]
+
+        conditioning_images = [image.convert("RGB") for image in examples[args.conditioning_image_column]]
+        conditioning_images = [conditioning_image_transforms(image) for image in conditioning_images]
+
+        examples["pixel_values"] = images
+        examples["conditioning_pixel_values"] = conditioning_images
+
+        return examples
+
+    with accelerator.main_process_first():
+        dataset = dataset.with_transform(preprocess_train)
+
+    return dataset
+
+
+def collate_fn(examples):
+    pixel_values = torch.stack([example["pixel_values"] for example in examples])
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    conditioning_pixel_values = torch.stack([example["conditioning_pixel_values"] for example in examples])
+    conditioning_pixel_values = conditioning_pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    prompt_ids = torch.stack([torch.tensor(example["prompt_embeds"]) for example in examples])
+
+    add_text_embeds = torch.stack([torch.tensor(example["text_embeds"]) for example in examples])
+    add_time_ids = torch.stack([torch.tensor(example["time_ids"]) for example in examples])
+
+    return {
+        "pixel_values": pixel_values,
+        "conditioning_pixel_values": conditioning_pixel_values,
+        "prompt_ids": prompt_ids,
+        "unet_added_conditions": {"text_embeds": add_text_embeds, "time_ids": add_time_ids},
+    }
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+                token=args.hub_token,
+                private=True,
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = EulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    if args.adapter_model_name_or_path:
+        logger.info("Loading existing adapter weights.")
+        t2iadapter = T2IAdapter.from_pretrained(args.adapter_model_name_or_path)
+    else:
+        logger.info("Initializing t2iadapter weights.")
+        t2iadapter = T2IAdapter(
+            in_channels=3,
+            channels=(320, 640, 1280, 1280),
+            num_res_blocks=2,
+            downscale_factor=16,
+            adapter_type="full_adapter_xl",
+        )
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            i = len(weights) - 1
+
+            while len(weights) > 0:
+                weights.pop()
+                model = models[i]
+
+                sub_dir = "t2iadapter"
+                model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+                i -= 1
+
+        def load_model_hook(models, input_dir):
+            while len(models) > 0:
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = T2IAdapter.from_pretrained(os.path.join(input_dir, "t2iadapter"))
+
+                if args.control_type != "style":
+                    model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    t2iadapter.train()
+    unet.train()
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training, copy of the weights should still be float32."
+    )
+
+    if unwrap_model(t2iadapter).dtype != torch.float32:
+        raise ValueError(
+            f"Controlnet loaded as datatype {unwrap_model(t2iadapter).dtype}. {low_precision_error_string}"
+        )
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = t2iadapter.parameters()
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae, unet and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    if args.pretrained_vae_model_name_or_path is not None:
+        vae.to(accelerator.device, dtype=weight_dtype)
+    else:
+        vae.to(accelerator.device, dtype=torch.float32)
+    unet.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    # Here, we compute not just the text embeddings but also the additional embeddings
+    # needed for the SD XL UNet to operate.
+    def compute_embeddings(batch, proportion_empty_prompts, text_encoders, tokenizers, is_train=True):
+        original_size = (args.resolution, args.resolution)
+        target_size = (args.resolution, args.resolution)
+        crops_coords_top_left = (args.crops_coords_top_left_h, args.crops_coords_top_left_w)
+        prompt_batch = batch[args.caption_column]
+
+        prompt_embeds, pooled_prompt_embeds = encode_prompt(
+            prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train
+        )
+        add_text_embeds = pooled_prompt_embeds
+
+        # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+        add_time_ids = torch.tensor([add_time_ids])
+
+        prompt_embeds = prompt_embeds.to(accelerator.device)
+        add_text_embeds = add_text_embeds.to(accelerator.device)
+        add_time_ids = add_time_ids.repeat(len(prompt_batch), 1)
+        add_time_ids = add_time_ids.to(accelerator.device, dtype=prompt_embeds.dtype)
+        unet_added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+        return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs}
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    # Let's first compute all the embeddings so that we can free up the text encoders
+    # from memory.
+    text_encoders = [text_encoder_one, text_encoder_two]
+    tokenizers = [tokenizer_one, tokenizer_two]
+    train_dataset = get_train_dataset(args, accelerator)
+    compute_embeddings_fn = functools.partial(
+        compute_embeddings,
+        proportion_empty_prompts=args.proportion_empty_prompts,
+        text_encoders=text_encoders,
+        tokenizers=tokenizers,
+    )
+    with accelerator.main_process_first():
+        from datasets.fingerprint import Hasher
+
+        # fingerprint used by the cache for the other processes to load the result
+        # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401
+        new_fingerprint = Hasher.hash(args)
+        train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint)
+
+    # Then get the training dataset ready to be passed to the dataloader.
+    train_dataset = prepare_train_dataset(train_dataset, accelerator)
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=args.max_train_steps,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    t2iadapter, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        t2iadapter, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+
+        # tensorboard cannot handle list types for config
+        tracker_config.pop("validation_prompt")
+        tracker_config.pop("validation_image")
+
+        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    image_logs = None
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(t2iadapter):
+                if args.pretrained_vae_model_name_or_path is not None:
+                    pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+                else:
+                    pixel_values = batch["pixel_values"]
+
+                # encode pixel values with batch size of at most 8 to avoid OOM
+                latents = []
+                for i in range(0, pixel_values.shape[0], 8):
+                    latents.append(vae.encode(pixel_values[i : i + 8]).latent_dist.sample())
+                latents = torch.cat(latents, dim=0)
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+
+                # Cubic sampling to sample a random timestep for each image.
+                # For more details about why cubic sampling is used, refer to section 3.4 of https://arxiv.org/abs/2302.08453
+                timesteps = torch.rand((bsz,), device=latents.device)
+                timesteps = (1 - timesteps**3) * noise_scheduler.config.num_train_timesteps
+                timesteps = timesteps.long().to(noise_scheduler.timesteps.dtype)
+                timesteps = timesteps.clamp(0, noise_scheduler.config.num_train_timesteps - 1)
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Scale the noisy latents for the UNet
+                sigmas = get_sigmas(timesteps, len(noisy_latents.shape), noisy_latents.dtype)
+                inp_noisy_latents = noisy_latents / ((sigmas**2 + 1) ** 0.5)
+
+                # Adapter conditioning.
+                t2iadapter_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype)
+                down_block_additional_residuals = t2iadapter(t2iadapter_image)
+                down_block_additional_residuals = [
+                    sample.to(dtype=weight_dtype) for sample in down_block_additional_residuals
+                ]
+
+                # Predict the noise residual
+                model_pred = unet(
+                    inp_noisy_latents,
+                    timesteps,
+                    encoder_hidden_states=batch["prompt_ids"],
+                    added_cond_kwargs=batch["unet_added_conditions"],
+                    down_block_additional_residuals=down_block_additional_residuals,
+                    return_dict=False,
+                )[0]
+
+                # Denoise the latents
+                denoised_latents = model_pred * (-sigmas) + noisy_latents
+                weighing = sigmas**-2.0
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = latents  # we are computing loss against denoise latents
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # MSE loss
+                loss = torch.mean(
+                    (weighing.float() * (denoised_latents.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    dim=1,
+                )
+                loss = loss.mean()
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = t2iadapter.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        image_logs = log_validation(
+                            vae,
+                            unet,
+                            t2iadapter,
+                            args,
+                            accelerator,
+                            weight_dtype,
+                            global_step,
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        t2iadapter = unwrap_model(t2iadapter)
+        t2iadapter.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                image_logs=image_logs,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/test_examples_utils.py b/diffusers/examples/test_examples_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b57a35e1f16ec4bcd401a4392f1bc16e7f0a2408
--- /dev/null
+++ b/diffusers/examples/test_examples_utils.py
@@ -0,0 +1,61 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+import subprocess
+import tempfile
+import unittest
+from typing import List
+
+from accelerate.utils import write_basic_config
+
+
+# These utils relate to ensuring the right error message is received when running scripts
+class SubprocessCallException(Exception):
+    pass
+
+
+def run_command(command: List[str], return_stdout=False):
+    """
+    Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
+    if an error occurred while running `command`
+    """
+    try:
+        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
+        if return_stdout:
+            if hasattr(output, "decode"):
+                output = output.decode("utf-8")
+            return output
+    except subprocess.CalledProcessError as e:
+        raise SubprocessCallException(
+            f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
+        ) from e
+
+
+class ExamplesTestsAccelerate(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls._tmpdir = tempfile.mkdtemp()
+        cls.configPath = os.path.join(cls._tmpdir, "default_config.yml")
+
+        write_basic_config(save_location=cls.configPath)
+        cls._launch_args = ["accelerate", "launch", "--config_file", cls.configPath]
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        shutil.rmtree(cls._tmpdir)
diff --git a/diffusers/examples/text_to_image/README.md b/diffusers/examples/text_to_image/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0bdf02f804bd837847cb0758d13bcdc9a802f4ef
--- /dev/null
+++ b/diffusers/examples/text_to_image/README.md
@@ -0,0 +1,339 @@
+# Stable Diffusion text-to-image fine-tuning
+
+The `train_text_to_image.py` script shows how to fine-tune stable diffusion model on your own dataset.
+
+___Note___:
+
+___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset.___
+
+
+## Running locally with PyTorch
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd in the example folder  and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+### Naruto example
+
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree.
+
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+
+Run the following command to authenticate your token
+
+```bash
+huggingface-cli login
+```
+
+If you have already cloned the repo, then you won't need to go through these steps.
+
+<br>
+
+#### Hardware
+With `gradient_checkpointing` and `mixed_precision` it should be possible to fine tune the model on a single 24GB GPU. For higher `batch_size` and faster training it's better to use GPUs with >30GB memory.
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+<!-- accelerate_snippet_start -->
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+accelerate launch --mixed_precision="fp16"  train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$DATASET_NAME \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir="sd-naruto-model"
+```
+<!-- accelerate_snippet_end -->
+
+
+To run on your own training files prepare the dataset according to the format required by `datasets`, you can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata).
+If you wish to use custom loading logic, you should modify the script, we have left pointers for that in the training script.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export TRAIN_DIR="path_to_your_dataset"
+
+accelerate launch --mixed_precision="fp16" train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir="sd-naruto-model"
+```
+
+
+Once the training is finished the model will be saved in the `output_dir` specified in the command. In this example it's `sd-naruto-model`. To load the fine-tuned model for inference just pass that path to `StableDiffusionPipeline`
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+
+model_path = "path_to_saved_model"
+pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(prompt="yoda").images[0]
+image.save("yoda-naruto.png")
+```
+
+Checkpoints only save the unet, so to run inference from a checkpoint, just load the unet
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline, UNet2DConditionModel
+
+model_path = "path_to_saved_model"
+unet = UNet2DConditionModel.from_pretrained(model_path + "/checkpoint-<N>/unet", torch_dtype=torch.float16)
+
+pipe = StableDiffusionPipeline.from_pretrained("<initial model>", unet=unet, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(prompt="yoda").images[0]
+image.save("yoda-naruto.png")
+```
+
+#### Training with multiple GPUs
+
+`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch)
+for running distributed training with `accelerate`. Here is an example command:
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+accelerate launch --mixed_precision="fp16" --multi_gpu  train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$DATASET_NAME \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir="sd-naruto-model"
+```
+
+
+#### Training with Min-SNR weighting
+
+We support training with the Min-SNR weighting strategy proposed in [Efficient Diffusion Training via Min-SNR Weighting Strategy](https://arxiv.org/abs/2303.09556) which helps to achieve faster convergence
+by rebalancing the loss. In order to use it, one needs to set the `--snr_gamma` argument. The recommended
+value when using it is 5.0.
+
+You can find [this project on Weights and Biases](https://wandb.ai/sayakpaul/text2image-finetune-minsnr) that compares the loss surfaces of the following setups:
+
+* Training without the Min-SNR weighting strategy
+* Training with the Min-SNR weighting strategy (`snr_gamma` set to 5.0)
+* Training with the Min-SNR weighting strategy (`snr_gamma` set to 1.0)
+
+For our small Narutos dataset, the effects of Min-SNR weighting strategy might not appear to be pronounced, but for larger datasets, we believe the effects will be more pronounced.
+
+Also, note that in this example, we either predict `epsilon` (i.e., the noise) or the `v_prediction`. For both of these cases, the formulation of the Min-SNR weighting strategy that we have used holds.
+
+
+#### Training with EMA weights
+
+Through the `EMAModel` class, we support a convenient method of tracking an exponential moving average of model parameters.  This helps to smooth out noise in model parameter updates and generally improves model performance.  If enabled with the `--use_ema` argument, the final model checkpoint that is saved at the end of training will use the EMA weights.
+
+EMA weights require an additional full-precision copy of the model parameters to be stored in memory, but otherwise have very little performance overhead.  `--foreach_ema` can be used to further reduce the overhead.  If you are short on VRAM and still want to use EMA weights, you can store them in CPU RAM by using the `--offload_ema` argument.  This will keep the EMA weights in pinned CPU memory during the training step.  Then, once every model parameter update, it will transfer the EMA weights back to the GPU which can then update the parameters on the GPU, before sending them back to the CPU.  Both of these transfers are set up as non-blocking, so CUDA devices should be able to overlap this transfer with other computations.  With sufficient bandwidth between the host and device and a sufficiently long gap between model parameter updates, storing EMA weights in CPU RAM should have no additional performance overhead, as long as no other calls force synchronization.
+
+#### Training with DREAM
+
+We support training epsilon (noise) prediction models using the [DREAM (Diffusion Rectification and Estimation-Adaptive Models) strategy](https://arxiv.org/abs/2312.00210). DREAM claims to increase model fidelity for the performance cost of an extra grad-less unet `forward` step in the training loop.  You can turn on DREAM training by using the `--dream_training` argument. The `--dream_detail_preservation` argument controls the detail preservation variable p and is the default of 1 from the paper.
+
+
+
+## Training with LoRA
+
+Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
+
+In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
+
+- Previous pretrained weights are kept frozen so that model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114).
+- Rank-decomposition matrices have significantly fewer parameters than original model, which means that trained LoRA weights are easily portable.
+- LoRA attention layers allow to control to which extent the model is adapted toward new training images via a `scale` parameter.
+
+[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
+
+With LoRA, it's possible to fine-tune Stable Diffusion on a custom image-caption pair dataset
+on consumer GPUs like Tesla T4, Tesla V100.
+
+### Training
+
+First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Narutos dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+**___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [Weights and Biases](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training to automatically log images.___**
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+```
+
+For this example we want to directly store the trained LoRA embeddings on the Hub, so
+we need to be logged in and add the `--push_to_hub` flag.
+
+```bash
+huggingface-cli login
+```
+
+Now we can start training!
+
+```bash
+accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$DATASET_NAME --caption_column="text" \
+  --resolution=512 --random_flip \
+  --train_batch_size=1 \
+  --num_train_epochs=100 --checkpointing_steps=5000 \
+  --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --seed=42 \
+  --output_dir="sd-naruto-model-lora" \
+  --validation_prompt="cute dragon creature" --report_to="wandb"
+```
+
+The above command will also run inference as fine-tuning progresses and log the results to Weights and Biases.
+
+**___Note: When using LoRA we can use a much higher learning rate compared to non-LoRA fine-tuning. Here we use *1e-4* instead of the usual *1e-5*. Also, by using LoRA, it's possible to run `train_text_to_image_lora.py` in consumer GPUs like T4 or V100.___**
+
+The final LoRA embedding weights have been uploaded to [sayakpaul/sd-model-finetuned-lora-t4](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4). **___Note: [The final weights](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/pytorch_lora_weights.bin) are only 3 MB in size, which is orders of magnitudes smaller than the original model.___**
+
+You can check some inference samples that were logged during the course of the fine-tuning process [here](https://wandb.ai/sayakpaul/text2image-fine-tune/runs/q4lc0xsw).
+
+### Inference
+
+Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline` after loading the trained LoRA weights.  You
+need to pass the `output_dir` for loading the LoRA weights which, in this case, is `sd-naruto-model-lora`.
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_path = "sayakpaul/sd-model-finetuned-lora-t4"
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
+pipe.unet.load_attn_procs(model_path)
+pipe.to("cuda")
+
+prompt = "A naruto with green eyes and red legs."
+image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+image.save("naruto.png")
+```
+
+If you are loading the LoRA parameters from the Hub and if the Hub repository has
+a `base_model` tag (such as [this](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/README.md?code=true#L4)), then
+you can do:
+
+```py
+from huggingface_hub.repocard import RepoCard
+
+lora_model_id = "sayakpaul/sd-model-finetuned-lora-t4"
+card = RepoCard.load(lora_model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
+...
+```
+
+## Training with Flax/JAX
+
+For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
+
+**___Note: The flax example doesn't yet support features like gradient checkpoint, gradient accumulation etc, so to use flax for faster training we will need >30GB cards or TPU v3.___**
+
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install -U -r requirements_flax.txt
+```
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+python train_text_to_image_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$DATASET_NAME \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --output_dir="sd-naruto-model"
+```
+
+To run on your own training files prepare the dataset according to the format required by `datasets`, you can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata).
+If you wish to use custom loading logic, you should modify the script, we have left pointers for that in the training script.
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export TRAIN_DIR="path_to_your_dataset"
+
+python train_text_to_image_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --output_dir="sd-naruto-model"
+```
+
+### Training with xFormers:
+
+You can enable memory efficient attention by [installing xFormers](https://huggingface.co/docs/diffusers/main/en/optimization/xformers) and passing the `--enable_xformers_memory_efficient_attention` argument to the script.
+
+xFormers training is not available for Flax/JAX.
+
+**Note**:
+
+According to [this issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training in some GPUs. If you observe that problem, please install a development version as indicated in that comment.
+
+## Stable Diffusion XL
+
+* We support fine-tuning the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) via the `train_text_to_image_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md).
+* We also support fine-tuning of the UNet and Text Encoder shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with LoRA via the `train_text_to_image_lora_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md).
diff --git a/diffusers/examples/text_to_image/README_sdxl.md b/diffusers/examples/text_to_image/README_sdxl.md
new file mode 100644
index 0000000000000000000000000000000000000000..08d82ac133f416a8075be16ba673265927e0a82a
--- /dev/null
+++ b/diffusers/examples/text_to_image/README_sdxl.md
@@ -0,0 +1,285 @@
+# Stable Diffusion XL text-to-image fine-tuning
+
+The `train_text_to_image_sdxl.py` script shows how to fine-tune Stable Diffusion XL (SDXL) on your own dataset.
+
+🚨 This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset. 🚨
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the `examples/text_to_image` folder and run
+```bash
+pip install -r requirements_sdxl.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell (e.g., a notebook)
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+### Training
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+accelerate launch train_text_to_image_sdxl.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --pretrained_vae_model_name_or_path=$VAE_NAME \
+  --dataset_name=$DATASET_NAME \
+  --enable_xformers_memory_efficient_attention \
+  --resolution=512 --center_crop --random_flip \
+  --proportion_empty_prompts=0.2 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 --gradient_checkpointing \
+  --max_train_steps=10000 \
+  --use_8bit_adam \
+  --learning_rate=1e-06 --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --mixed_precision="fp16" \
+  --report_to="wandb" \
+  --validation_prompt="a cute Sundar Pichai creature" --validation_epochs 5 \
+  --checkpointing_steps=5000 \
+  --output_dir="sdxl-naruto-model" \
+  --push_to_hub
+```
+
+**Notes**:
+
+*  The `train_text_to_image_sdxl.py` script pre-computes text embeddings and the VAE encodings and keeps them in memory. While for smaller datasets like [`lambdalabs/naruto-blip-captions`](https://hf.co/datasets/lambdalabs/naruto-blip-captions), it might not be a problem, it can definitely lead to memory problems when the script is used on a larger dataset. For those purposes, you would want to serialize these pre-computed representations to disk separately and load them during the fine-tuning process. Refer to [this PR](https://github.com/huggingface/diffusers/pull/4505) for a more in-depth discussion.
+* The training script is compute-intensive and may not run on a consumer GPU like Tesla T4.
+* The training command shown above performs intermediate quality validation in between the training epochs and logs the results to Weights and Biases. `--report_to`, `--validation_prompt`, and `--validation_epochs` are the relevant CLI arguments here.
+* SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
+
+### Inference
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+model_path = "you-model-id-goes-here" # <-- change this
+pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+prompt = "A naruto with green eyes and red legs."
+image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+image.save("naruto.png")
+```
+
+### Inference in Pytorch XLA
+```python
+from diffusers import DiffusionPipeline
+import torch
+import torch_xla.core.xla_model as xm
+
+model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+pipe = DiffusionPipeline.from_pretrained(model_id)
+
+device = xm.xla_device()
+pipe.to(device)
+
+prompt = "A naruto with green eyes and red legs."
+start = time()
+image = pipe(prompt, num_inference_steps=inference_steps).images[0]
+print(f'Compilation time is {time()-start} sec')
+image.save("naruto.png")
+
+start = time()
+image = pipe(prompt, num_inference_steps=inference_steps).images[0]
+print(f'Inference time is {time()-start} sec after compilation')
+```
+
+Note: There is a warmup step in PyTorch XLA. This takes longer because of
+compilation and optimization. To see the real benefits of Pytorch XLA and
+speedup, we need to call the pipe again on the input with the same length
+as the original prompt to reuse the optimized graph and get the performance
+boost.
+
+## LoRA training example for Stable Diffusion XL (SDXL)
+
+Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
+
+In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
+
+- Previous pretrained weights are kept frozen so that model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114).
+- Rank-decomposition matrices have significantly fewer parameters than original model, which means that trained LoRA weights are easily portable.
+- LoRA attention layers allow to control to which extent the model is adapted toward new training images via a `scale` parameter.
+
+[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
+
+With LoRA, it's possible to fine-tune Stable Diffusion on a custom image-caption pair dataset
+on consumer GPUs like Tesla T4, Tesla V100.
+
+### Training
+
+First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables and, optionally, the `VAE_NAME` variable. Here, we will use [Stable Diffusion XL 1.0-base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and the [Narutos dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
+
+**___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [Weights and Biases](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training to automatically log images.___**
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+```
+
+For this example we want to directly store the trained LoRA embeddings on the Hub, so
+we need to be logged in and add the `--push_to_hub` flag.
+
+```bash
+huggingface-cli login
+```
+
+Now we can start training!
+
+```bash
+accelerate launch train_text_to_image_lora_sdxl.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --pretrained_vae_model_name_or_path=$VAE_NAME \
+  --dataset_name=$DATASET_NAME --caption_column="text" \
+  --resolution=1024 --random_flip \
+  --train_batch_size=1 \
+  --num_train_epochs=2 --checkpointing_steps=500 \
+  --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --mixed_precision="fp16" \
+  --seed=42 \
+  --output_dir="sd-naruto-model-lora-sdxl" \
+  --validation_prompt="cute dragon creature" --report_to="wandb" \
+  --push_to_hub
+```
+
+The above command will also run inference as fine-tuning progresses and log the results to Weights and Biases.
+
+**Notes**:
+
+* SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
+
+
+### Using DeepSpeed
+Using DeepSpeed one can reduce the consumption of GPU memory, enabling the training of models on GPUs with smaller memory sizes. DeepSpeed is capable of offloading model parameters to the machine's memory, or it can distribute parameters, gradients, and optimizer states across multiple GPUs. This allows for the training of larger models under the same hardware configuration.
+
+First, you need to use the `accelerate config` command to choose to use DeepSpeed, or manually use the accelerate config file to set up DeepSpeed.
+
+Here is an example of a config file for using DeepSpeed. For more detailed explanations of the configuration, you can refer to this [link](https://huggingface.co/docs/accelerate/usage_guides/deepspeed).
+```yaml
+compute_environment: LOCAL_MACHINE
+debug: true
+deepspeed_config:
+  gradient_accumulation_steps: 1
+  gradient_clipping: 1.0
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: fp16
+num_machines: 1
+num_processes: 1
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+```
+You need to save the mentioned configuration as an `accelerate_config.yaml` file. Then, you need to input the path of your `accelerate_config.yaml` file into the `ACCELERATE_CONFIG_FILE` parameter. This way you can use DeepSpeed to train your SDXL model in LoRA. Additionally, you can use DeepSpeed to train other SD models in this way.
+
+```shell
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+export ACCELERATE_CONFIG_FILE="your accelerate_config.yaml"
+
+accelerate launch  --config_file $ACCELERATE_CONFIG_FILE train_text_to_image_lora_sdxl.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --pretrained_vae_model_name_or_path=$VAE_NAME \
+  --dataset_name=$DATASET_NAME --caption_column="text" \
+  --resolution=1024  \
+  --train_batch_size=1 \
+  --num_train_epochs=2 \
+  --checkpointing_steps=2 \
+  --learning_rate=1e-04 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --mixed_precision="fp16" \
+  --max_train_steps=20 \
+  --validation_epochs=20 \
+  --seed=1234 \
+  --output_dir="sd-naruto-model-lora-sdxl" \
+  --validation_prompt="cute dragon creature"
+```
+
+
+### Finetuning the text encoder and UNet
+
+The script also allows you to finetune the `text_encoder` along with the `unet`.
+
+🚨 Training the text encoder requires additional memory.
+
+Pass the `--train_text_encoder` argument to the training script to enable finetuning the `text_encoder` and `unet`:
+
+```bash
+accelerate launch train_text_to_image_lora_sdxl.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$DATASET_NAME --caption_column="text" \
+  --resolution=1024 --random_flip \
+  --train_batch_size=1 \
+  --num_train_epochs=2 --checkpointing_steps=500 \
+  --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --seed=42 \
+  --output_dir="sd-naruto-model-lora-sdxl-txt" \
+  --train_text_encoder \
+  --validation_prompt="cute dragon creature" --report_to="wandb" \
+  --push_to_hub
+```
+
+### Inference
+
+Once you have trained a model using above command, the inference can be done simply using the `DiffusionPipeline` after loading the trained LoRA weights.  You
+need to pass the `output_dir` for loading the LoRA weights which, in this case, is `sd-naruto-model-lora-sdxl`.
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+model_path = "takuoko/sd-naruto-model-lora-sdxl"
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
+pipe.to("cuda")
+pipe.load_lora_weights(model_path)
+
+prompt = "A naruto with green eyes and red legs."
+image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+image.save("naruto.png")
+```
diff --git a/diffusers/examples/text_to_image/requirements.txt b/diffusers/examples/text_to_image/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c3ffa42f0edc9862345ab82b69ffb5cdfaeac45e
--- /dev/null
+++ b/diffusers/examples/text_to_image/requirements.txt
@@ -0,0 +1,8 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+datasets>=2.19.1
+ftfy
+tensorboard
+Jinja2
+peft==0.7.0
diff --git a/diffusers/examples/text_to_image/requirements_flax.txt b/diffusers/examples/text_to_image/requirements_flax.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b6eb64e254625ee8eff2ef126d67adfd5b6994dc
--- /dev/null
+++ b/diffusers/examples/text_to_image/requirements_flax.txt
@@ -0,0 +1,9 @@
+transformers>=4.25.1
+datasets
+flax
+optax
+torch
+torchvision
+ftfy
+tensorboard
+Jinja2
diff --git a/diffusers/examples/text_to_image/requirements_sdxl.txt b/diffusers/examples/text_to_image/requirements_sdxl.txt
new file mode 100644
index 0000000000000000000000000000000000000000..64cbc9205fd08b960303dfc611214ae6dc48dd7c
--- /dev/null
+++ b/diffusers/examples/text_to_image/requirements_sdxl.txt
@@ -0,0 +1,8 @@
+accelerate>=0.22.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
+datasets
+peft==0.7.0
\ No newline at end of file
diff --git a/diffusers/examples/text_to_image/test_text_to_image.py b/diffusers/examples/text_to_image/test_text_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..6231a89b1d1d9c16ab34281b223d7365791eab67
--- /dev/null
+++ b/diffusers/examples/text_to_image/test_text_to_image.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import shutil
+import sys
+import tempfile
+
+from diffusers import DiffusionPipeline, UNet2DConditionModel  # noqa: E402
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class TextToImage(ExamplesTestsAccelerate):
+    def test_text_to_image(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/text_to_image/train_text_to_image.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
+
+    def test_text_to_image_checkpointing(self):
+        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
+        prompt = "a prompt"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/text_to_image/train_text_to_image.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
+            pipe(prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            # check can run an intermediate checkpoint
+            unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
+            pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
+            pipe(prompt, num_inference_steps=1)
+
+            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
+            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
+
+            # Run training script for 2 total steps resuming from checkpoint 4
+
+            resume_run_args = f"""
+                examples/text_to_image/train_text_to_image.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=1
+                --resume_from_checkpoint=checkpoint-4
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check can run new fully trained pipeline
+            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
+            pipe(prompt, num_inference_steps=1)
+
+            # no checkpoint-2 -> check old checkpoints do not exist
+            # check new checkpoints exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-5"},
+            )
+
+    def test_text_to_image_checkpointing_use_ema(self):
+        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
+        prompt = "a prompt"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/text_to_image/train_text_to_image.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --use_ema
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
+            pipe(prompt, num_inference_steps=2)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            # check can run an intermediate checkpoint
+            unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
+            pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
+            pipe(prompt, num_inference_steps=1)
+
+            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
+            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
+
+            # Run training script for 2 total steps resuming from checkpoint 4
+
+            resume_run_args = f"""
+                examples/text_to_image/train_text_to_image.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=1
+                --resume_from_checkpoint=checkpoint-4
+                --use_ema
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check can run new fully trained pipeline
+            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
+            pipe(prompt, num_inference_steps=1)
+
+            # no checkpoint-2 -> check old checkpoints do not exist
+            # check new checkpoints exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-5"},
+            )
+
+    def test_text_to_image_checkpointing_checkpoints_total_limit(self):
+        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
+        prompt = "a prompt"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
+            # Should create checkpoints at steps 2, 4, 6
+            # with checkpoint at step 2 deleted
+
+            initial_run_args = f"""
+                examples/text_to_image/train_text_to_image.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
+            pipe(prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            # checkpoint-2 should have been deleted
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
+
+    def test_text_to_image_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
+        prompt = "a prompt"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/text_to_image/train_text_to_image.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
+            pipe(prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            # resume and we should try to checkpoint at 6, where we'll have to remove
+            # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
+
+            resume_run_args = f"""
+                examples/text_to_image/train_text_to_image.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 8
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --resume_from_checkpoint=checkpoint-4
+                --checkpoints_total_limit=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
+            pipe(prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-6", "checkpoint-8"},
+            )
+
+
+class TextToImageSDXL(ExamplesTestsAccelerate):
+    def test_text_to_image_sdxl(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/text_to_image/train_text_to_image_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
diff --git a/diffusers/examples/text_to_image/test_text_to_image_lora.py b/diffusers/examples/text_to_image/test_text_to_image_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..4604b9f5210ca3e2040cababbd4b7d03b6a18bbb
--- /dev/null
+++ b/diffusers/examples/text_to_image/test_text_to_image_lora.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+import safetensors
+
+from diffusers import DiffusionPipeline  # noqa: E402
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class TextToImageLoRA(ExamplesTestsAccelerate):
+    def test_text_to_image_lora_sdxl_checkpointing_checkpoints_total_limit(self):
+        prompt = "a prompt"
+        pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
+            # Should create checkpoints at steps 2, 4, 6
+            # with checkpoint at step 2 deleted
+
+            initial_run_args = f"""
+                examples/text_to_image/train_text_to_image_lora_sdxl.py
+                --pretrained_model_name_or_path {pipeline_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            pipe = DiffusionPipeline.from_pretrained(pipeline_path)
+            pipe.load_lora_weights(tmpdir)
+            pipe(prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            # checkpoint-2 should have been deleted
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
+
+    def test_text_to_image_lora_checkpointing_checkpoints_total_limit(self):
+        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
+        prompt = "a prompt"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
+            # Should create checkpoints at steps 2, 4, 6
+            # with checkpoint at step 2 deleted
+
+            initial_run_args = f"""
+                examples/text_to_image/train_text_to_image_lora.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                --seed=0
+                --num_validation_images=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            pipe = DiffusionPipeline.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
+            )
+            pipe.load_lora_weights(tmpdir)
+            pipe(prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            # checkpoint-2 should have been deleted
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
+
+    def test_text_to_image_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
+        prompt = "a prompt"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/text_to_image/train_text_to_image_lora.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --seed=0
+                --num_validation_images=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            pipe = DiffusionPipeline.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
+            )
+            pipe.load_lora_weights(tmpdir)
+            pipe(prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            # resume and we should try to checkpoint at 6, where we'll have to remove
+            # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
+
+            resume_run_args = f"""
+                examples/text_to_image/train_text_to_image_lora.py
+                --pretrained_model_name_or_path {pretrained_model_name_or_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --center_crop
+                --random_flip
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 8
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --resume_from_checkpoint=checkpoint-4
+                --checkpoints_total_limit=2
+                --seed=0
+                --num_validation_images=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            pipe = DiffusionPipeline.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
+            )
+            pipe.load_lora_weights(tmpdir)
+            pipe(prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-6", "checkpoint-8"},
+            )
+
+
+class TextToImageLoRASDXL(ExamplesTestsAccelerate):
+    def test_text_to_image_lora_sdxl(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/text_to_image/train_text_to_image_lora_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+    def test_text_to_image_lora_sdxl_with_text_encoder(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/text_to_image/train_text_to_image_lora_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --train_text_encoder
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"unet"` or `"text_encoder"` or `"text_encoder_2"` in their names.
+            keys = lora_state_dict.keys()
+            starts_with_unet = all(
+                k.startswith("unet") or k.startswith("text_encoder") or k.startswith("text_encoder_2") for k in keys
+            )
+            self.assertTrue(starts_with_unet)
+
+    def test_text_to_image_lora_sdxl_text_encoder_checkpointing_checkpoints_total_limit(self):
+        prompt = "a prompt"
+        pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
+            # Should create checkpoints at steps 2, 4, 6
+            # with checkpoint at step 2 deleted
+
+            initial_run_args = f"""
+                examples/text_to_image/train_text_to_image_lora_sdxl.py
+                --pretrained_model_name_or_path {pipeline_path}
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --train_text_encoder
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            pipe = DiffusionPipeline.from_pretrained(pipeline_path)
+            pipe.load_lora_weights(tmpdir)
+            pipe(prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            # checkpoint-2 should have been deleted
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
diff --git a/diffusers/examples/text_to_image/train_text_to_image.py b/diffusers/examples/text_to_image/train_text_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..684bf352a6c1abf13a622be88d2444e2e6d7436d
--- /dev/null
+++ b/diffusers/examples/text_to_image/train_text_to_image.py
@@ -0,0 +1,1153 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.state import AcceleratorState
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+from transformers.utils import ContextManagers
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, compute_dream_and_update_latents, compute_snr
+from diffusers.utils import check_min_version, deprecate, is_wandb_available, make_image_grid
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def save_model_card(
+    args,
+    repo_id: str,
+    images: list = None,
+    repo_folder: str = None,
+):
+    img_str = ""
+    if len(images) > 0:
+        image_grid = make_image_grid(images, 1, len(args.validation_prompts))
+        image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png"))
+        img_str += "![val_imgs_grid](./val_imgs_grid.png)\n"
+
+    model_description = f"""
+# Text-to-image finetuning - {repo_id}
+
+This pipeline was finetuned from **{args.pretrained_model_name_or_path}** on the **{args.dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompts: {args.validation_prompts}: \n
+{img_str}
+
+## Pipeline usage
+
+You can use the pipeline like so:
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained("{repo_id}", torch_dtype=torch.float16)
+prompt = "{args.validation_prompts[0]}"
+image = pipeline(prompt).images[0]
+image.save("my_image.png")
+```
+
+## Training info
+
+These are the key hyperparameters used during training:
+
+* Epochs: {args.num_train_epochs}
+* Learning rate: {args.learning_rate}
+* Batch size: {args.train_batch_size}
+* Gradient accumulation steps: {args.gradient_accumulation_steps}
+* Image resolution: {args.resolution}
+* Mixed-precision: {args.mixed_precision}
+
+"""
+    wandb_info = ""
+    if is_wandb_available():
+        wandb_run_url = None
+        if wandb.run is not None:
+            wandb_run_url = wandb.run.url
+
+    if wandb_run_url is not None:
+        wandb_info = f"""
+More information on all the CLI arguments and the environment are available on your [`wandb` run page]({wandb_run_url}).
+"""
+
+    model_description += wandb_info
+
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=args.pretrained_model_name_or_path,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = ["stable-diffusion", "stable-diffusion-diffusers", "text-to-image", "diffusers", "diffusers-training"]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight_dtype, epoch):
+    logger.info("Running validation... ")
+
+    pipeline = StableDiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        vae=accelerator.unwrap_model(vae),
+        text_encoder=accelerator.unwrap_model(text_encoder),
+        tokenizer=tokenizer,
+        unet=accelerator.unwrap_model(unet),
+        safety_checker=None,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.enable_xformers_memory_efficient_attention:
+        pipeline.enable_xformers_memory_efficient_attention()
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    images = []
+    for i in range(len(args.validation_prompts)):
+        if torch.backends.mps.is_available():
+            autocast_ctx = nullcontext()
+        else:
+            autocast_ctx = torch.autocast(accelerator.device.type)
+
+        with autocast_ctx:
+            image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0]
+
+        images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompts[i]}")
+                        for i, image in enumerate(images)
+                    ]
+                }
+            )
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--input_perturbation", type=float, default=0, help="The scale of input perturbation. Recommended 0.1."
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompts",
+        type=str,
+        default=None,
+        nargs="+",
+        help=("A set of prompts evaluated every `--validation_epochs` and logged to `--report_to`."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--dream_training",
+        action="store_true",
+        help=(
+            "Use the DREAM training method, which makes training more efficient and accurate at the ",
+            "expense of doing an extra forward pass. See: https://arxiv.org/abs/2312.00210",
+        ),
+    )
+    parser.add_argument(
+        "--dream_detail_preservation",
+        type=float,
+        default=1.0,
+        help="Dream detail preservation factor p (should be greater than 0; default=1.0, as suggested in the paper)",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument("--offload_ema", action="store_true", help="Offload EMA model to CPU during training step.")
+    parser.add_argument("--foreach_ema", action="store_true", help="Use faster foreach implementation of EMAModel.")
+    parser.add_argument(
+        "--non_ema_revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or"
+            " remote repository specified with --pretrained_model_name_or_path."
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+    )
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=5,
+        help="Run validation every X epochs.",
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    # default to using the same revision for the non-ema model if not specified
+    if args.non_ema_revision is None:
+        args.non_ema_revision = args.revision
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    if args.non_ema_revision is not None:
+        deprecate(
+            "non_ema_revision!=None",
+            "0.15.0",
+            message=(
+                "Downloading 'non_ema' weights from revision branches of the Hub is deprecated. Please make sure to"
+                " use `--variant=non_ema` instead."
+            ),
+        )
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load scheduler, tokenizer and models.
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+    )
+
+    def deepspeed_zero_init_disabled_context_manager():
+        """
+        returns either a context list that includes one that will disable zero.Init or an empty context list
+        """
+        deepspeed_plugin = AcceleratorState().deepspeed_plugin if accelerate.state.is_initialized() else None
+        if deepspeed_plugin is None:
+            return []
+
+        return [deepspeed_plugin.zero3_init_context_manager(enable=False)]
+
+    # Currently Accelerate doesn't know how to handle multiple models under Deepspeed ZeRO stage 3.
+    # For this to work properly all models must be run through `accelerate.prepare`. But accelerate
+    # will try to assign the same optimizer with the same weights to all models during
+    # `deepspeed.initialize`, which of course doesn't work.
+    #
+    # For now the following workaround will partially support Deepspeed ZeRO-3, by excluding the 2
+    # frozen models from being partitioned during `zero.Init` which gets called during
+    # `from_pretrained` So CLIPTextModel and AutoencoderKL will not enjoy the parameter sharding
+    # across multiple gpus and only UNet2DConditionModel will get ZeRO sharded.
+    with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+        text_encoder = CLIPTextModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+        )
+        vae = AutoencoderKL.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+        )
+
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.non_ema_revision
+    )
+
+    # Freeze vae and text_encoder and set unet to trainable
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    unet.train()
+
+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = UNet2DConditionModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+        )
+        ema_unet = EMAModel(
+            ema_unet.parameters(),
+            model_cls=UNet2DConditionModel,
+            model_config=ema_unet.config,
+            foreach=args.foreach_ema,
+        )
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(
+                    os.path.join(input_dir, "unet_ema"), UNet2DConditionModel, foreach=args.foreach_ema
+                )
+                ema_unet.load_state_dict(load_model.state_dict())
+                if args.offload_ema:
+                    ema_unet.pin_memory()
+                else:
+                    ema_unet.to(accelerator.device)
+                del load_model
+
+            for _ in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            data_dir=args.train_data_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids
+
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {"pixel_values": pixel_values, "input_ids": input_ids}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
+    if args.max_train_steps is None:
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * num_update_steps_per_epoch * accelerator.num_processes
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.use_ema:
+        if args.offload_ema:
+            ema_unet.pin_memory()
+        else:
+            ema_unet.to(accelerator.device)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+        args.mixed_precision = accelerator.mixed_precision
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+        args.mixed_precision = accelerator.mixed_precision
+
+    # Move text_encode and vae to gpu and cast to weight_dtype
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        if num_training_steps_for_scheduler != args.max_train_steps * accelerator.num_processes:
+            logger.warning(
+                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+                f"This inconsistency may result in the learning rate scheduler not functioning properly."
+            )
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        tracker_config.pop("validation_prompts")
+        accelerator.init_trackers(args.tracker_project_name, tracker_config)
+
+    # Function for unwrapping if model was compiled with `torch.compile`.
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
+                    )
+                if args.input_perturbation:
+                    new_noise = noise + args.input_perturbation * torch.randn_like(noise)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                if args.input_perturbation:
+                    noisy_latents = noise_scheduler.add_noise(latents, new_noise, timesteps)
+                else:
+                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0]
+
+                # Get the target for loss depending on the prediction type
+                if args.prediction_type is not None:
+                    # set prediction_type of scheduler if defined
+                    noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.dream_training:
+                    noisy_latents, target = compute_dream_and_update_latents(
+                        unet,
+                        noise_scheduler,
+                        timesteps,
+                        noise,
+                        noisy_latents,
+                        target,
+                        encoder_hidden_states,
+                        args.dream_detail_preservation,
+                    )
+
+                # Predict the noise residual and compute loss
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
+
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    if args.offload_ema:
+                        ema_unet.to(device="cuda", non_blocking=True)
+                    ema_unet.step(unet.parameters())
+                    if args.offload_ema:
+                        ema_unet.to(device="cpu", non_blocking=True)
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompts is not None and epoch % args.validation_epochs == 0:
+                if args.use_ema:
+                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                    ema_unet.store(unet.parameters())
+                    ema_unet.copy_to(unet.parameters())
+                log_validation(
+                    vae,
+                    text_encoder,
+                    tokenizer,
+                    unet,
+                    args,
+                    accelerator,
+                    weight_dtype,
+                    global_step,
+                )
+                if args.use_ema:
+                    # Switch back to the original UNet parameters.
+                    ema_unet.restore(unet.parameters())
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            revision=args.revision,
+            variant=args.variant,
+        )
+        pipeline.save_pretrained(args.output_dir)
+
+        # Run a final round of inference.
+        images = []
+        if args.validation_prompts is not None:
+            logger.info("Running inference for collecting generated images...")
+            pipeline = pipeline.to(accelerator.device)
+            pipeline.torch_dtype = weight_dtype
+            pipeline.set_progress_bar_config(disable=True)
+
+            if args.enable_xformers_memory_efficient_attention:
+                pipeline.enable_xformers_memory_efficient_attention()
+
+            if args.seed is None:
+                generator = None
+            else:
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+            for i in range(len(args.validation_prompts)):
+                with torch.autocast("cuda"):
+                    image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0]
+                images.append(image)
+
+        if args.push_to_hub:
+            save_model_card(args, repo_id, images, repo_folder=args.output_dir)
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/text_to_image/train_text_to_image_flax.py b/diffusers/examples/text_to_image/train_text_to_image_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a80067d693d64c85ae5befff5852ea7537d9c48
--- /dev/null
+++ b/diffusers/examples/text_to_image/train_text_to_image_flax.py
@@ -0,0 +1,620 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import math
+import os
+import random
+from pathlib import Path
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+import optax
+import torch
+import torch.utils.checkpoint
+import transformers
+from datasets import load_dataset
+from flax import jax_utils
+from flax.training import train_state
+from flax.training.common_utils import shard
+from huggingface_hub import create_repo, upload_folder
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
+
+from diffusers import (
+    FlaxAutoencoderKL,
+    FlaxDDPMScheduler,
+    FlaxPNDMScheduler,
+    FlaxStableDiffusionPipeline,
+    FlaxUNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
+from diffusers.utils import check_min_version
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=0, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--from_pt",
+        action="store_true",
+        default=False,
+        help="Flag to indicate whether to convert models from PyTorch.",
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+dataset_name_mapping = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def get_params_to_save(params):
+    return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
+
+
+def main():
+    args = parse_args()
+
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if jax.process_index() == 0:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = dataset_name_mapping.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(captions, max_length=tokenizer.model_max_length, padding="do_not_pad", truncation=True)
+        input_ids = inputs.input_ids
+        return input_ids
+
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+
+        return examples
+
+    if args.max_train_samples is not None:
+        dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+    train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = [example["input_ids"] for example in examples]
+
+        padded_tokens = tokenizer.pad(
+            {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
+        )
+        batch = {
+            "pixel_values": pixel_values,
+            "input_ids": padded_tokens.input_ids,
+        }
+        batch = {k: v.numpy() for k, v in batch.items()}
+
+        return batch
+
+    total_train_batch_size = args.train_batch_size * jax.local_device_count()
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=total_train_batch_size, drop_last=True
+    )
+
+    weight_dtype = jnp.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = jnp.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = jnp.bfloat16
+
+    # Load models and create wrapper for stable diffusion
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        from_pt=args.from_pt,
+        revision=args.revision,
+        subfolder="tokenizer",
+    )
+    text_encoder = FlaxCLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        from_pt=args.from_pt,
+        revision=args.revision,
+        subfolder="text_encoder",
+        dtype=weight_dtype,
+    )
+    vae, vae_params = FlaxAutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        from_pt=args.from_pt,
+        revision=args.revision,
+        subfolder="vae",
+        dtype=weight_dtype,
+    )
+    unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        from_pt=args.from_pt,
+        revision=args.revision,
+        subfolder="unet",
+        dtype=weight_dtype,
+    )
+
+    # Optimization
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * total_train_batch_size
+
+    constant_scheduler = optax.constant_schedule(args.learning_rate)
+
+    adamw = optax.adamw(
+        learning_rate=constant_scheduler,
+        b1=args.adam_beta1,
+        b2=args.adam_beta2,
+        eps=args.adam_epsilon,
+        weight_decay=args.adam_weight_decay,
+    )
+
+    optimizer = optax.chain(
+        optax.clip_by_global_norm(args.max_grad_norm),
+        adamw,
+    )
+
+    state = train_state.TrainState.create(apply_fn=unet.__call__, params=unet_params, tx=optimizer)
+
+    noise_scheduler = FlaxDDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
+    )
+    noise_scheduler_state = noise_scheduler.create_state()
+
+    # Initialize our training
+    rng = jax.random.PRNGKey(args.seed)
+    train_rngs = jax.random.split(rng, jax.local_device_count())
+
+    def train_step(state, text_encoder_params, vae_params, batch, train_rng):
+        dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3)
+
+        def compute_loss(params):
+            # Convert images to latent space
+            vae_outputs = vae.apply(
+                {"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode
+            )
+            latents = vae_outputs.latent_dist.sample(sample_rng)
+            # (NHWC) -> (NCHW)
+            latents = jnp.transpose(latents, (0, 3, 1, 2))
+            latents = latents * vae.config.scaling_factor
+
+            # Sample noise that we'll add to the latents
+            noise_rng, timestep_rng = jax.random.split(sample_rng)
+            noise = jax.random.normal(noise_rng, latents.shape)
+            # Sample a random timestep for each image
+            bsz = latents.shape[0]
+            timesteps = jax.random.randint(
+                timestep_rng,
+                (bsz,),
+                0,
+                noise_scheduler.config.num_train_timesteps,
+            )
+
+            # Add noise to the latents according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)
+
+            # Get the text embedding for conditioning
+            encoder_hidden_states = text_encoder(
+                batch["input_ids"],
+                params=text_encoder_params,
+                train=False,
+            )[0]
+
+            # Predict the noise residual and compute loss
+            model_pred = unet.apply(
+                {"params": params}, noisy_latents, timesteps, encoder_hidden_states, train=True
+            ).sample
+
+            # Get the target for loss depending on the prediction type
+            if noise_scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif noise_scheduler.config.prediction_type == "v_prediction":
+                target = noise_scheduler.get_velocity(noise_scheduler_state, latents, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+            loss = (target - model_pred) ** 2
+            loss = loss.mean()
+
+            return loss
+
+        grad_fn = jax.value_and_grad(compute_loss)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+
+        new_state = state.apply_gradients(grads=grad)
+
+        metrics = {"loss": loss}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+
+        return new_state, metrics, new_train_rng
+
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    text_encoder_params = jax_utils.replicate(text_encoder.params)
+    vae_params = jax_utils.replicate(vae_params)
+
+    # Train!
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+
+    # Scheduler and math around the number of training steps.
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    global_step = 0
+
+    epochs = tqdm(range(args.num_train_epochs), desc="Epoch ... ", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+
+        train_metrics = []
+
+        steps_per_epoch = len(train_dataset) // total_train_batch_size
+        train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
+        # train
+        for batch in train_dataloader:
+            batch = shard(batch)
+            state, train_metric, train_rngs = p_train_step(state, text_encoder_params, vae_params, batch, train_rngs)
+            train_metrics.append(train_metric)
+
+            train_step_progress_bar.update(1)
+
+            global_step += 1
+            if global_step >= args.max_train_steps:
+                break
+
+        train_metric = jax_utils.unreplicate(train_metric)
+
+        train_step_progress_bar.close()
+        epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
+
+    # Create the pipeline using using the trained modules and save it.
+    if jax.process_index() == 0:
+        scheduler = FlaxPNDMScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+        )
+        safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained(
+            "CompVis/stable-diffusion-safety-checker", from_pt=True
+        )
+        pipeline = FlaxStableDiffusionPipeline(
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32"),
+        )
+
+        pipeline.save_pretrained(
+            args.output_dir,
+            params={
+                "text_encoder": get_params_to_save(text_encoder_params),
+                "vae": get_params_to_save(vae_params),
+                "unet": get_params_to_save(state.params),
+                "safety_checker": safety_checker.params,
+            },
+        )
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/text_to_image/train_text_to_image_lora.py b/diffusers/examples/text_to_image/train_text_to_image_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..379519b4c8122e4892f24f83c667ec8385b73235
--- /dev/null
+++ b/diffusers/examples/text_to_image/train_text_to_image_lora.py
@@ -0,0 +1,979 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning script for Stable Diffusion for text2image with support for LoRA."""
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import cast_training_params, compute_snr
+from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def save_model_card(
+    repo_id: str,
+    images: list = None,
+    base_model: str = None,
+    dataset_name: str = None,
+    repo_folder: str = None,
+):
+    img_str = ""
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# LoRA text2image fine-tuning - {repo_id}
+These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
+{img_str}
+"""
+
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion",
+        "stable-diffusion-diffusers",
+        "text-to-image",
+        "diffusers",
+        "diffusers-training",
+        "lora",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    epoch,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+    generator = torch.Generator(device=accelerator.device)
+    if args.seed is not None:
+        generator = generator.manual_seed(args.seed)
+    images = []
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    with autocast_ctx:
+        for _ in range(args.num_validation_images):
+            images.append(pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0])
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+    return images
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+    )
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def main():
+    args = parse_args()
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+    # Load scheduler, tokenizer and models.
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+    )
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+    # freeze parameters of models to save more memory
+    unet.requires_grad_(False)
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Freeze the unet parameters before adding adapters
+    for param in unet.parameters():
+        param.requires_grad_(False)
+
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # Add adapter and make sure the trainable params are in float32.
+    unet.add_adapter(unet_lora_config)
+    if args.mixed_precision == "fp16":
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(unet, dtype=torch.float32)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    lora_layers = filter(lambda p: p.requires_grad, unet.parameters())
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        lora_layers,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            data_dir=args.train_data_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids
+
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {"pixel_values": pixel_values, "input_ids": input_ids}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
+    if args.max_train_steps is None:
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * num_update_steps_per_epoch * accelerator.num_processes
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        if num_training_steps_for_scheduler != args.max_train_steps * accelerator.num_processes:
+            logger.warning(
+                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+                f"This inconsistency may result in the learning rate scheduler not functioning properly."
+            )
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
+                    )
+
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0]
+
+                # Get the target for loss depending on the prediction type
+                if args.prediction_type is not None:
+                    # set prediction_type of scheduler if defined
+                    noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # Predict the noise residual and compute loss
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
+
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = lora_layers
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+
+                        unwrapped_unet = unwrap_model(unet)
+                        unet_lora_state_dict = convert_state_dict_to_diffusers(
+                            get_peft_model_state_dict(unwrapped_unet)
+                        )
+
+                        StableDiffusionPipeline.save_lora_weights(
+                            save_directory=save_path,
+                            unet_lora_layers=unet_lora_state_dict,
+                            safe_serialization=True,
+                        )
+
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                pipeline = DiffusionPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    unet=unwrap_model(unet),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                images = log_validation(pipeline, args, accelerator, epoch)
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unet.to(torch.float32)
+
+        unwrapped_unet = unwrap_model(unet)
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unwrapped_unet))
+        StableDiffusionPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_state_dict,
+            safe_serialization=True,
+        )
+
+        # Final inference
+        # Load previous pipeline
+        if args.validation_prompt is not None:
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                revision=args.revision,
+                variant=args.variant,
+                torch_dtype=weight_dtype,
+            )
+
+            # load attention processors
+            pipeline.load_lora_weights(args.output_dir)
+
+            # run inference
+            images = log_validation(pipeline, args, accelerator, epoch, is_final_validation=True)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                dataset_name=args.dataset_name,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/text_to_image/train_text_to_image_lora_sdxl.py b/diffusers/examples/text_to_image/train_text_to_image_lora_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe098c8638d52ff1ffd4503136f9aee0eaa2ace2
--- /dev/null
+++ b/diffusers/examples/text_to_image/train_text_to_image_lora_sdxl.py
@@ -0,0 +1,1327 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning script for Stable Diffusion XL for text2image with support for LoRA."""
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, DistributedType, ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr
+from diffusers.utils import (
+    check_min_version,
+    convert_state_dict_to_diffusers,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+if is_torch_npu_available():
+    torch.npu.config.allow_internal_format = False
+
+
+def save_model_card(
+    repo_id: str,
+    images: list = None,
+    base_model: str = None,
+    dataset_name: str = None,
+    train_text_encoder: bool = False,
+    repo_folder: str = None,
+    vae_path: str = None,
+):
+    img_str = ""
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# LoRA text2image fine-tuning - {repo_id}
+
+These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
+{img_str}
+
+LoRA for the text encoder was enabled: {train_text_encoder}.
+
+Special VAE used for training: {vae_path}.
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion-xl",
+        "stable-diffusion-xl-diffusers",
+        "text-to-image",
+        "diffusers",
+        "diffusers-training",
+        "lora",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    epoch,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+    pipeline_args = {"prompt": args.validation_prompt}
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    with autocast_ctx:
+        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+    return images
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+    )
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--debug_loss",
+        action="store_true",
+        help="debug loss for each image, if filenames are available in the dataset",
+    )
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def tokenize_prompt(tokenizer, prompt):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(text_encoders, tokenizers, prompt, text_input_ids_list=None):
+    prompt_embeds_list = []
+
+    for i, text_encoder in enumerate(text_encoders):
+        if tokenizers is not None:
+            tokenizer = tokenizers[i]
+            text_input_ids = tokenize_prompt(tokenizer, prompt)
+        else:
+            assert text_input_ids_list is not None
+            text_input_ids = text_input_ids_list[i]
+
+        prompt_embeds = text_encoder(
+            text_input_ids.to(text_encoder.device), output_hidden_states=True, return_dict=False
+        )
+
+        # We are only ALWAYS interested in the pooled output of the final text encoder
+        pooled_prompt_embeds = prompt_embeds[0]
+        prompt_embeds = prompt_embeds[-1][-2]
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+        prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # We only train the additional adapter LoRA layers
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    unet.to(accelerator.device, dtype=weight_dtype)
+
+    if args.pretrained_vae_model_name_or_path is None:
+        vae.to(accelerator.device, dtype=torch.float32)
+    else:
+        vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_npu_flash_attention:
+        if is_torch_npu_available():
+            logger.info("npu flash attention enabled.")
+            unet.enable_npu_flash_attention()
+        else:
+            raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # now we will add new LoRA weights to the attention layers
+    # Set correct lora layers
+    unet_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+    )
+
+    unet.add_adapter(unet_lora_config)
+
+    # The text encoder comes from 🤗 transformers, we will also attach adapters to it.
+    if args.train_text_encoder:
+        # ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder_one.add_adapter(text_lora_config)
+        text_encoder_two.add_adapter(text_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            # there are only two options here. Either are just the unet attn processor layers
+            # or there are the unet and text encoder attn layers
+            unet_lora_layers_to_save = None
+            text_encoder_one_lora_layers_to_save = None
+            text_encoder_two_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(unwrap_model(model), type(unwrap_model(unet))):
+                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
+                elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_one))):
+                    text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
+                elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_two))):
+                    text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                if weights:
+                    weights.pop()
+
+            StableDiffusionXLPipeline.save_lora_weights(
+                output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                text_encoder_lora_layers=text_encoder_one_lora_layers_to_save,
+                text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+        text_encoder_one_ = None
+        text_encoder_two_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(unet))):
+                unet_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                text_encoder_one_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_two))):
+                text_encoder_two_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, _ = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
+        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        if args.train_text_encoder:
+            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_)
+
+            _set_state_dict_into_text_encoder(
+                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_
+            )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [unet_]
+            if args.train_text_encoder:
+                models.extend([text_encoder_one_, text_encoder_two_])
+            cast_training_params(models, dtype=torch.float32)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+            text_encoder_two.gradient_checkpointing_enable()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [unet]
+        if args.train_text_encoder:
+            models.extend([text_encoder_one, text_encoder_two])
+        cast_training_params(models, dtype=torch.float32)
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = list(filter(lambda p: p.requires_grad, unet.parameters()))
+    if args.train_text_encoder:
+        params_to_optimize = (
+            params_to_optimize
+            + list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
+            + list(filter(lambda p: p.requires_grad, text_encoder_two.parameters()))
+        )
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        tokens_one = tokenize_prompt(tokenizer_one, captions)
+        tokens_two = tokenize_prompt(tokenizer_two, captions)
+        return tokens_one, tokens_two
+
+    # Preprocessing the datasets.
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
+    train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
+    train_flip = transforms.RandomHorizontalFlip(p=1.0)
+    train_transforms = transforms.Compose(
+        [
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        # image aug
+        original_sizes = []
+        all_images = []
+        crop_top_lefts = []
+        for image in images:
+            original_sizes.append((image.height, image.width))
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            crop_top_left = (y1, x1)
+            crop_top_lefts.append(crop_top_left)
+            image = train_transforms(image)
+            all_images.append(image)
+
+        examples["original_sizes"] = original_sizes
+        examples["crop_top_lefts"] = crop_top_lefts
+        examples["pixel_values"] = all_images
+        tokens_one, tokens_two = tokenize_captions(examples)
+        examples["input_ids_one"] = tokens_one
+        examples["input_ids_two"] = tokens_two
+        if args.debug_loss:
+            fnames = [os.path.basename(image.filename) for image in examples[image_column] if image.filename]
+            if fnames:
+                examples["filenames"] = fnames
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train, output_all_columns=True)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        original_sizes = [example["original_sizes"] for example in examples]
+        crop_top_lefts = [example["crop_top_lefts"] for example in examples]
+        input_ids_one = torch.stack([example["input_ids_one"] for example in examples])
+        input_ids_two = torch.stack([example["input_ids_two"] for example in examples])
+        result = {
+            "pixel_values": pixel_values,
+            "input_ids_one": input_ids_one,
+            "input_ids_two": input_ids_two,
+            "original_sizes": original_sizes,
+            "crop_top_lefts": crop_top_lefts,
+        }
+
+        filenames = [example["filenames"] for example in examples if "filenames" in example]
+        if filenames:
+            result["filenames"] = filenames
+        return result
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.train_text_encoder:
+        unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.train_text_encoder:
+            text_encoder_one.train()
+            text_encoder_two.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                if args.pretrained_vae_model_name_or_path is not None:
+                    pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+                else:
+                    pixel_values = batch["pixel_values"]
+
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+                model_input = model_input * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    model_input = model_input.to(weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (model_input.shape[0], model_input.shape[1], 1, 1), device=model_input.device
+                    )
+
+                bsz = model_input.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                )
+                timesteps = timesteps.long()
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+
+                # time ids
+                def compute_time_ids(original_size, crops_coords_top_left):
+                    # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+                    target_size = (args.resolution, args.resolution)
+                    add_time_ids = list(original_size + crops_coords_top_left + target_size)
+                    add_time_ids = torch.tensor([add_time_ids])
+                    add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+                    return add_time_ids
+
+                add_time_ids = torch.cat(
+                    [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])]
+                )
+
+                # Predict the noise residual
+                unet_added_conditions = {"time_ids": add_time_ids}
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                    text_encoders=[text_encoder_one, text_encoder_two],
+                    tokenizers=None,
+                    prompt=None,
+                    text_input_ids_list=[batch["input_ids_one"], batch["input_ids_two"]],
+                )
+                unet_added_conditions.update({"text_embeds": pooled_prompt_embeds})
+                model_pred = unet(
+                    noisy_model_input,
+                    timesteps,
+                    prompt_embeds,
+                    added_cond_kwargs=unet_added_conditions,
+                    return_dict=False,
+                )[0]
+
+                # Get the target for loss depending on the prediction type
+                if args.prediction_type is not None:
+                    # set prediction_type of scheduler if defined
+                    noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+                if args.debug_loss and "filenames" in batch:
+                    for fname in batch["filenames"]:
+                        accelerator.log({"loss_for_" + fname: loss}, step=global_step)
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+                if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                pipeline = StableDiffusionXLPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    text_encoder=unwrap_model(text_encoder_one),
+                    text_encoder_2=unwrap_model(text_encoder_two),
+                    unet=unwrap_model(unet),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+
+                images = log_validation(pipeline, args, accelerator, epoch)
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
+
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            text_encoder_two = unwrap_model(text_encoder_two)
+
+            text_encoder_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(text_encoder_one))
+            text_encoder_2_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(text_encoder_two))
+        else:
+            text_encoder_lora_layers = None
+            text_encoder_2_lora_layers = None
+
+        StableDiffusionXLPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=unet_lora_state_dict,
+            text_encoder_lora_layers=text_encoder_lora_layers,
+            text_encoder_2_lora_layers=text_encoder_2_lora_layers,
+        )
+
+        del unet
+        del text_encoder_one
+        del text_encoder_two
+        del text_encoder_lora_layers
+        del text_encoder_2_lora_layers
+        torch.cuda.empty_cache()
+
+        # Final inference
+        # Make sure vae.dtype is consistent with the unet.dtype
+        if args.mixed_precision == "fp16":
+            vae.to(weight_dtype)
+        # Load previous pipeline
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+
+        # load attention processors
+        pipeline.load_lora_weights(args.output_dir)
+
+        # run inference
+        if args.validation_prompt and args.num_validation_images > 0:
+            images = log_validation(pipeline, args, accelerator, epoch, is_final_validation=True)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                dataset_name=args.dataset_name,
+                train_text_encoder=args.train_text_encoder,
+                repo_folder=args.output_dir,
+                vae_path=args.pretrained_vae_model_name_or_path,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/text_to_image/train_text_to_image_sdxl.py b/diffusers/examples/text_to_image/train_text_to_image_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ca511c857ae62ae62bbec66b801ca3c0960ba08
--- /dev/null
+++ b/diffusers/examples/text_to_image/train_text_to_image_sdxl.py
@@ -0,0 +1,1345 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning script for Stable Diffusion XL for text2image."""
+
+import argparse
+import functools
+import gc
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedType, ProjectConfiguration, set_seed
+from datasets import concatenate_datasets, load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionXLPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+if is_torch_npu_available():
+    torch.npu.config.allow_internal_format = False
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def save_model_card(
+    repo_id: str,
+    images: list = None,
+    validation_prompt: str = None,
+    base_model: str = None,
+    dataset_name: str = None,
+    repo_folder: str = None,
+    vae_path: str = None,
+):
+    img_str = ""
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# Text-to-image finetuning - {repo_id}
+
+This pipeline was finetuned from **{base_model}** on the **{dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompt: {validation_prompt}: \n
+{img_str}
+
+Special VAE used for training: {vae_path}.
+"""
+
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion-xl",
+        "stable-diffusion-xl-diffusers",
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+
+        return CLIPTextModelWithProjection
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_vae_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help=(
+            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--proportion_empty_prompts",
+        type=float,
+        default=0,
+        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sdxl-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--timestep_bias_strategy",
+        type=str,
+        default="none",
+        choices=["earlier", "later", "range", "none"],
+        help=(
+            "The timestep bias strategy, which may help direct the model toward learning low or high frequency details."
+            " Choices: ['earlier', 'later', 'range', 'none']."
+            " The default is 'none', which means no bias is applied, and training proceeds normally."
+            " The value of 'later' will increase the frequency of the model's final training timesteps."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_multiplier",
+        type=float,
+        default=1.0,
+        help=(
+            "The multiplier for the bias. Defaults to 1.0, which means no bias is applied."
+            " A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_begin",
+        type=int,
+        default=0,
+        help=(
+            "When using `--timestep_bias_strategy=range`, the beginning (inclusive) timestep to bias."
+            " Defaults to zero, which equates to having no specific bias."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_end",
+        type=int,
+        default=1000,
+        help=(
+            "When using `--timestep_bias_strategy=range`, the final timestep (inclusive) to bias."
+            " Defaults to 1000, which is the number of timesteps that Stable Diffusion is trained on."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_portion",
+        type=float,
+        default=0.25,
+        help=(
+            "The portion of timesteps to bias. Defaults to 0.25, which 25% of timesteps will be biased."
+            " A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines"
+            " whether the biased portions are in the earlier or later timesteps."
+        ),
+    )
+    parser.add_argument(
+        "--snr_gamma",
+        type=float,
+        default=None,
+        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+        "More details here: https://arxiv.org/abs/2303.09556.",
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+    )
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
+        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
+
+    return args
+
+
+# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
+def encode_prompt(batch, text_encoders, tokenizers, proportion_empty_prompts, caption_column, is_train=True):
+    prompt_embeds_list = []
+    prompt_batch = batch[caption_column]
+
+    captions = []
+    for caption in prompt_batch:
+        if random.random() < proportion_empty_prompts:
+            captions.append("")
+        elif isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                captions,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(
+                text_input_ids.to(text_encoder.device),
+                output_hidden_states=True,
+                return_dict=False,
+            )
+
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds[-1][-2]
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+            prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return {"prompt_embeds": prompt_embeds.cpu(), "pooled_prompt_embeds": pooled_prompt_embeds.cpu()}
+
+
+def compute_vae_encodings(batch, vae):
+    images = batch.pop("pixel_values")
+    pixel_values = torch.stack(list(images))
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+    pixel_values = pixel_values.to(vae.device, dtype=vae.dtype)
+
+    with torch.no_grad():
+        model_input = vae.encode(pixel_values).latent_dist.sample()
+    model_input = model_input * vae.config.scaling_factor
+    return {"model_input": model_input.cpu()}
+
+
+def generate_timestep_weights(args, num_timesteps):
+    weights = torch.ones(num_timesteps)
+
+    # Determine the indices to bias
+    num_to_bias = int(args.timestep_bias_portion * num_timesteps)
+
+    if args.timestep_bias_strategy == "later":
+        bias_indices = slice(-num_to_bias, None)
+    elif args.timestep_bias_strategy == "earlier":
+        bias_indices = slice(0, num_to_bias)
+    elif args.timestep_bias_strategy == "range":
+        # Out of the possible 1000 timesteps, we might want to focus on eg. 200-500.
+        range_begin = args.timestep_bias_begin
+        range_end = args.timestep_bias_end
+        if range_begin < 0:
+            raise ValueError(
+                "When using the range strategy for timestep bias, you must provide a beginning timestep greater or equal to zero."
+            )
+        if range_end > num_timesteps:
+            raise ValueError(
+                "When using the range strategy for timestep bias, you must provide an ending timestep smaller than the number of timesteps."
+            )
+        bias_indices = slice(range_begin, range_end)
+    else:  # 'none' or any other string
+        return weights
+    if args.timestep_bias_multiplier <= 0:
+        return ValueError(
+            "The parameter --timestep_bias_multiplier is not intended to be used to disable the training of specific timesteps."
+            " If it was intended to disable timestep bias, use `--timestep_bias_strategy none` instead."
+            " A timestep bias multiplier less than or equal to 0 is not allowed."
+        )
+
+    # Apply the bias
+    weights[bias_indices] *= args.timestep_bias_multiplier
+
+    # Normalize
+    weights /= weights.sum()
+
+    return weights
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer_one = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
+    )
+    tokenizer_two = AutoTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
+    )
+
+    # import correct text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    # Check for terminal SNR in combination with SNR Gamma
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    vae_path = (
+        args.pretrained_model_name_or_path
+        if args.pretrained_vae_model_name_or_path is None
+        else args.pretrained_vae_model_name_or_path
+    )
+    vae = AutoencoderKL.from_pretrained(
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # Freeze vae and text encoders.
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    # Set unet as trainable.
+    unet.train()
+
+    # For mixed precision training we cast all non-trainable weights to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    # The VAE is in float32 to avoid NaN losses.
+    vae.to(accelerator.device, dtype=torch.float32)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+
+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = UNet2DConditionModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+        )
+        ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config)
+    if args.enable_npu_flash_attention:
+        if is_torch_npu_available():
+            logger.info("npu flash attention enabled.")
+            unet.enable_npu_flash_attention()
+        else:
+            raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    if weights:
+                        weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel)
+                ema_unet.load_state_dict(load_model.state_dict())
+                ema_unet.to(accelerator.device)
+                del load_model
+
+            for _ in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Optimizer creation
+    params_to_optimize = unet.parameters()
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
+    train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
+    train_flip = transforms.RandomHorizontalFlip(p=1.0)
+    train_transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        # image aug
+        original_sizes = []
+        all_images = []
+        crop_top_lefts = []
+        for image in images:
+            original_sizes.append((image.height, image.width))
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                # flip
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            crop_top_left = (y1, x1)
+            crop_top_lefts.append(crop_top_left)
+            image = train_transforms(image)
+            all_images.append(image)
+
+        examples["original_sizes"] = original_sizes
+        examples["crop_top_lefts"] = crop_top_lefts
+        examples["pixel_values"] = all_images
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    # Let's first compute all the embeddings so that we can free up the text encoders
+    # from memory. We will pre-compute the VAE encodings too.
+    text_encoders = [text_encoder_one, text_encoder_two]
+    tokenizers = [tokenizer_one, tokenizer_two]
+    compute_embeddings_fn = functools.partial(
+        encode_prompt,
+        text_encoders=text_encoders,
+        tokenizers=tokenizers,
+        proportion_empty_prompts=args.proportion_empty_prompts,
+        caption_column=args.caption_column,
+    )
+    compute_vae_encodings_fn = functools.partial(compute_vae_encodings, vae=vae)
+    with accelerator.main_process_first():
+        from datasets.fingerprint import Hasher
+
+        # fingerprint used by the cache for the other processes to load the result
+        # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401
+        new_fingerprint = Hasher.hash(args)
+        new_fingerprint_for_vae = Hasher.hash(vae_path)
+        train_dataset_with_embeddings = train_dataset.map(
+            compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint
+        )
+        train_dataset_with_vae = train_dataset.map(
+            compute_vae_encodings_fn,
+            batched=True,
+            batch_size=args.train_batch_size,
+            new_fingerprint=new_fingerprint_for_vae,
+        )
+        precomputed_dataset = concatenate_datasets(
+            [train_dataset_with_embeddings, train_dataset_with_vae.remove_columns(["image", "text"])], axis=1
+        )
+        precomputed_dataset = precomputed_dataset.with_transform(preprocess_train)
+
+    del compute_vae_encodings_fn, compute_embeddings_fn, text_encoder_one, text_encoder_two
+    del text_encoders, tokenizers, vae
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    def collate_fn(examples):
+        model_input = torch.stack([torch.tensor(example["model_input"]) for example in examples])
+        original_sizes = [example["original_sizes"] for example in examples]
+        crop_top_lefts = [example["crop_top_lefts"] for example in examples]
+        prompt_embeds = torch.stack([torch.tensor(example["prompt_embeds"]) for example in examples])
+        pooled_prompt_embeds = torch.stack([torch.tensor(example["pooled_prompt_embeds"]) for example in examples])
+
+        return {
+            "model_input": model_input,
+            "prompt_embeds": prompt_embeds,
+            "pooled_prompt_embeds": pooled_prompt_embeds,
+            "original_sizes": original_sizes,
+            "crop_top_lefts": crop_top_lefts,
+        }
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        precomputed_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    # Prepare everything with our `accelerator`.
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.use_ema:
+        ema_unet.to(accelerator.device)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("text2image-fine-tune-sdxl", config=vars(args))
+
+    # Function for unwrapping if torch.compile() was used in accelerate.
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(precomputed_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Sample noise that we'll add to the latents
+                model_input = batch["model_input"].to(accelerator.device)
+                noise = torch.randn_like(model_input)
+                if args.noise_offset:
+                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+                    noise += args.noise_offset * torch.randn(
+                        (model_input.shape[0], model_input.shape[1], 1, 1), device=model_input.device
+                    )
+
+                bsz = model_input.shape[0]
+                if args.timestep_bias_strategy == "none":
+                    # Sample a random timestep for each image without bias.
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                    )
+                else:
+                    # Sample a random timestep for each image, potentially biased by the timestep weights.
+                    # Biasing the timestep weights allows us to spend less time training irrelevant timesteps.
+                    weights = generate_timestep_weights(args, noise_scheduler.config.num_train_timesteps).to(
+                        model_input.device
+                    )
+                    timesteps = torch.multinomial(weights, bsz, replacement=True).long()
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps).to(dtype=weight_dtype)
+
+                # time ids
+                def compute_time_ids(original_size, crops_coords_top_left):
+                    # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+                    target_size = (args.resolution, args.resolution)
+                    add_time_ids = list(original_size + crops_coords_top_left + target_size)
+                    add_time_ids = torch.tensor([add_time_ids])
+                    add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
+                    return add_time_ids
+
+                add_time_ids = torch.cat(
+                    [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])]
+                )
+
+                # Predict the noise residual
+                unet_added_conditions = {"time_ids": add_time_ids}
+                prompt_embeds = batch["prompt_embeds"].to(accelerator.device, dtype=weight_dtype)
+                pooled_prompt_embeds = batch["pooled_prompt_embeds"].to(accelerator.device)
+                unet_added_conditions.update({"text_embeds": pooled_prompt_embeds})
+                model_pred = unet(
+                    noisy_model_input,
+                    timesteps,
+                    prompt_embeds,
+                    added_cond_kwargs=unet_added_conditions,
+                    return_dict=False,
+                )[0]
+
+                # Get the target for loss depending on the prediction type
+                if args.prediction_type is not None:
+                    # set prediction_type of scheduler if defined
+                    noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                elif noise_scheduler.config.prediction_type == "sample":
+                    # We set the target to latents here, but the model_pred will return the noise sample prediction.
+                    target = model_input
+                    # We will have to subtract the noise residual from the prediction to get the target sample.
+                    model_pred = model_pred - noise
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.snr_gamma is None:
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                else:
+                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+                    # This is discussed in Section 4.2 of the same paper.
+                    snr = compute_snr(noise_scheduler, timesteps)
+                    mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+                        dim=1
+                    )[0]
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        mse_loss_weights = mse_loss_weights / snr
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        mse_loss_weights = mse_loss_weights / (snr + 1)
+
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+                    loss = loss.mean()
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = unet.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_unet.step(unet.parameters())
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+                if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                if args.use_ema:
+                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                    ema_unet.store(unet.parameters())
+                    ema_unet.copy_to(unet.parameters())
+
+                # create pipeline
+                vae = AutoencoderKL.from_pretrained(
+                    vae_path,
+                    subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+                    revision=args.revision,
+                    variant=args.variant,
+                )
+                pipeline = StableDiffusionXLPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    unet=accelerator.unwrap_model(unet),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                if args.prediction_type is not None:
+                    scheduler_args = {"prediction_type": args.prediction_type}
+                    pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)
+
+                # run inference
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+                pipeline_args = {"prompt": args.validation_prompt}
+
+                with autocast_ctx:
+                    images = [
+                        pipeline(**pipeline_args, generator=generator, num_inference_steps=25).images[0]
+                        for _ in range(args.num_validation_images)
+                    ]
+
+                for tracker in accelerator.trackers:
+                    if tracker.name == "tensorboard":
+                        np_images = np.stack([np.asarray(img) for img in images])
+                        tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                    if tracker.name == "wandb":
+                        tracker.log(
+                            {
+                                "validation": [
+                                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                    for i, image in enumerate(images)
+                                ]
+                            }
+                        )
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+                if args.use_ema:
+                    # Switch back to the original UNet parameters.
+                    ema_unet.restore(unet.parameters())
+
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+        # Serialize pipeline.
+        vae = AutoencoderKL.from_pretrained(
+            vae_path,
+            subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            unet=unet,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        if args.prediction_type is not None:
+            scheduler_args = {"prediction_type": args.prediction_type}
+            pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+        pipeline.save_pretrained(args.output_dir)
+
+        # run inference
+        images = []
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline = pipeline.to(accelerator.device)
+            generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+
+            with autocast_ctx:
+                images = [
+                    pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+                    for _ in range(args.num_validation_images)
+                ]
+
+            for tracker in accelerator.trackers:
+                if tracker.name == "tensorboard":
+                    np_images = np.stack([np.asarray(img) for img in images])
+                    tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+                if tracker.name == "wandb":
+                    tracker.log(
+                        {
+                            "test": [
+                                wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                for i, image in enumerate(images)
+                            ]
+                        }
+                    )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id=repo_id,
+                images=images,
+                validation_prompt=args.validation_prompt,
+                base_model=args.pretrained_model_name_or_path,
+                dataset_name=args.dataset_name,
+                repo_folder=args.output_dir,
+                vae_path=args.pretrained_vae_model_name_or_path,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/textual_inversion/README.md b/diffusers/examples/textual_inversion/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e869bb38d252e6bd39640485c3e6ef494d7e5caa
--- /dev/null
+++ b/diffusers/examples/textual_inversion/README.md
@@ -0,0 +1,153 @@
+## Textual Inversion fine-tuning example
+
+[Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images using just 3-5 examples.
+The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+## Running on Colab
+
+Colab for training
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+
+Colab for inference
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_conceptualizer_inference.ipynb)
+
+## Running locally with PyTorch
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd in the example folder and run:
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+### Cat toy example
+
+First, let's login so that we can upload the checkpoint to the Hub during training:
+
+```bash
+huggingface-cli login
+```
+
+Now let's get our dataset. For this example we will use some cat images: https://huggingface.co/datasets/diffusers/cat_toy_example .
+
+Let's first download it locally:
+
+```py
+from huggingface_hub import snapshot_download
+
+local_dir = "./cat"
+snapshot_download("diffusers/cat_toy_example", local_dir=local_dir, repo_type="dataset", ignore_patterns=".gitattributes")
+```
+
+This will be our training data.
+Now we can launch the training using:
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+**___Note: Please follow the [README_sdxl.md](./README_sdxl.md) if you are using the [stable-diffusion-xl](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0).___**
+
+```bash
+export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export DATA_DIR="./cat"
+
+accelerate launch textual_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<cat-toy>" \
+  --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 \
+  --scale_lr \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --push_to_hub \
+  --output_dir="textual_inversion_cat"
+```
+
+A full training run takes ~1 hour on one V100 GPU.
+
+**Note**: As described in [the official paper](https://arxiv.org/abs/2208.01618)
+only one embedding vector is used for the placeholder token, *e.g.* `"<cat-toy>"`.
+However, one can also add multiple embedding vectors for the placeholder token
+to increase the number of fine-tuneable parameters. This can help the model to learn
+more complex details. To use multiple embedding vectors, you should define `--num_vectors`
+to a number larger than one, *e.g.*:
+```bash
+--num_vectors 5
+```
+
+The saved textual inversion vectors will then be larger in size compared to the default case.
+
+### Inference
+
+Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `placeholder_token` in your prompt.
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_id = "path-to-your-trained-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id,torch_dtype=torch.float16).to("cuda")
+
+repo_id_embeds = "path-to-your-learned-embeds"
+pipe.load_textual_inversion(repo_id_embeds)
+
+prompt = "A <cat-toy> backpack"
+
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("cat-backpack.png")
+```
+
+
+## Training with Flax/JAX
+
+For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install -U -r requirements_flax.txt
+```
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export DATA_DIR="path-to-dir-containing-images"
+
+python textual_inversion_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<cat-toy>" \
+  --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 \
+  --scale_lr \
+  --output_dir="textual_inversion_cat"
+```
+It should be at least 70% faster than the PyTorch script with the same configuration.
+
+### Training with xformers:
+You can enable memory efficient attention by [installing xFormers](https://github.com/facebookresearch/xformers#installing-xformers) and padding the `--enable_xformers_memory_efficient_attention` argument to the script. This is not available with the Flax/JAX implementation.
diff --git a/diffusers/examples/textual_inversion/README_sdxl.md b/diffusers/examples/textual_inversion/README_sdxl.md
new file mode 100644
index 0000000000000000000000000000000000000000..c971ee2a0f39d3fe4baef1b3a834b65dc569d2fb
--- /dev/null
+++ b/diffusers/examples/textual_inversion/README_sdxl.md
@@ -0,0 +1,47 @@
+## Textual Inversion fine-tuning example for SDXL
+
+```sh
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export DATA_DIR="./cat"
+
+accelerate launch textual_inversion_sdxl.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<cat-toy>" \
+  --initializer_token="toy" \
+  --mixed_precision="bf16" \
+  --resolution=768 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --max_train_steps=500 \
+  --learning_rate=5.0e-04 \
+  --scale_lr \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --save_as_full_pipeline \
+  --output_dir="./textual_inversion_cat_sdxl"
+```
+
+Training of both text encoders is supported.
+
+### Inference Example
+
+Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionXLPipeline`.
+Make sure to include the `placeholder_token` in your prompt.
+
+```python
+from diffusers import StableDiffusionXLPipeline
+import torch
+
+model_id = "./textual_inversion_cat_sdxl"
+pipe = StableDiffusionXLPipeline.from_pretrained(model_id,torch_dtype=torch.float16).to("cuda")
+
+prompt = "A <cat-toy> backpack"
+
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+image.save("cat-backpack.png")
+
+image = pipe(prompt="", prompt_2=prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+image.save("cat-backpack-prompt_2.png")
+```
diff --git a/diffusers/examples/textual_inversion/requirements.txt b/diffusers/examples/textual_inversion/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7a612982f4abbaa64f83db52e411a1235a372259
--- /dev/null
+++ b/diffusers/examples/textual_inversion/requirements.txt
@@ -0,0 +1,6 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
diff --git a/diffusers/examples/textual_inversion/requirements_flax.txt b/diffusers/examples/textual_inversion/requirements_flax.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8f85ad523a3b46b65abf0138c05ecdd656e6845c
--- /dev/null
+++ b/diffusers/examples/textual_inversion/requirements_flax.txt
@@ -0,0 +1,8 @@
+transformers>=4.25.1
+flax
+optax
+torch
+torchvision
+ftfy
+tensorboard
+Jinja2
diff --git a/diffusers/examples/textual_inversion/test_textual_inversion.py b/diffusers/examples/textual_inversion/test_textual_inversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa0b2c2bcf09998c7001a1e72b77600e2f946326
--- /dev/null
+++ b/diffusers/examples/textual_inversion/test_textual_inversion.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class TextualInversion(ExamplesTestsAccelerate):
+    def test_textual_inversion(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/textual_inversion/textual_inversion.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
+                --train_data_dir docs/source/en/imgs
+                --learnable_property object
+                --placeholder_token <cat-toy>
+                --initializer_token a
+                --save_steps 1
+                --num_vectors 2
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "learned_embeds.safetensors")))
+
+    def test_textual_inversion_checkpointing(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/textual_inversion/textual_inversion.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
+                --train_data_dir docs/source/en/imgs
+                --learnable_property object
+                --placeholder_token <cat-toy>
+                --initializer_token a
+                --save_steps 1
+                --num_vectors 2
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 3
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=1
+                --checkpoints_total_limit=2
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-3"},
+            )
+
+    def test_textual_inversion_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/textual_inversion/textual_inversion.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
+                --train_data_dir docs/source/en/imgs
+                --learnable_property object
+                --placeholder_token <cat-toy>
+                --initializer_token a
+                --save_steps 1
+                --num_vectors 2
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=1
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-1", "checkpoint-2"},
+            )
+
+            resume_run_args = f"""
+                examples/textual_inversion/textual_inversion.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
+                --train_data_dir docs/source/en/imgs
+                --learnable_property object
+                --placeholder_token <cat-toy>
+                --initializer_token a
+                --save_steps 1
+                --num_vectors 2
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=1
+                --resume_from_checkpoint=checkpoint-2
+                --checkpoints_total_limit=2
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-3"},
+            )
diff --git a/diffusers/examples/textual_inversion/test_textual_inversion_sdxl.py b/diffusers/examples/textual_inversion/test_textual_inversion_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..a861708a70f9c3d35dce5cbd4d2e8a206d9d11a5
--- /dev/null
+++ b/diffusers/examples/textual_inversion/test_textual_inversion_sdxl.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class TextualInversionSdxl(ExamplesTestsAccelerate):
+    def test_textual_inversion_sdxl(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/textual_inversion/textual_inversion_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-sdxl-pipe
+                --train_data_dir docs/source/en/imgs
+                --learnable_property object
+                --placeholder_token <cat-toy>
+                --initializer_token a
+                --save_steps 1
+                --num_vectors 2
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "learned_embeds.safetensors")))
+
+    def test_textual_inversion_sdxl_checkpointing(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/textual_inversion/textual_inversion_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-sdxl-pipe
+                --train_data_dir docs/source/en/imgs
+                --learnable_property object
+                --placeholder_token <cat-toy>
+                --initializer_token a
+                --save_steps 1
+                --num_vectors 2
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 3
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=1
+                --checkpoints_total_limit=2
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-3"},
+            )
+
+    def test_textual_inversion_sdxl_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/textual_inversion/textual_inversion_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-sdxl-pipe
+                --train_data_dir docs/source/en/imgs
+                --learnable_property object
+                --placeholder_token <cat-toy>
+                --initializer_token a
+                --save_steps 1
+                --num_vectors 2
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=1
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-1", "checkpoint-2"},
+            )
+
+            resume_run_args = f"""
+                examples/textual_inversion/textual_inversion_sdxl.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-sdxl-pipe
+                --train_data_dir docs/source/en/imgs
+                --learnable_property object
+                --placeholder_token <cat-toy>
+                --initializer_token a
+                --save_steps 1
+                --num_vectors 2
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=1
+                --resume_from_checkpoint=checkpoint-2
+                --checkpoints_total_limit=2
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-3"},
+            )
diff --git a/diffusers/examples/textual_inversion/textual_inversion.py b/diffusers/examples/textual_inversion/textual_inversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b710531836b1b4bc1ea5a2c49b13b797fc19d85
--- /dev/null
+++ b/diffusers/examples/textual_inversion/textual_inversion.py
@@ -0,0 +1,1022 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+import warnings
+from contextlib import nullcontext
+from pathlib import Path
+
+import numpy as np
+import PIL
+import safetensors
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+
+
+if is_wandb_available():
+    import wandb
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(repo_id: str, images: list = None, base_model: str = None, repo_folder: str = None):
+    img_str = ""
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            img_str += f"![img_{i}](./image_{i}.png)\n"
+    model_description = f"""
+# Textual inversion text2image fine-tuning - {repo_id}
+These are textual inversion adaption weights for {base_model}. You can find some example images in the following. \n
+{img_str}
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion",
+        "stable-diffusion-diffusers",
+        "text-to-image",
+        "diffusers",
+        "textual_inversion",
+        "diffusers-training",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    # create pipeline (note: unet and vae are loaded again in float32)
+    pipeline = DiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        text_encoder=accelerator.unwrap_model(text_encoder),
+        tokenizer=tokenizer,
+        unet=unet,
+        vae=vae,
+        safety_checker=None,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed)
+    images = []
+    for _ in range(args.num_validation_images):
+        if torch.backends.mps.is_available():
+            autocast_ctx = nullcontext()
+        else:
+            autocast_ctx = torch.autocast(accelerator.device.type)
+
+        with autocast_ctx:
+            image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+        images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    torch.cuda.empty_cache()
+    return images
+
+
+def save_progress(text_encoder, placeholder_token_ids, accelerator, args, save_path, safe_serialization=True):
+    logger.info("Saving embeddings")
+    learned_embeds = (
+        accelerator.unwrap_model(text_encoder)
+        .get_input_embeddings()
+        .weight[min(placeholder_token_ids) : max(placeholder_token_ids) + 1]
+    )
+    learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()}
+
+    if safe_serialization:
+        safetensors.torch.save_file(learned_embeds_dict, save_path, metadata={"format": "pt"})
+    else:
+        torch.save(learned_embeds_dict, save_path)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save learned_embeds.bin every X updates steps.",
+    )
+    parser.add_argument(
+        "--save_as_full_pipeline",
+        action="store_true",
+        help="Save the complete stable diffusion pipeline.",
+    )
+    parser.add_argument(
+        "--num_vectors",
+        type=int,
+        default=1,
+        help="How many textual inversion vectors shall be used to learn the concept.",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution."
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and Nvidia Ampere GPU or Intel Gen 4 Xeon (and later) ."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=None,
+        help=(
+            "Deprecated in favor of validation_steps. Run validation every X epochs. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--no_safe_serialization",
+        action="store_true",
+        help="If specified save the checkpoint not in `safetensors` format, but in original PyTorch format instead.",
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+class TextualInversionDataset(Dataset):
+    def __init__(
+        self,
+        data_root,
+        tokenizer,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer = tokenizer
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        self.interpolation = {
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["input_ids"] = self.tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            (
+                h,
+                w,
+            ) = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
+
+        image = Image.fromarray(img)
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def main():
+    args = parse_args()
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load tokenizer
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # Add the placeholder token in tokenizer
+    placeholder_tokens = [args.placeholder_token]
+
+    if args.num_vectors < 1:
+        raise ValueError(f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}")
+
+    # add dummy tokens for multi-vector
+    additional_tokens = []
+    for i in range(1, args.num_vectors):
+        additional_tokens.append(f"{args.placeholder_token}_{i}")
+    placeholder_tokens += additional_tokens
+
+    num_added_tokens = tokenizer.add_tokens(placeholder_tokens)
+    if num_added_tokens != args.num_vectors:
+        raise ValueError(
+            f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+            " `placeholder_token` that is not already in the tokenizer."
+        )
+
+    # Convert the initializer_token, placeholder_token to ids
+    token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
+    # Check if initializer_token is a single token or a sequence of tokens
+    if len(token_ids) > 1:
+        raise ValueError("The initializer token must be a single token.")
+
+    initializer_token_id = token_ids[0]
+    placeholder_token_ids = tokenizer.convert_tokens_to_ids(placeholder_tokens)
+
+    # Resize the token embeddings as we are adding new special tokens to the tokenizer
+    text_encoder.resize_token_embeddings(len(tokenizer))
+
+    # Initialise the newly added placeholder token with the embeddings of the initializer token
+    token_embeds = text_encoder.get_input_embeddings().weight.data
+    with torch.no_grad():
+        for token_id in placeholder_token_ids:
+            token_embeds[token_id] = token_embeds[initializer_token_id].clone()
+
+    # Freeze vae and unet
+    vae.requires_grad_(False)
+    unet.requires_grad_(False)
+    # Freeze all parameters except for the token embeddings in text encoder
+    text_encoder.text_model.encoder.requires_grad_(False)
+    text_encoder.text_model.final_layer_norm.requires_grad_(False)
+    text_encoder.text_model.embeddings.position_embedding.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        # Keep unet in train mode if we are using gradient checkpointing to save memory.
+        # The dropout cannot be != 0 so it doesn't matter if we are in eval or train mode.
+        unet.train()
+        text_encoder.gradient_checkpointing_enable()
+        unet.enable_gradient_checkpointing()
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    optimizer = torch.optim.AdamW(
+        text_encoder.get_input_embeddings().parameters(),  # only optimize the embeddings
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        placeholder_token=(" ".join(tokenizer.convert_ids_to_tokens(placeholder_token_ids))),
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
+    )
+    if args.validation_epochs is not None:
+        warnings.warn(
+            f"FutureWarning: You are doing logging with validation_epochs={args.validation_epochs}."
+            " Deprecated validation_epochs in favor of `validation_steps`"
+            f"Setting `args.validation_steps` to {args.validation_epochs * len(train_dataset)}",
+            FutureWarning,
+            stacklevel=2,
+        )
+        args.validation_steps = args.validation_epochs * len(train_dataset)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+    )
+
+    text_encoder.train()
+    # Prepare everything with our `accelerator`.
+    text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        text_encoder, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae and unet to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("textual_inversion", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    # keep original embeddings as reference
+    orig_embeds_params = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight.data.clone()
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(text_encoder):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample().detach()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0].to(dtype=weight_dtype)
+
+                # Predict the noise residual
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+                # Let's make sure we don't update any embedding weights besides the newly added token
+                index_no_updates = torch.ones((len(tokenizer),), dtype=torch.bool)
+                index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False
+
+                with torch.no_grad():
+                    accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[
+                        index_no_updates
+                    ] = orig_embeds_params[index_no_updates]
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                images = []
+                progress_bar.update(1)
+                global_step += 1
+                if global_step % args.save_steps == 0:
+                    weight_name = (
+                        f"learned_embeds-steps-{global_step}.bin"
+                        if args.no_safe_serialization
+                        else f"learned_embeds-steps-{global_step}.safetensors"
+                    )
+                    save_path = os.path.join(args.output_dir, weight_name)
+                    save_progress(
+                        text_encoder,
+                        placeholder_token_ids,
+                        accelerator,
+                        args,
+                        save_path,
+                        safe_serialization=not args.no_safe_serialization,
+                    )
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        images = log_validation(
+                            text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        if args.push_to_hub and not args.save_as_full_pipeline:
+            logger.warning("Enabling full model saving because --push_to_hub=True was specified.")
+            save_full_model = True
+        else:
+            save_full_model = args.save_as_full_pipeline
+        if save_full_model:
+            pipeline = StableDiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                text_encoder=accelerator.unwrap_model(text_encoder),
+                vae=vae,
+                unet=unet,
+                tokenizer=tokenizer,
+            )
+            pipeline.save_pretrained(args.output_dir)
+        # Save the newly trained embeddings
+        weight_name = "learned_embeds.bin" if args.no_safe_serialization else "learned_embeds.safetensors"
+        save_path = os.path.join(args.output_dir, weight_name)
+        save_progress(
+            text_encoder,
+            placeholder_token_ids,
+            accelerator,
+            args,
+            save_path,
+            safe_serialization=not args.no_safe_serialization,
+        )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/textual_inversion/textual_inversion_flax.py b/diffusers/examples/textual_inversion/textual_inversion_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee7b1580d14520f9a8e44c81844c803d6932a414
--- /dev/null
+++ b/diffusers/examples/textual_inversion/textual_inversion_flax.py
@@ -0,0 +1,681 @@
+import argparse
+import logging
+import math
+import os
+import random
+from pathlib import Path
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+import optax
+import PIL
+import torch
+import torch.utils.checkpoint
+import transformers
+from flax import jax_utils
+from flax.training import train_state
+from flax.training.common_utils import shard
+from huggingface_hub import create_repo, upload_folder
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
+
+from diffusers import (
+    FlaxAutoencoderKL,
+    FlaxDDPMScheduler,
+    FlaxPNDMScheduler,
+    FlaxStableDiffusionPipeline,
+    FlaxUNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
+from diffusers.utils import check_min_version
+
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution."
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save learned_embeds.bin every X updates steps.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=True,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument(
+        "--use_auth_token",
+        action="store_true",
+        help=(
+            "Will use the token generated when running `huggingface-cli login` (necessary to use this script with"
+            " private models)."
+        ),
+    )
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+class TextualInversionDataset(Dataset):
+    def __init__(
+        self,
+        data_root,
+        tokenizer,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer = tokenizer
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        self.interpolation = {
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["input_ids"] = self.tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            (
+                h,
+                w,
+            ) = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
+
+        image = Image.fromarray(img)
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def resize_token_embeddings(model, new_num_tokens, initializer_token_id, placeholder_token_id, rng):
+    if model.config.vocab_size == new_num_tokens or new_num_tokens is None:
+        return
+    model.config.vocab_size = new_num_tokens
+
+    params = model.params
+    old_embeddings = params["text_model"]["embeddings"]["token_embedding"]["embedding"]
+    old_num_tokens, emb_dim = old_embeddings.shape
+
+    initializer = jax.nn.initializers.normal()
+
+    new_embeddings = initializer(rng, (new_num_tokens, emb_dim))
+    new_embeddings = new_embeddings.at[:old_num_tokens].set(old_embeddings)
+    new_embeddings = new_embeddings.at[placeholder_token_id].set(new_embeddings[initializer_token_id])
+    params["text_model"]["embeddings"]["token_embedding"]["embedding"] = new_embeddings
+
+    model.params = params
+    return model
+
+
+def get_params_to_save(params):
+    return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
+
+
+def main():
+    args = parse_args()
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    if jax.process_index() == 0:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    # Load the tokenizer and add the placeholder token as a additional special token
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Add the placeholder token in tokenizer
+    num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
+    if num_added_tokens == 0:
+        raise ValueError(
+            f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+            " `placeholder_token` that is not already in the tokenizer."
+        )
+
+    # Convert the initializer_token, placeholder_token to ids
+    token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
+    # Check if initializer_token is a single token or a sequence of tokens
+    if len(token_ids) > 1:
+        raise ValueError("The initializer token must be a single token.")
+
+    initializer_token_id = token_ids[0]
+    placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = FlaxCLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae, vae_params = FlaxAutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
+    )
+    unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+    )
+
+    # Create sampling rng
+    rng = jax.random.PRNGKey(args.seed)
+    rng, _ = jax.random.split(rng)
+    # Resize the token embeddings as we are adding new special tokens to the tokenizer
+    text_encoder = resize_token_embeddings(
+        text_encoder, len(tokenizer), initializer_token_id, placeholder_token_id, rng
+    )
+    original_token_embeds = text_encoder.params["text_model"]["embeddings"]["token_embedding"]["embedding"]
+
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        placeholder_token=args.placeholder_token,
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+
+        batch = {"pixel_values": pixel_values, "input_ids": input_ids}
+        batch = {k: v.numpy() for k, v in batch.items()}
+
+        return batch
+
+    total_train_batch_size = args.train_batch_size * jax.local_device_count()
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=total_train_batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn
+    )
+
+    # Optimization
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * total_train_batch_size
+
+    constant_scheduler = optax.constant_schedule(args.learning_rate)
+
+    optimizer = optax.adamw(
+        learning_rate=constant_scheduler,
+        b1=args.adam_beta1,
+        b2=args.adam_beta2,
+        eps=args.adam_epsilon,
+        weight_decay=args.adam_weight_decay,
+    )
+
+    def create_mask(params, label_fn):
+        def _map(params, mask, label_fn):
+            for k in params:
+                if label_fn(k):
+                    mask[k] = "token_embedding"
+                else:
+                    if isinstance(params[k], dict):
+                        mask[k] = {}
+                        _map(params[k], mask[k], label_fn)
+                    else:
+                        mask[k] = "zero"
+
+        mask = {}
+        _map(params, mask, label_fn)
+        return mask
+
+    def zero_grads():
+        # from https://github.com/deepmind/optax/issues/159#issuecomment-896459491
+        def init_fn(_):
+            return ()
+
+        def update_fn(updates, state, params=None):
+            return jax.tree_util.tree_map(jnp.zeros_like, updates), ()
+
+        return optax.GradientTransformation(init_fn, update_fn)
+
+    # Zero out gradients of layers other than the token embedding layer
+    tx = optax.multi_transform(
+        {"token_embedding": optimizer, "zero": zero_grads()},
+        create_mask(text_encoder.params, lambda s: s == "token_embedding"),
+    )
+
+    state = train_state.TrainState.create(apply_fn=text_encoder.__call__, params=text_encoder.params, tx=tx)
+
+    noise_scheduler = FlaxDDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
+    )
+    noise_scheduler_state = noise_scheduler.create_state()
+
+    # Initialize our training
+    train_rngs = jax.random.split(rng, jax.local_device_count())
+
+    # Define gradient train step fn
+    def train_step(state, vae_params, unet_params, batch, train_rng):
+        dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3)
+
+        def compute_loss(params):
+            vae_outputs = vae.apply(
+                {"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode
+            )
+            latents = vae_outputs.latent_dist.sample(sample_rng)
+            # (NHWC) -> (NCHW)
+            latents = jnp.transpose(latents, (0, 3, 1, 2))
+            latents = latents * vae.config.scaling_factor
+
+            noise_rng, timestep_rng = jax.random.split(sample_rng)
+            noise = jax.random.normal(noise_rng, latents.shape)
+            bsz = latents.shape[0]
+            timesteps = jax.random.randint(
+                timestep_rng,
+                (bsz,),
+                0,
+                noise_scheduler.config.num_train_timesteps,
+            )
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)
+            encoder_hidden_states = state.apply_fn(
+                batch["input_ids"], params=params, dropout_rng=dropout_rng, train=True
+            )[0]
+            # Predict the noise residual and compute loss
+            model_pred = unet.apply(
+                {"params": unet_params}, noisy_latents, timesteps, encoder_hidden_states, train=False
+            ).sample
+
+            # Get the target for loss depending on the prediction type
+            if noise_scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif noise_scheduler.config.prediction_type == "v_prediction":
+                target = noise_scheduler.get_velocity(noise_scheduler_state, latents, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+            loss = (target - model_pred) ** 2
+            loss = loss.mean()
+
+            return loss
+
+        grad_fn = jax.value_and_grad(compute_loss)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+
+        # Keep the token embeddings fixed except the newly added embeddings for the concept,
+        # as we only want to optimize the concept embeddings
+        token_embeds = original_token_embeds.at[placeholder_token_id].set(
+            new_state.params["text_model"]["embeddings"]["token_embedding"]["embedding"][placeholder_token_id]
+        )
+        new_state.params["text_model"]["embeddings"]["token_embedding"]["embedding"] = token_embeds
+
+        metrics = {"loss": loss}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+        return new_state, metrics, new_train_rng
+
+    # Create parallel version of the train and eval step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    vae_params = jax_utils.replicate(vae_params)
+    unet_params = jax_utils.replicate(unet_params)
+
+    # Train!
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+
+    # Scheduler and math around the number of training steps.
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    global_step = 0
+
+    epochs = tqdm(range(args.num_train_epochs), desc=f"Epoch ... (1/{args.num_train_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+
+        train_metrics = []
+
+        steps_per_epoch = len(train_dataset) // total_train_batch_size
+        train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
+        # train
+        for batch in train_dataloader:
+            batch = shard(batch)
+            state, train_metric, train_rngs = p_train_step(state, vae_params, unet_params, batch, train_rngs)
+            train_metrics.append(train_metric)
+
+            train_step_progress_bar.update(1)
+            global_step += 1
+
+            if global_step >= args.max_train_steps:
+                break
+            if global_step % args.save_steps == 0:
+                learned_embeds = get_params_to_save(state.params)["text_model"]["embeddings"]["token_embedding"][
+                    "embedding"
+                ][placeholder_token_id]
+                learned_embeds_dict = {args.placeholder_token: learned_embeds}
+                jnp.save(
+                    os.path.join(args.output_dir, "learned_embeds-" + str(global_step) + ".npy"), learned_embeds_dict
+                )
+
+        train_metric = jax_utils.unreplicate(train_metric)
+
+        train_step_progress_bar.close()
+        epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
+
+    # Create the pipeline using using the trained modules and save it.
+    if jax.process_index() == 0:
+        scheduler = FlaxPNDMScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+        )
+        safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained(
+            "CompVis/stable-diffusion-safety-checker", from_pt=True
+        )
+        pipeline = FlaxStableDiffusionPipeline(
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32"),
+        )
+
+        pipeline.save_pretrained(
+            args.output_dir,
+            params={
+                "text_encoder": get_params_to_save(state.params),
+                "vae": get_params_to_save(vae_params),
+                "unet": get_params_to_save(unet_params),
+                "safety_checker": safety_checker.params,
+            },
+        )
+
+        # Also save the newly trained embeddings
+        learned_embeds = get_params_to_save(state.params)["text_model"]["embeddings"]["token_embedding"]["embedding"][
+            placeholder_token_id
+        ]
+        learned_embeds_dict = {args.placeholder_token: learned_embeds}
+        jnp.save(os.path.join(args.output_dir, "learned_embeds.npy"), learned_embeds_dict)
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/textual_inversion/textual_inversion_sdxl.py b/diffusers/examples/textual_inversion/textual_inversion_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4629f0f43d613efef87f9f19d2b997bc2de5c74
--- /dev/null
+++ b/diffusers/examples/textual_inversion/textual_inversion_sdxl.py
@@ -0,0 +1,1122 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import numpy as np
+import PIL
+import safetensors
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_xformers_available
+
+
+if is_wandb_available():
+    import wandb
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(repo_id: str, images=None, base_model=str, repo_folder=None):
+    img_str = ""
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    model_description = f"""
+# Textual inversion text2image fine-tuning - {repo_id}
+These are textual inversion adaption weights for {base_model}. You can find some example images in the following. \n
+{img_str}
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="creativeml-openrail-m",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+
+    tags = [
+        "stable-diffusion-xl",
+        "stable-diffusion-xl-diffusers",
+        "text-to-image",
+        "diffusers",
+        "diffusers-training",
+        "textual_inversion",
+    ]
+
+    model_card = populate_model_card(model_card, tags=tags)
+
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    text_encoder_1,
+    text_encoder_2,
+    tokenizer_1,
+    tokenizer_2,
+    unet,
+    vae,
+    args,
+    accelerator,
+    weight_dtype,
+    epoch,
+    is_final_validation=False,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = DiffusionPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        text_encoder=accelerator.unwrap_model(text_encoder_1),
+        text_encoder_2=accelerator.unwrap_model(text_encoder_2),
+        tokenizer=tokenizer_1,
+        tokenizer_2=tokenizer_2,
+        unet=unet,
+        vae=vae,
+        safety_checker=None,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=weight_dtype,
+    )
+    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed)
+    images = []
+    for _ in range(args.num_validation_images):
+        image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+        images.append(image)
+
+    tracker_key = "test" if is_final_validation else "validation"
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(tracker_key, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    tracker_key: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    torch.cuda.empty_cache()
+    return images
+
+
+def save_progress(text_encoder, placeholder_token_ids, accelerator, args, save_path, safe_serialization=True):
+    logger.info("Saving embeddings")
+    learned_embeds = (
+        accelerator.unwrap_model(text_encoder)
+        .get_input_embeddings()
+        .weight[min(placeholder_token_ids) : max(placeholder_token_ids) + 1]
+    )
+    learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()}
+
+    if safe_serialization:
+        safetensors.torch.save_file(learned_embeds_dict, save_path, metadata={"format": "pt"})
+    else:
+        torch.save(learned_embeds_dict, save_path)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save learned_embeds.bin every X updates steps.",
+    )
+    parser.add_argument(
+        "--save_as_full_pipeline",
+        action="store_true",
+        help="Save the complete stable diffusion pipeline.",
+    )
+    parser.add_argument(
+        "--num_vectors",
+        type=int,
+        default=1,
+        help="How many textual inversion vectors shall be used to learn the concept.",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution."
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+class TextualInversionDataset(Dataset):
+    def __init__(
+        self,
+        data_root,
+        tokenizer_1,
+        tokenizer_2,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer_1 = tokenizer_1
+        self.tokenizer_2 = tokenizer_2
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        self.interpolation = {
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+        self.crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["original_size"] = (image.height, image.width)
+
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        if self.center_crop:
+            y1 = max(0, int(round((image.height - self.size) / 2.0)))
+            x1 = max(0, int(round((image.width - self.size) / 2.0)))
+            image = self.crop(image)
+        else:
+            y1, x1, h, w = self.crop.get_params(image, (self.size, self.size))
+            image = transforms.functional.crop(image, y1, x1, h, w)
+
+        example["crop_top_left"] = (y1, x1)
+
+        example["input_ids_1"] = self.tokenizer_1(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer_1.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        example["input_ids_2"] = self.tokenizer_2(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer_2.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        image = Image.fromarray(img)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def main():
+    args = parse_args()
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load tokenizer
+    tokenizer_1 = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+    tokenizer_2 = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer_2")
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder_1 = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+    )
+
+    # Add the placeholder token in tokenizer_1
+    placeholder_tokens = [args.placeholder_token]
+
+    if args.num_vectors < 1:
+        raise ValueError(f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}")
+
+    # add dummy tokens for multi-vector
+    additional_tokens = []
+    for i in range(1, args.num_vectors):
+        additional_tokens.append(f"{args.placeholder_token}_{i}")
+    placeholder_tokens += additional_tokens
+
+    num_added_tokens = tokenizer_1.add_tokens(placeholder_tokens)
+    if num_added_tokens != args.num_vectors:
+        raise ValueError(
+            f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+            " `placeholder_token` that is not already in the tokenizer."
+        )
+    num_added_tokens = tokenizer_2.add_tokens(placeholder_tokens)
+    if num_added_tokens != args.num_vectors:
+        raise ValueError(
+            f"The 2nd tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+            " `placeholder_token` that is not already in the tokenizer."
+        )
+
+    # Convert the initializer_token, placeholder_token to ids
+    token_ids = tokenizer_1.encode(args.initializer_token, add_special_tokens=False)
+    token_ids_2 = tokenizer_2.encode(args.initializer_token, add_special_tokens=False)
+
+    # Check if initializer_token is a single token or a sequence of tokens
+    if len(token_ids) > 1 or len(token_ids_2) > 1:
+        raise ValueError("The initializer token must be a single token.")
+
+    initializer_token_id = token_ids[0]
+    placeholder_token_ids = tokenizer_1.convert_tokens_to_ids(placeholder_tokens)
+    initializer_token_id_2 = token_ids_2[0]
+    placeholder_token_ids_2 = tokenizer_2.convert_tokens_to_ids(placeholder_tokens)
+
+    # Resize the token embeddings as we are adding new special tokens to the tokenizer
+    text_encoder_1.resize_token_embeddings(len(tokenizer_1))
+    text_encoder_2.resize_token_embeddings(len(tokenizer_2))
+
+    # Initialise the newly added placeholder token with the embeddings of the initializer token
+    token_embeds = text_encoder_1.get_input_embeddings().weight.data
+    token_embeds_2 = text_encoder_2.get_input_embeddings().weight.data
+    with torch.no_grad():
+        for token_id in placeholder_token_ids:
+            token_embeds[token_id] = token_embeds[initializer_token_id].clone()
+        for token_id in placeholder_token_ids_2:
+            token_embeds_2[token_id] = token_embeds_2[initializer_token_id_2].clone()
+
+    # Freeze vae and unet
+    vae.requires_grad_(False)
+    unet.requires_grad_(False)
+
+    # Freeze all parameters except for the token embeddings in text encoder
+    text_encoder_1.text_model.encoder.requires_grad_(False)
+    text_encoder_1.text_model.final_layer_norm.requires_grad_(False)
+    text_encoder_1.text_model.embeddings.position_embedding.requires_grad_(False)
+    text_encoder_2.text_model.encoder.requires_grad_(False)
+    text_encoder_2.text_model.final_layer_norm.requires_grad_(False)
+    text_encoder_2.text_model.embeddings.position_embedding.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        text_encoder_1.gradient_checkpointing_enable()
+        text_encoder_2.gradient_checkpointing_enable()
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    optimizer = optimizer_class(
+        # only optimize the embeddings
+        [
+            text_encoder_1.text_model.embeddings.token_embedding.weight,
+            text_encoder_2.text_model.embeddings.token_embedding.weight,
+        ],
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    placeholder_token = " ".join(tokenizer_1.convert_ids_to_tokens(placeholder_token_ids))
+    # Dataset and DataLoaders creation:
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer_1=tokenizer_1,
+        tokenizer_2=tokenizer_2,
+        size=args.resolution,
+        placeholder_token=placeholder_token,
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_cycles=args.lr_num_cycles,
+    )
+
+    text_encoder_1.train()
+    text_encoder_2.train()
+    # Prepare everything with our `accelerator`.
+    text_encoder_1, text_encoder_2, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        text_encoder_1, text_encoder_2, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move vae and unet and text_encoder_2 to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_2.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("textual_inversion", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    # keep original embeddings as reference
+    orig_embeds_params = accelerator.unwrap_model(text_encoder_1).get_input_embeddings().weight.data.clone()
+    orig_embeds_params_2 = accelerator.unwrap_model(text_encoder_2).get_input_embeddings().weight.data.clone()
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        text_encoder_1.train()
+        text_encoder_2.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate([text_encoder_1, text_encoder_2]):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample().detach()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states_1 = (
+                    text_encoder_1(batch["input_ids_1"], output_hidden_states=True)
+                    .hidden_states[-2]
+                    .to(dtype=weight_dtype)
+                )
+                encoder_output_2 = text_encoder_2(batch["input_ids_2"], output_hidden_states=True)
+                encoder_hidden_states_2 = encoder_output_2.hidden_states[-2].to(dtype=weight_dtype)
+                original_size = [
+                    (batch["original_size"][0][i].item(), batch["original_size"][1][i].item())
+                    for i in range(args.train_batch_size)
+                ]
+                crop_top_left = [
+                    (batch["crop_top_left"][0][i].item(), batch["crop_top_left"][1][i].item())
+                    for i in range(args.train_batch_size)
+                ]
+                target_size = (args.resolution, args.resolution)
+                add_time_ids = torch.cat(
+                    [
+                        torch.tensor(original_size[i] + crop_top_left[i] + target_size)
+                        for i in range(args.train_batch_size)
+                    ]
+                ).to(accelerator.device, dtype=weight_dtype)
+                added_cond_kwargs = {"text_embeds": encoder_output_2[0], "time_ids": add_time_ids}
+                encoder_hidden_states = torch.cat([encoder_hidden_states_1, encoder_hidden_states_2], dim=-1)
+
+                # Predict the noise residual
+                model_pred = unet(
+                    noisy_latents, timesteps, encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+                ).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+                accelerator.backward(loss)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+                # Let's make sure we don't update any embedding weights besides the newly added token
+                index_no_updates = torch.ones((len(tokenizer_1),), dtype=torch.bool)
+                index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False
+                index_no_updates_2 = torch.ones((len(tokenizer_2),), dtype=torch.bool)
+                index_no_updates_2[min(placeholder_token_ids_2) : max(placeholder_token_ids_2) + 1] = False
+
+                with torch.no_grad():
+                    accelerator.unwrap_model(text_encoder_1).get_input_embeddings().weight[
+                        index_no_updates
+                    ] = orig_embeds_params[index_no_updates]
+                    accelerator.unwrap_model(text_encoder_2).get_input_embeddings().weight[
+                        index_no_updates_2
+                    ] = orig_embeds_params_2[index_no_updates_2]
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                images = []
+                progress_bar.update(1)
+                global_step += 1
+                if global_step % args.save_steps == 0:
+                    weight_name = f"learned_embeds-steps-{global_step}.safetensors"
+                    save_path = os.path.join(args.output_dir, weight_name)
+                    save_progress(
+                        text_encoder_1,
+                        placeholder_token_ids,
+                        accelerator,
+                        args,
+                        save_path,
+                        safe_serialization=True,
+                    )
+                    weight_name = f"learned_embeds_2-steps-{global_step}.safetensors"
+                    save_path = os.path.join(args.output_dir, weight_name)
+                    save_progress(
+                        text_encoder_2,
+                        placeholder_token_ids_2,
+                        accelerator,
+                        args,
+                        save_path,
+                        safe_serialization=True,
+                    )
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        images = log_validation(
+                            text_encoder_1,
+                            text_encoder_2,
+                            tokenizer_1,
+                            tokenizer_2,
+                            unet,
+                            vae,
+                            args,
+                            accelerator,
+                            weight_dtype,
+                            epoch,
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        if args.validation_prompt:
+            images = log_validation(
+                text_encoder_1,
+                text_encoder_2,
+                tokenizer_1,
+                tokenizer_2,
+                unet,
+                vae,
+                args,
+                accelerator,
+                weight_dtype,
+                epoch,
+                is_final_validation=True,
+            )
+
+        if args.push_to_hub and not args.save_as_full_pipeline:
+            logger.warning("Enabling full model saving because --push_to_hub=True was specified.")
+            save_full_model = True
+        else:
+            save_full_model = args.save_as_full_pipeline
+        if save_full_model:
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                text_encoder=accelerator.unwrap_model(text_encoder_1),
+                text_encoder_2=accelerator.unwrap_model(text_encoder_2),
+                vae=vae,
+                unet=unet,
+                tokenizer=tokenizer_1,
+                tokenizer_2=tokenizer_2,
+            )
+            pipeline.save_pretrained(args.output_dir)
+        # Save the newly trained embeddings
+        weight_name = "learned_embeds.safetensors"
+        save_path = os.path.join(args.output_dir, weight_name)
+        save_progress(
+            text_encoder_1,
+            placeholder_token_ids,
+            accelerator,
+            args,
+            save_path,
+            safe_serialization=True,
+        )
+        weight_name = "learned_embeds_2.safetensors"
+        save_path = os.path.join(args.output_dir, weight_name)
+        save_progress(
+            text_encoder_2,
+            placeholder_token_ids_2,
+            accelerator,
+            args,
+            save_path,
+            safe_serialization=True,
+        )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/unconditional_image_generation/README.md b/diffusers/examples/unconditional_image_generation/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2990b3abf3f5b09af67ed2d0da96ce3bc297e1f6
--- /dev/null
+++ b/diffusers/examples/unconditional_image_generation/README.md
@@ -0,0 +1,163 @@
+## Training an unconditional diffusion model
+
+Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets).
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd in the example folder  and run
+```bash
+pip install -r requirements.txt
+```
+
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+### Unconditional Flowers
+
+The command to train a DDPM UNet model on the Oxford Flowers dataset:
+
+```bash
+accelerate launch train_unconditional.py \
+  --dataset_name="huggan/flowers-102-categories" \
+  --resolution=64 --center_crop --random_flip \
+  --output_dir="ddpm-ema-flowers-64" \
+  --train_batch_size=16 \
+  --num_epochs=100 \
+  --gradient_accumulation_steps=1 \
+  --use_ema \
+  --learning_rate=1e-4 \
+  --lr_warmup_steps=500 \
+  --mixed_precision=no \
+  --push_to_hub
+```
+An example trained model: https://huggingface.co/anton-l/ddpm-ema-flowers-64
+
+A full training run takes 2 hours on 4xV100 GPUs.
+
+<img src="https://user-images.githubusercontent.com/26864830/180248660-a0b143d0-b89a-42c5-8656-2ebf6ece7e52.png" width="700" />
+
+
+### Unconditional Pokemon
+
+The command to train a DDPM UNet model on the Pokemon dataset:
+
+```bash
+accelerate launch train_unconditional.py \
+  --dataset_name="huggan/pokemon" \
+  --resolution=64 --center_crop --random_flip \
+  --output_dir="ddpm-ema-pokemon-64" \
+  --train_batch_size=16 \
+  --num_epochs=100 \
+  --gradient_accumulation_steps=1 \
+  --use_ema \
+  --learning_rate=1e-4 \
+  --lr_warmup_steps=500 \
+  --mixed_precision=no \
+  --push_to_hub
+```
+An example trained model: https://huggingface.co/anton-l/ddpm-ema-pokemon-64
+
+A full training run takes 2 hours on 4xV100 GPUs.
+
+<img src="https://user-images.githubusercontent.com/26864830/180248200-928953b4-db38-48db-b0c6-8b740fe6786f.png" width="700" />
+
+### Training with multiple GPUs
+
+`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch)
+for running distributed training with `accelerate`. Here is an example command:
+
+```bash
+accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \
+  --dataset_name="huggan/pokemon" \
+  --resolution=64 --center_crop --random_flip \
+  --output_dir="ddpm-ema-pokemon-64" \
+  --train_batch_size=16 \
+  --num_epochs=100 \
+  --gradient_accumulation_steps=1 \
+  --use_ema \
+  --learning_rate=1e-4 \
+  --lr_warmup_steps=500 \
+  --mixed_precision="fp16" \
+  --logger="wandb"
+```
+
+To be able to use Weights and Biases (`wandb`) as a logger you need to install the library: `pip install wandb`.
+
+### Using your own data
+
+To use your own dataset, there are 2 ways:
+- you can either provide your own folder as `--train_data_dir`
+- or you can upload your dataset to the hub (possibly as a private repo, if you prefer so), and simply pass the `--dataset_name` argument.
+
+Below, we explain both in more detail.
+
+#### Provide the dataset as a folder
+
+If you provide your own folders with images, the script expects the following directory structure:
+
+```bash
+data_dir/xxx.png
+data_dir/xxy.png
+data_dir/[...]/xxz.png
+```
+
+In other words, the script will take care of gathering all images inside the folder. You can then run the script like this:
+
+```bash
+accelerate launch train_unconditional.py \
+    --train_data_dir <path-to-train-directory> \
+    <other-arguments>
+```
+
+Internally, the script will use the [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) feature which will automatically turn the folders into 🤗 Dataset objects.
+
+#### Upload your data to the hub, as a (possibly private) repo
+
+It's very easy (and convenient) to upload your image dataset to the hub using the [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) feature available in 🤗 Datasets. Simply do the following:
+
+```python
+from datasets import load_dataset
+
+# example 1: local folder
+dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")
+
+# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd)
+dataset = load_dataset("imagefolder", data_files="path_to_zip_file")
+
+# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd)
+dataset = load_dataset("imagefolder", data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip")
+
+# example 4: providing several splits
+dataset = load_dataset("imagefolder", data_files={"train": ["path/to/file1", "path/to/file2"], "test": ["path/to/file3", "path/to/file4"]})
+```
+
+`ImageFolder` will create an `image` column containing the PIL-encoded images.
+
+Next, push it to the hub!
+
+```python
+# assuming you have ran the huggingface-cli login command in a terminal
+dataset.push_to_hub("name_of_your_dataset")
+
+# if you want to push to a private repo, simply pass private=True:
+dataset.push_to_hub("name_of_your_dataset", private=True)
+```
+
+and that's it! You can now train your model by simply setting the `--dataset_name` argument to the name of your dataset on the hub.
+
+More on this can also be found in [this blog post](https://huggingface.co/blog/image-search-datasets).
diff --git a/diffusers/examples/unconditional_image_generation/requirements.txt b/diffusers/examples/unconditional_image_generation/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f366720afd11e41945a3f29472be2048dbf98404
--- /dev/null
+++ b/diffusers/examples/unconditional_image_generation/requirements.txt
@@ -0,0 +1,3 @@
+accelerate>=0.16.0
+torchvision
+datasets
diff --git a/diffusers/examples/unconditional_image_generation/test_unconditional.py b/diffusers/examples/unconditional_image_generation/test_unconditional.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef71da0a114c5b1cc36ebb1bfb3eacdeaa2598f0
--- /dev/null
+++ b/diffusers/examples/unconditional_image_generation/test_unconditional.py
@@ -0,0 +1,130 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class Unconditional(ExamplesTestsAccelerate):
+    def test_train_unconditional(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/unconditional_image_generation/train_unconditional.py
+                --dataset_name hf-internal-testing/dummy_image_class_data
+                --model_config_name_or_path diffusers/ddpm_dummy
+                --resolution 64
+                --output_dir {tmpdir}
+                --train_batch_size 2
+                --num_epochs 1
+                --gradient_accumulation_steps 1
+                --ddpm_num_inference_steps 2
+                --learning_rate 1e-3
+                --lr_warmup_steps 5
+                """.split()
+
+            run_command(self._launch_args + test_args, return_stdout=True)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
+
+    def test_unconditional_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            initial_run_args = f"""
+                examples/unconditional_image_generation/train_unconditional.py
+                --dataset_name hf-internal-testing/dummy_image_class_data
+                --model_config_name_or_path diffusers/ddpm_dummy
+                --resolution 64
+                --output_dir {tmpdir}
+                --train_batch_size 1
+                --num_epochs 1
+                --gradient_accumulation_steps 1
+                --ddpm_num_inference_steps 2
+                --learning_rate 1e-3
+                --lr_warmup_steps 5
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                # checkpoint-2 should have been deleted
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_unconditional_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            initial_run_args = f"""
+                examples/unconditional_image_generation/train_unconditional.py
+                --dataset_name hf-internal-testing/dummy_image_class_data
+                --model_config_name_or_path diffusers/ddpm_dummy
+                --resolution 64
+                --output_dir {tmpdir}
+                --train_batch_size 1
+                --num_epochs 1
+                --gradient_accumulation_steps 1
+                --ddpm_num_inference_steps 1
+                --learning_rate 1e-3
+                --lr_warmup_steps 5
+                --checkpointing_steps=2
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4", "checkpoint-6"},
+            )
+
+            resume_run_args = f"""
+                examples/unconditional_image_generation/train_unconditional.py
+                --dataset_name hf-internal-testing/dummy_image_class_data
+                --model_config_name_or_path diffusers/ddpm_dummy
+                --resolution 64
+                --output_dir {tmpdir}
+                --train_batch_size 1
+                --num_epochs 2
+                --gradient_accumulation_steps 1
+                --ddpm_num_inference_steps 1
+                --learning_rate 1e-3
+                --lr_warmup_steps 5
+                --resume_from_checkpoint=checkpoint-6
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-10", "checkpoint-12"},
+            )
diff --git a/diffusers/examples/unconditional_image_generation/train_unconditional.py b/diffusers/examples/unconditional_image_generation/train_unconditional.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f5e1de240cb5e218d85828d73dd92d88d39d778
--- /dev/null
+++ b/diffusers/examples/unconditional_image_generation/train_unconditional.py
@@ -0,0 +1,704 @@
+import argparse
+import inspect
+import logging
+import math
+import os
+import shutil
+from datetime import timedelta
+from pathlib import Path
+
+import accelerate
+import datasets
+import torch
+import torch.nn.functional as F
+from accelerate import Accelerator, InitProcessGroupKwargs
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torchvision import transforms
+from tqdm.auto import tqdm
+
+import diffusers
+from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel
+from diffusers.utils import check_min_version, is_accelerate_version, is_tensorboard_available, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    if not isinstance(arr, torch.Tensor):
+        arr = torch.from_numpy(arr)
+    res = arr[timesteps].float().to(timesteps.device)
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res.expand(broadcast_shape)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that HF Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--model_config_name_or_path",
+        type=str,
+        default=None,
+        help="The config of the UNet model to train, leave as None to use standard DDPM configuration.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="ddpm-model-64",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--overwrite_output_dir", action="store_true")
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=64,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        default=False,
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--eval_batch_size", type=int, default=16, help="The number of images to generate for evaluation."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "The number of subprocesses to use for data loading. 0 means that the data will be loaded in the main"
+            " process."
+        ),
+    )
+    parser.add_argument("--num_epochs", type=int, default=100)
+    parser.add_argument("--save_images_epochs", type=int, default=10, help="How often to save images during training.")
+    parser.add_argument(
+        "--save_model_epochs", type=int, default=10, help="How often to save the model during training."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="cosine",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.95, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument(
+        "--adam_weight_decay", type=float, default=1e-6, help="Weight decay magnitude for the Adam optimizer."
+    )
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.")
+    parser.add_argument(
+        "--use_ema",
+        action="store_true",
+        help="Whether to use Exponential Moving Average for the final model weights.",
+    )
+    parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.")
+    parser.add_argument("--ema_power", type=float, default=3 / 4, help="The power value for the EMA decay.")
+    parser.add_argument("--ema_max_decay", type=float, default=0.9999, help="The maximum decay magnitude for EMA.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--hub_private_repo", action="store_true", help="Whether or not to create a private repository."
+    )
+    parser.add_argument(
+        "--logger",
+        type=str,
+        default="tensorboard",
+        choices=["tensorboard", "wandb"],
+        help=(
+            "Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)"
+            " for experiment tracking and logging of model metrics and model checkpoints"
+        ),
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default="epsilon",
+        choices=["epsilon", "sample"],
+        help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.",
+    )
+    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
+    parser.add_argument("--ddpm_num_inference_steps", type=int, default=1000)
+    parser.add_argument("--ddpm_beta_schedule", type=str, default="linear")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
+
+    return args
+
+
+def main(args):
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=7200))  # a big number for high resolution or big dataset
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.logger,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    if args.logger == "tensorboard":
+        if not is_tensorboard_available():
+            raise ImportError("Make sure to install tensorboard if you want to use it for logging during training.")
+
+    elif args.logger == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_model.save_pretrained(os.path.join(output_dir, "unet_ema"))
+
+                for i, model in enumerate(models):
+                    model.save_pretrained(os.path.join(output_dir, "unet"))
+
+                    # make sure to pop weight so that corresponding model is not saved again
+                    weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel)
+                ema_model.load_state_dict(load_model.state_dict())
+                ema_model.to(accelerator.device)
+                del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = UNet2DModel.from_pretrained(input_dir, subfolder="unet")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Initialize the model
+    if args.model_config_name_or_path is None:
+        model = UNet2DModel(
+            sample_size=args.resolution,
+            in_channels=3,
+            out_channels=3,
+            layers_per_block=2,
+            block_out_channels=(128, 128, 256, 256, 512, 512),
+            down_block_types=(
+                "DownBlock2D",
+                "DownBlock2D",
+                "DownBlock2D",
+                "DownBlock2D",
+                "AttnDownBlock2D",
+                "DownBlock2D",
+            ),
+            up_block_types=(
+                "UpBlock2D",
+                "AttnUpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+            ),
+        )
+    else:
+        config = UNet2DModel.load_config(args.model_config_name_or_path)
+        model = UNet2DModel.from_config(config)
+
+    # Create EMA for the model.
+    if args.use_ema:
+        ema_model = EMAModel(
+            model.parameters(),
+            decay=args.ema_max_decay,
+            use_ema_warmup=True,
+            inv_gamma=args.ema_inv_gamma,
+            power=args.ema_power,
+            model_cls=UNet2DModel,
+            model_config=model.config,
+        )
+
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+        args.mixed_precision = accelerator.mixed_precision
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+        args.mixed_precision = accelerator.mixed_precision
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warning(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            model.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Initialize the scheduler
+    accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys())
+    if accepts_prediction_type:
+        noise_scheduler = DDPMScheduler(
+            num_train_timesteps=args.ddpm_num_steps,
+            beta_schedule=args.ddpm_beta_schedule,
+            prediction_type=args.prediction_type,
+        )
+    else:
+        noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule)
+
+    # Initialize the optimizer
+    optimizer = torch.optim.AdamW(
+        model.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            split="train",
+        )
+    else:
+        dataset = load_dataset("imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, split="train")
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets and DataLoaders creation.
+    augmentations = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def transform_images(examples):
+        images = [augmentations(image.convert("RGB")) for image in examples["image"]]
+        return {"input": images}
+
+    logger.info(f"Dataset size: {len(dataset)}")
+
+    dataset.set_transform(transform_images)
+    train_dataloader = torch.utils.data.DataLoader(
+        dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
+    )
+
+    # Initialize the learning rate scheduler
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=(len(train_dataloader) * args.num_epochs),
+    )
+
+    # Prepare everything with our `accelerator`.
+    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        model, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.use_ema:
+        ema_model.to(accelerator.device)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        run = os.path.split(__file__)[-1].split(".")[0]
+        accelerator.init_trackers(run)
+
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    max_train_steps = args.num_epochs * num_update_steps_per_epoch
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(dataset)}")
+    logger.info(f"  Num Epochs = {args.num_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {max_train_steps}")
+
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Train!
+    for epoch in range(first_epoch, args.num_epochs):
+        model.train()
+        progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not accelerator.is_local_main_process)
+        progress_bar.set_description(f"Epoch {epoch}")
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            clean_images = batch["input"].to(weight_dtype)
+            # Sample noise that we'll add to the images
+            noise = torch.randn(clean_images.shape, dtype=weight_dtype, device=clean_images.device)
+            bsz = clean_images.shape[0]
+            # Sample a random timestep for each image
+            timesteps = torch.randint(
+                0, noise_scheduler.config.num_train_timesteps, (bsz,), device=clean_images.device
+            ).long()
+
+            # Add noise to the clean images according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)
+
+            with accelerator.accumulate(model):
+                # Predict the noise residual
+                model_output = model(noisy_images, timesteps).sample
+
+                if args.prediction_type == "epsilon":
+                    loss = F.mse_loss(model_output.float(), noise.float())  # this could have different weights!
+                elif args.prediction_type == "sample":
+                    alpha_t = _extract_into_tensor(
+                        noise_scheduler.alphas_cumprod, timesteps, (clean_images.shape[0], 1, 1, 1)
+                    )
+                    snr_weights = alpha_t / (1 - alpha_t)
+                    # use SNR weighting from distillation paper
+                    loss = snr_weights * F.mse_loss(model_output.float(), clean_images.float(), reduction="none")
+                    loss = loss.mean()
+                else:
+                    raise ValueError(f"Unsupported prediction type: {args.prediction_type}")
+
+                accelerator.backward(loss)
+
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(model.parameters(), 1.0)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_model.step(model.parameters())
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
+            if args.use_ema:
+                logs["ema_decay"] = ema_model.cur_decay_value
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+        progress_bar.close()
+
+        accelerator.wait_for_everyone()
+
+        # Generate sample images for visual inspection
+        if accelerator.is_main_process:
+            if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1:
+                unet = accelerator.unwrap_model(model)
+
+                if args.use_ema:
+                    ema_model.store(unet.parameters())
+                    ema_model.copy_to(unet.parameters())
+
+                pipeline = DDPMPipeline(
+                    unet=unet,
+                    scheduler=noise_scheduler,
+                )
+
+                generator = torch.Generator(device=pipeline.device).manual_seed(0)
+                # run pipeline in inference (sample random noise and denoise)
+                images = pipeline(
+                    generator=generator,
+                    batch_size=args.eval_batch_size,
+                    num_inference_steps=args.ddpm_num_inference_steps,
+                    output_type="np",
+                ).images
+
+                if args.use_ema:
+                    ema_model.restore(unet.parameters())
+
+                # denormalize the images and save to tensorboard
+                images_processed = (images * 255).round().astype("uint8")
+
+                if args.logger == "tensorboard":
+                    if is_accelerate_version(">=", "0.17.0.dev0"):
+                        tracker = accelerator.get_tracker("tensorboard", unwrap=True)
+                    else:
+                        tracker = accelerator.get_tracker("tensorboard")
+                    tracker.add_images("test_samples", images_processed.transpose(0, 3, 1, 2), epoch)
+                elif args.logger == "wandb":
+                    # Upcoming `log_images` helper coming in https://github.com/huggingface/accelerate/pull/962/files
+                    accelerator.get_tracker("wandb").log(
+                        {"test_samples": [wandb.Image(img) for img in images_processed], "epoch": epoch},
+                        step=global_step,
+                    )
+
+            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
+                # save the model
+                unet = accelerator.unwrap_model(model)
+
+                if args.use_ema:
+                    ema_model.store(unet.parameters())
+                    ema_model.copy_to(unet.parameters())
+
+                pipeline = DDPMPipeline(
+                    unet=unet,
+                    scheduler=noise_scheduler,
+                )
+
+                pipeline.save_pretrained(args.output_dir)
+
+                if args.use_ema:
+                    ema_model.restore(unet.parameters())
+
+                if args.push_to_hub:
+                    upload_folder(
+                        repo_id=repo_id,
+                        folder_path=args.output_dir,
+                        commit_message=f"Epoch {epoch}",
+                        ignore_patterns=["step_*", "epoch_*"],
+                    )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/diffusers/examples/vqgan/README.md b/diffusers/examples/vqgan/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..056bbcaf67487dd538f7dff8b075b2be4a5f814a
--- /dev/null
+++ b/diffusers/examples/vqgan/README.md
@@ -0,0 +1,127 @@
+## Training an VQGAN VAE
+VQVAEs were first introduced in [Neural Discrete Representation Learning](https://arxiv.org/abs/1711.00937) and was combined with a GAN in the paper [Taming Transformers for High-Resolution Image Synthesis](https://arxiv.org/abs/2012.09841). The basic idea of a VQVAE is it's a type of a variational auto encoder with tokens as the latent space similar to tokens for LLMs. This script was adapted from a [pr to huggingface's open-muse project](https://github.com/huggingface/open-muse/pull/52) with general code following [lucidrian's implementation of the vqgan training script](https://github.com/lucidrains/muse-maskgit-pytorch/blob/main/muse_maskgit_pytorch/trainers.py) but both of these implementation follow from the [taming transformer repo](https://github.com/CompVis/taming-transformers?tab=readme-ov-file).
+
+
+Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets).
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd in the example folder  and run
+```bash
+pip install -r requirements.txt
+```
+
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+### Training on CIFAR10
+
+The command to train a VQGAN model on cifar10 dataset:
+
+```bash
+accelerate launch train_vqgan.py \
+  --dataset_name=cifar10 \
+  --image_column=img \
+  --validation_images images/bird.jpg images/car.jpg images/dog.jpg images/frog.jpg \
+  --resolution=128 \
+  --train_batch_size=2 \
+  --gradient_accumulation_steps=8 \
+  --report_to=wandb
+```
+
+An example training run is [here](https://wandb.ai/sayakpaul/vqgan-training/runs/0m5kzdfp) by @sayakpaul and a lower scale one [here](https://wandb.ai/dsbuddy27/vqgan-training/runs/eqd6xi4n?nw=nwuserisamu). The validation images can be obtained from [here](https://huggingface.co/datasets/diffusers/docs-images/tree/main/vqgan_validation_images).
+The simplest way to improve the quality of a VQGAN model is to maximize the amount of information present in the bottleneck. The easiest way to do this is increasing the image resolution. However, other ways include, but not limited to, lowering compression by downsampling fewer times or increasing the vocabulary size which at most can be around 16384. How to do this is shown below.
+
+# Modifying the architecture
+
+To modify the architecture of the vqgan model you can save the config taken from [here](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder/blob/main/movq/config.json) and then provide that to the script with the option --model_config_name_or_path. This config is below
+```
+{
+  "_class_name": "VQModel",
+  "_diffusers_version": "0.17.0.dev0",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    256,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "AttnDownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "norm_type": "spatial",
+  "num_vq_embeddings": 16384,
+  "out_channels": 3,
+  "sample_size": 32,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "AttnUpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "vq_embed_dim": 4
+}
+```
+To lower the amount of layers in a VQGan, you can remove layers by modifying the block_out_channels, down_block_types, and up_block_types like below
+```
+{
+  "_class_name": "VQModel",
+  "_diffusers_version": "0.17.0.dev0",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    256,
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "norm_type": "spatial",
+  "num_vq_embeddings": 16384,
+  "out_channels": 3,
+  "sample_size": 32,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "vq_embed_dim": 4
+}
+```
+For increasing the size of the vocabularies you can increase num_vq_embeddings. However, [some research](https://magvit.cs.cmu.edu/v2/) shows that the representation of VQGANs start degrading after 2^14~16384 vq embeddings so it's not recommended to go past that.
+
+## Extra training tips/ideas
+During logging take care to make sure data_time is low. data_time is the amount spent loading the data and where the GPU is not active. So essentially, it's the time wasted. The easiest way to lower data time is to increase the --dataloader_num_workers to a higher number like 4. Due to a bug in Pytorch, this only works on linux based systems. For more details check [here](https://github.com/huggingface/diffusers/issues/7646)
+Secondly, training should seem to be done when both the discriminator and the generator loss converges.
+Thirdly, another low hanging fruit is just using ema using the --use_ema parameter. This tends to make the output images smoother. This has a con where you have to lower your batch size by 1 but it may be worth it.
+Another more experimental low hanging fruit is changing from the vgg19 to different models for the lpips loss using the --timm_model_backend. If you do this, I recommend also changing the timm_model_layers parameter to the layer in your model which you think is best for representation. However, be careful with the feature map norms since this can easily overdominate the loss.
\ No newline at end of file
diff --git a/diffusers/examples/vqgan/discriminator.py b/diffusers/examples/vqgan/discriminator.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb31c3cb0165533f5587f6c40a046ae7c6555882
--- /dev/null
+++ b/diffusers/examples/vqgan/discriminator.py
@@ -0,0 +1,48 @@
+"""
+Ported from Paella
+"""
+
+import torch
+from torch import nn
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+
+
+# Discriminator model ported from Paella https://github.com/dome272/Paella/blob/main/src_distributed/vqgan.py
+class Discriminator(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(self, in_channels=3, cond_channels=0, hidden_channels=512, depth=6):
+        super().__init__()
+        d = max(depth - 3, 3)
+        layers = [
+            nn.utils.spectral_norm(
+                nn.Conv2d(in_channels, hidden_channels // (2**d), kernel_size=3, stride=2, padding=1)
+            ),
+            nn.LeakyReLU(0.2),
+        ]
+        for i in range(depth - 1):
+            c_in = hidden_channels // (2 ** max((d - i), 0))
+            c_out = hidden_channels // (2 ** max((d - 1 - i), 0))
+            layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
+            layers.append(nn.InstanceNorm2d(c_out))
+            layers.append(nn.LeakyReLU(0.2))
+        self.encoder = nn.Sequential(*layers)
+        self.shuffle = nn.Conv2d(
+            (hidden_channels + cond_channels) if cond_channels > 0 else hidden_channels, 1, kernel_size=1
+        )
+        self.logits = nn.Sigmoid()
+
+    def forward(self, x, cond=None):
+        x = self.encoder(x)
+        if cond is not None:
+            cond = cond.view(
+                cond.size(0),
+                cond.size(1),
+                1,
+                1,
+            ).expand(-1, -1, x.size(-2), x.size(-1))
+            x = torch.cat([x, cond], dim=1)
+        x = self.shuffle(x)
+        x = self.logits(x)
+        return x
diff --git a/diffusers/examples/vqgan/requirements.txt b/diffusers/examples/vqgan/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f204a70f1e0e7fc38f5f262a697d9781beb300e6
--- /dev/null
+++ b/diffusers/examples/vqgan/requirements.txt
@@ -0,0 +1,8 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+datasets
+timm
+numpy
+tqdm
+tensorboard
\ No newline at end of file
diff --git a/diffusers/examples/vqgan/test_vqgan.py b/diffusers/examples/vqgan/test_vqgan.py
new file mode 100644
index 0000000000000000000000000000000000000000..664a7f7365b05cff8edaa8d20f9fbb01e3c56407
--- /dev/null
+++ b/diffusers/examples/vqgan/test_vqgan.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import os
+import shutil
+import sys
+import tempfile
+
+import torch
+
+from diffusers import VQModel
+from diffusers.utils.testing_utils import require_timm
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+@require_timm
+class TextToImage(ExamplesTestsAccelerate):
+    @property
+    def test_vqmodel_config(self):
+        return {
+            "_class_name": "VQModel",
+            "_diffusers_version": "0.17.0.dev0",
+            "act_fn": "silu",
+            "block_out_channels": [
+                32,
+            ],
+            "down_block_types": [
+                "DownEncoderBlock2D",
+            ],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 2,
+            "norm_num_groups": 32,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 32,
+            "out_channels": 3,
+            "sample_size": 32,
+            "scaling_factor": 0.18215,
+            "up_block_types": [
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def test_discriminator_config(self):
+        return {
+            "_class_name": "Discriminator",
+            "_diffusers_version": "0.27.0.dev0",
+            "in_channels": 3,
+            "cond_channels": 0,
+            "hidden_channels": 8,
+            "depth": 4,
+        }
+
+    def get_vq_and_discriminator_configs(self, tmpdir):
+        vqmodel_config_path = os.path.join(tmpdir, "vqmodel.json")
+        discriminator_config_path = os.path.join(tmpdir, "discriminator.json")
+        with open(vqmodel_config_path, "w") as fp:
+            json.dump(self.test_vqmodel_config, fp)
+        with open(discriminator_config_path, "w") as fp:
+            json.dump(self.test_discriminator_config, fp)
+        return vqmodel_config_path, discriminator_config_path
+
+    def test_vqmodel(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
+            test_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(
+                os.path.isfile(os.path.join(tmpdir, "discriminator", "diffusion_pytorch_model.safetensors"))
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "vqmodel", "diffusion_pytorch_model.safetensors")))
+
+    def test_vqmodel_checkpointing(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --checkpointing_steps=2
+                --output_dir {tmpdir}
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            # check can run an intermediate checkpoint
+            model = VQModel.from_pretrained(tmpdir, subfolder="checkpoint-2/vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
+            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4"},
+            )
+
+            # Run training script for 2 total steps resuming from checkpoint 4
+
+            resume_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --checkpointing_steps=1
+                --resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')}
+                --output_dir {tmpdir}
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check can run new fully trained pipeline
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # no checkpoint-2 -> check old checkpoints do not exist
+            # check new checkpoints exist
+            # In the current script, checkpointing_steps 1 is equivalent to checkpointing_steps 2 as after the generator gets trained for one step,
+            # the discriminator gets trained and loss and saving happens after that. Thus we do not expect to get a checkpoint-5
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_vqmodel_checkpointing_use_ema(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --checkpointing_steps=2
+                --output_dir {tmpdir}
+                --use_ema
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            # check can run an intermediate checkpoint
+            model = VQModel.from_pretrained(tmpdir, subfolder="checkpoint-2/vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
+            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
+
+            # Run training script for 2 total steps resuming from checkpoint 4
+
+            resume_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --checkpointing_steps=1
+                --resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')}
+                --output_dir {tmpdir}
+                --use_ema
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check can run new fully trained pipeline
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # no checkpoint-2 -> check old checkpoints do not exist
+            # check new checkpoints exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_vqmodel_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
+            # Run training script with checkpointing
+            # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
+            # Should create checkpoints at steps 2, 4, 6
+            # with checkpoint at step 2 deleted
+
+            initial_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # check checkpoint directories exist
+            # checkpoint-2 should have been deleted
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
+
+    def test_vqmodel_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --checkpointing_steps=2
+                --output_dir {tmpdir}
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            # resume and we should try to checkpoint at 6, where we'll have to remove
+            # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
+
+            resume_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 8
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')}
+                --checkpoints_total_limit=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-6", "checkpoint-8"},
+            )
diff --git a/diffusers/examples/vqgan/train_vqgan.py b/diffusers/examples/vqgan/train_vqgan.py
new file mode 100644
index 0000000000000000000000000000000000000000..d16dce921896aa9cced7944f29bbfa2adf99e507
--- /dev/null
+++ b/diffusers/examples/vqgan/train_vqgan.py
@@ -0,0 +1,1067 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import math
+import os
+import shutil
+import time
+from pathlib import Path
+
+import accelerate
+import numpy as np
+import PIL
+import PIL.Image
+import timm
+import torch
+import torch.nn.functional as F
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedType, ProjectConfiguration, set_seed
+from datasets import load_dataset
+from discriminator import Discriminator
+from huggingface_hub import create_repo
+from packaging import version
+from PIL import Image
+from timm.data import resolve_data_config
+from timm.data.transforms_factory import create_transform
+from torchvision import transforms
+from tqdm import tqdm
+
+from diffusers import VQModel
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel
+from diffusers.utils import check_min_version, is_wandb_available
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def _map_layer_to_idx(backbone, layers, offset=0):
+    """Maps set of layer names to indices of model. Ported from anomalib
+
+    Returns:
+        Feature map extracted from the CNN
+    """
+    idx = []
+    features = timm.create_model(
+        backbone,
+        pretrained=False,
+        features_only=False,
+        exportable=True,
+    )
+    for i in layers:
+        try:
+            idx.append(list(dict(features.named_children()).keys()).index(i) - offset)
+        except ValueError:
+            raise ValueError(
+                f"Layer {i} not found in model {backbone}. Select layer from {list(dict(features.named_children()).keys())}. The network architecture is {features}"
+            )
+    return idx
+
+
+def get_perceptual_loss(pixel_values, fmap, timm_model, timm_model_resolution, timm_model_normalization):
+    img_timm_model_input = timm_model_normalization(F.interpolate(pixel_values, timm_model_resolution))
+    fmap_timm_model_input = timm_model_normalization(F.interpolate(fmap, timm_model_resolution))
+
+    if pixel_values.shape[1] == 1:
+        # handle grayscale for timm_model
+        img_timm_model_input, fmap_timm_model_input = (
+            t.repeat(1, 3, 1, 1) for t in (img_timm_model_input, fmap_timm_model_input)
+        )
+
+    img_timm_model_feats = timm_model(img_timm_model_input)
+    recon_timm_model_feats = timm_model(fmap_timm_model_input)
+    perceptual_loss = F.mse_loss(img_timm_model_feats[0], recon_timm_model_feats[0])
+    for i in range(1, len(img_timm_model_feats)):
+        perceptual_loss += F.mse_loss(img_timm_model_feats[i], recon_timm_model_feats[i])
+    perceptual_loss /= len(img_timm_model_feats)
+    return perceptual_loss
+
+
+def grad_layer_wrt_loss(loss, layer):
+    return torch.autograd.grad(
+        outputs=loss,
+        inputs=layer,
+        grad_outputs=torch.ones_like(loss),
+        retain_graph=True,
+    )[0].detach()
+
+
+def gradient_penalty(images, output, weight=10):
+    gradients = torch.autograd.grad(
+        outputs=output,
+        inputs=images,
+        grad_outputs=torch.ones(output.size(), device=images.device),
+        create_graph=True,
+        retain_graph=True,
+        only_inputs=True,
+    )[0]
+    bsz = gradients.shape[0]
+    gradients = torch.reshape(gradients, (bsz, -1))
+    return weight * ((gradients.norm(2, dim=1) - 1) ** 2).mean()
+
+
+@torch.no_grad()
+def log_validation(model, args, validation_transform, accelerator, global_step):
+    logger.info("Generating images...")
+    dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        dtype = torch.bfloat16
+    original_images = []
+    for image_path in args.validation_images:
+        image = PIL.Image.open(image_path)
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+        image = validation_transform(image).to(accelerator.device, dtype=dtype)
+        original_images.append(image[None])
+    # Generate images
+    model.eval()
+    images = []
+    for original_image in original_images:
+        image = accelerator.unwrap_model(model)(original_image).sample
+        images.append(image)
+    model.train()
+    original_images = torch.cat(original_images, dim=0)
+    images = torch.cat(images, dim=0)
+
+    # Convert to PIL images
+    images = torch.clamp(images, 0.0, 1.0)
+    original_images = torch.clamp(original_images, 0.0, 1.0)
+    images *= 255.0
+    original_images *= 255.0
+    images = images.permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
+    original_images = original_images.permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
+    images = np.concatenate([original_images, images], axis=2)
+    images = [Image.fromarray(image) for image in images]
+
+    # Log images
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, global_step, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: Original, Generated") for i, image in enumerate(images)
+                    ]
+                },
+                step=global_step,
+            )
+    torch.cuda.empty_cache()
+    return images
+
+
+def log_grad_norm(model, accelerator, global_step):
+    for name, param in model.named_parameters():
+        if param.grad is not None:
+            grads = param.grad.detach().data
+            grad_norm = (grads.norm(p=2) / grads.numel()).item()
+            accelerator.log({"grad_norm/" + name: grad_norm}, step=global_step)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--log_grad_norm_steps",
+        type=int,
+        default=500,
+        help=("Print logs of gradient norms every X steps."),
+    )
+    parser.add_argument(
+        "--log_steps",
+        type=int,
+        default=50,
+        help=("Print logs every X steps."),
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running reconstruction on images in"
+            " `args.validation_images` and logging the reconstructed images."
+        ),
+    )
+    parser.add_argument(
+        "--vae_loss",
+        type=str,
+        default="l2",
+        help="The loss function for vae reconstruction loss.",
+    )
+    parser.add_argument(
+        "--timm_model_offset",
+        type=int,
+        default=0,
+        help="Offset of timm layers to indices.",
+    )
+    parser.add_argument(
+        "--timm_model_layers",
+        type=str,
+        default="head",
+        help="The layers to get output from in the timm model.",
+    )
+    parser.add_argument(
+        "--timm_model_backend",
+        type=str,
+        default="vgg19",
+        help="Timm model used to get the lpips loss",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--model_config_name_or_path",
+        type=str,
+        default=None,
+        help="The config of the Vq model to train, leave as None to use standard Vq model configuration.",
+    )
+    parser.add_argument(
+        "--discriminator_config_name_or_path",
+        type=str,
+        default=None,
+        help="The config of the discriminator model to train, leave as None to use standard Vq model configuration.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--validation_images",
+        type=str,
+        default=None,
+        nargs="+",
+        help=("A set of validation images evaluated every `--validation_steps` and logged to `--report_to`."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="vqgan-output",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--discr_learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--discr_lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--non_ema_revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or"
+            " remote repository specified with --pretrained_model_name_or_path."
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.",
+    )
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="vqgan-training",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    # default to using the same revision for the non-ema model if not specified
+    if args.non_ema_revision is None:
+        args.non_ema_revision = args.revision
+
+    return args
+
+
+def main():
+    #########################
+    # SETUP Accelerator     #
+    #########################
+    args = parse_args()
+
+    # Enable TF32 on Ampere GPUs
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.benchmark = True
+        torch.backends.cudnn.deterministic = False
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    if accelerator.distributed_type == DistributedType.DEEPSPEED:
+        accelerator.state.deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = args.train_batch_size
+
+    #####################################
+    # SETUP LOGGING, SEED and CONFIG    #
+    #####################################
+
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        tracker_config.pop("validation_images")
+        accelerator.init_trackers(args.tracker_project_name, tracker_config)
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    #########################
+    # MODELS and OPTIMIZER  #
+    #########################
+    logger.info("Loading models and optimizer")
+
+    if args.model_config_name_or_path is None and args.pretrained_model_name_or_path is None:
+        # Taken from config of movq at kandinsky-community/kandinsky-2-2-decoder but without the attention layers
+        model = VQModel(
+            act_fn="silu",
+            block_out_channels=[
+                128,
+                256,
+                512,
+            ],
+            down_block_types=[
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+            ],
+            in_channels=3,
+            latent_channels=4,
+            layers_per_block=2,
+            norm_num_groups=32,
+            norm_type="spatial",
+            num_vq_embeddings=16384,
+            out_channels=3,
+            sample_size=32,
+            scaling_factor=0.18215,
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            vq_embed_dim=4,
+        )
+    elif args.pretrained_model_name_or_path is not None:
+        model = VQModel.from_pretrained(args.pretrained_model_name_or_path)
+    else:
+        config = VQModel.load_config(args.model_config_name_or_path)
+        model = VQModel.from_config(config)
+    if args.use_ema:
+        ema_model = EMAModel(model.parameters(), model_cls=VQModel, model_config=model.config)
+    if args.discriminator_config_name_or_path is None:
+        discriminator = Discriminator()
+    else:
+        config = Discriminator.load_config(args.discriminator_config_name_or_path)
+        discriminator = Discriminator.from_config(config)
+
+    idx = _map_layer_to_idx(args.timm_model_backend, args.timm_model_layers.split("|"), args.timm_model_offset)
+
+    timm_model = timm.create_model(
+        args.timm_model_backend,
+        pretrained=True,
+        features_only=True,
+        exportable=True,
+        out_indices=idx,
+    )
+    timm_model = timm_model.to(accelerator.device)
+    timm_model.requires_grad = False
+    timm_model.eval()
+    timm_transform = create_transform(**resolve_data_config(timm_model.pretrained_cfg, model=timm_model))
+    try:
+        # Gets the resolution of the timm transformation after centercrop
+        timm_centercrop_transform = timm_transform.transforms[1]
+        assert isinstance(
+            timm_centercrop_transform, transforms.CenterCrop
+        ), f"Timm model {timm_model} is currently incompatible with this script. Try vgg19."
+        timm_model_resolution = timm_centercrop_transform.size[0]
+        # Gets final normalization
+        timm_model_normalization = timm_transform.transforms[-1]
+        assert isinstance(
+            timm_model_normalization, transforms.Normalize
+        ), f"Timm model {timm_model} is currently incompatible with this script. Try vgg19."
+    except AssertionError as e:
+        raise NotImplementedError(e)
+    # Enable flash attention if asked
+    if args.enable_xformers_memory_efficient_attention:
+        model.enable_xformers_memory_efficient_attention()
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if accelerator.is_main_process:
+                if args.use_ema:
+                    ema_model.save_pretrained(os.path.join(output_dir, "vqmodel_ema"))
+                vqmodel = models[0]
+                discriminator = models[1]
+                vqmodel.save_pretrained(os.path.join(output_dir, "vqmodel"))
+                discriminator.save_pretrained(os.path.join(output_dir, "discriminator"))
+                weights.pop()
+                weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "vqmodel_ema"), VQModel)
+                ema_model.load_state_dict(load_model.state_dict())
+                ema_model.to(accelerator.device)
+                del load_model
+            discriminator = models.pop()
+            load_model = Discriminator.from_pretrained(input_dir, subfolder="discriminator")
+            discriminator.register_to_config(**load_model.config)
+            discriminator.load_state_dict(load_model.state_dict())
+            del load_model
+            vqmodel = models.pop()
+            load_model = VQModel.from_pretrained(input_dir, subfolder="vqmodel")
+            vqmodel.register_to_config(**load_model.config)
+            vqmodel.load_state_dict(load_model.state_dict())
+            del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    learning_rate = args.learning_rate
+    if args.scale_lr:
+        learning_rate = (
+            learning_rate * args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        list(model.parameters()),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+    discr_optimizer = optimizer_cls(
+        list(discriminator.parameters()),
+        lr=args.discr_learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    ##################################
+    # DATLOADER and LR-SCHEDULER     #
+    #################################
+    logger.info("Creating dataloaders and lr_scheduler")
+
+    args.train_batch_size * accelerator.num_processes
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    # DataLoaders creation:
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            data_dir=args.train_data_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    assert args.image_column is not None
+    image_column = args.image_column
+    if image_column not in column_names:
+        raise ValueError(f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}")
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+        ]
+    )
+    validation_transform = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.ToTensor(),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        return {"pixel_values": pixel_values}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.lr_warmup_steps,
+    )
+    discr_lr_scheduler = get_scheduler(
+        args.discr_lr_scheduler,
+        optimizer=discr_optimizer,
+        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.lr_warmup_steps,
+    )
+
+    # Prepare everything with accelerator
+    logger.info("Preparing model, optimizer and dataloaders")
+    # The dataloader are already aware of distributed training, so we don't need to prepare them.
+    model, discriminator, optimizer, discr_optimizer, lr_scheduler, discr_lr_scheduler = accelerator.prepare(
+        model, discriminator, optimizer, discr_optimizer, lr_scheduler, discr_lr_scheduler
+    )
+    if args.use_ema:
+        ema_model.to(accelerator.device)
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # Potentially load in the weights and states from a previous save
+    resume_from_checkpoint = args.resume_from_checkpoint
+    if resume_from_checkpoint:
+        if resume_from_checkpoint != "latest":
+            path = resume_from_checkpoint
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+            path = os.path.join(args.output_dir, path)
+
+        if path is None:
+            accelerator.print(f"Checkpoint '{resume_from_checkpoint}' does not exist. Starting a new training run.")
+            resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(path)
+            accelerator.wait_for_everyone()
+            global_step = int(os.path.basename(path).split("-")[1])
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    batch_time_m = AverageMeter()
+    data_time_m = AverageMeter()
+    end = time.time()
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+    # As stated above, we are not doing epoch based training here, but just using this for book keeping and being able to
+    # reuse the same training loop with other datasets/loaders.
+    avg_gen_loss, avg_discr_loss = None, None
+    for epoch in range(first_epoch, args.num_train_epochs):
+        model.train()
+        discriminator.train()
+        for i, batch in enumerate(train_dataloader):
+            pixel_values = batch["pixel_values"]
+            pixel_values = pixel_values.to(accelerator.device, non_blocking=True)
+            data_time_m.update(time.time() - end)
+            generator_step = ((i // args.gradient_accumulation_steps) % 2) == 0
+            # Train Step
+            # The behavior of accelerator.accumulate is to
+            # 1. Check if gradients are synced(reached gradient-accumulation_steps)
+            # 2. If so sync gradients by stopping the not syncing process
+            if generator_step:
+                optimizer.zero_grad(set_to_none=True)
+            else:
+                discr_optimizer.zero_grad(set_to_none=True)
+            # encode images to the latent space and get the commit loss from vq tokenization
+            # Return commit loss
+            fmap, commit_loss = model(pixel_values, return_dict=False)
+
+            if generator_step:
+                with accelerator.accumulate(model):
+                    # reconstruction loss. Pixel level differences between input vs output
+                    if args.vae_loss == "l2":
+                        loss = F.mse_loss(pixel_values, fmap)
+                    else:
+                        loss = F.l1_loss(pixel_values, fmap)
+                    # perceptual loss. The high level feature mean squared error loss
+                    perceptual_loss = get_perceptual_loss(
+                        pixel_values,
+                        fmap,
+                        timm_model,
+                        timm_model_resolution=timm_model_resolution,
+                        timm_model_normalization=timm_model_normalization,
+                    )
+                    # generator loss
+                    gen_loss = -discriminator(fmap).mean()
+                    last_dec_layer = accelerator.unwrap_model(model).decoder.conv_out.weight
+                    norm_grad_wrt_perceptual_loss = grad_layer_wrt_loss(perceptual_loss, last_dec_layer).norm(p=2)
+                    norm_grad_wrt_gen_loss = grad_layer_wrt_loss(gen_loss, last_dec_layer).norm(p=2)
+
+                    adaptive_weight = norm_grad_wrt_perceptual_loss / norm_grad_wrt_gen_loss.clamp(min=1e-8)
+                    adaptive_weight = adaptive_weight.clamp(max=1e4)
+                    loss += commit_loss
+                    loss += perceptual_loss
+                    loss += adaptive_weight * gen_loss
+                    # Gather the losses across all processes for logging (if we use distributed training).
+                    avg_gen_loss = accelerator.gather(loss.repeat(args.train_batch_size)).float().mean()
+                    accelerator.backward(loss)
+
+                    if args.max_grad_norm is not None and accelerator.sync_gradients:
+                        accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                    optimizer.step()
+                    lr_scheduler.step()
+                    # log gradient norm before zeroing it
+                    if (
+                        accelerator.sync_gradients
+                        and global_step % args.log_grad_norm_steps == 0
+                        and accelerator.is_main_process
+                    ):
+                        log_grad_norm(model, accelerator, global_step)
+            else:
+                # Return discriminator loss
+                with accelerator.accumulate(discriminator):
+                    fmap.detach_()
+                    pixel_values.requires_grad_()
+                    real = discriminator(pixel_values)
+                    fake = discriminator(fmap)
+                    loss = (F.relu(1 + fake) + F.relu(1 - real)).mean()
+                    gp = gradient_penalty(pixel_values, real)
+                    loss += gp
+                    avg_discr_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                    accelerator.backward(loss)
+
+                    if args.max_grad_norm is not None and accelerator.sync_gradients:
+                        accelerator.clip_grad_norm_(discriminator.parameters(), args.max_grad_norm)
+
+                    discr_optimizer.step()
+                    discr_lr_scheduler.step()
+                    if (
+                        accelerator.sync_gradients
+                        and global_step % args.log_grad_norm_steps == 0
+                        and accelerator.is_main_process
+                    ):
+                        log_grad_norm(discriminator, accelerator, global_step)
+            batch_time_m.update(time.time() - end)
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                global_step += 1
+                progress_bar.update(1)
+                if args.use_ema:
+                    ema_model.step(model.parameters())
+            if accelerator.sync_gradients and not generator_step and accelerator.is_main_process:
+                # wait for both generator and discriminator to settle
+                # Log metrics
+                if global_step % args.log_steps == 0:
+                    samples_per_second_per_gpu = (
+                        args.gradient_accumulation_steps * args.train_batch_size / batch_time_m.val
+                    )
+                    logs = {
+                        "step_discr_loss": avg_discr_loss.item(),
+                        "lr": lr_scheduler.get_last_lr()[0],
+                        "samples/sec/gpu": samples_per_second_per_gpu,
+                        "data_time": data_time_m.val,
+                        "batch_time": batch_time_m.val,
+                    }
+                    if avg_gen_loss is not None:
+                        logs["step_gen_loss"] = avg_gen_loss.item()
+                    accelerator.log(logs, step=global_step)
+
+                    # resetting batch / data time meters per log window
+                    batch_time_m.reset()
+                    data_time_m.reset()
+                # Save model checkpoint
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                # Generate images
+                if global_step % args.validation_steps == 0:
+                    if args.use_ema:
+                        # Store the VQGAN parameters temporarily and load the EMA parameters to perform inference.
+                        ema_model.store(model.parameters())
+                        ema_model.copy_to(model.parameters())
+                    log_validation(model, args, validation_transform, accelerator, global_step)
+                    if args.use_ema:
+                        # Switch back to the original VQGAN parameters.
+                        ema_model.restore(model.parameters())
+            end = time.time()
+            # Stop training if max steps is reached
+            if global_step >= args.max_train_steps:
+                break
+        # End for
+
+    accelerator.wait_for_everyone()
+
+    # Save the final trained checkpoint
+    if accelerator.is_main_process:
+        model = accelerator.unwrap_model(model)
+        discriminator = accelerator.unwrap_model(discriminator)
+        if args.use_ema:
+            ema_model.copy_to(model.parameters())
+        model.save_pretrained(os.path.join(args.output_dir, "vqmodel"))
+        discriminator.save_pretrained(os.path.join(args.output_dir, "discriminator"))
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/wuerstchen/text_to_image/README.md b/diffusers/examples/wuerstchen/text_to_image/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a6ec4698b6118216ab3d1293cafa58b1ef133024
--- /dev/null
+++ b/diffusers/examples/wuerstchen/text_to_image/README.md
@@ -0,0 +1,93 @@
+# Würstchen text-to-image fine-tuning
+
+## Running locally with PyTorch
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd into the example folder and run
+```bash
+cd examples/wuerstchen/text_to_image
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+For this example we want to directly store the trained LoRA embeddings on the Hub, so we need to be logged in and add the `--push_to_hub` flag to the training script. To log in, run:
+```bash
+huggingface-cli login
+```
+
+## Prior training
+
+You can fine-tune the Würstchen prior model with the `train_text_to_image_prior.py` script. Note that we currently support `--gradient_checkpointing` for prior model fine-tuning so you can use it for more GPU memory constrained setups.
+
+<br>
+
+<!-- accelerate_snippet_start -->
+```bash
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+accelerate launch  train_text_to_image_prior.py \
+  --mixed_precision="fp16" \
+  --dataset_name=$DATASET_NAME \
+  --resolution=768 \
+  --train_batch_size=4 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --dataloader_num_workers=4 \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --checkpoints_total_limit=3 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --validation_prompts="A robot naruto, 4k photo" \
+  --report_to="wandb" \
+  --push_to_hub \
+  --output_dir="wuerstchen-prior-naruto-model"
+```
+<!-- accelerate_snippet_end -->
+
+## Training with LoRA
+
+Low-Rank Adaption of Large Language Models (or LoRA) was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
+
+In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
+
+- Previous pretrained weights are kept frozen so that the model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114).
+- Rank-decomposition matrices have significantly fewer parameters than original model, which means that trained LoRA weights are easily portable.
+- LoRA attention layers allow to control to which extent the model is adapted toward new training images via a `scale` parameter.
+
+
+### Prior Training
+
+First, you need to set up your development environment as explained in the [installation](#Running-locally-with-PyTorch) section. Make sure to set the `DATASET_NAME` environment variable. Here, we will use the [Naruto captions dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
+
+```bash
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
+
+accelerate launch train_text_to_image_lora_prior.py \
+  --mixed_precision="fp16" \
+  --dataset_name=$DATASET_NAME --caption_column="text" \
+  --resolution=768 \
+  --train_batch_size=8 \
+  --num_train_epochs=100 --checkpointing_steps=5000 \
+  --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --seed=42 \
+  --rank=4 \
+  --validation_prompt="cute dragon creature" \
+  --report_to="wandb" \
+  --push_to_hub \
+  --output_dir="wuerstchen-prior-naruto-lora"
+```
diff --git a/diffusers/examples/wuerstchen/text_to_image/__init__.py b/diffusers/examples/wuerstchen/text_to_image/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/examples/wuerstchen/text_to_image/modeling_efficient_net_encoder.py b/diffusers/examples/wuerstchen/text_to_image/modeling_efficient_net_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd551ebf1623af6d03cff998b258f19ee1294245
--- /dev/null
+++ b/diffusers/examples/wuerstchen/text_to_image/modeling_efficient_net_encoder.py
@@ -0,0 +1,23 @@
+import torch.nn as nn
+from torchvision.models import efficientnet_v2_l, efficientnet_v2_s
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+
+
+class EfficientNetEncoder(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(self, c_latent=16, c_cond=1280, effnet="efficientnet_v2_s"):
+        super().__init__()
+
+        if effnet == "efficientnet_v2_s":
+            self.backbone = efficientnet_v2_s(weights="DEFAULT").features
+        else:
+            self.backbone = efficientnet_v2_l(weights="DEFAULT").features
+        self.mapper = nn.Sequential(
+            nn.Conv2d(c_cond, c_latent, kernel_size=1, bias=False),
+            nn.BatchNorm2d(c_latent),  # then normalize them to have mean 0 and std 1
+        )
+
+    def forward(self, x):
+        return self.mapper(self.backbone(x))
diff --git a/diffusers/examples/wuerstchen/text_to_image/requirements.txt b/diffusers/examples/wuerstchen/text_to_image/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dfe8f3db46a2abc0f1c5f2cbab8278116434ba50
--- /dev/null
+++ b/diffusers/examples/wuerstchen/text_to_image/requirements.txt
@@ -0,0 +1,7 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+wandb
+bitsandbytes
+deepspeed
+peft>=0.6.0
diff --git a/diffusers/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py b/diffusers/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4c9a44bb5b25a63ce742eb543ee73d1b4087171
--- /dev/null
+++ b/diffusers/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
@@ -0,0 +1,949 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.state import AcceleratorState, is_initialized
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, hf_hub_download, upload_folder
+from modeling_efficient_net_encoder import EfficientNetEncoder
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict
+from torchvision import transforms
+from tqdm import tqdm
+from transformers import CLIPTextModel, PreTrainedTokenizerFast
+from transformers.utils import ContextManagers
+
+from diffusers import AutoPipelineForText2Image, DDPMWuerstchenScheduler, WuerstchenPriorPipeline
+from diffusers.optimization import get_scheduler
+from diffusers.pipelines.wuerstchen import DEFAULT_STAGE_C_TIMESTEPS, WuerstchenPrior
+from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
+from diffusers.utils.logging import set_verbosity_error, set_verbosity_info
+
+
+if is_wandb_available():
+    import wandb
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def save_model_card(
+    args,
+    repo_id: str,
+    images=None,
+    repo_folder=None,
+):
+    img_str = ""
+    if len(images) > 0:
+        image_grid = make_image_grid(images, 1, len(args.validation_prompts))
+        image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png"))
+        img_str += "![val_imgs_grid](./val_imgs_grid.png)\n"
+
+    yaml = f"""
+---
+license: mit
+base_model: {args.pretrained_prior_model_name_or_path}
+datasets:
+- {args.dataset_name}
+tags:
+- wuerstchen
+- text-to-image
+- diffusers
+- diffusers-training
+- lora
+inference: true
+---
+    """
+    model_card = f"""
+# LoRA Finetuning - {repo_id}
+
+This pipeline was finetuned from **{args.pretrained_prior_model_name_or_path}** on the **{args.dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompts: {args.validation_prompts}: \n
+{img_str}
+
+## Pipeline usage
+
+You can use the pipeline like so:
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained(
+                "{args.pretrained_decoder_model_name_or_path}", torch_dtype={args.weight_dtype}
+            )
+# load lora weights from folder:
+pipeline.prior_pipe.load_lora_weights("{repo_id}", torch_dtype={args.weight_dtype})
+
+image = pipeline(prompt=prompt).images[0]
+image.save("my_image.png")
+```
+
+## Training info
+
+These are the key hyperparameters used during training:
+
+* LoRA rank: {args.rank}
+* Epochs: {args.num_train_epochs}
+* Learning rate: {args.learning_rate}
+* Batch size: {args.train_batch_size}
+* Gradient accumulation steps: {args.gradient_accumulation_steps}
+* Image resolution: {args.resolution}
+* Mixed-precision: {args.mixed_precision}
+
+"""
+    wandb_info = ""
+    if is_wandb_available():
+        wandb_run_url = None
+        if wandb.run is not None:
+            wandb_run_url = wandb.run.url
+
+    if wandb_run_url is not None:
+        wandb_info = f"""
+More information on all the CLI arguments and the environment are available on your [`wandb` run page]({wandb_run_url}).
+"""
+
+    model_card += wandb_info
+
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def log_validation(text_encoder, tokenizer, prior, args, accelerator, weight_dtype, epoch):
+    logger.info("Running validation... ")
+
+    pipeline = AutoPipelineForText2Image.from_pretrained(
+        args.pretrained_decoder_model_name_or_path,
+        prior=accelerator.unwrap_model(prior),
+        prior_text_encoder=accelerator.unwrap_model(text_encoder),
+        prior_tokenizer=tokenizer,
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    images = []
+    for i in range(len(args.validation_prompts)):
+        with torch.cuda.amp.autocast():
+            image = pipeline(
+                args.validation_prompts[i],
+                prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
+                generator=generator,
+                height=args.resolution,
+                width=args.resolution,
+            ).images[0]
+        images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompts[i]}")
+                        for i, image in enumerate(images)
+                    ]
+                }
+            )
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of finetuning Würstchen Prior.")
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--pretrained_decoder_model_name_or_path",
+        type=str,
+        default="warp-ai/wuerstchen",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_prior_model_name_or_path",
+        type=str,
+        default="warp-ai/wuerstchen-prior",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompts",
+        type=str,
+        default=None,
+        nargs="+",
+        help=("A set of prompts evaluated every `--validation_epochs` and logged to `--report_to`."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="wuerstchen-model-finetuned-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=1, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="learning rate",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument(
+        "--adam_weight_decay",
+        type=float,
+        default=0.0,
+        required=False,
+        help="weight decay_to_use",
+    )
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=5,
+        help="Run validation every X epochs.",
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+def main():
+    args = parse_args()
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load scheduler, effnet, tokenizer, clip_model
+    noise_scheduler = DDPMWuerstchenScheduler()
+    tokenizer = PreTrainedTokenizerFast.from_pretrained(
+        args.pretrained_prior_model_name_or_path, subfolder="tokenizer"
+    )
+
+    def deepspeed_zero_init_disabled_context_manager():
+        """
+        returns either a context list that includes one that will disable zero.Init or an empty context list
+        """
+        deepspeed_plugin = AcceleratorState().deepspeed_plugin if is_initialized() else None
+        if deepspeed_plugin is None:
+            return []
+
+        return [deepspeed_plugin.zero3_init_context_manager(enable=False)]
+
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+    with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+        pretrained_checkpoint_file = hf_hub_download("dome272/wuerstchen", filename="model_v2_stage_b.pt")
+        state_dict = torch.load(pretrained_checkpoint_file, map_location="cpu")
+        image_encoder = EfficientNetEncoder()
+        image_encoder.load_state_dict(state_dict["effnet_state_dict"])
+        image_encoder.eval()
+
+        text_encoder = CLIPTextModel.from_pretrained(
+            args.pretrained_prior_model_name_or_path, subfolder="text_encoder", torch_dtype=weight_dtype
+        ).eval()
+
+    # Freeze text_encoder, cast to weight_dtype and image_encoder and move to device
+    text_encoder.requires_grad_(False)
+    image_encoder.requires_grad_(False)
+    image_encoder.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # load prior model, cast to weight_dtype and move to device
+    prior = WuerstchenPrior.from_pretrained(args.pretrained_prior_model_name_or_path, subfolder="prior")
+    prior.to(accelerator.device, dtype=weight_dtype)
+
+    # lora attn processor
+    prior_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        target_modules=["to_k", "to_q", "to_v", "to_out.0", "add_k_proj", "add_v_proj"],
+    )
+    # Add adapter and make sure the trainable params are in float32.
+    prior.add_adapter(prior_lora_config)
+    if args.mixed_precision == "fp16":
+        for param in prior.parameters():
+            # only upcast trainable parameters (LoRA) into fp32
+            if param.requires_grad:
+                param.data = param.to(torch.float32)
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            prior_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(accelerator.unwrap_model(prior))):
+                    prior_lora_layers_to_save = get_peft_model_state_dict(model)
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            WuerstchenPriorPipeline.save_lora_weights(
+                output_dir,
+                unet_lora_layers=prior_lora_layers_to_save,
+            )
+
+    def load_model_hook(models, input_dir):
+        prior_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(accelerator.unwrap_model(prior))):
+                prior_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict, network_alphas = WuerstchenPriorPipeline.lora_state_dict(input_dir)
+        WuerstchenPriorPipeline.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=prior_)
+        WuerstchenPriorPipeline.load_lora_into_text_encoder(
+            lora_state_dict,
+            network_alphas=network_alphas,
+        )
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+    params_to_optimize = list(filter(lambda p: p.requires_grad, prior.parameters()))
+    optimizer = optimizer_cls(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        text_input_ids = inputs.input_ids
+        text_mask = inputs.attention_mask.bool()
+        return text_input_ids, text_mask
+
+    effnet_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR, antialias=True),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["effnet_pixel_values"] = [effnet_transforms(image) for image in images]
+        examples["text_input_ids"], examples["text_mask"] = tokenize_captions(examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        effnet_pixel_values = torch.stack([example["effnet_pixel_values"] for example in examples])
+        effnet_pixel_values = effnet_pixel_values.to(memory_format=torch.contiguous_format).float()
+        text_input_ids = torch.stack([example["text_input_ids"] for example in examples])
+        text_mask = torch.stack([example["text_mask"] for example in examples])
+        return {"effnet_pixel_values": effnet_pixel_values, "text_input_ids": text_input_ids, "text_mask": text_mask}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    prior, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        prior, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        tracker_config.pop("validation_prompts")
+        accelerator.init_trackers(args.tracker_project_name, tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        prior.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(prior):
+                # Convert images to latent space
+                text_input_ids, text_mask, effnet_images = (
+                    batch["text_input_ids"],
+                    batch["text_mask"],
+                    batch["effnet_pixel_values"].to(weight_dtype),
+                )
+
+                with torch.no_grad():
+                    text_encoder_output = text_encoder(text_input_ids, attention_mask=text_mask)
+                    prompt_embeds = text_encoder_output.last_hidden_state
+                    image_embeds = image_encoder(effnet_images)
+                    # scale
+                    image_embeds = image_embeds.add(1.0).div(42.0)
+
+                    # Sample noise that we'll add to the image_embeds
+                    noise = torch.randn_like(image_embeds)
+                    bsz = image_embeds.shape[0]
+
+                    # Sample a random timestep for each image
+                    timesteps = torch.rand((bsz,), device=image_embeds.device, dtype=weight_dtype)
+
+                    # add noise to latent
+                    noisy_latents = noise_scheduler.add_noise(image_embeds, noise, timesteps)
+
+                # Predict the noise residual and compute losscd
+                pred_noise = prior(noisy_latents, timesteps, prompt_embeds)
+
+                # vanilla loss
+                loss = F.mse_loss(pred_noise.float(), noise.float(), reduction="mean")
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompts is not None and epoch % args.validation_epochs == 0:
+                log_validation(text_encoder, tokenizer, prior, args, accelerator, weight_dtype, global_step)
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        prior = accelerator.unwrap_model(prior)
+        prior = prior.to(torch.float32)
+
+        prior_lora_state_dict = get_peft_model_state_dict(prior)
+
+        WuerstchenPriorPipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            unet_lora_layers=prior_lora_state_dict,
+        )
+
+        # Run a final round of inference.
+        images = []
+        if args.validation_prompts is not None:
+            logger.info("Running inference for collecting generated images...")
+            pipeline = AutoPipelineForText2Image.from_pretrained(
+                args.pretrained_decoder_model_name_or_path,
+                prior_text_encoder=accelerator.unwrap_model(text_encoder),
+                prior_tokenizer=tokenizer,
+                torch_dtype=weight_dtype,
+            )
+            pipeline = pipeline.to(accelerator.device)
+
+            # load lora weights
+            pipeline.prior_pipe.load_lora_weights(args.output_dir, weight_name="pytorch_lora_weights.safetensors")
+            pipeline.set_progress_bar_config(disable=True)
+
+            if args.seed is None:
+                generator = None
+            else:
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+            for i in range(len(args.validation_prompts)):
+                with torch.cuda.amp.autocast():
+                    image = pipeline(
+                        args.validation_prompts[i],
+                        prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
+                        generator=generator,
+                        width=args.resolution,
+                        height=args.resolution,
+                    ).images[0]
+                images.append(image)
+
+        if args.push_to_hub:
+            save_model_card(args, repo_id, images, repo_folder=args.output_dir)
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/examples/wuerstchen/text_to_image/train_text_to_image_prior.py b/diffusers/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8de69203a9f840d70d0ebf25a87ee8050e58d
--- /dev/null
+++ b/diffusers/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
@@ -0,0 +1,936 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+from pathlib import Path
+
+import accelerate
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.state import AcceleratorState, is_initialized
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, hf_hub_download, upload_folder
+from modeling_efficient_net_encoder import EfficientNetEncoder
+from packaging import version
+from torchvision import transforms
+from tqdm import tqdm
+from transformers import CLIPTextModel, PreTrainedTokenizerFast
+from transformers.utils import ContextManagers
+
+from diffusers import AutoPipelineForText2Image, DDPMWuerstchenScheduler
+from diffusers.optimization import get_scheduler
+from diffusers.pipelines.wuerstchen import DEFAULT_STAGE_C_TIMESTEPS, WuerstchenPrior
+from diffusers.training_utils import EMAModel
+from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
+from diffusers.utils.logging import set_verbosity_error, set_verbosity_info
+
+
+if is_wandb_available():
+    import wandb
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.31.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+
+
+def save_model_card(
+    args,
+    repo_id: str,
+    images=None,
+    repo_folder=None,
+):
+    img_str = ""
+    if len(images) > 0:
+        image_grid = make_image_grid(images, 1, len(args.validation_prompts))
+        image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png"))
+        img_str += "![val_imgs_grid](./val_imgs_grid.png)\n"
+
+    yaml = f"""
+---
+license: mit
+base_model: {args.pretrained_prior_model_name_or_path}
+datasets:
+- {args.dataset_name}
+tags:
+- wuerstchen
+- text-to-image
+- diffusers
+- diffusers-training
+inference: true
+---
+    """
+    model_card = f"""
+# Finetuning - {repo_id}
+
+This pipeline was finetuned from **{args.pretrained_prior_model_name_or_path}** on the **{args.dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompts: {args.validation_prompts}: \n
+{img_str}
+
+## Pipeline usage
+
+You can use the pipeline like so:
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe_prior = DiffusionPipeline.from_pretrained("{repo_id}", torch_dtype={args.weight_dtype})
+pipe_t2i = DiffusionPipeline.from_pretrained("{args.pretrained_decoder_model_name_or_path}", torch_dtype={args.weight_dtype})
+prompt = "{args.validation_prompts[0]}"
+(image_embeds,) = pipe_prior(prompt).to_tuple()
+image = pipe_t2i(image_embeddings=image_embeds, prompt=prompt).images[0]
+image.save("my_image.png")
+```
+
+## Training info
+
+These are the key hyperparameters used during training:
+
+* Epochs: {args.num_train_epochs}
+* Learning rate: {args.learning_rate}
+* Batch size: {args.train_batch_size}
+* Gradient accumulation steps: {args.gradient_accumulation_steps}
+* Image resolution: {args.resolution}
+* Mixed-precision: {args.mixed_precision}
+
+"""
+    wandb_info = ""
+    if is_wandb_available():
+        wandb_run_url = None
+        if wandb.run is not None:
+            wandb_run_url = wandb.run.url
+
+    if wandb_run_url is not None:
+        wandb_info = f"""
+More information on all the CLI arguments and the environment are available on your [`wandb` run page]({wandb_run_url}).
+"""
+
+    model_card += wandb_info
+
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def log_validation(text_encoder, tokenizer, prior, args, accelerator, weight_dtype, epoch):
+    logger.info("Running validation... ")
+
+    pipeline = AutoPipelineForText2Image.from_pretrained(
+        args.pretrained_decoder_model_name_or_path,
+        prior_prior=accelerator.unwrap_model(prior),
+        prior_text_encoder=accelerator.unwrap_model(text_encoder),
+        prior_tokenizer=tokenizer,
+        torch_dtype=weight_dtype,
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+    images = []
+    for i in range(len(args.validation_prompts)):
+        with torch.autocast("cuda"):
+            image = pipeline(
+                args.validation_prompts[i],
+                prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
+                generator=generator,
+                height=args.resolution,
+                width=args.resolution,
+            ).images[0]
+
+        images.append(image)
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+        elif tracker.name == "wandb":
+            tracker.log(
+                {
+                    "validation": [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompts[i]}")
+                        for i, image in enumerate(images)
+                    ]
+                }
+            )
+        else:
+            logger.warning(f"image logging not implemented for {tracker.name}")
+
+    del pipeline
+    torch.cuda.empty_cache()
+
+    return images
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of finetuning Würstchen Prior.")
+    parser.add_argument(
+        "--pretrained_decoder_model_name_or_path",
+        type=str,
+        default="warp-ai/wuerstchen",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--pretrained_prior_model_name_or_path",
+        type=str,
+        default="warp-ai/wuerstchen-prior",
+        required=False,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--validation_prompts",
+        type=str,
+        default=None,
+        nargs="+",
+        help=("A set of prompts evaluated every `--validation_epochs` and logged to `--report_to`."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="wuerstchen-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=1, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="learning rate",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument(
+        "--adam_weight_decay",
+        type=float,
+        default=0.0,
+        required=False,
+        help="weight decay_to_use",
+    )
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=5,
+        help="Run validation every X epochs.",
+    )
+    parser.add_argument(
+        "--tracker_project_name",
+        type=str,
+        default="text2image-fine-tune",
+        help=(
+            "The `project_name` argument passed to Accelerator.init_trackers for"
+            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+def main():
+    args = parse_args()
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `huggingface-cli login` to authenticate with the Hub."
+        )
+
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator_project_config = ProjectConfiguration(
+        total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir
+    )
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load scheduler, effnet, tokenizer, clip_model
+    noise_scheduler = DDPMWuerstchenScheduler()
+    tokenizer = PreTrainedTokenizerFast.from_pretrained(
+        args.pretrained_prior_model_name_or_path, subfolder="tokenizer"
+    )
+
+    def deepspeed_zero_init_disabled_context_manager():
+        """
+        returns either a context list that includes one that will disable zero.Init or an empty context list
+        """
+        deepspeed_plugin = AcceleratorState().deepspeed_plugin if is_initialized() else None
+        if deepspeed_plugin is None:
+            return []
+
+        return [deepspeed_plugin.zero3_init_context_manager(enable=False)]
+
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+    with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+        pretrained_checkpoint_file = hf_hub_download("dome272/wuerstchen", filename="model_v2_stage_b.pt")
+        state_dict = torch.load(pretrained_checkpoint_file, map_location="cpu")
+        image_encoder = EfficientNetEncoder()
+        image_encoder.load_state_dict(state_dict["effnet_state_dict"])
+        image_encoder.eval()
+
+        text_encoder = CLIPTextModel.from_pretrained(
+            args.pretrained_prior_model_name_or_path, subfolder="text_encoder", torch_dtype=weight_dtype
+        ).eval()
+
+    # Freeze text_encoder and image_encoder
+    text_encoder.requires_grad_(False)
+    image_encoder.requires_grad_(False)
+
+    # load prior model
+    prior = WuerstchenPrior.from_pretrained(args.pretrained_prior_model_name_or_path, subfolder="prior")
+
+    # Create EMA for the prior
+    if args.use_ema:
+        ema_prior = WuerstchenPrior.from_pretrained(args.pretrained_prior_model_name_or_path, subfolder="prior")
+        ema_prior = EMAModel(ema_prior.parameters(), model_cls=WuerstchenPrior, model_config=ema_prior.config)
+        ema_prior.to(accelerator.device)
+
+    # `accelerate` 0.16.0 will have better support for customized saving
+    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+        def save_model_hook(models, weights, output_dir):
+            if args.use_ema:
+                ema_prior.save_pretrained(os.path.join(output_dir, "prior_ema"))
+
+            for i, model in enumerate(models):
+                model.save_pretrained(os.path.join(output_dir, "prior"))
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+        def load_model_hook(models, input_dir):
+            if args.use_ema:
+                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "prior_ema"), WuerstchenPrior)
+                ema_prior.load_state_dict(load_model.state_dict())
+                ema_prior.to(accelerator.device)
+                del load_model
+
+            for i in range(len(models)):
+                # pop models so that they are not loaded again
+                model = models.pop()
+
+                # load diffusers style into model
+                load_model = WuerstchenPrior.from_pretrained(input_dir, subfolder="prior")
+                model.register_to_config(**load_model.config)
+
+                model.load_state_dict(load_model.state_dict())
+                del load_model
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+    if args.gradient_checkpointing:
+        prior.enable_gradient_checkpointing()
+
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+    optimizer = optimizer_cls(
+        prior.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # Get the column names for input/target.
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        text_input_ids = inputs.input_ids
+        text_mask = inputs.attention_mask.bool()
+        return text_input_ids, text_mask
+
+    effnet_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR, antialias=True),
+            transforms.CenterCrop(args.resolution),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["effnet_pixel_values"] = [effnet_transforms(image) for image in images]
+        examples["text_input_ids"], examples["text_mask"] = tokenize_captions(examples)
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        effnet_pixel_values = torch.stack([example["effnet_pixel_values"] for example in examples])
+        effnet_pixel_values = effnet_pixel_values.to(memory_format=torch.contiguous_format).float()
+        text_input_ids = torch.stack([example["text_input_ids"] for example in examples])
+        text_mask = torch.stack([example["text_mask"] for example in examples])
+        return {"effnet_pixel_values": effnet_pixel_values, "text_input_ids": text_input_ids, "text_mask": text_mask}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    prior, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        prior, optimizer, train_dataloader, lr_scheduler
+    )
+    image_encoder.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_config = dict(vars(args))
+        tracker_config.pop("validation_prompts")
+        accelerator.init_trackers(args.tracker_project_name, tracker_config)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        prior.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(prior):
+                # Convert images to latent space
+                text_input_ids, text_mask, effnet_images = (
+                    batch["text_input_ids"],
+                    batch["text_mask"],
+                    batch["effnet_pixel_values"].to(weight_dtype),
+                )
+
+                with torch.no_grad():
+                    text_encoder_output = text_encoder(text_input_ids, attention_mask=text_mask)
+                    prompt_embeds = text_encoder_output.last_hidden_state
+                    image_embeds = image_encoder(effnet_images)
+                    # scale
+                    image_embeds = image_embeds.add(1.0).div(42.0)
+
+                    # Sample noise that we'll add to the image_embeds
+                    noise = torch.randn_like(image_embeds)
+                    bsz = image_embeds.shape[0]
+
+                    # Sample a random timestep for each image
+                    timesteps = torch.rand((bsz,), device=image_embeds.device, dtype=weight_dtype)
+
+                    # add noise to latent
+                    noisy_latents = noise_scheduler.add_noise(image_embeds, noise, timesteps)
+
+                # Predict the noise residual and compute losscd
+                pred_noise = prior(noisy_latents, timesteps, prompt_embeds)
+
+                # vanilla loss
+                loss = F.mse_loss(pred_noise.float(), noise.float(), reduction="mean")
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(prior.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_prior.step(prior.parameters())
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompts is not None and epoch % args.validation_epochs == 0:
+                if args.use_ema:
+                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+                    ema_prior.store(prior.parameters())
+                    ema_prior.copy_to(prior.parameters())
+                log_validation(text_encoder, tokenizer, prior, args, accelerator, weight_dtype, global_step)
+                if args.use_ema:
+                    # Switch back to the original UNet parameters.
+                    ema_prior.restore(prior.parameters())
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        prior = accelerator.unwrap_model(prior)
+        if args.use_ema:
+            ema_prior.copy_to(prior.parameters())
+
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            args.pretrained_decoder_model_name_or_path,
+            prior_prior=prior,
+            prior_text_encoder=accelerator.unwrap_model(text_encoder),
+            prior_tokenizer=tokenizer,
+        )
+        pipeline.prior_pipe.save_pretrained(os.path.join(args.output_dir, "prior_pipeline"))
+
+        # Run a final round of inference.
+        images = []
+        if args.validation_prompts is not None:
+            logger.info("Running inference for collecting generated images...")
+            pipeline = pipeline.to(accelerator.device, torch_dtype=weight_dtype)
+            pipeline.set_progress_bar_config(disable=True)
+
+            if args.seed is None:
+                generator = None
+            else:
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+
+            for i in range(len(args.validation_prompts)):
+                with torch.autocast("cuda"):
+                    image = pipeline(
+                        args.validation_prompts[i],
+                        prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
+                        generator=generator,
+                        width=args.resolution,
+                        height=args.resolution,
+                    ).images[0]
+                images.append(image)
+
+        if args.push_to_hub:
+            save_model_card(args, repo_id, images, repo_folder=args.output_dir)
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/pyproject.toml b/diffusers/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..299865a1225d92be40427f6112af0dab1c2a7993
--- /dev/null
+++ b/diffusers/pyproject.toml
@@ -0,0 +1,29 @@
+[tool.ruff]
+line-length = 119
+
+[tool.ruff.lint]
+# Never enforce `E501` (line length violations).
+ignore = ["C901", "E501", "E741", "F402", "F823"]
+select = ["C", "E", "F", "I", "W"]
+
+# Ignore import violations in all `__init__.py` files.
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401", "F403", "F811"]
+"src/diffusers/utils/dummy_*.py" = ["F401"]
+
+[tool.ruff.lint.isort]
+lines-after-imports = 2
+known-first-party = ["diffusers"]
+
+[tool.ruff.format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
diff --git a/diffusers/scripts/__init__.py b/diffusers/scripts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/scripts/change_naming_configs_and_checkpoints.py b/diffusers/scripts/change_naming_configs_and_checkpoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..adc1605e95b3e44d434ef810c49571cea1cb4643
--- /dev/null
+++ b/diffusers/scripts/change_naming_configs_and_checkpoints.py
@@ -0,0 +1,113 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the LDM checkpoints."""
+
+import argparse
+import json
+import os
+
+import torch
+from transformers.file_utils import has_file
+
+from diffusers import UNet2DConditionModel, UNet2DModel
+
+
+do_only_config = False
+do_only_weights = True
+do_only_renaming = False
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--repo_path",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the architecture.",
+    )
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    args = parser.parse_args()
+
+    config_parameters_to_change = {
+        "image_size": "sample_size",
+        "num_res_blocks": "layers_per_block",
+        "block_channels": "block_out_channels",
+        "down_blocks": "down_block_types",
+        "up_blocks": "up_block_types",
+        "downscale_freq_shift": "freq_shift",
+        "resnet_num_groups": "norm_num_groups",
+        "resnet_act_fn": "act_fn",
+        "resnet_eps": "norm_eps",
+        "num_head_channels": "attention_head_dim",
+    }
+
+    key_parameters_to_change = {
+        "time_steps": "time_proj",
+        "mid": "mid_block",
+        "downsample_blocks": "down_blocks",
+        "upsample_blocks": "up_blocks",
+    }
+
+    subfolder = "" if has_file(args.repo_path, "config.json") else "unet"
+
+    with open(os.path.join(args.repo_path, subfolder, "config.json"), "r", encoding="utf-8") as reader:
+        text = reader.read()
+        config = json.loads(text)
+
+    if do_only_config:
+        for key in config_parameters_to_change.keys():
+            config.pop(key, None)
+
+    if has_file(args.repo_path, "config.json"):
+        model = UNet2DModel(**config)
+    else:
+        class_name = UNet2DConditionModel if "ldm-text2im-large-256" in args.repo_path else UNet2DModel
+        model = class_name(**config)
+
+    if do_only_config:
+        model.save_config(os.path.join(args.repo_path, subfolder))
+
+    config = dict(model.config)
+
+    if do_only_renaming:
+        for key, value in config_parameters_to_change.items():
+            if key in config:
+                config[value] = config[key]
+                del config[key]
+
+        config["down_block_types"] = [k.replace("UNetRes", "") for k in config["down_block_types"]]
+        config["up_block_types"] = [k.replace("UNetRes", "") for k in config["up_block_types"]]
+
+    if do_only_weights:
+        state_dict = torch.load(os.path.join(args.repo_path, subfolder, "diffusion_pytorch_model.bin"))
+
+        new_state_dict = {}
+        for param_key, param_value in state_dict.items():
+            if param_key.endswith(".op.bias") or param_key.endswith(".op.weight"):
+                continue
+            has_changed = False
+            for key, new_key in key_parameters_to_change.items():
+                if not has_changed and param_key.split(".")[0] == key:
+                    new_state_dict[".".join([new_key] + param_key.split(".")[1:])] = param_value
+                    has_changed = True
+            if not has_changed:
+                new_state_dict[param_key] = param_value
+
+        model.load_state_dict(new_state_dict)
+        model.save_pretrained(os.path.join(args.repo_path, subfolder))
diff --git a/diffusers/scripts/conversion_ldm_uncond.py b/diffusers/scripts/conversion_ldm_uncond.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c22cc1ce8f2da514c1f096afaf67f1650e4df8a
--- /dev/null
+++ b/diffusers/scripts/conversion_ldm_uncond.py
@@ -0,0 +1,56 @@
+import argparse
+
+import torch
+import yaml
+
+from diffusers import DDIMScheduler, LDMPipeline, UNetLDMModel, VQModel
+
+
+def convert_ldm_original(checkpoint_path, config_path, output_path):
+    config = yaml.safe_load(config_path)
+    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
+    keys = list(state_dict.keys())
+
+    # extract state_dict for VQVAE
+    first_stage_dict = {}
+    first_stage_key = "first_stage_model."
+    for key in keys:
+        if key.startswith(first_stage_key):
+            first_stage_dict[key.replace(first_stage_key, "")] = state_dict[key]
+
+    # extract state_dict for UNetLDM
+    unet_state_dict = {}
+    unet_key = "model.diffusion_model."
+    for key in keys:
+        if key.startswith(unet_key):
+            unet_state_dict[key.replace(unet_key, "")] = state_dict[key]
+
+    vqvae_init_args = config["model"]["params"]["first_stage_config"]["params"]
+    unet_init_args = config["model"]["params"]["unet_config"]["params"]
+
+    vqvae = VQModel(**vqvae_init_args).eval()
+    vqvae.load_state_dict(first_stage_dict)
+
+    unet = UNetLDMModel(**unet_init_args).eval()
+    unet.load_state_dict(unet_state_dict)
+
+    noise_scheduler = DDIMScheduler(
+        timesteps=config["model"]["params"]["timesteps"],
+        beta_schedule="scaled_linear",
+        beta_start=config["model"]["params"]["linear_start"],
+        beta_end=config["model"]["params"]["linear_end"],
+        clip_sample=False,
+    )
+
+    pipeline = LDMPipeline(vqvae, unet, noise_scheduler)
+    pipeline.save_pretrained(output_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint_path", type=str, required=True)
+    parser.add_argument("--config_path", type=str, required=True)
+    parser.add_argument("--output_path", type=str, required=True)
+    args = parser.parse_args()
+
+    convert_ldm_original(args.checkpoint_path, args.config_path, args.output_path)
diff --git a/diffusers/scripts/convert_amused.py b/diffusers/scripts/convert_amused.py
new file mode 100644
index 0000000000000000000000000000000000000000..21be29dfdb99a7093b3060e828c514c8357d03d5
--- /dev/null
+++ b/diffusers/scripts/convert_amused.py
@@ -0,0 +1,523 @@
+import inspect
+import os
+from argparse import ArgumentParser
+
+import numpy as np
+import torch
+from muse import MaskGiTUViT, VQGANModel
+from muse import PipelineMuse as OldPipelineMuse
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import VQModel
+from diffusers.models.attention_processor import AttnProcessor
+from diffusers.models.unets.uvit_2d import UVit2DModel
+from diffusers.pipelines.amused.pipeline_amused import AmusedPipeline
+from diffusers.schedulers import AmusedScheduler
+
+
+torch.backends.cuda.enable_flash_sdp(False)
+torch.backends.cuda.enable_mem_efficient_sdp(False)
+torch.backends.cuda.enable_math_sdp(True)
+
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+torch.use_deterministic_algorithms(True)
+
+# Enable CUDNN deterministic mode
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+torch.backends.cuda.matmul.allow_tf32 = False
+
+device = "cuda"
+
+
+def main():
+    args = ArgumentParser()
+    args.add_argument("--model_256", action="store_true")
+    args.add_argument("--write_to", type=str, required=False, default=None)
+    args.add_argument("--transformer_path", type=str, required=False, default=None)
+    args = args.parse_args()
+
+    transformer_path = args.transformer_path
+    subfolder = "transformer"
+
+    if transformer_path is None:
+        if args.model_256:
+            transformer_path = "openMUSE/muse-256"
+        else:
+            transformer_path = (
+                "../research-run-512-checkpoints/research-run-512-with-downsample-checkpoint-554000/unwrapped_model/"
+            )
+            subfolder = None
+
+    old_transformer = MaskGiTUViT.from_pretrained(transformer_path, subfolder=subfolder)
+
+    old_transformer.to(device)
+
+    old_vae = VQGANModel.from_pretrained("openMUSE/muse-512", subfolder="vae")
+    old_vae.to(device)
+
+    vqvae = make_vqvae(old_vae)
+
+    tokenizer = CLIPTokenizer.from_pretrained("openMUSE/muse-512", subfolder="text_encoder")
+
+    text_encoder = CLIPTextModelWithProjection.from_pretrained("openMUSE/muse-512", subfolder="text_encoder")
+    text_encoder.to(device)
+
+    transformer = make_transformer(old_transformer, args.model_256)
+
+    scheduler = AmusedScheduler(mask_token_id=old_transformer.config.mask_token_id)
+
+    new_pipe = AmusedPipeline(
+        vqvae=vqvae, tokenizer=tokenizer, text_encoder=text_encoder, transformer=transformer, scheduler=scheduler
+    )
+
+    old_pipe = OldPipelineMuse(
+        vae=old_vae, transformer=old_transformer, text_encoder=text_encoder, tokenizer=tokenizer
+    )
+    old_pipe.to(device)
+
+    if args.model_256:
+        transformer_seq_len = 256
+        orig_size = (256, 256)
+    else:
+        transformer_seq_len = 1024
+        orig_size = (512, 512)
+
+    old_out = old_pipe(
+        "dog",
+        generator=torch.Generator(device).manual_seed(0),
+        transformer_seq_len=transformer_seq_len,
+        orig_size=orig_size,
+        timesteps=12,
+    )[0]
+
+    new_out = new_pipe("dog", generator=torch.Generator(device).manual_seed(0)).images[0]
+
+    old_out = np.array(old_out)
+    new_out = np.array(new_out)
+
+    diff = np.abs(old_out.astype(np.float64) - new_out.astype(np.float64))
+
+    # assert diff diff.sum() == 0
+    print("skipping pipeline full equivalence check")
+
+    print(f"max diff: {diff.max()}, diff.sum() / diff.size {diff.sum() / diff.size}")
+
+    if args.model_256:
+        assert diff.max() <= 3
+        assert diff.sum() / diff.size < 0.7
+    else:
+        assert diff.max() <= 1
+        assert diff.sum() / diff.size < 0.4
+
+    if args.write_to is not None:
+        new_pipe.save_pretrained(args.write_to)
+
+
+def make_transformer(old_transformer, model_256):
+    args = dict(old_transformer.config)
+    force_down_up_sample = args["force_down_up_sample"]
+
+    signature = inspect.signature(UVit2DModel.__init__)
+
+    args_ = {
+        "downsample": force_down_up_sample,
+        "upsample": force_down_up_sample,
+        "block_out_channels": args["block_out_channels"][0],
+        "sample_size": 16 if model_256 else 32,
+    }
+
+    for s in list(signature.parameters.keys()):
+        if s in ["self", "downsample", "upsample", "sample_size", "block_out_channels"]:
+            continue
+
+        args_[s] = args[s]
+
+    new_transformer = UVit2DModel(**args_)
+    new_transformer.to(device)
+
+    new_transformer.set_attn_processor(AttnProcessor())
+
+    state_dict = old_transformer.state_dict()
+
+    state_dict["cond_embed.linear_1.weight"] = state_dict.pop("cond_embed.0.weight")
+    state_dict["cond_embed.linear_2.weight"] = state_dict.pop("cond_embed.2.weight")
+
+    for i in range(22):
+        state_dict[f"transformer_layers.{i}.norm1.norm.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.attn_layer_norm.weight"
+        )
+        state_dict[f"transformer_layers.{i}.norm1.linear.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.self_attn_adaLN_modulation.mapper.weight"
+        )
+
+        state_dict[f"transformer_layers.{i}.attn1.to_q.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.attention.query.weight"
+        )
+        state_dict[f"transformer_layers.{i}.attn1.to_k.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.attention.key.weight"
+        )
+        state_dict[f"transformer_layers.{i}.attn1.to_v.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.attention.value.weight"
+        )
+        state_dict[f"transformer_layers.{i}.attn1.to_out.0.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.attention.out.weight"
+        )
+
+        state_dict[f"transformer_layers.{i}.norm2.norm.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.crossattn_layer_norm.weight"
+        )
+        state_dict[f"transformer_layers.{i}.norm2.linear.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.cross_attn_adaLN_modulation.mapper.weight"
+        )
+
+        state_dict[f"transformer_layers.{i}.attn2.to_q.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.crossattention.query.weight"
+        )
+        state_dict[f"transformer_layers.{i}.attn2.to_k.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.crossattention.key.weight"
+        )
+        state_dict[f"transformer_layers.{i}.attn2.to_v.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.crossattention.value.weight"
+        )
+        state_dict[f"transformer_layers.{i}.attn2.to_out.0.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.crossattention.out.weight"
+        )
+
+        state_dict[f"transformer_layers.{i}.norm3.norm.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.ffn.pre_mlp_layer_norm.weight"
+        )
+        state_dict[f"transformer_layers.{i}.norm3.linear.weight"] = state_dict.pop(
+            f"transformer_layers.{i}.ffn.adaLN_modulation.mapper.weight"
+        )
+
+        wi_0_weight = state_dict.pop(f"transformer_layers.{i}.ffn.wi_0.weight")
+        wi_1_weight = state_dict.pop(f"transformer_layers.{i}.ffn.wi_1.weight")
+        proj_weight = torch.concat([wi_1_weight, wi_0_weight], dim=0)
+        state_dict[f"transformer_layers.{i}.ff.net.0.proj.weight"] = proj_weight
+
+        state_dict[f"transformer_layers.{i}.ff.net.2.weight"] = state_dict.pop(f"transformer_layers.{i}.ffn.wo.weight")
+
+    if force_down_up_sample:
+        state_dict["down_block.downsample.norm.weight"] = state_dict.pop("down_blocks.0.downsample.0.norm.weight")
+        state_dict["down_block.downsample.conv.weight"] = state_dict.pop("down_blocks.0.downsample.1.weight")
+
+        state_dict["up_block.upsample.norm.weight"] = state_dict.pop("up_blocks.0.upsample.0.norm.weight")
+        state_dict["up_block.upsample.conv.weight"] = state_dict.pop("up_blocks.0.upsample.1.weight")
+
+    state_dict["mlm_layer.layer_norm.weight"] = state_dict.pop("mlm_layer.layer_norm.norm.weight")
+
+    for i in range(3):
+        state_dict[f"down_block.res_blocks.{i}.norm.weight"] = state_dict.pop(
+            f"down_blocks.0.res_blocks.{i}.norm.norm.weight"
+        )
+        state_dict[f"down_block.res_blocks.{i}.channelwise_linear_1.weight"] = state_dict.pop(
+            f"down_blocks.0.res_blocks.{i}.channelwise.0.weight"
+        )
+        state_dict[f"down_block.res_blocks.{i}.channelwise_norm.gamma"] = state_dict.pop(
+            f"down_blocks.0.res_blocks.{i}.channelwise.2.gamma"
+        )
+        state_dict[f"down_block.res_blocks.{i}.channelwise_norm.beta"] = state_dict.pop(
+            f"down_blocks.0.res_blocks.{i}.channelwise.2.beta"
+        )
+        state_dict[f"down_block.res_blocks.{i}.channelwise_linear_2.weight"] = state_dict.pop(
+            f"down_blocks.0.res_blocks.{i}.channelwise.4.weight"
+        )
+        state_dict[f"down_block.res_blocks.{i}.cond_embeds_mapper.weight"] = state_dict.pop(
+            f"down_blocks.0.res_blocks.{i}.adaLN_modulation.mapper.weight"
+        )
+
+        state_dict[f"down_block.attention_blocks.{i}.norm1.weight"] = state_dict.pop(
+            f"down_blocks.0.attention_blocks.{i}.attn_layer_norm.weight"
+        )
+        state_dict[f"down_block.attention_blocks.{i}.attn1.to_q.weight"] = state_dict.pop(
+            f"down_blocks.0.attention_blocks.{i}.attention.query.weight"
+        )
+        state_dict[f"down_block.attention_blocks.{i}.attn1.to_k.weight"] = state_dict.pop(
+            f"down_blocks.0.attention_blocks.{i}.attention.key.weight"
+        )
+        state_dict[f"down_block.attention_blocks.{i}.attn1.to_v.weight"] = state_dict.pop(
+            f"down_blocks.0.attention_blocks.{i}.attention.value.weight"
+        )
+        state_dict[f"down_block.attention_blocks.{i}.attn1.to_out.0.weight"] = state_dict.pop(
+            f"down_blocks.0.attention_blocks.{i}.attention.out.weight"
+        )
+
+        state_dict[f"down_block.attention_blocks.{i}.norm2.weight"] = state_dict.pop(
+            f"down_blocks.0.attention_blocks.{i}.crossattn_layer_norm.weight"
+        )
+        state_dict[f"down_block.attention_blocks.{i}.attn2.to_q.weight"] = state_dict.pop(
+            f"down_blocks.0.attention_blocks.{i}.crossattention.query.weight"
+        )
+        state_dict[f"down_block.attention_blocks.{i}.attn2.to_k.weight"] = state_dict.pop(
+            f"down_blocks.0.attention_blocks.{i}.crossattention.key.weight"
+        )
+        state_dict[f"down_block.attention_blocks.{i}.attn2.to_v.weight"] = state_dict.pop(
+            f"down_blocks.0.attention_blocks.{i}.crossattention.value.weight"
+        )
+        state_dict[f"down_block.attention_blocks.{i}.attn2.to_out.0.weight"] = state_dict.pop(
+            f"down_blocks.0.attention_blocks.{i}.crossattention.out.weight"
+        )
+
+        state_dict[f"up_block.res_blocks.{i}.norm.weight"] = state_dict.pop(
+            f"up_blocks.0.res_blocks.{i}.norm.norm.weight"
+        )
+        state_dict[f"up_block.res_blocks.{i}.channelwise_linear_1.weight"] = state_dict.pop(
+            f"up_blocks.0.res_blocks.{i}.channelwise.0.weight"
+        )
+        state_dict[f"up_block.res_blocks.{i}.channelwise_norm.gamma"] = state_dict.pop(
+            f"up_blocks.0.res_blocks.{i}.channelwise.2.gamma"
+        )
+        state_dict[f"up_block.res_blocks.{i}.channelwise_norm.beta"] = state_dict.pop(
+            f"up_blocks.0.res_blocks.{i}.channelwise.2.beta"
+        )
+        state_dict[f"up_block.res_blocks.{i}.channelwise_linear_2.weight"] = state_dict.pop(
+            f"up_blocks.0.res_blocks.{i}.channelwise.4.weight"
+        )
+        state_dict[f"up_block.res_blocks.{i}.cond_embeds_mapper.weight"] = state_dict.pop(
+            f"up_blocks.0.res_blocks.{i}.adaLN_modulation.mapper.weight"
+        )
+
+        state_dict[f"up_block.attention_blocks.{i}.norm1.weight"] = state_dict.pop(
+            f"up_blocks.0.attention_blocks.{i}.attn_layer_norm.weight"
+        )
+        state_dict[f"up_block.attention_blocks.{i}.attn1.to_q.weight"] = state_dict.pop(
+            f"up_blocks.0.attention_blocks.{i}.attention.query.weight"
+        )
+        state_dict[f"up_block.attention_blocks.{i}.attn1.to_k.weight"] = state_dict.pop(
+            f"up_blocks.0.attention_blocks.{i}.attention.key.weight"
+        )
+        state_dict[f"up_block.attention_blocks.{i}.attn1.to_v.weight"] = state_dict.pop(
+            f"up_blocks.0.attention_blocks.{i}.attention.value.weight"
+        )
+        state_dict[f"up_block.attention_blocks.{i}.attn1.to_out.0.weight"] = state_dict.pop(
+            f"up_blocks.0.attention_blocks.{i}.attention.out.weight"
+        )
+
+        state_dict[f"up_block.attention_blocks.{i}.norm2.weight"] = state_dict.pop(
+            f"up_blocks.0.attention_blocks.{i}.crossattn_layer_norm.weight"
+        )
+        state_dict[f"up_block.attention_blocks.{i}.attn2.to_q.weight"] = state_dict.pop(
+            f"up_blocks.0.attention_blocks.{i}.crossattention.query.weight"
+        )
+        state_dict[f"up_block.attention_blocks.{i}.attn2.to_k.weight"] = state_dict.pop(
+            f"up_blocks.0.attention_blocks.{i}.crossattention.key.weight"
+        )
+        state_dict[f"up_block.attention_blocks.{i}.attn2.to_v.weight"] = state_dict.pop(
+            f"up_blocks.0.attention_blocks.{i}.crossattention.value.weight"
+        )
+        state_dict[f"up_block.attention_blocks.{i}.attn2.to_out.0.weight"] = state_dict.pop(
+            f"up_blocks.0.attention_blocks.{i}.crossattention.out.weight"
+        )
+
+    for key in list(state_dict.keys()):
+        if key.startswith("up_blocks.0"):
+            key_ = "up_block." + ".".join(key.split(".")[2:])
+            state_dict[key_] = state_dict.pop(key)
+
+        if key.startswith("down_blocks.0"):
+            key_ = "down_block." + ".".join(key.split(".")[2:])
+            state_dict[key_] = state_dict.pop(key)
+
+    new_transformer.load_state_dict(state_dict)
+
+    input_ids = torch.randint(0, 10, (1, 32, 32), device=old_transformer.device)
+    encoder_hidden_states = torch.randn((1, 77, 768), device=old_transformer.device)
+    cond_embeds = torch.randn((1, 768), device=old_transformer.device)
+    micro_conds = torch.tensor([[512, 512, 0, 0, 6]], dtype=torch.float32, device=old_transformer.device)
+
+    old_out = old_transformer(input_ids.reshape(1, -1), encoder_hidden_states, cond_embeds, micro_conds)
+    old_out = old_out.reshape(1, 32, 32, 8192).permute(0, 3, 1, 2)
+
+    new_out = new_transformer(input_ids, encoder_hidden_states, cond_embeds, micro_conds)
+
+    # NOTE: these differences are solely due to using the geglu block that has a single linear layer of
+    # double output dimension instead of two different linear layers
+    max_diff = (old_out - new_out).abs().max()
+    total_diff = (old_out - new_out).abs().sum()
+    print(f"Transformer max_diff: {max_diff} total_diff:  {total_diff}")
+    assert max_diff < 0.01
+    assert total_diff < 1500
+
+    return new_transformer
+
+
+def make_vqvae(old_vae):
+    new_vae = VQModel(
+        act_fn="silu",
+        block_out_channels=[128, 256, 256, 512, 768],
+        down_block_types=[
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+        ],
+        in_channels=3,
+        latent_channels=64,
+        layers_per_block=2,
+        norm_num_groups=32,
+        num_vq_embeddings=8192,
+        out_channels=3,
+        sample_size=32,
+        up_block_types=[
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+        ],
+        mid_block_add_attention=False,
+        lookup_from_codebook=True,
+    )
+    new_vae.to(device)
+
+    # fmt: off
+
+    new_state_dict = {}
+
+    old_state_dict = old_vae.state_dict()
+
+    new_state_dict["encoder.conv_in.weight"] = old_state_dict.pop("encoder.conv_in.weight")
+    new_state_dict["encoder.conv_in.bias"]   = old_state_dict.pop("encoder.conv_in.bias")
+
+    convert_vae_block_state_dict(old_state_dict, "encoder.down.0", new_state_dict, "encoder.down_blocks.0")
+    convert_vae_block_state_dict(old_state_dict, "encoder.down.1", new_state_dict, "encoder.down_blocks.1")
+    convert_vae_block_state_dict(old_state_dict, "encoder.down.2", new_state_dict, "encoder.down_blocks.2")
+    convert_vae_block_state_dict(old_state_dict, "encoder.down.3", new_state_dict, "encoder.down_blocks.3")
+    convert_vae_block_state_dict(old_state_dict, "encoder.down.4", new_state_dict, "encoder.down_blocks.4")
+
+    new_state_dict["encoder.mid_block.resnets.0.norm1.weight"] = old_state_dict.pop("encoder.mid.block_1.norm1.weight")
+    new_state_dict["encoder.mid_block.resnets.0.norm1.bias"]   = old_state_dict.pop("encoder.mid.block_1.norm1.bias")
+    new_state_dict["encoder.mid_block.resnets.0.conv1.weight"] = old_state_dict.pop("encoder.mid.block_1.conv1.weight")
+    new_state_dict["encoder.mid_block.resnets.0.conv1.bias"]   = old_state_dict.pop("encoder.mid.block_1.conv1.bias")
+    new_state_dict["encoder.mid_block.resnets.0.norm2.weight"] = old_state_dict.pop("encoder.mid.block_1.norm2.weight")
+    new_state_dict["encoder.mid_block.resnets.0.norm2.bias"]   = old_state_dict.pop("encoder.mid.block_1.norm2.bias")
+    new_state_dict["encoder.mid_block.resnets.0.conv2.weight"] = old_state_dict.pop("encoder.mid.block_1.conv2.weight")
+    new_state_dict["encoder.mid_block.resnets.0.conv2.bias"]   = old_state_dict.pop("encoder.mid.block_1.conv2.bias")
+    new_state_dict["encoder.mid_block.resnets.1.norm1.weight"] = old_state_dict.pop("encoder.mid.block_2.norm1.weight")
+    new_state_dict["encoder.mid_block.resnets.1.norm1.bias"]   = old_state_dict.pop("encoder.mid.block_2.norm1.bias")
+    new_state_dict["encoder.mid_block.resnets.1.conv1.weight"] = old_state_dict.pop("encoder.mid.block_2.conv1.weight")
+    new_state_dict["encoder.mid_block.resnets.1.conv1.bias"]   = old_state_dict.pop("encoder.mid.block_2.conv1.bias")
+    new_state_dict["encoder.mid_block.resnets.1.norm2.weight"] = old_state_dict.pop("encoder.mid.block_2.norm2.weight")
+    new_state_dict["encoder.mid_block.resnets.1.norm2.bias"]   = old_state_dict.pop("encoder.mid.block_2.norm2.bias")
+    new_state_dict["encoder.mid_block.resnets.1.conv2.weight"] = old_state_dict.pop("encoder.mid.block_2.conv2.weight")
+    new_state_dict["encoder.mid_block.resnets.1.conv2.bias"]   = old_state_dict.pop("encoder.mid.block_2.conv2.bias")
+    new_state_dict["encoder.conv_norm_out.weight"]             = old_state_dict.pop("encoder.norm_out.weight")
+    new_state_dict["encoder.conv_norm_out.bias"]               = old_state_dict.pop("encoder.norm_out.bias")
+    new_state_dict["encoder.conv_out.weight"]                  = old_state_dict.pop("encoder.conv_out.weight")
+    new_state_dict["encoder.conv_out.bias"]                    = old_state_dict.pop("encoder.conv_out.bias")
+    new_state_dict["quant_conv.weight"]                        = old_state_dict.pop("quant_conv.weight")
+    new_state_dict["quant_conv.bias"]                          = old_state_dict.pop("quant_conv.bias")
+    new_state_dict["quantize.embedding.weight"]                = old_state_dict.pop("quantize.embedding.weight")
+    new_state_dict["post_quant_conv.weight"]                   = old_state_dict.pop("post_quant_conv.weight")
+    new_state_dict["post_quant_conv.bias"]                     = old_state_dict.pop("post_quant_conv.bias")
+    new_state_dict["decoder.conv_in.weight"]                   = old_state_dict.pop("decoder.conv_in.weight")
+    new_state_dict["decoder.conv_in.bias"]                     = old_state_dict.pop("decoder.conv_in.bias")
+    new_state_dict["decoder.mid_block.resnets.0.norm1.weight"] = old_state_dict.pop("decoder.mid.block_1.norm1.weight")
+    new_state_dict["decoder.mid_block.resnets.0.norm1.bias"]   = old_state_dict.pop("decoder.mid.block_1.norm1.bias")
+    new_state_dict["decoder.mid_block.resnets.0.conv1.weight"] = old_state_dict.pop("decoder.mid.block_1.conv1.weight")
+    new_state_dict["decoder.mid_block.resnets.0.conv1.bias"]   = old_state_dict.pop("decoder.mid.block_1.conv1.bias")
+    new_state_dict["decoder.mid_block.resnets.0.norm2.weight"] = old_state_dict.pop("decoder.mid.block_1.norm2.weight")
+    new_state_dict["decoder.mid_block.resnets.0.norm2.bias"]   = old_state_dict.pop("decoder.mid.block_1.norm2.bias")
+    new_state_dict["decoder.mid_block.resnets.0.conv2.weight"] = old_state_dict.pop("decoder.mid.block_1.conv2.weight")
+    new_state_dict["decoder.mid_block.resnets.0.conv2.bias"]   = old_state_dict.pop("decoder.mid.block_1.conv2.bias")
+    new_state_dict["decoder.mid_block.resnets.1.norm1.weight"] = old_state_dict.pop("decoder.mid.block_2.norm1.weight")
+    new_state_dict["decoder.mid_block.resnets.1.norm1.bias"]   = old_state_dict.pop("decoder.mid.block_2.norm1.bias")
+    new_state_dict["decoder.mid_block.resnets.1.conv1.weight"] = old_state_dict.pop("decoder.mid.block_2.conv1.weight")
+    new_state_dict["decoder.mid_block.resnets.1.conv1.bias"]   = old_state_dict.pop("decoder.mid.block_2.conv1.bias")
+    new_state_dict["decoder.mid_block.resnets.1.norm2.weight"] = old_state_dict.pop("decoder.mid.block_2.norm2.weight")
+    new_state_dict["decoder.mid_block.resnets.1.norm2.bias"]   = old_state_dict.pop("decoder.mid.block_2.norm2.bias")
+    new_state_dict["decoder.mid_block.resnets.1.conv2.weight"] = old_state_dict.pop("decoder.mid.block_2.conv2.weight")
+    new_state_dict["decoder.mid_block.resnets.1.conv2.bias"]   = old_state_dict.pop("decoder.mid.block_2.conv2.bias")
+
+    convert_vae_block_state_dict(old_state_dict, "decoder.up.0", new_state_dict, "decoder.up_blocks.4")
+    convert_vae_block_state_dict(old_state_dict, "decoder.up.1", new_state_dict, "decoder.up_blocks.3")
+    convert_vae_block_state_dict(old_state_dict, "decoder.up.2", new_state_dict, "decoder.up_blocks.2")
+    convert_vae_block_state_dict(old_state_dict, "decoder.up.3", new_state_dict, "decoder.up_blocks.1")
+    convert_vae_block_state_dict(old_state_dict, "decoder.up.4", new_state_dict, "decoder.up_blocks.0")
+
+    new_state_dict["decoder.conv_norm_out.weight"] = old_state_dict.pop("decoder.norm_out.weight")
+    new_state_dict["decoder.conv_norm_out.bias"]   = old_state_dict.pop("decoder.norm_out.bias")
+    new_state_dict["decoder.conv_out.weight"]      = old_state_dict.pop("decoder.conv_out.weight")
+    new_state_dict["decoder.conv_out.bias"]        = old_state_dict.pop("decoder.conv_out.bias")
+
+    # fmt: on
+
+    assert len(old_state_dict.keys()) == 0
+
+    new_vae.load_state_dict(new_state_dict)
+
+    input = torch.randn((1, 3, 512, 512), device=device)
+    input = input.clamp(-1, 1)
+
+    old_encoder_output = old_vae.quant_conv(old_vae.encoder(input))
+    new_encoder_output = new_vae.quant_conv(new_vae.encoder(input))
+    assert (old_encoder_output == new_encoder_output).all()
+
+    old_decoder_output = old_vae.decoder(old_vae.post_quant_conv(old_encoder_output))
+    new_decoder_output = new_vae.decoder(new_vae.post_quant_conv(new_encoder_output))
+
+    # assert (old_decoder_output == new_decoder_output).all()
+    print("kipping vae decoder equivalence check")
+    print(f"vae decoder diff {(old_decoder_output - new_decoder_output).float().abs().sum()}")
+
+    old_output = old_vae(input)[0]
+    new_output = new_vae(input)[0]
+
+    # assert (old_output == new_output).all()
+    print("skipping full vae equivalence check")
+    print(f"vae full diff { (old_output - new_output).float().abs().sum()}")
+
+    return new_vae
+
+
+def convert_vae_block_state_dict(old_state_dict, prefix_from, new_state_dict, prefix_to):
+    # fmt: off
+
+    new_state_dict[f"{prefix_to}.resnets.0.norm1.weight"]             = old_state_dict.pop(f"{prefix_from}.block.0.norm1.weight")
+    new_state_dict[f"{prefix_to}.resnets.0.norm1.bias"]               = old_state_dict.pop(f"{prefix_from}.block.0.norm1.bias")
+    new_state_dict[f"{prefix_to}.resnets.0.conv1.weight"]             = old_state_dict.pop(f"{prefix_from}.block.0.conv1.weight")
+    new_state_dict[f"{prefix_to}.resnets.0.conv1.bias"]               = old_state_dict.pop(f"{prefix_from}.block.0.conv1.bias")
+    new_state_dict[f"{prefix_to}.resnets.0.norm2.weight"]             = old_state_dict.pop(f"{prefix_from}.block.0.norm2.weight")
+    new_state_dict[f"{prefix_to}.resnets.0.norm2.bias"]               = old_state_dict.pop(f"{prefix_from}.block.0.norm2.bias")
+    new_state_dict[f"{prefix_to}.resnets.0.conv2.weight"]             = old_state_dict.pop(f"{prefix_from}.block.0.conv2.weight")
+    new_state_dict[f"{prefix_to}.resnets.0.conv2.bias"]               = old_state_dict.pop(f"{prefix_from}.block.0.conv2.bias")
+
+    if f"{prefix_from}.block.0.nin_shortcut.weight" in old_state_dict:
+        new_state_dict[f"{prefix_to}.resnets.0.conv_shortcut.weight"]     = old_state_dict.pop(f"{prefix_from}.block.0.nin_shortcut.weight")
+        new_state_dict[f"{prefix_to}.resnets.0.conv_shortcut.bias"]       = old_state_dict.pop(f"{prefix_from}.block.0.nin_shortcut.bias")
+
+    new_state_dict[f"{prefix_to}.resnets.1.norm1.weight"]             = old_state_dict.pop(f"{prefix_from}.block.1.norm1.weight")
+    new_state_dict[f"{prefix_to}.resnets.1.norm1.bias"]               = old_state_dict.pop(f"{prefix_from}.block.1.norm1.bias")
+    new_state_dict[f"{prefix_to}.resnets.1.conv1.weight"]             = old_state_dict.pop(f"{prefix_from}.block.1.conv1.weight")
+    new_state_dict[f"{prefix_to}.resnets.1.conv1.bias"]               = old_state_dict.pop(f"{prefix_from}.block.1.conv1.bias")
+    new_state_dict[f"{prefix_to}.resnets.1.norm2.weight"]             = old_state_dict.pop(f"{prefix_from}.block.1.norm2.weight")
+    new_state_dict[f"{prefix_to}.resnets.1.norm2.bias"]               = old_state_dict.pop(f"{prefix_from}.block.1.norm2.bias")
+    new_state_dict[f"{prefix_to}.resnets.1.conv2.weight"]             = old_state_dict.pop(f"{prefix_from}.block.1.conv2.weight")
+    new_state_dict[f"{prefix_to}.resnets.1.conv2.bias"]               = old_state_dict.pop(f"{prefix_from}.block.1.conv2.bias")
+
+    if f"{prefix_from}.downsample.conv.weight" in old_state_dict:
+        new_state_dict[f"{prefix_to}.downsamplers.0.conv.weight"]         = old_state_dict.pop(f"{prefix_from}.downsample.conv.weight")
+        new_state_dict[f"{prefix_to}.downsamplers.0.conv.bias"]           = old_state_dict.pop(f"{prefix_from}.downsample.conv.bias")
+
+    if f"{prefix_from}.upsample.conv.weight" in old_state_dict:
+        new_state_dict[f"{prefix_to}.upsamplers.0.conv.weight"]         = old_state_dict.pop(f"{prefix_from}.upsample.conv.weight")
+        new_state_dict[f"{prefix_to}.upsamplers.0.conv.bias"]           = old_state_dict.pop(f"{prefix_from}.upsample.conv.bias")
+
+    if f"{prefix_from}.block.2.norm1.weight" in old_state_dict:
+        new_state_dict[f"{prefix_to}.resnets.2.norm1.weight"]             = old_state_dict.pop(f"{prefix_from}.block.2.norm1.weight")
+        new_state_dict[f"{prefix_to}.resnets.2.norm1.bias"]               = old_state_dict.pop(f"{prefix_from}.block.2.norm1.bias")
+        new_state_dict[f"{prefix_to}.resnets.2.conv1.weight"]             = old_state_dict.pop(f"{prefix_from}.block.2.conv1.weight")
+        new_state_dict[f"{prefix_to}.resnets.2.conv1.bias"]               = old_state_dict.pop(f"{prefix_from}.block.2.conv1.bias")
+        new_state_dict[f"{prefix_to}.resnets.2.norm2.weight"]             = old_state_dict.pop(f"{prefix_from}.block.2.norm2.weight")
+        new_state_dict[f"{prefix_to}.resnets.2.norm2.bias"]               = old_state_dict.pop(f"{prefix_from}.block.2.norm2.bias")
+        new_state_dict[f"{prefix_to}.resnets.2.conv2.weight"]             = old_state_dict.pop(f"{prefix_from}.block.2.conv2.weight")
+        new_state_dict[f"{prefix_to}.resnets.2.conv2.bias"]               = old_state_dict.pop(f"{prefix_from}.block.2.conv2.bias")
+
+    # fmt: on
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/scripts/convert_animatediff_motion_lora_to_diffusers.py b/diffusers/scripts/convert_animatediff_motion_lora_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..21567ffa9e7a95abd86c573761ca90d1648b8002
--- /dev/null
+++ b/diffusers/scripts/convert_animatediff_motion_lora_to_diffusers.py
@@ -0,0 +1,69 @@
+import argparse
+import os
+
+import torch
+from huggingface_hub import create_repo, upload_folder
+from safetensors.torch import load_file, save_file
+
+
+def convert_motion_module(original_state_dict):
+    converted_state_dict = {}
+    for k, v in original_state_dict.items():
+        if "pos_encoder" in k:
+            continue
+
+        else:
+            converted_state_dict[
+                k.replace(".norms.0", ".norm1")
+                .replace(".norms.1", ".norm2")
+                .replace(".ff_norm", ".norm3")
+                .replace(".attention_blocks.0", ".attn1")
+                .replace(".attention_blocks.1", ".attn2")
+                .replace(".temporal_transformer", "")
+            ] = v
+
+    return converted_state_dict
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ckpt_path", type=str, required=True, help="Path to checkpoint")
+    parser.add_argument("--output_path", type=str, required=True, help="Path to output directory")
+    parser.add_argument(
+        "--push_to_hub",
+        action="store_true",
+        default=False,
+        help="Whether to push the converted model to the HF or not",
+    )
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    if args.ckpt_path.endswith(".safetensors"):
+        state_dict = load_file(args.ckpt_path)
+    else:
+        state_dict = torch.load(args.ckpt_path, map_location="cpu")
+
+    if "state_dict" in state_dict.keys():
+        state_dict = state_dict["state_dict"]
+
+    conv_state_dict = convert_motion_module(state_dict)
+
+    # convert to new format
+    output_dict = {}
+    for module_name, params in conv_state_dict.items():
+        if type(params) is not torch.Tensor:
+            continue
+        output_dict.update({f"unet.{module_name}": params})
+
+    os.makedirs(args.output_path, exist_ok=True)
+
+    filepath = os.path.join(args.output_path, "diffusion_pytorch_model.safetensors")
+    save_file(output_dict, filepath)
+
+    if args.push_to_hub:
+        repo_id = create_repo(args.output_path, exist_ok=True).repo_id
+        upload_folder(repo_id=repo_id, folder_path=args.output_path, repo_type="model")
diff --git a/diffusers/scripts/convert_animatediff_motion_module_to_diffusers.py b/diffusers/scripts/convert_animatediff_motion_module_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..e188a6a533e86a837b58b61ba68905d5ab24a008
--- /dev/null
+++ b/diffusers/scripts/convert_animatediff_motion_module_to_diffusers.py
@@ -0,0 +1,62 @@
+import argparse
+
+import torch
+from safetensors.torch import load_file
+
+from diffusers import MotionAdapter
+
+
+def convert_motion_module(original_state_dict):
+    converted_state_dict = {}
+    for k, v in original_state_dict.items():
+        if "pos_encoder" in k:
+            continue
+
+        else:
+            converted_state_dict[
+                k.replace(".norms.0", ".norm1")
+                .replace(".norms.1", ".norm2")
+                .replace(".ff_norm", ".norm3")
+                .replace(".attention_blocks.0", ".attn1")
+                .replace(".attention_blocks.1", ".attn2")
+                .replace(".temporal_transformer", "")
+            ] = v
+
+    return converted_state_dict
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ckpt_path", type=str, required=True)
+    parser.add_argument("--output_path", type=str, required=True)
+    parser.add_argument("--use_motion_mid_block", action="store_true")
+    parser.add_argument("--motion_max_seq_length", type=int, default=32)
+    parser.add_argument("--block_out_channels", nargs="+", default=[320, 640, 1280, 1280], type=int)
+    parser.add_argument("--save_fp16", action="store_true")
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    if args.ckpt_path.endswith(".safetensors"):
+        state_dict = load_file(args.ckpt_path)
+    else:
+        state_dict = torch.load(args.ckpt_path, map_location="cpu")
+
+    if "state_dict" in state_dict.keys():
+        state_dict = state_dict["state_dict"]
+
+    conv_state_dict = convert_motion_module(state_dict)
+    adapter = MotionAdapter(
+        block_out_channels=args.block_out_channels,
+        use_motion_mid_block=args.use_motion_mid_block,
+        motion_max_seq_length=args.motion_max_seq_length,
+    )
+    # skip loading position embeddings
+    adapter.load_state_dict(conv_state_dict, strict=False)
+    adapter.save_pretrained(args.output_path)
+
+    if args.save_fp16:
+        adapter.to(dtype=torch.float16).save_pretrained(args.output_path, variant="fp16")
diff --git a/diffusers/scripts/convert_animatediff_sparsectrl_to_diffusers.py b/diffusers/scripts/convert_animatediff_sparsectrl_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..f246dceb97f841273d0904248e4bfd8c341fda53
--- /dev/null
+++ b/diffusers/scripts/convert_animatediff_sparsectrl_to_diffusers.py
@@ -0,0 +1,83 @@
+import argparse
+from typing import Dict
+
+import torch
+import torch.nn as nn
+
+from diffusers import SparseControlNetModel
+
+
+KEYS_RENAME_MAPPING = {
+    ".attention_blocks.0": ".attn1",
+    ".attention_blocks.1": ".attn2",
+    ".attn1.pos_encoder": ".pos_embed",
+    ".ff_norm": ".norm3",
+    ".norms.0": ".norm1",
+    ".norms.1": ".norm2",
+    ".temporal_transformer": "",
+}
+
+
+def convert(original_state_dict: Dict[str, nn.Module]) -> Dict[str, nn.Module]:
+    converted_state_dict = {}
+
+    for key in list(original_state_dict.keys()):
+        renamed_key = key
+        for new_name, old_name in KEYS_RENAME_MAPPING.items():
+            renamed_key = renamed_key.replace(new_name, old_name)
+        converted_state_dict[renamed_key] = original_state_dict.pop(key)
+
+    return converted_state_dict
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ckpt_path", type=str, required=True, help="Path to checkpoint")
+    parser.add_argument("--output_path", type=str, required=True, help="Path to output directory")
+    parser.add_argument(
+        "--max_motion_seq_length",
+        type=int,
+        default=32,
+        help="Max motion sequence length supported by the motion adapter",
+    )
+    parser.add_argument(
+        "--conditioning_channels", type=int, default=4, help="Number of channels in conditioning input to controlnet"
+    )
+    parser.add_argument(
+        "--use_simplified_condition_embedding",
+        action="store_true",
+        default=False,
+        help="Whether or not to use simplified condition embedding. When `conditioning_channels==4` i.e. latent inputs, set this to `True`. When `conditioning_channels==3` i.e. image inputs, set this to `False`",
+    )
+    parser.add_argument(
+        "--save_fp16",
+        action="store_true",
+        default=False,
+        help="Whether or not to save model in fp16 precision along with fp32",
+    )
+    parser.add_argument(
+        "--push_to_hub", action="store_true", default=False, help="Whether or not to push saved model to the HF hub"
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    state_dict = torch.load(args.ckpt_path, map_location="cpu")
+    if "state_dict" in state_dict.keys():
+        state_dict: dict = state_dict["state_dict"]
+
+    controlnet = SparseControlNetModel(
+        conditioning_channels=args.conditioning_channels,
+        motion_max_seq_length=args.max_motion_seq_length,
+        use_simplified_condition_embedding=args.use_simplified_condition_embedding,
+    )
+
+    state_dict = convert(state_dict)
+    controlnet.load_state_dict(state_dict, strict=True)
+
+    controlnet.save_pretrained(args.output_path, push_to_hub=args.push_to_hub)
+    if args.save_fp16:
+        controlnet = controlnet.to(dtype=torch.float16)
+        controlnet.save_pretrained(args.output_path, variant="fp16", push_to_hub=args.push_to_hub)
diff --git a/diffusers/scripts/convert_asymmetric_vqgan_to_diffusers.py b/diffusers/scripts/convert_asymmetric_vqgan_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffb735e18224a7ef48503367112f5ce8142bdf9c
--- /dev/null
+++ b/diffusers/scripts/convert_asymmetric_vqgan_to_diffusers.py
@@ -0,0 +1,184 @@
+import argparse
+import time
+from pathlib import Path
+from typing import Any, Dict, Literal
+
+import torch
+
+from diffusers import AsymmetricAutoencoderKL
+
+
+ASYMMETRIC_AUTOENCODER_KL_x_1_5_CONFIG = {
+    "in_channels": 3,
+    "out_channels": 3,
+    "down_block_types": [
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+    ],
+    "down_block_out_channels": [128, 256, 512, 512],
+    "layers_per_down_block": 2,
+    "up_block_types": [
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+    ],
+    "up_block_out_channels": [192, 384, 768, 768],
+    "layers_per_up_block": 3,
+    "act_fn": "silu",
+    "latent_channels": 4,
+    "norm_num_groups": 32,
+    "sample_size": 256,
+    "scaling_factor": 0.18215,
+}
+
+ASYMMETRIC_AUTOENCODER_KL_x_2_CONFIG = {
+    "in_channels": 3,
+    "out_channels": 3,
+    "down_block_types": [
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+    ],
+    "down_block_out_channels": [128, 256, 512, 512],
+    "layers_per_down_block": 2,
+    "up_block_types": [
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+    ],
+    "up_block_out_channels": [256, 512, 1024, 1024],
+    "layers_per_up_block": 5,
+    "act_fn": "silu",
+    "latent_channels": 4,
+    "norm_num_groups": 32,
+    "sample_size": 256,
+    "scaling_factor": 0.18215,
+}
+
+
+def convert_asymmetric_autoencoder_kl_state_dict(original_state_dict: Dict[str, Any]) -> Dict[str, Any]:
+    converted_state_dict = {}
+    for k, v in original_state_dict.items():
+        if k.startswith("encoder."):
+            converted_state_dict[
+                k.replace("encoder.down.", "encoder.down_blocks.")
+                .replace("encoder.mid.", "encoder.mid_block.")
+                .replace("encoder.norm_out.", "encoder.conv_norm_out.")
+                .replace(".downsample.", ".downsamplers.0.")
+                .replace(".nin_shortcut.", ".conv_shortcut.")
+                .replace(".block.", ".resnets.")
+                .replace(".block_1.", ".resnets.0.")
+                .replace(".block_2.", ".resnets.1.")
+                .replace(".attn_1.k.", ".attentions.0.to_k.")
+                .replace(".attn_1.q.", ".attentions.0.to_q.")
+                .replace(".attn_1.v.", ".attentions.0.to_v.")
+                .replace(".attn_1.proj_out.", ".attentions.0.to_out.0.")
+                .replace(".attn_1.norm.", ".attentions.0.group_norm.")
+            ] = v
+        elif k.startswith("decoder.") and "up_layers" not in k:
+            converted_state_dict[
+                k.replace("decoder.encoder.", "decoder.condition_encoder.")
+                .replace(".norm_out.", ".conv_norm_out.")
+                .replace(".up.0.", ".up_blocks.3.")
+                .replace(".up.1.", ".up_blocks.2.")
+                .replace(".up.2.", ".up_blocks.1.")
+                .replace(".up.3.", ".up_blocks.0.")
+                .replace(".block.", ".resnets.")
+                .replace("mid", "mid_block")
+                .replace(".0.upsample.", ".0.upsamplers.0.")
+                .replace(".1.upsample.", ".1.upsamplers.0.")
+                .replace(".2.upsample.", ".2.upsamplers.0.")
+                .replace(".nin_shortcut.", ".conv_shortcut.")
+                .replace(".block_1.", ".resnets.0.")
+                .replace(".block_2.", ".resnets.1.")
+                .replace(".attn_1.k.", ".attentions.0.to_k.")
+                .replace(".attn_1.q.", ".attentions.0.to_q.")
+                .replace(".attn_1.v.", ".attentions.0.to_v.")
+                .replace(".attn_1.proj_out.", ".attentions.0.to_out.0.")
+                .replace(".attn_1.norm.", ".attentions.0.group_norm.")
+            ] = v
+        elif k.startswith("quant_conv."):
+            converted_state_dict[k] = v
+        elif k.startswith("post_quant_conv."):
+            converted_state_dict[k] = v
+        else:
+            print(f"  skipping key `{k}`")
+    # fix weights shape
+    for k, v in converted_state_dict.items():
+        if (
+            (k.startswith("encoder.mid_block.attentions.0") or k.startswith("decoder.mid_block.attentions.0"))
+            and k.endswith("weight")
+            and ("to_q" in k or "to_k" in k or "to_v" in k or "to_out" in k)
+        ):
+            converted_state_dict[k] = converted_state_dict[k][:, :, 0, 0]
+
+    return converted_state_dict
+
+
+def get_asymmetric_autoencoder_kl_from_original_checkpoint(
+    scale: Literal["1.5", "2"], original_checkpoint_path: str, map_location: torch.device
+) -> AsymmetricAutoencoderKL:
+    print("Loading original state_dict")
+    original_state_dict = torch.load(original_checkpoint_path, map_location=map_location)
+    original_state_dict = original_state_dict["state_dict"]
+    print("Converting state_dict")
+    converted_state_dict = convert_asymmetric_autoencoder_kl_state_dict(original_state_dict)
+    kwargs = ASYMMETRIC_AUTOENCODER_KL_x_1_5_CONFIG if scale == "1.5" else ASYMMETRIC_AUTOENCODER_KL_x_2_CONFIG
+    print("Initializing AsymmetricAutoencoderKL model")
+    asymmetric_autoencoder_kl = AsymmetricAutoencoderKL(**kwargs)
+    print("Loading weight from converted state_dict")
+    asymmetric_autoencoder_kl.load_state_dict(converted_state_dict)
+    asymmetric_autoencoder_kl.eval()
+    print("AsymmetricAutoencoderKL successfully initialized")
+    return asymmetric_autoencoder_kl
+
+
+if __name__ == "__main__":
+    start = time.time()
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--scale",
+        default=None,
+        type=str,
+        required=True,
+        help="Asymmetric VQGAN scale: `1.5` or `2`",
+    )
+    parser.add_argument(
+        "--original_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the original Asymmetric VQGAN checkpoint",
+    )
+    parser.add_argument(
+        "--output_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to save pretrained AsymmetricAutoencoderKL model",
+    )
+    parser.add_argument(
+        "--map_location",
+        default="cpu",
+        type=str,
+        required=False,
+        help="The device passed to `map_location` when loading the checkpoint",
+    )
+    args = parser.parse_args()
+
+    assert args.scale in ["1.5", "2"], f"{args.scale} should be `1.5` of `2`"
+    assert Path(args.original_checkpoint_path).is_file()
+
+    asymmetric_autoencoder_kl = get_asymmetric_autoencoder_kl_from_original_checkpoint(
+        scale=args.scale,
+        original_checkpoint_path=args.original_checkpoint_path,
+        map_location=torch.device(args.map_location),
+    )
+    print("Saving pretrained AsymmetricAutoencoderKL")
+    asymmetric_autoencoder_kl.save_pretrained(args.output_path)
+    print(f"Done in {time.time() - start:.2f} seconds")
diff --git a/diffusers/scripts/convert_aura_flow_to_diffusers.py b/diffusers/scripts/convert_aura_flow_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..74c34f4851ff203422275b237cb03423a17d54a1
--- /dev/null
+++ b/diffusers/scripts/convert_aura_flow_to_diffusers.py
@@ -0,0 +1,131 @@
+import argparse
+
+import torch
+from huggingface_hub import hf_hub_download
+
+from diffusers.models.transformers.auraflow_transformer_2d import AuraFlowTransformer2DModel
+
+
+def load_original_state_dict(args):
+    model_pt = hf_hub_download(repo_id=args.original_state_dict_repo_id, filename="aura_diffusion_pytorch_model.bin")
+    state_dict = torch.load(model_pt, map_location="cpu")
+    return state_dict
+
+
+def calculate_layers(state_dict_keys, key_prefix):
+    dit_layers = set()
+    for k in state_dict_keys:
+        if key_prefix in k:
+            dit_layers.add(int(k.split(".")[2]))
+    print(f"{key_prefix}: {len(dit_layers)}")
+    return len(dit_layers)
+
+
+# similar to SD3 but only for the last norm layer
+def swap_scale_shift(weight, dim):
+    shift, scale = weight.chunk(2, dim=0)
+    new_weight = torch.cat([scale, shift], dim=0)
+    return new_weight
+
+
+def convert_transformer(state_dict):
+    converted_state_dict = {}
+    state_dict_keys = list(state_dict.keys())
+
+    converted_state_dict["register_tokens"] = state_dict.pop("model.register_tokens")
+    converted_state_dict["pos_embed.pos_embed"] = state_dict.pop("model.positional_encoding")
+    converted_state_dict["pos_embed.proj.weight"] = state_dict.pop("model.init_x_linear.weight")
+    converted_state_dict["pos_embed.proj.bias"] = state_dict.pop("model.init_x_linear.bias")
+
+    converted_state_dict["time_step_proj.linear_1.weight"] = state_dict.pop("model.t_embedder.mlp.0.weight")
+    converted_state_dict["time_step_proj.linear_1.bias"] = state_dict.pop("model.t_embedder.mlp.0.bias")
+    converted_state_dict["time_step_proj.linear_2.weight"] = state_dict.pop("model.t_embedder.mlp.2.weight")
+    converted_state_dict["time_step_proj.linear_2.bias"] = state_dict.pop("model.t_embedder.mlp.2.bias")
+
+    converted_state_dict["context_embedder.weight"] = state_dict.pop("model.cond_seq_linear.weight")
+
+    mmdit_layers = calculate_layers(state_dict_keys, key_prefix="double_layers")
+    single_dit_layers = calculate_layers(state_dict_keys, key_prefix="single_layers")
+
+    # MMDiT blocks 🎸.
+    for i in range(mmdit_layers):
+        # feed-forward
+        path_mapping = {"mlpX": "ff", "mlpC": "ff_context"}
+        weight_mapping = {"c_fc1": "linear_1", "c_fc2": "linear_2", "c_proj": "out_projection"}
+        for orig_k, diffuser_k in path_mapping.items():
+            for k, v in weight_mapping.items():
+                converted_state_dict[f"joint_transformer_blocks.{i}.{diffuser_k}.{v}.weight"] = state_dict.pop(
+                    f"model.double_layers.{i}.{orig_k}.{k}.weight"
+                )
+
+        # norms
+        path_mapping = {"modX": "norm1", "modC": "norm1_context"}
+        for orig_k, diffuser_k in path_mapping.items():
+            converted_state_dict[f"joint_transformer_blocks.{i}.{diffuser_k}.linear.weight"] = state_dict.pop(
+                f"model.double_layers.{i}.{orig_k}.1.weight"
+            )
+
+        # attns
+        x_attn_mapping = {"w2q": "to_q", "w2k": "to_k", "w2v": "to_v", "w2o": "to_out.0"}
+        context_attn_mapping = {"w1q": "add_q_proj", "w1k": "add_k_proj", "w1v": "add_v_proj", "w1o": "to_add_out"}
+        for attn_mapping in [x_attn_mapping, context_attn_mapping]:
+            for k, v in attn_mapping.items():
+                converted_state_dict[f"joint_transformer_blocks.{i}.attn.{v}.weight"] = state_dict.pop(
+                    f"model.double_layers.{i}.attn.{k}.weight"
+                )
+
+    # Single-DiT blocks.
+    for i in range(single_dit_layers):
+        # feed-forward
+        mapping = {"c_fc1": "linear_1", "c_fc2": "linear_2", "c_proj": "out_projection"}
+        for k, v in mapping.items():
+            converted_state_dict[f"single_transformer_blocks.{i}.ff.{v}.weight"] = state_dict.pop(
+                f"model.single_layers.{i}.mlp.{k}.weight"
+            )
+
+        # norms
+        converted_state_dict[f"single_transformer_blocks.{i}.norm1.linear.weight"] = state_dict.pop(
+            f"model.single_layers.{i}.modCX.1.weight"
+        )
+
+        # attns
+        x_attn_mapping = {"w1q": "to_q", "w1k": "to_k", "w1v": "to_v", "w1o": "to_out.0"}
+        for k, v in x_attn_mapping.items():
+            converted_state_dict[f"single_transformer_blocks.{i}.attn.{v}.weight"] = state_dict.pop(
+                f"model.single_layers.{i}.attn.{k}.weight"
+            )
+
+    # Final blocks.
+    converted_state_dict["proj_out.weight"] = state_dict.pop("model.final_linear.weight")
+    converted_state_dict["norm_out.linear.weight"] = swap_scale_shift(state_dict.pop("model.modF.1.weight"), dim=None)
+
+    return converted_state_dict
+
+
+@torch.no_grad()
+def populate_state_dict(args):
+    original_state_dict = load_original_state_dict(args)
+    state_dict_keys = list(original_state_dict.keys())
+    mmdit_layers = calculate_layers(state_dict_keys, key_prefix="double_layers")
+    single_dit_layers = calculate_layers(state_dict_keys, key_prefix="single_layers")
+
+    converted_state_dict = convert_transformer(original_state_dict)
+    model_diffusers = AuraFlowTransformer2DModel(
+        num_mmdit_layers=mmdit_layers, num_single_dit_layers=single_dit_layers
+    )
+    model_diffusers.load_state_dict(converted_state_dict, strict=True)
+
+    return model_diffusers
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--original_state_dict_repo_id", default="AuraDiffusion/auradiffusion-v0.1a0", type=str)
+    parser.add_argument("--dump_path", default="aura-flow", type=str)
+    parser.add_argument("--hub_id", default=None, type=str)
+    args = parser.parse_args()
+
+    model_diffusers = populate_state_dict(args)
+    model_diffusers.save_pretrained(args.dump_path)
+    if args.hub_id is not None:
+        model_diffusers.push_to_hub(args.hub_id)
diff --git a/diffusers/scripts/convert_blipdiffusion_to_diffusers.py b/diffusers/scripts/convert_blipdiffusion_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..03cf67e5476bd43260d2829767fffc220a7801c1
--- /dev/null
+++ b/diffusers/scripts/convert_blipdiffusion_to_diffusers.py
@@ -0,0 +1,343 @@
+"""
+This script requires you to build `LAVIS` from source, since the pip version doesn't have BLIP Diffusion. Follow instructions here: https://github.com/salesforce/LAVIS/tree/main.
+"""
+
+import argparse
+import os
+import tempfile
+
+import torch
+from lavis.models import load_model_and_preprocess
+from transformers import CLIPTokenizer
+from transformers.models.blip_2.configuration_blip_2 import Blip2Config
+
+from diffusers import (
+    AutoencoderKL,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines import BlipDiffusionPipeline
+from diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor
+from diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel
+from diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel
+
+
+BLIP2_CONFIG = {
+    "vision_config": {
+        "hidden_size": 1024,
+        "num_hidden_layers": 23,
+        "num_attention_heads": 16,
+        "image_size": 224,
+        "patch_size": 14,
+        "intermediate_size": 4096,
+        "hidden_act": "quick_gelu",
+    },
+    "qformer_config": {
+        "cross_attention_frequency": 1,
+        "encoder_hidden_size": 1024,
+        "vocab_size": 30523,
+    },
+    "num_query_tokens": 16,
+}
+blip2config = Blip2Config(**BLIP2_CONFIG)
+
+
+def qformer_model_from_original_config():
+    qformer = Blip2QFormerModel(blip2config)
+    return qformer
+
+
+def embeddings_from_original_checkpoint(model, diffuser_embeddings_prefix, original_embeddings_prefix):
+    embeddings = {}
+    embeddings.update(
+        {
+            f"{diffuser_embeddings_prefix}.word_embeddings.weight": model[
+                f"{original_embeddings_prefix}.word_embeddings.weight"
+            ]
+        }
+    )
+    embeddings.update(
+        {
+            f"{diffuser_embeddings_prefix}.position_embeddings.weight": model[
+                f"{original_embeddings_prefix}.position_embeddings.weight"
+            ]
+        }
+    )
+    embeddings.update(
+        {f"{diffuser_embeddings_prefix}.LayerNorm.weight": model[f"{original_embeddings_prefix}.LayerNorm.weight"]}
+    )
+    embeddings.update(
+        {f"{diffuser_embeddings_prefix}.LayerNorm.bias": model[f"{original_embeddings_prefix}.LayerNorm.bias"]}
+    )
+    return embeddings
+
+
+def proj_layer_from_original_checkpoint(model, diffuser_proj_prefix, original_proj_prefix):
+    proj_layer = {}
+    proj_layer.update({f"{diffuser_proj_prefix}.dense1.weight": model[f"{original_proj_prefix}.dense1.weight"]})
+    proj_layer.update({f"{diffuser_proj_prefix}.dense1.bias": model[f"{original_proj_prefix}.dense1.bias"]})
+    proj_layer.update({f"{diffuser_proj_prefix}.dense2.weight": model[f"{original_proj_prefix}.dense2.weight"]})
+    proj_layer.update({f"{diffuser_proj_prefix}.dense2.bias": model[f"{original_proj_prefix}.dense2.bias"]})
+    proj_layer.update({f"{diffuser_proj_prefix}.LayerNorm.weight": model[f"{original_proj_prefix}.LayerNorm.weight"]})
+    proj_layer.update({f"{diffuser_proj_prefix}.LayerNorm.bias": model[f"{original_proj_prefix}.LayerNorm.bias"]})
+    return proj_layer
+
+
+def attention_from_original_checkpoint(model, diffuser_attention_prefix, original_attention_prefix):
+    attention = {}
+    attention.update(
+        {
+            f"{diffuser_attention_prefix}.attention.query.weight": model[
+                f"{original_attention_prefix}.self.query.weight"
+            ]
+        }
+    )
+    attention.update(
+        {f"{diffuser_attention_prefix}.attention.query.bias": model[f"{original_attention_prefix}.self.query.bias"]}
+    )
+    attention.update(
+        {f"{diffuser_attention_prefix}.attention.key.weight": model[f"{original_attention_prefix}.self.key.weight"]}
+    )
+    attention.update(
+        {f"{diffuser_attention_prefix}.attention.key.bias": model[f"{original_attention_prefix}.self.key.bias"]}
+    )
+    attention.update(
+        {
+            f"{diffuser_attention_prefix}.attention.value.weight": model[
+                f"{original_attention_prefix}.self.value.weight"
+            ]
+        }
+    )
+    attention.update(
+        {f"{diffuser_attention_prefix}.attention.value.bias": model[f"{original_attention_prefix}.self.value.bias"]}
+    )
+    attention.update(
+        {f"{diffuser_attention_prefix}.output.dense.weight": model[f"{original_attention_prefix}.output.dense.weight"]}
+    )
+    attention.update(
+        {f"{diffuser_attention_prefix}.output.dense.bias": model[f"{original_attention_prefix}.output.dense.bias"]}
+    )
+    attention.update(
+        {
+            f"{diffuser_attention_prefix}.output.LayerNorm.weight": model[
+                f"{original_attention_prefix}.output.LayerNorm.weight"
+            ]
+        }
+    )
+    attention.update(
+        {
+            f"{diffuser_attention_prefix}.output.LayerNorm.bias": model[
+                f"{original_attention_prefix}.output.LayerNorm.bias"
+            ]
+        }
+    )
+    return attention
+
+
+def output_layers_from_original_checkpoint(model, diffuser_output_prefix, original_output_prefix):
+    output_layers = {}
+    output_layers.update({f"{diffuser_output_prefix}.dense.weight": model[f"{original_output_prefix}.dense.weight"]})
+    output_layers.update({f"{diffuser_output_prefix}.dense.bias": model[f"{original_output_prefix}.dense.bias"]})
+    output_layers.update(
+        {f"{diffuser_output_prefix}.LayerNorm.weight": model[f"{original_output_prefix}.LayerNorm.weight"]}
+    )
+    output_layers.update(
+        {f"{diffuser_output_prefix}.LayerNorm.bias": model[f"{original_output_prefix}.LayerNorm.bias"]}
+    )
+    return output_layers
+
+
+def encoder_from_original_checkpoint(model, diffuser_encoder_prefix, original_encoder_prefix):
+    encoder = {}
+    for i in range(blip2config.qformer_config.num_hidden_layers):
+        encoder.update(
+            attention_from_original_checkpoint(
+                model, f"{diffuser_encoder_prefix}.{i}.attention", f"{original_encoder_prefix}.{i}.attention"
+            )
+        )
+        encoder.update(
+            attention_from_original_checkpoint(
+                model, f"{diffuser_encoder_prefix}.{i}.crossattention", f"{original_encoder_prefix}.{i}.crossattention"
+            )
+        )
+
+        encoder.update(
+            {
+                f"{diffuser_encoder_prefix}.{i}.intermediate.dense.weight": model[
+                    f"{original_encoder_prefix}.{i}.intermediate.dense.weight"
+                ]
+            }
+        )
+        encoder.update(
+            {
+                f"{diffuser_encoder_prefix}.{i}.intermediate.dense.bias": model[
+                    f"{original_encoder_prefix}.{i}.intermediate.dense.bias"
+                ]
+            }
+        )
+        encoder.update(
+            {
+                f"{diffuser_encoder_prefix}.{i}.intermediate_query.dense.weight": model[
+                    f"{original_encoder_prefix}.{i}.intermediate_query.dense.weight"
+                ]
+            }
+        )
+        encoder.update(
+            {
+                f"{diffuser_encoder_prefix}.{i}.intermediate_query.dense.bias": model[
+                    f"{original_encoder_prefix}.{i}.intermediate_query.dense.bias"
+                ]
+            }
+        )
+
+        encoder.update(
+            output_layers_from_original_checkpoint(
+                model, f"{diffuser_encoder_prefix}.{i}.output", f"{original_encoder_prefix}.{i}.output"
+            )
+        )
+        encoder.update(
+            output_layers_from_original_checkpoint(
+                model, f"{diffuser_encoder_prefix}.{i}.output_query", f"{original_encoder_prefix}.{i}.output_query"
+            )
+        )
+    return encoder
+
+
+def visual_encoder_layer_from_original_checkpoint(model, diffuser_prefix, original_prefix):
+    visual_encoder_layer = {}
+
+    visual_encoder_layer.update({f"{diffuser_prefix}.layer_norm1.weight": model[f"{original_prefix}.ln_1.weight"]})
+    visual_encoder_layer.update({f"{diffuser_prefix}.layer_norm1.bias": model[f"{original_prefix}.ln_1.bias"]})
+    visual_encoder_layer.update({f"{diffuser_prefix}.layer_norm2.weight": model[f"{original_prefix}.ln_2.weight"]})
+    visual_encoder_layer.update({f"{diffuser_prefix}.layer_norm2.bias": model[f"{original_prefix}.ln_2.bias"]})
+    visual_encoder_layer.update(
+        {f"{diffuser_prefix}.self_attn.qkv.weight": model[f"{original_prefix}.attn.in_proj_weight"]}
+    )
+    visual_encoder_layer.update(
+        {f"{diffuser_prefix}.self_attn.qkv.bias": model[f"{original_prefix}.attn.in_proj_bias"]}
+    )
+    visual_encoder_layer.update(
+        {f"{diffuser_prefix}.self_attn.projection.weight": model[f"{original_prefix}.attn.out_proj.weight"]}
+    )
+    visual_encoder_layer.update(
+        {f"{diffuser_prefix}.self_attn.projection.bias": model[f"{original_prefix}.attn.out_proj.bias"]}
+    )
+    visual_encoder_layer.update({f"{diffuser_prefix}.mlp.fc1.weight": model[f"{original_prefix}.mlp.c_fc.weight"]})
+    visual_encoder_layer.update({f"{diffuser_prefix}.mlp.fc1.bias": model[f"{original_prefix}.mlp.c_fc.bias"]})
+    visual_encoder_layer.update({f"{diffuser_prefix}.mlp.fc2.weight": model[f"{original_prefix}.mlp.c_proj.weight"]})
+    visual_encoder_layer.update({f"{diffuser_prefix}.mlp.fc2.bias": model[f"{original_prefix}.mlp.c_proj.bias"]})
+
+    return visual_encoder_layer
+
+
+def visual_encoder_from_original_checkpoint(model, diffuser_prefix, original_prefix):
+    visual_encoder = {}
+
+    visual_encoder.update(
+        {
+            f"{diffuser_prefix}.embeddings.class_embedding": model[f"{original_prefix}.class_embedding"]
+            .unsqueeze(0)
+            .unsqueeze(0)
+        }
+    )
+    visual_encoder.update(
+        {
+            f"{diffuser_prefix}.embeddings.position_embedding": model[
+                f"{original_prefix}.positional_embedding"
+            ].unsqueeze(0)
+        }
+    )
+    visual_encoder.update(
+        {f"{diffuser_prefix}.embeddings.patch_embedding.weight": model[f"{original_prefix}.conv1.weight"]}
+    )
+    visual_encoder.update({f"{diffuser_prefix}.pre_layernorm.weight": model[f"{original_prefix}.ln_pre.weight"]})
+    visual_encoder.update({f"{diffuser_prefix}.pre_layernorm.bias": model[f"{original_prefix}.ln_pre.bias"]})
+
+    for i in range(blip2config.vision_config.num_hidden_layers):
+        visual_encoder.update(
+            visual_encoder_layer_from_original_checkpoint(
+                model, f"{diffuser_prefix}.encoder.layers.{i}", f"{original_prefix}.transformer.resblocks.{i}"
+            )
+        )
+
+    visual_encoder.update({f"{diffuser_prefix}.post_layernorm.weight": model["blip.ln_vision.weight"]})
+    visual_encoder.update({f"{diffuser_prefix}.post_layernorm.bias": model["blip.ln_vision.bias"]})
+
+    return visual_encoder
+
+
+def qformer_original_checkpoint_to_diffusers_checkpoint(model):
+    qformer_checkpoint = {}
+    qformer_checkpoint.update(embeddings_from_original_checkpoint(model, "embeddings", "blip.Qformer.bert.embeddings"))
+    qformer_checkpoint.update({"query_tokens": model["blip.query_tokens"]})
+    qformer_checkpoint.update(proj_layer_from_original_checkpoint(model, "proj_layer", "proj_layer"))
+    qformer_checkpoint.update(
+        encoder_from_original_checkpoint(model, "encoder.layer", "blip.Qformer.bert.encoder.layer")
+    )
+    qformer_checkpoint.update(visual_encoder_from_original_checkpoint(model, "visual_encoder", "blip.visual_encoder"))
+    return qformer_checkpoint
+
+
+def get_qformer(model):
+    print("loading qformer")
+
+    qformer = qformer_model_from_original_config()
+    qformer_diffusers_checkpoint = qformer_original_checkpoint_to_diffusers_checkpoint(model)
+
+    load_checkpoint_to_model(qformer_diffusers_checkpoint, qformer)
+
+    print("done loading qformer")
+    return qformer
+
+
+def load_checkpoint_to_model(checkpoint, model):
+    with tempfile.NamedTemporaryFile(delete=False) as file:
+        torch.save(checkpoint, file.name)
+        del checkpoint
+        model.load_state_dict(torch.load(file.name), strict=False)
+
+    os.remove(file.name)
+
+
+def save_blip_diffusion_model(model, args):
+    qformer = get_qformer(model)
+    qformer.eval()
+
+    text_encoder = ContextCLIPTextModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="text_encoder")
+    vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
+
+    unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
+    vae.eval()
+    text_encoder.eval()
+    scheduler = PNDMScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        set_alpha_to_one=False,
+        skip_prk_steps=True,
+    )
+    tokenizer = CLIPTokenizer.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="tokenizer")
+    image_processor = BlipImageProcessor()
+    blip_diffusion = BlipDiffusionPipeline(
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        vae=vae,
+        unet=unet,
+        scheduler=scheduler,
+        qformer=qformer,
+        image_processor=image_processor,
+    )
+    blip_diffusion.save_pretrained(args.checkpoint_path)
+
+
+def main(args):
+    model, _, _ = load_model_and_preprocess("blip_diffusion", "base", device="cpu", is_eval=True)
+    save_blip_diffusion_model(model.state_dict(), args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.")
+    args = parser.parse_args()
+
+    main(args)
diff --git a/diffusers/scripts/convert_cogvideox_to_diffusers.py b/diffusers/scripts/convert_cogvideox_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..4343eaf34038dc67faf905fcba5c57917ece208b
--- /dev/null
+++ b/diffusers/scripts/convert_cogvideox_to_diffusers.py
@@ -0,0 +1,290 @@
+import argparse
+from typing import Any, Dict
+
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from diffusers import (
+    AutoencoderKLCogVideoX,
+    CogVideoXDDIMScheduler,
+    CogVideoXImageToVideoPipeline,
+    CogVideoXPipeline,
+    CogVideoXTransformer3DModel,
+)
+
+
+def reassign_query_key_value_inplace(key: str, state_dict: Dict[str, Any]):
+    to_q_key = key.replace("query_key_value", "to_q")
+    to_k_key = key.replace("query_key_value", "to_k")
+    to_v_key = key.replace("query_key_value", "to_v")
+    to_q, to_k, to_v = torch.chunk(state_dict[key], chunks=3, dim=0)
+    state_dict[to_q_key] = to_q
+    state_dict[to_k_key] = to_k
+    state_dict[to_v_key] = to_v
+    state_dict.pop(key)
+
+
+def reassign_query_key_layernorm_inplace(key: str, state_dict: Dict[str, Any]):
+    layer_id, weight_or_bias = key.split(".")[-2:]
+
+    if "query" in key:
+        new_key = f"transformer_blocks.{layer_id}.attn1.norm_q.{weight_or_bias}"
+    elif "key" in key:
+        new_key = f"transformer_blocks.{layer_id}.attn1.norm_k.{weight_or_bias}"
+
+    state_dict[new_key] = state_dict.pop(key)
+
+
+def reassign_adaln_norm_inplace(key: str, state_dict: Dict[str, Any]):
+    layer_id, _, weight_or_bias = key.split(".")[-3:]
+
+    weights_or_biases = state_dict[key].chunk(12, dim=0)
+    norm1_weights_or_biases = torch.cat(weights_or_biases[0:3] + weights_or_biases[6:9])
+    norm2_weights_or_biases = torch.cat(weights_or_biases[3:6] + weights_or_biases[9:12])
+
+    norm1_key = f"transformer_blocks.{layer_id}.norm1.linear.{weight_or_bias}"
+    state_dict[norm1_key] = norm1_weights_or_biases
+
+    norm2_key = f"transformer_blocks.{layer_id}.norm2.linear.{weight_or_bias}"
+    state_dict[norm2_key] = norm2_weights_or_biases
+
+    state_dict.pop(key)
+
+
+def remove_keys_inplace(key: str, state_dict: Dict[str, Any]):
+    state_dict.pop(key)
+
+
+def replace_up_keys_inplace(key: str, state_dict: Dict[str, Any]):
+    key_split = key.split(".")
+    layer_index = int(key_split[2])
+    replace_layer_index = 4 - 1 - layer_index
+
+    key_split[1] = "up_blocks"
+    key_split[2] = str(replace_layer_index)
+    new_key = ".".join(key_split)
+
+    state_dict[new_key] = state_dict.pop(key)
+
+
+TRANSFORMER_KEYS_RENAME_DICT = {
+    "transformer.final_layernorm": "norm_final",
+    "transformer": "transformer_blocks",
+    "attention": "attn1",
+    "mlp": "ff.net",
+    "dense_h_to_4h": "0.proj",
+    "dense_4h_to_h": "2",
+    ".layers": "",
+    "dense": "to_out.0",
+    "input_layernorm": "norm1.norm",
+    "post_attn1_layernorm": "norm2.norm",
+    "time_embed.0": "time_embedding.linear_1",
+    "time_embed.2": "time_embedding.linear_2",
+    "mixins.patch_embed": "patch_embed",
+    "mixins.final_layer.norm_final": "norm_out.norm",
+    "mixins.final_layer.linear": "proj_out",
+    "mixins.final_layer.adaLN_modulation.1": "norm_out.linear",
+    "mixins.pos_embed.pos_embedding": "patch_embed.pos_embedding",  # Specific to CogVideoX-5b-I2V
+}
+
+TRANSFORMER_SPECIAL_KEYS_REMAP = {
+    "query_key_value": reassign_query_key_value_inplace,
+    "query_layernorm_list": reassign_query_key_layernorm_inplace,
+    "key_layernorm_list": reassign_query_key_layernorm_inplace,
+    "adaln_layer.adaLN_modulations": reassign_adaln_norm_inplace,
+    "embed_tokens": remove_keys_inplace,
+    "freqs_sin": remove_keys_inplace,
+    "freqs_cos": remove_keys_inplace,
+    "position_embedding": remove_keys_inplace,
+}
+
+VAE_KEYS_RENAME_DICT = {
+    "block.": "resnets.",
+    "down.": "down_blocks.",
+    "downsample": "downsamplers.0",
+    "upsample": "upsamplers.0",
+    "nin_shortcut": "conv_shortcut",
+    "encoder.mid.block_1": "encoder.mid_block.resnets.0",
+    "encoder.mid.block_2": "encoder.mid_block.resnets.1",
+    "decoder.mid.block_1": "decoder.mid_block.resnets.0",
+    "decoder.mid.block_2": "decoder.mid_block.resnets.1",
+}
+
+VAE_SPECIAL_KEYS_REMAP = {
+    "loss": remove_keys_inplace,
+    "up.": replace_up_keys_inplace,
+}
+
+TOKENIZER_MAX_LENGTH = 226
+
+
+def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
+    state_dict = saved_dict
+    if "model" in saved_dict.keys():
+        state_dict = state_dict["model"]
+    if "module" in saved_dict.keys():
+        state_dict = state_dict["module"]
+    if "state_dict" in saved_dict.keys():
+        state_dict = state_dict["state_dict"]
+    return state_dict
+
+
+def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
+    state_dict[new_key] = state_dict.pop(old_key)
+
+
+def convert_transformer(
+    ckpt_path: str,
+    num_layers: int,
+    num_attention_heads: int,
+    use_rotary_positional_embeddings: bool,
+    i2v: bool,
+    dtype: torch.dtype,
+):
+    PREFIX_KEY = "model.diffusion_model."
+
+    original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True))
+    transformer = CogVideoXTransformer3DModel(
+        in_channels=32 if i2v else 16,
+        num_layers=num_layers,
+        num_attention_heads=num_attention_heads,
+        use_rotary_positional_embeddings=use_rotary_positional_embeddings,
+        use_learned_positional_embeddings=i2v,
+    ).to(dtype=dtype)
+
+    for key in list(original_state_dict.keys()):
+        new_key = key[len(PREFIX_KEY) :]
+        for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
+            new_key = new_key.replace(replace_key, rename_key)
+        update_state_dict_inplace(original_state_dict, key, new_key)
+
+    for key in list(original_state_dict.keys()):
+        for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
+            if special_key not in key:
+                continue
+            handler_fn_inplace(key, original_state_dict)
+    transformer.load_state_dict(original_state_dict, strict=True)
+    return transformer
+
+
+def convert_vae(ckpt_path: str, scaling_factor: float, dtype: torch.dtype):
+    original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True))
+    vae = AutoencoderKLCogVideoX(scaling_factor=scaling_factor).to(dtype=dtype)
+
+    for key in list(original_state_dict.keys()):
+        new_key = key[:]
+        for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
+            new_key = new_key.replace(replace_key, rename_key)
+        update_state_dict_inplace(original_state_dict, key, new_key)
+
+    for key in list(original_state_dict.keys()):
+        for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
+            if special_key not in key:
+                continue
+            handler_fn_inplace(key, original_state_dict)
+
+    vae.load_state_dict(original_state_dict, strict=True)
+    return vae
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
+    )
+    parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original vae checkpoint")
+    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
+    parser.add_argument("--fp16", action="store_true", default=False, help="Whether to save the model weights in fp16")
+    parser.add_argument("--bf16", action="store_true", default=False, help="Whether to save the model weights in bf16")
+    parser.add_argument(
+        "--push_to_hub", action="store_true", default=False, help="Whether to push to HF Hub after saving"
+    )
+    parser.add_argument(
+        "--text_encoder_cache_dir", type=str, default=None, help="Path to text encoder cache directory"
+    )
+    # For CogVideoX-2B, num_layers is 30. For 5B, it is 42
+    parser.add_argument("--num_layers", type=int, default=30, help="Number of transformer blocks")
+    # For CogVideoX-2B, num_attention_heads is 30. For 5B, it is 48
+    parser.add_argument("--num_attention_heads", type=int, default=30, help="Number of attention heads")
+    # For CogVideoX-2B, use_rotary_positional_embeddings is False. For 5B, it is True
+    parser.add_argument(
+        "--use_rotary_positional_embeddings", action="store_true", default=False, help="Whether to use RoPE or not"
+    )
+    # For CogVideoX-2B, scaling_factor is 1.15258426. For 5B, it is 0.7
+    parser.add_argument("--scaling_factor", type=float, default=1.15258426, help="Scaling factor in the VAE")
+    # For CogVideoX-2B, snr_shift_scale is 3.0. For 5B, it is 1.0
+    parser.add_argument("--snr_shift_scale", type=float, default=3.0, help="Scaling factor in the VAE")
+    parser.add_argument("--i2v", action="store_true", default=False, help="Whether to save the model weights in fp16")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    transformer = None
+    vae = None
+
+    if args.fp16 and args.bf16:
+        raise ValueError("You cannot pass both --fp16 and --bf16 at the same time.")
+
+    dtype = torch.float16 if args.fp16 else torch.bfloat16 if args.bf16 else torch.float32
+
+    if args.transformer_ckpt_path is not None:
+        transformer = convert_transformer(
+            args.transformer_ckpt_path,
+            args.num_layers,
+            args.num_attention_heads,
+            args.use_rotary_positional_embeddings,
+            args.i2v,
+            dtype,
+        )
+    if args.vae_ckpt_path is not None:
+        vae = convert_vae(args.vae_ckpt_path, args.scaling_factor, dtype)
+
+    text_encoder_id = "google/t5-v1_1-xxl"
+    tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH)
+    text_encoder = T5EncoderModel.from_pretrained(text_encoder_id, cache_dir=args.text_encoder_cache_dir)
+
+    # Apparently, the conversion does not work anymore without this :shrug:
+    for param in text_encoder.parameters():
+        param.data = param.data.contiguous()
+
+    scheduler = CogVideoXDDIMScheduler.from_config(
+        {
+            "snr_shift_scale": args.snr_shift_scale,
+            "beta_end": 0.012,
+            "beta_schedule": "scaled_linear",
+            "beta_start": 0.00085,
+            "clip_sample": False,
+            "num_train_timesteps": 1000,
+            "prediction_type": "v_prediction",
+            "rescale_betas_zero_snr": True,
+            "set_alpha_to_one": True,
+            "timestep_spacing": "trailing",
+        }
+    )
+    if args.i2v:
+        pipeline_cls = CogVideoXImageToVideoPipeline
+    else:
+        pipeline_cls = CogVideoXPipeline
+
+    pipe = pipeline_cls(
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        vae=vae,
+        transformer=transformer,
+        scheduler=scheduler,
+    )
+
+    if args.fp16:
+        pipe = pipe.to(dtype=torch.float16)
+    if args.bf16:
+        pipe = pipe.to(dtype=torch.bfloat16)
+
+    # We don't use variant here because the model must be run in fp16 (2B) or bf16 (5B). It would be weird
+    # for users to specify variant when the default is not fp32 and they want to run with the correct default (which
+    # is either fp16/bf16 here).
+
+    # This is necessary This is necessary for users with insufficient memory,
+    # such as those using Colab and notebooks, as it can save some memory used for model loading.
+    pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB", push_to_hub=args.push_to_hub)
diff --git a/diffusers/scripts/convert_consistency_decoder.py b/diffusers/scripts/convert_consistency_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cb5fc50dd60b2956ba9dd3c4044c49349387943
--- /dev/null
+++ b/diffusers/scripts/convert_consistency_decoder.py
@@ -0,0 +1,1128 @@
+import math
+import os
+import urllib
+import warnings
+from argparse import ArgumentParser
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from huggingface_hub.utils import insecure_hashlib
+from safetensors.torch import load_file as stl
+from tqdm import tqdm
+
+from diffusers import AutoencoderKL, ConsistencyDecoderVAE, DiffusionPipeline, StableDiffusionPipeline, UNet2DModel
+from diffusers.models.autoencoders.vae import Encoder
+from diffusers.models.embeddings import TimestepEmbedding
+from diffusers.models.unets.unet_2d_blocks import ResnetDownsampleBlock2D, ResnetUpsampleBlock2D, UNetMidBlock2D
+
+
+args = ArgumentParser()
+args.add_argument("--save_pretrained", required=False, default=None, type=str)
+args.add_argument("--test_image", required=True, type=str)
+args = args.parse_args()
+
+
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    # from: https://github.com/openai/guided-diffusion/blob/22e0df8183507e13a7813f8d38d51b072ca1e67c/guided_diffusion/gaussian_diffusion.py#L895    """
+    res = arr[timesteps].float()
+    dims_to_append = len(broadcast_shape) - len(res.shape)
+    return res[(...,) + (None,) * dims_to_append]
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
+    # from: https://github.com/openai/guided-diffusion/blob/22e0df8183507e13a7813f8d38d51b072ca1e67c/guided_diffusion/gaussian_diffusion.py#L45
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return torch.tensor(betas)
+
+
+def _download(url: str, root: str):
+    os.makedirs(root, exist_ok=True)
+    filename = os.path.basename(url)
+
+    expected_sha256 = url.split("/")[-2]
+    download_target = os.path.join(root, filename)
+
+    if os.path.exists(download_target) and not os.path.isfile(download_target):
+        raise RuntimeError(f"{download_target} exists and is not a regular file")
+
+    if os.path.isfile(download_target):
+        if insecure_hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
+            return download_target
+        else:
+            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
+
+    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
+        with tqdm(
+            total=int(source.info().get("Content-Length")),
+            ncols=80,
+            unit="iB",
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+
+                output.write(buffer)
+                loop.update(len(buffer))
+
+    if insecure_hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
+        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match")
+
+    return download_target
+
+
+class ConsistencyDecoder:
+    def __init__(self, device="cuda:0", download_root=os.path.expanduser("~/.cache/clip")):
+        self.n_distilled_steps = 64
+        download_target = _download(
+            "https://openaipublic.azureedge.net/diff-vae/c9cebd3132dd9c42936d803e33424145a748843c8f716c0814838bdc8a2fe7cb/decoder.pt",
+            download_root,
+        )
+        self.ckpt = torch.jit.load(download_target).to(device)
+        self.device = device
+        sigma_data = 0.5
+        betas = betas_for_alpha_bar(1024, lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2).to(device)
+        alphas = 1.0 - betas
+        alphas_cumprod = torch.cumprod(alphas, dim=0)
+        self.sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
+        sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / alphas_cumprod)
+        sigmas = torch.sqrt(1.0 / alphas_cumprod - 1)
+        self.c_skip = sqrt_recip_alphas_cumprod * sigma_data**2 / (sigmas**2 + sigma_data**2)
+        self.c_out = sigmas * sigma_data / (sigmas**2 + sigma_data**2) ** 0.5
+        self.c_in = sqrt_recip_alphas_cumprod / (sigmas**2 + sigma_data**2) ** 0.5
+
+    @staticmethod
+    def round_timesteps(timesteps, total_timesteps, n_distilled_steps, truncate_start=True):
+        with torch.no_grad():
+            space = torch.div(total_timesteps, n_distilled_steps, rounding_mode="floor")
+            rounded_timesteps = (torch.div(timesteps, space, rounding_mode="floor") + 1) * space
+            if truncate_start:
+                rounded_timesteps[rounded_timesteps == total_timesteps] -= space
+            else:
+                rounded_timesteps[rounded_timesteps == total_timesteps] -= space
+                rounded_timesteps[rounded_timesteps == 0] += space
+            return rounded_timesteps
+
+    @staticmethod
+    def ldm_transform_latent(z, extra_scale_factor=1):
+        channel_means = [0.38862467, 0.02253063, 0.07381133, -0.0171294]
+        channel_stds = [0.9654121, 1.0440036, 0.76147926, 0.77022034]
+
+        if len(z.shape) != 4:
+            raise ValueError()
+
+        z = z * 0.18215
+        channels = [z[:, i] for i in range(z.shape[1])]
+
+        channels = [extra_scale_factor * (c - channel_means[i]) / channel_stds[i] for i, c in enumerate(channels)]
+        return torch.stack(channels, dim=1)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        features: torch.Tensor,
+        schedule=[1.0, 0.5],
+        generator=None,
+    ):
+        features = self.ldm_transform_latent(features)
+        ts = self.round_timesteps(
+            torch.arange(0, 1024),
+            1024,
+            self.n_distilled_steps,
+            truncate_start=False,
+        )
+        shape = (
+            features.size(0),
+            3,
+            8 * features.size(2),
+            8 * features.size(3),
+        )
+        x_start = torch.zeros(shape, device=features.device, dtype=features.dtype)
+        schedule_timesteps = [int((1024 - 1) * s) for s in schedule]
+        for i in schedule_timesteps:
+            t = ts[i].item()
+            t_ = torch.tensor([t] * features.shape[0]).to(self.device)
+            # noise = torch.randn_like(x_start)
+            noise = torch.randn(x_start.shape, dtype=x_start.dtype, generator=generator).to(device=x_start.device)
+            x_start = (
+                _extract_into_tensor(self.sqrt_alphas_cumprod, t_, x_start.shape) * x_start
+                + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t_, x_start.shape) * noise
+            )
+            c_in = _extract_into_tensor(self.c_in, t_, x_start.shape)
+
+            import torch.nn.functional as F
+
+            from diffusers import UNet2DModel
+
+            if isinstance(self.ckpt, UNet2DModel):
+                input = torch.concat([c_in * x_start, F.upsample_nearest(features, scale_factor=8)], dim=1)
+                model_output = self.ckpt(input, t_).sample
+            else:
+                model_output = self.ckpt(c_in * x_start, t_, features=features)
+
+            B, C = x_start.shape[:2]
+            model_output, _ = torch.split(model_output, C, dim=1)
+            pred_xstart = (
+                _extract_into_tensor(self.c_out, t_, x_start.shape) * model_output
+                + _extract_into_tensor(self.c_skip, t_, x_start.shape) * x_start
+            ).clamp(-1, 1)
+            x_start = pred_xstart
+        return x_start
+
+
+def save_image(image, name):
+    import numpy as np
+    from PIL import Image
+
+    image = image[0].cpu().numpy()
+    image = (image + 1.0) * 127.5
+    image = image.clip(0, 255).astype(np.uint8)
+    image = Image.fromarray(image.transpose(1, 2, 0))
+    image.save(name)
+
+
+def load_image(uri, size=None, center_crop=False):
+    import numpy as np
+    from PIL import Image
+
+    image = Image.open(uri)
+    if center_crop:
+        image = image.crop(
+            (
+                (image.width - min(image.width, image.height)) // 2,
+                (image.height - min(image.width, image.height)) // 2,
+                (image.width + min(image.width, image.height)) // 2,
+                (image.height + min(image.width, image.height)) // 2,
+            )
+        )
+    if size is not None:
+        image = image.resize(size)
+    image = torch.tensor(np.array(image).transpose(2, 0, 1)).unsqueeze(0).float()
+    image = image / 127.5 - 1.0
+    return image
+
+
+class TimestepEmbedding_(nn.Module):
+    def __init__(self, n_time=1024, n_emb=320, n_out=1280) -> None:
+        super().__init__()
+        self.emb = nn.Embedding(n_time, n_emb)
+        self.f_1 = nn.Linear(n_emb, n_out)
+        self.f_2 = nn.Linear(n_out, n_out)
+
+    def forward(self, x) -> torch.Tensor:
+        x = self.emb(x)
+        x = self.f_1(x)
+        x = F.silu(x)
+        return self.f_2(x)
+
+
+class ImageEmbedding(nn.Module):
+    def __init__(self, in_channels=7, out_channels=320) -> None:
+        super().__init__()
+        self.f = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
+
+    def forward(self, x) -> torch.Tensor:
+        return self.f(x)
+
+
+class ImageUnembedding(nn.Module):
+    def __init__(self, in_channels=320, out_channels=6) -> None:
+        super().__init__()
+        self.gn = nn.GroupNorm(32, in_channels)
+        self.f = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
+
+    def forward(self, x) -> torch.Tensor:
+        return self.f(F.silu(self.gn(x)))
+
+
+class ConvResblock(nn.Module):
+    def __init__(self, in_features=320, out_features=320) -> None:
+        super().__init__()
+        self.f_t = nn.Linear(1280, out_features * 2)
+
+        self.gn_1 = nn.GroupNorm(32, in_features)
+        self.f_1 = nn.Conv2d(in_features, out_features, kernel_size=3, padding=1)
+
+        self.gn_2 = nn.GroupNorm(32, out_features)
+        self.f_2 = nn.Conv2d(out_features, out_features, kernel_size=3, padding=1)
+
+        skip_conv = in_features != out_features
+        self.f_s = nn.Conv2d(in_features, out_features, kernel_size=1, padding=0) if skip_conv else nn.Identity()
+
+    def forward(self, x, t):
+        x_skip = x
+        t = self.f_t(F.silu(t))
+        t = t.chunk(2, dim=1)
+        t_1 = t[0].unsqueeze(dim=2).unsqueeze(dim=3) + 1
+        t_2 = t[1].unsqueeze(dim=2).unsqueeze(dim=3)
+
+        gn_1 = F.silu(self.gn_1(x))
+        f_1 = self.f_1(gn_1)
+
+        gn_2 = self.gn_2(f_1)
+
+        return self.f_s(x_skip) + self.f_2(F.silu(gn_2 * t_1 + t_2))
+
+
+# Also ConvResblock
+class Downsample(nn.Module):
+    def __init__(self, in_channels=320) -> None:
+        super().__init__()
+        self.f_t = nn.Linear(1280, in_channels * 2)
+
+        self.gn_1 = nn.GroupNorm(32, in_channels)
+        self.f_1 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
+        self.gn_2 = nn.GroupNorm(32, in_channels)
+
+        self.f_2 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
+
+    def forward(self, x, t) -> torch.Tensor:
+        x_skip = x
+
+        t = self.f_t(F.silu(t))
+        t_1, t_2 = t.chunk(2, dim=1)
+        t_1 = t_1.unsqueeze(2).unsqueeze(3) + 1
+        t_2 = t_2.unsqueeze(2).unsqueeze(3)
+
+        gn_1 = F.silu(self.gn_1(x))
+        avg_pool2d = F.avg_pool2d(gn_1, kernel_size=(2, 2), stride=None)
+
+        f_1 = self.f_1(avg_pool2d)
+        gn_2 = self.gn_2(f_1)
+
+        f_2 = self.f_2(F.silu(t_2 + (t_1 * gn_2)))
+
+        return f_2 + F.avg_pool2d(x_skip, kernel_size=(2, 2), stride=None)
+
+
+# Also ConvResblock
+class Upsample(nn.Module):
+    def __init__(self, in_channels=1024) -> None:
+        super().__init__()
+        self.f_t = nn.Linear(1280, in_channels * 2)
+
+        self.gn_1 = nn.GroupNorm(32, in_channels)
+        self.f_1 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
+        self.gn_2 = nn.GroupNorm(32, in_channels)
+
+        self.f_2 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
+
+    def forward(self, x, t) -> torch.Tensor:
+        x_skip = x
+
+        t = self.f_t(F.silu(t))
+        t_1, t_2 = t.chunk(2, dim=1)
+        t_1 = t_1.unsqueeze(2).unsqueeze(3) + 1
+        t_2 = t_2.unsqueeze(2).unsqueeze(3)
+
+        gn_1 = F.silu(self.gn_1(x))
+        upsample = F.upsample_nearest(gn_1, scale_factor=2)
+        f_1 = self.f_1(upsample)
+        gn_2 = self.gn_2(f_1)
+
+        f_2 = self.f_2(F.silu(t_2 + (t_1 * gn_2)))
+
+        return f_2 + F.upsample_nearest(x_skip, scale_factor=2)
+
+
+class ConvUNetVAE(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.embed_image = ImageEmbedding()
+        self.embed_time = TimestepEmbedding_()
+
+        down_0 = nn.ModuleList(
+            [
+                ConvResblock(320, 320),
+                ConvResblock(320, 320),
+                ConvResblock(320, 320),
+                Downsample(320),
+            ]
+        )
+        down_1 = nn.ModuleList(
+            [
+                ConvResblock(320, 640),
+                ConvResblock(640, 640),
+                ConvResblock(640, 640),
+                Downsample(640),
+            ]
+        )
+        down_2 = nn.ModuleList(
+            [
+                ConvResblock(640, 1024),
+                ConvResblock(1024, 1024),
+                ConvResblock(1024, 1024),
+                Downsample(1024),
+            ]
+        )
+        down_3 = nn.ModuleList(
+            [
+                ConvResblock(1024, 1024),
+                ConvResblock(1024, 1024),
+                ConvResblock(1024, 1024),
+            ]
+        )
+        self.down = nn.ModuleList(
+            [
+                down_0,
+                down_1,
+                down_2,
+                down_3,
+            ]
+        )
+
+        self.mid = nn.ModuleList(
+            [
+                ConvResblock(1024, 1024),
+                ConvResblock(1024, 1024),
+            ]
+        )
+
+        up_3 = nn.ModuleList(
+            [
+                ConvResblock(1024 * 2, 1024),
+                ConvResblock(1024 * 2, 1024),
+                ConvResblock(1024 * 2, 1024),
+                ConvResblock(1024 * 2, 1024),
+                Upsample(1024),
+            ]
+        )
+        up_2 = nn.ModuleList(
+            [
+                ConvResblock(1024 * 2, 1024),
+                ConvResblock(1024 * 2, 1024),
+                ConvResblock(1024 * 2, 1024),
+                ConvResblock(1024 + 640, 1024),
+                Upsample(1024),
+            ]
+        )
+        up_1 = nn.ModuleList(
+            [
+                ConvResblock(1024 + 640, 640),
+                ConvResblock(640 * 2, 640),
+                ConvResblock(640 * 2, 640),
+                ConvResblock(320 + 640, 640),
+                Upsample(640),
+            ]
+        )
+        up_0 = nn.ModuleList(
+            [
+                ConvResblock(320 + 640, 320),
+                ConvResblock(320 * 2, 320),
+                ConvResblock(320 * 2, 320),
+                ConvResblock(320 * 2, 320),
+            ]
+        )
+        self.up = nn.ModuleList(
+            [
+                up_0,
+                up_1,
+                up_2,
+                up_3,
+            ]
+        )
+
+        self.output = ImageUnembedding()
+
+    def forward(self, x, t, features) -> torch.Tensor:
+        converted = hasattr(self, "converted") and self.converted
+
+        x = torch.cat([x, F.upsample_nearest(features, scale_factor=8)], dim=1)
+
+        if converted:
+            t = self.time_embedding(self.time_proj(t))
+        else:
+            t = self.embed_time(t)
+
+        x = self.embed_image(x)
+
+        skips = [x]
+        for i, down in enumerate(self.down):
+            if converted and i in [0, 1, 2, 3]:
+                x, skips_ = down(x, t)
+                for skip in skips_:
+                    skips.append(skip)
+            else:
+                for block in down:
+                    x = block(x, t)
+                    skips.append(x)
+            print(x.float().abs().sum())
+
+        if converted:
+            x = self.mid(x, t)
+        else:
+            for i in range(2):
+                x = self.mid[i](x, t)
+        print(x.float().abs().sum())
+
+        for i, up in enumerate(self.up[::-1]):
+            if converted and i in [0, 1, 2, 3]:
+                skip_4 = skips.pop()
+                skip_3 = skips.pop()
+                skip_2 = skips.pop()
+                skip_1 = skips.pop()
+                skips_ = (skip_1, skip_2, skip_3, skip_4)
+                x = up(x, skips_, t)
+            else:
+                for block in up:
+                    if isinstance(block, ConvResblock):
+                        x = torch.concat([x, skips.pop()], dim=1)
+                    x = block(x, t)
+
+        return self.output(x)
+
+
+def rename_state_dict_key(k):
+    k = k.replace("blocks.", "")
+    for i in range(5):
+        k = k.replace(f"down_{i}_", f"down.{i}.")
+        k = k.replace(f"conv_{i}.", f"{i}.")
+        k = k.replace(f"up_{i}_", f"up.{i}.")
+        k = k.replace(f"mid_{i}", f"mid.{i}")
+    k = k.replace("upsamp.", "4.")
+    k = k.replace("downsamp.", "3.")
+    k = k.replace("f_t.w", "f_t.weight").replace("f_t.b", "f_t.bias")
+    k = k.replace("f_1.w", "f_1.weight").replace("f_1.b", "f_1.bias")
+    k = k.replace("f_2.w", "f_2.weight").replace("f_2.b", "f_2.bias")
+    k = k.replace("f_s.w", "f_s.weight").replace("f_s.b", "f_s.bias")
+    k = k.replace("f.w", "f.weight").replace("f.b", "f.bias")
+    k = k.replace("gn_1.g", "gn_1.weight").replace("gn_1.b", "gn_1.bias")
+    k = k.replace("gn_2.g", "gn_2.weight").replace("gn_2.b", "gn_2.bias")
+    k = k.replace("gn.g", "gn.weight").replace("gn.b", "gn.bias")
+    return k
+
+
+def rename_state_dict(sd, embedding):
+    sd = {rename_state_dict_key(k): v for k, v in sd.items()}
+    sd["embed_time.emb.weight"] = embedding["weight"]
+    return sd
+
+
+# encode with stable diffusion vae
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+pipe.vae.cuda()
+
+# construct original decoder with jitted model
+decoder_consistency = ConsistencyDecoder(device="cuda:0")
+
+# construct UNet code, overwrite the decoder with conv_unet_vae
+model = ConvUNetVAE()
+model.load_state_dict(
+    rename_state_dict(
+        stl("consistency_decoder.safetensors"),
+        stl("embedding.safetensors"),
+    )
+)
+model = model.cuda()
+
+decoder_consistency.ckpt = model
+
+image = load_image(args.test_image, size=(256, 256), center_crop=True)
+latent = pipe.vae.encode(image.half().cuda()).latent_dist.sample()
+
+# decode with gan
+sample_gan = pipe.vae.decode(latent).sample.detach()
+save_image(sample_gan, "gan.png")
+
+# decode with conv_unet_vae
+sample_consistency_orig = decoder_consistency(latent, generator=torch.Generator("cpu").manual_seed(0))
+save_image(sample_consistency_orig, "con_orig.png")
+
+
+########### conversion
+
+print("CONVERSION")
+
+print("DOWN BLOCK ONE")
+
+block_one_sd_orig = model.down[0].state_dict()
+block_one_sd_new = {}
+
+for i in range(3):
+    block_one_sd_new[f"resnets.{i}.norm1.weight"] = block_one_sd_orig.pop(f"{i}.gn_1.weight")
+    block_one_sd_new[f"resnets.{i}.norm1.bias"] = block_one_sd_orig.pop(f"{i}.gn_1.bias")
+    block_one_sd_new[f"resnets.{i}.conv1.weight"] = block_one_sd_orig.pop(f"{i}.f_1.weight")
+    block_one_sd_new[f"resnets.{i}.conv1.bias"] = block_one_sd_orig.pop(f"{i}.f_1.bias")
+    block_one_sd_new[f"resnets.{i}.time_emb_proj.weight"] = block_one_sd_orig.pop(f"{i}.f_t.weight")
+    block_one_sd_new[f"resnets.{i}.time_emb_proj.bias"] = block_one_sd_orig.pop(f"{i}.f_t.bias")
+    block_one_sd_new[f"resnets.{i}.norm2.weight"] = block_one_sd_orig.pop(f"{i}.gn_2.weight")
+    block_one_sd_new[f"resnets.{i}.norm2.bias"] = block_one_sd_orig.pop(f"{i}.gn_2.bias")
+    block_one_sd_new[f"resnets.{i}.conv2.weight"] = block_one_sd_orig.pop(f"{i}.f_2.weight")
+    block_one_sd_new[f"resnets.{i}.conv2.bias"] = block_one_sd_orig.pop(f"{i}.f_2.bias")
+
+block_one_sd_new["downsamplers.0.norm1.weight"] = block_one_sd_orig.pop("3.gn_1.weight")
+block_one_sd_new["downsamplers.0.norm1.bias"] = block_one_sd_orig.pop("3.gn_1.bias")
+block_one_sd_new["downsamplers.0.conv1.weight"] = block_one_sd_orig.pop("3.f_1.weight")
+block_one_sd_new["downsamplers.0.conv1.bias"] = block_one_sd_orig.pop("3.f_1.bias")
+block_one_sd_new["downsamplers.0.time_emb_proj.weight"] = block_one_sd_orig.pop("3.f_t.weight")
+block_one_sd_new["downsamplers.0.time_emb_proj.bias"] = block_one_sd_orig.pop("3.f_t.bias")
+block_one_sd_new["downsamplers.0.norm2.weight"] = block_one_sd_orig.pop("3.gn_2.weight")
+block_one_sd_new["downsamplers.0.norm2.bias"] = block_one_sd_orig.pop("3.gn_2.bias")
+block_one_sd_new["downsamplers.0.conv2.weight"] = block_one_sd_orig.pop("3.f_2.weight")
+block_one_sd_new["downsamplers.0.conv2.bias"] = block_one_sd_orig.pop("3.f_2.bias")
+
+assert len(block_one_sd_orig) == 0
+
+block_one = ResnetDownsampleBlock2D(
+    in_channels=320,
+    out_channels=320,
+    temb_channels=1280,
+    num_layers=3,
+    add_downsample=True,
+    resnet_time_scale_shift="scale_shift",
+    resnet_eps=1e-5,
+)
+
+block_one.load_state_dict(block_one_sd_new)
+
+print("DOWN BLOCK TWO")
+
+block_two_sd_orig = model.down[1].state_dict()
+block_two_sd_new = {}
+
+for i in range(3):
+    block_two_sd_new[f"resnets.{i}.norm1.weight"] = block_two_sd_orig.pop(f"{i}.gn_1.weight")
+    block_two_sd_new[f"resnets.{i}.norm1.bias"] = block_two_sd_orig.pop(f"{i}.gn_1.bias")
+    block_two_sd_new[f"resnets.{i}.conv1.weight"] = block_two_sd_orig.pop(f"{i}.f_1.weight")
+    block_two_sd_new[f"resnets.{i}.conv1.bias"] = block_two_sd_orig.pop(f"{i}.f_1.bias")
+    block_two_sd_new[f"resnets.{i}.time_emb_proj.weight"] = block_two_sd_orig.pop(f"{i}.f_t.weight")
+    block_two_sd_new[f"resnets.{i}.time_emb_proj.bias"] = block_two_sd_orig.pop(f"{i}.f_t.bias")
+    block_two_sd_new[f"resnets.{i}.norm2.weight"] = block_two_sd_orig.pop(f"{i}.gn_2.weight")
+    block_two_sd_new[f"resnets.{i}.norm2.bias"] = block_two_sd_orig.pop(f"{i}.gn_2.bias")
+    block_two_sd_new[f"resnets.{i}.conv2.weight"] = block_two_sd_orig.pop(f"{i}.f_2.weight")
+    block_two_sd_new[f"resnets.{i}.conv2.bias"] = block_two_sd_orig.pop(f"{i}.f_2.bias")
+
+    if i == 0:
+        block_two_sd_new[f"resnets.{i}.conv_shortcut.weight"] = block_two_sd_orig.pop(f"{i}.f_s.weight")
+        block_two_sd_new[f"resnets.{i}.conv_shortcut.bias"] = block_two_sd_orig.pop(f"{i}.f_s.bias")
+
+block_two_sd_new["downsamplers.0.norm1.weight"] = block_two_sd_orig.pop("3.gn_1.weight")
+block_two_sd_new["downsamplers.0.norm1.bias"] = block_two_sd_orig.pop("3.gn_1.bias")
+block_two_sd_new["downsamplers.0.conv1.weight"] = block_two_sd_orig.pop("3.f_1.weight")
+block_two_sd_new["downsamplers.0.conv1.bias"] = block_two_sd_orig.pop("3.f_1.bias")
+block_two_sd_new["downsamplers.0.time_emb_proj.weight"] = block_two_sd_orig.pop("3.f_t.weight")
+block_two_sd_new["downsamplers.0.time_emb_proj.bias"] = block_two_sd_orig.pop("3.f_t.bias")
+block_two_sd_new["downsamplers.0.norm2.weight"] = block_two_sd_orig.pop("3.gn_2.weight")
+block_two_sd_new["downsamplers.0.norm2.bias"] = block_two_sd_orig.pop("3.gn_2.bias")
+block_two_sd_new["downsamplers.0.conv2.weight"] = block_two_sd_orig.pop("3.f_2.weight")
+block_two_sd_new["downsamplers.0.conv2.bias"] = block_two_sd_orig.pop("3.f_2.bias")
+
+assert len(block_two_sd_orig) == 0
+
+block_two = ResnetDownsampleBlock2D(
+    in_channels=320,
+    out_channels=640,
+    temb_channels=1280,
+    num_layers=3,
+    add_downsample=True,
+    resnet_time_scale_shift="scale_shift",
+    resnet_eps=1e-5,
+)
+
+block_two.load_state_dict(block_two_sd_new)
+
+print("DOWN BLOCK THREE")
+
+block_three_sd_orig = model.down[2].state_dict()
+block_three_sd_new = {}
+
+for i in range(3):
+    block_three_sd_new[f"resnets.{i}.norm1.weight"] = block_three_sd_orig.pop(f"{i}.gn_1.weight")
+    block_three_sd_new[f"resnets.{i}.norm1.bias"] = block_three_sd_orig.pop(f"{i}.gn_1.bias")
+    block_three_sd_new[f"resnets.{i}.conv1.weight"] = block_three_sd_orig.pop(f"{i}.f_1.weight")
+    block_three_sd_new[f"resnets.{i}.conv1.bias"] = block_three_sd_orig.pop(f"{i}.f_1.bias")
+    block_three_sd_new[f"resnets.{i}.time_emb_proj.weight"] = block_three_sd_orig.pop(f"{i}.f_t.weight")
+    block_three_sd_new[f"resnets.{i}.time_emb_proj.bias"] = block_three_sd_orig.pop(f"{i}.f_t.bias")
+    block_three_sd_new[f"resnets.{i}.norm2.weight"] = block_three_sd_orig.pop(f"{i}.gn_2.weight")
+    block_three_sd_new[f"resnets.{i}.norm2.bias"] = block_three_sd_orig.pop(f"{i}.gn_2.bias")
+    block_three_sd_new[f"resnets.{i}.conv2.weight"] = block_three_sd_orig.pop(f"{i}.f_2.weight")
+    block_three_sd_new[f"resnets.{i}.conv2.bias"] = block_three_sd_orig.pop(f"{i}.f_2.bias")
+
+    if i == 0:
+        block_three_sd_new[f"resnets.{i}.conv_shortcut.weight"] = block_three_sd_orig.pop(f"{i}.f_s.weight")
+        block_three_sd_new[f"resnets.{i}.conv_shortcut.bias"] = block_three_sd_orig.pop(f"{i}.f_s.bias")
+
+block_three_sd_new["downsamplers.0.norm1.weight"] = block_three_sd_orig.pop("3.gn_1.weight")
+block_three_sd_new["downsamplers.0.norm1.bias"] = block_three_sd_orig.pop("3.gn_1.bias")
+block_three_sd_new["downsamplers.0.conv1.weight"] = block_three_sd_orig.pop("3.f_1.weight")
+block_three_sd_new["downsamplers.0.conv1.bias"] = block_three_sd_orig.pop("3.f_1.bias")
+block_three_sd_new["downsamplers.0.time_emb_proj.weight"] = block_three_sd_orig.pop("3.f_t.weight")
+block_three_sd_new["downsamplers.0.time_emb_proj.bias"] = block_three_sd_orig.pop("3.f_t.bias")
+block_three_sd_new["downsamplers.0.norm2.weight"] = block_three_sd_orig.pop("3.gn_2.weight")
+block_three_sd_new["downsamplers.0.norm2.bias"] = block_three_sd_orig.pop("3.gn_2.bias")
+block_three_sd_new["downsamplers.0.conv2.weight"] = block_three_sd_orig.pop("3.f_2.weight")
+block_three_sd_new["downsamplers.0.conv2.bias"] = block_three_sd_orig.pop("3.f_2.bias")
+
+assert len(block_three_sd_orig) == 0
+
+block_three = ResnetDownsampleBlock2D(
+    in_channels=640,
+    out_channels=1024,
+    temb_channels=1280,
+    num_layers=3,
+    add_downsample=True,
+    resnet_time_scale_shift="scale_shift",
+    resnet_eps=1e-5,
+)
+
+block_three.load_state_dict(block_three_sd_new)
+
+print("DOWN BLOCK FOUR")
+
+block_four_sd_orig = model.down[3].state_dict()
+block_four_sd_new = {}
+
+for i in range(3):
+    block_four_sd_new[f"resnets.{i}.norm1.weight"] = block_four_sd_orig.pop(f"{i}.gn_1.weight")
+    block_four_sd_new[f"resnets.{i}.norm1.bias"] = block_four_sd_orig.pop(f"{i}.gn_1.bias")
+    block_four_sd_new[f"resnets.{i}.conv1.weight"] = block_four_sd_orig.pop(f"{i}.f_1.weight")
+    block_four_sd_new[f"resnets.{i}.conv1.bias"] = block_four_sd_orig.pop(f"{i}.f_1.bias")
+    block_four_sd_new[f"resnets.{i}.time_emb_proj.weight"] = block_four_sd_orig.pop(f"{i}.f_t.weight")
+    block_four_sd_new[f"resnets.{i}.time_emb_proj.bias"] = block_four_sd_orig.pop(f"{i}.f_t.bias")
+    block_four_sd_new[f"resnets.{i}.norm2.weight"] = block_four_sd_orig.pop(f"{i}.gn_2.weight")
+    block_four_sd_new[f"resnets.{i}.norm2.bias"] = block_four_sd_orig.pop(f"{i}.gn_2.bias")
+    block_four_sd_new[f"resnets.{i}.conv2.weight"] = block_four_sd_orig.pop(f"{i}.f_2.weight")
+    block_four_sd_new[f"resnets.{i}.conv2.bias"] = block_four_sd_orig.pop(f"{i}.f_2.bias")
+
+assert len(block_four_sd_orig) == 0
+
+block_four = ResnetDownsampleBlock2D(
+    in_channels=1024,
+    out_channels=1024,
+    temb_channels=1280,
+    num_layers=3,
+    add_downsample=False,
+    resnet_time_scale_shift="scale_shift",
+    resnet_eps=1e-5,
+)
+
+block_four.load_state_dict(block_four_sd_new)
+
+
+print("MID BLOCK 1")
+
+mid_block_one_sd_orig = model.mid.state_dict()
+mid_block_one_sd_new = {}
+
+for i in range(2):
+    mid_block_one_sd_new[f"resnets.{i}.norm1.weight"] = mid_block_one_sd_orig.pop(f"{i}.gn_1.weight")
+    mid_block_one_sd_new[f"resnets.{i}.norm1.bias"] = mid_block_one_sd_orig.pop(f"{i}.gn_1.bias")
+    mid_block_one_sd_new[f"resnets.{i}.conv1.weight"] = mid_block_one_sd_orig.pop(f"{i}.f_1.weight")
+    mid_block_one_sd_new[f"resnets.{i}.conv1.bias"] = mid_block_one_sd_orig.pop(f"{i}.f_1.bias")
+    mid_block_one_sd_new[f"resnets.{i}.time_emb_proj.weight"] = mid_block_one_sd_orig.pop(f"{i}.f_t.weight")
+    mid_block_one_sd_new[f"resnets.{i}.time_emb_proj.bias"] = mid_block_one_sd_orig.pop(f"{i}.f_t.bias")
+    mid_block_one_sd_new[f"resnets.{i}.norm2.weight"] = mid_block_one_sd_orig.pop(f"{i}.gn_2.weight")
+    mid_block_one_sd_new[f"resnets.{i}.norm2.bias"] = mid_block_one_sd_orig.pop(f"{i}.gn_2.bias")
+    mid_block_one_sd_new[f"resnets.{i}.conv2.weight"] = mid_block_one_sd_orig.pop(f"{i}.f_2.weight")
+    mid_block_one_sd_new[f"resnets.{i}.conv2.bias"] = mid_block_one_sd_orig.pop(f"{i}.f_2.bias")
+
+assert len(mid_block_one_sd_orig) == 0
+
+mid_block_one = UNetMidBlock2D(
+    in_channels=1024,
+    temb_channels=1280,
+    num_layers=1,
+    resnet_time_scale_shift="scale_shift",
+    resnet_eps=1e-5,
+    add_attention=False,
+)
+
+mid_block_one.load_state_dict(mid_block_one_sd_new)
+
+print("UP BLOCK ONE")
+
+up_block_one_sd_orig = model.up[-1].state_dict()
+up_block_one_sd_new = {}
+
+for i in range(4):
+    up_block_one_sd_new[f"resnets.{i}.norm1.weight"] = up_block_one_sd_orig.pop(f"{i}.gn_1.weight")
+    up_block_one_sd_new[f"resnets.{i}.norm1.bias"] = up_block_one_sd_orig.pop(f"{i}.gn_1.bias")
+    up_block_one_sd_new[f"resnets.{i}.conv1.weight"] = up_block_one_sd_orig.pop(f"{i}.f_1.weight")
+    up_block_one_sd_new[f"resnets.{i}.conv1.bias"] = up_block_one_sd_orig.pop(f"{i}.f_1.bias")
+    up_block_one_sd_new[f"resnets.{i}.time_emb_proj.weight"] = up_block_one_sd_orig.pop(f"{i}.f_t.weight")
+    up_block_one_sd_new[f"resnets.{i}.time_emb_proj.bias"] = up_block_one_sd_orig.pop(f"{i}.f_t.bias")
+    up_block_one_sd_new[f"resnets.{i}.norm2.weight"] = up_block_one_sd_orig.pop(f"{i}.gn_2.weight")
+    up_block_one_sd_new[f"resnets.{i}.norm2.bias"] = up_block_one_sd_orig.pop(f"{i}.gn_2.bias")
+    up_block_one_sd_new[f"resnets.{i}.conv2.weight"] = up_block_one_sd_orig.pop(f"{i}.f_2.weight")
+    up_block_one_sd_new[f"resnets.{i}.conv2.bias"] = up_block_one_sd_orig.pop(f"{i}.f_2.bias")
+    up_block_one_sd_new[f"resnets.{i}.conv_shortcut.weight"] = up_block_one_sd_orig.pop(f"{i}.f_s.weight")
+    up_block_one_sd_new[f"resnets.{i}.conv_shortcut.bias"] = up_block_one_sd_orig.pop(f"{i}.f_s.bias")
+
+up_block_one_sd_new["upsamplers.0.norm1.weight"] = up_block_one_sd_orig.pop("4.gn_1.weight")
+up_block_one_sd_new["upsamplers.0.norm1.bias"] = up_block_one_sd_orig.pop("4.gn_1.bias")
+up_block_one_sd_new["upsamplers.0.conv1.weight"] = up_block_one_sd_orig.pop("4.f_1.weight")
+up_block_one_sd_new["upsamplers.0.conv1.bias"] = up_block_one_sd_orig.pop("4.f_1.bias")
+up_block_one_sd_new["upsamplers.0.time_emb_proj.weight"] = up_block_one_sd_orig.pop("4.f_t.weight")
+up_block_one_sd_new["upsamplers.0.time_emb_proj.bias"] = up_block_one_sd_orig.pop("4.f_t.bias")
+up_block_one_sd_new["upsamplers.0.norm2.weight"] = up_block_one_sd_orig.pop("4.gn_2.weight")
+up_block_one_sd_new["upsamplers.0.norm2.bias"] = up_block_one_sd_orig.pop("4.gn_2.bias")
+up_block_one_sd_new["upsamplers.0.conv2.weight"] = up_block_one_sd_orig.pop("4.f_2.weight")
+up_block_one_sd_new["upsamplers.0.conv2.bias"] = up_block_one_sd_orig.pop("4.f_2.bias")
+
+assert len(up_block_one_sd_orig) == 0
+
+up_block_one = ResnetUpsampleBlock2D(
+    in_channels=1024,
+    prev_output_channel=1024,
+    out_channels=1024,
+    temb_channels=1280,
+    num_layers=4,
+    add_upsample=True,
+    resnet_time_scale_shift="scale_shift",
+    resnet_eps=1e-5,
+)
+
+up_block_one.load_state_dict(up_block_one_sd_new)
+
+print("UP BLOCK TWO")
+
+up_block_two_sd_orig = model.up[-2].state_dict()
+up_block_two_sd_new = {}
+
+for i in range(4):
+    up_block_two_sd_new[f"resnets.{i}.norm1.weight"] = up_block_two_sd_orig.pop(f"{i}.gn_1.weight")
+    up_block_two_sd_new[f"resnets.{i}.norm1.bias"] = up_block_two_sd_orig.pop(f"{i}.gn_1.bias")
+    up_block_two_sd_new[f"resnets.{i}.conv1.weight"] = up_block_two_sd_orig.pop(f"{i}.f_1.weight")
+    up_block_two_sd_new[f"resnets.{i}.conv1.bias"] = up_block_two_sd_orig.pop(f"{i}.f_1.bias")
+    up_block_two_sd_new[f"resnets.{i}.time_emb_proj.weight"] = up_block_two_sd_orig.pop(f"{i}.f_t.weight")
+    up_block_two_sd_new[f"resnets.{i}.time_emb_proj.bias"] = up_block_two_sd_orig.pop(f"{i}.f_t.bias")
+    up_block_two_sd_new[f"resnets.{i}.norm2.weight"] = up_block_two_sd_orig.pop(f"{i}.gn_2.weight")
+    up_block_two_sd_new[f"resnets.{i}.norm2.bias"] = up_block_two_sd_orig.pop(f"{i}.gn_2.bias")
+    up_block_two_sd_new[f"resnets.{i}.conv2.weight"] = up_block_two_sd_orig.pop(f"{i}.f_2.weight")
+    up_block_two_sd_new[f"resnets.{i}.conv2.bias"] = up_block_two_sd_orig.pop(f"{i}.f_2.bias")
+    up_block_two_sd_new[f"resnets.{i}.conv_shortcut.weight"] = up_block_two_sd_orig.pop(f"{i}.f_s.weight")
+    up_block_two_sd_new[f"resnets.{i}.conv_shortcut.bias"] = up_block_two_sd_orig.pop(f"{i}.f_s.bias")
+
+up_block_two_sd_new["upsamplers.0.norm1.weight"] = up_block_two_sd_orig.pop("4.gn_1.weight")
+up_block_two_sd_new["upsamplers.0.norm1.bias"] = up_block_two_sd_orig.pop("4.gn_1.bias")
+up_block_two_sd_new["upsamplers.0.conv1.weight"] = up_block_two_sd_orig.pop("4.f_1.weight")
+up_block_two_sd_new["upsamplers.0.conv1.bias"] = up_block_two_sd_orig.pop("4.f_1.bias")
+up_block_two_sd_new["upsamplers.0.time_emb_proj.weight"] = up_block_two_sd_orig.pop("4.f_t.weight")
+up_block_two_sd_new["upsamplers.0.time_emb_proj.bias"] = up_block_two_sd_orig.pop("4.f_t.bias")
+up_block_two_sd_new["upsamplers.0.norm2.weight"] = up_block_two_sd_orig.pop("4.gn_2.weight")
+up_block_two_sd_new["upsamplers.0.norm2.bias"] = up_block_two_sd_orig.pop("4.gn_2.bias")
+up_block_two_sd_new["upsamplers.0.conv2.weight"] = up_block_two_sd_orig.pop("4.f_2.weight")
+up_block_two_sd_new["upsamplers.0.conv2.bias"] = up_block_two_sd_orig.pop("4.f_2.bias")
+
+assert len(up_block_two_sd_orig) == 0
+
+up_block_two = ResnetUpsampleBlock2D(
+    in_channels=640,
+    prev_output_channel=1024,
+    out_channels=1024,
+    temb_channels=1280,
+    num_layers=4,
+    add_upsample=True,
+    resnet_time_scale_shift="scale_shift",
+    resnet_eps=1e-5,
+)
+
+up_block_two.load_state_dict(up_block_two_sd_new)
+
+print("UP BLOCK THREE")
+
+up_block_three_sd_orig = model.up[-3].state_dict()
+up_block_three_sd_new = {}
+
+for i in range(4):
+    up_block_three_sd_new[f"resnets.{i}.norm1.weight"] = up_block_three_sd_orig.pop(f"{i}.gn_1.weight")
+    up_block_three_sd_new[f"resnets.{i}.norm1.bias"] = up_block_three_sd_orig.pop(f"{i}.gn_1.bias")
+    up_block_three_sd_new[f"resnets.{i}.conv1.weight"] = up_block_three_sd_orig.pop(f"{i}.f_1.weight")
+    up_block_three_sd_new[f"resnets.{i}.conv1.bias"] = up_block_three_sd_orig.pop(f"{i}.f_1.bias")
+    up_block_three_sd_new[f"resnets.{i}.time_emb_proj.weight"] = up_block_three_sd_orig.pop(f"{i}.f_t.weight")
+    up_block_three_sd_new[f"resnets.{i}.time_emb_proj.bias"] = up_block_three_sd_orig.pop(f"{i}.f_t.bias")
+    up_block_three_sd_new[f"resnets.{i}.norm2.weight"] = up_block_three_sd_orig.pop(f"{i}.gn_2.weight")
+    up_block_three_sd_new[f"resnets.{i}.norm2.bias"] = up_block_three_sd_orig.pop(f"{i}.gn_2.bias")
+    up_block_three_sd_new[f"resnets.{i}.conv2.weight"] = up_block_three_sd_orig.pop(f"{i}.f_2.weight")
+    up_block_three_sd_new[f"resnets.{i}.conv2.bias"] = up_block_three_sd_orig.pop(f"{i}.f_2.bias")
+    up_block_three_sd_new[f"resnets.{i}.conv_shortcut.weight"] = up_block_three_sd_orig.pop(f"{i}.f_s.weight")
+    up_block_three_sd_new[f"resnets.{i}.conv_shortcut.bias"] = up_block_three_sd_orig.pop(f"{i}.f_s.bias")
+
+up_block_three_sd_new["upsamplers.0.norm1.weight"] = up_block_three_sd_orig.pop("4.gn_1.weight")
+up_block_three_sd_new["upsamplers.0.norm1.bias"] = up_block_three_sd_orig.pop("4.gn_1.bias")
+up_block_three_sd_new["upsamplers.0.conv1.weight"] = up_block_three_sd_orig.pop("4.f_1.weight")
+up_block_three_sd_new["upsamplers.0.conv1.bias"] = up_block_three_sd_orig.pop("4.f_1.bias")
+up_block_three_sd_new["upsamplers.0.time_emb_proj.weight"] = up_block_three_sd_orig.pop("4.f_t.weight")
+up_block_three_sd_new["upsamplers.0.time_emb_proj.bias"] = up_block_three_sd_orig.pop("4.f_t.bias")
+up_block_three_sd_new["upsamplers.0.norm2.weight"] = up_block_three_sd_orig.pop("4.gn_2.weight")
+up_block_three_sd_new["upsamplers.0.norm2.bias"] = up_block_three_sd_orig.pop("4.gn_2.bias")
+up_block_three_sd_new["upsamplers.0.conv2.weight"] = up_block_three_sd_orig.pop("4.f_2.weight")
+up_block_three_sd_new["upsamplers.0.conv2.bias"] = up_block_three_sd_orig.pop("4.f_2.bias")
+
+assert len(up_block_three_sd_orig) == 0
+
+up_block_three = ResnetUpsampleBlock2D(
+    in_channels=320,
+    prev_output_channel=1024,
+    out_channels=640,
+    temb_channels=1280,
+    num_layers=4,
+    add_upsample=True,
+    resnet_time_scale_shift="scale_shift",
+    resnet_eps=1e-5,
+)
+
+up_block_three.load_state_dict(up_block_three_sd_new)
+
+print("UP BLOCK FOUR")
+
+up_block_four_sd_orig = model.up[-4].state_dict()
+up_block_four_sd_new = {}
+
+for i in range(4):
+    up_block_four_sd_new[f"resnets.{i}.norm1.weight"] = up_block_four_sd_orig.pop(f"{i}.gn_1.weight")
+    up_block_four_sd_new[f"resnets.{i}.norm1.bias"] = up_block_four_sd_orig.pop(f"{i}.gn_1.bias")
+    up_block_four_sd_new[f"resnets.{i}.conv1.weight"] = up_block_four_sd_orig.pop(f"{i}.f_1.weight")
+    up_block_four_sd_new[f"resnets.{i}.conv1.bias"] = up_block_four_sd_orig.pop(f"{i}.f_1.bias")
+    up_block_four_sd_new[f"resnets.{i}.time_emb_proj.weight"] = up_block_four_sd_orig.pop(f"{i}.f_t.weight")
+    up_block_four_sd_new[f"resnets.{i}.time_emb_proj.bias"] = up_block_four_sd_orig.pop(f"{i}.f_t.bias")
+    up_block_four_sd_new[f"resnets.{i}.norm2.weight"] = up_block_four_sd_orig.pop(f"{i}.gn_2.weight")
+    up_block_four_sd_new[f"resnets.{i}.norm2.bias"] = up_block_four_sd_orig.pop(f"{i}.gn_2.bias")
+    up_block_four_sd_new[f"resnets.{i}.conv2.weight"] = up_block_four_sd_orig.pop(f"{i}.f_2.weight")
+    up_block_four_sd_new[f"resnets.{i}.conv2.bias"] = up_block_four_sd_orig.pop(f"{i}.f_2.bias")
+    up_block_four_sd_new[f"resnets.{i}.conv_shortcut.weight"] = up_block_four_sd_orig.pop(f"{i}.f_s.weight")
+    up_block_four_sd_new[f"resnets.{i}.conv_shortcut.bias"] = up_block_four_sd_orig.pop(f"{i}.f_s.bias")
+
+assert len(up_block_four_sd_orig) == 0
+
+up_block_four = ResnetUpsampleBlock2D(
+    in_channels=320,
+    prev_output_channel=640,
+    out_channels=320,
+    temb_channels=1280,
+    num_layers=4,
+    add_upsample=False,
+    resnet_time_scale_shift="scale_shift",
+    resnet_eps=1e-5,
+)
+
+up_block_four.load_state_dict(up_block_four_sd_new)
+
+print("initial projection (conv_in)")
+
+conv_in_sd_orig = model.embed_image.state_dict()
+conv_in_sd_new = {}
+
+conv_in_sd_new["weight"] = conv_in_sd_orig.pop("f.weight")
+conv_in_sd_new["bias"] = conv_in_sd_orig.pop("f.bias")
+
+assert len(conv_in_sd_orig) == 0
+
+block_out_channels = [320, 640, 1024, 1024]
+
+in_channels = 7
+conv_in_kernel = 3
+conv_in_padding = (conv_in_kernel - 1) // 2
+conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding)
+
+conv_in.load_state_dict(conv_in_sd_new)
+
+print("out projection (conv_out) (conv_norm_out)")
+out_channels = 6
+norm_num_groups = 32
+norm_eps = 1e-5
+act_fn = "silu"
+conv_out_kernel = 3
+conv_out_padding = (conv_out_kernel - 1) // 2
+conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
+# uses torch.functional in orig
+# conv_act = get_activation(act_fn)
+conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding)
+
+conv_norm_out.load_state_dict(model.output.gn.state_dict())
+conv_out.load_state_dict(model.output.f.state_dict())
+
+print("timestep projection (time_proj) (time_embedding)")
+
+f1_sd = model.embed_time.f_1.state_dict()
+f2_sd = model.embed_time.f_2.state_dict()
+
+time_embedding_sd = {
+    "linear_1.weight": f1_sd.pop("weight"),
+    "linear_1.bias": f1_sd.pop("bias"),
+    "linear_2.weight": f2_sd.pop("weight"),
+    "linear_2.bias": f2_sd.pop("bias"),
+}
+
+assert len(f1_sd) == 0
+assert len(f2_sd) == 0
+
+time_embedding_type = "learned"
+num_train_timesteps = 1024
+time_embedding_dim = 1280
+
+time_proj = nn.Embedding(num_train_timesteps, block_out_channels[0])
+timestep_input_dim = block_out_channels[0]
+
+time_embedding = TimestepEmbedding(timestep_input_dim, time_embedding_dim)
+
+time_proj.load_state_dict(model.embed_time.emb.state_dict())
+time_embedding.load_state_dict(time_embedding_sd)
+
+print("CONVERT")
+
+time_embedding.to("cuda")
+time_proj.to("cuda")
+conv_in.to("cuda")
+
+block_one.to("cuda")
+block_two.to("cuda")
+block_three.to("cuda")
+block_four.to("cuda")
+
+mid_block_one.to("cuda")
+
+up_block_one.to("cuda")
+up_block_two.to("cuda")
+up_block_three.to("cuda")
+up_block_four.to("cuda")
+
+conv_norm_out.to("cuda")
+conv_out.to("cuda")
+
+model.time_proj = time_proj
+model.time_embedding = time_embedding
+model.embed_image = conv_in
+
+model.down[0] = block_one
+model.down[1] = block_two
+model.down[2] = block_three
+model.down[3] = block_four
+
+model.mid = mid_block_one
+
+model.up[-1] = up_block_one
+model.up[-2] = up_block_two
+model.up[-3] = up_block_three
+model.up[-4] = up_block_four
+
+model.output.gn = conv_norm_out
+model.output.f = conv_out
+
+model.converted = True
+
+sample_consistency_new = decoder_consistency(latent, generator=torch.Generator("cpu").manual_seed(0))
+save_image(sample_consistency_new, "con_new.png")
+
+assert (sample_consistency_orig == sample_consistency_new).all()
+
+print("making unet")
+
+unet = UNet2DModel(
+    in_channels=in_channels,
+    out_channels=out_channels,
+    down_block_types=(
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+    ),
+    up_block_types=(
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+    ),
+    block_out_channels=block_out_channels,
+    layers_per_block=3,
+    norm_num_groups=norm_num_groups,
+    norm_eps=norm_eps,
+    resnet_time_scale_shift="scale_shift",
+    time_embedding_type="learned",
+    num_train_timesteps=num_train_timesteps,
+    add_attention=False,
+)
+
+unet_state_dict = {}
+
+
+def add_state_dict(prefix, mod):
+    for k, v in mod.state_dict().items():
+        unet_state_dict[f"{prefix}.{k}"] = v
+
+
+add_state_dict("conv_in", conv_in)
+add_state_dict("time_proj", time_proj)
+add_state_dict("time_embedding", time_embedding)
+add_state_dict("down_blocks.0", block_one)
+add_state_dict("down_blocks.1", block_two)
+add_state_dict("down_blocks.2", block_three)
+add_state_dict("down_blocks.3", block_four)
+add_state_dict("mid_block", mid_block_one)
+add_state_dict("up_blocks.0", up_block_one)
+add_state_dict("up_blocks.1", up_block_two)
+add_state_dict("up_blocks.2", up_block_three)
+add_state_dict("up_blocks.3", up_block_four)
+add_state_dict("conv_norm_out", conv_norm_out)
+add_state_dict("conv_out", conv_out)
+
+unet.load_state_dict(unet_state_dict)
+
+print("running with diffusers unet")
+
+unet.to("cuda")
+
+decoder_consistency.ckpt = unet
+
+sample_consistency_new_2 = decoder_consistency(latent, generator=torch.Generator("cpu").manual_seed(0))
+save_image(sample_consistency_new_2, "con_new_2.png")
+
+assert (sample_consistency_orig == sample_consistency_new_2).all()
+
+print("running with diffusers model")
+
+Encoder.old_constructor = Encoder.__init__
+
+
+def new_constructor(self, **kwargs):
+    self.old_constructor(**kwargs)
+    self.constructor_arguments = kwargs
+
+
+Encoder.__init__ = new_constructor
+
+
+vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
+consistency_vae = ConsistencyDecoderVAE(
+    encoder_args=vae.encoder.constructor_arguments,
+    decoder_args=unet.config,
+    scaling_factor=vae.config.scaling_factor,
+    block_out_channels=vae.config.block_out_channels,
+    latent_channels=vae.config.latent_channels,
+)
+consistency_vae.encoder.load_state_dict(vae.encoder.state_dict())
+consistency_vae.quant_conv.load_state_dict(vae.quant_conv.state_dict())
+consistency_vae.decoder_unet.load_state_dict(unet.state_dict())
+
+consistency_vae.to(dtype=torch.float16, device="cuda")
+
+sample_consistency_new_3 = consistency_vae.decode(
+    0.18215 * latent, generator=torch.Generator("cpu").manual_seed(0)
+).sample
+
+print("max difference")
+print((sample_consistency_orig - sample_consistency_new_3).abs().max())
+print("total difference")
+print((sample_consistency_orig - sample_consistency_new_3).abs().sum())
+# assert (sample_consistency_orig == sample_consistency_new_3).all()
+
+print("running with diffusers pipeline")
+
+pipe = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", vae=consistency_vae, torch_dtype=torch.float16
+)
+pipe.to("cuda")
+
+pipe("horse", generator=torch.Generator("cpu").manual_seed(0)).images[0].save("horse.png")
+
+
+if args.save_pretrained is not None:
+    consistency_vae.save_pretrained(args.save_pretrained)
diff --git a/diffusers/scripts/convert_consistency_to_diffusers.py b/diffusers/scripts/convert_consistency_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f8b4ddca8efd5e4392754f12b67448adad8c0b7
--- /dev/null
+++ b/diffusers/scripts/convert_consistency_to_diffusers.py
@@ -0,0 +1,315 @@
+import argparse
+import os
+
+import torch
+
+from diffusers import (
+    CMStochasticIterativeScheduler,
+    ConsistencyModelPipeline,
+    UNet2DModel,
+)
+
+
+TEST_UNET_CONFIG = {
+    "sample_size": 32,
+    "in_channels": 3,
+    "out_channels": 3,
+    "layers_per_block": 2,
+    "num_class_embeds": 1000,
+    "block_out_channels": [32, 64],
+    "attention_head_dim": 8,
+    "down_block_types": [
+        "ResnetDownsampleBlock2D",
+        "AttnDownBlock2D",
+    ],
+    "up_block_types": [
+        "AttnUpBlock2D",
+        "ResnetUpsampleBlock2D",
+    ],
+    "resnet_time_scale_shift": "scale_shift",
+    "attn_norm_num_groups": 32,
+    "upsample_type": "resnet",
+    "downsample_type": "resnet",
+}
+
+IMAGENET_64_UNET_CONFIG = {
+    "sample_size": 64,
+    "in_channels": 3,
+    "out_channels": 3,
+    "layers_per_block": 3,
+    "num_class_embeds": 1000,
+    "block_out_channels": [192, 192 * 2, 192 * 3, 192 * 4],
+    "attention_head_dim": 64,
+    "down_block_types": [
+        "ResnetDownsampleBlock2D",
+        "AttnDownBlock2D",
+        "AttnDownBlock2D",
+        "AttnDownBlock2D",
+    ],
+    "up_block_types": [
+        "AttnUpBlock2D",
+        "AttnUpBlock2D",
+        "AttnUpBlock2D",
+        "ResnetUpsampleBlock2D",
+    ],
+    "resnet_time_scale_shift": "scale_shift",
+    "attn_norm_num_groups": 32,
+    "upsample_type": "resnet",
+    "downsample_type": "resnet",
+}
+
+LSUN_256_UNET_CONFIG = {
+    "sample_size": 256,
+    "in_channels": 3,
+    "out_channels": 3,
+    "layers_per_block": 2,
+    "num_class_embeds": None,
+    "block_out_channels": [256, 256, 256 * 2, 256 * 2, 256 * 4, 256 * 4],
+    "attention_head_dim": 64,
+    "down_block_types": [
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+        "AttnDownBlock2D",
+        "AttnDownBlock2D",
+        "AttnDownBlock2D",
+    ],
+    "up_block_types": [
+        "AttnUpBlock2D",
+        "AttnUpBlock2D",
+        "AttnUpBlock2D",
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+    ],
+    "resnet_time_scale_shift": "default",
+    "upsample_type": "resnet",
+    "downsample_type": "resnet",
+}
+
+CD_SCHEDULER_CONFIG = {
+    "num_train_timesteps": 40,
+    "sigma_min": 0.002,
+    "sigma_max": 80.0,
+}
+
+CT_IMAGENET_64_SCHEDULER_CONFIG = {
+    "num_train_timesteps": 201,
+    "sigma_min": 0.002,
+    "sigma_max": 80.0,
+}
+
+CT_LSUN_256_SCHEDULER_CONFIG = {
+    "num_train_timesteps": 151,
+    "sigma_min": 0.002,
+    "sigma_max": 80.0,
+}
+
+
+def str2bool(v):
+    """
+    https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
+    """
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ("yes", "true", "t", "y", "1"):
+        return True
+    elif v.lower() in ("no", "false", "f", "n", "0"):
+        return False
+    else:
+        raise argparse.ArgumentTypeError("boolean value expected")
+
+
+def convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix, has_skip=False):
+    new_checkpoint[f"{new_prefix}.norm1.weight"] = checkpoint[f"{old_prefix}.in_layers.0.weight"]
+    new_checkpoint[f"{new_prefix}.norm1.bias"] = checkpoint[f"{old_prefix}.in_layers.0.bias"]
+    new_checkpoint[f"{new_prefix}.conv1.weight"] = checkpoint[f"{old_prefix}.in_layers.2.weight"]
+    new_checkpoint[f"{new_prefix}.conv1.bias"] = checkpoint[f"{old_prefix}.in_layers.2.bias"]
+    new_checkpoint[f"{new_prefix}.time_emb_proj.weight"] = checkpoint[f"{old_prefix}.emb_layers.1.weight"]
+    new_checkpoint[f"{new_prefix}.time_emb_proj.bias"] = checkpoint[f"{old_prefix}.emb_layers.1.bias"]
+    new_checkpoint[f"{new_prefix}.norm2.weight"] = checkpoint[f"{old_prefix}.out_layers.0.weight"]
+    new_checkpoint[f"{new_prefix}.norm2.bias"] = checkpoint[f"{old_prefix}.out_layers.0.bias"]
+    new_checkpoint[f"{new_prefix}.conv2.weight"] = checkpoint[f"{old_prefix}.out_layers.3.weight"]
+    new_checkpoint[f"{new_prefix}.conv2.bias"] = checkpoint[f"{old_prefix}.out_layers.3.bias"]
+
+    if has_skip:
+        new_checkpoint[f"{new_prefix}.conv_shortcut.weight"] = checkpoint[f"{old_prefix}.skip_connection.weight"]
+        new_checkpoint[f"{new_prefix}.conv_shortcut.bias"] = checkpoint[f"{old_prefix}.skip_connection.bias"]
+
+    return new_checkpoint
+
+
+def convert_attention(checkpoint, new_checkpoint, old_prefix, new_prefix, attention_dim=None):
+    weight_q, weight_k, weight_v = checkpoint[f"{old_prefix}.qkv.weight"].chunk(3, dim=0)
+    bias_q, bias_k, bias_v = checkpoint[f"{old_prefix}.qkv.bias"].chunk(3, dim=0)
+
+    new_checkpoint[f"{new_prefix}.group_norm.weight"] = checkpoint[f"{old_prefix}.norm.weight"]
+    new_checkpoint[f"{new_prefix}.group_norm.bias"] = checkpoint[f"{old_prefix}.norm.bias"]
+
+    new_checkpoint[f"{new_prefix}.to_q.weight"] = weight_q.squeeze(-1).squeeze(-1)
+    new_checkpoint[f"{new_prefix}.to_q.bias"] = bias_q.squeeze(-1).squeeze(-1)
+    new_checkpoint[f"{new_prefix}.to_k.weight"] = weight_k.squeeze(-1).squeeze(-1)
+    new_checkpoint[f"{new_prefix}.to_k.bias"] = bias_k.squeeze(-1).squeeze(-1)
+    new_checkpoint[f"{new_prefix}.to_v.weight"] = weight_v.squeeze(-1).squeeze(-1)
+    new_checkpoint[f"{new_prefix}.to_v.bias"] = bias_v.squeeze(-1).squeeze(-1)
+
+    new_checkpoint[f"{new_prefix}.to_out.0.weight"] = (
+        checkpoint[f"{old_prefix}.proj_out.weight"].squeeze(-1).squeeze(-1)
+    )
+    new_checkpoint[f"{new_prefix}.to_out.0.bias"] = checkpoint[f"{old_prefix}.proj_out.bias"].squeeze(-1).squeeze(-1)
+
+    return new_checkpoint
+
+
+def con_pt_to_diffuser(checkpoint_path: str, unet_config):
+    checkpoint = torch.load(checkpoint_path, map_location="cpu")
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = checkpoint["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = checkpoint["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = checkpoint["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = checkpoint["time_embed.2.bias"]
+
+    if unet_config["num_class_embeds"] is not None:
+        new_checkpoint["class_embedding.weight"] = checkpoint["label_emb.weight"]
+
+    new_checkpoint["conv_in.weight"] = checkpoint["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = checkpoint["input_blocks.0.0.bias"]
+
+    down_block_types = unet_config["down_block_types"]
+    layers_per_block = unet_config["layers_per_block"]
+    attention_head_dim = unet_config["attention_head_dim"]
+    channels_list = unet_config["block_out_channels"]
+    current_layer = 1
+    prev_channels = channels_list[0]
+
+    for i, layer_type in enumerate(down_block_types):
+        current_channels = channels_list[i]
+        downsample_block_has_skip = current_channels != prev_channels
+        if layer_type == "ResnetDownsampleBlock2D":
+            for j in range(layers_per_block):
+                new_prefix = f"down_blocks.{i}.resnets.{j}"
+                old_prefix = f"input_blocks.{current_layer}.0"
+                has_skip = True if j == 0 and downsample_block_has_skip else False
+                new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix, has_skip=has_skip)
+                current_layer += 1
+
+        elif layer_type == "AttnDownBlock2D":
+            for j in range(layers_per_block):
+                new_prefix = f"down_blocks.{i}.resnets.{j}"
+                old_prefix = f"input_blocks.{current_layer}.0"
+                has_skip = True if j == 0 and downsample_block_has_skip else False
+                new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix, has_skip=has_skip)
+                new_prefix = f"down_blocks.{i}.attentions.{j}"
+                old_prefix = f"input_blocks.{current_layer}.1"
+                new_checkpoint = convert_attention(
+                    checkpoint, new_checkpoint, old_prefix, new_prefix, attention_head_dim
+                )
+                current_layer += 1
+
+        if i != len(down_block_types) - 1:
+            new_prefix = f"down_blocks.{i}.downsamplers.0"
+            old_prefix = f"input_blocks.{current_layer}.0"
+            new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix)
+            current_layer += 1
+
+        prev_channels = current_channels
+
+    # hardcoded the mid-block for now
+    new_prefix = "mid_block.resnets.0"
+    old_prefix = "middle_block.0"
+    new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix)
+    new_prefix = "mid_block.attentions.0"
+    old_prefix = "middle_block.1"
+    new_checkpoint = convert_attention(checkpoint, new_checkpoint, old_prefix, new_prefix, attention_head_dim)
+    new_prefix = "mid_block.resnets.1"
+    old_prefix = "middle_block.2"
+    new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix)
+
+    current_layer = 0
+    up_block_types = unet_config["up_block_types"]
+
+    for i, layer_type in enumerate(up_block_types):
+        if layer_type == "ResnetUpsampleBlock2D":
+            for j in range(layers_per_block + 1):
+                new_prefix = f"up_blocks.{i}.resnets.{j}"
+                old_prefix = f"output_blocks.{current_layer}.0"
+                new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix, has_skip=True)
+                current_layer += 1
+
+            if i != len(up_block_types) - 1:
+                new_prefix = f"up_blocks.{i}.upsamplers.0"
+                old_prefix = f"output_blocks.{current_layer-1}.1"
+                new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix)
+        elif layer_type == "AttnUpBlock2D":
+            for j in range(layers_per_block + 1):
+                new_prefix = f"up_blocks.{i}.resnets.{j}"
+                old_prefix = f"output_blocks.{current_layer}.0"
+                new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix, has_skip=True)
+                new_prefix = f"up_blocks.{i}.attentions.{j}"
+                old_prefix = f"output_blocks.{current_layer}.1"
+                new_checkpoint = convert_attention(
+                    checkpoint, new_checkpoint, old_prefix, new_prefix, attention_head_dim
+                )
+                current_layer += 1
+
+            if i != len(up_block_types) - 1:
+                new_prefix = f"up_blocks.{i}.upsamplers.0"
+                old_prefix = f"output_blocks.{current_layer-1}.2"
+                new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix)
+
+    new_checkpoint["conv_norm_out.weight"] = checkpoint["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = checkpoint["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = checkpoint["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = checkpoint["out.2.bias"]
+
+    return new_checkpoint
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--unet_path", default=None, type=str, required=True, help="Path to the unet.pt to convert.")
+    parser.add_argument(
+        "--dump_path", default=None, type=str, required=True, help="Path to output the converted UNet model."
+    )
+    parser.add_argument("--class_cond", default=True, type=str, help="Whether the model is class-conditional.")
+
+    args = parser.parse_args()
+    args.class_cond = str2bool(args.class_cond)
+
+    ckpt_name = os.path.basename(args.unet_path)
+    print(f"Checkpoint: {ckpt_name}")
+
+    # Get U-Net config
+    if "imagenet64" in ckpt_name:
+        unet_config = IMAGENET_64_UNET_CONFIG
+    elif "256" in ckpt_name and (("bedroom" in ckpt_name) or ("cat" in ckpt_name)):
+        unet_config = LSUN_256_UNET_CONFIG
+    elif "test" in ckpt_name:
+        unet_config = TEST_UNET_CONFIG
+    else:
+        raise ValueError(f"Checkpoint type {ckpt_name} is not currently supported.")
+
+    if not args.class_cond:
+        unet_config["num_class_embeds"] = None
+
+    converted_unet_ckpt = con_pt_to_diffuser(args.unet_path, unet_config)
+
+    image_unet = UNet2DModel(**unet_config)
+    image_unet.load_state_dict(converted_unet_ckpt)
+
+    # Get scheduler config
+    if "cd" in ckpt_name or "test" in ckpt_name:
+        scheduler_config = CD_SCHEDULER_CONFIG
+    elif "ct" in ckpt_name and "imagenet64" in ckpt_name:
+        scheduler_config = CT_IMAGENET_64_SCHEDULER_CONFIG
+    elif "ct" in ckpt_name and "256" in ckpt_name and (("bedroom" in ckpt_name) or ("cat" in ckpt_name)):
+        scheduler_config = CT_LSUN_256_SCHEDULER_CONFIG
+    else:
+        raise ValueError(f"Checkpoint type {ckpt_name} is not currently supported.")
+
+    cm_scheduler = CMStochasticIterativeScheduler(**scheduler_config)
+
+    consistency_model = ConsistencyModelPipeline(unet=image_unet, scheduler=cm_scheduler)
+    consistency_model.save_pretrained(args.dump_path)
diff --git a/diffusers/scripts/convert_dance_diffusion_to_diffusers.py b/diffusers/scripts/convert_dance_diffusion_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce69bfe2bfc8e4828517004e5bea6335339f3466
--- /dev/null
+++ b/diffusers/scripts/convert_dance_diffusion_to_diffusers.py
@@ -0,0 +1,345 @@
+#!/usr/bin/env python3
+import argparse
+import math
+import os
+from copy import deepcopy
+
+import requests
+import torch
+from audio_diffusion.models import DiffusionAttnUnet1D
+from diffusion import sampling
+from torch import nn
+
+from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
+
+
+MODELS_MAP = {
+    "gwf-440k": {
+        "url": "https://model-server.zqevans2.workers.dev/gwf-440k.ckpt",
+        "sample_rate": 48000,
+        "sample_size": 65536,
+    },
+    "jmann-small-190k": {
+        "url": "https://model-server.zqevans2.workers.dev/jmann-small-190k.ckpt",
+        "sample_rate": 48000,
+        "sample_size": 65536,
+    },
+    "jmann-large-580k": {
+        "url": "https://model-server.zqevans2.workers.dev/jmann-large-580k.ckpt",
+        "sample_rate": 48000,
+        "sample_size": 131072,
+    },
+    "maestro-uncond-150k": {
+        "url": "https://model-server.zqevans2.workers.dev/maestro-uncond-150k.ckpt",
+        "sample_rate": 16000,
+        "sample_size": 65536,
+    },
+    "unlocked-uncond-250k": {
+        "url": "https://model-server.zqevans2.workers.dev/unlocked-uncond-250k.ckpt",
+        "sample_rate": 16000,
+        "sample_size": 65536,
+    },
+    "honk-140k": {
+        "url": "https://model-server.zqevans2.workers.dev/honk-140k.ckpt",
+        "sample_rate": 16000,
+        "sample_size": 65536,
+    },
+}
+
+
+def alpha_sigma_to_t(alpha, sigma):
+    """Returns a timestep, given the scaling factors for the clean image and for
+    the noise."""
+    return torch.atan2(sigma, alpha) / math.pi * 2
+
+
+def get_crash_schedule(t):
+    sigma = torch.sin(t * math.pi / 2) ** 2
+    alpha = (1 - sigma**2) ** 0.5
+    return alpha_sigma_to_t(alpha, sigma)
+
+
+class Object(object):
+    pass
+
+
+class DiffusionUncond(nn.Module):
+    def __init__(self, global_args):
+        super().__init__()
+
+        self.diffusion = DiffusionAttnUnet1D(global_args, n_attn_layers=4)
+        self.diffusion_ema = deepcopy(self.diffusion)
+        self.rng = torch.quasirandom.SobolEngine(1, scramble=True)
+
+
+def download(model_name):
+    url = MODELS_MAP[model_name]["url"]
+    r = requests.get(url, stream=True)
+
+    local_filename = f"./{model_name}.ckpt"
+    with open(local_filename, "wb") as fp:
+        for chunk in r.iter_content(chunk_size=8192):
+            fp.write(chunk)
+
+    return local_filename
+
+
+DOWN_NUM_TO_LAYER = {
+    "1": "resnets.0",
+    "2": "attentions.0",
+    "3": "resnets.1",
+    "4": "attentions.1",
+    "5": "resnets.2",
+    "6": "attentions.2",
+}
+UP_NUM_TO_LAYER = {
+    "8": "resnets.0",
+    "9": "attentions.0",
+    "10": "resnets.1",
+    "11": "attentions.1",
+    "12": "resnets.2",
+    "13": "attentions.2",
+}
+MID_NUM_TO_LAYER = {
+    "1": "resnets.0",
+    "2": "attentions.0",
+    "3": "resnets.1",
+    "4": "attentions.1",
+    "5": "resnets.2",
+    "6": "attentions.2",
+    "8": "resnets.3",
+    "9": "attentions.3",
+    "10": "resnets.4",
+    "11": "attentions.4",
+    "12": "resnets.5",
+    "13": "attentions.5",
+}
+DEPTH_0_TO_LAYER = {
+    "0": "resnets.0",
+    "1": "resnets.1",
+    "2": "resnets.2",
+    "4": "resnets.0",
+    "5": "resnets.1",
+    "6": "resnets.2",
+}
+
+RES_CONV_MAP = {
+    "skip": "conv_skip",
+    "main.0": "conv_1",
+    "main.1": "group_norm_1",
+    "main.3": "conv_2",
+    "main.4": "group_norm_2",
+}
+
+ATTN_MAP = {
+    "norm": "group_norm",
+    "qkv_proj": ["query", "key", "value"],
+    "out_proj": ["proj_attn"],
+}
+
+
+def convert_resconv_naming(name):
+    if name.startswith("skip"):
+        return name.replace("skip", RES_CONV_MAP["skip"])
+
+    # name has to be of format main.{digit}
+    if not name.startswith("main."):
+        raise ValueError(f"ResConvBlock error with {name}")
+
+    return name.replace(name[:6], RES_CONV_MAP[name[:6]])
+
+
+def convert_attn_naming(name):
+    for key, value in ATTN_MAP.items():
+        if name.startswith(key) and not isinstance(value, list):
+            return name.replace(key, value)
+        elif name.startswith(key):
+            return [name.replace(key, v) for v in value]
+    raise ValueError(f"Attn error with {name}")
+
+
+def rename(input_string, max_depth=13):
+    string = input_string
+
+    if string.split(".")[0] == "timestep_embed":
+        return string.replace("timestep_embed", "time_proj")
+
+    depth = 0
+    if string.startswith("net.3."):
+        depth += 1
+        string = string[6:]
+    elif string.startswith("net."):
+        string = string[4:]
+
+    while string.startswith("main.7."):
+        depth += 1
+        string = string[7:]
+
+    if string.startswith("main."):
+        string = string[5:]
+
+    # mid block
+    if string[:2].isdigit():
+        layer_num = string[:2]
+        string_left = string[2:]
+    else:
+        layer_num = string[0]
+        string_left = string[1:]
+
+    if depth == max_depth:
+        new_layer = MID_NUM_TO_LAYER[layer_num]
+        prefix = "mid_block"
+    elif depth > 0 and int(layer_num) < 7:
+        new_layer = DOWN_NUM_TO_LAYER[layer_num]
+        prefix = f"down_blocks.{depth}"
+    elif depth > 0 and int(layer_num) > 7:
+        new_layer = UP_NUM_TO_LAYER[layer_num]
+        prefix = f"up_blocks.{max_depth - depth - 1}"
+    elif depth == 0:
+        new_layer = DEPTH_0_TO_LAYER[layer_num]
+        prefix = f"up_blocks.{max_depth - 1}" if int(layer_num) > 3 else "down_blocks.0"
+
+    if not string_left.startswith("."):
+        raise ValueError(f"Naming error with {input_string} and string_left: {string_left}.")
+
+    string_left = string_left[1:]
+
+    if "resnets" in new_layer:
+        string_left = convert_resconv_naming(string_left)
+    elif "attentions" in new_layer:
+        new_string_left = convert_attn_naming(string_left)
+        string_left = new_string_left
+
+    if not isinstance(string_left, list):
+        new_string = prefix + "." + new_layer + "." + string_left
+    else:
+        new_string = [prefix + "." + new_layer + "." + s for s in string_left]
+    return new_string
+
+
+def rename_orig_weights(state_dict):
+    new_state_dict = {}
+    for k, v in state_dict.items():
+        if k.endswith("kernel"):
+            # up- and downsample layers, don't have trainable weights
+            continue
+
+        new_k = rename(k)
+
+        # check if we need to transform from Conv => Linear for attention
+        if isinstance(new_k, list):
+            new_state_dict = transform_conv_attns(new_state_dict, new_k, v)
+        else:
+            new_state_dict[new_k] = v
+
+    return new_state_dict
+
+
+def transform_conv_attns(new_state_dict, new_k, v):
+    if len(new_k) == 1:
+        if len(v.shape) == 3:
+            # weight
+            new_state_dict[new_k[0]] = v[:, :, 0]
+        else:
+            # bias
+            new_state_dict[new_k[0]] = v
+    else:
+        # qkv matrices
+        trippled_shape = v.shape[0]
+        single_shape = trippled_shape // 3
+        for i in range(3):
+            if len(v.shape) == 3:
+                new_state_dict[new_k[i]] = v[i * single_shape : (i + 1) * single_shape, :, 0]
+            else:
+                new_state_dict[new_k[i]] = v[i * single_shape : (i + 1) * single_shape]
+    return new_state_dict
+
+
+def main(args):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    model_name = args.model_path.split("/")[-1].split(".")[0]
+    if not os.path.isfile(args.model_path):
+        assert (
+            model_name == args.model_path
+        ), f"Make sure to provide one of the official model names {MODELS_MAP.keys()}"
+        args.model_path = download(model_name)
+
+    sample_rate = MODELS_MAP[model_name]["sample_rate"]
+    sample_size = MODELS_MAP[model_name]["sample_size"]
+
+    config = Object()
+    config.sample_size = sample_size
+    config.sample_rate = sample_rate
+    config.latent_dim = 0
+
+    diffusers_model = UNet1DModel(sample_size=sample_size, sample_rate=sample_rate)
+    diffusers_state_dict = diffusers_model.state_dict()
+
+    orig_model = DiffusionUncond(config)
+    orig_model.load_state_dict(torch.load(args.model_path, map_location=device)["state_dict"])
+    orig_model = orig_model.diffusion_ema.eval()
+    orig_model_state_dict = orig_model.state_dict()
+    renamed_state_dict = rename_orig_weights(orig_model_state_dict)
+
+    renamed_minus_diffusers = set(renamed_state_dict.keys()) - set(diffusers_state_dict.keys())
+    diffusers_minus_renamed = set(diffusers_state_dict.keys()) - set(renamed_state_dict.keys())
+
+    assert len(renamed_minus_diffusers) == 0, f"Problem with {renamed_minus_diffusers}"
+    assert all(k.endswith("kernel") for k in list(diffusers_minus_renamed)), f"Problem with {diffusers_minus_renamed}"
+
+    for key, value in renamed_state_dict.items():
+        assert (
+            diffusers_state_dict[key].squeeze().shape == value.squeeze().shape
+        ), f"Shape for {key} doesn't match. Diffusers: {diffusers_state_dict[key].shape} vs. {value.shape}"
+        if key == "time_proj.weight":
+            value = value.squeeze()
+
+        diffusers_state_dict[key] = value
+
+    diffusers_model.load_state_dict(diffusers_state_dict)
+
+    steps = 100
+    seed = 33
+
+    diffusers_scheduler = IPNDMScheduler(num_train_timesteps=steps)
+
+    generator = torch.manual_seed(seed)
+    noise = torch.randn([1, 2, config.sample_size], generator=generator).to(device)
+
+    t = torch.linspace(1, 0, steps + 1, device=device)[:-1]
+    step_list = get_crash_schedule(t)
+
+    pipe = DanceDiffusionPipeline(unet=diffusers_model, scheduler=diffusers_scheduler)
+
+    generator = torch.manual_seed(33)
+    audio = pipe(num_inference_steps=steps, generator=generator).audios
+
+    generated = sampling.iplms_sample(orig_model, noise, step_list, {})
+    generated = generated.clamp(-1, 1)
+
+    diff_sum = (generated - audio).abs().sum()
+    diff_max = (generated - audio).abs().max()
+
+    if args.save:
+        pipe.save_pretrained(args.checkpoint_path)
+
+    print("Diff sum", diff_sum)
+    print("Diff max", diff_max)
+
+    assert diff_max < 1e-3, f"Diff max: {diff_max} is too much :-/"
+
+    print(f"Conversion for {model_name} successful!")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.")
+    parser.add_argument(
+        "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not."
+    )
+    parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.")
+    args = parser.parse_args()
+
+    main(args)
diff --git a/diffusers/scripts/convert_ddpm_original_checkpoint_to_diffusers.py b/diffusers/scripts/convert_ddpm_original_checkpoint_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..46595784b0bac0016b623b7122082275248363e9
--- /dev/null
+++ b/diffusers/scripts/convert_ddpm_original_checkpoint_to_diffusers.py
@@ -0,0 +1,431 @@
+import argparse
+import json
+
+import torch
+
+from diffusers import AutoencoderKL, DDPMPipeline, DDPMScheduler, UNet2DModel, VQModel
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+        new_item = new_item.replace("block.", "resnets.")
+        new_item = new_item.replace("conv_shorcut", "conv1")
+        new_item = new_item.replace("in_shortcut", "conv_shortcut")
+        new_item = new_item.replace("temb_proj", "time_emb_proj")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0, in_mid=False):
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        # In `model.mid`, the layer is called `attn`.
+        if not in_mid:
+            new_item = new_item.replace("attn", "attentions")
+        new_item = new_item.replace(".k.", ".key.")
+        new_item = new_item.replace(".v.", ".value.")
+        new_item = new_item.replace(".q.", ".query.")
+
+        new_item = new_item.replace("proj_out", "proj_attn")
+        new_item = new_item.replace("norm", "group_norm")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    if attention_paths_to_split is not None:
+        if config is None:
+            raise ValueError("Please specify the config if setting 'attention_paths_to_split' to 'True'.")
+
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config.get("num_head_channels", 1) // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape).squeeze()
+            checkpoint[path_map["key"]] = key.reshape(target_shape).squeeze()
+            checkpoint[path_map["value"]] = value.reshape(target_shape).squeeze()
+
+    for path in paths:
+        new_path = path["new"]
+
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        new_path = new_path.replace("down.", "down_blocks.")
+        new_path = new_path.replace("up.", "up_blocks.")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        if "attentions" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]].squeeze()
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def convert_ddpm_checkpoint(checkpoint, config):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = checkpoint["temb.dense.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = checkpoint["temb.dense.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = checkpoint["temb.dense.1.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = checkpoint["temb.dense.1.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = checkpoint["norm_out.weight"]
+    new_checkpoint["conv_norm_out.bias"] = checkpoint["norm_out.bias"]
+
+    new_checkpoint["conv_in.weight"] = checkpoint["conv_in.weight"]
+    new_checkpoint["conv_in.bias"] = checkpoint["conv_in.bias"]
+    new_checkpoint["conv_out.weight"] = checkpoint["conv_out.weight"]
+    new_checkpoint["conv_out.bias"] = checkpoint["conv_out.bias"]
+
+    num_down_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "down" in layer})
+    down_blocks = {
+        layer_id: [key for key in checkpoint if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    num_up_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "up" in layer})
+    up_blocks = {layer_id: [key for key in checkpoint if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)}
+
+    for i in range(num_down_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+
+        if any("downsample" in layer for layer in down_blocks[i]):
+            new_checkpoint[f"down_blocks.{i}.downsamplers.0.conv.weight"] = checkpoint[
+                f"down.{i}.downsample.op.weight"
+            ]
+            new_checkpoint[f"down_blocks.{i}.downsamplers.0.conv.bias"] = checkpoint[f"down.{i}.downsample.op.bias"]
+        #            new_checkpoint[f'down_blocks.{i}.downsamplers.0.op.weight'] = checkpoint[f'down.{i}.downsample.conv.weight']
+        #            new_checkpoint[f'down_blocks.{i}.downsamplers.0.op.bias'] = checkpoint[f'down.{i}.downsample.conv.bias']
+
+        if any("block" in layer for layer in down_blocks[i]):
+            num_blocks = len(
+                {".".join(shave_segments(layer, 2).split(".")[:2]) for layer in down_blocks[i] if "block" in layer}
+            )
+            blocks = {
+                layer_id: [key for key in down_blocks[i] if f"block.{layer_id}" in key]
+                for layer_id in range(num_blocks)
+            }
+
+            if num_blocks > 0:
+                for j in range(config["layers_per_block"]):
+                    paths = renew_resnet_paths(blocks[j])
+                    assign_to_checkpoint(paths, new_checkpoint, checkpoint)
+
+        if any("attn" in layer for layer in down_blocks[i]):
+            num_attn = len(
+                {".".join(shave_segments(layer, 2).split(".")[:2]) for layer in down_blocks[i] if "attn" in layer}
+            )
+            attns = {
+                layer_id: [key for key in down_blocks[i] if f"attn.{layer_id}" in key]
+                for layer_id in range(num_blocks)
+            }
+
+            if num_attn > 0:
+                for j in range(config["layers_per_block"]):
+                    paths = renew_attention_paths(attns[j])
+                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, config=config)
+
+    mid_block_1_layers = [key for key in checkpoint if "mid.block_1" in key]
+    mid_block_2_layers = [key for key in checkpoint if "mid.block_2" in key]
+    mid_attn_1_layers = [key for key in checkpoint if "mid.attn_1" in key]
+
+    # Mid new 2
+    paths = renew_resnet_paths(mid_block_1_layers)
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "block_1", "new": "resnets.0"}],
+    )
+
+    paths = renew_resnet_paths(mid_block_2_layers)
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "block_2", "new": "resnets.1"}],
+    )
+
+    paths = renew_attention_paths(mid_attn_1_layers, in_mid=True)
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "attn_1", "new": "attentions.0"}],
+    )
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+
+        if any("upsample" in layer for layer in up_blocks[i]):
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = checkpoint[
+                f"up.{i}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = checkpoint[f"up.{i}.upsample.conv.bias"]
+
+        if any("block" in layer for layer in up_blocks[i]):
+            num_blocks = len(
+                {".".join(shave_segments(layer, 2).split(".")[:2]) for layer in up_blocks[i] if "block" in layer}
+            )
+            blocks = {
+                layer_id: [key for key in up_blocks[i] if f"block.{layer_id}" in key] for layer_id in range(num_blocks)
+            }
+
+            if num_blocks > 0:
+                for j in range(config["layers_per_block"] + 1):
+                    replace_indices = {"old": f"up_blocks.{i}", "new": f"up_blocks.{block_id}"}
+                    paths = renew_resnet_paths(blocks[j])
+                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[replace_indices])
+
+        if any("attn" in layer for layer in up_blocks[i]):
+            num_attn = len(
+                {".".join(shave_segments(layer, 2).split(".")[:2]) for layer in up_blocks[i] if "attn" in layer}
+            )
+            attns = {
+                layer_id: [key for key in up_blocks[i] if f"attn.{layer_id}" in key] for layer_id in range(num_blocks)
+            }
+
+            if num_attn > 0:
+                for j in range(config["layers_per_block"] + 1):
+                    replace_indices = {"old": f"up_blocks.{i}", "new": f"up_blocks.{block_id}"}
+                    paths = renew_attention_paths(attns[j])
+                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[replace_indices])
+
+    new_checkpoint = {k.replace("mid_new_2", "mid_block"): v for k, v in new_checkpoint.items()}
+    return new_checkpoint
+
+
+def convert_vq_autoenc_checkpoint(checkpoint, config):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_norm_out.weight"] = checkpoint["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = checkpoint["encoder.norm_out.bias"]
+
+    new_checkpoint["encoder.conv_in.weight"] = checkpoint["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = checkpoint["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = checkpoint["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = checkpoint["encoder.conv_out.bias"]
+
+    new_checkpoint["decoder.conv_norm_out.weight"] = checkpoint["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = checkpoint["decoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = checkpoint["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = checkpoint["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = checkpoint["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = checkpoint["decoder.conv_out.bias"]
+
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in checkpoint if "down" in layer})
+    down_blocks = {
+        layer_id: [key for key in checkpoint if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in checkpoint if "up" in layer})
+    up_blocks = {layer_id: [key for key in checkpoint if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)}
+
+    for i in range(num_down_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+
+        if any("downsample" in layer for layer in down_blocks[i]):
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = checkpoint[
+                f"encoder.down.{i}.downsample.conv.weight"
+            ]
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = checkpoint[
+                f"encoder.down.{i}.downsample.conv.bias"
+            ]
+
+        if any("block" in layer for layer in down_blocks[i]):
+            num_blocks = len(
+                {".".join(shave_segments(layer, 3).split(".")[:3]) for layer in down_blocks[i] if "block" in layer}
+            )
+            blocks = {
+                layer_id: [key for key in down_blocks[i] if f"block.{layer_id}" in key]
+                for layer_id in range(num_blocks)
+            }
+
+            if num_blocks > 0:
+                for j in range(config["layers_per_block"]):
+                    paths = renew_resnet_paths(blocks[j])
+                    assign_to_checkpoint(paths, new_checkpoint, checkpoint)
+
+        if any("attn" in layer for layer in down_blocks[i]):
+            num_attn = len(
+                {".".join(shave_segments(layer, 3).split(".")[:3]) for layer in down_blocks[i] if "attn" in layer}
+            )
+            attns = {
+                layer_id: [key for key in down_blocks[i] if f"attn.{layer_id}" in key]
+                for layer_id in range(num_blocks)
+            }
+
+            if num_attn > 0:
+                for j in range(config["layers_per_block"]):
+                    paths = renew_attention_paths(attns[j])
+                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, config=config)
+
+    mid_block_1_layers = [key for key in checkpoint if "mid.block_1" in key]
+    mid_block_2_layers = [key for key in checkpoint if "mid.block_2" in key]
+    mid_attn_1_layers = [key for key in checkpoint if "mid.attn_1" in key]
+
+    # Mid new 2
+    paths = renew_resnet_paths(mid_block_1_layers)
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "block_1", "new": "resnets.0"}],
+    )
+
+    paths = renew_resnet_paths(mid_block_2_layers)
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "block_2", "new": "resnets.1"}],
+    )
+
+    paths = renew_attention_paths(mid_attn_1_layers, in_mid=True)
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "attn_1", "new": "attentions.0"}],
+    )
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+
+        if any("upsample" in layer for layer in up_blocks[i]):
+            new_checkpoint[f"decoder.up_blocks.{block_id}.upsamplers.0.conv.weight"] = checkpoint[
+                f"decoder.up.{i}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{block_id}.upsamplers.0.conv.bias"] = checkpoint[
+                f"decoder.up.{i}.upsample.conv.bias"
+            ]
+
+        if any("block" in layer for layer in up_blocks[i]):
+            num_blocks = len(
+                {".".join(shave_segments(layer, 3).split(".")[:3]) for layer in up_blocks[i] if "block" in layer}
+            )
+            blocks = {
+                layer_id: [key for key in up_blocks[i] if f"block.{layer_id}" in key] for layer_id in range(num_blocks)
+            }
+
+            if num_blocks > 0:
+                for j in range(config["layers_per_block"] + 1):
+                    replace_indices = {"old": f"up_blocks.{i}", "new": f"up_blocks.{block_id}"}
+                    paths = renew_resnet_paths(blocks[j])
+                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[replace_indices])
+
+        if any("attn" in layer for layer in up_blocks[i]):
+            num_attn = len(
+                {".".join(shave_segments(layer, 3).split(".")[:3]) for layer in up_blocks[i] if "attn" in layer}
+            )
+            attns = {
+                layer_id: [key for key in up_blocks[i] if f"attn.{layer_id}" in key] for layer_id in range(num_blocks)
+            }
+
+            if num_attn > 0:
+                for j in range(config["layers_per_block"] + 1):
+                    replace_indices = {"old": f"up_blocks.{i}", "new": f"up_blocks.{block_id}"}
+                    paths = renew_attention_paths(attns[j])
+                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[replace_indices])
+
+    new_checkpoint = {k.replace("mid_new_2", "mid_block"): v for k, v in new_checkpoint.items()}
+    new_checkpoint["quant_conv.weight"] = checkpoint["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = checkpoint["quant_conv.bias"]
+    if "quantize.embedding.weight" in checkpoint:
+        new_checkpoint["quantize.embedding.weight"] = checkpoint["quantize.embedding.weight"]
+    new_checkpoint["post_quant_conv.weight"] = checkpoint["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = checkpoint["post_quant_conv.bias"]
+
+    return new_checkpoint
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the architecture.",
+    )
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    args = parser.parse_args()
+    checkpoint = torch.load(args.checkpoint_path)
+
+    with open(args.config_file) as f:
+        config = json.loads(f.read())
+
+    # unet case
+    key_prefix_set = {key.split(".")[0] for key in checkpoint.keys()}
+    if "encoder" in key_prefix_set and "decoder" in key_prefix_set:
+        converted_checkpoint = convert_vq_autoenc_checkpoint(checkpoint, config)
+    else:
+        converted_checkpoint = convert_ddpm_checkpoint(checkpoint, config)
+
+    if "ddpm" in config:
+        del config["ddpm"]
+
+    if config["_class_name"] == "VQModel":
+        model = VQModel(**config)
+        model.load_state_dict(converted_checkpoint)
+        model.save_pretrained(args.dump_path)
+    elif config["_class_name"] == "AutoencoderKL":
+        model = AutoencoderKL(**config)
+        model.load_state_dict(converted_checkpoint)
+        model.save_pretrained(args.dump_path)
+    else:
+        model = UNet2DModel(**config)
+        model.load_state_dict(converted_checkpoint)
+
+        scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1]))
+
+        pipe = DDPMPipeline(unet=model, scheduler=scheduler)
+        pipe.save_pretrained(args.dump_path)
diff --git a/diffusers/scripts/convert_diffusers_sdxl_lora_to_webui.py b/diffusers/scripts/convert_diffusers_sdxl_lora_to_webui.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb3871275cbc68809379596a9209e08f377936c
--- /dev/null
+++ b/diffusers/scripts/convert_diffusers_sdxl_lora_to_webui.py
@@ -0,0 +1,56 @@
+# Script for converting a Hugging Face Diffusers trained SDXL LoRAs to Kohya format
+# This means that you can input your diffusers-trained LoRAs and
+# Get the output to work with WebUIs such as AUTOMATIC1111, ComfyUI, SD.Next and others.
+
+# To get started you can find some cool `diffusers` trained LoRAs such as this cute Corgy
+# https://huggingface.co/ignasbud/corgy_dog_LoRA/, download its `pytorch_lora_weights.safetensors` file
+# and run the script:
+# python convert_diffusers_sdxl_lora_to_webui.py --input_lora pytorch_lora_weights.safetensors --output_lora corgy.safetensors
+# now you can use corgy.safetensors in your WebUI of choice!
+
+# To train your own, here are some diffusers training scripts and utils that you can use and then convert:
+# LoRA Ease - no code SDXL Dreambooth LoRA trainer: https://huggingface.co/spaces/multimodalart/lora-ease
+# Dreambooth Advanced Training Script - state of the art techniques such as pivotal tuning and prodigy optimizer:
+# - Script: https://github.com/huggingface/diffusers/blob/main/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
+# - Colab (only on Pro): https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_Dreambooth_LoRA_advanced_example.ipynb
+# Canonical diffusers training scripts:
+# - Script: https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora_sdxl.py
+# - Colab (runs on free tier): https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb
+
+import argparse
+import os
+
+from safetensors.torch import load_file, save_file
+
+from diffusers.utils import convert_all_state_dict_to_peft, convert_state_dict_to_kohya
+
+
+def convert_and_save(input_lora, output_lora=None):
+    if output_lora is None:
+        base_name = os.path.splitext(input_lora)[0]
+        output_lora = f"{base_name}_webui.safetensors"
+
+    diffusers_state_dict = load_file(input_lora)
+    peft_state_dict = convert_all_state_dict_to_peft(diffusers_state_dict)
+    kohya_state_dict = convert_state_dict_to_kohya(peft_state_dict)
+    save_file(kohya_state_dict, output_lora)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert LoRA model to PEFT and then to Kohya format.")
+    parser.add_argument(
+        "--input_lora",
+        type=str,
+        required=True,
+        help="Path to the input LoRA model file in the diffusers format.",
+    )
+    parser.add_argument(
+        "--output_lora",
+        type=str,
+        required=False,
+        help="Path for the converted LoRA (safetensors format for AUTOMATIC1111, ComfyUI, etc.). Optional, defaults to input name with a _webui suffix.",
+    )
+
+    args = parser.parse_args()
+
+    convert_and_save(args.input_lora, args.output_lora)
diff --git a/diffusers/scripts/convert_diffusers_to_original_sdxl.py b/diffusers/scripts/convert_diffusers_to_original_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..648d0376f72eb8d73663247ae40be62b1c960eda
--- /dev/null
+++ b/diffusers/scripts/convert_diffusers_to_original_sdxl.py
@@ -0,0 +1,350 @@
+# Script for converting a HF Diffusers saved pipeline to a Stable Diffusion checkpoint.
+# *Only* converts the UNet, VAE, and Text Encoder.
+# Does not convert optimizer state or any other thing.
+
+import argparse
+import os.path as osp
+import re
+
+import torch
+from safetensors.torch import load_file, save_file
+
+
+# =================#
+# UNet Conversion #
+# =================#
+
+unet_conversion_map = [
+    # (stable-diffusion, HF Diffusers)
+    ("time_embed.0.weight", "time_embedding.linear_1.weight"),
+    ("time_embed.0.bias", "time_embedding.linear_1.bias"),
+    ("time_embed.2.weight", "time_embedding.linear_2.weight"),
+    ("time_embed.2.bias", "time_embedding.linear_2.bias"),
+    ("input_blocks.0.0.weight", "conv_in.weight"),
+    ("input_blocks.0.0.bias", "conv_in.bias"),
+    ("out.0.weight", "conv_norm_out.weight"),
+    ("out.0.bias", "conv_norm_out.bias"),
+    ("out.2.weight", "conv_out.weight"),
+    ("out.2.bias", "conv_out.bias"),
+    # the following are for sdxl
+    ("label_emb.0.0.weight", "add_embedding.linear_1.weight"),
+    ("label_emb.0.0.bias", "add_embedding.linear_1.bias"),
+    ("label_emb.0.2.weight", "add_embedding.linear_2.weight"),
+    ("label_emb.0.2.bias", "add_embedding.linear_2.bias"),
+]
+
+unet_conversion_map_resnet = [
+    # (stable-diffusion, HF Diffusers)
+    ("in_layers.0", "norm1"),
+    ("in_layers.2", "conv1"),
+    ("out_layers.0", "norm2"),
+    ("out_layers.3", "conv2"),
+    ("emb_layers.1", "time_emb_proj"),
+    ("skip_connection", "conv_shortcut"),
+]
+
+unet_conversion_map_layer = []
+# hardcoded number of downblocks and resnets/attentions...
+# would need smarter logic for other networks.
+for i in range(3):
+    # loop over downblocks/upblocks
+
+    for j in range(2):
+        # loop over resnets/attentions for downblocks
+        hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
+        sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
+        unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
+
+        if i > 0:
+            hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
+            sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
+            unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
+
+    for j in range(4):
+        # loop over resnets/attentions for upblocks
+        hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
+        sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
+        unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
+
+        if i < 2:
+            # no attention layers in up_blocks.0
+            hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
+            sd_up_atn_prefix = f"output_blocks.{3 * i + j}.1."
+            unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
+
+    if i < 3:
+        # no downsample in down_blocks.3
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
+        sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
+        unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
+
+        # no upsample in up_blocks.3
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
+        unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
+unet_conversion_map_layer.append(("output_blocks.2.2.conv.", "output_blocks.2.1.conv."))
+
+hf_mid_atn_prefix = "mid_block.attentions.0."
+sd_mid_atn_prefix = "middle_block.1."
+unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
+for j in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{j}."
+    sd_mid_res_prefix = f"middle_block.{2*j}."
+    unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
+
+
+def convert_unet_state_dict(unet_state_dict):
+    # buyer beware: this is a *brittle* function,
+    # and correct output requires that all of these pieces interact in
+    # the exact order in which I have arranged them.
+    mapping = {k: k for k in unet_state_dict.keys()}
+    for sd_name, hf_name in unet_conversion_map:
+        mapping[hf_name] = sd_name
+    for k, v in mapping.items():
+        if "resnets" in k:
+            for sd_part, hf_part in unet_conversion_map_resnet:
+                v = v.replace(hf_part, sd_part)
+            mapping[k] = v
+    for k, v in mapping.items():
+        for sd_part, hf_part in unet_conversion_map_layer:
+            v = v.replace(hf_part, sd_part)
+        mapping[k] = v
+    new_state_dict = {sd_name: unet_state_dict[hf_name] for hf_name, sd_name in mapping.items()}
+    return new_state_dict
+
+
+# ================#
+# VAE Conversion #
+# ================#
+
+vae_conversion_map = [
+    # (stable-diffusion, HF Diffusers)
+    ("nin_shortcut", "conv_shortcut"),
+    ("norm_out", "conv_norm_out"),
+    ("mid.attn_1.", "mid_block.attentions.0."),
+]
+
+for i in range(4):
+    # down_blocks have two resnets
+    for j in range(2):
+        hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
+        sd_down_prefix = f"encoder.down.{i}.block.{j}."
+        vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
+
+    if i < 3:
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
+        sd_downsample_prefix = f"down.{i}.downsample."
+        vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
+
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"up.{3-i}.upsample."
+        vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
+
+    # up_blocks have three resnets
+    # also, up blocks in hf are numbered in reverse from sd
+    for j in range(3):
+        hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
+        sd_up_prefix = f"decoder.up.{3-i}.block.{j}."
+        vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
+
+# this part accounts for mid blocks in both the encoder and the decoder
+for i in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{i}."
+    sd_mid_res_prefix = f"mid.block_{i+1}."
+    vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
+
+
+vae_conversion_map_attn = [
+    # (stable-diffusion, HF Diffusers)
+    ("norm.", "group_norm."),
+    # the following are for SDXL
+    ("q.", "to_q."),
+    ("k.", "to_k."),
+    ("v.", "to_v."),
+    ("proj_out.", "to_out.0."),
+]
+
+
+def reshape_weight_for_sd(w):
+    # convert HF linear weights to SD conv2d weights
+    if not w.ndim == 1:
+        return w.reshape(*w.shape, 1, 1)
+    else:
+        return w
+
+
+def convert_vae_state_dict(vae_state_dict):
+    mapping = {k: k for k in vae_state_dict.keys()}
+    for k, v in mapping.items():
+        for sd_part, hf_part in vae_conversion_map:
+            v = v.replace(hf_part, sd_part)
+        mapping[k] = v
+    for k, v in mapping.items():
+        if "attentions" in k:
+            for sd_part, hf_part in vae_conversion_map_attn:
+                v = v.replace(hf_part, sd_part)
+            mapping[k] = v
+    new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
+    weights_to_convert = ["q", "k", "v", "proj_out"]
+    for k, v in new_state_dict.items():
+        for weight_name in weights_to_convert:
+            if f"mid.attn_1.{weight_name}.weight" in k:
+                print(f"Reshaping {k} for SD format")
+                new_state_dict[k] = reshape_weight_for_sd(v)
+    return new_state_dict
+
+
+# =========================#
+# Text Encoder Conversion #
+# =========================#
+
+
+textenc_conversion_lst = [
+    # (stable-diffusion, HF Diffusers)
+    ("transformer.resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "text_model.final_layer_norm."),
+    ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
+    ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
+]
+protected = {re.escape(x[1]): x[0] for x in textenc_conversion_lst}
+textenc_pattern = re.compile("|".join(protected.keys()))
+
+# Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
+code2idx = {"q": 0, "k": 1, "v": 2}
+
+
+def convert_openclip_text_enc_state_dict(text_enc_dict):
+    new_state_dict = {}
+    capture_qkv_weight = {}
+    capture_qkv_bias = {}
+    for k, v in text_enc_dict.items():
+        if (
+            k.endswith(".self_attn.q_proj.weight")
+            or k.endswith(".self_attn.k_proj.weight")
+            or k.endswith(".self_attn.v_proj.weight")
+        ):
+            k_pre = k[: -len(".q_proj.weight")]
+            k_code = k[-len("q_proj.weight")]
+            if k_pre not in capture_qkv_weight:
+                capture_qkv_weight[k_pre] = [None, None, None]
+            capture_qkv_weight[k_pre][code2idx[k_code]] = v
+            continue
+
+        if (
+            k.endswith(".self_attn.q_proj.bias")
+            or k.endswith(".self_attn.k_proj.bias")
+            or k.endswith(".self_attn.v_proj.bias")
+        ):
+            k_pre = k[: -len(".q_proj.bias")]
+            k_code = k[-len("q_proj.bias")]
+            if k_pre not in capture_qkv_bias:
+                capture_qkv_bias[k_pre] = [None, None, None]
+            capture_qkv_bias[k_pre][code2idx[k_code]] = v
+            continue
+
+        relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k)
+        new_state_dict[relabelled_key] = v
+
+    for k_pre, tensors in capture_qkv_weight.items():
+        if None in tensors:
+            raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
+        relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k_pre)
+        new_state_dict[relabelled_key + ".in_proj_weight"] = torch.cat(tensors)
+
+    for k_pre, tensors in capture_qkv_bias.items():
+        if None in tensors:
+            raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
+        relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k_pre)
+        new_state_dict[relabelled_key + ".in_proj_bias"] = torch.cat(tensors)
+
+    return new_state_dict
+
+
+def convert_openai_text_enc_state_dict(text_enc_dict):
+    return text_enc_dict
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.")
+    parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
+    parser.add_argument(
+        "--use_safetensors", action="store_true", help="Save weights use safetensors, default is ckpt."
+    )
+
+    args = parser.parse_args()
+
+    assert args.model_path is not None, "Must provide a model path!"
+
+    assert args.checkpoint_path is not None, "Must provide a checkpoint path!"
+
+    # Path for safetensors
+    unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.safetensors")
+    vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.safetensors")
+    text_enc_path = osp.join(args.model_path, "text_encoder", "model.safetensors")
+    text_enc_2_path = osp.join(args.model_path, "text_encoder_2", "model.safetensors")
+
+    # Load models from safetensors if it exists, if it doesn't pytorch
+    if osp.exists(unet_path):
+        unet_state_dict = load_file(unet_path, device="cpu")
+    else:
+        unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.bin")
+        unet_state_dict = torch.load(unet_path, map_location="cpu")
+
+    if osp.exists(vae_path):
+        vae_state_dict = load_file(vae_path, device="cpu")
+    else:
+        vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.bin")
+        vae_state_dict = torch.load(vae_path, map_location="cpu")
+
+    if osp.exists(text_enc_path):
+        text_enc_dict = load_file(text_enc_path, device="cpu")
+    else:
+        text_enc_path = osp.join(args.model_path, "text_encoder", "pytorch_model.bin")
+        text_enc_dict = torch.load(text_enc_path, map_location="cpu")
+
+    if osp.exists(text_enc_2_path):
+        text_enc_2_dict = load_file(text_enc_2_path, device="cpu")
+    else:
+        text_enc_2_path = osp.join(args.model_path, "text_encoder_2", "pytorch_model.bin")
+        text_enc_2_dict = torch.load(text_enc_2_path, map_location="cpu")
+
+    # Convert the UNet model
+    unet_state_dict = convert_unet_state_dict(unet_state_dict)
+    unet_state_dict = {"model.diffusion_model." + k: v for k, v in unet_state_dict.items()}
+
+    # Convert the VAE model
+    vae_state_dict = convert_vae_state_dict(vae_state_dict)
+    vae_state_dict = {"first_stage_model." + k: v for k, v in vae_state_dict.items()}
+
+    # Convert text encoder 1
+    text_enc_dict = convert_openai_text_enc_state_dict(text_enc_dict)
+    text_enc_dict = {"conditioner.embedders.0.transformer." + k: v for k, v in text_enc_dict.items()}
+
+    # Convert text encoder 2
+    text_enc_2_dict = convert_openclip_text_enc_state_dict(text_enc_2_dict)
+    text_enc_2_dict = {"conditioner.embedders.1.model." + k: v for k, v in text_enc_2_dict.items()}
+    # We call the `.T.contiguous()` to match what's done in
+    # https://github.com/huggingface/diffusers/blob/84905ca7287876b925b6bf8e9bb92fec21c78764/src/diffusers/loaders/single_file_utils.py#L1085
+    text_enc_2_dict["conditioner.embedders.1.model.text_projection"] = text_enc_2_dict.pop(
+        "conditioner.embedders.1.model.text_projection.weight"
+    ).T.contiguous()
+
+    # Put together new checkpoint
+    state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict, **text_enc_2_dict}
+
+    if args.half:
+        state_dict = {k: v.half() for k, v in state_dict.items()}
+
+    if args.use_safetensors:
+        save_file(state_dict, args.checkpoint_path)
+    else:
+        state_dict = {"state_dict": state_dict}
+        torch.save(state_dict, args.checkpoint_path)
diff --git a/diffusers/scripts/convert_diffusers_to_original_stable_diffusion.py b/diffusers/scripts/convert_diffusers_to_original_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b7df070c436b4d1a5fad2b9a096f1999aacc33
--- /dev/null
+++ b/diffusers/scripts/convert_diffusers_to_original_stable_diffusion.py
@@ -0,0 +1,353 @@
+# Script for converting a HF Diffusers saved pipeline to a Stable Diffusion checkpoint.
+# *Only* converts the UNet, VAE, and Text Encoder.
+# Does not convert optimizer state or any other thing.
+
+import argparse
+import os.path as osp
+import re
+
+import torch
+from safetensors.torch import load_file, save_file
+
+
+# =================#
+# UNet Conversion #
+# =================#
+
+unet_conversion_map = [
+    # (stable-diffusion, HF Diffusers)
+    ("time_embed.0.weight", "time_embedding.linear_1.weight"),
+    ("time_embed.0.bias", "time_embedding.linear_1.bias"),
+    ("time_embed.2.weight", "time_embedding.linear_2.weight"),
+    ("time_embed.2.bias", "time_embedding.linear_2.bias"),
+    ("input_blocks.0.0.weight", "conv_in.weight"),
+    ("input_blocks.0.0.bias", "conv_in.bias"),
+    ("out.0.weight", "conv_norm_out.weight"),
+    ("out.0.bias", "conv_norm_out.bias"),
+    ("out.2.weight", "conv_out.weight"),
+    ("out.2.bias", "conv_out.bias"),
+]
+
+unet_conversion_map_resnet = [
+    # (stable-diffusion, HF Diffusers)
+    ("in_layers.0", "norm1"),
+    ("in_layers.2", "conv1"),
+    ("out_layers.0", "norm2"),
+    ("out_layers.3", "conv2"),
+    ("emb_layers.1", "time_emb_proj"),
+    ("skip_connection", "conv_shortcut"),
+]
+
+unet_conversion_map_layer = []
+# hardcoded number of downblocks and resnets/attentions...
+# would need smarter logic for other networks.
+for i in range(4):
+    # loop over downblocks/upblocks
+
+    for j in range(2):
+        # loop over resnets/attentions for downblocks
+        hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
+        sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
+        unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
+
+        if i < 3:
+            # no attention layers in down_blocks.3
+            hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
+            sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
+            unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
+
+    for j in range(3):
+        # loop over resnets/attentions for upblocks
+        hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
+        sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
+        unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
+
+        if i > 0:
+            # no attention layers in up_blocks.0
+            hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
+            sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
+            unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
+
+    if i < 3:
+        # no downsample in down_blocks.3
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
+        sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
+        unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
+
+        # no upsample in up_blocks.3
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
+        unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
+
+hf_mid_atn_prefix = "mid_block.attentions.0."
+sd_mid_atn_prefix = "middle_block.1."
+unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
+
+for j in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{j}."
+    sd_mid_res_prefix = f"middle_block.{2*j}."
+    unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
+
+
+def convert_unet_state_dict(unet_state_dict):
+    # buyer beware: this is a *brittle* function,
+    # and correct output requires that all of these pieces interact in
+    # the exact order in which I have arranged them.
+    mapping = {k: k for k in unet_state_dict.keys()}
+    for sd_name, hf_name in unet_conversion_map:
+        mapping[hf_name] = sd_name
+    for k, v in mapping.items():
+        if "resnets" in k:
+            for sd_part, hf_part in unet_conversion_map_resnet:
+                v = v.replace(hf_part, sd_part)
+            mapping[k] = v
+    for k, v in mapping.items():
+        for sd_part, hf_part in unet_conversion_map_layer:
+            v = v.replace(hf_part, sd_part)
+        mapping[k] = v
+    new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
+    return new_state_dict
+
+
+# ================#
+# VAE Conversion #
+# ================#
+
+vae_conversion_map = [
+    # (stable-diffusion, HF Diffusers)
+    ("nin_shortcut", "conv_shortcut"),
+    ("norm_out", "conv_norm_out"),
+    ("mid.attn_1.", "mid_block.attentions.0."),
+]
+
+for i in range(4):
+    # down_blocks have two resnets
+    for j in range(2):
+        hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
+        sd_down_prefix = f"encoder.down.{i}.block.{j}."
+        vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
+
+    if i < 3:
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
+        sd_downsample_prefix = f"down.{i}.downsample."
+        vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
+
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"up.{3-i}.upsample."
+        vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
+
+    # up_blocks have three resnets
+    # also, up blocks in hf are numbered in reverse from sd
+    for j in range(3):
+        hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
+        sd_up_prefix = f"decoder.up.{3-i}.block.{j}."
+        vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
+
+# this part accounts for mid blocks in both the encoder and the decoder
+for i in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{i}."
+    sd_mid_res_prefix = f"mid.block_{i+1}."
+    vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
+
+
+vae_conversion_map_attn = [
+    # (stable-diffusion, HF Diffusers)
+    ("norm.", "group_norm."),
+    ("q.", "query."),
+    ("k.", "key."),
+    ("v.", "value."),
+    ("proj_out.", "proj_attn."),
+]
+
+# This is probably not the most ideal solution, but it does work.
+vae_extra_conversion_map = [
+    ("to_q", "q"),
+    ("to_k", "k"),
+    ("to_v", "v"),
+    ("to_out.0", "proj_out"),
+]
+
+
+def reshape_weight_for_sd(w):
+    # convert HF linear weights to SD conv2d weights
+    if not w.ndim == 1:
+        return w.reshape(*w.shape, 1, 1)
+    else:
+        return w
+
+
+def convert_vae_state_dict(vae_state_dict):
+    mapping = {k: k for k in vae_state_dict.keys()}
+    for k, v in mapping.items():
+        for sd_part, hf_part in vae_conversion_map:
+            v = v.replace(hf_part, sd_part)
+        mapping[k] = v
+    for k, v in mapping.items():
+        if "attentions" in k:
+            for sd_part, hf_part in vae_conversion_map_attn:
+                v = v.replace(hf_part, sd_part)
+            mapping[k] = v
+    new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
+    weights_to_convert = ["q", "k", "v", "proj_out"]
+    keys_to_rename = {}
+    for k, v in new_state_dict.items():
+        for weight_name in weights_to_convert:
+            if f"mid.attn_1.{weight_name}.weight" in k:
+                print(f"Reshaping {k} for SD format")
+                new_state_dict[k] = reshape_weight_for_sd(v)
+        for weight_name, real_weight_name in vae_extra_conversion_map:
+            if f"mid.attn_1.{weight_name}.weight" in k or f"mid.attn_1.{weight_name}.bias" in k:
+                keys_to_rename[k] = k.replace(weight_name, real_weight_name)
+    for k, v in keys_to_rename.items():
+        if k in new_state_dict:
+            print(f"Renaming {k} to {v}")
+            new_state_dict[v] = reshape_weight_for_sd(new_state_dict[k])
+            del new_state_dict[k]
+    return new_state_dict
+
+
+# =========================#
+# Text Encoder Conversion #
+# =========================#
+
+
+textenc_conversion_lst = [
+    # (stable-diffusion, HF Diffusers)
+    ("resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "transformer.text_model.final_layer_norm."),
+    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
+    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
+]
+protected = {re.escape(x[1]): x[0] for x in textenc_conversion_lst}
+textenc_pattern = re.compile("|".join(protected.keys()))
+
+# Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
+code2idx = {"q": 0, "k": 1, "v": 2}
+
+
+def convert_text_enc_state_dict_v20(text_enc_dict):
+    new_state_dict = {}
+    capture_qkv_weight = {}
+    capture_qkv_bias = {}
+    for k, v in text_enc_dict.items():
+        if (
+            k.endswith(".self_attn.q_proj.weight")
+            or k.endswith(".self_attn.k_proj.weight")
+            or k.endswith(".self_attn.v_proj.weight")
+        ):
+            k_pre = k[: -len(".q_proj.weight")]
+            k_code = k[-len("q_proj.weight")]
+            if k_pre not in capture_qkv_weight:
+                capture_qkv_weight[k_pre] = [None, None, None]
+            capture_qkv_weight[k_pre][code2idx[k_code]] = v
+            continue
+
+        if (
+            k.endswith(".self_attn.q_proj.bias")
+            or k.endswith(".self_attn.k_proj.bias")
+            or k.endswith(".self_attn.v_proj.bias")
+        ):
+            k_pre = k[: -len(".q_proj.bias")]
+            k_code = k[-len("q_proj.bias")]
+            if k_pre not in capture_qkv_bias:
+                capture_qkv_bias[k_pre] = [None, None, None]
+            capture_qkv_bias[k_pre][code2idx[k_code]] = v
+            continue
+
+        relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k)
+        new_state_dict[relabelled_key] = v
+
+    for k_pre, tensors in capture_qkv_weight.items():
+        if None in tensors:
+            raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
+        relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k_pre)
+        new_state_dict[relabelled_key + ".in_proj_weight"] = torch.cat(tensors)
+
+    for k_pre, tensors in capture_qkv_bias.items():
+        if None in tensors:
+            raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
+        relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k_pre)
+        new_state_dict[relabelled_key + ".in_proj_bias"] = torch.cat(tensors)
+
+    return new_state_dict
+
+
+def convert_text_enc_state_dict(text_enc_dict):
+    return text_enc_dict
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.")
+    parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
+    parser.add_argument(
+        "--use_safetensors", action="store_true", help="Save weights use safetensors, default is ckpt."
+    )
+
+    args = parser.parse_args()
+
+    assert args.model_path is not None, "Must provide a model path!"
+
+    assert args.checkpoint_path is not None, "Must provide a checkpoint path!"
+
+    # Path for safetensors
+    unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.safetensors")
+    vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.safetensors")
+    text_enc_path = osp.join(args.model_path, "text_encoder", "model.safetensors")
+
+    # Load models from safetensors if it exists, if it doesn't pytorch
+    if osp.exists(unet_path):
+        unet_state_dict = load_file(unet_path, device="cpu")
+    else:
+        unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.bin")
+        unet_state_dict = torch.load(unet_path, map_location="cpu")
+
+    if osp.exists(vae_path):
+        vae_state_dict = load_file(vae_path, device="cpu")
+    else:
+        vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.bin")
+        vae_state_dict = torch.load(vae_path, map_location="cpu")
+
+    if osp.exists(text_enc_path):
+        text_enc_dict = load_file(text_enc_path, device="cpu")
+    else:
+        text_enc_path = osp.join(args.model_path, "text_encoder", "pytorch_model.bin")
+        text_enc_dict = torch.load(text_enc_path, map_location="cpu")
+
+    # Convert the UNet model
+    unet_state_dict = convert_unet_state_dict(unet_state_dict)
+    unet_state_dict = {"model.diffusion_model." + k: v for k, v in unet_state_dict.items()}
+
+    # Convert the VAE model
+    vae_state_dict = convert_vae_state_dict(vae_state_dict)
+    vae_state_dict = {"first_stage_model." + k: v for k, v in vae_state_dict.items()}
+
+    # Easiest way to identify v2.0 model seems to be that the text encoder (OpenCLIP) is deeper
+    is_v20_model = "text_model.encoder.layers.22.layer_norm2.bias" in text_enc_dict
+
+    if is_v20_model:
+        # Need to add the tag 'transformer' in advance so we can knock it out from the final layer-norm
+        text_enc_dict = {"transformer." + k: v for k, v in text_enc_dict.items()}
+        text_enc_dict = convert_text_enc_state_dict_v20(text_enc_dict)
+        text_enc_dict = {"cond_stage_model.model." + k: v for k, v in text_enc_dict.items()}
+    else:
+        text_enc_dict = convert_text_enc_state_dict(text_enc_dict)
+        text_enc_dict = {"cond_stage_model.transformer." + k: v for k, v in text_enc_dict.items()}
+
+    # Put together new checkpoint
+    state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict}
+    if args.half:
+        state_dict = {k: v.half() for k, v in state_dict.items()}
+
+    if args.use_safetensors:
+        save_file(state_dict, args.checkpoint_path)
+    else:
+        state_dict = {"state_dict": state_dict}
+        torch.save(state_dict, args.checkpoint_path)
diff --git a/diffusers/scripts/convert_dit_to_diffusers.py b/diffusers/scripts/convert_dit_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc127f69555c260f594e70444b1540faa196e3fb
--- /dev/null
+++ b/diffusers/scripts/convert_dit_to_diffusers.py
@@ -0,0 +1,162 @@
+import argparse
+import os
+
+import torch
+from torchvision.datasets.utils import download_url
+
+from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, Transformer2DModel
+
+
+pretrained_models = {512: "DiT-XL-2-512x512.pt", 256: "DiT-XL-2-256x256.pt"}
+
+
+def download_model(model_name):
+    """
+    Downloads a pre-trained DiT model from the web.
+    """
+    local_path = f"pretrained_models/{model_name}"
+    if not os.path.isfile(local_path):
+        os.makedirs("pretrained_models", exist_ok=True)
+        web_path = f"https://dl.fbaipublicfiles.com/DiT/models/{model_name}"
+        download_url(web_path, "pretrained_models")
+    model = torch.load(local_path, map_location=lambda storage, loc: storage)
+    return model
+
+
+def main(args):
+    state_dict = download_model(pretrained_models[args.image_size])
+
+    state_dict["pos_embed.proj.weight"] = state_dict["x_embedder.proj.weight"]
+    state_dict["pos_embed.proj.bias"] = state_dict["x_embedder.proj.bias"]
+    state_dict.pop("x_embedder.proj.weight")
+    state_dict.pop("x_embedder.proj.bias")
+
+    for depth in range(28):
+        state_dict[f"transformer_blocks.{depth}.norm1.emb.timestep_embedder.linear_1.weight"] = state_dict[
+            "t_embedder.mlp.0.weight"
+        ]
+        state_dict[f"transformer_blocks.{depth}.norm1.emb.timestep_embedder.linear_1.bias"] = state_dict[
+            "t_embedder.mlp.0.bias"
+        ]
+        state_dict[f"transformer_blocks.{depth}.norm1.emb.timestep_embedder.linear_2.weight"] = state_dict[
+            "t_embedder.mlp.2.weight"
+        ]
+        state_dict[f"transformer_blocks.{depth}.norm1.emb.timestep_embedder.linear_2.bias"] = state_dict[
+            "t_embedder.mlp.2.bias"
+        ]
+        state_dict[f"transformer_blocks.{depth}.norm1.emb.class_embedder.embedding_table.weight"] = state_dict[
+            "y_embedder.embedding_table.weight"
+        ]
+
+        state_dict[f"transformer_blocks.{depth}.norm1.linear.weight"] = state_dict[
+            f"blocks.{depth}.adaLN_modulation.1.weight"
+        ]
+        state_dict[f"transformer_blocks.{depth}.norm1.linear.bias"] = state_dict[
+            f"blocks.{depth}.adaLN_modulation.1.bias"
+        ]
+
+        q, k, v = torch.chunk(state_dict[f"blocks.{depth}.attn.qkv.weight"], 3, dim=0)
+        q_bias, k_bias, v_bias = torch.chunk(state_dict[f"blocks.{depth}.attn.qkv.bias"], 3, dim=0)
+
+        state_dict[f"transformer_blocks.{depth}.attn1.to_q.weight"] = q
+        state_dict[f"transformer_blocks.{depth}.attn1.to_q.bias"] = q_bias
+        state_dict[f"transformer_blocks.{depth}.attn1.to_k.weight"] = k
+        state_dict[f"transformer_blocks.{depth}.attn1.to_k.bias"] = k_bias
+        state_dict[f"transformer_blocks.{depth}.attn1.to_v.weight"] = v
+        state_dict[f"transformer_blocks.{depth}.attn1.to_v.bias"] = v_bias
+
+        state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.weight"] = state_dict[
+            f"blocks.{depth}.attn.proj.weight"
+        ]
+        state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.bias"] = state_dict[f"blocks.{depth}.attn.proj.bias"]
+
+        state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.weight"] = state_dict[f"blocks.{depth}.mlp.fc1.weight"]
+        state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.bias"] = state_dict[f"blocks.{depth}.mlp.fc1.bias"]
+        state_dict[f"transformer_blocks.{depth}.ff.net.2.weight"] = state_dict[f"blocks.{depth}.mlp.fc2.weight"]
+        state_dict[f"transformer_blocks.{depth}.ff.net.2.bias"] = state_dict[f"blocks.{depth}.mlp.fc2.bias"]
+
+        state_dict.pop(f"blocks.{depth}.attn.qkv.weight")
+        state_dict.pop(f"blocks.{depth}.attn.qkv.bias")
+        state_dict.pop(f"blocks.{depth}.attn.proj.weight")
+        state_dict.pop(f"blocks.{depth}.attn.proj.bias")
+        state_dict.pop(f"blocks.{depth}.mlp.fc1.weight")
+        state_dict.pop(f"blocks.{depth}.mlp.fc1.bias")
+        state_dict.pop(f"blocks.{depth}.mlp.fc2.weight")
+        state_dict.pop(f"blocks.{depth}.mlp.fc2.bias")
+        state_dict.pop(f"blocks.{depth}.adaLN_modulation.1.weight")
+        state_dict.pop(f"blocks.{depth}.adaLN_modulation.1.bias")
+
+    state_dict.pop("t_embedder.mlp.0.weight")
+    state_dict.pop("t_embedder.mlp.0.bias")
+    state_dict.pop("t_embedder.mlp.2.weight")
+    state_dict.pop("t_embedder.mlp.2.bias")
+    state_dict.pop("y_embedder.embedding_table.weight")
+
+    state_dict["proj_out_1.weight"] = state_dict["final_layer.adaLN_modulation.1.weight"]
+    state_dict["proj_out_1.bias"] = state_dict["final_layer.adaLN_modulation.1.bias"]
+    state_dict["proj_out_2.weight"] = state_dict["final_layer.linear.weight"]
+    state_dict["proj_out_2.bias"] = state_dict["final_layer.linear.bias"]
+
+    state_dict.pop("final_layer.linear.weight")
+    state_dict.pop("final_layer.linear.bias")
+    state_dict.pop("final_layer.adaLN_modulation.1.weight")
+    state_dict.pop("final_layer.adaLN_modulation.1.bias")
+
+    # DiT XL/2
+    transformer = Transformer2DModel(
+        sample_size=args.image_size // 8,
+        num_layers=28,
+        attention_head_dim=72,
+        in_channels=4,
+        out_channels=8,
+        patch_size=2,
+        attention_bias=True,
+        num_attention_heads=16,
+        activation_fn="gelu-approximate",
+        num_embeds_ada_norm=1000,
+        norm_type="ada_norm_zero",
+        norm_elementwise_affine=False,
+    )
+    transformer.load_state_dict(state_dict, strict=True)
+
+    scheduler = DDIMScheduler(
+        num_train_timesteps=1000,
+        beta_schedule="linear",
+        prediction_type="epsilon",
+        clip_sample=False,
+    )
+
+    vae = AutoencoderKL.from_pretrained(args.vae_model)
+
+    pipeline = DiTPipeline(transformer=transformer, vae=vae, scheduler=scheduler)
+
+    if args.save:
+        pipeline.save_pretrained(args.checkpoint_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--image_size",
+        default=256,
+        type=int,
+        required=False,
+        help="Image size of pretrained model, either 256 or 512.",
+    )
+    parser.add_argument(
+        "--vae_model",
+        default="stabilityai/sd-vae-ft-ema",
+        type=str,
+        required=False,
+        help="Path to pretrained VAE model, either stabilityai/sd-vae-ft-mse or stabilityai/sd-vae-ft-ema.",
+    )
+    parser.add_argument(
+        "--save", default=True, type=bool, required=False, help="Whether to save the converted pipeline or not."
+    )
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the output pipeline."
+    )
+
+    args = parser.parse_args()
+    main(args)
diff --git a/diffusers/scripts/convert_flux_to_diffusers.py b/diffusers/scripts/convert_flux_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..05a1da256d336aaefc8ad630fc17b1a587dd5d09
--- /dev/null
+++ b/diffusers/scripts/convert_flux_to_diffusers.py
@@ -0,0 +1,303 @@
+import argparse
+from contextlib import nullcontext
+
+import safetensors.torch
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import hf_hub_download
+
+from diffusers import AutoencoderKL, FluxTransformer2DModel
+from diffusers.loaders.single_file_utils import convert_ldm_vae_checkpoint
+from diffusers.utils.import_utils import is_accelerate_available
+
+
+"""
+# Transformer
+
+python scripts/convert_flux_to_diffusers.py  \
+--original_state_dict_repo_id "black-forest-labs/FLUX.1-schnell" \
+--filename "flux1-schnell.sft"
+--output_path "flux-schnell" \
+--transformer
+"""
+
+"""
+# VAE
+
+python scripts/convert_flux_to_diffusers.py  \
+--original_state_dict_repo_id "black-forest-labs/FLUX.1-schnell" \
+--filename "ae.sft"
+--output_path "flux-schnell" \
+--vae
+"""
+
+CTX = init_empty_weights if is_accelerate_available else nullcontext
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--original_state_dict_repo_id", default=None, type=str)
+parser.add_argument("--filename", default="flux.safetensors", type=str)
+parser.add_argument("--checkpoint_path", default=None, type=str)
+parser.add_argument("--vae", action="store_true")
+parser.add_argument("--transformer", action="store_true")
+parser.add_argument("--output_path", type=str)
+parser.add_argument("--dtype", type=str, default="bf16")
+
+args = parser.parse_args()
+dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float32
+
+
+def load_original_checkpoint(args):
+    if args.original_state_dict_repo_id is not None:
+        ckpt_path = hf_hub_download(repo_id=args.original_state_dict_repo_id, filename=args.filename)
+    elif args.checkpoint_path is not None:
+        ckpt_path = args.checkpoint_path
+    else:
+        raise ValueError(" please provide either `original_state_dict_repo_id` or a local `checkpoint_path`")
+
+    original_state_dict = safetensors.torch.load_file(ckpt_path)
+    return original_state_dict
+
+
+# in SD3 original implementation of AdaLayerNormContinuous, it split linear projection output into shift, scale;
+# while in diffusers it split into scale, shift. Here we swap the linear projection weights in order to be able to use diffusers implementation
+def swap_scale_shift(weight):
+    shift, scale = weight.chunk(2, dim=0)
+    new_weight = torch.cat([scale, shift], dim=0)
+    return new_weight
+
+
+def convert_flux_transformer_checkpoint_to_diffusers(
+    original_state_dict, num_layers, num_single_layers, inner_dim, mlp_ratio=4.0
+):
+    converted_state_dict = {}
+
+    ## time_text_embed.timestep_embedder <-  time_in
+    converted_state_dict["time_text_embed.timestep_embedder.linear_1.weight"] = original_state_dict.pop(
+        "time_in.in_layer.weight"
+    )
+    converted_state_dict["time_text_embed.timestep_embedder.linear_1.bias"] = original_state_dict.pop(
+        "time_in.in_layer.bias"
+    )
+    converted_state_dict["time_text_embed.timestep_embedder.linear_2.weight"] = original_state_dict.pop(
+        "time_in.out_layer.weight"
+    )
+    converted_state_dict["time_text_embed.timestep_embedder.linear_2.bias"] = original_state_dict.pop(
+        "time_in.out_layer.bias"
+    )
+
+    ## time_text_embed.text_embedder <- vector_in
+    converted_state_dict["time_text_embed.text_embedder.linear_1.weight"] = original_state_dict.pop(
+        "vector_in.in_layer.weight"
+    )
+    converted_state_dict["time_text_embed.text_embedder.linear_1.bias"] = original_state_dict.pop(
+        "vector_in.in_layer.bias"
+    )
+    converted_state_dict["time_text_embed.text_embedder.linear_2.weight"] = original_state_dict.pop(
+        "vector_in.out_layer.weight"
+    )
+    converted_state_dict["time_text_embed.text_embedder.linear_2.bias"] = original_state_dict.pop(
+        "vector_in.out_layer.bias"
+    )
+
+    # guidance
+    has_guidance = any("guidance" in k for k in original_state_dict)
+    if has_guidance:
+        converted_state_dict["time_text_embed.guidance_embedder.linear_1.weight"] = original_state_dict.pop(
+            "guidance_in.in_layer.weight"
+        )
+        converted_state_dict["time_text_embed.guidance_embedder.linear_1.bias"] = original_state_dict.pop(
+            "guidance_in.in_layer.bias"
+        )
+        converted_state_dict["time_text_embed.guidance_embedder.linear_2.weight"] = original_state_dict.pop(
+            "guidance_in.out_layer.weight"
+        )
+        converted_state_dict["time_text_embed.guidance_embedder.linear_2.bias"] = original_state_dict.pop(
+            "guidance_in.out_layer.bias"
+        )
+
+    # context_embedder
+    converted_state_dict["context_embedder.weight"] = original_state_dict.pop("txt_in.weight")
+    converted_state_dict["context_embedder.bias"] = original_state_dict.pop("txt_in.bias")
+
+    # x_embedder
+    converted_state_dict["x_embedder.weight"] = original_state_dict.pop("img_in.weight")
+    converted_state_dict["x_embedder.bias"] = original_state_dict.pop("img_in.bias")
+
+    # double transformer blocks
+    for i in range(num_layers):
+        block_prefix = f"transformer_blocks.{i}."
+        # norms.
+        ## norm1
+        converted_state_dict[f"{block_prefix}norm1.linear.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_mod.lin.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm1.linear.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_mod.lin.bias"
+        )
+        ## norm1_context
+        converted_state_dict[f"{block_prefix}norm1_context.linear.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_mod.lin.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm1_context.linear.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_mod.lin.bias"
+        )
+        # Q, K, V
+        sample_q, sample_k, sample_v = torch.chunk(
+            original_state_dict.pop(f"double_blocks.{i}.img_attn.qkv.weight"), 3, dim=0
+        )
+        context_q, context_k, context_v = torch.chunk(
+            original_state_dict.pop(f"double_blocks.{i}.txt_attn.qkv.weight"), 3, dim=0
+        )
+        sample_q_bias, sample_k_bias, sample_v_bias = torch.chunk(
+            original_state_dict.pop(f"double_blocks.{i}.img_attn.qkv.bias"), 3, dim=0
+        )
+        context_q_bias, context_k_bias, context_v_bias = torch.chunk(
+            original_state_dict.pop(f"double_blocks.{i}.txt_attn.qkv.bias"), 3, dim=0
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([sample_q])
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([sample_q_bias])
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([sample_k])
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([sample_k_bias])
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([sample_v])
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([sample_v_bias])
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.weight"] = torch.cat([context_q])
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.bias"] = torch.cat([context_q_bias])
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.weight"] = torch.cat([context_k])
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.bias"] = torch.cat([context_k_bias])
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.weight"] = torch.cat([context_v])
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.bias"] = torch.cat([context_v_bias])
+        # qk_norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_attn.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_attn.norm.key_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_q.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_attn.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_k.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_attn.norm.key_norm.scale"
+        )
+        # ff img_mlp
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_mlp.0.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_mlp.0.bias"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.2.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_mlp.2.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.2.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_mlp.2.bias"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_mlp.0.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_mlp.0.bias"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_mlp.2.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_mlp.2.bias"
+        )
+        # output projections.
+        converted_state_dict[f"{block_prefix}attn.to_out.0.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_attn.proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_out.0.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_attn.proj.bias"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_attn.proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_attn.proj.bias"
+        )
+
+    # single transfomer blocks
+    for i in range(num_single_layers):
+        block_prefix = f"single_transformer_blocks.{i}."
+        # norm.linear  <- single_blocks.0.modulation.lin
+        converted_state_dict[f"{block_prefix}norm.linear.weight"] = original_state_dict.pop(
+            f"single_blocks.{i}.modulation.lin.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm.linear.bias"] = original_state_dict.pop(
+            f"single_blocks.{i}.modulation.lin.bias"
+        )
+        # Q, K, V, mlp
+        mlp_hidden_dim = int(inner_dim * mlp_ratio)
+        split_size = (inner_dim, inner_dim, inner_dim, mlp_hidden_dim)
+        q, k, v, mlp = torch.split(original_state_dict.pop(f"single_blocks.{i}.linear1.weight"), split_size, dim=0)
+        q_bias, k_bias, v_bias, mlp_bias = torch.split(
+            original_state_dict.pop(f"single_blocks.{i}.linear1.bias"), split_size, dim=0
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([q])
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([q_bias])
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([k])
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([k_bias])
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([v])
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([v_bias])
+        converted_state_dict[f"{block_prefix}proj_mlp.weight"] = torch.cat([mlp])
+        converted_state_dict[f"{block_prefix}proj_mlp.bias"] = torch.cat([mlp_bias])
+        # qk norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = original_state_dict.pop(
+            f"single_blocks.{i}.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = original_state_dict.pop(
+            f"single_blocks.{i}.norm.key_norm.scale"
+        )
+        # output projections.
+        converted_state_dict[f"{block_prefix}proj_out.weight"] = original_state_dict.pop(
+            f"single_blocks.{i}.linear2.weight"
+        )
+        converted_state_dict[f"{block_prefix}proj_out.bias"] = original_state_dict.pop(
+            f"single_blocks.{i}.linear2.bias"
+        )
+
+    converted_state_dict["proj_out.weight"] = original_state_dict.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = original_state_dict.pop("final_layer.linear.bias")
+    converted_state_dict["norm_out.linear.weight"] = swap_scale_shift(
+        original_state_dict.pop("final_layer.adaLN_modulation.1.weight")
+    )
+    converted_state_dict["norm_out.linear.bias"] = swap_scale_shift(
+        original_state_dict.pop("final_layer.adaLN_modulation.1.bias")
+    )
+
+    return converted_state_dict
+
+
+def main(args):
+    original_ckpt = load_original_checkpoint(args)
+    has_guidance = any("guidance" in k for k in original_ckpt)
+
+    if args.transformer:
+        num_layers = 19
+        num_single_layers = 38
+        inner_dim = 3072
+        mlp_ratio = 4.0
+        converted_transformer_state_dict = convert_flux_transformer_checkpoint_to_diffusers(
+            original_ckpt, num_layers, num_single_layers, inner_dim, mlp_ratio=mlp_ratio
+        )
+        transformer = FluxTransformer2DModel(guidance_embeds=has_guidance)
+        transformer.load_state_dict(converted_transformer_state_dict, strict=True)
+
+        print(
+            f"Saving Flux Transformer in Diffusers format. Variant: {'guidance-distilled' if has_guidance else 'timestep-distilled'}"
+        )
+        transformer.to(dtype).save_pretrained(f"{args.output_path}/transformer")
+
+    if args.vae:
+        config = AutoencoderKL.load_config("stabilityai/stable-diffusion-3-medium-diffusers", subfolder="vae")
+        vae = AutoencoderKL.from_config(config, scaling_factor=0.3611, shift_factor=0.1159).to(torch.bfloat16)
+
+        converted_vae_state_dict = convert_ldm_vae_checkpoint(original_ckpt, vae.config)
+        vae.load_state_dict(converted_vae_state_dict, strict=True)
+        vae.to(dtype).save_pretrained(f"{args.output_path}/vae")
+
+
+if __name__ == "__main__":
+    main(args)
diff --git a/diffusers/scripts/convert_gligen_to_diffusers.py b/diffusers/scripts/convert_gligen_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..83c1f928e407f9a9cc93a07a609ae2472c2a7b18
--- /dev/null
+++ b/diffusers/scripts/convert_gligen_to_diffusers.py
@@ -0,0 +1,581 @@
+import argparse
+import re
+
+import torch
+import yaml
+from transformers import (
+    CLIPProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    StableDiffusionGLIGENPipeline,
+    StableDiffusionGLIGENTextImagePipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
+    assign_to_checkpoint,
+    conv_attn_to_linear,
+    protected,
+    renew_attention_paths,
+    renew_resnet_paths,
+    renew_vae_attention_paths,
+    renew_vae_resnet_paths,
+    shave_segments,
+    textenc_conversion_map,
+    textenc_pattern,
+)
+
+
+def convert_open_clip_checkpoint(checkpoint):
+    checkpoint = checkpoint["text_encoder"]
+    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    if "cond_stage_model.model.text_projection" in checkpoint:
+        d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0])
+    else:
+        d_model = 1024
+
+    for key in keys:
+        if "resblocks.23" in key:  # Diffusers drops the final layer and only uses the penultimate layer
+            continue
+        if key in textenc_conversion_map:
+            text_model_dict[textenc_conversion_map[key]] = checkpoint[key]
+        # if key.startswith("cond_stage_model.model.transformer."):
+        new_key = key[len("transformer.") :]
+        if new_key.endswith(".in_proj_weight"):
+            new_key = new_key[: -len(".in_proj_weight")]
+            new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+            text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
+            text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
+            text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
+        elif new_key.endswith(".in_proj_bias"):
+            new_key = new_key[: -len(".in_proj_bias")]
+            new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+            text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
+            text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
+            text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
+        else:
+            if key != "transformer.text_model.embeddings.position_ids":
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+
+                text_model_dict[new_key] = checkpoint[key]
+
+            if key == "transformer.text_model.embeddings.token_embedding.weight":
+                text_model_dict["text_model.embeddings.token_embedding.weight"] = checkpoint[key]
+
+    text_model_dict.pop("text_model.embeddings.transformer.text_model.embeddings.token_embedding.weight")
+
+    text_model.load_state_dict(text_model_dict)
+
+    return text_model
+
+
+def convert_gligen_vae_checkpoint(checkpoint, config):
+    checkpoint = checkpoint["autoencoder"]
+    vae_state_dict = {}
+    vae_key = "first_stage_model."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for key in new_checkpoint.keys():
+        if "encoder.mid_block.attentions.0" in key or "decoder.mid_block.attentions.0" in key:
+            if "query" in key:
+                new_checkpoint[key.replace("query", "to_q")] = new_checkpoint.pop(key)
+            if "value" in key:
+                new_checkpoint[key.replace("value", "to_v")] = new_checkpoint.pop(key)
+            if "key" in key:
+                new_checkpoint[key.replace("key", "to_k")] = new_checkpoint.pop(key)
+            if "proj_attn" in key:
+                new_checkpoint[key.replace("proj_attn", "to_out.0")] = new_checkpoint.pop(key)
+
+    return new_checkpoint
+
+
+def convert_gligen_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
+    unet_state_dict = {}
+    checkpoint = checkpoint["model"]
+    keys = list(checkpoint.keys())
+
+    unet_key = "model.diffusion_model."
+
+    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+        print(f"Checkpoint {path} has bot EMA and non-EMA weights.")
+        print(
+            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+        )
+        for key in keys:
+            if key.startswith("model.diffusion_model"):
+                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+    else:
+        if sum(k.startswith("model_ema") for k in keys) > 100:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+    for key in keys:
+        unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    for key in keys:
+        if "position_net" in key:
+            new_checkpoint[key] = unet_state_dict[key]
+
+    return new_checkpoint
+
+
+def create_vae_config(original_config, image_size: int):
+    vae_params = original_config["autoencoder"]["params"]["ddconfig"]
+    _ = original_config["autoencoder"]["params"]["embed_dim"]
+
+    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    config = {
+        "sample_size": image_size,
+        "in_channels": vae_params["in_channels"],
+        "out_channels": vae_params["out_ch"],
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "latent_channels": vae_params["z_channels"],
+        "layers_per_block": vae_params["num_res_blocks"],
+    }
+
+    return config
+
+
+def create_unet_config(original_config, image_size: int, attention_type):
+    unet_params = original_config["model"]["params"]
+    vae_params = original_config["autoencoder"]["params"]["ddconfig"]
+
+    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)
+
+    head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None
+    use_linear_projection = (
+        unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False
+    )
+    if use_linear_projection:
+        if head_dim is None:
+            head_dim = [5, 10, 20, 20]
+
+    config = {
+        "sample_size": image_size // vae_scale_factor,
+        "in_channels": unet_params["in_channels"],
+        "down_block_types": tuple(down_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params["num_res_blocks"],
+        "cross_attention_dim": unet_params["context_dim"],
+        "attention_head_dim": head_dim,
+        "use_linear_projection": use_linear_projection,
+        "attention_type": attention_type,
+    }
+
+    return config
+
+
+def convert_gligen_to_diffusers(
+    checkpoint_path: str,
+    original_config_file: str,
+    attention_type: str,
+    image_size: int = 512,
+    extract_ema: bool = False,
+    num_in_channels: int = None,
+    device: str = None,
+):
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        checkpoint = torch.load(checkpoint_path, map_location=device)
+    else:
+        checkpoint = torch.load(checkpoint_path, map_location=device)
+
+    if "global_step" in checkpoint:
+        checkpoint["global_step"]
+    else:
+        print("global_step key not found in model")
+
+    original_config = yaml.safe_load(original_config_file)
+
+    if num_in_channels is not None:
+        original_config["model"]["params"]["in_channels"] = num_in_channels
+
+    num_train_timesteps = original_config["diffusion"]["params"]["timesteps"]
+    beta_start = original_config["diffusion"]["params"]["linear_start"]
+    beta_end = original_config["diffusion"]["params"]["linear_end"]
+
+    scheduler = DDIMScheduler(
+        beta_end=beta_end,
+        beta_schedule="scaled_linear",
+        beta_start=beta_start,
+        num_train_timesteps=num_train_timesteps,
+        steps_offset=1,
+        clip_sample=False,
+        set_alpha_to_one=False,
+        prediction_type="epsilon",
+    )
+
+    # Convert the UNet2DConditionalModel model
+    unet_config = create_unet_config(original_config, image_size, attention_type)
+    unet = UNet2DConditionModel(**unet_config)
+
+    converted_unet_checkpoint = convert_gligen_unet_checkpoint(
+        checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
+    )
+
+    unet.load_state_dict(converted_unet_checkpoint)
+
+    # Convert the VAE model
+    vae_config = create_vae_config(original_config, image_size)
+    converted_vae_checkpoint = convert_gligen_vae_checkpoint(checkpoint, vae_config)
+
+    vae = AutoencoderKL(**vae_config)
+    vae.load_state_dict(converted_vae_checkpoint)
+
+    # Convert the text model
+    text_encoder = convert_open_clip_checkpoint(checkpoint)
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+
+    if attention_type == "gated-text-image":
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+        processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+        pipe = StableDiffusionGLIGENTextImagePipeline(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            image_encoder=image_encoder,
+            processor=processor,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=None,
+            feature_extractor=None,
+        )
+    elif attention_type == "gated":
+        pipe = StableDiffusionGLIGENPipeline(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=None,
+            feature_extractor=None,
+        )
+
+    return pipe
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--original_config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The YAML config file corresponding to the gligen architecture.",
+    )
+    parser.add_argument(
+        "--num_in_channels",
+        default=None,
+        type=int,
+        help="The number of input channels. If `None` number of input channels will be automatically inferred.",
+    )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
+    parser.add_argument(
+        "--attention_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Type of attention ex: gated or gated-text-image",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--device", type=str, help="Device to use.")
+    parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
+
+    args = parser.parse_args()
+
+    pipe = convert_gligen_to_diffusers(
+        checkpoint_path=args.checkpoint_path,
+        original_config_file=args.original_config_file,
+        attention_type=args.attention_type,
+        extract_ema=args.extract_ema,
+        num_in_channels=args.num_in_channels,
+        device=args.device,
+    )
+
+    if args.half:
+        pipe.to(dtype=torch.float16)
+
+    pipe.save_pretrained(args.dump_path)
diff --git a/diffusers/scripts/convert_hunyuandit_controlnet_to_diffusers.py b/diffusers/scripts/convert_hunyuandit_controlnet_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c838369089077c93301f42acbc5e17ee5d64bf9
--- /dev/null
+++ b/diffusers/scripts/convert_hunyuandit_controlnet_to_diffusers.py
@@ -0,0 +1,241 @@
+import argparse
+
+import torch
+
+from diffusers import HunyuanDiT2DControlNetModel
+
+
+def main(args):
+    state_dict = torch.load(args.pt_checkpoint_path, map_location="cpu")
+
+    if args.load_key != "none":
+        try:
+            state_dict = state_dict[args.load_key]
+        except KeyError:
+            raise KeyError(
+                f"{args.load_key} not found in the checkpoint."
+                "Please load from the following keys:{state_dict.keys()}"
+            )
+    device = "cuda"
+
+    model_config = HunyuanDiT2DControlNetModel.load_config(
+        "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers", subfolder="transformer"
+    )
+    model_config[
+        "use_style_cond_and_image_meta_size"
+    ] = args.use_style_cond_and_image_meta_size  ### version <= v1.1: True; version >= v1.2: False
+    print(model_config)
+
+    for key in state_dict:
+        print("local:", key)
+
+    model = HunyuanDiT2DControlNetModel.from_config(model_config).to(device)
+
+    for key in model.state_dict():
+        print("diffusers:", key)
+
+    num_layers = 19
+    for i in range(num_layers):
+        # attn1
+        # Wkqv -> to_q, to_k, to_v
+        q, k, v = torch.chunk(state_dict[f"blocks.{i}.attn1.Wqkv.weight"], 3, dim=0)
+        q_bias, k_bias, v_bias = torch.chunk(state_dict[f"blocks.{i}.attn1.Wqkv.bias"], 3, dim=0)
+        state_dict[f"blocks.{i}.attn1.to_q.weight"] = q
+        state_dict[f"blocks.{i}.attn1.to_q.bias"] = q_bias
+        state_dict[f"blocks.{i}.attn1.to_k.weight"] = k
+        state_dict[f"blocks.{i}.attn1.to_k.bias"] = k_bias
+        state_dict[f"blocks.{i}.attn1.to_v.weight"] = v
+        state_dict[f"blocks.{i}.attn1.to_v.bias"] = v_bias
+        state_dict.pop(f"blocks.{i}.attn1.Wqkv.weight")
+        state_dict.pop(f"blocks.{i}.attn1.Wqkv.bias")
+
+        # q_norm, k_norm -> norm_q, norm_k
+        state_dict[f"blocks.{i}.attn1.norm_q.weight"] = state_dict[f"blocks.{i}.attn1.q_norm.weight"]
+        state_dict[f"blocks.{i}.attn1.norm_q.bias"] = state_dict[f"blocks.{i}.attn1.q_norm.bias"]
+        state_dict[f"blocks.{i}.attn1.norm_k.weight"] = state_dict[f"blocks.{i}.attn1.k_norm.weight"]
+        state_dict[f"blocks.{i}.attn1.norm_k.bias"] = state_dict[f"blocks.{i}.attn1.k_norm.bias"]
+
+        state_dict.pop(f"blocks.{i}.attn1.q_norm.weight")
+        state_dict.pop(f"blocks.{i}.attn1.q_norm.bias")
+        state_dict.pop(f"blocks.{i}.attn1.k_norm.weight")
+        state_dict.pop(f"blocks.{i}.attn1.k_norm.bias")
+
+        # out_proj -> to_out
+        state_dict[f"blocks.{i}.attn1.to_out.0.weight"] = state_dict[f"blocks.{i}.attn1.out_proj.weight"]
+        state_dict[f"blocks.{i}.attn1.to_out.0.bias"] = state_dict[f"blocks.{i}.attn1.out_proj.bias"]
+        state_dict.pop(f"blocks.{i}.attn1.out_proj.weight")
+        state_dict.pop(f"blocks.{i}.attn1.out_proj.bias")
+
+        # attn2
+        # kq_proj -> to_k, to_v
+        k, v = torch.chunk(state_dict[f"blocks.{i}.attn2.kv_proj.weight"], 2, dim=0)
+        k_bias, v_bias = torch.chunk(state_dict[f"blocks.{i}.attn2.kv_proj.bias"], 2, dim=0)
+        state_dict[f"blocks.{i}.attn2.to_k.weight"] = k
+        state_dict[f"blocks.{i}.attn2.to_k.bias"] = k_bias
+        state_dict[f"blocks.{i}.attn2.to_v.weight"] = v
+        state_dict[f"blocks.{i}.attn2.to_v.bias"] = v_bias
+        state_dict.pop(f"blocks.{i}.attn2.kv_proj.weight")
+        state_dict.pop(f"blocks.{i}.attn2.kv_proj.bias")
+
+        # q_proj -> to_q
+        state_dict[f"blocks.{i}.attn2.to_q.weight"] = state_dict[f"blocks.{i}.attn2.q_proj.weight"]
+        state_dict[f"blocks.{i}.attn2.to_q.bias"] = state_dict[f"blocks.{i}.attn2.q_proj.bias"]
+        state_dict.pop(f"blocks.{i}.attn2.q_proj.weight")
+        state_dict.pop(f"blocks.{i}.attn2.q_proj.bias")
+
+        # q_norm, k_norm -> norm_q, norm_k
+        state_dict[f"blocks.{i}.attn2.norm_q.weight"] = state_dict[f"blocks.{i}.attn2.q_norm.weight"]
+        state_dict[f"blocks.{i}.attn2.norm_q.bias"] = state_dict[f"blocks.{i}.attn2.q_norm.bias"]
+        state_dict[f"blocks.{i}.attn2.norm_k.weight"] = state_dict[f"blocks.{i}.attn2.k_norm.weight"]
+        state_dict[f"blocks.{i}.attn2.norm_k.bias"] = state_dict[f"blocks.{i}.attn2.k_norm.bias"]
+
+        state_dict.pop(f"blocks.{i}.attn2.q_norm.weight")
+        state_dict.pop(f"blocks.{i}.attn2.q_norm.bias")
+        state_dict.pop(f"blocks.{i}.attn2.k_norm.weight")
+        state_dict.pop(f"blocks.{i}.attn2.k_norm.bias")
+
+        # out_proj -> to_out
+        state_dict[f"blocks.{i}.attn2.to_out.0.weight"] = state_dict[f"blocks.{i}.attn2.out_proj.weight"]
+        state_dict[f"blocks.{i}.attn2.to_out.0.bias"] = state_dict[f"blocks.{i}.attn2.out_proj.bias"]
+        state_dict.pop(f"blocks.{i}.attn2.out_proj.weight")
+        state_dict.pop(f"blocks.{i}.attn2.out_proj.bias")
+
+        # switch norm 2 and norm 3
+        norm2_weight = state_dict[f"blocks.{i}.norm2.weight"]
+        norm2_bias = state_dict[f"blocks.{i}.norm2.bias"]
+        state_dict[f"blocks.{i}.norm2.weight"] = state_dict[f"blocks.{i}.norm3.weight"]
+        state_dict[f"blocks.{i}.norm2.bias"] = state_dict[f"blocks.{i}.norm3.bias"]
+        state_dict[f"blocks.{i}.norm3.weight"] = norm2_weight
+        state_dict[f"blocks.{i}.norm3.bias"] = norm2_bias
+
+        # norm1 -> norm1.norm
+        # default_modulation.1 -> norm1.linear
+        state_dict[f"blocks.{i}.norm1.norm.weight"] = state_dict[f"blocks.{i}.norm1.weight"]
+        state_dict[f"blocks.{i}.norm1.norm.bias"] = state_dict[f"blocks.{i}.norm1.bias"]
+        state_dict[f"blocks.{i}.norm1.linear.weight"] = state_dict[f"blocks.{i}.default_modulation.1.weight"]
+        state_dict[f"blocks.{i}.norm1.linear.bias"] = state_dict[f"blocks.{i}.default_modulation.1.bias"]
+        state_dict.pop(f"blocks.{i}.norm1.weight")
+        state_dict.pop(f"blocks.{i}.norm1.bias")
+        state_dict.pop(f"blocks.{i}.default_modulation.1.weight")
+        state_dict.pop(f"blocks.{i}.default_modulation.1.bias")
+
+        # mlp.fc1 -> ff.net.0, mlp.fc2 -> ff.net.2
+        state_dict[f"blocks.{i}.ff.net.0.proj.weight"] = state_dict[f"blocks.{i}.mlp.fc1.weight"]
+        state_dict[f"blocks.{i}.ff.net.0.proj.bias"] = state_dict[f"blocks.{i}.mlp.fc1.bias"]
+        state_dict[f"blocks.{i}.ff.net.2.weight"] = state_dict[f"blocks.{i}.mlp.fc2.weight"]
+        state_dict[f"blocks.{i}.ff.net.2.bias"] = state_dict[f"blocks.{i}.mlp.fc2.bias"]
+        state_dict.pop(f"blocks.{i}.mlp.fc1.weight")
+        state_dict.pop(f"blocks.{i}.mlp.fc1.bias")
+        state_dict.pop(f"blocks.{i}.mlp.fc2.weight")
+        state_dict.pop(f"blocks.{i}.mlp.fc2.bias")
+
+        # after_proj_list -> controlnet_blocks
+        state_dict[f"controlnet_blocks.{i}.weight"] = state_dict[f"after_proj_list.{i}.weight"]
+        state_dict[f"controlnet_blocks.{i}.bias"] = state_dict[f"after_proj_list.{i}.bias"]
+        state_dict.pop(f"after_proj_list.{i}.weight")
+        state_dict.pop(f"after_proj_list.{i}.bias")
+
+    # before_proj -> input_block
+    state_dict["input_block.weight"] = state_dict["before_proj.weight"]
+    state_dict["input_block.bias"] = state_dict["before_proj.bias"]
+    state_dict.pop("before_proj.weight")
+    state_dict.pop("before_proj.bias")
+
+    # pooler -> time_extra_emb
+    state_dict["time_extra_emb.pooler.positional_embedding"] = state_dict["pooler.positional_embedding"]
+    state_dict["time_extra_emb.pooler.k_proj.weight"] = state_dict["pooler.k_proj.weight"]
+    state_dict["time_extra_emb.pooler.k_proj.bias"] = state_dict["pooler.k_proj.bias"]
+    state_dict["time_extra_emb.pooler.q_proj.weight"] = state_dict["pooler.q_proj.weight"]
+    state_dict["time_extra_emb.pooler.q_proj.bias"] = state_dict["pooler.q_proj.bias"]
+    state_dict["time_extra_emb.pooler.v_proj.weight"] = state_dict["pooler.v_proj.weight"]
+    state_dict["time_extra_emb.pooler.v_proj.bias"] = state_dict["pooler.v_proj.bias"]
+    state_dict["time_extra_emb.pooler.c_proj.weight"] = state_dict["pooler.c_proj.weight"]
+    state_dict["time_extra_emb.pooler.c_proj.bias"] = state_dict["pooler.c_proj.bias"]
+    state_dict.pop("pooler.k_proj.weight")
+    state_dict.pop("pooler.k_proj.bias")
+    state_dict.pop("pooler.q_proj.weight")
+    state_dict.pop("pooler.q_proj.bias")
+    state_dict.pop("pooler.v_proj.weight")
+    state_dict.pop("pooler.v_proj.bias")
+    state_dict.pop("pooler.c_proj.weight")
+    state_dict.pop("pooler.c_proj.bias")
+    state_dict.pop("pooler.positional_embedding")
+
+    # t_embedder -> time_embedding (`TimestepEmbedding`)
+    state_dict["time_extra_emb.timestep_embedder.linear_1.bias"] = state_dict["t_embedder.mlp.0.bias"]
+    state_dict["time_extra_emb.timestep_embedder.linear_1.weight"] = state_dict["t_embedder.mlp.0.weight"]
+    state_dict["time_extra_emb.timestep_embedder.linear_2.bias"] = state_dict["t_embedder.mlp.2.bias"]
+    state_dict["time_extra_emb.timestep_embedder.linear_2.weight"] = state_dict["t_embedder.mlp.2.weight"]
+
+    state_dict.pop("t_embedder.mlp.0.bias")
+    state_dict.pop("t_embedder.mlp.0.weight")
+    state_dict.pop("t_embedder.mlp.2.bias")
+    state_dict.pop("t_embedder.mlp.2.weight")
+
+    # x_embedder -> pos_embd (`PatchEmbed`)
+    state_dict["pos_embed.proj.weight"] = state_dict["x_embedder.proj.weight"]
+    state_dict["pos_embed.proj.bias"] = state_dict["x_embedder.proj.bias"]
+    state_dict.pop("x_embedder.proj.weight")
+    state_dict.pop("x_embedder.proj.bias")
+
+    # mlp_t5 -> text_embedder
+    state_dict["text_embedder.linear_1.bias"] = state_dict["mlp_t5.0.bias"]
+    state_dict["text_embedder.linear_1.weight"] = state_dict["mlp_t5.0.weight"]
+    state_dict["text_embedder.linear_2.bias"] = state_dict["mlp_t5.2.bias"]
+    state_dict["text_embedder.linear_2.weight"] = state_dict["mlp_t5.2.weight"]
+    state_dict.pop("mlp_t5.0.bias")
+    state_dict.pop("mlp_t5.0.weight")
+    state_dict.pop("mlp_t5.2.bias")
+    state_dict.pop("mlp_t5.2.weight")
+
+    # extra_embedder -> extra_embedder
+    state_dict["time_extra_emb.extra_embedder.linear_1.bias"] = state_dict["extra_embedder.0.bias"]
+    state_dict["time_extra_emb.extra_embedder.linear_1.weight"] = state_dict["extra_embedder.0.weight"]
+    state_dict["time_extra_emb.extra_embedder.linear_2.bias"] = state_dict["extra_embedder.2.bias"]
+    state_dict["time_extra_emb.extra_embedder.linear_2.weight"] = state_dict["extra_embedder.2.weight"]
+    state_dict.pop("extra_embedder.0.bias")
+    state_dict.pop("extra_embedder.0.weight")
+    state_dict.pop("extra_embedder.2.bias")
+    state_dict.pop("extra_embedder.2.weight")
+
+    # style_embedder
+    if model_config["use_style_cond_and_image_meta_size"]:
+        print(state_dict["style_embedder.weight"])
+        print(state_dict["style_embedder.weight"].shape)
+        state_dict["time_extra_emb.style_embedder.weight"] = state_dict["style_embedder.weight"][0:1]
+        state_dict.pop("style_embedder.weight")
+
+    model.load_state_dict(state_dict)
+
+    if args.save:
+        model.save_pretrained(args.output_checkpoint_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--save", default=True, type=bool, required=False, help="Whether to save the converted pipeline or not."
+    )
+    parser.add_argument(
+        "--pt_checkpoint_path", default=None, type=str, required=True, help="Path to the .pt pretrained model."
+    )
+    parser.add_argument(
+        "--output_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to the output converted diffusers pipeline.",
+    )
+    parser.add_argument(
+        "--load_key", default="none", type=str, required=False, help="The key to load from the pretrained .pt file"
+    )
+    parser.add_argument(
+        "--use_style_cond_and_image_meta_size",
+        type=bool,
+        default=False,
+        help="version <= v1.1: True; version >= v1.2: False",
+    )
+
+    args = parser.parse_args()
+    main(args)
diff --git a/diffusers/scripts/convert_hunyuandit_to_diffusers.py b/diffusers/scripts/convert_hunyuandit_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..da3af8333ee3e34f4b8af8b3f58e38e8dec5925c
--- /dev/null
+++ b/diffusers/scripts/convert_hunyuandit_to_diffusers.py
@@ -0,0 +1,267 @@
+import argparse
+
+import torch
+
+from diffusers import HunyuanDiT2DModel
+
+
+def main(args):
+    state_dict = torch.load(args.pt_checkpoint_path, map_location="cpu")
+
+    if args.load_key != "none":
+        try:
+            state_dict = state_dict[args.load_key]
+        except KeyError:
+            raise KeyError(
+                f"{args.load_key} not found in the checkpoint."
+                f"Please load from the following keys:{state_dict.keys()}"
+            )
+
+    device = "cuda"
+    model_config = HunyuanDiT2DModel.load_config("Tencent-Hunyuan/HunyuanDiT-Diffusers", subfolder="transformer")
+    model_config[
+        "use_style_cond_and_image_meta_size"
+    ] = args.use_style_cond_and_image_meta_size  ### version <= v1.1: True; version >= v1.2: False
+
+    # input_size -> sample_size, text_dim -> cross_attention_dim
+    for key in state_dict:
+        print("local:", key)
+
+    model = HunyuanDiT2DModel.from_config(model_config).to(device)
+
+    for key in model.state_dict():
+        print("diffusers:", key)
+
+    num_layers = 40
+    for i in range(num_layers):
+        # attn1
+        # Wkqv -> to_q, to_k, to_v
+        q, k, v = torch.chunk(state_dict[f"blocks.{i}.attn1.Wqkv.weight"], 3, dim=0)
+        q_bias, k_bias, v_bias = torch.chunk(state_dict[f"blocks.{i}.attn1.Wqkv.bias"], 3, dim=0)
+        state_dict[f"blocks.{i}.attn1.to_q.weight"] = q
+        state_dict[f"blocks.{i}.attn1.to_q.bias"] = q_bias
+        state_dict[f"blocks.{i}.attn1.to_k.weight"] = k
+        state_dict[f"blocks.{i}.attn1.to_k.bias"] = k_bias
+        state_dict[f"blocks.{i}.attn1.to_v.weight"] = v
+        state_dict[f"blocks.{i}.attn1.to_v.bias"] = v_bias
+        state_dict.pop(f"blocks.{i}.attn1.Wqkv.weight")
+        state_dict.pop(f"blocks.{i}.attn1.Wqkv.bias")
+
+        # q_norm, k_norm -> norm_q, norm_k
+        state_dict[f"blocks.{i}.attn1.norm_q.weight"] = state_dict[f"blocks.{i}.attn1.q_norm.weight"]
+        state_dict[f"blocks.{i}.attn1.norm_q.bias"] = state_dict[f"blocks.{i}.attn1.q_norm.bias"]
+        state_dict[f"blocks.{i}.attn1.norm_k.weight"] = state_dict[f"blocks.{i}.attn1.k_norm.weight"]
+        state_dict[f"blocks.{i}.attn1.norm_k.bias"] = state_dict[f"blocks.{i}.attn1.k_norm.bias"]
+
+        state_dict.pop(f"blocks.{i}.attn1.q_norm.weight")
+        state_dict.pop(f"blocks.{i}.attn1.q_norm.bias")
+        state_dict.pop(f"blocks.{i}.attn1.k_norm.weight")
+        state_dict.pop(f"blocks.{i}.attn1.k_norm.bias")
+
+        # out_proj -> to_out
+        state_dict[f"blocks.{i}.attn1.to_out.0.weight"] = state_dict[f"blocks.{i}.attn1.out_proj.weight"]
+        state_dict[f"blocks.{i}.attn1.to_out.0.bias"] = state_dict[f"blocks.{i}.attn1.out_proj.bias"]
+        state_dict.pop(f"blocks.{i}.attn1.out_proj.weight")
+        state_dict.pop(f"blocks.{i}.attn1.out_proj.bias")
+
+        # attn2
+        # kq_proj -> to_k, to_v
+        k, v = torch.chunk(state_dict[f"blocks.{i}.attn2.kv_proj.weight"], 2, dim=0)
+        k_bias, v_bias = torch.chunk(state_dict[f"blocks.{i}.attn2.kv_proj.bias"], 2, dim=0)
+        state_dict[f"blocks.{i}.attn2.to_k.weight"] = k
+        state_dict[f"blocks.{i}.attn2.to_k.bias"] = k_bias
+        state_dict[f"blocks.{i}.attn2.to_v.weight"] = v
+        state_dict[f"blocks.{i}.attn2.to_v.bias"] = v_bias
+        state_dict.pop(f"blocks.{i}.attn2.kv_proj.weight")
+        state_dict.pop(f"blocks.{i}.attn2.kv_proj.bias")
+
+        # q_proj -> to_q
+        state_dict[f"blocks.{i}.attn2.to_q.weight"] = state_dict[f"blocks.{i}.attn2.q_proj.weight"]
+        state_dict[f"blocks.{i}.attn2.to_q.bias"] = state_dict[f"blocks.{i}.attn2.q_proj.bias"]
+        state_dict.pop(f"blocks.{i}.attn2.q_proj.weight")
+        state_dict.pop(f"blocks.{i}.attn2.q_proj.bias")
+
+        # q_norm, k_norm -> norm_q, norm_k
+        state_dict[f"blocks.{i}.attn2.norm_q.weight"] = state_dict[f"blocks.{i}.attn2.q_norm.weight"]
+        state_dict[f"blocks.{i}.attn2.norm_q.bias"] = state_dict[f"blocks.{i}.attn2.q_norm.bias"]
+        state_dict[f"blocks.{i}.attn2.norm_k.weight"] = state_dict[f"blocks.{i}.attn2.k_norm.weight"]
+        state_dict[f"blocks.{i}.attn2.norm_k.bias"] = state_dict[f"blocks.{i}.attn2.k_norm.bias"]
+
+        state_dict.pop(f"blocks.{i}.attn2.q_norm.weight")
+        state_dict.pop(f"blocks.{i}.attn2.q_norm.bias")
+        state_dict.pop(f"blocks.{i}.attn2.k_norm.weight")
+        state_dict.pop(f"blocks.{i}.attn2.k_norm.bias")
+
+        # out_proj -> to_out
+        state_dict[f"blocks.{i}.attn2.to_out.0.weight"] = state_dict[f"blocks.{i}.attn2.out_proj.weight"]
+        state_dict[f"blocks.{i}.attn2.to_out.0.bias"] = state_dict[f"blocks.{i}.attn2.out_proj.bias"]
+        state_dict.pop(f"blocks.{i}.attn2.out_proj.weight")
+        state_dict.pop(f"blocks.{i}.attn2.out_proj.bias")
+
+        # switch norm 2 and norm 3
+        norm2_weight = state_dict[f"blocks.{i}.norm2.weight"]
+        norm2_bias = state_dict[f"blocks.{i}.norm2.bias"]
+        state_dict[f"blocks.{i}.norm2.weight"] = state_dict[f"blocks.{i}.norm3.weight"]
+        state_dict[f"blocks.{i}.norm2.bias"] = state_dict[f"blocks.{i}.norm3.bias"]
+        state_dict[f"blocks.{i}.norm3.weight"] = norm2_weight
+        state_dict[f"blocks.{i}.norm3.bias"] = norm2_bias
+
+        # norm1 -> norm1.norm
+        # default_modulation.1 -> norm1.linear
+        state_dict[f"blocks.{i}.norm1.norm.weight"] = state_dict[f"blocks.{i}.norm1.weight"]
+        state_dict[f"blocks.{i}.norm1.norm.bias"] = state_dict[f"blocks.{i}.norm1.bias"]
+        state_dict[f"blocks.{i}.norm1.linear.weight"] = state_dict[f"blocks.{i}.default_modulation.1.weight"]
+        state_dict[f"blocks.{i}.norm1.linear.bias"] = state_dict[f"blocks.{i}.default_modulation.1.bias"]
+        state_dict.pop(f"blocks.{i}.norm1.weight")
+        state_dict.pop(f"blocks.{i}.norm1.bias")
+        state_dict.pop(f"blocks.{i}.default_modulation.1.weight")
+        state_dict.pop(f"blocks.{i}.default_modulation.1.bias")
+
+        # mlp.fc1 -> ff.net.0, mlp.fc2 -> ff.net.2
+        state_dict[f"blocks.{i}.ff.net.0.proj.weight"] = state_dict[f"blocks.{i}.mlp.fc1.weight"]
+        state_dict[f"blocks.{i}.ff.net.0.proj.bias"] = state_dict[f"blocks.{i}.mlp.fc1.bias"]
+        state_dict[f"blocks.{i}.ff.net.2.weight"] = state_dict[f"blocks.{i}.mlp.fc2.weight"]
+        state_dict[f"blocks.{i}.ff.net.2.bias"] = state_dict[f"blocks.{i}.mlp.fc2.bias"]
+        state_dict.pop(f"blocks.{i}.mlp.fc1.weight")
+        state_dict.pop(f"blocks.{i}.mlp.fc1.bias")
+        state_dict.pop(f"blocks.{i}.mlp.fc2.weight")
+        state_dict.pop(f"blocks.{i}.mlp.fc2.bias")
+
+    # pooler -> time_extra_emb
+    state_dict["time_extra_emb.pooler.positional_embedding"] = state_dict["pooler.positional_embedding"]
+    state_dict["time_extra_emb.pooler.k_proj.weight"] = state_dict["pooler.k_proj.weight"]
+    state_dict["time_extra_emb.pooler.k_proj.bias"] = state_dict["pooler.k_proj.bias"]
+    state_dict["time_extra_emb.pooler.q_proj.weight"] = state_dict["pooler.q_proj.weight"]
+    state_dict["time_extra_emb.pooler.q_proj.bias"] = state_dict["pooler.q_proj.bias"]
+    state_dict["time_extra_emb.pooler.v_proj.weight"] = state_dict["pooler.v_proj.weight"]
+    state_dict["time_extra_emb.pooler.v_proj.bias"] = state_dict["pooler.v_proj.bias"]
+    state_dict["time_extra_emb.pooler.c_proj.weight"] = state_dict["pooler.c_proj.weight"]
+    state_dict["time_extra_emb.pooler.c_proj.bias"] = state_dict["pooler.c_proj.bias"]
+    state_dict.pop("pooler.k_proj.weight")
+    state_dict.pop("pooler.k_proj.bias")
+    state_dict.pop("pooler.q_proj.weight")
+    state_dict.pop("pooler.q_proj.bias")
+    state_dict.pop("pooler.v_proj.weight")
+    state_dict.pop("pooler.v_proj.bias")
+    state_dict.pop("pooler.c_proj.weight")
+    state_dict.pop("pooler.c_proj.bias")
+    state_dict.pop("pooler.positional_embedding")
+
+    # t_embedder -> time_embedding (`TimestepEmbedding`)
+    state_dict["time_extra_emb.timestep_embedder.linear_1.bias"] = state_dict["t_embedder.mlp.0.bias"]
+    state_dict["time_extra_emb.timestep_embedder.linear_1.weight"] = state_dict["t_embedder.mlp.0.weight"]
+    state_dict["time_extra_emb.timestep_embedder.linear_2.bias"] = state_dict["t_embedder.mlp.2.bias"]
+    state_dict["time_extra_emb.timestep_embedder.linear_2.weight"] = state_dict["t_embedder.mlp.2.weight"]
+
+    state_dict.pop("t_embedder.mlp.0.bias")
+    state_dict.pop("t_embedder.mlp.0.weight")
+    state_dict.pop("t_embedder.mlp.2.bias")
+    state_dict.pop("t_embedder.mlp.2.weight")
+
+    # x_embedder -> pos_embd (`PatchEmbed`)
+    state_dict["pos_embed.proj.weight"] = state_dict["x_embedder.proj.weight"]
+    state_dict["pos_embed.proj.bias"] = state_dict["x_embedder.proj.bias"]
+    state_dict.pop("x_embedder.proj.weight")
+    state_dict.pop("x_embedder.proj.bias")
+
+    # mlp_t5 -> text_embedder
+    state_dict["text_embedder.linear_1.bias"] = state_dict["mlp_t5.0.bias"]
+    state_dict["text_embedder.linear_1.weight"] = state_dict["mlp_t5.0.weight"]
+    state_dict["text_embedder.linear_2.bias"] = state_dict["mlp_t5.2.bias"]
+    state_dict["text_embedder.linear_2.weight"] = state_dict["mlp_t5.2.weight"]
+    state_dict.pop("mlp_t5.0.bias")
+    state_dict.pop("mlp_t5.0.weight")
+    state_dict.pop("mlp_t5.2.bias")
+    state_dict.pop("mlp_t5.2.weight")
+
+    # extra_embedder -> extra_embedder
+    state_dict["time_extra_emb.extra_embedder.linear_1.bias"] = state_dict["extra_embedder.0.bias"]
+    state_dict["time_extra_emb.extra_embedder.linear_1.weight"] = state_dict["extra_embedder.0.weight"]
+    state_dict["time_extra_emb.extra_embedder.linear_2.bias"] = state_dict["extra_embedder.2.bias"]
+    state_dict["time_extra_emb.extra_embedder.linear_2.weight"] = state_dict["extra_embedder.2.weight"]
+    state_dict.pop("extra_embedder.0.bias")
+    state_dict.pop("extra_embedder.0.weight")
+    state_dict.pop("extra_embedder.2.bias")
+    state_dict.pop("extra_embedder.2.weight")
+
+    # model.final_adaLN_modulation.1 -> norm_out.linear
+    def swap_scale_shift(weight):
+        shift, scale = weight.chunk(2, dim=0)
+        new_weight = torch.cat([scale, shift], dim=0)
+        return new_weight
+
+    state_dict["norm_out.linear.weight"] = swap_scale_shift(state_dict["final_layer.adaLN_modulation.1.weight"])
+    state_dict["norm_out.linear.bias"] = swap_scale_shift(state_dict["final_layer.adaLN_modulation.1.bias"])
+    state_dict.pop("final_layer.adaLN_modulation.1.weight")
+    state_dict.pop("final_layer.adaLN_modulation.1.bias")
+
+    # final_linear -> proj_out
+    state_dict["proj_out.weight"] = state_dict["final_layer.linear.weight"]
+    state_dict["proj_out.bias"] = state_dict["final_layer.linear.bias"]
+    state_dict.pop("final_layer.linear.weight")
+    state_dict.pop("final_layer.linear.bias")
+
+    # style_embedder
+    if model_config["use_style_cond_and_image_meta_size"]:
+        print(state_dict["style_embedder.weight"])
+        print(state_dict["style_embedder.weight"].shape)
+        state_dict["time_extra_emb.style_embedder.weight"] = state_dict["style_embedder.weight"][0:1]
+        state_dict.pop("style_embedder.weight")
+
+    model.load_state_dict(state_dict)
+
+    from diffusers import HunyuanDiTPipeline
+
+    if args.use_style_cond_and_image_meta_size:
+        pipe = HunyuanDiTPipeline.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-Diffusers", transformer=model, torch_dtype=torch.float32
+        )
+    else:
+        pipe = HunyuanDiTPipeline.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers", transformer=model, torch_dtype=torch.float32
+        )
+    pipe.to("cuda")
+    pipe.to(dtype=torch.float32)
+
+    if args.save:
+        pipe.save_pretrained(args.output_checkpoint_path)
+
+    # ### NOTE: HunyuanDiT supports both Chinese and English inputs
+    prompt = "一个宇航员在骑马"
+    # prompt = "An astronaut riding a horse"
+    generator = torch.Generator(device="cuda").manual_seed(0)
+    image = pipe(
+        height=1024, width=1024, prompt=prompt, generator=generator, num_inference_steps=25, guidance_scale=5.0
+    ).images[0]
+
+    image.save("img.png")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--save", default=True, type=bool, required=False, help="Whether to save the converted pipeline or not."
+    )
+    parser.add_argument(
+        "--pt_checkpoint_path", default=None, type=str, required=True, help="Path to the .pt pretrained model."
+    )
+    parser.add_argument(
+        "--output_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to the output converted diffusers pipeline.",
+    )
+    parser.add_argument(
+        "--load_key", default="none", type=str, required=False, help="The key to load from the pretrained .pt file"
+    )
+    parser.add_argument(
+        "--use_style_cond_and_image_meta_size",
+        type=bool,
+        default=False,
+        help="version <= v1.1: True; version >= v1.2: False",
+    )
+
+    args = parser.parse_args()
+    main(args)
diff --git a/diffusers/scripts/convert_i2vgen_to_diffusers.py b/diffusers/scripts/convert_i2vgen_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9e3ff2cd35cb3334f7228a1fd5cc4e713b50a6c
--- /dev/null
+++ b/diffusers/scripts/convert_i2vgen_to_diffusers.py
@@ -0,0 +1,510 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the LDM checkpoints."""
+
+import argparse
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers import DDIMScheduler, I2VGenXLPipeline, I2VGenXLUNet, StableDiffusionPipeline
+
+
+CLIP_ID = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        weight = old_checkpoint[path["old"]]
+        names = ["proj_attn.weight"]
+        names_2 = ["proj_out.weight", "proj_in.weight"]
+        if any(k in new_path for k in names):
+            checkpoint[new_path] = weight[:, :, 0]
+        elif any(k in new_path for k in names_2) and len(weight.shape) > 2 and ".attentions." not in new_path:
+            checkpoint[new_path] = weight[:, :, 0]
+        else:
+            checkpoint[new_path] = weight
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_temp_conv_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        mapping.append({"old": old_item, "new": old_item})
+
+    return mapping
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        if "temopral_conv" not in old_item:
+            mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    unet_key = "model.diffusion_model."
+
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        print(
+            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+        )
+        for key in keys:
+            if key.startswith("model.diffusion_model"):
+                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+    else:
+        if sum(k.startswith("model_ema") for k in keys) > 100:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+
+        for key in keys:
+            unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    additional_embedding_substrings = [
+        "local_image_concat",
+        "context_embedding",
+        "local_image_embedding",
+        "fps_embedding",
+    ]
+    for k in unet_state_dict:
+        if any(substring in k for substring in additional_embedding_substrings):
+            diffusers_key = k.replace("local_image_concat", "image_latents_proj_in").replace(
+                "local_image_embedding", "image_latents_context_embedding"
+            )
+            new_checkpoint[diffusers_key] = unet_state_dict[k]
+
+    # temporal encoder.
+    new_checkpoint["image_latents_temporal_encoder.norm1.weight"] = unet_state_dict[
+        "local_temporal_encoder.layers.0.0.norm.weight"
+    ]
+    new_checkpoint["image_latents_temporal_encoder.norm1.bias"] = unet_state_dict[
+        "local_temporal_encoder.layers.0.0.norm.bias"
+    ]
+
+    # attention
+    qkv = unet_state_dict["local_temporal_encoder.layers.0.0.fn.to_qkv.weight"]
+    q, k, v = torch.chunk(qkv, 3, dim=0)
+    new_checkpoint["image_latents_temporal_encoder.attn1.to_q.weight"] = q
+    new_checkpoint["image_latents_temporal_encoder.attn1.to_k.weight"] = k
+    new_checkpoint["image_latents_temporal_encoder.attn1.to_v.weight"] = v
+    new_checkpoint["image_latents_temporal_encoder.attn1.to_out.0.weight"] = unet_state_dict[
+        "local_temporal_encoder.layers.0.0.fn.to_out.0.weight"
+    ]
+    new_checkpoint["image_latents_temporal_encoder.attn1.to_out.0.bias"] = unet_state_dict[
+        "local_temporal_encoder.layers.0.0.fn.to_out.0.bias"
+    ]
+
+    # feedforward
+    new_checkpoint["image_latents_temporal_encoder.ff.net.0.proj.weight"] = unet_state_dict[
+        "local_temporal_encoder.layers.0.1.net.0.0.weight"
+    ]
+    new_checkpoint["image_latents_temporal_encoder.ff.net.0.proj.bias"] = unet_state_dict[
+        "local_temporal_encoder.layers.0.1.net.0.0.bias"
+    ]
+    new_checkpoint["image_latents_temporal_encoder.ff.net.2.weight"] = unet_state_dict[
+        "local_temporal_encoder.layers.0.1.net.2.weight"
+    ]
+    new_checkpoint["image_latents_temporal_encoder.ff.net.2.bias"] = unet_state_dict[
+        "local_temporal_encoder.layers.0.1.net.2.bias"
+    ]
+
+    if "class_embed_type" in config:
+        if config["class_embed_type"] is None:
+            # No parameters to port
+            ...
+        elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+            new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+            new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+            new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+            new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+        else:
+            raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    first_temp_attention = [v for v in unet_state_dict if v.startswith("input_blocks.0.1")]
+    paths = renew_attention_paths(first_temp_attention)
+    meta_path = {"old": "input_blocks.0.1", "new": "transformer_in"}
+    assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+        temp_attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.2" in key]
+
+        if f"input_blocks.{i}.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        temporal_convs = [key for key in resnets if "temopral_conv" in key]
+        paths = renew_temp_conv_paths(temporal_convs)
+        meta_path = {
+            "old": f"input_blocks.{i}.0.temopral_conv",
+            "new": f"down_blocks.{block_id}.temp_convs.{layer_in_block_id}",
+        }
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+        if len(temp_attentions):
+            paths = renew_attention_paths(temp_attentions)
+            meta_path = {
+                "old": f"input_blocks.{i}.2",
+                "new": f"down_blocks.{block_id}.temp_attentions.{layer_in_block_id}",
+            }
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    temporal_convs_0 = [key for key in resnet_0 if "temopral_conv" in key]
+    attentions = middle_blocks[1]
+    temp_attentions = middle_blocks[2]
+    resnet_1 = middle_blocks[3]
+    temporal_convs_1 = [key for key in resnet_1 if "temopral_conv" in key]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    meta_path = {"old": "middle_block.0", "new": "mid_block.resnets.0"}
+    assign_to_checkpoint(
+        resnet_0_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
+    )
+
+    temp_conv_0_paths = renew_temp_conv_paths(temporal_convs_0)
+    meta_path = {"old": "middle_block.0.temopral_conv", "new": "mid_block.temp_convs.0"}
+    assign_to_checkpoint(
+        temp_conv_0_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
+    )
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    meta_path = {"old": "middle_block.3", "new": "mid_block.resnets.1"}
+    assign_to_checkpoint(
+        resnet_1_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
+    )
+
+    temp_conv_1_paths = renew_temp_conv_paths(temporal_convs_1)
+    meta_path = {"old": "middle_block.3.temopral_conv", "new": "mid_block.temp_convs.1"}
+    assign_to_checkpoint(
+        temp_conv_1_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
+    )
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    temp_attentions_paths = renew_attention_paths(temp_attentions)
+    meta_path = {"old": "middle_block.2", "new": "mid_block.temp_attentions.0"}
+    assign_to_checkpoint(
+        temp_attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+            temp_attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.2" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            temporal_convs = [key for key in resnets if "temopral_conv" in key]
+            paths = renew_temp_conv_paths(temporal_convs)
+            meta_path = {
+                "old": f"output_blocks.{i}.0.temopral_conv",
+                "new": f"up_blocks.{block_id}.temp_convs.{layer_in_block_id}",
+            }
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+
+            if len(temp_attentions):
+                paths = renew_attention_paths(temp_attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.2",
+                    "new": f"up_blocks.{block_id}.temp_attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+            temopral_conv_paths = [l for l in output_block_layers if "temopral_conv" in l]
+            for path in temopral_conv_paths:
+                pruned_path = path.split("temopral_conv.")[-1]
+                old_path = ".".join(["output_blocks", str(i), str(block_id), "temopral_conv", pruned_path])
+                new_path = ".".join(["up_blocks", str(block_id), "temp_convs", str(layer_in_block_id), pruned_path])
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--unet_checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--push_to_hub", action="store_true")
+    args = parser.parse_args()
+
+    # UNet
+    unet_checkpoint = torch.load(args.unet_checkpoint_path, map_location="cpu")
+    unet_checkpoint = unet_checkpoint["state_dict"]
+    unet = I2VGenXLUNet(sample_size=32)
+
+    converted_ckpt = convert_ldm_unet_checkpoint(unet_checkpoint, unet.config)
+
+    diff_0 = set(unet.state_dict().keys()) - set(converted_ckpt.keys())
+    diff_1 = set(converted_ckpt.keys()) - set(unet.state_dict().keys())
+
+    assert len(diff_0) == len(diff_1) == 0, "Converted weights don't match"
+
+    unet.load_state_dict(converted_ckpt, strict=True)
+
+    # vae
+    temp_pipe = StableDiffusionPipeline.from_single_file(
+        "https://huggingface.co/ali-vilab/i2vgen-xl/blob/main/models/v2-1_512-ema-pruned.ckpt"
+    )
+    vae = temp_pipe.vae
+    del temp_pipe
+
+    # text encoder and tokenizer
+    text_encoder = CLIPTextModel.from_pretrained(CLIP_ID)
+    tokenizer = CLIPTokenizer.from_pretrained(CLIP_ID)
+
+    # image encoder and feature extractor
+    image_encoder = CLIPVisionModelWithProjection.from_pretrained(CLIP_ID)
+    feature_extractor = CLIPImageProcessor.from_pretrained(CLIP_ID)
+
+    # scheduler
+    # https://github.com/ali-vilab/i2vgen-xl/blob/main/configs/i2vgen_xl_train.yaml
+    scheduler = DDIMScheduler(
+        beta_schedule="squaredcos_cap_v2",
+        rescale_betas_zero_snr=True,
+        set_alpha_to_one=True,
+        clip_sample=False,
+        steps_offset=1,
+        timestep_spacing="leading",
+        prediction_type="v_prediction",
+    )
+
+    # final
+    pipeline = I2VGenXLPipeline(
+        unet=unet,
+        vae=vae,
+        image_encoder=image_encoder,
+        feature_extractor=feature_extractor,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        scheduler=scheduler,
+    )
+
+    pipeline.save_pretrained(args.dump_path, push_to_hub=args.push_to_hub)
diff --git a/diffusers/scripts/convert_if.py b/diffusers/scripts/convert_if.py
new file mode 100644
index 0000000000000000000000000000000000000000..85c739ca92f04bd2e1fefdaca3ff15b59f75c66e
--- /dev/null
+++ b/diffusers/scripts/convert_if.py
@@ -0,0 +1,1250 @@
+import argparse
+import inspect
+import os
+
+import numpy as np
+import torch
+import yaml
+from torch.nn import functional as F
+from transformers import CLIPConfig, CLIPImageProcessor, CLIPVisionModelWithProjection, T5EncoderModel, T5Tokenizer
+
+from diffusers import DDPMScheduler, IFPipeline, IFSuperResolutionPipeline, UNet2DConditionModel
+from diffusers.pipelines.deepfloyd_if.safety_checker import IFSafetyChecker
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dump_path", required=False, default=None, type=str)
+
+    parser.add_argument("--dump_path_stage_2", required=False, default=None, type=str)
+
+    parser.add_argument("--dump_path_stage_3", required=False, default=None, type=str)
+
+    parser.add_argument("--unet_config", required=False, default=None, type=str, help="Path to unet config file")
+
+    parser.add_argument(
+        "--unet_checkpoint_path", required=False, default=None, type=str, help="Path to unet checkpoint file"
+    )
+
+    parser.add_argument(
+        "--unet_checkpoint_path_stage_2",
+        required=False,
+        default=None,
+        type=str,
+        help="Path to stage 2 unet checkpoint file",
+    )
+
+    parser.add_argument(
+        "--unet_checkpoint_path_stage_3",
+        required=False,
+        default=None,
+        type=str,
+        help="Path to stage 3 unet checkpoint file",
+    )
+
+    parser.add_argument("--p_head_path", type=str, required=True)
+
+    parser.add_argument("--w_head_path", type=str, required=True)
+
+    args = parser.parse_args()
+
+    return args
+
+
+def main(args):
+    tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl")
+    text_encoder = T5EncoderModel.from_pretrained("google/t5-v1_1-xxl")
+
+    feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
+    safety_checker = convert_safety_checker(p_head_path=args.p_head_path, w_head_path=args.w_head_path)
+
+    if args.unet_config is not None and args.unet_checkpoint_path is not None and args.dump_path is not None:
+        convert_stage_1_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args)
+
+    if args.unet_checkpoint_path_stage_2 is not None and args.dump_path_stage_2 is not None:
+        convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage=2)
+
+    if args.unet_checkpoint_path_stage_3 is not None and args.dump_path_stage_3 is not None:
+        convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage=3)
+
+
+def convert_stage_1_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args):
+    unet = get_stage_1_unet(args.unet_config, args.unet_checkpoint_path)
+
+    scheduler = DDPMScheduler(
+        variance_type="learned_range",
+        beta_schedule="squaredcos_cap_v2",
+        prediction_type="epsilon",
+        thresholding=True,
+        dynamic_thresholding_ratio=0.95,
+        sample_max_value=1.5,
+    )
+
+    pipe = IFPipeline(
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        unet=unet,
+        scheduler=scheduler,
+        safety_checker=safety_checker,
+        feature_extractor=feature_extractor,
+        requires_safety_checker=True,
+    )
+
+    pipe.save_pretrained(args.dump_path)
+
+
+def convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage):
+    if stage == 2:
+        unet_checkpoint_path = args.unet_checkpoint_path_stage_2
+        sample_size = None
+        dump_path = args.dump_path_stage_2
+    elif stage == 3:
+        unet_checkpoint_path = args.unet_checkpoint_path_stage_3
+        sample_size = 1024
+        dump_path = args.dump_path_stage_3
+    else:
+        assert False
+
+    unet = get_super_res_unet(unet_checkpoint_path, verify_param_count=False, sample_size=sample_size)
+
+    image_noising_scheduler = DDPMScheduler(
+        beta_schedule="squaredcos_cap_v2",
+    )
+
+    scheduler = DDPMScheduler(
+        variance_type="learned_range",
+        beta_schedule="squaredcos_cap_v2",
+        prediction_type="epsilon",
+        thresholding=True,
+        dynamic_thresholding_ratio=0.95,
+        sample_max_value=1.0,
+    )
+
+    pipe = IFSuperResolutionPipeline(
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        unet=unet,
+        scheduler=scheduler,
+        image_noising_scheduler=image_noising_scheduler,
+        safety_checker=safety_checker,
+        feature_extractor=feature_extractor,
+        requires_safety_checker=True,
+    )
+
+    pipe.save_pretrained(dump_path)
+
+
+def get_stage_1_unet(unet_config, unet_checkpoint_path):
+    original_unet_config = yaml.safe_load(unet_config)
+    original_unet_config = original_unet_config["params"]
+
+    unet_diffusers_config = create_unet_diffusers_config(original_unet_config)
+
+    unet = UNet2DConditionModel(**unet_diffusers_config)
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    unet_checkpoint = torch.load(unet_checkpoint_path, map_location=device)
+
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+        unet_checkpoint, unet_diffusers_config, path=unet_checkpoint_path
+    )
+
+    unet.load_state_dict(converted_unet_checkpoint)
+
+    return unet
+
+
+def convert_safety_checker(p_head_path, w_head_path):
+    state_dict = {}
+
+    # p head
+
+    p_head = np.load(p_head_path)
+
+    p_head_weights = p_head["weights"]
+    p_head_weights = torch.from_numpy(p_head_weights)
+    p_head_weights = p_head_weights.unsqueeze(0)
+
+    p_head_biases = p_head["biases"]
+    p_head_biases = torch.from_numpy(p_head_biases)
+    p_head_biases = p_head_biases.unsqueeze(0)
+
+    state_dict["p_head.weight"] = p_head_weights
+    state_dict["p_head.bias"] = p_head_biases
+
+    # w head
+
+    w_head = np.load(w_head_path)
+
+    w_head_weights = w_head["weights"]
+    w_head_weights = torch.from_numpy(w_head_weights)
+    w_head_weights = w_head_weights.unsqueeze(0)
+
+    w_head_biases = w_head["biases"]
+    w_head_biases = torch.from_numpy(w_head_biases)
+    w_head_biases = w_head_biases.unsqueeze(0)
+
+    state_dict["w_head.weight"] = w_head_weights
+    state_dict["w_head.bias"] = w_head_biases
+
+    # vision model
+
+    vision_model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+    vision_model_state_dict = vision_model.state_dict()
+
+    for key, value in vision_model_state_dict.items():
+        key = f"vision_model.{key}"
+        state_dict[key] = value
+
+    # full model
+
+    config = CLIPConfig.from_pretrained("openai/clip-vit-large-patch14")
+    safety_checker = IFSafetyChecker(config)
+
+    safety_checker.load_state_dict(state_dict)
+
+    return safety_checker
+
+
+def create_unet_diffusers_config(original_unet_config, class_embed_type=None):
+    attention_resolutions = parse_list(original_unet_config["attention_resolutions"])
+    attention_resolutions = [original_unet_config["image_size"] // int(res) for res in attention_resolutions]
+
+    channel_mult = parse_list(original_unet_config["channel_mult"])
+    block_out_channels = [original_unet_config["model_channels"] * mult for mult in channel_mult]
+
+    down_block_types = []
+    resolution = 1
+
+    for i in range(len(block_out_channels)):
+        if resolution in attention_resolutions:
+            block_type = "SimpleCrossAttnDownBlock2D"
+        elif original_unet_config["resblock_updown"]:
+            block_type = "ResnetDownsampleBlock2D"
+        else:
+            block_type = "DownBlock2D"
+
+        down_block_types.append(block_type)
+
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        if resolution in attention_resolutions:
+            block_type = "SimpleCrossAttnUpBlock2D"
+        elif original_unet_config["resblock_updown"]:
+            block_type = "ResnetUpsampleBlock2D"
+        else:
+            block_type = "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    head_dim = original_unet_config["num_head_channels"]
+
+    use_linear_projection = (
+        original_unet_config["use_linear_in_transformer"]
+        if "use_linear_in_transformer" in original_unet_config
+        else False
+    )
+    if use_linear_projection:
+        # stable diffusion 2-base-512 and 2-768
+        if head_dim is None:
+            head_dim = [5, 10, 20, 20]
+
+    projection_class_embeddings_input_dim = None
+
+    if class_embed_type is None:
+        if "num_classes" in original_unet_config:
+            if original_unet_config["num_classes"] == "sequential":
+                class_embed_type = "projection"
+                assert "adm_in_channels" in original_unet_config
+                projection_class_embeddings_input_dim = original_unet_config["adm_in_channels"]
+            else:
+                raise NotImplementedError(
+                    f"Unknown conditional unet num_classes config: {original_unet_config['num_classes']}"
+                )
+
+    config = {
+        "sample_size": original_unet_config["image_size"],
+        "in_channels": original_unet_config["in_channels"],
+        "down_block_types": tuple(down_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": original_unet_config["num_res_blocks"],
+        "cross_attention_dim": original_unet_config["encoder_channels"],
+        "attention_head_dim": head_dim,
+        "use_linear_projection": use_linear_projection,
+        "class_embed_type": class_embed_type,
+        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
+        "out_channels": original_unet_config["out_channels"],
+        "up_block_types": tuple(up_block_types),
+        "upcast_attention": False,  # TODO: guessing
+        "cross_attention_norm": "group_norm",
+        "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+        "addition_embed_type": "text",
+        "act_fn": "gelu",
+    }
+
+    if original_unet_config["use_scale_shift_norm"]:
+        config["resnet_time_scale_shift"] = "scale_shift"
+
+    if "encoder_dim" in original_unet_config:
+        config["encoder_hid_dim"] = original_unet_config["encoder_dim"]
+
+    return config
+
+
+def convert_ldm_unet_checkpoint(unet_state_dict, config, path=None):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    if config["class_embed_type"] in [None, "identity"]:
+        # No parameters to port
+        ...
+    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+    else:
+        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+
+        # TODO need better check than i in [4, 8, 12, 16]
+        block_type = config["down_block_types"][block_id]
+        if (block_type == "ResnetDownsampleBlock2D" or block_type == "SimpleCrossAttnDownBlock2D") and i in [
+            4,
+            8,
+            12,
+            16,
+        ]:
+            meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.downsamplers.0"}
+        else:
+            meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            old_path = f"input_blocks.{i}.1"
+            new_path = f"down_blocks.{block_id}.attentions.{layer_in_block_id}"
+
+            assign_attention_to_checkpoint(
+                new_checkpoint=new_checkpoint,
+                unet_state_dict=unet_state_dict,
+                old_path=old_path,
+                new_path=new_path,
+                config=config,
+            )
+
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": old_path, "new": new_path}
+            assign_to_checkpoint(
+                paths,
+                new_checkpoint,
+                unet_state_dict,
+                additional_replacements=[meta_path],
+                config=config,
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    old_path = "middle_block.1"
+    new_path = "mid_block.attentions.0"
+
+    assign_attention_to_checkpoint(
+        new_checkpoint=new_checkpoint,
+        unet_state_dict=unet_state_dict,
+        old_path=old_path,
+        new_path=new_path,
+        config=config,
+    )
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        # len(output_block_list) == 1 -> resnet
+        # len(output_block_list) == 2 -> resnet, attention
+        # len(output_block_list) == 3 -> resnet, attention, upscale resnet
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                old_path = f"output_blocks.{i}.1"
+                new_path = f"up_blocks.{block_id}.attentions.{layer_in_block_id}"
+
+                assign_attention_to_checkpoint(
+                    new_checkpoint=new_checkpoint,
+                    unet_state_dict=unet_state_dict,
+                    old_path=old_path,
+                    new_path=new_path,
+                    config=config,
+                )
+
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": old_path,
+                    "new": new_path,
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+
+            if len(output_block_list) == 3:
+                resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.2" in key]
+                paths = renew_resnet_paths(resnets)
+                meta_path = {"old": f"output_blocks.{i}.2", "new": f"up_blocks.{block_id}.upsamplers.0"}
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    if "encoder_proj.weight" in unet_state_dict:
+        new_checkpoint["encoder_hid_proj.weight"] = unet_state_dict.pop("encoder_proj.weight")
+        new_checkpoint["encoder_hid_proj.bias"] = unet_state_dict.pop("encoder_proj.bias")
+
+    if "encoder_pooling.0.weight" in unet_state_dict:
+        new_checkpoint["add_embedding.norm1.weight"] = unet_state_dict.pop("encoder_pooling.0.weight")
+        new_checkpoint["add_embedding.norm1.bias"] = unet_state_dict.pop("encoder_pooling.0.bias")
+
+        new_checkpoint["add_embedding.pool.positional_embedding"] = unet_state_dict.pop(
+            "encoder_pooling.1.positional_embedding"
+        )
+        new_checkpoint["add_embedding.pool.k_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.k_proj.weight")
+        new_checkpoint["add_embedding.pool.k_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.k_proj.bias")
+        new_checkpoint["add_embedding.pool.q_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.q_proj.weight")
+        new_checkpoint["add_embedding.pool.q_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.q_proj.bias")
+        new_checkpoint["add_embedding.pool.v_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.v_proj.weight")
+        new_checkpoint["add_embedding.pool.v_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.v_proj.bias")
+
+        new_checkpoint["add_embedding.proj.weight"] = unet_state_dict.pop("encoder_pooling.2.weight")
+        new_checkpoint["add_embedding.proj.bias"] = unet_state_dict.pop("encoder_pooling.2.bias")
+
+        new_checkpoint["add_embedding.norm2.weight"] = unet_state_dict.pop("encoder_pooling.3.weight")
+        new_checkpoint["add_embedding.norm2.bias"] = unet_state_dict.pop("encoder_pooling.3.bias")
+
+    return new_checkpoint
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        if "qkv" in new_item:
+            continue
+
+        if "encoder_kv" in new_item:
+            continue
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
+        new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
+
+        new_item = new_item.replace("norm_encoder.weight", "norm_cross.weight")
+        new_item = new_item.replace("norm_encoder.bias", "norm_cross.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_attention_to_checkpoint(new_checkpoint, unet_state_dict, old_path, new_path, config):
+    qkv_weight = unet_state_dict.pop(f"{old_path}.qkv.weight")
+    qkv_weight = qkv_weight[:, :, 0]
+
+    qkv_bias = unet_state_dict.pop(f"{old_path}.qkv.bias")
+
+    is_cross_attn_only = "only_cross_attention" in config and config["only_cross_attention"]
+
+    split = 1 if is_cross_attn_only else 3
+
+    weights, bias = split_attentions(
+        weight=qkv_weight,
+        bias=qkv_bias,
+        split=split,
+        chunk_size=config["attention_head_dim"],
+    )
+
+    if is_cross_attn_only:
+        query_weight, q_bias = weights, bias
+        new_checkpoint[f"{new_path}.to_q.weight"] = query_weight[0]
+        new_checkpoint[f"{new_path}.to_q.bias"] = q_bias[0]
+    else:
+        [query_weight, key_weight, value_weight], [q_bias, k_bias, v_bias] = weights, bias
+        new_checkpoint[f"{new_path}.to_q.weight"] = query_weight
+        new_checkpoint[f"{new_path}.to_q.bias"] = q_bias
+        new_checkpoint[f"{new_path}.to_k.weight"] = key_weight
+        new_checkpoint[f"{new_path}.to_k.bias"] = k_bias
+        new_checkpoint[f"{new_path}.to_v.weight"] = value_weight
+        new_checkpoint[f"{new_path}.to_v.bias"] = v_bias
+
+    encoder_kv_weight = unet_state_dict.pop(f"{old_path}.encoder_kv.weight")
+    encoder_kv_weight = encoder_kv_weight[:, :, 0]
+
+    encoder_kv_bias = unet_state_dict.pop(f"{old_path}.encoder_kv.bias")
+
+    [encoder_k_weight, encoder_v_weight], [encoder_k_bias, encoder_v_bias] = split_attentions(
+        weight=encoder_kv_weight,
+        bias=encoder_kv_bias,
+        split=2,
+        chunk_size=config["attention_head_dim"],
+    )
+
+    new_checkpoint[f"{new_path}.add_k_proj.weight"] = encoder_k_weight
+    new_checkpoint[f"{new_path}.add_k_proj.bias"] = encoder_k_bias
+    new_checkpoint[f"{new_path}.add_v_proj.weight"] = encoder_v_weight
+    new_checkpoint[f"{new_path}.add_v_proj.bias"] = encoder_v_bias
+
+
+def assign_to_checkpoint(paths, checkpoint, old_checkpoint, additional_replacements=None, config=None):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    for path in paths:
+        new_path = path["new"]
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path or "to_out.0.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+# TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?)
+def split_attentions(*, weight, bias, split, chunk_size):
+    weights = [None] * split
+    biases = [None] * split
+
+    weights_biases_idx = 0
+
+    for starting_row_index in range(0, weight.shape[0], chunk_size):
+        row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size)
+
+        weight_rows = weight[row_indices, :]
+        bias_rows = bias[row_indices]
+
+        if weights[weights_biases_idx] is None:
+            weights[weights_biases_idx] = weight_rows
+            biases[weights_biases_idx] = bias_rows
+        else:
+            assert weights[weights_biases_idx] is not None
+            weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows])
+            biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows])
+
+        weights_biases_idx = (weights_biases_idx + 1) % split
+
+    return weights, biases
+
+
+def parse_list(value):
+    if isinstance(value, str):
+        value = value.split(",")
+        value = [int(v) for v in value]
+    elif isinstance(value, list):
+        pass
+    else:
+        raise ValueError(f"Can't parse list for type: {type(value)}")
+
+    return value
+
+
+# below is copy and pasted from original convert_if_stage_2.py script
+
+
+def get_super_res_unet(unet_checkpoint_path, verify_param_count=True, sample_size=None):
+    orig_path = unet_checkpoint_path
+
+    original_unet_config = yaml.safe_load(os.path.join(orig_path, "config.yml"))
+    original_unet_config = original_unet_config["params"]
+
+    unet_diffusers_config = superres_create_unet_diffusers_config(original_unet_config)
+    unet_diffusers_config["time_embedding_dim"] = original_unet_config["model_channels"] * int(
+        original_unet_config["channel_mult"].split(",")[-1]
+    )
+    if original_unet_config["encoder_dim"] != original_unet_config["encoder_channels"]:
+        unet_diffusers_config["encoder_hid_dim"] = original_unet_config["encoder_dim"]
+        unet_diffusers_config["class_embed_type"] = "timestep"
+        unet_diffusers_config["addition_embed_type"] = "text"
+
+    unet_diffusers_config["time_embedding_act_fn"] = "gelu"
+    unet_diffusers_config["resnet_skip_time_act"] = True
+    unet_diffusers_config["resnet_out_scale_factor"] = 1 / 0.7071
+    unet_diffusers_config["mid_block_scale_factor"] = 1 / 0.7071
+    unet_diffusers_config["only_cross_attention"] = (
+        bool(original_unet_config["disable_self_attentions"])
+        if (
+            "disable_self_attentions" in original_unet_config
+            and isinstance(original_unet_config["disable_self_attentions"], int)
+        )
+        else True
+    )
+
+    if sample_size is None:
+        unet_diffusers_config["sample_size"] = original_unet_config["image_size"]
+    else:
+        # The second upscaler unet's sample size is incorrectly specified
+        # in the config and is instead hardcoded in source
+        unet_diffusers_config["sample_size"] = sample_size
+
+    unet_checkpoint = torch.load(os.path.join(unet_checkpoint_path, "pytorch_model.bin"), map_location="cpu")
+
+    if verify_param_count:
+        # check that architecture matches - is a bit slow
+        verify_param_count(orig_path, unet_diffusers_config)
+
+    converted_unet_checkpoint = superres_convert_ldm_unet_checkpoint(
+        unet_checkpoint, unet_diffusers_config, path=unet_checkpoint_path
+    )
+    converted_keys = converted_unet_checkpoint.keys()
+
+    model = UNet2DConditionModel(**unet_diffusers_config)
+    expected_weights = model.state_dict().keys()
+
+    diff_c_e = set(converted_keys) - set(expected_weights)
+    diff_e_c = set(expected_weights) - set(converted_keys)
+
+    assert len(diff_e_c) == 0, f"Expected, but not converted: {diff_e_c}"
+    assert len(diff_c_e) == 0, f"Converted, but not expected: {diff_c_e}"
+
+    model.load_state_dict(converted_unet_checkpoint)
+
+    return model
+
+
+def superres_create_unet_diffusers_config(original_unet_config):
+    attention_resolutions = parse_list(original_unet_config["attention_resolutions"])
+    attention_resolutions = [original_unet_config["image_size"] // int(res) for res in attention_resolutions]
+
+    channel_mult = parse_list(original_unet_config["channel_mult"])
+    block_out_channels = [original_unet_config["model_channels"] * mult for mult in channel_mult]
+
+    down_block_types = []
+    resolution = 1
+
+    for i in range(len(block_out_channels)):
+        if resolution in attention_resolutions:
+            block_type = "SimpleCrossAttnDownBlock2D"
+        elif original_unet_config["resblock_updown"]:
+            block_type = "ResnetDownsampleBlock2D"
+        else:
+            block_type = "DownBlock2D"
+
+        down_block_types.append(block_type)
+
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        if resolution in attention_resolutions:
+            block_type = "SimpleCrossAttnUpBlock2D"
+        elif original_unet_config["resblock_updown"]:
+            block_type = "ResnetUpsampleBlock2D"
+        else:
+            block_type = "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    head_dim = original_unet_config["num_head_channels"]
+    use_linear_projection = (
+        original_unet_config["use_linear_in_transformer"]
+        if "use_linear_in_transformer" in original_unet_config
+        else False
+    )
+    if use_linear_projection:
+        # stable diffusion 2-base-512 and 2-768
+        if head_dim is None:
+            head_dim = [5, 10, 20, 20]
+
+    class_embed_type = None
+    projection_class_embeddings_input_dim = None
+
+    if "num_classes" in original_unet_config:
+        if original_unet_config["num_classes"] == "sequential":
+            class_embed_type = "projection"
+            assert "adm_in_channels" in original_unet_config
+            projection_class_embeddings_input_dim = original_unet_config["adm_in_channels"]
+        else:
+            raise NotImplementedError(
+                f"Unknown conditional unet num_classes config: {original_unet_config['num_classes']}"
+            )
+
+    config = {
+        "in_channels": original_unet_config["in_channels"],
+        "down_block_types": tuple(down_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": tuple(original_unet_config["num_res_blocks"]),
+        "cross_attention_dim": original_unet_config["encoder_channels"],
+        "attention_head_dim": head_dim,
+        "use_linear_projection": use_linear_projection,
+        "class_embed_type": class_embed_type,
+        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
+        "out_channels": original_unet_config["out_channels"],
+        "up_block_types": tuple(up_block_types),
+        "upcast_attention": False,  # TODO: guessing
+        "cross_attention_norm": "group_norm",
+        "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+        "act_fn": "gelu",
+    }
+
+    if original_unet_config["use_scale_shift_norm"]:
+        config["resnet_time_scale_shift"] = "scale_shift"
+
+    return config
+
+
+def superres_convert_ldm_unet_checkpoint(unet_state_dict, config, path=None, extract_ema=False):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    if config["class_embed_type"] is None:
+        # No parameters to port
+        ...
+    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["aug_proj.0.weight"]
+        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["aug_proj.0.bias"]
+        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["aug_proj.2.weight"]
+        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["aug_proj.2.bias"]
+    else:
+        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+
+    if "encoder_proj.weight" in unet_state_dict:
+        new_checkpoint["encoder_hid_proj.weight"] = unet_state_dict["encoder_proj.weight"]
+        new_checkpoint["encoder_hid_proj.bias"] = unet_state_dict["encoder_proj.bias"]
+
+    if "encoder_pooling.0.weight" in unet_state_dict:
+        mapping = {
+            "encoder_pooling.0": "add_embedding.norm1",
+            "encoder_pooling.1": "add_embedding.pool",
+            "encoder_pooling.2": "add_embedding.proj",
+            "encoder_pooling.3": "add_embedding.norm2",
+        }
+        for key in unet_state_dict.keys():
+            if key.startswith("encoder_pooling"):
+                prefix = key[: len("encoder_pooling.0")]
+                new_key = key.replace(prefix, mapping[prefix])
+                new_checkpoint[new_key] = unet_state_dict[key]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key]
+        for layer_id in range(num_output_blocks)
+    }
+    if not isinstance(config["layers_per_block"], int):
+        layers_per_block_list = [e + 1 for e in config["layers_per_block"]]
+        layers_per_block_cumsum = list(np.cumsum(layers_per_block_list))
+        downsampler_ids = layers_per_block_cumsum
+    else:
+        # TODO need better check than i in [4, 8, 12, 16]
+        downsampler_ids = [4, 8, 12, 16]
+
+    for i in range(1, num_input_blocks):
+        if isinstance(config["layers_per_block"], int):
+            layers_per_block = config["layers_per_block"]
+            block_id = (i - 1) // (layers_per_block + 1)
+            layer_in_block_id = (i - 1) % (layers_per_block + 1)
+        else:
+            block_id = next(k for k, n in enumerate(layers_per_block_cumsum) if (i - 1) < n)
+            passed_blocks = layers_per_block_cumsum[block_id - 1] if block_id > 0 else 0
+            layer_in_block_id = (i - 1) - passed_blocks
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+
+        block_type = config["down_block_types"][block_id]
+        if (
+            block_type == "ResnetDownsampleBlock2D" or block_type == "SimpleCrossAttnDownBlock2D"
+        ) and i in downsampler_ids:
+            meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.downsamplers.0"}
+        else:
+            meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            old_path = f"input_blocks.{i}.1"
+            new_path = f"down_blocks.{block_id}.attentions.{layer_in_block_id}"
+
+            assign_attention_to_checkpoint(
+                new_checkpoint=new_checkpoint,
+                unet_state_dict=unet_state_dict,
+                old_path=old_path,
+                new_path=new_path,
+                config=config,
+            )
+
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": old_path, "new": new_path}
+            assign_to_checkpoint(
+                paths,
+                new_checkpoint,
+                unet_state_dict,
+                additional_replacements=[meta_path],
+                config=config,
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    old_path = "middle_block.1"
+    new_path = "mid_block.attentions.0"
+
+    assign_attention_to_checkpoint(
+        new_checkpoint=new_checkpoint,
+        unet_state_dict=unet_state_dict,
+        old_path=old_path,
+        new_path=new_path,
+        config=config,
+    )
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+    if not isinstance(config["layers_per_block"], int):
+        layers_per_block_list = list(reversed([e + 1 for e in config["layers_per_block"]]))
+        layers_per_block_cumsum = list(np.cumsum(layers_per_block_list))
+
+    for i in range(num_output_blocks):
+        if isinstance(config["layers_per_block"], int):
+            layers_per_block = config["layers_per_block"]
+            block_id = i // (layers_per_block + 1)
+            layer_in_block_id = i % (layers_per_block + 1)
+        else:
+            block_id = next(k for k, n in enumerate(layers_per_block_cumsum) if i < n)
+            passed_blocks = layers_per_block_cumsum[block_id - 1] if block_id > 0 else 0
+            layer_in_block_id = i - passed_blocks
+
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        # len(output_block_list) == 1 -> resnet
+        # len(output_block_list) == 2 -> resnet, attention or resnet, upscale resnet
+        # len(output_block_list) == 3 -> resnet, attention, upscale resnet
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+
+            has_attention = True
+            if len(output_block_list) == 2 and any("in_layers" in k for k in output_block_list["1"]):
+                has_attention = False
+
+            maybe_attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # this layer was no attention
+                has_attention = False
+                maybe_attentions = []
+
+            if has_attention:
+                old_path = f"output_blocks.{i}.1"
+                new_path = f"up_blocks.{block_id}.attentions.{layer_in_block_id}"
+
+                assign_attention_to_checkpoint(
+                    new_checkpoint=new_checkpoint,
+                    unet_state_dict=unet_state_dict,
+                    old_path=old_path,
+                    new_path=new_path,
+                    config=config,
+                )
+
+                paths = renew_attention_paths(maybe_attentions)
+                meta_path = {
+                    "old": old_path,
+                    "new": new_path,
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+
+            if len(output_block_list) == 3 or (not has_attention and len(maybe_attentions) > 0):
+                layer_id = len(output_block_list) - 1
+                resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.{layer_id}" in key]
+                paths = renew_resnet_paths(resnets)
+                meta_path = {"old": f"output_blocks.{i}.{layer_id}", "new": f"up_blocks.{block_id}.upsamplers.0"}
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+def verify_param_count(orig_path, unet_diffusers_config):
+    if "-II-" in orig_path:
+        from deepfloyd_if.modules import IFStageII
+
+        if_II = IFStageII(device="cpu", dir_or_name=orig_path)
+    elif "-III-" in orig_path:
+        from deepfloyd_if.modules import IFStageIII
+
+        if_II = IFStageIII(device="cpu", dir_or_name=orig_path)
+    else:
+        assert f"Weird name. Should have -II- or -III- in path: {orig_path}"
+
+    unet = UNet2DConditionModel(**unet_diffusers_config)
+
+    # in params
+    assert_param_count(unet.time_embedding, if_II.model.time_embed)
+    assert_param_count(unet.conv_in, if_II.model.input_blocks[:1])
+
+    # downblocks
+    assert_param_count(unet.down_blocks[0], if_II.model.input_blocks[1:4])
+    assert_param_count(unet.down_blocks[1], if_II.model.input_blocks[4:7])
+    assert_param_count(unet.down_blocks[2], if_II.model.input_blocks[7:11])
+
+    if "-II-" in orig_path:
+        assert_param_count(unet.down_blocks[3], if_II.model.input_blocks[11:17])
+        assert_param_count(unet.down_blocks[4], if_II.model.input_blocks[17:])
+    if "-III-" in orig_path:
+        assert_param_count(unet.down_blocks[3], if_II.model.input_blocks[11:15])
+        assert_param_count(unet.down_blocks[4], if_II.model.input_blocks[15:20])
+        assert_param_count(unet.down_blocks[5], if_II.model.input_blocks[20:])
+
+    # mid block
+    assert_param_count(unet.mid_block, if_II.model.middle_block)
+
+    # up block
+    if "-II-" in orig_path:
+        assert_param_count(unet.up_blocks[0], if_II.model.output_blocks[:6])
+        assert_param_count(unet.up_blocks[1], if_II.model.output_blocks[6:12])
+        assert_param_count(unet.up_blocks[2], if_II.model.output_blocks[12:16])
+        assert_param_count(unet.up_blocks[3], if_II.model.output_blocks[16:19])
+        assert_param_count(unet.up_blocks[4], if_II.model.output_blocks[19:])
+    if "-III-" in orig_path:
+        assert_param_count(unet.up_blocks[0], if_II.model.output_blocks[:5])
+        assert_param_count(unet.up_blocks[1], if_II.model.output_blocks[5:10])
+        assert_param_count(unet.up_blocks[2], if_II.model.output_blocks[10:14])
+        assert_param_count(unet.up_blocks[3], if_II.model.output_blocks[14:18])
+        assert_param_count(unet.up_blocks[4], if_II.model.output_blocks[18:21])
+        assert_param_count(unet.up_blocks[5], if_II.model.output_blocks[21:24])
+
+    # out params
+    assert_param_count(unet.conv_norm_out, if_II.model.out[0])
+    assert_param_count(unet.conv_out, if_II.model.out[2])
+
+    # make sure all model architecture has same param count
+    assert_param_count(unet, if_II.model)
+
+
+def assert_param_count(model_1, model_2):
+    count_1 = sum(p.numel() for p in model_1.parameters())
+    count_2 = sum(p.numel() for p in model_2.parameters())
+    assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"
+
+
+def superres_check_against_original(dump_path, unet_checkpoint_path):
+    model_path = dump_path
+    model = UNet2DConditionModel.from_pretrained(model_path)
+    model.to("cuda")
+    orig_path = unet_checkpoint_path
+
+    if "-II-" in orig_path:
+        from deepfloyd_if.modules import IFStageII
+
+        if_II_model = IFStageII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model
+    elif "-III-" in orig_path:
+        from deepfloyd_if.modules import IFStageIII
+
+        if_II_model = IFStageIII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model
+
+    batch_size = 1
+    channels = model.config.in_channels // 2
+    height = model.config.sample_size
+    width = model.config.sample_size
+    height = 1024
+    width = 1024
+
+    torch.manual_seed(0)
+
+    latents = torch.randn((batch_size, channels, height, width), device=model.device)
+    image_small = torch.randn((batch_size, channels, height // 4, width // 4), device=model.device)
+
+    interpolate_antialias = {}
+    if "antialias" in inspect.signature(F.interpolate).parameters:
+        interpolate_antialias["antialias"] = True
+        image_upscaled = F.interpolate(
+            image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias
+        )
+
+    latent_model_input = torch.cat([latents, image_upscaled], dim=1).to(model.dtype)
+    t = torch.tensor([5], device=model.device).to(model.dtype)
+
+    seq_len = 64
+    encoder_hidden_states = torch.randn((batch_size, seq_len, model.config.encoder_hid_dim), device=model.device).to(
+        model.dtype
+    )
+
+    fake_class_labels = torch.tensor([t], device=model.device).to(model.dtype)
+
+    with torch.no_grad():
+        out = if_II_model(latent_model_input, t, aug_steps=fake_class_labels, text_emb=encoder_hidden_states)
+
+    if_II_model.to("cpu")
+    del if_II_model
+    import gc
+
+    torch.cuda.empty_cache()
+    gc.collect()
+    print(50 * "=")
+
+    with torch.no_grad():
+        noise_pred = model(
+            sample=latent_model_input,
+            encoder_hidden_states=encoder_hidden_states,
+            class_labels=fake_class_labels,
+            timestep=t,
+        ).sample
+
+    print("Out shape", noise_pred.shape)
+    print("Diff", (out - noise_pred).abs().sum())
+
+
+if __name__ == "__main__":
+    main(parse_args())
diff --git a/diffusers/scripts/convert_k_upscaler_to_diffusers.py b/diffusers/scripts/convert_k_upscaler_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..62abedd737855ca0b0bc9abb75c9b6fb91d5bde2
--- /dev/null
+++ b/diffusers/scripts/convert_k_upscaler_to_diffusers.py
@@ -0,0 +1,297 @@
+import argparse
+
+import huggingface_hub
+import k_diffusion as K
+import torch
+
+from diffusers import UNet2DConditionModel
+
+
+UPSCALER_REPO = "pcuenq/k-upscaler"
+
+
+def resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix):
+    rv = {
+        # norm1
+        f"{diffusers_resnet_prefix}.norm1.linear.weight": checkpoint[f"{resnet_prefix}.main.0.mapper.weight"],
+        f"{diffusers_resnet_prefix}.norm1.linear.bias": checkpoint[f"{resnet_prefix}.main.0.mapper.bias"],
+        # conv1
+        f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.main.2.weight"],
+        f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.main.2.bias"],
+        # norm2
+        f"{diffusers_resnet_prefix}.norm2.linear.weight": checkpoint[f"{resnet_prefix}.main.4.mapper.weight"],
+        f"{diffusers_resnet_prefix}.norm2.linear.bias": checkpoint[f"{resnet_prefix}.main.4.mapper.bias"],
+        # conv2
+        f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.main.6.weight"],
+        f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.main.6.bias"],
+    }
+
+    if resnet.conv_shortcut is not None:
+        rv.update(
+            {
+                f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.skip.weight"],
+            }
+        )
+
+    return rv
+
+
+def self_attn_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix):
+    weight_q, weight_k, weight_v = checkpoint[f"{attention_prefix}.qkv_proj.weight"].chunk(3, dim=0)
+    bias_q, bias_k, bias_v = checkpoint[f"{attention_prefix}.qkv_proj.bias"].chunk(3, dim=0)
+    rv = {
+        # norm
+        f"{diffusers_attention_prefix}.norm1.linear.weight": checkpoint[f"{attention_prefix}.norm_in.mapper.weight"],
+        f"{diffusers_attention_prefix}.norm1.linear.bias": checkpoint[f"{attention_prefix}.norm_in.mapper.bias"],
+        # to_q
+        f"{diffusers_attention_prefix}.attn1.to_q.weight": weight_q.squeeze(-1).squeeze(-1),
+        f"{diffusers_attention_prefix}.attn1.to_q.bias": bias_q,
+        # to_k
+        f"{diffusers_attention_prefix}.attn1.to_k.weight": weight_k.squeeze(-1).squeeze(-1),
+        f"{diffusers_attention_prefix}.attn1.to_k.bias": bias_k,
+        # to_v
+        f"{diffusers_attention_prefix}.attn1.to_v.weight": weight_v.squeeze(-1).squeeze(-1),
+        f"{diffusers_attention_prefix}.attn1.to_v.bias": bias_v,
+        # to_out
+        f"{diffusers_attention_prefix}.attn1.to_out.0.weight": checkpoint[f"{attention_prefix}.out_proj.weight"]
+        .squeeze(-1)
+        .squeeze(-1),
+        f"{diffusers_attention_prefix}.attn1.to_out.0.bias": checkpoint[f"{attention_prefix}.out_proj.bias"],
+    }
+
+    return rv
+
+
+def cross_attn_to_diffusers_checkpoint(
+    checkpoint, *, diffusers_attention_prefix, diffusers_attention_index, attention_prefix
+):
+    weight_k, weight_v = checkpoint[f"{attention_prefix}.kv_proj.weight"].chunk(2, dim=0)
+    bias_k, bias_v = checkpoint[f"{attention_prefix}.kv_proj.bias"].chunk(2, dim=0)
+
+    rv = {
+        # norm2 (ada groupnorm)
+        f"{diffusers_attention_prefix}.norm{diffusers_attention_index}.linear.weight": checkpoint[
+            f"{attention_prefix}.norm_dec.mapper.weight"
+        ],
+        f"{diffusers_attention_prefix}.norm{diffusers_attention_index}.linear.bias": checkpoint[
+            f"{attention_prefix}.norm_dec.mapper.bias"
+        ],
+        # layernorm on encoder_hidden_state
+        f"{diffusers_attention_prefix}.attn{diffusers_attention_index}.norm_cross.weight": checkpoint[
+            f"{attention_prefix}.norm_enc.weight"
+        ],
+        f"{diffusers_attention_prefix}.attn{diffusers_attention_index}.norm_cross.bias": checkpoint[
+            f"{attention_prefix}.norm_enc.bias"
+        ],
+        # to_q
+        f"{diffusers_attention_prefix}.attn{diffusers_attention_index}.to_q.weight": checkpoint[
+            f"{attention_prefix}.q_proj.weight"
+        ]
+        .squeeze(-1)
+        .squeeze(-1),
+        f"{diffusers_attention_prefix}.attn{diffusers_attention_index}.to_q.bias": checkpoint[
+            f"{attention_prefix}.q_proj.bias"
+        ],
+        # to_k
+        f"{diffusers_attention_prefix}.attn{diffusers_attention_index}.to_k.weight": weight_k.squeeze(-1).squeeze(-1),
+        f"{diffusers_attention_prefix}.attn{diffusers_attention_index}.to_k.bias": bias_k,
+        # to_v
+        f"{diffusers_attention_prefix}.attn{diffusers_attention_index}.to_v.weight": weight_v.squeeze(-1).squeeze(-1),
+        f"{diffusers_attention_prefix}.attn{diffusers_attention_index}.to_v.bias": bias_v,
+        # to_out
+        f"{diffusers_attention_prefix}.attn{diffusers_attention_index}.to_out.0.weight": checkpoint[
+            f"{attention_prefix}.out_proj.weight"
+        ]
+        .squeeze(-1)
+        .squeeze(-1),
+        f"{diffusers_attention_prefix}.attn{diffusers_attention_index}.to_out.0.bias": checkpoint[
+            f"{attention_prefix}.out_proj.bias"
+        ],
+    }
+
+    return rv
+
+
+def block_to_diffusers_checkpoint(block, checkpoint, block_idx, block_type):
+    block_prefix = "inner_model.u_net.u_blocks" if block_type == "up" else "inner_model.u_net.d_blocks"
+    block_prefix = f"{block_prefix}.{block_idx}"
+
+    diffusers_checkpoint = {}
+
+    if not hasattr(block, "attentions"):
+        n = 1  # resnet only
+    elif not block.attentions[0].add_self_attention:
+        n = 2  # resnet -> cross-attention
+    else:
+        n = 3  # resnet -> self-attention -> cross-attention)
+
+    for resnet_idx, resnet in enumerate(block.resnets):
+        # diffusers_resnet_prefix = f"{diffusers_up_block_prefix}.resnets.{resnet_idx}"
+        diffusers_resnet_prefix = f"{block_type}_blocks.{block_idx}.resnets.{resnet_idx}"
+        idx = n * resnet_idx if block_type == "up" else n * resnet_idx + 1
+        resnet_prefix = f"{block_prefix}.{idx}" if block_type == "up" else f"{block_prefix}.{idx}"
+
+        diffusers_checkpoint.update(
+            resnet_to_diffusers_checkpoint(
+                resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+            )
+        )
+
+    if hasattr(block, "attentions"):
+        for attention_idx, attention in enumerate(block.attentions):
+            diffusers_attention_prefix = f"{block_type}_blocks.{block_idx}.attentions.{attention_idx}"
+            idx = n * attention_idx + 1 if block_type == "up" else n * attention_idx + 2
+            self_attention_prefix = f"{block_prefix}.{idx}"
+            cross_attention_prefix = f"{block_prefix}.{idx }"
+            cross_attention_index = 1 if not attention.add_self_attention else 2
+            idx = (
+                n * attention_idx + cross_attention_index
+                if block_type == "up"
+                else n * attention_idx + cross_attention_index + 1
+            )
+            cross_attention_prefix = f"{block_prefix}.{idx }"
+
+            diffusers_checkpoint.update(
+                cross_attn_to_diffusers_checkpoint(
+                    checkpoint,
+                    diffusers_attention_prefix=diffusers_attention_prefix,
+                    diffusers_attention_index=2,
+                    attention_prefix=cross_attention_prefix,
+                )
+            )
+
+            if attention.add_self_attention is True:
+                diffusers_checkpoint.update(
+                    self_attn_to_diffusers_checkpoint(
+                        checkpoint,
+                        diffusers_attention_prefix=diffusers_attention_prefix,
+                        attention_prefix=self_attention_prefix,
+                    )
+                )
+
+    return diffusers_checkpoint
+
+
+def unet_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    # pre-processing
+    diffusers_checkpoint.update(
+        {
+            "conv_in.weight": checkpoint["inner_model.proj_in.weight"],
+            "conv_in.bias": checkpoint["inner_model.proj_in.bias"],
+        }
+    )
+
+    # timestep and class embedding
+    diffusers_checkpoint.update(
+        {
+            "time_proj.weight": checkpoint["inner_model.timestep_embed.weight"].squeeze(-1),
+            "time_embedding.linear_1.weight": checkpoint["inner_model.mapping.0.weight"],
+            "time_embedding.linear_1.bias": checkpoint["inner_model.mapping.0.bias"],
+            "time_embedding.linear_2.weight": checkpoint["inner_model.mapping.2.weight"],
+            "time_embedding.linear_2.bias": checkpoint["inner_model.mapping.2.bias"],
+            "time_embedding.cond_proj.weight": checkpoint["inner_model.mapping_cond.weight"],
+        }
+    )
+
+    # down_blocks
+    for down_block_idx, down_block in enumerate(model.down_blocks):
+        diffusers_checkpoint.update(block_to_diffusers_checkpoint(down_block, checkpoint, down_block_idx, "down"))
+
+    # up_blocks
+    for up_block_idx, up_block in enumerate(model.up_blocks):
+        diffusers_checkpoint.update(block_to_diffusers_checkpoint(up_block, checkpoint, up_block_idx, "up"))
+
+    # post-processing
+    diffusers_checkpoint.update(
+        {
+            "conv_out.weight": checkpoint["inner_model.proj_out.weight"],
+            "conv_out.bias": checkpoint["inner_model.proj_out.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def unet_model_from_original_config(original_config):
+    in_channels = original_config["input_channels"] + original_config["unet_cond_dim"]
+    out_channels = original_config["input_channels"] + (1 if original_config["has_variance"] else 0)
+
+    block_out_channels = original_config["channels"]
+
+    assert (
+        len(set(original_config["depths"])) == 1
+    ), "UNet2DConditionModel currently do not support blocks with different number of layers"
+    layers_per_block = original_config["depths"][0]
+
+    class_labels_dim = original_config["mapping_cond_dim"]
+    cross_attention_dim = original_config["cross_cond_dim"]
+
+    attn1_types = []
+    attn2_types = []
+    for s, c in zip(original_config["self_attn_depths"], original_config["cross_attn_depths"]):
+        if s:
+            a1 = "self"
+            a2 = "cross" if c else None
+        elif c:
+            a1 = "cross"
+            a2 = None
+        else:
+            a1 = None
+            a2 = None
+        attn1_types.append(a1)
+        attn2_types.append(a2)
+
+    unet = UNet2DConditionModel(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        down_block_types=("KDownBlock2D", "KCrossAttnDownBlock2D", "KCrossAttnDownBlock2D", "KCrossAttnDownBlock2D"),
+        mid_block_type=None,
+        up_block_types=("KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KUpBlock2D"),
+        block_out_channels=block_out_channels,
+        layers_per_block=layers_per_block,
+        act_fn="gelu",
+        norm_num_groups=None,
+        cross_attention_dim=cross_attention_dim,
+        attention_head_dim=64,
+        time_cond_proj_dim=class_labels_dim,
+        resnet_time_scale_shift="scale_shift",
+        time_embedding_type="fourier",
+        timestep_post_act="gelu",
+        conv_in_kernel=1,
+        conv_out_kernel=1,
+    )
+
+    return unet
+
+
+def main(args):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    orig_config_path = huggingface_hub.hf_hub_download(UPSCALER_REPO, "config_laion_text_cond_latent_upscaler_2.json")
+    orig_weights_path = huggingface_hub.hf_hub_download(
+        UPSCALER_REPO, "laion_text_cond_latent_upscaler_2_1_00470000_slim.pth"
+    )
+    print(f"loading original model configuration from {orig_config_path}")
+    print(f"loading original model checkpoint from {orig_weights_path}")
+
+    print("converting to diffusers unet")
+    orig_config = K.config.load_config(open(orig_config_path))["model"]
+    model = unet_model_from_original_config(orig_config)
+
+    orig_checkpoint = torch.load(orig_weights_path, map_location=device)["model_ema"]
+    converted_checkpoint = unet_to_diffusers_checkpoint(model, orig_checkpoint)
+
+    model.load_state_dict(converted_checkpoint, strict=True)
+    model.save_pretrained(args.dump_path)
+    print(f"saving converted unet model in {args.dump_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    args = parser.parse_args()
+
+    main(args)
diff --git a/diffusers/scripts/convert_kakao_brain_unclip_to_diffusers.py b/diffusers/scripts/convert_kakao_brain_unclip_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..5135eaed5b98391733185122f7f147201f4e4591
--- /dev/null
+++ b/diffusers/scripts/convert_kakao_brain_unclip_to_diffusers.py
@@ -0,0 +1,1159 @@
+import argparse
+import tempfile
+
+import torch
+from accelerate import load_checkpoint_and_dispatch
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import UnCLIPPipeline, UNet2DConditionModel, UNet2DModel
+from diffusers.models.transformers.prior_transformer import PriorTransformer
+from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
+from diffusers.schedulers.scheduling_unclip import UnCLIPScheduler
+
+
+r"""
+Example - From the diffusers root directory:
+
+Download weights:
+```sh
+$ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/efdf6206d8ed593961593dc029a8affa/decoder-ckpt-step%3D01000000-of-01000000.ckpt
+$ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/4226b831ae0279020d134281f3c31590/improved-sr-ckpt-step%3D1.2M.ckpt
+$ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/85626483eaca9f581e2a78d31ff905ca/prior-ckpt-step%3D01000000-of-01000000.ckpt
+$ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/0b62380a75e56f073e2844ab5199153d/ViT-L-14_stats.th
+```
+
+Convert the model:
+```sh
+$ python scripts/convert_kakao_brain_unclip_to_diffusers.py \
+      --decoder_checkpoint_path ./decoder-ckpt-step\=01000000-of-01000000.ckpt \
+      --super_res_unet_checkpoint_path ./improved-sr-ckpt-step\=1.2M.ckpt \
+      --prior_checkpoint_path ./prior-ckpt-step\=01000000-of-01000000.ckpt \
+      --clip_stat_path ./ViT-L-14_stats.th \
+      --dump_path <path where to save model>
+```
+"""
+
+
+# prior
+
+PRIOR_ORIGINAL_PREFIX = "model"
+
+# Uses default arguments
+PRIOR_CONFIG = {}
+
+
+def prior_model_from_original_config():
+    model = PriorTransformer(**PRIOR_CONFIG)
+
+    return model
+
+
+def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint, clip_stats_checkpoint):
+    diffusers_checkpoint = {}
+
+    # <original>.time_embed.0 -> <diffusers>.time_embedding.linear_1
+    diffusers_checkpoint.update(
+        {
+            "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.0.weight"],
+            "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.0.bias"],
+        }
+    )
+
+    # <original>.clip_img_proj -> <diffusers>.proj_in
+    diffusers_checkpoint.update(
+        {
+            "proj_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_img_proj.weight"],
+            "proj_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_img_proj.bias"],
+        }
+    )
+
+    # <original>.text_emb_proj -> <diffusers>.embedding_proj
+    diffusers_checkpoint.update(
+        {
+            "embedding_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_emb_proj.weight"],
+            "embedding_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_emb_proj.bias"],
+        }
+    )
+
+    # <original>.text_enc_proj -> <diffusers>.encoder_hidden_states_proj
+    diffusers_checkpoint.update(
+        {
+            "encoder_hidden_states_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_enc_proj.weight"],
+            "encoder_hidden_states_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_enc_proj.bias"],
+        }
+    )
+
+    # <original>.positional_embedding -> <diffusers>.positional_embedding
+    diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.positional_embedding"]})
+
+    # <original>.prd_emb -> <diffusers>.prd_embedding
+    diffusers_checkpoint.update({"prd_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.prd_emb"]})
+
+    # <original>.time_embed.2 -> <diffusers>.time_embedding.linear_2
+    diffusers_checkpoint.update(
+        {
+            "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.2.weight"],
+            "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.2.bias"],
+        }
+    )
+
+    # <original>.resblocks.<x> -> <diffusers>.transformer_blocks.<x>
+    for idx in range(len(model.transformer_blocks)):
+        diffusers_transformer_prefix = f"transformer_blocks.{idx}"
+        original_transformer_prefix = f"{PRIOR_ORIGINAL_PREFIX}.transformer.resblocks.{idx}"
+
+        # <original>.attn -> <diffusers>.attn1
+        diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1"
+        original_attention_prefix = f"{original_transformer_prefix}.attn"
+        diffusers_checkpoint.update(
+            prior_attention_to_diffusers(
+                checkpoint,
+                diffusers_attention_prefix=diffusers_attention_prefix,
+                original_attention_prefix=original_attention_prefix,
+                attention_head_dim=model.attention_head_dim,
+            )
+        )
+
+        # <original>.mlp -> <diffusers>.ff
+        diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff"
+        original_ff_prefix = f"{original_transformer_prefix}.mlp"
+        diffusers_checkpoint.update(
+            prior_ff_to_diffusers(
+                checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix
+            )
+        )
+
+        # <original>.ln_1 -> <diffusers>.norm1
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[
+                    f"{original_transformer_prefix}.ln_1.weight"
+                ],
+                f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"],
+            }
+        )
+
+        # <original>.ln_2 -> <diffusers>.norm3
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[
+                    f"{original_transformer_prefix}.ln_2.weight"
+                ],
+                f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"],
+            }
+        )
+
+    # <original>.final_ln -> <diffusers>.norm_out
+    diffusers_checkpoint.update(
+        {
+            "norm_out.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.final_ln.weight"],
+            "norm_out.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.final_ln.bias"],
+        }
+    )
+
+    # <original>.out_proj -> <diffusers>.proj_to_clip_embeddings
+    diffusers_checkpoint.update(
+        {
+            "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.out_proj.weight"],
+            "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.out_proj.bias"],
+        }
+    )
+
+    # clip stats
+    clip_mean, clip_std = clip_stats_checkpoint
+    clip_mean = clip_mean[None, :]
+    clip_std = clip_std[None, :]
+
+    diffusers_checkpoint.update({"clip_mean": clip_mean, "clip_std": clip_std})
+
+    return diffusers_checkpoint
+
+
+def prior_attention_to_diffusers(
+    checkpoint, *, diffusers_attention_prefix, original_attention_prefix, attention_head_dim
+):
+    diffusers_checkpoint = {}
+
+    # <original>.c_qkv -> <diffusers>.{to_q, to_k, to_v}
+    [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions(
+        weight=checkpoint[f"{original_attention_prefix}.c_qkv.weight"],
+        bias=checkpoint[f"{original_attention_prefix}.c_qkv.bias"],
+        split=3,
+        chunk_size=attention_head_dim,
+    )
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.to_q.weight": q_weight,
+            f"{diffusers_attention_prefix}.to_q.bias": q_bias,
+            f"{diffusers_attention_prefix}.to_k.weight": k_weight,
+            f"{diffusers_attention_prefix}.to_k.bias": k_bias,
+            f"{diffusers_attention_prefix}.to_v.weight": v_weight,
+            f"{diffusers_attention_prefix}.to_v.bias": v_bias,
+        }
+    )
+
+    # <original>.c_proj -> <diffusers>.to_out.0
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{original_attention_prefix}.c_proj.weight"],
+            f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{original_attention_prefix}.c_proj.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix):
+    diffusers_checkpoint = {
+        # <original>.c_fc -> <diffusers>.net.0.proj
+        f"{diffusers_ff_prefix}.net.{0}.proj.weight": checkpoint[f"{original_ff_prefix}.c_fc.weight"],
+        f"{diffusers_ff_prefix}.net.{0}.proj.bias": checkpoint[f"{original_ff_prefix}.c_fc.bias"],
+        # <original>.c_proj -> <diffusers>.net.2
+        f"{diffusers_ff_prefix}.net.{2}.weight": checkpoint[f"{original_ff_prefix}.c_proj.weight"],
+        f"{diffusers_ff_prefix}.net.{2}.bias": checkpoint[f"{original_ff_prefix}.c_proj.bias"],
+    }
+
+    return diffusers_checkpoint
+
+
+# done prior
+
+
+# decoder
+
+DECODER_ORIGINAL_PREFIX = "model"
+
+# We are hardcoding the model configuration for now. If we need to generalize to more model configurations, we can
+# update then.
+DECODER_CONFIG = {
+    "sample_size": 64,
+    "layers_per_block": 3,
+    "down_block_types": (
+        "ResnetDownsampleBlock2D",
+        "SimpleCrossAttnDownBlock2D",
+        "SimpleCrossAttnDownBlock2D",
+        "SimpleCrossAttnDownBlock2D",
+    ),
+    "up_block_types": (
+        "SimpleCrossAttnUpBlock2D",
+        "SimpleCrossAttnUpBlock2D",
+        "SimpleCrossAttnUpBlock2D",
+        "ResnetUpsampleBlock2D",
+    ),
+    "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+    "block_out_channels": (320, 640, 960, 1280),
+    "in_channels": 3,
+    "out_channels": 6,
+    "cross_attention_dim": 1536,
+    "class_embed_type": "identity",
+    "attention_head_dim": 64,
+    "resnet_time_scale_shift": "scale_shift",
+}
+
+
+def decoder_model_from_original_config():
+    model = UNet2DConditionModel(**DECODER_CONFIG)
+
+    return model
+
+
+def decoder_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    original_unet_prefix = DECODER_ORIGINAL_PREFIX
+    num_head_channels = DECODER_CONFIG["attention_head_dim"]
+
+    diffusers_checkpoint.update(unet_time_embeddings(checkpoint, original_unet_prefix))
+    diffusers_checkpoint.update(unet_conv_in(checkpoint, original_unet_prefix))
+
+    # <original>.input_blocks -> <diffusers>.down_blocks
+
+    original_down_block_idx = 1
+
+    for diffusers_down_block_idx in range(len(model.down_blocks)):
+        checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            diffusers_down_block_idx=diffusers_down_block_idx,
+            original_down_block_idx=original_down_block_idx,
+            original_unet_prefix=original_unet_prefix,
+            num_head_channels=num_head_channels,
+        )
+
+        original_down_block_idx += num_original_down_blocks
+
+        diffusers_checkpoint.update(checkpoint_update)
+
+    # done <original>.input_blocks -> <diffusers>.down_blocks
+
+    diffusers_checkpoint.update(
+        unet_midblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            original_unet_prefix=original_unet_prefix,
+            num_head_channels=num_head_channels,
+        )
+    )
+
+    # <original>.output_blocks -> <diffusers>.up_blocks
+
+    original_up_block_idx = 0
+
+    for diffusers_up_block_idx in range(len(model.up_blocks)):
+        checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            diffusers_up_block_idx=diffusers_up_block_idx,
+            original_up_block_idx=original_up_block_idx,
+            original_unet_prefix=original_unet_prefix,
+            num_head_channels=num_head_channels,
+        )
+
+        original_up_block_idx += num_original_up_blocks
+
+        diffusers_checkpoint.update(checkpoint_update)
+
+    # done <original>.output_blocks -> <diffusers>.up_blocks
+
+    diffusers_checkpoint.update(unet_conv_norm_out(checkpoint, original_unet_prefix))
+    diffusers_checkpoint.update(unet_conv_out(checkpoint, original_unet_prefix))
+
+    return diffusers_checkpoint
+
+
+# done decoder
+
+# text proj
+
+
+def text_proj_from_original_config():
+    # From the conditional unet constructor where the dimension of the projected time embeddings is
+    # constructed
+    time_embed_dim = DECODER_CONFIG["block_out_channels"][0] * 4
+
+    cross_attention_dim = DECODER_CONFIG["cross_attention_dim"]
+
+    model = UnCLIPTextProjModel(time_embed_dim=time_embed_dim, cross_attention_dim=cross_attention_dim)
+
+    return model
+
+
+# Note that the input checkpoint is the original decoder checkpoint
+def text_proj_original_checkpoint_to_diffusers_checkpoint(checkpoint):
+    diffusers_checkpoint = {
+        # <original>.text_seq_proj.0 -> <diffusers>.encoder_hidden_states_proj
+        "encoder_hidden_states_proj.weight": checkpoint[f"{DECODER_ORIGINAL_PREFIX}.text_seq_proj.0.weight"],
+        "encoder_hidden_states_proj.bias": checkpoint[f"{DECODER_ORIGINAL_PREFIX}.text_seq_proj.0.bias"],
+        # <original>.text_seq_proj.1 -> <diffusers>.text_encoder_hidden_states_norm
+        "text_encoder_hidden_states_norm.weight": checkpoint[f"{DECODER_ORIGINAL_PREFIX}.text_seq_proj.1.weight"],
+        "text_encoder_hidden_states_norm.bias": checkpoint[f"{DECODER_ORIGINAL_PREFIX}.text_seq_proj.1.bias"],
+        # <original>.clip_tok_proj -> <diffusers>.clip_extra_context_tokens_proj
+        "clip_extra_context_tokens_proj.weight": checkpoint[f"{DECODER_ORIGINAL_PREFIX}.clip_tok_proj.weight"],
+        "clip_extra_context_tokens_proj.bias": checkpoint[f"{DECODER_ORIGINAL_PREFIX}.clip_tok_proj.bias"],
+        # <original>.text_feat_proj -> <diffusers>.embedding_proj
+        "embedding_proj.weight": checkpoint[f"{DECODER_ORIGINAL_PREFIX}.text_feat_proj.weight"],
+        "embedding_proj.bias": checkpoint[f"{DECODER_ORIGINAL_PREFIX}.text_feat_proj.bias"],
+        # <original>.cf_param -> <diffusers>.learned_classifier_free_guidance_embeddings
+        "learned_classifier_free_guidance_embeddings": checkpoint[f"{DECODER_ORIGINAL_PREFIX}.cf_param"],
+        # <original>.clip_emb -> <diffusers>.clip_image_embeddings_project_to_time_embeddings
+        "clip_image_embeddings_project_to_time_embeddings.weight": checkpoint[
+            f"{DECODER_ORIGINAL_PREFIX}.clip_emb.weight"
+        ],
+        "clip_image_embeddings_project_to_time_embeddings.bias": checkpoint[
+            f"{DECODER_ORIGINAL_PREFIX}.clip_emb.bias"
+        ],
+    }
+
+    return diffusers_checkpoint
+
+
+# done text proj
+
+# super res unet first steps
+
+SUPER_RES_UNET_FIRST_STEPS_PREFIX = "model_first_steps"
+
+SUPER_RES_UNET_FIRST_STEPS_CONFIG = {
+    "sample_size": 256,
+    "layers_per_block": 3,
+    "down_block_types": (
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+    ),
+    "up_block_types": (
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+    ),
+    "block_out_channels": (320, 640, 960, 1280),
+    "in_channels": 6,
+    "out_channels": 3,
+    "add_attention": False,
+}
+
+
+def super_res_unet_first_steps_model_from_original_config():
+    model = UNet2DModel(**SUPER_RES_UNET_FIRST_STEPS_CONFIG)
+
+    return model
+
+
+def super_res_unet_first_steps_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    original_unet_prefix = SUPER_RES_UNET_FIRST_STEPS_PREFIX
+
+    diffusers_checkpoint.update(unet_time_embeddings(checkpoint, original_unet_prefix))
+    diffusers_checkpoint.update(unet_conv_in(checkpoint, original_unet_prefix))
+
+    # <original>.input_blocks -> <diffusers>.down_blocks
+
+    original_down_block_idx = 1
+
+    for diffusers_down_block_idx in range(len(model.down_blocks)):
+        checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            diffusers_down_block_idx=diffusers_down_block_idx,
+            original_down_block_idx=original_down_block_idx,
+            original_unet_prefix=original_unet_prefix,
+            num_head_channels=None,
+        )
+
+        original_down_block_idx += num_original_down_blocks
+
+        diffusers_checkpoint.update(checkpoint_update)
+
+    diffusers_checkpoint.update(
+        unet_midblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            original_unet_prefix=original_unet_prefix,
+            num_head_channels=None,
+        )
+    )
+
+    # <original>.output_blocks -> <diffusers>.up_blocks
+
+    original_up_block_idx = 0
+
+    for diffusers_up_block_idx in range(len(model.up_blocks)):
+        checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            diffusers_up_block_idx=diffusers_up_block_idx,
+            original_up_block_idx=original_up_block_idx,
+            original_unet_prefix=original_unet_prefix,
+            num_head_channels=None,
+        )
+
+        original_up_block_idx += num_original_up_blocks
+
+        diffusers_checkpoint.update(checkpoint_update)
+
+    # done <original>.output_blocks -> <diffusers>.up_blocks
+
+    diffusers_checkpoint.update(unet_conv_norm_out(checkpoint, original_unet_prefix))
+    diffusers_checkpoint.update(unet_conv_out(checkpoint, original_unet_prefix))
+
+    return diffusers_checkpoint
+
+
+# done super res unet first steps
+
+# super res unet last step
+
+SUPER_RES_UNET_LAST_STEP_PREFIX = "model_last_step"
+
+SUPER_RES_UNET_LAST_STEP_CONFIG = {
+    "sample_size": 256,
+    "layers_per_block": 3,
+    "down_block_types": (
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+        "ResnetDownsampleBlock2D",
+    ),
+    "up_block_types": (
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+        "ResnetUpsampleBlock2D",
+    ),
+    "block_out_channels": (320, 640, 960, 1280),
+    "in_channels": 6,
+    "out_channels": 3,
+    "add_attention": False,
+}
+
+
+def super_res_unet_last_step_model_from_original_config():
+    model = UNet2DModel(**SUPER_RES_UNET_LAST_STEP_CONFIG)
+
+    return model
+
+
+def super_res_unet_last_step_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    original_unet_prefix = SUPER_RES_UNET_LAST_STEP_PREFIX
+
+    diffusers_checkpoint.update(unet_time_embeddings(checkpoint, original_unet_prefix))
+    diffusers_checkpoint.update(unet_conv_in(checkpoint, original_unet_prefix))
+
+    # <original>.input_blocks -> <diffusers>.down_blocks
+
+    original_down_block_idx = 1
+
+    for diffusers_down_block_idx in range(len(model.down_blocks)):
+        checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            diffusers_down_block_idx=diffusers_down_block_idx,
+            original_down_block_idx=original_down_block_idx,
+            original_unet_prefix=original_unet_prefix,
+            num_head_channels=None,
+        )
+
+        original_down_block_idx += num_original_down_blocks
+
+        diffusers_checkpoint.update(checkpoint_update)
+
+    diffusers_checkpoint.update(
+        unet_midblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            original_unet_prefix=original_unet_prefix,
+            num_head_channels=None,
+        )
+    )
+
+    # <original>.output_blocks -> <diffusers>.up_blocks
+
+    original_up_block_idx = 0
+
+    for diffusers_up_block_idx in range(len(model.up_blocks)):
+        checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            diffusers_up_block_idx=diffusers_up_block_idx,
+            original_up_block_idx=original_up_block_idx,
+            original_unet_prefix=original_unet_prefix,
+            num_head_channels=None,
+        )
+
+        original_up_block_idx += num_original_up_blocks
+
+        diffusers_checkpoint.update(checkpoint_update)
+
+    # done <original>.output_blocks -> <diffusers>.up_blocks
+
+    diffusers_checkpoint.update(unet_conv_norm_out(checkpoint, original_unet_prefix))
+    diffusers_checkpoint.update(unet_conv_out(checkpoint, original_unet_prefix))
+
+    return diffusers_checkpoint
+
+
+# done super res unet last step
+
+
+# unet utils
+
+
+# <original>.time_embed -> <diffusers>.time_embedding
+def unet_time_embeddings(checkpoint, original_unet_prefix):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(
+        {
+            "time_embedding.linear_1.weight": checkpoint[f"{original_unet_prefix}.time_embed.0.weight"],
+            "time_embedding.linear_1.bias": checkpoint[f"{original_unet_prefix}.time_embed.0.bias"],
+            "time_embedding.linear_2.weight": checkpoint[f"{original_unet_prefix}.time_embed.2.weight"],
+            "time_embedding.linear_2.bias": checkpoint[f"{original_unet_prefix}.time_embed.2.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# <original>.input_blocks.0 -> <diffusers>.conv_in
+def unet_conv_in(checkpoint, original_unet_prefix):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(
+        {
+            "conv_in.weight": checkpoint[f"{original_unet_prefix}.input_blocks.0.0.weight"],
+            "conv_in.bias": checkpoint[f"{original_unet_prefix}.input_blocks.0.0.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# <original>.out.0 -> <diffusers>.conv_norm_out
+def unet_conv_norm_out(checkpoint, original_unet_prefix):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(
+        {
+            "conv_norm_out.weight": checkpoint[f"{original_unet_prefix}.out.0.weight"],
+            "conv_norm_out.bias": checkpoint[f"{original_unet_prefix}.out.0.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# <original>.out.2 -> <diffusers>.conv_out
+def unet_conv_out(checkpoint, original_unet_prefix):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(
+        {
+            "conv_out.weight": checkpoint[f"{original_unet_prefix}.out.2.weight"],
+            "conv_out.bias": checkpoint[f"{original_unet_prefix}.out.2.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# <original>.input_blocks -> <diffusers>.down_blocks
+def unet_downblock_to_diffusers_checkpoint(
+    model, checkpoint, *, diffusers_down_block_idx, original_down_block_idx, original_unet_prefix, num_head_channels
+):
+    diffusers_checkpoint = {}
+
+    diffusers_resnet_prefix = f"down_blocks.{diffusers_down_block_idx}.resnets"
+    original_down_block_prefix = f"{original_unet_prefix}.input_blocks"
+
+    down_block = model.down_blocks[diffusers_down_block_idx]
+
+    num_resnets = len(down_block.resnets)
+
+    if down_block.downsamplers is None:
+        downsampler = False
+    else:
+        assert len(down_block.downsamplers) == 1
+        downsampler = True
+        # The downsample block is also a resnet
+        num_resnets += 1
+
+    for resnet_idx_inc in range(num_resnets):
+        full_resnet_prefix = f"{original_down_block_prefix}.{original_down_block_idx + resnet_idx_inc}.0"
+
+        if downsampler and resnet_idx_inc == num_resnets - 1:
+            # this is a downsample block
+            full_diffusers_resnet_prefix = f"down_blocks.{diffusers_down_block_idx}.downsamplers.0"
+        else:
+            # this is a regular resnet block
+            full_diffusers_resnet_prefix = f"{diffusers_resnet_prefix}.{resnet_idx_inc}"
+
+        diffusers_checkpoint.update(
+            resnet_to_diffusers_checkpoint(
+                checkpoint, resnet_prefix=full_resnet_prefix, diffusers_resnet_prefix=full_diffusers_resnet_prefix
+            )
+        )
+
+    if hasattr(down_block, "attentions"):
+        num_attentions = len(down_block.attentions)
+        diffusers_attention_prefix = f"down_blocks.{diffusers_down_block_idx}.attentions"
+
+        for attention_idx_inc in range(num_attentions):
+            full_attention_prefix = f"{original_down_block_prefix}.{original_down_block_idx + attention_idx_inc}.1"
+            full_diffusers_attention_prefix = f"{diffusers_attention_prefix}.{attention_idx_inc}"
+
+            diffusers_checkpoint.update(
+                attention_to_diffusers_checkpoint(
+                    checkpoint,
+                    attention_prefix=full_attention_prefix,
+                    diffusers_attention_prefix=full_diffusers_attention_prefix,
+                    num_head_channels=num_head_channels,
+                )
+            )
+
+    num_original_down_blocks = num_resnets
+
+    return diffusers_checkpoint, num_original_down_blocks
+
+
+# <original>.middle_block -> <diffusers>.mid_block
+def unet_midblock_to_diffusers_checkpoint(model, checkpoint, *, original_unet_prefix, num_head_channels):
+    diffusers_checkpoint = {}
+
+    # block 0
+
+    original_block_idx = 0
+
+    diffusers_checkpoint.update(
+        resnet_to_diffusers_checkpoint(
+            checkpoint,
+            diffusers_resnet_prefix="mid_block.resnets.0",
+            resnet_prefix=f"{original_unet_prefix}.middle_block.{original_block_idx}",
+        )
+    )
+
+    original_block_idx += 1
+
+    # optional block 1
+
+    if hasattr(model.mid_block, "attentions") and model.mid_block.attentions[0] is not None:
+        diffusers_checkpoint.update(
+            attention_to_diffusers_checkpoint(
+                checkpoint,
+                diffusers_attention_prefix="mid_block.attentions.0",
+                attention_prefix=f"{original_unet_prefix}.middle_block.{original_block_idx}",
+                num_head_channels=num_head_channels,
+            )
+        )
+        original_block_idx += 1
+
+    # block 1 or block 2
+
+    diffusers_checkpoint.update(
+        resnet_to_diffusers_checkpoint(
+            checkpoint,
+            diffusers_resnet_prefix="mid_block.resnets.1",
+            resnet_prefix=f"{original_unet_prefix}.middle_block.{original_block_idx}",
+        )
+    )
+
+    return diffusers_checkpoint
+
+
+# <original>.output_blocks -> <diffusers>.up_blocks
+def unet_upblock_to_diffusers_checkpoint(
+    model, checkpoint, *, diffusers_up_block_idx, original_up_block_idx, original_unet_prefix, num_head_channels
+):
+    diffusers_checkpoint = {}
+
+    diffusers_resnet_prefix = f"up_blocks.{diffusers_up_block_idx}.resnets"
+    original_up_block_prefix = f"{original_unet_prefix}.output_blocks"
+
+    up_block = model.up_blocks[diffusers_up_block_idx]
+
+    num_resnets = len(up_block.resnets)
+
+    if up_block.upsamplers is None:
+        upsampler = False
+    else:
+        assert len(up_block.upsamplers) == 1
+        upsampler = True
+        # The upsample block is also a resnet
+        num_resnets += 1
+
+    has_attentions = hasattr(up_block, "attentions")
+
+    for resnet_idx_inc in range(num_resnets):
+        if upsampler and resnet_idx_inc == num_resnets - 1:
+            # this is an upsample block
+            if has_attentions:
+                # There is a middle attention block that we skip
+                original_resnet_block_idx = 2
+            else:
+                original_resnet_block_idx = 1
+
+            # we add the `minus 1` because the last two resnets are stuck together in the same output block
+            full_resnet_prefix = (
+                f"{original_up_block_prefix}.{original_up_block_idx + resnet_idx_inc - 1}.{original_resnet_block_idx}"
+            )
+
+            full_diffusers_resnet_prefix = f"up_blocks.{diffusers_up_block_idx}.upsamplers.0"
+        else:
+            # this is a regular resnet block
+            full_resnet_prefix = f"{original_up_block_prefix}.{original_up_block_idx + resnet_idx_inc}.0"
+            full_diffusers_resnet_prefix = f"{diffusers_resnet_prefix}.{resnet_idx_inc}"
+
+        diffusers_checkpoint.update(
+            resnet_to_diffusers_checkpoint(
+                checkpoint, resnet_prefix=full_resnet_prefix, diffusers_resnet_prefix=full_diffusers_resnet_prefix
+            )
+        )
+
+    if has_attentions:
+        num_attentions = len(up_block.attentions)
+        diffusers_attention_prefix = f"up_blocks.{diffusers_up_block_idx}.attentions"
+
+        for attention_idx_inc in range(num_attentions):
+            full_attention_prefix = f"{original_up_block_prefix}.{original_up_block_idx + attention_idx_inc}.1"
+            full_diffusers_attention_prefix = f"{diffusers_attention_prefix}.{attention_idx_inc}"
+
+            diffusers_checkpoint.update(
+                attention_to_diffusers_checkpoint(
+                    checkpoint,
+                    attention_prefix=full_attention_prefix,
+                    diffusers_attention_prefix=full_diffusers_attention_prefix,
+                    num_head_channels=num_head_channels,
+                )
+            )
+
+    num_original_down_blocks = num_resnets - 1 if upsampler else num_resnets
+
+    return diffusers_checkpoint, num_original_down_blocks
+
+
+def resnet_to_diffusers_checkpoint(checkpoint, *, diffusers_resnet_prefix, resnet_prefix):
+    diffusers_checkpoint = {
+        f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.in_layers.0.weight"],
+        f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.in_layers.0.bias"],
+        f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.in_layers.2.weight"],
+        f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.in_layers.2.bias"],
+        f"{diffusers_resnet_prefix}.time_emb_proj.weight": checkpoint[f"{resnet_prefix}.emb_layers.1.weight"],
+        f"{diffusers_resnet_prefix}.time_emb_proj.bias": checkpoint[f"{resnet_prefix}.emb_layers.1.bias"],
+        f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.out_layers.0.weight"],
+        f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.out_layers.0.bias"],
+        f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.out_layers.3.weight"],
+        f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.out_layers.3.bias"],
+    }
+
+    skip_connection_prefix = f"{resnet_prefix}.skip_connection"
+
+    if f"{skip_connection_prefix}.weight" in checkpoint:
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{skip_connection_prefix}.weight"],
+                f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{skip_connection_prefix}.bias"],
+            }
+        )
+
+    return diffusers_checkpoint
+
+
+def attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix, num_head_channels):
+    diffusers_checkpoint = {}
+
+    # <original>.norm -> <diffusers>.group_norm
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"],
+            f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"],
+        }
+    )
+
+    # <original>.qkv -> <diffusers>.{query, key, value}
+    [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions(
+        weight=checkpoint[f"{attention_prefix}.qkv.weight"][:, :, 0],
+        bias=checkpoint[f"{attention_prefix}.qkv.bias"],
+        split=3,
+        chunk_size=num_head_channels,
+    )
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.to_q.weight": q_weight,
+            f"{diffusers_attention_prefix}.to_q.bias": q_bias,
+            f"{diffusers_attention_prefix}.to_k.weight": k_weight,
+            f"{diffusers_attention_prefix}.to_k.bias": k_bias,
+            f"{diffusers_attention_prefix}.to_v.weight": v_weight,
+            f"{diffusers_attention_prefix}.to_v.bias": v_bias,
+        }
+    )
+
+    # <original>.encoder_kv -> <diffusers>.{context_key, context_value}
+    [encoder_k_weight, encoder_v_weight], [encoder_k_bias, encoder_v_bias] = split_attentions(
+        weight=checkpoint[f"{attention_prefix}.encoder_kv.weight"][:, :, 0],
+        bias=checkpoint[f"{attention_prefix}.encoder_kv.bias"],
+        split=2,
+        chunk_size=num_head_channels,
+    )
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.add_k_proj.weight": encoder_k_weight,
+            f"{diffusers_attention_prefix}.add_k_proj.bias": encoder_k_bias,
+            f"{diffusers_attention_prefix}.add_v_proj.weight": encoder_v_weight,
+            f"{diffusers_attention_prefix}.add_v_proj.bias": encoder_v_bias,
+        }
+    )
+
+    # <original>.proj_out (1d conv) -> <diffusers>.proj_attn (linear)
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][
+                :, :, 0
+            ],
+            f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?)
+def split_attentions(*, weight, bias, split, chunk_size):
+    weights = [None] * split
+    biases = [None] * split
+
+    weights_biases_idx = 0
+
+    for starting_row_index in range(0, weight.shape[0], chunk_size):
+        row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size)
+
+        weight_rows = weight[row_indices, :]
+        bias_rows = bias[row_indices]
+
+        if weights[weights_biases_idx] is None:
+            assert weights[weights_biases_idx] is None
+            weights[weights_biases_idx] = weight_rows
+            biases[weights_biases_idx] = bias_rows
+        else:
+            assert weights[weights_biases_idx] is not None
+            weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows])
+            biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows])
+
+        weights_biases_idx = (weights_biases_idx + 1) % split
+
+    return weights, biases
+
+
+# done unet utils
+
+
+# Driver functions
+
+
+def text_encoder():
+    print("loading CLIP text encoder")
+
+    clip_name = "openai/clip-vit-large-patch14"
+
+    # sets pad_value to 0
+    pad_token = "!"
+
+    tokenizer_model = CLIPTokenizer.from_pretrained(clip_name, pad_token=pad_token, device_map="auto")
+
+    assert tokenizer_model.convert_tokens_to_ids(pad_token) == 0
+
+    text_encoder_model = CLIPTextModelWithProjection.from_pretrained(
+        clip_name,
+        # `CLIPTextModel` does not support device_map="auto"
+        # device_map="auto"
+    )
+
+    print("done loading CLIP text encoder")
+
+    return text_encoder_model, tokenizer_model
+
+
+def prior(*, args, checkpoint_map_location):
+    print("loading prior")
+
+    prior_checkpoint = torch.load(args.prior_checkpoint_path, map_location=checkpoint_map_location)
+    prior_checkpoint = prior_checkpoint["state_dict"]
+
+    clip_stats_checkpoint = torch.load(args.clip_stat_path, map_location=checkpoint_map_location)
+
+    prior_model = prior_model_from_original_config()
+
+    prior_diffusers_checkpoint = prior_original_checkpoint_to_diffusers_checkpoint(
+        prior_model, prior_checkpoint, clip_stats_checkpoint
+    )
+
+    del prior_checkpoint
+    del clip_stats_checkpoint
+
+    load_checkpoint_to_model(prior_diffusers_checkpoint, prior_model, strict=True)
+
+    print("done loading prior")
+
+    return prior_model
+
+
+def decoder(*, args, checkpoint_map_location):
+    print("loading decoder")
+
+    decoder_checkpoint = torch.load(args.decoder_checkpoint_path, map_location=checkpoint_map_location)
+    decoder_checkpoint = decoder_checkpoint["state_dict"]
+
+    decoder_model = decoder_model_from_original_config()
+
+    decoder_diffusers_checkpoint = decoder_original_checkpoint_to_diffusers_checkpoint(
+        decoder_model, decoder_checkpoint
+    )
+
+    # text proj interlude
+
+    # The original decoder implementation includes a set of parameters that are used
+    # for creating the `encoder_hidden_states` which are what the U-net is conditioned
+    # on. The diffusers conditional unet directly takes the encoder_hidden_states. We pull
+    # the parameters into the UnCLIPTextProjModel class
+    text_proj_model = text_proj_from_original_config()
+
+    text_proj_checkpoint = text_proj_original_checkpoint_to_diffusers_checkpoint(decoder_checkpoint)
+
+    load_checkpoint_to_model(text_proj_checkpoint, text_proj_model, strict=True)
+
+    # done text proj interlude
+
+    del decoder_checkpoint
+
+    load_checkpoint_to_model(decoder_diffusers_checkpoint, decoder_model, strict=True)
+
+    print("done loading decoder")
+
+    return decoder_model, text_proj_model
+
+
+def super_res_unet(*, args, checkpoint_map_location):
+    print("loading super resolution unet")
+
+    super_res_checkpoint = torch.load(args.super_res_unet_checkpoint_path, map_location=checkpoint_map_location)
+    super_res_checkpoint = super_res_checkpoint["state_dict"]
+
+    # model_first_steps
+
+    super_res_first_model = super_res_unet_first_steps_model_from_original_config()
+
+    super_res_first_steps_checkpoint = super_res_unet_first_steps_original_checkpoint_to_diffusers_checkpoint(
+        super_res_first_model, super_res_checkpoint
+    )
+
+    # model_last_step
+    super_res_last_model = super_res_unet_last_step_model_from_original_config()
+
+    super_res_last_step_checkpoint = super_res_unet_last_step_original_checkpoint_to_diffusers_checkpoint(
+        super_res_last_model, super_res_checkpoint
+    )
+
+    del super_res_checkpoint
+
+    load_checkpoint_to_model(super_res_first_steps_checkpoint, super_res_first_model, strict=True)
+
+    load_checkpoint_to_model(super_res_last_step_checkpoint, super_res_last_model, strict=True)
+
+    print("done loading super resolution unet")
+
+    return super_res_first_model, super_res_last_model
+
+
+def load_checkpoint_to_model(checkpoint, model, strict=False):
+    with tempfile.NamedTemporaryFile() as file:
+        torch.save(checkpoint, file.name)
+        del checkpoint
+        if strict:
+            model.load_state_dict(torch.load(file.name), strict=True)
+        else:
+            load_checkpoint_and_dispatch(model, file.name, device_map="auto")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    parser.add_argument(
+        "--prior_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the prior checkpoint to convert.",
+    )
+
+    parser.add_argument(
+        "--decoder_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the decoder checkpoint to convert.",
+    )
+
+    parser.add_argument(
+        "--super_res_unet_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the super resolution checkpoint to convert.",
+    )
+
+    parser.add_argument(
+        "--clip_stat_path", default=None, type=str, required=True, help="Path to the clip stats checkpoint to convert."
+    )
+
+    parser.add_argument(
+        "--checkpoint_load_device",
+        default="cpu",
+        type=str,
+        required=False,
+        help="The device passed to `map_location` when loading checkpoints.",
+    )
+
+    parser.add_argument(
+        "--debug",
+        default=None,
+        type=str,
+        required=False,
+        help="Only run a specific stage of the convert script. Used for debugging",
+    )
+
+    args = parser.parse_args()
+
+    print(f"loading checkpoints to {args.checkpoint_load_device}")
+
+    checkpoint_map_location = torch.device(args.checkpoint_load_device)
+
+    if args.debug is not None:
+        print(f"debug: only executing {args.debug}")
+
+    if args.debug is None:
+        text_encoder_model, tokenizer_model = text_encoder()
+
+        prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location)
+
+        decoder_model, text_proj_model = decoder(args=args, checkpoint_map_location=checkpoint_map_location)
+
+        super_res_first_model, super_res_last_model = super_res_unet(
+            args=args, checkpoint_map_location=checkpoint_map_location
+        )
+
+        prior_scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="sample",
+            num_train_timesteps=1000,
+            clip_sample_range=5.0,
+        )
+
+        decoder_scheduler = UnCLIPScheduler(
+            variance_type="learned_range",
+            prediction_type="epsilon",
+            num_train_timesteps=1000,
+        )
+
+        super_res_scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="epsilon",
+            num_train_timesteps=1000,
+        )
+
+        print(f"saving Kakao Brain unCLIP to {args.dump_path}")
+
+        pipe = UnCLIPPipeline(
+            prior=prior_model,
+            decoder=decoder_model,
+            text_proj=text_proj_model,
+            tokenizer=tokenizer_model,
+            text_encoder=text_encoder_model,
+            super_res_first=super_res_first_model,
+            super_res_last=super_res_last_model,
+            prior_scheduler=prior_scheduler,
+            decoder_scheduler=decoder_scheduler,
+            super_res_scheduler=super_res_scheduler,
+        )
+        pipe.save_pretrained(args.dump_path)
+
+        print("done writing Kakao Brain unCLIP")
+    elif args.debug == "text_encoder":
+        text_encoder_model, tokenizer_model = text_encoder()
+    elif args.debug == "prior":
+        prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location)
+    elif args.debug == "decoder":
+        decoder_model, text_proj_model = decoder(args=args, checkpoint_map_location=checkpoint_map_location)
+    elif args.debug == "super_res_unet":
+        super_res_first_model, super_res_last_model = super_res_unet(
+            args=args, checkpoint_map_location=checkpoint_map_location
+        )
+    else:
+        raise ValueError(f"unknown debug value : {args.debug}")
diff --git a/diffusers/scripts/convert_kandinsky3_unet.py b/diffusers/scripts/convert_kandinsky3_unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fe8c54eb7fc17150a6d927aeefd5ca21dce9636
--- /dev/null
+++ b/diffusers/scripts/convert_kandinsky3_unet.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+import argparse
+import fnmatch
+
+from safetensors.torch import load_file
+
+from diffusers import Kandinsky3UNet
+
+
+MAPPING = {
+    "to_time_embed.1": "time_embedding.linear_1",
+    "to_time_embed.3": "time_embedding.linear_2",
+    "in_layer": "conv_in",
+    "out_layer.0": "conv_norm_out",
+    "out_layer.2": "conv_out",
+    "down_samples": "down_blocks",
+    "up_samples": "up_blocks",
+    "projection_lin": "encoder_hid_proj.projection_linear",
+    "projection_ln": "encoder_hid_proj.projection_norm",
+    "feature_pooling": "add_time_condition",
+    "to_query": "to_q",
+    "to_key": "to_k",
+    "to_value": "to_v",
+    "output_layer": "to_out.0",
+    "self_attention_block": "attentions.0",
+}
+
+DYNAMIC_MAP = {
+    "resnet_attn_blocks.*.0": "resnets_in.*",
+    "resnet_attn_blocks.*.1": ("attentions.*", 1),
+    "resnet_attn_blocks.*.2": "resnets_out.*",
+}
+# MAPPING = {}
+
+
+def convert_state_dict(unet_state_dict):
+    """
+    Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
+    Args:
+        unet_model (torch.nn.Module): The original U-Net model.
+        unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet model to match keys with.
+
+    Returns:
+        OrderedDict: The converted state dictionary.
+    """
+    # Example of renaming logic (this will vary based on your model's architecture)
+    converted_state_dict = {}
+    for key in unet_state_dict:
+        new_key = key
+        for pattern, new_pattern in MAPPING.items():
+            new_key = new_key.replace(pattern, new_pattern)
+
+        for dyn_pattern, dyn_new_pattern in DYNAMIC_MAP.items():
+            has_matched = False
+            if fnmatch.fnmatch(new_key, f"*.{dyn_pattern}.*") and not has_matched:
+                star = int(new_key.split(dyn_pattern.split(".")[0])[-1].split(".")[1])
+
+                if isinstance(dyn_new_pattern, tuple):
+                    new_star = star + dyn_new_pattern[-1]
+                    dyn_new_pattern = dyn_new_pattern[0]
+                else:
+                    new_star = star
+
+                pattern = dyn_pattern.replace("*", str(star))
+                new_pattern = dyn_new_pattern.replace("*", str(new_star))
+
+                new_key = new_key.replace(pattern, new_pattern)
+                has_matched = True
+
+        converted_state_dict[new_key] = unet_state_dict[key]
+
+    return converted_state_dict
+
+
+def main(model_path, output_path):
+    # Load your original U-Net model
+    unet_state_dict = load_file(model_path)
+
+    # Initialize your Kandinsky3UNet model
+    config = {}
+
+    # Convert the state dict
+    converted_state_dict = convert_state_dict(unet_state_dict)
+
+    unet = Kandinsky3UNet(config)
+    unet.load_state_dict(converted_state_dict)
+
+    unet.save_pretrained(output_path)
+    print(f"Converted model saved to {output_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert U-Net PyTorch model to Kandinsky3UNet format")
+    parser.add_argument("--model_path", type=str, required=True, help="Path to the original U-Net PyTorch model")
+    parser.add_argument("--output_path", type=str, required=True, help="Path to save the converted model")
+
+    args = parser.parse_args()
+    main(args.model_path, args.output_path)
diff --git a/diffusers/scripts/convert_kandinsky_to_diffusers.py b/diffusers/scripts/convert_kandinsky_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d3f7b63d0c1075679efc4b0322b6ae37c5acd6c
--- /dev/null
+++ b/diffusers/scripts/convert_kandinsky_to_diffusers.py
@@ -0,0 +1,1411 @@
+import argparse
+import os
+import tempfile
+
+import torch
+from accelerate import load_checkpoint_and_dispatch
+
+from diffusers import UNet2DConditionModel
+from diffusers.models.transformers.prior_transformer import PriorTransformer
+from diffusers.models.vq_model import VQModel
+
+
+"""
+Example - From the diffusers root directory:
+
+Download weights:
+```sh
+$ wget https://huggingface.co/ai-forever/Kandinsky_2.1/blob/main/prior_fp16.ckpt
+```
+
+Convert the model:
+```sh
+python scripts/convert_kandinsky_to_diffusers.py \
+      --prior_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/prior_fp16.ckpt \
+      --clip_stat_path  /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/ViT-L-14_stats.th \
+      --text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/decoder_fp16.ckpt \
+      --inpaint_text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/inpainting_fp16.ckpt \
+      --movq_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/movq_final.ckpt \
+      --dump_path /home/yiyi_huggingface_co/dump \
+      --debug decoder
+```
+"""
+
+
+# prior
+
+PRIOR_ORIGINAL_PREFIX = "model"
+
+# Uses default arguments
+PRIOR_CONFIG = {}
+
+
+def prior_model_from_original_config():
+    model = PriorTransformer(**PRIOR_CONFIG)
+
+    return model
+
+
+def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint, clip_stats_checkpoint):
+    diffusers_checkpoint = {}
+
+    # <original>.time_embed.0 -> <diffusers>.time_embedding.linear_1
+    diffusers_checkpoint.update(
+        {
+            "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.0.weight"],
+            "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.0.bias"],
+        }
+    )
+
+    # <original>.clip_img_proj -> <diffusers>.proj_in
+    diffusers_checkpoint.update(
+        {
+            "proj_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_img_proj.weight"],
+            "proj_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_img_proj.bias"],
+        }
+    )
+
+    # <original>.text_emb_proj -> <diffusers>.embedding_proj
+    diffusers_checkpoint.update(
+        {
+            "embedding_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_emb_proj.weight"],
+            "embedding_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_emb_proj.bias"],
+        }
+    )
+
+    # <original>.text_enc_proj -> <diffusers>.encoder_hidden_states_proj
+    diffusers_checkpoint.update(
+        {
+            "encoder_hidden_states_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_enc_proj.weight"],
+            "encoder_hidden_states_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_enc_proj.bias"],
+        }
+    )
+
+    # <original>.positional_embedding -> <diffusers>.positional_embedding
+    diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.positional_embedding"]})
+
+    # <original>.prd_emb -> <diffusers>.prd_embedding
+    diffusers_checkpoint.update({"prd_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.prd_emb"]})
+
+    # <original>.time_embed.2 -> <diffusers>.time_embedding.linear_2
+    diffusers_checkpoint.update(
+        {
+            "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.2.weight"],
+            "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.2.bias"],
+        }
+    )
+
+    # <original>.resblocks.<x> -> <diffusers>.transformer_blocks.<x>
+    for idx in range(len(model.transformer_blocks)):
+        diffusers_transformer_prefix = f"transformer_blocks.{idx}"
+        original_transformer_prefix = f"{PRIOR_ORIGINAL_PREFIX}.transformer.resblocks.{idx}"
+
+        # <original>.attn -> <diffusers>.attn1
+        diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1"
+        original_attention_prefix = f"{original_transformer_prefix}.attn"
+        diffusers_checkpoint.update(
+            prior_attention_to_diffusers(
+                checkpoint,
+                diffusers_attention_prefix=diffusers_attention_prefix,
+                original_attention_prefix=original_attention_prefix,
+                attention_head_dim=model.attention_head_dim,
+            )
+        )
+
+        # <original>.mlp -> <diffusers>.ff
+        diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff"
+        original_ff_prefix = f"{original_transformer_prefix}.mlp"
+        diffusers_checkpoint.update(
+            prior_ff_to_diffusers(
+                checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix
+            )
+        )
+
+        # <original>.ln_1 -> <diffusers>.norm1
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[
+                    f"{original_transformer_prefix}.ln_1.weight"
+                ],
+                f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"],
+            }
+        )
+
+        # <original>.ln_2 -> <diffusers>.norm3
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[
+                    f"{original_transformer_prefix}.ln_2.weight"
+                ],
+                f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"],
+            }
+        )
+
+    # <original>.final_ln -> <diffusers>.norm_out
+    diffusers_checkpoint.update(
+        {
+            "norm_out.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.final_ln.weight"],
+            "norm_out.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.final_ln.bias"],
+        }
+    )
+
+    # <original>.out_proj -> <diffusers>.proj_to_clip_embeddings
+    diffusers_checkpoint.update(
+        {
+            "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.out_proj.weight"],
+            "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.out_proj.bias"],
+        }
+    )
+
+    # clip stats
+    clip_mean, clip_std = clip_stats_checkpoint
+    clip_mean = clip_mean[None, :]
+    clip_std = clip_std[None, :]
+
+    diffusers_checkpoint.update({"clip_mean": clip_mean, "clip_std": clip_std})
+
+    return diffusers_checkpoint
+
+
+def prior_attention_to_diffusers(
+    checkpoint, *, diffusers_attention_prefix, original_attention_prefix, attention_head_dim
+):
+    diffusers_checkpoint = {}
+
+    # <original>.c_qkv -> <diffusers>.{to_q, to_k, to_v}
+    [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions(
+        weight=checkpoint[f"{original_attention_prefix}.c_qkv.weight"],
+        bias=checkpoint[f"{original_attention_prefix}.c_qkv.bias"],
+        split=3,
+        chunk_size=attention_head_dim,
+    )
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.to_q.weight": q_weight,
+            f"{diffusers_attention_prefix}.to_q.bias": q_bias,
+            f"{diffusers_attention_prefix}.to_k.weight": k_weight,
+            f"{diffusers_attention_prefix}.to_k.bias": k_bias,
+            f"{diffusers_attention_prefix}.to_v.weight": v_weight,
+            f"{diffusers_attention_prefix}.to_v.bias": v_bias,
+        }
+    )
+
+    # <original>.c_proj -> <diffusers>.to_out.0
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{original_attention_prefix}.c_proj.weight"],
+            f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{original_attention_prefix}.c_proj.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix):
+    diffusers_checkpoint = {
+        # <original>.c_fc -> <diffusers>.net.0.proj
+        f"{diffusers_ff_prefix}.net.{0}.proj.weight": checkpoint[f"{original_ff_prefix}.c_fc.weight"],
+        f"{diffusers_ff_prefix}.net.{0}.proj.bias": checkpoint[f"{original_ff_prefix}.c_fc.bias"],
+        # <original>.c_proj -> <diffusers>.net.2
+        f"{diffusers_ff_prefix}.net.{2}.weight": checkpoint[f"{original_ff_prefix}.c_proj.weight"],
+        f"{diffusers_ff_prefix}.net.{2}.bias": checkpoint[f"{original_ff_prefix}.c_proj.bias"],
+    }
+
+    return diffusers_checkpoint
+
+
+# done prior
+
+# unet
+
+# We are hardcoding the model configuration for now. If we need to generalize to more model configurations, we can
+# update then.
+
+UNET_CONFIG = {
+    "act_fn": "silu",
+    "addition_embed_type": "text_image",
+    "addition_embed_type_num_heads": 64,
+    "attention_head_dim": 64,
+    "block_out_channels": [384, 768, 1152, 1536],
+    "center_input_sample": False,
+    "class_embed_type": None,
+    "class_embeddings_concat": False,
+    "conv_in_kernel": 3,
+    "conv_out_kernel": 3,
+    "cross_attention_dim": 768,
+    "cross_attention_norm": None,
+    "down_block_types": [
+        "ResnetDownsampleBlock2D",
+        "SimpleCrossAttnDownBlock2D",
+        "SimpleCrossAttnDownBlock2D",
+        "SimpleCrossAttnDownBlock2D",
+    ],
+    "downsample_padding": 1,
+    "dual_cross_attention": False,
+    "encoder_hid_dim": 1024,
+    "encoder_hid_dim_type": "text_image_proj",
+    "flip_sin_to_cos": True,
+    "freq_shift": 0,
+    "in_channels": 4,
+    "layers_per_block": 3,
+    "mid_block_only_cross_attention": None,
+    "mid_block_scale_factor": 1,
+    "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+    "norm_eps": 1e-05,
+    "norm_num_groups": 32,
+    "num_class_embeds": None,
+    "only_cross_attention": False,
+    "out_channels": 8,
+    "projection_class_embeddings_input_dim": None,
+    "resnet_out_scale_factor": 1.0,
+    "resnet_skip_time_act": False,
+    "resnet_time_scale_shift": "scale_shift",
+    "sample_size": 64,
+    "time_cond_proj_dim": None,
+    "time_embedding_act_fn": None,
+    "time_embedding_dim": None,
+    "time_embedding_type": "positional",
+    "timestep_post_act": None,
+    "up_block_types": [
+        "SimpleCrossAttnUpBlock2D",
+        "SimpleCrossAttnUpBlock2D",
+        "SimpleCrossAttnUpBlock2D",
+        "ResnetUpsampleBlock2D",
+    ],
+    "upcast_attention": False,
+    "use_linear_projection": False,
+}
+
+
+def unet_model_from_original_config():
+    model = UNet2DConditionModel(**UNET_CONFIG)
+
+    return model
+
+
+def unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    num_head_channels = UNET_CONFIG["attention_head_dim"]
+
+    diffusers_checkpoint.update(unet_time_embeddings(checkpoint))
+    diffusers_checkpoint.update(unet_conv_in(checkpoint))
+    diffusers_checkpoint.update(unet_add_embedding(checkpoint))
+    diffusers_checkpoint.update(unet_encoder_hid_proj(checkpoint))
+
+    # <original>.input_blocks -> <diffusers>.down_blocks
+
+    original_down_block_idx = 1
+
+    for diffusers_down_block_idx in range(len(model.down_blocks)):
+        checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            diffusers_down_block_idx=diffusers_down_block_idx,
+            original_down_block_idx=original_down_block_idx,
+            num_head_channels=num_head_channels,
+        )
+
+        original_down_block_idx += num_original_down_blocks
+
+        diffusers_checkpoint.update(checkpoint_update)
+
+    # done <original>.input_blocks -> <diffusers>.down_blocks
+
+    diffusers_checkpoint.update(
+        unet_midblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            num_head_channels=num_head_channels,
+        )
+    )
+
+    # <original>.output_blocks -> <diffusers>.up_blocks
+
+    original_up_block_idx = 0
+
+    for diffusers_up_block_idx in range(len(model.up_blocks)):
+        checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            diffusers_up_block_idx=diffusers_up_block_idx,
+            original_up_block_idx=original_up_block_idx,
+            num_head_channels=num_head_channels,
+        )
+
+        original_up_block_idx += num_original_up_blocks
+
+        diffusers_checkpoint.update(checkpoint_update)
+
+    # done <original>.output_blocks -> <diffusers>.up_blocks
+
+    diffusers_checkpoint.update(unet_conv_norm_out(checkpoint))
+    diffusers_checkpoint.update(unet_conv_out(checkpoint))
+
+    return diffusers_checkpoint
+
+
+# done unet
+
+# inpaint unet
+
+# We are hardcoding the model configuration for now. If we need to generalize to more model configurations, we can
+# update then.
+
+INPAINT_UNET_CONFIG = {
+    "act_fn": "silu",
+    "addition_embed_type": "text_image",
+    "addition_embed_type_num_heads": 64,
+    "attention_head_dim": 64,
+    "block_out_channels": [384, 768, 1152, 1536],
+    "center_input_sample": False,
+    "class_embed_type": None,
+    "class_embeddings_concat": None,
+    "conv_in_kernel": 3,
+    "conv_out_kernel": 3,
+    "cross_attention_dim": 768,
+    "cross_attention_norm": None,
+    "down_block_types": [
+        "ResnetDownsampleBlock2D",
+        "SimpleCrossAttnDownBlock2D",
+        "SimpleCrossAttnDownBlock2D",
+        "SimpleCrossAttnDownBlock2D",
+    ],
+    "downsample_padding": 1,
+    "dual_cross_attention": False,
+    "encoder_hid_dim": 1024,
+    "encoder_hid_dim_type": "text_image_proj",
+    "flip_sin_to_cos": True,
+    "freq_shift": 0,
+    "in_channels": 9,
+    "layers_per_block": 3,
+    "mid_block_only_cross_attention": None,
+    "mid_block_scale_factor": 1,
+    "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+    "norm_eps": 1e-05,
+    "norm_num_groups": 32,
+    "num_class_embeds": None,
+    "only_cross_attention": False,
+    "out_channels": 8,
+    "projection_class_embeddings_input_dim": None,
+    "resnet_out_scale_factor": 1.0,
+    "resnet_skip_time_act": False,
+    "resnet_time_scale_shift": "scale_shift",
+    "sample_size": 64,
+    "time_cond_proj_dim": None,
+    "time_embedding_act_fn": None,
+    "time_embedding_dim": None,
+    "time_embedding_type": "positional",
+    "timestep_post_act": None,
+    "up_block_types": [
+        "SimpleCrossAttnUpBlock2D",
+        "SimpleCrossAttnUpBlock2D",
+        "SimpleCrossAttnUpBlock2D",
+        "ResnetUpsampleBlock2D",
+    ],
+    "upcast_attention": False,
+    "use_linear_projection": False,
+}
+
+
+def inpaint_unet_model_from_original_config():
+    model = UNet2DConditionModel(**INPAINT_UNET_CONFIG)
+
+    return model
+
+
+def inpaint_unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    num_head_channels = INPAINT_UNET_CONFIG["attention_head_dim"]
+
+    diffusers_checkpoint.update(unet_time_embeddings(checkpoint))
+    diffusers_checkpoint.update(unet_conv_in(checkpoint))
+    diffusers_checkpoint.update(unet_add_embedding(checkpoint))
+    diffusers_checkpoint.update(unet_encoder_hid_proj(checkpoint))
+
+    # <original>.input_blocks -> <diffusers>.down_blocks
+
+    original_down_block_idx = 1
+
+    for diffusers_down_block_idx in range(len(model.down_blocks)):
+        checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            diffusers_down_block_idx=diffusers_down_block_idx,
+            original_down_block_idx=original_down_block_idx,
+            num_head_channels=num_head_channels,
+        )
+
+        original_down_block_idx += num_original_down_blocks
+
+        diffusers_checkpoint.update(checkpoint_update)
+
+    # done <original>.input_blocks -> <diffusers>.down_blocks
+
+    diffusers_checkpoint.update(
+        unet_midblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            num_head_channels=num_head_channels,
+        )
+    )
+
+    # <original>.output_blocks -> <diffusers>.up_blocks
+
+    original_up_block_idx = 0
+
+    for diffusers_up_block_idx in range(len(model.up_blocks)):
+        checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint(
+            model,
+            checkpoint,
+            diffusers_up_block_idx=diffusers_up_block_idx,
+            original_up_block_idx=original_up_block_idx,
+            num_head_channels=num_head_channels,
+        )
+
+        original_up_block_idx += num_original_up_blocks
+
+        diffusers_checkpoint.update(checkpoint_update)
+
+    # done <original>.output_blocks -> <diffusers>.up_blocks
+
+    diffusers_checkpoint.update(unet_conv_norm_out(checkpoint))
+    diffusers_checkpoint.update(unet_conv_out(checkpoint))
+
+    return diffusers_checkpoint
+
+
+# done inpaint unet
+
+
+# unet utils
+
+
+# <original>.time_embed -> <diffusers>.time_embedding
+def unet_time_embeddings(checkpoint):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(
+        {
+            "time_embedding.linear_1.weight": checkpoint["time_embed.0.weight"],
+            "time_embedding.linear_1.bias": checkpoint["time_embed.0.bias"],
+            "time_embedding.linear_2.weight": checkpoint["time_embed.2.weight"],
+            "time_embedding.linear_2.bias": checkpoint["time_embed.2.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# <original>.input_blocks.0 -> <diffusers>.conv_in
+def unet_conv_in(checkpoint):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(
+        {
+            "conv_in.weight": checkpoint["input_blocks.0.0.weight"],
+            "conv_in.bias": checkpoint["input_blocks.0.0.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def unet_add_embedding(checkpoint):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(
+        {
+            "add_embedding.text_norm.weight": checkpoint["ln_model_n.weight"],
+            "add_embedding.text_norm.bias": checkpoint["ln_model_n.bias"],
+            "add_embedding.text_proj.weight": checkpoint["proj_n.weight"],
+            "add_embedding.text_proj.bias": checkpoint["proj_n.bias"],
+            "add_embedding.image_proj.weight": checkpoint["img_layer.weight"],
+            "add_embedding.image_proj.bias": checkpoint["img_layer.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def unet_encoder_hid_proj(checkpoint):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(
+        {
+            "encoder_hid_proj.image_embeds.weight": checkpoint["clip_to_seq.weight"],
+            "encoder_hid_proj.image_embeds.bias": checkpoint["clip_to_seq.bias"],
+            "encoder_hid_proj.text_proj.weight": checkpoint["to_model_dim_n.weight"],
+            "encoder_hid_proj.text_proj.bias": checkpoint["to_model_dim_n.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# <original>.out.0 -> <diffusers>.conv_norm_out
+def unet_conv_norm_out(checkpoint):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(
+        {
+            "conv_norm_out.weight": checkpoint["out.0.weight"],
+            "conv_norm_out.bias": checkpoint["out.0.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# <original>.out.2 -> <diffusers>.conv_out
+def unet_conv_out(checkpoint):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(
+        {
+            "conv_out.weight": checkpoint["out.2.weight"],
+            "conv_out.bias": checkpoint["out.2.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# <original>.input_blocks -> <diffusers>.down_blocks
+def unet_downblock_to_diffusers_checkpoint(
+    model, checkpoint, *, diffusers_down_block_idx, original_down_block_idx, num_head_channels
+):
+    diffusers_checkpoint = {}
+
+    diffusers_resnet_prefix = f"down_blocks.{diffusers_down_block_idx}.resnets"
+    original_down_block_prefix = "input_blocks"
+
+    down_block = model.down_blocks[diffusers_down_block_idx]
+
+    num_resnets = len(down_block.resnets)
+
+    if down_block.downsamplers is None:
+        downsampler = False
+    else:
+        assert len(down_block.downsamplers) == 1
+        downsampler = True
+        # The downsample block is also a resnet
+        num_resnets += 1
+
+    for resnet_idx_inc in range(num_resnets):
+        full_resnet_prefix = f"{original_down_block_prefix}.{original_down_block_idx + resnet_idx_inc}.0"
+
+        if downsampler and resnet_idx_inc == num_resnets - 1:
+            # this is a downsample block
+            full_diffusers_resnet_prefix = f"down_blocks.{diffusers_down_block_idx}.downsamplers.0"
+        else:
+            # this is a regular resnet block
+            full_diffusers_resnet_prefix = f"{diffusers_resnet_prefix}.{resnet_idx_inc}"
+
+        diffusers_checkpoint.update(
+            resnet_to_diffusers_checkpoint(
+                checkpoint, resnet_prefix=full_resnet_prefix, diffusers_resnet_prefix=full_diffusers_resnet_prefix
+            )
+        )
+
+    if hasattr(down_block, "attentions"):
+        num_attentions = len(down_block.attentions)
+        diffusers_attention_prefix = f"down_blocks.{diffusers_down_block_idx}.attentions"
+
+        for attention_idx_inc in range(num_attentions):
+            full_attention_prefix = f"{original_down_block_prefix}.{original_down_block_idx + attention_idx_inc}.1"
+            full_diffusers_attention_prefix = f"{diffusers_attention_prefix}.{attention_idx_inc}"
+
+            diffusers_checkpoint.update(
+                attention_to_diffusers_checkpoint(
+                    checkpoint,
+                    attention_prefix=full_attention_prefix,
+                    diffusers_attention_prefix=full_diffusers_attention_prefix,
+                    num_head_channels=num_head_channels,
+                )
+            )
+
+    num_original_down_blocks = num_resnets
+
+    return diffusers_checkpoint, num_original_down_blocks
+
+
+# <original>.middle_block -> <diffusers>.mid_block
+def unet_midblock_to_diffusers_checkpoint(model, checkpoint, *, num_head_channels):
+    diffusers_checkpoint = {}
+
+    # block 0
+
+    original_block_idx = 0
+
+    diffusers_checkpoint.update(
+        resnet_to_diffusers_checkpoint(
+            checkpoint,
+            diffusers_resnet_prefix="mid_block.resnets.0",
+            resnet_prefix=f"middle_block.{original_block_idx}",
+        )
+    )
+
+    original_block_idx += 1
+
+    # optional block 1
+
+    if hasattr(model.mid_block, "attentions") and model.mid_block.attentions[0] is not None:
+        diffusers_checkpoint.update(
+            attention_to_diffusers_checkpoint(
+                checkpoint,
+                diffusers_attention_prefix="mid_block.attentions.0",
+                attention_prefix=f"middle_block.{original_block_idx}",
+                num_head_channels=num_head_channels,
+            )
+        )
+        original_block_idx += 1
+
+    # block 1 or block 2
+
+    diffusers_checkpoint.update(
+        resnet_to_diffusers_checkpoint(
+            checkpoint,
+            diffusers_resnet_prefix="mid_block.resnets.1",
+            resnet_prefix=f"middle_block.{original_block_idx}",
+        )
+    )
+
+    return diffusers_checkpoint
+
+
+# <original>.output_blocks -> <diffusers>.up_blocks
+def unet_upblock_to_diffusers_checkpoint(
+    model, checkpoint, *, diffusers_up_block_idx, original_up_block_idx, num_head_channels
+):
+    diffusers_checkpoint = {}
+
+    diffusers_resnet_prefix = f"up_blocks.{diffusers_up_block_idx}.resnets"
+    original_up_block_prefix = "output_blocks"
+
+    up_block = model.up_blocks[diffusers_up_block_idx]
+
+    num_resnets = len(up_block.resnets)
+
+    if up_block.upsamplers is None:
+        upsampler = False
+    else:
+        assert len(up_block.upsamplers) == 1
+        upsampler = True
+        # The upsample block is also a resnet
+        num_resnets += 1
+
+    has_attentions = hasattr(up_block, "attentions")
+
+    for resnet_idx_inc in range(num_resnets):
+        if upsampler and resnet_idx_inc == num_resnets - 1:
+            # this is an upsample block
+            if has_attentions:
+                # There is a middle attention block that we skip
+                original_resnet_block_idx = 2
+            else:
+                original_resnet_block_idx = 1
+
+            # we add the `minus 1` because the last two resnets are stuck together in the same output block
+            full_resnet_prefix = (
+                f"{original_up_block_prefix}.{original_up_block_idx + resnet_idx_inc - 1}.{original_resnet_block_idx}"
+            )
+
+            full_diffusers_resnet_prefix = f"up_blocks.{diffusers_up_block_idx}.upsamplers.0"
+        else:
+            # this is a regular resnet block
+            full_resnet_prefix = f"{original_up_block_prefix}.{original_up_block_idx + resnet_idx_inc}.0"
+            full_diffusers_resnet_prefix = f"{diffusers_resnet_prefix}.{resnet_idx_inc}"
+
+        diffusers_checkpoint.update(
+            resnet_to_diffusers_checkpoint(
+                checkpoint, resnet_prefix=full_resnet_prefix, diffusers_resnet_prefix=full_diffusers_resnet_prefix
+            )
+        )
+
+    if has_attentions:
+        num_attentions = len(up_block.attentions)
+        diffusers_attention_prefix = f"up_blocks.{diffusers_up_block_idx}.attentions"
+
+        for attention_idx_inc in range(num_attentions):
+            full_attention_prefix = f"{original_up_block_prefix}.{original_up_block_idx + attention_idx_inc}.1"
+            full_diffusers_attention_prefix = f"{diffusers_attention_prefix}.{attention_idx_inc}"
+
+            diffusers_checkpoint.update(
+                attention_to_diffusers_checkpoint(
+                    checkpoint,
+                    attention_prefix=full_attention_prefix,
+                    diffusers_attention_prefix=full_diffusers_attention_prefix,
+                    num_head_channels=num_head_channels,
+                )
+            )
+
+    num_original_down_blocks = num_resnets - 1 if upsampler else num_resnets
+
+    return diffusers_checkpoint, num_original_down_blocks
+
+
+def resnet_to_diffusers_checkpoint(checkpoint, *, diffusers_resnet_prefix, resnet_prefix):
+    diffusers_checkpoint = {
+        f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.in_layers.0.weight"],
+        f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.in_layers.0.bias"],
+        f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.in_layers.2.weight"],
+        f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.in_layers.2.bias"],
+        f"{diffusers_resnet_prefix}.time_emb_proj.weight": checkpoint[f"{resnet_prefix}.emb_layers.1.weight"],
+        f"{diffusers_resnet_prefix}.time_emb_proj.bias": checkpoint[f"{resnet_prefix}.emb_layers.1.bias"],
+        f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.out_layers.0.weight"],
+        f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.out_layers.0.bias"],
+        f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.out_layers.3.weight"],
+        f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.out_layers.3.bias"],
+    }
+
+    skip_connection_prefix = f"{resnet_prefix}.skip_connection"
+
+    if f"{skip_connection_prefix}.weight" in checkpoint:
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{skip_connection_prefix}.weight"],
+                f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{skip_connection_prefix}.bias"],
+            }
+        )
+
+    return diffusers_checkpoint
+
+
+def attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix, num_head_channels):
+    diffusers_checkpoint = {}
+
+    # <original>.norm -> <diffusers>.group_norm
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"],
+            f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"],
+        }
+    )
+
+    # <original>.qkv -> <diffusers>.{query, key, value}
+    [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions(
+        weight=checkpoint[f"{attention_prefix}.qkv.weight"][:, :, 0],
+        bias=checkpoint[f"{attention_prefix}.qkv.bias"],
+        split=3,
+        chunk_size=num_head_channels,
+    )
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.to_q.weight": q_weight,
+            f"{diffusers_attention_prefix}.to_q.bias": q_bias,
+            f"{diffusers_attention_prefix}.to_k.weight": k_weight,
+            f"{diffusers_attention_prefix}.to_k.bias": k_bias,
+            f"{diffusers_attention_prefix}.to_v.weight": v_weight,
+            f"{diffusers_attention_prefix}.to_v.bias": v_bias,
+        }
+    )
+
+    # <original>.encoder_kv -> <diffusers>.{context_key, context_value}
+    [encoder_k_weight, encoder_v_weight], [encoder_k_bias, encoder_v_bias] = split_attentions(
+        weight=checkpoint[f"{attention_prefix}.encoder_kv.weight"][:, :, 0],
+        bias=checkpoint[f"{attention_prefix}.encoder_kv.bias"],
+        split=2,
+        chunk_size=num_head_channels,
+    )
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.add_k_proj.weight": encoder_k_weight,
+            f"{diffusers_attention_prefix}.add_k_proj.bias": encoder_k_bias,
+            f"{diffusers_attention_prefix}.add_v_proj.weight": encoder_v_weight,
+            f"{diffusers_attention_prefix}.add_v_proj.bias": encoder_v_bias,
+        }
+    )
+
+    # <original>.proj_out (1d conv) -> <diffusers>.proj_attn (linear)
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][
+                :, :, 0
+            ],
+            f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?)
+def split_attentions(*, weight, bias, split, chunk_size):
+    weights = [None] * split
+    biases = [None] * split
+
+    weights_biases_idx = 0
+
+    for starting_row_index in range(0, weight.shape[0], chunk_size):
+        row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size)
+
+        weight_rows = weight[row_indices, :]
+        bias_rows = bias[row_indices]
+
+        if weights[weights_biases_idx] is None:
+            assert weights[weights_biases_idx] is None
+            weights[weights_biases_idx] = weight_rows
+            biases[weights_biases_idx] = bias_rows
+        else:
+            assert weights[weights_biases_idx] is not None
+            weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows])
+            biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows])
+
+        weights_biases_idx = (weights_biases_idx + 1) % split
+
+    return weights, biases
+
+
+# done unet utils
+
+
+def prior(*, args, checkpoint_map_location):
+    print("loading prior")
+
+    prior_checkpoint = torch.load(args.prior_checkpoint_path, map_location=checkpoint_map_location)
+
+    clip_stats_checkpoint = torch.load(args.clip_stat_path, map_location=checkpoint_map_location)
+
+    prior_model = prior_model_from_original_config()
+
+    prior_diffusers_checkpoint = prior_original_checkpoint_to_diffusers_checkpoint(
+        prior_model, prior_checkpoint, clip_stats_checkpoint
+    )
+
+    del prior_checkpoint
+    del clip_stats_checkpoint
+
+    load_checkpoint_to_model(prior_diffusers_checkpoint, prior_model, strict=True)
+
+    print("done loading prior")
+
+    return prior_model
+
+
+def text2img(*, args, checkpoint_map_location):
+    print("loading text2img")
+
+    text2img_checkpoint = torch.load(args.text2img_checkpoint_path, map_location=checkpoint_map_location)
+
+    unet_model = unet_model_from_original_config()
+
+    unet_diffusers_checkpoint = unet_original_checkpoint_to_diffusers_checkpoint(unet_model, text2img_checkpoint)
+
+    del text2img_checkpoint
+
+    load_checkpoint_to_model(unet_diffusers_checkpoint, unet_model, strict=True)
+
+    print("done loading text2img")
+
+    return unet_model
+
+
+def inpaint_text2img(*, args, checkpoint_map_location):
+    print("loading inpaint text2img")
+
+    inpaint_text2img_checkpoint = torch.load(
+        args.inpaint_text2img_checkpoint_path, map_location=checkpoint_map_location
+    )
+
+    inpaint_unet_model = inpaint_unet_model_from_original_config()
+
+    inpaint_unet_diffusers_checkpoint = inpaint_unet_original_checkpoint_to_diffusers_checkpoint(
+        inpaint_unet_model, inpaint_text2img_checkpoint
+    )
+
+    del inpaint_text2img_checkpoint
+
+    load_checkpoint_to_model(inpaint_unet_diffusers_checkpoint, inpaint_unet_model, strict=True)
+
+    print("done loading inpaint text2img")
+
+    return inpaint_unet_model
+
+
+# movq
+
+MOVQ_CONFIG = {
+    "in_channels": 3,
+    "out_channels": 3,
+    "latent_channels": 4,
+    "down_block_types": ("DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "AttnDownEncoderBlock2D"),
+    "up_block_types": ("AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"),
+    "num_vq_embeddings": 16384,
+    "block_out_channels": (128, 256, 256, 512),
+    "vq_embed_dim": 4,
+    "layers_per_block": 2,
+    "norm_type": "spatial",
+}
+
+
+def movq_model_from_original_config():
+    movq = VQModel(**MOVQ_CONFIG)
+    return movq
+
+
+def movq_encoder_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    # conv_in
+    diffusers_checkpoint.update(
+        {
+            "encoder.conv_in.weight": checkpoint["encoder.conv_in.weight"],
+            "encoder.conv_in.bias": checkpoint["encoder.conv_in.bias"],
+        }
+    )
+
+    # down_blocks
+    for down_block_idx, down_block in enumerate(model.encoder.down_blocks):
+        diffusers_down_block_prefix = f"encoder.down_blocks.{down_block_idx}"
+        down_block_prefix = f"encoder.down.{down_block_idx}"
+
+        # resnets
+        for resnet_idx, resnet in enumerate(down_block.resnets):
+            diffusers_resnet_prefix = f"{diffusers_down_block_prefix}.resnets.{resnet_idx}"
+            resnet_prefix = f"{down_block_prefix}.block.{resnet_idx}"
+
+            diffusers_checkpoint.update(
+                movq_resnet_to_diffusers_checkpoint(
+                    resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+                )
+            )
+
+        # downsample
+
+        # do not include the downsample when on the last down block
+        # There is no downsample on the last down block
+        if down_block_idx != len(model.encoder.down_blocks) - 1:
+            # There's a single downsample in the original checkpoint but a list of downsamples
+            # in the diffusers model.
+            diffusers_downsample_prefix = f"{diffusers_down_block_prefix}.downsamplers.0.conv"
+            downsample_prefix = f"{down_block_prefix}.downsample.conv"
+            diffusers_checkpoint.update(
+                {
+                    f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"],
+                    f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"],
+                }
+            )
+
+        # attentions
+
+        if hasattr(down_block, "attentions"):
+            for attention_idx, _ in enumerate(down_block.attentions):
+                diffusers_attention_prefix = f"{diffusers_down_block_prefix}.attentions.{attention_idx}"
+                attention_prefix = f"{down_block_prefix}.attn.{attention_idx}"
+                diffusers_checkpoint.update(
+                    movq_attention_to_diffusers_checkpoint(
+                        checkpoint,
+                        diffusers_attention_prefix=diffusers_attention_prefix,
+                        attention_prefix=attention_prefix,
+                    )
+                )
+
+    # mid block
+
+    # mid block attentions
+
+    # There is a single hardcoded attention block in the middle of the VQ-diffusion encoder
+    diffusers_attention_prefix = "encoder.mid_block.attentions.0"
+    attention_prefix = "encoder.mid.attn_1"
+    diffusers_checkpoint.update(
+        movq_attention_to_diffusers_checkpoint(
+            checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
+        )
+    )
+
+    # mid block resnets
+
+    for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets):
+        diffusers_resnet_prefix = f"encoder.mid_block.resnets.{diffusers_resnet_idx}"
+
+        # the hardcoded prefixes to `block_` are 1 and 2
+        orig_resnet_idx = diffusers_resnet_idx + 1
+        # There are two hardcoded resnets in the middle of the VQ-diffusion encoder
+        resnet_prefix = f"encoder.mid.block_{orig_resnet_idx}"
+
+        diffusers_checkpoint.update(
+            movq_resnet_to_diffusers_checkpoint(
+                resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+            )
+        )
+
+    diffusers_checkpoint.update(
+        {
+            # conv_norm_out
+            "encoder.conv_norm_out.weight": checkpoint["encoder.norm_out.weight"],
+            "encoder.conv_norm_out.bias": checkpoint["encoder.norm_out.bias"],
+            # conv_out
+            "encoder.conv_out.weight": checkpoint["encoder.conv_out.weight"],
+            "encoder.conv_out.bias": checkpoint["encoder.conv_out.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def movq_decoder_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    # conv in
+    diffusers_checkpoint.update(
+        {
+            "decoder.conv_in.weight": checkpoint["decoder.conv_in.weight"],
+            "decoder.conv_in.bias": checkpoint["decoder.conv_in.bias"],
+        }
+    )
+
+    # up_blocks
+
+    for diffusers_up_block_idx, up_block in enumerate(model.decoder.up_blocks):
+        # up_blocks are stored in reverse order in the VQ-diffusion checkpoint
+        orig_up_block_idx = len(model.decoder.up_blocks) - 1 - diffusers_up_block_idx
+
+        diffusers_up_block_prefix = f"decoder.up_blocks.{diffusers_up_block_idx}"
+        up_block_prefix = f"decoder.up.{orig_up_block_idx}"
+
+        # resnets
+        for resnet_idx, resnet in enumerate(up_block.resnets):
+            diffusers_resnet_prefix = f"{diffusers_up_block_prefix}.resnets.{resnet_idx}"
+            resnet_prefix = f"{up_block_prefix}.block.{resnet_idx}"
+
+            diffusers_checkpoint.update(
+                movq_resnet_to_diffusers_checkpoint_spatial_norm(
+                    resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+                )
+            )
+
+        # upsample
+
+        # there is no up sample on the last up block
+        if diffusers_up_block_idx != len(model.decoder.up_blocks) - 1:
+            # There's a single upsample in the VQ-diffusion checkpoint but a list of downsamples
+            # in the diffusers model.
+            diffusers_downsample_prefix = f"{diffusers_up_block_prefix}.upsamplers.0.conv"
+            downsample_prefix = f"{up_block_prefix}.upsample.conv"
+            diffusers_checkpoint.update(
+                {
+                    f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"],
+                    f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"],
+                }
+            )
+
+        # attentions
+
+        if hasattr(up_block, "attentions"):
+            for attention_idx, _ in enumerate(up_block.attentions):
+                diffusers_attention_prefix = f"{diffusers_up_block_prefix}.attentions.{attention_idx}"
+                attention_prefix = f"{up_block_prefix}.attn.{attention_idx}"
+                diffusers_checkpoint.update(
+                    movq_attention_to_diffusers_checkpoint_spatial_norm(
+                        checkpoint,
+                        diffusers_attention_prefix=diffusers_attention_prefix,
+                        attention_prefix=attention_prefix,
+                    )
+                )
+
+    # mid block
+
+    # mid block attentions
+
+    # There is a single hardcoded attention block in the middle of the VQ-diffusion decoder
+    diffusers_attention_prefix = "decoder.mid_block.attentions.0"
+    attention_prefix = "decoder.mid.attn_1"
+    diffusers_checkpoint.update(
+        movq_attention_to_diffusers_checkpoint_spatial_norm(
+            checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
+        )
+    )
+
+    # mid block resnets
+
+    for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets):
+        diffusers_resnet_prefix = f"decoder.mid_block.resnets.{diffusers_resnet_idx}"
+
+        # the hardcoded prefixes to `block_` are 1 and 2
+        orig_resnet_idx = diffusers_resnet_idx + 1
+        # There are two hardcoded resnets in the middle of the VQ-diffusion decoder
+        resnet_prefix = f"decoder.mid.block_{orig_resnet_idx}"
+
+        diffusers_checkpoint.update(
+            movq_resnet_to_diffusers_checkpoint_spatial_norm(
+                resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+            )
+        )
+
+    diffusers_checkpoint.update(
+        {
+            # conv_norm_out
+            "decoder.conv_norm_out.norm_layer.weight": checkpoint["decoder.norm_out.norm_layer.weight"],
+            "decoder.conv_norm_out.norm_layer.bias": checkpoint["decoder.norm_out.norm_layer.bias"],
+            "decoder.conv_norm_out.conv_y.weight": checkpoint["decoder.norm_out.conv_y.weight"],
+            "decoder.conv_norm_out.conv_y.bias": checkpoint["decoder.norm_out.conv_y.bias"],
+            "decoder.conv_norm_out.conv_b.weight": checkpoint["decoder.norm_out.conv_b.weight"],
+            "decoder.conv_norm_out.conv_b.bias": checkpoint["decoder.norm_out.conv_b.bias"],
+            # conv_out
+            "decoder.conv_out.weight": checkpoint["decoder.conv_out.weight"],
+            "decoder.conv_out.bias": checkpoint["decoder.conv_out.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def movq_resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix):
+    rv = {
+        # norm1
+        f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.norm1.weight"],
+        f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.norm1.bias"],
+        # conv1
+        f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"],
+        f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"],
+        # norm2
+        f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.norm2.weight"],
+        f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.norm2.bias"],
+        # conv2
+        f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"],
+        f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"],
+    }
+
+    if resnet.conv_shortcut is not None:
+        rv.update(
+            {
+                f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"],
+                f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"],
+            }
+        )
+
+    return rv
+
+
+def movq_resnet_to_diffusers_checkpoint_spatial_norm(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix):
+    rv = {
+        # norm1
+        f"{diffusers_resnet_prefix}.norm1.norm_layer.weight": checkpoint[f"{resnet_prefix}.norm1.norm_layer.weight"],
+        f"{diffusers_resnet_prefix}.norm1.norm_layer.bias": checkpoint[f"{resnet_prefix}.norm1.norm_layer.bias"],
+        f"{diffusers_resnet_prefix}.norm1.conv_y.weight": checkpoint[f"{resnet_prefix}.norm1.conv_y.weight"],
+        f"{diffusers_resnet_prefix}.norm1.conv_y.bias": checkpoint[f"{resnet_prefix}.norm1.conv_y.bias"],
+        f"{diffusers_resnet_prefix}.norm1.conv_b.weight": checkpoint[f"{resnet_prefix}.norm1.conv_b.weight"],
+        f"{diffusers_resnet_prefix}.norm1.conv_b.bias": checkpoint[f"{resnet_prefix}.norm1.conv_b.bias"],
+        # conv1
+        f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"],
+        f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"],
+        # norm2
+        f"{diffusers_resnet_prefix}.norm2.norm_layer.weight": checkpoint[f"{resnet_prefix}.norm2.norm_layer.weight"],
+        f"{diffusers_resnet_prefix}.norm2.norm_layer.bias": checkpoint[f"{resnet_prefix}.norm2.norm_layer.bias"],
+        f"{diffusers_resnet_prefix}.norm2.conv_y.weight": checkpoint[f"{resnet_prefix}.norm2.conv_y.weight"],
+        f"{diffusers_resnet_prefix}.norm2.conv_y.bias": checkpoint[f"{resnet_prefix}.norm2.conv_y.bias"],
+        f"{diffusers_resnet_prefix}.norm2.conv_b.weight": checkpoint[f"{resnet_prefix}.norm2.conv_b.weight"],
+        f"{diffusers_resnet_prefix}.norm2.conv_b.bias": checkpoint[f"{resnet_prefix}.norm2.conv_b.bias"],
+        # conv2
+        f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"],
+        f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"],
+    }
+
+    if resnet.conv_shortcut is not None:
+        rv.update(
+            {
+                f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"],
+                f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"],
+            }
+        )
+
+    return rv
+
+
+def movq_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix):
+    return {
+        # norm
+        f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"],
+        f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"],
+        # query
+        f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.to_q.bias": checkpoint[f"{attention_prefix}.q.bias"],
+        # key
+        f"{diffusers_attention_prefix}.to_k.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.to_k.bias": checkpoint[f"{attention_prefix}.k.bias"],
+        # value
+        f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"],
+        # proj_attn
+        f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"],
+    }
+
+
+def movq_attention_to_diffusers_checkpoint_spatial_norm(checkpoint, *, diffusers_attention_prefix, attention_prefix):
+    return {
+        # norm
+        f"{diffusers_attention_prefix}.spatial_norm.norm_layer.weight": checkpoint[
+            f"{attention_prefix}.norm.norm_layer.weight"
+        ],
+        f"{diffusers_attention_prefix}.spatial_norm.norm_layer.bias": checkpoint[
+            f"{attention_prefix}.norm.norm_layer.bias"
+        ],
+        f"{diffusers_attention_prefix}.spatial_norm.conv_y.weight": checkpoint[
+            f"{attention_prefix}.norm.conv_y.weight"
+        ],
+        f"{diffusers_attention_prefix}.spatial_norm.conv_y.bias": checkpoint[f"{attention_prefix}.norm.conv_y.bias"],
+        f"{diffusers_attention_prefix}.spatial_norm.conv_b.weight": checkpoint[
+            f"{attention_prefix}.norm.conv_b.weight"
+        ],
+        f"{diffusers_attention_prefix}.spatial_norm.conv_b.bias": checkpoint[f"{attention_prefix}.norm.conv_b.bias"],
+        # query
+        f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.to_q.bias": checkpoint[f"{attention_prefix}.q.bias"],
+        # key
+        f"{diffusers_attention_prefix}.to_k.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.to_k.bias": checkpoint[f"{attention_prefix}.k.bias"],
+        # value
+        f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"],
+        # proj_attn
+        f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"],
+    }
+
+
+def movq_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+    diffusers_checkpoint.update(movq_encoder_to_diffusers_checkpoint(model, checkpoint))
+
+    # quant_conv
+
+    diffusers_checkpoint.update(
+        {
+            "quant_conv.weight": checkpoint["quant_conv.weight"],
+            "quant_conv.bias": checkpoint["quant_conv.bias"],
+        }
+    )
+
+    # quantize
+    diffusers_checkpoint.update({"quantize.embedding.weight": checkpoint["quantize.embedding.weight"]})
+
+    # post_quant_conv
+    diffusers_checkpoint.update(
+        {
+            "post_quant_conv.weight": checkpoint["post_quant_conv.weight"],
+            "post_quant_conv.bias": checkpoint["post_quant_conv.bias"],
+        }
+    )
+
+    # decoder
+    diffusers_checkpoint.update(movq_decoder_to_diffusers_checkpoint(model, checkpoint))
+
+    return diffusers_checkpoint
+
+
+def movq(*, args, checkpoint_map_location):
+    print("loading movq")
+
+    movq_checkpoint = torch.load(args.movq_checkpoint_path, map_location=checkpoint_map_location)
+
+    movq_model = movq_model_from_original_config()
+
+    movq_diffusers_checkpoint = movq_original_checkpoint_to_diffusers_checkpoint(movq_model, movq_checkpoint)
+
+    del movq_checkpoint
+
+    load_checkpoint_to_model(movq_diffusers_checkpoint, movq_model, strict=True)
+
+    print("done loading movq")
+
+    return movq_model
+
+
+def load_checkpoint_to_model(checkpoint, model, strict=False):
+    with tempfile.NamedTemporaryFile(delete=False) as file:
+        torch.save(checkpoint, file.name)
+        del checkpoint
+        if strict:
+            model.load_state_dict(torch.load(file.name), strict=True)
+        else:
+            load_checkpoint_and_dispatch(model, file.name, device_map="auto")
+    os.remove(file.name)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    parser.add_argument(
+        "--prior_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to the prior checkpoint to convert.",
+    )
+    parser.add_argument(
+        "--clip_stat_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to the clip stats checkpoint to convert.",
+    )
+    parser.add_argument(
+        "--text2img_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to the text2img checkpoint to convert.",
+    )
+    parser.add_argument(
+        "--movq_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to the text2img checkpoint to convert.",
+    )
+    parser.add_argument(
+        "--inpaint_text2img_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to the inpaint text2img checkpoint to convert.",
+    )
+    parser.add_argument(
+        "--checkpoint_load_device",
+        default="cpu",
+        type=str,
+        required=False,
+        help="The device passed to `map_location` when loading checkpoints.",
+    )
+
+    parser.add_argument(
+        "--debug",
+        default=None,
+        type=str,
+        required=False,
+        help="Only run a specific stage of the convert script. Used for debugging",
+    )
+
+    args = parser.parse_args()
+
+    print(f"loading checkpoints to {args.checkpoint_load_device}")
+
+    checkpoint_map_location = torch.device(args.checkpoint_load_device)
+
+    if args.debug is not None:
+        print(f"debug: only executing {args.debug}")
+
+    if args.debug is None:
+        print("to-do")
+    elif args.debug == "prior":
+        prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location)
+        prior_model.save_pretrained(args.dump_path)
+    elif args.debug == "text2img":
+        unet_model = text2img(args=args, checkpoint_map_location=checkpoint_map_location)
+        unet_model.save_pretrained(f"{args.dump_path}/unet")
+    elif args.debug == "inpaint_text2img":
+        inpaint_unet_model = inpaint_text2img(args=args, checkpoint_map_location=checkpoint_map_location)
+        inpaint_unet_model.save_pretrained(f"{args.dump_path}/inpaint_unet")
+    elif args.debug == "decoder":
+        decoder = movq(args=args, checkpoint_map_location=checkpoint_map_location)
+        decoder.save_pretrained(f"{args.dump_path}/decoder")
+    else:
+        raise ValueError(f"unknown debug value : {args.debug}")
diff --git a/diffusers/scripts/convert_ldm_original_checkpoint_to_diffusers.py b/diffusers/scripts/convert_ldm_original_checkpoint_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..ada7dc6e2950c3f2dd3af23ecf2c47579cd6696c
--- /dev/null
+++ b/diffusers/scripts/convert_ldm_original_checkpoint_to_diffusers.py
@@ -0,0 +1,359 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the LDM checkpoints."""
+
+import argparse
+import json
+
+import torch
+
+from diffusers import DDPMScheduler, LDMPipeline, UNet2DModel, VQModel
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
+        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming
+    to them. It splits attention layers, and takes into account additional replacements
+    that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def convert_ldm_checkpoint(checkpoint, config):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = checkpoint["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = checkpoint["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = checkpoint["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = checkpoint["time_embed.2.bias"]
+
+    new_checkpoint["conv_in.weight"] = checkpoint["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = checkpoint["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = checkpoint["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = checkpoint["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = checkpoint["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = checkpoint["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in checkpoint if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in checkpoint if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in checkpoint if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["num_res_blocks"] + 1)
+        layer_in_block_id = (i - 1) % (config["num_res_blocks"] + 1)
+
+        resnets = [key for key in input_blocks[i] if f"input_blocks.{i}.0" in key]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in checkpoint:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = checkpoint[
+                f"input_blocks.{i}.0.op.weight"
+            ]
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = checkpoint[
+                f"input_blocks.{i}.0.op.bias"
+            ]
+            continue
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        resnet_op = {"old": "resnets.2.op", "new": "downsamplers.0.op"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, checkpoint, additional_replacements=[meta_path, resnet_op], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {
+                "old": f"input_blocks.{i}.1",
+                "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}",
+            }
+            to_split = {
+                f"input_blocks.{i}.1.qkv.bias": {
+                    "key": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias",
+                    "query": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias",
+                    "value": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias",
+                },
+                f"input_blocks.{i}.1.qkv.weight": {
+                    "key": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight",
+                    "query": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight",
+                    "value": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight",
+                },
+            }
+            assign_to_checkpoint(
+                paths,
+                new_checkpoint,
+                checkpoint,
+                additional_replacements=[meta_path],
+                attention_paths_to_split=to_split,
+                config=config,
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, checkpoint, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, checkpoint, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    to_split = {
+        "middle_block.1.qkv.bias": {
+            "key": "mid_block.attentions.0.key.bias",
+            "query": "mid_block.attentions.0.query.bias",
+            "value": "mid_block.attentions.0.value.bias",
+        },
+        "middle_block.1.qkv.weight": {
+            "key": "mid_block.attentions.0.key.weight",
+            "query": "mid_block.attentions.0.query.weight",
+            "value": "mid_block.attentions.0.value.weight",
+        },
+    }
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, checkpoint, attention_paths_to_split=to_split, config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["num_res_blocks"] + 1)
+        layer_in_block_id = i % (config["num_res_blocks"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[meta_path], config=config)
+
+            if ["conv.weight", "conv.bias"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = checkpoint[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = checkpoint[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                to_split = {
+                    f"output_blocks.{i}.1.qkv.bias": {
+                        "key": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias",
+                        "query": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias",
+                        "value": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias",
+                    },
+                    f"output_blocks.{i}.1.qkv.weight": {
+                        "key": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight",
+                        "query": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight",
+                        "value": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight",
+                    },
+                }
+                assign_to_checkpoint(
+                    paths,
+                    new_checkpoint,
+                    checkpoint,
+                    additional_replacements=[meta_path],
+                    attention_paths_to_split=to_split if any("qkv" in key for key in attentions) else None,
+                    config=config,
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = checkpoint[old_path]
+
+    return new_checkpoint
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the architecture.",
+    )
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    args = parser.parse_args()
+
+    checkpoint = torch.load(args.checkpoint_path)
+
+    with open(args.config_file) as f:
+        config = json.loads(f.read())
+
+    converted_checkpoint = convert_ldm_checkpoint(checkpoint, config)
+
+    if "ldm" in config:
+        del config["ldm"]
+
+    model = UNet2DModel(**config)
+    model.load_state_dict(converted_checkpoint)
+
+    try:
+        scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1]))
+        vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1]))
+
+        pipe = LDMPipeline(unet=model, scheduler=scheduler, vae=vqvae)
+        pipe.save_pretrained(args.dump_path)
+    except:  # noqa: E722
+        model.save_pretrained(args.dump_path)
diff --git a/diffusers/scripts/convert_lora_safetensor_to_diffusers.py b/diffusers/scripts/convert_lora_safetensor_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..4237452e2ed16a82ef53199651f8a48c60b051a8
--- /dev/null
+++ b/diffusers/scripts/convert_lora_safetensor_to_diffusers.py
@@ -0,0 +1,128 @@
+# coding=utf-8
+# Copyright 2024, Haofan Wang, Qixun Wang, All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Conversion script for the LoRA's safetensors checkpoints."""
+
+import argparse
+
+import torch
+from safetensors.torch import load_file
+
+from diffusers import StableDiffusionPipeline
+
+
+def convert(base_model_path, checkpoint_path, LORA_PREFIX_UNET, LORA_PREFIX_TEXT_ENCODER, alpha):
+    # load base model
+    pipeline = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)
+
+    # load LoRA weight from .safetensors
+    state_dict = load_file(checkpoint_path)
+
+    visited = []
+
+    # directly update weight in diffusers model
+    for key in state_dict:
+        # it is suggested to print out the key, it usually will be something like below
+        # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
+
+        # as we have set the alpha beforehand, so just skip
+        if ".alpha" in key or key in visited:
+            continue
+
+        if "text" in key:
+            layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
+            curr_layer = pipeline.text_encoder
+        else:
+            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
+            curr_layer = pipeline.unet
+
+        # find the target layer
+        temp_name = layer_infos.pop(0)
+        while len(layer_infos) > -1:
+            try:
+                curr_layer = curr_layer.__getattr__(temp_name)
+                if len(layer_infos) > 0:
+                    temp_name = layer_infos.pop(0)
+                elif len(layer_infos) == 0:
+                    break
+            except Exception:
+                if len(temp_name) > 0:
+                    temp_name += "_" + layer_infos.pop(0)
+                else:
+                    temp_name = layer_infos.pop(0)
+
+        pair_keys = []
+        if "lora_down" in key:
+            pair_keys.append(key.replace("lora_down", "lora_up"))
+            pair_keys.append(key)
+        else:
+            pair_keys.append(key)
+            pair_keys.append(key.replace("lora_up", "lora_down"))
+
+        # update weight
+        if len(state_dict[pair_keys[0]].shape) == 4:
+            weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
+            weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
+            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
+        else:
+            weight_up = state_dict[pair_keys[0]].to(torch.float32)
+            weight_down = state_dict[pair_keys[1]].to(torch.float32)
+            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down)
+
+        # update visited list
+        for item in pair_keys:
+            visited.append(item)
+
+    return pipeline
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--base_model_path", default=None, type=str, required=True, help="Path to the base model in diffusers format."
+    )
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument(
+        "--lora_prefix_unet", default="lora_unet", type=str, help="The prefix of UNet weight in safetensors"
+    )
+    parser.add_argument(
+        "--lora_prefix_text_encoder",
+        default="lora_te",
+        type=str,
+        help="The prefix of text encoder weight in safetensors",
+    )
+    parser.add_argument("--alpha", default=0.75, type=float, help="The merging ratio in W = W0 + alpha * deltaW")
+    parser.add_argument(
+        "--to_safetensors", action="store_true", help="Whether to store pipeline in safetensors format or not."
+    )
+    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
+
+    args = parser.parse_args()
+
+    base_model_path = args.base_model_path
+    checkpoint_path = args.checkpoint_path
+    dump_path = args.dump_path
+    lora_prefix_unet = args.lora_prefix_unet
+    lora_prefix_text_encoder = args.lora_prefix_text_encoder
+    alpha = args.alpha
+
+    pipe = convert(base_model_path, checkpoint_path, lora_prefix_unet, lora_prefix_text_encoder, alpha)
+
+    pipe = pipe.to(args.device)
+    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
diff --git a/diffusers/scripts/convert_lumina_to_diffusers.py b/diffusers/scripts/convert_lumina_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..a12625d1376fe4ff2e024244867bc33b9f1a9dd5
--- /dev/null
+++ b/diffusers/scripts/convert_lumina_to_diffusers.py
@@ -0,0 +1,142 @@
+import argparse
+import os
+
+import torch
+from safetensors.torch import load_file
+from transformers import AutoModel, AutoTokenizer
+
+from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, LuminaNextDiT2DModel, LuminaText2ImgPipeline
+
+
+def main(args):
+    # checkpoint from https://huggingface.co/Alpha-VLLM/Lumina-Next-SFT or https://huggingface.co/Alpha-VLLM/Lumina-Next-T2I
+    all_sd = load_file(args.origin_ckpt_path, device="cpu")
+    converted_state_dict = {}
+    # pad token
+    converted_state_dict["pad_token"] = all_sd["pad_token"]
+
+    # patch embed
+    converted_state_dict["patch_embedder.weight"] = all_sd["x_embedder.weight"]
+    converted_state_dict["patch_embedder.bias"] = all_sd["x_embedder.bias"]
+
+    # time and caption embed
+    converted_state_dict["time_caption_embed.timestep_embedder.linear_1.weight"] = all_sd["t_embedder.mlp.0.weight"]
+    converted_state_dict["time_caption_embed.timestep_embedder.linear_1.bias"] = all_sd["t_embedder.mlp.0.bias"]
+    converted_state_dict["time_caption_embed.timestep_embedder.linear_2.weight"] = all_sd["t_embedder.mlp.2.weight"]
+    converted_state_dict["time_caption_embed.timestep_embedder.linear_2.bias"] = all_sd["t_embedder.mlp.2.bias"]
+    converted_state_dict["time_caption_embed.caption_embedder.0.weight"] = all_sd["cap_embedder.0.weight"]
+    converted_state_dict["time_caption_embed.caption_embedder.0.bias"] = all_sd["cap_embedder.0.bias"]
+    converted_state_dict["time_caption_embed.caption_embedder.1.weight"] = all_sd["cap_embedder.1.weight"]
+    converted_state_dict["time_caption_embed.caption_embedder.1.bias"] = all_sd["cap_embedder.1.bias"]
+
+    for i in range(24):
+        # adaln
+        converted_state_dict[f"layers.{i}.gate"] = all_sd[f"layers.{i}.attention.gate"]
+        converted_state_dict[f"layers.{i}.adaLN_modulation.1.weight"] = all_sd[f"layers.{i}.adaLN_modulation.1.weight"]
+        converted_state_dict[f"layers.{i}.adaLN_modulation.1.bias"] = all_sd[f"layers.{i}.adaLN_modulation.1.bias"]
+
+        # qkv
+        converted_state_dict[f"layers.{i}.attn1.to_q.weight"] = all_sd[f"layers.{i}.attention.wq.weight"]
+        converted_state_dict[f"layers.{i}.attn1.to_k.weight"] = all_sd[f"layers.{i}.attention.wk.weight"]
+        converted_state_dict[f"layers.{i}.attn1.to_v.weight"] = all_sd[f"layers.{i}.attention.wv.weight"]
+
+        # cap
+        converted_state_dict[f"layers.{i}.attn2.to_q.weight"] = all_sd[f"layers.{i}.attention.wq.weight"]
+        converted_state_dict[f"layers.{i}.attn2.to_k.weight"] = all_sd[f"layers.{i}.attention.wk_y.weight"]
+        converted_state_dict[f"layers.{i}.attn2.to_v.weight"] = all_sd[f"layers.{i}.attention.wv_y.weight"]
+
+        # output
+        converted_state_dict[f"layers.{i}.attn2.to_out.0.weight"] = all_sd[f"layers.{i}.attention.wo.weight"]
+
+        # attention
+        # qk norm
+        converted_state_dict[f"layers.{i}.attn1.norm_q.weight"] = all_sd[f"layers.{i}.attention.q_norm.weight"]
+        converted_state_dict[f"layers.{i}.attn1.norm_q.bias"] = all_sd[f"layers.{i}.attention.q_norm.bias"]
+
+        converted_state_dict[f"layers.{i}.attn1.norm_k.weight"] = all_sd[f"layers.{i}.attention.k_norm.weight"]
+        converted_state_dict[f"layers.{i}.attn1.norm_k.bias"] = all_sd[f"layers.{i}.attention.k_norm.bias"]
+
+        converted_state_dict[f"layers.{i}.attn2.norm_q.weight"] = all_sd[f"layers.{i}.attention.q_norm.weight"]
+        converted_state_dict[f"layers.{i}.attn2.norm_q.bias"] = all_sd[f"layers.{i}.attention.q_norm.bias"]
+
+        converted_state_dict[f"layers.{i}.attn2.norm_k.weight"] = all_sd[f"layers.{i}.attention.ky_norm.weight"]
+        converted_state_dict[f"layers.{i}.attn2.norm_k.bias"] = all_sd[f"layers.{i}.attention.ky_norm.bias"]
+
+        # attention norm
+        converted_state_dict[f"layers.{i}.attn_norm1.weight"] = all_sd[f"layers.{i}.attention_norm1.weight"]
+        converted_state_dict[f"layers.{i}.attn_norm2.weight"] = all_sd[f"layers.{i}.attention_norm2.weight"]
+        converted_state_dict[f"layers.{i}.norm1_context.weight"] = all_sd[f"layers.{i}.attention_y_norm.weight"]
+
+        # feed forward
+        converted_state_dict[f"layers.{i}.feed_forward.linear_1.weight"] = all_sd[f"layers.{i}.feed_forward.w1.weight"]
+        converted_state_dict[f"layers.{i}.feed_forward.linear_2.weight"] = all_sd[f"layers.{i}.feed_forward.w2.weight"]
+        converted_state_dict[f"layers.{i}.feed_forward.linear_3.weight"] = all_sd[f"layers.{i}.feed_forward.w3.weight"]
+
+        # feed forward norm
+        converted_state_dict[f"layers.{i}.ffn_norm1.weight"] = all_sd[f"layers.{i}.ffn_norm1.weight"]
+        converted_state_dict[f"layers.{i}.ffn_norm2.weight"] = all_sd[f"layers.{i}.ffn_norm2.weight"]
+
+    # final layer
+    converted_state_dict["final_layer.linear.weight"] = all_sd["final_layer.linear.weight"]
+    converted_state_dict["final_layer.linear.bias"] = all_sd["final_layer.linear.bias"]
+
+    converted_state_dict["final_layer.adaLN_modulation.1.weight"] = all_sd["final_layer.adaLN_modulation.1.weight"]
+    converted_state_dict["final_layer.adaLN_modulation.1.bias"] = all_sd["final_layer.adaLN_modulation.1.bias"]
+
+    # Lumina-Next-SFT 2B
+    transformer = LuminaNextDiT2DModel(
+        sample_size=128,
+        patch_size=2,
+        in_channels=4,
+        hidden_size=2304,
+        num_layers=24,
+        num_attention_heads=32,
+        num_kv_heads=8,
+        multiple_of=256,
+        ffn_dim_multiplier=None,
+        norm_eps=1e-5,
+        learn_sigma=True,
+        qk_norm=True,
+        cross_attention_dim=2048,
+        scaling_factor=1.0,
+    )
+    transformer.load_state_dict(converted_state_dict, strict=True)
+
+    num_model_params = sum(p.numel() for p in transformer.parameters())
+    print(f"Total number of transformer parameters: {num_model_params}")
+
+    if args.only_transformer:
+        transformer.save_pretrained(os.path.join(args.dump_path, "transformer"))
+    else:
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae", torch_dtype=torch.float32)
+
+        tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
+        text_encoder = AutoModel.from_pretrained("google/gemma-2b")
+
+        pipeline = LuminaText2ImgPipeline(
+            tokenizer=tokenizer, text_encoder=text_encoder, transformer=transformer, vae=vae, scheduler=scheduler
+        )
+        pipeline.save_pretrained(args.dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--origin_ckpt_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--image_size",
+        default=1024,
+        type=int,
+        choices=[256, 512, 1024],
+        required=False,
+        help="Image size of pretrained model, either 512 or 1024.",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
+    parser.add_argument("--only_transformer", default=True, type=bool, required=True)
+
+    args = parser.parse_args()
+    main(args)
diff --git a/diffusers/scripts/convert_models_diffuser_to_diffusers.py b/diffusers/scripts/convert_models_diffuser_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc5321e33fe088c652f6014c6dab813bb8d5f246
--- /dev/null
+++ b/diffusers/scripts/convert_models_diffuser_to_diffusers.py
@@ -0,0 +1,100 @@
+import json
+import os
+
+import torch
+
+from diffusers import UNet1DModel
+
+
+os.makedirs("hub/hopper-medium-v2/unet/hor32", exist_ok=True)
+os.makedirs("hub/hopper-medium-v2/unet/hor128", exist_ok=True)
+
+os.makedirs("hub/hopper-medium-v2/value_function", exist_ok=True)
+
+
+def unet(hor):
+    if hor == 128:
+        down_block_types = ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D")
+        block_out_channels = (32, 128, 256)
+        up_block_types = ("UpResnetBlock1D", "UpResnetBlock1D")
+
+    elif hor == 32:
+        down_block_types = ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D")
+        block_out_channels = (32, 64, 128, 256)
+        up_block_types = ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D")
+    model = torch.load(f"/Users/bglickenhaus/Documents/diffuser/temporal_unet-hopper-mediumv2-hor{hor}.torch")
+    state_dict = model.state_dict()
+    config = {
+        "down_block_types": down_block_types,
+        "block_out_channels": block_out_channels,
+        "up_block_types": up_block_types,
+        "layers_per_block": 1,
+        "use_timestep_embedding": True,
+        "out_block_type": "OutConv1DBlock",
+        "norm_num_groups": 8,
+        "downsample_each_block": False,
+        "in_channels": 14,
+        "out_channels": 14,
+        "extra_in_channels": 0,
+        "time_embedding_type": "positional",
+        "flip_sin_to_cos": False,
+        "freq_shift": 1,
+        "sample_size": 65536,
+        "mid_block_type": "MidResTemporalBlock1D",
+        "act_fn": "mish",
+    }
+    hf_value_function = UNet1DModel(**config)
+    print(f"length of state dict: {len(state_dict.keys())}")
+    print(f"length of value function dict: {len(hf_value_function.state_dict().keys())}")
+    mapping = dict(zip(model.state_dict().keys(), hf_value_function.state_dict().keys()))
+    for k, v in mapping.items():
+        state_dict[v] = state_dict.pop(k)
+    hf_value_function.load_state_dict(state_dict)
+
+    torch.save(hf_value_function.state_dict(), f"hub/hopper-medium-v2/unet/hor{hor}/diffusion_pytorch_model.bin")
+    with open(f"hub/hopper-medium-v2/unet/hor{hor}/config.json", "w") as f:
+        json.dump(config, f)
+
+
+def value_function():
+    config = {
+        "in_channels": 14,
+        "down_block_types": ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"),
+        "up_block_types": (),
+        "out_block_type": "ValueFunction",
+        "mid_block_type": "ValueFunctionMidBlock1D",
+        "block_out_channels": (32, 64, 128, 256),
+        "layers_per_block": 1,
+        "downsample_each_block": True,
+        "sample_size": 65536,
+        "out_channels": 14,
+        "extra_in_channels": 0,
+        "time_embedding_type": "positional",
+        "use_timestep_embedding": True,
+        "flip_sin_to_cos": False,
+        "freq_shift": 1,
+        "norm_num_groups": 8,
+        "act_fn": "mish",
+    }
+
+    model = torch.load("/Users/bglickenhaus/Documents/diffuser/value_function-hopper-mediumv2-hor32.torch")
+    state_dict = model
+    hf_value_function = UNet1DModel(**config)
+    print(f"length of state dict: {len(state_dict.keys())}")
+    print(f"length of value function dict: {len(hf_value_function.state_dict().keys())}")
+
+    mapping = dict(zip(state_dict.keys(), hf_value_function.state_dict().keys()))
+    for k, v in mapping.items():
+        state_dict[v] = state_dict.pop(k)
+
+    hf_value_function.load_state_dict(state_dict)
+
+    torch.save(hf_value_function.state_dict(), "hub/hopper-medium-v2/value_function/diffusion_pytorch_model.bin")
+    with open("hub/hopper-medium-v2/value_function/config.json", "w") as f:
+        json.dump(config, f)
+
+
+if __name__ == "__main__":
+    unet(32)
+    # unet(128)
+    value_function()
diff --git a/diffusers/scripts/convert_ms_text_to_video_to_diffusers.py b/diffusers/scripts/convert_ms_text_to_video_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..0251ab680d5956e21afa9327315139425d15b644
--- /dev/null
+++ b/diffusers/scripts/convert_ms_text_to_video_to_diffusers.py
@@ -0,0 +1,428 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the LDM checkpoints."""
+
+import argparse
+
+import torch
+
+from diffusers import UNet3DConditionModel
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        weight = old_checkpoint[path["old"]]
+        names = ["proj_attn.weight"]
+        names_2 = ["proj_out.weight", "proj_in.weight"]
+        if any(k in new_path for k in names):
+            checkpoint[new_path] = weight[:, :, 0]
+        elif any(k in new_path for k in names_2) and len(weight.shape) > 2 and ".attentions." not in new_path:
+            checkpoint[new_path] = weight[:, :, 0]
+        else:
+            checkpoint[new_path] = weight
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_temp_conv_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        mapping.append({"old": old_item, "new": old_item})
+
+    return mapping
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        if "temopral_conv" not in old_item:
+            mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    unet_key = "model.diffusion_model."
+
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        print(
+            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+        )
+        for key in keys:
+            if key.startswith("model.diffusion_model"):
+                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+    else:
+        if sum(k.startswith("model_ema") for k in keys) > 100:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+
+        for key in keys:
+            unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    if config["class_embed_type"] is None:
+        # No parameters to port
+        ...
+    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+    else:
+        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    first_temp_attention = [v for v in unet_state_dict if v.startswith("input_blocks.0.1")]
+    paths = renew_attention_paths(first_temp_attention)
+    meta_path = {"old": "input_blocks.0.1", "new": "transformer_in"}
+    assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+        temp_attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.2" in key]
+
+        if f"input_blocks.{i}.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        temporal_convs = [key for key in resnets if "temopral_conv" in key]
+        paths = renew_temp_conv_paths(temporal_convs)
+        meta_path = {
+            "old": f"input_blocks.{i}.0.temopral_conv",
+            "new": f"down_blocks.{block_id}.temp_convs.{layer_in_block_id}",
+        }
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+        if len(temp_attentions):
+            paths = renew_attention_paths(temp_attentions)
+            meta_path = {
+                "old": f"input_blocks.{i}.2",
+                "new": f"down_blocks.{block_id}.temp_attentions.{layer_in_block_id}",
+            }
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    temporal_convs_0 = [key for key in resnet_0 if "temopral_conv" in key]
+    attentions = middle_blocks[1]
+    temp_attentions = middle_blocks[2]
+    resnet_1 = middle_blocks[3]
+    temporal_convs_1 = [key for key in resnet_1 if "temopral_conv" in key]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    meta_path = {"old": "middle_block.0", "new": "mid_block.resnets.0"}
+    assign_to_checkpoint(
+        resnet_0_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
+    )
+
+    temp_conv_0_paths = renew_temp_conv_paths(temporal_convs_0)
+    meta_path = {"old": "middle_block.0.temopral_conv", "new": "mid_block.temp_convs.0"}
+    assign_to_checkpoint(
+        temp_conv_0_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
+    )
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    meta_path = {"old": "middle_block.3", "new": "mid_block.resnets.1"}
+    assign_to_checkpoint(
+        resnet_1_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
+    )
+
+    temp_conv_1_paths = renew_temp_conv_paths(temporal_convs_1)
+    meta_path = {"old": "middle_block.3.temopral_conv", "new": "mid_block.temp_convs.1"}
+    assign_to_checkpoint(
+        temp_conv_1_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
+    )
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    temp_attentions_paths = renew_attention_paths(temp_attentions)
+    meta_path = {"old": "middle_block.2", "new": "mid_block.temp_attentions.0"}
+    assign_to_checkpoint(
+        temp_attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+            temp_attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.2" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            temporal_convs = [key for key in resnets if "temopral_conv" in key]
+            paths = renew_temp_conv_paths(temporal_convs)
+            meta_path = {
+                "old": f"output_blocks.{i}.0.temopral_conv",
+                "new": f"up_blocks.{block_id}.temp_convs.{layer_in_block_id}",
+            }
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+
+            if len(temp_attentions):
+                paths = renew_attention_paths(temp_attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.2",
+                    "new": f"up_blocks.{block_id}.temp_attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+            temopral_conv_paths = [l for l in output_block_layers if "temopral_conv" in l]
+            for path in temopral_conv_paths:
+                pruned_path = path.split("temopral_conv.")[-1]
+                old_path = ".".join(["output_blocks", str(i), str(block_id), "temopral_conv", pruned_path])
+                new_path = ".".join(["up_blocks", str(block_id), "temp_convs", str(layer_in_block_id), pruned_path])
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    args = parser.parse_args()
+
+    unet_checkpoint = torch.load(args.checkpoint_path, map_location="cpu")
+    unet = UNet3DConditionModel()
+
+    converted_ckpt = convert_ldm_unet_checkpoint(unet_checkpoint, unet.config)
+
+    diff_0 = set(unet.state_dict().keys()) - set(converted_ckpt.keys())
+    diff_1 = set(converted_ckpt.keys()) - set(unet.state_dict().keys())
+
+    assert len(diff_0) == len(diff_1) == 0, "Converted weights don't match"
+
+    # load state_dict
+    unet.load_state_dict(converted_ckpt)
+
+    unet.save_pretrained(args.dump_path)
+
+    # -- finish converting the unet --
diff --git a/diffusers/scripts/convert_music_spectrogram_to_diffusers.py b/diffusers/scripts/convert_music_spectrogram_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ac9ceddb57eb7bfe4bf26498f7fd573c5f390c1
--- /dev/null
+++ b/diffusers/scripts/convert_music_spectrogram_to_diffusers.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+import argparse
+import os
+
+import jax as jnp
+import numpy as onp
+import torch
+import torch.nn as nn
+from music_spectrogram_diffusion import inference
+from t5x import checkpoints
+
+from diffusers import DDPMScheduler, OnnxRuntimeModel, SpectrogramDiffusionPipeline
+from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder
+
+
+MODEL = "base_with_context"
+
+
+def load_notes_encoder(weights, model):
+    model.token_embedder.weight = nn.Parameter(torch.Tensor(weights["token_embedder"]["embedding"]))
+    model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False)
+    for lyr_num, lyr in enumerate(model.encoders):
+        ly_weight = weights[f"layers_{lyr_num}"]
+        lyr.layer[0].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_attention_layer_norm"]["scale"]))
+
+        attention_weights = ly_weight["attention"]
+        lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T))
+        lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T))
+        lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T))
+        lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T))
+
+        lyr.layer[1].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"]))
+
+        lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T))
+        lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T))
+        lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T))
+
+    model.layer_norm.weight = nn.Parameter(torch.Tensor(weights["encoder_norm"]["scale"]))
+    return model
+
+
+def load_continuous_encoder(weights, model):
+    model.input_proj.weight = nn.Parameter(torch.Tensor(weights["input_proj"]["kernel"].T))
+
+    model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False)
+
+    for lyr_num, lyr in enumerate(model.encoders):
+        ly_weight = weights[f"layers_{lyr_num}"]
+        attention_weights = ly_weight["attention"]
+
+        lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T))
+        lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T))
+        lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T))
+        lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T))
+        lyr.layer[0].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_attention_layer_norm"]["scale"]))
+
+        lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T))
+        lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T))
+        lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T))
+        lyr.layer[1].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"]))
+
+    model.layer_norm.weight = nn.Parameter(torch.Tensor(weights["encoder_norm"]["scale"]))
+
+    return model
+
+
+def load_decoder(weights, model):
+    model.conditioning_emb[0].weight = nn.Parameter(torch.Tensor(weights["time_emb_dense0"]["kernel"].T))
+    model.conditioning_emb[2].weight = nn.Parameter(torch.Tensor(weights["time_emb_dense1"]["kernel"].T))
+
+    model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False)
+
+    model.continuous_inputs_projection.weight = nn.Parameter(
+        torch.Tensor(weights["continuous_inputs_projection"]["kernel"].T)
+    )
+
+    for lyr_num, lyr in enumerate(model.decoders):
+        ly_weight = weights[f"layers_{lyr_num}"]
+        lyr.layer[0].layer_norm.weight = nn.Parameter(
+            torch.Tensor(ly_weight["pre_self_attention_layer_norm"]["scale"])
+        )
+
+        lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter(
+            torch.Tensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T)
+        )
+
+        attention_weights = ly_weight["self_attention"]
+        lyr.layer[0].attention.to_q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T))
+        lyr.layer[0].attention.to_k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T))
+        lyr.layer[0].attention.to_v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T))
+        lyr.layer[0].attention.to_out[0].weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T))
+
+        attention_weights = ly_weight["MultiHeadDotProductAttention_0"]
+        lyr.layer[1].attention.to_q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T))
+        lyr.layer[1].attention.to_k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T))
+        lyr.layer[1].attention.to_v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T))
+        lyr.layer[1].attention.to_out[0].weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T))
+        lyr.layer[1].layer_norm.weight = nn.Parameter(
+            torch.Tensor(ly_weight["pre_cross_attention_layer_norm"]["scale"])
+        )
+
+        lyr.layer[2].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"]))
+        lyr.layer[2].film.scale_bias.weight = nn.Parameter(
+            torch.Tensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T)
+        )
+        lyr.layer[2].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T))
+        lyr.layer[2].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T))
+        lyr.layer[2].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T))
+
+    model.decoder_norm.weight = nn.Parameter(torch.Tensor(weights["decoder_norm"]["scale"]))
+
+    model.spec_out.weight = nn.Parameter(torch.Tensor(weights["spec_out_dense"]["kernel"].T))
+
+    return model
+
+
+def main(args):
+    t5_checkpoint = checkpoints.load_t5x_checkpoint(args.checkpoint_path)
+    t5_checkpoint = jnp.tree_util.tree_map(onp.array, t5_checkpoint)
+
+    gin_overrides = [
+        "from __gin__ import dynamic_registration",
+        "from music_spectrogram_diffusion.models.diffusion import diffusion_utils",
+        "diffusion_utils.ClassifierFreeGuidanceConfig.eval_condition_weight = 2.0",
+        "diffusion_utils.DiffusionConfig.classifier_free_guidance = @diffusion_utils.ClassifierFreeGuidanceConfig()",
+    ]
+
+    gin_file = os.path.join(args.checkpoint_path, "..", "config.gin")
+    gin_config = inference.parse_training_gin_file(gin_file, gin_overrides)
+    synth_model = inference.InferenceModel(args.checkpoint_path, gin_config)
+
+    scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large")
+
+    notes_encoder = SpectrogramNotesEncoder(
+        max_length=synth_model.sequence_length["inputs"],
+        vocab_size=synth_model.model.module.config.vocab_size,
+        d_model=synth_model.model.module.config.emb_dim,
+        dropout_rate=synth_model.model.module.config.dropout_rate,
+        num_layers=synth_model.model.module.config.num_encoder_layers,
+        num_heads=synth_model.model.module.config.num_heads,
+        d_kv=synth_model.model.module.config.head_dim,
+        d_ff=synth_model.model.module.config.mlp_dim,
+        feed_forward_proj="gated-gelu",
+    )
+
+    continuous_encoder = SpectrogramContEncoder(
+        input_dims=synth_model.audio_codec.n_dims,
+        targets_context_length=synth_model.sequence_length["targets_context"],
+        d_model=synth_model.model.module.config.emb_dim,
+        dropout_rate=synth_model.model.module.config.dropout_rate,
+        num_layers=synth_model.model.module.config.num_encoder_layers,
+        num_heads=synth_model.model.module.config.num_heads,
+        d_kv=synth_model.model.module.config.head_dim,
+        d_ff=synth_model.model.module.config.mlp_dim,
+        feed_forward_proj="gated-gelu",
+    )
+
+    decoder = T5FilmDecoder(
+        input_dims=synth_model.audio_codec.n_dims,
+        targets_length=synth_model.sequence_length["targets_context"],
+        max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time,
+        d_model=synth_model.model.module.config.emb_dim,
+        num_layers=synth_model.model.module.config.num_decoder_layers,
+        num_heads=synth_model.model.module.config.num_heads,
+        d_kv=synth_model.model.module.config.head_dim,
+        d_ff=synth_model.model.module.config.mlp_dim,
+        dropout_rate=synth_model.model.module.config.dropout_rate,
+    )
+
+    notes_encoder = load_notes_encoder(t5_checkpoint["target"]["token_encoder"], notes_encoder)
+    continuous_encoder = load_continuous_encoder(t5_checkpoint["target"]["continuous_encoder"], continuous_encoder)
+    decoder = load_decoder(t5_checkpoint["target"]["decoder"], decoder)
+
+    melgan = OnnxRuntimeModel.from_pretrained("kashif/soundstream_mel_decoder")
+
+    pipe = SpectrogramDiffusionPipeline(
+        notes_encoder=notes_encoder,
+        continuous_encoder=continuous_encoder,
+        decoder=decoder,
+        scheduler=scheduler,
+        melgan=melgan,
+    )
+    if args.save:
+        pipe.save_pretrained(args.output_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--output_path", default=None, type=str, required=True, help="Path to the converted model.")
+    parser.add_argument(
+        "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not."
+    )
+    parser.add_argument(
+        "--checkpoint_path",
+        default=f"{MODEL}/checkpoint_500000",
+        type=str,
+        required=False,
+        help="Path to the original jax model checkpoint.",
+    )
+    args = parser.parse_args()
+
+    main(args)
diff --git a/diffusers/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py b/diffusers/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d67123d9ad72e82f4d1b53ee3e1c8604de762c2
--- /dev/null
+++ b/diffusers/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
@@ -0,0 +1,185 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the NCSNPP checkpoints."""
+
+import argparse
+import json
+
+import torch
+
+from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel
+
+
+def convert_ncsnpp_checkpoint(checkpoint, config):
+    """
+    Takes a state dict and the path to
+    """
+    new_model_architecture = UNet2DModel(**config)
+    new_model_architecture.time_proj.W.data = checkpoint["all_modules.0.W"].data
+    new_model_architecture.time_proj.weight.data = checkpoint["all_modules.0.W"].data
+    new_model_architecture.time_embedding.linear_1.weight.data = checkpoint["all_modules.1.weight"].data
+    new_model_architecture.time_embedding.linear_1.bias.data = checkpoint["all_modules.1.bias"].data
+
+    new_model_architecture.time_embedding.linear_2.weight.data = checkpoint["all_modules.2.weight"].data
+    new_model_architecture.time_embedding.linear_2.bias.data = checkpoint["all_modules.2.bias"].data
+
+    new_model_architecture.conv_in.weight.data = checkpoint["all_modules.3.weight"].data
+    new_model_architecture.conv_in.bias.data = checkpoint["all_modules.3.bias"].data
+
+    new_model_architecture.conv_norm_out.weight.data = checkpoint[list(checkpoint.keys())[-4]].data
+    new_model_architecture.conv_norm_out.bias.data = checkpoint[list(checkpoint.keys())[-3]].data
+    new_model_architecture.conv_out.weight.data = checkpoint[list(checkpoint.keys())[-2]].data
+    new_model_architecture.conv_out.bias.data = checkpoint[list(checkpoint.keys())[-1]].data
+
+    module_index = 4
+
+    def set_attention_weights(new_layer, old_checkpoint, index):
+        new_layer.query.weight.data = old_checkpoint[f"all_modules.{index}.NIN_0.W"].data.T
+        new_layer.key.weight.data = old_checkpoint[f"all_modules.{index}.NIN_1.W"].data.T
+        new_layer.value.weight.data = old_checkpoint[f"all_modules.{index}.NIN_2.W"].data.T
+
+        new_layer.query.bias.data = old_checkpoint[f"all_modules.{index}.NIN_0.b"].data
+        new_layer.key.bias.data = old_checkpoint[f"all_modules.{index}.NIN_1.b"].data
+        new_layer.value.bias.data = old_checkpoint[f"all_modules.{index}.NIN_2.b"].data
+
+        new_layer.proj_attn.weight.data = old_checkpoint[f"all_modules.{index}.NIN_3.W"].data.T
+        new_layer.proj_attn.bias.data = old_checkpoint[f"all_modules.{index}.NIN_3.b"].data
+
+        new_layer.group_norm.weight.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.weight"].data
+        new_layer.group_norm.bias.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.bias"].data
+
+    def set_resnet_weights(new_layer, old_checkpoint, index):
+        new_layer.conv1.weight.data = old_checkpoint[f"all_modules.{index}.Conv_0.weight"].data
+        new_layer.conv1.bias.data = old_checkpoint[f"all_modules.{index}.Conv_0.bias"].data
+        new_layer.norm1.weight.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.weight"].data
+        new_layer.norm1.bias.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.bias"].data
+
+        new_layer.conv2.weight.data = old_checkpoint[f"all_modules.{index}.Conv_1.weight"].data
+        new_layer.conv2.bias.data = old_checkpoint[f"all_modules.{index}.Conv_1.bias"].data
+        new_layer.norm2.weight.data = old_checkpoint[f"all_modules.{index}.GroupNorm_1.weight"].data
+        new_layer.norm2.bias.data = old_checkpoint[f"all_modules.{index}.GroupNorm_1.bias"].data
+
+        new_layer.time_emb_proj.weight.data = old_checkpoint[f"all_modules.{index}.Dense_0.weight"].data
+        new_layer.time_emb_proj.bias.data = old_checkpoint[f"all_modules.{index}.Dense_0.bias"].data
+
+        if new_layer.in_channels != new_layer.out_channels or new_layer.up or new_layer.down:
+            new_layer.conv_shortcut.weight.data = old_checkpoint[f"all_modules.{index}.Conv_2.weight"].data
+            new_layer.conv_shortcut.bias.data = old_checkpoint[f"all_modules.{index}.Conv_2.bias"].data
+
+    for i, block in enumerate(new_model_architecture.downsample_blocks):
+        has_attentions = hasattr(block, "attentions")
+        for j in range(len(block.resnets)):
+            set_resnet_weights(block.resnets[j], checkpoint, module_index)
+            module_index += 1
+            if has_attentions:
+                set_attention_weights(block.attentions[j], checkpoint, module_index)
+                module_index += 1
+
+        if hasattr(block, "downsamplers") and block.downsamplers is not None:
+            set_resnet_weights(block.resnet_down, checkpoint, module_index)
+            module_index += 1
+            block.skip_conv.weight.data = checkpoint[f"all_modules.{module_index}.Conv_0.weight"].data
+            block.skip_conv.bias.data = checkpoint[f"all_modules.{module_index}.Conv_0.bias"].data
+            module_index += 1
+
+    set_resnet_weights(new_model_architecture.mid_block.resnets[0], checkpoint, module_index)
+    module_index += 1
+    set_attention_weights(new_model_architecture.mid_block.attentions[0], checkpoint, module_index)
+    module_index += 1
+    set_resnet_weights(new_model_architecture.mid_block.resnets[1], checkpoint, module_index)
+    module_index += 1
+
+    for i, block in enumerate(new_model_architecture.up_blocks):
+        has_attentions = hasattr(block, "attentions")
+        for j in range(len(block.resnets)):
+            set_resnet_weights(block.resnets[j], checkpoint, module_index)
+            module_index += 1
+        if has_attentions:
+            set_attention_weights(
+                block.attentions[0], checkpoint, module_index
+            )  # why can there only be a single attention layer for up?
+            module_index += 1
+
+        if hasattr(block, "resnet_up") and block.resnet_up is not None:
+            block.skip_norm.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data
+            block.skip_norm.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data
+            module_index += 1
+            block.skip_conv.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data
+            block.skip_conv.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data
+            module_index += 1
+            set_resnet_weights(block.resnet_up, checkpoint, module_index)
+            module_index += 1
+
+    new_model_architecture.conv_norm_out.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data
+    new_model_architecture.conv_norm_out.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data
+    module_index += 1
+    new_model_architecture.conv_out.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data
+    new_model_architecture.conv_out.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data
+
+    return new_model_architecture.state_dict()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path",
+        default="/Users/arthurzucker/Work/diffusers/ArthurZ/diffusion_pytorch_model.bin",
+        type=str,
+        required=False,
+        help="Path to the checkpoint to convert.",
+    )
+
+    parser.add_argument(
+        "--config_file",
+        default="/Users/arthurzucker/Work/diffusers/ArthurZ/config.json",
+        type=str,
+        required=False,
+        help="The config json file corresponding to the architecture.",
+    )
+
+    parser.add_argument(
+        "--dump_path",
+        default="/Users/arthurzucker/Work/diffusers/ArthurZ/diffusion_model_new.pt",
+        type=str,
+        required=False,
+        help="Path to the output model.",
+    )
+
+    args = parser.parse_args()
+
+    checkpoint = torch.load(args.checkpoint_path, map_location="cpu")
+
+    with open(args.config_file) as f:
+        config = json.loads(f.read())
+
+    converted_checkpoint = convert_ncsnpp_checkpoint(
+        checkpoint,
+        config,
+    )
+
+    if "sde" in config:
+        del config["sde"]
+
+    model = UNet2DModel(**config)
+    model.load_state_dict(converted_checkpoint)
+
+    try:
+        scheduler = ScoreSdeVeScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1]))
+
+        pipe = ScoreSdeVePipeline(unet=model, scheduler=scheduler)
+        pipe.save_pretrained(args.dump_path)
+    except:  # noqa: E722
+        model.save_pretrained(args.dump_path)
diff --git a/diffusers/scripts/convert_original_audioldm2_to_diffusers.py b/diffusers/scripts/convert_original_audioldm2_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea9c02d53815ef03496747b13afec7ca70f85689
--- /dev/null
+++ b/diffusers/scripts/convert_original_audioldm2_to_diffusers.py
@@ -0,0 +1,1135 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the AudioLDM2 checkpoints."""
+
+import argparse
+import re
+from typing import List, Union
+
+import torch
+import yaml
+from transformers import (
+    AutoFeatureExtractor,
+    AutoTokenizer,
+    ClapConfig,
+    ClapModel,
+    GPT2Config,
+    GPT2Model,
+    SpeechT5HifiGan,
+    SpeechT5HifiGanConfig,
+    T5Config,
+    T5EncoderModel,
+)
+
+from diffusers import (
+    AudioLDM2Pipeline,
+    AudioLDM2ProjectionModel,
+    AudioLDM2UNet2DConditionModel,
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from diffusers.utils import is_safetensors_available
+from diffusers.utils.import_utils import BACKENDS_MAPPING
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_resnet_paths
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_attention_paths
+def renew_attention_paths(old_list):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "to_q.weight")
+        new_item = new_item.replace("q.bias", "to_q.bias")
+
+        new_item = new_item.replace("k.weight", "to_k.weight")
+        new_item = new_item.replace("k.bias", "to_k.bias")
+
+        new_item = new_item.replace("v.weight", "to_v.weight")
+        new_item = new_item.replace("v.bias", "to_v.bias")
+
+        new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
+        new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["to_q.weight", "to_k.weight", "to_v.weight"]
+    proj_key = "to_out.0.weight"
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys or ".".join(key.split(".")[-3:]) == proj_key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key].squeeze()
+
+
+def create_unet_diffusers_config(original_config, image_size: int):
+    """
+    Creates a UNet config for diffusers based on the config of the original AudioLDM2 model.
+    """
+    unet_params = original_config["model"]["params"]["unet_config"]["params"]
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+
+    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)
+
+    cross_attention_dim = list(unet_params["context_dim"]) if "context_dim" in unet_params else block_out_channels
+    if len(cross_attention_dim) > 1:
+        # require two or more cross-attention layers per-block, each of different dimension
+        cross_attention_dim = [cross_attention_dim for _ in range(len(block_out_channels))]
+
+    config = {
+        "sample_size": image_size // vae_scale_factor,
+        "in_channels": unet_params["in_channels"],
+        "out_channels": unet_params["out_channels"],
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params["num_res_blocks"],
+        "transformer_layers_per_block": unet_params["transformer_depth"],
+        "cross_attention_dim": tuple(cross_attention_dim),
+    }
+
+    return config
+
+
+# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_vae_diffusers_config
+def create_vae_diffusers_config(original_config, checkpoint, image_size: int):
+    """
+    Creates a VAE config for diffusers based on the config of the original AudioLDM2 model. Compared to the original
+    Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE.
+    """
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+    _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"]
+
+    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config["model"]["params"] else 0.18215
+
+    config = {
+        "sample_size": image_size,
+        "in_channels": vae_params["in_channels"],
+        "out_channels": vae_params["out_ch"],
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "latent_channels": vae_params["z_channels"],
+        "layers_per_block": vae_params["num_res_blocks"],
+        "scaling_factor": float(scaling_factor),
+    }
+    return config
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular
+def create_diffusers_schedular(original_config):
+    schedular = DDIMScheduler(
+        num_train_timesteps=original_config["model"]["params"]["timesteps"],
+        beta_start=original_config["model"]["params"]["linear_start"],
+        beta_end=original_config["model"]["params"]["linear_end"],
+        beta_schedule="scaled_linear",
+    )
+    return schedular
+
+
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
+    """
+    Takes a state dict and a config, and returns a converted UNet checkpoint.
+    """
+
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    unet_key = "model.diffusion_model."
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        print(
+            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+        )
+        for key in keys:
+            if key.startswith("model.diffusion_model"):
+                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+    else:
+        if sum(k.startswith("model_ema") for k in keys) > 100:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+
+        # strip the unet prefix from the weight names
+        for key in keys:
+            if key.startswith(unet_key):
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}." in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    # Check how many Transformer blocks we have per layer
+    if isinstance(config.get("cross_attention_dim"), (list, tuple)):
+        if isinstance(config["cross_attention_dim"][0], (list, tuple)):
+            # in this case we have multiple cross-attention layers per-block
+            num_attention_layers = len(config.get("cross_attention_dim")[0])
+    else:
+        num_attention_layers = 1
+
+    if config.get("extra_self_attn_layer"):
+        num_attention_layers += 1
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.0" not in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = [
+                {
+                    "old": f"input_blocks.{i}.{1 + layer_id}",
+                    "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id * num_attention_layers + layer_id}",
+                }
+                for layer_id in range(num_attention_layers)
+            ]
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=meta_path, config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    resnet_1 = middle_blocks[num_middle_blocks - 1]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    meta_path = {"old": "middle_block.0", "new": "mid_block.resnets.0"}
+    assign_to_checkpoint(
+        resnet_0_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    meta_path = {"old": f"middle_block.{len(middle_blocks) - 1}", "new": "mid_block.resnets.1"}
+    assign_to_checkpoint(
+        resnet_1_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(1, num_middle_blocks - 1):
+        attentions = middle_blocks[i]
+        attentions_paths = renew_attention_paths(attentions)
+        meta_path = {"old": f"middle_block.{i}", "new": f"mid_block.attentions.{i - 1}"}
+        assign_to_checkpoint(
+            attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.0" not in key]
+
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                attentions.remove(f"output_blocks.{i}.{index}.conv.bias")
+                attentions.remove(f"output_blocks.{i}.{index}.conv.weight")
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = [
+                    {
+                        "old": f"output_blocks.{i}.{1 + layer_id}",
+                        "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id * num_attention_layers + layer_id}",
+                    }
+                    for layer_id in range(num_attention_layers)
+                ]
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=meta_path, config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    vae_key = "first_stage_model."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+CLAP_KEYS_TO_MODIFY_MAPPING = {
+    "text_branch": "text_model",
+    "audio_branch": "audio_model.audio_encoder",
+    "attn": "attention.self",
+    "self.proj": "output.dense",
+    "attention.self_mask": "attn_mask",
+    "mlp.fc1": "intermediate.dense",
+    "mlp.fc2": "output.dense",
+    "norm1": "layernorm_before",
+    "norm2": "layernorm_after",
+    "bn0": "batch_norm",
+}
+
+CLAP_KEYS_TO_IGNORE = [
+    "text_transform",
+    "audio_transform",
+    "stft",
+    "logmel_extractor",
+    "tscam_conv",
+    "head",
+    "attn_mask",
+]
+
+CLAP_EXPECTED_MISSING_KEYS = ["text_model.embeddings.token_type_ids"]
+
+
+def convert_open_clap_checkpoint(checkpoint):
+    """
+    Takes a state dict and returns a converted CLAP checkpoint.
+    """
+    # extract state dict for CLAP text embedding model, discarding the audio component
+    model_state_dict = {}
+    model_key = "clap.model."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(model_key):
+            model_state_dict[key.replace(model_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    sequential_layers_pattern = r".*sequential.(\d+).*"
+    text_projection_pattern = r".*_projection.(\d+).*"
+
+    for key, value in model_state_dict.items():
+        # check if key should be ignored in mapping - if so map it to a key name that we'll filter out at the end
+        for key_to_ignore in CLAP_KEYS_TO_IGNORE:
+            if key_to_ignore in key:
+                key = "spectrogram"
+
+        # check if any key needs to be modified
+        for key_to_modify, new_key in CLAP_KEYS_TO_MODIFY_MAPPING.items():
+            if key_to_modify in key:
+                key = key.replace(key_to_modify, new_key)
+
+        if re.match(sequential_layers_pattern, key):
+            # replace sequential layers with list
+            sequential_layer = re.match(sequential_layers_pattern, key).group(1)
+
+            key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.")
+        elif re.match(text_projection_pattern, key):
+            projecton_layer = int(re.match(text_projection_pattern, key).group(1))
+
+            # Because in CLAP they use `nn.Sequential`...
+            transformers_projection_layer = 1 if projecton_layer == 0 else 2
+
+            key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.")
+
+        if "audio" and "qkv" in key:
+            # split qkv into query key and value
+            mixed_qkv = value
+            qkv_dim = mixed_qkv.size(0) // 3
+
+            query_layer = mixed_qkv[:qkv_dim]
+            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
+            value_layer = mixed_qkv[qkv_dim * 2 :]
+
+            new_checkpoint[key.replace("qkv", "query")] = query_layer
+            new_checkpoint[key.replace("qkv", "key")] = key_layer
+            new_checkpoint[key.replace("qkv", "value")] = value_layer
+        elif key != "spectrogram":
+            new_checkpoint[key] = value
+
+    return new_checkpoint
+
+
+def create_transformers_vocoder_config(original_config):
+    """
+    Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model.
+    """
+    vocoder_params = original_config["model"]["params"]["vocoder_config"]["params"]
+
+    config = {
+        "model_in_dim": vocoder_params["num_mels"],
+        "sampling_rate": vocoder_params["sampling_rate"],
+        "upsample_initial_channel": vocoder_params["upsample_initial_channel"],
+        "upsample_rates": list(vocoder_params["upsample_rates"]),
+        "upsample_kernel_sizes": list(vocoder_params["upsample_kernel_sizes"]),
+        "resblock_kernel_sizes": list(vocoder_params["resblock_kernel_sizes"]),
+        "resblock_dilation_sizes": [
+            list(resblock_dilation) for resblock_dilation in vocoder_params["resblock_dilation_sizes"]
+        ],
+        "normalize_before": False,
+    }
+
+    return config
+
+
+def extract_sub_model(checkpoint, key_prefix):
+    """
+    Takes a state dict and returns the state dict for a particular sub-model.
+    """
+
+    sub_model_state_dict = {}
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(key_prefix):
+            sub_model_state_dict[key.replace(key_prefix, "")] = checkpoint.get(key)
+
+    return sub_model_state_dict
+
+
+def convert_hifigan_checkpoint(checkpoint, config):
+    """
+    Takes a state dict and config, and returns a converted HiFiGAN vocoder checkpoint.
+    """
+    # extract state dict for vocoder
+    vocoder_state_dict = extract_sub_model(checkpoint, key_prefix="first_stage_model.vocoder.")
+
+    # fix upsampler keys, everything else is correct already
+    for i in range(len(config.upsample_rates)):
+        vocoder_state_dict[f"upsampler.{i}.weight"] = vocoder_state_dict.pop(f"ups.{i}.weight")
+        vocoder_state_dict[f"upsampler.{i}.bias"] = vocoder_state_dict.pop(f"ups.{i}.bias")
+
+    if not config.normalize_before:
+        # if we don't set normalize_before then these variables are unused, so we set them to their initialised values
+        vocoder_state_dict["mean"] = torch.zeros(config.model_in_dim)
+        vocoder_state_dict["scale"] = torch.ones(config.model_in_dim)
+
+    return vocoder_state_dict
+
+
+def convert_projection_checkpoint(checkpoint):
+    projection_state_dict = {}
+    conditioner_state_dict = extract_sub_model(checkpoint, key_prefix="cond_stage_models.0.")
+
+    projection_state_dict["sos_embed"] = conditioner_state_dict["start_of_sequence_tokens.weight"][0]
+    projection_state_dict["sos_embed_1"] = conditioner_state_dict["start_of_sequence_tokens.weight"][1]
+
+    projection_state_dict["eos_embed"] = conditioner_state_dict["end_of_sequence_tokens.weight"][0]
+    projection_state_dict["eos_embed_1"] = conditioner_state_dict["end_of_sequence_tokens.weight"][1]
+
+    projection_state_dict["projection.weight"] = conditioner_state_dict["input_sequence_embed_linear.0.weight"]
+    projection_state_dict["projection.bias"] = conditioner_state_dict["input_sequence_embed_linear.0.bias"]
+
+    projection_state_dict["projection_1.weight"] = conditioner_state_dict["input_sequence_embed_linear.1.weight"]
+    projection_state_dict["projection_1.bias"] = conditioner_state_dict["input_sequence_embed_linear.1.bias"]
+
+    return projection_state_dict
+
+
+# Adapted from https://github.com/haoheliu/AudioLDM2/blob/81ad2c6ce015c1310387695e2dae975a7d2ed6fd/audioldm2/utils.py#L143
+DEFAULT_CONFIG = {
+    "model": {
+        "params": {
+            "linear_start": 0.0015,
+            "linear_end": 0.0195,
+            "timesteps": 1000,
+            "channels": 8,
+            "scale_by_std": True,
+            "unet_config": {
+                "target": "audioldm2.latent_diffusion.openaimodel.UNetModel",
+                "params": {
+                    "context_dim": [None, 768, 1024],
+                    "in_channels": 8,
+                    "out_channels": 8,
+                    "model_channels": 128,
+                    "attention_resolutions": [8, 4, 2],
+                    "num_res_blocks": 2,
+                    "channel_mult": [1, 2, 3, 5],
+                    "num_head_channels": 32,
+                    "transformer_depth": 1,
+                },
+            },
+            "first_stage_config": {
+                "target": "audioldm2.variational_autoencoder.autoencoder.AutoencoderKL",
+                "params": {
+                    "embed_dim": 8,
+                    "ddconfig": {
+                        "z_channels": 8,
+                        "resolution": 256,
+                        "in_channels": 1,
+                        "out_ch": 1,
+                        "ch": 128,
+                        "ch_mult": [1, 2, 4],
+                        "num_res_blocks": 2,
+                    },
+                },
+            },
+            "cond_stage_config": {
+                "crossattn_audiomae_generated": {
+                    "target": "audioldm2.latent_diffusion.modules.encoders.modules.SequenceGenAudioMAECond",
+                    "params": {
+                        "sequence_gen_length": 8,
+                        "sequence_input_embed_dim": [512, 1024],
+                    },
+                }
+            },
+            "vocoder_config": {
+                "target": "audioldm2.first_stage_model.vocoder",
+                "params": {
+                    "upsample_rates": [5, 4, 2, 2, 2],
+                    "upsample_kernel_sizes": [16, 16, 8, 4, 4],
+                    "upsample_initial_channel": 1024,
+                    "resblock_kernel_sizes": [3, 7, 11],
+                    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                    "num_mels": 64,
+                    "sampling_rate": 16000,
+                },
+            },
+        },
+    },
+}
+
+
+def load_pipeline_from_original_AudioLDM2_ckpt(
+    checkpoint_path: str,
+    original_config_file: str = None,
+    image_size: int = 1024,
+    prediction_type: str = None,
+    extract_ema: bool = False,
+    scheduler_type: str = "ddim",
+    cross_attention_dim: Union[List, List[List]] = None,
+    transformer_layers_per_block: int = None,
+    device: str = None,
+    from_safetensors: bool = False,
+) -> AudioLDM2Pipeline:
+    """
+    Load an AudioLDM2 pipeline object from a `.ckpt`/`.safetensors` file and (ideally) a `.yaml` config file.
+
+    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
+    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
+    recommended that you override the default values and/or supply an `original_config_file` wherever possible.
+
+    Args:
+        checkpoint_path (`str`): Path to `.ckpt` file.
+        original_config_file (`str`):
+            Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
+            set to the AudioLDM2 base config.
+        image_size (`int`, *optional*, defaults to 1024):
+            The image size that the model was trained on.
+        prediction_type (`str`, *optional*):
+            The prediction type that the model was trained on. If `None`, will be automatically
+            inferred by looking for a key in the config. For the default config, the prediction type is `'epsilon'`.
+        scheduler_type (`str`, *optional*, defaults to 'ddim'):
+            Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
+            "ddim"]`.
+        cross_attention_dim (`list`, *optional*, defaults to `None`):
+            The dimension of the cross-attention layers. If `None`, the cross-attention dimension will be
+            automatically inferred. Set to `[768, 1024]` for the base model, or `[768, 1024, None]` for the large model.
+        transformer_layers_per_block (`int`, *optional*, defaults to `None`):
+            The number of transformer layers in each transformer block. If `None`, number of layers will be "
+             "automatically inferred. Set to `1` for the base model, or `2` for the large model.
+        extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
+            checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
+            `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
+            inference. Non-EMA weights are usually better to continue fine-tuning.
+        device (`str`, *optional*, defaults to `None`):
+            The device to use. Pass `None` to determine automatically.
+        from_safetensors (`str`, *optional*, defaults to `False`):
+            If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
+        return: An AudioLDM2Pipeline object representing the passed-in `.ckpt`/`.safetensors` file.
+    """
+
+    if from_safetensors:
+        if not is_safetensors_available():
+            raise ValueError(BACKENDS_MAPPING["safetensors"][1])
+
+        from safetensors import safe_open
+
+        checkpoint = {}
+        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                checkpoint[key] = f.get_tensor(key)
+    else:
+        if device is None:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+        else:
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+
+    if "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    if original_config_file is None:
+        original_config = DEFAULT_CONFIG
+    else:
+        original_config = yaml.safe_load(original_config_file)
+
+    if image_size is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["image_size"] = image_size
+
+    if cross_attention_dim is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["context_dim"] = cross_attention_dim
+
+    if transformer_layers_per_block is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["transformer_depth"] = transformer_layers_per_block
+
+    if (
+        "parameterization" in original_config["model"]["params"]
+        and original_config["model"]["params"]["parameterization"] == "v"
+    ):
+        if prediction_type is None:
+            prediction_type = "v_prediction"
+    else:
+        if prediction_type is None:
+            prediction_type = "epsilon"
+
+    num_train_timesteps = original_config["model"]["params"]["timesteps"]
+    beta_start = original_config["model"]["params"]["linear_start"]
+    beta_end = original_config["model"]["params"]["linear_end"]
+
+    scheduler = DDIMScheduler(
+        beta_end=beta_end,
+        beta_schedule="scaled_linear",
+        beta_start=beta_start,
+        num_train_timesteps=num_train_timesteps,
+        steps_offset=1,
+        clip_sample=False,
+        set_alpha_to_one=False,
+        prediction_type=prediction_type,
+    )
+    # make sure scheduler works correctly with DDIM
+    scheduler.register_to_config(clip_sample=False)
+
+    if scheduler_type == "pndm":
+        config = dict(scheduler.config)
+        config["skip_prk_steps"] = True
+        scheduler = PNDMScheduler.from_config(config)
+    elif scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "heun":
+        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+    elif scheduler_type == "ddim":
+        scheduler = scheduler
+    else:
+        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+
+    # Convert the UNet2DModel
+    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    unet = AudioLDM2UNet2DConditionModel(**unet_config)
+
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
+    )
+
+    unet.load_state_dict(converted_unet_checkpoint)
+
+    # Convert the VAE model
+    vae_config = create_vae_diffusers_config(original_config, checkpoint=checkpoint, image_size=image_size)
+    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+    vae = AutoencoderKL(**vae_config)
+    vae.load_state_dict(converted_vae_checkpoint)
+
+    # Convert the joint audio-text encoding model
+    clap_config = ClapConfig.from_pretrained("laion/clap-htsat-unfused")
+    clap_config.audio_config.update(
+        {
+            "patch_embeds_hidden_size": 128,
+            "hidden_size": 1024,
+            "depths": [2, 2, 12, 2],
+        }
+    )
+    # AudioLDM2 uses the same tokenizer and feature extractor as the original CLAP model
+    clap_tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
+    clap_feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
+
+    converted_clap_model = convert_open_clap_checkpoint(checkpoint)
+    clap_model = ClapModel(clap_config)
+
+    missing_keys, unexpected_keys = clap_model.load_state_dict(converted_clap_model, strict=False)
+    # we expect not to have token_type_ids in our original state dict so let's ignore them
+    missing_keys = list(set(missing_keys) - set(CLAP_EXPECTED_MISSING_KEYS))
+
+    if len(unexpected_keys) > 0:
+        raise ValueError(f"Unexpected keys when loading CLAP model: {unexpected_keys}")
+
+    if len(missing_keys) > 0:
+        raise ValueError(f"Missing keys when loading CLAP model: {missing_keys}")
+
+    # Convert the vocoder model
+    vocoder_config = create_transformers_vocoder_config(original_config)
+    vocoder_config = SpeechT5HifiGanConfig(**vocoder_config)
+    converted_vocoder_checkpoint = convert_hifigan_checkpoint(checkpoint, vocoder_config)
+
+    vocoder = SpeechT5HifiGan(vocoder_config)
+    vocoder.load_state_dict(converted_vocoder_checkpoint)
+
+    # Convert the Flan-T5 encoder model: AudioLDM2 uses the same configuration and tokenizer as the original Flan-T5 large model
+    t5_config = T5Config.from_pretrained("google/flan-t5-large")
+    converted_t5_checkpoint = extract_sub_model(checkpoint, key_prefix="cond_stage_models.1.model.")
+
+    t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
+    # hard-coded in the original implementation (i.e. not retrievable from the config)
+    t5_tokenizer.model_max_length = 128
+    t5_model = T5EncoderModel(t5_config)
+    t5_model.load_state_dict(converted_t5_checkpoint)
+
+    # Convert the GPT2 encoder model: AudioLDM2 uses the same configuration as the original GPT2 base model
+    gpt2_config = GPT2Config.from_pretrained("gpt2")
+    gpt2_model = GPT2Model(gpt2_config)
+    gpt2_model.config.max_new_tokens = original_config["model"]["params"]["cond_stage_config"][
+        "crossattn_audiomae_generated"
+    ]["params"]["sequence_gen_length"]
+
+    converted_gpt2_checkpoint = extract_sub_model(checkpoint, key_prefix="cond_stage_models.0.model.")
+    gpt2_model.load_state_dict(converted_gpt2_checkpoint)
+
+    # Convert the extra embedding / projection layers
+    projection_model = AudioLDM2ProjectionModel(clap_config.projection_dim, t5_config.d_model, gpt2_config.n_embd)
+
+    converted_projection_checkpoint = convert_projection_checkpoint(checkpoint)
+    projection_model.load_state_dict(converted_projection_checkpoint)
+
+    # Instantiate the diffusers pipeline
+    pipe = AudioLDM2Pipeline(
+        vae=vae,
+        text_encoder=clap_model,
+        text_encoder_2=t5_model,
+        projection_model=projection_model,
+        language_model=gpt2_model,
+        tokenizer=clap_tokenizer,
+        tokenizer_2=t5_tokenizer,
+        feature_extractor=clap_feature_extractor,
+        unet=unet,
+        scheduler=scheduler,
+        vocoder=vocoder,
+    )
+
+    return pipe
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--original_config_file",
+        default=None,
+        type=str,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+    parser.add_argument(
+        "--cross_attention_dim",
+        default=None,
+        type=int,
+        nargs="+",
+        help="The dimension of the cross-attention layers. If `None`, the cross-attention dimension will be "
+        "automatically inferred. Set to `768+1024` for the base model, or `768+1024+640` for the large model",
+    )
+    parser.add_argument(
+        "--transformer_layers_per_block",
+        default=None,
+        type=int,
+        help="The number of transformer layers in each transformer block. If `None`, number of layers will be "
+        "automatically inferred. Set to `1` for the base model, or `2` for the large model.",
+    )
+    parser.add_argument(
+        "--scheduler_type",
+        default="ddim",
+        type=str,
+        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']",
+    )
+    parser.add_argument(
+        "--image_size",
+        default=1048,
+        type=int,
+        help="The image size that the model was trained on.",
+    )
+    parser.add_argument(
+        "--prediction_type",
+        default=None,
+        type=str,
+        help=("The prediction type that the model was trained on."),
+    )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
+    parser.add_argument(
+        "--from_safetensors",
+        action="store_true",
+        help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
+    )
+    parser.add_argument(
+        "--to_safetensors",
+        action="store_true",
+        help="Whether to store pipeline in safetensors format or not.",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
+    args = parser.parse_args()
+
+    pipe = load_pipeline_from_original_AudioLDM2_ckpt(
+        checkpoint_path=args.checkpoint_path,
+        original_config_file=args.original_config_file,
+        image_size=args.image_size,
+        prediction_type=args.prediction_type,
+        extract_ema=args.extract_ema,
+        scheduler_type=args.scheduler_type,
+        cross_attention_dim=args.cross_attention_dim,
+        transformer_layers_per_block=args.transformer_layers_per_block,
+        from_safetensors=args.from_safetensors,
+        device=args.device,
+    )
+    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
diff --git a/diffusers/scripts/convert_original_audioldm_to_diffusers.py b/diffusers/scripts/convert_original_audioldm_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..797d1982609130aa7ecaa10369ab6b926d9493b0
--- /dev/null
+++ b/diffusers/scripts/convert_original_audioldm_to_diffusers.py
@@ -0,0 +1,1042 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the AudioLDM checkpoints."""
+
+import argparse
+import re
+
+import torch
+import yaml
+from transformers import (
+    AutoTokenizer,
+    ClapTextConfig,
+    ClapTextModelWithProjection,
+    SpeechT5HifiGan,
+    SpeechT5HifiGanConfig,
+)
+
+from diffusers import (
+    AudioLDMPipeline,
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_resnet_paths
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_attention_paths
+def renew_attention_paths(old_list):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_attention_paths
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "query.weight")
+        new_item = new_item.replace("q.bias", "query.bias")
+
+        new_item = new_item.replace("k.weight", "key.weight")
+        new_item = new_item.replace("k.bias", "key.bias")
+
+        new_item = new_item.replace("v.weight", "value.weight")
+        new_item = new_item.replace("v.bias", "value.bias")
+
+        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
+        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.assign_to_checkpoint
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def create_unet_diffusers_config(original_config, image_size: int):
+    """
+    Creates a UNet config for diffusers based on the config of the original AudioLDM model.
+    """
+    unet_params = original_config["model"]["params"]["unet_config"]["params"]
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+
+    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)
+
+    cross_attention_dim = (
+        unet_params["cross_attention_dim"] if "cross_attention_dim" in unet_params else block_out_channels
+    )
+
+    class_embed_type = "simple_projection" if "extra_film_condition_dim" in unet_params else None
+    projection_class_embeddings_input_dim = (
+        unet_params["extra_film_condition_dim"] if "extra_film_condition_dim" in unet_params else None
+    )
+    class_embeddings_concat = unet_params["extra_film_use_concat"] if "extra_film_use_concat" in unet_params else None
+
+    config = {
+        "sample_size": image_size // vae_scale_factor,
+        "in_channels": unet_params["in_channels"],
+        "out_channels": unet_params["out_channels"],
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params["num_res_blocks"],
+        "cross_attention_dim": cross_attention_dim,
+        "class_embed_type": class_embed_type,
+        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
+        "class_embeddings_concat": class_embeddings_concat,
+    }
+
+    return config
+
+
+# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_vae_diffusers_config
+def create_vae_diffusers_config(original_config, checkpoint, image_size: int):
+    """
+    Creates a VAE config for diffusers based on the config of the original AudioLDM model. Compared to the original
+    Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE.
+    """
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+    _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"]
+
+    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config["model"]["params"] else 0.18215
+
+    config = {
+        "sample_size": image_size,
+        "in_channels": vae_params["in_channels"],
+        "out_channels": vae_params["out_ch"],
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "latent_channels": vae_params["z_channels"],
+        "layers_per_block": vae_params["num_res_blocks"],
+        "scaling_factor": float(scaling_factor),
+    }
+    return config
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular
+def create_diffusers_schedular(original_config):
+    schedular = DDIMScheduler(
+        num_train_timesteps=original_config["model"]["params"]["timesteps"],
+        beta_start=original_config["model"]["params"]["linear_start"],
+        beta_end=original_config["model"]["params"]["linear_end"],
+        beta_schedule="scaled_linear",
+    )
+    return schedular
+
+
+# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_unet_checkpoint
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint. Compared to the original Stable Diffusion
+    conversion, this function additionally converts the learnt film embedding linear layer.
+    """
+
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    unet_key = "model.diffusion_model."
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        print(
+            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+        )
+        for key in keys:
+            if key.startswith("model.diffusion_model"):
+                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+    else:
+        if sum(k.startswith("model_ema") for k in keys) > 100:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+
+        for key in keys:
+            if key.startswith(unet_key):
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    new_checkpoint["class_embedding.weight"] = unet_state_dict["film_emb.weight"]
+    new_checkpoint["class_embedding.bias"] = unet_state_dict["film_emb.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_vae_checkpoint
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    vae_key = "first_stage_model."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+CLAP_KEYS_TO_MODIFY_MAPPING = {
+    "text_branch": "text_model",
+    "attn": "attention.self",
+    "self.proj": "output.dense",
+    "attention.self_mask": "attn_mask",
+    "mlp.fc1": "intermediate.dense",
+    "mlp.fc2": "output.dense",
+    "norm1": "layernorm_before",
+    "norm2": "layernorm_after",
+    "bn0": "batch_norm",
+}
+
+CLAP_KEYS_TO_IGNORE = ["text_transform"]
+
+CLAP_EXPECTED_MISSING_KEYS = ["text_model.embeddings.token_type_ids"]
+
+
+def convert_open_clap_checkpoint(checkpoint):
+    """
+    Takes a state dict and returns a converted CLAP checkpoint.
+    """
+    # extract state dict for CLAP text embedding model, discarding the audio component
+    model_state_dict = {}
+    model_key = "cond_stage_model.model.text_"
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(model_key):
+            model_state_dict[key.replace(model_key, "text_")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    sequential_layers_pattern = r".*sequential.(\d+).*"
+    text_projection_pattern = r".*_projection.(\d+).*"
+
+    for key, value in model_state_dict.items():
+        # check if key should be ignored in mapping
+        if key.split(".")[0] in CLAP_KEYS_TO_IGNORE:
+            continue
+
+        # check if any key needs to be modified
+        for key_to_modify, new_key in CLAP_KEYS_TO_MODIFY_MAPPING.items():
+            if key_to_modify in key:
+                key = key.replace(key_to_modify, new_key)
+
+        if re.match(sequential_layers_pattern, key):
+            # replace sequential layers with list
+            sequential_layer = re.match(sequential_layers_pattern, key).group(1)
+
+            key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.")
+        elif re.match(text_projection_pattern, key):
+            projecton_layer = int(re.match(text_projection_pattern, key).group(1))
+
+            # Because in CLAP they use `nn.Sequential`...
+            transformers_projection_layer = 1 if projecton_layer == 0 else 2
+
+            key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.")
+
+        if "audio" and "qkv" in key:
+            # split qkv into query key and value
+            mixed_qkv = value
+            qkv_dim = mixed_qkv.size(0) // 3
+
+            query_layer = mixed_qkv[:qkv_dim]
+            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
+            value_layer = mixed_qkv[qkv_dim * 2 :]
+
+            new_checkpoint[key.replace("qkv", "query")] = query_layer
+            new_checkpoint[key.replace("qkv", "key")] = key_layer
+            new_checkpoint[key.replace("qkv", "value")] = value_layer
+        else:
+            new_checkpoint[key] = value
+
+    return new_checkpoint
+
+
+def create_transformers_vocoder_config(original_config):
+    """
+    Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model.
+    """
+    vocoder_params = original_config["model"]["params"]["vocoder_config"]["params"]
+
+    config = {
+        "model_in_dim": vocoder_params["num_mels"],
+        "sampling_rate": vocoder_params["sampling_rate"],
+        "upsample_initial_channel": vocoder_params["upsample_initial_channel"],
+        "upsample_rates": list(vocoder_params["upsample_rates"]),
+        "upsample_kernel_sizes": list(vocoder_params["upsample_kernel_sizes"]),
+        "resblock_kernel_sizes": list(vocoder_params["resblock_kernel_sizes"]),
+        "resblock_dilation_sizes": [
+            list(resblock_dilation) for resblock_dilation in vocoder_params["resblock_dilation_sizes"]
+        ],
+        "normalize_before": False,
+    }
+
+    return config
+
+
+def convert_hifigan_checkpoint(checkpoint, config):
+    """
+    Takes a state dict and config, and returns a converted HiFiGAN vocoder checkpoint.
+    """
+    # extract state dict for vocoder
+    vocoder_state_dict = {}
+    vocoder_key = "first_stage_model.vocoder."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(vocoder_key):
+            vocoder_state_dict[key.replace(vocoder_key, "")] = checkpoint.get(key)
+
+    # fix upsampler keys, everything else is correct already
+    for i in range(len(config.upsample_rates)):
+        vocoder_state_dict[f"upsampler.{i}.weight"] = vocoder_state_dict.pop(f"ups.{i}.weight")
+        vocoder_state_dict[f"upsampler.{i}.bias"] = vocoder_state_dict.pop(f"ups.{i}.bias")
+
+    if not config.normalize_before:
+        # if we don't set normalize_before then these variables are unused, so we set them to their initialised values
+        vocoder_state_dict["mean"] = torch.zeros(config.model_in_dim)
+        vocoder_state_dict["scale"] = torch.ones(config.model_in_dim)
+
+    return vocoder_state_dict
+
+
+# Adapted from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/audioldm/utils.py#L72-L73
+DEFAULT_CONFIG = {
+    "model": {
+        "params": {
+            "linear_start": 0.0015,
+            "linear_end": 0.0195,
+            "timesteps": 1000,
+            "channels": 8,
+            "scale_by_std": True,
+            "unet_config": {
+                "target": "audioldm.latent_diffusion.openaimodel.UNetModel",
+                "params": {
+                    "extra_film_condition_dim": 512,
+                    "extra_film_use_concat": True,
+                    "in_channels": 8,
+                    "out_channels": 8,
+                    "model_channels": 128,
+                    "attention_resolutions": [8, 4, 2],
+                    "num_res_blocks": 2,
+                    "channel_mult": [1, 2, 3, 5],
+                    "num_head_channels": 32,
+                },
+            },
+            "first_stage_config": {
+                "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL",
+                "params": {
+                    "embed_dim": 8,
+                    "ddconfig": {
+                        "z_channels": 8,
+                        "resolution": 256,
+                        "in_channels": 1,
+                        "out_ch": 1,
+                        "ch": 128,
+                        "ch_mult": [1, 2, 4],
+                        "num_res_blocks": 2,
+                    },
+                },
+            },
+            "vocoder_config": {
+                "target": "audioldm.first_stage_model.vocoder",
+                "params": {
+                    "upsample_rates": [5, 4, 2, 2, 2],
+                    "upsample_kernel_sizes": [16, 16, 8, 4, 4],
+                    "upsample_initial_channel": 1024,
+                    "resblock_kernel_sizes": [3, 7, 11],
+                    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                    "num_mels": 64,
+                    "sampling_rate": 16000,
+                },
+            },
+        },
+    },
+}
+
+
+def load_pipeline_from_original_audioldm_ckpt(
+    checkpoint_path: str,
+    original_config_file: str = None,
+    image_size: int = 512,
+    prediction_type: str = None,
+    extract_ema: bool = False,
+    scheduler_type: str = "ddim",
+    num_in_channels: int = None,
+    model_channels: int = None,
+    num_head_channels: int = None,
+    device: str = None,
+    from_safetensors: bool = False,
+) -> AudioLDMPipeline:
+    """
+    Load an AudioLDM pipeline object from a `.ckpt`/`.safetensors` file and (ideally) a `.yaml` config file.
+
+    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
+    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
+    recommended that you override the default values and/or supply an `original_config_file` wherever possible.
+
+    Args:
+        checkpoint_path (`str`): Path to `.ckpt` file.
+        original_config_file (`str`):
+            Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
+            set to the audioldm-s-full-v2 config.
+        image_size (`int`, *optional*, defaults to 512):
+            The image size that the model was trained on.
+        prediction_type (`str`, *optional*):
+            The prediction type that the model was trained on. If `None`, will be automatically
+            inferred by looking for a key in the config. For the default config, the prediction type is `'epsilon'`.
+        num_in_channels (`int`, *optional*, defaults to None):
+            The number of UNet input channels. If `None`, it will be automatically inferred from the config.
+        model_channels (`int`, *optional*, defaults to None):
+            The number of UNet model channels. If `None`, it will be automatically inferred from the config. Override
+            to 128 for the small checkpoints, 192 for the medium checkpoints and 256 for the large.
+        num_head_channels (`int`, *optional*, defaults to None):
+            The number of UNet head channels. If `None`, it will be automatically inferred from the config. Override
+            to 32 for the small and medium checkpoints, and 64 for the large.
+        scheduler_type (`str`, *optional*, defaults to 'pndm'):
+            Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
+            "ddim"]`.
+        extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
+            checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
+            `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
+            inference. Non-EMA weights are usually better to continue fine-tuning.
+        device (`str`, *optional*, defaults to `None`):
+            The device to use. Pass `None` to determine automatically.
+        from_safetensors (`str`, *optional*, defaults to `False`):
+            If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
+        return: An AudioLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
+    """
+
+    if from_safetensors:
+        from safetensors import safe_open
+
+        checkpoint = {}
+        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                checkpoint[key] = f.get_tensor(key)
+    else:
+        if device is None:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+        else:
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+
+    if "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    if original_config_file is None:
+        original_config = DEFAULT_CONFIG
+    else:
+        original_config = yaml.safe_load(original_config_file)
+
+    if num_in_channels is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+
+    if model_channels is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["model_channels"] = model_channels
+
+    if num_head_channels is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["num_head_channels"] = num_head_channels
+
+    if (
+        "parameterization" in original_config["model"]["params"]
+        and original_config["model"]["params"]["parameterization"] == "v"
+    ):
+        if prediction_type is None:
+            prediction_type = "v_prediction"
+    else:
+        if prediction_type is None:
+            prediction_type = "epsilon"
+
+    if image_size is None:
+        image_size = 512
+
+    num_train_timesteps = original_config["model"]["params"]["timesteps"]
+    beta_start = original_config["model"]["params"]["linear_start"]
+    beta_end = original_config["model"]["params"]["linear_end"]
+
+    scheduler = DDIMScheduler(
+        beta_end=beta_end,
+        beta_schedule="scaled_linear",
+        beta_start=beta_start,
+        num_train_timesteps=num_train_timesteps,
+        steps_offset=1,
+        clip_sample=False,
+        set_alpha_to_one=False,
+        prediction_type=prediction_type,
+    )
+    # make sure scheduler works correctly with DDIM
+    scheduler.register_to_config(clip_sample=False)
+
+    if scheduler_type == "pndm":
+        config = dict(scheduler.config)
+        config["skip_prk_steps"] = True
+        scheduler = PNDMScheduler.from_config(config)
+    elif scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "heun":
+        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+    elif scheduler_type == "ddim":
+        scheduler = scheduler
+    else:
+        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+
+    # Convert the UNet2DModel
+    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    unet = UNet2DConditionModel(**unet_config)
+
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
+    )
+
+    unet.load_state_dict(converted_unet_checkpoint)
+
+    # Convert the VAE model
+    vae_config = create_vae_diffusers_config(original_config, checkpoint=checkpoint, image_size=image_size)
+    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+    vae = AutoencoderKL(**vae_config)
+    vae.load_state_dict(converted_vae_checkpoint)
+
+    # Convert the text model
+    # AudioLDM uses the same configuration and tokenizer as the original CLAP model
+    config = ClapTextConfig.from_pretrained("laion/clap-htsat-unfused")
+    tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
+
+    converted_text_model = convert_open_clap_checkpoint(checkpoint)
+    text_model = ClapTextModelWithProjection(config)
+
+    missing_keys, unexpected_keys = text_model.load_state_dict(converted_text_model, strict=False)
+    # we expect not to have token_type_ids in our original state dict so let's ignore them
+    missing_keys = list(set(missing_keys) - set(CLAP_EXPECTED_MISSING_KEYS))
+
+    if len(unexpected_keys) > 0:
+        raise ValueError(f"Unexpected keys when loading CLAP model: {unexpected_keys}")
+
+    if len(missing_keys) > 0:
+        raise ValueError(f"Missing keys when loading CLAP model: {missing_keys}")
+
+    # Convert the vocoder model
+    vocoder_config = create_transformers_vocoder_config(original_config)
+    vocoder_config = SpeechT5HifiGanConfig(**vocoder_config)
+    converted_vocoder_checkpoint = convert_hifigan_checkpoint(checkpoint, vocoder_config)
+
+    vocoder = SpeechT5HifiGan(vocoder_config)
+    vocoder.load_state_dict(converted_vocoder_checkpoint)
+
+    # Instantiate the diffusers pipeline
+    pipe = AudioLDMPipeline(
+        vae=vae,
+        text_encoder=text_model,
+        tokenizer=tokenizer,
+        unet=unet,
+        scheduler=scheduler,
+        vocoder=vocoder,
+    )
+
+    return pipe
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--original_config_file",
+        default=None,
+        type=str,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+    parser.add_argument(
+        "--num_in_channels",
+        default=None,
+        type=int,
+        help="The number of input channels. If `None` number of input channels will be automatically inferred.",
+    )
+    parser.add_argument(
+        "--model_channels",
+        default=None,
+        type=int,
+        help="The number of UNet model channels. If `None`, it will be automatically inferred from the config. Override"
+        " to 128 for the small checkpoints, 192 for the medium checkpoints and 256 for the large.",
+    )
+    parser.add_argument(
+        "--num_head_channels",
+        default=None,
+        type=int,
+        help="The number of UNet head channels. If `None`, it will be automatically inferred from the config. Override"
+        " to 32 for the small and medium checkpoints, and 64 for the large.",
+    )
+    parser.add_argument(
+        "--scheduler_type",
+        default="ddim",
+        type=str,
+        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']",
+    )
+    parser.add_argument(
+        "--image_size",
+        default=None,
+        type=int,
+        help=("The image size that the model was trained on."),
+    )
+    parser.add_argument(
+        "--prediction_type",
+        default=None,
+        type=str,
+        help=("The prediction type that the model was trained on."),
+    )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
+    parser.add_argument(
+        "--from_safetensors",
+        action="store_true",
+        help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
+    )
+    parser.add_argument(
+        "--to_safetensors",
+        action="store_true",
+        help="Whether to store pipeline in safetensors format or not.",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
+    args = parser.parse_args()
+
+    pipe = load_pipeline_from_original_audioldm_ckpt(
+        checkpoint_path=args.checkpoint_path,
+        original_config_file=args.original_config_file,
+        image_size=args.image_size,
+        prediction_type=args.prediction_type,
+        extract_ema=args.extract_ema,
+        scheduler_type=args.scheduler_type,
+        num_in_channels=args.num_in_channels,
+        model_channels=args.model_channels,
+        num_head_channels=args.num_head_channels,
+        from_safetensors=args.from_safetensors,
+        device=args.device,
+    )
+    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
diff --git a/diffusers/scripts/convert_original_controlnet_to_diffusers.py b/diffusers/scripts/convert_original_controlnet_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..92aad4f09e7046437ee8e00c44d9f81600433b45
--- /dev/null
+++ b/diffusers/scripts/convert_original_controlnet_to_diffusers.py
@@ -0,0 +1,109 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for stable diffusion checkpoints which _only_ contain a controlnet."""
+
+import argparse
+
+from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_controlnet_from_original_ckpt
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--original_config_file",
+        type=str,
+        required=True,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+    parser.add_argument(
+        "--num_in_channels",
+        default=None,
+        type=int,
+        help="The number of input channels. If `None` number of input channels will be automatically inferred.",
+    )
+    parser.add_argument(
+        "--image_size",
+        default=512,
+        type=int,
+        help=(
+            "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2"
+            " Base. Use 768 for Stable Diffusion v2."
+        ),
+    )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
+    parser.add_argument(
+        "--upcast_attention",
+        action="store_true",
+        help=(
+            "Whether the attention computation should always be upcasted. This is necessary when running stable"
+            " diffusion 2.1."
+        ),
+    )
+    parser.add_argument(
+        "--from_safetensors",
+        action="store_true",
+        help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
+    )
+    parser.add_argument(
+        "--to_safetensors",
+        action="store_true",
+        help="Whether to store pipeline in safetensors format or not.",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
+
+    # small workaround to get argparser to parse a boolean input as either true _or_ false
+    def parse_bool(string):
+        if string == "True":
+            return True
+        elif string == "False":
+            return False
+        else:
+            raise ValueError(f"could not parse string as bool {string}")
+
+    parser.add_argument(
+        "--use_linear_projection", help="Override for use linear projection", required=False, type=parse_bool
+    )
+
+    parser.add_argument("--cross_attention_dim", help="Override for cross attention_dim", required=False, type=int)
+
+    args = parser.parse_args()
+
+    controlnet = download_controlnet_from_original_ckpt(
+        checkpoint_path=args.checkpoint_path,
+        original_config_file=args.original_config_file,
+        image_size=args.image_size,
+        extract_ema=args.extract_ema,
+        num_in_channels=args.num_in_channels,
+        upcast_attention=args.upcast_attention,
+        from_safetensors=args.from_safetensors,
+        device=args.device,
+        use_linear_projection=args.use_linear_projection,
+        cross_attention_dim=args.cross_attention_dim,
+    )
+
+    controlnet.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
diff --git a/diffusers/scripts/convert_original_musicldm_to_diffusers.py b/diffusers/scripts/convert_original_musicldm_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6db9dbdfdb74ca063694e8de8510657832393154
--- /dev/null
+++ b/diffusers/scripts/convert_original_musicldm_to_diffusers.py
@@ -0,0 +1,1056 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the MusicLDM checkpoints."""
+
+import argparse
+import re
+
+import torch
+import yaml
+from transformers import (
+    AutoFeatureExtractor,
+    AutoTokenizer,
+    ClapConfig,
+    ClapModel,
+    SpeechT5HifiGan,
+    SpeechT5HifiGanConfig,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LMSDiscreteScheduler,
+    MusicLDMPipeline,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_resnet_paths
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_attention_paths
+def renew_attention_paths(old_list):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "to_q.weight")
+        new_item = new_item.replace("q.bias", "to_q.bias")
+
+        new_item = new_item.replace("k.weight", "to_k.weight")
+        new_item = new_item.replace("k.bias", "to_k.bias")
+
+        new_item = new_item.replace("v.weight", "to_v.weight")
+        new_item = new_item.replace("v.bias", "to_v.bias")
+
+        new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
+        new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.assign_to_checkpoint
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["to_q.weight", "to_k.weight", "to_v.weight"]
+    proj_key = "to_out.0.weight"
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys or ".".join(key.split(".")[-3:]) == proj_key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key].squeeze()
+
+
+def create_unet_diffusers_config(original_config, image_size: int):
+    """
+    Creates a UNet config for diffusers based on the config of the original MusicLDM model.
+    """
+    unet_params = original_config["model"]["params"]["unet_config"]["params"]
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+
+    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)
+
+    cross_attention_dim = (
+        unet_params["cross_attention_dim"] if "cross_attention_dim" in unet_params else block_out_channels
+    )
+
+    class_embed_type = "simple_projection" if "extra_film_condition_dim" in unet_params else None
+    projection_class_embeddings_input_dim = (
+        unet_params["extra_film_condition_dim"] if "extra_film_condition_dim" in unet_params else None
+    )
+    class_embeddings_concat = unet_params["extra_film_use_concat"] if "extra_film_use_concat" in unet_params else None
+
+    config = {
+        "sample_size": image_size // vae_scale_factor,
+        "in_channels": unet_params["in_channels"],
+        "out_channels": unet_params["out_channels"],
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params["num_res_blocks"],
+        "cross_attention_dim": cross_attention_dim,
+        "class_embed_type": class_embed_type,
+        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
+        "class_embeddings_concat": class_embeddings_concat,
+    }
+
+    return config
+
+
+# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_vae_diffusers_config
+def create_vae_diffusers_config(original_config, checkpoint, image_size: int):
+    """
+    Creates a VAE config for diffusers based on the config of the original MusicLDM model. Compared to the original
+    Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE.
+    """
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+    _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"]
+
+    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config["model"]["params"] else 0.18215
+
+    config = {
+        "sample_size": image_size,
+        "in_channels": vae_params["in_channels"],
+        "out_channels": vae_params["out_ch"],
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "latent_channels": vae_params["z_channels"],
+        "layers_per_block": vae_params["num_res_blocks"],
+        "scaling_factor": float(scaling_factor),
+    }
+    return config
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular
+def create_diffusers_schedular(original_config):
+    schedular = DDIMScheduler(
+        num_train_timesteps=original_config["model"]["params"]["timesteps"],
+        beta_start=original_config["model"]["params"]["linear_start"],
+        beta_end=original_config["model"]["params"]["linear_end"],
+        beta_schedule="scaled_linear",
+    )
+    return schedular
+
+
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint. Compared to the original Stable Diffusion
+    conversion, this function additionally converts the learnt film embedding linear layer.
+    """
+
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    unet_key = "model.diffusion_model."
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        print(
+            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+        )
+        for key in keys:
+            if key.startswith("model.diffusion_model"):
+                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+    else:
+        if sum(k.startswith("model_ema") for k in keys) > 100:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+
+        for key in keys:
+            if key.startswith(unet_key):
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    new_checkpoint["class_embedding.weight"] = unet_state_dict["film_emb.weight"]
+    new_checkpoint["class_embedding.bias"] = unet_state_dict["film_emb.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_vae_checkpoint
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    vae_key = "first_stage_model."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+CLAP_KEYS_TO_MODIFY_MAPPING = {
+    "text_branch": "text_model",
+    "audio_branch": "audio_model.audio_encoder",
+    "attn": "attention.self",
+    "self.proj": "output.dense",
+    "attention.self_mask": "attn_mask",
+    "mlp.fc1": "intermediate.dense",
+    "mlp.fc2": "output.dense",
+    "norm1": "layernorm_before",
+    "norm2": "layernorm_after",
+    "bn0": "batch_norm",
+}
+
+CLAP_KEYS_TO_IGNORE = [
+    "text_transform",
+    "audio_transform",
+    "stft",
+    "logmel_extractor",
+    "tscam_conv",
+    "head",
+    "attn_mask",
+]
+
+CLAP_EXPECTED_MISSING_KEYS = ["text_model.embeddings.token_type_ids"]
+
+
+def convert_open_clap_checkpoint(checkpoint):
+    """
+    Takes a state dict and returns a converted CLAP checkpoint.
+    """
+    # extract state dict for CLAP text embedding model, discarding the audio component
+    model_state_dict = {}
+    model_key = "cond_stage_model.model."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(model_key):
+            model_state_dict[key.replace(model_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    sequential_layers_pattern = r".*sequential.(\d+).*"
+    text_projection_pattern = r".*_projection.(\d+).*"
+
+    for key, value in model_state_dict.items():
+        # check if key should be ignored in mapping - if so map it to a key name that we'll filter out at the end
+        for key_to_ignore in CLAP_KEYS_TO_IGNORE:
+            if key_to_ignore in key:
+                key = "spectrogram"
+
+        # check if any key needs to be modified
+        for key_to_modify, new_key in CLAP_KEYS_TO_MODIFY_MAPPING.items():
+            if key_to_modify in key:
+                key = key.replace(key_to_modify, new_key)
+
+        if re.match(sequential_layers_pattern, key):
+            # replace sequential layers with list
+            sequential_layer = re.match(sequential_layers_pattern, key).group(1)
+
+            key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.")
+        elif re.match(text_projection_pattern, key):
+            projecton_layer = int(re.match(text_projection_pattern, key).group(1))
+
+            # Because in CLAP they use `nn.Sequential`...
+            transformers_projection_layer = 1 if projecton_layer == 0 else 2
+
+            key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.")
+
+        if "audio" and "qkv" in key:
+            # split qkv into query key and value
+            mixed_qkv = value
+            qkv_dim = mixed_qkv.size(0) // 3
+
+            query_layer = mixed_qkv[:qkv_dim]
+            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
+            value_layer = mixed_qkv[qkv_dim * 2 :]
+
+            new_checkpoint[key.replace("qkv", "query")] = query_layer
+            new_checkpoint[key.replace("qkv", "key")] = key_layer
+            new_checkpoint[key.replace("qkv", "value")] = value_layer
+        elif key != "spectrogram":
+            new_checkpoint[key] = value
+
+    return new_checkpoint
+
+
+def create_transformers_vocoder_config(original_config):
+    """
+    Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model.
+    """
+    vocoder_params = original_config["model"]["params"]["vocoder_config"]["params"]
+
+    config = {
+        "model_in_dim": vocoder_params["num_mels"],
+        "sampling_rate": vocoder_params["sampling_rate"],
+        "upsample_initial_channel": vocoder_params["upsample_initial_channel"],
+        "upsample_rates": list(vocoder_params["upsample_rates"]),
+        "upsample_kernel_sizes": list(vocoder_params["upsample_kernel_sizes"]),
+        "resblock_kernel_sizes": list(vocoder_params["resblock_kernel_sizes"]),
+        "resblock_dilation_sizes": [
+            list(resblock_dilation) for resblock_dilation in vocoder_params["resblock_dilation_sizes"]
+        ],
+        "normalize_before": False,
+    }
+
+    return config
+
+
+def convert_hifigan_checkpoint(checkpoint, config):
+    """
+    Takes a state dict and config, and returns a converted HiFiGAN vocoder checkpoint.
+    """
+    # extract state dict for vocoder
+    vocoder_state_dict = {}
+    vocoder_key = "first_stage_model.vocoder."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(vocoder_key):
+            vocoder_state_dict[key.replace(vocoder_key, "")] = checkpoint.get(key)
+
+    # fix upsampler keys, everything else is correct already
+    for i in range(len(config.upsample_rates)):
+        vocoder_state_dict[f"upsampler.{i}.weight"] = vocoder_state_dict.pop(f"ups.{i}.weight")
+        vocoder_state_dict[f"upsampler.{i}.bias"] = vocoder_state_dict.pop(f"ups.{i}.bias")
+
+    if not config.normalize_before:
+        # if we don't set normalize_before then these variables are unused, so we set them to their initialised values
+        vocoder_state_dict["mean"] = torch.zeros(config.model_in_dim)
+        vocoder_state_dict["scale"] = torch.ones(config.model_in_dim)
+
+    return vocoder_state_dict
+
+
+# Adapted from https://huggingface.co/spaces/haoheliu/MusicLDM-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/MusicLDM/utils.py#L72-L73
+DEFAULT_CONFIG = {
+    "model": {
+        "params": {
+            "linear_start": 0.0015,
+            "linear_end": 0.0195,
+            "timesteps": 1000,
+            "channels": 8,
+            "scale_by_std": True,
+            "unet_config": {
+                "target": "MusicLDM.latent_diffusion.openaimodel.UNetModel",
+                "params": {
+                    "extra_film_condition_dim": 512,
+                    "extra_film_use_concat": True,
+                    "in_channels": 8,
+                    "out_channels": 8,
+                    "model_channels": 128,
+                    "attention_resolutions": [8, 4, 2],
+                    "num_res_blocks": 2,
+                    "channel_mult": [1, 2, 3, 5],
+                    "num_head_channels": 32,
+                },
+            },
+            "first_stage_config": {
+                "target": "MusicLDM.variational_autoencoder.autoencoder.AutoencoderKL",
+                "params": {
+                    "embed_dim": 8,
+                    "ddconfig": {
+                        "z_channels": 8,
+                        "resolution": 256,
+                        "in_channels": 1,
+                        "out_ch": 1,
+                        "ch": 128,
+                        "ch_mult": [1, 2, 4],
+                        "num_res_blocks": 2,
+                    },
+                },
+            },
+            "vocoder_config": {
+                "target": "MusicLDM.first_stage_model.vocoder",
+                "params": {
+                    "upsample_rates": [5, 4, 2, 2, 2],
+                    "upsample_kernel_sizes": [16, 16, 8, 4, 4],
+                    "upsample_initial_channel": 1024,
+                    "resblock_kernel_sizes": [3, 7, 11],
+                    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                    "num_mels": 64,
+                    "sampling_rate": 16000,
+                },
+            },
+        },
+    },
+}
+
+
+def load_pipeline_from_original_MusicLDM_ckpt(
+    checkpoint_path: str,
+    original_config_file: str = None,
+    image_size: int = 1024,
+    prediction_type: str = None,
+    extract_ema: bool = False,
+    scheduler_type: str = "ddim",
+    num_in_channels: int = None,
+    model_channels: int = None,
+    num_head_channels: int = None,
+    device: str = None,
+    from_safetensors: bool = False,
+) -> MusicLDMPipeline:
+    """
+    Load an MusicLDM pipeline object from a `.ckpt`/`.safetensors` file and (ideally) a `.yaml` config file.
+
+    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
+    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
+    recommended that you override the default values and/or supply an `original_config_file` wherever possible.
+
+    Args:
+        checkpoint_path (`str`): Path to `.ckpt` file.
+        original_config_file (`str`):
+            Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
+            set to the MusicLDM-s-full-v2 config.
+        image_size (`int`, *optional*, defaults to 1024):
+            The image size that the model was trained on.
+        prediction_type (`str`, *optional*):
+            The prediction type that the model was trained on. If `None`, will be automatically
+            inferred by looking for a key in the config. For the default config, the prediction type is `'epsilon'`.
+        num_in_channels (`int`, *optional*, defaults to None):
+            The number of UNet input channels. If `None`, it will be automatically inferred from the config.
+        model_channels (`int`, *optional*, defaults to None):
+            The number of UNet model channels. If `None`, it will be automatically inferred from the config. Override
+            to 128 for the small checkpoints, 192 for the medium checkpoints and 256 for the large.
+        num_head_channels (`int`, *optional*, defaults to None):
+            The number of UNet head channels. If `None`, it will be automatically inferred from the config. Override
+            to 32 for the small and medium checkpoints, and 64 for the large.
+        scheduler_type (`str`, *optional*, defaults to 'pndm'):
+            Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
+            "ddim"]`.
+        extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
+            checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
+            `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
+            inference. Non-EMA weights are usually better to continue fine-tuning.
+        device (`str`, *optional*, defaults to `None`):
+            The device to use. Pass `None` to determine automatically.
+        from_safetensors (`str`, *optional*, defaults to `False`):
+            If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
+        return: An MusicLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
+    """
+    if from_safetensors:
+        from safetensors import safe_open
+
+        checkpoint = {}
+        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                checkpoint[key] = f.get_tensor(key)
+    else:
+        if device is None:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+        else:
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+
+    if "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    if original_config_file is None:
+        original_config = DEFAULT_CONFIG
+    else:
+        original_config = yaml.safe_load(original_config_file)
+
+    if num_in_channels is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+
+    if model_channels is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["model_channels"] = model_channels
+
+    if num_head_channels is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["num_head_channels"] = num_head_channels
+
+    if (
+        "parameterization" in original_config["model"]["params"]
+        and original_config["model"]["params"]["parameterization"] == "v"
+    ):
+        if prediction_type is None:
+            prediction_type = "v_prediction"
+    else:
+        if prediction_type is None:
+            prediction_type = "epsilon"
+
+    if image_size is None:
+        image_size = 512
+
+    num_train_timesteps = original_config["model"]["params"]["timesteps"]
+    beta_start = original_config["model"]["params"]["linear_start"]
+    beta_end = original_config["model"]["params"]["linear_end"]
+
+    scheduler = DDIMScheduler(
+        beta_end=beta_end,
+        beta_schedule="scaled_linear",
+        beta_start=beta_start,
+        num_train_timesteps=num_train_timesteps,
+        steps_offset=1,
+        clip_sample=False,
+        set_alpha_to_one=False,
+        prediction_type=prediction_type,
+    )
+    # make sure scheduler works correctly with DDIM
+    scheduler.register_to_config(clip_sample=False)
+
+    if scheduler_type == "pndm":
+        config = dict(scheduler.config)
+        config["skip_prk_steps"] = True
+        scheduler = PNDMScheduler.from_config(config)
+    elif scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "heun":
+        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+    elif scheduler_type == "ddim":
+        scheduler = scheduler
+    else:
+        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+
+    # Convert the UNet2DModel
+    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    unet = UNet2DConditionModel(**unet_config)
+
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
+    )
+
+    unet.load_state_dict(converted_unet_checkpoint)
+
+    # Convert the VAE model
+    vae_config = create_vae_diffusers_config(original_config, checkpoint=checkpoint, image_size=image_size)
+    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+    vae = AutoencoderKL(**vae_config)
+    vae.load_state_dict(converted_vae_checkpoint)
+
+    # Convert the text model
+    # MusicLDM uses the same tokenizer as the original CLAP model, but a slightly different configuration
+    config = ClapConfig.from_pretrained("laion/clap-htsat-unfused")
+    config.audio_config.update(
+        {
+            "patch_embeds_hidden_size": 128,
+            "hidden_size": 1024,
+            "depths": [2, 2, 12, 2],
+        }
+    )
+    tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
+    feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
+
+    converted_text_model = convert_open_clap_checkpoint(checkpoint)
+    text_model = ClapModel(config)
+
+    missing_keys, unexpected_keys = text_model.load_state_dict(converted_text_model, strict=False)
+    # we expect not to have token_type_ids in our original state dict so let's ignore them
+    missing_keys = list(set(missing_keys) - set(CLAP_EXPECTED_MISSING_KEYS))
+
+    if len(unexpected_keys) > 0:
+        raise ValueError(f"Unexpected keys when loading CLAP model: {unexpected_keys}")
+
+    if len(missing_keys) > 0:
+        raise ValueError(f"Missing keys when loading CLAP model: {missing_keys}")
+
+    # Convert the vocoder model
+    vocoder_config = create_transformers_vocoder_config(original_config)
+    vocoder_config = SpeechT5HifiGanConfig(**vocoder_config)
+    converted_vocoder_checkpoint = convert_hifigan_checkpoint(checkpoint, vocoder_config)
+
+    vocoder = SpeechT5HifiGan(vocoder_config)
+    vocoder.load_state_dict(converted_vocoder_checkpoint)
+
+    # Instantiate the diffusers pipeline
+    pipe = MusicLDMPipeline(
+        vae=vae,
+        text_encoder=text_model,
+        tokenizer=tokenizer,
+        unet=unet,
+        scheduler=scheduler,
+        vocoder=vocoder,
+        feature_extractor=feature_extractor,
+    )
+
+    return pipe
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--original_config_file",
+        default=None,
+        type=str,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+    parser.add_argument(
+        "--num_in_channels",
+        default=None,
+        type=int,
+        help="The number of input channels. If `None` number of input channels will be automatically inferred.",
+    )
+    parser.add_argument(
+        "--model_channels",
+        default=None,
+        type=int,
+        help="The number of UNet model channels. If `None`, it will be automatically inferred from the config. Override"
+        " to 128 for the small checkpoints, 192 for the medium checkpoints and 256 for the large.",
+    )
+    parser.add_argument(
+        "--num_head_channels",
+        default=None,
+        type=int,
+        help="The number of UNet head channels. If `None`, it will be automatically inferred from the config. Override"
+        " to 32 for the small and medium checkpoints, and 64 for the large.",
+    )
+    parser.add_argument(
+        "--scheduler_type",
+        default="ddim",
+        type=str,
+        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']",
+    )
+    parser.add_argument(
+        "--image_size",
+        default=None,
+        type=int,
+        help=("The image size that the model was trained on."),
+    )
+    parser.add_argument(
+        "--prediction_type",
+        default=None,
+        type=str,
+        help=("The prediction type that the model was trained on."),
+    )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
+    parser.add_argument(
+        "--from_safetensors",
+        action="store_true",
+        help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
+    )
+    parser.add_argument(
+        "--to_safetensors",
+        action="store_true",
+        help="Whether to store pipeline in safetensors format or not.",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
+    args = parser.parse_args()
+
+    pipe = load_pipeline_from_original_MusicLDM_ckpt(
+        checkpoint_path=args.checkpoint_path,
+        original_config_file=args.original_config_file,
+        image_size=args.image_size,
+        prediction_type=args.prediction_type,
+        extract_ema=args.extract_ema,
+        scheduler_type=args.scheduler_type,
+        num_in_channels=args.num_in_channels,
+        model_channels=args.model_channels,
+        num_head_channels=args.num_head_channels,
+        from_safetensors=args.from_safetensors,
+        device=args.device,
+    )
+    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
diff --git a/diffusers/scripts/convert_original_stable_diffusion_to_diffusers.py b/diffusers/scripts/convert_original_stable_diffusion_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e7925b0a4127e2e280237ceac24405cfccdd192
--- /dev/null
+++ b/diffusers/scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -0,0 +1,188 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the LDM checkpoints."""
+
+import argparse
+import importlib
+
+import torch
+
+from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml
+    parser.add_argument(
+        "--original_config_file",
+        default=None,
+        type=str,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+    parser.add_argument(
+        "--config_files",
+        default=None,
+        type=str,
+        help="The YAML config file corresponding to the architecture.",
+    )
+    parser.add_argument(
+        "--num_in_channels",
+        default=None,
+        type=int,
+        help="The number of input channels. If `None` number of input channels will be automatically inferred.",
+    )
+    parser.add_argument(
+        "--scheduler_type",
+        default="pndm",
+        type=str,
+        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']",
+    )
+    parser.add_argument(
+        "--pipeline_type",
+        default=None,
+        type=str,
+        help=(
+            "The pipeline type. One of 'FrozenOpenCLIPEmbedder', 'FrozenCLIPEmbedder', 'PaintByExample'"
+            ". If `None` pipeline will be automatically inferred."
+        ),
+    )
+    parser.add_argument(
+        "--image_size",
+        default=None,
+        type=int,
+        help=(
+            "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2"
+            " Base. Use 768 for Stable Diffusion v2."
+        ),
+    )
+    parser.add_argument(
+        "--prediction_type",
+        default=None,
+        type=str,
+        help=(
+            "The prediction type that the model was trained on. Use 'epsilon' for Stable Diffusion v1.X and Stable"
+            " Diffusion v2 Base. Use 'v_prediction' for Stable Diffusion v2."
+        ),
+    )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
+    parser.add_argument(
+        "--upcast_attention",
+        action="store_true",
+        help=(
+            "Whether the attention computation should always be upcasted. This is necessary when running stable"
+            " diffusion 2.1."
+        ),
+    )
+    parser.add_argument(
+        "--from_safetensors",
+        action="store_true",
+        help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
+    )
+    parser.add_argument(
+        "--to_safetensors",
+        action="store_true",
+        help="Whether to store pipeline in safetensors format or not.",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
+    parser.add_argument(
+        "--stable_unclip",
+        type=str,
+        default=None,
+        required=False,
+        help="Set if this is a stable unCLIP model. One of 'txt2img' or 'img2img'.",
+    )
+    parser.add_argument(
+        "--stable_unclip_prior",
+        type=str,
+        default=None,
+        required=False,
+        help="Set if this is a stable unCLIP txt2img model. Selects which prior to use. If `--stable_unclip` is set to `txt2img`, the karlo prior (https://huggingface.co/kakaobrain/karlo-v1-alpha/tree/main/prior) is selected by default.",
+    )
+    parser.add_argument(
+        "--clip_stats_path",
+        type=str,
+        help="Path to the clip stats file. Only required if the stable unclip model's config specifies `model.params.noise_aug_config.params.clip_stats_path`.",
+        required=False,
+    )
+    parser.add_argument(
+        "--controlnet", action="store_true", default=None, help="Set flag if this is a controlnet checkpoint."
+    )
+    parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
+    parser.add_argument(
+        "--vae_path",
+        type=str,
+        default=None,
+        required=False,
+        help="Set to a path, hub id to an already converted vae to not convert it again.",
+    )
+    parser.add_argument(
+        "--pipeline_class_name",
+        type=str,
+        default=None,
+        required=False,
+        help="Specify the pipeline class name",
+    )
+
+    args = parser.parse_args()
+
+    if args.pipeline_class_name is not None:
+        library = importlib.import_module("diffusers")
+        class_obj = getattr(library, args.pipeline_class_name)
+        pipeline_class = class_obj
+    else:
+        pipeline_class = None
+
+    pipe = download_from_original_stable_diffusion_ckpt(
+        checkpoint_path_or_dict=args.checkpoint_path,
+        original_config_file=args.original_config_file,
+        config_files=args.config_files,
+        image_size=args.image_size,
+        prediction_type=args.prediction_type,
+        model_type=args.pipeline_type,
+        extract_ema=args.extract_ema,
+        scheduler_type=args.scheduler_type,
+        num_in_channels=args.num_in_channels,
+        upcast_attention=args.upcast_attention,
+        from_safetensors=args.from_safetensors,
+        device=args.device,
+        stable_unclip=args.stable_unclip,
+        stable_unclip_prior=args.stable_unclip_prior,
+        clip_stats_path=args.clip_stats_path,
+        controlnet=args.controlnet,
+        vae_path=args.vae_path,
+        pipeline_class=pipeline_class,
+    )
+
+    if args.half:
+        pipe.to(dtype=torch.float16)
+
+    if args.controlnet:
+        # only save the controlnet model
+        pipe.controlnet.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
+    else:
+        pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
diff --git a/diffusers/scripts/convert_original_t2i_adapter.py b/diffusers/scripts/convert_original_t2i_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..95c8817b508df807b85a953391e40e6b91833f42
--- /dev/null
+++ b/diffusers/scripts/convert_original_t2i_adapter.py
@@ -0,0 +1,250 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Conversion script for the T2I-Adapter checkpoints.
+"""
+
+import argparse
+
+import torch
+
+from diffusers import T2IAdapter
+
+
+def convert_adapter(src_state, in_channels):
+    original_body_length = max([int(x.split(".")[1]) for x in src_state.keys() if "body." in x]) + 1
+
+    assert original_body_length == 8
+
+    # (0, 1) -> channels 1
+    assert src_state["body.0.block1.weight"].shape == (320, 320, 3, 3)
+
+    # (2, 3) -> channels 2
+    assert src_state["body.2.in_conv.weight"].shape == (640, 320, 1, 1)
+
+    # (4, 5) -> channels 3
+    assert src_state["body.4.in_conv.weight"].shape == (1280, 640, 1, 1)
+
+    # (6, 7) -> channels 4
+    assert src_state["body.6.block1.weight"].shape == (1280, 1280, 3, 3)
+
+    res_state = {
+        "adapter.conv_in.weight": src_state.pop("conv_in.weight"),
+        "adapter.conv_in.bias": src_state.pop("conv_in.bias"),
+        # 0.resnets.0
+        "adapter.body.0.resnets.0.block1.weight": src_state.pop("body.0.block1.weight"),
+        "adapter.body.0.resnets.0.block1.bias": src_state.pop("body.0.block1.bias"),
+        "adapter.body.0.resnets.0.block2.weight": src_state.pop("body.0.block2.weight"),
+        "adapter.body.0.resnets.0.block2.bias": src_state.pop("body.0.block2.bias"),
+        # 0.resnets.1
+        "adapter.body.0.resnets.1.block1.weight": src_state.pop("body.1.block1.weight"),
+        "adapter.body.0.resnets.1.block1.bias": src_state.pop("body.1.block1.bias"),
+        "adapter.body.0.resnets.1.block2.weight": src_state.pop("body.1.block2.weight"),
+        "adapter.body.0.resnets.1.block2.bias": src_state.pop("body.1.block2.bias"),
+        # 1
+        "adapter.body.1.in_conv.weight": src_state.pop("body.2.in_conv.weight"),
+        "adapter.body.1.in_conv.bias": src_state.pop("body.2.in_conv.bias"),
+        # 1.resnets.0
+        "adapter.body.1.resnets.0.block1.weight": src_state.pop("body.2.block1.weight"),
+        "adapter.body.1.resnets.0.block1.bias": src_state.pop("body.2.block1.bias"),
+        "adapter.body.1.resnets.0.block2.weight": src_state.pop("body.2.block2.weight"),
+        "adapter.body.1.resnets.0.block2.bias": src_state.pop("body.2.block2.bias"),
+        # 1.resnets.1
+        "adapter.body.1.resnets.1.block1.weight": src_state.pop("body.3.block1.weight"),
+        "adapter.body.1.resnets.1.block1.bias": src_state.pop("body.3.block1.bias"),
+        "adapter.body.1.resnets.1.block2.weight": src_state.pop("body.3.block2.weight"),
+        "adapter.body.1.resnets.1.block2.bias": src_state.pop("body.3.block2.bias"),
+        # 2
+        "adapter.body.2.in_conv.weight": src_state.pop("body.4.in_conv.weight"),
+        "adapter.body.2.in_conv.bias": src_state.pop("body.4.in_conv.bias"),
+        # 2.resnets.0
+        "adapter.body.2.resnets.0.block1.weight": src_state.pop("body.4.block1.weight"),
+        "adapter.body.2.resnets.0.block1.bias": src_state.pop("body.4.block1.bias"),
+        "adapter.body.2.resnets.0.block2.weight": src_state.pop("body.4.block2.weight"),
+        "adapter.body.2.resnets.0.block2.bias": src_state.pop("body.4.block2.bias"),
+        # 2.resnets.1
+        "adapter.body.2.resnets.1.block1.weight": src_state.pop("body.5.block1.weight"),
+        "adapter.body.2.resnets.1.block1.bias": src_state.pop("body.5.block1.bias"),
+        "adapter.body.2.resnets.1.block2.weight": src_state.pop("body.5.block2.weight"),
+        "adapter.body.2.resnets.1.block2.bias": src_state.pop("body.5.block2.bias"),
+        # 3.resnets.0
+        "adapter.body.3.resnets.0.block1.weight": src_state.pop("body.6.block1.weight"),
+        "adapter.body.3.resnets.0.block1.bias": src_state.pop("body.6.block1.bias"),
+        "adapter.body.3.resnets.0.block2.weight": src_state.pop("body.6.block2.weight"),
+        "adapter.body.3.resnets.0.block2.bias": src_state.pop("body.6.block2.bias"),
+        # 3.resnets.1
+        "adapter.body.3.resnets.1.block1.weight": src_state.pop("body.7.block1.weight"),
+        "adapter.body.3.resnets.1.block1.bias": src_state.pop("body.7.block1.bias"),
+        "adapter.body.3.resnets.1.block2.weight": src_state.pop("body.7.block2.weight"),
+        "adapter.body.3.resnets.1.block2.bias": src_state.pop("body.7.block2.bias"),
+    }
+
+    assert len(src_state) == 0
+
+    adapter = T2IAdapter(in_channels=in_channels, adapter_type="full_adapter")
+
+    adapter.load_state_dict(res_state)
+
+    return adapter
+
+
+def convert_light_adapter(src_state):
+    original_body_length = max([int(x.split(".")[1]) for x in src_state.keys() if "body." in x]) + 1
+
+    assert original_body_length == 4
+
+    res_state = {
+        # body.0.in_conv
+        "adapter.body.0.in_conv.weight": src_state.pop("body.0.in_conv.weight"),
+        "adapter.body.0.in_conv.bias": src_state.pop("body.0.in_conv.bias"),
+        # body.0.resnets.0
+        "adapter.body.0.resnets.0.block1.weight": src_state.pop("body.0.body.0.block1.weight"),
+        "adapter.body.0.resnets.0.block1.bias": src_state.pop("body.0.body.0.block1.bias"),
+        "adapter.body.0.resnets.0.block2.weight": src_state.pop("body.0.body.0.block2.weight"),
+        "adapter.body.0.resnets.0.block2.bias": src_state.pop("body.0.body.0.block2.bias"),
+        # body.0.resnets.1
+        "adapter.body.0.resnets.1.block1.weight": src_state.pop("body.0.body.1.block1.weight"),
+        "adapter.body.0.resnets.1.block1.bias": src_state.pop("body.0.body.1.block1.bias"),
+        "adapter.body.0.resnets.1.block2.weight": src_state.pop("body.0.body.1.block2.weight"),
+        "adapter.body.0.resnets.1.block2.bias": src_state.pop("body.0.body.1.block2.bias"),
+        # body.0.resnets.2
+        "adapter.body.0.resnets.2.block1.weight": src_state.pop("body.0.body.2.block1.weight"),
+        "adapter.body.0.resnets.2.block1.bias": src_state.pop("body.0.body.2.block1.bias"),
+        "adapter.body.0.resnets.2.block2.weight": src_state.pop("body.0.body.2.block2.weight"),
+        "adapter.body.0.resnets.2.block2.bias": src_state.pop("body.0.body.2.block2.bias"),
+        # body.0.resnets.3
+        "adapter.body.0.resnets.3.block1.weight": src_state.pop("body.0.body.3.block1.weight"),
+        "adapter.body.0.resnets.3.block1.bias": src_state.pop("body.0.body.3.block1.bias"),
+        "adapter.body.0.resnets.3.block2.weight": src_state.pop("body.0.body.3.block2.weight"),
+        "adapter.body.0.resnets.3.block2.bias": src_state.pop("body.0.body.3.block2.bias"),
+        # body.0.out_conv
+        "adapter.body.0.out_conv.weight": src_state.pop("body.0.out_conv.weight"),
+        "adapter.body.0.out_conv.bias": src_state.pop("body.0.out_conv.bias"),
+        # body.1.in_conv
+        "adapter.body.1.in_conv.weight": src_state.pop("body.1.in_conv.weight"),
+        "adapter.body.1.in_conv.bias": src_state.pop("body.1.in_conv.bias"),
+        # body.1.resnets.0
+        "adapter.body.1.resnets.0.block1.weight": src_state.pop("body.1.body.0.block1.weight"),
+        "adapter.body.1.resnets.0.block1.bias": src_state.pop("body.1.body.0.block1.bias"),
+        "adapter.body.1.resnets.0.block2.weight": src_state.pop("body.1.body.0.block2.weight"),
+        "adapter.body.1.resnets.0.block2.bias": src_state.pop("body.1.body.0.block2.bias"),
+        # body.1.resnets.1
+        "adapter.body.1.resnets.1.block1.weight": src_state.pop("body.1.body.1.block1.weight"),
+        "adapter.body.1.resnets.1.block1.bias": src_state.pop("body.1.body.1.block1.bias"),
+        "adapter.body.1.resnets.1.block2.weight": src_state.pop("body.1.body.1.block2.weight"),
+        "adapter.body.1.resnets.1.block2.bias": src_state.pop("body.1.body.1.block2.bias"),
+        # body.1.body.2
+        "adapter.body.1.resnets.2.block1.weight": src_state.pop("body.1.body.2.block1.weight"),
+        "adapter.body.1.resnets.2.block1.bias": src_state.pop("body.1.body.2.block1.bias"),
+        "adapter.body.1.resnets.2.block2.weight": src_state.pop("body.1.body.2.block2.weight"),
+        "adapter.body.1.resnets.2.block2.bias": src_state.pop("body.1.body.2.block2.bias"),
+        # body.1.body.3
+        "adapter.body.1.resnets.3.block1.weight": src_state.pop("body.1.body.3.block1.weight"),
+        "adapter.body.1.resnets.3.block1.bias": src_state.pop("body.1.body.3.block1.bias"),
+        "adapter.body.1.resnets.3.block2.weight": src_state.pop("body.1.body.3.block2.weight"),
+        "adapter.body.1.resnets.3.block2.bias": src_state.pop("body.1.body.3.block2.bias"),
+        # body.1.out_conv
+        "adapter.body.1.out_conv.weight": src_state.pop("body.1.out_conv.weight"),
+        "adapter.body.1.out_conv.bias": src_state.pop("body.1.out_conv.bias"),
+        # body.2.in_conv
+        "adapter.body.2.in_conv.weight": src_state.pop("body.2.in_conv.weight"),
+        "adapter.body.2.in_conv.bias": src_state.pop("body.2.in_conv.bias"),
+        # body.2.body.0
+        "adapter.body.2.resnets.0.block1.weight": src_state.pop("body.2.body.0.block1.weight"),
+        "adapter.body.2.resnets.0.block1.bias": src_state.pop("body.2.body.0.block1.bias"),
+        "adapter.body.2.resnets.0.block2.weight": src_state.pop("body.2.body.0.block2.weight"),
+        "adapter.body.2.resnets.0.block2.bias": src_state.pop("body.2.body.0.block2.bias"),
+        # body.2.body.1
+        "adapter.body.2.resnets.1.block1.weight": src_state.pop("body.2.body.1.block1.weight"),
+        "adapter.body.2.resnets.1.block1.bias": src_state.pop("body.2.body.1.block1.bias"),
+        "adapter.body.2.resnets.1.block2.weight": src_state.pop("body.2.body.1.block2.weight"),
+        "adapter.body.2.resnets.1.block2.bias": src_state.pop("body.2.body.1.block2.bias"),
+        # body.2.body.2
+        "adapter.body.2.resnets.2.block1.weight": src_state.pop("body.2.body.2.block1.weight"),
+        "adapter.body.2.resnets.2.block1.bias": src_state.pop("body.2.body.2.block1.bias"),
+        "adapter.body.2.resnets.2.block2.weight": src_state.pop("body.2.body.2.block2.weight"),
+        "adapter.body.2.resnets.2.block2.bias": src_state.pop("body.2.body.2.block2.bias"),
+        # body.2.body.3
+        "adapter.body.2.resnets.3.block1.weight": src_state.pop("body.2.body.3.block1.weight"),
+        "adapter.body.2.resnets.3.block1.bias": src_state.pop("body.2.body.3.block1.bias"),
+        "adapter.body.2.resnets.3.block2.weight": src_state.pop("body.2.body.3.block2.weight"),
+        "adapter.body.2.resnets.3.block2.bias": src_state.pop("body.2.body.3.block2.bias"),
+        # body.2.out_conv
+        "adapter.body.2.out_conv.weight": src_state.pop("body.2.out_conv.weight"),
+        "adapter.body.2.out_conv.bias": src_state.pop("body.2.out_conv.bias"),
+        # body.3.in_conv
+        "adapter.body.3.in_conv.weight": src_state.pop("body.3.in_conv.weight"),
+        "adapter.body.3.in_conv.bias": src_state.pop("body.3.in_conv.bias"),
+        # body.3.body.0
+        "adapter.body.3.resnets.0.block1.weight": src_state.pop("body.3.body.0.block1.weight"),
+        "adapter.body.3.resnets.0.block1.bias": src_state.pop("body.3.body.0.block1.bias"),
+        "adapter.body.3.resnets.0.block2.weight": src_state.pop("body.3.body.0.block2.weight"),
+        "adapter.body.3.resnets.0.block2.bias": src_state.pop("body.3.body.0.block2.bias"),
+        # body.3.body.1
+        "adapter.body.3.resnets.1.block1.weight": src_state.pop("body.3.body.1.block1.weight"),
+        "adapter.body.3.resnets.1.block1.bias": src_state.pop("body.3.body.1.block1.bias"),
+        "adapter.body.3.resnets.1.block2.weight": src_state.pop("body.3.body.1.block2.weight"),
+        "adapter.body.3.resnets.1.block2.bias": src_state.pop("body.3.body.1.block2.bias"),
+        # body.3.body.2
+        "adapter.body.3.resnets.2.block1.weight": src_state.pop("body.3.body.2.block1.weight"),
+        "adapter.body.3.resnets.2.block1.bias": src_state.pop("body.3.body.2.block1.bias"),
+        "adapter.body.3.resnets.2.block2.weight": src_state.pop("body.3.body.2.block2.weight"),
+        "adapter.body.3.resnets.2.block2.bias": src_state.pop("body.3.body.2.block2.bias"),
+        # body.3.body.3
+        "adapter.body.3.resnets.3.block1.weight": src_state.pop("body.3.body.3.block1.weight"),
+        "adapter.body.3.resnets.3.block1.bias": src_state.pop("body.3.body.3.block1.bias"),
+        "adapter.body.3.resnets.3.block2.weight": src_state.pop("body.3.body.3.block2.weight"),
+        "adapter.body.3.resnets.3.block2.bias": src_state.pop("body.3.body.3.block2.bias"),
+        # body.3.out_conv
+        "adapter.body.3.out_conv.weight": src_state.pop("body.3.out_conv.weight"),
+        "adapter.body.3.out_conv.bias": src_state.pop("body.3.out_conv.bias"),
+    }
+
+    assert len(src_state) == 0
+
+    adapter = T2IAdapter(in_channels=3, channels=[320, 640, 1280], num_res_blocks=4, adapter_type="light_adapter")
+
+    adapter.load_state_dict(res_state)
+
+    return adapter
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--output_path", default=None, type=str, required=True, help="Path to the store the result checkpoint."
+    )
+    parser.add_argument(
+        "--is_adapter_light",
+        action="store_true",
+        help="Is checkpoint come from Adapter-Light architecture. ex: color-adapter",
+    )
+    parser.add_argument("--in_channels", required=False, type=int, help="Input channels for non-light adapter")
+
+    args = parser.parse_args()
+    src_state = torch.load(args.checkpoint_path)
+
+    if args.is_adapter_light:
+        adapter = convert_light_adapter(src_state)
+    else:
+        if args.in_channels is None:
+            raise ValueError("set `--in_channels=<n>`")
+        adapter = convert_adapter(src_state, args.in_channels)
+
+    adapter.save_pretrained(args.output_path)
diff --git a/diffusers/scripts/convert_pixart_alpha_to_diffusers.py b/diffusers/scripts/convert_pixart_alpha_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..228b479df090961df6e6f20d028d041eeda094de
--- /dev/null
+++ b/diffusers/scripts/convert_pixart_alpha_to_diffusers.py
@@ -0,0 +1,198 @@
+import argparse
+import os
+
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, PixArtAlphaPipeline, Transformer2DModel
+
+
+ckpt_id = "PixArt-alpha/PixArt-alpha"
+# https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/scripts/inference.py#L125
+interpolation_scale = {256: 0.5, 512: 1, 1024: 2}
+
+
+def main(args):
+    all_state_dict = torch.load(args.orig_ckpt_path, map_location="cpu")
+    state_dict = all_state_dict.pop("state_dict")
+    converted_state_dict = {}
+
+    # Patch embeddings.
+    converted_state_dict["pos_embed.proj.weight"] = state_dict.pop("x_embedder.proj.weight")
+    converted_state_dict["pos_embed.proj.bias"] = state_dict.pop("x_embedder.proj.bias")
+
+    # Caption projection.
+    converted_state_dict["caption_projection.linear_1.weight"] = state_dict.pop("y_embedder.y_proj.fc1.weight")
+    converted_state_dict["caption_projection.linear_1.bias"] = state_dict.pop("y_embedder.y_proj.fc1.bias")
+    converted_state_dict["caption_projection.linear_2.weight"] = state_dict.pop("y_embedder.y_proj.fc2.weight")
+    converted_state_dict["caption_projection.linear_2.bias"] = state_dict.pop("y_embedder.y_proj.fc2.bias")
+
+    # AdaLN-single LN
+    converted_state_dict["adaln_single.emb.timestep_embedder.linear_1.weight"] = state_dict.pop(
+        "t_embedder.mlp.0.weight"
+    )
+    converted_state_dict["adaln_single.emb.timestep_embedder.linear_1.bias"] = state_dict.pop("t_embedder.mlp.0.bias")
+    converted_state_dict["adaln_single.emb.timestep_embedder.linear_2.weight"] = state_dict.pop(
+        "t_embedder.mlp.2.weight"
+    )
+    converted_state_dict["adaln_single.emb.timestep_embedder.linear_2.bias"] = state_dict.pop("t_embedder.mlp.2.bias")
+
+    if args.image_size == 1024:
+        # Resolution.
+        converted_state_dict["adaln_single.emb.resolution_embedder.linear_1.weight"] = state_dict.pop(
+            "csize_embedder.mlp.0.weight"
+        )
+        converted_state_dict["adaln_single.emb.resolution_embedder.linear_1.bias"] = state_dict.pop(
+            "csize_embedder.mlp.0.bias"
+        )
+        converted_state_dict["adaln_single.emb.resolution_embedder.linear_2.weight"] = state_dict.pop(
+            "csize_embedder.mlp.2.weight"
+        )
+        converted_state_dict["adaln_single.emb.resolution_embedder.linear_2.bias"] = state_dict.pop(
+            "csize_embedder.mlp.2.bias"
+        )
+        # Aspect ratio.
+        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_1.weight"] = state_dict.pop(
+            "ar_embedder.mlp.0.weight"
+        )
+        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_1.bias"] = state_dict.pop(
+            "ar_embedder.mlp.0.bias"
+        )
+        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_2.weight"] = state_dict.pop(
+            "ar_embedder.mlp.2.weight"
+        )
+        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_2.bias"] = state_dict.pop(
+            "ar_embedder.mlp.2.bias"
+        )
+    # Shared norm.
+    converted_state_dict["adaln_single.linear.weight"] = state_dict.pop("t_block.1.weight")
+    converted_state_dict["adaln_single.linear.bias"] = state_dict.pop("t_block.1.bias")
+
+    for depth in range(28):
+        # Transformer blocks.
+        converted_state_dict[f"transformer_blocks.{depth}.scale_shift_table"] = state_dict.pop(
+            f"blocks.{depth}.scale_shift_table"
+        )
+
+        # Attention is all you need 🤘
+
+        # Self attention.
+        q, k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.weight"), 3, dim=0)
+        q_bias, k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.bias"), 3, dim=0)
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.weight"] = q
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.bias"] = q_bias
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.weight"] = k
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.bias"] = k_bias
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.weight"] = v
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.bias"] = v_bias
+        # Projection.
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.weight"] = state_dict.pop(
+            f"blocks.{depth}.attn.proj.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.bias"] = state_dict.pop(
+            f"blocks.{depth}.attn.proj.bias"
+        )
+
+        # Feed-forward.
+        converted_state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.weight"] = state_dict.pop(
+            f"blocks.{depth}.mlp.fc1.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.bias"] = state_dict.pop(
+            f"blocks.{depth}.mlp.fc1.bias"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.net.2.weight"] = state_dict.pop(
+            f"blocks.{depth}.mlp.fc2.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.net.2.bias"] = state_dict.pop(
+            f"blocks.{depth}.mlp.fc2.bias"
+        )
+
+        # Cross-attention.
+        q = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.weight")
+        q_bias = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.bias")
+        k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.weight"), 2, dim=0)
+        k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.bias"), 2, dim=0)
+
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.weight"] = q
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.bias"] = q_bias
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.weight"] = k
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.bias"] = k_bias
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.weight"] = v
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.bias"] = v_bias
+
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.weight"] = state_dict.pop(
+            f"blocks.{depth}.cross_attn.proj.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.bias"] = state_dict.pop(
+            f"blocks.{depth}.cross_attn.proj.bias"
+        )
+
+    # Final block.
+    converted_state_dict["proj_out.weight"] = state_dict.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = state_dict.pop("final_layer.linear.bias")
+    converted_state_dict["scale_shift_table"] = state_dict.pop("final_layer.scale_shift_table")
+
+    # DiT XL/2
+    transformer = Transformer2DModel(
+        sample_size=args.image_size // 8,
+        num_layers=28,
+        attention_head_dim=72,
+        in_channels=4,
+        out_channels=8,
+        patch_size=2,
+        attention_bias=True,
+        num_attention_heads=16,
+        cross_attention_dim=1152,
+        activation_fn="gelu-approximate",
+        num_embeds_ada_norm=1000,
+        norm_type="ada_norm_single",
+        norm_elementwise_affine=False,
+        norm_eps=1e-6,
+        caption_channels=4096,
+    )
+    transformer.load_state_dict(converted_state_dict, strict=True)
+
+    assert transformer.pos_embed.pos_embed is not None
+    state_dict.pop("pos_embed")
+    state_dict.pop("y_embedder.y_embedding")
+    assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"
+
+    num_model_params = sum(p.numel() for p in transformer.parameters())
+    print(f"Total number of transformer parameters: {num_model_params}")
+
+    if args.only_transformer:
+        transformer.save_pretrained(os.path.join(args.dump_path, "transformer"))
+    else:
+        scheduler = DPMSolverMultistepScheduler()
+
+        vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="sd-vae-ft-ema")
+
+        tokenizer = T5Tokenizer.from_pretrained(ckpt_id, subfolder="t5-v1_1-xxl")
+        text_encoder = T5EncoderModel.from_pretrained(ckpt_id, subfolder="t5-v1_1-xxl")
+
+        pipeline = PixArtAlphaPipeline(
+            tokenizer=tokenizer, text_encoder=text_encoder, transformer=transformer, vae=vae, scheduler=scheduler
+        )
+
+        pipeline.save_pretrained(args.dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--orig_ckpt_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--image_size",
+        default=1024,
+        type=int,
+        choices=[256, 512, 1024],
+        required=False,
+        help="Image size of pretrained model, either 512 or 1024.",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
+    parser.add_argument("--only_transformer", default=True, type=bool, required=True)
+
+    args = parser.parse_args()
+    main(args)
diff --git a/diffusers/scripts/convert_pixart_sigma_to_diffusers.py b/diffusers/scripts/convert_pixart_sigma_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9572a83c06441b36221ccbc88b343182f082dcf2
--- /dev/null
+++ b/diffusers/scripts/convert_pixart_sigma_to_diffusers.py
@@ -0,0 +1,223 @@
+import argparse
+import os
+
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, PixArtSigmaPipeline, Transformer2DModel
+
+
+ckpt_id = "PixArt-alpha"
+# https://github.com/PixArt-alpha/PixArt-sigma/blob/dd087141864e30ec44f12cb7448dd654be065e88/scripts/inference.py#L158
+interpolation_scale = {256: 0.5, 512: 1, 1024: 2, 2048: 4}
+
+
+def main(args):
+    all_state_dict = torch.load(args.orig_ckpt_path)
+    state_dict = all_state_dict.pop("state_dict")
+    converted_state_dict = {}
+
+    # Patch embeddings.
+    converted_state_dict["pos_embed.proj.weight"] = state_dict.pop("x_embedder.proj.weight")
+    converted_state_dict["pos_embed.proj.bias"] = state_dict.pop("x_embedder.proj.bias")
+
+    # Caption projection.
+    converted_state_dict["caption_projection.linear_1.weight"] = state_dict.pop("y_embedder.y_proj.fc1.weight")
+    converted_state_dict["caption_projection.linear_1.bias"] = state_dict.pop("y_embedder.y_proj.fc1.bias")
+    converted_state_dict["caption_projection.linear_2.weight"] = state_dict.pop("y_embedder.y_proj.fc2.weight")
+    converted_state_dict["caption_projection.linear_2.bias"] = state_dict.pop("y_embedder.y_proj.fc2.bias")
+
+    # AdaLN-single LN
+    converted_state_dict["adaln_single.emb.timestep_embedder.linear_1.weight"] = state_dict.pop(
+        "t_embedder.mlp.0.weight"
+    )
+    converted_state_dict["adaln_single.emb.timestep_embedder.linear_1.bias"] = state_dict.pop("t_embedder.mlp.0.bias")
+    converted_state_dict["adaln_single.emb.timestep_embedder.linear_2.weight"] = state_dict.pop(
+        "t_embedder.mlp.2.weight"
+    )
+    converted_state_dict["adaln_single.emb.timestep_embedder.linear_2.bias"] = state_dict.pop("t_embedder.mlp.2.bias")
+
+    if args.micro_condition:
+        # Resolution.
+        converted_state_dict["adaln_single.emb.resolution_embedder.linear_1.weight"] = state_dict.pop(
+            "csize_embedder.mlp.0.weight"
+        )
+        converted_state_dict["adaln_single.emb.resolution_embedder.linear_1.bias"] = state_dict.pop(
+            "csize_embedder.mlp.0.bias"
+        )
+        converted_state_dict["adaln_single.emb.resolution_embedder.linear_2.weight"] = state_dict.pop(
+            "csize_embedder.mlp.2.weight"
+        )
+        converted_state_dict["adaln_single.emb.resolution_embedder.linear_2.bias"] = state_dict.pop(
+            "csize_embedder.mlp.2.bias"
+        )
+        # Aspect ratio.
+        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_1.weight"] = state_dict.pop(
+            "ar_embedder.mlp.0.weight"
+        )
+        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_1.bias"] = state_dict.pop(
+            "ar_embedder.mlp.0.bias"
+        )
+        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_2.weight"] = state_dict.pop(
+            "ar_embedder.mlp.2.weight"
+        )
+        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_2.bias"] = state_dict.pop(
+            "ar_embedder.mlp.2.bias"
+        )
+    # Shared norm.
+    converted_state_dict["adaln_single.linear.weight"] = state_dict.pop("t_block.1.weight")
+    converted_state_dict["adaln_single.linear.bias"] = state_dict.pop("t_block.1.bias")
+
+    for depth in range(28):
+        # Transformer blocks.
+        converted_state_dict[f"transformer_blocks.{depth}.scale_shift_table"] = state_dict.pop(
+            f"blocks.{depth}.scale_shift_table"
+        )
+        # Attention is all you need 🤘
+
+        # Self attention.
+        q, k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.weight"), 3, dim=0)
+        q_bias, k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.bias"), 3, dim=0)
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.weight"] = q
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.bias"] = q_bias
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.weight"] = k
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.bias"] = k_bias
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.weight"] = v
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.bias"] = v_bias
+        # Projection.
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.weight"] = state_dict.pop(
+            f"blocks.{depth}.attn.proj.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.bias"] = state_dict.pop(
+            f"blocks.{depth}.attn.proj.bias"
+        )
+        if args.qk_norm:
+            converted_state_dict[f"transformer_blocks.{depth}.attn1.q_norm.weight"] = state_dict.pop(
+                f"blocks.{depth}.attn.q_norm.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{depth}.attn1.q_norm.bias"] = state_dict.pop(
+                f"blocks.{depth}.attn.q_norm.bias"
+            )
+            converted_state_dict[f"transformer_blocks.{depth}.attn1.k_norm.weight"] = state_dict.pop(
+                f"blocks.{depth}.attn.k_norm.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{depth}.attn1.k_norm.bias"] = state_dict.pop(
+                f"blocks.{depth}.attn.k_norm.bias"
+            )
+
+        # Feed-forward.
+        converted_state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.weight"] = state_dict.pop(
+            f"blocks.{depth}.mlp.fc1.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.bias"] = state_dict.pop(
+            f"blocks.{depth}.mlp.fc1.bias"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.net.2.weight"] = state_dict.pop(
+            f"blocks.{depth}.mlp.fc2.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.net.2.bias"] = state_dict.pop(
+            f"blocks.{depth}.mlp.fc2.bias"
+        )
+
+        # Cross-attention.
+        q = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.weight")
+        q_bias = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.bias")
+        k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.weight"), 2, dim=0)
+        k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.bias"), 2, dim=0)
+
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.weight"] = q
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.bias"] = q_bias
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.weight"] = k
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.bias"] = k_bias
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.weight"] = v
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.bias"] = v_bias
+
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.weight"] = state_dict.pop(
+            f"blocks.{depth}.cross_attn.proj.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.bias"] = state_dict.pop(
+            f"blocks.{depth}.cross_attn.proj.bias"
+        )
+
+    # Final block.
+    converted_state_dict["proj_out.weight"] = state_dict.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = state_dict.pop("final_layer.linear.bias")
+    converted_state_dict["scale_shift_table"] = state_dict.pop("final_layer.scale_shift_table")
+
+    # PixArt XL/2
+    transformer = Transformer2DModel(
+        sample_size=args.image_size // 8,
+        num_layers=28,
+        attention_head_dim=72,
+        in_channels=4,
+        out_channels=8,
+        patch_size=2,
+        attention_bias=True,
+        num_attention_heads=16,
+        cross_attention_dim=1152,
+        activation_fn="gelu-approximate",
+        num_embeds_ada_norm=1000,
+        norm_type="ada_norm_single",
+        norm_elementwise_affine=False,
+        norm_eps=1e-6,
+        caption_channels=4096,
+        interpolation_scale=interpolation_scale[args.image_size],
+        use_additional_conditions=args.micro_condition,
+    )
+    transformer.load_state_dict(converted_state_dict, strict=True)
+
+    assert transformer.pos_embed.pos_embed is not None
+    try:
+        state_dict.pop("y_embedder.y_embedding")
+        state_dict.pop("pos_embed")
+    except Exception as e:
+        print(f"Skipping {str(e)}")
+        pass
+    assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"
+
+    num_model_params = sum(p.numel() for p in transformer.parameters())
+    print(f"Total number of transformer parameters: {num_model_params}")
+
+    if args.only_transformer:
+        transformer.save_pretrained(os.path.join(args.dump_path, "transformer"))
+    else:
+        # pixart-Sigma vae link: https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers/tree/main/vae
+        vae = AutoencoderKL.from_pretrained(f"{ckpt_id}/pixart_sigma_sdxlvae_T5_diffusers", subfolder="vae")
+
+        scheduler = DPMSolverMultistepScheduler()
+
+        tokenizer = T5Tokenizer.from_pretrained(f"{ckpt_id}/pixart_sigma_sdxlvae_T5_diffusers", subfolder="tokenizer")
+        text_encoder = T5EncoderModel.from_pretrained(
+            f"{ckpt_id}/pixart_sigma_sdxlvae_T5_diffusers", subfolder="text_encoder"
+        )
+
+        pipeline = PixArtSigmaPipeline(
+            tokenizer=tokenizer, text_encoder=text_encoder, transformer=transformer, vae=vae, scheduler=scheduler
+        )
+
+        pipeline.save_pretrained(args.dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--micro_condition", action="store_true", help="If use Micro-condition in PixArtMS structure during training."
+    )
+    parser.add_argument("--qk_norm", action="store_true", help="If use qk norm during training.")
+    parser.add_argument(
+        "--orig_ckpt_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--image_size",
+        default=1024,
+        type=int,
+        choices=[256, 512, 1024, 2048],
+        required=False,
+        help="Image size of pretrained model, 256, 512, 1024, or 2048.",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
+    parser.add_argument("--only_transformer", default=True, type=bool, required=True)
+
+    args = parser.parse_args()
+    main(args)
diff --git a/diffusers/scripts/convert_sd3_to_diffusers.py b/diffusers/scripts/convert_sd3_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f32745dae75142ce829395617150876b34afc09
--- /dev/null
+++ b/diffusers/scripts/convert_sd3_to_diffusers.py
@@ -0,0 +1,248 @@
+import argparse
+from contextlib import nullcontext
+
+import safetensors.torch
+import torch
+from accelerate import init_empty_weights
+
+from diffusers import AutoencoderKL, SD3Transformer2DModel
+from diffusers.loaders.single_file_utils import convert_ldm_vae_checkpoint
+from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.utils.import_utils import is_accelerate_available
+
+
+CTX = init_empty_weights if is_accelerate_available else nullcontext
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--checkpoint_path", type=str)
+parser.add_argument("--output_path", type=str)
+parser.add_argument("--dtype", type=str, default="fp16")
+
+args = parser.parse_args()
+dtype = torch.float16 if args.dtype == "fp16" else torch.float32
+
+
+def load_original_checkpoint(ckpt_path):
+    original_state_dict = safetensors.torch.load_file(ckpt_path)
+    keys = list(original_state_dict.keys())
+    for k in keys:
+        if "model.diffusion_model." in k:
+            original_state_dict[k.replace("model.diffusion_model.", "")] = original_state_dict.pop(k)
+
+    return original_state_dict
+
+
+# in SD3 original implementation of AdaLayerNormContinuous, it split linear projection output into shift, scale;
+# while in diffusers it split into scale, shift. Here we swap the linear projection weights in order to be able to use diffusers implementation
+def swap_scale_shift(weight, dim):
+    shift, scale = weight.chunk(2, dim=0)
+    new_weight = torch.cat([scale, shift], dim=0)
+    return new_weight
+
+
+def convert_sd3_transformer_checkpoint_to_diffusers(original_state_dict, num_layers, caption_projection_dim):
+    converted_state_dict = {}
+
+    # Positional and patch embeddings.
+    converted_state_dict["pos_embed.pos_embed"] = original_state_dict.pop("pos_embed")
+    converted_state_dict["pos_embed.proj.weight"] = original_state_dict.pop("x_embedder.proj.weight")
+    converted_state_dict["pos_embed.proj.bias"] = original_state_dict.pop("x_embedder.proj.bias")
+
+    # Timestep embeddings.
+    converted_state_dict["time_text_embed.timestep_embedder.linear_1.weight"] = original_state_dict.pop(
+        "t_embedder.mlp.0.weight"
+    )
+    converted_state_dict["time_text_embed.timestep_embedder.linear_1.bias"] = original_state_dict.pop(
+        "t_embedder.mlp.0.bias"
+    )
+    converted_state_dict["time_text_embed.timestep_embedder.linear_2.weight"] = original_state_dict.pop(
+        "t_embedder.mlp.2.weight"
+    )
+    converted_state_dict["time_text_embed.timestep_embedder.linear_2.bias"] = original_state_dict.pop(
+        "t_embedder.mlp.2.bias"
+    )
+
+    # Context projections.
+    converted_state_dict["context_embedder.weight"] = original_state_dict.pop("context_embedder.weight")
+    converted_state_dict["context_embedder.bias"] = original_state_dict.pop("context_embedder.bias")
+
+    # Pooled context projection.
+    converted_state_dict["time_text_embed.text_embedder.linear_1.weight"] = original_state_dict.pop(
+        "y_embedder.mlp.0.weight"
+    )
+    converted_state_dict["time_text_embed.text_embedder.linear_1.bias"] = original_state_dict.pop(
+        "y_embedder.mlp.0.bias"
+    )
+    converted_state_dict["time_text_embed.text_embedder.linear_2.weight"] = original_state_dict.pop(
+        "y_embedder.mlp.2.weight"
+    )
+    converted_state_dict["time_text_embed.text_embedder.linear_2.bias"] = original_state_dict.pop(
+        "y_embedder.mlp.2.bias"
+    )
+
+    # Transformer blocks 🎸.
+    for i in range(num_layers):
+        # Q, K, V
+        sample_q, sample_k, sample_v = torch.chunk(
+            original_state_dict.pop(f"joint_blocks.{i}.x_block.attn.qkv.weight"), 3, dim=0
+        )
+        context_q, context_k, context_v = torch.chunk(
+            original_state_dict.pop(f"joint_blocks.{i}.context_block.attn.qkv.weight"), 3, dim=0
+        )
+        sample_q_bias, sample_k_bias, sample_v_bias = torch.chunk(
+            original_state_dict.pop(f"joint_blocks.{i}.x_block.attn.qkv.bias"), 3, dim=0
+        )
+        context_q_bias, context_k_bias, context_v_bias = torch.chunk(
+            original_state_dict.pop(f"joint_blocks.{i}.context_block.attn.qkv.bias"), 3, dim=0
+        )
+
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_q.weight"] = torch.cat([sample_q])
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_q.bias"] = torch.cat([sample_q_bias])
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_k.weight"] = torch.cat([sample_k])
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_k.bias"] = torch.cat([sample_k_bias])
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_v.weight"] = torch.cat([sample_v])
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_v.bias"] = torch.cat([sample_v_bias])
+
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_q_proj.weight"] = torch.cat([context_q])
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_q_proj.bias"] = torch.cat([context_q_bias])
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_k_proj.weight"] = torch.cat([context_k])
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_k_proj.bias"] = torch.cat([context_k_bias])
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_v_proj.weight"] = torch.cat([context_v])
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_v_proj.bias"] = torch.cat([context_v_bias])
+
+        # output projections.
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_out.0.weight"] = original_state_dict.pop(
+            f"joint_blocks.{i}.x_block.attn.proj.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_out.0.bias"] = original_state_dict.pop(
+            f"joint_blocks.{i}.x_block.attn.proj.bias"
+        )
+        if not (i == num_layers - 1):
+            converted_state_dict[f"transformer_blocks.{i}.attn.to_add_out.weight"] = original_state_dict.pop(
+                f"joint_blocks.{i}.context_block.attn.proj.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.attn.to_add_out.bias"] = original_state_dict.pop(
+                f"joint_blocks.{i}.context_block.attn.proj.bias"
+            )
+
+        # norms.
+        converted_state_dict[f"transformer_blocks.{i}.norm1.linear.weight"] = original_state_dict.pop(
+            f"joint_blocks.{i}.x_block.adaLN_modulation.1.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{i}.norm1.linear.bias"] = original_state_dict.pop(
+            f"joint_blocks.{i}.x_block.adaLN_modulation.1.bias"
+        )
+        if not (i == num_layers - 1):
+            converted_state_dict[f"transformer_blocks.{i}.norm1_context.linear.weight"] = original_state_dict.pop(
+                f"joint_blocks.{i}.context_block.adaLN_modulation.1.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.norm1_context.linear.bias"] = original_state_dict.pop(
+                f"joint_blocks.{i}.context_block.adaLN_modulation.1.bias"
+            )
+        else:
+            converted_state_dict[f"transformer_blocks.{i}.norm1_context.linear.weight"] = swap_scale_shift(
+                original_state_dict.pop(f"joint_blocks.{i}.context_block.adaLN_modulation.1.weight"),
+                dim=caption_projection_dim,
+            )
+            converted_state_dict[f"transformer_blocks.{i}.norm1_context.linear.bias"] = swap_scale_shift(
+                original_state_dict.pop(f"joint_blocks.{i}.context_block.adaLN_modulation.1.bias"),
+                dim=caption_projection_dim,
+            )
+
+        # ffs.
+        converted_state_dict[f"transformer_blocks.{i}.ff.net.0.proj.weight"] = original_state_dict.pop(
+            f"joint_blocks.{i}.x_block.mlp.fc1.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{i}.ff.net.0.proj.bias"] = original_state_dict.pop(
+            f"joint_blocks.{i}.x_block.mlp.fc1.bias"
+        )
+        converted_state_dict[f"transformer_blocks.{i}.ff.net.2.weight"] = original_state_dict.pop(
+            f"joint_blocks.{i}.x_block.mlp.fc2.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{i}.ff.net.2.bias"] = original_state_dict.pop(
+            f"joint_blocks.{i}.x_block.mlp.fc2.bias"
+        )
+        if not (i == num_layers - 1):
+            converted_state_dict[f"transformer_blocks.{i}.ff_context.net.0.proj.weight"] = original_state_dict.pop(
+                f"joint_blocks.{i}.context_block.mlp.fc1.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.ff_context.net.0.proj.bias"] = original_state_dict.pop(
+                f"joint_blocks.{i}.context_block.mlp.fc1.bias"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.ff_context.net.2.weight"] = original_state_dict.pop(
+                f"joint_blocks.{i}.context_block.mlp.fc2.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.ff_context.net.2.bias"] = original_state_dict.pop(
+                f"joint_blocks.{i}.context_block.mlp.fc2.bias"
+            )
+
+    # Final blocks.
+    converted_state_dict["proj_out.weight"] = original_state_dict.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = original_state_dict.pop("final_layer.linear.bias")
+    converted_state_dict["norm_out.linear.weight"] = swap_scale_shift(
+        original_state_dict.pop("final_layer.adaLN_modulation.1.weight"), dim=caption_projection_dim
+    )
+    converted_state_dict["norm_out.linear.bias"] = swap_scale_shift(
+        original_state_dict.pop("final_layer.adaLN_modulation.1.bias"), dim=caption_projection_dim
+    )
+
+    return converted_state_dict
+
+
+def is_vae_in_checkpoint(original_state_dict):
+    return ("first_stage_model.decoder.conv_in.weight" in original_state_dict) and (
+        "first_stage_model.encoder.conv_in.weight" in original_state_dict
+    )
+
+
+def main(args):
+    original_ckpt = load_original_checkpoint(args.checkpoint_path)
+    num_layers = list(set(int(k.split(".", 2)[1]) for k in original_ckpt if "joint_blocks" in k))[-1] + 1  # noqa: C401
+    caption_projection_dim = 1536
+
+    converted_transformer_state_dict = convert_sd3_transformer_checkpoint_to_diffusers(
+        original_ckpt, num_layers, caption_projection_dim
+    )
+
+    with CTX():
+        transformer = SD3Transformer2DModel(
+            sample_size=64,
+            patch_size=2,
+            in_channels=16,
+            joint_attention_dim=4096,
+            num_layers=num_layers,
+            caption_projection_dim=caption_projection_dim,
+            num_attention_heads=24,
+            pos_embed_max_size=192,
+        )
+    if is_accelerate_available():
+        load_model_dict_into_meta(transformer, converted_transformer_state_dict)
+    else:
+        transformer.load_state_dict(converted_transformer_state_dict, strict=True)
+
+    print("Saving SD3 Transformer in Diffusers format.")
+    transformer.to(dtype).save_pretrained(f"{args.output_path}/transformer")
+
+    if is_vae_in_checkpoint(original_ckpt):
+        with CTX():
+            vae = AutoencoderKL.from_config(
+                "stabilityai/stable-diffusion-xl-base-1.0",
+                subfolder="vae",
+                latent_channels=16,
+                use_post_quant_conv=False,
+                use_quant_conv=False,
+                scaling_factor=1.5305,
+                shift_factor=0.0609,
+            )
+        converted_vae_state_dict = convert_ldm_vae_checkpoint(original_ckpt, vae.config)
+        if is_accelerate_available():
+            load_model_dict_into_meta(vae, converted_vae_state_dict)
+        else:
+            vae.load_state_dict(converted_vae_state_dict, strict=True)
+
+        print("Saving SD3 Autoencoder in Diffusers format.")
+        vae.to(dtype).save_pretrained(f"{args.output_path}/vae")
+
+
+if __name__ == "__main__":
+    main(args)
diff --git a/diffusers/scripts/convert_shap_e_to_diffusers.py b/diffusers/scripts/convert_shap_e_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..b903b4ee8a7fb0877a6342811b2db0ca938be7eb
--- /dev/null
+++ b/diffusers/scripts/convert_shap_e_to_diffusers.py
@@ -0,0 +1,1080 @@
+import argparse
+import tempfile
+
+import torch
+from accelerate import load_checkpoint_and_dispatch
+
+from diffusers.models.transformers.prior_transformer import PriorTransformer
+from diffusers.pipelines.shap_e import ShapERenderer
+
+
+"""
+Example - From the diffusers root directory:
+
+Download weights:
+```sh
+$ wget  "https://openaipublic.azureedge.net/main/shap-e/text_cond.pt"
+```
+
+Convert the model:
+```sh
+$ python scripts/convert_shap_e_to_diffusers.py \
+      --prior_checkpoint_path  /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/text_cond.pt \
+      --prior_image_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/image_cond.pt \
+      --transmitter_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/transmitter.pt\
+      --dump_path /home/yiyi_huggingface_co/model_repo/shap-e-img2img/shap_e_renderer\
+      --debug renderer
+```
+"""
+
+
+# prior
+
+PRIOR_ORIGINAL_PREFIX = "wrapped"
+
+PRIOR_CONFIG = {
+    "num_attention_heads": 16,
+    "attention_head_dim": 1024 // 16,
+    "num_layers": 24,
+    "embedding_dim": 1024,
+    "num_embeddings": 1024,
+    "additional_embeddings": 0,
+    "time_embed_act_fn": "gelu",
+    "norm_in_type": "layer",
+    "encoder_hid_proj_type": None,
+    "added_emb_type": None,
+    "time_embed_dim": 1024 * 4,
+    "embedding_proj_dim": 768,
+    "clip_embed_dim": 1024 * 2,
+}
+
+
+def prior_model_from_original_config():
+    model = PriorTransformer(**PRIOR_CONFIG)
+
+    return model
+
+
+def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    # <original>.time_embed.c_fc -> <diffusers>.time_embedding.linear_1
+    diffusers_checkpoint.update(
+        {
+            "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_fc.weight"],
+            "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_fc.bias"],
+        }
+    )
+
+    # <original>.time_embed.c_proj -> <diffusers>.time_embedding.linear_2
+    diffusers_checkpoint.update(
+        {
+            "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_proj.weight"],
+            "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_proj.bias"],
+        }
+    )
+
+    # <original>.input_proj -> <diffusers>.proj_in
+    diffusers_checkpoint.update(
+        {
+            "proj_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.input_proj.weight"],
+            "proj_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.input_proj.bias"],
+        }
+    )
+
+    # <original>.clip_emb -> <diffusers>.embedding_proj
+    diffusers_checkpoint.update(
+        {
+            "embedding_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_embed.weight"],
+            "embedding_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_embed.bias"],
+        }
+    )
+
+    # <original>.pos_emb -> <diffusers>.positional_embedding
+    diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.pos_emb"][None, :]})
+
+    # <original>.ln_pre -> <diffusers>.norm_in
+    diffusers_checkpoint.update(
+        {
+            "norm_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_pre.weight"],
+            "norm_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_pre.bias"],
+        }
+    )
+
+    # <original>.backbone.resblocks.<x> -> <diffusers>.transformer_blocks.<x>
+    for idx in range(len(model.transformer_blocks)):
+        diffusers_transformer_prefix = f"transformer_blocks.{idx}"
+        original_transformer_prefix = f"{PRIOR_ORIGINAL_PREFIX}.backbone.resblocks.{idx}"
+
+        # <original>.attn -> <diffusers>.attn1
+        diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1"
+        original_attention_prefix = f"{original_transformer_prefix}.attn"
+        diffusers_checkpoint.update(
+            prior_attention_to_diffusers(
+                checkpoint,
+                diffusers_attention_prefix=diffusers_attention_prefix,
+                original_attention_prefix=original_attention_prefix,
+                attention_head_dim=model.attention_head_dim,
+            )
+        )
+
+        # <original>.mlp -> <diffusers>.ff
+        diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff"
+        original_ff_prefix = f"{original_transformer_prefix}.mlp"
+        diffusers_checkpoint.update(
+            prior_ff_to_diffusers(
+                checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix
+            )
+        )
+
+        # <original>.ln_1 -> <diffusers>.norm1
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[
+                    f"{original_transformer_prefix}.ln_1.weight"
+                ],
+                f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"],
+            }
+        )
+
+        # <original>.ln_2 -> <diffusers>.norm3
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[
+                    f"{original_transformer_prefix}.ln_2.weight"
+                ],
+                f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"],
+            }
+        )
+
+    # <original>.ln_post -> <diffusers>.norm_out
+    diffusers_checkpoint.update(
+        {
+            "norm_out.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_post.weight"],
+            "norm_out.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_post.bias"],
+        }
+    )
+
+    # <original>.output_proj -> <diffusers>.proj_to_clip_embeddings
+    diffusers_checkpoint.update(
+        {
+            "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.output_proj.weight"],
+            "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.output_proj.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def prior_attention_to_diffusers(
+    checkpoint, *, diffusers_attention_prefix, original_attention_prefix, attention_head_dim
+):
+    diffusers_checkpoint = {}
+
+    # <original>.c_qkv -> <diffusers>.{to_q, to_k, to_v}
+    [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions(
+        weight=checkpoint[f"{original_attention_prefix}.c_qkv.weight"],
+        bias=checkpoint[f"{original_attention_prefix}.c_qkv.bias"],
+        split=3,
+        chunk_size=attention_head_dim,
+    )
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.to_q.weight": q_weight,
+            f"{diffusers_attention_prefix}.to_q.bias": q_bias,
+            f"{diffusers_attention_prefix}.to_k.weight": k_weight,
+            f"{diffusers_attention_prefix}.to_k.bias": k_bias,
+            f"{diffusers_attention_prefix}.to_v.weight": v_weight,
+            f"{diffusers_attention_prefix}.to_v.bias": v_bias,
+        }
+    )
+
+    # <original>.c_proj -> <diffusers>.to_out.0
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{original_attention_prefix}.c_proj.weight"],
+            f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{original_attention_prefix}.c_proj.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix):
+    diffusers_checkpoint = {
+        # <original>.c_fc -> <diffusers>.net.0.proj
+        f"{diffusers_ff_prefix}.net.{0}.proj.weight": checkpoint[f"{original_ff_prefix}.c_fc.weight"],
+        f"{diffusers_ff_prefix}.net.{0}.proj.bias": checkpoint[f"{original_ff_prefix}.c_fc.bias"],
+        # <original>.c_proj -> <diffusers>.net.2
+        f"{diffusers_ff_prefix}.net.{2}.weight": checkpoint[f"{original_ff_prefix}.c_proj.weight"],
+        f"{diffusers_ff_prefix}.net.{2}.bias": checkpoint[f"{original_ff_prefix}.c_proj.bias"],
+    }
+
+    return diffusers_checkpoint
+
+
+# done prior
+
+
+# prior_image (only slightly different from prior)
+
+
+PRIOR_IMAGE_ORIGINAL_PREFIX = "wrapped"
+
+# Uses default arguments
+PRIOR_IMAGE_CONFIG = {
+    "num_attention_heads": 8,
+    "attention_head_dim": 1024 // 8,
+    "num_layers": 24,
+    "embedding_dim": 1024,
+    "num_embeddings": 1024,
+    "additional_embeddings": 0,
+    "time_embed_act_fn": "gelu",
+    "norm_in_type": "layer",
+    "embedding_proj_norm_type": "layer",
+    "encoder_hid_proj_type": None,
+    "added_emb_type": None,
+    "time_embed_dim": 1024 * 4,
+    "embedding_proj_dim": 1024,
+    "clip_embed_dim": 1024 * 2,
+}
+
+
+def prior_image_model_from_original_config():
+    model = PriorTransformer(**PRIOR_IMAGE_CONFIG)
+
+    return model
+
+
+def prior_image_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    # <original>.time_embed.c_fc -> <diffusers>.time_embedding.linear_1
+    diffusers_checkpoint.update(
+        {
+            "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_fc.weight"],
+            "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_fc.bias"],
+        }
+    )
+
+    # <original>.time_embed.c_proj -> <diffusers>.time_embedding.linear_2
+    diffusers_checkpoint.update(
+        {
+            "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_proj.weight"],
+            "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_proj.bias"],
+        }
+    )
+
+    # <original>.input_proj -> <diffusers>.proj_in
+    diffusers_checkpoint.update(
+        {
+            "proj_in.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.input_proj.weight"],
+            "proj_in.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.input_proj.bias"],
+        }
+    )
+
+    # <original>.clip_embed.0 -> <diffusers>.embedding_proj_norm
+    diffusers_checkpoint.update(
+        {
+            "embedding_proj_norm.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.0.weight"],
+            "embedding_proj_norm.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.0.bias"],
+        }
+    )
+
+    # <original>..clip_embed.1 -> <diffusers>.embedding_proj
+    diffusers_checkpoint.update(
+        {
+            "embedding_proj.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.1.weight"],
+            "embedding_proj.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.1.bias"],
+        }
+    )
+
+    # <original>.pos_emb -> <diffusers>.positional_embedding
+    diffusers_checkpoint.update(
+        {"positional_embedding": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.pos_emb"][None, :]}
+    )
+
+    # <original>.ln_pre -> <diffusers>.norm_in
+    diffusers_checkpoint.update(
+        {
+            "norm_in.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_pre.weight"],
+            "norm_in.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_pre.bias"],
+        }
+    )
+
+    # <original>.backbone.resblocks.<x> -> <diffusers>.transformer_blocks.<x>
+    for idx in range(len(model.transformer_blocks)):
+        diffusers_transformer_prefix = f"transformer_blocks.{idx}"
+        original_transformer_prefix = f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.backbone.resblocks.{idx}"
+
+        # <original>.attn -> <diffusers>.attn1
+        diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1"
+        original_attention_prefix = f"{original_transformer_prefix}.attn"
+        diffusers_checkpoint.update(
+            prior_attention_to_diffusers(
+                checkpoint,
+                diffusers_attention_prefix=diffusers_attention_prefix,
+                original_attention_prefix=original_attention_prefix,
+                attention_head_dim=model.attention_head_dim,
+            )
+        )
+
+        # <original>.mlp -> <diffusers>.ff
+        diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff"
+        original_ff_prefix = f"{original_transformer_prefix}.mlp"
+        diffusers_checkpoint.update(
+            prior_ff_to_diffusers(
+                checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix
+            )
+        )
+
+        # <original>.ln_1 -> <diffusers>.norm1
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[
+                    f"{original_transformer_prefix}.ln_1.weight"
+                ],
+                f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"],
+            }
+        )
+
+        # <original>.ln_2 -> <diffusers>.norm3
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[
+                    f"{original_transformer_prefix}.ln_2.weight"
+                ],
+                f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"],
+            }
+        )
+
+    # <original>.ln_post -> <diffusers>.norm_out
+    diffusers_checkpoint.update(
+        {
+            "norm_out.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_post.weight"],
+            "norm_out.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_post.bias"],
+        }
+    )
+
+    # <original>.output_proj -> <diffusers>.proj_to_clip_embeddings
+    diffusers_checkpoint.update(
+        {
+            "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.output_proj.weight"],
+            "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.output_proj.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+# done prior_image
+
+
+# renderer
+
+## create the lookup table for marching cubes method used in MeshDecoder
+
+MC_TABLE = [
+    [],
+    [[0, 1, 0, 2, 0, 4]],
+    [[1, 0, 1, 5, 1, 3]],
+    [[0, 4, 1, 5, 0, 2], [1, 5, 1, 3, 0, 2]],
+    [[2, 0, 2, 3, 2, 6]],
+    [[0, 1, 2, 3, 0, 4], [2, 3, 2, 6, 0, 4]],
+    [[1, 0, 1, 5, 1, 3], [2, 6, 0, 2, 3, 2]],
+    [[3, 2, 2, 6, 3, 1], [3, 1, 2, 6, 1, 5], [1, 5, 2, 6, 0, 4]],
+    [[3, 1, 3, 7, 3, 2]],
+    [[0, 2, 0, 4, 0, 1], [3, 7, 2, 3, 1, 3]],
+    [[1, 5, 3, 7, 1, 0], [3, 7, 3, 2, 1, 0]],
+    [[2, 0, 0, 4, 2, 3], [2, 3, 0, 4, 3, 7], [3, 7, 0, 4, 1, 5]],
+    [[2, 0, 3, 1, 2, 6], [3, 1, 3, 7, 2, 6]],
+    [[1, 3, 3, 7, 1, 0], [1, 0, 3, 7, 0, 4], [0, 4, 3, 7, 2, 6]],
+    [[0, 1, 1, 5, 0, 2], [0, 2, 1, 5, 2, 6], [2, 6, 1, 5, 3, 7]],
+    [[0, 4, 1, 5, 3, 7], [0, 4, 3, 7, 2, 6]],
+    [[4, 0, 4, 6, 4, 5]],
+    [[0, 2, 4, 6, 0, 1], [4, 6, 4, 5, 0, 1]],
+    [[1, 5, 1, 3, 1, 0], [4, 6, 5, 4, 0, 4]],
+    [[5, 1, 1, 3, 5, 4], [5, 4, 1, 3, 4, 6], [4, 6, 1, 3, 0, 2]],
+    [[2, 0, 2, 3, 2, 6], [4, 5, 0, 4, 6, 4]],
+    [[6, 4, 4, 5, 6, 2], [6, 2, 4, 5, 2, 3], [2, 3, 4, 5, 0, 1]],
+    [[2, 6, 2, 0, 3, 2], [1, 0, 1, 5, 3, 1], [6, 4, 5, 4, 0, 4]],
+    [[1, 3, 5, 4, 1, 5], [1, 3, 4, 6, 5, 4], [1, 3, 3, 2, 4, 6], [3, 2, 2, 6, 4, 6]],
+    [[3, 1, 3, 7, 3, 2], [6, 4, 5, 4, 0, 4]],
+    [[4, 5, 0, 1, 4, 6], [0, 1, 0, 2, 4, 6], [7, 3, 2, 3, 1, 3]],
+    [[3, 2, 1, 0, 3, 7], [1, 0, 1, 5, 3, 7], [6, 4, 5, 4, 0, 4]],
+    [[3, 7, 3, 2, 1, 5], [3, 2, 6, 4, 1, 5], [1, 5, 6, 4, 5, 4], [3, 2, 2, 0, 6, 4]],
+    [[3, 7, 2, 6, 3, 1], [2, 6, 2, 0, 3, 1], [5, 4, 0, 4, 6, 4]],
+    [[1, 0, 1, 3, 5, 4], [1, 3, 2, 6, 5, 4], [1, 3, 3, 7, 2, 6], [5, 4, 2, 6, 4, 6]],
+    [[0, 1, 1, 5, 0, 2], [0, 2, 1, 5, 2, 6], [2, 6, 1, 5, 3, 7], [4, 5, 0, 4, 4, 6]],
+    [[6, 2, 4, 6, 4, 5], [4, 5, 5, 1, 6, 2], [6, 2, 5, 1, 7, 3]],
+    [[5, 1, 5, 4, 5, 7]],
+    [[0, 1, 0, 2, 0, 4], [5, 7, 1, 5, 4, 5]],
+    [[1, 0, 5, 4, 1, 3], [5, 4, 5, 7, 1, 3]],
+    [[4, 5, 5, 7, 4, 0], [4, 0, 5, 7, 0, 2], [0, 2, 5, 7, 1, 3]],
+    [[2, 0, 2, 3, 2, 6], [7, 5, 1, 5, 4, 5]],
+    [[2, 6, 0, 4, 2, 3], [0, 4, 0, 1, 2, 3], [7, 5, 1, 5, 4, 5]],
+    [[5, 7, 1, 3, 5, 4], [1, 3, 1, 0, 5, 4], [6, 2, 0, 2, 3, 2]],
+    [[3, 1, 3, 2, 7, 5], [3, 2, 0, 4, 7, 5], [3, 2, 2, 6, 0, 4], [7, 5, 0, 4, 5, 4]],
+    [[3, 7, 3, 2, 3, 1], [5, 4, 7, 5, 1, 5]],
+    [[0, 4, 0, 1, 2, 0], [3, 1, 3, 7, 2, 3], [4, 5, 7, 5, 1, 5]],
+    [[7, 3, 3, 2, 7, 5], [7, 5, 3, 2, 5, 4], [5, 4, 3, 2, 1, 0]],
+    [[0, 4, 2, 3, 0, 2], [0, 4, 3, 7, 2, 3], [0, 4, 4, 5, 3, 7], [4, 5, 5, 7, 3, 7]],
+    [[2, 0, 3, 1, 2, 6], [3, 1, 3, 7, 2, 6], [4, 5, 7, 5, 1, 5]],
+    [[1, 3, 3, 7, 1, 0], [1, 0, 3, 7, 0, 4], [0, 4, 3, 7, 2, 6], [5, 7, 1, 5, 5, 4]],
+    [[2, 6, 2, 0, 3, 7], [2, 0, 4, 5, 3, 7], [3, 7, 4, 5, 7, 5], [2, 0, 0, 1, 4, 5]],
+    [[4, 0, 5, 4, 5, 7], [5, 7, 7, 3, 4, 0], [4, 0, 7, 3, 6, 2]],
+    [[4, 6, 5, 7, 4, 0], [5, 7, 5, 1, 4, 0]],
+    [[1, 0, 0, 2, 1, 5], [1, 5, 0, 2, 5, 7], [5, 7, 0, 2, 4, 6]],
+    [[0, 4, 4, 6, 0, 1], [0, 1, 4, 6, 1, 3], [1, 3, 4, 6, 5, 7]],
+    [[0, 2, 4, 6, 5, 7], [0, 2, 5, 7, 1, 3]],
+    [[5, 1, 4, 0, 5, 7], [4, 0, 4, 6, 5, 7], [3, 2, 6, 2, 0, 2]],
+    [[2, 3, 2, 6, 0, 1], [2, 6, 7, 5, 0, 1], [0, 1, 7, 5, 1, 5], [2, 6, 6, 4, 7, 5]],
+    [[0, 4, 4, 6, 0, 1], [0, 1, 4, 6, 1, 3], [1, 3, 4, 6, 5, 7], [2, 6, 0, 2, 2, 3]],
+    [[3, 1, 2, 3, 2, 6], [2, 6, 6, 4, 3, 1], [3, 1, 6, 4, 7, 5]],
+    [[4, 6, 5, 7, 4, 0], [5, 7, 5, 1, 4, 0], [2, 3, 1, 3, 7, 3]],
+    [[1, 0, 0, 2, 1, 5], [1, 5, 0, 2, 5, 7], [5, 7, 0, 2, 4, 6], [3, 2, 1, 3, 3, 7]],
+    [[0, 1, 0, 4, 2, 3], [0, 4, 5, 7, 2, 3], [0, 4, 4, 6, 5, 7], [2, 3, 5, 7, 3, 7]],
+    [[7, 5, 3, 7, 3, 2], [3, 2, 2, 0, 7, 5], [7, 5, 2, 0, 6, 4]],
+    [[0, 4, 4, 6, 5, 7], [0, 4, 5, 7, 1, 5], [0, 2, 1, 3, 3, 7], [3, 7, 2, 6, 0, 2]],
+    [
+        [3, 1, 7, 3, 6, 2],
+        [6, 2, 0, 1, 3, 1],
+        [6, 4, 0, 1, 6, 2],
+        [6, 4, 5, 1, 0, 1],
+        [6, 4, 7, 5, 5, 1],
+    ],
+    [
+        [4, 0, 6, 4, 7, 5],
+        [7, 5, 1, 0, 4, 0],
+        [7, 3, 1, 0, 7, 5],
+        [7, 3, 2, 0, 1, 0],
+        [7, 3, 6, 2, 2, 0],
+    ],
+    [[7, 3, 6, 2, 6, 4], [7, 5, 7, 3, 6, 4]],
+    [[6, 2, 6, 7, 6, 4]],
+    [[0, 4, 0, 1, 0, 2], [6, 7, 4, 6, 2, 6]],
+    [[1, 0, 1, 5, 1, 3], [7, 6, 4, 6, 2, 6]],
+    [[1, 3, 0, 2, 1, 5], [0, 2, 0, 4, 1, 5], [7, 6, 4, 6, 2, 6]],
+    [[2, 3, 6, 7, 2, 0], [6, 7, 6, 4, 2, 0]],
+    [[4, 0, 0, 1, 4, 6], [4, 6, 0, 1, 6, 7], [6, 7, 0, 1, 2, 3]],
+    [[6, 4, 2, 0, 6, 7], [2, 0, 2, 3, 6, 7], [5, 1, 3, 1, 0, 1]],
+    [[1, 5, 1, 3, 0, 4], [1, 3, 7, 6, 0, 4], [0, 4, 7, 6, 4, 6], [1, 3, 3, 2, 7, 6]],
+    [[3, 2, 3, 1, 3, 7], [6, 4, 2, 6, 7, 6]],
+    [[3, 7, 3, 2, 1, 3], [0, 2, 0, 4, 1, 0], [7, 6, 4, 6, 2, 6]],
+    [[1, 5, 3, 7, 1, 0], [3, 7, 3, 2, 1, 0], [4, 6, 2, 6, 7, 6]],
+    [[2, 0, 0, 4, 2, 3], [2, 3, 0, 4, 3, 7], [3, 7, 0, 4, 1, 5], [6, 4, 2, 6, 6, 7]],
+    [[7, 6, 6, 4, 7, 3], [7, 3, 6, 4, 3, 1], [3, 1, 6, 4, 2, 0]],
+    [[0, 1, 4, 6, 0, 4], [0, 1, 6, 7, 4, 6], [0, 1, 1, 3, 6, 7], [1, 3, 3, 7, 6, 7]],
+    [[0, 2, 0, 1, 4, 6], [0, 1, 3, 7, 4, 6], [0, 1, 1, 5, 3, 7], [4, 6, 3, 7, 6, 7]],
+    [[7, 3, 6, 7, 6, 4], [6, 4, 4, 0, 7, 3], [7, 3, 4, 0, 5, 1]],
+    [[4, 0, 6, 2, 4, 5], [6, 2, 6, 7, 4, 5]],
+    [[2, 6, 6, 7, 2, 0], [2, 0, 6, 7, 0, 1], [0, 1, 6, 7, 4, 5]],
+    [[6, 7, 4, 5, 6, 2], [4, 5, 4, 0, 6, 2], [3, 1, 0, 1, 5, 1]],
+    [[2, 0, 2, 6, 3, 1], [2, 6, 4, 5, 3, 1], [2, 6, 6, 7, 4, 5], [3, 1, 4, 5, 1, 5]],
+    [[0, 2, 2, 3, 0, 4], [0, 4, 2, 3, 4, 5], [4, 5, 2, 3, 6, 7]],
+    [[0, 1, 2, 3, 6, 7], [0, 1, 6, 7, 4, 5]],
+    [[0, 2, 2, 3, 0, 4], [0, 4, 2, 3, 4, 5], [4, 5, 2, 3, 6, 7], [1, 3, 0, 1, 1, 5]],
+    [[5, 4, 1, 5, 1, 3], [1, 3, 3, 2, 5, 4], [5, 4, 3, 2, 7, 6]],
+    [[4, 0, 6, 2, 4, 5], [6, 2, 6, 7, 4, 5], [1, 3, 7, 3, 2, 3]],
+    [[2, 6, 6, 7, 2, 0], [2, 0, 6, 7, 0, 1], [0, 1, 6, 7, 4, 5], [3, 7, 2, 3, 3, 1]],
+    [[0, 1, 1, 5, 3, 7], [0, 1, 3, 7, 2, 3], [0, 4, 2, 6, 6, 7], [6, 7, 4, 5, 0, 4]],
+    [
+        [6, 2, 7, 6, 5, 4],
+        [5, 4, 0, 2, 6, 2],
+        [5, 1, 0, 2, 5, 4],
+        [5, 1, 3, 2, 0, 2],
+        [5, 1, 7, 3, 3, 2],
+    ],
+    [[3, 1, 3, 7, 2, 0], [3, 7, 5, 4, 2, 0], [2, 0, 5, 4, 0, 4], [3, 7, 7, 6, 5, 4]],
+    [[1, 0, 3, 1, 3, 7], [3, 7, 7, 6, 1, 0], [1, 0, 7, 6, 5, 4]],
+    [
+        [1, 0, 5, 1, 7, 3],
+        [7, 3, 2, 0, 1, 0],
+        [7, 6, 2, 0, 7, 3],
+        [7, 6, 4, 0, 2, 0],
+        [7, 6, 5, 4, 4, 0],
+    ],
+    [[7, 6, 5, 4, 5, 1], [7, 3, 7, 6, 5, 1]],
+    [[5, 7, 5, 1, 5, 4], [6, 2, 7, 6, 4, 6]],
+    [[0, 2, 0, 4, 1, 0], [5, 4, 5, 7, 1, 5], [2, 6, 7, 6, 4, 6]],
+    [[1, 0, 5, 4, 1, 3], [5, 4, 5, 7, 1, 3], [2, 6, 7, 6, 4, 6]],
+    [[4, 5, 5, 7, 4, 0], [4, 0, 5, 7, 0, 2], [0, 2, 5, 7, 1, 3], [6, 7, 4, 6, 6, 2]],
+    [[2, 3, 6, 7, 2, 0], [6, 7, 6, 4, 2, 0], [1, 5, 4, 5, 7, 5]],
+    [[4, 0, 0, 1, 4, 6], [4, 6, 0, 1, 6, 7], [6, 7, 0, 1, 2, 3], [5, 1, 4, 5, 5, 7]],
+    [[0, 2, 2, 3, 6, 7], [0, 2, 6, 7, 4, 6], [0, 1, 4, 5, 5, 7], [5, 7, 1, 3, 0, 1]],
+    [
+        [5, 4, 7, 5, 3, 1],
+        [3, 1, 0, 4, 5, 4],
+        [3, 2, 0, 4, 3, 1],
+        [3, 2, 6, 4, 0, 4],
+        [3, 2, 7, 6, 6, 4],
+    ],
+    [[5, 4, 5, 7, 1, 5], [3, 7, 3, 2, 1, 3], [4, 6, 2, 6, 7, 6]],
+    [[1, 0, 0, 2, 0, 4], [1, 5, 5, 4, 5, 7], [3, 2, 1, 3, 3, 7], [2, 6, 7, 6, 4, 6]],
+    [[7, 3, 3, 2, 7, 5], [7, 5, 3, 2, 5, 4], [5, 4, 3, 2, 1, 0], [6, 2, 7, 6, 6, 4]],
+    [
+        [0, 4, 2, 3, 0, 2],
+        [0, 4, 3, 7, 2, 3],
+        [0, 4, 4, 5, 3, 7],
+        [4, 5, 5, 7, 3, 7],
+        [6, 7, 4, 6, 2, 6],
+    ],
+    [[7, 6, 6, 4, 7, 3], [7, 3, 6, 4, 3, 1], [3, 1, 6, 4, 2, 0], [5, 4, 7, 5, 5, 1]],
+    [
+        [0, 1, 4, 6, 0, 4],
+        [0, 1, 6, 7, 4, 6],
+        [0, 1, 1, 3, 6, 7],
+        [1, 3, 3, 7, 6, 7],
+        [5, 7, 1, 5, 4, 5],
+    ],
+    [
+        [6, 7, 4, 6, 0, 2],
+        [0, 2, 3, 7, 6, 7],
+        [0, 1, 3, 7, 0, 2],
+        [0, 1, 5, 7, 3, 7],
+        [0, 1, 4, 5, 5, 7],
+    ],
+    [[4, 0, 6, 7, 4, 6], [4, 0, 7, 3, 6, 7], [4, 0, 5, 7, 7, 3], [4, 5, 5, 7, 4, 0]],
+    [[7, 5, 5, 1, 7, 6], [7, 6, 5, 1, 6, 2], [6, 2, 5, 1, 4, 0]],
+    [[0, 2, 1, 5, 0, 1], [0, 2, 5, 7, 1, 5], [0, 2, 2, 6, 5, 7], [2, 6, 6, 7, 5, 7]],
+    [[1, 3, 1, 0, 5, 7], [1, 0, 2, 6, 5, 7], [5, 7, 2, 6, 7, 6], [1, 0, 0, 4, 2, 6]],
+    [[2, 0, 6, 2, 6, 7], [6, 7, 7, 5, 2, 0], [2, 0, 7, 5, 3, 1]],
+    [[0, 4, 0, 2, 1, 5], [0, 2, 6, 7, 1, 5], [0, 2, 2, 3, 6, 7], [1, 5, 6, 7, 5, 7]],
+    [[7, 6, 5, 7, 5, 1], [5, 1, 1, 0, 7, 6], [7, 6, 1, 0, 3, 2]],
+    [
+        [2, 0, 3, 2, 7, 6],
+        [7, 6, 4, 0, 2, 0],
+        [7, 5, 4, 0, 7, 6],
+        [7, 5, 1, 0, 4, 0],
+        [7, 5, 3, 1, 1, 0],
+    ],
+    [[7, 5, 3, 1, 3, 2], [7, 6, 7, 5, 3, 2]],
+    [[7, 5, 5, 1, 7, 6], [7, 6, 5, 1, 6, 2], [6, 2, 5, 1, 4, 0], [3, 1, 7, 3, 3, 2]],
+    [
+        [0, 2, 1, 5, 0, 1],
+        [0, 2, 5, 7, 1, 5],
+        [0, 2, 2, 6, 5, 7],
+        [2, 6, 6, 7, 5, 7],
+        [3, 7, 2, 3, 1, 3],
+    ],
+    [
+        [3, 7, 2, 3, 0, 1],
+        [0, 1, 5, 7, 3, 7],
+        [0, 4, 5, 7, 0, 1],
+        [0, 4, 6, 7, 5, 7],
+        [0, 4, 2, 6, 6, 7],
+    ],
+    [[2, 0, 3, 7, 2, 3], [2, 0, 7, 5, 3, 7], [2, 0, 6, 7, 7, 5], [2, 6, 6, 7, 2, 0]],
+    [
+        [5, 7, 1, 5, 0, 4],
+        [0, 4, 6, 7, 5, 7],
+        [0, 2, 6, 7, 0, 4],
+        [0, 2, 3, 7, 6, 7],
+        [0, 2, 1, 3, 3, 7],
+    ],
+    [[1, 0, 5, 7, 1, 5], [1, 0, 7, 6, 5, 7], [1, 0, 3, 7, 7, 6], [1, 3, 3, 7, 1, 0]],
+    [[0, 2, 0, 1, 0, 4], [3, 7, 6, 7, 5, 7]],
+    [[7, 5, 7, 3, 7, 6]],
+    [[7, 3, 7, 5, 7, 6]],
+    [[0, 1, 0, 2, 0, 4], [6, 7, 3, 7, 5, 7]],
+    [[1, 3, 1, 0, 1, 5], [7, 6, 3, 7, 5, 7]],
+    [[0, 4, 1, 5, 0, 2], [1, 5, 1, 3, 0, 2], [6, 7, 3, 7, 5, 7]],
+    [[2, 6, 2, 0, 2, 3], [7, 5, 6, 7, 3, 7]],
+    [[0, 1, 2, 3, 0, 4], [2, 3, 2, 6, 0, 4], [5, 7, 6, 7, 3, 7]],
+    [[1, 5, 1, 3, 0, 1], [2, 3, 2, 6, 0, 2], [5, 7, 6, 7, 3, 7]],
+    [[3, 2, 2, 6, 3, 1], [3, 1, 2, 6, 1, 5], [1, 5, 2, 6, 0, 4], [7, 6, 3, 7, 7, 5]],
+    [[3, 1, 7, 5, 3, 2], [7, 5, 7, 6, 3, 2]],
+    [[7, 6, 3, 2, 7, 5], [3, 2, 3, 1, 7, 5], [4, 0, 1, 0, 2, 0]],
+    [[5, 7, 7, 6, 5, 1], [5, 1, 7, 6, 1, 0], [1, 0, 7, 6, 3, 2]],
+    [[2, 3, 2, 0, 6, 7], [2, 0, 1, 5, 6, 7], [2, 0, 0, 4, 1, 5], [6, 7, 1, 5, 7, 5]],
+    [[6, 2, 2, 0, 6, 7], [6, 7, 2, 0, 7, 5], [7, 5, 2, 0, 3, 1]],
+    [[0, 4, 0, 1, 2, 6], [0, 1, 5, 7, 2, 6], [2, 6, 5, 7, 6, 7], [0, 1, 1, 3, 5, 7]],
+    [[1, 5, 0, 2, 1, 0], [1, 5, 2, 6, 0, 2], [1, 5, 5, 7, 2, 6], [5, 7, 7, 6, 2, 6]],
+    [[5, 1, 7, 5, 7, 6], [7, 6, 6, 2, 5, 1], [5, 1, 6, 2, 4, 0]],
+    [[4, 5, 4, 0, 4, 6], [7, 3, 5, 7, 6, 7]],
+    [[0, 2, 4, 6, 0, 1], [4, 6, 4, 5, 0, 1], [3, 7, 5, 7, 6, 7]],
+    [[4, 6, 4, 5, 0, 4], [1, 5, 1, 3, 0, 1], [6, 7, 3, 7, 5, 7]],
+    [[5, 1, 1, 3, 5, 4], [5, 4, 1, 3, 4, 6], [4, 6, 1, 3, 0, 2], [7, 3, 5, 7, 7, 6]],
+    [[2, 3, 2, 6, 0, 2], [4, 6, 4, 5, 0, 4], [3, 7, 5, 7, 6, 7]],
+    [[6, 4, 4, 5, 6, 2], [6, 2, 4, 5, 2, 3], [2, 3, 4, 5, 0, 1], [7, 5, 6, 7, 7, 3]],
+    [[0, 1, 1, 5, 1, 3], [0, 2, 2, 3, 2, 6], [4, 5, 0, 4, 4, 6], [5, 7, 6, 7, 3, 7]],
+    [
+        [1, 3, 5, 4, 1, 5],
+        [1, 3, 4, 6, 5, 4],
+        [1, 3, 3, 2, 4, 6],
+        [3, 2, 2, 6, 4, 6],
+        [7, 6, 3, 7, 5, 7],
+    ],
+    [[3, 1, 7, 5, 3, 2], [7, 5, 7, 6, 3, 2], [0, 4, 6, 4, 5, 4]],
+    [[1, 0, 0, 2, 4, 6], [1, 0, 4, 6, 5, 4], [1, 3, 5, 7, 7, 6], [7, 6, 3, 2, 1, 3]],
+    [[5, 7, 7, 6, 5, 1], [5, 1, 7, 6, 1, 0], [1, 0, 7, 6, 3, 2], [4, 6, 5, 4, 4, 0]],
+    [
+        [7, 5, 6, 7, 2, 3],
+        [2, 3, 1, 5, 7, 5],
+        [2, 0, 1, 5, 2, 3],
+        [2, 0, 4, 5, 1, 5],
+        [2, 0, 6, 4, 4, 5],
+    ],
+    [[6, 2, 2, 0, 6, 7], [6, 7, 2, 0, 7, 5], [7, 5, 2, 0, 3, 1], [4, 0, 6, 4, 4, 5]],
+    [
+        [4, 6, 5, 4, 1, 0],
+        [1, 0, 2, 6, 4, 6],
+        [1, 3, 2, 6, 1, 0],
+        [1, 3, 7, 6, 2, 6],
+        [1, 3, 5, 7, 7, 6],
+    ],
+    [
+        [1, 5, 0, 2, 1, 0],
+        [1, 5, 2, 6, 0, 2],
+        [1, 5, 5, 7, 2, 6],
+        [5, 7, 7, 6, 2, 6],
+        [4, 6, 5, 4, 0, 4],
+    ],
+    [[5, 1, 4, 6, 5, 4], [5, 1, 6, 2, 4, 6], [5, 1, 7, 6, 6, 2], [5, 7, 7, 6, 5, 1]],
+    [[5, 4, 7, 6, 5, 1], [7, 6, 7, 3, 5, 1]],
+    [[7, 3, 5, 1, 7, 6], [5, 1, 5, 4, 7, 6], [2, 0, 4, 0, 1, 0]],
+    [[3, 1, 1, 0, 3, 7], [3, 7, 1, 0, 7, 6], [7, 6, 1, 0, 5, 4]],
+    [[0, 2, 0, 4, 1, 3], [0, 4, 6, 7, 1, 3], [1, 3, 6, 7, 3, 7], [0, 4, 4, 5, 6, 7]],
+    [[5, 4, 7, 6, 5, 1], [7, 6, 7, 3, 5, 1], [0, 2, 3, 2, 6, 2]],
+    [[1, 5, 5, 4, 7, 6], [1, 5, 7, 6, 3, 7], [1, 0, 3, 2, 2, 6], [2, 6, 0, 4, 1, 0]],
+    [[3, 1, 1, 0, 3, 7], [3, 7, 1, 0, 7, 6], [7, 6, 1, 0, 5, 4], [2, 0, 3, 2, 2, 6]],
+    [
+        [2, 3, 6, 2, 4, 0],
+        [4, 0, 1, 3, 2, 3],
+        [4, 5, 1, 3, 4, 0],
+        [4, 5, 7, 3, 1, 3],
+        [4, 5, 6, 7, 7, 3],
+    ],
+    [[1, 5, 5, 4, 1, 3], [1, 3, 5, 4, 3, 2], [3, 2, 5, 4, 7, 6]],
+    [[1, 5, 5, 4, 1, 3], [1, 3, 5, 4, 3, 2], [3, 2, 5, 4, 7, 6], [0, 4, 1, 0, 0, 2]],
+    [[1, 0, 5, 4, 7, 6], [1, 0, 7, 6, 3, 2]],
+    [[2, 3, 0, 2, 0, 4], [0, 4, 4, 5, 2, 3], [2, 3, 4, 5, 6, 7]],
+    [[1, 3, 1, 5, 0, 2], [1, 5, 7, 6, 0, 2], [1, 5, 5, 4, 7, 6], [0, 2, 7, 6, 2, 6]],
+    [
+        [5, 1, 4, 5, 6, 7],
+        [6, 7, 3, 1, 5, 1],
+        [6, 2, 3, 1, 6, 7],
+        [6, 2, 0, 1, 3, 1],
+        [6, 2, 4, 0, 0, 1],
+    ],
+    [[6, 7, 2, 6, 2, 0], [2, 0, 0, 1, 6, 7], [6, 7, 0, 1, 4, 5]],
+    [[6, 2, 4, 0, 4, 5], [6, 7, 6, 2, 4, 5]],
+    [[6, 7, 7, 3, 6, 4], [6, 4, 7, 3, 4, 0], [4, 0, 7, 3, 5, 1]],
+    [[1, 5, 1, 0, 3, 7], [1, 0, 4, 6, 3, 7], [1, 0, 0, 2, 4, 6], [3, 7, 4, 6, 7, 6]],
+    [[1, 0, 3, 7, 1, 3], [1, 0, 7, 6, 3, 7], [1, 0, 0, 4, 7, 6], [0, 4, 4, 6, 7, 6]],
+    [[6, 4, 7, 6, 7, 3], [7, 3, 3, 1, 6, 4], [6, 4, 3, 1, 2, 0]],
+    [[6, 7, 7, 3, 6, 4], [6, 4, 7, 3, 4, 0], [4, 0, 7, 3, 5, 1], [2, 3, 6, 2, 2, 0]],
+    [
+        [7, 6, 3, 7, 1, 5],
+        [1, 5, 4, 6, 7, 6],
+        [1, 0, 4, 6, 1, 5],
+        [1, 0, 2, 6, 4, 6],
+        [1, 0, 3, 2, 2, 6],
+    ],
+    [
+        [1, 0, 3, 7, 1, 3],
+        [1, 0, 7, 6, 3, 7],
+        [1, 0, 0, 4, 7, 6],
+        [0, 4, 4, 6, 7, 6],
+        [2, 6, 0, 2, 3, 2],
+    ],
+    [[3, 1, 7, 6, 3, 7], [3, 1, 6, 4, 7, 6], [3, 1, 2, 6, 6, 4], [3, 2, 2, 6, 3, 1]],
+    [[3, 2, 3, 1, 7, 6], [3, 1, 0, 4, 7, 6], [7, 6, 0, 4, 6, 4], [3, 1, 1, 5, 0, 4]],
+    [
+        [0, 1, 2, 0, 6, 4],
+        [6, 4, 5, 1, 0, 1],
+        [6, 7, 5, 1, 6, 4],
+        [6, 7, 3, 1, 5, 1],
+        [6, 7, 2, 3, 3, 1],
+    ],
+    [[0, 1, 4, 0, 4, 6], [4, 6, 6, 7, 0, 1], [0, 1, 6, 7, 2, 3]],
+    [[6, 7, 2, 3, 2, 0], [6, 4, 6, 7, 2, 0]],
+    [
+        [2, 6, 0, 2, 1, 3],
+        [1, 3, 7, 6, 2, 6],
+        [1, 5, 7, 6, 1, 3],
+        [1, 5, 4, 6, 7, 6],
+        [1, 5, 0, 4, 4, 6],
+    ],
+    [[1, 5, 1, 0, 1, 3], [4, 6, 7, 6, 2, 6]],
+    [[0, 1, 2, 6, 0, 2], [0, 1, 6, 7, 2, 6], [0, 1, 4, 6, 6, 7], [0, 4, 4, 6, 0, 1]],
+    [[6, 7, 6, 2, 6, 4]],
+    [[6, 2, 7, 3, 6, 4], [7, 3, 7, 5, 6, 4]],
+    [[7, 5, 6, 4, 7, 3], [6, 4, 6, 2, 7, 3], [1, 0, 2, 0, 4, 0]],
+    [[6, 2, 7, 3, 6, 4], [7, 3, 7, 5, 6, 4], [0, 1, 5, 1, 3, 1]],
+    [[2, 0, 0, 4, 1, 5], [2, 0, 1, 5, 3, 1], [2, 6, 3, 7, 7, 5], [7, 5, 6, 4, 2, 6]],
+    [[3, 7, 7, 5, 3, 2], [3, 2, 7, 5, 2, 0], [2, 0, 7, 5, 6, 4]],
+    [[3, 2, 3, 7, 1, 0], [3, 7, 6, 4, 1, 0], [3, 7, 7, 5, 6, 4], [1, 0, 6, 4, 0, 4]],
+    [[3, 7, 7, 5, 3, 2], [3, 2, 7, 5, 2, 0], [2, 0, 7, 5, 6, 4], [1, 5, 3, 1, 1, 0]],
+    [
+        [7, 3, 5, 7, 4, 6],
+        [4, 6, 2, 3, 7, 3],
+        [4, 0, 2, 3, 4, 6],
+        [4, 0, 1, 3, 2, 3],
+        [4, 0, 5, 1, 1, 3],
+    ],
+    [[2, 3, 3, 1, 2, 6], [2, 6, 3, 1, 6, 4], [6, 4, 3, 1, 7, 5]],
+    [[2, 3, 3, 1, 2, 6], [2, 6, 3, 1, 6, 4], [6, 4, 3, 1, 7, 5], [0, 1, 2, 0, 0, 4]],
+    [[1, 0, 1, 5, 3, 2], [1, 5, 4, 6, 3, 2], [3, 2, 4, 6, 2, 6], [1, 5, 5, 7, 4, 6]],
+    [
+        [0, 2, 4, 0, 5, 1],
+        [5, 1, 3, 2, 0, 2],
+        [5, 7, 3, 2, 5, 1],
+        [5, 7, 6, 2, 3, 2],
+        [5, 7, 4, 6, 6, 2],
+    ],
+    [[2, 0, 3, 1, 7, 5], [2, 0, 7, 5, 6, 4]],
+    [[4, 6, 0, 4, 0, 1], [0, 1, 1, 3, 4, 6], [4, 6, 1, 3, 5, 7]],
+    [[0, 2, 1, 0, 1, 5], [1, 5, 5, 7, 0, 2], [0, 2, 5, 7, 4, 6]],
+    [[5, 7, 4, 6, 4, 0], [5, 1, 5, 7, 4, 0]],
+    [[5, 4, 4, 0, 5, 7], [5, 7, 4, 0, 7, 3], [7, 3, 4, 0, 6, 2]],
+    [[0, 1, 0, 2, 4, 5], [0, 2, 3, 7, 4, 5], [4, 5, 3, 7, 5, 7], [0, 2, 2, 6, 3, 7]],
+    [[5, 4, 4, 0, 5, 7], [5, 7, 4, 0, 7, 3], [7, 3, 4, 0, 6, 2], [1, 0, 5, 1, 1, 3]],
+    [
+        [1, 5, 3, 1, 2, 0],
+        [2, 0, 4, 5, 1, 5],
+        [2, 6, 4, 5, 2, 0],
+        [2, 6, 7, 5, 4, 5],
+        [2, 6, 3, 7, 7, 5],
+    ],
+    [[2, 3, 0, 4, 2, 0], [2, 3, 4, 5, 0, 4], [2, 3, 3, 7, 4, 5], [3, 7, 7, 5, 4, 5]],
+    [[3, 2, 7, 3, 7, 5], [7, 5, 5, 4, 3, 2], [3, 2, 5, 4, 1, 0]],
+    [
+        [2, 3, 0, 4, 2, 0],
+        [2, 3, 4, 5, 0, 4],
+        [2, 3, 3, 7, 4, 5],
+        [3, 7, 7, 5, 4, 5],
+        [1, 5, 3, 1, 0, 1],
+    ],
+    [[3, 2, 1, 5, 3, 1], [3, 2, 5, 4, 1, 5], [3, 2, 7, 5, 5, 4], [3, 7, 7, 5, 3, 2]],
+    [[2, 6, 2, 3, 0, 4], [2, 3, 7, 5, 0, 4], [2, 3, 3, 1, 7, 5], [0, 4, 7, 5, 4, 5]],
+    [
+        [3, 2, 1, 3, 5, 7],
+        [5, 7, 6, 2, 3, 2],
+        [5, 4, 6, 2, 5, 7],
+        [5, 4, 0, 2, 6, 2],
+        [5, 4, 1, 0, 0, 2],
+    ],
+    [
+        [4, 5, 0, 4, 2, 6],
+        [2, 6, 7, 5, 4, 5],
+        [2, 3, 7, 5, 2, 6],
+        [2, 3, 1, 5, 7, 5],
+        [2, 3, 0, 1, 1, 5],
+    ],
+    [[2, 3, 2, 0, 2, 6], [1, 5, 7, 5, 4, 5]],
+    [[5, 7, 4, 5, 4, 0], [4, 0, 0, 2, 5, 7], [5, 7, 0, 2, 1, 3]],
+    [[5, 4, 1, 0, 1, 3], [5, 7, 5, 4, 1, 3]],
+    [[0, 2, 4, 5, 0, 4], [0, 2, 5, 7, 4, 5], [0, 2, 1, 5, 5, 7], [0, 1, 1, 5, 0, 2]],
+    [[5, 4, 5, 1, 5, 7]],
+    [[4, 6, 6, 2, 4, 5], [4, 5, 6, 2, 5, 1], [5, 1, 6, 2, 7, 3]],
+    [[4, 6, 6, 2, 4, 5], [4, 5, 6, 2, 5, 1], [5, 1, 6, 2, 7, 3], [0, 2, 4, 0, 0, 1]],
+    [[3, 7, 3, 1, 2, 6], [3, 1, 5, 4, 2, 6], [3, 1, 1, 0, 5, 4], [2, 6, 5, 4, 6, 4]],
+    [
+        [6, 4, 2, 6, 3, 7],
+        [3, 7, 5, 4, 6, 4],
+        [3, 1, 5, 4, 3, 7],
+        [3, 1, 0, 4, 5, 4],
+        [3, 1, 2, 0, 0, 4],
+    ],
+    [[2, 0, 2, 3, 6, 4], [2, 3, 1, 5, 6, 4], [6, 4, 1, 5, 4, 5], [2, 3, 3, 7, 1, 5]],
+    [
+        [0, 4, 1, 0, 3, 2],
+        [3, 2, 6, 4, 0, 4],
+        [3, 7, 6, 4, 3, 2],
+        [3, 7, 5, 4, 6, 4],
+        [3, 7, 1, 5, 5, 4],
+    ],
+    [
+        [1, 3, 0, 1, 4, 5],
+        [4, 5, 7, 3, 1, 3],
+        [4, 6, 7, 3, 4, 5],
+        [4, 6, 2, 3, 7, 3],
+        [4, 6, 0, 2, 2, 3],
+    ],
+    [[3, 7, 3, 1, 3, 2], [5, 4, 6, 4, 0, 4]],
+    [[3, 1, 2, 6, 3, 2], [3, 1, 6, 4, 2, 6], [3, 1, 1, 5, 6, 4], [1, 5, 5, 4, 6, 4]],
+    [
+        [3, 1, 2, 6, 3, 2],
+        [3, 1, 6, 4, 2, 6],
+        [3, 1, 1, 5, 6, 4],
+        [1, 5, 5, 4, 6, 4],
+        [0, 4, 1, 0, 2, 0],
+    ],
+    [[4, 5, 6, 4, 6, 2], [6, 2, 2, 3, 4, 5], [4, 5, 2, 3, 0, 1]],
+    [[2, 3, 6, 4, 2, 6], [2, 3, 4, 5, 6, 4], [2, 3, 0, 4, 4, 5], [2, 0, 0, 4, 2, 3]],
+    [[1, 3, 5, 1, 5, 4], [5, 4, 4, 6, 1, 3], [1, 3, 4, 6, 0, 2]],
+    [[1, 3, 0, 4, 1, 0], [1, 3, 4, 6, 0, 4], [1, 3, 5, 4, 4, 6], [1, 5, 5, 4, 1, 3]],
+    [[4, 6, 0, 2, 0, 1], [4, 5, 4, 6, 0, 1]],
+    [[4, 6, 4, 0, 4, 5]],
+    [[4, 0, 6, 2, 7, 3], [4, 0, 7, 3, 5, 1]],
+    [[1, 5, 0, 1, 0, 2], [0, 2, 2, 6, 1, 5], [1, 5, 2, 6, 3, 7]],
+    [[3, 7, 1, 3, 1, 0], [1, 0, 0, 4, 3, 7], [3, 7, 0, 4, 2, 6]],
+    [[3, 1, 2, 0, 2, 6], [3, 7, 3, 1, 2, 6]],
+    [[0, 4, 2, 0, 2, 3], [2, 3, 3, 7, 0, 4], [0, 4, 3, 7, 1, 5]],
+    [[3, 7, 1, 5, 1, 0], [3, 2, 3, 7, 1, 0]],
+    [[0, 4, 1, 3, 0, 1], [0, 4, 3, 7, 1, 3], [0, 4, 2, 3, 3, 7], [0, 2, 2, 3, 0, 4]],
+    [[3, 7, 3, 1, 3, 2]],
+    [[2, 6, 3, 2, 3, 1], [3, 1, 1, 5, 2, 6], [2, 6, 1, 5, 0, 4]],
+    [[1, 5, 3, 2, 1, 3], [1, 5, 2, 6, 3, 2], [1, 5, 0, 2, 2, 6], [1, 0, 0, 2, 1, 5]],
+    [[2, 3, 0, 1, 0, 4], [2, 6, 2, 3, 0, 4]],
+    [[2, 3, 2, 0, 2, 6]],
+    [[1, 5, 0, 4, 0, 2], [1, 3, 1, 5, 0, 2]],
+    [[1, 5, 1, 0, 1, 3]],
+    [[0, 2, 0, 1, 0, 4]],
+    [],
+]
+
+
+def create_mc_lookup_table():
+    cases = torch.zeros(256, 5, 3, dtype=torch.long)
+    masks = torch.zeros(256, 5, dtype=torch.bool)
+
+    edge_to_index = {
+        (0, 1): 0,
+        (2, 3): 1,
+        (4, 5): 2,
+        (6, 7): 3,
+        (0, 2): 4,
+        (1, 3): 5,
+        (4, 6): 6,
+        (5, 7): 7,
+        (0, 4): 8,
+        (1, 5): 9,
+        (2, 6): 10,
+        (3, 7): 11,
+    }
+
+    for i, case in enumerate(MC_TABLE):
+        for j, tri in enumerate(case):
+            for k, (c1, c2) in enumerate(zip(tri[::2], tri[1::2])):
+                cases[i, j, k] = edge_to_index[(c1, c2) if c1 < c2 else (c2, c1)]
+            masks[i, j] = True
+    return cases, masks
+
+
+RENDERER_CONFIG = {}
+
+
+def renderer_model_from_original_config():
+    model = ShapERenderer(**RENDERER_CONFIG)
+
+    return model
+
+
+RENDERER_MLP_ORIGINAL_PREFIX = "renderer.nerstf"
+
+RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX = "encoder.params_proj"
+
+
+def renderer_model_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+    diffusers_checkpoint.update(
+        {f"mlp.{k}": checkpoint[f"{RENDERER_MLP_ORIGINAL_PREFIX}.{k}"] for k in model.mlp.state_dict().keys()}
+    )
+
+    diffusers_checkpoint.update(
+        {
+            f"params_proj.{k}": checkpoint[f"{RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"]
+            for k in model.params_proj.state_dict().keys()
+        }
+    )
+
+    diffusers_checkpoint.update({"void.background": model.state_dict()["void.background"]})
+
+    cases, masks = create_mc_lookup_table()
+
+    diffusers_checkpoint.update({"mesh_decoder.cases": cases})
+    diffusers_checkpoint.update({"mesh_decoder.masks": masks})
+
+    return diffusers_checkpoint
+
+
+# done renderer
+
+
+# TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?)
+def split_attentions(*, weight, bias, split, chunk_size):
+    weights = [None] * split
+    biases = [None] * split
+
+    weights_biases_idx = 0
+
+    for starting_row_index in range(0, weight.shape[0], chunk_size):
+        row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size)
+
+        weight_rows = weight[row_indices, :]
+        bias_rows = bias[row_indices]
+
+        if weights[weights_biases_idx] is None:
+            assert weights[weights_biases_idx] is None
+            weights[weights_biases_idx] = weight_rows
+            biases[weights_biases_idx] = bias_rows
+        else:
+            assert weights[weights_biases_idx] is not None
+            weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows])
+            biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows])
+
+        weights_biases_idx = (weights_biases_idx + 1) % split
+
+    return weights, biases
+
+
+# done unet utils
+
+
+# Driver functions
+
+
+def prior(*, args, checkpoint_map_location):
+    print("loading prior")
+
+    prior_checkpoint = torch.load(args.prior_checkpoint_path, map_location=checkpoint_map_location)
+
+    prior_model = prior_model_from_original_config()
+
+    prior_diffusers_checkpoint = prior_original_checkpoint_to_diffusers_checkpoint(prior_model, prior_checkpoint)
+
+    del prior_checkpoint
+
+    load_prior_checkpoint_to_model(prior_diffusers_checkpoint, prior_model)
+
+    print("done loading prior")
+
+    return prior_model
+
+
+def prior_image(*, args, checkpoint_map_location):
+    print("loading prior_image")
+
+    print(f"load checkpoint from {args.prior_image_checkpoint_path}")
+    prior_checkpoint = torch.load(args.prior_image_checkpoint_path, map_location=checkpoint_map_location)
+
+    prior_model = prior_image_model_from_original_config()
+
+    prior_diffusers_checkpoint = prior_image_original_checkpoint_to_diffusers_checkpoint(prior_model, prior_checkpoint)
+
+    del prior_checkpoint
+
+    load_prior_checkpoint_to_model(prior_diffusers_checkpoint, prior_model)
+
+    print("done loading prior_image")
+
+    return prior_model
+
+
+def renderer(*, args, checkpoint_map_location):
+    print(" loading renderer")
+
+    renderer_checkpoint = torch.load(args.transmitter_checkpoint_path, map_location=checkpoint_map_location)
+
+    renderer_model = renderer_model_from_original_config()
+
+    renderer_diffusers_checkpoint = renderer_model_original_checkpoint_to_diffusers_checkpoint(
+        renderer_model, renderer_checkpoint
+    )
+
+    del renderer_checkpoint
+
+    load_checkpoint_to_model(renderer_diffusers_checkpoint, renderer_model, strict=True)
+
+    print("done loading renderer")
+
+    return renderer_model
+
+
+# prior model will expect clip_mean and clip_std, whic are missing from the state_dict
+PRIOR_EXPECTED_MISSING_KEYS = ["clip_mean", "clip_std"]
+
+
+def load_prior_checkpoint_to_model(checkpoint, model):
+    with tempfile.NamedTemporaryFile() as file:
+        torch.save(checkpoint, file.name)
+        del checkpoint
+        missing_keys, unexpected_keys = model.load_state_dict(torch.load(file.name), strict=False)
+        missing_keys = list(set(missing_keys) - set(PRIOR_EXPECTED_MISSING_KEYS))
+
+        if len(unexpected_keys) > 0:
+            raise ValueError(f"Unexpected keys when loading prior model: {unexpected_keys}")
+        if len(missing_keys) > 0:
+            raise ValueError(f"Missing keys when loading prior model: {missing_keys}")
+
+
+def load_checkpoint_to_model(checkpoint, model, strict=False):
+    with tempfile.NamedTemporaryFile() as file:
+        torch.save(checkpoint, file.name)
+        del checkpoint
+        if strict:
+            model.load_state_dict(torch.load(file.name), strict=True)
+        else:
+            load_checkpoint_and_dispatch(model, file.name, device_map="auto")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    parser.add_argument(
+        "--prior_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to the prior checkpoint to convert.",
+    )
+
+    parser.add_argument(
+        "--prior_image_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to the prior_image checkpoint to convert.",
+    )
+
+    parser.add_argument(
+        "--transmitter_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to the transmitter checkpoint to convert.",
+    )
+
+    parser.add_argument(
+        "--checkpoint_load_device",
+        default="cpu",
+        type=str,
+        required=False,
+        help="The device passed to `map_location` when loading checkpoints.",
+    )
+
+    parser.add_argument(
+        "--debug",
+        default=None,
+        type=str,
+        required=False,
+        help="Only run a specific stage of the convert script. Used for debugging",
+    )
+
+    args = parser.parse_args()
+
+    print(f"loading checkpoints to {args.checkpoint_load_device}")
+
+    checkpoint_map_location = torch.device(args.checkpoint_load_device)
+
+    if args.debug is not None:
+        print(f"debug: only executing {args.debug}")
+
+    if args.debug is None:
+        print("YiYi TO-DO")
+    elif args.debug == "prior":
+        prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location)
+        prior_model.save_pretrained(args.dump_path)
+    elif args.debug == "prior_image":
+        prior_model = prior_image(args=args, checkpoint_map_location=checkpoint_map_location)
+        prior_model.save_pretrained(args.dump_path)
+    elif args.debug == "renderer":
+        renderer_model = renderer(args=args, checkpoint_map_location=checkpoint_map_location)
+        renderer_model.save_pretrained(args.dump_path)
+    else:
+        raise ValueError(f"unknown debug value : {args.debug}")
diff --git a/diffusers/scripts/convert_stable_audio.py b/diffusers/scripts/convert_stable_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0f9d0f87d90a1845bd732fc2df9eee6b4b88809
--- /dev/null
+++ b/diffusers/scripts/convert_stable_audio.py
@@ -0,0 +1,279 @@
+# Run this script to convert the Stable Cascade model weights to a diffusers pipeline.
+import argparse
+import json
+import os
+from contextlib import nullcontext
+
+import torch
+from safetensors.torch import load_file
+from transformers import (
+    AutoTokenizer,
+    T5EncoderModel,
+)
+
+from diffusers import (
+    AutoencoderOobleck,
+    CosineDPMSolverMultistepScheduler,
+    StableAudioDiTModel,
+    StableAudioPipeline,
+    StableAudioProjectionModel,
+)
+from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.utils import is_accelerate_available
+
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+
+
+def convert_stable_audio_state_dict_to_diffusers(state_dict, num_autoencoder_layers=5):
+    projection_model_state_dict = {
+        k.replace("conditioner.conditioners.", "").replace("embedder.embedding", "time_positional_embedding"): v
+        for (k, v) in state_dict.items()
+        if "conditioner.conditioners" in k
+    }
+
+    # NOTE: we assume here that there's no projection layer from the text encoder to the latent space, script should be adapted a bit if there is.
+    for key, value in list(projection_model_state_dict.items()):
+        new_key = key.replace("seconds_start", "start_number_conditioner").replace(
+            "seconds_total", "end_number_conditioner"
+        )
+        projection_model_state_dict[new_key] = projection_model_state_dict.pop(key)
+
+    model_state_dict = {k.replace("model.model.", ""): v for (k, v) in state_dict.items() if "model.model." in k}
+    for key, value in list(model_state_dict.items()):
+        # attention layers
+        new_key = (
+            key.replace("transformer.", "")
+            .replace("layers", "transformer_blocks")
+            .replace("self_attn", "attn1")
+            .replace("cross_attn", "attn2")
+            .replace("ff.ff", "ff.net")
+        )
+        new_key = (
+            new_key.replace("pre_norm", "norm1")
+            .replace("cross_attend_norm", "norm2")
+            .replace("ff_norm", "norm3")
+            .replace("to_out", "to_out.0")
+        )
+        new_key = new_key.replace("gamma", "weight").replace("beta", "bias")  # replace layernorm
+
+        # other layers
+        new_key = (
+            new_key.replace("project", "proj")
+            .replace("to_timestep_embed", "timestep_proj")
+            .replace("timestep_features", "time_proj")
+            .replace("to_global_embed", "global_proj")
+            .replace("to_cond_embed", "cross_attention_proj")
+        )
+
+        # we're using diffusers implementation of time_proj (GaussianFourierProjection) which creates a 1D tensor
+        if new_key == "time_proj.weight":
+            model_state_dict[key] = model_state_dict[key].squeeze(1)
+
+        if "to_qkv" in new_key:
+            q, k, v = torch.chunk(model_state_dict.pop(key), 3, dim=0)
+            model_state_dict[new_key.replace("qkv", "q")] = q
+            model_state_dict[new_key.replace("qkv", "k")] = k
+            model_state_dict[new_key.replace("qkv", "v")] = v
+        elif "to_kv" in new_key:
+            k, v = torch.chunk(model_state_dict.pop(key), 2, dim=0)
+            model_state_dict[new_key.replace("kv", "k")] = k
+            model_state_dict[new_key.replace("kv", "v")] = v
+        else:
+            model_state_dict[new_key] = model_state_dict.pop(key)
+
+    autoencoder_state_dict = {
+        k.replace("pretransform.model.", "").replace("coder.layers.0", "coder.conv1"): v
+        for (k, v) in state_dict.items()
+        if "pretransform.model." in k
+    }
+
+    for key, _ in list(autoencoder_state_dict.items()):
+        new_key = key
+        if "coder.layers" in new_key:
+            # get idx of the layer
+            idx = int(new_key.split("coder.layers.")[1].split(".")[0])
+
+            new_key = new_key.replace(f"coder.layers.{idx}", f"coder.block.{idx-1}")
+
+            if "encoder" in new_key:
+                for i in range(3):
+                    new_key = new_key.replace(f"block.{idx-1}.layers.{i}", f"block.{idx-1}.res_unit{i+1}")
+                new_key = new_key.replace(f"block.{idx-1}.layers.3", f"block.{idx-1}.snake1")
+                new_key = new_key.replace(f"block.{idx-1}.layers.4", f"block.{idx-1}.conv1")
+            else:
+                for i in range(2, 5):
+                    new_key = new_key.replace(f"block.{idx-1}.layers.{i}", f"block.{idx-1}.res_unit{i-1}")
+                new_key = new_key.replace(f"block.{idx-1}.layers.0", f"block.{idx-1}.snake1")
+                new_key = new_key.replace(f"block.{idx-1}.layers.1", f"block.{idx-1}.conv_t1")
+
+            new_key = new_key.replace("layers.0.beta", "snake1.beta")
+            new_key = new_key.replace("layers.0.alpha", "snake1.alpha")
+            new_key = new_key.replace("layers.2.beta", "snake2.beta")
+            new_key = new_key.replace("layers.2.alpha", "snake2.alpha")
+            new_key = new_key.replace("layers.1.bias", "conv1.bias")
+            new_key = new_key.replace("layers.1.weight_", "conv1.weight_")
+            new_key = new_key.replace("layers.3.bias", "conv2.bias")
+            new_key = new_key.replace("layers.3.weight_", "conv2.weight_")
+
+            if idx == num_autoencoder_layers + 1:
+                new_key = new_key.replace(f"block.{idx-1}", "snake1")
+            elif idx == num_autoencoder_layers + 2:
+                new_key = new_key.replace(f"block.{idx-1}", "conv2")
+
+        else:
+            new_key = new_key
+
+        value = autoencoder_state_dict.pop(key)
+        if "snake" in new_key:
+            value = value.unsqueeze(0).unsqueeze(-1)
+        if new_key in autoencoder_state_dict:
+            raise ValueError(f"{new_key} already in state dict.")
+        autoencoder_state_dict[new_key] = value
+
+    return model_state_dict, projection_model_state_dict, autoencoder_state_dict
+
+
+parser = argparse.ArgumentParser(description="Convert Stable Audio 1.0 model weights to a diffusers pipeline")
+parser.add_argument("--model_folder_path", type=str, help="Location of Stable Audio weights and config")
+parser.add_argument("--use_safetensors", action="store_true", help="Use SafeTensors for conversion")
+parser.add_argument(
+    "--save_directory",
+    type=str,
+    default="./tmp/stable-audio-1.0",
+    help="Directory to save a pipeline to. Will be created if it doesn't exist.",
+)
+parser.add_argument(
+    "--repo_id",
+    type=str,
+    default="stable-audio-1.0",
+    help="Hub organization to save the pipelines to",
+)
+parser.add_argument("--push_to_hub", action="store_true", help="Push to hub")
+parser.add_argument("--variant", type=str, help="Set to bf16 to save bfloat16 weights")
+
+args = parser.parse_args()
+
+checkpoint_path = (
+    os.path.join(args.model_folder_path, "model.safetensors")
+    if args.use_safetensors
+    else os.path.join(args.model_folder_path, "model.ckpt")
+)
+config_path = os.path.join(args.model_folder_path, "model_config.json")
+
+device = "cpu"
+if args.variant == "bf16":
+    dtype = torch.bfloat16
+else:
+    dtype = torch.float32
+
+with open(config_path) as f_in:
+    config_dict = json.load(f_in)
+
+conditioning_dict = {
+    conditioning["id"]: conditioning["config"] for conditioning in config_dict["model"]["conditioning"]["configs"]
+}
+
+t5_model_config = conditioning_dict["prompt"]
+
+# T5 Text encoder
+text_encoder = T5EncoderModel.from_pretrained(t5_model_config["t5_model_name"])
+tokenizer = AutoTokenizer.from_pretrained(
+    t5_model_config["t5_model_name"], truncation=True, model_max_length=t5_model_config["max_length"]
+)
+
+
+# scheduler
+scheduler = CosineDPMSolverMultistepScheduler(
+    sigma_min=0.3,
+    sigma_max=500,
+    solver_order=2,
+    prediction_type="v_prediction",
+    sigma_data=1.0,
+    sigma_schedule="exponential",
+)
+ctx = init_empty_weights if is_accelerate_available() else nullcontext
+
+
+if args.use_safetensors:
+    orig_state_dict = load_file(checkpoint_path, device=device)
+else:
+    orig_state_dict = torch.load(checkpoint_path, map_location=device)
+
+
+model_config = config_dict["model"]["diffusion"]["config"]
+
+model_state_dict, projection_model_state_dict, autoencoder_state_dict = convert_stable_audio_state_dict_to_diffusers(
+    orig_state_dict
+)
+
+
+with ctx():
+    projection_model = StableAudioProjectionModel(
+        text_encoder_dim=text_encoder.config.d_model,
+        conditioning_dim=config_dict["model"]["conditioning"]["cond_dim"],
+        min_value=conditioning_dict["seconds_start"][
+            "min_val"
+        ],  # assume `seconds_start` and `seconds_total` have the same min / max values.
+        max_value=conditioning_dict["seconds_start"][
+            "max_val"
+        ],  # assume `seconds_start` and `seconds_total` have the same min / max values.
+    )
+if is_accelerate_available():
+    load_model_dict_into_meta(projection_model, projection_model_state_dict)
+else:
+    projection_model.load_state_dict(projection_model_state_dict)
+
+attention_head_dim = model_config["embed_dim"] // model_config["num_heads"]
+with ctx():
+    model = StableAudioDiTModel(
+        sample_size=int(config_dict["sample_size"])
+        / int(config_dict["model"]["pretransform"]["config"]["downsampling_ratio"]),
+        in_channels=model_config["io_channels"],
+        num_layers=model_config["depth"],
+        attention_head_dim=attention_head_dim,
+        num_key_value_attention_heads=model_config["cond_token_dim"] // attention_head_dim,
+        num_attention_heads=model_config["num_heads"],
+        out_channels=model_config["io_channels"],
+        cross_attention_dim=model_config["cond_token_dim"],
+        time_proj_dim=256,
+        global_states_input_dim=model_config["global_cond_dim"],
+        cross_attention_input_dim=model_config["cond_token_dim"],
+    )
+if is_accelerate_available():
+    load_model_dict_into_meta(model, model_state_dict)
+else:
+    model.load_state_dict(model_state_dict)
+
+
+autoencoder_config = config_dict["model"]["pretransform"]["config"]
+with ctx():
+    autoencoder = AutoencoderOobleck(
+        encoder_hidden_size=autoencoder_config["encoder"]["config"]["channels"],
+        downsampling_ratios=autoencoder_config["encoder"]["config"]["strides"],
+        decoder_channels=autoencoder_config["decoder"]["config"]["channels"],
+        decoder_input_channels=autoencoder_config["decoder"]["config"]["latent_dim"],
+        audio_channels=autoencoder_config["io_channels"],
+        channel_multiples=autoencoder_config["encoder"]["config"]["c_mults"],
+        sampling_rate=config_dict["sample_rate"],
+    )
+
+if is_accelerate_available():
+    load_model_dict_into_meta(autoencoder, autoencoder_state_dict)
+else:
+    autoencoder.load_state_dict(autoencoder_state_dict)
+
+
+# Prior pipeline
+pipeline = StableAudioPipeline(
+    transformer=model,
+    tokenizer=tokenizer,
+    text_encoder=text_encoder,
+    scheduler=scheduler,
+    vae=autoencoder,
+    projection_model=projection_model,
+)
+pipeline.to(dtype).save_pretrained(
+    args.save_directory, repo_id=args.repo_id, push_to_hub=args.push_to_hub, variant=args.variant
+)
diff --git a/diffusers/scripts/convert_stable_cascade.py b/diffusers/scripts/convert_stable_cascade.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce10970b0b6ae33dcd83922c22779ba0a19a5517
--- /dev/null
+++ b/diffusers/scripts/convert_stable_cascade.py
@@ -0,0 +1,218 @@
+# Run this script to convert the Stable Cascade model weights to a diffusers pipeline.
+import argparse
+from contextlib import nullcontext
+
+import torch
+from safetensors.torch import load_file
+from transformers import (
+    AutoTokenizer,
+    CLIPConfig,
+    CLIPImageProcessor,
+    CLIPTextModelWithProjection,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    DDPMWuerstchenScheduler,
+    StableCascadeCombinedPipeline,
+    StableCascadeDecoderPipeline,
+    StableCascadePriorPipeline,
+)
+from diffusers.loaders.single_file_utils import convert_stable_cascade_unet_single_file_to_diffusers
+from diffusers.models import StableCascadeUNet
+from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.pipelines.wuerstchen import PaellaVQModel
+from diffusers.utils import is_accelerate_available
+
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+
+parser = argparse.ArgumentParser(description="Convert Stable Cascade model weights to a diffusers pipeline")
+parser.add_argument("--model_path", type=str, help="Location of Stable Cascade weights")
+parser.add_argument("--stage_c_name", type=str, default="stage_c.safetensors", help="Name of stage c checkpoint file")
+parser.add_argument("--stage_b_name", type=str, default="stage_b.safetensors", help="Name of stage b checkpoint file")
+parser.add_argument("--skip_stage_c", action="store_true", help="Skip converting stage c")
+parser.add_argument("--skip_stage_b", action="store_true", help="Skip converting stage b")
+parser.add_argument("--use_safetensors", action="store_true", help="Use SafeTensors for conversion")
+parser.add_argument(
+    "--prior_output_path", default="stable-cascade-prior", type=str, help="Hub organization to save the pipelines to"
+)
+parser.add_argument(
+    "--decoder_output_path",
+    type=str,
+    default="stable-cascade-decoder",
+    help="Hub organization to save the pipelines to",
+)
+parser.add_argument(
+    "--combined_output_path",
+    type=str,
+    default="stable-cascade-combined",
+    help="Hub organization to save the pipelines to",
+)
+parser.add_argument("--save_combined", action="store_true")
+parser.add_argument("--push_to_hub", action="store_true", help="Push to hub")
+parser.add_argument("--variant", type=str, help="Set to bf16 to save bfloat16 weights")
+
+args = parser.parse_args()
+
+if args.skip_stage_b and args.skip_stage_c:
+    raise ValueError("At least one stage should be converted")
+if (args.skip_stage_b or args.skip_stage_c) and args.save_combined:
+    raise ValueError("Cannot skip stages when creating a combined pipeline")
+
+model_path = args.model_path
+
+device = "cpu"
+if args.variant == "bf16":
+    dtype = torch.bfloat16
+else:
+    dtype = torch.float32
+
+# set paths to model weights
+prior_checkpoint_path = f"{model_path}/{args.stage_c_name}"
+decoder_checkpoint_path = f"{model_path}/{args.stage_b_name}"
+
+# Clip Text encoder and tokenizer
+config = CLIPConfig.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+config.text_config.projection_dim = config.projection_dim
+text_encoder = CLIPTextModelWithProjection.from_pretrained(
+    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", config=config.text_config
+)
+tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+
+# image processor
+feature_extractor = CLIPImageProcessor()
+image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+
+# scheduler for prior and decoder
+scheduler = DDPMWuerstchenScheduler()
+ctx = init_empty_weights if is_accelerate_available() else nullcontext
+
+if not args.skip_stage_c:
+    # Prior
+    if args.use_safetensors:
+        prior_orig_state_dict = load_file(prior_checkpoint_path, device=device)
+    else:
+        prior_orig_state_dict = torch.load(prior_checkpoint_path, map_location=device)
+
+    prior_state_dict = convert_stable_cascade_unet_single_file_to_diffusers(prior_orig_state_dict)
+
+    with ctx():
+        prior_model = StableCascadeUNet(
+            in_channels=16,
+            out_channels=16,
+            timestep_ratio_embedding_dim=64,
+            patch_size=1,
+            conditioning_dim=2048,
+            block_out_channels=[2048, 2048],
+            num_attention_heads=[32, 32],
+            down_num_layers_per_block=[8, 24],
+            up_num_layers_per_block=[24, 8],
+            down_blocks_repeat_mappers=[1, 1],
+            up_blocks_repeat_mappers=[1, 1],
+            block_types_per_layer=[
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+            ],
+            clip_text_in_channels=1280,
+            clip_text_pooled_in_channels=1280,
+            clip_image_in_channels=768,
+            clip_seq=4,
+            kernel_size=3,
+            dropout=[0.1, 0.1],
+            self_attn=True,
+            timestep_conditioning_type=["sca", "crp"],
+            switch_level=[False],
+        )
+    if is_accelerate_available():
+        load_model_dict_into_meta(prior_model, prior_state_dict)
+    else:
+        prior_model.load_state_dict(prior_state_dict)
+
+    # Prior pipeline
+    prior_pipeline = StableCascadePriorPipeline(
+        prior=prior_model,
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        image_encoder=image_encoder,
+        scheduler=scheduler,
+        feature_extractor=feature_extractor,
+    )
+    prior_pipeline.to(dtype).save_pretrained(
+        args.prior_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
+
+if not args.skip_stage_b:
+    # Decoder
+    if args.use_safetensors:
+        decoder_orig_state_dict = load_file(decoder_checkpoint_path, device=device)
+    else:
+        decoder_orig_state_dict = torch.load(decoder_checkpoint_path, map_location=device)
+
+    decoder_state_dict = convert_stable_cascade_unet_single_file_to_diffusers(decoder_orig_state_dict)
+    with ctx():
+        decoder = StableCascadeUNet(
+            in_channels=4,
+            out_channels=4,
+            timestep_ratio_embedding_dim=64,
+            patch_size=2,
+            conditioning_dim=1280,
+            block_out_channels=[320, 640, 1280, 1280],
+            down_num_layers_per_block=[2, 6, 28, 6],
+            up_num_layers_per_block=[6, 28, 6, 2],
+            down_blocks_repeat_mappers=[1, 1, 1, 1],
+            up_blocks_repeat_mappers=[3, 3, 2, 2],
+            num_attention_heads=[0, 0, 20, 20],
+            block_types_per_layer=[
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+            ],
+            clip_text_pooled_in_channels=1280,
+            clip_seq=4,
+            effnet_in_channels=16,
+            pixel_mapper_in_channels=3,
+            kernel_size=3,
+            dropout=[0, 0, 0.1, 0.1],
+            self_attn=True,
+            timestep_conditioning_type=["sca"],
+        )
+
+    if is_accelerate_available():
+        load_model_dict_into_meta(decoder, decoder_state_dict)
+    else:
+        decoder.load_state_dict(decoder_state_dict)
+
+    # VQGAN from Wuerstchen-V2
+    vqmodel = PaellaVQModel.from_pretrained("warp-ai/wuerstchen", subfolder="vqgan")
+
+    # Decoder pipeline
+    decoder_pipeline = StableCascadeDecoderPipeline(
+        decoder=decoder, text_encoder=text_encoder, tokenizer=tokenizer, vqgan=vqmodel, scheduler=scheduler
+    )
+    decoder_pipeline.to(dtype).save_pretrained(
+        args.decoder_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
+
+if args.save_combined:
+    # Stable Cascade combined pipeline
+    stable_cascade_pipeline = StableCascadeCombinedPipeline(
+        # Decoder
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        decoder=decoder,
+        scheduler=scheduler,
+        vqgan=vqmodel,
+        # Prior
+        prior_text_encoder=text_encoder,
+        prior_tokenizer=tokenizer,
+        prior_prior=prior_model,
+        prior_scheduler=scheduler,
+        prior_image_encoder=image_encoder,
+        prior_feature_extractor=feature_extractor,
+    )
+    stable_cascade_pipeline.to(dtype).save_pretrained(
+        args.combined_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
diff --git a/diffusers/scripts/convert_stable_cascade_lite.py b/diffusers/scripts/convert_stable_cascade_lite.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddccaa3b2e8a600dde57ab1270b5a4ac0ce3923b
--- /dev/null
+++ b/diffusers/scripts/convert_stable_cascade_lite.py
@@ -0,0 +1,226 @@
+# Run this script to convert the Stable Cascade model weights to a diffusers pipeline.
+import argparse
+from contextlib import nullcontext
+
+import torch
+from safetensors.torch import load_file
+from transformers import (
+    AutoTokenizer,
+    CLIPConfig,
+    CLIPImageProcessor,
+    CLIPTextModelWithProjection,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    DDPMWuerstchenScheduler,
+    StableCascadeCombinedPipeline,
+    StableCascadeDecoderPipeline,
+    StableCascadePriorPipeline,
+)
+from diffusers.loaders.single_file_utils import convert_stable_cascade_unet_single_file_to_diffusers
+from diffusers.models import StableCascadeUNet
+from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.pipelines.wuerstchen import PaellaVQModel
+from diffusers.utils import is_accelerate_available
+
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+
+parser = argparse.ArgumentParser(description="Convert Stable Cascade model weights to a diffusers pipeline")
+parser.add_argument("--model_path", type=str, help="Location of Stable Cascade weights")
+parser.add_argument(
+    "--stage_c_name", type=str, default="stage_c_lite.safetensors", help="Name of stage c checkpoint file"
+)
+parser.add_argument(
+    "--stage_b_name", type=str, default="stage_b_lite.safetensors", help="Name of stage b checkpoint file"
+)
+parser.add_argument("--skip_stage_c", action="store_true", help="Skip converting stage c")
+parser.add_argument("--skip_stage_b", action="store_true", help="Skip converting stage b")
+parser.add_argument("--use_safetensors", action="store_true", help="Use SafeTensors for conversion")
+parser.add_argument(
+    "--prior_output_path",
+    default="stable-cascade-prior-lite",
+    type=str,
+    help="Hub organization to save the pipelines to",
+)
+parser.add_argument(
+    "--decoder_output_path",
+    type=str,
+    default="stable-cascade-decoder-lite",
+    help="Hub organization to save the pipelines to",
+)
+parser.add_argument(
+    "--combined_output_path",
+    type=str,
+    default="stable-cascade-combined-lite",
+    help="Hub organization to save the pipelines to",
+)
+parser.add_argument("--save_combined", action="store_true")
+parser.add_argument("--push_to_hub", action="store_true", help="Push to hub")
+parser.add_argument("--variant", type=str, help="Set to bf16 to save bfloat16 weights")
+
+args = parser.parse_args()
+
+if args.skip_stage_b and args.skip_stage_c:
+    raise ValueError("At least one stage should be converted")
+if (args.skip_stage_b or args.skip_stage_c) and args.save_combined:
+    raise ValueError("Cannot skip stages when creating a combined pipeline")
+
+model_path = args.model_path
+
+device = "cpu"
+if args.variant == "bf16":
+    dtype = torch.bfloat16
+else:
+    dtype = torch.float32
+
+# set paths to model weights
+prior_checkpoint_path = f"{model_path}/{args.stage_c_name}"
+decoder_checkpoint_path = f"{model_path}/{args.stage_b_name}"
+
+# Clip Text encoder and tokenizer
+config = CLIPConfig.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+config.text_config.projection_dim = config.projection_dim
+text_encoder = CLIPTextModelWithProjection.from_pretrained(
+    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", config=config.text_config
+)
+tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+
+# image processor
+feature_extractor = CLIPImageProcessor()
+image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+# scheduler for prior and decoder
+scheduler = DDPMWuerstchenScheduler()
+
+ctx = init_empty_weights if is_accelerate_available() else nullcontext
+
+if not args.skip_stage_c:
+    # Prior
+    if args.use_safetensors:
+        prior_orig_state_dict = load_file(prior_checkpoint_path, device=device)
+    else:
+        prior_orig_state_dict = torch.load(prior_checkpoint_path, map_location=device)
+
+    prior_state_dict = convert_stable_cascade_unet_single_file_to_diffusers(prior_orig_state_dict)
+    with ctx():
+        prior_model = StableCascadeUNet(
+            in_channels=16,
+            out_channels=16,
+            timestep_ratio_embedding_dim=64,
+            patch_size=1,
+            conditioning_dim=1536,
+            block_out_channels=[1536, 1536],
+            num_attention_heads=[24, 24],
+            down_num_layers_per_block=[4, 12],
+            up_num_layers_per_block=[12, 4],
+            down_blocks_repeat_mappers=[1, 1],
+            up_blocks_repeat_mappers=[1, 1],
+            block_types_per_layer=[
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+            ],
+            clip_text_in_channels=1280,
+            clip_text_pooled_in_channels=1280,
+            clip_image_in_channels=768,
+            clip_seq=4,
+            kernel_size=3,
+            dropout=[0.1, 0.1],
+            self_attn=True,
+            timestep_conditioning_type=["sca", "crp"],
+            switch_level=[False],
+        )
+
+    if is_accelerate_available():
+        load_model_dict_into_meta(prior_model, prior_state_dict)
+    else:
+        prior_model.load_state_dict(prior_state_dict)
+
+    # Prior pipeline
+    prior_pipeline = StableCascadePriorPipeline(
+        prior=prior_model,
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        image_encoder=image_encoder,
+        scheduler=scheduler,
+        feature_extractor=feature_extractor,
+    )
+    prior_pipeline.to(dtype).save_pretrained(
+        args.prior_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
+
+if not args.skip_stage_b:
+    # Decoder
+    if args.use_safetensors:
+        decoder_orig_state_dict = load_file(decoder_checkpoint_path, device=device)
+    else:
+        decoder_orig_state_dict = torch.load(decoder_checkpoint_path, map_location=device)
+
+    decoder_state_dict = convert_stable_cascade_unet_single_file_to_diffusers(decoder_orig_state_dict)
+
+    with ctx():
+        decoder = StableCascadeUNet(
+            in_channels=4,
+            out_channels=4,
+            timestep_ratio_embedding_dim=64,
+            patch_size=2,
+            conditioning_dim=1280,
+            block_out_channels=[320, 576, 1152, 1152],
+            down_num_layers_per_block=[2, 4, 14, 4],
+            up_num_layers_per_block=[4, 14, 4, 2],
+            down_blocks_repeat_mappers=[1, 1, 1, 1],
+            up_blocks_repeat_mappers=[2, 2, 2, 2],
+            num_attention_heads=[0, 9, 18, 18],
+            block_types_per_layer=[
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+            ],
+            clip_text_pooled_in_channels=1280,
+            clip_seq=4,
+            effnet_in_channels=16,
+            pixel_mapper_in_channels=3,
+            kernel_size=3,
+            dropout=[0, 0, 0.1, 0.1],
+            self_attn=True,
+            timestep_conditioning_type=["sca"],
+        )
+
+    if is_accelerate_available():
+        load_model_dict_into_meta(decoder, decoder_state_dict)
+    else:
+        decoder.load_state_dict(decoder_state_dict)
+
+    # VQGAN from Wuerstchen-V2
+    vqmodel = PaellaVQModel.from_pretrained("warp-ai/wuerstchen", subfolder="vqgan")
+
+    # Decoder pipeline
+    decoder_pipeline = StableCascadeDecoderPipeline(
+        decoder=decoder, text_encoder=text_encoder, tokenizer=tokenizer, vqgan=vqmodel, scheduler=scheduler
+    )
+    decoder_pipeline.to(dtype).save_pretrained(
+        args.decoder_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
+
+if args.save_combined:
+    # Stable Cascade combined pipeline
+    stable_cascade_pipeline = StableCascadeCombinedPipeline(
+        # Decoder
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        decoder=decoder,
+        scheduler=scheduler,
+        vqgan=vqmodel,
+        # Prior
+        prior_text_encoder=text_encoder,
+        prior_tokenizer=tokenizer,
+        prior_prior=prior_model,
+        prior_scheduler=scheduler,
+        prior_image_encoder=image_encoder,
+        prior_feature_extractor=feature_extractor,
+    )
+    stable_cascade_pipeline.to(dtype).save_pretrained(
+        args.combined_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
diff --git a/diffusers/scripts/convert_stable_diffusion_checkpoint_to_onnx.py b/diffusers/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..96546b62b1d68b548b558f1556e677926b70b4c0
--- /dev/null
+++ b/diffusers/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
@@ -0,0 +1,265 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import shutil
+from pathlib import Path
+
+import onnx
+import torch
+from packaging import version
+from torch.onnx import export
+
+from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline, StableDiffusionPipeline
+
+
+is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11")
+
+
+def onnx_export(
+    model,
+    model_args: tuple,
+    output_path: Path,
+    ordered_input_names,
+    output_names,
+    dynamic_axes,
+    opset,
+    use_external_data_format=False,
+):
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
+    # so we check the torch version for backwards compatibility
+    if is_torch_less_than_1_11:
+        export(
+            model,
+            model_args,
+            f=output_path.as_posix(),
+            input_names=ordered_input_names,
+            output_names=output_names,
+            dynamic_axes=dynamic_axes,
+            do_constant_folding=True,
+            use_external_data_format=use_external_data_format,
+            enable_onnx_checker=True,
+            opset_version=opset,
+        )
+    else:
+        export(
+            model,
+            model_args,
+            f=output_path.as_posix(),
+            input_names=ordered_input_names,
+            output_names=output_names,
+            dynamic_axes=dynamic_axes,
+            do_constant_folding=True,
+            opset_version=opset,
+        )
+
+
+@torch.no_grad()
+def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = False):
+    dtype = torch.float16 if fp16 else torch.float32
+    if fp16 and torch.cuda.is_available():
+        device = "cuda"
+    elif fp16 and not torch.cuda.is_available():
+        raise ValueError("`float16` model export is only supported on GPUs with CUDA")
+    else:
+        device = "cpu"
+    pipeline = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=dtype).to(device)
+    output_path = Path(output_path)
+
+    # TEXT ENCODER
+    num_tokens = pipeline.text_encoder.config.max_position_embeddings
+    text_hidden_size = pipeline.text_encoder.config.hidden_size
+    text_input = pipeline.tokenizer(
+        "A sample prompt",
+        padding="max_length",
+        max_length=pipeline.tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    onnx_export(
+        pipeline.text_encoder,
+        # casting to torch.int32 until the CLIP fix is released: https://github.com/huggingface/transformers/pull/18515/files
+        model_args=(text_input.input_ids.to(device=device, dtype=torch.int32)),
+        output_path=output_path / "text_encoder" / "model.onnx",
+        ordered_input_names=["input_ids"],
+        output_names=["last_hidden_state", "pooler_output"],
+        dynamic_axes={
+            "input_ids": {0: "batch", 1: "sequence"},
+        },
+        opset=opset,
+    )
+    del pipeline.text_encoder
+
+    # UNET
+    unet_in_channels = pipeline.unet.config.in_channels
+    unet_sample_size = pipeline.unet.config.sample_size
+    unet_path = output_path / "unet" / "model.onnx"
+    onnx_export(
+        pipeline.unet,
+        model_args=(
+            torch.randn(2, unet_in_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
+            torch.randn(2).to(device=device, dtype=dtype),
+            torch.randn(2, num_tokens, text_hidden_size).to(device=device, dtype=dtype),
+            False,
+        ),
+        output_path=unet_path,
+        ordered_input_names=["sample", "timestep", "encoder_hidden_states", "return_dict"],
+        output_names=["out_sample"],  # has to be different from "sample" for correct tracing
+        dynamic_axes={
+            "sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+            "timestep": {0: "batch"},
+            "encoder_hidden_states": {0: "batch", 1: "sequence"},
+        },
+        opset=opset,
+        use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
+    )
+    unet_model_path = str(unet_path.absolute().as_posix())
+    unet_dir = os.path.dirname(unet_model_path)
+    unet = onnx.load(unet_model_path)
+    # clean up existing tensor files
+    shutil.rmtree(unet_dir)
+    os.mkdir(unet_dir)
+    # collate external tensor files into one
+    onnx.save_model(
+        unet,
+        unet_model_path,
+        save_as_external_data=True,
+        all_tensors_to_one_file=True,
+        location="weights.pb",
+        convert_attribute=False,
+    )
+    del pipeline.unet
+
+    # VAE ENCODER
+    vae_encoder = pipeline.vae
+    vae_in_channels = vae_encoder.config.in_channels
+    vae_sample_size = vae_encoder.config.sample_size
+    # need to get the raw tensor output (sample) from the encoder
+    vae_encoder.forward = lambda sample, return_dict: vae_encoder.encode(sample, return_dict)[0].sample()
+    onnx_export(
+        vae_encoder,
+        model_args=(
+            torch.randn(1, vae_in_channels, vae_sample_size, vae_sample_size).to(device=device, dtype=dtype),
+            False,
+        ),
+        output_path=output_path / "vae_encoder" / "model.onnx",
+        ordered_input_names=["sample", "return_dict"],
+        output_names=["latent_sample"],
+        dynamic_axes={
+            "sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+        },
+        opset=opset,
+    )
+
+    # VAE DECODER
+    vae_decoder = pipeline.vae
+    vae_latent_channels = vae_decoder.config.latent_channels
+    vae_out_channels = vae_decoder.config.out_channels
+    # forward only through the decoder part
+    vae_decoder.forward = vae_encoder.decode
+    onnx_export(
+        vae_decoder,
+        model_args=(
+            torch.randn(1, vae_latent_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
+            False,
+        ),
+        output_path=output_path / "vae_decoder" / "model.onnx",
+        ordered_input_names=["latent_sample", "return_dict"],
+        output_names=["sample"],
+        dynamic_axes={
+            "latent_sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+        },
+        opset=opset,
+    )
+    del pipeline.vae
+
+    # SAFETY CHECKER
+    if pipeline.safety_checker is not None:
+        safety_checker = pipeline.safety_checker
+        clip_num_channels = safety_checker.config.vision_config.num_channels
+        clip_image_size = safety_checker.config.vision_config.image_size
+        safety_checker.forward = safety_checker.forward_onnx
+        onnx_export(
+            pipeline.safety_checker,
+            model_args=(
+                torch.randn(
+                    1,
+                    clip_num_channels,
+                    clip_image_size,
+                    clip_image_size,
+                ).to(device=device, dtype=dtype),
+                torch.randn(1, vae_sample_size, vae_sample_size, vae_out_channels).to(device=device, dtype=dtype),
+            ),
+            output_path=output_path / "safety_checker" / "model.onnx",
+            ordered_input_names=["clip_input", "images"],
+            output_names=["out_images", "has_nsfw_concepts"],
+            dynamic_axes={
+                "clip_input": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+                "images": {0: "batch", 1: "height", 2: "width", 3: "channels"},
+            },
+            opset=opset,
+        )
+        del pipeline.safety_checker
+        safety_checker = OnnxRuntimeModel.from_pretrained(output_path / "safety_checker")
+        feature_extractor = pipeline.feature_extractor
+    else:
+        safety_checker = None
+        feature_extractor = None
+
+    onnx_pipeline = OnnxStableDiffusionPipeline(
+        vae_encoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_encoder"),
+        vae_decoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_decoder"),
+        text_encoder=OnnxRuntimeModel.from_pretrained(output_path / "text_encoder"),
+        tokenizer=pipeline.tokenizer,
+        unet=OnnxRuntimeModel.from_pretrained(output_path / "unet"),
+        scheduler=pipeline.scheduler,
+        safety_checker=safety_checker,
+        feature_extractor=feature_extractor,
+        requires_safety_checker=safety_checker is not None,
+    )
+
+    onnx_pipeline.save_pretrained(output_path)
+    print("ONNX pipeline saved to", output_path)
+
+    del pipeline
+    del onnx_pipeline
+    _ = OnnxStableDiffusionPipeline.from_pretrained(output_path, provider="CPUExecutionProvider")
+    print("ONNX pipeline is loadable")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        required=True,
+        help="Path to the `diffusers` checkpoint to convert (either a local directory or on the Hub).",
+    )
+
+    parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
+
+    parser.add_argument(
+        "--opset",
+        default=14,
+        type=int,
+        help="The version of the ONNX operator set to use.",
+    )
+    parser.add_argument("--fp16", action="store_true", default=False, help="Export the models in `float16` mode")
+
+    args = parser.parse_args()
+
+    convert_models(args.model_path, args.output_path, args.opset, args.fp16)
diff --git a/diffusers/scripts/convert_stable_diffusion_controlnet_to_onnx.py b/diffusers/scripts/convert_stable_diffusion_controlnet_to_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..4af39b28783681c9e6626cd2c003f2b8635224a5
--- /dev/null
+++ b/diffusers/scripts/convert_stable_diffusion_controlnet_to_onnx.py
@@ -0,0 +1,505 @@
+import argparse
+import os
+import shutil
+from pathlib import Path
+
+import onnx
+import onnx_graphsurgeon as gs
+import torch
+from onnx import shape_inference
+from packaging import version
+from polygraphy.backend.onnx.loader import fold_constants
+from torch.onnx import export
+
+from diffusers import (
+    ControlNetModel,
+    StableDiffusionControlNetImg2ImgPipeline,
+)
+from diffusers.models.attention_processor import AttnProcessor
+from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl import StableDiffusionXLControlNetPipeline
+
+
+is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11")
+is_torch_2_0_1 = version.parse(version.parse(torch.__version__).base_version) == version.parse("2.0.1")
+
+
+class Optimizer:
+    def __init__(self, onnx_graph, verbose=False):
+        self.graph = gs.import_onnx(onnx_graph)
+        self.verbose = verbose
+
+    def info(self, prefix):
+        if self.verbose:
+            print(
+                f"{prefix} .. {len(self.graph.nodes)} nodes, {len(self.graph.tensors().keys())} tensors, {len(self.graph.inputs)} inputs, {len(self.graph.outputs)} outputs"
+            )
+
+    def cleanup(self, return_onnx=False):
+        self.graph.cleanup().toposort()
+        if return_onnx:
+            return gs.export_onnx(self.graph)
+
+    def select_outputs(self, keep, names=None):
+        self.graph.outputs = [self.graph.outputs[o] for o in keep]
+        if names:
+            for i, name in enumerate(names):
+                self.graph.outputs[i].name = name
+
+    def fold_constants(self, return_onnx=False):
+        onnx_graph = fold_constants(gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=True)
+        self.graph = gs.import_onnx(onnx_graph)
+        if return_onnx:
+            return onnx_graph
+
+    def infer_shapes(self, return_onnx=False):
+        onnx_graph = gs.export_onnx(self.graph)
+        if onnx_graph.ByteSize() > 2147483648:
+            raise TypeError("ERROR: model size exceeds supported 2GB limit")
+        else:
+            onnx_graph = shape_inference.infer_shapes(onnx_graph)
+
+        self.graph = gs.import_onnx(onnx_graph)
+        if return_onnx:
+            return onnx_graph
+
+
+def optimize(onnx_graph, name, verbose):
+    opt = Optimizer(onnx_graph, verbose=verbose)
+    opt.info(name + ": original")
+    opt.cleanup()
+    opt.info(name + ": cleanup")
+    opt.fold_constants()
+    opt.info(name + ": fold constants")
+    # opt.infer_shapes()
+    # opt.info(name + ': shape inference')
+    onnx_opt_graph = opt.cleanup(return_onnx=True)
+    opt.info(name + ": finished")
+    return onnx_opt_graph
+
+
+class UNet2DConditionControlNetModel(torch.nn.Module):
+    def __init__(
+        self,
+        unet,
+        controlnets: ControlNetModel,
+    ):
+        super().__init__()
+        self.unet = unet
+        self.controlnets = controlnets
+
+    def forward(
+        self,
+        sample,
+        timestep,
+        encoder_hidden_states,
+        controlnet_conds,
+        controlnet_scales,
+    ):
+        for i, (controlnet_cond, conditioning_scale, controlnet) in enumerate(
+            zip(controlnet_conds, controlnet_scales, self.controlnets)
+        ):
+            down_samples, mid_sample = controlnet(
+                sample,
+                timestep,
+                encoder_hidden_states=encoder_hidden_states,
+                controlnet_cond=controlnet_cond,
+                conditioning_scale=conditioning_scale,
+                return_dict=False,
+            )
+
+            # merge samples
+            if i == 0:
+                down_block_res_samples, mid_block_res_sample = down_samples, mid_sample
+            else:
+                down_block_res_samples = [
+                    samples_prev + samples_curr
+                    for samples_prev, samples_curr in zip(down_block_res_samples, down_samples)
+                ]
+                mid_block_res_sample += mid_sample
+
+        noise_pred = self.unet(
+            sample,
+            timestep,
+            encoder_hidden_states=encoder_hidden_states,
+            down_block_additional_residuals=down_block_res_samples,
+            mid_block_additional_residual=mid_block_res_sample,
+            return_dict=False,
+        )[0]
+        return noise_pred
+
+
+class UNet2DConditionXLControlNetModel(torch.nn.Module):
+    def __init__(
+        self,
+        unet,
+        controlnets: ControlNetModel,
+    ):
+        super().__init__()
+        self.unet = unet
+        self.controlnets = controlnets
+
+    def forward(
+        self,
+        sample,
+        timestep,
+        encoder_hidden_states,
+        controlnet_conds,
+        controlnet_scales,
+        text_embeds,
+        time_ids,
+    ):
+        added_cond_kwargs = {"text_embeds": text_embeds, "time_ids": time_ids}
+        for i, (controlnet_cond, conditioning_scale, controlnet) in enumerate(
+            zip(controlnet_conds, controlnet_scales, self.controlnets)
+        ):
+            down_samples, mid_sample = controlnet(
+                sample,
+                timestep,
+                encoder_hidden_states=encoder_hidden_states,
+                controlnet_cond=controlnet_cond,
+                conditioning_scale=conditioning_scale,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )
+
+            # merge samples
+            if i == 0:
+                down_block_res_samples, mid_block_res_sample = down_samples, mid_sample
+            else:
+                down_block_res_samples = [
+                    samples_prev + samples_curr
+                    for samples_prev, samples_curr in zip(down_block_res_samples, down_samples)
+                ]
+                mid_block_res_sample += mid_sample
+
+        noise_pred = self.unet(
+            sample,
+            timestep,
+            encoder_hidden_states=encoder_hidden_states,
+            down_block_additional_residuals=down_block_res_samples,
+            mid_block_additional_residual=mid_block_res_sample,
+            added_cond_kwargs=added_cond_kwargs,
+            return_dict=False,
+        )[0]
+        return noise_pred
+
+
+def onnx_export(
+    model,
+    model_args: tuple,
+    output_path: Path,
+    ordered_input_names,
+    output_names,
+    dynamic_axes,
+    opset,
+    use_external_data_format=False,
+):
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
+    # so we check the torch version for backwards compatibility
+    with torch.inference_mode(), torch.autocast("cuda"):
+        if is_torch_less_than_1_11:
+            export(
+                model,
+                model_args,
+                f=output_path.as_posix(),
+                input_names=ordered_input_names,
+                output_names=output_names,
+                dynamic_axes=dynamic_axes,
+                do_constant_folding=True,
+                use_external_data_format=use_external_data_format,
+                enable_onnx_checker=True,
+                opset_version=opset,
+            )
+        else:
+            export(
+                model,
+                model_args,
+                f=output_path.as_posix(),
+                input_names=ordered_input_names,
+                output_names=output_names,
+                dynamic_axes=dynamic_axes,
+                do_constant_folding=True,
+                opset_version=opset,
+            )
+
+
+@torch.no_grad()
+def convert_models(
+    model_path: str, controlnet_path: list, output_path: str, opset: int, fp16: bool = False, sd_xl: bool = False
+):
+    """
+    Function to convert models in stable diffusion controlnet pipeline into ONNX format
+
+    Example:
+    python convert_stable_diffusion_controlnet_to_onnx.py
+    --model_path danbrown/RevAnimated-v1-2-2
+    --controlnet_path lllyasviel/control_v11f1e_sd15_tile ioclab/brightness-controlnet
+    --output_path path-to-models-stable_diffusion/RevAnimated-v1-2-2
+    --fp16
+
+    Example for SD XL:
+    python convert_stable_diffusion_controlnet_to_onnx.py
+    --model_path stabilityai/stable-diffusion-xl-base-1.0
+    --controlnet_path SargeZT/sdxl-controlnet-seg
+    --output_path path-to-models-stable_diffusion/stable-diffusion-xl-base-1.0
+    --fp16
+    --sd_xl
+
+    Returns:
+        create 4 onnx models in output path
+        text_encoder/model.onnx
+        unet/model.onnx + unet/weights.pb
+        vae_encoder/model.onnx
+        vae_decoder/model.onnx
+
+        run test script in diffusers/examples/community
+        python test_onnx_controlnet.py
+        --sd_model danbrown/RevAnimated-v1-2-2
+        --onnx_model_dir path-to-models-stable_diffusion/RevAnimated-v1-2-2
+        --qr_img_path path-to-qr-code-image
+    """
+    dtype = torch.float16 if fp16 else torch.float32
+    if fp16 and torch.cuda.is_available():
+        device = "cuda"
+    elif fp16 and not torch.cuda.is_available():
+        raise ValueError("`float16` model export is only supported on GPUs with CUDA")
+    else:
+        device = "cpu"
+
+    # init controlnet
+    controlnets = []
+    for path in controlnet_path:
+        controlnet = ControlNetModel.from_pretrained(path, torch_dtype=dtype).to(device)
+        if is_torch_2_0_1:
+            controlnet.set_attn_processor(AttnProcessor())
+        controlnets.append(controlnet)
+
+    if sd_xl:
+        if len(controlnets) == 1:
+            controlnet = controlnets[0]
+        else:
+            raise ValueError("MultiControlNet is not yet supported.")
+        pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
+            model_path, controlnet=controlnet, torch_dtype=dtype, variant="fp16", use_safetensors=True
+        ).to(device)
+    else:
+        pipeline = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+            model_path, controlnet=controlnets, torch_dtype=dtype
+        ).to(device)
+
+    output_path = Path(output_path)
+    if is_torch_2_0_1:
+        pipeline.unet.set_attn_processor(AttnProcessor())
+        pipeline.vae.set_attn_processor(AttnProcessor())
+
+    # # TEXT ENCODER
+    num_tokens = pipeline.text_encoder.config.max_position_embeddings
+    text_hidden_size = pipeline.text_encoder.config.hidden_size
+    text_input = pipeline.tokenizer(
+        "A sample prompt",
+        padding="max_length",
+        max_length=pipeline.tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    onnx_export(
+        pipeline.text_encoder,
+        # casting to torch.int32 until the CLIP fix is released: https://github.com/huggingface/transformers/pull/18515/files
+        model_args=(text_input.input_ids.to(device=device, dtype=torch.int32)),
+        output_path=output_path / "text_encoder" / "model.onnx",
+        ordered_input_names=["input_ids"],
+        output_names=["last_hidden_state", "pooler_output"],
+        dynamic_axes={
+            "input_ids": {0: "batch", 1: "sequence"},
+        },
+        opset=opset,
+    )
+    del pipeline.text_encoder
+
+    # # UNET
+    if sd_xl:
+        controlnets = torch.nn.ModuleList(controlnets)
+        unet_controlnet = UNet2DConditionXLControlNetModel(pipeline.unet, controlnets)
+        unet_in_channels = pipeline.unet.config.in_channels
+        unet_sample_size = pipeline.unet.config.sample_size
+        text_hidden_size = 2048
+        img_size = 8 * unet_sample_size
+        unet_path = output_path / "unet" / "model.onnx"
+
+        onnx_export(
+            unet_controlnet,
+            model_args=(
+                torch.randn(2, unet_in_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
+                torch.tensor([1.0]).to(device=device, dtype=dtype),
+                torch.randn(2, num_tokens, text_hidden_size).to(device=device, dtype=dtype),
+                torch.randn(len(controlnets), 2, 3, img_size, img_size).to(device=device, dtype=dtype),
+                torch.randn(len(controlnets), 1).to(device=device, dtype=dtype),
+                torch.randn(2, 1280).to(device=device, dtype=dtype),
+                torch.rand(2, 6).to(device=device, dtype=dtype),
+            ),
+            output_path=unet_path,
+            ordered_input_names=[
+                "sample",
+                "timestep",
+                "encoder_hidden_states",
+                "controlnet_conds",
+                "conditioning_scales",
+                "text_embeds",
+                "time_ids",
+            ],
+            output_names=["noise_pred"],  # has to be different from "sample" for correct tracing
+            dynamic_axes={
+                "sample": {0: "2B", 2: "H", 3: "W"},
+                "encoder_hidden_states": {0: "2B"},
+                "controlnet_conds": {1: "2B", 3: "8H", 4: "8W"},
+                "text_embeds": {0: "2B"},
+                "time_ids": {0: "2B"},
+            },
+            opset=opset,
+            use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
+        )
+        unet_model_path = str(unet_path.absolute().as_posix())
+        unet_dir = os.path.dirname(unet_model_path)
+        # optimize onnx
+        shape_inference.infer_shapes_path(unet_model_path, unet_model_path)
+        unet_opt_graph = optimize(onnx.load(unet_model_path), name="Unet", verbose=True)
+        # clean up existing tensor files
+        shutil.rmtree(unet_dir)
+        os.mkdir(unet_dir)
+        # collate external tensor files into one
+        onnx.save_model(
+            unet_opt_graph,
+            unet_model_path,
+            save_as_external_data=True,
+            all_tensors_to_one_file=True,
+            location="weights.pb",
+            convert_attribute=False,
+        )
+        del pipeline.unet
+    else:
+        controlnets = torch.nn.ModuleList(controlnets)
+        unet_controlnet = UNet2DConditionControlNetModel(pipeline.unet, controlnets)
+        unet_in_channels = pipeline.unet.config.in_channels
+        unet_sample_size = pipeline.unet.config.sample_size
+        img_size = 8 * unet_sample_size
+        unet_path = output_path / "unet" / "model.onnx"
+
+        onnx_export(
+            unet_controlnet,
+            model_args=(
+                torch.randn(2, unet_in_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
+                torch.tensor([1.0]).to(device=device, dtype=dtype),
+                torch.randn(2, num_tokens, text_hidden_size).to(device=device, dtype=dtype),
+                torch.randn(len(controlnets), 2, 3, img_size, img_size).to(device=device, dtype=dtype),
+                torch.randn(len(controlnets), 1).to(device=device, dtype=dtype),
+            ),
+            output_path=unet_path,
+            ordered_input_names=[
+                "sample",
+                "timestep",
+                "encoder_hidden_states",
+                "controlnet_conds",
+                "conditioning_scales",
+            ],
+            output_names=["noise_pred"],  # has to be different from "sample" for correct tracing
+            dynamic_axes={
+                "sample": {0: "2B", 2: "H", 3: "W"},
+                "encoder_hidden_states": {0: "2B"},
+                "controlnet_conds": {1: "2B", 3: "8H", 4: "8W"},
+            },
+            opset=opset,
+            use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
+        )
+        unet_model_path = str(unet_path.absolute().as_posix())
+        unet_dir = os.path.dirname(unet_model_path)
+        # optimize onnx
+        shape_inference.infer_shapes_path(unet_model_path, unet_model_path)
+        unet_opt_graph = optimize(onnx.load(unet_model_path), name="Unet", verbose=True)
+        # clean up existing tensor files
+        shutil.rmtree(unet_dir)
+        os.mkdir(unet_dir)
+        # collate external tensor files into one
+        onnx.save_model(
+            unet_opt_graph,
+            unet_model_path,
+            save_as_external_data=True,
+            all_tensors_to_one_file=True,
+            location="weights.pb",
+            convert_attribute=False,
+        )
+        del pipeline.unet
+
+    # VAE ENCODER
+    vae_encoder = pipeline.vae
+    vae_in_channels = vae_encoder.config.in_channels
+    vae_sample_size = vae_encoder.config.sample_size
+    # need to get the raw tensor output (sample) from the encoder
+    vae_encoder.forward = lambda sample: vae_encoder.encode(sample).latent_dist.sample()
+    onnx_export(
+        vae_encoder,
+        model_args=(torch.randn(1, vae_in_channels, vae_sample_size, vae_sample_size).to(device=device, dtype=dtype),),
+        output_path=output_path / "vae_encoder" / "model.onnx",
+        ordered_input_names=["sample"],
+        output_names=["latent_sample"],
+        dynamic_axes={
+            "sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+        },
+        opset=opset,
+    )
+
+    # VAE DECODER
+    vae_decoder = pipeline.vae
+    vae_latent_channels = vae_decoder.config.latent_channels
+    # forward only through the decoder part
+    vae_decoder.forward = vae_encoder.decode
+    onnx_export(
+        vae_decoder,
+        model_args=(
+            torch.randn(1, vae_latent_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
+        ),
+        output_path=output_path / "vae_decoder" / "model.onnx",
+        ordered_input_names=["latent_sample"],
+        output_names=["sample"],
+        dynamic_axes={
+            "latent_sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+        },
+        opset=opset,
+    )
+    del pipeline.vae
+
+    del pipeline
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--sd_xl", action="store_true", default=False, help="SD XL pipeline")
+
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        required=True,
+        help="Path to the `diffusers` checkpoint to convert (either a local directory or on the Hub).",
+    )
+
+    parser.add_argument(
+        "--controlnet_path",
+        nargs="+",
+        required=True,
+        help="Path to the `controlnet` checkpoint to convert (either a local directory or on the Hub).",
+    )
+
+    parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
+
+    parser.add_argument(
+        "--opset",
+        default=14,
+        type=int,
+        help="The version of the ONNX operator set to use.",
+    )
+    parser.add_argument("--fp16", action="store_true", default=False, help="Export the models in `float16` mode")
+
+    args = parser.parse_args()
+
+    convert_models(args.model_path, args.controlnet_path, args.output_path, args.opset, args.fp16, args.sd_xl)
diff --git a/diffusers/scripts/convert_stable_diffusion_controlnet_to_tensorrt.py b/diffusers/scripts/convert_stable_diffusion_controlnet_to_tensorrt.py
new file mode 100644
index 0000000000000000000000000000000000000000..52ab02c221e91c655df9c1698afc49cdb5bdb91a
--- /dev/null
+++ b/diffusers/scripts/convert_stable_diffusion_controlnet_to_tensorrt.py
@@ -0,0 +1,121 @@
+import argparse
+import sys
+
+import tensorrt as trt
+
+
+def convert_models(onnx_path: str, num_controlnet: int, output_path: str, fp16: bool = False, sd_xl: bool = False):
+    """
+    Function to convert models in stable diffusion controlnet pipeline into TensorRT format
+
+    Example:
+    python convert_stable_diffusion_controlnet_to_tensorrt.py
+    --onnx_path path-to-models-stable_diffusion/RevAnimated-v1-2-2/unet/model.onnx
+    --output_path path-to-models-stable_diffusion/RevAnimated-v1-2-2/unet/model.engine
+    --fp16
+    --num_controlnet 2
+
+    Example for SD XL:
+    python convert_stable_diffusion_controlnet_to_tensorrt.py
+    --onnx_path path-to-models-stable_diffusion/stable-diffusion-xl-base-1.0/unet/model.onnx
+    --output_path path-to-models-stable_diffusion/stable-diffusion-xl-base-1.0/unet/model.engine
+    --fp16
+    --num_controlnet 1
+    --sd_xl
+
+    Returns:
+        unet/model.engine
+
+        run test script in diffusers/examples/community
+        python test_onnx_controlnet.py
+        --sd_model danbrown/RevAnimated-v1-2-2
+        --onnx_model_dir path-to-models-stable_diffusion/RevAnimated-v1-2-2
+        --unet_engine_path path-to-models-stable_diffusion/stable-diffusion-xl-base-1.0/unet/model.engine
+        --qr_img_path path-to-qr-code-image
+    """
+    # UNET
+    if sd_xl:
+        batch_size = 1
+        unet_in_channels = 4
+        unet_sample_size = 64
+        num_tokens = 77
+        text_hidden_size = 2048
+        img_size = 512
+
+        text_embeds_shape = (2 * batch_size, 1280)
+        time_ids_shape = (2 * batch_size, 6)
+    else:
+        batch_size = 1
+        unet_in_channels = 4
+        unet_sample_size = 64
+        num_tokens = 77
+        text_hidden_size = 768
+        img_size = 512
+        batch_size = 1
+
+    latents_shape = (2 * batch_size, unet_in_channels, unet_sample_size, unet_sample_size)
+    embed_shape = (2 * batch_size, num_tokens, text_hidden_size)
+    controlnet_conds_shape = (num_controlnet, 2 * batch_size, 3, img_size, img_size)
+
+    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
+    TRT_BUILDER = trt.Builder(TRT_LOGGER)
+    TRT_RUNTIME = trt.Runtime(TRT_LOGGER)
+
+    network = TRT_BUILDER.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+    onnx_parser = trt.OnnxParser(network, TRT_LOGGER)
+
+    parse_success = onnx_parser.parse_from_file(onnx_path)
+    for idx in range(onnx_parser.num_errors):
+        print(onnx_parser.get_error(idx))
+    if not parse_success:
+        sys.exit("ONNX model parsing failed")
+    print("Load Onnx model done")
+
+    profile = TRT_BUILDER.create_optimization_profile()
+
+    profile.set_shape("sample", latents_shape, latents_shape, latents_shape)
+    profile.set_shape("encoder_hidden_states", embed_shape, embed_shape, embed_shape)
+    profile.set_shape("controlnet_conds", controlnet_conds_shape, controlnet_conds_shape, controlnet_conds_shape)
+    if sd_xl:
+        profile.set_shape("text_embeds", text_embeds_shape, text_embeds_shape, text_embeds_shape)
+        profile.set_shape("time_ids", time_ids_shape, time_ids_shape, time_ids_shape)
+
+    config = TRT_BUILDER.create_builder_config()
+    config.add_optimization_profile(profile)
+    config.set_preview_feature(trt.PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805, True)
+    if fp16:
+        config.set_flag(trt.BuilderFlag.FP16)
+
+    plan = TRT_BUILDER.build_serialized_network(network, config)
+    if plan is None:
+        sys.exit("Failed building engine")
+    print("Succeeded building engine")
+
+    engine = TRT_RUNTIME.deserialize_cuda_engine(plan)
+
+    ## save TRT engine
+    with open(output_path, "wb") as f:
+        f.write(engine.serialize())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--sd_xl", action="store_true", default=False, help="SD XL pipeline")
+
+    parser.add_argument(
+        "--onnx_path",
+        type=str,
+        required=True,
+        help="Path to the onnx checkpoint to convert",
+    )
+
+    parser.add_argument("--num_controlnet", type=int)
+
+    parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
+
+    parser.add_argument("--fp16", action="store_true", default=False, help="Export the models in `float16` mode")
+
+    args = parser.parse_args()
+
+    convert_models(args.onnx_path, args.num_controlnet, args.output_path, args.fp16, args.sd_xl)
diff --git a/diffusers/scripts/convert_svd_to_diffusers.py b/diffusers/scripts/convert_svd_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3243ce294b267b734c656581fc626717970f595c
--- /dev/null
+++ b/diffusers/scripts/convert_svd_to_diffusers.py
@@ -0,0 +1,730 @@
+from diffusers.utils import is_accelerate_available, logging
+
+
+if is_accelerate_available():
+    pass
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    if controlnet:
+        unet_params = original_config.model.params.control_stage_config.params
+    else:
+        if "unet_config" in original_config.model.params and original_config.model.params.unet_config is not None:
+            unet_params = original_config.model.params.unet_config.params
+        else:
+            unet_params = original_config.model.params.network_config.params
+
+    vae_params = original_config.model.params.first_stage_config.params.encoder_config.params
+
+    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = (
+            "CrossAttnDownBlockSpatioTemporal"
+            if resolution in unet_params.attention_resolutions
+            else "DownBlockSpatioTemporal"
+        )
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = (
+            "CrossAttnUpBlockSpatioTemporal"
+            if resolution in unet_params.attention_resolutions
+            else "UpBlockSpatioTemporal"
+        )
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    if unet_params.transformer_depth is not None:
+        transformer_layers_per_block = (
+            unet_params.transformer_depth
+            if isinstance(unet_params.transformer_depth, int)
+            else list(unet_params.transformer_depth)
+        )
+    else:
+        transformer_layers_per_block = 1
+
+    vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
+
+    head_dim = unet_params.num_heads if "num_heads" in unet_params else None
+    use_linear_projection = (
+        unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
+    )
+    if use_linear_projection:
+        # stable diffusion 2-base-512 and 2-768
+        if head_dim is None:
+            head_dim_mult = unet_params.model_channels // unet_params.num_head_channels
+            head_dim = [head_dim_mult * c for c in list(unet_params.channel_mult)]
+
+    class_embed_type = None
+    addition_embed_type = None
+    addition_time_embed_dim = None
+    projection_class_embeddings_input_dim = None
+    context_dim = None
+
+    if unet_params.context_dim is not None:
+        context_dim = (
+            unet_params.context_dim if isinstance(unet_params.context_dim, int) else unet_params.context_dim[0]
+        )
+
+    if "num_classes" in unet_params:
+        if unet_params.num_classes == "sequential":
+            addition_time_embed_dim = 256
+            assert "adm_in_channels" in unet_params
+            projection_class_embeddings_input_dim = unet_params.adm_in_channels
+
+    config = {
+        "sample_size": image_size // vae_scale_factor,
+        "in_channels": unet_params.in_channels,
+        "down_block_types": tuple(down_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params.num_res_blocks,
+        "cross_attention_dim": context_dim,
+        "attention_head_dim": head_dim,
+        "use_linear_projection": use_linear_projection,
+        "class_embed_type": class_embed_type,
+        "addition_embed_type": addition_embed_type,
+        "addition_time_embed_dim": addition_time_embed_dim,
+        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
+        "transformer_layers_per_block": transformer_layers_per_block,
+    }
+
+    if "disable_self_attentions" in unet_params:
+        config["only_cross_attention"] = unet_params.disable_self_attentions
+
+    if "num_classes" in unet_params and isinstance(unet_params.num_classes, int):
+        config["num_class_embeds"] = unet_params.num_classes
+
+    if controlnet:
+        config["conditioning_channels"] = unet_params.hint_channels
+    else:
+        config["out_channels"] = unet_params.out_channels
+        config["up_block_types"] = tuple(up_block_types)
+
+    return config
+
+
+def assign_to_checkpoint(
+    paths,
+    checkpoint,
+    old_checkpoint,
+    attention_paths_to_split=None,
+    additional_replacements=None,
+    config=None,
+    mid_block_suffix="",
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    if mid_block_suffix is not None:
+        mid_block_suffix = f".{mid_block_suffix}"
+    else:
+        mid_block_suffix = ""
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", f"mid_block.resnets.0{mid_block_suffix}")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", f"mid_block.resnets.1{mid_block_suffix}")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        if new_path == "mid_block.resnets.0.spatial_res_block.norm1.weight":
+            print("yeyy")
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        is_attn_weight = "proj_attn.weight" in new_path or ("attentions" in new_path and "to_" in new_path)
+        shape = old_checkpoint[path["old"]].shape
+        if is_attn_weight and len(shape) == 3:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        elif is_attn_weight and len(shape) == 4:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+        new_item = new_item.replace("time_stack", "temporal_transformer_blocks")
+
+        new_item = new_item.replace("time_pos_embed.0.bias", "time_pos_embed.linear_1.bias")
+        new_item = new_item.replace("time_pos_embed.0.weight", "time_pos_embed.linear_1.weight")
+        new_item = new_item.replace("time_pos_embed.2.bias", "time_pos_embed.linear_2.bias")
+        new_item = new_item.replace("time_pos_embed.2.weight", "time_pos_embed.linear_2.weight")
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = new_item.replace("time_stack.", "")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def convert_ldm_unet_checkpoint(
+    checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False
+):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+
+    if skip_extract_state_dict:
+        unet_state_dict = checkpoint
+    else:
+        # extract state_dict for UNet
+        unet_state_dict = {}
+        keys = list(checkpoint.keys())
+
+        unet_key = "model.diffusion_model."
+
+        # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+        if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+            logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
+            logger.warning(
+                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+            )
+            for key in keys:
+                if key.startswith("model.diffusion_model"):
+                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+        else:
+            if sum(k.startswith("model_ema") for k in keys) > 100:
+                logger.warning(
+                    "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                    " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+                )
+
+            for key in keys:
+                if key.startswith(unet_key):
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    if config["class_embed_type"] is None:
+        # No parameters to port
+        ...
+    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+    else:
+        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+
+    # if config["addition_embed_type"] == "text_time":
+    new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+    new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+    new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+    new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        spatial_resnets = [
+            key
+            for key in input_blocks[i]
+            if f"input_blocks.{i}.0" in key
+            and (
+                f"input_blocks.{i}.0.op" not in key
+                and f"input_blocks.{i}.0.time_stack" not in key
+                and f"input_blocks.{i}.0.time_mixer" not in key
+            )
+        ]
+        temporal_resnets = [key for key in input_blocks[i] if f"input_blocks.{i}.0.time_stack" in key]
+        # import ipdb; ipdb.set_trace()
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(spatial_resnets)
+        meta_path = {
+            "old": f"input_blocks.{i}.0",
+            "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}.spatial_res_block",
+        }
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        paths = renew_resnet_paths(temporal_resnets)
+        meta_path = {
+            "old": f"input_blocks.{i}.0",
+            "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}.temporal_res_block",
+        }
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        # TODO resnet time_mixer.mix_factor
+        if f"input_blocks.{i}.0.time_mixer.mix_factor" in unet_state_dict:
+            new_checkpoint[
+                f"down_blocks.{block_id}.resnets.{layer_in_block_id}.time_mixer.mix_factor"
+            ] = unet_state_dict[f"input_blocks.{i}.0.time_mixer.mix_factor"]
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            # import ipdb; ipdb.set_trace()
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_spatial = [key for key in resnet_0 if "time_stack" not in key and "time_mixer" not in key]
+    resnet_0_paths = renew_resnet_paths(resnet_0_spatial)
+    # import ipdb; ipdb.set_trace()
+    assign_to_checkpoint(
+        resnet_0_paths, new_checkpoint, unet_state_dict, config=config, mid_block_suffix="spatial_res_block"
+    )
+
+    resnet_0_temporal = [key for key in resnet_0 if "time_stack" in key and "time_mixer" not in key]
+    resnet_0_paths = renew_resnet_paths(resnet_0_temporal)
+    assign_to_checkpoint(
+        resnet_0_paths, new_checkpoint, unet_state_dict, config=config, mid_block_suffix="temporal_res_block"
+    )
+
+    resnet_1_spatial = [key for key in resnet_1 if "time_stack" not in key and "time_mixer" not in key]
+    resnet_1_paths = renew_resnet_paths(resnet_1_spatial)
+    assign_to_checkpoint(
+        resnet_1_paths, new_checkpoint, unet_state_dict, config=config, mid_block_suffix="spatial_res_block"
+    )
+
+    resnet_1_temporal = [key for key in resnet_1 if "time_stack" in key and "time_mixer" not in key]
+    resnet_1_paths = renew_resnet_paths(resnet_1_temporal)
+    assign_to_checkpoint(
+        resnet_1_paths, new_checkpoint, unet_state_dict, config=config, mid_block_suffix="temporal_res_block"
+    )
+
+    new_checkpoint["mid_block.resnets.0.time_mixer.mix_factor"] = unet_state_dict[
+        "middle_block.0.time_mixer.mix_factor"
+    ]
+    new_checkpoint["mid_block.resnets.1.time_mixer.mix_factor"] = unet_state_dict[
+        "middle_block.2.time_mixer.mix_factor"
+    ]
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            spatial_resnets = [
+                key
+                for key in output_blocks[i]
+                if f"output_blocks.{i}.0" in key
+                and (f"output_blocks.{i}.0.time_stack" not in key and "time_mixer" not in key)
+            ]
+            # import ipdb; ipdb.set_trace()
+
+            temporal_resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0.time_stack" in key]
+
+            paths = renew_resnet_paths(spatial_resnets)
+            meta_path = {
+                "old": f"output_blocks.{i}.0",
+                "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}.spatial_res_block",
+            }
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            paths = renew_resnet_paths(temporal_resnets)
+            meta_path = {
+                "old": f"output_blocks.{i}.0",
+                "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}.temporal_res_block",
+            }
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            if f"output_blocks.{i}.0.time_mixer.mix_factor" in unet_state_dict:
+                new_checkpoint[
+                    f"up_blocks.{block_id}.resnets.{layer_in_block_id}.time_mixer.mix_factor"
+                ] = unet_state_dict[f"output_blocks.{i}.0.time_mixer.mix_factor"]
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key and "conv" not in key]
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                # import ipdb; ipdb.set_trace()
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            spatial_layers = [
+                layer for layer in output_block_layers if "time_stack" not in layer and "time_mixer" not in layer
+            ]
+            resnet_0_paths = renew_resnet_paths(spatial_layers, n_shave_prefix_segments=1)
+            # import ipdb; ipdb.set_trace()
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(
+                    ["up_blocks", str(block_id), "resnets", str(layer_in_block_id), "spatial_res_block", path["new"]]
+                )
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+            temporal_layers = [
+                layer for layer in output_block_layers if "time_stack" in layer and "time_mixer" not in key
+            ]
+            resnet_0_paths = renew_resnet_paths(temporal_layers, n_shave_prefix_segments=1)
+            # import ipdb; ipdb.set_trace()
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(
+                    ["up_blocks", str(block_id), "resnets", str(layer_in_block_id), "temporal_res_block", path["new"]]
+                )
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+            new_checkpoint["up_blocks.0.resnets.0.time_mixer.mix_factor"] = unet_state_dict[
+                f"output_blocks.{str(i)}.0.time_mixer.mix_factor"
+            ]
+
+    return new_checkpoint
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["to_q.weight", "to_k.weight", "to_v.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0, is_temporal=False):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        # Temporal resnet
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = new_item.replace("time_stack.", "temporal_res_block.")
+
+        # Spatial resnet
+        new_item = new_item.replace("conv1", "spatial_res_block.conv1")
+        new_item = new_item.replace("norm1", "spatial_res_block.norm1")
+
+        new_item = new_item.replace("conv2", "spatial_res_block.conv2")
+        new_item = new_item.replace("norm2", "spatial_res_block.norm2")
+
+        new_item = new_item.replace("nin_shortcut", "spatial_res_block.conv_shortcut")
+
+        new_item = new_item.replace("mix_factor", "spatial_res_block.time_mixer.mix_factor")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "to_q.weight")
+        new_item = new_item.replace("q.bias", "to_q.bias")
+
+        new_item = new_item.replace("k.weight", "to_k.weight")
+        new_item = new_item.replace("k.bias", "to_k.bias")
+
+        new_item = new_item.replace("v.weight", "to_v.weight")
+        new_item = new_item.replace("v.bias", "to_v.bias")
+
+        new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
+        new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    keys = list(checkpoint.keys())
+    vae_key = "first_stage_model." if any(k.startswith("first_stage_model.") for k in keys) else ""
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+    new_checkpoint["decoder.time_conv_out.weight"] = vae_state_dict["decoder.time_mix_conv.weight"]
+    new_checkpoint["decoder.time_conv_out.bias"] = vae_state_dict["decoder.time_mix_conv.bias"]
+
+    # new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    # new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    # new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    # new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
diff --git a/diffusers/scripts/convert_tiny_autoencoder_to_diffusers.py b/diffusers/scripts/convert_tiny_autoencoder_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bb2df98a77a62400cfcea024d1d1ddd2b3f3368
--- /dev/null
+++ b/diffusers/scripts/convert_tiny_autoencoder_to_diffusers.py
@@ -0,0 +1,71 @@
+import argparse
+
+import safetensors.torch
+
+from diffusers import AutoencoderTiny
+
+
+"""
+Example - From the diffusers root directory:
+
+Download the weights:
+```sh
+$ wget -q https://huggingface.co/madebyollin/taesd/resolve/main/taesd_encoder.safetensors
+$ wget -q https://huggingface.co/madebyollin/taesd/resolve/main/taesd_decoder.safetensors
+```
+
+Convert the model:
+```sh
+$ python scripts/convert_tiny_autoencoder_to_diffusers.py \
+    --encoder_ckpt_path  taesd_encoder.safetensors \
+    --decoder_ckpt_path taesd_decoder.safetensors \
+    --dump_path taesd-diffusers
+```
+"""
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument(
+        "--encoder_ckpt_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the encoder ckpt.",
+    )
+    parser.add_argument(
+        "--decoder_ckpt_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the decoder ckpt.",
+    )
+    parser.add_argument(
+        "--use_safetensors", action="store_true", help="Whether to serialize in the safetensors format."
+    )
+    args = parser.parse_args()
+
+    print("Loading the original state_dicts of the encoder and the decoder...")
+    encoder_state_dict = safetensors.torch.load_file(args.encoder_ckpt_path)
+    decoder_state_dict = safetensors.torch.load_file(args.decoder_ckpt_path)
+
+    print("Populating the state_dicts in the diffusers format...")
+    tiny_autoencoder = AutoencoderTiny()
+    new_state_dict = {}
+
+    # Modify the encoder state dict.
+    for k in encoder_state_dict:
+        new_state_dict.update({f"encoder.layers.{k}": encoder_state_dict[k]})
+
+    # Modify the decoder state dict.
+    for k in decoder_state_dict:
+        layer_id = int(k.split(".")[0]) - 1
+        new_k = str(layer_id) + "." + ".".join(k.split(".")[1:])
+        new_state_dict.update({f"decoder.layers.{new_k}": decoder_state_dict[k]})
+
+    # Assertion tests with the original implementation can be found here:
+    # https://gist.github.com/sayakpaul/337b0988f08bd2cf2b248206f760e28f
+    tiny_autoencoder.load_state_dict(new_state_dict)
+    print("Population successful, serializing...")
+    tiny_autoencoder.save_pretrained(args.dump_path, safe_serialization=args.use_safetensors)
diff --git a/diffusers/scripts/convert_unclip_txt2img_to_image_variation.py b/diffusers/scripts/convert_unclip_txt2img_to_image_variation.py
new file mode 100644
index 0000000000000000000000000000000000000000..07f8ebf2a3d012600a533dcfa642b609c31a3d8c
--- /dev/null
+++ b/diffusers/scripts/convert_unclip_txt2img_to_image_variation.py
@@ -0,0 +1,41 @@
+import argparse
+
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+from diffusers import UnCLIPImageVariationPipeline, UnCLIPPipeline
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    parser.add_argument(
+        "--txt2img_unclip",
+        default="kakaobrain/karlo-v1-alpha",
+        type=str,
+        required=False,
+        help="The pretrained txt2img unclip.",
+    )
+
+    args = parser.parse_args()
+
+    txt2img = UnCLIPPipeline.from_pretrained(args.txt2img_unclip)
+
+    feature_extractor = CLIPImageProcessor()
+    image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+
+    img2img = UnCLIPImageVariationPipeline(
+        decoder=txt2img.decoder,
+        text_encoder=txt2img.text_encoder,
+        tokenizer=txt2img.tokenizer,
+        text_proj=txt2img.text_proj,
+        feature_extractor=feature_extractor,
+        image_encoder=image_encoder,
+        super_res_first=txt2img.super_res_first,
+        super_res_last=txt2img.super_res_last,
+        decoder_scheduler=txt2img.decoder_scheduler,
+        super_res_scheduler=txt2img.super_res_scheduler,
+    )
+
+    img2img.save_pretrained(args.dump_path)
diff --git a/diffusers/scripts/convert_unidiffuser_to_diffusers.py b/diffusers/scripts/convert_unidiffuser_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c38172754f6645111b8eeed75cf1cd85dbeb30f
--- /dev/null
+++ b/diffusers/scripts/convert_unidiffuser_to_diffusers.py
@@ -0,0 +1,786 @@
+# Convert the original UniDiffuser checkpoints into diffusers equivalents.
+
+import argparse
+from argparse import Namespace
+
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+    GPT2Tokenizer,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DPMSolverMultistepScheduler,
+    UniDiffuserModel,
+    UniDiffuserPipeline,
+    UniDiffuserTextDecoder,
+)
+
+
+SCHEDULER_CONFIG = Namespace(
+    **{
+        "beta_start": 0.00085,
+        "beta_end": 0.012,
+        "beta_schedule": "scaled_linear",
+        "solver_order": 3,
+    }
+)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_attention_paths
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "to_q.weight")
+        new_item = new_item.replace("q.bias", "to_q.bias")
+
+        new_item = new_item.replace("k.weight", "to_k.weight")
+        new_item = new_item.replace("k.bias", "to_k.bias")
+
+        new_item = new_item.replace("v.weight", "to_v.weight")
+        new_item = new_item.replace("v.bias", "to_v.bias")
+
+        new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
+        new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+# Modified from diffusers.pipelines.stable_diffusion.convert_from_ckpt.assign_to_checkpoint
+# config.num_head_channels => num_head_channels
+def assign_to_checkpoint(
+    paths,
+    checkpoint,
+    old_checkpoint,
+    attention_paths_to_split=None,
+    additional_replacements=None,
+    num_head_channels=1,
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // num_head_channels // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        is_attn_weight = "proj_attn.weight" in new_path or ("attentions" in new_path and "to_" in new_path)
+        shape = old_checkpoint[path["old"]].shape
+        if is_attn_weight and len(shape) == 3:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        elif is_attn_weight and len(shape) == 4:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def create_vae_diffusers_config(config_type):
+    # Hardcoded for now
+    if args.config_type == "test":
+        vae_config = create_vae_diffusers_config_test()
+    elif args.config_type == "big":
+        vae_config = create_vae_diffusers_config_big()
+    else:
+        raise NotImplementedError(
+            f"Config type {config_type} is not implemented, currently only config types"
+            " 'test' and 'big' are available."
+        )
+    return vae_config
+
+
+def create_unidiffuser_unet_config(config_type, version):
+    # Hardcoded for now
+    if args.config_type == "test":
+        unet_config = create_unidiffuser_unet_config_test()
+    elif args.config_type == "big":
+        unet_config = create_unidiffuser_unet_config_big()
+    else:
+        raise NotImplementedError(
+            f"Config type {config_type} is not implemented, currently only config types"
+            " 'test' and 'big' are available."
+        )
+    # Unidiffuser-v1 uses data type embeddings
+    if version == 1:
+        unet_config["use_data_type_embedding"] = True
+    return unet_config
+
+
+def create_text_decoder_config(config_type):
+    # Hardcoded for now
+    if args.config_type == "test":
+        text_decoder_config = create_text_decoder_config_test()
+    elif args.config_type == "big":
+        text_decoder_config = create_text_decoder_config_big()
+    else:
+        raise NotImplementedError(
+            f"Config type {config_type} is not implemented, currently only config types"
+            " 'test' and 'big' are available."
+        )
+    return text_decoder_config
+
+
+# Hardcoded configs for test versions of the UniDiffuser models, corresponding to those in the fast default tests.
+def create_vae_diffusers_config_test():
+    vae_config = {
+        "sample_size": 32,
+        "in_channels": 3,
+        "out_channels": 3,
+        "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
+        "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
+        "block_out_channels": [32, 64],
+        "latent_channels": 4,
+        "layers_per_block": 1,
+    }
+    return vae_config
+
+
+def create_unidiffuser_unet_config_test():
+    unet_config = {
+        "text_dim": 32,
+        "clip_img_dim": 32,
+        "num_text_tokens": 77,
+        "num_attention_heads": 2,
+        "attention_head_dim": 8,
+        "in_channels": 4,
+        "out_channels": 4,
+        "num_layers": 2,
+        "dropout": 0.0,
+        "norm_num_groups": 32,
+        "attention_bias": False,
+        "sample_size": 16,
+        "patch_size": 2,
+        "activation_fn": "gelu",
+        "num_embeds_ada_norm": 1000,
+        "norm_type": "layer_norm",
+        "block_type": "unidiffuser",
+        "pre_layer_norm": False,
+        "use_timestep_embedding": False,
+        "norm_elementwise_affine": True,
+        "use_patch_pos_embed": False,
+        "ff_final_dropout": True,
+        "use_data_type_embedding": False,
+    }
+    return unet_config
+
+
+def create_text_decoder_config_test():
+    text_decoder_config = {
+        "prefix_length": 77,
+        "prefix_inner_dim": 32,
+        "prefix_hidden_dim": 32,
+        "vocab_size": 1025,  # 1024 + 1 for new EOS token
+        "n_positions": 1024,
+        "n_embd": 32,
+        "n_layer": 5,
+        "n_head": 4,
+        "n_inner": 37,
+        "activation_function": "gelu",
+        "resid_pdrop": 0.1,
+        "embd_pdrop": 0.1,
+        "attn_pdrop": 0.1,
+        "layer_norm_epsilon": 1e-5,
+        "initializer_range": 0.02,
+    }
+    return text_decoder_config
+
+
+# Hardcoded configs for the UniDiffuser V1 model at https://huggingface.co/thu-ml/unidiffuser-v1
+# See also https://github.com/thu-ml/unidiffuser/blob/main/configs/sample_unidiffuser_v1.py
+def create_vae_diffusers_config_big():
+    vae_config = {
+        "sample_size": 256,
+        "in_channels": 3,
+        "out_channels": 3,
+        "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
+        "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+        "block_out_channels": [128, 256, 512, 512],
+        "latent_channels": 4,
+        "layers_per_block": 2,
+    }
+    return vae_config
+
+
+def create_unidiffuser_unet_config_big():
+    unet_config = {
+        "text_dim": 64,
+        "clip_img_dim": 512,
+        "num_text_tokens": 77,
+        "num_attention_heads": 24,
+        "attention_head_dim": 64,
+        "in_channels": 4,
+        "out_channels": 4,
+        "num_layers": 30,
+        "dropout": 0.0,
+        "norm_num_groups": 32,
+        "attention_bias": False,
+        "sample_size": 64,
+        "patch_size": 2,
+        "activation_fn": "gelu",
+        "num_embeds_ada_norm": 1000,
+        "norm_type": "layer_norm",
+        "block_type": "unidiffuser",
+        "pre_layer_norm": False,
+        "use_timestep_embedding": False,
+        "norm_elementwise_affine": True,
+        "use_patch_pos_embed": False,
+        "ff_final_dropout": True,
+        "use_data_type_embedding": False,
+    }
+    return unet_config
+
+
+# From https://huggingface.co/gpt2/blob/main/config.json, the GPT2 checkpoint used by UniDiffuser
+def create_text_decoder_config_big():
+    text_decoder_config = {
+        "prefix_length": 77,
+        "prefix_inner_dim": 768,
+        "prefix_hidden_dim": 64,
+        "vocab_size": 50258,  # 50257 + 1 for new EOS token
+        "n_positions": 1024,
+        "n_embd": 768,
+        "n_layer": 12,
+        "n_head": 12,
+        "n_inner": 3072,
+        "activation_function": "gelu",
+        "resid_pdrop": 0.1,
+        "embd_pdrop": 0.1,
+        "attn_pdrop": 0.1,
+        "layer_norm_epsilon": 1e-5,
+        "initializer_range": 0.02,
+    }
+    return text_decoder_config
+
+
+# Based on diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_vae_checkpoint
+def convert_vae_to_diffusers(ckpt, diffusers_model, num_head_channels=1):
+    """
+    Converts a UniDiffuser autoencoder_kl.pth checkpoint to a diffusers AutoencoderKL.
+    """
+    # autoencoder_kl.pth ckpt is a torch state dict
+    vae_state_dict = torch.load(ckpt, map_location="cpu")
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(
+            paths,
+            new_checkpoint,
+            vae_state_dict,
+            additional_replacements=[meta_path],
+            num_head_channels=num_head_channels,  # not used in vae
+        )
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(
+            paths,
+            new_checkpoint,
+            vae_state_dict,
+            additional_replacements=[meta_path],
+            num_head_channels=num_head_channels,  # not used in vae
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        vae_state_dict,
+        additional_replacements=[meta_path],
+        num_head_channels=num_head_channels,  # not used in vae
+    )
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(
+            paths,
+            new_checkpoint,
+            vae_state_dict,
+            additional_replacements=[meta_path],
+            num_head_channels=num_head_channels,  # not used in vae
+        )
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(
+            paths,
+            new_checkpoint,
+            vae_state_dict,
+            additional_replacements=[meta_path],
+            num_head_channels=num_head_channels,  # not used in vae
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        vae_state_dict,
+        additional_replacements=[meta_path],
+        num_head_channels=num_head_channels,  # not used in vae
+    )
+    conv_attn_to_linear(new_checkpoint)
+
+    missing_keys, unexpected_keys = diffusers_model.load_state_dict(new_checkpoint)
+    for missing_key in missing_keys:
+        print(f"Missing key: {missing_key}")
+    for unexpected_key in unexpected_keys:
+        print(f"Unexpected key: {unexpected_key}")
+
+    return diffusers_model
+
+
+def convert_uvit_block_to_diffusers_block(
+    uvit_state_dict,
+    new_state_dict,
+    block_prefix,
+    new_prefix="transformer.transformer_",
+    skip_connection=False,
+):
+    """
+    Maps the keys in a UniDiffuser transformer block (`Block`) to the keys in a diffusers transformer block
+    (`UTransformerBlock`/`UniDiffuserBlock`).
+    """
+    prefix = new_prefix + block_prefix
+    if skip_connection:
+        new_state_dict[prefix + ".skip.skip_linear.weight"] = uvit_state_dict[block_prefix + ".skip_linear.weight"]
+        new_state_dict[prefix + ".skip.skip_linear.bias"] = uvit_state_dict[block_prefix + ".skip_linear.bias"]
+        new_state_dict[prefix + ".skip.norm.weight"] = uvit_state_dict[block_prefix + ".norm1.weight"]
+        new_state_dict[prefix + ".skip.norm.bias"] = uvit_state_dict[block_prefix + ".norm1.bias"]
+
+        # Create the prefix string for out_blocks.
+        prefix += ".block"
+
+    # Split up attention qkv.weight into to_q.weight, to_k.weight, to_v.weight
+    qkv = uvit_state_dict[block_prefix + ".attn.qkv.weight"]
+    new_attn_keys = [".attn1.to_q.weight", ".attn1.to_k.weight", ".attn1.to_v.weight"]
+    new_attn_keys = [prefix + key for key in new_attn_keys]
+    shape = qkv.shape[0] // len(new_attn_keys)
+    for i, attn_key in enumerate(new_attn_keys):
+        new_state_dict[attn_key] = qkv[i * shape : (i + 1) * shape]
+
+    new_state_dict[prefix + ".attn1.to_out.0.weight"] = uvit_state_dict[block_prefix + ".attn.proj.weight"]
+    new_state_dict[prefix + ".attn1.to_out.0.bias"] = uvit_state_dict[block_prefix + ".attn.proj.bias"]
+    new_state_dict[prefix + ".norm1.weight"] = uvit_state_dict[block_prefix + ".norm2.weight"]
+    new_state_dict[prefix + ".norm1.bias"] = uvit_state_dict[block_prefix + ".norm2.bias"]
+    new_state_dict[prefix + ".ff.net.0.proj.weight"] = uvit_state_dict[block_prefix + ".mlp.fc1.weight"]
+    new_state_dict[prefix + ".ff.net.0.proj.bias"] = uvit_state_dict[block_prefix + ".mlp.fc1.bias"]
+    new_state_dict[prefix + ".ff.net.2.weight"] = uvit_state_dict[block_prefix + ".mlp.fc2.weight"]
+    new_state_dict[prefix + ".ff.net.2.bias"] = uvit_state_dict[block_prefix + ".mlp.fc2.bias"]
+    new_state_dict[prefix + ".norm3.weight"] = uvit_state_dict[block_prefix + ".norm3.weight"]
+    new_state_dict[prefix + ".norm3.bias"] = uvit_state_dict[block_prefix + ".norm3.bias"]
+
+    return uvit_state_dict, new_state_dict
+
+
+def convert_uvit_to_diffusers(ckpt, diffusers_model):
+    """
+    Converts a UniDiffuser uvit_v*.pth checkpoint to a diffusers UniDiffusersModel.
+    """
+    # uvit_v*.pth ckpt is a torch state dict
+    uvit_state_dict = torch.load(ckpt, map_location="cpu")
+
+    new_state_dict = {}
+
+    # Input layers
+    new_state_dict["vae_img_in.proj.weight"] = uvit_state_dict["patch_embed.proj.weight"]
+    new_state_dict["vae_img_in.proj.bias"] = uvit_state_dict["patch_embed.proj.bias"]
+    new_state_dict["clip_img_in.weight"] = uvit_state_dict["clip_img_embed.weight"]
+    new_state_dict["clip_img_in.bias"] = uvit_state_dict["clip_img_embed.bias"]
+    new_state_dict["text_in.weight"] = uvit_state_dict["text_embed.weight"]
+    new_state_dict["text_in.bias"] = uvit_state_dict["text_embed.bias"]
+
+    new_state_dict["pos_embed"] = uvit_state_dict["pos_embed"]
+
+    # Handle data type token embeddings for UniDiffuser-v1
+    if "token_embedding.weight" in uvit_state_dict and diffusers_model.use_data_type_embedding:
+        new_state_dict["data_type_pos_embed_token"] = uvit_state_dict["pos_embed_token"]
+        new_state_dict["data_type_token_embedding.weight"] = uvit_state_dict["token_embedding.weight"]
+
+    # Also initialize the PatchEmbedding in UTransformer2DModel with the PatchEmbedding from the checkpoint.
+    # This isn't used in the current implementation, so might want to remove.
+    new_state_dict["transformer.pos_embed.proj.weight"] = uvit_state_dict["patch_embed.proj.weight"]
+    new_state_dict["transformer.pos_embed.proj.bias"] = uvit_state_dict["patch_embed.proj.bias"]
+
+    # Output layers
+    new_state_dict["transformer.norm_out.weight"] = uvit_state_dict["norm.weight"]
+    new_state_dict["transformer.norm_out.bias"] = uvit_state_dict["norm.bias"]
+
+    new_state_dict["vae_img_out.weight"] = uvit_state_dict["decoder_pred.weight"]
+    new_state_dict["vae_img_out.bias"] = uvit_state_dict["decoder_pred.bias"]
+    new_state_dict["clip_img_out.weight"] = uvit_state_dict["clip_img_out.weight"]
+    new_state_dict["clip_img_out.bias"] = uvit_state_dict["clip_img_out.bias"]
+    new_state_dict["text_out.weight"] = uvit_state_dict["text_out.weight"]
+    new_state_dict["text_out.bias"] = uvit_state_dict["text_out.bias"]
+
+    # in_blocks
+    in_blocks_prefixes = {".".join(layer.split(".")[:2]) for layer in uvit_state_dict if "in_blocks" in layer}
+    for in_block_prefix in list(in_blocks_prefixes):
+        convert_uvit_block_to_diffusers_block(uvit_state_dict, new_state_dict, in_block_prefix)
+
+    # mid_block
+    # Assume there's only one mid block
+    convert_uvit_block_to_diffusers_block(uvit_state_dict, new_state_dict, "mid_block")
+
+    # out_blocks
+    out_blocks_prefixes = {".".join(layer.split(".")[:2]) for layer in uvit_state_dict if "out_blocks" in layer}
+    for out_block_prefix in list(out_blocks_prefixes):
+        convert_uvit_block_to_diffusers_block(uvit_state_dict, new_state_dict, out_block_prefix, skip_connection=True)
+
+    missing_keys, unexpected_keys = diffusers_model.load_state_dict(new_state_dict)
+    for missing_key in missing_keys:
+        print(f"Missing key: {missing_key}")
+    for unexpected_key in unexpected_keys:
+        print(f"Unexpected key: {unexpected_key}")
+
+    return diffusers_model
+
+
+def convert_caption_decoder_to_diffusers(ckpt, diffusers_model):
+    """
+    Converts a UniDiffuser caption_decoder.pth checkpoint to a diffusers UniDiffuserTextDecoder.
+    """
+    # caption_decoder.pth ckpt is a torch state dict
+    checkpoint_state_dict = torch.load(ckpt, map_location="cpu")
+    decoder_state_dict = {}
+    # Remove the "module." prefix, if necessary
+    caption_decoder_key = "module."
+    for key in checkpoint_state_dict:
+        if key.startswith(caption_decoder_key):
+            decoder_state_dict[key.replace(caption_decoder_key, "")] = checkpoint_state_dict.get(key)
+        else:
+            decoder_state_dict[key] = checkpoint_state_dict.get(key)
+
+    new_state_dict = {}
+
+    # Encoder and Decoder
+    new_state_dict["encode_prefix.weight"] = decoder_state_dict["encode_prefix.weight"]
+    new_state_dict["encode_prefix.bias"] = decoder_state_dict["encode_prefix.bias"]
+    new_state_dict["decode_prefix.weight"] = decoder_state_dict["decode_prefix.weight"]
+    new_state_dict["decode_prefix.bias"] = decoder_state_dict["decode_prefix.bias"]
+
+    # Internal GPT2LMHeadModel transformer model
+    for key, val in decoder_state_dict.items():
+        if key.startswith("gpt"):
+            suffix = key[len("gpt") :]
+            new_state_dict["transformer" + suffix] = val
+
+    missing_keys, unexpected_keys = diffusers_model.load_state_dict(new_state_dict)
+    for missing_key in missing_keys:
+        print(f"Missing key: {missing_key}")
+    for unexpected_key in unexpected_keys:
+        print(f"Unexpected key: {unexpected_key}")
+
+    return diffusers_model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--caption_decoder_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to caption decoder checkpoint to convert.",
+    )
+    parser.add_argument(
+        "--uvit_checkpoint_path", default=None, type=str, required=False, help="Path to U-ViT checkpoint to convert."
+    )
+    parser.add_argument(
+        "--vae_checkpoint_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Path to VAE checkpoint to convert.",
+    )
+    parser.add_argument(
+        "--pipeline_output_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to save the output pipeline to.",
+    )
+    parser.add_argument(
+        "--config_type",
+        default="test",
+        type=str,
+        help=(
+            "Config type to use. Should be 'test' to create small models for testing or 'big' to convert a full"
+            " checkpoint."
+        ),
+    )
+    parser.add_argument(
+        "--version",
+        default=0,
+        type=int,
+        help="The UniDiffuser model type to convert to. Should be 0 for UniDiffuser-v0 and 1 for UniDiffuser-v1.",
+    )
+    parser.add_argument(
+        "--safe_serialization",
+        action="store_true",
+        help="Whether to use safetensors/safe seialization when saving the pipeline.",
+    )
+
+    args = parser.parse_args()
+
+    # Convert the VAE model.
+    if args.vae_checkpoint_path is not None:
+        vae_config = create_vae_diffusers_config(args.config_type)
+        vae = AutoencoderKL(**vae_config)
+        vae = convert_vae_to_diffusers(args.vae_checkpoint_path, vae)
+
+    # Convert the U-ViT ("unet") model.
+    if args.uvit_checkpoint_path is not None:
+        unet_config = create_unidiffuser_unet_config(args.config_type, args.version)
+        unet = UniDiffuserModel(**unet_config)
+        unet = convert_uvit_to_diffusers(args.uvit_checkpoint_path, unet)
+
+    # Convert the caption decoder ("text_decoder") model.
+    if args.caption_decoder_checkpoint_path is not None:
+        text_decoder_config = create_text_decoder_config(args.config_type)
+        text_decoder = UniDiffuserTextDecoder(**text_decoder_config)
+        text_decoder = convert_caption_decoder_to_diffusers(args.caption_decoder_checkpoint_path, text_decoder)
+
+    # Scheduler is the same for both the test and big models.
+    scheduler_config = SCHEDULER_CONFIG
+    scheduler = DPMSolverMultistepScheduler(
+        beta_start=scheduler_config.beta_start,
+        beta_end=scheduler_config.beta_end,
+        beta_schedule=scheduler_config.beta_schedule,
+        solver_order=scheduler_config.solver_order,
+    )
+
+    if args.config_type == "test":
+        # Make a small random CLIPTextModel
+        torch.manual_seed(0)
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(clip_text_encoder_config)
+        clip_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # Make a small random CLIPVisionModel and accompanying CLIPImageProcessor
+        torch.manual_seed(0)
+        clip_image_encoder_config = CLIPVisionConfig(
+            image_size=32,
+            patch_size=2,
+            num_channels=3,
+            hidden_size=32,
+            projection_dim=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            dropout=0.1,
+            attention_dropout=0.1,
+            initializer_range=0.02,
+        )
+        image_encoder = CLIPVisionModelWithProjection(clip_image_encoder_config)
+        image_processor = CLIPImageProcessor(crop_size=32, size=32)
+
+        # Note that the text_decoder should already have its token embeddings resized.
+        text_tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model")
+        eos = "<|EOS|>"
+        special_tokens_dict = {"eos_token": eos}
+        text_tokenizer.add_special_tokens(special_tokens_dict)
+    elif args.config_type == "big":
+        text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+        clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
+        image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+        # Note that the text_decoder should already have its token embeddings resized.
+        text_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        eos = "<|EOS|>"
+        special_tokens_dict = {"eos_token": eos}
+        text_tokenizer.add_special_tokens(special_tokens_dict)
+    else:
+        raise NotImplementedError(
+            f"Config type {args.config_type} is not implemented, currently only config types"
+            " 'test' and 'big' are available."
+        )
+
+    pipeline = UniDiffuserPipeline(
+        vae=vae,
+        text_encoder=text_encoder,
+        image_encoder=image_encoder,
+        clip_image_processor=image_processor,
+        clip_tokenizer=clip_tokenizer,
+        text_decoder=text_decoder,
+        text_tokenizer=text_tokenizer,
+        unet=unet,
+        scheduler=scheduler,
+    )
+    pipeline.save_pretrained(args.pipeline_output_path, safe_serialization=args.safe_serialization)
diff --git a/diffusers/scripts/convert_vae_diff_to_onnx.py b/diffusers/scripts/convert_vae_diff_to_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..e023e04b94973f26ff6a93b6fa3e2b7b3661b829
--- /dev/null
+++ b/diffusers/scripts/convert_vae_diff_to_onnx.py
@@ -0,0 +1,122 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from pathlib import Path
+
+import torch
+from packaging import version
+from torch.onnx import export
+
+from diffusers import AutoencoderKL
+
+
+is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11")
+
+
+def onnx_export(
+    model,
+    model_args: tuple,
+    output_path: Path,
+    ordered_input_names,
+    output_names,
+    dynamic_axes,
+    opset,
+    use_external_data_format=False,
+):
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
+    # so we check the torch version for backwards compatibility
+    if is_torch_less_than_1_11:
+        export(
+            model,
+            model_args,
+            f=output_path.as_posix(),
+            input_names=ordered_input_names,
+            output_names=output_names,
+            dynamic_axes=dynamic_axes,
+            do_constant_folding=True,
+            use_external_data_format=use_external_data_format,
+            enable_onnx_checker=True,
+            opset_version=opset,
+        )
+    else:
+        export(
+            model,
+            model_args,
+            f=output_path.as_posix(),
+            input_names=ordered_input_names,
+            output_names=output_names,
+            dynamic_axes=dynamic_axes,
+            do_constant_folding=True,
+            opset_version=opset,
+        )
+
+
+@torch.no_grad()
+def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = False):
+    dtype = torch.float16 if fp16 else torch.float32
+    if fp16 and torch.cuda.is_available():
+        device = "cuda"
+    elif fp16 and not torch.cuda.is_available():
+        raise ValueError("`float16` model export is only supported on GPUs with CUDA")
+    else:
+        device = "cpu"
+    output_path = Path(output_path)
+
+    # VAE DECODER
+    vae_decoder = AutoencoderKL.from_pretrained(model_path + "/vae")
+    vae_latent_channels = vae_decoder.config.latent_channels
+    # forward only through the decoder part
+    vae_decoder.forward = vae_decoder.decode
+    onnx_export(
+        vae_decoder,
+        model_args=(
+            torch.randn(1, vae_latent_channels, 25, 25).to(device=device, dtype=dtype),
+            False,
+        ),
+        output_path=output_path / "vae_decoder" / "model.onnx",
+        ordered_input_names=["latent_sample", "return_dict"],
+        output_names=["sample"],
+        dynamic_axes={
+            "latent_sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+        },
+        opset=opset,
+    )
+    del vae_decoder
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        required=True,
+        help="Path to the `diffusers` checkpoint to convert (either a local directory or on the Hub).",
+    )
+
+    parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
+    parser.add_argument(
+        "--opset",
+        default=14,
+        type=int,
+        help="The version of the ONNX operator set to use.",
+    )
+    parser.add_argument("--fp16", action="store_true", default=False, help="Export the models in `float16` mode")
+
+    args = parser.parse_args()
+    print(args.output_path)
+    convert_models(args.model_path, args.output_path, args.opset, args.fp16)
+    print("SD: Done: ONNX")
diff --git a/diffusers/scripts/convert_vae_pt_to_diffusers.py b/diffusers/scripts/convert_vae_pt_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4f967c94fa69bee8e8538dd7d494485889c171a
--- /dev/null
+++ b/diffusers/scripts/convert_vae_pt_to_diffusers.py
@@ -0,0 +1,159 @@
+import argparse
+import io
+
+import requests
+import torch
+import yaml
+
+from diffusers import AutoencoderKL
+from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
+    assign_to_checkpoint,
+    conv_attn_to_linear,
+    create_vae_diffusers_config,
+    renew_vae_attention_paths,
+    renew_vae_resnet_paths,
+)
+
+
+def custom_convert_ldm_vae_checkpoint(checkpoint, config):
+    vae_state_dict = checkpoint
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+def vae_pt_to_vae_diffuser(
+    checkpoint_path: str,
+    output_path: str,
+):
+    # Only support V1
+    r = requests.get(
+        " https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+    )
+    io_obj = io.BytesIO(r.content)
+
+    original_config = yaml.safe_load(io_obj)
+    image_size = 512
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if checkpoint_path.endswith("safetensors"):
+        from safetensors import safe_open
+
+        checkpoint = {}
+        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                checkpoint[key] = f.get_tensor(key)
+    else:
+        checkpoint = torch.load(checkpoint_path, map_location=device)["state_dict"]
+
+    # Convert the VAE model.
+    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+    converted_vae_checkpoint = custom_convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+    vae = AutoencoderKL(**vae_config)
+    vae.load_state_dict(converted_vae_checkpoint)
+    vae.save_pretrained(output_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--vae_pt_path", default=None, type=str, required=True, help="Path to the VAE.pt to convert.")
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the VAE.pt to convert.")
+
+    args = parser.parse_args()
+
+    vae_pt_to_vae_diffuser(args.vae_pt_path, args.dump_path)
diff --git a/diffusers/scripts/convert_versatile_diffusion_to_diffusers.py b/diffusers/scripts/convert_versatile_diffusion_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..41e2e01912094700072841aaa9ac8e85e4042dc8
--- /dev/null
+++ b/diffusers/scripts/convert_versatile_diffusion_to_diffusers.py
@@ -0,0 +1,791 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the Versatile Stable Diffusion checkpoints."""
+
+import argparse
+from argparse import Namespace
+
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+    VersatileDiffusionPipeline,
+)
+from diffusers.pipelines.versatile_diffusion.modeling_text_unet import UNetFlatConditionModel
+
+
+SCHEDULER_CONFIG = Namespace(
+    **{
+        "beta_linear_start": 0.00085,
+        "beta_linear_end": 0.012,
+        "timesteps": 1000,
+        "scale_factor": 0.18215,
+    }
+)
+
+IMAGE_UNET_CONFIG = Namespace(
+    **{
+        "input_channels": 4,
+        "model_channels": 320,
+        "output_channels": 4,
+        "num_noattn_blocks": [2, 2, 2, 2],
+        "channel_mult": [1, 2, 4, 4],
+        "with_attn": [True, True, True, False],
+        "num_heads": 8,
+        "context_dim": 768,
+        "use_checkpoint": True,
+    }
+)
+
+TEXT_UNET_CONFIG = Namespace(
+    **{
+        "input_channels": 768,
+        "model_channels": 320,
+        "output_channels": 768,
+        "num_noattn_blocks": [2, 2, 2, 2],
+        "channel_mult": [1, 2, 4, 4],
+        "second_dim": [4, 4, 4, 4],
+        "with_attn": [True, True, True, False],
+        "num_heads": 8,
+        "context_dim": 768,
+        "use_checkpoint": True,
+    }
+)
+
+AUTOENCODER_CONFIG = Namespace(
+    **{
+        "double_z": True,
+        "z_channels": 4,
+        "resolution": 256,
+        "in_channels": 3,
+        "out_ch": 3,
+        "ch": 128,
+        "ch_mult": [1, 2, 4, 4],
+        "num_res_blocks": 2,
+        "attn_resolutions": [],
+        "dropout": 0.0,
+    }
+)
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "query.weight")
+        new_item = new_item.replace("q.bias", "query.bias")
+
+        new_item = new_item.replace("k.weight", "key.weight")
+        new_item = new_item.replace("k.bias", "key.bias")
+
+        new_item = new_item.replace("v.weight", "value.weight")
+        new_item = new_item.replace("v.bias", "value.bias")
+
+        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
+        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming
+    to them. It splits attention layers, and takes into account additional replacements
+    that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        elif path["old"] in old_checkpoint:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def create_image_unet_diffusers_config(unet_params):
+    """
+    Creates a config for the diffusers based on the config of the VD model.
+    """
+
+    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if unet_params.with_attn[i] else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if unet_params.with_attn[-i - 1] else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    if not all(n == unet_params.num_noattn_blocks[0] for n in unet_params.num_noattn_blocks):
+        raise ValueError("Not all num_res_blocks are equal, which is not supported in this script.")
+
+    config = {
+        "sample_size": None,
+        "in_channels": unet_params.input_channels,
+        "out_channels": unet_params.output_channels,
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params.num_noattn_blocks[0],
+        "cross_attention_dim": unet_params.context_dim,
+        "attention_head_dim": unet_params.num_heads,
+    }
+
+    return config
+
+
+def create_text_unet_diffusers_config(unet_params):
+    """
+    Creates a config for the diffusers based on the config of the VD model.
+    """
+
+    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlockFlat" if unet_params.with_attn[i] else "DownBlockFlat"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlockFlat" if unet_params.with_attn[-i - 1] else "UpBlockFlat"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    if not all(n == unet_params.num_noattn_blocks[0] for n in unet_params.num_noattn_blocks):
+        raise ValueError("Not all num_res_blocks are equal, which is not supported in this script.")
+
+    config = {
+        "sample_size": None,
+        "in_channels": (unet_params.input_channels, 1, 1),
+        "out_channels": (unet_params.output_channels, 1, 1),
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params.num_noattn_blocks[0],
+        "cross_attention_dim": unet_params.context_dim,
+        "attention_head_dim": unet_params.num_heads,
+    }
+
+    return config
+
+
+def create_vae_diffusers_config(vae_params):
+    """
+    Creates a config for the diffusers based on the config of the VD model.
+    """
+
+    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    config = {
+        "sample_size": vae_params.resolution,
+        "in_channels": vae_params.in_channels,
+        "out_channels": vae_params.out_ch,
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "latent_channels": vae_params.z_channels,
+        "layers_per_block": vae_params.num_res_blocks,
+    }
+    return config
+
+
+def create_diffusers_scheduler(original_config):
+    schedular = DDIMScheduler(
+        num_train_timesteps=original_config.model.params.timesteps,
+        beta_start=original_config.model.params.linear_start,
+        beta_end=original_config.model.params.linear_end,
+        beta_schedule="scaled_linear",
+    )
+    return schedular
+
+
+def convert_vd_unet_checkpoint(checkpoint, config, unet_key, extract_ema=False):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100:
+        print("Checkpoint has both EMA and non-EMA weights.")
+        if extract_ema:
+            print(
+                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+            )
+            for key in keys:
+                if key.startswith("model.diffusion_model"):
+                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+        else:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+
+    for key in keys:
+        if key.startswith(unet_key):
+            unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = checkpoint["model.diffusion_model.time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = checkpoint["model.diffusion_model.time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = checkpoint["model.diffusion_model.time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = checkpoint["model.diffusion_model.time_embed.2.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+        elif f"input_blocks.{i}.0.weight" in unet_state_dict:
+            # text_unet uses linear layers in place of downsamplers
+            shape = unet_state_dict[f"input_blocks.{i}.0.weight"].shape
+            if shape[0] != shape[1]:
+                continue
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            if ["conv.weight", "conv.bias"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+            elif f"output_blocks.{i}.1.weight" in unet_state_dict:
+                # text_unet uses linear layers in place of upsamplers
+                shape = unet_state_dict[f"output_blocks.{i}.1.weight"].shape
+                if shape[0] != shape[1]:
+                    continue
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.weight"] = unet_state_dict.pop(
+                    f"output_blocks.{i}.1.weight"
+                )
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.bias"] = unet_state_dict.pop(
+                    f"output_blocks.{i}.1.bias"
+                )
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+            elif f"output_blocks.{i}.2.weight" in unet_state_dict:
+                # text_unet uses linear layers in place of upsamplers
+                shape = unet_state_dict[f"output_blocks.{i}.2.weight"].shape
+                if shape[0] != shape[1]:
+                    continue
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.weight"] = unet_state_dict.pop(
+                    f"output_blocks.{i}.2.weight"
+                )
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.bias"] = unet_state_dict.pop(
+                    f"output_blocks.{i}.2.bias"
+                )
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+def convert_vd_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    keys = list(checkpoint.keys())
+    for key in keys:
+        vae_state_dict[key] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--unet_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--vae_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--optimus_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--scheduler_type",
+        default="pndm",
+        type=str,
+        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']",
+    )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    args = parser.parse_args()
+
+    scheduler_config = SCHEDULER_CONFIG
+
+    num_train_timesteps = scheduler_config.timesteps
+    beta_start = scheduler_config.beta_linear_start
+    beta_end = scheduler_config.beta_linear_end
+    if args.scheduler_type == "pndm":
+        scheduler = PNDMScheduler(
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            beta_start=beta_start,
+            num_train_timesteps=num_train_timesteps,
+            skip_prk_steps=True,
+            steps_offset=1,
+        )
+    elif args.scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
+    elif args.scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
+    elif args.scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler(
+            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+        )
+    elif args.scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler(
+            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+        )
+    elif args.scheduler_type == "ddim":
+        scheduler = DDIMScheduler(
+            beta_start=beta_start,
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+        )
+    else:
+        raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
+
+    # Convert the UNet2DConditionModel models.
+    if args.unet_checkpoint_path is not None:
+        # image UNet
+        image_unet_config = create_image_unet_diffusers_config(IMAGE_UNET_CONFIG)
+        checkpoint = torch.load(args.unet_checkpoint_path)
+        converted_image_unet_checkpoint = convert_vd_unet_checkpoint(
+            checkpoint, image_unet_config, unet_key="model.diffusion_model.unet_image.", extract_ema=args.extract_ema
+        )
+        image_unet = UNet2DConditionModel(**image_unet_config)
+        image_unet.load_state_dict(converted_image_unet_checkpoint)
+
+        # text UNet
+        text_unet_config = create_text_unet_diffusers_config(TEXT_UNET_CONFIG)
+        converted_text_unet_checkpoint = convert_vd_unet_checkpoint(
+            checkpoint, text_unet_config, unet_key="model.diffusion_model.unet_text.", extract_ema=args.extract_ema
+        )
+        text_unet = UNetFlatConditionModel(**text_unet_config)
+        text_unet.load_state_dict(converted_text_unet_checkpoint)
+
+    # Convert the VAE model.
+    if args.vae_checkpoint_path is not None:
+        vae_config = create_vae_diffusers_config(AUTOENCODER_CONFIG)
+        checkpoint = torch.load(args.vae_checkpoint_path)
+        converted_vae_checkpoint = convert_vd_vae_checkpoint(checkpoint, vae_config)
+
+        vae = AutoencoderKL(**vae_config)
+        vae.load_state_dict(converted_vae_checkpoint)
+
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    image_feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
+    text_encoder = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+    image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+
+    pipe = VersatileDiffusionPipeline(
+        scheduler=scheduler,
+        tokenizer=tokenizer,
+        image_feature_extractor=image_feature_extractor,
+        text_encoder=text_encoder,
+        image_encoder=image_encoder,
+        image_unet=image_unet,
+        text_unet=text_unet,
+        vae=vae,
+    )
+    pipe.save_pretrained(args.dump_path)
diff --git a/diffusers/scripts/convert_vq_diffusion_to_diffusers.py b/diffusers/scripts/convert_vq_diffusion_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7da6b40949867be27f7f1d5f88e9ff03de9d8b5e
--- /dev/null
+++ b/diffusers/scripts/convert_vq_diffusion_to_diffusers.py
@@ -0,0 +1,916 @@
+"""
+This script ports models from VQ-diffusion (https://github.com/microsoft/VQ-Diffusion) to diffusers.
+
+It currently only supports porting the ITHQ dataset.
+
+ITHQ dataset:
+```sh
+# From the root directory of diffusers.
+
+# Download the VQVAE checkpoint
+$ wget https://facevcstandard.blob.core.windows.net/v-zhictang/Improved-VQ-Diffusion_model_release/ithq_vqvae.pth?sv=2020-10-02&st=2022-05-30T15%3A17%3A18Z&se=2030-05-31T15%3A17%3A00Z&sr=b&sp=r&sig=1jVavHFPpUjDs%2FTO1V3PTezaNbPp2Nx8MxiWI7y6fEY%3D -O ithq_vqvae.pth
+
+# Download the VQVAE config
+# NOTE that in VQ-diffusion the documented file is `configs/ithq.yaml` but the target class
+# `image_synthesis.modeling.codecs.image_codec.ema_vqvae.PatchVQVAE`
+# loads `OUTPUT/pretrained_model/taming_dvae/config.yaml`
+$ wget https://raw.githubusercontent.com/microsoft/VQ-Diffusion/main/OUTPUT/pretrained_model/taming_dvae/config.yaml -O ithq_vqvae.yaml
+
+# Download the main model checkpoint
+$ wget https://facevcstandard.blob.core.windows.net/v-zhictang/Improved-VQ-Diffusion_model_release/ithq_learnable.pth?sv=2020-10-02&st=2022-05-30T10%3A22%3A06Z&se=2030-05-31T10%3A22%3A00Z&sr=b&sp=r&sig=GOE%2Bza02%2FPnGxYVOOPtwrTR4RA3%2F5NVgMxdW4kjaEZ8%3D -O ithq_learnable.pth
+
+# Download the main model config
+$ wget https://raw.githubusercontent.com/microsoft/VQ-Diffusion/main/configs/ithq.yaml -O ithq.yaml
+
+# run the convert script
+$ python ./scripts/convert_vq_diffusion_to_diffusers.py \
+    --checkpoint_path ./ithq_learnable.pth \
+    --original_config_file ./ithq.yaml \
+    --vqvae_checkpoint_path ./ithq_vqvae.pth \
+    --vqvae_original_config_file ./ithq_vqvae.yaml \
+    --dump_path <path to save pre-trained `VQDiffusionPipeline`>
+```
+"""
+
+import argparse
+import tempfile
+
+import torch
+import yaml
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+from transformers import CLIPTextModel, CLIPTokenizer
+from yaml.loader import FullLoader
+
+from diffusers import Transformer2DModel, VQDiffusionPipeline, VQDiffusionScheduler, VQModel
+from diffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings
+
+
+# vqvae model
+
+PORTED_VQVAES = ["image_synthesis.modeling.codecs.image_codec.patch_vqgan.PatchVQGAN"]
+
+
+def vqvae_model_from_original_config(original_config):
+    assert (
+        original_config["target"] in PORTED_VQVAES
+    ), f"{original_config['target']} has not yet been ported to diffusers."
+
+    original_config = original_config["params"]
+
+    original_encoder_config = original_config["encoder_config"]["params"]
+    original_decoder_config = original_config["decoder_config"]["params"]
+
+    in_channels = original_encoder_config["in_channels"]
+    out_channels = original_decoder_config["out_ch"]
+
+    down_block_types = get_down_block_types(original_encoder_config)
+    up_block_types = get_up_block_types(original_decoder_config)
+
+    assert original_encoder_config["ch"] == original_decoder_config["ch"]
+    assert original_encoder_config["ch_mult"] == original_decoder_config["ch_mult"]
+    block_out_channels = tuple(
+        [original_encoder_config["ch"] * a_ch_mult for a_ch_mult in original_encoder_config["ch_mult"]]
+    )
+
+    assert original_encoder_config["num_res_blocks"] == original_decoder_config["num_res_blocks"]
+    layers_per_block = original_encoder_config["num_res_blocks"]
+
+    assert original_encoder_config["z_channels"] == original_decoder_config["z_channels"]
+    latent_channels = original_encoder_config["z_channels"]
+
+    num_vq_embeddings = original_config["n_embed"]
+
+    # Hard coded value for ResnetBlock.GoupNorm(num_groups) in VQ-diffusion
+    norm_num_groups = 32
+
+    e_dim = original_config["embed_dim"]
+
+    model = VQModel(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        down_block_types=down_block_types,
+        up_block_types=up_block_types,
+        block_out_channels=block_out_channels,
+        layers_per_block=layers_per_block,
+        latent_channels=latent_channels,
+        num_vq_embeddings=num_vq_embeddings,
+        norm_num_groups=norm_num_groups,
+        vq_embed_dim=e_dim,
+    )
+
+    return model
+
+
+def get_down_block_types(original_encoder_config):
+    attn_resolutions = coerce_attn_resolutions(original_encoder_config["attn_resolutions"])
+    num_resolutions = len(original_encoder_config["ch_mult"])
+    resolution = coerce_resolution(original_encoder_config["resolution"])
+
+    curr_res = resolution
+    down_block_types = []
+
+    for _ in range(num_resolutions):
+        if curr_res in attn_resolutions:
+            down_block_type = "AttnDownEncoderBlock2D"
+        else:
+            down_block_type = "DownEncoderBlock2D"
+
+        down_block_types.append(down_block_type)
+
+        curr_res = [r // 2 for r in curr_res]
+
+    return down_block_types
+
+
+def get_up_block_types(original_decoder_config):
+    attn_resolutions = coerce_attn_resolutions(original_decoder_config["attn_resolutions"])
+    num_resolutions = len(original_decoder_config["ch_mult"])
+    resolution = coerce_resolution(original_decoder_config["resolution"])
+
+    curr_res = [r // 2 ** (num_resolutions - 1) for r in resolution]
+    up_block_types = []
+
+    for _ in reversed(range(num_resolutions)):
+        if curr_res in attn_resolutions:
+            up_block_type = "AttnUpDecoderBlock2D"
+        else:
+            up_block_type = "UpDecoderBlock2D"
+
+        up_block_types.append(up_block_type)
+
+        curr_res = [r * 2 for r in curr_res]
+
+    return up_block_types
+
+
+def coerce_attn_resolutions(attn_resolutions):
+    attn_resolutions = list(attn_resolutions)
+    attn_resolutions_ = []
+    for ar in attn_resolutions:
+        if isinstance(ar, (list, tuple)):
+            attn_resolutions_.append(list(ar))
+        else:
+            attn_resolutions_.append([ar, ar])
+    return attn_resolutions_
+
+
+def coerce_resolution(resolution):
+    if isinstance(resolution, int):
+        resolution = [resolution, resolution]  # H, W
+    elif isinstance(resolution, (tuple, list)):
+        resolution = list(resolution)
+    else:
+        raise ValueError("Unknown type of resolution:", resolution)
+    return resolution
+
+
+# done vqvae model
+
+# vqvae checkpoint
+
+
+def vqvae_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(vqvae_encoder_to_diffusers_checkpoint(model, checkpoint))
+
+    # quant_conv
+
+    diffusers_checkpoint.update(
+        {
+            "quant_conv.weight": checkpoint["quant_conv.weight"],
+            "quant_conv.bias": checkpoint["quant_conv.bias"],
+        }
+    )
+
+    # quantize
+    diffusers_checkpoint.update({"quantize.embedding.weight": checkpoint["quantize.embedding"]})
+
+    # post_quant_conv
+    diffusers_checkpoint.update(
+        {
+            "post_quant_conv.weight": checkpoint["post_quant_conv.weight"],
+            "post_quant_conv.bias": checkpoint["post_quant_conv.bias"],
+        }
+    )
+
+    # decoder
+    diffusers_checkpoint.update(vqvae_decoder_to_diffusers_checkpoint(model, checkpoint))
+
+    return diffusers_checkpoint
+
+
+def vqvae_encoder_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    # conv_in
+    diffusers_checkpoint.update(
+        {
+            "encoder.conv_in.weight": checkpoint["encoder.conv_in.weight"],
+            "encoder.conv_in.bias": checkpoint["encoder.conv_in.bias"],
+        }
+    )
+
+    # down_blocks
+    for down_block_idx, down_block in enumerate(model.encoder.down_blocks):
+        diffusers_down_block_prefix = f"encoder.down_blocks.{down_block_idx}"
+        down_block_prefix = f"encoder.down.{down_block_idx}"
+
+        # resnets
+        for resnet_idx, resnet in enumerate(down_block.resnets):
+            diffusers_resnet_prefix = f"{diffusers_down_block_prefix}.resnets.{resnet_idx}"
+            resnet_prefix = f"{down_block_prefix}.block.{resnet_idx}"
+
+            diffusers_checkpoint.update(
+                vqvae_resnet_to_diffusers_checkpoint(
+                    resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+                )
+            )
+
+        # downsample
+
+        # do not include the downsample when on the last down block
+        # There is no downsample on the last down block
+        if down_block_idx != len(model.encoder.down_blocks) - 1:
+            # There's a single downsample in the original checkpoint but a list of downsamples
+            # in the diffusers model.
+            diffusers_downsample_prefix = f"{diffusers_down_block_prefix}.downsamplers.0.conv"
+            downsample_prefix = f"{down_block_prefix}.downsample.conv"
+            diffusers_checkpoint.update(
+                {
+                    f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"],
+                    f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"],
+                }
+            )
+
+        # attentions
+
+        if hasattr(down_block, "attentions"):
+            for attention_idx, _ in enumerate(down_block.attentions):
+                diffusers_attention_prefix = f"{diffusers_down_block_prefix}.attentions.{attention_idx}"
+                attention_prefix = f"{down_block_prefix}.attn.{attention_idx}"
+                diffusers_checkpoint.update(
+                    vqvae_attention_to_diffusers_checkpoint(
+                        checkpoint,
+                        diffusers_attention_prefix=diffusers_attention_prefix,
+                        attention_prefix=attention_prefix,
+                    )
+                )
+
+    # mid block
+
+    # mid block attentions
+
+    # There is a single hardcoded attention block in the middle of the VQ-diffusion encoder
+    diffusers_attention_prefix = "encoder.mid_block.attentions.0"
+    attention_prefix = "encoder.mid.attn_1"
+    diffusers_checkpoint.update(
+        vqvae_attention_to_diffusers_checkpoint(
+            checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
+        )
+    )
+
+    # mid block resnets
+
+    for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets):
+        diffusers_resnet_prefix = f"encoder.mid_block.resnets.{diffusers_resnet_idx}"
+
+        # the hardcoded prefixes to `block_` are 1 and 2
+        orig_resnet_idx = diffusers_resnet_idx + 1
+        # There are two hardcoded resnets in the middle of the VQ-diffusion encoder
+        resnet_prefix = f"encoder.mid.block_{orig_resnet_idx}"
+
+        diffusers_checkpoint.update(
+            vqvae_resnet_to_diffusers_checkpoint(
+                resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+            )
+        )
+
+    diffusers_checkpoint.update(
+        {
+            # conv_norm_out
+            "encoder.conv_norm_out.weight": checkpoint["encoder.norm_out.weight"],
+            "encoder.conv_norm_out.bias": checkpoint["encoder.norm_out.bias"],
+            # conv_out
+            "encoder.conv_out.weight": checkpoint["encoder.conv_out.weight"],
+            "encoder.conv_out.bias": checkpoint["encoder.conv_out.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def vqvae_decoder_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    # conv in
+    diffusers_checkpoint.update(
+        {
+            "decoder.conv_in.weight": checkpoint["decoder.conv_in.weight"],
+            "decoder.conv_in.bias": checkpoint["decoder.conv_in.bias"],
+        }
+    )
+
+    # up_blocks
+
+    for diffusers_up_block_idx, up_block in enumerate(model.decoder.up_blocks):
+        # up_blocks are stored in reverse order in the VQ-diffusion checkpoint
+        orig_up_block_idx = len(model.decoder.up_blocks) - 1 - diffusers_up_block_idx
+
+        diffusers_up_block_prefix = f"decoder.up_blocks.{diffusers_up_block_idx}"
+        up_block_prefix = f"decoder.up.{orig_up_block_idx}"
+
+        # resnets
+        for resnet_idx, resnet in enumerate(up_block.resnets):
+            diffusers_resnet_prefix = f"{diffusers_up_block_prefix}.resnets.{resnet_idx}"
+            resnet_prefix = f"{up_block_prefix}.block.{resnet_idx}"
+
+            diffusers_checkpoint.update(
+                vqvae_resnet_to_diffusers_checkpoint(
+                    resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+                )
+            )
+
+        # upsample
+
+        # there is no up sample on the last up block
+        if diffusers_up_block_idx != len(model.decoder.up_blocks) - 1:
+            # There's a single upsample in the VQ-diffusion checkpoint but a list of downsamples
+            # in the diffusers model.
+            diffusers_downsample_prefix = f"{diffusers_up_block_prefix}.upsamplers.0.conv"
+            downsample_prefix = f"{up_block_prefix}.upsample.conv"
+            diffusers_checkpoint.update(
+                {
+                    f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"],
+                    f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"],
+                }
+            )
+
+        # attentions
+
+        if hasattr(up_block, "attentions"):
+            for attention_idx, _ in enumerate(up_block.attentions):
+                diffusers_attention_prefix = f"{diffusers_up_block_prefix}.attentions.{attention_idx}"
+                attention_prefix = f"{up_block_prefix}.attn.{attention_idx}"
+                diffusers_checkpoint.update(
+                    vqvae_attention_to_diffusers_checkpoint(
+                        checkpoint,
+                        diffusers_attention_prefix=diffusers_attention_prefix,
+                        attention_prefix=attention_prefix,
+                    )
+                )
+
+    # mid block
+
+    # mid block attentions
+
+    # There is a single hardcoded attention block in the middle of the VQ-diffusion decoder
+    diffusers_attention_prefix = "decoder.mid_block.attentions.0"
+    attention_prefix = "decoder.mid.attn_1"
+    diffusers_checkpoint.update(
+        vqvae_attention_to_diffusers_checkpoint(
+            checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
+        )
+    )
+
+    # mid block resnets
+
+    for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets):
+        diffusers_resnet_prefix = f"decoder.mid_block.resnets.{diffusers_resnet_idx}"
+
+        # the hardcoded prefixes to `block_` are 1 and 2
+        orig_resnet_idx = diffusers_resnet_idx + 1
+        # There are two hardcoded resnets in the middle of the VQ-diffusion decoder
+        resnet_prefix = f"decoder.mid.block_{orig_resnet_idx}"
+
+        diffusers_checkpoint.update(
+            vqvae_resnet_to_diffusers_checkpoint(
+                resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+            )
+        )
+
+    diffusers_checkpoint.update(
+        {
+            # conv_norm_out
+            "decoder.conv_norm_out.weight": checkpoint["decoder.norm_out.weight"],
+            "decoder.conv_norm_out.bias": checkpoint["decoder.norm_out.bias"],
+            # conv_out
+            "decoder.conv_out.weight": checkpoint["decoder.conv_out.weight"],
+            "decoder.conv_out.bias": checkpoint["decoder.conv_out.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def vqvae_resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix):
+    rv = {
+        # norm1
+        f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.norm1.weight"],
+        f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.norm1.bias"],
+        # conv1
+        f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"],
+        f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"],
+        # norm2
+        f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.norm2.weight"],
+        f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.norm2.bias"],
+        # conv2
+        f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"],
+        f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"],
+    }
+
+    if resnet.conv_shortcut is not None:
+        rv.update(
+            {
+                f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"],
+                f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"],
+            }
+        )
+
+    return rv
+
+
+def vqvae_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix):
+    return {
+        # group_norm
+        f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"],
+        f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"],
+        # query
+        f"{diffusers_attention_prefix}.query.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.query.bias": checkpoint[f"{attention_prefix}.q.bias"],
+        # key
+        f"{diffusers_attention_prefix}.key.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.key.bias": checkpoint[f"{attention_prefix}.k.bias"],
+        # value
+        f"{diffusers_attention_prefix}.value.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.value.bias": checkpoint[f"{attention_prefix}.v.bias"],
+        # proj_attn
+        f"{diffusers_attention_prefix}.proj_attn.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][
+            :, :, 0, 0
+        ],
+        f"{diffusers_attention_prefix}.proj_attn.bias": checkpoint[f"{attention_prefix}.proj_out.bias"],
+    }
+
+
+# done vqvae checkpoint
+
+# transformer model
+
+PORTED_DIFFUSIONS = ["image_synthesis.modeling.transformers.diffusion_transformer.DiffusionTransformer"]
+PORTED_TRANSFORMERS = ["image_synthesis.modeling.transformers.transformer_utils.Text2ImageTransformer"]
+PORTED_CONTENT_EMBEDDINGS = ["image_synthesis.modeling.embeddings.dalle_mask_image_embedding.DalleMaskImageEmbedding"]
+
+
+def transformer_model_from_original_config(
+    original_diffusion_config, original_transformer_config, original_content_embedding_config
+):
+    assert (
+        original_diffusion_config["target"] in PORTED_DIFFUSIONS
+    ), f"{original_diffusion_config['target']} has not yet been ported to diffusers."
+    assert (
+        original_transformer_config["target"] in PORTED_TRANSFORMERS
+    ), f"{original_transformer_config['target']} has not yet been ported to diffusers."
+    assert (
+        original_content_embedding_config["target"] in PORTED_CONTENT_EMBEDDINGS
+    ), f"{original_content_embedding_config['target']} has not yet been ported to diffusers."
+
+    original_diffusion_config = original_diffusion_config["params"]
+    original_transformer_config = original_transformer_config["params"]
+    original_content_embedding_config = original_content_embedding_config["params"]
+
+    inner_dim = original_transformer_config["n_embd"]
+
+    n_heads = original_transformer_config["n_head"]
+
+    # VQ-Diffusion gives dimension of the multi-headed attention layers as the
+    # number of attention heads times the sequence length (the dimension) of a
+    # single head. We want to specify our attention blocks with those values
+    # specified separately
+    assert inner_dim % n_heads == 0
+    d_head = inner_dim // n_heads
+
+    depth = original_transformer_config["n_layer"]
+    context_dim = original_transformer_config["condition_dim"]
+
+    num_embed = original_content_embedding_config["num_embed"]
+    # the number of embeddings in the transformer includes the mask embedding.
+    # the content embedding (the vqvae) does not include the mask embedding.
+    num_embed = num_embed + 1
+
+    height = original_transformer_config["content_spatial_size"][0]
+    width = original_transformer_config["content_spatial_size"][1]
+
+    assert width == height, "width has to be equal to height"
+    dropout = original_transformer_config["resid_pdrop"]
+    num_embeds_ada_norm = original_diffusion_config["diffusion_step"]
+
+    model_kwargs = {
+        "attention_bias": True,
+        "cross_attention_dim": context_dim,
+        "attention_head_dim": d_head,
+        "num_layers": depth,
+        "dropout": dropout,
+        "num_attention_heads": n_heads,
+        "num_vector_embeds": num_embed,
+        "num_embeds_ada_norm": num_embeds_ada_norm,
+        "norm_num_groups": 32,
+        "sample_size": width,
+        "activation_fn": "geglu-approximate",
+    }
+
+    model = Transformer2DModel(**model_kwargs)
+    return model
+
+
+# done transformer model
+
+# transformer checkpoint
+
+
+def transformer_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    transformer_prefix = "transformer.transformer"
+
+    diffusers_latent_image_embedding_prefix = "latent_image_embedding"
+    latent_image_embedding_prefix = f"{transformer_prefix}.content_emb"
+
+    # DalleMaskImageEmbedding
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_latent_image_embedding_prefix}.emb.weight": checkpoint[
+                f"{latent_image_embedding_prefix}.emb.weight"
+            ],
+            f"{diffusers_latent_image_embedding_prefix}.height_emb.weight": checkpoint[
+                f"{latent_image_embedding_prefix}.height_emb.weight"
+            ],
+            f"{diffusers_latent_image_embedding_prefix}.width_emb.weight": checkpoint[
+                f"{latent_image_embedding_prefix}.width_emb.weight"
+            ],
+        }
+    )
+
+    # transformer blocks
+    for transformer_block_idx, transformer_block in enumerate(model.transformer_blocks):
+        diffusers_transformer_block_prefix = f"transformer_blocks.{transformer_block_idx}"
+        transformer_block_prefix = f"{transformer_prefix}.blocks.{transformer_block_idx}"
+
+        # ada norm block
+        diffusers_ada_norm_prefix = f"{diffusers_transformer_block_prefix}.norm1"
+        ada_norm_prefix = f"{transformer_block_prefix}.ln1"
+
+        diffusers_checkpoint.update(
+            transformer_ada_norm_to_diffusers_checkpoint(
+                checkpoint, diffusers_ada_norm_prefix=diffusers_ada_norm_prefix, ada_norm_prefix=ada_norm_prefix
+            )
+        )
+
+        # attention block
+        diffusers_attention_prefix = f"{diffusers_transformer_block_prefix}.attn1"
+        attention_prefix = f"{transformer_block_prefix}.attn1"
+
+        diffusers_checkpoint.update(
+            transformer_attention_to_diffusers_checkpoint(
+                checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
+            )
+        )
+
+        # ada norm block
+        diffusers_ada_norm_prefix = f"{diffusers_transformer_block_prefix}.norm2"
+        ada_norm_prefix = f"{transformer_block_prefix}.ln1_1"
+
+        diffusers_checkpoint.update(
+            transformer_ada_norm_to_diffusers_checkpoint(
+                checkpoint, diffusers_ada_norm_prefix=diffusers_ada_norm_prefix, ada_norm_prefix=ada_norm_prefix
+            )
+        )
+
+        # attention block
+        diffusers_attention_prefix = f"{diffusers_transformer_block_prefix}.attn2"
+        attention_prefix = f"{transformer_block_prefix}.attn2"
+
+        diffusers_checkpoint.update(
+            transformer_attention_to_diffusers_checkpoint(
+                checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
+            )
+        )
+
+        # norm block
+        diffusers_norm_block_prefix = f"{diffusers_transformer_block_prefix}.norm3"
+        norm_block_prefix = f"{transformer_block_prefix}.ln2"
+
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_norm_block_prefix}.weight": checkpoint[f"{norm_block_prefix}.weight"],
+                f"{diffusers_norm_block_prefix}.bias": checkpoint[f"{norm_block_prefix}.bias"],
+            }
+        )
+
+        # feedforward block
+        diffusers_feedforward_prefix = f"{diffusers_transformer_block_prefix}.ff"
+        feedforward_prefix = f"{transformer_block_prefix}.mlp"
+
+        diffusers_checkpoint.update(
+            transformer_feedforward_to_diffusers_checkpoint(
+                checkpoint,
+                diffusers_feedforward_prefix=diffusers_feedforward_prefix,
+                feedforward_prefix=feedforward_prefix,
+            )
+        )
+
+    # to logits
+
+    diffusers_norm_out_prefix = "norm_out"
+    norm_out_prefix = f"{transformer_prefix}.to_logits.0"
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_norm_out_prefix}.weight": checkpoint[f"{norm_out_prefix}.weight"],
+            f"{diffusers_norm_out_prefix}.bias": checkpoint[f"{norm_out_prefix}.bias"],
+        }
+    )
+
+    diffusers_out_prefix = "out"
+    out_prefix = f"{transformer_prefix}.to_logits.1"
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_out_prefix}.weight": checkpoint[f"{out_prefix}.weight"],
+            f"{diffusers_out_prefix}.bias": checkpoint[f"{out_prefix}.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def transformer_ada_norm_to_diffusers_checkpoint(checkpoint, *, diffusers_ada_norm_prefix, ada_norm_prefix):
+    return {
+        f"{diffusers_ada_norm_prefix}.emb.weight": checkpoint[f"{ada_norm_prefix}.emb.weight"],
+        f"{diffusers_ada_norm_prefix}.linear.weight": checkpoint[f"{ada_norm_prefix}.linear.weight"],
+        f"{diffusers_ada_norm_prefix}.linear.bias": checkpoint[f"{ada_norm_prefix}.linear.bias"],
+    }
+
+
+def transformer_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix):
+    return {
+        # key
+        f"{diffusers_attention_prefix}.to_k.weight": checkpoint[f"{attention_prefix}.key.weight"],
+        f"{diffusers_attention_prefix}.to_k.bias": checkpoint[f"{attention_prefix}.key.bias"],
+        # query
+        f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.query.weight"],
+        f"{diffusers_attention_prefix}.to_q.bias": checkpoint[f"{attention_prefix}.query.bias"],
+        # value
+        f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.value.weight"],
+        f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.value.bias"],
+        # linear out
+        f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj.weight"],
+        f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj.bias"],
+    }
+
+
+def transformer_feedforward_to_diffusers_checkpoint(checkpoint, *, diffusers_feedforward_prefix, feedforward_prefix):
+    return {
+        f"{diffusers_feedforward_prefix}.net.0.proj.weight": checkpoint[f"{feedforward_prefix}.0.weight"],
+        f"{diffusers_feedforward_prefix}.net.0.proj.bias": checkpoint[f"{feedforward_prefix}.0.bias"],
+        f"{diffusers_feedforward_prefix}.net.2.weight": checkpoint[f"{feedforward_prefix}.2.weight"],
+        f"{diffusers_feedforward_prefix}.net.2.bias": checkpoint[f"{feedforward_prefix}.2.bias"],
+    }
+
+
+# done transformer checkpoint
+
+
+def read_config_file(filename):
+    # The yaml file contains annotations that certain values should
+    # loaded as tuples.
+    with open(filename) as f:
+        original_config = yaml.load(f, FullLoader)
+
+    return original_config
+
+
+# We take separate arguments for the vqvae because the ITHQ vqvae config file
+# is separate from the config file for the rest of the model.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--vqvae_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the vqvae checkpoint to convert.",
+    )
+
+    parser.add_argument(
+        "--vqvae_original_config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The YAML config file corresponding to the original architecture for the vqvae.",
+    )
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+
+    parser.add_argument(
+        "--original_config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    parser.add_argument(
+        "--checkpoint_load_device",
+        default="cpu",
+        type=str,
+        required=False,
+        help="The device passed to `map_location` when loading checkpoints.",
+    )
+
+    # See link for how ema weights are always selected
+    # https://github.com/microsoft/VQ-Diffusion/blob/3c98e77f721db7c787b76304fa2c96a36c7b00af/inference_VQ_Diffusion.py#L65
+    parser.add_argument(
+        "--no_use_ema",
+        action="store_true",
+        required=False,
+        help=(
+            "Set to not use the ema weights from the original VQ-Diffusion checkpoint. You probably do not want to set"
+            " it as the original VQ-Diffusion always uses the ema weights when loading models."
+        ),
+    )
+
+    args = parser.parse_args()
+
+    use_ema = not args.no_use_ema
+
+    print(f"loading checkpoints to {args.checkpoint_load_device}")
+
+    checkpoint_map_location = torch.device(args.checkpoint_load_device)
+
+    # vqvae_model
+
+    print(f"loading vqvae, config: {args.vqvae_original_config_file}, checkpoint: {args.vqvae_checkpoint_path}")
+
+    vqvae_original_config = read_config_file(args.vqvae_original_config_file).model
+    vqvae_checkpoint = torch.load(args.vqvae_checkpoint_path, map_location=checkpoint_map_location)["model"]
+
+    with init_empty_weights():
+        vqvae_model = vqvae_model_from_original_config(vqvae_original_config)
+
+    vqvae_diffusers_checkpoint = vqvae_original_checkpoint_to_diffusers_checkpoint(vqvae_model, vqvae_checkpoint)
+
+    with tempfile.NamedTemporaryFile() as vqvae_diffusers_checkpoint_file:
+        torch.save(vqvae_diffusers_checkpoint, vqvae_diffusers_checkpoint_file.name)
+        del vqvae_diffusers_checkpoint
+        del vqvae_checkpoint
+        load_checkpoint_and_dispatch(vqvae_model, vqvae_diffusers_checkpoint_file.name, device_map="auto")
+
+    print("done loading vqvae")
+
+    # done vqvae_model
+
+    # transformer_model
+
+    print(
+        f"loading transformer, config: {args.original_config_file}, checkpoint: {args.checkpoint_path}, use ema:"
+        f" {use_ema}"
+    )
+
+    original_config = read_config_file(args.original_config_file).model
+
+    diffusion_config = original_config["params"]["diffusion_config"]
+    transformer_config = original_config["params"]["diffusion_config"]["params"]["transformer_config"]
+    content_embedding_config = original_config["params"]["diffusion_config"]["params"]["content_emb_config"]
+
+    pre_checkpoint = torch.load(args.checkpoint_path, map_location=checkpoint_map_location)
+
+    if use_ema:
+        if "ema" in pre_checkpoint:
+            checkpoint = {}
+            for k, v in pre_checkpoint["model"].items():
+                checkpoint[k] = v
+
+            for k, v in pre_checkpoint["ema"].items():
+                # The ema weights are only used on the transformer. To mimic their key as if they came
+                # from the state_dict for the top level model, we prefix with an additional "transformer."
+                # See the source linked in the args.use_ema config for more information.
+                checkpoint[f"transformer.{k}"] = v
+        else:
+            print("attempted to load ema weights but no ema weights are specified in the loaded checkpoint.")
+            checkpoint = pre_checkpoint["model"]
+    else:
+        checkpoint = pre_checkpoint["model"]
+
+    del pre_checkpoint
+
+    with init_empty_weights():
+        transformer_model = transformer_model_from_original_config(
+            diffusion_config, transformer_config, content_embedding_config
+        )
+
+    diffusers_transformer_checkpoint = transformer_original_checkpoint_to_diffusers_checkpoint(
+        transformer_model, checkpoint
+    )
+
+    # classifier free sampling embeddings interlude
+
+    # The learned embeddings are stored on the transformer in the original VQ-diffusion. We store them on a separate
+    # model, so we pull them off the checkpoint before the checkpoint is deleted.
+
+    learnable_classifier_free_sampling_embeddings = diffusion_config["params"].learnable_cf
+
+    if learnable_classifier_free_sampling_embeddings:
+        learned_classifier_free_sampling_embeddings_embeddings = checkpoint["transformer.empty_text_embed"]
+    else:
+        learned_classifier_free_sampling_embeddings_embeddings = None
+
+    # done classifier free sampling embeddings interlude
+
+    with tempfile.NamedTemporaryFile() as diffusers_transformer_checkpoint_file:
+        torch.save(diffusers_transformer_checkpoint, diffusers_transformer_checkpoint_file.name)
+        del diffusers_transformer_checkpoint
+        del checkpoint
+        load_checkpoint_and_dispatch(transformer_model, diffusers_transformer_checkpoint_file.name, device_map="auto")
+
+    print("done loading transformer")
+
+    # done transformer_model
+
+    # text encoder
+
+    print("loading CLIP text encoder")
+
+    clip_name = "openai/clip-vit-base-patch32"
+
+    # The original VQ-Diffusion specifies the pad value by the int used in the
+    # returned tokens. Each model uses `0` as the pad value. The transformers clip api
+    # specifies the pad value via the token before it has been tokenized. The `!` pad
+    # token is the same as padding with the `0` pad value.
+    pad_token = "!"
+
+    tokenizer_model = CLIPTokenizer.from_pretrained(clip_name, pad_token=pad_token, device_map="auto")
+
+    assert tokenizer_model.convert_tokens_to_ids(pad_token) == 0
+
+    text_encoder_model = CLIPTextModel.from_pretrained(
+        clip_name,
+        # `CLIPTextModel` does not support device_map="auto"
+        # device_map="auto"
+    )
+
+    print("done loading CLIP text encoder")
+
+    # done text encoder
+
+    # scheduler
+
+    scheduler_model = VQDiffusionScheduler(
+        # the scheduler has the same number of embeddings as the transformer
+        num_vec_classes=transformer_model.num_vector_embeds
+    )
+
+    # done scheduler
+
+    # learned classifier free sampling embeddings
+
+    with init_empty_weights():
+        learned_classifier_free_sampling_embeddings_model = LearnedClassifierFreeSamplingEmbeddings(
+            learnable_classifier_free_sampling_embeddings,
+            hidden_size=text_encoder_model.config.hidden_size,
+            length=tokenizer_model.model_max_length,
+        )
+
+    learned_classifier_free_sampling_checkpoint = {
+        "embeddings": learned_classifier_free_sampling_embeddings_embeddings.float()
+    }
+
+    with tempfile.NamedTemporaryFile() as learned_classifier_free_sampling_checkpoint_file:
+        torch.save(learned_classifier_free_sampling_checkpoint, learned_classifier_free_sampling_checkpoint_file.name)
+        del learned_classifier_free_sampling_checkpoint
+        del learned_classifier_free_sampling_embeddings_embeddings
+        load_checkpoint_and_dispatch(
+            learned_classifier_free_sampling_embeddings_model,
+            learned_classifier_free_sampling_checkpoint_file.name,
+            device_map="auto",
+        )
+
+    # done learned classifier free sampling embeddings
+
+    print(f"saving VQ diffusion model, path: {args.dump_path}")
+
+    pipe = VQDiffusionPipeline(
+        vqvae=vqvae_model,
+        transformer=transformer_model,
+        tokenizer=tokenizer_model,
+        text_encoder=text_encoder_model,
+        learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings_model,
+        scheduler=scheduler_model,
+    )
+    pipe.save_pretrained(args.dump_path)
+
+    print("done writing VQ diffusion model")
diff --git a/diffusers/scripts/convert_wuerstchen.py b/diffusers/scripts/convert_wuerstchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..23d45d3dd6ad4d03d89e77ab27b807bf5bb50de7
--- /dev/null
+++ b/diffusers/scripts/convert_wuerstchen.py
@@ -0,0 +1,115 @@
+# Run inside root directory of official source code: https://github.com/dome272/wuerstchen/
+import os
+
+import torch
+from transformers import AutoTokenizer, CLIPTextModel
+from vqgan import VQModel
+
+from diffusers import (
+    DDPMWuerstchenScheduler,
+    WuerstchenCombinedPipeline,
+    WuerstchenDecoderPipeline,
+    WuerstchenPriorPipeline,
+)
+from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt, WuerstchenPrior
+
+
+model_path = "models/"
+device = "cpu"
+
+paella_vqmodel = VQModel()
+state_dict = torch.load(os.path.join(model_path, "vqgan_f4_v1_500k.pt"), map_location=device)["state_dict"]
+paella_vqmodel.load_state_dict(state_dict)
+
+state_dict["vquantizer.embedding.weight"] = state_dict["vquantizer.codebook.weight"]
+state_dict.pop("vquantizer.codebook.weight")
+vqmodel = PaellaVQModel(num_vq_embeddings=paella_vqmodel.codebook_size, latent_channels=paella_vqmodel.c_latent)
+vqmodel.load_state_dict(state_dict)
+
+# Clip Text encoder and tokenizer
+text_encoder = CLIPTextModel.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+
+# Generator
+gen_text_encoder = CLIPTextModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K").to("cpu")
+gen_tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
+
+orig_state_dict = torch.load(os.path.join(model_path, "model_v2_stage_b.pt"), map_location=device)["state_dict"]
+state_dict = {}
+for key in orig_state_dict.keys():
+    if key.endswith("in_proj_weight"):
+        weights = orig_state_dict[key].chunk(3, 0)
+        state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
+        state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
+        state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
+    elif key.endswith("in_proj_bias"):
+        weights = orig_state_dict[key].chunk(3, 0)
+        state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
+        state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
+        state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
+    elif key.endswith("out_proj.weight"):
+        weights = orig_state_dict[key]
+        state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights
+    elif key.endswith("out_proj.bias"):
+        weights = orig_state_dict[key]
+        state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
+    else:
+        state_dict[key] = orig_state_dict[key]
+deocder = WuerstchenDiffNeXt()
+deocder.load_state_dict(state_dict)
+
+# Prior
+orig_state_dict = torch.load(os.path.join(model_path, "model_v3_stage_c.pt"), map_location=device)["ema_state_dict"]
+state_dict = {}
+for key in orig_state_dict.keys():
+    if key.endswith("in_proj_weight"):
+        weights = orig_state_dict[key].chunk(3, 0)
+        state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
+        state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
+        state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
+    elif key.endswith("in_proj_bias"):
+        weights = orig_state_dict[key].chunk(3, 0)
+        state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
+        state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
+        state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
+    elif key.endswith("out_proj.weight"):
+        weights = orig_state_dict[key]
+        state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights
+    elif key.endswith("out_proj.bias"):
+        weights = orig_state_dict[key]
+        state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
+    else:
+        state_dict[key] = orig_state_dict[key]
+prior_model = WuerstchenPrior(c_in=16, c=1536, c_cond=1280, c_r=64, depth=32, nhead=24).to(device)
+prior_model.load_state_dict(state_dict)
+
+# scheduler
+scheduler = DDPMWuerstchenScheduler()
+
+# Prior pipeline
+prior_pipeline = WuerstchenPriorPipeline(
+    prior=prior_model, text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler
+)
+
+prior_pipeline.save_pretrained("warp-ai/wuerstchen-prior")
+
+decoder_pipeline = WuerstchenDecoderPipeline(
+    text_encoder=gen_text_encoder, tokenizer=gen_tokenizer, vqgan=vqmodel, decoder=deocder, scheduler=scheduler
+)
+decoder_pipeline.save_pretrained("warp-ai/wuerstchen")
+
+# Wuerstchen pipeline
+wuerstchen_pipeline = WuerstchenCombinedPipeline(
+    # Decoder
+    text_encoder=gen_text_encoder,
+    tokenizer=gen_tokenizer,
+    decoder=deocder,
+    scheduler=scheduler,
+    vqgan=vqmodel,
+    # Prior
+    prior_tokenizer=tokenizer,
+    prior_text_encoder=text_encoder,
+    prior=prior_model,
+    prior_scheduler=scheduler,
+)
+wuerstchen_pipeline.save_pretrained("warp-ai/WuerstchenCombinedPipeline")
diff --git a/diffusers/scripts/convert_zero123_to_diffusers.py b/diffusers/scripts/convert_zero123_to_diffusers.py
new file mode 100644
index 0000000000000000000000000000000000000000..b46633fae7ff878d74223c6b4a9d0dabc7a08182
--- /dev/null
+++ b/diffusers/scripts/convert_zero123_to_diffusers.py
@@ -0,0 +1,807 @@
+"""
+This script modified from
+https://github.com/huggingface/diffusers/blob/bc691231360a4cbc7d19a58742ebb8ed0f05e027/scripts/convert_original_stable_diffusion_to_diffusers.py
+
+Convert original Zero1to3 checkpoint to diffusers checkpoint.
+
+# run the convert script
+$ python convert_zero123_to_diffusers.py \
+   --checkpoint_path /path/zero123/105000.ckpt \
+   --dump_path ./zero1to3 \
+   --original_config_file /path/zero123/configs/sd-objaverse-finetune-c_concat-256.yaml
+```
+"""
+
+import argparse
+
+import torch
+import yaml
+from accelerate import init_empty_weights
+from accelerate.utils import set_module_tensor_to_device
+from pipeline_zero1to3 import CCProjection, Zero1to3StableDiffusionPipeline
+from transformers import (
+    CLIPImageProcessor,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers.models import (
+    AutoencoderKL,
+    UNet2DConditionModel,
+)
+from diffusers.schedulers import DDIMScheduler
+from diffusers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    if controlnet:
+        unet_params = original_config["model"]["params"]["control_stage_config"]["params"]
+    else:
+        if (
+            "unet_config" in original_config["model"]["params"]
+            and original_config["model"]["params"]["unet_config"] is not None
+        ):
+            unet_params = original_config["model"]["params"]["unet_config"]["params"]
+        else:
+            unet_params = original_config["model"]["params"]["network_config"]["params"]
+
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+
+    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    if unet_params["transformer_depth"] is not None:
+        transformer_layers_per_block = (
+            unet_params["transformer_depth"]
+            if isinstance(unet_params["transformer_depth"], int)
+            else list(unet_params["transformer_depth"])
+        )
+    else:
+        transformer_layers_per_block = 1
+
+    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)
+
+    head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None
+    use_linear_projection = (
+        unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False
+    )
+    if use_linear_projection:
+        # stable diffusion 2-base-512 and 2-768
+        if head_dim is None:
+            head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"]
+            head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])]
+
+    class_embed_type = None
+    addition_embed_type = None
+    addition_time_embed_dim = None
+    projection_class_embeddings_input_dim = None
+    context_dim = None
+
+    if unet_params["context_dim"] is not None:
+        context_dim = (
+            unet_params["context_dim"]
+            if isinstance(unet_params["context_dim"], int)
+            else unet_params["context_dim"][0]
+        )
+
+    if "num_classes" in unet_params:
+        if unet_params["num_classes"] == "sequential":
+            if context_dim in [2048, 1280]:
+                # SDXL
+                addition_embed_type = "text_time"
+                addition_time_embed_dim = 256
+            else:
+                class_embed_type = "projection"
+            assert "adm_in_channels" in unet_params
+            projection_class_embeddings_input_dim = unet_params["adm_in_channels"]
+        else:
+            raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params['num_classes']}")
+
+    config = {
+        "sample_size": image_size // vae_scale_factor,
+        "in_channels": unet_params["in_channels"],
+        "down_block_types": tuple(down_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params["num_res_blocks"],
+        "cross_attention_dim": context_dim,
+        "attention_head_dim": head_dim,
+        "use_linear_projection": use_linear_projection,
+        "class_embed_type": class_embed_type,
+        "addition_embed_type": addition_embed_type,
+        "addition_time_embed_dim": addition_time_embed_dim,
+        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
+        "transformer_layers_per_block": transformer_layers_per_block,
+    }
+
+    if controlnet:
+        config["conditioning_channels"] = unet_params["hint_channels"]
+    else:
+        config["out_channels"] = unet_params["out_channels"]
+        config["up_block_types"] = tuple(up_block_types)
+
+    return config
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        is_attn_weight = "proj_attn.weight" in new_path or ("attentions" in new_path and "to_" in new_path)
+        shape = old_checkpoint[path["old"]].shape
+        if is_attn_weight and len(shape) == 3:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        elif is_attn_weight and len(shape) == 4:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def convert_ldm_unet_checkpoint(
+    checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False
+):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+
+    if skip_extract_state_dict:
+        unet_state_dict = checkpoint
+    else:
+        # extract state_dict for UNet
+        unet_state_dict = {}
+        keys = list(checkpoint.keys())
+
+        if controlnet:
+            unet_key = "control_model."
+        else:
+            unet_key = "model.diffusion_model."
+
+        # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+        if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+            logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
+            logger.warning(
+                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+            )
+            for key in keys:
+                if key.startswith("model.diffusion_model"):
+                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint[flat_ema_key]
+        else:
+            if sum(k.startswith("model_ema") for k in keys) > 100:
+                logger.warning(
+                    "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                    " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+                )
+
+            for key in keys:
+                if key.startswith(unet_key):
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint[key]
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    if config["class_embed_type"] is None:
+        # No parameters to port
+        ...
+    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+    else:
+        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+
+    if config["addition_embed_type"] == "text_time":
+        new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    if not controlnet:
+        new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+        new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+        new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+        new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    if controlnet:
+        # conditioning embedding
+
+        orig_index = 0
+
+        new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+
+        orig_index += 2
+
+        diffusers_index = 0
+
+        while diffusers_index < 6:
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.weight"
+            )
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.bias"
+            )
+            diffusers_index += 1
+            orig_index += 2
+
+        new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+
+        # down blocks
+        for i in range(num_input_blocks):
+            new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
+            new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
+
+        # mid block
+        new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
+        new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
+
+    return new_checkpoint
+
+
+def create_vae_diffusers_config(original_config, image_size: int):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+    _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"]
+
+    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    config = {
+        "sample_size": image_size,
+        "in_channels": vae_params["in_channels"],
+        "out_channels": vae_params["out_ch"],
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "latent_channels": vae_params["z_channels"],
+        "layers_per_block": vae_params["num_res_blocks"],
+    }
+    return config
+
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    vae_key = "first_stage_model."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "to_q.weight")
+        new_item = new_item.replace("q.bias", "to_q.bias")
+
+        new_item = new_item.replace("k.weight", "to_k.weight")
+        new_item = new_item.replace("k.bias", "to_k.bias")
+
+        new_item = new_item.replace("v.weight", "to_v.weight")
+        new_item = new_item.replace("v.bias", "to_v.bias")
+
+        new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
+        new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def convert_from_original_zero123_ckpt(checkpoint_path, original_config_file, extract_ema, device):
+    ckpt = torch.load(checkpoint_path, map_location=device)
+    ckpt["global_step"]
+    checkpoint = ckpt["state_dict"]
+    del ckpt
+    torch.cuda.empty_cache()
+
+    original_config = yaml.safe_load(original_config_file)
+    original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1]
+    num_in_channels = 8
+    original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+    prediction_type = "epsilon"
+    image_size = 256
+    num_train_timesteps = getattr(original_config["model"]["params"], "timesteps", None) or 1000
+
+    beta_start = getattr(original_config["model"]["params"], "linear_start", None) or 0.02
+    beta_end = getattr(original_config["model"]["params"], "linear_end", None) or 0.085
+    scheduler = DDIMScheduler(
+        beta_end=beta_end,
+        beta_schedule="scaled_linear",
+        beta_start=beta_start,
+        num_train_timesteps=num_train_timesteps,
+        steps_offset=1,
+        clip_sample=False,
+        set_alpha_to_one=False,
+        prediction_type=prediction_type,
+    )
+    scheduler.register_to_config(clip_sample=False)
+
+    # Convert the UNet2DConditionModel model.
+    upcast_attention = None
+    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    unet_config["upcast_attention"] = upcast_attention
+    with init_empty_weights():
+        unet = UNet2DConditionModel(**unet_config)
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=None, extract_ema=extract_ema
+    )
+    for param_name, param in converted_unet_checkpoint.items():
+        set_module_tensor_to_device(unet, param_name, "cpu", value=param)
+
+    # Convert the VAE model.
+    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+    if (
+        "model" in original_config
+        and "params" in original_config["model"]
+        and "scale_factor" in original_config["model"]["params"]
+    ):
+        vae_scaling_factor = original_config["model"]["params"]["scale_factor"]
+    else:
+        vae_scaling_factor = 0.18215  # default SD scaling factor
+
+    vae_config["scaling_factor"] = vae_scaling_factor
+
+    with init_empty_weights():
+        vae = AutoencoderKL(**vae_config)
+
+    for param_name, param in converted_vae_checkpoint.items():
+        set_module_tensor_to_device(vae, param_name, "cpu", value=param)
+
+    feature_extractor = CLIPImageProcessor.from_pretrained(
+        "lambdalabs/sd-image-variations-diffusers", subfolder="feature_extractor"
+    )
+    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+        "lambdalabs/sd-image-variations-diffusers", subfolder="image_encoder"
+    )
+
+    cc_projection = CCProjection()
+    cc_projection.load_state_dict(
+        {
+            "projection.weight": checkpoint["cc_projection.weight"].cpu(),
+            "projection.bias": checkpoint["cc_projection.bias"].cpu(),
+        }
+    )
+
+    pipe = Zero1to3StableDiffusionPipeline(
+        vae, image_encoder, unet, scheduler, None, feature_extractor, cc_projection, requires_safety_checker=False
+    )
+
+    return pipe
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--original_config_file",
+        default=None,
+        type=str,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
+    parser.add_argument(
+        "--to_safetensors",
+        action="store_true",
+        help="Whether to store pipeline in safetensors format or not.",
+    )
+    parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
+    args = parser.parse_args()
+
+    pipe = convert_from_original_zero123_ckpt(
+        checkpoint_path=args.checkpoint_path,
+        original_config_file=args.original_config_file,
+        extract_ema=args.extract_ema,
+        device=args.device,
+    )
+
+    if args.half:
+        pipe.to(dtype=torch.float16)
+
+    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
diff --git a/diffusers/scripts/generate_logits.py b/diffusers/scripts/generate_logits.py
new file mode 100644
index 0000000000000000000000000000000000000000..99d46d6628a61c7e6cc0cb662ea382716a43843f
--- /dev/null
+++ b/diffusers/scripts/generate_logits.py
@@ -0,0 +1,127 @@
+import random
+
+import torch
+from huggingface_hub import HfApi
+
+from diffusers import UNet2DModel
+
+
+api = HfApi()
+
+results = {}
+# fmt: off
+results["google_ddpm_cifar10_32"] = torch.tensor([
+    -0.7515, -1.6883, 0.2420, 0.0300, 0.6347, 1.3433, -1.1743, -3.7467,
+    1.2342, -2.2485, 0.4636, 0.8076, -0.7991, 0.3969, 0.8498, 0.9189,
+    -1.8887, -3.3522, 0.7639, 0.2040, 0.6271, -2.7148, -1.6316, 3.0839,
+    0.3186, 0.2721, -0.9759, -1.2461, 2.6257, 1.3557
+])
+results["google_ddpm_ema_bedroom_256"] = torch.tensor([
+    -2.3639, -2.5344, 0.0054, -0.6674, 1.5990, 1.0158, 0.3124, -2.1436,
+    1.8795, -2.5429, -0.1566, -0.3973, 1.2490, 2.6447, 1.2283, -0.5208,
+    -2.8154, -3.5119, 2.3838, 1.2033, 1.7201, -2.1256, -1.4576, 2.7948,
+    2.4204, -0.9752, -1.2546, 0.8027, 3.2758, 3.1365
+])
+results["CompVis_ldm_celebahq_256"] = torch.tensor([
+    -0.6531, -0.6891, -0.3172, -0.5375, -0.9140, -0.5367, -0.1175, -0.7869,
+    -0.3808, -0.4513, -0.2098, -0.0083, 0.3183, 0.5140, 0.2247, -0.1304,
+    -0.1302, -0.2802, -0.2084, -0.2025, -0.4967, -0.4873, -0.0861, 0.6925,
+    0.0250, 0.1290, -0.1543, 0.6316, 1.0460, 1.4943
+])
+results["google_ncsnpp_ffhq_1024"] = torch.tensor([
+    0.0911, 0.1107, 0.0182, 0.0435, -0.0805, -0.0608, 0.0381, 0.2172,
+    -0.0280, 0.1327, -0.0299, -0.0255, -0.0050, -0.1170, -0.1046, 0.0309,
+    0.1367, 0.1728, -0.0533, -0.0748, -0.0534, 0.1624, 0.0384, -0.1805,
+    -0.0707, 0.0642, 0.0220, -0.0134, -0.1333, -0.1505
+])
+results["google_ncsnpp_bedroom_256"] = torch.tensor([
+    0.1321, 0.1337, 0.0440, 0.0622, -0.0591, -0.0370, 0.0503, 0.2133,
+    -0.0177, 0.1415, -0.0116, -0.0112, 0.0044, -0.0980, -0.0789, 0.0395,
+    0.1502, 0.1785, -0.0488, -0.0514, -0.0404, 0.1539, 0.0454, -0.1559,
+    -0.0665, 0.0659, 0.0383, -0.0005, -0.1266, -0.1386
+])
+results["google_ncsnpp_celebahq_256"] = torch.tensor([
+    0.1154, 0.1218, 0.0307, 0.0526, -0.0711, -0.0541, 0.0366, 0.2078,
+    -0.0267, 0.1317, -0.0226, -0.0193, -0.0014, -0.1055, -0.0902, 0.0330,
+    0.1391, 0.1709, -0.0562, -0.0693, -0.0560, 0.1482, 0.0381, -0.1683,
+    -0.0681, 0.0661, 0.0331, -0.0046, -0.1268, -0.1431
+])
+results["google_ncsnpp_church_256"] = torch.tensor([
+    0.1192, 0.1240, 0.0414, 0.0606, -0.0557, -0.0412, 0.0430, 0.2042,
+    -0.0200, 0.1385, -0.0115, -0.0132, 0.0017, -0.0965, -0.0802, 0.0398,
+    0.1433, 0.1747, -0.0458, -0.0533, -0.0407, 0.1545, 0.0419, -0.1574,
+    -0.0645, 0.0626, 0.0341, -0.0010, -0.1199, -0.1390
+])
+results["google_ncsnpp_ffhq_256"] = torch.tensor([
+    0.1075, 0.1074, 0.0205, 0.0431, -0.0774, -0.0607, 0.0298, 0.2042,
+    -0.0320, 0.1267, -0.0281, -0.0250, -0.0064, -0.1091, -0.0946, 0.0290,
+    0.1328, 0.1650, -0.0580, -0.0738, -0.0586, 0.1440, 0.0337, -0.1746,
+    -0.0712, 0.0605, 0.0250, -0.0099, -0.1316, -0.1473
+])
+results["google_ddpm_cat_256"] = torch.tensor([
+    -1.4572, -2.0481, -0.0414, -0.6005, 1.4136, 0.5848, 0.4028, -2.7330,
+    1.2212, -2.1228, 0.2155, 0.4039, 0.7662, 2.0535, 0.7477, -0.3243,
+    -2.1758, -2.7648, 1.6947, 0.7026, 1.2338, -1.6078, -0.8682, 2.2810,
+    1.8574, -0.5718, -0.5586, -0.0186, 2.3415, 2.1251])
+results["google_ddpm_celebahq_256"] = torch.tensor([
+    -1.3690, -1.9720, -0.4090, -0.6966, 1.4660, 0.9938, -0.1385, -2.7324,
+    0.7736, -1.8917, 0.2923, 0.4293, 0.1693, 1.4112, 1.1887, -0.3181,
+    -2.2160, -2.6381, 1.3170, 0.8163, 0.9240, -1.6544, -0.6099, 2.5259,
+    1.6430, -0.9090, -0.9392, -0.0126, 2.4268, 2.3266
+])
+results["google_ddpm_ema_celebahq_256"] = torch.tensor([
+    -1.3525, -1.9628, -0.3956, -0.6860, 1.4664, 1.0014, -0.1259, -2.7212,
+    0.7772, -1.8811, 0.2996, 0.4388, 0.1704, 1.4029, 1.1701, -0.3027,
+    -2.2053, -2.6287, 1.3350, 0.8131, 0.9274, -1.6292, -0.6098, 2.5131,
+    1.6505, -0.8958, -0.9298, -0.0151, 2.4257, 2.3355
+])
+results["google_ddpm_church_256"] = torch.tensor([
+    -2.0585, -2.7897, -0.2850, -0.8940, 1.9052, 0.5702, 0.6345, -3.8959,
+    1.5932, -3.2319, 0.1974, 0.0287, 1.7566, 2.6543, 0.8387, -0.5351,
+    -3.2736, -4.3375, 2.9029, 1.6390, 1.4640, -2.1701, -1.9013, 2.9341,
+    3.4981, -0.6255, -1.1644, -0.1591, 3.7097, 3.2066
+])
+results["google_ddpm_bedroom_256"] = torch.tensor([
+    -2.3139, -2.5594, -0.0197, -0.6785, 1.7001, 1.1606, 0.3075, -2.1740,
+    1.8071, -2.5630, -0.0926, -0.3811, 1.2116, 2.6246, 1.2731, -0.5398,
+    -2.8153, -3.6140, 2.3893, 1.3262, 1.6258, -2.1856, -1.3267, 2.8395,
+    2.3779, -1.0623, -1.2468, 0.8959, 3.3367, 3.2243
+])
+results["google_ddpm_ema_church_256"] = torch.tensor([
+    -2.0628, -2.7667, -0.2089, -0.8263, 2.0539, 0.5992, 0.6495, -3.8336,
+    1.6025, -3.2817, 0.1721, -0.0633, 1.7516, 2.7039, 0.8100, -0.5908,
+    -3.2113, -4.4343, 2.9257, 1.3632, 1.5562, -2.1489, -1.9894, 3.0560,
+    3.3396, -0.7328, -1.0417, 0.0383, 3.7093, 3.2343
+])
+results["google_ddpm_ema_cat_256"] = torch.tensor([
+    -1.4574, -2.0569, -0.0473, -0.6117, 1.4018, 0.5769, 0.4129, -2.7344,
+    1.2241, -2.1397, 0.2000, 0.3937, 0.7616, 2.0453, 0.7324, -0.3391,
+    -2.1746, -2.7744, 1.6963, 0.6921, 1.2187, -1.6172, -0.8877, 2.2439,
+    1.8471, -0.5839, -0.5605, -0.0464, 2.3250, 2.1219
+])
+# fmt: on
+
+models = api.list_models(filter="diffusers")
+for mod in models:
+    if "google" in mod.author or mod.id == "CompVis/ldm-celebahq-256":
+        local_checkpoint = "/home/patrick/google_checkpoints/" + mod.id.split("/")[-1]
+
+        print(f"Started running {mod.id}!!!")
+
+        if mod.id.startswith("CompVis"):
+            model = UNet2DModel.from_pretrained(local_checkpoint, subfolder="unet")
+        else:
+            model = UNet2DModel.from_pretrained(local_checkpoint)
+
+        torch.manual_seed(0)
+        random.seed(0)
+
+        noise = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+        time_step = torch.tensor([10] * noise.shape[0])
+        with torch.no_grad():
+            logits = model(noise, time_step).sample
+
+        assert torch.allclose(
+            logits[0, 0, 0, :30], results["_".join("_".join(mod.id.split("/")).split("-"))], atol=1e-3
+        )
+        print(f"{mod.id} has passed successfully!!!")
diff --git a/diffusers/setup.py b/diffusers/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..89e18cab629bff6b0a44de7d374e47bf5eb3cf00
--- /dev/null
+++ b/diffusers/setup.py
@@ -0,0 +1,286 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/main/setup.py
+
+To create the package for PyPI.
+
+1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the
+   documentation.
+
+   If releasing on a special branch, copy the updated README.md on the main branch for the commit you will make
+   for the post-release and run `make fix-copies` on the main branch as well.
+
+2. Unpin specific versions from setup.py that use a git install.
+
+3. Checkout the release branch (v<RELEASE>-release, for example v4.19-release), and commit these changes with the
+   message: "Release: <RELEASE>" and push.
+
+4. Manually trigger the "Nightly and release tests on main/release branch" workflow from the release branch. Wait for
+   the tests to complete. We can safely ignore the known test failures.
+
+5. Wait for the tests on main to be completed and be green (otherwise revert and fix bugs).
+
+6. Add a tag in git to mark the release: "git tag v<RELEASE> -m 'Adds tag v<RELEASE> for PyPI'"
+   Push the tag to git: git push --tags origin v<RELEASE>-release
+
+7. Build both the sources and the wheel. Do not change anything in setup.py between
+   creating the wheel and the source distribution (obviously).
+
+   For the wheel, run: "python setup.py bdist_wheel" in the top level directory
+   (This will build a wheel for the Python version you use to build it).
+
+   For the sources, run: "python setup.py sdist"
+   You should now have a /dist directory with both .whl and .tar.gz source versions.
+
+   Long story cut short, you need to run both before you can upload the distribution to the
+   test PyPI and the actual PyPI servers:
+
+   python setup.py bdist_wheel && python setup.py sdist
+
+8. Check that everything looks correct by uploading the package to the PyPI test server:
+
+   twine upload dist/* -r pypitest
+   (pypi suggests using twine as other methods upload files via plaintext.)
+   You may have to specify the repository url, use the following command then:
+   twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
+
+   Check that you can install it in a virtualenv by running:
+   pip install -i https://testpypi.python.org/pypi diffusers
+
+   If you are testing from a Colab Notebook, for instance, then do:
+   pip install diffusers && pip uninstall diffusers
+   pip install -i https://testpypi.python.org/pypi diffusers
+
+   Check you can run the following commands:
+   python -c "from diffusers import __version__; print(__version__)"
+   python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('fusing/unet-ldm-dummy-update'); pipe()"
+   python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('hf-internal-testing/tiny-stable-diffusion-pipe', safety_checker=None); pipe('ah suh du')"
+   python -c "from diffusers import *"
+
+9. Upload the final version to the actual PyPI:
+   twine upload dist/* -r pypi
+
+10. Prepare the release notes and publish them on GitHub once everything is looking hunky-dory. You can use the following
+    Space to fetch all the commits applicable for the release: https://huggingface.co/spaces/lysandre/github-release. Repo should
+    be `huggingface/diffusers`. `tag` should be the previous release tag (v0.26.1, for example), and `branch` should be
+    the latest release branch (v0.27.0-release, for example). It denotes all commits that have happened on branch
+    v0.27.0-release after the tag v0.26.1 was created.
+
+11. Run `make post-release` (or, for a patch release, `make post-patch`). If you were on a branch for the release,
+    you need to go back to main before executing this.
+"""
+
+import os
+import re
+import sys
+
+from setuptools import Command, find_packages, setup
+
+
+# IMPORTANT:
+# 1. all dependencies should be listed here with their version requirements if any
+# 2. once modified, run: `make deps_table_update` to update src/diffusers/dependency_versions_table.py
+_deps = [
+    "Pillow",  # keep the PIL.Image.Resampling deprecation away
+    "accelerate>=0.31.0",
+    "compel==0.1.8",
+    "datasets",
+    "filelock",
+    "flax>=0.4.1",
+    "hf-doc-builder>=0.3.0",
+    "huggingface-hub>=0.23.2",
+    "requests-mock==1.10.0",
+    "importlib_metadata",
+    "invisible-watermark>=0.2.0",
+    "isort>=5.5.4",
+    "jax>=0.4.1",
+    "jaxlib>=0.4.1",
+    "Jinja2",
+    "k-diffusion>=0.0.12",
+    "torchsde",
+    "note_seq",
+    "librosa",
+    "numpy",
+    "parameterized",
+    "peft>=0.6.0",
+    "protobuf>=3.20.3,<4",
+    "pytest",
+    "pytest-timeout",
+    "pytest-xdist",
+    "python>=3.8.0",
+    "ruff==0.1.5",
+    "safetensors>=0.3.1",
+    "sentencepiece>=0.1.91,!=0.1.92",
+    "GitPython<3.1.19",
+    "scipy",
+    "onnx",
+    "regex!=2019.12.17",
+    "requests",
+    "tensorboard",
+    "torch>=1.4",
+    "torchvision",
+    "transformers>=4.41.2",
+    "urllib3<=2.0.0",
+    "black",
+]
+
+# this is a lookup table with items like:
+#
+# tokenizers: "huggingface-hub==0.8.0"
+# packaging: "packaging"
+#
+# some of the values are versioned whereas others aren't.
+deps = {b: a for a, b in (re.findall(r"^(([^!=<>~]+)(?:[!=<>~].*)?$)", x)[0] for x in _deps)}
+
+# since we save this data in src/diffusers/dependency_versions_table.py it can be easily accessed from
+# anywhere. If you need to quickly access the data from this table in a shell, you can do so easily with:
+#
+# python -c 'import sys; from diffusers.dependency_versions_table import deps; \
+# print(" ".join([deps[x] for x in sys.argv[1:]]))' tokenizers datasets
+#
+# Just pass the desired package names to that script as it's shown with 2 packages above.
+#
+# If diffusers is not yet installed and the work is done from the cloned repo remember to add `PYTHONPATH=src` to the script above
+#
+# You can then feed this for example to `pip`:
+#
+# pip install -U $(python -c 'import sys; from diffusers.dependency_versions_table import deps; \
+# print(" ".join([deps[x] for x in sys.argv[1:]]))' tokenizers datasets)
+#
+
+
+def deps_list(*pkgs):
+    return [deps[pkg] for pkg in pkgs]
+
+
+class DepsTableUpdateCommand(Command):
+    """
+    A custom command that updates the dependency table.
+    usage: python setup.py deps_table_update
+    """
+
+    description = "build runtime dependency table"
+    user_options = [
+        # format: (long option, short option, description).
+        (
+            "dep-table-update",
+            None,
+            "updates src/diffusers/dependency_versions_table.py",
+        ),
+    ]
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+    def run(self):
+        entries = "\n".join([f'    "{k}": "{v}",' for k, v in deps.items()])
+        content = [
+            "# THIS FILE HAS BEEN AUTOGENERATED. To update:",
+            "# 1. modify the `_deps` dict in setup.py",
+            "# 2. run `make deps_table_update`",
+            "deps = {",
+            entries,
+            "}",
+            "",
+        ]
+        target = "src/diffusers/dependency_versions_table.py"
+        print(f"updating {target}")
+        with open(target, "w", encoding="utf-8", newline="\n") as f:
+            f.write("\n".join(content))
+
+
+extras = {}
+extras["quality"] = deps_list("urllib3", "isort", "ruff", "hf-doc-builder")
+extras["docs"] = deps_list("hf-doc-builder")
+extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft")
+extras["test"] = deps_list(
+    "compel",
+    "GitPython",
+    "datasets",
+    "Jinja2",
+    "invisible-watermark",
+    "k-diffusion",
+    "librosa",
+    "parameterized",
+    "pytest",
+    "pytest-timeout",
+    "pytest-xdist",
+    "requests-mock",
+    "safetensors",
+    "sentencepiece",
+    "scipy",
+    "torchvision",
+    "transformers",
+)
+extras["torch"] = deps_list("torch", "accelerate")
+
+if os.name == "nt":  # windows
+    extras["flax"] = []  # jax is not supported on windows
+else:
+    extras["flax"] = deps_list("jax", "jaxlib", "flax")
+
+extras["dev"] = (
+    extras["quality"] + extras["test"] + extras["training"] + extras["docs"] + extras["torch"] + extras["flax"]
+)
+
+install_requires = [
+    deps["importlib_metadata"],
+    deps["filelock"],
+    deps["huggingface-hub"],
+    deps["numpy"],
+    deps["regex"],
+    deps["requests"],
+    deps["safetensors"],
+    deps["Pillow"],
+]
+
+version_range_max = max(sys.version_info[1], 10) + 1
+
+setup(
+    name="diffusers",
+    version="0.31.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    description="State-of-the-art diffusion in PyTorch and JAX.",
+    long_description=open("README.md", "r", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    keywords="deep learning diffusion jax pytorch stable diffusion audioldm",
+    license="Apache 2.0 License",
+    author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/diffusers/graphs/contributors)",
+    author_email="diffusers@huggingface.co",
+    url="https://github.com/huggingface/diffusers",
+    package_dir={"": "src"},
+    packages=find_packages("src"),
+    package_data={"diffusers": ["py.typed"]},
+    include_package_data=True,
+    python_requires=">=3.8.0",
+    install_requires=list(install_requires),
+    extras_require=extras,
+    entry_points={"console_scripts": ["diffusers-cli=diffusers.commands.diffusers_cli:main"]},
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Education",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Programming Language :: Python :: 3",
+    ]
+    + [f"Programming Language :: Python :: 3.{i}" for i in range(8, version_range_max)],
+    cmdclass={"deps_table_update": DepsTableUpdateCommand},
+)
diff --git a/diffusers/src/diffusers.egg-info/PKG-INFO b/diffusers/src/diffusers.egg-info/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..02aaec059cf201c00f06aad6fa7725cfd83ffd13
--- /dev/null
+++ b/diffusers/src/diffusers.egg-info/PKG-INFO
@@ -0,0 +1,359 @@
+Metadata-Version: 2.4
+Name: diffusers
+Version: 0.31.0.dev0
+Summary: State-of-the-art diffusion in PyTorch and JAX.
+Home-page: https://github.com/huggingface/diffusers
+Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/diffusers/graphs/contributors)
+Author-email: diffusers@huggingface.co
+License: Apache 2.0 License
+Keywords: deep learning diffusion jax pytorch stable diffusion audioldm
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Requires-Python: >=3.8.0
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: importlib_metadata
+Requires-Dist: filelock
+Requires-Dist: huggingface-hub>=0.23.2
+Requires-Dist: numpy
+Requires-Dist: regex!=2019.12.17
+Requires-Dist: requests
+Requires-Dist: safetensors>=0.3.1
+Requires-Dist: Pillow
+Provides-Extra: quality
+Requires-Dist: urllib3<=2.0.0; extra == "quality"
+Requires-Dist: isort>=5.5.4; extra == "quality"
+Requires-Dist: ruff==0.1.5; extra == "quality"
+Requires-Dist: hf-doc-builder>=0.3.0; extra == "quality"
+Provides-Extra: docs
+Requires-Dist: hf-doc-builder>=0.3.0; extra == "docs"
+Provides-Extra: training
+Requires-Dist: accelerate>=0.31.0; extra == "training"
+Requires-Dist: datasets; extra == "training"
+Requires-Dist: protobuf<4,>=3.20.3; extra == "training"
+Requires-Dist: tensorboard; extra == "training"
+Requires-Dist: Jinja2; extra == "training"
+Requires-Dist: peft>=0.6.0; extra == "training"
+Provides-Extra: test
+Requires-Dist: compel==0.1.8; extra == "test"
+Requires-Dist: GitPython<3.1.19; extra == "test"
+Requires-Dist: datasets; extra == "test"
+Requires-Dist: Jinja2; extra == "test"
+Requires-Dist: invisible-watermark>=0.2.0; extra == "test"
+Requires-Dist: k-diffusion>=0.0.12; extra == "test"
+Requires-Dist: librosa; extra == "test"
+Requires-Dist: parameterized; extra == "test"
+Requires-Dist: pytest; extra == "test"
+Requires-Dist: pytest-timeout; extra == "test"
+Requires-Dist: pytest-xdist; extra == "test"
+Requires-Dist: requests-mock==1.10.0; extra == "test"
+Requires-Dist: safetensors>=0.3.1; extra == "test"
+Requires-Dist: sentencepiece!=0.1.92,>=0.1.91; extra == "test"
+Requires-Dist: scipy; extra == "test"
+Requires-Dist: torchvision; extra == "test"
+Requires-Dist: transformers>=4.41.2; extra == "test"
+Provides-Extra: torch
+Requires-Dist: torch>=1.4; extra == "torch"
+Requires-Dist: accelerate>=0.31.0; extra == "torch"
+Provides-Extra: flax
+Requires-Dist: jax>=0.4.1; extra == "flax"
+Requires-Dist: jaxlib>=0.4.1; extra == "flax"
+Requires-Dist: flax>=0.4.1; extra == "flax"
+Provides-Extra: dev
+Requires-Dist: urllib3<=2.0.0; extra == "dev"
+Requires-Dist: isort>=5.5.4; extra == "dev"
+Requires-Dist: ruff==0.1.5; extra == "dev"
+Requires-Dist: hf-doc-builder>=0.3.0; extra == "dev"
+Requires-Dist: compel==0.1.8; extra == "dev"
+Requires-Dist: GitPython<3.1.19; extra == "dev"
+Requires-Dist: datasets; extra == "dev"
+Requires-Dist: Jinja2; extra == "dev"
+Requires-Dist: invisible-watermark>=0.2.0; extra == "dev"
+Requires-Dist: k-diffusion>=0.0.12; extra == "dev"
+Requires-Dist: librosa; extra == "dev"
+Requires-Dist: parameterized; extra == "dev"
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-timeout; extra == "dev"
+Requires-Dist: pytest-xdist; extra == "dev"
+Requires-Dist: requests-mock==1.10.0; extra == "dev"
+Requires-Dist: safetensors>=0.3.1; extra == "dev"
+Requires-Dist: sentencepiece!=0.1.92,>=0.1.91; extra == "dev"
+Requires-Dist: scipy; extra == "dev"
+Requires-Dist: torchvision; extra == "dev"
+Requires-Dist: transformers>=4.41.2; extra == "dev"
+Requires-Dist: accelerate>=0.31.0; extra == "dev"
+Requires-Dist: datasets; extra == "dev"
+Requires-Dist: protobuf<4,>=3.20.3; extra == "dev"
+Requires-Dist: tensorboard; extra == "dev"
+Requires-Dist: Jinja2; extra == "dev"
+Requires-Dist: peft>=0.6.0; extra == "dev"
+Requires-Dist: hf-doc-builder>=0.3.0; extra == "dev"
+Requires-Dist: torch>=1.4; extra == "dev"
+Requires-Dist: accelerate>=0.31.0; extra == "dev"
+Requires-Dist: jax>=0.4.1; extra == "dev"
+Requires-Dist: jaxlib>=0.4.1; extra == "dev"
+Requires-Dist: flax>=0.4.1; extra == "dev"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+<!---
+Copyright 2022 - The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<p align="center">
+    <br>
+    <img src="https://raw.githubusercontent.com/huggingface/diffusers/main/docs/source/en/imgs/diffusers_library.jpg" width="400"/>
+    <br>
+<p>
+<p align="center">
+    <a href="https://github.com/huggingface/diffusers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/datasets.svg?color=blue"></a>
+    <a href="https://github.com/huggingface/diffusers/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/diffusers.svg"></a>
+    <a href="https://pepy.tech/project/diffusers"><img alt="GitHub release" src="https://static.pepy.tech/badge/diffusers/month"></a>
+    <a href="CODE_OF_CONDUCT.md"><img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg"></a>
+    <a href="https://twitter.com/diffuserslib"><img alt="X account" src="https://img.shields.io/twitter/url/https/twitter.com/diffuserslib.svg?style=social&label=Follow%20%40diffuserslib"></a>
+</p>
+
+🤗 Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you're looking for a simple inference solution or training your own diffusion models, 🤗 Diffusers is a modular toolbox that supports both. Our library is designed with a focus on [usability over performance](https://huggingface.co/docs/diffusers/conceptual/philosophy#usability-over-performance), [simple over easy](https://huggingface.co/docs/diffusers/conceptual/philosophy#simple-over-easy), and [customizability over abstractions](https://huggingface.co/docs/diffusers/conceptual/philosophy#tweakable-contributorfriendly-over-abstraction).
+
+🤗 Diffusers offers three core components:
+
+- State-of-the-art [diffusion pipelines](https://huggingface.co/docs/diffusers/api/pipelines/overview) that can be run in inference with just a few lines of code.
+- Interchangeable noise [schedulers](https://huggingface.co/docs/diffusers/api/schedulers/overview) for different diffusion speeds and output quality.
+- Pretrained [models](https://huggingface.co/docs/diffusers/api/models/overview) that can be used as building blocks, and combined with schedulers, for creating your own end-to-end diffusion systems.
+
+## Installation
+
+We recommend installing 🤗 Diffusers in a virtual environment from PyPI or Conda. For more details about installing [PyTorch](https://pytorch.org/get-started/locally/) and [Flax](https://flax.readthedocs.io/en/latest/#installation), please refer to their official documentation.
+
+### PyTorch
+
+With `pip` (official package):
+
+```bash
+pip install --upgrade diffusers[torch]
+```
+
+With `conda` (maintained by the community):
+
+```sh
+conda install -c conda-forge diffusers
+```
+
+### Flax
+
+With `pip` (official package):
+
+```bash
+pip install --upgrade diffusers[flax]
+```
+
+### Apple Silicon (M1/M2) support
+
+Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggingface.co/docs/diffusers/optimization/mps) guide.
+
+## Quickstart
+
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 30,000+ checkpoints):
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
+pipeline.to("cuda")
+pipeline("An image of a squirrel in Picasso style").images[0]
+```
+
+You can also dig into the models and schedulers toolbox to build your own diffusion system:
+
+```python
+from diffusers import DDPMScheduler, UNet2DModel
+from PIL import Image
+import torch
+
+scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256")
+model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda")
+scheduler.set_timesteps(50)
+
+sample_size = model.config.sample_size
+noise = torch.randn((1, 3, sample_size, sample_size), device="cuda")
+input = noise
+
+for t in scheduler.timesteps:
+    with torch.no_grad():
+        noisy_residual = model(input, t).sample
+        prev_noisy_sample = scheduler.step(noisy_residual, t, input).prev_sample
+        input = prev_noisy_sample
+
+image = (input / 2 + 0.5).clamp(0, 1)
+image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
+image = Image.fromarray((image * 255).round().astype("uint8"))
+image
+```
+
+Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to launch your diffusion journey today!
+
+## How to navigate the documentation
+
+| **Documentation**                                                   | **What can I learn?**                                                                                                                                                                           |
+|---------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [Tutorial](https://huggingface.co/docs/diffusers/tutorials/tutorial_overview)                                                            | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model.  |
+| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading_overview)                                                             | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers.                                         |
+| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/pipeline_overview)                                             | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library.               |
+| [Optimization](https://huggingface.co/docs/diffusers/optimization/opt_overview)                                                        | Guides for how to optimize your diffusion model to run faster and consume less memory.                                                                                                          |
+| [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques.                                                                                               |
+## Contribution
+
+We ❤️  contributions from the open-source community!
+If you want to contribute to this library, please check out our [Contribution guide](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md).
+You can look out for [issues](https://github.com/huggingface/diffusers/issues) you'd like to tackle to contribute to the library.
+- See [Good first issues](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) for general opportunities to contribute
+- See [New model/pipeline](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22) to contribute exciting new diffusion models / diffusion pipelines
+- See [New scheduler](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22)
+
+Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz98XR"><img alt="Join us on Discord" src="https://img.shields.io/discord/823813159592001537?color=5865F2&logo=discord&logoColor=white"></a>. We discuss the hottest trends about diffusion models, help each other with contributions, personal projects or just hang out ☕.
+
+
+## Popular Tasks & Pipelines
+
+<table>
+  <tr>
+    <th>Task</th>
+    <th>Pipeline</th>
+    <th>🤗 Hub</th>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Unconditional Image Generation</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/ddpm"> DDPM </a></td>
+    <td><a href="https://huggingface.co/google/ddpm-ema-church-256"> google/ddpm-ema-church-256 </a></td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Text-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img">Stable Diffusion Text-to-Image</a></td>
+      <td><a href="https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5"> stable-diffusion-v1-5/stable-diffusion-v1-5 </a></td>
+  </tr>
+  <tr>
+    <td>Text-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/unclip">unCLIP</a></td>
+      <td><a href="https://huggingface.co/kakaobrain/karlo-v1-alpha"> kakaobrain/karlo-v1-alpha </a></td>
+  </tr>
+  <tr>
+    <td>Text-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/deepfloyd_if">DeepFloyd IF</a></td>
+      <td><a href="https://huggingface.co/DeepFloyd/IF-I-XL-v1.0"> DeepFloyd/IF-I-XL-v1.0 </a></td>
+  </tr>
+  <tr>
+    <td>Text-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/kandinsky">Kandinsky</a></td>
+      <td><a href="https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder"> kandinsky-community/kandinsky-2-2-decoder </a></td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Text-guided Image-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/controlnet">ControlNet</a></td>
+      <td><a href="https://huggingface.co/lllyasviel/sd-controlnet-canny"> lllyasviel/sd-controlnet-canny </a></td>
+  </tr>
+  <tr>
+    <td>Text-guided Image-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/pix2pix">InstructPix2Pix</a></td>
+      <td><a href="https://huggingface.co/timbrooks/instruct-pix2pix"> timbrooks/instruct-pix2pix </a></td>
+  </tr>
+  <tr>
+    <td>Text-guided Image-to-Image</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img">Stable Diffusion Image-to-Image</a></td>
+      <td><a href="https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5"> stable-diffusion-v1-5/stable-diffusion-v1-5 </a></td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Text-guided Image Inpainting</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint">Stable Diffusion Inpainting</a></td>
+      <td><a href="https://huggingface.co/runwayml/stable-diffusion-inpainting"> runwayml/stable-diffusion-inpainting </a></td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Image Variation</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/image_variation">Stable Diffusion Image Variation</a></td>
+      <td><a href="https://huggingface.co/lambdalabs/sd-image-variations-diffusers"> lambdalabs/sd-image-variations-diffusers </a></td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td>Super Resolution</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/upscale">Stable Diffusion Upscale</a></td>
+      <td><a href="https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler"> stabilityai/stable-diffusion-x4-upscaler </a></td>
+  </tr>
+  <tr>
+    <td>Super Resolution</td>
+    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_upscale">Stable Diffusion Latent Upscale</a></td>
+      <td><a href="https://huggingface.co/stabilityai/sd-x2-latent-upscaler"> stabilityai/sd-x2-latent-upscaler </a></td>
+  </tr>
+</table>
+
+## Popular libraries using 🧨 Diffusers
+
+- https://github.com/microsoft/TaskMatrix
+- https://github.com/invoke-ai/InvokeAI
+- https://github.com/InstantID/InstantID
+- https://github.com/apple/ml-stable-diffusion
+- https://github.com/Sanster/lama-cleaner
+- https://github.com/IDEA-Research/Grounded-Segment-Anything
+- https://github.com/ashawkey/stable-dreamfusion
+- https://github.com/deep-floyd/IF
+- https://github.com/bentoml/BentoML
+- https://github.com/bmaltais/kohya_ss
+- +14,000 other amazing GitHub repositories 💪
+
+Thank you for using us ❤️.
+
+## Credits
+
+This library concretizes previous work by many different authors and would not have been possible without their great research and implementations. We'd like to thank, in particular, the following implementations which have helped us in our development and without which the API could not have been as polished today:
+
+- @CompVis' latent diffusion models library, available [here](https://github.com/CompVis/latent-diffusion)
+- @hojonathanho original DDPM implementation, available [here](https://github.com/hojonathanho/diffusion) as well as the extremely useful translation into PyTorch by @pesser, available [here](https://github.com/pesser/pytorch_diffusion)
+- @ermongroup's DDIM implementation, available [here](https://github.com/ermongroup/ddim)
+- @yang-song's Score-VE and Score-VP implementations, available [here](https://github.com/yang-song/score_sde_pytorch)
+
+We also want to thank @heejkoo for the very helpful overview of papers, code and resources on diffusion models, available [here](https://github.com/heejkoo/Awesome-Diffusion-Models) as well as @crowsonkb and @rromb for useful discussions and insights.
+
+## Citation
+
+```bibtex
+@misc{von-platen-etal-2022-diffusers,
+  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Dhruv Nair and Sayak Paul and William Berman and Yiyi Xu and Steven Liu and Thomas Wolf},
+  title = {Diffusers: State-of-the-art diffusion models},
+  year = {2022},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/huggingface/diffusers}}
+}
+```
diff --git a/diffusers/src/diffusers.egg-info/SOURCES.txt b/diffusers/src/diffusers.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..37fb61daf159e255b31532d334e18a9fbad3384e
--- /dev/null
+++ b/diffusers/src/diffusers.egg-info/SOURCES.txt
@@ -0,0 +1,498 @@
+LICENSE
+MANIFEST.in
+README.md
+pyproject.toml
+setup.py
+src/diffusers/__init__.py
+src/diffusers/callbacks.py
+src/diffusers/configuration_utils.py
+src/diffusers/dependency_versions_check.py
+src/diffusers/dependency_versions_table.py
+src/diffusers/image_processor.py
+src/diffusers/optimization.py
+src/diffusers/py.typed
+src/diffusers/training_utils.py
+src/diffusers/video_processor.py
+src/diffusers.egg-info/PKG-INFO
+src/diffusers.egg-info/SOURCES.txt
+src/diffusers.egg-info/dependency_links.txt
+src/diffusers.egg-info/entry_points.txt
+src/diffusers.egg-info/requires.txt
+src/diffusers.egg-info/top_level.txt
+src/diffusers/commands/__init__.py
+src/diffusers/commands/diffusers_cli.py
+src/diffusers/commands/env.py
+src/diffusers/commands/fp16_safetensors.py
+src/diffusers/experimental/__init__.py
+src/diffusers/experimental/rl/__init__.py
+src/diffusers/experimental/rl/value_guided_sampling.py
+src/diffusers/loaders/__init__.py
+src/diffusers/loaders/ip_adapter.py
+src/diffusers/loaders/lora_base.py
+src/diffusers/loaders/lora_conversion_utils.py
+src/diffusers/loaders/lora_pipeline.py
+src/diffusers/loaders/peft.py
+src/diffusers/loaders/single_file.py
+src/diffusers/loaders/single_file_model.py
+src/diffusers/loaders/single_file_utils.py
+src/diffusers/loaders/textual_inversion.py
+src/diffusers/loaders/unet.py
+src/diffusers/loaders/unet_loader_utils.py
+src/diffusers/loaders/utils.py
+src/diffusers/models/__init__.py
+src/diffusers/models/activations.py
+src/diffusers/models/adapter.py
+src/diffusers/models/attention.py
+src/diffusers/models/attention_flax.py
+src/diffusers/models/attention_processor.py
+src/diffusers/models/colorguider_sd.py
+src/diffusers/models/controlnet.py
+src/diffusers/models/controlnet_flax.py
+src/diffusers/models/controlnet_flux.py
+src/diffusers/models/controlnet_hunyuan.py
+src/diffusers/models/controlnet_sd3.py
+src/diffusers/models/controlnet_sparsectrl.py
+src/diffusers/models/controlnet_xs.py
+src/diffusers/models/downsampling.py
+src/diffusers/models/embeddings.py
+src/diffusers/models/embeddings_flax.py
+src/diffusers/models/lora.py
+src/diffusers/models/model_loading_utils.py
+src/diffusers/models/modeling_flax_pytorch_utils.py
+src/diffusers/models/modeling_flax_utils.py
+src/diffusers/models/modeling_outputs.py
+src/diffusers/models/modeling_pytorch_flax_utils.py
+src/diffusers/models/modeling_utils.py
+src/diffusers/models/normalization.py
+src/diffusers/models/resnet.py
+src/diffusers/models/resnet_flax.py
+src/diffusers/models/upsampling.py
+src/diffusers/models/vae_flax.py
+src/diffusers/models/vq_model.py
+src/diffusers/models/autoencoders/__init__.py
+src/diffusers/models/autoencoders/autoencoder_asym_kl.py
+src/diffusers/models/autoencoders/autoencoder_kl.py
+src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
+src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
+src/diffusers/models/autoencoders/autoencoder_oobleck.py
+src/diffusers/models/autoencoders/autoencoder_tiny.py
+src/diffusers/models/autoencoders/consistency_decoder_vae.py
+src/diffusers/models/autoencoders/vae.py
+src/diffusers/models/autoencoders/vq_model.py
+src/diffusers/models/transformers/__init__.py
+src/diffusers/models/transformers/auraflow_transformer_2d.py
+src/diffusers/models/transformers/causal_sparse_dit.py
+src/diffusers/models/transformers/causal_sparse_dit_control.py
+src/diffusers/models/transformers/cogvideox_transformer_3d.py
+src/diffusers/models/transformers/colorguider_pixart.py
+src/diffusers/models/transformers/dit_transformer_2d.py
+src/diffusers/models/transformers/dual_transformer_2d.py
+src/diffusers/models/transformers/hunyuan_transformer_2d.py
+src/diffusers/models/transformers/latte_transformer_3d.py
+src/diffusers/models/transformers/lumina_nextdit2d.py
+src/diffusers/models/transformers/pixart_transformer_2d.py
+src/diffusers/models/transformers/prior_transformer.py
+src/diffusers/models/transformers/stable_audio_transformer.py
+src/diffusers/models/transformers/t5_film_transformer.py
+src/diffusers/models/transformers/transformer_2d.py
+src/diffusers/models/transformers/transformer_flux.py
+src/diffusers/models/transformers/transformer_sd3.py
+src/diffusers/models/transformers/transformer_temporal.py
+src/diffusers/models/unets/__init__.py
+src/diffusers/models/unets/unet_1d.py
+src/diffusers/models/unets/unet_1d_blocks.py
+src/diffusers/models/unets/unet_2d.py
+src/diffusers/models/unets/unet_2d_blocks.py
+src/diffusers/models/unets/unet_2d_blocks_flax.py
+src/diffusers/models/unets/unet_2d_condition.py
+src/diffusers/models/unets/unet_2d_condition_flax.py
+src/diffusers/models/unets/unet_3d_blocks.py
+src/diffusers/models/unets/unet_3d_condition.py
+src/diffusers/models/unets/unet_i2vgen_xl.py
+src/diffusers/models/unets/unet_kandinsky3.py
+src/diffusers/models/unets/unet_motion_model.py
+src/diffusers/models/unets/unet_spatio_temporal_condition.py
+src/diffusers/models/unets/unet_stable_cascade.py
+src/diffusers/models/unets/uvit_2d.py
+src/diffusers/pipelines/__init__.py
+src/diffusers/pipelines/auto_pipeline.py
+src/diffusers/pipelines/free_init_utils.py
+src/diffusers/pipelines/free_noise_utils.py
+src/diffusers/pipelines/onnx_utils.py
+src/diffusers/pipelines/pipeline_flax_utils.py
+src/diffusers/pipelines/pipeline_loading_utils.py
+src/diffusers/pipelines/pipeline_utils.py
+src/diffusers/pipelines/amused/__init__.py
+src/diffusers/pipelines/amused/pipeline_amused.py
+src/diffusers/pipelines/amused/pipeline_amused_img2img.py
+src/diffusers/pipelines/amused/pipeline_amused_inpaint.py
+src/diffusers/pipelines/animatediff/__init__.py
+src/diffusers/pipelines/animatediff/pipeline_animatediff.py
+src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py
+src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py
+src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py
+src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py
+src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py
+src/diffusers/pipelines/animatediff/pipeline_output.py
+src/diffusers/pipelines/audioldm/__init__.py
+src/diffusers/pipelines/audioldm/pipeline_audioldm.py
+src/diffusers/pipelines/audioldm2/__init__.py
+src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
+src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+src/diffusers/pipelines/aura_flow/__init__.py
+src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py
+src/diffusers/pipelines/blip_diffusion/__init__.py
+src/diffusers/pipelines/blip_diffusion/blip_image_processing.py
+src/diffusers/pipelines/blip_diffusion/modeling_blip2.py
+src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py
+src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py
+src/diffusers/pipelines/cobra/__init__.py
+src/diffusers/pipelines/cobra/pipeline_cobra_pixart.py
+src/diffusers/pipelines/cogvideo/__init__.py
+src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py
+src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py
+src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py
+src/diffusers/pipelines/cogvideo/pipeline_output.py
+src/diffusers/pipelines/colorflow/__init__.py
+src/diffusers/pipelines/colorflow/pipeline_colorflow_pixart.py
+src/diffusers/pipelines/colorflow/pipeline_colorflow_sd.py
+src/diffusers/pipelines/consistency_models/__init__.py
+src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py
+src/diffusers/pipelines/controlnet/__init__.py
+src/diffusers/pipelines/controlnet/multicontrolnet.py
+src/diffusers/pipelines/controlnet/pipeline_controlnet.py
+src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py
+src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
+src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
+src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
+src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
+src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py
+src/diffusers/pipelines/controlnet_hunyuandit/__init__.py
+src/diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py
+src/diffusers/pipelines/controlnet_sd3/__init__.py
+src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py
+src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py
+src/diffusers/pipelines/controlnet_xs/__init__.py
+src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py
+src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
+src/diffusers/pipelines/dance_diffusion/__init__.py
+src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
+src/diffusers/pipelines/ddim/__init__.py
+src/diffusers/pipelines/ddim/pipeline_ddim.py
+src/diffusers/pipelines/ddpm/__init__.py
+src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+src/diffusers/pipelines/deepfloyd_if/__init__.py
+src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
+src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
+src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
+src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
+src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
+src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
+src/diffusers/pipelines/deepfloyd_if/pipeline_output.py
+src/diffusers/pipelines/deepfloyd_if/safety_checker.py
+src/diffusers/pipelines/deepfloyd_if/timesteps.py
+src/diffusers/pipelines/deepfloyd_if/watermark.py
+src/diffusers/pipelines/deprecated/__init__.py
+src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py
+src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py
+src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py
+src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py
+src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py
+src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py
+src/diffusers/pipelines/deprecated/audio_diffusion/mel.py
+src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py
+src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py
+src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
+src/diffusers/pipelines/deprecated/pndm/__init__.py
+src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py
+src/diffusers/pipelines/deprecated/repaint/__init__.py
+src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py
+src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py
+src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py
+src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py
+src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py
+src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py
+src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py
+src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
+src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py
+src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py
+src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py
+src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py
+src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py
+src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py
+src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py
+src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py
+src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
+src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py
+src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
+src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py
+src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
+src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
+src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
+src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py
+src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py
+src/diffusers/pipelines/dit/__init__.py
+src/diffusers/pipelines/dit/pipeline_dit.py
+src/diffusers/pipelines/flux/__init__.py
+src/diffusers/pipelines/flux/pipeline_flux.py
+src/diffusers/pipelines/flux/pipeline_flux_controlnet.py
+src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py
+src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py
+src/diffusers/pipelines/flux/pipeline_flux_img2img.py
+src/diffusers/pipelines/flux/pipeline_flux_inpaint.py
+src/diffusers/pipelines/flux/pipeline_output.py
+src/diffusers/pipelines/hunyuandit/__init__.py
+src/diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py
+src/diffusers/pipelines/i2vgen_xl/__init__.py
+src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py
+src/diffusers/pipelines/kandinsky/__init__.py
+src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py
+src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py
+src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py
+src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py
+src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py
+src/diffusers/pipelines/kandinsky/text_encoder.py
+src/diffusers/pipelines/kandinsky2_2/__init__.py
+src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py
+src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
+src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py
+src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py
+src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py
+src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py
+src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py
+src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py
+src/diffusers/pipelines/kandinsky3/__init__.py
+src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py
+src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py
+src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py
+src/diffusers/pipelines/kolors/__init__.py
+src/diffusers/pipelines/kolors/pipeline_kolors.py
+src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py
+src/diffusers/pipelines/kolors/pipeline_output.py
+src/diffusers/pipelines/kolors/text_encoder.py
+src/diffusers/pipelines/kolors/tokenizer.py
+src/diffusers/pipelines/latent_consistency_models/__init__.py
+src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
+src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
+src/diffusers/pipelines/latent_diffusion/__init__.py
+src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
+src/diffusers/pipelines/latte/__init__.py
+src/diffusers/pipelines/latte/pipeline_latte.py
+src/diffusers/pipelines/ledits_pp/__init__.py
+src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py
+src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
+src/diffusers/pipelines/ledits_pp/pipeline_output.py
+src/diffusers/pipelines/lumina/__init__.py
+src/diffusers/pipelines/lumina/pipeline_lumina.py
+src/diffusers/pipelines/marigold/__init__.py
+src/diffusers/pipelines/marigold/marigold_image_processing.py
+src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
+src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
+src/diffusers/pipelines/musicldm/__init__.py
+src/diffusers/pipelines/musicldm/pipeline_musicldm.py
+src/diffusers/pipelines/pag/__init__.py
+src/diffusers/pipelines/pag/pag_utils.py
+src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py
+src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py
+src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
+src/diffusers/pipelines/pag/pipeline_pag_hunyuandit.py
+src/diffusers/pipelines/pag/pipeline_pag_kolors.py
+src/diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py
+src/diffusers/pipelines/pag/pipeline_pag_sd.py
+src/diffusers/pipelines/pag/pipeline_pag_sd_3.py
+src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py
+src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py
+src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py
+src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
+src/diffusers/pipelines/paint_by_example/__init__.py
+src/diffusers/pipelines/paint_by_example/image_encoder.py
+src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
+src/diffusers/pipelines/pia/__init__.py
+src/diffusers/pipelines/pia/pipeline_pia.py
+src/diffusers/pipelines/pixart_alpha/__init__.py
+src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
+src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
+src/diffusers/pipelines/semantic_stable_diffusion/__init__.py
+src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py
+src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+src/diffusers/pipelines/shap_e/__init__.py
+src/diffusers/pipelines/shap_e/camera.py
+src/diffusers/pipelines/shap_e/pipeline_shap_e.py
+src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
+src/diffusers/pipelines/shap_e/renderer.py
+src/diffusers/pipelines/stable_audio/__init__.py
+src/diffusers/pipelines/stable_audio/modeling_stable_audio.py
+src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py
+src/diffusers/pipelines/stable_cascade/__init__.py
+src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py
+src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py
+src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py
+src/diffusers/pipelines/stable_diffusion/__init__.py
+src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py
+src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py
+src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py
+src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
+src/diffusers/pipelines/stable_diffusion/pipeline_output.py
+src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
+src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
+src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
+src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
+src/diffusers/pipelines/stable_diffusion/safety_checker.py
+src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
+src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py
+src/diffusers/pipelines/stable_diffusion_3/__init__.py
+src/diffusers/pipelines/stable_diffusion_3/pipeline_output.py
+src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
+src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py
+src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py
+src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py
+src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py
+src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py
+src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py
+src/diffusers/pipelines/stable_diffusion_gligen/__init__.py
+src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
+src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
+src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py
+src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
+src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
+src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py
+src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
+src/diffusers/pipelines/stable_diffusion_panorama/__init__.py
+src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py
+src/diffusers/pipelines/stable_diffusion_safe/__init__.py
+src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py
+src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py
+src/diffusers/pipelines/stable_diffusion_sag/__init__.py
+src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py
+src/diffusers/pipelines/stable_diffusion_xl/__init__.py
+src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py
+src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py
+src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
+src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
+src/diffusers/pipelines/stable_diffusion_xl/watermark.py
+src/diffusers/pipelines/stable_video_diffusion/__init__.py
+src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
+src/diffusers/pipelines/t2i_adapter/__init__.py
+src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
+src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
+src/diffusers/pipelines/text_to_video_synthesis/__init__.py
+src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py
+src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
+src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
+src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
+src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
+src/diffusers/pipelines/unclip/__init__.py
+src/diffusers/pipelines/unclip/pipeline_unclip.py
+src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py
+src/diffusers/pipelines/unclip/text_proj.py
+src/diffusers/pipelines/unidiffuser/__init__.py
+src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py
+src/diffusers/pipelines/unidiffuser/modeling_uvit.py
+src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
+src/diffusers/pipelines/wuerstchen/__init__.py
+src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py
+src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py
+src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py
+src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py
+src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py
+src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py
+src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py
+src/diffusers/schedulers/__init__.py
+src/diffusers/schedulers/scheduling_amused.py
+src/diffusers/schedulers/scheduling_consistency_decoder.py
+src/diffusers/schedulers/scheduling_consistency_models.py
+src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py
+src/diffusers/schedulers/scheduling_ddim.py
+src/diffusers/schedulers/scheduling_ddim_cogvideox.py
+src/diffusers/schedulers/scheduling_ddim_flax.py
+src/diffusers/schedulers/scheduling_ddim_inverse.py
+src/diffusers/schedulers/scheduling_ddim_parallel.py
+src/diffusers/schedulers/scheduling_ddpm.py
+src/diffusers/schedulers/scheduling_ddpm_flax.py
+src/diffusers/schedulers/scheduling_ddpm_parallel.py
+src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py
+src/diffusers/schedulers/scheduling_deis_multistep.py
+src/diffusers/schedulers/scheduling_dpm_cogvideox.py
+src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
+src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py
+src/diffusers/schedulers/scheduling_dpmsolver_sde.py
+src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
+src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
+src/diffusers/schedulers/scheduling_edm_euler.py
+src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
+src/diffusers/schedulers/scheduling_euler_discrete.py
+src/diffusers/schedulers/scheduling_euler_discrete_flax.py
+src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
+src/diffusers/schedulers/scheduling_flow_match_heun_discrete.py
+src/diffusers/schedulers/scheduling_heun_discrete.py
+src/diffusers/schedulers/scheduling_ipndm.py
+src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py
+src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py
+src/diffusers/schedulers/scheduling_karras_ve_flax.py
+src/diffusers/schedulers/scheduling_lcm.py
+src/diffusers/schedulers/scheduling_lms_discrete.py
+src/diffusers/schedulers/scheduling_lms_discrete_flax.py
+src/diffusers/schedulers/scheduling_pndm.py
+src/diffusers/schedulers/scheduling_pndm_flax.py
+src/diffusers/schedulers/scheduling_repaint.py
+src/diffusers/schedulers/scheduling_sasolver.py
+src/diffusers/schedulers/scheduling_sde_ve.py
+src/diffusers/schedulers/scheduling_sde_ve_flax.py
+src/diffusers/schedulers/scheduling_tcd.py
+src/diffusers/schedulers/scheduling_unclip.py
+src/diffusers/schedulers/scheduling_unipc_multistep.py
+src/diffusers/schedulers/scheduling_utils.py
+src/diffusers/schedulers/scheduling_utils_flax.py
+src/diffusers/schedulers/scheduling_vq_diffusion.py
+src/diffusers/schedulers/deprecated/__init__.py
+src/diffusers/schedulers/deprecated/scheduling_karras_ve.py
+src/diffusers/schedulers/deprecated/scheduling_sde_vp.py
+src/diffusers/utils/__init__.py
+src/diffusers/utils/accelerate_utils.py
+src/diffusers/utils/constants.py
+src/diffusers/utils/deprecation_utils.py
+src/diffusers/utils/doc_utils.py
+src/diffusers/utils/dummy_flax_and_transformers_objects.py
+src/diffusers/utils/dummy_flax_objects.py
+src/diffusers/utils/dummy_note_seq_objects.py
+src/diffusers/utils/dummy_onnx_objects.py
+src/diffusers/utils/dummy_pt_objects.py
+src/diffusers/utils/dummy_torch_and_librosa_objects.py
+src/diffusers/utils/dummy_torch_and_scipy_objects.py
+src/diffusers/utils/dummy_torch_and_torchsde_objects.py
+src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py
+src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
+src/diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py
+src/diffusers/utils/dummy_torch_and_transformers_objects.py
+src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py
+src/diffusers/utils/dynamic_modules_utils.py
+src/diffusers/utils/export_utils.py
+src/diffusers/utils/hub_utils.py
+src/diffusers/utils/import_utils.py
+src/diffusers/utils/loading_utils.py
+src/diffusers/utils/logging.py
+src/diffusers/utils/model_card_template.md
+src/diffusers/utils/outputs.py
+src/diffusers/utils/peft_utils.py
+src/diffusers/utils/pil_utils.py
+src/diffusers/utils/state_dict_utils.py
+src/diffusers/utils/testing_utils.py
+src/diffusers/utils/torch_utils.py
+src/diffusers/utils/versions.py
\ No newline at end of file
diff --git a/diffusers/src/diffusers.egg-info/dependency_links.txt b/diffusers/src/diffusers.egg-info/dependency_links.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/diffusers/src/diffusers.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/diffusers/src/diffusers.egg-info/entry_points.txt b/diffusers/src/diffusers.egg-info/entry_points.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4d8cdefa4968bbed7a122fa7a34940bf9a15360b
--- /dev/null
+++ b/diffusers/src/diffusers.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+[console_scripts]
+diffusers-cli = diffusers.commands.diffusers_cli:main
diff --git a/diffusers/src/diffusers.egg-info/requires.txt b/diffusers/src/diffusers.egg-info/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9663bbe8b6c7596458141b3ce2fcc3f5f27cc284
--- /dev/null
+++ b/diffusers/src/diffusers.egg-info/requires.txt
@@ -0,0 +1,84 @@
+importlib_metadata
+filelock
+huggingface-hub>=0.23.2
+numpy
+regex!=2019.12.17
+requests
+safetensors>=0.3.1
+Pillow
+
+[dev]
+urllib3<=2.0.0
+isort>=5.5.4
+ruff==0.1.5
+hf-doc-builder>=0.3.0
+compel==0.1.8
+GitPython<3.1.19
+datasets
+Jinja2
+invisible-watermark>=0.2.0
+k-diffusion>=0.0.12
+librosa
+parameterized
+pytest
+pytest-timeout
+pytest-xdist
+requests-mock==1.10.0
+safetensors>=0.3.1
+sentencepiece!=0.1.92,>=0.1.91
+scipy
+torchvision
+transformers>=4.41.2
+accelerate>=0.31.0
+protobuf<4,>=3.20.3
+tensorboard
+peft>=0.6.0
+torch>=1.4
+jax>=0.4.1
+jaxlib>=0.4.1
+flax>=0.4.1
+
+[docs]
+hf-doc-builder>=0.3.0
+
+[flax]
+jax>=0.4.1
+jaxlib>=0.4.1
+flax>=0.4.1
+
+[quality]
+urllib3<=2.0.0
+isort>=5.5.4
+ruff==0.1.5
+hf-doc-builder>=0.3.0
+
+[test]
+compel==0.1.8
+GitPython<3.1.19
+datasets
+Jinja2
+invisible-watermark>=0.2.0
+k-diffusion>=0.0.12
+librosa
+parameterized
+pytest
+pytest-timeout
+pytest-xdist
+requests-mock==1.10.0
+safetensors>=0.3.1
+sentencepiece!=0.1.92,>=0.1.91
+scipy
+torchvision
+transformers>=4.41.2
+
+[torch]
+torch>=1.4
+accelerate>=0.31.0
+
+[training]
+accelerate>=0.31.0
+datasets
+protobuf<4,>=3.20.3
+tensorboard
+Jinja2
+peft>=0.6.0
diff --git a/diffusers/src/diffusers.egg-info/top_level.txt b/diffusers/src/diffusers.egg-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6033efb6dbaaff9bc81792fd75a6b39d9f195aeb
--- /dev/null
+++ b/diffusers/src/diffusers.egg-info/top_level.txt
@@ -0,0 +1 @@
+diffusers
diff --git a/diffusers/src/diffusers/__init__.py b/diffusers/src/diffusers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6aeae3f42399d75d12a732a59174474c4adab08
--- /dev/null
+++ b/diffusers/src/diffusers/__init__.py
@@ -0,0 +1,954 @@
+__version__ = "0.31.0.dev0"
+
+from typing import TYPE_CHECKING
+
+from .utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_flax_available,
+    is_k_diffusion_available,
+    is_librosa_available,
+    is_note_seq_available,
+    is_onnx_available,
+    is_scipy_available,
+    is_sentencepiece_available,
+    is_torch_available,
+    is_torchsde_available,
+    is_transformers_available,
+)
+
+
+# Lazy Import based on
+# https://github.com/huggingface/transformers/blob/main/src/transformers/__init__.py
+
+# When adding a new object to this init, please add it to `_import_structure`. The `_import_structure` is a dictionary submodule to list of object names,
+# and is used to defer the actual importing for when the objects are requested.
+# This way `import diffusers` provides the names in the namespace without actually importing anything (and especially none of the backends).
+
+_import_structure = {
+    "configuration_utils": ["ConfigMixin"],
+    "loaders": ["FromOriginalModelMixin"],
+    "models": [],
+    "pipelines": [],
+    "schedulers": [],
+    "utils": [
+        "OptionalDependencyNotAvailable",
+        "is_flax_available",
+        "is_inflect_available",
+        "is_invisible_watermark_available",
+        "is_k_diffusion_available",
+        "is_k_diffusion_version",
+        "is_librosa_available",
+        "is_note_seq_available",
+        "is_onnx_available",
+        "is_scipy_available",
+        "is_torch_available",
+        "is_torchsde_available",
+        "is_transformers_available",
+        "is_transformers_version",
+        "is_unidecode_available",
+        "logging",
+    ],
+}
+
+try:
+    if not is_onnx_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_onnx_objects  # noqa F403
+
+    _import_structure["utils.dummy_onnx_objects"] = [
+        name for name in dir(dummy_onnx_objects) if not name.startswith("_")
+    ]
+
+else:
+    _import_structure["pipelines"].extend(["OnnxRuntimeModel"])
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_pt_objects  # noqa F403
+
+    _import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")]
+
+else:
+    _import_structure["models"].extend(
+        [
+            "AsymmetricAutoencoderKL",
+            "AuraFlowTransformer2DModel",
+            "AutoencoderKL",
+            "AutoencoderKLCogVideoX",
+            "AutoencoderKLTemporalDecoder",
+            "AutoencoderOobleck",
+            "AutoencoderTiny",
+            "CogVideoXTransformer3DModel",
+            "ConsistencyDecoderVAE",
+            "ControlNetModel",
+            "ColorGuiderSDModel",
+            "ControlNetXSAdapter",
+            "DiTTransformer2DModel",
+            "FluxControlNetModel",
+            "FluxMultiControlNetModel",
+            "FluxTransformer2DModel",
+            "HunyuanDiT2DControlNetModel",
+            "HunyuanDiT2DModel",
+            "HunyuanDiT2DMultiControlNetModel",
+            "I2VGenXLUNet",
+            "Kandinsky3UNet",
+            "LatteTransformer3DModel",
+            "LuminaNextDiT2DModel",
+            "ModelMixin",
+            "MotionAdapter",
+            "MultiAdapter",
+            "PixArtTransformer2DModel",
+            "ColorGuiderPixArtModel",
+            "CausalSparseDiTModel",
+            "CausalSparseDiTControlModel",
+            "PriorTransformer",
+            "SD3ControlNetModel",
+            "SD3MultiControlNetModel",
+            "SD3Transformer2DModel",
+            "SparseControlNetModel",
+            "StableAudioDiTModel",
+            "StableCascadeUNet",
+            "T2IAdapter",
+            "T5FilmDecoder",
+            "Transformer2DModel",
+            "UNet1DModel",
+            "UNet2DConditionModel",
+            "UNet2DModel",
+            "UNet3DConditionModel",
+            "UNetControlNetXSModel",
+            "UNetMotionModel",
+            "UNetSpatioTemporalConditionModel",
+            "UVit2DModel",
+            "VQModel",
+        ]
+    )
+
+    _import_structure["optimization"] = [
+        "get_constant_schedule",
+        "get_constant_schedule_with_warmup",
+        "get_cosine_schedule_with_warmup",
+        "get_cosine_with_hard_restarts_schedule_with_warmup",
+        "get_linear_schedule_with_warmup",
+        "get_polynomial_decay_schedule_with_warmup",
+        "get_scheduler",
+    ]
+    _import_structure["pipelines"].extend(
+        [
+            "AudioPipelineOutput",
+            "AutoPipelineForImage2Image",
+            "AutoPipelineForInpainting",
+            "AutoPipelineForText2Image",
+            "ConsistencyModelPipeline",
+            "DanceDiffusionPipeline",
+            "DDIMPipeline",
+            "DDPMPipeline",
+            "DiffusionPipeline",
+            "DiTPipeline",
+            "ImagePipelineOutput",
+            "KarrasVePipeline",
+            "LDMPipeline",
+            "LDMSuperResolutionPipeline",
+            "PNDMPipeline",
+            "RePaintPipeline",
+            "ScoreSdeVePipeline",
+            "StableDiffusionMixin",
+        ]
+    )
+    _import_structure["schedulers"].extend(
+        [
+            "AmusedScheduler",
+            "CMStochasticIterativeScheduler",
+            "CogVideoXDDIMScheduler",
+            "CogVideoXDPMScheduler",
+            "DDIMInverseScheduler",
+            "DDIMParallelScheduler",
+            "DDIMScheduler",
+            "DDPMParallelScheduler",
+            "DDPMScheduler",
+            "DDPMWuerstchenScheduler",
+            "DEISMultistepScheduler",
+            "DPMSolverMultistepInverseScheduler",
+            "DPMSolverMultistepScheduler",
+            "DPMSolverSinglestepScheduler",
+            "EDMDPMSolverMultistepScheduler",
+            "EDMEulerScheduler",
+            "EulerAncestralDiscreteScheduler",
+            "EulerDiscreteScheduler",
+            "FlowMatchEulerDiscreteScheduler",
+            "FlowMatchHeunDiscreteScheduler",
+            "HeunDiscreteScheduler",
+            "IPNDMScheduler",
+            "KarrasVeScheduler",
+            "KDPM2AncestralDiscreteScheduler",
+            "KDPM2DiscreteScheduler",
+            "LCMScheduler",
+            "PNDMScheduler",
+            "RePaintScheduler",
+            "SASolverScheduler",
+            "SchedulerMixin",
+            "ScoreSdeVeScheduler",
+            "TCDScheduler",
+            "UnCLIPScheduler",
+            "UniPCMultistepScheduler",
+            "VQDiffusionScheduler",
+        ]
+    )
+    _import_structure["training_utils"] = ["EMAModel"]
+
+try:
+    if not (is_torch_available() and is_scipy_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_torch_and_scipy_objects  # noqa F403
+
+    _import_structure["utils.dummy_torch_and_scipy_objects"] = [
+        name for name in dir(dummy_torch_and_scipy_objects) if not name.startswith("_")
+    ]
+
+else:
+    _import_structure["schedulers"].extend(["LMSDiscreteScheduler"])
+
+try:
+    if not (is_torch_available() and is_torchsde_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_torch_and_torchsde_objects  # noqa F403
+
+    _import_structure["utils.dummy_torch_and_torchsde_objects"] = [
+        name for name in dir(dummy_torch_and_torchsde_objects) if not name.startswith("_")
+    ]
+
+else:
+    _import_structure["schedulers"].extend(["CosineDPMSolverMultistepScheduler", "DPMSolverSDEScheduler"])
+
+try:
+    if not (is_torch_available() and is_transformers_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _import_structure["utils.dummy_torch_and_transformers_objects"] = [
+        name for name in dir(dummy_torch_and_transformers_objects) if not name.startswith("_")
+    ]
+
+else:
+    _import_structure["pipelines"].extend(
+        [
+            "AltDiffusionImg2ImgPipeline",
+            "AltDiffusionPipeline",
+            "AmusedImg2ImgPipeline",
+            "AmusedInpaintPipeline",
+            "AmusedPipeline",
+            "AnimateDiffControlNetPipeline",
+            "AnimateDiffPAGPipeline",
+            "AnimateDiffPipeline",
+            "AnimateDiffSDXLPipeline",
+            "AnimateDiffSparseControlNetPipeline",
+            "AnimateDiffVideoToVideoControlNetPipeline",
+            "AnimateDiffVideoToVideoPipeline",
+            "AudioLDM2Pipeline",
+            "AudioLDM2ProjectionModel",
+            "AudioLDM2UNet2DConditionModel",
+            "AudioLDMPipeline",
+            "AuraFlowPipeline",
+            "BlipDiffusionControlNetPipeline",
+            "BlipDiffusionPipeline",
+            "CLIPImageProjection",
+            "CogVideoXImageToVideoPipeline",
+            "CogVideoXPipeline",
+            "CogVideoXVideoToVideoPipeline",
+            "CycleDiffusionPipeline",
+            "FluxControlNetImg2ImgPipeline",
+            "FluxControlNetInpaintPipeline",
+            "FluxControlNetPipeline",
+            "FluxImg2ImgPipeline",
+            "FluxInpaintPipeline",
+            "FluxPipeline",
+            "HunyuanDiTControlNetPipeline",
+            "HunyuanDiTPAGPipeline",
+            "HunyuanDiTPipeline",
+            "I2VGenXLPipeline",
+            "IFImg2ImgPipeline",
+            "IFImg2ImgSuperResolutionPipeline",
+            "IFInpaintingPipeline",
+            "IFInpaintingSuperResolutionPipeline",
+            "IFPipeline",
+            "IFSuperResolutionPipeline",
+            "ImageTextPipelineOutput",
+            "Kandinsky3Img2ImgPipeline",
+            "Kandinsky3Pipeline",
+            "KandinskyCombinedPipeline",
+            "KandinskyImg2ImgCombinedPipeline",
+            "KandinskyImg2ImgPipeline",
+            "KandinskyInpaintCombinedPipeline",
+            "KandinskyInpaintPipeline",
+            "KandinskyPipeline",
+            "KandinskyPriorPipeline",
+            "KandinskyV22CombinedPipeline",
+            "KandinskyV22ControlnetImg2ImgPipeline",
+            "KandinskyV22ControlnetPipeline",
+            "KandinskyV22Img2ImgCombinedPipeline",
+            "KandinskyV22Img2ImgPipeline",
+            "KandinskyV22InpaintCombinedPipeline",
+            "KandinskyV22InpaintPipeline",
+            "KandinskyV22Pipeline",
+            "KandinskyV22PriorEmb2EmbPipeline",
+            "KandinskyV22PriorPipeline",
+            "LatentConsistencyModelImg2ImgPipeline",
+            "LatentConsistencyModelPipeline",
+            "LattePipeline",
+            "LDMTextToImagePipeline",
+            "LEditsPPPipelineStableDiffusion",
+            "LEditsPPPipelineStableDiffusionXL",
+            "LuminaText2ImgPipeline",
+            "MarigoldDepthPipeline",
+            "MarigoldNormalsPipeline",
+            "MusicLDMPipeline",
+            "PaintByExamplePipeline",
+            "PIAPipeline",
+            "PixArtAlphaPipeline",
+            "ColorFlowPixArtAlphaPipeline",
+            "CobraPixArtAlphaPipeline",
+            "ColorFlowSDPipeline",
+            "PixArtSigmaPAGPipeline",
+            "PixArtSigmaPipeline",
+            "SemanticStableDiffusionPipeline",
+            "ShapEImg2ImgPipeline",
+            "ShapEPipeline",
+            "StableAudioPipeline",
+            "StableAudioProjectionModel",
+            "StableCascadeCombinedPipeline",
+            "StableCascadeDecoderPipeline",
+            "StableCascadePriorPipeline",
+            "StableDiffusion3ControlNetInpaintingPipeline",
+            "StableDiffusion3ControlNetPipeline",
+            "StableDiffusion3Img2ImgPipeline",
+            "StableDiffusion3InpaintPipeline",
+            "StableDiffusion3PAGPipeline",
+            "StableDiffusion3Pipeline",
+            "StableDiffusionAdapterPipeline",
+            "StableDiffusionAttendAndExcitePipeline",
+            "StableDiffusionControlNetImg2ImgPipeline",
+            "StableDiffusionControlNetInpaintPipeline",
+            "StableDiffusionControlNetPAGPipeline",
+            "StableDiffusionControlNetPipeline",
+            "StableDiffusionControlNetXSPipeline",
+            "StableDiffusionDepth2ImgPipeline",
+            "StableDiffusionDiffEditPipeline",
+            "StableDiffusionGLIGENPipeline",
+            "StableDiffusionGLIGENTextImagePipeline",
+            "StableDiffusionImageVariationPipeline",
+            "StableDiffusionImg2ImgPipeline",
+            "StableDiffusionInpaintPipeline",
+            "StableDiffusionInpaintPipelineLegacy",
+            "StableDiffusionInstructPix2PixPipeline",
+            "StableDiffusionLatentUpscalePipeline",
+            "StableDiffusionLDM3DPipeline",
+            "StableDiffusionModelEditingPipeline",
+            "StableDiffusionPAGPipeline",
+            "StableDiffusionPanoramaPipeline",
+            "StableDiffusionParadigmsPipeline",
+            "StableDiffusionPipeline",
+            "StableDiffusionPipelineSafe",
+            "StableDiffusionPix2PixZeroPipeline",
+            "StableDiffusionSAGPipeline",
+            "StableDiffusionUpscalePipeline",
+            "StableDiffusionXLAdapterPipeline",
+            "StableDiffusionXLControlNetImg2ImgPipeline",
+            "StableDiffusionXLControlNetInpaintPipeline",
+            "StableDiffusionXLControlNetPAGImg2ImgPipeline",
+            "StableDiffusionXLControlNetPAGPipeline",
+            "StableDiffusionXLControlNetPipeline",
+            "StableDiffusionXLControlNetXSPipeline",
+            "StableDiffusionXLImg2ImgPipeline",
+            "StableDiffusionXLInpaintPipeline",
+            "StableDiffusionXLInstructPix2PixPipeline",
+            "StableDiffusionXLPAGImg2ImgPipeline",
+            "StableDiffusionXLPAGInpaintPipeline",
+            "StableDiffusionXLPAGPipeline",
+            "StableDiffusionXLPipeline",
+            "StableUnCLIPImg2ImgPipeline",
+            "StableUnCLIPPipeline",
+            "StableVideoDiffusionPipeline",
+            "TextToVideoSDPipeline",
+            "TextToVideoZeroPipeline",
+            "TextToVideoZeroSDXLPipeline",
+            "UnCLIPImageVariationPipeline",
+            "UnCLIPPipeline",
+            "UniDiffuserModel",
+            "UniDiffuserPipeline",
+            "UniDiffuserTextDecoder",
+            "VersatileDiffusionDualGuidedPipeline",
+            "VersatileDiffusionImageVariationPipeline",
+            "VersatileDiffusionPipeline",
+            "VersatileDiffusionTextToImagePipeline",
+            "VideoToVideoSDPipeline",
+            "VQDiffusionPipeline",
+            "WuerstchenCombinedPipeline",
+            "WuerstchenDecoderPipeline",
+            "WuerstchenPriorPipeline",
+        ]
+    )
+
+try:
+    if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_torch_and_transformers_and_k_diffusion_objects  # noqa F403
+
+    _import_structure["utils.dummy_torch_and_transformers_and_k_diffusion_objects"] = [
+        name for name in dir(dummy_torch_and_transformers_and_k_diffusion_objects) if not name.startswith("_")
+    ]
+
+else:
+    _import_structure["pipelines"].extend(["StableDiffusionKDiffusionPipeline", "StableDiffusionXLKDiffusionPipeline"])
+
+try:
+    if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_torch_and_transformers_and_sentencepiece_objects  # noqa F403
+
+    _import_structure["utils.dummy_torch_and_transformers_and_sentencepiece_objects"] = [
+        name for name in dir(dummy_torch_and_transformers_and_sentencepiece_objects) if not name.startswith("_")
+    ]
+
+else:
+    _import_structure["pipelines"].extend(["KolorsImg2ImgPipeline", "KolorsPAGPipeline", "KolorsPipeline"])
+
+try:
+    if not (is_torch_available() and is_transformers_available() and is_onnx_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_torch_and_transformers_and_onnx_objects  # noqa F403
+
+    _import_structure["utils.dummy_torch_and_transformers_and_onnx_objects"] = [
+        name for name in dir(dummy_torch_and_transformers_and_onnx_objects) if not name.startswith("_")
+    ]
+
+else:
+    _import_structure["pipelines"].extend(
+        [
+            "OnnxStableDiffusionImg2ImgPipeline",
+            "OnnxStableDiffusionInpaintPipeline",
+            "OnnxStableDiffusionInpaintPipelineLegacy",
+            "OnnxStableDiffusionPipeline",
+            "OnnxStableDiffusionUpscalePipeline",
+            "StableDiffusionOnnxPipeline",
+        ]
+    )
+
+try:
+    if not (is_torch_available() and is_librosa_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_torch_and_librosa_objects  # noqa F403
+
+    _import_structure["utils.dummy_torch_and_librosa_objects"] = [
+        name for name in dir(dummy_torch_and_librosa_objects) if not name.startswith("_")
+    ]
+
+else:
+    _import_structure["pipelines"].extend(["AudioDiffusionPipeline", "Mel"])
+
+try:
+    if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_transformers_and_torch_and_note_seq_objects  # noqa F403
+
+    _import_structure["utils.dummy_transformers_and_torch_and_note_seq_objects"] = [
+        name for name in dir(dummy_transformers_and_torch_and_note_seq_objects) if not name.startswith("_")
+    ]
+
+
+else:
+    _import_structure["pipelines"].extend(["SpectrogramDiffusionPipeline"])
+
+try:
+    if not is_flax_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_flax_objects  # noqa F403
+
+    _import_structure["utils.dummy_flax_objects"] = [
+        name for name in dir(dummy_flax_objects) if not name.startswith("_")
+    ]
+
+
+else:
+    _import_structure["models.controlnet_flax"] = ["FlaxControlNetModel"]
+    _import_structure["models.modeling_flax_utils"] = ["FlaxModelMixin"]
+    _import_structure["models.unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"]
+    _import_structure["models.vae_flax"] = ["FlaxAutoencoderKL"]
+    _import_structure["pipelines"].extend(["FlaxDiffusionPipeline"])
+    _import_structure["schedulers"].extend(
+        [
+            "FlaxDDIMScheduler",
+            "FlaxDDPMScheduler",
+            "FlaxDPMSolverMultistepScheduler",
+            "FlaxEulerDiscreteScheduler",
+            "FlaxKarrasVeScheduler",
+            "FlaxLMSDiscreteScheduler",
+            "FlaxPNDMScheduler",
+            "FlaxSchedulerMixin",
+            "FlaxScoreSdeVeScheduler",
+        ]
+    )
+
+
+try:
+    if not (is_flax_available() and is_transformers_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_flax_and_transformers_objects  # noqa F403
+
+    _import_structure["utils.dummy_flax_and_transformers_objects"] = [
+        name for name in dir(dummy_flax_and_transformers_objects) if not name.startswith("_")
+    ]
+
+
+else:
+    _import_structure["pipelines"].extend(
+        [
+            "FlaxStableDiffusionControlNetPipeline",
+            "FlaxStableDiffusionImg2ImgPipeline",
+            "FlaxStableDiffusionInpaintPipeline",
+            "FlaxStableDiffusionPipeline",
+            "FlaxStableDiffusionXLPipeline",
+        ]
+    )
+
+try:
+    if not (is_note_seq_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_note_seq_objects  # noqa F403
+
+    _import_structure["utils.dummy_note_seq_objects"] = [
+        name for name in dir(dummy_note_seq_objects) if not name.startswith("_")
+    ]
+
+
+else:
+    _import_structure["pipelines"].extend(["MidiProcessor"])
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .configuration_utils import ConfigMixin
+
+    try:
+        if not is_onnx_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_onnx_objects import *  # noqa F403
+    else:
+        from .pipelines import OnnxRuntimeModel
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_pt_objects import *  # noqa F403
+    else:
+        from .models import (
+            AsymmetricAutoencoderKL,
+            AuraFlowTransformer2DModel,
+            AutoencoderKL,
+            AutoencoderKLCogVideoX,
+            AutoencoderKLTemporalDecoder,
+            AutoencoderOobleck,
+            AutoencoderTiny,
+            CogVideoXTransformer3DModel,
+            ConsistencyDecoderVAE,
+            ControlNetModel,
+            ColorGuiderSDModel,
+            ControlNetXSAdapter,
+            DiTTransformer2DModel,
+            FluxControlNetModel,
+            FluxMultiControlNetModel,
+            FluxTransformer2DModel,
+            HunyuanDiT2DControlNetModel,
+            HunyuanDiT2DModel,
+            HunyuanDiT2DMultiControlNetModel,
+            I2VGenXLUNet,
+            Kandinsky3UNet,
+            LatteTransformer3DModel,
+            LuminaNextDiT2DModel,
+            ModelMixin,
+            MotionAdapter,
+            MultiAdapter,
+            PixArtTransformer2DModel,
+            ColorGuiderPixArtModel,
+            CausalSparseDiTModel,
+            CausalSparseDiTControlModel,
+            PriorTransformer,
+            SD3ControlNetModel,
+            SD3MultiControlNetModel,
+            SD3Transformer2DModel,
+            SparseControlNetModel,
+            StableAudioDiTModel,
+            T2IAdapter,
+            T5FilmDecoder,
+            Transformer2DModel,
+            UNet1DModel,
+            UNet2DConditionModel,
+            UNet2DModel,
+            UNet3DConditionModel,
+            UNetControlNetXSModel,
+            UNetMotionModel,
+            UNetSpatioTemporalConditionModel,
+            UVit2DModel,
+            VQModel,
+        )
+        from .optimization import (
+            get_constant_schedule,
+            get_constant_schedule_with_warmup,
+            get_cosine_schedule_with_warmup,
+            get_cosine_with_hard_restarts_schedule_with_warmup,
+            get_linear_schedule_with_warmup,
+            get_polynomial_decay_schedule_with_warmup,
+            get_scheduler,
+        )
+        from .pipelines import (
+            AudioPipelineOutput,
+            AutoPipelineForImage2Image,
+            AutoPipelineForInpainting,
+            AutoPipelineForText2Image,
+            BlipDiffusionControlNetPipeline,
+            BlipDiffusionPipeline,
+            CLIPImageProjection,
+            ConsistencyModelPipeline,
+            DanceDiffusionPipeline,
+            DDIMPipeline,
+            DDPMPipeline,
+            DiffusionPipeline,
+            DiTPipeline,
+            ImagePipelineOutput,
+            KarrasVePipeline,
+            LDMPipeline,
+            LDMSuperResolutionPipeline,
+            PNDMPipeline,
+            RePaintPipeline,
+            ScoreSdeVePipeline,
+            StableDiffusionMixin,
+        )
+        from .schedulers import (
+            AmusedScheduler,
+            CMStochasticIterativeScheduler,
+            CogVideoXDDIMScheduler,
+            CogVideoXDPMScheduler,
+            DDIMInverseScheduler,
+            DDIMParallelScheduler,
+            DDIMScheduler,
+            DDPMParallelScheduler,
+            DDPMScheduler,
+            DDPMWuerstchenScheduler,
+            DEISMultistepScheduler,
+            DPMSolverMultistepInverseScheduler,
+            DPMSolverMultistepScheduler,
+            DPMSolverSinglestepScheduler,
+            EDMDPMSolverMultistepScheduler,
+            EDMEulerScheduler,
+            EulerAncestralDiscreteScheduler,
+            EulerDiscreteScheduler,
+            FlowMatchEulerDiscreteScheduler,
+            FlowMatchHeunDiscreteScheduler,
+            HeunDiscreteScheduler,
+            IPNDMScheduler,
+            KarrasVeScheduler,
+            KDPM2AncestralDiscreteScheduler,
+            KDPM2DiscreteScheduler,
+            LCMScheduler,
+            PNDMScheduler,
+            RePaintScheduler,
+            SASolverScheduler,
+            SchedulerMixin,
+            ScoreSdeVeScheduler,
+            TCDScheduler,
+            UnCLIPScheduler,
+            UniPCMultistepScheduler,
+            VQDiffusionScheduler,
+        )
+        from .training_utils import EMAModel
+
+    try:
+        if not (is_torch_available() and is_scipy_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_torch_and_scipy_objects import *  # noqa F403
+    else:
+        from .schedulers import LMSDiscreteScheduler
+
+    try:
+        if not (is_torch_available() and is_torchsde_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_torch_and_torchsde_objects import *  # noqa F403
+    else:
+        from .schedulers import CosineDPMSolverMultistepScheduler, DPMSolverSDEScheduler
+
+    try:
+        if not (is_torch_available() and is_transformers_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipelines import (
+            AltDiffusionImg2ImgPipeline,
+            AltDiffusionPipeline,
+            AmusedImg2ImgPipeline,
+            AmusedInpaintPipeline,
+            AmusedPipeline,
+            AnimateDiffControlNetPipeline,
+            AnimateDiffPAGPipeline,
+            AnimateDiffPipeline,
+            AnimateDiffSDXLPipeline,
+            AnimateDiffSparseControlNetPipeline,
+            AnimateDiffVideoToVideoControlNetPipeline,
+            AnimateDiffVideoToVideoPipeline,
+            AudioLDM2Pipeline,
+            AudioLDM2ProjectionModel,
+            AudioLDM2UNet2DConditionModel,
+            AudioLDMPipeline,
+            AuraFlowPipeline,
+            CLIPImageProjection,
+            CogVideoXImageToVideoPipeline,
+            CogVideoXPipeline,
+            CogVideoXVideoToVideoPipeline,
+            CycleDiffusionPipeline,
+            FluxControlNetImg2ImgPipeline,
+            FluxControlNetInpaintPipeline,
+            FluxControlNetPipeline,
+            FluxImg2ImgPipeline,
+            FluxInpaintPipeline,
+            FluxPipeline,
+            HunyuanDiTControlNetPipeline,
+            HunyuanDiTPAGPipeline,
+            HunyuanDiTPipeline,
+            I2VGenXLPipeline,
+            IFImg2ImgPipeline,
+            IFImg2ImgSuperResolutionPipeline,
+            IFInpaintingPipeline,
+            IFInpaintingSuperResolutionPipeline,
+            IFPipeline,
+            IFSuperResolutionPipeline,
+            ImageTextPipelineOutput,
+            Kandinsky3Img2ImgPipeline,
+            Kandinsky3Pipeline,
+            KandinskyCombinedPipeline,
+            KandinskyImg2ImgCombinedPipeline,
+            KandinskyImg2ImgPipeline,
+            KandinskyInpaintCombinedPipeline,
+            KandinskyInpaintPipeline,
+            KandinskyPipeline,
+            KandinskyPriorPipeline,
+            KandinskyV22CombinedPipeline,
+            KandinskyV22ControlnetImg2ImgPipeline,
+            KandinskyV22ControlnetPipeline,
+            KandinskyV22Img2ImgCombinedPipeline,
+            KandinskyV22Img2ImgPipeline,
+            KandinskyV22InpaintCombinedPipeline,
+            KandinskyV22InpaintPipeline,
+            KandinskyV22Pipeline,
+            KandinskyV22PriorEmb2EmbPipeline,
+            KandinskyV22PriorPipeline,
+            LatentConsistencyModelImg2ImgPipeline,
+            LatentConsistencyModelPipeline,
+            LattePipeline,
+            LDMTextToImagePipeline,
+            LEditsPPPipelineStableDiffusion,
+            LEditsPPPipelineStableDiffusionXL,
+            LuminaText2ImgPipeline,
+            MarigoldDepthPipeline,
+            MarigoldNormalsPipeline,
+            MusicLDMPipeline,
+            PaintByExamplePipeline,
+            PIAPipeline,
+            PixArtAlphaPipeline,
+            ColorFlowPixArtAlphaPipeline,
+            CobraPixArtAlphaPipeline,
+            ColorFlowSDPipeline,
+            PixArtSigmaPAGPipeline,
+            PixArtSigmaPipeline,
+            SemanticStableDiffusionPipeline,
+            ShapEImg2ImgPipeline,
+            ShapEPipeline,
+            StableAudioPipeline,
+            StableAudioProjectionModel,
+            StableCascadeCombinedPipeline,
+            StableCascadeDecoderPipeline,
+            StableCascadePriorPipeline,
+            StableDiffusion3ControlNetPipeline,
+            StableDiffusion3Img2ImgPipeline,
+            StableDiffusion3InpaintPipeline,
+            StableDiffusion3PAGPipeline,
+            StableDiffusion3Pipeline,
+            StableDiffusionAdapterPipeline,
+            StableDiffusionAttendAndExcitePipeline,
+            StableDiffusionControlNetImg2ImgPipeline,
+            StableDiffusionControlNetInpaintPipeline,
+            StableDiffusionControlNetPAGPipeline,
+            StableDiffusionControlNetPipeline,
+            StableDiffusionControlNetXSPipeline,
+            StableDiffusionDepth2ImgPipeline,
+            StableDiffusionDiffEditPipeline,
+            StableDiffusionGLIGENPipeline,
+            StableDiffusionGLIGENTextImagePipeline,
+            StableDiffusionImageVariationPipeline,
+            StableDiffusionImg2ImgPipeline,
+            StableDiffusionInpaintPipeline,
+            StableDiffusionInpaintPipelineLegacy,
+            StableDiffusionInstructPix2PixPipeline,
+            StableDiffusionLatentUpscalePipeline,
+            StableDiffusionLDM3DPipeline,
+            StableDiffusionModelEditingPipeline,
+            StableDiffusionPAGPipeline,
+            StableDiffusionPanoramaPipeline,
+            StableDiffusionParadigmsPipeline,
+            StableDiffusionPipeline,
+            StableDiffusionPipelineSafe,
+            StableDiffusionPix2PixZeroPipeline,
+            StableDiffusionSAGPipeline,
+            StableDiffusionUpscalePipeline,
+            StableDiffusionXLAdapterPipeline,
+            StableDiffusionXLControlNetImg2ImgPipeline,
+            StableDiffusionXLControlNetInpaintPipeline,
+            StableDiffusionXLControlNetPAGImg2ImgPipeline,
+            StableDiffusionXLControlNetPAGPipeline,
+            StableDiffusionXLControlNetPipeline,
+            StableDiffusionXLControlNetXSPipeline,
+            StableDiffusionXLImg2ImgPipeline,
+            StableDiffusionXLInpaintPipeline,
+            StableDiffusionXLInstructPix2PixPipeline,
+            StableDiffusionXLPAGImg2ImgPipeline,
+            StableDiffusionXLPAGInpaintPipeline,
+            StableDiffusionXLPAGPipeline,
+            StableDiffusionXLPipeline,
+            StableUnCLIPImg2ImgPipeline,
+            StableUnCLIPPipeline,
+            StableVideoDiffusionPipeline,
+            TextToVideoSDPipeline,
+            TextToVideoZeroPipeline,
+            TextToVideoZeroSDXLPipeline,
+            UnCLIPImageVariationPipeline,
+            UnCLIPPipeline,
+            UniDiffuserModel,
+            UniDiffuserPipeline,
+            UniDiffuserTextDecoder,
+            VersatileDiffusionDualGuidedPipeline,
+            VersatileDiffusionImageVariationPipeline,
+            VersatileDiffusionPipeline,
+            VersatileDiffusionTextToImagePipeline,
+            VideoToVideoSDPipeline,
+            VQDiffusionPipeline,
+            WuerstchenCombinedPipeline,
+            WuerstchenDecoderPipeline,
+            WuerstchenPriorPipeline,
+        )
+
+    try:
+        if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_torch_and_transformers_and_k_diffusion_objects import *  # noqa F403
+    else:
+        from .pipelines import StableDiffusionKDiffusionPipeline, StableDiffusionXLKDiffusionPipeline
+
+    try:
+        if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_torch_and_transformers_and_sentencepiece_objects import *  # noqa F403
+    else:
+        from .pipelines import KolorsImg2ImgPipeline, KolorsPAGPipeline, KolorsPipeline
+    try:
+        if not (is_torch_available() and is_transformers_available() and is_onnx_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_torch_and_transformers_and_onnx_objects import *  # noqa F403
+    else:
+        from .pipelines import (
+            OnnxStableDiffusionImg2ImgPipeline,
+            OnnxStableDiffusionInpaintPipeline,
+            OnnxStableDiffusionInpaintPipelineLegacy,
+            OnnxStableDiffusionPipeline,
+            OnnxStableDiffusionUpscalePipeline,
+            StableDiffusionOnnxPipeline,
+        )
+
+    try:
+        if not (is_torch_available() and is_librosa_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_torch_and_librosa_objects import *  # noqa F403
+    else:
+        from .pipelines import AudioDiffusionPipeline, Mel
+
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_transformers_and_torch_and_note_seq_objects import *  # noqa F403
+    else:
+        from .pipelines import SpectrogramDiffusionPipeline
+
+    try:
+        if not is_flax_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_flax_objects import *  # noqa F403
+    else:
+        from .models.controlnet_flax import FlaxControlNetModel
+        from .models.modeling_flax_utils import FlaxModelMixin
+        from .models.unets.unet_2d_condition_flax import FlaxUNet2DConditionModel
+        from .models.vae_flax import FlaxAutoencoderKL
+        from .pipelines import FlaxDiffusionPipeline
+        from .schedulers import (
+            FlaxDDIMScheduler,
+            FlaxDDPMScheduler,
+            FlaxDPMSolverMultistepScheduler,
+            FlaxEulerDiscreteScheduler,
+            FlaxKarrasVeScheduler,
+            FlaxLMSDiscreteScheduler,
+            FlaxPNDMScheduler,
+            FlaxSchedulerMixin,
+            FlaxScoreSdeVeScheduler,
+        )
+
+    try:
+        if not (is_flax_available() and is_transformers_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_flax_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipelines import (
+            FlaxStableDiffusionControlNetPipeline,
+            FlaxStableDiffusionImg2ImgPipeline,
+            FlaxStableDiffusionInpaintPipeline,
+            FlaxStableDiffusionPipeline,
+            FlaxStableDiffusionXLPipeline,
+        )
+
+    try:
+        if not (is_note_seq_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_note_seq_objects import *  # noqa F403
+    else:
+        from .pipelines import MidiProcessor
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={"__version__": __version__},
+    )
diff --git a/diffusers/src/diffusers/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be9befa1116b7508e4971e0e2a1218ae1f0cf879
Binary files /dev/null and b/diffusers/src/diffusers/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/__pycache__/configuration_utils.cpython-311.pyc b/diffusers/src/diffusers/__pycache__/configuration_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d56afd84a7b168d9bf6462e0d56a614d59636ec4
Binary files /dev/null and b/diffusers/src/diffusers/__pycache__/configuration_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/__pycache__/dependency_versions_check.cpython-311.pyc b/diffusers/src/diffusers/__pycache__/dependency_versions_check.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bf84317596e9a74532c53881bbca5aa7d380261
Binary files /dev/null and b/diffusers/src/diffusers/__pycache__/dependency_versions_check.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/__pycache__/dependency_versions_table.cpython-311.pyc b/diffusers/src/diffusers/__pycache__/dependency_versions_table.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9deed463b6965a4fb883938fc6c2920dbefe6261
Binary files /dev/null and b/diffusers/src/diffusers/__pycache__/dependency_versions_table.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/__pycache__/image_processor.cpython-311.pyc b/diffusers/src/diffusers/__pycache__/image_processor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..579cd4053a508a3f91cfd148cc89a6cf4ccfa3a1
Binary files /dev/null and b/diffusers/src/diffusers/__pycache__/image_processor.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/callbacks.py b/diffusers/src/diffusers/callbacks.py
new file mode 100644
index 0000000000000000000000000000000000000000..38542407e31fa1255eb26b563632c7a9f3d2fded
--- /dev/null
+++ b/diffusers/src/diffusers/callbacks.py
@@ -0,0 +1,156 @@
+from typing import Any, Dict, List
+
+from .configuration_utils import ConfigMixin, register_to_config
+from .utils import CONFIG_NAME
+
+
+class PipelineCallback(ConfigMixin):
+    """
+    Base class for all the official callbacks used in a pipeline. This class provides a structure for implementing
+    custom callbacks and ensures that all callbacks have a consistent interface.
+
+    Please implement the following:
+        `tensor_inputs`: This should return a list of tensor inputs specific to your callback. You will only be able to
+        include
+            variables listed in the `._callback_tensor_inputs` attribute of your pipeline class.
+        `callback_fn`: This method defines the core functionality of your callback.
+    """
+
+    config_name = CONFIG_NAME
+
+    @register_to_config
+    def __init__(self, cutoff_step_ratio=1.0, cutoff_step_index=None):
+        super().__init__()
+
+        if (cutoff_step_ratio is None and cutoff_step_index is None) or (
+            cutoff_step_ratio is not None and cutoff_step_index is not None
+        ):
+            raise ValueError("Either cutoff_step_ratio or cutoff_step_index should be provided, not both or none.")
+
+        if cutoff_step_ratio is not None and (
+            not isinstance(cutoff_step_ratio, float) or not (0.0 <= cutoff_step_ratio <= 1.0)
+        ):
+            raise ValueError("cutoff_step_ratio must be a float between 0.0 and 1.0.")
+
+    @property
+    def tensor_inputs(self) -> List[str]:
+        raise NotImplementedError(f"You need to set the attribute `tensor_inputs` for {self.__class__}")
+
+    def callback_fn(self, pipeline, step_index, timesteps, callback_kwargs) -> Dict[str, Any]:
+        raise NotImplementedError(f"You need to implement the method `callback_fn` for {self.__class__}")
+
+    def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        return self.callback_fn(pipeline, step_index, timestep, callback_kwargs)
+
+
+class MultiPipelineCallbacks:
+    """
+    This class is designed to handle multiple pipeline callbacks. It accepts a list of PipelineCallback objects and
+    provides a unified interface for calling all of them.
+    """
+
+    def __init__(self, callbacks: List[PipelineCallback]):
+        self.callbacks = callbacks
+
+    @property
+    def tensor_inputs(self) -> List[str]:
+        return [input for callback in self.callbacks for input in callback.tensor_inputs]
+
+    def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        """
+        Calls all the callbacks in order with the given arguments and returns the final callback_kwargs.
+        """
+        for callback in self.callbacks:
+            callback_kwargs = callback(pipeline, step_index, timestep, callback_kwargs)
+
+        return callback_kwargs
+
+
+class SDCFGCutoffCallback(PipelineCallback):
+    """
+    Callback function for Stable Diffusion Pipelines. After certain number of steps (set by `cutoff_step_ratio` or
+    `cutoff_step_index`), this callback will disable the CFG.
+
+    Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 after the cutoff step.
+    """
+
+    tensor_inputs = ["prompt_embeds"]
+
+    def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        cutoff_step_ratio = self.config.cutoff_step_ratio
+        cutoff_step_index = self.config.cutoff_step_index
+
+        # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
+        cutoff_step = (
+            cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio)
+        )
+
+        if step_index == cutoff_step:
+            prompt_embeds = callback_kwargs[self.tensor_inputs[0]]
+            prompt_embeds = prompt_embeds[-1:]  # "-1" denotes the embeddings for conditional text tokens.
+
+            pipeline._guidance_scale = 0.0
+
+            callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
+        return callback_kwargs
+
+
+class SDXLCFGCutoffCallback(PipelineCallback):
+    """
+    Callback function for Stable Diffusion XL Pipelines. After certain number of steps (set by `cutoff_step_ratio` or
+    `cutoff_step_index`), this callback will disable the CFG.
+
+    Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 after the cutoff step.
+    """
+
+    tensor_inputs = ["prompt_embeds", "add_text_embeds", "add_time_ids"]
+
+    def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        cutoff_step_ratio = self.config.cutoff_step_ratio
+        cutoff_step_index = self.config.cutoff_step_index
+
+        # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
+        cutoff_step = (
+            cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio)
+        )
+
+        if step_index == cutoff_step:
+            prompt_embeds = callback_kwargs[self.tensor_inputs[0]]
+            prompt_embeds = prompt_embeds[-1:]  # "-1" denotes the embeddings for conditional text tokens.
+
+            add_text_embeds = callback_kwargs[self.tensor_inputs[1]]
+            add_text_embeds = add_text_embeds[-1:]  # "-1" denotes the embeddings for conditional pooled text tokens
+
+            add_time_ids = callback_kwargs[self.tensor_inputs[2]]
+            add_time_ids = add_time_ids[-1:]  # "-1" denotes the embeddings for conditional added time vector
+
+            pipeline._guidance_scale = 0.0
+
+            callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
+            callback_kwargs[self.tensor_inputs[1]] = add_text_embeds
+            callback_kwargs[self.tensor_inputs[2]] = add_time_ids
+        return callback_kwargs
+
+
+class IPAdapterScaleCutoffCallback(PipelineCallback):
+    """
+    Callback function for any pipeline that inherits `IPAdapterMixin`. After certain number of steps (set by
+    `cutoff_step_ratio` or `cutoff_step_index`), this callback will set the IP Adapter scale to `0.0`.
+
+    Note: This callback mutates the IP Adapter attention processors by setting the scale to 0.0 after the cutoff step.
+    """
+
+    tensor_inputs = []
+
+    def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        cutoff_step_ratio = self.config.cutoff_step_ratio
+        cutoff_step_index = self.config.cutoff_step_index
+
+        # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
+        cutoff_step = (
+            cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio)
+        )
+
+        if step_index == cutoff_step:
+            pipeline.set_ip_adapter_scale(0.0)
+        return callback_kwargs
diff --git a/diffusers/src/diffusers/commands/__init__.py b/diffusers/src/diffusers/commands/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8208283f6e40a8e46175b1672d6bf44f9d83a02b
--- /dev/null
+++ b/diffusers/src/diffusers/commands/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from argparse import ArgumentParser
+
+
+class BaseDiffusersCLICommand(ABC):
+    @staticmethod
+    @abstractmethod
+    def register_subcommand(parser: ArgumentParser):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def run(self):
+        raise NotImplementedError()
diff --git a/diffusers/src/diffusers/commands/diffusers_cli.py b/diffusers/src/diffusers/commands/diffusers_cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..f582c3bcd0df0c5167a8de18123b4474e64bb344
--- /dev/null
+++ b/diffusers/src/diffusers/commands/diffusers_cli.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from argparse import ArgumentParser
+
+from .env import EnvironmentCommand
+from .fp16_safetensors import FP16SafetensorsCommand
+
+
+def main():
+    parser = ArgumentParser("Diffusers CLI tool", usage="diffusers-cli <command> [<args>]")
+    commands_parser = parser.add_subparsers(help="diffusers-cli command helpers")
+
+    # Register commands
+    EnvironmentCommand.register_subcommand(commands_parser)
+    FP16SafetensorsCommand.register_subcommand(commands_parser)
+
+    # Let's go
+    args = parser.parse_args()
+
+    if not hasattr(args, "func"):
+        parser.print_help()
+        exit(1)
+
+    # Run
+    service = args.func(args)
+    service.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/src/diffusers/commands/env.py b/diffusers/src/diffusers/commands/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0af30bf1c65984c25a484c7f77b35f80f1a9fa9
--- /dev/null
+++ b/diffusers/src/diffusers/commands/env.py
@@ -0,0 +1,180 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import platform
+import subprocess
+from argparse import ArgumentParser
+
+import huggingface_hub
+
+from .. import __version__ as version
+from ..utils import (
+    is_accelerate_available,
+    is_bitsandbytes_available,
+    is_flax_available,
+    is_google_colab,
+    is_peft_available,
+    is_safetensors_available,
+    is_torch_available,
+    is_transformers_available,
+    is_xformers_available,
+)
+from . import BaseDiffusersCLICommand
+
+
+def info_command_factory(_):
+    return EnvironmentCommand()
+
+
+class EnvironmentCommand(BaseDiffusersCLICommand):
+    @staticmethod
+    def register_subcommand(parser: ArgumentParser) -> None:
+        download_parser = parser.add_parser("env")
+        download_parser.set_defaults(func=info_command_factory)
+
+    def run(self) -> dict:
+        hub_version = huggingface_hub.__version__
+
+        safetensors_version = "not installed"
+        if is_safetensors_available():
+            import safetensors
+
+            safetensors_version = safetensors.__version__
+
+        pt_version = "not installed"
+        pt_cuda_available = "NA"
+        if is_torch_available():
+            import torch
+
+            pt_version = torch.__version__
+            pt_cuda_available = torch.cuda.is_available()
+
+        flax_version = "not installed"
+        jax_version = "not installed"
+        jaxlib_version = "not installed"
+        jax_backend = "NA"
+        if is_flax_available():
+            import flax
+            import jax
+            import jaxlib
+
+            flax_version = flax.__version__
+            jax_version = jax.__version__
+            jaxlib_version = jaxlib.__version__
+            jax_backend = jax.lib.xla_bridge.get_backend().platform
+
+        transformers_version = "not installed"
+        if is_transformers_available():
+            import transformers
+
+            transformers_version = transformers.__version__
+
+        accelerate_version = "not installed"
+        if is_accelerate_available():
+            import accelerate
+
+            accelerate_version = accelerate.__version__
+
+        peft_version = "not installed"
+        if is_peft_available():
+            import peft
+
+            peft_version = peft.__version__
+
+        bitsandbytes_version = "not installed"
+        if is_bitsandbytes_available():
+            import bitsandbytes
+
+            bitsandbytes_version = bitsandbytes.__version__
+
+        xformers_version = "not installed"
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = xformers.__version__
+
+        platform_info = platform.platform()
+
+        is_google_colab_str = "Yes" if is_google_colab() else "No"
+
+        accelerator = "NA"
+        if platform.system() in {"Linux", "Windows"}:
+            try:
+                sp = subprocess.Popen(
+                    ["nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv,noheader"],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                )
+                out_str, _ = sp.communicate()
+                out_str = out_str.decode("utf-8")
+
+                if len(out_str) > 0:
+                    accelerator = out_str.strip()
+            except FileNotFoundError:
+                pass
+        elif platform.system() == "Darwin":  # Mac OS
+            try:
+                sp = subprocess.Popen(
+                    ["system_profiler", "SPDisplaysDataType"],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                )
+                out_str, _ = sp.communicate()
+                out_str = out_str.decode("utf-8")
+
+                start = out_str.find("Chipset Model:")
+                if start != -1:
+                    start += len("Chipset Model:")
+                    end = out_str.find("\n", start)
+                    accelerator = out_str[start:end].strip()
+
+                    start = out_str.find("VRAM (Total):")
+                    if start != -1:
+                        start += len("VRAM (Total):")
+                        end = out_str.find("\n", start)
+                        accelerator += " VRAM: " + out_str[start:end].strip()
+            except FileNotFoundError:
+                pass
+        else:
+            print("It seems you are running an unusual OS. Could you fill in the accelerator manually?")
+
+        info = {
+            "🤗 Diffusers version": version,
+            "Platform": platform_info,
+            "Running on Google Colab?": is_google_colab_str,
+            "Python version": platform.python_version(),
+            "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
+            "Flax version (CPU?/GPU?/TPU?)": f"{flax_version} ({jax_backend})",
+            "Jax version": jax_version,
+            "JaxLib version": jaxlib_version,
+            "Huggingface_hub version": hub_version,
+            "Transformers version": transformers_version,
+            "Accelerate version": accelerate_version,
+            "PEFT version": peft_version,
+            "Bitsandbytes version": bitsandbytes_version,
+            "Safetensors version": safetensors_version,
+            "xFormers version": xformers_version,
+            "Accelerator": accelerator,
+            "Using GPU in script?": "<fill in>",
+            "Using distributed or parallel set-up in script?": "<fill in>",
+        }
+
+        print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
+        print(self.format_dict(info))
+
+        return info
+
+    @staticmethod
+    def format_dict(d: dict) -> str:
+        return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n"
diff --git a/diffusers/src/diffusers/commands/fp16_safetensors.py b/diffusers/src/diffusers/commands/fp16_safetensors.py
new file mode 100644
index 0000000000000000000000000000000000000000..b26b8816bc4cf1a6272d3eebf5ab2be8a5dd865b
--- /dev/null
+++ b/diffusers/src/diffusers/commands/fp16_safetensors.py
@@ -0,0 +1,132 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Usage example:
+    diffusers-cli fp16_safetensors --ckpt_id=openai/shap-e --fp16 --use_safetensors
+"""
+
+import glob
+import json
+import warnings
+from argparse import ArgumentParser, Namespace
+from importlib import import_module
+
+import huggingface_hub
+import torch
+from huggingface_hub import hf_hub_download
+from packaging import version
+
+from ..utils import logging
+from . import BaseDiffusersCLICommand
+
+
+def conversion_command_factory(args: Namespace):
+    if args.use_auth_token:
+        warnings.warn(
+            "The `--use_auth_token` flag is deprecated and will be removed in a future version. Authentication is now"
+            " handled automatically if user is logged in."
+        )
+    return FP16SafetensorsCommand(args.ckpt_id, args.fp16, args.use_safetensors)
+
+
+class FP16SafetensorsCommand(BaseDiffusersCLICommand):
+    @staticmethod
+    def register_subcommand(parser: ArgumentParser):
+        conversion_parser = parser.add_parser("fp16_safetensors")
+        conversion_parser.add_argument(
+            "--ckpt_id",
+            type=str,
+            help="Repo id of the checkpoints on which to run the conversion. Example: 'openai/shap-e'.",
+        )
+        conversion_parser.add_argument(
+            "--fp16", action="store_true", help="If serializing the variables in FP16 precision."
+        )
+        conversion_parser.add_argument(
+            "--use_safetensors", action="store_true", help="If serializing in the safetensors format."
+        )
+        conversion_parser.add_argument(
+            "--use_auth_token",
+            action="store_true",
+            help="When working with checkpoints having private visibility. When used `huggingface-cli login` needs to be run beforehand.",
+        )
+        conversion_parser.set_defaults(func=conversion_command_factory)
+
+    def __init__(self, ckpt_id: str, fp16: bool, use_safetensors: bool):
+        self.logger = logging.get_logger("diffusers-cli/fp16_safetensors")
+        self.ckpt_id = ckpt_id
+        self.local_ckpt_dir = f"/tmp/{ckpt_id}"
+        self.fp16 = fp16
+
+        self.use_safetensors = use_safetensors
+
+        if not self.use_safetensors and not self.fp16:
+            raise NotImplementedError(
+                "When `use_safetensors` and `fp16` both are False, then this command is of no use."
+            )
+
+    def run(self):
+        if version.parse(huggingface_hub.__version__) < version.parse("0.9.0"):
+            raise ImportError(
+                "The huggingface_hub version must be >= 0.9.0 to use this command. Please update your huggingface_hub"
+                " installation."
+            )
+        else:
+            from huggingface_hub import create_commit
+            from huggingface_hub._commit_api import CommitOperationAdd
+
+        model_index = hf_hub_download(repo_id=self.ckpt_id, filename="model_index.json")
+        with open(model_index, "r") as f:
+            pipeline_class_name = json.load(f)["_class_name"]
+        pipeline_class = getattr(import_module("diffusers"), pipeline_class_name)
+        self.logger.info(f"Pipeline class imported: {pipeline_class_name}.")
+
+        # Load the appropriate pipeline. We could have use `DiffusionPipeline`
+        # here, but just to avoid any rough edge cases.
+        pipeline = pipeline_class.from_pretrained(
+            self.ckpt_id, torch_dtype=torch.float16 if self.fp16 else torch.float32
+        )
+        pipeline.save_pretrained(
+            self.local_ckpt_dir,
+            safe_serialization=True if self.use_safetensors else False,
+            variant="fp16" if self.fp16 else None,
+        )
+        self.logger.info(f"Pipeline locally saved to {self.local_ckpt_dir}.")
+
+        # Fetch all the paths.
+        if self.fp16:
+            modified_paths = glob.glob(f"{self.local_ckpt_dir}/*/*.fp16.*")
+        elif self.use_safetensors:
+            modified_paths = glob.glob(f"{self.local_ckpt_dir}/*/*.safetensors")
+
+        # Prepare for the PR.
+        commit_message = f"Serialize variables with FP16: {self.fp16} and safetensors: {self.use_safetensors}."
+        operations = []
+        for path in modified_paths:
+            operations.append(CommitOperationAdd(path_in_repo="/".join(path.split("/")[4:]), path_or_fileobj=path))
+
+        # Open the PR.
+        commit_description = (
+            "Variables converted by the [`diffusers`' `fp16_safetensors`"
+            " CLI](https://github.com/huggingface/diffusers/blob/main/src/diffusers/commands/fp16_safetensors.py)."
+        )
+        hub_pr_url = create_commit(
+            repo_id=self.ckpt_id,
+            operations=operations,
+            commit_message=commit_message,
+            commit_description=commit_description,
+            repo_type="model",
+            create_pr=True,
+        ).pr_url
+        self.logger.info(f"PR created here: {hub_pr_url}.")
diff --git a/diffusers/src/diffusers/configuration_utils.py b/diffusers/src/diffusers/configuration_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dccd785cae4211ec8e8ee8a723c781430e99abe
--- /dev/null
+++ b/diffusers/src/diffusers/configuration_utils.py
@@ -0,0 +1,720 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ConfigMixin base class and utilities."""
+
+import dataclasses
+import functools
+import importlib
+import inspect
+import json
+import os
+import re
+from collections import OrderedDict
+from pathlib import Path
+from typing import Any, Dict, Tuple, Union
+
+import numpy as np
+from huggingface_hub import create_repo, hf_hub_download
+from huggingface_hub.utils import (
+    EntryNotFoundError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+    validate_hf_hub_args,
+)
+from requests import HTTPError
+
+from . import __version__
+from .utils import (
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    DummyObject,
+    deprecate,
+    extract_commit_hash,
+    http_user_agent,
+    logging,
+)
+
+
+logger = logging.get_logger(__name__)
+
+_re_configuration_file = re.compile(r"config\.(.*)\.json")
+
+
+class FrozenDict(OrderedDict):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        for key, value in self.items():
+            setattr(self, key, value)
+
+        self.__frozen = True
+
+    def __delitem__(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
+
+    def setdefault(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
+
+    def pop(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+
+    def update(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
+
+    def __setattr__(self, name, value):
+        if hasattr(self, "__frozen") and self.__frozen:
+            raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
+        super().__setattr__(name, value)
+
+    def __setitem__(self, name, value):
+        if hasattr(self, "__frozen") and self.__frozen:
+            raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
+        super().__setitem__(name, value)
+
+
+class ConfigMixin:
+    r"""
+    Base class for all configuration classes. All configuration parameters are stored under `self.config`. Also
+    provides the [`~ConfigMixin.from_config`] and [`~ConfigMixin.save_config`] methods for loading, downloading, and
+    saving classes that inherit from [`ConfigMixin`].
+
+    Class attributes:
+        - **config_name** (`str`) -- A filename under which the config should stored when calling
+          [`~ConfigMixin.save_config`] (should be overridden by parent class).
+        - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
+          overridden by subclass).
+        - **has_compatibles** (`bool`) -- Whether the class has compatible classes (should be overridden by subclass).
+        - **_deprecated_kwargs** (`List[str]`) -- Keyword arguments that are deprecated. Note that the `init` function
+          should only have a `kwargs` argument if at least one argument is deprecated (should be overridden by
+          subclass).
+    """
+
+    config_name = None
+    ignore_for_config = []
+    has_compatibles = False
+
+    _deprecated_kwargs = []
+
+    def register_to_config(self, **kwargs):
+        if self.config_name is None:
+            raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`")
+        # Special case for `kwargs` used in deprecation warning added to schedulers
+        # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument,
+        # or solve in a more general way.
+        kwargs.pop("kwargs", None)
+
+        if not hasattr(self, "_internal_dict"):
+            internal_dict = kwargs
+        else:
+            previous_dict = dict(self._internal_dict)
+            internal_dict = {**self._internal_dict, **kwargs}
+            logger.debug(f"Updating config from {previous_dict} to {internal_dict}")
+
+        self._internal_dict = FrozenDict(internal_dict)
+
+    def __getattr__(self, name: str) -> Any:
+        """The only reason we overwrite `getattr` here is to gracefully deprecate accessing
+        config attributes directly. See https://github.com/huggingface/diffusers/pull/3129
+
+        This function is mostly copied from PyTorch's __getattr__ overwrite:
+        https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
+        """
+
+        is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
+        is_attribute = name in self.__dict__
+
+        if is_in_config and not is_attribute:
+            deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'scheduler.config.{name}'."
+            deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)
+            return self._internal_dict[name]
+
+        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
+
+    def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
+        """
+        Save a configuration object to the directory specified in `save_directory` so that it can be reloaded using the
+        [`~ConfigMixin.from_config`] class method.
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the configuration JSON file is saved (will be created if it does not exist).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        if os.path.isfile(save_directory):
+            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        # If we save using the predefined names, we can load using `from_config`
+        output_config_file = os.path.join(save_directory, self.config_name)
+
+        self.to_json_file(output_config_file)
+        logger.info(f"Configuration saved in {output_config_file}")
+
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            private = kwargs.pop("private", False)
+            create_pr = kwargs.pop("create_pr", False)
+            token = kwargs.pop("token", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
+
+            self._upload_folder(
+                save_directory,
+                repo_id,
+                token=token,
+                commit_message=commit_message,
+                create_pr=create_pr,
+            )
+
+    @classmethod
+    def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_unused_kwargs=False, **kwargs):
+        r"""
+        Instantiate a Python class from a config dictionary.
+
+        Parameters:
+            config (`Dict[str, Any]`):
+                A config dictionary from which the Python class is instantiated. Make sure to only load configuration
+                files of compatible classes.
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                Whether kwargs that are not consumed by the Python class should be returned or not.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it is loaded) and initiate the Python class.
+                `**kwargs` are passed directly to the underlying scheduler/model's `__init__` method and eventually
+                overwrite the same named arguments in `config`.
+
+        Returns:
+            [`ModelMixin`] or [`SchedulerMixin`]:
+                A model or scheduler object instantiated from a config dictionary.
+
+        Examples:
+
+        ```python
+        >>> from diffusers import DDPMScheduler, DDIMScheduler, PNDMScheduler
+
+        >>> # Download scheduler from huggingface.co and cache.
+        >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cifar10-32")
+
+        >>> # Instantiate DDIM scheduler class with same config as DDPM
+        >>> scheduler = DDIMScheduler.from_config(scheduler.config)
+
+        >>> # Instantiate PNDM scheduler class with same config as DDPM
+        >>> scheduler = PNDMScheduler.from_config(scheduler.config)
+        ```
+        """
+        # <===== TO BE REMOVED WITH DEPRECATION
+        # TODO(Patrick) - make sure to remove the following lines when config=="model_path" is deprecated
+        if "pretrained_model_name_or_path" in kwargs:
+            config = kwargs.pop("pretrained_model_name_or_path")
+
+        if config is None:
+            raise ValueError("Please make sure to provide a config as the first positional argument.")
+        # ======>
+
+        if not isinstance(config, dict):
+            deprecation_message = "It is deprecated to pass a pretrained model name or path to `from_config`."
+            if "Scheduler" in cls.__name__:
+                deprecation_message += (
+                    f"If you were trying to load a scheduler, please use {cls}.from_pretrained(...) instead."
+                    " Otherwise, please make sure to pass a configuration dictionary instead. This functionality will"
+                    " be removed in v1.0.0."
+                )
+            elif "Model" in cls.__name__:
+                deprecation_message += (
+                    f"If you were trying to load a model, please use {cls}.load_config(...) followed by"
+                    f" {cls}.from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary"
+                    " instead. This functionality will be removed in v1.0.0."
+                )
+            deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+            config, kwargs = cls.load_config(pretrained_model_name_or_path=config, return_unused_kwargs=True, **kwargs)
+
+        init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, **kwargs)
+
+        # Allow dtype to be specified on initialization
+        if "dtype" in unused_kwargs:
+            init_dict["dtype"] = unused_kwargs.pop("dtype")
+
+        # add possible deprecated kwargs
+        for deprecated_kwarg in cls._deprecated_kwargs:
+            if deprecated_kwarg in unused_kwargs:
+                init_dict[deprecated_kwarg] = unused_kwargs.pop(deprecated_kwarg)
+
+        # Return model and optionally state and/or unused_kwargs
+        model = cls(**init_dict)
+
+        # make sure to also save config parameters that might be used for compatible classes
+        # update _class_name
+        if "_class_name" in hidden_dict:
+            hidden_dict["_class_name"] = cls.__name__
+
+        model.register_to_config(**hidden_dict)
+
+        # add hidden kwargs of compatible classes to unused_kwargs
+        unused_kwargs = {**unused_kwargs, **hidden_dict}
+
+        if return_unused_kwargs:
+            return (model, unused_kwargs)
+        else:
+            return model
+
+    @classmethod
+    def get_config_dict(cls, *args, **kwargs):
+        deprecation_message = (
+            f" The function get_config_dict is deprecated. Please use {cls}.load_config instead. This function will be"
+            " removed in version v1.0.0"
+        )
+        deprecate("get_config_dict", "1.0.0", deprecation_message, standard_warn=False)
+        return cls.load_config(*args, **kwargs)
+
+    @classmethod
+    @validate_hf_hub_args
+    def load_config(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        return_unused_kwargs=False,
+        return_commit_hash=False,
+        **kwargs,
+    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        r"""
+        Load a model or scheduler configuration.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing model weights saved with
+                      [`~ConfigMixin.save_config`].
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            return_unused_kwargs (`bool`, *optional*, defaults to `False):
+                Whether unused keyword arguments of the config are returned.
+            return_commit_hash (`bool`, *optional*, defaults to `False):
+                Whether the `commit_hash` of the loaded configuration are returned.
+
+        Returns:
+            `dict`:
+                A dictionary of all the parameters stored in a JSON configuration file.
+
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        local_dir = kwargs.pop("local_dir", None)
+        local_dir_use_symlinks = kwargs.pop("local_dir_use_symlinks", "auto")
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+        _ = kwargs.pop("mirror", None)
+        subfolder = kwargs.pop("subfolder", None)
+        user_agent = kwargs.pop("user_agent", {})
+
+        user_agent = {**user_agent, "file_type": "config"}
+        user_agent = http_user_agent(user_agent)
+
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+
+        if cls.config_name is None:
+            raise ValueError(
+                "`self.config_name` is not defined. Note that one should not load a config from "
+                "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`"
+            )
+
+        if os.path.isfile(pretrained_model_name_or_path):
+            config_file = pretrained_model_name_or_path
+        elif os.path.isdir(pretrained_model_name_or_path):
+            if subfolder is not None and os.path.isfile(
+                os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
+            ):
+                config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
+            elif os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)):
+                # Load from a PyTorch checkpoint
+                config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
+            else:
+                raise EnvironmentError(
+                    f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
+                )
+        else:
+            try:
+                # Load from URL or cache if already cached
+                config_file = hf_hub_download(
+                    pretrained_model_name_or_path,
+                    filename=cls.config_name,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    subfolder=subfolder,
+                    revision=revision,
+                    local_dir=local_dir,
+                    local_dir_use_symlinks=local_dir_use_symlinks,
+                )
+            except RepositoryNotFoundError:
+                raise EnvironmentError(
+                    f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier"
+                    " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a"
+                    " token having permission to this repo with `token` or log in with `huggingface-cli login`."
+                )
+            except RevisionNotFoundError:
+                raise EnvironmentError(
+                    f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for"
+                    " this model name. Check the model page at"
+                    f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+                )
+            except EntryNotFoundError:
+                raise EnvironmentError(
+                    f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}."
+                )
+            except HTTPError as err:
+                raise EnvironmentError(
+                    "There was a specific connection error when trying to load"
+                    f" {pretrained_model_name_or_path}:\n{err}"
+                )
+            except ValueError:
+                raise EnvironmentError(
+                    f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
+                    f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
+                    f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to"
+                    " run the library in offline mode at"
+                    " 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
+                )
+            except EnvironmentError:
+                raise EnvironmentError(
+                    f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+                    "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+                    f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+                    f"containing a {cls.config_name} file"
+                )
+
+        try:
+            # Load config dict
+            config_dict = cls._dict_from_json_file(config_file)
+
+            commit_hash = extract_commit_hash(config_file)
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
+
+        if not (return_unused_kwargs or return_commit_hash):
+            return config_dict
+
+        outputs = (config_dict,)
+
+        if return_unused_kwargs:
+            outputs += (kwargs,)
+
+        if return_commit_hash:
+            outputs += (commit_hash,)
+
+        return outputs
+
+    @staticmethod
+    def _get_init_keys(input_class):
+        return set(dict(inspect.signature(input_class.__init__).parameters).keys())
+
+    @classmethod
+    def extract_init_dict(cls, config_dict, **kwargs):
+        # Skip keys that were not present in the original config, so default __init__ values were used
+        used_defaults = config_dict.get("_use_default_values", [])
+        config_dict = {k: v for k, v in config_dict.items() if k not in used_defaults and k != "_use_default_values"}
+
+        # 0. Copy origin config dict
+        original_dict = dict(config_dict.items())
+
+        # 1. Retrieve expected config attributes from __init__ signature
+        expected_keys = cls._get_init_keys(cls)
+        expected_keys.remove("self")
+        # remove general kwargs if present in dict
+        if "kwargs" in expected_keys:
+            expected_keys.remove("kwargs")
+        # remove flax internal keys
+        if hasattr(cls, "_flax_internal_args"):
+            for arg in cls._flax_internal_args:
+                expected_keys.remove(arg)
+
+        # 2. Remove attributes that cannot be expected from expected config attributes
+        # remove keys to be ignored
+        if len(cls.ignore_for_config) > 0:
+            expected_keys = expected_keys - set(cls.ignore_for_config)
+
+        # load diffusers library to import compatible and original scheduler
+        diffusers_library = importlib.import_module(__name__.split(".")[0])
+
+        if cls.has_compatibles:
+            compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)]
+        else:
+            compatible_classes = []
+
+        expected_keys_comp_cls = set()
+        for c in compatible_classes:
+            expected_keys_c = cls._get_init_keys(c)
+            expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)
+        expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)
+        config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}
+
+        # remove attributes from orig class that cannot be expected
+        orig_cls_name = config_dict.pop("_class_name", cls.__name__)
+        if (
+            isinstance(orig_cls_name, str)
+            and orig_cls_name != cls.__name__
+            and hasattr(diffusers_library, orig_cls_name)
+        ):
+            orig_cls = getattr(diffusers_library, orig_cls_name)
+            unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys
+            config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}
+        elif not isinstance(orig_cls_name, str) and not isinstance(orig_cls_name, (list, tuple)):
+            raise ValueError(
+                "Make sure that the `_class_name` is of type string or list of string (for custom pipelines)."
+            )
+
+        # remove private attributes
+        config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
+
+        # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments
+        init_dict = {}
+        for key in expected_keys:
+            # if config param is passed to kwarg and is present in config dict
+            # it should overwrite existing config dict key
+            if key in kwargs and key in config_dict:
+                config_dict[key] = kwargs.pop(key)
+
+            if key in kwargs:
+                # overwrite key
+                init_dict[key] = kwargs.pop(key)
+            elif key in config_dict:
+                # use value from config dict
+                init_dict[key] = config_dict.pop(key)
+
+        # 4. Give nice warning if unexpected values have been passed
+        if len(config_dict) > 0:
+            logger.warning(
+                f"The config attributes {config_dict} were passed to {cls.__name__}, "
+                "but are not expected and will be ignored. Please verify your "
+                f"{cls.config_name} configuration file."
+            )
+
+        # 5. Give nice info if config attributes are initialized to default because they have not been passed
+        passed_keys = set(init_dict.keys())
+        if len(expected_keys - passed_keys) > 0:
+            logger.info(
+                f"{expected_keys - passed_keys} was not found in config. Values will be initialized to default values."
+            )
+
+        # 6. Define unused keyword arguments
+        unused_kwargs = {**config_dict, **kwargs}
+
+        # 7. Define "hidden" config parameters that were saved for compatible classes
+        hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict}
+
+        return init_dict, unused_kwargs, hidden_config_dict
+
+    @classmethod
+    def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
+        with open(json_file, "r", encoding="utf-8") as reader:
+            text = reader.read()
+        return json.loads(text)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string()}"
+
+    @property
+    def config(self) -> Dict[str, Any]:
+        """
+        Returns the config of the class as a frozen dictionary
+
+        Returns:
+            `Dict[str, Any]`: Config of the class.
+        """
+        return self._internal_dict
+
+    def to_json_string(self) -> str:
+        """
+        Serializes the configuration instance to a JSON string.
+
+        Returns:
+            `str`:
+                String containing all the attributes that make up the configuration instance in JSON format.
+        """
+        config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {}
+        config_dict["_class_name"] = self.__class__.__name__
+        config_dict["_diffusers_version"] = __version__
+
+        def to_json_saveable(value):
+            if isinstance(value, np.ndarray):
+                value = value.tolist()
+            elif isinstance(value, Path):
+                value = value.as_posix()
+            return value
+
+        config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()}
+        # Don't save "_ignore_files" or "_use_default_values"
+        config_dict.pop("_ignore_files", None)
+        config_dict.pop("_use_default_values", None)
+
+        return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path: Union[str, os.PathLike]):
+        """
+        Save the configuration instance's parameters to a JSON file.
+
+        Args:
+            json_file_path (`str` or `os.PathLike`):
+                Path to the JSON file to save a configuration instance's parameters.
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            writer.write(self.to_json_string())
+
+
+def register_to_config(init):
+    r"""
+    Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are
+    automatically sent to `self.register_for_config`. To ignore a specific argument accepted by the init but that
+    shouldn't be registered in the config, use the `ignore_for_config` class variable
+
+    Warning: Once decorated, all private arguments (beginning with an underscore) are trashed and not sent to the init!
+    """
+
+    @functools.wraps(init)
+    def inner_init(self, *args, **kwargs):
+        # Ignore private kwargs in the init.
+        init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
+        config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")}
+        if not isinstance(self, ConfigMixin):
+            raise RuntimeError(
+                f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
+                "not inherit from `ConfigMixin`."
+            )
+
+        ignore = getattr(self, "ignore_for_config", [])
+        # Get positional arguments aligned with kwargs
+        new_kwargs = {}
+        signature = inspect.signature(init)
+        parameters = {
+            name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore
+        }
+        for arg, name in zip(args, parameters.keys()):
+            new_kwargs[name] = arg
+
+        # Then add all kwargs
+        new_kwargs.update(
+            {
+                k: init_kwargs.get(k, default)
+                for k, default in parameters.items()
+                if k not in ignore and k not in new_kwargs
+            }
+        )
+
+        # Take note of the parameters that were not present in the loaded config
+        if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
+            new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
+
+        new_kwargs = {**config_init_kwargs, **new_kwargs}
+        getattr(self, "register_to_config")(**new_kwargs)
+        init(self, *args, **init_kwargs)
+
+    return inner_init
+
+
+def flax_register_to_config(cls):
+    original_init = cls.__init__
+
+    @functools.wraps(original_init)
+    def init(self, *args, **kwargs):
+        if not isinstance(self, ConfigMixin):
+            raise RuntimeError(
+                f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
+                "not inherit from `ConfigMixin`."
+            )
+
+        # Ignore private kwargs in the init. Retrieve all passed attributes
+        init_kwargs = dict(kwargs.items())
+
+        # Retrieve default values
+        fields = dataclasses.fields(self)
+        default_kwargs = {}
+        for field in fields:
+            # ignore flax specific attributes
+            if field.name in self._flax_internal_args:
+                continue
+            if type(field.default) == dataclasses._MISSING_TYPE:
+                default_kwargs[field.name] = None
+            else:
+                default_kwargs[field.name] = getattr(self, field.name)
+
+        # Make sure init_kwargs override default kwargs
+        new_kwargs = {**default_kwargs, **init_kwargs}
+        # dtype should be part of `init_kwargs`, but not `new_kwargs`
+        if "dtype" in new_kwargs:
+            new_kwargs.pop("dtype")
+
+        # Get positional arguments aligned with kwargs
+        for i, arg in enumerate(args):
+            name = fields[i].name
+            new_kwargs[name] = arg
+
+        # Take note of the parameters that were not present in the loaded config
+        if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
+            new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
+
+        getattr(self, "register_to_config")(**new_kwargs)
+        original_init(self, *args, **kwargs)
+
+    cls.__init__ = init
+    return cls
+
+
+class LegacyConfigMixin(ConfigMixin):
+    r"""
+    A subclass of `ConfigMixin` to resolve class mapping from legacy classes (like `Transformer2DModel`) to more
+    pipeline-specific classes (like `DiTTransformer2DModel`).
+    """
+
+    @classmethod
+    def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_unused_kwargs=False, **kwargs):
+        # To prevent dependency import problem.
+        from .models.model_loading_utils import _fetch_remapped_cls_from_config
+
+        # resolve remapping
+        remapped_class = _fetch_remapped_cls_from_config(config, cls)
+
+        return remapped_class.from_config(config, return_unused_kwargs, **kwargs)
diff --git a/diffusers/src/diffusers/dependency_versions_check.py b/diffusers/src/diffusers/dependency_versions_check.py
new file mode 100644
index 0000000000000000000000000000000000000000..0728b3a7c0932cd06c920947e1ea57f3864f239a
--- /dev/null
+++ b/diffusers/src/diffusers/dependency_versions_check.py
@@ -0,0 +1,34 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .dependency_versions_table import deps
+from .utils.versions import require_version, require_version_core
+
+
+# define which module versions we always want to check at run time
+# (usually the ones defined in `install_requires` in setup.py)
+#
+# order specific notes:
+# - tqdm must be checked before tokenizers
+
+pkgs_to_check_at_runtime = "python requests filelock numpy".split()
+for pkg in pkgs_to_check_at_runtime:
+    if pkg in deps:
+        require_version_core(deps[pkg])
+    else:
+        raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py")
+
+
+def dep_version_check(pkg, hint=None):
+    require_version(deps[pkg], hint)
diff --git a/diffusers/src/diffusers/dependency_versions_table.py b/diffusers/src/diffusers/dependency_versions_table.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e7bf242eca78aca87da4c01b0e3a9830c781299
--- /dev/null
+++ b/diffusers/src/diffusers/dependency_versions_table.py
@@ -0,0 +1,46 @@
+# THIS FILE HAS BEEN AUTOGENERATED. To update:
+# 1. modify the `_deps` dict in setup.py
+# 2. run `make deps_table_update`
+deps = {
+    "Pillow": "Pillow",
+    "accelerate": "accelerate>=0.31.0",
+    "compel": "compel==0.1.8",
+    "datasets": "datasets",
+    "filelock": "filelock",
+    "flax": "flax>=0.4.1",
+    "hf-doc-builder": "hf-doc-builder>=0.3.0",
+    "huggingface-hub": "huggingface-hub>=0.23.2",
+    "requests-mock": "requests-mock==1.10.0",
+    "importlib_metadata": "importlib_metadata",
+    "invisible-watermark": "invisible-watermark>=0.2.0",
+    "isort": "isort>=5.5.4",
+    "jax": "jax>=0.4.1",
+    "jaxlib": "jaxlib>=0.4.1",
+    "Jinja2": "Jinja2",
+    "k-diffusion": "k-diffusion>=0.0.12",
+    "torchsde": "torchsde",
+    "note_seq": "note_seq",
+    "librosa": "librosa",
+    "numpy": "numpy",
+    "parameterized": "parameterized",
+    "peft": "peft>=0.6.0",
+    "protobuf": "protobuf>=3.20.3,<4",
+    "pytest": "pytest",
+    "pytest-timeout": "pytest-timeout",
+    "pytest-xdist": "pytest-xdist",
+    "python": "python>=3.8.0",
+    "ruff": "ruff==0.1.5",
+    "safetensors": "safetensors>=0.3.1",
+    "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
+    "GitPython": "GitPython<3.1.19",
+    "scipy": "scipy",
+    "onnx": "onnx",
+    "regex": "regex!=2019.12.17",
+    "requests": "requests",
+    "tensorboard": "tensorboard",
+    "torch": "torch>=1.4",
+    "torchvision": "torchvision",
+    "transformers": "transformers>=4.41.2",
+    "urllib3": "urllib3<=2.0.0",
+    "black": "black",
+}
diff --git a/diffusers/src/diffusers/experimental/README.md b/diffusers/src/diffusers/experimental/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..81a9de81c73728ea41eb6e8617a5429c3c9645ff
--- /dev/null
+++ b/diffusers/src/diffusers/experimental/README.md
@@ -0,0 +1,5 @@
+# 🧨 Diffusers Experimental
+
+We are adding experimental code to support novel applications and usages of the Diffusers library.
+Currently, the following experiments are supported:
+* Reinforcement learning via an implementation of the [Diffuser](https://arxiv.org/abs/2205.09991) model.
\ No newline at end of file
diff --git a/diffusers/src/diffusers/experimental/__init__.py b/diffusers/src/diffusers/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebc8155403016dfd8ad7fb78d246f9da9098ac50
--- /dev/null
+++ b/diffusers/src/diffusers/experimental/__init__.py
@@ -0,0 +1 @@
+from .rl import ValueGuidedRLPipeline
diff --git a/diffusers/src/diffusers/experimental/rl/__init__.py b/diffusers/src/diffusers/experimental/rl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b338d3173e12d478b6b6d6fd0e50650a0ab5a4c
--- /dev/null
+++ b/diffusers/src/diffusers/experimental/rl/__init__.py
@@ -0,0 +1 @@
+from .value_guided_sampling import ValueGuidedRLPipeline
diff --git a/diffusers/src/diffusers/experimental/rl/value_guided_sampling.py b/diffusers/src/diffusers/experimental/rl/value_guided_sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f9de857480ec590ae1f795bb2d29bbeccec1331
--- /dev/null
+++ b/diffusers/src/diffusers/experimental/rl/value_guided_sampling.py
@@ -0,0 +1,153 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import torch
+import tqdm
+
+from ...models.unets.unet_1d import UNet1DModel
+from ...pipelines import DiffusionPipeline
+from ...utils.dummy_pt_objects import DDPMScheduler
+from ...utils.torch_utils import randn_tensor
+
+
+class ValueGuidedRLPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for value-guided sampling from a diffusion model trained to predict sequences of states.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        value_function ([`UNet1DModel`]):
+            A specialized UNet for fine-tuning trajectories base on reward.
+        unet ([`UNet1DModel`]):
+            UNet architecture to denoise the encoded trajectories.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded trajectories. Default for this
+            application is [`DDPMScheduler`].
+        env ():
+            An environment following the OpenAI gym API to act in. For now only Hopper has pretrained models.
+    """
+
+    def __init__(
+        self,
+        value_function: UNet1DModel,
+        unet: UNet1DModel,
+        scheduler: DDPMScheduler,
+        env,
+    ):
+        super().__init__()
+
+        self.register_modules(value_function=value_function, unet=unet, scheduler=scheduler, env=env)
+
+        self.data = env.get_dataset()
+        self.means = {}
+        for key in self.data.keys():
+            try:
+                self.means[key] = self.data[key].mean()
+            except:  # noqa: E722
+                pass
+        self.stds = {}
+        for key in self.data.keys():
+            try:
+                self.stds[key] = self.data[key].std()
+            except:  # noqa: E722
+                pass
+        self.state_dim = env.observation_space.shape[0]
+        self.action_dim = env.action_space.shape[0]
+
+    def normalize(self, x_in, key):
+        return (x_in - self.means[key]) / self.stds[key]
+
+    def de_normalize(self, x_in, key):
+        return x_in * self.stds[key] + self.means[key]
+
+    def to_torch(self, x_in):
+        if isinstance(x_in, dict):
+            return {k: self.to_torch(v) for k, v in x_in.items()}
+        elif torch.is_tensor(x_in):
+            return x_in.to(self.unet.device)
+        return torch.tensor(x_in, device=self.unet.device)
+
+    def reset_x0(self, x_in, cond, act_dim):
+        for key, val in cond.items():
+            x_in[:, key, act_dim:] = val.clone()
+        return x_in
+
+    def run_diffusion(self, x, conditions, n_guide_steps, scale):
+        batch_size = x.shape[0]
+        y = None
+        for i in tqdm.tqdm(self.scheduler.timesteps):
+            # create batch of timesteps to pass into model
+            timesteps = torch.full((batch_size,), i, device=self.unet.device, dtype=torch.long)
+            for _ in range(n_guide_steps):
+                with torch.enable_grad():
+                    x.requires_grad_()
+
+                    # permute to match dimension for pre-trained models
+                    y = self.value_function(x.permute(0, 2, 1), timesteps).sample
+                    grad = torch.autograd.grad([y.sum()], [x])[0]
+
+                    posterior_variance = self.scheduler._get_variance(i)
+                    model_std = torch.exp(0.5 * posterior_variance)
+                    grad = model_std * grad
+
+                grad[timesteps < 2] = 0
+                x = x.detach()
+                x = x + scale * grad
+                x = self.reset_x0(x, conditions, self.action_dim)
+
+            prev_x = self.unet(x.permute(0, 2, 1), timesteps).sample.permute(0, 2, 1)
+
+            # TODO: verify deprecation of this kwarg
+            x = self.scheduler.step(prev_x, i, x)["prev_sample"]
+
+            # apply conditions to the trajectory (set the initial state)
+            x = self.reset_x0(x, conditions, self.action_dim)
+            x = self.to_torch(x)
+        return x, y
+
+    def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1):
+        # normalize the observations and create  batch dimension
+        obs = self.normalize(obs, "observations")
+        obs = obs[None].repeat(batch_size, axis=0)
+
+        conditions = {0: self.to_torch(obs)}
+        shape = (batch_size, planning_horizon, self.state_dim + self.action_dim)
+
+        # generate initial noise and apply our conditions (to make the trajectories start at current state)
+        x1 = randn_tensor(shape, device=self.unet.device)
+        x = self.reset_x0(x1, conditions, self.action_dim)
+        x = self.to_torch(x)
+
+        # run the diffusion process
+        x, y = self.run_diffusion(x, conditions, n_guide_steps, scale)
+
+        # sort output trajectories by value
+        sorted_idx = y.argsort(0, descending=True).squeeze()
+        sorted_values = x[sorted_idx]
+        actions = sorted_values[:, :, : self.action_dim]
+        actions = actions.detach().cpu().numpy()
+        denorm_actions = self.de_normalize(actions, key="actions")
+
+        # select the action with the highest value
+        if y is not None:
+            selected_index = 0
+        else:
+            # if we didn't run value guiding, select a random action
+            selected_index = np.random.randint(0, batch_size)
+
+        denorm_actions = denorm_actions[selected_index, 0]
+        return denorm_actions
diff --git a/diffusers/src/diffusers/image_processor.py b/diffusers/src/diffusers/image_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..d58bd9e3e3758906d6a2648a5bec31bfc422a1fe
--- /dev/null
+++ b/diffusers/src/diffusers/image_processor.py
@@ -0,0 +1,1103 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from PIL import Image, ImageFilter, ImageOps
+
+from .configuration_utils import ConfigMixin, register_to_config
+from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate
+
+
+PipelineImageInput = Union[
+    PIL.Image.Image,
+    np.ndarray,
+    torch.Tensor,
+    List[PIL.Image.Image],
+    List[np.ndarray],
+    List[torch.Tensor],
+]
+
+PipelineDepthInput = PipelineImageInput
+
+
+def is_valid_image(image):
+    return isinstance(image, PIL.Image.Image) or isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim in (2, 3)
+
+
+def is_valid_image_imagelist(images):
+    # check if the image input is one of the supported formats for image and image list:
+    # it can be either one of below 3
+    # (1) a 4d pytorch tensor or numpy array,
+    # (2) a valid image: PIL.Image.Image, 2-d np.ndarray or torch.Tensor (grayscale image), 3-d np.ndarray or torch.Tensor
+    # (3) a list of valid image
+    if isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4:
+        return True
+    elif is_valid_image(images):
+        return True
+    elif isinstance(images, list):
+        return all(is_valid_image(image) for image in images)
+    return False
+
+
+class VaeImageProcessor(ConfigMixin):
+    """
+    Image processor for VAE.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
+            `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
+        vae_scale_factor (`int`, *optional*, defaults to `8`):
+            VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
+        resample (`str`, *optional*, defaults to `lanczos`):
+            Resampling filter to use when resizing the image.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image to [-1,1].
+        do_binarize (`bool`, *optional*, defaults to `False`):
+            Whether to binarize the image to 0/1.
+        do_convert_rgb (`bool`, *optional*, defaults to be `False`):
+            Whether to convert the images to RGB format.
+        do_convert_grayscale (`bool`, *optional*, defaults to be `False`):
+            Whether to convert the images to grayscale format.
+    """
+
+    config_name = CONFIG_NAME
+
+    @register_to_config
+    def __init__(
+        self,
+        do_resize: bool = True,
+        vae_scale_factor: int = 8,
+        vae_latent_channels: int = 4,
+        resample: str = "lanczos",
+        do_normalize: bool = True,
+        do_binarize: bool = False,
+        do_convert_rgb: bool = False,
+        do_convert_grayscale: bool = False,
+    ):
+        super().__init__()
+        if do_convert_rgb and do_convert_grayscale:
+            raise ValueError(
+                "`do_convert_rgb` and `do_convert_grayscale` can not both be set to `True`,"
+                " if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.",
+                " if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`",
+            )
+
+    @staticmethod
+    def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
+        """
+        Convert a numpy image or a batch of images to a PIL image.
+        """
+        if images.ndim == 3:
+            images = images[None, ...]
+        images = (images * 255).round().astype("uint8")
+        if images.shape[-1] == 1:
+            # special case for grayscale (single channel) images
+            pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
+        else:
+            pil_images = [Image.fromarray(image) for image in images]
+
+        return pil_images
+
+    @staticmethod
+    def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
+        """
+        Convert a PIL image or a list of PIL images to NumPy arrays.
+        """
+        if not isinstance(images, list):
+            images = [images]
+        images = [np.array(image).astype(np.float32) / 255.0 for image in images]
+        images = np.stack(images, axis=0)
+
+        return images
+
+    @staticmethod
+    def numpy_to_pt(images: np.ndarray) -> torch.Tensor:
+        """
+        Convert a NumPy image to a PyTorch tensor.
+        """
+        if images.ndim == 3:
+            images = images[..., None]
+
+        images = torch.from_numpy(images.transpose(0, 3, 1, 2))
+        return images
+
+    @staticmethod
+    def pt_to_numpy(images: torch.Tensor) -> np.ndarray:
+        """
+        Convert a PyTorch tensor to a NumPy image.
+        """
+        images = images.cpu().permute(0, 2, 3, 1).float().numpy()
+        return images
+
+    @staticmethod
+    def normalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
+        """
+        Normalize an image array to [-1,1].
+        """
+        return 2.0 * images - 1.0
+
+    @staticmethod
+    def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
+        """
+        Denormalize an image array to [0,1].
+        """
+        return (images / 2 + 0.5).clamp(0, 1)
+
+    @staticmethod
+    def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
+        """
+        Converts a PIL image to RGB format.
+        """
+        image = image.convert("RGB")
+
+        return image
+
+    @staticmethod
+    def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image:
+        """
+        Converts a PIL image to grayscale format.
+        """
+        image = image.convert("L")
+
+        return image
+
+    @staticmethod
+    def blur(image: PIL.Image.Image, blur_factor: int = 4) -> PIL.Image.Image:
+        """
+        Applies Gaussian blur to an image.
+        """
+        image = image.filter(ImageFilter.GaussianBlur(blur_factor))
+
+        return image
+
+    @staticmethod
+    def get_crop_region(mask_image: PIL.Image.Image, width: int, height: int, pad=0):
+        """
+        Finds a rectangular region that contains all masked ares in an image, and expands region to match the aspect
+        ratio of the original image; for example, if user drew mask in a 128x32 region, and the dimensions for
+        processing are 512x512, the region will be expanded to 128x128.
+
+        Args:
+            mask_image (PIL.Image.Image): Mask image.
+            width (int): Width of the image to be processed.
+            height (int): Height of the image to be processed.
+            pad (int, optional): Padding to be added to the crop region. Defaults to 0.
+
+        Returns:
+            tuple: (x1, y1, x2, y2) represent a rectangular region that contains all masked ares in an image and
+            matches the original aspect ratio.
+        """
+
+        mask_image = mask_image.convert("L")
+        mask = np.array(mask_image)
+
+        # 1. find a rectangular region that contains all masked ares in an image
+        h, w = mask.shape
+        crop_left = 0
+        for i in range(w):
+            if not (mask[:, i] == 0).all():
+                break
+            crop_left += 1
+
+        crop_right = 0
+        for i in reversed(range(w)):
+            if not (mask[:, i] == 0).all():
+                break
+            crop_right += 1
+
+        crop_top = 0
+        for i in range(h):
+            if not (mask[i] == 0).all():
+                break
+            crop_top += 1
+
+        crop_bottom = 0
+        for i in reversed(range(h)):
+            if not (mask[i] == 0).all():
+                break
+            crop_bottom += 1
+
+        # 2. add padding to the crop region
+        x1, y1, x2, y2 = (
+            int(max(crop_left - pad, 0)),
+            int(max(crop_top - pad, 0)),
+            int(min(w - crop_right + pad, w)),
+            int(min(h - crop_bottom + pad, h)),
+        )
+
+        # 3. expands crop region to match the aspect ratio of the image to be processed
+        ratio_crop_region = (x2 - x1) / (y2 - y1)
+        ratio_processing = width / height
+
+        if ratio_crop_region > ratio_processing:
+            desired_height = (x2 - x1) / ratio_processing
+            desired_height_diff = int(desired_height - (y2 - y1))
+            y1 -= desired_height_diff // 2
+            y2 += desired_height_diff - desired_height_diff // 2
+            if y2 >= mask_image.height:
+                diff = y2 - mask_image.height
+                y2 -= diff
+                y1 -= diff
+            if y1 < 0:
+                y2 -= y1
+                y1 -= y1
+            if y2 >= mask_image.height:
+                y2 = mask_image.height
+        else:
+            desired_width = (y2 - y1) * ratio_processing
+            desired_width_diff = int(desired_width - (x2 - x1))
+            x1 -= desired_width_diff // 2
+            x2 += desired_width_diff - desired_width_diff // 2
+            if x2 >= mask_image.width:
+                diff = x2 - mask_image.width
+                x2 -= diff
+                x1 -= diff
+            if x1 < 0:
+                x2 -= x1
+                x1 -= x1
+            if x2 >= mask_image.width:
+                x2 = mask_image.width
+
+        return x1, y1, x2, y2
+
+    def _resize_and_fill(
+        self,
+        image: PIL.Image.Image,
+        width: int,
+        height: int,
+    ) -> PIL.Image.Image:
+        """
+        Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
+        the image within the dimensions, filling empty with data from image.
+
+        Args:
+            image: The image to resize.
+            width: The width to resize the image to.
+            height: The height to resize the image to.
+        """
+
+        ratio = width / height
+        src_ratio = image.width / image.height
+
+        src_w = width if ratio < src_ratio else image.width * height // image.height
+        src_h = height if ratio >= src_ratio else image.height * width // image.width
+
+        resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
+        res = Image.new("RGB", (width, height))
+        res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
+
+        if ratio < src_ratio:
+            fill_height = height // 2 - src_h // 2
+            if fill_height > 0:
+                res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0))
+                res.paste(
+                    resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)),
+                    box=(0, fill_height + src_h),
+                )
+        elif ratio > src_ratio:
+            fill_width = width // 2 - src_w // 2
+            if fill_width > 0:
+                res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0))
+                res.paste(
+                    resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)),
+                    box=(fill_width + src_w, 0),
+                )
+
+        return res
+
+    def _resize_and_crop(
+        self,
+        image: PIL.Image.Image,
+        width: int,
+        height: int,
+    ) -> PIL.Image.Image:
+        """
+        Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
+        the image within the dimensions, cropping the excess.
+
+        Args:
+            image: The image to resize.
+            width: The width to resize the image to.
+            height: The height to resize the image to.
+        """
+        ratio = width / height
+        src_ratio = image.width / image.height
+
+        src_w = width if ratio > src_ratio else image.width * height // image.height
+        src_h = height if ratio <= src_ratio else image.height * width // image.width
+
+        resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
+        res = Image.new("RGB", (width, height))
+        res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
+        return res
+
+    def resize(
+        self,
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
+        height: int,
+        width: int,
+        resize_mode: str = "default",  # "default", "fill", "crop"
+    ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]:
+        """
+        Resize image.
+
+        Args:
+            image (`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`):
+                The image input, can be a PIL image, numpy array or pytorch tensor.
+            height (`int`):
+                The height to resize to.
+            width (`int`):
+                The width to resize to.
+            resize_mode (`str`, *optional*, defaults to `default`):
+                The resize mode to use, can be one of `default` or `fill`. If `default`, will resize the image to fit
+                within the specified width and height, and it may not maintaining the original aspect ratio. If `fill`,
+                will resize the image to fit within the specified width and height, maintaining the aspect ratio, and
+                then center the image within the dimensions, filling empty with data from image. If `crop`, will resize
+                the image to fit within the specified width and height, maintaining the aspect ratio, and then center
+                the image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
+                supported for PIL image input.
+
+        Returns:
+            `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`:
+                The resized image.
+        """
+        if resize_mode != "default" and not isinstance(image, PIL.Image.Image):
+            raise ValueError(f"Only PIL image input is supported for resize_mode {resize_mode}")
+        if isinstance(image, PIL.Image.Image):
+            if resize_mode == "default":
+                image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample])
+            elif resize_mode == "fill":
+                image = self._resize_and_fill(image, width, height)
+            elif resize_mode == "crop":
+                image = self._resize_and_crop(image, width, height)
+            else:
+                raise ValueError(f"resize_mode {resize_mode} is not supported")
+
+        elif isinstance(image, torch.Tensor):
+            image = torch.nn.functional.interpolate(
+                image,
+                size=(height, width),
+            )
+        elif isinstance(image, np.ndarray):
+            image = self.numpy_to_pt(image)
+            image = torch.nn.functional.interpolate(
+                image,
+                size=(height, width),
+            )
+            image = self.pt_to_numpy(image)
+        return image
+
+    def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image:
+        """
+        Create a mask.
+
+        Args:
+            image (`PIL.Image.Image`):
+                The image input, should be a PIL image.
+
+        Returns:
+            `PIL.Image.Image`:
+                The binarized image. Values less than 0.5 are set to 0, values greater than 0.5 are set to 1.
+        """
+        image[image < 0.5] = 0
+        image[image >= 0.5] = 1
+
+        return image
+
+    def get_default_height_width(
+        self,
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+    ) -> Tuple[int, int]:
+        """
+        This function return the height and width that are downscaled to the next integer multiple of
+        `vae_scale_factor`.
+
+        Args:
+            image(`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`):
+                The image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have
+                shape `[batch, height, width]` or `[batch, height, width, channel]` if it is a pytorch tensor, should
+                have shape `[batch, channel, height, width]`.
+            height (`int`, *optional*, defaults to `None`):
+                The height in preprocessed image. If `None`, will use the height of `image` input.
+            width (`int`, *optional*`, defaults to `None`):
+                The width in preprocessed. If `None`, will use the width of the `image` input.
+        """
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[2]
+            else:
+                height = image.shape[1]
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[3]
+            else:
+                width = image.shape[2]
+
+        width, height = (
+            x - x % self.config.vae_scale_factor for x in (width, height)
+        )  # resize to integer multiple of vae_scale_factor
+
+        return height, width
+
+    def preprocess(
+        self,
+        image: PipelineImageInput,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        resize_mode: str = "default",  # "default", "fill", "crop"
+        crops_coords: Optional[Tuple[int, int, int, int]] = None,
+    ) -> torch.Tensor:
+        """
+        Preprocess the image input.
+
+        Args:
+            image (`pipeline_image_input`):
+                The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of
+                supported formats.
+            height (`int`, *optional*, defaults to `None`):
+                The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
+                height.
+            width (`int`, *optional*`, defaults to `None`):
+                The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
+            resize_mode (`str`, *optional*, defaults to `default`):
+                The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
+                the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will
+                resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
+                center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
+                image to fit within the specified width and height, maintaining the aspect ratio, and then center the
+                image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
+                supported for PIL image input.
+            crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
+                The crop coordinates for each image in the batch. If `None`, will not crop the image.
+        """
+        supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
+
+        # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image
+        if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3:
+            if isinstance(image, torch.Tensor):
+                # if image is a pytorch tensor could have 2 possible shapes:
+                #    1. batch x height x width: we should insert the channel dimension at position 1
+                #    2. channel x height x width: we should insert batch dimension at position 0,
+                #       however, since both channel and batch dimension has same size 1, it is same to insert at position 1
+                #    for simplicity, we insert a dimension of size 1 at position 1 for both cases
+                image = image.unsqueeze(1)
+            else:
+                # if it is a numpy array, it could have 2 possible shapes:
+                #   1. batch x height x width: insert channel dimension on last position
+                #   2. height x width x channel: insert batch dimension on first position
+                if image.shape[-1] == 1:
+                    image = np.expand_dims(image, axis=0)
+                else:
+                    image = np.expand_dims(image, axis=-1)
+
+        if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4:
+            warnings.warn(
+                "Passing `image` as a list of 4d np.ndarray is deprecated."
+                "Please concatenate the list along the batch dimension and pass it as a single 4d np.ndarray",
+                FutureWarning,
+            )
+            image = np.concatenate(image, axis=0)
+        if isinstance(image, list) and isinstance(image[0], torch.Tensor) and image[0].ndim == 4:
+            warnings.warn(
+                "Passing `image` as a list of 4d torch.Tensor is deprecated."
+                "Please concatenate the list along the batch dimension and pass it as a single 4d torch.Tensor",
+                FutureWarning,
+            )
+            image = torch.cat(image, axis=0)
+
+        if not is_valid_image_imagelist(image):
+            raise ValueError(
+                f"Input is in incorrect format. Currently, we only support {', '.join(str(x) for x in supported_formats)}"
+            )
+        if not isinstance(image, list):
+            image = [image]
+
+        if isinstance(image[0], PIL.Image.Image):
+            if crops_coords is not None:
+                image = [i.crop(crops_coords) for i in image]
+            if self.config.do_resize:
+                height, width = self.get_default_height_width(image[0], height, width)
+                image = [self.resize(i, height, width, resize_mode=resize_mode) for i in image]
+            if self.config.do_convert_rgb:
+                image = [self.convert_to_rgb(i) for i in image]
+            elif self.config.do_convert_grayscale:
+                image = [self.convert_to_grayscale(i) for i in image]
+            image = self.pil_to_numpy(image)  # to np
+            image = self.numpy_to_pt(image)  # to pt
+
+        elif isinstance(image[0], np.ndarray):
+            image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
+
+            image = self.numpy_to_pt(image)
+
+            height, width = self.get_default_height_width(image, height, width)
+            if self.config.do_resize:
+                image = self.resize(image, height, width)
+
+        elif isinstance(image[0], torch.Tensor):
+            image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)
+
+            if self.config.do_convert_grayscale and image.ndim == 3:
+                image = image.unsqueeze(1)
+
+            channel = image.shape[1]
+            # don't need any preprocess if the image is latents
+            if channel == self.config.vae_latent_channels:
+                return image
+
+            height, width = self.get_default_height_width(image, height, width)
+            if self.config.do_resize:
+                image = self.resize(image, height, width)
+
+        # expected range [0,1], normalize to [-1,1]
+        do_normalize = self.config.do_normalize
+        if do_normalize and image.min() < 0:
+            warnings.warn(
+                "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
+                f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
+                FutureWarning,
+            )
+            do_normalize = False
+        if do_normalize:
+            image = self.normalize(image)
+
+        if self.config.do_binarize:
+            image = self.binarize(image)
+
+        return image
+
+    def postprocess(
+        self,
+        image: torch.Tensor,
+        output_type: str = "pil",
+        do_denormalize: Optional[List[bool]] = None,
+    ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]:
+        """
+        Postprocess the image output from tensor to `output_type`.
+
+        Args:
+            image (`torch.Tensor`):
+                The image input, should be a pytorch tensor with shape `B x C x H x W`.
+            output_type (`str`, *optional*, defaults to `pil`):
+                The output type of the image, can be one of `pil`, `np`, `pt`, `latent`.
+            do_denormalize (`List[bool]`, *optional*, defaults to `None`):
+                Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the
+                `VaeImageProcessor` config.
+
+        Returns:
+            `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`:
+                The postprocessed image.
+        """
+        if not isinstance(image, torch.Tensor):
+            raise ValueError(
+                f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
+            )
+        if output_type not in ["latent", "pt", "np", "pil"]:
+            deprecation_message = (
+                f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
+                "`pil`, `np`, `pt`, `latent`"
+            )
+            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
+            output_type = "np"
+
+        if output_type == "latent":
+            return image
+
+        if do_denormalize is None:
+            do_denormalize = [self.config.do_normalize] * image.shape[0]
+
+        image = torch.stack(
+            [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
+        )
+
+        if output_type == "pt":
+            return image
+
+        image = self.pt_to_numpy(image)
+
+        if output_type == "np":
+            return image
+
+        if output_type == "pil":
+            return self.numpy_to_pil(image)
+
+    def apply_overlay(
+        self,
+        mask: PIL.Image.Image,
+        init_image: PIL.Image.Image,
+        image: PIL.Image.Image,
+        crop_coords: Optional[Tuple[int, int, int, int]] = None,
+    ) -> PIL.Image.Image:
+        """
+        overlay the inpaint output to the original image
+        """
+
+        width, height = image.width, image.height
+
+        init_image = self.resize(init_image, width=width, height=height)
+        mask = self.resize(mask, width=width, height=height)
+
+        init_image_masked = PIL.Image.new("RGBa", (width, height))
+        init_image_masked.paste(init_image.convert("RGBA").convert("RGBa"), mask=ImageOps.invert(mask.convert("L")))
+        init_image_masked = init_image_masked.convert("RGBA")
+
+        if crop_coords is not None:
+            x, y, x2, y2 = crop_coords
+            w = x2 - x
+            h = y2 - y
+            base_image = PIL.Image.new("RGBA", (width, height))
+            image = self.resize(image, height=h, width=w, resize_mode="crop")
+            base_image.paste(image, (x, y))
+            image = base_image.convert("RGB")
+
+        image = image.convert("RGBA")
+        image.alpha_composite(init_image_masked)
+        image = image.convert("RGB")
+
+        return image
+
+
+class VaeImageProcessorLDM3D(VaeImageProcessor):
+    """
+    Image processor for VAE LDM3D.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
+        vae_scale_factor (`int`, *optional*, defaults to `8`):
+            VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
+        resample (`str`, *optional*, defaults to `lanczos`):
+            Resampling filter to use when resizing the image.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image to [-1,1].
+    """
+
+    config_name = CONFIG_NAME
+
+    @register_to_config
+    def __init__(
+        self,
+        do_resize: bool = True,
+        vae_scale_factor: int = 8,
+        resample: str = "lanczos",
+        do_normalize: bool = True,
+    ):
+        super().__init__()
+
+    @staticmethod
+    def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
+        """
+        Convert a NumPy image or a batch of images to a PIL image.
+        """
+        if images.ndim == 3:
+            images = images[None, ...]
+        images = (images * 255).round().astype("uint8")
+        if images.shape[-1] == 1:
+            # special case for grayscale (single channel) images
+            pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
+        else:
+            pil_images = [Image.fromarray(image[:, :, :3]) for image in images]
+
+        return pil_images
+
+    @staticmethod
+    def depth_pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
+        """
+        Convert a PIL image or a list of PIL images to NumPy arrays.
+        """
+        if not isinstance(images, list):
+            images = [images]
+
+        images = [np.array(image).astype(np.float32) / (2**16 - 1) for image in images]
+        images = np.stack(images, axis=0)
+        return images
+
+    @staticmethod
+    def rgblike_to_depthmap(image: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
+        """
+        Args:
+            image: RGB-like depth image
+
+        Returns: depth map
+
+        """
+        return image[:, :, 1] * 2**8 + image[:, :, 2]
+
+    def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]:
+        """
+        Convert a NumPy depth image or a batch of images to a PIL image.
+        """
+        if images.ndim == 3:
+            images = images[None, ...]
+        images_depth = images[:, :, :, 3:]
+        if images.shape[-1] == 6:
+            images_depth = (images_depth * 255).round().astype("uint8")
+            pil_images = [
+                Image.fromarray(self.rgblike_to_depthmap(image_depth), mode="I;16") for image_depth in images_depth
+            ]
+        elif images.shape[-1] == 4:
+            images_depth = (images_depth * 65535.0).astype(np.uint16)
+            pil_images = [Image.fromarray(image_depth, mode="I;16") for image_depth in images_depth]
+        else:
+            raise Exception("Not supported")
+
+        return pil_images
+
+    def postprocess(
+        self,
+        image: torch.Tensor,
+        output_type: str = "pil",
+        do_denormalize: Optional[List[bool]] = None,
+    ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]:
+        """
+        Postprocess the image output from tensor to `output_type`.
+
+        Args:
+            image (`torch.Tensor`):
+                The image input, should be a pytorch tensor with shape `B x C x H x W`.
+            output_type (`str`, *optional*, defaults to `pil`):
+                The output type of the image, can be one of `pil`, `np`, `pt`, `latent`.
+            do_denormalize (`List[bool]`, *optional*, defaults to `None`):
+                Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the
+                `VaeImageProcessor` config.
+
+        Returns:
+            `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`:
+                The postprocessed image.
+        """
+        if not isinstance(image, torch.Tensor):
+            raise ValueError(
+                f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
+            )
+        if output_type not in ["latent", "pt", "np", "pil"]:
+            deprecation_message = (
+                f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
+                "`pil`, `np`, `pt`, `latent`"
+            )
+            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
+            output_type = "np"
+
+        if do_denormalize is None:
+            do_denormalize = [self.config.do_normalize] * image.shape[0]
+
+        image = torch.stack(
+            [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
+        )
+
+        image = self.pt_to_numpy(image)
+
+        if output_type == "np":
+            if image.shape[-1] == 6:
+                image_depth = np.stack([self.rgblike_to_depthmap(im[:, :, 3:]) for im in image], axis=0)
+            else:
+                image_depth = image[:, :, :, 3:]
+            return image[:, :, :, :3], image_depth
+
+        if output_type == "pil":
+            return self.numpy_to_pil(image), self.numpy_to_depth(image)
+        else:
+            raise Exception(f"This type {output_type} is not supported")
+
+    def preprocess(
+        self,
+        rgb: Union[torch.Tensor, PIL.Image.Image, np.ndarray],
+        depth: Union[torch.Tensor, PIL.Image.Image, np.ndarray],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        target_res: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors.
+        """
+        supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
+
+        # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image
+        if self.config.do_convert_grayscale and isinstance(rgb, (torch.Tensor, np.ndarray)) and rgb.ndim == 3:
+            raise Exception("This is not yet supported")
+
+        if isinstance(rgb, supported_formats):
+            rgb = [rgb]
+            depth = [depth]
+        elif not (isinstance(rgb, list) and all(isinstance(i, supported_formats) for i in rgb)):
+            raise ValueError(
+                f"Input is in incorrect format: {[type(i) for i in rgb]}. Currently, we only support {', '.join(supported_formats)}"
+            )
+
+        if isinstance(rgb[0], PIL.Image.Image):
+            if self.config.do_convert_rgb:
+                raise Exception("This is not yet supported")
+                # rgb = [self.convert_to_rgb(i) for i in rgb]
+                # depth = [self.convert_to_depth(i) for i in depth]  #TODO define convert_to_depth
+            if self.config.do_resize or target_res:
+                height, width = self.get_default_height_width(rgb[0], height, width) if not target_res else target_res
+                rgb = [self.resize(i, height, width) for i in rgb]
+                depth = [self.resize(i, height, width) for i in depth]
+            rgb = self.pil_to_numpy(rgb)  # to np
+            rgb = self.numpy_to_pt(rgb)  # to pt
+
+            depth = self.depth_pil_to_numpy(depth)  # to np
+            depth = self.numpy_to_pt(depth)  # to pt
+
+        elif isinstance(rgb[0], np.ndarray):
+            rgb = np.concatenate(rgb, axis=0) if rgb[0].ndim == 4 else np.stack(rgb, axis=0)
+            rgb = self.numpy_to_pt(rgb)
+            height, width = self.get_default_height_width(rgb, height, width)
+            if self.config.do_resize:
+                rgb = self.resize(rgb, height, width)
+
+            depth = np.concatenate(depth, axis=0) if rgb[0].ndim == 4 else np.stack(depth, axis=0)
+            depth = self.numpy_to_pt(depth)
+            height, width = self.get_default_height_width(depth, height, width)
+            if self.config.do_resize:
+                depth = self.resize(depth, height, width)
+
+        elif isinstance(rgb[0], torch.Tensor):
+            raise Exception("This is not yet supported")
+            # rgb = torch.cat(rgb, axis=0) if rgb[0].ndim == 4 else torch.stack(rgb, axis=0)
+
+            # if self.config.do_convert_grayscale and rgb.ndim == 3:
+            #     rgb = rgb.unsqueeze(1)
+
+            # channel = rgb.shape[1]
+
+            # height, width = self.get_default_height_width(rgb, height, width)
+            # if self.config.do_resize:
+            #     rgb = self.resize(rgb, height, width)
+
+            # depth = torch.cat(depth, axis=0) if depth[0].ndim == 4 else torch.stack(depth, axis=0)
+
+            # if self.config.do_convert_grayscale and depth.ndim == 3:
+            #     depth = depth.unsqueeze(1)
+
+            # channel = depth.shape[1]
+            # # don't need any preprocess if the image is latents
+            # if depth == 4:
+            #     return rgb, depth
+
+            # height, width = self.get_default_height_width(depth, height, width)
+            # if self.config.do_resize:
+            #     depth = self.resize(depth, height, width)
+        # expected range [0,1], normalize to [-1,1]
+        do_normalize = self.config.do_normalize
+        if rgb.min() < 0 and do_normalize:
+            warnings.warn(
+                "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
+                f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{rgb.min()},{rgb.max()}]",
+                FutureWarning,
+            )
+            do_normalize = False
+
+        if do_normalize:
+            rgb = self.normalize(rgb)
+            depth = self.normalize(depth)
+
+        if self.config.do_binarize:
+            rgb = self.binarize(rgb)
+            depth = self.binarize(depth)
+
+        return rgb, depth
+
+
+class IPAdapterMaskProcessor(VaeImageProcessor):
+    """
+    Image processor for IP Adapter image masks.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
+        vae_scale_factor (`int`, *optional*, defaults to `8`):
+            VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
+        resample (`str`, *optional*, defaults to `lanczos`):
+            Resampling filter to use when resizing the image.
+        do_normalize (`bool`, *optional*, defaults to `False`):
+            Whether to normalize the image to [-1,1].
+        do_binarize (`bool`, *optional*, defaults to `True`):
+            Whether to binarize the image to 0/1.
+        do_convert_grayscale (`bool`, *optional*, defaults to be `True`):
+            Whether to convert the images to grayscale format.
+
+    """
+
+    config_name = CONFIG_NAME
+
+    @register_to_config
+    def __init__(
+        self,
+        do_resize: bool = True,
+        vae_scale_factor: int = 8,
+        resample: str = "lanczos",
+        do_normalize: bool = False,
+        do_binarize: bool = True,
+        do_convert_grayscale: bool = True,
+    ):
+        super().__init__(
+            do_resize=do_resize,
+            vae_scale_factor=vae_scale_factor,
+            resample=resample,
+            do_normalize=do_normalize,
+            do_binarize=do_binarize,
+            do_convert_grayscale=do_convert_grayscale,
+        )
+
+    @staticmethod
+    def downsample(mask: torch.Tensor, batch_size: int, num_queries: int, value_embed_dim: int):
+        """
+        Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the
+        aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued.
+
+        Args:
+            mask (`torch.Tensor`):
+                The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`.
+            batch_size (`int`):
+                The batch size.
+            num_queries (`int`):
+                The number of queries.
+            value_embed_dim (`int`):
+                The dimensionality of the value embeddings.
+
+        Returns:
+            `torch.Tensor`:
+                The downsampled mask tensor.
+
+        """
+        o_h = mask.shape[1]
+        o_w = mask.shape[2]
+        ratio = o_w / o_h
+        mask_h = int(math.sqrt(num_queries / ratio))
+        mask_h = int(mask_h) + int((num_queries % int(mask_h)) != 0)
+        mask_w = num_queries // mask_h
+
+        mask_downsample = F.interpolate(mask.unsqueeze(0), size=(mask_h, mask_w), mode="bicubic").squeeze(0)
+
+        # Repeat batch_size times
+        if mask_downsample.shape[0] < batch_size:
+            mask_downsample = mask_downsample.repeat(batch_size, 1, 1)
+
+        mask_downsample = mask_downsample.view(mask_downsample.shape[0], -1)
+
+        downsampled_area = mask_h * mask_w
+        # If the output image and the mask do not have the same aspect ratio, tensor shapes will not match
+        # Pad tensor if downsampled_mask.shape[1] is smaller than num_queries
+        if downsampled_area < num_queries:
+            warnings.warn(
+                "The aspect ratio of the mask does not match the aspect ratio of the output image. "
+                "Please update your masks or adjust the output size for optimal performance.",
+                UserWarning,
+            )
+            mask_downsample = F.pad(mask_downsample, (0, num_queries - mask_downsample.shape[1]), value=0.0)
+        # Discard last embeddings if downsampled_mask.shape[1] is bigger than num_queries
+        if downsampled_area > num_queries:
+            warnings.warn(
+                "The aspect ratio of the mask does not match the aspect ratio of the output image. "
+                "Please update your masks or adjust the output size for optimal performance.",
+                UserWarning,
+            )
+            mask_downsample = mask_downsample[:, :num_queries]
+
+        # Repeat last dimension to match SDPA output shape
+        mask_downsample = mask_downsample.view(mask_downsample.shape[0], mask_downsample.shape[1], 1).repeat(
+            1, 1, value_embed_dim
+        )
+
+        return mask_downsample
+
+
+class PixArtImageProcessor(VaeImageProcessor):
+    """
+    Image processor for PixArt image resize and crop.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
+            `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
+        vae_scale_factor (`int`, *optional*, defaults to `8`):
+            VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
+        resample (`str`, *optional*, defaults to `lanczos`):
+            Resampling filter to use when resizing the image.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image to [-1,1].
+        do_binarize (`bool`, *optional*, defaults to `False`):
+            Whether to binarize the image to 0/1.
+        do_convert_rgb (`bool`, *optional*, defaults to be `False`):
+            Whether to convert the images to RGB format.
+        do_convert_grayscale (`bool`, *optional*, defaults to be `False`):
+            Whether to convert the images to grayscale format.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        do_resize: bool = True,
+        vae_scale_factor: int = 8,
+        resample: str = "lanczos",
+        do_normalize: bool = True,
+        do_binarize: bool = False,
+        do_convert_grayscale: bool = False,
+    ):
+        super().__init__(
+            do_resize=do_resize,
+            vae_scale_factor=vae_scale_factor,
+            resample=resample,
+            do_normalize=do_normalize,
+            do_binarize=do_binarize,
+            do_convert_grayscale=do_convert_grayscale,
+        )
+
+    @staticmethod
+    def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]:
+        """Returns binned height and width."""
+        ar = float(height / width)
+        closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
+        default_hw = ratios[closest_ratio]
+        return int(default_hw[0]), int(default_hw[1])
+
+    @staticmethod
+    def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int) -> torch.Tensor:
+        orig_height, orig_width = samples.shape[2], samples.shape[3]
+
+        # Check if resizing is needed
+        if orig_height != new_height or orig_width != new_width:
+            ratio = max(new_height / orig_height, new_width / orig_width)
+            resized_width = int(orig_width * ratio)
+            resized_height = int(orig_height * ratio)
+
+            # Resize
+            samples = F.interpolate(
+                samples, size=(resized_height, resized_width), mode="bilinear", align_corners=False
+            )
+
+            # Center Crop
+            start_x = (resized_width - new_width) // 2
+            end_x = start_x + new_width
+            start_y = (resized_height - new_height) // 2
+            end_y = start_y + new_height
+            samples = samples[:, :, start_y:end_y, start_x:end_x]
+
+        return samples
diff --git a/diffusers/src/diffusers/loaders/__init__.py b/diffusers/src/diffusers/loaders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf7212216845599cb5995024bd2202ba4ca6c584
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/__init__.py
@@ -0,0 +1,102 @@
+from typing import TYPE_CHECKING
+
+from ..utils import DIFFUSERS_SLOW_IMPORT, _LazyModule, deprecate
+from ..utils.import_utils import is_peft_available, is_torch_available, is_transformers_available
+
+
+def text_encoder_lora_state_dict(text_encoder):
+    deprecate(
+        "text_encoder_load_state_dict in `models`",
+        "0.27.0",
+        "`text_encoder_lora_state_dict` is deprecated and will be removed in 0.27.0. Make sure to retrieve the weights using `get_peft_model`. See https://huggingface.co/docs/peft/v0.6.2/en/quicktour#peftmodel for more information.",
+    )
+    state_dict = {}
+
+    for name, module in text_encoder_attn_modules(text_encoder):
+        for k, v in module.q_proj.lora_linear_layer.state_dict().items():
+            state_dict[f"{name}.q_proj.lora_linear_layer.{k}"] = v
+
+        for k, v in module.k_proj.lora_linear_layer.state_dict().items():
+            state_dict[f"{name}.k_proj.lora_linear_layer.{k}"] = v
+
+        for k, v in module.v_proj.lora_linear_layer.state_dict().items():
+            state_dict[f"{name}.v_proj.lora_linear_layer.{k}"] = v
+
+        for k, v in module.out_proj.lora_linear_layer.state_dict().items():
+            state_dict[f"{name}.out_proj.lora_linear_layer.{k}"] = v
+
+    return state_dict
+
+
+if is_transformers_available():
+
+    def text_encoder_attn_modules(text_encoder):
+        deprecate(
+            "text_encoder_attn_modules in `models`",
+            "0.27.0",
+            "`text_encoder_lora_state_dict` is deprecated and will be removed in 0.27.0. Make sure to retrieve the weights using `get_peft_model`. See https://huggingface.co/docs/peft/v0.6.2/en/quicktour#peftmodel for more information.",
+        )
+        from transformers import CLIPTextModel, CLIPTextModelWithProjection
+
+        attn_modules = []
+
+        if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
+            for i, layer in enumerate(text_encoder.text_model.encoder.layers):
+                name = f"text_model.encoder.layers.{i}.self_attn"
+                mod = layer.self_attn
+                attn_modules.append((name, mod))
+        else:
+            raise ValueError(f"do not know how to get attention modules for: {text_encoder.__class__.__name__}")
+
+        return attn_modules
+
+
+_import_structure = {}
+
+if is_torch_available():
+    _import_structure["single_file_model"] = ["FromOriginalModelMixin"]
+
+    _import_structure["unet"] = ["UNet2DConditionLoadersMixin"]
+    _import_structure["utils"] = ["AttnProcsLayers"]
+    if is_transformers_available():
+        _import_structure["single_file"] = ["FromSingleFileMixin"]
+        _import_structure["lora_pipeline"] = [
+            "AmusedLoraLoaderMixin",
+            "StableDiffusionLoraLoaderMixin",
+            "SD3LoraLoaderMixin",
+            "StableDiffusionXLLoraLoaderMixin",
+            "LoraLoaderMixin",
+            "FluxLoraLoaderMixin",
+            "CogVideoXLoraLoaderMixin",
+        ]
+        _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"]
+        _import_structure["ip_adapter"] = ["IPAdapterMixin"]
+
+_import_structure["peft"] = ["PeftAdapterMixin"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    if is_torch_available():
+        from .single_file_model import FromOriginalModelMixin
+        from .unet import UNet2DConditionLoadersMixin
+        from .utils import AttnProcsLayers
+
+        if is_transformers_available():
+            from .ip_adapter import IPAdapterMixin
+            from .lora_pipeline import (
+                AmusedLoraLoaderMixin,
+                CogVideoXLoraLoaderMixin,
+                FluxLoraLoaderMixin,
+                LoraLoaderMixin,
+                SD3LoraLoaderMixin,
+                StableDiffusionLoraLoaderMixin,
+                StableDiffusionXLLoraLoaderMixin,
+            )
+            from .single_file import FromSingleFileMixin
+            from .textual_inversion import TextualInversionLoaderMixin
+
+    from .peft import PeftAdapterMixin
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/diffusers/src/diffusers/loaders/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/loaders/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8dea8438b3a81150757ea9127c068e6f033617ad
Binary files /dev/null and b/diffusers/src/diffusers/loaders/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/loaders/__pycache__/lora_base.cpython-311.pyc b/diffusers/src/diffusers/loaders/__pycache__/lora_base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02934b86a81ee717ed11a23204163a74f8854bad
Binary files /dev/null and b/diffusers/src/diffusers/loaders/__pycache__/lora_base.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/loaders/__pycache__/lora_conversion_utils.cpython-311.pyc b/diffusers/src/diffusers/loaders/__pycache__/lora_conversion_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0242fb05a22ca82cac354d1bd0c99f89184caedd
Binary files /dev/null and b/diffusers/src/diffusers/loaders/__pycache__/lora_conversion_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/loaders/__pycache__/lora_pipeline.cpython-311.pyc b/diffusers/src/diffusers/loaders/__pycache__/lora_pipeline.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25cfae1c6c46a353a19e1d16ff793bfd92a7cd1e
Binary files /dev/null and b/diffusers/src/diffusers/loaders/__pycache__/lora_pipeline.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/loaders/__pycache__/peft.cpython-311.pyc b/diffusers/src/diffusers/loaders/__pycache__/peft.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e63a1a7fca8da13073e7d6629d582eb637baa69
Binary files /dev/null and b/diffusers/src/diffusers/loaders/__pycache__/peft.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/loaders/__pycache__/single_file_model.cpython-311.pyc b/diffusers/src/diffusers/loaders/__pycache__/single_file_model.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3188b1f0f5ad6123c8e8f3787a44ed55f9e684d3
Binary files /dev/null and b/diffusers/src/diffusers/loaders/__pycache__/single_file_model.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/loaders/__pycache__/single_file_utils.cpython-311.pyc b/diffusers/src/diffusers/loaders/__pycache__/single_file_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..285e550deabbc5c4c375f3f686548baaf0abf31e
Binary files /dev/null and b/diffusers/src/diffusers/loaders/__pycache__/single_file_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/loaders/__pycache__/unet.cpython-311.pyc b/diffusers/src/diffusers/loaders/__pycache__/unet.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5171655f1c91a25faf60ab1161396cb46fbb3aa4
Binary files /dev/null and b/diffusers/src/diffusers/loaders/__pycache__/unet.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/loaders/__pycache__/unet_loader_utils.cpython-311.pyc b/diffusers/src/diffusers/loaders/__pycache__/unet_loader_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad7d66e029888f38f2780d37dbaebdd36436bf1c
Binary files /dev/null and b/diffusers/src/diffusers/loaders/__pycache__/unet_loader_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/loaders/__pycache__/utils.cpython-311.pyc b/diffusers/src/diffusers/loaders/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0140d6e578d0e0af2e322cf1f295430c18b989a
Binary files /dev/null and b/diffusers/src/diffusers/loaders/__pycache__/utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/loaders/ip_adapter.py b/diffusers/src/diffusers/loaders/ip_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..1006dab9e4b97d98903b7bdb40a5e152b9134824
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/ip_adapter.py
@@ -0,0 +1,348 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from huggingface_hub.utils import validate_hf_hub_args
+from safetensors import safe_open
+
+from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_state_dict
+from ..utils import (
+    USE_PEFT_BACKEND,
+    _get_model_file,
+    is_accelerate_available,
+    is_torch_version,
+    is_transformers_available,
+    logging,
+)
+from .unet_loader_utils import _maybe_expand_lora_scales
+
+
+if is_transformers_available():
+    from transformers import (
+        CLIPImageProcessor,
+        CLIPVisionModelWithProjection,
+    )
+
+    from ..models.attention_processor import (
+        AttnProcessor,
+        AttnProcessor2_0,
+        IPAdapterAttnProcessor,
+        IPAdapterAttnProcessor2_0,
+    )
+
+logger = logging.get_logger(__name__)
+
+
+class IPAdapterMixin:
+    """Mixin for handling IP Adapters."""
+
+    @validate_hf_hub_args
+    def load_ip_adapter(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, torch.Tensor]],
+        subfolder: Union[str, List[str]],
+        weight_name: Union[str, List[str]],
+        image_encoder_folder: Optional[str] = "image_encoder",
+        **kwargs,
+    ):
+        """
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`):
+                Can be either:
+
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+            subfolder (`str` or `List[str]`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally. If a
+                list is passed, it should have the same length as `weight_name`.
+            weight_name (`str` or `List[str]`):
+                The name of the weight file to load. If a list is passed, it should have the same length as
+                `weight_name`.
+            image_encoder_folder (`str`, *optional*, defaults to `image_encoder`):
+                The subfolder location of the image encoder within a larger model repository on the Hub or locally.
+                Pass `None` to not load the image encoder. If the image encoder is located in a folder inside
+                `subfolder`, you only need to pass the name of the folder that contains image encoder weights, e.g.
+                `image_encoder_folder="image_encoder"`. If the image encoder is located in a folder other than
+                `subfolder`, you should pass the path to the folder that contains image encoder weights, for example,
+                `image_encoder_folder="different_subfolder/image_encoder"`.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
+        """
+
+        # handle the list inputs for multiple IP Adapters
+        if not isinstance(weight_name, list):
+            weight_name = [weight_name]
+
+        if not isinstance(pretrained_model_name_or_path_or_dict, list):
+            pretrained_model_name_or_path_or_dict = [pretrained_model_name_or_path_or_dict]
+        if len(pretrained_model_name_or_path_or_dict) == 1:
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict * len(weight_name)
+
+        if not isinstance(subfolder, list):
+            subfolder = [subfolder]
+        if len(subfolder) == 1:
+            subfolder = subfolder * len(weight_name)
+
+        if len(weight_name) != len(pretrained_model_name_or_path_or_dict):
+            raise ValueError("`weight_name` and `pretrained_model_name_or_path_or_dict` must have the same length.")
+
+        if len(weight_name) != len(subfolder):
+            raise ValueError("`weight_name` and `subfolder` must have the same length.")
+
+        # Load the main state dict first.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
+
+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warning(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+        state_dicts = []
+        for pretrained_model_name_or_path_or_dict, weight_name, subfolder in zip(
+            pretrained_model_name_or_path_or_dict, weight_name, subfolder
+        ):
+            if not isinstance(pretrained_model_name_or_path_or_dict, dict):
+                model_file = _get_model_file(
+                    pretrained_model_name_or_path_or_dict,
+                    weights_name=weight_name,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                    user_agent=user_agent,
+                )
+                if weight_name.endswith(".safetensors"):
+                    state_dict = {"image_proj": {}, "ip_adapter": {}}
+                    with safe_open(model_file, framework="pt", device="cpu") as f:
+                        for key in f.keys():
+                            if key.startswith("image_proj."):
+                                state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                            elif key.startswith("ip_adapter."):
+                                state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+                else:
+                    state_dict = load_state_dict(model_file)
+            else:
+                state_dict = pretrained_model_name_or_path_or_dict
+
+            keys = list(state_dict.keys())
+            if keys != ["image_proj", "ip_adapter"]:
+                raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")
+
+            state_dicts.append(state_dict)
+
+            # load CLIP image encoder here if it has not been registered to the pipeline yet
+            if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
+                if image_encoder_folder is not None:
+                    if not isinstance(pretrained_model_name_or_path_or_dict, dict):
+                        logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
+                        if image_encoder_folder.count("/") == 0:
+                            image_encoder_subfolder = Path(subfolder, image_encoder_folder).as_posix()
+                        else:
+                            image_encoder_subfolder = Path(image_encoder_folder).as_posix()
+
+                        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+                            pretrained_model_name_or_path_or_dict,
+                            subfolder=image_encoder_subfolder,
+                            low_cpu_mem_usage=low_cpu_mem_usage,
+                            cache_dir=cache_dir,
+                            local_files_only=local_files_only,
+                        ).to(self.device, dtype=self.dtype)
+                        self.register_modules(image_encoder=image_encoder)
+                    else:
+                        raise ValueError(
+                            "`image_encoder` cannot be loaded because `pretrained_model_name_or_path_or_dict` is a state dict."
+                        )
+                else:
+                    logger.warning(
+                        "image_encoder is not loaded since `image_encoder_folder=None` passed. You will not be able to use `ip_adapter_image` when calling the pipeline with IP-Adapter."
+                        "Use `ip_adapter_image_embeds` to pass pre-generated image embedding instead."
+                    )
+
+            # create feature extractor if it has not been registered to the pipeline yet
+            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
+                # FaceID IP adapters don't need the image encoder so it's not present, in this case we default to 224
+                default_clip_size = 224
+                clip_image_size = (
+                    self.image_encoder.config.image_size if self.image_encoder is not None else default_clip_size
+                )
+                feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)
+                self.register_modules(feature_extractor=feature_extractor)
+
+        # load ip-adapter into unet
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        unet._load_ip_adapter_weights(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage)
+
+        extra_loras = unet._load_ip_adapter_loras(state_dicts)
+        if extra_loras != {}:
+            if not USE_PEFT_BACKEND:
+                logger.warning("PEFT backend is required to load these weights.")
+            else:
+                # apply the IP Adapter Face ID LoRA weights
+                peft_config = getattr(unet, "peft_config", {})
+                for k, lora in extra_loras.items():
+                    if f"faceid_{k}" not in peft_config:
+                        self.load_lora_weights(lora, adapter_name=f"faceid_{k}")
+                        self.set_adapters([f"faceid_{k}"], adapter_weights=[1.0])
+
+    def set_ip_adapter_scale(self, scale):
+        """
+        Set IP-Adapter scales per-transformer block. Input `scale` could be a single config or a list of configs for
+        granular control over each IP-Adapter behavior. A config can be a float or a dictionary.
+
+        Example:
+
+        ```py
+        # To use original IP-Adapter
+        scale = 1.0
+        pipeline.set_ip_adapter_scale(scale)
+
+        # To use style block only
+        scale = {
+            "up": {"block_0": [0.0, 1.0, 0.0]},
+        }
+        pipeline.set_ip_adapter_scale(scale)
+
+        # To use style+layout blocks
+        scale = {
+            "down": {"block_2": [0.0, 1.0]},
+            "up": {"block_0": [0.0, 1.0, 0.0]},
+        }
+        pipeline.set_ip_adapter_scale(scale)
+
+        # To use style and layout from 2 reference images
+        scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}]
+        pipeline.set_ip_adapter_scale(scales)
+        ```
+        """
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        if not isinstance(scale, list):
+            scale = [scale]
+        scale_configs = _maybe_expand_lora_scales(unet, scale, default_scale=0.0)
+
+        for attn_name, attn_processor in unet.attn_processors.items():
+            if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
+                if len(scale_configs) != len(attn_processor.scale):
+                    raise ValueError(
+                        f"Cannot assign {len(scale_configs)} scale_configs to "
+                        f"{len(attn_processor.scale)} IP-Adapter."
+                    )
+                elif len(scale_configs) == 1:
+                    scale_configs = scale_configs * len(attn_processor.scale)
+                for i, scale_config in enumerate(scale_configs):
+                    if isinstance(scale_config, dict):
+                        for k, s in scale_config.items():
+                            if attn_name.startswith(k):
+                                attn_processor.scale[i] = s
+                    else:
+                        attn_processor.scale[i] = scale_config
+
+    def unload_ip_adapter(self):
+        """
+        Unloads the IP Adapter weights
+
+        Examples:
+
+        ```python
+        >>> # Assuming `pipeline` is already loaded with the IP Adapter weights.
+        >>> pipeline.unload_ip_adapter()
+        >>> ...
+        ```
+        """
+        # remove CLIP image encoder
+        if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is not None:
+            self.image_encoder = None
+            self.register_to_config(image_encoder=[None, None])
+
+        # remove feature extractor only when safety_checker is None as safety_checker uses
+        # the feature_extractor later
+        if not hasattr(self, "safety_checker"):
+            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is not None:
+                self.feature_extractor = None
+                self.register_to_config(feature_extractor=[None, None])
+
+        # remove hidden encoder
+        self.unet.encoder_hid_proj = None
+        self.unet.config.encoder_hid_dim_type = None
+
+        # Kolors: restore `encoder_hid_proj` with `text_encoder_hid_proj`
+        if hasattr(self.unet, "text_encoder_hid_proj") and self.unet.text_encoder_hid_proj is not None:
+            self.unet.encoder_hid_proj = self.unet.text_encoder_hid_proj
+            self.unet.text_encoder_hid_proj = None
+            self.unet.config.encoder_hid_dim_type = "text_proj"
+
+        # restore original Unet attention processors layers
+        attn_procs = {}
+        for name, value in self.unet.attn_processors.items():
+            attn_processor_class = (
+                AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnProcessor()
+            )
+            attn_procs[name] = (
+                attn_processor_class
+                if isinstance(value, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0))
+                else value.__class__()
+            )
+        self.unet.set_attn_processor(attn_procs)
diff --git a/diffusers/src/diffusers/loaders/lora_base.py b/diffusers/src/diffusers/loaders/lora_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..e124b6eeacf3bc7bcda1c9e797de60fcfad65e57
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/lora_base.py
@@ -0,0 +1,759 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import inspect
+import os
+from pathlib import Path
+from typing import Callable, Dict, List, Optional, Union
+
+import safetensors
+import torch
+import torch.nn as nn
+from huggingface_hub import model_info
+from huggingface_hub.constants import HF_HUB_OFFLINE
+
+from ..models.modeling_utils import ModelMixin, load_state_dict
+from ..utils import (
+    USE_PEFT_BACKEND,
+    _get_model_file,
+    delete_adapter_layers,
+    deprecate,
+    is_accelerate_available,
+    is_peft_available,
+    is_transformers_available,
+    logging,
+    recurse_remove_peft_layers,
+    set_adapter_layers,
+    set_weights_and_activate_adapters,
+)
+
+
+if is_transformers_available():
+    from transformers import PreTrainedModel
+
+if is_peft_available():
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+if is_accelerate_available():
+    from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module
+
+logger = logging.get_logger(__name__)
+
+
+def fuse_text_encoder_lora(text_encoder, lora_scale=1.0, safe_fusing=False, adapter_names=None):
+    """
+    Fuses LoRAs for the text encoder.
+
+    Args:
+        text_encoder (`torch.nn.Module`):
+            The text encoder module to set the adapter layers for. If `None`, it will try to get the `text_encoder`
+            attribute.
+        lora_scale (`float`, defaults to 1.0):
+            Controls how much to influence the outputs with the LoRA parameters.
+        safe_fusing (`bool`, defaults to `False`):
+            Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
+        adapter_names (`List[str]` or `str`):
+            The names of the adapters to use.
+    """
+    merge_kwargs = {"safe_merge": safe_fusing}
+
+    for module in text_encoder.modules():
+        if isinstance(module, BaseTunerLayer):
+            if lora_scale != 1.0:
+                module.scale_layer(lora_scale)
+
+            # For BC with previous PEFT versions, we need to check the signature
+            # of the `merge` method to see if it supports the `adapter_names` argument.
+            supported_merge_kwargs = list(inspect.signature(module.merge).parameters)
+            if "adapter_names" in supported_merge_kwargs:
+                merge_kwargs["adapter_names"] = adapter_names
+            elif "adapter_names" not in supported_merge_kwargs and adapter_names is not None:
+                raise ValueError(
+                    "The `adapter_names` argument is not supported with your PEFT version. "
+                    "Please upgrade to the latest version of PEFT. `pip install -U peft`"
+                )
+
+            module.merge(**merge_kwargs)
+
+
+def unfuse_text_encoder_lora(text_encoder):
+    """
+    Unfuses LoRAs for the text encoder.
+
+    Args:
+        text_encoder (`torch.nn.Module`):
+            The text encoder module to set the adapter layers for. If `None`, it will try to get the `text_encoder`
+            attribute.
+    """
+    for module in text_encoder.modules():
+        if isinstance(module, BaseTunerLayer):
+            module.unmerge()
+
+
+def set_adapters_for_text_encoder(
+    adapter_names: Union[List[str], str],
+    text_encoder: Optional["PreTrainedModel"] = None,  # noqa: F821
+    text_encoder_weights: Optional[Union[float, List[float], List[None]]] = None,
+):
+    """
+    Sets the adapter layers for the text encoder.
+
+    Args:
+        adapter_names (`List[str]` or `str`):
+            The names of the adapters to use.
+        text_encoder (`torch.nn.Module`, *optional*):
+            The text encoder module to set the adapter layers for. If `None`, it will try to get the `text_encoder`
+            attribute.
+        text_encoder_weights (`List[float]`, *optional*):
+            The weights to use for the text encoder. If `None`, the weights are set to `1.0` for all the adapters.
+    """
+    if text_encoder is None:
+        raise ValueError(
+            "The pipeline does not have a default `pipe.text_encoder` class. Please make sure to pass a `text_encoder` instead."
+        )
+
+    def process_weights(adapter_names, weights):
+        # Expand weights into a list, one entry per adapter
+        # e.g. for 2 adapters:  7 -> [7,7] ; [3, None] -> [3, None]
+        if not isinstance(weights, list):
+            weights = [weights] * len(adapter_names)
+
+        if len(adapter_names) != len(weights):
+            raise ValueError(
+                f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(weights)}"
+            )
+
+        # Set None values to default of 1.0
+        # e.g. [7,7] -> [7,7] ; [3, None] -> [3,1]
+        weights = [w if w is not None else 1.0 for w in weights]
+
+        return weights
+
+    adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names
+    text_encoder_weights = process_weights(adapter_names, text_encoder_weights)
+    set_weights_and_activate_adapters(text_encoder, adapter_names, text_encoder_weights)
+
+
+def disable_lora_for_text_encoder(text_encoder: Optional["PreTrainedModel"] = None):
+    """
+    Disables the LoRA layers for the text encoder.
+
+    Args:
+        text_encoder (`torch.nn.Module`, *optional*):
+            The text encoder module to disable the LoRA layers for. If `None`, it will try to get the `text_encoder`
+            attribute.
+    """
+    if text_encoder is None:
+        raise ValueError("Text Encoder not found.")
+    set_adapter_layers(text_encoder, enabled=False)
+
+
+def enable_lora_for_text_encoder(text_encoder: Optional["PreTrainedModel"] = None):
+    """
+    Enables the LoRA layers for the text encoder.
+
+    Args:
+        text_encoder (`torch.nn.Module`, *optional*):
+            The text encoder module to enable the LoRA layers for. If `None`, it will try to get the `text_encoder`
+            attribute.
+    """
+    if text_encoder is None:
+        raise ValueError("Text Encoder not found.")
+    set_adapter_layers(text_encoder, enabled=True)
+
+
+def _remove_text_encoder_monkey_patch(text_encoder):
+    recurse_remove_peft_layers(text_encoder)
+    if getattr(text_encoder, "peft_config", None) is not None:
+        del text_encoder.peft_config
+        text_encoder._hf_peft_config_loaded = None
+
+
+class LoraBaseMixin:
+    """Utility class for handling LoRAs."""
+
+    _lora_loadable_modules = []
+    num_fused_loras = 0
+
+    def load_lora_weights(self, **kwargs):
+        raise NotImplementedError("`load_lora_weights()` is not implemented.")
+
+    @classmethod
+    def save_lora_weights(cls, **kwargs):
+        raise NotImplementedError("`save_lora_weights()` not implemented.")
+
+    @classmethod
+    def lora_state_dict(cls, **kwargs):
+        raise NotImplementedError("`lora_state_dict()` is not implemented.")
+
+    @classmethod
+    def _optionally_disable_offloading(cls, _pipeline):
+        """
+        Optionally removes offloading in case the pipeline has been already sequentially offloaded to CPU.
+
+        Args:
+            _pipeline (`DiffusionPipeline`):
+                The pipeline to disable offloading for.
+
+        Returns:
+            tuple:
+                A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True.
+        """
+        is_model_cpu_offload = False
+        is_sequential_cpu_offload = False
+
+        if _pipeline is not None and _pipeline.hf_device_map is None:
+            for _, component in _pipeline.components.items():
+                if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
+                    if not is_model_cpu_offload:
+                        is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
+                    if not is_sequential_cpu_offload:
+                        is_sequential_cpu_offload = (
+                            isinstance(component._hf_hook, AlignDevicesHook)
+                            or hasattr(component._hf_hook, "hooks")
+                            and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+                        )
+
+                    logger.info(
+                        "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
+                    )
+                    remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
+
+        return (is_model_cpu_offload, is_sequential_cpu_offload)
+
+    @classmethod
+    def _fetch_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict,
+        weight_name,
+        use_safetensors,
+        local_files_only,
+        cache_dir,
+        force_download,
+        proxies,
+        token,
+        revision,
+        subfolder,
+        user_agent,
+        allow_pickle,
+    ):
+        from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE
+
+        model_file = None
+        if not isinstance(pretrained_model_name_or_path_or_dict, dict):
+            # Let's first try to load .safetensors weights
+            if (use_safetensors and weight_name is None) or (
+                weight_name is not None and weight_name.endswith(".safetensors")
+            ):
+                try:
+                    # Here we're relaxing the loading check to enable more Inference API
+                    # friendliness where sometimes, it's not at all possible to automatically
+                    # determine `weight_name`.
+                    if weight_name is None:
+                        weight_name = cls._best_guess_weight_name(
+                            pretrained_model_name_or_path_or_dict,
+                            file_extension=".safetensors",
+                            local_files_only=local_files_only,
+                        )
+                    model_file = _get_model_file(
+                        pretrained_model_name_or_path_or_dict,
+                        weights_name=weight_name or LORA_WEIGHT_NAME_SAFE,
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        proxies=proxies,
+                        local_files_only=local_files_only,
+                        token=token,
+                        revision=revision,
+                        subfolder=subfolder,
+                        user_agent=user_agent,
+                    )
+                    state_dict = safetensors.torch.load_file(model_file, device="cpu")
+                except (IOError, safetensors.SafetensorError) as e:
+                    if not allow_pickle:
+                        raise e
+                    # try loading non-safetensors weights
+                    model_file = None
+                    pass
+
+            if model_file is None:
+                if weight_name is None:
+                    weight_name = cls._best_guess_weight_name(
+                        pretrained_model_name_or_path_or_dict, file_extension=".bin", local_files_only=local_files_only
+                    )
+                model_file = _get_model_file(
+                    pretrained_model_name_or_path_or_dict,
+                    weights_name=weight_name or LORA_WEIGHT_NAME,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                    user_agent=user_agent,
+                )
+                state_dict = load_state_dict(model_file)
+        else:
+            state_dict = pretrained_model_name_or_path_or_dict
+
+        return state_dict
+
+    @classmethod
+    def _best_guess_weight_name(
+        cls, pretrained_model_name_or_path_or_dict, file_extension=".safetensors", local_files_only=False
+    ):
+        from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE
+
+        if local_files_only or HF_HUB_OFFLINE:
+            raise ValueError("When using the offline mode, you must specify a `weight_name`.")
+
+        targeted_files = []
+
+        if os.path.isfile(pretrained_model_name_or_path_or_dict):
+            return
+        elif os.path.isdir(pretrained_model_name_or_path_or_dict):
+            targeted_files = [
+                f for f in os.listdir(pretrained_model_name_or_path_or_dict) if f.endswith(file_extension)
+            ]
+        else:
+            files_in_repo = model_info(pretrained_model_name_or_path_or_dict).siblings
+            targeted_files = [f.rfilename for f in files_in_repo if f.rfilename.endswith(file_extension)]
+        if len(targeted_files) == 0:
+            return
+
+        # "scheduler" does not correspond to a LoRA checkpoint.
+        # "optimizer" does not correspond to a LoRA checkpoint
+        # only top-level checkpoints are considered and not the other ones, hence "checkpoint".
+        unallowed_substrings = {"scheduler", "optimizer", "checkpoint"}
+        targeted_files = list(
+            filter(lambda x: all(substring not in x for substring in unallowed_substrings), targeted_files)
+        )
+
+        if any(f.endswith(LORA_WEIGHT_NAME) for f in targeted_files):
+            targeted_files = list(filter(lambda x: x.endswith(LORA_WEIGHT_NAME), targeted_files))
+        elif any(f.endswith(LORA_WEIGHT_NAME_SAFE) for f in targeted_files):
+            targeted_files = list(filter(lambda x: x.endswith(LORA_WEIGHT_NAME_SAFE), targeted_files))
+
+        if len(targeted_files) > 1:
+            raise ValueError(
+                f"Provided path contains more than one weights file in the {file_extension} format. Either specify `weight_name` in `load_lora_weights` or make sure there's only one  `.safetensors` or `.bin` file in  {pretrained_model_name_or_path_or_dict}."
+            )
+        weight_name = targeted_files[0]
+        return weight_name
+
+    def unload_lora_weights(self):
+        """
+        Unloads the LoRA parameters.
+
+        Examples:
+
+        ```python
+        >>> # Assuming `pipeline` is already loaded with the LoRA parameters.
+        >>> pipeline.unload_lora_weights()
+        >>> ...
+        ```
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        for component in self._lora_loadable_modules:
+            model = getattr(self, component, None)
+            if model is not None:
+                if issubclass(model.__class__, ModelMixin):
+                    model.unload_lora()
+                elif issubclass(model.__class__, PreTrainedModel):
+                    _remove_text_encoder_monkey_patch(model)
+
+    def fuse_lora(
+        self,
+        components: List[str] = [],
+        lora_scale: float = 1.0,
+        safe_fusing: bool = False,
+        adapter_names: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        r"""
+        Fuses the LoRA parameters into the original parameters of the corresponding blocks.
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into.
+            lora_scale (`float`, defaults to 1.0):
+                Controls how much to influence the outputs with the LoRA parameters.
+            safe_fusing (`bool`, defaults to `False`):
+                Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
+            adapter_names (`List[str]`, *optional*):
+                Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.fuse_lora(lora_scale=0.7)
+        ```
+        """
+        if "fuse_unet" in kwargs:
+            depr_message = "Passing `fuse_unet` to `fuse_lora()` is deprecated and will be ignored. Please use the `components` argument and provide a list of the components whose LoRAs are to be fused. `fuse_unet` will be removed in a future version."
+            deprecate(
+                "fuse_unet",
+                "1.0.0",
+                depr_message,
+            )
+        if "fuse_transformer" in kwargs:
+            depr_message = "Passing `fuse_transformer` to `fuse_lora()` is deprecated and will be ignored. Please use the `components` argument and provide a list of the components whose LoRAs are to be fused. `fuse_transformer` will be removed in a future version."
+            deprecate(
+                "fuse_transformer",
+                "1.0.0",
+                depr_message,
+            )
+        if "fuse_text_encoder" in kwargs:
+            depr_message = "Passing `fuse_text_encoder` to `fuse_lora()` is deprecated and will be ignored. Please use the `components` argument and provide a list of the components whose LoRAs are to be fused. `fuse_text_encoder` will be removed in a future version."
+            deprecate(
+                "fuse_text_encoder",
+                "1.0.0",
+                depr_message,
+            )
+
+        if len(components) == 0:
+            raise ValueError("`components` cannot be an empty list.")
+
+        for fuse_component in components:
+            if fuse_component not in self._lora_loadable_modules:
+                raise ValueError(f"{fuse_component} is not found in {self._lora_loadable_modules=}.")
+
+            model = getattr(self, fuse_component, None)
+            if model is not None:
+                # check if diffusers model
+                if issubclass(model.__class__, ModelMixin):
+                    model.fuse_lora(lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names)
+                # handle transformers models.
+                if issubclass(model.__class__, PreTrainedModel):
+                    fuse_text_encoder_lora(
+                        model, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names
+                    )
+
+        self.num_fused_loras += 1
+
+    def unfuse_lora(self, components: List[str] = [], **kwargs):
+        r"""
+        Reverses the effect of
+        [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora).
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from.
+            unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters.
+            unfuse_text_encoder (`bool`, defaults to `True`):
+                Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
+                LoRA parameters then it won't have any effect.
+        """
+        if "unfuse_unet" in kwargs:
+            depr_message = "Passing `unfuse_unet` to `unfuse_lora()` is deprecated and will be ignored. Please use the `components` argument. `unfuse_unet` will be removed in a future version."
+            deprecate(
+                "unfuse_unet",
+                "1.0.0",
+                depr_message,
+            )
+        if "unfuse_transformer" in kwargs:
+            depr_message = "Passing `unfuse_transformer` to `unfuse_lora()` is deprecated and will be ignored. Please use the `components` argument. `unfuse_transformer` will be removed in a future version."
+            deprecate(
+                "unfuse_transformer",
+                "1.0.0",
+                depr_message,
+            )
+        if "unfuse_text_encoder" in kwargs:
+            depr_message = "Passing `unfuse_text_encoder` to `unfuse_lora()` is deprecated and will be ignored. Please use the `components` argument. `unfuse_text_encoder` will be removed in a future version."
+            deprecate(
+                "unfuse_text_encoder",
+                "1.0.0",
+                depr_message,
+            )
+
+        if len(components) == 0:
+            raise ValueError("`components` cannot be an empty list.")
+
+        for fuse_component in components:
+            if fuse_component not in self._lora_loadable_modules:
+                raise ValueError(f"{fuse_component} is not found in {self._lora_loadable_modules=}.")
+
+            model = getattr(self, fuse_component, None)
+            if model is not None:
+                if issubclass(model.__class__, (ModelMixin, PreTrainedModel)):
+                    for module in model.modules():
+                        if isinstance(module, BaseTunerLayer):
+                            module.unmerge()
+
+        self.num_fused_loras -= 1
+
+    def set_adapters(
+        self,
+        adapter_names: Union[List[str], str],
+        adapter_weights: Optional[Union[float, Dict, List[float], List[Dict]]] = None,
+    ):
+        adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names
+
+        adapter_weights = copy.deepcopy(adapter_weights)
+
+        # Expand weights into a list, one entry per adapter
+        if not isinstance(adapter_weights, list):
+            adapter_weights = [adapter_weights] * len(adapter_names)
+
+        if len(adapter_names) != len(adapter_weights):
+            raise ValueError(
+                f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(adapter_weights)}"
+            )
+
+        list_adapters = self.get_list_adapters()  # eg {"unet": ["adapter1", "adapter2"], "text_encoder": ["adapter2"]}
+        # eg ["adapter1", "adapter2"]
+        all_adapters = {adapter for adapters in list_adapters.values() for adapter in adapters}
+        missing_adapters = set(adapter_names) - all_adapters
+        if len(missing_adapters) > 0:
+            raise ValueError(
+                f"Adapter name(s) {missing_adapters} not in the list of present adapters: {all_adapters}."
+            )
+
+        # eg {"adapter1": ["unet"], "adapter2": ["unet", "text_encoder"]}
+        invert_list_adapters = {
+            adapter: [part for part, adapters in list_adapters.items() if adapter in adapters]
+            for adapter in all_adapters
+        }
+
+        # Decompose weights into weights for denoiser and text encoders.
+        _component_adapter_weights = {}
+        for component in self._lora_loadable_modules:
+            model = getattr(self, component)
+
+            for adapter_name, weights in zip(adapter_names, adapter_weights):
+                if isinstance(weights, dict):
+                    component_adapter_weights = weights.pop(component, None)
+
+                    if component_adapter_weights is not None and not hasattr(self, component):
+                        logger.warning(
+                            f"Lora weight dict contains {component} weights but will be ignored because pipeline does not have {component}."
+                        )
+
+                    if component_adapter_weights is not None and component not in invert_list_adapters[adapter_name]:
+                        logger.warning(
+                            (
+                                f"Lora weight dict for adapter '{adapter_name}' contains {component},"
+                                f"but this will be ignored because {adapter_name} does not contain weights for {component}."
+                                f"Valid parts for {adapter_name} are: {invert_list_adapters[adapter_name]}."
+                            )
+                        )
+
+                else:
+                    component_adapter_weights = weights
+
+                _component_adapter_weights.setdefault(component, [])
+                _component_adapter_weights[component].append(component_adapter_weights)
+
+            if issubclass(model.__class__, ModelMixin):
+                model.set_adapters(adapter_names, _component_adapter_weights[component])
+            elif issubclass(model.__class__, PreTrainedModel):
+                set_adapters_for_text_encoder(adapter_names, model, _component_adapter_weights[component])
+
+    def disable_lora(self):
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        for component in self._lora_loadable_modules:
+            model = getattr(self, component, None)
+            if model is not None:
+                if issubclass(model.__class__, ModelMixin):
+                    model.disable_lora()
+                elif issubclass(model.__class__, PreTrainedModel):
+                    disable_lora_for_text_encoder(model)
+
+    def enable_lora(self):
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        for component in self._lora_loadable_modules:
+            model = getattr(self, component, None)
+            if model is not None:
+                if issubclass(model.__class__, ModelMixin):
+                    model.enable_lora()
+                elif issubclass(model.__class__, PreTrainedModel):
+                    enable_lora_for_text_encoder(model)
+
+    def delete_adapters(self, adapter_names: Union[List[str], str]):
+        """
+        Args:
+        Deletes the LoRA layers of `adapter_name` for the unet and text-encoder(s).
+            adapter_names (`Union[List[str], str]`):
+                The names of the adapter to delete. Can be a single string or a list of strings
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        if isinstance(adapter_names, str):
+            adapter_names = [adapter_names]
+
+        for component in self._lora_loadable_modules:
+            model = getattr(self, component, None)
+            if model is not None:
+                if issubclass(model.__class__, ModelMixin):
+                    model.delete_adapters(adapter_names)
+                elif issubclass(model.__class__, PreTrainedModel):
+                    for adapter_name in adapter_names:
+                        delete_adapter_layers(model, adapter_name)
+
+    def get_active_adapters(self) -> List[str]:
+        """
+        Gets the list of the current active adapters.
+
+        Example:
+
+        ```python
+        from diffusers import DiffusionPipeline
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+        ).to("cuda")
+        pipeline.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
+        pipeline.get_active_adapters()
+        ```
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError(
+                "PEFT backend is required for this method. Please install the latest version of PEFT `pip install -U peft`"
+            )
+
+        active_adapters = []
+
+        for component in self._lora_loadable_modules:
+            model = getattr(self, component, None)
+            if model is not None and issubclass(model.__class__, ModelMixin):
+                for module in model.modules():
+                    if isinstance(module, BaseTunerLayer):
+                        active_adapters = module.active_adapters
+                        break
+
+        return active_adapters
+
+    def get_list_adapters(self) -> Dict[str, List[str]]:
+        """
+        Gets the current list of all available adapters in the pipeline.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError(
+                "PEFT backend is required for this method. Please install the latest version of PEFT `pip install -U peft`"
+            )
+
+        set_adapters = {}
+
+        for component in self._lora_loadable_modules:
+            model = getattr(self, component, None)
+            if (
+                model is not None
+                and issubclass(model.__class__, (ModelMixin, PreTrainedModel))
+                and hasattr(model, "peft_config")
+            ):
+                set_adapters[component] = list(model.peft_config.keys())
+
+        return set_adapters
+
+    def set_lora_device(self, adapter_names: List[str], device: Union[torch.device, str, int]) -> None:
+        """
+        Moves the LoRAs listed in `adapter_names` to a target device. Useful for offloading the LoRA to the CPU in case
+        you want to load multiple adapters and free some GPU memory.
+
+        Args:
+            adapter_names (`List[str]`):
+                List of adapters to send device to.
+            device (`Union[torch.device, str, int]`):
+                Device to send the adapters to. Can be either a torch device, a str or an integer.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        for component in self._lora_loadable_modules:
+            model = getattr(self, component, None)
+            if model is not None:
+                for module in model.modules():
+                    if isinstance(module, BaseTunerLayer):
+                        for adapter_name in adapter_names:
+                            module.lora_A[adapter_name].to(device)
+                            module.lora_B[adapter_name].to(device)
+                            # this is a param, not a module, so device placement is not in-place -> re-assign
+                            if hasattr(module, "lora_magnitude_vector") and module.lora_magnitude_vector is not None:
+                                if adapter_name in module.lora_magnitude_vector:
+                                    module.lora_magnitude_vector[adapter_name] = module.lora_magnitude_vector[
+                                        adapter_name
+                                    ].to(device)
+
+    @staticmethod
+    def pack_weights(layers, prefix):
+        layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers
+        layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()}
+        return layers_state_dict
+
+    @staticmethod
+    def write_lora_layers(
+        state_dict: Dict[str, torch.Tensor],
+        save_directory: str,
+        is_main_process: bool,
+        weight_name: str,
+        save_function: Callable,
+        safe_serialization: bool,
+    ):
+        from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE
+
+        if os.path.isfile(save_directory):
+            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+            return
+
+        if save_function is None:
+            if safe_serialization:
+
+                def save_function(weights, filename):
+                    return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"})
+
+            else:
+                save_function = torch.save
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        if weight_name is None:
+            if safe_serialization:
+                weight_name = LORA_WEIGHT_NAME_SAFE
+            else:
+                weight_name = LORA_WEIGHT_NAME
+
+        save_path = Path(save_directory, weight_name).as_posix()
+        save_function(state_dict, save_path)
+        logger.info(f"Model weights saved in {save_path}")
+
+    @property
+    def lora_scale(self) -> float:
+        # property function that returns the lora scale which can be set at run time by the pipeline.
+        # if _lora_scale has not been set, return 1
+        return self._lora_scale if hasattr(self, "_lora_scale") else 1.0
diff --git a/diffusers/src/diffusers/loaders/lora_conversion_utils.py b/diffusers/src/diffusers/loaders/lora_conversion_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6dea33e8e82d1634d3833d5cc3d9870a0e6590b
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/lora_conversion_utils.py
@@ -0,0 +1,623 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+import torch
+
+from ..utils import is_peft_version, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config, delimiter="_", block_slice_pos=5):
+    # 1. get all state_dict_keys
+    all_keys = list(state_dict.keys())
+    sgm_patterns = ["input_blocks", "middle_block", "output_blocks"]
+
+    # 2. check if needs remapping, if not return original dict
+    is_in_sgm_format = False
+    for key in all_keys:
+        if any(p in key for p in sgm_patterns):
+            is_in_sgm_format = True
+            break
+
+    if not is_in_sgm_format:
+        return state_dict
+
+    # 3. Else remap from SGM patterns
+    new_state_dict = {}
+    inner_block_map = ["resnets", "attentions", "upsamplers"]
+
+    # Retrieves # of down, mid and up blocks
+    input_block_ids, middle_block_ids, output_block_ids = set(), set(), set()
+
+    for layer in all_keys:
+        if "text" in layer:
+            new_state_dict[layer] = state_dict.pop(layer)
+        else:
+            layer_id = int(layer.split(delimiter)[:block_slice_pos][-1])
+            if sgm_patterns[0] in layer:
+                input_block_ids.add(layer_id)
+            elif sgm_patterns[1] in layer:
+                middle_block_ids.add(layer_id)
+            elif sgm_patterns[2] in layer:
+                output_block_ids.add(layer_id)
+            else:
+                raise ValueError(f"Checkpoint not supported because layer {layer} not supported.")
+
+    input_blocks = {
+        layer_id: [key for key in state_dict if f"input_blocks{delimiter}{layer_id}" in key]
+        for layer_id in input_block_ids
+    }
+    middle_blocks = {
+        layer_id: [key for key in state_dict if f"middle_block{delimiter}{layer_id}" in key]
+        for layer_id in middle_block_ids
+    }
+    output_blocks = {
+        layer_id: [key for key in state_dict if f"output_blocks{delimiter}{layer_id}" in key]
+        for layer_id in output_block_ids
+    }
+
+    # Rename keys accordingly
+    for i in input_block_ids:
+        block_id = (i - 1) // (unet_config.layers_per_block + 1)
+        layer_in_block_id = (i - 1) % (unet_config.layers_per_block + 1)
+
+        for key in input_blocks[i]:
+            inner_block_id = int(key.split(delimiter)[block_slice_pos])
+            inner_block_key = inner_block_map[inner_block_id] if "op" not in key else "downsamplers"
+            inner_layers_in_block = str(layer_in_block_id) if "op" not in key else "0"
+            new_key = delimiter.join(
+                key.split(delimiter)[: block_slice_pos - 1]
+                + [str(block_id), inner_block_key, inner_layers_in_block]
+                + key.split(delimiter)[block_slice_pos + 1 :]
+            )
+            new_state_dict[new_key] = state_dict.pop(key)
+
+    for i in middle_block_ids:
+        key_part = None
+        if i == 0:
+            key_part = [inner_block_map[0], "0"]
+        elif i == 1:
+            key_part = [inner_block_map[1], "0"]
+        elif i == 2:
+            key_part = [inner_block_map[0], "1"]
+        else:
+            raise ValueError(f"Invalid middle block id {i}.")
+
+        for key in middle_blocks[i]:
+            new_key = delimiter.join(
+                key.split(delimiter)[: block_slice_pos - 1] + key_part + key.split(delimiter)[block_slice_pos:]
+            )
+            new_state_dict[new_key] = state_dict.pop(key)
+
+    for i in output_block_ids:
+        block_id = i // (unet_config.layers_per_block + 1)
+        layer_in_block_id = i % (unet_config.layers_per_block + 1)
+
+        for key in output_blocks[i]:
+            inner_block_id = int(key.split(delimiter)[block_slice_pos])
+            inner_block_key = inner_block_map[inner_block_id]
+            inner_layers_in_block = str(layer_in_block_id) if inner_block_id < 2 else "0"
+            new_key = delimiter.join(
+                key.split(delimiter)[: block_slice_pos - 1]
+                + [str(block_id), inner_block_key, inner_layers_in_block]
+                + key.split(delimiter)[block_slice_pos + 1 :]
+            )
+            new_state_dict[new_key] = state_dict.pop(key)
+
+    if len(state_dict) > 0:
+        raise ValueError("At this point all state dict entries have to be converted.")
+
+    return new_state_dict
+
+
+def _convert_non_diffusers_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_name="text_encoder"):
+    """
+    Converts a non-Diffusers LoRA state dict to a Diffusers compatible state dict.
+
+    Args:
+        state_dict (`dict`): The state dict to convert.
+        unet_name (`str`, optional): The name of the U-Net module in the Diffusers model. Defaults to "unet".
+        text_encoder_name (`str`, optional): The name of the text encoder module in the Diffusers model. Defaults to
+            "text_encoder".
+
+    Returns:
+        `tuple`: A tuple containing the converted state dict and a dictionary of alphas.
+    """
+    unet_state_dict = {}
+    te_state_dict = {}
+    te2_state_dict = {}
+    network_alphas = {}
+
+    # Check for DoRA-enabled LoRAs.
+    dora_present_in_unet = any("dora_scale" in k and "lora_unet_" in k for k in state_dict)
+    dora_present_in_te = any("dora_scale" in k and ("lora_te_" in k or "lora_te1_" in k) for k in state_dict)
+    dora_present_in_te2 = any("dora_scale" in k and "lora_te2_" in k for k in state_dict)
+    if dora_present_in_unet or dora_present_in_te or dora_present_in_te2:
+        if is_peft_version("<", "0.9.0"):
+            raise ValueError(
+                "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+            )
+
+    # Iterate over all LoRA weights.
+    all_lora_keys = list(state_dict.keys())
+    for key in all_lora_keys:
+        if not key.endswith("lora_down.weight"):
+            continue
+
+        # Extract LoRA name.
+        lora_name = key.split(".")[0]
+
+        # Find corresponding up weight and alpha.
+        lora_name_up = lora_name + ".lora_up.weight"
+        lora_name_alpha = lora_name + ".alpha"
+
+        # Handle U-Net LoRAs.
+        if lora_name.startswith("lora_unet_"):
+            diffusers_name = _convert_unet_lora_key(key)
+
+            # Store down and up weights.
+            unet_state_dict[diffusers_name] = state_dict.pop(key)
+            unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+
+            # Store DoRA scale if present.
+            if dora_present_in_unet:
+                dora_scale_key_to_replace = "_lora.down." if "_lora.down." in diffusers_name else ".lora.down."
+                unet_state_dict[
+                    diffusers_name.replace(dora_scale_key_to_replace, ".lora_magnitude_vector.")
+                ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
+
+        # Handle text encoder LoRAs.
+        elif lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")):
+            diffusers_name = _convert_text_encoder_lora_key(key, lora_name)
+
+            # Store down and up weights for te or te2.
+            if lora_name.startswith(("lora_te_", "lora_te1_")):
+                te_state_dict[diffusers_name] = state_dict.pop(key)
+                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+            else:
+                te2_state_dict[diffusers_name] = state_dict.pop(key)
+                te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+
+            # Store DoRA scale if present.
+            if dora_present_in_te or dora_present_in_te2:
+                dora_scale_key_to_replace_te = (
+                    "_lora.down." if "_lora.down." in diffusers_name else ".lora_linear_layer."
+                )
+                if lora_name.startswith(("lora_te_", "lora_te1_")):
+                    te_state_dict[
+                        diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")
+                    ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
+                elif lora_name.startswith("lora_te2_"):
+                    te2_state_dict[
+                        diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")
+                    ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
+
+        # Store alpha if present.
+        if lora_name_alpha in state_dict:
+            alpha = state_dict.pop(lora_name_alpha).item()
+            network_alphas.update(_get_alpha_name(lora_name_alpha, diffusers_name, alpha))
+
+    # Check if any keys remain.
+    if len(state_dict) > 0:
+        raise ValueError(f"The following keys have not been correctly renamed: \n\n {', '.join(state_dict.keys())}")
+
+    logger.info("Non-diffusers checkpoint detected.")
+
+    # Construct final state dict.
+    unet_state_dict = {f"{unet_name}.{module_name}": params for module_name, params in unet_state_dict.items()}
+    te_state_dict = {f"{text_encoder_name}.{module_name}": params for module_name, params in te_state_dict.items()}
+    te2_state_dict = (
+        {f"text_encoder_2.{module_name}": params for module_name, params in te2_state_dict.items()}
+        if len(te2_state_dict) > 0
+        else None
+    )
+    if te2_state_dict is not None:
+        te_state_dict.update(te2_state_dict)
+
+    new_state_dict = {**unet_state_dict, **te_state_dict}
+    return new_state_dict, network_alphas
+
+
+def _convert_unet_lora_key(key):
+    """
+    Converts a U-Net LoRA key to a Diffusers compatible key.
+    """
+    diffusers_name = key.replace("lora_unet_", "").replace("_", ".")
+
+    # Replace common U-Net naming patterns.
+    diffusers_name = diffusers_name.replace("input.blocks", "down_blocks")
+    diffusers_name = diffusers_name.replace("down.blocks", "down_blocks")
+    diffusers_name = diffusers_name.replace("middle.block", "mid_block")
+    diffusers_name = diffusers_name.replace("mid.block", "mid_block")
+    diffusers_name = diffusers_name.replace("output.blocks", "up_blocks")
+    diffusers_name = diffusers_name.replace("up.blocks", "up_blocks")
+    diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks")
+    diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora")
+    diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora")
+    diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora")
+    diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora")
+    diffusers_name = diffusers_name.replace("proj.in", "proj_in")
+    diffusers_name = diffusers_name.replace("proj.out", "proj_out")
+    diffusers_name = diffusers_name.replace("emb.layers", "time_emb_proj")
+
+    # SDXL specific conversions.
+    if "emb" in diffusers_name and "time.emb.proj" not in diffusers_name:
+        pattern = r"\.\d+(?=\D*$)"
+        diffusers_name = re.sub(pattern, "", diffusers_name, count=1)
+    if ".in." in diffusers_name:
+        diffusers_name = diffusers_name.replace("in.layers.2", "conv1")
+    if ".out." in diffusers_name:
+        diffusers_name = diffusers_name.replace("out.layers.3", "conv2")
+    if "downsamplers" in diffusers_name or "upsamplers" in diffusers_name:
+        diffusers_name = diffusers_name.replace("op", "conv")
+    if "skip" in diffusers_name:
+        diffusers_name = diffusers_name.replace("skip.connection", "conv_shortcut")
+
+    # LyCORIS specific conversions.
+    if "time.emb.proj" in diffusers_name:
+        diffusers_name = diffusers_name.replace("time.emb.proj", "time_emb_proj")
+    if "conv.shortcut" in diffusers_name:
+        diffusers_name = diffusers_name.replace("conv.shortcut", "conv_shortcut")
+
+    # General conversions.
+    if "transformer_blocks" in diffusers_name:
+        if "attn1" in diffusers_name or "attn2" in diffusers_name:
+            diffusers_name = diffusers_name.replace("attn1", "attn1.processor")
+            diffusers_name = diffusers_name.replace("attn2", "attn2.processor")
+        elif "ff" in diffusers_name:
+            pass
+    elif any(key in diffusers_name for key in ("proj_in", "proj_out")):
+        pass
+    else:
+        pass
+
+    return diffusers_name
+
+
+def _convert_text_encoder_lora_key(key, lora_name):
+    """
+    Converts a text encoder LoRA key to a Diffusers compatible key.
+    """
+    if lora_name.startswith(("lora_te_", "lora_te1_")):
+        key_to_replace = "lora_te_" if lora_name.startswith("lora_te_") else "lora_te1_"
+    else:
+        key_to_replace = "lora_te2_"
+
+    diffusers_name = key.replace(key_to_replace, "").replace("_", ".")
+    diffusers_name = diffusers_name.replace("text.model", "text_model")
+    diffusers_name = diffusers_name.replace("self.attn", "self_attn")
+    diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
+    diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
+    diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
+    diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
+    diffusers_name = diffusers_name.replace("text.projection", "text_projection")
+
+    if "self_attn" in diffusers_name or "text_projection" in diffusers_name:
+        pass
+    elif "mlp" in diffusers_name:
+        # Be aware that this is the new diffusers convention and the rest of the code might
+        # not utilize it yet.
+        diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
+    return diffusers_name
+
+
+def _get_alpha_name(lora_name_alpha, diffusers_name, alpha):
+    """
+    Gets the correct alpha name for the Diffusers model.
+    """
+    if lora_name_alpha.startswith("lora_unet_"):
+        prefix = "unet."
+    elif lora_name_alpha.startswith(("lora_te_", "lora_te1_")):
+        prefix = "text_encoder."
+    else:
+        prefix = "text_encoder_2."
+    new_name = prefix + diffusers_name.split(".lora.")[0] + ".alpha"
+    return {new_name: alpha}
+
+
+# The utilities under `_convert_kohya_flux_lora_to_diffusers()`
+# are taken from https://github.com/kohya-ss/sd-scripts/blob/a61cf73a5cb5209c3f4d1a3688dd276a4dfd1ecb/networks/convert_flux_lora.py
+# All credits go to `kohya-ss`.
+def _convert_kohya_flux_lora_to_diffusers(state_dict):
+    def _convert_to_ai_toolkit(sds_sd, ait_sd, sds_key, ait_key):
+        if sds_key + ".lora_down.weight" not in sds_sd:
+            return
+        down_weight = sds_sd.pop(sds_key + ".lora_down.weight")
+
+        # scale weight by alpha and dim
+        rank = down_weight.shape[0]
+        alpha = sds_sd.pop(sds_key + ".alpha").item()  # alpha is scalar
+        scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
+
+        # calculate scale_down and scale_up to keep the same value. if scale is 4, scale_down is 2 and scale_up is 2
+        scale_down = scale
+        scale_up = 1.0
+        while scale_down * 2 < scale_up:
+            scale_down *= 2
+            scale_up /= 2
+
+        ait_sd[ait_key + ".lora_A.weight"] = down_weight * scale_down
+        ait_sd[ait_key + ".lora_B.weight"] = sds_sd.pop(sds_key + ".lora_up.weight") * scale_up
+
+    def _convert_to_ai_toolkit_cat(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
+        if sds_key + ".lora_down.weight" not in sds_sd:
+            return
+        down_weight = sds_sd.pop(sds_key + ".lora_down.weight")
+        up_weight = sds_sd.pop(sds_key + ".lora_up.weight")
+        sd_lora_rank = down_weight.shape[0]
+
+        # scale weight by alpha and dim
+        alpha = sds_sd.pop(sds_key + ".alpha")
+        scale = alpha / sd_lora_rank
+
+        # calculate scale_down and scale_up
+        scale_down = scale
+        scale_up = 1.0
+        while scale_down * 2 < scale_up:
+            scale_down *= 2
+            scale_up /= 2
+
+        down_weight = down_weight * scale_down
+        up_weight = up_weight * scale_up
+
+        # calculate dims if not provided
+        num_splits = len(ait_keys)
+        if dims is None:
+            dims = [up_weight.shape[0] // num_splits] * num_splits
+        else:
+            assert sum(dims) == up_weight.shape[0]
+
+        # check upweight is sparse or not
+        is_sparse = False
+        if sd_lora_rank % num_splits == 0:
+            ait_rank = sd_lora_rank // num_splits
+            is_sparse = True
+            i = 0
+            for j in range(len(dims)):
+                for k in range(len(dims)):
+                    if j == k:
+                        continue
+                    is_sparse = is_sparse and torch.all(
+                        up_weight[i : i + dims[j], k * ait_rank : (k + 1) * ait_rank] == 0
+                    )
+                i += dims[j]
+            if is_sparse:
+                logger.info(f"weight is sparse: {sds_key}")
+
+        # make ai-toolkit weight
+        ait_down_keys = [k + ".lora_A.weight" for k in ait_keys]
+        ait_up_keys = [k + ".lora_B.weight" for k in ait_keys]
+        if not is_sparse:
+            # down_weight is copied to each split
+            ait_sd.update({k: down_weight for k in ait_down_keys})
+
+            # up_weight is split to each split
+            ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))})  # noqa: C416
+        else:
+            # down_weight is chunked to each split
+            ait_sd.update({k: v for k, v in zip(ait_down_keys, torch.chunk(down_weight, num_splits, dim=0))})  # noqa: C416
+
+            # up_weight is sparse: only non-zero values are copied to each split
+            i = 0
+            for j in range(len(dims)):
+                ait_sd[ait_up_keys[j]] = up_weight[i : i + dims[j], j * ait_rank : (j + 1) * ait_rank].contiguous()
+                i += dims[j]
+
+    def _convert_sd_scripts_to_ai_toolkit(sds_sd):
+        ait_sd = {}
+        for i in range(19):
+            _convert_to_ai_toolkit(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_double_blocks_{i}_img_attn_proj",
+                f"transformer.transformer_blocks.{i}.attn.to_out.0",
+            )
+            _convert_to_ai_toolkit_cat(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_double_blocks_{i}_img_attn_qkv",
+                [
+                    f"transformer.transformer_blocks.{i}.attn.to_q",
+                    f"transformer.transformer_blocks.{i}.attn.to_k",
+                    f"transformer.transformer_blocks.{i}.attn.to_v",
+                ],
+            )
+            _convert_to_ai_toolkit(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_double_blocks_{i}_img_mlp_0",
+                f"transformer.transformer_blocks.{i}.ff.net.0.proj",
+            )
+            _convert_to_ai_toolkit(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_double_blocks_{i}_img_mlp_2",
+                f"transformer.transformer_blocks.{i}.ff.net.2",
+            )
+            _convert_to_ai_toolkit(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_double_blocks_{i}_img_mod_lin",
+                f"transformer.transformer_blocks.{i}.norm1.linear",
+            )
+            _convert_to_ai_toolkit(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_double_blocks_{i}_txt_attn_proj",
+                f"transformer.transformer_blocks.{i}.attn.to_add_out",
+            )
+            _convert_to_ai_toolkit_cat(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_double_blocks_{i}_txt_attn_qkv",
+                [
+                    f"transformer.transformer_blocks.{i}.attn.add_q_proj",
+                    f"transformer.transformer_blocks.{i}.attn.add_k_proj",
+                    f"transformer.transformer_blocks.{i}.attn.add_v_proj",
+                ],
+            )
+            _convert_to_ai_toolkit(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_double_blocks_{i}_txt_mlp_0",
+                f"transformer.transformer_blocks.{i}.ff_context.net.0.proj",
+            )
+            _convert_to_ai_toolkit(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_double_blocks_{i}_txt_mlp_2",
+                f"transformer.transformer_blocks.{i}.ff_context.net.2",
+            )
+            _convert_to_ai_toolkit(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_double_blocks_{i}_txt_mod_lin",
+                f"transformer.transformer_blocks.{i}.norm1_context.linear",
+            )
+
+        for i in range(38):
+            _convert_to_ai_toolkit_cat(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_single_blocks_{i}_linear1",
+                [
+                    f"transformer.single_transformer_blocks.{i}.attn.to_q",
+                    f"transformer.single_transformer_blocks.{i}.attn.to_k",
+                    f"transformer.single_transformer_blocks.{i}.attn.to_v",
+                    f"transformer.single_transformer_blocks.{i}.proj_mlp",
+                ],
+                dims=[3072, 3072, 3072, 12288],
+            )
+            _convert_to_ai_toolkit(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_single_blocks_{i}_linear2",
+                f"transformer.single_transformer_blocks.{i}.proj_out",
+            )
+            _convert_to_ai_toolkit(
+                sds_sd,
+                ait_sd,
+                f"lora_unet_single_blocks_{i}_modulation_lin",
+                f"transformer.single_transformer_blocks.{i}.norm.linear",
+            )
+
+        if len(sds_sd) > 0:
+            logger.warning(f"Unsuppored keys for ai-toolkit: {sds_sd.keys()}")
+
+        return ait_sd
+
+    return _convert_sd_scripts_to_ai_toolkit(state_dict)
+
+
+# Adapted from https://gist.github.com/Leommm-byte/6b331a1e9bd53271210b26543a7065d6
+# Some utilities were reused from
+# https://github.com/kohya-ss/sd-scripts/blob/a61cf73a5cb5209c3f4d1a3688dd276a4dfd1ecb/networks/convert_flux_lora.py
+def _convert_xlabs_flux_lora_to_diffusers(old_state_dict):
+    new_state_dict = {}
+    orig_keys = list(old_state_dict.keys())
+
+    def handle_qkv(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
+        down_weight = sds_sd.pop(sds_key)
+        up_weight = sds_sd.pop(sds_key.replace(".down.weight", ".up.weight"))
+
+        # calculate dims if not provided
+        num_splits = len(ait_keys)
+        if dims is None:
+            dims = [up_weight.shape[0] // num_splits] * num_splits
+        else:
+            assert sum(dims) == up_weight.shape[0]
+
+        # make ai-toolkit weight
+        ait_down_keys = [k + ".lora_A.weight" for k in ait_keys]
+        ait_up_keys = [k + ".lora_B.weight" for k in ait_keys]
+
+        # down_weight is copied to each split
+        ait_sd.update({k: down_weight for k in ait_down_keys})
+
+        # up_weight is split to each split
+        ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))})  # noqa: C416
+
+    for old_key in orig_keys:
+        # Handle double_blocks
+        if old_key.startswith(("diffusion_model.double_blocks", "double_blocks")):
+            block_num = re.search(r"double_blocks\.(\d+)", old_key).group(1)
+            new_key = f"transformer.transformer_blocks.{block_num}"
+
+            if "processor.proj_lora1" in old_key:
+                new_key += ".attn.to_out.0"
+            elif "processor.proj_lora2" in old_key:
+                new_key += ".attn.to_add_out"
+            # Handle text latents.
+            elif "processor.qkv_lora2" in old_key and "up" not in old_key:
+                handle_qkv(
+                    old_state_dict,
+                    new_state_dict,
+                    old_key,
+                    [
+                        f"transformer.transformer_blocks.{block_num}.attn.add_q_proj",
+                        f"transformer.transformer_blocks.{block_num}.attn.add_k_proj",
+                        f"transformer.transformer_blocks.{block_num}.attn.add_v_proj",
+                    ],
+                )
+                # continue
+            # Handle image latents.
+            elif "processor.qkv_lora1" in old_key and "up" not in old_key:
+                handle_qkv(
+                    old_state_dict,
+                    new_state_dict,
+                    old_key,
+                    [
+                        f"transformer.transformer_blocks.{block_num}.attn.to_q",
+                        f"transformer.transformer_blocks.{block_num}.attn.to_k",
+                        f"transformer.transformer_blocks.{block_num}.attn.to_v",
+                    ],
+                )
+                # continue
+
+            if "down" in old_key:
+                new_key += ".lora_A.weight"
+            elif "up" in old_key:
+                new_key += ".lora_B.weight"
+
+        # Handle single_blocks
+        elif old_key.startswith("diffusion_model.single_blocks", "single_blocks"):
+            block_num = re.search(r"single_blocks\.(\d+)", old_key).group(1)
+            new_key = f"transformer.single_transformer_blocks.{block_num}"
+
+            if "proj_lora1" in old_key or "proj_lora2" in old_key:
+                new_key += ".proj_out"
+            elif "qkv_lora1" in old_key or "qkv_lora2" in old_key:
+                new_key += ".norm.linear"
+
+            if "down" in old_key:
+                new_key += ".lora_A.weight"
+            elif "up" in old_key:
+                new_key += ".lora_B.weight"
+
+        else:
+            # Handle other potential key patterns here
+            new_key = old_key
+
+        # Since we already handle qkv above.
+        if "qkv" not in old_key:
+            new_state_dict[new_key] = old_state_dict.pop(old_key)
+
+    if len(old_state_dict) > 0:
+        raise ValueError(f"`old_state_dict` should be at this point but has: {list(old_state_dict.keys())}.")
+
+    return new_state_dict
diff --git a/diffusers/src/diffusers/loaders/lora_pipeline.py b/diffusers/src/diffusers/loaders/lora_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba1435a8cbdc6d8d349bbbf1fee410fc7c77f072
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/lora_pipeline.py
@@ -0,0 +1,2616 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Callable, Dict, List, Optional, Union
+
+import torch
+from huggingface_hub.utils import validate_hf_hub_args
+
+from ..utils import (
+    USE_PEFT_BACKEND,
+    convert_state_dict_to_diffusers,
+    convert_state_dict_to_peft,
+    convert_unet_state_dict_to_peft,
+    deprecate,
+    get_adapter_name,
+    get_peft_kwargs,
+    is_peft_version,
+    is_transformers_available,
+    logging,
+    scale_lora_layers,
+)
+from .lora_base import LoraBaseMixin
+from .lora_conversion_utils import (
+    _convert_kohya_flux_lora_to_diffusers,
+    _convert_non_diffusers_lora_to_diffusers,
+    _convert_xlabs_flux_lora_to_diffusers,
+    _maybe_map_sgm_blocks_to_diffusers,
+)
+
+
+if is_transformers_available():
+    from ..models.lora import text_encoder_attn_modules, text_encoder_mlp_modules
+
+logger = logging.get_logger(__name__)
+
+TEXT_ENCODER_NAME = "text_encoder"
+UNET_NAME = "unet"
+TRANSFORMER_NAME = "transformer"
+
+LORA_WEIGHT_NAME = "pytorch_lora_weights.bin"
+LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors"
+
+
+class StableDiffusionLoraLoaderMixin(LoraBaseMixin):
+    r"""
+    Load LoRA layers into Stable Diffusion [`UNet2DConditionModel`] and
+    [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel).
+    """
+
+    _lora_loadable_modules = ["unet", "text_encoder"]
+    unet_name = UNET_NAME
+    text_encoder_name = TEXT_ENCODER_NAME
+
+    def load_lora_weights(
+        self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs
+    ):
+        """
+        Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and
+        `self.text_encoder`.
+
+        All kwargs are forwarded to `self.lora_state_dict`.
+
+        See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is
+        loaded.
+
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is
+        loaded into `self.unet`.
+
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state
+        dict is loaded into `self.text_encoder`.
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+            kwargs (`dict`, *optional*):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        # if a dict is passed, copy it instead of modifying it inplace
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
+
+        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
+        state_dict, network_alphas = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
+
+        is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys())
+        if not is_correct_format:
+            raise ValueError("Invalid LoRA checkpoint.")
+
+        self.load_lora_into_unet(
+            state_dict,
+            network_alphas=network_alphas,
+            unet=getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet,
+            adapter_name=adapter_name,
+            _pipeline=self,
+        )
+        self.load_lora_into_text_encoder(
+            state_dict,
+            network_alphas=network_alphas,
+            text_encoder=getattr(self, self.text_encoder_name)
+            if not hasattr(self, "text_encoder")
+            else self.text_encoder,
+            lora_scale=self.lora_scale,
+            adapter_name=adapter_name,
+            _pipeline=self,
+        )
+
+    @classmethod
+    @validate_hf_hub_args
+    def lora_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        r"""
+        Return state dict for lora weights and the network alphas.
+
+        <Tip warning={true}>
+
+        We support loading A1111 formatted LoRA checkpoints in a limited capacity.
+
+        This function is experimental and might change in the future.
+
+        </Tip>
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                Can be either:
+
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            weight_name (`str`, *optional*, defaults to None):
+                Name of the serialized state dict file.
+        """
+        # Load the main state dict first which has the LoRA layers for either of
+        # UNet and text encoder or both.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        unet_config = kwargs.pop("unet_config", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+
+        state_dict = cls._fetch_state_dict(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            weight_name=weight_name,
+            use_safetensors=use_safetensors,
+            local_files_only=local_files_only,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            allow_pickle=allow_pickle,
+        )
+
+        network_alphas = None
+        # TODO: replace it with a method from `state_dict_utils`
+        if all(
+            (
+                k.startswith("lora_te_")
+                or k.startswith("lora_unet_")
+                or k.startswith("lora_te1_")
+                or k.startswith("lora_te2_")
+            )
+            for k in state_dict.keys()
+        ):
+            # Map SDXL blocks correctly.
+            if unet_config is not None:
+                # use unet config to remap block numbers
+                state_dict = _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config)
+            state_dict, network_alphas = _convert_non_diffusers_lora_to_diffusers(state_dict)
+
+        return state_dict, network_alphas
+
+    @classmethod
+    def load_lora_into_unet(cls, state_dict, network_alphas, unet, adapter_name=None, _pipeline=None):
+        """
+        This will load the LoRA layers specified in `state_dict` into `unet`.
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The keys can either be indexed directly
+                into the unet or prefixed with an additional `unet` which can be used to distinguish between text
+                encoder lora layers.
+            network_alphas (`Dict[str, float]`):
+                The value of the network alpha used for stable learning and preventing underflow. This value has the
+                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
+            unet (`UNet2DConditionModel`):
+                The UNet model to load the LoRA layers into.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
+        # then the `state_dict` keys should have `cls.unet_name` and/or `cls.text_encoder_name` as
+        # their prefixes.
+        keys = list(state_dict.keys())
+        only_text_encoder = all(key.startswith(cls.text_encoder_name) for key in keys)
+        if not only_text_encoder:
+            # Load the layers corresponding to UNet.
+            logger.info(f"Loading {cls.unet_name}.")
+            unet.load_attn_procs(
+                state_dict, network_alphas=network_alphas, adapter_name=adapter_name, _pipeline=_pipeline
+            )
+
+    @classmethod
+    def load_lora_into_text_encoder(
+        cls,
+        state_dict,
+        network_alphas,
+        text_encoder,
+        prefix=None,
+        lora_scale=1.0,
+        adapter_name=None,
+        _pipeline=None,
+    ):
+        """
+        This will load the LoRA layers specified in `state_dict` into `text_encoder`
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The key should be prefixed with an
+                additional `text_encoder` to distinguish between unet lora layers.
+            network_alphas (`Dict[str, float]`):
+                The value of the network alpha used for stable learning and preventing underflow. This value has the
+                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
+            text_encoder (`CLIPTextModel`):
+                The text encoder model to load the LoRA layers into.
+            prefix (`str`):
+                Expected prefix of the `text_encoder` in the `state_dict`.
+            lora_scale (`float`):
+                How much to scale the output of the lora linear layer before it is added with the output of the regular
+                lora layer.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        from peft import LoraConfig
+
+        # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
+        # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
+        # their prefixes.
+        keys = list(state_dict.keys())
+        prefix = cls.text_encoder_name if prefix is None else prefix
+
+        # Safe prefix to check with.
+        if any(cls.text_encoder_name in key for key in keys):
+            # Load the layers corresponding to text encoder and make necessary adjustments.
+            text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix]
+            text_encoder_lora_state_dict = {
+                k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
+            }
+
+            if len(text_encoder_lora_state_dict) > 0:
+                logger.info(f"Loading {prefix}.")
+                rank = {}
+                text_encoder_lora_state_dict = convert_state_dict_to_diffusers(text_encoder_lora_state_dict)
+
+                # convert state dict
+                text_encoder_lora_state_dict = convert_state_dict_to_peft(text_encoder_lora_state_dict)
+
+                for name, _ in text_encoder_attn_modules(text_encoder):
+                    for module in ("out_proj", "q_proj", "k_proj", "v_proj"):
+                        rank_key = f"{name}.{module}.lora_B.weight"
+                        if rank_key not in text_encoder_lora_state_dict:
+                            continue
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]
+
+                for name, _ in text_encoder_mlp_modules(text_encoder):
+                    for module in ("fc1", "fc2"):
+                        rank_key = f"{name}.{module}.lora_B.weight"
+                        if rank_key not in text_encoder_lora_state_dict:
+                            continue
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]
+
+                if network_alphas is not None:
+                    alpha_keys = [
+                        k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix
+                    ]
+                    network_alphas = {
+                        k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
+                    }
+
+                lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
+                if "use_dora" in lora_config_kwargs:
+                    if lora_config_kwargs["use_dora"]:
+                        if is_peft_version("<", "0.9.0"):
+                            raise ValueError(
+                                "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                            )
+                    else:
+                        if is_peft_version("<", "0.9.0"):
+                            lora_config_kwargs.pop("use_dora")
+                lora_config = LoraConfig(**lora_config_kwargs)
+
+                # adapter_name
+                if adapter_name is None:
+                    adapter_name = get_adapter_name(text_encoder)
+
+                is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline)
+
+                # inject LoRA layers and load the state dict
+                # in transformers we automatically check whether the adapter name is already in use or not
+                text_encoder.load_adapter(
+                    adapter_name=adapter_name,
+                    adapter_state_dict=text_encoder_lora_state_dict,
+                    peft_config=lora_config,
+                )
+
+                # scale LoRA layers with `lora_scale`
+                scale_lora_layers(text_encoder, weight=lora_scale)
+
+                text_encoder.to(device=text_encoder.device, dtype=text_encoder.dtype)
+
+                # Offload back.
+                if is_model_cpu_offload:
+                    _pipeline.enable_model_cpu_offload()
+                elif is_sequential_cpu_offload:
+                    _pipeline.enable_sequential_cpu_offload()
+                # Unsafe code />
+
+    @classmethod
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        text_encoder_lora_layers: Dict[str, torch.nn.Module] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+    ):
+        r"""
+        Save the LoRA parameters corresponding to the UNet and text encoder.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save LoRA parameters to. Will be created if it doesn't exist.
+            unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `unet`.
+            text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text
+                encoder LoRA state dict because it comes from 🤗 Transformers.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
+        """
+        state_dict = {}
+
+        if not (unet_lora_layers or text_encoder_lora_layers):
+            raise ValueError("You must pass at least one of `unet_lora_layers` and `text_encoder_lora_layers`.")
+
+        if unet_lora_layers:
+            state_dict.update(cls.pack_weights(unet_lora_layers, cls.unet_name))
+
+        if text_encoder_lora_layers:
+            state_dict.update(cls.pack_weights(text_encoder_lora_layers, cls.text_encoder_name))
+
+        # Save the model
+        cls.write_lora_layers(
+            state_dict=state_dict,
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+
+    def fuse_lora(
+        self,
+        components: List[str] = ["unet", "text_encoder"],
+        lora_scale: float = 1.0,
+        safe_fusing: bool = False,
+        adapter_names: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        r"""
+        Fuses the LoRA parameters into the original parameters of the corresponding blocks.
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into.
+            lora_scale (`float`, defaults to 1.0):
+                Controls how much to influence the outputs with the LoRA parameters.
+            safe_fusing (`bool`, defaults to `False`):
+                Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
+            adapter_names (`List[str]`, *optional*):
+                Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.fuse_lora(lora_scale=0.7)
+        ```
+        """
+        super().fuse_lora(
+            components=components, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names
+        )
+
+    def unfuse_lora(self, components: List[str] = ["unet", "text_encoder"], **kwargs):
+        r"""
+        Reverses the effect of
+        [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora).
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from.
+            unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters.
+            unfuse_text_encoder (`bool`, defaults to `True`):
+                Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
+                LoRA parameters then it won't have any effect.
+        """
+        super().unfuse_lora(components=components)
+
+
+class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin):
+    r"""
+    Load LoRA layers into Stable Diffusion XL [`UNet2DConditionModel`],
+    [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), and
+    [`CLIPTextModelWithProjection`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection).
+    """
+
+    _lora_loadable_modules = ["unet", "text_encoder", "text_encoder_2"]
+    unet_name = UNET_NAME
+    text_encoder_name = TEXT_ENCODER_NAME
+
+    def load_lora_weights(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        adapter_name: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and
+        `self.text_encoder`.
+
+        All kwargs are forwarded to `self.lora_state_dict`.
+
+        See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is
+        loaded.
+
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is
+        loaded into `self.unet`.
+
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state
+        dict is loaded into `self.text_encoder`.
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+            kwargs (`dict`, *optional*):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        # We could have accessed the unet config from `lora_state_dict()` too. We pass
+        # it here explicitly to be able to tell that it's coming from an SDXL
+        # pipeline.
+
+        # if a dict is passed, copy it instead of modifying it inplace
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
+
+        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
+        state_dict, network_alphas = self.lora_state_dict(
+            pretrained_model_name_or_path_or_dict,
+            unet_config=self.unet.config,
+            **kwargs,
+        )
+        is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys())
+        if not is_correct_format:
+            raise ValueError("Invalid LoRA checkpoint.")
+
+        self.load_lora_into_unet(
+            state_dict, network_alphas=network_alphas, unet=self.unet, adapter_name=adapter_name, _pipeline=self
+        )
+        text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k}
+        if len(text_encoder_state_dict) > 0:
+            self.load_lora_into_text_encoder(
+                text_encoder_state_dict,
+                network_alphas=network_alphas,
+                text_encoder=self.text_encoder,
+                prefix="text_encoder",
+                lora_scale=self.lora_scale,
+                adapter_name=adapter_name,
+                _pipeline=self,
+            )
+
+        text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k}
+        if len(text_encoder_2_state_dict) > 0:
+            self.load_lora_into_text_encoder(
+                text_encoder_2_state_dict,
+                network_alphas=network_alphas,
+                text_encoder=self.text_encoder_2,
+                prefix="text_encoder_2",
+                lora_scale=self.lora_scale,
+                adapter_name=adapter_name,
+                _pipeline=self,
+            )
+
+    @classmethod
+    @validate_hf_hub_args
+    # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.lora_state_dict
+    def lora_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        r"""
+        Return state dict for lora weights and the network alphas.
+
+        <Tip warning={true}>
+
+        We support loading A1111 formatted LoRA checkpoints in a limited capacity.
+
+        This function is experimental and might change in the future.
+
+        </Tip>
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                Can be either:
+
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            weight_name (`str`, *optional*, defaults to None):
+                Name of the serialized state dict file.
+        """
+        # Load the main state dict first which has the LoRA layers for either of
+        # UNet and text encoder or both.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        unet_config = kwargs.pop("unet_config", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+
+        state_dict = cls._fetch_state_dict(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            weight_name=weight_name,
+            use_safetensors=use_safetensors,
+            local_files_only=local_files_only,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            allow_pickle=allow_pickle,
+        )
+
+        network_alphas = None
+        # TODO: replace it with a method from `state_dict_utils`
+        if all(
+            (
+                k.startswith("lora_te_")
+                or k.startswith("lora_unet_")
+                or k.startswith("lora_te1_")
+                or k.startswith("lora_te2_")
+            )
+            for k in state_dict.keys()
+        ):
+            # Map SDXL blocks correctly.
+            if unet_config is not None:
+                # use unet config to remap block numbers
+                state_dict = _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config)
+            state_dict, network_alphas = _convert_non_diffusers_lora_to_diffusers(state_dict)
+
+        return state_dict, network_alphas
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_unet
+    def load_lora_into_unet(cls, state_dict, network_alphas, unet, adapter_name=None, _pipeline=None):
+        """
+        This will load the LoRA layers specified in `state_dict` into `unet`.
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The keys can either be indexed directly
+                into the unet or prefixed with an additional `unet` which can be used to distinguish between text
+                encoder lora layers.
+            network_alphas (`Dict[str, float]`):
+                The value of the network alpha used for stable learning and preventing underflow. This value has the
+                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
+            unet (`UNet2DConditionModel`):
+                The UNet model to load the LoRA layers into.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
+        # then the `state_dict` keys should have `cls.unet_name` and/or `cls.text_encoder_name` as
+        # their prefixes.
+        keys = list(state_dict.keys())
+        only_text_encoder = all(key.startswith(cls.text_encoder_name) for key in keys)
+        if not only_text_encoder:
+            # Load the layers corresponding to UNet.
+            logger.info(f"Loading {cls.unet_name}.")
+            unet.load_attn_procs(
+                state_dict, network_alphas=network_alphas, adapter_name=adapter_name, _pipeline=_pipeline
+            )
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder
+    def load_lora_into_text_encoder(
+        cls,
+        state_dict,
+        network_alphas,
+        text_encoder,
+        prefix=None,
+        lora_scale=1.0,
+        adapter_name=None,
+        _pipeline=None,
+    ):
+        """
+        This will load the LoRA layers specified in `state_dict` into `text_encoder`
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The key should be prefixed with an
+                additional `text_encoder` to distinguish between unet lora layers.
+            network_alphas (`Dict[str, float]`):
+                The value of the network alpha used for stable learning and preventing underflow. This value has the
+                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
+            text_encoder (`CLIPTextModel`):
+                The text encoder model to load the LoRA layers into.
+            prefix (`str`):
+                Expected prefix of the `text_encoder` in the `state_dict`.
+            lora_scale (`float`):
+                How much to scale the output of the lora linear layer before it is added with the output of the regular
+                lora layer.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        from peft import LoraConfig
+
+        # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
+        # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
+        # their prefixes.
+        keys = list(state_dict.keys())
+        prefix = cls.text_encoder_name if prefix is None else prefix
+
+        # Safe prefix to check with.
+        if any(cls.text_encoder_name in key for key in keys):
+            # Load the layers corresponding to text encoder and make necessary adjustments.
+            text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix]
+            text_encoder_lora_state_dict = {
+                k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
+            }
+
+            if len(text_encoder_lora_state_dict) > 0:
+                logger.info(f"Loading {prefix}.")
+                rank = {}
+                text_encoder_lora_state_dict = convert_state_dict_to_diffusers(text_encoder_lora_state_dict)
+
+                # convert state dict
+                text_encoder_lora_state_dict = convert_state_dict_to_peft(text_encoder_lora_state_dict)
+
+                for name, _ in text_encoder_attn_modules(text_encoder):
+                    for module in ("out_proj", "q_proj", "k_proj", "v_proj"):
+                        rank_key = f"{name}.{module}.lora_B.weight"
+                        if rank_key not in text_encoder_lora_state_dict:
+                            continue
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]
+
+                for name, _ in text_encoder_mlp_modules(text_encoder):
+                    for module in ("fc1", "fc2"):
+                        rank_key = f"{name}.{module}.lora_B.weight"
+                        if rank_key not in text_encoder_lora_state_dict:
+                            continue
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]
+
+                if network_alphas is not None:
+                    alpha_keys = [
+                        k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix
+                    ]
+                    network_alphas = {
+                        k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
+                    }
+
+                lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
+                if "use_dora" in lora_config_kwargs:
+                    if lora_config_kwargs["use_dora"]:
+                        if is_peft_version("<", "0.9.0"):
+                            raise ValueError(
+                                "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                            )
+                    else:
+                        if is_peft_version("<", "0.9.0"):
+                            lora_config_kwargs.pop("use_dora")
+                lora_config = LoraConfig(**lora_config_kwargs)
+
+                # adapter_name
+                if adapter_name is None:
+                    adapter_name = get_adapter_name(text_encoder)
+
+                is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline)
+
+                # inject LoRA layers and load the state dict
+                # in transformers we automatically check whether the adapter name is already in use or not
+                text_encoder.load_adapter(
+                    adapter_name=adapter_name,
+                    adapter_state_dict=text_encoder_lora_state_dict,
+                    peft_config=lora_config,
+                )
+
+                # scale LoRA layers with `lora_scale`
+                scale_lora_layers(text_encoder, weight=lora_scale)
+
+                text_encoder.to(device=text_encoder.device, dtype=text_encoder.dtype)
+
+                # Offload back.
+                if is_model_cpu_offload:
+                    _pipeline.enable_model_cpu_offload()
+                elif is_sequential_cpu_offload:
+                    _pipeline.enable_sequential_cpu_offload()
+                # Unsafe code />
+
+    @classmethod
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+    ):
+        r"""
+        Save the LoRA parameters corresponding to the UNet and text encoder.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save LoRA parameters to. Will be created if it doesn't exist.
+            unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `unet`.
+            text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text
+                encoder LoRA state dict because it comes from 🤗 Transformers.
+            text_encoder_2_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `text_encoder_2`. Must explicitly pass the text
+                encoder LoRA state dict because it comes from 🤗 Transformers.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
+        """
+        state_dict = {}
+
+        if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers):
+            raise ValueError(
+                "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`."
+            )
+
+        if unet_lora_layers:
+            state_dict.update(cls.pack_weights(unet_lora_layers, "unet"))
+
+        if text_encoder_lora_layers:
+            state_dict.update(cls.pack_weights(text_encoder_lora_layers, "text_encoder"))
+
+        if text_encoder_2_lora_layers:
+            state_dict.update(cls.pack_weights(text_encoder_2_lora_layers, "text_encoder_2"))
+
+        cls.write_lora_layers(
+            state_dict=state_dict,
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+
+    def fuse_lora(
+        self,
+        components: List[str] = ["unet", "text_encoder", "text_encoder_2"],
+        lora_scale: float = 1.0,
+        safe_fusing: bool = False,
+        adapter_names: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        r"""
+        Fuses the LoRA parameters into the original parameters of the corresponding blocks.
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into.
+            lora_scale (`float`, defaults to 1.0):
+                Controls how much to influence the outputs with the LoRA parameters.
+            safe_fusing (`bool`, defaults to `False`):
+                Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
+            adapter_names (`List[str]`, *optional*):
+                Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.fuse_lora(lora_scale=0.7)
+        ```
+        """
+        super().fuse_lora(
+            components=components, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names
+        )
+
+    def unfuse_lora(self, components: List[str] = ["unet", "text_encoder", "text_encoder_2"], **kwargs):
+        r"""
+        Reverses the effect of
+        [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora).
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from.
+            unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters.
+            unfuse_text_encoder (`bool`, defaults to `True`):
+                Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
+                LoRA parameters then it won't have any effect.
+        """
+        super().unfuse_lora(components=components)
+
+
+class SD3LoraLoaderMixin(LoraBaseMixin):
+    r"""
+    Load LoRA layers into [`SD3Transformer2DModel`],
+    [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), and
+    [`CLIPTextModelWithProjection`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection).
+
+    Specific to [`StableDiffusion3Pipeline`].
+    """
+
+    _lora_loadable_modules = ["transformer", "text_encoder", "text_encoder_2"]
+    transformer_name = TRANSFORMER_NAME
+    text_encoder_name = TEXT_ENCODER_NAME
+
+    @classmethod
+    @validate_hf_hub_args
+    def lora_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        r"""
+        Return state dict for lora weights and the network alphas.
+
+        <Tip warning={true}>
+
+        We support loading A1111 formatted LoRA checkpoints in a limited capacity.
+
+        This function is experimental and might change in the future.
+
+        </Tip>
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                Can be either:
+
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+
+        """
+        # Load the main state dict first which has the LoRA layers for either of
+        # transformer and text encoder or both.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+
+        state_dict = cls._fetch_state_dict(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            weight_name=weight_name,
+            use_safetensors=use_safetensors,
+            local_files_only=local_files_only,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            allow_pickle=allow_pickle,
+        )
+
+        return state_dict
+
+    def load_lora_weights(
+        self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs
+    ):
+        """
+        Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and
+        `self.text_encoder`.
+
+        All kwargs are forwarded to `self.lora_state_dict`.
+
+        See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is
+        loaded.
+
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state
+        dict is loaded into `self.transformer`.
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+            kwargs (`dict`, *optional*):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        # if a dict is passed, copy it instead of modifying it inplace
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
+
+        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
+        state_dict = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
+
+        is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys())
+        if not is_correct_format:
+            raise ValueError("Invalid LoRA checkpoint.")
+
+        self.load_lora_into_transformer(
+            state_dict,
+            transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer,
+            adapter_name=adapter_name,
+            _pipeline=self,
+        )
+
+        text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k}
+        if len(text_encoder_state_dict) > 0:
+            self.load_lora_into_text_encoder(
+                text_encoder_state_dict,
+                network_alphas=None,
+                text_encoder=self.text_encoder,
+                prefix="text_encoder",
+                lora_scale=self.lora_scale,
+                adapter_name=adapter_name,
+                _pipeline=self,
+            )
+
+        text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k}
+        if len(text_encoder_2_state_dict) > 0:
+            self.load_lora_into_text_encoder(
+                text_encoder_2_state_dict,
+                network_alphas=None,
+                text_encoder=self.text_encoder_2,
+                prefix="text_encoder_2",
+                lora_scale=self.lora_scale,
+                adapter_name=adapter_name,
+                _pipeline=self,
+            )
+
+    @classmethod
+    def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None, _pipeline=None):
+        """
+        This will load the LoRA layers specified in `state_dict` into `transformer`.
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The keys can either be indexed directly
+                into the unet or prefixed with an additional `unet` which can be used to distinguish between text
+                encoder lora layers.
+            transformer (`SD3Transformer2DModel`):
+                The Transformer model to load the LoRA layers into.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
+
+        keys = list(state_dict.keys())
+
+        transformer_keys = [k for k in keys if k.startswith(cls.transformer_name)]
+        state_dict = {
+            k.replace(f"{cls.transformer_name}.", ""): v for k, v in state_dict.items() if k in transformer_keys
+        }
+
+        if len(state_dict.keys()) > 0:
+            # check with first key if is not in peft format
+            first_key = next(iter(state_dict.keys()))
+            if "lora_A" not in first_key:
+                state_dict = convert_unet_state_dict_to_peft(state_dict)
+
+            if adapter_name in getattr(transformer, "peft_config", {}):
+                raise ValueError(
+                    f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name."
+                )
+
+            rank = {}
+            for key, val in state_dict.items():
+                if "lora_B" in key:
+                    rank[key] = val.shape[1]
+
+            lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=None, peft_state_dict=state_dict)
+            if "use_dora" in lora_config_kwargs:
+                if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"):
+                    raise ValueError(
+                        "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                    )
+                else:
+                    lora_config_kwargs.pop("use_dora")
+            lora_config = LoraConfig(**lora_config_kwargs)
+
+            # adapter_name
+            if adapter_name is None:
+                adapter_name = get_adapter_name(transformer)
+
+            # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks
+            # otherwise loading LoRA weights will lead to an error
+            is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline)
+
+            inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name)
+            incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name)
+
+            if incompatible_keys is not None:
+                # check only for unexpected keys
+                unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+                if unexpected_keys:
+                    logger.warning(
+                        f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                        f" {unexpected_keys}. "
+                    )
+
+            # Offload back.
+            if is_model_cpu_offload:
+                _pipeline.enable_model_cpu_offload()
+            elif is_sequential_cpu_offload:
+                _pipeline.enable_sequential_cpu_offload()
+            # Unsafe code />
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder
+    def load_lora_into_text_encoder(
+        cls,
+        state_dict,
+        network_alphas,
+        text_encoder,
+        prefix=None,
+        lora_scale=1.0,
+        adapter_name=None,
+        _pipeline=None,
+    ):
+        """
+        This will load the LoRA layers specified in `state_dict` into `text_encoder`
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The key should be prefixed with an
+                additional `text_encoder` to distinguish between unet lora layers.
+            network_alphas (`Dict[str, float]`):
+                The value of the network alpha used for stable learning and preventing underflow. This value has the
+                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
+            text_encoder (`CLIPTextModel`):
+                The text encoder model to load the LoRA layers into.
+            prefix (`str`):
+                Expected prefix of the `text_encoder` in the `state_dict`.
+            lora_scale (`float`):
+                How much to scale the output of the lora linear layer before it is added with the output of the regular
+                lora layer.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        from peft import LoraConfig
+
+        # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
+        # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
+        # their prefixes.
+        keys = list(state_dict.keys())
+        prefix = cls.text_encoder_name if prefix is None else prefix
+
+        # Safe prefix to check with.
+        if any(cls.text_encoder_name in key for key in keys):
+            # Load the layers corresponding to text encoder and make necessary adjustments.
+            text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix]
+            text_encoder_lora_state_dict = {
+                k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
+            }
+
+            if len(text_encoder_lora_state_dict) > 0:
+                logger.info(f"Loading {prefix}.")
+                rank = {}
+                text_encoder_lora_state_dict = convert_state_dict_to_diffusers(text_encoder_lora_state_dict)
+
+                # convert state dict
+                text_encoder_lora_state_dict = convert_state_dict_to_peft(text_encoder_lora_state_dict)
+
+                for name, _ in text_encoder_attn_modules(text_encoder):
+                    for module in ("out_proj", "q_proj", "k_proj", "v_proj"):
+                        rank_key = f"{name}.{module}.lora_B.weight"
+                        if rank_key not in text_encoder_lora_state_dict:
+                            continue
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]
+
+                for name, _ in text_encoder_mlp_modules(text_encoder):
+                    for module in ("fc1", "fc2"):
+                        rank_key = f"{name}.{module}.lora_B.weight"
+                        if rank_key not in text_encoder_lora_state_dict:
+                            continue
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]
+
+                if network_alphas is not None:
+                    alpha_keys = [
+                        k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix
+                    ]
+                    network_alphas = {
+                        k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
+                    }
+
+                lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
+                if "use_dora" in lora_config_kwargs:
+                    if lora_config_kwargs["use_dora"]:
+                        if is_peft_version("<", "0.9.0"):
+                            raise ValueError(
+                                "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                            )
+                    else:
+                        if is_peft_version("<", "0.9.0"):
+                            lora_config_kwargs.pop("use_dora")
+                lora_config = LoraConfig(**lora_config_kwargs)
+
+                # adapter_name
+                if adapter_name is None:
+                    adapter_name = get_adapter_name(text_encoder)
+
+                is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline)
+
+                # inject LoRA layers and load the state dict
+                # in transformers we automatically check whether the adapter name is already in use or not
+                text_encoder.load_adapter(
+                    adapter_name=adapter_name,
+                    adapter_state_dict=text_encoder_lora_state_dict,
+                    peft_config=lora_config,
+                )
+
+                # scale LoRA layers with `lora_scale`
+                scale_lora_layers(text_encoder, weight=lora_scale)
+
+                text_encoder.to(device=text_encoder.device, dtype=text_encoder.dtype)
+
+                # Offload back.
+                if is_model_cpu_offload:
+                    _pipeline.enable_model_cpu_offload()
+                elif is_sequential_cpu_offload:
+                    _pipeline.enable_sequential_cpu_offload()
+                # Unsafe code />
+
+    @classmethod
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        transformer_lora_layers: Dict[str, torch.nn.Module] = None,
+        text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+    ):
+        r"""
+        Save the LoRA parameters corresponding to the UNet and text encoder.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save LoRA parameters to. Will be created if it doesn't exist.
+            transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `transformer`.
+            text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text
+                encoder LoRA state dict because it comes from 🤗 Transformers.
+            text_encoder_2_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `text_encoder_2`. Must explicitly pass the text
+                encoder LoRA state dict because it comes from 🤗 Transformers.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
+        """
+        state_dict = {}
+
+        if not (transformer_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers):
+            raise ValueError(
+                "You must pass at least one of `transformer_lora_layers`, `text_encoder_lora_layers`, `text_encoder_2_lora_layers`."
+            )
+
+        if transformer_lora_layers:
+            state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name))
+
+        if text_encoder_lora_layers:
+            state_dict.update(cls.pack_weights(text_encoder_lora_layers, "text_encoder"))
+
+        if text_encoder_2_lora_layers:
+            state_dict.update(cls.pack_weights(text_encoder_2_lora_layers, "text_encoder_2"))
+
+        # Save the model
+        cls.write_lora_layers(
+            state_dict=state_dict,
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+
+    def fuse_lora(
+        self,
+        components: List[str] = ["transformer", "text_encoder", "text_encoder_2"],
+        lora_scale: float = 1.0,
+        safe_fusing: bool = False,
+        adapter_names: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        r"""
+        Fuses the LoRA parameters into the original parameters of the corresponding blocks.
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into.
+            lora_scale (`float`, defaults to 1.0):
+                Controls how much to influence the outputs with the LoRA parameters.
+            safe_fusing (`bool`, defaults to `False`):
+                Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
+            adapter_names (`List[str]`, *optional*):
+                Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.fuse_lora(lora_scale=0.7)
+        ```
+        """
+        super().fuse_lora(
+            components=components, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names
+        )
+
+    def unfuse_lora(self, components: List[str] = ["transformer", "text_encoder", "text_encoder_2"], **kwargs):
+        r"""
+        Reverses the effect of
+        [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora).
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from.
+            unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters.
+            unfuse_text_encoder (`bool`, defaults to `True`):
+                Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
+                LoRA parameters then it won't have any effect.
+        """
+        super().unfuse_lora(components=components)
+
+
+class FluxLoraLoaderMixin(LoraBaseMixin):
+    r"""
+    Load LoRA layers into [`FluxTransformer2DModel`],
+    [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel).
+
+    Specific to [`StableDiffusion3Pipeline`].
+    """
+
+    _lora_loadable_modules = ["transformer", "text_encoder"]
+    transformer_name = TRANSFORMER_NAME
+    text_encoder_name = TEXT_ENCODER_NAME
+
+    @classmethod
+    @validate_hf_hub_args
+    def lora_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        return_alphas: bool = False,
+        **kwargs,
+    ):
+        r"""
+        Return state dict for lora weights and the network alphas.
+
+        <Tip warning={true}>
+
+        We support loading A1111 formatted LoRA checkpoints in a limited capacity.
+
+        This function is experimental and might change in the future.
+
+        </Tip>
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                Can be either:
+
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+
+        """
+        # Load the main state dict first which has the LoRA layers for either of
+        # transformer and text encoder or both.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+
+        state_dict = cls._fetch_state_dict(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            weight_name=weight_name,
+            use_safetensors=use_safetensors,
+            local_files_only=local_files_only,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            allow_pickle=allow_pickle,
+        )
+
+        # TODO (sayakpaul): to a follow-up to clean and try to unify the conditions.
+
+        is_kohya = any(".lora_down.weight" in k for k in state_dict)
+        if is_kohya:
+            state_dict = _convert_kohya_flux_lora_to_diffusers(state_dict)
+            # Kohya already takes care of scaling the LoRA parameters with alpha.
+            return (state_dict, None) if return_alphas else state_dict
+
+        is_xlabs = any("processor" in k for k in state_dict)
+        if is_xlabs:
+            state_dict = _convert_xlabs_flux_lora_to_diffusers(state_dict)
+            # xlabs doesn't use `alpha`.
+            return (state_dict, None) if return_alphas else state_dict
+
+        # For state dicts like
+        # https://huggingface.co/TheLastBen/Jon_Snow_Flux_LoRA
+        keys = list(state_dict.keys())
+        network_alphas = {}
+        for k in keys:
+            if "alpha" in k:
+                alpha_value = state_dict.get(k)
+                if (torch.is_tensor(alpha_value) and torch.is_floating_point(alpha_value)) or isinstance(
+                    alpha_value, float
+                ):
+                    network_alphas[k] = state_dict.pop(k)
+                else:
+                    raise ValueError(
+                        f"The alpha key ({k}) seems to be incorrect. If you think this error is unexpected, please open as issue."
+                    )
+
+        if return_alphas:
+            return state_dict, network_alphas
+        else:
+            return state_dict
+
+    def load_lora_weights(
+        self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs
+    ):
+        """
+        Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and
+        `self.text_encoder`.
+
+        All kwargs are forwarded to `self.lora_state_dict`.
+
+        See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is
+        loaded.
+
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state
+        dict is loaded into `self.transformer`.
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+            kwargs (`dict`, *optional*):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        # if a dict is passed, copy it instead of modifying it inplace
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
+
+        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
+        state_dict, network_alphas = self.lora_state_dict(
+            pretrained_model_name_or_path_or_dict, return_alphas=True, **kwargs
+        )
+
+        is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys())
+        if not is_correct_format:
+            raise ValueError("Invalid LoRA checkpoint.")
+
+        self.load_lora_into_transformer(
+            state_dict,
+            network_alphas=network_alphas,
+            transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer,
+            adapter_name=adapter_name,
+            _pipeline=self,
+        )
+
+        text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k}
+        if len(text_encoder_state_dict) > 0:
+            self.load_lora_into_text_encoder(
+                text_encoder_state_dict,
+                network_alphas=network_alphas,
+                text_encoder=self.text_encoder,
+                prefix="text_encoder",
+                lora_scale=self.lora_scale,
+                adapter_name=adapter_name,
+                _pipeline=self,
+            )
+
+    @classmethod
+    def load_lora_into_transformer(cls, state_dict, network_alphas, transformer, adapter_name=None, _pipeline=None):
+        """
+        This will load the LoRA layers specified in `state_dict` into `transformer`.
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The keys can either be indexed directly
+                into the unet or prefixed with an additional `unet` which can be used to distinguish between text
+                encoder lora layers.
+            network_alphas (`Dict[str, float]`):
+                The value of the network alpha used for stable learning and preventing underflow. This value has the
+                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
+            transformer (`SD3Transformer2DModel`):
+                The Transformer model to load the LoRA layers into.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
+
+        keys = list(state_dict.keys())
+
+        transformer_keys = [k for k in keys if k.startswith(cls.transformer_name)]
+        state_dict = {
+            k.replace(f"{cls.transformer_name}.", ""): v for k, v in state_dict.items() if k in transformer_keys
+        }
+
+        if len(state_dict.keys()) > 0:
+            # check with first key if is not in peft format
+            first_key = next(iter(state_dict.keys()))
+            if "lora_A" not in first_key:
+                state_dict = convert_unet_state_dict_to_peft(state_dict)
+
+            if adapter_name in getattr(transformer, "peft_config", {}):
+                raise ValueError(
+                    f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name."
+                )
+
+            rank = {}
+            for key, val in state_dict.items():
+                if "lora_B" in key:
+                    rank[key] = val.shape[1]
+
+            if network_alphas is not None and len(network_alphas) >= 1:
+                prefix = cls.transformer_name
+                alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
+                network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
+
+            lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
+            if "use_dora" in lora_config_kwargs:
+                if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"):
+                    raise ValueError(
+                        "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                    )
+                else:
+                    lora_config_kwargs.pop("use_dora")
+            lora_config = LoraConfig(**lora_config_kwargs)
+
+            # adapter_name
+            if adapter_name is None:
+                adapter_name = get_adapter_name(transformer)
+
+            # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks
+            # otherwise loading LoRA weights will lead to an error
+            is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline)
+
+            inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name)
+            incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name)
+
+            if incompatible_keys is not None:
+                # check only for unexpected keys
+                unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+                if unexpected_keys:
+                    logger.warning(
+                        f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                        f" {unexpected_keys}. "
+                    )
+
+            # Offload back.
+            if is_model_cpu_offload:
+                _pipeline.enable_model_cpu_offload()
+            elif is_sequential_cpu_offload:
+                _pipeline.enable_sequential_cpu_offload()
+            # Unsafe code />
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder
+    def load_lora_into_text_encoder(
+        cls,
+        state_dict,
+        network_alphas,
+        text_encoder,
+        prefix=None,
+        lora_scale=1.0,
+        adapter_name=None,
+        _pipeline=None,
+    ):
+        """
+        This will load the LoRA layers specified in `state_dict` into `text_encoder`
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The key should be prefixed with an
+                additional `text_encoder` to distinguish between unet lora layers.
+            network_alphas (`Dict[str, float]`):
+                The value of the network alpha used for stable learning and preventing underflow. This value has the
+                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
+            text_encoder (`CLIPTextModel`):
+                The text encoder model to load the LoRA layers into.
+            prefix (`str`):
+                Expected prefix of the `text_encoder` in the `state_dict`.
+            lora_scale (`float`):
+                How much to scale the output of the lora linear layer before it is added with the output of the regular
+                lora layer.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        from peft import LoraConfig
+
+        # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
+        # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
+        # their prefixes.
+        keys = list(state_dict.keys())
+        prefix = cls.text_encoder_name if prefix is None else prefix
+
+        # Safe prefix to check with.
+        if any(cls.text_encoder_name in key for key in keys):
+            # Load the layers corresponding to text encoder and make necessary adjustments.
+            text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix]
+            text_encoder_lora_state_dict = {
+                k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
+            }
+
+            if len(text_encoder_lora_state_dict) > 0:
+                logger.info(f"Loading {prefix}.")
+                rank = {}
+                text_encoder_lora_state_dict = convert_state_dict_to_diffusers(text_encoder_lora_state_dict)
+
+                # convert state dict
+                text_encoder_lora_state_dict = convert_state_dict_to_peft(text_encoder_lora_state_dict)
+
+                for name, _ in text_encoder_attn_modules(text_encoder):
+                    for module in ("out_proj", "q_proj", "k_proj", "v_proj"):
+                        rank_key = f"{name}.{module}.lora_B.weight"
+                        if rank_key not in text_encoder_lora_state_dict:
+                            continue
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]
+
+                for name, _ in text_encoder_mlp_modules(text_encoder):
+                    for module in ("fc1", "fc2"):
+                        rank_key = f"{name}.{module}.lora_B.weight"
+                        if rank_key not in text_encoder_lora_state_dict:
+                            continue
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]
+
+                if network_alphas is not None:
+                    alpha_keys = [
+                        k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix
+                    ]
+                    network_alphas = {
+                        k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
+                    }
+
+                lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
+                if "use_dora" in lora_config_kwargs:
+                    if lora_config_kwargs["use_dora"]:
+                        if is_peft_version("<", "0.9.0"):
+                            raise ValueError(
+                                "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                            )
+                    else:
+                        if is_peft_version("<", "0.9.0"):
+                            lora_config_kwargs.pop("use_dora")
+                lora_config = LoraConfig(**lora_config_kwargs)
+
+                # adapter_name
+                if adapter_name is None:
+                    adapter_name = get_adapter_name(text_encoder)
+
+                is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline)
+
+                # inject LoRA layers and load the state dict
+                # in transformers we automatically check whether the adapter name is already in use or not
+                text_encoder.load_adapter(
+                    adapter_name=adapter_name,
+                    adapter_state_dict=text_encoder_lora_state_dict,
+                    peft_config=lora_config,
+                )
+
+                # scale LoRA layers with `lora_scale`
+                scale_lora_layers(text_encoder, weight=lora_scale)
+
+                text_encoder.to(device=text_encoder.device, dtype=text_encoder.dtype)
+
+                # Offload back.
+                if is_model_cpu_offload:
+                    _pipeline.enable_model_cpu_offload()
+                elif is_sequential_cpu_offload:
+                    _pipeline.enable_sequential_cpu_offload()
+                # Unsafe code />
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.save_lora_weights with unet->transformer
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        transformer_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        text_encoder_lora_layers: Dict[str, torch.nn.Module] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+    ):
+        r"""
+        Save the LoRA parameters corresponding to the UNet and text encoder.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save LoRA parameters to. Will be created if it doesn't exist.
+            transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `transformer`.
+            text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text
+                encoder LoRA state dict because it comes from 🤗 Transformers.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
+        """
+        state_dict = {}
+
+        if not (transformer_lora_layers or text_encoder_lora_layers):
+            raise ValueError("You must pass at least one of `transformer_lora_layers` and `text_encoder_lora_layers`.")
+
+        if transformer_lora_layers:
+            state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name))
+
+        if text_encoder_lora_layers:
+            state_dict.update(cls.pack_weights(text_encoder_lora_layers, cls.text_encoder_name))
+
+        # Save the model
+        cls.write_lora_layers(
+            state_dict=state_dict,
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+
+    # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.fuse_lora with unet->transformer
+    def fuse_lora(
+        self,
+        components: List[str] = ["transformer", "text_encoder"],
+        lora_scale: float = 1.0,
+        safe_fusing: bool = False,
+        adapter_names: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        r"""
+        Fuses the LoRA parameters into the original parameters of the corresponding blocks.
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into.
+            lora_scale (`float`, defaults to 1.0):
+                Controls how much to influence the outputs with the LoRA parameters.
+            safe_fusing (`bool`, defaults to `False`):
+                Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
+            adapter_names (`List[str]`, *optional*):
+                Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.fuse_lora(lora_scale=0.7)
+        ```
+        """
+        super().fuse_lora(
+            components=components, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names
+        )
+
+    def unfuse_lora(self, components: List[str] = ["transformer", "text_encoder"], **kwargs):
+        r"""
+        Reverses the effect of
+        [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora).
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from.
+        """
+        super().unfuse_lora(components=components)
+
+
+# The reason why we subclass from `StableDiffusionLoraLoaderMixin` here is because Amused initially
+# relied on `StableDiffusionLoraLoaderMixin` for its LoRA support.
+class AmusedLoraLoaderMixin(StableDiffusionLoraLoaderMixin):
+    _lora_loadable_modules = ["transformer", "text_encoder"]
+    transformer_name = TRANSFORMER_NAME
+    text_encoder_name = TEXT_ENCODER_NAME
+
+    @classmethod
+    def load_lora_into_transformer(cls, state_dict, network_alphas, transformer, adapter_name=None, _pipeline=None):
+        """
+        This will load the LoRA layers specified in `state_dict` into `transformer`.
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The keys can either be indexed directly
+                into the unet or prefixed with an additional `unet` which can be used to distinguish between text
+                encoder lora layers.
+            network_alphas (`Dict[str, float]`):
+                The value of the network alpha used for stable learning and preventing underflow. This value has the
+                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
+            unet (`UNet2DConditionModel`):
+                The UNet model to load the LoRA layers into.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
+
+        keys = list(state_dict.keys())
+
+        transformer_keys = [k for k in keys if k.startswith(cls.transformer_name)]
+        state_dict = {
+            k.replace(f"{cls.transformer_name}.", ""): v for k, v in state_dict.items() if k in transformer_keys
+        }
+
+        if network_alphas is not None:
+            alpha_keys = [k for k in network_alphas.keys() if k.startswith(cls.transformer_name)]
+            network_alphas = {
+                k.replace(f"{cls.transformer_name}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
+            }
+
+        if len(state_dict.keys()) > 0:
+            if adapter_name in getattr(transformer, "peft_config", {}):
+                raise ValueError(
+                    f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name."
+                )
+
+            rank = {}
+            for key, val in state_dict.items():
+                if "lora_B" in key:
+                    rank[key] = val.shape[1]
+
+            lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict)
+            if "use_dora" in lora_config_kwargs:
+                if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"):
+                    raise ValueError(
+                        "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                    )
+                else:
+                    lora_config_kwargs.pop("use_dora")
+            lora_config = LoraConfig(**lora_config_kwargs)
+
+            # adapter_name
+            if adapter_name is None:
+                adapter_name = get_adapter_name(transformer)
+
+            # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks
+            # otherwise loading LoRA weights will lead to an error
+            is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline)
+
+            inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name)
+            incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name)
+
+            if incompatible_keys is not None:
+                # check only for unexpected keys
+                unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+                if unexpected_keys:
+                    logger.warning(
+                        f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                        f" {unexpected_keys}. "
+                    )
+
+            # Offload back.
+            if is_model_cpu_offload:
+                _pipeline.enable_model_cpu_offload()
+            elif is_sequential_cpu_offload:
+                _pipeline.enable_sequential_cpu_offload()
+            # Unsafe code />
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder
+    def load_lora_into_text_encoder(
+        cls,
+        state_dict,
+        network_alphas,
+        text_encoder,
+        prefix=None,
+        lora_scale=1.0,
+        adapter_name=None,
+        _pipeline=None,
+    ):
+        """
+        This will load the LoRA layers specified in `state_dict` into `text_encoder`
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The key should be prefixed with an
+                additional `text_encoder` to distinguish between unet lora layers.
+            network_alphas (`Dict[str, float]`):
+                The value of the network alpha used for stable learning and preventing underflow. This value has the
+                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
+            text_encoder (`CLIPTextModel`):
+                The text encoder model to load the LoRA layers into.
+            prefix (`str`):
+                Expected prefix of the `text_encoder` in the `state_dict`.
+            lora_scale (`float`):
+                How much to scale the output of the lora linear layer before it is added with the output of the regular
+                lora layer.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        from peft import LoraConfig
+
+        # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
+        # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
+        # their prefixes.
+        keys = list(state_dict.keys())
+        prefix = cls.text_encoder_name if prefix is None else prefix
+
+        # Safe prefix to check with.
+        if any(cls.text_encoder_name in key for key in keys):
+            # Load the layers corresponding to text encoder and make necessary adjustments.
+            text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix]
+            text_encoder_lora_state_dict = {
+                k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
+            }
+
+            if len(text_encoder_lora_state_dict) > 0:
+                logger.info(f"Loading {prefix}.")
+                rank = {}
+                text_encoder_lora_state_dict = convert_state_dict_to_diffusers(text_encoder_lora_state_dict)
+
+                # convert state dict
+                text_encoder_lora_state_dict = convert_state_dict_to_peft(text_encoder_lora_state_dict)
+
+                for name, _ in text_encoder_attn_modules(text_encoder):
+                    for module in ("out_proj", "q_proj", "k_proj", "v_proj"):
+                        rank_key = f"{name}.{module}.lora_B.weight"
+                        if rank_key not in text_encoder_lora_state_dict:
+                            continue
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]
+
+                for name, _ in text_encoder_mlp_modules(text_encoder):
+                    for module in ("fc1", "fc2"):
+                        rank_key = f"{name}.{module}.lora_B.weight"
+                        if rank_key not in text_encoder_lora_state_dict:
+                            continue
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]
+
+                if network_alphas is not None:
+                    alpha_keys = [
+                        k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix
+                    ]
+                    network_alphas = {
+                        k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
+                    }
+
+                lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
+                if "use_dora" in lora_config_kwargs:
+                    if lora_config_kwargs["use_dora"]:
+                        if is_peft_version("<", "0.9.0"):
+                            raise ValueError(
+                                "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                            )
+                    else:
+                        if is_peft_version("<", "0.9.0"):
+                            lora_config_kwargs.pop("use_dora")
+                lora_config = LoraConfig(**lora_config_kwargs)
+
+                # adapter_name
+                if adapter_name is None:
+                    adapter_name = get_adapter_name(text_encoder)
+
+                is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline)
+
+                # inject LoRA layers and load the state dict
+                # in transformers we automatically check whether the adapter name is already in use or not
+                text_encoder.load_adapter(
+                    adapter_name=adapter_name,
+                    adapter_state_dict=text_encoder_lora_state_dict,
+                    peft_config=lora_config,
+                )
+
+                # scale LoRA layers with `lora_scale`
+                scale_lora_layers(text_encoder, weight=lora_scale)
+
+                text_encoder.to(device=text_encoder.device, dtype=text_encoder.dtype)
+
+                # Offload back.
+                if is_model_cpu_offload:
+                    _pipeline.enable_model_cpu_offload()
+                elif is_sequential_cpu_offload:
+                    _pipeline.enable_sequential_cpu_offload()
+                # Unsafe code />
+
+    @classmethod
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        text_encoder_lora_layers: Dict[str, torch.nn.Module] = None,
+        transformer_lora_layers: Dict[str, torch.nn.Module] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+    ):
+        r"""
+        Save the LoRA parameters corresponding to the UNet and text encoder.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save LoRA parameters to. Will be created if it doesn't exist.
+            unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `unet`.
+            text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text
+                encoder LoRA state dict because it comes from 🤗 Transformers.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
+        """
+        state_dict = {}
+
+        if not (transformer_lora_layers or text_encoder_lora_layers):
+            raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.")
+
+        if transformer_lora_layers:
+            state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name))
+
+        if text_encoder_lora_layers:
+            state_dict.update(cls.pack_weights(text_encoder_lora_layers, cls.text_encoder_name))
+
+        # Save the model
+        cls.write_lora_layers(
+            state_dict=state_dict,
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+
+
+class CogVideoXLoraLoaderMixin(LoraBaseMixin):
+    r"""
+    Load LoRA layers into [`CogVideoXTransformer3DModel`]. Specific to [`CogVideoX`].
+    """
+
+    _lora_loadable_modules = ["transformer"]
+    transformer_name = TRANSFORMER_NAME
+
+    @classmethod
+    @validate_hf_hub_args
+    # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.lora_state_dict
+    def lora_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        r"""
+        Return state dict for lora weights and the network alphas.
+
+        <Tip warning={true}>
+
+        We support loading A1111 formatted LoRA checkpoints in a limited capacity.
+
+        This function is experimental and might change in the future.
+
+        </Tip>
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                Can be either:
+
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+
+        """
+        # Load the main state dict first which has the LoRA layers for either of
+        # transformer and text encoder or both.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+
+        state_dict = cls._fetch_state_dict(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            weight_name=weight_name,
+            use_safetensors=use_safetensors,
+            local_files_only=local_files_only,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            allow_pickle=allow_pickle,
+        )
+
+        return state_dict
+
+    def load_lora_weights(
+        self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs
+    ):
+        """
+        Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and
+        `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See
+        [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded.
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state
+        dict is loaded into `self.transformer`.
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+            kwargs (`dict`, *optional*):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        # if a dict is passed, copy it instead of modifying it inplace
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
+
+        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
+        state_dict = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
+
+        is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys())
+        if not is_correct_format:
+            raise ValueError("Invalid LoRA checkpoint.")
+
+        self.load_lora_into_transformer(
+            state_dict,
+            transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer,
+            adapter_name=adapter_name,
+            _pipeline=self,
+        )
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer
+    def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None, _pipeline=None):
+        """
+        This will load the LoRA layers specified in `state_dict` into `transformer`.
+
+        Parameters:
+            state_dict (`dict`):
+                A standard state dict containing the lora layer parameters. The keys can either be indexed directly
+                into the unet or prefixed with an additional `unet` which can be used to distinguish between text
+                encoder lora layers.
+            transformer (`SD3Transformer2DModel`):
+                The Transformer model to load the LoRA layers into.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+        """
+        from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
+
+        keys = list(state_dict.keys())
+
+        transformer_keys = [k for k in keys if k.startswith(cls.transformer_name)]
+        state_dict = {
+            k.replace(f"{cls.transformer_name}.", ""): v for k, v in state_dict.items() if k in transformer_keys
+        }
+
+        if len(state_dict.keys()) > 0:
+            # check with first key if is not in peft format
+            first_key = next(iter(state_dict.keys()))
+            if "lora_A" not in first_key:
+                state_dict = convert_unet_state_dict_to_peft(state_dict)
+
+            if adapter_name in getattr(transformer, "peft_config", {}):
+                raise ValueError(
+                    f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name."
+                )
+
+            rank = {}
+            for key, val in state_dict.items():
+                if "lora_B" in key:
+                    rank[key] = val.shape[1]
+
+            lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=None, peft_state_dict=state_dict)
+            if "use_dora" in lora_config_kwargs:
+                if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"):
+                    raise ValueError(
+                        "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                    )
+                else:
+                    lora_config_kwargs.pop("use_dora")
+            lora_config = LoraConfig(**lora_config_kwargs)
+
+            # adapter_name
+            if adapter_name is None:
+                adapter_name = get_adapter_name(transformer)
+
+            # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks
+            # otherwise loading LoRA weights will lead to an error
+            is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline)
+
+            inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name)
+            incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name)
+
+            if incompatible_keys is not None:
+                # check only for unexpected keys
+                unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+                if unexpected_keys:
+                    logger.warning(
+                        f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                        f" {unexpected_keys}. "
+                    )
+
+            # Offload back.
+            if is_model_cpu_offload:
+                _pipeline.enable_model_cpu_offload()
+            elif is_sequential_cpu_offload:
+                _pipeline.enable_sequential_cpu_offload()
+            # Unsafe code />
+
+    @classmethod
+    # Adapted from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.save_lora_weights without support for text encoder
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        transformer_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+    ):
+        r"""
+        Save the LoRA parameters corresponding to the UNet and text encoder.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save LoRA parameters to. Will be created if it doesn't exist.
+            transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `transformer`.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
+        """
+        state_dict = {}
+
+        if not transformer_lora_layers:
+            raise ValueError("You must pass `transformer_lora_layers`.")
+
+        if transformer_lora_layers:
+            state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name))
+
+        # Save the model
+        cls.write_lora_layers(
+            state_dict=state_dict,
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+
+    # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.fuse_lora with unet->transformer
+    def fuse_lora(
+        self,
+        components: List[str] = ["transformer", "text_encoder"],
+        lora_scale: float = 1.0,
+        safe_fusing: bool = False,
+        adapter_names: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        r"""
+        Fuses the LoRA parameters into the original parameters of the corresponding blocks.
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into.
+            lora_scale (`float`, defaults to 1.0):
+                Controls how much to influence the outputs with the LoRA parameters.
+            safe_fusing (`bool`, defaults to `False`):
+                Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
+            adapter_names (`List[str]`, *optional*):
+                Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.fuse_lora(lora_scale=0.7)
+        ```
+        """
+        super().fuse_lora(
+            components=components, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names
+        )
+
+    # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.unfuse_lora with unet->transformer
+    def unfuse_lora(self, components: List[str] = ["transformer", "text_encoder"], **kwargs):
+        r"""
+        Reverses the effect of
+        [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora).
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from.
+            unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters.
+            unfuse_text_encoder (`bool`, defaults to `True`):
+                Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
+                LoRA parameters then it won't have any effect.
+        """
+        super().unfuse_lora(components=components)
+
+
+class LoraLoaderMixin(StableDiffusionLoraLoaderMixin):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "LoraLoaderMixin is deprecated and this will be removed in a future version. Please use `StableDiffusionLoraLoaderMixin`, instead."
+        deprecate("LoraLoaderMixin", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
diff --git a/diffusers/src/diffusers/loaders/peft.py b/diffusers/src/diffusers/loaders/peft.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1c6721512fafcc2f7ff6b56d0be15ea26a183ef
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/peft.py
@@ -0,0 +1,396 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from functools import partial
+from typing import Dict, List, Optional, Union
+
+from ..utils import (
+    MIN_PEFT_VERSION,
+    USE_PEFT_BACKEND,
+    check_peft_version,
+    delete_adapter_layers,
+    is_peft_available,
+    set_adapter_layers,
+    set_weights_and_activate_adapters,
+)
+from .unet_loader_utils import _maybe_expand_lora_scales
+
+
+_SET_ADAPTER_SCALE_FN_MAPPING = {
+    "UNet2DConditionModel": _maybe_expand_lora_scales,
+    "UNetMotionModel": _maybe_expand_lora_scales,
+    "SD3Transformer2DModel": lambda model_cls, weights: weights,
+    "FluxTransformer2DModel": lambda model_cls, weights: weights,
+    "CogVideoXTransformer3DModel": lambda model_cls, weights: weights,
+}
+
+
+class PeftAdapterMixin:
+    """
+    A class containing all functions for loading and using adapters weights that are supported in PEFT library. For
+    more details about adapters and injecting them in a base model, check out the PEFT
+    [documentation](https://huggingface.co/docs/peft/index).
+
+    Install the latest version of PEFT, and use this mixin to:
+
+    - Attach new adapters in the model.
+    - Attach multiple adapters and iteratively activate/deactivate them.
+    - Activate/deactivate all adapters from the model.
+    - Get a list of the active adapters.
+    """
+
+    _hf_peft_config_loaded = False
+
+    def set_adapters(
+        self,
+        adapter_names: Union[List[str], str],
+        weights: Optional[Union[float, Dict, List[float], List[Dict], List[None]]] = None,
+    ):
+        """
+        Set the currently active adapters for use in the UNet.
+
+        Args:
+            adapter_names (`List[str]` or `str`):
+                The names of the adapters to use.
+            adapter_weights (`Union[List[float], float]`, *optional*):
+                The adapter(s) weights to use with the UNet. If `None`, the weights are set to `1.0` for all the
+                adapters.
+
+        Example:
+
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.set_adapters(["cinematic", "pixel"], adapter_weights=[0.5, 0.5])
+        ```
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for `set_adapters()`.")
+
+        adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names
+
+        # Expand weights into a list, one entry per adapter
+        # examples for e.g. 2 adapters:  [{...}, 7] -> [7,7] ; None -> [None, None]
+        if not isinstance(weights, list):
+            weights = [weights] * len(adapter_names)
+
+        if len(adapter_names) != len(weights):
+            raise ValueError(
+                f"Length of adapter names {len(adapter_names)} is not equal to the length of their weights {len(weights)}."
+            )
+
+        # Set None values to default of 1.0
+        # e.g. [{...}, 7] -> [{...}, 7] ; [None, None] -> [1.0, 1.0]
+        weights = [w if w is not None else 1.0 for w in weights]
+
+        # e.g. [{...}, 7] -> [{expanded dict...}, 7]
+        scale_expansion_fn = _SET_ADAPTER_SCALE_FN_MAPPING[self.__class__.__name__]
+        weights = scale_expansion_fn(self, weights)
+
+        set_weights_and_activate_adapters(self, adapter_names, weights)
+
+    def add_adapter(self, adapter_config, adapter_name: str = "default") -> None:
+        r"""
+        Adds a new adapter to the current model for training. If no adapter name is passed, a default name is assigned
+        to the adapter to follow the convention of the PEFT library.
+
+        If you are not familiar with adapters and PEFT methods, we invite you to read more about them in the PEFT
+        [documentation](https://huggingface.co/docs/peft).
+
+        Args:
+            adapter_config (`[~peft.PeftConfig]`):
+                The configuration of the adapter to add; supported adapters are non-prefix tuning and adaption prompt
+                methods.
+            adapter_name (`str`, *optional*, defaults to `"default"`):
+                The name of the adapter to add. If no name is passed, a default name is assigned to the adapter.
+        """
+        check_peft_version(min_version=MIN_PEFT_VERSION)
+
+        if not is_peft_available():
+            raise ImportError("PEFT is not available. Please install PEFT to use this function: `pip install peft`.")
+
+        from peft import PeftConfig, inject_adapter_in_model
+
+        if not self._hf_peft_config_loaded:
+            self._hf_peft_config_loaded = True
+        elif adapter_name in self.peft_config:
+            raise ValueError(f"Adapter with name {adapter_name} already exists. Please use a different name.")
+
+        if not isinstance(adapter_config, PeftConfig):
+            raise ValueError(
+                f"adapter_config should be an instance of PeftConfig. Got {type(adapter_config)} instead."
+            )
+
+        # Unlike transformers, here we don't need to retrieve the name_or_path of the unet as the loading logic is
+        # handled by the `load_lora_layers` or `StableDiffusionLoraLoaderMixin`. Therefore we set it to `None` here.
+        adapter_config.base_model_name_or_path = None
+        inject_adapter_in_model(adapter_config, self, adapter_name)
+        self.set_adapter(adapter_name)
+
+    def set_adapter(self, adapter_name: Union[str, List[str]]) -> None:
+        """
+        Sets a specific adapter by forcing the model to only use that adapter and disables the other adapters.
+
+        If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
+        [documentation](https://huggingface.co/docs/peft).
+
+        Args:
+            adapter_name (Union[str, List[str]])):
+                The list of adapters to set or the adapter name in the case of a single adapter.
+        """
+        check_peft_version(min_version=MIN_PEFT_VERSION)
+
+        if not self._hf_peft_config_loaded:
+            raise ValueError("No adapter loaded. Please load an adapter first.")
+
+        if isinstance(adapter_name, str):
+            adapter_name = [adapter_name]
+
+        missing = set(adapter_name) - set(self.peft_config)
+        if len(missing) > 0:
+            raise ValueError(
+                f"Following adapter(s) could not be found: {', '.join(missing)}. Make sure you are passing the correct adapter name(s)."
+                f" current loaded adapters are: {list(self.peft_config.keys())}"
+            )
+
+        from peft.tuners.tuners_utils import BaseTunerLayer
+
+        _adapters_has_been_set = False
+
+        for _, module in self.named_modules():
+            if isinstance(module, BaseTunerLayer):
+                if hasattr(module, "set_adapter"):
+                    module.set_adapter(adapter_name)
+                # Previous versions of PEFT does not support multi-adapter inference
+                elif not hasattr(module, "set_adapter") and len(adapter_name) != 1:
+                    raise ValueError(
+                        "You are trying to set multiple adapters and you have a PEFT version that does not support multi-adapter inference. Please upgrade to the latest version of PEFT."
+                        " `pip install -U peft` or `pip install -U git+https://github.com/huggingface/peft.git`"
+                    )
+                else:
+                    module.active_adapter = adapter_name
+                _adapters_has_been_set = True
+
+        if not _adapters_has_been_set:
+            raise ValueError(
+                "Did not succeeded in setting the adapter. Please make sure you are using a model that supports adapters."
+            )
+
+    def disable_adapters(self) -> None:
+        r"""
+        Disable all adapters attached to the model and fallback to inference with the base model only.
+
+        If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
+        [documentation](https://huggingface.co/docs/peft).
+        """
+        check_peft_version(min_version=MIN_PEFT_VERSION)
+
+        if not self._hf_peft_config_loaded:
+            raise ValueError("No adapter loaded. Please load an adapter first.")
+
+        from peft.tuners.tuners_utils import BaseTunerLayer
+
+        for _, module in self.named_modules():
+            if isinstance(module, BaseTunerLayer):
+                if hasattr(module, "enable_adapters"):
+                    module.enable_adapters(enabled=False)
+                else:
+                    # support for older PEFT versions
+                    module.disable_adapters = True
+
+    def enable_adapters(self) -> None:
+        """
+        Enable adapters that are attached to the model. The model uses `self.active_adapters()` to retrieve the list of
+        adapters to enable.
+
+        If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
+        [documentation](https://huggingface.co/docs/peft).
+        """
+        check_peft_version(min_version=MIN_PEFT_VERSION)
+
+        if not self._hf_peft_config_loaded:
+            raise ValueError("No adapter loaded. Please load an adapter first.")
+
+        from peft.tuners.tuners_utils import BaseTunerLayer
+
+        for _, module in self.named_modules():
+            if isinstance(module, BaseTunerLayer):
+                if hasattr(module, "enable_adapters"):
+                    module.enable_adapters(enabled=True)
+                else:
+                    # support for older PEFT versions
+                    module.disable_adapters = False
+
+    def active_adapters(self) -> List[str]:
+        """
+        Gets the current list of active adapters of the model.
+
+        If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
+        [documentation](https://huggingface.co/docs/peft).
+        """
+        check_peft_version(min_version=MIN_PEFT_VERSION)
+
+        if not is_peft_available():
+            raise ImportError("PEFT is not available. Please install PEFT to use this function: `pip install peft`.")
+
+        if not self._hf_peft_config_loaded:
+            raise ValueError("No adapter loaded. Please load an adapter first.")
+
+        from peft.tuners.tuners_utils import BaseTunerLayer
+
+        for _, module in self.named_modules():
+            if isinstance(module, BaseTunerLayer):
+                return module.active_adapter
+
+    def fuse_lora(self, lora_scale=1.0, safe_fusing=False, adapter_names=None):
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for `fuse_lora()`.")
+
+        self.lora_scale = lora_scale
+        self._safe_fusing = safe_fusing
+        self.apply(partial(self._fuse_lora_apply, adapter_names=adapter_names))
+
+    def _fuse_lora_apply(self, module, adapter_names=None):
+        from peft.tuners.tuners_utils import BaseTunerLayer
+
+        merge_kwargs = {"safe_merge": self._safe_fusing}
+
+        if isinstance(module, BaseTunerLayer):
+            if self.lora_scale != 1.0:
+                module.scale_layer(self.lora_scale)
+
+            # For BC with prevous PEFT versions, we need to check the signature
+            # of the `merge` method to see if it supports the `adapter_names` argument.
+            supported_merge_kwargs = list(inspect.signature(module.merge).parameters)
+            if "adapter_names" in supported_merge_kwargs:
+                merge_kwargs["adapter_names"] = adapter_names
+            elif "adapter_names" not in supported_merge_kwargs and adapter_names is not None:
+                raise ValueError(
+                    "The `adapter_names` argument is not supported with your PEFT version. Please upgrade"
+                    " to the latest version of PEFT. `pip install -U peft`"
+                )
+
+            module.merge(**merge_kwargs)
+
+    def unfuse_lora(self):
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for `unfuse_lora()`.")
+        self.apply(self._unfuse_lora_apply)
+
+    def _unfuse_lora_apply(self, module):
+        from peft.tuners.tuners_utils import BaseTunerLayer
+
+        if isinstance(module, BaseTunerLayer):
+            module.unmerge()
+
+    def unload_lora(self):
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for `unload_lora()`.")
+
+        from ..utils import recurse_remove_peft_layers
+
+        recurse_remove_peft_layers(self)
+        if hasattr(self, "peft_config"):
+            del self.peft_config
+
+    def disable_lora(self):
+        """
+        Disables the active LoRA layers of the underlying model.
+
+        Example:
+
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        pipeline.disable_lora()
+        ```
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+        set_adapter_layers(self, enabled=False)
+
+    def enable_lora(self):
+        """
+        Enables the active LoRA layers of the underlying model.
+
+        Example:
+
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        pipeline.enable_lora()
+        ```
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+        set_adapter_layers(self, enabled=True)
+
+    def delete_adapters(self, adapter_names: Union[List[str], str]):
+        """
+        Delete an adapter's LoRA layers from the underlying model.
+
+        Args:
+            adapter_names (`Union[List[str], str]`):
+                The names (single string or list of strings) of the adapter to delete.
+
+        Example:
+
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_names="cinematic"
+        )
+        pipeline.delete_adapters("cinematic")
+        ```
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        if isinstance(adapter_names, str):
+            adapter_names = [adapter_names]
+
+        for adapter_name in adapter_names:
+            delete_adapter_layers(self, adapter_name)
+
+            # Pop also the corresponding adapter from the config
+            if hasattr(self, "peft_config"):
+                self.peft_config.pop(adapter_name, None)
diff --git a/diffusers/src/diffusers/loaders/single_file.py b/diffusers/src/diffusers/loaders/single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0cbfc7138572d0c6aa74e51776920eeed29d864
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/single_file.py
@@ -0,0 +1,550 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+import inspect
+import os
+
+import torch
+from huggingface_hub import snapshot_download
+from huggingface_hub.utils import LocalEntryNotFoundError, validate_hf_hub_args
+from packaging import version
+
+from ..utils import deprecate, is_transformers_available, logging
+from .single_file_utils import (
+    SingleFileComponentError,
+    _is_legacy_scheduler_kwargs,
+    _is_model_weights_in_cached_folder,
+    _legacy_load_clip_tokenizer,
+    _legacy_load_safety_checker,
+    _legacy_load_scheduler,
+    create_diffusers_clip_model_from_ldm,
+    create_diffusers_t5_model_from_checkpoint,
+    fetch_diffusers_config,
+    fetch_original_config,
+    is_clip_model_in_single_file,
+    is_t5_in_single_file,
+    load_single_file_checkpoint,
+)
+
+
+logger = logging.get_logger(__name__)
+
+# Legacy behaviour. `from_single_file` does not load the safety checker unless explicitly provided
+SINGLE_FILE_OPTIONAL_COMPONENTS = ["safety_checker"]
+
+if is_transformers_available():
+    import transformers
+    from transformers import PreTrainedModel, PreTrainedTokenizer
+
+
+def load_single_file_sub_model(
+    library_name,
+    class_name,
+    name,
+    checkpoint,
+    pipelines,
+    is_pipeline_module,
+    cached_model_config_path,
+    original_config=None,
+    local_files_only=False,
+    torch_dtype=None,
+    is_legacy_loading=False,
+    **kwargs,
+):
+    if is_pipeline_module:
+        pipeline_module = getattr(pipelines, library_name)
+        class_obj = getattr(pipeline_module, class_name)
+    else:
+        # else we just import it from the library.
+        library = importlib.import_module(library_name)
+        class_obj = getattr(library, class_name)
+
+    if is_transformers_available():
+        transformers_version = version.parse(version.parse(transformers.__version__).base_version)
+    else:
+        transformers_version = "N/A"
+
+    is_transformers_model = (
+        is_transformers_available()
+        and issubclass(class_obj, PreTrainedModel)
+        and transformers_version >= version.parse("4.20.0")
+    )
+    is_tokenizer = (
+        is_transformers_available()
+        and issubclass(class_obj, PreTrainedTokenizer)
+        and transformers_version >= version.parse("4.20.0")
+    )
+
+    diffusers_module = importlib.import_module(__name__.split(".")[0])
+    is_diffusers_single_file_model = issubclass(class_obj, diffusers_module.FromOriginalModelMixin)
+    is_diffusers_model = issubclass(class_obj, diffusers_module.ModelMixin)
+    is_diffusers_scheduler = issubclass(class_obj, diffusers_module.SchedulerMixin)
+
+    if is_diffusers_single_file_model:
+        load_method = getattr(class_obj, "from_single_file")
+
+        # We cannot provide two different config options to the `from_single_file` method
+        # Here we have to ignore loading the config from `cached_model_config_path` if `original_config` is provided
+        if original_config:
+            cached_model_config_path = None
+
+        loaded_sub_model = load_method(
+            pretrained_model_link_or_path_or_dict=checkpoint,
+            original_config=original_config,
+            config=cached_model_config_path,
+            subfolder=name,
+            torch_dtype=torch_dtype,
+            local_files_only=local_files_only,
+            **kwargs,
+        )
+
+    elif is_transformers_model and is_clip_model_in_single_file(class_obj, checkpoint):
+        loaded_sub_model = create_diffusers_clip_model_from_ldm(
+            class_obj,
+            checkpoint=checkpoint,
+            config=cached_model_config_path,
+            subfolder=name,
+            torch_dtype=torch_dtype,
+            local_files_only=local_files_only,
+            is_legacy_loading=is_legacy_loading,
+        )
+
+    elif is_transformers_model and is_t5_in_single_file(checkpoint):
+        loaded_sub_model = create_diffusers_t5_model_from_checkpoint(
+            class_obj,
+            checkpoint=checkpoint,
+            config=cached_model_config_path,
+            subfolder=name,
+            torch_dtype=torch_dtype,
+            local_files_only=local_files_only,
+        )
+
+    elif is_tokenizer and is_legacy_loading:
+        loaded_sub_model = _legacy_load_clip_tokenizer(
+            class_obj, checkpoint=checkpoint, config=cached_model_config_path, local_files_only=local_files_only
+        )
+
+    elif is_diffusers_scheduler and (is_legacy_loading or _is_legacy_scheduler_kwargs(kwargs)):
+        loaded_sub_model = _legacy_load_scheduler(
+            class_obj, checkpoint=checkpoint, component_name=name, original_config=original_config, **kwargs
+        )
+
+    else:
+        if not hasattr(class_obj, "from_pretrained"):
+            raise ValueError(
+                (
+                    f"The component {class_obj.__name__} cannot be loaded as it does not seem to have"
+                    " a supported loading method."
+                )
+            )
+
+        loading_kwargs = {}
+        loading_kwargs.update(
+            {
+                "pretrained_model_name_or_path": cached_model_config_path,
+                "subfolder": name,
+                "local_files_only": local_files_only,
+            }
+        )
+
+        # Schedulers and Tokenizers don't make use of torch_dtype
+        # Skip passing it to those objects
+        if issubclass(class_obj, torch.nn.Module):
+            loading_kwargs.update({"torch_dtype": torch_dtype})
+
+        if is_diffusers_model or is_transformers_model:
+            if not _is_model_weights_in_cached_folder(cached_model_config_path, name):
+                raise SingleFileComponentError(
+                    f"Failed to load {class_name}. Weights for this component appear to be missing in the checkpoint."
+                )
+
+        load_method = getattr(class_obj, "from_pretrained")
+        loaded_sub_model = load_method(**loading_kwargs)
+
+    return loaded_sub_model
+
+
+def _map_component_types_to_config_dict(component_types):
+    diffusers_module = importlib.import_module(__name__.split(".")[0])
+    config_dict = {}
+    component_types.pop("self", None)
+
+    if is_transformers_available():
+        transformers_version = version.parse(version.parse(transformers.__version__).base_version)
+    else:
+        transformers_version = "N/A"
+
+    for component_name, component_value in component_types.items():
+        is_diffusers_model = issubclass(component_value[0], diffusers_module.ModelMixin)
+        is_scheduler_enum = component_value[0].__name__ == "KarrasDiffusionSchedulers"
+        is_scheduler = issubclass(component_value[0], diffusers_module.SchedulerMixin)
+
+        is_transformers_model = (
+            is_transformers_available()
+            and issubclass(component_value[0], PreTrainedModel)
+            and transformers_version >= version.parse("4.20.0")
+        )
+        is_transformers_tokenizer = (
+            is_transformers_available()
+            and issubclass(component_value[0], PreTrainedTokenizer)
+            and transformers_version >= version.parse("4.20.0")
+        )
+
+        if is_diffusers_model and component_name not in SINGLE_FILE_OPTIONAL_COMPONENTS:
+            config_dict[component_name] = ["diffusers", component_value[0].__name__]
+
+        elif is_scheduler_enum or is_scheduler:
+            if is_scheduler_enum:
+                # Since we cannot fetch a scheduler config from the hub, we default to DDIMScheduler
+                # if the type hint is a KarrassDiffusionSchedulers enum
+                config_dict[component_name] = ["diffusers", "DDIMScheduler"]
+
+            elif is_scheduler:
+                config_dict[component_name] = ["diffusers", component_value[0].__name__]
+
+        elif (
+            is_transformers_model or is_transformers_tokenizer
+        ) and component_name not in SINGLE_FILE_OPTIONAL_COMPONENTS:
+            config_dict[component_name] = ["transformers", component_value[0].__name__]
+
+        else:
+            config_dict[component_name] = [None, None]
+
+    return config_dict
+
+
+def _infer_pipeline_config_dict(pipeline_class):
+    parameters = inspect.signature(pipeline_class.__init__).parameters
+    required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+    component_types = pipeline_class._get_signature_types()
+
+    # Ignore parameters that are not required for the pipeline
+    component_types = {k: v for k, v in component_types.items() if k in required_parameters}
+    config_dict = _map_component_types_to_config_dict(component_types)
+
+    return config_dict
+
+
+def _download_diffusers_model_config_from_hub(
+    pretrained_model_name_or_path,
+    cache_dir,
+    revision,
+    proxies,
+    force_download=None,
+    local_files_only=None,
+    token=None,
+):
+    allow_patterns = ["**/*.json", "*.json", "*.txt", "**/*.txt", "**/*.model"]
+    cached_model_path = snapshot_download(
+        pretrained_model_name_or_path,
+        cache_dir=cache_dir,
+        revision=revision,
+        proxies=proxies,
+        force_download=force_download,
+        local_files_only=local_files_only,
+        token=token,
+        allow_patterns=allow_patterns,
+    )
+
+    return cached_model_path
+
+
+class FromSingleFileMixin:
+    """
+    Load model weights saved in the `.ckpt` format into a [`DiffusionPipeline`].
+    """
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
+        r"""
+        Instantiate a [`DiffusionPipeline`] from pretrained pipeline weights saved in the `.ckpt` or `.safetensors`
+        format. The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        Parameters:
+            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+                    - A link to the `.ckpt` file (for example
+                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
+                    - A path to a *file* containing all pipeline weights.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            original_config_file (`str`, *optional*):
+                The path to the original config file that was used to train the model. If not provided, the config file
+                will be inferred from the checkpoint file.
+            config (`str`, *optional*):
+                Can be either:
+                    - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
+                      hosted on the Hub.
+                    - A path to a *directory* (for example `./my_pipeline_directory/`) containing the pipeline
+                      component configs in Diffusers format.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline
+                class). The overwritten components are passed directly to the pipelines `__init__` method. See example
+                below for more information.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import StableDiffusionPipeline
+
+        >>> # Download pipeline from huggingface.co and cache.
+        >>> pipeline = StableDiffusionPipeline.from_single_file(
+        ...     "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors"
+        ... )
+
+        >>> # Download pipeline from local file
+        >>> # file is downloaded under ./v1-5-pruned-emaonly.ckpt
+        >>> pipeline = StableDiffusionPipeline.from_single_file("./v1-5-pruned-emaonly.ckpt")
+
+        >>> # Enable float16 and move to GPU
+        >>> pipeline = StableDiffusionPipeline.from_single_file(
+        ...     "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt",
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> pipeline.to("cuda")
+        ```
+
+        """
+        original_config_file = kwargs.pop("original_config_file", None)
+        config = kwargs.pop("config", None)
+        original_config = kwargs.pop("original_config", None)
+
+        if original_config_file is not None:
+            deprecation_message = (
+                "`original_config_file` argument is deprecated and will be removed in future versions."
+                "please use the `original_config` argument instead."
+            )
+            deprecate("original_config_file", "1.0.0", deprecation_message)
+            original_config = original_config_file
+
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+
+        is_legacy_loading = False
+
+        # We shouldn't allow configuring individual models components through a Pipeline creation method
+        # These model kwargs should be deprecated
+        scaling_factor = kwargs.get("scaling_factor", None)
+        if scaling_factor is not None:
+            deprecation_message = (
+                "Passing the `scaling_factor` argument to `from_single_file is deprecated "
+                "and will be ignored in future versions."
+            )
+            deprecate("scaling_factor", "1.0.0", deprecation_message)
+
+        if original_config is not None:
+            original_config = fetch_original_config(original_config, local_files_only=local_files_only)
+
+        from ..pipelines.pipeline_utils import _get_pipeline_class
+
+        pipeline_class = _get_pipeline_class(cls, config=None)
+
+        checkpoint = load_single_file_checkpoint(
+            pretrained_model_link_or_path,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            cache_dir=cache_dir,
+            local_files_only=local_files_only,
+            revision=revision,
+        )
+
+        if config is None:
+            config = fetch_diffusers_config(checkpoint)
+            default_pretrained_model_config_name = config["pretrained_model_name_or_path"]
+        else:
+            default_pretrained_model_config_name = config
+
+        if not os.path.isdir(default_pretrained_model_config_name):
+            # Provided config is a repo_id
+            if default_pretrained_model_config_name.count("/") > 1:
+                raise ValueError(
+                    f'The provided config "{config}"'
+                    " is neither a valid local path nor a valid repo id. Please check the parameter."
+                )
+            try:
+                # Attempt to download the config files for the pipeline
+                cached_model_config_path = _download_diffusers_model_config_from_hub(
+                    default_pretrained_model_config_name,
+                    cache_dir=cache_dir,
+                    revision=revision,
+                    proxies=proxies,
+                    force_download=force_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                )
+                config_dict = pipeline_class.load_config(cached_model_config_path)
+
+            except LocalEntryNotFoundError:
+                # `local_files_only=True` but a local diffusers format model config is not available in the cache
+                # If `original_config` is not provided, we need override `local_files_only` to False
+                # to fetch the config files from the hub so that we have a way
+                # to configure the pipeline components.
+
+                if original_config is None:
+                    logger.warning(
+                        "`local_files_only` is True but no local configs were found for this checkpoint.\n"
+                        "Attempting to download the necessary config files for this pipeline.\n"
+                    )
+                    cached_model_config_path = _download_diffusers_model_config_from_hub(
+                        default_pretrained_model_config_name,
+                        cache_dir=cache_dir,
+                        revision=revision,
+                        proxies=proxies,
+                        force_download=force_download,
+                        local_files_only=False,
+                        token=token,
+                    )
+                    config_dict = pipeline_class.load_config(cached_model_config_path)
+
+                else:
+                    # For backwards compatibility
+                    # If `original_config` is provided, then we need to assume we are using legacy loading for pipeline components
+                    logger.warning(
+                        "Detected legacy `from_single_file` loading behavior. Attempting to create the pipeline based on inferred components.\n"
+                        "This may lead to errors if the model components are not correctly inferred. \n"
+                        "To avoid this warning, please explicity pass the `config` argument to `from_single_file` with a path to a local diffusers model repo \n"
+                        "e.g. `from_single_file(<my model checkpoint path>, config=<path to local diffusers model repo>) \n"
+                        "or run `from_single_file` with `local_files_only=False` first to update the local cache directory with "
+                        "the necessary config files.\n"
+                    )
+                    is_legacy_loading = True
+                    cached_model_config_path = None
+
+                    config_dict = _infer_pipeline_config_dict(pipeline_class)
+                    config_dict["_class_name"] = pipeline_class.__name__
+
+        else:
+            # Provided config is a path to a local directory attempt to load directly.
+            cached_model_config_path = default_pretrained_model_config_name
+            config_dict = pipeline_class.load_config(cached_model_config_path)
+
+        #   pop out "_ignore_files" as it is only needed for download
+        config_dict.pop("_ignore_files", None)
+
+        expected_modules, optional_kwargs = pipeline_class._get_signature_keys(cls)
+        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
+
+        init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
+        init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict}
+        init_kwargs = {**init_kwargs, **passed_pipe_kwargs}
+
+        from diffusers import pipelines
+
+        # remove `null` components
+        def load_module(name, value):
+            if value[0] is None:
+                return False
+            if name in passed_class_obj and passed_class_obj[name] is None:
+                return False
+            if name in SINGLE_FILE_OPTIONAL_COMPONENTS:
+                return False
+
+            return True
+
+        init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
+
+        for name, (library_name, class_name) in logging.tqdm(
+            sorted(init_dict.items()), desc="Loading pipeline components..."
+        ):
+            loaded_sub_model = None
+            is_pipeline_module = hasattr(pipelines, library_name)
+
+            if name in passed_class_obj:
+                loaded_sub_model = passed_class_obj[name]
+
+            else:
+                try:
+                    loaded_sub_model = load_single_file_sub_model(
+                        library_name=library_name,
+                        class_name=class_name,
+                        name=name,
+                        checkpoint=checkpoint,
+                        is_pipeline_module=is_pipeline_module,
+                        cached_model_config_path=cached_model_config_path,
+                        pipelines=pipelines,
+                        torch_dtype=torch_dtype,
+                        original_config=original_config,
+                        local_files_only=local_files_only,
+                        is_legacy_loading=is_legacy_loading,
+                        **kwargs,
+                    )
+                except SingleFileComponentError as e:
+                    raise SingleFileComponentError(
+                        (
+                            f"{e.message}\n"
+                            f"Please load the component before passing it in as an argument to `from_single_file`.\n"
+                            f"\n"
+                            f"{name} = {class_name}.from_pretrained('...')\n"
+                            f"pipe = {pipeline_class.__name__}.from_single_file(<checkpoint path>, {name}={name})\n"
+                            f"\n"
+                        )
+                    )
+
+            init_kwargs[name] = loaded_sub_model
+
+        missing_modules = set(expected_modules) - set(init_kwargs.keys())
+        passed_modules = list(passed_class_obj.keys())
+        optional_modules = pipeline_class._optional_components
+
+        if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules):
+            for module in missing_modules:
+                init_kwargs[module] = passed_class_obj.get(module, None)
+        elif len(missing_modules) > 0:
+            passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs
+            raise ValueError(
+                f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed."
+            )
+
+        # deprecated kwargs
+        load_safety_checker = kwargs.pop("load_safety_checker", None)
+        if load_safety_checker is not None:
+            deprecation_message = (
+                "Please pass instances of `StableDiffusionSafetyChecker` and `AutoImageProcessor`"
+                "using the `safety_checker` and `feature_extractor` arguments in `from_single_file`"
+            )
+            deprecate("load_safety_checker", "1.0.0", deprecation_message)
+
+            safety_checker_components = _legacy_load_safety_checker(local_files_only, torch_dtype)
+            init_kwargs.update(safety_checker_components)
+
+        pipe = pipeline_class(**init_kwargs)
+
+        return pipe
diff --git a/diffusers/src/diffusers/loaders/single_file_model.py b/diffusers/src/diffusers/loaders/single_file_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fe1abfbead51be31035d2f3cdcf55dca48c8e94
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/single_file_model.py
@@ -0,0 +1,318 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+import inspect
+import re
+from contextlib import nullcontext
+from typing import Optional
+
+from huggingface_hub.utils import validate_hf_hub_args
+
+from ..utils import deprecate, is_accelerate_available, logging
+from .single_file_utils import (
+    SingleFileComponentError,
+    convert_animatediff_checkpoint_to_diffusers,
+    convert_controlnet_checkpoint,
+    convert_flux_transformer_checkpoint_to_diffusers,
+    convert_ldm_unet_checkpoint,
+    convert_ldm_vae_checkpoint,
+    convert_sd3_transformer_checkpoint_to_diffusers,
+    convert_stable_cascade_unet_single_file_to_diffusers,
+    create_controlnet_diffusers_config_from_ldm,
+    create_unet_diffusers_config_from_ldm,
+    create_vae_diffusers_config_from_ldm,
+    fetch_diffusers_config,
+    fetch_original_config,
+    load_single_file_checkpoint,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+
+    from ..models.modeling_utils import load_model_dict_into_meta
+
+
+SINGLE_FILE_LOADABLE_CLASSES = {
+    "StableCascadeUNet": {
+        "checkpoint_mapping_fn": convert_stable_cascade_unet_single_file_to_diffusers,
+    },
+    "UNet2DConditionModel": {
+        "checkpoint_mapping_fn": convert_ldm_unet_checkpoint,
+        "config_mapping_fn": create_unet_diffusers_config_from_ldm,
+        "default_subfolder": "unet",
+        "legacy_kwargs": {
+            "num_in_channels": "in_channels",  # Legacy kwargs supported by `from_single_file` mapped to new args
+        },
+    },
+    "AutoencoderKL": {
+        "checkpoint_mapping_fn": convert_ldm_vae_checkpoint,
+        "config_mapping_fn": create_vae_diffusers_config_from_ldm,
+        "default_subfolder": "vae",
+    },
+    "ControlNetModel": {
+        "checkpoint_mapping_fn": convert_controlnet_checkpoint,
+        "config_mapping_fn": create_controlnet_diffusers_config_from_ldm,
+    },
+    "SD3Transformer2DModel": {
+        "checkpoint_mapping_fn": convert_sd3_transformer_checkpoint_to_diffusers,
+        "default_subfolder": "transformer",
+    },
+    "MotionAdapter": {
+        "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
+    },
+    "SparseControlNetModel": {
+        "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
+    },
+    "FluxTransformer2DModel": {
+        "checkpoint_mapping_fn": convert_flux_transformer_checkpoint_to_diffusers,
+        "default_subfolder": "transformer",
+    },
+}
+
+
+def _get_single_file_loadable_mapping_class(cls):
+    diffusers_module = importlib.import_module(__name__.split(".")[0])
+    for loadable_class_str in SINGLE_FILE_LOADABLE_CLASSES:
+        loadable_class = getattr(diffusers_module, loadable_class_str)
+
+        if issubclass(cls, loadable_class):
+            return loadable_class_str
+
+    return None
+
+
+def _get_mapping_function_kwargs(mapping_fn, **kwargs):
+    parameters = inspect.signature(mapping_fn).parameters
+
+    mapping_kwargs = {}
+    for parameter in parameters:
+        if parameter in kwargs:
+            mapping_kwargs[parameter] = kwargs[parameter]
+
+    return mapping_kwargs
+
+
+class FromOriginalModelMixin:
+    """
+    Load pretrained weights saved in the `.ckpt` or `.safetensors` format into a model.
+    """
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = None, **kwargs):
+        r"""
+        Instantiate a model from pretrained weights saved in the original `.ckpt` or `.safetensors` format. The model
+        is set in evaluation mode (`model.eval()`) by default.
+
+        Parameters:
+            pretrained_model_link_or_path_or_dict (`str`, *optional*):
+                Can be either:
+                    - A link to the `.safetensors` or `.ckpt` file (for example
+                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.safetensors"`) on the Hub.
+                    - A path to a local *file* containing the weights of the component model.
+                    - A state dict containing the component model weights.
+            config (`str`, *optional*):
+                - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline hosted
+                  on the Hub.
+                - A path to a *directory* (for example `./my_pipeline_directory/`) containing the pipeline component
+                  configs in Diffusers format.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            original_config (`str`, *optional*):
+                Dict or path to a yaml file containing the configuration for the model in its original format.
+                    If a dict is provided, it will be used to initialize the model configuration.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
+                dtype is automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to True, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (for example the pipeline components of the
+                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
+                method. See example below for more information.
+
+        ```py
+        >>> from diffusers import StableCascadeUNet
+
+        >>> ckpt_path = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_lite.safetensors"
+        >>> model = StableCascadeUNet.from_single_file(ckpt_path)
+        ```
+        """
+
+        mapping_class_name = _get_single_file_loadable_mapping_class(cls)
+        # if class_name not in SINGLE_FILE_LOADABLE_CLASSES:
+        if mapping_class_name is None:
+            raise ValueError(
+                f"FromOriginalModelMixin is currently only compatible with {', '.join(SINGLE_FILE_LOADABLE_CLASSES.keys())}"
+            )
+
+        pretrained_model_link_or_path = kwargs.get("pretrained_model_link_or_path", None)
+        if pretrained_model_link_or_path is not None:
+            deprecation_message = (
+                "Please use `pretrained_model_link_or_path_or_dict` argument instead for model classes"
+            )
+            deprecate("pretrained_model_link_or_path", "1.0.0", deprecation_message)
+            pretrained_model_link_or_path_or_dict = pretrained_model_link_or_path
+
+        config = kwargs.pop("config", None)
+        original_config = kwargs.pop("original_config", None)
+
+        if config is not None and original_config is not None:
+            raise ValueError(
+                "`from_single_file` cannot accept both `config` and `original_config` arguments. Please provide only one of these arguments"
+            )
+
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        subfolder = kwargs.pop("subfolder", None)
+        revision = kwargs.pop("revision", None)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+
+        if isinstance(pretrained_model_link_or_path_or_dict, dict):
+            checkpoint = pretrained_model_link_or_path_or_dict
+        else:
+            checkpoint = load_single_file_checkpoint(
+                pretrained_model_link_or_path_or_dict,
+                force_download=force_download,
+                proxies=proxies,
+                token=token,
+                cache_dir=cache_dir,
+                local_files_only=local_files_only,
+                revision=revision,
+            )
+
+        mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[mapping_class_name]
+
+        checkpoint_mapping_fn = mapping_functions["checkpoint_mapping_fn"]
+        if original_config:
+            if "config_mapping_fn" in mapping_functions:
+                config_mapping_fn = mapping_functions["config_mapping_fn"]
+            else:
+                config_mapping_fn = None
+
+            if config_mapping_fn is None:
+                raise ValueError(
+                    (
+                        f"`original_config` has been provided for {mapping_class_name} but no mapping function"
+                        "was found to convert the original config to a Diffusers config in"
+                        "`diffusers.loaders.single_file_utils`"
+                    )
+                )
+
+            if isinstance(original_config, str):
+                # If original_config is a URL or filepath fetch the original_config dict
+                original_config = fetch_original_config(original_config, local_files_only=local_files_only)
+
+            config_mapping_kwargs = _get_mapping_function_kwargs(config_mapping_fn, **kwargs)
+            diffusers_model_config = config_mapping_fn(
+                original_config=original_config, checkpoint=checkpoint, **config_mapping_kwargs
+            )
+        else:
+            if config:
+                if isinstance(config, str):
+                    default_pretrained_model_config_name = config
+                else:
+                    raise ValueError(
+                        (
+                            "Invalid `config` argument. Please provide a string representing a repo id"
+                            "or path to a local Diffusers model repo."
+                        )
+                    )
+
+            else:
+                config = fetch_diffusers_config(checkpoint)
+                default_pretrained_model_config_name = config["pretrained_model_name_or_path"]
+
+                if "default_subfolder" in mapping_functions:
+                    subfolder = mapping_functions["default_subfolder"]
+
+                subfolder = subfolder or config.pop(
+                    "subfolder", None
+                )  # some configs contain a subfolder key, e.g. StableCascadeUNet
+
+            diffusers_model_config = cls.load_config(
+                pretrained_model_name_or_path=default_pretrained_model_config_name,
+                subfolder=subfolder,
+                local_files_only=local_files_only,
+            )
+            expected_kwargs, optional_kwargs = cls._get_signature_keys(cls)
+
+            # Map legacy kwargs to new kwargs
+            if "legacy_kwargs" in mapping_functions:
+                legacy_kwargs = mapping_functions["legacy_kwargs"]
+                for legacy_key, new_key in legacy_kwargs.items():
+                    if legacy_key in kwargs:
+                        kwargs[new_key] = kwargs.pop(legacy_key)
+
+            model_kwargs = {k: kwargs.get(k) for k in kwargs if k in expected_kwargs or k in optional_kwargs}
+            diffusers_model_config.update(model_kwargs)
+
+        checkpoint_mapping_kwargs = _get_mapping_function_kwargs(checkpoint_mapping_fn, **kwargs)
+        diffusers_format_checkpoint = checkpoint_mapping_fn(
+            config=diffusers_model_config, checkpoint=checkpoint, **checkpoint_mapping_kwargs
+        )
+        if not diffusers_format_checkpoint:
+            raise SingleFileComponentError(
+                f"Failed to load {mapping_class_name}. Weights for this component appear to be missing in the checkpoint."
+            )
+
+        ctx = init_empty_weights if is_accelerate_available() else nullcontext
+        with ctx():
+            model = cls.from_config(diffusers_model_config)
+
+        if is_accelerate_available():
+            unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
+
+        else:
+            _, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False)
+
+        if model._keys_to_ignore_on_load_unexpected is not None:
+            for pat in model._keys_to_ignore_on_load_unexpected:
+                unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
+
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
+            )
+
+        if torch_dtype is not None:
+            model.to(torch_dtype)
+
+        model.eval()
+
+        return model
diff --git a/diffusers/src/diffusers/loaders/single_file_utils.py b/diffusers/src/diffusers/loaders/single_file_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..236fbd0c2295f5a29577c733095187823de497ec
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/single_file_utils.py
@@ -0,0 +1,2100 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the Stable Diffusion checkpoints."""
+
+import copy
+import os
+import re
+from contextlib import nullcontext
+from io import BytesIO
+from urllib.parse import urlparse
+
+import requests
+import torch
+import yaml
+
+from ..models.modeling_utils import load_state_dict
+from ..schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EDMDPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ..utils import (
+    SAFETENSORS_WEIGHTS_NAME,
+    WEIGHTS_NAME,
+    deprecate,
+    is_accelerate_available,
+    is_transformers_available,
+    logging,
+)
+from ..utils.hub_utils import _get_model_file
+
+
+if is_transformers_available():
+    from transformers import AutoImageProcessor
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+
+    from ..models.modeling_utils import load_model_dict_into_meta
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+CHECKPOINT_KEY_NAMES = {
+    "v2": "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight",
+    "xl_base": "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias",
+    "xl_refiner": "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias",
+    "upscale": "model.diffusion_model.input_blocks.10.0.skip_connection.bias",
+    "controlnet": "control_model.time_embed.0.weight",
+    "playground-v2-5": "edm_mean",
+    "inpainting": "model.diffusion_model.input_blocks.0.0.weight",
+    "clip": "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight",
+    "clip_sdxl": "conditioner.embedders.0.transformer.text_model.embeddings.position_embedding.weight",
+    "clip_sd3": "text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight",
+    "open_clip": "cond_stage_model.model.token_embedding.weight",
+    "open_clip_sdxl": "conditioner.embedders.1.model.positional_embedding",
+    "open_clip_sdxl_refiner": "conditioner.embedders.0.model.text_projection",
+    "open_clip_sd3": "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight",
+    "stable_cascade_stage_b": "down_blocks.1.0.channelwise.0.weight",
+    "stable_cascade_stage_c": "clip_txt_mapper.weight",
+    "sd3": "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias",
+    "animatediff": "down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.0.pos_encoder.pe",
+    "animatediff_v2": "mid_block.motion_modules.0.temporal_transformer.norm.bias",
+    "animatediff_sdxl_beta": "up_blocks.2.motion_modules.0.temporal_transformer.norm.weight",
+    "animatediff_scribble": "controlnet_cond_embedding.conv_in.weight",
+    "animatediff_rgb": "controlnet_cond_embedding.weight",
+    "flux": [
+        "double_blocks.0.img_attn.norm.key_norm.scale",
+        "model.diffusion_model.double_blocks.0.img_attn.norm.key_norm.scale",
+    ],
+}
+
+DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
+    "xl_base": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0"},
+    "xl_refiner": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-xl-refiner-1.0"},
+    "xl_inpaint": {"pretrained_model_name_or_path": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"},
+    "playground-v2-5": {"pretrained_model_name_or_path": "playgroundai/playground-v2.5-1024px-aesthetic"},
+    "upscale": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-x4-upscaler"},
+    "inpainting": {"pretrained_model_name_or_path": "stable-diffusion-v1-5/stable-diffusion-inpainting"},
+    "inpainting_v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-inpainting"},
+    "controlnet": {"pretrained_model_name_or_path": "lllyasviel/control_v11p_sd15_canny"},
+    "v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-1"},
+    "v1": {"pretrained_model_name_or_path": "stable-diffusion-v1-5/stable-diffusion-v1-5"},
+    "stable_cascade_stage_b": {"pretrained_model_name_or_path": "stabilityai/stable-cascade", "subfolder": "decoder"},
+    "stable_cascade_stage_b_lite": {
+        "pretrained_model_name_or_path": "stabilityai/stable-cascade",
+        "subfolder": "decoder_lite",
+    },
+    "stable_cascade_stage_c": {
+        "pretrained_model_name_or_path": "stabilityai/stable-cascade-prior",
+        "subfolder": "prior",
+    },
+    "stable_cascade_stage_c_lite": {
+        "pretrained_model_name_or_path": "stabilityai/stable-cascade-prior",
+        "subfolder": "prior_lite",
+    },
+    "sd3": {
+        "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3-medium-diffusers",
+    },
+    "animatediff_v1": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5"},
+    "animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"},
+    "animatediff_v3": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-3"},
+    "animatediff_sdxl_beta": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-sdxl-beta"},
+    "animatediff_scribble": {"pretrained_model_name_or_path": "guoyww/animatediff-sparsectrl-scribble"},
+    "animatediff_rgb": {"pretrained_model_name_or_path": "guoyww/animatediff-sparsectrl-rgb"},
+    "flux-dev": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev"},
+    "flux-schnell": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-schnell"},
+}
+
+# Use to configure model sample size when original config is provided
+DIFFUSERS_TO_LDM_DEFAULT_IMAGE_SIZE_MAP = {
+    "xl_base": 1024,
+    "xl_refiner": 1024,
+    "xl_inpaint": 1024,
+    "playground-v2-5": 1024,
+    "upscale": 512,
+    "inpainting": 512,
+    "inpainting_v2": 512,
+    "controlnet": 512,
+    "v2": 768,
+    "v1": 512,
+}
+
+
+DIFFUSERS_TO_LDM_MAPPING = {
+    "unet": {
+        "layers": {
+            "time_embedding.linear_1.weight": "time_embed.0.weight",
+            "time_embedding.linear_1.bias": "time_embed.0.bias",
+            "time_embedding.linear_2.weight": "time_embed.2.weight",
+            "time_embedding.linear_2.bias": "time_embed.2.bias",
+            "conv_in.weight": "input_blocks.0.0.weight",
+            "conv_in.bias": "input_blocks.0.0.bias",
+            "conv_norm_out.weight": "out.0.weight",
+            "conv_norm_out.bias": "out.0.bias",
+            "conv_out.weight": "out.2.weight",
+            "conv_out.bias": "out.2.bias",
+        },
+        "class_embed_type": {
+            "class_embedding.linear_1.weight": "label_emb.0.0.weight",
+            "class_embedding.linear_1.bias": "label_emb.0.0.bias",
+            "class_embedding.linear_2.weight": "label_emb.0.2.weight",
+            "class_embedding.linear_2.bias": "label_emb.0.2.bias",
+        },
+        "addition_embed_type": {
+            "add_embedding.linear_1.weight": "label_emb.0.0.weight",
+            "add_embedding.linear_1.bias": "label_emb.0.0.bias",
+            "add_embedding.linear_2.weight": "label_emb.0.2.weight",
+            "add_embedding.linear_2.bias": "label_emb.0.2.bias",
+        },
+    },
+    "controlnet": {
+        "layers": {
+            "time_embedding.linear_1.weight": "time_embed.0.weight",
+            "time_embedding.linear_1.bias": "time_embed.0.bias",
+            "time_embedding.linear_2.weight": "time_embed.2.weight",
+            "time_embedding.linear_2.bias": "time_embed.2.bias",
+            "conv_in.weight": "input_blocks.0.0.weight",
+            "conv_in.bias": "input_blocks.0.0.bias",
+            "controlnet_cond_embedding.conv_in.weight": "input_hint_block.0.weight",
+            "controlnet_cond_embedding.conv_in.bias": "input_hint_block.0.bias",
+            "controlnet_cond_embedding.conv_out.weight": "input_hint_block.14.weight",
+            "controlnet_cond_embedding.conv_out.bias": "input_hint_block.14.bias",
+        },
+        "class_embed_type": {
+            "class_embedding.linear_1.weight": "label_emb.0.0.weight",
+            "class_embedding.linear_1.bias": "label_emb.0.0.bias",
+            "class_embedding.linear_2.weight": "label_emb.0.2.weight",
+            "class_embedding.linear_2.bias": "label_emb.0.2.bias",
+        },
+        "addition_embed_type": {
+            "add_embedding.linear_1.weight": "label_emb.0.0.weight",
+            "add_embedding.linear_1.bias": "label_emb.0.0.bias",
+            "add_embedding.linear_2.weight": "label_emb.0.2.weight",
+            "add_embedding.linear_2.bias": "label_emb.0.2.bias",
+        },
+    },
+    "vae": {
+        "encoder.conv_in.weight": "encoder.conv_in.weight",
+        "encoder.conv_in.bias": "encoder.conv_in.bias",
+        "encoder.conv_out.weight": "encoder.conv_out.weight",
+        "encoder.conv_out.bias": "encoder.conv_out.bias",
+        "encoder.conv_norm_out.weight": "encoder.norm_out.weight",
+        "encoder.conv_norm_out.bias": "encoder.norm_out.bias",
+        "decoder.conv_in.weight": "decoder.conv_in.weight",
+        "decoder.conv_in.bias": "decoder.conv_in.bias",
+        "decoder.conv_out.weight": "decoder.conv_out.weight",
+        "decoder.conv_out.bias": "decoder.conv_out.bias",
+        "decoder.conv_norm_out.weight": "decoder.norm_out.weight",
+        "decoder.conv_norm_out.bias": "decoder.norm_out.bias",
+        "quant_conv.weight": "quant_conv.weight",
+        "quant_conv.bias": "quant_conv.bias",
+        "post_quant_conv.weight": "post_quant_conv.weight",
+        "post_quant_conv.bias": "post_quant_conv.bias",
+    },
+    "openclip": {
+        "layers": {
+            "text_model.embeddings.position_embedding.weight": "positional_embedding",
+            "text_model.embeddings.token_embedding.weight": "token_embedding.weight",
+            "text_model.final_layer_norm.weight": "ln_final.weight",
+            "text_model.final_layer_norm.bias": "ln_final.bias",
+            "text_projection.weight": "text_projection",
+        },
+        "transformer": {
+            "text_model.encoder.layers.": "resblocks.",
+            "layer_norm1": "ln_1",
+            "layer_norm2": "ln_2",
+            ".fc1.": ".c_fc.",
+            ".fc2.": ".c_proj.",
+            ".self_attn": ".attn",
+            "transformer.text_model.final_layer_norm.": "ln_final.",
+            "transformer.text_model.embeddings.token_embedding.weight": "token_embedding.weight",
+            "transformer.text_model.embeddings.position_embedding.weight": "positional_embedding",
+        },
+    },
+}
+
+SD_2_TEXT_ENCODER_KEYS_TO_IGNORE = [
+    "cond_stage_model.model.transformer.resblocks.23.attn.in_proj_bias",
+    "cond_stage_model.model.transformer.resblocks.23.attn.in_proj_weight",
+    "cond_stage_model.model.transformer.resblocks.23.attn.out_proj.bias",
+    "cond_stage_model.model.transformer.resblocks.23.attn.out_proj.weight",
+    "cond_stage_model.model.transformer.resblocks.23.ln_1.bias",
+    "cond_stage_model.model.transformer.resblocks.23.ln_1.weight",
+    "cond_stage_model.model.transformer.resblocks.23.ln_2.bias",
+    "cond_stage_model.model.transformer.resblocks.23.ln_2.weight",
+    "cond_stage_model.model.transformer.resblocks.23.mlp.c_fc.bias",
+    "cond_stage_model.model.transformer.resblocks.23.mlp.c_fc.weight",
+    "cond_stage_model.model.transformer.resblocks.23.mlp.c_proj.bias",
+    "cond_stage_model.model.transformer.resblocks.23.mlp.c_proj.weight",
+    "cond_stage_model.model.text_projection",
+]
+
+# To support legacy scheduler_type argument
+SCHEDULER_DEFAULT_CONFIG = {
+    "beta_schedule": "scaled_linear",
+    "beta_start": 0.00085,
+    "beta_end": 0.012,
+    "interpolation_type": "linear",
+    "num_train_timesteps": 1000,
+    "prediction_type": "epsilon",
+    "sample_max_value": 1.0,
+    "set_alpha_to_one": False,
+    "skip_prk_steps": True,
+    "steps_offset": 1,
+    "timestep_spacing": "leading",
+}
+
+LDM_VAE_KEYS = ["first_stage_model.", "vae."]
+LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215
+PLAYGROUND_VAE_SCALING_FACTOR = 0.5
+LDM_UNET_KEY = "model.diffusion_model."
+LDM_CONTROLNET_KEY = "control_model."
+LDM_CLIP_PREFIX_TO_REMOVE = [
+    "cond_stage_model.transformer.",
+    "conditioner.embedders.0.transformer.",
+]
+LDM_OPEN_CLIP_TEXT_PROJECTION_DIM = 1024
+SCHEDULER_LEGACY_KWARGS = ["prediction_type", "scheduler_type"]
+
+VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
+
+
+class SingleFileComponentError(Exception):
+    def __init__(self, message=None):
+        self.message = message
+        super().__init__(self.message)
+
+
+def is_valid_url(url):
+    result = urlparse(url)
+    if result.scheme and result.netloc:
+        return True
+
+    return False
+
+
+def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
+    if not is_valid_url(pretrained_model_name_or_path):
+        raise ValueError("Invalid `pretrained_model_name_or_path` provided. Please set it to a valid URL.")
+
+    pattern = r"([^/]+)/([^/]+)/(?:blob/main/)?(.+)"
+    weights_name = None
+    repo_id = (None,)
+    for prefix in VALID_URL_PREFIXES:
+        pretrained_model_name_or_path = pretrained_model_name_or_path.replace(prefix, "")
+    match = re.match(pattern, pretrained_model_name_or_path)
+    if not match:
+        logger.warning("Unable to identify the repo_id and weights_name from the provided URL.")
+        return repo_id, weights_name
+
+    repo_id = f"{match.group(1)}/{match.group(2)}"
+    weights_name = match.group(3)
+
+    return repo_id, weights_name
+
+
+def _is_model_weights_in_cached_folder(cached_folder, name):
+    pretrained_model_name_or_path = os.path.join(cached_folder, name)
+    weights_exist = False
+
+    for weights_name in [WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME]:
+        if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)):
+            weights_exist = True
+
+    return weights_exist
+
+
+def _is_legacy_scheduler_kwargs(kwargs):
+    return any(k in SCHEDULER_LEGACY_KWARGS for k in kwargs.keys())
+
+
+def load_single_file_checkpoint(
+    pretrained_model_link_or_path,
+    force_download=False,
+    proxies=None,
+    token=None,
+    cache_dir=None,
+    local_files_only=None,
+    revision=None,
+):
+    if os.path.isfile(pretrained_model_link_or_path):
+        pretrained_model_link_or_path = pretrained_model_link_or_path
+
+    else:
+        repo_id, weights_name = _extract_repo_id_and_weights_name(pretrained_model_link_or_path)
+        pretrained_model_link_or_path = _get_model_file(
+            repo_id,
+            weights_name=weights_name,
+            force_download=force_download,
+            cache_dir=cache_dir,
+            proxies=proxies,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+        )
+
+    checkpoint = load_state_dict(pretrained_model_link_or_path)
+
+    # some checkpoints contain the model state dict under a "state_dict" key
+    while "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    return checkpoint
+
+
+def fetch_original_config(original_config_file, local_files_only=False):
+    if os.path.isfile(original_config_file):
+        with open(original_config_file, "r") as fp:
+            original_config_file = fp.read()
+
+    elif is_valid_url(original_config_file):
+        if local_files_only:
+            raise ValueError(
+                "`local_files_only` is set to True, but a URL was provided as `original_config_file`. "
+                "Please provide a valid local file path."
+            )
+
+        original_config_file = BytesIO(requests.get(original_config_file).content)
+
+    else:
+        raise ValueError("Invalid `original_config_file` provided. Please set it to a valid file path or URL.")
+
+    original_config = yaml.safe_load(original_config_file)
+
+    return original_config
+
+
+def is_clip_model(checkpoint):
+    if CHECKPOINT_KEY_NAMES["clip"] in checkpoint:
+        return True
+
+    return False
+
+
+def is_clip_sdxl_model(checkpoint):
+    if CHECKPOINT_KEY_NAMES["clip_sdxl"] in checkpoint:
+        return True
+
+    return False
+
+
+def is_clip_sd3_model(checkpoint):
+    if CHECKPOINT_KEY_NAMES["clip_sd3"] in checkpoint:
+        return True
+
+    return False
+
+
+def is_open_clip_model(checkpoint):
+    if CHECKPOINT_KEY_NAMES["open_clip"] in checkpoint:
+        return True
+
+    return False
+
+
+def is_open_clip_sdxl_model(checkpoint):
+    if CHECKPOINT_KEY_NAMES["open_clip_sdxl"] in checkpoint:
+        return True
+
+    return False
+
+
+def is_open_clip_sd3_model(checkpoint):
+    if CHECKPOINT_KEY_NAMES["open_clip_sd3"] in checkpoint:
+        return True
+
+    return False
+
+
+def is_open_clip_sdxl_refiner_model(checkpoint):
+    if CHECKPOINT_KEY_NAMES["open_clip_sdxl_refiner"] in checkpoint:
+        return True
+
+    return False
+
+
+def is_clip_model_in_single_file(class_obj, checkpoint):
+    is_clip_in_checkpoint = any(
+        [
+            is_clip_model(checkpoint),
+            is_clip_sd3_model(checkpoint),
+            is_open_clip_model(checkpoint),
+            is_open_clip_sdxl_model(checkpoint),
+            is_open_clip_sdxl_refiner_model(checkpoint),
+            is_open_clip_sd3_model(checkpoint),
+        ]
+    )
+    if (
+        class_obj.__name__ == "CLIPTextModel" or class_obj.__name__ == "CLIPTextModelWithProjection"
+    ) and is_clip_in_checkpoint:
+        return True
+
+    return False
+
+
+def infer_diffusers_model_type(checkpoint):
+    if (
+        CHECKPOINT_KEY_NAMES["inpainting"] in checkpoint
+        and checkpoint[CHECKPOINT_KEY_NAMES["inpainting"]].shape[1] == 9
+    ):
+        if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024:
+            model_type = "inpainting_v2"
+        elif CHECKPOINT_KEY_NAMES["xl_base"] in checkpoint:
+            model_type = "xl_inpaint"
+        else:
+            model_type = "inpainting"
+
+    elif CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024:
+        model_type = "v2"
+
+    elif CHECKPOINT_KEY_NAMES["playground-v2-5"] in checkpoint:
+        model_type = "playground-v2-5"
+
+    elif CHECKPOINT_KEY_NAMES["xl_base"] in checkpoint:
+        model_type = "xl_base"
+
+    elif CHECKPOINT_KEY_NAMES["xl_refiner"] in checkpoint:
+        model_type = "xl_refiner"
+
+    elif CHECKPOINT_KEY_NAMES["upscale"] in checkpoint:
+        model_type = "upscale"
+
+    elif CHECKPOINT_KEY_NAMES["controlnet"] in checkpoint:
+        model_type = "controlnet"
+
+    elif (
+        CHECKPOINT_KEY_NAMES["stable_cascade_stage_c"] in checkpoint
+        and checkpoint[CHECKPOINT_KEY_NAMES["stable_cascade_stage_c"]].shape[0] == 1536
+    ):
+        model_type = "stable_cascade_stage_c_lite"
+
+    elif (
+        CHECKPOINT_KEY_NAMES["stable_cascade_stage_c"] in checkpoint
+        and checkpoint[CHECKPOINT_KEY_NAMES["stable_cascade_stage_c"]].shape[0] == 2048
+    ):
+        model_type = "stable_cascade_stage_c"
+
+    elif (
+        CHECKPOINT_KEY_NAMES["stable_cascade_stage_b"] in checkpoint
+        and checkpoint[CHECKPOINT_KEY_NAMES["stable_cascade_stage_b"]].shape[-1] == 576
+    ):
+        model_type = "stable_cascade_stage_b_lite"
+
+    elif (
+        CHECKPOINT_KEY_NAMES["stable_cascade_stage_b"] in checkpoint
+        and checkpoint[CHECKPOINT_KEY_NAMES["stable_cascade_stage_b"]].shape[-1] == 640
+    ):
+        model_type = "stable_cascade_stage_b"
+
+    elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint:
+        model_type = "sd3"
+
+    elif CHECKPOINT_KEY_NAMES["animatediff"] in checkpoint:
+        if CHECKPOINT_KEY_NAMES["animatediff_scribble"] in checkpoint:
+            model_type = "animatediff_scribble"
+
+        elif CHECKPOINT_KEY_NAMES["animatediff_rgb"] in checkpoint:
+            model_type = "animatediff_rgb"
+
+        elif CHECKPOINT_KEY_NAMES["animatediff_v2"] in checkpoint:
+            model_type = "animatediff_v2"
+
+        elif checkpoint[CHECKPOINT_KEY_NAMES["animatediff_sdxl_beta"]].shape[-1] == 320:
+            model_type = "animatediff_sdxl_beta"
+
+        elif checkpoint[CHECKPOINT_KEY_NAMES["animatediff"]].shape[1] == 24:
+            model_type = "animatediff_v1"
+
+        else:
+            model_type = "animatediff_v3"
+
+    elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["flux"]):
+        if any(
+            g in checkpoint for g in ["guidance_in.in_layer.bias", "model.diffusion_model.guidance_in.in_layer.bias"]
+        ):
+            model_type = "flux-dev"
+        else:
+            model_type = "flux-schnell"
+    else:
+        model_type = "v1"
+
+    return model_type
+
+
+def fetch_diffusers_config(checkpoint):
+    model_type = infer_diffusers_model_type(checkpoint)
+    model_path = DIFFUSERS_DEFAULT_PIPELINE_PATHS[model_type]
+    model_path = copy.deepcopy(model_path)
+
+    return model_path
+
+
+def set_image_size(checkpoint, image_size=None):
+    if image_size:
+        return image_size
+
+    model_type = infer_diffusers_model_type(checkpoint)
+    image_size = DIFFUSERS_TO_LDM_DEFAULT_IMAGE_SIZE_MAP[model_type]
+
+    return image_size
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def create_unet_diffusers_config_from_ldm(
+    original_config, checkpoint, image_size=None, upcast_attention=None, num_in_channels=None
+):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    if image_size is not None:
+        deprecation_message = (
+            "Configuring UNet2DConditionModel with the `image_size` argument to `from_single_file`"
+            "is deprecated and will be ignored in future versions."
+        )
+        deprecate("image_size", "1.0.0", deprecation_message)
+
+    image_size = set_image_size(checkpoint, image_size=image_size)
+
+    if (
+        "unet_config" in original_config["model"]["params"]
+        and original_config["model"]["params"]["unet_config"] is not None
+    ):
+        unet_params = original_config["model"]["params"]["unet_config"]["params"]
+    else:
+        unet_params = original_config["model"]["params"]["network_config"]["params"]
+
+    if num_in_channels is not None:
+        deprecation_message = (
+            "Configuring UNet2DConditionModel with the `num_in_channels` argument to `from_single_file`"
+            "is deprecated and will be ignored in future versions."
+        )
+        deprecate("image_size", "1.0.0", deprecation_message)
+        in_channels = num_in_channels
+    else:
+        in_channels = unet_params["in_channels"]
+
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    if unet_params["transformer_depth"] is not None:
+        transformer_layers_per_block = (
+            unet_params["transformer_depth"]
+            if isinstance(unet_params["transformer_depth"], int)
+            else list(unet_params["transformer_depth"])
+        )
+    else:
+        transformer_layers_per_block = 1
+
+    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)
+
+    head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None
+    use_linear_projection = (
+        unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False
+    )
+    if use_linear_projection:
+        # stable diffusion 2-base-512 and 2-768
+        if head_dim is None:
+            head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"]
+            head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])]
+
+    class_embed_type = None
+    addition_embed_type = None
+    addition_time_embed_dim = None
+    projection_class_embeddings_input_dim = None
+    context_dim = None
+
+    if unet_params["context_dim"] is not None:
+        context_dim = (
+            unet_params["context_dim"]
+            if isinstance(unet_params["context_dim"], int)
+            else unet_params["context_dim"][0]
+        )
+
+    if "num_classes" in unet_params:
+        if unet_params["num_classes"] == "sequential":
+            if context_dim in [2048, 1280]:
+                # SDXL
+                addition_embed_type = "text_time"
+                addition_time_embed_dim = 256
+            else:
+                class_embed_type = "projection"
+            assert "adm_in_channels" in unet_params
+            projection_class_embeddings_input_dim = unet_params["adm_in_channels"]
+
+    config = {
+        "sample_size": image_size // vae_scale_factor,
+        "in_channels": in_channels,
+        "down_block_types": down_block_types,
+        "block_out_channels": block_out_channels,
+        "layers_per_block": unet_params["num_res_blocks"],
+        "cross_attention_dim": context_dim,
+        "attention_head_dim": head_dim,
+        "use_linear_projection": use_linear_projection,
+        "class_embed_type": class_embed_type,
+        "addition_embed_type": addition_embed_type,
+        "addition_time_embed_dim": addition_time_embed_dim,
+        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
+        "transformer_layers_per_block": transformer_layers_per_block,
+    }
+
+    if upcast_attention is not None:
+        deprecation_message = (
+            "Configuring UNet2DConditionModel with the `upcast_attention` argument to `from_single_file`"
+            "is deprecated and will be ignored in future versions."
+        )
+        deprecate("image_size", "1.0.0", deprecation_message)
+        config["upcast_attention"] = upcast_attention
+
+    if "disable_self_attentions" in unet_params:
+        config["only_cross_attention"] = unet_params["disable_self_attentions"]
+
+    if "num_classes" in unet_params and isinstance(unet_params["num_classes"], int):
+        config["num_class_embeds"] = unet_params["num_classes"]
+
+    config["out_channels"] = unet_params["out_channels"]
+    config["up_block_types"] = up_block_types
+
+    return config
+
+
+def create_controlnet_diffusers_config_from_ldm(original_config, checkpoint, image_size=None, **kwargs):
+    if image_size is not None:
+        deprecation_message = (
+            "Configuring ControlNetModel with the `image_size` argument"
+            "is deprecated and will be ignored in future versions."
+        )
+        deprecate("image_size", "1.0.0", deprecation_message)
+
+    image_size = set_image_size(checkpoint, image_size=image_size)
+
+    unet_params = original_config["model"]["params"]["control_stage_config"]["params"]
+    diffusers_unet_config = create_unet_diffusers_config_from_ldm(original_config, image_size=image_size)
+
+    controlnet_config = {
+        "conditioning_channels": unet_params["hint_channels"],
+        "in_channels": diffusers_unet_config["in_channels"],
+        "down_block_types": diffusers_unet_config["down_block_types"],
+        "block_out_channels": diffusers_unet_config["block_out_channels"],
+        "layers_per_block": diffusers_unet_config["layers_per_block"],
+        "cross_attention_dim": diffusers_unet_config["cross_attention_dim"],
+        "attention_head_dim": diffusers_unet_config["attention_head_dim"],
+        "use_linear_projection": diffusers_unet_config["use_linear_projection"],
+        "class_embed_type": diffusers_unet_config["class_embed_type"],
+        "addition_embed_type": diffusers_unet_config["addition_embed_type"],
+        "addition_time_embed_dim": diffusers_unet_config["addition_time_embed_dim"],
+        "projection_class_embeddings_input_dim": diffusers_unet_config["projection_class_embeddings_input_dim"],
+        "transformer_layers_per_block": diffusers_unet_config["transformer_layers_per_block"],
+    }
+
+    return controlnet_config
+
+
+def create_vae_diffusers_config_from_ldm(original_config, checkpoint, image_size=None, scaling_factor=None):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    if image_size is not None:
+        deprecation_message = (
+            "Configuring AutoencoderKL with the `image_size` argument"
+            "is deprecated and will be ignored in future versions."
+        )
+        deprecate("image_size", "1.0.0", deprecation_message)
+
+    image_size = set_image_size(checkpoint, image_size=image_size)
+
+    if "edm_mean" in checkpoint and "edm_std" in checkpoint:
+        latents_mean = checkpoint["edm_mean"]
+        latents_std = checkpoint["edm_std"]
+    else:
+        latents_mean = None
+        latents_std = None
+
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+    if (scaling_factor is None) and (latents_mean is not None) and (latents_std is not None):
+        scaling_factor = PLAYGROUND_VAE_SCALING_FACTOR
+
+    elif (scaling_factor is None) and ("scale_factor" in original_config["model"]["params"]):
+        scaling_factor = original_config["model"]["params"]["scale_factor"]
+
+    elif scaling_factor is None:
+        scaling_factor = LDM_VAE_DEFAULT_SCALING_FACTOR
+
+    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    config = {
+        "sample_size": image_size,
+        "in_channels": vae_params["in_channels"],
+        "out_channels": vae_params["out_ch"],
+        "down_block_types": down_block_types,
+        "up_block_types": up_block_types,
+        "block_out_channels": block_out_channels,
+        "latent_channels": vae_params["z_channels"],
+        "layers_per_block": vae_params["num_res_blocks"],
+        "scaling_factor": scaling_factor,
+    }
+    if latents_mean is not None and latents_std is not None:
+        config.update({"latents_mean": latents_mean, "latents_std": latents_std})
+
+    return config
+
+
+def update_unet_resnet_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, mapping=None):
+    for ldm_key in ldm_keys:
+        diffusers_key = (
+            ldm_key.replace("in_layers.0", "norm1")
+            .replace("in_layers.2", "conv1")
+            .replace("out_layers.0", "norm2")
+            .replace("out_layers.3", "conv2")
+            .replace("emb_layers.1", "time_emb_proj")
+            .replace("skip_connection", "conv_shortcut")
+        )
+        if mapping:
+            diffusers_key = diffusers_key.replace(mapping["old"], mapping["new"])
+        new_checkpoint[diffusers_key] = checkpoint.get(ldm_key)
+
+
+def update_unet_attention_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, mapping):
+    for ldm_key in ldm_keys:
+        diffusers_key = ldm_key.replace(mapping["old"], mapping["new"])
+        new_checkpoint[diffusers_key] = checkpoint.get(ldm_key)
+
+
+def update_vae_resnet_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping):
+    for ldm_key in keys:
+        diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]).replace("nin_shortcut", "conv_shortcut")
+        new_checkpoint[diffusers_key] = checkpoint.get(ldm_key)
+
+
+def update_vae_attentions_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping):
+    for ldm_key in keys:
+        diffusers_key = (
+            ldm_key.replace(mapping["old"], mapping["new"])
+            .replace("norm.weight", "group_norm.weight")
+            .replace("norm.bias", "group_norm.bias")
+            .replace("q.weight", "to_q.weight")
+            .replace("q.bias", "to_q.bias")
+            .replace("k.weight", "to_k.weight")
+            .replace("k.bias", "to_k.bias")
+            .replace("v.weight", "to_v.weight")
+            .replace("v.bias", "to_v.bias")
+            .replace("proj_out.weight", "to_out.0.weight")
+            .replace("proj_out.bias", "to_out.0.bias")
+        )
+        new_checkpoint[diffusers_key] = checkpoint.get(ldm_key)
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        shape = new_checkpoint[diffusers_key].shape
+
+        if len(shape) == 3:
+            new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0]
+        elif len(shape) == 4:
+            new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0, 0]
+
+
+def convert_stable_cascade_unet_single_file_to_diffusers(checkpoint, **kwargs):
+    is_stage_c = "clip_txt_mapper.weight" in checkpoint
+
+    if is_stage_c:
+        state_dict = {}
+        for key in checkpoint.keys():
+            if key.endswith("in_proj_weight"):
+                weights = checkpoint[key].chunk(3, 0)
+                state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
+                state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
+                state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
+            elif key.endswith("in_proj_bias"):
+                weights = checkpoint[key].chunk(3, 0)
+                state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
+                state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
+                state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
+            elif key.endswith("out_proj.weight"):
+                weights = checkpoint[key]
+                state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights
+            elif key.endswith("out_proj.bias"):
+                weights = checkpoint[key]
+                state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
+            else:
+                state_dict[key] = checkpoint[key]
+    else:
+        state_dict = {}
+        for key in checkpoint.keys():
+            if key.endswith("in_proj_weight"):
+                weights = checkpoint[key].chunk(3, 0)
+                state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
+                state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
+                state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
+            elif key.endswith("in_proj_bias"):
+                weights = checkpoint[key].chunk(3, 0)
+                state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
+                state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
+                state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
+            elif key.endswith("out_proj.weight"):
+                weights = checkpoint[key]
+                state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights
+            elif key.endswith("out_proj.bias"):
+                weights = checkpoint[key]
+                state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
+            # rename clip_mapper to clip_txt_pooled_mapper
+            elif key.endswith("clip_mapper.weight"):
+                weights = checkpoint[key]
+                state_dict[key.replace("clip_mapper.weight", "clip_txt_pooled_mapper.weight")] = weights
+            elif key.endswith("clip_mapper.bias"):
+                weights = checkpoint[key]
+                state_dict[key.replace("clip_mapper.bias", "clip_txt_pooled_mapper.bias")] = weights
+            else:
+                state_dict[key] = checkpoint[key]
+
+    return state_dict
+
+
+def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False, **kwargs):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+    unet_key = LDM_UNET_KEY
+
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+        logger.warning("Checkpoint has both EMA and non-EMA weights.")
+        logger.warning(
+            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+        )
+        for key in keys:
+            if key.startswith("model.diffusion_model"):
+                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.get(flat_ema_key)
+    else:
+        if sum(k.startswith("model_ema") for k in keys) > 100:
+            logger.warning(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+        for key in keys:
+            if key.startswith(unet_key):
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+    ldm_unet_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["layers"]
+    for diffusers_key, ldm_key in ldm_unet_keys.items():
+        if ldm_key not in unet_state_dict:
+            continue
+        new_checkpoint[diffusers_key] = unet_state_dict[ldm_key]
+
+    if ("class_embed_type" in config) and (config["class_embed_type"] in ["timestep", "projection"]):
+        class_embed_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["class_embed_type"]
+        for diffusers_key, ldm_key in class_embed_keys.items():
+            new_checkpoint[diffusers_key] = unet_state_dict[ldm_key]
+
+    if ("addition_embed_type" in config) and (config["addition_embed_type"] == "text_time"):
+        addition_embed_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["addition_embed_type"]
+        for diffusers_key, ldm_key in addition_embed_keys.items():
+            new_checkpoint[diffusers_key] = unet_state_dict[ldm_key]
+
+    # Relevant to StableDiffusionUpscalePipeline
+    if "num_class_embeds" in config:
+        if (config["num_class_embeds"] is not None) and ("label_emb.weight" in unet_state_dict):
+            new_checkpoint["class_embedding.weight"] = unet_state_dict["label_emb.weight"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    # Down blocks
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        update_unet_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            unet_state_dict,
+            {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"},
+        )
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.get(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.get(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+        if attentions:
+            update_unet_attention_ldm_to_diffusers(
+                attentions,
+                new_checkpoint,
+                unet_state_dict,
+                {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"},
+            )
+
+    # Mid blocks
+    for key in middle_blocks.keys():
+        diffusers_key = max(key - 1, 0)
+        if key % 2 == 0:
+            update_unet_resnet_ldm_to_diffusers(
+                middle_blocks[key],
+                new_checkpoint,
+                unet_state_dict,
+                mapping={"old": f"middle_block.{key}", "new": f"mid_block.resnets.{diffusers_key}"},
+            )
+        else:
+            update_unet_attention_ldm_to_diffusers(
+                middle_blocks[key],
+                new_checkpoint,
+                unet_state_dict,
+                mapping={"old": f"middle_block.{key}", "new": f"mid_block.attentions.{diffusers_key}"},
+            )
+
+    # Up Blocks
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in output_blocks[i] if f"output_blocks.{i}.0" in key and f"output_blocks.{i}.0.op" not in key
+        ]
+        update_unet_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            unet_state_dict,
+            {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"},
+        )
+
+        attentions = [
+            key for key in output_blocks[i] if f"output_blocks.{i}.1" in key and f"output_blocks.{i}.1.conv" not in key
+        ]
+        if attentions:
+            update_unet_attention_ldm_to_diffusers(
+                attentions,
+                new_checkpoint,
+                unet_state_dict,
+                {"old": f"output_blocks.{i}.1", "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}"},
+            )
+
+        if f"output_blocks.{i}.1.conv.weight" in unet_state_dict:
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                f"output_blocks.{i}.1.conv.weight"
+            ]
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                f"output_blocks.{i}.1.conv.bias"
+            ]
+        if f"output_blocks.{i}.2.conv.weight" in unet_state_dict:
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                f"output_blocks.{i}.2.conv.weight"
+            ]
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                f"output_blocks.{i}.2.conv.bias"
+            ]
+
+    return new_checkpoint
+
+
+def convert_controlnet_checkpoint(
+    checkpoint,
+    config,
+    **kwargs,
+):
+    # Some controlnet ckpt files are distributed independently from the rest of the
+    # model components i.e. https://huggingface.co/thibaud/controlnet-sd21/
+    if "time_embed.0.weight" in checkpoint:
+        controlnet_state_dict = checkpoint
+
+    else:
+        controlnet_state_dict = {}
+        keys = list(checkpoint.keys())
+        controlnet_key = LDM_CONTROLNET_KEY
+        for key in keys:
+            if key.startswith(controlnet_key):
+                controlnet_state_dict[key.replace(controlnet_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+    ldm_controlnet_keys = DIFFUSERS_TO_LDM_MAPPING["controlnet"]["layers"]
+    for diffusers_key, ldm_key in ldm_controlnet_keys.items():
+        if ldm_key not in controlnet_state_dict:
+            continue
+        new_checkpoint[diffusers_key] = controlnet_state_dict[ldm_key]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len(
+        {".".join(layer.split(".")[:2]) for layer in controlnet_state_dict if "input_blocks" in layer}
+    )
+    input_blocks = {
+        layer_id: [key for key in controlnet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Down blocks
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        update_unet_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            controlnet_state_dict,
+            {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"},
+        )
+
+        if f"input_blocks.{i}.0.op.weight" in controlnet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = controlnet_state_dict.get(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = controlnet_state_dict.get(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+        if attentions:
+            update_unet_attention_ldm_to_diffusers(
+                attentions,
+                new_checkpoint,
+                controlnet_state_dict,
+                {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"},
+            )
+
+    # controlnet down blocks
+    for i in range(num_input_blocks):
+        new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = controlnet_state_dict.get(f"zero_convs.{i}.0.weight")
+        new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = controlnet_state_dict.get(f"zero_convs.{i}.0.bias")
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len(
+        {".".join(layer.split(".")[:2]) for layer in controlnet_state_dict if "middle_block" in layer}
+    )
+    middle_blocks = {
+        layer_id: [key for key in controlnet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Mid blocks
+    for key in middle_blocks.keys():
+        diffusers_key = max(key - 1, 0)
+        if key % 2 == 0:
+            update_unet_resnet_ldm_to_diffusers(
+                middle_blocks[key],
+                new_checkpoint,
+                controlnet_state_dict,
+                mapping={"old": f"middle_block.{key}", "new": f"mid_block.resnets.{diffusers_key}"},
+            )
+        else:
+            update_unet_attention_ldm_to_diffusers(
+                middle_blocks[key],
+                new_checkpoint,
+                controlnet_state_dict,
+                mapping={"old": f"middle_block.{key}", "new": f"mid_block.attentions.{diffusers_key}"},
+            )
+
+    # mid block
+    new_checkpoint["controlnet_mid_block.weight"] = controlnet_state_dict.get("middle_block_out.0.weight")
+    new_checkpoint["controlnet_mid_block.bias"] = controlnet_state_dict.get("middle_block_out.0.bias")
+
+    # controlnet cond embedding blocks
+    cond_embedding_blocks = {
+        ".".join(layer.split(".")[:2])
+        for layer in controlnet_state_dict
+        if "input_hint_block" in layer and ("input_hint_block.0" not in layer) and ("input_hint_block.14" not in layer)
+    }
+    num_cond_embedding_blocks = len(cond_embedding_blocks)
+
+    for idx in range(1, num_cond_embedding_blocks + 1):
+        diffusers_idx = idx - 1
+        cond_block_id = 2 * idx
+
+        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_idx}.weight"] = controlnet_state_dict.get(
+            f"input_hint_block.{cond_block_id}.weight"
+        )
+        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_idx}.bias"] = controlnet_state_dict.get(
+            f"input_hint_block.{cond_block_id}.bias"
+        )
+
+    return new_checkpoint
+
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    # remove the LDM_VAE_KEY prefix from the ldm checkpoint keys so that it is easier to map them to diffusers keys
+    vae_state_dict = {}
+    keys = list(checkpoint.keys())
+    vae_key = ""
+    for ldm_vae_key in LDM_VAE_KEYS:
+        if any(k.startswith(ldm_vae_key) for k in keys):
+            vae_key = ldm_vae_key
+
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+    vae_diffusers_ldm_map = DIFFUSERS_TO_LDM_MAPPING["vae"]
+    for diffusers_key, ldm_key in vae_diffusers_ldm_map.items():
+        if ldm_key not in vae_state_dict:
+            continue
+        new_checkpoint[diffusers_key] = vae_state_dict[ldm_key]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len(config["down_block_types"])
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"},
+        )
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.get(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.get(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"},
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    update_vae_attentions_ldm_to_diffusers(
+        mid_attentions, new_checkpoint, vae_state_dict, mapping={"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    )
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len(config["up_block_types"])
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"},
+        )
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"},
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    update_vae_attentions_ldm_to_diffusers(
+        mid_attentions, new_checkpoint, vae_state_dict, mapping={"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    )
+    conv_attn_to_linear(new_checkpoint)
+
+    return new_checkpoint
+
+
+def convert_ldm_clip_checkpoint(checkpoint, remove_prefix=None):
+    keys = list(checkpoint.keys())
+    text_model_dict = {}
+
+    remove_prefixes = []
+    remove_prefixes.extend(LDM_CLIP_PREFIX_TO_REMOVE)
+    if remove_prefix:
+        remove_prefixes.append(remove_prefix)
+
+    for key in keys:
+        for prefix in remove_prefixes:
+            if key.startswith(prefix):
+                diffusers_key = key.replace(prefix, "")
+                text_model_dict[diffusers_key] = checkpoint.get(key)
+
+    return text_model_dict
+
+
+def convert_open_clip_checkpoint(
+    text_model,
+    checkpoint,
+    prefix="cond_stage_model.model.",
+):
+    text_model_dict = {}
+    text_proj_key = prefix + "text_projection"
+
+    if text_proj_key in checkpoint:
+        text_proj_dim = int(checkpoint[text_proj_key].shape[0])
+    elif hasattr(text_model.config, "projection_dim"):
+        text_proj_dim = text_model.config.projection_dim
+    else:
+        text_proj_dim = LDM_OPEN_CLIP_TEXT_PROJECTION_DIM
+
+    keys = list(checkpoint.keys())
+    keys_to_ignore = SD_2_TEXT_ENCODER_KEYS_TO_IGNORE
+
+    openclip_diffusers_ldm_map = DIFFUSERS_TO_LDM_MAPPING["openclip"]["layers"]
+    for diffusers_key, ldm_key in openclip_diffusers_ldm_map.items():
+        ldm_key = prefix + ldm_key
+        if ldm_key not in checkpoint:
+            continue
+        if ldm_key in keys_to_ignore:
+            continue
+        if ldm_key.endswith("text_projection"):
+            text_model_dict[diffusers_key] = checkpoint[ldm_key].T.contiguous()
+        else:
+            text_model_dict[diffusers_key] = checkpoint[ldm_key]
+
+    for key in keys:
+        if key in keys_to_ignore:
+            continue
+
+        if not key.startswith(prefix + "transformer."):
+            continue
+
+        diffusers_key = key.replace(prefix + "transformer.", "")
+        transformer_diffusers_to_ldm_map = DIFFUSERS_TO_LDM_MAPPING["openclip"]["transformer"]
+        for new_key, old_key in transformer_diffusers_to_ldm_map.items():
+            diffusers_key = (
+                diffusers_key.replace(old_key, new_key).replace(".in_proj_weight", "").replace(".in_proj_bias", "")
+            )
+
+        if key.endswith(".in_proj_weight"):
+            weight_value = checkpoint.get(key)
+
+            text_model_dict[diffusers_key + ".q_proj.weight"] = weight_value[:text_proj_dim, :].clone().detach()
+            text_model_dict[diffusers_key + ".k_proj.weight"] = (
+                weight_value[text_proj_dim : text_proj_dim * 2, :].clone().detach()
+            )
+            text_model_dict[diffusers_key + ".v_proj.weight"] = weight_value[text_proj_dim * 2 :, :].clone().detach()
+
+        elif key.endswith(".in_proj_bias"):
+            weight_value = checkpoint.get(key)
+            text_model_dict[diffusers_key + ".q_proj.bias"] = weight_value[:text_proj_dim].clone().detach()
+            text_model_dict[diffusers_key + ".k_proj.bias"] = (
+                weight_value[text_proj_dim : text_proj_dim * 2].clone().detach()
+            )
+            text_model_dict[diffusers_key + ".v_proj.bias"] = weight_value[text_proj_dim * 2 :].clone().detach()
+        else:
+            text_model_dict[diffusers_key] = checkpoint.get(key)
+
+    return text_model_dict
+
+
+def create_diffusers_clip_model_from_ldm(
+    cls,
+    checkpoint,
+    subfolder="",
+    config=None,
+    torch_dtype=None,
+    local_files_only=None,
+    is_legacy_loading=False,
+):
+    if config:
+        config = {"pretrained_model_name_or_path": config}
+    else:
+        config = fetch_diffusers_config(checkpoint)
+
+    # For backwards compatibility
+    # Older versions of `from_single_file` expected CLIP configs to be placed in their original transformers model repo
+    # in the cache_dir, rather than in a subfolder of the Diffusers model
+    if is_legacy_loading:
+        logger.warning(
+            (
+                "Detected legacy CLIP loading behavior. Please run `from_single_file` with `local_files_only=False once to update "
+                "the local cache directory with the necessary CLIP model config files. "
+                "Attempting to load CLIP model from legacy cache directory."
+            )
+        )
+
+        if is_clip_model(checkpoint) or is_clip_sdxl_model(checkpoint):
+            clip_config = "openai/clip-vit-large-patch14"
+            config["pretrained_model_name_or_path"] = clip_config
+            subfolder = ""
+
+        elif is_open_clip_model(checkpoint):
+            clip_config = "stabilityai/stable-diffusion-2"
+            config["pretrained_model_name_or_path"] = clip_config
+            subfolder = "text_encoder"
+
+        else:
+            clip_config = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+            config["pretrained_model_name_or_path"] = clip_config
+            subfolder = ""
+
+    model_config = cls.config_class.from_pretrained(**config, subfolder=subfolder, local_files_only=local_files_only)
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        model = cls(model_config)
+
+    position_embedding_dim = model.text_model.embeddings.position_embedding.weight.shape[-1]
+
+    if is_clip_model(checkpoint):
+        diffusers_format_checkpoint = convert_ldm_clip_checkpoint(checkpoint)
+
+    elif (
+        is_clip_sdxl_model(checkpoint)
+        and checkpoint[CHECKPOINT_KEY_NAMES["clip_sdxl"]].shape[-1] == position_embedding_dim
+    ):
+        diffusers_format_checkpoint = convert_ldm_clip_checkpoint(checkpoint)
+
+    elif (
+        is_clip_sd3_model(checkpoint)
+        and checkpoint[CHECKPOINT_KEY_NAMES["clip_sd3"]].shape[-1] == position_embedding_dim
+    ):
+        diffusers_format_checkpoint = convert_ldm_clip_checkpoint(checkpoint, "text_encoders.clip_l.transformer.")
+        diffusers_format_checkpoint["text_projection.weight"] = torch.eye(position_embedding_dim)
+
+    elif is_open_clip_model(checkpoint):
+        prefix = "cond_stage_model.model."
+        diffusers_format_checkpoint = convert_open_clip_checkpoint(model, checkpoint, prefix=prefix)
+
+    elif (
+        is_open_clip_sdxl_model(checkpoint)
+        and checkpoint[CHECKPOINT_KEY_NAMES["open_clip_sdxl"]].shape[-1] == position_embedding_dim
+    ):
+        prefix = "conditioner.embedders.1.model."
+        diffusers_format_checkpoint = convert_open_clip_checkpoint(model, checkpoint, prefix=prefix)
+
+    elif is_open_clip_sdxl_refiner_model(checkpoint):
+        prefix = "conditioner.embedders.0.model."
+        diffusers_format_checkpoint = convert_open_clip_checkpoint(model, checkpoint, prefix=prefix)
+
+    elif (
+        is_open_clip_sd3_model(checkpoint)
+        and checkpoint[CHECKPOINT_KEY_NAMES["open_clip_sd3"]].shape[-1] == position_embedding_dim
+    ):
+        diffusers_format_checkpoint = convert_ldm_clip_checkpoint(checkpoint, "text_encoders.clip_g.transformer.")
+
+    else:
+        raise ValueError("The provided checkpoint does not seem to contain a valid CLIP model.")
+
+    if is_accelerate_available():
+        unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
+    else:
+        _, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False)
+
+    if model._keys_to_ignore_on_load_unexpected is not None:
+        for pat in model._keys_to_ignore_on_load_unexpected:
+            unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
+
+    if len(unexpected_keys) > 0:
+        logger.warning(
+            f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
+        )
+
+    if torch_dtype is not None:
+        model.to(torch_dtype)
+
+    model.eval()
+
+    return model
+
+
+def _legacy_load_scheduler(
+    cls,
+    checkpoint,
+    component_name,
+    original_config=None,
+    **kwargs,
+):
+    scheduler_type = kwargs.get("scheduler_type", None)
+    prediction_type = kwargs.get("prediction_type", None)
+
+    if scheduler_type is not None:
+        deprecation_message = (
+            "Please pass an instance of a Scheduler object directly to the `scheduler` argument in `from_single_file`\n\n"
+            "Example:\n\n"
+            "from diffusers import StableDiffusionPipeline, DDIMScheduler\n\n"
+            "scheduler = DDIMScheduler()\n"
+            "pipe = StableDiffusionPipeline.from_single_file(<checkpoint path>, scheduler=scheduler)\n"
+        )
+        deprecate("scheduler_type", "1.0.0", deprecation_message)
+
+    if prediction_type is not None:
+        deprecation_message = (
+            "Please configure an instance of a Scheduler with the appropriate `prediction_type` and "
+            "pass the object directly to the `scheduler` argument in `from_single_file`.\n\n"
+            "Example:\n\n"
+            "from diffusers import StableDiffusionPipeline, DDIMScheduler\n\n"
+            'scheduler = DDIMScheduler(prediction_type="v_prediction")\n'
+            "pipe = StableDiffusionPipeline.from_single_file(<checkpoint path>, scheduler=scheduler)\n"
+        )
+        deprecate("prediction_type", "1.0.0", deprecation_message)
+
+    scheduler_config = SCHEDULER_DEFAULT_CONFIG
+    model_type = infer_diffusers_model_type(checkpoint=checkpoint)
+
+    global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
+
+    if original_config:
+        num_train_timesteps = getattr(original_config["model"]["params"], "timesteps", 1000)
+    else:
+        num_train_timesteps = 1000
+
+    scheduler_config["num_train_timesteps"] = num_train_timesteps
+
+    if model_type == "v2":
+        if prediction_type is None:
+            # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` # as it relies on a brittle global step parameter here
+            prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
+
+    else:
+        prediction_type = prediction_type or "epsilon"
+
+    scheduler_config["prediction_type"] = prediction_type
+
+    if model_type in ["xl_base", "xl_refiner"]:
+        scheduler_type = "euler"
+    elif model_type == "playground":
+        scheduler_type = "edm_dpm_solver_multistep"
+    else:
+        if original_config:
+            beta_start = original_config["model"]["params"].get("linear_start")
+            beta_end = original_config["model"]["params"].get("linear_end")
+
+        else:
+            beta_start = 0.02
+            beta_end = 0.085
+
+        scheduler_config["beta_start"] = beta_start
+        scheduler_config["beta_end"] = beta_end
+        scheduler_config["beta_schedule"] = "scaled_linear"
+        scheduler_config["clip_sample"] = False
+        scheduler_config["set_alpha_to_one"] = False
+
+    # to deal with an edge case StableDiffusionUpscale pipeline has two schedulers
+    if component_name == "low_res_scheduler":
+        return cls.from_config(
+            {
+                "beta_end": 0.02,
+                "beta_schedule": "scaled_linear",
+                "beta_start": 0.0001,
+                "clip_sample": True,
+                "num_train_timesteps": 1000,
+                "prediction_type": "epsilon",
+                "trained_betas": None,
+                "variance_type": "fixed_small",
+            }
+        )
+
+    if scheduler_type is None:
+        return cls.from_config(scheduler_config)
+
+    elif scheduler_type == "pndm":
+        scheduler_config["skip_prk_steps"] = True
+        scheduler = PNDMScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "heun":
+        scheduler = HeunDiscreteScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "ddim":
+        scheduler = DDIMScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "edm_dpm_solver_multistep":
+        scheduler_config = {
+            "algorithm_type": "dpmsolver++",
+            "dynamic_thresholding_ratio": 0.995,
+            "euler_at_final": False,
+            "final_sigmas_type": "zero",
+            "lower_order_final": True,
+            "num_train_timesteps": 1000,
+            "prediction_type": "epsilon",
+            "rho": 7.0,
+            "sample_max_value": 1.0,
+            "sigma_data": 0.5,
+            "sigma_max": 80.0,
+            "sigma_min": 0.002,
+            "solver_order": 2,
+            "solver_type": "midpoint",
+            "thresholding": False,
+        }
+        scheduler = EDMDPMSolverMultistepScheduler(**scheduler_config)
+
+    else:
+        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+
+    return scheduler
+
+
+def _legacy_load_clip_tokenizer(cls, checkpoint, config=None, local_files_only=False):
+    if config:
+        config = {"pretrained_model_name_or_path": config}
+    else:
+        config = fetch_diffusers_config(checkpoint)
+
+    if is_clip_model(checkpoint) or is_clip_sdxl_model(checkpoint):
+        clip_config = "openai/clip-vit-large-patch14"
+        config["pretrained_model_name_or_path"] = clip_config
+        subfolder = ""
+
+    elif is_open_clip_model(checkpoint):
+        clip_config = "stabilityai/stable-diffusion-2"
+        config["pretrained_model_name_or_path"] = clip_config
+        subfolder = "tokenizer"
+
+    else:
+        clip_config = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+        config["pretrained_model_name_or_path"] = clip_config
+        subfolder = ""
+
+    tokenizer = cls.from_pretrained(**config, subfolder=subfolder, local_files_only=local_files_only)
+
+    return tokenizer
+
+
+def _legacy_load_safety_checker(local_files_only, torch_dtype):
+    # Support for loading safety checker components using the deprecated
+    # `load_safety_checker` argument.
+
+    from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+    feature_extractor = AutoImageProcessor.from_pretrained(
+        "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only, torch_dtype=torch_dtype
+    )
+    safety_checker = StableDiffusionSafetyChecker.from_pretrained(
+        "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only, torch_dtype=torch_dtype
+    )
+
+    return {"safety_checker": safety_checker, "feature_extractor": feature_extractor}
+
+
+# in SD3 original implementation of AdaLayerNormContinuous, it split linear projection output into shift, scale;
+# while in diffusers it split into scale, shift. Here we swap the linear projection weights in order to be able to use diffusers implementation
+def swap_scale_shift(weight, dim):
+    shift, scale = weight.chunk(2, dim=0)
+    new_weight = torch.cat([scale, shift], dim=0)
+    return new_weight
+
+
+def convert_sd3_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
+    converted_state_dict = {}
+    keys = list(checkpoint.keys())
+    for k in keys:
+        if "model.diffusion_model." in k:
+            checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k)
+
+    num_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "joint_blocks" in k))[-1] + 1  # noqa: C401
+    caption_projection_dim = 1536
+
+    # Positional and patch embeddings.
+    converted_state_dict["pos_embed.pos_embed"] = checkpoint.pop("pos_embed")
+    converted_state_dict["pos_embed.proj.weight"] = checkpoint.pop("x_embedder.proj.weight")
+    converted_state_dict["pos_embed.proj.bias"] = checkpoint.pop("x_embedder.proj.bias")
+
+    # Timestep embeddings.
+    converted_state_dict["time_text_embed.timestep_embedder.linear_1.weight"] = checkpoint.pop(
+        "t_embedder.mlp.0.weight"
+    )
+    converted_state_dict["time_text_embed.timestep_embedder.linear_1.bias"] = checkpoint.pop("t_embedder.mlp.0.bias")
+    converted_state_dict["time_text_embed.timestep_embedder.linear_2.weight"] = checkpoint.pop(
+        "t_embedder.mlp.2.weight"
+    )
+    converted_state_dict["time_text_embed.timestep_embedder.linear_2.bias"] = checkpoint.pop("t_embedder.mlp.2.bias")
+
+    # Context projections.
+    converted_state_dict["context_embedder.weight"] = checkpoint.pop("context_embedder.weight")
+    converted_state_dict["context_embedder.bias"] = checkpoint.pop("context_embedder.bias")
+
+    # Pooled context projection.
+    converted_state_dict["time_text_embed.text_embedder.linear_1.weight"] = checkpoint.pop("y_embedder.mlp.0.weight")
+    converted_state_dict["time_text_embed.text_embedder.linear_1.bias"] = checkpoint.pop("y_embedder.mlp.0.bias")
+    converted_state_dict["time_text_embed.text_embedder.linear_2.weight"] = checkpoint.pop("y_embedder.mlp.2.weight")
+    converted_state_dict["time_text_embed.text_embedder.linear_2.bias"] = checkpoint.pop("y_embedder.mlp.2.bias")
+
+    # Transformer blocks 🎸.
+    for i in range(num_layers):
+        # Q, K, V
+        sample_q, sample_k, sample_v = torch.chunk(
+            checkpoint.pop(f"joint_blocks.{i}.x_block.attn.qkv.weight"), 3, dim=0
+        )
+        context_q, context_k, context_v = torch.chunk(
+            checkpoint.pop(f"joint_blocks.{i}.context_block.attn.qkv.weight"), 3, dim=0
+        )
+        sample_q_bias, sample_k_bias, sample_v_bias = torch.chunk(
+            checkpoint.pop(f"joint_blocks.{i}.x_block.attn.qkv.bias"), 3, dim=0
+        )
+        context_q_bias, context_k_bias, context_v_bias = torch.chunk(
+            checkpoint.pop(f"joint_blocks.{i}.context_block.attn.qkv.bias"), 3, dim=0
+        )
+
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_q.weight"] = torch.cat([sample_q])
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_q.bias"] = torch.cat([sample_q_bias])
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_k.weight"] = torch.cat([sample_k])
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_k.bias"] = torch.cat([sample_k_bias])
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_v.weight"] = torch.cat([sample_v])
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_v.bias"] = torch.cat([sample_v_bias])
+
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_q_proj.weight"] = torch.cat([context_q])
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_q_proj.bias"] = torch.cat([context_q_bias])
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_k_proj.weight"] = torch.cat([context_k])
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_k_proj.bias"] = torch.cat([context_k_bias])
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_v_proj.weight"] = torch.cat([context_v])
+        converted_state_dict[f"transformer_blocks.{i}.attn.add_v_proj.bias"] = torch.cat([context_v_bias])
+
+        # output projections.
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_out.0.weight"] = checkpoint.pop(
+            f"joint_blocks.{i}.x_block.attn.proj.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{i}.attn.to_out.0.bias"] = checkpoint.pop(
+            f"joint_blocks.{i}.x_block.attn.proj.bias"
+        )
+        if not (i == num_layers - 1):
+            converted_state_dict[f"transformer_blocks.{i}.attn.to_add_out.weight"] = checkpoint.pop(
+                f"joint_blocks.{i}.context_block.attn.proj.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.attn.to_add_out.bias"] = checkpoint.pop(
+                f"joint_blocks.{i}.context_block.attn.proj.bias"
+            )
+
+        # norms.
+        converted_state_dict[f"transformer_blocks.{i}.norm1.linear.weight"] = checkpoint.pop(
+            f"joint_blocks.{i}.x_block.adaLN_modulation.1.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{i}.norm1.linear.bias"] = checkpoint.pop(
+            f"joint_blocks.{i}.x_block.adaLN_modulation.1.bias"
+        )
+        if not (i == num_layers - 1):
+            converted_state_dict[f"transformer_blocks.{i}.norm1_context.linear.weight"] = checkpoint.pop(
+                f"joint_blocks.{i}.context_block.adaLN_modulation.1.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.norm1_context.linear.bias"] = checkpoint.pop(
+                f"joint_blocks.{i}.context_block.adaLN_modulation.1.bias"
+            )
+        else:
+            converted_state_dict[f"transformer_blocks.{i}.norm1_context.linear.weight"] = swap_scale_shift(
+                checkpoint.pop(f"joint_blocks.{i}.context_block.adaLN_modulation.1.weight"),
+                dim=caption_projection_dim,
+            )
+            converted_state_dict[f"transformer_blocks.{i}.norm1_context.linear.bias"] = swap_scale_shift(
+                checkpoint.pop(f"joint_blocks.{i}.context_block.adaLN_modulation.1.bias"),
+                dim=caption_projection_dim,
+            )
+
+        # ffs.
+        converted_state_dict[f"transformer_blocks.{i}.ff.net.0.proj.weight"] = checkpoint.pop(
+            f"joint_blocks.{i}.x_block.mlp.fc1.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{i}.ff.net.0.proj.bias"] = checkpoint.pop(
+            f"joint_blocks.{i}.x_block.mlp.fc1.bias"
+        )
+        converted_state_dict[f"transformer_blocks.{i}.ff.net.2.weight"] = checkpoint.pop(
+            f"joint_blocks.{i}.x_block.mlp.fc2.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{i}.ff.net.2.bias"] = checkpoint.pop(
+            f"joint_blocks.{i}.x_block.mlp.fc2.bias"
+        )
+        if not (i == num_layers - 1):
+            converted_state_dict[f"transformer_blocks.{i}.ff_context.net.0.proj.weight"] = checkpoint.pop(
+                f"joint_blocks.{i}.context_block.mlp.fc1.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.ff_context.net.0.proj.bias"] = checkpoint.pop(
+                f"joint_blocks.{i}.context_block.mlp.fc1.bias"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.ff_context.net.2.weight"] = checkpoint.pop(
+                f"joint_blocks.{i}.context_block.mlp.fc2.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.ff_context.net.2.bias"] = checkpoint.pop(
+                f"joint_blocks.{i}.context_block.mlp.fc2.bias"
+            )
+
+    # Final blocks.
+    converted_state_dict["proj_out.weight"] = checkpoint.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = checkpoint.pop("final_layer.linear.bias")
+    converted_state_dict["norm_out.linear.weight"] = swap_scale_shift(
+        checkpoint.pop("final_layer.adaLN_modulation.1.weight"), dim=caption_projection_dim
+    )
+    converted_state_dict["norm_out.linear.bias"] = swap_scale_shift(
+        checkpoint.pop("final_layer.adaLN_modulation.1.bias"), dim=caption_projection_dim
+    )
+
+    return converted_state_dict
+
+
+def is_t5_in_single_file(checkpoint):
+    if "text_encoders.t5xxl.transformer.shared.weight" in checkpoint:
+        return True
+
+    return False
+
+
+def convert_sd3_t5_checkpoint_to_diffusers(checkpoint):
+    keys = list(checkpoint.keys())
+    text_model_dict = {}
+
+    remove_prefixes = ["text_encoders.t5xxl.transformer."]
+
+    for key in keys:
+        for prefix in remove_prefixes:
+            if key.startswith(prefix):
+                diffusers_key = key.replace(prefix, "")
+                text_model_dict[diffusers_key] = checkpoint.get(key)
+
+    return text_model_dict
+
+
+def create_diffusers_t5_model_from_checkpoint(
+    cls,
+    checkpoint,
+    subfolder="",
+    config=None,
+    torch_dtype=None,
+    local_files_only=None,
+):
+    if config:
+        config = {"pretrained_model_name_or_path": config}
+    else:
+        config = fetch_diffusers_config(checkpoint)
+
+    model_config = cls.config_class.from_pretrained(**config, subfolder=subfolder, local_files_only=local_files_only)
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        model = cls(model_config)
+
+    diffusers_format_checkpoint = convert_sd3_t5_checkpoint_to_diffusers(checkpoint)
+
+    if is_accelerate_available():
+        unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
+        if model._keys_to_ignore_on_load_unexpected is not None:
+            for pat in model._keys_to_ignore_on_load_unexpected:
+                unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
+
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
+            )
+
+    else:
+        model.load_state_dict(diffusers_format_checkpoint)
+
+    use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and (torch_dtype == torch.float16)
+    if use_keep_in_fp32_modules:
+        keep_in_fp32_modules = model._keep_in_fp32_modules
+    else:
+        keep_in_fp32_modules = []
+
+    if keep_in_fp32_modules is not None:
+        for name, param in model.named_parameters():
+            if any(module_to_keep_in_fp32 in name.split(".") for module_to_keep_in_fp32 in keep_in_fp32_modules):
+                # param = param.to(torch.float32) does not work here as only in the local scope.
+                param.data = param.data.to(torch.float32)
+
+    return model
+
+
+def convert_animatediff_checkpoint_to_diffusers(checkpoint, **kwargs):
+    converted_state_dict = {}
+    for k, v in checkpoint.items():
+        if "pos_encoder" in k:
+            continue
+
+        else:
+            converted_state_dict[
+                k.replace(".norms.0", ".norm1")
+                .replace(".norms.1", ".norm2")
+                .replace(".ff_norm", ".norm3")
+                .replace(".attention_blocks.0", ".attn1")
+                .replace(".attention_blocks.1", ".attn2")
+                .replace(".temporal_transformer", "")
+            ] = v
+
+    return converted_state_dict
+
+
+def convert_flux_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
+    converted_state_dict = {}
+    keys = list(checkpoint.keys())
+    for k in keys:
+        if "model.diffusion_model." in k:
+            checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k)
+
+    num_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "double_blocks." in k))[-1] + 1  # noqa: C401
+    num_single_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "single_blocks." in k))[-1] + 1  # noqa: C401
+    mlp_ratio = 4.0
+    inner_dim = 3072
+
+    # in SD3 original implementation of AdaLayerNormContinuous, it split linear projection output into shift, scale;
+    # while in diffusers it split into scale, shift. Here we swap the linear projection weights in order to be able to use diffusers implementation
+    def swap_scale_shift(weight):
+        shift, scale = weight.chunk(2, dim=0)
+        new_weight = torch.cat([scale, shift], dim=0)
+        return new_weight
+
+    ## time_text_embed.timestep_embedder <-  time_in
+    converted_state_dict["time_text_embed.timestep_embedder.linear_1.weight"] = checkpoint.pop(
+        "time_in.in_layer.weight"
+    )
+    converted_state_dict["time_text_embed.timestep_embedder.linear_1.bias"] = checkpoint.pop("time_in.in_layer.bias")
+    converted_state_dict["time_text_embed.timestep_embedder.linear_2.weight"] = checkpoint.pop(
+        "time_in.out_layer.weight"
+    )
+    converted_state_dict["time_text_embed.timestep_embedder.linear_2.bias"] = checkpoint.pop("time_in.out_layer.bias")
+
+    ## time_text_embed.text_embedder <- vector_in
+    converted_state_dict["time_text_embed.text_embedder.linear_1.weight"] = checkpoint.pop("vector_in.in_layer.weight")
+    converted_state_dict["time_text_embed.text_embedder.linear_1.bias"] = checkpoint.pop("vector_in.in_layer.bias")
+    converted_state_dict["time_text_embed.text_embedder.linear_2.weight"] = checkpoint.pop(
+        "vector_in.out_layer.weight"
+    )
+    converted_state_dict["time_text_embed.text_embedder.linear_2.bias"] = checkpoint.pop("vector_in.out_layer.bias")
+
+    # guidance
+    has_guidance = any("guidance" in k for k in checkpoint)
+    if has_guidance:
+        converted_state_dict["time_text_embed.guidance_embedder.linear_1.weight"] = checkpoint.pop(
+            "guidance_in.in_layer.weight"
+        )
+        converted_state_dict["time_text_embed.guidance_embedder.linear_1.bias"] = checkpoint.pop(
+            "guidance_in.in_layer.bias"
+        )
+        converted_state_dict["time_text_embed.guidance_embedder.linear_2.weight"] = checkpoint.pop(
+            "guidance_in.out_layer.weight"
+        )
+        converted_state_dict["time_text_embed.guidance_embedder.linear_2.bias"] = checkpoint.pop(
+            "guidance_in.out_layer.bias"
+        )
+
+    # context_embedder
+    converted_state_dict["context_embedder.weight"] = checkpoint.pop("txt_in.weight")
+    converted_state_dict["context_embedder.bias"] = checkpoint.pop("txt_in.bias")
+
+    # x_embedder
+    converted_state_dict["x_embedder.weight"] = checkpoint.pop("img_in.weight")
+    converted_state_dict["x_embedder.bias"] = checkpoint.pop("img_in.bias")
+
+    # double transformer blocks
+    for i in range(num_layers):
+        block_prefix = f"transformer_blocks.{i}."
+        # norms.
+        ## norm1
+        converted_state_dict[f"{block_prefix}norm1.linear.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_mod.lin.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm1.linear.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.img_mod.lin.bias"
+        )
+        ## norm1_context
+        converted_state_dict[f"{block_prefix}norm1_context.linear.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mod.lin.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm1_context.linear.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mod.lin.bias"
+        )
+        # Q, K, V
+        sample_q, sample_k, sample_v = torch.chunk(checkpoint.pop(f"double_blocks.{i}.img_attn.qkv.weight"), 3, dim=0)
+        context_q, context_k, context_v = torch.chunk(
+            checkpoint.pop(f"double_blocks.{i}.txt_attn.qkv.weight"), 3, dim=0
+        )
+        sample_q_bias, sample_k_bias, sample_v_bias = torch.chunk(
+            checkpoint.pop(f"double_blocks.{i}.img_attn.qkv.bias"), 3, dim=0
+        )
+        context_q_bias, context_k_bias, context_v_bias = torch.chunk(
+            checkpoint.pop(f"double_blocks.{i}.txt_attn.qkv.bias"), 3, dim=0
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([sample_q])
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([sample_q_bias])
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([sample_k])
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([sample_k_bias])
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([sample_v])
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([sample_v_bias])
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.weight"] = torch.cat([context_q])
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.bias"] = torch.cat([context_q_bias])
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.weight"] = torch.cat([context_k])
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.bias"] = torch.cat([context_k_bias])
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.weight"] = torch.cat([context_v])
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.bias"] = torch.cat([context_v_bias])
+        # qk_norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.norm.key_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_q.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_k.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.norm.key_norm.scale"
+        )
+        # ff img_mlp
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_mlp.0.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.bias"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.0.bias")
+        converted_state_dict[f"{block_prefix}ff.net.2.weight"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.2.weight")
+        converted_state_dict[f"{block_prefix}ff.net.2.bias"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.2.bias")
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.0.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.0.bias"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.2.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.2.bias"
+        )
+        # output projections.
+        converted_state_dict[f"{block_prefix}attn.to_out.0.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_out.0.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.proj.bias"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.proj.bias"
+        )
+
+    # single transfomer blocks
+    for i in range(num_single_layers):
+        block_prefix = f"single_transformer_blocks.{i}."
+        # norm.linear  <- single_blocks.0.modulation.lin
+        converted_state_dict[f"{block_prefix}norm.linear.weight"] = checkpoint.pop(
+            f"single_blocks.{i}.modulation.lin.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm.linear.bias"] = checkpoint.pop(
+            f"single_blocks.{i}.modulation.lin.bias"
+        )
+        # Q, K, V, mlp
+        mlp_hidden_dim = int(inner_dim * mlp_ratio)
+        split_size = (inner_dim, inner_dim, inner_dim, mlp_hidden_dim)
+        q, k, v, mlp = torch.split(checkpoint.pop(f"single_blocks.{i}.linear1.weight"), split_size, dim=0)
+        q_bias, k_bias, v_bias, mlp_bias = torch.split(
+            checkpoint.pop(f"single_blocks.{i}.linear1.bias"), split_size, dim=0
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([q])
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([q_bias])
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([k])
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([k_bias])
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([v])
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([v_bias])
+        converted_state_dict[f"{block_prefix}proj_mlp.weight"] = torch.cat([mlp])
+        converted_state_dict[f"{block_prefix}proj_mlp.bias"] = torch.cat([mlp_bias])
+        # qk norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = checkpoint.pop(
+            f"single_blocks.{i}.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = checkpoint.pop(
+            f"single_blocks.{i}.norm.key_norm.scale"
+        )
+        # output projections.
+        converted_state_dict[f"{block_prefix}proj_out.weight"] = checkpoint.pop(f"single_blocks.{i}.linear2.weight")
+        converted_state_dict[f"{block_prefix}proj_out.bias"] = checkpoint.pop(f"single_blocks.{i}.linear2.bias")
+
+    converted_state_dict["proj_out.weight"] = checkpoint.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = checkpoint.pop("final_layer.linear.bias")
+    converted_state_dict["norm_out.linear.weight"] = swap_scale_shift(
+        checkpoint.pop("final_layer.adaLN_modulation.1.weight")
+    )
+    converted_state_dict["norm_out.linear.bias"] = swap_scale_shift(
+        checkpoint.pop("final_layer.adaLN_modulation.1.bias")
+    )
+
+    return converted_state_dict
diff --git a/diffusers/src/diffusers/loaders/textual_inversion.py b/diffusers/src/diffusers/loaders/textual_inversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..574b89233cc15cb15f0fad56de74fdc29bd77fba
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/textual_inversion.py
@@ -0,0 +1,578 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, List, Optional, Union
+
+import safetensors
+import torch
+from huggingface_hub.utils import validate_hf_hub_args
+from torch import nn
+
+from ..models.modeling_utils import load_state_dict
+from ..utils import _get_model_file, is_accelerate_available, is_transformers_available, logging
+
+
+if is_transformers_available():
+    from transformers import PreTrainedModel, PreTrainedTokenizer
+
+if is_accelerate_available():
+    from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module
+
+logger = logging.get_logger(__name__)
+
+TEXT_INVERSION_NAME = "learned_embeds.bin"
+TEXT_INVERSION_NAME_SAFE = "learned_embeds.safetensors"
+
+
+@validate_hf_hub_args
+def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs):
+    cache_dir = kwargs.pop("cache_dir", None)
+    force_download = kwargs.pop("force_download", False)
+    proxies = kwargs.pop("proxies", None)
+    local_files_only = kwargs.pop("local_files_only", None)
+    token = kwargs.pop("token", None)
+    revision = kwargs.pop("revision", None)
+    subfolder = kwargs.pop("subfolder", None)
+    weight_name = kwargs.pop("weight_name", None)
+    use_safetensors = kwargs.pop("use_safetensors", None)
+
+    allow_pickle = False
+    if use_safetensors is None:
+        use_safetensors = True
+        allow_pickle = True
+
+    user_agent = {
+        "file_type": "text_inversion",
+        "framework": "pytorch",
+    }
+    state_dicts = []
+    for pretrained_model_name_or_path in pretrained_model_name_or_paths:
+        if not isinstance(pretrained_model_name_or_path, (dict, torch.Tensor)):
+            # 3.1. Load textual inversion file
+            model_file = None
+
+            # Let's first try to load .safetensors weights
+            if (use_safetensors and weight_name is None) or (
+                weight_name is not None and weight_name.endswith(".safetensors")
+            ):
+                try:
+                    model_file = _get_model_file(
+                        pretrained_model_name_or_path,
+                        weights_name=weight_name or TEXT_INVERSION_NAME_SAFE,
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        proxies=proxies,
+                        local_files_only=local_files_only,
+                        token=token,
+                        revision=revision,
+                        subfolder=subfolder,
+                        user_agent=user_agent,
+                    )
+                    state_dict = safetensors.torch.load_file(model_file, device="cpu")
+                except Exception as e:
+                    if not allow_pickle:
+                        raise e
+
+                    model_file = None
+
+            if model_file is None:
+                model_file = _get_model_file(
+                    pretrained_model_name_or_path,
+                    weights_name=weight_name or TEXT_INVERSION_NAME,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                    user_agent=user_agent,
+                )
+                state_dict = load_state_dict(model_file)
+        else:
+            state_dict = pretrained_model_name_or_path
+
+        state_dicts.append(state_dict)
+
+    return state_dicts
+
+
+class TextualInversionLoaderMixin:
+    r"""
+    Load Textual Inversion tokens and embeddings to the tokenizer and text encoder.
+    """
+
+    def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "PreTrainedTokenizer"):  # noqa: F821
+        r"""
+        Processes prompts that include a special token corresponding to a multi-vector textual inversion embedding to
+        be replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual
+        inversion token or if the textual inversion token is a single vector, the input prompt is returned.
+
+        Parameters:
+            prompt (`str` or list of `str`):
+                The prompt or prompts to guide the image generation.
+            tokenizer (`PreTrainedTokenizer`):
+                The tokenizer responsible for encoding the prompt into input tokens.
+
+        Returns:
+            `str` or list of `str`: The converted prompt
+        """
+        if not isinstance(prompt, List):
+            prompts = [prompt]
+        else:
+            prompts = prompt
+
+        prompts = [self._maybe_convert_prompt(p, tokenizer) for p in prompts]
+
+        if not isinstance(prompt, List):
+            return prompts[0]
+
+        return prompts
+
+    def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"):  # noqa: F821
+        r"""
+        Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds
+        to a multi-vector textual inversion embedding, this function will process the prompt so that the special token
+        is replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual
+        inversion token or a textual inversion token that is a single vector, the input prompt is simply returned.
+
+        Parameters:
+            prompt (`str`):
+                The prompt to guide the image generation.
+            tokenizer (`PreTrainedTokenizer`):
+                The tokenizer responsible for encoding the prompt into input tokens.
+
+        Returns:
+            `str`: The converted prompt
+        """
+        tokens = tokenizer.tokenize(prompt)
+        unique_tokens = set(tokens)
+        for token in unique_tokens:
+            if token in tokenizer.added_tokens_encoder:
+                replacement = token
+                i = 1
+                while f"{token}_{i}" in tokenizer.added_tokens_encoder:
+                    replacement += f" {token}_{i}"
+                    i += 1
+
+                prompt = prompt.replace(token, replacement)
+
+        return prompt
+
+    def _check_text_inv_inputs(self, tokenizer, text_encoder, pretrained_model_name_or_paths, tokens):
+        if tokenizer is None:
+            raise ValueError(
+                f"{self.__class__.__name__} requires `self.tokenizer` or passing a `tokenizer` of type `PreTrainedTokenizer` for calling"
+                f" `{self.load_textual_inversion.__name__}`"
+            )
+
+        if text_encoder is None:
+            raise ValueError(
+                f"{self.__class__.__name__} requires `self.text_encoder` or passing a `text_encoder` of type `PreTrainedModel` for calling"
+                f" `{self.load_textual_inversion.__name__}`"
+            )
+
+        if len(pretrained_model_name_or_paths) > 1 and len(pretrained_model_name_or_paths) != len(tokens):
+            raise ValueError(
+                f"You have passed a list of models of length {len(pretrained_model_name_or_paths)}, and list of tokens of length {len(tokens)} "
+                f"Make sure both lists have the same length."
+            )
+
+        valid_tokens = [t for t in tokens if t is not None]
+        if len(set(valid_tokens)) < len(valid_tokens):
+            raise ValueError(f"You have passed a list of tokens that contains duplicates: {tokens}")
+
+    @staticmethod
+    def _retrieve_tokens_and_embeddings(tokens, state_dicts, tokenizer):
+        all_tokens = []
+        all_embeddings = []
+        for state_dict, token in zip(state_dicts, tokens):
+            if isinstance(state_dict, torch.Tensor):
+                if token is None:
+                    raise ValueError(
+                        "You are trying to load a textual inversion embedding that has been saved as a PyTorch tensor. Make sure to pass the name of the corresponding token in this case: `token=...`."
+                    )
+                loaded_token = token
+                embedding = state_dict
+            elif len(state_dict) == 1:
+                # diffusers
+                loaded_token, embedding = next(iter(state_dict.items()))
+            elif "string_to_param" in state_dict:
+                # A1111
+                loaded_token = state_dict["name"]
+                embedding = state_dict["string_to_param"]["*"]
+            else:
+                raise ValueError(
+                    f"Loaded state dictionary is incorrect: {state_dict}. \n\n"
+                    "Please verify that the loaded state dictionary of the textual embedding either only has a single key or includes the `string_to_param`"
+                    " input key."
+                )
+
+            if token is not None and loaded_token != token:
+                logger.info(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.")
+            else:
+                token = loaded_token
+
+            if token in tokenizer.get_vocab():
+                raise ValueError(
+                    f"Token {token} already in tokenizer vocabulary. Please choose a different token name or remove {token} and embedding from the tokenizer and text encoder."
+                )
+
+            all_tokens.append(token)
+            all_embeddings.append(embedding)
+
+        return all_tokens, all_embeddings
+
+    @staticmethod
+    def _extend_tokens_and_embeddings(tokens, embeddings, tokenizer):
+        all_tokens = []
+        all_embeddings = []
+
+        for embedding, token in zip(embeddings, tokens):
+            if f"{token}_1" in tokenizer.get_vocab():
+                multi_vector_tokens = [token]
+                i = 1
+                while f"{token}_{i}" in tokenizer.added_tokens_encoder:
+                    multi_vector_tokens.append(f"{token}_{i}")
+                    i += 1
+
+                raise ValueError(
+                    f"Multi-vector Token {multi_vector_tokens} already in tokenizer vocabulary. Please choose a different token name or remove the {multi_vector_tokens} and embedding from the tokenizer and text encoder."
+                )
+
+            is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1
+            if is_multi_vector:
+                all_tokens += [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])]
+                all_embeddings += [e for e in embedding]  # noqa: C416
+            else:
+                all_tokens += [token]
+                all_embeddings += [embedding[0]] if len(embedding.shape) > 1 else [embedding]
+
+        return all_tokens, all_embeddings
+
+    @validate_hf_hub_args
+    def load_textual_inversion(
+        self,
+        pretrained_model_name_or_path: Union[str, List[str], Dict[str, torch.Tensor], List[Dict[str, torch.Tensor]]],
+        token: Optional[Union[str, List[str]]] = None,
+        tokenizer: Optional["PreTrainedTokenizer"] = None,  # noqa: F821
+        text_encoder: Optional["PreTrainedModel"] = None,  # noqa: F821
+        **kwargs,
+    ):
+        r"""
+        Load Textual Inversion embeddings into the text encoder of [`StableDiffusionPipeline`] (both 🤗 Diffusers and
+        Automatic1111 formats are supported).
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike` or `List[str or os.PathLike]` or `Dict` or `List[Dict]`):
+                Can be either one of the following or a list of them:
+
+                    - A string, the *model id* (for example `sd-concepts-library/low-poly-hd-logos-icons`) of a
+                      pretrained model hosted on the Hub.
+                    - A path to a *directory* (for example `./my_text_inversion_directory/`) containing the textual
+                      inversion weights.
+                    - A path to a *file* (for example `./my_text_inversions.pt`) containing textual inversion weights.
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+
+            token (`str` or `List[str]`, *optional*):
+                Override the token to use for the textual inversion weights. If `pretrained_model_name_or_path` is a
+                list, then `token` must also be a list of equal length.
+            text_encoder ([`~transformers.CLIPTextModel`], *optional*):
+                Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+                If not specified, function will take self.tokenizer.
+            tokenizer ([`~transformers.CLIPTokenizer`], *optional*):
+                A `CLIPTokenizer` to tokenize text. If not specified, function will take self.tokenizer.
+            weight_name (`str`, *optional*):
+                Name of a custom weight file. This should be used when:
+
+                    - The saved textual inversion file is in 🤗 Diffusers format, but was saved under a specific weight
+                      name such as `text_inv.bin`.
+                    - The saved textual inversion file is in the Automatic1111 format.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            mirror (`str`, *optional*):
+                Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
+
+        Example:
+
+        To load a Textual Inversion embedding vector in 🤗 Diffusers format:
+
+        ```py
+        from diffusers import StableDiffusionPipeline
+        import torch
+
+        model_id = "runwayml/stable-diffusion-v1-5"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+        pipe.load_textual_inversion("sd-concepts-library/cat-toy")
+
+        prompt = "A <cat-toy> backpack"
+
+        image = pipe(prompt, num_inference_steps=50).images[0]
+        image.save("cat-backpack.png")
+        ```
+
+        To load a Textual Inversion embedding vector in Automatic1111 format, make sure to download the vector first
+        (for example from [civitAI](https://civitai.com/models/3036?modelVersionId=9857)) and then load the vector
+        locally:
+
+        ```py
+        from diffusers import StableDiffusionPipeline
+        import torch
+
+        model_id = "runwayml/stable-diffusion-v1-5"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+        pipe.load_textual_inversion("./charturnerv2.pt", token="charturnerv2")
+
+        prompt = "charturnerv2, multiple views of the same character in the same outfit, a character turnaround of a woman wearing a black jacket and red shirt, best quality, intricate details."
+
+        image = pipe(prompt, num_inference_steps=50).images[0]
+        image.save("character.png")
+        ```
+
+        """
+        # 1. Set correct tokenizer and text encoder
+        tokenizer = tokenizer or getattr(self, "tokenizer", None)
+        text_encoder = text_encoder or getattr(self, "text_encoder", None)
+
+        # 2. Normalize inputs
+        pretrained_model_name_or_paths = (
+            [pretrained_model_name_or_path]
+            if not isinstance(pretrained_model_name_or_path, list)
+            else pretrained_model_name_or_path
+        )
+        tokens = [token] if not isinstance(token, list) else token
+        if tokens[0] is None:
+            tokens = tokens * len(pretrained_model_name_or_paths)
+
+        # 3. Check inputs
+        self._check_text_inv_inputs(tokenizer, text_encoder, pretrained_model_name_or_paths, tokens)
+
+        # 4. Load state dicts of textual embeddings
+        state_dicts = load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs)
+
+        # 4.1 Handle the special case when state_dict is a tensor that contains n embeddings for n tokens
+        if len(tokens) > 1 and len(state_dicts) == 1:
+            if isinstance(state_dicts[0], torch.Tensor):
+                state_dicts = list(state_dicts[0])
+                if len(tokens) != len(state_dicts):
+                    raise ValueError(
+                        f"You have passed a state_dict contains {len(state_dicts)} embeddings, and list of tokens of length {len(tokens)} "
+                        f"Make sure both have the same length."
+                    )
+
+        # 4. Retrieve tokens and embeddings
+        tokens, embeddings = self._retrieve_tokens_and_embeddings(tokens, state_dicts, tokenizer)
+
+        # 5. Extend tokens and embeddings for multi vector
+        tokens, embeddings = self._extend_tokens_and_embeddings(tokens, embeddings, tokenizer)
+
+        # 6. Make sure all embeddings have the correct size
+        expected_emb_dim = text_encoder.get_input_embeddings().weight.shape[-1]
+        if any(expected_emb_dim != emb.shape[-1] for emb in embeddings):
+            raise ValueError(
+                "Loaded embeddings are of incorrect shape. Expected each textual inversion embedding "
+                "to be of shape {input_embeddings.shape[-1]}, but are {embeddings.shape[-1]} "
+            )
+
+        # 7. Now we can be sure that loading the embedding matrix works
+        # < Unsafe code:
+
+        # 7.1 Offload all hooks in case the pipeline was cpu offloaded before make sure, we offload and onload again
+        is_model_cpu_offload = False
+        is_sequential_cpu_offload = False
+        if self.hf_device_map is None:
+            for _, component in self.components.items():
+                if isinstance(component, nn.Module):
+                    if hasattr(component, "_hf_hook"):
+                        is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
+                        is_sequential_cpu_offload = (
+                            isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                            or hasattr(component._hf_hook, "hooks")
+                            and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+                        )
+                        logger.info(
+                            "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
+                        )
+                        remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
+
+        # 7.2 save expected device and dtype
+        device = text_encoder.device
+        dtype = text_encoder.dtype
+
+        # 7.3 Increase token embedding matrix
+        text_encoder.resize_token_embeddings(len(tokenizer) + len(tokens))
+        input_embeddings = text_encoder.get_input_embeddings().weight
+
+        # 7.4 Load token and embedding
+        for token, embedding in zip(tokens, embeddings):
+            # add tokens and get ids
+            tokenizer.add_tokens(token)
+            token_id = tokenizer.convert_tokens_to_ids(token)
+            input_embeddings.data[token_id] = embedding
+            logger.info(f"Loaded textual inversion embedding for {token}.")
+
+        input_embeddings.to(dtype=dtype, device=device)
+
+        # 7.5 Offload the model again
+        if is_model_cpu_offload:
+            self.enable_model_cpu_offload()
+        elif is_sequential_cpu_offload:
+            self.enable_sequential_cpu_offload()
+
+        # / Unsafe Code >
+
+    def unload_textual_inversion(
+        self,
+        tokens: Optional[Union[str, List[str]]] = None,
+        tokenizer: Optional["PreTrainedTokenizer"] = None,
+        text_encoder: Optional["PreTrainedModel"] = None,
+    ):
+        r"""
+        Unload Textual Inversion embeddings from the text encoder of [`StableDiffusionPipeline`]
+
+        Example:
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+
+        pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5")
+
+        # Example 1
+        pipeline.load_textual_inversion("sd-concepts-library/gta5-artwork")
+        pipeline.load_textual_inversion("sd-concepts-library/moeb-style")
+
+        # Remove all token embeddings
+        pipeline.unload_textual_inversion()
+
+        # Example 2
+        pipeline.load_textual_inversion("sd-concepts-library/moeb-style")
+        pipeline.load_textual_inversion("sd-concepts-library/gta5-artwork")
+
+        # Remove just one token
+        pipeline.unload_textual_inversion("<moe-bius>")
+
+        # Example 3: unload from SDXL
+        pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        embedding_path = hf_hub_download(
+            repo_id="linoyts/web_y2k", filename="web_y2k_emb.safetensors", repo_type="model"
+        )
+
+        # load embeddings to the text encoders
+        state_dict = load_file(embedding_path)
+
+        # load embeddings of text_encoder 1 (CLIP ViT-L/14)
+        pipeline.load_textual_inversion(
+            state_dict["clip_l"],
+            token=["<s0>", "<s1>"],
+            text_encoder=pipeline.text_encoder,
+            tokenizer=pipeline.tokenizer,
+        )
+        # load embeddings of text_encoder 2 (CLIP ViT-G/14)
+        pipeline.load_textual_inversion(
+            state_dict["clip_g"],
+            token=["<s0>", "<s1>"],
+            text_encoder=pipeline.text_encoder_2,
+            tokenizer=pipeline.tokenizer_2,
+        )
+
+        # Unload explicitly from both text encoders abd tokenizers
+        pipeline.unload_textual_inversion(
+            tokens=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer
+        )
+        pipeline.unload_textual_inversion(
+            tokens=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2
+        )
+        ```
+        """
+
+        tokenizer = tokenizer or getattr(self, "tokenizer", None)
+        text_encoder = text_encoder or getattr(self, "text_encoder", None)
+
+        # Get textual inversion tokens and ids
+        token_ids = []
+        last_special_token_id = None
+
+        if tokens:
+            if isinstance(tokens, str):
+                tokens = [tokens]
+            for added_token_id, added_token in tokenizer.added_tokens_decoder.items():
+                if not added_token.special:
+                    if added_token.content in tokens:
+                        token_ids.append(added_token_id)
+                else:
+                    last_special_token_id = added_token_id
+            if len(token_ids) == 0:
+                raise ValueError("No tokens to remove found")
+        else:
+            tokens = []
+            for added_token_id, added_token in tokenizer.added_tokens_decoder.items():
+                if not added_token.special:
+                    token_ids.append(added_token_id)
+                    tokens.append(added_token.content)
+                else:
+                    last_special_token_id = added_token_id
+
+        # Delete from tokenizer
+        for token_id, token_to_remove in zip(token_ids, tokens):
+            del tokenizer._added_tokens_decoder[token_id]
+            del tokenizer._added_tokens_encoder[token_to_remove]
+
+        # Make all token ids sequential in tokenizer
+        key_id = 1
+        for token_id in tokenizer.added_tokens_decoder:
+            if token_id > last_special_token_id and token_id > last_special_token_id + key_id:
+                token = tokenizer._added_tokens_decoder[token_id]
+                tokenizer._added_tokens_decoder[last_special_token_id + key_id] = token
+                del tokenizer._added_tokens_decoder[token_id]
+                tokenizer._added_tokens_encoder[token.content] = last_special_token_id + key_id
+                key_id += 1
+        tokenizer._update_trie()
+
+        # Delete from text encoder
+        text_embedding_dim = text_encoder.get_input_embeddings().embedding_dim
+        temp_text_embedding_weights = text_encoder.get_input_embeddings().weight
+        text_embedding_weights = temp_text_embedding_weights[: last_special_token_id + 1]
+        to_append = []
+        for i in range(last_special_token_id + 1, temp_text_embedding_weights.shape[0]):
+            if i not in token_ids:
+                to_append.append(temp_text_embedding_weights[i].unsqueeze(0))
+        if len(to_append) > 0:
+            to_append = torch.cat(to_append, dim=0)
+            text_embedding_weights = torch.cat([text_embedding_weights, to_append], dim=0)
+        text_embeddings_filtered = nn.Embedding(text_embedding_weights.shape[0], text_embedding_dim)
+        text_embeddings_filtered.weight.data = text_embedding_weights
+        text_encoder.set_input_embeddings(text_embeddings_filtered)
diff --git a/diffusers/src/diffusers/loaders/unet.py b/diffusers/src/diffusers/loaders/unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ace77b62246938ea263cc28f70593bcff51205
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/unet.py
@@ -0,0 +1,906 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from collections import defaultdict
+from contextlib import nullcontext
+from pathlib import Path
+from typing import Callable, Dict, Union
+
+import safetensors
+import torch
+import torch.nn.functional as F
+from huggingface_hub.utils import validate_hf_hub_args
+from torch import nn
+
+from ..models.embeddings import (
+    ImageProjection,
+    IPAdapterFaceIDImageProjection,
+    IPAdapterFaceIDPlusImageProjection,
+    IPAdapterFullImageProjection,
+    IPAdapterPlusImageProjection,
+    MultiIPAdapterImageProjection,
+)
+from ..models.modeling_utils import load_model_dict_into_meta, load_state_dict
+from ..utils import (
+    USE_PEFT_BACKEND,
+    _get_model_file,
+    convert_unet_state_dict_to_peft,
+    get_adapter_name,
+    get_peft_kwargs,
+    is_accelerate_available,
+    is_peft_version,
+    is_torch_version,
+    logging,
+)
+from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE, TEXT_ENCODER_NAME, UNET_NAME
+from .utils import AttnProcsLayers
+
+
+if is_accelerate_available():
+    from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module
+
+logger = logging.get_logger(__name__)
+
+
+CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin"
+CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors"
+
+
+class UNet2DConditionLoadersMixin:
+    """
+    Load LoRA layers into a [`UNet2DCondtionModel`].
+    """
+
+    text_encoder_name = TEXT_ENCODER_NAME
+    unet_name = UNET_NAME
+
+    @validate_hf_hub_args
+    def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
+        r"""
+        Load pretrained attention processor layers into [`UNet2DConditionModel`]. Attention processor layers have to be
+        defined in
+        [`attention_processor.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py)
+        and be a `torch.nn.Module` class. Currently supported: LoRA, Custom Diffusion. For LoRA, one must install
+        `peft`: `pip install -U peft`.
+
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                Can be either:
+
+                    - A string, the model id (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a directory (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            network_alphas (`Dict[str, float]`):
+                The value of the network alpha used for stable learning and preventing underflow. This value has the
+                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
+            adapter_name (`str`, *optional*, defaults to None):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+            weight_name (`str`, *optional*, defaults to None):
+                Name of the serialized state dict file.
+
+        Example:
+
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.unet.load_attn_procs(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        ```
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+        adapter_name = kwargs.pop("adapter_name", None)
+        _pipeline = kwargs.pop("_pipeline", None)
+        network_alphas = kwargs.pop("network_alphas", None)
+        allow_pickle = False
+
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+
+        model_file = None
+        if not isinstance(pretrained_model_name_or_path_or_dict, dict):
+            # Let's first try to load .safetensors weights
+            if (use_safetensors and weight_name is None) or (
+                weight_name is not None and weight_name.endswith(".safetensors")
+            ):
+                try:
+                    model_file = _get_model_file(
+                        pretrained_model_name_or_path_or_dict,
+                        weights_name=weight_name or LORA_WEIGHT_NAME_SAFE,
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        proxies=proxies,
+                        local_files_only=local_files_only,
+                        token=token,
+                        revision=revision,
+                        subfolder=subfolder,
+                        user_agent=user_agent,
+                    )
+                    state_dict = safetensors.torch.load_file(model_file, device="cpu")
+                except IOError as e:
+                    if not allow_pickle:
+                        raise e
+                    # try loading non-safetensors weights
+                    pass
+            if model_file is None:
+                model_file = _get_model_file(
+                    pretrained_model_name_or_path_or_dict,
+                    weights_name=weight_name or LORA_WEIGHT_NAME,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                    user_agent=user_agent,
+                )
+                state_dict = load_state_dict(model_file)
+        else:
+            state_dict = pretrained_model_name_or_path_or_dict
+
+        is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys())
+        is_lora = all(("lora" in k or k.endswith(".alpha")) for k in state_dict.keys())
+        is_model_cpu_offload = False
+        is_sequential_cpu_offload = False
+
+        if is_custom_diffusion:
+            attn_processors = self._process_custom_diffusion(state_dict=state_dict)
+        elif is_lora:
+            is_model_cpu_offload, is_sequential_cpu_offload = self._process_lora(
+                state_dict=state_dict,
+                unet_identifier_key=self.unet_name,
+                network_alphas=network_alphas,
+                adapter_name=adapter_name,
+                _pipeline=_pipeline,
+            )
+        else:
+            raise ValueError(
+                f"{model_file} does not seem to be in the correct format expected by Custom Diffusion training."
+            )
+
+        # <Unsafe code
+        # We can be sure that the following works as it just sets attention processors, lora layers and puts all in the same dtype
+        # Now we remove any existing hooks to `_pipeline`.
+
+        # For LoRA, the UNet is already offloaded at this stage as it is handled inside `_process_lora`.
+        if is_custom_diffusion and _pipeline is not None:
+            is_model_cpu_offload, is_sequential_cpu_offload = self._optionally_disable_offloading(_pipeline=_pipeline)
+
+            # only custom diffusion needs to set attn processors
+            self.set_attn_processor(attn_processors)
+            self.to(dtype=self.dtype, device=self.device)
+
+        # Offload back.
+        if is_model_cpu_offload:
+            _pipeline.enable_model_cpu_offload()
+        elif is_sequential_cpu_offload:
+            _pipeline.enable_sequential_cpu_offload()
+        # Unsafe code />
+
+    def _process_custom_diffusion(self, state_dict):
+        from ..models.attention_processor import CustomDiffusionAttnProcessor
+
+        attn_processors = {}
+        custom_diffusion_grouped_dict = defaultdict(dict)
+        for key, value in state_dict.items():
+            if len(value) == 0:
+                custom_diffusion_grouped_dict[key] = {}
+            else:
+                if "to_out" in key:
+                    attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
+                else:
+                    attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:])
+                custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value
+
+        for key, value_dict in custom_diffusion_grouped_dict.items():
+            if len(value_dict) == 0:
+                attn_processors[key] = CustomDiffusionAttnProcessor(
+                    train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None
+                )
+            else:
+                cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1]
+                hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0]
+                train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False
+                attn_processors[key] = CustomDiffusionAttnProcessor(
+                    train_kv=True,
+                    train_q_out=train_q_out,
+                    hidden_size=hidden_size,
+                    cross_attention_dim=cross_attention_dim,
+                )
+                attn_processors[key].load_state_dict(value_dict)
+
+        return attn_processors
+
+    def _process_lora(self, state_dict, unet_identifier_key, network_alphas, adapter_name, _pipeline):
+        # This method does the following things:
+        # 1. Filters the `state_dict` with keys matching  `unet_identifier_key` when using the non-legacy
+        #    format. For legacy format no filtering is applied.
+        # 2. Converts the `state_dict` to the `peft` compatible format.
+        # 3. Creates a `LoraConfig` and then injects the converted `state_dict` into the UNet per the
+        #    `LoraConfig` specs.
+        # 4. It also reports if the underlying `_pipeline` has any kind of offloading inside of it.
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
+
+        keys = list(state_dict.keys())
+
+        unet_keys = [k for k in keys if k.startswith(unet_identifier_key)]
+        unet_state_dict = {
+            k.replace(f"{unet_identifier_key}.", ""): v for k, v in state_dict.items() if k in unet_keys
+        }
+
+        if network_alphas is not None:
+            alpha_keys = [k for k in network_alphas.keys() if k.startswith(unet_identifier_key)]
+            network_alphas = {
+                k.replace(f"{unet_identifier_key}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
+            }
+
+        is_model_cpu_offload = False
+        is_sequential_cpu_offload = False
+        state_dict_to_be_used = unet_state_dict if len(unet_state_dict) > 0 else state_dict
+
+        if len(state_dict_to_be_used) > 0:
+            if adapter_name in getattr(self, "peft_config", {}):
+                raise ValueError(
+                    f"Adapter name {adapter_name} already in use in the Unet - please select a new adapter name."
+                )
+
+            state_dict = convert_unet_state_dict_to_peft(state_dict_to_be_used)
+
+            if network_alphas is not None:
+                # The alphas state dict have the same structure as Unet, thus we convert it to peft format using
+                # `convert_unet_state_dict_to_peft` method.
+                network_alphas = convert_unet_state_dict_to_peft(network_alphas)
+
+            rank = {}
+            for key, val in state_dict.items():
+                if "lora_B" in key:
+                    rank[key] = val.shape[1]
+
+            lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict, is_unet=True)
+            if "use_dora" in lora_config_kwargs:
+                if lora_config_kwargs["use_dora"]:
+                    if is_peft_version("<", "0.9.0"):
+                        raise ValueError(
+                            "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                        )
+                else:
+                    if is_peft_version("<", "0.9.0"):
+                        lora_config_kwargs.pop("use_dora")
+            lora_config = LoraConfig(**lora_config_kwargs)
+
+            # adapter_name
+            if adapter_name is None:
+                adapter_name = get_adapter_name(self)
+
+            # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks
+            # otherwise loading LoRA weights will lead to an error
+            is_model_cpu_offload, is_sequential_cpu_offload = self._optionally_disable_offloading(_pipeline)
+
+            inject_adapter_in_model(lora_config, self, adapter_name=adapter_name)
+            incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name)
+
+            if incompatible_keys is not None:
+                # check only for unexpected keys
+                unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+                if unexpected_keys:
+                    logger.warning(
+                        f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                        f" {unexpected_keys}. "
+                    )
+
+        return is_model_cpu_offload, is_sequential_cpu_offload
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_base.LoraBaseMixin._optionally_disable_offloading
+    def _optionally_disable_offloading(cls, _pipeline):
+        """
+        Optionally removes offloading in case the pipeline has been already sequentially offloaded to CPU.
+
+        Args:
+            _pipeline (`DiffusionPipeline`):
+                The pipeline to disable offloading for.
+
+        Returns:
+            tuple:
+                A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True.
+        """
+        is_model_cpu_offload = False
+        is_sequential_cpu_offload = False
+
+        if _pipeline is not None and _pipeline.hf_device_map is None:
+            for _, component in _pipeline.components.items():
+                if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
+                    if not is_model_cpu_offload:
+                        is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
+                    if not is_sequential_cpu_offload:
+                        is_sequential_cpu_offload = (
+                            isinstance(component._hf_hook, AlignDevicesHook)
+                            or hasattr(component._hf_hook, "hooks")
+                            and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+                        )
+
+                    logger.info(
+                        "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
+                    )
+                    remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
+
+        return (is_model_cpu_offload, is_sequential_cpu_offload)
+
+    def save_attn_procs(
+        self,
+        save_directory: Union[str, os.PathLike],
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+        **kwargs,
+    ):
+        r"""
+        Save attention processor layers to a directory so that it can be reloaded with the
+        [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`] method.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save an attention processor to (will be created if it doesn't exist).
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or with `pickle`.
+
+        Example:
+
+        ```py
+        import torch
+        from diffusers import DiffusionPipeline
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            torch_dtype=torch.float16,
+        ).to("cuda")
+        pipeline.unet.load_attn_procs("path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin")
+        pipeline.unet.save_attn_procs("path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin")
+        ```
+        """
+        from ..models.attention_processor import (
+            CustomDiffusionAttnProcessor,
+            CustomDiffusionAttnProcessor2_0,
+            CustomDiffusionXFormersAttnProcessor,
+        )
+
+        if os.path.isfile(save_directory):
+            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+            return
+
+        is_custom_diffusion = any(
+            isinstance(
+                x,
+                (CustomDiffusionAttnProcessor, CustomDiffusionAttnProcessor2_0, CustomDiffusionXFormersAttnProcessor),
+            )
+            for (_, x) in self.attn_processors.items()
+        )
+        if is_custom_diffusion:
+            state_dict = self._get_custom_diffusion_state_dict()
+            if save_function is None and safe_serialization:
+                # safetensors does not support saving dicts with non-tensor values
+                empty_state_dict = {k: v for k, v in state_dict.items() if not isinstance(v, torch.Tensor)}
+                if len(empty_state_dict) > 0:
+                    logger.warning(
+                        f"Safetensors does not support saving dicts with non-tensor values. "
+                        f"The following keys will be ignored: {empty_state_dict.keys()}"
+                    )
+                state_dict = {k: v for k, v in state_dict.items() if isinstance(v, torch.Tensor)}
+        else:
+            if not USE_PEFT_BACKEND:
+                raise ValueError("PEFT backend is required for saving LoRAs using the `save_attn_procs()` method.")
+
+            from peft.utils import get_peft_model_state_dict
+
+            state_dict = get_peft_model_state_dict(self)
+
+        if save_function is None:
+            if safe_serialization:
+
+                def save_function(weights, filename):
+                    return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"})
+
+            else:
+                save_function = torch.save
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        if weight_name is None:
+            if safe_serialization:
+                weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else LORA_WEIGHT_NAME_SAFE
+            else:
+                weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else LORA_WEIGHT_NAME
+
+        # Save the model
+        save_path = Path(save_directory, weight_name).as_posix()
+        save_function(state_dict, save_path)
+        logger.info(f"Model weights saved in {save_path}")
+
+    def _get_custom_diffusion_state_dict(self):
+        from ..models.attention_processor import (
+            CustomDiffusionAttnProcessor,
+            CustomDiffusionAttnProcessor2_0,
+            CustomDiffusionXFormersAttnProcessor,
+        )
+
+        model_to_save = AttnProcsLayers(
+            {
+                y: x
+                for (y, x) in self.attn_processors.items()
+                if isinstance(
+                    x,
+                    (
+                        CustomDiffusionAttnProcessor,
+                        CustomDiffusionAttnProcessor2_0,
+                        CustomDiffusionXFormersAttnProcessor,
+                    ),
+                )
+            }
+        )
+        state_dict = model_to_save.state_dict()
+        for name, attn in self.attn_processors.items():
+            if len(attn.state_dict()) == 0:
+                state_dict[name] = {}
+
+        return state_dict
+
+    def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_usage=False):
+        if low_cpu_mem_usage:
+            if is_accelerate_available():
+                from accelerate import init_empty_weights
+
+            else:
+                low_cpu_mem_usage = False
+                logger.warning(
+                    "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                    " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                    " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                    " install accelerate\n```\n."
+                )
+
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+
+        updated_state_dict = {}
+        image_projection = None
+        init_context = init_empty_weights if low_cpu_mem_usage else nullcontext
+
+        if "proj.weight" in state_dict:
+            # IP-Adapter
+            num_image_text_embeds = 4
+            clip_embeddings_dim = state_dict["proj.weight"].shape[-1]
+            cross_attention_dim = state_dict["proj.weight"].shape[0] // 4
+
+            with init_context():
+                image_projection = ImageProjection(
+                    cross_attention_dim=cross_attention_dim,
+                    image_embed_dim=clip_embeddings_dim,
+                    num_image_text_embeds=num_image_text_embeds,
+                )
+
+            for key, value in state_dict.items():
+                diffusers_name = key.replace("proj", "image_embeds")
+                updated_state_dict[diffusers_name] = value
+
+        elif "proj.3.weight" in state_dict:
+            # IP-Adapter Full
+            clip_embeddings_dim = state_dict["proj.0.weight"].shape[0]
+            cross_attention_dim = state_dict["proj.3.weight"].shape[0]
+
+            with init_context():
+                image_projection = IPAdapterFullImageProjection(
+                    cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim
+                )
+
+            for key, value in state_dict.items():
+                diffusers_name = key.replace("proj.0", "ff.net.0.proj")
+                diffusers_name = diffusers_name.replace("proj.2", "ff.net.2")
+                diffusers_name = diffusers_name.replace("proj.3", "norm")
+                updated_state_dict[diffusers_name] = value
+
+        elif "perceiver_resampler.proj_in.weight" in state_dict:
+            # IP-Adapter Face ID Plus
+            id_embeddings_dim = state_dict["proj.0.weight"].shape[1]
+            embed_dims = state_dict["perceiver_resampler.proj_in.weight"].shape[0]
+            hidden_dims = state_dict["perceiver_resampler.proj_in.weight"].shape[1]
+            output_dims = state_dict["perceiver_resampler.proj_out.weight"].shape[0]
+            heads = state_dict["perceiver_resampler.layers.0.0.to_q.weight"].shape[0] // 64
+
+            with init_context():
+                image_projection = IPAdapterFaceIDPlusImageProjection(
+                    embed_dims=embed_dims,
+                    output_dims=output_dims,
+                    hidden_dims=hidden_dims,
+                    heads=heads,
+                    id_embeddings_dim=id_embeddings_dim,
+                )
+
+            for key, value in state_dict.items():
+                diffusers_name = key.replace("perceiver_resampler.", "")
+                diffusers_name = diffusers_name.replace("0.to", "attn.to")
+                diffusers_name = diffusers_name.replace("0.1.0.", "0.ff.0.")
+                diffusers_name = diffusers_name.replace("0.1.1.weight", "0.ff.1.net.0.proj.weight")
+                diffusers_name = diffusers_name.replace("0.1.3.weight", "0.ff.1.net.2.weight")
+                diffusers_name = diffusers_name.replace("1.1.0.", "1.ff.0.")
+                diffusers_name = diffusers_name.replace("1.1.1.weight", "1.ff.1.net.0.proj.weight")
+                diffusers_name = diffusers_name.replace("1.1.3.weight", "1.ff.1.net.2.weight")
+                diffusers_name = diffusers_name.replace("2.1.0.", "2.ff.0.")
+                diffusers_name = diffusers_name.replace("2.1.1.weight", "2.ff.1.net.0.proj.weight")
+                diffusers_name = diffusers_name.replace("2.1.3.weight", "2.ff.1.net.2.weight")
+                diffusers_name = diffusers_name.replace("3.1.0.", "3.ff.0.")
+                diffusers_name = diffusers_name.replace("3.1.1.weight", "3.ff.1.net.0.proj.weight")
+                diffusers_name = diffusers_name.replace("3.1.3.weight", "3.ff.1.net.2.weight")
+                diffusers_name = diffusers_name.replace("layers.0.0", "layers.0.ln0")
+                diffusers_name = diffusers_name.replace("layers.0.1", "layers.0.ln1")
+                diffusers_name = diffusers_name.replace("layers.1.0", "layers.1.ln0")
+                diffusers_name = diffusers_name.replace("layers.1.1", "layers.1.ln1")
+                diffusers_name = diffusers_name.replace("layers.2.0", "layers.2.ln0")
+                diffusers_name = diffusers_name.replace("layers.2.1", "layers.2.ln1")
+                diffusers_name = diffusers_name.replace("layers.3.0", "layers.3.ln0")
+                diffusers_name = diffusers_name.replace("layers.3.1", "layers.3.ln1")
+
+                if "norm1" in diffusers_name:
+                    updated_state_dict[diffusers_name.replace("0.norm1", "0")] = value
+                elif "norm2" in diffusers_name:
+                    updated_state_dict[diffusers_name.replace("0.norm2", "1")] = value
+                elif "to_kv" in diffusers_name:
+                    v_chunk = value.chunk(2, dim=0)
+                    updated_state_dict[diffusers_name.replace("to_kv", "to_k")] = v_chunk[0]
+                    updated_state_dict[diffusers_name.replace("to_kv", "to_v")] = v_chunk[1]
+                elif "to_out" in diffusers_name:
+                    updated_state_dict[diffusers_name.replace("to_out", "to_out.0")] = value
+                elif "proj.0.weight" == diffusers_name:
+                    updated_state_dict["proj.net.0.proj.weight"] = value
+                elif "proj.0.bias" == diffusers_name:
+                    updated_state_dict["proj.net.0.proj.bias"] = value
+                elif "proj.2.weight" == diffusers_name:
+                    updated_state_dict["proj.net.2.weight"] = value
+                elif "proj.2.bias" == diffusers_name:
+                    updated_state_dict["proj.net.2.bias"] = value
+                else:
+                    updated_state_dict[diffusers_name] = value
+
+        elif "norm.weight" in state_dict:
+            # IP-Adapter Face ID
+            id_embeddings_dim_in = state_dict["proj.0.weight"].shape[1]
+            id_embeddings_dim_out = state_dict["proj.0.weight"].shape[0]
+            multiplier = id_embeddings_dim_out // id_embeddings_dim_in
+            norm_layer = "norm.weight"
+            cross_attention_dim = state_dict[norm_layer].shape[0]
+            num_tokens = state_dict["proj.2.weight"].shape[0] // cross_attention_dim
+
+            with init_context():
+                image_projection = IPAdapterFaceIDImageProjection(
+                    cross_attention_dim=cross_attention_dim,
+                    image_embed_dim=id_embeddings_dim_in,
+                    mult=multiplier,
+                    num_tokens=num_tokens,
+                )
+
+            for key, value in state_dict.items():
+                diffusers_name = key.replace("proj.0", "ff.net.0.proj")
+                diffusers_name = diffusers_name.replace("proj.2", "ff.net.2")
+                updated_state_dict[diffusers_name] = value
+
+        else:
+            # IP-Adapter Plus
+            num_image_text_embeds = state_dict["latents"].shape[1]
+            embed_dims = state_dict["proj_in.weight"].shape[1]
+            output_dims = state_dict["proj_out.weight"].shape[0]
+            hidden_dims = state_dict["latents"].shape[2]
+            attn_key_present = any("attn" in k for k in state_dict)
+            heads = (
+                state_dict["layers.0.attn.to_q.weight"].shape[0] // 64
+                if attn_key_present
+                else state_dict["layers.0.0.to_q.weight"].shape[0] // 64
+            )
+
+            with init_context():
+                image_projection = IPAdapterPlusImageProjection(
+                    embed_dims=embed_dims,
+                    output_dims=output_dims,
+                    hidden_dims=hidden_dims,
+                    heads=heads,
+                    num_queries=num_image_text_embeds,
+                )
+
+            for key, value in state_dict.items():
+                diffusers_name = key.replace("0.to", "2.to")
+
+                diffusers_name = diffusers_name.replace("0.0.norm1", "0.ln0")
+                diffusers_name = diffusers_name.replace("0.0.norm2", "0.ln1")
+                diffusers_name = diffusers_name.replace("1.0.norm1", "1.ln0")
+                diffusers_name = diffusers_name.replace("1.0.norm2", "1.ln1")
+                diffusers_name = diffusers_name.replace("2.0.norm1", "2.ln0")
+                diffusers_name = diffusers_name.replace("2.0.norm2", "2.ln1")
+                diffusers_name = diffusers_name.replace("3.0.norm1", "3.ln0")
+                diffusers_name = diffusers_name.replace("3.0.norm2", "3.ln1")
+
+                if "to_kv" in diffusers_name:
+                    parts = diffusers_name.split(".")
+                    parts[2] = "attn"
+                    diffusers_name = ".".join(parts)
+                    v_chunk = value.chunk(2, dim=0)
+                    updated_state_dict[diffusers_name.replace("to_kv", "to_k")] = v_chunk[0]
+                    updated_state_dict[diffusers_name.replace("to_kv", "to_v")] = v_chunk[1]
+                elif "to_q" in diffusers_name:
+                    parts = diffusers_name.split(".")
+                    parts[2] = "attn"
+                    diffusers_name = ".".join(parts)
+                    updated_state_dict[diffusers_name] = value
+                elif "to_out" in diffusers_name:
+                    parts = diffusers_name.split(".")
+                    parts[2] = "attn"
+                    diffusers_name = ".".join(parts)
+                    updated_state_dict[diffusers_name.replace("to_out", "to_out.0")] = value
+                else:
+                    diffusers_name = diffusers_name.replace("0.1.0", "0.ff.0")
+                    diffusers_name = diffusers_name.replace("0.1.1", "0.ff.1.net.0.proj")
+                    diffusers_name = diffusers_name.replace("0.1.3", "0.ff.1.net.2")
+
+                    diffusers_name = diffusers_name.replace("1.1.0", "1.ff.0")
+                    diffusers_name = diffusers_name.replace("1.1.1", "1.ff.1.net.0.proj")
+                    diffusers_name = diffusers_name.replace("1.1.3", "1.ff.1.net.2")
+
+                    diffusers_name = diffusers_name.replace("2.1.0", "2.ff.0")
+                    diffusers_name = diffusers_name.replace("2.1.1", "2.ff.1.net.0.proj")
+                    diffusers_name = diffusers_name.replace("2.1.3", "2.ff.1.net.2")
+
+                    diffusers_name = diffusers_name.replace("3.1.0", "3.ff.0")
+                    diffusers_name = diffusers_name.replace("3.1.1", "3.ff.1.net.0.proj")
+                    diffusers_name = diffusers_name.replace("3.1.3", "3.ff.1.net.2")
+                    updated_state_dict[diffusers_name] = value
+
+        if not low_cpu_mem_usage:
+            image_projection.load_state_dict(updated_state_dict, strict=True)
+        else:
+            load_model_dict_into_meta(image_projection, updated_state_dict, device=self.device, dtype=self.dtype)
+
+        return image_projection
+
+    def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=False):
+        from ..models.attention_processor import (
+            IPAdapterAttnProcessor,
+            IPAdapterAttnProcessor2_0,
+        )
+
+        if low_cpu_mem_usage:
+            if is_accelerate_available():
+                from accelerate import init_empty_weights
+
+            else:
+                low_cpu_mem_usage = False
+                logger.warning(
+                    "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                    " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                    " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                    " install accelerate\n```\n."
+                )
+
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+
+        # set ip-adapter cross-attention processors & load state_dict
+        attn_procs = {}
+        key_id = 1
+        init_context = init_empty_weights if low_cpu_mem_usage else nullcontext
+        for name in self.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else self.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = self.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(self.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = self.config.block_out_channels[block_id]
+
+            if cross_attention_dim is None or "motion_modules" in name:
+                attn_processor_class = self.attn_processors[name].__class__
+                attn_procs[name] = attn_processor_class()
+
+            else:
+                attn_processor_class = (
+                    IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
+                )
+                num_image_text_embeds = []
+                for state_dict in state_dicts:
+                    if "proj.weight" in state_dict["image_proj"]:
+                        # IP-Adapter
+                        num_image_text_embeds += [4]
+                    elif "proj.3.weight" in state_dict["image_proj"]:
+                        # IP-Adapter Full Face
+                        num_image_text_embeds += [257]  # 256 CLIP tokens + 1 CLS token
+                    elif "perceiver_resampler.proj_in.weight" in state_dict["image_proj"]:
+                        # IP-Adapter Face ID Plus
+                        num_image_text_embeds += [4]
+                    elif "norm.weight" in state_dict["image_proj"]:
+                        # IP-Adapter Face ID
+                        num_image_text_embeds += [4]
+                    else:
+                        # IP-Adapter Plus
+                        num_image_text_embeds += [state_dict["image_proj"]["latents"].shape[1]]
+
+                with init_context():
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                        scale=1.0,
+                        num_tokens=num_image_text_embeds,
+                    )
+
+                value_dict = {}
+                for i, state_dict in enumerate(state_dicts):
+                    value_dict.update({f"to_k_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]})
+                    value_dict.update({f"to_v_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]})
+
+                if not low_cpu_mem_usage:
+                    attn_procs[name].load_state_dict(value_dict)
+                else:
+                    device = next(iter(value_dict.values())).device
+                    dtype = next(iter(value_dict.values())).dtype
+                    load_model_dict_into_meta(attn_procs[name], value_dict, device=device, dtype=dtype)
+
+                key_id += 2
+
+        return attn_procs
+
+    def _load_ip_adapter_weights(self, state_dicts, low_cpu_mem_usage=False):
+        if not isinstance(state_dicts, list):
+            state_dicts = [state_dicts]
+
+        # Kolors Unet already has a `encoder_hid_proj`
+        if (
+            self.encoder_hid_proj is not None
+            and self.config.encoder_hid_dim_type == "text_proj"
+            and not hasattr(self, "text_encoder_hid_proj")
+        ):
+            self.text_encoder_hid_proj = self.encoder_hid_proj
+
+        # Set encoder_hid_proj after loading ip_adapter weights,
+        # because `IPAdapterPlusImageProjection` also has `attn_processors`.
+        self.encoder_hid_proj = None
+
+        attn_procs = self._convert_ip_adapter_attn_to_diffusers(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage)
+        self.set_attn_processor(attn_procs)
+
+        # convert IP-Adapter Image Projection layers to diffusers
+        image_projection_layers = []
+        for state_dict in state_dicts:
+            image_projection_layer = self._convert_ip_adapter_image_proj_to_diffusers(
+                state_dict["image_proj"], low_cpu_mem_usage=low_cpu_mem_usage
+            )
+            image_projection_layers.append(image_projection_layer)
+
+        self.encoder_hid_proj = MultiIPAdapterImageProjection(image_projection_layers)
+        self.config.encoder_hid_dim_type = "ip_image_proj"
+
+        self.to(dtype=self.dtype, device=self.device)
+
+    def _load_ip_adapter_loras(self, state_dicts):
+        lora_dicts = {}
+        for key_id, name in enumerate(self.attn_processors.keys()):
+            for i, state_dict in enumerate(state_dicts):
+                if f"{key_id}.to_k_lora.down.weight" in state_dict["ip_adapter"]:
+                    if i not in lora_dicts:
+                        lora_dicts[i] = {}
+                    lora_dicts[i].update(
+                        {
+                            f"unet.{name}.to_k_lora.down.weight": state_dict["ip_adapter"][
+                                f"{key_id}.to_k_lora.down.weight"
+                            ]
+                        }
+                    )
+                    lora_dicts[i].update(
+                        {
+                            f"unet.{name}.to_q_lora.down.weight": state_dict["ip_adapter"][
+                                f"{key_id}.to_q_lora.down.weight"
+                            ]
+                        }
+                    )
+                    lora_dicts[i].update(
+                        {
+                            f"unet.{name}.to_v_lora.down.weight": state_dict["ip_adapter"][
+                                f"{key_id}.to_v_lora.down.weight"
+                            ]
+                        }
+                    )
+                    lora_dicts[i].update(
+                        {
+                            f"unet.{name}.to_out_lora.down.weight": state_dict["ip_adapter"][
+                                f"{key_id}.to_out_lora.down.weight"
+                            ]
+                        }
+                    )
+                    lora_dicts[i].update(
+                        {f"unet.{name}.to_k_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.up.weight"]}
+                    )
+                    lora_dicts[i].update(
+                        {f"unet.{name}.to_q_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.up.weight"]}
+                    )
+                    lora_dicts[i].update(
+                        {f"unet.{name}.to_v_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.up.weight"]}
+                    )
+                    lora_dicts[i].update(
+                        {
+                            f"unet.{name}.to_out_lora.up.weight": state_dict["ip_adapter"][
+                                f"{key_id}.to_out_lora.up.weight"
+                            ]
+                        }
+                    )
+        return lora_dicts
diff --git a/diffusers/src/diffusers/loaders/unet_loader_utils.py b/diffusers/src/diffusers/loaders/unet_loader_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f202ed4d44bdf0965dcf6e02efbfe26c42c8705
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/unet_loader_utils.py
@@ -0,0 +1,163 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+from typing import TYPE_CHECKING, Dict, List, Union
+
+from ..utils import logging
+
+
+if TYPE_CHECKING:
+    # import here to avoid circular imports
+    from ..models import UNet2DConditionModel
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def _translate_into_actual_layer_name(name):
+    """Translate user-friendly name (e.g. 'mid') into actual layer name (e.g. 'mid_block.attentions.0')"""
+    if name == "mid":
+        return "mid_block.attentions.0"
+
+    updown, block, attn = name.split(".")
+
+    updown = updown.replace("down", "down_blocks").replace("up", "up_blocks")
+    block = block.replace("block_", "")
+    attn = "attentions." + attn
+
+    return ".".join((updown, block, attn))
+
+
+def _maybe_expand_lora_scales(
+    unet: "UNet2DConditionModel", weight_scales: List[Union[float, Dict]], default_scale=1.0
+):
+    blocks_with_transformer = {
+        "down": [i for i, block in enumerate(unet.down_blocks) if hasattr(block, "attentions")],
+        "up": [i for i, block in enumerate(unet.up_blocks) if hasattr(block, "attentions")],
+    }
+    transformer_per_block = {"down": unet.config.layers_per_block, "up": unet.config.layers_per_block + 1}
+
+    expanded_weight_scales = [
+        _maybe_expand_lora_scales_for_one_adapter(
+            weight_for_adapter,
+            blocks_with_transformer,
+            transformer_per_block,
+            unet.state_dict(),
+            default_scale=default_scale,
+        )
+        for weight_for_adapter in weight_scales
+    ]
+
+    return expanded_weight_scales
+
+
+def _maybe_expand_lora_scales_for_one_adapter(
+    scales: Union[float, Dict],
+    blocks_with_transformer: Dict[str, int],
+    transformer_per_block: Dict[str, int],
+    state_dict: None,
+    default_scale: float = 1.0,
+):
+    """
+    Expands the inputs into a more granular dictionary. See the example below for more details.
+
+    Parameters:
+        scales (`Union[float, Dict]`):
+            Scales dict to expand.
+        blocks_with_transformer (`Dict[str, int]`):
+            Dict with keys 'up' and 'down', showing which blocks have transformer layers
+        transformer_per_block (`Dict[str, int]`):
+            Dict with keys 'up' and 'down', showing how many transformer layers each block has
+
+    E.g. turns
+    ```python
+    scales = {"down": 2, "mid": 3, "up": {"block_0": 4, "block_1": [5, 6, 7]}}
+    blocks_with_transformer = {"down": [1, 2], "up": [0, 1]}
+    transformer_per_block = {"down": 2, "up": 3}
+    ```
+    into
+    ```python
+    {
+        "down.block_1.0": 2,
+        "down.block_1.1": 2,
+        "down.block_2.0": 2,
+        "down.block_2.1": 2,
+        "mid": 3,
+        "up.block_0.0": 4,
+        "up.block_0.1": 4,
+        "up.block_0.2": 4,
+        "up.block_1.0": 5,
+        "up.block_1.1": 6,
+        "up.block_1.2": 7,
+    }
+    ```
+    """
+    if sorted(blocks_with_transformer.keys()) != ["down", "up"]:
+        raise ValueError("blocks_with_transformer needs to be a dict with keys `'down' and `'up'`")
+
+    if sorted(transformer_per_block.keys()) != ["down", "up"]:
+        raise ValueError("transformer_per_block needs to be a dict with keys `'down' and `'up'`")
+
+    if not isinstance(scales, dict):
+        # don't expand if scales is a single number
+        return scales
+
+    scales = copy.deepcopy(scales)
+
+    if "mid" not in scales:
+        scales["mid"] = default_scale
+    elif isinstance(scales["mid"], list):
+        if len(scales["mid"]) == 1:
+            scales["mid"] = scales["mid"][0]
+        else:
+            raise ValueError(f"Expected 1 scales for mid, got {len(scales['mid'])}.")
+
+    for updown in ["up", "down"]:
+        if updown not in scales:
+            scales[updown] = default_scale
+
+        # eg {"down": 1} to {"down": {"block_1": 1, "block_2": 1}}}
+        if not isinstance(scales[updown], dict):
+            scales[updown] = {f"block_{i}": copy.deepcopy(scales[updown]) for i in blocks_with_transformer[updown]}
+
+        # eg {"down": {"block_1": 1}} to {"down": {"block_1": [1, 1]}}
+        for i in blocks_with_transformer[updown]:
+            block = f"block_{i}"
+            # set not assigned blocks to default scale
+            if block not in scales[updown]:
+                scales[updown][block] = default_scale
+            if not isinstance(scales[updown][block], list):
+                scales[updown][block] = [scales[updown][block] for _ in range(transformer_per_block[updown])]
+            elif len(scales[updown][block]) == 1:
+                # a list specifying scale to each masked IP input
+                scales[updown][block] = scales[updown][block] * transformer_per_block[updown]
+            elif len(scales[updown][block]) != transformer_per_block[updown]:
+                raise ValueError(
+                    f"Expected {transformer_per_block[updown]} scales for {updown}.{block}, got {len(scales[updown][block])}."
+                )
+
+        # eg {"down": "block_1": [1, 1]}}  to {"down.block_1.0": 1, "down.block_1.1": 1}
+        for i in blocks_with_transformer[updown]:
+            block = f"block_{i}"
+            for tf_idx, value in enumerate(scales[updown][block]):
+                scales[f"{updown}.{block}.{tf_idx}"] = value
+
+        del scales[updown]
+
+    for layer in scales.keys():
+        if not any(_translate_into_actual_layer_name(layer) in module for module in state_dict.keys()):
+            raise ValueError(
+                f"Can't set lora scale for layer {layer}. It either doesn't exist in this unet or it has no attentions."
+            )
+
+    return {_translate_into_actual_layer_name(name): weight for name, weight in scales.items()}
diff --git a/diffusers/src/diffusers/loaders/utils.py b/diffusers/src/diffusers/loaders/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..142d72bf6b77edf4af72ee0d30d3d190cd4b3eef
--- /dev/null
+++ b/diffusers/src/diffusers/loaders/utils.py
@@ -0,0 +1,59 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+import torch
+
+
+class AttnProcsLayers(torch.nn.Module):
+    def __init__(self, state_dict: Dict[str, torch.Tensor]):
+        super().__init__()
+        self.layers = torch.nn.ModuleList(state_dict.values())
+        self.mapping = dict(enumerate(state_dict.keys()))
+        self.rev_mapping = {v: k for k, v in enumerate(state_dict.keys())}
+
+        # .processor for unet, .self_attn for text encoder
+        self.split_keys = [".processor", ".self_attn"]
+
+        # we add a hook to state_dict() and load_state_dict() so that the
+        # naming fits with `unet.attn_processors`
+        def map_to(module, state_dict, *args, **kwargs):
+            new_state_dict = {}
+            for key, value in state_dict.items():
+                num = int(key.split(".")[1])  # 0 is always "layers"
+                new_key = key.replace(f"layers.{num}", module.mapping[num])
+                new_state_dict[new_key] = value
+
+            return new_state_dict
+
+        def remap_key(key, state_dict):
+            for k in self.split_keys:
+                if k in key:
+                    return key.split(k)[0] + k
+
+            raise ValueError(
+                f"There seems to be a problem with the state_dict: {set(state_dict.keys())}. {key} has to have one of {self.split_keys}."
+            )
+
+        def map_from(module, state_dict, *args, **kwargs):
+            all_keys = list(state_dict.keys())
+            for key in all_keys:
+                replace_key = remap_key(key, state_dict)
+                new_key = key.replace(replace_key, f"layers.{module.rev_mapping[replace_key]}")
+                state_dict[new_key] = state_dict[key]
+                del state_dict[key]
+
+        self._register_state_dict_hook(map_to)
+        self._register_load_state_dict_pre_hook(map_from, with_module=True)
diff --git a/diffusers/src/diffusers/models/README.md b/diffusers/src/diffusers/models/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fb91f59411265660e01d8b4bcc0b99e8b8fe9d55
--- /dev/null
+++ b/diffusers/src/diffusers/models/README.md
@@ -0,0 +1,3 @@
+# Models
+
+For more detail on the models, please refer to the [docs](https://huggingface.co/docs/diffusers/api/models/overview).
\ No newline at end of file
diff --git a/diffusers/src/diffusers/models/__init__.py b/diffusers/src/diffusers/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f661ee5eddd4c4647d93a879f455f319424aba3
--- /dev/null
+++ b/diffusers/src/diffusers/models/__init__.py
@@ -0,0 +1,143 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ..utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    _LazyModule,
+    is_flax_available,
+    is_torch_available,
+)
+
+
+_import_structure = {}
+
+if is_torch_available():
+    _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"]
+    _import_structure["autoencoders.autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"]
+    _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"]
+    _import_structure["autoencoders.autoencoder_kl_cogvideox"] = ["AutoencoderKLCogVideoX"]
+    _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
+    _import_structure["autoencoders.autoencoder_oobleck"] = ["AutoencoderOobleck"]
+    _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"]
+    _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
+    _import_structure["autoencoders.vq_model"] = ["VQModel"]
+    _import_structure["controlnet"] = ["ControlNetModel"]
+    _import_structure["colorguider_sd"] = ["ColorGuiderSDModel"]
+    _import_structure["controlnet_flux"] = ["FluxControlNetModel", "FluxMultiControlNetModel"]
+    _import_structure["controlnet_hunyuan"] = ["HunyuanDiT2DControlNetModel", "HunyuanDiT2DMultiControlNetModel"]
+    _import_structure["controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"]
+    _import_structure["controlnet_sparsectrl"] = ["SparseControlNetModel"]
+    _import_structure["controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
+    _import_structure["embeddings"] = ["ImageProjection"]
+    _import_structure["modeling_utils"] = ["ModelMixin"]
+    _import_structure["transformers.auraflow_transformer_2d"] = ["AuraFlowTransformer2DModel"]
+    _import_structure["transformers.cogvideox_transformer_3d"] = ["CogVideoXTransformer3DModel"]
+    _import_structure["transformers.dit_transformer_2d"] = ["DiTTransformer2DModel"]
+    _import_structure["transformers.dual_transformer_2d"] = ["DualTransformer2DModel"]
+    _import_structure["transformers.hunyuan_transformer_2d"] = ["HunyuanDiT2DModel"]
+    _import_structure["transformers.latte_transformer_3d"] = ["LatteTransformer3DModel"]
+    _import_structure["transformers.lumina_nextdit2d"] = ["LuminaNextDiT2DModel"]
+    _import_structure["transformers.pixart_transformer_2d"] = ["PixArtTransformer2DModel"]
+    _import_structure["transformers.colorguider_pixart"] = ["ColorGuiderPixArtModel"]
+    _import_structure["transformers.causal_sparse_dit"] = ["CausalSparseDiTModel"]
+    _import_structure["transformers.causal_sparse_dit_control"] = ["CausalSparseDiTControlModel"]
+    _import_structure["transformers.prior_transformer"] = ["PriorTransformer"]
+    _import_structure["transformers.stable_audio_transformer"] = ["StableAudioDiTModel"]
+    _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
+    _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"]
+    _import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"]
+    _import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
+    _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
+    _import_structure["unets.unet_1d"] = ["UNet1DModel"]
+    _import_structure["unets.unet_2d"] = ["UNet2DModel"]
+    _import_structure["unets.unet_2d_condition"] = ["UNet2DConditionModel"]
+    _import_structure["unets.unet_3d_condition"] = ["UNet3DConditionModel"]
+    _import_structure["unets.unet_i2vgen_xl"] = ["I2VGenXLUNet"]
+    _import_structure["unets.unet_kandinsky3"] = ["Kandinsky3UNet"]
+    _import_structure["unets.unet_motion_model"] = ["MotionAdapter", "UNetMotionModel"]
+    _import_structure["unets.unet_spatio_temporal_condition"] = ["UNetSpatioTemporalConditionModel"]
+    _import_structure["unets.unet_stable_cascade"] = ["StableCascadeUNet"]
+    _import_structure["unets.uvit_2d"] = ["UVit2DModel"]
+
+if is_flax_available():
+    _import_structure["controlnet_flax"] = ["FlaxControlNetModel"]
+    _import_structure["unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"]
+    _import_structure["vae_flax"] = ["FlaxAutoencoderKL"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    if is_torch_available():
+        from .adapter import MultiAdapter, T2IAdapter
+        from .autoencoders import (
+            AsymmetricAutoencoderKL,
+            AutoencoderKL,
+            AutoencoderKLCogVideoX,
+            AutoencoderKLTemporalDecoder,
+            AutoencoderOobleck,
+            AutoencoderTiny,
+            ConsistencyDecoderVAE,
+            VQModel,
+        )
+        from .controlnet import ControlNetModel
+        from .colorguider_sd import ColorGuiderSDModel
+        from .controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
+        from .controlnet_hunyuan import HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel
+        from .controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
+        from .controlnet_sparsectrl import SparseControlNetModel
+        from .controlnet_xs import ControlNetXSAdapter, UNetControlNetXSModel
+        from .embeddings import ImageProjection
+        from .modeling_utils import ModelMixin
+        from .transformers import (
+            AuraFlowTransformer2DModel,
+            CogVideoXTransformer3DModel,
+            DiTTransformer2DModel,
+            DualTransformer2DModel,
+            FluxTransformer2DModel,
+            HunyuanDiT2DModel,
+            LatteTransformer3DModel,
+            LuminaNextDiT2DModel,
+            PixArtTransformer2DModel,
+            PixArtTransformerControl2DModel,
+            PriorTransformer,
+            SD3Transformer2DModel,
+            StableAudioDiTModel,
+            T5FilmDecoder,
+            Transformer2DModel,
+            TransformerTemporalModel,
+        )
+        from .unets import (
+            I2VGenXLUNet,
+            Kandinsky3UNet,
+            MotionAdapter,
+            StableCascadeUNet,
+            UNet1DModel,
+            UNet2DConditionModel,
+            UNet2DModel,
+            UNet3DConditionModel,
+            UNetMotionModel,
+            UNetSpatioTemporalConditionModel,
+            UVit2DModel,
+        )
+
+    if is_flax_available():
+        from .controlnet_flax import FlaxControlNetModel
+        from .unets import FlaxUNet2DConditionModel
+        from .vae_flax import FlaxAutoencoderKL
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/diffusers/src/diffusers/models/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..059a2f0ac3de6b29cc84dd3c5e82654d8fd28f9c
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/activations.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/activations.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42a73dadc3aa07a276abf0c4917eb777da819e38
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/activations.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/attention.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/attention.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f43950b8b435e67c34619880544c19c48ca48839
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/attention.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/attention_processor.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/attention_processor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eecd43dd796c28c5a2608741447f5ac1764daebb
--- /dev/null
+++ b/diffusers/src/diffusers/models/__pycache__/attention_processor.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d38a3a29e96261ffe5dd87dd132c3a17b8ae61a642b0136ed3ff2668c5c85ec
+size 244585
diff --git a/diffusers/src/diffusers/models/__pycache__/downsampling.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/downsampling.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..463ead07cbfe133eaf01b296a42c0ad3e1520590
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/downsampling.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/embeddings.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/embeddings.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5815fc8dcf7c3442da6a6f1b4cac1c96b8549d7
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/embeddings.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/lora.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/lora.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba0acde00d36685bce1b526f8e886a4caf36283d
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/lora.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/model_loading_utils.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/model_loading_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9031ca8977a9fe0e566264ec3db762c409b1a361
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/model_loading_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/modeling_outputs.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/modeling_outputs.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9111421cb06298434d56672b272ead6463af67a
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/modeling_outputs.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/modeling_utils.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/modeling_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62fb4cf6b9fc770c6714a2cfebb52328099e7387
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/modeling_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/normalization.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/normalization.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ae92f99a85488da0ac6de27034bca3aa09e7c53
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/normalization.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/resnet.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/resnet.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b256b3ca07905daedc1c4fe6a4d7de82bcc4be7
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/resnet.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/__pycache__/upsampling.cpython-311.pyc b/diffusers/src/diffusers/models/__pycache__/upsampling.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d48f8073a6d6f55361def2cd22b872dc97aad70
Binary files /dev/null and b/diffusers/src/diffusers/models/__pycache__/upsampling.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/activations.py b/diffusers/src/diffusers/models/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb24a36bae7574a338feef026f332fa07f1a7557
--- /dev/null
+++ b/diffusers/src/diffusers/models/activations.py
@@ -0,0 +1,165 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ..utils import deprecate
+from ..utils.import_utils import is_torch_npu_available
+
+
+if is_torch_npu_available():
+    import torch_npu
+
+ACTIVATION_FUNCTIONS = {
+    "swish": nn.SiLU(),
+    "silu": nn.SiLU(),
+    "mish": nn.Mish(),
+    "gelu": nn.GELU(),
+    "relu": nn.ReLU(),
+}
+
+
+def get_activation(act_fn: str) -> nn.Module:
+    """Helper function to get activation function from string.
+
+    Args:
+        act_fn (str): Name of activation function.
+
+    Returns:
+        nn.Module: Activation function.
+    """
+
+    act_fn = act_fn.lower()
+    if act_fn in ACTIVATION_FUNCTIONS:
+        return ACTIVATION_FUNCTIONS[act_fn]
+    else:
+        raise ValueError(f"Unsupported activation function: {act_fn}")
+
+
+class FP32SiLU(nn.Module):
+    r"""
+    SiLU activation function with input upcasted to torch.float32.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        return F.silu(inputs.float(), inplace=False).to(inputs.dtype)
+
+
+class GELU(nn.Module):
+    r"""
+    GELU activation function with tanh approximation support with `approximate="tanh"`.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        self.approximate = approximate
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        if gate.device.type != "mps":
+            return F.gelu(gate, approximate=self.approximate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
+
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states = self.gelu(hidden_states)
+        return hidden_states
+
+
+class GEGLU(nn.Module):
+    r"""
+    A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        if gate.device.type != "mps":
+            return F.gelu(gate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
+
+    def forward(self, hidden_states, *args, **kwargs):
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+        hidden_states = self.proj(hidden_states)
+        if is_torch_npu_available():
+            # using torch_npu.npu_geglu can run faster and save memory on NPU.
+            return torch_npu.npu_geglu(hidden_states, dim=-1, approximate=1)[0]
+        else:
+            hidden_states, gate = hidden_states.chunk(2, dim=-1)
+            return hidden_states * self.gelu(gate)
+
+
+class SwiGLU(nn.Module):
+    r"""
+    A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. It's similar to `GEGLU`
+    but uses SiLU / Swish instead of GeLU.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
+        self.activation = nn.SiLU()
+
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states, gate = hidden_states.chunk(2, dim=-1)
+        return hidden_states * self.activation(gate)
+
+
+class ApproximateGELU(nn.Module):
+    r"""
+    The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this
+    [paper](https://arxiv.org/abs/1606.08415).
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        return x * torch.sigmoid(1.702 * x)
diff --git a/diffusers/src/diffusers/models/adapter.py b/diffusers/src/diffusers/models/adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f4b2ec033717a8d4eb10fe34226d88e966d1ec5
--- /dev/null
+++ b/diffusers/src/diffusers/models/adapter.py
@@ -0,0 +1,584 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Callable, List, Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import logging
+from .modeling_utils import ModelMixin
+
+
+logger = logging.get_logger(__name__)
+
+
+class MultiAdapter(ModelMixin):
+    r"""
+    MultiAdapter is a wrapper model that contains multiple adapter models and merges their outputs according to
+    user-assigned weighting.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the model (such as downloading or saving, etc.)
+
+    Parameters:
+        adapters (`List[T2IAdapter]`, *optional*, defaults to None):
+            A list of `T2IAdapter` model instances.
+    """
+
+    def __init__(self, adapters: List["T2IAdapter"]):
+        super(MultiAdapter, self).__init__()
+
+        self.num_adapter = len(adapters)
+        self.adapters = nn.ModuleList(adapters)
+
+        if len(adapters) == 0:
+            raise ValueError("Expecting at least one adapter")
+
+        if len(adapters) == 1:
+            raise ValueError("For a single adapter, please use the `T2IAdapter` class instead of `MultiAdapter`")
+
+        # The outputs from each adapter are added together with a weight.
+        # This means that the change in dimensions from downsampling must
+        # be the same for all adapters. Inductively, it also means the
+        # downscale_factor and total_downscale_factor must be the same for all
+        # adapters.
+        first_adapter_total_downscale_factor = adapters[0].total_downscale_factor
+        first_adapter_downscale_factor = adapters[0].downscale_factor
+        for idx in range(1, len(adapters)):
+            if (
+                adapters[idx].total_downscale_factor != first_adapter_total_downscale_factor
+                or adapters[idx].downscale_factor != first_adapter_downscale_factor
+            ):
+                raise ValueError(
+                    f"Expecting all adapters to have the same downscaling behavior, but got:\n"
+                    f"adapters[0].total_downscale_factor={first_adapter_total_downscale_factor}\n"
+                    f"adapters[0].downscale_factor={first_adapter_downscale_factor}\n"
+                    f"adapter[`{idx}`].total_downscale_factor={adapters[idx].total_downscale_factor}\n"
+                    f"adapter[`{idx}`].downscale_factor={adapters[idx].downscale_factor}"
+                )
+
+        self.total_downscale_factor = first_adapter_total_downscale_factor
+        self.downscale_factor = first_adapter_downscale_factor
+
+    def forward(self, xs: torch.Tensor, adapter_weights: Optional[List[float]] = None) -> List[torch.Tensor]:
+        r"""
+        Args:
+            xs (`torch.Tensor`):
+                (batch, channel, height, width) input images for multiple adapter models concated along dimension 1,
+                `channel` should equal to `num_adapter` * "number of channel of image".
+            adapter_weights (`List[float]`, *optional*, defaults to None):
+                List of floats representing the weight which will be multiply to each adapter's output before adding
+                them together.
+        """
+        if adapter_weights is None:
+            adapter_weights = torch.tensor([1 / self.num_adapter] * self.num_adapter)
+        else:
+            adapter_weights = torch.tensor(adapter_weights)
+
+        accume_state = None
+        for x, w, adapter in zip(xs, adapter_weights, self.adapters):
+            features = adapter(x)
+            if accume_state is None:
+                accume_state = features
+                for i in range(len(accume_state)):
+                    accume_state[i] = w * accume_state[i]
+            else:
+                for i in range(len(features)):
+                    accume_state[i] += w * features[i]
+        return accume_state
+
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        is_main_process: bool = True,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+        variant: Optional[str] = None,
+    ):
+        """
+        Save a model and its configuration file to a directory, so that it can be re-loaded using the
+        `[`~models.adapter.MultiAdapter.from_pretrained`]` class method.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to which to save. Will be created if it doesn't exist.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful when in distributed training like
+                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
+                the main process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
+                need to replace `torch.save` by another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+            variant (`str`, *optional*):
+                If specified, weights are saved in the format pytorch_model.<variant>.bin.
+        """
+        idx = 0
+        model_path_to_save = save_directory
+        for adapter in self.adapters:
+            adapter.save_pretrained(
+                model_path_to_save,
+                is_main_process=is_main_process,
+                save_function=save_function,
+                safe_serialization=safe_serialization,
+                variant=variant,
+            )
+
+            idx += 1
+            model_path_to_save = model_path_to_save + f"_{idx}"
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs):
+        r"""
+        Instantiate a pretrained MultiAdapter model from multiple pre-trained adapter models.
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
+        the model, you should first set it back in training mode with `model.train()`.
+
+        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
+        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
+        task.
+
+        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
+        weights are discarded.
+
+        Parameters:
+            pretrained_model_path (`os.PathLike`):
+                A path to a *directory* containing model weights saved using
+                [`~diffusers.models.adapter.MultiAdapter.save_pretrained`], e.g., `./my_model_directory/adapter`.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
+                will be automatically derived from the model's weights.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn't need to be refined to each
+                parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
+                same device.
+
+                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            max_memory (`Dict`, *optional*):
+                A dictionary device identifier to maximum memory. Will default to the maximum memory available for each
+                GPU and the available CPU RAM if unset.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
+                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
+                model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
+                setting this argument to `True` will raise an error.
+            variant (`str`, *optional*):
+                If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin. `variant` is
+                ignored when using `from_flax`.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the
+                `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from
+                `safetensors` weights. If set to `False`, loading will *not* use `safetensors`.
+        """
+        idx = 0
+        adapters = []
+
+        # load adapter and append to list until no adapter directory exists anymore
+        # first adapter has to be saved under `./mydirectory/adapter` to be compliant with `DiffusionPipeline.from_pretrained`
+        # second, third, ... adapters have to be saved under `./mydirectory/adapter_1`, `./mydirectory/adapter_2`, ...
+        model_path_to_load = pretrained_model_path
+        while os.path.isdir(model_path_to_load):
+            adapter = T2IAdapter.from_pretrained(model_path_to_load, **kwargs)
+            adapters.append(adapter)
+
+            idx += 1
+            model_path_to_load = pretrained_model_path + f"_{idx}"
+
+        logger.info(f"{len(adapters)} adapters loaded from {pretrained_model_path}.")
+
+        if len(adapters) == 0:
+            raise ValueError(
+                f"No T2IAdapters found under {os.path.dirname(pretrained_model_path)}. Expected at least {pretrained_model_path + '_0'}."
+            )
+
+        return cls(adapters)
+
+
+class T2IAdapter(ModelMixin, ConfigMixin):
+    r"""
+    A simple ResNet-like model that accepts images containing control signals such as keyposes and depth. The model
+    generates multiple feature maps that are used as additional conditioning in [`UNet2DConditionModel`]. The model's
+    architecture follows the original implementation of
+    [Adapter](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L97)
+     and
+     [AdapterLight](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L235).
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the model (such as downloading or saving, etc.)
+
+    Parameters:
+        in_channels (`int`, *optional*, defaults to 3):
+            Number of channels of Aapter's input(*control image*). Set this parameter to 1 if you're using gray scale
+            image as *control image*.
+        channels (`List[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The number of channel of each downsample block's output hidden state. The `len(block_out_channels)` will
+            also determine the number of downsample blocks in the Adapter.
+        num_res_blocks (`int`, *optional*, defaults to 2):
+            Number of ResNet blocks in each downsample block.
+        downscale_factor (`int`, *optional*, defaults to 8):
+            A factor that determines the total downscale factor of the Adapter.
+        adapter_type (`str`, *optional*, defaults to `full_adapter`):
+            The type of Adapter to use. Choose either `full_adapter` or `full_adapter_xl` or `light_adapter`.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        channels: List[int] = [320, 640, 1280, 1280],
+        num_res_blocks: int = 2,
+        downscale_factor: int = 8,
+        adapter_type: str = "full_adapter",
+    ):
+        super().__init__()
+
+        if adapter_type == "full_adapter":
+            self.adapter = FullAdapter(in_channels, channels, num_res_blocks, downscale_factor)
+        elif adapter_type == "full_adapter_xl":
+            self.adapter = FullAdapterXL(in_channels, channels, num_res_blocks, downscale_factor)
+        elif adapter_type == "light_adapter":
+            self.adapter = LightAdapter(in_channels, channels, num_res_blocks, downscale_factor)
+        else:
+            raise ValueError(
+                f"Unsupported adapter_type: '{adapter_type}'. Choose either 'full_adapter' or "
+                "'full_adapter_xl' or 'light_adapter'."
+            )
+
+    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+        r"""
+        This function processes the input tensor `x` through the adapter model and returns a list of feature tensors,
+        each representing information extracted at a different scale from the input. The length of the list is
+        determined by the number of downsample blocks in the Adapter, as specified by the `channels` and
+        `num_res_blocks` parameters during initialization.
+        """
+        return self.adapter(x)
+
+    @property
+    def total_downscale_factor(self):
+        return self.adapter.total_downscale_factor
+
+    @property
+    def downscale_factor(self):
+        """The downscale factor applied in the T2I-Adapter's initial pixel unshuffle operation. If an input image's dimensions are
+        not evenly divisible by the downscale_factor then an exception will be raised.
+        """
+        return self.adapter.unshuffle.downscale_factor
+
+
+# full adapter
+
+
+class FullAdapter(nn.Module):
+    r"""
+    See [`T2IAdapter`] for more information.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        channels: List[int] = [320, 640, 1280, 1280],
+        num_res_blocks: int = 2,
+        downscale_factor: int = 8,
+    ):
+        super().__init__()
+
+        in_channels = in_channels * downscale_factor**2
+
+        self.unshuffle = nn.PixelUnshuffle(downscale_factor)
+        self.conv_in = nn.Conv2d(in_channels, channels[0], kernel_size=3, padding=1)
+
+        self.body = nn.ModuleList(
+            [
+                AdapterBlock(channels[0], channels[0], num_res_blocks),
+                *[
+                    AdapterBlock(channels[i - 1], channels[i], num_res_blocks, down=True)
+                    for i in range(1, len(channels))
+                ],
+            ]
+        )
+
+        self.total_downscale_factor = downscale_factor * 2 ** (len(channels) - 1)
+
+    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+        r"""
+        This method processes the input tensor `x` through the FullAdapter model and performs operations including
+        pixel unshuffling, convolution, and a stack of AdapterBlocks. It returns a list of feature tensors, each
+        capturing information at a different stage of processing within the FullAdapter model. The number of feature
+        tensors in the list is determined by the number of downsample blocks specified during initialization.
+        """
+        x = self.unshuffle(x)
+        x = self.conv_in(x)
+
+        features = []
+
+        for block in self.body:
+            x = block(x)
+            features.append(x)
+
+        return features
+
+
+class FullAdapterXL(nn.Module):
+    r"""
+    See [`T2IAdapter`] for more information.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        channels: List[int] = [320, 640, 1280, 1280],
+        num_res_blocks: int = 2,
+        downscale_factor: int = 16,
+    ):
+        super().__init__()
+
+        in_channels = in_channels * downscale_factor**2
+
+        self.unshuffle = nn.PixelUnshuffle(downscale_factor)
+        self.conv_in = nn.Conv2d(in_channels, channels[0], kernel_size=3, padding=1)
+
+        self.body = []
+        # blocks to extract XL features with dimensions of [320, 64, 64], [640, 64, 64], [1280, 32, 32], [1280, 32, 32]
+        for i in range(len(channels)):
+            if i == 1:
+                self.body.append(AdapterBlock(channels[i - 1], channels[i], num_res_blocks))
+            elif i == 2:
+                self.body.append(AdapterBlock(channels[i - 1], channels[i], num_res_blocks, down=True))
+            else:
+                self.body.append(AdapterBlock(channels[i], channels[i], num_res_blocks))
+
+        self.body = nn.ModuleList(self.body)
+        # XL has only one downsampling AdapterBlock.
+        self.total_downscale_factor = downscale_factor * 2
+
+    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+        r"""
+        This method takes the tensor x as input and processes it through FullAdapterXL model. It consists of operations
+        including unshuffling pixels, applying convolution layer and appending each block into list of feature tensors.
+        """
+        x = self.unshuffle(x)
+        x = self.conv_in(x)
+
+        features = []
+
+        for block in self.body:
+            x = block(x)
+            features.append(x)
+
+        return features
+
+
+class AdapterBlock(nn.Module):
+    r"""
+    An AdapterBlock is a helper model that contains multiple ResNet-like blocks. It is used in the `FullAdapter` and
+    `FullAdapterXL` models.
+
+    Parameters:
+        in_channels (`int`):
+            Number of channels of AdapterBlock's input.
+        out_channels (`int`):
+            Number of channels of AdapterBlock's output.
+        num_res_blocks (`int`):
+            Number of ResNet blocks in the AdapterBlock.
+        down (`bool`, *optional*, defaults to `False`):
+            Whether to perform downsampling on AdapterBlock's input.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, num_res_blocks: int, down: bool = False):
+        super().__init__()
+
+        self.downsample = None
+        if down:
+            self.downsample = nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True)
+
+        self.in_conv = None
+        if in_channels != out_channels:
+            self.in_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)
+
+        self.resnets = nn.Sequential(
+            *[AdapterResnetBlock(out_channels) for _ in range(num_res_blocks)],
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        r"""
+        This method takes tensor x as input and performs operations downsampling and convolutional layers if the
+        self.downsample and self.in_conv properties of AdapterBlock model are specified. Then it applies a series of
+        residual blocks to the input tensor.
+        """
+        if self.downsample is not None:
+            x = self.downsample(x)
+
+        if self.in_conv is not None:
+            x = self.in_conv(x)
+
+        x = self.resnets(x)
+
+        return x
+
+
+class AdapterResnetBlock(nn.Module):
+    r"""
+    An `AdapterResnetBlock` is a helper model that implements a ResNet-like block.
+
+    Parameters:
+        channels (`int`):
+            Number of channels of AdapterResnetBlock's input and output.
+    """
+
+    def __init__(self, channels: int):
+        super().__init__()
+        self.block1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.act = nn.ReLU()
+        self.block2 = nn.Conv2d(channels, channels, kernel_size=1)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        r"""
+        This method takes input tensor x and applies a convolutional layer, ReLU activation, and another convolutional
+        layer on the input tensor. It returns addition with the input tensor.
+        """
+
+        h = self.act(self.block1(x))
+        h = self.block2(h)
+
+        return h + x
+
+
+# light adapter
+
+
+class LightAdapter(nn.Module):
+    r"""
+    See [`T2IAdapter`] for more information.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        channels: List[int] = [320, 640, 1280],
+        num_res_blocks: int = 4,
+        downscale_factor: int = 8,
+    ):
+        super().__init__()
+
+        in_channels = in_channels * downscale_factor**2
+
+        self.unshuffle = nn.PixelUnshuffle(downscale_factor)
+
+        self.body = nn.ModuleList(
+            [
+                LightAdapterBlock(in_channels, channels[0], num_res_blocks),
+                *[
+                    LightAdapterBlock(channels[i], channels[i + 1], num_res_blocks, down=True)
+                    for i in range(len(channels) - 1)
+                ],
+                LightAdapterBlock(channels[-1], channels[-1], num_res_blocks, down=True),
+            ]
+        )
+
+        self.total_downscale_factor = downscale_factor * (2 ** len(channels))
+
+    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+        r"""
+        This method takes the input tensor x and performs downscaling and appends it in list of feature tensors. Each
+        feature tensor corresponds to a different level of processing within the LightAdapter.
+        """
+        x = self.unshuffle(x)
+
+        features = []
+
+        for block in self.body:
+            x = block(x)
+            features.append(x)
+
+        return features
+
+
+class LightAdapterBlock(nn.Module):
+    r"""
+    A `LightAdapterBlock` is a helper model that contains multiple `LightAdapterResnetBlocks`. It is used in the
+    `LightAdapter` model.
+
+    Parameters:
+        in_channels (`int`):
+            Number of channels of LightAdapterBlock's input.
+        out_channels (`int`):
+            Number of channels of LightAdapterBlock's output.
+        num_res_blocks (`int`):
+            Number of LightAdapterResnetBlocks in the LightAdapterBlock.
+        down (`bool`, *optional*, defaults to `False`):
+            Whether to perform downsampling on LightAdapterBlock's input.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, num_res_blocks: int, down: bool = False):
+        super().__init__()
+        mid_channels = out_channels // 4
+
+        self.downsample = None
+        if down:
+            self.downsample = nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True)
+
+        self.in_conv = nn.Conv2d(in_channels, mid_channels, kernel_size=1)
+        self.resnets = nn.Sequential(*[LightAdapterResnetBlock(mid_channels) for _ in range(num_res_blocks)])
+        self.out_conv = nn.Conv2d(mid_channels, out_channels, kernel_size=1)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        r"""
+        This method takes tensor x as input and performs downsampling if required. Then it applies in convolution
+        layer, a sequence of residual blocks, and out convolutional layer.
+        """
+        if self.downsample is not None:
+            x = self.downsample(x)
+
+        x = self.in_conv(x)
+        x = self.resnets(x)
+        x = self.out_conv(x)
+
+        return x
+
+
+class LightAdapterResnetBlock(nn.Module):
+    """
+    A `LightAdapterResnetBlock` is a helper model that implements a ResNet-like block with a slightly different
+    architecture than `AdapterResnetBlock`.
+
+    Parameters:
+        channels (`int`):
+            Number of channels of LightAdapterResnetBlock's input and output.
+    """
+
+    def __init__(self, channels: int):
+        super().__init__()
+        self.block1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.act = nn.ReLU()
+        self.block2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        r"""
+        This function takes input tensor x and processes it through one convolutional layer, ReLU activation, and
+        another convolutional layer and adds it to input tensor.
+        """
+
+        h = self.act(self.block1(x))
+        h = self.block2(h)
+
+        return h + x
diff --git a/diffusers/src/diffusers/models/attention.py b/diffusers/src/diffusers/models/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cf65822ef9e10c2c8e4ad686f47ad60f2ae37da
--- /dev/null
+++ b/diffusers/src/diffusers/models/attention.py
@@ -0,0 +1,2060 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ..utils import deprecate, logging
+from ..utils.torch_utils import maybe_allow_in_graph
+from .activations import GEGLU, GELU, ApproximateGELU, FP32SiLU, SwiGLU
+from .attention_processor import Attention, JointAttnProcessor2_0, CausalKVcacheAttention, CausalAttention
+from .embeddings import SinusoidalPositionalEmbedding
+from .normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
+from einops import rearrange
+
+logger = logging.get_logger(__name__)
+
+
+def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int):
+    # "feed_forward_chunk_size" can be used to save memory
+    if hidden_states.shape[chunk_dim] % chunk_size != 0:
+        raise ValueError(
+            f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
+        )
+
+    num_chunks = hidden_states.shape[chunk_dim] // chunk_size
+    ff_output = torch.cat(
+        [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
+        dim=chunk_dim,
+    )
+    return ff_output
+
+
+@maybe_allow_in_graph
+class GatedSelfAttentionDense(nn.Module):
+    r"""
+    A gated self-attention dense layer that combines visual features and object features.
+
+    Parameters:
+        query_dim (`int`): The number of channels in the query.
+        context_dim (`int`): The number of channels in the context.
+        n_heads (`int`): The number of heads to use for attention.
+        d_head (`int`): The number of channels in each head.
+    """
+
+    def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
+        super().__init__()
+
+        # we need a linear projection since we need cat visual feature and obj feature
+        self.linear = nn.Linear(context_dim, query_dim)
+
+        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
+        self.ff = FeedForward(query_dim, activation_fn="geglu")
+
+        self.norm1 = nn.LayerNorm(query_dim)
+        self.norm2 = nn.LayerNorm(query_dim)
+
+        self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
+        self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
+
+        self.enabled = True
+
+    def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
+        if not self.enabled:
+            return x
+
+        n_visual = x.shape[1]
+        objs = self.linear(objs)
+
+        x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
+        x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
+
+        return x
+
+
+@maybe_allow_in_graph
+class JointTransformerBlock(nn.Module):
+    r"""
+    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
+
+    Reference: https://arxiv.org/abs/2403.03206
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
+            processing of `context` conditions.
+    """
+
+    def __init__(self, dim, num_attention_heads, attention_head_dim, context_pre_only=False):
+        super().__init__()
+
+        self.context_pre_only = context_pre_only
+        context_norm_type = "ada_norm_continous" if context_pre_only else "ada_norm_zero"
+
+        self.norm1 = AdaLayerNormZero(dim)
+
+        if context_norm_type == "ada_norm_continous":
+            self.norm1_context = AdaLayerNormContinuous(
+                dim, dim, elementwise_affine=False, eps=1e-6, bias=True, norm_type="layer_norm"
+            )
+        elif context_norm_type == "ada_norm_zero":
+            self.norm1_context = AdaLayerNormZero(dim)
+        else:
+            raise ValueError(
+                f"Unknown context_norm_type: {context_norm_type}, currently only support `ada_norm_continous`, `ada_norm_zero`"
+            )
+        if hasattr(F, "scaled_dot_product_attention"):
+            processor = JointAttnProcessor2_0()
+        else:
+            raise ValueError(
+                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
+            )
+        self.attn = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,
+            added_kv_proj_dim=dim,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            context_pre_only=context_pre_only,
+            bias=True,
+            processor=processor,
+        )
+
+        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
+
+        if not context_pre_only:
+            self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+            self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
+        else:
+            self.norm2_context = None
+            self.ff_context = None
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    # Copied from diffusers.models.attention.BasicTransformerBlock.set_chunk_feed_forward
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor
+    ):
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+
+        if self.context_pre_only:
+            norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb)
+        else:
+            norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
+                encoder_hidden_states, emb=temb
+            )
+
+        # Attention.
+        attn_output, context_attn_output = self.attn(
+            hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states
+        )
+
+        # Process attention outputs for the `hidden_states`.
+        attn_output = gate_msa.unsqueeze(1) * attn_output
+        hidden_states = hidden_states + attn_output
+
+        norm_hidden_states = self.norm2(hidden_states)
+        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            ff_output = self.ff(norm_hidden_states)
+        ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+        hidden_states = hidden_states + ff_output
+
+        # Process attention outputs for the `encoder_hidden_states`.
+        if self.context_pre_only:
+            encoder_hidden_states = None
+        else:
+            context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
+            encoder_hidden_states = encoder_hidden_states + context_attn_output
+
+            norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+            norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+            if self._chunk_size is not None:
+                # "feed_forward_chunk_size" can be used to save memory
+                context_ff_output = _chunked_feed_forward(
+                    self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size
+                )
+            else:
+                context_ff_output = self.ff_context(norm_encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
+
+        return encoder_hidden_states, hidden_states
+
+
+@maybe_allow_in_graph
+class BasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (:
+            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (:
+            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+        final_dropout (`bool` *optional*, defaults to False):
+            Whether to apply a final dropout after the last feed-forward layer.
+        attention_type (`str`, *optional*, defaults to `"default"`):
+            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+        positional_embeddings (`str`, *optional*, defaults to `None`):
+            The type of positional embeddings to apply to.
+        num_positional_embeddings (`int`, *optional*, defaults to `None`):
+            The maximum number of positional embeddings to apply.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+        norm_eps: float = 1e-5,
+        final_dropout: bool = False,
+        attention_type: str = "default",
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
+        ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
+        ada_norm_bias: Optional[int] = None,
+        ff_inner_dim: Optional[int] = None,
+        ff_bias: bool = True,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.dropout = dropout
+        self.cross_attention_dim = cross_attention_dim
+        self.activation_fn = activation_fn
+        self.attention_bias = attention_bias
+        self.double_self_attention = double_self_attention
+        self.norm_elementwise_affine = norm_elementwise_affine
+        self.positional_embeddings = positional_embeddings
+        self.num_positional_embeddings = num_positional_embeddings
+        self.only_cross_attention = only_cross_attention
+
+        # We keep these boolean flags for backward-compatibility.
+        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+        self.use_layer_norm = norm_type == "layer_norm"
+        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+
+        self.norm_type = norm_type
+        self.num_embeds_ada_norm = num_embeds_ada_norm
+
+        if positional_embeddings and (num_positional_embeddings is None):
+            raise ValueError(
+                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+            )
+
+        if positional_embeddings == "sinusoidal":
+            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+        else:
+            self.pos_embed = None
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        if norm_type == "ada_norm":
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        elif norm_type == "ada_norm_zero":
+            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+        elif norm_type == "ada_norm_continuous":
+            self.norm1 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "rms_norm",
+            )
+        else:
+            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+            out_bias=attention_out_bias,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            if norm_type == "ada_norm":
+                self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+            elif norm_type == "ada_norm_continuous":
+                self.norm2 = AdaLayerNormContinuous(
+                    dim,
+                    ada_norm_continous_conditioning_embedding_dim,
+                    norm_elementwise_affine,
+                    norm_eps,
+                    ada_norm_bias,
+                    "rms_norm",
+                )
+            else:
+                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+                out_bias=attention_out_bias,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            if norm_type == "ada_norm_single":  # For Latte
+                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+            else:
+                self.norm2 = None
+            self.attn2 = None
+
+        # 3. Feed-forward
+        if norm_type == "ada_norm_continuous":
+            self.norm3 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "layer_norm",
+            )
+
+        elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm"]:
+            self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+        elif norm_type == "layer_norm_i2vgen":
+            self.norm3 = None
+
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+            inner_dim=ff_inner_dim,
+            bias=ff_bias,
+        )
+
+        # 4. Fuser
+        if attention_type == "gated" or attention_type == "gated-text-image":
+            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
+        # 5. Scale-shift for PixArt-Alpha.
+        if norm_type == "ada_norm_single":
+            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+
+        if self.norm_type == "ada_norm":
+            norm_hidden_states = self.norm1(hidden_states, timestep)
+        elif self.norm_type == "ada_norm_zero":
+            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+            )
+        elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
+            norm_hidden_states = self.norm1(hidden_states)
+        elif self.norm_type == "ada_norm_continuous":
+            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif self.norm_type == "ada_norm_single":
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+            ).chunk(6, dim=1)
+            norm_hidden_states = self.norm1(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+        else:
+            raise ValueError("Incorrect norm used")
+
+        if self.pos_embed is not None:
+            norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+        # 1. Prepare GLIGEN inputs
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+
+        if self.norm_type == "ada_norm_zero":
+            attn_output = gate_msa.unsqueeze(1) * attn_output
+        elif self.norm_type == "ada_norm_single":
+            attn_output = gate_msa * attn_output
+
+        hidden_states = attn_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+
+        # 1.2 GLIGEN Control
+        if gligen_kwargs is not None:
+            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            if self.norm_type == "ada_norm":
+                norm_hidden_states = self.norm2(hidden_states, timestep)
+            elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
+                norm_hidden_states = self.norm2(hidden_states)
+            elif self.norm_type == "ada_norm_single":
+                # For PixArt norm2 isn't applied here:
+                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+                norm_hidden_states = hidden_states
+            elif self.norm_type == "ada_norm_continuous":
+                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+            else:
+                raise ValueError("Incorrect norm")
+
+            if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+                norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+            attn_output = self.attn2(
+                norm_hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                **cross_attention_kwargs,
+            )
+            hidden_states = attn_output + hidden_states
+
+        # 4. Feed-forward
+        # i2vgen doesn't have this norm 🤷‍♂️
+        if self.norm_type == "ada_norm_continuous":
+            norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif not self.norm_type == "ada_norm_single":
+            norm_hidden_states = self.norm3(hidden_states)
+
+        if self.norm_type == "ada_norm_zero":
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+        if self.norm_type == "ada_norm_single":
+            norm_hidden_states = self.norm2(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            ff_output = self.ff(norm_hidden_states)
+
+        if self.norm_type == "ada_norm_zero":
+            ff_output = gate_mlp.unsqueeze(1) * ff_output
+        elif self.norm_type == "ada_norm_single":
+            ff_output = gate_mlp * ff_output
+
+        hidden_states = ff_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+
+        return hidden_states
+
+
+class LuminaFeedForward(nn.Module):
+    r"""
+    A feed-forward layer.
+
+    Parameters:
+        hidden_size (`int`):
+            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
+            hidden representations.
+        intermediate_size (`int`): The intermediate dimension of the feedforward layer.
+        multiple_of (`int`, *optional*): Value to ensure hidden dimension is a multiple
+            of this value.
+        ffn_dim_multiplier (float, *optional*): Custom multiplier for hidden
+            dimension. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        inner_dim: int,
+        multiple_of: Optional[int] = 256,
+        ffn_dim_multiplier: Optional[float] = None,
+    ):
+        super().__init__()
+        inner_dim = int(2 * inner_dim / 3)
+        # custom hidden_size factor multiplier
+        if ffn_dim_multiplier is not None:
+            inner_dim = int(ffn_dim_multiplier * inner_dim)
+        inner_dim = multiple_of * ((inner_dim + multiple_of - 1) // multiple_of)
+
+        self.linear_1 = nn.Linear(
+            dim,
+            inner_dim,
+            bias=False,
+        )
+        self.linear_2 = nn.Linear(
+            inner_dim,
+            dim,
+            bias=False,
+        )
+        self.linear_3 = nn.Linear(
+            dim,
+            inner_dim,
+            bias=False,
+        )
+        self.silu = FP32SiLU()
+
+    def forward(self, x):
+        return self.linear_2(self.silu(self.linear_1(x)) * self.linear_3(x))
+
+
+@maybe_allow_in_graph
+class TemporalBasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block for video like data.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        time_mix_inner_dim (`int`): The number of channels for temporal attention.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        time_mix_inner_dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        cross_attention_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.is_res = dim == time_mix_inner_dim
+
+        self.norm_in = nn.LayerNorm(dim)
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        self.ff_in = FeedForward(
+            dim,
+            dim_out=time_mix_inner_dim,
+            activation_fn="geglu",
+        )
+
+        self.norm1 = nn.LayerNorm(time_mix_inner_dim)
+        self.attn1 = Attention(
+            query_dim=time_mix_inner_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            cross_attention_dim=None,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            self.norm2 = nn.LayerNorm(time_mix_inner_dim)
+            self.attn2 = Attention(
+                query_dim=time_mix_inner_dim,
+                cross_attention_dim=cross_attention_dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.norm2 = None
+            self.attn2 = None
+
+        # 3. Feed-forward
+        self.norm3 = nn.LayerNorm(time_mix_inner_dim)
+        self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu")
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = None
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        # chunk dim should be hardcoded to 1 to have better speed vs. memory trade-off
+        self._chunk_dim = 1
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        num_frames: int,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+
+        batch_frames, seq_length, channels = hidden_states.shape
+        batch_size = batch_frames // num_frames
+
+        hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, seq_length, channels)
+        hidden_states = hidden_states.permute(0, 2, 1, 3)
+        hidden_states = hidden_states.reshape(batch_size * seq_length, num_frames, channels)
+
+        residual = hidden_states
+        hidden_states = self.norm_in(hidden_states)
+
+        if self._chunk_size is not None:
+            hidden_states = _chunked_feed_forward(self.ff_in, hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            hidden_states = self.ff_in(hidden_states)
+
+        if self.is_res:
+            hidden_states = hidden_states + residual
+
+        norm_hidden_states = self.norm1(hidden_states)
+        attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
+        hidden_states = attn_output + hidden_states
+
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            norm_hidden_states = self.norm2(hidden_states)
+            attn_output = self.attn2(norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+            hidden_states = attn_output + hidden_states
+
+        # 4. Feed-forward
+        norm_hidden_states = self.norm3(hidden_states)
+
+        if self._chunk_size is not None:
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            ff_output = self.ff(norm_hidden_states)
+
+        if self.is_res:
+            hidden_states = ff_output + hidden_states
+        else:
+            hidden_states = ff_output
+
+        hidden_states = hidden_states[None, :].reshape(batch_size, seq_length, num_frames, channels)
+        hidden_states = hidden_states.permute(0, 2, 1, 3)
+        hidden_states = hidden_states.reshape(batch_size * num_frames, seq_length, channels)
+
+        return hidden_states
+
+
+class SkipFFTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        kv_input_dim: int,
+        kv_input_dim_proj_use_bias: bool,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        if kv_input_dim != dim:
+            self.kv_mapper = nn.Linear(kv_input_dim, dim, kv_input_dim_proj_use_bias)
+        else:
+            self.kv_mapper = None
+
+        self.norm1 = RMSNorm(dim, 1e-06)
+
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim,
+            out_bias=attention_out_bias,
+        )
+
+        self.norm2 = RMSNorm(dim, 1e-06)
+
+        self.attn2 = Attention(
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            out_bias=attention_out_bias,
+        )
+
+    def forward(self, hidden_states, encoder_hidden_states, cross_attention_kwargs):
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+
+        if self.kv_mapper is not None:
+            encoder_hidden_states = self.kv_mapper(F.silu(encoder_hidden_states))
+
+        norm_hidden_states = self.norm1(hidden_states)
+
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            **cross_attention_kwargs,
+        )
+
+        hidden_states = attn_output + hidden_states
+
+        norm_hidden_states = self.norm2(hidden_states)
+
+        attn_output = self.attn2(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            **cross_attention_kwargs,
+        )
+
+        hidden_states = attn_output + hidden_states
+
+        return hidden_states
+
+
+@maybe_allow_in_graph
+class FreeNoiseTransformerBlock(nn.Module):
+    r"""
+    A FreeNoise Transformer block.
+
+    Parameters:
+        dim (`int`):
+            The number of channels in the input and output.
+        num_attention_heads (`int`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`):
+            The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
+        cross_attention_dim (`int`, *optional*):
+            The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`):
+            Activation function to be used in feed-forward.
+        num_embeds_ada_norm (`int`, *optional*):
+            The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (`bool`, defaults to `False`):
+            Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, defaults to `False`):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, defaults to `False`):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, defaults to `False`):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, defaults to `"layer_norm"`):
+            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+        final_dropout (`bool` defaults to `False`):
+            Whether to apply a final dropout after the last feed-forward layer.
+        attention_type (`str`, defaults to `"default"`):
+            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+        positional_embeddings (`str`, *optional*):
+            The type of positional embeddings to apply to.
+        num_positional_embeddings (`int`, *optional*, defaults to `None`):
+            The maximum number of positional embeddings to apply.
+        ff_inner_dim (`int`, *optional*):
+            Hidden dimension of feed-forward MLP.
+        ff_bias (`bool`, defaults to `True`):
+            Whether or not to use bias in feed-forward MLP.
+        attention_out_bias (`bool`, defaults to `True`):
+            Whether or not to use bias in attention output project layer.
+        context_length (`int`, defaults to `16`):
+            The maximum number of frames that the FreeNoise block processes at once.
+        context_stride (`int`, defaults to `4`):
+            The number of frames to be skipped before starting to process a new batch of `context_length` frames.
+        weighting_scheme (`str`, defaults to `"pyramid"`):
+            The weighting scheme to use for weighting averaging of processed latent frames. As described in the
+            Equation 9. of the [FreeNoise](https://arxiv.org/abs/2310.15169) paper, "pyramid" is the default setting
+            used.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout: float = 0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",
+        norm_eps: float = 1e-5,
+        final_dropout: bool = False,
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
+        ff_inner_dim: Optional[int] = None,
+        ff_bias: bool = True,
+        attention_out_bias: bool = True,
+        context_length: int = 16,
+        context_stride: int = 4,
+        weighting_scheme: str = "pyramid",
+    ):
+        super().__init__()
+        self.dim = dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.dropout = dropout
+        self.cross_attention_dim = cross_attention_dim
+        self.activation_fn = activation_fn
+        self.attention_bias = attention_bias
+        self.double_self_attention = double_self_attention
+        self.norm_elementwise_affine = norm_elementwise_affine
+        self.positional_embeddings = positional_embeddings
+        self.num_positional_embeddings = num_positional_embeddings
+        self.only_cross_attention = only_cross_attention
+
+        self.set_free_noise_properties(context_length, context_stride, weighting_scheme)
+
+        # We keep these boolean flags for backward-compatibility.
+        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+        self.use_layer_norm = norm_type == "layer_norm"
+        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+
+        self.norm_type = norm_type
+        self.num_embeds_ada_norm = num_embeds_ada_norm
+
+        if positional_embeddings and (num_positional_embeddings is None):
+            raise ValueError(
+                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+            )
+
+        if positional_embeddings == "sinusoidal":
+            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+        else:
+            self.pos_embed = None
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+            out_bias=attention_out_bias,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+                out_bias=attention_out_bias,
+            )  # is self-attn if encoder_hidden_states is none
+
+        # 3. Feed-forward
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+            inner_dim=ff_inner_dim,
+            bias=ff_bias,
+        )
+
+        self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def _get_frame_indices(self, num_frames: int) -> List[Tuple[int, int]]:
+        frame_indices = []
+        for i in range(0, num_frames - self.context_length + 1, self.context_stride):
+            window_start = i
+            window_end = min(num_frames, i + self.context_length)
+            frame_indices.append((window_start, window_end))
+        return frame_indices
+
+    def _get_frame_weights(self, num_frames: int, weighting_scheme: str = "pyramid") -> List[float]:
+        if weighting_scheme == "flat":
+            weights = [1.0] * num_frames
+
+        elif weighting_scheme == "pyramid":
+            if num_frames % 2 == 0:
+                # num_frames = 4 => [1, 2, 2, 1]
+                mid = num_frames // 2
+                weights = list(range(1, mid + 1))
+                weights = weights + weights[::-1]
+            else:
+                # num_frames = 5 => [1, 2, 3, 2, 1]
+                mid = (num_frames + 1) // 2
+                weights = list(range(1, mid))
+                weights = weights + [mid] + weights[::-1]
+
+        elif weighting_scheme == "delayed_reverse_sawtooth":
+            if num_frames % 2 == 0:
+                # num_frames = 4 => [0.01, 2, 2, 1]
+                mid = num_frames // 2
+                weights = [0.01] * (mid - 1) + [mid]
+                weights = weights + list(range(mid, 0, -1))
+            else:
+                # num_frames = 5 => [0.01, 0.01, 3, 2, 1]
+                mid = (num_frames + 1) // 2
+                weights = [0.01] * mid
+                weights = weights + list(range(mid, 0, -1))
+        else:
+            raise ValueError(f"Unsupported value for weighting_scheme={weighting_scheme}")
+
+        return weights
+
+    def set_free_noise_properties(
+        self, context_length: int, context_stride: int, weighting_scheme: str = "pyramid"
+    ) -> None:
+        self.context_length = context_length
+        self.context_stride = context_stride
+        self.weighting_scheme = weighting_scheme
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0) -> None:
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+
+        # hidden_states: [B x H x W, F, C]
+        device = hidden_states.device
+        dtype = hidden_states.dtype
+
+        num_frames = hidden_states.size(1)
+        frame_indices = self._get_frame_indices(num_frames)
+        frame_weights = self._get_frame_weights(self.context_length, self.weighting_scheme)
+        frame_weights = torch.tensor(frame_weights, device=device, dtype=dtype).unsqueeze(0).unsqueeze(-1)
+        is_last_frame_batch_complete = frame_indices[-1][1] == num_frames
+
+        # Handle out-of-bounds case if num_frames isn't perfectly divisible by context_length
+        # For example, num_frames=25, context_length=16, context_stride=4, then we expect the ranges:
+        #    [(0, 16), (4, 20), (8, 24), (10, 26)]
+        if not is_last_frame_batch_complete:
+            if num_frames < self.context_length:
+                raise ValueError(f"Expected {num_frames} to be greater or equal than {self.context_length}")
+            last_frame_batch_length = num_frames - frame_indices[-1][1]
+            frame_indices.append((num_frames - self.context_length, num_frames))
+
+        num_times_accumulated = torch.zeros((1, num_frames, 1), device=device)
+        accumulated_values = torch.zeros_like(hidden_states)
+
+        for i, (frame_start, frame_end) in enumerate(frame_indices):
+            # The reason for slicing here is to ensure that if (frame_end - frame_start) is to handle
+            # cases like frame_indices=[(0, 16), (16, 20)], if the user provided a video with 19 frames, or
+            # essentially a non-multiple of `context_length`.
+            weights = torch.ones_like(num_times_accumulated[:, frame_start:frame_end])
+            weights *= frame_weights
+
+            hidden_states_chunk = hidden_states[:, frame_start:frame_end]
+
+            # Notice that normalization is always applied before the real computation in the following blocks.
+            # 1. Self-Attention
+            norm_hidden_states = self.norm1(hidden_states_chunk)
+
+            if self.pos_embed is not None:
+                norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+            attn_output = self.attn1(
+                norm_hidden_states,
+                encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                attention_mask=attention_mask,
+                **cross_attention_kwargs,
+            )
+
+            hidden_states_chunk = attn_output + hidden_states_chunk
+            if hidden_states_chunk.ndim == 4:
+                hidden_states_chunk = hidden_states_chunk.squeeze(1)
+
+            # 2. Cross-Attention
+            if self.attn2 is not None:
+                norm_hidden_states = self.norm2(hidden_states_chunk)
+
+                if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+                    norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+                attn_output = self.attn2(
+                    norm_hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=encoder_attention_mask,
+                    **cross_attention_kwargs,
+                )
+                hidden_states_chunk = attn_output + hidden_states_chunk
+
+            if i == len(frame_indices) - 1 and not is_last_frame_batch_complete:
+                accumulated_values[:, -last_frame_batch_length:] += (
+                    hidden_states_chunk[:, -last_frame_batch_length:] * weights[:, -last_frame_batch_length:]
+                )
+                num_times_accumulated[:, -last_frame_batch_length:] += weights[:, -last_frame_batch_length]
+            else:
+                accumulated_values[:, frame_start:frame_end] += hidden_states_chunk * weights
+                num_times_accumulated[:, frame_start:frame_end] += weights
+
+        # TODO(aryan): Maybe this could be done in a better way.
+        #
+        # Previously, this was:
+        # hidden_states = torch.where(
+        #    num_times_accumulated > 0, accumulated_values / num_times_accumulated, accumulated_values
+        # )
+        #
+        # The reasoning for the change here is `torch.where` became a bottleneck at some point when golfing memory
+        # spikes. It is particularly noticeable when the number of frames is high. My understanding is that this comes
+        # from tensors being copied - which is why we resort to spliting and concatenating here. I've not particularly
+        # looked into this deeply because other memory optimizations led to more pronounced reductions.
+        hidden_states = torch.cat(
+            [
+                torch.where(num_times_split > 0, accumulated_split / num_times_split, accumulated_split)
+                for accumulated_split, num_times_split in zip(
+                    accumulated_values.split(self.context_length, dim=1),
+                    num_times_accumulated.split(self.context_length, dim=1),
+                )
+            ],
+            dim=1,
+        ).to(dtype)
+
+        # 3. Feed-forward
+        norm_hidden_states = self.norm3(hidden_states)
+
+        if self._chunk_size is not None:
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            ff_output = self.ff(norm_hidden_states)
+
+        hidden_states = ff_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+
+        return hidden_states
+
+
+class FeedForward(nn.Module):
+    r"""
+    A feed-forward layer.
+
+    Parameters:
+        dim (`int`): The number of channels in the input.
+        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
+        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        dim_out: Optional[int] = None,
+        mult: int = 4,
+        dropout: float = 0.0,
+        activation_fn: str = "geglu",
+        final_dropout: bool = False,
+        inner_dim=None,
+        bias: bool = True,
+    ):
+        super().__init__()
+        if inner_dim is None:
+            inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+
+        if activation_fn == "gelu":
+            act_fn = GELU(dim, inner_dim, bias=bias)
+        if activation_fn == "gelu-approximate":
+            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
+        elif activation_fn == "geglu":
+            act_fn = GEGLU(dim, inner_dim, bias=bias)
+        elif activation_fn == "geglu-approximate":
+            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
+        elif activation_fn == "swiglu":
+            act_fn = SwiGLU(dim, inner_dim, bias=bias)
+
+        self.net = nn.ModuleList([])
+        # project in
+        self.net.append(act_fn)
+        # project dropout
+        self.net.append(nn.Dropout(dropout))
+        # project out
+        self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
+        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
+        if final_dropout:
+            self.net.append(nn.Dropout(dropout))
+
+    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+        for module in self.net:
+            hidden_states = module(hidden_states)
+        return hidden_states
+
+
+@maybe_allow_in_graph
+class CausalTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (:
+            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (:
+            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+        final_dropout (`bool` *optional*, defaults to False):
+            Whether to apply a final dropout after the last feed-forward layer.
+        attention_type (`str`, *optional*, defaults to `"default"`):
+            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+        positional_embeddings (`str`, *optional*, defaults to `None`):
+            The type of positional embeddings to apply to.
+        num_positional_embeddings (`int`, *optional*, defaults to `None`):
+            The maximum number of positional embeddings to apply.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+        norm_eps: float = 1e-5,
+        final_dropout: bool = False,
+        attention_type: str = "default",
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
+        ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
+        ada_norm_bias: Optional[int] = None,
+        ff_inner_dim: Optional[int] = None,
+        ff_bias: bool = True,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.dropout = dropout
+        self.cross_attention_dim = cross_attention_dim
+        self.activation_fn = activation_fn
+        self.attention_bias = attention_bias
+        self.double_self_attention = double_self_attention
+        self.norm_elementwise_affine = norm_elementwise_affine
+        self.positional_embeddings = positional_embeddings
+        self.num_positional_embeddings = num_positional_embeddings
+        self.only_cross_attention = only_cross_attention
+
+        # We keep these boolean flags for backward-compatibility.
+        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+        self.use_layer_norm = norm_type == "layer_norm"
+        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+
+        self.norm_type = norm_type
+        self.num_embeds_ada_norm = num_embeds_ada_norm
+
+        if positional_embeddings and (num_positional_embeddings is None):
+            raise ValueError(
+                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+            )
+
+        if positional_embeddings == "sinusoidal":
+            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+        else:
+            self.pos_embed = None
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        if norm_type == "ada_norm":
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        elif norm_type == "ada_norm_zero":
+            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+        elif norm_type == "ada_norm_continuous":
+            self.norm1 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "rms_norm",
+            )
+        else:
+            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+        self.attn1 = CausalAttention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+            out_bias=attention_out_bias,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            if norm_type == "ada_norm":
+                self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+            elif norm_type == "ada_norm_continuous":
+                self.norm2 = AdaLayerNormContinuous(
+                    dim,
+                    ada_norm_continous_conditioning_embedding_dim,
+                    norm_elementwise_affine,
+                    norm_eps,
+                    ada_norm_bias,
+                    "rms_norm",
+                )
+            else:
+                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+                out_bias=attention_out_bias,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            if norm_type == "ada_norm_single":  # For Latte
+                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+            else:
+                self.norm2 = None
+            self.attn2 = None
+
+        # 3. Feed-forward
+        if norm_type == "ada_norm_continuous":
+            self.norm3 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "layer_norm",
+            )
+
+        elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm"]:
+            self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+        elif norm_type == "layer_norm_i2vgen":
+            self.norm3 = None
+
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+            inner_dim=ff_inner_dim,
+            bias=ff_bias,
+        )
+
+        # 4. Fuser
+        if attention_type == "gated" or attention_type == "gated-text-image":
+            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
+        # 5. Scale-shift for PixArt-Alpha.
+        if norm_type == "ada_norm_single":
+            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor, # (b, l, c)
+        # cond_hidden_states: torch.Tensor, # (b, l, c)
+        ref_hidden_states: torch.Tensor, # (b, n_ref, l, c)
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+        N_ref = ref_hidden_states.shape[1]
+        flat_ref_hidden_states = rearrange(ref_hidden_states, 'b n_ref l c -> (b n_ref) l c')
+
+        if self.norm_type == "ada_norm":
+            norm_hidden_states = self.norm1(hidden_states, timestep)
+            # norm_cond_hidden_states = self.norm1(cond_hidden_states, timestep)
+            norm_ref_hidden_states = self.norm1(flat_ref_hidden_states, timestep)
+        elif self.norm_type == "ada_norm_zero":
+            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+            )
+            # norm_cond_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+            #     cond_hidden_states, timestep, class_labels, hidden_dtype=cond_hidden_states.dtype
+            # )
+            norm_ref_hidden_states = self.norm1(flat_ref_hidden_states, timestep, class_labels, hidden_dtype=flat_ref_hidden_states.dtype)
+        elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
+            norm_hidden_states = self.norm1(hidden_states)
+            # norm_cond_hidden_states = self.norm1(cond_hidden_states)
+            norm_ref_hidden_states = self.norm1(flat_ref_hidden_states)
+        elif self.norm_type == "ada_norm_continuous":
+            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+            # norm_cond_hidden_states = self.norm1(cond_hidden_states, added_cond_kwargs["pooled_text_emb"])
+            norm_ref_hidden_states = self.norm1(flat_ref_hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif self.norm_type == "ada_norm_single":
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+            ).chunk(6, dim=1)
+            norm_hidden_states = self.norm1(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+
+            # norm_cond_hidden_states = self.norm1(cond_hidden_states)
+            # norm_cond_hidden_states = norm_cond_hidden_states * (1 + scale_msa) + shift_msa
+
+            norm_ref_hidden_states = self.norm1(flat_ref_hidden_states)
+            norm_ref_hidden_states = rearrange(norm_ref_hidden_states, '(b n_ref) l c -> b (n_ref l) c', n_ref=N_ref)
+            norm_ref_hidden_states = norm_ref_hidden_states * (1 + scale_msa) + shift_msa
+            norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b (n_ref l) c -> (b n_ref) l c', n_ref=N_ref)
+        else:
+            raise ValueError("Incorrect norm used")
+
+        if self.pos_embed is not None:
+            norm_hidden_states = self.pos_embed(norm_hidden_states)
+            # norm_cond_hidden_states = self.pos_embed(norm_cond_hidden_states)
+            norm_ref_hidden_states = self.pos_embed(norm_ref_hidden_states)
+
+        # 1. Prepare GLIGEN inputs
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+
+        norm_ref_hidden_states = rearrange(norm_ref_hidden_states, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+        attn_output, attn_output_ref = self.attn1(
+            norm_hidden_states,
+            norm_ref_hidden_states,
+            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+
+        if self.norm_type == "ada_norm_zero":
+            attn_output = gate_msa.unsqueeze(1) * attn_output
+            # attn_output_cond = gate_msa.unsqueeze(1) * attn_output_cond
+        elif self.norm_type == "ada_norm_single":
+            attn_output = gate_msa * attn_output
+            # attn_output_cond = gate_msa * attn_output_cond
+            attn_output_ref = rearrange(attn_output_ref, 'b n_ref l c -> b (n_ref l) c', n_ref=N_ref)
+            attn_output_ref = gate_msa * attn_output_ref
+            attn_output_ref = rearrange(attn_output_ref, 'b (n_ref l) c -> b n_ref l c', n_ref=N_ref)
+
+        hidden_states = attn_output + hidden_states # (b, l, c)
+        # cond_hidden_states = attn_output_cond + cond_hidden_states # (b, l, c)
+
+        ref_hidden_states = attn_output_ref + ref_hidden_states # (b, n_ref, l, c)
+
+        reshape_ref_hidden_states = rearrange(ref_hidden_states, 'b n_ref l c -> b (n_ref l) c', n_ref=N_ref)
+        hidden_states_seq_len = hidden_states.shape[1]
+        # residual_cross = torch.cat([hidden_states, cond_hidden_states, reshape_ref_hidden_states], dim=1)
+        residual_cross = torch.cat([hidden_states, reshape_ref_hidden_states], dim=1)
+
+        if hidden_states.ndim != 3:
+            raise ValueError("hidden_states.ndim should be 3")
+
+
+
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            if self.norm_type == "ada_norm":
+                norm_hidden_states = self.norm2(hidden_states, timestep)
+                # norm_cond_hidden_states = self.norm2(cond_hidden_states, timestep)
+            elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
+                norm_hidden_states = self.norm2(hidden_states)
+                # norm_cond_hidden_states = self.norm2(cond_hidden_states)
+            elif self.norm_type == "ada_norm_single":
+                # For PixArt norm2 isn't applied here:
+                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+                norm_hidden_states = hidden_states
+                # norm_cond_hidden_states = cond_hidden_states
+                norm_ref_hidden_states = ref_hidden_states
+            elif self.norm_type == "ada_norm_continuous":
+                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+                # norm_cond_hidden_states = self.norm2(cond_hidden_states, added_cond_kwargs["pooled_text_emb"])
+            else:
+                raise ValueError("Incorrect norm")
+
+            if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+                norm_hidden_states = self.pos_embed(norm_hidden_states)
+                # norm_cond_hidden_states = self.pos_embed(norm_cond_hidden_states)
+                norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b n_ref l c -> (b n_ref) l c', n_ref=N_ref)
+                norm_ref_hidden_states = self.pos_embed(norm_ref_hidden_states)
+                norm_ref_hidden_states = rearrange(norm_ref_hidden_states, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+
+            norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b n_ref l c -> b (n_ref l) c', n_ref=N_ref)
+            
+            norm_hidden_states_cross = torch.cat([norm_hidden_states, norm_ref_hidden_states], dim=1)
+            attn_output = self.attn2(
+                norm_hidden_states_cross,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                **cross_attention_kwargs,
+            )
+
+            hidden_states_cross = attn_output + residual_cross
+
+            hidden_states = hidden_states_cross[:, :hidden_states_seq_len, :]
+            # cond_hidden_states = hidden_states_cross[:, hidden_states_seq_len:hidden_states_seq_len*2, :]
+            ref_hidden_states = hidden_states_cross[:, hidden_states_seq_len:, :]
+
+            ref_hidden_states = rearrange(ref_hidden_states, 'b (n_ref l) c -> b n_ref l c', n_ref=N_ref)
+
+            # ref_hidden_states = hidden_states_cross[:, 1:, :, :]
+
+            # hidden_states = hidden_states_cross[:, 0, :, :]
+            
+
+        # 4. Feed-forward
+        # i2vgen doesn't have this norm 🤷‍♂️
+        if self.norm_type == "ada_norm_single":
+            # print('ada_norm_single')
+            norm_hidden_states = self.norm2(hidden_states)
+            # print('scale_mlp', scale_mlp.shape, 'shift_mlp', shift_mlp.shape,'norm_hidden_states', norm_hidden_states.shape)
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+            # norm_cond_hidden_states = self.norm2(cond_hidden_states)
+            # norm_cond_hidden_states = norm_cond_hidden_states * (1 + scale_mlp) + shift_mlp
+
+            norm_ref_hidden_states = self.norm2(ref_hidden_states)
+            norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b n_ref l c -> b (n_ref l) c', n_ref=N_ref)
+            # print('scale_mlp', scale_mlp.shape, 'shift_mlp', shift_mlp.shape, 'norm_ref_hidden_states', norm_ref_hidden_states.shape)
+            norm_ref_hidden_states = norm_ref_hidden_states * (1 + scale_mlp) + shift_mlp
+            norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b (n_ref l) c -> b n_ref l c', n_ref=N_ref)
+
+
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            # print('chunk_size', self._chunk_size)
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+            # ff_output_cond = _chunked_feed_forward(self.ff, norm_cond_hidden_states, self._chunk_dim, self._chunk_size)
+            norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b n_ref l c -> (b n_ref) l c', n_ref=N_ref)
+            ff_output_ref = _chunked_feed_forward(self.ff, norm_ref_hidden_states, self._chunk_dim, self._chunk_size)
+            ff_output_ref = rearrange(ff_output_ref, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+
+        else:
+            # print('no chunk_size')
+            ff_output = self.ff(norm_hidden_states)
+            # ff_output_cond = self.ff(norm_cond_hidden_states)
+            norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b n_ref l c -> (b n_ref) l c', n_ref=N_ref)
+            ff_output_ref = self.ff(norm_ref_hidden_states)
+            ff_output_ref = rearrange(ff_output_ref, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+
+        if self.norm_type == "ada_norm_zero":
+            ff_output = gate_mlp.unsqueeze(1) * ff_output
+        elif self.norm_type == "ada_norm_single":
+            ff_output = gate_mlp * ff_output
+            # ff_output_cond = gate_mlp * ff_output_cond
+            ff_output_ref = rearrange(ff_output_ref, 'b n_ref l c -> b (n_ref l) c', n_ref=N_ref)
+            ff_output_ref = gate_mlp * ff_output_ref
+            ff_output_ref = rearrange(ff_output_ref, 'b (n_ref l) c -> b n_ref l c', n_ref=N_ref)
+
+        hidden_states = ff_output + hidden_states
+        # cond_hidden_states = ff_output_cond + cond_hidden_states
+        ref_hidden_states = ff_output_ref + ref_hidden_states
+
+
+        return hidden_states, ref_hidden_states
+
+
+@maybe_allow_in_graph
+class CausalTransformerKVcacheBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (:
+            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (:
+            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+        final_dropout (`bool` *optional*, defaults to False):
+            Whether to apply a final dropout after the last feed-forward layer.
+        attention_type (`str`, *optional*, defaults to `"default"`):
+            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+        positional_embeddings (`str`, *optional*, defaults to `None`):
+            The type of positional embeddings to apply to.
+        num_positional_embeddings (`int`, *optional*, defaults to `None`):
+            The maximum number of positional embeddings to apply.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+        norm_eps: float = 1e-5,
+        final_dropout: bool = False,
+        attention_type: str = "default",
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
+        ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
+        ada_norm_bias: Optional[int] = None,
+        ff_inner_dim: Optional[int] = None,
+        ff_bias: bool = True,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.dropout = dropout
+        self.cross_attention_dim = cross_attention_dim
+        self.activation_fn = activation_fn
+        self.attention_bias = attention_bias
+        self.double_self_attention = double_self_attention
+        self.norm_elementwise_affine = norm_elementwise_affine
+        self.positional_embeddings = positional_embeddings
+        self.num_positional_embeddings = num_positional_embeddings
+        self.only_cross_attention = only_cross_attention
+
+        # We keep these boolean flags for backward-compatibility.
+        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+        self.use_layer_norm = norm_type == "layer_norm"
+        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+
+        self.norm_type = norm_type
+        self.num_embeds_ada_norm = num_embeds_ada_norm
+
+        if positional_embeddings and (num_positional_embeddings is None):
+            raise ValueError(
+                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+            )
+
+        if positional_embeddings == "sinusoidal":
+            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+        else:
+            self.pos_embed = None
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        if norm_type == "ada_norm":
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        elif norm_type == "ada_norm_zero":
+            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+        elif norm_type == "ada_norm_continuous":
+            self.norm1 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "rms_norm",
+            )
+        else:
+            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+        self.attn1 = CausalKVcacheAttention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+            out_bias=attention_out_bias,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            if norm_type == "ada_norm":
+                self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+            elif norm_type == "ada_norm_continuous":
+                self.norm2 = AdaLayerNormContinuous(
+                    dim,
+                    ada_norm_continous_conditioning_embedding_dim,
+                    norm_elementwise_affine,
+                    norm_eps,
+                    ada_norm_bias,
+                    "rms_norm",
+                )
+            else:
+                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+                out_bias=attention_out_bias,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            if norm_type == "ada_norm_single":  # For Latte
+                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+            else:
+                self.norm2 = None
+            self.attn2 = None
+
+        # 3. Feed-forward
+        if norm_type == "ada_norm_continuous":
+            self.norm3 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "layer_norm",
+            )
+
+        elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm"]:
+            self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+        elif norm_type == "layer_norm_i2vgen":
+            self.norm3 = None
+
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+            inner_dim=ff_inner_dim,
+            bias=ff_bias,
+        )
+
+        # 4. Fuser
+        if attention_type == "gated" or attention_type == "gated-text-image":
+            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
+        # 5. Scale-shift for PixArt-Alpha.
+        if norm_type == "ada_norm_single":
+            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor, # (b, l, c)
+        # cond_hidden_states: torch.Tensor, # (b, l, c)
+        ref_hidden_states: torch.Tensor, # (b, n_ref, l, c)
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        K_cache: Optional[torch.Tensor] = None,
+        V_cache: Optional[torch.Tensor] = None,
+        timestep_ref: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+
+        no_cache = K_cache is None and V_cache is None
+
+        if no_cache:
+            N_ref = ref_hidden_states.shape[1]
+            flat_ref_hidden_states = rearrange(ref_hidden_states, 'b n_ref l c -> (b n_ref) l c')
+
+        if self.norm_type == "ada_norm":
+            norm_hidden_states = self.norm1(hidden_states, timestep)
+            if no_cache:
+                norm_ref_hidden_states = self.norm1(flat_ref_hidden_states, timestep_ref)
+        elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
+            norm_hidden_states = self.norm1(hidden_states)
+            if no_cache:
+                norm_ref_hidden_states = self.norm1(flat_ref_hidden_states)
+        elif self.norm_type == "ada_norm_continuous":
+            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+            if no_cache:
+                norm_ref_hidden_states = self.norm1(flat_ref_hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif self.norm_type == "ada_norm_single":
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+            ).chunk(6, dim=1)
+
+
+
+            norm_hidden_states = self.norm1(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+
+
+            if no_cache:
+                shift_msa_ref, scale_msa_ref, gate_msa_ref, shift_mlp_ref, scale_mlp_ref, gate_mlp_ref = (
+                    self.scale_shift_table[None] + timestep_ref.reshape(batch_size, 6, -1)
+                ).chunk(6, dim=1)
+    
+                norm_ref_hidden_states = self.norm1(flat_ref_hidden_states)
+                norm_ref_hidden_states = rearrange(norm_ref_hidden_states, '(b n_ref) l c -> b (n_ref l) c', n_ref=N_ref)
+                norm_ref_hidden_states = norm_ref_hidden_states * (1 + scale_msa_ref) + shift_msa_ref
+                norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b (n_ref l) c -> (b n_ref) l c', n_ref=N_ref)
+        else:
+            raise ValueError("Incorrect norm used")
+
+        if self.pos_embed is not None:
+            norm_hidden_states = self.pos_embed(norm_hidden_states)
+            # norm_cond_hidden_states = self.pos_embed(norm_cond_hidden_states)
+            if no_cache:
+                norm_ref_hidden_states = self.pos_embed(norm_ref_hidden_states)
+
+        # 1. Prepare GLIGEN inputs
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+
+
+        if no_cache:
+            norm_ref_hidden_states = rearrange(norm_ref_hidden_states, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+            attn_output, attn_output_ref, K_sample, V_sample = self.attn1(
+                norm_hidden_states,
+                norm_ref_hidden_states,
+                encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                attention_mask=attention_mask,
+                K_cache = None,
+                V_cache = None,
+                **cross_attention_kwargs,
+            )
+        else:
+            attn_output, _, _, _ = self.attn1(
+                norm_hidden_states,
+                None,
+                encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                attention_mask=attention_mask,
+                K_cache = K_cache,
+                V_cache = V_cache,
+                **cross_attention_kwargs,
+            )
+
+        if self.norm_type == "ada_norm_zero":
+            attn_output = gate_msa.unsqueeze(1) * attn_output
+        elif self.norm_type == "ada_norm_single":
+            attn_output = gate_msa * attn_output
+            if no_cache:
+                attn_output_ref = rearrange(attn_output_ref, 'b n_ref l c -> b (n_ref l) c', n_ref=N_ref)
+                attn_output_ref = gate_msa_ref * attn_output_ref
+                attn_output_ref = rearrange(attn_output_ref, 'b (n_ref l) c -> b n_ref l c', n_ref=N_ref)
+
+        hidden_states = attn_output + hidden_states # (b, l, c)
+        
+        if no_cache:
+            ref_hidden_states = attn_output_ref + ref_hidden_states # (b, n_ref, l, c)
+
+            reshape_ref_hidden_states = rearrange(ref_hidden_states, 'b n_ref l c -> b (n_ref l) c', n_ref=N_ref)
+            hidden_states_seq_len = hidden_states.shape[1]
+            residual_cross = torch.cat([hidden_states, reshape_ref_hidden_states], dim=1)
+
+        else:
+            hidden_states_seq_len = hidden_states.shape[1]
+            residual_cross = hidden_states
+
+        if hidden_states.ndim != 3:
+            raise ValueError("hidden_states.ndim should be 3")
+
+
+
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            if self.norm_type == "ada_norm":
+                norm_hidden_states = self.norm2(hidden_states, timestep)
+                
+            elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
+                norm_hidden_states = self.norm2(hidden_states)
+                # norm_cond_hidden_states = self.norm2(cond_hidden_states)
+            elif self.norm_type == "ada_norm_single":
+                # For PixArt norm2 isn't applied here:
+                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+                norm_hidden_states = hidden_states
+                if no_cache:
+                    norm_ref_hidden_states = ref_hidden_states
+            elif self.norm_type == "ada_norm_continuous":
+                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+                # norm_cond_hidden_states = self.norm2(cond_hidden_states, added_cond_kwargs["pooled_text_emb"])
+            else:
+                raise ValueError("Incorrect norm")
+
+            if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+                norm_hidden_states = self.pos_embed(norm_hidden_states)
+                if no_cache:
+                    norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b n_ref l c -> (b n_ref) l c', n_ref=N_ref)
+                    norm_ref_hidden_states = self.pos_embed(norm_ref_hidden_states)
+                    norm_ref_hidden_states = rearrange(norm_ref_hidden_states, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+                    
+            
+            if no_cache:
+                norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b n_ref l c -> b (n_ref l) c', n_ref=N_ref)
+                norm_hidden_states_cross = torch.cat([norm_hidden_states, norm_ref_hidden_states], dim=1)
+            else:
+                norm_hidden_states_cross = norm_hidden_states
+
+            attn_output = self.attn2(
+                norm_hidden_states_cross,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                **cross_attention_kwargs,
+            )
+
+
+            hidden_states_cross = attn_output + residual_cross
+
+
+            if no_cache:
+                hidden_states = hidden_states_cross[:, :hidden_states_seq_len, :]
+                ref_hidden_states = hidden_states_cross[:, hidden_states_seq_len:, :]
+                ref_hidden_states = rearrange(ref_hidden_states, 'b (n_ref l) c -> b n_ref l c', n_ref=N_ref)
+            else:
+                hidden_states = hidden_states_cross
+
+
+            
+
+        # 4. Feed-forward
+        # i2vgen doesn't have this norm 🤷‍♂️
+        if self.norm_type == "ada_norm_single":
+            norm_hidden_states = self.norm2(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+            
+            if no_cache:
+                norm_ref_hidden_states = self.norm2(ref_hidden_states)
+                norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b n_ref l c -> b (n_ref l) c', n_ref=N_ref)
+                norm_ref_hidden_states = norm_ref_hidden_states * (1 + scale_mlp_ref) + shift_mlp_ref
+                norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b (n_ref l) c -> b n_ref l c', n_ref=N_ref)
+
+
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            # print('chunk_size', self._chunk_size)
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+            
+            if no_cache:
+                norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b n_ref l c -> (b n_ref) l c', n_ref=N_ref)
+                ff_output_ref = _chunked_feed_forward(self.ff, norm_ref_hidden_states, self._chunk_dim, self._chunk_size)
+                ff_output_ref = rearrange(ff_output_ref, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+
+        else:
+            # print('no chunk_size')
+            ff_output = self.ff(norm_hidden_states)
+            
+            if no_cache:
+                norm_ref_hidden_states = rearrange(norm_ref_hidden_states, 'b n_ref l c -> (b n_ref) l c', n_ref=N_ref)
+                ff_output_ref = self.ff(norm_ref_hidden_states)
+                ff_output_ref = rearrange(ff_output_ref, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+
+        if self.norm_type == "ada_norm_zero":
+            ff_output = gate_mlp.unsqueeze(1) * ff_output
+        elif self.norm_type == "ada_norm_single":
+            ff_output = gate_mlp * ff_output
+            
+            if no_cache:
+                ff_output_ref = rearrange(ff_output_ref, 'b n_ref l c -> b (n_ref l) c', n_ref=N_ref)
+                ff_output_ref = gate_mlp_ref * ff_output_ref
+                ff_output_ref = rearrange(ff_output_ref, 'b (n_ref l) c -> b n_ref l c', n_ref=N_ref)
+
+        hidden_states = ff_output + hidden_states
+        
+        if no_cache:
+            ref_hidden_states = ff_output_ref + ref_hidden_states
+
+
+        if no_cache:
+            return hidden_states, ref_hidden_states, K_sample, V_sample
+        else:
+            return hidden_states, None, None, None
diff --git a/diffusers/src/diffusers/models/attention_flax.py b/diffusers/src/diffusers/models/attention_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..25ae5d0a5d6344d566a731c0b1e18f54e4d42581
--- /dev/null
+++ b/diffusers/src/diffusers/models/attention_flax.py
@@ -0,0 +1,494 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import math
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+
+
+def _query_chunk_attention(query, key, value, precision, key_chunk_size: int = 4096):
+    """Multi-head dot product attention with a limited number of queries."""
+    num_kv, num_heads, k_features = key.shape[-3:]
+    v_features = value.shape[-1]
+    key_chunk_size = min(key_chunk_size, num_kv)
+    query = query / jnp.sqrt(k_features)
+
+    @functools.partial(jax.checkpoint, prevent_cse=False)
+    def summarize_chunk(query, key, value):
+        attn_weights = jnp.einsum("...qhd,...khd->...qhk", query, key, precision=precision)
+
+        max_score = jnp.max(attn_weights, axis=-1, keepdims=True)
+        max_score = jax.lax.stop_gradient(max_score)
+        exp_weights = jnp.exp(attn_weights - max_score)
+
+        exp_values = jnp.einsum("...vhf,...qhv->...qhf", value, exp_weights, precision=precision)
+        max_score = jnp.einsum("...qhk->...qh", max_score)
+
+        return (exp_values, exp_weights.sum(axis=-1), max_score)
+
+    def chunk_scanner(chunk_idx):
+        # julienne key array
+        key_chunk = jax.lax.dynamic_slice(
+            operand=key,
+            start_indices=[0] * (key.ndim - 3) + [chunk_idx, 0, 0],  # [...,k,h,d]
+            slice_sizes=list(key.shape[:-3]) + [key_chunk_size, num_heads, k_features],  # [...,k,h,d]
+        )
+
+        # julienne value array
+        value_chunk = jax.lax.dynamic_slice(
+            operand=value,
+            start_indices=[0] * (value.ndim - 3) + [chunk_idx, 0, 0],  # [...,v,h,d]
+            slice_sizes=list(value.shape[:-3]) + [key_chunk_size, num_heads, v_features],  # [...,v,h,d]
+        )
+
+        return summarize_chunk(query, key_chunk, value_chunk)
+
+    chunk_values, chunk_weights, chunk_max = jax.lax.map(f=chunk_scanner, xs=jnp.arange(0, num_kv, key_chunk_size))
+
+    global_max = jnp.max(chunk_max, axis=0, keepdims=True)
+    max_diffs = jnp.exp(chunk_max - global_max)
+
+    chunk_values *= jnp.expand_dims(max_diffs, axis=-1)
+    chunk_weights *= max_diffs
+
+    all_values = chunk_values.sum(axis=0)
+    all_weights = jnp.expand_dims(chunk_weights, -1).sum(axis=0)
+
+    return all_values / all_weights
+
+
+def jax_memory_efficient_attention(
+    query, key, value, precision=jax.lax.Precision.HIGHEST, query_chunk_size: int = 1024, key_chunk_size: int = 4096
+):
+    r"""
+    Flax Memory-efficient multi-head dot product attention. https://arxiv.org/abs/2112.05682v2
+    https://github.com/AminRezaei0x443/memory-efficient-attention
+
+    Args:
+        query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
+        key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
+        value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
+        precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
+            numerical precision for computation
+        query_chunk_size (`int`, *optional*, defaults to 1024):
+            chunk size to divide query array value must divide query_length equally without remainder
+        key_chunk_size (`int`, *optional*, defaults to 4096):
+            chunk size to divide key and value array value must divide key_value_length equally without remainder
+
+    Returns:
+        (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
+    """
+    num_q, num_heads, q_features = query.shape[-3:]
+
+    def chunk_scanner(chunk_idx, _):
+        # julienne query array
+        query_chunk = jax.lax.dynamic_slice(
+            operand=query,
+            start_indices=([0] * (query.ndim - 3)) + [chunk_idx, 0, 0],  # [...,q,h,d]
+            slice_sizes=list(query.shape[:-3]) + [min(query_chunk_size, num_q), num_heads, q_features],  # [...,q,h,d]
+        )
+
+        return (
+            chunk_idx + query_chunk_size,  # unused ignore it
+            _query_chunk_attention(
+                query=query_chunk, key=key, value=value, precision=precision, key_chunk_size=key_chunk_size
+            ),
+        )
+
+    _, res = jax.lax.scan(
+        f=chunk_scanner,
+        init=0,
+        xs=None,
+        length=math.ceil(num_q / query_chunk_size),  # start counter  # stop counter
+    )
+
+    return jnp.concatenate(res, axis=-3)  # fuse the chunked result back
+
+
+class FlaxAttention(nn.Module):
+    r"""
+    A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762
+
+    Parameters:
+        query_dim (:obj:`int`):
+            Input hidden states dimension
+        heads (:obj:`int`, *optional*, defaults to 8):
+            Number of heads
+        dim_head (:obj:`int`, *optional*, defaults to 64):
+            Hidden states dimension inside each head
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+        split_head_dim (`bool`, *optional*, defaults to `False`):
+            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
+            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+
+    """
+
+    query_dim: int
+    heads: int = 8
+    dim_head: int = 64
+    dropout: float = 0.0
+    use_memory_efficient_attention: bool = False
+    split_head_dim: bool = False
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        inner_dim = self.dim_head * self.heads
+        self.scale = self.dim_head**-0.5
+
+        # Weights were exported with old names {to_q, to_k, to_v, to_out}
+        self.query = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_q")
+        self.key = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_k")
+        self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v")
+
+        self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0")
+        self.dropout_layer = nn.Dropout(rate=self.dropout)
+
+    def reshape_heads_to_batch_dim(self, tensor):
+        batch_size, seq_len, dim = tensor.shape
+        head_size = self.heads
+        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
+        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
+        tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
+        return tensor
+
+    def reshape_batch_dim_to_heads(self, tensor):
+        batch_size, seq_len, dim = tensor.shape
+        head_size = self.heads
+        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
+        tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size)
+        return tensor
+
+    def __call__(self, hidden_states, context=None, deterministic=True):
+        context = hidden_states if context is None else context
+
+        query_proj = self.query(hidden_states)
+        key_proj = self.key(context)
+        value_proj = self.value(context)
+
+        if self.split_head_dim:
+            b = hidden_states.shape[0]
+            query_states = jnp.reshape(query_proj, (b, -1, self.heads, self.dim_head))
+            key_states = jnp.reshape(key_proj, (b, -1, self.heads, self.dim_head))
+            value_states = jnp.reshape(value_proj, (b, -1, self.heads, self.dim_head))
+        else:
+            query_states = self.reshape_heads_to_batch_dim(query_proj)
+            key_states = self.reshape_heads_to_batch_dim(key_proj)
+            value_states = self.reshape_heads_to_batch_dim(value_proj)
+
+        if self.use_memory_efficient_attention:
+            query_states = query_states.transpose(1, 0, 2)
+            key_states = key_states.transpose(1, 0, 2)
+            value_states = value_states.transpose(1, 0, 2)
+
+            # this if statement create a chunk size for each layer of the unet
+            # the chunk size is equal to the query_length dimension of the deepest layer of the unet
+
+            flatten_latent_dim = query_states.shape[-3]
+            if flatten_latent_dim % 64 == 0:
+                query_chunk_size = int(flatten_latent_dim / 64)
+            elif flatten_latent_dim % 16 == 0:
+                query_chunk_size = int(flatten_latent_dim / 16)
+            elif flatten_latent_dim % 4 == 0:
+                query_chunk_size = int(flatten_latent_dim / 4)
+            else:
+                query_chunk_size = int(flatten_latent_dim)
+
+            hidden_states = jax_memory_efficient_attention(
+                query_states, key_states, value_states, query_chunk_size=query_chunk_size, key_chunk_size=4096 * 4
+            )
+
+            hidden_states = hidden_states.transpose(1, 0, 2)
+        else:
+            # compute attentions
+            if self.split_head_dim:
+                attention_scores = jnp.einsum("b t n h, b f n h -> b n f t", key_states, query_states)
+            else:
+                attention_scores = jnp.einsum("b i d, b j d->b i j", query_states, key_states)
+
+            attention_scores = attention_scores * self.scale
+            attention_probs = nn.softmax(attention_scores, axis=-1 if self.split_head_dim else 2)
+
+            # attend to values
+            if self.split_head_dim:
+                hidden_states = jnp.einsum("b n f t, b t n h -> b f n h", attention_probs, value_states)
+                b = hidden_states.shape[0]
+                hidden_states = jnp.reshape(hidden_states, (b, -1, self.heads * self.dim_head))
+            else:
+                hidden_states = jnp.einsum("b i j, b j d -> b i d", attention_probs, value_states)
+                hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
+
+        hidden_states = self.proj_attn(hidden_states)
+        return self.dropout_layer(hidden_states, deterministic=deterministic)
+
+
+class FlaxBasicTransformerBlock(nn.Module):
+    r"""
+    A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
+    https://arxiv.org/abs/1706.03762
+
+
+    Parameters:
+        dim (:obj:`int`):
+            Inner hidden states dimension
+        n_heads (:obj:`int`):
+            Number of heads
+        d_head (:obj:`int`):
+            Hidden states dimension inside each head
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        only_cross_attention (`bool`, defaults to `False`):
+            Whether to only apply cross attention.
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+        split_head_dim (`bool`, *optional*, defaults to `False`):
+            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
+            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
+    """
+
+    dim: int
+    n_heads: int
+    d_head: int
+    dropout: float = 0.0
+    only_cross_attention: bool = False
+    dtype: jnp.dtype = jnp.float32
+    use_memory_efficient_attention: bool = False
+    split_head_dim: bool = False
+
+    def setup(self):
+        # self attention (or cross_attention if only_cross_attention is True)
+        self.attn1 = FlaxAttention(
+            self.dim,
+            self.n_heads,
+            self.d_head,
+            self.dropout,
+            self.use_memory_efficient_attention,
+            self.split_head_dim,
+            dtype=self.dtype,
+        )
+        # cross attention
+        self.attn2 = FlaxAttention(
+            self.dim,
+            self.n_heads,
+            self.d_head,
+            self.dropout,
+            self.use_memory_efficient_attention,
+            self.split_head_dim,
+            dtype=self.dtype,
+        )
+        self.ff = FlaxFeedForward(dim=self.dim, dropout=self.dropout, dtype=self.dtype)
+        self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
+        self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
+        self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
+        self.dropout_layer = nn.Dropout(rate=self.dropout)
+
+    def __call__(self, hidden_states, context, deterministic=True):
+        # self attention
+        residual = hidden_states
+        if self.only_cross_attention:
+            hidden_states = self.attn1(self.norm1(hidden_states), context, deterministic=deterministic)
+        else:
+            hidden_states = self.attn1(self.norm1(hidden_states), deterministic=deterministic)
+        hidden_states = hidden_states + residual
+
+        # cross attention
+        residual = hidden_states
+        hidden_states = self.attn2(self.norm2(hidden_states), context, deterministic=deterministic)
+        hidden_states = hidden_states + residual
+
+        # feed forward
+        residual = hidden_states
+        hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic)
+        hidden_states = hidden_states + residual
+
+        return self.dropout_layer(hidden_states, deterministic=deterministic)
+
+
+class FlaxTransformer2DModel(nn.Module):
+    r"""
+    A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
+    https://arxiv.org/pdf/1506.02025.pdf
+
+
+    Parameters:
+        in_channels (:obj:`int`):
+            Input number of channels
+        n_heads (:obj:`int`):
+            Number of heads
+        d_head (:obj:`int`):
+            Hidden states dimension inside each head
+        depth (:obj:`int`, *optional*, defaults to 1):
+            Number of transformers block
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        use_linear_projection (`bool`, defaults to `False`): tbd
+        only_cross_attention (`bool`, defaults to `False`): tbd
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+        split_head_dim (`bool`, *optional*, defaults to `False`):
+            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
+            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
+    """
+
+    in_channels: int
+    n_heads: int
+    d_head: int
+    depth: int = 1
+    dropout: float = 0.0
+    use_linear_projection: bool = False
+    only_cross_attention: bool = False
+    dtype: jnp.dtype = jnp.float32
+    use_memory_efficient_attention: bool = False
+    split_head_dim: bool = False
+
+    def setup(self):
+        self.norm = nn.GroupNorm(num_groups=32, epsilon=1e-5)
+
+        inner_dim = self.n_heads * self.d_head
+        if self.use_linear_projection:
+            self.proj_in = nn.Dense(inner_dim, dtype=self.dtype)
+        else:
+            self.proj_in = nn.Conv(
+                inner_dim,
+                kernel_size=(1, 1),
+                strides=(1, 1),
+                padding="VALID",
+                dtype=self.dtype,
+            )
+
+        self.transformer_blocks = [
+            FlaxBasicTransformerBlock(
+                inner_dim,
+                self.n_heads,
+                self.d_head,
+                dropout=self.dropout,
+                only_cross_attention=self.only_cross_attention,
+                dtype=self.dtype,
+                use_memory_efficient_attention=self.use_memory_efficient_attention,
+                split_head_dim=self.split_head_dim,
+            )
+            for _ in range(self.depth)
+        ]
+
+        if self.use_linear_projection:
+            self.proj_out = nn.Dense(inner_dim, dtype=self.dtype)
+        else:
+            self.proj_out = nn.Conv(
+                inner_dim,
+                kernel_size=(1, 1),
+                strides=(1, 1),
+                padding="VALID",
+                dtype=self.dtype,
+            )
+
+        self.dropout_layer = nn.Dropout(rate=self.dropout)
+
+    def __call__(self, hidden_states, context, deterministic=True):
+        batch, height, width, channels = hidden_states.shape
+        residual = hidden_states
+        hidden_states = self.norm(hidden_states)
+        if self.use_linear_projection:
+            hidden_states = hidden_states.reshape(batch, height * width, channels)
+            hidden_states = self.proj_in(hidden_states)
+        else:
+            hidden_states = self.proj_in(hidden_states)
+            hidden_states = hidden_states.reshape(batch, height * width, channels)
+
+        for transformer_block in self.transformer_blocks:
+            hidden_states = transformer_block(hidden_states, context, deterministic=deterministic)
+
+        if self.use_linear_projection:
+            hidden_states = self.proj_out(hidden_states)
+            hidden_states = hidden_states.reshape(batch, height, width, channels)
+        else:
+            hidden_states = hidden_states.reshape(batch, height, width, channels)
+            hidden_states = self.proj_out(hidden_states)
+
+        hidden_states = hidden_states + residual
+        return self.dropout_layer(hidden_states, deterministic=deterministic)
+
+
+class FlaxFeedForward(nn.Module):
+    r"""
+    Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
+    [`FeedForward`] class, with the following simplifications:
+    - The activation function is currently hardcoded to a gated linear unit from:
+    https://arxiv.org/abs/2002.05202
+    - `dim_out` is equal to `dim`.
+    - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`].
+
+    Parameters:
+        dim (:obj:`int`):
+            Inner hidden states dimension
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    dim: int
+    dropout: float = 0.0
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        # The second linear layer needs to be called
+        # net_2 for now to match the index of the Sequential layer
+        self.net_0 = FlaxGEGLU(self.dim, self.dropout, self.dtype)
+        self.net_2 = nn.Dense(self.dim, dtype=self.dtype)
+
+    def __call__(self, hidden_states, deterministic=True):
+        hidden_states = self.net_0(hidden_states, deterministic=deterministic)
+        hidden_states = self.net_2(hidden_states)
+        return hidden_states
+
+
+class FlaxGEGLU(nn.Module):
+    r"""
+    Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
+    https://arxiv.org/abs/2002.05202.
+
+    Parameters:
+        dim (:obj:`int`):
+            Input hidden states dimension
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    dim: int
+    dropout: float = 0.0
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        inner_dim = self.dim * 4
+        self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype)
+        self.dropout_layer = nn.Dropout(rate=self.dropout)
+
+    def __call__(self, hidden_states, deterministic=True):
+        hidden_states = self.proj(hidden_states)
+        hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2)
+        return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic)
diff --git a/diffusers/src/diffusers/models/attention_processor.py b/diffusers/src/diffusers/models/attention_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..bebfe0efccd9b7a35ad865e365d1eabcafbe0512
--- /dev/null
+++ b/diffusers/src/diffusers/models/attention_processor.py
@@ -0,0 +1,6007 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import math
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ..image_processor import IPAdapterMaskProcessor
+from ..utils import deprecate, logging
+from ..utils.import_utils import is_torch_npu_available, is_xformers_available
+from ..utils.torch_utils import is_torch_version, maybe_allow_in_graph
+from einops import rearrange
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_torch_npu_available():
+    import torch_npu
+
+if is_xformers_available():
+    import xformers
+    import xformers.ops
+else:
+    xformers = None
+
+
+@maybe_allow_in_graph
+class Attention(nn.Module):
+    r"""
+    A cross attention layer.
+
+    Parameters:
+        query_dim (`int`):
+            The number of channels in the query.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
+        heads (`int`,  *optional*, defaults to 8):
+            The number of heads to use for multi-head attention.
+        kv_heads (`int`,  *optional*, defaults to `None`):
+            The number of key and value heads to use for multi-head attention. Defaults to `heads`. If
+            `kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use Multi
+            Query Attention (MQA) otherwise GQA is used.
+        dim_head (`int`,  *optional*, defaults to 64):
+            The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
+        bias (`bool`, *optional*, defaults to False):
+            Set to `True` for the query, key, and value linear layers to contain a bias parameter.
+        upcast_attention (`bool`, *optional*, defaults to False):
+            Set to `True` to upcast the attention computation to `float32`.
+        upcast_softmax (`bool`, *optional*, defaults to False):
+            Set to `True` to upcast the softmax computation to `float32`.
+        cross_attention_norm (`str`, *optional*, defaults to `None`):
+            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
+        cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use for the group norm in the cross attention.
+        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the added key and value projections. If `None`, no projection is used.
+        norm_num_groups (`int`, *optional*, defaults to `None`):
+            The number of groups to use for the group norm in the attention.
+        spatial_norm_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the spatial normalization.
+        out_bias (`bool`, *optional*, defaults to `True`):
+            Set to `True` to use a bias in the output linear layer.
+        scale_qk (`bool`, *optional*, defaults to `True`):
+            Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
+        only_cross_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
+            `added_kv_proj_dim` is not `None`.
+        eps (`float`, *optional*, defaults to 1e-5):
+            An additional value added to the denominator in group normalization that is used for numerical stability.
+        rescale_output_factor (`float`, *optional*, defaults to 1.0):
+            A factor to rescale the output by dividing it with this value.
+        residual_connection (`bool`, *optional*, defaults to `False`):
+            Set to `True` to add the residual connection to the output.
+        _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
+            Set to `True` if the attention block is loaded from a deprecated state dict.
+        processor (`AttnProcessor`, *optional*, defaults to `None`):
+            The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
+            `AttnProcessor` otherwise.
+    """
+
+    def __init__(
+        self,
+        query_dim: int,
+        cross_attention_dim: Optional[int] = None,
+        heads: int = 8,
+        kv_heads: Optional[int] = None,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        bias: bool = False,
+        upcast_attention: bool = False,
+        upcast_softmax: bool = False,
+        cross_attention_norm: Optional[str] = None,
+        cross_attention_norm_num_groups: int = 32,
+        qk_norm: Optional[str] = None,
+        added_kv_proj_dim: Optional[int] = None,
+        added_proj_bias: Optional[bool] = True,
+        norm_num_groups: Optional[int] = None,
+        spatial_norm_dim: Optional[int] = None,
+        out_bias: bool = True,
+        scale_qk: bool = True,
+        only_cross_attention: bool = False,
+        eps: float = 1e-5,
+        rescale_output_factor: float = 1.0,
+        residual_connection: bool = False,
+        _from_deprecated_attn_block: bool = False,
+        processor: Optional["AttnProcessor"] = None,
+        out_dim: int = None,
+        context_pre_only=None,
+        pre_only=False,
+    ):
+        super().__init__()
+
+        # To prevent circular import.
+        from .normalization import FP32LayerNorm, RMSNorm
+
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
+        self.query_dim = query_dim
+        self.use_bias = bias
+        self.is_cross_attention = cross_attention_dim is not None
+        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
+        self.upcast_attention = upcast_attention
+        self.upcast_softmax = upcast_softmax
+        self.rescale_output_factor = rescale_output_factor
+        self.residual_connection = residual_connection
+        self.dropout = dropout
+        self.fused_projections = False
+        self.out_dim = out_dim if out_dim is not None else query_dim
+        self.context_pre_only = context_pre_only
+        self.pre_only = pre_only
+
+        # we make use of this private variable to know whether this class is loaded
+        # with an deprecated state dict so that we can convert it on the fly
+        self._from_deprecated_attn_block = _from_deprecated_attn_block
+
+        self.scale_qk = scale_qk
+        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
+
+        self.heads = out_dim // dim_head if out_dim is not None else heads
+        # for slice_size > 0 the attention score computation
+        # is split across the batch axis to save memory
+        # You can set slice_size with `set_attention_slice`
+        self.sliceable_head_dim = heads
+
+        self.added_kv_proj_dim = added_kv_proj_dim
+        self.only_cross_attention = only_cross_attention
+
+        if self.added_kv_proj_dim is None and self.only_cross_attention:
+            raise ValueError(
+                "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
+            )
+
+        if norm_num_groups is not None:
+            self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
+        else:
+            self.group_norm = None
+
+        if spatial_norm_dim is not None:
+            self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
+        else:
+            self.spatial_norm = None
+
+        if qk_norm is None:
+            self.norm_q = None
+            self.norm_k = None
+        elif qk_norm == "layer_norm":
+            self.norm_q = nn.LayerNorm(dim_head, eps=eps)
+            self.norm_k = nn.LayerNorm(dim_head, eps=eps)
+        elif qk_norm == "fp32_layer_norm":
+            self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+            self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+        elif qk_norm == "layer_norm_across_heads":
+            # Lumina applys qk norm across all heads
+            self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps)
+            self.norm_k = nn.LayerNorm(dim_head * kv_heads, eps=eps)
+        elif qk_norm == "rms_norm":
+            self.norm_q = RMSNorm(dim_head, eps=eps)
+            self.norm_k = RMSNorm(dim_head, eps=eps)
+        else:
+            raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None or 'layer_norm'")
+
+        if cross_attention_norm is None:
+            self.norm_cross = None
+        elif cross_attention_norm == "layer_norm":
+            self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
+        elif cross_attention_norm == "group_norm":
+            if self.added_kv_proj_dim is not None:
+                # The given `encoder_hidden_states` are initially of shape
+                # (batch_size, seq_len, added_kv_proj_dim) before being projected
+                # to (batch_size, seq_len, cross_attention_dim). The norm is applied
+                # before the projection, so we need to use `added_kv_proj_dim` as
+                # the number of channels for the group norm.
+                norm_cross_num_channels = added_kv_proj_dim
+            else:
+                norm_cross_num_channels = self.cross_attention_dim
+
+            self.norm_cross = nn.GroupNorm(
+                num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
+            )
+        else:
+            raise ValueError(
+                f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
+            )
+
+        self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
+
+        if not self.only_cross_attention:
+            # only relevant for the `AddedKVProcessor` classes
+            self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
+            self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
+        else:
+            self.to_k = None
+            self.to_v = None
+
+        self.added_proj_bias = added_proj_bias
+        if self.added_kv_proj_dim is not None:
+            self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
+            self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
+            if self.context_pre_only is not None:
+                self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
+
+        if not self.pre_only:
+            self.to_out = nn.ModuleList([])
+            self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
+            self.to_out.append(nn.Dropout(dropout))
+
+        if self.context_pre_only is not None and not self.context_pre_only:
+            self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)
+
+        if qk_norm is not None and added_kv_proj_dim is not None:
+            if qk_norm == "fp32_layer_norm":
+                self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+                self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+            elif qk_norm == "rms_norm":
+                self.norm_added_q = RMSNorm(dim_head, eps=eps)
+                self.norm_added_k = RMSNorm(dim_head, eps=eps)
+        else:
+            self.norm_added_q = None
+            self.norm_added_k = None
+
+        # set attention processor
+        # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+        # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+        # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+        if processor is None:
+            processor = (
+                AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+        self.set_processor(processor)
+
+    def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
+        r"""
+        Set whether to use npu flash attention from `torch_npu` or not.
+
+        """
+        if use_npu_flash_attention:
+            processor = AttnProcessorNPU()
+        else:
+            # set attention processor
+            # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+            # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+            # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+            processor = (
+                AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+        self.set_processor(processor)
+
+    def set_use_memory_efficient_attention_xformers(
+        self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
+    ) -> None:
+        r"""
+        Set whether to use memory efficient attention from `xformers` or not.
+
+        Args:
+            use_memory_efficient_attention_xformers (`bool`):
+                Whether to use memory efficient attention from `xformers` or not.
+            attention_op (`Callable`, *optional*):
+                The attention operation to use. Defaults to `None` which uses the default attention operation from
+                `xformers`.
+        """
+        is_custom_diffusion = hasattr(self, "processor") and isinstance(
+            self.processor,
+            (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0),
+        )
+        is_added_kv_processor = hasattr(self, "processor") and isinstance(
+            self.processor,
+            (
+                AttnAddedKVProcessor,
+                AttnAddedKVProcessor2_0,
+                SlicedAttnAddedKVProcessor,
+                XFormersAttnAddedKVProcessor,
+            ),
+        )
+
+        if use_memory_efficient_attention_xformers:
+            if is_added_kv_processor and is_custom_diffusion:
+                raise NotImplementedError(
+                    f"Memory efficient attention is currently not supported for custom diffusion for attention processor type {self.processor}"
+                )
+            if not is_xformers_available():
+                raise ModuleNotFoundError(
+                    (
+                        "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
+                        " xformers"
+                    ),
+                    name="xformers",
+                )
+            elif not torch.cuda.is_available():
+                raise ValueError(
+                    "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
+                    " only available for GPU "
+                )
+            else:
+                try:
+                    # Make sure we can run the memory efficient attention
+                    _ = xformers.ops.memory_efficient_attention(
+                        torch.randn((1, 2, 40), device="cuda"),
+                        torch.randn((1, 2, 40), device="cuda"),
+                        torch.randn((1, 2, 40), device="cuda"),
+                    )
+                except Exception as e:
+                    raise e
+
+            if is_custom_diffusion:
+                processor = CustomDiffusionXFormersAttnProcessor(
+                    train_kv=self.processor.train_kv,
+                    train_q_out=self.processor.train_q_out,
+                    hidden_size=self.processor.hidden_size,
+                    cross_attention_dim=self.processor.cross_attention_dim,
+                    attention_op=attention_op,
+                )
+                processor.load_state_dict(self.processor.state_dict())
+                if hasattr(self.processor, "to_k_custom_diffusion"):
+                    processor.to(self.processor.to_k_custom_diffusion.weight.device)
+            elif is_added_kv_processor:
+                # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
+                # which uses this type of cross attention ONLY because the attention mask of format
+                # [0, ..., -10.000, ..., 0, ...,] is not supported
+                # throw warning
+                logger.info(
+                    "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation."
+                )
+                processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
+            else:
+                processor = XFormersAttnProcessor(attention_op=attention_op)
+        else:
+            if is_custom_diffusion:
+                attn_processor_class = (
+                    CustomDiffusionAttnProcessor2_0
+                    if hasattr(F, "scaled_dot_product_attention")
+                    else CustomDiffusionAttnProcessor
+                )
+                processor = attn_processor_class(
+                    train_kv=self.processor.train_kv,
+                    train_q_out=self.processor.train_q_out,
+                    hidden_size=self.processor.hidden_size,
+                    cross_attention_dim=self.processor.cross_attention_dim,
+                )
+                processor.load_state_dict(self.processor.state_dict())
+                if hasattr(self.processor, "to_k_custom_diffusion"):
+                    processor.to(self.processor.to_k_custom_diffusion.weight.device)
+            else:
+                # set attention processor
+                # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+                # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+                # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+                processor = (
+                    AttnProcessor2_0()
+                    if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
+                    else AttnProcessor()
+                )
+
+        self.set_processor(processor)
+
+    def set_attention_slice(self, slice_size: int) -> None:
+        r"""
+        Set the slice size for attention computation.
+
+        Args:
+            slice_size (`int`):
+                The slice size for attention computation.
+        """
+        if slice_size is not None and slice_size > self.sliceable_head_dim:
+            raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
+
+        if slice_size is not None and self.added_kv_proj_dim is not None:
+            processor = SlicedAttnAddedKVProcessor(slice_size)
+        elif slice_size is not None:
+            processor = SlicedAttnProcessor(slice_size)
+        elif self.added_kv_proj_dim is not None:
+            processor = AttnAddedKVProcessor()
+        else:
+            # set attention processor
+            # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+            # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+            # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+            processor = (
+                AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+
+        self.set_processor(processor)
+
+    def set_processor(self, processor: "AttnProcessor") -> None:
+        r"""
+        Set the attention processor to use.
+
+        Args:
+            processor (`AttnProcessor`):
+                The attention processor to use.
+        """
+        # if current processor is in `self._modules` and if passed `processor` is not, we need to
+        # pop `processor` from `self._modules`
+        if (
+            hasattr(self, "processor")
+            and isinstance(self.processor, torch.nn.Module)
+            and not isinstance(processor, torch.nn.Module)
+        ):
+            logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
+            self._modules.pop("processor")
+
+        self.processor = processor
+
+    def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
+        r"""
+        Get the attention processor in use.
+
+        Args:
+            return_deprecated_lora (`bool`, *optional*, defaults to `False`):
+                Set to `True` to return the deprecated LoRA attention processor.
+
+        Returns:
+            "AttentionProcessor": The attention processor in use.
+        """
+        if not return_deprecated_lora:
+            return self.processor
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **cross_attention_kwargs,
+    ) -> torch.Tensor:
+        r"""
+        The forward method of the `Attention` class.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                The hidden states of the query.
+            encoder_hidden_states (`torch.Tensor`, *optional*):
+                The hidden states of the encoder.
+            attention_mask (`torch.Tensor`, *optional*):
+                The attention mask to use. If `None`, no mask is applied.
+            **cross_attention_kwargs:
+                Additional keyword arguments to pass along to the cross attention.
+
+        Returns:
+            `torch.Tensor`: The output of the attention layer.
+        """
+        # The `Attention` class can call different attention processors / attention functions
+        # here we simply pass along all tensors to the selected processor class
+        # For standard processors that are defined here, `**cross_attention_kwargs` is empty
+
+        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
+        quiet_attn_parameters = {"ip_adapter_masks"}
+        unused_kwargs = [
+            k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters
+        ]
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"cross_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
+            )
+        cross_attention_kwargs = {k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters}
+
+        return self.processor(
+            self,
+            hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+
+    def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
+        r"""
+        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads`
+        is the number of heads initialized while constructing the `Attention` class.
+
+        Args:
+            tensor (`torch.Tensor`): The tensor to reshape.
+
+        Returns:
+            `torch.Tensor`: The reshaped tensor.
+        """
+        head_size = self.heads
+        batch_size, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
+        return tensor
+
+    def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
+        r"""
+        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size, seq_len, heads, dim // heads]` `heads` is
+        the number of heads initialized while constructing the `Attention` class.
+
+        Args:
+            tensor (`torch.Tensor`): The tensor to reshape.
+            out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor. If `3`, the tensor is
+                reshaped to `[batch_size * heads, seq_len, dim // heads]`.
+
+        Returns:
+            `torch.Tensor`: The reshaped tensor.
+        """
+        head_size = self.heads
+        if tensor.ndim == 3:
+            batch_size, seq_len, dim = tensor.shape
+            extra_dim = 1
+        else:
+            batch_size, extra_dim, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(batch_size, seq_len * extra_dim, head_size, dim // head_size)
+        tensor = tensor.permute(0, 2, 1, 3)
+
+        if out_dim == 3:
+            tensor = tensor.reshape(batch_size * head_size, seq_len * extra_dim, dim // head_size)
+
+        return tensor
+
+    def get_attention_scores(
+        self, query: torch.Tensor, key: torch.Tensor, attention_mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        r"""
+        Compute the attention scores.
+
+        Args:
+            query (`torch.Tensor`): The query tensor.
+            key (`torch.Tensor`): The key tensor.
+            attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
+
+        Returns:
+            `torch.Tensor`: The attention probabilities/scores.
+        """
+        dtype = query.dtype
+        if self.upcast_attention:
+            query = query.float()
+            key = key.float()
+
+        if attention_mask is None:
+            baddbmm_input = torch.empty(
+                query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
+            )
+            beta = 0
+        else:
+            baddbmm_input = attention_mask
+            beta = 1
+
+        attention_scores = torch.baddbmm(
+            baddbmm_input,
+            query,
+            key.transpose(-1, -2),
+            beta=beta,
+            alpha=self.scale,
+        )
+        del baddbmm_input
+
+        if self.upcast_softmax:
+            attention_scores = attention_scores.float()
+
+        attention_probs = attention_scores.softmax(dim=-1)
+        del attention_scores
+
+        attention_probs = attention_probs.to(dtype)
+
+        return attention_probs
+
+    def prepare_attention_mask(
+        self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
+    ) -> torch.Tensor:
+        r"""
+        Prepare the attention mask for the attention computation.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                The attention mask to prepare.
+            target_length (`int`):
+                The target length of the attention mask. This is the length of the attention mask after padding.
+            batch_size (`int`):
+                The batch size, which is used to repeat the attention mask.
+            out_dim (`int`, *optional*, defaults to `3`):
+                The output dimension of the attention mask. Can be either `3` or `4`.
+
+        Returns:
+            `torch.Tensor`: The prepared attention mask.
+        """
+        head_size = self.heads
+        if attention_mask is None:
+            return attention_mask
+
+        current_length: int = attention_mask.shape[-1]
+        if current_length != target_length:
+            if attention_mask.device.type == "mps":
+                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
+                # Instead, we can manually construct the padding tensor.
+                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
+                padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
+                attention_mask = torch.cat([attention_mask, padding], dim=2)
+            else:
+                # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
+                #       we want to instead pad by (0, remaining_length), where remaining_length is:
+                #       remaining_length: int = target_length - current_length
+                # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
+                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+
+        if out_dim == 3:
+            if attention_mask.shape[0] < batch_size * head_size:
+                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
+        elif out_dim == 4:
+            attention_mask = attention_mask.unsqueeze(1)
+            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
+
+        return attention_mask
+
+    def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
+        r"""
+        Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the
+        `Attention` class.
+
+        Args:
+            encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.
+
+        Returns:
+            `torch.Tensor`: The normalized encoder hidden states.
+        """
+        assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
+
+        if isinstance(self.norm_cross, nn.LayerNorm):
+            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
+        elif isinstance(self.norm_cross, nn.GroupNorm):
+            # Group norm norms along the channels dimension and expects
+            # input to be in the shape of (N, C, *). In this case, we want
+            # to norm along the hidden dimension, so we need to move
+            # (batch_size, sequence_length, hidden_size) ->
+            # (batch_size, hidden_size, sequence_length)
+            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
+            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
+        else:
+            assert False
+
+        return encoder_hidden_states
+
+    @torch.no_grad()
+    def fuse_projections(self, fuse=True):
+        device = self.to_q.weight.data.device
+        dtype = self.to_q.weight.data.dtype
+
+        if not self.is_cross_attention:
+            # fetch weight matrices.
+            concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            # create a new single projection layer and copy over the weights.
+            self.to_qkv = nn.Linear(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
+            self.to_qkv.weight.copy_(concatenated_weights)
+            if self.use_bias:
+                concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
+                self.to_qkv.bias.copy_(concatenated_bias)
+
+        else:
+            concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data])
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            self.to_kv = nn.Linear(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
+            self.to_kv.weight.copy_(concatenated_weights)
+            if self.use_bias:
+                concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data])
+                self.to_kv.bias.copy_(concatenated_bias)
+
+        # handle added projections for SD3 and others.
+        if hasattr(self, "add_q_proj") and hasattr(self, "add_k_proj") and hasattr(self, "add_v_proj"):
+            concatenated_weights = torch.cat(
+                [self.add_q_proj.weight.data, self.add_k_proj.weight.data, self.add_v_proj.weight.data]
+            )
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            self.to_added_qkv = nn.Linear(
+                in_features, out_features, bias=self.added_proj_bias, device=device, dtype=dtype
+            )
+            self.to_added_qkv.weight.copy_(concatenated_weights)
+            if self.added_proj_bias:
+                concatenated_bias = torch.cat(
+                    [self.add_q_proj.bias.data, self.add_k_proj.bias.data, self.add_v_proj.bias.data]
+                )
+                self.to_added_qkv.bias.copy_(concatenated_bias)
+
+        self.fused_projections = fuse
+
+
+class AttnProcessor:
+    r"""
+    Default processor for performing attention-related computations.
+    """
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class CustomDiffusionAttnProcessor(nn.Module):
+    r"""
+    Processor for implementing attention for the Custom Diffusion method.
+
+    Args:
+        train_kv (`bool`, defaults to `True`):
+            Whether to newly train the key and value matrices corresponding to the text features.
+        train_q_out (`bool`, defaults to `True`):
+            Whether to newly train query matrices corresponding to the latent image features.
+        hidden_size (`int`, *optional*, defaults to `None`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*, defaults to `None`):
+            The number of channels in the `encoder_hidden_states`.
+        out_bias (`bool`, defaults to `True`):
+            Whether to include the bias parameter in `train_q_out`.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
+    """
+
+    def __init__(
+        self,
+        train_kv: bool = True,
+        train_q_out: bool = True,
+        hidden_size: Optional[int] = None,
+        cross_attention_dim: Optional[int] = None,
+        out_bias: bool = True,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        self.train_kv = train_kv
+        self.train_q_out = train_q_out
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+
+        # `_custom_diffusion` id for easy serialization and loading.
+        if self.train_kv:
+            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        if self.train_q_out:
+            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
+            self.to_out_custom_diffusion = nn.ModuleList([])
+            self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
+            self.to_out_custom_diffusion.append(nn.Dropout(dropout))
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if self.train_q_out:
+            query = self.to_q_custom_diffusion(hidden_states).to(attn.to_q.weight.dtype)
+        else:
+            query = attn.to_q(hidden_states.to(attn.to_q.weight.dtype))
+
+        if encoder_hidden_states is None:
+            crossattn = False
+            encoder_hidden_states = hidden_states
+        else:
+            crossattn = True
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        if self.train_kv:
+            key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype))
+            value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype))
+            key = key.to(attn.to_q.weight.dtype)
+            value = value.to(attn.to_q.weight.dtype)
+        else:
+            key = attn.to_k(encoder_hidden_states)
+            value = attn.to_v(encoder_hidden_states)
+
+        if crossattn:
+            detach = torch.ones_like(key)
+            detach[:, :1, :] = detach[:, :1, :] * 0.0
+            key = detach * key + (1 - detach) * key.detach()
+            value = detach * value + (1 - detach) * value.detach()
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        if self.train_q_out:
+            # linear proj
+            hidden_states = self.to_out_custom_diffusion[0](hidden_states)
+            # dropout
+            hidden_states = self.to_out_custom_diffusion[1](hidden_states)
+        else:
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+class AttnAddedKVProcessor:
+    r"""
+    Processor for performing attention-related computations with extra learnable key and value matrices for the text
+    encoder.
+    """
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        residual = hidden_states
+
+        hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
+        batch_size, sequence_length, _ = hidden_states.shape
+
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+        query = attn.head_to_batch_dim(query)
+
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
+        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
+
+        if not attn.only_cross_attention:
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            key = attn.head_to_batch_dim(key)
+            value = attn.head_to_batch_dim(value)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
+        else:
+            key = encoder_hidden_states_key_proj
+            value = encoder_hidden_states_value_proj
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
+class AttnAddedKVProcessor2_0:
+    r"""
+    Processor for performing scaled dot-product attention (enabled by default if you're using PyTorch 2.0), with extra
+    learnable key and value matrices for the text encoder.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "AttnAddedKVProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        residual = hidden_states
+
+        hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
+        batch_size, sequence_length, _ = hidden_states.shape
+
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=4)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+        query = attn.head_to_batch_dim(query, out_dim=4)
+
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, out_dim=4)
+        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, out_dim=4)
+
+        if not attn.only_cross_attention:
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            key = attn.head_to_batch_dim(key, out_dim=4)
+            value = attn.head_to_batch_dim(value, out_dim=4)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+        else:
+            key = encoder_hidden_states_key_proj
+            value = encoder_hidden_states_value_proj
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, residual.shape[1])
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
+class JointAttnProcessor2_0:
+    """Attention processor used typically in processing the SD3-like self-attention projections."""
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        context_input_ndim = encoder_hidden_states.ndim
+        if context_input_ndim == 4:
+            batch_size, channel, height, width = encoder_hidden_states.shape
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size = encoder_hidden_states.shape[0]
+
+        # `sample` projections.
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        # `context` projections.
+        encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+
+        # attention
+        query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
+        key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
+        value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # Split the attention outputs.
+        hidden_states, encoder_hidden_states = (
+            hidden_states[:, : residual.shape[1]],
+            hidden_states[:, residual.shape[1] :],
+        )
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if not attn.context_pre_only:
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if context_input_ndim == 4:
+            encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        return hidden_states, encoder_hidden_states
+
+
+class PAGJointAttnProcessor2_0:
+    """Attention processor used typically in processing the SD3-like self-attention projections."""
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "PAGJointAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        context_input_ndim = encoder_hidden_states.ndim
+        if context_input_ndim == 4:
+            batch_size, channel, height, width = encoder_hidden_states.shape
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        # store the length of image patch sequences to create a mask that prevents interaction between patches
+        # similar to making the self-attention map an identity matrix
+        identity_block_size = hidden_states.shape[1]
+
+        # chunk
+        hidden_states_org, hidden_states_ptb = hidden_states.chunk(2)
+        encoder_hidden_states_org, encoder_hidden_states_ptb = encoder_hidden_states.chunk(2)
+
+        ################## original path ##################
+        batch_size = encoder_hidden_states_org.shape[0]
+
+        # `sample` projections.
+        query_org = attn.to_q(hidden_states_org)
+        key_org = attn.to_k(hidden_states_org)
+        value_org = attn.to_v(hidden_states_org)
+
+        # `context` projections.
+        encoder_hidden_states_org_query_proj = attn.add_q_proj(encoder_hidden_states_org)
+        encoder_hidden_states_org_key_proj = attn.add_k_proj(encoder_hidden_states_org)
+        encoder_hidden_states_org_value_proj = attn.add_v_proj(encoder_hidden_states_org)
+
+        # attention
+        query_org = torch.cat([query_org, encoder_hidden_states_org_query_proj], dim=1)
+        key_org = torch.cat([key_org, encoder_hidden_states_org_key_proj], dim=1)
+        value_org = torch.cat([value_org, encoder_hidden_states_org_value_proj], dim=1)
+
+        inner_dim = key_org.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query_org = query_org.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key_org = key_org.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value_org = value_org.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        hidden_states_org = F.scaled_dot_product_attention(
+            query_org, key_org, value_org, dropout_p=0.0, is_causal=False
+        )
+        hidden_states_org = hidden_states_org.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states_org = hidden_states_org.to(query_org.dtype)
+
+        # Split the attention outputs.
+        hidden_states_org, encoder_hidden_states_org = (
+            hidden_states_org[:, : residual.shape[1]],
+            hidden_states_org[:, residual.shape[1] :],
+        )
+
+        # linear proj
+        hidden_states_org = attn.to_out[0](hidden_states_org)
+        # dropout
+        hidden_states_org = attn.to_out[1](hidden_states_org)
+        if not attn.context_pre_only:
+            encoder_hidden_states_org = attn.to_add_out(encoder_hidden_states_org)
+
+        if input_ndim == 4:
+            hidden_states_org = hidden_states_org.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if context_input_ndim == 4:
+            encoder_hidden_states_org = encoder_hidden_states_org.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
+
+        ################## perturbed path ##################
+
+        batch_size = encoder_hidden_states_ptb.shape[0]
+
+        # `sample` projections.
+        query_ptb = attn.to_q(hidden_states_ptb)
+        key_ptb = attn.to_k(hidden_states_ptb)
+        value_ptb = attn.to_v(hidden_states_ptb)
+
+        # `context` projections.
+        encoder_hidden_states_ptb_query_proj = attn.add_q_proj(encoder_hidden_states_ptb)
+        encoder_hidden_states_ptb_key_proj = attn.add_k_proj(encoder_hidden_states_ptb)
+        encoder_hidden_states_ptb_value_proj = attn.add_v_proj(encoder_hidden_states_ptb)
+
+        # attention
+        query_ptb = torch.cat([query_ptb, encoder_hidden_states_ptb_query_proj], dim=1)
+        key_ptb = torch.cat([key_ptb, encoder_hidden_states_ptb_key_proj], dim=1)
+        value_ptb = torch.cat([value_ptb, encoder_hidden_states_ptb_value_proj], dim=1)
+
+        inner_dim = key_ptb.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query_ptb = query_ptb.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key_ptb = key_ptb.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value_ptb = value_ptb.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # create a full mask with all entries set to 0
+        seq_len = query_ptb.size(2)
+        full_mask = torch.zeros((seq_len, seq_len), device=query_ptb.device, dtype=query_ptb.dtype)
+
+        # set the attention value between image patches to -inf
+        full_mask[:identity_block_size, :identity_block_size] = float("-inf")
+
+        # set the diagonal of the attention value between image patches to 0
+        full_mask[:identity_block_size, :identity_block_size].fill_diagonal_(0)
+
+        # expand the mask to match the attention weights shape
+        full_mask = full_mask.unsqueeze(0).unsqueeze(0)  # Add batch and num_heads dimensions
+
+        hidden_states_ptb = F.scaled_dot_product_attention(
+            query_ptb, key_ptb, value_ptb, attn_mask=full_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states_ptb = hidden_states_ptb.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states_ptb = hidden_states_ptb.to(query_ptb.dtype)
+
+        # split the attention outputs.
+        hidden_states_ptb, encoder_hidden_states_ptb = (
+            hidden_states_ptb[:, : residual.shape[1]],
+            hidden_states_ptb[:, residual.shape[1] :],
+        )
+
+        # linear proj
+        hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
+        # dropout
+        hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
+        if not attn.context_pre_only:
+            encoder_hidden_states_ptb = attn.to_add_out(encoder_hidden_states_ptb)
+
+        if input_ndim == 4:
+            hidden_states_ptb = hidden_states_ptb.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if context_input_ndim == 4:
+            encoder_hidden_states_ptb = encoder_hidden_states_ptb.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
+
+        ################ concat ###############
+        hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])
+        encoder_hidden_states = torch.cat([encoder_hidden_states_org, encoder_hidden_states_ptb])
+
+        return hidden_states, encoder_hidden_states
+
+
+class PAGCFGJointAttnProcessor2_0:
+    """Attention processor used typically in processing the SD3-like self-attention projections."""
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "PAGCFGJointAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        context_input_ndim = encoder_hidden_states.ndim
+        if context_input_ndim == 4:
+            batch_size, channel, height, width = encoder_hidden_states.shape
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        identity_block_size = hidden_states.shape[
+            1
+        ]  # patch embeddings width * height (correspond to self-attention map width or height)
+
+        # chunk
+        hidden_states_uncond, hidden_states_org, hidden_states_ptb = hidden_states.chunk(3)
+        hidden_states_org = torch.cat([hidden_states_uncond, hidden_states_org])
+
+        (
+            encoder_hidden_states_uncond,
+            encoder_hidden_states_org,
+            encoder_hidden_states_ptb,
+        ) = encoder_hidden_states.chunk(3)
+        encoder_hidden_states_org = torch.cat([encoder_hidden_states_uncond, encoder_hidden_states_org])
+
+        ################## original path ##################
+        batch_size = encoder_hidden_states_org.shape[0]
+
+        # `sample` projections.
+        query_org = attn.to_q(hidden_states_org)
+        key_org = attn.to_k(hidden_states_org)
+        value_org = attn.to_v(hidden_states_org)
+
+        # `context` projections.
+        encoder_hidden_states_org_query_proj = attn.add_q_proj(encoder_hidden_states_org)
+        encoder_hidden_states_org_key_proj = attn.add_k_proj(encoder_hidden_states_org)
+        encoder_hidden_states_org_value_proj = attn.add_v_proj(encoder_hidden_states_org)
+
+        # attention
+        query_org = torch.cat([query_org, encoder_hidden_states_org_query_proj], dim=1)
+        key_org = torch.cat([key_org, encoder_hidden_states_org_key_proj], dim=1)
+        value_org = torch.cat([value_org, encoder_hidden_states_org_value_proj], dim=1)
+
+        inner_dim = key_org.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query_org = query_org.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key_org = key_org.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value_org = value_org.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        hidden_states_org = F.scaled_dot_product_attention(
+            query_org, key_org, value_org, dropout_p=0.0, is_causal=False
+        )
+        hidden_states_org = hidden_states_org.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states_org = hidden_states_org.to(query_org.dtype)
+
+        # Split the attention outputs.
+        hidden_states_org, encoder_hidden_states_org = (
+            hidden_states_org[:, : residual.shape[1]],
+            hidden_states_org[:, residual.shape[1] :],
+        )
+
+        # linear proj
+        hidden_states_org = attn.to_out[0](hidden_states_org)
+        # dropout
+        hidden_states_org = attn.to_out[1](hidden_states_org)
+        if not attn.context_pre_only:
+            encoder_hidden_states_org = attn.to_add_out(encoder_hidden_states_org)
+
+        if input_ndim == 4:
+            hidden_states_org = hidden_states_org.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if context_input_ndim == 4:
+            encoder_hidden_states_org = encoder_hidden_states_org.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
+
+        ################## perturbed path ##################
+
+        batch_size = encoder_hidden_states_ptb.shape[0]
+
+        # `sample` projections.
+        query_ptb = attn.to_q(hidden_states_ptb)
+        key_ptb = attn.to_k(hidden_states_ptb)
+        value_ptb = attn.to_v(hidden_states_ptb)
+
+        # `context` projections.
+        encoder_hidden_states_ptb_query_proj = attn.add_q_proj(encoder_hidden_states_ptb)
+        encoder_hidden_states_ptb_key_proj = attn.add_k_proj(encoder_hidden_states_ptb)
+        encoder_hidden_states_ptb_value_proj = attn.add_v_proj(encoder_hidden_states_ptb)
+
+        # attention
+        query_ptb = torch.cat([query_ptb, encoder_hidden_states_ptb_query_proj], dim=1)
+        key_ptb = torch.cat([key_ptb, encoder_hidden_states_ptb_key_proj], dim=1)
+        value_ptb = torch.cat([value_ptb, encoder_hidden_states_ptb_value_proj], dim=1)
+
+        inner_dim = key_ptb.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query_ptb = query_ptb.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key_ptb = key_ptb.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value_ptb = value_ptb.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # create a full mask with all entries set to 0
+        seq_len = query_ptb.size(2)
+        full_mask = torch.zeros((seq_len, seq_len), device=query_ptb.device, dtype=query_ptb.dtype)
+
+        # set the attention value between image patches to -inf
+        full_mask[:identity_block_size, :identity_block_size] = float("-inf")
+
+        # set the diagonal of the attention value between image patches to 0
+        full_mask[:identity_block_size, :identity_block_size].fill_diagonal_(0)
+
+        # expand the mask to match the attention weights shape
+        full_mask = full_mask.unsqueeze(0).unsqueeze(0)  # Add batch and num_heads dimensions
+
+        hidden_states_ptb = F.scaled_dot_product_attention(
+            query_ptb, key_ptb, value_ptb, attn_mask=full_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states_ptb = hidden_states_ptb.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states_ptb = hidden_states_ptb.to(query_ptb.dtype)
+
+        # split the attention outputs.
+        hidden_states_ptb, encoder_hidden_states_ptb = (
+            hidden_states_ptb[:, : residual.shape[1]],
+            hidden_states_ptb[:, residual.shape[1] :],
+        )
+
+        # linear proj
+        hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
+        # dropout
+        hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
+        if not attn.context_pre_only:
+            encoder_hidden_states_ptb = attn.to_add_out(encoder_hidden_states_ptb)
+
+        if input_ndim == 4:
+            hidden_states_ptb = hidden_states_ptb.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if context_input_ndim == 4:
+            encoder_hidden_states_ptb = encoder_hidden_states_ptb.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
+
+        ################ concat ###############
+        hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])
+        encoder_hidden_states = torch.cat([encoder_hidden_states_org, encoder_hidden_states_ptb])
+
+        return hidden_states, encoder_hidden_states
+
+
+class FusedJointAttnProcessor2_0:
+    """Attention processor used typically in processing the SD3-like self-attention projections."""
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        context_input_ndim = encoder_hidden_states.ndim
+        if context_input_ndim == 4:
+            batch_size, channel, height, width = encoder_hidden_states.shape
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size = encoder_hidden_states.shape[0]
+
+        # `sample` projections.
+        qkv = attn.to_qkv(hidden_states)
+        split_size = qkv.shape[-1] // 3
+        query, key, value = torch.split(qkv, split_size, dim=-1)
+
+        # `context` projections.
+        encoder_qkv = attn.to_added_qkv(encoder_hidden_states)
+        split_size = encoder_qkv.shape[-1] // 3
+        (
+            encoder_hidden_states_query_proj,
+            encoder_hidden_states_key_proj,
+            encoder_hidden_states_value_proj,
+        ) = torch.split(encoder_qkv, split_size, dim=-1)
+
+        # attention
+        query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
+        key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
+        value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # Split the attention outputs.
+        hidden_states, encoder_hidden_states = (
+            hidden_states[:, : residual.shape[1]],
+            hidden_states[:, residual.shape[1] :],
+        )
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if not attn.context_pre_only:
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if context_input_ndim == 4:
+            encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        return hidden_states, encoder_hidden_states
+
+
+class AuraFlowAttnProcessor2_0:
+    """Attention processor used typically in processing Aura Flow."""
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention") and is_torch_version("<", "2.1"):
+            raise ImportError(
+                "AuraFlowAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to at least 2.1 or above as we use `scale` in `F.scaled_dot_product_attention()`. "
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        batch_size = hidden_states.shape[0]
+
+        # `sample` projections.
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        # `context` projections.
+        if encoder_hidden_states is not None:
+            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+
+        # Reshape.
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim)
+        key = key.view(batch_size, -1, attn.heads, head_dim)
+        value = value.view(batch_size, -1, attn.heads, head_dim)
+
+        # Apply QK norm.
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Concatenate the projections.
+        if encoder_hidden_states is not None:
+            encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            )
+            encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(batch_size, -1, attn.heads, head_dim)
+            encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            )
+
+            if attn.norm_added_q is not None:
+                encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+            if attn.norm_added_k is not None:
+                encoder_hidden_states_key_proj = attn.norm_added_q(encoder_hidden_states_key_proj)
+
+            query = torch.cat([encoder_hidden_states_query_proj, query], dim=1)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
+
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        # Attention.
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, dropout_p=0.0, scale=attn.scale, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # Split the attention outputs.
+        if encoder_hidden_states is not None:
+            hidden_states, encoder_hidden_states = (
+                hidden_states[:, encoder_hidden_states.shape[1] :],
+                hidden_states[:, : encoder_hidden_states.shape[1]],
+            )
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if encoder_hidden_states is not None:
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+        if encoder_hidden_states is not None:
+            return hidden_states, encoder_hidden_states
+        else:
+            return hidden_states
+
+
+class FusedAuraFlowAttnProcessor2_0:
+    """Attention processor used typically in processing Aura Flow with fused projections."""
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention") and is_torch_version("<", "2.1"):
+            raise ImportError(
+                "FusedAuraFlowAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to at least 2.1 or above as we use `scale` in `F.scaled_dot_product_attention()`. "
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        batch_size = hidden_states.shape[0]
+
+        # `sample` projections.
+        qkv = attn.to_qkv(hidden_states)
+        split_size = qkv.shape[-1] // 3
+        query, key, value = torch.split(qkv, split_size, dim=-1)
+
+        # `context` projections.
+        if encoder_hidden_states is not None:
+            encoder_qkv = attn.to_added_qkv(encoder_hidden_states)
+            split_size = encoder_qkv.shape[-1] // 3
+            (
+                encoder_hidden_states_query_proj,
+                encoder_hidden_states_key_proj,
+                encoder_hidden_states_value_proj,
+            ) = torch.split(encoder_qkv, split_size, dim=-1)
+
+        # Reshape.
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim)
+        key = key.view(batch_size, -1, attn.heads, head_dim)
+        value = value.view(batch_size, -1, attn.heads, head_dim)
+
+        # Apply QK norm.
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Concatenate the projections.
+        if encoder_hidden_states is not None:
+            encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            )
+            encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(batch_size, -1, attn.heads, head_dim)
+            encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            )
+
+            if attn.norm_added_q is not None:
+                encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+            if attn.norm_added_k is not None:
+                encoder_hidden_states_key_proj = attn.norm_added_q(encoder_hidden_states_key_proj)
+
+            query = torch.cat([encoder_hidden_states_query_proj, query], dim=1)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
+
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        # Attention.
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, dropout_p=0.0, scale=attn.scale, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # Split the attention outputs.
+        if encoder_hidden_states is not None:
+            hidden_states, encoder_hidden_states = (
+                hidden_states[:, encoder_hidden_states.shape[1] :],
+                hidden_states[:, : encoder_hidden_states.shape[1]],
+            )
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if encoder_hidden_states is not None:
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+        if encoder_hidden_states is not None:
+            return hidden_states, encoder_hidden_states
+        else:
+            return hidden_states
+
+
+class FluxAttnProcessor2_0:
+    """Attention processor used typically in processing the SD3-like self-attention projections."""
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+
+        # `sample` projections.
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
+        if encoder_hidden_states is not None:
+            # `context` projections.
+            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+
+            encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+            encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+            encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+
+            if attn.norm_added_q is not None:
+                encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+            if attn.norm_added_k is not None:
+                encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+
+            # attention
+            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+
+        if image_rotary_emb is not None:
+            from .embeddings import apply_rotary_emb
+
+            query = apply_rotary_emb(query, image_rotary_emb)
+            key = apply_rotary_emb(key, image_rotary_emb)
+
+        hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        if encoder_hidden_states is not None:
+            encoder_hidden_states, hidden_states = (
+                hidden_states[:, : encoder_hidden_states.shape[1]],
+                hidden_states[:, encoder_hidden_states.shape[1] :],
+            )
+
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+            return hidden_states, encoder_hidden_states
+        else:
+            return hidden_states
+
+
+class FusedFluxAttnProcessor2_0:
+    """Attention processor used typically in processing the SD3-like self-attention projections."""
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "FusedFluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+
+        # `sample` projections.
+        qkv = attn.to_qkv(hidden_states)
+        split_size = qkv.shape[-1] // 3
+        query, key, value = torch.split(qkv, split_size, dim=-1)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
+        # `context` projections.
+        if encoder_hidden_states is not None:
+            encoder_qkv = attn.to_added_qkv(encoder_hidden_states)
+            split_size = encoder_qkv.shape[-1] // 3
+            (
+                encoder_hidden_states_query_proj,
+                encoder_hidden_states_key_proj,
+                encoder_hidden_states_value_proj,
+            ) = torch.split(encoder_qkv, split_size, dim=-1)
+
+            encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+            encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+            encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+
+            if attn.norm_added_q is not None:
+                encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+            if attn.norm_added_k is not None:
+                encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+
+            # attention
+            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+
+        if image_rotary_emb is not None:
+            from .embeddings import apply_rotary_emb
+
+            query = apply_rotary_emb(query, image_rotary_emb)
+            key = apply_rotary_emb(key, image_rotary_emb)
+
+        hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        if encoder_hidden_states is not None:
+            encoder_hidden_states, hidden_states = (
+                hidden_states[:, : encoder_hidden_states.shape[1]],
+                hidden_states[:, encoder_hidden_states.shape[1] :],
+            )
+
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+            return hidden_states, encoder_hidden_states
+        else:
+            return hidden_states
+
+
+class CogVideoXAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
+    query and key vectors, but does not include spatial normalization.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        text_seq_length = encoder_hidden_states.size(1)
+
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Apply RoPE if needed
+        if image_rotary_emb is not None:
+            from .embeddings import apply_rotary_emb
+
+            query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
+            if not attn.is_cross_attention:
+                key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
+
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        encoder_hidden_states, hidden_states = hidden_states.split(
+            [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+        )
+        return hidden_states, encoder_hidden_states
+
+
+class FusedCogVideoXAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
+    query and key vectors, but does not include spatial normalization.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        text_seq_length = encoder_hidden_states.size(1)
+
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        qkv = attn.to_qkv(hidden_states)
+        split_size = qkv.shape[-1] // 3
+        query, key, value = torch.split(qkv, split_size, dim=-1)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Apply RoPE if needed
+        if image_rotary_emb is not None:
+            from .embeddings import apply_rotary_emb
+
+            query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
+            if not attn.is_cross_attention:
+                key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
+
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        encoder_hidden_states, hidden_states = hidden_states.split(
+            [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+        )
+        return hidden_states, encoder_hidden_states
+
+
+class XFormersAttnAddedKVProcessor:
+    r"""
+    Processor for implementing memory efficient attention using xFormers.
+
+    Args:
+        attention_op (`Callable`, *optional*, defaults to `None`):
+            The base
+            [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
+            use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
+            operator.
+    """
+
+    def __init__(self, attention_op: Optional[Callable] = None):
+        self.attention_op = attention_op
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
+        batch_size, sequence_length, _ = hidden_states.shape
+
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+        query = attn.head_to_batch_dim(query)
+
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
+        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
+
+        if not attn.only_cross_attention:
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            key = attn.head_to_batch_dim(key)
+            value = attn.head_to_batch_dim(value)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
+        else:
+            key = encoder_hidden_states_key_proj
+            value = encoder_hidden_states_value_proj
+
+        hidden_states = xformers.ops.memory_efficient_attention(
+            query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
+        )
+        hidden_states = hidden_states.to(query.dtype)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
+class XFormersAttnProcessor:
+    r"""
+    Processor for implementing memory efficient attention using xFormers.
+
+    Args:
+        attention_op (`Callable`, *optional*, defaults to `None`):
+            The base
+            [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
+            use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
+            operator.
+    """
+
+    def __init__(self, attention_op: Optional[Callable] = None):
+        self.attention_op = attention_op
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, key_tokens, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size)
+        if attention_mask is not None:
+            # expand our mask's singleton query_tokens dimension:
+            #   [batch*heads,            1, key_tokens] ->
+            #   [batch*heads, query_tokens, key_tokens]
+            # so that it can be added as a bias onto the attention scores that xformers computes:
+            #   [batch*heads, query_tokens, key_tokens]
+            # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
+            _, query_tokens, _ = hidden_states.shape
+            attention_mask = attention_mask.expand(-1, query_tokens, -1)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query).contiguous()
+        key = attn.head_to_batch_dim(key).contiguous()
+        value = attn.head_to_batch_dim(value).contiguous()
+
+        hidden_states = xformers.ops.memory_efficient_attention(
+            query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
+        )
+        hidden_states = hidden_states.to(query.dtype)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class AttnProcessorNPU:
+    r"""
+    Processor for implementing flash attention using torch_npu. Torch_npu supports only fp16 and bf16 data types. If
+    fp32 is used, F.scaled_dot_product_attention will be used for computation, but the acceleration effect on NPU is
+    not significant.
+
+    """
+
+    def __init__(self):
+        if not is_torch_npu_available():
+            raise ImportError("AttnProcessorNPU requires torch_npu extensions and is supported only on npu devices.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        if query.dtype in (torch.float16, torch.bfloat16):
+            hidden_states = torch_npu.npu_fusion_attention(
+                query,
+                key,
+                value,
+                attn.heads,
+                input_layout="BNSD",
+                pse=None,
+                atten_mask=attention_mask,
+                scale=1.0 / math.sqrt(query.shape[-1]),
+                pre_tockens=65536,
+                next_tockens=65536,
+                keep_prob=1.0,
+                sync=False,
+                inner_precise=0,
+            )[0]
+        else:
+            # TODO: add support for attn.scale when we move to Torch 2.1
+            hidden_states = F.scaled_dot_product_attention(
+                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+            )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class AttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class StableAudioAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
+    used in the Stable Audio model. It applies rotary embedding on query and key vector, and allows MHA, GQA or MQA.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "StableAudioAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def apply_partial_rotary_emb(
+        self,
+        x: torch.Tensor,
+        freqs_cis: Tuple[torch.Tensor],
+    ) -> torch.Tensor:
+        from .embeddings import apply_rotary_emb
+
+        rot_dim = freqs_cis[0].shape[-1]
+        x_to_rotate, x_unrotated = x[..., :rot_dim], x[..., rot_dim:]
+
+        x_rotated = apply_rotary_emb(x_to_rotate, freqs_cis, use_real=True, use_real_unbind_dim=-2)
+
+        out = torch.cat((x_rotated, x_unrotated), dim=-1)
+        return out
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        from .embeddings import apply_rotary_emb
+
+        residual = hidden_states
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        head_dim = query.shape[-1] // attn.heads
+        kv_heads = key.shape[-1] // head_dim
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, kv_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, kv_heads, head_dim).transpose(1, 2)
+
+        if kv_heads != attn.heads:
+            # if GQA or MQA, repeat the key/value heads to reach the number of query heads.
+            heads_per_kv_head = attn.heads // kv_heads
+            key = torch.repeat_interleave(key, heads_per_kv_head, dim=1)
+            value = torch.repeat_interleave(value, heads_per_kv_head, dim=1)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Apply RoPE if needed
+        if rotary_emb is not None:
+            query_dtype = query.dtype
+            key_dtype = key.dtype
+            query = query.to(torch.float32)
+            key = key.to(torch.float32)
+
+            rot_dim = rotary_emb[0].shape[-1]
+            query_to_rotate, query_unrotated = query[..., :rot_dim], query[..., rot_dim:]
+            query_rotated = apply_rotary_emb(query_to_rotate, rotary_emb, use_real=True, use_real_unbind_dim=-2)
+
+            query = torch.cat((query_rotated, query_unrotated), dim=-1)
+
+            if not attn.is_cross_attention:
+                key_to_rotate, key_unrotated = key[..., :rot_dim], key[..., rot_dim:]
+                key_rotated = apply_rotary_emb(key_to_rotate, rotary_emb, use_real=True, use_real_unbind_dim=-2)
+
+                key = torch.cat((key_rotated, key_unrotated), dim=-1)
+
+            query = query.to(query_dtype)
+            key = key.to(key_dtype)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class HunyuanAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
+    used in the HunyuanDiT model. It applies a s normalization layer and rotary embedding on query and key vector.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        from .embeddings import apply_rotary_emb
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Apply RoPE if needed
+        if image_rotary_emb is not None:
+            query = apply_rotary_emb(query, image_rotary_emb)
+            if not attn.is_cross_attention:
+                key = apply_rotary_emb(key, image_rotary_emb)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class FusedHunyuanAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0) with fused
+    projection layers. This is used in the HunyuanDiT model. It applies a s normalization layer and rotary embedding on
+    query and key vector.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "FusedHunyuanAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        from .embeddings import apply_rotary_emb
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        if encoder_hidden_states is None:
+            qkv = attn.to_qkv(hidden_states)
+            split_size = qkv.shape[-1] // 3
+            query, key, value = torch.split(qkv, split_size, dim=-1)
+        else:
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+            query = attn.to_q(hidden_states)
+
+            kv = attn.to_kv(encoder_hidden_states)
+            split_size = kv.shape[-1] // 2
+            key, value = torch.split(kv, split_size, dim=-1)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Apply RoPE if needed
+        if image_rotary_emb is not None:
+            query = apply_rotary_emb(query, image_rotary_emb)
+            if not attn.is_cross_attention:
+                key = apply_rotary_emb(key, image_rotary_emb)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class PAGHunyuanAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
+    used in the HunyuanDiT model. It applies a normalization layer and rotary embedding on query and key vector. This
+    variant of the processor employs [Pertubed Attention Guidance](https://arxiv.org/abs/2403.17377).
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "PAGHunyuanAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        from .embeddings import apply_rotary_emb
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        # chunk
+        hidden_states_org, hidden_states_ptb = hidden_states.chunk(2)
+
+        # 1. Original Path
+        batch_size, sequence_length, _ = (
+            hidden_states_org.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states_org = attn.group_norm(hidden_states_org.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states_org)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states_org
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Apply RoPE if needed
+        if image_rotary_emb is not None:
+            query = apply_rotary_emb(query, image_rotary_emb)
+            if not attn.is_cross_attention:
+                key = apply_rotary_emb(key, image_rotary_emb)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states_org = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states_org = hidden_states_org.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states_org = hidden_states_org.to(query.dtype)
+
+        # linear proj
+        hidden_states_org = attn.to_out[0](hidden_states_org)
+        # dropout
+        hidden_states_org = attn.to_out[1](hidden_states_org)
+
+        if input_ndim == 4:
+            hidden_states_org = hidden_states_org.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # 2. Perturbed Path
+        if attn.group_norm is not None:
+            hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)
+
+        hidden_states_ptb = attn.to_v(hidden_states_ptb)
+        hidden_states_ptb = hidden_states_ptb.to(query.dtype)
+
+        # linear proj
+        hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
+        # dropout
+        hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
+
+        if input_ndim == 4:
+            hidden_states_ptb = hidden_states_ptb.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # cat
+        hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class PAGCFGHunyuanAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
+    used in the HunyuanDiT model. It applies a normalization layer and rotary embedding on query and key vector. This
+    variant of the processor employs [Pertubed Attention Guidance](https://arxiv.org/abs/2403.17377).
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "PAGCFGHunyuanAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        from .embeddings import apply_rotary_emb
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        # chunk
+        hidden_states_uncond, hidden_states_org, hidden_states_ptb = hidden_states.chunk(3)
+        hidden_states_org = torch.cat([hidden_states_uncond, hidden_states_org])
+
+        # 1. Original Path
+        batch_size, sequence_length, _ = (
+            hidden_states_org.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states_org = attn.group_norm(hidden_states_org.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states_org)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states_org
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Apply RoPE if needed
+        if image_rotary_emb is not None:
+            query = apply_rotary_emb(query, image_rotary_emb)
+            if not attn.is_cross_attention:
+                key = apply_rotary_emb(key, image_rotary_emb)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states_org = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states_org = hidden_states_org.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states_org = hidden_states_org.to(query.dtype)
+
+        # linear proj
+        hidden_states_org = attn.to_out[0](hidden_states_org)
+        # dropout
+        hidden_states_org = attn.to_out[1](hidden_states_org)
+
+        if input_ndim == 4:
+            hidden_states_org = hidden_states_org.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # 2. Perturbed Path
+        if attn.group_norm is not None:
+            hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)
+
+        hidden_states_ptb = attn.to_v(hidden_states_ptb)
+        hidden_states_ptb = hidden_states_ptb.to(query.dtype)
+
+        # linear proj
+        hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
+        # dropout
+        hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
+
+        if input_ndim == 4:
+            hidden_states_ptb = hidden_states_ptb.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # cat
+        hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class LuminaAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
+    used in the LuminaNextDiT model. It applies a s normalization layer and rotary embedding on query and key vector.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        query_rotary_emb: Optional[torch.Tensor] = None,
+        key_rotary_emb: Optional[torch.Tensor] = None,
+        base_sequence_length: Optional[int] = None,
+    ) -> torch.Tensor:
+        from .embeddings import apply_rotary_emb
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = hidden_states.shape
+
+        # Get Query-Key-Value Pair
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query_dim = query.shape[-1]
+        inner_dim = key.shape[-1]
+        head_dim = query_dim // attn.heads
+        dtype = query.dtype
+
+        # Get key-value heads
+        kv_heads = inner_dim // head_dim
+
+        # Apply Query-Key Norm if needed
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        query = query.view(batch_size, -1, attn.heads, head_dim)
+
+        key = key.view(batch_size, -1, kv_heads, head_dim)
+        value = value.view(batch_size, -1, kv_heads, head_dim)
+
+        # Apply RoPE if needed
+        if query_rotary_emb is not None:
+            query = apply_rotary_emb(query, query_rotary_emb, use_real=False)
+        if key_rotary_emb is not None:
+            key = apply_rotary_emb(key, key_rotary_emb, use_real=False)
+
+        query, key = query.to(dtype), key.to(dtype)
+
+        # Apply proportional attention if true
+        if key_rotary_emb is None:
+            softmax_scale = None
+        else:
+            if base_sequence_length is not None:
+                softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
+            else:
+                softmax_scale = attn.scale
+
+        # perform Grouped-qurey Attention (GQA)
+        n_rep = attn.heads // kv_heads
+        if n_rep >= 1:
+            key = key.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
+            value = value.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
+
+        # scaled_dot_product_attention expects attention_mask shape to be
+        # (batch, heads, source_length, target_length)
+        attention_mask = attention_mask.bool().view(batch_size, 1, 1, -1)
+        attention_mask = attention_mask.expand(-1, attn.heads, sequence_length, -1)
+
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, scale=softmax_scale
+        )
+        hidden_states = hidden_states.transpose(1, 2).to(dtype)
+
+        return hidden_states
+
+
+class FusedAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). It uses
+    fused projection layers. For self-attention modules, all projection matrices (i.e., query, key, value) are fused.
+    For cross-attention modules, key and value projection matrices are fused.
+
+    <Tip warning={true}>
+
+    This API is currently 🧪 experimental in nature and can change in future.
+
+    </Tip>
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "FusedAttnProcessor2_0 requires at least PyTorch 2.0, to use it. Please upgrade PyTorch to > 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        if encoder_hidden_states is None:
+            qkv = attn.to_qkv(hidden_states)
+            split_size = qkv.shape[-1] // 3
+            query, key, value = torch.split(qkv, split_size, dim=-1)
+        else:
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+            query = attn.to_q(hidden_states)
+
+            kv = attn.to_kv(encoder_hidden_states)
+            split_size = kv.shape[-1] // 2
+            key, value = torch.split(kv, split_size, dim=-1)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class CustomDiffusionXFormersAttnProcessor(nn.Module):
+    r"""
+    Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method.
+
+    Args:
+    train_kv (`bool`, defaults to `True`):
+        Whether to newly train the key and value matrices corresponding to the text features.
+    train_q_out (`bool`, defaults to `True`):
+        Whether to newly train query matrices corresponding to the latent image features.
+    hidden_size (`int`, *optional*, defaults to `None`):
+        The hidden size of the attention layer.
+    cross_attention_dim (`int`, *optional*, defaults to `None`):
+        The number of channels in the `encoder_hidden_states`.
+    out_bias (`bool`, defaults to `True`):
+        Whether to include the bias parameter in `train_q_out`.
+    dropout (`float`, *optional*, defaults to 0.0):
+        The dropout probability to use.
+    attention_op (`Callable`, *optional*, defaults to `None`):
+        The base
+        [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to use
+        as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best operator.
+    """
+
+    def __init__(
+        self,
+        train_kv: bool = True,
+        train_q_out: bool = False,
+        hidden_size: Optional[int] = None,
+        cross_attention_dim: Optional[int] = None,
+        out_bias: bool = True,
+        dropout: float = 0.0,
+        attention_op: Optional[Callable] = None,
+    ):
+        super().__init__()
+        self.train_kv = train_kv
+        self.train_q_out = train_q_out
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.attention_op = attention_op
+
+        # `_custom_diffusion` id for easy serialization and loading.
+        if self.train_kv:
+            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        if self.train_q_out:
+            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
+            self.to_out_custom_diffusion = nn.ModuleList([])
+            self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
+            self.to_out_custom_diffusion.append(nn.Dropout(dropout))
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if self.train_q_out:
+            query = self.to_q_custom_diffusion(hidden_states).to(attn.to_q.weight.dtype)
+        else:
+            query = attn.to_q(hidden_states.to(attn.to_q.weight.dtype))
+
+        if encoder_hidden_states is None:
+            crossattn = False
+            encoder_hidden_states = hidden_states
+        else:
+            crossattn = True
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        if self.train_kv:
+            key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype))
+            value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype))
+            key = key.to(attn.to_q.weight.dtype)
+            value = value.to(attn.to_q.weight.dtype)
+        else:
+            key = attn.to_k(encoder_hidden_states)
+            value = attn.to_v(encoder_hidden_states)
+
+        if crossattn:
+            detach = torch.ones_like(key)
+            detach[:, :1, :] = detach[:, :1, :] * 0.0
+            key = detach * key + (1 - detach) * key.detach()
+            value = detach * value + (1 - detach) * value.detach()
+
+        query = attn.head_to_batch_dim(query).contiguous()
+        key = attn.head_to_batch_dim(key).contiguous()
+        value = attn.head_to_batch_dim(value).contiguous()
+
+        hidden_states = xformers.ops.memory_efficient_attention(
+            query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
+        )
+        hidden_states = hidden_states.to(query.dtype)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        if self.train_q_out:
+            # linear proj
+            hidden_states = self.to_out_custom_diffusion[0](hidden_states)
+            # dropout
+            hidden_states = self.to_out_custom_diffusion[1](hidden_states)
+        else:
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+class CustomDiffusionAttnProcessor2_0(nn.Module):
+    r"""
+    Processor for implementing attention for the Custom Diffusion method using PyTorch 2.0’s memory-efficient scaled
+    dot-product attention.
+
+    Args:
+        train_kv (`bool`, defaults to `True`):
+            Whether to newly train the key and value matrices corresponding to the text features.
+        train_q_out (`bool`, defaults to `True`):
+            Whether to newly train query matrices corresponding to the latent image features.
+        hidden_size (`int`, *optional*, defaults to `None`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*, defaults to `None`):
+            The number of channels in the `encoder_hidden_states`.
+        out_bias (`bool`, defaults to `True`):
+            Whether to include the bias parameter in `train_q_out`.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
+    """
+
+    def __init__(
+        self,
+        train_kv: bool = True,
+        train_q_out: bool = True,
+        hidden_size: Optional[int] = None,
+        cross_attention_dim: Optional[int] = None,
+        out_bias: bool = True,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        self.train_kv = train_kv
+        self.train_q_out = train_q_out
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+
+        # `_custom_diffusion` id for easy serialization and loading.
+        if self.train_kv:
+            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        if self.train_q_out:
+            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
+            self.to_out_custom_diffusion = nn.ModuleList([])
+            self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
+            self.to_out_custom_diffusion.append(nn.Dropout(dropout))
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if self.train_q_out:
+            query = self.to_q_custom_diffusion(hidden_states)
+        else:
+            query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            crossattn = False
+            encoder_hidden_states = hidden_states
+        else:
+            crossattn = True
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        if self.train_kv:
+            key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype))
+            value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype))
+            key = key.to(attn.to_q.weight.dtype)
+            value = value.to(attn.to_q.weight.dtype)
+
+        else:
+            key = attn.to_k(encoder_hidden_states)
+            value = attn.to_v(encoder_hidden_states)
+
+        if crossattn:
+            detach = torch.ones_like(key)
+            detach[:, :1, :] = detach[:, :1, :] * 0.0
+            key = detach * key + (1 - detach) * key.detach()
+            value = detach * value + (1 - detach) * value.detach()
+
+        inner_dim = hidden_states.shape[-1]
+
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        if self.train_q_out:
+            # linear proj
+            hidden_states = self.to_out_custom_diffusion[0](hidden_states)
+            # dropout
+            hidden_states = self.to_out_custom_diffusion[1](hidden_states)
+        else:
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+class SlicedAttnProcessor:
+    r"""
+    Processor for implementing sliced attention.
+
+    Args:
+        slice_size (`int`, *optional*):
+            The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and
+            `attention_head_dim` must be a multiple of the `slice_size`.
+    """
+
+    def __init__(self, slice_size: int):
+        self.slice_size = slice_size
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+        dim = query.shape[-1]
+        query = attn.head_to_batch_dim(query)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        batch_size_attention, query_tokens, _ = query.shape
+        hidden_states = torch.zeros(
+            (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype
+        )
+
+        for i in range((batch_size_attention - 1) // self.slice_size + 1):
+            start_idx = i * self.slice_size
+            end_idx = (i + 1) * self.slice_size
+
+            query_slice = query[start_idx:end_idx]
+            key_slice = key[start_idx:end_idx]
+            attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
+
+            attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
+
+            attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
+
+            hidden_states[start_idx:end_idx] = attn_slice
+
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class SlicedAttnAddedKVProcessor:
+    r"""
+    Processor for implementing sliced attention with extra learnable key and value matrices for the text encoder.
+
+    Args:
+        slice_size (`int`, *optional*):
+            The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and
+            `attention_head_dim` must be a multiple of the `slice_size`.
+    """
+
+    def __init__(self, slice_size):
+        self.slice_size = slice_size
+
+    def __call__(
+        self,
+        attn: "Attention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
+
+        batch_size, sequence_length, _ = hidden_states.shape
+
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+        dim = query.shape[-1]
+        query = attn.head_to_batch_dim(query)
+
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+
+        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
+        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
+
+        if not attn.only_cross_attention:
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            key = attn.head_to_batch_dim(key)
+            value = attn.head_to_batch_dim(value)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
+        else:
+            key = encoder_hidden_states_key_proj
+            value = encoder_hidden_states_value_proj
+
+        batch_size_attention, query_tokens, _ = query.shape
+        hidden_states = torch.zeros(
+            (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype
+        )
+
+        for i in range((batch_size_attention - 1) // self.slice_size + 1):
+            start_idx = i * self.slice_size
+            end_idx = (i + 1) * self.slice_size
+
+            query_slice = query[start_idx:end_idx]
+            key_slice = key[start_idx:end_idx]
+            attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
+
+            attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
+
+            attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
+
+            hidden_states[start_idx:end_idx] = attn_slice
+
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
+class SpatialNorm(nn.Module):
+    """
+    Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002.
+
+    Args:
+        f_channels (`int`):
+            The number of channels for input to group normalization layer, and output of the spatial norm layer.
+        zq_channels (`int`):
+            The number of channels for the quantized vector as described in the paper.
+    """
+
+    def __init__(
+        self,
+        f_channels: int,
+        zq_channels: int,
+    ):
+        super().__init__()
+        self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=32, eps=1e-6, affine=True)
+        self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
+        self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, f: torch.Tensor, zq: torch.Tensor) -> torch.Tensor:
+        f_size = f.shape[-2:]
+        zq = F.interpolate(zq, size=f_size, mode="nearest")
+        norm_f = self.norm_layer(f)
+        new_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
+        return new_f
+
+
+class IPAdapterAttnProcessor(nn.Module):
+    r"""
+    Attention processor for Multiple IP-Adapters.
+
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        num_tokens (`int`, `Tuple[int]` or `List[int]`, defaults to `(4,)`):
+            The context length of the image features.
+        scale (`float` or List[`float`], defaults to 1.0):
+            the weight scale of image prompt.
+    """
+
+    def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0):
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+
+        if not isinstance(num_tokens, (tuple, list)):
+            num_tokens = [num_tokens]
+        self.num_tokens = num_tokens
+
+        if not isinstance(scale, list):
+            scale = [scale] * len(num_tokens)
+        if len(scale) != len(num_tokens):
+            raise ValueError("`scale` should be a list of integers with the same length as `num_tokens`.")
+        self.scale = scale
+
+        self.to_k_ip = nn.ModuleList(
+            [nn.Linear(cross_attention_dim, hidden_size, bias=False) for _ in range(len(num_tokens))]
+        )
+        self.to_v_ip = nn.ModuleList(
+            [nn.Linear(cross_attention_dim, hidden_size, bias=False) for _ in range(len(num_tokens))]
+        )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        scale: float = 1.0,
+        ip_adapter_masks: Optional[torch.Tensor] = None,
+    ):
+        residual = hidden_states
+
+        # separate ip_hidden_states from encoder_hidden_states
+        if encoder_hidden_states is not None:
+            if isinstance(encoder_hidden_states, tuple):
+                encoder_hidden_states, ip_hidden_states = encoder_hidden_states
+            else:
+                deprecation_message = (
+                    "You have passed a tensor as `encoder_hidden_states`. This is deprecated and will be removed in a future release."
+                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to suppress this warning."
+                )
+                deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
+                end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
+                encoder_hidden_states, ip_hidden_states = (
+                    encoder_hidden_states[:, :end_pos, :],
+                    [encoder_hidden_states[:, end_pos:, :]],
+                )
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        if ip_adapter_masks is not None:
+            if not isinstance(ip_adapter_masks, List):
+                # for backward compatibility, we accept `ip_adapter_mask` as a tensor of shape [num_ip_adapter, 1, height, width]
+                ip_adapter_masks = list(ip_adapter_masks.unsqueeze(1))
+            if not (len(ip_adapter_masks) == len(self.scale) == len(ip_hidden_states)):
+                raise ValueError(
+                    f"Length of ip_adapter_masks array ({len(ip_adapter_masks)}) must match "
+                    f"length of self.scale array ({len(self.scale)}) and number of ip_hidden_states "
+                    f"({len(ip_hidden_states)})"
+                )
+            else:
+                for index, (mask, scale, ip_state) in enumerate(zip(ip_adapter_masks, self.scale, ip_hidden_states)):
+                    if not isinstance(mask, torch.Tensor) or mask.ndim != 4:
+                        raise ValueError(
+                            "Each element of the ip_adapter_masks array should be a tensor with shape "
+                            "[1, num_images_for_ip_adapter, height, width]."
+                            " Please use `IPAdapterMaskProcessor` to preprocess your mask"
+                        )
+                    if mask.shape[1] != ip_state.shape[1]:
+                        raise ValueError(
+                            f"Number of masks ({mask.shape[1]}) does not match "
+                            f"number of ip images ({ip_state.shape[1]}) at index {index}"
+                        )
+                    if isinstance(scale, list) and not len(scale) == mask.shape[1]:
+                        raise ValueError(
+                            f"Number of masks ({mask.shape[1]}) does not match "
+                            f"number of scales ({len(scale)}) at index {index}"
+                        )
+        else:
+            ip_adapter_masks = [None] * len(self.scale)
+
+        # for ip-adapter
+        for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip(
+            ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks
+        ):
+            skip = False
+            if isinstance(scale, list):
+                if all(s == 0 for s in scale):
+                    skip = True
+            elif scale == 0:
+                skip = True
+            if not skip:
+                if mask is not None:
+                    if not isinstance(scale, list):
+                        scale = [scale] * mask.shape[1]
+
+                    current_num_images = mask.shape[1]
+                    for i in range(current_num_images):
+                        ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :])
+                        ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :])
+
+                        ip_key = attn.head_to_batch_dim(ip_key)
+                        ip_value = attn.head_to_batch_dim(ip_value)
+
+                        ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
+                        _current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
+                        _current_ip_hidden_states = attn.batch_to_head_dim(_current_ip_hidden_states)
+
+                        mask_downsample = IPAdapterMaskProcessor.downsample(
+                            mask[:, i, :, :],
+                            batch_size,
+                            _current_ip_hidden_states.shape[1],
+                            _current_ip_hidden_states.shape[2],
+                        )
+
+                        mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device)
+
+                        hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample)
+                else:
+                    ip_key = to_k_ip(current_ip_hidden_states)
+                    ip_value = to_v_ip(current_ip_hidden_states)
+
+                    ip_key = attn.head_to_batch_dim(ip_key)
+                    ip_value = attn.head_to_batch_dim(ip_value)
+
+                    ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
+                    current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
+                    current_ip_hidden_states = attn.batch_to_head_dim(current_ip_hidden_states)
+
+                    hidden_states = hidden_states + scale * current_ip_hidden_states
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class IPAdapterAttnProcessor2_0(torch.nn.Module):
+    r"""
+    Attention processor for IP-Adapter for PyTorch 2.0.
+
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        num_tokens (`int`, `Tuple[int]` or `List[int]`, defaults to `(4,)`):
+            The context length of the image features.
+        scale (`float` or `List[float]`, defaults to 1.0):
+            the weight scale of image prompt.
+    """
+
+    def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0):
+        super().__init__()
+
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+
+        if not isinstance(num_tokens, (tuple, list)):
+            num_tokens = [num_tokens]
+        self.num_tokens = num_tokens
+
+        if not isinstance(scale, list):
+            scale = [scale] * len(num_tokens)
+        if len(scale) != len(num_tokens):
+            raise ValueError("`scale` should be a list of integers with the same length as `num_tokens`.")
+        self.scale = scale
+
+        self.to_k_ip = nn.ModuleList(
+            [nn.Linear(cross_attention_dim, hidden_size, bias=False) for _ in range(len(num_tokens))]
+        )
+        self.to_v_ip = nn.ModuleList(
+            [nn.Linear(cross_attention_dim, hidden_size, bias=False) for _ in range(len(num_tokens))]
+        )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        scale: float = 1.0,
+        ip_adapter_masks: Optional[torch.Tensor] = None,
+    ):
+        residual = hidden_states
+
+        # separate ip_hidden_states from encoder_hidden_states
+        if encoder_hidden_states is not None:
+            if isinstance(encoder_hidden_states, tuple):
+                encoder_hidden_states, ip_hidden_states = encoder_hidden_states
+            else:
+                deprecation_message = (
+                    "You have passed a tensor as `encoder_hidden_states`. This is deprecated and will be removed in a future release."
+                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to suppress this warning."
+                )
+                deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
+                end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
+                encoder_hidden_states, ip_hidden_states = (
+                    encoder_hidden_states[:, :end_pos, :],
+                    [encoder_hidden_states[:, end_pos:, :]],
+                )
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        if ip_adapter_masks is not None:
+            if not isinstance(ip_adapter_masks, List):
+                # for backward compatibility, we accept `ip_adapter_mask` as a tensor of shape [num_ip_adapter, 1, height, width]
+                ip_adapter_masks = list(ip_adapter_masks.unsqueeze(1))
+            if not (len(ip_adapter_masks) == len(self.scale) == len(ip_hidden_states)):
+                raise ValueError(
+                    f"Length of ip_adapter_masks array ({len(ip_adapter_masks)}) must match "
+                    f"length of self.scale array ({len(self.scale)}) and number of ip_hidden_states "
+                    f"({len(ip_hidden_states)})"
+                )
+            else:
+                for index, (mask, scale, ip_state) in enumerate(zip(ip_adapter_masks, self.scale, ip_hidden_states)):
+                    if not isinstance(mask, torch.Tensor) or mask.ndim != 4:
+                        raise ValueError(
+                            "Each element of the ip_adapter_masks array should be a tensor with shape "
+                            "[1, num_images_for_ip_adapter, height, width]."
+                            " Please use `IPAdapterMaskProcessor` to preprocess your mask"
+                        )
+                    if mask.shape[1] != ip_state.shape[1]:
+                        raise ValueError(
+                            f"Number of masks ({mask.shape[1]}) does not match "
+                            f"number of ip images ({ip_state.shape[1]}) at index {index}"
+                        )
+                    if isinstance(scale, list) and not len(scale) == mask.shape[1]:
+                        raise ValueError(
+                            f"Number of masks ({mask.shape[1]}) does not match "
+                            f"number of scales ({len(scale)}) at index {index}"
+                        )
+        else:
+            ip_adapter_masks = [None] * len(self.scale)
+
+        # for ip-adapter
+        for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip(
+            ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks
+        ):
+            skip = False
+            if isinstance(scale, list):
+                if all(s == 0 for s in scale):
+                    skip = True
+            elif scale == 0:
+                skip = True
+            if not skip:
+                if mask is not None:
+                    if not isinstance(scale, list):
+                        scale = [scale] * mask.shape[1]
+
+                    current_num_images = mask.shape[1]
+                    for i in range(current_num_images):
+                        ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :])
+                        ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :])
+
+                        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+                        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+                        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+                        # TODO: add support for attn.scale when we move to Torch 2.1
+                        _current_ip_hidden_states = F.scaled_dot_product_attention(
+                            query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+                        )
+
+                        _current_ip_hidden_states = _current_ip_hidden_states.transpose(1, 2).reshape(
+                            batch_size, -1, attn.heads * head_dim
+                        )
+                        _current_ip_hidden_states = _current_ip_hidden_states.to(query.dtype)
+
+                        mask_downsample = IPAdapterMaskProcessor.downsample(
+                            mask[:, i, :, :],
+                            batch_size,
+                            _current_ip_hidden_states.shape[1],
+                            _current_ip_hidden_states.shape[2],
+                        )
+
+                        mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device)
+                        hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample)
+                else:
+                    ip_key = to_k_ip(current_ip_hidden_states)
+                    ip_value = to_v_ip(current_ip_hidden_states)
+
+                    ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+                    ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+                    # the output of sdp = (batch, num_heads, seq_len, head_dim)
+                    # TODO: add support for attn.scale when we move to Torch 2.1
+                    current_ip_hidden_states = F.scaled_dot_product_attention(
+                        query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+                    )
+
+                    current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
+                        batch_size, -1, attn.heads * head_dim
+                    )
+                    current_ip_hidden_states = current_ip_hidden_states.to(query.dtype)
+
+                    hidden_states = hidden_states + scale * current_ip_hidden_states
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class PAGIdentitySelfAttnProcessor2_0:
+    r"""
+    Processor for implementing PAG using scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    PAG reference: https://arxiv.org/abs/2403.17377
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "PAGIdentitySelfAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        # chunk
+        hidden_states_org, hidden_states_ptb = hidden_states.chunk(2)
+
+        # original path
+        batch_size, sequence_length, _ = hidden_states_org.shape
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states_org = attn.group_norm(hidden_states_org.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states_org)
+        key = attn.to_k(hidden_states_org)
+        value = attn.to_v(hidden_states_org)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states_org = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states_org = hidden_states_org.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states_org = hidden_states_org.to(query.dtype)
+
+        # linear proj
+        hidden_states_org = attn.to_out[0](hidden_states_org)
+        # dropout
+        hidden_states_org = attn.to_out[1](hidden_states_org)
+
+        if input_ndim == 4:
+            hidden_states_org = hidden_states_org.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # perturbed path (identity attention)
+        batch_size, sequence_length, _ = hidden_states_ptb.shape
+
+        if attn.group_norm is not None:
+            hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)
+
+        hidden_states_ptb = attn.to_v(hidden_states_ptb)
+        hidden_states_ptb = hidden_states_ptb.to(query.dtype)
+
+        # linear proj
+        hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
+        # dropout
+        hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
+
+        if input_ndim == 4:
+            hidden_states_ptb = hidden_states_ptb.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # cat
+        hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class PAGCFGIdentitySelfAttnProcessor2_0:
+    r"""
+    Processor for implementing PAG using scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    PAG reference: https://arxiv.org/abs/2403.17377
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "PAGCFGIdentitySelfAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        # chunk
+        hidden_states_uncond, hidden_states_org, hidden_states_ptb = hidden_states.chunk(3)
+        hidden_states_org = torch.cat([hidden_states_uncond, hidden_states_org])
+
+        # original path
+        batch_size, sequence_length, _ = hidden_states_org.shape
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states_org = attn.group_norm(hidden_states_org.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states_org)
+        key = attn.to_k(hidden_states_org)
+        value = attn.to_v(hidden_states_org)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states_org = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states_org = hidden_states_org.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states_org = hidden_states_org.to(query.dtype)
+
+        # linear proj
+        hidden_states_org = attn.to_out[0](hidden_states_org)
+        # dropout
+        hidden_states_org = attn.to_out[1](hidden_states_org)
+
+        if input_ndim == 4:
+            hidden_states_org = hidden_states_org.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # perturbed path (identity attention)
+        batch_size, sequence_length, _ = hidden_states_ptb.shape
+
+        if attn.group_norm is not None:
+            hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)
+
+        value = attn.to_v(hidden_states_ptb)
+        hidden_states_ptb = value
+        hidden_states_ptb = hidden_states_ptb.to(query.dtype)
+
+        # linear proj
+        hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
+        # dropout
+        hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
+
+        if input_ndim == 4:
+            hidden_states_ptb = hidden_states_ptb.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        # cat
+        hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class LoRAAttnProcessor:
+    def __init__(self):
+        pass
+
+
+class LoRAAttnProcessor2_0:
+    def __init__(self):
+        pass
+
+
+class LoRAXFormersAttnProcessor:
+    def __init__(self):
+        pass
+
+
+class LoRAAttnAddedKVProcessor:
+    def __init__(self):
+        pass
+
+
+class FluxSingleAttnProcessor2_0(FluxAttnProcessor2_0):
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    def __init__(self):
+        deprecation_message = "`FluxSingleAttnProcessor2_0` is deprecated and will be removed in a future version. Please use `FluxAttnProcessor2_0` instead."
+        deprecate("FluxSingleAttnProcessor2_0", "0.32.0", deprecation_message)
+        super().__init__()
+
+
+
+@maybe_allow_in_graph
+class CausalAttention(nn.Module):
+    r"""
+    A cross attention layer.
+
+    Parameters:
+        query_dim (`int`):
+            The number of channels in the query.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
+        heads (`int`,  *optional*, defaults to 8):
+            The number of heads to use for multi-head attention.
+        kv_heads (`int`,  *optional*, defaults to `None`):
+            The number of key and value heads to use for multi-head attention. Defaults to `heads`. If
+            `kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use Multi
+            Query Attention (MQA) otherwise GQA is used.
+        dim_head (`int`,  *optional*, defaults to 64):
+            The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
+        bias (`bool`, *optional*, defaults to False):
+            Set to `True` for the query, key, and value linear layers to contain a bias parameter.
+        upcast_attention (`bool`, *optional*, defaults to False):
+            Set to `True` to upcast the attention computation to `float32`.
+        upcast_softmax (`bool`, *optional*, defaults to False):
+            Set to `True` to upcast the softmax computation to `float32`.
+        cross_attention_norm (`str`, *optional*, defaults to `None`):
+            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
+        cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use for the group norm in the cross attention.
+        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the added key and value projections. If `None`, no projection is used.
+        norm_num_groups (`int`, *optional*, defaults to `None`):
+            The number of groups to use for the group norm in the attention.
+        spatial_norm_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the spatial normalization.
+        out_bias (`bool`, *optional*, defaults to `True`):
+            Set to `True` to use a bias in the output linear layer.
+        scale_qk (`bool`, *optional*, defaults to `True`):
+            Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
+        only_cross_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
+            `added_kv_proj_dim` is not `None`.
+        eps (`float`, *optional*, defaults to 1e-5):
+            An additional value added to the denominator in group normalization that is used for numerical stability.
+        rescale_output_factor (`float`, *optional*, defaults to 1.0):
+            A factor to rescale the output by dividing it with this value.
+        residual_connection (`bool`, *optional*, defaults to `False`):
+            Set to `True` to add the residual connection to the output.
+        _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
+            Set to `True` if the attention block is loaded from a deprecated state dict.
+        processor (`AttnProcessor`, *optional*, defaults to `None`):
+            The attention processor to use. If `None`, defaults to `CausalAttnProcessor2_0` if `torch 2.x` is used and
+            `AttnProcessor` otherwise.
+    """
+
+    def __init__(
+        self,
+        query_dim: int,
+        cross_attention_dim: Optional[int] = None,
+        heads: int = 8,
+        kv_heads: Optional[int] = None,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        bias: bool = False,
+        upcast_attention: bool = False,
+        upcast_softmax: bool = False,
+        cross_attention_norm: Optional[str] = None,
+        cross_attention_norm_num_groups: int = 32,
+        qk_norm: Optional[str] = None,
+        added_kv_proj_dim: Optional[int] = None,
+        added_proj_bias: Optional[bool] = True,
+        norm_num_groups: Optional[int] = None,
+        spatial_norm_dim: Optional[int] = None,
+        out_bias: bool = True,
+        scale_qk: bool = True,
+        only_cross_attention: bool = False,
+        eps: float = 1e-5,
+        rescale_output_factor: float = 1.0,
+        residual_connection: bool = False,
+        _from_deprecated_attn_block: bool = False,
+        processor: Optional["AttnProcessor"] = None,
+        out_dim: int = None,
+        context_pre_only=None,
+        pre_only=False,
+    ):
+        super().__init__()
+
+        # To prevent circular import.
+        from .normalization import FP32LayerNorm, RMSNorm
+
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
+        self.query_dim = query_dim
+        self.use_bias = bias
+        self.is_cross_attention = cross_attention_dim is not None
+        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
+        self.upcast_attention = upcast_attention
+        self.upcast_softmax = upcast_softmax
+        self.rescale_output_factor = rescale_output_factor
+        self.residual_connection = residual_connection
+        self.dropout = dropout
+        self.fused_projections = False
+        self.out_dim = out_dim if out_dim is not None else query_dim
+        self.context_pre_only = context_pre_only
+        self.pre_only = pre_only
+
+        # we make use of this private variable to know whether this class is loaded
+        # with an deprecated state dict so that we can convert it on the fly
+        self._from_deprecated_attn_block = _from_deprecated_attn_block
+
+        self.scale_qk = scale_qk
+        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
+
+        self.heads = out_dim // dim_head if out_dim is not None else heads
+        # for slice_size > 0 the attention score computation
+        # is split across the batch axis to save memory
+        # You can set slice_size with `set_attention_slice`
+        self.sliceable_head_dim = heads
+
+        self.added_kv_proj_dim = added_kv_proj_dim
+        self.only_cross_attention = only_cross_attention
+
+        if self.added_kv_proj_dim is None and self.only_cross_attention:
+            raise ValueError(
+                "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
+            )
+
+        if norm_num_groups is not None:
+            self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
+        else:
+            self.group_norm = None
+
+        if spatial_norm_dim is not None:
+            self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
+        else:
+            self.spatial_norm = None
+
+        if qk_norm is None:
+            self.norm_q = None
+            self.norm_k = None
+        elif qk_norm == "layer_norm":
+            self.norm_q = nn.LayerNorm(dim_head, eps=eps)
+            self.norm_k = nn.LayerNorm(dim_head, eps=eps)
+        elif qk_norm == "fp32_layer_norm":
+            self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+            self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+        elif qk_norm == "layer_norm_across_heads":
+            # Lumina applys qk norm across all heads
+            self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps)
+            self.norm_k = nn.LayerNorm(dim_head * kv_heads, eps=eps)
+        elif qk_norm == "rms_norm":
+            self.norm_q = RMSNorm(dim_head, eps=eps)
+            self.norm_k = RMSNorm(dim_head, eps=eps)
+        else:
+            raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None or 'layer_norm'")
+
+        if cross_attention_norm is None:
+            self.norm_cross = None
+        elif cross_attention_norm == "layer_norm":
+            self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
+        elif cross_attention_norm == "group_norm":
+            if self.added_kv_proj_dim is not None:
+                # The given `encoder_hidden_states` are initially of shape
+                # (batch_size, seq_len, added_kv_proj_dim) before being projected
+                # to (batch_size, seq_len, cross_attention_dim). The norm is applied
+                # before the projection, so we need to use `added_kv_proj_dim` as
+                # the number of channels for the group norm.
+                norm_cross_num_channels = added_kv_proj_dim
+            else:
+                norm_cross_num_channels = self.cross_attention_dim
+
+            self.norm_cross = nn.GroupNorm(
+                num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
+            )
+        else:
+            raise ValueError(
+                f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
+            )
+
+        self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
+
+        if not self.only_cross_attention:
+            # only relevant for the `AddedKVProcessor` classes
+            self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
+            self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
+        else:
+            self.to_k = None
+            self.to_v = None
+
+        self.added_proj_bias = added_proj_bias
+        if self.added_kv_proj_dim is not None:
+            self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
+            self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
+            if self.context_pre_only is not None:
+                self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
+
+        if not self.pre_only:
+            self.to_out = nn.ModuleList([])
+            self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
+            self.to_out.append(nn.Dropout(dropout))
+
+        if self.context_pre_only is not None and not self.context_pre_only:
+            self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)
+
+        if qk_norm is not None and added_kv_proj_dim is not None:
+            if qk_norm == "fp32_layer_norm":
+                self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+                self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+            elif qk_norm == "rms_norm":
+                self.norm_added_q = RMSNorm(dim_head, eps=eps)
+                self.norm_added_k = RMSNorm(dim_head, eps=eps)
+        else:
+            self.norm_added_q = None
+            self.norm_added_k = None
+
+        # set attention processor
+        # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+        # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+        # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+        if processor is None:
+            processor = (
+                CausalAttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+        self.set_processor(processor)
+
+    def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
+        r"""
+        Set whether to use npu flash attention from `torch_npu` or not.
+
+        """
+        if use_npu_flash_attention:
+            processor = AttnProcessorNPU()
+        else:
+            # set attention processor
+            # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+            # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+            # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+            processor = (
+                CausalAttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+        self.set_processor(processor)
+
+    def set_use_memory_efficient_attention_xformers(
+        self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
+    ) -> None:
+        r"""
+        Set whether to use memory efficient attention from `xformers` or not.
+
+        Args:
+            use_memory_efficient_attention_xformers (`bool`):
+                Whether to use memory efficient attention from `xformers` or not.
+            attention_op (`Callable`, *optional*):
+                The attention operation to use. Defaults to `None` which uses the default attention operation from
+                `xformers`.
+        """
+        is_custom_diffusion = hasattr(self, "processor") and isinstance(
+            self.processor,
+            (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0),
+        )
+        is_added_kv_processor = hasattr(self, "processor") and isinstance(
+            self.processor,
+            (
+                AttnAddedKVProcessor,
+                AttnAddedKVProcessor2_0,
+                SlicedAttnAddedKVProcessor,
+                XFormersAttnAddedKVProcessor,
+            ),
+        )
+
+        if use_memory_efficient_attention_xformers:
+            if is_added_kv_processor and is_custom_diffusion:
+                raise NotImplementedError(
+                    f"Memory efficient attention is currently not supported for custom diffusion for attention processor type {self.processor}"
+                )
+            if not is_xformers_available():
+                raise ModuleNotFoundError(
+                    (
+                        "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
+                        " xformers"
+                    ),
+                    name="xformers",
+                )
+            elif not torch.cuda.is_available():
+                raise ValueError(
+                    "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
+                    " only available for GPU "
+                )
+            else:
+                try:
+                    # Make sure we can run the memory efficient attention
+                    _ = xformers.ops.memory_efficient_attention(
+                        torch.randn((1, 2, 40), device="cuda"),
+                        torch.randn((1, 2, 40), device="cuda"),
+                        torch.randn((1, 2, 40), device="cuda"),
+                    )
+                except Exception as e:
+                    raise e
+
+            if is_custom_diffusion:
+                processor = CustomDiffusionXFormersAttnProcessor(
+                    train_kv=self.processor.train_kv,
+                    train_q_out=self.processor.train_q_out,
+                    hidden_size=self.processor.hidden_size,
+                    cross_attention_dim=self.processor.cross_attention_dim,
+                    attention_op=attention_op,
+                )
+                processor.load_state_dict(self.processor.state_dict())
+                if hasattr(self.processor, "to_k_custom_diffusion"):
+                    processor.to(self.processor.to_k_custom_diffusion.weight.device)
+            elif is_added_kv_processor:
+                # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
+                # which uses this type of cross attention ONLY because the attention mask of format
+                # [0, ..., -10.000, ..., 0, ...,] is not supported
+                # throw warning
+                logger.info(
+                    "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation."
+                )
+                processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
+            else:
+                processor = XFormersAttnProcessor(attention_op=attention_op)
+        else:
+            if is_custom_diffusion:
+                attn_processor_class = (
+                    CustomDiffusionAttnProcessor2_0
+                    if hasattr(F, "scaled_dot_product_attention")
+                    else CustomDiffusionAttnProcessor
+                )
+                processor = attn_processor_class(
+                    train_kv=self.processor.train_kv,
+                    train_q_out=self.processor.train_q_out,
+                    hidden_size=self.processor.hidden_size,
+                    cross_attention_dim=self.processor.cross_attention_dim,
+                )
+                processor.load_state_dict(self.processor.state_dict())
+                if hasattr(self.processor, "to_k_custom_diffusion"):
+                    processor.to(self.processor.to_k_custom_diffusion.weight.device)
+            else:
+                # set attention processor
+                # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+                # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+                # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+                processor = (
+                    CausalAttnProcessor2_0()
+                    if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
+                    else AttnProcessor()
+                )
+
+        self.set_processor(processor)
+
+    def set_attention_slice(self, slice_size: int) -> None:
+        r"""
+        Set the slice size for attention computation.
+
+        Args:
+            slice_size (`int`):
+                The slice size for attention computation.
+        """
+        if slice_size is not None and slice_size > self.sliceable_head_dim:
+            raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
+
+        if slice_size is not None and self.added_kv_proj_dim is not None:
+            processor = SlicedAttnAddedKVProcessor(slice_size)
+        elif slice_size is not None:
+            processor = SlicedAttnProcessor(slice_size)
+        elif self.added_kv_proj_dim is not None:
+            processor = AttnAddedKVProcessor()
+        else:
+            # set attention processor
+            # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+            # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+            # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+            processor = (
+                CausalAttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+
+        self.set_processor(processor)
+
+    def set_processor(self, processor: "AttnProcessor") -> None:
+        r"""
+        Set the attention processor to use.
+
+        Args:
+            processor (`AttnProcessor`):
+                The attention processor to use.
+        """
+        # if current processor is in `self._modules` and if passed `processor` is not, we need to
+        # pop `processor` from `self._modules`
+        if (
+            hasattr(self, "processor")
+            and isinstance(self.processor, torch.nn.Module)
+            and not isinstance(processor, torch.nn.Module)
+        ):
+            logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
+            self._modules.pop("processor")
+
+        self.processor = processor
+
+    def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
+        r"""
+        Get the attention processor in use.
+
+        Args:
+            return_deprecated_lora (`bool`, *optional*, defaults to `False`):
+                Set to `True` to return the deprecated LoRA attention processor.
+
+        Returns:
+            "AttentionProcessor": The attention processor in use.
+        """
+        if not return_deprecated_lora:
+            return self.processor
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor, # (b, l, c)
+        # cond_hidden_states: Optional[torch.Tensor] = None, # (b, l, c)
+        ref_hidden_states: Optional[torch.Tensor] = None, # (b, n_ref, l, c)
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **cross_attention_kwargs,
+    ) -> torch.Tensor:
+        r"""
+        The forward method of the `Attention` class.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                The hidden states of the query.
+            encoder_hidden_states (`torch.Tensor`, *optional*):
+                The hidden states of the encoder.
+            attention_mask (`torch.Tensor`, *optional*):
+                The attention mask to use. If `None`, no mask is applied.
+            **cross_attention_kwargs:
+                Additional keyword arguments to pass along to the cross attention.
+
+        Returns:
+            `torch.Tensor`: The output of the attention layer.
+        """
+        # The `Attention` class can call different attention processors / attention functions
+        # here we simply pass along all tensors to the selected processor class
+        # For standard processors that are defined here, `**cross_attention_kwargs` is empty
+
+        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
+        quiet_attn_parameters = {"ip_adapter_masks"}
+        unused_kwargs = [
+            k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters
+        ]
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"cross_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
+            )
+        cross_attention_kwargs = {k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters}
+
+        return self.processor(
+            self,
+            hidden_states, # (b, l, c)
+            # cond_hidden_states, # (b, l, c)
+            ref_hidden_states, # (b, n_ref, l, c)
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+
+    def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
+        r"""
+        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads`
+        is the number of heads initialized while constructing the `Attention` class.
+
+        Args:
+            tensor (`torch.Tensor`): The tensor to reshape.
+
+        Returns:
+            `torch.Tensor`: The reshaped tensor.
+        """
+        head_size = self.heads
+        batch_size, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
+        return tensor
+
+    def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
+        r"""
+        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size, seq_len, heads, dim // heads]` `heads` is
+        the number of heads initialized while constructing the `Attention` class.
+
+        Args:
+            tensor (`torch.Tensor`): The tensor to reshape.
+            out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor. If `3`, the tensor is
+                reshaped to `[batch_size * heads, seq_len, dim // heads]`.
+
+        Returns:
+            `torch.Tensor`: The reshaped tensor.
+        """
+        head_size = self.heads
+        if tensor.ndim == 3:
+            batch_size, seq_len, dim = tensor.shape
+            extra_dim = 1
+        else:
+            batch_size, extra_dim, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(batch_size, seq_len * extra_dim, head_size, dim // head_size)
+        tensor = tensor.permute(0, 2, 1, 3)
+
+        if out_dim == 3:
+            tensor = tensor.reshape(batch_size * head_size, seq_len * extra_dim, dim // head_size)
+
+        return tensor
+
+    def get_attention_scores(
+        self, query: torch.Tensor, key: torch.Tensor, attention_mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        r"""
+        Compute the attention scores.
+
+        Args:
+            query (`torch.Tensor`): The query tensor.
+            key (`torch.Tensor`): The key tensor.
+            attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
+
+        Returns:
+            `torch.Tensor`: The attention probabilities/scores.
+        """
+        dtype = query.dtype
+        if self.upcast_attention:
+            query = query.float()
+            key = key.float()
+
+        if attention_mask is None:
+            baddbmm_input = torch.empty(
+                query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
+            )
+            beta = 0
+        else:
+            baddbmm_input = attention_mask
+            beta = 1
+
+        attention_scores = torch.baddbmm(
+            baddbmm_input,
+            query,
+            key.transpose(-1, -2),
+            beta=beta,
+            alpha=self.scale,
+        )
+        del baddbmm_input
+
+        if self.upcast_softmax:
+            attention_scores = attention_scores.float()
+
+        attention_probs = attention_scores.softmax(dim=-1)
+        del attention_scores
+
+        attention_probs = attention_probs.to(dtype)
+
+        return attention_probs
+
+    def prepare_attention_mask(
+        self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
+    ) -> torch.Tensor:
+        r"""
+        Prepare the attention mask for the attention computation.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                The attention mask to prepare.
+            target_length (`int`):
+                The target length of the attention mask. This is the length of the attention mask after padding.
+            batch_size (`int`):
+                The batch size, which is used to repeat the attention mask.
+            out_dim (`int`, *optional*, defaults to `3`):
+                The output dimension of the attention mask. Can be either `3` or `4`.
+
+        Returns:
+            `torch.Tensor`: The prepared attention mask.
+        """
+        head_size = self.heads
+        if attention_mask is None:
+            return attention_mask
+
+        current_length: int = attention_mask.shape[-1]
+        if current_length != target_length:
+            if attention_mask.device.type == "mps":
+                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
+                # Instead, we can manually construct the padding tensor.
+                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
+                padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
+                attention_mask = torch.cat([attention_mask, padding], dim=2)
+            else:
+                # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
+                #       we want to instead pad by (0, remaining_length), where remaining_length is:
+                #       remaining_length: int = target_length - current_length
+                # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
+                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+
+        if out_dim == 3:
+            if attention_mask.shape[0] < batch_size * head_size:
+                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
+        elif out_dim == 4:
+            attention_mask = attention_mask.unsqueeze(1)
+            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
+
+        return attention_mask
+
+    def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
+        r"""
+        Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the
+        `Attention` class.
+
+        Args:
+            encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.
+
+        Returns:
+            `torch.Tensor`: The normalized encoder hidden states.
+        """
+        assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
+
+        if isinstance(self.norm_cross, nn.LayerNorm):
+            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
+        elif isinstance(self.norm_cross, nn.GroupNorm):
+            # Group norm norms along the channels dimension and expects
+            # input to be in the shape of (N, C, *). In this case, we want
+            # to norm along the hidden dimension, so we need to move
+            # (batch_size, sequence_length, hidden_size) ->
+            # (batch_size, hidden_size, sequence_length)
+            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
+            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
+        else:
+            assert False
+
+        return encoder_hidden_states
+
+    @torch.no_grad()
+    def fuse_projections(self, fuse=True):
+        device = self.to_q.weight.data.device
+        dtype = self.to_q.weight.data.dtype
+
+        if not self.is_cross_attention:
+            # fetch weight matrices.
+            concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            # create a new single projection layer and copy over the weights.
+            self.to_qkv = nn.Linear(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
+            self.to_qkv.weight.copy_(concatenated_weights)
+            if self.use_bias:
+                concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
+                self.to_qkv.bias.copy_(concatenated_bias)
+
+        else:
+            concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data])
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            self.to_kv = nn.Linear(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
+            self.to_kv.weight.copy_(concatenated_weights)
+            if self.use_bias:
+                concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data])
+                self.to_kv.bias.copy_(concatenated_bias)
+
+        # handle added projections for SD3 and others.
+        if hasattr(self, "add_q_proj") and hasattr(self, "add_k_proj") and hasattr(self, "add_v_proj"):
+            concatenated_weights = torch.cat(
+                [self.add_q_proj.weight.data, self.add_k_proj.weight.data, self.add_v_proj.weight.data]
+            )
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            self.to_added_qkv = nn.Linear(
+                in_features, out_features, bias=self.added_proj_bias, device=device, dtype=dtype
+            )
+            self.to_added_qkv.weight.copy_(concatenated_weights)
+            if self.added_proj_bias:
+                concatenated_bias = torch.cat(
+                    [self.add_q_proj.bias.data, self.add_k_proj.bias.data, self.add_v_proj.bias.data]
+                )
+                self.to_added_qkv.bias.copy_(concatenated_bias)
+
+        self.fused_projections = fuse
+
+
+
+class CausalAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("CausalAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: CausalAttention,
+        hidden_states: torch.Tensor,
+        # cond_hidden_states: Optional[torch.Tensor] = None, # (b, l, c)
+        ref_hidden_states: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None, # 始终为None
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+    
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        encoder_hidden_states = None
+
+        N_ref = ref_hidden_states.shape[1]
+        flat_ref_hidden_states = rearrange(ref_hidden_states, 'b n_ref l c -> (b n_ref) l c')
+        residual = hidden_states
+        # residual_cond = cond_hidden_states
+        residual_ref = flat_ref_hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+            # cond_hidden_states = attn.spatial_norm(cond_hidden_states, temb)
+            flat_ref_hidden_states = attn.spatial_norm(flat_ref_hidden_states, temb)
+
+
+
+
+        batch_size = hidden_states.shape[0]
+
+        # if attn.group_norm is not None:
+        #     hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        #     cond_hidden_states = attn.group_norm(cond_hidden_states.transpose(1, 2)).transpose(1, 2)
+        #     flat_ref_hidden_states = attn.group_norm(flat_ref_hidden_states.transpose(1, 2)).transpose(1, 2)
+        #     print('group_norm')
+
+        # query = attn.to_q(hidden_states)
+        # query_ref = attn.to_q(flat_ref_hidden_states)
+
+        # reshape_flat_ref_hidden_states = rearrange(flat_ref_hidden_states, '(b n_ref) l c -> b (n_ref l) c', n_ref=N_ref)
+
+        # encoder_hidden_states = torch.cat([hidden_states, reshape_flat_ref_hidden_states], dim=1)
+        encoder_hidden_states = hidden_states
+        # encoder_hidden_states_cond = cond_hidden_states
+        encoder_hidden_states_ref = flat_ref_hidden_states # (b, n_ref), l, c
+
+
+
+        query = attn.to_q(hidden_states) # (b, l, c)
+        key_0 = attn.to_k(encoder_hidden_states) # (b, l, c)
+        value_0 = attn.to_v(encoder_hidden_states) # (b, l, c)
+
+        # query_cond = attn.to_q(cond_hidden_states) # (b, l, c)
+        # key_cond = attn.to_k(encoder_hidden_states_cond) # (b, l, c)
+        # value_cond = attn.to_v(encoder_hidden_states_cond) # (b, l, c)
+
+        query_ref = attn.to_q(flat_ref_hidden_states) # (b, n_ref), l, c
+        # print('encoder_hidden_states_ref', encoder_hidden_states_ref.shape)
+        key_ref = attn.to_k(encoder_hidden_states_ref) # (b, n_ref), l, c
+        value_ref = attn.to_v(encoder_hidden_states_ref) # (b, n_ref), l, c
+
+        reshape_key_ref = rearrange(key_ref, '(b n_ref) l c -> b (n_ref l) c', n_ref=N_ref)
+        reshape_value_ref = rearrange(value_ref, '(b n_ref) l c -> b (n_ref l) c', n_ref=N_ref)
+
+        key = torch.cat([key_0, reshape_key_ref], dim=1) # (b, n_ref*l + l1, c)
+        value = torch.cat([value_0, reshape_value_ref], dim=1) # (b, n_ref*l + l1, c)
+
+        # print('key',key.shape, key_0.shape, key_cond.shape, reshape_key_ref.shape)
+        # print('value',value.shape, value_0.shape, value_cond.shape, reshape_value_ref.shape)
+
+        # key_cond = torch.cat([key_cond, key_0], dim=1)
+        # value_cond = torch.cat([value_cond, value_0], dim=1)
+
+        # key_ref = key.unsqueeze(1).repeat(1, N_ref, 1, 1)
+        # key_ref = rearrange(key_ref, 'b n_ref l c -> (b n_ref)  l c', n_ref=N_ref)
+        # value_ref = value.unsqueeze(1).repeat(1, N_ref, 1, 1)
+        # value_ref = rearrange(value_ref, 'b n_ref l c -> (b n_ref) l c', n_ref=N_ref)
+
+        # key_for_ref = key_0.unsqueeze(1).repeat(1, N_ref, 1, 1).squeeze(0)
+        # value_for_ref = value_0.unsqueeze(1).repeat(1, N_ref, 1, 1).squeeze(0)
+        # print('key_for_ref',key_for_ref.shape)
+        # print('value_for_ref',value_for_ref.shape)
+        # print('key_ref',key_ref.shape)
+        # print('value_ref',value_ref.shape)
+
+        # key_ref = torch.cat([key_ref, key_for_ref], dim=1)
+        # value_ref = torch.cat([value_ref, value_for_ref], dim=1)
+
+
+        # print('key',key.shape, key_ref.shape, key_cond.shape)
+        # print('value',value.shape, value_ref.shape, value_cond.shape)
+
+
+
+
+
+
+        inner_dim = key.shape[-1] # c
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # query_cond = query_cond.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # key_cond = key_cond.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # value_cond = value_cond.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        query_ref = query_ref.view(batch_size*N_ref, -1, attn.heads, head_dim).transpose(1, 2)
+        key_ref = key_ref.view(batch_size*N_ref, -1, attn.heads, head_dim).transpose(1, 2)
+        value_ref = value_ref.view(batch_size*N_ref, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+            # query_cond = attn.norm_q(query_cond)
+            query_ref = attn.norm_q(query_ref)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+            # key_cond = attn.norm_k(key_cond)
+            key_ref = attn.norm_k(key_ref)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        ) 
+
+        # hidden_states_cond = F.scaled_dot_product_attention(
+        #     query_cond, key_cond, value_cond, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        # ) 
+
+        hidden_states_ref = F.scaled_dot_product_attention(
+            query_ref, key_ref, value_ref, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        ) 
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # hidden_states_cond = hidden_states_cond.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        # hidden_states_cond = hidden_states_cond.to(query.dtype)
+
+        hidden_states_ref = hidden_states_ref.transpose(1, 2).reshape(batch_size*N_ref, -1, attn.heads * head_dim)
+        hidden_states_ref = hidden_states_ref.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        # hidden_states_cond = attn.to_out[0](hidden_states_cond)
+        # hidden_states_cond = attn.to_out[1](hidden_states_cond)
+
+        hidden_states_ref = attn.to_out[0](hidden_states_ref)
+        hidden_states_ref = attn.to_out[1](hidden_states_ref)
+
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+            # hidden_states_cond = hidden_states_cond + residual_cond
+            hidden_states_ref = hidden_states_ref + residual_ref
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+        # hidden_states_cond = hidden_states_cond / attn.rescale_output_factor
+        hidden_states_ref = hidden_states_ref / attn.rescale_output_factor
+
+        hidden_states_ref = rearrange(hidden_states_ref, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+        return hidden_states, hidden_states_ref
+
+
+@maybe_allow_in_graph
+class CausalKVcacheAttention(nn.Module):
+    r"""
+    A cross attention layer.
+
+    Parameters:
+        query_dim (`int`):
+            The number of channels in the query.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
+        heads (`int`,  *optional*, defaults to 8):
+            The number of heads to use for multi-head attention.
+        kv_heads (`int`,  *optional*, defaults to `None`):
+            The number of key and value heads to use for multi-head attention. Defaults to `heads`. If
+            `kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use Multi
+            Query Attention (MQA) otherwise GQA is used.
+        dim_head (`int`,  *optional*, defaults to 64):
+            The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
+        bias (`bool`, *optional*, defaults to False):
+            Set to `True` for the query, key, and value linear layers to contain a bias parameter.
+        upcast_attention (`bool`, *optional*, defaults to False):
+            Set to `True` to upcast the attention computation to `float32`.
+        upcast_softmax (`bool`, *optional*, defaults to False):
+            Set to `True` to upcast the softmax computation to `float32`.
+        cross_attention_norm (`str`, *optional*, defaults to `None`):
+            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
+        cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use for the group norm in the cross attention.
+        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the added key and value projections. If `None`, no projection is used.
+        norm_num_groups (`int`, *optional*, defaults to `None`):
+            The number of groups to use for the group norm in the attention.
+        spatial_norm_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the spatial normalization.
+        out_bias (`bool`, *optional*, defaults to `True`):
+            Set to `True` to use a bias in the output linear layer.
+        scale_qk (`bool`, *optional*, defaults to `True`):
+            Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
+        only_cross_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
+            `added_kv_proj_dim` is not `None`.
+        eps (`float`, *optional*, defaults to 1e-5):
+            An additional value added to the denominator in group normalization that is used for numerical stability.
+        rescale_output_factor (`float`, *optional*, defaults to 1.0):
+            A factor to rescale the output by dividing it with this value.
+        residual_connection (`bool`, *optional*, defaults to `False`):
+            Set to `True` to add the residual connection to the output.
+        _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
+            Set to `True` if the attention block is loaded from a deprecated state dict.
+        processor (`AttnProcessor`, *optional*, defaults to `None`):
+            The attention processor to use. If `None`, defaults to `CausalKVcacheAttnProcessor2_0` if `torch 2.x` is used and
+            `AttnProcessor` otherwise.
+    """
+
+    def __init__(
+        self,
+        query_dim: int,
+        cross_attention_dim: Optional[int] = None,
+        heads: int = 8,
+        kv_heads: Optional[int] = None,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        bias: bool = False,
+        upcast_attention: bool = False,
+        upcast_softmax: bool = False,
+        cross_attention_norm: Optional[str] = None,
+        cross_attention_norm_num_groups: int = 32,
+        qk_norm: Optional[str] = None,
+        added_kv_proj_dim: Optional[int] = None,
+        added_proj_bias: Optional[bool] = True,
+        norm_num_groups: Optional[int] = None,
+        spatial_norm_dim: Optional[int] = None,
+        out_bias: bool = True,
+        scale_qk: bool = True,
+        only_cross_attention: bool = False,
+        eps: float = 1e-5,
+        rescale_output_factor: float = 1.0,
+        residual_connection: bool = False,
+        _from_deprecated_attn_block: bool = False,
+        processor: Optional["AttnProcessor"] = None,
+        out_dim: int = None,
+        context_pre_only=None,
+        pre_only=False,
+    ):
+        super().__init__()
+
+        # To prevent circular import.
+        from .normalization import FP32LayerNorm, RMSNorm
+
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
+        self.query_dim = query_dim
+        self.use_bias = bias
+        self.is_cross_attention = cross_attention_dim is not None
+        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
+        self.upcast_attention = upcast_attention
+        self.upcast_softmax = upcast_softmax
+        self.rescale_output_factor = rescale_output_factor
+        self.residual_connection = residual_connection
+        self.dropout = dropout
+        self.fused_projections = False
+        self.out_dim = out_dim if out_dim is not None else query_dim
+        self.context_pre_only = context_pre_only
+        self.pre_only = pre_only
+
+        # we make use of this private variable to know whether this class is loaded
+        # with an deprecated state dict so that we can convert it on the fly
+        self._from_deprecated_attn_block = _from_deprecated_attn_block
+
+        self.scale_qk = scale_qk
+        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
+
+        self.heads = out_dim // dim_head if out_dim is not None else heads
+        # for slice_size > 0 the attention score computation
+        # is split across the batch axis to save memory
+        # You can set slice_size with `set_attention_slice`
+        self.sliceable_head_dim = heads
+
+        self.added_kv_proj_dim = added_kv_proj_dim
+        self.only_cross_attention = only_cross_attention
+
+        if self.added_kv_proj_dim is None and self.only_cross_attention:
+            raise ValueError(
+                "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
+            )
+
+        if norm_num_groups is not None:
+            self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
+        else:
+            self.group_norm = None
+
+        if spatial_norm_dim is not None:
+            self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
+        else:
+            self.spatial_norm = None
+
+        if qk_norm is None:
+            self.norm_q = None
+            self.norm_k = None
+        elif qk_norm == "layer_norm":
+            self.norm_q = nn.LayerNorm(dim_head, eps=eps)
+            self.norm_k = nn.LayerNorm(dim_head, eps=eps)
+        elif qk_norm == "fp32_layer_norm":
+            self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+            self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+        elif qk_norm == "layer_norm_across_heads":
+            # Lumina applys qk norm across all heads
+            self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps)
+            self.norm_k = nn.LayerNorm(dim_head * kv_heads, eps=eps)
+        elif qk_norm == "rms_norm":
+            self.norm_q = RMSNorm(dim_head, eps=eps)
+            self.norm_k = RMSNorm(dim_head, eps=eps)
+        else:
+            raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None or 'layer_norm'")
+
+        if cross_attention_norm is None:
+            self.norm_cross = None
+        elif cross_attention_norm == "layer_norm":
+            self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
+        elif cross_attention_norm == "group_norm":
+            if self.added_kv_proj_dim is not None:
+                # The given `encoder_hidden_states` are initially of shape
+                # (batch_size, seq_len, added_kv_proj_dim) before being projected
+                # to (batch_size, seq_len, cross_attention_dim). The norm is applied
+                # before the projection, so we need to use `added_kv_proj_dim` as
+                # the number of channels for the group norm.
+                norm_cross_num_channels = added_kv_proj_dim
+            else:
+                norm_cross_num_channels = self.cross_attention_dim
+
+            self.norm_cross = nn.GroupNorm(
+                num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
+            )
+        else:
+            raise ValueError(
+                f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
+            )
+
+        self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
+
+        if not self.only_cross_attention:
+            # only relevant for the `AddedKVProcessor` classes
+            self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
+            self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
+        else:
+            self.to_k = None
+            self.to_v = None
+
+        self.added_proj_bias = added_proj_bias
+        if self.added_kv_proj_dim is not None:
+            self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
+            self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
+            if self.context_pre_only is not None:
+                self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
+
+        if not self.pre_only:
+            self.to_out = nn.ModuleList([])
+            self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
+            self.to_out.append(nn.Dropout(dropout))
+
+        if self.context_pre_only is not None and not self.context_pre_only:
+            self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)
+
+        if qk_norm is not None and added_kv_proj_dim is not None:
+            if qk_norm == "fp32_layer_norm":
+                self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+                self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+            elif qk_norm == "rms_norm":
+                self.norm_added_q = RMSNorm(dim_head, eps=eps)
+                self.norm_added_k = RMSNorm(dim_head, eps=eps)
+        else:
+            self.norm_added_q = None
+            self.norm_added_k = None
+
+        # set attention processor
+        # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+        # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+        # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+        if processor is None:
+            processor = (
+                CausalKVcacheAttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+        self.set_processor(processor)
+
+    def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
+        r"""
+        Set whether to use npu flash attention from `torch_npu` or not.
+
+        """
+        if use_npu_flash_attention:
+            processor = AttnProcessorNPU()
+        else:
+            # set attention processor
+            # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+            # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+            # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+            processor = (
+                CausalKVcacheAttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+        self.set_processor(processor)
+
+    def set_use_memory_efficient_attention_xformers(
+        self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
+    ) -> None:
+        r"""
+        Set whether to use memory efficient attention from `xformers` or not.
+
+        Args:
+            use_memory_efficient_attention_xformers (`bool`):
+                Whether to use memory efficient attention from `xformers` or not.
+            attention_op (`Callable`, *optional*):
+                The attention operation to use. Defaults to `None` which uses the default attention operation from
+                `xformers`.
+        """
+        is_custom_diffusion = hasattr(self, "processor") and isinstance(
+            self.processor,
+            (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0),
+        )
+        is_added_kv_processor = hasattr(self, "processor") and isinstance(
+            self.processor,
+            (
+                AttnAddedKVProcessor,
+                AttnAddedKVProcessor2_0,
+                SlicedAttnAddedKVProcessor,
+                XFormersAttnAddedKVProcessor,
+            ),
+        )
+
+        if use_memory_efficient_attention_xformers:
+            if is_added_kv_processor and is_custom_diffusion:
+                raise NotImplementedError(
+                    f"Memory efficient attention is currently not supported for custom diffusion for attention processor type {self.processor}"
+                )
+            if not is_xformers_available():
+                raise ModuleNotFoundError(
+                    (
+                        "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
+                        " xformers"
+                    ),
+                    name="xformers",
+                )
+            elif not torch.cuda.is_available():
+                raise ValueError(
+                    "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
+                    " only available for GPU "
+                )
+            else:
+                try:
+                    # Make sure we can run the memory efficient attention
+                    _ = xformers.ops.memory_efficient_attention(
+                        torch.randn((1, 2, 40), device="cuda"),
+                        torch.randn((1, 2, 40), device="cuda"),
+                        torch.randn((1, 2, 40), device="cuda"),
+                    )
+                except Exception as e:
+                    raise e
+
+            if is_custom_diffusion:
+                processor = CustomDiffusionXFormersAttnProcessor(
+                    train_kv=self.processor.train_kv,
+                    train_q_out=self.processor.train_q_out,
+                    hidden_size=self.processor.hidden_size,
+                    cross_attention_dim=self.processor.cross_attention_dim,
+                    attention_op=attention_op,
+                )
+                processor.load_state_dict(self.processor.state_dict())
+                if hasattr(self.processor, "to_k_custom_diffusion"):
+                    processor.to(self.processor.to_k_custom_diffusion.weight.device)
+            elif is_added_kv_processor:
+                # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
+                # which uses this type of cross attention ONLY because the attention mask of format
+                # [0, ..., -10.000, ..., 0, ...,] is not supported
+                # throw warning
+                logger.info(
+                    "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation."
+                )
+                processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
+            else:
+                processor = XFormersAttnProcessor(attention_op=attention_op)
+        else:
+            if is_custom_diffusion:
+                attn_processor_class = (
+                    CustomDiffusionAttnProcessor2_0
+                    if hasattr(F, "scaled_dot_product_attention")
+                    else CustomDiffusionAttnProcessor
+                )
+                processor = attn_processor_class(
+                    train_kv=self.processor.train_kv,
+                    train_q_out=self.processor.train_q_out,
+                    hidden_size=self.processor.hidden_size,
+                    cross_attention_dim=self.processor.cross_attention_dim,
+                )
+                processor.load_state_dict(self.processor.state_dict())
+                if hasattr(self.processor, "to_k_custom_diffusion"):
+                    processor.to(self.processor.to_k_custom_diffusion.weight.device)
+            else:
+                # set attention processor
+                # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+                # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+                # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+                processor = (
+                    CausalKVcacheAttnProcessor2_0()
+                    if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
+                    else AttnProcessor()
+                )
+
+        self.set_processor(processor)
+
+    def set_attention_slice(self, slice_size: int) -> None:
+        r"""
+        Set the slice size for attention computation.
+
+        Args:
+            slice_size (`int`):
+                The slice size for attention computation.
+        """
+        if slice_size is not None and slice_size > self.sliceable_head_dim:
+            raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
+
+        if slice_size is not None and self.added_kv_proj_dim is not None:
+            processor = SlicedAttnAddedKVProcessor(slice_size)
+        elif slice_size is not None:
+            processor = SlicedAttnProcessor(slice_size)
+        elif self.added_kv_proj_dim is not None:
+            processor = AttnAddedKVProcessor()
+        else:
+            # set attention processor
+            # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+            # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+            # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+            processor = (
+                CausalKVcacheAttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+
+        self.set_processor(processor)
+
+    def set_processor(self, processor: "AttnProcessor") -> None:
+        r"""
+        Set the attention processor to use.
+
+        Args:
+            processor (`AttnProcessor`):
+                The attention processor to use.
+        """
+        # if current processor is in `self._modules` and if passed `processor` is not, we need to
+        # pop `processor` from `self._modules`
+        if (
+            hasattr(self, "processor")
+            and isinstance(self.processor, torch.nn.Module)
+            and not isinstance(processor, torch.nn.Module)
+        ):
+            logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
+            self._modules.pop("processor")
+
+        self.processor = processor
+
+    def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
+        r"""
+        Get the attention processor in use.
+
+        Args:
+            return_deprecated_lora (`bool`, *optional*, defaults to `False`):
+                Set to `True` to return the deprecated LoRA attention processor.
+
+        Returns:
+            "AttentionProcessor": The attention processor in use.
+        """
+        if not return_deprecated_lora:
+            return self.processor
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor, # (b, l, c)
+        # cond_hidden_states: Optional[torch.Tensor] = None, # (b, l, c)
+        ref_hidden_states: Optional[torch.Tensor] = None, # (b, n_ref, l, c)
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        K_cache: Optional[torch.Tensor] = None,
+        V_cache: Optional[torch.Tensor] = None,
+        **cross_attention_kwargs,
+    ) -> torch.Tensor:
+        r"""
+        The forward method of the `Attention` class.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                The hidden states of the query.
+            encoder_hidden_states (`torch.Tensor`, *optional*):
+                The hidden states of the encoder.
+            attention_mask (`torch.Tensor`, *optional*):
+                The attention mask to use. If `None`, no mask is applied.
+            **cross_attention_kwargs:
+                Additional keyword arguments to pass along to the cross attention.
+
+        Returns:
+            `torch.Tensor`: The output of the attention layer.
+        """
+        # The `Attention` class can call different attention processors / attention functions
+        # here we simply pass along all tensors to the selected processor class
+        # For standard processors that are defined here, `**cross_attention_kwargs` is empty
+
+        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
+        quiet_attn_parameters = {"ip_adapter_masks"}
+        unused_kwargs = [
+            k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters
+        ]
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"cross_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
+            )
+        cross_attention_kwargs = {k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters}
+
+        return self.processor(
+            self,
+            hidden_states, # (b, l, c)
+            ref_hidden_states, # (b, n_ref, l, c)
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            K_cache=K_cache,
+            V_cache=V_cache,    
+            **cross_attention_kwargs,
+        )
+
+    def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
+        r"""
+        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads`
+        is the number of heads initialized while constructing the `Attention` class.
+
+        Args:
+            tensor (`torch.Tensor`): The tensor to reshape.
+
+        Returns:
+            `torch.Tensor`: The reshaped tensor.
+        """
+        head_size = self.heads
+        batch_size, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
+        return tensor
+
+    def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
+        r"""
+        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size, seq_len, heads, dim // heads]` `heads` is
+        the number of heads initialized while constructing the `Attention` class.
+
+        Args:
+            tensor (`torch.Tensor`): The tensor to reshape.
+            out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor. If `3`, the tensor is
+                reshaped to `[batch_size * heads, seq_len, dim // heads]`.
+
+        Returns:
+            `torch.Tensor`: The reshaped tensor.
+        """
+        head_size = self.heads
+        if tensor.ndim == 3:
+            batch_size, seq_len, dim = tensor.shape
+            extra_dim = 1
+        else:
+            batch_size, extra_dim, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(batch_size, seq_len * extra_dim, head_size, dim // head_size)
+        tensor = tensor.permute(0, 2, 1, 3)
+
+        if out_dim == 3:
+            tensor = tensor.reshape(batch_size * head_size, seq_len * extra_dim, dim // head_size)
+
+        return tensor
+
+    def get_attention_scores(
+        self, query: torch.Tensor, key: torch.Tensor, attention_mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        r"""
+        Compute the attention scores.
+
+        Args:
+            query (`torch.Tensor`): The query tensor.
+            key (`torch.Tensor`): The key tensor.
+            attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
+
+        Returns:
+            `torch.Tensor`: The attention probabilities/scores.
+        """
+        dtype = query.dtype
+        if self.upcast_attention:
+            query = query.float()
+            key = key.float()
+
+        if attention_mask is None:
+            baddbmm_input = torch.empty(
+                query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
+            )
+            beta = 0
+        else:
+            baddbmm_input = attention_mask
+            beta = 1
+
+        attention_scores = torch.baddbmm(
+            baddbmm_input,
+            query,
+            key.transpose(-1, -2),
+            beta=beta,
+            alpha=self.scale,
+        )
+        del baddbmm_input
+
+        if self.upcast_softmax:
+            attention_scores = attention_scores.float()
+
+        attention_probs = attention_scores.softmax(dim=-1)
+        del attention_scores
+
+        attention_probs = attention_probs.to(dtype)
+
+        return attention_probs
+
+    def prepare_attention_mask(
+        self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
+    ) -> torch.Tensor:
+        r"""
+        Prepare the attention mask for the attention computation.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                The attention mask to prepare.
+            target_length (`int`):
+                The target length of the attention mask. This is the length of the attention mask after padding.
+            batch_size (`int`):
+                The batch size, which is used to repeat the attention mask.
+            out_dim (`int`, *optional*, defaults to `3`):
+                The output dimension of the attention mask. Can be either `3` or `4`.
+
+        Returns:
+            `torch.Tensor`: The prepared attention mask.
+        """
+        head_size = self.heads
+        if attention_mask is None:
+            return attention_mask
+
+        current_length: int = attention_mask.shape[-1]
+        if current_length != target_length:
+            if attention_mask.device.type == "mps":
+                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
+                # Instead, we can manually construct the padding tensor.
+                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
+                padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
+                attention_mask = torch.cat([attention_mask, padding], dim=2)
+            else:
+                # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
+                #       we want to instead pad by (0, remaining_length), where remaining_length is:
+                #       remaining_length: int = target_length - current_length
+                # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
+                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+
+        if out_dim == 3:
+            if attention_mask.shape[0] < batch_size * head_size:
+                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
+        elif out_dim == 4:
+            attention_mask = attention_mask.unsqueeze(1)
+            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
+
+        return attention_mask
+
+    def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
+        r"""
+        Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the
+        `Attention` class.
+
+        Args:
+            encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.
+
+        Returns:
+            `torch.Tensor`: The normalized encoder hidden states.
+        """
+        assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
+
+        if isinstance(self.norm_cross, nn.LayerNorm):
+            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
+        elif isinstance(self.norm_cross, nn.GroupNorm):
+            # Group norm norms along the channels dimension and expects
+            # input to be in the shape of (N, C, *). In this case, we want
+            # to norm along the hidden dimension, so we need to move
+            # (batch_size, sequence_length, hidden_size) ->
+            # (batch_size, hidden_size, sequence_length)
+            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
+            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
+        else:
+            assert False
+
+        return encoder_hidden_states
+
+    @torch.no_grad()
+    def fuse_projections(self, fuse=True):
+        device = self.to_q.weight.data.device
+        dtype = self.to_q.weight.data.dtype
+
+        if not self.is_cross_attention:
+            # fetch weight matrices.
+            concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            # create a new single projection layer and copy over the weights.
+            self.to_qkv = nn.Linear(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
+            self.to_qkv.weight.copy_(concatenated_weights)
+            if self.use_bias:
+                concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
+                self.to_qkv.bias.copy_(concatenated_bias)
+
+        else:
+            concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data])
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            self.to_kv = nn.Linear(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
+            self.to_kv.weight.copy_(concatenated_weights)
+            if self.use_bias:
+                concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data])
+                self.to_kv.bias.copy_(concatenated_bias)
+
+        # handle added projections for SD3 and others.
+        if hasattr(self, "add_q_proj") and hasattr(self, "add_k_proj") and hasattr(self, "add_v_proj"):
+            concatenated_weights = torch.cat(
+                [self.add_q_proj.weight.data, self.add_k_proj.weight.data, self.add_v_proj.weight.data]
+            )
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            self.to_added_qkv = nn.Linear(
+                in_features, out_features, bias=self.added_proj_bias, device=device, dtype=dtype
+            )
+            self.to_added_qkv.weight.copy_(concatenated_weights)
+            if self.added_proj_bias:
+                concatenated_bias = torch.cat(
+                    [self.add_q_proj.bias.data, self.add_k_proj.bias.data, self.add_v_proj.bias.data]
+                )
+                self.to_added_qkv.bias.copy_(concatenated_bias)
+
+        self.fused_projections = fuse
+
+
+class CausalKVcacheAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("CausalKVcacheAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: CausalKVcacheAttention,
+        hidden_states: torch.Tensor,
+        ref_hidden_states: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None, # 始终为None
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        K_cache: Optional[torch.Tensor] = None,
+        V_cache: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+
+        no_cache = K_cache is None and V_cache is None
+    
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        encoder_hidden_states = None
+
+        residual = hidden_states
+        if no_cache:
+            N_ref = ref_hidden_states.shape[1]
+            flat_ref_hidden_states = rearrange(ref_hidden_states, 'b n_ref l c -> (b n_ref) l c')
+            residual_ref = flat_ref_hidden_states
+
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+            if no_cache:
+                flat_ref_hidden_states = attn.spatial_norm(flat_ref_hidden_states, temb)
+
+
+
+
+        batch_size = hidden_states.shape[0]
+
+        encoder_hidden_states = hidden_states
+        if no_cache:
+            encoder_hidden_states_ref = flat_ref_hidden_states # (b, n_ref), l, c
+
+
+
+        query = attn.to_q(hidden_states) # (b, l, c)
+        key_0 = attn.to_k(encoder_hidden_states) # (b, l, c)
+        value_0 = attn.to_v(encoder_hidden_states) # (b, l, c)
+
+        if no_cache:
+            query_ref = attn.to_q(flat_ref_hidden_states) # (b, n_ref), l, c
+            key_ref = attn.to_k(encoder_hidden_states_ref) # (b, n_ref), l, c
+            value_ref = attn.to_v(encoder_hidden_states_ref) # (b, n_ref), l, c
+            reshape_key_ref = rearrange(key_ref, '(b n_ref) l c -> b (n_ref l) c', n_ref=N_ref)
+            reshape_value_ref = rearrange(value_ref, '(b n_ref) l c -> b (n_ref l) c', n_ref=N_ref)
+
+        if no_cache:
+            key = torch.cat([key_0, reshape_key_ref], dim=1) # (b, n_ref*l + l1, c)
+            value = torch.cat([value_0, reshape_value_ref], dim=1) # (b, n_ref*l + l1, c)
+            output_K_cache = reshape_key_ref
+            output_V_cache = reshape_value_ref
+        else:
+            key = torch.cat([key_0, K_cache], dim=1) # (b, n_ref*l + l1, c)
+            value = torch.cat([value_0, V_cache], dim=1) # (b, n_ref*l + l1, c)
+
+
+
+        inner_dim = key.shape[-1] # c
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if no_cache:
+            query_ref = query_ref.view(batch_size*N_ref, -1, attn.heads, head_dim).transpose(1, 2)
+            key_ref = key_ref.view(batch_size*N_ref, -1, attn.heads, head_dim).transpose(1, 2)
+            value_ref = value_ref.view(batch_size*N_ref, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+            
+            if no_cache:
+                query_ref = attn.norm_q(query_ref)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+            
+            if no_cache:
+                key_ref = attn.norm_k(key_ref)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        # print('query',query.dtype, query.shape)
+        # print('key',key.dtype, key.shape)
+        # print('value',value.dtype, value.shape)
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        ) 
+
+        # hidden_states_cond = F.scaled_dot_product_attention(
+        #     query_cond, key_cond, value_cond, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        # ) 
+        if no_cache:
+            # print('query_ref',query_ref.dtype, query_ref.shape)
+            # print('key_ref',key_ref.dtype, key_ref.shape)
+            # print('value_ref',value_ref.dtype, value_ref.shape)
+            hidden_states_ref = F.scaled_dot_product_attention(
+                query_ref, key_ref, value_ref, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+            ) 
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # hidden_states_cond = hidden_states_cond.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        # hidden_states_cond = hidden_states_cond.to(query.dtype)
+
+
+        if no_cache:
+            hidden_states_ref = hidden_states_ref.transpose(1, 2).reshape(batch_size*N_ref, -1, attn.heads * head_dim)
+            hidden_states_ref = hidden_states_ref.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if no_cache:
+            hidden_states_ref = attn.to_out[0](hidden_states_ref)
+            hidden_states_ref = attn.to_out[1](hidden_states_ref)
+
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+            
+            if no_cache:
+                hidden_states_ref = hidden_states_ref + residual_ref
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+        if no_cache:
+            hidden_states_ref = hidden_states_ref / attn.rescale_output_factor
+
+
+        if no_cache:
+            hidden_states_ref = rearrange(hidden_states_ref, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+            return hidden_states, hidden_states_ref, output_K_cache, output_V_cache
+        else:
+            return hidden_states, None, None, None
+            
+
+ADDED_KV_ATTENTION_PROCESSORS = (
+    AttnAddedKVProcessor,
+    SlicedAttnAddedKVProcessor,
+    AttnAddedKVProcessor2_0,
+    XFormersAttnAddedKVProcessor,
+)
+
+CROSS_ATTENTION_PROCESSORS = (
+    AttnProcessor,
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+    SlicedAttnProcessor,
+    IPAdapterAttnProcessor,
+    IPAdapterAttnProcessor2_0,
+)
+
+AttentionProcessor = Union[
+    AttnProcessor,
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    XFormersAttnProcessor,
+    SlicedAttnProcessor,
+    AttnAddedKVProcessor,
+    SlicedAttnAddedKVProcessor,
+    AttnAddedKVProcessor2_0,
+    XFormersAttnAddedKVProcessor,
+    CustomDiffusionAttnProcessor,
+    CustomDiffusionXFormersAttnProcessor,
+    CustomDiffusionAttnProcessor2_0,
+    PAGCFGIdentitySelfAttnProcessor2_0,
+    PAGIdentitySelfAttnProcessor2_0,
+    PAGCFGHunyuanAttnProcessor2_0,
+    PAGHunyuanAttnProcessor2_0,
+]
diff --git a/diffusers/src/diffusers/models/autoencoders/__init__.py b/diffusers/src/diffusers/models/autoencoders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccf4552b2a5eb85502cab0b47554dd9c7b265dff
--- /dev/null
+++ b/diffusers/src/diffusers/models/autoencoders/__init__.py
@@ -0,0 +1,8 @@
+from .autoencoder_asym_kl import AsymmetricAutoencoderKL
+from .autoencoder_kl import AutoencoderKL
+from .autoencoder_kl_cogvideox import AutoencoderKLCogVideoX
+from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
+from .autoencoder_oobleck import AutoencoderOobleck
+from .autoencoder_tiny import AutoencoderTiny
+from .consistency_decoder_vae import ConsistencyDecoderVAE
+from .vq_model import VQModel
diff --git a/diffusers/src/diffusers/models/autoencoders/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/models/autoencoders/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ce7a8e27349f683c46f8c9de0c20c0e8f044880
Binary files /dev/null and b/diffusers/src/diffusers/models/autoencoders/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_asym_kl.cpython-311.pyc b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_asym_kl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06febef62aa793bd27c0938e421a9f5493deb3f3
Binary files /dev/null and b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_asym_kl.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_kl.cpython-311.pyc b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_kl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1c2dec32777efec96e89e770879bf0a99b3deeb
Binary files /dev/null and b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_kl.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_kl_cogvideox.cpython-311.pyc b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_kl_cogvideox.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b205a4e24de3c14e622a11182753ea48a65f434a
Binary files /dev/null and b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_kl_cogvideox.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_kl_temporal_decoder.cpython-311.pyc b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_kl_temporal_decoder.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e010d83f429a8d7a23dbebd8a450fd5b155b809d
Binary files /dev/null and b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_kl_temporal_decoder.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_oobleck.cpython-311.pyc b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_oobleck.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..413f09d6314307d2f6400fc3aaac67fc8b9c7f89
Binary files /dev/null and b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_oobleck.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_tiny.cpython-311.pyc b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_tiny.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..126b73091dea8195e765b0cf1c9588dcdd1c76df
Binary files /dev/null and b/diffusers/src/diffusers/models/autoencoders/__pycache__/autoencoder_tiny.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/autoencoders/__pycache__/consistency_decoder_vae.cpython-311.pyc b/diffusers/src/diffusers/models/autoencoders/__pycache__/consistency_decoder_vae.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..058b3fd8c8ac163fa5548a11f343c027a97c5805
Binary files /dev/null and b/diffusers/src/diffusers/models/autoencoders/__pycache__/consistency_decoder_vae.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/autoencoders/__pycache__/vae.cpython-311.pyc b/diffusers/src/diffusers/models/autoencoders/__pycache__/vae.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76d4d65ef7b9aac49358d29d0cdcaded67e3877c
Binary files /dev/null and b/diffusers/src/diffusers/models/autoencoders/__pycache__/vae.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/autoencoders/__pycache__/vq_model.cpython-311.pyc b/diffusers/src/diffusers/models/autoencoders/__pycache__/vq_model.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be2b57b6e9f8ce64bce5bd0d755dac4df1c2c063
Binary files /dev/null and b/diffusers/src/diffusers/models/autoencoders/__pycache__/vq_model.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/autoencoders/autoencoder_asym_kl.py b/diffusers/src/diffusers/models/autoencoders/autoencoder_asym_kl.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc2041d2e930ec83264aa4de1273180680a32234
--- /dev/null
+++ b/diffusers/src/diffusers/models/autoencoders/autoencoder_asym_kl.py
@@ -0,0 +1,186 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils.accelerate_utils import apply_forward_hook
+from ..modeling_outputs import AutoencoderKLOutput
+from ..modeling_utils import ModelMixin
+from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder, MaskConditionDecoder
+
+
+class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
+    r"""
+    Designing a Better Asymmetric VQGAN for StableDiffusion https://arxiv.org/abs/2306.04632 . A VAE model with KL loss
+    for encoding images into latents and decoding latent representations into images.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            Tuple of downsample block types.
+        down_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+            Tuple of down block output channels.
+        layers_per_down_block (`int`, *optional*, defaults to `1`):
+            Number layers for down block.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            Tuple of upsample block types.
+        up_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+            Tuple of up block output channels.
+        layers_per_up_block (`int`, *optional*, defaults to `1`):
+            Number layers for up block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
+        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
+        norm_num_groups (`int`, *optional*, defaults to `32`):
+            Number of groups to use for the first normalization layer in ResNet blocks.
+        scaling_factor (`float`, *optional*, defaults to 0.18215):
+            The component-wise standard deviation of the trained latent space computed using the first batch of the
+            training set. This is used to scale the latent space to have unit variance when training the diffusion
+            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        down_block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_down_block: int = 1,
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+        up_block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_up_block: int = 1,
+        act_fn: str = "silu",
+        latent_channels: int = 4,
+        norm_num_groups: int = 32,
+        sample_size: int = 32,
+        scaling_factor: float = 0.18215,
+    ) -> None:
+        super().__init__()
+
+        # pass init params to Encoder
+        self.encoder = Encoder(
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            down_block_types=down_block_types,
+            block_out_channels=down_block_out_channels,
+            layers_per_block=layers_per_down_block,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            double_z=True,
+        )
+
+        # pass init params to Decoder
+        self.decoder = MaskConditionDecoder(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            up_block_types=up_block_types,
+            block_out_channels=up_block_out_channels,
+            layers_per_block=layers_per_up_block,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+        )
+
+        self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
+        self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1)
+
+        self.use_slicing = False
+        self.use_tiling = False
+
+        self.register_to_config(block_out_channels=up_block_out_channels)
+        self.register_to_config(force_upcast=False)
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[torch.FloatTensor]]:
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def _decode(
+        self,
+        z: torch.FloatTensor,
+        image: Optional[torch.FloatTensor] = None,
+        mask: Optional[torch.FloatTensor] = None,
+        return_dict: bool = True,
+    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z, image, mask)
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    @apply_forward_hook
+    def decode(
+        self,
+        z: torch.FloatTensor,
+        generator: Optional[torch.Generator] = None,
+        image: Optional[torch.FloatTensor] = None,
+        mask: Optional[torch.FloatTensor] = None,
+        return_dict: bool = True,
+    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+        decoded = self._decode(z, image, mask).sample
+
+        if not return_dict:
+            return (decoded,)
+
+        return DecoderOutput(sample=decoded)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        mask: Optional[torch.FloatTensor] = None,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): Input sample.
+            mask (`torch.FloatTensor`, *optional*, defaults to `None`): Optional inpainting mask.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Whether to sample from the posterior.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+        """
+        x = sample
+        posterior = self.encode(x).latent_dist
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+        dec = self.decode(z, sample, mask).sample
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
diff --git a/diffusers/src/diffusers/models/autoencoders/autoencoder_kl.py b/diffusers/src/diffusers/models/autoencoders/autoencoder_kl.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7870138e3700b30f5f9923177f50ea2e6972db6
--- /dev/null
+++ b/diffusers/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -0,0 +1,520 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin
+from ...utils.accelerate_utils import apply_forward_hook
+from ..attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from ..modeling_outputs import AutoencoderKLOutput
+from ..modeling_utils import ModelMixin
+from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
+
+
+class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+    r"""
+    A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            Tuple of upsample block types.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+            Tuple of block output channels.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
+        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
+        scaling_factor (`float`, *optional*, defaults to 0.18215):
+            The component-wise standard deviation of the trained latent space computed using the first batch of the
+            training set. This is used to scale the latent space to have unit variance when training the diffusion
+            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+        force_upcast (`bool`, *optional*, default to `True`):
+            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
+            can be fine-tuned / trained to a lower range without loosing too much precision in which case
+            `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
+        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int] = (64,),
+        layers_per_block: int = 1,
+        act_fn: str = "silu",
+        latent_channels: int = 4,
+        norm_num_groups: int = 32,
+        sample_size: int = 32,
+        scaling_factor: float = 0.18215,
+        latents_mean: Optional[Tuple[float]] = None,
+        latents_std: Optional[Tuple[float]] = None,
+        force_upcast: float = True,
+    ):
+        super().__init__()
+
+        # pass init params to Encoder
+        self.encoder = Encoder(
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            down_block_types=down_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            double_z=True,
+        )
+
+        # pass init params to Decoder
+        self.decoder = Decoder(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            norm_num_groups=norm_num_groups,
+            act_fn=act_fn,
+        )
+
+        self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
+        self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1)
+
+        self.use_slicing = False
+        self.use_tiling = False
+
+        # only relevant if vae tiling is enabled
+        self.tile_sample_min_size = self.config.sample_size
+        sample_size = (
+            self.config.sample_size[0]
+            if isinstance(self.config.sample_size, (list, tuple))
+            else self.config.sample_size
+        )
+        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
+        self.tile_overlap_factor = 0.25
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (Encoder, Decoder)):
+            module.gradient_checkpointing = value
+
+    def enable_tiling(self, use_tiling: bool = True):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.use_tiling = use_tiling
+
+    def disable_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.enable_tiling(False)
+
+    def enable_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_slicing = False
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+                The latent representations of the encoded images. If `return_dict` is True, a
+                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
+        if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
+            return self.tiled_encode(x, return_dict=return_dict)
+
+        if self.use_slicing and x.shape[0] > 1:
+            encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(encoded_slices)
+        else:
+            h = self.encoder(x)
+
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def _encode(
+        self, x: torch.FloatTensor, return_dict: bool = True, hidden_flag = False
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+                The latent representations of the encoded images. If `return_dict` is True, a
+                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
+        if hidden_flag:
+            h,hidden_list = self.encoder(x, hidden_flag = hidden_flag)
+        else:
+            h = self.encoder(x, hidden_flag = hidden_flag)
+
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            if hidden_flag:
+                return (posterior, hidden_list)
+            else:
+                return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def _decode(self, z: torch.FloatTensor, return_dict: bool = True, hidden_list = None) -> Union[DecoderOutput, torch.FloatTensor]:
+        if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
+            return self.tiled_decode(z, return_dict=return_dict)
+
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z, hidden_list = hidden_list)
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    @apply_forward_hook
+    def decode(
+        self, z: torch.FloatTensor, return_dict: bool = True, generator=None
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        """
+        Decode a batch of images.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+
+        """
+        if self.use_slicing and z.shape[0] > 1:
+            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
+            decoded = torch.cat(decoded_slices)
+        else:
+            decoded = self._decode(z).sample
+
+        if not return_dict:
+            return (decoded,)
+
+        return DecoderOutput(sample=decoded)
+
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
+        for y in range(blend_extent):
+            b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
+        return b
+
+    def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
+        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
+        output, but they should be much less noticeable.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
+                If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
+                `tuple` is returned.
+        """
+        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_latent_min_size - blend_extent
+
+        # Split the image into 512x512 tiles and encode them separately.
+        rows = []
+        for i in range(0, x.shape[2], overlap_size):
+            row = []
+            for j in range(0, x.shape[3], overlap_size):
+                tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
+                tile = self.encoder(tile)
+                tile = self.quant_conv(tile)
+                row.append(tile)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        moments = torch.cat(result_rows, dim=2)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+        r"""
+        Decode a batch of images using a tiled decoder.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_sample_min_size - blend_extent
+
+        # Split z into overlapping 64x64 tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, z.shape[2], overlap_size):
+            row = []
+            for j in range(0, z.shape[3], overlap_size):
+                tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
+                tile = self.post_quant_conv(tile)
+                decoded = self.decoder(tile)
+                row.append(decoded)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        dec = torch.cat(result_rows, dim=2)
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): Input sample.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Whether to sample from the posterior.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+        """
+        x = sample
+        posterior = self.encode(x).latent_dist
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+        dec = self.decode(z).sample
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
diff --git a/diffusers/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/diffusers/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
new file mode 100644
index 0000000000000000000000000000000000000000..04c787ee3e840d3d8855a7a8e15c03f0c1a3e979
--- /dev/null
+++ b/diffusers/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
@@ -0,0 +1,1374 @@
+# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders.single_file_model import FromOriginalModelMixin
+from ...utils import logging
+from ...utils.accelerate_utils import apply_forward_hook
+from ..activations import get_activation
+from ..downsampling import CogVideoXDownsample3D
+from ..modeling_outputs import AutoencoderKLOutput
+from ..modeling_utils import ModelMixin
+from ..upsampling import CogVideoXUpsample3D
+from .vae import DecoderOutput, DiagonalGaussianDistribution
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class CogVideoXSafeConv3d(nn.Conv3d):
+    r"""
+    A 3D convolution layer that splits the input tensor into smaller parts to avoid OOM in CogVideoX Model.
+    """
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        memory_count = torch.prod(torch.tensor(input.shape)).item() * 2 / 1024**3
+
+        # Set to 2GB, suitable for CuDNN
+        if memory_count > 2:
+            kernel_size = self.kernel_size[0]
+            part_num = int(memory_count / 2) + 1
+            input_chunks = torch.chunk(input, part_num, dim=2)
+
+            if kernel_size > 1:
+                input_chunks = [input_chunks[0]] + [
+                    torch.cat((input_chunks[i - 1][:, :, -kernel_size + 1 :], input_chunks[i]), dim=2)
+                    for i in range(1, len(input_chunks))
+                ]
+
+            output_chunks = []
+            for input_chunk in input_chunks:
+                output_chunks.append(super().forward(input_chunk))
+            output = torch.cat(output_chunks, dim=2)
+            return output
+        else:
+            return super().forward(input)
+
+
+class CogVideoXCausalConv3d(nn.Module):
+    r"""A 3D causal convolution layer that pads the input tensor to ensure causality in CogVideoX Model.
+
+    Args:
+        in_channels (`int`): Number of channels in the input tensor.
+        out_channels (`int`): Number of output channels produced by the convolution.
+        kernel_size (`int` or `Tuple[int, int, int]`): Kernel size of the convolutional kernel.
+        stride (`int`, defaults to `1`): Stride of the convolution.
+        dilation (`int`, defaults to `1`): Dilation rate of the convolution.
+        pad_mode (`str`, defaults to `"constant"`): Padding mode.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int, int]],
+        stride: int = 1,
+        dilation: int = 1,
+        pad_mode: str = "constant",
+    ):
+        super().__init__()
+
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size,) * 3
+
+        time_kernel_size, height_kernel_size, width_kernel_size = kernel_size
+
+        self.pad_mode = pad_mode
+        time_pad = dilation * (time_kernel_size - 1) + (1 - stride)
+        height_pad = height_kernel_size // 2
+        width_pad = width_kernel_size // 2
+
+        self.height_pad = height_pad
+        self.width_pad = width_pad
+        self.time_pad = time_pad
+        self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
+
+        self.temporal_dim = 2
+        self.time_kernel_size = time_kernel_size
+
+        stride = (stride, 1, 1)
+        dilation = (dilation, 1, 1)
+        self.conv = CogVideoXSafeConv3d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+        )
+
+        self.conv_cache = None
+
+    def fake_context_parallel_forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        kernel_size = self.time_kernel_size
+        if kernel_size > 1:
+            cached_inputs = (
+                [self.conv_cache] if self.conv_cache is not None else [inputs[:, :, :1]] * (kernel_size - 1)
+            )
+            inputs = torch.cat(cached_inputs + [inputs], dim=2)
+        return inputs
+
+    def _clear_fake_context_parallel_cache(self):
+        del self.conv_cache
+        self.conv_cache = None
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        inputs = self.fake_context_parallel_forward(inputs)
+
+        self._clear_fake_context_parallel_cache()
+        # Note: we could move these to the cpu for a lower maximum memory usage but its only a few
+        # hundred megabytes and so let's not do it for now
+        self.conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
+
+        padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
+        inputs = F.pad(inputs, padding_2d, mode="constant", value=0)
+
+        output = self.conv(inputs)
+        return output
+
+
+class CogVideoXSpatialNorm3D(nn.Module):
+    r"""
+    Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002. This implementation is specific
+    to 3D-video like data.
+
+    CogVideoXSafeConv3d is used instead of nn.Conv3d to avoid OOM in CogVideoX Model.
+
+    Args:
+        f_channels (`int`):
+            The number of channels for input to group normalization layer, and output of the spatial norm layer.
+        zq_channels (`int`):
+            The number of channels for the quantized vector as described in the paper.
+        groups (`int`):
+            Number of groups to separate the channels into for group normalization.
+    """
+
+    def __init__(
+        self,
+        f_channels: int,
+        zq_channels: int,
+        groups: int = 32,
+    ):
+        super().__init__()
+        self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
+        self.conv_y = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
+        self.conv_b = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
+
+    def forward(self, f: torch.Tensor, zq: torch.Tensor) -> torch.Tensor:
+        if f.shape[2] > 1 and f.shape[2] % 2 == 1:
+            f_first, f_rest = f[:, :, :1], f[:, :, 1:]
+            f_first_size, f_rest_size = f_first.shape[-3:], f_rest.shape[-3:]
+            z_first, z_rest = zq[:, :, :1], zq[:, :, 1:]
+            z_first = F.interpolate(z_first, size=f_first_size)
+            z_rest = F.interpolate(z_rest, size=f_rest_size)
+            zq = torch.cat([z_first, z_rest], dim=2)
+        else:
+            zq = F.interpolate(zq, size=f.shape[-3:])
+
+        norm_f = self.norm_layer(f)
+        new_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
+        return new_f
+
+
+class CogVideoXResnetBlock3D(nn.Module):
+    r"""
+    A 3D ResNet block used in the CogVideoX model.
+
+    Args:
+        in_channels (`int`):
+            Number of input channels.
+        out_channels (`int`, *optional*):
+            Number of output channels. If None, defaults to `in_channels`.
+        dropout (`float`, defaults to `0.0`):
+            Dropout rate.
+        temb_channels (`int`, defaults to `512`):
+            Number of time embedding channels.
+        groups (`int`, defaults to `32`):
+            Number of groups to separate the channels into for group normalization.
+        eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        non_linearity (`str`, defaults to `"swish"`):
+            Activation function to use.
+        conv_shortcut (bool, defaults to `False`):
+            Whether or not to use a convolution shortcut.
+        spatial_norm_dim (`int`, *optional*):
+            The dimension to use for spatial norm if it is to be used instead of group norm.
+        pad_mode (str, defaults to `"first"`):
+            Padding mode.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        dropout: float = 0.0,
+        temb_channels: int = 512,
+        groups: int = 32,
+        eps: float = 1e-6,
+        non_linearity: str = "swish",
+        conv_shortcut: bool = False,
+        spatial_norm_dim: Optional[int] = None,
+        pad_mode: str = "first",
+    ):
+        super().__init__()
+
+        out_channels = out_channels or in_channels
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.nonlinearity = get_activation(non_linearity)
+        self.use_conv_shortcut = conv_shortcut
+
+        if spatial_norm_dim is None:
+            self.norm1 = nn.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
+            self.norm2 = nn.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)
+        else:
+            self.norm1 = CogVideoXSpatialNorm3D(
+                f_channels=in_channels,
+                zq_channels=spatial_norm_dim,
+                groups=groups,
+            )
+            self.norm2 = CogVideoXSpatialNorm3D(
+                f_channels=out_channels,
+                zq_channels=spatial_norm_dim,
+                groups=groups,
+            )
+
+        self.conv1 = CogVideoXCausalConv3d(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode
+        )
+
+        if temb_channels > 0:
+            self.temb_proj = nn.Linear(in_features=temb_channels, out_features=out_channels)
+
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = CogVideoXCausalConv3d(
+            in_channels=out_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode
+        )
+
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = CogVideoXCausalConv3d(
+                    in_channels=in_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode
+                )
+            else:
+                self.conv_shortcut = CogVideoXSafeConv3d(
+                    in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0
+                )
+
+    def forward(
+        self,
+        inputs: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        zq: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = inputs
+
+        if zq is not None:
+            hidden_states = self.norm1(hidden_states, zq)
+        else:
+            hidden_states = self.norm1(hidden_states)
+
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        if temb is not None:
+            hidden_states = hidden_states + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]
+
+        if zq is not None:
+            hidden_states = self.norm2(hidden_states, zq)
+        else:
+            hidden_states = self.norm2(hidden_states)
+
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.in_channels != self.out_channels:
+            inputs = self.conv_shortcut(inputs)
+
+        hidden_states = hidden_states + inputs
+        return hidden_states
+
+
+class CogVideoXDownBlock3D(nn.Module):
+    r"""
+    A downsampling block used in the CogVideoX model.
+
+    Args:
+        in_channels (`int`):
+            Number of input channels.
+        out_channels (`int`, *optional*):
+            Number of output channels. If None, defaults to `in_channels`.
+        temb_channels (`int`, defaults to `512`):
+            Number of time embedding channels.
+        num_layers (`int`, defaults to `1`):
+            Number of resnet layers.
+        dropout (`float`, defaults to `0.0`):
+            Dropout rate.
+        resnet_eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        resnet_act_fn (`str`, defaults to `"swish"`):
+            Activation function to use.
+        resnet_groups (`int`, defaults to `32`):
+            Number of groups to separate the channels into for group normalization.
+        add_downsample (`bool`, defaults to `True`):
+            Whether or not to use a downsampling layer. If not used, output dimension would be same as input dimension.
+        compress_time (`bool`, defaults to `False`):
+            Whether or not to downsample across temporal dimension.
+        pad_mode (str, defaults to `"first"`):
+            Padding mode.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        add_downsample: bool = True,
+        downsample_padding: int = 0,
+        compress_time: bool = False,
+        pad_mode: str = "first",
+    ):
+        super().__init__()
+
+        resnets = []
+        for i in range(num_layers):
+            in_channel = in_channels if i == 0 else out_channels
+            resnets.append(
+                CogVideoXResnetBlock3D(
+                    in_channels=in_channel,
+                    out_channels=out_channels,
+                    dropout=dropout,
+                    temb_channels=temb_channels,
+                    groups=resnet_groups,
+                    eps=resnet_eps,
+                    non_linearity=resnet_act_fn,
+                    pad_mode=pad_mode,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.downsamplers = None
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    CogVideoXDownsample3D(
+                        out_channels, out_channels, padding=downsample_padding, compress_time=compress_time
+                    )
+                ]
+            )
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        zq: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def create_forward(*inputs):
+                        return module(*inputs)
+
+                    return create_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb, zq
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, zq)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+        return hidden_states
+
+
+class CogVideoXMidBlock3D(nn.Module):
+    r"""
+    A middle block used in the CogVideoX model.
+
+    Args:
+        in_channels (`int`):
+            Number of input channels.
+        temb_channels (`int`, defaults to `512`):
+            Number of time embedding channels.
+        dropout (`float`, defaults to `0.0`):
+            Dropout rate.
+        num_layers (`int`, defaults to `1`):
+            Number of resnet layers.
+        resnet_eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        resnet_act_fn (`str`, defaults to `"swish"`):
+            Activation function to use.
+        resnet_groups (`int`, defaults to `32`):
+            Number of groups to separate the channels into for group normalization.
+        spatial_norm_dim (`int`, *optional*):
+            The dimension to use for spatial norm if it is to be used instead of group norm.
+        pad_mode (str, defaults to `"first"`):
+            Padding mode.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        spatial_norm_dim: Optional[int] = None,
+        pad_mode: str = "first",
+    ):
+        super().__init__()
+
+        resnets = []
+        for _ in range(num_layers):
+            resnets.append(
+                CogVideoXResnetBlock3D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    dropout=dropout,
+                    temb_channels=temb_channels,
+                    groups=resnet_groups,
+                    eps=resnet_eps,
+                    spatial_norm_dim=spatial_norm_dim,
+                    non_linearity=resnet_act_fn,
+                    pad_mode=pad_mode,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        zq: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def create_forward(*inputs):
+                        return module(*inputs)
+
+                    return create_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb, zq
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, zq)
+
+        return hidden_states
+
+
+class CogVideoXUpBlock3D(nn.Module):
+    r"""
+    An upsampling block used in the CogVideoX model.
+
+    Args:
+        in_channels (`int`):
+            Number of input channels.
+        out_channels (`int`, *optional*):
+            Number of output channels. If None, defaults to `in_channels`.
+        temb_channels (`int`, defaults to `512`):
+            Number of time embedding channels.
+        dropout (`float`, defaults to `0.0`):
+            Dropout rate.
+        num_layers (`int`, defaults to `1`):
+            Number of resnet layers.
+        resnet_eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        resnet_act_fn (`str`, defaults to `"swish"`):
+            Activation function to use.
+        resnet_groups (`int`, defaults to `32`):
+            Number of groups to separate the channels into for group normalization.
+        spatial_norm_dim (`int`, defaults to `16`):
+            The dimension to use for spatial norm if it is to be used instead of group norm.
+        add_upsample (`bool`, defaults to `True`):
+            Whether or not to use a upsampling layer. If not used, output dimension would be same as input dimension.
+        compress_time (`bool`, defaults to `False`):
+            Whether or not to downsample across temporal dimension.
+        pad_mode (str, defaults to `"first"`):
+            Padding mode.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        spatial_norm_dim: int = 16,
+        add_upsample: bool = True,
+        upsample_padding: int = 1,
+        compress_time: bool = False,
+        pad_mode: str = "first",
+    ):
+        super().__init__()
+
+        resnets = []
+        for i in range(num_layers):
+            in_channel = in_channels if i == 0 else out_channels
+            resnets.append(
+                CogVideoXResnetBlock3D(
+                    in_channels=in_channel,
+                    out_channels=out_channels,
+                    dropout=dropout,
+                    temb_channels=temb_channels,
+                    groups=resnet_groups,
+                    eps=resnet_eps,
+                    non_linearity=resnet_act_fn,
+                    spatial_norm_dim=spatial_norm_dim,
+                    pad_mode=pad_mode,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.upsamplers = None
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [
+                    CogVideoXUpsample3D(
+                        out_channels, out_channels, padding=upsample_padding, compress_time=compress_time
+                    )
+                ]
+            )
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        zq: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        r"""Forward method of the `CogVideoXUpBlock3D` class."""
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def create_forward(*inputs):
+                        return module(*inputs)
+
+                    return create_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb, zq
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, zq)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class CogVideoXEncoder3D(nn.Module):
+    r"""
+    The `CogVideoXEncoder3D` layer of a variational autoencoder that encodes its input into a latent representation.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
+            options.
+        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+            The number of output channels for each block.
+        act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 16,
+        down_block_types: Tuple[str, ...] = (
+            "CogVideoXDownBlock3D",
+            "CogVideoXDownBlock3D",
+            "CogVideoXDownBlock3D",
+            "CogVideoXDownBlock3D",
+        ),
+        block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
+        layers_per_block: int = 3,
+        act_fn: str = "silu",
+        norm_eps: float = 1e-6,
+        norm_num_groups: int = 32,
+        dropout: float = 0.0,
+        pad_mode: str = "first",
+        temporal_compression_ratio: float = 4,
+    ):
+        super().__init__()
+
+        # log2 of temporal_compress_times
+        temporal_compress_level = int(np.log2(temporal_compression_ratio))
+
+        self.conv_in = CogVideoXCausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode)
+        self.down_blocks = nn.ModuleList([])
+
+        # down blocks
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            compress_time = i < temporal_compress_level
+
+            if down_block_type == "CogVideoXDownBlock3D":
+                down_block = CogVideoXDownBlock3D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    temb_channels=0,
+                    dropout=dropout,
+                    num_layers=layers_per_block,
+                    resnet_eps=norm_eps,
+                    resnet_act_fn=act_fn,
+                    resnet_groups=norm_num_groups,
+                    add_downsample=not is_final_block,
+                    compress_time=compress_time,
+                )
+            else:
+                raise ValueError("Invalid `down_block_type` encountered. Must be `CogVideoXDownBlock3D`")
+
+            self.down_blocks.append(down_block)
+
+        # mid block
+        self.mid_block = CogVideoXMidBlock3D(
+            in_channels=block_out_channels[-1],
+            temb_channels=0,
+            dropout=dropout,
+            num_layers=2,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            resnet_groups=norm_num_groups,
+            pad_mode=pad_mode,
+        )
+
+        self.norm_out = nn.GroupNorm(norm_num_groups, block_out_channels[-1], eps=1e-6)
+        self.conv_act = nn.SiLU()
+        self.conv_out = CogVideoXCausalConv3d(
+            block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode
+        )
+
+        self.gradient_checkpointing = False
+
+    def forward(self, sample: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        r"""The forward method of the `CogVideoXEncoder3D` class."""
+        hidden_states = self.conv_in(sample)
+
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            # 1. Down
+            for down_block in self.down_blocks:
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(down_block), hidden_states, temb, None
+                )
+
+            # 2. Mid
+            hidden_states = torch.utils.checkpoint.checkpoint(
+                create_custom_forward(self.mid_block), hidden_states, temb, None
+            )
+        else:
+            # 1. Down
+            for down_block in self.down_blocks:
+                hidden_states = down_block(hidden_states, temb, None)
+
+            # 2. Mid
+            hidden_states = self.mid_block(hidden_states, temb, None)
+
+        # 3. Post-process
+        hidden_states = self.norm_out(hidden_states)
+        hidden_states = self.conv_act(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+        return hidden_states
+
+
+class CogVideoXDecoder3D(nn.Module):
+    r"""
+    The `CogVideoXDecoder3D` layer of a variational autoencoder that decodes its latent representation into an output
+    sample.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
+        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+            The number of output channels for each block.
+        act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    def __init__(
+        self,
+        in_channels: int = 16,
+        out_channels: int = 3,
+        up_block_types: Tuple[str, ...] = (
+            "CogVideoXUpBlock3D",
+            "CogVideoXUpBlock3D",
+            "CogVideoXUpBlock3D",
+            "CogVideoXUpBlock3D",
+        ),
+        block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
+        layers_per_block: int = 3,
+        act_fn: str = "silu",
+        norm_eps: float = 1e-6,
+        norm_num_groups: int = 32,
+        dropout: float = 0.0,
+        pad_mode: str = "first",
+        temporal_compression_ratio: float = 4,
+    ):
+        super().__init__()
+
+        reversed_block_out_channels = list(reversed(block_out_channels))
+
+        self.conv_in = CogVideoXCausalConv3d(
+            in_channels, reversed_block_out_channels[0], kernel_size=3, pad_mode=pad_mode
+        )
+
+        # mid block
+        self.mid_block = CogVideoXMidBlock3D(
+            in_channels=reversed_block_out_channels[0],
+            temb_channels=0,
+            num_layers=2,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            resnet_groups=norm_num_groups,
+            spatial_norm_dim=in_channels,
+            pad_mode=pad_mode,
+        )
+
+        # up blocks
+        self.up_blocks = nn.ModuleList([])
+
+        output_channel = reversed_block_out_channels[0]
+        temporal_compress_level = int(np.log2(temporal_compression_ratio))
+
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            compress_time = i < temporal_compress_level
+
+            if up_block_type == "CogVideoXUpBlock3D":
+                up_block = CogVideoXUpBlock3D(
+                    in_channels=prev_output_channel,
+                    out_channels=output_channel,
+                    temb_channels=0,
+                    dropout=dropout,
+                    num_layers=layers_per_block + 1,
+                    resnet_eps=norm_eps,
+                    resnet_act_fn=act_fn,
+                    resnet_groups=norm_num_groups,
+                    spatial_norm_dim=in_channels,
+                    add_upsample=not is_final_block,
+                    compress_time=compress_time,
+                    pad_mode=pad_mode,
+                )
+                prev_output_channel = output_channel
+            else:
+                raise ValueError("Invalid `up_block_type` encountered. Must be `CogVideoXUpBlock3D`")
+
+            self.up_blocks.append(up_block)
+
+        self.norm_out = CogVideoXSpatialNorm3D(reversed_block_out_channels[-1], in_channels, groups=norm_num_groups)
+        self.conv_act = nn.SiLU()
+        self.conv_out = CogVideoXCausalConv3d(
+            reversed_block_out_channels[-1], out_channels, kernel_size=3, pad_mode=pad_mode
+        )
+
+        self.gradient_checkpointing = False
+
+    def forward(self, sample: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        r"""The forward method of the `CogVideoXDecoder3D` class."""
+        hidden_states = self.conv_in(sample)
+
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            # 1. Mid
+            hidden_states = torch.utils.checkpoint.checkpoint(
+                create_custom_forward(self.mid_block), hidden_states, temb, sample
+            )
+
+            # 2. Up
+            for up_block in self.up_blocks:
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(up_block), hidden_states, temb, sample
+                )
+        else:
+            # 1. Mid
+            hidden_states = self.mid_block(hidden_states, temb, sample)
+
+            # 2. Up
+            for up_block in self.up_blocks:
+                hidden_states = up_block(hidden_states, temb, sample)
+
+        # 3. Post-process
+        hidden_states = self.norm_out(hidden_states, sample)
+        hidden_states = self.conv_act(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+        return hidden_states
+
+
+class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+    r"""
+    A VAE model with KL loss for encoding images into latents and decoding latent representations into images. Used in
+    [CogVideoX](https://github.com/THUDM/CogVideo).
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            Tuple of upsample block types.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+            Tuple of block output channels.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
+        scaling_factor (`float`, *optional*, defaults to `1.15258426`):
+            The component-wise standard deviation of the trained latent space computed using the first batch of the
+            training set. This is used to scale the latent space to have unit variance when training the diffusion
+            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+        force_upcast (`bool`, *optional*, default to `True`):
+            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
+            can be fine-tuned / trained to a lower range without loosing too much precision in which case
+            `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["CogVideoXResnetBlock3D"]
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str] = (
+            "CogVideoXDownBlock3D",
+            "CogVideoXDownBlock3D",
+            "CogVideoXDownBlock3D",
+            "CogVideoXDownBlock3D",
+        ),
+        up_block_types: Tuple[str] = (
+            "CogVideoXUpBlock3D",
+            "CogVideoXUpBlock3D",
+            "CogVideoXUpBlock3D",
+            "CogVideoXUpBlock3D",
+        ),
+        block_out_channels: Tuple[int] = (128, 256, 256, 512),
+        latent_channels: int = 16,
+        layers_per_block: int = 3,
+        act_fn: str = "silu",
+        norm_eps: float = 1e-6,
+        norm_num_groups: int = 32,
+        temporal_compression_ratio: float = 4,
+        sample_height: int = 480,
+        sample_width: int = 720,
+        scaling_factor: float = 1.15258426,
+        shift_factor: Optional[float] = None,
+        latents_mean: Optional[Tuple[float]] = None,
+        latents_std: Optional[Tuple[float]] = None,
+        force_upcast: float = True,
+        use_quant_conv: bool = False,
+        use_post_quant_conv: bool = False,
+    ):
+        super().__init__()
+
+        self.encoder = CogVideoXEncoder3D(
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            down_block_types=down_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            norm_eps=norm_eps,
+            norm_num_groups=norm_num_groups,
+            temporal_compression_ratio=temporal_compression_ratio,
+        )
+        self.decoder = CogVideoXDecoder3D(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            norm_eps=norm_eps,
+            norm_num_groups=norm_num_groups,
+            temporal_compression_ratio=temporal_compression_ratio,
+        )
+        self.quant_conv = CogVideoXSafeConv3d(2 * out_channels, 2 * out_channels, 1) if use_quant_conv else None
+        self.post_quant_conv = CogVideoXSafeConv3d(out_channels, out_channels, 1) if use_post_quant_conv else None
+
+        self.use_slicing = False
+        self.use_tiling = False
+
+        # Can be increased to decode more latent frames at once, but comes at a reasonable memory cost and it is not
+        # recommended because the temporal parts of the VAE, here, are tricky to understand.
+        # If you decode X latent frames together, the number of output frames is:
+        #     (X + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) => X + 6 frames
+        #
+        # Example with num_latent_frames_batch_size = 2:
+        #     - 12 latent frames: (0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11) are processed together
+        #         => (12 // 2 frame slices) * ((2 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale))
+        #         => 6 * 8 = 48 frames
+        #     - 13 latent frames: (0, 1, 2) (special case), (3, 4), (5, 6), (7, 8), (9, 10), (11, 12) are processed together
+        #         => (1 frame slice) * ((3 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) +
+        #            ((13 - 3) // 2) * ((2 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale))
+        #         => 1 * 9 + 5 * 8 = 49 frames
+        # It has been implemented this way so as to not have "magic values" in the code base that would be hard to explain. Note that
+        # setting it to anything other than 2 would give poor results because the VAE hasn't been trained to be adaptive with different
+        # number of temporal frames.
+        self.num_latent_frames_batch_size = 2
+        self.num_sample_frames_batch_size = 8
+
+        # We make the minimum height and width of sample for tiling half that of the generally supported
+        self.tile_sample_min_height = sample_height // 2
+        self.tile_sample_min_width = sample_width // 2
+        self.tile_latent_min_height = int(
+            self.tile_sample_min_height / (2 ** (len(self.config.block_out_channels) - 1))
+        )
+        self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** (len(self.config.block_out_channels) - 1)))
+
+        # These are experimental overlap factors that were chosen based on experimentation and seem to work best for
+        # 720x480 (WxH) resolution. The above resolution is the strongly recommended generation resolution in CogVideoX
+        # and so the tiling implementation has only been tested on those specific resolutions.
+        self.tile_overlap_factor_height = 1 / 6
+        self.tile_overlap_factor_width = 1 / 5
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (CogVideoXEncoder3D, CogVideoXDecoder3D)):
+            module.gradient_checkpointing = value
+
+    def _clear_fake_context_parallel_cache(self):
+        for name, module in self.named_modules():
+            if isinstance(module, CogVideoXCausalConv3d):
+                logger.debug(f"Clearing fake Context Parallel cache for layer: {name}")
+                module._clear_fake_context_parallel_cache()
+
+    def enable_tiling(
+        self,
+        tile_sample_min_height: Optional[int] = None,
+        tile_sample_min_width: Optional[int] = None,
+        tile_overlap_factor_height: Optional[float] = None,
+        tile_overlap_factor_width: Optional[float] = None,
+    ) -> None:
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+
+        Args:
+            tile_sample_min_height (`int`, *optional*):
+                The minimum height required for a sample to be separated into tiles across the height dimension.
+            tile_sample_min_width (`int`, *optional*):
+                The minimum width required for a sample to be separated into tiles across the width dimension.
+            tile_overlap_factor_height (`int`, *optional*):
+                The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
+                no tiling artifacts produced across the height dimension. Must be between 0 and 1. Setting a higher
+                value might cause more tiles to be processed leading to slow down of the decoding process.
+            tile_overlap_factor_width (`int`, *optional*):
+                The minimum amount of overlap between two consecutive horizontal tiles. This is to ensure that there
+                are no tiling artifacts produced across the width dimension. Must be between 0 and 1. Setting a higher
+                value might cause more tiles to be processed leading to slow down of the decoding process.
+        """
+        self.use_tiling = True
+        self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
+        self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
+        self.tile_latent_min_height = int(
+            self.tile_sample_min_height / (2 ** (len(self.config.block_out_channels) - 1))
+        )
+        self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** (len(self.config.block_out_channels) - 1)))
+        self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height
+        self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width
+
+    def disable_tiling(self) -> None:
+        r"""
+        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_tiling = False
+
+    def enable_slicing(self) -> None:
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self) -> None:
+        r"""
+        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_slicing = False
+
+    def _encode(self, x: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, num_frames, height, width = x.shape
+
+        if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
+            return self.tiled_encode(x)
+
+        frame_batch_size = self.num_sample_frames_batch_size
+        # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
+        num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+        enc = []
+        for i in range(num_batches):
+            remaining_frames = num_frames % frame_batch_size
+            start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
+            end_frame = frame_batch_size * (i + 1) + remaining_frames
+            x_intermediate = x[:, :, start_frame:end_frame]
+            x_intermediate = self.encoder(x_intermediate)
+            if self.quant_conv is not None:
+                x_intermediate = self.quant_conv(x_intermediate)
+            enc.append(x_intermediate)
+
+        self._clear_fake_context_parallel_cache()
+        enc = torch.cat(enc, dim=2)
+
+        return enc
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.Tensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.Tensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+                The latent representations of the encoded videos. If `return_dict` is True, a
+                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
+        if self.use_slicing and x.shape[0] > 1:
+            encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(encoded_slices)
+        else:
+            h = self._encode(x)
+
+        posterior = DiagonalGaussianDistribution(h)
+
+        if not return_dict:
+            return (posterior,)
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        batch_size, num_channels, num_frames, height, width = z.shape
+
+        if self.use_tiling and (width > self.tile_latent_min_width or height > self.tile_latent_min_height):
+            return self.tiled_decode(z, return_dict=return_dict)
+
+        frame_batch_size = self.num_latent_frames_batch_size
+        num_batches = num_frames // frame_batch_size
+        dec = []
+        for i in range(num_batches):
+            remaining_frames = num_frames % frame_batch_size
+            start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
+            end_frame = frame_batch_size * (i + 1) + remaining_frames
+            z_intermediate = z[:, :, start_frame:end_frame]
+            if self.post_quant_conv is not None:
+                z_intermediate = self.post_quant_conv(z_intermediate)
+            z_intermediate = self.decoder(z_intermediate)
+            dec.append(z_intermediate)
+
+        self._clear_fake_context_parallel_cache()
+        dec = torch.cat(dec, dim=2)
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    @apply_forward_hook
+    def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        """
+        Decode a batch of images.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        if self.use_slicing and z.shape[0] > 1:
+            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
+            decoded = torch.cat(decoded_slices)
+        else:
+            decoded = self._decode(z).sample
+
+        if not return_dict:
+            return (decoded,)
+        return DecoderOutput(sample=decoded)
+
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
+        for y in range(blend_extent):
+            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
+                y / blend_extent
+            )
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[4], b.shape[4], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
+                x / blend_extent
+            )
+        return b
+
+    def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
+        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
+        output, but they should be much less noticeable.
+
+        Args:
+            x (`torch.Tensor`): Input batch of videos.
+
+        Returns:
+            `torch.Tensor`:
+                The latent representation of the encoded videos.
+        """
+        # For a rough memory estimate, take a look at the `tiled_decode` method.
+        batch_size, num_channels, num_frames, height, width = x.shape
+
+        overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height))
+        overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width))
+        blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height)
+        blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width)
+        row_limit_height = self.tile_latent_min_height - blend_extent_height
+        row_limit_width = self.tile_latent_min_width - blend_extent_width
+        frame_batch_size = self.num_sample_frames_batch_size
+
+        # Split x into overlapping tiles and encode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, height, overlap_height):
+            row = []
+            for j in range(0, width, overlap_width):
+                # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
+                num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+                time = []
+                for k in range(num_batches):
+                    remaining_frames = num_frames % frame_batch_size
+                    start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
+                    end_frame = frame_batch_size * (k + 1) + remaining_frames
+                    tile = x[
+                        :,
+                        :,
+                        start_frame:end_frame,
+                        i : i + self.tile_sample_min_height,
+                        j : j + self.tile_sample_min_width,
+                    ]
+                    tile = self.encoder(tile)
+                    if self.quant_conv is not None:
+                        tile = self.quant_conv(tile)
+                    time.append(tile)
+                self._clear_fake_context_parallel_cache()
+                row.append(torch.cat(time, dim=2))
+            rows.append(row)
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent_width)
+                result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
+            result_rows.append(torch.cat(result_row, dim=4))
+
+        enc = torch.cat(result_rows, dim=3)
+        return enc
+
+    def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Decode a batch of images using a tiled decoder.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        # Rough memory assessment:
+        #   - In CogVideoX-2B, there are a total of 24 CausalConv3d layers.
+        #   - The biggest intermediate dimensions are: [1, 128, 9, 480, 720].
+        #   - Assume fp16 (2 bytes per value).
+        # Memory required: 1 * 128 * 9 * 480 * 720 * 24 * 2 / 1024**3 = 17.8 GB
+        #
+        # Memory assessment when using tiling:
+        #   - Assume everything as above but now HxW is 240x360 by tiling in half
+        # Memory required: 1 * 128 * 9 * 240 * 360 * 24 * 2 / 1024**3 = 4.5 GB
+
+        batch_size, num_channels, num_frames, height, width = z.shape
+
+        overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height))
+        overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width))
+        blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height)
+        blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width)
+        row_limit_height = self.tile_sample_min_height - blend_extent_height
+        row_limit_width = self.tile_sample_min_width - blend_extent_width
+        frame_batch_size = self.num_latent_frames_batch_size
+
+        # Split z into overlapping tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, height, overlap_height):
+            row = []
+            for j in range(0, width, overlap_width):
+                num_batches = num_frames // frame_batch_size
+                time = []
+                for k in range(num_batches):
+                    remaining_frames = num_frames % frame_batch_size
+                    start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
+                    end_frame = frame_batch_size * (k + 1) + remaining_frames
+                    tile = z[
+                        :,
+                        :,
+                        start_frame:end_frame,
+                        i : i + self.tile_latent_min_height,
+                        j : j + self.tile_latent_min_width,
+                    ]
+                    if self.post_quant_conv is not None:
+                        tile = self.post_quant_conv(tile)
+                    tile = self.decoder(tile)
+                    time.append(tile)
+                self._clear_fake_context_parallel_cache()
+                row.append(torch.cat(time, dim=2))
+            rows.append(row)
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent_width)
+                result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
+            result_rows.append(torch.cat(result_row, dim=4))
+
+        dec = torch.cat(result_rows, dim=3)
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[torch.Tensor, torch.Tensor]:
+        x = sample
+        posterior = self.encode(x).latent_dist
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+        dec = self.decode(z)
+        if not return_dict:
+            return (dec,)
+        return dec
diff --git a/diffusers/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/diffusers/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..b12226fa4bac2a0659d1b24fa19498e505d558ae
--- /dev/null
+++ b/diffusers/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -0,0 +1,399 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import is_torch_version
+from ...utils.accelerate_utils import apply_forward_hook
+from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
+from ..modeling_outputs import AutoencoderKLOutput
+from ..modeling_utils import ModelMixin
+from ..unets.unet_3d_blocks import MidBlockTemporalDecoder, UpBlockTemporalDecoder
+from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder
+
+
+class TemporalDecoder(nn.Module):
+    def __init__(
+        self,
+        in_channels: int = 4,
+        out_channels: int = 3,
+        block_out_channels: Tuple[int] = (128, 256, 512, 512),
+        layers_per_block: int = 2,
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = nn.Conv2d(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1)
+        self.mid_block = MidBlockTemporalDecoder(
+            num_layers=self.layers_per_block,
+            in_channels=block_out_channels[-1],
+            out_channels=block_out_channels[-1],
+            attention_head_dim=block_out_channels[-1],
+        )
+
+        # up
+        self.up_blocks = nn.ModuleList([])
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i in range(len(block_out_channels)):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+
+            is_final_block = i == len(block_out_channels) - 1
+            up_block = UpBlockTemporalDecoder(
+                num_layers=self.layers_per_block + 1,
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                add_upsample=not is_final_block,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-6)
+
+        self.conv_act = nn.SiLU()
+        self.conv_out = torch.nn.Conv2d(
+            in_channels=block_out_channels[0],
+            out_channels=out_channels,
+            kernel_size=3,
+            padding=1,
+        )
+
+        conv_out_kernel_size = (3, 1, 1)
+        padding = [int(k // 2) for k in conv_out_kernel_size]
+        self.time_conv_out = torch.nn.Conv3d(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=conv_out_kernel_size,
+            padding=padding,
+        )
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        image_only_indicator: torch.FloatTensor,
+        num_frames: int = 1,
+    ) -> torch.FloatTensor:
+        r"""The forward method of the `Decoder` class."""
+
+        sample = self.conv_in(sample)
+
+        upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            if is_torch_version(">=", "1.11.0"):
+                # middle
+                sample = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.mid_block),
+                    sample,
+                    image_only_indicator,
+                    use_reentrant=False,
+                )
+                sample = sample.to(upscale_dtype)
+
+                # up
+                for up_block in self.up_blocks:
+                    sample = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(up_block),
+                        sample,
+                        image_only_indicator,
+                        use_reentrant=False,
+                    )
+            else:
+                # middle
+                sample = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.mid_block),
+                    sample,
+                    image_only_indicator,
+                )
+                sample = sample.to(upscale_dtype)
+
+                # up
+                for up_block in self.up_blocks:
+                    sample = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(up_block),
+                        sample,
+                        image_only_indicator,
+                    )
+        else:
+            # middle
+            sample = self.mid_block(sample, image_only_indicator=image_only_indicator)
+            sample = sample.to(upscale_dtype)
+
+            # up
+            for up_block in self.up_blocks:
+                sample = up_block(sample, image_only_indicator=image_only_indicator)
+
+        # post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        batch_frames, channels, height, width = sample.shape
+        batch_size = batch_frames // num_frames
+        sample = sample[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4)
+        sample = self.time_conv_out(sample)
+
+        sample = sample.permute(0, 2, 1, 3, 4).reshape(batch_frames, channels, height, width)
+
+        return sample
+
+
+class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
+    r"""
+    A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            Tuple of downsample block types.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+            Tuple of block output channels.
+        layers_per_block: (`int`, *optional*, defaults to 1): Number of layers per block.
+        latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
+        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
+        scaling_factor (`float`, *optional*, defaults to 0.18215):
+            The component-wise standard deviation of the trained latent space computed using the first batch of the
+            training set. This is used to scale the latent space to have unit variance when training the diffusion
+            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+        force_upcast (`bool`, *optional*, default to `True`):
+            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
+            can be fine-tuned / trained to a lower range without loosing too much precision in which case
+            `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
+        block_out_channels: Tuple[int] = (64,),
+        layers_per_block: int = 1,
+        latent_channels: int = 4,
+        sample_size: int = 32,
+        scaling_factor: float = 0.18215,
+        force_upcast: float = True,
+    ):
+        super().__init__()
+
+        # pass init params to Encoder
+        self.encoder = Encoder(
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            down_block_types=down_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            double_z=True,
+        )
+
+        # pass init params to Decoder
+        self.decoder = TemporalDecoder(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+        )
+
+        self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
+
+        sample_size = (
+            self.config.sample_size[0]
+            if isinstance(self.config.sample_size, (list, tuple))
+            else self.config.sample_size
+        )
+        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
+        self.tile_overlap_factor = 0.25
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (Encoder, TemporalDecoder)):
+            module.gradient_checkpointing = value
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+                The latent representations of the encoded images. If `return_dict` is True, a
+                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    @apply_forward_hook
+    def decode(
+        self,
+        z: torch.FloatTensor,
+        num_frames: int,
+        return_dict: bool = True,
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        """
+        Decode a batch of images.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+
+        """
+        batch_size = z.shape[0] // num_frames
+        image_only_indicator = torch.zeros(batch_size, num_frames, dtype=z.dtype, device=z.device)
+        decoded = self.decoder(z, num_frames=num_frames, image_only_indicator=image_only_indicator)
+
+        if not return_dict:
+            return (decoded,)
+
+        return DecoderOutput(sample=decoded)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+        num_frames: int = 1,
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): Input sample.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Whether to sample from the posterior.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+        """
+        x = sample
+        posterior = self.encode(x).latent_dist
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+
+        dec = self.decode(z, num_frames=num_frames).sample
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
diff --git a/diffusers/src/diffusers/models/autoencoders/autoencoder_oobleck.py b/diffusers/src/diffusers/models/autoencoders/autoencoder_oobleck.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8e372a709d78b21f362f64f6a47d56cf746d3bd
--- /dev/null
+++ b/diffusers/src/diffusers/models/autoencoders/autoencoder_oobleck.py
@@ -0,0 +1,464 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ...utils.accelerate_utils import apply_forward_hook
+from ...utils.torch_utils import randn_tensor
+from ..modeling_utils import ModelMixin
+
+
+class Snake1d(nn.Module):
+    """
+    A 1-dimensional Snake activation function module.
+    """
+
+    def __init__(self, hidden_dim, logscale=True):
+        super().__init__()
+        self.alpha = nn.Parameter(torch.zeros(1, hidden_dim, 1))
+        self.beta = nn.Parameter(torch.zeros(1, hidden_dim, 1))
+
+        self.alpha.requires_grad = True
+        self.beta.requires_grad = True
+        self.logscale = logscale
+
+    def forward(self, hidden_states):
+        shape = hidden_states.shape
+
+        alpha = self.alpha if not self.logscale else torch.exp(self.alpha)
+        beta = self.beta if not self.logscale else torch.exp(self.beta)
+
+        hidden_states = hidden_states.reshape(shape[0], shape[1], -1)
+        hidden_states = hidden_states + (beta + 1e-9).reciprocal() * torch.sin(alpha * hidden_states).pow(2)
+        hidden_states = hidden_states.reshape(shape)
+        return hidden_states
+
+
+class OobleckResidualUnit(nn.Module):
+    """
+    A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations.
+    """
+
+    def __init__(self, dimension: int = 16, dilation: int = 1):
+        super().__init__()
+        pad = ((7 - 1) * dilation) // 2
+
+        self.snake1 = Snake1d(dimension)
+        self.conv1 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad))
+        self.snake2 = Snake1d(dimension)
+        self.conv2 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=1))
+
+    def forward(self, hidden_state):
+        """
+        Forward pass through the residual unit.
+
+        Args:
+            hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
+                Input tensor .
+
+        Returns:
+            output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`)
+                Input tensor after passing through the residual unit.
+        """
+        output_tensor = hidden_state
+        output_tensor = self.conv1(self.snake1(output_tensor))
+        output_tensor = self.conv2(self.snake2(output_tensor))
+
+        padding = (hidden_state.shape[-1] - output_tensor.shape[-1]) // 2
+        if padding > 0:
+            hidden_state = hidden_state[..., padding:-padding]
+        output_tensor = hidden_state + output_tensor
+        return output_tensor
+
+
+class OobleckEncoderBlock(nn.Module):
+    """Encoder block used in Oobleck encoder."""
+
+    def __init__(self, input_dim, output_dim, stride: int = 1):
+        super().__init__()
+
+        self.res_unit1 = OobleckResidualUnit(input_dim, dilation=1)
+        self.res_unit2 = OobleckResidualUnit(input_dim, dilation=3)
+        self.res_unit3 = OobleckResidualUnit(input_dim, dilation=9)
+        self.snake1 = Snake1d(input_dim)
+        self.conv1 = weight_norm(
+            nn.Conv1d(input_dim, output_dim, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2))
+        )
+
+    def forward(self, hidden_state):
+        hidden_state = self.res_unit1(hidden_state)
+        hidden_state = self.res_unit2(hidden_state)
+        hidden_state = self.snake1(self.res_unit3(hidden_state))
+        hidden_state = self.conv1(hidden_state)
+
+        return hidden_state
+
+
+class OobleckDecoderBlock(nn.Module):
+    """Decoder block used in Oobleck decoder."""
+
+    def __init__(self, input_dim, output_dim, stride: int = 1):
+        super().__init__()
+
+        self.snake1 = Snake1d(input_dim)
+        self.conv_t1 = weight_norm(
+            nn.ConvTranspose1d(
+                input_dim,
+                output_dim,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2),
+            )
+        )
+        self.res_unit1 = OobleckResidualUnit(output_dim, dilation=1)
+        self.res_unit2 = OobleckResidualUnit(output_dim, dilation=3)
+        self.res_unit3 = OobleckResidualUnit(output_dim, dilation=9)
+
+    def forward(self, hidden_state):
+        hidden_state = self.snake1(hidden_state)
+        hidden_state = self.conv_t1(hidden_state)
+        hidden_state = self.res_unit1(hidden_state)
+        hidden_state = self.res_unit2(hidden_state)
+        hidden_state = self.res_unit3(hidden_state)
+
+        return hidden_state
+
+
+class OobleckDiagonalGaussianDistribution(object):
+    def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
+        self.parameters = parameters
+        self.mean, self.scale = parameters.chunk(2, dim=1)
+        self.std = nn.functional.softplus(self.scale) + 1e-4
+        self.var = self.std * self.std
+        self.logvar = torch.log(self.var)
+        self.deterministic = deterministic
+
+    def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor:
+        # make sure sample is on the same device as the parameters and has same dtype
+        sample = randn_tensor(
+            self.mean.shape,
+            generator=generator,
+            device=self.parameters.device,
+            dtype=self.parameters.dtype,
+        )
+        x = self.mean + self.std * sample
+        return x
+
+    def kl(self, other: "OobleckDiagonalGaussianDistribution" = None) -> torch.Tensor:
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        else:
+            if other is None:
+                return (self.mean * self.mean + self.var - self.logvar - 1.0).sum(1).mean()
+            else:
+                normalized_diff = torch.pow(self.mean - other.mean, 2) / other.var
+                var_ratio = self.var / other.var
+                logvar_diff = self.logvar - other.logvar
+
+                kl = normalized_diff + var_ratio + logvar_diff - 1
+
+                kl = kl.sum(1).mean()
+                return kl
+
+    def mode(self) -> torch.Tensor:
+        return self.mean
+
+
+@dataclass
+class AutoencoderOobleckOutput(BaseOutput):
+    """
+    Output of AutoencoderOobleck encoding method.
+
+    Args:
+        latent_dist (`OobleckDiagonalGaussianDistribution`):
+            Encoded outputs of `Encoder` represented as the mean and standard deviation of
+            `OobleckDiagonalGaussianDistribution`. `OobleckDiagonalGaussianDistribution` allows for sampling latents
+            from the distribution.
+    """
+
+    latent_dist: "OobleckDiagonalGaussianDistribution"  # noqa: F821
+
+
+@dataclass
+class OobleckDecoderOutput(BaseOutput):
+    r"""
+    Output of decoding method.
+
+    Args:
+        sample (`torch.Tensor` of shape `(batch_size, audio_channels, sequence_length)`):
+            The decoded output sample from the last layer of the model.
+    """
+
+    sample: torch.Tensor
+
+
+class OobleckEncoder(nn.Module):
+    """Oobleck Encoder"""
+
+    def __init__(self, encoder_hidden_size, audio_channels, downsampling_ratios, channel_multiples):
+        super().__init__()
+
+        strides = downsampling_ratios
+        channel_multiples = [1] + channel_multiples
+
+        # Create first convolution
+        self.conv1 = weight_norm(nn.Conv1d(audio_channels, encoder_hidden_size, kernel_size=7, padding=3))
+
+        self.block = []
+        # Create EncoderBlocks that double channels as they downsample by `stride`
+        for stride_index, stride in enumerate(strides):
+            self.block += [
+                OobleckEncoderBlock(
+                    input_dim=encoder_hidden_size * channel_multiples[stride_index],
+                    output_dim=encoder_hidden_size * channel_multiples[stride_index + 1],
+                    stride=stride,
+                )
+            ]
+
+        self.block = nn.ModuleList(self.block)
+        d_model = encoder_hidden_size * channel_multiples[-1]
+        self.snake1 = Snake1d(d_model)
+        self.conv2 = weight_norm(nn.Conv1d(d_model, encoder_hidden_size, kernel_size=3, padding=1))
+
+    def forward(self, hidden_state):
+        hidden_state = self.conv1(hidden_state)
+
+        for module in self.block:
+            hidden_state = module(hidden_state)
+
+        hidden_state = self.snake1(hidden_state)
+        hidden_state = self.conv2(hidden_state)
+
+        return hidden_state
+
+
+class OobleckDecoder(nn.Module):
+    """Oobleck Decoder"""
+
+    def __init__(self, channels, input_channels, audio_channels, upsampling_ratios, channel_multiples):
+        super().__init__()
+
+        strides = upsampling_ratios
+        channel_multiples = [1] + channel_multiples
+
+        # Add first conv layer
+        self.conv1 = weight_norm(nn.Conv1d(input_channels, channels * channel_multiples[-1], kernel_size=7, padding=3))
+
+        # Add upsampling + MRF blocks
+        block = []
+        for stride_index, stride in enumerate(strides):
+            block += [
+                OobleckDecoderBlock(
+                    input_dim=channels * channel_multiples[len(strides) - stride_index],
+                    output_dim=channels * channel_multiples[len(strides) - stride_index - 1],
+                    stride=stride,
+                )
+            ]
+
+        self.block = nn.ModuleList(block)
+        output_dim = channels
+        self.snake1 = Snake1d(output_dim)
+        self.conv2 = weight_norm(nn.Conv1d(channels, audio_channels, kernel_size=7, padding=3, bias=False))
+
+    def forward(self, hidden_state):
+        hidden_state = self.conv1(hidden_state)
+
+        for layer in self.block:
+            hidden_state = layer(hidden_state)
+
+        hidden_state = self.snake1(hidden_state)
+        hidden_state = self.conv2(hidden_state)
+
+        return hidden_state
+
+
+class AutoencoderOobleck(ModelMixin, ConfigMixin):
+    r"""
+    An autoencoder for encoding waveforms into latents and decoding latent representations into waveforms. First
+    introduced in Stable Audio.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        encoder_hidden_size (`int`, *optional*, defaults to 128):
+            Intermediate representation dimension for the encoder.
+        downsampling_ratios (`List[int]`, *optional*, defaults to `[2, 4, 4, 8, 8]`):
+            Ratios for downsampling in the encoder. These are used in reverse order for upsampling in the decoder.
+        channel_multiples (`List[int]`, *optional*, defaults to `[1, 2, 4, 8, 16]`):
+            Multiples used to determine the hidden sizes of the hidden layers.
+        decoder_channels (`int`, *optional*, defaults to 128):
+            Intermediate representation dimension for the decoder.
+        decoder_input_channels (`int`, *optional*, defaults to 64):
+            Input dimension for the decoder. Corresponds to the latent dimension.
+        audio_channels (`int`, *optional*, defaults to 2):
+            Number of channels in the audio data. Either 1 for mono or 2 for stereo.
+        sampling_rate (`int`, *optional*, defaults to 44100):
+            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
+    """
+
+    _supports_gradient_checkpointing = False
+
+    @register_to_config
+    def __init__(
+        self,
+        encoder_hidden_size=128,
+        downsampling_ratios=[2, 4, 4, 8, 8],
+        channel_multiples=[1, 2, 4, 8, 16],
+        decoder_channels=128,
+        decoder_input_channels=64,
+        audio_channels=2,
+        sampling_rate=44100,
+    ):
+        super().__init__()
+
+        self.encoder_hidden_size = encoder_hidden_size
+        self.downsampling_ratios = downsampling_ratios
+        self.decoder_channels = decoder_channels
+        self.upsampling_ratios = downsampling_ratios[::-1]
+        self.hop_length = int(np.prod(downsampling_ratios))
+        self.sampling_rate = sampling_rate
+
+        self.encoder = OobleckEncoder(
+            encoder_hidden_size=encoder_hidden_size,
+            audio_channels=audio_channels,
+            downsampling_ratios=downsampling_ratios,
+            channel_multiples=channel_multiples,
+        )
+
+        self.decoder = OobleckDecoder(
+            channels=decoder_channels,
+            input_channels=decoder_input_channels,
+            audio_channels=audio_channels,
+            upsampling_ratios=self.upsampling_ratios,
+            channel_multiples=channel_multiples,
+        )
+
+        self.use_slicing = False
+
+    def enable_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_slicing = False
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.Tensor, return_dict: bool = True
+    ) -> Union[AutoencoderOobleckOutput, Tuple[OobleckDiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.Tensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+                The latent representations of the encoded images. If `return_dict` is True, a
+                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
+        if self.use_slicing and x.shape[0] > 1:
+            encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(encoded_slices)
+        else:
+            h = self.encoder(x)
+
+        posterior = OobleckDiagonalGaussianDistribution(h)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderOobleckOutput(latent_dist=posterior)
+
+    def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[OobleckDecoderOutput, torch.Tensor]:
+        dec = self.decoder(z)
+
+        if not return_dict:
+            return (dec,)
+
+        return OobleckDecoderOutput(sample=dec)
+
+    @apply_forward_hook
+    def decode(
+        self, z: torch.FloatTensor, return_dict: bool = True, generator=None
+    ) -> Union[OobleckDecoderOutput, torch.FloatTensor]:
+        """
+        Decode a batch of images.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.OobleckDecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.OobleckDecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.OobleckDecoderOutput`] is returned, otherwise a plain `tuple`
+                is returned.
+
+        """
+        if self.use_slicing and z.shape[0] > 1:
+            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
+            decoded = torch.cat(decoded_slices)
+        else:
+            decoded = self._decode(z).sample
+
+        if not return_dict:
+            return (decoded,)
+
+        return OobleckDecoderOutput(sample=decoded)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[OobleckDecoderOutput, torch.Tensor]:
+        r"""
+        Args:
+            sample (`torch.Tensor`): Input sample.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Whether to sample from the posterior.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`OobleckDecoderOutput`] instead of a plain tuple.
+        """
+        x = sample
+        posterior = self.encode(x).latent_dist
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+        dec = self.decode(z).sample
+
+        if not return_dict:
+            return (dec,)
+
+        return OobleckDecoderOutput(sample=dec)
diff --git a/diffusers/src/diffusers/models/autoencoders/autoencoder_tiny.py b/diffusers/src/diffusers/models/autoencoders/autoencoder_tiny.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef43526cf8a03ea0792904fed7e7bc1ee8d8910c
--- /dev/null
+++ b/diffusers/src/diffusers/models/autoencoders/autoencoder_tiny.py
@@ -0,0 +1,347 @@
+# Copyright 2024 Ollin Boer Bohan and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ...utils.accelerate_utils import apply_forward_hook
+from ..modeling_utils import ModelMixin
+from .vae import DecoderOutput, DecoderTiny, EncoderTiny
+
+
+@dataclass
+class AutoencoderTinyOutput(BaseOutput):
+    """
+    Output of AutoencoderTiny encoding method.
+
+    Args:
+        latents (`torch.Tensor`): Encoded outputs of the `Encoder`.
+
+    """
+
+    latents: torch.Tensor
+
+
+class AutoencoderTiny(ModelMixin, ConfigMixin):
+    r"""
+    A tiny distilled VAE model for encoding images into latents and decoding latent representations into images.
+
+    [`AutoencoderTiny`] is a wrapper around the original implementation of `TAESD`.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented for
+    all models (such as downloading or saving).
+
+    Parameters:
+        in_channels (`int`, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (`int`,  *optional*, defaults to 3): Number of channels in the output.
+        encoder_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64, 64, 64, 64)`):
+            Tuple of integers representing the number of output channels for each encoder block. The length of the
+            tuple should be equal to the number of encoder blocks.
+        decoder_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64, 64, 64, 64)`):
+            Tuple of integers representing the number of output channels for each decoder block. The length of the
+            tuple should be equal to the number of decoder blocks.
+        act_fn (`str`, *optional*, defaults to `"relu"`):
+            Activation function to be used throughout the model.
+        latent_channels (`int`, *optional*, defaults to 4):
+            Number of channels in the latent representation. The latent space acts as a compressed representation of
+            the input image.
+        upsampling_scaling_factor (`int`, *optional*, defaults to 2):
+            Scaling factor for upsampling in the decoder. It determines the size of the output image during the
+            upsampling process.
+        num_encoder_blocks (`Tuple[int]`, *optional*, defaults to `(1, 3, 3, 3)`):
+            Tuple of integers representing the number of encoder blocks at each stage of the encoding process. The
+            length of the tuple should be equal to the number of stages in the encoder. Each stage has a different
+            number of encoder blocks.
+        num_decoder_blocks (`Tuple[int]`, *optional*, defaults to `(3, 3, 3, 1)`):
+            Tuple of integers representing the number of decoder blocks at each stage of the decoding process. The
+            length of the tuple should be equal to the number of stages in the decoder. Each stage has a different
+            number of decoder blocks.
+        latent_magnitude (`float`, *optional*, defaults to 3.0):
+            Magnitude of the latent representation. This parameter scales the latent representation values to control
+            the extent of information preservation.
+        latent_shift (float, *optional*, defaults to 0.5):
+            Shift applied to the latent representation. This parameter controls the center of the latent space.
+        scaling_factor (`float`, *optional*, defaults to 1.0):
+            The component-wise standard deviation of the trained latent space computed using the first batch of the
+            training set. This is used to scale the latent space to have unit variance when training the diffusion
+            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. For this Autoencoder,
+            however, no such scaling factor was used, hence the value of 1.0 as the default.
+        force_upcast (`bool`, *optional*, default to `False`):
+            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
+            can be fine-tuned / trained to a lower range without losing too much precision, in which case
+            `force_upcast` can be set to `False` (see this fp16-friendly
+            [AutoEncoder](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        encoder_block_out_channels: Tuple[int, ...] = (64, 64, 64, 64),
+        decoder_block_out_channels: Tuple[int, ...] = (64, 64, 64, 64),
+        act_fn: str = "relu",
+        latent_channels: int = 4,
+        upsampling_scaling_factor: int = 2,
+        num_encoder_blocks: Tuple[int, ...] = (1, 3, 3, 3),
+        num_decoder_blocks: Tuple[int, ...] = (3, 3, 3, 1),
+        latent_magnitude: int = 3,
+        latent_shift: float = 0.5,
+        force_upcast: bool = False,
+        scaling_factor: float = 1.0,
+    ):
+        super().__init__()
+
+        if len(encoder_block_out_channels) != len(num_encoder_blocks):
+            raise ValueError("`encoder_block_out_channels` should have the same length as `num_encoder_blocks`.")
+        if len(decoder_block_out_channels) != len(num_decoder_blocks):
+            raise ValueError("`decoder_block_out_channels` should have the same length as `num_decoder_blocks`.")
+
+        self.encoder = EncoderTiny(
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            num_blocks=num_encoder_blocks,
+            block_out_channels=encoder_block_out_channels,
+            act_fn=act_fn,
+        )
+
+        self.decoder = DecoderTiny(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            num_blocks=num_decoder_blocks,
+            block_out_channels=decoder_block_out_channels,
+            upsampling_scaling_factor=upsampling_scaling_factor,
+            act_fn=act_fn,
+        )
+
+        self.latent_magnitude = latent_magnitude
+        self.latent_shift = latent_shift
+        self.scaling_factor = scaling_factor
+
+        self.use_slicing = False
+        self.use_tiling = False
+
+        # only relevant if vae tiling is enabled
+        self.spatial_scale_factor = 2**out_channels
+        self.tile_overlap_factor = 0.125
+        self.tile_sample_min_size = 512
+        self.tile_latent_min_size = self.tile_sample_min_size // self.spatial_scale_factor
+
+        self.register_to_config(block_out_channels=decoder_block_out_channels)
+        self.register_to_config(force_upcast=False)
+
+    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
+        if isinstance(module, (EncoderTiny, DecoderTiny)):
+            module.gradient_checkpointing = value
+
+    def scale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        """raw latents -> [0, 1]"""
+        return x.div(2 * self.latent_magnitude).add(self.latent_shift).clamp(0, 1)
+
+    def unscale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        """[0, 1] -> raw latents"""
+        return x.sub(self.latent_shift).mul(2 * self.latent_magnitude)
+
+    def enable_slicing(self) -> None:
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self) -> None:
+        r"""
+        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_slicing = False
+
+    def enable_tiling(self, use_tiling: bool = True) -> None:
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.use_tiling = use_tiling
+
+    def disable_tiling(self) -> None:
+        r"""
+        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.enable_tiling(False)
+
+    def _tiled_encode(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+
+        Returns:
+            `torch.FloatTensor`: Encoded batch of images.
+        """
+        # scale of encoder output relative to input
+        sf = self.spatial_scale_factor
+        tile_size = self.tile_sample_min_size
+
+        # number of pixels to blend and to traverse between tile
+        blend_size = int(tile_size * self.tile_overlap_factor)
+        traverse_size = tile_size - blend_size
+
+        # tiles index (up/left)
+        ti = range(0, x.shape[-2], traverse_size)
+        tj = range(0, x.shape[-1], traverse_size)
+
+        # mask for blending
+        blend_masks = torch.stack(
+            torch.meshgrid([torch.arange(tile_size / sf) / (blend_size / sf - 1)] * 2, indexing="ij")
+        )
+        blend_masks = blend_masks.clamp(0, 1).to(x.device)
+
+        # output array
+        out = torch.zeros(x.shape[0], 4, x.shape[-2] // sf, x.shape[-1] // sf, device=x.device)
+        for i in ti:
+            for j in tj:
+                tile_in = x[..., i : i + tile_size, j : j + tile_size]
+                # tile result
+                tile_out = out[..., i // sf : (i + tile_size) // sf, j // sf : (j + tile_size) // sf]
+                tile = self.encoder(tile_in)
+                h, w = tile.shape[-2], tile.shape[-1]
+                # blend tile result into output
+                blend_mask_i = torch.ones_like(blend_masks[0]) if i == 0 else blend_masks[0]
+                blend_mask_j = torch.ones_like(blend_masks[1]) if j == 0 else blend_masks[1]
+                blend_mask = blend_mask_i * blend_mask_j
+                tile, blend_mask = tile[..., :h, :w], blend_mask[..., :h, :w]
+                tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out)
+        return out
+
+    def _tiled_decode(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+
+        Returns:
+            `torch.FloatTensor`: Encoded batch of images.
+        """
+        # scale of decoder output relative to input
+        sf = self.spatial_scale_factor
+        tile_size = self.tile_latent_min_size
+
+        # number of pixels to blend and to traverse between tiles
+        blend_size = int(tile_size * self.tile_overlap_factor)
+        traverse_size = tile_size - blend_size
+
+        # tiles index (up/left)
+        ti = range(0, x.shape[-2], traverse_size)
+        tj = range(0, x.shape[-1], traverse_size)
+
+        # mask for blending
+        blend_masks = torch.stack(
+            torch.meshgrid([torch.arange(tile_size * sf) / (blend_size * sf - 1)] * 2, indexing="ij")
+        )
+        blend_masks = blend_masks.clamp(0, 1).to(x.device)
+
+        # output array
+        out = torch.zeros(x.shape[0], 3, x.shape[-2] * sf, x.shape[-1] * sf, device=x.device)
+        for i in ti:
+            for j in tj:
+                tile_in = x[..., i : i + tile_size, j : j + tile_size]
+                # tile result
+                tile_out = out[..., i * sf : (i + tile_size) * sf, j * sf : (j + tile_size) * sf]
+                tile = self.decoder(tile_in)
+                h, w = tile.shape[-2], tile.shape[-1]
+                # blend tile result into output
+                blend_mask_i = torch.ones_like(blend_masks[0]) if i == 0 else blend_masks[0]
+                blend_mask_j = torch.ones_like(blend_masks[1]) if j == 0 else blend_masks[1]
+                blend_mask = (blend_mask_i * blend_mask_j)[..., :h, :w]
+                tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out)
+        return out
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[AutoencoderTinyOutput, Tuple[torch.FloatTensor]]:
+        if self.use_slicing and x.shape[0] > 1:
+            output = [
+                self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x_slice) for x_slice in x.split(1)
+            ]
+            output = torch.cat(output)
+        else:
+            output = self._tiled_encode(x) if self.use_tiling else self.encoder(x)
+
+        if not return_dict:
+            return (output,)
+
+        return AutoencoderTinyOutput(latents=output)
+
+    @apply_forward_hook
+    def decode(
+        self, x: torch.FloatTensor, generator: Optional[torch.Generator] = None, return_dict: bool = True
+    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+        if self.use_slicing and x.shape[0] > 1:
+            output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)]
+            output = torch.cat(output)
+        else:
+            output = self._tiled_decode(x) if self.use_tiling else self.decoder(x)
+
+        if not return_dict:
+            return (output,)
+
+        return DecoderOutput(sample=output)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        return_dict: bool = True,
+    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): Input sample.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+        """
+        enc = self.encode(sample).latents
+
+        # scale latents to be in [0, 1], then quantize latents to a byte tensor,
+        # as if we were storing the latents in an RGBA uint8 image.
+        scaled_enc = self.scale_latents(enc).mul_(255).round_().byte()
+
+        # unquantize latents back into [0, 1], then unscale latents back to their original range,
+        # as if we were loading the latents from an RGBA uint8 image.
+        unscaled_enc = self.unscale_latents(scaled_enc / 255.0)
+
+        dec = self.decode(unscaled_enc)
+
+        if not return_dict:
+            return (dec,)
+        return DecoderOutput(sample=dec)
diff --git a/diffusers/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/diffusers/src/diffusers/models/autoencoders/consistency_decoder_vae.py
new file mode 100644
index 0000000000000000000000000000000000000000..72c512da983060685375311567363884d652585e
--- /dev/null
+++ b/diffusers/src/diffusers/models/autoencoders/consistency_decoder_vae.py
@@ -0,0 +1,435 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...schedulers import ConsistencyDecoderScheduler
+from ...utils import BaseOutput
+from ...utils.accelerate_utils import apply_forward_hook
+from ...utils.torch_utils import randn_tensor
+from ..attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from ..modeling_utils import ModelMixin
+from ..unets.unet_2d import UNet2DModel
+from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder
+
+
+@dataclass
+class ConsistencyDecoderVAEOutput(BaseOutput):
+    """
+    Output of encoding method.
+
+    Args:
+        latent_dist (`DiagonalGaussianDistribution`):
+            Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
+            `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
+    """
+
+    latent_dist: "DiagonalGaussianDistribution"
+
+
+class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
+    r"""
+    The consistency decoder used with DALL-E 3.
+
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionPipeline, ConsistencyDecoderVAE
+
+        >>> vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
+        ... ).to("cuda")
+
+        >>> pipe("horse", generator=torch.manual_seed(0)).images
+        ```
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        scaling_factor: float = 0.18215,
+        latent_channels: int = 4,
+        encoder_act_fn: str = "silu",
+        encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
+        encoder_double_z: bool = True,
+        encoder_down_block_types: Tuple[str, ...] = (
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+        ),
+        encoder_in_channels: int = 3,
+        encoder_layers_per_block: int = 2,
+        encoder_norm_num_groups: int = 32,
+        encoder_out_channels: int = 4,
+        decoder_add_attention: bool = False,
+        decoder_block_out_channels: Tuple[int, ...] = (320, 640, 1024, 1024),
+        decoder_down_block_types: Tuple[str, ...] = (
+            "ResnetDownsampleBlock2D",
+            "ResnetDownsampleBlock2D",
+            "ResnetDownsampleBlock2D",
+            "ResnetDownsampleBlock2D",
+        ),
+        decoder_downsample_padding: int = 1,
+        decoder_in_channels: int = 7,
+        decoder_layers_per_block: int = 3,
+        decoder_norm_eps: float = 1e-05,
+        decoder_norm_num_groups: int = 32,
+        decoder_num_train_timesteps: int = 1024,
+        decoder_out_channels: int = 6,
+        decoder_resnet_time_scale_shift: str = "scale_shift",
+        decoder_time_embedding_type: str = "learned",
+        decoder_up_block_types: Tuple[str, ...] = (
+            "ResnetUpsampleBlock2D",
+            "ResnetUpsampleBlock2D",
+            "ResnetUpsampleBlock2D",
+            "ResnetUpsampleBlock2D",
+        ),
+    ):
+        super().__init__()
+        self.encoder = Encoder(
+            act_fn=encoder_act_fn,
+            block_out_channels=encoder_block_out_channels,
+            double_z=encoder_double_z,
+            down_block_types=encoder_down_block_types,
+            in_channels=encoder_in_channels,
+            layers_per_block=encoder_layers_per_block,
+            norm_num_groups=encoder_norm_num_groups,
+            out_channels=encoder_out_channels,
+        )
+
+        self.decoder_unet = UNet2DModel(
+            add_attention=decoder_add_attention,
+            block_out_channels=decoder_block_out_channels,
+            down_block_types=decoder_down_block_types,
+            downsample_padding=decoder_downsample_padding,
+            in_channels=decoder_in_channels,
+            layers_per_block=decoder_layers_per_block,
+            norm_eps=decoder_norm_eps,
+            norm_num_groups=decoder_norm_num_groups,
+            num_train_timesteps=decoder_num_train_timesteps,
+            out_channels=decoder_out_channels,
+            resnet_time_scale_shift=decoder_resnet_time_scale_shift,
+            time_embedding_type=decoder_time_embedding_type,
+            up_block_types=decoder_up_block_types,
+        )
+        self.decoder_scheduler = ConsistencyDecoderScheduler()
+        self.register_to_config(block_out_channels=encoder_block_out_channels)
+        self.register_to_config(force_upcast=False)
+        self.register_buffer(
+            "means",
+            torch.tensor([0.38862467, 0.02253063, 0.07381133, -0.0171294])[None, :, None, None],
+            persistent=False,
+        )
+        self.register_buffer(
+            "stds", torch.tensor([0.9654121, 1.0440036, 0.76147926, 0.77022034])[None, :, None, None], persistent=False
+        )
+
+        self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
+
+        self.use_slicing = False
+        self.use_tiling = False
+
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_tiling
+    def enable_tiling(self, use_tiling: bool = True):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.use_tiling = use_tiling
+
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.disable_tiling
+    def disable_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.enable_tiling(False)
+
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_slicing
+    def enable_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.disable_slicing
+    def disable_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_slicing = False
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[ConsistencyDecoderVAEOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.consistecy_decoder_vae.ConsistencyDecoderOoutput`] instead of a plain
+                tuple.
+
+        Returns:
+                The latent representations of the encoded images. If `return_dict` is True, a
+                [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a plain `tuple`
+                is returned.
+        """
+        if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
+            return self.tiled_encode(x, return_dict=return_dict)
+
+        if self.use_slicing and x.shape[0] > 1:
+            encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(encoded_slices)
+        else:
+            h = self.encoder(x)
+
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return ConsistencyDecoderVAEOutput(latent_dist=posterior)
+
+    @apply_forward_hook
+    def decode(
+        self,
+        z: torch.FloatTensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+        num_inference_steps: int = 2,
+    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+        z = (z * self.config.scaling_factor - self.means) / self.stds
+
+        scale_factor = 2 ** (len(self.config.block_out_channels) - 1)
+        z = F.interpolate(z, mode="nearest", scale_factor=scale_factor)
+
+        batch_size, _, height, width = z.shape
+
+        self.decoder_scheduler.set_timesteps(num_inference_steps, device=self.device)
+
+        x_t = self.decoder_scheduler.init_noise_sigma * randn_tensor(
+            (batch_size, 3, height, width), generator=generator, dtype=z.dtype, device=z.device
+        )
+
+        for t in self.decoder_scheduler.timesteps:
+            model_input = torch.concat([self.decoder_scheduler.scale_model_input(x_t, t), z], dim=1)
+            model_output = self.decoder_unet(model_input, t).sample[:, :3, :, :]
+            prev_sample = self.decoder_scheduler.step(model_output, t, x_t, generator).prev_sample
+            x_t = prev_sample
+
+        x_0 = x_t
+
+        if not return_dict:
+            return (x_0,)
+
+        return DecoderOutput(sample=x_0)
+
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.blend_v
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
+        for y in range(blend_extent):
+            b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
+        return b
+
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.blend_h
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
+        return b
+
+    def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> ConsistencyDecoderVAEOutput:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
+        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
+        output, but they should be much less noticeable.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a
+                plain tuple.
+
+        Returns:
+            [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`:
+                If return_dict is True, a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned,
+                otherwise a plain `tuple` is returned.
+        """
+        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_latent_min_size - blend_extent
+
+        # Split the image into 512x512 tiles and encode them separately.
+        rows = []
+        for i in range(0, x.shape[2], overlap_size):
+            row = []
+            for j in range(0, x.shape[3], overlap_size):
+                tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
+                tile = self.encoder(tile)
+                tile = self.quant_conv(tile)
+                row.append(tile)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        moments = torch.cat(result_rows, dim=2)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return ConsistencyDecoderVAEOutput(latent_dist=posterior)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): Input sample.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Whether to sample from the posterior.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+            generator (`torch.Generator`, *optional*, defaults to `None`):
+                Generator to use for sampling.
+
+        Returns:
+            [`DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`DecoderOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
+        x = sample
+        posterior = self.encode(x).latent_dist
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+        dec = self.decode(z, generator=generator).sample
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
diff --git a/diffusers/src/diffusers/models/autoencoders/vae.py b/diffusers/src/diffusers/models/autoencoders/vae.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad9866115f795a784bbef0dcb0cd39262491a528
--- /dev/null
+++ b/diffusers/src/diffusers/models/autoencoders/vae.py
@@ -0,0 +1,1005 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ...utils import BaseOutput, is_torch_version
+from ...utils.torch_utils import randn_tensor
+from ..activations import get_activation
+from ..attention_processor import SpatialNorm
+from ..unets.unet_2d_blocks import (
+    AutoencoderTinyBlock,
+    UNetMidBlock2D,
+    get_down_block,
+    get_up_block,
+)
+
+
+@dataclass
+class DecoderOutput(BaseOutput):
+    r"""
+    Output of decoding method.
+
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            The decoded output sample from the last layer of the model.
+    """
+
+    sample: torch.FloatTensor
+
+
+class Encoder(nn.Module):
+    r"""
+    The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
+            options.
+        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+            The number of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+        double_z (`bool`, *optional*, defaults to `True`):
+            Whether to double the number of output channels for the last block.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        double_z: bool = True,
+        mid_block_add_attention=True,
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = nn.Conv2d(
+            in_channels,
+            block_out_channels[0],
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        self.mid_block = None
+        self.down_blocks = nn.ModuleList([])
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=self.layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                add_downsample=not is_final_block,
+                resnet_eps=1e-6,
+                downsample_padding=0,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                attention_head_dim=output_channel,
+                temb_channels=None,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlock2D(
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            resnet_time_scale_shift="default",
+            attention_head_dim=block_out_channels[-1],
+            resnet_groups=norm_num_groups,
+            temb_channels=None,
+            add_attention=mid_block_add_attention,
+        )
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
+        self.conv_act = nn.SiLU()
+
+        conv_out_channels = 2 * out_channels if double_z else out_channels
+        self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1)
+
+        self.gradient_checkpointing = False
+
+    def forward(self, sample: torch.FloatTensor, hidden_flag = False) -> torch.FloatTensor:
+        r"""The forward method of the `Encoder` class."""
+
+        sample = self.conv_in(sample)
+        hidden_list = []
+
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            # down
+            if is_torch_version(">=", "1.11.0"):
+                for down_block in self.down_blocks:
+                    sample = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(down_block), sample, use_reentrant=False
+                    )
+                # middle
+                sample = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.mid_block), sample, use_reentrant=False
+                )
+            else:
+                for down_block in self.down_blocks:
+                    sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample)
+                # middle
+                sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
+
+        else:
+            # down
+            hidden_list.append(sample)
+            for down_block in self.down_blocks:
+                sample = down_block(sample)
+                hidden_list.append(sample)
+
+            # middle
+            sample = self.mid_block(sample)
+            # hidden_list.append(sample)
+
+        # post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if hidden_flag:
+            return sample, hidden_list
+        else:
+            return sample
+
+
+class Decoder(nn.Module):
+    r"""
+    The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
+        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+            The number of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+        norm_type (`str`, *optional*, defaults to `"group"`):
+            The normalization type to use. Can be either `"group"` or `"spatial"`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        norm_type: str = "group",  # group, spatial
+        mid_block_add_attention=True,
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = nn.Conv2d(
+            in_channels,
+            block_out_channels[-1],
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+
+        temb_channels = in_channels if norm_type == "spatial" else None
+
+        # mid
+        self.mid_block = UNetMidBlock2D(
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            resnet_time_scale_shift="default" if norm_type == "group" else norm_type,
+            attention_head_dim=block_out_channels[-1],
+            resnet_groups=norm_num_groups,
+            temb_channels=temb_channels,
+            add_attention=mid_block_add_attention,
+        )
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=self.layers_per_block + 1,
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                prev_output_channel=None,
+                add_upsample=not is_final_block,
+                resnet_eps=1e-6,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                attention_head_dim=output_channel,
+                temb_channels=temb_channels,
+                resnet_time_scale_shift=norm_type,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        if norm_type == "spatial":
+            self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels)
+        else:
+            self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
+        self.conv_act = nn.SiLU()
+        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        latent_embeds: Optional[torch.FloatTensor] = None,
+        hidden_list: list = None,
+    ) -> torch.FloatTensor:
+        r"""The forward method of the `Decoder` class."""
+
+        if hidden_list is not None:
+            hidden_list.reverse()
+            hidden_idx = 0
+        sample = self.conv_in(sample)
+
+        upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            if is_torch_version(">=", "1.11.0"):
+                # middle
+                sample = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.mid_block),
+                    sample,
+                    latent_embeds,
+                    use_reentrant=False,
+                )
+                sample = sample.to(upscale_dtype)
+
+                # up
+                for up_block in self.up_blocks:
+                    sample = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(up_block),
+                        sample,
+                        latent_embeds,
+                        use_reentrant=False,
+                    )
+            else:
+                # middle
+                sample = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.mid_block), sample, latent_embeds
+                )
+                sample = sample.to(upscale_dtype)
+
+                # up
+                for up_block in self.up_blocks:
+                    sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
+        else:
+            # middle
+            # print(sample.shape)
+            if hidden_list is not None:
+                # print(sample.shape, hidden_list[hidden_idx].shape)
+                sample += hidden_list[hidden_idx]
+                hidden_idx += 1
+            sample = self.mid_block(sample, latent_embeds)
+            sample = sample.to(upscale_dtype)
+            
+
+            # up
+            for up_block in self.up_blocks:
+                # print(sample.shape)
+                if hidden_list is not None:
+                    # print(sample.shape, hidden_list[hidden_idx].shape)
+                    sample += hidden_list[hidden_idx]
+                    hidden_idx += 1
+                sample = up_block(sample, latent_embeds)
+
+        # post-process
+        if latent_embeds is None:
+            sample = self.conv_norm_out(sample)
+        else:
+            sample = self.conv_norm_out(sample, latent_embeds)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class UpSample(nn.Module):
+    r"""
+    The `UpSample` layer of a variational autoencoder that upsamples its input.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+    ) -> None:
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1)
+
+    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        r"""The forward method of the `UpSample` class."""
+        x = torch.relu(x)
+        x = self.deconv(x)
+        return x
+
+
+class MaskConditionEncoder(nn.Module):
+    """
+    used in AsymmetricAutoencoderKL
+    """
+
+    def __init__(
+        self,
+        in_ch: int,
+        out_ch: int = 192,
+        res_ch: int = 768,
+        stride: int = 16,
+    ) -> None:
+        super().__init__()
+
+        channels = []
+        while stride > 1:
+            stride = stride // 2
+            in_ch_ = out_ch * 2
+            if out_ch > res_ch:
+                out_ch = res_ch
+            if stride == 1:
+                in_ch_ = res_ch
+            channels.append((in_ch_, out_ch))
+            out_ch *= 2
+
+        out_channels = []
+        for _in_ch, _out_ch in channels:
+            out_channels.append(_out_ch)
+        out_channels.append(channels[-1][0])
+
+        layers = []
+        in_ch_ = in_ch
+        for l in range(len(out_channels)):
+            out_ch_ = out_channels[l]
+            if l == 0 or l == 1:
+                layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=3, stride=1, padding=1))
+            else:
+                layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=4, stride=2, padding=1))
+            in_ch_ = out_ch_
+
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self, x: torch.FloatTensor, mask=None) -> torch.FloatTensor:
+        r"""The forward method of the `MaskConditionEncoder` class."""
+        out = {}
+        for l in range(len(self.layers)):
+            layer = self.layers[l]
+            x = layer(x)
+            out[str(tuple(x.shape))] = x
+            x = torch.relu(x)
+        return out
+
+
+class MaskConditionDecoder(nn.Module):
+    r"""The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's
+    decoder with a conditioner on the mask and masked image.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
+        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+            The number of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+        norm_type (`str`, *optional*, defaults to `"group"`):
+            The normalization type to use. Can be either `"group"` or `"spatial"`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        norm_type: str = "group",  # group, spatial
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = nn.Conv2d(
+            in_channels,
+            block_out_channels[-1],
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+
+        temb_channels = in_channels if norm_type == "spatial" else None
+
+        # mid
+        self.mid_block = UNetMidBlock2D(
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            resnet_time_scale_shift="default" if norm_type == "group" else norm_type,
+            attention_head_dim=block_out_channels[-1],
+            resnet_groups=norm_num_groups,
+            temb_channels=temb_channels,
+        )
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=self.layers_per_block + 1,
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                prev_output_channel=None,
+                add_upsample=not is_final_block,
+                resnet_eps=1e-6,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                attention_head_dim=output_channel,
+                temb_channels=temb_channels,
+                resnet_time_scale_shift=norm_type,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # condition encoder
+        self.condition_encoder = MaskConditionEncoder(
+            in_ch=out_channels,
+            out_ch=block_out_channels[0],
+            res_ch=block_out_channels[-1],
+        )
+
+        # out
+        if norm_type == "spatial":
+            self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels)
+        else:
+            self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
+        self.conv_act = nn.SiLU()
+        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        z: torch.FloatTensor,
+        image: Optional[torch.FloatTensor] = None,
+        mask: Optional[torch.FloatTensor] = None,
+        latent_embeds: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        r"""The forward method of the `MaskConditionDecoder` class."""
+        sample = z
+        sample = self.conv_in(sample)
+
+        upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            if is_torch_version(">=", "1.11.0"):
+                # middle
+                sample = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.mid_block),
+                    sample,
+                    latent_embeds,
+                    use_reentrant=False,
+                )
+                sample = sample.to(upscale_dtype)
+
+                # condition encoder
+                if image is not None and mask is not None:
+                    masked_image = (1 - mask) * image
+                    im_x = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(self.condition_encoder),
+                        masked_image,
+                        mask,
+                        use_reentrant=False,
+                    )
+
+                # up
+                for up_block in self.up_blocks:
+                    if image is not None and mask is not None:
+                        sample_ = im_x[str(tuple(sample.shape))]
+                        mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest")
+                        sample = sample * mask_ + sample_ * (1 - mask_)
+                    sample = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(up_block),
+                        sample,
+                        latent_embeds,
+                        use_reentrant=False,
+                    )
+                if image is not None and mask is not None:
+                    sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask)
+            else:
+                # middle
+                sample = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.mid_block), sample, latent_embeds
+                )
+                sample = sample.to(upscale_dtype)
+
+                # condition encoder
+                if image is not None and mask is not None:
+                    masked_image = (1 - mask) * image
+                    im_x = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(self.condition_encoder),
+                        masked_image,
+                        mask,
+                    )
+
+                # up
+                for up_block in self.up_blocks:
+                    if image is not None and mask is not None:
+                        sample_ = im_x[str(tuple(sample.shape))]
+                        mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest")
+                        sample = sample * mask_ + sample_ * (1 - mask_)
+                    sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
+                if image is not None and mask is not None:
+                    sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask)
+        else:
+            # middle
+            sample = self.mid_block(sample, latent_embeds)
+            sample = sample.to(upscale_dtype)
+
+            # condition encoder
+            if image is not None and mask is not None:
+                masked_image = (1 - mask) * image
+                im_x = self.condition_encoder(masked_image, mask)
+
+            # up
+            for up_block in self.up_blocks:
+                if image is not None and mask is not None:
+                    sample_ = im_x[str(tuple(sample.shape))]
+                    mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest")
+                    sample = sample * mask_ + sample_ * (1 - mask_)
+                sample = up_block(sample, latent_embeds)
+            if image is not None and mask is not None:
+                sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask)
+
+        # post-process
+        if latent_embeds is None:
+            sample = self.conv_norm_out(sample)
+        else:
+            sample = self.conv_norm_out(sample, latent_embeds)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class VectorQuantizer(nn.Module):
+    """
+    Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix
+    multiplications and allows for post-hoc remapping of indices.
+    """
+
+    # NOTE: due to a bug the beta term was applied to the wrong term. for
+    # backwards compatibility we use the buggy version by default, but you can
+    # specify legacy=False to fix it.
+    def __init__(
+        self,
+        n_e: int,
+        vq_embed_dim: int,
+        beta: float,
+        remap=None,
+        unknown_index: str = "random",
+        sane_index_shape: bool = False,
+        legacy: bool = True,
+    ):
+        super().__init__()
+        self.n_e = n_e
+        self.vq_embed_dim = vq_embed_dim
+        self.beta = beta
+        self.legacy = legacy
+
+        self.embedding = nn.Embedding(self.n_e, self.vq_embed_dim)
+        self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+
+        self.remap = remap
+        if self.remap is not None:
+            self.register_buffer("used", torch.tensor(np.load(self.remap)))
+            self.used: torch.Tensor
+            self.re_embed = self.used.shape[0]
+            self.unknown_index = unknown_index  # "random" or "extra" or integer
+            if self.unknown_index == "extra":
+                self.unknown_index = self.re_embed
+                self.re_embed = self.re_embed + 1
+            print(
+                f"Remapping {self.n_e} indices to {self.re_embed} indices. "
+                f"Using {self.unknown_index} for unknown indices."
+            )
+        else:
+            self.re_embed = n_e
+
+        self.sane_index_shape = sane_index_shape
+
+    def remap_to_used(self, inds: torch.LongTensor) -> torch.LongTensor:
+        ishape = inds.shape
+        assert len(ishape) > 1
+        inds = inds.reshape(ishape[0], -1)
+        used = self.used.to(inds)
+        match = (inds[:, :, None] == used[None, None, ...]).long()
+        new = match.argmax(-1)
+        unknown = match.sum(2) < 1
+        if self.unknown_index == "random":
+            new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device)
+        else:
+            new[unknown] = self.unknown_index
+        return new.reshape(ishape)
+
+    def unmap_to_all(self, inds: torch.LongTensor) -> torch.LongTensor:
+        ishape = inds.shape
+        assert len(ishape) > 1
+        inds = inds.reshape(ishape[0], -1)
+        used = self.used.to(inds)
+        if self.re_embed > self.used.shape[0]:  # extra token
+            inds[inds >= self.used.shape[0]] = 0  # simply set to zero
+        back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
+        return back.reshape(ishape)
+
+    def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor, Tuple]:
+        # reshape z -> (batch, height, width, channel) and flatten
+        z = z.permute(0, 2, 3, 1).contiguous()
+        z_flattened = z.view(-1, self.vq_embed_dim)
+
+        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+        min_encoding_indices = torch.argmin(torch.cdist(z_flattened, self.embedding.weight), dim=1)
+
+        z_q = self.embedding(min_encoding_indices).view(z.shape)
+        perplexity = None
+        min_encodings = None
+
+        # compute loss for embedding
+        if not self.legacy:
+            loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean((z_q - z.detach()) ** 2)
+        else:
+            loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
+
+        # preserve gradients
+        z_q: torch.FloatTensor = z + (z_q - z).detach()
+
+        # reshape back to match original input shape
+        z_q = z_q.permute(0, 3, 1, 2).contiguous()
+
+        if self.remap is not None:
+            min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1)  # add batch axis
+            min_encoding_indices = self.remap_to_used(min_encoding_indices)
+            min_encoding_indices = min_encoding_indices.reshape(-1, 1)  # flatten
+
+        if self.sane_index_shape:
+            min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3])
+
+        return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
+
+    def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.FloatTensor:
+        # shape specifying (batch, height, width, channel)
+        if self.remap is not None:
+            indices = indices.reshape(shape[0], -1)  # add batch axis
+            indices = self.unmap_to_all(indices)
+            indices = indices.reshape(-1)  # flatten again
+
+        # get quantized latent vectors
+        z_q: torch.FloatTensor = self.embedding(indices)
+
+        if shape is not None:
+            z_q = z_q.view(shape)
+            # reshape back to match original input shape
+            z_q = z_q.permute(0, 3, 1, 2).contiguous()
+
+        return z_q
+
+
+class DiagonalGaussianDistribution(object):
+    def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            self.var = self.std = torch.zeros_like(
+                self.mean, device=self.parameters.device, dtype=self.parameters.dtype
+            )
+
+    def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
+        # make sure sample is on the same device as the parameters and has same dtype
+        sample = randn_tensor(
+            self.mean.shape,
+            generator=generator,
+            device=self.parameters.device,
+            dtype=self.parameters.dtype,
+        )
+        x = self.mean + self.std * sample
+        return x
+
+    def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor:
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        else:
+            if other is None:
+                return 0.5 * torch.sum(
+                    torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
+                    dim=[1, 2, 3],
+                )
+            else:
+                return 0.5 * torch.sum(
+                    torch.pow(self.mean - other.mean, 2) / other.var
+                    + self.var / other.var
+                    - 1.0
+                    - self.logvar
+                    + other.logvar,
+                    dim=[1, 2, 3],
+                )
+
+    def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor:
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        logtwopi = np.log(2.0 * np.pi)
+        return 0.5 * torch.sum(
+            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims,
+        )
+
+    def mode(self) -> torch.Tensor:
+        return self.mean
+
+
+class EncoderTiny(nn.Module):
+    r"""
+    The `EncoderTiny` layer is a simpler version of the `Encoder` layer.
+
+    Args:
+        in_channels (`int`):
+            The number of input channels.
+        out_channels (`int`):
+            The number of output channels.
+        num_blocks (`Tuple[int, ...]`):
+            Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to
+            use.
+        block_out_channels (`Tuple[int, ...]`):
+            The number of output channels for each block.
+        act_fn (`str`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_blocks: Tuple[int, ...],
+        block_out_channels: Tuple[int, ...],
+        act_fn: str,
+    ):
+        super().__init__()
+
+        layers = []
+        for i, num_block in enumerate(num_blocks):
+            num_channels = block_out_channels[i]
+
+            if i == 0:
+                layers.append(nn.Conv2d(in_channels, num_channels, kernel_size=3, padding=1))
+            else:
+                layers.append(
+                    nn.Conv2d(
+                        num_channels,
+                        num_channels,
+                        kernel_size=3,
+                        padding=1,
+                        stride=2,
+                        bias=False,
+                    )
+                )
+
+            for _ in range(num_block):
+                layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn))
+
+        layers.append(nn.Conv2d(block_out_channels[-1], out_channels, kernel_size=3, padding=1))
+
+        self.layers = nn.Sequential(*layers)
+        self.gradient_checkpointing = False
+
+    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        r"""The forward method of the `EncoderTiny` class."""
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            if is_torch_version(">=", "1.11.0"):
+                x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False)
+            else:
+                x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x)
+
+        else:
+            # scale image from [-1, 1] to [0, 1] to match TAESD convention
+            x = self.layers(x.add(1).div(2))
+
+        return x
+
+
+class DecoderTiny(nn.Module):
+    r"""
+    The `DecoderTiny` layer is a simpler version of the `Decoder` layer.
+
+    Args:
+        in_channels (`int`):
+            The number of input channels.
+        out_channels (`int`):
+            The number of output channels.
+        num_blocks (`Tuple[int, ...]`):
+            Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to
+            use.
+        block_out_channels (`Tuple[int, ...]`):
+            The number of output channels for each block.
+        upsampling_scaling_factor (`int`):
+            The scaling factor to use for upsampling.
+        act_fn (`str`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_blocks: Tuple[int, ...],
+        block_out_channels: Tuple[int, ...],
+        upsampling_scaling_factor: int,
+        act_fn: str,
+    ):
+        super().__init__()
+
+        layers = [
+            nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=1),
+            get_activation(act_fn),
+        ]
+
+        for i, num_block in enumerate(num_blocks):
+            is_final_block = i == (len(num_blocks) - 1)
+            num_channels = block_out_channels[i]
+
+            for _ in range(num_block):
+                layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn))
+
+            if not is_final_block:
+                layers.append(nn.Upsample(scale_factor=upsampling_scaling_factor))
+
+            conv_out_channel = num_channels if not is_final_block else out_channels
+            layers.append(
+                nn.Conv2d(
+                    num_channels,
+                    conv_out_channel,
+                    kernel_size=3,
+                    padding=1,
+                    bias=is_final_block,
+                )
+            )
+
+        self.layers = nn.Sequential(*layers)
+        self.gradient_checkpointing = False
+
+    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        r"""The forward method of the `DecoderTiny` class."""
+        # Clamp.
+        x = torch.tanh(x / 3) * 3
+
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            if is_torch_version(">=", "1.11.0"):
+                x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False)
+            else:
+                x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x)
+
+        else:
+            x = self.layers(x)
+
+        # scale image from [0, 1] to [-1, 1] to match diffusers convention
+        return x.mul(2).sub(1)
diff --git a/diffusers/src/diffusers/models/autoencoders/vq_model.py b/diffusers/src/diffusers/models/autoencoders/vq_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae8a118d719a5c46f19e31cc5b65fd5e215a90d0
--- /dev/null
+++ b/diffusers/src/diffusers/models/autoencoders/vq_model.py
@@ -0,0 +1,182 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ...utils.accelerate_utils import apply_forward_hook
+from ..autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer
+from ..modeling_utils import ModelMixin
+
+
+@dataclass
+class VQEncoderOutput(BaseOutput):
+    """
+    Output of VQModel encoding method.
+
+    Args:
+        latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
+            The encoded output sample from the last layer of the model.
+    """
+
+    latents: torch.Tensor
+
+
+class VQModel(ModelMixin, ConfigMixin):
+    r"""
+    A VQ-VAE model for decoding latent representations.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            Tuple of upsample block types.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+            Tuple of block output channels.
+        layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
+        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
+        num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
+        norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers.
+        vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
+        scaling_factor (`float`, *optional*, defaults to `0.18215`):
+            The component-wise standard deviation of the trained latent space computed using the first batch of the
+            training set. This is used to scale the latent space to have unit variance when training the diffusion
+            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+        norm_type (`str`, *optional*, defaults to `"group"`):
+            Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 1,
+        act_fn: str = "silu",
+        latent_channels: int = 3,
+        sample_size: int = 32,
+        num_vq_embeddings: int = 256,
+        norm_num_groups: int = 32,
+        vq_embed_dim: Optional[int] = None,
+        scaling_factor: float = 0.18215,
+        norm_type: str = "group",  # group, spatial
+        mid_block_add_attention=True,
+        lookup_from_codebook=False,
+        force_upcast=False,
+    ):
+        super().__init__()
+
+        # pass init params to Encoder
+        self.encoder = Encoder(
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            down_block_types=down_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            double_z=False,
+            mid_block_add_attention=mid_block_add_attention,
+        )
+
+        vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
+
+        self.quant_conv = nn.Conv2d(latent_channels, vq_embed_dim, 1)
+        self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
+        self.post_quant_conv = nn.Conv2d(vq_embed_dim, latent_channels, 1)
+
+        # pass init params to Decoder
+        self.decoder = Decoder(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            norm_type=norm_type,
+            mid_block_add_attention=mid_block_add_attention,
+        )
+
+    @apply_forward_hook
+    def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput:
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+
+        if not return_dict:
+            return (h,)
+
+        return VQEncoderOutput(latents=h)
+
+    @apply_forward_hook
+    def decode(
+        self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None
+    ) -> Union[DecoderOutput, torch.Tensor]:
+        # also go through quantization layer
+        if not force_not_quantize:
+            quant, commit_loss, _ = self.quantize(h)
+        elif self.config.lookup_from_codebook:
+            quant = self.quantize.get_codebook_entry(h, shape)
+            commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
+        else:
+            quant = h
+            commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
+        quant2 = self.post_quant_conv(quant)
+        dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None)
+
+        if not return_dict:
+            return dec, commit_loss
+
+        return DecoderOutput(sample=dec, commit_loss=commit_loss)
+
+    def forward(
+        self, sample: torch.Tensor, return_dict: bool = True
+    ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]:
+        r"""
+        The [`VQModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`): Input sample.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.autoencoders.vq_model.VQEncoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.autoencoders.vq_model.VQEncoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.autoencoders.vq_model.VQEncoderOutput`] is returned, otherwise a
+                plain `tuple` is returned.
+        """
+
+        h = self.encode(sample).latents
+        dec = self.decode(h)
+
+        if not return_dict:
+            return dec.sample, dec.commit_loss
+        return dec
diff --git a/diffusers/src/diffusers/models/colorguider_sd.py b/diffusers/src/diffusers/models/colorguider_sd.py
new file mode 100644
index 0000000000000000000000000000000000000000..52b2b80dc33cbfd43967bad4e8cc26f2a107a430
--- /dev/null
+++ b/diffusers/src/diffusers/models/colorguider_sd.py
@@ -0,0 +1,937 @@
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from .embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
+from .modeling_utils import ModelMixin
+from .unets.unet_2d_blocks import (
+    CrossAttnDownBlock2D,
+    DownBlock2D,
+    UNetMidBlock2D,
+    get_down_block,
+    get_mid_block,
+    get_up_block,
+    MidBlock2D
+)
+
+from .unets.unet_2d_condition import UNet2DConditionModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class ColorGuiderOutput(BaseOutput):
+    """
+    The output of [`ColorGuiderModel`].
+
+    Args:
+        up_block_res_samples (`tuple[torch.Tensor]`):
+            A tuple of upsample activations at different resolutions for each upsampling block. Each tensor should
+            be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
+            used to condition the original UNet's upsampling activations.
+        down_block_res_samples (`tuple[torch.Tensor]`):
+            A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
+            be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
+            used to condition the original UNet's downsampling activations.
+        mid_down_block_re_sample (`torch.Tensor`):
+            The activation of the midde block (the lowest sample resolution). Each tensor should be of shape
+            `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
+            Output can be used to condition the original UNet's middle block activation.
+    """
+
+    up_block_res_samples: Tuple[torch.Tensor]
+    down_block_res_samples: Tuple[torch.Tensor]
+    mid_block_res_sample: torch.Tensor
+
+
+class ColorGuiderSDModel(ModelMixin, ConfigMixin):
+    """
+    A ColorGuider model.
+
+    Args:
+        in_channels (`int`, defaults to 4):
+            The number of channels in the input sample.
+        flip_sin_to_cos (`bool`, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, defaults to 0):
+            The frequency shift to apply to the time embedding.
+        down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
+            `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
+            The tuple of upsample blocks to use.
+        only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
+        block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, defaults to 2):
+            The number of layers per block.
+        downsample_padding (`int`, defaults to 1):
+            The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, defaults to 1):
+            The scale factor to use for the mid block.
+        act_fn (`str`, defaults to "silu"):
+            The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use for the normalization. If None, normalization and activation layers is skipped
+            in post-processing.
+        norm_eps (`float`, defaults to 1e-5):
+            The epsilon to use for the normalization.
+        cross_attention_dim (`int`, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+        attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
+            The dimension of the attention heads.
+        use_linear_projection (`bool`, defaults to `False`):
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to `None`):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+            "text". "text" will use the `TextTimeEmbedding` layer.
+        num_class_embeds (`int`, *optional*, defaults to 0):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        upcast_attention (`bool`, defaults to `False`):
+        resnet_time_scale_shift (`str`, defaults to `"default"`):
+            Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
+        projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
+            The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
+            `class_embed_type="projection"`.
+        brushnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
+            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
+        conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
+            The tuple of output channel for each block in the `conditioning_embedding` layer.
+        global_pool_conditions (`bool`, defaults to `False`):
+            TODO(Patrick) - unused parameter.
+        addition_embed_type_num_heads (`int`, defaults to 64):
+            The number of heads to use for the `TextTimeEmbedding` layer.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 4,
+        conditioning_channels: int = 5,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str, ...] = (
+            "SelfAttnDownBlock2D",
+            "SelfAttnDownBlock2D",
+            "SelfAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DSelfAttn",
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D", 
+            "SelfAttnUpBlock2D", 
+            "SelfAttnUpBlock2D", 
+            "SelfAttnUpBlock2D",
+        ),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int, ...]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        brushnet_conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
+        global_pool_conditions: bool = False,
+        addition_embed_type_num_heads: int = 64,
+    ):
+        super().__init__()
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+
+        # input
+        conv_in_kernel = 3
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in_condition = nn.Conv2d(
+            in_channels+conditioning_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        # time
+        time_embed_dim = block_out_channels[0] * 4
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+        )
+
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+
+        if encoder_hid_dim_type == "text_proj":
+            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
+        else:
+            self.encoder_hid_proj = None
+
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+
+        if addition_embed_type == "text":
+            if encoder_hid_dim is not None:
+                text_time_embedding_from_dim = encoder_hid_dim
+            else:
+                text_time_embedding_from_dim = cross_attention_dim
+
+            self.add_embedding = TextTimeEmbedding(
+                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+            )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type == "text_time":
+            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
+            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+
+        elif addition_embed_type is not None:
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
+
+        self.down_blocks = nn.ModuleList([])
+        self.brushnet_down_blocks = nn.ModuleList([])
+
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+
+        brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+        brushnet_block = zero_module(brushnet_block)
+        self.brushnet_down_blocks.append(brushnet_block)
+
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                transformer_layers_per_block=transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=num_attention_heads[i],
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                downsample_padding=downsample_padding,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+            )
+            self.down_blocks.append(down_block)
+
+            for _ in range(layers_per_block):
+                brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+                brushnet_block = zero_module(brushnet_block)
+                self.brushnet_down_blocks.append(brushnet_block)
+
+            if not is_final_block:
+                brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+                brushnet_block = zero_module(brushnet_block)
+                self.brushnet_down_blocks.append(brushnet_block)
+
+        # mid
+        mid_block_channel = block_out_channels[-1]
+
+        brushnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1)
+        brushnet_block = zero_module(brushnet_block)
+        self.brushnet_mid_block = brushnet_block
+
+        self.mid_block = get_mid_block(
+                mid_block_type,
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                in_channels=mid_block_channel,
+                temb_channels=time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+        )
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        reversed_transformer_layers_per_block = (list(reversed(transformer_layers_per_block)))
+        only_cross_attention = list(reversed(only_cross_attention))
+
+        output_channel = reversed_block_out_channels[0]
+        
+        self.up_blocks = nn.ModuleList([])
+        self.brushnet_up_blocks = nn.ModuleList([])
+
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block+1,
+                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resolution_idx=i,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=reversed_num_attention_heads[i],
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+            
+            for _ in range(layers_per_block+1):
+                brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+                brushnet_block = zero_module(brushnet_block)
+                self.brushnet_up_blocks.append(brushnet_block)
+
+            if not is_final_block:
+                brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+                brushnet_block = zero_module(brushnet_block)
+                self.brushnet_up_blocks.append(brushnet_block)
+
+
+    @classmethod
+    def from_unet(
+        cls,
+        unet: UNet2DConditionModel,
+        brushnet_conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
+        load_weights_from_unet: bool = True,
+        conditioning_channels: int = 5,
+    ):
+        r"""
+        Instantiate a [`BrushNetModel`] from [`UNet2DConditionModel`].
+
+        Parameters:
+            unet (`UNet2DConditionModel`):
+                The UNet model weights to copy to the [`BrushNetModel`]. All configuration options are also copied
+                where applicable.
+        """
+        transformer_layers_per_block = (
+            unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
+        )
+        encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None
+        encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None
+        addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None
+        addition_time_embed_dim = (
+            unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
+        )
+
+        brushnet = cls(
+            in_channels=unet.config.in_channels,
+            conditioning_channels=conditioning_channels,
+            flip_sin_to_cos=unet.config.flip_sin_to_cos,
+            freq_shift=unet.config.freq_shift,
+            down_block_types=["SelfAttnDownBlock2D",
+                            "SelfAttnDownBlock2D",
+                            "SelfAttnDownBlock2D",
+                            "DownBlock2D",],
+            mid_block_type='UNetMidBlock2DSelfAttn',
+            up_block_types=["UpBlock2D", 
+                            "SelfAttnUpBlock2D", 
+                            "SelfAttnUpBlock2D", 
+                            "SelfAttnUpBlock2D",],
+            only_cross_attention=unet.config.only_cross_attention,
+            block_out_channels=unet.config.block_out_channels,
+            layers_per_block=unet.config.layers_per_block,
+            downsample_padding=unet.config.downsample_padding,
+            mid_block_scale_factor=unet.config.mid_block_scale_factor,
+            act_fn=unet.config.act_fn,
+            norm_num_groups=unet.config.norm_num_groups,
+            norm_eps=unet.config.norm_eps,
+            cross_attention_dim=unet.config.cross_attention_dim,
+            transformer_layers_per_block=transformer_layers_per_block,
+            encoder_hid_dim=encoder_hid_dim,
+            encoder_hid_dim_type=encoder_hid_dim_type,
+            attention_head_dim=unet.config.attention_head_dim,
+            num_attention_heads=unet.config.num_attention_heads,
+            use_linear_projection=unet.config.use_linear_projection,
+            class_embed_type=unet.config.class_embed_type,
+            addition_embed_type=addition_embed_type,
+            addition_time_embed_dim=addition_time_embed_dim,
+            num_class_embeds=unet.config.num_class_embeds,
+            upcast_attention=unet.config.upcast_attention,
+            resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
+            projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
+            brushnet_conditioning_channel_order=brushnet_conditioning_channel_order,
+            conditioning_embedding_out_channels=conditioning_embedding_out_channels,
+        )
+
+        if load_weights_from_unet:
+            conv_in_condition_weight=torch.zeros_like(brushnet.conv_in_condition.weight)
+            conv_in_condition_weight[:,:4,...]=unet.conv_in.weight
+            conv_in_condition_weight[:,4:8,...]=unet.conv_in.weight
+            brushnet.conv_in_condition.weight=torch.nn.Parameter(conv_in_condition_weight)
+            brushnet.conv_in_condition.bias=unet.conv_in.bias
+
+            brushnet.time_proj.load_state_dict(unet.time_proj.state_dict())
+            brushnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())
+
+            if brushnet.class_embedding:
+                brushnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())
+
+            brushnet.down_blocks.load_state_dict(unet.down_blocks.state_dict(),strict=False)
+            brushnet.mid_block.load_state_dict(unet.mid_block.state_dict(),strict=False)
+            brushnet.up_blocks.load_state_dict(unet.up_blocks.state_dict(),strict=False)
+
+        return brushnet
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
+    def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+
+        num_sliceable_layers = len(sliceable_head_dims)
+
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
+        if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        colorguider_cond: torch.FloatTensor,
+        conditioning_scale: float = 1.0,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guess_mode: bool = False,
+        return_dict: bool = True,
+    ) -> Union[ColorGuiderOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]:
+        """
+        The [`BrushNetModel`] forward method.
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The noisy input tensor.
+            timestep (`Union[torch.Tensor, float, int]`):
+                The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states.
+            brushnet_cond (`torch.FloatTensor`):
+                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
+            conditioning_scale (`float`, defaults to `1.0`):
+                The scale factor for BrushNet outputs.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
+                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
+                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
+                embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            added_cond_kwargs (`dict`):
+                Additional conditions for the Stable Diffusion XL UNet.
+            cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
+                A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
+            guess_mode (`bool`, defaults to `False`):
+                In this mode, the BrushNet encoder tries its best to recognize the input content of the input even if
+                you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
+            return_dict (`bool`, defaults to `True`):
+                Whether or not to return a [`~models.brushnet.ColorGuiderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.brushnet.ColorGuiderOutput`] **or** `tuple`:
+                If `return_dict` is `True`, a [`~models.brushnet.ColorGuiderOutput`] is returned, otherwise a tuple is
+                returned where the first element is the sample tensor.
+        """
+        brushnet_cond = colorguider_cond
+        # check channel order
+        channel_order = self.config.brushnet_conditioning_channel_order
+
+        if channel_order == "rgb":
+            # in rgb order by default
+            ...
+        elif channel_order == "bgr":
+            brushnet_cond = torch.flip(brushnet_cond, dims=[1])
+        else:
+            raise ValueError(f"unknown `brushnet_conditioning_channel_order`: {channel_order}")
+
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+            emb = emb + class_emb
+
+        if self.config.addition_embed_type is not None:
+            if self.config.addition_embed_type == "text":
+                aug_emb = self.add_embedding(encoder_hidden_states)
+
+            elif self.config.addition_embed_type == "text_time":
+                if "text_embeds" not in added_cond_kwargs:
+                    raise ValueError(
+                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                    )
+                text_embeds = added_cond_kwargs.get("text_embeds")
+                if "time_ids" not in added_cond_kwargs:
+                    raise ValueError(
+                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                    )
+                time_ids = added_cond_kwargs.get("time_ids")
+                time_embeds = self.add_time_proj(time_ids.flatten())
+                time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+
+                add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+                add_embeds = add_embeds.to(emb.dtype)
+                aug_emb = self.add_embedding(add_embeds)
+
+        emb = emb + aug_emb if aug_emb is not None else emb
+
+        # 2. pre-process
+        brushnet_cond=torch.concat([sample,brushnet_cond],1)
+        sample = self.conv_in_condition(brushnet_cond)
+
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+
+        # 4. PaintingNet down blocks
+        brushnet_down_block_res_samples = ()
+        for down_block_res_sample, brushnet_down_block in zip(down_block_res_samples, self.brushnet_down_blocks):
+            down_block_res_sample = brushnet_down_block(down_block_res_sample)
+            brushnet_down_block_res_samples = brushnet_down_block_res_samples + (down_block_res_sample,)
+
+
+        # 5. mid
+        if self.mid_block is not None:
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = self.mid_block(sample, emb)
+
+        # 6. BrushNet mid blocks
+        brushnet_mid_block_res_sample = self.brushnet_mid_block(sample)
+
+
+        # 7. up
+        up_block_res_samples = ()
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample, up_res_samples = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    return_res_samples=True
+                )
+            else:
+                sample, up_res_samples = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    return_res_samples=True
+                )
+
+            up_block_res_samples += up_res_samples
+
+        # 8. BrushNet up blocks
+        brushnet_up_block_res_samples = ()
+        for up_block_res_sample, brushnet_up_block in zip(up_block_res_samples, self.brushnet_up_blocks):
+            up_block_res_sample = brushnet_up_block(up_block_res_sample)
+            brushnet_up_block_res_samples = brushnet_up_block_res_samples + (up_block_res_sample,)
+
+        # 6. scaling
+        if guess_mode and not self.config.global_pool_conditions:
+            scales = torch.logspace(-1, 0, len(brushnet_down_block_res_samples) + 1 + len(brushnet_up_block_res_samples), device=sample.device)  # 0.1 to 1.0
+            scales = scales * conditioning_scale
+
+            brushnet_down_block_res_samples = [sample * scale for sample, scale in zip(brushnet_down_block_res_samples, scales[:len(brushnet_down_block_res_samples)])]
+            brushnet_mid_block_res_sample = brushnet_mid_block_res_sample * scales[len(brushnet_down_block_res_samples)]
+            brushnet_up_block_res_samples = [sample * scale for sample, scale in zip(brushnet_up_block_res_samples, scales[len(brushnet_down_block_res_samples)+1:])]
+        else:
+            brushnet_down_block_res_samples = [sample * conditioning_scale for sample in brushnet_down_block_res_samples]
+            brushnet_mid_block_res_sample = brushnet_mid_block_res_sample * conditioning_scale
+            brushnet_up_block_res_samples = [sample * conditioning_scale for sample in brushnet_up_block_res_samples]
+
+
+        if self.config.global_pool_conditions:
+            brushnet_down_block_res_samples = [
+                torch.mean(sample, dim=(2, 3), keepdim=True) for sample in brushnet_down_block_res_samples
+            ]
+            brushnet_mid_block_res_sample = torch.mean(brushnet_mid_block_res_sample, dim=(2, 3), keepdim=True)
+            brushnet_up_block_res_samples = [
+                torch.mean(sample, dim=(2, 3), keepdim=True) for sample in brushnet_up_block_res_samples
+            ]
+
+        if not return_dict:
+            return (brushnet_down_block_res_samples, brushnet_mid_block_res_sample, brushnet_up_block_res_samples)
+
+        return ColorGuiderOutput(
+            down_block_res_samples=brushnet_down_block_res_samples, 
+            mid_block_res_sample=brushnet_mid_block_res_sample,
+            up_block_res_samples=brushnet_up_block_res_samples
+        )
+
+
+def zero_module(module):
+    for p in module.parameters():
+        nn.init.zeros_(p)
+    return module
diff --git a/diffusers/src/diffusers/models/controlnet.py b/diffusers/src/diffusers/models/controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3ae96605077678ec6b17706c0d1fed0727de391
--- /dev/null
+++ b/diffusers/src/diffusers/models/controlnet.py
@@ -0,0 +1,870 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..loaders.single_file_model import FromOriginalModelMixin
+from ..utils import BaseOutput, logging
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from .embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
+from .modeling_utils import ModelMixin
+from .unets.unet_2d_blocks import (
+    CrossAttnDownBlock2D,
+    DownBlock2D,
+    UNetMidBlock2D,
+    UNetMidBlock2DCrossAttn,
+    get_down_block,
+)
+from .unets.unet_2d_condition import UNet2DConditionModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class ControlNetOutput(BaseOutput):
+    """
+    The output of [`ControlNetModel`].
+
+    Args:
+        down_block_res_samples (`tuple[torch.Tensor]`):
+            A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
+            be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
+            used to condition the original UNet's downsampling activations.
+        mid_down_block_re_sample (`torch.Tensor`):
+            The activation of the middle block (the lowest sample resolution). Each tensor should be of shape
+            `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
+            Output can be used to condition the original UNet's middle block activation.
+    """
+
+    down_block_res_samples: Tuple[torch.Tensor]
+    mid_block_res_sample: torch.Tensor
+
+
+class ControlNetConditioningEmbedding(nn.Module):
+    """
+    Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
+    [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
+    training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
+    convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
+    (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
+    model) to encode image-space conditions ... into feature maps ..."
+    """
+
+    def __init__(
+        self,
+        conditioning_embedding_channels: int,
+        conditioning_channels: int = 3,
+        block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
+    ):
+        super().__init__()
+
+        self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
+
+        self.blocks = nn.ModuleList([])
+
+        for i in range(len(block_out_channels) - 1):
+            channel_in = block_out_channels[i]
+            channel_out = block_out_channels[i + 1]
+            self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
+            self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
+
+        self.conv_out = zero_module(
+            nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
+        )
+
+    def forward(self, conditioning):
+        embedding = self.conv_in(conditioning)
+        embedding = F.silu(embedding)
+
+        for block in self.blocks:
+            embedding = block(embedding)
+            embedding = F.silu(embedding)
+
+        embedding = self.conv_out(embedding)
+
+        return embedding
+
+
+class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+    """
+    A ControlNet model.
+
+    Args:
+        in_channels (`int`, defaults to 4):
+            The number of channels in the input sample.
+        flip_sin_to_cos (`bool`, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, defaults to 0):
+            The frequency shift to apply to the time embedding.
+        down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
+        block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, defaults to 2):
+            The number of layers per block.
+        downsample_padding (`int`, defaults to 1):
+            The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, defaults to 1):
+            The scale factor to use for the mid block.
+        act_fn (`str`, defaults to "silu"):
+            The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use for the normalization. If None, normalization and activation layers is skipped
+            in post-processing.
+        norm_eps (`float`, defaults to 1e-5):
+            The epsilon to use for the normalization.
+        cross_attention_dim (`int`, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+        attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
+            The dimension of the attention heads.
+        use_linear_projection (`bool`, defaults to `False`):
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to `None`):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+            "text". "text" will use the `TextTimeEmbedding` layer.
+        num_class_embeds (`int`, *optional*, defaults to 0):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        upcast_attention (`bool`, defaults to `False`):
+        resnet_time_scale_shift (`str`, defaults to `"default"`):
+            Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
+        projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
+            The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
+            `class_embed_type="projection"`.
+        controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
+            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
+        conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
+            The tuple of output channel for each block in the `conditioning_embedding` layer.
+        global_pool_conditions (`bool`, defaults to `False`):
+            TODO(Patrick) - unused parameter.
+        addition_embed_type_num_heads (`int`, defaults to 64):
+            The number of heads to use for the `TextTimeEmbedding` layer.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 4,
+        conditioning_channels: int = 3,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str, ...] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int, ...]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        controlnet_conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
+        global_pool_conditions: bool = False,
+        addition_embed_type_num_heads: int = 64,
+    ):
+        super().__init__()
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
+        # Check inputs
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+
+        # input
+        conv_in_kernel = 3
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        # time
+        time_embed_dim = block_out_channels[0] * 4
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+        )
+
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+
+        if encoder_hid_dim_type == "text_proj":
+            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
+        else:
+            self.encoder_hid_proj = None
+
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+
+        if addition_embed_type == "text":
+            if encoder_hid_dim is not None:
+                text_time_embedding_from_dim = encoder_hid_dim
+            else:
+                text_time_embedding_from_dim = cross_attention_dim
+
+            self.add_embedding = TextTimeEmbedding(
+                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+            )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type == "text_time":
+            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
+            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+
+        elif addition_embed_type is not None:
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
+
+        # control net conditioning embedding
+        self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
+            conditioning_embedding_channels=block_out_channels[0],
+            block_out_channels=conditioning_embedding_out_channels,
+            conditioning_channels=conditioning_channels,
+        )
+
+        self.down_blocks = nn.ModuleList([])
+        self.controlnet_down_blocks = nn.ModuleList([])
+
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+
+        controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+        controlnet_block = zero_module(controlnet_block)
+        self.controlnet_down_blocks.append(controlnet_block)
+
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                transformer_layers_per_block=transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=num_attention_heads[i],
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                downsample_padding=downsample_padding,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+            )
+            self.down_blocks.append(down_block)
+
+            for _ in range(layers_per_block):
+                controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+                controlnet_block = zero_module(controlnet_block)
+                self.controlnet_down_blocks.append(controlnet_block)
+
+            if not is_final_block:
+                controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+                controlnet_block = zero_module(controlnet_block)
+                self.controlnet_down_blocks.append(controlnet_block)
+
+        # mid
+        mid_block_channel = block_out_channels[-1]
+
+        controlnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1)
+        controlnet_block = zero_module(controlnet_block)
+        self.controlnet_mid_block = controlnet_block
+
+        if mid_block_type == "UNetMidBlock2DCrossAttn":
+            self.mid_block = UNetMidBlock2DCrossAttn(
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                in_channels=mid_block_channel,
+                temb_channels=time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+            )
+        elif mid_block_type == "UNetMidBlock2D":
+            self.mid_block = UNetMidBlock2D(
+                in_channels=block_out_channels[-1],
+                temb_channels=time_embed_dim,
+                num_layers=0,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                add_attention=False,
+            )
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+
+    @classmethod
+    def from_unet(
+        cls,
+        unet: UNet2DConditionModel,
+        controlnet_conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
+        load_weights_from_unet: bool = True,
+        conditioning_channels: int = 3,
+    ):
+        r"""
+        Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`].
+
+        Parameters:
+            unet (`UNet2DConditionModel`):
+                The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied
+                where applicable.
+        """
+        transformer_layers_per_block = (
+            unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
+        )
+        encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None
+        encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None
+        addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None
+        addition_time_embed_dim = (
+            unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
+        )
+
+        controlnet = cls(
+            encoder_hid_dim=encoder_hid_dim,
+            encoder_hid_dim_type=encoder_hid_dim_type,
+            addition_embed_type=addition_embed_type,
+            addition_time_embed_dim=addition_time_embed_dim,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=unet.config.in_channels,
+            flip_sin_to_cos=unet.config.flip_sin_to_cos,
+            freq_shift=unet.config.freq_shift,
+            down_block_types=unet.config.down_block_types,
+            only_cross_attention=unet.config.only_cross_attention,
+            block_out_channels=unet.config.block_out_channels,
+            layers_per_block=unet.config.layers_per_block,
+            downsample_padding=unet.config.downsample_padding,
+            mid_block_scale_factor=unet.config.mid_block_scale_factor,
+            act_fn=unet.config.act_fn,
+            norm_num_groups=unet.config.norm_num_groups,
+            norm_eps=unet.config.norm_eps,
+            cross_attention_dim=unet.config.cross_attention_dim,
+            attention_head_dim=unet.config.attention_head_dim,
+            num_attention_heads=unet.config.num_attention_heads,
+            use_linear_projection=unet.config.use_linear_projection,
+            class_embed_type=unet.config.class_embed_type,
+            num_class_embeds=unet.config.num_class_embeds,
+            upcast_attention=unet.config.upcast_attention,
+            resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
+            projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
+            mid_block_type=unet.config.mid_block_type,
+            controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
+            conditioning_embedding_out_channels=conditioning_embedding_out_channels,
+            conditioning_channels=conditioning_channels,
+        )
+
+        if load_weights_from_unet:
+            controlnet.conv_in.load_state_dict(unet.conv_in.state_dict())
+            controlnet.time_proj.load_state_dict(unet.time_proj.state_dict())
+            controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())
+
+            if controlnet.class_embedding:
+                controlnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())
+
+            if hasattr(controlnet, "add_embedding"):
+                controlnet.add_embedding.load_state_dict(unet.add_embedding.state_dict())
+
+            controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict())
+            controlnet.mid_block.load_state_dict(unet.mid_block.state_dict())
+
+        return controlnet
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
+    def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+
+        num_sliceable_layers = len(sliceable_head_dims)
+
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
+        if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        controlnet_cond: torch.Tensor,
+        conditioning_scale: float = 1.0,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guess_mode: bool = False,
+        return_dict: bool = True,
+    ) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]:
+        """
+        The [`ControlNetModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor.
+            timestep (`Union[torch.Tensor, float, int]`):
+                The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states.
+            controlnet_cond (`torch.Tensor`):
+                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
+            conditioning_scale (`float`, defaults to `1.0`):
+                The scale factor for ControlNet outputs.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
+                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
+                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
+                embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            added_cond_kwargs (`dict`):
+                Additional conditions for the Stable Diffusion XL UNet.
+            cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
+                A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
+            guess_mode (`bool`, defaults to `False`):
+                In this mode, the ControlNet encoder tries its best to recognize the input content of the input even if
+                you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
+            return_dict (`bool`, defaults to `True`):
+                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
+                If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
+                returned where the first element is the sample tensor.
+        """
+        # check channel order
+        channel_order = self.config.controlnet_conditioning_channel_order
+
+        if channel_order == "rgb":
+            # in rgb order by default
+            ...
+        elif channel_order == "bgr":
+            controlnet_cond = torch.flip(controlnet_cond, dims=[1])
+        else:
+            raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
+
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+            emb = emb + class_emb
+
+        if self.config.addition_embed_type is not None:
+            if self.config.addition_embed_type == "text":
+                aug_emb = self.add_embedding(encoder_hidden_states)
+
+            elif self.config.addition_embed_type == "text_time":
+                if "text_embeds" not in added_cond_kwargs:
+                    raise ValueError(
+                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                    )
+                text_embeds = added_cond_kwargs.get("text_embeds")
+                if "time_ids" not in added_cond_kwargs:
+                    raise ValueError(
+                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                    )
+                time_ids = added_cond_kwargs.get("time_ids")
+                time_embeds = self.add_time_proj(time_ids.flatten())
+                time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+
+                add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+                add_embeds = add_embeds.to(emb.dtype)
+                aug_emb = self.add_embedding(add_embeds)
+
+        emb = emb + aug_emb if aug_emb is not None else emb
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
+        sample = sample + controlnet_cond
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = self.mid_block(sample, emb)
+
+        # 5. Control net blocks
+        controlnet_down_block_res_samples = ()
+
+        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
+            down_block_res_sample = controlnet_block(down_block_res_sample)
+            controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
+
+        down_block_res_samples = controlnet_down_block_res_samples
+
+        mid_block_res_sample = self.controlnet_mid_block(sample)
+
+        # 6. scaling
+        if guess_mode and not self.config.global_pool_conditions:
+            scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device)  # 0.1 to 1.0
+            scales = scales * conditioning_scale
+            down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
+            mid_block_res_sample = mid_block_res_sample * scales[-1]  # last one
+        else:
+            down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
+            mid_block_res_sample = mid_block_res_sample * conditioning_scale
+
+        if self.config.global_pool_conditions:
+            down_block_res_samples = [
+                torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
+            ]
+            mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)
+
+        if not return_dict:
+            return (down_block_res_samples, mid_block_res_sample)
+
+        return ControlNetOutput(
+            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
+        )
+
+
+def zero_module(module):
+    for p in module.parameters():
+        nn.init.zeros_(p)
+    return module
diff --git a/diffusers/src/diffusers/models/controlnet_flax.py b/diffusers/src/diffusers/models/controlnet_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..0540850a9e61c2856a632bd9d7728983956a2ec5
--- /dev/null
+++ b/diffusers/src/diffusers/models/controlnet_flax.py
@@ -0,0 +1,395 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+import flax
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict
+
+from ..configuration_utils import ConfigMixin, flax_register_to_config
+from ..utils import BaseOutput
+from .embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps
+from .modeling_flax_utils import FlaxModelMixin
+from .unets.unet_2d_blocks_flax import (
+    FlaxCrossAttnDownBlock2D,
+    FlaxDownBlock2D,
+    FlaxUNetMidBlock2DCrossAttn,
+)
+
+
+@flax.struct.dataclass
+class FlaxControlNetOutput(BaseOutput):
+    """
+    The output of [`FlaxControlNetModel`].
+
+    Args:
+        down_block_res_samples (`jnp.ndarray`):
+        mid_block_res_sample (`jnp.ndarray`):
+    """
+
+    down_block_res_samples: jnp.ndarray
+    mid_block_res_sample: jnp.ndarray
+
+
+class FlaxControlNetConditioningEmbedding(nn.Module):
+    conditioning_embedding_channels: int
+    block_out_channels: Tuple[int, ...] = (16, 32, 96, 256)
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self) -> None:
+        self.conv_in = nn.Conv(
+            self.block_out_channels[0],
+            kernel_size=(3, 3),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+        blocks = []
+        for i in range(len(self.block_out_channels) - 1):
+            channel_in = self.block_out_channels[i]
+            channel_out = self.block_out_channels[i + 1]
+            conv1 = nn.Conv(
+                channel_in,
+                kernel_size=(3, 3),
+                padding=((1, 1), (1, 1)),
+                dtype=self.dtype,
+            )
+            blocks.append(conv1)
+            conv2 = nn.Conv(
+                channel_out,
+                kernel_size=(3, 3),
+                strides=(2, 2),
+                padding=((1, 1), (1, 1)),
+                dtype=self.dtype,
+            )
+            blocks.append(conv2)
+        self.blocks = blocks
+
+        self.conv_out = nn.Conv(
+            self.conditioning_embedding_channels,
+            kernel_size=(3, 3),
+            padding=((1, 1), (1, 1)),
+            kernel_init=nn.initializers.zeros_init(),
+            bias_init=nn.initializers.zeros_init(),
+            dtype=self.dtype,
+        )
+
+    def __call__(self, conditioning: jnp.ndarray) -> jnp.ndarray:
+        embedding = self.conv_in(conditioning)
+        embedding = nn.silu(embedding)
+
+        for block in self.blocks:
+            embedding = block(embedding)
+            embedding = nn.silu(embedding)
+
+        embedding = self.conv_out(embedding)
+
+        return embedding
+
+
+@flax_register_to_config
+class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin):
+    r"""
+    A ControlNet model.
+
+    This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for it’s generic methods
+    implemented for all models (such as downloading or saving).
+
+    This model is also a Flax Linen [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+    subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matters related to its
+    general usage and behavior.
+
+    Inherent JAX features such as the following are supported:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        sample_size (`int`, *optional*):
+            The size of the input sample.
+        in_channels (`int`, *optional*, defaults to 4):
+            The number of channels in the input sample.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D")`):
+            The tuple of downsample blocks to use.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 8):
+            The dimension of the attention heads.
+        num_attention_heads (`int` or `Tuple[int]`, *optional*):
+            The number of attention heads.
+        cross_attention_dim (`int`, *optional*, defaults to 768):
+            The dimension of the cross attention features.
+        dropout (`float`, *optional*, defaults to 0):
+            Dropout probability for down, up and bottleneck blocks.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        controlnet_conditioning_channel_order (`str`, *optional*, defaults to `rgb`):
+            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
+        conditioning_embedding_out_channels (`tuple`, *optional*, defaults to `(16, 32, 96, 256)`):
+            The tuple of output channel for each block in the `conditioning_embedding` layer.
+    """
+
+    sample_size: int = 32
+    in_channels: int = 4
+    down_block_types: Tuple[str, ...] = (
+        "CrossAttnDownBlock2D",
+        "CrossAttnDownBlock2D",
+        "CrossAttnDownBlock2D",
+        "DownBlock2D",
+    )
+    only_cross_attention: Union[bool, Tuple[bool, ...]] = False
+    block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280)
+    layers_per_block: int = 2
+    attention_head_dim: Union[int, Tuple[int, ...]] = 8
+    num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None
+    cross_attention_dim: int = 1280
+    dropout: float = 0.0
+    use_linear_projection: bool = False
+    dtype: jnp.dtype = jnp.float32
+    flip_sin_to_cos: bool = True
+    freq_shift: int = 0
+    controlnet_conditioning_channel_order: str = "rgb"
+    conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256)
+
+    def init_weights(self, rng: jax.Array) -> FrozenDict:
+        # init input tensors
+        sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
+        sample = jnp.zeros(sample_shape, dtype=jnp.float32)
+        timesteps = jnp.ones((1,), dtype=jnp.int32)
+        encoder_hidden_states = jnp.zeros((1, 1, self.cross_attention_dim), dtype=jnp.float32)
+        controlnet_cond_shape = (1, 3, self.sample_size * 8, self.sample_size * 8)
+        controlnet_cond = jnp.zeros(controlnet_cond_shape, dtype=jnp.float32)
+
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        return self.init(rngs, sample, timesteps, encoder_hidden_states, controlnet_cond)["params"]
+
+    def setup(self) -> None:
+        block_out_channels = self.block_out_channels
+        time_embed_dim = block_out_channels[0] * 4
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = self.num_attention_heads or self.attention_head_dim
+
+        # input
+        self.conv_in = nn.Conv(
+            block_out_channels[0],
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+        # time
+        self.time_proj = FlaxTimesteps(
+            block_out_channels[0], flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.config.freq_shift
+        )
+        self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
+
+        self.controlnet_cond_embedding = FlaxControlNetConditioningEmbedding(
+            conditioning_embedding_channels=block_out_channels[0],
+            block_out_channels=self.conditioning_embedding_out_channels,
+        )
+
+        only_cross_attention = self.only_cross_attention
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = (only_cross_attention,) * len(self.down_block_types)
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(self.down_block_types)
+
+        # down
+        down_blocks = []
+        controlnet_down_blocks = []
+
+        output_channel = block_out_channels[0]
+
+        controlnet_block = nn.Conv(
+            output_channel,
+            kernel_size=(1, 1),
+            padding="VALID",
+            kernel_init=nn.initializers.zeros_init(),
+            bias_init=nn.initializers.zeros_init(),
+            dtype=self.dtype,
+        )
+        controlnet_down_blocks.append(controlnet_block)
+
+        for i, down_block_type in enumerate(self.down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            if down_block_type == "CrossAttnDownBlock2D":
+                down_block = FlaxCrossAttnDownBlock2D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    dropout=self.dropout,
+                    num_layers=self.layers_per_block,
+                    num_attention_heads=num_attention_heads[i],
+                    add_downsample=not is_final_block,
+                    use_linear_projection=self.use_linear_projection,
+                    only_cross_attention=only_cross_attention[i],
+                    dtype=self.dtype,
+                )
+            else:
+                down_block = FlaxDownBlock2D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    dropout=self.dropout,
+                    num_layers=self.layers_per_block,
+                    add_downsample=not is_final_block,
+                    dtype=self.dtype,
+                )
+
+            down_blocks.append(down_block)
+
+            for _ in range(self.layers_per_block):
+                controlnet_block = nn.Conv(
+                    output_channel,
+                    kernel_size=(1, 1),
+                    padding="VALID",
+                    kernel_init=nn.initializers.zeros_init(),
+                    bias_init=nn.initializers.zeros_init(),
+                    dtype=self.dtype,
+                )
+                controlnet_down_blocks.append(controlnet_block)
+
+            if not is_final_block:
+                controlnet_block = nn.Conv(
+                    output_channel,
+                    kernel_size=(1, 1),
+                    padding="VALID",
+                    kernel_init=nn.initializers.zeros_init(),
+                    bias_init=nn.initializers.zeros_init(),
+                    dtype=self.dtype,
+                )
+                controlnet_down_blocks.append(controlnet_block)
+
+        self.down_blocks = down_blocks
+        self.controlnet_down_blocks = controlnet_down_blocks
+
+        # mid
+        mid_block_channel = block_out_channels[-1]
+        self.mid_block = FlaxUNetMidBlock2DCrossAttn(
+            in_channels=mid_block_channel,
+            dropout=self.dropout,
+            num_attention_heads=num_attention_heads[-1],
+            use_linear_projection=self.use_linear_projection,
+            dtype=self.dtype,
+        )
+
+        self.controlnet_mid_block = nn.Conv(
+            mid_block_channel,
+            kernel_size=(1, 1),
+            padding="VALID",
+            kernel_init=nn.initializers.zeros_init(),
+            bias_init=nn.initializers.zeros_init(),
+            dtype=self.dtype,
+        )
+
+    def __call__(
+        self,
+        sample: jnp.ndarray,
+        timesteps: Union[jnp.ndarray, float, int],
+        encoder_hidden_states: jnp.ndarray,
+        controlnet_cond: jnp.ndarray,
+        conditioning_scale: float = 1.0,
+        return_dict: bool = True,
+        train: bool = False,
+    ) -> Union[FlaxControlNetOutput, Tuple[Tuple[jnp.ndarray, ...], jnp.ndarray]]:
+        r"""
+        Args:
+            sample (`jnp.ndarray`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`jnp.ndarray` or `float` or `int`): timesteps
+            encoder_hidden_states (`jnp.ndarray`): (batch_size, sequence_length, hidden_size) encoder hidden states
+            controlnet_cond (`jnp.ndarray`): (batch, channel, height, width) the conditional input tensor
+            conditioning_scale (`float`, *optional*, defaults to `1.0`): the scale factor for controlnet outputs
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of
+                a plain tuple.
+            train (`bool`, *optional*, defaults to `False`):
+                Use deterministic functions and disable dropout when not training.
+
+        Returns:
+            [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`:
+                [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise
+                a `tuple`. When returning a tuple, the first element is the sample tensor.
+        """
+        channel_order = self.controlnet_conditioning_channel_order
+        if channel_order == "bgr":
+            controlnet_cond = jnp.flip(controlnet_cond, axis=1)
+
+        # 1. time
+        if not isinstance(timesteps, jnp.ndarray):
+            timesteps = jnp.array([timesteps], dtype=jnp.int32)
+        elif isinstance(timesteps, jnp.ndarray) and len(timesteps.shape) == 0:
+            timesteps = timesteps.astype(dtype=jnp.float32)
+            timesteps = jnp.expand_dims(timesteps, 0)
+
+        t_emb = self.time_proj(timesteps)
+        t_emb = self.time_embedding(t_emb)
+
+        # 2. pre-process
+        sample = jnp.transpose(sample, (0, 2, 3, 1))
+        sample = self.conv_in(sample)
+
+        controlnet_cond = jnp.transpose(controlnet_cond, (0, 2, 3, 1))
+        controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
+        sample += controlnet_cond
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for down_block in self.down_blocks:
+            if isinstance(down_block, FlaxCrossAttnDownBlock2D):
+                sample, res_samples = down_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
+            else:
+                sample, res_samples = down_block(sample, t_emb, deterministic=not train)
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
+
+        # 5. contronet blocks
+        controlnet_down_block_res_samples = ()
+        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
+            down_block_res_sample = controlnet_block(down_block_res_sample)
+            controlnet_down_block_res_samples += (down_block_res_sample,)
+
+        down_block_res_samples = controlnet_down_block_res_samples
+
+        mid_block_res_sample = self.controlnet_mid_block(sample)
+
+        # 6. scaling
+        down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
+        mid_block_res_sample *= conditioning_scale
+
+        if not return_dict:
+            return (down_block_res_samples, mid_block_res_sample)
+
+        return FlaxControlNetOutput(
+            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
+        )
diff --git a/diffusers/src/diffusers/models/controlnet_flux.py b/diffusers/src/diffusers/models/controlnet_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..88ad49d2b776053a7ff5dfb7b41fcfbb4b1e28c2
--- /dev/null
+++ b/diffusers/src/diffusers/models/controlnet_flux.py
@@ -0,0 +1,518 @@
+# Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..loaders import PeftAdapterMixin
+from ..models.attention_processor import AttentionProcessor
+from ..models.modeling_utils import ModelMixin
+from ..utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
+from .controlnet import BaseOutput, zero_module
+from .embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
+from .modeling_outputs import Transformer2DModelOutput
+from .transformers.transformer_flux import FluxSingleTransformerBlock, FluxTransformerBlock
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class FluxControlNetOutput(BaseOutput):
+    controlnet_block_samples: Tuple[torch.Tensor]
+    controlnet_single_block_samples: Tuple[torch.Tensor]
+
+
+class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: int = 1,
+        in_channels: int = 64,
+        num_layers: int = 19,
+        num_single_layers: int = 38,
+        attention_head_dim: int = 128,
+        num_attention_heads: int = 24,
+        joint_attention_dim: int = 4096,
+        pooled_projection_dim: int = 768,
+        guidance_embeds: bool = False,
+        axes_dims_rope: List[int] = [16, 56, 56],
+        num_mode: int = None,
+    ):
+        super().__init__()
+        self.out_channels = in_channels
+        self.inner_dim = num_attention_heads * attention_head_dim
+
+        self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
+        text_time_guidance_cls = (
+            CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
+        )
+        self.time_text_embed = text_time_guidance_cls(
+            embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
+        )
+
+        self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
+        self.x_embedder = torch.nn.Linear(in_channels, self.inner_dim)
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                FluxTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                )
+                for i in range(num_layers)
+            ]
+        )
+
+        self.single_transformer_blocks = nn.ModuleList(
+            [
+                FluxSingleTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                )
+                for i in range(num_single_layers)
+            ]
+        )
+
+        # controlnet_blocks
+        self.controlnet_blocks = nn.ModuleList([])
+        for _ in range(len(self.transformer_blocks)):
+            self.controlnet_blocks.append(zero_module(nn.Linear(self.inner_dim, self.inner_dim)))
+
+        self.controlnet_single_blocks = nn.ModuleList([])
+        for _ in range(len(self.single_transformer_blocks)):
+            self.controlnet_single_blocks.append(zero_module(nn.Linear(self.inner_dim, self.inner_dim)))
+
+        self.union = num_mode is not None
+        if self.union:
+            self.controlnet_mode_embedder = nn.Embedding(num_mode, self.inner_dim)
+
+        self.controlnet_x_embedder = zero_module(torch.nn.Linear(in_channels, self.inner_dim))
+
+        self.gradient_checkpointing = False
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self):
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    @classmethod
+    def from_transformer(
+        cls,
+        transformer,
+        num_layers: int = 4,
+        num_single_layers: int = 10,
+        attention_head_dim: int = 128,
+        num_attention_heads: int = 24,
+        load_weights_from_transformer=True,
+    ):
+        config = transformer.config
+        config["num_layers"] = num_layers
+        config["num_single_layers"] = num_single_layers
+        config["attention_head_dim"] = attention_head_dim
+        config["num_attention_heads"] = num_attention_heads
+
+        controlnet = cls(**config)
+
+        if load_weights_from_transformer:
+            controlnet.pos_embed.load_state_dict(transformer.pos_embed.state_dict())
+            controlnet.time_text_embed.load_state_dict(transformer.time_text_embed.state_dict())
+            controlnet.context_embedder.load_state_dict(transformer.context_embedder.state_dict())
+            controlnet.x_embedder.load_state_dict(transformer.x_embedder.state_dict())
+            controlnet.transformer_blocks.load_state_dict(transformer.transformer_blocks.state_dict(), strict=False)
+            controlnet.single_transformer_blocks.load_state_dict(
+                transformer.single_transformer_blocks.state_dict(), strict=False
+            )
+
+            controlnet.controlnet_x_embedder = zero_module(controlnet.controlnet_x_embedder)
+
+        return controlnet
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        controlnet_cond: torch.Tensor,
+        controlnet_mode: torch.Tensor = None,
+        conditioning_scale: float = 1.0,
+        encoder_hidden_states: torch.Tensor = None,
+        pooled_projections: torch.Tensor = None,
+        timestep: torch.LongTensor = None,
+        img_ids: torch.Tensor = None,
+        txt_ids: torch.Tensor = None,
+        guidance: torch.Tensor = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
+        """
+        The [`FluxTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+                Input `hidden_states`.
+            controlnet_cond (`torch.Tensor`):
+                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
+            controlnet_mode (`torch.Tensor`):
+                The mode tensor of shape `(batch_size, 1)`.
+            conditioning_scale (`float`, defaults to `1.0`):
+                The scale factor for ControlNet outputs.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
+                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
+                from the embeddings of input conditions.
+            timestep ( `torch.LongTensor`):
+                Used to indicate denoising step.
+            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
+                A list of tensors that if specified are added to the residuals of transformer blocks.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if joint_attention_kwargs is not None:
+            joint_attention_kwargs = joint_attention_kwargs.copy()
+            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+        hidden_states = self.x_embedder(hidden_states)
+
+        # add
+        hidden_states = hidden_states + self.controlnet_x_embedder(controlnet_cond)
+
+        timestep = timestep.to(hidden_states.dtype) * 1000
+        if guidance is not None:
+            guidance = guidance.to(hidden_states.dtype) * 1000
+        else:
+            guidance = None
+        temb = (
+            self.time_text_embed(timestep, pooled_projections)
+            if guidance is None
+            else self.time_text_embed(timestep, guidance, pooled_projections)
+        )
+        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
+
+        if self.union:
+            # union mode
+            if controlnet_mode is None:
+                raise ValueError("`controlnet_mode` cannot be `None` when applying ControlNet-Union")
+            # union mode emb
+            controlnet_mode_emb = self.controlnet_mode_embedder(controlnet_mode)
+            encoder_hidden_states = torch.cat([controlnet_mode_emb, encoder_hidden_states], dim=1)
+            txt_ids = torch.cat([txt_ids[:1], txt_ids], dim=0)
+
+        if txt_ids.ndim == 3:
+            logger.warning(
+                "Passing `txt_ids` 3d torch.Tensor is deprecated."
+                "Please remove the batch dimension and pass it as a 2d torch Tensor"
+            )
+            txt_ids = txt_ids[0]
+        if img_ids.ndim == 3:
+            logger.warning(
+                "Passing `img_ids` 3d torch.Tensor is deprecated."
+                "Please remove the batch dimension and pass it as a 2d torch Tensor"
+            )
+            img_ids = img_ids[0]
+
+        ids = torch.cat((txt_ids, img_ids), dim=0)
+        image_rotary_emb = self.pos_embed(ids)
+
+        block_samples = ()
+        for index_block, block in enumerate(self.transformer_blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    image_rotary_emb,
+                    **ckpt_kwargs,
+                )
+
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=temb,
+                    image_rotary_emb=image_rotary_emb,
+                )
+            block_samples = block_samples + (hidden_states,)
+
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        single_block_samples = ()
+        for index_block, block in enumerate(self.single_transformer_blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    temb,
+                    image_rotary_emb,
+                    **ckpt_kwargs,
+                )
+
+            else:
+                hidden_states = block(
+                    hidden_states=hidden_states,
+                    temb=temb,
+                    image_rotary_emb=image_rotary_emb,
+                )
+            single_block_samples = single_block_samples + (hidden_states[:, encoder_hidden_states.shape[1] :],)
+
+        # controlnet block
+        controlnet_block_samples = ()
+        for block_sample, controlnet_block in zip(block_samples, self.controlnet_blocks):
+            block_sample = controlnet_block(block_sample)
+            controlnet_block_samples = controlnet_block_samples + (block_sample,)
+
+        controlnet_single_block_samples = ()
+        for single_block_sample, controlnet_block in zip(single_block_samples, self.controlnet_single_blocks):
+            single_block_sample = controlnet_block(single_block_sample)
+            controlnet_single_block_samples = controlnet_single_block_samples + (single_block_sample,)
+
+        # scaling
+        controlnet_block_samples = [sample * conditioning_scale for sample in controlnet_block_samples]
+        controlnet_single_block_samples = [sample * conditioning_scale for sample in controlnet_single_block_samples]
+
+        controlnet_block_samples = None if len(controlnet_block_samples) == 0 else controlnet_block_samples
+        controlnet_single_block_samples = (
+            None if len(controlnet_single_block_samples) == 0 else controlnet_single_block_samples
+        )
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (controlnet_block_samples, controlnet_single_block_samples)
+
+        return FluxControlNetOutput(
+            controlnet_block_samples=controlnet_block_samples,
+            controlnet_single_block_samples=controlnet_single_block_samples,
+        )
+
+
+class FluxMultiControlNetModel(ModelMixin):
+    r"""
+    `FluxMultiControlNetModel` wrapper class for Multi-FluxControlNetModel
+
+    This module is a wrapper for multiple instances of the `FluxControlNetModel`. The `forward()` API is designed to be
+    compatible with `FluxControlNetModel`.
+
+    Args:
+        controlnets (`List[FluxControlNetModel]`):
+            Provides additional conditioning to the unet during the denoising process. You must set multiple
+            `FluxControlNetModel` as a list.
+    """
+
+    def __init__(self, controlnets):
+        super().__init__()
+        self.nets = nn.ModuleList(controlnets)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        controlnet_cond: List[torch.tensor],
+        controlnet_mode: List[torch.tensor],
+        conditioning_scale: List[float],
+        encoder_hidden_states: torch.Tensor = None,
+        pooled_projections: torch.Tensor = None,
+        timestep: torch.LongTensor = None,
+        img_ids: torch.Tensor = None,
+        txt_ids: torch.Tensor = None,
+        guidance: torch.Tensor = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[FluxControlNetOutput, Tuple]:
+        # ControlNet-Union with multiple conditions
+        # only load one ControlNet for saving memories
+        if len(self.nets) == 1 and self.nets[0].union:
+            controlnet = self.nets[0]
+
+            for i, (image, mode, scale) in enumerate(zip(controlnet_cond, controlnet_mode, conditioning_scale)):
+                block_samples, single_block_samples = controlnet(
+                    hidden_states=hidden_states,
+                    controlnet_cond=image,
+                    controlnet_mode=mode[:, None],
+                    conditioning_scale=scale,
+                    timestep=timestep,
+                    guidance=guidance,
+                    pooled_projections=pooled_projections,
+                    encoder_hidden_states=encoder_hidden_states,
+                    txt_ids=txt_ids,
+                    img_ids=img_ids,
+                    joint_attention_kwargs=joint_attention_kwargs,
+                    return_dict=return_dict,
+                )
+
+                # merge samples
+                if i == 0:
+                    control_block_samples = block_samples
+                    control_single_block_samples = single_block_samples
+                else:
+                    control_block_samples = [
+                        control_block_sample + block_sample
+                        for control_block_sample, block_sample in zip(control_block_samples, block_samples)
+                    ]
+
+                    control_single_block_samples = [
+                        control_single_block_sample + block_sample
+                        for control_single_block_sample, block_sample in zip(
+                            control_single_block_samples, single_block_samples
+                        )
+                    ]
+
+        # Regular Multi-ControlNets
+        # load all ControlNets into memories
+        else:
+            for i, (image, mode, scale, controlnet) in enumerate(
+                zip(controlnet_cond, controlnet_mode, conditioning_scale, self.nets)
+            ):
+                block_samples, single_block_samples = controlnet(
+                    hidden_states=hidden_states,
+                    controlnet_cond=image,
+                    controlnet_mode=mode[:, None],
+                    conditioning_scale=scale,
+                    timestep=timestep,
+                    guidance=guidance,
+                    pooled_projections=pooled_projections,
+                    encoder_hidden_states=encoder_hidden_states,
+                    txt_ids=txt_ids,
+                    img_ids=img_ids,
+                    joint_attention_kwargs=joint_attention_kwargs,
+                    return_dict=return_dict,
+                )
+
+                # merge samples
+                if i == 0:
+                    control_block_samples = block_samples
+                    control_single_block_samples = single_block_samples
+                else:
+                    if block_samples is not None and control_block_samples is not None:
+                        control_block_samples = [
+                            control_block_sample + block_sample
+                            for control_block_sample, block_sample in zip(control_block_samples, block_samples)
+                        ]
+                    if single_block_samples is not None and control_single_block_samples is not None:
+                        control_single_block_samples = [
+                            control_single_block_sample + block_sample
+                            for control_single_block_sample, block_sample in zip(
+                                control_single_block_samples, single_block_samples
+                            )
+                        ]
+
+        return control_block_samples, control_single_block_samples
diff --git a/diffusers/src/diffusers/models/controlnet_hunyuan.py b/diffusers/src/diffusers/models/controlnet_hunyuan.py
new file mode 100644
index 0000000000000000000000000000000000000000..4277d81d1cb9bc20ccb39a0a36b5141e40cc1a1e
--- /dev/null
+++ b/diffusers/src/diffusers/models/controlnet_hunyuan.py
@@ -0,0 +1,401 @@
+# Copyright 2024 HunyuanDiT Authors, Qixun Wang and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Dict, Optional, Union
+
+import torch
+from torch import nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import logging
+from .attention_processor import AttentionProcessor
+from .controlnet import BaseOutput, Tuple, zero_module
+from .embeddings import (
+    HunyuanCombinedTimestepTextSizeStyleEmbedding,
+    PatchEmbed,
+    PixArtAlphaTextProjection,
+)
+from .modeling_utils import ModelMixin
+from .transformers.hunyuan_transformer_2d import HunyuanDiTBlock
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class HunyuanControlNetOutput(BaseOutput):
+    controlnet_block_samples: Tuple[torch.Tensor]
+
+
+class HunyuanDiT2DControlNetModel(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(
+        self,
+        conditioning_channels: int = 3,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        patch_size: Optional[int] = None,
+        activation_fn: str = "gelu-approximate",
+        sample_size=32,
+        hidden_size=1152,
+        transformer_num_layers: int = 40,
+        mlp_ratio: float = 4.0,
+        cross_attention_dim: int = 1024,
+        cross_attention_dim_t5: int = 2048,
+        pooled_projection_dim: int = 1024,
+        text_len: int = 77,
+        text_len_t5: int = 256,
+        use_style_cond_and_image_meta_size: bool = True,
+    ):
+        super().__init__()
+        self.num_heads = num_attention_heads
+        self.inner_dim = num_attention_heads * attention_head_dim
+
+        self.text_embedder = PixArtAlphaTextProjection(
+            in_features=cross_attention_dim_t5,
+            hidden_size=cross_attention_dim_t5 * 4,
+            out_features=cross_attention_dim,
+            act_fn="silu_fp32",
+        )
+
+        self.text_embedding_padding = nn.Parameter(
+            torch.randn(text_len + text_len_t5, cross_attention_dim, dtype=torch.float32)
+        )
+
+        self.pos_embed = PatchEmbed(
+            height=sample_size,
+            width=sample_size,
+            in_channels=in_channels,
+            embed_dim=hidden_size,
+            patch_size=patch_size,
+            pos_embed_type=None,
+        )
+
+        self.time_extra_emb = HunyuanCombinedTimestepTextSizeStyleEmbedding(
+            hidden_size,
+            pooled_projection_dim=pooled_projection_dim,
+            seq_len=text_len_t5,
+            cross_attention_dim=cross_attention_dim_t5,
+            use_style_cond_and_image_meta_size=use_style_cond_and_image_meta_size,
+        )
+
+        # controlnet_blocks
+        self.controlnet_blocks = nn.ModuleList([])
+
+        # HunyuanDiT Blocks
+        self.blocks = nn.ModuleList(
+            [
+                HunyuanDiTBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=self.config.num_attention_heads,
+                    activation_fn=activation_fn,
+                    ff_inner_dim=int(self.inner_dim * mlp_ratio),
+                    cross_attention_dim=cross_attention_dim,
+                    qk_norm=True,  # See http://arxiv.org/abs/2302.05442 for details.
+                    skip=False,  # always False as it is the first half of the model
+                )
+                for layer in range(transformer_num_layers // 2 - 1)
+            ]
+        )
+        self.input_block = zero_module(nn.Linear(hidden_size, hidden_size))
+        for _ in range(len(self.blocks)):
+            controlnet_block = nn.Linear(hidden_size, hidden_size)
+            controlnet_block = zero_module(controlnet_block)
+            self.controlnet_blocks.append(controlnet_block)
+
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers. If `processor` is a dict, the key needs to define the path to the
+                corresponding cross attention processor. This is strongly recommended when setting trainable attention
+                processors.
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    @classmethod
+    def from_transformer(
+        cls, transformer, conditioning_channels=3, transformer_num_layers=None, load_weights_from_transformer=True
+    ):
+        config = transformer.config
+        activation_fn = config.activation_fn
+        attention_head_dim = config.attention_head_dim
+        cross_attention_dim = config.cross_attention_dim
+        cross_attention_dim_t5 = config.cross_attention_dim_t5
+        hidden_size = config.hidden_size
+        in_channels = config.in_channels
+        mlp_ratio = config.mlp_ratio
+        num_attention_heads = config.num_attention_heads
+        patch_size = config.patch_size
+        sample_size = config.sample_size
+        text_len = config.text_len
+        text_len_t5 = config.text_len_t5
+
+        conditioning_channels = conditioning_channels
+        transformer_num_layers = transformer_num_layers or config.transformer_num_layers
+
+        controlnet = cls(
+            conditioning_channels=conditioning_channels,
+            transformer_num_layers=transformer_num_layers,
+            activation_fn=activation_fn,
+            attention_head_dim=attention_head_dim,
+            cross_attention_dim=cross_attention_dim,
+            cross_attention_dim_t5=cross_attention_dim_t5,
+            hidden_size=hidden_size,
+            in_channels=in_channels,
+            mlp_ratio=mlp_ratio,
+            num_attention_heads=num_attention_heads,
+            patch_size=patch_size,
+            sample_size=sample_size,
+            text_len=text_len,
+            text_len_t5=text_len_t5,
+        )
+        if load_weights_from_transformer:
+            key = controlnet.load_state_dict(transformer.state_dict(), strict=False)
+            logger.warning(f"controlnet load from Hunyuan-DiT. missing_keys: {key[0]}")
+        return controlnet
+
+    def forward(
+        self,
+        hidden_states,
+        timestep,
+        controlnet_cond: torch.Tensor,
+        conditioning_scale: float = 1.0,
+        encoder_hidden_states=None,
+        text_embedding_mask=None,
+        encoder_hidden_states_t5=None,
+        text_embedding_mask_t5=None,
+        image_meta_size=None,
+        style=None,
+        image_rotary_emb=None,
+        return_dict=True,
+    ):
+        """
+        The [`HunyuanDiT2DControlNetModel`] forward method.
+
+        Args:
+        hidden_states (`torch.Tensor` of shape `(batch size, dim, height, width)`):
+            The input tensor.
+        timestep ( `torch.LongTensor`, *optional*):
+            Used to indicate denoising step.
+        controlnet_cond ( `torch.Tensor` ):
+            The conditioning input to ControlNet.
+        conditioning_scale ( `float` ):
+            Indicate the conditioning scale.
+        encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+            Conditional embeddings for cross attention layer. This is the output of `BertModel`.
+        text_embedding_mask: torch.Tensor
+            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. This is the output
+            of `BertModel`.
+        encoder_hidden_states_t5 ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+            Conditional embeddings for cross attention layer. This is the output of T5 Text Encoder.
+        text_embedding_mask_t5: torch.Tensor
+            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. This is the output
+            of T5 Text Encoder.
+        image_meta_size (torch.Tensor):
+            Conditional embedding indicate the image sizes
+        style: torch.Tensor:
+            Conditional embedding indicate the style
+        image_rotary_emb (`torch.Tensor`):
+            The image rotary embeddings to apply on query and key tensors during attention calculation.
+        return_dict: bool
+            Whether to return a dictionary.
+        """
+
+        height, width = hidden_states.shape[-2:]
+
+        hidden_states = self.pos_embed(hidden_states)  # b,c,H,W -> b, N, C
+
+        # 2. pre-process
+        hidden_states = hidden_states + self.input_block(self.pos_embed(controlnet_cond))
+
+        temb = self.time_extra_emb(
+            timestep, encoder_hidden_states_t5, image_meta_size, style, hidden_dtype=timestep.dtype
+        )  # [B, D]
+
+        # text projection
+        batch_size, sequence_length, _ = encoder_hidden_states_t5.shape
+        encoder_hidden_states_t5 = self.text_embedder(
+            encoder_hidden_states_t5.view(-1, encoder_hidden_states_t5.shape[-1])
+        )
+        encoder_hidden_states_t5 = encoder_hidden_states_t5.view(batch_size, sequence_length, -1)
+
+        encoder_hidden_states = torch.cat([encoder_hidden_states, encoder_hidden_states_t5], dim=1)
+        text_embedding_mask = torch.cat([text_embedding_mask, text_embedding_mask_t5], dim=-1)
+        text_embedding_mask = text_embedding_mask.unsqueeze(2).bool()
+
+        encoder_hidden_states = torch.where(text_embedding_mask, encoder_hidden_states, self.text_embedding_padding)
+
+        block_res_samples = ()
+        for layer, block in enumerate(self.blocks):
+            hidden_states = block(
+                hidden_states,
+                temb=temb,
+                encoder_hidden_states=encoder_hidden_states,
+                image_rotary_emb=image_rotary_emb,
+            )  # (N, L, D)
+
+            block_res_samples = block_res_samples + (hidden_states,)
+
+        controlnet_block_res_samples = ()
+        for block_res_sample, controlnet_block in zip(block_res_samples, self.controlnet_blocks):
+            block_res_sample = controlnet_block(block_res_sample)
+            controlnet_block_res_samples = controlnet_block_res_samples + (block_res_sample,)
+
+        # 6. scaling
+        controlnet_block_res_samples = [sample * conditioning_scale for sample in controlnet_block_res_samples]
+
+        if not return_dict:
+            return (controlnet_block_res_samples,)
+
+        return HunyuanControlNetOutput(controlnet_block_samples=controlnet_block_res_samples)
+
+
+class HunyuanDiT2DMultiControlNetModel(ModelMixin):
+    r"""
+    `HunyuanDiT2DMultiControlNetModel` wrapper class for Multi-HunyuanDiT2DControlNetModel
+
+    This module is a wrapper for multiple instances of the `HunyuanDiT2DControlNetModel`. The `forward()` API is
+    designed to be compatible with `HunyuanDiT2DControlNetModel`.
+
+    Args:
+        controlnets (`List[HunyuanDiT2DControlNetModel]`):
+            Provides additional conditioning to the unet during the denoising process. You must set multiple
+            `HunyuanDiT2DControlNetModel` as a list.
+    """
+
+    def __init__(self, controlnets):
+        super().__init__()
+        self.nets = nn.ModuleList(controlnets)
+
+    def forward(
+        self,
+        hidden_states,
+        timestep,
+        controlnet_cond: torch.Tensor,
+        conditioning_scale: float = 1.0,
+        encoder_hidden_states=None,
+        text_embedding_mask=None,
+        encoder_hidden_states_t5=None,
+        text_embedding_mask_t5=None,
+        image_meta_size=None,
+        style=None,
+        image_rotary_emb=None,
+        return_dict=True,
+    ):
+        """
+        The [`HunyuanDiT2DControlNetModel`] forward method.
+
+        Args:
+        hidden_states (`torch.Tensor` of shape `(batch size, dim, height, width)`):
+            The input tensor.
+        timestep ( `torch.LongTensor`, *optional*):
+            Used to indicate denoising step.
+        controlnet_cond ( `torch.Tensor` ):
+            The conditioning input to ControlNet.
+        conditioning_scale ( `float` ):
+            Indicate the conditioning scale.
+        encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+            Conditional embeddings for cross attention layer. This is the output of `BertModel`.
+        text_embedding_mask: torch.Tensor
+            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. This is the output
+            of `BertModel`.
+        encoder_hidden_states_t5 ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+            Conditional embeddings for cross attention layer. This is the output of T5 Text Encoder.
+        text_embedding_mask_t5: torch.Tensor
+            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. This is the output
+            of T5 Text Encoder.
+        image_meta_size (torch.Tensor):
+            Conditional embedding indicate the image sizes
+        style: torch.Tensor:
+            Conditional embedding indicate the style
+        image_rotary_emb (`torch.Tensor`):
+            The image rotary embeddings to apply on query and key tensors during attention calculation.
+        return_dict: bool
+            Whether to return a dictionary.
+        """
+        for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
+            block_samples = controlnet(
+                hidden_states=hidden_states,
+                timestep=timestep,
+                controlnet_cond=image,
+                conditioning_scale=scale,
+                encoder_hidden_states=encoder_hidden_states,
+                text_embedding_mask=text_embedding_mask,
+                encoder_hidden_states_t5=encoder_hidden_states_t5,
+                text_embedding_mask_t5=text_embedding_mask_t5,
+                image_meta_size=image_meta_size,
+                style=style,
+                image_rotary_emb=image_rotary_emb,
+                return_dict=return_dict,
+            )
+
+            # merge samples
+            if i == 0:
+                control_block_samples = block_samples
+            else:
+                control_block_samples = [
+                    control_block_sample + block_sample
+                    for control_block_sample, block_sample in zip(control_block_samples[0], block_samples[0])
+                ]
+                control_block_samples = (control_block_samples,)
+
+        return control_block_samples
diff --git a/diffusers/src/diffusers/models/controlnet_sd3.py b/diffusers/src/diffusers/models/controlnet_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..43b52a645a0d88e508418fcff3c8da6537032080
--- /dev/null
+++ b/diffusers/src/diffusers/models/controlnet_sd3.py
@@ -0,0 +1,422 @@
+# Copyright 2024 Stability AI, The HuggingFace Team and The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ..models.attention import JointTransformerBlock
+from ..models.attention_processor import Attention, AttentionProcessor, FusedJointAttnProcessor2_0
+from ..models.modeling_outputs import Transformer2DModelOutput
+from ..models.modeling_utils import ModelMixin
+from ..utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
+from .controlnet import BaseOutput, zero_module
+from .embeddings import CombinedTimestepTextProjEmbeddings, PatchEmbed
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class SD3ControlNetOutput(BaseOutput):
+    controlnet_block_samples: Tuple[torch.Tensor]
+
+
+class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: int = 128,
+        patch_size: int = 2,
+        in_channels: int = 16,
+        num_layers: int = 18,
+        attention_head_dim: int = 64,
+        num_attention_heads: int = 18,
+        joint_attention_dim: int = 4096,
+        caption_projection_dim: int = 1152,
+        pooled_projection_dim: int = 2048,
+        out_channels: int = 16,
+        pos_embed_max_size: int = 96,
+        extra_conditioning_channels: int = 0,
+    ):
+        super().__init__()
+        default_out_channels = in_channels
+        self.out_channels = out_channels if out_channels is not None else default_out_channels
+        self.inner_dim = num_attention_heads * attention_head_dim
+
+        self.pos_embed = PatchEmbed(
+            height=sample_size,
+            width=sample_size,
+            patch_size=patch_size,
+            in_channels=in_channels,
+            embed_dim=self.inner_dim,
+            pos_embed_max_size=pos_embed_max_size,
+        )
+        self.time_text_embed = CombinedTimestepTextProjEmbeddings(
+            embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
+        )
+        self.context_embedder = nn.Linear(joint_attention_dim, caption_projection_dim)
+
+        # `attention_head_dim` is doubled to account for the mixing.
+        # It needs to crafted when we get the actual checkpoints.
+        self.transformer_blocks = nn.ModuleList(
+            [
+                JointTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=self.config.attention_head_dim,
+                    context_pre_only=False,
+                )
+                for i in range(num_layers)
+            ]
+        )
+
+        # controlnet_blocks
+        self.controlnet_blocks = nn.ModuleList([])
+        for _ in range(len(self.transformer_blocks)):
+            controlnet_block = nn.Linear(self.inner_dim, self.inner_dim)
+            controlnet_block = zero_module(controlnet_block)
+            self.controlnet_blocks.append(controlnet_block)
+        pos_embed_input = PatchEmbed(
+            height=sample_size,
+            width=sample_size,
+            patch_size=patch_size,
+            in_channels=in_channels + extra_conditioning_channels,
+            embed_dim=self.inner_dim,
+            pos_embed_type=None,
+        )
+        self.pos_embed_input = zero_module(pos_embed_input)
+
+        self.gradient_checkpointing = False
+
+    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
+    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
+        """
+        Sets the attention processor to use [feed forward
+        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+
+        Parameters:
+            chunk_size (`int`, *optional*):
+                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
+                over each tensor of dim=`dim`.
+            dim (`int`, *optional*, defaults to `0`):
+                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
+                or dim=1 (sequence length).
+        """
+        if dim not in [0, 1]:
+            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
+
+        # By default chunk size is 1
+        chunk_size = chunk_size or 1
+
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, chunk_size, dim)
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.transformers.transformer_sd3.SD3Transformer2DModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedJointAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    @classmethod
+    def from_transformer(
+        cls, transformer, num_layers=12, num_extra_conditioning_channels=1, load_weights_from_transformer=True
+    ):
+        config = transformer.config
+        config["num_layers"] = num_layers or config.num_layers
+        config["extra_conditioning_channels"] = num_extra_conditioning_channels
+        controlnet = cls(**config)
+
+        if load_weights_from_transformer:
+            controlnet.pos_embed.load_state_dict(transformer.pos_embed.state_dict())
+            controlnet.time_text_embed.load_state_dict(transformer.time_text_embed.state_dict())
+            controlnet.context_embedder.load_state_dict(transformer.context_embedder.state_dict())
+            controlnet.transformer_blocks.load_state_dict(transformer.transformer_blocks.state_dict(), strict=False)
+
+            controlnet.pos_embed_input = zero_module(controlnet.pos_embed_input)
+
+        return controlnet
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        controlnet_cond: torch.Tensor,
+        conditioning_scale: float = 1.0,
+        encoder_hidden_states: torch.FloatTensor = None,
+        pooled_projections: torch.FloatTensor = None,
+        timestep: torch.LongTensor = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
+        """
+        The [`SD3Transformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+                Input `hidden_states`.
+            controlnet_cond (`torch.Tensor`):
+                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
+            conditioning_scale (`float`, defaults to `1.0`):
+                The scale factor for ControlNet outputs.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
+                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
+                from the embeddings of input conditions.
+            timestep ( `torch.LongTensor`):
+                Used to indicate denoising step.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if joint_attention_kwargs is not None:
+            joint_attention_kwargs = joint_attention_kwargs.copy()
+            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        hidden_states = self.pos_embed(hidden_states)  # takes care of adding positional embeddings too.
+        temb = self.time_text_embed(timestep, pooled_projections)
+        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
+
+        # add
+        hidden_states = hidden_states + self.pos_embed_input(controlnet_cond)
+
+        block_res_samples = ()
+
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb
+                )
+
+            block_res_samples = block_res_samples + (hidden_states,)
+
+        controlnet_block_res_samples = ()
+        for block_res_sample, controlnet_block in zip(block_res_samples, self.controlnet_blocks):
+            block_res_sample = controlnet_block(block_res_sample)
+            controlnet_block_res_samples = controlnet_block_res_samples + (block_res_sample,)
+
+        # 6. scaling
+        controlnet_block_res_samples = [sample * conditioning_scale for sample in controlnet_block_res_samples]
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (controlnet_block_res_samples,)
+
+        return SD3ControlNetOutput(controlnet_block_samples=controlnet_block_res_samples)
+
+
+class SD3MultiControlNetModel(ModelMixin):
+    r"""
+    `SD3ControlNetModel` wrapper class for Multi-SD3ControlNet
+
+    This module is a wrapper for multiple instances of the `SD3ControlNetModel`. The `forward()` API is designed to be
+    compatible with `SD3ControlNetModel`.
+
+    Args:
+        controlnets (`List[SD3ControlNetModel]`):
+            Provides additional conditioning to the unet during the denoising process. You must set multiple
+            `SD3ControlNetModel` as a list.
+    """
+
+    def __init__(self, controlnets):
+        super().__init__()
+        self.nets = nn.ModuleList(controlnets)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        controlnet_cond: List[torch.tensor],
+        conditioning_scale: List[float],
+        pooled_projections: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        timestep: torch.LongTensor = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[SD3ControlNetOutput, Tuple]:
+        for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
+            block_samples = controlnet(
+                hidden_states=hidden_states,
+                timestep=timestep,
+                encoder_hidden_states=encoder_hidden_states,
+                pooled_projections=pooled_projections,
+                controlnet_cond=image,
+                conditioning_scale=scale,
+                joint_attention_kwargs=joint_attention_kwargs,
+                return_dict=return_dict,
+            )
+
+            # merge samples
+            if i == 0:
+                control_block_samples = block_samples
+            else:
+                control_block_samples = [
+                    control_block_sample + block_sample
+                    for control_block_sample, block_sample in zip(control_block_samples[0], block_samples[0])
+                ]
+                control_block_samples = (tuple(control_block_samples),)
+
+        return control_block_samples
diff --git a/diffusers/src/diffusers/models/controlnet_sparsectrl.py b/diffusers/src/diffusers/models/controlnet_sparsectrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa37e1f9e393598b4975e4dcfd49ee68a6873d99
--- /dev/null
+++ b/diffusers/src/diffusers/models/controlnet_sparsectrl.py
@@ -0,0 +1,788 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..loaders import FromOriginalModelMixin
+from ..utils import BaseOutput, logging
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from .embeddings import TimestepEmbedding, Timesteps
+from .modeling_utils import ModelMixin
+from .unets.unet_2d_blocks import UNetMidBlock2DCrossAttn
+from .unets.unet_2d_condition import UNet2DConditionModel
+from .unets.unet_motion_model import CrossAttnDownBlockMotion, DownBlockMotion
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class SparseControlNetOutput(BaseOutput):
+    """
+    The output of [`SparseControlNetModel`].
+
+    Args:
+        down_block_res_samples (`tuple[torch.Tensor]`):
+            A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
+            be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
+            used to condition the original UNet's downsampling activations.
+        mid_down_block_re_sample (`torch.Tensor`):
+            The activation of the middle block (the lowest sample resolution). Each tensor should be of shape
+            `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
+            Output can be used to condition the original UNet's middle block activation.
+    """
+
+    down_block_res_samples: Tuple[torch.Tensor]
+    mid_block_res_sample: torch.Tensor
+
+
+class SparseControlNetConditioningEmbedding(nn.Module):
+    def __init__(
+        self,
+        conditioning_embedding_channels: int,
+        conditioning_channels: int = 3,
+        block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
+    ):
+        super().__init__()
+
+        self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
+        self.blocks = nn.ModuleList([])
+
+        for i in range(len(block_out_channels) - 1):
+            channel_in = block_out_channels[i]
+            channel_out = block_out_channels[i + 1]
+            self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
+            self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
+
+        self.conv_out = zero_module(
+            nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
+        )
+
+    def forward(self, conditioning: torch.Tensor) -> torch.Tensor:
+        embedding = self.conv_in(conditioning)
+        embedding = F.silu(embedding)
+
+        for block in self.blocks:
+            embedding = block(embedding)
+            embedding = F.silu(embedding)
+
+        embedding = self.conv_out(embedding)
+        return embedding
+
+
+class SparseControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+    """
+    A SparseControlNet model as described in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion
+    Models](https://arxiv.org/abs/2311.16933).
+
+    Args:
+        in_channels (`int`, defaults to 4):
+            The number of channels in the input sample.
+        conditioning_channels (`int`, defaults to 4):
+            The number of input channels in the controlnet conditional embedding module. If
+            `concat_condition_embedding` is True, the value provided here is incremented by 1.
+        flip_sin_to_cos (`bool`, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, defaults to 0):
+            The frequency shift to apply to the time embedding.
+        down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
+        block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, defaults to 2):
+            The number of layers per block.
+        downsample_padding (`int`, defaults to 1):
+            The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, defaults to 1):
+            The scale factor to use for the mid block.
+        act_fn (`str`, defaults to "silu"):
+            The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use for the normalization. If None, normalization and activation layers is skipped
+            in post-processing.
+        norm_eps (`float`, defaults to 1e-5):
+            The epsilon to use for the normalization.
+        cross_attention_dim (`int`, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        transformer_layers_per_mid_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
+            The number of transformer layers to use in each layer in the middle block.
+        attention_head_dim (`int` or `Tuple[int]`, defaults to 8):
+            The dimension of the attention heads.
+        num_attention_heads (`int` or `Tuple[int]`, *optional*):
+            The number of heads to use for multi-head attention.
+        use_linear_projection (`bool`, defaults to `False`):
+        upcast_attention (`bool`, defaults to `False`):
+        resnet_time_scale_shift (`str`, defaults to `"default"`):
+            Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
+        conditioning_embedding_out_channels (`Tuple[int]`, defaults to `(16, 32, 96, 256)`):
+            The tuple of output channel for each block in the `conditioning_embedding` layer.
+        global_pool_conditions (`bool`, defaults to `False`):
+            TODO(Patrick) - unused parameter
+        controlnet_conditioning_channel_order (`str`, defaults to `rgb`):
+        motion_max_seq_length (`int`, defaults to `32`):
+            The maximum sequence length to use in the motion module.
+        motion_num_attention_heads (`int` or `Tuple[int]`, defaults to `8`):
+            The number of heads to use in each attention layer of the motion module.
+        concat_conditioning_mask (`bool`, defaults to `True`):
+        use_simplified_condition_embedding (`bool`, defaults to `True`):
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 4,
+        conditioning_channels: int = 4,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str, ...] = (
+            "CrossAttnDownBlockMotion",
+            "CrossAttnDownBlockMotion",
+            "CrossAttnDownBlockMotion",
+            "DownBlockMotion",
+        ),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 768,
+        transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
+        transformer_layers_per_mid_block: Optional[Union[int, Tuple[int]]] = None,
+        temporal_transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
+        attention_head_dim: Union[int, Tuple[int, ...]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
+        global_pool_conditions: bool = False,
+        controlnet_conditioning_channel_order: str = "rgb",
+        motion_max_seq_length: int = 32,
+        motion_num_attention_heads: int = 8,
+        concat_conditioning_mask: bool = True,
+        use_simplified_condition_embedding: bool = True,
+    ):
+        super().__init__()
+        self.use_simplified_condition_embedding = use_simplified_condition_embedding
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
+        # Check inputs
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+        if isinstance(temporal_transformer_layers_per_block, int):
+            temporal_transformer_layers_per_block = [temporal_transformer_layers_per_block] * len(down_block_types)
+
+        # input
+        conv_in_kernel = 3
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        if concat_conditioning_mask:
+            conditioning_channels = conditioning_channels + 1
+
+        self.concat_conditioning_mask = concat_conditioning_mask
+
+        # control net conditioning embedding
+        if use_simplified_condition_embedding:
+            self.controlnet_cond_embedding = zero_module(
+                nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
+            )
+        else:
+            self.controlnet_cond_embedding = SparseControlNetConditioningEmbedding(
+                conditioning_embedding_channels=block_out_channels[0],
+                block_out_channels=conditioning_embedding_out_channels,
+                conditioning_channels=conditioning_channels,
+            )
+
+        # time
+        time_embed_dim = block_out_channels[0] * 4
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+        )
+
+        self.down_blocks = nn.ModuleList([])
+        self.controlnet_down_blocks = nn.ModuleList([])
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        if isinstance(motion_num_attention_heads, int):
+            motion_num_attention_heads = (motion_num_attention_heads,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+
+        controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+        controlnet_block = zero_module(controlnet_block)
+        self.controlnet_down_blocks.append(controlnet_block)
+
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            if down_block_type == "CrossAttnDownBlockMotion":
+                down_block = CrossAttnDownBlockMotion(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    temb_channels=time_embed_dim,
+                    dropout=0,
+                    num_layers=layers_per_block,
+                    transformer_layers_per_block=transformer_layers_per_block[i],
+                    resnet_eps=norm_eps,
+                    resnet_time_scale_shift=resnet_time_scale_shift,
+                    resnet_act_fn=act_fn,
+                    resnet_groups=norm_num_groups,
+                    resnet_pre_norm=True,
+                    num_attention_heads=num_attention_heads[i],
+                    cross_attention_dim=cross_attention_dim[i],
+                    add_downsample=not is_final_block,
+                    dual_cross_attention=False,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention[i],
+                    upcast_attention=upcast_attention,
+                    temporal_num_attention_heads=motion_num_attention_heads[i],
+                    temporal_max_seq_length=motion_max_seq_length,
+                    temporal_transformer_layers_per_block=temporal_transformer_layers_per_block[i],
+                    temporal_double_self_attention=False,
+                )
+            elif down_block_type == "DownBlockMotion":
+                down_block = DownBlockMotion(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    temb_channels=time_embed_dim,
+                    dropout=0,
+                    num_layers=layers_per_block,
+                    resnet_eps=norm_eps,
+                    resnet_time_scale_shift=resnet_time_scale_shift,
+                    resnet_act_fn=act_fn,
+                    resnet_groups=norm_num_groups,
+                    resnet_pre_norm=True,
+                    add_downsample=not is_final_block,
+                    temporal_num_attention_heads=motion_num_attention_heads[i],
+                    temporal_max_seq_length=motion_max_seq_length,
+                    temporal_transformer_layers_per_block=temporal_transformer_layers_per_block[i],
+                    temporal_double_self_attention=False,
+                )
+            else:
+                raise ValueError(
+                    "Invalid `block_type` encountered. Must be one of `CrossAttnDownBlockMotion` or `DownBlockMotion`"
+                )
+
+            self.down_blocks.append(down_block)
+
+            for _ in range(layers_per_block):
+                controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+                controlnet_block = zero_module(controlnet_block)
+                self.controlnet_down_blocks.append(controlnet_block)
+
+            if not is_final_block:
+                controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
+                controlnet_block = zero_module(controlnet_block)
+                self.controlnet_down_blocks.append(controlnet_block)
+
+        # mid
+        mid_block_channels = block_out_channels[-1]
+
+        controlnet_block = nn.Conv2d(mid_block_channels, mid_block_channels, kernel_size=1)
+        controlnet_block = zero_module(controlnet_block)
+        self.controlnet_mid_block = controlnet_block
+
+        if transformer_layers_per_mid_block is None:
+            transformer_layers_per_mid_block = (
+                transformer_layers_per_block[-1] if isinstance(transformer_layers_per_block[-1], int) else 1
+            )
+
+        self.mid_block = UNetMidBlock2DCrossAttn(
+            in_channels=mid_block_channels,
+            temb_channels=time_embed_dim,
+            dropout=0,
+            num_layers=1,
+            transformer_layers_per_block=transformer_layers_per_mid_block,
+            resnet_eps=norm_eps,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            resnet_act_fn=act_fn,
+            resnet_groups=norm_num_groups,
+            resnet_pre_norm=True,
+            num_attention_heads=num_attention_heads[-1],
+            output_scale_factor=mid_block_scale_factor,
+            cross_attention_dim=cross_attention_dim[-1],
+            dual_cross_attention=False,
+            use_linear_projection=use_linear_projection,
+            upcast_attention=upcast_attention,
+            attention_type="default",
+        )
+
+    @classmethod
+    def from_unet(
+        cls,
+        unet: UNet2DConditionModel,
+        controlnet_conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
+        load_weights_from_unet: bool = True,
+        conditioning_channels: int = 3,
+    ) -> "SparseControlNetModel":
+        r"""
+        Instantiate a [`SparseControlNetModel`] from [`UNet2DConditionModel`].
+
+        Parameters:
+            unet (`UNet2DConditionModel`):
+                The UNet model weights to copy to the [`SparseControlNetModel`]. All configuration options are also
+                copied where applicable.
+        """
+        transformer_layers_per_block = (
+            unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
+        )
+        down_block_types = unet.config.down_block_types
+
+        for i in range(len(down_block_types)):
+            if "CrossAttn" in down_block_types[i]:
+                down_block_types[i] = "CrossAttnDownBlockMotion"
+            elif "Down" in down_block_types[i]:
+                down_block_types[i] = "DownBlockMotion"
+            else:
+                raise ValueError("Invalid `block_type` encountered. Must be a cross-attention or down block")
+
+        controlnet = cls(
+            in_channels=unet.config.in_channels,
+            conditioning_channels=conditioning_channels,
+            flip_sin_to_cos=unet.config.flip_sin_to_cos,
+            freq_shift=unet.config.freq_shift,
+            down_block_types=unet.config.down_block_types,
+            only_cross_attention=unet.config.only_cross_attention,
+            block_out_channels=unet.config.block_out_channels,
+            layers_per_block=unet.config.layers_per_block,
+            downsample_padding=unet.config.downsample_padding,
+            mid_block_scale_factor=unet.config.mid_block_scale_factor,
+            act_fn=unet.config.act_fn,
+            norm_num_groups=unet.config.norm_num_groups,
+            norm_eps=unet.config.norm_eps,
+            cross_attention_dim=unet.config.cross_attention_dim,
+            transformer_layers_per_block=transformer_layers_per_block,
+            attention_head_dim=unet.config.attention_head_dim,
+            num_attention_heads=unet.config.num_attention_heads,
+            use_linear_projection=unet.config.use_linear_projection,
+            upcast_attention=unet.config.upcast_attention,
+            resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
+            conditioning_embedding_out_channels=conditioning_embedding_out_channels,
+            controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
+        )
+
+        if load_weights_from_unet:
+            controlnet.conv_in.load_state_dict(unet.conv_in.state_dict(), strict=False)
+            controlnet.time_proj.load_state_dict(unet.time_proj.state_dict(), strict=False)
+            controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict(), strict=False)
+            controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict(), strict=False)
+            controlnet.mid_block.load_state_dict(unet.mid_block.state_dict(), strict=False)
+
+        return controlnet
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
+    def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+
+        num_sliceable_layers = len(sliceable_head_dims)
+
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
+        if isinstance(module, (CrossAttnDownBlockMotion, DownBlockMotion, UNetMidBlock2DCrossAttn)):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        controlnet_cond: torch.Tensor,
+        conditioning_scale: float = 1.0,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        conditioning_mask: Optional[torch.Tensor] = None,
+        guess_mode: bool = False,
+        return_dict: bool = True,
+    ) -> Union[SparseControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]:
+        """
+        The [`SparseControlNetModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor.
+            timestep (`Union[torch.Tensor, float, int]`):
+                The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states.
+            controlnet_cond (`torch.Tensor`):
+                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
+            conditioning_scale (`float`, defaults to `1.0`):
+                The scale factor for ControlNet outputs.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
+                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
+                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
+                embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            added_cond_kwargs (`dict`):
+                Additional conditions for the Stable Diffusion XL UNet.
+            cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
+                A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
+            guess_mode (`bool`, defaults to `False`):
+                In this mode, the ControlNet encoder tries its best to recognize the input content of the input even if
+                you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
+            return_dict (`bool`, defaults to `True`):
+                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
+        Returns:
+            [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
+                If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
+                returned where the first element is the sample tensor.
+        """
+        sample_batch_size, sample_channels, sample_num_frames, sample_height, sample_width = sample.shape
+        sample = torch.zeros_like(sample)
+
+        # check channel order
+        channel_order = self.config.controlnet_conditioning_channel_order
+
+        if channel_order == "rgb":
+            # in rgb order by default
+            ...
+        elif channel_order == "bgr":
+            controlnet_cond = torch.flip(controlnet_cond, dims=[1])
+        else:
+            raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
+
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        emb = emb.repeat_interleave(sample_num_frames, dim=0)
+
+        # 2. pre-process
+        batch_size, channels, num_frames, height, width = sample.shape
+
+        sample = sample.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+        sample = self.conv_in(sample)
+
+        batch_frames, channels, height, width = sample.shape
+        sample = sample[:, None].reshape(sample_batch_size, sample_num_frames, channels, height, width)
+
+        if self.concat_conditioning_mask:
+            controlnet_cond = torch.cat([controlnet_cond, conditioning_mask], dim=1)
+
+        batch_size, channels, num_frames, height, width = controlnet_cond.shape
+        controlnet_cond = controlnet_cond.permute(0, 2, 1, 3, 4).reshape(
+            batch_size * num_frames, channels, height, width
+        )
+        controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
+        batch_frames, channels, height, width = controlnet_cond.shape
+        controlnet_cond = controlnet_cond[:, None].reshape(batch_size, num_frames, channels, height, width)
+
+        sample = sample + controlnet_cond
+
+        batch_size, num_frames, channels, height, width = sample.shape
+        sample = sample.reshape(sample_batch_size * sample_num_frames, channels, height, width)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = self.mid_block(sample, emb)
+
+        # 5. Control net blocks
+        controlnet_down_block_res_samples = ()
+
+        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
+            down_block_res_sample = controlnet_block(down_block_res_sample)
+            controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
+
+        down_block_res_samples = controlnet_down_block_res_samples
+        mid_block_res_sample = self.controlnet_mid_block(sample)
+
+        # 6. scaling
+        if guess_mode and not self.config.global_pool_conditions:
+            scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device)  # 0.1 to 1.0
+            scales = scales * conditioning_scale
+            down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
+            mid_block_res_sample = mid_block_res_sample * scales[-1]  # last one
+        else:
+            down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
+            mid_block_res_sample = mid_block_res_sample * conditioning_scale
+
+        if self.config.global_pool_conditions:
+            down_block_res_samples = [
+                torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
+            ]
+            mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)
+
+        if not return_dict:
+            return (down_block_res_samples, mid_block_res_sample)
+
+        return SparseControlNetOutput(
+            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
+        )
+
+
+# Copied from diffusers.models.controlnet.zero_module
+def zero_module(module: nn.Module) -> nn.Module:
+    for p in module.parameters():
+        nn.init.zeros_(p)
+    return module
diff --git a/diffusers/src/diffusers/models/controlnet_xs.py b/diffusers/src/diffusers/models/controlnet_xs.py
new file mode 100644
index 0000000000000000000000000000000000000000..f676a70f060a5b7d6b44da8ed4e8553a6182222d
--- /dev/null
+++ b/diffusers/src/diffusers/models/controlnet_xs.py
@@ -0,0 +1,1945 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from math import gcd
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import Tensor, nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, is_torch_version, logging
+from ..utils.torch_utils import apply_freeu
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+    FusedAttnProcessor2_0,
+)
+from .controlnet import ControlNetConditioningEmbedding
+from .embeddings import TimestepEmbedding, Timesteps
+from .modeling_utils import ModelMixin
+from .unets.unet_2d_blocks import (
+    CrossAttnDownBlock2D,
+    CrossAttnUpBlock2D,
+    Downsample2D,
+    ResnetBlock2D,
+    Transformer2DModel,
+    UNetMidBlock2DCrossAttn,
+    Upsample2D,
+)
+from .unets.unet_2d_condition import UNet2DConditionModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class ControlNetXSOutput(BaseOutput):
+    """
+    The output of [`UNetControlNetXSModel`].
+
+    Args:
+        sample (`Tensor` of shape `(batch_size, num_channels, height, width)`):
+            The output of the `UNetControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to the base
+            model output, but is already the final output.
+    """
+
+    sample: Tensor = None
+
+
+class DownBlockControlNetXSAdapter(nn.Module):
+    """Components that together with corresponding components from the base model will form a
+    `ControlNetXSCrossAttnDownBlock2D`"""
+
+    def __init__(
+        self,
+        resnets: nn.ModuleList,
+        base_to_ctrl: nn.ModuleList,
+        ctrl_to_base: nn.ModuleList,
+        attentions: Optional[nn.ModuleList] = None,
+        downsampler: Optional[nn.Conv2d] = None,
+    ):
+        super().__init__()
+        self.resnets = resnets
+        self.base_to_ctrl = base_to_ctrl
+        self.ctrl_to_base = ctrl_to_base
+        self.attentions = attentions
+        self.downsamplers = downsampler
+
+
+class MidBlockControlNetXSAdapter(nn.Module):
+    """Components that together with corresponding components from the base model will form a
+    `ControlNetXSCrossAttnMidBlock2D`"""
+
+    def __init__(self, midblock: UNetMidBlock2DCrossAttn, base_to_ctrl: nn.ModuleList, ctrl_to_base: nn.ModuleList):
+        super().__init__()
+        self.midblock = midblock
+        self.base_to_ctrl = base_to_ctrl
+        self.ctrl_to_base = ctrl_to_base
+
+
+class UpBlockControlNetXSAdapter(nn.Module):
+    """Components that together with corresponding components from the base model will form a `ControlNetXSCrossAttnUpBlock2D`"""
+
+    def __init__(self, ctrl_to_base: nn.ModuleList):
+        super().__init__()
+        self.ctrl_to_base = ctrl_to_base
+
+
+def get_down_block_adapter(
+    base_in_channels: int,
+    base_out_channels: int,
+    ctrl_in_channels: int,
+    ctrl_out_channels: int,
+    temb_channels: int,
+    max_norm_num_groups: Optional[int] = 32,
+    has_crossattn=True,
+    transformer_layers_per_block: Optional[Union[int, Tuple[int]]] = 1,
+    num_attention_heads: Optional[int] = 1,
+    cross_attention_dim: Optional[int] = 1024,
+    add_downsample: bool = True,
+    upcast_attention: Optional[bool] = False,
+    use_linear_projection: Optional[bool] = True,
+):
+    num_layers = 2  # only support sd + sdxl
+
+    resnets = []
+    attentions = []
+    ctrl_to_base = []
+    base_to_ctrl = []
+
+    if isinstance(transformer_layers_per_block, int):
+        transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+    for i in range(num_layers):
+        base_in_channels = base_in_channels if i == 0 else base_out_channels
+        ctrl_in_channels = ctrl_in_channels if i == 0 else ctrl_out_channels
+
+        # Before the resnet/attention application, information is concatted from base to control.
+        # Concat doesn't require change in number of channels
+        base_to_ctrl.append(make_zero_conv(base_in_channels, base_in_channels))
+
+        resnets.append(
+            ResnetBlock2D(
+                in_channels=ctrl_in_channels + base_in_channels,  # information from base is concatted to ctrl
+                out_channels=ctrl_out_channels,
+                temb_channels=temb_channels,
+                groups=find_largest_factor(ctrl_in_channels + base_in_channels, max_factor=max_norm_num_groups),
+                groups_out=find_largest_factor(ctrl_out_channels, max_factor=max_norm_num_groups),
+                eps=1e-5,
+            )
+        )
+
+        if has_crossattn:
+            attentions.append(
+                Transformer2DModel(
+                    num_attention_heads,
+                    ctrl_out_channels // num_attention_heads,
+                    in_channels=ctrl_out_channels,
+                    num_layers=transformer_layers_per_block[i],
+                    cross_attention_dim=cross_attention_dim,
+                    use_linear_projection=use_linear_projection,
+                    upcast_attention=upcast_attention,
+                    norm_num_groups=find_largest_factor(ctrl_out_channels, max_factor=max_norm_num_groups),
+                )
+            )
+
+        # After the resnet/attention application, information is added from control to base
+        # Addition requires change in number of channels
+        ctrl_to_base.append(make_zero_conv(ctrl_out_channels, base_out_channels))
+
+    if add_downsample:
+        # Before the downsampler application, information is concatted from base to control
+        # Concat doesn't require change in number of channels
+        base_to_ctrl.append(make_zero_conv(base_out_channels, base_out_channels))
+
+        downsamplers = Downsample2D(
+            ctrl_out_channels + base_out_channels, use_conv=True, out_channels=ctrl_out_channels, name="op"
+        )
+
+        # After the downsampler application, information is added from control to base
+        # Addition requires change in number of channels
+        ctrl_to_base.append(make_zero_conv(ctrl_out_channels, base_out_channels))
+    else:
+        downsamplers = None
+
+    down_block_components = DownBlockControlNetXSAdapter(
+        resnets=nn.ModuleList(resnets),
+        base_to_ctrl=nn.ModuleList(base_to_ctrl),
+        ctrl_to_base=nn.ModuleList(ctrl_to_base),
+    )
+
+    if has_crossattn:
+        down_block_components.attentions = nn.ModuleList(attentions)
+    if downsamplers is not None:
+        down_block_components.downsamplers = downsamplers
+
+    return down_block_components
+
+
+def get_mid_block_adapter(
+    base_channels: int,
+    ctrl_channels: int,
+    temb_channels: Optional[int] = None,
+    max_norm_num_groups: Optional[int] = 32,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = 1,
+    cross_attention_dim: Optional[int] = 1024,
+    upcast_attention: bool = False,
+    use_linear_projection: bool = True,
+):
+    # Before the midblock application, information is concatted from base to control.
+    # Concat doesn't require change in number of channels
+    base_to_ctrl = make_zero_conv(base_channels, base_channels)
+
+    midblock = UNetMidBlock2DCrossAttn(
+        transformer_layers_per_block=transformer_layers_per_block,
+        in_channels=ctrl_channels + base_channels,
+        out_channels=ctrl_channels,
+        temb_channels=temb_channels,
+        # number or norm groups must divide both in_channels and out_channels
+        resnet_groups=find_largest_factor(gcd(ctrl_channels, ctrl_channels + base_channels), max_norm_num_groups),
+        cross_attention_dim=cross_attention_dim,
+        num_attention_heads=num_attention_heads,
+        use_linear_projection=use_linear_projection,
+        upcast_attention=upcast_attention,
+    )
+
+    # After the midblock application, information is added from control to base
+    # Addition requires change in number of channels
+    ctrl_to_base = make_zero_conv(ctrl_channels, base_channels)
+
+    return MidBlockControlNetXSAdapter(base_to_ctrl=base_to_ctrl, midblock=midblock, ctrl_to_base=ctrl_to_base)
+
+
+def get_up_block_adapter(
+    out_channels: int,
+    prev_output_channel: int,
+    ctrl_skip_channels: List[int],
+):
+    ctrl_to_base = []
+    num_layers = 3  # only support sd + sdxl
+    for i in range(num_layers):
+        resnet_in_channels = prev_output_channel if i == 0 else out_channels
+        ctrl_to_base.append(make_zero_conv(ctrl_skip_channels[i], resnet_in_channels))
+
+    return UpBlockControlNetXSAdapter(ctrl_to_base=nn.ModuleList(ctrl_to_base))
+
+
+class ControlNetXSAdapter(ModelMixin, ConfigMixin):
+    r"""
+    A `ControlNetXSAdapter` model. To use it, pass it into a `UNetControlNetXSModel` (together with a
+    `UNet2DConditionModel` base model).
+
+    This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic
+    methods implemented for all models (such as downloading or saving).
+
+    Like `UNetControlNetXSModel`, `ControlNetXSAdapter` is compatible with StableDiffusion and StableDiffusion-XL. It's
+    default parameters are compatible with StableDiffusion.
+
+    Parameters:
+        conditioning_channels (`int`, defaults to 3):
+            Number of channels of conditioning input (e.g. an image)
+        conditioning_channel_order (`str`, defaults to `"rgb"`):
+            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
+        conditioning_embedding_out_channels (`tuple[int]`, defaults to `(16, 32, 96, 256)`):
+            The tuple of output channels for each block in the `controlnet_cond_embedding` layer.
+        time_embedding_mix (`float`, defaults to 1.0):
+            If 0, then only the control adapters's time embedding is used. If 1, then only the base unet's time
+            embedding is used. Otherwise, both are combined.
+        learn_time_embedding (`bool`, defaults to `False`):
+            Whether a time embedding should be learned. If yes, `UNetControlNetXSModel` will combine the time
+            embeddings of the base model and the control adapter. If no, `UNetControlNetXSModel` will use the base
+            model's time embedding.
+        num_attention_heads (`list[int]`, defaults to `[4]`):
+            The number of attention heads.
+        block_out_channels (`list[int]`, defaults to `[4, 8, 16, 16]`):
+            The tuple of output channels for each block.
+        base_block_out_channels (`list[int]`, defaults to `[320, 640, 1280, 1280]`):
+            The tuple of output channels for each block in the base unet.
+        cross_attention_dim (`int`, defaults to 1024):
+            The dimension of the cross attention features.
+        down_block_types (`list[str]`, defaults to `["CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"]`):
+            The tuple of downsample blocks to use.
+        sample_size (`int`, defaults to 96):
+            Height and width of input/output sample.
+        transformer_layers_per_block (`Union[int, Tuple[int]]`, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        upcast_attention (`bool`, defaults to `True`):
+            Whether the attention computation should always be upcasted.
+        max_norm_num_groups (`int`, defaults to 32):
+            Maximum number of groups in group normal. The actual number will be the largest divisor of the respective
+            channels, that is <= max_norm_num_groups.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        conditioning_channels: int = 3,
+        conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        time_embedding_mix: float = 1.0,
+        learn_time_embedding: bool = False,
+        num_attention_heads: Union[int, Tuple[int]] = 4,
+        block_out_channels: Tuple[int] = (4, 8, 16, 16),
+        base_block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        cross_attention_dim: int = 1024,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        sample_size: Optional[int] = 96,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        upcast_attention: bool = True,
+        max_norm_num_groups: int = 32,
+        use_linear_projection: bool = True,
+    ):
+        super().__init__()
+
+        time_embedding_input_dim = base_block_out_channels[0]
+        time_embedding_dim = base_block_out_channels[0] * 4
+
+        # Check inputs
+        if conditioning_channel_order not in ["rgb", "bgr"]:
+            raise ValueError(f"unknown `conditioning_channel_order`: {conditioning_channel_order}")
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(transformer_layers_per_block, (list, tuple)):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+        if not isinstance(cross_attention_dim, (list, tuple)):
+            cross_attention_dim = [cross_attention_dim] * len(down_block_types)
+        # see https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why `ControlNetXSAdapter` takes `num_attention_heads` instead of `attention_head_dim`
+        if not isinstance(num_attention_heads, (list, tuple)):
+            num_attention_heads = [num_attention_heads] * len(down_block_types)
+
+        if len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        # 5 - Create conditioning hint embedding
+        self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
+            conditioning_embedding_channels=block_out_channels[0],
+            block_out_channels=conditioning_embedding_out_channels,
+            conditioning_channels=conditioning_channels,
+        )
+
+        # time
+        if learn_time_embedding:
+            self.time_embedding = TimestepEmbedding(time_embedding_input_dim, time_embedding_dim)
+        else:
+            self.time_embedding = None
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_connections = nn.ModuleList([])
+
+        # input
+        self.conv_in = nn.Conv2d(4, block_out_channels[0], kernel_size=3, padding=1)
+        self.control_to_base_for_conv_in = make_zero_conv(block_out_channels[0], base_block_out_channels[0])
+
+        # down
+        base_out_channels = base_block_out_channels[0]
+        ctrl_out_channels = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            base_in_channels = base_out_channels
+            base_out_channels = base_block_out_channels[i]
+            ctrl_in_channels = ctrl_out_channels
+            ctrl_out_channels = block_out_channels[i]
+            has_crossattn = "CrossAttn" in down_block_type
+            is_final_block = i == len(down_block_types) - 1
+
+            self.down_blocks.append(
+                get_down_block_adapter(
+                    base_in_channels=base_in_channels,
+                    base_out_channels=base_out_channels,
+                    ctrl_in_channels=ctrl_in_channels,
+                    ctrl_out_channels=ctrl_out_channels,
+                    temb_channels=time_embedding_dim,
+                    max_norm_num_groups=max_norm_num_groups,
+                    has_crossattn=has_crossattn,
+                    transformer_layers_per_block=transformer_layers_per_block[i],
+                    num_attention_heads=num_attention_heads[i],
+                    cross_attention_dim=cross_attention_dim[i],
+                    add_downsample=not is_final_block,
+                    upcast_attention=upcast_attention,
+                    use_linear_projection=use_linear_projection,
+                )
+            )
+
+        # mid
+        self.mid_block = get_mid_block_adapter(
+            base_channels=base_block_out_channels[-1],
+            ctrl_channels=block_out_channels[-1],
+            temb_channels=time_embedding_dim,
+            transformer_layers_per_block=transformer_layers_per_block[-1],
+            num_attention_heads=num_attention_heads[-1],
+            cross_attention_dim=cross_attention_dim[-1],
+            upcast_attention=upcast_attention,
+            use_linear_projection=use_linear_projection,
+        )
+
+        # up
+        # The skip connection channels are the output of the conv_in and of all the down subblocks
+        ctrl_skip_channels = [block_out_channels[0]]
+        for i, out_channels in enumerate(block_out_channels):
+            number_of_subblocks = (
+                3 if i < len(block_out_channels) - 1 else 2
+            )  # every block has 3 subblocks, except last one, which has 2 as it has no downsampler
+            ctrl_skip_channels.extend([out_channels] * number_of_subblocks)
+
+        reversed_base_block_out_channels = list(reversed(base_block_out_channels))
+
+        base_out_channels = reversed_base_block_out_channels[0]
+        for i in range(len(down_block_types)):
+            prev_base_output_channel = base_out_channels
+            base_out_channels = reversed_base_block_out_channels[i]
+            ctrl_skip_channels_ = [ctrl_skip_channels.pop() for _ in range(3)]
+
+            self.up_connections.append(
+                get_up_block_adapter(
+                    out_channels=base_out_channels,
+                    prev_output_channel=prev_base_output_channel,
+                    ctrl_skip_channels=ctrl_skip_channels_,
+                )
+            )
+
+    @classmethod
+    def from_unet(
+        cls,
+        unet: UNet2DConditionModel,
+        size_ratio: Optional[float] = None,
+        block_out_channels: Optional[List[int]] = None,
+        num_attention_heads: Optional[List[int]] = None,
+        learn_time_embedding: bool = False,
+        time_embedding_mix: int = 1.0,
+        conditioning_channels: int = 3,
+        conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+    ):
+        r"""
+        Instantiate a [`ControlNetXSAdapter`] from a [`UNet2DConditionModel`].
+
+        Parameters:
+            unet (`UNet2DConditionModel`):
+                The UNet model we want to control. The dimensions of the ControlNetXSAdapter will be adapted to it.
+            size_ratio (float, *optional*, defaults to `None`):
+                When given, block_out_channels is set to a fraction of the base model's block_out_channels. Either this
+                or `block_out_channels` must be given.
+            block_out_channels (`List[int]`, *optional*, defaults to `None`):
+                Down blocks output channels in control model. Either this or `size_ratio` must be given.
+            num_attention_heads (`List[int]`, *optional*, defaults to `None`):
+                The dimension of the attention heads. The naming seems a bit confusing and it is, see
+                https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why.
+            learn_time_embedding (`bool`, defaults to `False`):
+                Whether the `ControlNetXSAdapter` should learn a time embedding.
+            time_embedding_mix (`float`, defaults to 1.0):
+                If 0, then only the control adapter's time embedding is used. If 1, then only the base unet's time
+                embedding is used. Otherwise, both are combined.
+            conditioning_channels (`int`, defaults to 3):
+                Number of channels of conditioning input (e.g. an image)
+            conditioning_channel_order (`str`, defaults to `"rgb"`):
+                The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
+            conditioning_embedding_out_channels (`Tuple[int]`, defaults to `(16, 32, 96, 256)`):
+                The tuple of output channel for each block in the `controlnet_cond_embedding` layer.
+        """
+
+        # Check input
+        fixed_size = block_out_channels is not None
+        relative_size = size_ratio is not None
+        if not (fixed_size ^ relative_size):
+            raise ValueError(
+                "Pass exactly one of `block_out_channels` (for absolute sizing) or `size_ratio` (for relative sizing)."
+            )
+
+        # Create model
+        block_out_channels = block_out_channels or [int(b * size_ratio) for b in unet.config.block_out_channels]
+        if num_attention_heads is None:
+            # The naming seems a bit confusing and it is, see https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why.
+            num_attention_heads = unet.config.attention_head_dim
+
+        model = cls(
+            conditioning_channels=conditioning_channels,
+            conditioning_channel_order=conditioning_channel_order,
+            conditioning_embedding_out_channels=conditioning_embedding_out_channels,
+            time_embedding_mix=time_embedding_mix,
+            learn_time_embedding=learn_time_embedding,
+            num_attention_heads=num_attention_heads,
+            block_out_channels=block_out_channels,
+            base_block_out_channels=unet.config.block_out_channels,
+            cross_attention_dim=unet.config.cross_attention_dim,
+            down_block_types=unet.config.down_block_types,
+            sample_size=unet.config.sample_size,
+            transformer_layers_per_block=unet.config.transformer_layers_per_block,
+            upcast_attention=unet.config.upcast_attention,
+            max_norm_num_groups=unet.config.norm_num_groups,
+            use_linear_projection=unet.config.use_linear_projection,
+        )
+
+        # ensure that the ControlNetXSAdapter is the same dtype as the UNet2DConditionModel
+        model.to(unet.dtype)
+
+        return model
+
+    def forward(self, *args, **kwargs):
+        raise ValueError(
+            "A ControlNetXSAdapter cannot be run by itself. Use it together with a UNet2DConditionModel to instantiate a UNetControlNetXSModel."
+        )
+
+
+class UNetControlNetXSModel(ModelMixin, ConfigMixin):
+    r"""
+    A UNet fused with a ControlNet-XS adapter model
+
+    This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic
+    methods implemented for all models (such as downloading or saving).
+
+    `UNetControlNetXSModel` is compatible with StableDiffusion and StableDiffusion-XL. It's default parameters are
+    compatible with StableDiffusion.
+
+    It's parameters are either passed to the underlying `UNet2DConditionModel` or used exactly like in
+    `ControlNetXSAdapter` . See their documentation for details.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        # unet configs
+        sample_size: Optional[int] = 96,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        norm_num_groups: Optional[int] = 32,
+        cross_attention_dim: Union[int, Tuple[int]] = 1024,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        num_attention_heads: Union[int, Tuple[int]] = 8,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        upcast_attention: bool = True,
+        use_linear_projection: bool = True,
+        time_cond_proj_dim: Optional[int] = None,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        # additional controlnet configs
+        time_embedding_mix: float = 1.0,
+        ctrl_conditioning_channels: int = 3,
+        ctrl_conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        ctrl_conditioning_channel_order: str = "rgb",
+        ctrl_learn_time_embedding: bool = False,
+        ctrl_block_out_channels: Tuple[int] = (4, 8, 16, 16),
+        ctrl_num_attention_heads: Union[int, Tuple[int]] = 4,
+        ctrl_max_norm_num_groups: int = 32,
+    ):
+        super().__init__()
+
+        if time_embedding_mix < 0 or time_embedding_mix > 1:
+            raise ValueError("`time_embedding_mix` needs to be between 0 and 1.")
+        if time_embedding_mix < 1 and not ctrl_learn_time_embedding:
+            raise ValueError("To use `time_embedding_mix` < 1, `ctrl_learn_time_embedding` must be `True`")
+
+        if addition_embed_type is not None and addition_embed_type != "text_time":
+            raise ValueError(
+                "As `UNetControlNetXSModel` currently only supports StableDiffusion and StableDiffusion-XL, `addition_embed_type` must be `None` or `'text_time'`."
+            )
+
+        if not isinstance(transformer_layers_per_block, (list, tuple)):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+        if not isinstance(cross_attention_dim, (list, tuple)):
+            cross_attention_dim = [cross_attention_dim] * len(down_block_types)
+        if not isinstance(num_attention_heads, (list, tuple)):
+            num_attention_heads = [num_attention_heads] * len(down_block_types)
+        if not isinstance(ctrl_num_attention_heads, (list, tuple)):
+            ctrl_num_attention_heads = [ctrl_num_attention_heads] * len(down_block_types)
+
+        base_num_attention_heads = num_attention_heads
+
+        self.in_channels = 4
+
+        # # Input
+        self.base_conv_in = nn.Conv2d(4, block_out_channels[0], kernel_size=3, padding=1)
+        self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
+            conditioning_embedding_channels=ctrl_block_out_channels[0],
+            block_out_channels=ctrl_conditioning_embedding_out_channels,
+            conditioning_channels=ctrl_conditioning_channels,
+        )
+        self.ctrl_conv_in = nn.Conv2d(4, ctrl_block_out_channels[0], kernel_size=3, padding=1)
+        self.control_to_base_for_conv_in = make_zero_conv(ctrl_block_out_channels[0], block_out_channels[0])
+
+        # # Time
+        time_embed_input_dim = block_out_channels[0]
+        time_embed_dim = block_out_channels[0] * 4
+
+        self.base_time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.base_time_embedding = TimestepEmbedding(
+            time_embed_input_dim,
+            time_embed_dim,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+        if ctrl_learn_time_embedding:
+            self.ctrl_time_embedding = TimestepEmbedding(
+                in_channels=time_embed_input_dim, time_embed_dim=time_embed_dim
+            )
+        else:
+            self.ctrl_time_embedding = None
+
+        if addition_embed_type is None:
+            self.base_add_time_proj = None
+            self.base_add_embedding = None
+        else:
+            self.base_add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
+            self.base_add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+
+        # # Create down blocks
+        down_blocks = []
+        base_out_channels = block_out_channels[0]
+        ctrl_out_channels = ctrl_block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            base_in_channels = base_out_channels
+            base_out_channels = block_out_channels[i]
+            ctrl_in_channels = ctrl_out_channels
+            ctrl_out_channels = ctrl_block_out_channels[i]
+            has_crossattn = "CrossAttn" in down_block_type
+            is_final_block = i == len(down_block_types) - 1
+
+            down_blocks.append(
+                ControlNetXSCrossAttnDownBlock2D(
+                    base_in_channels=base_in_channels,
+                    base_out_channels=base_out_channels,
+                    ctrl_in_channels=ctrl_in_channels,
+                    ctrl_out_channels=ctrl_out_channels,
+                    temb_channels=time_embed_dim,
+                    norm_num_groups=norm_num_groups,
+                    ctrl_max_norm_num_groups=ctrl_max_norm_num_groups,
+                    has_crossattn=has_crossattn,
+                    transformer_layers_per_block=transformer_layers_per_block[i],
+                    base_num_attention_heads=base_num_attention_heads[i],
+                    ctrl_num_attention_heads=ctrl_num_attention_heads[i],
+                    cross_attention_dim=cross_attention_dim[i],
+                    add_downsample=not is_final_block,
+                    upcast_attention=upcast_attention,
+                    use_linear_projection=use_linear_projection,
+                )
+            )
+
+        # # Create mid block
+        self.mid_block = ControlNetXSCrossAttnMidBlock2D(
+            base_channels=block_out_channels[-1],
+            ctrl_channels=ctrl_block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            norm_num_groups=norm_num_groups,
+            ctrl_max_norm_num_groups=ctrl_max_norm_num_groups,
+            transformer_layers_per_block=transformer_layers_per_block[-1],
+            base_num_attention_heads=base_num_attention_heads[-1],
+            ctrl_num_attention_heads=ctrl_num_attention_heads[-1],
+            cross_attention_dim=cross_attention_dim[-1],
+            upcast_attention=upcast_attention,
+            use_linear_projection=use_linear_projection,
+        )
+
+        # # Create up blocks
+        up_blocks = []
+        rev_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
+        rev_num_attention_heads = list(reversed(base_num_attention_heads))
+        rev_cross_attention_dim = list(reversed(cross_attention_dim))
+
+        # The skip connection channels are the output of the conv_in and of all the down subblocks
+        ctrl_skip_channels = [ctrl_block_out_channels[0]]
+        for i, out_channels in enumerate(ctrl_block_out_channels):
+            number_of_subblocks = (
+                3 if i < len(ctrl_block_out_channels) - 1 else 2
+            )  # every block has 3 subblocks, except last one, which has 2 as it has no downsampler
+            ctrl_skip_channels.extend([out_channels] * number_of_subblocks)
+
+        reversed_block_out_channels = list(reversed(block_out_channels))
+
+        out_channels = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = out_channels
+            out_channels = reversed_block_out_channels[i]
+            in_channels = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+            ctrl_skip_channels_ = [ctrl_skip_channels.pop() for _ in range(3)]
+
+            has_crossattn = "CrossAttn" in up_block_type
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_blocks.append(
+                ControlNetXSCrossAttnUpBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    prev_output_channel=prev_output_channel,
+                    ctrl_skip_channels=ctrl_skip_channels_,
+                    temb_channels=time_embed_dim,
+                    resolution_idx=i,
+                    has_crossattn=has_crossattn,
+                    transformer_layers_per_block=rev_transformer_layers_per_block[i],
+                    num_attention_heads=rev_num_attention_heads[i],
+                    cross_attention_dim=rev_cross_attention_dim[i],
+                    add_upsample=not is_final_block,
+                    upcast_attention=upcast_attention,
+                    norm_num_groups=norm_num_groups,
+                    use_linear_projection=use_linear_projection,
+                )
+            )
+
+        self.down_blocks = nn.ModuleList(down_blocks)
+        self.up_blocks = nn.ModuleList(up_blocks)
+
+        self.base_conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups)
+        self.base_conv_act = nn.SiLU()
+        self.base_conv_out = nn.Conv2d(block_out_channels[0], 4, kernel_size=3, padding=1)
+
+    @classmethod
+    def from_unet(
+        cls,
+        unet: UNet2DConditionModel,
+        controlnet: Optional[ControlNetXSAdapter] = None,
+        size_ratio: Optional[float] = None,
+        ctrl_block_out_channels: Optional[List[float]] = None,
+        time_embedding_mix: Optional[float] = None,
+        ctrl_optional_kwargs: Optional[Dict] = None,
+    ):
+        r"""
+        Instantiate a [`UNetControlNetXSModel`] from a [`UNet2DConditionModel`] and an optional [`ControlNetXSAdapter`]
+        .
+
+        Parameters:
+            unet (`UNet2DConditionModel`):
+                The UNet model we want to control.
+            controlnet (`ControlNetXSAdapter`):
+                The ConntrolNet-XS adapter with which the UNet will be fused. If none is given, a new ConntrolNet-XS
+                adapter will be created.
+            size_ratio (float, *optional*, defaults to `None`):
+                Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
+            ctrl_block_out_channels (`List[int]`, *optional*, defaults to `None`):
+                Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details,
+                where this parameter is called `block_out_channels`.
+            time_embedding_mix (`float`, *optional*, defaults to None):
+                Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
+            ctrl_optional_kwargs (`Dict`, *optional*, defaults to `None`):
+                Passed to the `init` of the new controlent if no controlent was given.
+        """
+        if controlnet is None:
+            controlnet = ControlNetXSAdapter.from_unet(
+                unet, size_ratio, ctrl_block_out_channels, **ctrl_optional_kwargs
+            )
+        else:
+            if any(
+                o is not None for o in (size_ratio, ctrl_block_out_channels, time_embedding_mix, ctrl_optional_kwargs)
+            ):
+                raise ValueError(
+                    "When a controlnet is passed, none of these parameters should be passed: size_ratio, ctrl_block_out_channels, time_embedding_mix, ctrl_optional_kwargs."
+                )
+
+        # # get params
+        params_for_unet = [
+            "sample_size",
+            "down_block_types",
+            "up_block_types",
+            "block_out_channels",
+            "norm_num_groups",
+            "cross_attention_dim",
+            "transformer_layers_per_block",
+            "addition_embed_type",
+            "addition_time_embed_dim",
+            "upcast_attention",
+            "use_linear_projection",
+            "time_cond_proj_dim",
+            "projection_class_embeddings_input_dim",
+        ]
+        params_for_unet = {k: v for k, v in unet.config.items() if k in params_for_unet}
+        # The naming seems a bit confusing and it is, see https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why.
+        params_for_unet["num_attention_heads"] = unet.config.attention_head_dim
+
+        params_for_controlnet = [
+            "conditioning_channels",
+            "conditioning_embedding_out_channels",
+            "conditioning_channel_order",
+            "learn_time_embedding",
+            "block_out_channels",
+            "num_attention_heads",
+            "max_norm_num_groups",
+        ]
+        params_for_controlnet = {"ctrl_" + k: v for k, v in controlnet.config.items() if k in params_for_controlnet}
+        params_for_controlnet["time_embedding_mix"] = controlnet.config.time_embedding_mix
+
+        # # create model
+        model = cls.from_config({**params_for_unet, **params_for_controlnet})
+
+        # # load weights
+        # from unet
+        modules_from_unet = [
+            "time_embedding",
+            "conv_in",
+            "conv_norm_out",
+            "conv_out",
+        ]
+        for m in modules_from_unet:
+            getattr(model, "base_" + m).load_state_dict(getattr(unet, m).state_dict())
+
+        optional_modules_from_unet = [
+            "add_time_proj",
+            "add_embedding",
+        ]
+        for m in optional_modules_from_unet:
+            if hasattr(unet, m) and getattr(unet, m) is not None:
+                getattr(model, "base_" + m).load_state_dict(getattr(unet, m).state_dict())
+
+        # from controlnet
+        model.controlnet_cond_embedding.load_state_dict(controlnet.controlnet_cond_embedding.state_dict())
+        model.ctrl_conv_in.load_state_dict(controlnet.conv_in.state_dict())
+        if controlnet.time_embedding is not None:
+            model.ctrl_time_embedding.load_state_dict(controlnet.time_embedding.state_dict())
+        model.control_to_base_for_conv_in.load_state_dict(controlnet.control_to_base_for_conv_in.state_dict())
+
+        # from both
+        model.down_blocks = nn.ModuleList(
+            ControlNetXSCrossAttnDownBlock2D.from_modules(b, c)
+            for b, c in zip(unet.down_blocks, controlnet.down_blocks)
+        )
+        model.mid_block = ControlNetXSCrossAttnMidBlock2D.from_modules(unet.mid_block, controlnet.mid_block)
+        model.up_blocks = nn.ModuleList(
+            ControlNetXSCrossAttnUpBlock2D.from_modules(b, c)
+            for b, c in zip(unet.up_blocks, controlnet.up_connections)
+        )
+
+        # ensure that the UNetControlNetXSModel is the same dtype as the UNet2DConditionModel
+        model.to(unet.dtype)
+
+        return model
+
+    def freeze_unet_params(self) -> None:
+        """Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
+        tuning."""
+        # Freeze everything
+        for param in self.parameters():
+            param.requires_grad = True
+
+        # Unfreeze ControlNetXSAdapter
+        base_parts = [
+            "base_time_proj",
+            "base_time_embedding",
+            "base_add_time_proj",
+            "base_add_embedding",
+            "base_conv_in",
+            "base_conv_norm_out",
+            "base_conv_act",
+            "base_conv_out",
+        ]
+        base_parts = [getattr(self, part) for part in base_parts if getattr(self, part) is not None]
+        for part in base_parts:
+            for param in part.parameters():
+                param.requires_grad = False
+
+        for d in self.down_blocks:
+            d.freeze_base_params()
+        self.mid_block.freeze_base_params()
+        for u in self.up_blocks:
+            u.freeze_base_params()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.enable_freeu
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.disable_freeu
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        sample: Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        controlnet_cond: Optional[torch.Tensor] = None,
+        conditioning_scale: Optional[float] = 1.0,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        return_dict: bool = True,
+        apply_control: bool = True,
+    ) -> Union[ControlNetXSOutput, Tuple]:
+        """
+        The [`ControlNetXSModel`] forward method.
+
+        Args:
+            sample (`Tensor`):
+                The noisy input tensor.
+            timestep (`Union[torch.Tensor, float, int]`):
+                The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states.
+            controlnet_cond (`Tensor`):
+                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
+            conditioning_scale (`float`, defaults to `1.0`):
+                How much the control model affects the base model outputs.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
+                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
+                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
+                embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
+                A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
+            added_cond_kwargs (`dict`):
+                Additional conditions for the Stable Diffusion XL UNet.
+            return_dict (`bool`, defaults to `True`):
+                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
+            apply_control (`bool`, defaults to `True`):
+                If `False`, the input is run only through the base model.
+
+        Returns:
+            [`~models.controlnetxs.ControlNetXSOutput`] **or** `tuple`:
+                If `return_dict` is `True`, a [`~models.controlnetxs.ControlNetXSOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+
+        # check channel order
+        if self.config.ctrl_conditioning_channel_order == "bgr":
+            controlnet_cond = torch.flip(controlnet_cond, dims=[1])
+
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.base_time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        if self.config.ctrl_learn_time_embedding and apply_control:
+            ctrl_temb = self.ctrl_time_embedding(t_emb, timestep_cond)
+            base_temb = self.base_time_embedding(t_emb, timestep_cond)
+            interpolation_param = self.config.time_embedding_mix**0.3
+
+            temb = ctrl_temb * interpolation_param + base_temb * (1 - interpolation_param)
+        else:
+            temb = self.base_time_embedding(t_emb)
+
+        # added time & text embeddings
+        aug_emb = None
+
+        if self.config.addition_embed_type is None:
+            pass
+        elif self.config.addition_embed_type == "text_time":
+            # SDXL - style
+            if "text_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if "time_ids" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            time_embeds = self.base_add_time_proj(time_ids.flatten())
+            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+            add_embeds = add_embeds.to(temb.dtype)
+            aug_emb = self.base_add_embedding(add_embeds)
+        else:
+            raise ValueError(
+                f"ControlNet-XS currently only supports StableDiffusion and StableDiffusion-XL, so addition_embed_type = {self.config.addition_embed_type} is currently not supported."
+            )
+
+        temb = temb + aug_emb if aug_emb is not None else temb
+
+        # text embeddings
+        cemb = encoder_hidden_states
+
+        # Preparation
+        h_ctrl = h_base = sample
+        hs_base, hs_ctrl = [], []
+
+        # Cross Control
+        guided_hint = self.controlnet_cond_embedding(controlnet_cond)
+
+        # 1 - conv in & down
+
+        h_base = self.base_conv_in(h_base)
+        h_ctrl = self.ctrl_conv_in(h_ctrl)
+        if guided_hint is not None:
+            h_ctrl += guided_hint
+        if apply_control:
+            h_base = h_base + self.control_to_base_for_conv_in(h_ctrl) * conditioning_scale  # add ctrl -> base
+
+        hs_base.append(h_base)
+        hs_ctrl.append(h_ctrl)
+
+        for down in self.down_blocks:
+            h_base, h_ctrl, residual_hb, residual_hc = down(
+                hidden_states_base=h_base,
+                hidden_states_ctrl=h_ctrl,
+                temb=temb,
+                encoder_hidden_states=cemb,
+                conditioning_scale=conditioning_scale,
+                cross_attention_kwargs=cross_attention_kwargs,
+                attention_mask=attention_mask,
+                apply_control=apply_control,
+            )
+            hs_base.extend(residual_hb)
+            hs_ctrl.extend(residual_hc)
+
+        # 2 - mid
+        h_base, h_ctrl = self.mid_block(
+            hidden_states_base=h_base,
+            hidden_states_ctrl=h_ctrl,
+            temb=temb,
+            encoder_hidden_states=cemb,
+            conditioning_scale=conditioning_scale,
+            cross_attention_kwargs=cross_attention_kwargs,
+            attention_mask=attention_mask,
+            apply_control=apply_control,
+        )
+
+        # 3 - up
+        for up in self.up_blocks:
+            n_resnets = len(up.resnets)
+            skips_hb = hs_base[-n_resnets:]
+            skips_hc = hs_ctrl[-n_resnets:]
+            hs_base = hs_base[:-n_resnets]
+            hs_ctrl = hs_ctrl[:-n_resnets]
+            h_base = up(
+                hidden_states=h_base,
+                res_hidden_states_tuple_base=skips_hb,
+                res_hidden_states_tuple_ctrl=skips_hc,
+                temb=temb,
+                encoder_hidden_states=cemb,
+                conditioning_scale=conditioning_scale,
+                cross_attention_kwargs=cross_attention_kwargs,
+                attention_mask=attention_mask,
+                apply_control=apply_control,
+            )
+
+        # 4 - conv out
+        h_base = self.base_conv_norm_out(h_base)
+        h_base = self.base_conv_act(h_base)
+        h_base = self.base_conv_out(h_base)
+
+        if not return_dict:
+            return (h_base,)
+
+        return ControlNetXSOutput(sample=h_base)
+
+
+class ControlNetXSCrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        base_in_channels: int,
+        base_out_channels: int,
+        ctrl_in_channels: int,
+        ctrl_out_channels: int,
+        temb_channels: int,
+        norm_num_groups: int = 32,
+        ctrl_max_norm_num_groups: int = 32,
+        has_crossattn=True,
+        transformer_layers_per_block: Optional[Union[int, Tuple[int]]] = 1,
+        base_num_attention_heads: Optional[int] = 1,
+        ctrl_num_attention_heads: Optional[int] = 1,
+        cross_attention_dim: Optional[int] = 1024,
+        add_downsample: bool = True,
+        upcast_attention: Optional[bool] = False,
+        use_linear_projection: Optional[bool] = True,
+    ):
+        super().__init__()
+        base_resnets = []
+        base_attentions = []
+        ctrl_resnets = []
+        ctrl_attentions = []
+        ctrl_to_base = []
+        base_to_ctrl = []
+
+        num_layers = 2  # only support sd + sdxl
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            base_in_channels = base_in_channels if i == 0 else base_out_channels
+            ctrl_in_channels = ctrl_in_channels if i == 0 else ctrl_out_channels
+
+            # Before the resnet/attention application, information is concatted from base to control.
+            # Concat doesn't require change in number of channels
+            base_to_ctrl.append(make_zero_conv(base_in_channels, base_in_channels))
+
+            base_resnets.append(
+                ResnetBlock2D(
+                    in_channels=base_in_channels,
+                    out_channels=base_out_channels,
+                    temb_channels=temb_channels,
+                    groups=norm_num_groups,
+                )
+            )
+            ctrl_resnets.append(
+                ResnetBlock2D(
+                    in_channels=ctrl_in_channels + base_in_channels,  # information from base is concatted to ctrl
+                    out_channels=ctrl_out_channels,
+                    temb_channels=temb_channels,
+                    groups=find_largest_factor(
+                        ctrl_in_channels + base_in_channels, max_factor=ctrl_max_norm_num_groups
+                    ),
+                    groups_out=find_largest_factor(ctrl_out_channels, max_factor=ctrl_max_norm_num_groups),
+                    eps=1e-5,
+                )
+            )
+
+            if has_crossattn:
+                base_attentions.append(
+                    Transformer2DModel(
+                        base_num_attention_heads,
+                        base_out_channels // base_num_attention_heads,
+                        in_channels=base_out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        norm_num_groups=norm_num_groups,
+                    )
+                )
+                ctrl_attentions.append(
+                    Transformer2DModel(
+                        ctrl_num_attention_heads,
+                        ctrl_out_channels // ctrl_num_attention_heads,
+                        in_channels=ctrl_out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        norm_num_groups=find_largest_factor(ctrl_out_channels, max_factor=ctrl_max_norm_num_groups),
+                    )
+                )
+
+            # After the resnet/attention application, information is added from control to base
+            # Addition requires change in number of channels
+            ctrl_to_base.append(make_zero_conv(ctrl_out_channels, base_out_channels))
+
+        if add_downsample:
+            # Before the downsampler application, information is concatted from base to control
+            # Concat doesn't require change in number of channels
+            base_to_ctrl.append(make_zero_conv(base_out_channels, base_out_channels))
+
+            self.base_downsamplers = Downsample2D(
+                base_out_channels, use_conv=True, out_channels=base_out_channels, name="op"
+            )
+            self.ctrl_downsamplers = Downsample2D(
+                ctrl_out_channels + base_out_channels, use_conv=True, out_channels=ctrl_out_channels, name="op"
+            )
+
+            # After the downsampler application, information is added from control to base
+            # Addition requires change in number of channels
+            ctrl_to_base.append(make_zero_conv(ctrl_out_channels, base_out_channels))
+        else:
+            self.base_downsamplers = None
+            self.ctrl_downsamplers = None
+
+        self.base_resnets = nn.ModuleList(base_resnets)
+        self.ctrl_resnets = nn.ModuleList(ctrl_resnets)
+        self.base_attentions = nn.ModuleList(base_attentions) if has_crossattn else [None] * num_layers
+        self.ctrl_attentions = nn.ModuleList(ctrl_attentions) if has_crossattn else [None] * num_layers
+        self.base_to_ctrl = nn.ModuleList(base_to_ctrl)
+        self.ctrl_to_base = nn.ModuleList(ctrl_to_base)
+
+        self.gradient_checkpointing = False
+
+    @classmethod
+    def from_modules(cls, base_downblock: CrossAttnDownBlock2D, ctrl_downblock: DownBlockControlNetXSAdapter):
+        # get params
+        def get_first_cross_attention(block):
+            return block.attentions[0].transformer_blocks[0].attn2
+
+        base_in_channels = base_downblock.resnets[0].in_channels
+        base_out_channels = base_downblock.resnets[0].out_channels
+        ctrl_in_channels = (
+            ctrl_downblock.resnets[0].in_channels - base_in_channels
+        )  # base channels are concatted to ctrl channels in init
+        ctrl_out_channels = ctrl_downblock.resnets[0].out_channels
+        temb_channels = base_downblock.resnets[0].time_emb_proj.in_features
+        num_groups = base_downblock.resnets[0].norm1.num_groups
+        ctrl_num_groups = ctrl_downblock.resnets[0].norm1.num_groups
+        if hasattr(base_downblock, "attentions"):
+            has_crossattn = True
+            transformer_layers_per_block = len(base_downblock.attentions[0].transformer_blocks)
+            base_num_attention_heads = get_first_cross_attention(base_downblock).heads
+            ctrl_num_attention_heads = get_first_cross_attention(ctrl_downblock).heads
+            cross_attention_dim = get_first_cross_attention(base_downblock).cross_attention_dim
+            upcast_attention = get_first_cross_attention(base_downblock).upcast_attention
+            use_linear_projection = base_downblock.attentions[0].use_linear_projection
+        else:
+            has_crossattn = False
+            transformer_layers_per_block = None
+            base_num_attention_heads = None
+            ctrl_num_attention_heads = None
+            cross_attention_dim = None
+            upcast_attention = None
+            use_linear_projection = None
+        add_downsample = base_downblock.downsamplers is not None
+
+        # create model
+        model = cls(
+            base_in_channels=base_in_channels,
+            base_out_channels=base_out_channels,
+            ctrl_in_channels=ctrl_in_channels,
+            ctrl_out_channels=ctrl_out_channels,
+            temb_channels=temb_channels,
+            norm_num_groups=num_groups,
+            ctrl_max_norm_num_groups=ctrl_num_groups,
+            has_crossattn=has_crossattn,
+            transformer_layers_per_block=transformer_layers_per_block,
+            base_num_attention_heads=base_num_attention_heads,
+            ctrl_num_attention_heads=ctrl_num_attention_heads,
+            cross_attention_dim=cross_attention_dim,
+            add_downsample=add_downsample,
+            upcast_attention=upcast_attention,
+            use_linear_projection=use_linear_projection,
+        )
+
+        # # load weights
+        model.base_resnets.load_state_dict(base_downblock.resnets.state_dict())
+        model.ctrl_resnets.load_state_dict(ctrl_downblock.resnets.state_dict())
+        if has_crossattn:
+            model.base_attentions.load_state_dict(base_downblock.attentions.state_dict())
+            model.ctrl_attentions.load_state_dict(ctrl_downblock.attentions.state_dict())
+        if add_downsample:
+            model.base_downsamplers.load_state_dict(base_downblock.downsamplers[0].state_dict())
+            model.ctrl_downsamplers.load_state_dict(ctrl_downblock.downsamplers.state_dict())
+        model.base_to_ctrl.load_state_dict(ctrl_downblock.base_to_ctrl.state_dict())
+        model.ctrl_to_base.load_state_dict(ctrl_downblock.ctrl_to_base.state_dict())
+
+        return model
+
+    def freeze_base_params(self) -> None:
+        """Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
+        tuning."""
+        # Unfreeze everything
+        for param in self.parameters():
+            param.requires_grad = True
+
+        # Freeze base part
+        base_parts = [self.base_resnets]
+        if isinstance(self.base_attentions, nn.ModuleList):  # attentions can be a list of Nones
+            base_parts.append(self.base_attentions)
+        if self.base_downsamplers is not None:
+            base_parts.append(self.base_downsamplers)
+        for part in base_parts:
+            for param in part.parameters():
+                param.requires_grad = False
+
+    def forward(
+        self,
+        hidden_states_base: Tensor,
+        temb: Tensor,
+        encoder_hidden_states: Optional[Tensor] = None,
+        hidden_states_ctrl: Optional[Tensor] = None,
+        conditioning_scale: Optional[float] = 1.0,
+        attention_mask: Optional[Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[Tensor] = None,
+        apply_control: bool = True,
+    ) -> Tuple[Tensor, Tensor, Tuple[Tensor, ...], Tuple[Tensor, ...]]:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        h_base = hidden_states_base
+        h_ctrl = hidden_states_ctrl
+
+        base_output_states = ()
+        ctrl_output_states = ()
+
+        base_blocks = list(zip(self.base_resnets, self.base_attentions))
+        ctrl_blocks = list(zip(self.ctrl_resnets, self.ctrl_attentions))
+
+        def create_custom_forward(module, return_dict=None):
+            def custom_forward(*inputs):
+                if return_dict is not None:
+                    return module(*inputs, return_dict=return_dict)
+                else:
+                    return module(*inputs)
+
+            return custom_forward
+
+        for (b_res, b_attn), (c_res, c_attn), b2c, c2b in zip(
+            base_blocks, ctrl_blocks, self.base_to_ctrl, self.ctrl_to_base
+        ):
+            # concat base -> ctrl
+            if apply_control:
+                h_ctrl = torch.cat([h_ctrl, b2c(h_base)], dim=1)
+
+            # apply base subblock
+            if self.training and self.gradient_checkpointing:
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                h_base = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(b_res),
+                    h_base,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                h_base = b_res(h_base, temb)
+
+            if b_attn is not None:
+                h_base = b_attn(
+                    h_base,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+
+            # apply ctrl subblock
+            if apply_control:
+                if self.training and self.gradient_checkpointing:
+                    ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                    h_ctrl = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(c_res),
+                        h_ctrl,
+                        temb,
+                        **ckpt_kwargs,
+                    )
+                else:
+                    h_ctrl = c_res(h_ctrl, temb)
+                if c_attn is not None:
+                    h_ctrl = c_attn(
+                        h_ctrl,
+                        encoder_hidden_states=encoder_hidden_states,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        attention_mask=attention_mask,
+                        encoder_attention_mask=encoder_attention_mask,
+                        return_dict=False,
+                    )[0]
+
+            # add ctrl -> base
+            if apply_control:
+                h_base = h_base + c2b(h_ctrl) * conditioning_scale
+
+            base_output_states = base_output_states + (h_base,)
+            ctrl_output_states = ctrl_output_states + (h_ctrl,)
+
+        if self.base_downsamplers is not None:  # if we have a base_downsampler, then also a ctrl_downsampler
+            b2c = self.base_to_ctrl[-1]
+            c2b = self.ctrl_to_base[-1]
+
+            # concat base -> ctrl
+            if apply_control:
+                h_ctrl = torch.cat([h_ctrl, b2c(h_base)], dim=1)
+            # apply base subblock
+            h_base = self.base_downsamplers(h_base)
+            # apply ctrl subblock
+            if apply_control:
+                h_ctrl = self.ctrl_downsamplers(h_ctrl)
+            # add ctrl -> base
+            if apply_control:
+                h_base = h_base + c2b(h_ctrl) * conditioning_scale
+
+            base_output_states = base_output_states + (h_base,)
+            ctrl_output_states = ctrl_output_states + (h_ctrl,)
+
+        return h_base, h_ctrl, base_output_states, ctrl_output_states
+
+
+class ControlNetXSCrossAttnMidBlock2D(nn.Module):
+    def __init__(
+        self,
+        base_channels: int,
+        ctrl_channels: int,
+        temb_channels: Optional[int] = None,
+        norm_num_groups: int = 32,
+        ctrl_max_norm_num_groups: int = 32,
+        transformer_layers_per_block: int = 1,
+        base_num_attention_heads: Optional[int] = 1,
+        ctrl_num_attention_heads: Optional[int] = 1,
+        cross_attention_dim: Optional[int] = 1024,
+        upcast_attention: bool = False,
+        use_linear_projection: Optional[bool] = True,
+    ):
+        super().__init__()
+
+        # Before the midblock application, information is concatted from base to control.
+        # Concat doesn't require change in number of channels
+        self.base_to_ctrl = make_zero_conv(base_channels, base_channels)
+
+        self.base_midblock = UNetMidBlock2DCrossAttn(
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=base_channels,
+            temb_channels=temb_channels,
+            resnet_groups=norm_num_groups,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=base_num_attention_heads,
+            use_linear_projection=use_linear_projection,
+            upcast_attention=upcast_attention,
+        )
+
+        self.ctrl_midblock = UNetMidBlock2DCrossAttn(
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=ctrl_channels + base_channels,
+            out_channels=ctrl_channels,
+            temb_channels=temb_channels,
+            # number or norm groups must divide both in_channels and out_channels
+            resnet_groups=find_largest_factor(
+                gcd(ctrl_channels, ctrl_channels + base_channels), ctrl_max_norm_num_groups
+            ),
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=ctrl_num_attention_heads,
+            use_linear_projection=use_linear_projection,
+            upcast_attention=upcast_attention,
+        )
+
+        # After the midblock application, information is added from control to base
+        # Addition requires change in number of channels
+        self.ctrl_to_base = make_zero_conv(ctrl_channels, base_channels)
+
+        self.gradient_checkpointing = False
+
+    @classmethod
+    def from_modules(
+        cls,
+        base_midblock: UNetMidBlock2DCrossAttn,
+        ctrl_midblock: MidBlockControlNetXSAdapter,
+    ):
+        base_to_ctrl = ctrl_midblock.base_to_ctrl
+        ctrl_to_base = ctrl_midblock.ctrl_to_base
+        ctrl_midblock = ctrl_midblock.midblock
+
+        # get params
+        def get_first_cross_attention(midblock):
+            return midblock.attentions[0].transformer_blocks[0].attn2
+
+        base_channels = ctrl_to_base.out_channels
+        ctrl_channels = ctrl_to_base.in_channels
+        transformer_layers_per_block = len(base_midblock.attentions[0].transformer_blocks)
+        temb_channels = base_midblock.resnets[0].time_emb_proj.in_features
+        num_groups = base_midblock.resnets[0].norm1.num_groups
+        ctrl_num_groups = ctrl_midblock.resnets[0].norm1.num_groups
+        base_num_attention_heads = get_first_cross_attention(base_midblock).heads
+        ctrl_num_attention_heads = get_first_cross_attention(ctrl_midblock).heads
+        cross_attention_dim = get_first_cross_attention(base_midblock).cross_attention_dim
+        upcast_attention = get_first_cross_attention(base_midblock).upcast_attention
+        use_linear_projection = base_midblock.attentions[0].use_linear_projection
+
+        # create model
+        model = cls(
+            base_channels=base_channels,
+            ctrl_channels=ctrl_channels,
+            temb_channels=temb_channels,
+            norm_num_groups=num_groups,
+            ctrl_max_norm_num_groups=ctrl_num_groups,
+            transformer_layers_per_block=transformer_layers_per_block,
+            base_num_attention_heads=base_num_attention_heads,
+            ctrl_num_attention_heads=ctrl_num_attention_heads,
+            cross_attention_dim=cross_attention_dim,
+            upcast_attention=upcast_attention,
+            use_linear_projection=use_linear_projection,
+        )
+
+        # load weights
+        model.base_to_ctrl.load_state_dict(base_to_ctrl.state_dict())
+        model.base_midblock.load_state_dict(base_midblock.state_dict())
+        model.ctrl_midblock.load_state_dict(ctrl_midblock.state_dict())
+        model.ctrl_to_base.load_state_dict(ctrl_to_base.state_dict())
+
+        return model
+
+    def freeze_base_params(self) -> None:
+        """Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
+        tuning."""
+        # Unfreeze everything
+        for param in self.parameters():
+            param.requires_grad = True
+
+        # Freeze base part
+        for param in self.base_midblock.parameters():
+            param.requires_grad = False
+
+    def forward(
+        self,
+        hidden_states_base: Tensor,
+        temb: Tensor,
+        encoder_hidden_states: Tensor,
+        hidden_states_ctrl: Optional[Tensor] = None,
+        conditioning_scale: Optional[float] = 1.0,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        attention_mask: Optional[Tensor] = None,
+        encoder_attention_mask: Optional[Tensor] = None,
+        apply_control: bool = True,
+    ) -> Tuple[Tensor, Tensor]:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        h_base = hidden_states_base
+        h_ctrl = hidden_states_ctrl
+
+        joint_args = {
+            "temb": temb,
+            "encoder_hidden_states": encoder_hidden_states,
+            "attention_mask": attention_mask,
+            "cross_attention_kwargs": cross_attention_kwargs,
+            "encoder_attention_mask": encoder_attention_mask,
+        }
+
+        if apply_control:
+            h_ctrl = torch.cat([h_ctrl, self.base_to_ctrl(h_base)], dim=1)  # concat base -> ctrl
+        h_base = self.base_midblock(h_base, **joint_args)  # apply base mid block
+        if apply_control:
+            h_ctrl = self.ctrl_midblock(h_ctrl, **joint_args)  # apply ctrl mid block
+            h_base = h_base + self.ctrl_to_base(h_ctrl) * conditioning_scale  # add ctrl -> base
+
+        return h_base, h_ctrl
+
+
+class ControlNetXSCrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        ctrl_skip_channels: List[int],
+        temb_channels: int,
+        norm_num_groups: int = 32,
+        resolution_idx: Optional[int] = None,
+        has_crossattn=True,
+        transformer_layers_per_block: int = 1,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1024,
+        add_upsample: bool = True,
+        upcast_attention: bool = False,
+        use_linear_projection: Optional[bool] = True,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        ctrl_to_base = []
+
+        num_layers = 3  # only support sd + sdxl
+
+        self.has_cross_attention = has_crossattn
+        self.num_attention_heads = num_attention_heads
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            ctrl_to_base.append(make_zero_conv(ctrl_skip_channels[i], resnet_in_channels))
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    groups=norm_num_groups,
+                )
+            )
+
+            if has_crossattn:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        norm_num_groups=norm_num_groups,
+                    )
+                )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.attentions = nn.ModuleList(attentions) if has_crossattn else [None] * num_layers
+        self.ctrl_to_base = nn.ModuleList(ctrl_to_base)
+
+        if add_upsample:
+            self.upsamplers = Upsample2D(out_channels, use_conv=True, out_channels=out_channels)
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    @classmethod
+    def from_modules(cls, base_upblock: CrossAttnUpBlock2D, ctrl_upblock: UpBlockControlNetXSAdapter):
+        ctrl_to_base_skip_connections = ctrl_upblock.ctrl_to_base
+
+        # get params
+        def get_first_cross_attention(block):
+            return block.attentions[0].transformer_blocks[0].attn2
+
+        out_channels = base_upblock.resnets[0].out_channels
+        in_channels = base_upblock.resnets[-1].in_channels - out_channels
+        prev_output_channels = base_upblock.resnets[0].in_channels - out_channels
+        ctrl_skip_channelss = [c.in_channels for c in ctrl_to_base_skip_connections]
+        temb_channels = base_upblock.resnets[0].time_emb_proj.in_features
+        num_groups = base_upblock.resnets[0].norm1.num_groups
+        resolution_idx = base_upblock.resolution_idx
+        if hasattr(base_upblock, "attentions"):
+            has_crossattn = True
+            transformer_layers_per_block = len(base_upblock.attentions[0].transformer_blocks)
+            num_attention_heads = get_first_cross_attention(base_upblock).heads
+            cross_attention_dim = get_first_cross_attention(base_upblock).cross_attention_dim
+            upcast_attention = get_first_cross_attention(base_upblock).upcast_attention
+            use_linear_projection = base_upblock.attentions[0].use_linear_projection
+        else:
+            has_crossattn = False
+            transformer_layers_per_block = None
+            num_attention_heads = None
+            cross_attention_dim = None
+            upcast_attention = None
+            use_linear_projection = None
+        add_upsample = base_upblock.upsamplers is not None
+
+        # create model
+        model = cls(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channels,
+            ctrl_skip_channels=ctrl_skip_channelss,
+            temb_channels=temb_channels,
+            norm_num_groups=num_groups,
+            resolution_idx=resolution_idx,
+            has_crossattn=has_crossattn,
+            transformer_layers_per_block=transformer_layers_per_block,
+            num_attention_heads=num_attention_heads,
+            cross_attention_dim=cross_attention_dim,
+            add_upsample=add_upsample,
+            upcast_attention=upcast_attention,
+            use_linear_projection=use_linear_projection,
+        )
+
+        # load weights
+        model.resnets.load_state_dict(base_upblock.resnets.state_dict())
+        if has_crossattn:
+            model.attentions.load_state_dict(base_upblock.attentions.state_dict())
+        if add_upsample:
+            model.upsamplers.load_state_dict(base_upblock.upsamplers[0].state_dict())
+        model.ctrl_to_base.load_state_dict(ctrl_to_base_skip_connections.state_dict())
+
+        return model
+
+    def freeze_base_params(self) -> None:
+        """Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
+        tuning."""
+        # Unfreeze everything
+        for param in self.parameters():
+            param.requires_grad = True
+
+        # Freeze base part
+        base_parts = [self.resnets]
+        if isinstance(self.attentions, nn.ModuleList):  # attentions can be a list of Nones
+            base_parts.append(self.attentions)
+        if self.upsamplers is not None:
+            base_parts.append(self.upsamplers)
+        for part in base_parts:
+            for param in part.parameters():
+                param.requires_grad = False
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        res_hidden_states_tuple_base: Tuple[Tensor, ...],
+        res_hidden_states_tuple_ctrl: Tuple[Tensor, ...],
+        temb: Tensor,
+        encoder_hidden_states: Optional[Tensor] = None,
+        conditioning_scale: Optional[float] = 1.0,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        attention_mask: Optional[Tensor] = None,
+        upsample_size: Optional[int] = None,
+        encoder_attention_mask: Optional[Tensor] = None,
+        apply_control: bool = True,
+    ) -> Tensor:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        def create_custom_forward(module, return_dict=None):
+            def custom_forward(*inputs):
+                if return_dict is not None:
+                    return module(*inputs, return_dict=return_dict)
+                else:
+                    return module(*inputs)
+
+            return custom_forward
+
+        def maybe_apply_freeu_to_subblock(hidden_states, res_h_base):
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                return apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_h_base,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+            else:
+                return hidden_states, res_h_base
+
+        for resnet, attn, c2b, res_h_base, res_h_ctrl in zip(
+            self.resnets,
+            self.attentions,
+            self.ctrl_to_base,
+            reversed(res_hidden_states_tuple_base),
+            reversed(res_hidden_states_tuple_ctrl),
+        ):
+            if apply_control:
+                hidden_states += c2b(res_h_ctrl) * conditioning_scale
+
+            hidden_states, res_h_base = maybe_apply_freeu_to_subblock(hidden_states, res_h_base)
+            hidden_states = torch.cat([hidden_states, res_h_base], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+
+            if attn is not None:
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+
+        if self.upsamplers is not None:
+            hidden_states = self.upsamplers(hidden_states, upsample_size)
+
+        return hidden_states
+
+
+def make_zero_conv(in_channels, out_channels=None):
+    return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0))
+
+
+def zero_module(module):
+    for p in module.parameters():
+        nn.init.zeros_(p)
+    return module
+
+
+def find_largest_factor(number, max_factor):
+    factor = max_factor
+    if factor >= number:
+        return number
+    while factor != 0:
+        residual = number % factor
+        if residual == 0:
+            return factor
+        factor -= 1
diff --git a/diffusers/src/diffusers/models/downsampling.py b/diffusers/src/diffusers/models/downsampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ac8953e3dccd0efdd9eae0e2be315bdeaf1d494
--- /dev/null
+++ b/diffusers/src/diffusers/models/downsampling.py
@@ -0,0 +1,401 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import deprecate
+from .normalization import RMSNorm
+from .upsampling import upfirdn2d_native
+
+
+class Downsample1D(nn.Module):
+    """A 1D downsampling layer with an optional convolution.
+
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        padding (`int`, default `1`):
+            padding for the convolution.
+        name (`str`, default `conv`):
+            name of the downsampling 1D layer.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = False,
+        out_channels: Optional[int] = None,
+        padding: int = 1,
+        name: str = "conv",
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        stride = 2
+        self.name = name
+
+        if use_conv:
+            self.conv = nn.Conv1d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
+        else:
+            assert self.channels == self.out_channels
+            self.conv = nn.AvgPool1d(kernel_size=stride, stride=stride)
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        assert inputs.shape[1] == self.channels
+        return self.conv(inputs)
+
+
+class Downsample2D(nn.Module):
+    """A 2D downsampling layer with an optional convolution.
+
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        padding (`int`, default `1`):
+            padding for the convolution.
+        name (`str`, default `conv`):
+            name of the downsampling 2D layer.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = False,
+        out_channels: Optional[int] = None,
+        padding: int = 1,
+        name: str = "conv",
+        kernel_size=3,
+        norm_type=None,
+        eps=None,
+        elementwise_affine=None,
+        bias=True,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        stride = 2
+        self.name = name
+
+        if norm_type == "ln_norm":
+            self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(channels, eps, elementwise_affine)
+        elif norm_type is None:
+            self.norm = None
+        else:
+            raise ValueError(f"unknown norm_type: {norm_type}")
+
+        if use_conv:
+            conv = nn.Conv2d(
+                self.channels, self.out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias
+            )
+        else:
+            assert self.channels == self.out_channels
+            conv = nn.AvgPool2d(kernel_size=stride, stride=stride)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if name == "conv":
+            self.Conv2d_0 = conv
+            self.conv = conv
+        elif name == "Conv2d_0":
+            self.conv = conv
+        else:
+            self.conv = conv
+
+    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+        assert hidden_states.shape[1] == self.channels
+
+        if self.norm is not None:
+            hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+
+        if self.use_conv and self.padding == 0:
+            pad = (0, 1, 0, 1)
+            hidden_states = F.pad(hidden_states, pad, mode="constant", value=0)
+
+        assert hidden_states.shape[1] == self.channels
+
+        hidden_states = self.conv(hidden_states)
+
+        return hidden_states
+
+
+class FirDownsample2D(nn.Module):
+    """A 2D FIR downsampling layer with an optional convolution.
+
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        fir_kernel (`tuple`, default `(1, 3, 3, 1)`):
+            kernel for the FIR filter.
+    """
+
+    def __init__(
+        self,
+        channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        use_conv: bool = False,
+        fir_kernel: Tuple[int, int, int, int] = (1, 3, 3, 1),
+    ):
+        super().__init__()
+        out_channels = out_channels if out_channels else channels
+        if use_conv:
+            self.Conv2d_0 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.fir_kernel = fir_kernel
+        self.use_conv = use_conv
+        self.out_channels = out_channels
+
+    def _downsample_2d(
+        self,
+        hidden_states: torch.Tensor,
+        weight: Optional[torch.Tensor] = None,
+        kernel: Optional[torch.Tensor] = None,
+        factor: int = 2,
+        gain: float = 1,
+    ) -> torch.Tensor:
+        """Fused `Conv2d()` followed by `downsample_2d()`.
+        Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
+        efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of
+        arbitrary order.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+            weight (`torch.Tensor`, *optional*):
+                Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be
+                performed by `inChannels = x.shape[0] // numGroups`.
+            kernel (`torch.Tensor`, *optional*):
+                FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which
+                corresponds to average pooling.
+            factor (`int`, *optional*, default to `2`):
+                Integer downsampling factor.
+            gain (`float`, *optional*, default to `1.0`):
+                Scaling factor for signal magnitude.
+
+        Returns:
+            output (`torch.Tensor`):
+                Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and same
+                datatype as `x`.
+        """
+
+        assert isinstance(factor, int) and factor >= 1
+        if kernel is None:
+            kernel = [1] * factor
+
+        # setup kernel
+        kernel = torch.tensor(kernel, dtype=torch.float32)
+        if kernel.ndim == 1:
+            kernel = torch.outer(kernel, kernel)
+        kernel /= torch.sum(kernel)
+
+        kernel = kernel * gain
+
+        if self.use_conv:
+            _, _, convH, convW = weight.shape
+            pad_value = (kernel.shape[0] - factor) + (convW - 1)
+            stride_value = [factor, factor]
+            upfirdn_input = upfirdn2d_native(
+                hidden_states,
+                torch.tensor(kernel, device=hidden_states.device),
+                pad=((pad_value + 1) // 2, pad_value // 2),
+            )
+            output = F.conv2d(upfirdn_input, weight, stride=stride_value, padding=0)
+        else:
+            pad_value = kernel.shape[0] - factor
+            output = upfirdn2d_native(
+                hidden_states,
+                torch.tensor(kernel, device=hidden_states.device),
+                down=factor,
+                pad=((pad_value + 1) // 2, pad_value // 2),
+            )
+
+        return output
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if self.use_conv:
+            downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel)
+            hidden_states = downsample_input + self.Conv2d_0.bias.reshape(1, -1, 1, 1)
+        else:
+            hidden_states = self._downsample_2d(hidden_states, kernel=self.fir_kernel, factor=2)
+
+        return hidden_states
+
+
+# downsample/upsample layer used in k-upscaler, might be able to use FirDownsample2D/DirUpsample2D instead
+class KDownsample2D(nn.Module):
+    r"""A 2D K-downsampling layer.
+
+    Parameters:
+        pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use.
+    """
+
+    def __init__(self, pad_mode: str = "reflect"):
+        super().__init__()
+        self.pad_mode = pad_mode
+        kernel_1d = torch.tensor([[1 / 8, 3 / 8, 3 / 8, 1 / 8]])
+        self.pad = kernel_1d.shape[1] // 2 - 1
+        self.register_buffer("kernel", kernel_1d.T @ kernel_1d, persistent=False)
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        inputs = F.pad(inputs, (self.pad,) * 4, self.pad_mode)
+        weight = inputs.new_zeros(
+            [
+                inputs.shape[1],
+                inputs.shape[1],
+                self.kernel.shape[0],
+                self.kernel.shape[1],
+            ]
+        )
+        indices = torch.arange(inputs.shape[1], device=inputs.device)
+        kernel = self.kernel.to(weight)[None, :].expand(inputs.shape[1], -1, -1)
+        weight[indices, indices] = kernel
+        return F.conv2d(inputs, weight, stride=2)
+
+
+class CogVideoXDownsample3D(nn.Module):
+    # Todo: Wait for paper relase.
+    r"""
+    A 3D Downsampling layer using in [CogVideoX]() by Tsinghua University & ZhipuAI
+
+    Args:
+        in_channels (`int`):
+            Number of channels in the input image.
+        out_channels (`int`):
+            Number of channels produced by the convolution.
+        kernel_size (`int`, defaults to `3`):
+            Size of the convolving kernel.
+        stride (`int`, defaults to `2`):
+            Stride of the convolution.
+        padding (`int`, defaults to `0`):
+            Padding added to all four sides of the input.
+        compress_time (`bool`, defaults to `False`):
+            Whether or not to compress the time dimension.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        stride: int = 2,
+        padding: int = 0,
+        compress_time: bool = False,
+    ):
+        super().__init__()
+
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.compress_time:
+            batch_size, channels, frames, height, width = x.shape
+
+            # (batch_size, channels, frames, height, width) -> (batch_size, height, width, channels, frames) -> (batch_size * height * width, channels, frames)
+            x = x.permute(0, 3, 4, 1, 2).reshape(batch_size * height * width, channels, frames)
+
+            if x.shape[-1] % 2 == 1:
+                x_first, x_rest = x[..., 0], x[..., 1:]
+                if x_rest.shape[-1] > 0:
+                    # (batch_size * height * width, channels, frames - 1) -> (batch_size * height * width, channels, (frames - 1) // 2)
+                    x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
+
+                x = torch.cat([x_first[..., None], x_rest], dim=-1)
+                # (batch_size * height * width, channels, (frames // 2) + 1) -> (batch_size, height, width, channels, (frames // 2) + 1) -> (batch_size, channels, (frames // 2) + 1, height, width)
+                x = x.reshape(batch_size, height, width, channels, x.shape[-1]).permute(0, 3, 4, 1, 2)
+            else:
+                # (batch_size * height * width, channels, frames) -> (batch_size * height * width, channels, frames // 2)
+                x = F.avg_pool1d(x, kernel_size=2, stride=2)
+                # (batch_size * height * width, channels, frames // 2) -> (batch_size, height, width, channels, frames // 2) -> (batch_size, channels, frames // 2, height, width)
+                x = x.reshape(batch_size, height, width, channels, x.shape[-1]).permute(0, 3, 4, 1, 2)
+
+        # Pad the tensor
+        pad = (0, 1, 0, 1)
+        x = F.pad(x, pad, mode="constant", value=0)
+        batch_size, channels, frames, height, width = x.shape
+        # (batch_size, channels, frames, height, width) -> (batch_size, frames, channels, height, width) -> (batch_size * frames, channels, height, width)
+        x = x.permute(0, 2, 1, 3, 4).reshape(batch_size * frames, channels, height, width)
+        x = self.conv(x)
+        # (batch_size * frames, channels, height, width) -> (batch_size, frames, channels, height, width) -> (batch_size, channels, frames, height, width)
+        x = x.reshape(batch_size, frames, x.shape[1], x.shape[2], x.shape[3]).permute(0, 2, 1, 3, 4)
+        return x
+
+
+def downsample_2d(
+    hidden_states: torch.Tensor,
+    kernel: Optional[torch.Tensor] = None,
+    factor: int = 2,
+    gain: float = 1,
+) -> torch.Tensor:
+    r"""Downsample2D a batch of 2D images with the given filter.
+    Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the
+    given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the
+    specified `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its
+    shape is a multiple of the downsampling factor.
+
+    Args:
+        hidden_states (`torch.Tensor`)
+            Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+        kernel (`torch.Tensor`, *optional*):
+            FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which
+            corresponds to average pooling.
+        factor (`int`, *optional*, default to `2`):
+            Integer downsampling factor.
+        gain (`float`, *optional*, default to `1.0`):
+            Scaling factor for signal magnitude.
+
+    Returns:
+        output (`torch.Tensor`):
+            Tensor of the shape `[N, C, H // factor, W // factor]`
+    """
+
+    assert isinstance(factor, int) and factor >= 1
+    if kernel is None:
+        kernel = [1] * factor
+
+    kernel = torch.tensor(kernel, dtype=torch.float32)
+    if kernel.ndim == 1:
+        kernel = torch.outer(kernel, kernel)
+    kernel /= torch.sum(kernel)
+
+    kernel = kernel * gain
+    pad_value = kernel.shape[0] - factor
+    output = upfirdn2d_native(
+        hidden_states,
+        kernel.to(device=hidden_states.device),
+        down=factor,
+        pad=((pad_value + 1) // 2, pad_value // 2),
+    )
+    return output
diff --git a/diffusers/src/diffusers/models/embeddings.py b/diffusers/src/diffusers/models/embeddings.py
new file mode 100644
index 0000000000000000000000000000000000000000..f88a005e7f959609e87ab2a1720537e8495e2a4e
--- /dev/null
+++ b/diffusers/src/diffusers/models/embeddings.py
@@ -0,0 +1,1829 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ..utils import deprecate
+from .activations import FP32SiLU, get_activation
+from .attention_processor import Attention
+
+
+def get_timestep_embedding(
+    timesteps: torch.Tensor,
+    embedding_dim: int,
+    flip_sin_to_cos: bool = False,
+    downscale_freq_shift: float = 1,
+    scale: float = 1,
+    max_period: int = 10000,
+):
+    """
+    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
+
+    Args
+        timesteps (torch.Tensor):
+            a 1-D Tensor of N indices, one per batch element. These may be fractional.
+        embedding_dim (int):
+            the dimension of the output.
+        flip_sin_to_cos (bool):
+            Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
+        downscale_freq_shift (float):
+            Controls the delta between frequencies between dimensions
+        scale (float):
+            Scaling factor applied to the embeddings.
+        max_period (int):
+            Controls the maximum frequency of the embeddings
+    Returns
+        torch.Tensor: an [N x dim] Tensor of positional embeddings.
+    """
+    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
+
+    half_dim = embedding_dim // 2
+    exponent = -math.log(max_period) * torch.arange(
+        start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
+    )
+    exponent = exponent / (half_dim - downscale_freq_shift)
+
+    emb = torch.exp(exponent)
+    emb = timesteps[:, None].float() * emb[None, :]
+
+    # scale embeddings
+    emb = scale * emb
+
+    # concat sine and cosine embeddings
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+
+    # flip sine and cosine embeddings
+    if flip_sin_to_cos:
+        emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
+
+    # zero pad
+    if embedding_dim % 2 == 1:
+        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
+    return emb
+
+
+def get_3d_sincos_pos_embed(
+    embed_dim: int,
+    spatial_size: Union[int, Tuple[int, int]],
+    temporal_size: int,
+    spatial_interpolation_scale: float = 1.0,
+    temporal_interpolation_scale: float = 1.0,
+) -> np.ndarray:
+    r"""
+    Args:
+        embed_dim (`int`):
+        spatial_size (`int` or `Tuple[int, int]`):
+        temporal_size (`int`):
+        spatial_interpolation_scale (`float`, defaults to 1.0):
+        temporal_interpolation_scale (`float`, defaults to 1.0):
+    """
+    if embed_dim % 4 != 0:
+        raise ValueError("`embed_dim` must be divisible by 4")
+    if isinstance(spatial_size, int):
+        spatial_size = (spatial_size, spatial_size)
+
+    embed_dim_spatial = 3 * embed_dim // 4
+    embed_dim_temporal = embed_dim // 4
+
+    # 1. Spatial
+    grid_h = np.arange(spatial_size[1], dtype=np.float32) / spatial_interpolation_scale
+    grid_w = np.arange(spatial_size[0], dtype=np.float32) / spatial_interpolation_scale
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+
+    grid = grid.reshape([2, 1, spatial_size[1], spatial_size[0]])
+    pos_embed_spatial = get_2d_sincos_pos_embed_from_grid(embed_dim_spatial, grid)
+
+    # 2. Temporal
+    grid_t = np.arange(temporal_size, dtype=np.float32) / temporal_interpolation_scale
+    pos_embed_temporal = get_1d_sincos_pos_embed_from_grid(embed_dim_temporal, grid_t)
+
+    # 3. Concat
+    pos_embed_spatial = pos_embed_spatial[np.newaxis, :, :]
+    pos_embed_spatial = np.repeat(pos_embed_spatial, temporal_size, axis=0)  # [T, H*W, D // 4 * 3]
+
+    pos_embed_temporal = pos_embed_temporal[:, np.newaxis, :]
+    pos_embed_temporal = np.repeat(pos_embed_temporal, spatial_size[0] * spatial_size[1], axis=1)  # [T, H*W, D // 4]
+
+    pos_embed = np.concatenate([pos_embed_temporal, pos_embed_spatial], axis=-1)  # [T, H*W, D]
+    return pos_embed
+
+
+def get_2d_sincos_pos_embed(
+    embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16
+):
+    """
+    grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
+    [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    if isinstance(grid_size, int):
+        grid_size = (grid_size, grid_size)
+
+    grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale
+    grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+
+    grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token and extra_tokens > 0:
+        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    if embed_dim % 2 != 0:
+        raise ValueError("embed_dim must be divisible by 2")
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+
+    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    return emb
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
+    """
+    if embed_dim % 2 != 0:
+        raise ValueError("embed_dim must be divisible by 2")
+
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+
+
+class PatchEmbed(nn.Module):
+    """2D Image to Patch Embedding with support for SD3 cropping."""
+
+    def __init__(
+        self,
+        height=224,
+        width=224,
+        patch_size=16,
+        in_channels=3,
+        embed_dim=768,
+        layer_norm=False,
+        flatten=True,
+        bias=True,
+        interpolation_scale=1,
+        pos_embed_type="sincos",
+        pos_embed_max_size=None,  # For SD3 cropping
+    ):
+        super().__init__()
+
+        num_patches = (height // patch_size) * (width // patch_size)
+        self.flatten = flatten
+        self.layer_norm = layer_norm
+        self.pos_embed_max_size = pos_embed_max_size
+
+        self.proj = nn.Conv2d(
+            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
+        )
+        if layer_norm:
+            self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+        else:
+            self.norm = None
+
+        self.patch_size = patch_size
+        self.height, self.width = height // patch_size, width // patch_size
+        self.base_size = height // patch_size
+        self.interpolation_scale = interpolation_scale
+
+        # Calculate positional embeddings based on max size or default
+        if pos_embed_max_size:
+            grid_size = pos_embed_max_size
+        else:
+            grid_size = int(num_patches**0.5)
+
+        if pos_embed_type is None:
+            self.pos_embed = None
+        elif pos_embed_type == "sincos":
+            pos_embed = get_2d_sincos_pos_embed(
+                embed_dim, grid_size, base_size=self.base_size, interpolation_scale=self.interpolation_scale
+            )
+            persistent = True if pos_embed_max_size else False
+            self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=persistent)
+        else:
+            raise ValueError(f"Unsupported pos_embed_type: {pos_embed_type}")
+
+    def cropped_pos_embed(self, height, width):
+        """Crops positional embeddings for SD3 compatibility."""
+        if self.pos_embed_max_size is None:
+            raise ValueError("`pos_embed_max_size` must be set for cropping.")
+
+        height = height // self.patch_size
+        width = width // self.patch_size
+        if height > self.pos_embed_max_size:
+            raise ValueError(
+                f"Height ({height}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
+            )
+        if width > self.pos_embed_max_size:
+            raise ValueError(
+                f"Width ({width}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
+            )
+
+        top = (self.pos_embed_max_size - height) // 2
+        left = (self.pos_embed_max_size - width) // 2
+        spatial_pos_embed = self.pos_embed.reshape(1, self.pos_embed_max_size, self.pos_embed_max_size, -1)
+        spatial_pos_embed = spatial_pos_embed[:, top : top + height, left : left + width, :]
+        spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1])
+        return spatial_pos_embed
+
+    def forward(self, latent):
+        if self.pos_embed_max_size is not None:
+            height, width = latent.shape[-2:]
+        else:
+            height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+
+        latent = self.proj(latent)
+        if self.flatten:
+            latent = latent.flatten(2).transpose(1, 2)  # BCHW -> BNC
+        if self.layer_norm:
+            latent = self.norm(latent)
+        if self.pos_embed is None:
+            return latent.to(latent.dtype)
+        # Interpolate or crop positional embeddings as needed
+        if self.pos_embed_max_size:
+            pos_embed = self.cropped_pos_embed(height, width)
+        else:
+            if self.height != height or self.width != width:
+                pos_embed = get_2d_sincos_pos_embed(
+                    embed_dim=self.pos_embed.shape[-1],
+                    grid_size=(height, width),
+                    base_size=self.base_size,
+                    interpolation_scale=self.interpolation_scale,
+                )
+                pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).to(latent.device)
+            else:
+                pos_embed = self.pos_embed
+
+        return (latent + pos_embed).to(latent.dtype)
+
+
+class CausalPatchEmbed(nn.Module):
+    """2D Image to Patch Embedding with support for SD3 cropping."""
+
+    def __init__(
+        self,
+        height=224,
+        width=224,
+        patch_size=16,
+        in_channels=3,
+        embed_dim=768,
+        layer_norm=False,
+        flatten=True,
+        bias=True,
+        interpolation_scale=1,
+        pos_embed_type="sincos",
+        pos_embed_max_size=None,  # For SD3 cropping
+    ):
+        super().__init__()
+
+        num_patches = (height // patch_size) * (width // patch_size)
+        self.flatten = flatten
+        self.layer_norm = layer_norm
+        self.pos_embed_max_size = pos_embed_max_size
+
+        self.proj = nn.Conv2d(
+            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
+        )
+        if layer_norm:
+            self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+        else:
+            self.norm = None
+
+        self.patch_size = patch_size
+        self.height, self.width = height // patch_size, width // patch_size
+        self.base_size = height // patch_size
+        self.interpolation_scale = interpolation_scale
+
+        # Calculate positional embeddings based on max size or default
+        if pos_embed_max_size:
+            grid_size = pos_embed_max_size
+        else:
+            grid_size = int(num_patches**0.5)
+
+        if pos_embed_type is None:
+            self.pos_embed = None
+        elif pos_embed_type == "sincos":
+            pos_embed = get_2d_sincos_pos_embed(
+                embed_dim, grid_size, base_size=self.base_size, interpolation_scale=self.interpolation_scale
+            )
+            persistent = True if pos_embed_max_size else False
+            self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=persistent)
+        else:
+            raise ValueError(f"Unsupported pos_embed_type: {pos_embed_type}")
+
+    def cropped_pos_embed(self, height, width):
+        """Crops positional embeddings for SD3 compatibility."""
+        if self.pos_embed_max_size is None:
+            raise ValueError("`pos_embed_max_size` must be set for cropping.")
+
+        height = height // self.patch_size
+        width = width // self.patch_size
+        if height > self.pos_embed_max_size:
+            raise ValueError(
+                f"Height ({height}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
+            )
+        if width > self.pos_embed_max_size:
+            raise ValueError(
+                f"Width ({width}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
+            )
+
+        top = (self.pos_embed_max_size - height) // 2
+        left = (self.pos_embed_max_size - width) // 2
+        spatial_pos_embed = self.pos_embed.reshape(1, self.pos_embed_max_size, self.pos_embed_max_size, -1)
+        spatial_pos_embed = spatial_pos_embed[:, top : top + height, left : left + width, :]
+        spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1])
+        return spatial_pos_embed
+
+    def get_pos_embed(self, height, width):
+        pos_embed = get_2d_sincos_pos_embed(
+                    embed_dim=self.pos_embed.shape[-1],
+                    grid_size=(height, width*2),
+                    base_size=self.base_size,
+                    interpolation_scale=self.interpolation_scale,
+        )
+        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0)
+        return pos_embed
+
+    def forward(self, latent):
+        latent = self.proj(latent)
+        if self.flatten:
+            latent = latent.flatten(2).transpose(1, 2)  # BCHW -> BNC
+        if self.layer_norm:
+            latent = self.norm(latent)
+        if self.pos_embed is None:
+            return latent.to(latent.dtype)
+
+        return latent.to(latent.dtype)
+
+
+class LuminaPatchEmbed(nn.Module):
+    """2D Image to Patch Embedding with support for Lumina-T2X"""
+
+    def __init__(self, patch_size=2, in_channels=4, embed_dim=768, bias=True):
+        super().__init__()
+        self.patch_size = patch_size
+        self.proj = nn.Linear(
+            in_features=patch_size * patch_size * in_channels,
+            out_features=embed_dim,
+            bias=bias,
+        )
+
+    def forward(self, x, freqs_cis):
+        """
+        Patchifies and embeds the input tensor(s).
+
+        Args:
+            x (List[torch.Tensor] | torch.Tensor): The input tensor(s) to be patchified and embedded.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], torch.Tensor]: A tuple containing the patchified
+            and embedded tensor(s), the mask indicating the valid patches, the original image size(s), and the
+            frequency tensor(s).
+        """
+        freqs_cis = freqs_cis.to(x[0].device)
+        patch_height = patch_width = self.patch_size
+        batch_size, channel, height, width = x.size()
+        height_tokens, width_tokens = height // patch_height, width // patch_width
+
+        x = x.view(batch_size, channel, height_tokens, patch_height, width_tokens, patch_width).permute(
+            0, 2, 4, 1, 3, 5
+        )
+        x = x.flatten(3)
+        x = self.proj(x)
+        x = x.flatten(1, 2)
+
+        mask = torch.ones(x.shape[0], x.shape[1], dtype=torch.int32, device=x.device)
+
+        return (
+            x,
+            mask,
+            [(height, width)] * batch_size,
+            freqs_cis[:height_tokens, :width_tokens].flatten(0, 1).unsqueeze(0),
+        )
+
+
+class CogVideoXPatchEmbed(nn.Module):
+    def __init__(
+        self,
+        patch_size: int = 2,
+        in_channels: int = 16,
+        embed_dim: int = 1920,
+        text_embed_dim: int = 4096,
+        bias: bool = True,
+        sample_width: int = 90,
+        sample_height: int = 60,
+        sample_frames: int = 49,
+        temporal_compression_ratio: int = 4,
+        max_text_seq_length: int = 226,
+        spatial_interpolation_scale: float = 1.875,
+        temporal_interpolation_scale: float = 1.0,
+        use_positional_embeddings: bool = True,
+        use_learned_positional_embeddings: bool = True,
+    ) -> None:
+        super().__init__()
+
+        self.patch_size = patch_size
+        self.embed_dim = embed_dim
+        self.sample_height = sample_height
+        self.sample_width = sample_width
+        self.sample_frames = sample_frames
+        self.temporal_compression_ratio = temporal_compression_ratio
+        self.max_text_seq_length = max_text_seq_length
+        self.spatial_interpolation_scale = spatial_interpolation_scale
+        self.temporal_interpolation_scale = temporal_interpolation_scale
+        self.use_positional_embeddings = use_positional_embeddings
+        self.use_learned_positional_embeddings = use_learned_positional_embeddings
+
+        self.proj = nn.Conv2d(
+            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
+        )
+        self.text_proj = nn.Linear(text_embed_dim, embed_dim)
+
+        if use_positional_embeddings or use_learned_positional_embeddings:
+            persistent = use_learned_positional_embeddings
+            pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
+            self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)
+
+    def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor:
+        post_patch_height = sample_height // self.patch_size
+        post_patch_width = sample_width // self.patch_size
+        post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
+        num_patches = post_patch_height * post_patch_width * post_time_compression_frames
+
+        pos_embedding = get_3d_sincos_pos_embed(
+            self.embed_dim,
+            (post_patch_width, post_patch_height),
+            post_time_compression_frames,
+            self.spatial_interpolation_scale,
+            self.temporal_interpolation_scale,
+        )
+        pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1)
+        joint_pos_embedding = torch.zeros(
+            1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False
+        )
+        joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding)
+
+        return joint_pos_embedding
+
+    def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
+        r"""
+        Args:
+            text_embeds (`torch.Tensor`):
+                Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
+            image_embeds (`torch.Tensor`):
+                Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
+        """
+        text_embeds = self.text_proj(text_embeds)
+
+        batch, num_frames, channels, height, width = image_embeds.shape
+        image_embeds = image_embeds.reshape(-1, channels, height, width)
+        image_embeds = self.proj(image_embeds)
+        image_embeds = image_embeds.view(batch, num_frames, *image_embeds.shape[1:])
+        image_embeds = image_embeds.flatten(3).transpose(2, 3)  # [batch, num_frames, height x width, channels]
+        image_embeds = image_embeds.flatten(1, 2)  # [batch, num_frames x height x width, channels]
+
+        embeds = torch.cat(
+            [text_embeds, image_embeds], dim=1
+        ).contiguous()  # [batch, seq_length + num_frames x height x width, channels]
+
+        if self.use_positional_embeddings or self.use_learned_positional_embeddings:
+            if self.use_learned_positional_embeddings and (self.sample_width != width or self.sample_height != height):
+                raise ValueError(
+                    "It is currently not possible to generate videos at a different resolution that the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'."
+                    "If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues."
+                )
+
+            pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
+
+            if (
+                self.sample_height != height
+                or self.sample_width != width
+                or self.sample_frames != pre_time_compression_frames
+            ):
+                pos_embedding = self._get_positional_embeddings(height, width, pre_time_compression_frames)
+                pos_embedding = pos_embedding.to(embeds.device, dtype=embeds.dtype)
+            else:
+                pos_embedding = self.pos_embedding
+
+            embeds = embeds + pos_embedding
+
+        return embeds
+
+
+def get_3d_rotary_pos_embed(
+    embed_dim, crops_coords, grid_size, temporal_size, theta: int = 10000, use_real: bool = True
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    """
+    RoPE for video tokens with 3D structure.
+
+    Args:
+    embed_dim: (`int`):
+        The embedding dimension size, corresponding to hidden_size_head.
+    crops_coords (`Tuple[int]`):
+        The top-left and bottom-right coordinates of the crop.
+    grid_size (`Tuple[int]`):
+        The grid size of the spatial positional embedding (height, width).
+    temporal_size (`int`):
+        The size of the temporal dimension.
+    theta (`float`):
+        Scaling factor for frequency computation.
+
+    Returns:
+        `torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`.
+    """
+    if use_real is not True:
+        raise ValueError(" `use_real = False` is not currently supported for get_3d_rotary_pos_embed")
+    start, stop = crops_coords
+    grid_size_h, grid_size_w = grid_size
+    grid_h = np.linspace(start[0], stop[0], grid_size_h, endpoint=False, dtype=np.float32)
+    grid_w = np.linspace(start[1], stop[1], grid_size_w, endpoint=False, dtype=np.float32)
+    grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32)
+
+    # Compute dimensions for each axis
+    dim_t = embed_dim // 4
+    dim_h = embed_dim // 8 * 3
+    dim_w = embed_dim // 8 * 3
+
+    # Temporal frequencies
+    freqs_t = get_1d_rotary_pos_embed(dim_t, grid_t, use_real=True)
+    # Spatial frequencies for height and width
+    freqs_h = get_1d_rotary_pos_embed(dim_h, grid_h, use_real=True)
+    freqs_w = get_1d_rotary_pos_embed(dim_w, grid_w, use_real=True)
+
+    # BroadCast and concatenate temporal and spaial frequencie (height and width) into a 3d tensor
+    def combine_time_height_width(freqs_t, freqs_h, freqs_w):
+        freqs_t = freqs_t[:, None, None, :].expand(
+            -1, grid_size_h, grid_size_w, -1
+        )  # temporal_size, grid_size_h, grid_size_w, dim_t
+        freqs_h = freqs_h[None, :, None, :].expand(
+            temporal_size, -1, grid_size_w, -1
+        )  # temporal_size, grid_size_h, grid_size_2, dim_h
+        freqs_w = freqs_w[None, None, :, :].expand(
+            temporal_size, grid_size_h, -1, -1
+        )  # temporal_size, grid_size_h, grid_size_2, dim_w
+
+        freqs = torch.cat(
+            [freqs_t, freqs_h, freqs_w], dim=-1
+        )  # temporal_size, grid_size_h, grid_size_w, (dim_t + dim_h + dim_w)
+        freqs = freqs.view(
+            temporal_size * grid_size_h * grid_size_w, -1
+        )  # (temporal_size * grid_size_h * grid_size_w), (dim_t + dim_h + dim_w)
+        return freqs
+
+    t_cos, t_sin = freqs_t  # both t_cos and t_sin has shape: temporal_size, dim_t
+    h_cos, h_sin = freqs_h  # both h_cos and h_sin has shape: grid_size_h, dim_h
+    w_cos, w_sin = freqs_w  # both w_cos and w_sin has shape: grid_size_w, dim_w
+    cos = combine_time_height_width(t_cos, h_cos, w_cos)
+    sin = combine_time_height_width(t_sin, h_sin, w_sin)
+    return cos, sin
+
+
+def get_2d_rotary_pos_embed(embed_dim, crops_coords, grid_size, use_real=True):
+    """
+    RoPE for image tokens with 2d structure.
+
+    Args:
+    embed_dim: (`int`):
+        The embedding dimension size
+    crops_coords (`Tuple[int]`)
+        The top-left and bottom-right coordinates of the crop.
+    grid_size (`Tuple[int]`):
+        The grid size of the positional embedding.
+    use_real (`bool`):
+        If True, return real part and imaginary part separately. Otherwise, return complex numbers.
+
+    Returns:
+        `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`.
+    """
+    start, stop = crops_coords
+    grid_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32)
+    grid_w = np.linspace(start[1], stop[1], grid_size[1], endpoint=False, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)  # [2, W, H]
+
+    grid = grid.reshape([2, 1, *grid.shape[1:]])
+    pos_embed = get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=use_real)
+    return pos_embed
+
+
+def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):
+    assert embed_dim % 4 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_rotary_pos_embed(
+        embed_dim // 2, grid[0].reshape(-1), use_real=use_real
+    )  # (H*W, D/2) if use_real else (H*W, D/4)
+    emb_w = get_1d_rotary_pos_embed(
+        embed_dim // 2, grid[1].reshape(-1), use_real=use_real
+    )  # (H*W, D/2) if use_real else (H*W, D/4)
+
+    if use_real:
+        cos = torch.cat([emb_h[0], emb_w[0]], dim=1)  # (H*W, D)
+        sin = torch.cat([emb_h[1], emb_w[1]], dim=1)  # (H*W, D)
+        return cos, sin
+    else:
+        emb = torch.cat([emb_h, emb_w], dim=1)  # (H*W, D/2)
+        return emb
+
+
+def get_2d_rotary_pos_embed_lumina(embed_dim, len_h, len_w, linear_factor=1.0, ntk_factor=1.0):
+    assert embed_dim % 4 == 0
+
+    emb_h = get_1d_rotary_pos_embed(
+        embed_dim // 2, len_h, linear_factor=linear_factor, ntk_factor=ntk_factor
+    )  # (H, D/4)
+    emb_w = get_1d_rotary_pos_embed(
+        embed_dim // 2, len_w, linear_factor=linear_factor, ntk_factor=ntk_factor
+    )  # (W, D/4)
+    emb_h = emb_h.view(len_h, 1, embed_dim // 4, 1).repeat(1, len_w, 1, 1)  # (H, W, D/4, 1)
+    emb_w = emb_w.view(1, len_w, embed_dim // 4, 1).repeat(len_h, 1, 1, 1)  # (H, W, D/4, 1)
+
+    emb = torch.cat([emb_h, emb_w], dim=-1).flatten(2)  # (H, W, D/2)
+    return emb
+
+
+def get_1d_rotary_pos_embed(
+    dim: int,
+    pos: Union[np.ndarray, int],
+    theta: float = 10000.0,
+    use_real=False,
+    linear_factor=1.0,
+    ntk_factor=1.0,
+    repeat_interleave_real=True,
+    freqs_dtype=torch.float32,  #  torch.float32, torch.float64 (flux)
+):
+    """
+    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
+
+    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end
+    index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64
+    data type.
+
+    Args:
+        dim (`int`): Dimension of the frequency tensor.
+        pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar
+        theta (`float`, *optional*, defaults to 10000.0):
+            Scaling factor for frequency computation. Defaults to 10000.0.
+        use_real (`bool`, *optional*):
+            If True, return real part and imaginary part separately. Otherwise, return complex numbers.
+        linear_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for the context extrapolation. Defaults to 1.0.
+        ntk_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for the NTK-Aware RoPE. Defaults to 1.0.
+        repeat_interleave_real (`bool`, *optional*, defaults to `True`):
+            If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`.
+            Otherwise, they are concateanted with themselves.
+        freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`):
+            the dtype of the frequency tensor.
+    Returns:
+        `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
+    """
+    assert dim % 2 == 0
+
+    if isinstance(pos, int):
+        pos = torch.arange(pos)
+    if isinstance(pos, np.ndarray):
+        pos = torch.from_numpy(pos)  # type: ignore  # [S]
+
+    theta = theta * ntk_factor
+    freqs = (
+        1.0
+        / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=pos.device)[: (dim // 2)] / dim))
+        / linear_factor
+    )  # [D/2]
+    freqs = torch.outer(pos, freqs)  # type: ignore   # [S, D/2]
+    if use_real and repeat_interleave_real:
+        # flux, hunyuan-dit, cogvideox
+        freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float()  # [S, D]
+        freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float()  # [S, D]
+        return freqs_cos, freqs_sin
+    elif use_real:
+        # stable audio
+        freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1).float()  # [S, D]
+        freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1).float()  # [S, D]
+        return freqs_cos, freqs_sin
+    else:
+        # lumina
+        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64     # [S, D/2]
+        return freqs_cis
+
+
+def apply_rotary_emb(
+    x: torch.Tensor,
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
+    use_real: bool = True,
+    use_real_unbind_dim: int = -1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
+    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
+    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
+    tensors contain rotary embeddings and are returned as real tensors.
+
+    Args:
+        x (`torch.Tensor`):
+            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
+        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+    """
+    if use_real:
+        cos, sin = freqs_cis  # [S, D]
+        cos = cos[None, None]
+        sin = sin[None, None]
+        cos, sin = cos.to(x.device), sin.to(x.device)
+
+        if use_real_unbind_dim == -1:
+            # Used for flux, cogvideox, hunyuan-dit
+            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
+            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+        elif use_real_unbind_dim == -2:
+            # Used for Stable Audio
+            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
+            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
+        else:
+            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
+
+        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
+
+        return out
+    else:
+        # used for lumina
+        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+        freqs_cis = freqs_cis.unsqueeze(2)
+        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
+
+        return x_out.type_as(x)
+
+
+class FluxPosEmbed(nn.Module):
+    # modified from https://github.com/black-forest-labs/flux/blob/c00d7c60b085fce8058b9df845e036090873f2ce/src/flux/modules/layers.py#L11
+    def __init__(self, theta: int, axes_dim: List[int]):
+        super().__init__()
+        self.theta = theta
+        self.axes_dim = axes_dim
+
+    def forward(self, ids: torch.Tensor) -> torch.Tensor:
+        n_axes = ids.shape[-1]
+        cos_out = []
+        sin_out = []
+        pos = ids.float()
+        is_mps = ids.device.type == "mps"
+        freqs_dtype = torch.float32 if is_mps else torch.float64
+        for i in range(n_axes):
+            cos, sin = get_1d_rotary_pos_embed(
+                self.axes_dim[i], pos[:, i], repeat_interleave_real=True, use_real=True, freqs_dtype=freqs_dtype
+            )
+            cos_out.append(cos)
+            sin_out.append(sin)
+        freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
+        freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
+        return freqs_cos, freqs_sin
+
+
+class TimestepEmbedding(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        time_embed_dim: int,
+        act_fn: str = "silu",
+        out_dim: int = None,
+        post_act_fn: Optional[str] = None,
+        cond_proj_dim=None,
+        sample_proj_bias=True,
+    ):
+        super().__init__()
+
+        self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
+
+        if cond_proj_dim is not None:
+            self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
+        else:
+            self.cond_proj = None
+
+        self.act = get_activation(act_fn)
+
+        if out_dim is not None:
+            time_embed_dim_out = out_dim
+        else:
+            time_embed_dim_out = time_embed_dim
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
+
+        if post_act_fn is None:
+            self.post_act = None
+        else:
+            self.post_act = get_activation(post_act_fn)
+
+    def forward(self, sample, condition=None):
+        if condition is not None:
+            sample = sample + self.cond_proj(condition)
+        sample = self.linear_1(sample)
+
+        if self.act is not None:
+            sample = self.act(sample)
+
+        sample = self.linear_2(sample)
+
+        if self.post_act is not None:
+            sample = self.post_act(sample)
+        return sample
+
+
+class Timesteps(nn.Module):
+    def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
+        super().__init__()
+        self.num_channels = num_channels
+        self.flip_sin_to_cos = flip_sin_to_cos
+        self.downscale_freq_shift = downscale_freq_shift
+        self.scale = scale
+
+    def forward(self, timesteps):
+        t_emb = get_timestep_embedding(
+            timesteps,
+            self.num_channels,
+            flip_sin_to_cos=self.flip_sin_to_cos,
+            downscale_freq_shift=self.downscale_freq_shift,
+            scale=self.scale,
+        )
+        return t_emb
+
+
+class GaussianFourierProjection(nn.Module):
+    """Gaussian Fourier embeddings for noise levels."""
+
+    def __init__(
+        self, embedding_size: int = 256, scale: float = 1.0, set_W_to_weight=True, log=True, flip_sin_to_cos=False
+    ):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+        self.log = log
+        self.flip_sin_to_cos = flip_sin_to_cos
+
+        if set_W_to_weight:
+            # to delete later
+            del self.weight
+            self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+            self.weight = self.W
+            del self.W
+
+    def forward(self, x):
+        if self.log:
+            x = torch.log(x)
+
+        x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi
+
+        if self.flip_sin_to_cos:
+            out = torch.cat([torch.cos(x_proj), torch.sin(x_proj)], dim=-1)
+        else:
+            out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
+        return out
+
+
+class SinusoidalPositionalEmbedding(nn.Module):
+    """Apply positional information to a sequence of embeddings.
+
+    Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to
+    them
+
+    Args:
+        embed_dim: (int): Dimension of the positional embedding.
+        max_seq_length: Maximum sequence length to apply positional embeddings
+
+    """
+
+    def __init__(self, embed_dim: int, max_seq_length: int = 32):
+        super().__init__()
+        position = torch.arange(max_seq_length).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
+        pe = torch.zeros(1, max_seq_length, embed_dim)
+        pe[0, :, 0::2] = torch.sin(position * div_term)
+        pe[0, :, 1::2] = torch.cos(position * div_term)
+        self.register_buffer("pe", pe)
+
+    def forward(self, x):
+        _, seq_length, _ = x.shape
+        x = x + self.pe[:, :seq_length]
+        return x
+
+
+class ImagePositionalEmbeddings(nn.Module):
+    """
+    Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
+    height and width of the latent space.
+
+    For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092
+
+    For VQ-diffusion:
+
+    Output vector embeddings are used as input for the transformer.
+
+    Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE.
+
+    Args:
+        num_embed (`int`):
+            Number of embeddings for the latent pixels embeddings.
+        height (`int`):
+            Height of the latent image i.e. the number of height embeddings.
+        width (`int`):
+            Width of the latent image i.e. the number of width embeddings.
+        embed_dim (`int`):
+            Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings.
+    """
+
+    def __init__(
+        self,
+        num_embed: int,
+        height: int,
+        width: int,
+        embed_dim: int,
+    ):
+        super().__init__()
+
+        self.height = height
+        self.width = width
+        self.num_embed = num_embed
+        self.embed_dim = embed_dim
+
+        self.emb = nn.Embedding(self.num_embed, embed_dim)
+        self.height_emb = nn.Embedding(self.height, embed_dim)
+        self.width_emb = nn.Embedding(self.width, embed_dim)
+
+    def forward(self, index):
+        emb = self.emb(index)
+
+        height_emb = self.height_emb(torch.arange(self.height, device=index.device).view(1, self.height))
+
+        # 1 x H x D -> 1 x H x 1 x D
+        height_emb = height_emb.unsqueeze(2)
+
+        width_emb = self.width_emb(torch.arange(self.width, device=index.device).view(1, self.width))
+
+        # 1 x W x D -> 1 x 1 x W x D
+        width_emb = width_emb.unsqueeze(1)
+
+        pos_emb = height_emb + width_emb
+
+        # 1 x H x W x D -> 1 x L xD
+        pos_emb = pos_emb.view(1, self.height * self.width, -1)
+
+        emb = emb + pos_emb[:, : emb.shape[1], :]
+
+        return emb
+
+
+class LabelEmbedding(nn.Module):
+    """
+    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+
+    Args:
+        num_classes (`int`): The number of classes.
+        hidden_size (`int`): The size of the vector embeddings.
+        dropout_prob (`float`): The probability of dropping a label.
+    """
+
+    def __init__(self, num_classes, hidden_size, dropout_prob):
+        super().__init__()
+        use_cfg_embedding = dropout_prob > 0
+        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
+        self.num_classes = num_classes
+        self.dropout_prob = dropout_prob
+
+    def token_drop(self, labels, force_drop_ids=None):
+        """
+        Drops labels to enable classifier-free guidance.
+        """
+        if force_drop_ids is None:
+            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
+        else:
+            drop_ids = torch.tensor(force_drop_ids == 1)
+        labels = torch.where(drop_ids, self.num_classes, labels)
+        return labels
+
+    def forward(self, labels: torch.LongTensor, force_drop_ids=None):
+        use_dropout = self.dropout_prob > 0
+        if (self.training and use_dropout) or (force_drop_ids is not None):
+            labels = self.token_drop(labels, force_drop_ids)
+        embeddings = self.embedding_table(labels)
+        return embeddings
+
+
+class TextImageProjection(nn.Module):
+    def __init__(
+        self,
+        text_embed_dim: int = 1024,
+        image_embed_dim: int = 768,
+        cross_attention_dim: int = 768,
+        num_image_text_embeds: int = 10,
+    ):
+        super().__init__()
+
+        self.num_image_text_embeds = num_image_text_embeds
+        self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim)
+        self.text_proj = nn.Linear(text_embed_dim, cross_attention_dim)
+
+    def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
+        batch_size = text_embeds.shape[0]
+
+        # image
+        image_text_embeds = self.image_embeds(image_embeds)
+        image_text_embeds = image_text_embeds.reshape(batch_size, self.num_image_text_embeds, -1)
+
+        # text
+        text_embeds = self.text_proj(text_embeds)
+
+        return torch.cat([image_text_embeds, text_embeds], dim=1)
+
+
+class ImageProjection(nn.Module):
+    def __init__(
+        self,
+        image_embed_dim: int = 768,
+        cross_attention_dim: int = 768,
+        num_image_text_embeds: int = 32,
+    ):
+        super().__init__()
+
+        self.num_image_text_embeds = num_image_text_embeds
+        self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim)
+        self.norm = nn.LayerNorm(cross_attention_dim)
+
+    def forward(self, image_embeds: torch.Tensor):
+        batch_size = image_embeds.shape[0]
+
+        # image
+        image_embeds = self.image_embeds(image_embeds)
+        image_embeds = image_embeds.reshape(batch_size, self.num_image_text_embeds, -1)
+        image_embeds = self.norm(image_embeds)
+        return image_embeds
+
+
+class IPAdapterFullImageProjection(nn.Module):
+    def __init__(self, image_embed_dim=1024, cross_attention_dim=1024):
+        super().__init__()
+        from .attention import FeedForward
+
+        self.ff = FeedForward(image_embed_dim, cross_attention_dim, mult=1, activation_fn="gelu")
+        self.norm = nn.LayerNorm(cross_attention_dim)
+
+    def forward(self, image_embeds: torch.Tensor):
+        return self.norm(self.ff(image_embeds))
+
+
+class IPAdapterFaceIDImageProjection(nn.Module):
+    def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_tokens=1):
+        super().__init__()
+        from .attention import FeedForward
+
+        self.num_tokens = num_tokens
+        self.cross_attention_dim = cross_attention_dim
+        self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu")
+        self.norm = nn.LayerNorm(cross_attention_dim)
+
+    def forward(self, image_embeds: torch.Tensor):
+        x = self.ff(image_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        return self.norm(x)
+
+
+class CombinedTimestepLabelEmbeddings(nn.Module):
+    def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1):
+        super().__init__()
+
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+        self.class_embedder = LabelEmbedding(num_classes, embedding_dim, class_dropout_prob)
+
+    def forward(self, timestep, class_labels, hidden_dtype=None):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
+
+        class_labels = self.class_embedder(class_labels)  # (N, D)
+
+        conditioning = timesteps_emb + class_labels  # (N, D)
+
+        return conditioning
+
+
+class CombinedTimestepTextProjEmbeddings(nn.Module):
+    def __init__(self, embedding_dim, pooled_projection_dim):
+        super().__init__()
+
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
+
+    def forward(self, timestep, pooled_projection):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))  # (N, D)
+
+        pooled_projections = self.text_embedder(pooled_projection)
+
+        conditioning = timesteps_emb + pooled_projections
+
+        return conditioning
+
+
+class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
+    def __init__(self, embedding_dim, pooled_projection_dim):
+        super().__init__()
+
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+        self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
+
+    def forward(self, timestep, guidance, pooled_projection):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))  # (N, D)
+
+        guidance_proj = self.time_proj(guidance)
+        guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))  # (N, D)
+
+        time_guidance_emb = timesteps_emb + guidance_emb
+
+        pooled_projections = self.text_embedder(pooled_projection)
+        conditioning = time_guidance_emb + pooled_projections
+
+        return conditioning
+
+
+class HunyuanDiTAttentionPool(nn.Module):
+    # Copied from https://github.com/Tencent/HunyuanDiT/blob/cb709308d92e6c7e8d59d0dff41b74d35088db6a/hydit/modules/poolers.py#L6
+
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim + 1, embed_dim) / embed_dim**0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+
+    def forward(self, x):
+        x = x.permute(1, 0, 2)  # NLC -> LNC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (L+1)NC
+        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (L+1)NC
+        x, _ = F.multi_head_attention_forward(
+            query=x[:1],
+            key=x,
+            value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False,
+        )
+        return x.squeeze(0)
+
+
+class HunyuanCombinedTimestepTextSizeStyleEmbedding(nn.Module):
+    def __init__(
+        self,
+        embedding_dim,
+        pooled_projection_dim=1024,
+        seq_len=256,
+        cross_attention_dim=2048,
+        use_style_cond_and_image_meta_size=True,
+    ):
+        super().__init__()
+
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+        self.size_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+
+        self.pooler = HunyuanDiTAttentionPool(
+            seq_len, cross_attention_dim, num_heads=8, output_dim=pooled_projection_dim
+        )
+
+        # Here we use a default learned embedder layer for future extension.
+        self.use_style_cond_and_image_meta_size = use_style_cond_and_image_meta_size
+        if use_style_cond_and_image_meta_size:
+            self.style_embedder = nn.Embedding(1, embedding_dim)
+            extra_in_dim = 256 * 6 + embedding_dim + pooled_projection_dim
+        else:
+            extra_in_dim = pooled_projection_dim
+
+        self.extra_embedder = PixArtAlphaTextProjection(
+            in_features=extra_in_dim,
+            hidden_size=embedding_dim * 4,
+            out_features=embedding_dim,
+            act_fn="silu_fp32",
+        )
+
+    def forward(self, timestep, encoder_hidden_states, image_meta_size, style, hidden_dtype=None):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, 256)
+
+        # extra condition1: text
+        pooled_projections = self.pooler(encoder_hidden_states)  # (N, 1024)
+
+        if self.use_style_cond_and_image_meta_size:
+            # extra condition2: image meta size embedding
+            image_meta_size = self.size_proj(image_meta_size.view(-1))
+            image_meta_size = image_meta_size.to(dtype=hidden_dtype)
+            image_meta_size = image_meta_size.view(-1, 6 * 256)  # (N, 1536)
+
+            # extra condition3: style embedding
+            style_embedding = self.style_embedder(style)  # (N, embedding_dim)
+
+            # Concatenate all extra vectors
+            extra_cond = torch.cat([pooled_projections, image_meta_size, style_embedding], dim=1)
+        else:
+            extra_cond = torch.cat([pooled_projections], dim=1)
+
+        conditioning = timesteps_emb + self.extra_embedder(extra_cond)  # [B, D]
+
+        return conditioning
+
+
+class LuminaCombinedTimestepCaptionEmbedding(nn.Module):
+    def __init__(self, hidden_size=4096, cross_attention_dim=2048, frequency_embedding_size=256):
+        super().__init__()
+        self.time_proj = Timesteps(
+            num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0.0
+        )
+
+        self.timestep_embedder = TimestepEmbedding(in_channels=frequency_embedding_size, time_embed_dim=hidden_size)
+
+        self.caption_embedder = nn.Sequential(
+            nn.LayerNorm(cross_attention_dim),
+            nn.Linear(
+                cross_attention_dim,
+                hidden_size,
+                bias=True,
+            ),
+        )
+
+    def forward(self, timestep, caption_feat, caption_mask):
+        # timestep embedding:
+        time_freq = self.time_proj(timestep)
+        time_embed = self.timestep_embedder(time_freq.to(dtype=self.timestep_embedder.linear_1.weight.dtype))
+
+        # caption condition embedding:
+        caption_mask_float = caption_mask.float().unsqueeze(-1)
+        caption_feats_pool = (caption_feat * caption_mask_float).sum(dim=1) / caption_mask_float.sum(dim=1)
+        caption_feats_pool = caption_feats_pool.to(caption_feat)
+        caption_embed = self.caption_embedder(caption_feats_pool)
+
+        conditioning = time_embed + caption_embed
+
+        return conditioning
+
+
+class TextTimeEmbedding(nn.Module):
+    def __init__(self, encoder_dim: int, time_embed_dim: int, num_heads: int = 64):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(encoder_dim)
+        self.pool = AttentionPooling(num_heads, encoder_dim)
+        self.proj = nn.Linear(encoder_dim, time_embed_dim)
+        self.norm2 = nn.LayerNorm(time_embed_dim)
+
+    def forward(self, hidden_states):
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.pool(hidden_states)
+        hidden_states = self.proj(hidden_states)
+        hidden_states = self.norm2(hidden_states)
+        return hidden_states
+
+
+class TextImageTimeEmbedding(nn.Module):
+    def __init__(self, text_embed_dim: int = 768, image_embed_dim: int = 768, time_embed_dim: int = 1536):
+        super().__init__()
+        self.text_proj = nn.Linear(text_embed_dim, time_embed_dim)
+        self.text_norm = nn.LayerNorm(time_embed_dim)
+        self.image_proj = nn.Linear(image_embed_dim, time_embed_dim)
+
+    def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
+        # text
+        time_text_embeds = self.text_proj(text_embeds)
+        time_text_embeds = self.text_norm(time_text_embeds)
+
+        # image
+        time_image_embeds = self.image_proj(image_embeds)
+
+        return time_image_embeds + time_text_embeds
+
+
+class ImageTimeEmbedding(nn.Module):
+    def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536):
+        super().__init__()
+        self.image_proj = nn.Linear(image_embed_dim, time_embed_dim)
+        self.image_norm = nn.LayerNorm(time_embed_dim)
+
+    def forward(self, image_embeds: torch.Tensor):
+        # image
+        time_image_embeds = self.image_proj(image_embeds)
+        time_image_embeds = self.image_norm(time_image_embeds)
+        return time_image_embeds
+
+
+class ImageHintTimeEmbedding(nn.Module):
+    def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536):
+        super().__init__()
+        self.image_proj = nn.Linear(image_embed_dim, time_embed_dim)
+        self.image_norm = nn.LayerNorm(time_embed_dim)
+        self.input_hint_block = nn.Sequential(
+            nn.Conv2d(3, 16, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(16, 16, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(16, 32, 3, padding=1, stride=2),
+            nn.SiLU(),
+            nn.Conv2d(32, 32, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(32, 96, 3, padding=1, stride=2),
+            nn.SiLU(),
+            nn.Conv2d(96, 96, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(96, 256, 3, padding=1, stride=2),
+            nn.SiLU(),
+            nn.Conv2d(256, 4, 3, padding=1),
+        )
+
+    def forward(self, image_embeds: torch.Tensor, hint: torch.Tensor):
+        # image
+        time_image_embeds = self.image_proj(image_embeds)
+        time_image_embeds = self.image_norm(time_image_embeds)
+        hint = self.input_hint_block(hint)
+        return time_image_embeds, hint
+
+
+class AttentionPooling(nn.Module):
+    # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54
+
+    def __init__(self, num_heads, embed_dim, dtype=None):
+        super().__init__()
+        self.dtype = dtype
+        self.positional_embedding = nn.Parameter(torch.randn(1, embed_dim) / embed_dim**0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype)
+        self.num_heads = num_heads
+        self.dim_per_head = embed_dim // self.num_heads
+
+    def forward(self, x):
+        bs, length, width = x.size()
+
+        def shape(x):
+            # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+            x = x.view(bs, -1, self.num_heads, self.dim_per_head)
+            # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+            x = x.transpose(1, 2)
+            # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+            x = x.reshape(bs * self.num_heads, -1, self.dim_per_head)
+            # (bs*n_heads, length, dim_per_head) --> (bs*n_heads, dim_per_head, length)
+            x = x.transpose(1, 2)
+            return x
+
+        class_token = x.mean(dim=1, keepdim=True) + self.positional_embedding.to(x.dtype)
+        x = torch.cat([class_token, x], dim=1)  # (bs, length+1, width)
+
+        # (bs*n_heads, class_token_length, dim_per_head)
+        q = shape(self.q_proj(class_token))
+        # (bs*n_heads, length+class_token_length, dim_per_head)
+        k = shape(self.k_proj(x))
+        v = shape(self.v_proj(x))
+
+        # (bs*n_heads, class_token_length, length+class_token_length):
+        scale = 1 / math.sqrt(math.sqrt(self.dim_per_head))
+        weight = torch.einsum("bct,bcs->bts", q * scale, k * scale)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+
+        # (bs*n_heads, dim_per_head, class_token_length)
+        a = torch.einsum("bts,bcs->bct", weight, v)
+
+        # (bs, length+1, width)
+        a = a.reshape(bs, -1, 1).transpose(1, 2)
+
+        return a[:, 0, :]  # cls_token
+
+
+def get_fourier_embeds_from_boundingbox(embed_dim, box):
+    """
+    Args:
+        embed_dim: int
+        box: a 3-D tensor [B x N x 4] representing the bounding boxes for GLIGEN pipeline
+    Returns:
+        [B x N x embed_dim] tensor of positional embeddings
+    """
+
+    batch_size, num_boxes = box.shape[:2]
+
+    emb = 100 ** (torch.arange(embed_dim) / embed_dim)
+    emb = emb[None, None, None].to(device=box.device, dtype=box.dtype)
+    emb = emb * box.unsqueeze(-1)
+
+    emb = torch.stack((emb.sin(), emb.cos()), dim=-1)
+    emb = emb.permute(0, 1, 3, 4, 2).reshape(batch_size, num_boxes, embed_dim * 2 * 4)
+
+    return emb
+
+
+class GLIGENTextBoundingboxProjection(nn.Module):
+    def __init__(self, positive_len, out_dim, feature_type="text-only", fourier_freqs=8):
+        super().__init__()
+        self.positive_len = positive_len
+        self.out_dim = out_dim
+
+        self.fourier_embedder_dim = fourier_freqs
+        self.position_dim = fourier_freqs * 2 * 4  # 2: sin/cos, 4: xyxy
+
+        if isinstance(out_dim, tuple):
+            out_dim = out_dim[0]
+
+        if feature_type == "text-only":
+            self.linears = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            self.null_positive_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
+
+        elif feature_type == "text-image":
+            self.linears_text = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            self.linears_image = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            self.null_text_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
+            self.null_image_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
+
+        self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim]))
+
+    def forward(
+        self,
+        boxes,
+        masks,
+        positive_embeddings=None,
+        phrases_masks=None,
+        image_masks=None,
+        phrases_embeddings=None,
+        image_embeddings=None,
+    ):
+        masks = masks.unsqueeze(-1)
+
+        # embedding position (it may includes padding as placeholder)
+        xyxy_embedding = get_fourier_embeds_from_boundingbox(self.fourier_embedder_dim, boxes)  # B*N*4 -> B*N*C
+
+        # learnable null embedding
+        xyxy_null = self.null_position_feature.view(1, 1, -1)
+
+        # replace padding with learnable null embedding
+        xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null
+
+        # positionet with text only information
+        if positive_embeddings is not None:
+            # learnable null embedding
+            positive_null = self.null_positive_feature.view(1, 1, -1)
+
+            # replace padding with learnable null embedding
+            positive_embeddings = positive_embeddings * masks + (1 - masks) * positive_null
+
+            objs = self.linears(torch.cat([positive_embeddings, xyxy_embedding], dim=-1))
+
+        # positionet with text and image information
+        else:
+            phrases_masks = phrases_masks.unsqueeze(-1)
+            image_masks = image_masks.unsqueeze(-1)
+
+            # learnable null embedding
+            text_null = self.null_text_feature.view(1, 1, -1)
+            image_null = self.null_image_feature.view(1, 1, -1)
+
+            # replace padding with learnable null embedding
+            phrases_embeddings = phrases_embeddings * phrases_masks + (1 - phrases_masks) * text_null
+            image_embeddings = image_embeddings * image_masks + (1 - image_masks) * image_null
+
+            objs_text = self.linears_text(torch.cat([phrases_embeddings, xyxy_embedding], dim=-1))
+            objs_image = self.linears_image(torch.cat([image_embeddings, xyxy_embedding], dim=-1))
+            objs = torch.cat([objs_text, objs_image], dim=1)
+
+        return objs
+
+
+class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
+    """
+    For PixArt-Alpha.
+
+    Reference:
+    https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
+    """
+
+    def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False):
+        super().__init__()
+
+        self.outdim = size_emb_dim
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+        self.use_additional_conditions = use_additional_conditions
+        if use_additional_conditions:
+            self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+            self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
+            self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
+
+    def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
+
+        if self.use_additional_conditions:
+            resolution_emb = self.additional_condition_proj(resolution.flatten()).to(hidden_dtype)
+            resolution_emb = self.resolution_embedder(resolution_emb).reshape(batch_size, -1)
+            aspect_ratio_emb = self.additional_condition_proj(aspect_ratio.flatten()).to(hidden_dtype)
+            aspect_ratio_emb = self.aspect_ratio_embedder(aspect_ratio_emb).reshape(batch_size, -1)
+            conditioning = timesteps_emb + torch.cat([resolution_emb, aspect_ratio_emb], dim=1)
+        else:
+            conditioning = timesteps_emb
+
+        return conditioning
+
+
+class PixArtAlphaTextProjection(nn.Module):
+    """
+    Projects caption embeddings. Also handles dropout for classifier-free guidance.
+
+    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
+    """
+
+    def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"):
+        super().__init__()
+        if out_features is None:
+            out_features = hidden_size
+        self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
+        if act_fn == "gelu_tanh":
+            self.act_1 = nn.GELU(approximate="tanh")
+        elif act_fn == "silu":
+            self.act_1 = nn.SiLU()
+        elif act_fn == "silu_fp32":
+            self.act_1 = FP32SiLU()
+        else:
+            raise ValueError(f"Unknown activation function: {act_fn}")
+        self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True)
+
+    def forward(self, caption):
+        hidden_states = self.linear_1(caption)
+        hidden_states = self.act_1(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class IPAdapterPlusImageProjectionBlock(nn.Module):
+    def __init__(
+        self,
+        embed_dims: int = 768,
+        dim_head: int = 64,
+        heads: int = 16,
+        ffn_ratio: float = 4,
+    ) -> None:
+        super().__init__()
+        from .attention import FeedForward
+
+        self.ln0 = nn.LayerNorm(embed_dims)
+        self.ln1 = nn.LayerNorm(embed_dims)
+        self.attn = Attention(
+            query_dim=embed_dims,
+            dim_head=dim_head,
+            heads=heads,
+            out_bias=False,
+        )
+        self.ff = nn.Sequential(
+            nn.LayerNorm(embed_dims),
+            FeedForward(embed_dims, embed_dims, activation_fn="gelu", mult=ffn_ratio, bias=False),
+        )
+
+    def forward(self, x, latents, residual):
+        encoder_hidden_states = self.ln0(x)
+        latents = self.ln1(latents)
+        encoder_hidden_states = torch.cat([encoder_hidden_states, latents], dim=-2)
+        latents = self.attn(latents, encoder_hidden_states) + residual
+        latents = self.ff(latents) + latents
+        return latents
+
+
+class IPAdapterPlusImageProjection(nn.Module):
+    """Resampler of IP-Adapter Plus.
+
+    Args:
+        embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
+        that is the same
+            number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
+        hidden_dims (int):
+            The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
+        to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
+        Defaults to 16. num_queries (int):
+            The number of queries. Defaults to 8. ffn_ratio (float): The expansion ratio
+        of feedforward network hidden
+            layer channels. Defaults to 4.
+    """
+
+    def __init__(
+        self,
+        embed_dims: int = 768,
+        output_dims: int = 1024,
+        hidden_dims: int = 1280,
+        depth: int = 4,
+        dim_head: int = 64,
+        heads: int = 16,
+        num_queries: int = 8,
+        ffn_ratio: float = 4,
+    ) -> None:
+        super().__init__()
+        self.latents = nn.Parameter(torch.randn(1, num_queries, hidden_dims) / hidden_dims**0.5)
+
+        self.proj_in = nn.Linear(embed_dims, hidden_dims)
+
+        self.proj_out = nn.Linear(hidden_dims, output_dims)
+        self.norm_out = nn.LayerNorm(output_dims)
+
+        self.layers = nn.ModuleList(
+            [IPAdapterPlusImageProjectionBlock(hidden_dims, dim_head, heads, ffn_ratio) for _ in range(depth)]
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass.
+
+        Args:
+            x (torch.Tensor): Input Tensor.
+        Returns:
+            torch.Tensor: Output Tensor.
+        """
+        latents = self.latents.repeat(x.size(0), 1, 1)
+
+        x = self.proj_in(x)
+
+        for block in self.layers:
+            residual = latents
+            latents = block(x, latents, residual)
+
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+
+
+class IPAdapterFaceIDPlusImageProjection(nn.Module):
+    """FacePerceiverResampler of IP-Adapter Plus.
+
+    Args:
+        embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
+        that is the same
+            number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
+        hidden_dims (int):
+            The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
+        to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
+        Defaults to 16. num_tokens (int): Number of tokens num_queries (int): The number of queries. Defaults to 8.
+        ffn_ratio (float): The expansion ratio of feedforward network hidden
+            layer channels. Defaults to 4.
+        ffproj_ratio (float): The expansion ratio of feedforward network hidden
+            layer channels (for ID embeddings). Defaults to 4.
+    """
+
+    def __init__(
+        self,
+        embed_dims: int = 768,
+        output_dims: int = 768,
+        hidden_dims: int = 1280,
+        id_embeddings_dim: int = 512,
+        depth: int = 4,
+        dim_head: int = 64,
+        heads: int = 16,
+        num_tokens: int = 4,
+        num_queries: int = 8,
+        ffn_ratio: float = 4,
+        ffproj_ratio: int = 2,
+    ) -> None:
+        super().__init__()
+        from .attention import FeedForward
+
+        self.num_tokens = num_tokens
+        self.embed_dim = embed_dims
+        self.clip_embeds = None
+        self.shortcut = False
+        self.shortcut_scale = 1.0
+
+        self.proj = FeedForward(id_embeddings_dim, embed_dims * num_tokens, activation_fn="gelu", mult=ffproj_ratio)
+        self.norm = nn.LayerNorm(embed_dims)
+
+        self.proj_in = nn.Linear(hidden_dims, embed_dims)
+
+        self.proj_out = nn.Linear(embed_dims, output_dims)
+        self.norm_out = nn.LayerNorm(output_dims)
+
+        self.layers = nn.ModuleList(
+            [IPAdapterPlusImageProjectionBlock(embed_dims, dim_head, heads, ffn_ratio) for _ in range(depth)]
+        )
+
+    def forward(self, id_embeds: torch.Tensor) -> torch.Tensor:
+        """Forward pass.
+
+        Args:
+            id_embeds (torch.Tensor): Input Tensor (ID embeds).
+        Returns:
+            torch.Tensor: Output Tensor.
+        """
+        id_embeds = id_embeds.to(self.clip_embeds.dtype)
+        id_embeds = self.proj(id_embeds)
+        id_embeds = id_embeds.reshape(-1, self.num_tokens, self.embed_dim)
+        id_embeds = self.norm(id_embeds)
+        latents = id_embeds
+
+        clip_embeds = self.proj_in(self.clip_embeds)
+        x = clip_embeds.reshape(-1, clip_embeds.shape[2], clip_embeds.shape[3])
+
+        for block in self.layers:
+            residual = latents
+            latents = block(x, latents, residual)
+
+        latents = self.proj_out(latents)
+        out = self.norm_out(latents)
+        if self.shortcut:
+            out = id_embeds + self.shortcut_scale * out
+        return out
+
+
+class MultiIPAdapterImageProjection(nn.Module):
+    def __init__(self, IPAdapterImageProjectionLayers: Union[List[nn.Module], Tuple[nn.Module]]):
+        super().__init__()
+        self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers)
+
+    def forward(self, image_embeds: List[torch.Tensor]):
+        projected_image_embeds = []
+
+        # currently, we accept `image_embeds` as
+        #  1. a tensor (deprecated) with shape [batch_size, embed_dim] or [batch_size, sequence_length, embed_dim]
+        #  2. list of `n` tensors where `n` is number of ip-adapters, each tensor can hae shape [batch_size, num_images, embed_dim] or [batch_size, num_images, sequence_length, embed_dim]
+        if not isinstance(image_embeds, list):
+            deprecation_message = (
+                "You have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release."
+                " Please make sure to update your script to pass `image_embeds` as a list of tensors to suppress this warning."
+            )
+            deprecate("image_embeds not a list", "1.0.0", deprecation_message, standard_warn=False)
+            image_embeds = [image_embeds.unsqueeze(1)]
+
+        if len(image_embeds) != len(self.image_projection_layers):
+            raise ValueError(
+                f"image_embeds must have the same length as image_projection_layers, got {len(image_embeds)} and {len(self.image_projection_layers)}"
+            )
+
+        for image_embed, image_projection_layer in zip(image_embeds, self.image_projection_layers):
+            batch_size, num_images = image_embed.shape[0], image_embed.shape[1]
+            image_embed = image_embed.reshape((batch_size * num_images,) + image_embed.shape[2:])
+            image_embed = image_projection_layer(image_embed)
+            image_embed = image_embed.reshape((batch_size, num_images) + image_embed.shape[1:])
+
+            projected_image_embeds.append(image_embed)
+
+        return projected_image_embeds
diff --git a/diffusers/src/diffusers/models/embeddings_flax.py b/diffusers/src/diffusers/models/embeddings_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e343be0d3b71bfcc1f7fb0a1787ad2bc4916dd2
--- /dev/null
+++ b/diffusers/src/diffusers/models/embeddings_flax.py
@@ -0,0 +1,97 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import flax.linen as nn
+import jax.numpy as jnp
+
+
+def get_sinusoidal_embeddings(
+    timesteps: jnp.ndarray,
+    embedding_dim: int,
+    freq_shift: float = 1,
+    min_timescale: float = 1,
+    max_timescale: float = 1.0e4,
+    flip_sin_to_cos: bool = False,
+    scale: float = 1.0,
+) -> jnp.ndarray:
+    """Returns the positional encoding (same as Tensor2Tensor).
+
+    Args:
+        timesteps: a 1-D Tensor of N indices, one per batch element.
+        These may be fractional.
+        embedding_dim: The number of output channels.
+        min_timescale: The smallest time unit (should probably be 0.0).
+        max_timescale: The largest time unit.
+    Returns:
+        a Tensor of timing signals [N, num_channels]
+    """
+    assert timesteps.ndim == 1, "Timesteps should be a 1d-array"
+    assert embedding_dim % 2 == 0, f"Embedding dimension {embedding_dim} should be even"
+    num_timescales = float(embedding_dim // 2)
+    log_timescale_increment = math.log(max_timescale / min_timescale) / (num_timescales - freq_shift)
+    inv_timescales = min_timescale * jnp.exp(jnp.arange(num_timescales, dtype=jnp.float32) * -log_timescale_increment)
+    emb = jnp.expand_dims(timesteps, 1) * jnp.expand_dims(inv_timescales, 0)
+
+    # scale embeddings
+    scaled_time = scale * emb
+
+    if flip_sin_to_cos:
+        signal = jnp.concatenate([jnp.cos(scaled_time), jnp.sin(scaled_time)], axis=1)
+    else:
+        signal = jnp.concatenate([jnp.sin(scaled_time), jnp.cos(scaled_time)], axis=1)
+    signal = jnp.reshape(signal, [jnp.shape(timesteps)[0], embedding_dim])
+    return signal
+
+
+class FlaxTimestepEmbedding(nn.Module):
+    r"""
+    Time step Embedding Module. Learns embeddings for input time steps.
+
+    Args:
+        time_embed_dim (`int`, *optional*, defaults to `32`):
+                Time step embedding dimension
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+                Parameters `dtype`
+    """
+
+    time_embed_dim: int = 32
+    dtype: jnp.dtype = jnp.float32
+
+    @nn.compact
+    def __call__(self, temb):
+        temb = nn.Dense(self.time_embed_dim, dtype=self.dtype, name="linear_1")(temb)
+        temb = nn.silu(temb)
+        temb = nn.Dense(self.time_embed_dim, dtype=self.dtype, name="linear_2")(temb)
+        return temb
+
+
+class FlaxTimesteps(nn.Module):
+    r"""
+    Wrapper Module for sinusoidal Time step Embeddings as described in https://arxiv.org/abs/2006.11239
+
+    Args:
+        dim (`int`, *optional*, defaults to `32`):
+                Time step embedding dimension
+    """
+
+    dim: int = 32
+    flip_sin_to_cos: bool = False
+    freq_shift: float = 1
+
+    @nn.compact
+    def __call__(self, timesteps):
+        return get_sinusoidal_embeddings(
+            timesteps, embedding_dim=self.dim, flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.freq_shift
+        )
diff --git a/diffusers/src/diffusers/models/lora.py b/diffusers/src/diffusers/models/lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e9e0c07ca75bded9530df87d6daaa7d196b2756
--- /dev/null
+++ b/diffusers/src/diffusers/models/lora.py
@@ -0,0 +1,457 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# IMPORTANT:                                                      #
+###################################################################
+# ----------------------------------------------------------------#
+# This file is deprecated and will be removed soon                #
+# (as soon as PEFT will become a required dependency for LoRA)    #
+# ----------------------------------------------------------------#
+###################################################################
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ..utils import deprecate, logging
+from ..utils.import_utils import is_transformers_available
+
+
+if is_transformers_available():
+    from transformers import CLIPTextModel, CLIPTextModelWithProjection
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def text_encoder_attn_modules(text_encoder):
+    attn_modules = []
+
+    if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
+        for i, layer in enumerate(text_encoder.text_model.encoder.layers):
+            name = f"text_model.encoder.layers.{i}.self_attn"
+            mod = layer.self_attn
+            attn_modules.append((name, mod))
+    else:
+        raise ValueError(f"do not know how to get attention modules for: {text_encoder.__class__.__name__}")
+
+    return attn_modules
+
+
+def text_encoder_mlp_modules(text_encoder):
+    mlp_modules = []
+
+    if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
+        for i, layer in enumerate(text_encoder.text_model.encoder.layers):
+            mlp_mod = layer.mlp
+            name = f"text_model.encoder.layers.{i}.mlp"
+            mlp_modules.append((name, mlp_mod))
+    else:
+        raise ValueError(f"do not know how to get mlp modules for: {text_encoder.__class__.__name__}")
+
+    return mlp_modules
+
+
+def adjust_lora_scale_text_encoder(text_encoder, lora_scale: float = 1.0):
+    for _, attn_module in text_encoder_attn_modules(text_encoder):
+        if isinstance(attn_module.q_proj, PatchedLoraProjection):
+            attn_module.q_proj.lora_scale = lora_scale
+            attn_module.k_proj.lora_scale = lora_scale
+            attn_module.v_proj.lora_scale = lora_scale
+            attn_module.out_proj.lora_scale = lora_scale
+
+    for _, mlp_module in text_encoder_mlp_modules(text_encoder):
+        if isinstance(mlp_module.fc1, PatchedLoraProjection):
+            mlp_module.fc1.lora_scale = lora_scale
+            mlp_module.fc2.lora_scale = lora_scale
+
+
+class PatchedLoraProjection(torch.nn.Module):
+    def __init__(self, regular_linear_layer, lora_scale=1, network_alpha=None, rank=4, dtype=None):
+        deprecation_message = "Use of `PatchedLoraProjection` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`."
+        deprecate("PatchedLoraProjection", "1.0.0", deprecation_message)
+
+        super().__init__()
+        from ..models.lora import LoRALinearLayer
+
+        self.regular_linear_layer = regular_linear_layer
+
+        device = self.regular_linear_layer.weight.device
+
+        if dtype is None:
+            dtype = self.regular_linear_layer.weight.dtype
+
+        self.lora_linear_layer = LoRALinearLayer(
+            self.regular_linear_layer.in_features,
+            self.regular_linear_layer.out_features,
+            network_alpha=network_alpha,
+            device=device,
+            dtype=dtype,
+            rank=rank,
+        )
+
+        self.lora_scale = lora_scale
+
+    # overwrite PyTorch's `state_dict` to be sure that only the 'regular_linear_layer' weights are saved
+    # when saving the whole text encoder model and when LoRA is unloaded or fused
+    def state_dict(self, *args, destination=None, prefix="", keep_vars=False):
+        if self.lora_linear_layer is None:
+            return self.regular_linear_layer.state_dict(
+                *args, destination=destination, prefix=prefix, keep_vars=keep_vars
+            )
+
+        return super().state_dict(*args, destination=destination, prefix=prefix, keep_vars=keep_vars)
+
+    def _fuse_lora(self, lora_scale=1.0, safe_fusing=False):
+        if self.lora_linear_layer is None:
+            return
+
+        dtype, device = self.regular_linear_layer.weight.data.dtype, self.regular_linear_layer.weight.data.device
+
+        w_orig = self.regular_linear_layer.weight.data.float()
+        w_up = self.lora_linear_layer.up.weight.data.float()
+        w_down = self.lora_linear_layer.down.weight.data.float()
+
+        if self.lora_linear_layer.network_alpha is not None:
+            w_up = w_up * self.lora_linear_layer.network_alpha / self.lora_linear_layer.rank
+
+        fused_weight = w_orig + (lora_scale * torch.bmm(w_up[None, :], w_down[None, :])[0])
+
+        if safe_fusing and torch.isnan(fused_weight).any().item():
+            raise ValueError(
+                "This LoRA weight seems to be broken. "
+                f"Encountered NaN values when trying to fuse LoRA weights for {self}."
+                "LoRA weights will not be fused."
+            )
+
+        self.regular_linear_layer.weight.data = fused_weight.to(device=device, dtype=dtype)
+
+        # we can drop the lora layer now
+        self.lora_linear_layer = None
+
+        # offload the up and down matrices to CPU to not blow the memory
+        self.w_up = w_up.cpu()
+        self.w_down = w_down.cpu()
+        self.lora_scale = lora_scale
+
+    def _unfuse_lora(self):
+        if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None):
+            return
+
+        fused_weight = self.regular_linear_layer.weight.data
+        dtype, device = fused_weight.dtype, fused_weight.device
+
+        w_up = self.w_up.to(device=device).float()
+        w_down = self.w_down.to(device).float()
+
+        unfused_weight = fused_weight.float() - (self.lora_scale * torch.bmm(w_up[None, :], w_down[None, :])[0])
+        self.regular_linear_layer.weight.data = unfused_weight.to(device=device, dtype=dtype)
+
+        self.w_up = None
+        self.w_down = None
+
+    def forward(self, input):
+        if self.lora_scale is None:
+            self.lora_scale = 1.0
+        if self.lora_linear_layer is None:
+            return self.regular_linear_layer(input)
+        return self.regular_linear_layer(input) + (self.lora_scale * self.lora_linear_layer(input))
+
+
+class LoRALinearLayer(nn.Module):
+    r"""
+    A linear layer that is used with LoRA.
+
+    Parameters:
+        in_features (`int`):
+            Number of input features.
+        out_features (`int`):
+            Number of output features.
+        rank (`int`, `optional`, defaults to 4):
+            The rank of the LoRA layer.
+        network_alpha (`float`, `optional`, defaults to `None`):
+            The value of the network alpha used for stable learning and preventing underflow. This value has the same
+            meaning as the `--network_alpha` option in the kohya-ss trainer script. See
+            https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
+        device (`torch.device`, `optional`, defaults to `None`):
+            The device to use for the layer's weights.
+        dtype (`torch.dtype`, `optional`, defaults to `None`):
+            The dtype to use for the layer's weights.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        rank: int = 4,
+        network_alpha: Optional[float] = None,
+        device: Optional[Union[torch.device, str]] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        super().__init__()
+
+        deprecation_message = "Use of `LoRALinearLayer` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`."
+        deprecate("LoRALinearLayer", "1.0.0", deprecation_message)
+
+        self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
+        self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
+        # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
+        # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
+        self.network_alpha = network_alpha
+        self.rank = rank
+        self.out_features = out_features
+        self.in_features = in_features
+
+        nn.init.normal_(self.down.weight, std=1 / rank)
+        nn.init.zeros_(self.up.weight)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_dtype = hidden_states.dtype
+        dtype = self.down.weight.dtype
+
+        down_hidden_states = self.down(hidden_states.to(dtype))
+        up_hidden_states = self.up(down_hidden_states)
+
+        if self.network_alpha is not None:
+            up_hidden_states *= self.network_alpha / self.rank
+
+        return up_hidden_states.to(orig_dtype)
+
+
+class LoRAConv2dLayer(nn.Module):
+    r"""
+    A convolutional layer that is used with LoRA.
+
+    Parameters:
+        in_features (`int`):
+            Number of input features.
+        out_features (`int`):
+            Number of output features.
+        rank (`int`, `optional`, defaults to 4):
+            The rank of the LoRA layer.
+        kernel_size (`int` or `tuple` of two `int`, `optional`, defaults to 1):
+            The kernel size of the convolution.
+        stride (`int` or `tuple` of two `int`, `optional`, defaults to 1):
+            The stride of the convolution.
+        padding (`int` or `tuple` of two `int` or `str`, `optional`, defaults to 0):
+            The padding of the convolution.
+        network_alpha (`float`, `optional`, defaults to `None`):
+            The value of the network alpha used for stable learning and preventing underflow. This value has the same
+            meaning as the `--network_alpha` option in the kohya-ss trainer script. See
+            https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        rank: int = 4,
+        kernel_size: Union[int, Tuple[int, int]] = (1, 1),
+        stride: Union[int, Tuple[int, int]] = (1, 1),
+        padding: Union[int, Tuple[int, int], str] = 0,
+        network_alpha: Optional[float] = None,
+    ):
+        super().__init__()
+
+        deprecation_message = "Use of `LoRAConv2dLayer` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`."
+        deprecate("LoRAConv2dLayer", "1.0.0", deprecation_message)
+
+        self.down = nn.Conv2d(in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
+        # according to the official kohya_ss trainer kernel_size are always fixed for the up layer
+        # # see: https://github.com/bmaltais/kohya_ss/blob/2accb1305979ba62f5077a23aabac23b4c37e935/networks/lora_diffusers.py#L129
+        self.up = nn.Conv2d(rank, out_features, kernel_size=(1, 1), stride=(1, 1), bias=False)
+
+        # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
+        # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
+        self.network_alpha = network_alpha
+        self.rank = rank
+
+        nn.init.normal_(self.down.weight, std=1 / rank)
+        nn.init.zeros_(self.up.weight)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_dtype = hidden_states.dtype
+        dtype = self.down.weight.dtype
+
+        down_hidden_states = self.down(hidden_states.to(dtype))
+        up_hidden_states = self.up(down_hidden_states)
+
+        if self.network_alpha is not None:
+            up_hidden_states *= self.network_alpha / self.rank
+
+        return up_hidden_states.to(orig_dtype)
+
+
+class LoRACompatibleConv(nn.Conv2d):
+    """
+    A convolutional layer that can be used with LoRA.
+    """
+
+    def __init__(self, *args, lora_layer: Optional[LoRAConv2dLayer] = None, **kwargs):
+        deprecation_message = "Use of `LoRACompatibleConv` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`."
+        deprecate("LoRACompatibleConv", "1.0.0", deprecation_message)
+
+        super().__init__(*args, **kwargs)
+        self.lora_layer = lora_layer
+
+    def set_lora_layer(self, lora_layer: Optional[LoRAConv2dLayer]):
+        deprecation_message = "Use of `set_lora_layer()` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`."
+        deprecate("set_lora_layer", "1.0.0", deprecation_message)
+
+        self.lora_layer = lora_layer
+
+    def _fuse_lora(self, lora_scale: float = 1.0, safe_fusing: bool = False):
+        if self.lora_layer is None:
+            return
+
+        dtype, device = self.weight.data.dtype, self.weight.data.device
+
+        w_orig = self.weight.data.float()
+        w_up = self.lora_layer.up.weight.data.float()
+        w_down = self.lora_layer.down.weight.data.float()
+
+        if self.lora_layer.network_alpha is not None:
+            w_up = w_up * self.lora_layer.network_alpha / self.lora_layer.rank
+
+        fusion = torch.mm(w_up.flatten(start_dim=1), w_down.flatten(start_dim=1))
+        fusion = fusion.reshape((w_orig.shape))
+        fused_weight = w_orig + (lora_scale * fusion)
+
+        if safe_fusing and torch.isnan(fused_weight).any().item():
+            raise ValueError(
+                "This LoRA weight seems to be broken. "
+                f"Encountered NaN values when trying to fuse LoRA weights for {self}."
+                "LoRA weights will not be fused."
+            )
+
+        self.weight.data = fused_weight.to(device=device, dtype=dtype)
+
+        # we can drop the lora layer now
+        self.lora_layer = None
+
+        # offload the up and down matrices to CPU to not blow the memory
+        self.w_up = w_up.cpu()
+        self.w_down = w_down.cpu()
+        self._lora_scale = lora_scale
+
+    def _unfuse_lora(self):
+        if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None):
+            return
+
+        fused_weight = self.weight.data
+        dtype, device = fused_weight.data.dtype, fused_weight.data.device
+
+        self.w_up = self.w_up.to(device=device).float()
+        self.w_down = self.w_down.to(device).float()
+
+        fusion = torch.mm(self.w_up.flatten(start_dim=1), self.w_down.flatten(start_dim=1))
+        fusion = fusion.reshape((fused_weight.shape))
+        unfused_weight = fused_weight.float() - (self._lora_scale * fusion)
+        self.weight.data = unfused_weight.to(device=device, dtype=dtype)
+
+        self.w_up = None
+        self.w_down = None
+
+    def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+        if self.padding_mode != "zeros":
+            hidden_states = F.pad(hidden_states, self._reversed_padding_repeated_twice, mode=self.padding_mode)
+            padding = (0, 0)
+        else:
+            padding = self.padding
+
+        original_outputs = F.conv2d(
+            hidden_states, self.weight, self.bias, self.stride, padding, self.dilation, self.groups
+        )
+
+        if self.lora_layer is None:
+            return original_outputs
+        else:
+            return original_outputs + (scale * self.lora_layer(hidden_states))
+
+
+class LoRACompatibleLinear(nn.Linear):
+    """
+    A Linear layer that can be used with LoRA.
+    """
+
+    def __init__(self, *args, lora_layer: Optional[LoRALinearLayer] = None, **kwargs):
+        deprecation_message = "Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`."
+        deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)
+
+        super().__init__(*args, **kwargs)
+        self.lora_layer = lora_layer
+
+    def set_lora_layer(self, lora_layer: Optional[LoRALinearLayer]):
+        deprecation_message = "Use of `set_lora_layer()` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`."
+        deprecate("set_lora_layer", "1.0.0", deprecation_message)
+        self.lora_layer = lora_layer
+
+    def _fuse_lora(self, lora_scale: float = 1.0, safe_fusing: bool = False):
+        if self.lora_layer is None:
+            return
+
+        dtype, device = self.weight.data.dtype, self.weight.data.device
+
+        w_orig = self.weight.data.float()
+        w_up = self.lora_layer.up.weight.data.float()
+        w_down = self.lora_layer.down.weight.data.float()
+
+        if self.lora_layer.network_alpha is not None:
+            w_up = w_up * self.lora_layer.network_alpha / self.lora_layer.rank
+
+        fused_weight = w_orig + (lora_scale * torch.bmm(w_up[None, :], w_down[None, :])[0])
+
+        if safe_fusing and torch.isnan(fused_weight).any().item():
+            raise ValueError(
+                "This LoRA weight seems to be broken. "
+                f"Encountered NaN values when trying to fuse LoRA weights for {self}."
+                "LoRA weights will not be fused."
+            )
+
+        self.weight.data = fused_weight.to(device=device, dtype=dtype)
+
+        # we can drop the lora layer now
+        self.lora_layer = None
+
+        # offload the up and down matrices to CPU to not blow the memory
+        self.w_up = w_up.cpu()
+        self.w_down = w_down.cpu()
+        self._lora_scale = lora_scale
+
+    def _unfuse_lora(self):
+        if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None):
+            return
+
+        fused_weight = self.weight.data
+        dtype, device = fused_weight.dtype, fused_weight.device
+
+        w_up = self.w_up.to(device=device).float()
+        w_down = self.w_down.to(device).float()
+
+        unfused_weight = fused_weight.float() - (self._lora_scale * torch.bmm(w_up[None, :], w_down[None, :])[0])
+        self.weight.data = unfused_weight.to(device=device, dtype=dtype)
+
+        self.w_up = None
+        self.w_down = None
+
+    def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+        if self.lora_layer is None:
+            out = super().forward(hidden_states)
+            return out
+        else:
+            out = super().forward(hidden_states) + (scale * self.lora_layer(hidden_states))
+            return out
diff --git a/diffusers/src/diffusers/models/model_loading_utils.py b/diffusers/src/diffusers/models/model_loading_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..969eb5f5fa373c55e8df06e6df74c688c7a7940e
--- /dev/null
+++ b/diffusers/src/diffusers/models/model_loading_utils.py
@@ -0,0 +1,230 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+import os
+from collections import OrderedDict
+from pathlib import Path
+from typing import List, Optional, Union
+
+import safetensors
+import torch
+from huggingface_hub.utils import EntryNotFoundError
+
+from ..utils import (
+    SAFE_WEIGHTS_INDEX_NAME,
+    SAFETENSORS_FILE_EXTENSION,
+    WEIGHTS_INDEX_NAME,
+    _add_variant,
+    _get_model_file,
+    is_accelerate_available,
+    is_torch_version,
+    logging,
+)
+
+
+logger = logging.get_logger(__name__)
+
+_CLASS_REMAPPING_DICT = {
+    "Transformer2DModel": {
+        "ada_norm_zero": "DiTTransformer2DModel",
+        "ada_norm_single": "PixArtTransformer2DModel",
+    }
+}
+
+
+if is_accelerate_available():
+    from accelerate import infer_auto_device_map
+    from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device
+
+
+# Adapted from `transformers` (see modeling_utils.py)
+def _determine_device_map(model: torch.nn.Module, device_map, max_memory, torch_dtype):
+    if isinstance(device_map, str):
+        no_split_modules = model._get_no_split_modules(device_map)
+        device_map_kwargs = {"no_split_module_classes": no_split_modules}
+
+        if device_map != "sequential":
+            max_memory = get_balanced_memory(
+                model,
+                dtype=torch_dtype,
+                low_zero=(device_map == "balanced_low_0"),
+                max_memory=max_memory,
+                **device_map_kwargs,
+            )
+        else:
+            max_memory = get_max_memory(max_memory)
+
+        device_map_kwargs["max_memory"] = max_memory
+        device_map = infer_auto_device_map(model, dtype=torch_dtype, **device_map_kwargs)
+
+    return device_map
+
+
+def _fetch_remapped_cls_from_config(config, old_class):
+    previous_class_name = old_class.__name__
+    remapped_class_name = _CLASS_REMAPPING_DICT.get(previous_class_name).get(config["norm_type"], None)
+
+    # Details:
+    # https://github.com/huggingface/diffusers/pull/7647#discussion_r1621344818
+    if remapped_class_name:
+        # load diffusers library to import compatible and original scheduler
+        diffusers_library = importlib.import_module(__name__.split(".")[0])
+        remapped_class = getattr(diffusers_library, remapped_class_name)
+        logger.info(
+            f"Changing class object to be of `{remapped_class_name}` type from `{previous_class_name}` type."
+            f"This is because `{previous_class_name}` is scheduled to be deprecated in a future version. Note that this"
+            " DOESN'T affect the final results."
+        )
+        return remapped_class
+    else:
+        return old_class
+
+
+def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None):
+    """
+    Reads a checkpoint file, returning properly formatted errors if they arise.
+    """
+    try:
+        file_extension = os.path.basename(checkpoint_file).split(".")[-1]
+        if file_extension == SAFETENSORS_FILE_EXTENSION:
+            return safetensors.torch.load_file(checkpoint_file, device="cpu")
+        else:
+            weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {}
+            return torch.load(
+                checkpoint_file,
+                map_location="cpu",
+                **weights_only_kwarg,
+            )
+    except Exception as e:
+        try:
+            with open(checkpoint_file) as f:
+                if f.read().startswith("version"):
+                    raise OSError(
+                        "You seem to have cloned a repository without having git-lfs installed. Please install "
+                        "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
+                        "you cloned."
+                    )
+                else:
+                    raise ValueError(
+                        f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
+                        "model. Make sure you have saved the model properly."
+                    ) from e
+        except (UnicodeDecodeError, ValueError):
+            raise OSError(
+                f"Unable to load weights from checkpoint file for '{checkpoint_file}' " f"at '{checkpoint_file}'. "
+            )
+
+
+def load_model_dict_into_meta(
+    model,
+    state_dict: OrderedDict,
+    device: Optional[Union[str, torch.device]] = None,
+    dtype: Optional[Union[str, torch.dtype]] = None,
+    model_name_or_path: Optional[str] = None,
+) -> List[str]:
+    device = device or torch.device("cpu")
+    dtype = dtype or torch.float32
+
+    accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
+
+    unexpected_keys = []
+    empty_state_dict = model.state_dict()
+    for param_name, param in state_dict.items():
+        if param_name not in empty_state_dict:
+            unexpected_keys.append(param_name)
+            continue
+
+        if empty_state_dict[param_name].shape != param.shape:
+            model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else ""
+            raise ValueError(
+                f"Cannot load {model_name_or_path_str}because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
+            )
+
+        if accepts_dtype:
+            set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype)
+        else:
+            set_module_tensor_to_device(model, param_name, device, value=param)
+    return unexpected_keys
+
+
+def _load_state_dict_into_model(model_to_load, state_dict: OrderedDict) -> List[str]:
+    # Convert old format to new format if needed from a PyTorch state_dict
+    # copy state_dict so _load_from_state_dict can modify it
+    state_dict = state_dict.copy()
+    error_msgs = []
+
+    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
+    # so we need to apply the function recursively.
+    def load(module: torch.nn.Module, prefix: str = ""):
+        args = (state_dict, prefix, {}, True, [], [], error_msgs)
+        module._load_from_state_dict(*args)
+
+        for name, child in module._modules.items():
+            if child is not None:
+                load(child, prefix + name + ".")
+
+    load(model_to_load)
+
+    return error_msgs
+
+
+def _fetch_index_file(
+    is_local,
+    pretrained_model_name_or_path,
+    subfolder,
+    use_safetensors,
+    cache_dir,
+    variant,
+    force_download,
+    proxies,
+    local_files_only,
+    token,
+    revision,
+    user_agent,
+    commit_hash,
+):
+    if is_local:
+        index_file = Path(
+            pretrained_model_name_or_path,
+            subfolder or "",
+            _add_variant(SAFE_WEIGHTS_INDEX_NAME if use_safetensors else WEIGHTS_INDEX_NAME, variant),
+        )
+    else:
+        index_file_in_repo = Path(
+            subfolder or "",
+            _add_variant(SAFE_WEIGHTS_INDEX_NAME if use_safetensors else WEIGHTS_INDEX_NAME, variant),
+        ).as_posix()
+        try:
+            index_file = _get_model_file(
+                pretrained_model_name_or_path,
+                weights_name=index_file_in_repo,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                subfolder=None,
+                user_agent=user_agent,
+                commit_hash=commit_hash,
+            )
+            index_file = Path(index_file)
+        except (EntryNotFoundError, EnvironmentError):
+            index_file = None
+
+    return index_file
diff --git a/diffusers/src/diffusers/models/modeling_flax_pytorch_utils.py b/diffusers/src/diffusers/models/modeling_flax_pytorch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db537f54b94ae0250b637e883471fe39a166b80
--- /dev/null
+++ b/diffusers/src/diffusers/models/modeling_flax_pytorch_utils.py
@@ -0,0 +1,135 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch - Flax general utilities."""
+
+import re
+
+import jax.numpy as jnp
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax.random import PRNGKey
+
+from ..utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def rename_key(key):
+    regex = r"\w+[.]\d+"
+    pats = re.findall(regex, key)
+    for pat in pats:
+        key = key.replace(pat, "_".join(pat.split(".")))
+    return key
+
+
+#####################
+# PyTorch => Flax #
+#####################
+
+
+# Adapted from https://github.com/huggingface/transformers/blob/c603c80f46881ae18b2ca50770ef65fa4033eacd/src/transformers/modeling_flax_pytorch_utils.py#L69
+# and https://github.com/patil-suraj/stable-diffusion-jax/blob/main/stable_diffusion_jax/convert_diffusers_to_jax.py
+def rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict):
+    """Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary"""
+    # conv norm or layer norm
+    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
+
+    # rename attention layers
+    if len(pt_tuple_key) > 1:
+        for rename_from, rename_to in (
+            ("to_out_0", "proj_attn"),
+            ("to_k", "key"),
+            ("to_v", "value"),
+            ("to_q", "query"),
+        ):
+            if pt_tuple_key[-2] == rename_from:
+                weight_name = pt_tuple_key[-1]
+                weight_name = "kernel" if weight_name == "weight" else weight_name
+                renamed_pt_tuple_key = pt_tuple_key[:-2] + (rename_to, weight_name)
+                if renamed_pt_tuple_key in random_flax_state_dict:
+                    assert random_flax_state_dict[renamed_pt_tuple_key].shape == pt_tensor.T.shape
+                    return renamed_pt_tuple_key, pt_tensor.T
+
+    if (
+        any("norm" in str_ for str_ in pt_tuple_key)
+        and (pt_tuple_key[-1] == "bias")
+        and (pt_tuple_key[:-1] + ("bias",) not in random_flax_state_dict)
+        and (pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict)
+    ):
+        renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
+        return renamed_pt_tuple_key, pt_tensor
+    elif pt_tuple_key[-1] in ["weight", "gamma"] and pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict:
+        renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
+        return renamed_pt_tuple_key, pt_tensor
+
+    # embedding
+    if pt_tuple_key[-1] == "weight" and pt_tuple_key[:-1] + ("embedding",) in random_flax_state_dict:
+        pt_tuple_key = pt_tuple_key[:-1] + ("embedding",)
+        return renamed_pt_tuple_key, pt_tensor
+
+    # conv layer
+    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
+    if pt_tuple_key[-1] == "weight" and pt_tensor.ndim == 4:
+        pt_tensor = pt_tensor.transpose(2, 3, 1, 0)
+        return renamed_pt_tuple_key, pt_tensor
+
+    # linear layer
+    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
+    if pt_tuple_key[-1] == "weight":
+        pt_tensor = pt_tensor.T
+        return renamed_pt_tuple_key, pt_tensor
+
+    # old PyTorch layer norm weight
+    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("weight",)
+    if pt_tuple_key[-1] == "gamma":
+        return renamed_pt_tuple_key, pt_tensor
+
+    # old PyTorch layer norm bias
+    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("bias",)
+    if pt_tuple_key[-1] == "beta":
+        return renamed_pt_tuple_key, pt_tensor
+
+    return pt_tuple_key, pt_tensor
+
+
+def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model, init_key=42):
+    # Step 1: Convert pytorch tensor to numpy
+    pt_state_dict = {k: v.numpy() for k, v in pt_state_dict.items()}
+
+    # Step 2: Since the model is stateless, get random Flax params
+    random_flax_params = flax_model.init_weights(PRNGKey(init_key))
+
+    random_flax_state_dict = flatten_dict(random_flax_params)
+    flax_state_dict = {}
+
+    # Need to change some parameters name to match Flax names
+    for pt_key, pt_tensor in pt_state_dict.items():
+        renamed_pt_key = rename_key(pt_key)
+        pt_tuple_key = tuple(renamed_pt_key.split("."))
+
+        # Correctly rename weight parameters
+        flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict)
+
+        if flax_key in random_flax_state_dict:
+            if flax_tensor.shape != random_flax_state_dict[flax_key].shape:
+                raise ValueError(
+                    f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape "
+                    f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}."
+                )
+
+        # also add unexpected weight so that warning is thrown
+        flax_state_dict[flax_key] = jnp.asarray(flax_tensor)
+
+    return unflatten_dict(flax_state_dict)
diff --git a/diffusers/src/diffusers/models/modeling_flax_utils.py b/diffusers/src/diffusers/models/modeling_flax_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c35fab0fc16b640907f5f3d1d36401b793fe01a
--- /dev/null
+++ b/diffusers/src/diffusers/models/modeling_flax_utils.py
@@ -0,0 +1,561 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from pickle import UnpicklingError
+from typing import Any, Dict, Union
+
+import jax
+import jax.numpy as jnp
+import msgpack.exceptions
+from flax.core.frozen_dict import FrozenDict, unfreeze
+from flax.serialization import from_bytes, to_bytes
+from flax.traverse_util import flatten_dict, unflatten_dict
+from huggingface_hub import create_repo, hf_hub_download
+from huggingface_hub.utils import (
+    EntryNotFoundError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+    validate_hf_hub_args,
+)
+from requests import HTTPError
+
+from .. import __version__, is_torch_available
+from ..utils import (
+    CONFIG_NAME,
+    FLAX_WEIGHTS_NAME,
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    WEIGHTS_NAME,
+    PushToHubMixin,
+    logging,
+)
+from .modeling_flax_pytorch_utils import convert_pytorch_state_dict_to_flax
+
+
+logger = logging.get_logger(__name__)
+
+
+class FlaxModelMixin(PushToHubMixin):
+    r"""
+    Base class for all Flax models.
+
+    [`FlaxModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and
+    saving models.
+
+        - **config_name** ([`str`]) -- Filename to save a model to when calling [`~FlaxModelMixin.save_pretrained`].
+    """
+
+    config_name = CONFIG_NAME
+    _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
+    _flax_internal_args = ["name", "parent", "dtype"]
+
+    @classmethod
+    def _from_config(cls, config, **kwargs):
+        """
+        All context managers that the model should be initialized under go here.
+        """
+        return cls(config, **kwargs)
+
+    def _cast_floating_to(self, params: Union[Dict, FrozenDict], dtype: jnp.dtype, mask: Any = None) -> Any:
+        """
+        Helper method to cast floating-point values of given parameter `PyTree` to given `dtype`.
+        """
+
+        # taken from https://github.com/deepmind/jmp/blob/3a8318abc3292be38582794dbf7b094e6583b192/jmp/_src/policy.py#L27
+        def conditional_cast(param):
+            if isinstance(param, jnp.ndarray) and jnp.issubdtype(param.dtype, jnp.floating):
+                param = param.astype(dtype)
+            return param
+
+        if mask is None:
+            return jax.tree_map(conditional_cast, params)
+
+        flat_params = flatten_dict(params)
+        flat_mask, _ = jax.tree_flatten(mask)
+
+        for masked, key in zip(flat_mask, flat_params.keys()):
+            if masked:
+                param = flat_params[key]
+                flat_params[key] = conditional_cast(param)
+
+        return unflatten_dict(flat_params)
+
+    def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None):
+        r"""
+        Cast the floating-point `params` to `jax.numpy.bfloat16`. This returns a new `params` tree and does not cast
+        the `params` in place.
+
+        This method can be used on a TPU to explicitly convert the model parameters to bfloat16 precision to do full
+        half-precision training or to save weights in bfloat16 for inference in order to save memory and improve speed.
+
+        Arguments:
+            params (`Union[Dict, FrozenDict]`):
+                A `PyTree` of model parameters.
+            mask (`Union[Dict, FrozenDict]`):
+                A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True`
+                for params you want to cast, and `False` for those you want to skip.
+
+        Examples:
+
+        ```python
+        >>> from diffusers import FlaxUNet2DConditionModel
+
+        >>> # load model
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
+        >>> params = model.to_bf16(params)
+        >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
+        >>> # then pass the mask as follows
+        >>> from flax import traverse_util
+
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> flat_params = traverse_util.flatten_dict(params)
+        >>> mask = {
+        ...     path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
+        ...     for path in flat_params
+        ... }
+        >>> mask = traverse_util.unflatten_dict(mask)
+        >>> params = model.to_bf16(params, mask)
+        ```"""
+        return self._cast_floating_to(params, jnp.bfloat16, mask)
+
+    def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):
+        r"""
+        Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the
+        model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in place.
+
+        Arguments:
+            params (`Union[Dict, FrozenDict]`):
+                A `PyTree` of model parameters.
+            mask (`Union[Dict, FrozenDict]`):
+                A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True`
+                for params you want to cast, and `False` for those you want to skip.
+
+        Examples:
+
+        ```python
+        >>> from diffusers import FlaxUNet2DConditionModel
+
+        >>> # Download model and configuration from huggingface.co
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> # By default, the model params will be in fp32, to illustrate the use of this method,
+        >>> # we'll first cast to fp16 and back to fp32
+        >>> params = model.to_f16(params)
+        >>> # now cast back to fp32
+        >>> params = model.to_fp32(params)
+        ```"""
+        return self._cast_floating_to(params, jnp.float32, mask)
+
+    def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None):
+        r"""
+        Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the
+        `params` in place.
+
+        This method can be used on a GPU to explicitly convert the model parameters to float16 precision to do full
+        half-precision training or to save weights in float16 for inference in order to save memory and improve speed.
+
+        Arguments:
+            params (`Union[Dict, FrozenDict]`):
+                A `PyTree` of model parameters.
+            mask (`Union[Dict, FrozenDict]`):
+                A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True`
+                for params you want to cast, and `False` for those you want to skip.
+
+        Examples:
+
+        ```python
+        >>> from diffusers import FlaxUNet2DConditionModel
+
+        >>> # load model
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> # By default, the model params will be in fp32, to cast these to float16
+        >>> params = model.to_fp16(params)
+        >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale)
+        >>> # then pass the mask as follows
+        >>> from flax import traverse_util
+
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> flat_params = traverse_util.flatten_dict(params)
+        >>> mask = {
+        ...     path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
+        ...     for path in flat_params
+        ... }
+        >>> mask = traverse_util.unflatten_dict(mask)
+        >>> params = model.to_fp16(params, mask)
+        ```"""
+        return self._cast_floating_to(params, jnp.float16, mask)
+
+    def init_weights(self, rng: jax.Array) -> Dict:
+        raise NotImplementedError(f"init_weights method has to be implemented for {self}")
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        dtype: jnp.dtype = jnp.float32,
+        *model_args,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a pretrained Flax model from a pretrained model configuration.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                Can be either:
+
+                    - A string, the *model id* (for example `runwayml/stable-diffusion-v1-5`) of a pretrained model
+                      hosted on the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      using [`~FlaxModelMixin.save_pretrained`].
+            dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+                The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+                `jax.numpy.bfloat16` (on TPUs).
+
+                This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+                specified, all the computation will be performed with the given `dtype`.
+
+                <Tip>
+
+                This only specifies the dtype of the *computation* and does not influence the dtype of model
+                parameters.
+
+                If you wish to change the dtype of the model parameters, see [`~FlaxModelMixin.to_fp16`] and
+                [`~FlaxModelMixin.to_bf16`].
+
+                </Tip>
+
+            model_args (sequence of positional arguments, *optional*):
+                All remaining positional arguments are passed to the underlying model's `__init__` method.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only(`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            from_pt (`bool`, *optional*, defaults to `False`):
+                Load the model weights from a PyTorch checkpoint save file.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it is loaded) and initiate the model (for
+                example, `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
+                automatically loaded:
+
+                    - If a configuration is provided with `config`, `kwargs` are directly passed to the underlying
+                      model's `__init__` method (we assume all relevant updates to the configuration have already been
+                      done).
+                    - If a configuration is not provided, `kwargs` are first passed to the configuration class
+                      initialization function [`~ConfigMixin.from_config`]. Each key of the `kwargs` that corresponds
+                      to a configuration attribute is used to override said attribute with the supplied `kwargs` value.
+                      Remaining keys that do not correspond to any configuration attribute are passed to the underlying
+                      model's `__init__` function.
+
+        Examples:
+
+        ```python
+        >>> from diffusers import FlaxUNet2DConditionModel
+
+        >>> # Download model and configuration from huggingface.co and cache.
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("./test/saved_model/")
+        ```
+
+        If you get the error message below, you need to finetune the weights for your downstream task:
+
+        ```bash
+        Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
+        - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
+        You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+        ```
+        """
+        config = kwargs.pop("config", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        from_pt = kwargs.pop("from_pt", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+
+        user_agent = {
+            "diffusers": __version__,
+            "file_type": "model",
+            "framework": "flax",
+        }
+
+        # Load config if we don't provide one
+        if config is None:
+            config, unused_kwargs = cls.load_config(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                return_unused_kwargs=True,
+                force_download=force_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                subfolder=subfolder,
+                **kwargs,
+            )
+
+        model, model_kwargs = cls.from_config(config, dtype=dtype, return_unused_kwargs=True, **unused_kwargs)
+
+        # Load model
+        pretrained_path_with_subfolder = (
+            pretrained_model_name_or_path
+            if subfolder is None
+            else os.path.join(pretrained_model_name_or_path, subfolder)
+        )
+        if os.path.isdir(pretrained_path_with_subfolder):
+            if from_pt:
+                if not os.path.isfile(os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)):
+                    raise EnvironmentError(
+                        f"Error no file named {WEIGHTS_NAME} found in directory {pretrained_path_with_subfolder} "
+                    )
+                model_file = os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)
+            elif os.path.isfile(os.path.join(pretrained_path_with_subfolder, FLAX_WEIGHTS_NAME)):
+                # Load from a Flax checkpoint
+                model_file = os.path.join(pretrained_path_with_subfolder, FLAX_WEIGHTS_NAME)
+            # Check if pytorch weights exist instead
+            elif os.path.isfile(os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)):
+                raise EnvironmentError(
+                    f"{WEIGHTS_NAME} file found in directory {pretrained_path_with_subfolder}. Please load the model"
+                    " using `from_pt=True`."
+                )
+            else:
+                raise EnvironmentError(
+                    f"Error no file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME} found in directory "
+                    f"{pretrained_path_with_subfolder}."
+                )
+        else:
+            try:
+                model_file = hf_hub_download(
+                    pretrained_model_name_or_path,
+                    filename=FLAX_WEIGHTS_NAME if not from_pt else WEIGHTS_NAME,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    subfolder=subfolder,
+                    revision=revision,
+                )
+
+            except RepositoryNotFoundError:
+                raise EnvironmentError(
+                    f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
+                    "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
+                    "token having permission to this repo with `token` or log in with `huggingface-cli "
+                    "login`."
+                )
+            except RevisionNotFoundError:
+                raise EnvironmentError(
+                    f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
+                    "this model name. Check the model page at "
+                    f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+                )
+            except EntryNotFoundError:
+                raise EnvironmentError(
+                    f"{pretrained_model_name_or_path} does not appear to have a file named {FLAX_WEIGHTS_NAME}."
+                )
+            except HTTPError as err:
+                raise EnvironmentError(
+                    f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n"
+                    f"{err}"
+                )
+            except ValueError:
+                raise EnvironmentError(
+                    f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
+                    f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
+                    f" directory containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}.\nCheckout your"
+                    " internet connection or see how to run the library in offline mode at"
+                    " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
+                )
+            except EnvironmentError:
+                raise EnvironmentError(
+                    f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+                    "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+                    f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+                    f"containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}."
+                )
+
+        if from_pt:
+            if is_torch_available():
+                from .modeling_utils import load_state_dict
+            else:
+                raise EnvironmentError(
+                    "Can't load the model in PyTorch format because PyTorch is not installed. "
+                    "Please, install PyTorch or use native Flax weights."
+                )
+
+            # Step 1: Get the pytorch file
+            pytorch_model_file = load_state_dict(model_file)
+
+            # Step 2: Convert the weights
+            state = convert_pytorch_state_dict_to_flax(pytorch_model_file, model)
+        else:
+            try:
+                with open(model_file, "rb") as state_f:
+                    state = from_bytes(cls, state_f.read())
+            except (UnpicklingError, msgpack.exceptions.ExtraData) as e:
+                try:
+                    with open(model_file) as f:
+                        if f.read().startswith("version"):
+                            raise OSError(
+                                "You seem to have cloned a repository without having git-lfs installed. Please"
+                                " install git-lfs and run `git lfs install` followed by `git lfs pull` in the"
+                                " folder you cloned."
+                            )
+                        else:
+                            raise ValueError from e
+                except (UnicodeDecodeError, ValueError):
+                    raise EnvironmentError(f"Unable to convert {model_file} to Flax deserializable object. ")
+            # make sure all arrays are stored as jnp.ndarray
+            # NOTE: This is to prevent a bug this will be fixed in Flax >= v0.3.4:
+            # https://github.com/google/flax/issues/1261
+        state = jax.tree_util.tree_map(lambda x: jax.device_put(x, jax.local_devices(backend="cpu")[0]), state)
+
+        # flatten dicts
+        state = flatten_dict(state)
+
+        params_shape_tree = jax.eval_shape(model.init_weights, rng=jax.random.PRNGKey(0))
+        required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys())
+
+        shape_state = flatten_dict(unfreeze(params_shape_tree))
+
+        missing_keys = required_params - set(state.keys())
+        unexpected_keys = set(state.keys()) - required_params
+
+        if missing_keys:
+            logger.warning(
+                f"The checkpoint {pretrained_model_name_or_path} is missing required keys: {missing_keys}. "
+                "Make sure to call model.init_weights to initialize the missing weights."
+            )
+            cls._missing_keys = missing_keys
+
+        for key in state.keys():
+            if key in shape_state and state[key].shape != shape_state[key].shape:
+                raise ValueError(
+                    f"Trying to load the pretrained weight for {key} failed: checkpoint has shape "
+                    f"{state[key].shape} which is incompatible with the model shape {shape_state[key].shape}. "
+                )
+
+        # remove unexpected keys to not be saved again
+        for unexpected_key in unexpected_keys:
+            del state[unexpected_key]
+
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
+                f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
+                f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or"
+                " with another architecture."
+            )
+        else:
+            logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
+
+        if len(missing_keys) > 0:
+            logger.warning(
+                f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
+                " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
+            )
+        else:
+            logger.info(
+                f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint"
+                f" was trained on, you can already use {model.__class__.__name__} for predictions without further"
+                " training."
+            )
+
+        return model, unflatten_dict(state)
+
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        params: Union[Dict, FrozenDict],
+        is_main_process: bool = True,
+        push_to_hub: bool = False,
+        **kwargs,
+    ):
+        """
+        Save a model and its configuration file to a directory so that it can be reloaded using the
+        [`~FlaxModelMixin.from_pretrained`] class method.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save a model and its configuration file to. Will be created if it doesn't exist.
+            params (`Union[Dict, FrozenDict]`):
+                A `PyTree` of model parameters.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        if os.path.isfile(save_directory):
+            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+            return
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            private = kwargs.pop("private", False)
+            create_pr = kwargs.pop("create_pr", False)
+            token = kwargs.pop("token", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
+
+        model_to_save = self
+
+        # Attach architecture to the config
+        # Save the config
+        if is_main_process:
+            model_to_save.save_config(save_directory)
+
+        # save model
+        output_model_file = os.path.join(save_directory, FLAX_WEIGHTS_NAME)
+        with open(output_model_file, "wb") as f:
+            model_bytes = to_bytes(params)
+            f.write(model_bytes)
+
+        logger.info(f"Model weights saved in {output_model_file}")
+
+        if push_to_hub:
+            self._upload_folder(
+                save_directory,
+                repo_id,
+                token=token,
+                commit_message=commit_message,
+                create_pr=create_pr,
+            )
diff --git a/diffusers/src/diffusers/models/modeling_outputs.py b/diffusers/src/diffusers/models/modeling_outputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e4f3e1e9af57041d4f945aa36381477a652c45c
--- /dev/null
+++ b/diffusers/src/diffusers/models/modeling_outputs.py
@@ -0,0 +1,47 @@
+from dataclasses import dataclass
+
+from ..utils import BaseOutput
+
+
+@dataclass
+class AutoencoderKLOutput(BaseOutput):
+    """
+    Output of AutoencoderKL encoding method.
+
+    Args:
+        latent_dist (`DiagonalGaussianDistribution`):
+            Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
+            `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
+    """
+
+    latent_dist: "DiagonalGaussianDistribution"  # noqa: F821
+
+
+@dataclass
+class Transformer2DModelOutput(BaseOutput):
+    """
+    The output of [`Transformer2DModel`].
+
+    Args:
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
+            The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
+            distributions for the unnoised latent pixels.
+    """
+
+    sample: "torch.Tensor"  # noqa: F821
+
+
+
+@dataclass
+class CausalTransformer2DModelOutput(BaseOutput):
+    """
+    The output of [`CausalTransformer2DModel`].
+
+    Args:
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
+            The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
+            distributions for the unnoised latent pixels.
+    """
+
+    sample: "torch.Tensor"  # noqa: F821
+    ref_sample: "torch.Tensor"  # noqa: F821
diff --git a/diffusers/src/diffusers/models/modeling_pytorch_flax_utils.py b/diffusers/src/diffusers/models/modeling_pytorch_flax_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..55eff0e1ed54aaa446eeaa29836b270ead8886cf
--- /dev/null
+++ b/diffusers/src/diffusers/models/modeling_pytorch_flax_utils.py
@@ -0,0 +1,161 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch - Flax general utilities."""
+
+from pickle import UnpicklingError
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.serialization import from_bytes
+from flax.traverse_util import flatten_dict
+
+from ..utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+#####################
+# Flax => PyTorch #
+#####################
+
+
+# from https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_flax_pytorch_utils.py#L224-L352
+def load_flax_checkpoint_in_pytorch_model(pt_model, model_file):
+    try:
+        with open(model_file, "rb") as flax_state_f:
+            flax_state = from_bytes(None, flax_state_f.read())
+    except UnpicklingError as e:
+        try:
+            with open(model_file) as f:
+                if f.read().startswith("version"):
+                    raise OSError(
+                        "You seem to have cloned a repository without having git-lfs installed. Please"
+                        " install git-lfs and run `git lfs install` followed by `git lfs pull` in the"
+                        " folder you cloned."
+                    )
+                else:
+                    raise ValueError from e
+        except (UnicodeDecodeError, ValueError):
+            raise EnvironmentError(f"Unable to convert {model_file} to Flax deserializable object. ")
+
+    return load_flax_weights_in_pytorch_model(pt_model, flax_state)
+
+
+def load_flax_weights_in_pytorch_model(pt_model, flax_state):
+    """Load flax checkpoints in a PyTorch model"""
+
+    try:
+        import torch  # noqa: F401
+    except ImportError:
+        logger.error(
+            "Loading Flax weights in PyTorch requires both PyTorch and Flax to be installed. Please see"
+            " https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for installation"
+            " instructions."
+        )
+        raise
+
+    # check if we have bf16 weights
+    is_type_bf16 = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype == jnp.bfloat16, flax_state)).values()
+    if any(is_type_bf16):
+        # convert all weights to fp32 if they are bf16 since torch.from_numpy can-not handle bf16
+
+        # and bf16 is not fully supported in PT yet.
+        logger.warning(
+            "Found ``bfloat16`` weights in Flax model. Casting all ``bfloat16`` weights to ``float32`` "
+            "before loading those in PyTorch model."
+        )
+        flax_state = jax.tree_util.tree_map(
+            lambda params: params.astype(np.float32) if params.dtype == jnp.bfloat16 else params, flax_state
+        )
+
+    pt_model.base_model_prefix = ""
+
+    flax_state_dict = flatten_dict(flax_state, sep=".")
+    pt_model_dict = pt_model.state_dict()
+
+    # keep track of unexpected & missing keys
+    unexpected_keys = []
+    missing_keys = set(pt_model_dict.keys())
+
+    for flax_key_tuple, flax_tensor in flax_state_dict.items():
+        flax_key_tuple_array = flax_key_tuple.split(".")
+
+        if flax_key_tuple_array[-1] == "kernel" and flax_tensor.ndim == 4:
+            flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"]
+            flax_tensor = jnp.transpose(flax_tensor, (3, 2, 0, 1))
+        elif flax_key_tuple_array[-1] == "kernel":
+            flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"]
+            flax_tensor = flax_tensor.T
+        elif flax_key_tuple_array[-1] == "scale":
+            flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"]
+
+        if "time_embedding" not in flax_key_tuple_array:
+            for i, flax_key_tuple_string in enumerate(flax_key_tuple_array):
+                flax_key_tuple_array[i] = (
+                    flax_key_tuple_string.replace("_0", ".0")
+                    .replace("_1", ".1")
+                    .replace("_2", ".2")
+                    .replace("_3", ".3")
+                    .replace("_4", ".4")
+                    .replace("_5", ".5")
+                    .replace("_6", ".6")
+                    .replace("_7", ".7")
+                    .replace("_8", ".8")
+                    .replace("_9", ".9")
+                )
+
+        flax_key = ".".join(flax_key_tuple_array)
+
+        if flax_key in pt_model_dict:
+            if flax_tensor.shape != pt_model_dict[flax_key].shape:
+                raise ValueError(
+                    f"Flax checkpoint seems to be incorrect. Weight {flax_key_tuple} was expected "
+                    f"to be of shape {pt_model_dict[flax_key].shape}, but is {flax_tensor.shape}."
+                )
+            else:
+                # add weight to pytorch dict
+                flax_tensor = np.asarray(flax_tensor) if not isinstance(flax_tensor, np.ndarray) else flax_tensor
+                pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)
+                # remove from missing keys
+                missing_keys.remove(flax_key)
+        else:
+            # weight is not expected by PyTorch model
+            unexpected_keys.append(flax_key)
+
+    pt_model.load_state_dict(pt_model_dict)
+
+    # re-transform missing_keys to list
+    missing_keys = list(missing_keys)
+
+    if len(unexpected_keys) > 0:
+        logger.warning(
+            "Some weights of the Flax model were not used when initializing the PyTorch model"
+            f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing"
+            f" {pt_model.__class__.__name__} from a Flax model trained on another task or with another architecture"
+            " (e.g. initializing a BertForSequenceClassification model from a FlaxBertForPreTraining model).\n- This"
+            f" IS NOT expected if you are initializing {pt_model.__class__.__name__} from a Flax model that you expect"
+            " to be exactly identical (e.g. initializing a BertForSequenceClassification model from a"
+            " FlaxBertForSequenceClassification model)."
+        )
+    if len(missing_keys) > 0:
+        logger.warning(
+            f"Some weights of {pt_model.__class__.__name__} were not initialized from the Flax model and are newly"
+            f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to"
+            " use it for predictions and inference."
+        )
+
+    return pt_model
diff --git a/diffusers/src/diffusers/models/modeling_utils.py b/diffusers/src/diffusers/models/modeling_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e0c50e8b37b31a65f17b8b1787df10b4f7886fe
--- /dev/null
+++ b/diffusers/src/diffusers/models/modeling_utils.py
@@ -0,0 +1,1199 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import itertools
+import json
+import os
+import re
+from collections import OrderedDict
+from functools import partial
+from pathlib import Path
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import safetensors
+import torch
+from huggingface_hub import create_repo, split_torch_state_dict_into_shards
+from huggingface_hub.utils import validate_hf_hub_args
+from torch import Tensor, nn
+
+from .. import __version__
+from ..utils import (
+    CONFIG_NAME,
+    FLAX_WEIGHTS_NAME,
+    SAFE_WEIGHTS_INDEX_NAME,
+    SAFETENSORS_WEIGHTS_NAME,
+    WEIGHTS_INDEX_NAME,
+    WEIGHTS_NAME,
+    _add_variant,
+    _get_checkpoint_shard_files,
+    _get_model_file,
+    deprecate,
+    is_accelerate_available,
+    is_torch_version,
+    logging,
+)
+from ..utils.hub_utils import (
+    PushToHubMixin,
+    load_or_create_model_card,
+    populate_model_card,
+)
+from .model_loading_utils import (
+    _determine_device_map,
+    _fetch_index_file,
+    _load_state_dict_into_model,
+    load_model_dict_into_meta,
+    load_state_dict,
+)
+
+
+logger = logging.get_logger(__name__)
+
+_REGEX_SHARD = re.compile(r"(.*?)-\d{5}-of-\d{5}")
+
+
+if is_torch_version(">=", "1.9.0"):
+    _LOW_CPU_MEM_USAGE_DEFAULT = True
+else:
+    _LOW_CPU_MEM_USAGE_DEFAULT = False
+
+
+if is_accelerate_available():
+    import accelerate
+
+
+def get_parameter_device(parameter: torch.nn.Module) -> torch.device:
+    try:
+        parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers())
+        return next(parameters_and_buffers).device
+    except StopIteration:
+        # For torch.nn.DataParallel compatibility in PyTorch 1.5
+
+        def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
+            tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
+            return tuples
+
+        gen = parameter._named_members(get_members_fn=find_tensor_attributes)
+        first_tuple = next(gen)
+        return first_tuple[1].device
+
+
+def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype:
+    try:
+        return next(parameter.parameters()).dtype
+    except StopIteration:
+        try:
+            return next(parameter.buffers()).dtype
+        except StopIteration:
+            # For torch.nn.DataParallel compatibility in PyTorch 1.5
+
+            def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
+                tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
+                return tuples
+
+            gen = parameter._named_members(get_members_fn=find_tensor_attributes)
+            first_tuple = next(gen)
+            return first_tuple[1].dtype
+
+
+class ModelMixin(torch.nn.Module, PushToHubMixin):
+    r"""
+    Base class for all models.
+
+    [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and
+    saving models.
+
+        - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`].
+    """
+
+    config_name = CONFIG_NAME
+    _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
+    _supports_gradient_checkpointing = False
+    _keys_to_ignore_on_load_unexpected = None
+    _no_split_modules = None
+
+    def __init__(self):
+        super().__init__()
+
+    def __getattr__(self, name: str) -> Any:
+        """The only reason we overwrite `getattr` here is to gracefully deprecate accessing
+        config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite
+        __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__':
+        https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
+        """
+
+        is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
+        is_attribute = name in self.__dict__
+
+        if is_in_config and not is_attribute:
+            deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'."
+            deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3)
+            return self._internal_dict[name]
+
+        # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
+        return super().__getattr__(name)
+
+    @property
+    def is_gradient_checkpointing(self) -> bool:
+        """
+        Whether gradient checkpointing is activated for this model or not.
+        """
+        return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
+
+    def enable_gradient_checkpointing(self) -> None:
+        """
+        Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or
+        *checkpoint activations* in other frameworks).
+        """
+        if not self._supports_gradient_checkpointing:
+            raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
+        self.apply(partial(self._set_gradient_checkpointing, value=True))
+
+    def disable_gradient_checkpointing(self) -> None:
+        """
+        Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or
+        *checkpoint activations* in other frameworks).
+        """
+        if self._supports_gradient_checkpointing:
+            self.apply(partial(self._set_gradient_checkpointing, value=False))
+
+    def set_use_npu_flash_attention(self, valid: bool) -> None:
+        r"""
+        Set the switch for the npu flash attention.
+        """
+
+        def fn_recursive_set_npu_flash_attention(module: torch.nn.Module):
+            if hasattr(module, "set_use_npu_flash_attention"):
+                module.set_use_npu_flash_attention(valid)
+
+            for child in module.children():
+                fn_recursive_set_npu_flash_attention(child)
+
+        for module in self.children():
+            if isinstance(module, torch.nn.Module):
+                fn_recursive_set_npu_flash_attention(module)
+
+    def enable_npu_flash_attention(self) -> None:
+        r"""
+        Enable npu flash attention from torch_npu
+
+        """
+        self.set_use_npu_flash_attention(True)
+
+    def disable_npu_flash_attention(self) -> None:
+        r"""
+        disable npu flash attention from torch_npu
+
+        """
+        self.set_use_npu_flash_attention(False)
+
+    def set_use_memory_efficient_attention_xformers(
+        self, valid: bool, attention_op: Optional[Callable] = None
+    ) -> None:
+        # Recursively walk through all the children.
+        # Any children which exposes the set_use_memory_efficient_attention_xformers method
+        # gets the message
+        def fn_recursive_set_mem_eff(module: torch.nn.Module):
+            if hasattr(module, "set_use_memory_efficient_attention_xformers"):
+                module.set_use_memory_efficient_attention_xformers(valid, attention_op)
+
+            for child in module.children():
+                fn_recursive_set_mem_eff(child)
+
+        for module in self.children():
+            if isinstance(module, torch.nn.Module):
+                fn_recursive_set_mem_eff(module)
+
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None) -> None:
+        r"""
+        Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up during
+        inference. Speed up during training is not guaranteed.
+
+        <Tip warning={true}>
+
+        ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes
+        precedent.
+
+        </Tip>
+
+        Parameters:
+            attention_op (`Callable`, *optional*):
+                Override the default `None` operator for use as `op` argument to the
+                [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention)
+                function of xFormers.
+
+        Examples:
+
+        ```py
+        >>> import torch
+        >>> from diffusers import UNet2DConditionModel
+        >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
+
+        >>> model = UNet2DConditionModel.from_pretrained(
+        ...     "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16
+        ... )
+        >>> model = model.to("cuda")
+        >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
+        ```
+        """
+        self.set_use_memory_efficient_attention_xformers(True, attention_op)
+
+    def disable_xformers_memory_efficient_attention(self) -> None:
+        r"""
+        Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
+        """
+        self.set_use_memory_efficient_attention_xformers(False)
+
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        is_main_process: bool = True,
+        save_function: Optional[Callable] = None,
+        safe_serialization: bool = True,
+        variant: Optional[str] = None,
+        max_shard_size: Union[int, str] = "10GB",
+        push_to_hub: bool = False,
+        **kwargs,
+    ):
+        """
+        Save a model and its configuration file to a directory so that it can be reloaded using the
+        [`~models.ModelMixin.from_pretrained`] class method.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save a model and its configuration file to. Will be created if it doesn't exist.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
+            variant (`str`, *optional*):
+                If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
+            max_shard_size (`int` or `str`, defaults to `"10GB"`):
+                The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
+                lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5GB"`).
+                If expressed as an integer, the unit is bytes. Note that this limit will be decreased after a certain
+                period of time (starting from Oct 2024) to allow users to upgrade to the latest version of `diffusers`.
+                This is to establish a common default size for this argument across different libraries in the Hugging
+                Face ecosystem (`transformers`, and `accelerate`, for example).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        if os.path.isfile(save_directory):
+            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+            return
+
+        weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
+        weights_name = _add_variant(weights_name, variant)
+        weight_name_split = weights_name.split(".")
+        if len(weight_name_split) in [2, 3]:
+            weights_name_pattern = weight_name_split[0] + "{suffix}." + ".".join(weight_name_split[1:])
+        else:
+            raise ValueError(f"Invalid {weights_name} provided.")
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            private = kwargs.pop("private", False)
+            create_pr = kwargs.pop("create_pr", False)
+            token = kwargs.pop("token", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
+
+        # Only save the model itself if we are using distributed training
+        model_to_save = self
+
+        # Attach architecture to the config
+        # Save the config
+        if is_main_process:
+            model_to_save.save_config(save_directory)
+
+        # Save the model
+        state_dict = model_to_save.state_dict()
+
+        # Save the model
+        state_dict_split = split_torch_state_dict_into_shards(
+            state_dict, max_shard_size=max_shard_size, filename_pattern=weights_name_pattern
+        )
+
+        # Clean the folder from a previous save
+        if is_main_process:
+            for filename in os.listdir(save_directory):
+                if filename in state_dict_split.filename_to_tensors.keys():
+                    continue
+                full_filename = os.path.join(save_directory, filename)
+                if not os.path.isfile(full_filename):
+                    continue
+                weights_without_ext = weights_name_pattern.replace(".bin", "").replace(".safetensors", "")
+                weights_without_ext = weights_without_ext.replace("{suffix}", "")
+                filename_without_ext = filename.replace(".bin", "").replace(".safetensors", "")
+                # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005
+                if (
+                    filename.startswith(weights_without_ext)
+                    and _REGEX_SHARD.fullmatch(filename_without_ext) is not None
+                ):
+                    os.remove(full_filename)
+
+        for filename, tensors in state_dict_split.filename_to_tensors.items():
+            shard = {tensor: state_dict[tensor] for tensor in tensors}
+            filepath = os.path.join(save_directory, filename)
+            if safe_serialization:
+                # At some point we will need to deal better with save_function (used for TPU and other distributed
+                # joyfulness), but for now this enough.
+                safetensors.torch.save_file(shard, filepath, metadata={"format": "pt"})
+            else:
+                torch.save(shard, filepath)
+
+        if state_dict_split.is_sharded:
+            index = {
+                "metadata": state_dict_split.metadata,
+                "weight_map": state_dict_split.tensor_to_filename,
+            }
+            save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
+            save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant))
+            # Save the index as well
+            with open(save_index_file, "w", encoding="utf-8") as f:
+                content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+                f.write(content)
+            logger.info(
+                f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
+                f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the "
+                f"index located at {save_index_file}."
+            )
+        else:
+            path_to_weights = os.path.join(save_directory, weights_name)
+            logger.info(f"Model weights saved in {path_to_weights}")
+
+        if push_to_hub:
+            # Create a new empty model card and eventually tag it
+            model_card = load_or_create_model_card(repo_id, token=token)
+            model_card = populate_model_card(model_card)
+            model_card.save(Path(save_directory, "README.md").as_posix())
+
+            self._upload_folder(
+                save_directory,
+                repo_id,
+                token=token,
+                commit_message=commit_message,
+                create_pr=create_pr,
+            )
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+        r"""
+        Instantiate a pretrained PyTorch model from a pretrained model configuration.
+
+        The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To
+        train the model, set it back in training mode with `model.train()`.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`~ModelMixin.save_pretrained`].
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
+                dtype is automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info (`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only(`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            from_flax (`bool`, *optional*, defaults to `False`):
+                Load the model weights from a Flax checkpoint save file.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            mirror (`str`, *optional*):
+                Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn't need to be defined for each
+                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
+                same device. Defaults to `None`, meaning that the model will be loaded on CPU.
+
+                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            max_memory (`Dict`, *optional*):
+                A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
+                each GPU and the available CPU RAM if unset.
+            offload_folder (`str` or `os.PathLike`, *optional*):
+                The path to offload weights if `device_map` contains the value `"disk"`.
+            offload_state_dict (`bool`, *optional*):
+                If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
+                the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
+                when there is some disk offload.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
+            variant (`str`, *optional*):
+                Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when
+                loading `from_flax`.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the
+                `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors`
+                weights. If set to `False`, `safetensors` weights are not loaded.
+
+        <Tip>
+
+        To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
+        `huggingface-cli login`. You can also activate the special
+        ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
+        firewalled environment.
+
+        </Tip>
+
+        Example:
+
+        ```py
+        from diffusers import UNet2DConditionModel
+
+        unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
+        ```
+
+        If you get the error message below, you need to finetune the weights for your downstream task:
+
+        ```bash
+        Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
+        - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
+        You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+        ```
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
+        force_download = kwargs.pop("force_download", False)
+        from_flax = kwargs.pop("from_flax", False)
+        proxies = kwargs.pop("proxies", None)
+        output_loading_info = kwargs.pop("output_loading_info", False)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+        subfolder = kwargs.pop("subfolder", None)
+        device_map = kwargs.pop("device_map", None)
+        max_memory = kwargs.pop("max_memory", None)
+        offload_folder = kwargs.pop("offload_folder", None)
+        offload_state_dict = kwargs.pop("offload_state_dict", False)
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
+        variant = kwargs.pop("variant", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+
+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warning(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+
+        if device_map is not None and not is_accelerate_available():
+            raise NotImplementedError(
+                "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
+                " `device_map=None`. You can install accelerate with `pip install accelerate`."
+            )
+
+        # Check if we can handle device_map and dispatching the weights
+        if device_map is not None and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `device_map=None`."
+            )
+
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+
+        if low_cpu_mem_usage is False and device_map is not None:
+            raise ValueError(
+                f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
+                " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
+            )
+
+        # change device_map into a map if we passed an int, a str or a torch.device
+        if isinstance(device_map, torch.device):
+            device_map = {"": device_map}
+        elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
+            try:
+                device_map = {"": torch.device(device_map)}
+            except RuntimeError:
+                raise ValueError(
+                    "When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or "
+                    f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}."
+                )
+        elif isinstance(device_map, int):
+            if device_map < 0:
+                raise ValueError(
+                    "You can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' "
+                )
+            else:
+                device_map = {"": device_map}
+
+        if device_map is not None:
+            if low_cpu_mem_usage is None:
+                low_cpu_mem_usage = True
+            elif not low_cpu_mem_usage:
+                raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`")
+
+        if low_cpu_mem_usage:
+            if device_map is not None and not is_torch_version(">=", "1.10"):
+                # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info.
+                raise ValueError("`low_cpu_mem_usage` and `device_map` require PyTorch >= 1.10.")
+
+        # Load config if we don't provide a configuration
+        config_path = pretrained_model_name_or_path
+
+        user_agent = {
+            "diffusers": __version__,
+            "file_type": "model",
+            "framework": "pytorch",
+        }
+
+        # load config
+        config, unused_kwargs, commit_hash = cls.load_config(
+            config_path,
+            cache_dir=cache_dir,
+            return_unused_kwargs=True,
+            return_commit_hash=True,
+            force_download=force_download,
+            proxies=proxies,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            **kwargs,
+        )
+
+        # Determine if we're loading from a directory of sharded checkpoints.
+        is_sharded = False
+        index_file = None
+        is_local = os.path.isdir(pretrained_model_name_or_path)
+        index_file = _fetch_index_file(
+            is_local=is_local,
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
+            subfolder=subfolder or "",
+            use_safetensors=use_safetensors,
+            cache_dir=cache_dir,
+            variant=variant,
+            force_download=force_download,
+            proxies=proxies,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            user_agent=user_agent,
+            commit_hash=commit_hash,
+        )
+        if index_file is not None and index_file.is_file():
+            is_sharded = True
+
+        if is_sharded and from_flax:
+            raise ValueError("Loading of sharded checkpoints is not supported when `from_flax=True`.")
+
+        # load model
+        model_file = None
+        if from_flax:
+            model_file = _get_model_file(
+                pretrained_model_name_or_path,
+                weights_name=FLAX_WEIGHTS_NAME,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                subfolder=subfolder,
+                user_agent=user_agent,
+                commit_hash=commit_hash,
+            )
+            model = cls.from_config(config, **unused_kwargs)
+
+            # Convert the weights
+            from .modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model
+
+            model = load_flax_checkpoint_in_pytorch_model(model, model_file)
+        else:
+            if is_sharded:
+                sharded_ckpt_cached_folder, sharded_metadata = _get_checkpoint_shard_files(
+                    pretrained_model_name_or_path,
+                    index_file,
+                    cache_dir=cache_dir,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    revision=revision,
+                    subfolder=subfolder or "",
+                )
+
+            elif use_safetensors and not is_sharded:
+                try:
+                    model_file = _get_model_file(
+                        pretrained_model_name_or_path,
+                        weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        proxies=proxies,
+                        local_files_only=local_files_only,
+                        token=token,
+                        revision=revision,
+                        subfolder=subfolder,
+                        user_agent=user_agent,
+                        commit_hash=commit_hash,
+                    )
+
+                except IOError as e:
+                    logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}")
+                    if not allow_pickle:
+                        raise
+                    logger.warning(
+                        "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
+                    )
+
+            if model_file is None and not is_sharded:
+                model_file = _get_model_file(
+                    pretrained_model_name_or_path,
+                    weights_name=_add_variant(WEIGHTS_NAME, variant),
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                    user_agent=user_agent,
+                    commit_hash=commit_hash,
+                )
+
+            if low_cpu_mem_usage:
+                # Instantiate model with empty weights
+                with accelerate.init_empty_weights():
+                    model = cls.from_config(config, **unused_kwargs)
+
+                # if device_map is None, load the state dict and move the params from meta device to the cpu
+                if device_map is None and not is_sharded:
+                    param_device = "cpu"
+                    state_dict = load_state_dict(model_file, variant=variant)
+                    model._convert_deprecated_attention_blocks(state_dict)
+                    # move the params from meta device to cpu
+                    missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
+                    if len(missing_keys) > 0:
+                        raise ValueError(
+                            f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are"
+                            f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
+                            " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
+                            " those weights or else make sure your checkpoint file is correct."
+                        )
+
+                    unexpected_keys = load_model_dict_into_meta(
+                        model,
+                        state_dict,
+                        device=param_device,
+                        dtype=torch_dtype,
+                        model_name_or_path=pretrained_model_name_or_path,
+                    )
+
+                    if cls._keys_to_ignore_on_load_unexpected is not None:
+                        for pat in cls._keys_to_ignore_on_load_unexpected:
+                            unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
+
+                    if len(unexpected_keys) > 0:
+                        logger.warning(
+                            f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
+                        )
+
+                else:  # else let accelerate handle loading and dispatching.
+                    # Load weights and dispatch according to the device_map
+                    # by default the device_map is None and the weights are loaded on the CPU
+                    force_hook = True
+                    device_map = _determine_device_map(model, device_map, max_memory, torch_dtype)
+                    if device_map is None and is_sharded:
+                        # we load the parameters on the cpu
+                        device_map = {"": "cpu"}
+                        force_hook = False
+                    try:
+                        accelerate.load_checkpoint_and_dispatch(
+                            model,
+                            model_file if not is_sharded else index_file,
+                            device_map,
+                            max_memory=max_memory,
+                            offload_folder=offload_folder,
+                            offload_state_dict=offload_state_dict,
+                            dtype=torch_dtype,
+                            force_hooks=force_hook,
+                            strict=True,
+                        )
+                    except AttributeError as e:
+                        # When using accelerate loading, we do not have the ability to load the state
+                        # dict and rename the weight names manually. Additionally, accelerate skips
+                        # torch loading conventions and directly writes into `module.{_buffers, _parameters}`
+                        # (which look like they should be private variables?), so we can't use the standard hooks
+                        # to rename parameters on load. We need to mimic the original weight names so the correct
+                        # attributes are available. After we have loaded the weights, we convert the deprecated
+                        # names to the new non-deprecated names. Then we _greatly encourage_ the user to convert
+                        # the weights so we don't have to do this again.
+
+                        if "'Attention' object has no attribute" in str(e):
+                            logger.warning(
+                                f"Taking `{str(e)}` while using `accelerate.load_checkpoint_and_dispatch` to mean {pretrained_model_name_or_path}"
+                                " was saved with deprecated attention block weight names. We will load it with the deprecated attention block"
+                                " names and convert them on the fly to the new attention block format. Please re-save the model after this conversion,"
+                                " so we don't have to do the on the fly renaming in the future. If the model is from a hub checkpoint,"
+                                " please also re-upload it or open a PR on the original repository."
+                            )
+                            model._temp_convert_self_to_deprecated_attention_blocks()
+                            accelerate.load_checkpoint_and_dispatch(
+                                model,
+                                model_file if not is_sharded else index_file,
+                                device_map,
+                                max_memory=max_memory,
+                                offload_folder=offload_folder,
+                                offload_state_dict=offload_state_dict,
+                                dtype=torch_dtype,
+                                force_hooks=force_hook,
+                                strict=True,
+                            )
+                            model._undo_temp_convert_self_to_deprecated_attention_blocks()
+                        else:
+                            raise e
+
+                loading_info = {
+                    "missing_keys": [],
+                    "unexpected_keys": [],
+                    "mismatched_keys": [],
+                    "error_msgs": [],
+                }
+            else:
+                model = cls.from_config(config, **unused_kwargs)
+
+                state_dict = load_state_dict(model_file, variant=variant)
+                model._convert_deprecated_attention_blocks(state_dict)
+
+                model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
+                    model,
+                    state_dict,
+                    model_file,
+                    pretrained_model_name_or_path,
+                    ignore_mismatched_sizes=ignore_mismatched_sizes,
+                )
+
+                loading_info = {
+                    "missing_keys": missing_keys,
+                    "unexpected_keys": unexpected_keys,
+                    "mismatched_keys": mismatched_keys,
+                    "error_msgs": error_msgs,
+                }
+
+        if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
+            raise ValueError(
+                f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
+            )
+        elif torch_dtype is not None:
+            model = model.to(torch_dtype)
+
+        model.register_to_config(_name_or_path=pretrained_model_name_or_path)
+
+        # Set model in evaluation mode to deactivate DropOut modules by default
+        model.eval()
+        if output_loading_info:
+            return model, loading_info
+
+        return model
+
+    @classmethod
+    def _load_pretrained_model(
+        cls,
+        model,
+        state_dict: OrderedDict,
+        resolved_archive_file,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        ignore_mismatched_sizes: bool = False,
+    ):
+        # Retrieve missing & unexpected_keys
+        model_state_dict = model.state_dict()
+        loaded_keys = list(state_dict.keys())
+
+        expected_keys = list(model_state_dict.keys())
+
+        original_loaded_keys = loaded_keys
+
+        missing_keys = list(set(expected_keys) - set(loaded_keys))
+        unexpected_keys = list(set(loaded_keys) - set(expected_keys))
+
+        # Make sure we are able to load base models as well as derived models (with heads)
+        model_to_load = model
+
+        def _find_mismatched_keys(
+            state_dict,
+            model_state_dict,
+            loaded_keys,
+            ignore_mismatched_sizes,
+        ):
+            mismatched_keys = []
+            if ignore_mismatched_sizes:
+                for checkpoint_key in loaded_keys:
+                    model_key = checkpoint_key
+
+                    if (
+                        model_key in model_state_dict
+                        and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
+                    ):
+                        mismatched_keys.append(
+                            (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
+                        )
+                        del state_dict[checkpoint_key]
+            return mismatched_keys
+
+        if state_dict is not None:
+            # Whole checkpoint
+            mismatched_keys = _find_mismatched_keys(
+                state_dict,
+                model_state_dict,
+                original_loaded_keys,
+                ignore_mismatched_sizes,
+            )
+            error_msgs = _load_state_dict_into_model(model_to_load, state_dict)
+
+        if len(error_msgs) > 0:
+            error_msg = "\n\t".join(error_msgs)
+            if "size mismatch" in error_msg:
+                error_msg += (
+                    "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
+                )
+            raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
+
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
+                f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
+                f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
+                " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
+                " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
+                f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
+                " identical (initializing a BertForSequenceClassification model from a"
+                " BertForSequenceClassification model)."
+            )
+        else:
+            logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
+        if len(missing_keys) > 0:
+            logger.warning(
+                f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
+                " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
+            )
+        elif len(mismatched_keys) == 0:
+            logger.info(
+                f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
+                f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
+                " without further training."
+            )
+        if len(mismatched_keys) > 0:
+            mismatched_warning = "\n".join(
+                [
+                    f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
+                    for key, shape1, shape2 in mismatched_keys
+                ]
+            )
+            logger.warning(
+                f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
+                f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
+                " able to use it for predictions and inference."
+            )
+
+        return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
+
+    @classmethod
+    def _get_signature_keys(cls, obj):
+        parameters = inspect.signature(obj.__init__).parameters
+        required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        expected_modules = set(required_parameters.keys()) - {"self"}
+
+        return expected_modules, optional_parameters
+
+    # Adapted from `transformers` modeling_utils.py
+    def _get_no_split_modules(self, device_map: str):
+        """
+        Get the modules of the model that should not be spit when using device_map. We iterate through the modules to
+        get the underlying `_no_split_modules`.
+
+        Args:
+            device_map (`str`):
+                The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"]
+
+        Returns:
+            `List[str]`: List of modules that should not be split
+        """
+        _no_split_modules = set()
+        modules_to_check = [self]
+        while len(modules_to_check) > 0:
+            module = modules_to_check.pop(-1)
+            # if the module does not appear in _no_split_modules, we also check the children
+            if module.__class__.__name__ not in _no_split_modules:
+                if isinstance(module, ModelMixin):
+                    if module._no_split_modules is None:
+                        raise ValueError(
+                            f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model "
+                            "class needs to implement the `_no_split_modules` attribute."
+                        )
+                    else:
+                        _no_split_modules = _no_split_modules | set(module._no_split_modules)
+                modules_to_check += list(module.children())
+        return list(_no_split_modules)
+
+    @property
+    def device(self) -> torch.device:
+        """
+        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+        device).
+        """
+        return get_parameter_device(self)
+
+    @property
+    def dtype(self) -> torch.dtype:
+        """
+        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
+        """
+        return get_parameter_dtype(self)
+
+    def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
+        """
+        Get number of (trainable or non-embedding) parameters in the module.
+
+        Args:
+            only_trainable (`bool`, *optional*, defaults to `False`):
+                Whether or not to return only the number of trainable parameters.
+            exclude_embeddings (`bool`, *optional*, defaults to `False`):
+                Whether or not to return only the number of non-embedding parameters.
+
+        Returns:
+            `int`: The number of parameters.
+
+        Example:
+
+        ```py
+        from diffusers import UNet2DConditionModel
+
+        model_id = "runwayml/stable-diffusion-v1-5"
+        unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
+        unet.num_parameters(only_trainable=True)
+        859520964
+        ```
+        """
+
+        if exclude_embeddings:
+            embedding_param_names = [
+                f"{name}.weight"
+                for name, module_type in self.named_modules()
+                if isinstance(module_type, torch.nn.Embedding)
+            ]
+            non_embedding_parameters = [
+                parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
+            ]
+            return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
+        else:
+            return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
+
+    def _convert_deprecated_attention_blocks(self, state_dict: OrderedDict) -> None:
+        deprecated_attention_block_paths = []
+
+        def recursive_find_attn_block(name, module):
+            if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
+                deprecated_attention_block_paths.append(name)
+
+            for sub_name, sub_module in module.named_children():
+                sub_name = sub_name if name == "" else f"{name}.{sub_name}"
+                recursive_find_attn_block(sub_name, sub_module)
+
+        recursive_find_attn_block("", self)
+
+        # NOTE: we have to check if the deprecated parameters are in the state dict
+        # because it is possible we are loading from a state dict that was already
+        # converted
+
+        for path in deprecated_attention_block_paths:
+            # group_norm path stays the same
+
+            # query -> to_q
+            if f"{path}.query.weight" in state_dict:
+                state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
+            if f"{path}.query.bias" in state_dict:
+                state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias")
+
+            # key -> to_k
+            if f"{path}.key.weight" in state_dict:
+                state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight")
+            if f"{path}.key.bias" in state_dict:
+                state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias")
+
+            # value -> to_v
+            if f"{path}.value.weight" in state_dict:
+                state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight")
+            if f"{path}.value.bias" in state_dict:
+                state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias")
+
+            # proj_attn -> to_out.0
+            if f"{path}.proj_attn.weight" in state_dict:
+                state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight")
+            if f"{path}.proj_attn.bias" in state_dict:
+                state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")
+
+    def _temp_convert_self_to_deprecated_attention_blocks(self) -> None:
+        deprecated_attention_block_modules = []
+
+        def recursive_find_attn_block(module):
+            if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
+                deprecated_attention_block_modules.append(module)
+
+            for sub_module in module.children():
+                recursive_find_attn_block(sub_module)
+
+        recursive_find_attn_block(self)
+
+        for module in deprecated_attention_block_modules:
+            module.query = module.to_q
+            module.key = module.to_k
+            module.value = module.to_v
+            module.proj_attn = module.to_out[0]
+
+            # We don't _have_ to delete the old attributes, but it's helpful to ensure
+            # that _all_ the weights are loaded into the new attributes and we're not
+            # making an incorrect assumption that this model should be converted when
+            # it really shouldn't be.
+            del module.to_q
+            del module.to_k
+            del module.to_v
+            del module.to_out
+
+    def _undo_temp_convert_self_to_deprecated_attention_blocks(self) -> None:
+        deprecated_attention_block_modules = []
+
+        def recursive_find_attn_block(module) -> None:
+            if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
+                deprecated_attention_block_modules.append(module)
+
+            for sub_module in module.children():
+                recursive_find_attn_block(sub_module)
+
+        recursive_find_attn_block(self)
+
+        for module in deprecated_attention_block_modules:
+            module.to_q = module.query
+            module.to_k = module.key
+            module.to_v = module.value
+            module.to_out = nn.ModuleList([module.proj_attn, nn.Dropout(module.dropout)])
+
+            del module.query
+            del module.key
+            del module.value
+            del module.proj_attn
+
+
+class LegacyModelMixin(ModelMixin):
+    r"""
+    A subclass of `ModelMixin` to resolve class mapping from legacy classes (like `Transformer2DModel`) to more
+    pipeline-specific classes (like `DiTTransformer2DModel`).
+    """
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+        # To prevent dependency import problem.
+        from .model_loading_utils import _fetch_remapped_cls_from_config
+
+        # Create a copy of the kwargs so that we don't mess with the keyword arguments in the downstream calls.
+        kwargs_copy = kwargs.copy()
+
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+
+        # Load config if we don't provide a configuration
+        config_path = pretrained_model_name_or_path
+
+        user_agent = {
+            "diffusers": __version__,
+            "file_type": "model",
+            "framework": "pytorch",
+        }
+
+        # load config
+        config, _, _ = cls.load_config(
+            config_path,
+            cache_dir=cache_dir,
+            return_unused_kwargs=True,
+            return_commit_hash=True,
+            force_download=force_download,
+            proxies=proxies,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            **kwargs,
+        )
+        # resolve remapping
+        remapped_class = _fetch_remapped_cls_from_config(config, cls)
+
+        return remapped_class.from_pretrained(pretrained_model_name_or_path, **kwargs_copy)
diff --git a/diffusers/src/diffusers/models/normalization.py b/diffusers/src/diffusers/models/normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..5740fed9f30cb60c21eb544274ac55f434c5a0f6
--- /dev/null
+++ b/diffusers/src/diffusers/models/normalization.py
@@ -0,0 +1,451 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numbers
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import is_torch_version
+from .activations import get_activation
+from .embeddings import (
+    CombinedTimestepLabelEmbeddings,
+    PixArtAlphaCombinedTimestepSizeEmbeddings,
+)
+
+
+class AdaLayerNorm(nn.Module):
+    r"""
+    Norm layer modified to incorporate timestep embeddings.
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        num_embeddings (`int`, *optional*): The size of the embeddings dictionary.
+        output_dim (`int`, *optional*):
+        norm_elementwise_affine (`bool`, defaults to `False):
+        norm_eps (`bool`, defaults to `False`):
+        chunk_dim (`int`, defaults to `0`):
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_embeddings: Optional[int] = None,
+        output_dim: Optional[int] = None,
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-5,
+        chunk_dim: int = 0,
+    ):
+        super().__init__()
+
+        self.chunk_dim = chunk_dim
+        output_dim = output_dim or embedding_dim * 2
+
+        if num_embeddings is not None:
+            self.emb = nn.Embedding(num_embeddings, embedding_dim)
+        else:
+            self.emb = None
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, output_dim)
+        self.norm = nn.LayerNorm(output_dim // 2, norm_eps, norm_elementwise_affine)
+
+    def forward(
+        self, x: torch.Tensor, timestep: Optional[torch.Tensor] = None, temb: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        if self.emb is not None:
+            temb = self.emb(timestep)
+
+        temb = self.linear(self.silu(temb))
+
+        if self.chunk_dim == 1:
+            # This is a bit weird why we have the order of "shift, scale" here and "scale, shift" in the
+            # other if-branch. This branch is specific to CogVideoX for now.
+            shift, scale = temb.chunk(2, dim=1)
+            shift = shift[:, None, :]
+            scale = scale[:, None, :]
+        else:
+            scale, shift = temb.chunk(2, dim=0)
+
+        x = self.norm(x) * (1 + scale) + shift
+        return x
+
+
+class FP32LayerNorm(nn.LayerNorm):
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        origin_dtype = inputs.dtype
+        return F.layer_norm(
+            inputs.float(),
+            self.normalized_shape,
+            self.weight.float() if self.weight is not None else None,
+            self.bias.float() if self.bias is not None else None,
+            self.eps,
+        ).to(origin_dtype)
+
+
+class AdaLayerNormZero(nn.Module):
+    r"""
+    Norm layer adaptive layer norm zero (adaLN-Zero).
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        num_embeddings (`int`): The size of the embeddings dictionary.
+    """
+
+    def __init__(self, embedding_dim: int, num_embeddings: Optional[int] = None, norm_type="layer_norm", bias=True):
+        super().__init__()
+        if num_embeddings is not None:
+            self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim)
+        else:
+            self.emb = None
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=bias)
+        if norm_type == "layer_norm":
+            self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
+        elif norm_type == "fp32_layer_norm":
+            self.norm = FP32LayerNorm(embedding_dim, elementwise_affine=False, bias=False)
+        else:
+            raise ValueError(
+                f"Unsupported `norm_type` ({norm_type}) provided. Supported ones are: 'layer_norm', 'fp32_layer_norm'."
+            )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        timestep: Optional[torch.Tensor] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        hidden_dtype: Optional[torch.dtype] = None,
+        emb: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        if self.emb is not None:
+            emb = self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)
+        emb = self.linear(self.silu(emb))
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
+        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
+
+
+class AdaLayerNormZeroSingle(nn.Module):
+    r"""
+    Norm layer adaptive layer norm zero (adaLN-Zero).
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        num_embeddings (`int`): The size of the embeddings dictionary.
+    """
+
+    def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
+        super().__init__()
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias)
+        if norm_type == "layer_norm":
+            self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
+        else:
+            raise ValueError(
+                f"Unsupported `norm_type` ({norm_type}) provided. Supported ones are: 'layer_norm', 'fp32_layer_norm'."
+            )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        emb: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        emb = self.linear(self.silu(emb))
+        shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=1)
+        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        return x, gate_msa
+
+
+class LuminaRMSNormZero(nn.Module):
+    """
+    Norm layer adaptive RMS normalization zero.
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+    """
+
+    def __init__(self, embedding_dim: int, norm_eps: float, norm_elementwise_affine: bool):
+        super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(
+            min(embedding_dim, 1024),
+            4 * embedding_dim,
+            bias=True,
+        )
+        self.norm = RMSNorm(embedding_dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        emb: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        # emb = self.emb(timestep, encoder_hidden_states, encoder_mask)
+        emb = self.linear(self.silu(emb))
+        scale_msa, gate_msa, scale_mlp, gate_mlp = emb.chunk(4, dim=1)
+        x = self.norm(x) * (1 + scale_msa[:, None])
+
+        return x, gate_msa, scale_mlp, gate_mlp
+
+
+class AdaLayerNormSingle(nn.Module):
+    r"""
+    Norm layer adaptive layer norm single (adaLN-single).
+
+    As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        use_additional_conditions (`bool`): To use additional conditions for normalization or not.
+    """
+
+    def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
+        super().__init__()
+
+        self.emb = PixArtAlphaCombinedTimestepSizeEmbeddings(
+            embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
+        )
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
+
+    def forward(
+        self,
+        timestep: torch.Tensor,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        batch_size: Optional[int] = None,
+        hidden_dtype: Optional[torch.dtype] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        # No modulation happening here.
+        embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
+        return self.linear(self.silu(embedded_timestep)), embedded_timestep
+
+
+class AdaGroupNorm(nn.Module):
+    r"""
+    GroupNorm layer modified to incorporate timestep embeddings.
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        num_embeddings (`int`): The size of the embeddings dictionary.
+        num_groups (`int`): The number of groups to separate the channels into.
+        act_fn (`str`, *optional*, defaults to `None`): The activation function to use.
+        eps (`float`, *optional*, defaults to `1e-5`): The epsilon value to use for numerical stability.
+    """
+
+    def __init__(
+        self, embedding_dim: int, out_dim: int, num_groups: int, act_fn: Optional[str] = None, eps: float = 1e-5
+    ):
+        super().__init__()
+        self.num_groups = num_groups
+        self.eps = eps
+
+        if act_fn is None:
+            self.act = None
+        else:
+            self.act = get_activation(act_fn)
+
+        self.linear = nn.Linear(embedding_dim, out_dim * 2)
+
+    def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
+        if self.act:
+            emb = self.act(emb)
+        emb = self.linear(emb)
+        emb = emb[:, :, None, None]
+        scale, shift = emb.chunk(2, dim=1)
+
+        x = F.group_norm(x, self.num_groups, eps=self.eps)
+        x = x * (1 + scale) + shift
+        return x
+
+
+class AdaLayerNormContinuous(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        conditioning_embedding_dim: int,
+        # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
+        # because the output is immediately scaled and shifted by the projected conditioning embeddings.
+        # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
+        # However, this is how it was implemented in the original code, and it's rather likely you should
+        # set `elementwise_affine` to False.
+        elementwise_affine=True,
+        eps=1e-5,
+        bias=True,
+        norm_type="layer_norm",
+    ):
+        super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
+        if norm_type == "layer_norm":
+            self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(embedding_dim, eps, elementwise_affine)
+        else:
+            raise ValueError(f"unknown norm_type {norm_type}")
+
+    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
+        # convert back to the original dtype in case `conditioning_embedding`` is upcasted to float32 (needed for hunyuanDiT)
+        emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
+        scale, shift = torch.chunk(emb, 2, dim=1)
+        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
+        return x
+
+
+class LuminaLayerNormContinuous(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        conditioning_embedding_dim: int,
+        # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
+        # because the output is immediately scaled and shifted by the projected conditioning embeddings.
+        # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
+        # However, this is how it was implemented in the original code, and it's rather likely you should
+        # set `elementwise_affine` to False.
+        elementwise_affine=True,
+        eps=1e-5,
+        bias=True,
+        norm_type="layer_norm",
+        out_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        # AdaLN
+        self.silu = nn.SiLU()
+        self.linear_1 = nn.Linear(conditioning_embedding_dim, embedding_dim, bias=bias)
+        if norm_type == "layer_norm":
+            self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
+        else:
+            raise ValueError(f"unknown norm_type {norm_type}")
+        # linear_2
+        if out_dim is not None:
+            self.linear_2 = nn.Linear(
+                embedding_dim,
+                out_dim,
+                bias=bias,
+            )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        conditioning_embedding: torch.Tensor,
+    ) -> torch.Tensor:
+        # convert back to the original dtype in case `conditioning_embedding`` is upcasted to float32 (needed for hunyuanDiT)
+        emb = self.linear_1(self.silu(conditioning_embedding).to(x.dtype))
+        scale = emb
+        x = self.norm(x) * (1 + scale)[:, None, :]
+
+        if self.linear_2 is not None:
+            x = self.linear_2(x)
+
+        return x
+
+
+class CogVideoXLayerNormZero(nn.Module):
+    def __init__(
+        self,
+        conditioning_dim: int,
+        embedding_dim: int,
+        elementwise_affine: bool = True,
+        eps: float = 1e-5,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(conditioning_dim, 6 * embedding_dim, bias=bias)
+        self.norm = nn.LayerNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine)
+
+    def forward(
+        self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, temb: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        shift, scale, gate, enc_shift, enc_scale, enc_gate = self.linear(self.silu(temb)).chunk(6, dim=1)
+        hidden_states = self.norm(hidden_states) * (1 + scale)[:, None, :] + shift[:, None, :]
+        encoder_hidden_states = self.norm(encoder_hidden_states) * (1 + enc_scale)[:, None, :] + enc_shift[:, None, :]
+        return hidden_states, encoder_hidden_states, gate[:, None, :], enc_gate[:, None, :]
+
+
+if is_torch_version(">=", "2.1.0"):
+    LayerNorm = nn.LayerNorm
+else:
+    # Has optional bias parameter compared to torch layer norm
+    # TODO: replace with torch layernorm once min required torch version >= 2.1
+    class LayerNorm(nn.Module):
+        def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True):
+            super().__init__()
+
+            self.eps = eps
+
+            if isinstance(dim, numbers.Integral):
+                dim = (dim,)
+
+            self.dim = torch.Size(dim)
+
+            if elementwise_affine:
+                self.weight = nn.Parameter(torch.ones(dim))
+                self.bias = nn.Parameter(torch.zeros(dim)) if bias else None
+            else:
+                self.weight = None
+                self.bias = None
+
+        def forward(self, input):
+            return F.layer_norm(input, self.dim, self.weight, self.bias, self.eps)
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, dim, eps: float, elementwise_affine: bool = True):
+        super().__init__()
+
+        self.eps = eps
+
+        if isinstance(dim, numbers.Integral):
+            dim = (dim,)
+
+        self.dim = torch.Size(dim)
+
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.ones(dim))
+        else:
+            self.weight = None
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
+
+        if self.weight is not None:
+            # convert into half-precision if necessary
+            if self.weight.dtype in [torch.float16, torch.bfloat16]:
+                hidden_states = hidden_states.to(self.weight.dtype)
+            hidden_states = hidden_states * self.weight
+        else:
+            hidden_states = hidden_states.to(input_dtype)
+
+        return hidden_states
+
+
+class GlobalResponseNorm(nn.Module):
+    # Taken from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105
+    def __init__(self, dim):
+        super().__init__()
+        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
+        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
+
+    def forward(self, x):
+        gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
+        nx = gx / (gx.mean(dim=-1, keepdim=True) + 1e-6)
+        return self.gamma * (x * nx) + self.beta + x
diff --git a/diffusers/src/diffusers/models/resnet.py b/diffusers/src/diffusers/models/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b55cd9c9d69f703b244c0bee8d5702a9712a77
--- /dev/null
+++ b/diffusers/src/diffusers/models/resnet.py
@@ -0,0 +1,797 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+# `TemporalConvLayer` Copyright 2024 Alibaba DAMO-VILAB, The ModelScope Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import deprecate
+from .activations import get_activation
+from .attention_processor import SpatialNorm
+from .downsampling import (  # noqa
+    Downsample1D,
+    Downsample2D,
+    FirDownsample2D,
+    KDownsample2D,
+    downsample_2d,
+)
+from .normalization import AdaGroupNorm
+from .upsampling import (  # noqa
+    FirUpsample2D,
+    KUpsample2D,
+    Upsample1D,
+    Upsample2D,
+    upfirdn2d_native,
+    upsample_2d,
+)
+
+
+class ResnetBlockCondNorm2D(nn.Module):
+    r"""
+    A Resnet block that use normalization layer that incorporate conditioning information.
+
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
+        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
+        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
+        groups_out (`int`, *optional*, default to None):
+            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
+        time_embedding_norm (`str`, *optional*, default to `"ada_group"` ):
+            The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial".
+        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
+            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
+        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
+        use_in_shortcut (`bool`, *optional*, default to `True`):
+            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
+        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
+        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
+        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
+            `conv_shortcut` output.
+        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
+            If None, same as `out_channels`.
+    """
+
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        conv_shortcut: bool = False,
+        dropout: float = 0.0,
+        temb_channels: int = 512,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
+        eps: float = 1e-6,
+        non_linearity: str = "swish",
+        time_embedding_norm: str = "ada_group",  # ada_group, spatial
+        output_scale_factor: float = 1.0,
+        use_in_shortcut: Optional[bool] = None,
+        up: bool = False,
+        down: bool = False,
+        conv_shortcut_bias: bool = True,
+        conv_2d_out_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.up = up
+        self.down = down
+        self.output_scale_factor = output_scale_factor
+        self.time_embedding_norm = time_embedding_norm
+
+        if groups_out is None:
+            groups_out = groups
+
+        if self.time_embedding_norm == "ada_group":  # ada_group
+            self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
+        elif self.time_embedding_norm == "spatial":
+            self.norm1 = SpatialNorm(in_channels, temb_channels)
+        else:
+            raise ValueError(f" unsupported time_embedding_norm: {self.time_embedding_norm}")
+
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+        if self.time_embedding_norm == "ada_group":  # ada_group
+            self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
+        elif self.time_embedding_norm == "spatial":  # spatial
+            self.norm2 = SpatialNorm(out_channels, temb_channels)
+        else:
+            raise ValueError(f" unsupported time_embedding_norm: {self.time_embedding_norm}")
+
+        self.dropout = torch.nn.Dropout(dropout)
+
+        conv_2d_out_channels = conv_2d_out_channels or out_channels
+        self.conv2 = nn.Conv2d(out_channels, conv_2d_out_channels, kernel_size=3, stride=1, padding=1)
+
+        self.nonlinearity = get_activation(non_linearity)
+
+        self.upsample = self.downsample = None
+        if self.up:
+            self.upsample = Upsample2D(in_channels, use_conv=False)
+        elif self.down:
+            self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op")
+
+        self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut
+
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = nn.Conv2d(
+                in_channels,
+                conv_2d_out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=conv_shortcut_bias,
+            )
+
+    def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        hidden_states = input_tensor
+
+        hidden_states = self.norm1(hidden_states, temb)
+
+        hidden_states = self.nonlinearity(hidden_states)
+
+        if self.upsample is not None:
+            # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+            if hidden_states.shape[0] >= 64:
+                input_tensor = input_tensor.contiguous()
+                hidden_states = hidden_states.contiguous()
+            input_tensor = self.upsample(input_tensor)
+            hidden_states = self.upsample(hidden_states)
+
+        elif self.downsample is not None:
+            input_tensor = self.downsample(input_tensor)
+            hidden_states = self.downsample(hidden_states)
+
+        hidden_states = self.conv1(hidden_states)
+
+        hidden_states = self.norm2(hidden_states, temb)
+
+        hidden_states = self.nonlinearity(hidden_states)
+
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+
+        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+
+        return output_tensor
+
+
+class ResnetBlock2D(nn.Module):
+    r"""
+    A Resnet block.
+
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
+        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
+        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
+        groups_out (`int`, *optional*, default to None):
+            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
+        time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
+            By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a
+            stronger conditioning with scale and shift.
+        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
+            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
+        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
+        use_in_shortcut (`bool`, *optional*, default to `True`):
+            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
+        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
+        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
+        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
+            `conv_shortcut` output.
+        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
+            If None, same as `out_channels`.
+    """
+
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        conv_shortcut: bool = False,
+        dropout: float = 0.0,
+        temb_channels: int = 512,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
+        pre_norm: bool = True,
+        eps: float = 1e-6,
+        non_linearity: str = "swish",
+        skip_time_act: bool = False,
+        time_embedding_norm: str = "default",  # default, scale_shift,
+        kernel: Optional[torch.Tensor] = None,
+        output_scale_factor: float = 1.0,
+        use_in_shortcut: Optional[bool] = None,
+        up: bool = False,
+        down: bool = False,
+        conv_shortcut_bias: bool = True,
+        conv_2d_out_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        if time_embedding_norm == "ada_group":
+            raise ValueError(
+                "This class cannot be used with `time_embedding_norm==ada_group`, please use `ResnetBlockCondNorm2D` instead",
+            )
+        if time_embedding_norm == "spatial":
+            raise ValueError(
+                "This class cannot be used with `time_embedding_norm==spatial`, please use `ResnetBlockCondNorm2D` instead",
+            )
+
+        self.pre_norm = True
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.up = up
+        self.down = down
+        self.output_scale_factor = output_scale_factor
+        self.time_embedding_norm = time_embedding_norm
+        self.skip_time_act = skip_time_act
+
+        if groups_out is None:
+            groups_out = groups
+
+        self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+        if temb_channels is not None:
+            if self.time_embedding_norm == "default":
+                self.time_emb_proj = nn.Linear(temb_channels, out_channels)
+            elif self.time_embedding_norm == "scale_shift":
+                self.time_emb_proj = nn.Linear(temb_channels, 2 * out_channels)
+            else:
+                raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+        else:
+            self.time_emb_proj = None
+
+        self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+
+        self.dropout = torch.nn.Dropout(dropout)
+        conv_2d_out_channels = conv_2d_out_channels or out_channels
+        self.conv2 = nn.Conv2d(out_channels, conv_2d_out_channels, kernel_size=3, stride=1, padding=1)
+
+        self.nonlinearity = get_activation(non_linearity)
+
+        self.upsample = self.downsample = None
+        if self.up:
+            if kernel == "fir":
+                fir_kernel = (1, 3, 3, 1)
+                self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel)
+            elif kernel == "sde_vp":
+                self.upsample = partial(F.interpolate, scale_factor=2.0, mode="nearest")
+            else:
+                self.upsample = Upsample2D(in_channels, use_conv=False)
+        elif self.down:
+            if kernel == "fir":
+                fir_kernel = (1, 3, 3, 1)
+                self.downsample = lambda x: downsample_2d(x, kernel=fir_kernel)
+            elif kernel == "sde_vp":
+                self.downsample = partial(F.avg_pool2d, kernel_size=2, stride=2)
+            else:
+                self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op")
+
+        self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut
+
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = nn.Conv2d(
+                in_channels,
+                conv_2d_out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=conv_shortcut_bias,
+            )
+
+    def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        hidden_states = input_tensor
+
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+
+        if self.upsample is not None:
+            # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+            if hidden_states.shape[0] >= 64:
+                input_tensor = input_tensor.contiguous()
+                hidden_states = hidden_states.contiguous()
+            input_tensor = self.upsample(input_tensor)
+            hidden_states = self.upsample(hidden_states)
+        elif self.downsample is not None:
+            input_tensor = self.downsample(input_tensor)
+            hidden_states = self.downsample(hidden_states)
+
+        hidden_states = self.conv1(hidden_states)
+
+        if self.time_emb_proj is not None:
+            if not self.skip_time_act:
+                temb = self.nonlinearity(temb)
+            temb = self.time_emb_proj(temb)[:, :, None, None]
+
+        if self.time_embedding_norm == "default":
+            if temb is not None:
+                hidden_states = hidden_states + temb
+            hidden_states = self.norm2(hidden_states)
+        elif self.time_embedding_norm == "scale_shift":
+            if temb is None:
+                raise ValueError(
+                    f" `temb` should not be None when `time_embedding_norm` is {self.time_embedding_norm}"
+                )
+            time_scale, time_shift = torch.chunk(temb, 2, dim=1)
+            hidden_states = self.norm2(hidden_states)
+            hidden_states = hidden_states * (1 + time_scale) + time_shift
+        else:
+            hidden_states = self.norm2(hidden_states)
+
+        hidden_states = self.nonlinearity(hidden_states)
+
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+
+        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+
+        return output_tensor
+
+
+# unet_rl.py
+def rearrange_dims(tensor: torch.Tensor) -> torch.Tensor:
+    if len(tensor.shape) == 2:
+        return tensor[:, :, None]
+    if len(tensor.shape) == 3:
+        return tensor[:, :, None, :]
+    elif len(tensor.shape) == 4:
+        return tensor[:, :, 0, :]
+    else:
+        raise ValueError(f"`len(tensor)`: {len(tensor)} has to be 2, 3 or 4.")
+
+
+class Conv1dBlock(nn.Module):
+    """
+    Conv1d --> GroupNorm --> Mish
+
+    Parameters:
+        inp_channels (`int`): Number of input channels.
+        out_channels (`int`): Number of output channels.
+        kernel_size (`int` or `tuple`): Size of the convolving kernel.
+        n_groups (`int`, default `8`): Number of groups to separate the channels into.
+        activation (`str`, defaults to `mish`): Name of the activation function.
+    """
+
+    def __init__(
+        self,
+        inp_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int]],
+        n_groups: int = 8,
+        activation: str = "mish",
+    ):
+        super().__init__()
+
+        self.conv1d = nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2)
+        self.group_norm = nn.GroupNorm(n_groups, out_channels)
+        self.mish = get_activation(activation)
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        intermediate_repr = self.conv1d(inputs)
+        intermediate_repr = rearrange_dims(intermediate_repr)
+        intermediate_repr = self.group_norm(intermediate_repr)
+        intermediate_repr = rearrange_dims(intermediate_repr)
+        output = self.mish(intermediate_repr)
+        return output
+
+
+# unet_rl.py
+class ResidualTemporalBlock1D(nn.Module):
+    """
+    Residual 1D block with temporal convolutions.
+
+    Parameters:
+        inp_channels (`int`): Number of input channels.
+        out_channels (`int`): Number of output channels.
+        embed_dim (`int`): Embedding dimension.
+        kernel_size (`int` or `tuple`): Size of the convolving kernel.
+        activation (`str`, defaults `mish`): It is possible to choose the right activation function.
+    """
+
+    def __init__(
+        self,
+        inp_channels: int,
+        out_channels: int,
+        embed_dim: int,
+        kernel_size: Union[int, Tuple[int, int]] = 5,
+        activation: str = "mish",
+    ):
+        super().__init__()
+        self.conv_in = Conv1dBlock(inp_channels, out_channels, kernel_size)
+        self.conv_out = Conv1dBlock(out_channels, out_channels, kernel_size)
+
+        self.time_emb_act = get_activation(activation)
+        self.time_emb = nn.Linear(embed_dim, out_channels)
+
+        self.residual_conv = (
+            nn.Conv1d(inp_channels, out_channels, 1) if inp_channels != out_channels else nn.Identity()
+        )
+
+    def forward(self, inputs: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            inputs : [ batch_size x inp_channels x horizon ]
+            t : [ batch_size x embed_dim ]
+
+        returns:
+            out : [ batch_size x out_channels x horizon ]
+        """
+        t = self.time_emb_act(t)
+        t = self.time_emb(t)
+        out = self.conv_in(inputs) + rearrange_dims(t)
+        out = self.conv_out(out)
+        return out + self.residual_conv(inputs)
+
+
+class TemporalConvLayer(nn.Module):
+    """
+    Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
+    https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016
+
+    Parameters:
+        in_dim (`int`): Number of input channels.
+        out_dim (`int`): Number of output channels.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: Optional[int] = None,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+    ):
+        super().__init__()
+        out_dim = out_dim or in_dim
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+
+        # conv layers
+        self.conv1 = nn.Sequential(
+            nn.GroupNorm(norm_num_groups, in_dim),
+            nn.SiLU(),
+            nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0)),
+        )
+        self.conv2 = nn.Sequential(
+            nn.GroupNorm(norm_num_groups, out_dim),
+            nn.SiLU(),
+            nn.Dropout(dropout),
+            nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
+        )
+        self.conv3 = nn.Sequential(
+            nn.GroupNorm(norm_num_groups, out_dim),
+            nn.SiLU(),
+            nn.Dropout(dropout),
+            nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
+        )
+        self.conv4 = nn.Sequential(
+            nn.GroupNorm(norm_num_groups, out_dim),
+            nn.SiLU(),
+            nn.Dropout(dropout),
+            nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
+        )
+
+        # zero out the last layer params,so the conv block is identity
+        nn.init.zeros_(self.conv4[-1].weight)
+        nn.init.zeros_(self.conv4[-1].bias)
+
+    def forward(self, hidden_states: torch.Tensor, num_frames: int = 1) -> torch.Tensor:
+        hidden_states = (
+            hidden_states[None, :].reshape((-1, num_frames) + hidden_states.shape[1:]).permute(0, 2, 1, 3, 4)
+        )
+
+        identity = hidden_states
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = self.conv3(hidden_states)
+        hidden_states = self.conv4(hidden_states)
+
+        hidden_states = identity + hidden_states
+
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(
+            (hidden_states.shape[0] * hidden_states.shape[2], -1) + hidden_states.shape[3:]
+        )
+        return hidden_states
+
+
+class TemporalResnetBlock(nn.Module):
+    r"""
+    A Resnet block.
+
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
+        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        temb_channels: int = 512,
+        eps: float = 1e-6,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+
+        kernel_size = (3, 1, 1)
+        padding = [k // 2 for k in kernel_size]
+
+        self.norm1 = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = nn.Conv3d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=padding,
+        )
+
+        if temb_channels is not None:
+            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
+        else:
+            self.time_emb_proj = None
+
+        self.norm2 = torch.nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=eps, affine=True)
+
+        self.dropout = torch.nn.Dropout(0.0)
+        self.conv2 = nn.Conv3d(
+            out_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=padding,
+        )
+
+        self.nonlinearity = get_activation("silu")
+
+        self.use_in_shortcut = self.in_channels != out_channels
+
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = nn.Conv3d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+
+    def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor) -> torch.Tensor:
+        hidden_states = input_tensor
+
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        if self.time_emb_proj is not None:
+            temb = self.nonlinearity(temb)
+            temb = self.time_emb_proj(temb)[:, :, :, None, None]
+            temb = temb.permute(0, 2, 1, 3, 4)
+            hidden_states = hidden_states + temb
+
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+
+        output_tensor = input_tensor + hidden_states
+
+        return output_tensor
+
+
+# VideoResBlock
+class SpatioTemporalResBlock(nn.Module):
+    r"""
+    A SpatioTemporal Resnet block.
+
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
+        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resenet.
+        temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet.
+        merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing.
+        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
+            The merge strategy to use for the temporal mixing.
+        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
+            If `True`, switch the spatial and temporal mixing.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        temb_channels: int = 512,
+        eps: float = 1e-6,
+        temporal_eps: Optional[float] = None,
+        merge_factor: float = 0.5,
+        merge_strategy="learned_with_images",
+        switch_spatial_to_temporal_mix: bool = False,
+    ):
+        super().__init__()
+
+        self.spatial_res_block = ResnetBlock2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            eps=eps,
+        )
+
+        self.temporal_res_block = TemporalResnetBlock(
+            in_channels=out_channels if out_channels is not None else in_channels,
+            out_channels=out_channels if out_channels is not None else in_channels,
+            temb_channels=temb_channels,
+            eps=temporal_eps if temporal_eps is not None else eps,
+        )
+
+        self.time_mixer = AlphaBlender(
+            alpha=merge_factor,
+            merge_strategy=merge_strategy,
+            switch_spatial_to_temporal_mix=switch_spatial_to_temporal_mix,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ):
+        num_frames = image_only_indicator.shape[-1]
+        hidden_states = self.spatial_res_block(hidden_states, temb)
+
+        batch_frames, channels, height, width = hidden_states.shape
+        batch_size = batch_frames // num_frames
+
+        hidden_states_mix = (
+            hidden_states[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4)
+        )
+        hidden_states = (
+            hidden_states[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4)
+        )
+
+        if temb is not None:
+            temb = temb.reshape(batch_size, num_frames, -1)
+
+        hidden_states = self.temporal_res_block(hidden_states, temb)
+        hidden_states = self.time_mixer(
+            x_spatial=hidden_states_mix,
+            x_temporal=hidden_states,
+            image_only_indicator=image_only_indicator,
+        )
+
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(batch_frames, channels, height, width)
+        return hidden_states
+
+
+class AlphaBlender(nn.Module):
+    r"""
+    A module to blend spatial and temporal features.
+
+    Parameters:
+        alpha (`float`): The initial value of the blending factor.
+        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
+            The merge strategy to use for the temporal mixing.
+        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
+            If `True`, switch the spatial and temporal mixing.
+    """
+
+    strategies = ["learned", "fixed", "learned_with_images"]
+
+    def __init__(
+        self,
+        alpha: float,
+        merge_strategy: str = "learned_with_images",
+        switch_spatial_to_temporal_mix: bool = False,
+    ):
+        super().__init__()
+        self.merge_strategy = merge_strategy
+        self.switch_spatial_to_temporal_mix = switch_spatial_to_temporal_mix  # For TemporalVAE
+
+        if merge_strategy not in self.strategies:
+            raise ValueError(f"merge_strategy needs to be in {self.strategies}")
+
+        if self.merge_strategy == "fixed":
+            self.register_buffer("mix_factor", torch.Tensor([alpha]))
+        elif self.merge_strategy == "learned" or self.merge_strategy == "learned_with_images":
+            self.register_parameter("mix_factor", torch.nn.Parameter(torch.Tensor([alpha])))
+        else:
+            raise ValueError(f"Unknown merge strategy {self.merge_strategy}")
+
+    def get_alpha(self, image_only_indicator: torch.Tensor, ndims: int) -> torch.Tensor:
+        if self.merge_strategy == "fixed":
+            alpha = self.mix_factor
+
+        elif self.merge_strategy == "learned":
+            alpha = torch.sigmoid(self.mix_factor)
+
+        elif self.merge_strategy == "learned_with_images":
+            if image_only_indicator is None:
+                raise ValueError("Please provide image_only_indicator to use learned_with_images merge strategy")
+
+            alpha = torch.where(
+                image_only_indicator.bool(),
+                torch.ones(1, 1, device=image_only_indicator.device),
+                torch.sigmoid(self.mix_factor)[..., None],
+            )
+
+            # (batch, channel, frames, height, width)
+            if ndims == 5:
+                alpha = alpha[:, None, :, None, None]
+            # (batch*frames, height*width, channels)
+            elif ndims == 3:
+                alpha = alpha.reshape(-1)[:, None, None]
+            else:
+                raise ValueError(f"Unexpected ndims {ndims}. Dimensions should be 3 or 5")
+
+        else:
+            raise NotImplementedError
+
+        return alpha
+
+    def forward(
+        self,
+        x_spatial: torch.Tensor,
+        x_temporal: torch.Tensor,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        alpha = self.get_alpha(image_only_indicator, x_spatial.ndim)
+        alpha = alpha.to(x_spatial.dtype)
+
+        if self.switch_spatial_to_temporal_mix:
+            alpha = 1.0 - alpha
+
+        x = alpha * x_spatial + (1.0 - alpha) * x_temporal
+        return x
diff --git a/diffusers/src/diffusers/models/resnet_flax.py b/diffusers/src/diffusers/models/resnet_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8bb4788d9a5620cbe19607a5f80d00b1b12ddca
--- /dev/null
+++ b/diffusers/src/diffusers/models/resnet_flax.py
@@ -0,0 +1,124 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+
+
+class FlaxUpsample2D(nn.Module):
+    out_channels: int
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.conv = nn.Conv(
+            self.out_channels,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+    def __call__(self, hidden_states):
+        batch, height, width, channels = hidden_states.shape
+        hidden_states = jax.image.resize(
+            hidden_states,
+            shape=(batch, height * 2, width * 2, channels),
+            method="nearest",
+        )
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class FlaxDownsample2D(nn.Module):
+    out_channels: int
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.conv = nn.Conv(
+            self.out_channels,
+            kernel_size=(3, 3),
+            strides=(2, 2),
+            padding=((1, 1), (1, 1)),  # padding="VALID",
+            dtype=self.dtype,
+        )
+
+    def __call__(self, hidden_states):
+        # pad = ((0, 0), (0, 1), (0, 1), (0, 0))  # pad height and width dim
+        # hidden_states = jnp.pad(hidden_states, pad_width=pad)
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class FlaxResnetBlock2D(nn.Module):
+    in_channels: int
+    out_channels: int = None
+    dropout_prob: float = 0.0
+    use_nin_shortcut: bool = None
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        out_channels = self.in_channels if self.out_channels is None else self.out_channels
+
+        self.norm1 = nn.GroupNorm(num_groups=32, epsilon=1e-5)
+        self.conv1 = nn.Conv(
+            out_channels,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+        self.time_emb_proj = nn.Dense(out_channels, dtype=self.dtype)
+
+        self.norm2 = nn.GroupNorm(num_groups=32, epsilon=1e-5)
+        self.dropout = nn.Dropout(self.dropout_prob)
+        self.conv2 = nn.Conv(
+            out_channels,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+        use_nin_shortcut = self.in_channels != out_channels if self.use_nin_shortcut is None else self.use_nin_shortcut
+
+        self.conv_shortcut = None
+        if use_nin_shortcut:
+            self.conv_shortcut = nn.Conv(
+                out_channels,
+                kernel_size=(1, 1),
+                strides=(1, 1),
+                padding="VALID",
+                dtype=self.dtype,
+            )
+
+    def __call__(self, hidden_states, temb, deterministic=True):
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = nn.swish(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        temb = self.time_emb_proj(nn.swish(temb))
+        temb = jnp.expand_dims(jnp.expand_dims(temb, 1), 1)
+        hidden_states = hidden_states + temb
+
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = nn.swish(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            residual = self.conv_shortcut(residual)
+
+        return hidden_states + residual
diff --git a/diffusers/src/diffusers/models/transformers/__init__.py b/diffusers/src/diffusers/models/transformers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab2c6e32701266aa332af6496aeef6178e72e03f
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/__init__.py
@@ -0,0 +1,22 @@
+from ...utils import is_torch_available
+
+
+if is_torch_available():
+    from .auraflow_transformer_2d import AuraFlowTransformer2DModel
+    from .cogvideox_transformer_3d import CogVideoXTransformer3DModel
+    from .dit_transformer_2d import DiTTransformer2DModel
+    from .dual_transformer_2d import DualTransformer2DModel
+    from .hunyuan_transformer_2d import HunyuanDiT2DModel
+    from .latte_transformer_3d import LatteTransformer3DModel
+    from .lumina_nextdit2d import LuminaNextDiT2DModel
+    from .pixart_transformer_2d import PixArtTransformer2DModel
+    from .colorguider_pixart import ColorGuiderPixArtModel
+    from .causal_sparse_dit import CausalSparseDiTModel
+    from .causal_sparse_dit_control import CausalSparseDiTControlModel
+    from .prior_transformer import PriorTransformer
+    from .stable_audio_transformer import StableAudioDiTModel
+    from .t5_film_transformer import T5FilmDecoder
+    from .transformer_2d import Transformer2DModel
+    from .transformer_flux import FluxTransformer2DModel
+    from .transformer_sd3 import SD3Transformer2DModel
+    from .transformer_temporal import TransformerTemporalModel
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be8cc6fc13aba4e69065b311c5dfaf578bab8469
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/auraflow_transformer_2d.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/auraflow_transformer_2d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f657c34160c3aa7d364ae2a54d60e74ae86e84b4
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/auraflow_transformer_2d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/causal_sparse_dit.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/causal_sparse_dit.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..986795c8bcf9ae8884556dfa35458c52eef5bef9
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/causal_sparse_dit.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/causal_sparse_dit_control.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/causal_sparse_dit_control.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94ca580c37f94e12b4cbaac98b95d0f3cea250b4
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/causal_sparse_dit_control.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/cogvideox_transformer_3d.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/cogvideox_transformer_3d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4c7ab10435264ec4b4d73113698567d3e53dccb
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/cogvideox_transformer_3d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/colorguider_pixart.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/colorguider_pixart.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1986701e72209365a0b1db864a398b33acdd33db
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/colorguider_pixart.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/dit_transformer_2d.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/dit_transformer_2d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65980b2a295eed9818cb2f814e978c09b287b919
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/dit_transformer_2d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/dual_transformer_2d.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/dual_transformer_2d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c6557db680d50abf78464f92f3823bb8061736a
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/dual_transformer_2d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/hunyuan_transformer_2d.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/hunyuan_transformer_2d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8b673085340b86e7730d0aa8eb6f2c8b2eb2acf
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/hunyuan_transformer_2d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/latte_transformer_3d.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/latte_transformer_3d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2777b52b4c1dea748b539325510adc9773336e42
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/latte_transformer_3d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/lumina_nextdit2d.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/lumina_nextdit2d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e3caa5d99a731efcd7f2ab1e1a9da8f93244b33
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/lumina_nextdit2d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/pixart_transformer_2d.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/pixart_transformer_2d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c835d799ba7756ec109aaebcaca2789e13d7c61
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/pixart_transformer_2d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/prior_transformer.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/prior_transformer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8926c35addd322cbbbc65e3f3ac13f0109a1226e
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/prior_transformer.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/stable_audio_transformer.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/stable_audio_transformer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b687a3e1b5de371ee1468877503d67db0392f56
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/stable_audio_transformer.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/t5_film_transformer.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/t5_film_transformer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6368f5e8377079530537b6e2c29362b5113526ec
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/t5_film_transformer.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/transformer_2d.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/transformer_2d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8761987bd21b5b652f9546a894d130391c44a2bf
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/transformer_2d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/transformer_flux.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/transformer_flux.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d855aebbb0825e3e0f7f3fcad2a065f7c711e96
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/transformer_flux.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/transformer_sd3.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/transformer_sd3.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..121b9796df4ebed2dc627ecd1cf479fd4a6cd370
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/transformer_sd3.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/__pycache__/transformer_temporal.cpython-311.pyc b/diffusers/src/diffusers/models/transformers/__pycache__/transformer_temporal.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..588f5c35b7785b5f997316af4d4abac8a1ed547e
Binary files /dev/null and b/diffusers/src/diffusers/models/transformers/__pycache__/transformer_temporal.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/transformers/auraflow_transformer_2d.py b/diffusers/src/diffusers/models/transformers/auraflow_transformer_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad64df0c0790725ba1876d88a75fd9c055918466
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/auraflow_transformer_2d.py
@@ -0,0 +1,544 @@
+# Copyright 2024 AuraFlow Authors, The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any, Dict, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import is_torch_version, logging
+from ...utils.torch_utils import maybe_allow_in_graph
+from ..attention_processor import (
+    Attention,
+    AttentionProcessor,
+    AuraFlowAttnProcessor2_0,
+    FusedAuraFlowAttnProcessor2_0,
+)
+from ..embeddings import TimestepEmbedding, Timesteps
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormZero, FP32LayerNorm
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Taken from the original aura flow inference code.
+def find_multiple(n: int, k: int) -> int:
+    if n % k == 0:
+        return n
+    return n + k - (n % k)
+
+
+# Aura Flow patch embed doesn't use convs for projections.
+# Additionally, it uses learned positional embeddings.
+class AuraFlowPatchEmbed(nn.Module):
+    def __init__(
+        self,
+        height=224,
+        width=224,
+        patch_size=16,
+        in_channels=3,
+        embed_dim=768,
+        pos_embed_max_size=None,
+    ):
+        super().__init__()
+
+        self.num_patches = (height // patch_size) * (width // patch_size)
+        self.pos_embed_max_size = pos_embed_max_size
+
+        self.proj = nn.Linear(patch_size * patch_size * in_channels, embed_dim)
+        self.pos_embed = nn.Parameter(torch.randn(1, pos_embed_max_size, embed_dim) * 0.1)
+
+        self.patch_size = patch_size
+        self.height, self.width = height // patch_size, width // patch_size
+        self.base_size = height // patch_size
+
+    def pe_selection_index_based_on_dim(self, h, w):
+        # select subset of positional embedding based on H, W, where H, W is size of latent
+        # PE will be viewed as 2d-grid, and H/p x W/p of the PE will be selected
+        # because original input are in flattened format, we have to flatten this 2d grid as well.
+        h_p, w_p = h // self.patch_size, w // self.patch_size
+        original_pe_indexes = torch.arange(self.pos_embed.shape[1])
+        h_max, w_max = int(self.pos_embed_max_size**0.5), int(self.pos_embed_max_size**0.5)
+        original_pe_indexes = original_pe_indexes.view(h_max, w_max)
+        starth = h_max // 2 - h_p // 2
+        endh = starth + h_p
+        startw = w_max // 2 - w_p // 2
+        endw = startw + w_p
+        original_pe_indexes = original_pe_indexes[starth:endh, startw:endw]
+        return original_pe_indexes.flatten()
+
+    def forward(self, latent):
+        batch_size, num_channels, height, width = latent.size()
+        latent = latent.view(
+            batch_size,
+            num_channels,
+            height // self.patch_size,
+            self.patch_size,
+            width // self.patch_size,
+            self.patch_size,
+        )
+        latent = latent.permute(0, 2, 4, 1, 3, 5).flatten(-3).flatten(1, 2)
+        latent = self.proj(latent)
+        pe_index = self.pe_selection_index_based_on_dim(height, width)
+        return latent + self.pos_embed[:, pe_index]
+
+
+# Taken from the original Aura flow inference code.
+# Our feedforward only has GELU but Aura uses SiLU.
+class AuraFlowFeedForward(nn.Module):
+    def __init__(self, dim, hidden_dim=None) -> None:
+        super().__init__()
+        if hidden_dim is None:
+            hidden_dim = 4 * dim
+
+        final_hidden_dim = int(2 * hidden_dim / 3)
+        final_hidden_dim = find_multiple(final_hidden_dim, 256)
+
+        self.linear_1 = nn.Linear(dim, final_hidden_dim, bias=False)
+        self.linear_2 = nn.Linear(dim, final_hidden_dim, bias=False)
+        self.out_projection = nn.Linear(final_hidden_dim, dim, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.silu(self.linear_1(x)) * self.linear_2(x)
+        x = self.out_projection(x)
+        return x
+
+
+class AuraFlowPreFinalBlock(nn.Module):
+    def __init__(self, embedding_dim: int, conditioning_embedding_dim: int):
+        super().__init__()
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=False)
+
+    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
+        emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
+        scale, shift = torch.chunk(emb, 2, dim=1)
+        x = x * (1 + scale)[:, None, :] + shift[:, None, :]
+        return x
+
+
+@maybe_allow_in_graph
+class AuraFlowSingleTransformerBlock(nn.Module):
+    """Similar to `AuraFlowJointTransformerBlock` with a single DiT instead of an MMDiT."""
+
+    def __init__(self, dim, num_attention_heads, attention_head_dim):
+        super().__init__()
+
+        self.norm1 = AdaLayerNormZero(dim, bias=False, norm_type="fp32_layer_norm")
+
+        processor = AuraFlowAttnProcessor2_0()
+        self.attn = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            qk_norm="fp32_layer_norm",
+            out_dim=dim,
+            bias=False,
+            out_bias=False,
+            processor=processor,
+        )
+
+        self.norm2 = FP32LayerNorm(dim, elementwise_affine=False, bias=False)
+        self.ff = AuraFlowFeedForward(dim, dim * 4)
+
+    def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor):
+        residual = hidden_states
+
+        # Norm + Projection.
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+
+        # Attention.
+        attn_output = self.attn(hidden_states=norm_hidden_states)
+
+        # Process attention outputs for the `hidden_states`.
+        hidden_states = self.norm2(residual + gate_msa.unsqueeze(1) * attn_output)
+        hidden_states = hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        ff_output = self.ff(hidden_states)
+        hidden_states = gate_mlp.unsqueeze(1) * ff_output
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+@maybe_allow_in_graph
+class AuraFlowJointTransformerBlock(nn.Module):
+    r"""
+    Transformer block for Aura Flow. Similar to SD3 MMDiT. Differences (non-exhaustive):
+
+        * QK Norm in the attention blocks
+        * No bias in the attention blocks
+        * Most LayerNorms are in FP32
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        is_last (`bool`): Boolean to determine if this is the last block in the model.
+    """
+
+    def __init__(self, dim, num_attention_heads, attention_head_dim):
+        super().__init__()
+
+        self.norm1 = AdaLayerNormZero(dim, bias=False, norm_type="fp32_layer_norm")
+        self.norm1_context = AdaLayerNormZero(dim, bias=False, norm_type="fp32_layer_norm")
+
+        processor = AuraFlowAttnProcessor2_0()
+        self.attn = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,
+            added_kv_proj_dim=dim,
+            added_proj_bias=False,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            qk_norm="fp32_layer_norm",
+            out_dim=dim,
+            bias=False,
+            out_bias=False,
+            processor=processor,
+            context_pre_only=False,
+        )
+
+        self.norm2 = FP32LayerNorm(dim, elementwise_affine=False, bias=False)
+        self.ff = AuraFlowFeedForward(dim, dim * 4)
+        self.norm2_context = FP32LayerNorm(dim, elementwise_affine=False, bias=False)
+        self.ff_context = AuraFlowFeedForward(dim, dim * 4)
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor
+    ):
+        residual = hidden_states
+        residual_context = encoder_hidden_states
+
+        # Norm + Projection.
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
+            encoder_hidden_states, emb=temb
+        )
+
+        # Attention.
+        attn_output, context_attn_output = self.attn(
+            hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states
+        )
+
+        # Process attention outputs for the `hidden_states`.
+        hidden_states = self.norm2(residual + gate_msa.unsqueeze(1) * attn_output)
+        hidden_states = hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        hidden_states = gate_mlp.unsqueeze(1) * self.ff(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Process attention outputs for the `encoder_hidden_states`.
+        encoder_hidden_states = self.norm2_context(residual_context + c_gate_msa.unsqueeze(1) * context_attn_output)
+        encoder_hidden_states = encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+        encoder_hidden_states = c_gate_mlp.unsqueeze(1) * self.ff_context(encoder_hidden_states)
+        encoder_hidden_states = residual_context + encoder_hidden_states
+
+        return encoder_hidden_states, hidden_states
+
+
+class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin):
+    r"""
+    A 2D Transformer model as introduced in AuraFlow (https://blog.fal.ai/auraflow/).
+
+    Parameters:
+        sample_size (`int`): The width of the latent images. This is fixed during training since
+            it is used to learn a number of position embeddings.
+        patch_size (`int`): Patch size to turn the input data into small patches.
+        in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
+        num_mmdit_layers (`int`, *optional*, defaults to 4): The number of layers of MMDiT Transformer blocks to use.
+        num_single_dit_layers (`int`, *optional*, defaults to 4):
+            The number of layers of Transformer blocks to use. These blocks use concatenated image and text
+            representations.
+        attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
+        num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
+        joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        caption_projection_dim (`int`): Number of dimensions to use when projecting the `encoder_hidden_states`.
+        out_channels (`int`, defaults to 16): Number of output channels.
+        pos_embed_max_size (`int`, defaults to 4096): Maximum positions to embed from the image latents.
+    """
+
+    _no_split_modules = ["AuraFlowJointTransformerBlock", "AuraFlowSingleTransformerBlock", "AuraFlowPatchEmbed"]
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: int = 64,
+        patch_size: int = 2,
+        in_channels: int = 4,
+        num_mmdit_layers: int = 4,
+        num_single_dit_layers: int = 32,
+        attention_head_dim: int = 256,
+        num_attention_heads: int = 12,
+        joint_attention_dim: int = 2048,
+        caption_projection_dim: int = 3072,
+        out_channels: int = 4,
+        pos_embed_max_size: int = 1024,
+    ):
+        super().__init__()
+        default_out_channels = in_channels
+        self.out_channels = out_channels if out_channels is not None else default_out_channels
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+
+        self.pos_embed = AuraFlowPatchEmbed(
+            height=self.config.sample_size,
+            width=self.config.sample_size,
+            patch_size=self.config.patch_size,
+            in_channels=self.config.in_channels,
+            embed_dim=self.inner_dim,
+            pos_embed_max_size=pos_embed_max_size,
+        )
+
+        self.context_embedder = nn.Linear(
+            self.config.joint_attention_dim, self.config.caption_projection_dim, bias=False
+        )
+        self.time_step_embed = Timesteps(num_channels=256, downscale_freq_shift=0, scale=1000, flip_sin_to_cos=True)
+        self.time_step_proj = TimestepEmbedding(in_channels=256, time_embed_dim=self.inner_dim)
+
+        self.joint_transformer_blocks = nn.ModuleList(
+            [
+                AuraFlowJointTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=self.config.num_attention_heads,
+                    attention_head_dim=self.config.attention_head_dim,
+                )
+                for i in range(self.config.num_mmdit_layers)
+            ]
+        )
+        self.single_transformer_blocks = nn.ModuleList(
+            [
+                AuraFlowSingleTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=self.config.num_attention_heads,
+                    attention_head_dim=self.config.attention_head_dim,
+                )
+                for _ in range(self.config.num_single_dit_layers)
+            ]
+        )
+
+        self.norm_out = AuraFlowPreFinalBlock(self.inner_dim, self.inner_dim)
+        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=False)
+
+        # https://arxiv.org/abs/2309.16588
+        # prevents artifacts in the attention maps
+        self.register_tokens = nn.Parameter(torch.randn(1, 8, self.inner_dim) * 0.02)
+
+        self.gradient_checkpointing = False
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedAuraFlowAttnProcessor2_0
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedAuraFlowAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        timestep: torch.LongTensor = None,
+        return_dict: bool = True,
+    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
+        height, width = hidden_states.shape[-2:]
+
+        # Apply patch embedding, timestep embedding, and project the caption embeddings.
+        hidden_states = self.pos_embed(hidden_states)  # takes care of adding positional embeddings too.
+        temb = self.time_step_embed(timestep).to(dtype=next(self.parameters()).dtype)
+        temb = self.time_step_proj(temb)
+        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
+        encoder_hidden_states = torch.cat(
+            [self.register_tokens.repeat(encoder_hidden_states.size(0), 1, 1), encoder_hidden_states], dim=1
+        )
+
+        # MMDiT blocks.
+        for index_block, block in enumerate(self.joint_transformer_blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb
+                )
+
+        # Single DiT blocks that combine the `hidden_states` (image) and `encoder_hidden_states` (text)
+        if len(self.single_transformer_blocks) > 0:
+            encoder_seq_len = encoder_hidden_states.size(1)
+            combined_hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+            for index_block, block in enumerate(self.single_transformer_blocks):
+                if self.training and self.gradient_checkpointing:
+
+                    def create_custom_forward(module, return_dict=None):
+                        def custom_forward(*inputs):
+                            if return_dict is not None:
+                                return module(*inputs, return_dict=return_dict)
+                            else:
+                                return module(*inputs)
+
+                        return custom_forward
+
+                    ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                    combined_hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(block),
+                        combined_hidden_states,
+                        temb,
+                        **ckpt_kwargs,
+                    )
+
+                else:
+                    combined_hidden_states = block(hidden_states=combined_hidden_states, temb=temb)
+
+            hidden_states = combined_hidden_states[:, encoder_seq_len:]
+
+        hidden_states = self.norm_out(hidden_states, temb)
+        hidden_states = self.proj_out(hidden_states)
+
+        # unpatchify
+        patch_size = self.config.patch_size
+        out_channels = self.config.out_channels
+        height = height // patch_size
+        width = width // patch_size
+
+        hidden_states = hidden_states.reshape(
+            shape=(hidden_states.shape[0], height, width, patch_size, patch_size, out_channels)
+        )
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(
+            shape=(hidden_states.shape[0], out_channels, height * patch_size, width * patch_size)
+        )
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/diffusers/src/diffusers/models/transformers/causal_sparse_dit.py b/diffusers/src/diffusers/models/transformers/causal_sparse_dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e462f5f23cadbf2848633d9b2cbde7f5bb7b2d4
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/causal_sparse_dit.py
@@ -0,0 +1,659 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+try:
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
+except:
+    print('torch_npu not found')
+from typing import Any, Dict, Optional, Union
+
+import torch
+from torch import nn
+from typing import List
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import is_torch_version, logging
+from ..attention import BasicTransformerBlock, CausalTransformerKVcacheBlock
+from ..attention_processor import Attention, AttentionProcessor, AttnProcessor, FusedAttnProcessor2_0
+from ..embeddings import PatchEmbed, PixArtAlphaTextProjection, CausalPatchEmbed
+from ..modeling_outputs import Transformer2DModelOutput, CausalTransformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormSingle
+from ...loaders import PeftAdapterMixin
+from einops import rearrange
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+def get_positional_encoding_1d(d_model, max_len=5000):
+    """
+    生成1D的正弦和余弦位置编码矩阵。
+
+    参数:
+        d_model (int): 模型的维度。
+        max_len (int): 序列的最大长度。
+
+    返回:
+        pe (torch.Tensor): 位置编码矩阵，形状为 (max_len, d_model)。
+    """
+    # 创建位置编码矩阵
+    pe = torch.zeros(max_len, d_model)
+    
+    # 生成位置索引
+    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+    
+    # 生成频率项
+    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+    
+    # 计算正弦和余弦编码
+    pe[:, 0::2] = torch.sin(position * div_term)  # 偶数位置使用正弦
+    pe[:, 1::2] = torch.cos(position * div_term)  # 奇数位置使用余弦
+    
+    return pe
+
+class CausalSparseDiTModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+    r"""
+    A 2D Transformer model as introduced in PixArt family of models (https://arxiv.org/abs/2310.00426,
+    https://arxiv.org/abs/2403.04692).
+
+    Parameters:
+        num_attention_heads (int, optional, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (int, optional, defaults to 72): The number of channels in each head.
+        in_channels (int, defaults to 4): The number of channels in the input.
+        out_channels (int, optional):
+            The number of channels in the output. Specify this parameter if the output channel number differs from the
+            input.
+        num_layers (int, optional, defaults to 28): The number of layers of Transformer blocks to use.
+        dropout (float, optional, defaults to 0.0): The dropout probability to use within the Transformer blocks.
+        norm_num_groups (int, optional, defaults to 32):
+            Number of groups for group normalization within Transformer blocks.
+        cross_attention_dim (int, optional):
+            The dimensionality for cross-attention layers, typically matching the encoder's hidden dimension.
+        attention_bias (bool, optional, defaults to True):
+            Configure if the Transformer blocks' attention should contain a bias parameter.
+        sample_size (int, defaults to 128):
+            The width of the latent images. This parameter is fixed during training.
+        patch_size (int, defaults to 2):
+            Size of the patches the model processes, relevant for architectures working on non-sequential data.
+        activation_fn (str, optional, defaults to "gelu-approximate"):
+            Activation function to use in feed-forward networks within Transformer blocks.
+        num_embeds_ada_norm (int, optional, defaults to 1000):
+            Number of embeddings for AdaLayerNorm, fixed during training and affects the maximum denoising steps during
+            inference.
+        upcast_attention (bool, optional, defaults to False):
+            If true, upcasts the attention mechanism dimensions for potentially improved performance.
+        norm_type (str, optional, defaults to "ada_norm_zero"):
+            Specifies the type of normalization used, can be 'ada_norm_zero'.
+        norm_elementwise_affine (bool, optional, defaults to False):
+            If true, enables element-wise affine parameters in the normalization layers.
+        norm_eps (float, optional, defaults to 1e-6):
+            A small constant added to the denominator in normalization layers to prevent division by zero.
+        interpolation_scale (int, optional): Scale factor to use during interpolating the position embeddings.
+        use_additional_conditions (bool, optional): If we're using additional conditions as inputs.
+        attention_type (str, optional, defaults to "default"): Kind of attention mechanism to be used.
+        caption_channels (int, optional, defaults to None):
+            Number of channels to use for projecting the caption embeddings.
+        use_linear_projection (bool, optional, defaults to False):
+            Deprecated argument. Will be removed in a future version.
+        num_vector_embeds (bool, optional, defaults to False):
+            Deprecated argument. Will be removed in a future version.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["CausalTransformerKVcacheBlock", "CausalPatchEmbed"]
+# {
+#   "_class_name": "Transformer2DModel",
+#   "_diffusers_version": "0.22.0.dev0",
+#   "activation_fn": "gelu-approximate",
+#   "attention_bias": true,
+#   "attention_head_dim": 72,
+#   "attention_type": "default",
+#   "caption_channels": 4096,
+#   "cross_attention_dim": 1152,
+#   "double_self_attention": false,
+#   "dropout": 0.0,
+#   "in_channels": 4,
+#   "norm_elementwise_affine": false,
+#   "norm_eps": 1e-06,
+#   "norm_num_groups": 32,
+#   "norm_type": "ada_norm_single",
+#   "num_attention_heads": 16,
+#   "num_embeds_ada_norm": 1000,
+#   "num_layers": 28,
+#   "num_vector_embeds": null,
+#   "only_cross_attention": false,
+#   "out_channels": 8,
+#   "patch_size": 2,
+#   "sample_size": 128,
+#   "upcast_attention": false,
+#   "use_linear_projection": false
+# }
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 72,
+        in_channels: int = 4,
+        out_channels: Optional[int] = 8,
+        num_layers: int = 28,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = 1152,
+        attention_bias: bool = True,
+        sample_size: int = 128,
+        patch_size: int = 2,
+        activation_fn: str = "gelu-approximate",
+        num_embeds_ada_norm: Optional[int] = 1000,
+        upcast_attention: bool = False,
+        norm_type: str = "ada_norm_single",
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-6,
+        interpolation_scale: Optional[int] = None,
+        use_additional_conditions: Optional[bool] = None,
+        caption_channels: Optional[int] = None,
+        attention_type: Optional[str] = "default",
+    ):
+        super().__init__()
+
+        # Validate inputs.
+        if norm_type != "ada_norm_single":
+            raise NotImplementedError(
+                f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+            )
+        elif norm_type == "ada_norm_single" and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+            )
+
+        # Set some common variables used across the board.
+        self.attention_head_dim = attention_head_dim
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+        self.out_channels = in_channels if out_channels is None else out_channels
+        if use_additional_conditions is None:
+            if sample_size == 128:
+                use_additional_conditions = True
+            else:
+                use_additional_conditions = False
+        self.use_additional_conditions = use_additional_conditions
+        # print(f"----------------self.use_additional_conditions: {self.use_additional_conditions} {sample_size}")
+
+        self.gradient_checkpointing = False
+
+        # 2. Initialize the position embedding and transformer blocks.
+        self.height = self.config.sample_size
+        self.width = self.config.sample_size
+
+        interpolation_scale = (
+            self.config.interpolation_scale
+            if self.config.interpolation_scale is not None
+            else max(self.config.sample_size // 64, 1)
+        )
+        self.pos_embed = CausalPatchEmbed(
+            height=self.config.sample_size,
+            width=self.config.sample_size,
+            patch_size=self.config.patch_size,
+            in_channels=self.config.in_channels,
+            embed_dim=self.inner_dim,
+            interpolation_scale=interpolation_scale,
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                CausalTransformerKVcacheBlock(
+                    self.inner_dim,
+                    self.config.num_attention_heads,
+                    self.config.attention_head_dim,
+                    dropout=self.config.dropout,
+                    cross_attention_dim=self.config.cross_attention_dim,
+                    activation_fn=self.config.activation_fn,
+                    num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+                    attention_bias=self.config.attention_bias,
+                    upcast_attention=self.config.upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=self.config.norm_elementwise_affine,
+                    norm_eps=self.config.norm_eps,
+                    attention_type=self.config.attention_type,
+                )
+                for _ in range(self.config.num_layers)
+            ]
+        )
+
+        # 3. Output blocks.
+        self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim**0.5)
+        self.proj_out = nn.Linear(self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels)
+
+        self.adaln_single = AdaLayerNormSingle(
+            self.inner_dim, use_additional_conditions=self.use_additional_conditions
+        )
+        self.caption_projection = None
+        if self.config.caption_channels is not None:
+            self.caption_projection = PixArtAlphaTextProjection(
+                in_features=self.config.caption_channels, hidden_size=self.inner_dim
+            )
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+
+        Safe to just use `AttnProcessor()` as PixArt doesn't have any exotic attention processors in default model.
+        """
+        self.set_attn_processor(AttnProcessor())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor, # (b, c, h, w)
+        ref_hidden_states: torch.Tensor = None, # (b, n_ref, c, h, w)
+        n_ref_lists: List[List[int]] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Dict[str, torch.Tensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        control_list: Optional[list] = None,
+        return_dict: bool = True,
+        repa_layer_idx: Optional[int] = None,
+        K_cache: Optional[list] = None,
+        V_cache: Optional[list] = None,
+    ):
+        """
+        The [`PixArtTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep (`torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            added_cond_kwargs: (`Dict[str, Any]`, *optional*): Additional conditions to be used as inputs.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if self.use_additional_conditions and added_cond_kwargs is None:
+            raise ValueError("`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`.")
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        no_cache = K_cache is None and V_cache is None
+
+        # 1. Input
+        batch_size = hidden_states.shape[0]
+        if no_cache:
+            b_list = len(n_ref_lists)
+            if b_list!=batch_size:
+                raise ValueError(f"Number of reference images ({b_list}) does not match the batch size ({batch_size})")
+        height, width = (
+            hidden_states.shape[-2] // self.config.patch_size,
+            hidden_states.shape[-1] // self.config.patch_size,
+        )
+        
+        if no_cache:
+            K_cache_cur = []
+            V_cache_cur = []
+            N_ref = ref_hidden_states.shape[1]
+            if N_ref != sum(n_ref_lists[0]):
+                raise ValueError(f"Number of reference images ({N_ref}) does not match the sum of reference image counts ({sum(n_ref_lists[0])})")
+
+            height_refs, width_refs = (
+                ref_hidden_states.shape[-2] // self.config.patch_size,
+                ref_hidden_states.shape[-1] // self.config.patch_size,
+            )
+            # print('height_refs',height_refs, 'width_refs',width_refs, 'height',height, 'width',width)
+
+            if height_refs*2 != height or width_refs*2 != width:
+                raise ValueError(f"Height and width of ref_hidden_states must be twice the size of hidden_states. Got height_refs={height_refs}, width_refs={width_refs}, height={height}, width={width}")
+
+            ref_hidden_states = rearrange(ref_hidden_states, 'b n_ref c h w -> (b n_ref) c h w')
+            ref_hidden_states = self.pos_embed(ref_hidden_states)
+            ref_hidden_states = rearrange(ref_hidden_states, '(b n_ref) l c -> b n_ref l c', n_ref=N_ref)
+        
+
+        hidden_states = self.pos_embed(hidden_states) # (b, l, c)
+        # print('hidden_states',hidden_states.shape)
+
+        pos_all = self.pos_embed.get_pos_embed(height, width).to(hidden_states.device)
+        # print('pos_all',pos_all.shape)
+
+
+        pos_all = rearrange(pos_all, 'b (h w) c -> b h w c', h=height, w=width*2)
+        pos_all = rearrange(pos_all, 'b (ph h) (pw w) c -> b (ph pw) h w c', ph=2, pw=4)
+
+        if no_cache:
+            pos_idx0 = rearrange(pos_all[:,0,:,:,:], 'b h w c -> b (h w) c')
+            pos_idx1 = rearrange(pos_all[:,3,:,:,:], 'b h w c -> b (h w) c')
+            pos_idx2 = rearrange(pos_all[:,4,:,:,:], 'b h w c -> b (h w) c')
+            pos_idx3 = rearrange(pos_all[:,7,:,:,:], 'b h w c -> b (h w) c')
+        
+        pos_center = rearrange(rearrange(pos_all[:,[1,2,5,6],:,:,:], 'b (ph pw) h w c -> b (ph h) (pw w) c', ph=2, pw=2), 'b h w c -> b (h w) c')
+
+        hidden_states = (hidden_states + pos_center).to(dtype=hidden_states.dtype)
+
+
+        if no_cache:
+
+            pos_repeat_list = []
+            for ref_b_idx in range(b_list):
+                pos_idx0_repeat = None
+                pos_idx1_repeat = None
+                pos_idx2_repeat = None
+                pos_idx3_repeat = None
+                cur_n_ref_list = n_ref_lists[ref_b_idx]
+                if cur_n_ref_list[0] > 0:
+                    pos_idx0_repeat = pos_idx0.unsqueeze(0).repeat(1, cur_n_ref_list[0], 1, 1)
+                if cur_n_ref_list[1] > 0:
+                    pos_idx1_repeat = pos_idx1.unsqueeze(0).repeat(1, cur_n_ref_list[1], 1, 1)
+                if cur_n_ref_list[2] > 0:
+                    pos_idx2_repeat = pos_idx2.unsqueeze(0).repeat(1, cur_n_ref_list[2], 1, 1)
+                if cur_n_ref_list[3] > 0:
+                    pos_idx3_repeat = pos_idx3.unsqueeze(0).repeat(1, cur_n_ref_list[3], 1, 1)
+
+                pos_repeat = []
+                if cur_n_ref_list[0] > 0:
+                    pos_repeat.append(pos_idx0_repeat)
+                if cur_n_ref_list[1] > 0:
+                    pos_repeat.append(pos_idx1_repeat)
+                if cur_n_ref_list[2] > 0:
+                    pos_repeat.append(pos_idx2_repeat)
+                if cur_n_ref_list[3] > 0:
+                    pos_repeat.append(pos_idx3_repeat)
+
+                pos_repeat = torch.cat(pos_repeat, dim=1)
+                pos_repeat_list.append(pos_repeat)
+
+        
+            pos_repeat_all = torch.cat(pos_repeat_list, dim=0)
+        
+            ref_hidden_states = (ref_hidden_states + pos_repeat_all).to(dtype=hidden_states.dtype)
+
+            
+            timestep_ref, _ = self.adaln_single(
+                timestep*0, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+            )
+            # print('timestep_ref',timestep*0, timestep)
+
+        timestep, embedded_timestep = self.adaln_single(
+            timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+        )
+
+        
+
+        if self.caption_projection is not None:
+            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+        # 2. Blocks
+        # add
+        control_idx = 0
+        cache_idx = 0
+
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+                if no_cache:
+                    hidden_states, ref_hidden_states, K_sample, V_sample = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(block),
+                        hidden_states,
+                        ref_hidden_states,
+                        attention_mask,
+                        encoder_hidden_states,
+                        encoder_attention_mask,
+                        timestep,
+                        cross_attention_kwargs,
+                        None,
+                        K_cache = None,
+                        V_cache = None,
+                        timestep_ref = timestep_ref,
+                        **ckpt_kwargs,
+                    )
+                    K_cache_cur.append(K_sample)
+                    V_cache_cur.append(V_sample)
+                else:
+                    hidden_states, _, _, _ = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(block),
+                        hidden_states,
+                        None,
+                        attention_mask,
+                        encoder_hidden_states,
+                        encoder_attention_mask,
+                        timestep,
+                        cross_attention_kwargs,
+                        None,
+                        K_cache = K_cache[cache_idx],
+                        V_cache = V_cache[cache_idx],
+                        timestep_ref = None,
+                        **ckpt_kwargs,
+                    )
+            else:
+                if no_cache:
+                    hidden_states, ref_hidden_states, K_sample, V_sample = block(
+                        hidden_states,
+                        ref_hidden_states,
+                        attention_mask=attention_mask,
+                        encoder_hidden_states=encoder_hidden_states,
+                        encoder_attention_mask=encoder_attention_mask,
+                        timestep=timestep,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        class_labels=None,
+                        K_cache = None,
+                        V_cache = None,
+                        timestep_ref = timestep_ref,
+                    )
+                    K_cache_cur.append(K_sample)
+                    V_cache_cur.append(V_sample)
+                else:
+                    hidden_states, _, _, _ = block(
+                        hidden_states,
+                        None,
+                        attention_mask=attention_mask,
+                        encoder_hidden_states=encoder_hidden_states,
+                        encoder_attention_mask=encoder_attention_mask,
+                        timestep=timestep,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        K_cache = K_cache[cache_idx],
+                        V_cache = V_cache[cache_idx],
+                        timestep_ref = None,
+                    )
+
+                
+            if control_list!=None and control_idx<len(control_list):
+                hidden_states += control_list[control_idx][0]
+                
+                control_idx+=1
+
+            cache_idx+=1
+                
+        # 3. Output
+        shift, scale = (
+            self.scale_shift_table[None] + embedded_timestep[:, None].to(self.scale_shift_table.device)
+        ).chunk(2, dim=1)
+        hidden_states = self.norm_out(hidden_states)
+
+        hidden_states = hidden_states * (1 + scale.to(hidden_states.device)) + shift.to(hidden_states.device)
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = hidden_states.squeeze(1)
+
+        hidden_states = hidden_states.reshape(
+            shape=(-1, height, width, self.config.patch_size, self.config.patch_size, self.out_channels)
+        )
+        # print('hidden_states.shape', hidden_states.shape)
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(
+            shape=(-1, self.out_channels, height * self.config.patch_size, width * self.config.patch_size)
+        )
+
+        if no_cache:
+            return (output, K_cache_cur, V_cache_cur)
+        else:
+            return (output, None, None)
+
+
diff --git a/diffusers/src/diffusers/models/transformers/causal_sparse_dit_control.py b/diffusers/src/diffusers/models/transformers/causal_sparse_dit_control.py
new file mode 100644
index 0000000000000000000000000000000000000000..50f4ae55377080f0ba694a0c6bbc9ec28df7f2f9
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/causal_sparse_dit_control.py
@@ -0,0 +1,484 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+try:
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
+except:
+    print('torch_npu not found')
+from typing import Any, Dict, Optional, Union
+
+import torch
+from torch import nn
+from typing import List
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import is_torch_version, logging
+from ..attention import BasicTransformerBlock, CausalTransformerBlock
+from ..attention_processor import Attention, AttentionProcessor, AttnProcessor, FusedAttnProcessor2_0
+from ..embeddings import PatchEmbed, PixArtAlphaTextProjection, CausalPatchEmbed
+from ..modeling_outputs import Transformer2DModelOutput, CausalTransformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormSingle
+from ...loaders import PeftAdapterMixin
+from einops import rearrange
+from torch.nn import Module, Linear, init
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class CausalSparseDiTControlModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+    r"""
+    A 2D Transformer model as introduced in PixArt family of models (https://arxiv.org/abs/2310.00426,
+    https://arxiv.org/abs/2403.04692).
+
+    Parameters:
+        num_attention_heads (int, optional, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (int, optional, defaults to 72): The number of channels in each head.
+        in_channels (int, defaults to 4): The number of channels in the input.
+        out_channels (int, optional):
+            The number of channels in the output. Specify this parameter if the output channel number differs from the
+            input.
+        num_layers (int, optional, defaults to 28): The number of layers of Transformer blocks to use.
+        dropout (float, optional, defaults to 0.0): The dropout probability to use within the Transformer blocks.
+        norm_num_groups (int, optional, defaults to 32):
+            Number of groups for group normalization within Transformer blocks.
+        cross_attention_dim (int, optional):
+            The dimensionality for cross-attention layers, typically matching the encoder's hidden dimension.
+        attention_bias (bool, optional, defaults to True):
+            Configure if the Transformer blocks' attention should contain a bias parameter.
+        sample_size (int, defaults to 128):
+            The width of the latent images. This parameter is fixed during training.
+        patch_size (int, defaults to 2):
+            Size of the patches the model processes, relevant for architectures working on non-sequential data.
+        activation_fn (str, optional, defaults to "gelu-approximate"):
+            Activation function to use in feed-forward networks within Transformer blocks.
+        num_embeds_ada_norm (int, optional, defaults to 1000):
+            Number of embeddings for AdaLayerNorm, fixed during training and affects the maximum denoising steps during
+            inference.
+        upcast_attention (bool, optional, defaults to False):
+            If true, upcasts the attention mechanism dimensions for potentially improved performance.
+        norm_type (str, optional, defaults to "ada_norm_zero"):
+            Specifies the type of normalization used, can be 'ada_norm_zero'.
+        norm_elementwise_affine (bool, optional, defaults to False):
+            If true, enables element-wise affine parameters in the normalization layers.
+        norm_eps (float, optional, defaults to 1e-6):
+            A small constant added to the denominator in normalization layers to prevent division by zero.
+        interpolation_scale (int, optional): Scale factor to use during interpolating the position embeddings.
+        use_additional_conditions (bool, optional): If we're using additional conditions as inputs.
+        attention_type (str, optional, defaults to "default"): Kind of attention mechanism to be used.
+        caption_channels (int, optional, defaults to None):
+            Number of channels to use for projecting the caption embeddings.
+        use_linear_projection (bool, optional, defaults to False):
+            Deprecated argument. Will be removed in a future version.
+        num_vector_embeds (bool, optional, defaults to False):
+            Deprecated argument. Will be removed in a future version.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["CausalTransformerBlock", "CausalPatchEmbed"]
+# {
+#   "_class_name": "Transformer2DModel",
+#   "_diffusers_version": "0.22.0.dev0",
+#   "activation_fn": "gelu-approximate",
+#   "attention_bias": true,
+#   "attention_head_dim": 72,
+#   "attention_type": "default",
+#   "caption_channels": 4096,
+#   "cross_attention_dim": 1152,
+#   "double_self_attention": false,
+#   "dropout": 0.0,
+#   "in_channels": 4,
+#   "norm_elementwise_affine": false,
+#   "norm_eps": 1e-06,
+#   "norm_num_groups": 32,
+#   "norm_type": "ada_norm_single",
+#   "num_attention_heads": 16,
+#   "num_embeds_ada_norm": 1000,
+#   "num_layers": 28,
+#   "num_vector_embeds": null,
+#   "only_cross_attention": false,
+#   "out_channels": 8,
+#   "patch_size": 2,
+#   "sample_size": 128,
+#   "upcast_attention": false,
+#   "use_linear_projection": false
+# }
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 72,
+        in_channels: int = 4,
+        cond_chanels: int = 9,
+        out_channels: Optional[int] = 8,
+        num_layers: int = 28,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = 1152,
+        attention_bias: bool = True,
+        sample_size: int = 128,
+        patch_size: int = 2,
+        activation_fn: str = "gelu-approximate",
+        num_embeds_ada_norm: Optional[int] = 1000,
+        upcast_attention: bool = False,
+        norm_type: str = "ada_norm_single",
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-6,
+        interpolation_scale: Optional[int] = None,
+        use_additional_conditions: Optional[bool] = None,
+        caption_channels: Optional[int] = None,
+        attention_type: Optional[str] = "default",
+    ):
+        super().__init__()
+
+        # Validate inputs.
+        if norm_type != "ada_norm_single":
+            raise NotImplementedError(
+                f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+            )
+        elif norm_type == "ada_norm_single" and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+            )
+
+        # Set some common variables used across the board.
+        self.attention_head_dim = attention_head_dim
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+        self.out_channels = in_channels if out_channels is None else out_channels
+        if use_additional_conditions is None:
+            if sample_size == 128:
+                use_additional_conditions = True
+            else:
+                use_additional_conditions = False
+        self.use_additional_conditions = use_additional_conditions
+        # print(f"----------------self.use_additional_conditions: {self.use_additional_conditions} {sample_size}")
+
+        self.gradient_checkpointing = False
+
+        # 2. Initialize the position embedding and transformer blocks.
+        self.height = self.config.sample_size
+        self.width = self.config.sample_size
+
+        interpolation_scale = (
+            self.config.interpolation_scale
+            if self.config.interpolation_scale is not None
+            else max(self.config.sample_size // 64, 1)
+        )
+        self.pos_embed = CausalPatchEmbed(
+            height=self.config.sample_size,
+            width=self.config.sample_size,
+            patch_size=self.config.patch_size,
+            in_channels=self.config.in_channels + cond_chanels,
+            embed_dim=self.inner_dim,
+            interpolation_scale=interpolation_scale,
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    self.inner_dim,
+                    self.config.num_attention_heads,
+                    self.config.attention_head_dim,
+                    dropout=self.config.dropout,
+                    cross_attention_dim=None,
+                    activation_fn=self.config.activation_fn,
+                    num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+                    attention_bias=self.config.attention_bias,
+                    upcast_attention=self.config.upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=self.config.norm_elementwise_affine,
+                    norm_eps=self.config.norm_eps,
+                    attention_type=self.config.attention_type,
+                )
+                for _ in range(self.config.num_layers//2)
+            ]
+        )
+
+        # add adapter
+        adapter_list = []
+        for _ in range(self.config.num_layers//2):
+            self.after_proj = Linear(self.inner_dim, self.inner_dim) 
+            init.zeros_(self.after_proj.weight)
+            init.zeros_(self.after_proj.bias)
+            adapter_list.append(self.after_proj)
+
+        self.adapter_linear = nn.ModuleList(adapter_list)
+
+        self.adaln_single = AdaLayerNormSingle(
+            self.inner_dim, use_additional_conditions=self.use_additional_conditions
+        )
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+
+        Safe to just use `AttnProcessor()` as PixArt doesn't have any exotic attention processors in default model.
+        """
+        self.set_attn_processor(AttnProcessor())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor, # (b, c, h, w)
+        # ref_hidden_states: torch.Tensor, # (b, n_ref, c, h, w)
+        # n_ref_lists: List[List[int]],
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Dict[str, torch.Tensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ):
+        """
+        The [`PixArtTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep (`torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            added_cond_kwargs: (`Dict[str, Any]`, *optional*): Additional conditions to be used as inputs.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if self.use_additional_conditions and added_cond_kwargs is None:
+            raise ValueError("`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`.")
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 1. Input
+        batch_size = hidden_states.shape[0]
+        height, width = (
+            hidden_states.shape[-2] // self.config.patch_size,
+            hidden_states.shape[-1] // self.config.patch_size,
+        )
+
+        # print('hidden_states',hidden_states.shape)
+
+        hidden_states = self.pos_embed(hidden_states) # (b, l, c)
+        # print('hidden_states',hidden_states.shape)
+
+        pos_all = self.pos_embed.get_pos_embed(height, width).to(hidden_states.device)
+        # print('pos_all',pos_all.shape)
+
+
+        pos_all = rearrange(pos_all, 'b (h w) c -> b h w c', h=height, w=width*2)
+        pos_all = rearrange(pos_all, 'b (ph h) (pw w) c -> b (ph pw) h w c', ph=2, pw=4)
+
+        pos_center = rearrange(rearrange(pos_all[:,[1,2,5,6],:,:,:], 'b (ph pw) h w c -> b (ph h) (pw w) c', ph=2, pw=2), 'b h w c -> b (h w) c')
+
+        hidden_states = (hidden_states + pos_center).to(dtype=hidden_states.dtype)
+
+
+
+        timestep, embedded_timestep = self.adaln_single(
+            timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+        )
+
+        # if self.caption_projection is not None:
+        #     encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+        #     encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+        # 2. Blocks
+        # add
+        control_idx = 0
+
+        output_list = []
+        for block, zero_adapter in zip(self.transformer_blocks, self.adapter_linear):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    cross_attention_kwargs,
+                    None,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=None,
+                )
+            output_list.append([zero_adapter(hidden_states)])
+        
+        return (output_list,)
\ No newline at end of file
diff --git a/diffusers/src/diffusers/models/transformers/cogvideox_transformer_3d.py b/diffusers/src/diffusers/models/transformers/cogvideox_transformer_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..821da6d032d59ef16833a49b04acd6a40063a0de
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/cogvideox_transformer_3d.py
@@ -0,0 +1,507 @@
+# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import PeftAdapterMixin
+from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import maybe_allow_in_graph
+from ..attention import Attention, FeedForward
+from ..attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
+from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNorm, CogVideoXLayerNormZero
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@maybe_allow_in_graph
+class CogVideoXBlock(nn.Module):
+    r"""
+    Transformer block used in [CogVideoX](https://github.com/THUDM/CogVideo) model.
+
+    Parameters:
+        dim (`int`):
+            The number of channels in the input and output.
+        num_attention_heads (`int`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`):
+            The number of channels in each head.
+        time_embed_dim (`int`):
+            The number of channels in timestep embedding.
+        dropout (`float`, defaults to `0.0`):
+            The dropout probability to use.
+        activation_fn (`str`, defaults to `"gelu-approximate"`):
+            Activation function to be used in feed-forward.
+        attention_bias (`bool`, defaults to `False`):
+            Whether or not to use bias in attention projection layers.
+        qk_norm (`bool`, defaults to `True`):
+            Whether or not to use normalization after query and key projections in Attention.
+        norm_elementwise_affine (`bool`, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_eps (`float`, defaults to `1e-5`):
+            Epsilon value for normalization layers.
+        final_dropout (`bool` defaults to `False`):
+            Whether to apply a final dropout after the last feed-forward layer.
+        ff_inner_dim (`int`, *optional*, defaults to `None`):
+            Custom hidden dimension of Feed-forward layer. If not provided, `4 * dim` is used.
+        ff_bias (`bool`, defaults to `True`):
+            Whether or not to use bias in Feed-forward layer.
+        attention_out_bias (`bool`, defaults to `True`):
+            Whether or not to use bias in Attention output projection layer.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        time_embed_dim: int,
+        dropout: float = 0.0,
+        activation_fn: str = "gelu-approximate",
+        attention_bias: bool = False,
+        qk_norm: bool = True,
+        norm_elementwise_affine: bool = True,
+        norm_eps: float = 1e-5,
+        final_dropout: bool = True,
+        ff_inner_dim: Optional[int] = None,
+        ff_bias: bool = True,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+
+        # 1. Self Attention
+        self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
+
+        self.attn1 = Attention(
+            query_dim=dim,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            qk_norm="layer_norm" if qk_norm else None,
+            eps=1e-6,
+            bias=attention_bias,
+            out_bias=attention_out_bias,
+            processor=CogVideoXAttnProcessor2_0(),
+        )
+
+        # 2. Feed Forward
+        self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
+
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+            inner_dim=ff_inner_dim,
+            bias=ff_bias,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        text_seq_length = encoder_hidden_states.size(1)
+
+        # norm & modulate
+        norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1(
+            hidden_states, encoder_hidden_states, temb
+        )
+
+        # attention
+        attn_hidden_states, attn_encoder_hidden_states = self.attn1(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            image_rotary_emb=image_rotary_emb,
+        )
+
+        hidden_states = hidden_states + gate_msa * attn_hidden_states
+        encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states
+
+        # norm & modulate
+        norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2(
+            hidden_states, encoder_hidden_states, temb
+        )
+
+        # feed-forward
+        norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
+        ff_output = self.ff(norm_hidden_states)
+
+        hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
+        encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
+
+        return hidden_states, encoder_hidden_states
+
+
+class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+    """
+    A Transformer model for video-like data in [CogVideoX](https://github.com/THUDM/CogVideo).
+
+    Parameters:
+        num_attention_heads (`int`, defaults to `30`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, defaults to `64`):
+            The number of channels in each head.
+        in_channels (`int`, defaults to `16`):
+            The number of channels in the input.
+        out_channels (`int`, *optional*, defaults to `16`):
+            The number of channels in the output.
+        flip_sin_to_cos (`bool`, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        time_embed_dim (`int`, defaults to `512`):
+            Output dimension of timestep embeddings.
+        text_embed_dim (`int`, defaults to `4096`):
+            Input dimension of text embeddings from the text encoder.
+        num_layers (`int`, defaults to `30`):
+            The number of layers of Transformer blocks to use.
+        dropout (`float`, defaults to `0.0`):
+            The dropout probability to use.
+        attention_bias (`bool`, defaults to `True`):
+            Whether or not to use bias in the attention projection layers.
+        sample_width (`int`, defaults to `90`):
+            The width of the input latents.
+        sample_height (`int`, defaults to `60`):
+            The height of the input latents.
+        sample_frames (`int`, defaults to `49`):
+            The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49
+            instead of 13 because CogVideoX processed 13 latent frames at once in its default and recommended settings,
+            but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with
+            K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1).
+        patch_size (`int`, defaults to `2`):
+            The size of the patches to use in the patch embedding layer.
+        temporal_compression_ratio (`int`, defaults to `4`):
+            The compression ratio across the temporal dimension. See documentation for `sample_frames`.
+        max_text_seq_length (`int`, defaults to `226`):
+            The maximum sequence length of the input text embeddings.
+        activation_fn (`str`, defaults to `"gelu-approximate"`):
+            Activation function to use in feed-forward.
+        timestep_activation_fn (`str`, defaults to `"silu"`):
+            Activation function to use when generating the timestep embeddings.
+        norm_elementwise_affine (`bool`, defaults to `True`):
+            Whether or not to use elementwise affine in normalization layers.
+        norm_eps (`float`, defaults to `1e-5`):
+            The epsilon value to use in normalization layers.
+        spatial_interpolation_scale (`float`, defaults to `1.875`):
+            Scaling factor to apply in 3D positional embeddings across spatial dimensions.
+        temporal_interpolation_scale (`float`, defaults to `1.0`):
+            Scaling factor to apply in 3D positional embeddings across temporal dimensions.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 30,
+        attention_head_dim: int = 64,
+        in_channels: int = 16,
+        out_channels: Optional[int] = 16,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        time_embed_dim: int = 512,
+        text_embed_dim: int = 4096,
+        num_layers: int = 30,
+        dropout: float = 0.0,
+        attention_bias: bool = True,
+        sample_width: int = 90,
+        sample_height: int = 60,
+        sample_frames: int = 49,
+        patch_size: int = 2,
+        temporal_compression_ratio: int = 4,
+        max_text_seq_length: int = 226,
+        activation_fn: str = "gelu-approximate",
+        timestep_activation_fn: str = "silu",
+        norm_elementwise_affine: bool = True,
+        norm_eps: float = 1e-5,
+        spatial_interpolation_scale: float = 1.875,
+        temporal_interpolation_scale: float = 1.0,
+        use_rotary_positional_embeddings: bool = False,
+        use_learned_positional_embeddings: bool = False,
+    ):
+        super().__init__()
+        inner_dim = num_attention_heads * attention_head_dim
+
+        if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
+            raise ValueError(
+                "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
+                "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
+                "issue at https://github.com/huggingface/diffusers/issues."
+            )
+
+        # 1. Patch embedding
+        self.patch_embed = CogVideoXPatchEmbed(
+            patch_size=patch_size,
+            in_channels=in_channels,
+            embed_dim=inner_dim,
+            text_embed_dim=text_embed_dim,
+            bias=True,
+            sample_width=sample_width,
+            sample_height=sample_height,
+            sample_frames=sample_frames,
+            temporal_compression_ratio=temporal_compression_ratio,
+            max_text_seq_length=max_text_seq_length,
+            spatial_interpolation_scale=spatial_interpolation_scale,
+            temporal_interpolation_scale=temporal_interpolation_scale,
+            use_positional_embeddings=not use_rotary_positional_embeddings,
+            use_learned_positional_embeddings=use_learned_positional_embeddings,
+        )
+        self.embedding_dropout = nn.Dropout(dropout)
+
+        # 2. Time embeddings
+        self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
+        self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
+
+        # 3. Define spatio-temporal transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                CogVideoXBlock(
+                    dim=inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    time_embed_dim=time_embed_dim,
+                    dropout=dropout,
+                    activation_fn=activation_fn,
+                    attention_bias=attention_bias,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    norm_eps=norm_eps,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        self.norm_final = nn.LayerNorm(inner_dim, norm_eps, norm_elementwise_affine)
+
+        # 4. Output blocks
+        self.norm_out = AdaLayerNorm(
+            embedding_dim=time_embed_dim,
+            output_dim=2 * inner_dim,
+            norm_elementwise_affine=norm_elementwise_affine,
+            norm_eps=norm_eps,
+            chunk_dim=1,
+        )
+        self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)
+
+        self.gradient_checkpointing = False
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        self.gradient_checkpointing = value
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedCogVideoXAttnProcessor2_0
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedCogVideoXAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        timestep: Union[int, float, torch.LongTensor],
+        timestep_cond: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ):
+        if attention_kwargs is not None:
+            attention_kwargs = attention_kwargs.copy()
+            lora_scale = attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        batch_size, num_frames, channels, height, width = hidden_states.shape
+
+        # 1. Time embedding
+        timesteps = timestep
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=hidden_states.dtype)
+        emb = self.time_embedding(t_emb, timestep_cond)
+
+        # 2. Patch embedding
+        hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
+        hidden_states = self.embedding_dropout(hidden_states)
+
+        text_seq_length = encoder_hidden_states.shape[1]
+        encoder_hidden_states = hidden_states[:, :text_seq_length]
+        hidden_states = hidden_states[:, text_seq_length:]
+
+        # 3. Transformer blocks
+        for i, block in enumerate(self.transformer_blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    emb,
+                    image_rotary_emb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states, encoder_hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=emb,
+                    image_rotary_emb=image_rotary_emb,
+                )
+
+        if not self.config.use_rotary_positional_embeddings:
+            # CogVideoX-2B
+            hidden_states = self.norm_final(hidden_states)
+        else:
+            # CogVideoX-5B
+            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+            hidden_states = self.norm_final(hidden_states)
+            hidden_states = hidden_states[:, text_seq_length:]
+
+        # 4. Final block
+        hidden_states = self.norm_out(hidden_states, temb=emb)
+        hidden_states = self.proj_out(hidden_states)
+
+        # 5. Unpatchify
+        # Note: we use `-1` instead of `channels`:
+        #   - It is okay to `channels` use for CogVideoX-2b and CogVideoX-5b (number of input channels is equal to output channels)
+        #   - However, for CogVideoX-5b-I2V also takes concatenated input image latents (number of input channels is twice the output channels)
+        p = self.config.patch_size
+        output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
+        output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (output,)
+        return Transformer2DModelOutput(sample=output)
diff --git a/diffusers/src/diffusers/models/transformers/colorguider_pixart.py b/diffusers/src/diffusers/models/transformers/colorguider_pixart.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fd8f8a4369c8a71d36155cfb97ceb4f6f20ad8c
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/colorguider_pixart.py
@@ -0,0 +1,439 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional, Union
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import is_torch_version, logging
+from ..attention import BasicTransformerBlock
+from ..attention_processor import Attention, AttentionProcessor, AttnProcessor, FusedAttnProcessor2_0
+from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormSingle
+from torch.nn import Module, Linear, init
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ColorGuiderPixArtModel(ModelMixin, ConfigMixin):
+    r"""
+    A 2D Transformer model as introduced in PixArt family of models (https://arxiv.org/abs/2310.00426,
+    https://arxiv.org/abs/2403.04692).
+
+    Parameters:
+        num_attention_heads (int, optional, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (int, optional, defaults to 72): The number of channels in each head.
+        in_channels (int, defaults to 4): The number of channels in the input.
+        out_channels (int, optional):
+            The number of channels in the output. Specify this parameter if the output channel number differs from the
+            input.
+        num_layers (int, optional, defaults to 28): The number of layers of Transformer blocks to use.
+        dropout (float, optional, defaults to 0.0): The dropout probability to use within the Transformer blocks.
+        norm_num_groups (int, optional, defaults to 32):
+            Number of groups for group normalization within Transformer blocks.
+        cross_attention_dim (int, optional):
+            The dimensionality for cross-attention layers, typically matching the encoder's hidden dimension.
+        attention_bias (bool, optional, defaults to True):
+            Configure if the Transformer blocks' attention should contain a bias parameter.
+        sample_size (int, defaults to 128):
+            The width of the latent images. This parameter is fixed during training.
+        patch_size (int, defaults to 2):
+            Size of the patches the model processes, relevant for architectures working on non-sequential data.
+        activation_fn (str, optional, defaults to "gelu-approximate"):
+            Activation function to use in feed-forward networks within Transformer blocks.
+        num_embeds_ada_norm (int, optional, defaults to 1000):
+            Number of embeddings for AdaLayerNorm, fixed during training and affects the maximum denoising steps during
+            inference.
+        upcast_attention (bool, optional, defaults to False):
+            If true, upcasts the attention mechanism dimensions for potentially improved performance.
+        norm_type (str, optional, defaults to "ada_norm_zero"):
+            Specifies the type of normalization used, can be 'ada_norm_zero'.
+        norm_elementwise_affine (bool, optional, defaults to False):
+            If true, enables element-wise affine parameters in the normalization layers.
+        norm_eps (float, optional, defaults to 1e-6):
+            A small constant added to the denominator in normalization layers to prevent division by zero.
+        interpolation_scale (int, optional): Scale factor to use during interpolating the position embeddings.
+        use_additional_conditions (bool, optional): If we're using additional conditions as inputs.
+        attention_type (str, optional, defaults to "default"): Kind of attention mechanism to be used.
+        caption_channels (int, optional, defaults to None):
+            Number of channels to use for projecting the caption embeddings.
+        use_linear_projection (bool, optional, defaults to False):
+            Deprecated argument. Will be removed in a future version.
+        num_vector_embeds (bool, optional, defaults to False):
+            Deprecated argument. Will be removed in a future version.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["BasicTransformerBlock", "PatchEmbed"]
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 72,
+        in_channels: int = 4,
+        cond_channels: int = 5,
+        out_channels: Optional[int] = 8,
+        num_layers: int = 28,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = 1152,
+        attention_bias: bool = True,
+        sample_size: int = 128,
+        patch_size: int = 2,
+        activation_fn: str = "gelu-approximate",
+        num_embeds_ada_norm: Optional[int] = 1000,
+        upcast_attention: bool = False,
+        norm_type: str = "ada_norm_single",
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-6,
+        interpolation_scale: Optional[int] = None,
+        use_additional_conditions: Optional[bool] = None,
+        caption_channels: Optional[int] = None,
+        attention_type: Optional[str] = "default",
+    ):
+        super().__init__()
+
+        # Validate inputs.
+        if norm_type != "ada_norm_single":
+            raise NotImplementedError(
+                f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+            )
+        elif norm_type == "ada_norm_single" and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+            )
+
+        # Set some common variables used across the board.
+        self.attention_head_dim = attention_head_dim
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+        self.out_channels = in_channels if out_channels is None else out_channels
+        if use_additional_conditions is None:
+            if sample_size == 128:
+                use_additional_conditions = True
+            else:
+                use_additional_conditions = False
+        self.use_additional_conditions = use_additional_conditions
+
+        self.gradient_checkpointing = False
+
+        # 2. Initialize the position embedding and transformer blocks.
+        self.height = self.config.sample_size
+        self.width = self.config.sample_size
+
+        interpolation_scale = (
+            self.config.interpolation_scale
+            if self.config.interpolation_scale is not None
+            else max(self.config.sample_size // 64, 1)
+        )
+        self.pos_embed = PatchEmbed(
+            height=self.config.sample_size,
+            width=self.config.sample_size,
+            patch_size=self.config.patch_size,
+            in_channels=self.config.in_channels + cond_channels,
+            embed_dim=self.inner_dim,
+            interpolation_scale=interpolation_scale,
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    self.inner_dim,
+                    self.config.num_attention_heads,
+                    self.config.attention_head_dim,
+                    dropout=self.config.dropout,
+                    cross_attention_dim=None,
+                    activation_fn=self.config.activation_fn,
+                    num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+                    attention_bias=self.config.attention_bias,
+                    upcast_attention=self.config.upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=self.config.norm_elementwise_affine,
+                    norm_eps=self.config.norm_eps,
+                    attention_type=self.config.attention_type,
+                )
+                for _ in range(self.config.num_layers//2)
+            ]
+        )
+
+        # add adapter
+        adapter_list = []
+        for _ in range(self.config.num_layers//2):
+            self.after_proj = Linear(self.inner_dim, self.inner_dim) 
+            init.zeros_(self.after_proj.weight)
+            init.zeros_(self.after_proj.bias)
+            adapter_list.append(self.after_proj)
+
+        self.adapter_linear = nn.ModuleList(adapter_list)
+
+        # # 3. Output blocks.
+        # self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+        # self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim**0.5)
+        # self.proj_out = nn.Linear(self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels)
+
+        self.adaln_single = AdaLayerNormSingle(
+            self.inner_dim, use_additional_conditions=self.use_additional_conditions
+        )
+        # self.caption_projection = None
+        # if self.config.caption_channels is not None:
+        #     self.caption_projection = PixArtAlphaTextProjection(
+        #         in_features=self.config.caption_channels, hidden_size=self.inner_dim
+        #     )
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+
+        Safe to just use `AttnProcessor()` as PixArt doesn't have any exotic attention processors in default model.
+        """
+        self.set_attn_processor(AttnProcessor())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Dict[str, torch.Tensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ):
+        """
+        The [`PixArtTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep (`torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            added_cond_kwargs: (`Dict[str, Any]`, *optional*): Additional conditions to be used as inputs.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if self.use_additional_conditions and added_cond_kwargs is None:
+            raise ValueError("`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`.")
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 1. Input
+        batch_size = hidden_states.shape[0]
+        height, width = (
+            hidden_states.shape[-2] // self.config.patch_size,
+            hidden_states.shape[-1] // self.config.patch_size,
+        )
+        hidden_states = self.pos_embed(hidden_states)
+
+        timestep, embedded_timestep = self.adaln_single(
+            timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+        )
+
+        # if self.caption_projection is not None:
+        #     encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+        #     encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+        # 2. Blocks
+        # for block in self.transformer_blocks:
+        output_list = []
+        for block, zero_adapter in zip(self.transformer_blocks, self.adapter_linear):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    cross_attention_kwargs,
+                    None,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=None,
+                )
+            output_list.append(zero_adapter(hidden_states))
+
+
+
+        return (output_list,)
diff --git a/diffusers/src/diffusers/models/transformers/dit_transformer_2d.py b/diffusers/src/diffusers/models/transformers/dit_transformer_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f8957737dbc7b91166ec89d9a99f2ca73bdeeee
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/dit_transformer_2d.py
@@ -0,0 +1,240 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import is_torch_version, logging
+from ..attention import BasicTransformerBlock
+from ..embeddings import PatchEmbed
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class DiTTransformer2DModel(ModelMixin, ConfigMixin):
+    r"""
+    A 2D Transformer model as introduced in DiT (https://arxiv.org/abs/2212.09748).
+
+    Parameters:
+        num_attention_heads (int, optional, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (int, optional, defaults to 72): The number of channels in each head.
+        in_channels (int, defaults to 4): The number of channels in the input.
+        out_channels (int, optional):
+            The number of channels in the output. Specify this parameter if the output channel number differs from the
+            input.
+        num_layers (int, optional, defaults to 28): The number of layers of Transformer blocks to use.
+        dropout (float, optional, defaults to 0.0): The dropout probability to use within the Transformer blocks.
+        norm_num_groups (int, optional, defaults to 32):
+            Number of groups for group normalization within Transformer blocks.
+        attention_bias (bool, optional, defaults to True):
+            Configure if the Transformer blocks' attention should contain a bias parameter.
+        sample_size (int, defaults to 32):
+            The width of the latent images. This parameter is fixed during training.
+        patch_size (int, defaults to 2):
+            Size of the patches the model processes, relevant for architectures working on non-sequential data.
+        activation_fn (str, optional, defaults to "gelu-approximate"):
+            Activation function to use in feed-forward networks within Transformer blocks.
+        num_embeds_ada_norm (int, optional, defaults to 1000):
+            Number of embeddings for AdaLayerNorm, fixed during training and affects the maximum denoising steps during
+            inference.
+        upcast_attention (bool, optional, defaults to False):
+            If true, upcasts the attention mechanism dimensions for potentially improved performance.
+        norm_type (str, optional, defaults to "ada_norm_zero"):
+            Specifies the type of normalization used, can be 'ada_norm_zero'.
+        norm_elementwise_affine (bool, optional, defaults to False):
+            If true, enables element-wise affine parameters in the normalization layers.
+        norm_eps (float, optional, defaults to 1e-5):
+            A small constant added to the denominator in normalization layers to prevent division by zero.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 72,
+        in_channels: int = 4,
+        out_channels: Optional[int] = None,
+        num_layers: int = 28,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        attention_bias: bool = True,
+        sample_size: int = 32,
+        patch_size: int = 2,
+        activation_fn: str = "gelu-approximate",
+        num_embeds_ada_norm: Optional[int] = 1000,
+        upcast_attention: bool = False,
+        norm_type: str = "ada_norm_zero",
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-5,
+    ):
+        super().__init__()
+
+        # Validate inputs.
+        if norm_type != "ada_norm_zero":
+            raise NotImplementedError(
+                f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+            )
+        elif norm_type == "ada_norm_zero" and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+            )
+
+        # Set some common variables used across the board.
+        self.attention_head_dim = attention_head_dim
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.gradient_checkpointing = False
+
+        # 2. Initialize the position embedding and transformer blocks.
+        self.height = self.config.sample_size
+        self.width = self.config.sample_size
+
+        self.patch_size = self.config.patch_size
+        self.pos_embed = PatchEmbed(
+            height=self.config.sample_size,
+            width=self.config.sample_size,
+            patch_size=self.config.patch_size,
+            in_channels=self.config.in_channels,
+            embed_dim=self.inner_dim,
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    self.inner_dim,
+                    self.config.num_attention_heads,
+                    self.config.attention_head_dim,
+                    dropout=self.config.dropout,
+                    activation_fn=self.config.activation_fn,
+                    num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+                    attention_bias=self.config.attention_bias,
+                    upcast_attention=self.config.upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=self.config.norm_elementwise_affine,
+                    norm_eps=self.config.norm_eps,
+                )
+                for _ in range(self.config.num_layers)
+            ]
+        )
+
+        # 3. Output blocks.
+        self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)
+        self.proj_out_2 = nn.Linear(
+            self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels
+        )
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: Optional[torch.LongTensor] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        return_dict: bool = True,
+    ):
+        """
+        The [`DiTTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
+                Input `hidden_states`.
+            timestep ( `torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                `AdaLayerZeroNorm`.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        # 1. Input
+        height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
+        hidden_states = self.pos_embed(hidden_states)
+
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    None,
+                    None,
+                    None,
+                    timestep,
+                    cross_attention_kwargs,
+                    class_labels,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask=None,
+                    encoder_hidden_states=None,
+                    encoder_attention_mask=None,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=class_labels,
+                )
+
+        # 3. Output
+        conditioning = self.transformer_blocks[0].norm1.emb(timestep, class_labels, hidden_dtype=hidden_states.dtype)
+        shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+        hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+        hidden_states = self.proj_out_2(hidden_states)
+
+        # unpatchify
+        height = width = int(hidden_states.shape[1] ** 0.5)
+        hidden_states = hidden_states.reshape(
+            shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
+        )
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(
+            shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
+        )
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/diffusers/src/diffusers/models/transformers/dual_transformer_2d.py b/diffusers/src/diffusers/models/transformers/dual_transformer_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c48c4e3db79dbff1ef7dd357fa53446c828af58
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/dual_transformer_2d.py
@@ -0,0 +1,156 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+
+from torch import nn
+
+from ..modeling_outputs import Transformer2DModelOutput
+from .transformer_2d import Transformer2DModel
+
+
+class DualTransformer2DModel(nn.Module):
+    """
+    Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            Pass if the input is continuous. The number of channels in the input and output.
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
+        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
+            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
+            `ImagePositionalEmbeddings`.
+        num_vector_embeds (`int`, *optional*):
+            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+            Includes the class for the masked latent pixel.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
+            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
+            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
+            up to but not more than steps than `num_embeds_ada_norm`.
+        attention_bias (`bool`, *optional*):
+            Configure if the TransformerBlocks' attention should contain a bias parameter.
+    """
+
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+    ):
+        super().__init__()
+        self.transformers = nn.ModuleList(
+            [
+                Transformer2DModel(
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    in_channels=in_channels,
+                    num_layers=num_layers,
+                    dropout=dropout,
+                    norm_num_groups=norm_num_groups,
+                    cross_attention_dim=cross_attention_dim,
+                    attention_bias=attention_bias,
+                    sample_size=sample_size,
+                    num_vector_embeds=num_vector_embeds,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                )
+                for _ in range(2)
+            ]
+        )
+
+        # Variables that can be set by a pipeline:
+
+        # The ratio of transformer1 to transformer2's output states to be combined during inference
+        self.mix_ratio = 0.5
+
+        # The shape of `encoder_hidden_states` is expected to be
+        # `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)`
+        self.condition_lengths = [77, 257]
+
+        # Which transformer to use to encode which condition.
+        # E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])`
+        self.transformer_index_for_condition = [1, 0]
+
+    def forward(
+        self,
+        hidden_states,
+        encoder_hidden_states,
+        timestep=None,
+        attention_mask=None,
+        cross_attention_kwargs=None,
+        return_dict: bool = True,
+    ):
+        """
+        Args:
+            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
+                When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states.
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.long`, *optional*):
+                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
+            attention_mask (`torch.Tensor`, *optional*):
+                Optional attention mask to be applied in Attention.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            [`~models.transformers.transformer_2d.Transformer2DModelOutput`] or `tuple`:
+            [`~models.transformers.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is the sample tensor.
+        """
+        input_states = hidden_states
+
+        encoded_states = []
+        tokens_start = 0
+        # attention_mask is not used yet
+        for i in range(2):
+            # for each of the two transformers, pass the corresponding condition tokens
+            condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]]
+            transformer_index = self.transformer_index_for_condition[i]
+            encoded_state = self.transformers[transformer_index](
+                input_states,
+                encoder_hidden_states=condition_state,
+                timestep=timestep,
+                cross_attention_kwargs=cross_attention_kwargs,
+                return_dict=False,
+            )[0]
+            encoded_states.append(encoded_state - input_states)
+            tokens_start += self.condition_lengths[i]
+
+        output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio)
+        output_states = output_states + input_states
+
+        if not return_dict:
+            return (output_states,)
+
+        return Transformer2DModelOutput(sample=output_states)
diff --git a/diffusers/src/diffusers/models/transformers/hunyuan_transformer_2d.py b/diffusers/src/diffusers/models/transformers/hunyuan_transformer_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f3dab220aaacc4a9a10e5692ecb09fcc6dedf3c
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/hunyuan_transformer_2d.py
@@ -0,0 +1,578 @@
+# Copyright 2024 HunyuanDiT Authors, Qixun Wang and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Optional, Union
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import logging
+from ...utils.torch_utils import maybe_allow_in_graph
+from ..attention import FeedForward
+from ..attention_processor import Attention, AttentionProcessor, FusedHunyuanAttnProcessor2_0, HunyuanAttnProcessor2_0
+from ..embeddings import (
+    HunyuanCombinedTimestepTextSizeStyleEmbedding,
+    PatchEmbed,
+    PixArtAlphaTextProjection,
+)
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormContinuous, FP32LayerNorm
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class AdaLayerNormShift(nn.Module):
+    r"""
+    Norm layer modified to incorporate timestep embeddings.
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        num_embeddings (`int`): The size of the embeddings dictionary.
+    """
+
+    def __init__(self, embedding_dim: int, elementwise_affine=True, eps=1e-6):
+        super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, embedding_dim)
+        self.norm = FP32LayerNorm(embedding_dim, elementwise_affine=elementwise_affine, eps=eps)
+
+    def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
+        shift = self.linear(self.silu(emb.to(torch.float32)).to(emb.dtype))
+        x = self.norm(x) + shift.unsqueeze(dim=1)
+        return x
+
+
+@maybe_allow_in_graph
+class HunyuanDiTBlock(nn.Module):
+    r"""
+    Transformer block used in Hunyuan-DiT model (https://github.com/Tencent/HunyuanDiT). Allow skip connection and
+    QKNorm
+
+    Parameters:
+        dim (`int`):
+            The number of channels in the input and output.
+        num_attention_heads (`int`):
+            The number of headsto use for multi-head attention.
+        cross_attention_dim (`int`,*optional*):
+            The size of the encoder_hidden_states vector for cross attention.
+        dropout(`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
+        activation_fn (`str`,*optional*, defaults to `"geglu"`):
+            Activation function to be used in feed-forward. .
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_eps (`float`, *optional*, defaults to 1e-6):
+            A small constant added to the denominator in normalization layers to prevent division by zero.
+        final_dropout (`bool` *optional*, defaults to False):
+            Whether to apply a final dropout after the last feed-forward layer.
+        ff_inner_dim (`int`, *optional*):
+            The size of the hidden layer in the feed-forward block. Defaults to `None`.
+        ff_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the feed-forward block.
+        skip (`bool`, *optional*, defaults to `False`):
+            Whether to use skip connection. Defaults to `False` for down-blocks and mid-blocks.
+        qk_norm (`bool`, *optional*, defaults to `True`):
+            Whether to use normalization in QK calculation. Defaults to `True`.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        cross_attention_dim: int = 1024,
+        dropout=0.0,
+        activation_fn: str = "geglu",
+        norm_elementwise_affine: bool = True,
+        norm_eps: float = 1e-6,
+        final_dropout: bool = False,
+        ff_inner_dim: Optional[int] = None,
+        ff_bias: bool = True,
+        skip: bool = False,
+        qk_norm: bool = True,
+    ):
+        super().__init__()
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # NOTE: when new version comes, check norm2 and norm 3
+        # 1. Self-Attn
+        self.norm1 = AdaLayerNormShift(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+        self.attn1 = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,
+            dim_head=dim // num_attention_heads,
+            heads=num_attention_heads,
+            qk_norm="layer_norm" if qk_norm else None,
+            eps=1e-6,
+            bias=True,
+            processor=HunyuanAttnProcessor2_0(),
+        )
+
+        # 2. Cross-Attn
+        self.norm2 = FP32LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+        self.attn2 = Attention(
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            dim_head=dim // num_attention_heads,
+            heads=num_attention_heads,
+            qk_norm="layer_norm" if qk_norm else None,
+            eps=1e-6,
+            bias=True,
+            processor=HunyuanAttnProcessor2_0(),
+        )
+        # 3. Feed-forward
+        self.norm3 = FP32LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,  ### 0.0
+            activation_fn=activation_fn,  ### approx GeLU
+            final_dropout=final_dropout,  ### 0.0
+            inner_dim=ff_inner_dim,  ### int(dim * mlp_ratio)
+            bias=ff_bias,
+        )
+
+        # 4. Skip Connection
+        if skip:
+            self.skip_norm = FP32LayerNorm(2 * dim, norm_eps, elementwise_affine=True)
+            self.skip_linear = nn.Linear(2 * dim, dim)
+        else:
+            self.skip_linear = None
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    # Copied from diffusers.models.attention.BasicTransformerBlock.set_chunk_feed_forward
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        image_rotary_emb=None,
+        skip=None,
+    ) -> torch.Tensor:
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Long Skip Connection
+        if self.skip_linear is not None:
+            cat = torch.cat([hidden_states, skip], dim=-1)
+            cat = self.skip_norm(cat)
+            hidden_states = self.skip_linear(cat)
+
+        # 1. Self-Attention
+        norm_hidden_states = self.norm1(hidden_states, temb)  ### checked: self.norm1 is correct
+        attn_output = self.attn1(
+            norm_hidden_states,
+            image_rotary_emb=image_rotary_emb,
+        )
+        hidden_states = hidden_states + attn_output
+
+        # 2. Cross-Attention
+        hidden_states = hidden_states + self.attn2(
+            self.norm2(hidden_states),
+            encoder_hidden_states=encoder_hidden_states,
+            image_rotary_emb=image_rotary_emb,
+        )
+
+        # FFN Layer ### TODO: switch norm2 and norm3 in the state dict
+        mlp_inputs = self.norm3(hidden_states)
+        hidden_states = hidden_states + self.ff(mlp_inputs)
+
+        return hidden_states
+
+
+class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
+    """
+    HunYuanDiT: Diffusion model with a Transformer backbone.
+
+    Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88):
+            The number of channels in each head.
+        in_channels (`int`, *optional*):
+            The number of channels in the input and output (specify if the input is **continuous**).
+        patch_size (`int`, *optional*):
+            The size of the patch to use for the input.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`):
+            Activation function to use in feed-forward.
+        sample_size (`int`, *optional*):
+            The width of the latent images. This is fixed during training since it is used to learn a number of
+            position embeddings.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
+        cross_attention_dim (`int`, *optional*):
+            The number of dimension in the clip text embedding.
+        hidden_size (`int`, *optional*):
+            The size of hidden layer in the conditioning embedding layers.
+        num_layers (`int`, *optional*, defaults to 1):
+            The number of layers of Transformer blocks to use.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            The ratio of the hidden layer size to the input size.
+        learn_sigma (`bool`, *optional*, defaults to `True`):
+             Whether to predict variance.
+        cross_attention_dim_t5 (`int`, *optional*):
+            The number dimensions in t5 text embedding.
+        pooled_projection_dim (`int`, *optional*):
+            The size of the pooled projection.
+        text_len (`int`, *optional*):
+            The length of the clip text embedding.
+        text_len_t5 (`int`, *optional*):
+            The length of the T5 text embedding.
+        use_style_cond_and_image_meta_size (`bool`,  *optional*):
+            Whether or not to use style condition and image meta size. True for version <=1.1, False for version >= 1.2
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        patch_size: Optional[int] = None,
+        activation_fn: str = "gelu-approximate",
+        sample_size=32,
+        hidden_size=1152,
+        num_layers: int = 28,
+        mlp_ratio: float = 4.0,
+        learn_sigma: bool = True,
+        cross_attention_dim: int = 1024,
+        norm_type: str = "layer_norm",
+        cross_attention_dim_t5: int = 2048,
+        pooled_projection_dim: int = 1024,
+        text_len: int = 77,
+        text_len_t5: int = 256,
+        use_style_cond_and_image_meta_size: bool = True,
+    ):
+        super().__init__()
+        self.out_channels = in_channels * 2 if learn_sigma else in_channels
+        self.num_heads = num_attention_heads
+        self.inner_dim = num_attention_heads * attention_head_dim
+
+        self.text_embedder = PixArtAlphaTextProjection(
+            in_features=cross_attention_dim_t5,
+            hidden_size=cross_attention_dim_t5 * 4,
+            out_features=cross_attention_dim,
+            act_fn="silu_fp32",
+        )
+
+        self.text_embedding_padding = nn.Parameter(
+            torch.randn(text_len + text_len_t5, cross_attention_dim, dtype=torch.float32)
+        )
+
+        self.pos_embed = PatchEmbed(
+            height=sample_size,
+            width=sample_size,
+            in_channels=in_channels,
+            embed_dim=hidden_size,
+            patch_size=patch_size,
+            pos_embed_type=None,
+        )
+
+        self.time_extra_emb = HunyuanCombinedTimestepTextSizeStyleEmbedding(
+            hidden_size,
+            pooled_projection_dim=pooled_projection_dim,
+            seq_len=text_len_t5,
+            cross_attention_dim=cross_attention_dim_t5,
+            use_style_cond_and_image_meta_size=use_style_cond_and_image_meta_size,
+        )
+
+        # HunyuanDiT Blocks
+        self.blocks = nn.ModuleList(
+            [
+                HunyuanDiTBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=self.config.num_attention_heads,
+                    activation_fn=activation_fn,
+                    ff_inner_dim=int(self.inner_dim * mlp_ratio),
+                    cross_attention_dim=cross_attention_dim,
+                    qk_norm=True,  # See http://arxiv.org/abs/2302.05442 for details.
+                    skip=layer > num_layers // 2,
+                )
+                for layer in range(num_layers)
+            ]
+        )
+
+        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedHunyuanAttnProcessor2_0
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedHunyuanAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        self.set_attn_processor(HunyuanAttnProcessor2_0())
+
+    def forward(
+        self,
+        hidden_states,
+        timestep,
+        encoder_hidden_states=None,
+        text_embedding_mask=None,
+        encoder_hidden_states_t5=None,
+        text_embedding_mask_t5=None,
+        image_meta_size=None,
+        style=None,
+        image_rotary_emb=None,
+        controlnet_block_samples=None,
+        return_dict=True,
+    ):
+        """
+        The [`HunyuanDiT2DModel`] forward method.
+
+        Args:
+        hidden_states (`torch.Tensor` of shape `(batch size, dim, height, width)`):
+            The input tensor.
+        timestep ( `torch.LongTensor`, *optional*):
+            Used to indicate denoising step.
+        encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+            Conditional embeddings for cross attention layer. This is the output of `BertModel`.
+        text_embedding_mask: torch.Tensor
+            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. This is the output
+            of `BertModel`.
+        encoder_hidden_states_t5 ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+            Conditional embeddings for cross attention layer. This is the output of T5 Text Encoder.
+        text_embedding_mask_t5: torch.Tensor
+            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. This is the output
+            of T5 Text Encoder.
+        image_meta_size (torch.Tensor):
+            Conditional embedding indicate the image sizes
+        style: torch.Tensor:
+            Conditional embedding indicate the style
+        image_rotary_emb (`torch.Tensor`):
+            The image rotary embeddings to apply on query and key tensors during attention calculation.
+        return_dict: bool
+            Whether to return a dictionary.
+        """
+
+        height, width = hidden_states.shape[-2:]
+
+        hidden_states = self.pos_embed(hidden_states)
+
+        temb = self.time_extra_emb(
+            timestep, encoder_hidden_states_t5, image_meta_size, style, hidden_dtype=timestep.dtype
+        )  # [B, D]
+
+        # text projection
+        batch_size, sequence_length, _ = encoder_hidden_states_t5.shape
+        encoder_hidden_states_t5 = self.text_embedder(
+            encoder_hidden_states_t5.view(-1, encoder_hidden_states_t5.shape[-1])
+        )
+        encoder_hidden_states_t5 = encoder_hidden_states_t5.view(batch_size, sequence_length, -1)
+
+        encoder_hidden_states = torch.cat([encoder_hidden_states, encoder_hidden_states_t5], dim=1)
+        text_embedding_mask = torch.cat([text_embedding_mask, text_embedding_mask_t5], dim=-1)
+        text_embedding_mask = text_embedding_mask.unsqueeze(2).bool()
+
+        encoder_hidden_states = torch.where(text_embedding_mask, encoder_hidden_states, self.text_embedding_padding)
+
+        skips = []
+        for layer, block in enumerate(self.blocks):
+            if layer > self.config.num_layers // 2:
+                if controlnet_block_samples is not None:
+                    skip = skips.pop() + controlnet_block_samples.pop()
+                else:
+                    skip = skips.pop()
+                hidden_states = block(
+                    hidden_states,
+                    temb=temb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    image_rotary_emb=image_rotary_emb,
+                    skip=skip,
+                )  # (N, L, D)
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    temb=temb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    image_rotary_emb=image_rotary_emb,
+                )  # (N, L, D)
+
+            if layer < (self.config.num_layers // 2 - 1):
+                skips.append(hidden_states)
+
+        if controlnet_block_samples is not None and len(controlnet_block_samples) != 0:
+            raise ValueError("The number of controls is not equal to the number of skip connections.")
+
+        # final layer
+        hidden_states = self.norm_out(hidden_states, temb.to(torch.float32))
+        hidden_states = self.proj_out(hidden_states)
+        # (N, L, patch_size ** 2 * out_channels)
+
+        # unpatchify: (N, out_channels, H, W)
+        patch_size = self.pos_embed.patch_size
+        height = height // patch_size
+        width = width // patch_size
+
+        hidden_states = hidden_states.reshape(
+            shape=(hidden_states.shape[0], height, width, patch_size, patch_size, self.out_channels)
+        )
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(
+            shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size)
+        )
+        if not return_dict:
+            return (output,)
+        return Transformer2DModelOutput(sample=output)
+
+    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
+    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
+        """
+        Sets the attention processor to use [feed forward
+        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+
+        Parameters:
+            chunk_size (`int`, *optional*):
+                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
+                over each tensor of dim=`dim`.
+            dim (`int`, *optional*, defaults to `0`):
+                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
+                or dim=1 (sequence length).
+        """
+        if dim not in [0, 1]:
+            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
+
+        # By default chunk size is 1
+        chunk_size = chunk_size or 1
+
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, chunk_size, dim)
+
+    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.disable_forward_chunking
+    def disable_forward_chunking(self):
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, None, 0)
diff --git a/diffusers/src/diffusers/models/transformers/latte_transformer_3d.py b/diffusers/src/diffusers/models/transformers/latte_transformer_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d19216e5ff656daa8e8e5da147ef57587fb543
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/latte_transformer_3d.py
@@ -0,0 +1,327 @@
+# Copyright 2024 the Latte Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models.embeddings import PixArtAlphaTextProjection, get_1d_sincos_pos_embed_from_grid
+from ..attention import BasicTransformerBlock
+from ..embeddings import PatchEmbed
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormSingle
+
+
+class LatteTransformer3DModel(ModelMixin, ConfigMixin):
+    _supports_gradient_checkpointing = True
+
+    """
+    A 3D Transformer model for video-like data, paper: https://arxiv.org/abs/2401.03048, offical code:
+    https://github.com/Vchitect/Latte
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            The number of channels in the input.
+        out_channels (`int`, *optional*):
+            The number of channels in the output.
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        attention_bias (`bool`, *optional*):
+            Configure if the `TransformerBlocks` attention should contain a bias parameter.
+        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+            This is fixed during training since it is used to learn a number of position embeddings.
+        patch_size (`int`, *optional*):
+            The size of the patches to use in the patch embedding layer.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*):
+            The number of diffusion steps used during training. Pass if at least one of the norm_layers is
+            `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
+            added to the hidden states. During inference, you can denoise for up to but not more steps than
+            `num_embeds_ada_norm`.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The type of normalization to use. Options are `"layer_norm"` or `"ada_layer_norm"`.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether or not to use elementwise affine in normalization layers.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use in normalization layers.
+        caption_channels (`int`, *optional*):
+            The number of channels in the caption embeddings.
+        video_length (`int`, *optional*):
+            The number of frames in the video-like data.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: int = 64,
+        patch_size: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        norm_type: str = "layer_norm",
+        norm_elementwise_affine: bool = True,
+        norm_eps: float = 1e-5,
+        caption_channels: int = None,
+        video_length: int = 16,
+    ):
+        super().__init__()
+        inner_dim = num_attention_heads * attention_head_dim
+
+        # 1. Define input layers
+        self.height = sample_size
+        self.width = sample_size
+
+        interpolation_scale = self.config.sample_size // 64
+        interpolation_scale = max(interpolation_scale, 1)
+        self.pos_embed = PatchEmbed(
+            height=sample_size,
+            width=sample_size,
+            patch_size=patch_size,
+            in_channels=in_channels,
+            embed_dim=inner_dim,
+            interpolation_scale=interpolation_scale,
+        )
+
+        # 2. Define spatial transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    norm_eps=norm_eps,
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        # 3. Define temporal transformers blocks
+        self.temporal_transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=None,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    norm_eps=norm_eps,
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        # 4. Define output layers
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+        self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
+        self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
+
+        # 5. Latte other blocks.
+        self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=False)
+        self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
+
+        # define temporal positional embedding
+        temp_pos_embed = get_1d_sincos_pos_embed_from_grid(
+            inner_dim, torch.arange(0, video_length).unsqueeze(1)
+        )  # 1152 hidden size
+        self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
+
+        self.gradient_checkpointing = False
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        self.gradient_checkpointing = value
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: Optional[torch.LongTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        enable_temporal_attentions: bool = True,
+        return_dict: bool = True,
+    ):
+        """
+        The [`LatteTransformer3DModel`] forward method.
+
+        Args:
+            hidden_states shape `(batch size, channel, num_frame, height, width)`:
+                Input `hidden_states`.
+            timestep ( `torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                    * Mask `(batcheight, sequence_length)` True = keep, False = discard.
+                    * Bias `(batcheight, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            enable_temporal_attentions:
+                (`bool`, *optional*, defaults to `True`): Whether to enable temporal attentions.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+
+        # Reshape hidden states
+        batch_size, channels, num_frame, height, width = hidden_states.shape
+        # batch_size channels num_frame height width -> (batch_size * num_frame) channels height width
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(-1, channels, height, width)
+
+        # Input
+        height, width = (
+            hidden_states.shape[-2] // self.config.patch_size,
+            hidden_states.shape[-1] // self.config.patch_size,
+        )
+        num_patches = height * width
+
+        hidden_states = self.pos_embed(hidden_states)  # alrady add positional embeddings
+
+        added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+        timestep, embedded_timestep = self.adaln_single(
+            timestep, added_cond_kwargs=added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+        )
+
+        # Prepare text embeddings for spatial block
+        # batch_size num_tokens hidden_size -> (batch_size * num_frame) num_tokens hidden_size
+        encoder_hidden_states = self.caption_projection(encoder_hidden_states)  # 3 120 1152
+        encoder_hidden_states_spatial = encoder_hidden_states.repeat_interleave(num_frame, dim=0).view(
+            -1, encoder_hidden_states.shape[-2], encoder_hidden_states.shape[-1]
+        )
+
+        # Prepare timesteps for spatial and temporal block
+        timestep_spatial = timestep.repeat_interleave(num_frame, dim=0).view(-1, timestep.shape[-1])
+        timestep_temp = timestep.repeat_interleave(num_patches, dim=0).view(-1, timestep.shape[-1])
+
+        # Spatial and temporal transformer blocks
+        for i, (spatial_block, temp_block) in enumerate(
+            zip(self.transformer_blocks, self.temporal_transformer_blocks)
+        ):
+            if self.training and self.gradient_checkpointing:
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    spatial_block,
+                    hidden_states,
+                    None,  # attention_mask
+                    encoder_hidden_states_spatial,
+                    encoder_attention_mask,
+                    timestep_spatial,
+                    None,  # cross_attention_kwargs
+                    None,  # class_labels
+                    use_reentrant=False,
+                )
+            else:
+                hidden_states = spatial_block(
+                    hidden_states,
+                    None,  # attention_mask
+                    encoder_hidden_states_spatial,
+                    encoder_attention_mask,
+                    timestep_spatial,
+                    None,  # cross_attention_kwargs
+                    None,  # class_labels
+                )
+
+            if enable_temporal_attentions:
+                # (batch_size * num_frame) num_tokens hidden_size -> (batch_size * num_tokens) num_frame hidden_size
+                hidden_states = hidden_states.reshape(
+                    batch_size, -1, hidden_states.shape[-2], hidden_states.shape[-1]
+                ).permute(0, 2, 1, 3)
+                hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])
+
+                if i == 0 and num_frame > 1:
+                    hidden_states = hidden_states + self.temp_pos_embed
+
+                if self.training and self.gradient_checkpointing:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        temp_block,
+                        hidden_states,
+                        None,  # attention_mask
+                        None,  # encoder_hidden_states
+                        None,  # encoder_attention_mask
+                        timestep_temp,
+                        None,  # cross_attention_kwargs
+                        None,  # class_labels
+                        use_reentrant=False,
+                    )
+                else:
+                    hidden_states = temp_block(
+                        hidden_states,
+                        None,  # attention_mask
+                        None,  # encoder_hidden_states
+                        None,  # encoder_attention_mask
+                        timestep_temp,
+                        None,  # cross_attention_kwargs
+                        None,  # class_labels
+                    )
+
+                # (batch_size * num_tokens) num_frame hidden_size -> (batch_size * num_frame) num_tokens hidden_size
+                hidden_states = hidden_states.reshape(
+                    batch_size, -1, hidden_states.shape[-2], hidden_states.shape[-1]
+                ).permute(0, 2, 1, 3)
+                hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])
+
+        embedded_timestep = embedded_timestep.repeat_interleave(num_frame, dim=0).view(-1, embedded_timestep.shape[-1])
+        shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
+        hidden_states = self.norm_out(hidden_states)
+        # Modulation
+        hidden_states = hidden_states * (1 + scale) + shift
+        hidden_states = self.proj_out(hidden_states)
+
+        # unpatchify
+        if self.adaln_single is None:
+            height = width = int(hidden_states.shape[1] ** 0.5)
+        hidden_states = hidden_states.reshape(
+            shape=(-1, height, width, self.config.patch_size, self.config.patch_size, self.out_channels)
+        )
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(
+            shape=(-1, self.out_channels, height * self.config.patch_size, width * self.config.patch_size)
+        )
+        output = output.reshape(batch_size, -1, output.shape[-3], output.shape[-2], output.shape[-1]).permute(
+            0, 2, 1, 3, 4
+        )
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/diffusers/src/diffusers/models/transformers/lumina_nextdit2d.py b/diffusers/src/diffusers/models/transformers/lumina_nextdit2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4f5b4658542e571153b5de530b0750843eddbb4
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/lumina_nextdit2d.py
@@ -0,0 +1,340 @@
+# Copyright 2024 Alpha-VLLM Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import logging
+from ..attention import LuminaFeedForward
+from ..attention_processor import Attention, LuminaAttnProcessor2_0
+from ..embeddings import (
+    LuminaCombinedTimestepCaptionEmbedding,
+    LuminaPatchEmbed,
+)
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import LuminaLayerNormContinuous, LuminaRMSNormZero, RMSNorm
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class LuminaNextDiTBlock(nn.Module):
+    """
+    A LuminaNextDiTBlock for LuminaNextDiT2DModel.
+
+    Parameters:
+        dim (`int`): Embedding dimension of the input features.
+        num_attention_heads (`int`): Number of attention heads.
+        num_kv_heads (`int`):
+            Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
+        multiple_of (`int`): The number of multiple of ffn layer.
+        ffn_dim_multiplier (`float`): The multipier factor of ffn layer dimension.
+        norm_eps (`float`): The eps for norm layer.
+        qk_norm (`bool`): normalization for query and key.
+        cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states.
+        norm_elementwise_affine (`bool`, *optional*, defaults to True),
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        num_kv_heads: int,
+        multiple_of: int,
+        ffn_dim_multiplier: float,
+        norm_eps: float,
+        qk_norm: bool,
+        cross_attention_dim: int,
+        norm_elementwise_affine: bool = True,
+    ) -> None:
+        super().__init__()
+        self.head_dim = dim // num_attention_heads
+
+        self.gate = nn.Parameter(torch.zeros([num_attention_heads]))
+
+        # Self-attention
+        self.attn1 = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,
+            dim_head=dim // num_attention_heads,
+            qk_norm="layer_norm_across_heads" if qk_norm else None,
+            heads=num_attention_heads,
+            kv_heads=num_kv_heads,
+            eps=1e-5,
+            bias=False,
+            out_bias=False,
+            processor=LuminaAttnProcessor2_0(),
+        )
+        self.attn1.to_out = nn.Identity()
+
+        # Cross-attention
+        self.attn2 = Attention(
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            dim_head=dim // num_attention_heads,
+            qk_norm="layer_norm_across_heads" if qk_norm else None,
+            heads=num_attention_heads,
+            kv_heads=num_kv_heads,
+            eps=1e-5,
+            bias=False,
+            out_bias=False,
+            processor=LuminaAttnProcessor2_0(),
+        )
+
+        self.feed_forward = LuminaFeedForward(
+            dim=dim,
+            inner_dim=4 * dim,
+            multiple_of=multiple_of,
+            ffn_dim_multiplier=ffn_dim_multiplier,
+        )
+
+        self.norm1 = LuminaRMSNormZero(
+            embedding_dim=dim,
+            norm_eps=norm_eps,
+            norm_elementwise_affine=norm_elementwise_affine,
+        )
+        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
+
+        self.norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
+        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
+
+        self.norm1_context = RMSNorm(cross_attention_dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        image_rotary_emb: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_mask: torch.Tensor,
+        temb: torch.Tensor,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Perform a forward pass through the LuminaNextDiTBlock.
+
+        Parameters:
+            hidden_states (`torch.Tensor`): The input of hidden_states for LuminaNextDiTBlock.
+            attention_mask (`torch.Tensor): The input of hidden_states corresponse attention mask.
+            image_rotary_emb (`torch.Tensor`): Precomputed cosine and sine frequencies.
+            encoder_hidden_states: (`torch.Tensor`): The hidden_states of text prompt are processed by Gemma encoder.
+            encoder_mask (`torch.Tensor`): The hidden_states of text prompt attention mask.
+            temb (`torch.Tensor`): Timestep embedding with text prompt embedding.
+            cross_attention_kwargs (`Dict[str, Any]`): kwargs for cross attention.
+        """
+        residual = hidden_states
+
+        # Self-attention
+        norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
+        self_attn_output = self.attn1(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_hidden_states,
+            attention_mask=attention_mask,
+            query_rotary_emb=image_rotary_emb,
+            key_rotary_emb=image_rotary_emb,
+            **cross_attention_kwargs,
+        )
+
+        # Cross-attention
+        norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)
+        cross_attn_output = self.attn2(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            attention_mask=encoder_mask,
+            query_rotary_emb=image_rotary_emb,
+            key_rotary_emb=None,
+            **cross_attention_kwargs,
+        )
+        cross_attn_output = cross_attn_output * self.gate.tanh().view(1, 1, -1, 1)
+        mixed_attn_output = self_attn_output + cross_attn_output
+        mixed_attn_output = mixed_attn_output.flatten(-2)
+        # linear proj
+        hidden_states = self.attn2.to_out[0](mixed_attn_output)
+
+        hidden_states = residual + gate_msa.unsqueeze(1).tanh() * self.norm2(hidden_states)
+
+        mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
+
+        hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
+
+        return hidden_states
+
+
+class LuminaNextDiT2DModel(ModelMixin, ConfigMixin):
+    """
+    LuminaNextDiT: Diffusion model with a Transformer backbone.
+
+    Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.
+
+    Parameters:
+        sample_size (`int`): The width of the latent images. This is fixed during training since
+            it is used to learn a number of position embeddings.
+        patch_size (`int`, *optional*, (`int`, *optional*, defaults to 2):
+            The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
+        in_channels (`int`, *optional*, defaults to 4):
+            The number of input channels for the model. Typically, this matches the number of channels in the input
+            images.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
+            hidden representations.
+        num_layers (`int`, *optional*, default to 32):
+            The number of layers in the model. This defines the depth of the neural network.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            The number of attention heads in each attention layer. This parameter specifies how many separate attention
+            mechanisms are used.
+        num_kv_heads (`int`, *optional*, defaults to 8):
+            The number of key-value heads in the attention mechanism, if different from the number of attention heads.
+            If None, it defaults to num_attention_heads.
+        multiple_of (`int`, *optional*, defaults to 256):
+            A factor that the hidden size should be a multiple of. This can help optimize certain hardware
+            configurations.
+        ffn_dim_multiplier (`float`, *optional*):
+            A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
+            the model configuration.
+        norm_eps (`float`, *optional*, defaults to 1e-5):
+            A small value added to the denominator for numerical stability in normalization layers.
+        learn_sigma (`bool`, *optional*, defaults to True):
+            Whether the model should learn the sigma parameter, which might be related to uncertainty or variance in
+            predictions.
+        qk_norm (`bool`, *optional*, defaults to True):
+            Indicates if the queries and keys in the attention mechanism should be normalized.
+        cross_attention_dim (`int`, *optional*, defaults to 2048):
+            The dimensionality of the text embeddings. This parameter defines the size of the text representations used
+            in the model.
+        scaling_factor (`float`, *optional*, defaults to 1.0):
+            A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
+            overall scale of the model's operations.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: int = 128,
+        patch_size: Optional[int] = 2,
+        in_channels: Optional[int] = 4,
+        hidden_size: Optional[int] = 2304,
+        num_layers: Optional[int] = 32,
+        num_attention_heads: Optional[int] = 32,
+        num_kv_heads: Optional[int] = None,
+        multiple_of: Optional[int] = 256,
+        ffn_dim_multiplier: Optional[float] = None,
+        norm_eps: Optional[float] = 1e-5,
+        learn_sigma: Optional[bool] = True,
+        qk_norm: Optional[bool] = True,
+        cross_attention_dim: Optional[int] = 2048,
+        scaling_factor: Optional[float] = 1.0,
+    ) -> None:
+        super().__init__()
+        self.sample_size = sample_size
+        self.patch_size = patch_size
+        self.in_channels = in_channels
+        self.out_channels = in_channels * 2 if learn_sigma else in_channels
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = hidden_size // num_attention_heads
+        self.scaling_factor = scaling_factor
+
+        self.patch_embedder = LuminaPatchEmbed(
+            patch_size=patch_size, in_channels=in_channels, embed_dim=hidden_size, bias=True
+        )
+
+        self.pad_token = nn.Parameter(torch.empty(hidden_size))
+
+        self.time_caption_embed = LuminaCombinedTimestepCaptionEmbedding(
+            hidden_size=min(hidden_size, 1024), cross_attention_dim=cross_attention_dim
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                LuminaNextDiTBlock(
+                    hidden_size,
+                    num_attention_heads,
+                    num_kv_heads,
+                    multiple_of,
+                    ffn_dim_multiplier,
+                    norm_eps,
+                    qk_norm,
+                    cross_attention_dim,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        self.norm_out = LuminaLayerNormContinuous(
+            embedding_dim=hidden_size,
+            conditioning_embedding_dim=min(hidden_size, 1024),
+            elementwise_affine=False,
+            eps=1e-6,
+            bias=True,
+            out_dim=patch_size * patch_size * self.out_channels,
+        )
+        # self.final_layer = LuminaFinalLayer(hidden_size, patch_size, self.out_channels)
+
+        assert (hidden_size // num_attention_heads) % 4 == 0, "2d rope needs head dim to be divisible by 4"
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_mask: torch.Tensor,
+        image_rotary_emb: torch.Tensor,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        return_dict=True,
+    ) -> torch.Tensor:
+        """
+        Forward pass of LuminaNextDiT.
+
+        Parameters:
+            hidden_states (torch.Tensor): Input tensor of shape (N, C, H, W).
+            timestep (torch.Tensor): Tensor of diffusion timesteps of shape (N,).
+            encoder_hidden_states (torch.Tensor): Tensor of caption features of shape (N, D).
+            encoder_mask (torch.Tensor): Tensor of caption masks of shape (N, L).
+        """
+        hidden_states, mask, img_size, image_rotary_emb = self.patch_embedder(hidden_states, image_rotary_emb)
+        image_rotary_emb = image_rotary_emb.to(hidden_states.device)
+
+        temb = self.time_caption_embed(timestep, encoder_hidden_states, encoder_mask)
+
+        encoder_mask = encoder_mask.bool()
+        for layer in self.layers:
+            hidden_states = layer(
+                hidden_states,
+                mask,
+                image_rotary_emb,
+                encoder_hidden_states,
+                encoder_mask,
+                temb=temb,
+                cross_attention_kwargs=cross_attention_kwargs,
+            )
+
+        hidden_states = self.norm_out(hidden_states, temb)
+
+        # unpatchify
+        height_tokens = width_tokens = self.patch_size
+        height, width = img_size[0]
+        batch_size = hidden_states.size(0)
+        sequence_length = (height // height_tokens) * (width // width_tokens)
+        hidden_states = hidden_states[:, :sequence_length].view(
+            batch_size, height // height_tokens, width // width_tokens, height_tokens, width_tokens, self.out_channels
+        )
+        output = hidden_states.permute(0, 5, 1, 3, 2, 4).flatten(4, 5).flatten(2, 3)
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/diffusers/src/diffusers/models/transformers/pixart_transformer_2d.py b/diffusers/src/diffusers/models/transformers/pixart_transformer_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..025df92de6cff45bf201c5cb1c9f870584b8a547
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/pixart_transformer_2d.py
@@ -0,0 +1,482 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional, Union
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import is_torch_version, logging
+from ..attention import BasicTransformerBlock
+from ..attention_processor import Attention, AttentionProcessor, AttnProcessor, FusedAttnProcessor2_0
+from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormSingle
+from ...loaders import PeftAdapterMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class PixArtTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+    r"""
+    A 2D Transformer model as introduced in PixArt family of models (https://arxiv.org/abs/2310.00426,
+    https://arxiv.org/abs/2403.04692).
+
+    Parameters:
+        num_attention_heads (int, optional, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (int, optional, defaults to 72): The number of channels in each head.
+        in_channels (int, defaults to 4): The number of channels in the input.
+        out_channels (int, optional):
+            The number of channels in the output. Specify this parameter if the output channel number differs from the
+            input.
+        num_layers (int, optional, defaults to 28): The number of layers of Transformer blocks to use.
+        dropout (float, optional, defaults to 0.0): The dropout probability to use within the Transformer blocks.
+        norm_num_groups (int, optional, defaults to 32):
+            Number of groups for group normalization within Transformer blocks.
+        cross_attention_dim (int, optional):
+            The dimensionality for cross-attention layers, typically matching the encoder's hidden dimension.
+        attention_bias (bool, optional, defaults to True):
+            Configure if the Transformer blocks' attention should contain a bias parameter.
+        sample_size (int, defaults to 128):
+            The width of the latent images. This parameter is fixed during training.
+        patch_size (int, defaults to 2):
+            Size of the patches the model processes, relevant for architectures working on non-sequential data.
+        activation_fn (str, optional, defaults to "gelu-approximate"):
+            Activation function to use in feed-forward networks within Transformer blocks.
+        num_embeds_ada_norm (int, optional, defaults to 1000):
+            Number of embeddings for AdaLayerNorm, fixed during training and affects the maximum denoising steps during
+            inference.
+        upcast_attention (bool, optional, defaults to False):
+            If true, upcasts the attention mechanism dimensions for potentially improved performance.
+        norm_type (str, optional, defaults to "ada_norm_zero"):
+            Specifies the type of normalization used, can be 'ada_norm_zero'.
+        norm_elementwise_affine (bool, optional, defaults to False):
+            If true, enables element-wise affine parameters in the normalization layers.
+        norm_eps (float, optional, defaults to 1e-6):
+            A small constant added to the denominator in normalization layers to prevent division by zero.
+        interpolation_scale (int, optional): Scale factor to use during interpolating the position embeddings.
+        use_additional_conditions (bool, optional): If we're using additional conditions as inputs.
+        attention_type (str, optional, defaults to "default"): Kind of attention mechanism to be used.
+        caption_channels (int, optional, defaults to None):
+            Number of channels to use for projecting the caption embeddings.
+        use_linear_projection (bool, optional, defaults to False):
+            Deprecated argument. Will be removed in a future version.
+        num_vector_embeds (bool, optional, defaults to False):
+            Deprecated argument. Will be removed in a future version.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["BasicTransformerBlock", "PatchEmbed"]
+# {
+#   "_class_name": "Transformer2DModel",
+#   "_diffusers_version": "0.22.0.dev0",
+#   "activation_fn": "gelu-approximate",
+#   "attention_bias": true,
+#   "attention_head_dim": 72,
+#   "attention_type": "default",
+#   "caption_channels": 4096,
+#   "cross_attention_dim": 1152,
+#   "double_self_attention": false,
+#   "dropout": 0.0,
+#   "in_channels": 4,
+#   "norm_elementwise_affine": false,
+#   "norm_eps": 1e-06,
+#   "norm_num_groups": 32,
+#   "norm_type": "ada_norm_single",
+#   "num_attention_heads": 16,
+#   "num_embeds_ada_norm": 1000,
+#   "num_layers": 28,
+#   "num_vector_embeds": null,
+#   "only_cross_attention": false,
+#   "out_channels": 8,
+#   "patch_size": 2,
+#   "sample_size": 128,
+#   "upcast_attention": false,
+#   "use_linear_projection": false
+# }
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 72,
+        in_channels: int = 4,
+        out_channels: Optional[int] = 8,
+        num_layers: int = 28,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = 1152,
+        attention_bias: bool = True,
+        sample_size: int = 128,
+        patch_size: int = 2,
+        activation_fn: str = "gelu-approximate",
+        num_embeds_ada_norm: Optional[int] = 1000,
+        upcast_attention: bool = False,
+        norm_type: str = "ada_norm_single",
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-6,
+        interpolation_scale: Optional[int] = None,
+        use_additional_conditions: Optional[bool] = None,
+        caption_channels: Optional[int] = None,
+        attention_type: Optional[str] = "default",
+    ):
+        super().__init__()
+
+        # Validate inputs.
+        if norm_type != "ada_norm_single":
+            raise NotImplementedError(
+                f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+            )
+        elif norm_type == "ada_norm_single" and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+            )
+
+        # Set some common variables used across the board.
+        self.attention_head_dim = attention_head_dim
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+        self.out_channels = in_channels if out_channels is None else out_channels
+        if use_additional_conditions is None:
+            if sample_size == 128:
+                use_additional_conditions = True
+            else:
+                use_additional_conditions = False
+        self.use_additional_conditions = use_additional_conditions
+        # print(f"----------------self.use_additional_conditions: {self.use_additional_conditions} {sample_size}")
+
+        self.gradient_checkpointing = False
+
+        # 2. Initialize the position embedding and transformer blocks.
+        self.height = self.config.sample_size
+        self.width = self.config.sample_size
+
+        interpolation_scale = (
+            self.config.interpolation_scale
+            if self.config.interpolation_scale is not None
+            else max(self.config.sample_size // 64, 1)
+        )
+        self.pos_embed = PatchEmbed(
+            height=self.config.sample_size,
+            width=self.config.sample_size,
+            patch_size=self.config.patch_size,
+            in_channels=self.config.in_channels,
+            embed_dim=self.inner_dim,
+            interpolation_scale=interpolation_scale,
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    self.inner_dim,
+                    self.config.num_attention_heads,
+                    self.config.attention_head_dim,
+                    dropout=self.config.dropout,
+                    cross_attention_dim=self.config.cross_attention_dim,
+                    activation_fn=self.config.activation_fn,
+                    num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+                    attention_bias=self.config.attention_bias,
+                    upcast_attention=self.config.upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=self.config.norm_elementwise_affine,
+                    norm_eps=self.config.norm_eps,
+                    attention_type=self.config.attention_type,
+                )
+                for _ in range(self.config.num_layers)
+            ]
+        )
+
+        # 3. Output blocks.
+        self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim**0.5)
+        self.proj_out = nn.Linear(self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels)
+
+        self.adaln_single = AdaLayerNormSingle(
+            self.inner_dim, use_additional_conditions=self.use_additional_conditions
+        )
+        self.caption_projection = None
+        if self.config.caption_channels is not None:
+            self.caption_projection = PixArtAlphaTextProjection(
+                in_features=self.config.caption_channels, hidden_size=self.inner_dim
+            )
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+
+        Safe to just use `AttnProcessor()` as PixArt doesn't have any exotic attention processors in default model.
+        """
+        self.set_attn_processor(AttnProcessor())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Dict[str, torch.Tensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        control_list: Optional[list] = None,
+        return_dict: bool = True,
+    ):
+        """
+        The [`PixArtTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep (`torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            added_cond_kwargs: (`Dict[str, Any]`, *optional*): Additional conditions to be used as inputs.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if self.use_additional_conditions and added_cond_kwargs is None:
+            raise ValueError("`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`.")
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 1. Input
+        batch_size = hidden_states.shape[0]
+        height, width = (
+            hidden_states.shape[-2] // self.config.patch_size,
+            hidden_states.shape[-1] // self.config.patch_size,
+        )
+        hidden_states = self.pos_embed(hidden_states)
+
+        timestep, embedded_timestep = self.adaln_single(
+            timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+        )
+
+        if self.caption_projection is not None:
+            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+        # 2. Blocks
+        # add
+        control_idx = 0
+
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    cross_attention_kwargs,
+                    None,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=None,
+                )
+            if control_list!=None and control_idx<len(control_list):
+                # print('hidden_states.shape', hidden_states.shape, control_list[control_idx].shape)
+                # print('is 0:', control_list[control_idx][0,0,:10])
+                hidden_states += control_list[control_idx]
+                control_idx+=1
+
+        # 3. Output
+        shift, scale = (
+            self.scale_shift_table[None] + embedded_timestep[:, None].to(self.scale_shift_table.device)
+        ).chunk(2, dim=1)
+        hidden_states = self.norm_out(hidden_states)
+        # Modulation
+        hidden_states = hidden_states * (1 + scale.to(hidden_states.device)) + shift.to(hidden_states.device)
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = hidden_states.squeeze(1)
+
+        # unpatchify
+        hidden_states = hidden_states.reshape(
+            shape=(-1, height, width, self.config.patch_size, self.config.patch_size, self.out_channels)
+        )
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(
+            shape=(-1, self.out_channels, height * self.config.patch_size, width * self.config.patch_size)
+        )
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/diffusers/src/diffusers/models/transformers/prior_transformer.py b/diffusers/src/diffusers/models/transformers/prior_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdb67384ff5ef1c310d5e69dce28ca03ebcfd1e7
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/prior_transformer.py
@@ -0,0 +1,380 @@
+from dataclasses import dataclass
+from typing import Dict, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin
+from ...utils import BaseOutput
+from ..attention import BasicTransformerBlock
+from ..attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from ..embeddings import TimestepEmbedding, Timesteps
+from ..modeling_utils import ModelMixin
+
+
+@dataclass
+class PriorTransformerOutput(BaseOutput):
+    """
+    The output of [`PriorTransformer`].
+
+    Args:
+        predicted_image_embedding (`torch.Tensor` of shape `(batch_size, embedding_dim)`):
+            The predicted CLIP image embedding conditioned on the CLIP text embedding input.
+    """
+
+    predicted_image_embedding: torch.Tensor
+
+
+class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
+    """
+    A Prior Transformer model.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
+        num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use.
+        embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states`
+        num_embeddings (`int`, *optional*, defaults to 77):
+            The number of embeddings of the model input `hidden_states`
+        additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the
+            projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings +
+            additional_embeddings`.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        time_embed_act_fn (`str`, *optional*, defaults to 'silu'):
+            The activation function to use to create timestep embeddings.
+        norm_in_type (`str`, *optional*, defaults to None): The normalization layer to apply on hidden states before
+            passing to Transformer blocks. Set it to `None` if normalization is not needed.
+        embedding_proj_norm_type (`str`, *optional*, defaults to None):
+            The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not
+            needed.
+        encoder_hid_proj_type (`str`, *optional*, defaults to `linear`):
+            The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if
+            `encoder_hidden_states` is `None`.
+        added_emb_type (`str`, *optional*, defaults to `prd`): Additional embeddings to condition the model.
+            Choose from `prd` or `None`. if choose `prd`, it will prepend a token indicating the (quantized) dot
+            product between the text embedding and image embedding as proposed in the unclip paper
+            https://arxiv.org/abs/2204.06125 If it is `None`, no additional embeddings will be prepended.
+        time_embed_dim (`int, *optional*, defaults to None): The dimension of timestep embeddings.
+            If None, will be set to `num_attention_heads * attention_head_dim`
+        embedding_proj_dim (`int`, *optional*, default to None):
+            The dimension of `proj_embedding`. If None, will be set to `embedding_dim`.
+        clip_embed_dim (`int`, *optional*, default to None):
+            The dimension of the output. If None, will be set to `embedding_dim`.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 32,
+        attention_head_dim: int = 64,
+        num_layers: int = 20,
+        embedding_dim: int = 768,
+        num_embeddings=77,
+        additional_embeddings=4,
+        dropout: float = 0.0,
+        time_embed_act_fn: str = "silu",
+        norm_in_type: Optional[str] = None,  # layer
+        embedding_proj_norm_type: Optional[str] = None,  # layer
+        encoder_hid_proj_type: Optional[str] = "linear",  # linear
+        added_emb_type: Optional[str] = "prd",  # prd
+        time_embed_dim: Optional[int] = None,
+        embedding_proj_dim: Optional[int] = None,
+        clip_embed_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+        self.additional_embeddings = additional_embeddings
+
+        time_embed_dim = time_embed_dim or inner_dim
+        embedding_proj_dim = embedding_proj_dim or embedding_dim
+        clip_embed_dim = clip_embed_dim or embedding_dim
+
+        self.time_proj = Timesteps(inner_dim, True, 0)
+        self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn)
+
+        self.proj_in = nn.Linear(embedding_dim, inner_dim)
+
+        if embedding_proj_norm_type is None:
+            self.embedding_proj_norm = None
+        elif embedding_proj_norm_type == "layer":
+            self.embedding_proj_norm = nn.LayerNorm(embedding_proj_dim)
+        else:
+            raise ValueError(f"unsupported embedding_proj_norm_type: {embedding_proj_norm_type}")
+
+        self.embedding_proj = nn.Linear(embedding_proj_dim, inner_dim)
+
+        if encoder_hid_proj_type is None:
+            self.encoder_hidden_states_proj = None
+        elif encoder_hid_proj_type == "linear":
+            self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim)
+        else:
+            raise ValueError(f"unsupported encoder_hid_proj_type: {encoder_hid_proj_type}")
+
+        self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim))
+
+        if added_emb_type == "prd":
+            self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim))
+        elif added_emb_type is None:
+            self.prd_embedding = None
+        else:
+            raise ValueError(
+                f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`."
+            )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    activation_fn="gelu",
+                    attention_bias=True,
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        if norm_in_type == "layer":
+            self.norm_in = nn.LayerNorm(inner_dim)
+        elif norm_in_type is None:
+            self.norm_in = None
+        else:
+            raise ValueError(f"Unsupported norm_in_type: {norm_in_type}.")
+
+        self.norm_out = nn.LayerNorm(inner_dim)
+
+        self.proj_to_clip_embeddings = nn.Linear(inner_dim, clip_embed_dim)
+
+        causal_attention_mask = torch.full(
+            [num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], -10000.0
+        )
+        causal_attention_mask.triu_(1)
+        causal_attention_mask = causal_attention_mask[None, ...]
+        self.register_buffer("causal_attention_mask", causal_attention_mask, persistent=False)
+
+        self.clip_mean = nn.Parameter(torch.zeros(1, clip_embed_dim))
+        self.clip_std = nn.Parameter(torch.zeros(1, clip_embed_dim))
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    def forward(
+        self,
+        hidden_states,
+        timestep: Union[torch.Tensor, float, int],
+        proj_embedding: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.BoolTensor] = None,
+        return_dict: bool = True,
+    ):
+        """
+        The [`PriorTransformer`] forward method.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch_size, embedding_dim)`):
+                The currently predicted image embeddings.
+            timestep (`torch.LongTensor`):
+                Current denoising step.
+            proj_embedding (`torch.Tensor` of shape `(batch_size, embedding_dim)`):
+                Projected embedding vector the denoising process is conditioned on.
+            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, num_embeddings, embedding_dim)`):
+                Hidden states of the text embeddings the denoising process is conditioned on.
+            attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`):
+                Text mask for the text embeddings.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformers.prior_transformer.PriorTransformerOutput`] instead of
+                a plain tuple.
+
+        Returns:
+            [`~models.transformers.prior_transformer.PriorTransformerOutput`] or `tuple`:
+                If return_dict is True, a [`~models.transformers.prior_transformer.PriorTransformerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+        batch_size = hidden_states.shape[0]
+
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            timesteps = torch.tensor([timesteps], dtype=torch.long, device=hidden_states.device)
+        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(hidden_states.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps * torch.ones(batch_size, dtype=timesteps.dtype, device=timesteps.device)
+
+        timesteps_projected = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might be fp16, so we need to cast here.
+        timesteps_projected = timesteps_projected.to(dtype=self.dtype)
+        time_embeddings = self.time_embedding(timesteps_projected)
+
+        if self.embedding_proj_norm is not None:
+            proj_embedding = self.embedding_proj_norm(proj_embedding)
+
+        proj_embeddings = self.embedding_proj(proj_embedding)
+        if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None:
+            encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states)
+        elif self.encoder_hidden_states_proj is not None and encoder_hidden_states is None:
+            raise ValueError("`encoder_hidden_states_proj` requires `encoder_hidden_states` to be set")
+
+        hidden_states = self.proj_in(hidden_states)
+
+        positional_embeddings = self.positional_embedding.to(hidden_states.dtype)
+
+        additional_embeds = []
+        additional_embeddings_len = 0
+
+        if encoder_hidden_states is not None:
+            additional_embeds.append(encoder_hidden_states)
+            additional_embeddings_len += encoder_hidden_states.shape[1]
+
+        if len(proj_embeddings.shape) == 2:
+            proj_embeddings = proj_embeddings[:, None, :]
+
+        if len(hidden_states.shape) == 2:
+            hidden_states = hidden_states[:, None, :]
+
+        additional_embeds = additional_embeds + [
+            proj_embeddings,
+            time_embeddings[:, None, :],
+            hidden_states,
+        ]
+
+        if self.prd_embedding is not None:
+            prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1)
+            additional_embeds.append(prd_embedding)
+
+        hidden_states = torch.cat(
+            additional_embeds,
+            dim=1,
+        )
+
+        # Allow positional_embedding to not include the `addtional_embeddings` and instead pad it with zeros for these additional tokens
+        additional_embeddings_len = additional_embeddings_len + proj_embeddings.shape[1] + 1
+        if positional_embeddings.shape[1] < hidden_states.shape[1]:
+            positional_embeddings = F.pad(
+                positional_embeddings,
+                (
+                    0,
+                    0,
+                    additional_embeddings_len,
+                    self.prd_embedding.shape[1] if self.prd_embedding is not None else 0,
+                ),
+                value=0.0,
+            )
+
+        hidden_states = hidden_states + positional_embeddings
+
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = F.pad(attention_mask, (0, self.additional_embeddings), value=0.0)
+            attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).to(hidden_states.dtype)
+            attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, dim=0)
+
+        if self.norm_in is not None:
+            hidden_states = self.norm_in(hidden_states)
+
+        for block in self.transformer_blocks:
+            hidden_states = block(hidden_states, attention_mask=attention_mask)
+
+        hidden_states = self.norm_out(hidden_states)
+
+        if self.prd_embedding is not None:
+            hidden_states = hidden_states[:, -1]
+        else:
+            hidden_states = hidden_states[:, additional_embeddings_len:]
+
+        predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states)
+
+        if not return_dict:
+            return (predicted_image_embedding,)
+
+        return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding)
+
+    def post_process_latents(self, prior_latents):
+        prior_latents = (prior_latents * self.clip_std) + self.clip_mean
+        return prior_latents
diff --git a/diffusers/src/diffusers/models/transformers/stable_audio_transformer.py b/diffusers/src/diffusers/models/transformers/stable_audio_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3462b51a4129844e588f684b6fe829f21d4414f
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/stable_audio_transformer.py
@@ -0,0 +1,458 @@
+# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models.attention import FeedForward
+from ...models.attention_processor import (
+    Attention,
+    AttentionProcessor,
+    StableAudioAttnProcessor2_0,
+)
+from ...models.modeling_utils import ModelMixin
+from ...models.transformers.transformer_2d import Transformer2DModelOutput
+from ...utils import is_torch_version, logging
+from ...utils.torch_utils import maybe_allow_in_graph
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class StableAudioGaussianFourierProjection(nn.Module):
+    """Gaussian Fourier embeddings for noise levels."""
+
+    # Copied from diffusers.models.embeddings.GaussianFourierProjection.__init__
+    def __init__(
+        self, embedding_size: int = 256, scale: float = 1.0, set_W_to_weight=True, log=True, flip_sin_to_cos=False
+    ):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+        self.log = log
+        self.flip_sin_to_cos = flip_sin_to_cos
+
+        if set_W_to_weight:
+            # to delete later
+            del self.weight
+            self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+            self.weight = self.W
+            del self.W
+
+    def forward(self, x):
+        if self.log:
+            x = torch.log(x)
+
+        x_proj = 2 * np.pi * x[:, None] @ self.weight[None, :]
+
+        if self.flip_sin_to_cos:
+            out = torch.cat([torch.cos(x_proj), torch.sin(x_proj)], dim=-1)
+        else:
+            out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
+        return out
+
+
+@maybe_allow_in_graph
+class StableAudioDiTBlock(nn.Module):
+    r"""
+    Transformer block used in Stable Audio model (https://github.com/Stability-AI/stable-audio-tools). Allow skip
+    connection and QKNorm
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for the query states.
+        num_key_value_attention_heads (`int`): The number of heads to use for the key and value states.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        num_key_value_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        upcast_attention: bool = False,
+        norm_eps: float = 1e-5,
+        ff_inner_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        self.norm1 = nn.LayerNorm(dim, elementwise_affine=True, eps=norm_eps)
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=False,
+            upcast_attention=upcast_attention,
+            out_bias=False,
+            processor=StableAudioAttnProcessor2_0(),
+        )
+
+        # 2. Cross-Attn
+        self.norm2 = nn.LayerNorm(dim, norm_eps, True)
+
+        self.attn2 = Attention(
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            kv_heads=num_key_value_attention_heads,
+            dropout=dropout,
+            bias=False,
+            upcast_attention=upcast_attention,
+            out_bias=False,
+            processor=StableAudioAttnProcessor2_0(),
+        )  # is self-attn if encoder_hidden_states is none
+
+        # 3. Feed-forward
+        self.norm3 = nn.LayerNorm(dim, norm_eps, True)
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn="swiglu",
+            final_dropout=False,
+            inner_dim=ff_inner_dim,
+            bias=True,
+        )
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        rotary_embedding: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        norm_hidden_states = self.norm1(hidden_states)
+
+        attn_output = self.attn1(
+            norm_hidden_states,
+            attention_mask=attention_mask,
+            rotary_emb=rotary_embedding,
+        )
+
+        hidden_states = attn_output + hidden_states
+
+        # 2. Cross-Attention
+        norm_hidden_states = self.norm2(hidden_states)
+
+        attn_output = self.attn2(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=encoder_attention_mask,
+        )
+        hidden_states = attn_output + hidden_states
+
+        # 3. Feed-forward
+        norm_hidden_states = self.norm3(hidden_states)
+        ff_output = self.ff(norm_hidden_states)
+
+        hidden_states = ff_output + hidden_states
+
+        return hidden_states
+
+
+class StableAudioDiTModel(ModelMixin, ConfigMixin):
+    """
+    The Diffusion Transformer model introduced in Stable Audio.
+
+    Reference: https://github.com/Stability-AI/stable-audio-tools
+
+    Parameters:
+        sample_size ( `int`, *optional*, defaults to 1024): The size of the input sample.
+        in_channels (`int`, *optional*, defaults to 64): The number of channels in the input.
+        num_layers (`int`, *optional*, defaults to 24): The number of layers of Transformer blocks to use.
+        attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
+        num_attention_heads (`int`, *optional*, defaults to 24): The number of heads to use for the query states.
+        num_key_value_attention_heads (`int`, *optional*, defaults to 12):
+            The number of heads to use for the key and value states.
+        out_channels (`int`, defaults to 64): Number of output channels.
+        cross_attention_dim ( `int`, *optional*, defaults to 768): Dimension of the cross-attention projection.
+        time_proj_dim ( `int`, *optional*, defaults to 256): Dimension of the timestep inner projection.
+        global_states_input_dim ( `int`, *optional*, defaults to 1536):
+            Input dimension of the global hidden states projection.
+        cross_attention_input_dim ( `int`, *optional*, defaults to 768):
+            Input dimension of the cross-attention projection
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: int = 1024,
+        in_channels: int = 64,
+        num_layers: int = 24,
+        attention_head_dim: int = 64,
+        num_attention_heads: int = 24,
+        num_key_value_attention_heads: int = 12,
+        out_channels: int = 64,
+        cross_attention_dim: int = 768,
+        time_proj_dim: int = 256,
+        global_states_input_dim: int = 1536,
+        cross_attention_input_dim: int = 768,
+    ):
+        super().__init__()
+        self.sample_size = sample_size
+        self.out_channels = out_channels
+        self.inner_dim = num_attention_heads * attention_head_dim
+
+        self.time_proj = StableAudioGaussianFourierProjection(
+            embedding_size=time_proj_dim // 2,
+            flip_sin_to_cos=True,
+            log=False,
+            set_W_to_weight=False,
+        )
+
+        self.timestep_proj = nn.Sequential(
+            nn.Linear(time_proj_dim, self.inner_dim, bias=True),
+            nn.SiLU(),
+            nn.Linear(self.inner_dim, self.inner_dim, bias=True),
+        )
+
+        self.global_proj = nn.Sequential(
+            nn.Linear(global_states_input_dim, self.inner_dim, bias=False),
+            nn.SiLU(),
+            nn.Linear(self.inner_dim, self.inner_dim, bias=False),
+        )
+
+        self.cross_attention_proj = nn.Sequential(
+            nn.Linear(cross_attention_input_dim, cross_attention_dim, bias=False),
+            nn.SiLU(),
+            nn.Linear(cross_attention_dim, cross_attention_dim, bias=False),
+        )
+
+        self.preprocess_conv = nn.Conv1d(in_channels, in_channels, 1, bias=False)
+        self.proj_in = nn.Linear(in_channels, self.inner_dim, bias=False)
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                StableAudioDiTBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    num_key_value_attention_heads=num_key_value_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    cross_attention_dim=cross_attention_dim,
+                )
+                for i in range(num_layers)
+            ]
+        )
+
+        self.proj_out = nn.Linear(self.inner_dim, self.out_channels, bias=False)
+        self.postprocess_conv = nn.Conv1d(self.out_channels, self.out_channels, 1, bias=False)
+
+        self.gradient_checkpointing = False
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.transformers.hunyuan_transformer_2d.HunyuanDiT2DModel.set_default_attn_processor with Hunyuan->StableAudio
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        self.set_attn_processor(StableAudioAttnProcessor2_0())
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        timestep: torch.LongTensor = None,
+        encoder_hidden_states: torch.FloatTensor = None,
+        global_hidden_states: torch.FloatTensor = None,
+        rotary_embedding: torch.FloatTensor = None,
+        return_dict: bool = True,
+        attention_mask: Optional[torch.LongTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
+        """
+        The [`StableAudioDiTModel`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, in_channels, sequence_len)`):
+                Input `hidden_states`.
+            timestep ( `torch.LongTensor`):
+                Used to indicate denoising step.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, encoder_sequence_len, cross_attention_input_dim)`):
+                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            global_hidden_states (`torch.FloatTensor` of shape `(batch size, global_sequence_len, global_states_input_dim)`):
+               Global embeddings that will be prepended to the hidden states.
+            rotary_embedding (`torch.Tensor`):
+                The rotary embeddings to apply on query and key tensors during attention calculation.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_len)`, *optional*):
+                Mask to avoid performing attention on padding token indices, formed by concatenating the attention
+                masks
+                    for the two text encoders together. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_len)`, *optional*):
+                Mask to avoid performing attention on padding token cross-attention indices, formed by concatenating
+                the attention masks
+                    for the two text encoders together. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        cross_attention_hidden_states = self.cross_attention_proj(encoder_hidden_states)
+        global_hidden_states = self.global_proj(global_hidden_states)
+        time_hidden_states = self.timestep_proj(self.time_proj(timestep.to(self.dtype)))
+
+        global_hidden_states = global_hidden_states + time_hidden_states.unsqueeze(1)
+
+        hidden_states = self.preprocess_conv(hidden_states) + hidden_states
+        # (batch_size, dim, sequence_length) -> (batch_size, sequence_length, dim)
+        hidden_states = hidden_states.transpose(1, 2)
+
+        hidden_states = self.proj_in(hidden_states)
+
+        # prepend global states to hidden states
+        hidden_states = torch.cat([global_hidden_states, hidden_states], dim=-2)
+        if attention_mask is not None:
+            prepend_mask = torch.ones((hidden_states.shape[0], 1), device=hidden_states.device, dtype=torch.bool)
+            attention_mask = torch.cat([prepend_mask, attention_mask], dim=-1)
+
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    attention_mask,
+                    cross_attention_hidden_states,
+                    encoder_attention_mask,
+                    rotary_embedding,
+                    **ckpt_kwargs,
+                )
+
+            else:
+                hidden_states = block(
+                    hidden_states=hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=cross_attention_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    rotary_embedding=rotary_embedding,
+                )
+
+        hidden_states = self.proj_out(hidden_states)
+
+        # (batch_size, sequence_length, dim) -> (batch_size, dim, sequence_length)
+        # remove prepend length that has been added by global hidden states
+        hidden_states = hidden_states.transpose(1, 2)[:, :, 1:]
+        hidden_states = self.postprocess_conv(hidden_states) + hidden_states
+
+        if not return_dict:
+            return (hidden_states,)
+
+        return Transformer2DModelOutput(sample=hidden_states)
diff --git a/diffusers/src/diffusers/models/transformers/t5_film_transformer.py b/diffusers/src/diffusers/models/transformers/t5_film_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dea37a25910856fedc15583253c2d77abc5d84c
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/t5_film_transformer.py
@@ -0,0 +1,436 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ..attention_processor import Attention
+from ..embeddings import get_timestep_embedding
+from ..modeling_utils import ModelMixin
+
+
+class T5FilmDecoder(ModelMixin, ConfigMixin):
+    r"""
+    T5 style decoder with FiLM conditioning.
+
+    Args:
+        input_dims (`int`, *optional*, defaults to `128`):
+            The number of input dimensions.
+        targets_length (`int`, *optional*, defaults to `256`):
+            The length of the targets.
+        d_model (`int`, *optional*, defaults to `768`):
+            Size of the input hidden states.
+        num_layers (`int`, *optional*, defaults to `12`):
+            The number of `DecoderLayer`'s to use.
+        num_heads (`int`, *optional*, defaults to `12`):
+            The number of attention heads to use.
+        d_kv (`int`, *optional*, defaults to `64`):
+            Size of the key-value projection vectors.
+        d_ff (`int`, *optional*, defaults to `2048`):
+            The number of dimensions in the intermediate feed-forward layer of `DecoderLayer`'s.
+        dropout_rate (`float`, *optional*, defaults to `0.1`):
+            Dropout probability.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        input_dims: int = 128,
+        targets_length: int = 256,
+        max_decoder_noise_time: float = 2000.0,
+        d_model: int = 768,
+        num_layers: int = 12,
+        num_heads: int = 12,
+        d_kv: int = 64,
+        d_ff: int = 2048,
+        dropout_rate: float = 0.1,
+    ):
+        super().__init__()
+
+        self.conditioning_emb = nn.Sequential(
+            nn.Linear(d_model, d_model * 4, bias=False),
+            nn.SiLU(),
+            nn.Linear(d_model * 4, d_model * 4, bias=False),
+            nn.SiLU(),
+        )
+
+        self.position_encoding = nn.Embedding(targets_length, d_model)
+        self.position_encoding.weight.requires_grad = False
+
+        self.continuous_inputs_projection = nn.Linear(input_dims, d_model, bias=False)
+
+        self.dropout = nn.Dropout(p=dropout_rate)
+
+        self.decoders = nn.ModuleList()
+        for lyr_num in range(num_layers):
+            # FiLM conditional T5 decoder
+            lyr = DecoderLayer(d_model=d_model, d_kv=d_kv, num_heads=num_heads, d_ff=d_ff, dropout_rate=dropout_rate)
+            self.decoders.append(lyr)
+
+        self.decoder_norm = T5LayerNorm(d_model)
+
+        self.post_dropout = nn.Dropout(p=dropout_rate)
+        self.spec_out = nn.Linear(d_model, input_dims, bias=False)
+
+    def encoder_decoder_mask(self, query_input: torch.Tensor, key_input: torch.Tensor) -> torch.Tensor:
+        mask = torch.mul(query_input.unsqueeze(-1), key_input.unsqueeze(-2))
+        return mask.unsqueeze(-3)
+
+    def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time):
+        batch, _, _ = decoder_input_tokens.shape
+        assert decoder_noise_time.shape == (batch,)
+
+        # decoder_noise_time is in [0, 1), so rescale to expected timing range.
+        time_steps = get_timestep_embedding(
+            decoder_noise_time * self.config.max_decoder_noise_time,
+            embedding_dim=self.config.d_model,
+            max_period=self.config.max_decoder_noise_time,
+        ).to(dtype=self.dtype)
+
+        conditioning_emb = self.conditioning_emb(time_steps).unsqueeze(1)
+
+        assert conditioning_emb.shape == (batch, 1, self.config.d_model * 4)
+
+        seq_length = decoder_input_tokens.shape[1]
+
+        # If we want to use relative positions for audio context, we can just offset
+        # this sequence by the length of encodings_and_masks.
+        decoder_positions = torch.broadcast_to(
+            torch.arange(seq_length, device=decoder_input_tokens.device),
+            (batch, seq_length),
+        )
+
+        position_encodings = self.position_encoding(decoder_positions)
+
+        inputs = self.continuous_inputs_projection(decoder_input_tokens)
+        inputs += position_encodings
+        y = self.dropout(inputs)
+
+        # decoder: No padding present.
+        decoder_mask = torch.ones(
+            decoder_input_tokens.shape[:2], device=decoder_input_tokens.device, dtype=inputs.dtype
+        )
+
+        # Translate encoding masks to encoder-decoder masks.
+        encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks]
+
+        # cross attend style: concat encodings
+        encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1)
+        encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1)
+
+        for lyr in self.decoders:
+            y = lyr(
+                y,
+                conditioning_emb=conditioning_emb,
+                encoder_hidden_states=encoded,
+                encoder_attention_mask=encoder_decoder_mask,
+            )[0]
+
+        y = self.decoder_norm(y)
+        y = self.post_dropout(y)
+
+        spec_out = self.spec_out(y)
+        return spec_out
+
+
+class DecoderLayer(nn.Module):
+    r"""
+    T5 decoder layer.
+
+    Args:
+        d_model (`int`):
+            Size of the input hidden states.
+        d_kv (`int`):
+            Size of the key-value projection vectors.
+        num_heads (`int`):
+            Number of attention heads.
+        d_ff (`int`):
+            Size of the intermediate feed-forward layer.
+        dropout_rate (`float`):
+            Dropout probability.
+        layer_norm_epsilon (`float`, *optional*, defaults to `1e-6`):
+            A small value used for numerical stability to avoid dividing by zero.
+    """
+
+    def __init__(
+        self, d_model: int, d_kv: int, num_heads: int, d_ff: int, dropout_rate: float, layer_norm_epsilon: float = 1e-6
+    ):
+        super().__init__()
+        self.layer = nn.ModuleList()
+
+        # cond self attention: layer 0
+        self.layer.append(
+            T5LayerSelfAttentionCond(d_model=d_model, d_kv=d_kv, num_heads=num_heads, dropout_rate=dropout_rate)
+        )
+
+        # cross attention: layer 1
+        self.layer.append(
+            T5LayerCrossAttention(
+                d_model=d_model,
+                d_kv=d_kv,
+                num_heads=num_heads,
+                dropout_rate=dropout_rate,
+                layer_norm_epsilon=layer_norm_epsilon,
+            )
+        )
+
+        # Film Cond MLP + dropout: last layer
+        self.layer.append(
+            T5LayerFFCond(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate, layer_norm_epsilon=layer_norm_epsilon)
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        conditioning_emb: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        encoder_decoder_position_bias=None,
+    ) -> Tuple[torch.Tensor]:
+        hidden_states = self.layer[0](
+            hidden_states,
+            conditioning_emb=conditioning_emb,
+            attention_mask=attention_mask,
+        )
+
+        if encoder_hidden_states is not None:
+            encoder_extended_attention_mask = torch.where(encoder_attention_mask > 0, 0, -1e10).to(
+                encoder_hidden_states.dtype
+            )
+
+            hidden_states = self.layer[1](
+                hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_extended_attention_mask,
+            )
+
+        # Apply Film Conditional Feed Forward layer
+        hidden_states = self.layer[-1](hidden_states, conditioning_emb)
+
+        return (hidden_states,)
+
+
+class T5LayerSelfAttentionCond(nn.Module):
+    r"""
+    T5 style self-attention layer with conditioning.
+
+    Args:
+        d_model (`int`):
+            Size of the input hidden states.
+        d_kv (`int`):
+            Size of the key-value projection vectors.
+        num_heads (`int`):
+            Number of attention heads.
+        dropout_rate (`float`):
+            Dropout probability.
+    """
+
+    def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float):
+        super().__init__()
+        self.layer_norm = T5LayerNorm(d_model)
+        self.FiLMLayer = T5FiLMLayer(in_features=d_model * 4, out_features=d_model)
+        self.attention = Attention(query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, scale_qk=False)
+        self.dropout = nn.Dropout(dropout_rate)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        conditioning_emb: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # pre_self_attention_layer_norm
+        normed_hidden_states = self.layer_norm(hidden_states)
+
+        if conditioning_emb is not None:
+            normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb)
+
+        # Self-attention block
+        attention_output = self.attention(normed_hidden_states)
+
+        hidden_states = hidden_states + self.dropout(attention_output)
+
+        return hidden_states
+
+
+class T5LayerCrossAttention(nn.Module):
+    r"""
+    T5 style cross-attention layer.
+
+    Args:
+        d_model (`int`):
+            Size of the input hidden states.
+        d_kv (`int`):
+            Size of the key-value projection vectors.
+        num_heads (`int`):
+            Number of attention heads.
+        dropout_rate (`float`):
+            Dropout probability.
+        layer_norm_epsilon (`float`):
+            A small value used for numerical stability to avoid dividing by zero.
+    """
+
+    def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float, layer_norm_epsilon: float):
+        super().__init__()
+        self.attention = Attention(query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, scale_qk=False)
+        self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)
+        self.dropout = nn.Dropout(dropout_rate)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        normed_hidden_states = self.layer_norm(hidden_states)
+        attention_output = self.attention(
+            normed_hidden_states,
+            encoder_hidden_states=key_value_states,
+            attention_mask=attention_mask.squeeze(1),
+        )
+        layer_output = hidden_states + self.dropout(attention_output)
+        return layer_output
+
+
+class T5LayerFFCond(nn.Module):
+    r"""
+    T5 style feed-forward conditional layer.
+
+    Args:
+        d_model (`int`):
+            Size of the input hidden states.
+        d_ff (`int`):
+            Size of the intermediate feed-forward layer.
+        dropout_rate (`float`):
+            Dropout probability.
+        layer_norm_epsilon (`float`):
+            A small value used for numerical stability to avoid dividing by zero.
+    """
+
+    def __init__(self, d_model: int, d_ff: int, dropout_rate: float, layer_norm_epsilon: float):
+        super().__init__()
+        self.DenseReluDense = T5DenseGatedActDense(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate)
+        self.film = T5FiLMLayer(in_features=d_model * 4, out_features=d_model)
+        self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)
+        self.dropout = nn.Dropout(dropout_rate)
+
+    def forward(self, hidden_states: torch.Tensor, conditioning_emb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        forwarded_states = self.layer_norm(hidden_states)
+        if conditioning_emb is not None:
+            forwarded_states = self.film(forwarded_states, conditioning_emb)
+
+        forwarded_states = self.DenseReluDense(forwarded_states)
+        hidden_states = hidden_states + self.dropout(forwarded_states)
+        return hidden_states
+
+
+class T5DenseGatedActDense(nn.Module):
+    r"""
+    T5 style feed-forward layer with gated activations and dropout.
+
+    Args:
+        d_model (`int`):
+            Size of the input hidden states.
+        d_ff (`int`):
+            Size of the intermediate feed-forward layer.
+        dropout_rate (`float`):
+            Dropout probability.
+    """
+
+    def __init__(self, d_model: int, d_ff: int, dropout_rate: float):
+        super().__init__()
+        self.wi_0 = nn.Linear(d_model, d_ff, bias=False)
+        self.wi_1 = nn.Linear(d_model, d_ff, bias=False)
+        self.wo = nn.Linear(d_ff, d_model, bias=False)
+        self.dropout = nn.Dropout(dropout_rate)
+        self.act = NewGELUActivation()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_gelu = self.act(self.wi_0(hidden_states))
+        hidden_linear = self.wi_1(hidden_states)
+        hidden_states = hidden_gelu * hidden_linear
+        hidden_states = self.dropout(hidden_states)
+
+        hidden_states = self.wo(hidden_states)
+        return hidden_states
+
+
+class T5LayerNorm(nn.Module):
+    r"""
+    T5 style layer normalization module.
+
+    Args:
+        hidden_size (`int`):
+            Size of the input hidden states.
+        eps (`float`, `optional`, defaults to `1e-6`):
+            A small value used for numerical stability to avoid dividing by zero.
+    """
+
+    def __init__(self, hidden_size: int, eps: float = 1e-6):
+        """
+        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
+        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
+        # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
+        # half-precision inputs is done in fp32
+
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+        # convert into half-precision if necessary
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+
+        return self.weight * hidden_states
+
+
+class NewGELUActivation(nn.Module):
+    """
+    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
+    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+    """
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
+
+
+class T5FiLMLayer(nn.Module):
+    """
+    T5 style FiLM Layer.
+
+    Args:
+        in_features (`int`):
+            Number of input features.
+        out_features (`int`):
+            Number of output features.
+    """
+
+    def __init__(self, in_features: int, out_features: int):
+        super().__init__()
+        self.scale_bias = nn.Linear(in_features, out_features * 2, bias=False)
+
+    def forward(self, x: torch.Tensor, conditioning_emb: torch.Tensor) -> torch.Tensor:
+        emb = self.scale_bias(conditioning_emb)
+        scale, shift = torch.chunk(emb, 2, -1)
+        x = x * (1 + scale) + shift
+        return x
diff --git a/diffusers/src/diffusers/models/transformers/transformer_2d.py b/diffusers/src/diffusers/models/transformers/transformer_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7c19e4582c6017360a31dd5b2232f847a992e54
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/transformer_2d.py
@@ -0,0 +1,566 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...configuration_utils import LegacyConfigMixin, register_to_config
+from ...utils import deprecate, is_torch_version, logging
+from ..attention import BasicTransformerBlock
+from ..embeddings import ImagePositionalEmbeddings, PatchEmbed, PixArtAlphaTextProjection
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import LegacyModelMixin
+from ..normalization import AdaLayerNormSingle
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class Transformer2DModelOutput(Transformer2DModelOutput):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.modeling_outputs import Transformer2DModelOutput`, instead."
+        deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+class Transformer2DModel(LegacyModelMixin, LegacyConfigMixin):
+    """
+    A 2D Transformer model for image-like data.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            The number of channels in the input and output (specify if the input is **continuous**).
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+            This is fixed during training since it is used to learn a number of position embeddings.
+        num_vector_embeds (`int`, *optional*):
+            The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
+            Includes the class for the masked latent pixel.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*):
+            The number of diffusion steps used during training. Pass if at least one of the norm_layers is
+            `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
+            added to the hidden states.
+
+            During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
+        attention_bias (`bool`, *optional*):
+            Configure if the `TransformerBlocks` attention should contain a bias parameter.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["BasicTransformerBlock"]
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        patch_size: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+        norm_elementwise_affine: bool = True,
+        norm_eps: float = 1e-5,
+        attention_type: str = "default",
+        caption_channels: int = None,
+        interpolation_scale: float = None,
+        use_additional_conditions: Optional[bool] = None,
+    ):
+        super().__init__()
+
+        # Validate inputs.
+        if patch_size is not None:
+            if norm_type not in ["ada_norm", "ada_norm_zero", "ada_norm_single"]:
+                raise NotImplementedError(
+                    f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+                )
+            elif norm_type in ["ada_norm", "ada_norm_zero"] and num_embeds_ada_norm is None:
+                raise ValueError(
+                    f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+                )
+
+        # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+        # Define whether input is continuous or discrete depending on configuration
+        self.is_input_continuous = (in_channels is not None) and (patch_size is None)
+        self.is_input_vectorized = num_vector_embeds is not None
+        self.is_input_patches = in_channels is not None and patch_size is not None
+
+        if self.is_input_continuous and self.is_input_vectorized:
+            raise ValueError(
+                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
+                " sure that either `in_channels` or `num_vector_embeds` is None."
+            )
+        elif self.is_input_vectorized and self.is_input_patches:
+            raise ValueError(
+                f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
+                " sure that either `num_vector_embeds` or `num_patches` is None."
+            )
+        elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
+            raise ValueError(
+                f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
+                f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
+            )
+
+        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+            deprecation_message = (
+                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+                " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
+                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
+                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+            )
+            deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+            norm_type = "ada_norm"
+
+        # Set some common variables used across the board.
+        self.use_linear_projection = use_linear_projection
+        self.interpolation_scale = interpolation_scale
+        self.caption_channels = caption_channels
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.gradient_checkpointing = False
+
+        if use_additional_conditions is None:
+            if norm_type == "ada_norm_single" and sample_size == 128:
+                use_additional_conditions = True
+            else:
+                use_additional_conditions = False
+        self.use_additional_conditions = use_additional_conditions
+
+        # 2. Initialize the right blocks.
+        # These functions follow a common structure:
+        # a. Initialize the input blocks. b. Initialize the transformer blocks.
+        # c. Initialize the output blocks and other projection blocks when necessary.
+        if self.is_input_continuous:
+            self._init_continuous_input(norm_type=norm_type)
+        elif self.is_input_vectorized:
+            self._init_vectorized_inputs(norm_type=norm_type)
+        elif self.is_input_patches:
+            self._init_patched_inputs(norm_type=norm_type)
+
+    def _init_continuous_input(self, norm_type):
+        self.norm = torch.nn.GroupNorm(
+            num_groups=self.config.norm_num_groups, num_channels=self.in_channels, eps=1e-6, affine=True
+        )
+        if self.use_linear_projection:
+            self.proj_in = torch.nn.Linear(self.in_channels, self.inner_dim)
+        else:
+            self.proj_in = torch.nn.Conv2d(self.in_channels, self.inner_dim, kernel_size=1, stride=1, padding=0)
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    self.inner_dim,
+                    self.config.num_attention_heads,
+                    self.config.attention_head_dim,
+                    dropout=self.config.dropout,
+                    cross_attention_dim=self.config.cross_attention_dim,
+                    activation_fn=self.config.activation_fn,
+                    num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+                    attention_bias=self.config.attention_bias,
+                    only_cross_attention=self.config.only_cross_attention,
+                    double_self_attention=self.config.double_self_attention,
+                    upcast_attention=self.config.upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=self.config.norm_elementwise_affine,
+                    norm_eps=self.config.norm_eps,
+                    attention_type=self.config.attention_type,
+                )
+                for _ in range(self.config.num_layers)
+            ]
+        )
+
+        if self.use_linear_projection:
+            self.proj_out = torch.nn.Linear(self.inner_dim, self.out_channels)
+        else:
+            self.proj_out = torch.nn.Conv2d(self.inner_dim, self.out_channels, kernel_size=1, stride=1, padding=0)
+
+    def _init_vectorized_inputs(self, norm_type):
+        assert self.config.sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
+        assert (
+            self.config.num_vector_embeds is not None
+        ), "Transformer2DModel over discrete input must provide num_embed"
+
+        self.height = self.config.sample_size
+        self.width = self.config.sample_size
+        self.num_latent_pixels = self.height * self.width
+
+        self.latent_image_embedding = ImagePositionalEmbeddings(
+            num_embed=self.config.num_vector_embeds, embed_dim=self.inner_dim, height=self.height, width=self.width
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    self.inner_dim,
+                    self.config.num_attention_heads,
+                    self.config.attention_head_dim,
+                    dropout=self.config.dropout,
+                    cross_attention_dim=self.config.cross_attention_dim,
+                    activation_fn=self.config.activation_fn,
+                    num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+                    attention_bias=self.config.attention_bias,
+                    only_cross_attention=self.config.only_cross_attention,
+                    double_self_attention=self.config.double_self_attention,
+                    upcast_attention=self.config.upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=self.config.norm_elementwise_affine,
+                    norm_eps=self.config.norm_eps,
+                    attention_type=self.config.attention_type,
+                )
+                for _ in range(self.config.num_layers)
+            ]
+        )
+
+        self.norm_out = nn.LayerNorm(self.inner_dim)
+        self.out = nn.Linear(self.inner_dim, self.config.num_vector_embeds - 1)
+
+    def _init_patched_inputs(self, norm_type):
+        assert self.config.sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
+
+        self.height = self.config.sample_size
+        self.width = self.config.sample_size
+
+        self.patch_size = self.config.patch_size
+        interpolation_scale = (
+            self.config.interpolation_scale
+            if self.config.interpolation_scale is not None
+            else max(self.config.sample_size // 64, 1)
+        )
+        self.pos_embed = PatchEmbed(
+            height=self.config.sample_size,
+            width=self.config.sample_size,
+            patch_size=self.config.patch_size,
+            in_channels=self.in_channels,
+            embed_dim=self.inner_dim,
+            interpolation_scale=interpolation_scale,
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    self.inner_dim,
+                    self.config.num_attention_heads,
+                    self.config.attention_head_dim,
+                    dropout=self.config.dropout,
+                    cross_attention_dim=self.config.cross_attention_dim,
+                    activation_fn=self.config.activation_fn,
+                    num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+                    attention_bias=self.config.attention_bias,
+                    only_cross_attention=self.config.only_cross_attention,
+                    double_self_attention=self.config.double_self_attention,
+                    upcast_attention=self.config.upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=self.config.norm_elementwise_affine,
+                    norm_eps=self.config.norm_eps,
+                    attention_type=self.config.attention_type,
+                )
+                for _ in range(self.config.num_layers)
+            ]
+        )
+
+        if self.config.norm_type != "ada_norm_single":
+            self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+            self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)
+            self.proj_out_2 = nn.Linear(
+                self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels
+            )
+        elif self.config.norm_type == "ada_norm_single":
+            self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+            self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim**0.5)
+            self.proj_out = nn.Linear(
+                self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels
+            )
+
+        # PixArt-Alpha blocks.
+        self.adaln_single = None
+        if self.config.norm_type == "ada_norm_single":
+            # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
+            # additional conditions until we find better name
+            self.adaln_single = AdaLayerNormSingle(
+                self.inner_dim, use_additional_conditions=self.use_additional_conditions
+            )
+
+        self.caption_projection = None
+        if self.caption_channels is not None:
+            self.caption_projection = PixArtAlphaTextProjection(
+                in_features=self.caption_channels, hidden_size=self.inner_dim
+            )
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Dict[str, torch.Tensor] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ):
+        """
+        The [`Transformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
+                Input `hidden_states`.
+            encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                `AdaLayerZeroNorm`.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformers.transformer_2d.Transformer2DModelOutput`] is returned,
+            otherwise a `tuple` where the first element is the sample tensor.
+        """
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 1. Input
+        if self.is_input_continuous:
+            batch_size, _, height, width = hidden_states.shape
+            residual = hidden_states
+            hidden_states, inner_dim = self._operate_on_continuous_inputs(hidden_states)
+        elif self.is_input_vectorized:
+            hidden_states = self.latent_image_embedding(hidden_states)
+        elif self.is_input_patches:
+            height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
+            hidden_states, encoder_hidden_states, timestep, embedded_timestep = self._operate_on_patched_inputs(
+                hidden_states, encoder_hidden_states, timestep, added_cond_kwargs
+            )
+
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    cross_attention_kwargs,
+                    class_labels,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=class_labels,
+                )
+
+        # 3. Output
+        if self.is_input_continuous:
+            output = self._get_output_for_continuous_inputs(
+                hidden_states=hidden_states,
+                residual=residual,
+                batch_size=batch_size,
+                height=height,
+                width=width,
+                inner_dim=inner_dim,
+            )
+        elif self.is_input_vectorized:
+            output = self._get_output_for_vectorized_inputs(hidden_states)
+        elif self.is_input_patches:
+            output = self._get_output_for_patched_inputs(
+                hidden_states=hidden_states,
+                timestep=timestep,
+                class_labels=class_labels,
+                embedded_timestep=embedded_timestep,
+                height=height,
+                width=width,
+            )
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
+
+    def _operate_on_continuous_inputs(self, hidden_states):
+        batch, _, height, width = hidden_states.shape
+        hidden_states = self.norm(hidden_states)
+
+        if not self.use_linear_projection:
+            hidden_states = self.proj_in(hidden_states)
+            inner_dim = hidden_states.shape[1]
+            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+        else:
+            inner_dim = hidden_states.shape[1]
+            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+            hidden_states = self.proj_in(hidden_states)
+
+        return hidden_states, inner_dim
+
+    def _operate_on_patched_inputs(self, hidden_states, encoder_hidden_states, timestep, added_cond_kwargs):
+        batch_size = hidden_states.shape[0]
+        hidden_states = self.pos_embed(hidden_states)
+        embedded_timestep = None
+
+        if self.adaln_single is not None:
+            if self.use_additional_conditions and added_cond_kwargs is None:
+                raise ValueError(
+                    "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
+                )
+            timestep, embedded_timestep = self.adaln_single(
+                timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+            )
+
+        if self.caption_projection is not None:
+            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+        return hidden_states, encoder_hidden_states, timestep, embedded_timestep
+
+    def _get_output_for_continuous_inputs(self, hidden_states, residual, batch_size, height, width, inner_dim):
+        if not self.use_linear_projection:
+            hidden_states = (
+                hidden_states.reshape(batch_size, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+            )
+            hidden_states = self.proj_out(hidden_states)
+        else:
+            hidden_states = self.proj_out(hidden_states)
+            hidden_states = (
+                hidden_states.reshape(batch_size, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+            )
+
+        output = hidden_states + residual
+        return output
+
+    def _get_output_for_vectorized_inputs(self, hidden_states):
+        hidden_states = self.norm_out(hidden_states)
+        logits = self.out(hidden_states)
+        # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
+        logits = logits.permute(0, 2, 1)
+        # log(p(x_0))
+        output = F.log_softmax(logits.double(), dim=1).float()
+        return output
+
+    def _get_output_for_patched_inputs(
+        self, hidden_states, timestep, class_labels, embedded_timestep, height=None, width=None
+    ):
+        if self.config.norm_type != "ada_norm_single":
+            conditioning = self.transformer_blocks[0].norm1.emb(
+                timestep, class_labels, hidden_dtype=hidden_states.dtype
+            )
+            shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+            hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+            hidden_states = self.proj_out_2(hidden_states)
+        elif self.config.norm_type == "ada_norm_single":
+            shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
+            hidden_states = self.norm_out(hidden_states)
+            # Modulation
+            hidden_states = hidden_states * (1 + scale) + shift
+            hidden_states = self.proj_out(hidden_states)
+            hidden_states = hidden_states.squeeze(1)
+
+        # unpatchify
+        if self.adaln_single is None:
+            height = width = int(hidden_states.shape[1] ** 0.5)
+        hidden_states = hidden_states.reshape(
+            shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
+        )
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(
+            shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
+        )
+        return output
diff --git a/diffusers/src/diffusers/models/transformers/transformer_flux.py b/diffusers/src/diffusers/models/transformers/transformer_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..e38efe668c6c65ce3f6c4c1096fd2be6adc0c739
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/transformer_flux.py
@@ -0,0 +1,559 @@
+# Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...models.attention import FeedForward
+from ...models.attention_processor import (
+    Attention,
+    AttentionProcessor,
+    FluxAttnProcessor2_0,
+    FusedFluxAttnProcessor2_0,
+)
+from ...models.modeling_utils import ModelMixin
+from ...models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
+from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import maybe_allow_in_graph
+from ..embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
+from ..modeling_outputs import Transformer2DModelOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@maybe_allow_in_graph
+class FluxSingleTransformerBlock(nn.Module):
+    r"""
+    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
+
+    Reference: https://arxiv.org/abs/2403.03206
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
+            processing of `context` conditions.
+    """
+
+    def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
+        super().__init__()
+        self.mlp_hidden_dim = int(dim * mlp_ratio)
+
+        self.norm = AdaLayerNormZeroSingle(dim)
+        self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
+        self.act_mlp = nn.GELU(approximate="tanh")
+        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
+
+        processor = FluxAttnProcessor2_0()
+        self.attn = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            bias=True,
+            processor=processor,
+            qk_norm="rms_norm",
+            eps=1e-6,
+            pre_only=True,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        image_rotary_emb=None,
+    ):
+        residual = hidden_states
+        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
+        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
+
+        attn_output = self.attn(
+            hidden_states=norm_hidden_states,
+            image_rotary_emb=image_rotary_emb,
+        )
+
+        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
+        gate = gate.unsqueeze(1)
+        hidden_states = gate * self.proj_out(hidden_states)
+        hidden_states = residual + hidden_states
+        if hidden_states.dtype == torch.float16:
+            hidden_states = hidden_states.clip(-65504, 65504)
+
+        return hidden_states
+
+
+@maybe_allow_in_graph
+class FluxTransformerBlock(nn.Module):
+    r"""
+    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
+
+    Reference: https://arxiv.org/abs/2403.03206
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
+            processing of `context` conditions.
+    """
+
+    def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6):
+        super().__init__()
+
+        self.norm1 = AdaLayerNormZero(dim)
+
+        self.norm1_context = AdaLayerNormZero(dim)
+
+        if hasattr(F, "scaled_dot_product_attention"):
+            processor = FluxAttnProcessor2_0()
+        else:
+            raise ValueError(
+                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
+            )
+        self.attn = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,
+            added_kv_proj_dim=dim,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            context_pre_only=False,
+            bias=True,
+            processor=processor,
+            qk_norm=qk_norm,
+            eps=eps,
+        )
+
+        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
+
+        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        image_rotary_emb=None,
+    ):
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+
+        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
+            encoder_hidden_states, emb=temb
+        )
+
+        # Attention.
+        attn_output, context_attn_output = self.attn(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            image_rotary_emb=image_rotary_emb,
+        )
+
+        # Process attention outputs for the `hidden_states`.
+        attn_output = gate_msa.unsqueeze(1) * attn_output
+        hidden_states = hidden_states + attn_output
+
+        norm_hidden_states = self.norm2(hidden_states)
+        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+        ff_output = self.ff(norm_hidden_states)
+        ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+        hidden_states = hidden_states + ff_output
+
+        # Process attention outputs for the `encoder_hidden_states`.
+
+        context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
+        encoder_hidden_states = encoder_hidden_states + context_attn_output
+
+        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+
+        context_ff_output = self.ff_context(norm_encoder_hidden_states)
+        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
+        if encoder_hidden_states.dtype == torch.float16:
+            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
+
+        return encoder_hidden_states, hidden_states
+
+
+class FluxTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+    """
+    The Transformer model introduced in Flux.
+
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
+
+    Parameters:
+        patch_size (`int`): Patch size to turn the input data into small patches.
+        in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
+        num_layers (`int`, *optional*, defaults to 18): The number of layers of MMDiT blocks to use.
+        num_single_layers (`int`, *optional*, defaults to 18): The number of layers of single DiT blocks to use.
+        attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
+        num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
+        joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
+        guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
+
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: int = 1,
+        in_channels: int = 64,
+        num_layers: int = 19,
+        num_single_layers: int = 38,
+        attention_head_dim: int = 128,
+        num_attention_heads: int = 24,
+        joint_attention_dim: int = 4096,
+        pooled_projection_dim: int = 768,
+        guidance_embeds: bool = False,
+        axes_dims_rope: Tuple[int] = (16, 56, 56),
+    ):
+        super().__init__()
+        self.out_channels = in_channels
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+
+        self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
+
+        text_time_guidance_cls = (
+            CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
+        )
+        self.time_text_embed = text_time_guidance_cls(
+            embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
+        )
+
+        self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
+        self.x_embedder = torch.nn.Linear(self.config.in_channels, self.inner_dim)
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                FluxTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=self.config.num_attention_heads,
+                    attention_head_dim=self.config.attention_head_dim,
+                )
+                for i in range(self.config.num_layers)
+            ]
+        )
+
+        self.single_transformer_blocks = nn.ModuleList(
+            [
+                FluxSingleTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=self.config.num_attention_heads,
+                    attention_head_dim=self.config.attention_head_dim,
+                )
+                for i in range(self.config.num_single_layers)
+            ]
+        )
+
+        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
+
+        self.gradient_checkpointing = False
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedFluxAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor = None,
+        pooled_projections: torch.Tensor = None,
+        timestep: torch.LongTensor = None,
+        img_ids: torch.Tensor = None,
+        txt_ids: torch.Tensor = None,
+        guidance: torch.Tensor = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_block_samples=None,
+        controlnet_single_block_samples=None,
+        return_dict: bool = True,
+    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
+        """
+        The [`FluxTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
+                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
+                from the embeddings of input conditions.
+            timestep ( `torch.LongTensor`):
+                Used to indicate denoising step.
+            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
+                A list of tensors that if specified are added to the residuals of transformer blocks.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if joint_attention_kwargs is not None:
+            joint_attention_kwargs = joint_attention_kwargs.copy()
+            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+        hidden_states = self.x_embedder(hidden_states)
+
+        timestep = timestep.to(hidden_states.dtype) * 1000
+        if guidance is not None:
+            guidance = guidance.to(hidden_states.dtype) * 1000
+        else:
+            guidance = None
+        temb = (
+            self.time_text_embed(timestep, pooled_projections)
+            if guidance is None
+            else self.time_text_embed(timestep, guidance, pooled_projections)
+        )
+        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
+
+        if txt_ids.ndim == 3:
+            logger.warning(
+                "Passing `txt_ids` 3d torch.Tensor is deprecated."
+                "Please remove the batch dimension and pass it as a 2d torch Tensor"
+            )
+            txt_ids = txt_ids[0]
+        if img_ids.ndim == 3:
+            logger.warning(
+                "Passing `img_ids` 3d torch.Tensor is deprecated."
+                "Please remove the batch dimension and pass it as a 2d torch Tensor"
+            )
+            img_ids = img_ids[0]
+
+        ids = torch.cat((txt_ids, img_ids), dim=0)
+        image_rotary_emb = self.pos_embed(ids)
+
+        for index_block, block in enumerate(self.transformer_blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    image_rotary_emb,
+                    **ckpt_kwargs,
+                )
+
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=temb,
+                    image_rotary_emb=image_rotary_emb,
+                )
+
+            # controlnet residual
+            if controlnet_block_samples is not None:
+                interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
+                interval_control = int(np.ceil(interval_control))
+                hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
+
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        for index_block, block in enumerate(self.single_transformer_blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    temb,
+                    image_rotary_emb,
+                    **ckpt_kwargs,
+                )
+
+            else:
+                hidden_states = block(
+                    hidden_states=hidden_states,
+                    temb=temb,
+                    image_rotary_emb=image_rotary_emb,
+                )
+
+            # controlnet residual
+            if controlnet_single_block_samples is not None:
+                interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
+                interval_control = int(np.ceil(interval_control))
+                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
+                    hidden_states[:, encoder_hidden_states.shape[1] :, ...]
+                    + controlnet_single_block_samples[index_block // interval_control]
+                )
+
+        hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
+
+        hidden_states = self.norm_out(hidden_states, temb)
+        output = self.proj_out(hidden_states)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/diffusers/src/diffusers/models/transformers/transformer_sd3.py b/diffusers/src/diffusers/models/transformers/transformer_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..9376c91d075602c7063076aeea434499bb5bbb76
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/transformer_sd3.py
@@ -0,0 +1,367 @@
+# Copyright 2024 Stability AI, The HuggingFace Team and The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any, Dict, List, Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...models.attention import JointTransformerBlock
+from ...models.attention_processor import Attention, AttentionProcessor, FusedJointAttnProcessor2_0
+from ...models.modeling_utils import ModelMixin
+from ...models.normalization import AdaLayerNormContinuous
+from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
+from ..embeddings import CombinedTimestepTextProjEmbeddings, PatchEmbed
+from ..modeling_outputs import Transformer2DModelOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class SD3Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+    """
+    The Transformer model introduced in Stable Diffusion 3.
+
+    Reference: https://arxiv.org/abs/2403.03206
+
+    Parameters:
+        sample_size (`int`): The width of the latent images. This is fixed during training since
+            it is used to learn a number of position embeddings.
+        patch_size (`int`): Patch size to turn the input data into small patches.
+        in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
+        num_layers (`int`, *optional*, defaults to 18): The number of layers of Transformer blocks to use.
+        attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
+        num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
+        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        caption_projection_dim (`int`): Number of dimensions to use when projecting the `encoder_hidden_states`.
+        pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
+        out_channels (`int`, defaults to 16): Number of output channels.
+
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: int = 128,
+        patch_size: int = 2,
+        in_channels: int = 16,
+        num_layers: int = 18,
+        attention_head_dim: int = 64,
+        num_attention_heads: int = 18,
+        joint_attention_dim: int = 4096,
+        caption_projection_dim: int = 1152,
+        pooled_projection_dim: int = 2048,
+        out_channels: int = 16,
+        pos_embed_max_size: int = 96,
+    ):
+        super().__init__()
+        default_out_channels = in_channels
+        self.out_channels = out_channels if out_channels is not None else default_out_channels
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+
+        self.pos_embed = PatchEmbed(
+            height=self.config.sample_size,
+            width=self.config.sample_size,
+            patch_size=self.config.patch_size,
+            in_channels=self.config.in_channels,
+            embed_dim=self.inner_dim,
+            pos_embed_max_size=pos_embed_max_size,  # hard-code for now.
+        )
+        self.time_text_embed = CombinedTimestepTextProjEmbeddings(
+            embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
+        )
+        self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.config.caption_projection_dim)
+
+        # `attention_head_dim` is doubled to account for the mixing.
+        # It needs to crafted when we get the actual checkpoints.
+        self.transformer_blocks = nn.ModuleList(
+            [
+                JointTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=self.config.num_attention_heads,
+                    attention_head_dim=self.config.attention_head_dim,
+                    context_pre_only=i == num_layers - 1,
+                )
+                for i in range(self.config.num_layers)
+            ]
+        )
+
+        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
+
+        self.gradient_checkpointing = False
+
+    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
+    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
+        """
+        Sets the attention processor to use [feed forward
+        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+
+        Parameters:
+            chunk_size (`int`, *optional*):
+                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
+                over each tensor of dim=`dim`.
+            dim (`int`, *optional*, defaults to `0`):
+                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
+                or dim=1 (sequence length).
+        """
+        if dim not in [0, 1]:
+            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
+
+        # By default chunk size is 1
+        chunk_size = chunk_size or 1
+
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, chunk_size, dim)
+
+    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.disable_forward_chunking
+    def disable_forward_chunking(self):
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, None, 0)
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedJointAttnProcessor2_0
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedJointAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        pooled_projections: torch.FloatTensor = None,
+        timestep: torch.LongTensor = None,
+        block_controlnet_hidden_states: List = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
+        """
+        The [`SD3Transformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
+                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
+                from the embeddings of input conditions.
+            timestep ( `torch.LongTensor`):
+                Used to indicate denoising step.
+            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
+                A list of tensors that if specified are added to the residuals of transformer blocks.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if joint_attention_kwargs is not None:
+            joint_attention_kwargs = joint_attention_kwargs.copy()
+            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        height, width = hidden_states.shape[-2:]
+
+        hidden_states = self.pos_embed(hidden_states)  # takes care of adding positional embeddings too.
+        temb = self.time_text_embed(timestep, pooled_projections)
+        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
+
+        for index_block, block in enumerate(self.transformer_blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb
+                )
+
+            # controlnet residual
+            if block_controlnet_hidden_states is not None and block.context_pre_only is False:
+                interval_control = len(self.transformer_blocks) // len(block_controlnet_hidden_states)
+                hidden_states = hidden_states + block_controlnet_hidden_states[index_block // interval_control]
+
+        hidden_states = self.norm_out(hidden_states, temb)
+        hidden_states = self.proj_out(hidden_states)
+
+        # unpatchify
+        patch_size = self.config.patch_size
+        height = height // patch_size
+        width = width // patch_size
+
+        hidden_states = hidden_states.reshape(
+            shape=(hidden_states.shape[0], height, width, patch_size, patch_size, self.out_channels)
+        )
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(
+            shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size)
+        )
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/diffusers/src/diffusers/models/transformers/transformer_temporal.py b/diffusers/src/diffusers/models/transformers/transformer_temporal.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0c5467050ddc68a2f7357327ab47034a8c03f3d
--- /dev/null
+++ b/diffusers/src/diffusers/models/transformers/transformer_temporal.py
@@ -0,0 +1,381 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ..attention import BasicTransformerBlock, TemporalBasicTransformerBlock
+from ..embeddings import TimestepEmbedding, Timesteps
+from ..modeling_utils import ModelMixin
+from ..resnet import AlphaBlender
+
+
+@dataclass
+class TransformerTemporalModelOutput(BaseOutput):
+    """
+    The output of [`TransformerTemporalModel`].
+
+    Args:
+        sample (`torch.Tensor` of shape `(batch_size x num_frames, num_channels, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input.
+    """
+
+    sample: torch.Tensor
+
+
+class TransformerTemporalModel(ModelMixin, ConfigMixin):
+    """
+    A Transformer model for video-like data.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            The number of channels in the input and output (specify if the input is **continuous**).
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        attention_bias (`bool`, *optional*):
+            Configure if the `TransformerBlock` attention should contain a bias parameter.
+        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+            This is fixed during training since it is used to learn a number of position embeddings.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`):
+            Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
+            activation functions.
+        norm_elementwise_affine (`bool`, *optional*):
+            Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
+        double_self_attention (`bool`, *optional*):
+            Configure if each `TransformerBlock` should contain two self-attention layers.
+        positional_embeddings: (`str`, *optional*):
+            The type of positional embeddings to apply to the sequence input before passing use.
+        num_positional_embeddings: (`int`, *optional*):
+            The maximum length of the sequence over which to apply positional embeddings.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        activation_fn: str = "geglu",
+        norm_elementwise_affine: bool = True,
+        double_self_attention: bool = True,
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
+    ):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+
+        self.in_channels = in_channels
+
+        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+        self.proj_in = nn.Linear(in_channels, inner_dim)
+
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    attention_bias=attention_bias,
+                    double_self_attention=double_self_attention,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    positional_embeddings=positional_embeddings,
+                    num_positional_embeddings=num_positional_embeddings,
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        self.proj_out = nn.Linear(inner_dim, in_channels)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.LongTensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        class_labels: torch.LongTensor = None,
+        num_frames: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> TransformerTemporalModelOutput:
+        """
+        The [`TransformerTemporal`] forward method.
+
+        Args:
+            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
+                Input hidden_states.
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                `AdaLayerZeroNorm`.
+            num_frames (`int`, *optional*, defaults to 1):
+                The number of frames to be processed per batch. This is used to reshape the hidden states.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`]
+                instead of a plain tuple.
+
+        Returns:
+            [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
+                If `return_dict` is True, an
+                [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] is returned, otherwise a
+                `tuple` where the first element is the sample tensor.
+        """
+        # 1. Input
+        batch_frames, channel, height, width = hidden_states.shape
+        batch_size = batch_frames // num_frames
+
+        residual = hidden_states
+
+        hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, channel, height, width)
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+
+        hidden_states = self.norm(hidden_states)
+        hidden_states = hidden_states.permute(0, 3, 4, 2, 1).reshape(batch_size * height * width, num_frames, channel)
+
+        hidden_states = self.proj_in(hidden_states)
+
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            hidden_states = block(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                timestep=timestep,
+                cross_attention_kwargs=cross_attention_kwargs,
+                class_labels=class_labels,
+            )
+
+        # 3. Output
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = (
+            hidden_states[None, None, :]
+            .reshape(batch_size, height, width, num_frames, channel)
+            .permute(0, 3, 4, 1, 2)
+            .contiguous()
+        )
+        hidden_states = hidden_states.reshape(batch_frames, channel, height, width)
+
+        output = hidden_states + residual
+
+        if not return_dict:
+            return (output,)
+
+        return TransformerTemporalModelOutput(sample=output)
+
+
+class TransformerSpatioTemporalModel(nn.Module):
+    """
+    A Transformer model for video-like data.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            The number of channels in the input and output (specify if the input is **continuous**).
+        out_channels (`int`, *optional*):
+            The number of channels in the output (specify if the input is **continuous**).
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+    """
+
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: int = 320,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        cross_attention_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+
+        inner_dim = num_attention_heads * attention_head_dim
+        self.inner_dim = inner_dim
+
+        # 2. Define input layers
+        self.in_channels = in_channels
+        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6)
+        self.proj_in = nn.Linear(in_channels, inner_dim)
+
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    cross_attention_dim=cross_attention_dim,
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        time_mix_inner_dim = inner_dim
+        self.temporal_transformer_blocks = nn.ModuleList(
+            [
+                TemporalBasicTransformerBlock(
+                    inner_dim,
+                    time_mix_inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    cross_attention_dim=cross_attention_dim,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+        time_embed_dim = in_channels * 4
+        self.time_pos_embed = TimestepEmbedding(in_channels, time_embed_dim, out_dim=in_channels)
+        self.time_proj = Timesteps(in_channels, True, 0)
+        self.time_mixer = AlphaBlender(alpha=0.5, merge_strategy="learned_with_images")
+
+        # 4. Define output layers
+        self.out_channels = in_channels if out_channels is None else out_channels
+        # TODO: should use out_channels for continuous projections
+        self.proj_out = nn.Linear(inner_dim, in_channels)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ):
+        """
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
+                Input hidden_states.
+            num_frames (`int`):
+                The number of frames to be processed per batch. This is used to reshape the hidden states.
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            image_only_indicator (`torch.LongTensor` of shape `(batch size, num_frames)`, *optional*):
+                A tensor indicating whether the input contains only images. 1 indicates that the input contains only
+                images, 0 indicates that the input contains video frames.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`]
+                instead of a plain tuple.
+
+        Returns:
+            [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
+                If `return_dict` is True, an
+                [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] is returned, otherwise a
+                `tuple` where the first element is the sample tensor.
+        """
+        # 1. Input
+        batch_frames, _, height, width = hidden_states.shape
+        num_frames = image_only_indicator.shape[-1]
+        batch_size = batch_frames // num_frames
+
+        time_context = encoder_hidden_states
+        time_context_first_timestep = time_context[None, :].reshape(
+            batch_size, num_frames, -1, time_context.shape[-1]
+        )[:, 0]
+        time_context = time_context_first_timestep[:, None].broadcast_to(
+            batch_size, height * width, time_context.shape[-2], time_context.shape[-1]
+        )
+        time_context = time_context.reshape(batch_size * height * width, -1, time_context.shape[-1])
+
+        residual = hidden_states
+
+        hidden_states = self.norm(hidden_states)
+        inner_dim = hidden_states.shape[1]
+        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_frames, height * width, inner_dim)
+        hidden_states = self.proj_in(hidden_states)
+
+        num_frames_emb = torch.arange(num_frames, device=hidden_states.device)
+        num_frames_emb = num_frames_emb.repeat(batch_size, 1)
+        num_frames_emb = num_frames_emb.reshape(-1)
+        t_emb = self.time_proj(num_frames_emb)
+
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=hidden_states.dtype)
+
+        emb = self.time_pos_embed(t_emb)
+        emb = emb[:, None, :]
+
+        # 2. Blocks
+        for block, temporal_block in zip(self.transformer_blocks, self.temporal_transformer_blocks):
+            if self.training and self.gradient_checkpointing:
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    block,
+                    hidden_states,
+                    None,
+                    encoder_hidden_states,
+                    None,
+                    use_reentrant=False,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                )
+
+            hidden_states_mix = hidden_states
+            hidden_states_mix = hidden_states_mix + emb
+
+            hidden_states_mix = temporal_block(
+                hidden_states_mix,
+                num_frames=num_frames,
+                encoder_hidden_states=time_context,
+            )
+            hidden_states = self.time_mixer(
+                x_spatial=hidden_states,
+                x_temporal=hidden_states_mix,
+                image_only_indicator=image_only_indicator,
+            )
+
+        # 3. Output
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = hidden_states.reshape(batch_frames, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+
+        output = hidden_states + residual
+
+        if not return_dict:
+            return (output,)
+
+        return TransformerTemporalModelOutput(sample=output)
diff --git a/diffusers/src/diffusers/models/unets/__init__.py b/diffusers/src/diffusers/models/unets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ef04fb62606ed30699be8ac6cb38b011a692070
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/__init__.py
@@ -0,0 +1,18 @@
+from ...utils import is_flax_available, is_torch_available
+
+
+if is_torch_available():
+    from .unet_1d import UNet1DModel
+    from .unet_2d import UNet2DModel
+    from .unet_2d_condition import UNet2DConditionModel
+    from .unet_3d_condition import UNet3DConditionModel
+    from .unet_i2vgen_xl import I2VGenXLUNet
+    from .unet_kandinsky3 import Kandinsky3UNet
+    from .unet_motion_model import MotionAdapter, UNetMotionModel
+    from .unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel
+    from .unet_stable_cascade import StableCascadeUNet
+    from .uvit_2d import UVit2DModel
+
+
+if is_flax_available():
+    from .unet_2d_condition_flax import FlaxUNet2DConditionModel
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db5752d515a9320c8b225b4558c55042221919a4
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_1d.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_1d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b83851767288f03246772eb04bc2e9f99242a03
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_1d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_1d_blocks.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_1d_blocks.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d21af3be9d74422022ff051a226373758dd2e885
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_1d_blocks.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_2d.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_2d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91dd6c03490a3f6a4c70a1ce3b1d1f0a543a0854
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_2d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_2d_blocks.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_2d_blocks.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1e48c5d2effa0783a1afb3f7c23815052bb5ac8
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/__pycache__/unet_2d_blocks.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06c707552eb4221651dbabf0f22329bb878aa4b2e5d24c5456f26e8e7b97383a
+size 130039
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_2d_condition.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_2d_condition.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15304c9dfaed8642897c25eff6f7d2589cb06dd4
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_2d_condition.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_3d_blocks.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_3d_blocks.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e8b16989895512273e39c0297602d36de63807e
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_3d_blocks.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_3d_condition.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_3d_condition.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c9907356d8f33b511bc67d16fe8b880f914f0b9
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_3d_condition.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_i2vgen_xl.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_i2vgen_xl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5170e9fba576f001644e0ab0fdbf797d4adf1c47
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_i2vgen_xl.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_kandinsky3.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_kandinsky3.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da84c60e8958417b92d67e7540249d53bda750e1
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_kandinsky3.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_motion_model.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_motion_model.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93c0f42ea9463ab68337a46b884426d5c59cb0b6
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_motion_model.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_spatio_temporal_condition.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_spatio_temporal_condition.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fa56ff9821af09e5cad59282820d49dfb50da47
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_spatio_temporal_condition.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/unet_stable_cascade.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/unet_stable_cascade.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ab92b8995b7a6141d41a56d36b1fd701658d601
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/unet_stable_cascade.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/__pycache__/uvit_2d.cpython-311.pyc b/diffusers/src/diffusers/models/unets/__pycache__/uvit_2d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f04f9c8e99037aeb20b1c309e30ff7a5ff6c86ec
Binary files /dev/null and b/diffusers/src/diffusers/models/unets/__pycache__/uvit_2d.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/models/unets/unet_1d.py b/diffusers/src/diffusers/models/unets/unet_1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..8efabd98ee7dc16306940ab3bab099227fe37d23
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_1d.py
@@ -0,0 +1,255 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ..embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
+from ..modeling_utils import ModelMixin
+from .unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block
+
+
+@dataclass
+class UNet1DOutput(BaseOutput):
+    """
+    The output of [`UNet1DModel`].
+
+    Args:
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, sample_size)`):
+            The hidden states output from the last layer of the model.
+    """
+
+    sample: torch.Tensor
+
+
+class UNet1DModel(ModelMixin, ConfigMixin):
+    r"""
+    A 1D UNet model that takes a noisy sample and a timestep and returns a sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        sample_size (`int`, *optional*): Default length of sample. Should be adaptable at runtime.
+        in_channels (`int`, *optional*, defaults to 2): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 2): Number of channels in the output.
+        extra_in_channels (`int`, *optional*, defaults to 0):
+            Number of additional channels to be added to the input of the first down block. Useful for cases where the
+            input data has more channels than what the model was initially designed for.
+        time_embedding_type (`str`, *optional*, defaults to `"fourier"`): Type of time embedding to use.
+        freq_shift (`float`, *optional*, defaults to 0.0): Frequency shift for Fourier time embedding.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip sin to cos for Fourier time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D")`):
+            Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip")`):
+            Tuple of upsample block types.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(32, 32, 64)`):
+            Tuple of block output channels.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock1D"`): Block type for middle of UNet.
+        out_block_type (`str`, *optional*, defaults to `None`): Optional output processing block of UNet.
+        act_fn (`str`, *optional*, defaults to `None`): Optional activation function in UNet blocks.
+        norm_num_groups (`int`, *optional*, defaults to 8): The number of groups for normalization.
+        layers_per_block (`int`, *optional*, defaults to 1): The number of layers per block.
+        downsample_each_block (`int`, *optional*, defaults to `False`):
+            Experimental feature for using a UNet without upsampling.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: int = 65536,
+        sample_rate: Optional[int] = None,
+        in_channels: int = 2,
+        out_channels: int = 2,
+        extra_in_channels: int = 0,
+        time_embedding_type: str = "fourier",
+        flip_sin_to_cos: bool = True,
+        use_timestep_embedding: bool = False,
+        freq_shift: float = 0.0,
+        down_block_types: Tuple[str] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
+        up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
+        mid_block_type: Tuple[str] = "UNetMidBlock1D",
+        out_block_type: str = None,
+        block_out_channels: Tuple[int] = (32, 32, 64),
+        act_fn: str = None,
+        norm_num_groups: int = 8,
+        layers_per_block: int = 1,
+        downsample_each_block: bool = False,
+    ):
+        super().__init__()
+        self.sample_size = sample_size
+
+        # time
+        if time_embedding_type == "fourier":
+            self.time_proj = GaussianFourierProjection(
+                embedding_size=8, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+            )
+            timestep_input_dim = 2 * block_out_channels[0]
+        elif time_embedding_type == "positional":
+            self.time_proj = Timesteps(
+                block_out_channels[0], flip_sin_to_cos=flip_sin_to_cos, downscale_freq_shift=freq_shift
+            )
+            timestep_input_dim = block_out_channels[0]
+
+        if use_timestep_embedding:
+            time_embed_dim = block_out_channels[0] * 4
+            self.time_mlp = TimestepEmbedding(
+                in_channels=timestep_input_dim,
+                time_embed_dim=time_embed_dim,
+                act_fn=act_fn,
+                out_dim=block_out_channels[0],
+            )
+
+        self.down_blocks = nn.ModuleList([])
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+        self.out_block = None
+
+        # down
+        output_channel = in_channels
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+
+            if i == 0:
+                input_channel += extra_in_channels
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=block_out_channels[0],
+                add_downsample=not is_final_block or downsample_each_block,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = get_mid_block(
+            mid_block_type,
+            in_channels=block_out_channels[-1],
+            mid_channels=block_out_channels[-1],
+            out_channels=block_out_channels[-1],
+            embed_dim=block_out_channels[0],
+            num_layers=layers_per_block,
+            add_downsample=downsample_each_block,
+        )
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        if out_block_type is None:
+            final_upsample_channels = out_channels
+        else:
+            final_upsample_channels = block_out_channels[0]
+
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = (
+                reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else final_upsample_channels
+            )
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block,
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                temb_channels=block_out_channels[0],
+                add_upsample=not is_final_block,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
+        self.out_block = get_out_block(
+            out_block_type=out_block_type,
+            num_groups_out=num_groups_out,
+            embed_dim=block_out_channels[0],
+            out_channels=out_channels,
+            act_fn=act_fn,
+            fc_dim=block_out_channels[-1] // 4,
+        )
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        return_dict: bool = True,
+    ) -> Union[UNet1DOutput, Tuple]:
+        r"""
+        The [`UNet1DModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`.
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_1d.UNet1DOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.unets.unet_1d.UNet1DOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is the sample tensor.
+        """
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
+        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        timestep_embed = self.time_proj(timesteps)
+        if self.config.use_timestep_embedding:
+            timestep_embed = self.time_mlp(timestep_embed)
+        else:
+            timestep_embed = timestep_embed[..., None]
+            timestep_embed = timestep_embed.repeat([1, 1, sample.shape[2]]).to(sample.dtype)
+            timestep_embed = timestep_embed.broadcast_to((sample.shape[:1] + timestep_embed.shape[1:]))
+
+        # 2. down
+        down_block_res_samples = ()
+        for downsample_block in self.down_blocks:
+            sample, res_samples = downsample_block(hidden_states=sample, temb=timestep_embed)
+            down_block_res_samples += res_samples
+
+        # 3. mid
+        if self.mid_block:
+            sample = self.mid_block(sample, timestep_embed)
+
+        # 4. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            res_samples = down_block_res_samples[-1:]
+            down_block_res_samples = down_block_res_samples[:-1]
+            sample = upsample_block(sample, res_hidden_states_tuple=res_samples, temb=timestep_embed)
+
+        # 5. post-process
+        if self.out_block:
+            sample = self.out_block(sample, timestep_embed)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet1DOutput(sample=sample)
diff --git a/diffusers/src/diffusers/models/unets/unet_1d_blocks.py b/diffusers/src/diffusers/models/unets/unet_1d_blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fc27e94c474a294cb384e20f24c989490658bd9
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_1d_blocks.py
@@ -0,0 +1,702 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ..activations import get_activation
+from ..resnet import Downsample1D, ResidualTemporalBlock1D, Upsample1D, rearrange_dims
+
+
+class DownResnetBlock1D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        conv_shortcut: bool = False,
+        temb_channels: int = 32,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
+        non_linearity: Optional[str] = None,
+        time_embedding_norm: str = "default",
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.time_embedding_norm = time_embedding_norm
+        self.add_downsample = add_downsample
+        self.output_scale_factor = output_scale_factor
+
+        if groups_out is None:
+            groups_out = groups
+
+        # there will always be at least one resnet
+        resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=temb_channels)]
+
+        for _ in range(num_layers):
+            resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels))
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if non_linearity is None:
+            self.nonlinearity = None
+        else:
+            self.nonlinearity = get_activation(non_linearity)
+
+        self.downsample = None
+        if add_downsample:
+            self.downsample = Downsample1D(out_channels, use_conv=True, padding=1)
+
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        output_states = ()
+
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for resnet in self.resnets[1:]:
+            hidden_states = resnet(hidden_states, temb)
+
+        output_states += (hidden_states,)
+
+        if self.nonlinearity is not None:
+            hidden_states = self.nonlinearity(hidden_states)
+
+        if self.downsample is not None:
+            hidden_states = self.downsample(hidden_states)
+
+        return hidden_states, output_states
+
+
+class UpResnetBlock1D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        temb_channels: int = 32,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
+        non_linearity: Optional[str] = None,
+        time_embedding_norm: str = "default",
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.time_embedding_norm = time_embedding_norm
+        self.add_upsample = add_upsample
+        self.output_scale_factor = output_scale_factor
+
+        if groups_out is None:
+            groups_out = groups
+
+        # there will always be at least one resnet
+        resnets = [ResidualTemporalBlock1D(2 * in_channels, out_channels, embed_dim=temb_channels)]
+
+        for _ in range(num_layers):
+            resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels))
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if non_linearity is None:
+            self.nonlinearity = None
+        else:
+            self.nonlinearity = get_activation(non_linearity)
+
+        self.upsample = None
+        if add_upsample:
+            self.upsample = Upsample1D(out_channels, use_conv_transpose=True)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Optional[Tuple[torch.Tensor, ...]] = None,
+        temb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if res_hidden_states_tuple is not None:
+            res_hidden_states = res_hidden_states_tuple[-1]
+            hidden_states = torch.cat((hidden_states, res_hidden_states), dim=1)
+
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for resnet in self.resnets[1:]:
+            hidden_states = resnet(hidden_states, temb)
+
+        if self.nonlinearity is not None:
+            hidden_states = self.nonlinearity(hidden_states)
+
+        if self.upsample is not None:
+            hidden_states = self.upsample(hidden_states)
+
+        return hidden_states
+
+
+class ValueFunctionMidBlock1D(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, embed_dim: int):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.embed_dim = embed_dim
+
+        self.res1 = ResidualTemporalBlock1D(in_channels, in_channels // 2, embed_dim=embed_dim)
+        self.down1 = Downsample1D(out_channels // 2, use_conv=True)
+        self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim)
+        self.down2 = Downsample1D(out_channels // 4, use_conv=True)
+
+    def forward(self, x: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        x = self.res1(x, temb)
+        x = self.down1(x)
+        x = self.res2(x, temb)
+        x = self.down2(x)
+        return x
+
+
+class MidResTemporalBlock1D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        embed_dim: int,
+        num_layers: int = 1,
+        add_downsample: bool = False,
+        add_upsample: bool = False,
+        non_linearity: Optional[str] = None,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.add_downsample = add_downsample
+
+        # there will always be at least one resnet
+        resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=embed_dim)]
+
+        for _ in range(num_layers):
+            resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=embed_dim))
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if non_linearity is None:
+            self.nonlinearity = None
+        else:
+            self.nonlinearity = get_activation(non_linearity)
+
+        self.upsample = None
+        if add_upsample:
+            self.upsample = Upsample1D(out_channels, use_conv=True)
+
+        self.downsample = None
+        if add_downsample:
+            self.downsample = Downsample1D(out_channels, use_conv=True)
+
+        if self.upsample and self.downsample:
+            raise ValueError("Block cannot downsample and upsample")
+
+    def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for resnet in self.resnets[1:]:
+            hidden_states = resnet(hidden_states, temb)
+
+        if self.upsample:
+            hidden_states = self.upsample(hidden_states)
+        if self.downsample:
+            self.downsample = self.downsample(hidden_states)
+
+        return hidden_states
+
+
+class OutConv1DBlock(nn.Module):
+    def __init__(self, num_groups_out: int, out_channels: int, embed_dim: int, act_fn: str):
+        super().__init__()
+        self.final_conv1d_1 = nn.Conv1d(embed_dim, embed_dim, 5, padding=2)
+        self.final_conv1d_gn = nn.GroupNorm(num_groups_out, embed_dim)
+        self.final_conv1d_act = get_activation(act_fn)
+        self.final_conv1d_2 = nn.Conv1d(embed_dim, out_channels, 1)
+
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        hidden_states = self.final_conv1d_1(hidden_states)
+        hidden_states = rearrange_dims(hidden_states)
+        hidden_states = self.final_conv1d_gn(hidden_states)
+        hidden_states = rearrange_dims(hidden_states)
+        hidden_states = self.final_conv1d_act(hidden_states)
+        hidden_states = self.final_conv1d_2(hidden_states)
+        return hidden_states
+
+
+class OutValueFunctionBlock(nn.Module):
+    def __init__(self, fc_dim: int, embed_dim: int, act_fn: str = "mish"):
+        super().__init__()
+        self.final_block = nn.ModuleList(
+            [
+                nn.Linear(fc_dim + embed_dim, fc_dim // 2),
+                get_activation(act_fn),
+                nn.Linear(fc_dim // 2, 1),
+            ]
+        )
+
+    def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor) -> torch.Tensor:
+        hidden_states = hidden_states.view(hidden_states.shape[0], -1)
+        hidden_states = torch.cat((hidden_states, temb), dim=-1)
+        for layer in self.final_block:
+            hidden_states = layer(hidden_states)
+
+        return hidden_states
+
+
+_kernels = {
+    "linear": [1 / 8, 3 / 8, 3 / 8, 1 / 8],
+    "cubic": [-0.01171875, -0.03515625, 0.11328125, 0.43359375, 0.43359375, 0.11328125, -0.03515625, -0.01171875],
+    "lanczos3": [
+        0.003689131001010537,
+        0.015056144446134567,
+        -0.03399861603975296,
+        -0.066637322306633,
+        0.13550527393817902,
+        0.44638532400131226,
+        0.44638532400131226,
+        0.13550527393817902,
+        -0.066637322306633,
+        -0.03399861603975296,
+        0.015056144446134567,
+        0.003689131001010537,
+    ],
+}
+
+
+class Downsample1d(nn.Module):
+    def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"):
+        super().__init__()
+        self.pad_mode = pad_mode
+        kernel_1d = torch.tensor(_kernels[kernel])
+        self.pad = kernel_1d.shape[0] // 2 - 1
+        self.register_buffer("kernel", kernel_1d)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode)
+        weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
+        indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        kernel = self.kernel.to(weight)[None, :].expand(hidden_states.shape[1], -1)
+        weight[indices, indices] = kernel
+        return F.conv1d(hidden_states, weight, stride=2)
+
+
+class Upsample1d(nn.Module):
+    def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"):
+        super().__init__()
+        self.pad_mode = pad_mode
+        kernel_1d = torch.tensor(_kernels[kernel]) * 2
+        self.pad = kernel_1d.shape[0] // 2 - 1
+        self.register_buffer("kernel", kernel_1d)
+
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode)
+        weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
+        indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        kernel = self.kernel.to(weight)[None, :].expand(hidden_states.shape[1], -1)
+        weight[indices, indices] = kernel
+        return F.conv_transpose1d(hidden_states, weight, stride=2, padding=self.pad * 2 + 1)
+
+
+class SelfAttention1d(nn.Module):
+    def __init__(self, in_channels: int, n_head: int = 1, dropout_rate: float = 0.0):
+        super().__init__()
+        self.channels = in_channels
+        self.group_norm = nn.GroupNorm(1, num_channels=in_channels)
+        self.num_heads = n_head
+
+        self.query = nn.Linear(self.channels, self.channels)
+        self.key = nn.Linear(self.channels, self.channels)
+        self.value = nn.Linear(self.channels, self.channels)
+
+        self.proj_attn = nn.Linear(self.channels, self.channels, bias=True)
+
+        self.dropout = nn.Dropout(dropout_rate, inplace=True)
+
+    def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
+        new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
+        # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
+        new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
+        return new_projection
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        residual = hidden_states
+        batch, channel_dim, seq = hidden_states.shape
+
+        hidden_states = self.group_norm(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+
+        query_proj = self.query(hidden_states)
+        key_proj = self.key(hidden_states)
+        value_proj = self.value(hidden_states)
+
+        query_states = self.transpose_for_scores(query_proj)
+        key_states = self.transpose_for_scores(key_proj)
+        value_states = self.transpose_for_scores(value_proj)
+
+        scale = 1 / math.sqrt(math.sqrt(key_states.shape[-1]))
+
+        attention_scores = torch.matmul(query_states * scale, key_states.transpose(-1, -2) * scale)
+        attention_probs = torch.softmax(attention_scores, dim=-1)
+
+        # compute attention output
+        hidden_states = torch.matmul(attention_probs, value_states)
+
+        hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
+        new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
+        hidden_states = hidden_states.view(new_hidden_states_shape)
+
+        # compute next hidden_states
+        hidden_states = self.proj_attn(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+        hidden_states = self.dropout(hidden_states)
+
+        output = hidden_states + residual
+
+        return output
+
+
+class ResConvBlock(nn.Module):
+    def __init__(self, in_channels: int, mid_channels: int, out_channels: int, is_last: bool = False):
+        super().__init__()
+        self.is_last = is_last
+        self.has_conv_skip = in_channels != out_channels
+
+        if self.has_conv_skip:
+            self.conv_skip = nn.Conv1d(in_channels, out_channels, 1, bias=False)
+
+        self.conv_1 = nn.Conv1d(in_channels, mid_channels, 5, padding=2)
+        self.group_norm_1 = nn.GroupNorm(1, mid_channels)
+        self.gelu_1 = nn.GELU()
+        self.conv_2 = nn.Conv1d(mid_channels, out_channels, 5, padding=2)
+
+        if not self.is_last:
+            self.group_norm_2 = nn.GroupNorm(1, out_channels)
+            self.gelu_2 = nn.GELU()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states
+
+        hidden_states = self.conv_1(hidden_states)
+        hidden_states = self.group_norm_1(hidden_states)
+        hidden_states = self.gelu_1(hidden_states)
+        hidden_states = self.conv_2(hidden_states)
+
+        if not self.is_last:
+            hidden_states = self.group_norm_2(hidden_states)
+            hidden_states = self.gelu_2(hidden_states)
+
+        output = hidden_states + residual
+        return output
+
+
+class UNetMidBlock1D(nn.Module):
+    def __init__(self, mid_channels: int, in_channels: int, out_channels: Optional[int] = None):
+        super().__init__()
+
+        out_channels = in_channels if out_channels is None else out_channels
+
+        # there is always at least one resnet
+        self.down = Downsample1d("cubic")
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+        attentions = [
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(out_channels, out_channels // 32),
+        ]
+        self.up = Upsample1d(kernel="cubic")
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        hidden_states = self.down(hidden_states)
+        for attn, resnet in zip(self.attentions, self.resnets):
+            hidden_states = resnet(hidden_states)
+            hidden_states = attn(hidden_states)
+
+        hidden_states = self.up(hidden_states)
+
+        return hidden_states
+
+
+class AttnDownBlock1D(nn.Module):
+    def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        self.down = Downsample1d("cubic")
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+        attentions = [
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(out_channels, out_channels // 32),
+        ]
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        hidden_states = self.down(hidden_states)
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states)
+            hidden_states = attn(hidden_states)
+
+        return hidden_states, (hidden_states,)
+
+
+class DownBlock1D(nn.Module):
+    def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        self.down = Downsample1d("cubic")
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        hidden_states = self.down(hidden_states)
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states, (hidden_states,)
+
+
+class DownBlock1DNoSkip(nn.Module):
+    def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        hidden_states = torch.cat([hidden_states, temb], dim=1)
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states, (hidden_states,)
+
+
+class AttnUpBlock1D(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+        attentions = [
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(out_channels, out_channels // 32),
+        ]
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.up = Upsample1d(kernel="cubic")
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        res_hidden_states = res_hidden_states_tuple[-1]
+        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states)
+            hidden_states = attn(hidden_states)
+
+        hidden_states = self.up(hidden_states)
+
+        return hidden_states
+
+
+class UpBlock1D(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = in_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+
+        self.resnets = nn.ModuleList(resnets)
+        self.up = Upsample1d(kernel="cubic")
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        res_hidden_states = res_hidden_states_tuple[-1]
+        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        hidden_states = self.up(hidden_states)
+
+        return hidden_states
+
+
+class UpBlock1DNoSkip(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = in_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True),
+        ]
+
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        res_hidden_states = res_hidden_states_tuple[-1]
+        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states
+
+
+DownBlockType = Union[DownResnetBlock1D, DownBlock1D, AttnDownBlock1D, DownBlock1DNoSkip]
+MidBlockType = Union[MidResTemporalBlock1D, ValueFunctionMidBlock1D, UNetMidBlock1D]
+OutBlockType = Union[OutConv1DBlock, OutValueFunctionBlock]
+UpBlockType = Union[UpResnetBlock1D, UpBlock1D, AttnUpBlock1D, UpBlock1DNoSkip]
+
+
+def get_down_block(
+    down_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    temb_channels: int,
+    add_downsample: bool,
+) -> DownBlockType:
+    if down_block_type == "DownResnetBlock1D":
+        return DownResnetBlock1D(
+            in_channels=in_channels,
+            num_layers=num_layers,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+        )
+    elif down_block_type == "DownBlock1D":
+        return DownBlock1D(out_channels=out_channels, in_channels=in_channels)
+    elif down_block_type == "AttnDownBlock1D":
+        return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels)
+    elif down_block_type == "DownBlock1DNoSkip":
+        return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels)
+    raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(
+    up_block_type: str, num_layers: int, in_channels: int, out_channels: int, temb_channels: int, add_upsample: bool
+) -> UpBlockType:
+    if up_block_type == "UpResnetBlock1D":
+        return UpResnetBlock1D(
+            in_channels=in_channels,
+            num_layers=num_layers,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+        )
+    elif up_block_type == "UpBlock1D":
+        return UpBlock1D(in_channels=in_channels, out_channels=out_channels)
+    elif up_block_type == "AttnUpBlock1D":
+        return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels)
+    elif up_block_type == "UpBlock1DNoSkip":
+        return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels)
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+def get_mid_block(
+    mid_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    mid_channels: int,
+    out_channels: int,
+    embed_dim: int,
+    add_downsample: bool,
+) -> MidBlockType:
+    if mid_block_type == "MidResTemporalBlock1D":
+        return MidResTemporalBlock1D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            embed_dim=embed_dim,
+            add_downsample=add_downsample,
+        )
+    elif mid_block_type == "ValueFunctionMidBlock1D":
+        return ValueFunctionMidBlock1D(in_channels=in_channels, out_channels=out_channels, embed_dim=embed_dim)
+    elif mid_block_type == "UNetMidBlock1D":
+        return UNetMidBlock1D(in_channels=in_channels, mid_channels=mid_channels, out_channels=out_channels)
+    raise ValueError(f"{mid_block_type} does not exist.")
+
+
+def get_out_block(
+    *, out_block_type: str, num_groups_out: int, embed_dim: int, out_channels: int, act_fn: str, fc_dim: int
+) -> Optional[OutBlockType]:
+    if out_block_type == "OutConv1DBlock":
+        return OutConv1DBlock(num_groups_out, out_channels, embed_dim, act_fn)
+    elif out_block_type == "ValueFunction":
+        return OutValueFunctionBlock(fc_dim, embed_dim, act_fn)
+    return None
diff --git a/diffusers/src/diffusers/models/unets/unet_2d.py b/diffusers/src/diffusers/models/unets/unet_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5972505f2897c849a7c64f08ef0929fea2d09b0f
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_2d.py
@@ -0,0 +1,346 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ..embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
+from ..modeling_utils import ModelMixin
+from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
+
+
+@dataclass
+class UNet2DOutput(BaseOutput):
+    """
+    The output of [`UNet2DModel`].
+
+    Args:
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
+            The hidden states output from the last layer of the model.
+    """
+
+    sample: torch.Tensor
+
+
+class UNet2DModel(ModelMixin, ConfigMixin):
+    r"""
+    A 2D UNet model that takes a noisy sample and a timestep and returns a sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample. Dimensions must be a multiple of `2 ** (len(block_out_channels) -
+            1)`.
+        in_channels (`int`, *optional*, defaults to 3): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 3): Number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use.
+        freq_shift (`int`, *optional*, defaults to 0): Frequency shift for Fourier time embedding.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+            Whether to flip sin to cos for Fourier time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`):
+            Tuple of downsample block types.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2D"`):
+            Block type for middle of UNet, it can be either `UNetMidBlock2D` or `UnCLIPUNetMidBlock2D`.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`):
+            Tuple of upsample block types.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(224, 448, 672, 896)`):
+            Tuple of block output channels.
+        layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block.
+        mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block.
+        downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution.
+        downsample_type (`str`, *optional*, defaults to `conv`):
+            The downsample type for downsampling layers. Choose between "conv" and "resnet"
+        upsample_type (`str`, *optional*, defaults to `conv`):
+            The upsample type for upsampling layers. Choose between "conv" and "resnet"
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
+        norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization.
+        attn_norm_num_groups (`int`, *optional*, defaults to `None`):
+            If set to an integer, a group norm layer will be created in the mid block's [`Attention`] layer with the
+            given number of groups. If left as `None`, the group norm layer will only be created if
+            `resnet_time_scale_shift` is set to `default`, and if created will have `norm_num_groups` groups.
+        norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for normalization.
+        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
+            `"timestep"`, or `"identity"`.
+        num_class_embeds (`int`, *optional*, defaults to `None`):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim` when performing class
+            conditioning with `class_embed_type` equal to `None`.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[Union[int, Tuple[int, int]]] = None,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        center_input_sample: bool = False,
+        time_embedding_type: str = "positional",
+        freq_shift: int = 0,
+        flip_sin_to_cos: bool = True,
+        down_block_types: Tuple[str, ...] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
+        up_block_types: Tuple[str, ...] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
+        block_out_channels: Tuple[int, ...] = (224, 448, 672, 896),
+        layers_per_block: int = 2,
+        mid_block_scale_factor: float = 1,
+        downsample_padding: int = 1,
+        downsample_type: str = "conv",
+        upsample_type: str = "conv",
+        dropout: float = 0.0,
+        act_fn: str = "silu",
+        attention_head_dim: Optional[int] = 8,
+        norm_num_groups: int = 32,
+        attn_norm_num_groups: Optional[int] = None,
+        norm_eps: float = 1e-5,
+        resnet_time_scale_shift: str = "default",
+        add_attention: bool = True,
+        class_embed_type: Optional[str] = None,
+        num_class_embeds: Optional[int] = None,
+        num_train_timesteps: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+        time_embed_dim = block_out_channels[0] * 4
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        # input
+        self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
+
+        # time
+        if time_embedding_type == "fourier":
+            self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16)
+            timestep_input_dim = 2 * block_out_channels[0]
+        elif time_embedding_type == "positional":
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        elif time_embedding_type == "learned":
+            self.time_proj = nn.Embedding(num_train_timesteps, block_out_channels[0])
+            timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+
+        self.down_blocks = nn.ModuleList([])
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
+                downsample_padding=downsample_padding,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                downsample_type=downsample_type,
+                dropout=dropout,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlock2D(
+            in_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            dropout=dropout,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1],
+            resnet_groups=norm_num_groups,
+            attn_groups=attn_norm_num_groups,
+            add_attention=add_attention,
+        )
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                upsample_type=upsample_type,
+                dropout=dropout,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=norm_eps)
+        self.conv_act = nn.SiLU()
+        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        class_labels: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet2DOutput, Tuple]:
+        r"""
+        The [`UNet2DModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d.UNet2DOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.unets.unet_2d.UNet2DOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is the sample tensor.
+        """
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
+        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device)
+
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+        emb = self.time_embedding(t_emb)
+
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when doing class conditioning")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+            emb = emb + class_emb
+        elif self.class_embedding is None and class_labels is not None:
+            raise ValueError("class_embedding needs to be initialized in order to use class conditioning")
+
+        # 2. pre-process
+        skip_sample = sample
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "skip_conv"):
+                sample, res_samples, skip_sample = downsample_block(
+                    hidden_states=sample, temb=emb, skip_sample=skip_sample
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        sample = self.mid_block(sample, emb)
+
+        # 5. up
+        skip_sample = None
+        for upsample_block in self.up_blocks:
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            if hasattr(upsample_block, "skip_conv"):
+                sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample)
+            else:
+                sample = upsample_block(sample, res_samples, emb)
+
+        # 6. post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if skip_sample is not None:
+            sample += skip_sample
+
+        if self.config.time_embedding_type == "fourier":
+            timesteps = timesteps.reshape((sample.shape[0], *([1] * len(sample.shape[1:]))))
+            sample = sample / timesteps
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet2DOutput(sample=sample)
diff --git a/diffusers/src/diffusers/models/unets/unet_2d_blocks.py b/diffusers/src/diffusers/models/unets/unet_2d_blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..469f8f3a33fc1f42e4180567d8ed7b9be5bfd033
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_2d_blocks.py
@@ -0,0 +1,4387 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...utils import is_torch_version, logging
+from ...utils.torch_utils import apply_freeu
+from ..activations import get_activation
+from ..attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0
+from ..normalization import AdaGroupNorm
+from ..resnet import (
+    Downsample2D,
+    FirDownsample2D,
+    FirUpsample2D,
+    KDownsample2D,
+    KUpsample2D,
+    ResnetBlock2D,
+    ResnetBlockCondNorm2D,
+    Upsample2D,
+)
+from ..transformers.dual_transformer_2d import DualTransformer2DModel
+from ..transformers.transformer_2d import Transformer2DModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def get_down_block(
+    down_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    temb_channels: int,
+    add_downsample: bool,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    downsample_padding: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    downsample_type: Optional[str] = None,
+    dropout: float = 0.0,
+):
+    # If attn head dim is not defined, we default it to the number of heads
+    if attention_head_dim is None:
+        logger.warn(
+            f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
+        )
+        attention_head_dim = num_attention_heads
+
+    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+    if down_block_type == "DownBlock2D":
+        return DownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "ResnetDownsampleBlock2D":
+        return ResnetDownsampleBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+        )
+    elif down_block_type == "AttnDownBlock2D":
+        if add_downsample is False:
+            downsample_type = None
+        else:
+            downsample_type = downsample_type or "conv"  # default to 'conv'
+        return AttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            downsample_type=downsample_type,
+        )
+    elif down_block_type == "CrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
+        return CrossAttnDownBlock2D(
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
+        )
+    elif down_block_type == "SelfAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for SelfAttnDownBlock2D")
+        return SelfAttnDownBlock2D(
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
+        )
+    elif down_block_type == "SimpleCrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D")
+        return SimpleCrossAttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+            only_cross_attention=only_cross_attention,
+            cross_attention_norm=cross_attention_norm,
+        )
+    elif down_block_type == "SkipDownBlock2D":
+        return SkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "AttnSkipDownBlock2D":
+        return AttnSkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "DownEncoderBlock2D":
+        return DownEncoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "AttnDownEncoderBlock2D":
+        return AttnDownEncoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "KDownBlock2D":
+        return KDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif down_block_type == "KCrossAttnDownBlock2D":
+        return KCrossAttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+            add_self_attention=True if not add_downsample else False,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_mid_block(
+    mid_block_type: str,
+    temb_channels: int,
+    in_channels: int,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    resnet_groups: int,
+    output_scale_factor: float = 1.0,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    mid_block_only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = 1,
+    dropout: float = 0.0,
+):
+    if mid_block_type == "UNetMidBlock2DCrossAttn":
+        return UNetMidBlock2DCrossAttn(
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            output_scale_factor=output_scale_factor,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            resnet_groups=resnet_groups,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            upcast_attention=upcast_attention,
+            attention_type=attention_type,
+        )
+    if mid_block_type == "UNetMidBlock2DSelfAttn":
+        return UNetMidBlock2DSelfAttn(
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            output_scale_factor=output_scale_factor,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            resnet_groups=resnet_groups,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            upcast_attention=upcast_attention,
+            attention_type=attention_type,
+        )
+    elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
+        return UNetMidBlock2DSimpleCrossAttn(
+            in_channels=in_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            output_scale_factor=output_scale_factor,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            only_cross_attention=mid_block_only_cross_attention,
+            cross_attention_norm=cross_attention_norm,
+        )
+    elif mid_block_type == "UNetMidBlock2D":
+        return UNetMidBlock2D(
+            in_channels=in_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            num_layers=0,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            output_scale_factor=output_scale_factor,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            add_attention=False,
+        )
+    elif mid_block_type == "MidBlock2D":
+        return MidBlock2D(
+            in_channels=in_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            output_scale_factor=output_scale_factor,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            resnet_groups=resnet_groups,
+            use_linear_projection=use_linear_projection,
+        )
+    elif mid_block_type is None:
+        return None
+    else:
+        raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+
+
+def get_up_block(
+    up_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    prev_output_channel: int,
+    temb_channels: int,
+    add_upsample: bool,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    resolution_idx: Optional[int] = None,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    upsample_type: Optional[str] = None,
+    dropout: float = 0.0,
+) -> nn.Module:
+    # If attn head dim is not defined, we default it to the number of heads
+    if attention_head_dim is None:
+        logger.warn(
+            f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
+        )
+        attention_head_dim = num_attention_heads
+
+    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    if up_block_type == "UpBlock2D":
+        return UpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "ResnetUpsampleBlock2D":
+        return ResnetUpsampleBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+        )
+    elif up_block_type == "CrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
+        return CrossAttnUpBlock2D(
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
+        )
+    elif up_block_type == "SelfAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for SelfAttnUpBlock2D")
+        return SelfAttnUpBlock2D(
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
+        )
+    elif up_block_type == "SimpleCrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D")
+        return SimpleCrossAttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+            only_cross_attention=only_cross_attention,
+            cross_attention_norm=cross_attention_norm,
+        )
+    elif up_block_type == "AttnUpBlock2D":
+        if add_upsample is False:
+            upsample_type = None
+        else:
+            upsample_type = upsample_type or "conv"  # default to 'conv'
+
+        return AttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            upsample_type=upsample_type,
+        )
+    elif up_block_type == "SkipUpBlock2D":
+        return SkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "AttnSkipUpBlock2D":
+        return AttnSkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "UpDecoderBlock2D":
+        return UpDecoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            temb_channels=temb_channels,
+        )
+    elif up_block_type == "AttnUpDecoderBlock2D":
+        return AttnUpDecoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            temb_channels=temb_channels,
+        )
+    elif up_block_type == "KUpBlock2D":
+        return KUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif up_block_type == "KCrossAttnUpBlock2D":
+        return KCrossAttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+        )
+
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+class AutoencoderTinyBlock(nn.Module):
+    """
+    Tiny Autoencoder block used in [`AutoencoderTiny`]. It is a mini residual module consisting of plain conv + ReLU
+    blocks.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        out_channels (`int`): The number of output channels.
+        act_fn (`str`):
+            ` The activation function to use. Supported values are `"swish"`, `"mish"`, `"gelu"`, and `"relu"`.
+
+    Returns:
+        `torch.FloatTensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to
+        `out_channels`.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, act_fn: str):
+        super().__init__()
+        act_fn = get_activation(act_fn)
+        self.conv = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
+            act_fn,
+            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
+            act_fn,
+            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
+        )
+        self.skip = (
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+        self.fuse = nn.ReLU()
+
+    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        return self.fuse(self.conv(x) + self.skip(x))
+
+
+class UNetMidBlock2D(nn.Module):
+    """
+    A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        temb_channels (`int`): The number of temporal embedding channels.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
+        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
+        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
+        resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
+            The type of normalization to apply to the time embeddings. This can help to improve the performance of the
+            model on tasks with long-range temporal dependencies.
+        resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
+        resnet_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use in the group normalization layers of the resnet blocks.
+        attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
+        resnet_pre_norm (`bool`, *optional*, defaults to `True`):
+            Whether to use pre-normalization for the resnet blocks.
+        add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
+        attention_head_dim (`int`, *optional*, defaults to 1):
+            Dimension of a single attention head. The number of attention heads is determined based on this value and
+            the number of input channels.
+        output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
+
+    Returns:
+        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
+        in_channels, height, width)`.
+
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        attn_groups: Optional[int] = None,
+        resnet_pre_norm: bool = True,
+        add_attention: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+    ):
+        super().__init__()
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        self.add_attention = add_attention
+
+        if attn_groups is None:
+            attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
+
+        # there is always at least one resnet
+        if resnet_time_scale_shift == "spatial":
+            resnets = [
+                ResnetBlockCondNorm2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm="spatial",
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                )
+            ]
+        else:
+            resnets = [
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            ]
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
+            )
+            attention_head_dim = in_channels
+
+        for _ in range(num_layers):
+            if self.add_attention:
+                attentions.append(
+                    Attention(
+                        in_channels,
+                        heads=in_channels // attention_head_dim,
+                        dim_head=attention_head_dim,
+                        rescale_output_factor=output_scale_factor,
+                        eps=resnet_eps,
+                        norm_num_groups=attn_groups,
+                        spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
+                        residual_connection=True,
+                        bias=True,
+                        upcast_softmax=True,
+                        _from_deprecated_attn_block=True,
+                    )
+                )
+            else:
+                attentions.append(None)
+
+            if resnet_time_scale_shift == "spatial":
+                resnets.append(
+                    ResnetBlockCondNorm2D(
+                        in_channels=in_channels,
+                        out_channels=in_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm="spatial",
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                    )
+                )
+            else:
+                resnets.append(
+                    ResnetBlock2D(
+                        in_channels=in_channels,
+                        out_channels=in_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                    )
+                )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if attn is not None:
+                hidden_states = attn(hidden_states, temb=temb)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
+
+
+class UNetMidBlock2DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        # support for variable transformer layers per block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for i in range(num_layers):
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+        return hidden_states
+
+
+class UNetMidBlock2DSimpleCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+
+        self.attention_head_dim = attention_head_dim
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        self.num_heads = in_channels // self.attention_head_dim
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                skip_time_act=skip_time_act,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            processor = (
+                AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor()
+            )
+
+            attentions.append(
+                Attention(
+                    query_dim=in_channels,
+                    cross_attention_dim=in_channels,
+                    heads=self.num_heads,
+                    dim_head=self.attention_head_dim,
+                    added_kv_proj_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    bias=True,
+                    upcast_softmax=True,
+                    only_cross_attention=only_cross_attention,
+                    cross_attention_norm=cross_attention_norm,
+                    processor=processor,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
+        if attention_mask is None:
+            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+            mask = None if encoder_hidden_states is None else encoder_attention_mask
+        else:
+            # when attention_mask is defined: we don't even check for encoder_attention_mask.
+            # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+            # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+            #       then we can simplify this whole if/else block to:
+            #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+            mask = attention_mask
+
+        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            # attn
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=mask,
+                **cross_attention_kwargs,
+            )
+
+            # resnet
+            hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+        return hidden_states
+
+
+class MidBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        use_linear_projection: bool = False,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = False
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+
+        for i in range(num_layers):
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        lora_scale = 1.0
+        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+        for resnet in self.resnets[1:]:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+        return hidden_states
+
+
+class AttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        downsample_type: str = "conv",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        self.downsample_type = downsample_type
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if downsample_type == "conv":
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        elif downsample_type == "resnet":
+            self.downsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        down=True,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            cross_attention_kwargs.update({"scale": lora_scale})
+            hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                if self.downsample_type == "resnet":
+                    hidden_states = downsampler(hidden_states, temb=temb, scale=lora_scale)
+                else:
+                    hidden_states = downsampler(hidden_states, scale=lora_scale)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        add_downsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        additional_residuals: Optional[torch.FloatTensor] = None,
+        down_block_add_samples: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        blocks = list(zip(self.resnets, self.attentions))
+
+        for i, (resnet, attn) in enumerate(blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+
+            # apply additional residuals to the output of the last pair of resnet and attention blocks
+            if i == len(blocks) - 1 and additional_residuals is not None:
+                hidden_states = hidden_states + additional_residuals
+
+            if down_block_add_samples is not None:
+                hidden_states = hidden_states + down_block_add_samples.pop(0) 
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale=lora_scale)
+
+            if down_block_add_samples is not None:
+                hidden_states = hidden_states + down_block_add_samples.pop(0) # todo: add before or after
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class DownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0,
+        down_block_add_samples: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+            if down_block_add_samples is not None:
+                hidden_states = hidden_states + down_block_add_samples.pop(0) 
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale=scale)
+
+            if down_block_add_samples is not None:
+                hidden_states = hidden_states + down_block_add_samples.pop(0)  # todo: add before or after
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class DownEncoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            if resnet_time_scale_shift == "spatial":
+                resnets.append(
+                    ResnetBlockCondNorm2D(
+                        in_channels=in_channels,
+                        out_channels=out_channels,
+                        temb_channels=None,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm="spatial",
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                    )
+                )
+            else:
+                resnets.append(
+                    ResnetBlock2D(
+                        in_channels=in_channels,
+                        out_channels=out_channels,
+                        temb_channels=None,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                    )
+                )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb=None, scale=scale)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale)
+
+        return hidden_states
+
+
+class AttnDownEncoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            if resnet_time_scale_shift == "spatial":
+                resnets.append(
+                    ResnetBlockCondNorm2D(
+                        in_channels=in_channels,
+                        out_channels=out_channels,
+                        temb_channels=None,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm="spatial",
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                    )
+                )
+            else:
+                resnets.append(
+                    ResnetBlock2D(
+                        in_channels=in_channels,
+                        out_channels=out_channels,
+                        temb_channels=None,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                    )
+                )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb=None, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale)
+
+        return hidden_states
+
+
+class AttnSkipDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_downsample: bool = True,
+    ):
+        super().__init__()
+        self.attentions = nn.ModuleList([])
+        self.resnets = nn.ModuleList([])
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min(in_channels // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            self.attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=32,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        if add_downsample:
+            self.resnet_down = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                down=True,
+                kernel="fir",
+            )
+            self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
+            self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
+        else:
+            self.resnet_down = None
+            self.downsamplers = None
+            self.skip_conv = None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]:
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            hidden_states = self.resnet_down(hidden_states, temb, scale=scale)
+            for downsampler in self.downsamplers:
+                skip_sample = downsampler(skip_sample)
+
+            hidden_states = self.skip_conv(skip_sample) + hidden_states
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states, skip_sample
+
+
+class SkipDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        self.resnets = nn.ModuleList([])
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min(in_channels // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        if add_downsample:
+            self.resnet_down = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                down=True,
+                kernel="fir",
+            )
+            self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
+            self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
+        else:
+            self.resnet_down = None
+            self.downsamplers = None
+            self.skip_conv = None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb, scale)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            hidden_states = self.resnet_down(hidden_states, temb, scale)
+            for downsampler in self.downsamplers:
+                skip_sample = downsampler(skip_sample)
+
+            hidden_states = self.skip_conv(skip_sample) + hidden_states
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states, skip_sample
+
+
+class ResnetDownsampleBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        skip_time_act: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        down=True,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale)
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, temb, scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class SimpleCrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+
+        resnets = []
+        attentions = []
+
+        self.attention_head_dim = attention_head_dim
+        self.num_heads = out_channels // self.attention_head_dim
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+            processor = (
+                AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor()
+            )
+
+            attentions.append(
+                Attention(
+                    query_dim=out_channels,
+                    cross_attention_dim=out_channels,
+                    heads=self.num_heads,
+                    dim_head=attention_head_dim,
+                    added_kv_proj_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    bias=True,
+                    upcast_softmax=True,
+                    only_cross_attention=only_cross_attention,
+                    cross_attention_norm=cross_attention_norm,
+                    processor=processor,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        down=True,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
+        if attention_mask is None:
+            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+            mask = None if encoder_hidden_states is None else encoder_attention_mask
+        else:
+            # when attention_mask is defined: we don't even check for encoder_attention_mask.
+            # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+            # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+            #       then we can simplify this whole if/else block to:
+            #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+            mask = attention_mask
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, temb, scale=lora_scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class KDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 4,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+        resnet_group_size: int = 32,
+        add_downsample: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            resnets.append(
+                ResnetBlockCondNorm2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    dropout=dropout,
+                    temb_channels=temb_channels,
+                    groups=groups,
+                    groups_out=groups_out,
+                    eps=resnet_eps,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            # YiYi's comments- might be able to use FirDownsample2D, look into details later
+            self.downsamplers = nn.ModuleList([KDownsample2D()])
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale)
+
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+        return hidden_states, output_states
+
+
+class KCrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        cross_attention_dim: int,
+        dropout: float = 0.0,
+        num_layers: int = 4,
+        resnet_group_size: int = 32,
+        add_downsample: bool = True,
+        attention_head_dim: int = 64,
+        add_self_attention: bool = False,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            resnets.append(
+                ResnetBlockCondNorm2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    dropout=dropout,
+                    temb_channels=temb_channels,
+                    groups=groups,
+                    groups_out=groups_out,
+                    eps=resnet_eps,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+            attentions.append(
+                KAttentionBlock(
+                    out_channels,
+                    out_channels // attention_head_dim,
+                    attention_head_dim,
+                    cross_attention_dim=cross_attention_dim,
+                    temb_channels=temb_channels,
+                    attention_bias=True,
+                    add_self_attention=add_self_attention,
+                    cross_attention_norm="layer_norm",
+                    group_size=resnet_group_size,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.attentions = nn.ModuleList(attentions)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList([KDownsample2D()])
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+
+            if self.downsamplers is None:
+                output_states += (None,)
+            else:
+                output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+        return hidden_states, output_states
+
+
+class AttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: int = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        upsample_type: str = "conv",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.upsample_type = upsample_type
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if upsample_type == "conv":
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        elif upsample_type == "resnet":
+            self.upsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        up=True,
+                    )
+                ]
+            )
+        else:
+            self.upsamplers = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                if self.upsample_type == "resnet":
+                    hidden_states = upsampler(hidden_states, temb=temb, scale=scale)
+                else:
+                    hidden_states = upsampler(hidden_states, scale=scale)
+
+        return hidden_states
+
+
+class CrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        return_res_samples: Optional[bool]=False,
+        up_block_add_samples: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+        if return_res_samples:
+            output_states=()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+            if return_res_samples:
+                output_states = output_states + (hidden_states,)
+            if up_block_add_samples is not None:
+                hidden_states = hidden_states + up_block_add_samples.pop(0) 
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)
+            if return_res_samples:
+                output_states = output_states + (hidden_states,)
+            if up_block_add_samples is not None:
+                hidden_states = hidden_states + up_block_add_samples.pop(0) 
+            
+        if return_res_samples:
+            return hidden_states, output_states
+        else:
+            return hidden_states
+
+class UpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+        return_res_samples: Optional[bool]=False,
+        up_block_add_samples: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+        if return_res_samples:
+            output_states = ()
+
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+            if return_res_samples:
+                output_states = output_states + (hidden_states,)
+            if up_block_add_samples is not None:
+                hidden_states = hidden_states + up_block_add_samples.pop(0)  # todo: add before or after
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size, scale=scale)
+
+            if return_res_samples:
+                output_states = output_states + (hidden_states,)
+            if up_block_add_samples is not None:
+                hidden_states = hidden_states + up_block_add_samples.pop(0)  # todo: add before or after
+            
+
+        if return_res_samples:
+            return hidden_states, output_states
+        else:
+            return hidden_states
+
+
+class UpDecoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        temb_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            if resnet_time_scale_shift == "spatial":
+                resnets.append(
+                    ResnetBlockCondNorm2D(
+                        in_channels=input_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm="spatial",
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                    )
+                )
+            else:
+                resnets.append(
+                    ResnetBlock2D(
+                        in_channels=input_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                    )
+                )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> torch.FloatTensor:
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb=temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class AttnUpDecoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        temb_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `out_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            if resnet_time_scale_shift == "spatial":
+                resnets.append(
+                    ResnetBlockCondNorm2D(
+                        in_channels=input_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm="spatial",
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                    )
+                )
+            else:
+                resnets.append(
+                    ResnetBlock2D(
+                        in_channels=input_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                    )
+                )
+
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups if resnet_time_scale_shift != "spatial" else None,
+                    spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> torch.FloatTensor:
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb=temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, temb=temb, **cross_attention_kwargs)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, scale=scale)
+
+        return hidden_states
+
+
+class AttnSkipUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        self.attentions = nn.ModuleList([])
+        self.resnets = nn.ModuleList([])
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min(resnet_in_channels + res_skip_channels // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `out_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        self.attentions.append(
+            Attention(
+                out_channels,
+                heads=out_channels // attention_head_dim,
+                dim_head=attention_head_dim,
+                rescale_output_factor=output_scale_factor,
+                eps=resnet_eps,
+                norm_num_groups=32,
+                residual_connection=True,
+                bias=True,
+                upcast_softmax=True,
+                _from_deprecated_attn_block=True,
+            )
+        )
+
+        self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
+        if add_upsample:
+            self.resnet_up = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                groups_out=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                up=True,
+                kernel="fir",
+            )
+            self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+            self.skip_norm = torch.nn.GroupNorm(
+                num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
+            )
+            self.act = nn.SiLU()
+        else:
+            self.resnet_up = None
+            self.skip_conv = None
+            self.skip_norm = None
+            self.act = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample=None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        cross_attention_kwargs = {"scale": scale}
+        hidden_states = self.attentions[0](hidden_states, **cross_attention_kwargs)
+
+        if skip_sample is not None:
+            skip_sample = self.upsampler(skip_sample)
+        else:
+            skip_sample = 0
+
+        if self.resnet_up is not None:
+            skip_sample_states = self.skip_norm(hidden_states)
+            skip_sample_states = self.act(skip_sample_states)
+            skip_sample_states = self.skip_conv(skip_sample_states)
+
+            skip_sample = skip_sample + skip_sample_states
+
+            hidden_states = self.resnet_up(hidden_states, temb, scale=scale)
+
+        return hidden_states, skip_sample
+
+
+class SkipUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_upsample: bool = True,
+        upsample_padding: int = 1,
+    ):
+        super().__init__()
+        self.resnets = nn.ModuleList([])
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
+        if add_upsample:
+            self.resnet_up = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                groups_out=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                up=True,
+                kernel="fir",
+            )
+            self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+            self.skip_norm = torch.nn.GroupNorm(
+                num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
+            )
+            self.act = nn.SiLU()
+        else:
+            self.resnet_up = None
+            self.skip_conv = None
+            self.skip_norm = None
+            self.act = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample=None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if skip_sample is not None:
+            skip_sample = self.upsampler(skip_sample)
+        else:
+            skip_sample = 0
+
+        if self.resnet_up is not None:
+            skip_sample_states = self.skip_norm(hidden_states)
+            skip_sample_states = self.act(skip_sample_states)
+            skip_sample_states = self.skip_conv(skip_sample_states)
+
+            skip_sample = skip_sample + skip_sample_states
+
+            hidden_states = self.resnet_up(hidden_states, temb, scale=scale)
+
+        return hidden_states, skip_sample
+
+
+class ResnetUpsampleBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        skip_time_act: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        up=True,
+                    )
+                ]
+            )
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, temb, scale=scale)
+
+        return hidden_states
+
+
+class SimpleCrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.attention_head_dim = attention_head_dim
+
+        self.num_heads = out_channels // self.attention_head_dim
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+            processor = (
+                AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor()
+            )
+
+            attentions.append(
+                Attention(
+                    query_dim=out_channels,
+                    cross_attention_dim=out_channels,
+                    heads=self.num_heads,
+                    dim_head=self.attention_head_dim,
+                    added_kv_proj_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    bias=True,
+                    upcast_softmax=True,
+                    only_cross_attention=only_cross_attention,
+                    cross_attention_norm=cross_attention_norm,
+                    processor=processor,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        up=True,
+                    )
+                ]
+            )
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+        if attention_mask is None:
+            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+            mask = None if encoder_hidden_states is None else encoder_attention_mask
+        else:
+            # when attention_mask is defined: we don't even check for encoder_attention_mask.
+            # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+            # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+            #       then we can simplify this whole if/else block to:
+            #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+            mask = attention_mask
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # resnet
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, temb, scale=lora_scale)
+
+        return hidden_states
+
+
+class KUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: int,
+        dropout: float = 0.0,
+        num_layers: int = 5,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+        resnet_group_size: Optional[int] = 32,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+        k_in_channels = 2 * out_channels
+        k_out_channels = in_channels
+        num_layers = num_layers - 1
+
+        for i in range(num_layers):
+            in_channels = k_in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            resnets.append(
+                ResnetBlockCondNorm2D(
+                    in_channels=in_channels,
+                    out_channels=k_out_channels if (i == num_layers - 1) else out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=groups,
+                    groups_out=groups_out,
+                    dropout=dropout,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([KUpsample2D()])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        res_hidden_states_tuple = res_hidden_states_tuple[-1]
+        if res_hidden_states_tuple is not None:
+            hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1)
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class KCrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: int,
+        dropout: float = 0.0,
+        num_layers: int = 4,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+        resnet_group_size: int = 32,
+        attention_head_dim: int = 1,  # attention dim_head
+        cross_attention_dim: int = 768,
+        add_upsample: bool = True,
+        upcast_attention: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        is_first_block = in_channels == out_channels == temb_channels
+        is_middle_block = in_channels != out_channels
+        add_self_attention = True if is_first_block else False
+
+        self.has_cross_attention = True
+        self.attention_head_dim = attention_head_dim
+
+        # in_channels, and out_channels for the block (k-unet)
+        k_in_channels = out_channels if is_first_block else 2 * out_channels
+        k_out_channels = in_channels
+
+        num_layers = num_layers - 1
+
+        for i in range(num_layers):
+            in_channels = k_in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            if is_middle_block and (i == num_layers - 1):
+                conv_2d_out_channels = k_out_channels
+            else:
+                conv_2d_out_channels = None
+
+            resnets.append(
+                ResnetBlockCondNorm2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    conv_2d_out_channels=conv_2d_out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=groups,
+                    groups_out=groups_out,
+                    dropout=dropout,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+            attentions.append(
+                KAttentionBlock(
+                    k_out_channels if (i == num_layers - 1) else out_channels,
+                    k_out_channels // attention_head_dim
+                    if (i == num_layers - 1)
+                    else out_channels // attention_head_dim,
+                    attention_head_dim,
+                    cross_attention_dim=cross_attention_dim,
+                    temb_channels=temb_channels,
+                    attention_bias=True,
+                    add_self_attention=add_self_attention,
+                    cross_attention_norm="layer_norm",
+                    upcast_attention=upcast_attention,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.attentions = nn.ModuleList(attentions)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([KUpsample2D()])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        res_hidden_states_tuple = res_hidden_states_tuple[-1]
+        if res_hidden_states_tuple is not None:
+            hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1)
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+# can potentially later be renamed to `No-feed-forward` attention
+class KAttentionBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Configure if the attention layers should contain a bias parameter.
+        upcast_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to upcast the attention computation to `float32`.
+        temb_channels (`int`, *optional*, defaults to 768):
+            The number of channels in the token embedding.
+        add_self_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to add self-attention to the block.
+        cross_attention_norm (`str`, *optional*, defaults to `None`):
+            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
+        group_size (`int`, *optional*, defaults to 32):
+            The number of groups to separate the channels into for group normalization.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout: float = 0.0,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        upcast_attention: bool = False,
+        temb_channels: int = 768,  # for ada_group_norm
+        add_self_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+        group_size: int = 32,
+    ):
+        super().__init__()
+        self.add_self_attention = add_self_attention
+
+        # 1. Self-Attn
+        if add_self_attention:
+            self.norm1 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size))
+            self.attn1 = Attention(
+                query_dim=dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                cross_attention_dim=None,
+                cross_attention_norm=None,
+            )
+
+        # 2. Cross-Attn
+        self.norm2 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size))
+        self.attn2 = Attention(
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            upcast_attention=upcast_attention,
+            cross_attention_norm=cross_attention_norm,
+        )
+
+    def _to_3d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor:
+        return hidden_states.permute(0, 2, 3, 1).reshape(hidden_states.shape[0], height * weight, -1)
+
+    def _to_4d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor:
+        return hidden_states.permute(0, 2, 1).reshape(hidden_states.shape[0], -1, height, weight)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        # TODO: mark emb as non-optional (self.norm2 requires it).
+        #       requires assessing impact of change to positional param interface.
+        emb: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        # 1. Self-Attention
+        if self.add_self_attention:
+            norm_hidden_states = self.norm1(hidden_states, emb)
+
+            height, weight = norm_hidden_states.shape[2:]
+            norm_hidden_states = self._to_3d(norm_hidden_states, height, weight)
+
+            attn_output = self.attn1(
+                norm_hidden_states,
+                encoder_hidden_states=None,
+                attention_mask=attention_mask,
+                **cross_attention_kwargs,
+            )
+            attn_output = self._to_4d(attn_output, height, weight)
+
+            hidden_states = attn_output + hidden_states
+
+        # 2. Cross-Attention/None
+        norm_hidden_states = self.norm2(hidden_states, emb)
+
+        height, weight = norm_hidden_states.shape[2:]
+        norm_hidden_states = self._to_3d(norm_hidden_states, height, weight)
+        attn_output = self.attn2(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask if encoder_hidden_states is None else encoder_attention_mask,
+            **cross_attention_kwargs,
+        )
+        attn_output = self._to_4d(attn_output, height, weight)
+
+        hidden_states = attn_output + hidden_states
+
+        return hidden_states
+
+
+
+
+
+class SelfAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        add_downsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=None,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=None,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        additional_residuals: Optional[torch.FloatTensor] = None,
+        down_block_add_samples: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        blocks = list(zip(self.resnets, self.attentions))
+
+        for i, (resnet, attn) in enumerate(blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+
+            # apply additional residuals to the output of the last pair of resnet and attention blocks
+            if i == len(blocks) - 1 and additional_residuals is not None:
+                hidden_states = hidden_states + additional_residuals
+
+            if down_block_add_samples is not None:
+                hidden_states = hidden_states + down_block_add_samples.pop(0) 
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale=lora_scale)
+
+            if down_block_add_samples is not None:
+                hidden_states = hidden_states + down_block_add_samples.pop(0) # todo: add before or after
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+
+class SelfAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=None,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=None,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        return_res_samples: Optional[bool]=False,
+        up_block_add_samples: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+        if return_res_samples:
+            output_states=()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+            if return_res_samples:
+                output_states = output_states + (hidden_states,)
+            if up_block_add_samples is not None:
+                hidden_states = hidden_states + up_block_add_samples.pop(0) 
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)
+            if return_res_samples:
+                output_states = output_states + (hidden_states,)
+            if up_block_add_samples is not None:
+                hidden_states = hidden_states + up_block_add_samples.pop(0) 
+            
+        if return_res_samples:
+            return hidden_states, output_states
+        else:
+            return hidden_states
+
+
+
+
+class UNetMidBlock2DSelfAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        # support for variable transformer layers per block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for i in range(num_layers):
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=None,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=None,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+        return hidden_states
\ No newline at end of file
diff --git a/diffusers/src/diffusers/models/unets/unet_2d_blocks_flax.py b/diffusers/src/diffusers/models/unets/unet_2d_blocks_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4585dbc8823185a19952a103bd911e1bd6fa590
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_2d_blocks_flax.py
@@ -0,0 +1,400 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import flax.linen as nn
+import jax.numpy as jnp
+
+from ..attention_flax import FlaxTransformer2DModel
+from ..resnet_flax import FlaxDownsample2D, FlaxResnetBlock2D, FlaxUpsample2D
+
+
+class FlaxCrossAttnDownBlock2D(nn.Module):
+    r"""
+    Cross Attention 2D Downsizing block - original architecture from Unet transformers:
+    https://arxiv.org/abs/2103.06104
+
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        out_channels (:obj:`int`):
+            Output channels
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention blocks layers
+        num_attention_heads (:obj:`int`, *optional*, defaults to 1):
+            Number of attention heads of each spatial transformer block
+        add_downsample (:obj:`bool`, *optional*, defaults to `True`):
+            Whether to add downsampling layer before each final output
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+        split_head_dim (`bool`, *optional*, defaults to `False`):
+            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
+            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    out_channels: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    num_attention_heads: int = 1
+    add_downsample: bool = True
+    use_linear_projection: bool = False
+    only_cross_attention: bool = False
+    use_memory_efficient_attention: bool = False
+    split_head_dim: bool = False
+    dtype: jnp.dtype = jnp.float32
+    transformer_layers_per_block: int = 1
+
+    def setup(self):
+        resnets = []
+        attentions = []
+
+        for i in range(self.num_layers):
+            in_channels = self.in_channels if i == 0 else self.out_channels
+
+            res_block = FlaxResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=self.out_channels,
+                dropout_prob=self.dropout,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+
+            attn_block = FlaxTransformer2DModel(
+                in_channels=self.out_channels,
+                n_heads=self.num_attention_heads,
+                d_head=self.out_channels // self.num_attention_heads,
+                depth=self.transformer_layers_per_block,
+                use_linear_projection=self.use_linear_projection,
+                only_cross_attention=self.only_cross_attention,
+                use_memory_efficient_attention=self.use_memory_efficient_attention,
+                split_head_dim=self.split_head_dim,
+                dtype=self.dtype,
+            )
+            attentions.append(attn_block)
+
+        self.resnets = resnets
+        self.attentions = attentions
+
+        if self.add_downsample:
+            self.downsamplers_0 = FlaxDownsample2D(self.out_channels, dtype=self.dtype)
+
+    def __call__(self, hidden_states, temb, encoder_hidden_states, deterministic=True):
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+            hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic)
+            output_states += (hidden_states,)
+
+        if self.add_downsample:
+            hidden_states = self.downsamplers_0(hidden_states)
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class FlaxDownBlock2D(nn.Module):
+    r"""
+    Flax 2D downsizing block
+
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        out_channels (:obj:`int`):
+            Output channels
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention blocks layers
+        add_downsample (:obj:`bool`, *optional*, defaults to `True`):
+            Whether to add downsampling layer before each final output
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    out_channels: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    add_downsample: bool = True
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        resnets = []
+
+        for i in range(self.num_layers):
+            in_channels = self.in_channels if i == 0 else self.out_channels
+
+            res_block = FlaxResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=self.out_channels,
+                dropout_prob=self.dropout,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+        self.resnets = resnets
+
+        if self.add_downsample:
+            self.downsamplers_0 = FlaxDownsample2D(self.out_channels, dtype=self.dtype)
+
+    def __call__(self, hidden_states, temb, deterministic=True):
+        output_states = ()
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+            output_states += (hidden_states,)
+
+        if self.add_downsample:
+            hidden_states = self.downsamplers_0(hidden_states)
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class FlaxCrossAttnUpBlock2D(nn.Module):
+    r"""
+    Cross Attention 2D Upsampling block - original architecture from Unet transformers:
+    https://arxiv.org/abs/2103.06104
+
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        out_channels (:obj:`int`):
+            Output channels
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention blocks layers
+        num_attention_heads (:obj:`int`, *optional*, defaults to 1):
+            Number of attention heads of each spatial transformer block
+        add_upsample (:obj:`bool`, *optional*, defaults to `True`):
+            Whether to add upsampling layer before each final output
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+        split_head_dim (`bool`, *optional*, defaults to `False`):
+            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
+            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    out_channels: int
+    prev_output_channel: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    num_attention_heads: int = 1
+    add_upsample: bool = True
+    use_linear_projection: bool = False
+    only_cross_attention: bool = False
+    use_memory_efficient_attention: bool = False
+    split_head_dim: bool = False
+    dtype: jnp.dtype = jnp.float32
+    transformer_layers_per_block: int = 1
+
+    def setup(self):
+        resnets = []
+        attentions = []
+
+        for i in range(self.num_layers):
+            res_skip_channels = self.in_channels if (i == self.num_layers - 1) else self.out_channels
+            resnet_in_channels = self.prev_output_channel if i == 0 else self.out_channels
+
+            res_block = FlaxResnetBlock2D(
+                in_channels=resnet_in_channels + res_skip_channels,
+                out_channels=self.out_channels,
+                dropout_prob=self.dropout,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+
+            attn_block = FlaxTransformer2DModel(
+                in_channels=self.out_channels,
+                n_heads=self.num_attention_heads,
+                d_head=self.out_channels // self.num_attention_heads,
+                depth=self.transformer_layers_per_block,
+                use_linear_projection=self.use_linear_projection,
+                only_cross_attention=self.only_cross_attention,
+                use_memory_efficient_attention=self.use_memory_efficient_attention,
+                split_head_dim=self.split_head_dim,
+                dtype=self.dtype,
+            )
+            attentions.append(attn_block)
+
+        self.resnets = resnets
+        self.attentions = attentions
+
+        if self.add_upsample:
+            self.upsamplers_0 = FlaxUpsample2D(self.out_channels, dtype=self.dtype)
+
+    def __call__(self, hidden_states, res_hidden_states_tuple, temb, encoder_hidden_states, deterministic=True):
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = jnp.concatenate((hidden_states, res_hidden_states), axis=-1)
+
+            hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+            hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic)
+
+        if self.add_upsample:
+            hidden_states = self.upsamplers_0(hidden_states)
+
+        return hidden_states
+
+
+class FlaxUpBlock2D(nn.Module):
+    r"""
+    Flax 2D upsampling block
+
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        out_channels (:obj:`int`):
+            Output channels
+        prev_output_channel (:obj:`int`):
+            Output channels from the previous block
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention blocks layers
+        add_downsample (:obj:`bool`, *optional*, defaults to `True`):
+            Whether to add downsampling layer before each final output
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    out_channels: int
+    prev_output_channel: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    add_upsample: bool = True
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        resnets = []
+
+        for i in range(self.num_layers):
+            res_skip_channels = self.in_channels if (i == self.num_layers - 1) else self.out_channels
+            resnet_in_channels = self.prev_output_channel if i == 0 else self.out_channels
+
+            res_block = FlaxResnetBlock2D(
+                in_channels=resnet_in_channels + res_skip_channels,
+                out_channels=self.out_channels,
+                dropout_prob=self.dropout,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+
+        self.resnets = resnets
+
+        if self.add_upsample:
+            self.upsamplers_0 = FlaxUpsample2D(self.out_channels, dtype=self.dtype)
+
+    def __call__(self, hidden_states, res_hidden_states_tuple, temb, deterministic=True):
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = jnp.concatenate((hidden_states, res_hidden_states), axis=-1)
+
+            hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+
+        if self.add_upsample:
+            hidden_states = self.upsamplers_0(hidden_states)
+
+        return hidden_states
+
+
+class FlaxUNetMidBlock2DCrossAttn(nn.Module):
+    r"""
+    Cross Attention 2D Mid-level block - original architecture from Unet transformers: https://arxiv.org/abs/2103.06104
+
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention blocks layers
+        num_attention_heads (:obj:`int`, *optional*, defaults to 1):
+            Number of attention heads of each spatial transformer block
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+        split_head_dim (`bool`, *optional*, defaults to `False`):
+            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
+            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    num_attention_heads: int = 1
+    use_linear_projection: bool = False
+    use_memory_efficient_attention: bool = False
+    split_head_dim: bool = False
+    dtype: jnp.dtype = jnp.float32
+    transformer_layers_per_block: int = 1
+
+    def setup(self):
+        # there is always at least one resnet
+        resnets = [
+            FlaxResnetBlock2D(
+                in_channels=self.in_channels,
+                out_channels=self.in_channels,
+                dropout_prob=self.dropout,
+                dtype=self.dtype,
+            )
+        ]
+
+        attentions = []
+
+        for _ in range(self.num_layers):
+            attn_block = FlaxTransformer2DModel(
+                in_channels=self.in_channels,
+                n_heads=self.num_attention_heads,
+                d_head=self.in_channels // self.num_attention_heads,
+                depth=self.transformer_layers_per_block,
+                use_linear_projection=self.use_linear_projection,
+                use_memory_efficient_attention=self.use_memory_efficient_attention,
+                split_head_dim=self.split_head_dim,
+                dtype=self.dtype,
+            )
+            attentions.append(attn_block)
+
+            res_block = FlaxResnetBlock2D(
+                in_channels=self.in_channels,
+                out_channels=self.in_channels,
+                dropout_prob=self.dropout,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+
+        self.resnets = resnets
+        self.attentions = attentions
+
+    def __call__(self, hidden_states, temb, encoder_hidden_states, deterministic=True):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic)
+            hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+
+        return hidden_states
diff --git a/diffusers/src/diffusers/models/unets/unet_2d_condition.py b/diffusers/src/diffusers/models/unets/unet_2d_condition.py
new file mode 100644
index 0000000000000000000000000000000000000000..a34b147e68fd55eb3a845ffcb170e9da49489f35
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_2d_condition.py
@@ -0,0 +1,1350 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin
+from ...utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ..activations import get_activation
+from ..attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from ..embeddings import (
+    GaussianFourierProjection,
+    GLIGENTextBoundingboxProjection,
+    ImageHintTimeEmbedding,
+    ImageProjection,
+    ImageTimeEmbedding,
+    TextImageProjection,
+    TextImageTimeEmbedding,
+    TextTimeEmbedding,
+    TimestepEmbedding,
+    Timesteps,
+)
+from ..modeling_utils import ModelMixin
+from .unet_2d_blocks import (
+    get_down_block,
+    get_mid_block,
+    get_up_block,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class UNet2DConditionOutput(BaseOutput):
+    """
+    The output of [`UNet2DConditionModel`].
+
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
+    """
+
+    sample: torch.FloatTensor = None
+
+
+class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
+    r"""
+    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
+            `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
+            The tuple of upsample blocks to use.
+        only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
+            Whether to include self-attention in the basic transformer blocks, see
+            [`~models.attention.BasicTransformerBlock`].
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, normalization and activation layers is skipped in post-processing.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+       reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
+            blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        num_attention_heads (`int`, *optional*):
+            The number of attention heads. If not defined, defaults to `attention_head_dim`
+        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to `None`):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+            "text". "text" will use the `TextTimeEmbedding` layer.
+        addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
+            Dimension for the timestep embeddings.
+        num_class_embeds (`int`, *optional*, defaults to `None`):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        time_embedding_type (`str`, *optional*, defaults to `positional`):
+            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
+        time_embedding_dim (`int`, *optional*, defaults to `None`):
+            An optional override for the dimension of the projected time embedding.
+        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
+            Optional activation function to use only once on the time embeddings before they are passed to the rest of
+            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
+        timestep_post_act (`str`, *optional*, defaults to `None`):
+            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
+        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
+            The dimension of `cond_proj` layer in the timestep embedding.
+        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`,
+        *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`,
+        *optional*): The dimension of the `class_labels` input when
+            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
+        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
+            embeddings with the class embeddings.
+        mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
+            Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
+            `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
+            `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
+            otherwise.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: Union[int, Tuple[int]] = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        dropout: float = 0.0,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: Union[int, Tuple[int]] = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+        reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        resnet_skip_time_act: bool = False,
+        resnet_out_scale_factor: float = 1.0,
+        time_embedding_type: str = "positional",
+        time_embedding_dim: Optional[int] = None,
+        time_embedding_act_fn: Optional[str] = None,
+        timestep_post_act: Optional[str] = None,
+        time_cond_proj_dim: Optional[int] = None,
+        conv_in_kernel: int = 3,
+        conv_out_kernel: int = 3,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        attention_type: str = "default",
+        class_embeddings_concat: bool = False,
+        mid_block_only_cross_attention: Optional[bool] = None,
+        cross_attention_norm: Optional[str] = None,
+        addition_embed_type_num_heads: int = 64,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+
+        if num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
+        # Check inputs
+        self._check_config(
+            down_block_types=down_block_types,
+            up_block_types=up_block_types,
+            only_cross_attention=only_cross_attention,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            cross_attention_dim=cross_attention_dim,
+            transformer_layers_per_block=transformer_layers_per_block,
+            reverse_transformer_layers_per_block=reverse_transformer_layers_per_block,
+            attention_head_dim=attention_head_dim,
+            num_attention_heads=num_attention_heads,
+        )
+
+        # input
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        # time
+        time_embed_dim, timestep_input_dim = self._set_time_proj(
+            time_embedding_type,
+            block_out_channels=block_out_channels,
+            flip_sin_to_cos=flip_sin_to_cos,
+            freq_shift=freq_shift,
+            time_embedding_dim=time_embedding_dim,
+        )
+
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+            post_act_fn=timestep_post_act,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+
+        self._set_encoder_hid_proj(
+            encoder_hid_dim_type,
+            cross_attention_dim=cross_attention_dim,
+            encoder_hid_dim=encoder_hid_dim,
+        )
+
+        # class embedding
+        self._set_class_embedding(
+            class_embed_type,
+            act_fn=act_fn,
+            num_class_embeds=num_class_embeds,
+            projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
+            time_embed_dim=time_embed_dim,
+            timestep_input_dim=timestep_input_dim,
+        )
+
+        self._set_add_embedding(
+            addition_embed_type,
+            addition_embed_type_num_heads=addition_embed_type_num_heads,
+            addition_time_embed_dim=addition_time_embed_dim,
+            cross_attention_dim=cross_attention_dim,
+            encoder_hid_dim=encoder_hid_dim,
+            flip_sin_to_cos=flip_sin_to_cos,
+            freq_shift=freq_shift,
+            projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
+            time_embed_dim=time_embed_dim,
+        )
+
+        if time_embedding_act_fn is None:
+            self.time_embed_act = None
+        else:
+            self.time_embed_act = get_activation(time_embedding_act_fn)
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(only_cross_attention, bool):
+            if mid_block_only_cross_attention is None:
+                mid_block_only_cross_attention = only_cross_attention
+
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+
+        if mid_block_only_cross_attention is None:
+            mid_block_only_cross_attention = False
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+
+        if isinstance(layers_per_block, int):
+            layers_per_block = [layers_per_block] * len(down_block_types)
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+
+        if class_embeddings_concat:
+            # The time embeddings are concatenated with the class embeddings. The dimension of the
+            # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
+            # regular time embeddings
+            blocks_time_embed_dim = time_embed_dim * 2
+        else:
+            blocks_time_embed_dim = time_embed_dim
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block[i],
+                transformer_layers_per_block=transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim[i],
+                num_attention_heads=num_attention_heads[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = get_mid_block(
+            mid_block_type,
+            temb_channels=blocks_time_embed_dim,
+            in_channels=block_out_channels[-1],
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            resnet_groups=norm_num_groups,
+            output_scale_factor=mid_block_scale_factor,
+            transformer_layers_per_block=transformer_layers_per_block[-1],
+            num_attention_heads=num_attention_heads[-1],
+            cross_attention_dim=cross_attention_dim[-1],
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            mid_block_only_cross_attention=mid_block_only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
+            resnet_skip_time_act=resnet_skip_time_act,
+            cross_attention_norm=cross_attention_norm,
+            attention_head_dim=attention_head_dim[-1],
+            dropout=dropout,
+        )
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        reversed_layers_per_block = list(reversed(layers_per_block))
+        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+        reversed_transformer_layers_per_block = (
+            list(reversed(transformer_layers_per_block))
+            if reverse_transformer_layers_per_block is None
+            else reverse_transformer_layers_per_block
+        )
+        only_cross_attention = list(reversed(only_cross_attention))
+
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=reversed_layers_per_block[i] + 1,
+                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resolution_idx=i,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=reversed_cross_attention_dim[i],
+                num_attention_heads=reversed_num_attention_heads[i],
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+            )
+
+            self.conv_act = get_activation(act_fn)
+
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+
+        self._set_pos_net_if_use_gligen(attention_type=attention_type, cross_attention_dim=cross_attention_dim)
+
+    def _check_config(
+        self,
+        down_block_types: Tuple[str],
+        up_block_types: Tuple[str],
+        only_cross_attention: Union[bool, Tuple[bool]],
+        block_out_channels: Tuple[int],
+        layers_per_block: Union[int, Tuple[int]],
+        cross_attention_dim: Union[int, Tuple[int]],
+        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
+        reverse_transformer_layers_per_block: bool,
+        attention_head_dim: int,
+        num_attention_heads: Optional[Union[int, Tuple[int]]],
+    ):
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+        if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
+            for layer_number_per_block in transformer_layers_per_block:
+                if isinstance(layer_number_per_block, list):
+                    raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
+
+    def _set_time_proj(
+        self,
+        time_embedding_type: str,
+        block_out_channels: int,
+        flip_sin_to_cos: bool,
+        freq_shift: float,
+        time_embedding_dim: int,
+    ) -> Tuple[int, int]:
+        if time_embedding_type == "fourier":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
+            if time_embed_dim % 2 != 0:
+                raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
+            self.time_proj = GaussianFourierProjection(
+                time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+            )
+            timestep_input_dim = time_embed_dim
+        elif time_embedding_type == "positional":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
+
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        else:
+            raise ValueError(
+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
+            )
+
+        return time_embed_dim, timestep_input_dim
+
+    def _set_encoder_hid_proj(
+        self,
+        encoder_hid_dim_type: Optional[str],
+        cross_attention_dim: Union[int, Tuple[int]],
+        encoder_hid_dim: Optional[int],
+    ):
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+
+        if encoder_hid_dim_type == "text_proj":
+            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2
+            self.encoder_hid_proj = ImageProjection(
+                image_embed_dim=encoder_hid_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
+        else:
+            self.encoder_hid_proj = None
+
+    def _set_class_embedding(
+        self,
+        class_embed_type: Optional[str],
+        act_fn: str,
+        num_class_embeds: Optional[int],
+        projection_class_embeddings_input_dim: Optional[int],
+        time_embed_dim: int,
+        timestep_input_dim: int,
+    ):
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif class_embed_type == "simple_projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+
+    def _set_add_embedding(
+        self,
+        addition_embed_type: str,
+        addition_embed_type_num_heads: int,
+        addition_time_embed_dim: Optional[int],
+        flip_sin_to_cos: bool,
+        freq_shift: float,
+        cross_attention_dim: Optional[int],
+        encoder_hid_dim: Optional[int],
+        projection_class_embeddings_input_dim: Optional[int],
+        time_embed_dim: int,
+    ):
+        if addition_embed_type == "text":
+            if encoder_hid_dim is not None:
+                text_time_embedding_from_dim = encoder_hid_dim
+            else:
+                text_time_embedding_from_dim = cross_attention_dim
+
+            self.add_embedding = TextTimeEmbedding(
+                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+            )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type == "text_time":
+            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
+            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif addition_embed_type == "image":
+            # Kandinsky 2.2
+            self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type == "image_hint":
+            # Kandinsky 2.2 ControlNet
+            self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type is not None:
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
+
+    def _set_pos_net_if_use_gligen(self, attention_type: str, cross_attention_dim: int):
+        if attention_type in ["gated", "gated-text-image"]:
+            positive_len = 768
+            if isinstance(cross_attention_dim, int):
+                positive_len = cross_attention_dim
+            elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
+                positive_len = cross_attention_dim[0]
+
+            feature_type = "text-only" if attention_type == "gated" else "text-image"
+            self.position_net = GLIGENTextBoundingboxProjection(
+                positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
+            )
+
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    def set_attention_slice(self, slice_size: Union[str, int, List[int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+
+        num_sliceable_layers = len(sliceable_head_dims)
+
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def unload_lora(self):
+        """Unloads LoRA weights."""
+        deprecate(
+            "unload_lora",
+            "0.28.0",
+            "Calling `unload_lora()` is deprecated and will be removed in a future version. Please install `peft` and then call `disable_adapters().",
+        )
+        for module in self.modules():
+            if hasattr(module, "set_lora_layer"):
+                module.set_lora_layer(None)
+
+    def get_time_embed(
+        self, sample: torch.Tensor, timestep: Union[torch.Tensor, float, int]
+    ) -> Optional[torch.Tensor]:
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+        return t_emb
+
+    def get_class_embed(self, sample: torch.Tensor, class_labels: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+        class_emb = None
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # there might be better ways to encapsulate this.
+                class_labels = class_labels.to(dtype=sample.dtype)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+        return class_emb
+
+    def get_aug_embed(
+        self, emb: torch.Tensor, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any]
+    ) -> Optional[torch.Tensor]:
+        aug_emb = None
+        if self.config.addition_embed_type == "text":
+            aug_emb = self.add_embedding(encoder_hidden_states)
+        elif self.config.addition_embed_type == "text_image":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+
+            image_embs = added_cond_kwargs.get("image_embeds")
+            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+            aug_emb = self.add_embedding(text_embs, image_embs)
+        elif self.config.addition_embed_type == "text_time":
+            # SDXL - style
+            if "text_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if "time_ids" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            time_embeds = self.add_time_proj(time_ids.flatten())
+            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+            add_embeds = add_embeds.to(emb.dtype)
+            aug_emb = self.add_embedding(add_embeds)
+        elif self.config.addition_embed_type == "image":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            aug_emb = self.add_embedding(image_embs)
+        elif self.config.addition_embed_type == "image_hint":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            hint = added_cond_kwargs.get("hint")
+            aug_emb = self.add_embedding(image_embs, hint)
+        return aug_emb
+
+    def process_encoder_hidden_states(
+        self, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any]
+    ) -> torch.Tensor:
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+            # Kadinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            image_embeds = self.encoder_hid_proj(image_embeds)
+            encoder_hidden_states = (encoder_hidden_states, image_embeds)
+        return encoder_hidden_states
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        down_block_add_samples: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_add_sample: Optional[Tuple[torch.Tensor]] = None,
+        up_block_add_samples: Optional[Tuple[torch.Tensor]] = None,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        The [`UNet2DConditionModel`] forward method.
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
+            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.FloatTensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            encoder_attention_mask (`torch.Tensor`):
+                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added to UNet long skip connections from down blocks to up blocks for
+                example from ControlNet side model(s)
+            mid_block_additional_residual (`torch.Tensor`, *optional*):
+                additional residual to be added to UNet mid block output, for example from ControlNet side model
+            down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
+
+        Returns:
+            [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
+                a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        for dim in sample.shape[-2:]:
+            if dim % default_overall_up_factor != 0:
+                # Forward upsample size to force interpolation output size.
+                forward_upsample_size = True
+                break
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
+        # 1. time
+        t_emb = self.get_time_embed(sample=sample, timestep=timestep)
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
+        if class_emb is not None:
+            if self.config.class_embeddings_concat:
+                emb = torch.cat([emb, class_emb], dim=-1)
+            else:
+                emb = emb + class_emb
+
+        aug_emb = self.get_aug_embed(
+            emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+        )
+        if self.config.addition_embed_type == "image_hint":
+            aug_emb, hint = aug_emb
+            sample = torch.cat([sample, hint], dim=1)
+
+        emb = emb + aug_emb if aug_emb is not None else emb
+
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+
+        encoder_hidden_states = self.process_encoder_hidden_states(
+            encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+        )
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 2.5 GLIGEN position net
+        if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+            cross_attention_kwargs = cross_attention_kwargs.copy()
+            gligen_args = cross_attention_kwargs.pop("gligen")
+            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+
+        # 3. down
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+
+        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+        # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+        is_adapter = down_intrablock_additional_residuals is not None
+        # maintain backward compatibility for legacy usage, where
+        #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+        #       but can only use one or the other
+        is_brushnet = down_block_add_samples is not None and mid_block_add_sample is not None and up_block_add_samples is not None
+        if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+            deprecate(
+                "T2I should not use down_block_additional_residuals",
+                "1.3.0",
+                "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                       and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                       for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                standard_warn=False,
+            )
+            down_intrablock_additional_residuals = down_block_additional_residuals
+            is_adapter = True
+
+        down_block_res_samples = (sample,)
+
+        if is_brushnet:
+            sample = sample + down_block_add_samples.pop(0)
+        # print(sample.shape)
+
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                # For t2i-adapter CrossAttnDownBlock2D
+                additional_residuals = {}
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+
+                if is_brushnet and len(down_block_add_samples)>0:
+                    additional_residuals["down_block_add_samples"] = [down_block_add_samples.pop(0) 
+                                                        for _ in range(len(downsample_block.resnets)+(downsample_block.downsamplers !=None))]
+
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    **additional_residuals,
+                )
+            else:
+                additional_residuals = {}
+                if is_brushnet and len(down_block_add_samples)>0:
+                    additional_residuals["down_block_add_samples"] = [down_block_add_samples.pop(0) 
+                                                        for _ in range(len(downsample_block.resnets)+(downsample_block.downsamplers !=None))]
+
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale, **additional_residuals)
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    sample += down_intrablock_additional_residuals.pop(0)
+            # print(sample.shape)
+
+            down_block_res_samples += res_samples
+
+        if is_controlnet:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                sample = self.mid_block(sample, emb)
+        
+            # To support T2I-Adapter-XL
+            if (
+                is_adapter
+                and len(down_intrablock_additional_residuals) > 0
+                and sample.shape == down_intrablock_additional_residuals[0].shape
+            ):
+                sample += down_intrablock_additional_residuals.pop(0)
+
+        if is_controlnet:
+            sample = sample + mid_block_additional_residual
+
+        if is_brushnet:
+            sample = sample + mid_block_add_sample
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                additional_residuals = {}
+                if is_brushnet and len(up_block_add_samples)>0:
+                    additional_residuals["up_block_add_samples"] = [up_block_add_samples.pop(0) 
+                                                        for _ in range(len(upsample_block.resnets)+(upsample_block.upsamplers !=None))]
+                
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    **additional_residuals,
+                )
+            else:
+                additional_residuals = {}
+                if is_brushnet and len(up_block_add_samples)>0:
+                    additional_residuals["up_block_add_samples"] = [up_block_add_samples.pop(0) 
+                                                        for _ in range(len(upsample_block.resnets)+(upsample_block.upsamplers !=None))]
+                
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    scale=lora_scale,
+                    **additional_residuals,
+                )
+
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet2DConditionOutput(sample=sample)
diff --git a/diffusers/src/diffusers/models/unets/unet_2d_condition_flax.py b/diffusers/src/diffusers/models/unets/unet_2d_condition_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..edbbcbaeda730328f31a85c7a55e9ddc55767525
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_2d_condition_flax.py
@@ -0,0 +1,454 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Optional, Tuple, Union
+
+import flax
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict
+
+from ...configuration_utils import ConfigMixin, flax_register_to_config
+from ...utils import BaseOutput
+from ..embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps
+from ..modeling_flax_utils import FlaxModelMixin
+from .unet_2d_blocks_flax import (
+    FlaxCrossAttnDownBlock2D,
+    FlaxCrossAttnUpBlock2D,
+    FlaxDownBlock2D,
+    FlaxUNetMidBlock2DCrossAttn,
+    FlaxUpBlock2D,
+)
+
+
+@flax.struct.dataclass
+class FlaxUNet2DConditionOutput(BaseOutput):
+    """
+    The output of [`FlaxUNet2DConditionModel`].
+
+    Args:
+        sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
+    """
+
+    sample: jnp.ndarray
+
+
+@flax_register_to_config
+class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
+    r"""
+    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output.
+
+    This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for it's generic methods
+    implemented for all models (such as downloading or saving).
+
+    This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+    subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matters related to its
+    general usage and behavior.
+
+    Inherent JAX features such as the following are supported:
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        sample_size (`int`, *optional*):
+            The size of the input sample.
+        in_channels (`int`, *optional*, defaults to 4):
+            The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4):
+            The number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D")`):
+            The tuple of upsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`. If `None`, the mid block layer
+            is skipped.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 8):
+            The dimension of the attention heads.
+        num_attention_heads (`int` or `Tuple[int]`, *optional*):
+            The number of attention heads.
+        cross_attention_dim (`int`, *optional*, defaults to 768):
+            The dimension of the cross attention features.
+        dropout (`float`, *optional*, defaults to 0):
+            Dropout probability for down, up and bottleneck blocks.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            Enable memory efficient attention as described [here](https://arxiv.org/abs/2112.05682).
+        split_head_dim (`bool`, *optional*, defaults to `False`):
+            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
+            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
+    """
+
+    sample_size: int = 32
+    in_channels: int = 4
+    out_channels: int = 4
+    down_block_types: Tuple[str, ...] = (
+        "CrossAttnDownBlock2D",
+        "CrossAttnDownBlock2D",
+        "CrossAttnDownBlock2D",
+        "DownBlock2D",
+    )
+    up_block_types: Tuple[str, ...] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")
+    mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn"
+    only_cross_attention: Union[bool, Tuple[bool]] = False
+    block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280)
+    layers_per_block: int = 2
+    attention_head_dim: Union[int, Tuple[int, ...]] = 8
+    num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None
+    cross_attention_dim: int = 1280
+    dropout: float = 0.0
+    use_linear_projection: bool = False
+    dtype: jnp.dtype = jnp.float32
+    flip_sin_to_cos: bool = True
+    freq_shift: int = 0
+    use_memory_efficient_attention: bool = False
+    split_head_dim: bool = False
+    transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1
+    addition_embed_type: Optional[str] = None
+    addition_time_embed_dim: Optional[int] = None
+    addition_embed_type_num_heads: int = 64
+    projection_class_embeddings_input_dim: Optional[int] = None
+
+    def init_weights(self, rng: jax.Array) -> FrozenDict:
+        # init input tensors
+        sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
+        sample = jnp.zeros(sample_shape, dtype=jnp.float32)
+        timesteps = jnp.ones((1,), dtype=jnp.int32)
+        encoder_hidden_states = jnp.zeros((1, 1, self.cross_attention_dim), dtype=jnp.float32)
+
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        added_cond_kwargs = None
+        if self.addition_embed_type == "text_time":
+            # we retrieve the expected `text_embeds_dim` by first checking if the architecture is a refiner
+            # or non-refiner architecture and then by "reverse-computing" from `projection_class_embeddings_input_dim`
+            is_refiner = (
+                5 * self.config.addition_time_embed_dim + self.config.cross_attention_dim
+                == self.config.projection_class_embeddings_input_dim
+            )
+            num_micro_conditions = 5 if is_refiner else 6
+
+            text_embeds_dim = self.config.projection_class_embeddings_input_dim - (
+                num_micro_conditions * self.config.addition_time_embed_dim
+            )
+
+            time_ids_channels = self.projection_class_embeddings_input_dim - text_embeds_dim
+            time_ids_dims = time_ids_channels // self.addition_time_embed_dim
+            added_cond_kwargs = {
+                "text_embeds": jnp.zeros((1, text_embeds_dim), dtype=jnp.float32),
+                "time_ids": jnp.zeros((1, time_ids_dims), dtype=jnp.float32),
+            }
+        return self.init(rngs, sample, timesteps, encoder_hidden_states, added_cond_kwargs)["params"]
+
+    def setup(self) -> None:
+        block_out_channels = self.block_out_channels
+        time_embed_dim = block_out_channels[0] * 4
+
+        if self.num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = self.num_attention_heads or self.attention_head_dim
+
+        # input
+        self.conv_in = nn.Conv(
+            block_out_channels[0],
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+        # time
+        self.time_proj = FlaxTimesteps(
+            block_out_channels[0], flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.config.freq_shift
+        )
+        self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
+
+        only_cross_attention = self.only_cross_attention
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = (only_cross_attention,) * len(self.down_block_types)
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(self.down_block_types)
+
+        # transformer layers per block
+        transformer_layers_per_block = self.transformer_layers_per_block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(self.down_block_types)
+
+        # addition embed types
+        if self.addition_embed_type is None:
+            self.add_embedding = None
+        elif self.addition_embed_type == "text_time":
+            if self.addition_time_embed_dim is None:
+                raise ValueError(
+                    f"addition_embed_type {self.addition_embed_type} requires `addition_time_embed_dim` to not be None"
+                )
+            self.add_time_proj = FlaxTimesteps(self.addition_time_embed_dim, self.flip_sin_to_cos, self.freq_shift)
+            self.add_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
+        else:
+            raise ValueError(f"addition_embed_type: {self.addition_embed_type} must be None or `text_time`.")
+
+        # down
+        down_blocks = []
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(self.down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            if down_block_type == "CrossAttnDownBlock2D":
+                down_block = FlaxCrossAttnDownBlock2D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    dropout=self.dropout,
+                    num_layers=self.layers_per_block,
+                    transformer_layers_per_block=transformer_layers_per_block[i],
+                    num_attention_heads=num_attention_heads[i],
+                    add_downsample=not is_final_block,
+                    use_linear_projection=self.use_linear_projection,
+                    only_cross_attention=only_cross_attention[i],
+                    use_memory_efficient_attention=self.use_memory_efficient_attention,
+                    split_head_dim=self.split_head_dim,
+                    dtype=self.dtype,
+                )
+            else:
+                down_block = FlaxDownBlock2D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    dropout=self.dropout,
+                    num_layers=self.layers_per_block,
+                    add_downsample=not is_final_block,
+                    dtype=self.dtype,
+                )
+
+            down_blocks.append(down_block)
+        self.down_blocks = down_blocks
+
+        # mid
+        if self.config.mid_block_type == "UNetMidBlock2DCrossAttn":
+            self.mid_block = FlaxUNetMidBlock2DCrossAttn(
+                in_channels=block_out_channels[-1],
+                dropout=self.dropout,
+                num_attention_heads=num_attention_heads[-1],
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                use_linear_projection=self.use_linear_projection,
+                use_memory_efficient_attention=self.use_memory_efficient_attention,
+                split_head_dim=self.split_head_dim,
+                dtype=self.dtype,
+            )
+        elif self.config.mid_block_type is None:
+            self.mid_block = None
+        else:
+            raise ValueError(f"Unexpected mid_block_type {self.config.mid_block_type}")
+
+        # up
+        up_blocks = []
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        only_cross_attention = list(reversed(only_cross_attention))
+        output_channel = reversed_block_out_channels[0]
+        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
+        for i, up_block_type in enumerate(self.up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            if up_block_type == "CrossAttnUpBlock2D":
+                up_block = FlaxCrossAttnUpBlock2D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    prev_output_channel=prev_output_channel,
+                    num_layers=self.layers_per_block + 1,
+                    transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+                    num_attention_heads=reversed_num_attention_heads[i],
+                    add_upsample=not is_final_block,
+                    dropout=self.dropout,
+                    use_linear_projection=self.use_linear_projection,
+                    only_cross_attention=only_cross_attention[i],
+                    use_memory_efficient_attention=self.use_memory_efficient_attention,
+                    split_head_dim=self.split_head_dim,
+                    dtype=self.dtype,
+                )
+            else:
+                up_block = FlaxUpBlock2D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    prev_output_channel=prev_output_channel,
+                    num_layers=self.layers_per_block + 1,
+                    add_upsample=not is_final_block,
+                    dropout=self.dropout,
+                    dtype=self.dtype,
+                )
+
+            up_blocks.append(up_block)
+            prev_output_channel = output_channel
+        self.up_blocks = up_blocks
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(num_groups=32, epsilon=1e-5)
+        self.conv_out = nn.Conv(
+            self.out_channels,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+    def __call__(
+        self,
+        sample: jnp.ndarray,
+        timesteps: Union[jnp.ndarray, float, int],
+        encoder_hidden_states: jnp.ndarray,
+        added_cond_kwargs: Optional[Union[Dict, FrozenDict]] = None,
+        down_block_additional_residuals: Optional[Tuple[jnp.ndarray, ...]] = None,
+        mid_block_additional_residual: Optional[jnp.ndarray] = None,
+        return_dict: bool = True,
+        train: bool = False,
+    ) -> Union[FlaxUNet2DConditionOutput, Tuple[jnp.ndarray]]:
+        r"""
+        Args:
+            sample (`jnp.ndarray`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`jnp.ndarray` or `float` or `int`): timesteps
+            encoder_hidden_states (`jnp.ndarray`): (batch_size, sequence_length, hidden_size) encoder hidden states
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of
+                a plain tuple.
+            train (`bool`, *optional*, defaults to `False`):
+                Use deterministic functions and disable dropout when not training.
+
+        Returns:
+            [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`:
+            [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is the sample tensor.
+        """
+        # 1. time
+        if not isinstance(timesteps, jnp.ndarray):
+            timesteps = jnp.array([timesteps], dtype=jnp.int32)
+        elif isinstance(timesteps, jnp.ndarray) and len(timesteps.shape) == 0:
+            timesteps = timesteps.astype(dtype=jnp.float32)
+            timesteps = jnp.expand_dims(timesteps, 0)
+
+        t_emb = self.time_proj(timesteps)
+        t_emb = self.time_embedding(t_emb)
+
+        # additional embeddings
+        aug_emb = None
+        if self.addition_embed_type == "text_time":
+            if added_cond_kwargs is None:
+                raise ValueError(
+                    f"Need to provide argument `added_cond_kwargs` for {self.__class__} when using `addition_embed_type={self.addition_embed_type}`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if text_embeds is None:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            if time_ids is None:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            # compute time embeds
+            time_embeds = self.add_time_proj(jnp.ravel(time_ids))  # (1, 6) => (6,) => (6, 256)
+            time_embeds = jnp.reshape(time_embeds, (text_embeds.shape[0], -1))
+            add_embeds = jnp.concatenate([text_embeds, time_embeds], axis=-1)
+            aug_emb = self.add_embedding(add_embeds)
+
+        t_emb = t_emb + aug_emb if aug_emb is not None else t_emb
+
+        # 2. pre-process
+        sample = jnp.transpose(sample, (0, 2, 3, 1))
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for down_block in self.down_blocks:
+            if isinstance(down_block, FlaxCrossAttnDownBlock2D):
+                sample, res_samples = down_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
+            else:
+                sample, res_samples = down_block(sample, t_emb, deterministic=not train)
+            down_block_res_samples += res_samples
+
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample += down_block_additional_residual
+                new_down_block_res_samples += (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
+
+        if mid_block_additional_residual is not None:
+            sample += mid_block_additional_residual
+
+        # 5. up
+        for up_block in self.up_blocks:
+            res_samples = down_block_res_samples[-(self.layers_per_block + 1) :]
+            down_block_res_samples = down_block_res_samples[: -(self.layers_per_block + 1)]
+            if isinstance(up_block, FlaxCrossAttnUpBlock2D):
+                sample = up_block(
+                    sample,
+                    temb=t_emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    res_hidden_states_tuple=res_samples,
+                    deterministic=not train,
+                )
+            else:
+                sample = up_block(sample, temb=t_emb, res_hidden_states_tuple=res_samples, deterministic=not train)
+
+        # 6. post-process
+        sample = self.conv_norm_out(sample)
+        sample = nn.silu(sample)
+        sample = self.conv_out(sample)
+        sample = jnp.transpose(sample, (0, 3, 1, 2))
+
+        if not return_dict:
+            return (sample,)
+
+        return FlaxUNet2DConditionOutput(sample=sample)
diff --git a/diffusers/src/diffusers/models/unets/unet_3d_blocks.py b/diffusers/src/diffusers/models/unets/unet_3d_blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b472a89e13dec1716c33646ad6a2e385d5543e0
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_3d_blocks.py
@@ -0,0 +1,1538 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from ...utils import deprecate, is_torch_version, logging
+from ...utils.torch_utils import apply_freeu
+from ..attention import Attention
+from ..resnet import (
+    Downsample2D,
+    ResnetBlock2D,
+    SpatioTemporalResBlock,
+    TemporalConvLayer,
+    Upsample2D,
+)
+from ..transformers.transformer_2d import Transformer2DModel
+from ..transformers.transformer_temporal import (
+    TransformerSpatioTemporalModel,
+    TransformerTemporalModel,
+)
+from .unet_motion_model import (
+    CrossAttnDownBlockMotion,
+    CrossAttnUpBlockMotion,
+    DownBlockMotion,
+    UNetMidBlockCrossAttnMotion,
+    UpBlockMotion,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class DownBlockMotion(DownBlockMotion):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `DownBlockMotion` from `diffusers.models.unets.unet_3d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_motion_model import DownBlockMotion` instead."
+        deprecate("DownBlockMotion", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+class CrossAttnDownBlockMotion(CrossAttnDownBlockMotion):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `CrossAttnDownBlockMotion` from `diffusers.models.unets.unet_3d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_motion_model import CrossAttnDownBlockMotion` instead."
+        deprecate("CrossAttnDownBlockMotion", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+class UpBlockMotion(UpBlockMotion):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `UpBlockMotion` from `diffusers.models.unets.unet_3d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_motion_model import UpBlockMotion` instead."
+        deprecate("UpBlockMotion", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+class CrossAttnUpBlockMotion(CrossAttnUpBlockMotion):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `CrossAttnUpBlockMotion` from `diffusers.models.unets.unet_3d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_motion_model import CrossAttnUpBlockMotion` instead."
+        deprecate("CrossAttnUpBlockMotion", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+class UNetMidBlockCrossAttnMotion(UNetMidBlockCrossAttnMotion):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `UNetMidBlockCrossAttnMotion` from `diffusers.models.unets.unet_3d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_motion_model import UNetMidBlockCrossAttnMotion` instead."
+        deprecate("UNetMidBlockCrossAttnMotion", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+def get_down_block(
+    down_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    temb_channels: int,
+    add_downsample: bool,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    num_attention_heads: int,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    downsample_padding: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = True,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    temporal_num_attention_heads: int = 8,
+    temporal_max_seq_length: int = 32,
+    transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+    temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+    dropout: float = 0.0,
+) -> Union[
+    "DownBlock3D",
+    "CrossAttnDownBlock3D",
+    "DownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+]:
+    if down_block_type == "DownBlock3D":
+        return DownBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            dropout=dropout,
+        )
+    elif down_block_type == "CrossAttnDownBlock3D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
+        return CrossAttnDownBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            dropout=dropout,
+        )
+    elif down_block_type == "DownBlockSpatioTemporal":
+        # added for SDV
+        return DownBlockSpatioTemporal(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+        )
+    elif down_block_type == "CrossAttnDownBlockSpatioTemporal":
+        # added for SDV
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockSpatioTemporal")
+        return CrossAttnDownBlockSpatioTemporal(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            add_downsample=add_downsample,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+        )
+
+    raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(
+    up_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    prev_output_channel: int,
+    temb_channels: int,
+    add_upsample: bool,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    num_attention_heads: int,
+    resolution_idx: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = True,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    temporal_num_attention_heads: int = 8,
+    temporal_cross_attention_dim: Optional[int] = None,
+    temporal_max_seq_length: int = 32,
+    transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+    temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+    dropout: float = 0.0,
+) -> Union[
+    "UpBlock3D",
+    "CrossAttnUpBlock3D",
+    "UpBlockSpatioTemporal",
+    "CrossAttnUpBlockSpatioTemporal",
+]:
+    if up_block_type == "UpBlock3D":
+        return UpBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+        )
+    elif up_block_type == "CrossAttnUpBlock3D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
+        return CrossAttnUpBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+        )
+    elif up_block_type == "UpBlockSpatioTemporal":
+        # added for SDV
+        return UpBlockSpatioTemporal(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            add_upsample=add_upsample,
+        )
+    elif up_block_type == "CrossAttnUpBlockSpatioTemporal":
+        # added for SDV
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockSpatioTemporal")
+        return CrossAttnUpBlockSpatioTemporal(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            add_upsample=add_upsample,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            resolution_idx=resolution_idx,
+        )
+
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+class UNetMidBlock3DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = True,
+        upcast_attention: bool = False,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        temp_convs = [
+            TemporalConvLayer(
+                in_channels,
+                in_channels,
+                dropout=0.1,
+                norm_num_groups=resnet_groups,
+            )
+        ]
+        attentions = []
+        temp_attentions = []
+
+        for _ in range(num_layers):
+            attentions.append(
+                Transformer2DModel(
+                    in_channels // num_attention_heads,
+                    num_attention_heads,
+                    in_channels=in_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    upcast_attention=upcast_attention,
+                )
+            )
+            temp_attentions.append(
+                TransformerTemporalModel(
+                    in_channels // num_attention_heads,
+                    num_attention_heads,
+                    in_channels=in_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            temp_convs.append(
+                TemporalConvLayer(
+                    in_channels,
+                    in_channels,
+                    dropout=0.1,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.temp_convs = nn.ModuleList(temp_convs)
+        self.attentions = nn.ModuleList(attentions)
+        self.temp_attentions = nn.ModuleList(temp_attentions)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        num_frames: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.resnets[0](hidden_states, temb)
+        hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames)
+        for attn, temp_attn, resnet, temp_conv in zip(
+            self.attentions, self.temp_attentions, self.resnets[1:], self.temp_convs[1:]
+        ):
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                return_dict=False,
+            )[0]
+            hidden_states = temp_attn(
+                hidden_states,
+                num_frames=num_frames,
+                cross_attention_kwargs=cross_attention_kwargs,
+                return_dict=False,
+            )[0]
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = temp_conv(hidden_states, num_frames=num_frames)
+
+        return hidden_states
+
+
+class CrossAttnDownBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        add_downsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        temp_attentions = []
+        temp_convs = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            temp_convs.append(
+                TemporalConvLayer(
+                    out_channels,
+                    out_channels,
+                    dropout=0.1,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+            attentions.append(
+                Transformer2DModel(
+                    out_channels // num_attention_heads,
+                    num_attention_heads,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                )
+            )
+            temp_attentions.append(
+                TransformerTemporalModel(
+                    out_channels // num_attention_heads,
+                    num_attention_heads,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.temp_convs = nn.ModuleList(temp_convs)
+        self.attentions = nn.ModuleList(attentions)
+        self.temp_attentions = nn.ModuleList(temp_attentions)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        num_frames: int = 1,
+        cross_attention_kwargs: Dict[str, Any] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        # TODO(Patrick, William) - attention mask is not used
+        output_states = ()
+
+        for resnet, temp_conv, attn, temp_attn in zip(
+            self.resnets, self.temp_convs, self.attentions, self.temp_attentions
+        ):
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = temp_conv(hidden_states, num_frames=num_frames)
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                return_dict=False,
+            )[0]
+            hidden_states = temp_attn(
+                hidden_states,
+                num_frames=num_frames,
+                cross_attention_kwargs=cross_attention_kwargs,
+                return_dict=False,
+            )[0]
+
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class DownBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+        temp_convs = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            temp_convs.append(
+                TemporalConvLayer(
+                    out_channels,
+                    out_channels,
+                    dropout=0.1,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.temp_convs = nn.ModuleList(temp_convs)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        num_frames: int = 1,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        output_states = ()
+
+        for resnet, temp_conv in zip(self.resnets, self.temp_convs):
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = temp_conv(hidden_states, num_frames=num_frames)
+
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnUpBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        resolution_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        resnets = []
+        temp_convs = []
+        attentions = []
+        temp_attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            temp_convs.append(
+                TemporalConvLayer(
+                    out_channels,
+                    out_channels,
+                    dropout=0.1,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+            attentions.append(
+                Transformer2DModel(
+                    out_channels // num_attention_heads,
+                    num_attention_heads,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                )
+            )
+            temp_attentions.append(
+                TransformerTemporalModel(
+                    out_channels // num_attention_heads,
+                    num_attention_heads,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.temp_convs = nn.ModuleList(temp_convs)
+        self.attentions = nn.ModuleList(attentions)
+        self.temp_attentions = nn.ModuleList(temp_attentions)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        num_frames: int = 1,
+        cross_attention_kwargs: Dict[str, Any] = None,
+    ) -> torch.Tensor:
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        # TODO(Patrick, William) - attention mask is not used
+        for resnet, temp_conv, attn, temp_attn in zip(
+            self.resnets, self.temp_convs, self.attentions, self.temp_attentions
+        ):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = temp_conv(hidden_states, num_frames=num_frames)
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                return_dict=False,
+            )[0]
+            hidden_states = temp_attn(
+                hidden_states,
+                num_frames=num_frames,
+                cross_attention_kwargs=cross_attention_kwargs,
+                return_dict=False,
+            )[0]
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+
+        return hidden_states
+
+
+class UpBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        resolution_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        resnets = []
+        temp_convs = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            temp_convs.append(
+                TemporalConvLayer(
+                    out_channels,
+                    out_channels,
+                    dropout=0.1,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.temp_convs = nn.ModuleList(temp_convs)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        upsample_size: Optional[int] = None,
+        num_frames: int = 1,
+    ) -> torch.Tensor:
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+        for resnet, temp_conv in zip(self.resnets, self.temp_convs):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = temp_conv(hidden_states, num_frames=num_frames)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+
+        return hidden_states
+
+
+class MidBlockTemporalDecoder(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        attention_head_dim: int = 512,
+        num_layers: int = 1,
+        upcast_attention: bool = False,
+    ):
+        super().__init__()
+
+        resnets = []
+        attentions = []
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                SpatioTemporalResBlock(
+                    in_channels=input_channels,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=1e-6,
+                    temporal_eps=1e-5,
+                    merge_factor=0.0,
+                    merge_strategy="learned",
+                    switch_spatial_to_temporal_mix=True,
+                )
+            )
+
+        attentions.append(
+            Attention(
+                query_dim=in_channels,
+                heads=in_channels // attention_head_dim,
+                dim_head=attention_head_dim,
+                eps=1e-6,
+                upcast_attention=upcast_attention,
+                norm_num_groups=32,
+                bias=True,
+                residual_connection=True,
+            )
+        )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        image_only_indicator: torch.Tensor,
+    ):
+        hidden_states = self.resnets[0](
+            hidden_states,
+            image_only_indicator=image_only_indicator,
+        )
+        for resnet, attn in zip(self.resnets[1:], self.attentions):
+            hidden_states = attn(hidden_states)
+            hidden_states = resnet(
+                hidden_states,
+                image_only_indicator=image_only_indicator,
+            )
+
+        return hidden_states
+
+
+class UpBlockTemporalDecoder(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_layers: int = 1,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            resnets.append(
+                SpatioTemporalResBlock(
+                    in_channels=input_channels,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=1e-6,
+                    temporal_eps=1e-5,
+                    merge_factor=0.0,
+                    merge_strategy="learned",
+                    switch_spatial_to_temporal_mix=True,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        image_only_indicator: torch.Tensor,
+    ) -> torch.Tensor:
+        for resnet in self.resnets:
+            hidden_states = resnet(
+                hidden_states,
+                image_only_indicator=image_only_indicator,
+            )
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class UNetMidBlockSpatioTemporal(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        # support for variable transformer layers per block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        # there is always at least one resnet
+        resnets = [
+            SpatioTemporalResBlock(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=1e-5,
+            )
+        ]
+        attentions = []
+
+        for i in range(num_layers):
+            attentions.append(
+                TransformerSpatioTemporalModel(
+                    num_attention_heads,
+                    in_channels // num_attention_heads,
+                    in_channels=in_channels,
+                    num_layers=transformer_layers_per_block[i],
+                    cross_attention_dim=cross_attention_dim,
+                )
+            )
+
+            resnets.append(
+                SpatioTemporalResBlock(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=1e-5,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.resnets[0](
+            hidden_states,
+            temb,
+            image_only_indicator=image_only_indicator,
+        )
+
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if self.training and self.gradient_checkpointing:  # TODO
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    image_only_indicator=image_only_indicator,
+                    return_dict=False,
+                )[0]
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    image_only_indicator,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    image_only_indicator=image_only_indicator,
+                    return_dict=False,
+                )[0]
+                hidden_states = resnet(
+                    hidden_states,
+                    temb,
+                    image_only_indicator=image_only_indicator,
+                )
+
+        return hidden_states
+
+
+class DownBlockSpatioTemporal(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        num_layers: int = 1,
+        add_downsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                SpatioTemporalResBlock(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=1e-5,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        output_states = ()
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet),
+                        hidden_states,
+                        temb,
+                        image_only_indicator,
+                        use_reentrant=False,
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet),
+                        hidden_states,
+                        temb,
+                        image_only_indicator,
+                    )
+            else:
+                hidden_states = resnet(
+                    hidden_states,
+                    temb,
+                    image_only_indicator=image_only_indicator,
+                )
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnDownBlockSpatioTemporal(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        add_downsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                SpatioTemporalResBlock(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=1e-6,
+                )
+            )
+            attentions.append(
+                TransformerSpatioTemporalModel(
+                    num_attention_heads,
+                    out_channels // num_attention_heads,
+                    in_channels=out_channels,
+                    num_layers=transformer_layers_per_block[i],
+                    cross_attention_dim=cross_attention_dim,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=1,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        output_states = ()
+
+        blocks = list(zip(self.resnets, self.attentions))
+        for resnet, attn in blocks:
+            if self.training and self.gradient_checkpointing:  # TODO
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    image_only_indicator,
+                    **ckpt_kwargs,
+                )
+
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    image_only_indicator=image_only_indicator,
+                    return_dict=False,
+                )[0]
+            else:
+                hidden_states = resnet(
+                    hidden_states,
+                    temb,
+                    image_only_indicator=image_only_indicator,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    image_only_indicator=image_only_indicator,
+                    return_dict=False,
+                )[0]
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class UpBlockSpatioTemporal(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                SpatioTemporalResBlock(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet),
+                        hidden_states,
+                        temb,
+                        image_only_indicator,
+                        use_reentrant=False,
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet),
+                        hidden_states,
+                        temb,
+                        image_only_indicator,
+                    )
+            else:
+                hidden_states = resnet(
+                    hidden_states,
+                    temb,
+                    image_only_indicator=image_only_indicator,
+                )
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class CrossAttnUpBlockSpatioTemporal(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                SpatioTemporalResBlock(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                )
+            )
+            attentions.append(
+                TransformerSpatioTemporalModel(
+                    num_attention_heads,
+                    out_channels // num_attention_heads,
+                    in_channels=out_channels,
+                    num_layers=transformer_layers_per_block[i],
+                    cross_attention_dim=cross_attention_dim,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:  # TODO
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    image_only_indicator,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    image_only_indicator=image_only_indicator,
+                    return_dict=False,
+                )[0]
+            else:
+                hidden_states = resnet(
+                    hidden_states,
+                    temb,
+                    image_only_indicator=image_only_indicator,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    image_only_indicator=image_only_indicator,
+                    return_dict=False,
+                )[0]
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
diff --git a/diffusers/src/diffusers/models/unets/unet_3d_condition.py b/diffusers/src/diffusers/models/unets/unet_3d_condition.py
new file mode 100644
index 0000000000000000000000000000000000000000..3081fdc4700c3318fe12a9e65020b13dc34d0466
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_3d_condition.py
@@ -0,0 +1,748 @@
+# Copyright 2024 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
+# Copyright 2024 The ModelScope Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import UNet2DConditionLoadersMixin
+from ...utils import BaseOutput, logging
+from ..activations import get_activation
+from ..attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+    FusedAttnProcessor2_0,
+)
+from ..embeddings import TimestepEmbedding, Timesteps
+from ..modeling_utils import ModelMixin
+from ..transformers.transformer_temporal import TransformerTemporalModel
+from .unet_3d_blocks import (
+    CrossAttnDownBlock3D,
+    CrossAttnUpBlock3D,
+    DownBlock3D,
+    UNetMidBlock3DCrossAttn,
+    UpBlock3D,
+    get_down_block,
+    get_up_block,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class UNet3DConditionOutput(BaseOutput):
+    """
+    The output of [`UNet3DConditionModel`].
+
+    Args:
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
+    """
+
+    sample: torch.Tensor
+
+
+class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    A conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D")`):
+            The tuple of upsample blocks to use.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, normalization and activation layers is skipped in post-processing.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int`, *optional*, defaults to 1024): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 64): The dimension of the attention heads.
+        num_attention_heads (`int`, *optional*): The number of attention heads.
+        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
+            The dimension of `cond_proj` layer in the timestep embedding.
+    """
+
+    _supports_gradient_checkpointing = False
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        down_block_types: Tuple[str, ...] = (
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "DownBlock3D",
+        ),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+        ),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1024,
+        attention_head_dim: Union[int, Tuple[int]] = 64,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+        time_cond_proj_dim: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+
+        if num_attention_heads is not None:
+            raise NotImplementedError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        # input
+        conv_in_kernel = 3
+        conv_out_kernel = 3
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        # time
+        time_embed_dim = block_out_channels[0] * 4
+        self.time_proj = Timesteps(block_out_channels[0], True, 0)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+
+        self.transformer_in = TransformerTemporalModel(
+            num_attention_heads=8,
+            attention_head_dim=attention_head_dim,
+            in_channels=block_out_channels[0],
+            num_layers=1,
+            norm_num_groups=norm_num_groups,
+        )
+
+        # class embedding
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=num_attention_heads[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=False,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlock3DCrossAttn(
+            in_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads[-1],
+            resnet_groups=norm_num_groups,
+            dual_cross_attention=False,
+        )
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=reversed_num_attention_heads[i],
+                dual_cross_attention=False,
+                resolution_idx=i,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+            )
+            self.conv_act = get_activation("silu")
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
+    def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+
+        num_sliceable_layers = len(sliceable_head_dims)
+
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
+        """
+        Sets the attention processor to use [feed forward
+        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+
+        Parameters:
+            chunk_size (`int`, *optional*):
+                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
+                over each tensor of dim=`dim`.
+            dim (`int`, *optional*, defaults to `0`):
+                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
+                or dim=1 (sequence length).
+        """
+        if dim not in [0, 1]:
+            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
+
+        # By default chunk size is 1
+        chunk_size = chunk_size or 1
+
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, chunk_size, dim)
+
+    def disable_forward_chunking(self):
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, None, 0)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
+        if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
+            module.gradient_checkpointing = value
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.enable_freeu
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.disable_freeu
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]:
+        r"""
+        The [`UNet3DConditionModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor with the following shape `(batch, num_channels, num_frames, height, width`.
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+                tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+
+        Returns:
+            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        num_frames = sample.shape[2]
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        emb = emb.repeat_interleave(repeats=num_frames, dim=0)
+        encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)
+
+        # 2. pre-process
+        sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:])
+        sample = self.conv_in(sample)
+
+        sample = self.transformer_in(
+            sample,
+            num_frames=num_frames,
+            cross_attention_kwargs=cross_attention_kwargs,
+            return_dict=False,
+        )[0]
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)
+
+            down_block_res_samples += res_samples
+
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples += (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            sample = self.mid_block(
+                sample,
+                emb,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                num_frames=num_frames,
+                cross_attention_kwargs=cross_attention_kwargs,
+            )
+
+        if mid_block_additional_residual is not None:
+            sample = sample + mid_block_additional_residual
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    num_frames=num_frames,
+                )
+
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+
+        sample = self.conv_out(sample)
+
+        # reshape to (batch, channel, framerate, width, height)
+        sample = sample[None, :].reshape((-1, num_frames) + sample.shape[1:]).permute(0, 2, 1, 3, 4)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet3DConditionOutput(sample=sample)
diff --git a/diffusers/src/diffusers/models/unets/unet_i2vgen_xl.py b/diffusers/src/diffusers/models/unets/unet_i2vgen_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ab3a577b892da3a18645e24245cc85cff1a16ce
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_i2vgen_xl.py
@@ -0,0 +1,728 @@
+# Copyright 2024 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import UNet2DConditionLoadersMixin
+from ...utils import logging
+from ..activations import get_activation
+from ..attention import Attention, FeedForward
+from ..attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+    FusedAttnProcessor2_0,
+)
+from ..embeddings import TimestepEmbedding, Timesteps
+from ..modeling_utils import ModelMixin
+from ..transformers.transformer_temporal import TransformerTemporalModel
+from .unet_3d_blocks import (
+    CrossAttnDownBlock3D,
+    CrossAttnUpBlock3D,
+    DownBlock3D,
+    UNetMidBlock3DCrossAttn,
+    UpBlock3D,
+    get_down_block,
+    get_up_block,
+)
+from .unet_3d_condition import UNet3DConditionOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class I2VGenXLTransformerTemporalEncoder(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        activation_fn: str = "geglu",
+        upcast_attention: bool = False,
+        ff_inner_dim: Optional[int] = None,
+        dropout: int = 0.0,
+    ):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(dim, elementwise_affine=True, eps=1e-5)
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=False,
+            upcast_attention=upcast_attention,
+            out_bias=True,
+        )
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=False,
+            inner_dim=ff_inner_dim,
+            bias=True,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        norm_hidden_states = self.norm1(hidden_states)
+        attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
+        hidden_states = attn_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+
+        ff_output = self.ff(hidden_states)
+        hidden_states = ff_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+
+        return hidden_states
+
+
+class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and
+    returns a sample-shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
+            The tuple of upsample blocks to use.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, normalization and activation layers is skipped in post-processing.
+        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 64): Attention head dim.
+        num_attention_heads (`int`, *optional*): The number of attention heads.
+    """
+
+    _supports_gradient_checkpointing = False
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        down_block_types: Tuple[str, ...] = (
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "DownBlock3D",
+        ),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+        ),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        norm_num_groups: Optional[int] = 32,
+        cross_attention_dim: int = 1024,
+        attention_head_dim: Union[int, Tuple[int]] = 64,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+    ):
+        super().__init__()
+
+        # When we first integrated the UNet into the library, we didn't have `attention_head_dim`. As a consequence
+        # of that, we used `num_attention_heads` for arguments that actually denote attention head dimension. This
+        # is why we ignore `num_attention_heads` and calculate it from `attention_head_dims` below.
+        # This is still an incorrect way of calculating `num_attention_heads` but we need to stick to it
+        # without running proper depcrecation cycles for the {down,mid,up} blocks which are a
+        # part of the public API.
+        num_attention_heads = attention_head_dim
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        # input
+        self.conv_in = nn.Conv2d(in_channels + in_channels, block_out_channels[0], kernel_size=3, padding=1)
+
+        self.transformer_in = TransformerTemporalModel(
+            num_attention_heads=8,
+            attention_head_dim=num_attention_heads,
+            in_channels=block_out_channels[0],
+            num_layers=1,
+            norm_num_groups=norm_num_groups,
+        )
+
+        # image embedding
+        self.image_latents_proj_in = nn.Sequential(
+            nn.Conv2d(4, in_channels * 4, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(in_channels * 4, in_channels * 4, 3, stride=1, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(in_channels * 4, in_channels, 3, stride=1, padding=1),
+        )
+        self.image_latents_temporal_encoder = I2VGenXLTransformerTemporalEncoder(
+            dim=in_channels,
+            num_attention_heads=2,
+            ff_inner_dim=in_channels * 4,
+            attention_head_dim=in_channels,
+            activation_fn="gelu",
+        )
+        self.image_latents_context_embedding = nn.Sequential(
+            nn.Conv2d(4, in_channels * 8, 3, padding=1),
+            nn.SiLU(),
+            nn.AdaptiveAvgPool2d((32, 32)),
+            nn.Conv2d(in_channels * 8, in_channels * 16, 3, stride=2, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(in_channels * 16, cross_attention_dim, 3, stride=2, padding=1),
+        )
+
+        # other embeddings -- time, context, fps, etc.
+        time_embed_dim = block_out_channels[0] * 4
+        self.time_proj = Timesteps(block_out_channels[0], True, 0)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn="silu")
+        self.context_embedding = nn.Sequential(
+            nn.Linear(cross_attention_dim, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, cross_attention_dim * in_channels),
+        )
+        self.fps_embedding = nn.Sequential(
+            nn.Linear(timestep_input_dim, time_embed_dim), nn.SiLU(), nn.Linear(time_embed_dim, time_embed_dim)
+        )
+
+        # blocks
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=1e-05,
+                resnet_act_fn="silu",
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=num_attention_heads[i],
+                downsample_padding=1,
+                dual_cross_attention=False,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlock3DCrossAttn(
+            in_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            resnet_eps=1e-05,
+            resnet_act_fn="silu",
+            output_scale_factor=1,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads[-1],
+            resnet_groups=norm_num_groups,
+            dual_cross_attention=False,
+        )
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=1e-05,
+                resnet_act_fn="silu",
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=reversed_num_attention_heads[i],
+                dual_cross_attention=False,
+                resolution_idx=i,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-05)
+        self.conv_act = get_activation("silu")
+        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
+    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
+        """
+        Sets the attention processor to use [feed forward
+        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+
+        Parameters:
+            chunk_size (`int`, *optional*):
+                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
+                over each tensor of dim=`dim`.
+            dim (`int`, *optional*, defaults to `0`):
+                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
+                or dim=1 (sequence length).
+        """
+        if dim not in [0, 1]:
+            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
+
+        # By default chunk size is 1
+        chunk_size = chunk_size or 1
+
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, chunk_size, dim)
+
+    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.disable_forward_chunking
+    def disable_forward_chunking(self):
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, None, 0)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel._set_gradient_checkpointing
+    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
+        if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
+            module.gradient_checkpointing = value
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.enable_freeu
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.disable_freeu
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        fps: torch.Tensor,
+        image_latents: torch.Tensor,
+        image_embeddings: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]:
+        r"""
+        The [`I2VGenXLUNet`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition".
+            image_latents (`torch.Tensor`): Image encodings from the VAE.
+            image_embeddings (`torch.Tensor`):
+                Projection embeddings of the conditioning image computed with a vision encoder.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
+        """
+        batch_size, channels, num_frames, height, width = sample.shape
+
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass `timesteps` as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timesteps, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+        t_emb = self.time_embedding(t_emb, timestep_cond)
+
+        # 2. FPS
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        fps = fps.expand(fps.shape[0])
+        fps_emb = self.fps_embedding(self.time_proj(fps).to(dtype=self.dtype))
+
+        # 3. time + FPS embeddings.
+        emb = t_emb + fps_emb
+        emb = emb.repeat_interleave(repeats=num_frames, dim=0)
+
+        # 4. context embeddings.
+        # The context embeddings consist of both text embeddings from the input prompt
+        # AND the image embeddings from the input image. For images, both VAE encodings
+        # and the CLIP image embeddings are incorporated.
+        # So the final `context_embeddings` becomes the query for cross-attention.
+        context_emb = sample.new_zeros(batch_size, 0, self.config.cross_attention_dim)
+        context_emb = torch.cat([context_emb, encoder_hidden_states], dim=1)
+
+        image_latents_for_context_embds = image_latents[:, :, :1, :]
+        image_latents_context_embs = image_latents_for_context_embds.permute(0, 2, 1, 3, 4).reshape(
+            image_latents_for_context_embds.shape[0] * image_latents_for_context_embds.shape[2],
+            image_latents_for_context_embds.shape[1],
+            image_latents_for_context_embds.shape[3],
+            image_latents_for_context_embds.shape[4],
+        )
+        image_latents_context_embs = self.image_latents_context_embedding(image_latents_context_embs)
+
+        _batch_size, _channels, _height, _width = image_latents_context_embs.shape
+        image_latents_context_embs = image_latents_context_embs.permute(0, 2, 3, 1).reshape(
+            _batch_size, _height * _width, _channels
+        )
+        context_emb = torch.cat([context_emb, image_latents_context_embs], dim=1)
+
+        image_emb = self.context_embedding(image_embeddings)
+        image_emb = image_emb.view(-1, self.config.in_channels, self.config.cross_attention_dim)
+        context_emb = torch.cat([context_emb, image_emb], dim=1)
+        context_emb = context_emb.repeat_interleave(repeats=num_frames, dim=0)
+
+        image_latents = image_latents.permute(0, 2, 1, 3, 4).reshape(
+            image_latents.shape[0] * image_latents.shape[2],
+            image_latents.shape[1],
+            image_latents.shape[3],
+            image_latents.shape[4],
+        )
+        image_latents = self.image_latents_proj_in(image_latents)
+        image_latents = (
+            image_latents[None, :]
+            .reshape(batch_size, num_frames, channels, height, width)
+            .permute(0, 3, 4, 1, 2)
+            .reshape(batch_size * height * width, num_frames, channels)
+        )
+        image_latents = self.image_latents_temporal_encoder(image_latents)
+        image_latents = image_latents.reshape(batch_size, height, width, num_frames, channels).permute(0, 4, 3, 1, 2)
+
+        # 5. pre-process
+        sample = torch.cat([sample, image_latents], dim=1)
+        sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:])
+        sample = self.conv_in(sample)
+        sample = self.transformer_in(
+            sample,
+            num_frames=num_frames,
+            cross_attention_kwargs=cross_attention_kwargs,
+            return_dict=False,
+        )[0]
+
+        # 6. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=context_emb,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)
+
+            down_block_res_samples += res_samples
+
+        # 7. mid
+        if self.mid_block is not None:
+            sample = self.mid_block(
+                sample,
+                emb,
+                encoder_hidden_states=context_emb,
+                num_frames=num_frames,
+                cross_attention_kwargs=cross_attention_kwargs,
+            )
+        # 8. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=context_emb,
+                    upsample_size=upsample_size,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    num_frames=num_frames,
+                )
+
+        # 9. post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+
+        sample = self.conv_out(sample)
+
+        # reshape to (batch, channel, framerate, width, height)
+        sample = sample[None, :].reshape((-1, num_frames) + sample.shape[1:]).permute(0, 2, 1, 3, 4)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet3DConditionOutput(sample=sample)
diff --git a/diffusers/src/diffusers/models/unets/unet_kandinsky3.py b/diffusers/src/diffusers/models/unets/unet_kandinsky3.py
new file mode 100644
index 0000000000000000000000000000000000000000..f611e7d82b1d9f1bebf1f228b9e75d532852b476
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_kandinsky3.py
@@ -0,0 +1,535 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Dict, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput, logging
+from ..attention_processor import Attention, AttentionProcessor, AttnProcessor
+from ..embeddings import TimestepEmbedding, Timesteps
+from ..modeling_utils import ModelMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class Kandinsky3UNetOutput(BaseOutput):
+    sample: torch.Tensor = None
+
+
+class Kandinsky3EncoderProj(nn.Module):
+    def __init__(self, encoder_hid_dim, cross_attention_dim):
+        super().__init__()
+        self.projection_linear = nn.Linear(encoder_hid_dim, cross_attention_dim, bias=False)
+        self.projection_norm = nn.LayerNorm(cross_attention_dim)
+
+    def forward(self, x):
+        x = self.projection_linear(x)
+        x = self.projection_norm(x)
+        return x
+
+
+class Kandinsky3UNet(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 4,
+        time_embedding_dim: int = 1536,
+        groups: int = 32,
+        attention_head_dim: int = 64,
+        layers_per_block: Union[int, Tuple[int]] = 3,
+        block_out_channels: Tuple[int] = (384, 768, 1536, 3072),
+        cross_attention_dim: Union[int, Tuple[int]] = 4096,
+        encoder_hid_dim: int = 4096,
+    ):
+        super().__init__()
+
+        # TODO(Yiyi): Give better name and put into config for the following 4 parameters
+        expansion_ratio = 4
+        compression_ratio = 2
+        add_cross_attention = (False, True, True, True)
+        add_self_attention = (False, True, True, True)
+
+        out_channels = in_channels
+        init_channels = block_out_channels[0] // 2
+        self.time_proj = Timesteps(init_channels, flip_sin_to_cos=False, downscale_freq_shift=1)
+
+        self.time_embedding = TimestepEmbedding(
+            init_channels,
+            time_embedding_dim,
+        )
+
+        self.add_time_condition = Kandinsky3AttentionPooling(
+            time_embedding_dim, cross_attention_dim, attention_head_dim
+        )
+
+        self.conv_in = nn.Conv2d(in_channels, init_channels, kernel_size=3, padding=1)
+
+        self.encoder_hid_proj = Kandinsky3EncoderProj(encoder_hid_dim, cross_attention_dim)
+
+        hidden_dims = [init_channels] + list(block_out_channels)
+        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
+        text_dims = [cross_attention_dim if is_exist else None for is_exist in add_cross_attention]
+        num_blocks = len(block_out_channels) * [layers_per_block]
+        layer_params = [num_blocks, text_dims, add_self_attention]
+        rev_layer_params = map(reversed, layer_params)
+
+        cat_dims = []
+        self.num_levels = len(in_out_dims)
+        self.down_blocks = nn.ModuleList([])
+        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(
+            zip(in_out_dims, *layer_params)
+        ):
+            down_sample = level != (self.num_levels - 1)
+            cat_dims.append(out_dim if level != (self.num_levels - 1) else 0)
+            self.down_blocks.append(
+                Kandinsky3DownSampleBlock(
+                    in_dim,
+                    out_dim,
+                    time_embedding_dim,
+                    text_dim,
+                    res_block_num,
+                    groups,
+                    attention_head_dim,
+                    expansion_ratio,
+                    compression_ratio,
+                    down_sample,
+                    self_attention,
+                )
+            )
+
+        self.up_blocks = nn.ModuleList([])
+        for level, ((out_dim, in_dim), res_block_num, text_dim, self_attention) in enumerate(
+            zip(reversed(in_out_dims), *rev_layer_params)
+        ):
+            up_sample = level != 0
+            self.up_blocks.append(
+                Kandinsky3UpSampleBlock(
+                    in_dim,
+                    cat_dims.pop(),
+                    out_dim,
+                    time_embedding_dim,
+                    text_dim,
+                    res_block_num,
+                    groups,
+                    attention_head_dim,
+                    expansion_ratio,
+                    compression_ratio,
+                    up_sample,
+                    self_attention,
+                )
+            )
+
+        self.conv_norm_out = nn.GroupNorm(groups, init_channels)
+        self.conv_act_out = nn.SiLU()
+        self.conv_out = nn.Conv2d(init_channels, out_channels, kernel_size=3, padding=1)
+
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "set_processor"):
+                processors[f"{name}.processor"] = module.processor
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        self.set_attn_processor(AttnProcessor())
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(self, sample, timestep, encoder_hidden_states=None, encoder_attention_mask=None, return_dict=True):
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        if not torch.is_tensor(timestep):
+            dtype = torch.float32 if isinstance(timestep, float) else torch.int32
+            timestep = torch.tensor([timestep], dtype=dtype, device=sample.device)
+        elif len(timestep.shape) == 0:
+            timestep = timestep[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timestep = timestep.expand(sample.shape[0])
+        time_embed_input = self.time_proj(timestep).to(sample.dtype)
+        time_embed = self.time_embedding(time_embed_input)
+
+        encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+
+        if encoder_hidden_states is not None:
+            time_embed = self.add_time_condition(time_embed, encoder_hidden_states, encoder_attention_mask)
+
+        hidden_states = []
+        sample = self.conv_in(sample)
+        for level, down_sample in enumerate(self.down_blocks):
+            sample = down_sample(sample, time_embed, encoder_hidden_states, encoder_attention_mask)
+            if level != self.num_levels - 1:
+                hidden_states.append(sample)
+
+        for level, up_sample in enumerate(self.up_blocks):
+            if level != 0:
+                sample = torch.cat([sample, hidden_states.pop()], dim=1)
+            sample = up_sample(sample, time_embed, encoder_hidden_states, encoder_attention_mask)
+
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act_out(sample)
+        sample = self.conv_out(sample)
+
+        if not return_dict:
+            return (sample,)
+        return Kandinsky3UNetOutput(sample=sample)
+
+
+class Kandinsky3UpSampleBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        cat_dim,
+        out_channels,
+        time_embed_dim,
+        context_dim=None,
+        num_blocks=3,
+        groups=32,
+        head_dim=64,
+        expansion_ratio=4,
+        compression_ratio=2,
+        up_sample=True,
+        self_attention=True,
+    ):
+        super().__init__()
+        up_resolutions = [[None, True if up_sample else None, None, None]] + [[None] * 4] * (num_blocks - 1)
+        hidden_channels = (
+            [(in_channels + cat_dim, in_channels)]
+            + [(in_channels, in_channels)] * (num_blocks - 2)
+            + [(in_channels, out_channels)]
+        )
+        attentions = []
+        resnets_in = []
+        resnets_out = []
+
+        self.self_attention = self_attention
+        self.context_dim = context_dim
+
+        if self_attention:
+            attentions.append(
+                Kandinsky3AttentionBlock(out_channels, time_embed_dim, None, groups, head_dim, expansion_ratio)
+            )
+        else:
+            attentions.append(nn.Identity())
+
+        for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions):
+            resnets_in.append(
+                Kandinsky3ResNetBlock(in_channel, in_channel, time_embed_dim, groups, compression_ratio, up_resolution)
+            )
+
+            if context_dim is not None:
+                attentions.append(
+                    Kandinsky3AttentionBlock(
+                        in_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio
+                    )
+                )
+            else:
+                attentions.append(nn.Identity())
+
+            resnets_out.append(
+                Kandinsky3ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio)
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets_in = nn.ModuleList(resnets_in)
+        self.resnets_out = nn.ModuleList(resnets_out)
+
+    def forward(self, x, time_embed, context=None, context_mask=None, image_mask=None):
+        for attention, resnet_in, resnet_out in zip(self.attentions[1:], self.resnets_in, self.resnets_out):
+            x = resnet_in(x, time_embed)
+            if self.context_dim is not None:
+                x = attention(x, time_embed, context, context_mask, image_mask)
+            x = resnet_out(x, time_embed)
+
+        if self.self_attention:
+            x = self.attentions[0](x, time_embed, image_mask=image_mask)
+        return x
+
+
+class Kandinsky3DownSampleBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        time_embed_dim,
+        context_dim=None,
+        num_blocks=3,
+        groups=32,
+        head_dim=64,
+        expansion_ratio=4,
+        compression_ratio=2,
+        down_sample=True,
+        self_attention=True,
+    ):
+        super().__init__()
+        attentions = []
+        resnets_in = []
+        resnets_out = []
+
+        self.self_attention = self_attention
+        self.context_dim = context_dim
+
+        if self_attention:
+            attentions.append(
+                Kandinsky3AttentionBlock(in_channels, time_embed_dim, None, groups, head_dim, expansion_ratio)
+            )
+        else:
+            attentions.append(nn.Identity())
+
+        up_resolutions = [[None] * 4] * (num_blocks - 1) + [[None, None, False if down_sample else None, None]]
+        hidden_channels = [(in_channels, out_channels)] + [(out_channels, out_channels)] * (num_blocks - 1)
+        for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions):
+            resnets_in.append(
+                Kandinsky3ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio)
+            )
+
+            if context_dim is not None:
+                attentions.append(
+                    Kandinsky3AttentionBlock(
+                        out_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio
+                    )
+                )
+            else:
+                attentions.append(nn.Identity())
+
+            resnets_out.append(
+                Kandinsky3ResNetBlock(
+                    out_channel, out_channel, time_embed_dim, groups, compression_ratio, up_resolution
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets_in = nn.ModuleList(resnets_in)
+        self.resnets_out = nn.ModuleList(resnets_out)
+
+    def forward(self, x, time_embed, context=None, context_mask=None, image_mask=None):
+        if self.self_attention:
+            x = self.attentions[0](x, time_embed, image_mask=image_mask)
+
+        for attention, resnet_in, resnet_out in zip(self.attentions[1:], self.resnets_in, self.resnets_out):
+            x = resnet_in(x, time_embed)
+            if self.context_dim is not None:
+                x = attention(x, time_embed, context, context_mask, image_mask)
+            x = resnet_out(x, time_embed)
+        return x
+
+
+class Kandinsky3ConditionalGroupNorm(nn.Module):
+    def __init__(self, groups, normalized_shape, context_dim):
+        super().__init__()
+        self.norm = nn.GroupNorm(groups, normalized_shape, affine=False)
+        self.context_mlp = nn.Sequential(nn.SiLU(), nn.Linear(context_dim, 2 * normalized_shape))
+        self.context_mlp[1].weight.data.zero_()
+        self.context_mlp[1].bias.data.zero_()
+
+    def forward(self, x, context):
+        context = self.context_mlp(context)
+
+        for _ in range(len(x.shape[2:])):
+            context = context.unsqueeze(-1)
+
+        scale, shift = context.chunk(2, dim=1)
+        x = self.norm(x) * (scale + 1.0) + shift
+        return x
+
+
+class Kandinsky3Block(nn.Module):
+    def __init__(self, in_channels, out_channels, time_embed_dim, kernel_size=3, norm_groups=32, up_resolution=None):
+        super().__init__()
+        self.group_norm = Kandinsky3ConditionalGroupNorm(norm_groups, in_channels, time_embed_dim)
+        self.activation = nn.SiLU()
+        if up_resolution is not None and up_resolution:
+            self.up_sample = nn.ConvTranspose2d(in_channels, in_channels, kernel_size=2, stride=2)
+        else:
+            self.up_sample = nn.Identity()
+
+        padding = int(kernel_size > 1)
+        self.projection = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding)
+
+        if up_resolution is not None and not up_resolution:
+            self.down_sample = nn.Conv2d(out_channels, out_channels, kernel_size=2, stride=2)
+        else:
+            self.down_sample = nn.Identity()
+
+    def forward(self, x, time_embed):
+        x = self.group_norm(x, time_embed)
+        x = self.activation(x)
+        x = self.up_sample(x)
+        x = self.projection(x)
+        x = self.down_sample(x)
+        return x
+
+
+class Kandinsky3ResNetBlock(nn.Module):
+    def __init__(
+        self, in_channels, out_channels, time_embed_dim, norm_groups=32, compression_ratio=2, up_resolutions=4 * [None]
+    ):
+        super().__init__()
+        kernel_sizes = [1, 3, 3, 1]
+        hidden_channel = max(in_channels, out_channels) // compression_ratio
+        hidden_channels = (
+            [(in_channels, hidden_channel)] + [(hidden_channel, hidden_channel)] * 2 + [(hidden_channel, out_channels)]
+        )
+        self.resnet_blocks = nn.ModuleList(
+            [
+                Kandinsky3Block(in_channel, out_channel, time_embed_dim, kernel_size, norm_groups, up_resolution)
+                for (in_channel, out_channel), kernel_size, up_resolution in zip(
+                    hidden_channels, kernel_sizes, up_resolutions
+                )
+            ]
+        )
+        self.shortcut_up_sample = (
+            nn.ConvTranspose2d(in_channels, in_channels, kernel_size=2, stride=2)
+            if True in up_resolutions
+            else nn.Identity()
+        )
+        self.shortcut_projection = (
+            nn.Conv2d(in_channels, out_channels, kernel_size=1) if in_channels != out_channels else nn.Identity()
+        )
+        self.shortcut_down_sample = (
+            nn.Conv2d(out_channels, out_channels, kernel_size=2, stride=2)
+            if False in up_resolutions
+            else nn.Identity()
+        )
+
+    def forward(self, x, time_embed):
+        out = x
+        for resnet_block in self.resnet_blocks:
+            out = resnet_block(out, time_embed)
+
+        x = self.shortcut_up_sample(x)
+        x = self.shortcut_projection(x)
+        x = self.shortcut_down_sample(x)
+        x = x + out
+        return x
+
+
+class Kandinsky3AttentionPooling(nn.Module):
+    def __init__(self, num_channels, context_dim, head_dim=64):
+        super().__init__()
+        self.attention = Attention(
+            context_dim,
+            context_dim,
+            dim_head=head_dim,
+            out_dim=num_channels,
+            out_bias=False,
+        )
+
+    def forward(self, x, context, context_mask=None):
+        context_mask = context_mask.to(dtype=context.dtype)
+        context = self.attention(context.mean(dim=1, keepdim=True), context, context_mask)
+        return x + context.squeeze(1)
+
+
+class Kandinsky3AttentionBlock(nn.Module):
+    def __init__(self, num_channels, time_embed_dim, context_dim=None, norm_groups=32, head_dim=64, expansion_ratio=4):
+        super().__init__()
+        self.in_norm = Kandinsky3ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim)
+        self.attention = Attention(
+            num_channels,
+            context_dim or num_channels,
+            dim_head=head_dim,
+            out_dim=num_channels,
+            out_bias=False,
+        )
+
+        hidden_channels = expansion_ratio * num_channels
+        self.out_norm = Kandinsky3ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim)
+        self.feed_forward = nn.Sequential(
+            nn.Conv2d(num_channels, hidden_channels, kernel_size=1, bias=False),
+            nn.SiLU(),
+            nn.Conv2d(hidden_channels, num_channels, kernel_size=1, bias=False),
+        )
+
+    def forward(self, x, time_embed, context=None, context_mask=None, image_mask=None):
+        height, width = x.shape[-2:]
+        out = self.in_norm(x, time_embed)
+        out = out.reshape(x.shape[0], -1, height * width).permute(0, 2, 1)
+        context = context if context is not None else out
+        if context_mask is not None:
+            context_mask = context_mask.to(dtype=context.dtype)
+
+        out = self.attention(out, context, context_mask)
+        out = out.permute(0, 2, 1).unsqueeze(-1).reshape(out.shape[0], -1, height, width)
+        x = x + out
+
+        out = self.out_norm(x, time_embed)
+        out = self.feed_forward(out)
+        x = x + out
+        return x
diff --git a/diffusers/src/diffusers/models/unets/unet_motion_model.py b/diffusers/src/diffusers/models/unets/unet_motion_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6125feba58995ccb98c807acb5b74724f7ac67df
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_motion_model.py
@@ -0,0 +1,2272 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+
+from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, UNet2DConditionLoadersMixin
+from ...utils import BaseOutput, deprecate, is_torch_version, logging
+from ...utils.torch_utils import apply_freeu
+from ..attention import BasicTransformerBlock
+from ..attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    IPAdapterAttnProcessor,
+    IPAdapterAttnProcessor2_0,
+)
+from ..embeddings import TimestepEmbedding, Timesteps
+from ..modeling_utils import ModelMixin
+from ..resnet import Downsample2D, ResnetBlock2D, Upsample2D
+from ..transformers.dual_transformer_2d import DualTransformer2DModel
+from ..transformers.transformer_2d import Transformer2DModel
+from .unet_2d_blocks import UNetMidBlock2DCrossAttn
+from .unet_2d_condition import UNet2DConditionModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class UNetMotionOutput(BaseOutput):
+    """
+    The output of [`UNetMotionOutput`].
+
+    Args:
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
+    """
+
+    sample: torch.Tensor
+
+
+class AnimateDiffTransformer3D(nn.Module):
+    """
+    A Transformer model for video-like data.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            The number of channels in the input and output (specify if the input is **continuous**).
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        attention_bias (`bool`, *optional*):
+            Configure if the `TransformerBlock` attention should contain a bias parameter.
+        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+            This is fixed during training since it is used to learn a number of position embeddings.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`):
+            Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
+            activation functions.
+        norm_elementwise_affine (`bool`, *optional*):
+            Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
+        double_self_attention (`bool`, *optional*):
+            Configure if each `TransformerBlock` should contain two self-attention layers.
+        positional_embeddings: (`str`, *optional*):
+            The type of positional embeddings to apply to the sequence input before passing use.
+        num_positional_embeddings: (`int`, *optional*):
+            The maximum length of the sequence over which to apply positional embeddings.
+    """
+
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        activation_fn: str = "geglu",
+        norm_elementwise_affine: bool = True,
+        double_self_attention: bool = True,
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
+    ):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+
+        self.in_channels = in_channels
+
+        self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+        self.proj_in = nn.Linear(in_channels, inner_dim)
+
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    attention_bias=attention_bias,
+                    double_self_attention=double_self_attention,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    positional_embeddings=positional_embeddings,
+                    num_positional_embeddings=num_positional_embeddings,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+        self.proj_out = nn.Linear(inner_dim, in_channels)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.LongTensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        num_frames: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> torch.Tensor:
+        """
+        The [`AnimateDiffTransformer3D`] forward method.
+
+        Args:
+            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
+                Input hidden_states.
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                `AdaLayerZeroNorm`.
+            num_frames (`int`, *optional*, defaults to 1):
+                The number of frames to be processed per batch. This is used to reshape the hidden states.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Returns:
+            torch.Tensor:
+                The output tensor.
+        """
+        # 1. Input
+        batch_frames, channel, height, width = hidden_states.shape
+        batch_size = batch_frames // num_frames
+
+        residual = hidden_states
+
+        hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, channel, height, width)
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+
+        hidden_states = self.norm(hidden_states)
+        hidden_states = hidden_states.permute(0, 3, 4, 2, 1).reshape(batch_size * height * width, num_frames, channel)
+
+        hidden_states = self.proj_in(input=hidden_states)
+
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            hidden_states = block(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                timestep=timestep,
+                cross_attention_kwargs=cross_attention_kwargs,
+                class_labels=class_labels,
+            )
+
+        # 3. Output
+        hidden_states = self.proj_out(input=hidden_states)
+        hidden_states = (
+            hidden_states[None, None, :]
+            .reshape(batch_size, height, width, num_frames, channel)
+            .permute(0, 3, 4, 1, 2)
+            .contiguous()
+        )
+        hidden_states = hidden_states.reshape(batch_frames, channel, height, width)
+
+        output = hidden_states + residual
+        return output
+
+
+class DownBlockMotion(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+        temporal_num_attention_heads: Union[int, Tuple[int]] = 1,
+        temporal_cross_attention_dim: Optional[int] = None,
+        temporal_max_seq_length: int = 32,
+        temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        temporal_double_self_attention: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+        motion_modules = []
+
+        # support for variable transformer layers per temporal block
+        if isinstance(temporal_transformer_layers_per_block, int):
+            temporal_transformer_layers_per_block = (temporal_transformer_layers_per_block,) * num_layers
+        elif len(temporal_transformer_layers_per_block) != num_layers:
+            raise ValueError(
+                f"`temporal_transformer_layers_per_block` must be an integer or a tuple of integers of length {num_layers}"
+            )
+
+        # support for variable number of attention head per temporal layers
+        if isinstance(temporal_num_attention_heads, int):
+            temporal_num_attention_heads = (temporal_num_attention_heads,) * num_layers
+        elif len(temporal_num_attention_heads) != num_layers:
+            raise ValueError(
+                f"`temporal_num_attention_heads` must be an integer or a tuple of integers of length {num_layers}"
+            )
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            motion_modules.append(
+                AnimateDiffTransformer3D(
+                    num_attention_heads=temporal_num_attention_heads[i],
+                    in_channels=out_channels,
+                    num_layers=temporal_transformer_layers_per_block[i],
+                    norm_num_groups=resnet_groups,
+                    cross_attention_dim=temporal_cross_attention_dim,
+                    attention_bias=False,
+                    activation_fn="geglu",
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=temporal_max_seq_length,
+                    attention_head_dim=out_channels // temporal_num_attention_heads[i],
+                    double_self_attention=temporal_double_self_attention,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        num_frames: int = 1,
+        *args,
+        **kwargs,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        output_states = ()
+
+        blocks = zip(self.resnets, self.motion_modules)
+        for resnet, motion_module in blocks:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet),
+                        hidden_states,
+                        temb,
+                        use_reentrant=False,
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+
+            else:
+                hidden_states = resnet(input_tensor=hidden_states, temb=temb)
+
+            hidden_states = motion_module(hidden_states, num_frames=num_frames)
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states=hidden_states)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnDownBlockMotion(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        add_downsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+        temporal_cross_attention_dim: Optional[int] = None,
+        temporal_num_attention_heads: int = 8,
+        temporal_max_seq_length: int = 32,
+        temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        temporal_double_self_attention: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        motion_modules = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        # support for variable transformer layers per block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = (transformer_layers_per_block,) * num_layers
+        elif len(transformer_layers_per_block) != num_layers:
+            raise ValueError(
+                f"transformer_layers_per_block must be an integer or a list of integers of length {num_layers}"
+            )
+
+        # support for variable transformer layers per temporal block
+        if isinstance(temporal_transformer_layers_per_block, int):
+            temporal_transformer_layers_per_block = (temporal_transformer_layers_per_block,) * num_layers
+        elif len(temporal_transformer_layers_per_block) != num_layers:
+            raise ValueError(
+                f"temporal_transformer_layers_per_block must be an integer or a list of integers of length {num_layers}"
+            )
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+
+            motion_modules.append(
+                AnimateDiffTransformer3D(
+                    num_attention_heads=temporal_num_attention_heads,
+                    in_channels=out_channels,
+                    num_layers=temporal_transformer_layers_per_block[i],
+                    norm_num_groups=resnet_groups,
+                    cross_attention_dim=temporal_cross_attention_dim,
+                    attention_bias=False,
+                    activation_fn="geglu",
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=temporal_max_seq_length,
+                    attention_head_dim=out_channels // temporal_num_attention_heads,
+                    double_self_attention=temporal_double_self_attention,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        num_frames: int = 1,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        additional_residuals: Optional[torch.Tensor] = None,
+    ):
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        output_states = ()
+
+        blocks = list(zip(self.resnets, self.attentions, self.motion_modules))
+        for i, (resnet, attn, motion_module) in enumerate(blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = resnet(input_tensor=hidden_states, temb=temb)
+
+            hidden_states = attn(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                attention_mask=attention_mask,
+                encoder_attention_mask=encoder_attention_mask,
+                return_dict=False,
+            )[0]
+
+            hidden_states = motion_module(
+                hidden_states,
+                num_frames=num_frames,
+            )
+
+            # apply additional residuals to the output of the last pair of resnet and attention blocks
+            if i == len(blocks) - 1 and additional_residuals is not None:
+                hidden_states = hidden_states + additional_residuals
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states=hidden_states)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnUpBlockMotion(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+        temporal_cross_attention_dim: Optional[int] = None,
+        temporal_num_attention_heads: int = 8,
+        temporal_max_seq_length: int = 32,
+        temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        motion_modules = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        # support for variable transformer layers per block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = (transformer_layers_per_block,) * num_layers
+        elif len(transformer_layers_per_block) != num_layers:
+            raise ValueError(
+                f"transformer_layers_per_block must be an integer or a list of integers of length {num_layers}, got {len(transformer_layers_per_block)}"
+            )
+
+        # support for variable transformer layers per temporal block
+        if isinstance(temporal_transformer_layers_per_block, int):
+            temporal_transformer_layers_per_block = (temporal_transformer_layers_per_block,) * num_layers
+        elif len(temporal_transformer_layers_per_block) != num_layers:
+            raise ValueError(
+                f"temporal_transformer_layers_per_block must be an integer or a list of integers of length {num_layers}, got {len(temporal_transformer_layers_per_block)}"
+            )
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            motion_modules.append(
+                AnimateDiffTransformer3D(
+                    num_attention_heads=temporal_num_attention_heads,
+                    in_channels=out_channels,
+                    num_layers=temporal_transformer_layers_per_block[i],
+                    norm_num_groups=resnet_groups,
+                    cross_attention_dim=temporal_cross_attention_dim,
+                    attention_bias=False,
+                    activation_fn="geglu",
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=temporal_max_seq_length,
+                    attention_head_dim=out_channels // temporal_num_attention_heads,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        num_frames: int = 1,
+    ) -> torch.Tensor:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        blocks = zip(self.resnets, self.attentions, self.motion_modules)
+        for resnet, attn, motion_module in blocks:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = resnet(input_tensor=hidden_states, temb=temb)
+
+            hidden_states = attn(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                attention_mask=attention_mask,
+                encoder_attention_mask=encoder_attention_mask,
+                return_dict=False,
+            )[0]
+
+            hidden_states = motion_module(
+                hidden_states,
+                num_frames=num_frames,
+            )
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states=hidden_states, output_size=upsample_size)
+
+        return hidden_states
+
+
+class UpBlockMotion(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        temporal_cross_attention_dim: Optional[int] = None,
+        temporal_num_attention_heads: int = 8,
+        temporal_max_seq_length: int = 32,
+        temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+    ):
+        super().__init__()
+        resnets = []
+        motion_modules = []
+
+        # support for variable transformer layers per temporal block
+        if isinstance(temporal_transformer_layers_per_block, int):
+            temporal_transformer_layers_per_block = (temporal_transformer_layers_per_block,) * num_layers
+        elif len(temporal_transformer_layers_per_block) != num_layers:
+            raise ValueError(
+                f"temporal_transformer_layers_per_block must be an integer or a list of integers of length {num_layers}"
+            )
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+            motion_modules.append(
+                AnimateDiffTransformer3D(
+                    num_attention_heads=temporal_num_attention_heads,
+                    in_channels=out_channels,
+                    num_layers=temporal_transformer_layers_per_block[i],
+                    norm_num_groups=resnet_groups,
+                    cross_attention_dim=temporal_cross_attention_dim,
+                    attention_bias=False,
+                    activation_fn="geglu",
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=temporal_max_seq_length,
+                    attention_head_dim=out_channels // temporal_num_attention_heads,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        upsample_size=None,
+        num_frames: int = 1,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        blocks = zip(self.resnets, self.motion_modules)
+
+        for resnet, motion_module in blocks:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet),
+                        hidden_states,
+                        temb,
+                        use_reentrant=False,
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(input_tensor=hidden_states, temb=temb)
+
+            hidden_states = motion_module(hidden_states, num_frames=num_frames)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states=hidden_states, output_size=upsample_size)
+
+        return hidden_states
+
+
+class UNetMidBlockCrossAttnMotion(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+        temporal_num_attention_heads: int = 1,
+        temporal_cross_attention_dim: Optional[int] = None,
+        temporal_max_seq_length: int = 32,
+        temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        # support for variable transformer layers per block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = (transformer_layers_per_block,) * num_layers
+        elif len(transformer_layers_per_block) != num_layers:
+            raise ValueError(
+                f"`transformer_layers_per_block` should be an integer or a list of integers of length {num_layers}."
+            )
+
+        # support for variable transformer layers per temporal block
+        if isinstance(temporal_transformer_layers_per_block, int):
+            temporal_transformer_layers_per_block = (temporal_transformer_layers_per_block,) * num_layers
+        elif len(temporal_transformer_layers_per_block) != num_layers:
+            raise ValueError(
+                f"`temporal_transformer_layers_per_block` should be an integer or a list of integers of length {num_layers}."
+            )
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+        motion_modules = []
+
+        for i in range(num_layers):
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            motion_modules.append(
+                AnimateDiffTransformer3D(
+                    num_attention_heads=temporal_num_attention_heads,
+                    attention_head_dim=in_channels // temporal_num_attention_heads,
+                    in_channels=in_channels,
+                    num_layers=temporal_transformer_layers_per_block[i],
+                    norm_num_groups=resnet_groups,
+                    cross_attention_dim=temporal_cross_attention_dim,
+                    attention_bias=False,
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=temporal_max_seq_length,
+                    activation_fn="geglu",
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        num_frames: int = 1,
+    ) -> torch.Tensor:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        hidden_states = self.resnets[0](input_tensor=hidden_states, temb=temb)
+
+        blocks = zip(self.attentions, self.resnets[1:], self.motion_modules)
+        for attn, resnet, motion_module in blocks:
+            hidden_states = attn(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                attention_mask=attention_mask,
+                encoder_attention_mask=encoder_attention_mask,
+                return_dict=False,
+            )[0]
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(motion_module),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = motion_module(
+                    hidden_states,
+                    num_frames=num_frames,
+                )
+                hidden_states = resnet(input_tensor=hidden_states, temb=temb)
+
+        return hidden_states
+
+
+class MotionModules(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        layers_per_block: int = 2,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 8,
+        num_attention_heads: Union[int, Tuple[int]] = 8,
+        attention_bias: bool = False,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        norm_num_groups: int = 32,
+        max_seq_length: int = 32,
+    ):
+        super().__init__()
+        self.motion_modules = nn.ModuleList([])
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = (transformer_layers_per_block,) * layers_per_block
+        elif len(transformer_layers_per_block) != layers_per_block:
+            raise ValueError(
+                f"The number of transformer layers per block must match the number of layers per block, "
+                f"got {layers_per_block} and {len(transformer_layers_per_block)}"
+            )
+
+        for i in range(layers_per_block):
+            self.motion_modules.append(
+                AnimateDiffTransformer3D(
+                    in_channels=in_channels,
+                    num_layers=transformer_layers_per_block[i],
+                    norm_num_groups=norm_num_groups,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    attention_bias=attention_bias,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=in_channels // num_attention_heads,
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=max_seq_length,
+                )
+            )
+
+
+class MotionAdapter(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+    @register_to_config
+    def __init__(
+        self,
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
+        motion_layers_per_block: Union[int, Tuple[int]] = 2,
+        motion_transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]] = 1,
+        motion_mid_block_layers_per_block: int = 1,
+        motion_transformer_layers_per_mid_block: Union[int, Tuple[int]] = 1,
+        motion_num_attention_heads: Union[int, Tuple[int]] = 8,
+        motion_norm_num_groups: int = 32,
+        motion_max_seq_length: int = 32,
+        use_motion_mid_block: bool = True,
+        conv_in_channels: Optional[int] = None,
+    ):
+        """Container to store AnimateDiff Motion Modules
+
+        Args:
+            block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each UNet block.
+            motion_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 2):
+                The number of motion layers per UNet block.
+            motion_transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple[int]]`, *optional*, defaults to 1):
+                The number of transformer layers to use in each motion layer in each block.
+            motion_mid_block_layers_per_block (`int`, *optional*, defaults to 1):
+                The number of motion layers in the middle UNet block.
+            motion_transformer_layers_per_mid_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
+                The number of transformer layers to use in each motion layer in the middle block.
+            motion_num_attention_heads (`int` or `Tuple[int]`, *optional*, defaults to 8):
+                The number of heads to use in each attention layer of the motion module.
+            motion_norm_num_groups (`int`, *optional*, defaults to 32):
+                The number of groups to use in each group normalization layer of the motion module.
+            motion_max_seq_length (`int`, *optional*, defaults to 32):
+                The maximum sequence length to use in the motion module.
+            use_motion_mid_block (`bool`, *optional*, defaults to True):
+                Whether to use a motion module in the middle of the UNet.
+        """
+
+        super().__init__()
+        down_blocks = []
+        up_blocks = []
+
+        if isinstance(motion_layers_per_block, int):
+            motion_layers_per_block = (motion_layers_per_block,) * len(block_out_channels)
+        elif len(motion_layers_per_block) != len(block_out_channels):
+            raise ValueError(
+                f"The number of motion layers per block must match the number of blocks, "
+                f"got {len(block_out_channels)} and {len(motion_layers_per_block)}"
+            )
+
+        if isinstance(motion_transformer_layers_per_block, int):
+            motion_transformer_layers_per_block = (motion_transformer_layers_per_block,) * len(block_out_channels)
+
+        if isinstance(motion_transformer_layers_per_mid_block, int):
+            motion_transformer_layers_per_mid_block = (
+                motion_transformer_layers_per_mid_block,
+            ) * motion_mid_block_layers_per_block
+        elif len(motion_transformer_layers_per_mid_block) != motion_mid_block_layers_per_block:
+            raise ValueError(
+                f"The number of layers per mid block ({motion_mid_block_layers_per_block}) "
+                f"must match the length of motion_transformer_layers_per_mid_block ({len(motion_transformer_layers_per_mid_block)})"
+            )
+
+        if isinstance(motion_num_attention_heads, int):
+            motion_num_attention_heads = (motion_num_attention_heads,) * len(block_out_channels)
+        elif len(motion_num_attention_heads) != len(block_out_channels):
+            raise ValueError(
+                f"The length of the attention head number tuple in the motion module must match the "
+                f"number of block, got {len(motion_num_attention_heads)} and {len(block_out_channels)}"
+            )
+
+        if conv_in_channels:
+            # input
+            self.conv_in = nn.Conv2d(conv_in_channels, block_out_channels[0], kernel_size=3, padding=1)
+        else:
+            self.conv_in = None
+
+        for i, channel in enumerate(block_out_channels):
+            output_channel = block_out_channels[i]
+            down_blocks.append(
+                MotionModules(
+                    in_channels=output_channel,
+                    norm_num_groups=motion_norm_num_groups,
+                    cross_attention_dim=None,
+                    activation_fn="geglu",
+                    attention_bias=False,
+                    num_attention_heads=motion_num_attention_heads[i],
+                    max_seq_length=motion_max_seq_length,
+                    layers_per_block=motion_layers_per_block[i],
+                    transformer_layers_per_block=motion_transformer_layers_per_block[i],
+                )
+            )
+
+        if use_motion_mid_block:
+            self.mid_block = MotionModules(
+                in_channels=block_out_channels[-1],
+                norm_num_groups=motion_norm_num_groups,
+                cross_attention_dim=None,
+                activation_fn="geglu",
+                attention_bias=False,
+                num_attention_heads=motion_num_attention_heads[-1],
+                max_seq_length=motion_max_seq_length,
+                layers_per_block=motion_mid_block_layers_per_block,
+                transformer_layers_per_block=motion_transformer_layers_per_mid_block,
+            )
+        else:
+            self.mid_block = None
+
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+
+        reversed_motion_layers_per_block = list(reversed(motion_layers_per_block))
+        reversed_motion_transformer_layers_per_block = list(reversed(motion_transformer_layers_per_block))
+        reversed_motion_num_attention_heads = list(reversed(motion_num_attention_heads))
+        for i, channel in enumerate(reversed_block_out_channels):
+            output_channel = reversed_block_out_channels[i]
+            up_blocks.append(
+                MotionModules(
+                    in_channels=output_channel,
+                    norm_num_groups=motion_norm_num_groups,
+                    cross_attention_dim=None,
+                    activation_fn="geglu",
+                    attention_bias=False,
+                    num_attention_heads=reversed_motion_num_attention_heads[i],
+                    max_seq_length=motion_max_seq_length,
+                    layers_per_block=reversed_motion_layers_per_block[i] + 1,
+                    transformer_layers_per_block=reversed_motion_transformer_layers_per_block[i],
+                )
+            )
+
+        self.down_blocks = nn.ModuleList(down_blocks)
+        self.up_blocks = nn.ModuleList(up_blocks)
+
+    def forward(self, sample):
+        pass
+
+
+class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
+    r"""
+    A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
+    sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        down_block_types: Tuple[str, ...] = (
+            "CrossAttnDownBlockMotion",
+            "CrossAttnDownBlockMotion",
+            "CrossAttnDownBlockMotion",
+            "DownBlockMotion",
+        ),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlockMotion",
+            "CrossAttnUpBlockMotion",
+            "CrossAttnUpBlockMotion",
+            "CrossAttnUpBlockMotion",
+        ),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
+        layers_per_block: Union[int, Tuple[int]] = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: int = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+        reverse_transformer_layers_per_block: Optional[Union[int, Tuple[int], Tuple[Tuple]]] = None,
+        temporal_transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+        reverse_temporal_transformer_layers_per_block: Optional[Union[int, Tuple[int], Tuple[Tuple]]] = None,
+        transformer_layers_per_mid_block: Optional[Union[int, Tuple[int]]] = None,
+        temporal_transformer_layers_per_mid_block: Optional[Union[int, Tuple[int]]] = 1,
+        use_linear_projection: bool = False,
+        num_attention_heads: Union[int, Tuple[int, ...]] = 8,
+        motion_max_seq_length: int = 32,
+        motion_num_attention_heads: Union[int, Tuple[int, ...]] = 8,
+        reverse_motion_num_attention_heads: Optional[Union[int, Tuple[int, ...], Tuple[Tuple[int, ...], ...]]] = None,
+        use_motion_mid_block: bool = True,
+        mid_block_layers: int = 1,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        time_cond_proj_dim: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
+            for layer_number_per_block in transformer_layers_per_block:
+                if isinstance(layer_number_per_block, list):
+                    raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
+
+        if (
+            isinstance(temporal_transformer_layers_per_block, list)
+            and reverse_temporal_transformer_layers_per_block is None
+        ):
+            for layer_number_per_block in temporal_transformer_layers_per_block:
+                if isinstance(layer_number_per_block, list):
+                    raise ValueError(
+                        "Must provide 'reverse_temporal_transformer_layers_per_block` if using asymmetrical motion module in UNet."
+                    )
+
+        # input
+        conv_in_kernel = 3
+        conv_out_kernel = 3
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        # time
+        time_embed_dim = block_out_channels[0] * 4
+        self.time_proj = Timesteps(block_out_channels[0], True, 0)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim, time_embed_dim, act_fn=act_fn, cond_proj_dim=time_cond_proj_dim
+        )
+
+        if encoder_hid_dim_type is None:
+            self.encoder_hid_proj = None
+
+        if addition_embed_type == "text_time":
+            self.add_time_proj = Timesteps(addition_time_embed_dim, True, 0)
+            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+
+        # class embedding
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+
+        if isinstance(layers_per_block, int):
+            layers_per_block = [layers_per_block] * len(down_block_types)
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+
+        if isinstance(reverse_transformer_layers_per_block, int):
+            reverse_transformer_layers_per_block = [reverse_transformer_layers_per_block] * len(down_block_types)
+
+        if isinstance(temporal_transformer_layers_per_block, int):
+            temporal_transformer_layers_per_block = [temporal_transformer_layers_per_block] * len(down_block_types)
+
+        if isinstance(reverse_temporal_transformer_layers_per_block, int):
+            reverse_temporal_transformer_layers_per_block = [reverse_temporal_transformer_layers_per_block] * len(
+                down_block_types
+            )
+
+        if isinstance(motion_num_attention_heads, int):
+            motion_num_attention_heads = (motion_num_attention_heads,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            if down_block_type == "CrossAttnDownBlockMotion":
+                down_block = CrossAttnDownBlockMotion(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    temb_channels=time_embed_dim,
+                    num_layers=layers_per_block[i],
+                    transformer_layers_per_block=transformer_layers_per_block[i],
+                    resnet_eps=norm_eps,
+                    resnet_act_fn=act_fn,
+                    resnet_groups=norm_num_groups,
+                    num_attention_heads=num_attention_heads[i],
+                    cross_attention_dim=cross_attention_dim[i],
+                    downsample_padding=downsample_padding,
+                    add_downsample=not is_final_block,
+                    use_linear_projection=use_linear_projection,
+                    temporal_num_attention_heads=motion_num_attention_heads[i],
+                    temporal_max_seq_length=motion_max_seq_length,
+                    temporal_transformer_layers_per_block=temporal_transformer_layers_per_block[i],
+                )
+            elif down_block_type == "DownBlockMotion":
+                down_block = DownBlockMotion(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    temb_channels=time_embed_dim,
+                    num_layers=layers_per_block[i],
+                    resnet_eps=norm_eps,
+                    resnet_act_fn=act_fn,
+                    resnet_groups=norm_num_groups,
+                    add_downsample=not is_final_block,
+                    downsample_padding=downsample_padding,
+                    temporal_num_attention_heads=motion_num_attention_heads[i],
+                    temporal_max_seq_length=motion_max_seq_length,
+                    temporal_transformer_layers_per_block=temporal_transformer_layers_per_block[i],
+                )
+            else:
+                raise ValueError(
+                    "Invalid `down_block_type` encountered. Must be one of `CrossAttnDownBlockMotion` or `DownBlockMotion`"
+                )
+
+            self.down_blocks.append(down_block)
+
+        # mid
+        if transformer_layers_per_mid_block is None:
+            transformer_layers_per_mid_block = (
+                transformer_layers_per_block[-1] if isinstance(transformer_layers_per_block[-1], int) else 1
+            )
+
+        if use_motion_mid_block:
+            self.mid_block = UNetMidBlockCrossAttnMotion(
+                in_channels=block_out_channels[-1],
+                temb_channels=time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                cross_attention_dim=cross_attention_dim[-1],
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=False,
+                use_linear_projection=use_linear_projection,
+                num_layers=mid_block_layers,
+                temporal_num_attention_heads=motion_num_attention_heads[-1],
+                temporal_max_seq_length=motion_max_seq_length,
+                transformer_layers_per_block=transformer_layers_per_mid_block,
+                temporal_transformer_layers_per_block=temporal_transformer_layers_per_mid_block,
+            )
+
+        else:
+            self.mid_block = UNetMidBlock2DCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                cross_attention_dim=cross_attention_dim[-1],
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=False,
+                use_linear_projection=use_linear_projection,
+                num_layers=mid_block_layers,
+                transformer_layers_per_block=transformer_layers_per_mid_block,
+            )
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        reversed_layers_per_block = list(reversed(layers_per_block))
+        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+        reversed_motion_num_attention_heads = list(reversed(motion_num_attention_heads))
+
+        if reverse_transformer_layers_per_block is None:
+            reverse_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
+
+        if reverse_temporal_transformer_layers_per_block is None:
+            reverse_temporal_transformer_layers_per_block = list(reversed(temporal_transformer_layers_per_block))
+
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+
+            if up_block_type == "CrossAttnUpBlockMotion":
+                up_block = CrossAttnUpBlockMotion(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    prev_output_channel=prev_output_channel,
+                    temb_channels=time_embed_dim,
+                    resolution_idx=i,
+                    num_layers=reversed_layers_per_block[i] + 1,
+                    transformer_layers_per_block=reverse_transformer_layers_per_block[i],
+                    resnet_eps=norm_eps,
+                    resnet_act_fn=act_fn,
+                    resnet_groups=norm_num_groups,
+                    num_attention_heads=reversed_num_attention_heads[i],
+                    cross_attention_dim=reversed_cross_attention_dim[i],
+                    add_upsample=add_upsample,
+                    use_linear_projection=use_linear_projection,
+                    temporal_num_attention_heads=reversed_motion_num_attention_heads[i],
+                    temporal_max_seq_length=motion_max_seq_length,
+                    temporal_transformer_layers_per_block=reverse_temporal_transformer_layers_per_block[i],
+                )
+            elif up_block_type == "UpBlockMotion":
+                up_block = UpBlockMotion(
+                    in_channels=input_channel,
+                    prev_output_channel=prev_output_channel,
+                    out_channels=output_channel,
+                    temb_channels=time_embed_dim,
+                    resolution_idx=i,
+                    num_layers=reversed_layers_per_block[i] + 1,
+                    resnet_eps=norm_eps,
+                    resnet_act_fn=act_fn,
+                    resnet_groups=norm_num_groups,
+                    add_upsample=add_upsample,
+                    temporal_num_attention_heads=reversed_motion_num_attention_heads[i],
+                    temporal_max_seq_length=motion_max_seq_length,
+                    temporal_transformer_layers_per_block=reverse_temporal_transformer_layers_per_block[i],
+                )
+            else:
+                raise ValueError(
+                    "Invalid `up_block_type` encountered. Must be one of `CrossAttnUpBlockMotion` or `UpBlockMotion`"
+                )
+
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+            )
+            self.conv_act = nn.SiLU()
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+
+    @classmethod
+    def from_unet2d(
+        cls,
+        unet: UNet2DConditionModel,
+        motion_adapter: Optional[MotionAdapter] = None,
+        load_weights: bool = True,
+    ):
+        has_motion_adapter = motion_adapter is not None
+
+        if has_motion_adapter:
+            motion_adapter.to(device=unet.device)
+
+            # check compatibility of number of blocks
+            if len(unet.config["down_block_types"]) != len(motion_adapter.config["block_out_channels"]):
+                raise ValueError("Incompatible Motion Adapter, got different number of blocks")
+
+            # check layers compatibility for each block
+            if isinstance(unet.config["layers_per_block"], int):
+                expanded_layers_per_block = [unet.config["layers_per_block"]] * len(unet.config["down_block_types"])
+            else:
+                expanded_layers_per_block = list(unet.config["layers_per_block"])
+            if isinstance(motion_adapter.config["motion_layers_per_block"], int):
+                expanded_adapter_layers_per_block = [motion_adapter.config["motion_layers_per_block"]] * len(
+                    motion_adapter.config["block_out_channels"]
+                )
+            else:
+                expanded_adapter_layers_per_block = list(motion_adapter.config["motion_layers_per_block"])
+            if expanded_layers_per_block != expanded_adapter_layers_per_block:
+                raise ValueError("Incompatible Motion Adapter, got different number of layers per block")
+
+        # based on https://github.com/guoyww/AnimateDiff/blob/895f3220c06318ea0760131ec70408b466c49333/animatediff/models/unet.py#L459
+        config = dict(unet.config)
+        config["_class_name"] = cls.__name__
+
+        down_blocks = []
+        for down_blocks_type in config["down_block_types"]:
+            if "CrossAttn" in down_blocks_type:
+                down_blocks.append("CrossAttnDownBlockMotion")
+            else:
+                down_blocks.append("DownBlockMotion")
+        config["down_block_types"] = down_blocks
+
+        up_blocks = []
+        for down_blocks_type in config["up_block_types"]:
+            if "CrossAttn" in down_blocks_type:
+                up_blocks.append("CrossAttnUpBlockMotion")
+            else:
+                up_blocks.append("UpBlockMotion")
+        config["up_block_types"] = up_blocks
+
+        if has_motion_adapter:
+            config["motion_num_attention_heads"] = motion_adapter.config["motion_num_attention_heads"]
+            config["motion_max_seq_length"] = motion_adapter.config["motion_max_seq_length"]
+            config["use_motion_mid_block"] = motion_adapter.config["use_motion_mid_block"]
+            config["layers_per_block"] = motion_adapter.config["motion_layers_per_block"]
+            config["temporal_transformer_layers_per_mid_block"] = motion_adapter.config[
+                "motion_transformer_layers_per_mid_block"
+            ]
+            config["temporal_transformer_layers_per_block"] = motion_adapter.config[
+                "motion_transformer_layers_per_block"
+            ]
+            config["motion_num_attention_heads"] = motion_adapter.config["motion_num_attention_heads"]
+
+            # For PIA UNets we need to set the number input channels to 9
+            if motion_adapter.config["conv_in_channels"]:
+                config["in_channels"] = motion_adapter.config["conv_in_channels"]
+
+        # Need this for backwards compatibility with UNet2DConditionModel checkpoints
+        if not config.get("num_attention_heads"):
+            config["num_attention_heads"] = config["attention_head_dim"]
+
+        expected_kwargs, optional_kwargs = cls._get_signature_keys(cls)
+        config = FrozenDict({k: config.get(k) for k in config if k in expected_kwargs or k in optional_kwargs})
+        config["_class_name"] = cls.__name__
+        model = cls.from_config(config)
+
+        if not load_weights:
+            return model
+
+        # Logic for loading PIA UNets which allow the first 4 channels to be any UNet2DConditionModel conv_in weight
+        # while the last 5 channels must be PIA conv_in weights.
+        if has_motion_adapter and motion_adapter.config["conv_in_channels"]:
+            model.conv_in = motion_adapter.conv_in
+            updated_conv_in_weight = torch.cat(
+                [unet.conv_in.weight, motion_adapter.conv_in.weight[:, 4:, :, :]], dim=1
+            )
+            model.conv_in.load_state_dict({"weight": updated_conv_in_weight, "bias": unet.conv_in.bias})
+        else:
+            model.conv_in.load_state_dict(unet.conv_in.state_dict())
+
+        model.time_proj.load_state_dict(unet.time_proj.state_dict())
+        model.time_embedding.load_state_dict(unet.time_embedding.state_dict())
+
+        if any(
+            isinstance(proc, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0))
+            for proc in unet.attn_processors.values()
+        ):
+            attn_procs = {}
+            for name, processor in unet.attn_processors.items():
+                if name.endswith("attn1.processor"):
+                    attn_processor_class = (
+                        AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class()
+                else:
+                    attn_processor_class = (
+                        IPAdapterAttnProcessor2_0
+                        if hasattr(F, "scaled_dot_product_attention")
+                        else IPAdapterAttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=processor.hidden_size,
+                        cross_attention_dim=processor.cross_attention_dim,
+                        scale=processor.scale,
+                        num_tokens=processor.num_tokens,
+                    )
+            for name, processor in model.attn_processors.items():
+                if name not in attn_procs:
+                    attn_procs[name] = processor.__class__()
+            model.set_attn_processor(attn_procs)
+            model.config.encoder_hid_dim_type = "ip_image_proj"
+            model.encoder_hid_proj = unet.encoder_hid_proj
+
+        for i, down_block in enumerate(unet.down_blocks):
+            model.down_blocks[i].resnets.load_state_dict(down_block.resnets.state_dict())
+            if hasattr(model.down_blocks[i], "attentions"):
+                model.down_blocks[i].attentions.load_state_dict(down_block.attentions.state_dict())
+            if model.down_blocks[i].downsamplers:
+                model.down_blocks[i].downsamplers.load_state_dict(down_block.downsamplers.state_dict())
+
+        for i, up_block in enumerate(unet.up_blocks):
+            model.up_blocks[i].resnets.load_state_dict(up_block.resnets.state_dict())
+            if hasattr(model.up_blocks[i], "attentions"):
+                model.up_blocks[i].attentions.load_state_dict(up_block.attentions.state_dict())
+            if model.up_blocks[i].upsamplers:
+                model.up_blocks[i].upsamplers.load_state_dict(up_block.upsamplers.state_dict())
+
+        model.mid_block.resnets.load_state_dict(unet.mid_block.resnets.state_dict())
+        model.mid_block.attentions.load_state_dict(unet.mid_block.attentions.state_dict())
+
+        if unet.conv_norm_out is not None:
+            model.conv_norm_out.load_state_dict(unet.conv_norm_out.state_dict())
+        if unet.conv_act is not None:
+            model.conv_act.load_state_dict(unet.conv_act.state_dict())
+        model.conv_out.load_state_dict(unet.conv_out.state_dict())
+
+        if has_motion_adapter:
+            model.load_motion_modules(motion_adapter)
+
+        # ensure that the Motion UNet is the same dtype as the UNet2DConditionModel
+        model.to(unet.dtype)
+
+        return model
+
+    def freeze_unet2d_params(self) -> None:
+        """Freeze the weights of just the UNet2DConditionModel, and leave the motion modules
+        unfrozen for fine tuning.
+        """
+        # Freeze everything
+        for param in self.parameters():
+            param.requires_grad = False
+
+        # Unfreeze Motion Modules
+        for down_block in self.down_blocks:
+            motion_modules = down_block.motion_modules
+            for param in motion_modules.parameters():
+                param.requires_grad = True
+
+        for up_block in self.up_blocks:
+            motion_modules = up_block.motion_modules
+            for param in motion_modules.parameters():
+                param.requires_grad = True
+
+        if hasattr(self.mid_block, "motion_modules"):
+            motion_modules = self.mid_block.motion_modules
+            for param in motion_modules.parameters():
+                param.requires_grad = True
+
+    def load_motion_modules(self, motion_adapter: Optional[MotionAdapter]) -> None:
+        for i, down_block in enumerate(motion_adapter.down_blocks):
+            self.down_blocks[i].motion_modules.load_state_dict(down_block.motion_modules.state_dict())
+        for i, up_block in enumerate(motion_adapter.up_blocks):
+            self.up_blocks[i].motion_modules.load_state_dict(up_block.motion_modules.state_dict())
+
+        # to support older motion modules that don't have a mid_block
+        if hasattr(self.mid_block, "motion_modules"):
+            self.mid_block.motion_modules.load_state_dict(motion_adapter.mid_block.motion_modules.state_dict())
+
+    def save_motion_modules(
+        self,
+        save_directory: str,
+        is_main_process: bool = True,
+        safe_serialization: bool = True,
+        variant: Optional[str] = None,
+        push_to_hub: bool = False,
+        **kwargs,
+    ) -> None:
+        state_dict = self.state_dict()
+
+        # Extract all motion modules
+        motion_state_dict = {}
+        for k, v in state_dict.items():
+            if "motion_modules" in k:
+                motion_state_dict[k] = v
+
+        adapter = MotionAdapter(
+            block_out_channels=self.config["block_out_channels"],
+            motion_layers_per_block=self.config["layers_per_block"],
+            motion_norm_num_groups=self.config["norm_num_groups"],
+            motion_num_attention_heads=self.config["motion_num_attention_heads"],
+            motion_max_seq_length=self.config["motion_max_seq_length"],
+            use_motion_mid_block=self.config["use_motion_mid_block"],
+        )
+        adapter.load_state_dict(motion_state_dict)
+        adapter.save_pretrained(
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            safe_serialization=safe_serialization,
+            variant=variant,
+            push_to_hub=push_to_hub,
+            **kwargs,
+        )
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
+        """
+        Sets the attention processor to use [feed forward
+        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+
+        Parameters:
+            chunk_size (`int`, *optional*):
+                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
+                over each tensor of dim=`dim`.
+            dim (`int`, *optional*, defaults to `0`):
+                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
+                or dim=1 (sequence length).
+        """
+        if dim not in [0, 1]:
+            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
+
+        # By default chunk size is 1
+        chunk_size = chunk_size or 1
+
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, chunk_size, dim)
+
+    def disable_forward_chunking(self) -> None:
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, None, 0)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self) -> None:
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
+        if isinstance(module, (CrossAttnDownBlockMotion, DownBlockMotion, CrossAttnUpBlockMotion, UpBlockMotion)):
+            module.gradient_checkpointing = value
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.enable_freeu
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float) -> None:
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.disable_freeu
+    def disable_freeu(self) -> None:
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNetMotionOutput, Tuple[torch.Tensor]]:
+        r"""
+        The [`UNetMotionModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_motion_model.UNetMotionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            [`~models.unets.unet_motion_model.UNetMotionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_motion_model.UNetMotionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        num_frames = sample.shape[2]
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        if self.config.addition_embed_type == "text_time":
+            if "text_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if "time_ids" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            time_embeds = self.add_time_proj(time_ids.flatten())
+            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+
+            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+            add_embeds = add_embeds.to(emb.dtype)
+            aug_emb = self.add_embedding(add_embeds)
+
+        emb = emb if aug_emb is None else emb + aug_emb
+        emb = emb.repeat_interleave(repeats=num_frames, dim=0)
+
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            image_embeds = self.encoder_hid_proj(image_embeds)
+            image_embeds = [image_embed.repeat_interleave(repeats=num_frames, dim=0) for image_embed in image_embeds]
+            encoder_hidden_states = (encoder_hidden_states, image_embeds)
+
+        # 2. pre-process
+        sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:])
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)
+
+            down_block_res_samples += res_samples
+
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples += (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            # To support older versions of motion modules that don't have a mid_block
+            if hasattr(self.mid_block, "motion_modules"):
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+
+        if mid_block_additional_residual is not None:
+            sample = sample + mid_block_additional_residual
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    num_frames=num_frames,
+                )
+
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+
+        sample = self.conv_out(sample)
+
+        # reshape to (batch, channel, framerate, width, height)
+        sample = sample[None, :].reshape((-1, num_frames) + sample.shape[1:]).permute(0, 2, 1, 3, 4)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNetMotionOutput(sample=sample)
diff --git a/diffusers/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/diffusers/src/diffusers/models/unets/unet_spatio_temporal_condition.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fb975bc32d95678ca0729bea2a3d1044482d03a
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_spatio_temporal_condition.py
@@ -0,0 +1,490 @@
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import UNet2DConditionLoadersMixin
+from ...utils import BaseOutput, logging
+from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
+from ..embeddings import TimestepEmbedding, Timesteps
+from ..modeling_utils import ModelMixin
+from .unet_3d_blocks import UNetMidBlockSpatioTemporal, get_down_block, get_up_block
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class UNetSpatioTemporalConditionOutput(BaseOutput):
+    """
+    The output of [`UNetSpatioTemporalConditionModel`].
+
+    Args:
+        sample (`torch.Tensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
+    """
+
+    sample: torch.Tensor = None
+
+
+class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and
+    returns a sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
+            The tuple of upsample blocks to use.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        addition_time_embed_dim: (`int`, defaults to 256):
+            Dimension to to encode the additional time ids.
+        projection_class_embeddings_input_dim (`int`, defaults to 768):
+            The dimension of the projection of encoded `added_time_ids`.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unets.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
+            [`~models.unets.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
+            [`~models.unets.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
+        num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
+            The number of attention heads.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 8,
+        out_channels: int = 4,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlockSpatioTemporal",
+            "CrossAttnDownBlockSpatioTemporal",
+            "CrossAttnDownBlockSpatioTemporal",
+            "DownBlockSpatioTemporal",
+        ),
+        up_block_types: Tuple[str] = (
+            "UpBlockSpatioTemporal",
+            "CrossAttnUpBlockSpatioTemporal",
+            "CrossAttnUpBlockSpatioTemporal",
+            "CrossAttnUpBlockSpatioTemporal",
+        ),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        addition_time_embed_dim: int = 256,
+        projection_class_embeddings_input_dim: int = 768,
+        layers_per_block: Union[int, Tuple[int]] = 2,
+        cross_attention_dim: Union[int, Tuple[int]] = 1024,
+        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+        num_attention_heads: Union[int, Tuple[int]] = (5, 10, 20, 20),
+        num_frames: int = 25,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+
+        # input
+        self.conv_in = nn.Conv2d(
+            in_channels,
+            block_out_channels[0],
+            kernel_size=3,
+            padding=1,
+        )
+
+        # time
+        time_embed_dim = block_out_channels[0] * 4
+
+        self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+        self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
+        self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+
+        if isinstance(layers_per_block, int):
+            layers_per_block = [layers_per_block] * len(down_block_types)
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+
+        blocks_time_embed_dim = time_embed_dim
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block[i],
+                transformer_layers_per_block=transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=1e-5,
+                cross_attention_dim=cross_attention_dim[i],
+                num_attention_heads=num_attention_heads[i],
+                resnet_act_fn="silu",
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlockSpatioTemporal(
+            block_out_channels[-1],
+            temb_channels=blocks_time_embed_dim,
+            transformer_layers_per_block=transformer_layers_per_block[-1],
+            cross_attention_dim=cross_attention_dim[-1],
+            num_attention_heads=num_attention_heads[-1],
+        )
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        reversed_layers_per_block = list(reversed(layers_per_block))
+        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
+
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=reversed_layers_per_block[i] + 1,
+                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=1e-5,
+                resolution_idx=i,
+                cross_attention_dim=reversed_cross_attention_dim[i],
+                num_attention_heads=reversed_num_attention_heads[i],
+                resnet_act_fn="silu",
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-5)
+        self.conv_act = nn.SiLU()
+
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0],
+            out_channels,
+            kernel_size=3,
+            padding=1,
+        )
+
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(
+            name: str,
+            module: torch.nn.Module,
+            processors: Dict[str, AttentionProcessor],
+        ):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
+    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
+        """
+        Sets the attention processor to use [feed forward
+        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+
+        Parameters:
+            chunk_size (`int`, *optional*):
+                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
+                over each tensor of dim=`dim`.
+            dim (`int`, *optional*, defaults to `0`):
+                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
+                or dim=1 (sequence length).
+        """
+        if dim not in [0, 1]:
+            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
+
+        # By default chunk size is 1
+        chunk_size = chunk_size or 1
+
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, chunk_size, dim)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        added_time_ids: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
+        r"""
+        The [`UNetSpatioTemporalConditionModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
+            added_time_ids: (`torch.Tensor`):
+                The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
+                embeddings and added to the time embeddings.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead
+                of a plain tuple.
+        Returns:
+            [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is the sample tensor.
+        """
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        batch_size, num_frames = sample.shape[:2]
+        timesteps = timesteps.expand(batch_size)
+
+        t_emb = self.time_proj(timesteps)
+
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        emb = self.time_embedding(t_emb)
+
+        time_embeds = self.add_time_proj(added_time_ids.flatten())
+        time_embeds = time_embeds.reshape((batch_size, -1))
+        time_embeds = time_embeds.to(emb.dtype)
+        aug_emb = self.add_embedding(time_embeds)
+        emb = emb + aug_emb
+
+        # Flatten the batch and frames dimensions
+        # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
+        sample = sample.flatten(0, 1)
+        # Repeat the embeddings num_video_frames times
+        # emb: [batch, channels] -> [batch * frames, channels]
+        emb = emb.repeat_interleave(num_frames, dim=0)
+        # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
+        encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)
+
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    image_only_indicator=image_only_indicator,
+                )
+            else:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    image_only_indicator=image_only_indicator,
+                )
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        sample = self.mid_block(
+            hidden_states=sample,
+            temb=emb,
+            encoder_hidden_states=encoder_hidden_states,
+            image_only_indicator=image_only_indicator,
+        )
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    image_only_indicator=image_only_indicator,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    image_only_indicator=image_only_indicator,
+                )
+
+        # 6. post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        # 7. Reshape back to original shape
+        sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
+
+        if not return_dict:
+            return (sample,)
+
+        return UNetSpatioTemporalConditionOutput(sample=sample)
diff --git a/diffusers/src/diffusers/models/unets/unet_stable_cascade.py b/diffusers/src/diffusers/models/unets/unet_stable_cascade.py
new file mode 100644
index 0000000000000000000000000000000000000000..7deea9a714d4bfda1598e4f56828e9dc298e981d
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/unet_stable_cascade.py
@@ -0,0 +1,611 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin
+from ...utils import BaseOutput
+from ..attention_processor import Attention
+from ..modeling_utils import ModelMixin
+
+
+# Copied from diffusers.pipelines.wuerstchen.modeling_wuerstchen_common.WuerstchenLayerNorm with WuerstchenLayerNorm -> SDCascadeLayerNorm
+class SDCascadeLayerNorm(nn.LayerNorm):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def forward(self, x):
+        x = x.permute(0, 2, 3, 1)
+        x = super().forward(x)
+        return x.permute(0, 3, 1, 2)
+
+
+class SDCascadeTimestepBlock(nn.Module):
+    def __init__(self, c, c_timestep, conds=[]):
+        super().__init__()
+
+        self.mapper = nn.Linear(c_timestep, c * 2)
+        self.conds = conds
+        for cname in conds:
+            setattr(self, f"mapper_{cname}", nn.Linear(c_timestep, c * 2))
+
+    def forward(self, x, t):
+        t = t.chunk(len(self.conds) + 1, dim=1)
+        a, b = self.mapper(t[0])[:, :, None, None].chunk(2, dim=1)
+        for i, c in enumerate(self.conds):
+            ac, bc = getattr(self, f"mapper_{c}")(t[i + 1])[:, :, None, None].chunk(2, dim=1)
+            a, b = a + ac, b + bc
+        return x * (1 + a) + b
+
+
+class SDCascadeResBlock(nn.Module):
+    def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0):
+        super().__init__()
+        self.depthwise = nn.Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c)
+        self.norm = SDCascadeLayerNorm(c, elementwise_affine=False, eps=1e-6)
+        self.channelwise = nn.Sequential(
+            nn.Linear(c + c_skip, c * 4),
+            nn.GELU(),
+            GlobalResponseNorm(c * 4),
+            nn.Dropout(dropout),
+            nn.Linear(c * 4, c),
+        )
+
+    def forward(self, x, x_skip=None):
+        x_res = x
+        x = self.norm(self.depthwise(x))
+        if x_skip is not None:
+            x = torch.cat([x, x_skip], dim=1)
+        x = self.channelwise(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        return x + x_res
+
+
+# from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105
+class GlobalResponseNorm(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
+        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
+
+    def forward(self, x):
+        agg_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
+        stand_div_norm = agg_norm / (agg_norm.mean(dim=-1, keepdim=True) + 1e-6)
+        return self.gamma * (x * stand_div_norm) + self.beta + x
+
+
+class SDCascadeAttnBlock(nn.Module):
+    def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0):
+        super().__init__()
+
+        self.self_attn = self_attn
+        self.norm = SDCascadeLayerNorm(c, elementwise_affine=False, eps=1e-6)
+        self.attention = Attention(query_dim=c, heads=nhead, dim_head=c // nhead, dropout=dropout, bias=True)
+        self.kv_mapper = nn.Sequential(nn.SiLU(), nn.Linear(c_cond, c))
+
+    def forward(self, x, kv):
+        kv = self.kv_mapper(kv)
+        norm_x = self.norm(x)
+        if self.self_attn:
+            batch_size, channel, _, _ = x.shape
+            kv = torch.cat([norm_x.view(batch_size, channel, -1).transpose(1, 2), kv], dim=1)
+        x = x + self.attention(norm_x, encoder_hidden_states=kv)
+        return x
+
+
+class UpDownBlock2d(nn.Module):
+    def __init__(self, in_channels, out_channels, mode, enabled=True):
+        super().__init__()
+        if mode not in ["up", "down"]:
+            raise ValueError(f"{mode} not supported")
+        interpolation = (
+            nn.Upsample(scale_factor=2 if mode == "up" else 0.5, mode="bilinear", align_corners=True)
+            if enabled
+            else nn.Identity()
+        )
+        mapping = nn.Conv2d(in_channels, out_channels, kernel_size=1)
+        self.blocks = nn.ModuleList([interpolation, mapping] if mode == "up" else [mapping, interpolation])
+
+    def forward(self, x):
+        for block in self.blocks:
+            x = block(x)
+        return x
+
+
+@dataclass
+class StableCascadeUNetOutput(BaseOutput):
+    sample: torch.Tensor = None
+
+
+class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 16,
+        out_channels: int = 16,
+        timestep_ratio_embedding_dim: int = 64,
+        patch_size: int = 1,
+        conditioning_dim: int = 2048,
+        block_out_channels: Tuple[int] = (2048, 2048),
+        num_attention_heads: Tuple[int] = (32, 32),
+        down_num_layers_per_block: Tuple[int] = (8, 24),
+        up_num_layers_per_block: Tuple[int] = (24, 8),
+        down_blocks_repeat_mappers: Optional[Tuple[int]] = (
+            1,
+            1,
+        ),
+        up_blocks_repeat_mappers: Optional[Tuple[int]] = (1, 1),
+        block_types_per_layer: Tuple[Tuple[str]] = (
+            ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
+            ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
+        ),
+        clip_text_in_channels: Optional[int] = None,
+        clip_text_pooled_in_channels=1280,
+        clip_image_in_channels: Optional[int] = None,
+        clip_seq=4,
+        effnet_in_channels: Optional[int] = None,
+        pixel_mapper_in_channels: Optional[int] = None,
+        kernel_size=3,
+        dropout: Union[float, Tuple[float]] = (0.1, 0.1),
+        self_attn: Union[bool, Tuple[bool]] = True,
+        timestep_conditioning_type: Tuple[str] = ("sca", "crp"),
+        switch_level: Optional[Tuple[bool]] = None,
+    ):
+        """
+
+        Parameters:
+            in_channels (`int`, defaults to 16):
+                Number of channels in the input sample.
+            out_channels (`int`, defaults to 16):
+                Number of channels in the output sample.
+            timestep_ratio_embedding_dim (`int`, defaults to 64):
+                Dimension of the projected time embedding.
+            patch_size (`int`, defaults to 1):
+                Patch size to use for pixel unshuffling layer
+            conditioning_dim (`int`, defaults to 2048):
+                Dimension of the image and text conditional embedding.
+            block_out_channels (Tuple[int], defaults to (2048, 2048)):
+                Tuple of output channels for each block.
+            num_attention_heads (Tuple[int], defaults to (32, 32)):
+                Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have
+                attention.
+            down_num_layers_per_block (Tuple[int], defaults to [8, 24]):
+                Number of layers in each down block.
+            up_num_layers_per_block (Tuple[int], defaults to [24, 8]):
+                Number of layers in each up block.
+            down_blocks_repeat_mappers (Tuple[int], optional, defaults to [1, 1]):
+                Number of 1x1 Convolutional layers to repeat in each down block.
+            up_blocks_repeat_mappers (Tuple[int], optional, defaults to [1, 1]):
+                Number of 1x1 Convolutional layers to repeat in each up block.
+            block_types_per_layer (Tuple[Tuple[str]], optional,
+                defaults to (
+                    ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), ("SDCascadeResBlock",
+                    "SDCascadeTimestepBlock", "SDCascadeAttnBlock")
+                ): Block types used in each layer of the up/down blocks.
+            clip_text_in_channels (`int`, *optional*, defaults to `None`):
+                Number of input channels for CLIP based text conditioning.
+            clip_text_pooled_in_channels (`int`, *optional*, defaults to 1280):
+                Number of input channels for pooled CLIP text embeddings.
+            clip_image_in_channels (`int`, *optional*):
+                Number of input channels for CLIP based image conditioning.
+            clip_seq (`int`, *optional*, defaults to 4):
+            effnet_in_channels (`int`, *optional*, defaults to `None`):
+                Number of input channels for effnet conditioning.
+            pixel_mapper_in_channels (`int`, defaults to `None`):
+                Number of input channels for pixel mapper conditioning.
+            kernel_size (`int`, *optional*, defaults to 3):
+                Kernel size to use in the block convolutional layers.
+            dropout (Tuple[float], *optional*, defaults to (0.1, 0.1)):
+                Dropout to use per block.
+            self_attn (Union[bool, Tuple[bool]]):
+                Tuple of booleans that determine whether to use self attention in a block or not.
+            timestep_conditioning_type (Tuple[str], defaults to ("sca", "crp")):
+                Timestep conditioning type.
+            switch_level (Optional[Tuple[bool]], *optional*, defaults to `None`):
+                Tuple that indicates whether upsampling or downsampling should be applied in a block
+        """
+
+        super().__init__()
+
+        if len(block_out_channels) != len(down_num_layers_per_block):
+            raise ValueError(
+                f"Number of elements in `down_num_layers_per_block` must match the length of `block_out_channels`: {len(block_out_channels)}"
+            )
+
+        elif len(block_out_channels) != len(up_num_layers_per_block):
+            raise ValueError(
+                f"Number of elements in `up_num_layers_per_block` must match the length of `block_out_channels`: {len(block_out_channels)}"
+            )
+
+        elif len(block_out_channels) != len(down_blocks_repeat_mappers):
+            raise ValueError(
+                f"Number of elements in `down_blocks_repeat_mappers` must match the length of `block_out_channels`: {len(block_out_channels)}"
+            )
+
+        elif len(block_out_channels) != len(up_blocks_repeat_mappers):
+            raise ValueError(
+                f"Number of elements in `up_blocks_repeat_mappers` must match the length of `block_out_channels`: {len(block_out_channels)}"
+            )
+
+        elif len(block_out_channels) != len(block_types_per_layer):
+            raise ValueError(
+                f"Number of elements in `block_types_per_layer` must match the length of `block_out_channels`: {len(block_out_channels)}"
+            )
+
+        if isinstance(dropout, float):
+            dropout = (dropout,) * len(block_out_channels)
+        if isinstance(self_attn, bool):
+            self_attn = (self_attn,) * len(block_out_channels)
+
+        # CONDITIONING
+        if effnet_in_channels is not None:
+            self.effnet_mapper = nn.Sequential(
+                nn.Conv2d(effnet_in_channels, block_out_channels[0] * 4, kernel_size=1),
+                nn.GELU(),
+                nn.Conv2d(block_out_channels[0] * 4, block_out_channels[0], kernel_size=1),
+                SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
+            )
+        if pixel_mapper_in_channels is not None:
+            self.pixels_mapper = nn.Sequential(
+                nn.Conv2d(pixel_mapper_in_channels, block_out_channels[0] * 4, kernel_size=1),
+                nn.GELU(),
+                nn.Conv2d(block_out_channels[0] * 4, block_out_channels[0], kernel_size=1),
+                SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
+            )
+
+        self.clip_txt_pooled_mapper = nn.Linear(clip_text_pooled_in_channels, conditioning_dim * clip_seq)
+        if clip_text_in_channels is not None:
+            self.clip_txt_mapper = nn.Linear(clip_text_in_channels, conditioning_dim)
+        if clip_image_in_channels is not None:
+            self.clip_img_mapper = nn.Linear(clip_image_in_channels, conditioning_dim * clip_seq)
+        self.clip_norm = nn.LayerNorm(conditioning_dim, elementwise_affine=False, eps=1e-6)
+
+        self.embedding = nn.Sequential(
+            nn.PixelUnshuffle(patch_size),
+            nn.Conv2d(in_channels * (patch_size**2), block_out_channels[0], kernel_size=1),
+            SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
+        )
+
+        def get_block(block_type, in_channels, nhead, c_skip=0, dropout=0, self_attn=True):
+            if block_type == "SDCascadeResBlock":
+                return SDCascadeResBlock(in_channels, c_skip, kernel_size=kernel_size, dropout=dropout)
+            elif block_type == "SDCascadeAttnBlock":
+                return SDCascadeAttnBlock(in_channels, conditioning_dim, nhead, self_attn=self_attn, dropout=dropout)
+            elif block_type == "SDCascadeTimestepBlock":
+                return SDCascadeTimestepBlock(
+                    in_channels, timestep_ratio_embedding_dim, conds=timestep_conditioning_type
+                )
+            else:
+                raise ValueError(f"Block type {block_type} not supported")
+
+        # BLOCKS
+        # -- down blocks
+        self.down_blocks = nn.ModuleList()
+        self.down_downscalers = nn.ModuleList()
+        self.down_repeat_mappers = nn.ModuleList()
+        for i in range(len(block_out_channels)):
+            if i > 0:
+                self.down_downscalers.append(
+                    nn.Sequential(
+                        SDCascadeLayerNorm(block_out_channels[i - 1], elementwise_affine=False, eps=1e-6),
+                        UpDownBlock2d(
+                            block_out_channels[i - 1], block_out_channels[i], mode="down", enabled=switch_level[i - 1]
+                        )
+                        if switch_level is not None
+                        else nn.Conv2d(block_out_channels[i - 1], block_out_channels[i], kernel_size=2, stride=2),
+                    )
+                )
+            else:
+                self.down_downscalers.append(nn.Identity())
+
+            down_block = nn.ModuleList()
+            for _ in range(down_num_layers_per_block[i]):
+                for block_type in block_types_per_layer[i]:
+                    block = get_block(
+                        block_type,
+                        block_out_channels[i],
+                        num_attention_heads[i],
+                        dropout=dropout[i],
+                        self_attn=self_attn[i],
+                    )
+                    down_block.append(block)
+            self.down_blocks.append(down_block)
+
+            if down_blocks_repeat_mappers is not None:
+                block_repeat_mappers = nn.ModuleList()
+                for _ in range(down_blocks_repeat_mappers[i] - 1):
+                    block_repeat_mappers.append(nn.Conv2d(block_out_channels[i], block_out_channels[i], kernel_size=1))
+                self.down_repeat_mappers.append(block_repeat_mappers)
+
+        # -- up blocks
+        self.up_blocks = nn.ModuleList()
+        self.up_upscalers = nn.ModuleList()
+        self.up_repeat_mappers = nn.ModuleList()
+        for i in reversed(range(len(block_out_channels))):
+            if i > 0:
+                self.up_upscalers.append(
+                    nn.Sequential(
+                        SDCascadeLayerNorm(block_out_channels[i], elementwise_affine=False, eps=1e-6),
+                        UpDownBlock2d(
+                            block_out_channels[i], block_out_channels[i - 1], mode="up", enabled=switch_level[i - 1]
+                        )
+                        if switch_level is not None
+                        else nn.ConvTranspose2d(
+                            block_out_channels[i], block_out_channels[i - 1], kernel_size=2, stride=2
+                        ),
+                    )
+                )
+            else:
+                self.up_upscalers.append(nn.Identity())
+
+            up_block = nn.ModuleList()
+            for j in range(up_num_layers_per_block[::-1][i]):
+                for k, block_type in enumerate(block_types_per_layer[i]):
+                    c_skip = block_out_channels[i] if i < len(block_out_channels) - 1 and j == k == 0 else 0
+                    block = get_block(
+                        block_type,
+                        block_out_channels[i],
+                        num_attention_heads[i],
+                        c_skip=c_skip,
+                        dropout=dropout[i],
+                        self_attn=self_attn[i],
+                    )
+                    up_block.append(block)
+            self.up_blocks.append(up_block)
+
+            if up_blocks_repeat_mappers is not None:
+                block_repeat_mappers = nn.ModuleList()
+                for _ in range(up_blocks_repeat_mappers[::-1][i] - 1):
+                    block_repeat_mappers.append(nn.Conv2d(block_out_channels[i], block_out_channels[i], kernel_size=1))
+                self.up_repeat_mappers.append(block_repeat_mappers)
+
+        # OUTPUT
+        self.clf = nn.Sequential(
+            SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
+            nn.Conv2d(block_out_channels[0], out_channels * (patch_size**2), kernel_size=1),
+            nn.PixelShuffle(patch_size),
+        )
+
+        self.gradient_checkpointing = False
+
+    def _set_gradient_checkpointing(self, value=False):
+        self.gradient_checkpointing = value
+
+    def _init_weights(self, m):
+        if isinstance(m, (nn.Conv2d, nn.Linear)):
+            torch.nn.init.xavier_uniform_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+
+        nn.init.normal_(self.clip_txt_pooled_mapper.weight, std=0.02)
+        nn.init.normal_(self.clip_txt_mapper.weight, std=0.02) if hasattr(self, "clip_txt_mapper") else None
+        nn.init.normal_(self.clip_img_mapper.weight, std=0.02) if hasattr(self, "clip_img_mapper") else None
+
+        if hasattr(self, "effnet_mapper"):
+            nn.init.normal_(self.effnet_mapper[0].weight, std=0.02)  # conditionings
+            nn.init.normal_(self.effnet_mapper[2].weight, std=0.02)  # conditionings
+
+        if hasattr(self, "pixels_mapper"):
+            nn.init.normal_(self.pixels_mapper[0].weight, std=0.02)  # conditionings
+            nn.init.normal_(self.pixels_mapper[2].weight, std=0.02)  # conditionings
+
+        torch.nn.init.xavier_uniform_(self.embedding[1].weight, 0.02)  # inputs
+        nn.init.constant_(self.clf[1].weight, 0)  # outputs
+
+        # blocks
+        for level_block in self.down_blocks + self.up_blocks:
+            for block in level_block:
+                if isinstance(block, SDCascadeResBlock):
+                    block.channelwise[-1].weight.data *= np.sqrt(1 / sum(self.config.blocks[0]))
+                elif isinstance(block, SDCascadeTimestepBlock):
+                    nn.init.constant_(block.mapper.weight, 0)
+
+    def get_timestep_ratio_embedding(self, timestep_ratio, max_positions=10000):
+        r = timestep_ratio * max_positions
+        half_dim = self.config.timestep_ratio_embedding_dim // 2
+
+        emb = math.log(max_positions) / (half_dim - 1)
+        emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp()
+        emb = r[:, None] * emb[None, :]
+        emb = torch.cat([emb.sin(), emb.cos()], dim=1)
+
+        if self.config.timestep_ratio_embedding_dim % 2 == 1:  # zero pad
+            emb = nn.functional.pad(emb, (0, 1), mode="constant")
+
+        return emb.to(dtype=r.dtype)
+
+    def get_clip_embeddings(self, clip_txt_pooled, clip_txt=None, clip_img=None):
+        if len(clip_txt_pooled.shape) == 2:
+            clip_txt_pool = clip_txt_pooled.unsqueeze(1)
+        clip_txt_pool = self.clip_txt_pooled_mapper(clip_txt_pooled).view(
+            clip_txt_pooled.size(0), clip_txt_pooled.size(1) * self.config.clip_seq, -1
+        )
+        if clip_txt is not None and clip_img is not None:
+            clip_txt = self.clip_txt_mapper(clip_txt)
+            if len(clip_img.shape) == 2:
+                clip_img = clip_img.unsqueeze(1)
+            clip_img = self.clip_img_mapper(clip_img).view(
+                clip_img.size(0), clip_img.size(1) * self.config.clip_seq, -1
+            )
+            clip = torch.cat([clip_txt, clip_txt_pool, clip_img], dim=1)
+        else:
+            clip = clip_txt_pool
+        return self.clip_norm(clip)
+
+    def _down_encode(self, x, r_embed, clip):
+        level_outputs = []
+        block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
+
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            for down_block, downscaler, repmap in block_group:
+                x = downscaler(x)
+                for i in range(len(repmap) + 1):
+                    for block in down_block:
+                        if isinstance(block, SDCascadeResBlock):
+                            x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False)
+                        elif isinstance(block, SDCascadeAttnBlock):
+                            x = torch.utils.checkpoint.checkpoint(
+                                create_custom_forward(block), x, clip, use_reentrant=False
+                            )
+                        elif isinstance(block, SDCascadeTimestepBlock):
+                            x = torch.utils.checkpoint.checkpoint(
+                                create_custom_forward(block), x, r_embed, use_reentrant=False
+                            )
+                        else:
+                            x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), use_reentrant=False)
+                    if i < len(repmap):
+                        x = repmap[i](x)
+                level_outputs.insert(0, x)
+        else:
+            for down_block, downscaler, repmap in block_group:
+                x = downscaler(x)
+                for i in range(len(repmap) + 1):
+                    for block in down_block:
+                        if isinstance(block, SDCascadeResBlock):
+                            x = block(x)
+                        elif isinstance(block, SDCascadeAttnBlock):
+                            x = block(x, clip)
+                        elif isinstance(block, SDCascadeTimestepBlock):
+                            x = block(x, r_embed)
+                        else:
+                            x = block(x)
+                    if i < len(repmap):
+                        x = repmap[i](x)
+                level_outputs.insert(0, x)
+        return level_outputs
+
+    def _up_decode(self, level_outputs, r_embed, clip):
+        x = level_outputs[0]
+        block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
+
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            for i, (up_block, upscaler, repmap) in enumerate(block_group):
+                for j in range(len(repmap) + 1):
+                    for k, block in enumerate(up_block):
+                        if isinstance(block, SDCascadeResBlock):
+                            skip = level_outputs[i] if k == 0 and i > 0 else None
+                            if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)):
+                                orig_type = x.dtype
+                                x = torch.nn.functional.interpolate(
+                                    x.float(), skip.shape[-2:], mode="bilinear", align_corners=True
+                                )
+                                x = x.to(orig_type)
+                            x = torch.utils.checkpoint.checkpoint(
+                                create_custom_forward(block), x, skip, use_reentrant=False
+                            )
+                        elif isinstance(block, SDCascadeAttnBlock):
+                            x = torch.utils.checkpoint.checkpoint(
+                                create_custom_forward(block), x, clip, use_reentrant=False
+                            )
+                        elif isinstance(block, SDCascadeTimestepBlock):
+                            x = torch.utils.checkpoint.checkpoint(
+                                create_custom_forward(block), x, r_embed, use_reentrant=False
+                            )
+                        else:
+                            x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False)
+                    if j < len(repmap):
+                        x = repmap[j](x)
+                x = upscaler(x)
+        else:
+            for i, (up_block, upscaler, repmap) in enumerate(block_group):
+                for j in range(len(repmap) + 1):
+                    for k, block in enumerate(up_block):
+                        if isinstance(block, SDCascadeResBlock):
+                            skip = level_outputs[i] if k == 0 and i > 0 else None
+                            if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)):
+                                orig_type = x.dtype
+                                x = torch.nn.functional.interpolate(
+                                    x.float(), skip.shape[-2:], mode="bilinear", align_corners=True
+                                )
+                                x = x.to(orig_type)
+                            x = block(x, skip)
+                        elif isinstance(block, SDCascadeAttnBlock):
+                            x = block(x, clip)
+                        elif isinstance(block, SDCascadeTimestepBlock):
+                            x = block(x, r_embed)
+                        else:
+                            x = block(x)
+                    if j < len(repmap):
+                        x = repmap[j](x)
+                x = upscaler(x)
+        return x
+
+    def forward(
+        self,
+        sample,
+        timestep_ratio,
+        clip_text_pooled,
+        clip_text=None,
+        clip_img=None,
+        effnet=None,
+        pixels=None,
+        sca=None,
+        crp=None,
+        return_dict=True,
+    ):
+        if pixels is None:
+            pixels = sample.new_zeros(sample.size(0), 3, 8, 8)
+
+        # Process the conditioning embeddings
+        timestep_ratio_embed = self.get_timestep_ratio_embedding(timestep_ratio)
+        for c in self.config.timestep_conditioning_type:
+            if c == "sca":
+                cond = sca
+            elif c == "crp":
+                cond = crp
+            else:
+                cond = None
+            t_cond = cond or torch.zeros_like(timestep_ratio)
+            timestep_ratio_embed = torch.cat([timestep_ratio_embed, self.get_timestep_ratio_embedding(t_cond)], dim=1)
+        clip = self.get_clip_embeddings(clip_txt_pooled=clip_text_pooled, clip_txt=clip_text, clip_img=clip_img)
+
+        # Model Blocks
+        x = self.embedding(sample)
+        if hasattr(self, "effnet_mapper") and effnet is not None:
+            x = x + self.effnet_mapper(
+                nn.functional.interpolate(effnet, size=x.shape[-2:], mode="bilinear", align_corners=True)
+            )
+        if hasattr(self, "pixels_mapper"):
+            x = x + nn.functional.interpolate(
+                self.pixels_mapper(pixels), size=x.shape[-2:], mode="bilinear", align_corners=True
+            )
+        level_outputs = self._down_encode(x, timestep_ratio_embed, clip)
+        x = self._up_decode(level_outputs, timestep_ratio_embed, clip)
+        sample = self.clf(x)
+
+        if not return_dict:
+            return (sample,)
+        return StableCascadeUNetOutput(sample=sample)
diff --git a/diffusers/src/diffusers/models/unets/uvit_2d.py b/diffusers/src/diffusers/models/unets/uvit_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a379bf5f9c311a90ef71d1e921a10dd0647d6e9
--- /dev/null
+++ b/diffusers/src/diffusers/models/unets/uvit_2d.py
@@ -0,0 +1,470 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.utils.checkpoint import checkpoint
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import PeftAdapterMixin
+from ..attention import BasicTransformerBlock, SkipFFTransformerBlock
+from ..attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from ..embeddings import TimestepEmbedding, get_timestep_embedding
+from ..modeling_utils import ModelMixin
+from ..normalization import GlobalResponseNorm, RMSNorm
+from ..resnet import Downsample2D, Upsample2D
+
+
+class UVit2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        # global config
+        hidden_size: int = 1024,
+        use_bias: bool = False,
+        hidden_dropout: float = 0.0,
+        # conditioning dimensions
+        cond_embed_dim: int = 768,
+        micro_cond_encode_dim: int = 256,
+        micro_cond_embed_dim: int = 1280,
+        encoder_hidden_size: int = 768,
+        # num tokens
+        vocab_size: int = 8256,  # codebook_size + 1 (for the mask token) rounded
+        codebook_size: int = 8192,
+        # `UVit2DConvEmbed`
+        in_channels: int = 768,
+        block_out_channels: int = 768,
+        num_res_blocks: int = 3,
+        downsample: bool = False,
+        upsample: bool = False,
+        block_num_heads: int = 12,
+        # `TransformerLayer`
+        num_hidden_layers: int = 22,
+        num_attention_heads: int = 16,
+        # `Attention`
+        attention_dropout: float = 0.0,
+        # `FeedForward`
+        intermediate_size: int = 2816,
+        # `Norm`
+        layer_norm_eps: float = 1e-6,
+        ln_elementwise_affine: bool = True,
+        sample_size: int = 64,
+    ):
+        super().__init__()
+
+        self.encoder_proj = nn.Linear(encoder_hidden_size, hidden_size, bias=use_bias)
+        self.encoder_proj_layer_norm = RMSNorm(hidden_size, layer_norm_eps, ln_elementwise_affine)
+
+        self.embed = UVit2DConvEmbed(
+            in_channels, block_out_channels, vocab_size, ln_elementwise_affine, layer_norm_eps, use_bias
+        )
+
+        self.cond_embed = TimestepEmbedding(
+            micro_cond_embed_dim + cond_embed_dim, hidden_size, sample_proj_bias=use_bias
+        )
+
+        self.down_block = UVitBlock(
+            block_out_channels,
+            num_res_blocks,
+            hidden_size,
+            hidden_dropout,
+            ln_elementwise_affine,
+            layer_norm_eps,
+            use_bias,
+            block_num_heads,
+            attention_dropout,
+            downsample,
+            False,
+        )
+
+        self.project_to_hidden_norm = RMSNorm(block_out_channels, layer_norm_eps, ln_elementwise_affine)
+        self.project_to_hidden = nn.Linear(block_out_channels, hidden_size, bias=use_bias)
+
+        self.transformer_layers = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    dim=hidden_size,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=hidden_size // num_attention_heads,
+                    dropout=hidden_dropout,
+                    cross_attention_dim=hidden_size,
+                    attention_bias=use_bias,
+                    norm_type="ada_norm_continuous",
+                    ada_norm_continous_conditioning_embedding_dim=hidden_size,
+                    norm_elementwise_affine=ln_elementwise_affine,
+                    norm_eps=layer_norm_eps,
+                    ada_norm_bias=use_bias,
+                    ff_inner_dim=intermediate_size,
+                    ff_bias=use_bias,
+                    attention_out_bias=use_bias,
+                )
+                for _ in range(num_hidden_layers)
+            ]
+        )
+
+        self.project_from_hidden_norm = RMSNorm(hidden_size, layer_norm_eps, ln_elementwise_affine)
+        self.project_from_hidden = nn.Linear(hidden_size, block_out_channels, bias=use_bias)
+
+        self.up_block = UVitBlock(
+            block_out_channels,
+            num_res_blocks,
+            hidden_size,
+            hidden_dropout,
+            ln_elementwise_affine,
+            layer_norm_eps,
+            use_bias,
+            block_num_heads,
+            attention_dropout,
+            downsample=False,
+            upsample=upsample,
+        )
+
+        self.mlm_layer = ConvMlmLayer(
+            block_out_channels, in_channels, use_bias, ln_elementwise_affine, layer_norm_eps, codebook_size
+        )
+
+        self.gradient_checkpointing = False
+
+    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
+        pass
+
+    def forward(self, input_ids, encoder_hidden_states, pooled_text_emb, micro_conds, cross_attention_kwargs=None):
+        encoder_hidden_states = self.encoder_proj(encoder_hidden_states)
+        encoder_hidden_states = self.encoder_proj_layer_norm(encoder_hidden_states)
+
+        micro_cond_embeds = get_timestep_embedding(
+            micro_conds.flatten(), self.config.micro_cond_encode_dim, flip_sin_to_cos=True, downscale_freq_shift=0
+        )
+
+        micro_cond_embeds = micro_cond_embeds.reshape((input_ids.shape[0], -1))
+
+        pooled_text_emb = torch.cat([pooled_text_emb, micro_cond_embeds], dim=1)
+        pooled_text_emb = pooled_text_emb.to(dtype=self.dtype)
+        pooled_text_emb = self.cond_embed(pooled_text_emb).to(encoder_hidden_states.dtype)
+
+        hidden_states = self.embed(input_ids)
+
+        hidden_states = self.down_block(
+            hidden_states,
+            pooled_text_emb=pooled_text_emb,
+            encoder_hidden_states=encoder_hidden_states,
+            cross_attention_kwargs=cross_attention_kwargs,
+        )
+
+        batch_size, channels, height, width = hidden_states.shape
+        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels)
+
+        hidden_states = self.project_to_hidden_norm(hidden_states)
+        hidden_states = self.project_to_hidden(hidden_states)
+
+        for layer in self.transformer_layers:
+            if self.training and self.gradient_checkpointing:
+
+                def layer_(*args):
+                    return checkpoint(layer, *args)
+
+            else:
+                layer_ = layer
+
+            hidden_states = layer_(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                added_cond_kwargs={"pooled_text_emb": pooled_text_emb},
+            )
+
+        hidden_states = self.project_from_hidden_norm(hidden_states)
+        hidden_states = self.project_from_hidden(hidden_states)
+
+        hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
+
+        hidden_states = self.up_block(
+            hidden_states,
+            pooled_text_emb=pooled_text_emb,
+            encoder_hidden_states=encoder_hidden_states,
+            cross_attention_kwargs=cross_attention_kwargs,
+        )
+
+        logits = self.mlm_layer(hidden_states)
+
+        return logits
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+
+class UVit2DConvEmbed(nn.Module):
+    def __init__(self, in_channels, block_out_channels, vocab_size, elementwise_affine, eps, bias):
+        super().__init__()
+        self.embeddings = nn.Embedding(vocab_size, in_channels)
+        self.layer_norm = RMSNorm(in_channels, eps, elementwise_affine)
+        self.conv = nn.Conv2d(in_channels, block_out_channels, kernel_size=1, bias=bias)
+
+    def forward(self, input_ids):
+        embeddings = self.embeddings(input_ids)
+        embeddings = self.layer_norm(embeddings)
+        embeddings = embeddings.permute(0, 3, 1, 2)
+        embeddings = self.conv(embeddings)
+        return embeddings
+
+
+class UVitBlock(nn.Module):
+    def __init__(
+        self,
+        channels,
+        num_res_blocks: int,
+        hidden_size,
+        hidden_dropout,
+        ln_elementwise_affine,
+        layer_norm_eps,
+        use_bias,
+        block_num_heads,
+        attention_dropout,
+        downsample: bool,
+        upsample: bool,
+    ):
+        super().__init__()
+
+        if downsample:
+            self.downsample = Downsample2D(
+                channels,
+                use_conv=True,
+                padding=0,
+                name="Conv2d_0",
+                kernel_size=2,
+                norm_type="rms_norm",
+                eps=layer_norm_eps,
+                elementwise_affine=ln_elementwise_affine,
+                bias=use_bias,
+            )
+        else:
+            self.downsample = None
+
+        self.res_blocks = nn.ModuleList(
+            [
+                ConvNextBlock(
+                    channels,
+                    layer_norm_eps,
+                    ln_elementwise_affine,
+                    use_bias,
+                    hidden_dropout,
+                    hidden_size,
+                )
+                for i in range(num_res_blocks)
+            ]
+        )
+
+        self.attention_blocks = nn.ModuleList(
+            [
+                SkipFFTransformerBlock(
+                    channels,
+                    block_num_heads,
+                    channels // block_num_heads,
+                    hidden_size,
+                    use_bias,
+                    attention_dropout,
+                    channels,
+                    attention_bias=use_bias,
+                    attention_out_bias=use_bias,
+                )
+                for _ in range(num_res_blocks)
+            ]
+        )
+
+        if upsample:
+            self.upsample = Upsample2D(
+                channels,
+                use_conv_transpose=True,
+                kernel_size=2,
+                padding=0,
+                name="conv",
+                norm_type="rms_norm",
+                eps=layer_norm_eps,
+                elementwise_affine=ln_elementwise_affine,
+                bias=use_bias,
+                interpolate=False,
+            )
+        else:
+            self.upsample = None
+
+    def forward(self, x, pooled_text_emb, encoder_hidden_states, cross_attention_kwargs):
+        if self.downsample is not None:
+            x = self.downsample(x)
+
+        for res_block, attention_block in zip(self.res_blocks, self.attention_blocks):
+            x = res_block(x, pooled_text_emb)
+
+            batch_size, channels, height, width = x.shape
+            x = x.view(batch_size, channels, height * width).permute(0, 2, 1)
+            x = attention_block(
+                x, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs
+            )
+            x = x.permute(0, 2, 1).view(batch_size, channels, height, width)
+
+        if self.upsample is not None:
+            x = self.upsample(x)
+
+        return x
+
+
+class ConvNextBlock(nn.Module):
+    def __init__(
+        self, channels, layer_norm_eps, ln_elementwise_affine, use_bias, hidden_dropout, hidden_size, res_ffn_factor=4
+    ):
+        super().__init__()
+        self.depthwise = nn.Conv2d(
+            channels,
+            channels,
+            kernel_size=3,
+            padding=1,
+            groups=channels,
+            bias=use_bias,
+        )
+        self.norm = RMSNorm(channels, layer_norm_eps, ln_elementwise_affine)
+        self.channelwise_linear_1 = nn.Linear(channels, int(channels * res_ffn_factor), bias=use_bias)
+        self.channelwise_act = nn.GELU()
+        self.channelwise_norm = GlobalResponseNorm(int(channels * res_ffn_factor))
+        self.channelwise_linear_2 = nn.Linear(int(channels * res_ffn_factor), channels, bias=use_bias)
+        self.channelwise_dropout = nn.Dropout(hidden_dropout)
+        self.cond_embeds_mapper = nn.Linear(hidden_size, channels * 2, use_bias)
+
+    def forward(self, x, cond_embeds):
+        x_res = x
+
+        x = self.depthwise(x)
+
+        x = x.permute(0, 2, 3, 1)
+        x = self.norm(x)
+
+        x = self.channelwise_linear_1(x)
+        x = self.channelwise_act(x)
+        x = self.channelwise_norm(x)
+        x = self.channelwise_linear_2(x)
+        x = self.channelwise_dropout(x)
+
+        x = x.permute(0, 3, 1, 2)
+
+        x = x + x_res
+
+        scale, shift = self.cond_embeds_mapper(F.silu(cond_embeds)).chunk(2, dim=1)
+        x = x * (1 + scale[:, :, None, None]) + shift[:, :, None, None]
+
+        return x
+
+
+class ConvMlmLayer(nn.Module):
+    def __init__(
+        self,
+        block_out_channels: int,
+        in_channels: int,
+        use_bias: bool,
+        ln_elementwise_affine: bool,
+        layer_norm_eps: float,
+        codebook_size: int,
+    ):
+        super().__init__()
+        self.conv1 = nn.Conv2d(block_out_channels, in_channels, kernel_size=1, bias=use_bias)
+        self.layer_norm = RMSNorm(in_channels, layer_norm_eps, ln_elementwise_affine)
+        self.conv2 = nn.Conv2d(in_channels, codebook_size, kernel_size=1, bias=use_bias)
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.layer_norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        logits = self.conv2(hidden_states)
+        return logits
diff --git a/diffusers/src/diffusers/models/upsampling.py b/diffusers/src/diffusers/models/upsampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd5ed28c70703c83c1f28acbf4f185bdeb3b14c9
--- /dev/null
+++ b/diffusers/src/diffusers/models/upsampling.py
@@ -0,0 +1,509 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import deprecate
+from .normalization import RMSNorm
+
+
+class Upsample1D(nn.Module):
+    """A 1D upsampling layer with an optional convolution.
+
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        use_conv_transpose (`bool`, default `False`):
+            option to use a convolution transpose.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        name (`str`, default `conv`):
+            name of the upsampling 1D layer.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = False,
+        use_conv_transpose: bool = False,
+        out_channels: Optional[int] = None,
+        name: str = "conv",
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+
+        self.conv = None
+        if use_conv_transpose:
+            self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1)
+        elif use_conv:
+            self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1)
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        assert inputs.shape[1] == self.channels
+        if self.use_conv_transpose:
+            return self.conv(inputs)
+
+        outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest")
+
+        if self.use_conv:
+            outputs = self.conv(outputs)
+
+        return outputs
+
+
+class Upsample2D(nn.Module):
+    """A 2D upsampling layer with an optional convolution.
+
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        use_conv_transpose (`bool`, default `False`):
+            option to use a convolution transpose.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        name (`str`, default `conv`):
+            name of the upsampling 2D layer.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = False,
+        use_conv_transpose: bool = False,
+        out_channels: Optional[int] = None,
+        name: str = "conv",
+        kernel_size: Optional[int] = None,
+        padding=1,
+        norm_type=None,
+        eps=None,
+        elementwise_affine=None,
+        bias=True,
+        interpolate=True,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+        self.interpolate = interpolate
+
+        if norm_type == "ln_norm":
+            self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(channels, eps, elementwise_affine)
+        elif norm_type is None:
+            self.norm = None
+        else:
+            raise ValueError(f"unknown norm_type: {norm_type}")
+
+        conv = None
+        if use_conv_transpose:
+            if kernel_size is None:
+                kernel_size = 4
+            conv = nn.ConvTranspose2d(
+                channels, self.out_channels, kernel_size=kernel_size, stride=2, padding=padding, bias=bias
+            )
+        elif use_conv:
+            if kernel_size is None:
+                kernel_size = 3
+            conv = nn.Conv2d(self.channels, self.out_channels, kernel_size=kernel_size, padding=padding, bias=bias)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if name == "conv":
+            self.conv = conv
+        else:
+            self.Conv2d_0 = conv
+
+    def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None, *args, **kwargs) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        assert hidden_states.shape[1] == self.channels
+
+        if self.norm is not None:
+            hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+
+        if self.use_conv_transpose:
+            return self.conv(hidden_states)
+
+        # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16
+        # TODO(Suraj): Remove this cast once the issue is fixed in PyTorch
+        # https://github.com/pytorch/pytorch/issues/86679
+        dtype = hidden_states.dtype
+        if dtype == torch.bfloat16:
+            hidden_states = hidden_states.to(torch.float32)
+
+        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+        if hidden_states.shape[0] >= 64:
+            hidden_states = hidden_states.contiguous()
+
+        # if `output_size` is passed we force the interpolation output
+        # size and do not make use of `scale_factor=2`
+        if self.interpolate:
+            if output_size is None:
+                hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
+            else:
+                hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
+
+        # If the input is bfloat16, we cast back to bfloat16
+        if dtype == torch.bfloat16:
+            hidden_states = hidden_states.to(dtype)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if self.use_conv:
+            if self.name == "conv":
+                hidden_states = self.conv(hidden_states)
+            else:
+                hidden_states = self.Conv2d_0(hidden_states)
+
+        return hidden_states
+
+
+class FirUpsample2D(nn.Module):
+    """A 2D FIR upsampling layer with an optional convolution.
+
+    Parameters:
+        channels (`int`, optional):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        fir_kernel (`tuple`, default `(1, 3, 3, 1)`):
+            kernel for the FIR filter.
+    """
+
+    def __init__(
+        self,
+        channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        use_conv: bool = False,
+        fir_kernel: Tuple[int, int, int, int] = (1, 3, 3, 1),
+    ):
+        super().__init__()
+        out_channels = out_channels if out_channels else channels
+        if use_conv:
+            self.Conv2d_0 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.use_conv = use_conv
+        self.fir_kernel = fir_kernel
+        self.out_channels = out_channels
+
+    def _upsample_2d(
+        self,
+        hidden_states: torch.Tensor,
+        weight: Optional[torch.Tensor] = None,
+        kernel: Optional[torch.Tensor] = None,
+        factor: int = 2,
+        gain: float = 1,
+    ) -> torch.Tensor:
+        """Fused `upsample_2d()` followed by `Conv2d()`.
+
+        Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
+        efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of
+        arbitrary order.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+            weight (`torch.Tensor`, *optional*):
+                Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be
+                performed by `inChannels = x.shape[0] // numGroups`.
+            kernel (`torch.Tensor`, *optional*):
+                FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which
+                corresponds to nearest-neighbor upsampling.
+            factor (`int`, *optional*): Integer upsampling factor (default: 2).
+            gain (`float`, *optional*): Scaling factor for signal magnitude (default: 1.0).
+
+        Returns:
+            output (`torch.Tensor`):
+                Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same
+                datatype as `hidden_states`.
+        """
+
+        assert isinstance(factor, int) and factor >= 1
+
+        # Setup filter kernel.
+        if kernel is None:
+            kernel = [1] * factor
+
+        # setup kernel
+        kernel = torch.tensor(kernel, dtype=torch.float32)
+        if kernel.ndim == 1:
+            kernel = torch.outer(kernel, kernel)
+        kernel /= torch.sum(kernel)
+
+        kernel = kernel * (gain * (factor**2))
+
+        if self.use_conv:
+            convH = weight.shape[2]
+            convW = weight.shape[3]
+            inC = weight.shape[1]
+
+            pad_value = (kernel.shape[0] - factor) - (convW - 1)
+
+            stride = (factor, factor)
+            # Determine data dimensions.
+            output_shape = (
+                (hidden_states.shape[2] - 1) * factor + convH,
+                (hidden_states.shape[3] - 1) * factor + convW,
+            )
+            output_padding = (
+                output_shape[0] - (hidden_states.shape[2] - 1) * stride[0] - convH,
+                output_shape[1] - (hidden_states.shape[3] - 1) * stride[1] - convW,
+            )
+            assert output_padding[0] >= 0 and output_padding[1] >= 0
+            num_groups = hidden_states.shape[1] // inC
+
+            # Transpose weights.
+            weight = torch.reshape(weight, (num_groups, -1, inC, convH, convW))
+            weight = torch.flip(weight, dims=[3, 4]).permute(0, 2, 1, 3, 4)
+            weight = torch.reshape(weight, (num_groups * inC, -1, convH, convW))
+
+            inverse_conv = F.conv_transpose2d(
+                hidden_states,
+                weight,
+                stride=stride,
+                output_padding=output_padding,
+                padding=0,
+            )
+
+            output = upfirdn2d_native(
+                inverse_conv,
+                torch.tensor(kernel, device=inverse_conv.device),
+                pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1),
+            )
+        else:
+            pad_value = kernel.shape[0] - factor
+            output = upfirdn2d_native(
+                hidden_states,
+                torch.tensor(kernel, device=hidden_states.device),
+                up=factor,
+                pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
+            )
+
+        return output
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if self.use_conv:
+            height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel)
+            height = height + self.Conv2d_0.bias.reshape(1, -1, 1, 1)
+        else:
+            height = self._upsample_2d(hidden_states, kernel=self.fir_kernel, factor=2)
+
+        return height
+
+
+class KUpsample2D(nn.Module):
+    r"""A 2D K-upsampling layer.
+
+    Parameters:
+        pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use.
+    """
+
+    def __init__(self, pad_mode: str = "reflect"):
+        super().__init__()
+        self.pad_mode = pad_mode
+        kernel_1d = torch.tensor([[1 / 8, 3 / 8, 3 / 8, 1 / 8]]) * 2
+        self.pad = kernel_1d.shape[1] // 2 - 1
+        self.register_buffer("kernel", kernel_1d.T @ kernel_1d, persistent=False)
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        inputs = F.pad(inputs, ((self.pad + 1) // 2,) * 4, self.pad_mode)
+        weight = inputs.new_zeros(
+            [
+                inputs.shape[1],
+                inputs.shape[1],
+                self.kernel.shape[0],
+                self.kernel.shape[1],
+            ]
+        )
+        indices = torch.arange(inputs.shape[1], device=inputs.device)
+        kernel = self.kernel.to(weight)[None, :].expand(inputs.shape[1], -1, -1)
+        weight[indices, indices] = kernel
+        return F.conv_transpose2d(inputs, weight, stride=2, padding=self.pad * 2 + 1)
+
+
+class CogVideoXUpsample3D(nn.Module):
+    r"""
+    A 3D Upsample layer using in CogVideoX by Tsinghua University & ZhipuAI # Todo: Wait for paper relase.
+
+    Args:
+        in_channels (`int`):
+            Number of channels in the input image.
+        out_channels (`int`):
+            Number of channels produced by the convolution.
+        kernel_size (`int`, defaults to `3`):
+            Size of the convolving kernel.
+        stride (`int`, defaults to `1`):
+            Stride of the convolution.
+        padding (`int`, defaults to `1`):
+            Padding added to all four sides of the input.
+        compress_time (`bool`, defaults to `False`):
+            Whether or not to compress the time dimension.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        stride: int = 1,
+        padding: int = 1,
+        compress_time: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        if self.compress_time:
+            if inputs.shape[2] > 1 and inputs.shape[2] % 2 == 1:
+                # split first frame
+                x_first, x_rest = inputs[:, :, 0], inputs[:, :, 1:]
+
+                x_first = F.interpolate(x_first, scale_factor=2.0)
+                x_rest = F.interpolate(x_rest, scale_factor=2.0)
+                x_first = x_first[:, :, None, :, :]
+                inputs = torch.cat([x_first, x_rest], dim=2)
+            elif inputs.shape[2] > 1:
+                inputs = F.interpolate(inputs, scale_factor=2.0)
+            else:
+                inputs = inputs.squeeze(2)
+                inputs = F.interpolate(inputs, scale_factor=2.0)
+                inputs = inputs[:, :, None, :, :]
+        else:
+            # only interpolate 2D
+            b, c, t, h, w = inputs.shape
+            inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
+            inputs = F.interpolate(inputs, scale_factor=2.0)
+            inputs = inputs.reshape(b, t, c, *inputs.shape[2:]).permute(0, 2, 1, 3, 4)
+
+        b, c, t, h, w = inputs.shape
+        inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
+        inputs = self.conv(inputs)
+        inputs = inputs.reshape(b, t, *inputs.shape[1:]).permute(0, 2, 1, 3, 4)
+
+        return inputs
+
+
+def upfirdn2d_native(
+    tensor: torch.Tensor,
+    kernel: torch.Tensor,
+    up: int = 1,
+    down: int = 1,
+    pad: Tuple[int, int] = (0, 0),
+) -> torch.Tensor:
+    up_x = up_y = up
+    down_x = down_y = down
+    pad_x0 = pad_y0 = pad[0]
+    pad_x1 = pad_y1 = pad[1]
+
+    _, channel, in_h, in_w = tensor.shape
+    tensor = tensor.reshape(-1, in_h, in_w, 1)
+
+    _, in_h, in_w, minor = tensor.shape
+    kernel_h, kernel_w = kernel.shape
+
+    out = tensor.view(-1, in_h, 1, in_w, 1, minor)
+    out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
+    out = out.view(-1, in_h * up_y, in_w * up_x, minor)
+
+    out = F.pad(out, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)])
+    out = out.to(tensor.device)  # Move back to mps if necessary
+    out = out[
+        :,
+        max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0),
+        max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0),
+        :,
+    ]
+
+    out = out.permute(0, 3, 1, 2)
+    out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1])
+    w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w)
+    out = F.conv2d(out, w)
+    out = out.reshape(
+        -1,
+        minor,
+        in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
+        in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1,
+    )
+    out = out.permute(0, 2, 3, 1)
+    out = out[:, ::down_y, ::down_x, :]
+
+    out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
+    out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
+
+    return out.view(-1, channel, out_h, out_w)
+
+
+def upsample_2d(
+    hidden_states: torch.Tensor,
+    kernel: Optional[torch.Tensor] = None,
+    factor: int = 2,
+    gain: float = 1,
+) -> torch.Tensor:
+    r"""Upsample2D a batch of 2D images with the given filter.
+    Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given
+    filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified
+    `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its shape is
+    a: multiple of the upsampling factor.
+
+    Args:
+        hidden_states (`torch.Tensor`):
+            Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+        kernel (`torch.Tensor`, *optional*):
+            FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which
+            corresponds to nearest-neighbor upsampling.
+        factor (`int`, *optional*, default to `2`):
+            Integer upsampling factor.
+        gain (`float`, *optional*, default to `1.0`):
+            Scaling factor for signal magnitude (default: 1.0).
+
+    Returns:
+        output (`torch.Tensor`):
+            Tensor of the shape `[N, C, H * factor, W * factor]`
+    """
+    assert isinstance(factor, int) and factor >= 1
+    if kernel is None:
+        kernel = [1] * factor
+
+    kernel = torch.tensor(kernel, dtype=torch.float32)
+    if kernel.ndim == 1:
+        kernel = torch.outer(kernel, kernel)
+    kernel /= torch.sum(kernel)
+
+    kernel = kernel * (gain * (factor**2))
+    pad_value = kernel.shape[0] - factor
+    output = upfirdn2d_native(
+        hidden_states,
+        kernel.to(device=hidden_states.device),
+        up=factor,
+        pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
+    )
+    return output
diff --git a/diffusers/src/diffusers/models/vae_flax.py b/diffusers/src/diffusers/models/vae_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..5027f4230e3b44875b1a2289be81b8ffbf92afef
--- /dev/null
+++ b/diffusers/src/diffusers/models/vae_flax.py
@@ -0,0 +1,876 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# JAX implementation of VQGAN from taming-transformers https://github.com/CompVis/taming-transformers
+
+import math
+from functools import partial
+from typing import Tuple
+
+import flax
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict
+
+from ..configuration_utils import ConfigMixin, flax_register_to_config
+from ..utils import BaseOutput
+from .modeling_flax_utils import FlaxModelMixin
+
+
+@flax.struct.dataclass
+class FlaxDecoderOutput(BaseOutput):
+    """
+    Output of decoding method.
+
+    Args:
+        sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
+            The decoded output sample from the last layer of the model.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            The `dtype` of the parameters.
+    """
+
+    sample: jnp.ndarray
+
+
+@flax.struct.dataclass
+class FlaxAutoencoderKLOutput(BaseOutput):
+    """
+    Output of AutoencoderKL encoding method.
+
+    Args:
+        latent_dist (`FlaxDiagonalGaussianDistribution`):
+            Encoded outputs of `Encoder` represented as the mean and logvar of `FlaxDiagonalGaussianDistribution`.
+            `FlaxDiagonalGaussianDistribution` allows for sampling latents from the distribution.
+    """
+
+    latent_dist: "FlaxDiagonalGaussianDistribution"
+
+
+class FlaxUpsample2D(nn.Module):
+    """
+    Flax implementation of 2D Upsample layer
+
+    Args:
+        in_channels (`int`):
+            Input channels
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.conv = nn.Conv(
+            self.in_channels,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+    def __call__(self, hidden_states):
+        batch, height, width, channels = hidden_states.shape
+        hidden_states = jax.image.resize(
+            hidden_states,
+            shape=(batch, height * 2, width * 2, channels),
+            method="nearest",
+        )
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class FlaxDownsample2D(nn.Module):
+    """
+    Flax implementation of 2D Downsample layer
+
+    Args:
+        in_channels (`int`):
+            Input channels
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.conv = nn.Conv(
+            self.in_channels,
+            kernel_size=(3, 3),
+            strides=(2, 2),
+            padding="VALID",
+            dtype=self.dtype,
+        )
+
+    def __call__(self, hidden_states):
+        pad = ((0, 0), (0, 1), (0, 1), (0, 0))  # pad height and width dim
+        hidden_states = jnp.pad(hidden_states, pad_width=pad)
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class FlaxResnetBlock2D(nn.Module):
+    """
+    Flax implementation of 2D Resnet Block.
+
+    Args:
+        in_channels (`int`):
+            Input channels
+        out_channels (`int`):
+            Output channels
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        groups (:obj:`int`, *optional*, defaults to `32`):
+            The number of groups to use for group norm.
+        use_nin_shortcut (:obj:`bool`, *optional*, defaults to `None`):
+            Whether to use `nin_shortcut`. This activates a new layer inside ResNet block
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    out_channels: int = None
+    dropout: float = 0.0
+    groups: int = 32
+    use_nin_shortcut: bool = None
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        out_channels = self.in_channels if self.out_channels is None else self.out_channels
+
+        self.norm1 = nn.GroupNorm(num_groups=self.groups, epsilon=1e-6)
+        self.conv1 = nn.Conv(
+            out_channels,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+        self.norm2 = nn.GroupNorm(num_groups=self.groups, epsilon=1e-6)
+        self.dropout_layer = nn.Dropout(self.dropout)
+        self.conv2 = nn.Conv(
+            out_channels,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+        use_nin_shortcut = self.in_channels != out_channels if self.use_nin_shortcut is None else self.use_nin_shortcut
+
+        self.conv_shortcut = None
+        if use_nin_shortcut:
+            self.conv_shortcut = nn.Conv(
+                out_channels,
+                kernel_size=(1, 1),
+                strides=(1, 1),
+                padding="VALID",
+                dtype=self.dtype,
+            )
+
+    def __call__(self, hidden_states, deterministic=True):
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = nn.swish(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = nn.swish(hidden_states)
+        hidden_states = self.dropout_layer(hidden_states, deterministic)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            residual = self.conv_shortcut(residual)
+
+        return hidden_states + residual
+
+
+class FlaxAttentionBlock(nn.Module):
+    r"""
+    Flax Convolutional based multi-head attention block for diffusion-based VAE.
+
+    Parameters:
+        channels (:obj:`int`):
+            Input channels
+        num_head_channels (:obj:`int`, *optional*, defaults to `None`):
+            Number of attention heads
+        num_groups (:obj:`int`, *optional*, defaults to `32`):
+            The number of groups to use for group norm
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+
+    """
+
+    channels: int
+    num_head_channels: int = None
+    num_groups: int = 32
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.num_heads = self.channels // self.num_head_channels if self.num_head_channels is not None else 1
+
+        dense = partial(nn.Dense, self.channels, dtype=self.dtype)
+
+        self.group_norm = nn.GroupNorm(num_groups=self.num_groups, epsilon=1e-6)
+        self.query, self.key, self.value = dense(), dense(), dense()
+        self.proj_attn = dense()
+
+    def transpose_for_scores(self, projection):
+        new_projection_shape = projection.shape[:-1] + (self.num_heads, -1)
+        # move heads to 2nd position (B, T, H * D) -> (B, T, H, D)
+        new_projection = projection.reshape(new_projection_shape)
+        # (B, T, H, D) -> (B, H, T, D)
+        new_projection = jnp.transpose(new_projection, (0, 2, 1, 3))
+        return new_projection
+
+    def __call__(self, hidden_states):
+        residual = hidden_states
+        batch, height, width, channels = hidden_states.shape
+
+        hidden_states = self.group_norm(hidden_states)
+
+        hidden_states = hidden_states.reshape((batch, height * width, channels))
+
+        query = self.query(hidden_states)
+        key = self.key(hidden_states)
+        value = self.value(hidden_states)
+
+        # transpose
+        query = self.transpose_for_scores(query)
+        key = self.transpose_for_scores(key)
+        value = self.transpose_for_scores(value)
+
+        # compute attentions
+        scale = 1 / math.sqrt(math.sqrt(self.channels / self.num_heads))
+        attn_weights = jnp.einsum("...qc,...kc->...qk", query * scale, key * scale)
+        attn_weights = nn.softmax(attn_weights, axis=-1)
+
+        # attend to values
+        hidden_states = jnp.einsum("...kc,...qk->...qc", value, attn_weights)
+
+        hidden_states = jnp.transpose(hidden_states, (0, 2, 1, 3))
+        new_hidden_states_shape = hidden_states.shape[:-2] + (self.channels,)
+        hidden_states = hidden_states.reshape(new_hidden_states_shape)
+
+        hidden_states = self.proj_attn(hidden_states)
+        hidden_states = hidden_states.reshape((batch, height, width, channels))
+        hidden_states = hidden_states + residual
+        return hidden_states
+
+
+class FlaxDownEncoderBlock2D(nn.Module):
+    r"""
+    Flax Resnet blocks-based Encoder block for diffusion-based VAE.
+
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        out_channels (:obj:`int`):
+            Output channels
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of Resnet layer block
+        resnet_groups (:obj:`int`, *optional*, defaults to `32`):
+            The number of groups to use for the Resnet block group norm
+        add_downsample (:obj:`bool`, *optional*, defaults to `True`):
+            Whether to add downsample layer
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    out_channels: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    resnet_groups: int = 32
+    add_downsample: bool = True
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        resnets = []
+        for i in range(self.num_layers):
+            in_channels = self.in_channels if i == 0 else self.out_channels
+
+            res_block = FlaxResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=self.out_channels,
+                dropout=self.dropout,
+                groups=self.resnet_groups,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+        self.resnets = resnets
+
+        if self.add_downsample:
+            self.downsamplers_0 = FlaxDownsample2D(self.out_channels, dtype=self.dtype)
+
+    def __call__(self, hidden_states, deterministic=True):
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, deterministic=deterministic)
+
+        if self.add_downsample:
+            hidden_states = self.downsamplers_0(hidden_states)
+
+        return hidden_states
+
+
+class FlaxUpDecoderBlock2D(nn.Module):
+    r"""
+    Flax Resnet blocks-based Decoder block for diffusion-based VAE.
+
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        out_channels (:obj:`int`):
+            Output channels
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of Resnet layer block
+        resnet_groups (:obj:`int`, *optional*, defaults to `32`):
+            The number of groups to use for the Resnet block group norm
+        add_upsample (:obj:`bool`, *optional*, defaults to `True`):
+            Whether to add upsample layer
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    out_channels: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    resnet_groups: int = 32
+    add_upsample: bool = True
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        resnets = []
+        for i in range(self.num_layers):
+            in_channels = self.in_channels if i == 0 else self.out_channels
+            res_block = FlaxResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=self.out_channels,
+                dropout=self.dropout,
+                groups=self.resnet_groups,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+
+        self.resnets = resnets
+
+        if self.add_upsample:
+            self.upsamplers_0 = FlaxUpsample2D(self.out_channels, dtype=self.dtype)
+
+    def __call__(self, hidden_states, deterministic=True):
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, deterministic=deterministic)
+
+        if self.add_upsample:
+            hidden_states = self.upsamplers_0(hidden_states)
+
+        return hidden_states
+
+
+class FlaxUNetMidBlock2D(nn.Module):
+    r"""
+    Flax Unet Mid-Block module.
+
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of Resnet layer block
+        resnet_groups (:obj:`int`, *optional*, defaults to `32`):
+            The number of groups to use for the Resnet and Attention block group norm
+        num_attention_heads (:obj:`int`, *optional*, defaults to `1`):
+            Number of attention heads for each attention block
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    resnet_groups: int = 32
+    num_attention_heads: int = 1
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        resnet_groups = self.resnet_groups if self.resnet_groups is not None else min(self.in_channels // 4, 32)
+
+        # there is always at least one resnet
+        resnets = [
+            FlaxResnetBlock2D(
+                in_channels=self.in_channels,
+                out_channels=self.in_channels,
+                dropout=self.dropout,
+                groups=resnet_groups,
+                dtype=self.dtype,
+            )
+        ]
+
+        attentions = []
+
+        for _ in range(self.num_layers):
+            attn_block = FlaxAttentionBlock(
+                channels=self.in_channels,
+                num_head_channels=self.num_attention_heads,
+                num_groups=resnet_groups,
+                dtype=self.dtype,
+            )
+            attentions.append(attn_block)
+
+            res_block = FlaxResnetBlock2D(
+                in_channels=self.in_channels,
+                out_channels=self.in_channels,
+                dropout=self.dropout,
+                groups=resnet_groups,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+
+        self.resnets = resnets
+        self.attentions = attentions
+
+    def __call__(self, hidden_states, deterministic=True):
+        hidden_states = self.resnets[0](hidden_states, deterministic=deterministic)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states)
+            hidden_states = resnet(hidden_states, deterministic=deterministic)
+
+        return hidden_states
+
+
+class FlaxEncoder(nn.Module):
+    r"""
+    Flax Implementation of VAE Encoder.
+
+    This model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+    subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to
+    general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        in_channels (:obj:`int`, *optional*, defaults to 3):
+            Input channels
+        out_channels (:obj:`int`, *optional*, defaults to 3):
+            Output channels
+        down_block_types (:obj:`Tuple[str]`, *optional*, defaults to `(DownEncoderBlock2D)`):
+            DownEncoder block type
+        block_out_channels (:obj:`Tuple[str]`, *optional*, defaults to `(64,)`):
+            Tuple containing the number of output channels for each block
+        layers_per_block (:obj:`int`, *optional*, defaults to `2`):
+            Number of Resnet layer for each block
+        norm_num_groups (:obj:`int`, *optional*, defaults to `32`):
+            norm num group
+        act_fn (:obj:`str`, *optional*, defaults to `silu`):
+            Activation function
+        double_z (:obj:`bool`, *optional*, defaults to `False`):
+            Whether to double the last output channels
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+
+    in_channels: int = 3
+    out_channels: int = 3
+    down_block_types: Tuple[str] = ("DownEncoderBlock2D",)
+    block_out_channels: Tuple[int] = (64,)
+    layers_per_block: int = 2
+    norm_num_groups: int = 32
+    act_fn: str = "silu"
+    double_z: bool = False
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        block_out_channels = self.block_out_channels
+        # in
+        self.conv_in = nn.Conv(
+            block_out_channels[0],
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+        # downsampling
+        down_blocks = []
+        output_channel = block_out_channels[0]
+        for i, _ in enumerate(self.down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = FlaxDownEncoderBlock2D(
+                in_channels=input_channel,
+                out_channels=output_channel,
+                num_layers=self.layers_per_block,
+                resnet_groups=self.norm_num_groups,
+                add_downsample=not is_final_block,
+                dtype=self.dtype,
+            )
+            down_blocks.append(down_block)
+        self.down_blocks = down_blocks
+
+        # middle
+        self.mid_block = FlaxUNetMidBlock2D(
+            in_channels=block_out_channels[-1],
+            resnet_groups=self.norm_num_groups,
+            num_attention_heads=None,
+            dtype=self.dtype,
+        )
+
+        # end
+        conv_out_channels = 2 * self.out_channels if self.double_z else self.out_channels
+        self.conv_norm_out = nn.GroupNorm(num_groups=self.norm_num_groups, epsilon=1e-6)
+        self.conv_out = nn.Conv(
+            conv_out_channels,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+    def __call__(self, sample, deterministic: bool = True):
+        # in
+        sample = self.conv_in(sample)
+
+        # downsampling
+        for block in self.down_blocks:
+            sample = block(sample, deterministic=deterministic)
+
+        # middle
+        sample = self.mid_block(sample, deterministic=deterministic)
+
+        # end
+        sample = self.conv_norm_out(sample)
+        sample = nn.swish(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class FlaxDecoder(nn.Module):
+    r"""
+    Flax Implementation of VAE Decoder.
+
+    This model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+    subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to
+    general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        in_channels (:obj:`int`, *optional*, defaults to 3):
+            Input channels
+        out_channels (:obj:`int`, *optional*, defaults to 3):
+            Output channels
+        up_block_types (:obj:`Tuple[str]`, *optional*, defaults to `(UpDecoderBlock2D)`):
+            UpDecoder block type
+        block_out_channels (:obj:`Tuple[str]`, *optional*, defaults to `(64,)`):
+            Tuple containing the number of output channels for each block
+        layers_per_block (:obj:`int`, *optional*, defaults to `2`):
+            Number of Resnet layer for each block
+        norm_num_groups (:obj:`int`, *optional*, defaults to `32`):
+            norm num group
+        act_fn (:obj:`str`, *optional*, defaults to `silu`):
+            Activation function
+        double_z (:obj:`bool`, *optional*, defaults to `False`):
+            Whether to double the last output channels
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            parameters `dtype`
+    """
+
+    in_channels: int = 3
+    out_channels: int = 3
+    up_block_types: Tuple[str] = ("UpDecoderBlock2D",)
+    block_out_channels: int = (64,)
+    layers_per_block: int = 2
+    norm_num_groups: int = 32
+    act_fn: str = "silu"
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        block_out_channels = self.block_out_channels
+
+        # z to block_in
+        self.conv_in = nn.Conv(
+            block_out_channels[-1],
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+        # middle
+        self.mid_block = FlaxUNetMidBlock2D(
+            in_channels=block_out_channels[-1],
+            resnet_groups=self.norm_num_groups,
+            num_attention_heads=None,
+            dtype=self.dtype,
+        )
+
+        # upsampling
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        up_blocks = []
+        for i, _ in enumerate(self.up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = FlaxUpDecoderBlock2D(
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                num_layers=self.layers_per_block + 1,
+                resnet_groups=self.norm_num_groups,
+                add_upsample=not is_final_block,
+                dtype=self.dtype,
+            )
+            up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        self.up_blocks = up_blocks
+
+        # end
+        self.conv_norm_out = nn.GroupNorm(num_groups=self.norm_num_groups, epsilon=1e-6)
+        self.conv_out = nn.Conv(
+            self.out_channels,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+
+    def __call__(self, sample, deterministic: bool = True):
+        # z to block_in
+        sample = self.conv_in(sample)
+
+        # middle
+        sample = self.mid_block(sample, deterministic=deterministic)
+
+        # upsampling
+        for block in self.up_blocks:
+            sample = block(sample, deterministic=deterministic)
+
+        sample = self.conv_norm_out(sample)
+        sample = nn.swish(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class FlaxDiagonalGaussianDistribution(object):
+    def __init__(self, parameters, deterministic=False):
+        # Last axis to account for channels-last
+        self.mean, self.logvar = jnp.split(parameters, 2, axis=-1)
+        self.logvar = jnp.clip(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = jnp.exp(0.5 * self.logvar)
+        self.var = jnp.exp(self.logvar)
+        if self.deterministic:
+            self.var = self.std = jnp.zeros_like(self.mean)
+
+    def sample(self, key):
+        return self.mean + self.std * jax.random.normal(key, self.mean.shape)
+
+    def kl(self, other=None):
+        if self.deterministic:
+            return jnp.array([0.0])
+
+        if other is None:
+            return 0.5 * jnp.sum(self.mean**2 + self.var - 1.0 - self.logvar, axis=[1, 2, 3])
+
+        return 0.5 * jnp.sum(
+            jnp.square(self.mean - other.mean) / other.var + self.var / other.var - 1.0 - self.logvar + other.logvar,
+            axis=[1, 2, 3],
+        )
+
+    def nll(self, sample, axis=[1, 2, 3]):
+        if self.deterministic:
+            return jnp.array([0.0])
+
+        logtwopi = jnp.log(2.0 * jnp.pi)
+        return 0.5 * jnp.sum(logtwopi + self.logvar + jnp.square(sample - self.mean) / self.var, axis=axis)
+
+    def mode(self):
+        return self.mean
+
+
+@flax_register_to_config
+class FlaxAutoencoderKL(nn.Module, FlaxModelMixin, ConfigMixin):
+    r"""
+    Flax implementation of a VAE model with KL loss for decoding latent representations.
+
+    This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for it's generic methods
+    implemented for all models (such as downloading or saving).
+
+    This model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+    subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matter related to its
+    general usage and behavior.
+
+    Inherent JAX features such as the following are supported:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        in_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input image.
+        out_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `(DownEncoderBlock2D)`):
+            Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `(UpDecoderBlock2D)`):
+            Tuple of upsample block types.
+        block_out_channels (`Tuple[str]`, *optional*, defaults to `(64,)`):
+            Tuple of block output channels.
+        layers_per_block (`int`, *optional*, defaults to `2`):
+            Number of ResNet layer for each block.
+        act_fn (`str`, *optional*, defaults to `silu`):
+            The activation function to use.
+        latent_channels (`int`, *optional*, defaults to `4`):
+            Number of channels in the latent space.
+        norm_num_groups (`int`, *optional*, defaults to `32`):
+            The number of groups for normalization.
+        sample_size (`int`, *optional*, defaults to 32):
+            Sample input size.
+        scaling_factor (`float`, *optional*, defaults to 0.18215):
+            The component-wise standard deviation of the trained latent space computed using the first batch of the
+            training set. This is used to scale the latent space to have unit variance when training the diffusion
+            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            The `dtype` of the parameters.
+    """
+
+    in_channels: int = 3
+    out_channels: int = 3
+    down_block_types: Tuple[str] = ("DownEncoderBlock2D",)
+    up_block_types: Tuple[str] = ("UpDecoderBlock2D",)
+    block_out_channels: Tuple[int] = (64,)
+    layers_per_block: int = 1
+    act_fn: str = "silu"
+    latent_channels: int = 4
+    norm_num_groups: int = 32
+    sample_size: int = 32
+    scaling_factor: float = 0.18215
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.encoder = FlaxEncoder(
+            in_channels=self.config.in_channels,
+            out_channels=self.config.latent_channels,
+            down_block_types=self.config.down_block_types,
+            block_out_channels=self.config.block_out_channels,
+            layers_per_block=self.config.layers_per_block,
+            act_fn=self.config.act_fn,
+            norm_num_groups=self.config.norm_num_groups,
+            double_z=True,
+            dtype=self.dtype,
+        )
+        self.decoder = FlaxDecoder(
+            in_channels=self.config.latent_channels,
+            out_channels=self.config.out_channels,
+            up_block_types=self.config.up_block_types,
+            block_out_channels=self.config.block_out_channels,
+            layers_per_block=self.config.layers_per_block,
+            norm_num_groups=self.config.norm_num_groups,
+            act_fn=self.config.act_fn,
+            dtype=self.dtype,
+        )
+        self.quant_conv = nn.Conv(
+            2 * self.config.latent_channels,
+            kernel_size=(1, 1),
+            strides=(1, 1),
+            padding="VALID",
+            dtype=self.dtype,
+        )
+        self.post_quant_conv = nn.Conv(
+            self.config.latent_channels,
+            kernel_size=(1, 1),
+            strides=(1, 1),
+            padding="VALID",
+            dtype=self.dtype,
+        )
+
+    def init_weights(self, rng: jax.Array) -> FrozenDict:
+        # init input tensors
+        sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
+        sample = jnp.zeros(sample_shape, dtype=jnp.float32)
+
+        params_rng, dropout_rng, gaussian_rng = jax.random.split(rng, 3)
+        rngs = {"params": params_rng, "dropout": dropout_rng, "gaussian": gaussian_rng}
+
+        return self.init(rngs, sample)["params"]
+
+    def encode(self, sample, deterministic: bool = True, return_dict: bool = True):
+        sample = jnp.transpose(sample, (0, 2, 3, 1))
+
+        hidden_states = self.encoder(sample, deterministic=deterministic)
+        moments = self.quant_conv(hidden_states)
+        posterior = FlaxDiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return FlaxAutoencoderKLOutput(latent_dist=posterior)
+
+    def decode(self, latents, deterministic: bool = True, return_dict: bool = True):
+        if latents.shape[-1] != self.config.latent_channels:
+            latents = jnp.transpose(latents, (0, 2, 3, 1))
+
+        hidden_states = self.post_quant_conv(latents)
+        hidden_states = self.decoder(hidden_states, deterministic=deterministic)
+
+        hidden_states = jnp.transpose(hidden_states, (0, 3, 1, 2))
+
+        if not return_dict:
+            return (hidden_states,)
+
+        return FlaxDecoderOutput(sample=hidden_states)
+
+    def __call__(self, sample, sample_posterior=False, deterministic: bool = True, return_dict: bool = True):
+        posterior = self.encode(sample, deterministic=deterministic, return_dict=return_dict)
+        if sample_posterior:
+            rng = self.make_rng("gaussian")
+            hidden_states = posterior.latent_dist.sample(rng)
+        else:
+            hidden_states = posterior.latent_dist.mode()
+
+        sample = self.decode(hidden_states, return_dict=return_dict).sample
+
+        if not return_dict:
+            return (sample,)
+
+        return FlaxDecoderOutput(sample=sample)
diff --git a/diffusers/src/diffusers/models/vq_model.py b/diffusers/src/diffusers/models/vq_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..f946e46344764a8d03050bfb8b042d603fd63912
--- /dev/null
+++ b/diffusers/src/diffusers/models/vq_model.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..utils import deprecate
+from .autoencoders.vq_model import VQEncoderOutput, VQModel
+
+
+class VQEncoderOutput(VQEncoderOutput):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `VQEncoderOutput` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQEncoderOutput`, instead."
+        deprecate("VQEncoderOutput", "0.31", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+class VQModel(VQModel):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `VQModel` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQModel`, instead."
+        deprecate("VQModel", "0.31", deprecation_message)
+        super().__init__(*args, **kwargs)
diff --git a/diffusers/src/diffusers/optimization.py b/diffusers/src/diffusers/optimization.py
new file mode 100644
index 0000000000000000000000000000000000000000..f20bd94edffaa9b569d3fb395b2a4e826b96b146
--- /dev/null
+++ b/diffusers/src/diffusers/optimization.py
@@ -0,0 +1,361 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch optimization for diffusion models."""
+
+import math
+from enum import Enum
+from typing import Optional, Union
+
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR
+
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class SchedulerType(Enum):
+    LINEAR = "linear"
+    COSINE = "cosine"
+    COSINE_WITH_RESTARTS = "cosine_with_restarts"
+    POLYNOMIAL = "polynomial"
+    CONSTANT = "constant"
+    CONSTANT_WITH_WARMUP = "constant_with_warmup"
+    PIECEWISE_CONSTANT = "piecewise_constant"
+
+
+def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1) -> LambdaLR:
+    """
+    Create a schedule with a constant learning rate, using the learning rate set in optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
+
+
+def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1) -> LambdaLR:
+    """
+    Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
+    increases linearly between 0 and the initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    def lr_lambda(current_step: int):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1.0, num_warmup_steps))
+        return 1.0
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
+def get_piecewise_constant_schedule(optimizer: Optimizer, step_rules: str, last_epoch: int = -1) -> LambdaLR:
+    """
+    Create a schedule with a constant learning rate, using the learning rate set in optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        step_rules (`string`):
+            The rules for the learning rate. ex: rule_steps="1:10,0.1:20,0.01:30,0.005" it means that the learning rate
+            if multiple 1 for the first 10 steps, multiple 0.1 for the next 20 steps, multiple 0.01 for the next 30
+            steps and multiple 0.005 for the other steps.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    rules_dict = {}
+    rule_list = step_rules.split(",")
+    for rule_str in rule_list[:-1]:
+        value_str, steps_str = rule_str.split(":")
+        steps = int(steps_str)
+        value = float(value_str)
+        rules_dict[steps] = value
+    last_lr_multiple = float(rule_list[-1])
+
+    def create_rules_function(rules_dict, last_lr_multiple):
+        def rule_func(steps: int) -> float:
+            sorted_steps = sorted(rules_dict.keys())
+            for i, sorted_step in enumerate(sorted_steps):
+                if steps < sorted_step:
+                    return rules_dict[sorted_steps[i]]
+            return last_lr_multiple
+
+        return rule_func
+
+    rules_func = create_rules_function(rules_dict, last_lr_multiple)
+
+    return LambdaLR(optimizer, rules_func, last_epoch=last_epoch)
+
+
+def get_linear_schedule_with_warmup(
+    optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, last_epoch: int = -1
+) -> LambdaLR:
+    """
+    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
+    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    def lr_lambda(current_step: int):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        return max(
+            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
+        )
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def get_cosine_schedule_with_warmup(
+    optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1
+) -> LambdaLR:
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
+    initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_periods (`float`, *optional*, defaults to 0.5):
+            The number of periods of the cosine function in a schedule (the default is to just decrease from the max
+            value to 0 following a half-cosine).
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def get_cosine_with_hard_restarts_schedule_with_warmup(
+    optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1
+) -> LambdaLR:
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases
+    linearly between 0 and the initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_cycles (`int`, *optional*, defaults to 1):
+            The number of hard restarts to use.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+        if progress >= 1.0:
+            return 0.0
+        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def get_polynomial_decay_schedule_with_warmup(
+    optimizer: Optimizer,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    lr_end: float = 1e-7,
+    power: float = 1.0,
+    last_epoch: int = -1,
+) -> LambdaLR:
+    """
+    Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
+    optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the
+    initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        lr_end (`float`, *optional*, defaults to 1e-7):
+            The end LR.
+        power (`float`, *optional*, defaults to 1.0):
+            Power factor.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
+    implementation at
+    https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+
+    """
+
+    lr_init = optimizer.defaults["lr"]
+    if not (lr_init > lr_end):
+        raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})")
+
+    def lr_lambda(current_step: int):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        elif current_step > num_training_steps:
+            return lr_end / lr_init  # as LambdaLR multiplies by lr_init
+        else:
+            lr_range = lr_init - lr_end
+            decay_steps = num_training_steps - num_warmup_steps
+            pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
+            decay = lr_range * pct_remaining**power + lr_end
+            return decay / lr_init  # as LambdaLR multiplies by lr_init
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+TYPE_TO_SCHEDULER_FUNCTION = {
+    SchedulerType.LINEAR: get_linear_schedule_with_warmup,
+    SchedulerType.COSINE: get_cosine_schedule_with_warmup,
+    SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
+    SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
+    SchedulerType.CONSTANT: get_constant_schedule,
+    SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
+    SchedulerType.PIECEWISE_CONSTANT: get_piecewise_constant_schedule,
+}
+
+
+def get_scheduler(
+    name: Union[str, SchedulerType],
+    optimizer: Optimizer,
+    step_rules: Optional[str] = None,
+    num_warmup_steps: Optional[int] = None,
+    num_training_steps: Optional[int] = None,
+    num_cycles: int = 1,
+    power: float = 1.0,
+    last_epoch: int = -1,
+) -> LambdaLR:
+    """
+    Unified API to get any scheduler from its name.
+
+    Args:
+        name (`str` or `SchedulerType`):
+            The name of the scheduler to use.
+        optimizer (`torch.optim.Optimizer`):
+            The optimizer that will be used during training.
+        step_rules (`str`, *optional*):
+            A string representing the step rules to use. This is only used by the `PIECEWISE_CONSTANT` scheduler.
+        num_warmup_steps (`int`, *optional*):
+            The number of warmup steps to do. This is not required by all schedulers (hence the argument being
+            optional), the function will raise an error if it's unset and the scheduler type requires it.
+        num_training_steps (`int``, *optional*):
+            The number of training steps to do. This is not required by all schedulers (hence the argument being
+            optional), the function will raise an error if it's unset and the scheduler type requires it.
+        num_cycles (`int`, *optional*):
+            The number of hard restarts used in `COSINE_WITH_RESTARTS` scheduler.
+        power (`float`, *optional*, defaults to 1.0):
+            Power factor. See `POLYNOMIAL` scheduler
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    """
+    name = SchedulerType(name)
+    schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
+    if name == SchedulerType.CONSTANT:
+        return schedule_func(optimizer, last_epoch=last_epoch)
+
+    if name == SchedulerType.PIECEWISE_CONSTANT:
+        return schedule_func(optimizer, step_rules=step_rules, last_epoch=last_epoch)
+
+    # All other schedulers require `num_warmup_steps`
+    if num_warmup_steps is None:
+        raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
+
+    if name == SchedulerType.CONSTANT_WITH_WARMUP:
+        return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, last_epoch=last_epoch)
+
+    # All other schedulers require `num_training_steps`
+    if num_training_steps is None:
+        raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
+
+    if name == SchedulerType.COSINE_WITH_RESTARTS:
+        return schedule_func(
+            optimizer,
+            num_warmup_steps=num_warmup_steps,
+            num_training_steps=num_training_steps,
+            num_cycles=num_cycles,
+            last_epoch=last_epoch,
+        )
+
+    if name == SchedulerType.POLYNOMIAL:
+        return schedule_func(
+            optimizer,
+            num_warmup_steps=num_warmup_steps,
+            num_training_steps=num_training_steps,
+            power=power,
+            last_epoch=last_epoch,
+        )
+
+    return schedule_func(
+        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, last_epoch=last_epoch
+    )
diff --git a/diffusers/src/diffusers/pipelines/README.md b/diffusers/src/diffusers/pipelines/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2954c07438b806a1e7830cb7f8db8c067a93dac
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/README.md
@@ -0,0 +1,171 @@
+# 🧨 Diffusers Pipelines
+
+Pipelines provide a simple way to run state-of-the-art diffusion models in inference.
+Most diffusion systems consist of multiple independently-trained models and highly adaptable scheduler
+components - all of which are needed to have a functioning end-to-end diffusion system.
+
+As an example, [Stable Diffusion](https://huggingface.co/blog/stable_diffusion) has three independently trained models:
+- [Autoencoder](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/models/vae.py#L392)
+- [Conditional Unet](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/models/unet_2d_condition.py#L12)
+- [CLIP text encoder](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPTextModel)
+- a scheduler component, [scheduler](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_pndm.py),
+- a [CLIPImageProcessor](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPImageProcessor),
+- as well as a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py).
+All of these components are necessary to run stable diffusion in inference even though they were trained
+or created independently from each other.
+
+To that end, we strive to offer all open-sourced, state-of-the-art diffusion system under a unified API.
+More specifically, we strive to provide pipelines that
+- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LDMTextToImagePipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
+- 2. have a simple user interface to run the model in inference (see the [Pipelines API](#pipelines-api) section),
+- 3. are easy to understand with code that is self-explanatory and can be read along-side the official paper (see [Pipelines summary](#pipelines-summary)),
+- 4. can easily be contributed by the community (see the [Contribution](#contribution) section).
+
+**Note** that pipelines do not (and should not) offer any training functionality.
+If you are looking for *official* training examples, please have a look at [examples](https://github.com/huggingface/diffusers/tree/main/examples).
+
+
+## Pipelines Summary
+
+The following table summarizes all officially supported pipelines, their corresponding paper, and if
+available a colab notebook to directly try them out.
+
+| Pipeline                                                                                                                      | Source                                                                                                                       | Tasks | Colab
+|-------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|:---:|:---:|
+| [dance diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dance_diffusion)                 | [**Dance Diffusion**](https://github.com/Harmonai-org/sample-generator)                                                      | *Unconditional Audio Generation* |
+| [ddpm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddpm)                                       | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239)                                             | *Unconditional Image Generation* |
+| [ddim](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddim)                                       | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502)                                                  | *Unconditional Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion)               | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)                         | *Text-to-Image Generation* |
+| [latent_diffusion_uncond](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)                         | *Unconditional Image Generation* |
+| [pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pndm)                                       | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778)                           | *Unconditional Image Generation* |
+| [score_sde_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_ve)                       | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* |
+| [score_sde_vp](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_vp)                       | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* |
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb)
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stochastic_karras_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stochastic_karras_ve)       | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364)                    | *Unconditional Image Generation* |
+
+**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers.
+However, most of them can be adapted to use different scheduler components or even different model components. Some pipeline examples are shown in the [Examples](#examples) below.
+
+## Pipelines API
+
+Diffusion models often consist of multiple independently-trained models or other previously existing components.
+
+
+Each model has been trained independently on a different task and the scheduler can easily be swapped out and replaced with a different one.
+During inference, we however want to be able to easily load all components and use them in inference - even if one component, *e.g.* CLIP's text encoder, originates from a different library, such as [Transformers](https://github.com/huggingface/transformers). To that end, all pipelines provide the following functionality:
+
+- [`from_pretrained` method](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L139) that accepts a Hugging Face Hub repository id, *e.g.* [stable-diffusion-v1-5/stable-diffusion-v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) or a path to a local directory, *e.g.*
+"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [stable-diffusion-v1-5/stable-diffusion-v1-5/model_index.json](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/model_index.json), which defines all components that should be
+loaded into the pipelines. More specifically, for each model/component one needs to define the format `<name>: ["<library>", "<class name>"]`. `<name>` is the attribute name given to the loaded instance of `<class name>` which can be found in the library or pipeline folder called `"<library>"`.
+- [`save_pretrained`](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L90) that accepts a local path, *e.g.* `./stable-diffusion` under which all models/components of the pipeline will be saved. For each component/model a folder is created inside the local path that is named after the given attribute name, *e.g.* `./stable_diffusion/unet`.
+In addition, a `model_index.json` file is created at the root of the local path, *e.g.* `./stable_diffusion/model_index.json` so that the complete pipeline can again be instantiated
+from the local path.
+- [`to`](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L118) which accepts a `string` or `torch.device` to move all models that are of type `torch.nn.Module` to the passed device. The behavior is fully analogous to [PyTorch's `to` method](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to).
+- [`__call__`] method to use the pipeline in inference. `__call__` defines inference logic of the pipeline and should ideally encompass all aspects of it, from pre-processing to forwarding tensors to the different models and schedulers, as well as post-processing. The API of the `__call__` method can strongly vary from pipeline to pipeline. *E.g.* a text-to-image pipeline, such as [`StableDiffusionPipeline`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) should accept among other things the text prompt to generate the image. A pure image generation pipeline, such as [DDPMPipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/ddpm) on the other hand can be run without providing any inputs. To better understand what inputs can be adapted for
+each pipeline, one should look directly into the respective pipeline.
+
+**Note**: All pipelines have PyTorch's autograd disabled by decorating the `__call__` method with a [`torch.no_grad`](https://pytorch.org/docs/stable/generated/torch.no_grad.html) decorator because pipelines should
+not be used for training. If you want to store the gradients during the forward pass, we recommend writing your own pipeline, see also our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community)
+
+## Contribution
+
+We are more than happy about any contribution to the officially supported pipelines 🤗. We aspire
+all of our pipelines to be  **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**.
+
+- **Self-contained**: A pipeline shall be as self-contained as possible. More specifically, this means that all functionality should be either directly defined in the pipeline file itself, should be inherited from (and only from) the [`DiffusionPipeline` class](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L56) or be directly attached to the model and scheduler components of the pipeline.
+- **Easy-to-use**: Pipelines should be extremely easy to use - one should be able to load the pipeline and
+use it for its designated task, *e.g.* text-to-image generation, in just a couple of lines of code. Most
+logic including pre-processing, an unrolled diffusion loop, and post-processing should all happen inside the `__call__` method.
+- **Easy-to-tweak**: Certain pipelines will not be able to handle all use cases and tasks that you might like them to. If you want to use a certain pipeline for a specific use case that is not yet supported, you might have to copy the pipeline file and tweak the code to your needs. We try to make the pipeline code as readable as possible so that each part –from pre-processing to diffusing to post-processing– can easily be adapted. If you would like the community to benefit from your customized pipeline, we would love to see a contribution to our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community). If you feel that an important pipeline should be part of the official pipelines but isn't, a contribution to the [official pipelines](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines) would be even better.
+- **One-purpose-only**: Pipelines should be used for one task and one task only. Even if two tasks are very similar from a modeling point of view, *e.g.* image2image translation and in-painting, pipelines shall be used for one task only to keep them *easy-to-tweak* and *readable*.
+
+## Examples
+
+### Text-to-Image generation with Stable Diffusion
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
+
+pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+
+image.save("astronaut_rides_horse.png")
+```
+
+### Image-to-Image text-guided generation with Stable Diffusion
+
+The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
+
+```python
+import requests
+from PIL import Image
+from io import BytesIO
+
+from diffusers import StableDiffusionImg2ImgPipeline
+
+# load the pipeline
+device = "cuda"
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+).to(device)
+
+# let's download an initial image
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((768, 512))
+
+prompt = "A fantasy landscape, trending on artstation"
+
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+
+images[0].save("fantasy_landscape.png")
+```
+You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+
+### Tweak prompts reusing seeds and latents
+
+You can generate your own latents to reproduce results, or tweak your prompt on a specific result you liked. [This notebook](https://github.com/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) shows how to do it step by step. You can also run it in Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb).
+
+
+### In-painting using Stable Diffusion
+
+The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and text prompt.
+
+```python
+import PIL
+import requests
+import torch
+from io import BytesIO
+
+from diffusers import StableDiffusionInpaintPipeline
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+pipe = StableDiffusionInpaintPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+```
+
+You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
diff --git a/diffusers/src/diffusers/pipelines/__init__.py b/diffusers/src/diffusers/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..754ae328ed8ba1d3a5a8f924160b9ecca51c1f5a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/__init__.py
@@ -0,0 +1,738 @@
+from typing import TYPE_CHECKING
+
+from ..utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_k_diffusion_available,
+    is_librosa_available,
+    is_note_seq_available,
+    is_onnx_available,
+    is_sentencepiece_available,
+    is_torch_available,
+    is_torch_npu_available,
+    is_transformers_available,
+)
+
+
+# These modules contain pipelines from multiple libraries/frameworks
+_dummy_objects = {}
+_import_structure = {
+    "controlnet": [],
+    "controlnet_hunyuandit": [],
+    "controlnet_sd3": [],
+    "controlnet_xs": [],
+    "deprecated": [],
+    "latent_diffusion": [],
+    "ledits_pp": [],
+    "marigold": [],
+    "pag": [],
+    "stable_diffusion": [],
+    "stable_diffusion_xl": [],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_pt_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_pt_objects))
+else:
+    _import_structure["auto_pipeline"] = [
+        "AutoPipelineForImage2Image",
+        "AutoPipelineForInpainting",
+        "AutoPipelineForText2Image",
+    ]
+    _import_structure["consistency_models"] = ["ConsistencyModelPipeline"]
+    _import_structure["dance_diffusion"] = ["DanceDiffusionPipeline"]
+    _import_structure["ddim"] = ["DDIMPipeline"]
+    _import_structure["ddpm"] = ["DDPMPipeline"]
+    _import_structure["dit"] = ["DiTPipeline"]
+    _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"])
+    _import_structure["pipeline_utils"] = [
+        "AudioPipelineOutput",
+        "DiffusionPipeline",
+        "StableDiffusionMixin",
+        "ImagePipelineOutput",
+    ]
+    _import_structure["deprecated"].extend(
+        [
+            "PNDMPipeline",
+            "LDMPipeline",
+            "RePaintPipeline",
+            "ScoreSdeVePipeline",
+            "KarrasVePipeline",
+        ]
+    )
+try:
+    if not (is_torch_available() and is_librosa_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_torch_and_librosa_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects))
+else:
+    _import_structure["deprecated"].extend(["AudioDiffusionPipeline", "Mel"])
+
+try:
+    if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_transformers_and_torch_and_note_seq_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects))
+else:
+    _import_structure["deprecated"].extend(
+        [
+            "MidiProcessor",
+            "SpectrogramDiffusionPipeline",
+        ]
+    )
+
+try:
+    if not (is_torch_available() and is_transformers_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["deprecated"].extend(
+        [
+            "VQDiffusionPipeline",
+            "AltDiffusionPipeline",
+            "AltDiffusionImg2ImgPipeline",
+            "CycleDiffusionPipeline",
+            "StableDiffusionInpaintPipelineLegacy",
+            "StableDiffusionPix2PixZeroPipeline",
+            "StableDiffusionParadigmsPipeline",
+            "StableDiffusionModelEditingPipeline",
+            "VersatileDiffusionDualGuidedPipeline",
+            "VersatileDiffusionImageVariationPipeline",
+            "VersatileDiffusionPipeline",
+            "VersatileDiffusionTextToImagePipeline",
+        ]
+    )
+    _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"]
+    _import_structure["animatediff"] = [
+        "AnimateDiffPipeline",
+        "AnimateDiffControlNetPipeline",
+        "AnimateDiffSDXLPipeline",
+        "AnimateDiffSparseControlNetPipeline",
+        "AnimateDiffVideoToVideoPipeline",
+        "AnimateDiffVideoToVideoControlNetPipeline",
+    ]
+    _import_structure["flux"] = [
+        "FluxControlNetPipeline",
+        "FluxControlNetImg2ImgPipeline",
+        "FluxControlNetInpaintPipeline",
+        "FluxImg2ImgPipeline",
+        "FluxInpaintPipeline",
+        "FluxPipeline",
+    ]
+    _import_structure["audioldm"] = ["AudioLDMPipeline"]
+    _import_structure["audioldm2"] = [
+        "AudioLDM2Pipeline",
+        "AudioLDM2ProjectionModel",
+        "AudioLDM2UNet2DConditionModel",
+    ]
+    _import_structure["blip_diffusion"] = ["BlipDiffusionPipeline"]
+    _import_structure["cogvideo"] = [
+        "CogVideoXPipeline",
+        "CogVideoXImageToVideoPipeline",
+        "CogVideoXVideoToVideoPipeline",
+    ]
+    _import_structure["controlnet"].extend(
+        [
+            "BlipDiffusionControlNetPipeline",
+            "StableDiffusionControlNetImg2ImgPipeline",
+            "StableDiffusionControlNetInpaintPipeline",
+            "StableDiffusionControlNetPipeline",
+            "StableDiffusionXLControlNetImg2ImgPipeline",
+            "StableDiffusionXLControlNetInpaintPipeline",
+            "StableDiffusionXLControlNetPipeline",
+        ]
+    )
+    _import_structure["pag"].extend(
+        [
+            "AnimateDiffPAGPipeline",
+            "KolorsPAGPipeline",
+            "HunyuanDiTPAGPipeline",
+            "StableDiffusion3PAGPipeline",
+            "StableDiffusionPAGPipeline",
+            "StableDiffusionControlNetPAGPipeline",
+            "StableDiffusionXLPAGPipeline",
+            "StableDiffusionXLPAGInpaintPipeline",
+            "StableDiffusionXLControlNetPAGImg2ImgPipeline",
+            "StableDiffusionXLControlNetPAGPipeline",
+            "StableDiffusionXLPAGImg2ImgPipeline",
+            "PixArtSigmaPAGPipeline",
+        ]
+    )
+    _import_structure["controlnet_xs"].extend(
+        [
+            "StableDiffusionControlNetXSPipeline",
+            "StableDiffusionXLControlNetXSPipeline",
+        ]
+    )
+    _import_structure["controlnet_hunyuandit"].extend(
+        [
+            "HunyuanDiTControlNetPipeline",
+        ]
+    )
+    _import_structure["controlnet_sd3"].extend(
+        [
+            "StableDiffusion3ControlNetPipeline",
+            "StableDiffusion3ControlNetInpaintingPipeline",
+        ]
+    )
+    _import_structure["deepfloyd_if"] = [
+        "IFImg2ImgPipeline",
+        "IFImg2ImgSuperResolutionPipeline",
+        "IFInpaintingPipeline",
+        "IFInpaintingSuperResolutionPipeline",
+        "IFPipeline",
+        "IFSuperResolutionPipeline",
+    ]
+    _import_structure["hunyuandit"] = ["HunyuanDiTPipeline"]
+    _import_structure["kandinsky"] = [
+        "KandinskyCombinedPipeline",
+        "KandinskyImg2ImgCombinedPipeline",
+        "KandinskyImg2ImgPipeline",
+        "KandinskyInpaintCombinedPipeline",
+        "KandinskyInpaintPipeline",
+        "KandinskyPipeline",
+        "KandinskyPriorPipeline",
+    ]
+    _import_structure["kandinsky2_2"] = [
+        "KandinskyV22CombinedPipeline",
+        "KandinskyV22ControlnetImg2ImgPipeline",
+        "KandinskyV22ControlnetPipeline",
+        "KandinskyV22Img2ImgCombinedPipeline",
+        "KandinskyV22Img2ImgPipeline",
+        "KandinskyV22InpaintCombinedPipeline",
+        "KandinskyV22InpaintPipeline",
+        "KandinskyV22Pipeline",
+        "KandinskyV22PriorEmb2EmbPipeline",
+        "KandinskyV22PriorPipeline",
+    ]
+    _import_structure["kandinsky3"] = [
+        "Kandinsky3Img2ImgPipeline",
+        "Kandinsky3Pipeline",
+    ]
+    _import_structure["latent_consistency_models"] = [
+        "LatentConsistencyModelImg2ImgPipeline",
+        "LatentConsistencyModelPipeline",
+    ]
+    _import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"])
+    _import_structure["ledits_pp"].extend(
+        [
+            "LEditsPPPipelineStableDiffusion",
+            "LEditsPPPipelineStableDiffusionXL",
+        ]
+    )
+    _import_structure["latte"] = ["LattePipeline"]
+    _import_structure["lumina"] = ["LuminaText2ImgPipeline"]
+    _import_structure["marigold"].extend(
+        [
+            "MarigoldDepthPipeline",
+            "MarigoldNormalsPipeline",
+        ]
+    )
+    _import_structure["musicldm"] = ["MusicLDMPipeline"]
+    _import_structure["paint_by_example"] = ["PaintByExamplePipeline"]
+    _import_structure["pia"] = ["PIAPipeline"]
+    _import_structure["pixart_alpha"] = ["PixArtAlphaPipeline", "PixArtSigmaPipeline"]
+    _import_structure["colorflow"] = ["ColorFlowPixArtAlphaPipeline", "ColorFlowSDPipeline"]
+    _import_structure["cobra"] = ["CobraPixArtAlphaPipeline"]
+    _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"]
+    _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"]
+    _import_structure["stable_audio"] = [
+        "StableAudioProjectionModel",
+        "StableAudioPipeline",
+    ]
+    _import_structure["stable_cascade"] = [
+        "StableCascadeCombinedPipeline",
+        "StableCascadeDecoderPipeline",
+        "StableCascadePriorPipeline",
+    ]
+    _import_structure["stable_diffusion"].extend(
+        [
+            "CLIPImageProjection",
+            "StableDiffusionDepth2ImgPipeline",
+            "StableDiffusionImageVariationPipeline",
+            "StableDiffusionImg2ImgPipeline",
+            "StableDiffusionInpaintPipeline",
+            "StableDiffusionInstructPix2PixPipeline",
+            "StableDiffusionLatentUpscalePipeline",
+            "StableDiffusionPipeline",
+            "StableDiffusionUpscalePipeline",
+            "StableUnCLIPImg2ImgPipeline",
+            "StableUnCLIPPipeline",
+            "StableDiffusionLDM3DPipeline",
+        ]
+    )
+    _import_structure["aura_flow"] = ["AuraFlowPipeline"]
+    _import_structure["stable_diffusion_3"] = [
+        "StableDiffusion3Pipeline",
+        "StableDiffusion3Img2ImgPipeline",
+        "StableDiffusion3InpaintPipeline",
+    ]
+    _import_structure["stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
+    _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"]
+    _import_structure["stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"]
+    _import_structure["stable_diffusion_gligen"] = [
+        "StableDiffusionGLIGENPipeline",
+        "StableDiffusionGLIGENTextImagePipeline",
+    ]
+    _import_structure["stable_video_diffusion"] = ["StableVideoDiffusionPipeline"]
+    _import_structure["stable_diffusion_xl"].extend(
+        [
+            "StableDiffusionXLImg2ImgPipeline",
+            "StableDiffusionXLInpaintPipeline",
+            "StableDiffusionXLInstructPix2PixPipeline",
+            "StableDiffusionXLPipeline",
+        ]
+    )
+    _import_structure["stable_diffusion_diffedit"] = ["StableDiffusionDiffEditPipeline"]
+    _import_structure["stable_diffusion_ldm3d"] = ["StableDiffusionLDM3DPipeline"]
+    _import_structure["stable_diffusion_panorama"] = ["StableDiffusionPanoramaPipeline"]
+    _import_structure["t2i_adapter"] = [
+        "StableDiffusionAdapterPipeline",
+        "StableDiffusionXLAdapterPipeline",
+    ]
+    _import_structure["text_to_video_synthesis"] = [
+        "TextToVideoSDPipeline",
+        "TextToVideoZeroPipeline",
+        "TextToVideoZeroSDXLPipeline",
+        "VideoToVideoSDPipeline",
+    ]
+    _import_structure["i2vgen_xl"] = ["I2VGenXLPipeline"]
+    _import_structure["unclip"] = ["UnCLIPImageVariationPipeline", "UnCLIPPipeline"]
+    _import_structure["unidiffuser"] = [
+        "ImageTextPipelineOutput",
+        "UniDiffuserModel",
+        "UniDiffuserPipeline",
+        "UniDiffuserTextDecoder",
+    ]
+    _import_structure["wuerstchen"] = [
+        "WuerstchenCombinedPipeline",
+        "WuerstchenDecoderPipeline",
+        "WuerstchenPriorPipeline",
+    ]
+try:
+    if not is_onnx_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_onnx_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_onnx_objects))
+else:
+    _import_structure["onnx_utils"] = ["OnnxRuntimeModel"]
+try:
+    if not (is_torch_available() and is_transformers_available() and is_onnx_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_torch_and_transformers_and_onnx_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_onnx_objects))
+else:
+    _import_structure["stable_diffusion"].extend(
+        [
+            "OnnxStableDiffusionImg2ImgPipeline",
+            "OnnxStableDiffusionInpaintPipeline",
+            "OnnxStableDiffusionPipeline",
+            "OnnxStableDiffusionUpscalePipeline",
+            "StableDiffusionOnnxPipeline",
+        ]
+    )
+
+try:
+    if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import (
+        dummy_torch_and_transformers_and_k_diffusion_objects,
+    )
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects))
+else:
+    _import_structure["stable_diffusion_k_diffusion"] = [
+        "StableDiffusionKDiffusionPipeline",
+        "StableDiffusionXLKDiffusionPipeline",
+    ]
+
+try:
+    if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import (
+        dummy_torch_and_transformers_and_sentencepiece_objects,
+    )
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_sentencepiece_objects))
+else:
+    _import_structure["kolors"] = [
+        "KolorsPipeline",
+        "KolorsImg2ImgPipeline",
+    ]
+
+try:
+    if not is_flax_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_flax_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_flax_objects))
+else:
+    _import_structure["pipeline_flax_utils"] = ["FlaxDiffusionPipeline"]
+try:
+    if not (is_flax_available() and is_transformers_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_flax_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects))
+else:
+    _import_structure["controlnet"].extend(["FlaxStableDiffusionControlNetPipeline"])
+    _import_structure["stable_diffusion"].extend(
+        [
+            "FlaxStableDiffusionImg2ImgPipeline",
+            "FlaxStableDiffusionInpaintPipeline",
+            "FlaxStableDiffusionPipeline",
+        ]
+    )
+    _import_structure["stable_diffusion_xl"].extend(
+        [
+            "FlaxStableDiffusionXLPipeline",
+        ]
+    )
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ..utils.dummy_pt_objects import *  # noqa F403
+
+    else:
+        from .auto_pipeline import (
+            AutoPipelineForImage2Image,
+            AutoPipelineForInpainting,
+            AutoPipelineForText2Image,
+        )
+        from .consistency_models import ConsistencyModelPipeline
+        from .dance_diffusion import DanceDiffusionPipeline
+        from .ddim import DDIMPipeline
+        from .ddpm import DDPMPipeline
+        from .deprecated import KarrasVePipeline, LDMPipeline, PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline
+        from .dit import DiTPipeline
+        from .latent_diffusion import LDMSuperResolutionPipeline
+        from .pipeline_utils import (
+            AudioPipelineOutput,
+            DiffusionPipeline,
+            ImagePipelineOutput,
+            StableDiffusionMixin,
+        )
+
+    try:
+        if not (is_torch_available() and is_librosa_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ..utils.dummy_torch_and_librosa_objects import *
+    else:
+        from .deprecated import AudioDiffusionPipeline, Mel
+
+    try:
+        if not (is_torch_available() and is_transformers_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ..utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline
+        from .animatediff import (
+            AnimateDiffControlNetPipeline,
+            AnimateDiffPipeline,
+            AnimateDiffSDXLPipeline,
+            AnimateDiffSparseControlNetPipeline,
+            AnimateDiffVideoToVideoControlNetPipeline,
+            AnimateDiffVideoToVideoPipeline,
+        )
+        from .audioldm import AudioLDMPipeline
+        from .audioldm2 import (
+            AudioLDM2Pipeline,
+            AudioLDM2ProjectionModel,
+            AudioLDM2UNet2DConditionModel,
+        )
+        from .aura_flow import AuraFlowPipeline
+        from .blip_diffusion import BlipDiffusionPipeline
+        from .cogvideo import CogVideoXImageToVideoPipeline, CogVideoXPipeline, CogVideoXVideoToVideoPipeline
+        from .controlnet import (
+            BlipDiffusionControlNetPipeline,
+            StableDiffusionControlNetImg2ImgPipeline,
+            StableDiffusionControlNetInpaintPipeline,
+            StableDiffusionControlNetPipeline,
+            StableDiffusionXLControlNetImg2ImgPipeline,
+            StableDiffusionXLControlNetInpaintPipeline,
+            StableDiffusionXLControlNetPipeline,
+        )
+        from .controlnet_hunyuandit import (
+            HunyuanDiTControlNetPipeline,
+        )
+        from .controlnet_sd3 import StableDiffusion3ControlNetInpaintingPipeline, StableDiffusion3ControlNetPipeline
+        from .controlnet_xs import (
+            StableDiffusionControlNetXSPipeline,
+            StableDiffusionXLControlNetXSPipeline,
+        )
+        from .deepfloyd_if import (
+            IFImg2ImgPipeline,
+            IFImg2ImgSuperResolutionPipeline,
+            IFInpaintingPipeline,
+            IFInpaintingSuperResolutionPipeline,
+            IFPipeline,
+            IFSuperResolutionPipeline,
+        )
+        from .deprecated import (
+            AltDiffusionImg2ImgPipeline,
+            AltDiffusionPipeline,
+            CycleDiffusionPipeline,
+            StableDiffusionInpaintPipelineLegacy,
+            StableDiffusionModelEditingPipeline,
+            StableDiffusionParadigmsPipeline,
+            StableDiffusionPix2PixZeroPipeline,
+            VersatileDiffusionDualGuidedPipeline,
+            VersatileDiffusionImageVariationPipeline,
+            VersatileDiffusionPipeline,
+            VersatileDiffusionTextToImagePipeline,
+            VQDiffusionPipeline,
+        )
+        from .flux import (
+            FluxControlNetImg2ImgPipeline,
+            FluxControlNetInpaintPipeline,
+            FluxControlNetPipeline,
+            FluxImg2ImgPipeline,
+            FluxInpaintPipeline,
+            FluxPipeline,
+        )
+        from .hunyuandit import HunyuanDiTPipeline
+        from .i2vgen_xl import I2VGenXLPipeline
+        from .kandinsky import (
+            KandinskyCombinedPipeline,
+            KandinskyImg2ImgCombinedPipeline,
+            KandinskyImg2ImgPipeline,
+            KandinskyInpaintCombinedPipeline,
+            KandinskyInpaintPipeline,
+            KandinskyPipeline,
+            KandinskyPriorPipeline,
+        )
+        from .kandinsky2_2 import (
+            KandinskyV22CombinedPipeline,
+            KandinskyV22ControlnetImg2ImgPipeline,
+            KandinskyV22ControlnetPipeline,
+            KandinskyV22Img2ImgCombinedPipeline,
+            KandinskyV22Img2ImgPipeline,
+            KandinskyV22InpaintCombinedPipeline,
+            KandinskyV22InpaintPipeline,
+            KandinskyV22Pipeline,
+            KandinskyV22PriorEmb2EmbPipeline,
+            KandinskyV22PriorPipeline,
+        )
+        from .kandinsky3 import (
+            Kandinsky3Img2ImgPipeline,
+            Kandinsky3Pipeline,
+        )
+        from .latent_consistency_models import (
+            LatentConsistencyModelImg2ImgPipeline,
+            LatentConsistencyModelPipeline,
+        )
+        from .latent_diffusion import LDMTextToImagePipeline
+        from .latte import LattePipeline
+        from .ledits_pp import (
+            LEditsPPDiffusionPipelineOutput,
+            LEditsPPInversionPipelineOutput,
+            LEditsPPPipelineStableDiffusion,
+            LEditsPPPipelineStableDiffusionXL,
+        )
+        from .lumina import LuminaText2ImgPipeline
+        from .marigold import (
+            MarigoldDepthPipeline,
+            MarigoldNormalsPipeline,
+        )
+        from .musicldm import MusicLDMPipeline
+        from .pag import (
+            AnimateDiffPAGPipeline,
+            HunyuanDiTPAGPipeline,
+            KolorsPAGPipeline,
+            PixArtSigmaPAGPipeline,
+            StableDiffusion3PAGPipeline,
+            StableDiffusionControlNetPAGPipeline,
+            StableDiffusionPAGPipeline,
+            StableDiffusionXLControlNetPAGImg2ImgPipeline,
+            StableDiffusionXLControlNetPAGPipeline,
+            StableDiffusionXLPAGImg2ImgPipeline,
+            StableDiffusionXLPAGInpaintPipeline,
+            StableDiffusionXLPAGPipeline,
+        )
+        from .paint_by_example import PaintByExamplePipeline
+        from .pia import PIAPipeline
+        from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
+        from .colorflow import ColorFlowPixArtAlphaPipeline, ColorFlowSDPipeline
+        from .cobra import CobraPixArtAlphaPipeline
+        from .semantic_stable_diffusion import SemanticStableDiffusionPipeline
+        from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
+        from .stable_audio import StableAudioPipeline, StableAudioProjectionModel
+        from .stable_cascade import (
+            StableCascadeCombinedPipeline,
+            StableCascadeDecoderPipeline,
+            StableCascadePriorPipeline,
+        )
+        from .stable_diffusion import (
+            CLIPImageProjection,
+            StableDiffusionDepth2ImgPipeline,
+            StableDiffusionImageVariationPipeline,
+            StableDiffusionImg2ImgPipeline,
+            StableDiffusionInpaintPipeline,
+            StableDiffusionInstructPix2PixPipeline,
+            StableDiffusionLatentUpscalePipeline,
+            StableDiffusionPipeline,
+            StableDiffusionUpscalePipeline,
+            StableUnCLIPImg2ImgPipeline,
+            StableUnCLIPPipeline,
+        )
+        from .stable_diffusion_3 import (
+            StableDiffusion3Img2ImgPipeline,
+            StableDiffusion3InpaintPipeline,
+            StableDiffusion3Pipeline,
+        )
+        from .stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
+        from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
+        from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline
+        from .stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline
+        from .stable_diffusion_panorama import StableDiffusionPanoramaPipeline
+        from .stable_diffusion_safe import StableDiffusionPipelineSafe
+        from .stable_diffusion_sag import StableDiffusionSAGPipeline
+        from .stable_diffusion_xl import (
+            StableDiffusionXLImg2ImgPipeline,
+            StableDiffusionXLInpaintPipeline,
+            StableDiffusionXLInstructPix2PixPipeline,
+            StableDiffusionXLPipeline,
+        )
+        from .stable_video_diffusion import StableVideoDiffusionPipeline
+        from .t2i_adapter import (
+            StableDiffusionAdapterPipeline,
+            StableDiffusionXLAdapterPipeline,
+        )
+        from .text_to_video_synthesis import (
+            TextToVideoSDPipeline,
+            TextToVideoZeroPipeline,
+            TextToVideoZeroSDXLPipeline,
+            VideoToVideoSDPipeline,
+        )
+        from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline
+        from .unidiffuser import (
+            ImageTextPipelineOutput,
+            UniDiffuserModel,
+            UniDiffuserPipeline,
+            UniDiffuserTextDecoder,
+        )
+        from .wuerstchen import (
+            WuerstchenCombinedPipeline,
+            WuerstchenDecoderPipeline,
+            WuerstchenPriorPipeline,
+        )
+
+        try:
+            if not is_onnx_available():
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_onnx_objects import *  # noqa F403
+
+        else:
+            from .onnx_utils import OnnxRuntimeModel
+
+        try:
+            if not (is_torch_available() and is_transformers_available() and is_onnx_available()):
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_torch_and_transformers_and_onnx_objects import *
+        else:
+            from .stable_diffusion import (
+                OnnxStableDiffusionImg2ImgPipeline,
+                OnnxStableDiffusionInpaintPipeline,
+                OnnxStableDiffusionPipeline,
+                OnnxStableDiffusionUpscalePipeline,
+                StableDiffusionOnnxPipeline,
+            )
+
+        try:
+            if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_torch_and_transformers_and_k_diffusion_objects import *
+        else:
+            from .stable_diffusion_k_diffusion import (
+                StableDiffusionKDiffusionPipeline,
+                StableDiffusionXLKDiffusionPipeline,
+            )
+
+        try:
+            if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()):
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_torch_and_transformers_and_sentencepiece_objects import *
+        else:
+            from .kolors import (
+                KolorsImg2ImgPipeline,
+                KolorsPipeline,
+            )
+
+        try:
+            if not is_flax_available():
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_flax_objects import *  # noqa F403
+        else:
+            from .pipeline_flax_utils import FlaxDiffusionPipeline
+
+        try:
+            if not (is_flax_available() and is_transformers_available()):
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_flax_and_transformers_objects import *
+        else:
+            from .controlnet import FlaxStableDiffusionControlNetPipeline
+            from .stable_diffusion import (
+                FlaxStableDiffusionImg2ImgPipeline,
+                FlaxStableDiffusionInpaintPipeline,
+                FlaxStableDiffusionPipeline,
+            )
+            from .stable_diffusion_xl import (
+                FlaxStableDiffusionXLPipeline,
+            )
+
+        try:
+            if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_transformers_and_torch_and_note_seq_objects import *  # noqa F403
+
+        else:
+            from .deprecated import (
+                MidiProcessor,
+                SpectrogramDiffusionPipeline,
+            )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/pipelines/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1aaf632e0054ecc6302d15a3c253b2d2314fe75c
Binary files /dev/null and b/diffusers/src/diffusers/pipelines/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/pipelines/__pycache__/pipeline_loading_utils.cpython-311.pyc b/diffusers/src/diffusers/pipelines/__pycache__/pipeline_loading_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7b8ea90d54fbe1711fb3ee7bc1eb3a4672ca2fe
Binary files /dev/null and b/diffusers/src/diffusers/pipelines/__pycache__/pipeline_loading_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/pipelines/__pycache__/pipeline_utils.cpython-311.pyc b/diffusers/src/diffusers/pipelines/__pycache__/pipeline_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..98823f7c54b1055b4b4856b3c93d4745883430af
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/__pycache__/pipeline_utils.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6079bba856891af9cd4c7703da7a848573915a582b670deedf485a04ae1a1426
+size 108804
diff --git a/diffusers/src/diffusers/pipelines/amused/__init__.py b/diffusers/src/diffusers/pipelines/amused/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c4d07a426b54fabfcdf35bfb8e4486cd828b3b3
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/amused/__init__.py
@@ -0,0 +1,62 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils.dummy_torch_and_transformers_objects import (
+        AmusedImg2ImgPipeline,
+        AmusedInpaintPipeline,
+        AmusedPipeline,
+    )
+
+    _dummy_objects.update(
+        {
+            "AmusedPipeline": AmusedPipeline,
+            "AmusedImg2ImgPipeline": AmusedImg2ImgPipeline,
+            "AmusedInpaintPipeline": AmusedInpaintPipeline,
+        }
+    )
+else:
+    _import_structure["pipeline_amused"] = ["AmusedPipeline"]
+    _import_structure["pipeline_amused_img2img"] = ["AmusedImg2ImgPipeline"]
+    _import_structure["pipeline_amused_inpaint"] = ["AmusedInpaintPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import (
+            AmusedPipeline,
+        )
+    else:
+        from .pipeline_amused import AmusedPipeline
+        from .pipeline_amused_img2img import AmusedImg2ImgPipeline
+        from .pipeline_amused_inpaint import AmusedInpaintPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/amused/pipeline_amused.py b/diffusers/src/diffusers/pipelines/amused/pipeline_amused.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8c24b0aeeccc8e1980e98b4c3b7bd62900ff630
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/amused/pipeline_amused.py
@@ -0,0 +1,328 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...models import UVit2DModel, VQModel
+from ...schedulers import AmusedScheduler
+from ...utils import replace_example_docstring
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AmusedPipeline
+
+        >>> pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+class AmusedPipeline(DiffusionPipeline):
+    image_processor: VaeImageProcessor
+    vqvae: VQModel
+    tokenizer: CLIPTokenizer
+    text_encoder: CLIPTextModelWithProjection
+    transformer: UVit2DModel
+    scheduler: AmusedScheduler
+
+    model_cpu_offload_seq = "text_encoder->transformer->vqvae"
+
+    def __init__(
+        self,
+        vqvae: VQModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModelWithProjection,
+        transformer: UVit2DModel,
+        scheduler: AmusedScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vqvae=vqvae,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[List[str], str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 12,
+        guidance_scale: float = 10.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.IntTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_encoder_hidden_states: Optional[torch.Tensor] = None,
+        output_type="pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        micro_conditioning_aesthetic_score: int = 6,
+        micro_conditioning_crop_coord: Tuple[int, int] = (0, 0),
+        temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
+    ):
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.transformer.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 16):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 10.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.IntTensor`, *optional*):
+                Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image
+                gneration. If not provided, the starting latents will be completely masked.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument. A single vector from the
+                pooled and projected final hidden states.
+            encoder_hidden_states (`torch.Tensor`, *optional*):
+                Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            negative_encoder_hidden_states (`torch.Tensor`, *optional*):
+                Analogous to `encoder_hidden_states` for the positive prompt.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):
+                The targeted aesthetic score according to the laion aesthetic classifier. See
+                https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
+            micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                The targeted height, width crop coordinates. See the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
+            temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):
+                Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a
+                `tuple` is returned where the first element is a list with the generated images.
+        """
+        if (prompt_embeds is not None and encoder_hidden_states is None) or (
+            prompt_embeds is None and encoder_hidden_states is not None
+        ):
+            raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither")
+
+        if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or (
+            negative_prompt_embeds is None and negative_encoder_hidden_states is not None
+        ):
+            raise ValueError(
+                "pass either both `negatve_prompt_embeds` and `negative_encoder_hidden_states` or neither"
+            )
+
+        if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None):
+            raise ValueError("pass only one of `prompt` or `prompt_embeds`")
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if height is None:
+            height = self.transformer.config.sample_size * self.vae_scale_factor
+
+        if width is None:
+            width = self.transformer.config.sample_size * self.vae_scale_factor
+
+        if prompt_embeds is None:
+            input_ids = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids.to(self._execution_device)
+
+            outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
+            prompt_embeds = outputs.text_embeds
+            encoder_hidden_states = outputs.hidden_states[-2]
+
+        prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1)
+        encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
+
+        if guidance_scale > 1.0:
+            if negative_prompt_embeds is None:
+                if negative_prompt is None:
+                    negative_prompt = [""] * len(prompt)
+
+                if isinstance(negative_prompt, str):
+                    negative_prompt = [negative_prompt]
+
+                input_ids = self.tokenizer(
+                    negative_prompt,
+                    return_tensors="pt",
+                    padding="max_length",
+                    truncation=True,
+                    max_length=self.tokenizer.model_max_length,
+                ).input_ids.to(self._execution_device)
+
+                outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
+                negative_prompt_embeds = outputs.text_embeds
+                negative_encoder_hidden_states = outputs.hidden_states[-2]
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1)
+            negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
+
+            prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds])
+            encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states])
+
+        # Note that the micro conditionings _do_ flip the order of width, height for the original size
+        # and the crop coordinates. This is how it was done in the original code base
+        micro_conds = torch.tensor(
+            [
+                width,
+                height,
+                micro_conditioning_crop_coord[0],
+                micro_conditioning_crop_coord[1],
+                micro_conditioning_aesthetic_score,
+            ],
+            device=self._execution_device,
+            dtype=encoder_hidden_states.dtype,
+        )
+        micro_conds = micro_conds.unsqueeze(0)
+        micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1)
+
+        shape = (batch_size, height // self.vae_scale_factor, width // self.vae_scale_factor)
+
+        if latents is None:
+            latents = torch.full(
+                shape, self.scheduler.config.mask_token_id, dtype=torch.long, device=self._execution_device
+            )
+
+        self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device)
+
+        num_warmup_steps = len(self.scheduler.timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, timestep in enumerate(self.scheduler.timesteps):
+                if guidance_scale > 1.0:
+                    model_input = torch.cat([latents] * 2)
+                else:
+                    model_input = latents
+
+                model_output = self.transformer(
+                    model_input,
+                    micro_conds=micro_conds,
+                    pooled_text_emb=prompt_embeds,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+
+                if guidance_scale > 1.0:
+                    uncond_logits, cond_logits = model_output.chunk(2)
+                    model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits)
+
+                latents = self.scheduler.step(
+                    model_output=model_output,
+                    timestep=timestep,
+                    sample=latents,
+                    generator=generator,
+                ).prev_sample
+
+                if i == len(self.scheduler.timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, timestep, latents)
+
+        if output_type == "latent":
+            output = latents
+        else:
+            needs_upcasting = self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast
+
+            if needs_upcasting:
+                self.vqvae.float()
+
+            output = self.vqvae.decode(
+                latents,
+                force_not_quantize=True,
+                shape=(
+                    batch_size,
+                    height // self.vae_scale_factor,
+                    width // self.vae_scale_factor,
+                    self.vqvae.config.latent_channels,
+                ),
+            ).sample.clip(0, 1)
+            output = self.image_processor.postprocess(output, output_type)
+
+            if needs_upcasting:
+                self.vqvae.half()
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (output,)
+
+        return ImagePipelineOutput(output)
diff --git a/diffusers/src/diffusers/pipelines/amused/pipeline_amused_img2img.py b/diffusers/src/diffusers/pipelines/amused/pipeline_amused_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..c74275b414d4283634b2aecb8037d6946c338244
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/amused/pipeline_amused_img2img.py
@@ -0,0 +1,349 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...models import UVit2DModel, VQModel
+from ...schedulers import AmusedScheduler
+from ...utils import replace_example_docstring
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AmusedImg2ImgPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = AmusedImg2ImgPipeline.from_pretrained(
+        ...     "amused/amused-512", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "winter mountains"
+        >>> input_image = (
+        ...     load_image(
+        ...         "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg"
+        ...     )
+        ...     .resize((512, 512))
+        ...     .convert("RGB")
+        ... )
+        >>> image = pipe(prompt, input_image).images[0]
+        ```
+"""
+
+
+class AmusedImg2ImgPipeline(DiffusionPipeline):
+    image_processor: VaeImageProcessor
+    vqvae: VQModel
+    tokenizer: CLIPTokenizer
+    text_encoder: CLIPTextModelWithProjection
+    transformer: UVit2DModel
+    scheduler: AmusedScheduler
+
+    model_cpu_offload_seq = "text_encoder->transformer->vqvae"
+
+    # TODO - when calling self.vqvae.quantize, it uses self.vqvae.quantize.embedding.weight before
+    # the forward method of self.vqvae.quantize, so the hook doesn't get called to move the parameter
+    # off the meta device. There should be a way to fix this instead of just not offloading it
+    _exclude_from_cpu_offload = ["vqvae"]
+
+    def __init__(
+        self,
+        vqvae: VQModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModelWithProjection,
+        transformer: UVit2DModel,
+        scheduler: AmusedScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vqvae=vqvae,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[List[str], str]] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.5,
+        num_inference_steps: int = 12,
+        guidance_scale: float = 10.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[torch.Generator] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_encoder_hidden_states: Optional[torch.Tensor] = None,
+        output_type="pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        micro_conditioning_aesthetic_score: int = 6,
+        micro_conditioning_crop_coord: Tuple[int, int] = (0, 0),
+        temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
+    ):
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            strength (`float`, *optional*, defaults to 0.5):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 12):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 10.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument. A single vector from the
+                pooled and projected final hidden states.
+            encoder_hidden_states (`torch.Tensor`, *optional*):
+                Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            negative_encoder_hidden_states (`torch.Tensor`, *optional*):
+                Analogous to `encoder_hidden_states` for the positive prompt.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):
+                The targeted aesthetic score according to the laion aesthetic classifier. See
+                https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
+            micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                The targeted height, width crop coordinates. See the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
+            temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):
+                Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a
+                `tuple` is returned where the first element is a list with the generated images.
+        """
+
+        if (prompt_embeds is not None and encoder_hidden_states is None) or (
+            prompt_embeds is None and encoder_hidden_states is not None
+        ):
+            raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither")
+
+        if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or (
+            negative_prompt_embeds is None and negative_encoder_hidden_states is not None
+        ):
+            raise ValueError(
+                "pass either both `negative_prompt_embeds` and `negative_encoder_hidden_states` or neither"
+            )
+
+        if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None):
+            raise ValueError("pass only one of `prompt` or `prompt_embeds`")
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if prompt_embeds is None:
+            input_ids = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids.to(self._execution_device)
+
+            outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
+            prompt_embeds = outputs.text_embeds
+            encoder_hidden_states = outputs.hidden_states[-2]
+
+        prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1)
+        encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
+
+        if guidance_scale > 1.0:
+            if negative_prompt_embeds is None:
+                if negative_prompt is None:
+                    negative_prompt = [""] * len(prompt)
+
+                if isinstance(negative_prompt, str):
+                    negative_prompt = [negative_prompt]
+
+                input_ids = self.tokenizer(
+                    negative_prompt,
+                    return_tensors="pt",
+                    padding="max_length",
+                    truncation=True,
+                    max_length=self.tokenizer.model_max_length,
+                ).input_ids.to(self._execution_device)
+
+                outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
+                negative_prompt_embeds = outputs.text_embeds
+                negative_encoder_hidden_states = outputs.hidden_states[-2]
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1)
+            negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
+
+            prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds])
+            encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states])
+
+        image = self.image_processor.preprocess(image)
+
+        height, width = image.shape[-2:]
+
+        # Note that the micro conditionings _do_ flip the order of width, height for the original size
+        # and the crop coordinates. This is how it was done in the original code base
+        micro_conds = torch.tensor(
+            [
+                width,
+                height,
+                micro_conditioning_crop_coord[0],
+                micro_conditioning_crop_coord[1],
+                micro_conditioning_aesthetic_score,
+            ],
+            device=self._execution_device,
+            dtype=encoder_hidden_states.dtype,
+        )
+
+        micro_conds = micro_conds.unsqueeze(0)
+        micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1)
+
+        self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device)
+        num_inference_steps = int(len(self.scheduler.timesteps) * strength)
+        start_timestep_idx = len(self.scheduler.timesteps) - num_inference_steps
+
+        needs_upcasting = self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast
+
+        if needs_upcasting:
+            self.vqvae.float()
+
+        latents = self.vqvae.encode(image.to(dtype=self.vqvae.dtype, device=self._execution_device)).latents
+        latents_bsz, channels, latents_height, latents_width = latents.shape
+        latents = self.vqvae.quantize(latents)[2][2].reshape(latents_bsz, latents_height, latents_width)
+        latents = self.scheduler.add_noise(
+            latents, self.scheduler.timesteps[start_timestep_idx - 1], generator=generator
+        )
+        latents = latents.repeat(num_images_per_prompt, 1, 1)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i in range(start_timestep_idx, len(self.scheduler.timesteps)):
+                timestep = self.scheduler.timesteps[i]
+
+                if guidance_scale > 1.0:
+                    model_input = torch.cat([latents] * 2)
+                else:
+                    model_input = latents
+
+                model_output = self.transformer(
+                    model_input,
+                    micro_conds=micro_conds,
+                    pooled_text_emb=prompt_embeds,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+
+                if guidance_scale > 1.0:
+                    uncond_logits, cond_logits = model_output.chunk(2)
+                    model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits)
+
+                latents = self.scheduler.step(
+                    model_output=model_output,
+                    timestep=timestep,
+                    sample=latents,
+                    generator=generator,
+                ).prev_sample
+
+                if i == len(self.scheduler.timesteps) - 1 or ((i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, timestep, latents)
+
+        if output_type == "latent":
+            output = latents
+        else:
+            output = self.vqvae.decode(
+                latents,
+                force_not_quantize=True,
+                shape=(
+                    batch_size,
+                    height // self.vae_scale_factor,
+                    width // self.vae_scale_factor,
+                    self.vqvae.config.latent_channels,
+                ),
+            ).sample.clip(0, 1)
+            output = self.image_processor.postprocess(output, output_type)
+
+            if needs_upcasting:
+                self.vqvae.half()
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (output,)
+
+        return ImagePipelineOutput(output)
diff --git a/diffusers/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py b/diffusers/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..24801e0ef977f40280b7af35c21cd427a8f2cf4b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py
@@ -0,0 +1,380 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...models import UVit2DModel, VQModel
+from ...schedulers import AmusedScheduler
+from ...utils import replace_example_docstring
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AmusedInpaintPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = AmusedInpaintPipeline.from_pretrained(
+        ...     "amused/amused-512", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "fall mountains"
+        >>> input_image = (
+        ...     load_image(
+        ...         "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg"
+        ...     )
+        ...     .resize((512, 512))
+        ...     .convert("RGB")
+        ... )
+        >>> mask = (
+        ...     load_image(
+        ...         "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
+        ...     )
+        ...     .resize((512, 512))
+        ...     .convert("L")
+        ... )
+        >>> pipe(prompt, input_image, mask).images[0].save("out.png")
+        ```
+"""
+
+
+class AmusedInpaintPipeline(DiffusionPipeline):
+    image_processor: VaeImageProcessor
+    vqvae: VQModel
+    tokenizer: CLIPTokenizer
+    text_encoder: CLIPTextModelWithProjection
+    transformer: UVit2DModel
+    scheduler: AmusedScheduler
+
+    model_cpu_offload_seq = "text_encoder->transformer->vqvae"
+
+    # TODO - when calling self.vqvae.quantize, it uses self.vqvae.quantize.embedding.weight before
+    # the forward method of self.vqvae.quantize, so the hook doesn't get called to move the parameter
+    # off the meta device. There should be a way to fix this instead of just not offloading it
+    _exclude_from_cpu_offload = ["vqvae"]
+
+    def __init__(
+        self,
+        vqvae: VQModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModelWithProjection,
+        transformer: UVit2DModel,
+        scheduler: AmusedScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vqvae=vqvae,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor,
+            do_normalize=False,
+            do_binarize=True,
+            do_convert_grayscale=True,
+            do_resize=True,
+        )
+        self.scheduler.register_to_config(masking_schedule="linear")
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[List[str], str]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        strength: float = 1.0,
+        num_inference_steps: int = 12,
+        guidance_scale: float = 10.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[torch.Generator] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_encoder_hidden_states: Optional[torch.Tensor] = None,
+        output_type="pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        micro_conditioning_aesthetic_score: int = 6,
+        micro_conditioning_crop_coord: Tuple[int, int] = (0, 0),
+        temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
+    ):
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
+                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
+                color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B,
+                H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
+                1)`, or `(H, W)`.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 16):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 10.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument. A single vector from the
+                pooled and projected final hidden states.
+            encoder_hidden_states (`torch.Tensor`, *optional*):
+                Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            negative_encoder_hidden_states (`torch.Tensor`, *optional*):
+                Analogous to `encoder_hidden_states` for the positive prompt.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):
+                The targeted aesthetic score according to the laion aesthetic classifier. See
+                https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
+            micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                The targeted height, width crop coordinates. See the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
+            temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):
+                Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a
+                `tuple` is returned where the first element is a list with the generated images.
+        """
+
+        if (prompt_embeds is not None and encoder_hidden_states is None) or (
+            prompt_embeds is None and encoder_hidden_states is not None
+        ):
+            raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither")
+
+        if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or (
+            negative_prompt_embeds is None and negative_encoder_hidden_states is not None
+        ):
+            raise ValueError(
+                "pass either both `negatve_prompt_embeds` and `negative_encoder_hidden_states` or neither"
+            )
+
+        if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None):
+            raise ValueError("pass only one of `prompt` or `prompt_embeds`")
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if prompt_embeds is None:
+            input_ids = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids.to(self._execution_device)
+
+            outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
+            prompt_embeds = outputs.text_embeds
+            encoder_hidden_states = outputs.hidden_states[-2]
+
+        prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1)
+        encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
+
+        if guidance_scale > 1.0:
+            if negative_prompt_embeds is None:
+                if negative_prompt is None:
+                    negative_prompt = [""] * len(prompt)
+
+                if isinstance(negative_prompt, str):
+                    negative_prompt = [negative_prompt]
+
+                input_ids = self.tokenizer(
+                    negative_prompt,
+                    return_tensors="pt",
+                    padding="max_length",
+                    truncation=True,
+                    max_length=self.tokenizer.model_max_length,
+                ).input_ids.to(self._execution_device)
+
+                outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
+                negative_prompt_embeds = outputs.text_embeds
+                negative_encoder_hidden_states = outputs.hidden_states[-2]
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1)
+            negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
+
+            prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds])
+            encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states])
+
+        image = self.image_processor.preprocess(image)
+
+        height, width = image.shape[-2:]
+
+        # Note that the micro conditionings _do_ flip the order of width, height for the original size
+        # and the crop coordinates. This is how it was done in the original code base
+        micro_conds = torch.tensor(
+            [
+                width,
+                height,
+                micro_conditioning_crop_coord[0],
+                micro_conditioning_crop_coord[1],
+                micro_conditioning_aesthetic_score,
+            ],
+            device=self._execution_device,
+            dtype=encoder_hidden_states.dtype,
+        )
+
+        micro_conds = micro_conds.unsqueeze(0)
+        micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1)
+
+        self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device)
+        num_inference_steps = int(len(self.scheduler.timesteps) * strength)
+        start_timestep_idx = len(self.scheduler.timesteps) - num_inference_steps
+
+        needs_upcasting = self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast
+
+        if needs_upcasting:
+            self.vqvae.float()
+
+        latents = self.vqvae.encode(image.to(dtype=self.vqvae.dtype, device=self._execution_device)).latents
+        latents_bsz, channels, latents_height, latents_width = latents.shape
+        latents = self.vqvae.quantize(latents)[2][2].reshape(latents_bsz, latents_height, latents_width)
+
+        mask = self.mask_processor.preprocess(
+            mask_image, height // self.vae_scale_factor, width // self.vae_scale_factor
+        )
+        mask = mask.reshape(mask.shape[0], latents_height, latents_width).bool().to(latents.device)
+        latents[mask] = self.scheduler.config.mask_token_id
+
+        starting_mask_ratio = mask.sum() / latents.numel()
+
+        latents = latents.repeat(num_images_per_prompt, 1, 1)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i in range(start_timestep_idx, len(self.scheduler.timesteps)):
+                timestep = self.scheduler.timesteps[i]
+
+                if guidance_scale > 1.0:
+                    model_input = torch.cat([latents] * 2)
+                else:
+                    model_input = latents
+
+                model_output = self.transformer(
+                    model_input,
+                    micro_conds=micro_conds,
+                    pooled_text_emb=prompt_embeds,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+
+                if guidance_scale > 1.0:
+                    uncond_logits, cond_logits = model_output.chunk(2)
+                    model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits)
+
+                latents = self.scheduler.step(
+                    model_output=model_output,
+                    timestep=timestep,
+                    sample=latents,
+                    generator=generator,
+                    starting_mask_ratio=starting_mask_ratio,
+                ).prev_sample
+
+                if i == len(self.scheduler.timesteps) - 1 or ((i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, timestep, latents)
+
+        if output_type == "latent":
+            output = latents
+        else:
+            output = self.vqvae.decode(
+                latents,
+                force_not_quantize=True,
+                shape=(
+                    batch_size,
+                    height // self.vae_scale_factor,
+                    width // self.vae_scale_factor,
+                    self.vqvae.config.latent_channels,
+                ),
+            ).sample.clip(0, 1)
+            output = self.image_processor.postprocess(output, output_type)
+
+            if needs_upcasting:
+                self.vqvae.half()
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (output,)
+
+        return ImagePipelineOutput(output)
diff --git a/diffusers/src/diffusers/pipelines/animatediff/__init__.py b/diffusers/src/diffusers/pipelines/animatediff/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d916abf2d85dfeee217479f14711dc2d33002ae6
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/animatediff/__init__.py
@@ -0,0 +1,57 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {"pipeline_output": ["AnimateDiffPipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"]
+    _import_structure["pipeline_animatediff_controlnet"] = ["AnimateDiffControlNetPipeline"]
+    _import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"]
+    _import_structure["pipeline_animatediff_sparsectrl"] = ["AnimateDiffSparseControlNetPipeline"]
+    _import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"]
+    _import_structure["pipeline_animatediff_video2video_controlnet"] = ["AnimateDiffVideoToVideoControlNetPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+
+    else:
+        from .pipeline_animatediff import AnimateDiffPipeline
+        from .pipeline_animatediff_controlnet import AnimateDiffControlNetPipeline
+        from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline
+        from .pipeline_animatediff_sparsectrl import AnimateDiffSparseControlNetPipeline
+        from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline
+        from .pipeline_animatediff_video2video_controlnet import AnimateDiffVideoToVideoControlNetPipeline
+        from .pipeline_output import AnimateDiffPipelineOutput
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb6f50f43c4fd958abb9d90f40b418361fa7a18c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff.py
@@ -0,0 +1,860 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...models.unets.unet_motion_model import MotionAdapter
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import AnimateDiffPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
+        >>> from diffusers.utils import export_to_gif
+
+        >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
+        >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
+        >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
+        >>> output = pipe(prompt="A corgi walking in the park")
+        >>> frames = output.frames[0]
+        >>> export_to_gif(frames, "animation.gif")
+        ```
+"""
+
+
+class AnimateDiffPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
+):
+    r"""
+    Pipeline for text-to-video generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
+        motion_adapter ([`MotionAdapter`]):
+            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: Union[UNet2DConditionModel, UNetMotionModel],
+        motion_adapter: MotionAdapter,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+    ):
+        super().__init__()
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    def decode_latents(self, latents, decode_chunk_size: int = 16):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        video = []
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            batch_latents = latents[i : i + decode_chunk_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and not isinstance(prompt, (str, list, dict)):
+            raise ValueError(f"`prompt` has to be of type `str`, `list` or `dict` but is {type(prompt)=}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        # If FreeNoise is enabled, generate latents as described in Equation (7) of [FreeNoise](https://arxiv.org/abs/2310.15169)
+        if self.free_noise_enabled:
+            latents = self._prepare_latents_free_noise(
+                batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents
+            )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        num_frames: Optional[int] = 16,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_chunk_size: int = 16,
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_frames (`int`, *optional*, defaults to 16):
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+                amounts to 2 seconds of video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            decode_chunk_size (`int`, defaults to `16`):
+                The number of frames to decode at a time when calling `decode_latents` method.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, (str, dict)):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        if self.free_noise_enabled:
+            prompt_embeds, negative_prompt_embeds = self._encode_prompt_free_noise(
+                prompt=prompt,
+                num_frames=num_frames,
+                device=device,
+                num_videos_per_prompt=num_videos_per_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                negative_prompt=negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=self.clip_skip,
+            )
+        else:
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt,
+                device,
+                num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=self.clip_skip,
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            if self.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+            prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+        for free_init_iter in range(num_free_init_iters):
+            if self.free_init_enabled:
+                latents, timesteps = self._apply_free_init(
+                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                )
+
+            self._num_timesteps = len(timesteps)
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+            # 8. Denoising loop
+            with self.progress_bar(total=self._num_timesteps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    if self.interrupt:
+                        continue
+
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                    ).sample
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        if callback is not None and i % callback_steps == 0:
+                            callback(i, t, latents)
+
+        # 9. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents, decode_chunk_size)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 10. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5357d6d5b8d96609a8ecb5d81892dfc5371dc53f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py
@@ -0,0 +1,1100 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...models.unets.unet_motion_model import MotionAdapter
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...video_processor import VideoProcessor
+from ..controlnet.multicontrolnet import MultiControlNetModel
+from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import AnimateDiffPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import (
+        ...     AnimateDiffControlNetPipeline,
+        ...     AutoencoderKL,
+        ...     ControlNetModel,
+        ...     MotionAdapter,
+        ...     LCMScheduler,
+        ... )
+        >>> from diffusers.utils import export_to_gif, load_video
+
+        >>> # Additionally, you will need a preprocess videos before they can be used with the ControlNet
+        >>> # HF maintains just the right package for it: `pip install controlnet_aux`
+        >>> from controlnet_aux.processor import ZoeDetector
+
+        >>> # Download controlnets from https://huggingface.co/lllyasviel/ControlNet-v1-1 to use .from_single_file
+        >>> # Download Diffusers-format controlnets, such as https://huggingface.co/lllyasviel/sd-controlnet-depth, to use .from_pretrained()
+        >>> controlnet = ControlNetModel.from_single_file("control_v11f1p_sd15_depth.pth", torch_dtype=torch.float16)
+
+        >>> # We use AnimateLCM for this example but one can use the original motion adapters as well (for example, https://huggingface.co/guoyww/animatediff-motion-adapter-v1-5-3)
+        >>> motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
+
+        >>> vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+        >>> pipe: AnimateDiffControlNetPipeline = AnimateDiffControlNetPipeline.from_pretrained(
+        ...     "SG161222/Realistic_Vision_V5.1_noVAE",
+        ...     motion_adapter=motion_adapter,
+        ...     controlnet=controlnet,
+        ...     vae=vae,
+        ... ).to(device="cuda", dtype=torch.float16)
+        >>> pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+        >>> pipe.load_lora_weights(
+        ...     "wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora"
+        ... )
+        >>> pipe.set_adapters(["lcm-lora"], [0.8])
+
+        >>> depth_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
+        >>> video = load_video(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif"
+        ... )
+        >>> conditioning_frames = []
+
+        >>> with pipe.progress_bar(total=len(video)) as progress_bar:
+        ...     for frame in video:
+        ...         conditioning_frames.append(depth_detector(frame))
+        ...         progress_bar.update()
+
+        >>> prompt = "a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality"
+        >>> negative_prompt = "bad quality, worst quality"
+
+        >>> video = pipe(
+        ...     prompt=prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     num_frames=len(video),
+        ...     num_inference_steps=10,
+        ...     guidance_scale=2.0,
+        ...     conditioning_frames=conditioning_frames,
+        ...     generator=torch.Generator().manual_seed(42),
+        ... ).frames[0]
+
+        >>> export_to_gif(video, "animatediff_controlnet.gif", fps=8)
+        ```
+"""
+
+
+class AnimateDiffControlNetPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
+):
+    r"""
+    Pipeline for text-to-video generation with ControlNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
+        motion_adapter ([`MotionAdapter`]):
+            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: Union[UNet2DConditionModel, UNetMotionModel],
+        motion_adapter: MotionAdapter,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        feature_extractor: Optional[CLIPImageProcessor] = None,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+    ):
+        super().__init__()
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.control_video_processor = VideoProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
+    def decode_latents(self, latents, decode_chunk_size: int = 16):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        video = []
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            batch_latents = latents[i : i + decode_chunk_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        num_frames,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        video=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and not isinstance(prompt, (str, list, dict)):
+            raise ValueError(f"`prompt` has to be of type `str`, `list` or `dict` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(video, list):
+                raise TypeError(f"For single controlnet, `image` must be of type `list` but got {type(video)}")
+            if len(video) != num_frames:
+                raise ValueError(f"Excepted image to have length {num_frames} but got {len(video)=}")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(video, list) or not isinstance(video[0], list):
+                raise TypeError(f"For multiple controlnets: `image` must be type list of lists but got {type(video)=}")
+            if len(video[0]) != num_frames:
+                raise ValueError(f"Expected length of image sublist as {num_frames} but got {len(video[0])=}")
+            if any(len(img) != len(video[0]) for img in video):
+                raise ValueError("All conditioning frame batches for multicontrolnet must be same size")
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.prepare_latents
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        # If FreeNoise is enabled, generate latents as described in Equation (7) of [FreeNoise](https://arxiv.org/abs/2310.15169)
+        if self.free_noise_enabled:
+            latents = self._prepare_latents_free_noise(
+                batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents
+            )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_video(
+        self,
+        video,
+        width,
+        height,
+        batch_size,
+        num_videos_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        video = self.control_video_processor.preprocess_video(video, height=height, width=width).to(
+            dtype=torch.float32
+        )
+        video = video.permute(0, 2, 1, 3, 4).flatten(0, 1)
+        video_batch_size = video.shape[0]
+
+        if video_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_videos_per_prompt
+
+        video = video.repeat_interleave(repeat_by, dim=0)
+        video = video.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            video = torch.cat([video] * 2)
+
+        return video
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_frames: Optional[int] = 16,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[PipelineImageInput] = None,
+        conditioning_frames: Optional[List[PipelineImageInput]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_chunk_size: int = 16,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_frames (`int`, *optional*, defaults to 16):
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+                amounts to 2 seconds of video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            conditioning_frames (`List[PipelineImageInput]`, *optional*):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If multiple
+                ControlNets are specified, images must be passed as a list such that each element of the list can be
+                correctly batched for input to a single ControlNet.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_frames=num_frames,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            video=conditioning_frames,
+            controlnet_conditioning_scale=controlnet_conditioning_scale,
+            control_guidance_start=control_guidance_start,
+            control_guidance_end=control_guidance_end,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, (str, dict)):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        if self.free_noise_enabled:
+            prompt_embeds, negative_prompt_embeds = self._encode_prompt_free_noise(
+                prompt=prompt,
+                num_frames=num_frames,
+                device=device,
+                num_videos_per_prompt=num_videos_per_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                negative_prompt=negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=self.clip_skip,
+            )
+        else:
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt,
+                device,
+                num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=self.clip_skip,
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            if self.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+            prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        if isinstance(controlnet, ControlNetModel):
+            conditioning_frames = self.prepare_video(
+                video=conditioning_frames,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_videos_per_prompt * num_frames,
+                num_videos_per_prompt=num_videos_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif isinstance(controlnet, MultiControlNetModel):
+            cond_prepared_videos = []
+            for frame_ in conditioning_frames:
+                prepared_video = self.prepare_video(
+                    video=frame_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_videos_per_prompt * num_frames,
+                    num_videos_per_prompt=num_videos_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+                cond_prepared_videos.append(prepared_video)
+            conditioning_frames = cond_prepared_videos
+        else:
+            assert False
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 7.1 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+        for free_init_iter in range(num_free_init_iters):
+            if self.free_init_enabled:
+                latents, timesteps = self._apply_free_init(
+                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                )
+
+            self._num_timesteps = len(timesteps)
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+            # 8. Denoising loop
+            with self.progress_bar(total=self._num_timesteps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    if self.interrupt:
+                        continue
+
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    if guess_mode and self.do_classifier_free_guidance:
+                        # Infer ControlNet only for the conditional batch.
+                        control_model_input = latents
+                        control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                        controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    else:
+                        control_model_input = latent_model_input
+                        controlnet_prompt_embeds = prompt_embeds
+
+                    if isinstance(controlnet_keep[i], list):
+                        cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                    else:
+                        controlnet_cond_scale = controlnet_conditioning_scale
+                        if isinstance(controlnet_cond_scale, list):
+                            controlnet_cond_scale = controlnet_cond_scale[0]
+                        cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                    control_model_input = torch.transpose(control_model_input, 1, 2)
+                    control_model_input = control_model_input.reshape(
+                        (-1, control_model_input.shape[2], control_model_input.shape[3], control_model_input.shape[4])
+                    )
+
+                    down_block_res_samples, mid_block_res_sample = self.controlnet(
+                        control_model_input,
+                        t,
+                        encoder_hidden_states=controlnet_prompt_embeds,
+                        controlnet_cond=conditioning_frames,
+                        conditioning_scale=cond_scale,
+                        guess_mode=guess_mode,
+                        return_dict=False,
+                    )
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=self.cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                        down_block_additional_residuals=down_block_res_samples,
+                        mid_block_additional_residual=mid_block_res_sample,
+                    ).sample
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+
+        # 9. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents, decode_chunk_size)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 10. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..e531c91c168fc1dc780c7f2588d53d7666baad7c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py
@@ -0,0 +1,1276 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...image_processor import PipelineImageInput
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ImageProjection, MotionAdapter, UNet2DConditionModel, UNetMotionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ...utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..free_init_utils import FreeInitMixin
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import AnimateDiffPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers.models import MotionAdapter
+        >>> from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler
+        >>> from diffusers.utils import export_to_gif
+
+        >>> adapter = MotionAdapter.from_pretrained(
+        ...     "a-r-r-o-w/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16
+        ... )
+
+        >>> model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+        >>> scheduler = DDIMScheduler.from_pretrained(
+        ...     model_id,
+        ...     subfolder="scheduler",
+        ...     clip_sample=False,
+        ...     timestep_spacing="linspace",
+        ...     beta_schedule="linear",
+        ...     steps_offset=1,
+        ... )
+        >>> pipe = AnimateDiffSDXLPipeline.from_pretrained(
+        ...     model_id,
+        ...     motion_adapter=adapter,
+        ...     scheduler=scheduler,
+        ...     torch_dtype=torch.float16,
+        ...     variant="fp16",
+        ... ).to("cuda")
+
+        >>> # enable memory savings
+        >>> pipe.enable_vae_slicing()
+        >>> pipe.enable_vae_tiling()
+
+        >>> output = pipe(
+        ...     prompt="a panda surfing in the ocean, realistic, high quality",
+        ...     negative_prompt="low quality, worst quality",
+        ...     num_inference_steps=20,
+        ...     guidance_scale=8,
+        ...     width=1024,
+        ...     height=1024,
+        ...     num_frames=16,
+        ... )
+
+        >>> frames = output.frames[0]
+        >>> export_to_gif(frames, "animation.gif")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class AnimateDiffSDXLPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    FreeInitMixin,
+):
+    r"""
+    Pipeline for text-to-video generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: Union[UNet2DConditionModel, UNetMotionModel],
+        motion_adapter: MotionAdapter,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = True,
+    ):
+        super().__init__()
+
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.default_sample_size = self.unet.config.sample_size
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt with num_images_per_prompt->num_videos_per_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_videos_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_videos_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_videos_per_prompt).view(
+            bs_embed * num_videos_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_videos_per_prompt).view(
+                bs_embed * num_videos_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        image = self.vae.decode(latents).sample
+        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        num_frames: int = 16,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            num_frames:
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+                amounts to 2 seconds of video.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated video. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated video. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower video quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the
+                `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.AnimateDiffPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_videos_per_prompt=num_videos_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_videos_per_prompt, 1)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 7.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 8. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_videos_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+        for free_init_iter in range(num_free_init_iters):
+            if self.free_init_enabled:
+                latents, timesteps = self._apply_free_init(
+                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                )
+
+            self._num_timesteps = len(timesteps)
+
+            # 9. Denoising loop
+            with self.progress_bar(total=self._num_timesteps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    if self.interrupt:
+                        continue
+
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    # predict the noise residual
+                    added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                    if ip_adapter_image is not None or ip_adapter_image_embeds:
+                        added_cond_kwargs["image_embeds"] = image_embeds
+
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep_cond=timestep_cond,
+                        cross_attention_kwargs=self.cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                        return_dict=False,
+                    )[0]
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                        noise_pred = rescale_noise_cfg(
+                            noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
+                        )
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                        add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                        negative_pooled_prompt_embeds = callback_outputs.pop(
+                            "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                        )
+                        add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                        negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+                    progress_bar.update()
+
+        # make sure the VAE is in float32 mode, as it overflows in float16
+        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+        if needs_upcasting:
+            self.upcast_vae()
+            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+        # 10. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # cast back to fp16 if needed
+        if needs_upcasting:
+            self.vae.to(dtype=torch.float16)
+
+        # 11. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b037cdc34fbb45282e06d65798844429f1be2c5
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py
@@ -0,0 +1,1010 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from ...models.controlnet_sparsectrl import SparseControlNetModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...models.unets.unet_motion_model import MotionAdapter
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...video_processor import VideoProcessor
+from ..free_init_utils import FreeInitMixin
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import AnimateDiffPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import torch
+        >>> from diffusers import AnimateDiffSparseControlNetPipeline
+        >>> from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
+        >>> from diffusers.schedulers import DPMSolverMultistepScheduler
+        >>> from diffusers.utils import export_to_gif, load_image
+
+        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
+        >>> motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
+        >>> controlnet_id = "guoyww/animatediff-sparsectrl-scribble"
+        >>> lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
+        >>> vae_id = "stabilityai/sd-vae-ft-mse"
+        >>> device = "cuda"
+
+        >>> motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
+        >>> controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
+        >>> vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
+        >>> scheduler = DPMSolverMultistepScheduler.from_pretrained(
+        ...     model_id,
+        ...     subfolder="scheduler",
+        ...     beta_schedule="linear",
+        ...     algorithm_type="dpmsolver++",
+        ...     use_karras_sigmas=True,
+        ... )
+        >>> pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
+        ...     model_id,
+        ...     motion_adapter=motion_adapter,
+        ...     controlnet=controlnet,
+        ...     vae=vae,
+        ...     scheduler=scheduler,
+        ...     torch_dtype=torch.float16,
+        ... ).to(device)
+        >>> pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
+        >>> pipe.fuse_lora(lora_scale=1.0)
+
+        >>> prompt = "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality"
+        >>> negative_prompt = "low quality, worst quality, letterboxed"
+
+        >>> image_files = [
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png",
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png",
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png",
+        ... ]
+        >>> condition_frame_indices = [0, 8, 15]
+        >>> conditioning_frames = [load_image(img_file) for img_file in image_files]
+
+        >>> video = pipe(
+        ...     prompt=prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     num_inference_steps=25,
+        ...     conditioning_frames=conditioning_frames,
+        ...     controlnet_conditioning_scale=1.0,
+        ...     controlnet_frame_indices=condition_frame_indices,
+        ...     generator=torch.Generator().manual_seed(1337),
+        ... ).frames[0]
+        >>> export_to_gif(video, "output.gif")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class AnimateDiffSparseControlNetPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FreeInitMixin,
+):
+    r"""
+    Pipeline for controlled text-to-video generation using the method described in [SparseCtrl: Adding Sparse Controls
+    to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933).
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
+        motion_adapter ([`MotionAdapter`]):
+            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: Union[UNet2DConditionModel, UNetMotionModel],
+        motion_adapter: MotionAdapter,
+        controlnet: SparseControlNetModel,
+        scheduler: KarrasDiffusionSchedulers,
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+    ):
+        super().__init__()
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        image = self.vae.decode(latents).sample
+        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        image=None,
+        controlnet_conditioning_scale: float = 1.0,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+
+        # check `image`
+        if (
+            isinstance(self.controlnet, SparseControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, SparseControlNetModel)
+        ):
+            if isinstance(image, list):
+                for image_ in image:
+                    self.check_image(image_, prompt, prompt_embeds)
+            else:
+                self.check_image(image, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, SparseControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, SparseControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        else:
+            assert False
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_image(self, image, width, height, device, dtype):
+        image = self.control_image_processor.preprocess(image, height=height, width=width)
+        controlnet_images = image.unsqueeze(0).to(device, dtype)
+        batch_size, num_frames, channels, height, width = controlnet_images.shape
+
+        # TODO: remove below line
+        assert controlnet_images.min() >= 0 and controlnet_images.max() <= 1
+
+        if self.controlnet.use_simplified_condition_embedding:
+            controlnet_images = controlnet_images.reshape(batch_size * num_frames, channels, height, width)
+            controlnet_images = 2 * controlnet_images - 1
+            conditioning_frames = retrieve_latents(self.vae.encode(controlnet_images)) * self.vae.config.scaling_factor
+            conditioning_frames = conditioning_frames.reshape(
+                batch_size, num_frames, 4, height // self.vae_scale_factor, width // self.vae_scale_factor
+            )
+        else:
+            conditioning_frames = controlnet_images
+
+        conditioning_frames = conditioning_frames.permute(0, 2, 1, 3, 4)  # [b, c, f, h, w]
+        return conditioning_frames
+
+    def prepare_sparse_control_conditioning(
+        self,
+        conditioning_frames: torch.Tensor,
+        num_frames: int,
+        controlnet_frame_indices: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert conditioning_frames.shape[2] >= len(controlnet_frame_indices)
+
+        batch_size, channels, _, height, width = conditioning_frames.shape
+        controlnet_cond = torch.zeros((batch_size, channels, num_frames, height, width), dtype=dtype, device=device)
+        controlnet_cond_mask = torch.zeros((batch_size, 1, num_frames, height, width), dtype=dtype, device=device)
+        controlnet_cond[:, :, controlnet_frame_indices] = conditioning_frames[:, :, : len(controlnet_frame_indices)]
+        controlnet_cond_mask[:, :, controlnet_frame_indices] = 1
+
+        return controlnet_cond, controlnet_cond_mask
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_frames: int = 16,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: int = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        conditioning_frames: Optional[List[PipelineImageInput]] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        controlnet_frame_indices: List[int] = [0],
+        guess_mode: bool = False,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_frames (`int`, *optional*, defaults to 16):
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+                amounts to 2 seconds of video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            conditioning_frames (`List[PipelineImageInput]`, *optional*):
+                The SparseControlNet input to provide guidance to the `unet` for generation.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            controlnet_frame_indices (`List[int]`):
+                The indices where the conditioning frames must be applied for generation. Multiple frames can be
+                provided to guide the model to generate similar structure outputs, where the `unet` can
+                "fill-in-the-gaps" for interpolation videos, or a single frame could be provided for general expected
+                structure. Must have the same length as `conditioning_frames`.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            ip_adapter_image=ip_adapter_image,
+            ip_adapter_image_embeds=ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            image=conditioning_frames,
+            controlnet_conditioning_scale=controlnet_conditioning_scale,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, SparseControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_videos_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
+
+        # 4. Prepare IP-Adapter embeddings
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 5. Prepare controlnet conditioning
+        conditioning_frames = self.prepare_image(conditioning_frames, width, height, device, controlnet.dtype)
+        controlnet_cond, controlnet_cond_mask = self.prepare_sparse_control_conditioning(
+            conditioning_frames, num_frames, controlnet_frame_indices, device, controlnet.dtype
+        )
+
+        # 6. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 7. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+        for free_init_iter in range(num_free_init_iters):
+            if self.free_init_enabled:
+                latents, timesteps = self._apply_free_init(
+                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                )
+
+            self._num_timesteps = len(timesteps)
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+            # 10. Denoising loop
+            with self.progress_bar(total=self._num_timesteps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    if guess_mode and self.do_classifier_free_guidance:
+                        # Infer SparseControlNetModel only for the conditional batch.
+                        control_model_input = latents
+                        control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                        controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    else:
+                        control_model_input = latent_model_input
+                        controlnet_prompt_embeds = prompt_embeds
+
+                    down_block_res_samples, mid_block_res_sample = self.controlnet(
+                        control_model_input,
+                        t,
+                        encoder_hidden_states=controlnet_prompt_embeds,
+                        controlnet_cond=controlnet_cond,
+                        conditioning_mask=controlnet_cond_mask,
+                        conditioning_scale=controlnet_conditioning_scale,
+                        guess_mode=guess_mode,
+                        return_dict=False,
+                    )
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                        down_block_additional_residuals=down_block_res_samples,
+                        mid_block_additional_residual=mid_block_res_sample,
+                    ).sample
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+
+        # 11. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 12. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ebe2b9b60ddd84efd2fb78e3369d523a73f8d4f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py
@@ -0,0 +1,1059 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...models.unets.unet_motion_model import MotionAdapter
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import AnimateDiffPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import imageio
+        >>> import requests
+        >>> import torch
+        >>> from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter
+        >>> from diffusers.utils import export_to_gif
+        >>> from io import BytesIO
+        >>> from PIL import Image
+
+        >>> adapter = MotionAdapter.from_pretrained(
+        ...     "guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16
+        ... )
+        >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
+        ...     "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter
+        ... ).to("cuda")
+        >>> pipe.scheduler = DDIMScheduler(
+        ...     beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace"
+        ... )
+
+
+        >>> def load_video(file_path: str):
+        ...     images = []
+
+        ...     if file_path.startswith(("http://", "https://")):
+        ...         # If the file_path is a URL
+        ...         response = requests.get(file_path)
+        ...         response.raise_for_status()
+        ...         content = BytesIO(response.content)
+        ...         vid = imageio.get_reader(content)
+        ...     else:
+        ...         # Assuming it's a local file path
+        ...         vid = imageio.get_reader(file_path)
+
+        ...     for frame in vid:
+        ...         pil_image = Image.fromarray(frame)
+        ...         images.append(pil_image)
+
+        ...     return images
+
+
+        >>> video = load_video(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif"
+        ... )
+        >>> output = pipe(
+        ...     video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5
+        ... )
+        >>> frames = output.frames[0]
+        >>> export_to_gif(frames, "animation.gif")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class AnimateDiffVideoToVideoPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
+):
+    r"""
+    Pipeline for video-to-video generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
+        motion_adapter ([`MotionAdapter`]):
+            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        motion_adapter: MotionAdapter,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+    ):
+        super().__init__()
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, (str, dict)):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    def encode_video(self, video, generator, decode_chunk_size: int = 16) -> torch.Tensor:
+        latents = []
+        for i in range(0, len(video), decode_chunk_size):
+            batch_video = video[i : i + decode_chunk_size]
+            batch_video = retrieve_latents(self.vae.encode(batch_video), generator=generator)
+            latents.append(batch_video)
+        return torch.cat(latents)
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
+    def decode_latents(self, latents, decode_chunk_size: int = 16):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        video = []
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            batch_latents = latents[i : i + decode_chunk_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        strength,
+        height,
+        width,
+        video=None,
+        latents=None,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and not isinstance(prompt, (str, list, dict)):
+            raise ValueError(f"`prompt` has to be of type `str`, `list` or `dict` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if video is not None and latents is not None:
+            raise ValueError("Only one of `video` or `latents` should be provided")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def get_timesteps(self, num_inference_steps, timesteps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(
+        self,
+        video: Optional[torch.Tensor] = None,
+        height: int = 64,
+        width: int = 64,
+        num_channels_latents: int = 4,
+        batch_size: int = 1,
+        timestep: Optional[int] = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        decode_chunk_size: int = 16,
+        add_noise: bool = False,
+    ) -> torch.Tensor:
+        num_frames = video.shape[1] if latents is None else latents.shape[2]
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                video = video.float()
+                self.vae.to(dtype=torch.float32)
+
+            if isinstance(generator, list):
+                if len(generator) != batch_size:
+                    raise ValueError(
+                        f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                        f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                    )
+
+                init_latents = [
+                    self.encode_video(video[i], generator[i], decode_chunk_size).unsqueeze(0)
+                    for i in range(batch_size)
+                ]
+            else:
+                init_latents = [self.encode_video(vid, generator, decode_chunk_size).unsqueeze(0) for vid in video]
+
+            init_latents = torch.cat(init_latents, dim=0)
+
+            # restore vae to original dtype
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+
+            init_latents = init_latents.to(dtype)
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+            if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+                # expand init_latents for batch_size
+                error_message = (
+                    f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                    " images (`image`). Please make sure to update your script to pass as many initial images as text prompts"
+                )
+                raise ValueError(error_message)
+            elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+                )
+            else:
+                init_latents = torch.cat([init_latents], dim=0)
+
+            noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype)
+            latents = self.scheduler.add_noise(init_latents, noise, timestep).permute(0, 2, 1, 3, 4)
+        else:
+            if shape != latents.shape:
+                # [B, C, F, H, W]
+                raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}")
+
+            latents = latents.to(device, dtype=dtype)
+
+            if add_noise:
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                latents = self.scheduler.add_noise(latents, noise, timestep)
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        video: List[List[PipelineImageInput]] = None,
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        enforce_inference_steps: bool = False,
+        timesteps: Optional[List[int]] = None,
+        sigmas: Optional[List[float]] = None,
+        guidance_scale: float = 7.5,
+        strength: float = 0.8,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_chunk_size: int = 16,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            video (`List[PipelineImageInput]`):
+                The input video to condition the generation on. Must be a list of images/frames of the video.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            strength (`float`, *optional*, defaults to 0.8):
+                Higher strength leads to more differences between original video and generated video.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            decode_chunk_size (`int`, defaults to `16`):
+                The number of frames to decode at a time when calling `decode_latents` method.
+
+        Examples:
+
+        Returns:
+            [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            strength=strength,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            video=video,
+            latents=latents,
+            ip_adapter_image=ip_adapter_image,
+            ip_adapter_image_embeds=ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, (str, dict)):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        dtype = self.dtype
+
+        # 3. Prepare timesteps
+        if not enforce_inference_steps:
+            timesteps, num_inference_steps = retrieve_timesteps(
+                self.scheduler, num_inference_steps, device, timesteps, sigmas
+            )
+            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
+            latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
+        else:
+            denoising_inference_steps = int(num_inference_steps / strength)
+            timesteps, denoising_inference_steps = retrieve_timesteps(
+                self.scheduler, denoising_inference_steps, device, timesteps, sigmas
+            )
+            timesteps = timesteps[-num_inference_steps:]
+            latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
+
+        # 4. Prepare latent variables
+        if latents is None:
+            video = self.video_processor.preprocess_video(video, height=height, width=width)
+            # Move the number of frames before the number of channels.
+            video = video.permute(0, 2, 1, 3, 4)
+            video = video.to(device=device, dtype=dtype)
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            video=video,
+            height=height,
+            width=width,
+            num_channels_latents=num_channels_latents,
+            batch_size=batch_size * num_videos_per_prompt,
+            timestep=latent_timestep,
+            dtype=dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+            decode_chunk_size=decode_chunk_size,
+            add_noise=enforce_inference_steps,
+        )
+
+        # 5. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        num_frames = latents.shape[2]
+        if self.free_noise_enabled:
+            prompt_embeds, negative_prompt_embeds = self._encode_prompt_free_noise(
+                prompt=prompt,
+                num_frames=num_frames,
+                device=device,
+                num_videos_per_prompt=num_videos_per_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                negative_prompt=negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=self.clip_skip,
+            )
+        else:
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt,
+                device,
+                num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=self.clip_skip,
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            if self.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+            prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
+
+        # 6. Prepare IP-Adapter embeddings
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+        for free_init_iter in range(num_free_init_iters):
+            if self.free_init_enabled:
+                latents, timesteps = self._apply_free_init(
+                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                )
+                num_inference_steps = len(timesteps)
+                # make sure to readjust timesteps based on strength
+                timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
+
+            self._num_timesteps = len(timesteps)
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+            # 9. Denoising loop
+            with self.progress_bar(total=self._num_timesteps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    if self.interrupt:
+                        continue
+
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=self.cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                    ).sample
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+
+        # 10. Post-processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents, decode_chunk_size)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 11. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d26f95a2f586aba68f55f7b9eadf6cca58eac71
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py
@@ -0,0 +1,1341 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...models.unets.unet_motion_model import MotionAdapter
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...video_processor import VideoProcessor
+from ..controlnet.multicontrolnet import MultiControlNetModel
+from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import AnimateDiffPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from PIL import Image
+        >>> from tqdm.auto import tqdm
+
+        >>> from diffusers import AnimateDiffVideoToVideoControlNetPipeline
+        >>> from diffusers.utils import export_to_gif, load_video
+        >>> from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter, LCMScheduler
+
+        >>> controlnet = ControlNetModel.from_pretrained(
+        ...     "lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16
+        ... )
+        >>> motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
+        >>> vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+
+        >>> pipe = AnimateDiffVideoToVideoControlNetPipeline.from_pretrained(
+        ...     "SG161222/Realistic_Vision_V5.1_noVAE",
+        ...     motion_adapter=motion_adapter,
+        ...     controlnet=controlnet,
+        ...     vae=vae,
+        ... ).to(device="cuda", dtype=torch.float16)
+
+        >>> pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+        >>> pipe.load_lora_weights(
+        ...     "wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora"
+        ... )
+        >>> pipe.set_adapters(["lcm-lora"], [0.8])
+
+        >>> video = load_video(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/dance.gif"
+        ... )
+        >>> video = [frame.convert("RGB") for frame in video]
+
+        >>> from controlnet_aux.processor import OpenposeDetector
+
+        >>> open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
+        >>> for frame in tqdm(video):
+        ...     conditioning_frames.append(open_pose(frame))
+
+        >>> prompt = "astronaut in space, dancing"
+        >>> negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly"
+
+        >>> strength = 0.8
+        >>> with torch.inference_mode():
+        ...     video = pipe(
+        ...         video=video,
+        ...         prompt=prompt,
+        ...         negative_prompt=negative_prompt,
+        ...         num_inference_steps=10,
+        ...         guidance_scale=2.0,
+        ...         controlnet_conditioning_scale=0.75,
+        ...         conditioning_frames=conditioning_frames,
+        ...         strength=strength,
+        ...         generator=torch.Generator().manual_seed(42),
+        ...     ).frames[0]
+
+        >>> video = [frame.resize(conditioning_frames[0].size) for frame in video]
+        >>> export_to_gif(video, f"animatediff_vid2vid_controlnet.gif", fps=8)
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class AnimateDiffVideoToVideoControlNetPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
+):
+    r"""
+    Pipeline for video-to-video generation with ControlNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
+        motion_adapter ([`MotionAdapter`]):
+            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]` or `Tuple[ControlNetModel]` or `MultiControlNetModel`):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        motion_adapter: MotionAdapter,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+    ):
+        super().__init__()
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.control_video_processor = VideoProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, (str, dict)):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.encode_video
+    def encode_video(self, video, generator, decode_chunk_size: int = 16) -> torch.Tensor:
+        latents = []
+        for i in range(0, len(video), decode_chunk_size):
+            batch_video = video[i : i + decode_chunk_size]
+            batch_video = retrieve_latents(self.vae.encode(batch_video), generator=generator)
+            latents.append(batch_video)
+        return torch.cat(latents)
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
+    def decode_latents(self, latents, decode_chunk_size: int = 16):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        video = []
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            batch_latents = latents[i : i + decode_chunk_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        strength,
+        height,
+        width,
+        video=None,
+        conditioning_frames=None,
+        latents=None,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and not isinstance(prompt, (str, list, dict)):
+            raise ValueError(f"`prompt` has to be of type `str`, `list` or `dict` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if video is not None and latents is not None:
+            raise ValueError("Only one of `video` or `latents` should be provided")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+
+        num_frames = len(video) if latents is None else latents.shape[2]
+
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(conditioning_frames, list):
+                raise TypeError(
+                    f"For single controlnet, `image` must be of type `list` but got {type(conditioning_frames)}"
+                )
+            if len(conditioning_frames) != num_frames:
+                raise ValueError(f"Excepted image to have length {num_frames} but got {len(conditioning_frames)=}")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(conditioning_frames, list) or not isinstance(conditioning_frames[0], list):
+                raise TypeError(
+                    f"For multiple controlnets: `image` must be type list of lists but got {type(conditioning_frames)=}"
+                )
+            if len(conditioning_frames[0]) != num_frames:
+                raise ValueError(
+                    f"Expected length of image sublist as {num_frames} but got {len(conditioning_frames)=}"
+                )
+            if any(len(img) != len(conditioning_frames[0]) for img in conditioning_frames):
+                raise ValueError("All conditioning frame batches for multicontrolnet must be same size")
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, timesteps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        video: Optional[torch.Tensor] = None,
+        height: int = 64,
+        width: int = 64,
+        num_channels_latents: int = 4,
+        batch_size: int = 1,
+        timestep: Optional[int] = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        decode_chunk_size: int = 16,
+        add_noise: bool = False,
+    ) -> torch.Tensor:
+        num_frames = video.shape[1] if latents is None else latents.shape[2]
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                video = video.float()
+                self.vae.to(dtype=torch.float32)
+
+            if isinstance(generator, list):
+                if len(generator) != batch_size:
+                    raise ValueError(
+                        f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                        f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                    )
+
+                init_latents = [
+                    self.encode_video(video[i], generator[i], decode_chunk_size).unsqueeze(0)
+                    for i in range(batch_size)
+                ]
+            else:
+                init_latents = [self.encode_video(vid, generator, decode_chunk_size).unsqueeze(0) for vid in video]
+
+            init_latents = torch.cat(init_latents, dim=0)
+
+            # restore vae to original dtype
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+
+            init_latents = init_latents.to(dtype)
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+            if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+                # expand init_latents for batch_size
+                error_message = (
+                    f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                    " images (`image`). Please make sure to update your script to pass as many initial images as text prompts"
+                )
+                raise ValueError(error_message)
+            elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+                )
+            else:
+                init_latents = torch.cat([init_latents], dim=0)
+
+            noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype)
+            latents = self.scheduler.add_noise(init_latents, noise, timestep).permute(0, 2, 1, 3, 4)
+        else:
+            if shape != latents.shape:
+                # [B, C, F, H, W]
+                raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}")
+
+            latents = latents.to(device, dtype=dtype)
+
+            if add_noise:
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                latents = self.scheduler.add_noise(latents, noise, timestep)
+
+        return latents
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_controlnet.AnimateDiffControlNetPipeline.prepare_video
+    def prepare_conditioning_frames(
+        self,
+        video,
+        width,
+        height,
+        batch_size,
+        num_videos_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        video = self.control_video_processor.preprocess_video(video, height=height, width=width).to(
+            dtype=torch.float32
+        )
+        video = video.permute(0, 2, 1, 3, 4).flatten(0, 1)
+        video_batch_size = video.shape[0]
+
+        if video_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_videos_per_prompt
+
+        video = video.repeat_interleave(repeat_by, dim=0)
+        video = video.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            video = torch.cat([video] * 2)
+
+        return video
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        video: List[List[PipelineImageInput]] = None,
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        enforce_inference_steps: bool = False,
+        timesteps: Optional[List[int]] = None,
+        sigmas: Optional[List[float]] = None,
+        guidance_scale: float = 7.5,
+        strength: float = 0.8,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        conditioning_frames: Optional[List[PipelineImageInput]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_chunk_size: int = 16,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            video (`List[PipelineImageInput]`):
+                The input video to condition the generation on. Must be a list of images/frames of the video.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            strength (`float`, *optional*, defaults to 0.8):
+                Higher strength leads to more differences between original video and generated video.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            conditioning_frames (`List[PipelineImageInput]`, *optional*):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If multiple
+                ControlNets are specified, images must be passed as a list such that each element of the list can be
+                correctly batched for input to a single ControlNet.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            decode_chunk_size (`int`, defaults to `16`):
+                The number of frames to decode at a time when calling `decode_latents` method.
+
+        Examples:
+
+        Returns:
+            [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            strength=strength,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            video=video,
+            conditioning_frames=conditioning_frames,
+            latents=latents,
+            ip_adapter_image=ip_adapter_image,
+            ip_adapter_image_embeds=ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            controlnet_conditioning_scale=controlnet_conditioning_scale,
+            control_guidance_start=control_guidance_start,
+            control_guidance_end=control_guidance_end,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, (str, dict)):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        dtype = self.dtype
+
+        # 3. Prepare timesteps
+        if not enforce_inference_steps:
+            timesteps, num_inference_steps = retrieve_timesteps(
+                self.scheduler, num_inference_steps, device, timesteps, sigmas
+            )
+            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
+            latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
+        else:
+            denoising_inference_steps = int(num_inference_steps / strength)
+            timesteps, denoising_inference_steps = retrieve_timesteps(
+                self.scheduler, denoising_inference_steps, device, timesteps, sigmas
+            )
+            timesteps = timesteps[-num_inference_steps:]
+            latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
+
+        # 4. Prepare latent variables
+        if latents is None:
+            video = self.video_processor.preprocess_video(video, height=height, width=width)
+            # Move the number of frames before the number of channels.
+            video = video.permute(0, 2, 1, 3, 4)
+            video = video.to(device=device, dtype=dtype)
+
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            video=video,
+            height=height,
+            width=width,
+            num_channels_latents=num_channels_latents,
+            batch_size=batch_size * num_videos_per_prompt,
+            timestep=latent_timestep,
+            dtype=dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+            decode_chunk_size=decode_chunk_size,
+            add_noise=enforce_inference_steps,
+        )
+
+        # 5. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        num_frames = latents.shape[2]
+        if self.free_noise_enabled:
+            prompt_embeds, negative_prompt_embeds = self._encode_prompt_free_noise(
+                prompt=prompt,
+                num_frames=num_frames,
+                device=device,
+                num_videos_per_prompt=num_videos_per_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                negative_prompt=negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=self.clip_skip,
+            )
+        else:
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt,
+                device,
+                num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=self.clip_skip,
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            if self.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+            prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
+
+        # 6. Prepare IP-Adapter embeddings
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 7. Prepare ControlNet conditions
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        if isinstance(controlnet, ControlNetModel):
+            conditioning_frames = self.prepare_conditioning_frames(
+                video=conditioning_frames,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_videos_per_prompt * num_frames,
+                num_videos_per_prompt=num_videos_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif isinstance(controlnet, MultiControlNetModel):
+            cond_prepared_videos = []
+            for frame_ in conditioning_frames:
+                prepared_video = self.prepare_conditioning_frames(
+                    video=frame_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_videos_per_prompt * num_frames,
+                    num_videos_per_prompt=num_videos_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+                cond_prepared_videos.append(prepared_video)
+            conditioning_frames = cond_prepared_videos
+        else:
+            assert False
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+        for free_init_iter in range(num_free_init_iters):
+            if self.free_init_enabled:
+                latents, timesteps = self._apply_free_init(
+                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                )
+                num_inference_steps = len(timesteps)
+                # make sure to readjust timesteps based on strength
+                timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
+
+            self._num_timesteps = len(timesteps)
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+            # 10. Denoising loop
+            with self.progress_bar(total=self._num_timesteps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    if self.interrupt:
+                        continue
+
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    if guess_mode and self.do_classifier_free_guidance:
+                        # Infer ControlNet only for the conditional batch.
+                        control_model_input = latents
+                        control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                        controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    else:
+                        control_model_input = latent_model_input
+                        controlnet_prompt_embeds = prompt_embeds
+
+                    if isinstance(controlnet_keep[i], list):
+                        cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                    else:
+                        controlnet_cond_scale = controlnet_conditioning_scale
+                        if isinstance(controlnet_cond_scale, list):
+                            controlnet_cond_scale = controlnet_cond_scale[0]
+                        cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                    control_model_input = torch.transpose(control_model_input, 1, 2)
+                    control_model_input = control_model_input.reshape(
+                        (-1, control_model_input.shape[2], control_model_input.shape[3], control_model_input.shape[4])
+                    )
+
+                    down_block_res_samples, mid_block_res_sample = self.controlnet(
+                        control_model_input,
+                        t,
+                        encoder_hidden_states=controlnet_prompt_embeds,
+                        controlnet_cond=conditioning_frames,
+                        conditioning_scale=cond_scale,
+                        guess_mode=guess_mode,
+                        return_dict=False,
+                    )
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=self.cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                        down_block_additional_residuals=down_block_res_samples,
+                        mid_block_additional_residual=mid_block_res_sample,
+                    ).sample
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+
+        # 11. Post-processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents, decode_chunk_size)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 12. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/animatediff/pipeline_output.py b/diffusers/src/diffusers/pipelines/animatediff/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..2417223cf95e682aa9b86f4c6013c5285c9afce4
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/animatediff/pipeline_output.py
@@ -0,0 +1,24 @@
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+import torch
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class AnimateDiffPipelineOutput(BaseOutput):
+    r"""
+     Output class for AnimateDiff pipelines.
+
+    Args:
+         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+             List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+             denoised
+     PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+    `(batch_size, num_frames, channels, height, width)`
+    """
+
+    frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
diff --git a/diffusers/src/diffusers/pipelines/audioldm/__init__.py b/diffusers/src/diffusers/pipelines/audioldm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a002b4aa72e0a180c7042c406667d37122d6e4cc
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/audioldm/__init__.py
@@ -0,0 +1,51 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+    is_transformers_available,
+    is_transformers_version,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils.dummy_torch_and_transformers_objects import (
+        AudioLDMPipeline,
+    )
+
+    _dummy_objects.update({"AudioLDMPipeline": AudioLDMPipeline})
+else:
+    _import_structure["pipeline_audioldm"] = ["AudioLDMPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import (
+            AudioLDMPipeline,
+        )
+
+    else:
+        from .pipeline_audioldm import AudioLDMPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/diffusers/src/diffusers/pipelines/audioldm/pipeline_audioldm.py
new file mode 100644
index 0000000000000000000000000000000000000000..105ca40f773fff966f9d95b795e1e6c69a107e00
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/audioldm/pipeline_audioldm.py
@@ -0,0 +1,546 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan
+
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import AudioLDMPipeline
+        >>> import torch
+        >>> import scipy
+
+        >>> repo_id = "cvssp/audioldm-s-full-v2"
+        >>> pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+        >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0]
+
+        >>> # save the audio sample as a .wav file
+        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
+        ```
+"""
+
+
+class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-audio generation using AudioLDM.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.ClapTextModelWithProjection`]):
+            Frozen text-encoder (`ClapTextModelWithProjection`, specifically the
+            [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant.
+        tokenizer ([`PreTrainedTokenizer`]):
+            A [`~transformers.RobertaTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded audio latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        vocoder ([`~transformers.SpeechT5HifiGan`]):
+            Vocoder of class `SpeechT5HifiGan`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: ClapTextModelWithProjection,
+        tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast],
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        vocoder: SpeechT5HifiGan,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            vocoder=vocoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_waveforms_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device (`torch.device`):
+                torch device
+            num_waveforms_per_prompt (`int`):
+                number of waveforms that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the audio generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            attention_mask = text_inputs.attention_mask
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLAP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask.to(device),
+            )
+            prompt_embeds = prompt_embeds.text_embeds
+            # additional L_2 normalization over each hidden-state
+            prompt_embeds = F.normalize(prompt_embeds, dim=-1)
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        (
+            bs_embed,
+            seq_len,
+        ) = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_waveforms_per_prompt)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_waveforms_per_prompt, seq_len)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            uncond_input_ids = uncond_input.input_ids.to(device)
+            attention_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input_ids,
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds.text_embeds
+            # additional L_2 normalization over each hidden-state
+            negative_prompt_embeds = F.normalize(negative_prompt_embeds, dim=-1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_waveforms_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_waveforms_per_prompt, seq_len)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        mel_spectrogram = self.vae.decode(latents).sample
+        return mel_spectrogram
+
+    def mel_spectrogram_to_waveform(self, mel_spectrogram):
+        if mel_spectrogram.dim() == 4:
+            mel_spectrogram = mel_spectrogram.squeeze(1)
+
+        waveform = self.vocoder(mel_spectrogram)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        waveform = waveform.cpu().float()
+        return waveform
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        audio_length_in_s,
+        vocoder_upsample_factor,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor
+        if audio_length_in_s < min_audio_length_in_s:
+            raise ValueError(
+                f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but "
+                f"is {audio_length_in_s}."
+            )
+
+        if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0:
+            raise ValueError(
+                f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the "
+                f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of "
+                f"{self.vae_scale_factor}."
+            )
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents with width->self.vocoder.config.model_in_dim
+    def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(self.vocoder.config.model_in_dim) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        audio_length_in_s: Optional[float] = None,
+        num_inference_steps: int = 10,
+        guidance_scale: float = 2.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_waveforms_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        output_type: Optional[str] = "np",
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
+            audio_length_in_s (`int`, *optional*, defaults to 5.12):
+                The length of the generated audio sample in seconds.
+            num_inference_steps (`int`, *optional*, defaults to 10):
+                The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 2.5):
+                A higher guidance scale value encourages the model to generate audio that is closely linked to the text
+                `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
+                The number of waveforms to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated image. Choose between `"np"` to return a NumPy `np.ndarray` or
+                `"pt"` to return a PyTorch `torch.Tensor` object.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.AudioPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated audio.
+        """
+        # 0. Convert audio input length from seconds to spectrogram height
+        vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate
+
+        if audio_length_in_s is None:
+            audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor
+
+        height = int(audio_length_in_s / vocoder_upsample_factor)
+
+        original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate)
+        if height % self.vae_scale_factor != 0:
+            height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor
+            logger.info(
+                f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} "
+                f"so that it can be handled by the model. It will be cut to {audio_length_in_s} after the "
+                f"denoising process."
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            audio_length_in_s,
+            vocoder_upsample_factor,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_waveforms_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_waveforms_per_prompt,
+            num_channels_latents,
+            height,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=None,
+                    class_labels=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 8. Post-processing
+        mel_spectrogram = self.decode_latents(latents)
+
+        audio = self.mel_spectrogram_to_waveform(mel_spectrogram)
+
+        audio = audio[:, :original_waveform_length]
+
+        if output_type == "np":
+            audio = audio.numpy()
+
+        if not return_dict:
+            return (audio,)
+
+        return AudioPipelineOutput(audios=audio)
diff --git a/diffusers/src/diffusers/pipelines/audioldm2/__init__.py b/diffusers/src/diffusers/pipelines/audioldm2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..23cd0e44f89217b8391d0ce236070271db9aaf83
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/audioldm2/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+    is_transformers_version,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["modeling_audioldm2"] = ["AudioLDM2ProjectionModel", "AudioLDM2UNet2DConditionModel"]
+    _import_structure["pipeline_audioldm2"] = ["AudioLDM2Pipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+
+    else:
+        from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
+        from .pipeline_audioldm2 import AudioLDM2Pipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/diffusers/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..2af3078f74121eaca6aee7e0512c7d361044ece3
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
@@ -0,0 +1,1530 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import UNet2DConditionLoadersMixin
+from ...models.activations import get_activation
+from ...models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from ...models.embeddings import (
+    TimestepEmbedding,
+    Timesteps,
+)
+from ...models.modeling_utils import ModelMixin
+from ...models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
+from ...models.transformers.transformer_2d import Transformer2DModel
+from ...models.unets.unet_2d_blocks import DownBlock2D, UpBlock2D
+from ...models.unets.unet_2d_condition import UNet2DConditionOutput
+from ...utils import BaseOutput, is_torch_version, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def add_special_tokens(hidden_states, attention_mask, sos_token, eos_token):
+    batch_size = hidden_states.shape[0]
+
+    if attention_mask is not None:
+        # Add two more steps to attn mask
+        new_attn_mask_step = attention_mask.new_ones((batch_size, 1))
+        attention_mask = torch.concat([new_attn_mask_step, attention_mask, new_attn_mask_step], dim=-1)
+
+    # Add the SOS / EOS tokens at the start / end of the sequence respectively
+    sos_token = sos_token.expand(batch_size, 1, -1)
+    eos_token = eos_token.expand(batch_size, 1, -1)
+    hidden_states = torch.concat([sos_token, hidden_states, eos_token], dim=1)
+    return hidden_states, attention_mask
+
+
+@dataclass
+class AudioLDM2ProjectionModelOutput(BaseOutput):
+    """
+    Args:
+    Class for AudioLDM2 projection layer's outputs.
+        hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states obtained by linearly projecting the hidden-states for each of the text
+             encoders and subsequently concatenating them together.
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices, formed by concatenating the attention masks
+             for the two text encoders together. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+    """
+
+    hidden_states: torch.Tensor
+    attention_mask: Optional[torch.LongTensor] = None
+
+
+class AudioLDM2ProjectionModel(ModelMixin, ConfigMixin):
+    """
+    A simple linear projection model to map two text embeddings to a shared latent space. It also inserts learned
+    embedding vectors at the start and end of each text embedding sequence respectively. Each variable appended with
+    `_1` refers to that corresponding to the second text encoder. Otherwise, it is from the first.
+
+    Args:
+        text_encoder_dim (`int`):
+            Dimensionality of the text embeddings from the first text encoder (CLAP).
+        text_encoder_1_dim (`int`):
+            Dimensionality of the text embeddings from the second text encoder (T5 or VITS).
+        langauge_model_dim (`int`):
+            Dimensionality of the text embeddings from the language model (GPT2).
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        text_encoder_dim,
+        text_encoder_1_dim,
+        langauge_model_dim,
+        use_learned_position_embedding=None,
+        max_seq_length=None,
+    ):
+        super().__init__()
+        # additional projection layers for each text encoder
+        self.projection = nn.Linear(text_encoder_dim, langauge_model_dim)
+        self.projection_1 = nn.Linear(text_encoder_1_dim, langauge_model_dim)
+
+        # learnable SOS / EOS token embeddings for each text encoder
+        self.sos_embed = nn.Parameter(torch.ones(langauge_model_dim))
+        self.eos_embed = nn.Parameter(torch.ones(langauge_model_dim))
+
+        self.sos_embed_1 = nn.Parameter(torch.ones(langauge_model_dim))
+        self.eos_embed_1 = nn.Parameter(torch.ones(langauge_model_dim))
+
+        self.use_learned_position_embedding = use_learned_position_embedding
+
+        # learable positional embedding for vits encoder
+        if self.use_learned_position_embedding is not None:
+            self.learnable_positional_embedding = torch.nn.Parameter(
+                torch.zeros((1, text_encoder_1_dim, max_seq_length))
+            )
+
+    def forward(
+        self,
+        hidden_states: Optional[torch.Tensor] = None,
+        hidden_states_1: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        attention_mask_1: Optional[torch.LongTensor] = None,
+    ):
+        hidden_states = self.projection(hidden_states)
+        hidden_states, attention_mask = add_special_tokens(
+            hidden_states, attention_mask, sos_token=self.sos_embed, eos_token=self.eos_embed
+        )
+
+        # Add positional embedding for Vits hidden state
+        if self.use_learned_position_embedding is not None:
+            hidden_states_1 = (hidden_states_1.permute(0, 2, 1) + self.learnable_positional_embedding).permute(0, 2, 1)
+
+        hidden_states_1 = self.projection_1(hidden_states_1)
+        hidden_states_1, attention_mask_1 = add_special_tokens(
+            hidden_states_1, attention_mask_1, sos_token=self.sos_embed_1, eos_token=self.eos_embed_1
+        )
+
+        # concatenate clap and t5 text encoding
+        hidden_states = torch.cat([hidden_states, hidden_states_1], dim=1)
+
+        # concatenate attention masks
+        if attention_mask is None and attention_mask_1 is not None:
+            attention_mask = attention_mask_1.new_ones((hidden_states[:2]))
+        elif attention_mask is not None and attention_mask_1 is None:
+            attention_mask_1 = attention_mask.new_ones((hidden_states_1[:2]))
+
+        if attention_mask is not None and attention_mask_1 is not None:
+            attention_mask = torch.cat([attention_mask, attention_mask_1], dim=-1)
+        else:
+            attention_mask = None
+
+        return AudioLDM2ProjectionModelOutput(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+        )
+
+
+class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output. Compared to the vanilla [`UNet2DConditionModel`], this variant optionally includes an additional
+    self-attention layer in each Transformer block, as well as multiple cross-attention layers. It also allows for up
+    to two cross-attention embeddings, `encoder_hidden_states` and `encoder_hidden_states_1`.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            Block type for middle of UNet, it can only be `UNetMidBlock2DCrossAttn` for AudioLDM2.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
+            The tuple of upsample blocks to use.
+        only_cross_attention (`bool` or `Tuple[bool]`, *optional*, default to `False`):
+            Whether to include self-attention in the basic transformer blocks, see
+            [`~models.attention.BasicTransformerBlock`].
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, normalization and activation layers is skipped in post-processing.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        num_attention_heads (`int`, *optional*):
+            The number of attention heads. If not defined, defaults to `attention_head_dim`
+        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        num_class_embeds (`int`, *optional*, defaults to `None`):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        time_embedding_type (`str`, *optional*, defaults to `positional`):
+            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
+        time_embedding_dim (`int`, *optional*, defaults to `None`):
+            An optional override for the dimension of the projected time embedding.
+        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
+            Optional activation function to use only once on the time embeddings before they are passed to the rest of
+            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
+        timestep_post_act (`str`, *optional*, defaults to `None`):
+            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
+        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
+            The dimension of `cond_proj` layer in the timestep embedding.
+        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
+        conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
+        projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
+            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
+        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
+            embeddings with the class embeddings.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: Union[int, Tuple[int]] = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: Union[int, Tuple[int]] = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        time_embedding_type: str = "positional",
+        time_embedding_dim: Optional[int] = None,
+        time_embedding_act_fn: Optional[str] = None,
+        timestep_post_act: Optional[str] = None,
+        time_cond_proj_dim: Optional[int] = None,
+        conv_in_kernel: int = 3,
+        conv_out_kernel: int = 3,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        class_embeddings_concat: bool = False,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+
+        if num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+
+        # input
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        # time
+        if time_embedding_type == "positional":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
+
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        else:
+            raise ValueError(f"{time_embedding_type} does not exist. Please make sure to use `positional`.")
+
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+            post_act_fn=timestep_post_act,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif class_embed_type == "simple_projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+
+        if time_embedding_act_fn is None:
+            self.time_embed_act = None
+        else:
+            self.time_embed_act = get_activation(time_embedding_act_fn)
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+
+        if isinstance(layers_per_block, int):
+            layers_per_block = [layers_per_block] * len(down_block_types)
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+
+        if class_embeddings_concat:
+            # The time embeddings are concatenated with the class embeddings. The dimension of the
+            # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
+            # regular time embeddings
+            blocks_time_embed_dim = time_embed_dim * 2
+        else:
+            blocks_time_embed_dim = time_embed_dim
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block[i],
+                transformer_layers_per_block=transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim[i],
+                num_attention_heads=num_attention_heads[i],
+                downsample_padding=downsample_padding,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        if mid_block_type == "UNetMidBlock2DCrossAttn":
+            self.mid_block = UNetMidBlock2DCrossAttn(
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim[-1],
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+            )
+        else:
+            raise ValueError(
+                f"unknown mid_block_type : {mid_block_type}. Should be `UNetMidBlock2DCrossAttn` for AudioLDM2."
+            )
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        reversed_layers_per_block = list(reversed(layers_per_block))
+        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
+        only_cross_attention = list(reversed(only_cross_attention))
+
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=reversed_layers_per_block[i] + 1,
+                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=reversed_cross_attention_dim[i],
+                num_attention_heads=reversed_num_attention_heads[i],
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+            )
+
+            self.conv_act = get_activation(act_fn)
+
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+
+        num_sliceable_layers = len(sliceable_head_dims)
+
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel._set_gradient_checkpointing
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        encoder_hidden_states_1: Optional[torch.Tensor] = None,
+        encoder_attention_mask_1: Optional[torch.Tensor] = None,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        The [`AudioLDM2UNet2DConditionModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            encoder_attention_mask (`torch.Tensor`):
+                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+            encoder_hidden_states_1 (`torch.Tensor`, *optional*):
+                A second set of encoder hidden states with shape `(batch, sequence_length_2, feature_dim_2)`. Can be
+                used to condition the model on a different set of embeddings to `encoder_hidden_states`.
+            encoder_attention_mask_1 (`torch.Tensor`, *optional*):
+                A cross-attention mask of shape `(batch, sequence_length_2)` is applied to `encoder_hidden_states_1`.
+                If `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
+
+        Returns:
+            [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        if encoder_attention_mask_1 is not None:
+            encoder_attention_mask_1 = (1 - encoder_attention_mask_1.to(sample.dtype)) * -10000.0
+            encoder_attention_mask_1 = encoder_attention_mask_1.unsqueeze(1)
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # there might be better ways to encapsulate this.
+                class_labels = class_labels.to(dtype=sample.dtype)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+
+            if self.config.class_embeddings_concat:
+                emb = torch.cat([emb, class_emb], dim=-1)
+            else:
+                emb = emb + class_emb
+
+        emb = emb + aug_emb if aug_emb is not None else emb
+
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    encoder_hidden_states_1=encoder_hidden_states_1,
+                    encoder_attention_mask_1=encoder_attention_mask_1,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            sample = self.mid_block(
+                sample,
+                emb,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                cross_attention_kwargs=cross_attention_kwargs,
+                encoder_attention_mask=encoder_attention_mask,
+                encoder_hidden_states_1=encoder_hidden_states_1,
+                encoder_attention_mask_1=encoder_attention_mask_1,
+            )
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    encoder_hidden_states_1=encoder_hidden_states_1,
+                    encoder_attention_mask_1=encoder_attention_mask_1,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+                )
+
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet2DConditionOutput(sample=sample)
+
+
+def get_down_block(
+    down_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    temb_channels,
+    add_downsample,
+    resnet_eps,
+    resnet_act_fn,
+    transformer_layers_per_block=1,
+    num_attention_heads=None,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    downsample_padding=None,
+    use_linear_projection=False,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+):
+    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+    if down_block_type == "DownBlock2D":
+        return DownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "CrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
+        return CrossAttnDownBlock2D(
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(
+    up_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    prev_output_channel,
+    temb_channels,
+    add_upsample,
+    resnet_eps,
+    resnet_act_fn,
+    transformer_layers_per_block=1,
+    num_attention_heads=None,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    use_linear_projection=False,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+):
+    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    if up_block_type == "UpBlock2D":
+        return UpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "CrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
+        return CrossAttnUpBlock2D(
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+class CrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_downsample=True,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,)
+        if isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) > 4:
+            raise ValueError(
+                "Only up to 4 cross-attention layers are supported. Ensure that the length of cross-attention "
+                f"dims is less than or equal to 4. Got cross-attention dims {cross_attention_dim} of length {len(cross_attention_dim)}"
+            )
+        self.cross_attention_dim = cross_attention_dim
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            for j in range(len(cross_attention_dim)):
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block,
+                        cross_attention_dim=cross_attention_dim[j],
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        double_self_attention=True if cross_attention_dim[j] is None else False,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states_1: Optional[torch.Tensor] = None,
+        encoder_attention_mask_1: Optional[torch.Tensor] = None,
+    ):
+        output_states = ()
+        num_layers = len(self.resnets)
+        num_attention_per_layer = len(self.attentions) // num_layers
+
+        encoder_hidden_states_1 = (
+            encoder_hidden_states_1 if encoder_hidden_states_1 is not None else encoder_hidden_states
+        )
+        encoder_attention_mask_1 = (
+            encoder_attention_mask_1 if encoder_hidden_states_1 is not None else encoder_attention_mask
+        )
+
+        for i in range(num_layers):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.resnets[i]),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                for idx, cross_attention_dim in enumerate(self.cross_attention_dim):
+                    if cross_attention_dim is not None and idx <= 1:
+                        forward_encoder_hidden_states = encoder_hidden_states
+                        forward_encoder_attention_mask = encoder_attention_mask
+                    elif cross_attention_dim is not None and idx > 1:
+                        forward_encoder_hidden_states = encoder_hidden_states_1
+                        forward_encoder_attention_mask = encoder_attention_mask_1
+                    else:
+                        forward_encoder_hidden_states = None
+                        forward_encoder_attention_mask = None
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(self.attentions[i * num_attention_per_layer + idx], return_dict=False),
+                        hidden_states,
+                        forward_encoder_hidden_states,
+                        None,  # timestep
+                        None,  # class_labels
+                        cross_attention_kwargs,
+                        attention_mask,
+                        forward_encoder_attention_mask,
+                        **ckpt_kwargs,
+                    )[0]
+            else:
+                hidden_states = self.resnets[i](hidden_states, temb)
+                for idx, cross_attention_dim in enumerate(self.cross_attention_dim):
+                    if cross_attention_dim is not None and idx <= 1:
+                        forward_encoder_hidden_states = encoder_hidden_states
+                        forward_encoder_attention_mask = encoder_attention_mask
+                    elif cross_attention_dim is not None and idx > 1:
+                        forward_encoder_hidden_states = encoder_hidden_states_1
+                        forward_encoder_attention_mask = encoder_attention_mask_1
+                    else:
+                        forward_encoder_hidden_states = None
+                        forward_encoder_attention_mask = None
+                    hidden_states = self.attentions[i * num_attention_per_layer + idx](
+                        hidden_states,
+                        attention_mask=attention_mask,
+                        encoder_hidden_states=forward_encoder_hidden_states,
+                        encoder_attention_mask=forward_encoder_attention_mask,
+                        return_dict=False,
+                    )[0]
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class UNetMidBlock2DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads=1,
+        output_scale_factor=1.0,
+        cross_attention_dim=1280,
+        use_linear_projection=False,
+        upcast_attention=False,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,)
+        if isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) > 4:
+            raise ValueError(
+                "Only up to 4 cross-attention layers are supported. Ensure that the length of cross-attention "
+                f"dims is less than or equal to 4. Got cross-attention dims {cross_attention_dim} of length {len(cross_attention_dim)}"
+            )
+        self.cross_attention_dim = cross_attention_dim
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for i in range(num_layers):
+            for j in range(len(cross_attention_dim)):
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=transformer_layers_per_block,
+                        cross_attention_dim=cross_attention_dim[j],
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        double_self_attention=True if cross_attention_dim[j] is None else False,
+                    )
+                )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states_1: Optional[torch.Tensor] = None,
+        encoder_attention_mask_1: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.resnets[0](hidden_states, temb)
+        num_attention_per_layer = len(self.attentions) // (len(self.resnets) - 1)
+
+        encoder_hidden_states_1 = (
+            encoder_hidden_states_1 if encoder_hidden_states_1 is not None else encoder_hidden_states
+        )
+        encoder_attention_mask_1 = (
+            encoder_attention_mask_1 if encoder_hidden_states_1 is not None else encoder_attention_mask
+        )
+
+        for i in range(len(self.resnets[1:])):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                for idx, cross_attention_dim in enumerate(self.cross_attention_dim):
+                    if cross_attention_dim is not None and idx <= 1:
+                        forward_encoder_hidden_states = encoder_hidden_states
+                        forward_encoder_attention_mask = encoder_attention_mask
+                    elif cross_attention_dim is not None and idx > 1:
+                        forward_encoder_hidden_states = encoder_hidden_states_1
+                        forward_encoder_attention_mask = encoder_attention_mask_1
+                    else:
+                        forward_encoder_hidden_states = None
+                        forward_encoder_attention_mask = None
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(self.attentions[i * num_attention_per_layer + idx], return_dict=False),
+                        hidden_states,
+                        forward_encoder_hidden_states,
+                        None,  # timestep
+                        None,  # class_labels
+                        cross_attention_kwargs,
+                        attention_mask,
+                        forward_encoder_attention_mask,
+                        **ckpt_kwargs,
+                    )[0]
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.resnets[i + 1]),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                for idx, cross_attention_dim in enumerate(self.cross_attention_dim):
+                    if cross_attention_dim is not None and idx <= 1:
+                        forward_encoder_hidden_states = encoder_hidden_states
+                        forward_encoder_attention_mask = encoder_attention_mask
+                    elif cross_attention_dim is not None and idx > 1:
+                        forward_encoder_hidden_states = encoder_hidden_states_1
+                        forward_encoder_attention_mask = encoder_attention_mask_1
+                    else:
+                        forward_encoder_hidden_states = None
+                        forward_encoder_attention_mask = None
+                    hidden_states = self.attentions[i * num_attention_per_layer + idx](
+                        hidden_states,
+                        attention_mask=attention_mask,
+                        encoder_hidden_states=forward_encoder_hidden_states,
+                        encoder_attention_mask=forward_encoder_attention_mask,
+                        return_dict=False,
+                    )[0]
+
+                hidden_states = self.resnets[i + 1](hidden_states, temb)
+
+        return hidden_states
+
+
+class CrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        add_upsample=True,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,)
+        if isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) > 4:
+            raise ValueError(
+                "Only up to 4 cross-attention layers are supported. Ensure that the length of cross-attention "
+                f"dims is less than or equal to 4. Got cross-attention dims {cross_attention_dim} of length {len(cross_attention_dim)}"
+            )
+        self.cross_attention_dim = cross_attention_dim
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            for j in range(len(cross_attention_dim)):
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block,
+                        cross_attention_dim=cross_attention_dim[j],
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        double_self_attention=True if cross_attention_dim[j] is None else False,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states_1: Optional[torch.Tensor] = None,
+        encoder_attention_mask_1: Optional[torch.Tensor] = None,
+    ):
+        num_layers = len(self.resnets)
+        num_attention_per_layer = len(self.attentions) // num_layers
+
+        encoder_hidden_states_1 = (
+            encoder_hidden_states_1 if encoder_hidden_states_1 is not None else encoder_hidden_states
+        )
+        encoder_attention_mask_1 = (
+            encoder_attention_mask_1 if encoder_hidden_states_1 is not None else encoder_attention_mask
+        )
+
+        for i in range(num_layers):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.resnets[i]),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                for idx, cross_attention_dim in enumerate(self.cross_attention_dim):
+                    if cross_attention_dim is not None and idx <= 1:
+                        forward_encoder_hidden_states = encoder_hidden_states
+                        forward_encoder_attention_mask = encoder_attention_mask
+                    elif cross_attention_dim is not None and idx > 1:
+                        forward_encoder_hidden_states = encoder_hidden_states_1
+                        forward_encoder_attention_mask = encoder_attention_mask_1
+                    else:
+                        forward_encoder_hidden_states = None
+                        forward_encoder_attention_mask = None
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(self.attentions[i * num_attention_per_layer + idx], return_dict=False),
+                        hidden_states,
+                        forward_encoder_hidden_states,
+                        None,  # timestep
+                        None,  # class_labels
+                        cross_attention_kwargs,
+                        attention_mask,
+                        forward_encoder_attention_mask,
+                        **ckpt_kwargs,
+                    )[0]
+            else:
+                hidden_states = self.resnets[i](hidden_states, temb)
+                for idx, cross_attention_dim in enumerate(self.cross_attention_dim):
+                    if cross_attention_dim is not None and idx <= 1:
+                        forward_encoder_hidden_states = encoder_hidden_states
+                        forward_encoder_attention_mask = encoder_attention_mask
+                    elif cross_attention_dim is not None and idx > 1:
+                        forward_encoder_hidden_states = encoder_hidden_states_1
+                        forward_encoder_attention_mask = encoder_attention_mask_1
+                    else:
+                        forward_encoder_hidden_states = None
+                        forward_encoder_attention_mask = None
+                    hidden_states = self.attentions[i * num_attention_per_layer + idx](
+                        hidden_states,
+                        attention_mask=attention_mask,
+                        encoder_hidden_states=forward_encoder_hidden_states,
+                        encoder_attention_mask=forward_encoder_attention_mask,
+                        return_dict=False,
+                    )[0]
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+
+        return hidden_states
diff --git a/diffusers/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/diffusers/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b45771d7de7451b6560627deef4a7b254671a45e
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -0,0 +1,1065 @@
+# Copyright 2024 CVSSP, ByteDance and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import (
+    ClapFeatureExtractor,
+    ClapModel,
+    GPT2Model,
+    RobertaTokenizer,
+    RobertaTokenizerFast,
+    SpeechT5HifiGan,
+    T5EncoderModel,
+    T5Tokenizer,
+    T5TokenizerFast,
+    VitsModel,
+    VitsTokenizer,
+)
+
+from ...models import AutoencoderKL
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    is_accelerate_available,
+    is_accelerate_version,
+    is_librosa_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
+
+
+if is_librosa_available():
+    import librosa
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import scipy
+        >>> import torch
+        >>> from diffusers import AudioLDM2Pipeline
+
+        >>> repo_id = "cvssp/audioldm2"
+        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> # define the prompts
+        >>> prompt = "The sound of a hammer hitting a wooden surface."
+        >>> negative_prompt = "Low quality."
+
+        >>> # set the seed for generator
+        >>> generator = torch.Generator("cuda").manual_seed(0)
+
+        >>> # run the generation
+        >>> audio = pipe(
+        ...     prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     num_inference_steps=200,
+        ...     audio_length_in_s=10.0,
+        ...     num_waveforms_per_prompt=3,
+        ...     generator=generator,
+        ... ).audios
+
+        >>> # save the best audio sample (index 0) as a .wav file
+        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
+        ```
+        ```
+        #Using AudioLDM2 for Text To Speech
+        >>> import scipy
+        >>> import torch
+        >>> from diffusers import AudioLDM2Pipeline
+
+        >>> repo_id = "anhnct/audioldm2_gigaspeech"
+        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> # define the prompts
+        >>> prompt = "A female reporter is speaking"
+        >>> transcript = "wish you have a good day"
+
+        >>> # set the seed for generator
+        >>> generator = torch.Generator("cuda").manual_seed(0)
+
+        >>> # run the generation
+        >>> audio = pipe(
+        ...     prompt,
+        ...     transcription=transcript,
+        ...     num_inference_steps=200,
+        ...     audio_length_in_s=10.0,
+        ...     num_waveforms_per_prompt=2,
+        ...     generator=generator,
+        ...     max_new_tokens=512,          #Must set max_new_tokens equa to 512 for TTS
+        ... ).audios
+
+        >>> # save the best audio sample (index 0) as a .wav file
+        >>> scipy.io.wavfile.write("tts.wav", rate=16000, data=audio[0])
+        ```
+"""
+
+
+def prepare_inputs_for_generation(
+    inputs_embeds,
+    attention_mask=None,
+    past_key_values=None,
+    **kwargs,
+):
+    if past_key_values is not None:
+        # only last token for inputs_embeds if past is defined in kwargs
+        inputs_embeds = inputs_embeds[:, -1:]
+
+    return {
+        "inputs_embeds": inputs_embeds,
+        "attention_mask": attention_mask,
+        "past_key_values": past_key_values,
+        "use_cache": kwargs.get("use_cache"),
+    }
+
+
+class AudioLDM2Pipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-audio generation using AudioLDM2.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.ClapModel`]):
+            First frozen text-encoder. AudioLDM2 uses the joint audio-text embedding model
+            [CLAP](https://huggingface.co/docs/transformers/model_doc/clap#transformers.CLAPTextModelWithProjection),
+            specifically the [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant. The
+            text branch is used to encode the text prompt to a prompt embedding. The full audio-text model is used to
+            rank generated waveforms against the text prompt by computing similarity scores.
+        text_encoder_2 ([`~transformers.T5EncoderModel`, `~transformers.VitsModel`]):
+            Second frozen text-encoder. AudioLDM2 uses the encoder of
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) variant. Second frozen text-encoder use
+            for TTS. AudioLDM2 uses the encoder of
+            [Vits](https://huggingface.co/docs/transformers/model_doc/vits#transformers.VitsModel).
+        projection_model ([`AudioLDM2ProjectionModel`]):
+            A trained model used to linearly project the hidden-states from the first and second text encoder models
+            and insert learned SOS and EOS token embeddings. The projected hidden-states from the two text encoders are
+            concatenated to give the input to the language model. A Learned Position Embedding for the Vits
+            hidden-states
+        language_model ([`~transformers.GPT2Model`]):
+            An auto-regressive language model used to generate a sequence of hidden-states conditioned on the projected
+            outputs from the two text encoders.
+        tokenizer ([`~transformers.RobertaTokenizer`]):
+            Tokenizer to tokenize text for the first frozen text-encoder.
+        tokenizer_2 ([`~transformers.T5Tokenizer`, `~transformers.VitsTokenizer`]):
+            Tokenizer to tokenize text for the second frozen text-encoder.
+        feature_extractor ([`~transformers.ClapFeatureExtractor`]):
+            Feature extractor to pre-process generated audio waveforms to log-mel spectrograms for automatic scoring.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded audio latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        vocoder ([`~transformers.SpeechT5HifiGan`]):
+            Vocoder of class `SpeechT5HifiGan` to convert the mel-spectrogram latents to the final audio waveform.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: ClapModel,
+        text_encoder_2: Union[T5EncoderModel, VitsModel],
+        projection_model: AudioLDM2ProjectionModel,
+        language_model: GPT2Model,
+        tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast],
+        tokenizer_2: Union[T5Tokenizer, T5TokenizerFast, VitsTokenizer],
+        feature_extractor: ClapFeatureExtractor,
+        unet: AudioLDM2UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        vocoder: SpeechT5HifiGan,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            projection_model=projection_model,
+            language_model=language_model,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            feature_extractor=feature_extractor,
+            unet=unet,
+            scheduler=scheduler,
+            vocoder=vocoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+    # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_model_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        """
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu", silence_dtype_warnings=True)
+            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        model_sequence = [
+            self.text_encoder.text_model,
+            self.text_encoder.text_projection,
+            self.text_encoder_2,
+            self.projection_model,
+            self.language_model,
+            self.unet,
+            self.vae,
+            self.vocoder,
+            self.text_encoder,
+        ]
+
+        hook = None
+        for cpu_offloaded_model in model_sequence:
+            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+        # We'll offload the last model manually.
+        self.final_offload_hook = hook
+
+    def generate_language_model(
+        self,
+        inputs_embeds: torch.Tensor = None,
+        max_new_tokens: int = 8,
+        **model_kwargs,
+    ):
+        """
+
+        Generates a sequence of hidden-states from the language model, conditioned on the embedding inputs.
+
+        Parameters:
+            inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                The sequence used as a prompt for the generation.
+            max_new_tokens (`int`):
+                Number of new tokens to generate.
+            model_kwargs (`Dict[str, Any]`, *optional*):
+                Ad hoc parametrization of additional model-specific kwargs that will be forwarded to the `forward`
+                function of the model.
+
+        Return:
+            `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                The sequence of generated hidden-states.
+        """
+        max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
+        model_kwargs = self.language_model._get_initial_cache_position(inputs_embeds, model_kwargs)
+        for _ in range(max_new_tokens):
+            # prepare model inputs
+            model_inputs = prepare_inputs_for_generation(inputs_embeds, **model_kwargs)
+
+            # forward pass to get next hidden states
+            output = self.language_model(**model_inputs, return_dict=True)
+
+            next_hidden_states = output.last_hidden_state
+
+            # Update the model input
+            inputs_embeds = torch.cat([inputs_embeds, next_hidden_states[:, -1:, :]], dim=1)
+
+            # Update generated hidden states, model inputs, and length for next step
+            model_kwargs = self.language_model._update_model_kwargs_for_generation(output, model_kwargs)
+
+        return inputs_embeds[:, -max_new_tokens:, :]
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_waveforms_per_prompt,
+        do_classifier_free_guidance,
+        transcription=None,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        generated_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_generated_prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        negative_attention_mask: Optional[torch.LongTensor] = None,
+        max_new_tokens: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            transcription (`str` or `List[str]`):
+                transcription of text to speech
+            device (`torch.device`):
+                torch device
+            num_waveforms_per_prompt (`int`):
+                number of waveforms that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the audio generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-computed text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.*
+                prompt weighting. If not provided, text embeddings will be computed from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-computed negative text embeddings from the Flan T5 model. Can be used to easily tweak text inputs,
+                *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
+                `negative_prompt` input argument.
+            generated_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs,
+                 *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
+                 argument.
+            negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
+                inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
+                `negative_prompt` input argument.
+            attention_mask (`torch.LongTensor`, *optional*):
+                Pre-computed attention mask to be applied to the `prompt_embeds`. If not provided, attention mask will
+                be computed from `prompt` input argument.
+            negative_attention_mask (`torch.LongTensor`, *optional*):
+                Pre-computed attention mask to be applied to the `negative_prompt_embeds`. If not provided, attention
+                mask will be computed from `negative_prompt` input argument.
+            max_new_tokens (`int`, *optional*, defaults to None):
+                The number of new tokens to generate with the GPT2 language model.
+        Returns:
+            prompt_embeds (`torch.Tensor`):
+                Text embeddings from the Flan T5 model.
+            attention_mask (`torch.LongTensor`):
+                Attention mask to be applied to the `prompt_embeds`.
+            generated_prompt_embeds (`torch.Tensor`):
+                Text embeddings generated from the GPT2 langauge model.
+
+        Example:
+
+        ```python
+        >>> import scipy
+        >>> import torch
+        >>> from diffusers import AudioLDM2Pipeline
+
+        >>> repo_id = "cvssp/audioldm2"
+        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> # Get text embedding vectors
+        >>> prompt_embeds, attention_mask, generated_prompt_embeds = pipe.encode_prompt(
+        ...     prompt="Techno music with a strong, upbeat tempo and high melodic riffs",
+        ...     device="cuda",
+        ...     do_classifier_free_guidance=True,
+        ... )
+
+        >>> # Pass text embeddings to pipeline for text-conditional audio generation
+        >>> audio = pipe(
+        ...     prompt_embeds=prompt_embeds,
+        ...     attention_mask=attention_mask,
+        ...     generated_prompt_embeds=generated_prompt_embeds,
+        ...     num_inference_steps=200,
+        ...     audio_length_in_s=10.0,
+        ... ).audios[0]
+
+        >>> # save generated audio sample
+        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
+        ```"""
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2]
+        is_vits_text_encoder = isinstance(self.text_encoder_2, VitsModel)
+
+        if is_vits_text_encoder:
+            text_encoders = [self.text_encoder, self.text_encoder_2.text_encoder]
+        else:
+            text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        if prompt_embeds is None:
+            prompt_embeds_list = []
+            attention_mask_list = []
+
+            for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+                use_prompt = isinstance(
+                    tokenizer, (RobertaTokenizer, RobertaTokenizerFast, T5Tokenizer, T5TokenizerFast)
+                )
+                text_inputs = tokenizer(
+                    prompt if use_prompt else transcription,
+                    padding="max_length"
+                    if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast, VitsTokenizer))
+                    else True,
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                text_input_ids = text_inputs.input_ids
+                attention_mask = text_inputs.attention_mask
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        f"The following part of your input was truncated because {text_encoder.config.model_type} can "
+                        f"only handle sequences up to {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                text_input_ids = text_input_ids.to(device)
+                attention_mask = attention_mask.to(device)
+
+                if text_encoder.config.model_type == "clap":
+                    prompt_embeds = text_encoder.get_text_features(
+                        text_input_ids,
+                        attention_mask=attention_mask,
+                    )
+                    # append the seq-len dim: (bs, hidden_size) -> (bs, seq_len, hidden_size)
+                    prompt_embeds = prompt_embeds[:, None, :]
+                    # make sure that we attend to this single hidden-state
+                    attention_mask = attention_mask.new_ones((batch_size, 1))
+                elif is_vits_text_encoder:
+                    # Add end_token_id and attention mask in the end of sequence phonemes
+                    for text_input_id, text_attention_mask in zip(text_input_ids, attention_mask):
+                        for idx, phoneme_id in enumerate(text_input_id):
+                            if phoneme_id == 0:
+                                text_input_id[idx] = 182
+                                text_attention_mask[idx] = 1
+                                break
+                    prompt_embeds = text_encoder(
+                        text_input_ids, attention_mask=attention_mask, padding_mask=attention_mask.unsqueeze(-1)
+                    )
+                    prompt_embeds = prompt_embeds[0]
+                else:
+                    prompt_embeds = text_encoder(
+                        text_input_ids,
+                        attention_mask=attention_mask,
+                    )
+                    prompt_embeds = prompt_embeds[0]
+
+                prompt_embeds_list.append(prompt_embeds)
+                attention_mask_list.append(attention_mask)
+
+            projection_output = self.projection_model(
+                hidden_states=prompt_embeds_list[0],
+                hidden_states_1=prompt_embeds_list[1],
+                attention_mask=attention_mask_list[0],
+                attention_mask_1=attention_mask_list[1],
+            )
+            projected_prompt_embeds = projection_output.hidden_states
+            projected_attention_mask = projection_output.attention_mask
+
+            generated_prompt_embeds = self.generate_language_model(
+                projected_prompt_embeds,
+                attention_mask=projected_attention_mask,
+                max_new_tokens=max_new_tokens,
+            )
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        attention_mask = (
+            attention_mask.to(device=device)
+            if attention_mask is not None
+            else torch.ones(prompt_embeds.shape[:2], dtype=torch.long, device=device)
+        )
+        generated_prompt_embeds = generated_prompt_embeds.to(dtype=self.language_model.dtype, device=device)
+
+        bs_embed, seq_len, hidden_size = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_waveforms_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_waveforms_per_prompt, seq_len, hidden_size)
+
+        # duplicate attention mask for each generation per prompt
+        attention_mask = attention_mask.repeat(1, num_waveforms_per_prompt)
+        attention_mask = attention_mask.view(bs_embed * num_waveforms_per_prompt, seq_len)
+
+        bs_embed, seq_len, hidden_size = generated_prompt_embeds.shape
+        # duplicate generated embeddings for each generation per prompt, using mps friendly method
+        generated_prompt_embeds = generated_prompt_embeds.repeat(1, num_waveforms_per_prompt, 1)
+        generated_prompt_embeds = generated_prompt_embeds.view(
+            bs_embed * num_waveforms_per_prompt, seq_len, hidden_size
+        )
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            negative_prompt_embeds_list = []
+            negative_attention_mask_list = []
+            max_length = prompt_embeds.shape[1]
+            for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+                uncond_input = tokenizer(
+                    uncond_tokens,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length
+                    if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast, VitsTokenizer))
+                    else max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                uncond_input_ids = uncond_input.input_ids.to(device)
+                negative_attention_mask = uncond_input.attention_mask.to(device)
+
+                if text_encoder.config.model_type == "clap":
+                    negative_prompt_embeds = text_encoder.get_text_features(
+                        uncond_input_ids,
+                        attention_mask=negative_attention_mask,
+                    )
+                    # append the seq-len dim: (bs, hidden_size) -> (bs, seq_len, hidden_size)
+                    negative_prompt_embeds = negative_prompt_embeds[:, None, :]
+                    # make sure that we attend to this single hidden-state
+                    negative_attention_mask = negative_attention_mask.new_ones((batch_size, 1))
+                elif is_vits_text_encoder:
+                    negative_prompt_embeds = torch.zeros(
+                        batch_size,
+                        tokenizer.model_max_length,
+                        text_encoder.config.hidden_size,
+                    ).to(dtype=self.text_encoder_2.dtype, device=device)
+                    negative_attention_mask = torch.zeros(batch_size, tokenizer.model_max_length).to(
+                        dtype=self.text_encoder_2.dtype, device=device
+                    )
+                else:
+                    negative_prompt_embeds = text_encoder(
+                        uncond_input_ids,
+                        attention_mask=negative_attention_mask,
+                    )
+                    negative_prompt_embeds = negative_prompt_embeds[0]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+                negative_attention_mask_list.append(negative_attention_mask)
+
+            projection_output = self.projection_model(
+                hidden_states=negative_prompt_embeds_list[0],
+                hidden_states_1=negative_prompt_embeds_list[1],
+                attention_mask=negative_attention_mask_list[0],
+                attention_mask_1=negative_attention_mask_list[1],
+            )
+            negative_projected_prompt_embeds = projection_output.hidden_states
+            negative_projected_attention_mask = projection_output.attention_mask
+
+            negative_generated_prompt_embeds = self.generate_language_model(
+                negative_projected_prompt_embeds,
+                attention_mask=negative_projected_attention_mask,
+                max_new_tokens=max_new_tokens,
+            )
+
+        if do_classifier_free_guidance:
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            negative_attention_mask = (
+                negative_attention_mask.to(device=device)
+                if negative_attention_mask is not None
+                else torch.ones(negative_prompt_embeds.shape[:2], dtype=torch.long, device=device)
+            )
+            negative_generated_prompt_embeds = negative_generated_prompt_embeds.to(
+                dtype=self.language_model.dtype, device=device
+            )
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_waveforms_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_waveforms_per_prompt, seq_len, -1)
+
+            # duplicate unconditional attention mask for each generation per prompt
+            negative_attention_mask = negative_attention_mask.repeat(1, num_waveforms_per_prompt)
+            negative_attention_mask = negative_attention_mask.view(batch_size * num_waveforms_per_prompt, seq_len)
+
+            # duplicate unconditional generated embeddings for each generation per prompt
+            seq_len = negative_generated_prompt_embeds.shape[1]
+            negative_generated_prompt_embeds = negative_generated_prompt_embeds.repeat(1, num_waveforms_per_prompt, 1)
+            negative_generated_prompt_embeds = negative_generated_prompt_embeds.view(
+                batch_size * num_waveforms_per_prompt, seq_len, -1
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            attention_mask = torch.cat([negative_attention_mask, attention_mask])
+            generated_prompt_embeds = torch.cat([negative_generated_prompt_embeds, generated_prompt_embeds])
+
+        return prompt_embeds, attention_mask, generated_prompt_embeds
+
+    # Copied from diffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.mel_spectrogram_to_waveform
+    def mel_spectrogram_to_waveform(self, mel_spectrogram):
+        if mel_spectrogram.dim() == 4:
+            mel_spectrogram = mel_spectrogram.squeeze(1)
+
+        waveform = self.vocoder(mel_spectrogram)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        waveform = waveform.cpu().float()
+        return waveform
+
+    def score_waveforms(self, text, audio, num_waveforms_per_prompt, device, dtype):
+        if not is_librosa_available():
+            logger.info(
+                "Automatic scoring of the generated audio waveforms against the input prompt text requires the "
+                "`librosa` package to resample the generated waveforms. Returning the audios in the order they were "
+                "generated. To enable automatic scoring, install `librosa` with: `pip install librosa`."
+            )
+            return audio
+        inputs = self.tokenizer(text, return_tensors="pt", padding=True)
+        resampled_audio = librosa.resample(
+            audio.numpy(), orig_sr=self.vocoder.config.sampling_rate, target_sr=self.feature_extractor.sampling_rate
+        )
+        inputs["input_features"] = self.feature_extractor(
+            list(resampled_audio), return_tensors="pt", sampling_rate=self.feature_extractor.sampling_rate
+        ).input_features.type(dtype)
+        inputs = inputs.to(device)
+
+        # compute the audio-text similarity score using the CLAP model
+        logits_per_text = self.text_encoder(**inputs).logits_per_text
+        # sort by the highest matching generations per prompt
+        indices = torch.argsort(logits_per_text, dim=1, descending=True)[:, :num_waveforms_per_prompt]
+        audio = torch.index_select(audio, 0, indices.reshape(-1).cpu())
+        return audio
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        audio_length_in_s,
+        vocoder_upsample_factor,
+        callback_steps,
+        transcription=None,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        generated_prompt_embeds=None,
+        negative_generated_prompt_embeds=None,
+        attention_mask=None,
+        negative_attention_mask=None,
+    ):
+        min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor
+        if audio_length_in_s < min_audio_length_in_s:
+            raise ValueError(
+                f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but "
+                f"is {audio_length_in_s}."
+            )
+
+        if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0:
+            raise ValueError(
+                f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the "
+                f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of "
+                f"{self.vae_scale_factor}."
+            )
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and (prompt_embeds is None or generated_prompt_embeds is None):
+            raise ValueError(
+                "Provide either `prompt`, or `prompt_embeds` and `generated_prompt_embeds`. Cannot leave "
+                "`prompt` undefined without specifying both `prompt_embeds` and `generated_prompt_embeds`."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_embeds is not None and negative_generated_prompt_embeds is None:
+            raise ValueError(
+                "Cannot forward `negative_prompt_embeds` without `negative_generated_prompt_embeds`. Ensure that"
+                "both arguments are specified"
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if attention_mask is not None and attention_mask.shape != prompt_embeds.shape[:2]:
+                raise ValueError(
+                    "`attention_mask should have the same batch size and sequence length as `prompt_embeds`, but got:"
+                    f"`attention_mask: {attention_mask.shape} != `prompt_embeds` {prompt_embeds.shape}"
+                )
+
+        if transcription is None:
+            if self.text_encoder_2.config.model_type == "vits":
+                raise ValueError("Cannot forward without transcription. Please make sure to" " have transcription")
+        elif transcription is not None and (
+            not isinstance(transcription, str) and not isinstance(transcription, list)
+        ):
+            raise ValueError(f"`transcription` has to be of type `str` or `list` but is {type(transcription)}")
+
+        if generated_prompt_embeds is not None and negative_generated_prompt_embeds is not None:
+            if generated_prompt_embeds.shape != negative_generated_prompt_embeds.shape:
+                raise ValueError(
+                    "`generated_prompt_embeds` and `negative_generated_prompt_embeds` must have the same shape when "
+                    f"passed directly, but got: `generated_prompt_embeds` {generated_prompt_embeds.shape} != "
+                    f"`negative_generated_prompt_embeds` {negative_generated_prompt_embeds.shape}."
+                )
+            if (
+                negative_attention_mask is not None
+                and negative_attention_mask.shape != negative_prompt_embeds.shape[:2]
+            ):
+                raise ValueError(
+                    "`attention_mask should have the same batch size and sequence length as `prompt_embeds`, but got:"
+                    f"`attention_mask: {negative_attention_mask.shape} != `prompt_embeds` {negative_prompt_embeds.shape}"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents with width->self.vocoder.config.model_in_dim
+    def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(self.vocoder.config.model_in_dim) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        transcription: Union[str, List[str]] = None,
+        audio_length_in_s: Optional[float] = None,
+        num_inference_steps: int = 200,
+        guidance_scale: float = 3.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_waveforms_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        generated_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_generated_prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        negative_attention_mask: Optional[torch.LongTensor] = None,
+        max_new_tokens: Optional[int] = None,
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        output_type: Optional[str] = "np",
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
+            transcription (`str` or `List[str]`, *optional*):\
+                The transcript for text to speech.
+            audio_length_in_s (`int`, *optional*, defaults to 10.24):
+                The length of the generated audio sample in seconds.
+            num_inference_steps (`int`, *optional*, defaults to 200):
+                The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 3.5):
+                A higher guidance scale value encourages the model to generate audio that is closely linked to the text
+                `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
+                The number of waveforms to generate per prompt. If `num_waveforms_per_prompt > 1`, then automatic
+                scoring is performed between the generated outputs and the text prompt. This scoring ranks the
+                generated waveforms based on their cosine similarity with the text input in the joint text-audio
+                embedding space.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for spectrogram
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            generated_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs,
+                 *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
+                 argument.
+            negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
+                inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
+                `negative_prompt` input argument.
+            attention_mask (`torch.LongTensor`, *optional*):
+                Pre-computed attention mask to be applied to the `prompt_embeds`. If not provided, attention mask will
+                be computed from `prompt` input argument.
+            negative_attention_mask (`torch.LongTensor`, *optional*):
+                Pre-computed attention mask to be applied to the `negative_prompt_embeds`. If not provided, attention
+                mask will be computed from `negative_prompt` input argument.
+            max_new_tokens (`int`, *optional*, defaults to None):
+                Number of new tokens to generate with the GPT2 language model. If not provided, number of tokens will
+                be taken from the config of the model.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated audio. Choose between `"np"` to return a NumPy `np.ndarray` or
+                `"pt"` to return a PyTorch `torch.Tensor` object. Set to `"latent"` to return the latent diffusion
+                model (LDM) output.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated audio.
+        """
+        # 0. Convert audio input length from seconds to spectrogram height
+        vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate
+
+        if audio_length_in_s is None:
+            audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor
+
+        height = int(audio_length_in_s / vocoder_upsample_factor)
+
+        original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate)
+        if height % self.vae_scale_factor != 0:
+            height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor
+            logger.info(
+                f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} "
+                f"so that it can be handled by the model. It will be cut to {audio_length_in_s} after the "
+                f"denoising process."
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            audio_length_in_s,
+            vocoder_upsample_factor,
+            callback_steps,
+            transcription,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            generated_prompt_embeds,
+            negative_generated_prompt_embeds,
+            attention_mask,
+            negative_attention_mask,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, attention_mask, generated_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_waveforms_per_prompt,
+            do_classifier_free_guidance,
+            transcription,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            generated_prompt_embeds=generated_prompt_embeds,
+            negative_generated_prompt_embeds=negative_generated_prompt_embeds,
+            attention_mask=attention_mask,
+            negative_attention_mask=negative_attention_mask,
+            max_new_tokens=max_new_tokens,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_waveforms_per_prompt,
+            num_channels_latents,
+            height,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=generated_prompt_embeds,
+                    encoder_hidden_states_1=prompt_embeds,
+                    encoder_attention_mask_1=attention_mask,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        self.maybe_free_model_hooks()
+
+        # 8. Post-processing
+        if not output_type == "latent":
+            latents = 1 / self.vae.config.scaling_factor * latents
+            mel_spectrogram = self.vae.decode(latents).sample
+        else:
+            return AudioPipelineOutput(audios=latents)
+
+        audio = self.mel_spectrogram_to_waveform(mel_spectrogram)
+
+        audio = audio[:, :original_waveform_length]
+
+        # 9. Automatic scoring
+        if num_waveforms_per_prompt > 1 and prompt is not None:
+            audio = self.score_waveforms(
+                text=prompt,
+                audio=audio,
+                num_waveforms_per_prompt=num_waveforms_per_prompt,
+                device=device,
+                dtype=prompt_embeds.dtype,
+            )
+
+        if output_type == "np":
+            audio = audio.numpy()
+
+        if not return_dict:
+            return (audio,)
+
+        return AudioPipelineOutput(audios=audio)
diff --git a/diffusers/src/diffusers/pipelines/aura_flow/__init__.py b/diffusers/src/diffusers/pipelines/aura_flow/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1917baa61e28abf0970080c90b5864579cbe8f9
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/aura_flow/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_aura_flow"] = ["AuraFlowPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_aura_flow import AuraFlowPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py b/diffusers/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a86b5ceded92d6aab61f7006fdc58edcb8aefb2
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py
@@ -0,0 +1,591 @@
+# Copyright 2024 AuraFlow Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import List, Optional, Tuple, Union
+
+import torch
+from transformers import T5Tokenizer, UMT5EncoderModel
+
+from ...image_processor import VaeImageProcessor
+from ...models import AuraFlowTransformer2DModel, AutoencoderKL
+from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AuraFlowPipeline
+
+        >>> pipe = AuraFlowPipeline.from_pretrained("fal/AuraFlow", torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+        >>> prompt = "A cat holding a sign that says hello world"
+        >>> image = pipe(prompt).images[0]
+        >>> image.save("aura_flow.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class AuraFlowPipeline(DiffusionPipeline):
+    r"""
+    Args:
+        tokenizer (`T5TokenizerFast`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        text_encoder ([`T5EncoderModel`]):
+            Frozen text-encoder. AuraFlow uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [EleutherAI/pile-t5-xl](https://huggingface.co/EleutherAI/pile-t5-xl) variant.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        transformer ([`AuraFlowTransformer2DModel`]):
+            Conditional Transformer (MMDiT and DiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+
+    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: UMT5EncoderModel,
+        vae: AutoencoderKL,
+        transformer: AuraFlowTransformer2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
+        )
+
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Union[str, List[str]] = None,
+        do_classifier_free_guidance: bool = True,
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            max_sequence_length (`int`, defaults to 256): Maximum sequence length to use for the prompt.
+        """
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        max_length = max_sequence_length
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                truncation=True,
+                max_length=max_length,
+                padding="max_length",
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs["input_ids"]
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because T5 can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
+            prompt_embeds = self.text_encoder(**text_inputs)[0]
+            prompt_attention_mask = text_inputs["attention_mask"].unsqueeze(-1).expand(prompt_embeds.shape)
+            prompt_embeds = prompt_embeds * prompt_attention_mask
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.transformer is not None:
+            dtype = self.transformer.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.reshape(bs_embed, -1)
+        prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                truncation=True,
+                max_length=max_length,
+                padding="max_length",
+                return_tensors="pt",
+            )
+            uncond_input = {k: v.to(device) for k, v in uncond_input.items()}
+            negative_prompt_embeds = self.text_encoder(**uncond_input)[0]
+            negative_prompt_attention_mask = (
+                uncond_input["attention_mask"].unsqueeze(-1).expand(negative_prompt_embeds.shape)
+            )
+            negative_prompt_embeds = negative_prompt_embeds * negative_prompt_attention_mask
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            negative_prompt_attention_mask = negative_prompt_attention_mask.reshape(bs_embed, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+        else:
+            negative_prompt_embeds = None
+            negative_prompt_attention_mask = None
+
+        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 3.5,
+        num_images_per_prompt: Optional[int] = 1,
+        height: Optional[int] = 1024,
+        width: Optional[int] = 1024,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 256,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+                `num_inference_steps` and `timesteps` must be `None`.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns: [`~pipelines.ImagePipelineOutput`] or `tuple`:
+            If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is returned
+            where the first element is a list with the generated images.
+        """
+        # 1. Check inputs. Raise error if not correct
+        height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        width = width or self.transformer.config.sample_size * self.vae_scale_factor
+
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        )
+
+        # 2. Determine batch size.
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            max_sequence_length=max_sequence_length,
+        )
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+
+        # 4. Prepare timesteps
+
+        # sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            latent_channels,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                # aura use timestep value between 0 and 1, with t=1 as noise and t=0 as the image
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = torch.tensor([t / 1000]).expand(latent_model_input.shape[0])
+                timestep = timestep.to(latents.device, dtype=latents.dtype)
+
+                # predict noise model_output
+                noise_pred = self.transformer(
+                    latent_model_input,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep=timestep,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if output_type == "latent":
+            image = latents
+        else:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/auto_pipeline.py b/diffusers/src/diffusers/pipelines/auto_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6186da260ad2c7bad27907abfbf7dbf67a08b45
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/auto_pipeline.py
@@ -0,0 +1,1102 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+
+from huggingface_hub.utils import validate_hf_hub_args
+
+from ..configuration_utils import ConfigMixin
+from ..utils import is_sentencepiece_available
+from .aura_flow import AuraFlowPipeline
+from .controlnet import (
+    StableDiffusionControlNetImg2ImgPipeline,
+    StableDiffusionControlNetInpaintPipeline,
+    StableDiffusionControlNetPipeline,
+    StableDiffusionXLControlNetImg2ImgPipeline,
+    StableDiffusionXLControlNetInpaintPipeline,
+    StableDiffusionXLControlNetPipeline,
+)
+from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
+from .flux import (
+    FluxControlNetImg2ImgPipeline,
+    FluxControlNetInpaintPipeline,
+    FluxControlNetPipeline,
+    FluxImg2ImgPipeline,
+    FluxInpaintPipeline,
+    FluxPipeline,
+)
+from .hunyuandit import HunyuanDiTPipeline
+from .kandinsky import (
+    KandinskyCombinedPipeline,
+    KandinskyImg2ImgCombinedPipeline,
+    KandinskyImg2ImgPipeline,
+    KandinskyInpaintCombinedPipeline,
+    KandinskyInpaintPipeline,
+    KandinskyPipeline,
+)
+from .kandinsky2_2 import (
+    KandinskyV22CombinedPipeline,
+    KandinskyV22Img2ImgCombinedPipeline,
+    KandinskyV22Img2ImgPipeline,
+    KandinskyV22InpaintCombinedPipeline,
+    KandinskyV22InpaintPipeline,
+    KandinskyV22Pipeline,
+)
+from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
+from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
+from .lumina import LuminaText2ImgPipeline
+from .pag import (
+    HunyuanDiTPAGPipeline,
+    PixArtSigmaPAGPipeline,
+    StableDiffusion3PAGPipeline,
+    StableDiffusionControlNetPAGPipeline,
+    StableDiffusionPAGPipeline,
+    StableDiffusionXLControlNetPAGImg2ImgPipeline,
+    StableDiffusionXLControlNetPAGPipeline,
+    StableDiffusionXLPAGImg2ImgPipeline,
+    StableDiffusionXLPAGInpaintPipeline,
+    StableDiffusionXLPAGPipeline,
+)
+from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
+from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline
+from .stable_diffusion import (
+    StableDiffusionImg2ImgPipeline,
+    StableDiffusionInpaintPipeline,
+    StableDiffusionPipeline,
+)
+from .stable_diffusion_3 import (
+    StableDiffusion3Img2ImgPipeline,
+    StableDiffusion3InpaintPipeline,
+    StableDiffusion3Pipeline,
+)
+from .stable_diffusion_xl import (
+    StableDiffusionXLImg2ImgPipeline,
+    StableDiffusionXLInpaintPipeline,
+    StableDiffusionXLPipeline,
+)
+from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline
+
+
+AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", StableDiffusionPipeline),
+        ("stable-diffusion-xl", StableDiffusionXLPipeline),
+        ("stable-diffusion-3", StableDiffusion3Pipeline),
+        ("stable-diffusion-3-pag", StableDiffusion3PAGPipeline),
+        ("if", IFPipeline),
+        ("hunyuan", HunyuanDiTPipeline),
+        ("hunyuan-pag", HunyuanDiTPAGPipeline),
+        ("kandinsky", KandinskyCombinedPipeline),
+        ("kandinsky22", KandinskyV22CombinedPipeline),
+        ("kandinsky3", Kandinsky3Pipeline),
+        ("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
+        ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
+        ("wuerstchen", WuerstchenCombinedPipeline),
+        ("cascade", StableCascadeCombinedPipeline),
+        ("lcm", LatentConsistencyModelPipeline),
+        ("pixart-alpha", PixArtAlphaPipeline),
+        ("pixart-sigma", PixArtSigmaPipeline),
+        ("stable-diffusion-pag", StableDiffusionPAGPipeline),
+        ("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGPipeline),
+        ("stable-diffusion-xl-pag", StableDiffusionXLPAGPipeline),
+        ("stable-diffusion-xl-controlnet-pag", StableDiffusionXLControlNetPAGPipeline),
+        ("pixart-sigma-pag", PixArtSigmaPAGPipeline),
+        ("auraflow", AuraFlowPipeline),
+        ("flux", FluxPipeline),
+        ("flux-controlnet", FluxControlNetPipeline),
+        ("lumina", LuminaText2ImgPipeline),
+    ]
+)
+
+AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", StableDiffusionImg2ImgPipeline),
+        ("stable-diffusion-xl", StableDiffusionXLImg2ImgPipeline),
+        ("stable-diffusion-3", StableDiffusion3Img2ImgPipeline),
+        ("if", IFImg2ImgPipeline),
+        ("kandinsky", KandinskyImg2ImgCombinedPipeline),
+        ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
+        ("kandinsky3", Kandinsky3Img2ImgPipeline),
+        ("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline),
+        ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline),
+        ("stable-diffusion-xl-pag", StableDiffusionXLPAGImg2ImgPipeline),
+        ("stable-diffusion-xl-controlnet-pag", StableDiffusionXLControlNetPAGImg2ImgPipeline),
+        ("lcm", LatentConsistencyModelImg2ImgPipeline),
+        ("flux", FluxImg2ImgPipeline),
+        ("flux-controlnet", FluxControlNetImg2ImgPipeline),
+    ]
+)
+
+AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", StableDiffusionInpaintPipeline),
+        ("stable-diffusion-xl", StableDiffusionXLInpaintPipeline),
+        ("stable-diffusion-3", StableDiffusion3InpaintPipeline),
+        ("if", IFInpaintingPipeline),
+        ("kandinsky", KandinskyInpaintCombinedPipeline),
+        ("kandinsky22", KandinskyV22InpaintCombinedPipeline),
+        ("stable-diffusion-controlnet", StableDiffusionControlNetInpaintPipeline),
+        ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
+        ("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
+        ("flux", FluxInpaintPipeline),
+        ("flux-controlnet", FluxControlNetInpaintPipeline),
+    ]
+)
+
+_AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("kandinsky", KandinskyPipeline),
+        ("kandinsky22", KandinskyV22Pipeline),
+        ("wuerstchen", WuerstchenDecoderPipeline),
+        ("cascade", StableCascadeDecoderPipeline),
+    ]
+)
+_AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("kandinsky", KandinskyImg2ImgPipeline),
+        ("kandinsky22", KandinskyV22Img2ImgPipeline),
+    ]
+)
+_AUTO_INPAINT_DECODER_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("kandinsky", KandinskyInpaintPipeline),
+        ("kandinsky22", KandinskyV22InpaintPipeline),
+    ]
+)
+
+if is_sentencepiece_available():
+    from .kolors import KolorsImg2ImgPipeline, KolorsPipeline
+    from .pag import KolorsPAGPipeline
+
+    AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
+    AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors-pag"] = KolorsPAGPipeline
+    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsImg2ImgPipeline
+
+SUPPORTED_TASKS_MAPPINGS = [
+    AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
+    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
+    AUTO_INPAINT_PIPELINES_MAPPING,
+    _AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING,
+    _AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING,
+    _AUTO_INPAINT_DECODER_PIPELINES_MAPPING,
+]
+
+
+def _get_connected_pipeline(pipeline_cls):
+    # for now connected pipelines can only be loaded from decoder pipelines, such as kandinsky-community/kandinsky-2-2-decoder
+    if pipeline_cls in _AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING.values():
+        return _get_task_class(
+            AUTO_TEXT2IMAGE_PIPELINES_MAPPING, pipeline_cls.__name__, throw_error_if_not_exist=False
+        )
+    if pipeline_cls in _AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING.values():
+        return _get_task_class(
+            AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, pipeline_cls.__name__, throw_error_if_not_exist=False
+        )
+    if pipeline_cls in _AUTO_INPAINT_DECODER_PIPELINES_MAPPING.values():
+        return _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, pipeline_cls.__name__, throw_error_if_not_exist=False)
+
+
+def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool = True):
+    def get_model(pipeline_class_name):
+        for task_mapping in SUPPORTED_TASKS_MAPPINGS:
+            for model_name, pipeline in task_mapping.items():
+                if pipeline.__name__ == pipeline_class_name:
+                    return model_name
+
+    model_name = get_model(pipeline_class_name)
+
+    if model_name is not None:
+        task_class = mapping.get(model_name, None)
+        if task_class is not None:
+            return task_class
+
+    if throw_error_if_not_exist:
+        raise ValueError(f"AutoPipeline can't find a pipeline linked to {pipeline_class_name} for {model_name}")
+
+
+class AutoPipelineForText2Image(ConfigMixin):
+    r"""
+
+    [`AutoPipelineForText2Image`] is a generic pipeline class that instantiates a text-to-image pipeline class. The
+    specific underlying pipeline class is automatically selected from either the
+    [`~AutoPipelineForText2Image.from_pretrained`] or [`~AutoPipelineForText2Image.from_pipe`] methods.
+
+    This class cannot be instantiated using `__init__()` (throws an error).
+
+    Class attributes:
+
+        - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the
+          diffusion pipeline's components.
+
+    """
+
+    config_name = "model_index.json"
+
+    def __init__(self, *args, **kwargs):
+        raise EnvironmentError(
+            f"{self.__class__.__name__} is designed to be instantiated "
+            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
+            f"`{self.__class__.__name__}.from_pipe(pipeline)` methods."
+        )
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_or_path, **kwargs):
+        r"""
+        Instantiates a text-to-image Pytorch diffusion pipeline from pretrained pipeline weight.
+
+        The from_pretrained() method takes care of returning the correct pipeline class instance by:
+            1. Detect the pipeline class of the pretrained_model_or_path based on the _class_name property of its
+               config object
+            2. Find the text-to-image pipeline linked to the pipeline class using pattern matching on pipeline class
+               name.
+
+        If a `controlnet` argument is passed, it will instantiate a [`StableDiffusionControlNetPipeline`] object.
+
+        The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        If you get the error message below, you need to finetune the weights for your downstream task:
+
+        ```
+        Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
+        - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
+        You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+        ```
+
+        Parameters:
+            pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
+                      hosted on the Hub.
+                    - A path to a *directory* (for example `./my_pipeline_directory/`) containing pipeline weights
+                      saved using
+                    [`~DiffusionPipeline.save_pretrained`].
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the
+                dtype is automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            custom_revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id similar to
+                `revision` when loading a custom pipeline from the Hub. It can be a 🤗 Diffusers version when loading a
+                custom pipeline from GitHub, otherwise it defaults to `"main"` when loading from the Hub.
+            mirror (`str`, *optional*):
+                Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn’t need to be defined for each
+                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
+                same device.
+
+                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            max_memory (`Dict`, *optional*):
+                A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
+                each GPU and the available CPU RAM if unset.
+            offload_folder (`str` or `os.PathLike`, *optional*):
+                The path to offload weights if device_map contains the value `"disk"`.
+            offload_state_dict (`bool`, *optional*):
+                If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
+                the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
+                when there is some disk offload.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline
+                class). The overwritten components are passed directly to the pipelines `__init__` method. See example
+                below for more information.
+            variant (`str`, *optional*):
+                Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when
+                loading `from_flax`.
+
+        <Tip>
+
+        To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with
+        `huggingface-cli login`.
+
+        </Tip>
+
+        Examples:
+
+        ```py
+        >>> from diffusers import AutoPipelineForText2Image
+
+        >>> pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> image = pipeline(prompt).images[0]
+        ```
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+
+        load_config_kwargs = {
+            "cache_dir": cache_dir,
+            "force_download": force_download,
+            "proxies": proxies,
+            "token": token,
+            "local_files_only": local_files_only,
+            "revision": revision,
+        }
+
+        config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
+        orig_class_name = config["_class_name"]
+
+        if "controlnet" in kwargs:
+            orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
+        if "enable_pag" in kwargs:
+            enable_pag = kwargs.pop("enable_pag")
+            if enable_pag:
+                orig_class_name = orig_class_name.replace("Pipeline", "PAGPipeline")
+
+        text_2_image_cls = _get_task_class(AUTO_TEXT2IMAGE_PIPELINES_MAPPING, orig_class_name)
+
+        kwargs = {**load_config_kwargs, **kwargs}
+        return text_2_image_cls.from_pretrained(pretrained_model_or_path, **kwargs)
+
+    @classmethod
+    def from_pipe(cls, pipeline, **kwargs):
+        r"""
+        Instantiates a text-to-image Pytorch diffusion pipeline from another instantiated diffusion pipeline class.
+
+        The from_pipe() method takes care of returning the correct pipeline class instance by finding the text-to-image
+        pipeline linked to the pipeline class using pattern matching on pipeline class name.
+
+        All the modules the pipeline contains will be used to initialize the new pipeline without reallocating
+        additional memory.
+
+        The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        Parameters:
+            pipeline (`DiffusionPipeline`):
+                an instantiated `DiffusionPipeline` object
+
+        ```py
+        >>> from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
+
+        >>> pipe_i2i = AutoPipelineForImage2Image.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", requires_safety_checker=False
+        ... )
+
+        >>> pipe_t2i = AutoPipelineForText2Image.from_pipe(pipe_i2i)
+        >>> image = pipe_t2i(prompt).images[0]
+        ```
+        """
+
+        original_config = dict(pipeline.config)
+        original_cls_name = pipeline.__class__.__name__
+
+        # derive the pipeline class to instantiate
+        text_2_image_cls = _get_task_class(AUTO_TEXT2IMAGE_PIPELINES_MAPPING, original_cls_name)
+
+        if "controlnet" in kwargs:
+            if kwargs["controlnet"] is not None:
+                to_replace = "PAGPipeline" if "PAG" in text_2_image_cls.__name__ else "Pipeline"
+                text_2_image_cls = _get_task_class(
+                    AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
+                    text_2_image_cls.__name__.replace("ControlNet", "").replace(to_replace, "ControlNet" + to_replace),
+                )
+            else:
+                text_2_image_cls = _get_task_class(
+                    AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
+                    text_2_image_cls.__name__.replace("ControlNet", ""),
+                )
+
+        if "enable_pag" in kwargs:
+            enable_pag = kwargs.pop("enable_pag")
+            if enable_pag:
+                text_2_image_cls = _get_task_class(
+                    AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
+                    text_2_image_cls.__name__.replace("PAG", "").replace("Pipeline", "PAGPipeline"),
+                )
+            else:
+                text_2_image_cls = _get_task_class(
+                    AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
+                    text_2_image_cls.__name__.replace("PAG", ""),
+                )
+
+        # define expected module and optional kwargs given the pipeline signature
+        expected_modules, optional_kwargs = text_2_image_cls._get_signature_keys(text_2_image_cls)
+
+        pretrained_model_name_or_path = original_config.pop("_name_or_path", None)
+
+        # allow users pass modules in `kwargs` to override the original pipeline's components
+        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        original_class_obj = {
+            k: pipeline.components[k]
+            for k, v in pipeline.components.items()
+            if k in expected_modules and k not in passed_class_obj
+        }
+
+        # allow users pass optional kwargs to override the original pipelines config attribute
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
+        original_pipe_kwargs = {
+            k: original_config[k]
+            for k, v in original_config.items()
+            if k in optional_kwargs and k not in passed_pipe_kwargs
+        }
+
+        # config that were not expected by original pipeline is stored as private attribute
+        # we will pass them as optional arguments if they can be accepted by the pipeline
+        additional_pipe_kwargs = [
+            k[1:]
+            for k in original_config.keys()
+            if k.startswith("_") and k[1:] in optional_kwargs and k[1:] not in passed_pipe_kwargs
+        ]
+        for k in additional_pipe_kwargs:
+            original_pipe_kwargs[k] = original_config.pop(f"_{k}")
+
+        text_2_image_kwargs = {**passed_class_obj, **original_class_obj, **passed_pipe_kwargs, **original_pipe_kwargs}
+
+        # store unused config as private attribute
+        unused_original_config = {
+            f"{'' if k.startswith('_') else '_'}{k}": original_config[k]
+            for k, v in original_config.items()
+            if k not in text_2_image_kwargs
+        }
+
+        missing_modules = set(expected_modules) - set(pipeline._optional_components) - set(text_2_image_kwargs.keys())
+
+        if len(missing_modules) > 0:
+            raise ValueError(
+                f"Pipeline {text_2_image_cls} expected {expected_modules}, but only {set(list(passed_class_obj.keys()) + list(original_class_obj.keys()))} were passed"
+            )
+
+        model = text_2_image_cls(**text_2_image_kwargs)
+        model.register_to_config(_name_or_path=pretrained_model_name_or_path)
+        model.register_to_config(**unused_original_config)
+
+        return model
+
+
+class AutoPipelineForImage2Image(ConfigMixin):
+    r"""
+
+    [`AutoPipelineForImage2Image`] is a generic pipeline class that instantiates an image-to-image pipeline class. The
+    specific underlying pipeline class is automatically selected from either the
+    [`~AutoPipelineForImage2Image.from_pretrained`] or [`~AutoPipelineForImage2Image.from_pipe`] methods.
+
+    This class cannot be instantiated using `__init__()` (throws an error).
+
+    Class attributes:
+
+        - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the
+          diffusion pipeline's components.
+
+    """
+
+    config_name = "model_index.json"
+
+    def __init__(self, *args, **kwargs):
+        raise EnvironmentError(
+            f"{self.__class__.__name__} is designed to be instantiated "
+            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
+            f"`{self.__class__.__name__}.from_pipe(pipeline)` methods."
+        )
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_or_path, **kwargs):
+        r"""
+        Instantiates a image-to-image Pytorch diffusion pipeline from pretrained pipeline weight.
+
+        The from_pretrained() method takes care of returning the correct pipeline class instance by:
+            1. Detect the pipeline class of the pretrained_model_or_path based on the _class_name property of its
+               config object
+            2. Find the image-to-image pipeline linked to the pipeline class using pattern matching on pipeline class
+               name.
+
+        If a `controlnet` argument is passed, it will instantiate a [`StableDiffusionControlNetImg2ImgPipeline`]
+        object.
+
+        The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        If you get the error message below, you need to finetune the weights for your downstream task:
+
+        ```
+        Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
+        - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
+        You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+        ```
+
+        Parameters:
+            pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
+                      hosted on the Hub.
+                    - A path to a *directory* (for example `./my_pipeline_directory/`) containing pipeline weights
+                      saved using
+                    [`~DiffusionPipeline.save_pretrained`].
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the
+                dtype is automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            custom_revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id similar to
+                `revision` when loading a custom pipeline from the Hub. It can be a 🤗 Diffusers version when loading a
+                custom pipeline from GitHub, otherwise it defaults to `"main"` when loading from the Hub.
+            mirror (`str`, *optional*):
+                Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn’t need to be defined for each
+                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
+                same device.
+
+                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            max_memory (`Dict`, *optional*):
+                A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
+                each GPU and the available CPU RAM if unset.
+            offload_folder (`str` or `os.PathLike`, *optional*):
+                The path to offload weights if device_map contains the value `"disk"`.
+            offload_state_dict (`bool`, *optional*):
+                If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
+                the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
+                when there is some disk offload.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline
+                class). The overwritten components are passed directly to the pipelines `__init__` method. See example
+                below for more information.
+            variant (`str`, *optional*):
+                Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when
+                loading `from_flax`.
+
+        <Tip>
+
+        To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with
+        `huggingface-cli login`.
+
+        </Tip>
+
+        Examples:
+
+        ```py
+        >>> from diffusers import AutoPipelineForImage2Image
+
+        >>> pipeline = AutoPipelineForImage2Image.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> image = pipeline(prompt, image).images[0]
+        ```
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+
+        load_config_kwargs = {
+            "cache_dir": cache_dir,
+            "force_download": force_download,
+            "proxies": proxies,
+            "token": token,
+            "local_files_only": local_files_only,
+            "revision": revision,
+        }
+
+        config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
+        orig_class_name = config["_class_name"]
+
+        # the `orig_class_name` can be:
+        # `- *Pipeline` (for regular text-to-image checkpoint)
+        # `- *Img2ImgPipeline` (for refiner checkpoint)
+        to_replace = "Img2ImgPipeline" if "Img2Img" in config["_class_name"] else "Pipeline"
+
+        if "controlnet" in kwargs:
+            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
+        if "enable_pag" in kwargs:
+            enable_pag = kwargs.pop("enable_pag")
+            if enable_pag:
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
+
+        image_2_image_cls = _get_task_class(AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, orig_class_name)
+
+        kwargs = {**load_config_kwargs, **kwargs}
+        return image_2_image_cls.from_pretrained(pretrained_model_or_path, **kwargs)
+
+    @classmethod
+    def from_pipe(cls, pipeline, **kwargs):
+        r"""
+        Instantiates a image-to-image Pytorch diffusion pipeline from another instantiated diffusion pipeline class.
+
+        The from_pipe() method takes care of returning the correct pipeline class instance by finding the
+        image-to-image pipeline linked to the pipeline class using pattern matching on pipeline class name.
+
+        All the modules the pipeline contains will be used to initialize the new pipeline without reallocating
+        additional memory.
+
+        The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        Parameters:
+            pipeline (`DiffusionPipeline`):
+                an instantiated `DiffusionPipeline` object
+
+        Examples:
+
+        ```py
+        >>> from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
+
+        >>> pipe_t2i = AutoPipelineForText2Image.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", requires_safety_checker=False
+        ... )
+
+        >>> pipe_i2i = AutoPipelineForImage2Image.from_pipe(pipe_t2i)
+        >>> image = pipe_i2i(prompt, image).images[0]
+        ```
+        """
+
+        original_config = dict(pipeline.config)
+        original_cls_name = pipeline.__class__.__name__
+
+        # derive the pipeline class to instantiate
+        image_2_image_cls = _get_task_class(AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, original_cls_name)
+
+        if "controlnet" in kwargs:
+            if kwargs["controlnet"] is not None:
+                to_replace = "Img2ImgPipeline"
+                if "PAG" in image_2_image_cls.__name__:
+                    to_replace = "PAG" + to_replace
+                image_2_image_cls = _get_task_class(
+                    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
+                    image_2_image_cls.__name__.replace("ControlNet", "").replace(
+                        to_replace, "ControlNet" + to_replace
+                    ),
+                )
+            else:
+                image_2_image_cls = _get_task_class(
+                    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
+                    image_2_image_cls.__name__.replace("ControlNet", ""),
+                )
+
+        if "enable_pag" in kwargs:
+            enable_pag = kwargs.pop("enable_pag")
+            if enable_pag:
+                image_2_image_cls = _get_task_class(
+                    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
+                    image_2_image_cls.__name__.replace("PAG", "").replace("Img2ImgPipeline", "PAGImg2ImgPipeline"),
+                )
+            else:
+                image_2_image_cls = _get_task_class(
+                    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
+                    image_2_image_cls.__name__.replace("PAG", ""),
+                )
+
+        # define expected module and optional kwargs given the pipeline signature
+        expected_modules, optional_kwargs = image_2_image_cls._get_signature_keys(image_2_image_cls)
+
+        pretrained_model_name_or_path = original_config.pop("_name_or_path", None)
+
+        # allow users pass modules in `kwargs` to override the original pipeline's components
+        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        original_class_obj = {
+            k: pipeline.components[k]
+            for k, v in pipeline.components.items()
+            if k in expected_modules and k not in passed_class_obj
+        }
+
+        # allow users pass optional kwargs to override the original pipelines config attribute
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
+        original_pipe_kwargs = {
+            k: original_config[k]
+            for k, v in original_config.items()
+            if k in optional_kwargs and k not in passed_pipe_kwargs
+        }
+
+        # config attribute that were not expected by original pipeline is stored as its private attribute
+        # we will pass them as optional arguments if they can be accepted by the pipeline
+        additional_pipe_kwargs = [
+            k[1:]
+            for k in original_config.keys()
+            if k.startswith("_") and k[1:] in optional_kwargs and k[1:] not in passed_pipe_kwargs
+        ]
+        for k in additional_pipe_kwargs:
+            original_pipe_kwargs[k] = original_config.pop(f"_{k}")
+
+        image_2_image_kwargs = {**passed_class_obj, **original_class_obj, **passed_pipe_kwargs, **original_pipe_kwargs}
+
+        # store unused config as private attribute
+        unused_original_config = {
+            f"{'' if k.startswith('_') else '_'}{k}": original_config[k]
+            for k, v in original_config.items()
+            if k not in image_2_image_kwargs
+        }
+
+        missing_modules = set(expected_modules) - set(pipeline._optional_components) - set(image_2_image_kwargs.keys())
+
+        if len(missing_modules) > 0:
+            raise ValueError(
+                f"Pipeline {image_2_image_cls} expected {expected_modules}, but only {set(list(passed_class_obj.keys()) + list(original_class_obj.keys()))} were passed"
+            )
+
+        model = image_2_image_cls(**image_2_image_kwargs)
+        model.register_to_config(_name_or_path=pretrained_model_name_or_path)
+        model.register_to_config(**unused_original_config)
+
+        return model
+
+
+class AutoPipelineForInpainting(ConfigMixin):
+    r"""
+
+    [`AutoPipelineForInpainting`] is a generic pipeline class that instantiates an inpainting pipeline class. The
+    specific underlying pipeline class is automatically selected from either the
+    [`~AutoPipelineForInpainting.from_pretrained`] or [`~AutoPipelineForInpainting.from_pipe`] methods.
+
+    This class cannot be instantiated using `__init__()` (throws an error).
+
+    Class attributes:
+
+        - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the
+          diffusion pipeline's components.
+
+    """
+
+    config_name = "model_index.json"
+
+    def __init__(self, *args, **kwargs):
+        raise EnvironmentError(
+            f"{self.__class__.__name__} is designed to be instantiated "
+            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
+            f"`{self.__class__.__name__}.from_pipe(pipeline)` methods."
+        )
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_or_path, **kwargs):
+        r"""
+        Instantiates a inpainting Pytorch diffusion pipeline from pretrained pipeline weight.
+
+        The from_pretrained() method takes care of returning the correct pipeline class instance by:
+            1. Detect the pipeline class of the pretrained_model_or_path based on the _class_name property of its
+               config object
+            2. Find the inpainting pipeline linked to the pipeline class using pattern matching on pipeline class name.
+
+        If a `controlnet` argument is passed, it will instantiate a [`StableDiffusionControlNetInpaintPipeline`]
+        object.
+
+        The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        If you get the error message below, you need to finetune the weights for your downstream task:
+
+        ```
+        Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
+        - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
+        You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+        ```
+
+        Parameters:
+            pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
+                      hosted on the Hub.
+                    - A path to a *directory* (for example `./my_pipeline_directory/`) containing pipeline weights
+                      saved using
+                    [`~DiffusionPipeline.save_pretrained`].
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the
+                dtype is automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            custom_revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id similar to
+                `revision` when loading a custom pipeline from the Hub. It can be a 🤗 Diffusers version when loading a
+                custom pipeline from GitHub, otherwise it defaults to `"main"` when loading from the Hub.
+            mirror (`str`, *optional*):
+                Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn’t need to be defined for each
+                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
+                same device.
+
+                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            max_memory (`Dict`, *optional*):
+                A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
+                each GPU and the available CPU RAM if unset.
+            offload_folder (`str` or `os.PathLike`, *optional*):
+                The path to offload weights if device_map contains the value `"disk"`.
+            offload_state_dict (`bool`, *optional*):
+                If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
+                the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
+                when there is some disk offload.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline
+                class). The overwritten components are passed directly to the pipelines `__init__` method. See example
+                below for more information.
+            variant (`str`, *optional*):
+                Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when
+                loading `from_flax`.
+
+        <Tip>
+
+        To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with
+        `huggingface-cli login`.
+
+        </Tip>
+
+        Examples:
+
+        ```py
+        >>> from diffusers import AutoPipelineForInpainting
+
+        >>> pipeline = AutoPipelineForInpainting.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> image = pipeline(prompt, image=init_image, mask_image=mask_image).images[0]
+        ```
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+
+        load_config_kwargs = {
+            "cache_dir": cache_dir,
+            "force_download": force_download,
+            "proxies": proxies,
+            "token": token,
+            "local_files_only": local_files_only,
+            "revision": revision,
+        }
+
+        config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
+        orig_class_name = config["_class_name"]
+
+        # The `orig_class_name`` can be:
+        # `- *InpaintPipeline` (for inpaint-specific checkpoint)
+        #  - or *Pipeline (for regular text-to-image checkpoint)
+        to_replace = "InpaintPipeline" if "Inpaint" in config["_class_name"] else "Pipeline"
+
+        if "controlnet" in kwargs:
+            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
+        if "enable_pag" in kwargs:
+            enable_pag = kwargs.pop("enable_pag")
+            if enable_pag:
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
+        inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, orig_class_name)
+
+        kwargs = {**load_config_kwargs, **kwargs}
+        return inpainting_cls.from_pretrained(pretrained_model_or_path, **kwargs)
+
+    @classmethod
+    def from_pipe(cls, pipeline, **kwargs):
+        r"""
+        Instantiates a inpainting Pytorch diffusion pipeline from another instantiated diffusion pipeline class.
+
+        The from_pipe() method takes care of returning the correct pipeline class instance by finding the inpainting
+        pipeline linked to the pipeline class using pattern matching on pipeline class name.
+
+        All the modules the pipeline class contain will be used to initialize the new pipeline without reallocating
+        additional memory.
+
+        The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        Parameters:
+            pipeline (`DiffusionPipeline`):
+                an instantiated `DiffusionPipeline` object
+
+        Examples:
+
+        ```py
+        >>> from diffusers import AutoPipelineForText2Image, AutoPipelineForInpainting
+
+        >>> pipe_t2i = AutoPipelineForText2Image.from_pretrained(
+        ...     "DeepFloyd/IF-I-XL-v1.0", requires_safety_checker=False
+        ... )
+
+        >>> pipe_inpaint = AutoPipelineForInpainting.from_pipe(pipe_t2i)
+        >>> image = pipe_inpaint(prompt, image=init_image, mask_image=mask_image).images[0]
+        ```
+        """
+        original_config = dict(pipeline.config)
+        original_cls_name = pipeline.__class__.__name__
+
+        # derive the pipeline class to instantiate
+        inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, original_cls_name)
+
+        if "controlnet" in kwargs:
+            if kwargs["controlnet"] is not None:
+                inpainting_cls = _get_task_class(
+                    AUTO_INPAINT_PIPELINES_MAPPING,
+                    inpainting_cls.__name__.replace("ControlNet", "").replace(
+                        "InpaintPipeline", "ControlNetInpaintPipeline"
+                    ),
+                )
+            else:
+                inpainting_cls = _get_task_class(
+                    AUTO_INPAINT_PIPELINES_MAPPING,
+                    inpainting_cls.__name__.replace("ControlNetInpaintPipeline", "InpaintPipeline"),
+                )
+
+        if "enable_pag" in kwargs:
+            enable_pag = kwargs.pop("enable_pag")
+            if enable_pag:
+                inpainting_cls = _get_task_class(
+                    AUTO_INPAINT_PIPELINES_MAPPING,
+                    inpainting_cls.__name__.replace("PAG", "").replace("InpaintPipeline", "PAGInpaintPipeline"),
+                )
+            else:
+                inpainting_cls = _get_task_class(
+                    AUTO_INPAINT_PIPELINES_MAPPING,
+                    inpainting_cls.__name__.replace("PAGInpaintPipeline", "InpaintPipeline"),
+                )
+
+        # define expected module and optional kwargs given the pipeline signature
+        expected_modules, optional_kwargs = inpainting_cls._get_signature_keys(inpainting_cls)
+
+        pretrained_model_name_or_path = original_config.pop("_name_or_path", None)
+
+        # allow users pass modules in `kwargs` to override the original pipeline's components
+        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        original_class_obj = {
+            k: pipeline.components[k]
+            for k, v in pipeline.components.items()
+            if k in expected_modules and k not in passed_class_obj
+        }
+
+        # allow users pass optional kwargs to override the original pipelines config attribute
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
+        original_pipe_kwargs = {
+            k: original_config[k]
+            for k, v in original_config.items()
+            if k in optional_kwargs and k not in passed_pipe_kwargs
+        }
+
+        # config that were not expected by original pipeline is stored as private attribute
+        # we will pass them as optional arguments if they can be accepted by the pipeline
+        additional_pipe_kwargs = [
+            k[1:]
+            for k in original_config.keys()
+            if k.startswith("_") and k[1:] in optional_kwargs and k[1:] not in passed_pipe_kwargs
+        ]
+        for k in additional_pipe_kwargs:
+            original_pipe_kwargs[k] = original_config.pop(f"_{k}")
+
+        inpainting_kwargs = {**passed_class_obj, **original_class_obj, **passed_pipe_kwargs, **original_pipe_kwargs}
+
+        # store unused config as private attribute
+        unused_original_config = {
+            f"{'' if k.startswith('_') else '_'}{k}": original_config[k]
+            for k, v in original_config.items()
+            if k not in inpainting_kwargs
+        }
+
+        missing_modules = set(expected_modules) - set(pipeline._optional_components) - set(inpainting_kwargs.keys())
+
+        if len(missing_modules) > 0:
+            raise ValueError(
+                f"Pipeline {inpainting_cls} expected {expected_modules}, but only {set(list(passed_class_obj.keys()) + list(original_class_obj.keys()))} were passed"
+            )
+
+        model = inpainting_cls(**inpainting_kwargs)
+        model.register_to_config(_name_or_path=pretrained_model_name_or_path)
+        model.register_to_config(**unused_original_config)
+
+        return model
diff --git a/diffusers/src/diffusers/pipelines/blip_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/blip_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..af6c879d5ce88aa8edec0691e987444ff1d3dfec
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/blip_diffusion/__init__.py
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL
+from PIL import Image
+
+from ...utils import OptionalDependencyNotAvailable, is_torch_available, is_transformers_available
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline
+else:
+    from .blip_image_processing import BlipImageProcessor
+    from .modeling_blip2 import Blip2QFormerModel
+    from .modeling_ctx_clip import ContextCLIPTextModel
+    from .pipeline_blip_diffusion import BlipDiffusionPipeline
diff --git a/diffusers/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py b/diffusers/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..d92a0766905909b8c6284d336ba6effde20dc8d1
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py
@@ -0,0 +1,318 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for BLIP."""
+
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from transformers.image_transforms import convert_to_rgb, resize, to_channel_dimension_format
+from transformers.image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from transformers.utils import TensorType, is_vision_available, logging
+
+from diffusers.utils import numpy_to_pil
+
+
+if is_vision_available():
+    import PIL.Image
+
+
+logger = logging.get_logger(__name__)
+
+
+# We needed some extra functions on top of the ones in transformers.image_processing_utils.BaseImageProcessor, namely center crop
+# Copy-pasted from transformers.models.blip.image_processing_blip.BlipImageProcessor
+class BlipImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a BLIP image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+            overridden by the `resample` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Wwhether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
+            overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        do_center_crop: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 224, "width": 224}
+        size = get_size_dict(size, default_to_square=True)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+        self.do_center_crop = do_center_crop
+
+    # Copy-pasted from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[Dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        do_center_crop: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        do_convert_rgb: bool = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Controls the size of the image after `resize`. The shortest edge of the image is resized to
+                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
+                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
+                edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to normalize the image by if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        if do_resize and size is None or resample is None:
+            raise ValueError("Size and resample must be specified if do_resize is True.")
+
+        if do_rescale and rescale_factor is None:
+            raise ValueError("Rescale factor must be specified if do_rescale is True.")
+
+        if do_normalize and (image_mean is None or image_std is None):
+            raise ValueError("Image mean and std must be specified if do_normalize is True.")
+
+        # PIL RGBA images are converted to RGB
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+        if do_center_crop:
+            images = [self.center_crop(image, size, input_data_format=input_data_format) for image in images]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+        return encoded_outputs
+
+    # Follows diffusers.VaeImageProcessor.postprocess
+    def postprocess(self, sample: torch.Tensor, output_type: str = "pil"):
+        if output_type not in ["pt", "np", "pil"]:
+            raise ValueError(
+                f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']"
+            )
+
+        # Equivalent to diffusers.VaeImageProcessor.denormalize
+        sample = (sample / 2 + 0.5).clamp(0, 1)
+        if output_type == "pt":
+            return sample
+
+        # Equivalent to diffusers.VaeImageProcessor.pt_to_numpy
+        sample = sample.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "np":
+            return sample
+        # Output_type must be 'pil'
+        sample = numpy_to_pil(sample)
+        return sample
diff --git a/diffusers/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py b/diffusers/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py
new file mode 100644
index 0000000000000000000000000000000000000000..1be4761a99875a09cca9a30de42f2634aa36c903
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py
@@ -0,0 +1,642 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from transformers import BertTokenizer
+from transformers.activations import QuickGELUActivation as QuickGELU
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPooling,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+)
+from transformers.models.blip_2.configuration_blip_2 import Blip2Config, Blip2VisionConfig
+from transformers.models.blip_2.modeling_blip_2 import (
+    Blip2Encoder,
+    Blip2PreTrainedModel,
+    Blip2QFormerAttention,
+    Blip2QFormerIntermediate,
+    Blip2QFormerOutput,
+)
+from transformers.pytorch_utils import apply_chunking_to_forward
+from transformers.utils import (
+    logging,
+    replace_return_docstrings,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+# There is an implementation of Blip2 in `transformers` : https://github.com/huggingface/transformers/blob/main/src/transformers/models/blip_2/modeling_blip_2.py.
+# But it doesn't support getting multimodal embeddings. So, this module can be
+# replaced with a future `transformers` version supports that.
+class Blip2TextEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+
+        self.config = config
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        query_embeds=None,
+        past_key_values_length=0,
+    ):
+        if input_ids is not None:
+            seq_length = input_ids.size()[1]
+        else:
+            seq_length = 0
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
+
+        if input_ids is not None:
+            embeddings = self.word_embeddings(input_ids)
+            if self.position_embedding_type == "absolute":
+                position_embeddings = self.position_embeddings(position_ids)
+                embeddings = embeddings + position_embeddings
+
+            if query_embeds is not None:
+                batch_size = embeddings.shape[0]
+                # repeat the query embeddings for batch size
+                query_embeds = query_embeds.repeat(batch_size, 1, 1)
+                embeddings = torch.cat((query_embeds, embeddings), dim=1)
+        else:
+            embeddings = query_embeds
+        embeddings = embeddings.to(query_embeds.dtype)
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+# Copy-pasted from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->Blip2
+class Blip2VisionEmbeddings(nn.Module):
+    def __init__(self, config: Blip2VisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=False
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+
+        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype)
+        return embeddings
+
+
+# The Qformer encoder, which takes the visual embeddings, and the text input, to get multimodal embeddings
+class Blip2QFormerEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList(
+            [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        query_length=0,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions else None
+
+        next_decoder_cache = () if use_cache else None
+
+        for i in range(self.config.num_hidden_layers):
+            layer_module = self.layer[i]
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[i] if past_key_values is not None else None
+
+            if getattr(self.config, "gradient_checkpointing", False) and self.training:
+                if use_cache:
+                    logger.warning(
+                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs, past_key_value, output_attentions, query_length)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                    query_length,
+                )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if layer_module.has_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# The layers making up the Qformer encoder
+class Blip2QFormerLayer(nn.Module):
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = Blip2QFormerAttention(config)
+
+        self.layer_idx = layer_idx
+
+        if layer_idx % config.cross_attention_frequency == 0:
+            self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True)
+            self.has_cross_attention = True
+        else:
+            self.has_cross_attention = False
+
+        self.intermediate = Blip2QFormerIntermediate(config)
+        self.intermediate_query = Blip2QFormerIntermediate(config)
+        self.output_query = Blip2QFormerOutput(config)
+        self.output = Blip2QFormerOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+        query_length=0,
+    ):
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:-1]
+
+        present_key_value = self_attention_outputs[-1]
+
+        if query_length > 0:
+            query_attention_output = attention_output[:, :query_length, :]
+
+            if self.has_cross_attention:
+                if encoder_hidden_states is None:
+                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
+                cross_attention_outputs = self.crossattention(
+                    query_attention_output,
+                    attention_mask,
+                    head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    output_attentions=output_attentions,
+                )
+                query_attention_output = cross_attention_outputs[0]
+                # add cross attentions if we output attention weights
+                outputs = outputs + cross_attention_outputs[1:-1]
+
+            layer_output = apply_chunking_to_forward(
+                self.feed_forward_chunk_query,
+                self.chunk_size_feed_forward,
+                self.seq_len_dim,
+                query_attention_output,
+            )
+
+            if attention_output.shape[1] > query_length:
+                layer_output_text = apply_chunking_to_forward(
+                    self.feed_forward_chunk,
+                    self.chunk_size_feed_forward,
+                    self.seq_len_dim,
+                    attention_output[:, query_length:, :],
+                )
+                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
+        else:
+            layer_output = apply_chunking_to_forward(
+                self.feed_forward_chunk,
+                self.chunk_size_feed_forward,
+                self.seq_len_dim,
+                attention_output,
+            )
+        outputs = (layer_output,) + outputs
+
+        outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+    def feed_forward_chunk_query(self, attention_output):
+        intermediate_output = self.intermediate_query(attention_output)
+        layer_output = self.output_query(intermediate_output, attention_output)
+        return layer_output
+
+
+# ProjLayer used to project the multimodal Blip2 embeddings to be used in the text encoder
+class ProjLayer(nn.Module):
+    def __init__(self, in_dim, out_dim, hidden_dim, drop_p=0.1, eps=1e-12):
+        super().__init__()
+
+        # Dense1 -> Act -> Dense2 -> Drop -> Res -> Norm
+        self.dense1 = nn.Linear(in_dim, hidden_dim)
+        self.act_fn = QuickGELU()
+        self.dense2 = nn.Linear(hidden_dim, out_dim)
+        self.dropout = nn.Dropout(drop_p)
+
+        self.LayerNorm = nn.LayerNorm(out_dim, eps=eps)
+
+    def forward(self, x):
+        x_in = x
+
+        x = self.LayerNorm(x)
+        x = self.dropout(self.dense2(self.act_fn(self.dense1(x)))) + x_in
+
+        return x
+
+
+# Copy-pasted from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->Blip2, BLIP->BLIP_2
+class Blip2VisionModel(Blip2PreTrainedModel):
+    main_input_name = "pixel_values"
+    config_class = Blip2VisionConfig
+
+    def __init__(self, config: Blip2VisionConfig):
+        super().__init__(config)
+        self.config = config
+        embed_dim = config.hidden_size
+        self.embeddings = Blip2VisionEmbeddings(config)
+        self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.encoder = Blip2Encoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        self.post_init()
+
+    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig)
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        r"""
+        Returns:
+
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values)
+        hidden_states = self.pre_layernorm(hidden_states)
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm(pooled_output)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+
+# Qformer model, used to get multimodal embeddings from the text and image inputs
+class Blip2QFormerModel(Blip2PreTrainedModel):
+    """
+    Querying Transformer (Q-Former), used in BLIP-2.
+    """
+
+    def __init__(self, config: Blip2Config):
+        super().__init__(config)
+        self.config = config
+        self.embeddings = Blip2TextEmbeddings(config.qformer_config)
+        self.visual_encoder = Blip2VisionModel(config.vision_config)
+        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
+        if not hasattr(config, "tokenizer") or config.tokenizer is None:
+            self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="right")
+        else:
+            self.tokenizer = BertTokenizer.from_pretrained(config.tokenizer, truncation_side="right")
+        self.tokenizer.add_special_tokens({"bos_token": "[DEC]"})
+        self.proj_layer = ProjLayer(
+            in_dim=config.qformer_config.hidden_size,
+            out_dim=config.qformer_config.hidden_size,
+            hidden_dim=config.qformer_config.hidden_size * 4,
+            drop_p=0.1,
+            eps=1e-12,
+        )
+
+        self.encoder = Blip2QFormerEncoder(config.qformer_config)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def get_extended_attention_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_shape: Tuple[int],
+        device: torch.device,
+        has_query: bool = False,
+    ) -> torch.Tensor:
+        """
+        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
+
+        Arguments:
+            attention_mask (`torch.Tensor`):
+                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
+            input_shape (`Tuple[int]`):
+                The shape of the input to the model.
+            device (`torch.device`):
+                The device of the input to the model.
+
+        Returns:
+            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
+        """
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif attention_mask.dim() == 2:
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            extended_attention_mask = attention_mask[:, None, None, :]
+        else:
+            raise ValueError(
+                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
+                    input_shape, attention_mask.shape
+                )
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        return extended_attention_mask
+
+    def forward(
+        self,
+        text_input=None,
+        image_input=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        encoder_hidden_states  (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of:
+            shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
+            value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
+            used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
+            value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape
+            `(batch_size, sequence_length)`.
+        use_cache (`bool`, `optional`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        """
+
+        text = self.tokenizer(text_input, return_tensors="pt", padding=True)
+        text = text.to(self.device)
+        input_ids = text.input_ids
+        batch_size = input_ids.shape[0]
+        query_atts = torch.ones((batch_size, self.query_tokens.size()[1]), dtype=torch.long).to(self.device)
+        attention_mask = torch.cat([query_atts, text.attention_mask], dim=1)
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # past_key_values_length
+        past_key_values_length = (
+            past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0
+        )
+
+        query_length = self.query_tokens.shape[1]
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            query_embeds=self.query_tokens,
+            past_key_values_length=past_key_values_length,
+        )
+
+        # embedding_output = self.layernorm(query_embeds)
+        # embedding_output = self.dropout(embedding_output)
+
+        input_shape = embedding_output.size()[:-1]
+        batch_size, seq_length = input_shape
+        device = embedding_output.device
+
+        image_embeds_frozen = self.visual_encoder(image_input).last_hidden_state
+        # image_embeds_frozen = torch.ones_like(image_embeds_frozen)
+        encoder_hidden_states = image_embeds_frozen
+
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if encoder_hidden_states is not None:
+            if isinstance(encoder_hidden_states, list):
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
+            else:
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+            if isinstance(encoder_attention_mask, list):
+                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
+            elif encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+            else:
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.qformer_config.num_hidden_layers)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            query_length=query_length,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = sequence_output[:, 0, :]
+
+        if not return_dict:
+            return self.proj_layer(sequence_output[:, :query_length, :])
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
diff --git a/diffusers/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py b/diffusers/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py
new file mode 100644
index 0000000000000000000000000000000000000000..d29dddf64b01e46c964814385f900da80d1746bb
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py
@@ -0,0 +1,223 @@
+# Copyright 2024 Salesforce.com, inc.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import CLIPPreTrainedModel
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.models.clip.configuration_clip import CLIPTextConfig
+from transformers.models.clip.modeling_clip import CLIPEncoder
+
+
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+# This is a modified version of the CLIPTextModel from transformers.models.clip.modeling_clip
+# Which allows for an extra input of "context embeddings", which are the query embeddings used in Qformer
+# They pass through the clip model, along with the text embeddings, and interact with them using self attention
+class ContextCLIPTextModel(CLIPPreTrainedModel):
+    config_class = CLIPTextConfig
+
+    _no_split_modules = ["CLIPEncoderLayer"]
+
+    def __init__(self, config: CLIPTextConfig):
+        super().__init__(config)
+        self.text_model = ContextCLIPTextTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        ctx_embeddings: torch.Tensor = None,
+        ctx_begin_pos: list = None,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        return self.text_model(
+            ctx_embeddings=ctx_embeddings,
+            ctx_begin_pos=ctx_begin_pos,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+class ContextCLIPTextTransformer(nn.Module):
+    def __init__(self, config: CLIPTextConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+        self.embeddings = ContextCLIPTextEmbeddings(config)
+        self.encoder = CLIPEncoder(config)
+        self.final_layer_norm = nn.LayerNorm(embed_dim)
+
+    def forward(
+        self,
+        ctx_embeddings: torch.Tensor,
+        ctx_begin_pos: list,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        r"""
+        Returns:
+
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None:
+            raise ValueError("You have to specify either input_ids")
+
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            ctx_embeddings=ctx_embeddings,
+            ctx_begin_pos=ctx_begin_pos,
+        )
+
+        bsz, seq_len = input_shape
+        if ctx_embeddings is not None:
+            seq_len += ctx_embeddings.size(1)
+        # CLIP's text model uses causal mask, prepare it here.
+        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+        causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
+            hidden_states.device
+        )
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+        # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+        pooled_output = last_hidden_state[
+            torch.arange(last_hidden_state.shape[0], device=input_ids.device),
+            input_ids.to(torch.int).argmax(dim=-1),
+        ]
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
+        mask.fill_(torch.tensor(torch.finfo(dtype).min))
+        mask.triu_(1)  # zero out the lower diagonal
+        mask = mask.unsqueeze(1)  # expand mask
+        return mask
+
+
+class ContextCLIPTextEmbeddings(nn.Module):
+    def __init__(self, config: CLIPTextConfig):
+        super().__init__()
+        embed_dim = config.hidden_size
+
+        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
+        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+
+    def forward(
+        self,
+        ctx_embeddings: torch.Tensor,
+        ctx_begin_pos: list,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if ctx_embeddings is None:
+            ctx_len = 0
+        else:
+            ctx_len = ctx_embeddings.shape[1]
+
+        seq_length = (input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]) + ctx_len
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        if inputs_embeds is None:
+            inputs_embeds = self.token_embedding(input_ids)
+
+            # for each input embeddings, add the ctx embeddings at the correct position
+            input_embeds_ctx = []
+            bsz = inputs_embeds.shape[0]
+
+            if ctx_embeddings is not None:
+                for i in range(bsz):
+                    cbp = ctx_begin_pos[i]
+
+                    prefix = inputs_embeds[i, :cbp]
+                    # remove the special token embedding
+                    suffix = inputs_embeds[i, cbp:]
+
+                    input_embeds_ctx.append(torch.cat([prefix, ctx_embeddings[i], suffix], dim=0))
+
+                inputs_embeds = torch.stack(input_embeds_ctx, dim=0)
+
+        position_embeddings = self.position_embedding(position_ids)
+        embeddings = inputs_embeds + position_embeddings
+
+        return embeddings
diff --git a/diffusers/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py b/diffusers/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff23247b5f813d854369f68ac6c512c5f33ac19b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py
@@ -0,0 +1,348 @@
+# Copyright 2024 Salesforce.com, inc.
+# Copyright 2024 The HuggingFace Team. All rights reserved.#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPTokenizer
+
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...schedulers import PNDMScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from .blip_image_processing import BlipImageProcessor
+from .modeling_blip2 import Blip2QFormerModel
+from .modeling_ctx_clip import ContextCLIPTextModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers.pipelines import BlipDiffusionPipeline
+        >>> from diffusers.utils import load_image
+        >>> import torch
+
+        >>> blip_diffusion_pipe = BlipDiffusionPipeline.from_pretrained(
+        ...     "Salesforce/blipdiffusion", torch_dtype=torch.float16
+        ... ).to("cuda")
+
+
+        >>> cond_subject = "dog"
+        >>> tgt_subject = "dog"
+        >>> text_prompt_input = "swimming underwater"
+
+        >>> cond_image = load_image(
+        ...     "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/dog.jpg"
+        ... )
+        >>> guidance_scale = 7.5
+        >>> num_inference_steps = 25
+        >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate"
+
+
+        >>> output = blip_diffusion_pipe(
+        ...     text_prompt_input,
+        ...     cond_image,
+        ...     cond_subject,
+        ...     tgt_subject,
+        ...     guidance_scale=guidance_scale,
+        ...     num_inference_steps=num_inference_steps,
+        ...     neg_prompt=negative_prompt,
+        ...     height=512,
+        ...     width=512,
+        ... ).images
+        >>> output[0].save("image.png")
+        ```
+"""
+
+
+class BlipDiffusionPipeline(DiffusionPipeline):
+    """
+    Pipeline for Zero-Shot Subject Driven Generation using Blip Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        tokenizer ([`CLIPTokenizer`]):
+            Tokenizer for the text encoder
+        text_encoder ([`ContextCLIPTextModel`]):
+            Text encoder to encode the text prompt
+        vae ([`AutoencoderKL`]):
+            VAE model to map the latents to the image
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        scheduler ([`PNDMScheduler`]):
+             A scheduler to be used in combination with `unet` to generate image latents.
+        qformer ([`Blip2QFormerModel`]):
+            QFormer model to get multi-modal embeddings from the text and image.
+        image_processor ([`BlipImageProcessor`]):
+            Image Processor to preprocess and postprocess the image.
+        ctx_begin_pos (int, `optional`, defaults to 2):
+            Position of the context token in the text encoder.
+    """
+
+    model_cpu_offload_seq = "qformer->text_encoder->unet->vae"
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        text_encoder: ContextCLIPTextModel,
+        vae: AutoencoderKL,
+        unet: UNet2DConditionModel,
+        scheduler: PNDMScheduler,
+        qformer: Blip2QFormerModel,
+        image_processor: BlipImageProcessor,
+        ctx_begin_pos: int = 2,
+        mean: List[float] = None,
+        std: List[float] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            scheduler=scheduler,
+            qformer=qformer,
+            image_processor=image_processor,
+        )
+        self.register_to_config(ctx_begin_pos=ctx_begin_pos, mean=mean, std=std)
+
+    def get_query_embeddings(self, input_image, src_subject):
+        return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False)
+
+    # from the original Blip Diffusion code, speciefies the target subject and augments the prompt by repeating it
+    def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20):
+        rv = []
+        for prompt, tgt_subject in zip(prompts, tgt_subjects):
+            prompt = f"a {tgt_subject} {prompt.strip()}"
+            # a trick to amplify the prompt
+            rv.append(", ".join([prompt] * int(prompt_strength * prompt_reps)))
+
+        return rv
+
+    # Copied from diffusers.pipelines.consistency_models.pipeline_consistency_models.ConsistencyModelPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels, height, width)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def encode_prompt(self, query_embeds, prompt, device=None):
+        device = device or self._execution_device
+
+        # embeddings for prompt, with query_embeds as context
+        max_len = self.text_encoder.text_model.config.max_position_embeddings
+        max_len -= self.qformer.config.num_query_tokens
+
+        tokenized_prompt = self.tokenizer(
+            prompt,
+            padding="max_length",
+            truncation=True,
+            max_length=max_len,
+            return_tensors="pt",
+        ).to(device)
+
+        batch_size = query_embeds.shape[0]
+        ctx_begin_pos = [self.config.ctx_begin_pos] * batch_size
+
+        text_embeddings = self.text_encoder(
+            input_ids=tokenized_prompt.input_ids,
+            ctx_embeddings=query_embeds,
+            ctx_begin_pos=ctx_begin_pos,
+        )[0]
+
+        return text_embeddings
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: List[str],
+        reference_image: PIL.Image.Image,
+        source_subject_category: List[str],
+        target_subject_category: List[str],
+        latents: Optional[torch.Tensor] = None,
+        guidance_scale: float = 7.5,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        neg_prompt: Optional[str] = "",
+        prompt_strength: float = 1.0,
+        prompt_reps: int = 20,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`List[str]`):
+                The prompt or prompts to guide the image generation.
+            reference_image (`PIL.Image.Image`):
+                The reference image to condition the generation on.
+            source_subject_category (`List[str]`):
+                The source subject category.
+            target_subject_category (`List[str]`):
+                The target subject category.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by random sampling.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            height (`int`, *optional*, defaults to 512):
+                The height of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            neg_prompt (`str`, *optional*, defaults to ""):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_strength (`float`, *optional*, defaults to 1.0):
+                The strength of the prompt. Specifies the number of times the prompt is repeated along with prompt_reps
+                to amplify the prompt.
+            prompt_reps (`int`, *optional*, defaults to 20):
+                The number of times the prompt is repeated along with prompt_strength to amplify the prompt.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        device = self._execution_device
+
+        reference_image = self.image_processor.preprocess(
+            reference_image, image_mean=self.config.mean, image_std=self.config.std, return_tensors="pt"
+        )["pixel_values"]
+        reference_image = reference_image.to(device)
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        if isinstance(source_subject_category, str):
+            source_subject_category = [source_subject_category]
+        if isinstance(target_subject_category, str):
+            target_subject_category = [target_subject_category]
+
+        batch_size = len(prompt)
+
+        prompt = self._build_prompt(
+            prompts=prompt,
+            tgt_subjects=target_subject_category,
+            prompt_strength=prompt_strength,
+            prompt_reps=prompt_reps,
+        )
+        query_embeds = self.get_query_embeddings(reference_image, source_subject_category)
+        text_embeddings = self.encode_prompt(query_embeds, prompt, device)
+        do_classifier_free_guidance = guidance_scale > 1.0
+        if do_classifier_free_guidance:
+            max_length = self.text_encoder.text_model.config.max_position_embeddings
+
+            uncond_input = self.tokenizer(
+                [neg_prompt] * batch_size,
+                padding="max_length",
+                max_length=max_length,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(
+                input_ids=uncond_input.input_ids.to(device),
+                ctx_embeddings=None,
+            )[0]
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        scale_down_factor = 2 ** (len(self.unet.config.block_out_channels) - 1)
+        latents = self.prepare_latents(
+            batch_size=batch_size,
+            num_channels=self.unet.config.in_channels,
+            height=height // scale_down_factor,
+            width=width // scale_down_factor,
+            generator=generator,
+            latents=latents,
+            dtype=self.unet.dtype,
+            device=device,
+        )
+        # set timesteps
+        extra_set_kwargs = {}
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            do_classifier_free_guidance = guidance_scale > 1.0
+
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            noise_pred = self.unet(
+                latent_model_input,
+                timestep=t,
+                encoder_hidden_states=text_embeddings,
+                down_block_additional_residuals=None,
+                mid_block_additional_residual=None,
+            )["sample"]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            latents = self.scheduler.step(
+                noise_pred,
+                t,
+                latents,
+            )["prev_sample"]
+
+        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/cobra/__init__.py b/diffusers/src/diffusers/pipelines/cobra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..20ff6d866a18e69bee0d5b35e040d9e322fbfc2c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/cobra/__init__.py
@@ -0,0 +1,52 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_cobra_pixart"] = ["CobraPixArtAlphaPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_cobra_pixart import (
+            ASPECT_RATIO_256_BIN,
+            ASPECT_RATIO_512_BIN,
+            ASPECT_RATIO_1024_BIN,
+            CobraPixArtAlphaPipeline,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/cobra/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/pipelines/cobra/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d8fc7186d1392a168156238d2009187c0b47edb
Binary files /dev/null and b/diffusers/src/diffusers/pipelines/cobra/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/pipelines/cobra/__pycache__/pipeline_cobra_pixart.cpython-311.pyc b/diffusers/src/diffusers/pipelines/cobra/__pycache__/pipeline_cobra_pixart.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55a0511983e2b30bab6ad3d93cbb23adbf1abd38
Binary files /dev/null and b/diffusers/src/diffusers/pipelines/cobra/__pycache__/pipeline_cobra_pixart.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/pipelines/cobra/pipeline_cobra_pixart.py b/diffusers/src/diffusers/pipelines/cobra/pipeline_cobra_pixart.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e228537e2c5e3a46681448ce9a3d343f56c9032
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/cobra/pipeline_cobra_pixart.py
@@ -0,0 +1,859 @@
+# Copyright 2024 PixArt-Alpha Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+try:
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
+except:
+    print('torch_npu not found')
+import html
+import inspect
+import re
+import numpy as np
+from PIL import Image
+import urllib.parse as ul
+from typing import Callable, List, Optional, Tuple, Union
+from einops import rearrange
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...image_processor import PixArtImageProcessor
+from ...models import AutoencoderKL, CausalSparseDiTModel, CausalSparseDiTControlModel
+from ...schedulers import DPMSolverMultistepScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    deprecate,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from PIL import Image
+from torchvision import transforms
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import PixArtAlphaPipeline
+
+        >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too.
+        >>> pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
+        >>> # Enable memory optimizations.
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "A small cactus with a happy face in the Sahara desert."
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+ASPECT_RATIO_1024_BIN = {
+    "0.25": [512.0, 2048.0],
+    "0.28": [512.0, 1856.0],
+    "0.32": [576.0, 1792.0],
+    "0.33": [576.0, 1728.0],
+    "0.35": [576.0, 1664.0],
+    "0.4": [640.0, 1600.0],
+    "0.42": [640.0, 1536.0],
+    "0.48": [704.0, 1472.0],
+    "0.5": [704.0, 1408.0],
+    "0.52": [704.0, 1344.0],
+    "0.57": [768.0, 1344.0],
+    "0.6": [768.0, 1280.0],
+    "0.68": [832.0, 1216.0],
+    "0.72": [832.0, 1152.0],
+    "0.78": [896.0, 1152.0],
+    "0.82": [896.0, 1088.0],
+    "0.88": [960.0, 1088.0],
+    "0.94": [960.0, 1024.0],
+    "1.0": [1024.0, 1024.0],
+    "1.07": [1024.0, 960.0],
+    "1.13": [1088.0, 960.0],
+    "1.21": [1088.0, 896.0],
+    "1.29": [1152.0, 896.0],
+    "1.38": [1152.0, 832.0],
+    "1.46": [1216.0, 832.0],
+    "1.67": [1280.0, 768.0],
+    "1.75": [1344.0, 768.0],
+    "2.0": [1408.0, 704.0],
+    "2.09": [1472.0, 704.0],
+    "2.4": [1536.0, 640.0],
+    "2.5": [1600.0, 640.0],
+    "3.0": [1728.0, 576.0],
+    "4.0": [2048.0, 512.0],
+}
+
+ASPECT_RATIO_512_BIN = {
+    "0.25": [256.0, 1024.0],
+    "0.28": [256.0, 928.0],
+    "0.32": [288.0, 896.0],
+    "0.33": [288.0, 864.0],
+    "0.35": [288.0, 832.0],
+    "0.4": [320.0, 800.0],
+    "0.42": [320.0, 768.0],
+    "0.48": [352.0, 736.0],
+    "0.5": [352.0, 704.0],
+    "0.52": [352.0, 672.0],
+    "0.57": [384.0, 672.0],
+    "0.6": [384.0, 640.0],
+    "0.68": [416.0, 608.0],
+    "0.72": [416.0, 576.0],
+    "0.78": [448.0, 576.0],
+    "0.82": [448.0, 544.0],
+    "0.88": [480.0, 544.0],
+    "0.94": [480.0, 512.0],
+    "1.0": [512.0, 512.0],
+    "1.07": [512.0, 480.0],
+    "1.13": [544.0, 480.0],
+    "1.21": [544.0, 448.0],
+    "1.29": [576.0, 448.0],
+    "1.38": [576.0, 416.0],
+    "1.46": [608.0, 416.0],
+    "1.67": [640.0, 384.0],
+    "1.75": [672.0, 384.0],
+    "2.0": [704.0, 352.0],
+    "2.09": [736.0, 352.0],
+    "2.4": [768.0, 320.0],
+    "2.5": [800.0, 320.0],
+    "3.0": [864.0, 288.0],
+    "4.0": [1024.0, 256.0],
+}
+
+ASPECT_RATIO_256_BIN = {
+    "0.25": [128.0, 512.0],
+    "0.28": [128.0, 464.0],
+    "0.32": [144.0, 448.0],
+    "0.33": [144.0, 432.0],
+    "0.35": [144.0, 416.0],
+    "0.4": [160.0, 400.0],
+    "0.42": [160.0, 384.0],
+    "0.48": [176.0, 368.0],
+    "0.5": [176.0, 352.0],
+    "0.52": [176.0, 336.0],
+    "0.57": [192.0, 336.0],
+    "0.6": [192.0, 320.0],
+    "0.68": [208.0, 304.0],
+    "0.72": [208.0, 288.0],
+    "0.78": [224.0, 288.0],
+    "0.82": [224.0, 272.0],
+    "0.88": [240.0, 272.0],
+    "0.94": [240.0, 256.0],
+    "1.0": [256.0, 256.0],
+    "1.07": [256.0, 240.0],
+    "1.13": [272.0, 240.0],
+    "1.21": [272.0, 224.0],
+    "1.29": [288.0, 224.0],
+    "1.38": [288.0, 208.0],
+    "1.46": [304.0, 208.0],
+    "1.67": [320.0, 192.0],
+    "1.75": [336.0, 192.0],
+    "2.0": [352.0, 176.0],
+    "2.09": [368.0, 176.0],
+    "2.4": [384.0, 160.0],
+    "2.5": [400.0, 160.0],
+    "3.0": [432.0, 144.0],
+    "4.0": [512.0, 128.0],
+}
+
+def mask_to_tensor(PIL_image, normalize=True):
+    """
+    将图像转换为形状为 (1, 1, H, W) 的 PyTorch 张量。
+
+    参数:
+        image_path (str): 图像文件路径。
+        normalize (bool): 是否将像素值归一化到 [0, 1]。默认为 True。
+
+    返回:
+        torch.Tensor: 形状为 (1, 1, H, W) 的张量。
+    """
+    # 打开图像并转换为灰度图像
+    image = PIL_image.convert('L')
+
+    # 将图像转换为 NumPy 数组
+    image_array = np.array(image)
+
+    # 调整数组形状为 (1, 1, H, W)
+    image_array = image_array[np.newaxis, np.newaxis, :, :]
+
+    # 将 NumPy 数组转换为 PyTorch 张量
+    image_tensor = torch.from_numpy(image_array)
+
+    # 如果需要将像素值归一化到 [0, 1]，可以除以 255
+    if normalize:
+        image_tensor = image_tensor.float() / 255.0
+
+    return image_tensor
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class CobraPixArtAlphaPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using PixArt-Alpha.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`T5EncoderModel`]):
+            Frozen text-encoder. PixArt-Alpha uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`T5Tokenizer`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        transformer ([`CausalDiT`]):
+            A text conditioned `CausalDiT` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    model_cpu_offload_seq = "transformer->vae"
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        transformer: CausalSparseDiTModel,
+        controlnet: CausalSparseDiTControlModel,
+        scheduler: DPMSolverMultistepScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae, transformer=transformer, scheduler=scheduler, controlnet = controlnet
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+    
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+    ):
+        image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    def check_inputs(
+        self,
+        height,
+        width,
+        callback_steps,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            # print('generator', generator)
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+        # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents_ref(self, batch_size, num_refs, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_refs,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            # print('generator', generator)
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        num_inference_steps: int = 20,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        clean_caption: bool = True,
+        max_sequence_length: int = 120,
+        cond_input: PipelineImageInput = None,
+        cond_refs: list = None,
+        hint_mask: PipelineImageInput = None,
+        hint_color: PipelineImageInput = None,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 4.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The width in pixels of the generated image.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not
+                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            use_resolution_binning (`bool` defaults to `True`):
+                If set to `True`, the requested height and width are first mapped to the closest resolutions using
+                `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
+                the requested resolution. Useful for generating non-square images.
+            max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+        # 1. Check inputs. Raise error if not correct
+        # height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        # width = width or self.transformer.config.sample_size * self.vae_scale_factor
+        # if use_resolution_binning:
+        #     if self.transformer.config.sample_size == 128:
+        #         aspect_ratio_bin = ASPECT_RATIO_1024_BIN
+        #     elif self.transformer.config.sample_size == 64:
+        #         aspect_ratio_bin = ASPECT_RATIO_512_BIN
+        #     elif self.transformer.config.sample_size == 32:
+        #         aspect_ratio_bin = ASPECT_RATIO_256_BIN
+        #     else:
+        #         raise ValueError("Invalid sample size")
+        #     orig_height, orig_width = height, width
+        #     # height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
+        #     height,width = orig_height,orig_width
+        width, height = cond_input.size
+        hint_width, hint_height = hint_color.size
+        if hint_width != width or hint_height != height:
+            raise ValueError(f"Width and height of hint_color must be the same as cond_input, but got {hint_width} and {hint_height} for cond_input with size {width} and {height}.")
+        for tmp_i in range(len(cond_refs)):
+            if cond_refs[tmp_i]!=[]:
+                width_ref, height_ref = cond_refs[tmp_i][0].size
+                break
+        if width_ref*2 != width or height_ref*2 != height:
+            raise ValueError(f"Width and height of cond_refs must be twice the size of cond_input, but got {width_ref} and {height_ref} for cond_input with size {width} and {height}.")
+
+        self.check_inputs(
+            height,
+            width,
+            callback_steps,
+        )
+
+        # 2. Default height and width to transformer
+        batch_size = 1
+        n_ref_idx0 = len(cond_refs[0])
+        n_ref_idx1 = len(cond_refs[1])
+        n_ref_idx2 = len(cond_refs[2])
+        n_ref_idx3 = len(cond_refs[3])
+        cond_refs_idx0 = cond_refs[0]
+        cond_refs_idx1 = cond_refs[1]
+        cond_refs_idx2 = cond_refs[2]
+        cond_refs_idx3 = cond_refs[3]
+        N_ref = n_ref_idx0 + n_ref_idx1 + n_ref_idx2 + n_ref_idx3
+        num_ref_list = [n_ref_idx0, n_ref_idx1, n_ref_idx2, n_ref_idx3]
+        print('num_ref_list',num_ref_list)
+        num_images_per_prompt = 1
+
+        device = self._execution_device
+
+        prompt_embeds = torch.load('/mnt/workspace/zhuangjunhao/PixArt_RAG/causaldit_train/prompt_tensor/prompt_embeds.pt').unsqueeze(0).repeat(batch_size * num_images_per_prompt, 1, 1)
+        prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype, device=device)
+        prompt_attention_mask = torch.load('/mnt/workspace/zhuangjunhao/PixArt_RAG/causaldit_train/prompt_tensor/prompt_attention_mask.pt').unsqueeze(0).repeat(batch_size * num_images_per_prompt,1)
+        prompt_attention_mask = prompt_attention_mask.to(dtype=self.transformer.dtype, device=device)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = False
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            latent_channels,
+            height,
+            width,
+            self.transformer.dtype,
+            device,
+            generator,
+            # latents,
+        )
+        latents_ref = self.prepare_latents_ref(
+            batch_size * num_images_per_prompt,
+            N_ref,
+            latent_channels,
+            height_ref,
+            width_ref,
+            self.transformer.dtype,
+            device,
+            generator,
+        )
+
+
+        cond_input = self.prepare_image(
+            image=cond_input,
+            width=width,
+            height=height,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=self.controlnet.dtype,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+
+        hint_color = self.prepare_image(
+            image=hint_color,
+            width=width,
+            height=height,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=self.controlnet.dtype,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+
+        cond_refs_idx0 = [self.prepare_image(
+            image=cond_ref,
+            width=width_ref,
+            height=height_ref,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=self.controlnet.dtype,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        ) for cond_ref in cond_refs_idx0]
+        if len(cond_refs_idx0) > 0: 
+            cond_refs_idx0 = torch.cat(cond_refs_idx0, dim=0)
+        else:
+            cond_refs_idx0 = torch.zeros(0,0,0,0).to(dtype=self.controlnet.dtype, device=device)
+        cond_refs_idx1 = [self.prepare_image(
+            image=cond_ref,
+            width=width_ref,
+            height=height_ref,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=self.controlnet.dtype,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        ) for cond_ref in cond_refs_idx1]
+        if len(cond_refs_idx1) > 0:
+            cond_refs_idx1 = torch.cat(cond_refs_idx1, dim=0)
+        else:
+            cond_refs_idx1 = torch.zeros(0,0,0,0).to(dtype=self.controlnet.dtype, device=device)
+        cond_refs_idx2 = [self.prepare_image(
+            image=cond_ref,
+            width=width_ref,
+            height=height_ref,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=self.controlnet.dtype,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        ) for cond_ref in cond_refs_idx2]
+        if len(cond_refs_idx2) > 0:
+            cond_refs_idx2 = torch.cat(cond_refs_idx2, dim=0)
+        else:
+            cond_refs_idx2 = torch.zeros(0,0,0,0).to(dtype=self.controlnet.dtype, device=device)
+        cond_refs_idx3 = [self.prepare_image(
+            image=cond_ref,
+            width=width_ref,
+            height=height_ref,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=self.controlnet.dtype,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        ) for cond_ref in cond_refs_idx3]
+        if len(cond_refs_idx3) > 0:
+            cond_refs_idx3 = torch.cat(cond_refs_idx3, dim=0)
+        else:
+            cond_refs_idx3 = torch.zeros(0,0,0,0).to(dtype=self.controlnet.dtype, device=device)
+
+        all_cond_refs = []
+        if num_ref_list[0] != 0:
+            all_cond_refs.append(cond_refs_idx0)
+        if num_ref_list[1] != 0:
+            all_cond_refs.append(cond_refs_idx1)
+        if num_ref_list[2] != 0:
+            all_cond_refs.append(cond_refs_idx2)
+        if num_ref_list[3] != 0:
+            all_cond_refs.append(cond_refs_idx3)
+        cond_refs = torch.cat(all_cond_refs, dim=0)
+        # print('cond_refs',cond_refs.shape)
+
+        hint_mask = mask_to_tensor(hint_mask).to(dtype=self.controlnet.dtype, device=device)
+        # print('pipeline hint_mask',torch.max(hint_mask),torch.min(hint_mask))
+        # hint_mask_pil = transforms.ToPILImage()(hint_mask[0,:,:,:])
+        # hint_mask_pil.save('hint_mask.png')
+
+        
+
+
+
+        height, width = cond_input.shape[-2:]
+        # print(self.vae.dtype, self.controlnet.dtype, cond_image.dtype)
+        cond_input_latent = self.vae.encode(cond_input.to(dtype = self.vae.dtype)).latent_dist.sample() * self.vae.config.scaling_factor
+        cond_refs_latent=self.vae.encode(cond_refs.to(dtype = self.vae.dtype)).latent_dist.sample() * self.vae.config.scaling_factor
+        hint_color_latent = self.vae.encode(hint_color.to(dtype = self.vae.dtype)).latent_dist.sample() * self.vae.config.scaling_factor
+
+
+        cond_refs_latent = cond_refs_latent.unsqueeze(0) # 1 n_ref c h w
+        # print('cond_refs_latent',cond_refs_latent.shape)
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Prepare micro-conditions.
+        added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+        # if self.transformer.config.sample_size == 128:
+        add_width = width*2
+        resolution = torch.tensor([height, add_width]).repeat(batch_size * num_images_per_prompt, 1)
+        aspect_ratio = torch.tensor([float(height / add_width)]).repeat(batch_size * num_images_per_prompt, 1)
+        resolution = resolution.to(dtype=prompt_embeds.dtype, device=device)
+        aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device)
+
+        added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}
+        # added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+        # 7. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+
+        K_cache = None
+        V_cache = None
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                
+                no_cache = K_cache is None and V_cache is None
+
+                latent_model_input = latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            
+
+                current_timestep = t
+                if not torch.is_tensor(current_timestep):
+                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                    # This would be a good case for the `match` statement (Python 3.10+)
+                    is_mps = latent_model_input.device.type == "mps"
+                    if isinstance(current_timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
+                elif len(current_timestep.shape) == 0:
+                    current_timestep = current_timestep[None].to(latent_model_input.device)
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+                control_input = torch.concat([latent_model_input, cond_input_latent, hint_color_latent, hint_mask],1)
+
+                control_list = self.controlnet(
+                    control_input,
+                    encoder_hidden_states=None,
+                    encoder_attention_mask=None,
+                    timestep=current_timestep,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                if no_cache:
+                    noise_pred, K_cache, V_cache = self.transformer(
+                        latent_model_input.to(dtype=self.transformer.dtype),
+                        cond_refs_latent.to(dtype=self.transformer.dtype),
+                        n_ref_lists = [num_ref_list],
+                        encoder_hidden_states=prompt_embeds.to(dtype=self.transformer.dtype),
+                        encoder_attention_mask=prompt_attention_mask,
+                        timestep=current_timestep,
+                        added_cond_kwargs=added_cond_kwargs,
+                        control_list = control_list,
+                        return_dict=False,
+                        K_cache=None,
+                        V_cache=None,
+                    )
+                else:
+                    noise_pred, _, _ = self.transformer(
+                        latent_model_input.to(dtype=self.transformer.dtype),
+                        None,
+                        n_ref_lists = None,
+                        encoder_hidden_states=prompt_embeds.to(dtype=self.transformer.dtype),
+                        encoder_attention_mask=prompt_attention_mask,
+                        timestep=current_timestep,
+                        added_cond_kwargs=added_cond_kwargs,
+                        control_list = control_list,
+                        return_dict=False,
+                        K_cache=K_cache,
+                        V_cache=V_cache,
+                    )
+
+                # learned sigma
+                if self.transformer.config.out_channels // 2 == latent_channels:
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+                else:
+                    noise_pred = noise_pred
+
+                # compute previous image: x_t -> x_t-1
+                if num_inference_steps == 1:
+
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).pred_original_sample
+
+                else:
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+        ref_out_idx = 0
+        if not output_type == "latent":
+            image = self.vae.decode(latents.to(dtype = self.vae.dtype) / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        return (image,)
diff --git a/diffusers/src/diffusers/pipelines/cogvideo/__init__.py b/diffusers/src/diffusers/pipelines/cogvideo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd60fcea999462c7574ac1c8995473b9fc41ab84
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/cogvideo/__init__.py
@@ -0,0 +1,52 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
+    _import_structure["pipeline_cogvideox_image2video"] = ["CogVideoXImageToVideoPipeline"]
+    _import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_cogvideox import CogVideoXPipeline
+        from .pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
+        from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py b/diffusers/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py
new file mode 100644
index 0000000000000000000000000000000000000000..82839ffd2c92d7bfb05d2a4b9efa05f23b3ec556
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py
@@ -0,0 +1,744 @@
+# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
+from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
+from ...models.embeddings import get_3d_rotary_pos_embed
+from ...pipelines.pipeline_utils import DiffusionPipeline
+from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
+from ...utils import logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from .pipeline_output import CogVideoXPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import torch
+        >>> from diffusers import CogVideoXPipeline
+        >>> from diffusers.utils import export_to_video
+
+        >>> # Models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
+        >>> pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to("cuda")
+        >>> prompt = (
+        ...     "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
+        ...     "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
+        ...     "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
+        ...     "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
+        ...     "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
+        ...     "atmosphere of this unique musical performance."
+        ... )
+        >>> video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+        >>> export_to_video(video, "output.mp4", fps=8)
+        ```
+"""
+
+
+# Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
+def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
+    tw = tgt_width
+    th = tgt_height
+    h, w = src
+    r = h / w
+    if r > (th / tw):
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
+    r"""
+    Pipeline for text-to-video generation using CogVideoX.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        text_encoder ([`T5EncoderModel`]):
+            Frozen text-encoder. CogVideoX uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
+            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`T5Tokenizer`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        transformer ([`CogVideoXTransformer3DModel`]):
+            A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+    """
+
+    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+    ]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        vae: AutoencoderKLCogVideoX,
+        transformer: CogVideoXTransformer3DModel,
+        scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
+        )
+        self.vae_scale_factor_spatial = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.vae_scale_factor_temporal = (
+            self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
+        )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
+
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_videos_per_prompt: int = 1,
+        max_sequence_length: int = 226,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        _, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        do_classifier_free_guidance: bool = True,
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 226,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                Whether to use classifier free guidance or not.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            device: (`torch.device`, *optional*):
+                torch device
+            dtype: (`torch.dtype`, *optional*):
+                torch dtype
+        """
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=negative_prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        shape = (
+            batch_size,
+            (num_frames - 1) // self.vae_scale_factor_temporal + 1,
+            num_channels_latents,
+            height // self.vae_scale_factor_spatial,
+            width // self.vae_scale_factor_spatial,
+        )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
+        latents = 1 / self.vae_scaling_factor_image * latents
+
+        frames = self.vae.decode(latents).sample
+        return frames
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        callback_on_step_end_tensor_inputs,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def fuse_qkv_projections(self) -> None:
+        r"""Enables fused QKV projections."""
+        self.fusing_transformer = True
+        self.transformer.fuse_qkv_projections()
+
+    def unfuse_qkv_projections(self) -> None:
+        r"""Disable QKV projection fusion if enabled."""
+        if not self.fusing_transformer:
+            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
+        else:
+            self.transformer.unfuse_qkv_projections()
+            self.fusing_transformer = False
+
+    def _prepare_rotary_positional_embeddings(
+        self,
+        height: int,
+        width: int,
+        num_frames: int,
+        device: torch.device,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+
+        grid_crops_coords = get_resize_crop_region_for_grid(
+            (grid_height, grid_width), base_size_width, base_size_height
+        )
+        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+            embed_dim=self.transformer.config.attention_head_dim,
+            crops_coords=grid_crops_coords,
+            grid_size=(grid_height, grid_width),
+            temporal_size=num_frames,
+        )
+
+        freqs_cos = freqs_cos.to(device=device)
+        freqs_sin = freqs_sin.to(device=device)
+        return freqs_cos, freqs_sin
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 480,
+        width: int = 720,
+        num_frames: int = 49,
+        num_inference_steps: int = 50,
+        timesteps: Optional[List[int]] = None,
+        guidance_scale: float = 6,
+        use_dynamic_cfg: bool = False,
+        num_videos_per_prompt: int = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 226,
+    ) -> Union[CogVideoXPipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
+            num_frames (`int`, defaults to `48`):
+                Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
+                contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
+                num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
+                needs to be satisfied is that of divisibility mentioned above.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, defaults to `226`):
+                Maximum sequence length in encoded prompt. Must be consistent with
+                `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] or `tuple`:
+            [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        if num_frames > 49:
+            raise ValueError(
+                "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            callback_on_step_end_tensor_inputs,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._interrupt = False
+
+        # 2. Default call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            negative_prompt,
+            do_classifier_free_guidance,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            latent_channels,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Create rotary embeds if required
+        image_rotary_emb = (
+            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
+            if self.transformer.config.use_rotary_positional_embeddings
+            else None
+        )
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            # for DPM-solver++
+            old_pred_original_sample = None
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                # predict noise model_output
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep=timestep,
+                    image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
+                    return_dict=False,
+                )[0]
+                noise_pred = noise_pred.float()
+
+                # perform guidance
+                if use_dynamic_cfg:
+                    self._guidance_scale = 1 + guidance_scale * (
+                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
+                    )
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                if not isinstance(self.scheduler, CogVideoXDPMScheduler):
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                else:
+                    latents, old_pred_original_sample = self.scheduler.step(
+                        noise_pred,
+                        old_pred_original_sample,
+                        t,
+                        timesteps[i - 1] if i > 0 else None,
+                        latents,
+                        **extra_step_kwargs,
+                        return_dict=False,
+                    )
+                latents = latents.to(prompt_embeds.dtype)
+
+                # call the callback, if provided
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if not output_type == "latent":
+            video = self.decode_latents(latents)
+            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return CogVideoXPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py b/diffusers/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py
new file mode 100644
index 0000000000000000000000000000000000000000..afc11bce00d5bda8491f9044792b8621da38b07a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py
@@ -0,0 +1,825 @@
+# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import math
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import PIL
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput
+from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
+from ...models.embeddings import get_3d_rotary_pos_embed
+from ...pipelines.pipeline_utils import DiffusionPipeline
+from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from .pipeline_output import CogVideoXPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import CogVideoXImageToVideoPipeline
+        >>> from diffusers.utils import export_to_video, load_image
+
+        >>> pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+
+        >>> prompt = "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
+        ... )
+        >>> video = pipe(image, prompt, use_dynamic_cfg=True)
+        >>> export_to_video(video.frames[0], "output.mp4", fps=8)
+        ```
+"""
+
+
+# Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
+def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
+    tw = tgt_width
+    th = tgt_height
+    h, w = src
+    r = h / w
+    if r > (th / tw):
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class CogVideoXImageToVideoPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image-to-video generation using CogVideoX.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        text_encoder ([`T5EncoderModel`]):
+            Frozen text-encoder. CogVideoX uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
+            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`T5Tokenizer`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        transformer ([`CogVideoXTransformer3DModel`]):
+            A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+    """
+
+    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+    ]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        vae: AutoencoderKLCogVideoX,
+        transformer: CogVideoXTransformer3DModel,
+        scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            vae=vae,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor_spatial = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.vae_scale_factor_temporal = (
+            self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
+        )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
+
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_videos_per_prompt: int = 1,
+        max_sequence_length: int = 226,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        _, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        do_classifier_free_guidance: bool = True,
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 226,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                Whether to use classifier free guidance or not.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            device: (`torch.device`, *optional*):
+                torch device
+            dtype: (`torch.dtype`, *optional*):
+                torch dtype
+        """
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=negative_prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def prepare_latents(
+        self,
+        image: torch.Tensor,
+        batch_size: int = 1,
+        num_channels_latents: int = 16,
+        num_frames: int = 13,
+        height: int = 60,
+        width: int = 90,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+    ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        shape = (
+            batch_size,
+            num_frames,
+            num_channels_latents,
+            height // self.vae_scale_factor_spatial,
+            width // self.vae_scale_factor_spatial,
+        )
+
+        image = image.unsqueeze(2)  # [B, C, F, H, W]
+
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
+            ]
+        else:
+            image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]
+
+        image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
+        image_latents = self.vae_scaling_factor_image * image_latents
+
+        padding_shape = (
+            batch_size,
+            num_frames - 1,
+            num_channels_latents,
+            height // self.vae_scale_factor_spatial,
+            width // self.vae_scale_factor_spatial,
+        )
+        latent_padding = torch.zeros(padding_shape, device=device, dtype=dtype)
+        image_latents = torch.cat([image_latents, latent_padding], dim=1)
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents, image_latents
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
+    def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
+        latents = 1 / self.vae_scaling_factor_image * latents
+
+        frames = self.vae.decode(latents).sample
+        return frames
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, timesteps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        image,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        callback_on_step_end_tensor_inputs,
+        latents=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
+    def fuse_qkv_projections(self) -> None:
+        r"""Enables fused QKV projections."""
+        self.fusing_transformer = True
+        self.transformer.fuse_qkv_projections()
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections
+    def unfuse_qkv_projections(self) -> None:
+        r"""Disable QKV projection fusion if enabled."""
+        if not self.fusing_transformer:
+            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
+        else:
+            self.transformer.unfuse_qkv_projections()
+            self.fusing_transformer = False
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._prepare_rotary_positional_embeddings
+    def _prepare_rotary_positional_embeddings(
+        self,
+        height: int,
+        width: int,
+        num_frames: int,
+        device: torch.device,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+
+        grid_crops_coords = get_resize_crop_region_for_grid(
+            (grid_height, grid_width), base_size_width, base_size_height
+        )
+        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+            embed_dim=self.transformer.config.attention_head_dim,
+            crops_coords=grid_crops_coords,
+            grid_size=(grid_height, grid_width),
+            temporal_size=num_frames,
+        )
+
+        freqs_cos = freqs_cos.to(device=device)
+        freqs_sin = freqs_sin.to(device=device)
+        return freqs_cos, freqs_sin
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 480,
+        width: int = 720,
+        num_frames: int = 49,
+        num_inference_steps: int = 50,
+        timesteps: Optional[List[int]] = None,
+        guidance_scale: float = 6,
+        use_dynamic_cfg: bool = False,
+        num_videos_per_prompt: int = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 226,
+    ) -> Union[CogVideoXPipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image (`PipelineImageInput`):
+                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
+            num_frames (`int`, defaults to `48`):
+                Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
+                contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
+                num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
+                needs to be satisfied is that of divisibility mentioned above.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, defaults to `226`):
+                Maximum sequence length in encoded prompt. Must be consistent with
+                `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`:
+            [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        if num_frames > 49:
+            raise ValueError(
+                "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            image=image,
+            prompt=prompt,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+        self._guidance_scale = guidance_scale
+        self._interrupt = False
+
+        # 2. Default call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latents
+        image = self.video_processor.preprocess(image, height=height, width=width).to(
+            device, dtype=prompt_embeds.dtype
+        )
+
+        latent_channels = self.transformer.config.in_channels // 2
+        latents, image_latents = self.prepare_latents(
+            image,
+            batch_size * num_videos_per_prompt,
+            latent_channels,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Create rotary embeds if required
+        image_rotary_emb = (
+            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
+            if self.transformer.config.use_rotary_positional_embeddings
+            else None
+        )
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            # for DPM-solver++
+            old_pred_original_sample = None
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents
+                latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=2)
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                # predict noise model_output
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep=timestep,
+                    image_rotary_emb=image_rotary_emb,
+                    return_dict=False,
+                )[0]
+                noise_pred = noise_pred.float()
+
+                # perform guidance
+                if use_dynamic_cfg:
+                    self._guidance_scale = 1 + guidance_scale * (
+                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
+                    )
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                if not isinstance(self.scheduler, CogVideoXDPMScheduler):
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                else:
+                    latents, old_pred_original_sample = self.scheduler.step(
+                        noise_pred,
+                        old_pred_original_sample,
+                        t,
+                        timesteps[i - 1] if i > 0 else None,
+                        latents,
+                        **extra_step_kwargs,
+                        return_dict=False,
+                    )
+                latents = latents.to(prompt_embeds.dtype)
+
+                # call the callback, if provided
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if not output_type == "latent":
+            video = self.decode_latents(latents)
+            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return CogVideoXPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py b/diffusers/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py
new file mode 100644
index 0000000000000000000000000000000000000000..35f8f2fa050840c52588e606cd194fe761e065ae
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py
@@ -0,0 +1,825 @@
+# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from PIL import Image
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
+from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
+from ...models.embeddings import get_3d_rotary_pos_embed
+from ...pipelines.pipeline_utils import DiffusionPipeline
+from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
+from ...utils import logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from .pipeline_output import CogVideoXPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import torch
+        >>> from diffusers import CogVideoXDPMScheduler, CogVideoXVideoToVideoPipeline
+        >>> from diffusers.utils import export_to_video, load_video
+
+        >>> # Models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
+        >>> pipe = CogVideoXVideoToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+        >>> pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config)
+
+        >>> input_video = load_video(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hiker.mp4"
+        ... )
+        >>> prompt = (
+        ...     "An astronaut stands triumphantly at the peak of a towering mountain. Panorama of rugged peaks and "
+        ...     "valleys. Very futuristic vibe and animated aesthetic. Highlights of purple and golden colors in "
+        ...     "the scene. The sky is looks like an animated/cartoonish dream of galaxies, nebulae, stars, planets, "
+        ...     "moons, but the remainder of the scene is mostly realistic."
+        ... )
+
+        >>> video = pipe(
+        ...     video=input_video, prompt=prompt, strength=0.8, guidance_scale=6, num_inference_steps=50
+        ... ).frames[0]
+        >>> export_to_video(video, "output.mp4", fps=8)
+        ```
+"""
+
+
+# Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
+def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
+    tw = tgt_width
+    th = tgt_height
+    h, w = src
+    r = h / w
+    if r > (th / tw):
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
+    r"""
+    Pipeline for video-to-video generation using CogVideoX.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        text_encoder ([`T5EncoderModel`]):
+            Frozen text-encoder. CogVideoX uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
+            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`T5Tokenizer`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        transformer ([`CogVideoXTransformer3DModel`]):
+            A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+    """
+
+    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+    ]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        vae: AutoencoderKLCogVideoX,
+        transformer: CogVideoXTransformer3DModel,
+        scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
+        )
+
+        self.vae_scale_factor_spatial = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.vae_scale_factor_temporal = (
+            self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
+        )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
+
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_videos_per_prompt: int = 1,
+        max_sequence_length: int = 226,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        _, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        do_classifier_free_guidance: bool = True,
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 226,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                Whether to use classifier free guidance or not.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            device: (`torch.device`, *optional*):
+                torch device
+            dtype: (`torch.dtype`, *optional*):
+                torch dtype
+        """
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=negative_prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def prepare_latents(
+        self,
+        video: Optional[torch.Tensor] = None,
+        batch_size: int = 1,
+        num_channels_latents: int = 16,
+        height: int = 60,
+        width: int = 90,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.Tensor] = None,
+    ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        num_frames = (video.size(2) - 1) // self.vae_scale_factor_temporal + 1 if latents is None else latents.size(1)
+
+        shape = (
+            batch_size,
+            num_frames,
+            num_channels_latents,
+            height // self.vae_scale_factor_spatial,
+            width // self.vae_scale_factor_spatial,
+        )
+
+        if latents is None:
+            if isinstance(generator, list):
+                if len(generator) != batch_size:
+                    raise ValueError(
+                        f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                        f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
+                ]
+            else:
+                init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
+
+            init_latents = torch.cat(init_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
+            init_latents = self.vae_scaling_factor_image * init_latents
+
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
+    def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
+        latents = 1 / self.vae_scaling_factor_image * latents
+
+        frames = self.vae.decode(latents).sample
+        return frames
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, timesteps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        strength,
+        negative_prompt,
+        callback_on_step_end_tensor_inputs,
+        video=None,
+        latents=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if video is not None and latents is not None:
+            raise ValueError("Only one of `video` or `latents` should be provided")
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
+    def fuse_qkv_projections(self) -> None:
+        r"""Enables fused QKV projections."""
+        self.fusing_transformer = True
+        self.transformer.fuse_qkv_projections()
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections
+    def unfuse_qkv_projections(self) -> None:
+        r"""Disable QKV projection fusion if enabled."""
+        if not self.fusing_transformer:
+            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
+        else:
+            self.transformer.unfuse_qkv_projections()
+            self.fusing_transformer = False
+
+    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._prepare_rotary_positional_embeddings
+    def _prepare_rotary_positional_embeddings(
+        self,
+        height: int,
+        width: int,
+        num_frames: int,
+        device: torch.device,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+
+        grid_crops_coords = get_resize_crop_region_for_grid(
+            (grid_height, grid_width), base_size_width, base_size_height
+        )
+        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+            embed_dim=self.transformer.config.attention_head_dim,
+            crops_coords=grid_crops_coords,
+            grid_size=(grid_height, grid_width),
+            temporal_size=num_frames,
+        )
+
+        freqs_cos = freqs_cos.to(device=device)
+        freqs_sin = freqs_sin.to(device=device)
+        return freqs_cos, freqs_sin
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        video: List[Image.Image] = None,
+        prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 480,
+        width: int = 720,
+        num_inference_steps: int = 50,
+        timesteps: Optional[List[int]] = None,
+        strength: float = 0.8,
+        guidance_scale: float = 6,
+        use_dynamic_cfg: bool = False,
+        num_videos_per_prompt: int = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 226,
+    ) -> Union[CogVideoXPipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            video (`List[PIL.Image.Image]`):
+                The input video to condition the generation on. Must be a list of images/frames of the video.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            strength (`float`, *optional*, defaults to 0.8):
+                Higher strength leads to more differences between original video and generated video.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, defaults to `226`):
+                Maximum sequence length in encoded prompt. Must be consistent with
+                `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`:
+            [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            height=height,
+            width=width,
+            strength=strength,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            video=video,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._interrupt = False
+
+        # 2. Default call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            negative_prompt,
+            do_classifier_free_guidance,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latents
+        if latents is None:
+            video = self.video_processor.preprocess_video(video, height=height, width=width)
+            video = video.to(device=device, dtype=prompt_embeds.dtype)
+
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            video,
+            batch_size * num_videos_per_prompt,
+            latent_channels,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            latent_timestep,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Create rotary embeds if required
+        image_rotary_emb = (
+            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
+            if self.transformer.config.use_rotary_positional_embeddings
+            else None
+        )
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            # for DPM-solver++
+            old_pred_original_sample = None
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                # predict noise model_output
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep=timestep,
+                    image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
+                    return_dict=False,
+                )[0]
+                noise_pred = noise_pred.float()
+
+                # perform guidance
+                if use_dynamic_cfg:
+                    self._guidance_scale = 1 + guidance_scale * (
+                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
+                    )
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                if not isinstance(self.scheduler, CogVideoXDPMScheduler):
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                else:
+                    latents, old_pred_original_sample = self.scheduler.step(
+                        noise_pred,
+                        old_pred_original_sample,
+                        t,
+                        timesteps[i - 1] if i > 0 else None,
+                        latents,
+                        **extra_step_kwargs,
+                        return_dict=False,
+                    )
+                latents = latents.to(prompt_embeds.dtype)
+
+                # call the callback, if provided
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if not output_type == "latent":
+            video = self.decode_latents(latents)
+            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return CogVideoXPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/cogvideo/pipeline_output.py b/diffusers/src/diffusers/pipelines/cogvideo/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..3de030dd6928db49ab0bc4d11868a93ac98dea50
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/cogvideo/pipeline_output.py
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+
+import torch
+
+from diffusers.utils import BaseOutput
+
+
+@dataclass
+class CogVideoXPipelineOutput(BaseOutput):
+    r"""
+    Output class for CogVideo pipelines.
+
+    Args:
+        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+            `(batch_size, num_frames, channels, height, width)`.
+    """
+
+    frames: torch.Tensor
diff --git a/diffusers/src/diffusers/pipelines/colorflow/__init__.py b/diffusers/src/diffusers/pipelines/colorflow/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38062f9ef8a4d2e8e552d0f0a47de3143f4469d2
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/colorflow/__init__.py
@@ -0,0 +1,54 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_colorflow_pixart"] = ["ColorFlowPixArtAlphaPipeline"]
+    _import_structure["pipeline_colorflow_sd"] = ["ColorFlowSDPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_colorflow_pixart import (
+            ASPECT_RATIO_256_BIN,
+            ASPECT_RATIO_512_BIN,
+            ASPECT_RATIO_1024_BIN,
+            ColorFlowPixArtAlphaPipeline,
+        )
+        from .pipeline_colorflow_sd import ColorFlowSDPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/colorflow/pipeline_colorflow_pixart.py b/diffusers/src/diffusers/pipelines/colorflow/pipeline_colorflow_pixart.py
new file mode 100644
index 0000000000000000000000000000000000000000..68cce5402dfbd2d99f86175f6fd698518d8f5832
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/colorflow/pipeline_colorflow_pixart.py
@@ -0,0 +1,1087 @@
+# Copyright 2024 PixArt-Alpha Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...image_processor import PixArtImageProcessor
+from ...models import AutoencoderKL, PixArtTransformer2DModel, ColorGuiderPixArtModel
+from ...schedulers import DPMSolverMultistepScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    deprecate,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from PIL import Image
+from torchvision import transforms
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import PixArtAlphaPipeline
+
+        >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too.
+        >>> pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
+        >>> # Enable memory optimizations.
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "A small cactus with a happy face in the Sahara desert."
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+ASPECT_RATIO_1024_BIN = {
+    "0.25": [512.0, 2048.0],
+    "0.28": [512.0, 1856.0],
+    "0.32": [576.0, 1792.0],
+    "0.33": [576.0, 1728.0],
+    "0.35": [576.0, 1664.0],
+    "0.4": [640.0, 1600.0],
+    "0.42": [640.0, 1536.0],
+    "0.48": [704.0, 1472.0],
+    "0.5": [704.0, 1408.0],
+    "0.52": [704.0, 1344.0],
+    "0.57": [768.0, 1344.0],
+    "0.6": [768.0, 1280.0],
+    "0.68": [832.0, 1216.0],
+    "0.72": [832.0, 1152.0],
+    "0.78": [896.0, 1152.0],
+    "0.82": [896.0, 1088.0],
+    "0.88": [960.0, 1088.0],
+    "0.94": [960.0, 1024.0],
+    "1.0": [1024.0, 1024.0],
+    "1.07": [1024.0, 960.0],
+    "1.13": [1088.0, 960.0],
+    "1.21": [1088.0, 896.0],
+    "1.29": [1152.0, 896.0],
+    "1.38": [1152.0, 832.0],
+    "1.46": [1216.0, 832.0],
+    "1.67": [1280.0, 768.0],
+    "1.75": [1344.0, 768.0],
+    "2.0": [1408.0, 704.0],
+    "2.09": [1472.0, 704.0],
+    "2.4": [1536.0, 640.0],
+    "2.5": [1600.0, 640.0],
+    "3.0": [1728.0, 576.0],
+    "4.0": [2048.0, 512.0],
+}
+
+ASPECT_RATIO_512_BIN = {
+    "0.25": [256.0, 1024.0],
+    "0.28": [256.0, 928.0],
+    "0.32": [288.0, 896.0],
+    "0.33": [288.0, 864.0],
+    "0.35": [288.0, 832.0],
+    "0.4": [320.0, 800.0],
+    "0.42": [320.0, 768.0],
+    "0.48": [352.0, 736.0],
+    "0.5": [352.0, 704.0],
+    "0.52": [352.0, 672.0],
+    "0.57": [384.0, 672.0],
+    "0.6": [384.0, 640.0],
+    "0.68": [416.0, 608.0],
+    "0.72": [416.0, 576.0],
+    "0.78": [448.0, 576.0],
+    "0.82": [448.0, 544.0],
+    "0.88": [480.0, 544.0],
+    "0.94": [480.0, 512.0],
+    "1.0": [512.0, 512.0],
+    "1.07": [512.0, 480.0],
+    "1.13": [544.0, 480.0],
+    "1.21": [544.0, 448.0],
+    "1.29": [576.0, 448.0],
+    "1.38": [576.0, 416.0],
+    "1.46": [608.0, 416.0],
+    "1.67": [640.0, 384.0],
+    "1.75": [672.0, 384.0],
+    "2.0": [704.0, 352.0],
+    "2.09": [736.0, 352.0],
+    "2.4": [768.0, 320.0],
+    "2.5": [800.0, 320.0],
+    "3.0": [864.0, 288.0],
+    "4.0": [1024.0, 256.0],
+}
+
+ASPECT_RATIO_256_BIN = {
+    "0.25": [128.0, 512.0],
+    "0.28": [128.0, 464.0],
+    "0.32": [144.0, 448.0],
+    "0.33": [144.0, 432.0],
+    "0.35": [144.0, 416.0],
+    "0.4": [160.0, 400.0],
+    "0.42": [160.0, 384.0],
+    "0.48": [176.0, 368.0],
+    "0.5": [176.0, 352.0],
+    "0.52": [176.0, 336.0],
+    "0.57": [192.0, 336.0],
+    "0.6": [192.0, 320.0],
+    "0.68": [208.0, 304.0],
+    "0.72": [208.0, 288.0],
+    "0.78": [224.0, 288.0],
+    "0.82": [224.0, 272.0],
+    "0.88": [240.0, 272.0],
+    "0.94": [240.0, 256.0],
+    "1.0": [256.0, 256.0],
+    "1.07": [256.0, 240.0],
+    "1.13": [272.0, 240.0],
+    "1.21": [272.0, 224.0],
+    "1.29": [288.0, 224.0],
+    "1.38": [288.0, 208.0],
+    "1.46": [304.0, 208.0],
+    "1.67": [320.0, 192.0],
+    "1.75": [336.0, 192.0],
+    "2.0": [352.0, 176.0],
+    "2.09": [368.0, 176.0],
+    "2.4": [384.0, 160.0],
+    "2.5": [400.0, 160.0],
+    "3.0": [432.0, 144.0],
+    "4.0": [512.0, 128.0],
+}
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class ColorFlowPixArtAlphaPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using PixArt-Alpha.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`T5EncoderModel`]):
+            Frozen text-encoder. PixArt-Alpha uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`T5Tokenizer`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        transformer ([`PixArtTransformer2DModel`]):
+            A text conditioned `PixArtTransformer2DModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    _optional_components = ["tokenizer", "text_encoder"]
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        vae: AutoencoderKL,
+        transformer: PixArtTransformer2DModel,
+        colorguider: ColorGuiderPixArtModel,
+        scheduler: DPMSolverMultistepScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler, colorguider = colorguider
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str = "",
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+        max_sequence_length: int = 120,
+        **kwargs,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+                PixArt-Alpha, this should be "".
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
+                string.
+            clean_caption (`bool`, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+            max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt.
+        """
+
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # See Section 3.1. of the paper.
+        max_length = max_sequence_length
+
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because T5 can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            prompt_attention_mask = text_inputs.attention_mask
+            prompt_attention_mask = prompt_attention_mask.to(device)
+
+            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.transformer is not None:
+            dtype = self.transformer.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
+        prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            negative_prompt_attention_mask = uncond_input.attention_mask
+            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+        else:
+            negative_prompt_embeds = None
+            negative_prompt_attention_mask = None
+
+        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+    
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+    ):
+        image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        callback_steps,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: str = "",
+        num_inference_steps: int = 20,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 4.5,
+        num_images_per_prompt: Optional[int] = 1,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        clean_caption: bool = True,
+        use_resolution_binning: bool = True,
+        max_sequence_length: int = 120,
+        cond_image: PipelineImageInput = None,
+        cond_mask: PipelineImageInput = None,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 4.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The width in pixels of the generated image.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not
+                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            use_resolution_binning (`bool` defaults to `True`):
+                If set to `True`, the requested height and width are first mapped to the closest resolutions using
+                `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
+                the requested resolution. Useful for generating non-square images.
+            max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+        # 1. Check inputs. Raise error if not correct
+        # height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        # width = width or self.transformer.config.sample_size * self.vae_scale_factor
+        # if use_resolution_binning:
+        #     if self.transformer.config.sample_size == 128:
+        #         aspect_ratio_bin = ASPECT_RATIO_1024_BIN
+        #     elif self.transformer.config.sample_size == 64:
+        #         aspect_ratio_bin = ASPECT_RATIO_512_BIN
+        #     elif self.transformer.config.sample_size == 32:
+        #         aspect_ratio_bin = ASPECT_RATIO_256_BIN
+        #     else:
+        #         raise ValueError("Invalid sample size")
+        #     orig_height, orig_width = height, width
+        #     # height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
+        #     height,width = orig_height,orig_width
+        width, height = cond_image.size
+        orig_height, orig_width = height, width
+
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            callback_steps,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        )
+
+        # 2. Default height and width to transformer
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            clean_caption=clean_caption,
+            max_sequence_length=max_sequence_length,
+        )
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            latent_channels,
+            height,
+            width,
+            self.transformer.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # width, height = image.size
+
+        center_width = width // 2
+        center_height = height // 2
+        crop_width = width // 2
+        crop_height = height // 2
+
+        left = (width - crop_width) // 2
+        top = (height - crop_height) // 2
+        right = left + crop_width
+        bottom = top + crop_height
+        # 创建一个新的图像，大小与原图相同，填充为黑色
+        image_A = Image.new('RGB', (width, height), (0, 0, 0))
+
+        # 将原图粘贴到新图像的外围区域
+        image_A.paste(cond_image, (0, 0))
+
+        # 将中心区域设置为黑色
+        image_A.paste(Image.new('RGB', (crop_width, crop_height), (0, 0, 0)), (left, top))
+        # 创建一个新的图像，大小与原图相同，填充为黑色
+        image_B = Image.new('RGB', (width, height), (0, 0, 0))
+
+        # 将原图的中心区域粘贴到新图像的中心区域
+        image_B.paste(cond_image.crop((left, top, right, bottom)), (left, top))
+
+
+        cond_image = self.prepare_image(
+            image=image_A,
+            width=width,
+            height=height,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=self.colorguider.dtype,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+        image_bw = self.prepare_image(
+            image=image_B,
+            width=width,
+            height=height,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=self.colorguider.dtype,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+        original_mask = transforms.ToTensor()(cond_mask).to(device=cond_image.device, dtype=cond_image.dtype).unsqueeze(0)
+        # print(original_mask.shape)
+        original_mask=(1 - original_mask[:,0:1,:,:]).to(cond_image.dtype)
+        original_mask = torch.cat([original_mask] * 2)
+        height, width = cond_image.shape[-2:]
+        conditioning_latents = self.vae.encode(cond_image.to(dtype = self.vae.dtype)).latent_dist.sample() * self.vae.config.scaling_factor
+        conditioning_latents_bw=self.vae.encode(image_bw.to(dtype = self.vae.dtype)).latent_dist.sample() * self.vae.config.scaling_factor
+        color_mask = torch.nn.functional.interpolate(
+                    original_mask, 
+                    size=(
+                        conditioning_latents.shape[-2], 
+                        conditioning_latents.shape[-1]
+                    ),
+                    mode='bilinear'
+                )
+        # print('color_mask',color_mask)
+        # print('color_mask',color_mask[0,0,conditioning_latents.shape[-2]//2:conditioning_latents.shape[-2]//2+3,conditioning_latents.shape[-1]//2:conditioning_latents.shape[-1]//2+3])
+        # conditioning_latents = torch.concat([conditioning_latents,color_mask],1)
+        conditioning_latents = torch.concat([conditioning_latents, color_mask, conditioning_latents_bw],1)
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Prepare micro-conditions.
+        added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+        # if self.transformer.config.sample_size == 128:
+        resolution = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1)
+        aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size * num_images_per_prompt, 1)
+        resolution = resolution.to(dtype=prompt_embeds.dtype, device=device)
+        aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device)
+
+        if do_classifier_free_guidance:
+            resolution = torch.cat([resolution, resolution], dim=0)
+            aspect_ratio = torch.cat([aspect_ratio, aspect_ratio], dim=0)
+
+        added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}
+
+        # 7. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                current_timestep = t
+                if not torch.is_tensor(current_timestep):
+                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                    # This would be a good case for the `match` statement (Python 3.10+)
+                    is_mps = latent_model_input.device.type == "mps"
+                    if isinstance(current_timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
+                elif len(current_timestep.shape) == 0:
+                    current_timestep = current_timestep[None].to(latent_model_input.device)
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+
+                # predict control list
+                control_input = torch.concat([latent_model_input, conditioning_latents],1)
+                # print('control_input',control_input.shape)
+                control_list = self.colorguider(
+                    control_input,
+                    encoder_hidden_states=None,
+                    encoder_attention_mask=None,
+                    timestep=current_timestep,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # predict noise model_output
+                # print('latent_model_input',latent_model_input.shape, added_cond_kwargs)
+                noise_pred = self.transformer(
+                    latent_model_input.to(dtype=self.transformer.dtype),
+                    encoder_hidden_states=prompt_embeds.to(dtype=self.transformer.dtype),
+                    encoder_attention_mask=prompt_attention_mask,
+                    timestep=current_timestep,
+                    added_cond_kwargs=added_cond_kwargs,
+                    control_list = control_list,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # learned sigma
+                if self.transformer.config.out_channels // 2 == latent_channels:
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+                else:
+                    noise_pred = noise_pred
+
+                # compute previous image: x_t -> x_t-1
+                if num_inference_steps == 1:
+                    # For DMD one step sampling: https://arxiv.org/abs/2311.18828
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).pred_original_sample
+                else:
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents.to(dtype = self.vae.dtype) / self.vae.config.scaling_factor, return_dict=False)[0]
+            if use_resolution_binning:
+                image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/colorflow/pipeline_colorflow_sd.py b/diffusers/src/diffusers/pipelines/colorflow/pipeline_colorflow_sd.py
new file mode 100644
index 0000000000000000000000000000000000000000..34e67337a764c62e0cf2ed7a2ad6a7f00df0821e
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/colorflow/pipeline_colorflow_sd.py
@@ -0,0 +1,1304 @@
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, ColorGuiderSDModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from torchvision import transforms
+from PIL import Image
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+xxx
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class ColorFlowSDPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    LoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion with BrushNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        colorguider ([``]`):
+            Provides additional conditioning to the `unet` during the denoising process.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        colorguider: ColorGuiderSDModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            colorguider=colorguider,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            repeat_dims = [1]
+            image_embeds = []
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                    )
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                else:
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                image_embeds.append(single_image_embeds)
+
+        return image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        mask,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        colorguider_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.colorguider, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            ( isinstance(self.colorguider, ColorGuiderSDModel))
+            or is_compiled
+            and ( isinstance(self.colorguider._orig_mod, ColorGuiderSDModel))
+        ):
+            self.check_image(image, mask, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `colorguider_conditioning_scale`
+        if (
+            ( isinstance(self.colorguider, ColorGuiderSDModel))
+            or is_compiled
+            and ( isinstance(self.colorguider._orig_mod, ColorGuiderSDModel))
+        ):
+            if not isinstance(colorguider_conditioning_scale, float):
+                raise TypeError("For single colorguider: `colorguider_conditioning_scale` must be type `float`.")
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def check_image(self, image, mask, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        mask_is_pil = isinstance(mask, PIL.Image.Image)
+        mask_is_tensor = isinstance(mask, torch.Tensor)
+        mask_is_np = isinstance(mask, np.ndarray)
+        mask_is_pil_list = isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image)
+        mask_is_tensor_list = isinstance(mask, list) and isinstance(mask[0], torch.Tensor)
+        mask_is_np_list = isinstance(mask, list) and isinstance(mask[0], np.ndarray)
+
+        if (
+            not mask_is_pil
+            and not mask_is_tensor
+            and not mask_is_np
+            and not mask_is_pil_list
+            and not mask_is_tensor_list
+            and not mask_is_np_list
+        ):
+            raise TypeError(
+                f"mask must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(mask)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            noise = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = noise * self.scheduler.init_noise_sigma
+        return latents, noise
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        cond_image: PipelineImageInput = None,
+        cond_mask: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        colorguider_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The BrushNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.FloatTensor`, it is passed to BrushNet as is. `PIL.Image.Image` can also be
+                accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
+                and/or width are passed, `image` is resized accordingly. If multiple BrushNets are specified in
+                `init`, images must be passed as a list such that each element of the list can be correctly batched for
+                input to a single BrushNet. When `prompt` is a list, and if a list of images is passed for a single BrushNet,
+                each will be paired with each prompt in the `prompt` list. This also applies to multiple BrushNets,
+                where a list of image lists can be passed to batch for each prompt and each BrushNet.
+            mask (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The BrushNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.FloatTensor`, it is passed to BrushNet as is. `PIL.Image.Image` can also be
+                accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
+                and/or width are passed, `image` is resized accordingly. If multiple BrushNets are specified in
+                `init`, images must be passed as a list such that each element of the list can be correctly batched for
+                input to a single BrushNet. When `prompt` is a list, and if a list of images is passed for a single BrushNet,
+                each will be paired with each prompt in the `prompt` list. This also applies to multiple BrushNets,
+                where a list of image lists can be passed to batch for each prompt and each BrushNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
+                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
+                if `do_classifier_free_guidance` is set to `True`.
+                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            colorguider_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the BrushNet are multiplied by `colorguider_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple BrushNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The BrushNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the BrushNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the BrushNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeine class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        image = cond_image
+        mask = cond_mask
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        colorguider = self.colorguider._orig_mod if is_compiled_module(self.colorguider) else self.colorguider
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            control_guidance_start, control_guidance_end = (
+                [control_guidance_start],
+                [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            image,
+            mask,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            colorguider_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        global_pool_conditions = (
+            colorguider.config.global_pool_conditions
+            if ( isinstance(colorguider, ColorGuiderSDModel))
+            else colorguider.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare image
+        if (isinstance(colorguider, ColorGuiderSDModel)):
+            if colorguider.config.conditioning_channels==5:
+                image = self.prepare_image(
+                    image=image,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=colorguider.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+            elif colorguider.config.conditioning_channels==9:
+                width, height = image.size
+
+                center_width = width // 2
+                center_height = height // 2
+                crop_width = width // 2
+                crop_height = height // 2
+
+                left = (width - crop_width) // 2
+                top = (height - crop_height) // 2
+                right = left + crop_width
+                bottom = top + crop_height
+                # 创建一个新的图像，大小与原图相同，填充为黑色
+                image_A = Image.new('RGB', (width, height), (0, 0, 0))
+
+                # 将原图粘贴到新图像的外围区域
+                image_A.paste(image, (0, 0))
+
+                # 将中心区域设置为黑色
+                image_A.paste(Image.new('RGB', (crop_width, crop_height), (0, 0, 0)), (left, top))
+                # 创建一个新的图像，大小与原图相同，填充为黑色
+                image_B = Image.new('RGB', (width, height), (0, 0, 0))
+
+                # 将原图的中心区域粘贴到新图像的中心区域
+
+                image_B.paste(image.crop((left, top, right, bottom)), (left, top))
+
+
+                image = self.prepare_image(
+                    image=image_A,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=colorguider.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+                image_bw = self.prepare_image(
+                    image=image_B,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=colorguider.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+            # original_mask = self.prepare_image(
+            #     image=mask,
+            #     width=width,
+            #     height=height,
+            #     batch_size=batch_size * num_images_per_prompt,
+            #     num_images_per_prompt=num_images_per_prompt,
+            #     device=device,
+            #     dtype=colorguider.dtype,
+            #     do_classifier_free_guidance=self.do_classifier_free_guidance,
+            #     guess_mode=guess_mode,
+            # )
+            original_mask = transforms.ToTensor()(mask).to(device=device, dtype=image.dtype).unsqueeze(0)
+            # print(original_mask.shape)
+            original_mask=(1 - original_mask[:,0:1,:,:]).to(image.dtype)
+            original_mask = torch.cat([original_mask] * 2)
+            height, width = image.shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents, noise = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6.1 prepare condition latents
+        conditioning_latents=self.vae.encode(image).latent_dist.sample() * self.vae.config.scaling_factor
+        if colorguider.config.conditioning_channels==9:
+            conditioning_latents_bw=self.vae.encode(image_bw).latent_dist.sample() * self.vae.config.scaling_factor
+        mask = torch.nn.functional.interpolate(
+                    original_mask, 
+                    size=(
+                        conditioning_latents.shape[-2], 
+                        conditioning_latents.shape[-1]
+                    ),
+                    mode='bilinear'
+                )
+        # transforms.ToPILImage()((mask[0,:,:,:])).save(f"{device}_masks.png")
+        if colorguider.config.conditioning_channels==9:
+            conditioning_latents = torch.concat([conditioning_latents, mask, conditioning_latents_bw],1)
+        else:
+            conditioning_latents = torch.concat([conditioning_latents,mask],1)
+        # print(mask.shape, mask[0,0,32:34,32:34])
+
+
+        # 6.5 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 7.2 Create tensor stating which colorguiders to keep
+        colorguider_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            colorguider_keep.append(keeps[0] if (isinstance(colorguider, ColorGuiderSDModel)) else keeps)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        is_unet_compiled = is_compiled_module(self.unet)
+        is_colorguider_compiled = is_compiled_module(self.colorguider)
+        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Relevant thread:
+                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
+                if (is_unet_compiled and is_colorguider_compiled) and is_torch_higher_equal_2_1:
+                    torch._inductor.cudagraph_mark_step_begin()
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # colorguider(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer BrushNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    colorguider_prompt_embeds = prompt_embeds.chunk(2)[1]
+                else:
+                    control_model_input = latent_model_input
+                    colorguider_prompt_embeds = prompt_embeds
+
+                if isinstance(colorguider_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(colorguider_conditioning_scale, colorguider_keep[i])]
+                else:
+                    colorguider_cond_scale = colorguider_conditioning_scale
+                    if isinstance(colorguider_cond_scale, list):
+                        colorguider_cond_scale = colorguider_cond_scale[0]
+                    cond_scale = colorguider_cond_scale * colorguider_keep[i]
+
+                down_block_res_samples, mid_block_res_sample, up_block_res_samples = self.colorguider(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=colorguider_prompt_embeds,
+                    colorguider_cond=conditioning_latents,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+                # mean_list = [down_block_res_sample.abs().mean() for down_block_res_sample in down_block_res_samples]
+                # print(len(mean_list), mean_list)
+                # print(mid_block_res_sample.abs().mean())
+                # mean_list = [down_block_res_sample.abs().mean() for down_block_res_sample in up_block_res_samples]
+                # print(len(mean_list), mean_list)
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infered BrushNet only for the conditional batch.
+                    # To apply the output of BrushNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+                    up_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in up_block_res_samples]
+                # print('cfg',guess_mode,self.do_classifier_free_guidance)
+
+                # predict the noise residual
+                # unet_input = torch.concat([latent_model_input, conditioning_latents],1)
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_add_samples=down_block_res_samples,
+                    mid_block_add_sample=mid_block_res_sample,
+                    up_block_add_samples=up_block_res_samples,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and colorguider
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.colorguider.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/consistency_models/__init__.py b/diffusers/src/diffusers/pipelines/consistency_models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..162d91c010acf95aa2daf87c51ab1e0c68361fd5
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/consistency_models/__init__.py
@@ -0,0 +1,24 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    _LazyModule,
+)
+
+
+_import_structure = {
+    "pipeline_consistency_models": ["ConsistencyModelPipeline"],
+}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_consistency_models import ConsistencyModelPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py b/diffusers/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2f67a6989175a5a31419fc7d8bd324f8f2099c2
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py
@@ -0,0 +1,275 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, List, Optional, Union
+
+import torch
+
+from ...models import UNet2DModel
+from ...schedulers import CMStochasticIterativeScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+
+        >>> from diffusers import ConsistencyModelPipeline
+
+        >>> device = "cuda"
+        >>> # Load the cd_imagenet64_l2 checkpoint.
+        >>> model_id_or_path = "openai/diffusers-cd_imagenet64_l2"
+        >>> pipe = ConsistencyModelPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
+        >>> pipe.to(device)
+
+        >>> # Onestep Sampling
+        >>> image = pipe(num_inference_steps=1).images[0]
+        >>> image.save("cd_imagenet64_l2_onestep_sample.png")
+
+        >>> # Onestep sampling, class-conditional image generation
+        >>> # ImageNet-64 class label 145 corresponds to king penguins
+        >>> image = pipe(num_inference_steps=1, class_labels=145).images[0]
+        >>> image.save("cd_imagenet64_l2_onestep_sample_penguin.png")
+
+        >>> # Multistep sampling, class-conditional image generation
+        >>> # Timesteps can be explicitly specified; the particular timesteps below are from the original GitHub repo:
+        >>> # https://github.com/openai/consistency_models/blob/main/scripts/launch.sh#L77
+        >>> image = pipe(num_inference_steps=None, timesteps=[22, 0], class_labels=145).images[0]
+        >>> image.save("cd_imagenet64_l2_multistep_sample_penguin.png")
+        ```
+"""
+
+
+class ConsistencyModelPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for unconditional or class-conditional image generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Currently only
+            compatible with [`CMStochasticIterativeScheduler`].
+    """
+
+    model_cpu_offload_seq = "unet"
+
+    def __init__(self, unet: UNet2DModel, scheduler: CMStochasticIterativeScheduler) -> None:
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+        )
+
+        self.safety_checker = None
+
+    def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels, height, width)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Follows diffusers.VaeImageProcessor.postprocess
+    def postprocess_image(self, sample: torch.Tensor, output_type: str = "pil"):
+        if output_type not in ["pt", "np", "pil"]:
+            raise ValueError(
+                f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']"
+            )
+
+        # Equivalent to diffusers.VaeImageProcessor.denormalize
+        sample = (sample / 2 + 0.5).clamp(0, 1)
+        if output_type == "pt":
+            return sample
+
+        # Equivalent to diffusers.VaeImageProcessor.pt_to_numpy
+        sample = sample.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "np":
+            return sample
+
+        # Output_type must be 'pil'
+        sample = self.numpy_to_pil(sample)
+        return sample
+
+    def prepare_class_labels(self, batch_size, device, class_labels=None):
+        if self.unet.config.num_class_embeds is not None:
+            if isinstance(class_labels, list):
+                class_labels = torch.tensor(class_labels, dtype=torch.int)
+            elif isinstance(class_labels, int):
+                assert batch_size == 1, "Batch size must be 1 if classes is an int"
+                class_labels = torch.tensor([class_labels], dtype=torch.int)
+            elif class_labels is None:
+                # Randomly generate batch_size class labels
+                # TODO: should use generator here? int analogue of randn_tensor is not exposed in ...utils
+                class_labels = torch.randint(0, self.unet.config.num_class_embeds, size=(batch_size,))
+            class_labels = class_labels.to(device)
+        else:
+            class_labels = None
+        return class_labels
+
+    def check_inputs(self, num_inference_steps, timesteps, latents, batch_size, img_size, callback_steps):
+        if num_inference_steps is None and timesteps is None:
+            raise ValueError("Exactly one of `num_inference_steps` or `timesteps` must be supplied.")
+
+        if num_inference_steps is not None and timesteps is not None:
+            logger.warning(
+                f"Both `num_inference_steps`: {num_inference_steps} and `timesteps`: {timesteps} are supplied;"
+                " `timesteps` will be used over `num_inference_steps`."
+            )
+
+        if latents is not None:
+            expected_shape = (batch_size, 3, img_size, img_size)
+            if latents.shape != expected_shape:
+                raise ValueError(f"The shape of latents is {latents.shape} but is expected to be {expected_shape}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        batch_size: int = 1,
+        class_labels: Optional[Union[torch.Tensor, List[int], int]] = None,
+        num_inference_steps: int = 1,
+        timesteps: List[int] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            class_labels (`torch.Tensor` or `List[int]` or `int`, *optional*):
+                Optional class labels for conditioning class-conditional consistency models. Not used if the model is
+                not class-conditional.
+            num_inference_steps (`int`, *optional*, defaults to 1):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+        # 0. Prepare call parameters
+        img_size = self.unet.config.sample_size
+        device = self._execution_device
+
+        # 1. Check inputs
+        self.check_inputs(num_inference_steps, timesteps, latents, batch_size, img_size, callback_steps)
+
+        # 2. Prepare image latents
+        # Sample image latents x_0 ~ N(0, sigma_0^2 * I)
+        sample = self.prepare_latents(
+            batch_size=batch_size,
+            num_channels=self.unet.config.in_channels,
+            height=img_size,
+            width=img_size,
+            dtype=self.unet.dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+        )
+
+        # 3. Handle class_labels for class-conditional models
+        class_labels = self.prepare_class_labels(batch_size, device, class_labels=class_labels)
+
+        # 4. Prepare timesteps
+        if timesteps is not None:
+            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
+            timesteps = self.scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        else:
+            self.scheduler.set_timesteps(num_inference_steps)
+            timesteps = self.scheduler.timesteps
+
+        # 5. Denoising loop
+        # Multistep sampling: implements Algorithm 1 in the paper
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                scaled_sample = self.scheduler.scale_model_input(sample, t)
+                model_output = self.unet(scaled_sample, t, class_labels=class_labels, return_dict=False)[0]
+
+                sample = self.scheduler.step(model_output, t, sample, generator=generator)[0]
+
+                # call the callback, if provided
+                progress_bar.update()
+                if callback is not None and i % callback_steps == 0:
+                    callback(i, t, sample)
+
+        # 6. Post-process image sample
+        image = self.postprocess_image(sample, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/controlnet/__init__.py b/diffusers/src/diffusers/pipelines/controlnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1671050c93f5d84fbf20195692d6f515b95303c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet/__init__.py
@@ -0,0 +1,80 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["multicontrolnet"] = ["MultiControlNetModel"]
+    _import_structure["pipeline_controlnet"] = ["StableDiffusionControlNetPipeline"]
+    _import_structure["pipeline_controlnet_blip_diffusion"] = ["BlipDiffusionControlNetPipeline"]
+    _import_structure["pipeline_controlnet_img2img"] = ["StableDiffusionControlNetImg2ImgPipeline"]
+    _import_structure["pipeline_controlnet_inpaint"] = ["StableDiffusionControlNetInpaintPipeline"]
+    _import_structure["pipeline_controlnet_inpaint_sd_xl"] = ["StableDiffusionXLControlNetInpaintPipeline"]
+    _import_structure["pipeline_controlnet_sd_xl"] = ["StableDiffusionXLControlNetPipeline"]
+    _import_structure["pipeline_controlnet_sd_xl_img2img"] = ["StableDiffusionXLControlNetImg2ImgPipeline"]
+try:
+    if not (is_transformers_available() and is_flax_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_flax_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects))
+else:
+    _import_structure["pipeline_flax_controlnet"] = ["FlaxStableDiffusionControlNetPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .multicontrolnet import MultiControlNetModel
+        from .pipeline_controlnet import StableDiffusionControlNetPipeline
+        from .pipeline_controlnet_blip_diffusion import BlipDiffusionControlNetPipeline
+        from .pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline
+        from .pipeline_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline
+        from .pipeline_controlnet_inpaint_sd_xl import StableDiffusionXLControlNetInpaintPipeline
+        from .pipeline_controlnet_sd_xl import StableDiffusionXLControlNetPipeline
+        from .pipeline_controlnet_sd_xl_img2img import StableDiffusionXLControlNetImg2ImgPipeline
+
+    try:
+        if not (is_transformers_available() and is_flax_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_flax_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/controlnet/multicontrolnet.py b/diffusers/src/diffusers/pipelines/controlnet/multicontrolnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3c5ec6eed0379e7d92fd92abcabfce3c466fc45
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet/multicontrolnet.py
@@ -0,0 +1,183 @@
+import os
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from ...models.controlnet import ControlNetModel, ControlNetOutput
+from ...models.modeling_utils import ModelMixin
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MultiControlNetModel(ModelMixin):
+    r"""
+    Multiple `ControlNetModel` wrapper class for Multi-ControlNet
+
+    This module is a wrapper for multiple instances of the `ControlNetModel`. The `forward()` API is designed to be
+    compatible with `ControlNetModel`.
+
+    Args:
+        controlnets (`List[ControlNetModel]`):
+            Provides additional conditioning to the unet during the denoising process. You must set multiple
+            `ControlNetModel` as a list.
+    """
+
+    def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]):
+        super().__init__()
+        self.nets = nn.ModuleList(controlnets)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        controlnet_cond: List[torch.tensor],
+        conditioning_scale: List[float],
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guess_mode: bool = False,
+        return_dict: bool = True,
+    ) -> Union[ControlNetOutput, Tuple]:
+        for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
+            down_samples, mid_sample = controlnet(
+                sample=sample,
+                timestep=timestep,
+                encoder_hidden_states=encoder_hidden_states,
+                controlnet_cond=image,
+                conditioning_scale=scale,
+                class_labels=class_labels,
+                timestep_cond=timestep_cond,
+                attention_mask=attention_mask,
+                added_cond_kwargs=added_cond_kwargs,
+                cross_attention_kwargs=cross_attention_kwargs,
+                guess_mode=guess_mode,
+                return_dict=return_dict,
+            )
+
+            # merge samples
+            if i == 0:
+                down_block_res_samples, mid_block_res_sample = down_samples, mid_sample
+            else:
+                down_block_res_samples = [
+                    samples_prev + samples_curr
+                    for samples_prev, samples_curr in zip(down_block_res_samples, down_samples)
+                ]
+                mid_block_res_sample += mid_sample
+
+        return down_block_res_samples, mid_block_res_sample
+
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        is_main_process: bool = True,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+        variant: Optional[str] = None,
+    ):
+        """
+        Save a model and its configuration file to a directory, so that it can be re-loaded using the
+        `[`~pipelines.controlnet.MultiControlNetModel.from_pretrained`]` class method.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to which to save. Will be created if it doesn't exist.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful when in distributed training like
+                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
+                the main process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
+                need to replace `torch.save` by another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+            variant (`str`, *optional*):
+                If specified, weights are saved in the format pytorch_model.<variant>.bin.
+        """
+        for idx, controlnet in enumerate(self.nets):
+            suffix = "" if idx == 0 else f"_{idx}"
+            controlnet.save_pretrained(
+                save_directory + suffix,
+                is_main_process=is_main_process,
+                save_function=save_function,
+                safe_serialization=safe_serialization,
+                variant=variant,
+            )
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs):
+        r"""
+        Instantiate a pretrained MultiControlNet model from multiple pre-trained controlnet models.
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
+        the model, you should first set it back in training mode with `model.train()`.
+
+        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
+        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
+        task.
+
+        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
+        weights are discarded.
+
+        Parameters:
+            pretrained_model_path (`os.PathLike`):
+                A path to a *directory* containing model weights saved using
+                [`~diffusers.pipelines.controlnet.MultiControlNetModel.save_pretrained`], e.g.,
+                `./my_model_directory/controlnet`.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
+                will be automatically derived from the model's weights.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn't need to be refined to each
+                parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
+                same device.
+
+                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            max_memory (`Dict`, *optional*):
+                A dictionary device identifier to maximum memory. Will default to the maximum memory available for each
+                GPU and the available CPU RAM if unset.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
+                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
+                model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
+                setting this argument to `True` will raise an error.
+            variant (`str`, *optional*):
+                If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin. `variant` is
+                ignored when using `from_flax`.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the
+                `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from
+                `safetensors` weights. If set to `False`, loading will *not* use `safetensors`.
+        """
+        idx = 0
+        controlnets = []
+
+        # load controlnet and append to list until no controlnet directory exists anymore
+        # first controlnet has to be saved under `./mydirectory/controlnet` to be compliant with `DiffusionPipeline.from_prertained`
+        # second, third, ... controlnets have to be saved under `./mydirectory/controlnet_1`, `./mydirectory/controlnet_2`, ...
+        model_path_to_load = pretrained_model_path
+        while os.path.isdir(model_path_to_load):
+            controlnet = ControlNetModel.from_pretrained(model_path_to_load, **kwargs)
+            controlnets.append(controlnet)
+
+            idx += 1
+            model_path_to_load = pretrained_model_path + f"_{idx}"
+
+        logger.info(f"{len(controlnets)} controlnets loaded from {pretrained_model_path}.")
+
+        if len(controlnets) == 0:
+            raise ValueError(
+                f"No ControlNets found under {os.path.dirname(pretrained_model_path)}. Expected at least {pretrained_model_path + '_0'}."
+            )
+
+        return cls(controlnets)
diff --git a/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b2fefe7b0a43fc55cde25504066861fb4e61be2
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet.py
@@ -0,0 +1,1348 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> import cv2
+        >>> from PIL import Image
+
+        >>> # download an image
+        >>> image = load_image(
+        ...     "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
+        ... )
+        >>> image = np.array(image)
+
+        >>> # get canny image
+        >>> image = cv2.Canny(image, 100, 200)
+        >>> image = image[:, :, None]
+        >>> image = np.concatenate([image, image, image], axis=2)
+        >>> canny_image = Image.fromarray(image)
+
+        >>> # load control net and stable diffusion v1-5
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionControlNetPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+
+        >>> # speed up diffusion process with faster scheduler and memory optimization
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> # remove following line if xformers is not installed
+        >>> pipe.enable_xformers_memory_efficient_attention()
+
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # generate image
+        >>> generator = torch.manual_seed(0)
+        >>> image = pipe(
+        ...     "futuristic-looking woman", num_inference_steps=20, generator=generator, image=canny_image
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionControlNetPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                transposed_image = [list(t) for t in zip(*image)]
+                if len(transposed_image) != len(self.controlnet.nets):
+                    raise ValueError(
+                        f"For multiple controlnets: if you pass`image` as a list of list, each sublist must have the same length as the number of controlnets, but the sublists in `image` got {len(transposed_image)} images and {len(self.controlnet.nets)} ControlNets."
+                    )
+                for image_ in transposed_image:
+                    self.check_image(image_, prompt, prompt_embeds)
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+            else:
+                for image_ in image:
+                    self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError(
+                        "A single batch of varying conditioning scale settings (e.g. [[1.0, 0.5], [0.2, 0.8]]) is not supported at the moment. "
+                        "The conditioning scale must be fixed across the batch."
+                    )
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single
+                ControlNet, each will be paired with each prompt in the `prompt` list. This also applies to multiple
+                ControlNets, where a list of image lists can be passed to batch for each prompt and each ControlNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            image,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare image
+        if isinstance(controlnet, ControlNetModel):
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+            height, width = image.shape[-2:]
+        elif isinstance(controlnet, MultiControlNetModel):
+            images = []
+
+            # Nested lists as ControlNet condition
+            if isinstance(image[0], list):
+                # Transpose the nested image list
+                image = [list(t) for t in zip(*image)]
+
+            for image_ in image:
+                image_ = self.prepare_image(
+                    image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                images.append(image_)
+
+            image = images
+            height, width = image[0].shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6.5 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 7.2 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        is_unet_compiled = is_compiled_module(self.unet)
+        is_controlnet_compiled = is_compiled_module(self.controlnet)
+        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Relevant thread:
+                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
+                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
+                    torch._inductor.cudagraph_mark_step_begin()
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..86e0ddef663e959969607ff235ef7efab877780b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py
@@ -0,0 +1,413 @@
+# Copyright 2024 Salesforce.com, inc.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPTokenizer
+
+from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
+from ...schedulers import PNDMScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..blip_diffusion.blip_image_processing import BlipImageProcessor
+from ..blip_diffusion.modeling_blip2 import Blip2QFormerModel
+from ..blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers.pipelines import BlipDiffusionControlNetPipeline
+        >>> from diffusers.utils import load_image
+        >>> from controlnet_aux import CannyDetector
+        >>> import torch
+
+        >>> blip_diffusion_pipe = BlipDiffusionControlNetPipeline.from_pretrained(
+        ...     "Salesforce/blipdiffusion-controlnet", torch_dtype=torch.float16
+        ... ).to("cuda")
+
+        >>> style_subject = "flower"
+        >>> tgt_subject = "teapot"
+        >>> text_prompt = "on a marble table"
+
+        >>> cldm_cond_image = load_image(
+        ...     "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/kettle.jpg"
+        ... ).resize((512, 512))
+        >>> canny = CannyDetector()
+        >>> cldm_cond_image = canny(cldm_cond_image, 30, 70, output_type="pil")
+        >>> style_image = load_image(
+        ...     "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/flower.jpg"
+        ... )
+        >>> guidance_scale = 7.5
+        >>> num_inference_steps = 50
+        >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate"
+
+
+        >>> output = blip_diffusion_pipe(
+        ...     text_prompt,
+        ...     style_image,
+        ...     cldm_cond_image,
+        ...     style_subject,
+        ...     tgt_subject,
+        ...     guidance_scale=guidance_scale,
+        ...     num_inference_steps=num_inference_steps,
+        ...     neg_prompt=negative_prompt,
+        ...     height=512,
+        ...     width=512,
+        ... ).images
+        >>> output[0].save("image.png")
+        ```
+"""
+
+
+class BlipDiffusionControlNetPipeline(DiffusionPipeline):
+    """
+    Pipeline for Canny Edge based Controlled subject-driven generation using Blip Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        tokenizer ([`CLIPTokenizer`]):
+            Tokenizer for the text encoder
+        text_encoder ([`ContextCLIPTextModel`]):
+            Text encoder to encode the text prompt
+        vae ([`AutoencoderKL`]):
+            VAE model to map the latents to the image
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        scheduler ([`PNDMScheduler`]):
+             A scheduler to be used in combination with `unet` to generate image latents.
+        qformer ([`Blip2QFormerModel`]):
+            QFormer model to get multi-modal embeddings from the text and image.
+        controlnet ([`ControlNetModel`]):
+            ControlNet model to get the conditioning image embedding.
+        image_processor ([`BlipImageProcessor`]):
+            Image Processor to preprocess and postprocess the image.
+        ctx_begin_pos (int, `optional`, defaults to 2):
+            Position of the context token in the text encoder.
+    """
+
+    model_cpu_offload_seq = "qformer->text_encoder->unet->vae"
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        text_encoder: ContextCLIPTextModel,
+        vae: AutoencoderKL,
+        unet: UNet2DConditionModel,
+        scheduler: PNDMScheduler,
+        qformer: Blip2QFormerModel,
+        controlnet: ControlNetModel,
+        image_processor: BlipImageProcessor,
+        ctx_begin_pos: int = 2,
+        mean: List[float] = None,
+        std: List[float] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            scheduler=scheduler,
+            qformer=qformer,
+            controlnet=controlnet,
+            image_processor=image_processor,
+        )
+        self.register_to_config(ctx_begin_pos=ctx_begin_pos, mean=mean, std=std)
+
+    def get_query_embeddings(self, input_image, src_subject):
+        return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False)
+
+    # from the original Blip Diffusion code, speciefies the target subject and augments the prompt by repeating it
+    def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20):
+        rv = []
+        for prompt, tgt_subject in zip(prompts, tgt_subjects):
+            prompt = f"a {tgt_subject} {prompt.strip()}"
+            # a trick to amplify the prompt
+            rv.append(", ".join([prompt] * int(prompt_strength * prompt_reps)))
+
+        return rv
+
+    # Copied from diffusers.pipelines.consistency_models.pipeline_consistency_models.ConsistencyModelPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels, height, width)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def encode_prompt(self, query_embeds, prompt, device=None):
+        device = device or self._execution_device
+
+        # embeddings for prompt, with query_embeds as context
+        max_len = self.text_encoder.text_model.config.max_position_embeddings
+        max_len -= self.qformer.config.num_query_tokens
+
+        tokenized_prompt = self.tokenizer(
+            prompt,
+            padding="max_length",
+            truncation=True,
+            max_length=max_len,
+            return_tensors="pt",
+        ).to(device)
+
+        batch_size = query_embeds.shape[0]
+        ctx_begin_pos = [self.config.ctx_begin_pos] * batch_size
+
+        text_embeddings = self.text_encoder(
+            input_ids=tokenized_prompt.input_ids,
+            ctx_embeddings=query_embeds,
+            ctx_begin_pos=ctx_begin_pos,
+        )[0]
+
+        return text_embeddings
+
+    # Adapted from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+    ):
+        image = self.image_processor.preprocess(
+            image,
+            size={"width": width, "height": height},
+            do_rescale=True,
+            do_center_crop=False,
+            do_normalize=False,
+            return_tensors="pt",
+        )["pixel_values"].to(device)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: List[str],
+        reference_image: PIL.Image.Image,
+        condtioning_image: PIL.Image.Image,
+        source_subject_category: List[str],
+        target_subject_category: List[str],
+        latents: Optional[torch.Tensor] = None,
+        guidance_scale: float = 7.5,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        neg_prompt: Optional[str] = "",
+        prompt_strength: float = 1.0,
+        prompt_reps: int = 20,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`List[str]`):
+                The prompt or prompts to guide the image generation.
+            reference_image (`PIL.Image.Image`):
+                The reference image to condition the generation on.
+            condtioning_image (`PIL.Image.Image`):
+                The conditioning canny edge image to condition the generation on.
+            source_subject_category (`List[str]`):
+                The source subject category.
+            target_subject_category (`List[str]`):
+                The target subject category.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by random sampling.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            height (`int`, *optional*, defaults to 512):
+                The height of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width of the generated image.
+            seed (`int`, *optional*, defaults to 42):
+                The seed to use for random generation.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            neg_prompt (`str`, *optional*, defaults to ""):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_strength (`float`, *optional*, defaults to 1.0):
+                The strength of the prompt. Specifies the number of times the prompt is repeated along with prompt_reps
+                to amplify the prompt.
+            prompt_reps (`int`, *optional*, defaults to 20):
+                The number of times the prompt is repeated along with prompt_strength to amplify the prompt.
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        device = self._execution_device
+
+        reference_image = self.image_processor.preprocess(
+            reference_image, image_mean=self.config.mean, image_std=self.config.std, return_tensors="pt"
+        )["pixel_values"]
+        reference_image = reference_image.to(device)
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        if isinstance(source_subject_category, str):
+            source_subject_category = [source_subject_category]
+        if isinstance(target_subject_category, str):
+            target_subject_category = [target_subject_category]
+
+        batch_size = len(prompt)
+
+        prompt = self._build_prompt(
+            prompts=prompt,
+            tgt_subjects=target_subject_category,
+            prompt_strength=prompt_strength,
+            prompt_reps=prompt_reps,
+        )
+        query_embeds = self.get_query_embeddings(reference_image, source_subject_category)
+        text_embeddings = self.encode_prompt(query_embeds, prompt, device)
+        # 3. unconditional embedding
+        do_classifier_free_guidance = guidance_scale > 1.0
+        if do_classifier_free_guidance:
+            max_length = self.text_encoder.text_model.config.max_position_embeddings
+
+            uncond_input = self.tokenizer(
+                [neg_prompt] * batch_size,
+                padding="max_length",
+                max_length=max_length,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(
+                input_ids=uncond_input.input_ids.to(device),
+                ctx_embeddings=None,
+            )[0]
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+        scale_down_factor = 2 ** (len(self.unet.config.block_out_channels) - 1)
+        latents = self.prepare_latents(
+            batch_size=batch_size,
+            num_channels=self.unet.config.in_channels,
+            height=height // scale_down_factor,
+            width=width // scale_down_factor,
+            generator=generator,
+            latents=latents,
+            dtype=self.unet.dtype,
+            device=device,
+        )
+        # set timesteps
+        extra_set_kwargs = {}
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        cond_image = self.prepare_control_image(
+            image=condtioning_image,
+            width=width,
+            height=height,
+            batch_size=batch_size,
+            num_images_per_prompt=1,
+            device=device,
+            dtype=self.controlnet.dtype,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            do_classifier_free_guidance = guidance_scale > 1.0
+
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            down_block_res_samples, mid_block_res_sample = self.controlnet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=text_embeddings,
+                controlnet_cond=cond_image,
+                return_dict=False,
+            )
+
+            noise_pred = self.unet(
+                latent_model_input,
+                timestep=t,
+                encoder_hidden_states=text_embeddings,
+                down_block_additional_residuals=down_block_res_samples,
+                mid_block_additional_residual=mid_block_res_sample,
+            )["sample"]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            latents = self.scheduler.step(
+                noise_pred,
+                t,
+                latents,
+            )["prev_sample"]
+        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a4f46d61990e03064f782e1818b2e01ab3c8368
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
@@ -0,0 +1,1319 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> import cv2
+        >>> from PIL import Image
+
+        >>> # download an image
+        >>> image = load_image(
+        ...     "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
+        ... )
+        >>> np_image = np.array(image)
+
+        >>> # get canny image
+        >>> np_image = cv2.Canny(np_image, 100, 200)
+        >>> np_image = np_image[:, :, None]
+        >>> np_image = np.concatenate([np_image, np_image, np_image], axis=2)
+        >>> canny_image = Image.fromarray(np_image)
+
+        >>> # load control net and stable diffusion v1-5
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+
+        >>> # speed up diffusion process with faster scheduler and memory optimization
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # generate image
+        >>> generator = torch.manual_seed(0)
+        >>> image = pipe(
+        ...     "futuristic-looking woman",
+        ...     num_inference_steps=20,
+        ...     generator=generator,
+        ...     image=image,
+        ...     control_image=canny_image,
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+def prepare_image(image):
+    if isinstance(image, torch.Tensor):
+        # Batch single image
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        image = image.to(dtype=torch.float32)
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    return image
+
+
+class StableDiffusionControlNetImg2ImgPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for image-to-image generation using Stable Diffusion with ControlNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                    raise ValueError(
+                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        control_image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The initial image to be used as the starting point for the image generation process. Can also accept
+                image latents as `image`, and if passing latents directly they are not encoded again.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            control_image,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare image
+        image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+
+        # 5. Prepare controlnet_conditioning_image
+        if isinstance(controlnet, ControlNetModel):
+            control_image = self.prepare_control_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif isinstance(controlnet, MultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                device,
+                generator,
+            )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 7.2 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f7d464f9a911b750992b1111d86d053697eb58a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
@@ -0,0 +1,1504 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This model implementation is heavily inspired by https://github.com/haofanwang/ControlNet-for-Diffusers/
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install transformers accelerate
+        >>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, DDIMScheduler
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> init_image = load_image(
+        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png"
+        ... )
+        >>> init_image = init_image.resize((512, 512))
+
+        >>> generator = torch.Generator(device="cpu").manual_seed(1)
+
+        >>> mask_image = load_image(
+        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png"
+        ... )
+        >>> mask_image = mask_image.resize((512, 512))
+
+
+        >>> def make_canny_condition(image):
+        ...     image = np.array(image)
+        ...     image = cv2.Canny(image, 100, 200)
+        ...     image = image[:, :, None]
+        ...     image = np.concatenate([image, image, image], axis=2)
+        ...     image = Image.fromarray(image)
+        ...     return image
+
+
+        >>> control_image = make_canny_condition(init_image)
+
+        >>> controlnet = ControlNetModel.from_pretrained(
+        ...     "lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16
+        ... )
+        >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+
+        >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # generate image
+        >>> image = pipe(
+        ...     "a handsome man with ray-ban sunglasses",
+        ...     num_inference_steps=20,
+        ...     generator=generator,
+        ...     eta=1.0,
+        ...     image=init_image,
+        ...     mask_image=mask_image,
+        ...     control_image=control_image,
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class StableDiffusionControlNetInpaintPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for image inpainting using Stable Diffusion with ControlNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    <Tip>
+
+    This pipeline can be used with checkpoints that have been specifically fine-tuned for inpainting
+    ([runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting)) as well as
+    default text-to-image Stable Diffusion checkpoints
+    ([runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)). Default text-to-image
+    Stable Diffusion checkpoints might be preferable for ControlNets that have been fine-tuned on those, such as
+    [lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint).
+
+    </Tip>
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+        )
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        mask_image,
+        height,
+        width,
+        callback_steps,
+        output_type,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+        padding_mask_crop=None,
+    ):
+        if height is not None and height % 8 != 0 or width is not None and width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if padding_mask_crop is not None:
+            if not isinstance(image, PIL.Image.Image):
+                raise ValueError(
+                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                )
+            if not isinstance(mask_image, PIL.Image.Image):
+                raise ValueError(
+                    f"The mask image should be a PIL image when inpainting mask crop, but is of type"
+                    f" {type(mask_image)}."
+                )
+            if output_type != "pil":
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        crops_coords,
+        resize_mode,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        ).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 4:
+                image_latents = image
+            else:
+                image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+            # if pure noise then scale the initial latents by the  Scheduler's init sigma
+            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+        else:
+            noise = latents.to(device)
+            latents = noise * self.scheduler.init_noise_sigma
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        if masked_image.shape[1] == 4:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+        return mask, masked_image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        control_image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        padding_mask_crop: Optional[int] = None,
+        strength: float = 1.0,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 0.5,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`,
+                    `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, NumPy array or tensor representing an image batch to be used as the starting point. For both
+                NumPy array and PyTorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a
+                list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a NumPy array or
+                a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`,
+                    `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, NumPy array or tensor representing an image batch to mask `image`. White pixels in the mask
+                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                single channel (luminance) before use. If it's a NumPy array or PyTorch tensor, it should contain one
+                color channel (L) instead of 3, so the expected shape for PyTorch tensor would be `(B, 1, H, W)`, `(B,
+                H, W)`, `(1, H, W)`, `(H, W)`. And for NumPy array, it would be for `(B, H, W, 1)`, `(B, H, W)`, `(H,
+                W, 1)`, or `(H, W)`.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`,
+                    `List[List[torch.Tensor]]`, or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            padding_mask_crop (`int`, *optional*, defaults to `None`):
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.5):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            control_image,
+            mask_image,
+            height,
+            width,
+            callback_steps,
+            output_type,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+            padding_mask_crop,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if padding_mask_crop is not None:
+            height, width = self.image_processor.get_default_height_width(image, height, width)
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare image
+        if isinstance(controlnet, ControlNetModel):
+            control_image = self.prepare_control_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                crops_coords=crops_coords,
+                resize_mode=resize_mode,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif isinstance(controlnet, MultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    crops_coords=crops_coords,
+                    resize_mode=resize_mode,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+        else:
+            assert False
+
+        # 4.1 Preprocess mask and image - resizes image and mask w.r.t height and width
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        mask = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+
+        masked_image = init_image * (mask < 0.5)
+        _, _, height, width = init_image.shape
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps=num_inference_steps, strength=strength, device=device
+        )
+        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 7. Prepare mask latent variables
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            self.do_classifier_free_guidance,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 7.2 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # predict the noise residual
+                if num_channels_unet == 9:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if num_channels_unet == 4:
+                    init_latents_proper = image_latents
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if padding_mask_crop is not None:
+            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..17fd2cb6c81d1182d57a776804c7a6c0bbc6f5a3
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
@@ -0,0 +1,1845 @@
+# Copyright 2024 Harutatsu Akiyama, Jinbin Bai, and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_invisible_watermark_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from .multicontrolnet import MultiControlNetModel
+
+
+if is_invisible_watermark_available():
+    from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install transformers accelerate
+        >>> from diffusers import StableDiffusionXLControlNetInpaintPipeline, ControlNetModel, DDIMScheduler
+        >>> from diffusers.utils import load_image
+        >>> from PIL import Image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> init_image = load_image(
+        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png"
+        ... )
+        >>> init_image = init_image.resize((1024, 1024))
+
+        >>> generator = torch.Generator(device="cpu").manual_seed(1)
+
+        >>> mask_image = load_image(
+        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png"
+        ... )
+        >>> mask_image = mask_image.resize((1024, 1024))
+
+
+        >>> def make_canny_condition(image):
+        ...     image = np.array(image)
+        ...     image = cv2.Canny(image, 100, 200)
+        ...     image = image[:, :, None]
+        ...     image = np.concatenate([image, image, image], axis=2)
+        ...     image = Image.fromarray(image)
+        ...     return image
+
+
+        >>> control_image = make_canny_condition(init_image)
+
+        >>> controlnet = ControlNetModel.from_pretrained(
+        ...     "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
+        ... )
+        >>> pipe = StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # generate image
+        >>> image = pipe(
+        ...     "a handsome man with ray-ban sunglasses",
+        ...     num_inference_steps=20,
+        ...     generator=generator,
+        ...     eta=1.0,
+        ...     image=init_image,
+        ...     mask_image=mask_image,
+        ...     control_image=control_image,
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class StableDiffusionXLControlNetInpaintPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    TextualInversionLoaderMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+        "mask",
+        "masked_image_latents",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        requires_aesthetics_score: bool = False,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+        feature_extractor: Optional[CLIPImageProcessor] = None,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+    ):
+        super().__init__()
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+        )
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        mask_image,
+        strength,
+        num_inference_steps,
+        callback_steps,
+        output_type,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+        padding_mask_crop=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+        if num_inference_steps is None:
+            raise ValueError("`num_inference_steps` cannot be None.")
+        elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
+            raise ValueError(
+                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
+                f" {type(num_inference_steps)}."
+            )
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if padding_mask_crop is not None:
+            if not isinstance(image, PIL.Image.Image):
+                raise ValueError(
+                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                )
+            if not isinstance(mask_image, PIL.Image.Image):
+                raise ValueError(
+                    f"The mask image should be a PIL image when inpainting mask crop, but is of type"
+                    f" {type(mask_image)}."
+                )
+            if output_type != "pil":
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        crops_coords,
+        resize_mode,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        ).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        add_noise=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 4:
+                image_latents = image
+            else:
+                image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None and add_noise:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+            # if pure noise then scale the initial latents by the  Scheduler's init sigma
+            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+        elif add_noise:
+            noise = latents.to(device)
+            latents = noise * self.scheduler.init_noise_sigma
+        else:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = image_latents.to(device)
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        dtype = image.dtype
+        if self.vae.config.force_upcast:
+            image = image.float()
+            self.vae.to(dtype=torch.float32)
+
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        if self.vae.config.force_upcast:
+            self.vae.to(dtype)
+
+        image_latents = image_latents.to(dtype)
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+
+        masked_image_latents = None
+        if masked_image is not None:
+            masked_image = masked_image.to(device=device, dtype=dtype)
+            masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+            if masked_image_latents.shape[0] < batch_size:
+                if not batch_size % masked_image_latents.shape[0] == 0:
+                    raise ValueError(
+                        "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                        f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                        " Make sure the number of images that you pass is divisible by the total requested batch size."
+                    )
+                masked_image_latents = masked_image_latents.repeat(
+                    batch_size // masked_image_latents.shape[0], 1, 1, 1
+                )
+
+            masked_image_latents = (
+                torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+            )
+
+            # aligning device to prevent device errors when concating it with the latent model input
+            masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        return mask, masked_image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+
+            timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+            return timesteps, num_inference_steps - t_start
+
+        else:
+            # Strength is irrelevant if we directly request a timestep to start at;
+            # that is, strength is determined by the denoising_start instead.
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            t_start = len(self.scheduler.timesteps) - num_inference_steps
+            timesteps = self.scheduler.timesteps[t_start:]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start)
+            return timesteps, num_inference_steps
+
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(original_size + crops_coords_top_left + (negative_aesthetic_score,))
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if (
+            expected_add_embed_dim > passed_add_embed_dim
+            and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+            )
+        elif (
+            expected_add_embed_dim < passed_add_embed_dim
+            and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+            )
+        elif expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+        return add_time_ids, add_neg_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        control_image: Union[
+            PipelineImageInput,
+            List[PipelineImageInput],
+        ] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        padding_mask_crop: Optional[int] = None,
+        strength: float = 0.9999,
+        num_inference_steps: int = 50,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        guidance_rescale: float = 0.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            padding_mask_crop (`int`, *optional*, defaults to `None`):
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
+            strength (`float`, *optional*, defaults to 0.9999):
+                Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
+                between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
+                `strength`. The number of denoising steps depends on the amount of noise initially added. When
+                `strength` is 1, added noise will be maximum and the denoising process will run for the full number of
+                iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked
+                portion of the reference `image`. Note that in the case of `denoising_start` being declared as an
+                integer, the value of `strength` will be ignored.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
+                denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
+                final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
+                forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple. `tuple. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # # 0.0 Default height and width to unet
+        # height = height or self.unet.config.sample_size * self.vae_scale_factor
+        # width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 0.1 align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            control_image,
+            mask_image,
+            strength,
+            num_inference_steps,
+            callback_steps,
+            output_type,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+            padding_mask_crop,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 3.1 Encode ip_adapter_image
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. set timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=denoising_start if denoising_value_valid(denoising_start) else None,
+        )
+        # check that number of inference steps is not < 1 - as this doesn't make sense
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+        self._num_timesteps = len(timesteps)
+
+        # 5. Preprocess mask and image - resizes image and mask w.r.t height and width
+        # 5.1 Prepare init image
+        if padding_mask_crop is not None:
+            height, width = self.image_processor.get_default_height_width(image, height, width)
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        # 5.2 Prepare control images
+        if isinstance(controlnet, ControlNetModel):
+            control_image = self.prepare_control_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                crops_coords=crops_coords,
+                resize_mode=resize_mode,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif isinstance(controlnet, MultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    crops_coords=crops_coords,
+                    resize_mode=resize_mode,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+        else:
+            raise ValueError(f"{controlnet.__class__} is not supported.")
+
+        # 5.3 Prepare mask
+        mask = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+
+        masked_image = init_image * (mask < 0.5)
+        _, _, height, width = init_image.shape
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
+        add_noise = True if denoising_start is None else False
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            add_noise=add_noise,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 7. Prepare mask latent variables
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            self.do_classifier_free_guidance,
+        )
+
+        # 8. Check that sizes of mask, masked image and latents match
+        if num_channels_unet == 9:
+            # default case for runwayml/stable-diffusion-inpainting
+            num_channels_mask = mask.shape[1]
+            num_channels_masked_image = masked_image_latents.shape[1]
+            if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+                raise ValueError(
+                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    " `pipeline.unet` or your `mask_image` or `image` input."
+                )
+        elif num_channels_unet != 4:
+            raise ValueError(
+                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+            )
+        # 8.1 Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8.2 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps if isinstance(controlnet, MultiControlNetModel) else keeps[0])
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 10. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+
+        # 11. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        if (
+            denoising_end is not None
+            and denoising_start is not None
+            and denoising_value_valid(denoising_end)
+            and denoising_value_valid(denoising_start)
+            and denoising_start >= denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {denoising_end} when using type float."
+            )
+        elif denoising_end is not None and denoising_value_valid(denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                # controlnet(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    controlnet_added_cond_kwargs = {
+                        "text_embeds": add_text_embeds.chunk(2)[1],
+                        "time_ids": add_time_ids.chunk(2)[1],
+                    }
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+                    controlnet_added_cond_kwargs = added_cond_kwargs
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                # # Resize control_image to match the size of the input to the controlnet
+                # if control_image.shape[-2:] != control_model_input.shape[-2:]:
+                #     control_image = F.interpolate(control_image, size=control_model_input.shape[-2:], mode="bilinear", align_corners=False)
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    added_cond_kwargs=controlnet_added_cond_kwargs,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                if num_channels_unet == 9:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if num_channels_unet == 4:
+                    init_latents_proper = image_latents
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # make sure the VAE is in float32 mode, as it overflows in float16
+        if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
+            self.upcast_vae()
+            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            return StableDiffusionXLPipelineOutput(images=latents)
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        if padding_mask_crop is not None:
+            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdebcdf836416d57aa738bfa163de519ee6b4625
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
@@ -0,0 +1,1587 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers.utils.import_utils import is_invisible_watermark_available
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+
+
+if is_invisible_watermark_available():
+    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+from .multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> import cv2
+        >>> from PIL import Image
+
+        >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+        >>> negative_prompt = "low quality, bad quality, sketches"
+
+        >>> # download an image
+        >>> image = load_image(
+        ...     "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+        ... )
+
+        >>> # initialize the models and pipeline
+        >>> controlnet_conditioning_scale = 0.5  # recommended for good generalization
+        >>> controlnet = ControlNetModel.from_pretrained(
+        ...     "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
+        ... )
+        >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # get canny image
+        >>> image = np.array(image)
+        >>> image = cv2.Canny(image, 100, 200)
+        >>> image = image[:, :, None]
+        >>> image = np.concatenate([image, image, image], axis=2)
+        >>> canny_image = Image.fromarray(image)
+
+        >>> # generate image
+        >>> image = pipe(
+        ...     prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLControlNetPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]):
+            Second frozen text-encoder
+            ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        tokenizer_2 ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings should always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark](https://github.com/ShieldMnt/invisible-watermark/) library to
+            watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no
+            watermarker is used.
+    """
+
+    # leave controlnet out on purpose because it iterates with unet
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "feature_extractor",
+        "image_encoder",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+    ):
+        super().__init__()
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2`
+                and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, pooled text embeddings are generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt
+                weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input
+                argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned containing the output images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            image,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            negative_pooled_prompt_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3.1 Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt,
+            prompt_2,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 3.2 Encode ip_adapter_image
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare image
+        if isinstance(controlnet, ControlNetModel):
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+            height, width = image.shape[-2:]
+        elif isinstance(controlnet, MultiControlNetModel):
+            images = []
+
+            for image_ in image:
+                image_ = self.prepare_image(
+                    image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                images.append(image_)
+
+            image = images
+            height, width = image[0].shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6.5 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 7.2 Prepare added time ids & embeddings
+        if isinstance(image, list):
+            original_size = original_size or image[0].shape[-2:]
+        else:
+            original_size = original_size or image.shape[-2:]
+        target_size = target_size or (height, width)
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+        # 8.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        is_unet_compiled = is_compiled_module(self.unet)
+        is_controlnet_compiled = is_compiled_module(self.controlnet)
+        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Relevant thread:
+                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
+                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
+                    torch._inductor.cudagraph_mark_step_begin()
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                # controlnet(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    controlnet_added_cond_kwargs = {
+                        "text_embeds": add_text_embeds.chunk(2)[1],
+                        "time_ids": add_time_ids.chunk(2)[1],
+                    }
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+                    controlnet_added_cond_kwargs = added_cond_kwargs
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    added_cond_kwargs=controlnet_added_cond_kwargs,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..af19f3c309f87e446e772806eccbbc23fc2fa93b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -0,0 +1,1656 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers.utils.import_utils import is_invisible_watermark_available
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+
+
+if is_invisible_watermark_available():
+    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+from .multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # pip install accelerate transformers safetensors diffusers
+
+        >>> import torch
+        >>> import numpy as np
+        >>> from PIL import Image
+
+        >>> from transformers import DPTImageProcessor, DPTForDepthEstimation
+        >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL
+        >>> from diffusers.utils import load_image
+
+
+        >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
+        >>> feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
+        >>> controlnet = ControlNetModel.from_pretrained(
+        ...     "diffusers/controlnet-depth-sdxl-1.0-small",
+        ...     variant="fp16",
+        ...     use_safetensors=True,
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0",
+        ...     controlnet=controlnet,
+        ...     vae=vae,
+        ...     variant="fp16",
+        ...     use_safetensors=True,
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+
+        >>> def get_depth_map(image):
+        ...     image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
+        ...     with torch.no_grad(), torch.autocast("cuda"):
+        ...         depth_map = depth_estimator(image).predicted_depth
+
+        ...     depth_map = torch.nn.functional.interpolate(
+        ...         depth_map.unsqueeze(1),
+        ...         size=(1024, 1024),
+        ...         mode="bicubic",
+        ...         align_corners=False,
+        ...     )
+        ...     depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
+        ...     depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
+        ...     depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+        ...     image = torch.cat([depth_map] * 3, dim=1)
+        ...     image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
+        ...     image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
+        ...     return image
+
+
+        >>> prompt = "A robot, 4k photo"
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... ).resize((1024, 1024))
+        >>> controlnet_conditioning_scale = 0.5  # recommended for good generalization
+        >>> depth_image = get_depth_map(image)
+
+        >>> images = pipe(
+        ...     prompt,
+        ...     image=image,
+        ...     control_image=depth_image,
+        ...     strength=0.99,
+        ...     num_inference_steps=50,
+        ...     controlnet_conditioning_scale=controlnet_conditioning_scale,
+        ... ).images
+        >>> images[0].save(f"robot_cat.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class StableDiffusionXLControlNetImg2ImgPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    FromSingleFileMixin,
+    IPAdapterMixin,
+):
+    r"""
+    Pipeline for image-to-image generation using Stable Diffusion XL with ControlNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
+            as a list, the outputs from each ControlNet are added together to create one combined additional
+            conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
+            Whether the `unet` requires an `aesthetic_score` condition to be passed during inference. Also see the
+            config of `stabilityai/stable-diffusion-xl-refiner-1-0`.
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "feature_extractor",
+        "image_encoder",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        requires_aesthetics_score: bool = False,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+    ):
+        super().__init__()
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        strength,
+        num_inference_steps,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+        if num_inference_steps is None:
+            raise ValueError("`num_inference_steps` cannot be None.")
+        elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
+            raise ValueError(
+                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
+                f" {type(num_inference_steps)}."
+            )
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.prepare_image
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.prepare_latents
+    def prepare_latents(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
+    ):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        latents_mean = latents_std = None
+        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
+
+        # Offload text encoder if `enable_model_cpu_offload` was enabled
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.text_encoder_2.to("cpu")
+            torch.cuda.empty_cache()
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                image = image.float()
+                self.vae.to(dtype=torch.float32)
+
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                    raise ValueError(
+                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+
+            init_latents = init_latents.to(dtype)
+            if latents_mean is not None and latents_std is not None:
+                latents_mean = latents_mean.to(device=device, dtype=dtype)
+                latents_std = latents_std.to(device=device, dtype=dtype)
+                init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
+            else:
+                init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        if add_noise:
+            shape = init_latents.shape
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # get latents
+            init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        negative_original_size,
+        negative_crops_coords_top_left,
+        negative_target_size,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(
+                negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+            )
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if (
+            expected_add_embed_dim > passed_add_embed_dim
+            and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+            )
+        elif (
+            expected_add_embed_dim < passed_add_embed_dim
+            and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+            )
+        elif expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+        return add_time_ids, add_neg_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        control_image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The initial image will be used as the starting point for the image generation process. Can also accept
+                image latents as `image`, if passing latents directly, it will not be encoded again.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also
+                be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
+                and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in
+                init, images must be passed as a list such that each element of the list can be correctly batched for
+                input to a single controlnet.
+            height (`int`, *optional*, defaults to the size of control_image):
+                The height in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to the size of control_image):
+                The width in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+                corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
+                you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the controlnet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the controlnet stops applying.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`
+            containing the output images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            control_image,
+            strength,
+            num_inference_steps,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3.1. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt,
+            prompt_2,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 3.2 Encode ip_adapter_image
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare image and controlnet_conditioning_image
+        image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+
+        if isinstance(controlnet, ControlNetModel):
+            control_image = self.prepare_control_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+            height, width = control_image.shape[-2:]
+        elif isinstance(controlnet, MultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+            height, width = control_image[0].shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                True,
+            )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 7.2 Prepare added time ids & embeddings
+        if isinstance(control_image, list):
+            original_size = original_size or control_image[0].shape[-2:]
+        else:
+            original_size = original_size or control_image.shape[-2:]
+        target_size = target_size or (height, width)
+
+        if negative_original_size is None:
+            negative_original_size = original_size
+        if negative_target_size is None:
+            negative_target_size = target_size
+        add_text_embeds = pooled_prompt_embeds
+
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            negative_original_size,
+            negative_crops_coords_top_left,
+            negative_target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                # controlnet(s) inference
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    control_model_input = latents
+                    control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    controlnet_added_cond_kwargs = {
+                        "text_embeds": add_text_embeds.chunk(2)[1],
+                        "time_ids": add_time_ids.chunk(2)[1],
+                    }
+                else:
+                    control_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+                    controlnet_added_cond_kwargs = added_cond_kwargs
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    added_cond_kwargs=controlnet_added_cond_kwargs,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+            return StableDiffusionXLPipelineOutput(images=image)
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py b/diffusers/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a2cc08dbb2bf18c96f5594c629c54d042c1c751
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py
@@ -0,0 +1,532 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from functools import partial
+from typing import Dict, List, Optional, Union
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict
+from flax.jax_utils import unreplicate
+from flax.training.common_utils import shard
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel
+
+from ...models import FlaxAutoencoderKL, FlaxControlNetModel, FlaxUNet2DConditionModel
+from ...schedulers import (
+    FlaxDDIMScheduler,
+    FlaxDPMSolverMultistepScheduler,
+    FlaxLMSDiscreteScheduler,
+    FlaxPNDMScheduler,
+)
+from ...utils import PIL_INTERPOLATION, logging, replace_example_docstring
+from ..pipeline_flax_utils import FlaxDiffusionPipeline
+from ..stable_diffusion import FlaxStableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker_flax import FlaxStableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Set to True to use python for loop instead of jax.fori_loop for easier debugging
+DEBUG = False
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import jax
+        >>> import numpy as np
+        >>> import jax.numpy as jnp
+        >>> from flax.jax_utils import replicate
+        >>> from flax.training.common_utils import shard
+        >>> from diffusers.utils import load_image, make_image_grid
+        >>> from PIL import Image
+        >>> from diffusers import FlaxStableDiffusionControlNetPipeline, FlaxControlNetModel
+
+
+        >>> def create_key(seed=0):
+        ...     return jax.random.PRNGKey(seed)
+
+
+        >>> rng = create_key(0)
+
+        >>> # get canny image
+        >>> canny_image = load_image(
+        ...     "https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/blog_post_cell_10_output_0.jpeg"
+        ... )
+
+        >>> prompts = "best quality, extremely detailed"
+        >>> negative_prompts = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        >>> # load control net and stable diffusion v1-5
+        >>> controlnet, controlnet_params = FlaxControlNetModel.from_pretrained(
+        ...     "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.float32
+        ... )
+        >>> pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, revision="flax", dtype=jnp.float32
+        ... )
+        >>> params["controlnet"] = controlnet_params
+
+        >>> num_samples = jax.device_count()
+        >>> rng = jax.random.split(rng, jax.device_count())
+
+        >>> prompt_ids = pipe.prepare_text_inputs([prompts] * num_samples)
+        >>> negative_prompt_ids = pipe.prepare_text_inputs([negative_prompts] * num_samples)
+        >>> processed_image = pipe.prepare_image_inputs([canny_image] * num_samples)
+
+        >>> p_params = replicate(params)
+        >>> prompt_ids = shard(prompt_ids)
+        >>> negative_prompt_ids = shard(negative_prompt_ids)
+        >>> processed_image = shard(processed_image)
+
+        >>> output = pipe(
+        ...     prompt_ids=prompt_ids,
+        ...     image=processed_image,
+        ...     params=p_params,
+        ...     prng_seed=rng,
+        ...     num_inference_steps=50,
+        ...     neg_prompt_ids=negative_prompt_ids,
+        ...     jit=True,
+        ... ).images
+
+        >>> output_images = pipe.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:])))
+        >>> output_images = make_image_grid(output_images, num_samples // 4, 4)
+        >>> output_images.save("generated_image.png")
+        ```
+"""
+
+
+class FlaxStableDiffusionControlNetPipeline(FlaxDiffusionPipeline):
+    r"""
+    Flax-based pipeline for text-to-image generation using Stable Diffusion with ControlNet Guidance.
+
+    This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`FlaxAutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.FlaxCLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`FlaxUNet2DConditionModel`]):
+            A `FlaxUNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`FlaxControlNetModel`]:
+            Provides additional conditioning to the `unet` during the denoising process.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or
+            [`FlaxDPMSolverMultistepScheduler`].
+        safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: FlaxAutoencoderKL,
+        text_encoder: FlaxCLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: FlaxUNet2DConditionModel,
+        controlnet: FlaxControlNetModel,
+        scheduler: Union[
+            FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
+        ],
+        safety_checker: FlaxStableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        super().__init__()
+        self.dtype = dtype
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+    def prepare_text_inputs(self, prompt: Union[str, List[str]]):
+        if not isinstance(prompt, (str, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="np",
+        )
+
+        return text_input.input_ids
+
+    def prepare_image_inputs(self, image: Union[Image.Image, List[Image.Image]]):
+        if not isinstance(image, (Image.Image, list)):
+            raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
+
+        if isinstance(image, Image.Image):
+            image = [image]
+
+        processed_images = jnp.concatenate([preprocess(img, jnp.float32) for img in image])
+
+        return processed_images
+
+    def _get_has_nsfw_concepts(self, features, params):
+        has_nsfw_concepts = self.safety_checker(features, params)
+        return has_nsfw_concepts
+
+    def _run_safety_checker(self, images, safety_model_params, jit=False):
+        # safety_model_params should already be replicated when jit is True
+        pil_images = [Image.fromarray(image) for image in images]
+        features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
+
+        if jit:
+            features = shard(features)
+            has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
+            has_nsfw_concepts = unshard(has_nsfw_concepts)
+            safety_model_params = unreplicate(safety_model_params)
+        else:
+            has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
+
+        images_was_copied = False
+        for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
+            if has_nsfw_concept:
+                if not images_was_copied:
+                    images_was_copied = True
+                    images = images.copy()
+
+                images[idx] = np.zeros(images[idx].shape, dtype=np.uint8)  # black image
+
+            if any(has_nsfw_concepts):
+                warnings.warn(
+                    "Potential NSFW content was detected in one or more images. A black image will be returned"
+                    " instead. Try again with a different prompt and/or seed."
+                )
+
+        return images, has_nsfw_concepts
+
+    def _generate(
+        self,
+        prompt_ids: jnp.ndarray,
+        image: jnp.ndarray,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.Array,
+        num_inference_steps: int,
+        guidance_scale: float,
+        latents: Optional[jnp.ndarray] = None,
+        neg_prompt_ids: Optional[jnp.ndarray] = None,
+        controlnet_conditioning_scale: float = 1.0,
+    ):
+        height, width = image.shape[-2:]
+        if height % 64 != 0 or width % 64 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
+
+        # get prompt text embeddings
+        prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
+
+        # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
+        # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
+        batch_size = prompt_ids.shape[0]
+
+        max_length = prompt_ids.shape[-1]
+
+        if neg_prompt_ids is None:
+            uncond_input = self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
+            ).input_ids
+        else:
+            uncond_input = neg_prompt_ids
+        negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
+        context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        image = jnp.concatenate([image] * 2)
+
+        latents_shape = (
+            batch_size,
+            self.unet.config.in_channels,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if latents is None:
+            latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+        def loop_body(step, args):
+            latents, scheduler_state = args
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            latents_input = jnp.concatenate([latents] * 2)
+
+            t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+            timestep = jnp.broadcast_to(t, latents_input.shape[0])
+
+            latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t)
+
+            down_block_res_samples, mid_block_res_sample = self.controlnet.apply(
+                {"params": params["controlnet"]},
+                jnp.array(latents_input),
+                jnp.array(timestep, dtype=jnp.int32),
+                encoder_hidden_states=context,
+                controlnet_cond=image,
+                conditioning_scale=controlnet_conditioning_scale,
+                return_dict=False,
+            )
+
+            # predict the noise residual
+            noise_pred = self.unet.apply(
+                {"params": params["unet"]},
+                jnp.array(latents_input),
+                jnp.array(timestep, dtype=jnp.int32),
+                encoder_hidden_states=context,
+                down_block_additional_residuals=down_block_res_samples,
+                mid_block_additional_residual=mid_block_res_sample,
+            ).sample
+
+            # perform guidance
+            noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
+            return latents, scheduler_state
+
+        scheduler_state = self.scheduler.set_timesteps(
+            params["scheduler"], num_inference_steps=num_inference_steps, shape=latents_shape
+        )
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * params["scheduler"].init_noise_sigma
+
+        if DEBUG:
+            # run with python for loop
+            for i in range(num_inference_steps):
+                latents, scheduler_state = loop_body(i, (latents, scheduler_state))
+        else:
+            latents, _ = jax.lax.fori_loop(0, num_inference_steps, loop_body, (latents, scheduler_state))
+
+        # scale and decode the image latents with vae
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
+
+        image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
+        return image
+
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt_ids: jnp.ndarray,
+        image: jnp.ndarray,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.Array,
+        num_inference_steps: int = 50,
+        guidance_scale: Union[float, jnp.ndarray] = 7.5,
+        latents: jnp.ndarray = None,
+        neg_prompt_ids: jnp.ndarray = None,
+        controlnet_conditioning_scale: Union[float, jnp.ndarray] = 1.0,
+        return_dict: bool = True,
+        jit: bool = False,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt_ids (`jnp.ndarray`):
+                The prompt or prompts to guide the image generation.
+            image (`jnp.ndarray`):
+                Array representing the ControlNet input condition to provide guidance to the `unet` for generation.
+            params (`Dict` or `FrozenDict`):
+                Dictionary containing the model parameters/weights.
+            prng_seed (`jax.Array`):
+                Array containing random number generator key.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            latents (`jnp.ndarray`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                array is generated by sampling using the supplied random `generator`.
+            controlnet_conditioning_scale (`float` or `jnp.ndarray`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of
+                a plain tuple.
+            jit (`bool`, defaults to `False`):
+                Whether to run `pmap` versions of the generation and safety scoring functions.
+
+                    <Tip warning={true}>
+
+                    This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a
+                    future release.
+
+                    </Tip>
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated images
+                and the second element is a list of `bool`s indicating whether the corresponding generated image
+                contains "not-safe-for-work" (nsfw) content.
+        """
+
+        height, width = image.shape[-2:]
+
+        if isinstance(guidance_scale, float):
+            # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
+            # shape information, as they may be sharded (when `jit` is `True`), or not.
+            guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0])
+            if len(prompt_ids.shape) > 2:
+                # Assume sharded
+                guidance_scale = guidance_scale[:, None]
+
+        if isinstance(controlnet_conditioning_scale, float):
+            # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
+            # shape information, as they may be sharded (when `jit` is `True`), or not.
+            controlnet_conditioning_scale = jnp.array([controlnet_conditioning_scale] * prompt_ids.shape[0])
+            if len(prompt_ids.shape) > 2:
+                # Assume sharded
+                controlnet_conditioning_scale = controlnet_conditioning_scale[:, None]
+
+        if jit:
+            images = _p_generate(
+                self,
+                prompt_ids,
+                image,
+                params,
+                prng_seed,
+                num_inference_steps,
+                guidance_scale,
+                latents,
+                neg_prompt_ids,
+                controlnet_conditioning_scale,
+            )
+        else:
+            images = self._generate(
+                prompt_ids,
+                image,
+                params,
+                prng_seed,
+                num_inference_steps,
+                guidance_scale,
+                latents,
+                neg_prompt_ids,
+                controlnet_conditioning_scale,
+            )
+
+        if self.safety_checker is not None:
+            safety_params = params["safety_checker"]
+            images_uint8_casted = (images * 255).round().astype("uint8")
+            num_devices, batch_size = images.shape[:2]
+
+            images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
+            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
+            images = np.array(images)
+
+            # block images
+            if any(has_nsfw_concept):
+                for i, is_nsfw in enumerate(has_nsfw_concept):
+                    if is_nsfw:
+                        images[i] = np.asarray(images_uint8_casted[i])
+
+            images = images.reshape(num_devices, batch_size, height, width, 3)
+        else:
+            images = np.asarray(images)
+            has_nsfw_concept = False
+
+        if not return_dict:
+            return (images, has_nsfw_concept)
+
+        return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
+
+
+# Static argnums are pipe, num_inference_steps. A change would trigger recompilation.
+# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`).
+@partial(
+    jax.pmap,
+    in_axes=(None, 0, 0, 0, 0, None, 0, 0, 0, 0),
+    static_broadcasted_argnums=(0, 5),
+)
+def _p_generate(
+    pipe,
+    prompt_ids,
+    image,
+    params,
+    prng_seed,
+    num_inference_steps,
+    guidance_scale,
+    latents,
+    neg_prompt_ids,
+    controlnet_conditioning_scale,
+):
+    return pipe._generate(
+        prompt_ids,
+        image,
+        params,
+        prng_seed,
+        num_inference_steps,
+        guidance_scale,
+        latents,
+        neg_prompt_ids,
+        controlnet_conditioning_scale,
+    )
+
+
+@partial(jax.pmap, static_broadcasted_argnums=(0,))
+def _p_get_has_nsfw_concepts(pipe, features, params):
+    return pipe._get_has_nsfw_concepts(features, params)
+
+
+def unshard(x: jnp.ndarray):
+    # einops.rearrange(x, 'd b ... -> (d b) ...')
+    num_devices, batch_size = x.shape[:2]
+    rest = x.shape[2:]
+    return x.reshape(num_devices * batch_size, *rest)
+
+
+def preprocess(image, dtype):
+    image = image.convert("RGB")
+    w, h = image.size
+    w, h = (x - x % 64 for x in (w, h))  # resize to integer multiple of 64
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+    image = jnp.array(image).astype(dtype) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    return image
diff --git a/diffusers/src/diffusers/pipelines/controlnet_hunyuandit/__init__.py b/diffusers/src/diffusers/pipelines/controlnet_hunyuandit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..34c59795de328f2fa2cbe610625d866e4710d4a2
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet_hunyuandit/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_hunyuandit_controlnet"] = ["HunyuanDiTControlNetPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_hunyuandit_controlnet import HunyuanDiTControlNetPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py b/diffusers/src/diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..10c521e8499707ba5b38054251b0bbd3c15004cd
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py
@@ -0,0 +1,1042 @@
+# Copyright 2024 HunyuanDiT Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import BertModel, BertTokenizer, CLIPImageProcessor, MT5Tokenizer, T5EncoderModel
+
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...models import AutoencoderKL, HunyuanDiT2DControlNetModel, HunyuanDiT2DModel, HunyuanDiT2DMultiControlNetModel
+from ...models.embeddings import get_2d_rotary_pos_embed
+from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        from diffusers import HunyuanDiT2DControlNetModel, HunyuanDiTControlNetPipeline
+        import torch
+
+        controlnet = HunyuanDiT2DControlNetModel.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Canny", torch_dtype=torch.float16
+        )
+
+        pipe = HunyuanDiTControlNetPipeline.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.to("cuda")
+
+        from diffusers.utils import load_image
+
+        cond_image = load_image(
+            "https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Canny/resolve/main/canny.jpg?download=true"
+        )
+
+        ## You may also use English prompt as HunyuanDiT supports both English and Chinese
+        prompt = "在夜晚的酒店门前，一座古老的中国风格的狮子雕像矗立着，它的眼睛闪烁着光芒，仿佛在守护着这座建筑。背景是夜晚的酒店前，构图方式是特写，平视，居中构图。这张照片呈现了真实摄影风格，蕴含了中国雕塑文化，同时展现了神秘氛围"
+        # prompt="At night, an ancient Chinese-style lion statue stands in front of the hotel, its eyes gleaming as if guarding the building. The background is the hotel entrance at night, with a close-up, eye-level, and centered composition. This photo presents a realistic photographic style, embodies Chinese sculpture culture, and reveals a mysterious atmosphere."
+        image = pipe(
+            prompt,
+            height=1024,
+            width=1024,
+            control_image=cond_image,
+            num_inference_steps=50,
+        ).images[0]
+        ```
+"""
+
+STANDARD_RATIO = np.array(
+    [
+        1.0,  # 1:1
+        4.0 / 3.0,  # 4:3
+        3.0 / 4.0,  # 3:4
+        16.0 / 9.0,  # 16:9
+        9.0 / 16.0,  # 9:16
+    ]
+)
+STANDARD_SHAPE = [
+    [(1024, 1024), (1280, 1280)],  # 1:1
+    [(1024, 768), (1152, 864), (1280, 960)],  # 4:3
+    [(768, 1024), (864, 1152), (960, 1280)],  # 3:4
+    [(1280, 768)],  # 16:9
+    [(768, 1280)],  # 9:16
+]
+STANDARD_AREA = [np.array([w * h for w, h in shapes]) for shapes in STANDARD_SHAPE]
+SUPPORTED_SHAPE = [
+    (1024, 1024),
+    (1280, 1280),  # 1:1
+    (1024, 768),
+    (1152, 864),
+    (1280, 960),  # 4:3
+    (768, 1024),
+    (864, 1152),
+    (960, 1280),  # 3:4
+    (1280, 768),  # 16:9
+    (768, 1280),  # 9:16
+]
+
+
+def map_to_standard_shapes(target_width, target_height):
+    target_ratio = target_width / target_height
+    closest_ratio_idx = np.argmin(np.abs(STANDARD_RATIO - target_ratio))
+    closest_area_idx = np.argmin(np.abs(STANDARD_AREA[closest_ratio_idx] - target_width * target_height))
+    width, height = STANDARD_SHAPE[closest_ratio_idx][closest_area_idx]
+    return width, height
+
+
+def get_resize_crop_region_for_grid(src, tgt_size):
+    th = tw = tgt_size
+    h, w = src
+
+    r = h / w
+
+    # resize
+    if r > 1:
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class HunyuanDiTControlNetPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for English/Chinese-to-image generation using HunyuanDiT.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    HunyuanDiT uses two text encoders: [mT5](https://huggingface.co/google/mt5-base) and [bilingual CLIP](fine-tuned by
+    ourselves)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. We use
+            `sdxl-vae-fp16-fix`.
+        text_encoder (Optional[`~transformers.BertModel`, `~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+            HunyuanDiT uses a fine-tuned [bilingual CLIP].
+        tokenizer (Optional[`~transformers.BertTokenizer`, `~transformers.CLIPTokenizer`]):
+            A `BertTokenizer` or `CLIPTokenizer` to tokenize text.
+        transformer ([`HunyuanDiT2DModel`]):
+            The HunyuanDiT model designed by Tencent Hunyuan.
+        text_encoder_2 (`T5EncoderModel`):
+            The mT5 embedder. Specifically, it is 't5-v1_1-xxl'.
+        tokenizer_2 (`MT5Tokenizer`):
+            The tokenizer for the mT5 embedder.
+        scheduler ([`DDPMScheduler`]):
+            A scheduler to be used in combination with HunyuanDiT to denoise the encoded image latents.
+        controlnet ([`HunyuanDiT2DControlNetModel`] or `List[HunyuanDiT2DControlNetModel]` or [`HunyuanDiT2DControlNetModel`]):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = [
+        "safety_checker",
+        "feature_extractor",
+        "text_encoder_2",
+        "tokenizer_2",
+        "text_encoder",
+        "tokenizer",
+    ]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "prompt_embeds_2",
+        "negative_prompt_embeds_2",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: BertModel,
+        tokenizer: BertTokenizer,
+        transformer: HunyuanDiT2DModel,
+        scheduler: DDPMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        controlnet: Union[
+            HunyuanDiT2DControlNetModel,
+            List[HunyuanDiT2DControlNetModel],
+            Tuple[HunyuanDiT2DControlNetModel],
+            HunyuanDiT2DMultiControlNetModel,
+        ],
+        text_encoder_2=T5EncoderModel,
+        tokenizer_2=MT5Tokenizer,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            text_encoder_2=text_encoder_2,
+            controlnet=controlnet,
+        )
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )
+
+    # Copied from diffusers.pipelines.hunyuandit.pipeline_hunyuandit.HunyuanDiTPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        device: torch.device = None,
+        dtype: torch.dtype = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        max_sequence_length: Optional[int] = None,
+        text_encoder_index: int = 0,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            dtype (`torch.dtype`):
+                torch dtype
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
+            max_sequence_length (`int`, *optional*): maximum sequence length to use for the prompt.
+            text_encoder_index (`int`, *optional*):
+                Index of the text encoder to use. `0` for clip and `1` for T5.
+        """
+        if dtype is None:
+            if self.text_encoder_2 is not None:
+                dtype = self.text_encoder_2.dtype
+            elif self.transformer is not None:
+                dtype = self.transformer.dtype
+            else:
+                dtype = None
+
+        if device is None:
+            device = self._execution_device
+
+        tokenizers = [self.tokenizer, self.tokenizer_2]
+        text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = tokenizers[text_encoder_index]
+        text_encoder = text_encoders[text_encoder_index]
+
+        if max_sequence_length is None:
+            if text_encoder_index == 0:
+                max_length = 77
+            if text_encoder_index == 1:
+                max_length = 256
+        else:
+            max_length = max_sequence_length
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_attention_mask = text_inputs.attention_mask.to(device)
+            prompt_embeds = text_encoder(
+                text_input_ids.to(device),
+                attention_mask=prompt_attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+            prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            negative_prompt_attention_mask = uncond_input.attention_mask.to(device)
+            negative_prompt_embeds = text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=negative_prompt_attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds, negative_prompt_embeds, prompt_attention_mask, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.hunyuandit.pipeline_hunyuandit.HunyuanDiTPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+        prompt_embeds_2=None,
+        negative_prompt_embeds_2=None,
+        prompt_attention_mask_2=None,
+        negative_prompt_attention_mask_2=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is None and prompt_embeds_2 is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds_2`. Cannot leave both `prompt` and `prompt_embeds_2` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if prompt_embeds_2 is not None and prompt_attention_mask_2 is None:
+            raise ValueError("Must provide `prompt_attention_mask_2` when specifying `prompt_embeds_2`.")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if negative_prompt_embeds_2 is not None and negative_prompt_attention_mask_2 is None:
+            raise ValueError(
+                "Must provide `negative_prompt_attention_mask_2` when specifying `negative_prompt_embeds_2`."
+            )
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if prompt_embeds_2 is not None and negative_prompt_embeds_2 is not None:
+            if prompt_embeds_2.shape != negative_prompt_embeds_2.shape:
+                raise ValueError(
+                    "`prompt_embeds_2` and `negative_prompt_embeds_2` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds_2` {prompt_embeds_2.shape} != `negative_prompt_embeds_2`"
+                    f" {negative_prompt_embeds_2.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        if isinstance(image, torch.Tensor):
+            pass
+        else:
+            image = self.image_processor.preprocess(image, height=height, width=width)
+
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 5.0,
+        control_image: PipelineImageInput = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_2: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_2: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        prompt_attention_mask_2: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask_2: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = (1024, 1024),
+        target_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        use_resolution_binning: bool = True,
+    ):
+        r"""
+        The call function to the pipeline for generation with HunyuanDiT.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`):
+                The height in pixels of the generated image.
+            width (`int`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            negative_prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
+            prompt_attention_mask_2 (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds_2` is passed directly.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
+            negative_prompt_attention_mask_2 (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds_2` is passed directly.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback_on_step_end (`Callable[[int, int, Dict], None]`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A callback function or a list of callback functions to be called at the end of each denoising step.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                A list of tensor inputs that should be passed to the callback function. If not defined, all tensor
+                inputs will be passed.
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Rescale the noise_cfg according to `guidance_rescale`. Based on findings of [Common Diffusion Noise
+                Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+            original_size (`Tuple[int, int]`, *optional*, defaults to `(1024, 1024)`):
+                The original size of the image. Used to calculate the time ids.
+            target_size (`Tuple[int, int]`, *optional*):
+                The target size of the image. Used to calculate the time ids.
+            crops_coords_top_left (`Tuple[int, int]`, *optional*, defaults to `(0, 0)`):
+                The top left coordinates of the crop. Used to calculate the time ids.
+            use_resolution_binning (`bool`, *optional*, defaults to `True`):
+                Whether to use resolution binning or not. If `True`, the input resolution will be mapped to the closest
+                standard resolution. Supported resolutions are 1024x1024, 1280x1280, 1024x768, 1152x864, 1280x960,
+                768x1024, 864x1152, 960x1280, 1280x768, and 768x1280. It is recommended to set this to `True`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. default height and width
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        height = int((height // 16) * 16)
+        width = int((width // 16) * 16)
+
+        if use_resolution_binning and (height, width) not in SUPPORTED_SHAPE:
+            width, height = map_to_standard_shapes(width, height)
+            height = int(height)
+            width = int(width)
+            logger.warning(f"Reshaped to (height, width)=({height}, {width}), Supported shapes are {SUPPORTED_SHAPE}")
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+            prompt_embeds_2,
+            negative_prompt_embeds_2,
+            prompt_attention_mask_2,
+            negative_prompt_attention_mask_2,
+            callback_on_step_end_tensor_inputs,
+        )
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            max_sequence_length=77,
+            text_encoder_index=0,
+        )
+        (
+            prompt_embeds_2,
+            negative_prompt_embeds_2,
+            prompt_attention_mask_2,
+            negative_prompt_attention_mask_2,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds_2,
+            negative_prompt_embeds=negative_prompt_embeds_2,
+            prompt_attention_mask=prompt_attention_mask_2,
+            negative_prompt_attention_mask=negative_prompt_attention_mask_2,
+            max_sequence_length=256,
+            text_encoder_index=1,
+        )
+
+        # 4. Prepare control image
+        if isinstance(self.controlnet, HunyuanDiT2DControlNetModel):
+            control_image = self.prepare_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=self.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=False,
+            )
+            height, width = control_image.shape[-2:]
+
+            control_image = self.vae.encode(control_image).latent_dist.sample()
+            control_image = control_image * self.vae.config.scaling_factor
+
+        elif isinstance(self.controlnet, HunyuanDiT2DMultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=self.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=False,
+                )
+
+                control_image_ = self.vae.encode(control_image_).latent_dist.sample()
+                control_image_ = control_image_ * self.vae.config.scaling_factor
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. create image_rotary_emb, style embedding & time ids
+        grid_height = height // 8 // self.transformer.config.patch_size
+        grid_width = width // 8 // self.transformer.config.patch_size
+        base_size = 512 // 8 // self.transformer.config.patch_size
+        grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size)
+        image_rotary_emb = get_2d_rotary_pos_embed(
+            self.transformer.inner_dim // self.transformer.num_heads, grid_crops_coords, (grid_height, grid_width)
+        )
+
+        style = torch.tensor([0], device=device)
+
+        target_size = target_size or (height, width)
+        add_time_ids = list(original_size + target_size + crops_coords_top_left)
+        add_time_ids = torch.tensor([add_time_ids], dtype=prompt_embeds.dtype)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask])
+            prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
+            prompt_attention_mask_2 = torch.cat([negative_prompt_attention_mask_2, prompt_attention_mask_2])
+            add_time_ids = torch.cat([add_time_ids] * 2, dim=0)
+            style = torch.cat([style] * 2, dim=0)
+
+        prompt_embeds = prompt_embeds.to(device=device)
+        prompt_attention_mask = prompt_attention_mask.to(device=device)
+        prompt_embeds_2 = prompt_embeds_2.to(device=device)
+        prompt_attention_mask_2 = prompt_attention_mask_2.to(device=device)
+        add_time_ids = add_time_ids.to(dtype=prompt_embeds.dtype, device=device).repeat(
+            batch_size * num_images_per_prompt, 1
+        )
+        style = style.to(device=device).repeat(batch_size * num_images_per_prompt)
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # expand scalar t to 1-D tensor to match the 1st dim of latent_model_input
+                t_expand = torch.tensor([t] * latent_model_input.shape[0], device=device).to(
+                    dtype=latent_model_input.dtype
+                )
+
+                # controlnet(s) inference
+                control_block_samples = self.controlnet(
+                    latent_model_input,
+                    t_expand,
+                    encoder_hidden_states=prompt_embeds,
+                    text_embedding_mask=prompt_attention_mask,
+                    encoder_hidden_states_t5=prompt_embeds_2,
+                    text_embedding_mask_t5=prompt_attention_mask_2,
+                    image_meta_size=add_time_ids,
+                    style=style,
+                    image_rotary_emb=image_rotary_emb,
+                    return_dict=False,
+                    controlnet_cond=control_image,
+                    conditioning_scale=controlnet_conditioning_scale,
+                )[0]
+
+                # predict the noise residual
+                noise_pred = self.transformer(
+                    latent_model_input,
+                    t_expand,
+                    encoder_hidden_states=prompt_embeds,
+                    text_embedding_mask=prompt_attention_mask,
+                    encoder_hidden_states_t5=prompt_embeds_2,
+                    text_embedding_mask_t5=prompt_attention_mask_2,
+                    image_meta_size=add_time_ids,
+                    style=style,
+                    image_rotary_emb=image_rotary_emb,
+                    return_dict=False,
+                    controlnet_block_samples=control_block_samples,
+                )[0]
+
+                noise_pred, _ = noise_pred.chunk(2, dim=1)
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    prompt_embeds_2 = callback_outputs.pop("prompt_embeds_2", prompt_embeds_2)
+                    negative_prompt_embeds_2 = callback_outputs.pop(
+                        "negative_prompt_embeds_2", negative_prompt_embeds_2
+                    )
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/controlnet_sd3/__init__.py b/diffusers/src/diffusers/pipelines/controlnet_sd3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aeb61dc8e247555e654d2f84c035dff53328ea0d
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet_sd3/__init__.py
@@ -0,0 +1,57 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_3_controlnet"] = ["StableDiffusion3ControlNetPipeline"]
+    _import_structure["pipeline_stable_diffusion_3_controlnet_inpainting"] = [
+        "StableDiffusion3ControlNetInpaintingPipeline"
+    ]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_stable_diffusion_3_controlnet import StableDiffusion3ControlNetPipeline
+        from .pipeline_stable_diffusion_3_controlnet_inpainting import StableDiffusion3ControlNetInpaintingPipeline
+
+    try:
+        if not (is_transformers_available() and is_flax_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_flax_and_transformers_objects import *  # noqa F403
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py b/diffusers/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5246027a49d23bd0e3eaa34bca674a9617f2cff
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py
@@ -0,0 +1,1097 @@
+# Copyright 2024 Stability AI, The HuggingFace Team and The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
+from ...models.transformers import SD3Transformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from ..stable_diffusion_3.pipeline_output import StableDiffusion3PipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusion3ControlNetPipeline
+        >>> from diffusers.models import SD3ControlNetModel, SD3MultiControlNetModel
+        >>> from diffusers.utils import load_image
+
+        >>> controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16)
+
+        >>> pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+        >>> control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Canny/resolve/main/canny.jpg")
+        >>> prompt = "A girl holding a sign that says InstantX"
+        >>> image = pipe(prompt, control_image=control_image, controlnet_conditioning_scale=0.7).images[0]
+        >>> image.save("sd3.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    Args:
+        transformer ([`SD3Transformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
+            with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
+            as its dimension.
+        text_encoder_2 ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        text_encoder_3 ([`T5EncoderModel`]):
+            Frozen text-encoder. Stable Diffusion 3 uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_3 (`T5TokenizerFast`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        controlnet ([`SD3ControlNetModel`] or `List[SD3ControlNetModel]` or [`SD3MultiControlNetModel`]):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
+
+    def __init__(
+        self,
+        transformer: SD3Transformer2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer_2: CLIPTokenizer,
+        text_encoder_3: T5EncoderModel,
+        tokenizer_3: T5TokenizerFast,
+        controlnet: Union[
+            SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
+        ],
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            text_encoder_3=text_encoder_3,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            tokenizer_3=tokenizer_3,
+            transformer=transformer,
+            scheduler=scheduler,
+            controlnet=controlnet,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 256,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        if self.text_encoder_3 is None:
+            return torch.zeros(
+                (
+                    batch_size * num_images_per_prompt,
+                    self.tokenizer_max_length,
+                    self.transformer.config.joint_attention_dim,
+                ),
+                device=device,
+                dtype=dtype,
+            )
+
+        text_inputs = self.tokenizer_3(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
+
+        dtype = self.text_encoder_3.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        clip_skip: Optional[int] = None,
+        clip_model_index: int = 0,
+    ):
+        device = device or self._execution_device
+
+        clip_tokenizers = [self.tokenizer, self.tokenizer_2]
+        clip_text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = clip_tokenizers[clip_model_index]
+        text_encoder = clip_text_encoders[clip_model_index]
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+        pooled_prompt_embeds = prompt_embeds[0]
+
+        if clip_skip is None:
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+        else:
+            prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds, pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        prompt_3: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        clip_skip: Optional[int] = None,
+        max_sequence_length: int = 256,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            prompt_3 = prompt_3 or prompt
+            prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3
+
+            prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=0,
+            )
+            prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                prompt=prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=1,
+            )
+            clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)
+
+            t5_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            clip_prompt_embeds = torch.nn.functional.pad(
+                clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+            )
+
+            prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+            pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+            negative_prompt_3 = negative_prompt_3 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+            negative_prompt_3 = (
+                batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3
+            )
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds(
+                negative_prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=0,
+            )
+            negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                negative_prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=1,
+            )
+            negative_clip_prompt_embeds = torch.cat([negative_prompt_embed, negative_prompt_2_embed], dim=-1)
+
+            t5_negative_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=negative_prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            negative_clip_prompt_embeds = torch.nn.functional.pad(
+                negative_clip_prompt_embeds,
+                (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]),
+            )
+
+            negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2)
+            negative_pooled_prompt_embeds = torch.cat(
+                [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        prompt_3,
+        height,
+        width,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        negative_prompt_3=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_3 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_3`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        elif prompt_3 is not None and (not isinstance(prompt_3, str) and not isinstance(prompt_3, list)):
+            raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_3 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        return latents
+
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        if isinstance(image, torch.Tensor):
+            pass
+        else:
+            image = self.image_processor.preprocess(image, height=height, width=width)
+
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        prompt_3: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 28,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        control_image: PipelineImageInput = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        controlnet_pooled_projections: Optional[torch.FloatTensor] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                will be used instead
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            controlnet_pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
+                Embeddings projected from the embeddings of controlnet input conditions.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used instead
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used instead
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(self.controlnet.nets) if isinstance(self.controlnet, SD3MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            prompt_3,
+            height,
+            width,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        dtype = self.transformer.dtype
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_3=prompt_3,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            device=device,
+            clip_skip=self.clip_skip,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+        )
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+        # 3. Prepare control image
+        if isinstance(self.controlnet, SD3ControlNetModel):
+            control_image = self.prepare_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=False,
+            )
+            height, width = control_image.shape[-2:]
+
+            control_image = self.vae.encode(control_image).latent_dist.sample()
+            control_image = control_image * self.vae.config.scaling_factor
+
+        elif isinstance(self.controlnet, SD3MultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=False,
+                )
+
+                control_image_ = self.vae.encode(control_image_).latent_dist.sample()
+                control_image_ = control_image_ * self.vae.config.scaling_factor
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+        else:
+            assert False
+
+        if controlnet_pooled_projections is None:
+            controlnet_pooled_projections = torch.zeros_like(pooled_prompt_embeds)
+        else:
+            controlnet_pooled_projections = controlnet_pooled_projections or pooled_prompt_embeds
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(self.controlnet, SD3ControlNetModel) else keeps)
+
+        # 7. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                # controlnet(s) inference
+                control_block_samples = self.controlnet(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=controlnet_pooled_projections,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    return_dict=False,
+                )[0]
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    block_controlnet_hidden_states=control_block_samples,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusion3PipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py b/diffusers/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py
new file mode 100644
index 0000000000000000000000000000000000000000..47fc6d6daf158d9576ec8308d6e441c109dbe247
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py
@@ -0,0 +1,1144 @@
+# Copyright 2024 Stability AI, The HuggingFace Team and The AlimamaCreative Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
+from ...models.transformers import SD3Transformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from ..stable_diffusion_3.pipeline_output import StableDiffusion3PipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers.utils import load_image, check_min_version
+        >>> from diffusers.pipelines import StableDiffusion3ControlNetInpaintingPipeline
+        >>> from diffusers.models.controlnet_sd3 import SD3ControlNetModel
+
+        >>> controlnet = SD3ControlNetModel.from_pretrained(
+        ...     "alimama-creative/SD3-Controlnet-Inpainting", use_safetensors=True, extra_conditioning_channels=1
+        ... )
+        >>> pipe = StableDiffusion3ControlNetInpaintingPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-3-medium-diffusers",
+        ...     controlnet=controlnet,
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> pipe.text_encoder.to(torch.float16)
+        >>> pipe.controlnet.to(torch.float16)
+        >>> pipe.to("cuda")
+
+        >>> image = load_image(
+        ...     "https://huggingface.co/alimama-creative/SD3-Controlnet-Inpainting/resolve/main/images/dog.png"
+        ... )
+        >>> mask = load_image(
+        ...     "https://huggingface.co/alimama-creative/SD3-Controlnet-Inpainting/resolve/main/images/dog_mask.png"
+        ... )
+        >>> width = 1024
+        >>> height = 1024
+        >>> prompt = "A cat is sitting next to a puppy."
+        >>> generator = torch.Generator(device="cuda").manual_seed(24)
+        >>> res_image = pipe(
+        ...     negative_prompt="deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation, NSFW",
+        ...     prompt=prompt,
+        ...     height=height,
+        ...     width=width,
+        ...     control_image=image,
+        ...     control_mask=mask,
+        ...     num_inference_steps=28,
+        ...     generator=generator,
+        ...     controlnet_conditioning_scale=0.95,
+        ...     guidance_scale=7,
+        ... ).images[0]
+        >>> res_image.save(f"sd3.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    Args:
+        transformer ([`SD3Transformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
+            with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
+            as its dimension.
+        text_encoder_2 ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        text_encoder_3 ([`T5EncoderModel`]):
+            Frozen text-encoder. Stable Diffusion 3 uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_3 (`T5TokenizerFast`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        controlnet ([`SD3ControlNetModel`] or `List[SD3ControlNetModel]` or [`SD3MultiControlNetModel`]):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
+
+    def __init__(
+        self,
+        transformer: SD3Transformer2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer_2: CLIPTokenizer,
+        text_encoder_3: T5EncoderModel,
+        tokenizer_3: T5TokenizerFast,
+        controlnet: Union[
+            SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
+        ],
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            text_encoder_3=text_encoder_3,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            tokenizer_3=tokenizer_3,
+            transformer=transformer,
+            scheduler=scheduler,
+            controlnet=controlnet,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_resize=True, do_convert_rgb=True, do_normalize=True
+        )
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor,
+            do_resize=True,
+            do_convert_grayscale=True,
+            do_normalize=False,
+            do_binarize=True,
+        )
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 256,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        if self.text_encoder_3 is None:
+            return torch.zeros(
+                (
+                    batch_size * num_images_per_prompt,
+                    self.tokenizer_max_length,
+                    self.transformer.config.joint_attention_dim,
+                ),
+                device=device,
+                dtype=dtype,
+            )
+
+        text_inputs = self.tokenizer_3(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
+
+        dtype = self.text_encoder_3.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        clip_skip: Optional[int] = None,
+        clip_model_index: int = 0,
+    ):
+        device = device or self._execution_device
+
+        clip_tokenizers = [self.tokenizer, self.tokenizer_2]
+        clip_text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = clip_tokenizers[clip_model_index]
+        text_encoder = clip_text_encoders[clip_model_index]
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+        pooled_prompt_embeds = prompt_embeds[0]
+
+        if clip_skip is None:
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+        else:
+            prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds, pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        prompt_3: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        clip_skip: Optional[int] = None,
+        max_sequence_length: int = 256,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            prompt_3 = prompt_3 or prompt
+            prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3
+
+            prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=0,
+            )
+            prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                prompt=prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=1,
+            )
+            clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)
+
+            t5_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            clip_prompt_embeds = torch.nn.functional.pad(
+                clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+            )
+
+            prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+            pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+            negative_prompt_3 = negative_prompt_3 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+            negative_prompt_3 = (
+                batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3
+            )
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds(
+                negative_prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=0,
+            )
+            negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                negative_prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=1,
+            )
+            negative_clip_prompt_embeds = torch.cat([negative_prompt_embed, negative_prompt_2_embed], dim=-1)
+
+            t5_negative_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=negative_prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            negative_clip_prompt_embeds = torch.nn.functional.pad(
+                negative_clip_prompt_embeds,
+                (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]),
+            )
+
+            negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2)
+            negative_pooled_prompt_embeds = torch.cat(
+                [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        prompt_3,
+        height,
+        width,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        negative_prompt_3=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_3 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_3`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        elif prompt_3 is not None and (not isinstance(prompt_3, str) and not isinstance(prompt_3, list)):
+            raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_3 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        return latents
+
+    def prepare_image_with_mask(
+        self,
+        image,
+        mask,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        if isinstance(image, torch.Tensor):
+            pass
+        else:
+            image = self.image_processor.preprocess(image, height=height, width=width)
+
+        image_batch_size = image.shape[0]
+
+        # Prepare image
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        # Prepare mask
+        if isinstance(mask, torch.Tensor):
+            pass
+        else:
+            mask = self.mask_processor.preprocess(mask, height=height, width=width)
+        mask = mask.repeat_interleave(repeat_by, dim=0)
+        mask = mask.to(device=device, dtype=dtype)
+
+        # Get masked image
+        masked_image = image.clone()
+        masked_image[(mask > 0.5).repeat(1, 3, 1, 1)] = -1
+
+        # Encode to latents
+        image_latents = self.vae.encode(masked_image).latent_dist.sample()
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+        image_latents = image_latents.to(dtype)
+
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = 1 - mask
+        control_image = torch.cat([image_latents, mask], dim=1)
+
+        if do_classifier_free_guidance and not guess_mode:
+            control_image = torch.cat([control_image] * 2)
+
+        return control_image
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        prompt_3: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 28,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        control_image: PipelineImageInput = None,
+        control_mask: PipelineImageInput = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        controlnet_pooled_projections: Optional[torch.FloatTensor] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                will be used instead
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to
+                be masked out with `control_mask` and repainted according to `prompt`). For both numpy array and
+                pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the
+                expected shape should be `(B, C, H, W)`. If it is a numpy array or a list of arrays, the expected shape
+                should be `(B, H, W, C)` or `(H, W, C)`.
+            control_mask (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
+                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
+                color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`. And
+                for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            controlnet_pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
+                Embeddings projected from the embeddings of controlnet input conditions.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used instead
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used instead
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(self.controlnet.nets) if isinstance(self.controlnet, SD3MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            prompt_3,
+            height,
+            width,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        dtype = self.transformer.dtype
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_3=prompt_3,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            device=device,
+            clip_skip=self.clip_skip,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+        )
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+        # 3. Prepare control image
+        if isinstance(self.controlnet, SD3ControlNetModel):
+            control_image = self.prepare_image_with_mask(
+                image=control_image,
+                mask=control_mask,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=False,
+            )
+            latent_height, latent_width = control_image.shape[-2:]
+
+            height = latent_height * self.vae_scale_factor
+            width = latent_width * self.vae_scale_factor
+
+        elif isinstance(self.controlnet, SD3MultiControlNetModel):
+            raise NotImplementedError("MultiControlNetModel is not supported for SD3ControlNetInpaintingPipeline.")
+        else:
+            assert False
+
+        if controlnet_pooled_projections is None:
+            controlnet_pooled_projections = torch.zeros_like(pooled_prompt_embeds)
+        else:
+            controlnet_pooled_projections = controlnet_pooled_projections or pooled_prompt_embeds
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(self.controlnet, SD3ControlNetModel) else keeps)
+
+        # 7. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                # controlnet(s) inference
+                control_block_samples = self.controlnet(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=controlnet_pooled_projections,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    return_dict=False,
+                )[0]
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    block_controlnet_hidden_states=control_block_samples,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            latents = latents.to(dtype=self.vae.dtype)
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusion3PipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/controlnet_xs/__init__.py b/diffusers/src/diffusers/pipelines/controlnet_xs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..978278b184f985a452f9d518a1d0eb4f271c74fd
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet_xs/__init__.py
@@ -0,0 +1,68 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_controlnet_xs"] = ["StableDiffusionControlNetXSPipeline"]
+    _import_structure["pipeline_controlnet_xs_sd_xl"] = ["StableDiffusionXLControlNetXSPipeline"]
+try:
+    if not (is_transformers_available() and is_flax_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_flax_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects))
+else:
+    pass  # _import_structure["pipeline_flax_controlnet"] = ["FlaxStableDiffusionControlNetPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_controlnet_xs import StableDiffusionControlNetXSPipeline
+        from .pipeline_controlnet_xs_sd_xl import StableDiffusionXLControlNetXSPipeline
+
+    try:
+        if not (is_transformers_available() and is_flax_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_flax_and_transformers_objects import *  # noqa F403
+    else:
+        pass  # from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/diffusers/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca10e65de8a421eb67391d5d900f3eb33777dbb6
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py
@@ -0,0 +1,916 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from diffusers import StableDiffusionControlNetXSPipeline, ControlNetXSAdapter
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> import cv2
+        >>> from PIL import Image
+
+        >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+        >>> negative_prompt = "low quality, bad quality, sketches"
+
+        >>> # download an image
+        >>> image = load_image(
+        ...     "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+        ... )
+
+        >>> # initialize the models and pipeline
+        >>> controlnet_conditioning_scale = 0.5
+
+        >>> controlnet = ControlNetXSAdapter.from_pretrained(
+        ...     "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
+        ... )
+        >>> pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # get canny image
+        >>> image = np.array(image)
+        >>> image = cv2.Canny(image, 100, 200)
+        >>> image = image[:, :, None]
+        >>> image = np.concatenate([image, image, image], axis=2)
+        >>> canny_image = Image.fromarray(image)
+        >>> # generate image
+        >>> image = pipe(
+        ...     prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image
+        ... ).images[0]
+        ```
+"""
+
+
+class StableDiffusionControlNetXSPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion with ControlNet-XS guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetControlNetXSModel to denoise the encoded image latents.
+        controlnet ([`ControlNetXSAdapter`]):
+            A [`ControlNetXSAdapter`] to be used in combination with `unet` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: Union[UNet2DConditionModel, UNetControlNetXSModel],
+        controlnet: ControlNetXSAdapter,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetControlNetXSModel.from_unet(unet, controlnet)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # Check `image` and `controlnet_conditioning_scale`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.unet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.unet, UNetControlNetXSModel)
+            or is_compiled
+            and isinstance(self.unet._orig_mod, UNetControlNetXSModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        else:
+            assert False
+
+        start, end = control_guidance_start, control_guidance_end
+        if start >= end:
+            raise ValueError(
+                f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+            )
+        if start < 0.0:
+            raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+        if end > 1.0:
+            raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_scale
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.clip_skip
+    def clip_skip(self):
+        return self._clip_skip
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.do_classifier_free_guidance
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.cross_attention_kwargs
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.num_timesteps
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        control_guidance_start: float = 0.0,
+        control_guidance_end: float = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeine class.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        unet = self.unet._orig_mod if is_compiled_module(self.unet) else self.unet
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            image,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare image
+        image = self.prepare_image(
+            image=image,
+            width=width,
+            height=height,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            dtype=unet.dtype,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+        height, width = image.shape[-2:]
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        is_controlnet_compiled = is_compiled_module(self.unet)
+        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Relevant thread:
+                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
+                if is_controlnet_compiled and is_torch_higher_equal_2_1:
+                    torch._inductor.cudagraph_mark_step_begin()
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                apply_control = (
+                    i / len(timesteps) >= control_guidance_start and (i + 1) / len(timesteps) <= control_guidance_end
+                )
+                noise_pred = self.unet(
+                    sample=latent_model_input,
+                    timestep=t,
+                    encoder_hidden_states=prompt_embeds,
+                    controlnet_cond=image,
+                    conditioning_scale=controlnet_conditioning_scale,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=True,
+                    apply_control=apply_control,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/diffusers/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..326cfdab7be784a504b45a7a5fde2a06cb7f5c36
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
@@ -0,0 +1,1111 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+)
+
+from diffusers.utils.import_utils import is_invisible_watermark_available
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+
+
+if is_invisible_watermark_available():
+    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from diffusers import StableDiffusionXLControlNetXSPipeline, ControlNetXSAdapter, AutoencoderKL
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> import cv2
+        >>> from PIL import Image
+
+        >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+        >>> negative_prompt = "low quality, bad quality, sketches"
+
+        >>> # download an image
+        >>> image = load_image(
+        ...     "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+        ... )
+
+        >>> # initialize the models and pipeline
+        >>> controlnet_conditioning_scale = 0.5
+        >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+        >>> controlnet = ControlNetXSAdapter.from_pretrained(
+        ...     "UmerHA/Testing-ConrolNetXS-SDXL-canny", torch_dtype=torch.float16
+        ... )
+        >>> pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # get canny image
+        >>> image = np.array(image)
+        >>> image = cv2.Canny(image, 100, 200)
+        >>> image = image[:, :, None]
+        >>> image = np.concatenate([image, image, image], axis=2)
+        >>> canny_image = Image.fromarray(image)
+
+        >>> # generate image
+        >>> image = pipe(
+        ...     prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image
+        ... ).images[0]
+        ```
+"""
+
+
+class StableDiffusionXLControlNetXSPipeline(
+    DiffusionPipeline,
+    TextualInversionLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet-XS guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]):
+            Second frozen text-encoder
+            ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        tokenizer_2 ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetControlNetXSModel to denoise the encoded image latents.
+        controlnet ([`ControlNetXSAdapter`]):
+            A [`ControlNetXSAdapter`] to be used in combination with `unet` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings should always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark](https://github.com/ShieldMnt/invisible-watermark/) library to
+            watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no
+            watermarker is used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: Union[UNet2DConditionModel, UNetControlNetXSModel],
+        controlnet: ControlNetXSAdapter,
+        scheduler: KarrasDiffusionSchedulers,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+        feature_extractor: CLIPImageProcessor = None,
+    ):
+        super().__init__()
+
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetControlNetXSModel.from_unet(unet, controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        # Check `image` and ``controlnet_conditioning_scale``
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.unet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.unet, UNetControlNetXSModel)
+            or is_compiled
+            and isinstance(self.unet._orig_mod, UNetControlNetXSModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        else:
+            assert False
+
+        start, end = control_guidance_start, control_guidance_end
+        if start >= end:
+            raise ValueError(
+                f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+            )
+        if start < 0.0:
+            raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+        if end > 1.0:
+            raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.base_add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_scale
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.clip_skip
+    def clip_skip(self):
+        return self._clip_skip
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.do_classifier_free_guidance
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.cross_attention_kwargs
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.num_timesteps
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        control_guidance_start: float = 0.0,
+        control_guidance_end: float = 1.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2`
+                and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, pooled text embeddings are generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt
+                weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`.
+            control_guidance_start (`float`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeine class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] is
+                returned, otherwise a `tuple` is returned containing the output images.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        unet = self.unet._orig_mod if is_compiled_module(self.unet) else self.unet
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            image,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt,
+            prompt_2,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+
+        # 4. Prepare image
+        if isinstance(unet, UNetControlNetXSModel):
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=unet.dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+            )
+            height, width = image.shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Prepare added time ids & embeddings
+        if isinstance(image, list):
+            original_size = original_size or image[0].shape[-2:]
+        else:
+            original_size = original_size or image.shape[-2:]
+        target_size = target_size or (height, width)
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        is_controlnet_compiled = is_compiled_module(self.unet)
+        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Relevant thread:
+                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
+                if is_controlnet_compiled and is_torch_higher_equal_2_1:
+                    torch._inductor.cudagraph_mark_step_begin()
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                # predict the noise residual
+                apply_control = (
+                    i / len(timesteps) >= control_guidance_start and (i + 1) / len(timesteps) <= control_guidance_end
+                )
+                noise_pred = self.unet(
+                    sample=latent_model_input,
+                    timestep=t,
+                    encoder_hidden_states=prompt_embeds,
+                    controlnet_cond=image,
+                    conditioning_scale=controlnet_conditioning_scale,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=True,
+                    apply_control=apply_control,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        # manually for max memory savings
+        if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
+            self.upcast_vae()
+            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/dance_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/dance_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d3e466dfa65b2e9890451607959ed45d092cae7
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/dance_diffusion/__init__.py
@@ -0,0 +1,18 @@
+from typing import TYPE_CHECKING
+
+from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
+
+
+_import_structure = {"pipeline_dance_diffusion": ["DanceDiffusionPipeline"]}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_dance_diffusion import DanceDiffusionPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/diffusers/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcd36c412b54ceb47df088a032161aa88d83950f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
@@ -0,0 +1,156 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from ...utils import logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class DanceDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for audio generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        unet ([`UNet1DModel`]):
+            A `UNet1DModel` to denoise the encoded audio.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
+            [`IPNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "unet"
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        num_inference_steps: int = 100,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        audio_length_in_s: Optional[float] = None,
+        return_dict: bool = True,
+    ) -> Union[AudioPipelineOutput, Tuple]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of audio samples to generate.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher-quality audio sample at
+                the expense of slower inference.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            audio_length_in_s (`float`, *optional*, defaults to `self.unet.config.sample_size/self.unet.config.sample_rate`):
+                The length of the generated audio sample in seconds.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        from scipy.io.wavfile import write
+
+        model_id = "harmonai/maestro-150k"
+        pipe = DiffusionPipeline.from_pretrained(model_id)
+        pipe = pipe.to("cuda")
+
+        audios = pipe(audio_length_in_s=4.0).audios
+
+        # To save locally
+        for i, audio in enumerate(audios):
+            write(f"maestro_test_{i}.wav", pipe.unet.sample_rate, audio.transpose())
+
+        # To dislay in google colab
+        import IPython.display as ipd
+
+        for audio in audios:
+            display(ipd.Audio(audio, rate=pipe.unet.sample_rate))
+        ```
+
+        Returns:
+            [`~pipelines.AudioPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated audio.
+        """
+
+        if audio_length_in_s is None:
+            audio_length_in_s = self.unet.config.sample_size / self.unet.config.sample_rate
+
+        sample_size = audio_length_in_s * self.unet.config.sample_rate
+
+        down_scale_factor = 2 ** len(self.unet.up_blocks)
+        if sample_size < 3 * down_scale_factor:
+            raise ValueError(
+                f"{audio_length_in_s} is too small. Make sure it's bigger or equal to"
+                f" {3 * down_scale_factor / self.unet.config.sample_rate}."
+            )
+
+        original_sample_size = int(sample_size)
+        if sample_size % down_scale_factor != 0:
+            sample_size = (
+                (audio_length_in_s * self.unet.config.sample_rate) // down_scale_factor + 1
+            ) * down_scale_factor
+            logger.info(
+                f"{audio_length_in_s} is increased to {sample_size / self.unet.config.sample_rate} so that it can be handled"
+                f" by the model. It will be cut to {original_sample_size / self.unet.config.sample_rate} after the denoising"
+                " process."
+            )
+        sample_size = int(sample_size)
+
+        dtype = next(self.unet.parameters()).dtype
+        shape = (batch_size, self.unet.config.in_channels, sample_size)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        audio = randn_tensor(shape, generator=generator, device=self._execution_device, dtype=dtype)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps, device=audio.device)
+        self.scheduler.timesteps = self.scheduler.timesteps.to(dtype)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # 1. predict noise model_output
+            model_output = self.unet(audio, t).sample
+
+            # 2. compute previous audio sample: x_t -> t_t-1
+            audio = self.scheduler.step(model_output, t, audio).prev_sample
+
+        audio = audio.clamp(-1, 1).float().cpu().numpy()
+
+        audio = audio[:, :, :original_sample_size]
+
+        if not return_dict:
+            return (audio,)
+
+        return AudioPipelineOutput(audios=audio)
diff --git a/diffusers/src/diffusers/pipelines/ddim/__init__.py b/diffusers/src/diffusers/pipelines/ddim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9eede47c897370a23c47c05291690881c987025
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/ddim/__init__.py
@@ -0,0 +1,18 @@
+from typing import TYPE_CHECKING
+
+from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
+
+
+_import_structure = {"pipeline_ddim": ["DDIMPipeline"]}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_ddim import DDIMPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/ddim/pipeline_ddim.py b/diffusers/src/diffusers/pipelines/ddim/pipeline_ddim.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3b967ed369bd3e7287aa6dc628ecad6484a6359
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/ddim/pipeline_ddim.py
@@ -0,0 +1,154 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from ...schedulers import DDIMScheduler
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class DDIMPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`DDPMScheduler`], or [`DDIMScheduler`].
+    """
+
+    model_cpu_offload_seq = "unet"
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+
+        # make sure scheduler can always be converted to DDIM
+        scheduler = DDIMScheduler.from_config(scheduler.config)
+
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        eta: float = 0.0,
+        num_inference_steps: int = 50,
+        use_clipped_model_output: Optional[bool] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. A value of `0` corresponds to
+                DDIM and `1` corresponds to DDPM.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            use_clipped_model_output (`bool`, *optional*, defaults to `None`):
+                If `True` or `False`, see documentation for [`DDIMScheduler.step`]. If `None`, nothing is passed
+                downstream to the scheduler (use `None` for schedulers which don't support this argument).
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from diffusers import DDIMPipeline
+        >>> import PIL.Image
+        >>> import numpy as np
+
+        >>> # load model and scheduler
+        >>> pipe = DDIMPipeline.from_pretrained("fusing/ddim-lsun-bedroom")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> image = pipe(eta=0.0, num_inference_steps=50)
+
+        >>> # process image to PIL
+        >>> image_processed = image.cpu().permute(0, 2, 3, 1)
+        >>> image_processed = (image_processed + 1.0) * 127.5
+        >>> image_processed = image_processed.numpy().astype(np.uint8)
+        >>> image_pil = PIL.Image.fromarray(image_processed[0])
+
+        >>> # save image
+        >>> image_pil.save("test.png")
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+
+        # Sample gaussian noise to begin loop
+        if isinstance(self.unet.config.sample_size, int):
+            image_shape = (
+                batch_size,
+                self.unet.config.in_channels,
+                self.unet.config.sample_size,
+                self.unet.config.sample_size,
+            )
+        else:
+            image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size)
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        image = randn_tensor(image_shape, generator=generator, device=self._execution_device, dtype=self.unet.dtype)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # 1. predict noise model_output
+            model_output = self.unet(image, t).sample
+
+            # 2. predict previous mean of image x_t-1 and add variance depending on eta
+            # eta corresponds to η in paper and should be between [0, 1]
+            # do x_t -> x_t-1
+            image = self.scheduler.step(
+                model_output, t, image, eta=eta, use_clipped_model_output=use_clipped_model_output, generator=generator
+            ).prev_sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/ddpm/__init__.py b/diffusers/src/diffusers/pipelines/ddpm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb41dd1dcf642c791f3d7b0d985efcaf3e4a2c22
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/ddpm/__init__.py
@@ -0,0 +1,22 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    _LazyModule,
+)
+
+
+_import_structure = {"pipeline_ddpm": ["DDPMPipeline"]}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_ddpm import DDPMPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/diffusers/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb03a8d66758cb645a7194cf099d9a2c5d5dbe98
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -0,0 +1,127 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class DDPMPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`DDPMScheduler`], or [`DDIMScheduler`].
+    """
+
+    model_cpu_offload_seq = "unet"
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        num_inference_steps: int = 1000,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            num_inference_steps (`int`, *optional*, defaults to 1000):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from diffusers import DDPMPipeline
+
+        >>> # load model and scheduler
+        >>> pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> image = pipe().images[0]
+
+        >>> # save image
+        >>> image.save("ddpm_generated_image.png")
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        # Sample gaussian noise to begin loop
+        if isinstance(self.unet.config.sample_size, int):
+            image_shape = (
+                batch_size,
+                self.unet.config.in_channels,
+                self.unet.config.sample_size,
+                self.unet.config.sample_size,
+            )
+        else:
+            image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size)
+
+        if self.device.type == "mps":
+            # randn does not work reproducibly on mps
+            image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype)
+            image = image.to(self.device)
+        else:
+            image = randn_tensor(image_shape, generator=generator, device=self.device, dtype=self.unet.dtype)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # 1. predict noise model_output
+            model_output = self.unet(image, t).sample
+
+            # 2. compute previous image: x_t -> x_t-1
+            image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/__init__.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..79aab1fb186a857dd0a3353c4b5905b4595b5b7b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/__init__.py
@@ -0,0 +1,85 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {
+    "timesteps": [
+        "fast27_timesteps",
+        "smart100_timesteps",
+        "smart185_timesteps",
+        "smart27_timesteps",
+        "smart50_timesteps",
+        "super100_timesteps",
+        "super27_timesteps",
+        "super40_timesteps",
+    ]
+}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_if"] = ["IFPipeline"]
+    _import_structure["pipeline_if_img2img"] = ["IFImg2ImgPipeline"]
+    _import_structure["pipeline_if_img2img_superresolution"] = ["IFImg2ImgSuperResolutionPipeline"]
+    _import_structure["pipeline_if_inpainting"] = ["IFInpaintingPipeline"]
+    _import_structure["pipeline_if_inpainting_superresolution"] = ["IFInpaintingSuperResolutionPipeline"]
+    _import_structure["pipeline_if_superresolution"] = ["IFSuperResolutionPipeline"]
+    _import_structure["pipeline_output"] = ["IFPipelineOutput"]
+    _import_structure["safety_checker"] = ["IFSafetyChecker"]
+    _import_structure["watermark"] = ["IFWatermarker"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_if import IFPipeline
+        from .pipeline_if_img2img import IFImg2ImgPipeline
+        from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline
+        from .pipeline_if_inpainting import IFInpaintingPipeline
+        from .pipeline_if_inpainting_superresolution import IFInpaintingSuperResolutionPipeline
+        from .pipeline_if_superresolution import IFSuperResolutionPipeline
+        from .pipeline_output import IFPipelineOutput
+        from .safety_checker import IFSafetyChecker
+        from .timesteps import (
+            fast27_timesteps,
+            smart27_timesteps,
+            smart50_timesteps,
+            smart100_timesteps,
+            smart185_timesteps,
+            super27_timesteps,
+            super40_timesteps,
+            super100_timesteps,
+        )
+        from .watermark import IFWatermarker
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
new file mode 100644
index 0000000000000000000000000000000000000000..f545b24bec5c17c1f75673bb7d35b3753bcc41aa
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
@@ -0,0 +1,774 @@
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin
+from ...models import UNet2DConditionModel
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import IFPipelineOutput
+from .safety_checker import IFSafetyChecker
+from .watermark import IFWatermarker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import IFPipeline, IFSuperResolutionPipeline, DiffusionPipeline
+        >>> from diffusers.utils import pt_to_pil
+        >>> import torch
+
+        >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
+        >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
+
+        >>> image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images
+
+        >>> # save intermediate image
+        >>> pil_image = pt_to_pil(image)
+        >>> pil_image[0].save("./if_stage_I.png")
+
+        >>> super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained(
+        ...     "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> super_res_1_pipe.enable_model_cpu_offload()
+
+        >>> image = super_res_1_pipe(
+        ...     image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt"
+        ... ).images
+
+        >>> # save intermediate image
+        >>> pil_image = pt_to_pil(image)
+        >>> pil_image[0].save("./if_stage_I.png")
+
+        >>> safety_modules = {
+        ...     "feature_extractor": pipe.feature_extractor,
+        ...     "safety_checker": pipe.safety_checker,
+        ...     "watermarker": pipe.watermarker,
+        ... }
+        >>> super_res_2_pipe = DiffusionPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16
+        ... )
+        >>> super_res_2_pipe.enable_model_cpu_offload()
+
+        >>> image = super_res_2_pipe(
+        ...     prompt=prompt,
+        ...     image=image,
+        ... ).images
+        >>> image[0].save("./if_stage_II.png")
+        ```
+"""
+
+
+class IFPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
+    tokenizer: T5Tokenizer
+    text_encoder: T5EncoderModel
+
+    unet: UNet2DConditionModel
+    scheduler: DDPMScheduler
+
+    feature_extractor: Optional[CLIPImageProcessor]
+    safety_checker: Optional[IFSafetyChecker]
+
+    watermarker: Optional[IFWatermarker]
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
+    model_cpu_offload_seq = "text_encoder->unet"
+    _exclude_from_cpu_offload = ["watermarker"]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        safety_checker: Optional[IFSafetyChecker],
+        feature_extractor: Optional[CLIPImageProcessor],
+        watermarker: Optional[IFWatermarker],
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the IF license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            watermarker=watermarker,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    @torch.no_grad()
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            clean_caption (bool, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+        """
+        if prompt is not None and negative_prompt is not None:
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF
+        max_length = 77
+
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            attention_mask = text_inputs.attention_mask.to(device)
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            dtype = self.unet.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            attention_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+        else:
+            negative_prompt_embeds = None
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, nsfw_detected, watermark_detected = self.safety_checker(
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(dtype=dtype),
+            )
+        else:
+            nsfw_detected = None
+            watermark_detected = None
+
+        return image, nsfw_detected, watermark_detected
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, device, generator):
+        shape = (batch_size, num_channels, height, width)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        intermediate_images = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        intermediate_images = intermediate_images * self.scheduler.init_noise_sigma
+        return intermediate_images
+
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_inference_steps: int = 100,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        clean_caption: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The width in pixels of the generated image.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images, and the second element is a list
+            of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+            or watermarked content, according to the `safety_checker`.
+        """
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+        # 2. Define call parameters
+        height = height or self.unet.config.sample_size
+        width = width or self.unet.config.sample_size
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clean_caption=clean_caption,
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        if timesteps is not None:
+            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
+            timesteps = self.scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        else:
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(0)
+
+        # 5. Prepare intermediate images
+        intermediate_images = self.prepare_intermediate_images(
+            batch_size * num_images_per_prompt,
+            self.unet.config.in_channels,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # HACK: see comment in `enable_model_cpu_offload`
+        if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None:
+            self.text_encoder_offload_hook.offload()
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                model_input = (
+                    torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images
+                )
+                model_input = self.scheduler.scale_model_input(model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1)
+                    noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+
+                if self.scheduler.config.variance_type not in ["learned", "learned_range"]:
+                    noise_pred, _ = noise_pred.split(model_input.shape[1], dim=1)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                intermediate_images = self.scheduler.step(
+                    noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+                )[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, intermediate_images)
+
+        image = intermediate_images
+
+        if output_type == "pil":
+            # 8. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 9. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+
+            # 11. Apply watermark
+            if self.watermarker is not None:
+                image = self.watermarker.apply_watermark(image, self.unet.config.sample_size)
+        elif output_type == "pt":
+            nsfw_detected = None
+            watermark_detected = None
+
+            if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None:
+                self.unet_offload_hook.offload()
+        else:
+            # 8. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 9. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, nsfw_detected, watermark_detected)
+
+        return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected)
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..07017912575dbae3338a845b82b83720601e689d
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
@@ -0,0 +1,895 @@
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin
+from ...models import UNet2DConditionModel
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    PIL_INTERPOLATION,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import IFPipelineOutput
+from .safety_checker import IFSafetyChecker
+from .watermark import IFWatermarker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
+    w, h = images.size
+
+    coef = w / h
+
+    w, h = img_size, img_size
+
+    if coef >= 1:
+        w = int(round(img_size / 8 * coef) * 8)
+    else:
+        h = int(round(img_size / 8 / coef) * 8)
+
+    images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
+
+    return images
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline
+        >>> from diffusers.utils import pt_to_pil
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+        >>> from io import BytesIO
+
+        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+        >>> response = requests.get(url)
+        >>> original_image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> original_image = original_image.resize((768, 512))
+
+        >>> pipe = IFImg2ImgPipeline.from_pretrained(
+        ...     "DeepFloyd/IF-I-XL-v1.0",
+        ...     variant="fp16",
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "A fantasy landscape in style minecraft"
+        >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
+
+        >>> image = pipe(
+        ...     image=original_image,
+        ...     prompt_embeds=prompt_embeds,
+        ...     negative_prompt_embeds=negative_embeds,
+        ...     output_type="pt",
+        ... ).images
+
+        >>> # save intermediate image
+        >>> pil_image = pt_to_pil(image)
+        >>> pil_image[0].save("./if_stage_I.png")
+
+        >>> super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained(
+        ...     "DeepFloyd/IF-II-L-v1.0",
+        ...     text_encoder=None,
+        ...     variant="fp16",
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> super_res_1_pipe.enable_model_cpu_offload()
+
+        >>> image = super_res_1_pipe(
+        ...     image=image,
+        ...     original_image=original_image,
+        ...     prompt_embeds=prompt_embeds,
+        ...     negative_prompt_embeds=negative_embeds,
+        ... ).images
+        >>> image[0].save("./if_stage_II.png")
+        ```
+"""
+
+
+class IFImg2ImgPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
+    tokenizer: T5Tokenizer
+    text_encoder: T5EncoderModel
+
+    unet: UNet2DConditionModel
+    scheduler: DDPMScheduler
+
+    feature_extractor: Optional[CLIPImageProcessor]
+    safety_checker: Optional[IFSafetyChecker]
+
+    watermarker: Optional[IFWatermarker]
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
+    model_cpu_offload_seq = "text_encoder->unet"
+    _exclude_from_cpu_offload = ["watermarker"]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        safety_checker: Optional[IFSafetyChecker],
+        feature_extractor: Optional[CLIPImageProcessor],
+        watermarker: Optional[IFWatermarker],
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the IF license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            watermarker=watermarker,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    @torch.no_grad()
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            clean_caption (bool, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+        """
+        if prompt is not None and negative_prompt is not None:
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF
+        max_length = 77
+
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            attention_mask = text_inputs.attention_mask.to(device)
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            dtype = self.unet.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            attention_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+        else:
+            negative_prompt_embeds = None
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, nsfw_detected, watermark_detected = self.safety_checker(
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(dtype=dtype),
+            )
+        else:
+            nsfw_detected = None
+            watermark_detected = None
+
+        return image, nsfw_detected, watermark_detected
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        batch_size,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if isinstance(image, list):
+            check_image_type = image[0]
+        else:
+            check_image_type = image
+
+        if (
+            not isinstance(check_image_type, torch.Tensor)
+            and not isinstance(check_image_type, PIL.Image.Image)
+            and not isinstance(check_image_type, np.ndarray)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
+                f" {type(check_image_type)}"
+            )
+
+        if isinstance(image, list):
+            image_batch_size = len(image)
+        elif isinstance(image, torch.Tensor):
+            image_batch_size = image.shape[0]
+        elif isinstance(image, PIL.Image.Image):
+            image_batch_size = 1
+        elif isinstance(image, np.ndarray):
+            image_batch_size = image.shape[0]
+        else:
+            assert False
+
+        if batch_size != image_batch_size:
+            raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    def preprocess_image(self, image: PIL.Image.Image) -> torch.Tensor:
+        if not isinstance(image, list):
+            image = [image]
+
+        def numpy_to_pt(images):
+            if images.ndim == 3:
+                images = images[..., None]
+
+            images = torch.from_numpy(images.transpose(0, 3, 1, 2))
+            return images
+
+        if isinstance(image[0], PIL.Image.Image):
+            new_image = []
+
+            for image_ in image:
+                image_ = image_.convert("RGB")
+                image_ = resize(image_, self.unet.config.sample_size)
+                image_ = np.array(image_)
+                image_ = image_.astype(np.float32)
+                image_ = image_ / 127.5 - 1
+                new_image.append(image_)
+
+            image = new_image
+
+            image = np.stack(image, axis=0)  # to np
+            image = numpy_to_pt(image)  # to pt
+
+        elif isinstance(image[0], np.ndarray):
+            image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
+            image = numpy_to_pt(image)
+
+        elif isinstance(image[0], torch.Tensor):
+            image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_intermediate_images(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None
+    ):
+        _, channels, height, width = image.shape
+
+        batch_size = batch_size * num_images_per_prompt
+
+        shape = (batch_size, channels, height, width)
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        image = image.repeat_interleave(num_images_per_prompt, dim=0)
+        image = self.scheduler.add_noise(image, noise, timestep)
+
+        return image
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[
+            PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]
+        ] = None,
+        strength: float = 0.7,
+        num_inference_steps: int = 80,
+        timesteps: List[int] = None,
+        guidance_scale: float = 10.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        clean_caption: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            strength (`float`, *optional*, defaults to 0.7):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 80):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 10.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images, and the second element is a list
+            of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+            or watermarked content, according to the `safety_checker`.
+        """
+        # 1. Check inputs. Raise error if not correct
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        self.check_inputs(
+            prompt, image, batch_size, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clean_caption=clean_caption,
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        dtype = prompt_embeds.dtype
+
+        # 4. Prepare timesteps
+        if timesteps is not None:
+            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
+            timesteps = self.scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        else:
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+
+        # 5. Prepare intermediate images
+        image = self.preprocess_image(image)
+        image = image.to(device=device, dtype=dtype)
+
+        noise_timestep = timesteps[0:1]
+        noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt)
+
+        intermediate_images = self.prepare_intermediate_images(
+            image, noise_timestep, batch_size, num_images_per_prompt, dtype, device, generator
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # HACK: see comment in `enable_model_cpu_offload`
+        if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None:
+            self.text_encoder_offload_hook.offload()
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                model_input = (
+                    torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images
+                )
+                model_input = self.scheduler.scale_model_input(model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1)
+                    noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+
+                if self.scheduler.config.variance_type not in ["learned", "learned_range"]:
+                    noise_pred, _ = noise_pred.split(model_input.shape[1], dim=1)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                intermediate_images = self.scheduler.step(
+                    noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+                )[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, intermediate_images)
+
+        image = intermediate_images
+
+        if output_type == "pil":
+            # 8. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 9. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+
+            # 11. Apply watermark
+            if self.watermarker is not None:
+                self.watermarker.apply_watermark(image, self.unet.config.sample_size)
+        elif output_type == "pt":
+            nsfw_detected = None
+            watermark_detected = None
+
+            if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None:
+                self.unet_offload_hook.offload()
+        else:
+            # 8. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 9. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, nsfw_detected, watermark_detected)
+
+        return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected)
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..6685ba6d774ae575118b88c3cc3809afad251434
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
@@ -0,0 +1,1011 @@
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin
+from ...models import UNet2DConditionModel
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    PIL_INTERPOLATION,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import IFPipelineOutput
+from .safety_checker import IFSafetyChecker
+from .watermark import IFWatermarker
+
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize
+def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
+    w, h = images.size
+
+    coef = w / h
+
+    w, h = img_size, img_size
+
+    if coef >= 1:
+        w = int(round(img_size / 8 * coef) * 8)
+    else:
+        h = int(round(img_size / 8 / coef) * 8)
+
+    images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
+
+    return images
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline
+        >>> from diffusers.utils import pt_to_pil
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+        >>> from io import BytesIO
+
+        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+        >>> response = requests.get(url)
+        >>> original_image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> original_image = original_image.resize((768, 512))
+
+        >>> pipe = IFImg2ImgPipeline.from_pretrained(
+        ...     "DeepFloyd/IF-I-XL-v1.0",
+        ...     variant="fp16",
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "A fantasy landscape in style minecraft"
+        >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
+
+        >>> image = pipe(
+        ...     image=original_image,
+        ...     prompt_embeds=prompt_embeds,
+        ...     negative_prompt_embeds=negative_embeds,
+        ...     output_type="pt",
+        ... ).images
+
+        >>> # save intermediate image
+        >>> pil_image = pt_to_pil(image)
+        >>> pil_image[0].save("./if_stage_I.png")
+
+        >>> super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained(
+        ...     "DeepFloyd/IF-II-L-v1.0",
+        ...     text_encoder=None,
+        ...     variant="fp16",
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> super_res_1_pipe.enable_model_cpu_offload()
+
+        >>> image = super_res_1_pipe(
+        ...     image=image,
+        ...     original_image=original_image,
+        ...     prompt_embeds=prompt_embeds,
+        ...     negative_prompt_embeds=negative_embeds,
+        ... ).images
+        >>> image[0].save("./if_stage_II.png")
+        ```
+"""
+
+
+class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
+    tokenizer: T5Tokenizer
+    text_encoder: T5EncoderModel
+
+    unet: UNet2DConditionModel
+    scheduler: DDPMScheduler
+    image_noising_scheduler: DDPMScheduler
+
+    feature_extractor: Optional[CLIPImageProcessor]
+    safety_checker: Optional[IFSafetyChecker]
+
+    watermarker: Optional[IFWatermarker]
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor"]
+    model_cpu_offload_seq = "text_encoder->unet"
+    _exclude_from_cpu_offload = ["watermarker"]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        image_noising_scheduler: DDPMScheduler,
+        safety_checker: Optional[IFSafetyChecker],
+        feature_extractor: Optional[CLIPImageProcessor],
+        watermarker: Optional[IFWatermarker],
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the IF license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if unet.config.in_channels != 6:
+            logger.warning(
+                "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`."
+            )
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            image_noising_scheduler=image_noising_scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            watermarker=watermarker,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    @torch.no_grad()
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            clean_caption (bool, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+        """
+        if prompt is not None and negative_prompt is not None:
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF
+        max_length = 77
+
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            attention_mask = text_inputs.attention_mask.to(device)
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            dtype = self.unet.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            attention_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+        else:
+            negative_prompt_embeds = None
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, nsfw_detected, watermark_detected = self.safety_checker(
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(dtype=dtype),
+            )
+        else:
+            nsfw_detected = None
+            watermark_detected = None
+
+        return image, nsfw_detected, watermark_detected
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        original_image,
+        batch_size,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # image
+
+        if isinstance(image, list):
+            check_image_type = image[0]
+        else:
+            check_image_type = image
+
+        if (
+            not isinstance(check_image_type, torch.Tensor)
+            and not isinstance(check_image_type, PIL.Image.Image)
+            and not isinstance(check_image_type, np.ndarray)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
+                f" {type(check_image_type)}"
+            )
+
+        if isinstance(image, list):
+            image_batch_size = len(image)
+        elif isinstance(image, torch.Tensor):
+            image_batch_size = image.shape[0]
+        elif isinstance(image, PIL.Image.Image):
+            image_batch_size = 1
+        elif isinstance(image, np.ndarray):
+            image_batch_size = image.shape[0]
+        else:
+            assert False
+
+        if batch_size != image_batch_size:
+            raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
+
+        # original_image
+
+        if isinstance(original_image, list):
+            check_image_type = original_image[0]
+        else:
+            check_image_type = original_image
+
+        if (
+            not isinstance(check_image_type, torch.Tensor)
+            and not isinstance(check_image_type, PIL.Image.Image)
+            and not isinstance(check_image_type, np.ndarray)
+        ):
+            raise ValueError(
+                "`original_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
+                f" {type(check_image_type)}"
+            )
+
+        if isinstance(original_image, list):
+            image_batch_size = len(original_image)
+        elif isinstance(original_image, torch.Tensor):
+            image_batch_size = original_image.shape[0]
+        elif isinstance(original_image, PIL.Image.Image):
+            image_batch_size = 1
+        elif isinstance(original_image, np.ndarray):
+            image_batch_size = original_image.shape[0]
+        else:
+            assert False
+
+        if batch_size != image_batch_size:
+            raise ValueError(
+                f"original_image batch size: {image_batch_size} must be same as prompt batch size {batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image
+    def preprocess_original_image(self, image: PIL.Image.Image) -> torch.Tensor:
+        if not isinstance(image, list):
+            image = [image]
+
+        def numpy_to_pt(images):
+            if images.ndim == 3:
+                images = images[..., None]
+
+            images = torch.from_numpy(images.transpose(0, 3, 1, 2))
+            return images
+
+        if isinstance(image[0], PIL.Image.Image):
+            new_image = []
+
+            for image_ in image:
+                image_ = image_.convert("RGB")
+                image_ = resize(image_, self.unet.config.sample_size)
+                image_ = np.array(image_)
+                image_ = image_.astype(np.float32)
+                image_ = image_ / 127.5 - 1
+                new_image.append(image_)
+
+            image = new_image
+
+            image = np.stack(image, axis=0)  # to np
+            image = numpy_to_pt(image)  # to pt
+
+        elif isinstance(image[0], np.ndarray):
+            image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
+            image = numpy_to_pt(image)
+
+        elif isinstance(image[0], torch.Tensor):
+            image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)
+
+        return image
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image
+    def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device) -> torch.Tensor:
+        if not isinstance(image, torch.Tensor) and not isinstance(image, list):
+            image = [image]
+
+        if isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i).astype(np.float32) / 127.5 - 1.0 for i in image]
+
+            image = np.stack(image, axis=0)  # to np
+            image = torch.from_numpy(image.transpose(0, 3, 1, 2))
+        elif isinstance(image[0], np.ndarray):
+            image = np.stack(image, axis=0)  # to np
+            if image.ndim == 5:
+                image = image[0]
+
+            image = torch.from_numpy(image.transpose(0, 3, 1, 2))
+        elif isinstance(image, list) and isinstance(image[0], torch.Tensor):
+            dims = image[0].ndim
+
+            if dims == 3:
+                image = torch.stack(image, dim=0)
+            elif dims == 4:
+                image = torch.concat(image, dim=0)
+            else:
+                raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}")
+
+        image = image.to(device=device, dtype=self.unet.dtype)
+
+        image = image.repeat_interleave(num_images_per_prompt, dim=0)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.prepare_intermediate_images
+    def prepare_intermediate_images(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None
+    ):
+        _, channels, height, width = image.shape
+
+        batch_size = batch_size * num_images_per_prompt
+
+        shape = (batch_size, channels, height, width)
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        image = image.repeat_interleave(num_images_per_prompt, dim=0)
+        image = self.scheduler.add_noise(image, noise, timestep)
+
+        return image
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
+        original_image: Union[
+            PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]
+        ] = None,
+        strength: float = 0.8,
+        prompt: Union[str, List[str]] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 4.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        noise_level: int = 250,
+        clean_caption: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            original_image (`torch.Tensor` or `PIL.Image.Image`):
+                The original image that `image` was varied from.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            noise_level (`int`, *optional*, defaults to 250):
+                The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)`
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images, and the second element is a list
+            of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+            or watermarked content, according to the `safety_checker`.
+        """
+        # 1. Check inputs. Raise error if not correct
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        self.check_inputs(
+            prompt,
+            image,
+            original_image,
+            batch_size,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clean_caption=clean_caption,
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        dtype = prompt_embeds.dtype
+
+        # 4. Prepare timesteps
+        if timesteps is not None:
+            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
+            timesteps = self.scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        else:
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+
+        # 5. prepare original image
+        original_image = self.preprocess_original_image(original_image)
+        original_image = original_image.to(device=device, dtype=dtype)
+
+        # 6. Prepare intermediate images
+        noise_timestep = timesteps[0:1]
+        noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt)
+
+        intermediate_images = self.prepare_intermediate_images(
+            original_image,
+            noise_timestep,
+            batch_size,
+            num_images_per_prompt,
+            dtype,
+            device,
+            generator,
+        )
+
+        # 7. Prepare upscaled image and noise level
+        _, _, height, width = original_image.shape
+
+        image = self.preprocess_image(image, num_images_per_prompt, device)
+
+        upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True)
+
+        noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device)
+        noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype)
+        upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level)
+
+        if do_classifier_free_guidance:
+            noise_level = torch.cat([noise_level] * 2)
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # HACK: see comment in `enable_model_cpu_offload`
+        if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None:
+            self.text_encoder_offload_hook.offload()
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                model_input = torch.cat([intermediate_images, upscaled], dim=1)
+
+                model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input
+                model_input = self.scheduler.scale_model_input(model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    class_labels=noise_level,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1)
+                    noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+
+                if self.scheduler.config.variance_type not in ["learned", "learned_range"]:
+                    noise_pred, _ = noise_pred.split(intermediate_images.shape[1], dim=1)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                intermediate_images = self.scheduler.step(
+                    noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+                )[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, intermediate_images)
+
+        image = intermediate_images
+
+        if output_type == "pil":
+            # 10. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 11. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 12. Convert to PIL
+            image = self.numpy_to_pil(image)
+
+            # 13. Apply watermark
+            if self.watermarker is not None:
+                self.watermarker.apply_watermark(image, self.unet.config.sample_size)
+        elif output_type == "pt":
+            nsfw_detected = None
+            watermark_detected = None
+
+        else:
+            # 10. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 11. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, nsfw_detected, watermark_detected)
+
+        return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected)
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fca0bc0443cb3d9aa4cf366c6ab2058d085ec1d
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
@@ -0,0 +1,1014 @@
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin
+from ...models import UNet2DConditionModel
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    PIL_INTERPOLATION,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import IFPipelineOutput
+from .safety_checker import IFSafetyChecker
+from .watermark import IFWatermarker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize
+def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
+    w, h = images.size
+
+    coef = w / h
+
+    w, h = img_size, img_size
+
+    if coef >= 1:
+        w = int(round(img_size / 8 * coef) * 8)
+    else:
+        h = int(round(img_size / 8 / coef) * 8)
+
+    images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
+
+    return images
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline
+        >>> from diffusers.utils import pt_to_pil
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+        >>> from io import BytesIO
+
+        >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png"
+        >>> response = requests.get(url)
+        >>> original_image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> original_image = original_image
+
+        >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png"
+        >>> response = requests.get(url)
+        >>> mask_image = Image.open(BytesIO(response.content))
+        >>> mask_image = mask_image
+
+        >>> pipe = IFInpaintingPipeline.from_pretrained(
+        ...     "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "blue sunglasses"
+        >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
+
+        >>> image = pipe(
+        ...     image=original_image,
+        ...     mask_image=mask_image,
+        ...     prompt_embeds=prompt_embeds,
+        ...     negative_prompt_embeds=negative_embeds,
+        ...     output_type="pt",
+        ... ).images
+
+        >>> # save intermediate image
+        >>> pil_image = pt_to_pil(image)
+        >>> pil_image[0].save("./if_stage_I.png")
+
+        >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained(
+        ...     "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> super_res_1_pipe.enable_model_cpu_offload()
+
+        >>> image = super_res_1_pipe(
+        ...     image=image,
+        ...     mask_image=mask_image,
+        ...     original_image=original_image,
+        ...     prompt_embeds=prompt_embeds,
+        ...     negative_prompt_embeds=negative_embeds,
+        ... ).images
+        >>> image[0].save("./if_stage_II.png")
+        ```
+"""
+
+
+class IFInpaintingPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
+    tokenizer: T5Tokenizer
+    text_encoder: T5EncoderModel
+
+    unet: UNet2DConditionModel
+    scheduler: DDPMScheduler
+
+    feature_extractor: Optional[CLIPImageProcessor]
+    safety_checker: Optional[IFSafetyChecker]
+
+    watermarker: Optional[IFWatermarker]
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
+    model_cpu_offload_seq = "text_encoder->unet"
+    _exclude_from_cpu_offload = ["watermarker"]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        safety_checker: Optional[IFSafetyChecker],
+        feature_extractor: Optional[CLIPImageProcessor],
+        watermarker: Optional[IFWatermarker],
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the IF license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            watermarker=watermarker,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    @torch.no_grad()
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            clean_caption (bool, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+        """
+        if prompt is not None and negative_prompt is not None:
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF
+        max_length = 77
+
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            attention_mask = text_inputs.attention_mask.to(device)
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            dtype = self.unet.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            attention_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+        else:
+            negative_prompt_embeds = None
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, nsfw_detected, watermark_detected = self.safety_checker(
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(dtype=dtype),
+            )
+        else:
+            nsfw_detected = None
+            watermark_detected = None
+
+        return image, nsfw_detected, watermark_detected
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        mask_image,
+        batch_size,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # image
+
+        if isinstance(image, list):
+            check_image_type = image[0]
+        else:
+            check_image_type = image
+
+        if (
+            not isinstance(check_image_type, torch.Tensor)
+            and not isinstance(check_image_type, PIL.Image.Image)
+            and not isinstance(check_image_type, np.ndarray)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
+                f" {type(check_image_type)}"
+            )
+
+        if isinstance(image, list):
+            image_batch_size = len(image)
+        elif isinstance(image, torch.Tensor):
+            image_batch_size = image.shape[0]
+        elif isinstance(image, PIL.Image.Image):
+            image_batch_size = 1
+        elif isinstance(image, np.ndarray):
+            image_batch_size = image.shape[0]
+        else:
+            assert False
+
+        if batch_size != image_batch_size:
+            raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
+
+        # mask_image
+
+        if isinstance(mask_image, list):
+            check_image_type = mask_image[0]
+        else:
+            check_image_type = mask_image
+
+        if (
+            not isinstance(check_image_type, torch.Tensor)
+            and not isinstance(check_image_type, PIL.Image.Image)
+            and not isinstance(check_image_type, np.ndarray)
+        ):
+            raise ValueError(
+                "`mask_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
+                f" {type(check_image_type)}"
+            )
+
+        if isinstance(mask_image, list):
+            image_batch_size = len(mask_image)
+        elif isinstance(mask_image, torch.Tensor):
+            image_batch_size = mask_image.shape[0]
+        elif isinstance(mask_image, PIL.Image.Image):
+            image_batch_size = 1
+        elif isinstance(mask_image, np.ndarray):
+            image_batch_size = mask_image.shape[0]
+        else:
+            assert False
+
+        if image_batch_size != 1 and batch_size != image_batch_size:
+            raise ValueError(
+                f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image
+    def preprocess_image(self, image: PIL.Image.Image) -> torch.Tensor:
+        if not isinstance(image, list):
+            image = [image]
+
+        def numpy_to_pt(images):
+            if images.ndim == 3:
+                images = images[..., None]
+
+            images = torch.from_numpy(images.transpose(0, 3, 1, 2))
+            return images
+
+        if isinstance(image[0], PIL.Image.Image):
+            new_image = []
+
+            for image_ in image:
+                image_ = image_.convert("RGB")
+                image_ = resize(image_, self.unet.config.sample_size)
+                image_ = np.array(image_)
+                image_ = image_.astype(np.float32)
+                image_ = image_ / 127.5 - 1
+                new_image.append(image_)
+
+            image = new_image
+
+            image = np.stack(image, axis=0)  # to np
+            image = numpy_to_pt(image)  # to pt
+
+        elif isinstance(image[0], np.ndarray):
+            image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
+            image = numpy_to_pt(image)
+
+        elif isinstance(image[0], torch.Tensor):
+            image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)
+
+        return image
+
+    def preprocess_mask_image(self, mask_image) -> torch.Tensor:
+        if not isinstance(mask_image, list):
+            mask_image = [mask_image]
+
+        if isinstance(mask_image[0], torch.Tensor):
+            mask_image = torch.cat(mask_image, axis=0) if mask_image[0].ndim == 4 else torch.stack(mask_image, axis=0)
+
+            if mask_image.ndim == 2:
+                # Batch and add channel dim for single mask
+                mask_image = mask_image.unsqueeze(0).unsqueeze(0)
+            elif mask_image.ndim == 3 and mask_image.shape[0] == 1:
+                # Single mask, the 0'th dimension is considered to be
+                # the existing batch size of 1
+                mask_image = mask_image.unsqueeze(0)
+            elif mask_image.ndim == 3 and mask_image.shape[0] != 1:
+                # Batch of mask, the 0'th dimension is considered to be
+                # the batching dimension
+                mask_image = mask_image.unsqueeze(1)
+
+            mask_image[mask_image < 0.5] = 0
+            mask_image[mask_image >= 0.5] = 1
+
+        elif isinstance(mask_image[0], PIL.Image.Image):
+            new_mask_image = []
+
+            for mask_image_ in mask_image:
+                mask_image_ = mask_image_.convert("L")
+                mask_image_ = resize(mask_image_, self.unet.config.sample_size)
+                mask_image_ = np.array(mask_image_)
+                mask_image_ = mask_image_[None, None, :]
+                new_mask_image.append(mask_image_)
+
+            mask_image = new_mask_image
+
+            mask_image = np.concatenate(mask_image, axis=0)
+            mask_image = mask_image.astype(np.float32) / 255.0
+            mask_image[mask_image < 0.5] = 0
+            mask_image[mask_image >= 0.5] = 1
+            mask_image = torch.from_numpy(mask_image)
+
+        elif isinstance(mask_image[0], np.ndarray):
+            mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0)
+
+            mask_image[mask_image < 0.5] = 0
+            mask_image[mask_image >= 0.5] = 1
+            mask_image = torch.from_numpy(mask_image)
+
+        return mask_image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_intermediate_images(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator=None
+    ):
+        image_batch_size, channels, height, width = image.shape
+
+        batch_size = batch_size * num_images_per_prompt
+
+        shape = (batch_size, channels, height, width)
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        image = image.repeat_interleave(num_images_per_prompt, dim=0)
+        noised_image = self.scheduler.add_noise(image, noise, timestep)
+
+        image = (1 - mask_image) * image + mask_image * noised_image
+
+        return image
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[
+            PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]
+        ] = None,
+        mask_image: Union[
+            PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]
+        ] = None,
+        strength: float = 1.0,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        clean_caption: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            strength (`float`, *optional*, defaults to 1.0):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images, and the second element is a list
+            of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+            or watermarked content, according to the `safety_checker`.
+        """
+        # 1. Check inputs. Raise error if not correct
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        self.check_inputs(
+            prompt,
+            image,
+            mask_image,
+            batch_size,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clean_caption=clean_caption,
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        dtype = prompt_embeds.dtype
+
+        # 4. Prepare timesteps
+        if timesteps is not None:
+            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
+            timesteps = self.scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        else:
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+
+        # 5. Prepare intermediate images
+        image = self.preprocess_image(image)
+        image = image.to(device=device, dtype=dtype)
+
+        mask_image = self.preprocess_mask_image(mask_image)
+        mask_image = mask_image.to(device=device, dtype=dtype)
+
+        if mask_image.shape[0] == 1:
+            mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, dim=0)
+        else:
+            mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0)
+
+        noise_timestep = timesteps[0:1]
+        noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt)
+
+        intermediate_images = self.prepare_intermediate_images(
+            image, noise_timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # HACK: see comment in `enable_model_cpu_offload`
+        if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None:
+            self.text_encoder_offload_hook.offload()
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                model_input = (
+                    torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images
+                )
+                model_input = self.scheduler.scale_model_input(model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1)
+                    noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+
+                if self.scheduler.config.variance_type not in ["learned", "learned_range"]:
+                    noise_pred, _ = noise_pred.split(model_input.shape[1], dim=1)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                prev_intermediate_images = intermediate_images
+
+                intermediate_images = self.scheduler.step(
+                    noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+                )[0]
+
+                intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, intermediate_images)
+
+        image = intermediate_images
+
+        if output_type == "pil":
+            # 8. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 9. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+
+            # 11. Apply watermark
+            if self.watermarker is not None:
+                self.watermarker.apply_watermark(image, self.unet.config.sample_size)
+        elif output_type == "pt":
+            nsfw_detected = None
+            watermark_detected = None
+
+            if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None:
+                self.unet_offload_hook.offload()
+        else:
+            # 8. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 9. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, nsfw_detected, watermark_detected)
+
+        return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected)
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f04a1de2a6edec03c341cfbd1d696eae8754a1c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
@@ -0,0 +1,1121 @@
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin
+from ...models import UNet2DConditionModel
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    PIL_INTERPOLATION,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import IFPipelineOutput
+from .safety_checker import IFSafetyChecker
+from .watermark import IFWatermarker
+
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize
+def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
+    w, h = images.size
+
+    coef = w / h
+
+    w, h = img_size, img_size
+
+    if coef >= 1:
+        w = int(round(img_size / 8 * coef) * 8)
+    else:
+        h = int(round(img_size / 8 / coef) * 8)
+
+    images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
+
+    return images
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline
+        >>> from diffusers.utils import pt_to_pil
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+        >>> from io import BytesIO
+
+        >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png"
+        >>> response = requests.get(url)
+        >>> original_image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> original_image = original_image
+
+        >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png"
+        >>> response = requests.get(url)
+        >>> mask_image = Image.open(BytesIO(response.content))
+        >>> mask_image = mask_image
+
+        >>> pipe = IFInpaintingPipeline.from_pretrained(
+        ...     "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "blue sunglasses"
+
+        >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
+        >>> image = pipe(
+        ...     image=original_image,
+        ...     mask_image=mask_image,
+        ...     prompt_embeds=prompt_embeds,
+        ...     negative_prompt_embeds=negative_embeds,
+        ...     output_type="pt",
+        ... ).images
+
+        >>> # save intermediate image
+        >>> pil_image = pt_to_pil(image)
+        >>> pil_image[0].save("./if_stage_I.png")
+
+        >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained(
+        ...     "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> super_res_1_pipe.enable_model_cpu_offload()
+
+        >>> image = super_res_1_pipe(
+        ...     image=image,
+        ...     mask_image=mask_image,
+        ...     original_image=original_image,
+        ...     prompt_embeds=prompt_embeds,
+        ...     negative_prompt_embeds=negative_embeds,
+        ... ).images
+        >>> image[0].save("./if_stage_II.png")
+        ```
+    """
+
+
+class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
+    tokenizer: T5Tokenizer
+    text_encoder: T5EncoderModel
+
+    unet: UNet2DConditionModel
+    scheduler: DDPMScheduler
+    image_noising_scheduler: DDPMScheduler
+
+    feature_extractor: Optional[CLIPImageProcessor]
+    safety_checker: Optional[IFSafetyChecker]
+
+    watermarker: Optional[IFWatermarker]
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    model_cpu_offload_seq = "text_encoder->unet"
+    _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
+    _exclude_from_cpu_offload = ["watermarker"]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        image_noising_scheduler: DDPMScheduler,
+        safety_checker: Optional[IFSafetyChecker],
+        feature_extractor: Optional[CLIPImageProcessor],
+        watermarker: Optional[IFWatermarker],
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the IF license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if unet.config.in_channels != 6:
+            logger.warning(
+                "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`."
+            )
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            image_noising_scheduler=image_noising_scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            watermarker=watermarker,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    @torch.no_grad()
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            clean_caption (bool, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+        """
+        if prompt is not None and negative_prompt is not None:
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF
+        max_length = 77
+
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            attention_mask = text_inputs.attention_mask.to(device)
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            dtype = self.unet.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            attention_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+        else:
+            negative_prompt_embeds = None
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, nsfw_detected, watermark_detected = self.safety_checker(
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(dtype=dtype),
+            )
+        else:
+            nsfw_detected = None
+            watermark_detected = None
+
+        return image, nsfw_detected, watermark_detected
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        original_image,
+        mask_image,
+        batch_size,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # image
+
+        if isinstance(image, list):
+            check_image_type = image[0]
+        else:
+            check_image_type = image
+
+        if (
+            not isinstance(check_image_type, torch.Tensor)
+            and not isinstance(check_image_type, PIL.Image.Image)
+            and not isinstance(check_image_type, np.ndarray)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
+                f" {type(check_image_type)}"
+            )
+
+        if isinstance(image, list):
+            image_batch_size = len(image)
+        elif isinstance(image, torch.Tensor):
+            image_batch_size = image.shape[0]
+        elif isinstance(image, PIL.Image.Image):
+            image_batch_size = 1
+        elif isinstance(image, np.ndarray):
+            image_batch_size = image.shape[0]
+        else:
+            assert False
+
+        if batch_size != image_batch_size:
+            raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
+
+        # original_image
+
+        if isinstance(original_image, list):
+            check_image_type = original_image[0]
+        else:
+            check_image_type = original_image
+
+        if (
+            not isinstance(check_image_type, torch.Tensor)
+            and not isinstance(check_image_type, PIL.Image.Image)
+            and not isinstance(check_image_type, np.ndarray)
+        ):
+            raise ValueError(
+                "`original_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
+                f" {type(check_image_type)}"
+            )
+
+        if isinstance(original_image, list):
+            image_batch_size = len(original_image)
+        elif isinstance(original_image, torch.Tensor):
+            image_batch_size = original_image.shape[0]
+        elif isinstance(original_image, PIL.Image.Image):
+            image_batch_size = 1
+        elif isinstance(original_image, np.ndarray):
+            image_batch_size = original_image.shape[0]
+        else:
+            assert False
+
+        if batch_size != image_batch_size:
+            raise ValueError(
+                f"original_image batch size: {image_batch_size} must be same as prompt batch size {batch_size}"
+            )
+
+        # mask_image
+
+        if isinstance(mask_image, list):
+            check_image_type = mask_image[0]
+        else:
+            check_image_type = mask_image
+
+        if (
+            not isinstance(check_image_type, torch.Tensor)
+            and not isinstance(check_image_type, PIL.Image.Image)
+            and not isinstance(check_image_type, np.ndarray)
+        ):
+            raise ValueError(
+                "`mask_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
+                f" {type(check_image_type)}"
+            )
+
+        if isinstance(mask_image, list):
+            image_batch_size = len(mask_image)
+        elif isinstance(mask_image, torch.Tensor):
+            image_batch_size = mask_image.shape[0]
+        elif isinstance(mask_image, PIL.Image.Image):
+            image_batch_size = 1
+        elif isinstance(mask_image, np.ndarray):
+            image_batch_size = mask_image.shape[0]
+        else:
+            assert False
+
+        if image_batch_size != 1 and batch_size != image_batch_size:
+            raise ValueError(
+                f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image
+    def preprocess_original_image(self, image: PIL.Image.Image) -> torch.Tensor:
+        if not isinstance(image, list):
+            image = [image]
+
+        def numpy_to_pt(images):
+            if images.ndim == 3:
+                images = images[..., None]
+
+            images = torch.from_numpy(images.transpose(0, 3, 1, 2))
+            return images
+
+        if isinstance(image[0], PIL.Image.Image):
+            new_image = []
+
+            for image_ in image:
+                image_ = image_.convert("RGB")
+                image_ = resize(image_, self.unet.config.sample_size)
+                image_ = np.array(image_)
+                image_ = image_.astype(np.float32)
+                image_ = image_ / 127.5 - 1
+                new_image.append(image_)
+
+            image = new_image
+
+            image = np.stack(image, axis=0)  # to np
+            image = numpy_to_pt(image)  # to pt
+
+        elif isinstance(image[0], np.ndarray):
+            image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
+            image = numpy_to_pt(image)
+
+        elif isinstance(image[0], torch.Tensor):
+            image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)
+
+        return image
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image
+    def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device) -> torch.Tensor:
+        if not isinstance(image, torch.Tensor) and not isinstance(image, list):
+            image = [image]
+
+        if isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i).astype(np.float32) / 127.5 - 1.0 for i in image]
+
+            image = np.stack(image, axis=0)  # to np
+            image = torch.from_numpy(image.transpose(0, 3, 1, 2))
+        elif isinstance(image[0], np.ndarray):
+            image = np.stack(image, axis=0)  # to np
+            if image.ndim == 5:
+                image = image[0]
+
+            image = torch.from_numpy(image.transpose(0, 3, 1, 2))
+        elif isinstance(image, list) and isinstance(image[0], torch.Tensor):
+            dims = image[0].ndim
+
+            if dims == 3:
+                image = torch.stack(image, dim=0)
+            elif dims == 4:
+                image = torch.concat(image, dim=0)
+            else:
+                raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}")
+
+        image = image.to(device=device, dtype=self.unet.dtype)
+
+        image = image.repeat_interleave(num_images_per_prompt, dim=0)
+
+        return image
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.preprocess_mask_image
+    def preprocess_mask_image(self, mask_image) -> torch.Tensor:
+        if not isinstance(mask_image, list):
+            mask_image = [mask_image]
+
+        if isinstance(mask_image[0], torch.Tensor):
+            mask_image = torch.cat(mask_image, axis=0) if mask_image[0].ndim == 4 else torch.stack(mask_image, axis=0)
+
+            if mask_image.ndim == 2:
+                # Batch and add channel dim for single mask
+                mask_image = mask_image.unsqueeze(0).unsqueeze(0)
+            elif mask_image.ndim == 3 and mask_image.shape[0] == 1:
+                # Single mask, the 0'th dimension is considered to be
+                # the existing batch size of 1
+                mask_image = mask_image.unsqueeze(0)
+            elif mask_image.ndim == 3 and mask_image.shape[0] != 1:
+                # Batch of mask, the 0'th dimension is considered to be
+                # the batching dimension
+                mask_image = mask_image.unsqueeze(1)
+
+            mask_image[mask_image < 0.5] = 0
+            mask_image[mask_image >= 0.5] = 1
+
+        elif isinstance(mask_image[0], PIL.Image.Image):
+            new_mask_image = []
+
+            for mask_image_ in mask_image:
+                mask_image_ = mask_image_.convert("L")
+                mask_image_ = resize(mask_image_, self.unet.config.sample_size)
+                mask_image_ = np.array(mask_image_)
+                mask_image_ = mask_image_[None, None, :]
+                new_mask_image.append(mask_image_)
+
+            mask_image = new_mask_image
+
+            mask_image = np.concatenate(mask_image, axis=0)
+            mask_image = mask_image.astype(np.float32) / 255.0
+            mask_image[mask_image < 0.5] = 0
+            mask_image[mask_image >= 0.5] = 1
+            mask_image = torch.from_numpy(mask_image)
+
+        elif isinstance(mask_image[0], np.ndarray):
+            mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0)
+
+            mask_image[mask_image < 0.5] = 0
+            mask_image[mask_image >= 0.5] = 1
+            mask_image = torch.from_numpy(mask_image)
+
+        return mask_image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.prepare_intermediate_images
+    def prepare_intermediate_images(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator=None
+    ):
+        image_batch_size, channels, height, width = image.shape
+
+        batch_size = batch_size * num_images_per_prompt
+
+        shape = (batch_size, channels, height, width)
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        image = image.repeat_interleave(num_images_per_prompt, dim=0)
+        noised_image = self.scheduler.add_noise(image, noise, timestep)
+
+        image = (1 - mask_image) * image + mask_image * noised_image
+
+        return image
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
+        original_image: Union[
+            PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]
+        ] = None,
+        mask_image: Union[
+            PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]
+        ] = None,
+        strength: float = 0.8,
+        prompt: Union[str, List[str]] = None,
+        num_inference_steps: int = 100,
+        timesteps: List[int] = None,
+        guidance_scale: float = 4.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        noise_level: int = 0,
+        clean_caption: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            original_image (`torch.Tensor` or `PIL.Image.Image`):
+                The original image that `image` was varied from.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            noise_level (`int`, *optional*, defaults to 0):
+                The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)`
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images, and the second element is a list
+            of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+            or watermarked content, according to the `safety_checker`.
+        """
+        # 1. Check inputs. Raise error if not correct
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        self.check_inputs(
+            prompt,
+            image,
+            original_image,
+            mask_image,
+            batch_size,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clean_caption=clean_caption,
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        dtype = prompt_embeds.dtype
+
+        # 4. Prepare timesteps
+        if timesteps is not None:
+            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
+            timesteps = self.scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        else:
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+
+        # 5. prepare original image
+        original_image = self.preprocess_original_image(original_image)
+        original_image = original_image.to(device=device, dtype=dtype)
+
+        # 6. prepare mask image
+        mask_image = self.preprocess_mask_image(mask_image)
+        mask_image = mask_image.to(device=device, dtype=dtype)
+
+        if mask_image.shape[0] == 1:
+            mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, dim=0)
+        else:
+            mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # 6. Prepare intermediate images
+        noise_timestep = timesteps[0:1]
+        noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt)
+
+        intermediate_images = self.prepare_intermediate_images(
+            original_image,
+            noise_timestep,
+            batch_size,
+            num_images_per_prompt,
+            dtype,
+            device,
+            mask_image,
+            generator,
+        )
+
+        # 7. Prepare upscaled image and noise level
+        _, _, height, width = original_image.shape
+
+        image = self.preprocess_image(image, num_images_per_prompt, device)
+
+        upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True)
+
+        noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device)
+        noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype)
+        upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level)
+
+        if do_classifier_free_guidance:
+            noise_level = torch.cat([noise_level] * 2)
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # HACK: see comment in `enable_model_cpu_offload`
+        if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None:
+            self.text_encoder_offload_hook.offload()
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                model_input = torch.cat([intermediate_images, upscaled], dim=1)
+
+                model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input
+                model_input = self.scheduler.scale_model_input(model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    class_labels=noise_level,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1)
+                    noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+
+                if self.scheduler.config.variance_type not in ["learned", "learned_range"]:
+                    noise_pred, _ = noise_pred.split(intermediate_images.shape[1], dim=1)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                prev_intermediate_images = intermediate_images
+
+                intermediate_images = self.scheduler.step(
+                    noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+                )[0]
+
+                intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, intermediate_images)
+
+        image = intermediate_images
+
+        if output_type == "pil":
+            # 10. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 11. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 12. Convert to PIL
+            image = self.numpy_to_pil(image)
+
+            # 13. Apply watermark
+            if self.watermarker is not None:
+                self.watermarker.apply_watermark(image, self.unet.config.sample_size)
+        elif output_type == "pt":
+            nsfw_detected = None
+            watermark_detected = None
+
+        else:
+            # 10. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 11. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, nsfw_detected, watermark_detected)
+
+        return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected)
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..891963f2a904fb9d219ab1987df150d90156ea4a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
@@ -0,0 +1,870 @@
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin
+from ...models import UNet2DConditionModel
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import IFPipelineOutput
+from .safety_checker import IFSafetyChecker
+from .watermark import IFWatermarker
+
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import IFPipeline, IFSuperResolutionPipeline, DiffusionPipeline
+        >>> from diffusers.utils import pt_to_pil
+        >>> import torch
+
+        >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
+        >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
+
+        >>> image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images
+
+        >>> # save intermediate image
+        >>> pil_image = pt_to_pil(image)
+        >>> pil_image[0].save("./if_stage_I.png")
+
+        >>> super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained(
+        ...     "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> super_res_1_pipe.enable_model_cpu_offload()
+
+        >>> image = super_res_1_pipe(
+        ...     image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds
+        ... ).images
+        >>> image[0].save("./if_stage_II.png")
+        ```
+"""
+
+
+class IFSuperResolutionPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
+    tokenizer: T5Tokenizer
+    text_encoder: T5EncoderModel
+
+    unet: UNet2DConditionModel
+    scheduler: DDPMScheduler
+    image_noising_scheduler: DDPMScheduler
+
+    feature_extractor: Optional[CLIPImageProcessor]
+    safety_checker: Optional[IFSafetyChecker]
+
+    watermarker: Optional[IFWatermarker]
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
+    model_cpu_offload_seq = "text_encoder->unet"
+    _exclude_from_cpu_offload = ["watermarker"]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        image_noising_scheduler: DDPMScheduler,
+        safety_checker: Optional[IFSafetyChecker],
+        feature_extractor: Optional[CLIPImageProcessor],
+        watermarker: Optional[IFWatermarker],
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the IF license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if unet.config.in_channels != 6:
+            logger.warning(
+                "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`."
+            )
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            image_noising_scheduler=image_noising_scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            watermarker=watermarker,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    @torch.no_grad()
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            clean_caption (bool, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+        """
+        if prompt is not None and negative_prompt is not None:
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF
+        max_length = 77
+
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            attention_mask = text_inputs.attention_mask.to(device)
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            dtype = self.unet.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            attention_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+        else:
+            negative_prompt_embeds = None
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, nsfw_detected, watermark_detected = self.safety_checker(
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(dtype=dtype),
+            )
+        else:
+            nsfw_detected = None
+            watermark_detected = None
+
+        return image, nsfw_detected, watermark_detected
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        batch_size,
+        noise_level,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps:
+            raise ValueError(
+                f"`noise_level`: {noise_level} must be a valid timestep in `self.noising_scheduler`, [0, {self.image_noising_scheduler.config.num_train_timesteps})"
+            )
+
+        if isinstance(image, list):
+            check_image_type = image[0]
+        else:
+            check_image_type = image
+
+        if (
+            not isinstance(check_image_type, torch.Tensor)
+            and not isinstance(check_image_type, PIL.Image.Image)
+            and not isinstance(check_image_type, np.ndarray)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
+                f" {type(check_image_type)}"
+            )
+
+        if isinstance(image, list):
+            image_batch_size = len(image)
+        elif isinstance(image, torch.Tensor):
+            image_batch_size = image.shape[0]
+        elif isinstance(image, PIL.Image.Image):
+            image_batch_size = 1
+        elif isinstance(image, np.ndarray):
+            image_batch_size = image.shape[0]
+        else:
+            assert False
+
+        if batch_size != image_batch_size:
+            raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_intermediate_images
+    def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, device, generator):
+        shape = (batch_size, num_channels, height, width)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        intermediate_images = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        intermediate_images = intermediate_images * self.scheduler.init_noise_sigma
+        return intermediate_images
+
+    def preprocess_image(self, image, num_images_per_prompt, device):
+        if not isinstance(image, torch.Tensor) and not isinstance(image, list):
+            image = [image]
+
+        if isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i).astype(np.float32) / 127.5 - 1.0 for i in image]
+
+            image = np.stack(image, axis=0)  # to np
+            image = torch.from_numpy(image.transpose(0, 3, 1, 2))
+        elif isinstance(image[0], np.ndarray):
+            image = np.stack(image, axis=0)  # to np
+            if image.ndim == 5:
+                image = image[0]
+
+            image = torch.from_numpy(image.transpose(0, 3, 1, 2))
+        elif isinstance(image, list) and isinstance(image[0], torch.Tensor):
+            dims = image[0].ndim
+
+            if dims == 3:
+                image = torch.stack(image, dim=0)
+            elif dims == 4:
+                image = torch.concat(image, dim=0)
+            else:
+                raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}")
+
+        image = image.to(device=device, dtype=self.unet.dtype)
+
+        image = image.repeat_interleave(num_images_per_prompt, dim=0)
+
+        return image
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: int = None,
+        width: int = None,
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 4.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        noise_level: int = 250,
+        clean_caption: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            height (`int`, *optional*, defaults to None):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to None):
+                The width in pixels of the generated image.
+            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`):
+                The image to be upscaled.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*, defaults to None):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            noise_level (`int`, *optional*, defaults to 250):
+                The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)`
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images, and the second element is a list
+            of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+            or watermarked content, according to the `safety_checker`.
+        """
+        # 1. Check inputs. Raise error if not correct
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        self.check_inputs(
+            prompt,
+            image,
+            batch_size,
+            noise_level,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+
+        height = height or self.unet.config.sample_size
+        width = width or self.unet.config.sample_size
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clean_caption=clean_caption,
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        if timesteps is not None:
+            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
+            timesteps = self.scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        else:
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(0)
+
+        # 5. Prepare intermediate images
+        num_channels = self.unet.config.in_channels // 2
+        intermediate_images = self.prepare_intermediate_images(
+            batch_size * num_images_per_prompt,
+            num_channels,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare upscaled image and noise level
+        image = self.preprocess_image(image, num_images_per_prompt, device)
+        upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True)
+
+        noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device)
+        noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype)
+        upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level)
+
+        if do_classifier_free_guidance:
+            noise_level = torch.cat([noise_level] * 2)
+
+        # HACK: see comment in `enable_model_cpu_offload`
+        if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None:
+            self.text_encoder_offload_hook.offload()
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                model_input = torch.cat([intermediate_images, upscaled], dim=1)
+
+                model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input
+                model_input = self.scheduler.scale_model_input(model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    class_labels=noise_level,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1)
+                    noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+
+                if self.scheduler.config.variance_type not in ["learned", "learned_range"]:
+                    noise_pred, _ = noise_pred.split(intermediate_images.shape[1], dim=1)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                intermediate_images = self.scheduler.step(
+                    noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+                )[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, intermediate_images)
+
+        image = intermediate_images
+
+        if output_type == "pil":
+            # 9. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 10. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 11. Convert to PIL
+            image = self.numpy_to_pil(image)
+
+            # 12. Apply watermark
+            if self.watermarker is not None:
+                self.watermarker.apply_watermark(image, self.unet.config.sample_size)
+        elif output_type == "pt":
+            nsfw_detected = None
+            watermark_detected = None
+
+            if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None:
+                self.unet_offload_hook.offload()
+        else:
+            # 9. Post-processing
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            # 10. Run safety checker
+            image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, nsfw_detected, watermark_detected)
+
+        return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected)
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f39ab5ba70ccbcaa1ca10438fe829d243277e06
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py
@@ -0,0 +1,28 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class IFPipelineOutput(BaseOutput):
+    """
+    Args:
+    Output class for Stable Diffusion pipelines.
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+        nsfw_detected (`List[bool]`)
+            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content or a watermark. `None` if safety checking could not be performed.
+        watermark_detected (`List[bool]`)
+            List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety
+            checking could not be performed.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_detected: Optional[List[bool]]
+    watermark_detected: Optional[List[bool]]
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/safety_checker.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/safety_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ffeed580bbea1514b11bf7a168a952328d8f424
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/safety_checker.py
@@ -0,0 +1,59 @@
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import CLIPConfig, CLIPVisionModelWithProjection, PreTrainedModel
+
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class IFSafetyChecker(PreTrainedModel):
+    config_class = CLIPConfig
+
+    _no_split_modules = ["CLIPEncoderLayer"]
+
+    def __init__(self, config: CLIPConfig):
+        super().__init__(config)
+
+        self.vision_model = CLIPVisionModelWithProjection(config.vision_config)
+
+        self.p_head = nn.Linear(config.vision_config.projection_dim, 1)
+        self.w_head = nn.Linear(config.vision_config.projection_dim, 1)
+
+    @torch.no_grad()
+    def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5):
+        image_embeds = self.vision_model(clip_input)[0]
+
+        nsfw_detected = self.p_head(image_embeds)
+        nsfw_detected = nsfw_detected.flatten()
+        nsfw_detected = nsfw_detected > p_threshold
+        nsfw_detected = nsfw_detected.tolist()
+
+        if any(nsfw_detected):
+            logger.warning(
+                "Potential NSFW content was detected in one or more images. A black image will be returned instead."
+                " Try again with a different prompt and/or seed."
+            )
+
+        for idx, nsfw_detected_ in enumerate(nsfw_detected):
+            if nsfw_detected_:
+                images[idx] = np.zeros(images[idx].shape)
+
+        watermark_detected = self.w_head(image_embeds)
+        watermark_detected = watermark_detected.flatten()
+        watermark_detected = watermark_detected > w_threshold
+        watermark_detected = watermark_detected.tolist()
+
+        if any(watermark_detected):
+            logger.warning(
+                "Potential watermarked content was detected in one or more images. A black image will be returned instead."
+                " Try again with a different prompt and/or seed."
+            )
+
+        for idx, watermark_detected_ in enumerate(watermark_detected):
+            if watermark_detected_:
+                images[idx] = np.zeros(images[idx].shape)
+
+        return images, nsfw_detected, watermark_detected
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/timesteps.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/timesteps.py
new file mode 100644
index 0000000000000000000000000000000000000000..d44285c017bbb2ccffa4ae86dd77792a048625d9
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/timesteps.py
@@ -0,0 +1,579 @@
+fast27_timesteps = [
+    999,
+    800,
+    799,
+    600,
+    599,
+    500,
+    400,
+    399,
+    377,
+    355,
+    333,
+    311,
+    288,
+    266,
+    244,
+    222,
+    200,
+    199,
+    177,
+    155,
+    133,
+    111,
+    88,
+    66,
+    44,
+    22,
+    0,
+]
+
+smart27_timesteps = [
+    999,
+    976,
+    952,
+    928,
+    905,
+    882,
+    858,
+    857,
+    810,
+    762,
+    715,
+    714,
+    572,
+    429,
+    428,
+    286,
+    285,
+    238,
+    190,
+    143,
+    142,
+    118,
+    95,
+    71,
+    47,
+    24,
+    0,
+]
+
+smart50_timesteps = [
+    999,
+    988,
+    977,
+    966,
+    955,
+    944,
+    933,
+    922,
+    911,
+    900,
+    899,
+    879,
+    859,
+    840,
+    820,
+    800,
+    799,
+    766,
+    733,
+    700,
+    699,
+    650,
+    600,
+    599,
+    500,
+    499,
+    400,
+    399,
+    350,
+    300,
+    299,
+    266,
+    233,
+    200,
+    199,
+    179,
+    159,
+    140,
+    120,
+    100,
+    99,
+    88,
+    77,
+    66,
+    55,
+    44,
+    33,
+    22,
+    11,
+    0,
+]
+
+smart100_timesteps = [
+    999,
+    995,
+    992,
+    989,
+    985,
+    981,
+    978,
+    975,
+    971,
+    967,
+    964,
+    961,
+    957,
+    956,
+    951,
+    947,
+    942,
+    937,
+    933,
+    928,
+    923,
+    919,
+    914,
+    913,
+    908,
+    903,
+    897,
+    892,
+    887,
+    881,
+    876,
+    871,
+    870,
+    864,
+    858,
+    852,
+    846,
+    840,
+    834,
+    828,
+    827,
+    820,
+    813,
+    806,
+    799,
+    792,
+    785,
+    784,
+    777,
+    770,
+    763,
+    756,
+    749,
+    742,
+    741,
+    733,
+    724,
+    716,
+    707,
+    699,
+    698,
+    688,
+    677,
+    666,
+    656,
+    655,
+    645,
+    634,
+    623,
+    613,
+    612,
+    598,
+    584,
+    570,
+    569,
+    555,
+    541,
+    527,
+    526,
+    505,
+    484,
+    483,
+    462,
+    440,
+    439,
+    396,
+    395,
+    352,
+    351,
+    308,
+    307,
+    264,
+    263,
+    220,
+    219,
+    176,
+    132,
+    88,
+    44,
+    0,
+]
+
+smart185_timesteps = [
+    999,
+    997,
+    995,
+    992,
+    990,
+    988,
+    986,
+    984,
+    981,
+    979,
+    977,
+    975,
+    972,
+    970,
+    968,
+    966,
+    964,
+    961,
+    959,
+    957,
+    956,
+    954,
+    951,
+    949,
+    946,
+    944,
+    941,
+    939,
+    936,
+    934,
+    931,
+    929,
+    926,
+    924,
+    921,
+    919,
+    916,
+    914,
+    913,
+    910,
+    907,
+    905,
+    902,
+    899,
+    896,
+    893,
+    891,
+    888,
+    885,
+    882,
+    879,
+    877,
+    874,
+    871,
+    870,
+    867,
+    864,
+    861,
+    858,
+    855,
+    852,
+    849,
+    846,
+    843,
+    840,
+    837,
+    834,
+    831,
+    828,
+    827,
+    824,
+    821,
+    817,
+    814,
+    811,
+    808,
+    804,
+    801,
+    798,
+    795,
+    791,
+    788,
+    785,
+    784,
+    780,
+    777,
+    774,
+    770,
+    766,
+    763,
+    760,
+    756,
+    752,
+    749,
+    746,
+    742,
+    741,
+    737,
+    733,
+    730,
+    726,
+    722,
+    718,
+    714,
+    710,
+    707,
+    703,
+    699,
+    698,
+    694,
+    690,
+    685,
+    681,
+    677,
+    673,
+    669,
+    664,
+    660,
+    656,
+    655,
+    650,
+    646,
+    641,
+    636,
+    632,
+    627,
+    622,
+    618,
+    613,
+    612,
+    607,
+    602,
+    596,
+    591,
+    586,
+    580,
+    575,
+    570,
+    569,
+    563,
+    557,
+    551,
+    545,
+    539,
+    533,
+    527,
+    526,
+    519,
+    512,
+    505,
+    498,
+    491,
+    484,
+    483,
+    474,
+    466,
+    457,
+    449,
+    440,
+    439,
+    428,
+    418,
+    407,
+    396,
+    395,
+    381,
+    366,
+    352,
+    351,
+    330,
+    308,
+    307,
+    286,
+    264,
+    263,
+    242,
+    220,
+    219,
+    176,
+    175,
+    132,
+    131,
+    88,
+    44,
+    0,
+]
+
+super27_timesteps = [
+    999,
+    991,
+    982,
+    974,
+    966,
+    958,
+    950,
+    941,
+    933,
+    925,
+    916,
+    908,
+    900,
+    899,
+    874,
+    850,
+    825,
+    800,
+    799,
+    700,
+    600,
+    500,
+    400,
+    300,
+    200,
+    100,
+    0,
+]
+
+super40_timesteps = [
+    999,
+    992,
+    985,
+    978,
+    971,
+    964,
+    957,
+    949,
+    942,
+    935,
+    928,
+    921,
+    914,
+    907,
+    900,
+    899,
+    879,
+    859,
+    840,
+    820,
+    800,
+    799,
+    766,
+    733,
+    700,
+    699,
+    650,
+    600,
+    599,
+    500,
+    499,
+    400,
+    399,
+    300,
+    299,
+    200,
+    199,
+    100,
+    99,
+    0,
+]
+
+super100_timesteps = [
+    999,
+    996,
+    992,
+    989,
+    985,
+    982,
+    979,
+    975,
+    972,
+    968,
+    965,
+    961,
+    958,
+    955,
+    951,
+    948,
+    944,
+    941,
+    938,
+    934,
+    931,
+    927,
+    924,
+    920,
+    917,
+    914,
+    910,
+    907,
+    903,
+    900,
+    899,
+    891,
+    884,
+    876,
+    869,
+    861,
+    853,
+    846,
+    838,
+    830,
+    823,
+    815,
+    808,
+    800,
+    799,
+    788,
+    777,
+    766,
+    755,
+    744,
+    733,
+    722,
+    711,
+    700,
+    699,
+    688,
+    677,
+    666,
+    655,
+    644,
+    633,
+    622,
+    611,
+    600,
+    599,
+    585,
+    571,
+    557,
+    542,
+    528,
+    514,
+    500,
+    499,
+    485,
+    471,
+    457,
+    442,
+    428,
+    414,
+    400,
+    399,
+    379,
+    359,
+    340,
+    320,
+    300,
+    299,
+    279,
+    259,
+    240,
+    220,
+    200,
+    199,
+    166,
+    133,
+    100,
+    99,
+    66,
+    33,
+    0,
+]
diff --git a/diffusers/src/diffusers/pipelines/deepfloyd_if/watermark.py b/diffusers/src/diffusers/pipelines/deepfloyd_if/watermark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e03e3fab026a5702eb9c45a4d19fd3d4a0d3da6d
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deepfloyd_if/watermark.py
@@ -0,0 +1,46 @@
+from typing import List
+
+import PIL.Image
+import torch
+from PIL import Image
+
+from ...configuration_utils import ConfigMixin
+from ...models.modeling_utils import ModelMixin
+from ...utils import PIL_INTERPOLATION
+
+
+class IFWatermarker(ModelMixin, ConfigMixin):
+    def __init__(self):
+        super().__init__()
+
+        self.register_buffer("watermark_image", torch.zeros((62, 62, 4)))
+        self.watermark_image_as_pil = None
+
+    def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None):
+        # Copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287
+
+        h = images[0].height
+        w = images[0].width
+
+        sample_size = sample_size or h
+
+        coef = min(h / sample_size, w / sample_size)
+        img_h, img_w = (int(h / coef), int(w / coef)) if coef < 1 else (h, w)
+
+        S1, S2 = 1024**2, img_w * img_h
+        K = (S2 / S1) ** 0.5
+        wm_size, wm_x, wm_y = int(K * 62), img_w - int(14 * K), img_h - int(14 * K)
+
+        if self.watermark_image_as_pil is None:
+            watermark_image = self.watermark_image.to(torch.uint8).cpu().numpy()
+            watermark_image = Image.fromarray(watermark_image, mode="RGBA")
+            self.watermark_image_as_pil = watermark_image
+
+        wm_img = self.watermark_image_as_pil.resize(
+            (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None
+        )
+
+        for pil_img in images:
+            pil_img.paste(wm_img, box=(wm_x - wm_size, wm_y - wm_size, wm_x, wm_y), mask=wm_img.split()[-1])
+
+        return images
diff --git a/diffusers/src/diffusers/pipelines/deprecated/README.md b/diffusers/src/diffusers/pipelines/deprecated/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1e21dbbbd96ca532a4ba286a84f244f12f177fb5
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/README.md
@@ -0,0 +1,3 @@
+# Deprecated Pipelines
+
+This folder contains pipelines that have very low usage as measured by model downloads, issues and PRs. While you can still use the pipelines just as before, we will stop testing the pipelines and will not accept any changes to existing files.
\ No newline at end of file
diff --git a/diffusers/src/diffusers/pipelines/deprecated/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9936323170adbceac2c5c25e3881ea731d8602e1
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/__init__.py
@@ -0,0 +1,153 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_librosa_available,
+    is_note_seq_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_pt_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_pt_objects))
+else:
+    _import_structure["latent_diffusion_uncond"] = ["LDMPipeline"]
+    _import_structure["pndm"] = ["PNDMPipeline"]
+    _import_structure["repaint"] = ["RePaintPipeline"]
+    _import_structure["score_sde_ve"] = ["ScoreSdeVePipeline"]
+    _import_structure["stochastic_karras_ve"] = ["KarrasVePipeline"]
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["alt_diffusion"] = [
+        "AltDiffusionImg2ImgPipeline",
+        "AltDiffusionPipeline",
+        "AltDiffusionPipelineOutput",
+    ]
+    _import_structure["versatile_diffusion"] = [
+        "VersatileDiffusionDualGuidedPipeline",
+        "VersatileDiffusionImageVariationPipeline",
+        "VersatileDiffusionPipeline",
+        "VersatileDiffusionTextToImagePipeline",
+    ]
+    _import_structure["vq_diffusion"] = ["VQDiffusionPipeline"]
+    _import_structure["stable_diffusion_variants"] = [
+        "CycleDiffusionPipeline",
+        "StableDiffusionInpaintPipelineLegacy",
+        "StableDiffusionPix2PixZeroPipeline",
+        "StableDiffusionParadigmsPipeline",
+        "StableDiffusionModelEditingPipeline",
+    ]
+
+try:
+    if not (is_torch_available() and is_librosa_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_librosa_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects))
+
+else:
+    _import_structure["audio_diffusion"] = ["AudioDiffusionPipeline", "Mel"]
+
+try:
+    if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_transformers_and_torch_and_note_seq_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects))
+
+else:
+    _import_structure["spectrogram_diffusion"] = ["MidiProcessor", "SpectrogramDiffusionPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_pt_objects import *
+
+    else:
+        from .latent_diffusion_uncond import LDMPipeline
+        from .pndm import PNDMPipeline
+        from .repaint import RePaintPipeline
+        from .score_sde_ve import ScoreSdeVePipeline
+        from .stochastic_karras_ve import KarrasVePipeline
+
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+
+    else:
+        from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline, AltDiffusionPipelineOutput
+        from .audio_diffusion import AudioDiffusionPipeline, Mel
+        from .spectrogram_diffusion import SpectrogramDiffusionPipeline
+        from .stable_diffusion_variants import (
+            CycleDiffusionPipeline,
+            StableDiffusionInpaintPipelineLegacy,
+            StableDiffusionModelEditingPipeline,
+            StableDiffusionParadigmsPipeline,
+            StableDiffusionPix2PixZeroPipeline,
+        )
+        from .stochastic_karras_ve import KarrasVePipeline
+        from .versatile_diffusion import (
+            VersatileDiffusionDualGuidedPipeline,
+            VersatileDiffusionImageVariationPipeline,
+            VersatileDiffusionPipeline,
+            VersatileDiffusionTextToImagePipeline,
+        )
+        from .vq_diffusion import VQDiffusionPipeline
+
+    try:
+        if not (is_torch_available() and is_librosa_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_librosa_objects import *
+    else:
+        from .audio_diffusion import AudioDiffusionPipeline, Mel
+
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_transformers_and_torch_and_note_seq_objects import *  # noqa F403
+    else:
+        from .spectrogram_diffusion import (
+            MidiProcessor,
+            SpectrogramDiffusionPipeline,
+        )
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..71fa15b3feff08dc4008d1fa02ba61ad1300efed
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py
@@ -0,0 +1,53 @@
+from typing import TYPE_CHECKING
+
+from ....utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ....utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["modeling_roberta_series"] = ["RobertaSeriesModelWithTransformation"]
+    _import_structure["pipeline_alt_diffusion"] = ["AltDiffusionPipeline"]
+    _import_structure["pipeline_alt_diffusion_img2img"] = ["AltDiffusionImg2ImgPipeline"]
+
+    _import_structure["pipeline_output"] = ["AltDiffusionPipelineOutput"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ....utils.dummy_torch_and_transformers_objects import *
+
+    else:
+        from .modeling_roberta_series import RobertaSeriesModelWithTransformation
+        from .pipeline_alt_diffusion import AltDiffusionPipeline
+        from .pipeline_alt_diffusion_img2img import AltDiffusionImg2ImgPipeline
+        from .pipeline_output import AltDiffusionPipelineOutput
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py b/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py
new file mode 100644
index 0000000000000000000000000000000000000000..f69f905b56c528a292340fcf33ee3b016096d1f4
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py
@@ -0,0 +1,124 @@
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import RobertaPreTrainedModel, XLMRobertaConfig, XLMRobertaModel
+from transformers.utils import ModelOutput
+
+
+@dataclass
+class TransformationModelOutput(ModelOutput):
+    """
+    Base class for text model's outputs that also contains a pooling of the last hidden states.
+
+    Args:
+        text_embeds (`torch.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+            The text embeddings obtained by applying the projection layer to the pooler_output.
+        last_hidden_state (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one
+            for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    projection_state: Optional[torch.Tensor] = None
+    last_hidden_state: torch.Tensor = None
+    hidden_states: Optional[Tuple[torch.Tensor]] = None
+    attentions: Optional[Tuple[torch.Tensor]] = None
+
+
+class RobertaSeriesConfig(XLMRobertaConfig):
+    def __init__(
+        self,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        project_dim=512,
+        pooler_fn="cls",
+        learn_encoder=False,
+        use_attention_mask=True,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.project_dim = project_dim
+        self.pooler_fn = pooler_fn
+        self.learn_encoder = learn_encoder
+        self.use_attention_mask = use_attention_mask
+
+
+class RobertaSeriesModelWithTransformation(RobertaPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"logit_scale"]
+    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+    base_model_prefix = "roberta"
+    config_class = RobertaSeriesConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.roberta = XLMRobertaModel(config)
+        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
+        self.has_pre_transformation = getattr(config, "has_pre_transformation", False)
+        if self.has_pre_transformation:
+            self.transformation_pre = nn.Linear(config.hidden_size, config.project_dim)
+            self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ):
+        r""" """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.base_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=True if self.has_pre_transformation else output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.has_pre_transformation:
+            sequence_output2 = outputs["hidden_states"][-2]
+            sequence_output2 = self.pre_LN(sequence_output2)
+            projection_state2 = self.transformation_pre(sequence_output2)
+
+            return TransformationModelOutput(
+                projection_state=projection_state2,
+                last_hidden_state=outputs.last_hidden_state,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+        else:
+            projection_state = self.transformation(outputs.last_hidden_state)
+            return TransformationModelOutput(
+                projection_state=projection_state,
+                last_hidden_state=outputs.last_hidden_state,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
diff --git a/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6730ee610c9b52812797b3097209efae77dc3f8
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py
@@ -0,0 +1,974 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, XLMRobertaTokenizer
+
+from ....configuration_utils import FrozenDict
+from ....image_processor import PipelineImageInput, VaeImageProcessor
+from ....loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ....models.lora import adjust_lora_scale_text_encoder
+from ....schedulers import KarrasDiffusionSchedulers
+from ....utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .modeling_roberta_series import RobertaSeriesModelWithTransformation
+from .pipeline_output import AltDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AltDiffusionPipeline
+
+        >>> pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9", torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> # "dark elf princess, highly detailed, d & d, fantasy, highly detailed, digital painting, trending on artstation, concept art, sharp focus, illustration, art by artgerm and greg rutkowski and fuji choko and viktoria gavrilenko and hoang lap"
+        >>> prompt = "黑暗精灵公主，非常详细，幻想，非常详细，数字绘画，概念艺术，敏锐的焦点，插图"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class AltDiffusionPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Alt Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.RobertaSeriesModelWithTransformation`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.XLMRobertaTokenizer`]):
+            A `XLMRobertaTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: RobertaSeriesModelWithTransformation,
+        tokenizer: XLMRobertaTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
+            )
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fbf5ccb274d92d5b70dc2e892a08440ab361c0d
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -0,0 +1,1041 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, XLMRobertaTokenizer
+
+from ....configuration_utils import FrozenDict
+from ....image_processor import PipelineImageInput, VaeImageProcessor
+from ....loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ....models.lora import adjust_lora_scale_text_encoder
+from ....schedulers import KarrasDiffusionSchedulers
+from ....utils import (
+    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .modeling_roberta_series import RobertaSeriesModelWithTransformation
+from .pipeline_output import AltDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from io import BytesIO
+
+        >>> from diffusers import AltDiffusionImg2ImgPipeline
+
+        >>> device = "cuda"
+        >>> model_id_or_path = "BAAI/AltDiffusion-m9"
+        >>> pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
+        >>> pipe = pipe.to(device)
+
+        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+        >>> response = requests.get(url)
+        >>> init_image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> init_image = init_image.resize((768, 512))
+
+        >>> # "A fantasy landscape, trending on artstation"
+        >>> prompt = "幻想风景, artstation"
+
+        >>> images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+        >>> images[0].save("幻想风景.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
+def preprocess(image):
+    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
+    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class AltDiffusionImg2ImgPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-guided image-to-image generation using Alt Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.RobertaSeriesModelWithTransformation`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.XLMRobertaTokenizer`]):
+            A `XLMRobertaTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: RobertaSeriesModelWithTransformation,
+        tokenizer: XLMRobertaTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        strength,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: int = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            strength,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
+            )
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 5. set timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
+        # 7.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py b/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd174ae3c21fe4110babd503f0418366472059ff
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py
@@ -0,0 +1,28 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+
+from ....utils import (
+    BaseOutput,
+)
+
+
+@dataclass
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_output.StableDiffusionPipelineOutput with Stable->Alt
+class AltDiffusionPipelineOutput(BaseOutput):
+    """
+    Output class for Alt Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+        nsfw_content_detected (`List[bool]`)
+            List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
+            `None` if safety checking could not be performed.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
diff --git a/diffusers/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3127951863a7db3f9dd8e42ac5ab64fa9ac3ec0c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py
@@ -0,0 +1,23 @@
+from typing import TYPE_CHECKING
+
+from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
+
+
+_import_structure = {
+    "mel": ["Mel"],
+    "pipeline_audio_diffusion": ["AudioDiffusionPipeline"],
+}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .mel import Mel
+    from .pipeline_audio_diffusion import AudioDiffusionPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py b/diffusers/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py
new file mode 100644
index 0000000000000000000000000000000000000000..3426c3ad0428efc69f2f8dee7a620df3c78b1405
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py
@@ -0,0 +1,179 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import numpy as np  # noqa: E402
+
+from ....configuration_utils import ConfigMixin, register_to_config
+from ....schedulers.scheduling_utils import SchedulerMixin
+
+
+try:
+    import librosa  # noqa: E402
+
+    _librosa_can_be_imported = True
+    _import_error = ""
+except Exception as e:
+    _librosa_can_be_imported = False
+    _import_error = (
+        f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to install it."
+    )
+
+
+from PIL import Image  # noqa: E402
+
+
+class Mel(ConfigMixin, SchedulerMixin):
+    """
+    Parameters:
+        x_res (`int`):
+            x resolution of spectrogram (time).
+        y_res (`int`):
+            y resolution of spectrogram (frequency bins).
+        sample_rate (`int`):
+            Sample rate of audio.
+        n_fft (`int`):
+            Number of Fast Fourier Transforms.
+        hop_length (`int`):
+            Hop length (a higher number is recommended if `y_res` < 256).
+        top_db (`int`):
+            Loudest decibel value.
+        n_iter (`int`):
+            Number of iterations for Griffin-Lim Mel inversion.
+    """
+
+    config_name = "mel_config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        x_res: int = 256,
+        y_res: int = 256,
+        sample_rate: int = 22050,
+        n_fft: int = 2048,
+        hop_length: int = 512,
+        top_db: int = 80,
+        n_iter: int = 32,
+    ):
+        self.hop_length = hop_length
+        self.sr = sample_rate
+        self.n_fft = n_fft
+        self.top_db = top_db
+        self.n_iter = n_iter
+        self.set_resolution(x_res, y_res)
+        self.audio = None
+
+        if not _librosa_can_be_imported:
+            raise ValueError(_import_error)
+
+    def set_resolution(self, x_res: int, y_res: int):
+        """Set resolution.
+
+        Args:
+            x_res (`int`):
+                x resolution of spectrogram (time).
+            y_res (`int`):
+                y resolution of spectrogram (frequency bins).
+        """
+        self.x_res = x_res
+        self.y_res = y_res
+        self.n_mels = self.y_res
+        self.slice_size = self.x_res * self.hop_length - 1
+
+    def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
+        """Load audio.
+
+        Args:
+            audio_file (`str`):
+                An audio file that must be on disk due to [Librosa](https://librosa.org/) limitation.
+            raw_audio (`np.ndarray`):
+                The raw audio file as a NumPy array.
+        """
+        if audio_file is not None:
+            self.audio, _ = librosa.load(audio_file, mono=True, sr=self.sr)
+        else:
+            self.audio = raw_audio
+
+        # Pad with silence if necessary.
+        if len(self.audio) < self.x_res * self.hop_length:
+            self.audio = np.concatenate([self.audio, np.zeros((self.x_res * self.hop_length - len(self.audio),))])
+
+    def get_number_of_slices(self) -> int:
+        """Get number of slices in audio.
+
+        Returns:
+            `int`:
+                Number of spectograms audio can be sliced into.
+        """
+        return len(self.audio) // self.slice_size
+
+    def get_audio_slice(self, slice: int = 0) -> np.ndarray:
+        """Get slice of audio.
+
+        Args:
+            slice (`int`):
+                Slice number of audio (out of `get_number_of_slices()`).
+
+        Returns:
+            `np.ndarray`:
+                The audio slice as a NumPy array.
+        """
+        return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)]
+
+    def get_sample_rate(self) -> int:
+        """Get sample rate.
+
+        Returns:
+            `int`:
+                Sample rate of audio.
+        """
+        return self.sr
+
+    def audio_slice_to_image(self, slice: int) -> Image.Image:
+        """Convert slice of audio to spectrogram.
+
+        Args:
+            slice (`int`):
+                Slice number of audio to convert (out of `get_number_of_slices()`).
+
+        Returns:
+            `PIL Image`:
+                A grayscale image of `x_res x y_res`.
+        """
+        S = librosa.feature.melspectrogram(
+            y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels
+        )
+        log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
+        bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8)
+        image = Image.fromarray(bytedata)
+        return image
+
+    def image_to_audio(self, image: Image.Image) -> np.ndarray:
+        """Converts spectrogram to audio.
+
+        Args:
+            image (`PIL Image`):
+                An grayscale image of `x_res x y_res`.
+
+        Returns:
+            audio (`np.ndarray`):
+                The audio as a NumPy array.
+        """
+        bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width))
+        log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
+        S = librosa.db_to_power(log_S)
+        audio = librosa.feature.inverse.mel_to_audio(
+            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter
+        )
+        return audio
diff --git a/diffusers/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py b/diffusers/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..47044e050acf7221e3da74d78defa042d5d583f2
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py
@@ -0,0 +1,329 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from math import acos, sin
+from typing import List, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image
+
+from ....models import AutoencoderKL, UNet2DConditionModel
+from ....schedulers import DDIMScheduler, DDPMScheduler
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import AudioPipelineOutput, BaseOutput, DiffusionPipeline, ImagePipelineOutput
+from .mel import Mel
+
+
+class AudioDiffusionPipeline(DiffusionPipeline):
+    """
+    Pipeline for audio diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        vqae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        mel ([`Mel`]):
+            Transform audio into a spectrogram.
+        scheduler ([`DDIMScheduler`] or [`DDPMScheduler`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`] or [`DDPMScheduler`].
+    """
+
+    _optional_components = ["vqvae"]
+
+    def __init__(
+        self,
+        vqvae: AutoencoderKL,
+        unet: UNet2DConditionModel,
+        mel: Mel,
+        scheduler: Union[DDIMScheduler, DDPMScheduler],
+    ):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae)
+
+    def get_default_steps(self) -> int:
+        """Returns default number of steps recommended for inference.
+
+        Returns:
+            `int`:
+                The number of steps.
+        """
+        return 50 if isinstance(self.scheduler, DDIMScheduler) else 1000
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        audio_file: str = None,
+        raw_audio: np.ndarray = None,
+        slice: int = 0,
+        start_step: int = 0,
+        steps: int = None,
+        generator: torch.Generator = None,
+        mask_start_secs: float = 0,
+        mask_end_secs: float = 0,
+        step_generator: torch.Generator = None,
+        eta: float = 0,
+        noise: torch.Tensor = None,
+        encoding: torch.Tensor = None,
+        return_dict=True,
+    ) -> Union[
+        Union[AudioPipelineOutput, ImagePipelineOutput],
+        Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]],
+    ]:
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            batch_size (`int`):
+                Number of samples to generate.
+            audio_file (`str`):
+                An audio file that must be on disk due to [Librosa](https://librosa.org/) limitation.
+            raw_audio (`np.ndarray`):
+                The raw audio file as a NumPy array.
+            slice (`int`):
+                Slice number of audio to convert.
+            start_step (int):
+                Step to start diffusion from.
+            steps (`int`):
+                Number of denoising steps (defaults to `50` for DDIM and `1000` for DDPM).
+            generator (`torch.Generator`):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            mask_start_secs (`float`):
+                Number of seconds of audio to mask (not generate) at start.
+            mask_end_secs (`float`):
+                Number of seconds of audio to mask (not generate) at end.
+            step_generator (`torch.Generator`):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) used to denoise.
+                None
+            eta (`float`):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            noise (`torch.Tensor`):
+                A noise tensor of shape `(batch_size, 1, height, width)` or `None`.
+            encoding (`torch.Tensor`):
+                A tensor for [`UNet2DConditionModel`] of shape `(batch_size, seq_length, cross_attention_dim)`.
+            return_dict (`bool`):
+                Whether or not to return a [`AudioPipelineOutput`], [`ImagePipelineOutput`] or a plain tuple.
+
+        Examples:
+
+        For audio diffusion:
+
+        ```py
+        import torch
+        from IPython.display import Audio
+        from diffusers import DiffusionPipeline
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(device)
+
+        output = pipe()
+        display(output.images[0])
+        display(Audio(output.audios[0], rate=mel.get_sample_rate()))
+        ```
+
+        For latent audio diffusion:
+
+        ```py
+        import torch
+        from IPython.display import Audio
+        from diffusers import DiffusionPipeline
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        pipe = DiffusionPipeline.from_pretrained("teticio/latent-audio-diffusion-256").to(device)
+
+        output = pipe()
+        display(output.images[0])
+        display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate()))
+        ```
+
+        For other tasks like variation, inpainting, outpainting, etc:
+
+        ```py
+        output = pipe(
+            raw_audio=output.audios[0, 0],
+            start_step=int(pipe.get_default_steps() / 2),
+            mask_start_secs=1,
+            mask_end_secs=1,
+        )
+        display(output.images[0])
+        display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate()))
+        ```
+
+        Returns:
+            `List[PIL Image]`:
+                A list of Mel spectrograms (`float`, `List[np.ndarray]`) with the sample rate and raw audio.
+        """
+
+        steps = steps or self.get_default_steps()
+        self.scheduler.set_timesteps(steps)
+        step_generator = step_generator or generator
+        # For backwards compatibility
+        if isinstance(self.unet.config.sample_size, int):
+            self.unet.config.sample_size = (self.unet.config.sample_size, self.unet.config.sample_size)
+        if noise is None:
+            noise = randn_tensor(
+                (
+                    batch_size,
+                    self.unet.config.in_channels,
+                    self.unet.config.sample_size[0],
+                    self.unet.config.sample_size[1],
+                ),
+                generator=generator,
+                device=self.device,
+            )
+        images = noise
+        mask = None
+
+        if audio_file is not None or raw_audio is not None:
+            self.mel.load_audio(audio_file, raw_audio)
+            input_image = self.mel.audio_slice_to_image(slice)
+            input_image = np.frombuffer(input_image.tobytes(), dtype="uint8").reshape(
+                (input_image.height, input_image.width)
+            )
+            input_image = (input_image / 255) * 2 - 1
+            input_images = torch.tensor(input_image[np.newaxis, :, :], dtype=torch.float).to(self.device)
+
+            if self.vqvae is not None:
+                input_images = self.vqvae.encode(torch.unsqueeze(input_images, 0)).latent_dist.sample(
+                    generator=generator
+                )[0]
+                input_images = self.vqvae.config.scaling_factor * input_images
+
+            if start_step > 0:
+                images[0, 0] = self.scheduler.add_noise(input_images, noise, self.scheduler.timesteps[start_step - 1])
+
+            pixels_per_second = (
+                self.unet.config.sample_size[1] * self.mel.get_sample_rate() / self.mel.x_res / self.mel.hop_length
+            )
+            mask_start = int(mask_start_secs * pixels_per_second)
+            mask_end = int(mask_end_secs * pixels_per_second)
+            mask = self.scheduler.add_noise(input_images, noise, torch.tensor(self.scheduler.timesteps[start_step:]))
+
+        for step, t in enumerate(self.progress_bar(self.scheduler.timesteps[start_step:])):
+            if isinstance(self.unet, UNet2DConditionModel):
+                model_output = self.unet(images, t, encoding)["sample"]
+            else:
+                model_output = self.unet(images, t)["sample"]
+
+            if isinstance(self.scheduler, DDIMScheduler):
+                images = self.scheduler.step(
+                    model_output=model_output,
+                    timestep=t,
+                    sample=images,
+                    eta=eta,
+                    generator=step_generator,
+                )["prev_sample"]
+            else:
+                images = self.scheduler.step(
+                    model_output=model_output,
+                    timestep=t,
+                    sample=images,
+                    generator=step_generator,
+                )["prev_sample"]
+
+            if mask is not None:
+                if mask_start > 0:
+                    images[:, :, :, :mask_start] = mask[:, step, :, :mask_start]
+                if mask_end > 0:
+                    images[:, :, :, -mask_end:] = mask[:, step, :, -mask_end:]
+
+        if self.vqvae is not None:
+            # 0.18215 was scaling factor used in training to ensure unit variance
+            images = 1 / self.vqvae.config.scaling_factor * images
+            images = self.vqvae.decode(images)["sample"]
+
+        images = (images / 2 + 0.5).clamp(0, 1)
+        images = images.cpu().permute(0, 2, 3, 1).numpy()
+        images = (images * 255).round().astype("uint8")
+        images = list(
+            (Image.fromarray(_[:, :, 0]) for _ in images)
+            if images.shape[3] == 1
+            else (Image.fromarray(_, mode="RGB").convert("L") for _ in images)
+        )
+
+        audios = [self.mel.image_to_audio(_) for _ in images]
+        if not return_dict:
+            return images, (self.mel.get_sample_rate(), audios)
+
+        return BaseOutput(**AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), **ImagePipelineOutput(images))
+
+    @torch.no_grad()
+    def encode(self, images: List[Image.Image], steps: int = 50) -> np.ndarray:
+        """
+        Reverse the denoising step process to recover a noisy image from the generated image.
+
+        Args:
+            images (`List[PIL Image]`):
+                List of images to encode.
+            steps (`int`):
+                Number of encoding steps to perform (defaults to `50`).
+
+        Returns:
+            `np.ndarray`:
+                A noise tensor of shape `(batch_size, 1, height, width)`.
+        """
+
+        # Only works with DDIM as this method is deterministic
+        assert isinstance(self.scheduler, DDIMScheduler)
+        self.scheduler.set_timesteps(steps)
+        sample = np.array(
+            [np.frombuffer(image.tobytes(), dtype="uint8").reshape((1, image.height, image.width)) for image in images]
+        )
+        sample = (sample / 255) * 2 - 1
+        sample = torch.Tensor(sample).to(self.device)
+
+        for t in self.progress_bar(torch.flip(self.scheduler.timesteps, (0,))):
+            prev_timestep = t - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
+            alpha_prod_t = self.scheduler.alphas_cumprod[t]
+            alpha_prod_t_prev = (
+                self.scheduler.alphas_cumprod[prev_timestep]
+                if prev_timestep >= 0
+                else self.scheduler.final_alpha_cumprod
+            )
+            beta_prod_t = 1 - alpha_prod_t
+            model_output = self.unet(sample, t)["sample"]
+            pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * model_output
+            sample = (sample - pred_sample_direction) * alpha_prod_t_prev ** (-0.5)
+            sample = sample * alpha_prod_t ** (0.5) + beta_prod_t ** (0.5) * model_output
+
+        return sample
+
+    @staticmethod
+    def slerp(x0: torch.Tensor, x1: torch.Tensor, alpha: float) -> torch.Tensor:
+        """Spherical Linear intERPolation.
+
+        Args:
+            x0 (`torch.Tensor`):
+                The first tensor to interpolate between.
+            x1 (`torch.Tensor`):
+                Second tensor to interpolate between.
+            alpha (`float`):
+                Interpolation between 0 and 1
+
+        Returns:
+            `torch.Tensor`:
+                The interpolated tensor.
+        """
+
+        theta = acos(torch.dot(torch.flatten(x0), torch.flatten(x1)) / torch.norm(x0) / torch.norm(x1))
+        return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(alpha * theta) * x1 / sin(theta)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..214f5bbca969f9ae0629578c72aaf339f86ded88
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py
@@ -0,0 +1,18 @@
+from typing import TYPE_CHECKING
+
+from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
+
+
+_import_structure = {"pipeline_latent_diffusion_uncond": ["LDMPipeline"]}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_latent_diffusion_uncond import LDMPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/diffusers/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fe5d59f771da48ea3c7fb549dbb9762cac4620e
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
@@ -0,0 +1,130 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from ....models import UNet2DModel, VQModel
+from ....schedulers import DDIMScheduler
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class LDMPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for unconditional image generation using latent diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        vqvae ([`VQModel`]):
+            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            [`DDIMScheduler`] is used in combination with `unet` to denoise the encoded image latents.
+    """
+
+    def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler):
+        super().__init__()
+        self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        eta: float = 0.0,
+        num_inference_steps: int = 50,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[Tuple, ImagePipelineOutput]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                Number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from diffusers import LDMPipeline
+
+        >>> # load model and scheduler
+        >>> pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> image = pipe().images[0]
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+
+        latents = randn_tensor(
+            (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
+            generator=generator,
+        )
+        latents = latents.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+
+        extra_kwargs = {}
+        if accepts_eta:
+            extra_kwargs["eta"] = eta
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            latent_model_input = self.scheduler.scale_model_input(latents, t)
+            # predict the noise residual
+            noise_prediction = self.unet(latent_model_input, t).sample
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_prediction, t, latents, **extra_kwargs).prev_sample
+
+        # adjust latents with inverse of vae scale
+        latents = latents / self.vqvae.config.scaling_factor
+        # decode the image latents with the VAE
+        image = self.vqvae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/pndm/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/pndm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e3bdba74079d77576655e22b43014a0438a9c2e
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/pndm/__init__.py
@@ -0,0 +1,18 @@
+from typing import TYPE_CHECKING
+
+from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
+
+
+_import_structure = {"pipeline_pndm": ["PNDMPipeline"]}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_pndm import PNDMPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py b/diffusers/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef78af1940cedd07718a9e05d99da50e38a220d3
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py
@@ -0,0 +1,121 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from ....models import UNet2DModel
+from ....schedulers import PNDMScheduler
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class PNDMPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for unconditional image generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
+        scheduler ([`PNDMScheduler`]):
+            A `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image.
+    """
+
+    unet: UNet2DModel
+    scheduler: PNDMScheduler
+
+    def __init__(self, unet: UNet2DModel, scheduler: PNDMScheduler):
+        super().__init__()
+
+        scheduler = PNDMScheduler.from_config(scheduler.config)
+
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        num_inference_steps: int = 50,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            batch_size (`int`, `optional`, defaults to 1):
+                The number of images to generate.
+            num_inference_steps (`int`, `optional`, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator`, `optional`):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            output_type (`str`, `optional`, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from diffusers import PNDMPipeline
+
+        >>> # load model and scheduler
+        >>> pndm = PNDMPipeline.from_pretrained("google/ddpm-cifar10-32")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> image = pndm().images[0]
+
+        >>> # save image
+        >>> image.save("pndm_generated_image.png")
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+        # For more information on the sampling method you can take a look at Algorithm 2 of
+        # the official paper: https://arxiv.org/pdf/2202.09778.pdf
+
+        # Sample gaussian noise to begin loop
+        image = randn_tensor(
+            (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
+            generator=generator,
+            device=self.device,
+        )
+
+        self.scheduler.set_timesteps(num_inference_steps)
+        for t in self.progress_bar(self.scheduler.timesteps):
+            model_output = self.unet(image, t).sample
+
+            image = self.scheduler.step(model_output, t, image).prev_sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/repaint/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/repaint/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c6b04af52d40e8a2bfa2aa5812b9fb8b1da06f5
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/repaint/__init__.py
@@ -0,0 +1,19 @@
+from typing import TYPE_CHECKING
+
+from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
+
+
+_import_structure = {"pipeline_repaint": ["RePaintPipeline"]}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_repaint import RePaintPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py b/diffusers/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..101d315dfe59d3a298625cc79c9ca1754d9d181a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py
@@ -0,0 +1,230 @@
+# Copyright 2024 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+
+from ....models import UNet2DModel
+from ....schedulers import RePaintScheduler
+from ....utils import PIL_INTERPOLATION, deprecate, logging
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
+def _preprocess_image(image: Union[List, PIL.Image.Image, torch.Tensor]):
+    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
+    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+def _preprocess_mask(mask: Union[List, PIL.Image.Image, torch.Tensor]):
+    if isinstance(mask, torch.Tensor):
+        return mask
+    elif isinstance(mask, PIL.Image.Image):
+        mask = [mask]
+
+    if isinstance(mask[0], PIL.Image.Image):
+        w, h = mask[0].size
+        w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+        mask = [np.array(m.convert("L").resize((w, h), resample=PIL_INTERPOLATION["nearest"]))[None, :] for m in mask]
+        mask = np.concatenate(mask, axis=0)
+        mask = mask.astype(np.float32) / 255.0
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+        mask = torch.from_numpy(mask)
+    elif isinstance(mask[0], torch.Tensor):
+        mask = torch.cat(mask, dim=0)
+    return mask
+
+
+class RePaintPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image inpainting using RePaint.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
+        scheduler ([`RePaintScheduler`]):
+            A `RePaintScheduler` to be used in combination with `unet` to denoise the encoded image.
+    """
+
+    unet: UNet2DModel
+    scheduler: RePaintScheduler
+    model_cpu_offload_seq = "unet"
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Union[torch.Tensor, PIL.Image.Image],
+        mask_image: Union[torch.Tensor, PIL.Image.Image],
+        num_inference_steps: int = 250,
+        eta: float = 0.0,
+        jump_length: int = 10,
+        jump_n_sample: int = 10,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                The original image to inpaint on.
+            mask_image (`torch.Tensor` or `PIL.Image.Image`):
+                The mask_image where 0.0 define which part of the original image to inpaint.
+            num_inference_steps (`int`, *optional*, defaults to 1000):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            eta (`float`):
+                The weight of the added noise in a diffusion step. Its value is between 0.0 and 1.0; 0.0 corresponds to
+                DDIM and 1.0 is the DDPM scheduler.
+            jump_length (`int`, *optional*, defaults to 10):
+                The number of steps taken forward in time before going backward in time for a single jump ("j" in
+                RePaint paper). Take a look at Figure 9 and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf).
+            jump_n_sample (`int`, *optional*, defaults to 10):
+                The number of times to make a forward time jump for a given chosen time sample. Take a look at Figure 9
+                and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf).
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            output_type (`str`, `optional`, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from io import BytesIO
+        >>> import torch
+        >>> import PIL
+        >>> import requests
+        >>> from diffusers import RePaintPipeline, RePaintScheduler
+
+
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+        >>> img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png"
+        >>> mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png"
+
+        >>> # Load the original image and the mask as PIL images
+        >>> original_image = download_image(img_url).resize((256, 256))
+        >>> mask_image = download_image(mask_url).resize((256, 256))
+
+        >>> # Load the RePaint scheduler and pipeline based on a pretrained DDPM model
+        >>> scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256")
+        >>> pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> output = pipe(
+        ...     image=original_image,
+        ...     mask_image=mask_image,
+        ...     num_inference_steps=250,
+        ...     eta=0.0,
+        ...     jump_length=10,
+        ...     jump_n_sample=10,
+        ...     generator=generator,
+        ... )
+        >>> inpainted_image = output.images[0]
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+
+        original_image = image
+
+        original_image = _preprocess_image(original_image)
+        original_image = original_image.to(device=self._execution_device, dtype=self.unet.dtype)
+        mask_image = _preprocess_mask(mask_image)
+        mask_image = mask_image.to(device=self._execution_device, dtype=self.unet.dtype)
+
+        batch_size = original_image.shape[0]
+
+        # sample gaussian noise to begin the loop
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        image_shape = original_image.shape
+        image = randn_tensor(image_shape, generator=generator, device=self._execution_device, dtype=self.unet.dtype)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample, self._execution_device)
+        self.scheduler.eta = eta
+
+        t_last = self.scheduler.timesteps[0] + 1
+        generator = generator[0] if isinstance(generator, list) else generator
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+            if t < t_last:
+                # predict the noise residual
+                model_output = self.unet(image, t).sample
+                # compute previous image: x_t -> x_t-1
+                image = self.scheduler.step(model_output, t, image, original_image, mask_image, generator).prev_sample
+
+            else:
+                # compute the reverse: x_t-1 -> x_t
+                image = self.scheduler.undo_step(image, t_last, generator)
+            t_last = t
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..87c167c3dbd26e0408a41ef197a42dc5eb7038d7
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py
@@ -0,0 +1,19 @@
+from typing import TYPE_CHECKING
+
+from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
+
+
+_import_structure = {"pipeline_score_sde_ve": ["ScoreSdeVePipeline"]}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_score_sde_ve import ScoreSdeVePipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py b/diffusers/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0bb114a81b75a979b49c0884e921a33beedff4f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py
@@ -0,0 +1,109 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from ....models import UNet2DModel
+from ....schedulers import ScoreSdeVeScheduler
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class ScoreSdeVePipeline(DiffusionPipeline):
+    r"""
+    Pipeline for unconditional image generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image.
+        scheduler ([`ScoreSdeVeScheduler`]):
+            A `ScoreSdeVeScheduler` to be used in combination with `unet` to denoise the encoded image.
+    """
+
+    unet: UNet2DModel
+    scheduler: ScoreSdeVeScheduler
+
+    def __init__(self, unet: UNet2DModel, scheduler: ScoreSdeVeScheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        num_inference_steps: int = 2000,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, `optional`):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            output_type (`str`, `optional`, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+
+        img_size = self.unet.config.sample_size
+        shape = (batch_size, 3, img_size, img_size)
+
+        model = self.unet
+
+        sample = randn_tensor(shape, generator=generator) * self.scheduler.init_noise_sigma
+        sample = sample.to(self.device)
+
+        self.scheduler.set_timesteps(num_inference_steps)
+        self.scheduler.set_sigmas(num_inference_steps)
+
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+            sigma_t = self.scheduler.sigmas[i] * torch.ones(shape[0], device=self.device)
+
+            # correction step
+            for _ in range(self.scheduler.config.correct_steps):
+                model_output = self.unet(sample, sigma_t).sample
+                sample = self.scheduler.step_correct(model_output, sample, generator=generator).prev_sample
+
+            # prediction step
+            model_output = model(sample, sigma_t).sample
+            output = self.scheduler.step_pred(model_output, t, sample, generator=generator)
+
+            sample, sample_mean = output.prev_sample, output.prev_sample_mean
+
+        sample = sample_mean.clamp(0, 1)
+        sample = sample.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            sample = self.numpy_to_pil(sample)
+
+        if not return_dict:
+            return (sample,)
+
+        return ImagePipelineOutput(images=sample)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..150954baa0eb8f8a7216b4891effc14a71e21b1b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py
@@ -0,0 +1,75 @@
+# flake8: noqa
+from typing import TYPE_CHECKING
+from ....utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    _LazyModule,
+    is_note_seq_available,
+    OptionalDependencyNotAvailable,
+    is_torch_available,
+    is_transformers_available,
+    get_objects_from_module,
+)
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ....utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["continous_encoder"] = ["SpectrogramContEncoder"]
+    _import_structure["notes_encoder"] = ["SpectrogramNotesEncoder"]
+    _import_structure["pipeline_spectrogram_diffusion"] = [
+        "SpectrogramContEncoder",
+        "SpectrogramDiffusionPipeline",
+        "T5FilmDecoder",
+    ]
+try:
+    if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ....utils import dummy_transformers_and_torch_and_note_seq_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects))
+else:
+    _import_structure["midi_utils"] = ["MidiProcessor"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ....utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_spectrogram_diffusion import SpectrogramDiffusionPipeline
+        from .pipeline_spectrogram_diffusion import SpectrogramContEncoder
+        from .pipeline_spectrogram_diffusion import SpectrogramNotesEncoder
+        from .pipeline_spectrogram_diffusion import T5FilmDecoder
+
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ....utils.dummy_transformers_and_torch_and_note_seq_objects import *
+
+    else:
+        from .midi_utils import MidiProcessor
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py b/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..8664c2fb67113972976a24aa980d3aa7778b807e
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py
@@ -0,0 +1,92 @@
+# Copyright 2022 The Music Spectrogram Diffusion Authors.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+from transformers.modeling_utils import ModuleUtilsMixin
+from transformers.models.t5.modeling_t5 import (
+    T5Block,
+    T5Config,
+    T5LayerNorm,
+)
+
+from ....configuration_utils import ConfigMixin, register_to_config
+from ....models import ModelMixin
+
+
+class SpectrogramContEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin):
+    @register_to_config
+    def __init__(
+        self,
+        input_dims: int,
+        targets_context_length: int,
+        d_model: int,
+        dropout_rate: float,
+        num_layers: int,
+        num_heads: int,
+        d_kv: int,
+        d_ff: int,
+        feed_forward_proj: str,
+        is_decoder: bool = False,
+    ):
+        super().__init__()
+
+        self.input_proj = nn.Linear(input_dims, d_model, bias=False)
+
+        self.position_encoding = nn.Embedding(targets_context_length, d_model)
+        self.position_encoding.weight.requires_grad = False
+
+        self.dropout_pre = nn.Dropout(p=dropout_rate)
+
+        t5config = T5Config(
+            d_model=d_model,
+            num_heads=num_heads,
+            d_kv=d_kv,
+            d_ff=d_ff,
+            feed_forward_proj=feed_forward_proj,
+            dropout_rate=dropout_rate,
+            is_decoder=is_decoder,
+            is_encoder_decoder=False,
+        )
+        self.encoders = nn.ModuleList()
+        for lyr_num in range(num_layers):
+            lyr = T5Block(t5config)
+            self.encoders.append(lyr)
+
+        self.layer_norm = T5LayerNorm(d_model)
+        self.dropout_post = nn.Dropout(p=dropout_rate)
+
+    def forward(self, encoder_inputs, encoder_inputs_mask):
+        x = self.input_proj(encoder_inputs)
+
+        # terminal relative positional encodings
+        max_positions = encoder_inputs.shape[1]
+        input_positions = torch.arange(max_positions, device=encoder_inputs.device)
+
+        seq_lens = encoder_inputs_mask.sum(-1)
+        input_positions = torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0)
+        x += self.position_encoding(input_positions)
+
+        x = self.dropout_pre(x)
+
+        # inverted the attention mask
+        input_shape = encoder_inputs.size()
+        extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape)
+
+        for lyr in self.encoders:
+            x = lyr(x, extended_attention_mask)[0]
+        x = self.layer_norm(x)
+
+        return self.dropout_post(x), encoder_inputs_mask
diff --git a/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py b/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e777e844935e76427cd92ead59b7d23eb8b1cbe5
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py
@@ -0,0 +1,667 @@
+# Copyright 2022 The Music Spectrogram Diffusion Authors.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dataclasses
+import math
+import os
+from typing import Any, Callable, List, Mapping, MutableMapping, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from ....utils import is_note_seq_available
+from .pipeline_spectrogram_diffusion import TARGET_FEATURE_LENGTH
+
+
+if is_note_seq_available():
+    import note_seq
+else:
+    raise ImportError("Please install note-seq via `pip install note-seq`")
+
+
+INPUT_FEATURE_LENGTH = 2048
+
+SAMPLE_RATE = 16000
+HOP_SIZE = 320
+FRAME_RATE = int(SAMPLE_RATE // HOP_SIZE)
+
+DEFAULT_STEPS_PER_SECOND = 100
+DEFAULT_MAX_SHIFT_SECONDS = 10
+DEFAULT_NUM_VELOCITY_BINS = 1
+
+SLAKH_CLASS_PROGRAMS = {
+    "Acoustic Piano": 0,
+    "Electric Piano": 4,
+    "Chromatic Percussion": 8,
+    "Organ": 16,
+    "Acoustic Guitar": 24,
+    "Clean Electric Guitar": 26,
+    "Distorted Electric Guitar": 29,
+    "Acoustic Bass": 32,
+    "Electric Bass": 33,
+    "Violin": 40,
+    "Viola": 41,
+    "Cello": 42,
+    "Contrabass": 43,
+    "Orchestral Harp": 46,
+    "Timpani": 47,
+    "String Ensemble": 48,
+    "Synth Strings": 50,
+    "Choir and Voice": 52,
+    "Orchestral Hit": 55,
+    "Trumpet": 56,
+    "Trombone": 57,
+    "Tuba": 58,
+    "French Horn": 60,
+    "Brass Section": 61,
+    "Soprano/Alto Sax": 64,
+    "Tenor Sax": 66,
+    "Baritone Sax": 67,
+    "Oboe": 68,
+    "English Horn": 69,
+    "Bassoon": 70,
+    "Clarinet": 71,
+    "Pipe": 73,
+    "Synth Lead": 80,
+    "Synth Pad": 88,
+}
+
+
+@dataclasses.dataclass
+class NoteRepresentationConfig:
+    """Configuration note representations."""
+
+    onsets_only: bool
+    include_ties: bool
+
+
+@dataclasses.dataclass
+class NoteEventData:
+    pitch: int
+    velocity: Optional[int] = None
+    program: Optional[int] = None
+    is_drum: Optional[bool] = None
+    instrument: Optional[int] = None
+
+
+@dataclasses.dataclass
+class NoteEncodingState:
+    """Encoding state for note transcription, keeping track of active pitches."""
+
+    # velocity bin for active pitches and programs
+    active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field(default_factory=dict)
+
+
+@dataclasses.dataclass
+class EventRange:
+    type: str
+    min_value: int
+    max_value: int
+
+
+@dataclasses.dataclass
+class Event:
+    type: str
+    value: int
+
+
+class Tokenizer:
+    def __init__(self, regular_ids: int):
+        # The special tokens: 0=PAD, 1=EOS, and 2=UNK
+        self._num_special_tokens = 3
+        self._num_regular_tokens = regular_ids
+
+    def encode(self, token_ids):
+        encoded = []
+        for token_id in token_ids:
+            if not 0 <= token_id < self._num_regular_tokens:
+                raise ValueError(
+                    f"token_id {token_id} does not fall within valid range of [0, {self._num_regular_tokens})"
+                )
+            encoded.append(token_id + self._num_special_tokens)
+
+        # Add EOS token
+        encoded.append(1)
+
+        # Pad to till INPUT_FEATURE_LENGTH
+        encoded = encoded + [0] * (INPUT_FEATURE_LENGTH - len(encoded))
+
+        return encoded
+
+
+class Codec:
+    """Encode and decode events.
+
+    Useful for declaring what certain ranges of a vocabulary should be used for. This is intended to be used from
+    Python before encoding or after decoding with GenericTokenVocabulary. This class is more lightweight and does not
+    include things like EOS or UNK token handling.
+
+    To ensure that 'shift' events are always the first block of the vocab and start at 0, that event type is required
+    and specified separately.
+    """
+
+    def __init__(self, max_shift_steps: int, steps_per_second: float, event_ranges: List[EventRange]):
+        """Define Codec.
+
+        Args:
+          max_shift_steps: Maximum number of shift steps that can be encoded.
+          steps_per_second: Shift steps will be interpreted as having a duration of
+              1 / steps_per_second.
+          event_ranges: Other supported event types and their ranges.
+        """
+        self.steps_per_second = steps_per_second
+        self._shift_range = EventRange(type="shift", min_value=0, max_value=max_shift_steps)
+        self._event_ranges = [self._shift_range] + event_ranges
+        # Ensure all event types have unique names.
+        assert len(self._event_ranges) == len({er.type for er in self._event_ranges})
+
+    @property
+    def num_classes(self) -> int:
+        return sum(er.max_value - er.min_value + 1 for er in self._event_ranges)
+
+    # The next couple methods are simplified special case methods just for shift
+    # events that are intended to be used from within autograph functions.
+
+    def is_shift_event_index(self, index: int) -> bool:
+        return (self._shift_range.min_value <= index) and (index <= self._shift_range.max_value)
+
+    @property
+    def max_shift_steps(self) -> int:
+        return self._shift_range.max_value
+
+    def encode_event(self, event: Event) -> int:
+        """Encode an event to an index."""
+        offset = 0
+        for er in self._event_ranges:
+            if event.type == er.type:
+                if not er.min_value <= event.value <= er.max_value:
+                    raise ValueError(
+                        f"Event value {event.value} is not within valid range "
+                        f"[{er.min_value}, {er.max_value}] for type {event.type}"
+                    )
+                return offset + event.value - er.min_value
+            offset += er.max_value - er.min_value + 1
+
+        raise ValueError(f"Unknown event type: {event.type}")
+
+    def event_type_range(self, event_type: str) -> Tuple[int, int]:
+        """Return [min_id, max_id] for an event type."""
+        offset = 0
+        for er in self._event_ranges:
+            if event_type == er.type:
+                return offset, offset + (er.max_value - er.min_value)
+            offset += er.max_value - er.min_value + 1
+
+        raise ValueError(f"Unknown event type: {event_type}")
+
+    def decode_event_index(self, index: int) -> Event:
+        """Decode an event index to an Event."""
+        offset = 0
+        for er in self._event_ranges:
+            if offset <= index <= offset + er.max_value - er.min_value:
+                return Event(type=er.type, value=er.min_value + index - offset)
+            offset += er.max_value - er.min_value + 1
+
+        raise ValueError(f"Unknown event index: {index}")
+
+
+@dataclasses.dataclass
+class ProgramGranularity:
+    # both tokens_map_fn and program_map_fn should be idempotent
+    tokens_map_fn: Callable[[Sequence[int], Codec], Sequence[int]]
+    program_map_fn: Callable[[int], int]
+
+
+def drop_programs(tokens, codec: Codec):
+    """Drops program change events from a token sequence."""
+    min_program_id, max_program_id = codec.event_type_range("program")
+    return tokens[(tokens < min_program_id) | (tokens > max_program_id)]
+
+
+def programs_to_midi_classes(tokens, codec):
+    """Modifies program events to be the first program in the MIDI class."""
+    min_program_id, max_program_id = codec.event_type_range("program")
+    is_program = (tokens >= min_program_id) & (tokens <= max_program_id)
+    return np.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens)
+
+
+PROGRAM_GRANULARITIES = {
+    # "flat" granularity; drop program change tokens and set NoteSequence
+    # programs to zero
+    "flat": ProgramGranularity(tokens_map_fn=drop_programs, program_map_fn=lambda program: 0),
+    # map each program to the first program in its MIDI class
+    "midi_class": ProgramGranularity(
+        tokens_map_fn=programs_to_midi_classes, program_map_fn=lambda program: 8 * (program // 8)
+    ),
+    # leave programs as is
+    "full": ProgramGranularity(tokens_map_fn=lambda tokens, codec: tokens, program_map_fn=lambda program: program),
+}
+
+
+def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1):
+    """
+    equivalent of tf.signal.frame
+    """
+    signal_length = signal.shape[axis]
+    if pad_end:
+        frames_overlap = frame_length - frame_step
+        rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap)
+        pad_size = int(frame_length - rest_samples)
+
+        if pad_size != 0:
+            pad_axis = [0] * signal.ndim
+            pad_axis[axis] = pad_size
+            signal = F.pad(signal, pad_axis, "constant", pad_value)
+    frames = signal.unfold(axis, frame_length, frame_step)
+    return frames
+
+
+def program_to_slakh_program(program):
+    # this is done very hackily, probably should use a custom mapping
+    for slakh_program in sorted(SLAKH_CLASS_PROGRAMS.values(), reverse=True):
+        if program >= slakh_program:
+            return slakh_program
+
+
+def audio_to_frames(
+    samples,
+    hop_size: int,
+    frame_rate: int,
+) -> Tuple[Sequence[Sequence[int]], torch.Tensor]:
+    """Convert audio samples to non-overlapping frames and frame times."""
+    frame_size = hop_size
+    samples = np.pad(samples, [0, frame_size - len(samples) % frame_size], mode="constant")
+
+    # Split audio into frames.
+    frames = frame(
+        torch.Tensor(samples).unsqueeze(0),
+        frame_length=frame_size,
+        frame_step=frame_size,
+        pad_end=False,  # TODO check why its off by 1 here when True
+    )
+
+    num_frames = len(samples) // frame_size
+
+    times = np.arange(num_frames) / frame_rate
+    return frames, times
+
+
+def note_sequence_to_onsets_and_offsets_and_programs(
+    ns: note_seq.NoteSequence,
+) -> Tuple[Sequence[float], Sequence[NoteEventData]]:
+    """Extract onset & offset times and pitches & programs from a NoteSequence.
+
+    The onset & offset times will not necessarily be in sorted order.
+
+    Args:
+      ns: NoteSequence from which to extract onsets and offsets.
+
+    Returns:
+      times: A list of note onset and offset times. values: A list of NoteEventData objects where velocity is zero for
+      note
+          offsets.
+    """
+    # Sort by program and pitch and put offsets before onsets as a tiebreaker for
+    # subsequent stable sort.
+    notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch))
+    times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes]
+    values = [
+        NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False)
+        for note in notes
+        if not note.is_drum
+    ] + [
+        NoteEventData(pitch=note.pitch, velocity=note.velocity, program=note.program, is_drum=note.is_drum)
+        for note in notes
+    ]
+    return times, values
+
+
+def num_velocity_bins_from_codec(codec: Codec):
+    """Get number of velocity bins from event codec."""
+    lo, hi = codec.event_type_range("velocity")
+    return hi - lo
+
+
+# segment an array into segments of length n
+def segment(a, n):
+    return [a[i : i + n] for i in range(0, len(a), n)]
+
+
+def velocity_to_bin(velocity, num_velocity_bins):
+    if velocity == 0:
+        return 0
+    else:
+        return math.ceil(num_velocity_bins * velocity / note_seq.MAX_MIDI_VELOCITY)
+
+
+def note_event_data_to_events(
+    state: Optional[NoteEncodingState],
+    value: NoteEventData,
+    codec: Codec,
+) -> Sequence[Event]:
+    """Convert note event data to a sequence of events."""
+    if value.velocity is None:
+        # onsets only, no program or velocity
+        return [Event("pitch", value.pitch)]
+    else:
+        num_velocity_bins = num_velocity_bins_from_codec(codec)
+        velocity_bin = velocity_to_bin(value.velocity, num_velocity_bins)
+        if value.program is None:
+            # onsets + offsets + velocities only, no programs
+            if state is not None:
+                state.active_pitches[(value.pitch, 0)] = velocity_bin
+            return [Event("velocity", velocity_bin), Event("pitch", value.pitch)]
+        else:
+            if value.is_drum:
+                # drum events use a separate vocabulary
+                return [Event("velocity", velocity_bin), Event("drum", value.pitch)]
+            else:
+                # program + velocity + pitch
+                if state is not None:
+                    state.active_pitches[(value.pitch, value.program)] = velocity_bin
+                return [
+                    Event("program", value.program),
+                    Event("velocity", velocity_bin),
+                    Event("pitch", value.pitch),
+                ]
+
+
+def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]:
+    """Output program and pitch events for active notes plus a final tie event."""
+    events = []
+    for pitch, program in sorted(state.active_pitches.keys(), key=lambda k: k[::-1]):
+        if state.active_pitches[(pitch, program)]:
+            events += [Event("program", program), Event("pitch", pitch)]
+    events.append(Event("tie", 0))
+    return events
+
+
+def encode_and_index_events(
+    state, event_times, event_values, codec, frame_times, encode_event_fn, encoding_state_to_events_fn=None
+):
+    """Encode a sequence of timed events and index to audio frame times.
+
+    Encodes time shifts as repeated single step shifts for later run length encoding.
+
+    Optionally, also encodes a sequence of "state events", keeping track of the current encoding state at each audio
+    frame. This can be used e.g. to prepend events representing the current state to a targets segment.
+
+    Args:
+      state: Initial event encoding state.
+      event_times: Sequence of event times.
+      event_values: Sequence of event values.
+      encode_event_fn: Function that transforms event value into a sequence of one
+          or more Event objects.
+      codec: An Codec object that maps Event objects to indices.
+      frame_times: Time for every audio frame.
+      encoding_state_to_events_fn: Function that transforms encoding state into a
+          sequence of one or more Event objects.
+
+    Returns:
+      events: Encoded events and shifts. event_start_indices: Corresponding start event index for every audio frame.
+          Note: one event can correspond to multiple audio indices due to sampling rate differences. This makes
+          splitting sequences tricky because the same event can appear at the end of one sequence and the beginning of
+          another.
+      event_end_indices: Corresponding end event index for every audio frame. Used
+          to ensure when slicing that one chunk ends where the next begins. Should always be true that
+          event_end_indices[i] = event_start_indices[i + 1].
+      state_events: Encoded "state" events representing the encoding state before
+          each event.
+      state_event_indices: Corresponding state event index for every audio frame.
+    """
+    indices = np.argsort(event_times, kind="stable")
+    event_steps = [round(event_times[i] * codec.steps_per_second) for i in indices]
+    event_values = [event_values[i] for i in indices]
+
+    events = []
+    state_events = []
+    event_start_indices = []
+    state_event_indices = []
+
+    cur_step = 0
+    cur_event_idx = 0
+    cur_state_event_idx = 0
+
+    def fill_event_start_indices_to_cur_step():
+        while (
+            len(event_start_indices) < len(frame_times)
+            and frame_times[len(event_start_indices)] < cur_step / codec.steps_per_second
+        ):
+            event_start_indices.append(cur_event_idx)
+            state_event_indices.append(cur_state_event_idx)
+
+    for event_step, event_value in zip(event_steps, event_values):
+        while event_step > cur_step:
+            events.append(codec.encode_event(Event(type="shift", value=1)))
+            cur_step += 1
+            fill_event_start_indices_to_cur_step()
+            cur_event_idx = len(events)
+            cur_state_event_idx = len(state_events)
+        if encoding_state_to_events_fn:
+            # Dump state to state events *before* processing the next event, because
+            # we want to capture the state prior to the occurrence of the event.
+            for e in encoding_state_to_events_fn(state):
+                state_events.append(codec.encode_event(e))
+
+        for e in encode_event_fn(state, event_value, codec):
+            events.append(codec.encode_event(e))
+
+    # After the last event, continue filling out the event_start_indices array.
+    # The inequality is not strict because if our current step lines up exactly
+    # with (the start of) an audio frame, we need to add an additional shift event
+    # to "cover" that frame.
+    while cur_step / codec.steps_per_second <= frame_times[-1]:
+        events.append(codec.encode_event(Event(type="shift", value=1)))
+        cur_step += 1
+        fill_event_start_indices_to_cur_step()
+        cur_event_idx = len(events)
+
+    # Now fill in event_end_indices. We need this extra array to make sure that
+    # when we slice events, each slice ends exactly where the subsequent slice
+    # begins.
+    event_end_indices = event_start_indices[1:] + [len(events)]
+
+    events = np.array(events).astype(np.int32)
+    state_events = np.array(state_events).astype(np.int32)
+    event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH)
+    event_end_indices = segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH)
+    state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH)
+
+    outputs = []
+    for start_indices, end_indices, event_indices in zip(event_start_indices, event_end_indices, state_event_indices):
+        outputs.append(
+            {
+                "inputs": events,
+                "event_start_indices": start_indices,
+                "event_end_indices": end_indices,
+                "state_events": state_events,
+                "state_event_indices": event_indices,
+            }
+        )
+
+    return outputs
+
+
+def extract_sequence_with_indices(features, state_events_end_token=None, feature_key="inputs"):
+    """Extract target sequence corresponding to audio token segment."""
+    features = features.copy()
+    start_idx = features["event_start_indices"][0]
+    end_idx = features["event_end_indices"][-1]
+
+    features[feature_key] = features[feature_key][start_idx:end_idx]
+
+    if state_events_end_token is not None:
+        # Extract the state events corresponding to the audio start token, and
+        # prepend them to the targets array.
+        state_event_start_idx = features["state_event_indices"][0]
+        state_event_end_idx = state_event_start_idx + 1
+        while features["state_events"][state_event_end_idx - 1] != state_events_end_token:
+            state_event_end_idx += 1
+        features[feature_key] = np.concatenate(
+            [
+                features["state_events"][state_event_start_idx:state_event_end_idx],
+                features[feature_key],
+            ],
+            axis=0,
+        )
+
+    return features
+
+
+def map_midi_programs(
+    feature, codec: Codec, granularity_type: str = "full", feature_key: str = "inputs"
+) -> Mapping[str, Any]:
+    """Apply MIDI program map to token sequences."""
+    granularity = PROGRAM_GRANULARITIES[granularity_type]
+
+    feature[feature_key] = granularity.tokens_map_fn(feature[feature_key], codec)
+    return feature
+
+
+def run_length_encode_shifts_fn(
+    features,
+    codec: Codec,
+    feature_key: str = "inputs",
+    state_change_event_types: Sequence[str] = (),
+) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]:
+    """Return a function that run-length encodes shifts for a given codec.
+
+    Args:
+      codec: The Codec to use for shift events.
+      feature_key: The feature key for which to run-length encode shifts.
+      state_change_event_types: A list of event types that represent state
+          changes; tokens corresponding to these event types will be interpreted as state changes and redundant ones
+          will be removed.
+
+    Returns:
+      A preprocessing function that run-length encodes single-step shifts.
+    """
+    state_change_event_ranges = [codec.event_type_range(event_type) for event_type in state_change_event_types]
+
+    def run_length_encode_shifts(features: MutableMapping[str, Any]) -> Mapping[str, Any]:
+        """Combine leading/interior shifts, trim trailing shifts.
+
+        Args:
+          features: Dict of features to process.
+
+        Returns:
+          A dict of features.
+        """
+        events = features[feature_key]
+
+        shift_steps = 0
+        total_shift_steps = 0
+        output = np.array([], dtype=np.int32)
+
+        current_state = np.zeros(len(state_change_event_ranges), dtype=np.int32)
+
+        for event in events:
+            if codec.is_shift_event_index(event):
+                shift_steps += 1
+                total_shift_steps += 1
+
+            else:
+                # If this event is a state change and has the same value as the current
+                # state, we can skip it entirely.
+                is_redundant = False
+                for i, (min_index, max_index) in enumerate(state_change_event_ranges):
+                    if (min_index <= event) and (event <= max_index):
+                        if current_state[i] == event:
+                            is_redundant = True
+                        current_state[i] = event
+                if is_redundant:
+                    continue
+
+                # Once we've reached a non-shift event, RLE all previous shift events
+                # before outputting the non-shift event.
+                if shift_steps > 0:
+                    shift_steps = total_shift_steps
+                    while shift_steps > 0:
+                        output_steps = np.minimum(codec.max_shift_steps, shift_steps)
+                        output = np.concatenate([output, [output_steps]], axis=0)
+                        shift_steps -= output_steps
+                output = np.concatenate([output, [event]], axis=0)
+
+        features[feature_key] = output
+        return features
+
+    return run_length_encode_shifts(features)
+
+
+def note_representation_processor_chain(features, codec: Codec, note_representation_config: NoteRepresentationConfig):
+    tie_token = codec.encode_event(Event("tie", 0))
+    state_events_end_token = tie_token if note_representation_config.include_ties else None
+
+    features = extract_sequence_with_indices(
+        features, state_events_end_token=state_events_end_token, feature_key="inputs"
+    )
+
+    features = map_midi_programs(features, codec)
+
+    features = run_length_encode_shifts_fn(features, codec, state_change_event_types=["velocity", "program"])
+
+    return features
+
+
+class MidiProcessor:
+    def __init__(self):
+        self.codec = Codec(
+            max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS * DEFAULT_STEPS_PER_SECOND,
+            steps_per_second=DEFAULT_STEPS_PER_SECOND,
+            event_ranges=[
+                EventRange("pitch", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH),
+                EventRange("velocity", 0, DEFAULT_NUM_VELOCITY_BINS),
+                EventRange("tie", 0, 0),
+                EventRange("program", note_seq.MIN_MIDI_PROGRAM, note_seq.MAX_MIDI_PROGRAM),
+                EventRange("drum", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH),
+            ],
+        )
+        self.tokenizer = Tokenizer(self.codec.num_classes)
+        self.note_representation_config = NoteRepresentationConfig(onsets_only=False, include_ties=True)
+
+    def __call__(self, midi: Union[bytes, os.PathLike, str]):
+        if not isinstance(midi, bytes):
+            with open(midi, "rb") as f:
+                midi = f.read()
+
+        ns = note_seq.midi_to_note_sequence(midi)
+        ns_sus = note_seq.apply_sustain_control_changes(ns)
+
+        for note in ns_sus.notes:
+            if not note.is_drum:
+                note.program = program_to_slakh_program(note.program)
+
+        samples = np.zeros(int(ns_sus.total_time * SAMPLE_RATE))
+
+        _, frame_times = audio_to_frames(samples, HOP_SIZE, FRAME_RATE)
+        times, values = note_sequence_to_onsets_and_offsets_and_programs(ns_sus)
+
+        events = encode_and_index_events(
+            state=NoteEncodingState(),
+            event_times=times,
+            event_values=values,
+            frame_times=frame_times,
+            codec=self.codec,
+            encode_event_fn=note_event_data_to_events,
+            encoding_state_to_events_fn=note_encoding_state_to_events,
+        )
+
+        events = [
+            note_representation_processor_chain(event, self.codec, self.note_representation_config) for event in events
+        ]
+        input_tokens = [self.tokenizer.encode(event["inputs"]) for event in events]
+
+        return input_tokens
diff --git a/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py b/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..1259f0bf056aa40ad111b4f282210687430ad68f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py
@@ -0,0 +1,86 @@
+# Copyright 2022 The Music Spectrogram Diffusion Authors.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+from transformers.modeling_utils import ModuleUtilsMixin
+from transformers.models.t5.modeling_t5 import T5Block, T5Config, T5LayerNorm
+
+from ....configuration_utils import ConfigMixin, register_to_config
+from ....models import ModelMixin
+
+
+class SpectrogramNotesEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin):
+    @register_to_config
+    def __init__(
+        self,
+        max_length: int,
+        vocab_size: int,
+        d_model: int,
+        dropout_rate: float,
+        num_layers: int,
+        num_heads: int,
+        d_kv: int,
+        d_ff: int,
+        feed_forward_proj: str,
+        is_decoder: bool = False,
+    ):
+        super().__init__()
+
+        self.token_embedder = nn.Embedding(vocab_size, d_model)
+
+        self.position_encoding = nn.Embedding(max_length, d_model)
+        self.position_encoding.weight.requires_grad = False
+
+        self.dropout_pre = nn.Dropout(p=dropout_rate)
+
+        t5config = T5Config(
+            vocab_size=vocab_size,
+            d_model=d_model,
+            num_heads=num_heads,
+            d_kv=d_kv,
+            d_ff=d_ff,
+            dropout_rate=dropout_rate,
+            feed_forward_proj=feed_forward_proj,
+            is_decoder=is_decoder,
+            is_encoder_decoder=False,
+        )
+
+        self.encoders = nn.ModuleList()
+        for lyr_num in range(num_layers):
+            lyr = T5Block(t5config)
+            self.encoders.append(lyr)
+
+        self.layer_norm = T5LayerNorm(d_model)
+        self.dropout_post = nn.Dropout(p=dropout_rate)
+
+    def forward(self, encoder_input_tokens, encoder_inputs_mask):
+        x = self.token_embedder(encoder_input_tokens)
+
+        seq_length = encoder_input_tokens.shape[1]
+        inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device)
+        x += self.position_encoding(inputs_positions)
+
+        x = self.dropout_pre(x)
+
+        # inverted the attention mask
+        input_shape = encoder_input_tokens.size()
+        extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape)
+
+        for lyr in self.encoders:
+            x = lyr(x, extended_attention_mask)[0]
+        x = self.layer_norm(x)
+
+        return self.dropout_post(x), encoder_inputs_mask
diff --git a/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8ac8e1416bf7049f3835f62f5762aa237c34890
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
@@ -0,0 +1,269 @@
+# Copyright 2022 The Music Spectrogram Diffusion Authors.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ....models import T5FilmDecoder
+from ....schedulers import DDPMScheduler
+from ....utils import is_onnx_available, logging
+from ....utils.torch_utils import randn_tensor
+
+
+if is_onnx_available():
+    from ...onnx_utils import OnnxRuntimeModel
+
+from ...pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+from .continuous_encoder import SpectrogramContEncoder
+from .notes_encoder import SpectrogramNotesEncoder
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+TARGET_FEATURE_LENGTH = 256
+
+
+class SpectrogramDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for unconditional audio generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        notes_encoder ([`SpectrogramNotesEncoder`]):
+        continuous_encoder ([`SpectrogramContEncoder`]):
+        decoder ([`T5FilmDecoder`]):
+            A [`T5FilmDecoder`] to denoise the encoded audio latents.
+        scheduler ([`DDPMScheduler`]):
+            A scheduler to be used in combination with `decoder` to denoise the encoded audio latents.
+        melgan ([`OnnxRuntimeModel`]):
+    """
+
+    _optional_components = ["melgan"]
+
+    def __init__(
+        self,
+        notes_encoder: SpectrogramNotesEncoder,
+        continuous_encoder: SpectrogramContEncoder,
+        decoder: T5FilmDecoder,
+        scheduler: DDPMScheduler,
+        melgan: OnnxRuntimeModel if is_onnx_available() else Any,
+    ) -> None:
+        super().__init__()
+
+        # From MELGAN
+        self.min_value = math.log(1e-5)  # Matches MelGAN training.
+        self.max_value = 4.0  # Largest value for most examples
+        self.n_dims = 128
+
+        self.register_modules(
+            notes_encoder=notes_encoder,
+            continuous_encoder=continuous_encoder,
+            decoder=decoder,
+            scheduler=scheduler,
+            melgan=melgan,
+        )
+
+    def scale_features(self, features, output_range=(-1.0, 1.0), clip=False):
+        """Linearly scale features to network outputs range."""
+        min_out, max_out = output_range
+        if clip:
+            features = torch.clip(features, self.min_value, self.max_value)
+        # Scale to [0, 1].
+        zero_one = (features - self.min_value) / (self.max_value - self.min_value)
+        # Scale to [min_out, max_out].
+        return zero_one * (max_out - min_out) + min_out
+
+    def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False):
+        """Invert by linearly scaling network outputs to features range."""
+        min_out, max_out = input_range
+        outputs = torch.clip(outputs, min_out, max_out) if clip else outputs
+        # Scale to [0, 1].
+        zero_one = (outputs - min_out) / (max_out - min_out)
+        # Scale to [self.min_value, self.max_value].
+        return zero_one * (self.max_value - self.min_value) + self.min_value
+
+    def encode(self, input_tokens, continuous_inputs, continuous_mask):
+        tokens_mask = input_tokens > 0
+        tokens_encoded, tokens_mask = self.notes_encoder(
+            encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask
+        )
+
+        continuous_encoded, continuous_mask = self.continuous_encoder(
+            encoder_inputs=continuous_inputs, encoder_inputs_mask=continuous_mask
+        )
+
+        return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)]
+
+    def decode(self, encodings_and_masks, input_tokens, noise_time):
+        timesteps = noise_time
+        if not torch.is_tensor(timesteps):
+            timesteps = torch.tensor([timesteps], dtype=torch.long, device=input_tokens.device)
+        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(input_tokens.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps * torch.ones(input_tokens.shape[0], dtype=timesteps.dtype, device=timesteps.device)
+
+        logits = self.decoder(
+            encodings_and_masks=encodings_and_masks, decoder_input_tokens=input_tokens, decoder_noise_time=timesteps
+        )
+        return logits
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        input_tokens: List[List[int]],
+        generator: Optional[torch.Generator] = None,
+        num_inference_steps: int = 100,
+        return_dict: bool = True,
+        output_type: str = "np",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ) -> Union[AudioPipelineOutput, Tuple]:
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            input_tokens (`List[List[int]]`):
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
+                expense of slower inference.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated audio.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Example:
+
+        ```py
+        >>> from diffusers import SpectrogramDiffusionPipeline, MidiProcessor
+
+        >>> pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
+        >>> pipe = pipe.to("cuda")
+        >>> processor = MidiProcessor()
+
+        >>> # Download MIDI from: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid
+        >>> output = pipe(processor("beethoven_hammerklavier_2.mid"))
+
+        >>> audio = output.audios[0]
+        ```
+
+        Returns:
+            [`pipelines.AudioPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated audio.
+        """
+
+        pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32)
+        full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32)
+        ones = torch.ones((1, TARGET_FEATURE_LENGTH), dtype=bool, device=self.device)
+
+        for i, encoder_input_tokens in enumerate(input_tokens):
+            if i == 0:
+                encoder_continuous_inputs = torch.from_numpy(pred_mel[:1].copy()).to(
+                    device=self.device, dtype=self.decoder.dtype
+                )
+                # The first chunk has no previous context.
+                encoder_continuous_mask = torch.zeros((1, TARGET_FEATURE_LENGTH), dtype=bool, device=self.device)
+            else:
+                # The full song pipeline does not feed in a context feature, so the mask
+                # will be all 0s after the feature converter. Because we know we're
+                # feeding in a full context chunk from the previous prediction, set it
+                # to all 1s.
+                encoder_continuous_mask = ones
+
+            encoder_continuous_inputs = self.scale_features(
+                encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True
+            )
+
+            encodings_and_masks = self.encode(
+                input_tokens=torch.IntTensor([encoder_input_tokens]).to(device=self.device),
+                continuous_inputs=encoder_continuous_inputs,
+                continuous_mask=encoder_continuous_mask,
+            )
+
+            # Sample encoder_continuous_inputs shaped gaussian noise to begin loop
+            x = randn_tensor(
+                shape=encoder_continuous_inputs.shape,
+                generator=generator,
+                device=self.device,
+                dtype=self.decoder.dtype,
+            )
+
+            # set step values
+            self.scheduler.set_timesteps(num_inference_steps)
+
+            # Denoising diffusion loop
+            for j, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+                output = self.decode(
+                    encodings_and_masks=encodings_and_masks,
+                    input_tokens=x,
+                    noise_time=t / self.scheduler.config.num_train_timesteps,  # rescale to [0, 1)
+                )
+
+                # Compute previous output: x_t -> x_t-1
+                x = self.scheduler.step(output, t, x, generator=generator).prev_sample
+
+            mel = self.scale_to_features(x, input_range=[-1.0, 1.0])
+            encoder_continuous_inputs = mel[:1]
+            pred_mel = mel.cpu().float().numpy()
+
+            full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1)
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, full_pred_mel)
+
+            logger.info("Generated segment", i)
+
+        if output_type == "np" and not is_onnx_available():
+            raise ValueError(
+                "Cannot return output in 'np' format if ONNX is not available. Make sure to have ONNX installed or set 'output_type' to 'mel'."
+            )
+        elif output_type == "np" and self.melgan is None:
+            raise ValueError(
+                "Cannot return output in 'np' format if melgan component is not defined. Make sure to define `self.melgan` or set 'output_type' to 'mel'."
+            )
+
+        if output_type == "np":
+            output = self.melgan(input_features=full_pred_mel.astype(np.float32))
+        else:
+            output = full_pred_mel
+
+        if not return_dict:
+            return (output,)
+
+        return AudioPipelineOutput(audios=output)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..36cf1a33ce6ada8e718aabadb9a706737aee30bd
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py
@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING
+
+from ....utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ....utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_cycle_diffusion"] = ["CycleDiffusionPipeline"]
+    _import_structure["pipeline_stable_diffusion_inpaint_legacy"] = ["StableDiffusionInpaintPipelineLegacy"]
+    _import_structure["pipeline_stable_diffusion_model_editing"] = ["StableDiffusionModelEditingPipeline"]
+
+    _import_structure["pipeline_stable_diffusion_paradigms"] = ["StableDiffusionParadigmsPipeline"]
+    _import_structure["pipeline_stable_diffusion_pix2pix_zero"] = ["StableDiffusionPix2PixZeroPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ....utils.dummy_torch_and_transformers_objects import *
+
+    else:
+        from .pipeline_cycle_diffusion import CycleDiffusionPipeline
+        from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
+        from .pipeline_stable_diffusion_model_editing import StableDiffusionModelEditingPipeline
+        from .pipeline_stable_diffusion_paradigms import StableDiffusionParadigmsPipeline
+        from .pipeline_stable_diffusion_pix2pix_zero import StableDiffusionPix2PixZeroPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..777be883cb9dfc86f8a5268620a746214da7dba4
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py
@@ -0,0 +1,949 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ....configuration_utils import FrozenDict
+from ....image_processor import PipelineImageInput, VaeImageProcessor
+from ....loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ....models import AutoencoderKL, UNet2DConditionModel
+from ....models.lora import adjust_lora_scale_text_encoder
+from ....schedulers import DDIMScheduler
+from ....utils import PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline
+from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
+def preprocess(image):
+    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
+    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta):
+    # 1. get previous step value (=t-1)
+    prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
+
+    if prev_timestep <= 0:
+        return clean_latents
+
+    # 2. compute alphas, betas
+    alpha_prod_t = scheduler.alphas_cumprod[timestep]
+    alpha_prod_t_prev = (
+        scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+    )
+
+    variance = scheduler._get_variance(timestep, prev_timestep)
+    std_dev_t = eta * variance ** (0.5)
+
+    # direction pointing to x_t
+    e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5)
+    dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t
+    noise = std_dev_t * randn_tensor(
+        clean_latents.shape, dtype=clean_latents.dtype, device=clean_latents.device, generator=generator
+    )
+    prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise
+
+    return prev_latents
+
+
+def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta):
+    # 1. get previous step value (=t-1)
+    prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
+
+    # 2. compute alphas, betas
+    alpha_prod_t = scheduler.alphas_cumprod[timestep]
+    alpha_prod_t_prev = (
+        scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+    )
+
+    beta_prod_t = 1 - alpha_prod_t
+
+    # 3. compute predicted original sample from predicted noise also called
+    # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+    # 4. Clip "predicted x_0"
+    if scheduler.config.clip_sample:
+        pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+    # 5. compute variance: "sigma_t(η)" -> see formula (16)
+    # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+    variance = scheduler._get_variance(timestep, prev_timestep)
+    std_dev_t = eta * variance ** (0.5)
+
+    # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred
+
+    noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / (
+        variance ** (0.5) * eta
+    )
+    return noise
+
+
+class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin):
+    r"""
+    Pipeline for text-guided image to image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can only be an
+            instance of [`DDIMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DDIMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        strength,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = image.shape[0]
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            if isinstance(generator, list):
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(image.shape[0])
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
+
+        # add noise to latents using the timestep
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        clean_latents = init_latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents, clean_latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        source_prompt: Union[str, List[str]],
+        image: PipelineImageInput = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        source_guidance_scale: Optional[float] = 1,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            source_guidance_scale (`float`, *optional*, defaults to 1):
+                Guidance scale for the source prompt. This is useful to control the amount of influence the source
+                prompt has for encoding.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Example:
+
+        ```py
+        import requests
+        import torch
+        from PIL import Image
+        from io import BytesIO
+
+        from diffusers import CycleDiffusionPipeline, DDIMScheduler
+
+        # load the pipeline
+        # make sure you're logged in with `huggingface-cli login`
+        model_id_or_path = "CompVis/stable-diffusion-v1-4"
+        scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
+        pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
+
+        # let's download an initial image
+        url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
+        response = requests.get(url)
+        init_image = Image.open(BytesIO(response.content)).convert("RGB")
+        init_image = init_image.resize((512, 512))
+        init_image.save("horse.png")
+
+        # let's specify a prompt
+        source_prompt = "An astronaut riding a horse"
+        prompt = "An astronaut riding an elephant"
+
+        # call the pipeline
+        image = pipe(
+            prompt=prompt,
+            source_prompt=source_prompt,
+            image=init_image,
+            num_inference_steps=100,
+            eta=0.1,
+            strength=0.8,
+            guidance_scale=2,
+            source_guidance_scale=1,
+        ).images[0]
+
+        image.save("horse_to_elephant.png")
+
+        # let's try another example
+        # See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion
+        url = (
+            "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
+        )
+        response = requests.get(url)
+        init_image = Image.open(BytesIO(response.content)).convert("RGB")
+        init_image = init_image.resize((512, 512))
+        init_image.save("black.png")
+
+        source_prompt = "A black colored car"
+        prompt = "A blue colored car"
+
+        # call the pipeline
+        torch.manual_seed(0)
+        image = pipe(
+            prompt=prompt,
+            source_prompt=source_prompt,
+            image=init_image,
+            num_inference_steps=100,
+            eta=0.1,
+            strength=0.85,
+            guidance_scale=3,
+            source_guidance_scale=1,
+        ).images[0]
+
+        image.save("black_to_blue.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 1. Check inputs
+        self.check_inputs(prompt, strength, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+        source_prompt_embeds_tuple = self.encode_prompt(
+            source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, None, clip_skip=clip_skip
+        )
+        if prompt_embeds_tuple[1] is not None:
+            prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+        else:
+            prompt_embeds = prompt_embeds_tuple[0]
+        if source_prompt_embeds_tuple[1] is not None:
+            source_prompt_embeds = torch.cat([source_prompt_embeds_tuple[1], source_prompt_embeds_tuple[0]])
+        else:
+            source_prompt_embeds = source_prompt_embeds_tuple[0]
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        latents, clean_latents = self.prepare_latents(
+            image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+        )
+        source_latents = latents
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        generator = extra_step_kwargs.pop("generator", None)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                source_latent_model_input = (
+                    torch.cat([source_latents] * 2) if do_classifier_free_guidance else source_latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t)
+
+                # predict the noise residual
+                if do_classifier_free_guidance:
+                    concat_latent_model_input = torch.stack(
+                        [
+                            source_latent_model_input[0],
+                            latent_model_input[0],
+                            source_latent_model_input[1],
+                            latent_model_input[1],
+                        ],
+                        dim=0,
+                    )
+                    concat_prompt_embeds = torch.stack(
+                        [
+                            source_prompt_embeds[0],
+                            prompt_embeds[0],
+                            source_prompt_embeds[1],
+                            prompt_embeds[1],
+                        ],
+                        dim=0,
+                    )
+                else:
+                    concat_latent_model_input = torch.cat(
+                        [
+                            source_latent_model_input,
+                            latent_model_input,
+                        ],
+                        dim=0,
+                    )
+                    concat_prompt_embeds = torch.cat(
+                        [
+                            source_prompt_embeds,
+                            prompt_embeds,
+                        ],
+                        dim=0,
+                    )
+
+                concat_noise_pred = self.unet(
+                    concat_latent_model_input,
+                    t,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_hidden_states=concat_prompt_embeds,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    (
+                        source_noise_pred_uncond,
+                        noise_pred_uncond,
+                        source_noise_pred_text,
+                        noise_pred_text,
+                    ) = concat_noise_pred.chunk(4, dim=0)
+
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    source_noise_pred = source_noise_pred_uncond + source_guidance_scale * (
+                        source_noise_pred_text - source_noise_pred_uncond
+                    )
+
+                else:
+                    (source_noise_pred, noise_pred) = concat_noise_pred.chunk(2, dim=0)
+
+                # Sample source_latents from the posterior distribution.
+                prev_source_latents = posterior_sample(
+                    self.scheduler, source_latents, t, clean_latents, generator=generator, **extra_step_kwargs
+                )
+                # Compute noise.
+                noise = compute_noise(
+                    self.scheduler, prev_source_latents, source_latents, t, source_noise_pred, **extra_step_kwargs
+                )
+                source_latents = prev_source_latents
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs
+                ).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 9. Post-processing
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py
new file mode 100644
index 0000000000000000000000000000000000000000..0aa5e68bfcb4b276cafea63fce6e2a5f3e2f79c2
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py
@@ -0,0 +1,542 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTokenizer
+
+from ....configuration_utils import FrozenDict
+from ....schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ....utils import deprecate, logging
+from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ...pipeline_utils import DiffusionPipeline
+from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask, scale_factor=8):
+    mask = mask.convert("L")
+    w, h = mask.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL.Image.NEAREST)
+    mask = np.array(mask).astype(np.float32) / 255.0
+    mask = np.tile(mask, (4, 1, 1))
+    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
+    mask = 1 - mask  # repaint white, keep black
+    return mask
+
+
+class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image inpainting using Stable Diffusion. This is a *legacy feature* for Onnx pipelines to
+    provide compatibility with StableDiffusionInpaintPipelineLegacy and may be removed in the future.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _is_onnx = True
+
+    vae_encoder: OnnxRuntimeModel
+    vae_decoder: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: OnnxRuntimeModel
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+    safety_checker: OnnxRuntimeModel
+    feature_extractor: CLIPImageProcessor
+
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: Optional[int],
+        do_classifier_free_guidance: bool,
+        negative_prompt: Optional[str],
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
+
+            if not np.array_equal(text_input_ids, untruncated_ids):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+
+        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[np.ndarray, PIL.Image.Image] = None,
+        mask_image: Union[np.ndarray, PIL.Image.Image] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`nd.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. This is the image whose masked region will be inpainted.
+            mask_image (`nd.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.uu
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (?) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # check inputs. Raise error if not correct
+        self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+        # define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if generator is None:
+            generator = np.random
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        latents_dtype = prompt_embeds.dtype
+        image = image.astype(latents_dtype)
+
+        # encode the init image into latents and scale the latents
+        init_latents = self.vae_encoder(sample=image)[0]
+        init_latents = 0.18215 * init_latents
+
+        # Expand init_latents for batch_size and num_images_per_prompt
+        init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0)
+        init_latents_orig = init_latents
+
+        # preprocess mask
+        if not isinstance(mask_image, np.ndarray):
+            mask_image = preprocess_mask(mask_image, 8)
+        mask_image = mask_image.astype(latents_dtype)
+        mask = np.concatenate([mask_image] * num_images_per_prompt, axis=0)
+
+        # check sizes
+        if not mask.shape == init_latents.shape:
+            raise ValueError("The mask and image should be the same size!")
+
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        timesteps = self.scheduler.timesteps.numpy()[-init_timestep]
+        timesteps = np.array([timesteps] * batch_size * num_images_per_prompt)
+
+        # add noise to latents using the timesteps
+        noise = generator.randn(*init_latents.shape).astype(latents_dtype)
+        init_latents = self.scheduler.add_noise(
+            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
+        )
+        init_latents = init_latents.numpy()
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (?) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to ? in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        latents = init_latents
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        timesteps = self.scheduler.timesteps[t_start:].numpy()
+        timestep_dtype = next(
+            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            timestep = np.array([t], dtype=timestep_dtype)
+            noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[
+                0
+            ]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            ).prev_sample
+
+            latents = latents.numpy()
+
+            init_latents_proper = self.scheduler.add_noise(
+                torch.from_numpy(init_latents_orig), torch.from_numpy(noise), torch.from_numpy(np.array([t]))
+            )
+
+            init_latents_proper = init_latents_proper.numpy()
+
+            latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
+        image = np.concatenate(
+            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+        )
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # There will throw an error if use safety_checker batchsize>1
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce7ad3b0dfe94500173b411e5cdbf1bad608d6d1
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py
@@ -0,0 +1,784 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ....configuration_utils import FrozenDict
+from ....image_processor import VaeImageProcessor
+from ....loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ....models import AutoencoderKL, UNet2DConditionModel
+from ....models.lora import adjust_lora_scale_text_encoder
+from ....schedulers import KarrasDiffusionSchedulers
+from ....utils import PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline
+from ...stable_diffusion import StableDiffusionPipelineOutput
+from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)
+
+
+def preprocess_image(image, batch_size):
+    w, h = image.size
+    w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+    image = np.array(image).astype(np.float32) / 255.0
+    image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask, batch_size, scale_factor=8):
+    if not isinstance(mask, torch.Tensor):
+        mask = mask.convert("L")
+        w, h = mask.size
+        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+        mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
+        mask = np.array(mask).astype(np.float32) / 255.0
+        mask = np.tile(mask, (4, 1, 1))
+        mask = np.vstack([mask[None]] * batch_size)
+        mask = 1 - mask  # repaint white, keep black
+        mask = torch.from_numpy(mask)
+        return mask
+
+    else:
+        valid_mask_channel_sizes = [1, 3]
+        # if mask channel is fourth tensor dimension, permute dimensions to pytorch standard (B, C, H, W)
+        if mask.shape[3] in valid_mask_channel_sizes:
+            mask = mask.permute(0, 3, 1, 2)
+        elif mask.shape[1] not in valid_mask_channel_sizes:
+            raise ValueError(
+                f"Mask channel dimension of size in {valid_mask_channel_sizes} should be second or fourth dimension,"
+                f" but received mask of shape {tuple(mask.shape)}"
+            )
+        # (potentially) reduce mask channel dimension from 3 to 1 for broadcasting to latent shape
+        mask = mask.mean(dim=1, keepdim=True)
+        h, w = mask.shape[-2:]
+        h, w = (x - x % 8 for x in (h, w))  # resize to integer multiple of 8
+        mask = torch.nn.functional.interpolate(mask, (h // scale_factor, w // scale_factor))
+        return mask
+
+
+class StableDiffusionInpaintPipelineLegacy(
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, FromSingleFileMixin
+):
+    r"""
+    Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    In addition the pipeline inherits the following loading methods:
+        - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
+        - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]
+        - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
+
+    as well as the following saving methods:
+        - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`]
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        deprecation_message = (
+            f"The class {self.__class__} is deprecated and will be removed in v1.0.0. You can achieve exactly the same functionality"
+            "by loading your model into `StableDiffusionInpaintPipeline` instead. See https://github.com/huggingface/diffusers/pull/3533"
+            "for more information."
+        )
+        deprecate("legacy is outdated", "1.0.0", deprecation_message, standard_warn=False)
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        strength,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, device, generator):
+        image = image.to(device=device, dtype=dtype)
+        init_latent_dist = self.vae.encode(image).latent_dist
+        init_latents = init_latent_dist.sample(generator=generator)
+        init_latents = self.vae.config.scaling_factor * init_latents
+
+        # Expand init_latents for batch_size and num_images_per_prompt
+        init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
+        init_latents_orig = init_latents
+
+        # add noise to latents using the timesteps
+        noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype)
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+        return latents, init_latents_orig, noise
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        add_predicted_noise: Optional[bool] = False,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. This is the image whose masked region will be inpainted.
+            mask_image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If mask is a tensor, the
+                expected shape should be either `(B, H, W, C)` or `(B, C, H, W)`, where C is 1 or 3.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
+                is 1, the denoising process will be run on the masked area for the full number of iterations specified
+                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to
+                that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
+                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            add_predicted_noise (`bool`, *optional*, defaults to True):
+                Use predicted noise instead of random noise when constructing noisy versions of the original image in
+                the reverse diffusion process
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 1. Check inputs
+        self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Preprocess image and mask
+        if not isinstance(image, torch.Tensor):
+            image = preprocess_image(image, batch_size)
+
+        mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor)
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        # encode the init image into latents and scale the latents
+        latents, init_latents_orig, noise = self.prepare_latents(
+            image, latent_timestep, num_images_per_prompt, prompt_embeds.dtype, device, generator
+        )
+
+        # 7. Prepare mask latent
+        mask = mask_image.to(device=device, dtype=latents.dtype)
+        mask = torch.cat([mask] * num_images_per_prompt)
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                # masking
+                if add_predicted_noise:
+                    init_latents_proper = self.scheduler.add_noise(
+                        init_latents_orig, noise_pred_uncond, torch.tensor([t])
+                    )
+                else:
+                    init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
+
+                latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # use original latents corresponding to unmasked portions of the image
+        latents = (init_latents_orig * mask) + (latents * (1 - mask))
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e91986896bd4ec46560a8cc4f03845ccade4c61
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py
@@ -0,0 +1,830 @@
+# Copyright 2024 TIME Authors and The HuggingFace Team. All rights reserved."
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ....image_processor import VaeImageProcessor
+from ....loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ....models import AutoencoderKL, UNet2DConditionModel
+from ....models.lora import adjust_lora_scale_text_encoder
+from ....schedulers import PNDMScheduler
+from ....schedulers.scheduling_utils import SchedulerMixin
+from ....utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+AUGS_CONST = ["A photo of ", "An image of ", "A picture of "]
+
+
+class StableDiffusionModelEditingPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
+    r"""
+    Pipeline for text-to-image model editing.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+        with_to_k ([`bool`]):
+            Whether to edit the key projection matrices along with the value projection matrices.
+        with_augs ([`list`]):
+            Textual augmentations to apply while editing the text-to-image model. Set to `[]` for no augmentations.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: SchedulerMixin,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+        with_to_k: bool = True,
+        with_augs: list = AUGS_CONST,
+    ):
+        super().__init__()
+
+        if isinstance(scheduler, PNDMScheduler):
+            logger.error("PNDMScheduler for this pipeline is currently not supported.")
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+        self.with_to_k = with_to_k
+        self.with_augs = with_augs
+
+        # get cross-attention layers
+        ca_layers = []
+
+        def append_ca(net_):
+            if net_.__class__.__name__ == "CrossAttention":
+                ca_layers.append(net_)
+            elif hasattr(net_, "children"):
+                for net__ in net_.children():
+                    append_ca(net__)
+
+        # recursively find all cross-attention layers in unet
+        for net in self.unet.named_children():
+            if "down" in net[0]:
+                append_ca(net[1])
+            elif "up" in net[0]:
+                append_ca(net[1])
+            elif "mid" in net[0]:
+                append_ca(net[1])
+
+        # get projection matrices
+        self.ca_clip_layers = [l for l in ca_layers if l.to_v.in_features == 768]
+        self.projection_matrices = [l.to_v for l in self.ca_clip_layers]
+        self.og_matrices = [copy.deepcopy(l.to_v) for l in self.ca_clip_layers]
+        if self.with_to_k:
+            self.projection_matrices = self.projection_matrices + [l.to_k for l in self.ca_clip_layers]
+            self.og_matrices = self.og_matrices + [copy.deepcopy(l.to_k) for l in self.ca_clip_layers]
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def edit_model(
+        self,
+        source_prompt: str,
+        destination_prompt: str,
+        lamb: float = 0.1,
+        restart_params: bool = True,
+    ):
+        r"""
+        Apply model editing via closed-form solution (see Eq. 5 in the TIME [paper](https://arxiv.org/abs/2303.08084)).
+
+        Args:
+            source_prompt (`str`):
+                The source prompt containing the concept to be edited.
+            destination_prompt (`str`):
+                The destination prompt. Must contain all words from `source_prompt` with additional ones to specify the
+                target edit.
+            lamb (`float`, *optional*, defaults to 0.1):
+                The lambda parameter specifying the regularization intesity. Smaller values increase the editing power.
+            restart_params (`bool`, *optional*, defaults to True):
+                Restart the model parameters to their pre-trained version before editing. This is done to avoid edit
+                compounding. When it is `False`, edits accumulate.
+        """
+
+        # restart LDM parameters
+        if restart_params:
+            num_ca_clip_layers = len(self.ca_clip_layers)
+            for idx_, l in enumerate(self.ca_clip_layers):
+                l.to_v = copy.deepcopy(self.og_matrices[idx_])
+                self.projection_matrices[idx_] = l.to_v
+                if self.with_to_k:
+                    l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers + idx_])
+                    self.projection_matrices[num_ca_clip_layers + idx_] = l.to_k
+
+        # set up sentences
+        old_texts = [source_prompt]
+        new_texts = [destination_prompt]
+        # add augmentations
+        base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:]
+        for aug in self.with_augs:
+            old_texts.append(aug + base)
+        base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:]
+        for aug in self.with_augs:
+            new_texts.append(aug + base)
+
+        # prepare input k* and v*
+        old_embs, new_embs = [], []
+        for old_text, new_text in zip(old_texts, new_texts):
+            text_input = self.tokenizer(
+                [old_text, new_text],
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+            old_emb, new_emb = text_embeddings
+            old_embs.append(old_emb)
+            new_embs.append(new_emb)
+
+        # identify corresponding destinations for each token in old_emb
+        idxs_replaces = []
+        for old_text, new_text in zip(old_texts, new_texts):
+            tokens_a = self.tokenizer(old_text).input_ids
+            tokens_b = self.tokenizer(new_text).input_ids
+            tokens_a = [self.tokenizer.encode("a ")[1] if self.tokenizer.decode(t) == "an" else t for t in tokens_a]
+            tokens_b = [self.tokenizer.encode("a ")[1] if self.tokenizer.decode(t) == "an" else t for t in tokens_b]
+            num_orig_tokens = len(tokens_a)
+            idxs_replace = []
+            j = 0
+            for i in range(num_orig_tokens):
+                curr_token = tokens_a[i]
+                while tokens_b[j] != curr_token:
+                    j += 1
+                idxs_replace.append(j)
+                j += 1
+            while j < 77:
+                idxs_replace.append(j)
+                j += 1
+            while len(idxs_replace) < 77:
+                idxs_replace.append(76)
+            idxs_replaces.append(idxs_replace)
+
+        # prepare batch: for each pair of setences, old context and new values
+        contexts, valuess = [], []
+        for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, idxs_replaces):
+            context = old_emb.detach()
+            values = []
+            with torch.no_grad():
+                for layer in self.projection_matrices:
+                    values.append(layer(new_emb[idxs_replace]).detach())
+            contexts.append(context)
+            valuess.append(values)
+
+        # edit the model
+        for layer_num in range(len(self.projection_matrices)):
+            # mat1 = \lambda W + \sum{v k^T}
+            mat1 = lamb * self.projection_matrices[layer_num].weight
+
+            # mat2 = \lambda I + \sum{k k^T}
+            mat2 = lamb * torch.eye(
+                self.projection_matrices[layer_num].weight.shape[1],
+                device=self.projection_matrices[layer_num].weight.device,
+            )
+
+            # aggregate sums for mat1, mat2
+            for context, values in zip(contexts, valuess):
+                context_vector = context.reshape(context.shape[0], context.shape[1], 1)
+                context_vector_T = context.reshape(context.shape[0], 1, context.shape[1])
+                value_vector = values[layer_num].reshape(values[layer_num].shape[0], values[layer_num].shape[1], 1)
+                for_mat1 = (value_vector @ context_vector_T).sum(dim=0)
+                for_mat2 = (context_vector @ context_vector_T).sum(dim=0)
+                mat1 += for_mat1
+                mat2 += for_mat2
+
+            # update projection matrix
+            self.projection_matrices[layer_num].weight = torch.nn.Parameter(mat1 @ torch.inverse(mat2))
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
+        Examples:
+
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionModelEditingPipeline
+
+        >>> model_ckpt = "CompVis/stable-diffusion-v1-4"
+        >>> pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt)
+
+        >>> pipe = pipe.to("cuda")
+
+        >>> source_prompt = "A pack of roses"
+        >>> destination_prompt = "A pack of blue roses"
+        >>> pipe.edit_model(source_prompt, destination_prompt)
+
+        >>> prompt = "A field of roses"
+        >>> image = pipe(prompt).images[0]
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py
new file mode 100644
index 0000000000000000000000000000000000000000..be21900ab55a65c33350862e8f3634b9f4213ca8
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py
@@ -0,0 +1,796 @@
+# Copyright 2024 ParaDiGMS authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ....image_processor import VaeImageProcessor
+from ....loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ....models import AutoencoderKL, UNet2DConditionModel
+from ....models.lora import adjust_lora_scale_text_encoder
+from ....schedulers import KarrasDiffusionSchedulers
+from ....utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import DDPMParallelScheduler
+        >>> from diffusers import StableDiffusionParadigmsPipeline
+
+        >>> scheduler = DDPMParallelScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+
+        >>> pipe = StableDiffusionParadigmsPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", scheduler=scheduler, torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> ngpu, batch_per_device = torch.cuda.device_count(), 5
+        >>> pipe.wrapped_unet = torch.nn.DataParallel(pipe.unet, device_ids=[d for d in range(ngpu)])
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt, parallel=ngpu * batch_per_device, num_inference_steps=1000).images[0]
+        ```
+"""
+
+
+class StableDiffusionParadigmsPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using a parallelized version of Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+        # attribute to wrap the unet with torch.nn.DataParallel when running multiple denoising steps on multiple GPUs
+        self.wrapped_unet = self.unet
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def _cumsum(self, input, dim, debug=False):
+        if debug:
+            # cumsum_cuda_kernel does not have a deterministic implementation
+            # so perform cumsum on cpu for debugging purposes
+            return torch.cumsum(input.cpu().float(), dim=dim).to(input.device)
+        else:
+            return torch.cumsum(input, dim=dim)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        parallel: int = 10,
+        tolerance: float = 0.1,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        debug: bool = False,
+        clip_skip: int = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            parallel (`int`, *optional*, defaults to 10):
+                The batch size to use when doing parallel sampling. More parallelism may lead to faster inference but
+                requires higher memory usage and can also require more total FLOPs.
+            tolerance (`float`, *optional*, defaults to 0.1):
+                The error tolerance for determining when to slide the batch window forward for parallel sampling. Lower
+                tolerance usually leads to less or no degradation. Higher tolerance is faster but can risk degradation
+                of sample quality. The tolerance is specified as a ratio of the scheduler's noise magnitude.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            debug (`bool`, *optional*, defaults to `False`):
+                Whether or not to run in debug mode. In debug mode, `torch.cumsum` is evaluated using the CPU.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        extra_step_kwargs.pop("generator", None)
+
+        # # 7. Denoising loop
+        scheduler = self.scheduler
+        parallel = min(parallel, len(scheduler.timesteps))
+
+        begin_idx = 0
+        end_idx = parallel
+        latents_time_evolution_buffer = torch.stack([latents] * (len(scheduler.timesteps) + 1))
+
+        # We must make sure the noise of stochastic schedulers such as DDPM is sampled only once per timestep.
+        # Sampling inside the parallel denoising loop will mess this up, so we pre-sample the noise vectors outside the denoising loop.
+        noise_array = torch.zeros_like(latents_time_evolution_buffer)
+        for j in range(len(scheduler.timesteps)):
+            base_noise = randn_tensor(
+                shape=latents.shape, generator=generator, device=latents.device, dtype=prompt_embeds.dtype
+            )
+            noise = (self.scheduler._get_variance(scheduler.timesteps[j]) ** 0.5) * base_noise
+            noise_array[j] = noise.clone()
+
+        # We specify the error tolerance as a ratio of the scheduler's noise magnitude. We similarly compute the error tolerance
+        # outside of the denoising loop to avoid recomputing it at every step.
+        # We will be dividing the norm of the noise, so we store its inverse here to avoid a division at every step.
+        inverse_variance_norm = 1.0 / torch.tensor(
+            [scheduler._get_variance(scheduler.timesteps[j]) for j in range(len(scheduler.timesteps))] + [0]
+        ).to(noise_array.device)
+        latent_dim = noise_array[0, 0].numel()
+        inverse_variance_norm = inverse_variance_norm[:, None] / latent_dim
+
+        scaled_tolerance = tolerance**2
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            steps = 0
+            while begin_idx < len(scheduler.timesteps):
+                # these have shape (parallel_dim, 2*batch_size, ...)
+                # parallel_len is at most parallel, but could be less if we are at the end of the timesteps
+                # we are processing batch window of timesteps spanning [begin_idx, end_idx)
+                parallel_len = end_idx - begin_idx
+
+                block_prompt_embeds = torch.stack([prompt_embeds] * parallel_len)
+                block_latents = latents_time_evolution_buffer[begin_idx:end_idx]
+                block_t = scheduler.timesteps[begin_idx:end_idx, None].repeat(1, batch_size * num_images_per_prompt)
+                t_vec = block_t
+                if do_classifier_free_guidance:
+                    t_vec = t_vec.repeat(1, 2)
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = (
+                    torch.cat([block_latents] * 2, dim=1) if do_classifier_free_guidance else block_latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t_vec)
+
+                # if parallel_len is small, no need to use multiple GPUs
+                net = self.wrapped_unet if parallel_len > 3 else self.unet
+                # predict the noise residual, shape is now [parallel_len * 2 * batch_size * num_images_per_prompt, ...]
+                model_output = net(
+                    latent_model_input.flatten(0, 1),
+                    t_vec.flatten(0, 1),
+                    encoder_hidden_states=block_prompt_embeds.flatten(0, 1),
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                per_latent_shape = model_output.shape[1:]
+                if do_classifier_free_guidance:
+                    model_output = model_output.reshape(
+                        parallel_len, 2, batch_size * num_images_per_prompt, *per_latent_shape
+                    )
+                    noise_pred_uncond, noise_pred_text = model_output[:, 0], model_output[:, 1]
+                    model_output = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                model_output = model_output.reshape(
+                    parallel_len * batch_size * num_images_per_prompt, *per_latent_shape
+                )
+
+                block_latents_denoise = scheduler.batch_step_no_noise(
+                    model_output=model_output,
+                    timesteps=block_t.flatten(0, 1),
+                    sample=block_latents.flatten(0, 1),
+                    **extra_step_kwargs,
+                ).reshape(block_latents.shape)
+
+                # back to shape (parallel_dim, batch_size, ...)
+                # now we want to add the pre-sampled noise
+                # parallel sampling algorithm requires computing the cumulative drift from the beginning
+                # of the window, so we need to compute cumulative sum of the deltas and the pre-sampled noises.
+                delta = block_latents_denoise - block_latents
+                cumulative_delta = self._cumsum(delta, dim=0, debug=debug)
+                cumulative_noise = self._cumsum(noise_array[begin_idx:end_idx], dim=0, debug=debug)
+
+                # if we are using an ODE-like scheduler (like DDIM), we don't want to add noise
+                if scheduler._is_ode_scheduler:
+                    cumulative_noise = 0
+
+                block_latents_new = (
+                    latents_time_evolution_buffer[begin_idx][None,] + cumulative_delta + cumulative_noise
+                )
+                cur_error = torch.linalg.norm(
+                    (block_latents_new - latents_time_evolution_buffer[begin_idx + 1 : end_idx + 1]).reshape(
+                        parallel_len, batch_size * num_images_per_prompt, -1
+                    ),
+                    dim=-1,
+                ).pow(2)
+                error_ratio = cur_error * inverse_variance_norm[begin_idx + 1 : end_idx + 1]
+
+                # find the first index of the vector error_ratio that is greater than error tolerance
+                # we can shift the window for the next iteration up to this index
+                error_ratio = torch.nn.functional.pad(
+                    error_ratio, (0, 0, 0, 1), value=1e9
+                )  # handle the case when everything is below ratio, by padding the end of parallel_len dimension
+                any_error_at_time = torch.max(error_ratio > scaled_tolerance, dim=1).values.int()
+                ind = torch.argmax(any_error_at_time).item()
+
+                # compute the new begin and end idxs for the window
+                new_begin_idx = begin_idx + min(1 + ind, parallel)
+                new_end_idx = min(new_begin_idx + parallel, len(scheduler.timesteps))
+
+                # store the computed latents for the current window in the global buffer
+                latents_time_evolution_buffer[begin_idx + 1 : end_idx + 1] = block_latents_new
+                # initialize the new sliding window latents with the end of the current window,
+                # should be better than random initialization
+                latents_time_evolution_buffer[end_idx : new_end_idx + 1] = latents_time_evolution_buffer[end_idx][
+                    None,
+                ]
+
+                steps += 1
+
+                progress_bar.update(new_begin_idx - begin_idx)
+                if callback is not None and steps % callback_steps == 0:
+                    callback(begin_idx, block_t[begin_idx], latents_time_evolution_buffer[begin_idx])
+
+                begin_idx = new_begin_idx
+                end_idx = new_end_idx
+
+        latents = latents_time_evolution_buffer[-1]
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py
new file mode 100644
index 0000000000000000000000000000000000000000..2978972200c7ae9dc2d967a698753bd564e9a913
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py
@@ -0,0 +1,1310 @@
+# Copyright 2024 Pix2Pix Zero Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import (
+    BlipForConditionalGeneration,
+    BlipProcessor,
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+)
+
+from ....image_processor import PipelineImageInput, VaeImageProcessor
+from ....loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ....models import AutoencoderKL, UNet2DConditionModel
+from ....models.attention_processor import Attention
+from ....models.lora import adjust_lora_scale_text_encoder
+from ....schedulers import DDIMScheduler, DDPMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler
+from ....schedulers.scheduling_ddim_inverse import DDIMInverseScheduler
+from ....utils import (
+    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
+    BaseOutput,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class Pix2PixInversionPipelineOutput(BaseOutput, TextualInversionLoaderMixin):
+    """
+    Output class for Stable Diffusion pipelines.
+
+    Args:
+        latents (`torch.Tensor`)
+            inverted latents tensor
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+    """
+
+    latents: torch.Tensor
+    images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import requests
+        >>> import torch
+
+        >>> from diffusers import DDIMScheduler, StableDiffusionPix2PixZeroPipeline
+
+
+        >>> def download(embedding_url, local_filepath):
+        ...     r = requests.get(embedding_url)
+        ...     with open(local_filepath, "wb") as f:
+        ...         f.write(r.content)
+
+
+        >>> model_ckpt = "CompVis/stable-diffusion-v1-4"
+        >>> pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(model_ckpt, torch_dtype=torch.float16)
+        >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+        >>> pipeline.to("cuda")
+
+        >>> prompt = "a high resolution painting of a cat in the style of van gough"
+        >>> source_emb_url = "https://hf.co/datasets/sayakpaul/sample-datasets/resolve/main/cat.pt"
+        >>> target_emb_url = "https://hf.co/datasets/sayakpaul/sample-datasets/resolve/main/dog.pt"
+
+        >>> for url in [source_emb_url, target_emb_url]:
+        ...     download(url, url.split("/")[-1])
+
+        >>> src_embeds = torch.load(source_emb_url.split("/")[-1])
+        >>> target_embeds = torch.load(target_emb_url.split("/")[-1])
+        >>> images = pipeline(
+        ...     prompt,
+        ...     source_embeds=src_embeds,
+        ...     target_embeds=target_embeds,
+        ...     num_inference_steps=50,
+        ...     cross_attention_guidance_amount=0.15,
+        ... ).images
+
+        >>> images[0].save("edited_image_dog.png")
+        ```
+"""
+
+EXAMPLE_INVERT_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from transformers import BlipForConditionalGeneration, BlipProcessor
+        >>> from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionPix2PixZeroPipeline
+
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> captioner_id = "Salesforce/blip-image-captioning-base"
+        >>> processor = BlipProcessor.from_pretrained(captioner_id)
+        >>> model = BlipForConditionalGeneration.from_pretrained(
+        ...     captioner_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
+        ... )
+
+        >>> sd_model_ckpt = "CompVis/stable-diffusion-v1-4"
+        >>> pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
+        ...     sd_model_ckpt,
+        ...     caption_generator=model,
+        ...     caption_processor=processor,
+        ...     torch_dtype=torch.float16,
+        ...     safety_checker=None,
+        ... )
+
+        >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+        >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
+        >>> pipeline.enable_model_cpu_offload()
+
+        >>> img_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/test_images/cats/cat_6.png"
+
+        >>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB").resize((512, 512))
+        >>> # generate caption
+        >>> caption = pipeline.generate_caption(raw_image)
+
+        >>> # "a photography of a cat with flowers and dai dai daie - daie - daie kasaii"
+        >>> inv_latents = pipeline.invert(caption, image=raw_image).latents
+        >>> # we need to generate source and target embeds
+
+        >>> source_prompts = ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
+
+        >>> target_prompts = ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]
+
+        >>> source_embeds = pipeline.get_embeds(source_prompts)
+        >>> target_embeds = pipeline.get_embeds(target_prompts)
+        >>> # the latents can then be used to edit a real image
+        >>> # when using Stable Diffusion 2 or other models that use v-prediction
+        >>> # set `cross_attention_guidance_amount` to 0.01 or less to avoid input latent gradient explosion
+
+        >>> image = pipeline(
+        ...     caption,
+        ...     source_embeds=source_embeds,
+        ...     target_embeds=target_embeds,
+        ...     num_inference_steps=50,
+        ...     cross_attention_guidance_amount=0.15,
+        ...     generator=generator,
+        ...     latents=inv_latents,
+        ...     negative_prompt=caption,
+        ... ).images[0]
+        >>> image.save("edited_image.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
+def preprocess(image):
+    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
+    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+def prepare_unet(unet: UNet2DConditionModel):
+    """Modifies the UNet (`unet`) to perform Pix2Pix Zero optimizations."""
+    pix2pix_zero_attn_procs = {}
+    for name in unet.attn_processors.keys():
+        module_name = name.replace(".processor", "")
+        module = unet.get_submodule(module_name)
+        if "attn2" in name:
+            pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=True)
+            module.requires_grad_(True)
+        else:
+            pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=False)
+            module.requires_grad_(False)
+
+    unet.set_attn_processor(pix2pix_zero_attn_procs)
+    return unet
+
+
+class Pix2PixZeroL2Loss:
+    def __init__(self):
+        self.loss = 0.0
+
+    def compute_loss(self, predictions, targets):
+        self.loss += ((predictions - targets) ** 2).sum((1, 2)).mean(0)
+
+
+class Pix2PixZeroAttnProcessor:
+    """An attention processor class to store the attention weights.
+    In Pix2Pix Zero, it happens during computations in the cross-attention blocks."""
+
+    def __init__(self, is_pix2pix_zero=False):
+        self.is_pix2pix_zero = is_pix2pix_zero
+        if self.is_pix2pix_zero:
+            self.reference_cross_attn_map = {}
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        timestep=None,
+        loss=None,
+    ):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        if self.is_pix2pix_zero and timestep is not None:
+            # new bookkeeping to save the attention weights.
+            if loss is None:
+                self.reference_cross_attn_map[timestep.item()] = attention_probs.detach().cpu()
+            # compute loss
+            elif loss is not None:
+                prev_attn_probs = self.reference_cross_attn_map.pop(timestep.item())
+                loss.compute_loss(attention_probs, prev_attn_probs.to(attention_probs.device))
+
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for pixel-level image editing using Pix2Pix Zero. Based on Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`], or [`DDPMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+        requires_safety_checker (bool):
+            Whether the pipeline requires a safety checker. We recommend setting it to True if you're using the
+            pipeline publicly.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = [
+        "safety_checker",
+        "feature_extractor",
+        "caption_generator",
+        "caption_processor",
+        "inverse_scheduler",
+    ]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDPMScheduler, DDIMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler],
+        feature_extractor: CLIPImageProcessor,
+        safety_checker: StableDiffusionSafetyChecker,
+        inverse_scheduler: DDIMInverseScheduler,
+        caption_generator: BlipForConditionalGeneration,
+        caption_processor: BlipProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            caption_processor=caption_processor,
+            caption_generator=caption_generator,
+            inverse_scheduler=inverse_scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        source_embeds,
+        target_embeds,
+        callback_steps,
+        prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if source_embeds is None and target_embeds is None:
+            raise ValueError("`source_embeds` and `target_embeds` cannot be undefined.")
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+    #  Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def generate_caption(self, images):
+        """Generates caption for a given image."""
+        text = "a photography of"
+
+        prev_device = self.caption_generator.device
+
+        device = self._execution_device
+        inputs = self.caption_processor(images, text, return_tensors="pt").to(
+            device=device, dtype=self.caption_generator.dtype
+        )
+        self.caption_generator.to(device)
+        outputs = self.caption_generator.generate(**inputs, max_new_tokens=128)
+
+        # offload caption generator
+        self.caption_generator.to(prev_device)
+
+        caption = self.caption_processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        return caption
+
+    def construct_direction(self, embs_source: torch.Tensor, embs_target: torch.Tensor):
+        """Constructs the edit direction to steer the image generation process semantically."""
+        return (embs_target.mean(0) - embs_source.mean(0)).unsqueeze(0)
+
+    @torch.no_grad()
+    def get_embeds(self, prompt: List[str], batch_size: int = 16) -> torch.Tensor:
+        num_prompts = len(prompt)
+        embeds = []
+        for i in range(0, num_prompts, batch_size):
+            prompt_slice = prompt[i : i + batch_size]
+
+            input_ids = self.tokenizer(
+                prompt_slice,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            ).input_ids
+
+            input_ids = input_ids.to(self.text_encoder.device)
+            embeds.append(self.text_encoder(input_ids)[0])
+
+        return torch.cat(embeds, dim=0).mean(0)[None]
+
+    def prepare_image_latents(self, image, batch_size, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        if image.shape[1] == 4:
+            latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            if isinstance(generator, list):
+                latents = [
+                    self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+                ]
+                latents = torch.cat(latents, dim=0)
+            else:
+                latents = self.vae.encode(image).latent_dist.sample(generator)
+
+            latents = self.vae.config.scaling_factor * latents
+
+        if batch_size != latents.shape[0]:
+            if batch_size % latents.shape[0] == 0:
+                # expand image_latents for batch_size
+                deprecation_message = (
+                    f"You have passed {batch_size} text prompts (`prompt`), but only {latents.shape[0]} initial"
+                    " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                    " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                    " your script to pass as many initial images as text prompts to suppress this warning."
+                )
+                deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+                additional_latents_per_image = batch_size // latents.shape[0]
+                latents = torch.cat([latents] * additional_latents_per_image, dim=0)
+            else:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts."
+                )
+        else:
+            latents = torch.cat([latents], dim=0)
+
+        return latents
+
+    def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep: int):
+        pred_type = self.inverse_scheduler.config.prediction_type
+        alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep]
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        if pred_type == "epsilon":
+            return model_output
+        elif pred_type == "sample":
+            return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5)
+        elif pred_type == "v_prediction":
+            return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`"
+            )
+
+    def auto_corr_loss(self, hidden_states, generator=None):
+        reg_loss = 0.0
+        for i in range(hidden_states.shape[0]):
+            for j in range(hidden_states.shape[1]):
+                noise = hidden_states[i : i + 1, j : j + 1, :, :]
+                while True:
+                    roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item()
+                    reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2
+                    reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2
+
+                    if noise.shape[2] <= 8:
+                        break
+                    noise = F.avg_pool2d(noise, kernel_size=2)
+        return reg_loss
+
+    def kl_divergence(self, hidden_states):
+        mean = hidden_states.mean()
+        var = hidden_states.var()
+        return var + mean**2 - 1 - torch.log(var + 1e-7)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        source_embeds: torch.Tensor = None,
+        target_embeds: torch.Tensor = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        cross_attention_guidance_amount: float = 0.1,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            source_embeds (`torch.Tensor`):
+                Source concept embeddings. Generation of the embeddings as per the [original
+                paper](https://arxiv.org/abs/2302.03027). Used in discovering the edit direction.
+            target_embeds (`torch.Tensor`):
+                Target concept embeddings. Generation of the embeddings as per the [original
+                paper](https://arxiv.org/abs/2302.03027). Used in discovering the edit direction.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            cross_attention_guidance_amount (`float`, defaults to 0.1):
+                Amount of guidance needed from the reference cross-attention maps.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Define the spatial resolutions.
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            source_embeds,
+            target_embeds,
+            callback_steps,
+            prompt_embeds,
+        )
+
+        # 3. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        if cross_attention_kwargs is None:
+            cross_attention_kwargs = {}
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Generate the inverted noise from the input image or any other image
+        # generated from the input prompt.
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        latents_init = latents.clone()
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Rejig the UNet so that we can obtain the cross-attenion maps and
+        # use them for guiding the subsequent image generation.
+        self.unet = prepare_unet(self.unet)
+
+        # 7. Denoising loop where we obtain the cross-attention maps.
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs={"timestep": t},
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 8. Compute the edit directions.
+        edit_direction = self.construct_direction(source_embeds, target_embeds).to(prompt_embeds.device)
+
+        # 9. Edit the prompt embeddings as per the edit directions discovered.
+        prompt_embeds_edit = prompt_embeds.clone()
+        prompt_embeds_edit[1:2] += edit_direction
+
+        # 10. Second denoising loop to generate the edited image.
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        latents = latents_init
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # we want to learn the latent such that it steers the generation
+                # process towards the edited direction, so make the make initial
+                # noise learnable
+                x_in = latent_model_input.detach().clone()
+                x_in.requires_grad = True
+
+                # optimizer
+                opt = torch.optim.SGD([x_in], lr=cross_attention_guidance_amount)
+
+                with torch.enable_grad():
+                    # initialize loss
+                    loss = Pix2PixZeroL2Loss()
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        x_in,
+                        t,
+                        encoder_hidden_states=prompt_embeds_edit.detach(),
+                        cross_attention_kwargs={"timestep": t, "loss": loss},
+                    ).sample
+
+                    loss.loss.backward(retain_graph=False)
+                    opt.step()
+
+                # recompute the noise
+                noise_pred = self.unet(
+                    x_in.detach(),
+                    t,
+                    encoder_hidden_states=prompt_embeds_edit,
+                    cross_attention_kwargs={"timestep": None},
+                ).sample
+
+                latents = x_in.detach().chunk(2)[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_INVERT_DOC_STRING)
+    def invert(
+        self,
+        prompt: Optional[str] = None,
+        image: PipelineImageInput = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        cross_attention_guidance_amount: float = 0.1,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        lambda_auto_corr: float = 20.0,
+        lambda_kl: float = 20.0,
+        num_reg_steps: int = 5,
+        num_auto_corr_rolls: int = 5,
+    ):
+        r"""
+        Function used to generate inverted latents given a prompt and image.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, or tensor representing an image batch which will be used for conditioning. Can also accept
+                image latents as `image`, if passing latents directly, it will not be encoded again.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 1):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            cross_attention_guidance_amount (`float`, defaults to 0.1):
+                Amount of guidance needed from the reference cross-attention maps.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            lambda_auto_corr (`float`, *optional*, defaults to 20.0):
+                Lambda parameter to control auto correction
+            lambda_kl (`float`, *optional*, defaults to 20.0):
+                Lambda parameter to control Kullback–Leibler divergence output
+            num_reg_steps (`int`, *optional*, defaults to 5):
+                Number of regularization loss steps
+            num_auto_corr_rolls (`int`, *optional*, defaults to 5):
+                Number of auto correction roll steps
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.Pix2PixInversionPipelineOutput`] or
+            `tuple`:
+            [`~pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.Pix2PixInversionPipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is the inverted
+            latents tensor and then second is the corresponding decoded image.
+        """
+        # 1. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        if cross_attention_kwargs is None:
+            cross_attention_kwargs = {}
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 4. Prepare latent variables
+        latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, device, generator)
+
+        # 5. Encode input prompt
+        num_images_per_prompt = 1
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.inverse_scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.inverse_scheduler.timesteps
+
+        # 6. Rejig the UNet so that we can obtain the cross-attenion maps and
+        # use them for guiding the subsequent image generation.
+        self.unet = prepare_unet(self.unet)
+
+        # 7. Denoising loop where we obtain the cross-attention maps.
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs={"timestep": t},
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # regularization of the noise prediction
+                with torch.enable_grad():
+                    for _ in range(num_reg_steps):
+                        if lambda_auto_corr > 0:
+                            for _ in range(num_auto_corr_rolls):
+                                var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True)
+
+                                # Derive epsilon from model output before regularizing to IID standard normal
+                                var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t)
+
+                                l_ac = self.auto_corr_loss(var_epsilon, generator=generator)
+                                l_ac.backward()
+
+                                grad = var.grad.detach() / num_auto_corr_rolls
+                                noise_pred = noise_pred - lambda_auto_corr * grad
+
+                        if lambda_kl > 0:
+                            var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True)
+
+                            # Derive epsilon from model output before regularizing to IID standard normal
+                            var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t)
+
+                            l_kld = self.kl_divergence(var_epsilon)
+                            l_kld.backward()
+
+                            grad = var.grad.detach()
+                            noise_pred = noise_pred - lambda_kl * grad
+
+                        noise_pred = noise_pred.detach()
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0
+                ):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        inverted_latents = latents.detach().clone()
+
+        # 8. Post-processing
+        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (inverted_latents, image)
+
+        return Pix2PixInversionPipelineOutput(latents=inverted_latents, images=image)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..15c9a8c27f98dd7e1913bd57dfd5e8dae71172b4
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py
@@ -0,0 +1,19 @@
+from typing import TYPE_CHECKING
+
+from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
+
+
+_import_structure = {"pipeline_stochastic_karras_ve": ["KarrasVePipeline"]}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_stochastic_karras_ve import KarrasVePipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/diffusers/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
new file mode 100644
index 0000000000000000000000000000000000000000..023edb4ce4bd766b364aaeafcdfb2a2c1eaf9980
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
@@ -0,0 +1,128 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from ....models import UNet2DModel
+from ....schedulers import KarrasVeScheduler
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class KarrasVePipeline(DiffusionPipeline):
+    r"""
+    Pipeline for unconditional image generation.
+
+    Parameters:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image.
+        scheduler ([`KarrasVeScheduler`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image.
+    """
+
+    # add type hints for linting
+    unet: UNet2DModel
+    scheduler: KarrasVeScheduler
+
+    def __init__(self, unet: UNet2DModel, scheduler: KarrasVeScheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        num_inference_steps: int = 50,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[Tuple, ImagePipelineOutput]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+
+        img_size = self.unet.config.sample_size
+        shape = (batch_size, 3, img_size, img_size)
+
+        model = self.unet
+
+        # sample x_0 ~ N(0, sigma_0^2 * I)
+        sample = randn_tensor(shape, generator=generator, device=self.device) * self.scheduler.init_noise_sigma
+
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # here sigma_t == t_i from the paper
+            sigma = self.scheduler.schedule[t]
+            sigma_prev = self.scheduler.schedule[t - 1] if t > 0 else 0
+
+            # 1. Select temporarily increased noise level sigma_hat
+            # 2. Add new noise to move from sample_i to sample_hat
+            sample_hat, sigma_hat = self.scheduler.add_noise_to_input(sample, sigma, generator=generator)
+
+            # 3. Predict the noise residual given the noise magnitude `sigma_hat`
+            # The model inputs and output are adjusted by following eq. (213) in [1].
+            model_output = (sigma_hat / 2) * model((sample_hat + 1) / 2, sigma_hat / 2).sample
+
+            # 4. Evaluate dx/dt at sigma_hat
+            # 5. Take Euler step from sigma to sigma_prev
+            step_output = self.scheduler.step(model_output, sigma_hat, sigma_prev, sample_hat)
+
+            if sigma_prev != 0:
+                # 6. Apply 2nd order correction
+                # The model inputs and output are adjusted by following eq. (213) in [1].
+                model_output = (sigma_prev / 2) * model((step_output.prev_sample + 1) / 2, sigma_prev / 2).sample
+                step_output = self.scheduler.step_correct(
+                    model_output,
+                    sigma_hat,
+                    sigma_prev,
+                    sample_hat,
+                    step_output.prev_sample,
+                    step_output["derivative"],
+                )
+            sample = step_output.prev_sample
+
+        sample = (sample / 2 + 0.5).clamp(0, 1)
+        image = sample.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ea6ef6e2f65b96aebebdf72cb80135003e4f08d
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py
@@ -0,0 +1,71 @@
+from typing import TYPE_CHECKING
+
+from ....utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+    is_transformers_available,
+    is_transformers_version,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ....utils.dummy_torch_and_transformers_objects import (
+        VersatileDiffusionDualGuidedPipeline,
+        VersatileDiffusionImageVariationPipeline,
+        VersatileDiffusionPipeline,
+        VersatileDiffusionTextToImagePipeline,
+    )
+
+    _dummy_objects.update(
+        {
+            "VersatileDiffusionDualGuidedPipeline": VersatileDiffusionDualGuidedPipeline,
+            "VersatileDiffusionImageVariationPipeline": VersatileDiffusionImageVariationPipeline,
+            "VersatileDiffusionPipeline": VersatileDiffusionPipeline,
+            "VersatileDiffusionTextToImagePipeline": VersatileDiffusionTextToImagePipeline,
+        }
+    )
+else:
+    _import_structure["modeling_text_unet"] = ["UNetFlatConditionModel"]
+    _import_structure["pipeline_versatile_diffusion"] = ["VersatileDiffusionPipeline"]
+    _import_structure["pipeline_versatile_diffusion_dual_guided"] = ["VersatileDiffusionDualGuidedPipeline"]
+    _import_structure["pipeline_versatile_diffusion_image_variation"] = ["VersatileDiffusionImageVariationPipeline"]
+    _import_structure["pipeline_versatile_diffusion_text_to_image"] = ["VersatileDiffusionTextToImagePipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ....utils.dummy_torch_and_transformers_objects import (
+            VersatileDiffusionDualGuidedPipeline,
+            VersatileDiffusionImageVariationPipeline,
+            VersatileDiffusionPipeline,
+            VersatileDiffusionTextToImagePipeline,
+        )
+    else:
+        from .pipeline_versatile_diffusion import VersatileDiffusionPipeline
+        from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline
+        from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline
+        from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..3937e87f63c91e0837628f1af2cc7a012166644e
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
@@ -0,0 +1,2518 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from diffusers.utils import deprecate
+
+from ....configuration_utils import ConfigMixin, register_to_config
+from ....models import ModelMixin
+from ....models.activations import get_activation
+from ....models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnAddedKVProcessor2_0,
+    AttnProcessor,
+)
+from ....models.embeddings import (
+    GaussianFourierProjection,
+    ImageHintTimeEmbedding,
+    ImageProjection,
+    ImageTimeEmbedding,
+    TextImageProjection,
+    TextImageTimeEmbedding,
+    TextTimeEmbedding,
+    TimestepEmbedding,
+    Timesteps,
+)
+from ....models.resnet import ResnetBlockCondNorm2D
+from ....models.transformers.dual_transformer_2d import DualTransformer2DModel
+from ....models.transformers.transformer_2d import Transformer2DModel
+from ....models.unets.unet_2d_condition import UNet2DConditionOutput
+from ....utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
+from ....utils.torch_utils import apply_freeu
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def get_down_block(
+    down_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    temb_channels,
+    add_downsample,
+    resnet_eps,
+    resnet_act_fn,
+    num_attention_heads,
+    transformer_layers_per_block,
+    attention_type,
+    attention_head_dim,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    downsample_padding=None,
+    dual_cross_attention=False,
+    use_linear_projection=False,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+    resnet_skip_time_act=False,
+    resnet_out_scale_factor=1.0,
+    cross_attention_norm=None,
+    dropout=0.0,
+):
+    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+    if down_block_type == "DownBlockFlat":
+        return DownBlockFlat(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "CrossAttnDownBlockFlat":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockFlat")
+        return CrossAttnDownBlockFlat(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    raise ValueError(f"{down_block_type} is not supported.")
+
+
+def get_up_block(
+    up_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    prev_output_channel,
+    temb_channels,
+    add_upsample,
+    resnet_eps,
+    resnet_act_fn,
+    num_attention_heads,
+    transformer_layers_per_block,
+    resolution_idx,
+    attention_type,
+    attention_head_dim,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    dual_cross_attention=False,
+    use_linear_projection=False,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+    resnet_skip_time_act=False,
+    resnet_out_scale_factor=1.0,
+    cross_attention_norm=None,
+    dropout=0.0,
+):
+    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    if up_block_type == "UpBlockFlat":
+        return UpBlockFlat(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "CrossAttnUpBlockFlat":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockFlat")
+        return CrossAttnUpBlockFlat(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    raise ValueError(f"{up_block_type} is not supported.")
+
+
+class FourierEmbedder(nn.Module):
+    def __init__(self, num_freqs=64, temperature=100):
+        super().__init__()
+
+        self.num_freqs = num_freqs
+        self.temperature = temperature
+
+        freq_bands = temperature ** (torch.arange(num_freqs) / num_freqs)
+        freq_bands = freq_bands[None, None, None]
+        self.register_buffer("freq_bands", freq_bands, persistent=False)
+
+    def __call__(self, x):
+        x = self.freq_bands * x.unsqueeze(-1)
+        return torch.stack((x.sin(), x.cos()), dim=-1).permute(0, 1, 3, 4, 2).reshape(*x.shape[:2], -1)
+
+
+class GLIGENTextBoundingboxProjection(nn.Module):
+    def __init__(self, positive_len, out_dim, feature_type, fourier_freqs=8):
+        super().__init__()
+        self.positive_len = positive_len
+        self.out_dim = out_dim
+
+        self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs)
+        self.position_dim = fourier_freqs * 2 * 4  # 2: sin/cos, 4: xyxy
+
+        if isinstance(out_dim, tuple):
+            out_dim = out_dim[0]
+
+        if feature_type == "text-only":
+            self.linears = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            self.null_positive_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
+
+        elif feature_type == "text-image":
+            self.linears_text = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            self.linears_image = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            self.null_text_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
+            self.null_image_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
+
+        self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim]))
+
+    def forward(
+        self,
+        boxes,
+        masks,
+        positive_embeddings=None,
+        phrases_masks=None,
+        image_masks=None,
+        phrases_embeddings=None,
+        image_embeddings=None,
+    ):
+        masks = masks.unsqueeze(-1)
+
+        xyxy_embedding = self.fourier_embedder(boxes)
+        xyxy_null = self.null_position_feature.view(1, 1, -1)
+        xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null
+
+        if positive_embeddings:
+            positive_null = self.null_positive_feature.view(1, 1, -1)
+            positive_embeddings = positive_embeddings * masks + (1 - masks) * positive_null
+
+            objs = self.linears(torch.cat([positive_embeddings, xyxy_embedding], dim=-1))
+        else:
+            phrases_masks = phrases_masks.unsqueeze(-1)
+            image_masks = image_masks.unsqueeze(-1)
+
+            text_null = self.null_text_feature.view(1, 1, -1)
+            image_null = self.null_image_feature.view(1, 1, -1)
+
+            phrases_embeddings = phrases_embeddings * phrases_masks + (1 - phrases_masks) * text_null
+            image_embeddings = image_embeddings * image_masks + (1 - image_masks) * image_null
+
+            objs_text = self.linears_text(torch.cat([phrases_embeddings, xyxy_embedding], dim=-1))
+            objs_image = self.linears_image(torch.cat([image_embeddings, xyxy_embedding], dim=-1))
+            objs = torch.cat([objs_text, objs_image], dim=1)
+
+        return objs
+
+
+class UNetFlatConditionModel(ModelMixin, ConfigMixin):
+    r"""
+    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "DownBlockFlat")`):
+            The tuple of downsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlockFlatCrossAttn"`):
+            Block type for middle of UNet, it can be one of `UNetMidBlockFlatCrossAttn`, `UNetMidBlockFlat`, or
+            `UNetMidBlockFlatSimpleCrossAttn`. If `None`, the mid block layer is skipped.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat")`):
+            The tuple of upsample blocks to use.
+        only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
+            Whether to include self-attention in the basic transformer blocks, see
+            [`~models.attention.BasicTransformerBlock`].
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, normalization and activation layers is skipped in post-processing.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlockFlat`], [`~models.unet_2d_blocks.CrossAttnUpBlockFlat`],
+            [`~models.unet_2d_blocks.UNetMidBlockFlatCrossAttn`].
+       reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
+            blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
+            [`~models.unet_2d_blocks.CrossAttnDownBlockFlat`], [`~models.unet_2d_blocks.CrossAttnUpBlockFlat`],
+            [`~models.unet_2d_blocks.UNetMidBlockFlatCrossAttn`].
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        num_attention_heads (`int`, *optional*):
+            The number of attention heads. If not defined, defaults to `attention_head_dim`
+        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+            for ResNet blocks (see [`~models.resnet.ResnetBlockFlat`]). Choose from `default` or `scale_shift`.
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to `None`):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+            "text". "text" will use the `TextTimeEmbedding` layer.
+        addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
+            Dimension for the timestep embeddings.
+        num_class_embeds (`int`, *optional*, defaults to `None`):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        time_embedding_type (`str`, *optional*, defaults to `positional`):
+            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
+        time_embedding_dim (`int`, *optional*, defaults to `None`):
+            An optional override for the dimension of the projected time embedding.
+        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
+            Optional activation function to use only once on the time embeddings before they are passed to the rest of
+            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
+        timestep_post_act (`str`, *optional*, defaults to `None`):
+            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
+        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
+            The dimension of `cond_proj` layer in the timestep embedding.
+        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`,
+        *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`,
+        *optional*): The dimension of the `class_labels` input when
+            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
+        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
+            embeddings with the class embeddings.
+        mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
+            Whether to use cross attention with the mid block when using the `UNetMidBlockFlatSimpleCrossAttn`. If
+            `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
+            `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
+            otherwise.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["BasicTransformerBlock", "ResnetBlockFlat", "CrossAttnUpBlockFlat"]
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlockFlat",
+            "CrossAttnDownBlockFlat",
+            "CrossAttnDownBlockFlat",
+            "DownBlockFlat",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlockFlatCrossAttn",
+        up_block_types: Tuple[str] = (
+            "UpBlockFlat",
+            "CrossAttnUpBlockFlat",
+            "CrossAttnUpBlockFlat",
+            "CrossAttnUpBlockFlat",
+        ),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: Union[int, Tuple[int]] = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        dropout: float = 0.0,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: Union[int, Tuple[int]] = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+        reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        resnet_skip_time_act: bool = False,
+        resnet_out_scale_factor: int = 1.0,
+        time_embedding_type: str = "positional",
+        time_embedding_dim: Optional[int] = None,
+        time_embedding_act_fn: Optional[str] = None,
+        timestep_post_act: Optional[str] = None,
+        time_cond_proj_dim: Optional[int] = None,
+        conv_in_kernel: int = 3,
+        conv_out_kernel: int = 3,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        attention_type: str = "default",
+        class_embeddings_concat: bool = False,
+        mid_block_only_cross_attention: Optional[bool] = None,
+        cross_attention_norm: Optional[str] = None,
+        addition_embed_type_num_heads=64,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+
+        if num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+        if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
+            for layer_number_per_block in transformer_layers_per_block:
+                if isinstance(layer_number_per_block, list):
+                    raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
+
+        # input
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = LinearMultiDim(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        # time
+        if time_embedding_type == "fourier":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
+            if time_embed_dim % 2 != 0:
+                raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
+            self.time_proj = GaussianFourierProjection(
+                time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+            )
+            timestep_input_dim = time_embed_dim
+        elif time_embedding_type == "positional":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
+
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        else:
+            raise ValueError(
+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
+            )
+
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+            post_act_fn=timestep_post_act,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+
+        if encoder_hid_dim_type == "text_proj":
+            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2
+            self.encoder_hid_proj = ImageProjection(
+                image_embed_dim=encoder_hid_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim_type`: {encoder_hid_dim_type} must be None, 'text_proj', 'text_image_proj' or 'image_proj'."
+            )
+        else:
+            self.encoder_hid_proj = None
+
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif class_embed_type == "simple_projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+
+        if addition_embed_type == "text":
+            if encoder_hid_dim is not None:
+                text_time_embedding_from_dim = encoder_hid_dim
+            else:
+                text_time_embedding_from_dim = cross_attention_dim
+
+            self.add_embedding = TextTimeEmbedding(
+                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+            )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type == "text_time":
+            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
+            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif addition_embed_type == "image":
+            # Kandinsky 2.2
+            self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type == "image_hint":
+            # Kandinsky 2.2 ControlNet
+            self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type is not None:
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
+
+        if time_embedding_act_fn is None:
+            self.time_embed_act = None
+        else:
+            self.time_embed_act = get_activation(time_embedding_act_fn)
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(only_cross_attention, bool):
+            if mid_block_only_cross_attention is None:
+                mid_block_only_cross_attention = only_cross_attention
+
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+
+        if mid_block_only_cross_attention is None:
+            mid_block_only_cross_attention = False
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+
+        if isinstance(layers_per_block, int):
+            layers_per_block = [layers_per_block] * len(down_block_types)
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+
+        if class_embeddings_concat:
+            # The time embeddings are concatenated with the class embeddings. The dimension of the
+            # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
+            # regular time embeddings
+            blocks_time_embed_dim = time_embed_dim * 2
+        else:
+            blocks_time_embed_dim = time_embed_dim
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block[i],
+                transformer_layers_per_block=transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim[i],
+                num_attention_heads=num_attention_heads[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        if mid_block_type == "UNetMidBlockFlatCrossAttn":
+            self.mid_block = UNetMidBlockFlatCrossAttn(
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim[-1],
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+                attention_type=attention_type,
+            )
+        elif mid_block_type == "UNetMidBlockFlatSimpleCrossAttn":
+            self.mid_block = UNetMidBlockFlatSimpleCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                cross_attention_dim=cross_attention_dim[-1],
+                attention_head_dim=attention_head_dim[-1],
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                skip_time_act=resnet_skip_time_act,
+                only_cross_attention=mid_block_only_cross_attention,
+                cross_attention_norm=cross_attention_norm,
+            )
+        elif mid_block_type == "UNetMidBlockFlat":
+            self.mid_block = UNetMidBlockFlat(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                num_layers=0,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                add_attention=False,
+            )
+        elif mid_block_type is None:
+            self.mid_block = None
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        reversed_layers_per_block = list(reversed(layers_per_block))
+        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+        reversed_transformer_layers_per_block = (
+            list(reversed(transformer_layers_per_block))
+            if reverse_transformer_layers_per_block is None
+            else reverse_transformer_layers_per_block
+        )
+        only_cross_attention = list(reversed(only_cross_attention))
+
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=reversed_layers_per_block[i] + 1,
+                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resolution_idx=i,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=reversed_cross_attention_dim[i],
+                num_attention_heads=reversed_num_attention_heads[i],
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+            )
+
+            self.conv_act = get_activation(act_fn)
+
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = LinearMultiDim(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+
+        if attention_type in ["gated", "gated-text-image"]:
+            positive_len = 768
+            if isinstance(cross_attention_dim, int):
+                positive_len = cross_attention_dim
+            elif isinstance(cross_attention_dim, (list, tuple)):
+                positive_len = cross_attention_dim[0]
+
+            feature_type = "text-only" if attention_type == "gated" else "text-image"
+            self.position_net = GLIGENTextBoundingboxProjection(
+                positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
+            )
+
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+
+        num_sliceable_layers = len(sliceable_head_dims)
+
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def unload_lora(self):
+        """Unloads LoRA weights."""
+        deprecate(
+            "unload_lora",
+            "0.28.0",
+            "Calling `unload_lora()` is deprecated and will be removed in a future version. Please install `peft` and then call `disable_adapters().",
+        )
+        for module in self.modules():
+            if hasattr(module, "set_lora_layer"):
+                module.set_lora_layer(None)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        The [`UNetFlatConditionModel`] forward method.
+
+        Args:
+            sample (`torch.Tensor`):
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            encoder_attention_mask (`torch.Tensor`):
+                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added to UNet long skip connections from down blocks to up blocks for
+                example from ControlNet side model(s)
+            mid_block_additional_residual (`torch.Tensor`, *optional*):
+                additional residual to be added to UNet mid block output, for example from ControlNet side model
+            down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
+
+        Returns:
+            [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        for dim in sample.shape[-2:]:
+            if dim % default_overall_up_factor != 0:
+                # Forward upsample size to force interpolation output size.
+                forward_upsample_size = True
+                break
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # there might be better ways to encapsulate this.
+                class_labels = class_labels.to(dtype=sample.dtype)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+
+            if self.config.class_embeddings_concat:
+                emb = torch.cat([emb, class_emb], dim=-1)
+            else:
+                emb = emb + class_emb
+
+        if self.config.addition_embed_type == "text":
+            aug_emb = self.add_embedding(encoder_hidden_states)
+        elif self.config.addition_embed_type == "text_image":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+
+            image_embs = added_cond_kwargs.get("image_embeds")
+            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+            aug_emb = self.add_embedding(text_embs, image_embs)
+        elif self.config.addition_embed_type == "text_time":
+            # SDXL - style
+            if "text_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if "time_ids" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            time_embeds = self.add_time_proj(time_ids.flatten())
+            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+            add_embeds = add_embeds.to(emb.dtype)
+            aug_emb = self.add_embedding(add_embeds)
+        elif self.config.addition_embed_type == "image":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            aug_emb = self.add_embedding(image_embs)
+        elif self.config.addition_embed_type == "image_hint":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            hint = added_cond_kwargs.get("hint")
+            aug_emb, hint = self.add_embedding(image_embs, hint)
+            sample = torch.cat([sample, hint], dim=1)
+
+        emb = emb + aug_emb if aug_emb is not None else emb
+
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            image_embeds = self.encoder_hid_proj(image_embeds)
+            encoder_hidden_states = (encoder_hidden_states, image_embeds)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 2.5 GLIGEN position net
+        if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+            cross_attention_kwargs = cross_attention_kwargs.copy()
+            gligen_args = cross_attention_kwargs.pop("gligen")
+            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+
+        # 3. down
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+
+        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+        # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+        is_adapter = down_intrablock_additional_residuals is not None
+        # maintain backward compatibility for legacy usage, where
+        #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+        #       but can only use one or the other
+        if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+            deprecate(
+                "T2I should not use down_block_additional_residuals",
+                "1.3.0",
+                "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                       and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                       for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                standard_warn=False,
+            )
+            down_intrablock_additional_residuals = down_block_additional_residuals
+            is_adapter = True
+
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                # For t2i-adapter CrossAttnDownBlockFlat
+                additional_residuals = {}
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    **additional_residuals,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    sample += down_intrablock_additional_residuals.pop(0)
+
+            down_block_res_samples += res_samples
+
+        if is_controlnet:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                sample = self.mid_block(sample, emb)
+
+            # To support T2I-Adapter-XL
+            if (
+                is_adapter
+                and len(down_intrablock_additional_residuals) > 0
+                and sample.shape == down_intrablock_additional_residuals[0].shape
+            ):
+                sample += down_intrablock_additional_residuals.pop(0)
+
+        if is_controlnet:
+            sample = sample + mid_block_additional_residual
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    scale=lora_scale,
+                )
+
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet2DConditionOutput(sample=sample)
+
+
+class LinearMultiDim(nn.Linear):
+    def __init__(self, in_features, out_features=None, second_dim=4, *args, **kwargs):
+        in_features = [in_features, second_dim, 1] if isinstance(in_features, int) else list(in_features)
+        if out_features is None:
+            out_features = in_features
+        out_features = [out_features, second_dim, 1] if isinstance(out_features, int) else list(out_features)
+        self.in_features_multidim = in_features
+        self.out_features_multidim = out_features
+        super().__init__(np.array(in_features).prod(), np.array(out_features).prod())
+
+    def forward(self, input_tensor, *args, **kwargs):
+        shape = input_tensor.shape
+        n_dim = len(self.in_features_multidim)
+        input_tensor = input_tensor.reshape(*shape[0:-n_dim], self.in_features)
+        output_tensor = super().forward(input_tensor)
+        output_tensor = output_tensor.view(*shape[0:-n_dim], *self.out_features_multidim)
+        return output_tensor
+
+
+class ResnetBlockFlat(nn.Module):
+    def __init__(
+        self,
+        *,
+        in_channels,
+        out_channels=None,
+        dropout=0.0,
+        temb_channels=512,
+        groups=32,
+        groups_out=None,
+        pre_norm=True,
+        eps=1e-6,
+        time_embedding_norm="default",
+        use_in_shortcut=None,
+        second_dim=4,
+        **kwargs,
+    ):
+        super().__init__()
+        self.pre_norm = pre_norm
+        self.pre_norm = True
+
+        in_channels = [in_channels, second_dim, 1] if isinstance(in_channels, int) else list(in_channels)
+        self.in_channels_prod = np.array(in_channels).prod()
+        self.channels_multidim = in_channels
+
+        if out_channels is not None:
+            out_channels = [out_channels, second_dim, 1] if isinstance(out_channels, int) else list(out_channels)
+            out_channels_prod = np.array(out_channels).prod()
+            self.out_channels_multidim = out_channels
+        else:
+            out_channels_prod = self.in_channels_prod
+            self.out_channels_multidim = self.channels_multidim
+        self.time_embedding_norm = time_embedding_norm
+
+        if groups_out is None:
+            groups_out = groups
+
+        self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=self.in_channels_prod, eps=eps, affine=True)
+        self.conv1 = torch.nn.Conv2d(self.in_channels_prod, out_channels_prod, kernel_size=1, padding=0)
+
+        if temb_channels is not None:
+            self.time_emb_proj = torch.nn.Linear(temb_channels, out_channels_prod)
+        else:
+            self.time_emb_proj = None
+
+        self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels_prod, eps=eps, affine=True)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels_prod, out_channels_prod, kernel_size=1, padding=0)
+
+        self.nonlinearity = nn.SiLU()
+
+        self.use_in_shortcut = (
+            self.in_channels_prod != out_channels_prod if use_in_shortcut is None else use_in_shortcut
+        )
+
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = torch.nn.Conv2d(
+                self.in_channels_prod, out_channels_prod, kernel_size=1, stride=1, padding=0
+            )
+
+    def forward(self, input_tensor, temb):
+        shape = input_tensor.shape
+        n_dim = len(self.channels_multidim)
+        input_tensor = input_tensor.reshape(*shape[0:-n_dim], self.in_channels_prod, 1, 1)
+        input_tensor = input_tensor.view(-1, self.in_channels_prod, 1, 1)
+
+        hidden_states = input_tensor
+
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        if temb is not None:
+            temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
+            hidden_states = hidden_states + temb
+
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+
+        output_tensor = input_tensor + hidden_states
+
+        output_tensor = output_tensor.view(*shape[0:-n_dim], -1)
+        output_tensor = output_tensor.view(*shape[0:-n_dim], *self.out_channels_multidim)
+
+        return output_tensor
+
+
+class DownBlockFlat(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    LinearMultiDim(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnDownBlockFlat(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        add_downsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    LinearMultiDim(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        additional_residuals: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        output_states = ()
+
+        blocks = list(zip(self.resnets, self.attentions))
+
+        for i, (resnet, attn) in enumerate(blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+
+            # apply additional residuals to the output of the last pair of resnet and attention blocks
+            if i == len(blocks) - 1 and additional_residuals is not None:
+                hidden_states = hidden_states + additional_residuals
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+# Copied from diffusers.models.unets.unet_2d_blocks.UpBlock2D with UpBlock2D->UpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim
+class UpBlockFlat(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        upsample_size: Optional[int] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+
+        return hidden_states
+
+
+# Copied from diffusers.models.unets.unet_2d_blocks.CrossAttnUpBlock2D with CrossAttnUpBlock2D->CrossAttnUpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim
+class CrossAttnUpBlockFlat(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+
+        return hidden_states
+
+
+# Copied from diffusers.models.unets.unet_2d_blocks.UNetMidBlock2D with UNetMidBlock2D->UNetMidBlockFlat, ResnetBlock2D->ResnetBlockFlat
+class UNetMidBlockFlat(nn.Module):
+    """
+    A 2D UNet mid-block [`UNetMidBlockFlat`] with multiple residual blocks and optional attention blocks.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        temb_channels (`int`): The number of temporal embedding channels.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
+        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
+        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
+        resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
+            The type of normalization to apply to the time embeddings. This can help to improve the performance of the
+            model on tasks with long-range temporal dependencies.
+        resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
+        resnet_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use in the group normalization layers of the resnet blocks.
+        attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
+        resnet_pre_norm (`bool`, *optional*, defaults to `True`):
+            Whether to use pre-normalization for the resnet blocks.
+        add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
+        attention_head_dim (`int`, *optional*, defaults to 1):
+            Dimension of a single attention head. The number of attention heads is determined based on this value and
+            the number of input channels.
+        output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
+
+    Returns:
+        `torch.Tensor`: The output of the last residual block, which is a tensor of shape `(batch_size, in_channels,
+        height, width)`.
+
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        attn_groups: Optional[int] = None,
+        resnet_pre_norm: bool = True,
+        add_attention: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+    ):
+        super().__init__()
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        self.add_attention = add_attention
+
+        if attn_groups is None:
+            attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
+
+        # there is always at least one resnet
+        if resnet_time_scale_shift == "spatial":
+            resnets = [
+                ResnetBlockCondNorm2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm="spatial",
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                )
+            ]
+        else:
+            resnets = [
+                ResnetBlockFlat(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            ]
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warning(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
+            )
+            attention_head_dim = in_channels
+
+        for _ in range(num_layers):
+            if self.add_attention:
+                attentions.append(
+                    Attention(
+                        in_channels,
+                        heads=in_channels // attention_head_dim,
+                        dim_head=attention_head_dim,
+                        rescale_output_factor=output_scale_factor,
+                        eps=resnet_eps,
+                        norm_num_groups=attn_groups,
+                        spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
+                        residual_connection=True,
+                        bias=True,
+                        upcast_softmax=True,
+                        _from_deprecated_attn_block=True,
+                    )
+                )
+            else:
+                attentions.append(None)
+
+            if resnet_time_scale_shift == "spatial":
+                resnets.append(
+                    ResnetBlockCondNorm2D(
+                        in_channels=in_channels,
+                        out_channels=in_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm="spatial",
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                    )
+                )
+            else:
+                resnets.append(
+                    ResnetBlockFlat(
+                        in_channels=in_channels,
+                        out_channels=in_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                    )
+                )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if attn is not None:
+                hidden_states = attn(hidden_states, temb=temb)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
+
+
+# Copied from diffusers.models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn with UNetMidBlock2DCrossAttn->UNetMidBlockFlatCrossAttn, ResnetBlock2D->ResnetBlockFlat
+class UNetMidBlockFlatCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        out_channels: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_groups_out: Optional[int] = None,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+
+        out_channels = out_channels or in_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        # support for variable transformer layers per block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        resnet_groups_out = resnet_groups_out or resnet_groups
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlockFlat(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                groups_out=resnet_groups_out,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for i in range(num_layers):
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups_out,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=out_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups_out,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
+
+
+# Copied from diffusers.models.unets.unet_2d_blocks.UNetMidBlock2DSimpleCrossAttn with UNetMidBlock2DSimpleCrossAttn->UNetMidBlockFlatSimpleCrossAttn, ResnetBlock2D->ResnetBlockFlat
+class UNetMidBlockFlatSimpleCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+
+        self.attention_head_dim = attention_head_dim
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        self.num_heads = in_channels // self.attention_head_dim
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlockFlat(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                skip_time_act=skip_time_act,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            processor = (
+                AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor()
+            )
+
+            attentions.append(
+                Attention(
+                    query_dim=in_channels,
+                    cross_attention_dim=in_channels,
+                    heads=self.num_heads,
+                    dim_head=self.attention_head_dim,
+                    added_kv_proj_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    bias=True,
+                    upcast_softmax=True,
+                    only_cross_attention=only_cross_attention,
+                    cross_attention_norm=cross_attention_norm,
+                    processor=processor,
+                )
+            )
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+        if cross_attention_kwargs.get("scale", None) is not None:
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        if attention_mask is None:
+            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+            mask = None if encoder_hidden_states is None else encoder_attention_mask
+        else:
+            # when attention_mask is defined: we don't even check for encoder_attention_mask.
+            # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+            # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+            #       then we can simplify this whole if/else block to:
+            #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+            mask = attention_mask
+
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            # attn
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=mask,
+                **cross_attention_kwargs,
+            )
+
+            # resnet
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
diff --git a/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8dc18e2e8ac0facc006f9a1bf48b22ba7a3bdd8
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py
@@ -0,0 +1,421 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModel
+
+from ....models import AutoencoderKL, UNet2DConditionModel
+from ....schedulers import KarrasDiffusionSchedulers
+from ....utils import logging
+from ...pipeline_utils import DiffusionPipeline
+from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline
+from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline
+from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class VersatileDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    tokenizer: CLIPTokenizer
+    image_feature_extractor: CLIPImageProcessor
+    text_encoder: CLIPTextModel
+    image_encoder: CLIPVisionModel
+    image_unet: UNet2DConditionModel
+    text_unet: UNet2DConditionModel
+    vae: AutoencoderKL
+    scheduler: KarrasDiffusionSchedulers
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        image_feature_extractor: CLIPImageProcessor,
+        text_encoder: CLIPTextModel,
+        image_encoder: CLIPVisionModel,
+        image_unet: UNet2DConditionModel,
+        text_unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            image_feature_extractor=image_feature_extractor,
+            text_encoder=text_encoder,
+            image_encoder=image_encoder,
+            image_unet=image_unet,
+            text_unet=text_unet,
+            vae=vae,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+    @torch.no_grad()
+    def image_variation(
+        self,
+        image: Union[torch.Tensor, PIL.Image.Image],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`):
+                The image prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionPipeline
+        >>> import torch
+        >>> import requests
+        >>> from io import BytesIO
+        >>> from PIL import Image
+
+        >>> # let's download an initial image
+        >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg"
+
+        >>> response = requests.get(url)
+        >>> image = Image.open(BytesIO(response.content)).convert("RGB")
+
+        >>> pipe = VersatileDiffusionPipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> image = pipe.image_variation(image, generator=generator).images[0]
+        >>> image.save("./car_variation.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        expected_components = inspect.signature(VersatileDiffusionImageVariationPipeline.__init__).parameters.keys()
+        components = {name: component for name, component in self.components.items() if name in expected_components}
+        return VersatileDiffusionImageVariationPipeline(**components)(
+            image=image,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
+
+    @torch.no_grad()
+    def text_to_image(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation.
+            height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionPipeline
+        >>> import torch
+
+        >>> pipe = VersatileDiffusionPipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> image = pipe.text_to_image("an astronaut riding on a horse on mars", generator=generator).images[0]
+        >>> image.save("./astronaut.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        expected_components = inspect.signature(VersatileDiffusionTextToImagePipeline.__init__).parameters.keys()
+        components = {name: component for name, component in self.components.items() if name in expected_components}
+        temp_pipeline = VersatileDiffusionTextToImagePipeline(**components)
+        output = temp_pipeline(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
+        # swap the attention blocks back to the original state
+        temp_pipeline._swap_unet_attention_blocks()
+
+        return output
+
+    @torch.no_grad()
+    def dual_guided(
+        self,
+        prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
+        image: Union[str, List[str]],
+        text_to_image_strength: float = 0.5,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation.
+            height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionPipeline
+        >>> import torch
+        >>> import requests
+        >>> from io import BytesIO
+        >>> from PIL import Image
+
+        >>> # let's download an initial image
+        >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg"
+
+        >>> response = requests.get(url)
+        >>> image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> text = "a red car in the sun"
+
+        >>> pipe = VersatileDiffusionPipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> text_to_image_strength = 0.75
+
+        >>> image = pipe.dual_guided(
+        ...     prompt=text, image=image, text_to_image_strength=text_to_image_strength, generator=generator
+        ... ).images[0]
+        >>> image.save("./car_variation.png")
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+
+        expected_components = inspect.signature(VersatileDiffusionDualGuidedPipeline.__init__).parameters.keys()
+        components = {name: component for name, component in self.components.items() if name in expected_components}
+        temp_pipeline = VersatileDiffusionDualGuidedPipeline(**components)
+        output = temp_pipeline(
+            prompt=prompt,
+            image=image,
+            text_to_image_strength=text_to_image_strength,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
+        temp_pipeline._revert_dual_attention()
+
+        return output
diff --git a/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
new file mode 100644
index 0000000000000000000000000000000000000000..2212651fbb5bf712ca624618e379684f1442af85
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
@@ -0,0 +1,561 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.utils.checkpoint
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ....image_processor import VaeImageProcessor
+from ....models import AutoencoderKL, DualTransformer2DModel, Transformer2DModel, UNet2DConditionModel
+from ....schedulers import KarrasDiffusionSchedulers
+from ....utils import deprecate, logging
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from .modeling_text_unet import UNetFlatConditionModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image-text dual-guided generation using Versatile Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        vqvae ([`VQModel`]):
+            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
+        bert ([`LDMBertModel`]):
+            Text-encoder model based on [`~transformers.BERT`].
+        tokenizer ([`~transformers.BertTokenizer`]):
+            A `BertTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "bert->unet->vqvae"
+
+    tokenizer: CLIPTokenizer
+    image_feature_extractor: CLIPImageProcessor
+    text_encoder: CLIPTextModelWithProjection
+    image_encoder: CLIPVisionModelWithProjection
+    image_unet: UNet2DConditionModel
+    text_unet: UNetFlatConditionModel
+    vae: AutoencoderKL
+    scheduler: KarrasDiffusionSchedulers
+
+    _optional_components = ["text_unet"]
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        image_feature_extractor: CLIPImageProcessor,
+        text_encoder: CLIPTextModelWithProjection,
+        image_encoder: CLIPVisionModelWithProjection,
+        image_unet: UNet2DConditionModel,
+        text_unet: UNetFlatConditionModel,
+        vae: AutoencoderKL,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+        self.register_modules(
+            tokenizer=tokenizer,
+            image_feature_extractor=image_feature_extractor,
+            text_encoder=text_encoder,
+            image_encoder=image_encoder,
+            image_unet=image_unet,
+            text_unet=text_unet,
+            vae=vae,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        if self.text_unet is not None and (
+            "dual_cross_attention" not in self.image_unet.config or not self.image_unet.config.dual_cross_attention
+        ):
+            # if loading from a universal checkpoint rather than a saved dual-guided pipeline
+            self._convert_to_dual_attention()
+
+    def remove_unused_weights(self):
+        self.register_modules(text_unet=None)
+
+    def _convert_to_dual_attention(self):
+        """
+        Replace image_unet's `Transformer2DModel` blocks with `DualTransformer2DModel` that contains transformer blocks
+        from both `image_unet` and `text_unet`
+        """
+        for name, module in self.image_unet.named_modules():
+            if isinstance(module, Transformer2DModel):
+                parent_name, index = name.rsplit(".", 1)
+                index = int(index)
+
+                image_transformer = self.image_unet.get_submodule(parent_name)[index]
+                text_transformer = self.text_unet.get_submodule(parent_name)[index]
+
+                config = image_transformer.config
+                dual_transformer = DualTransformer2DModel(
+                    num_attention_heads=config.num_attention_heads,
+                    attention_head_dim=config.attention_head_dim,
+                    in_channels=config.in_channels,
+                    num_layers=config.num_layers,
+                    dropout=config.dropout,
+                    norm_num_groups=config.norm_num_groups,
+                    cross_attention_dim=config.cross_attention_dim,
+                    attention_bias=config.attention_bias,
+                    sample_size=config.sample_size,
+                    num_vector_embeds=config.num_vector_embeds,
+                    activation_fn=config.activation_fn,
+                    num_embeds_ada_norm=config.num_embeds_ada_norm,
+                )
+                dual_transformer.transformers[0] = image_transformer
+                dual_transformer.transformers[1] = text_transformer
+
+                self.image_unet.get_submodule(parent_name)[index] = dual_transformer
+                self.image_unet.register_to_config(dual_cross_attention=True)
+
+    def _revert_dual_attention(self):
+        """
+        Revert the image_unet `DualTransformer2DModel` blocks back to `Transformer2DModel` with image_unet weights Call
+        this function if you reuse `image_unet` in another pipeline, e.g. `VersatileDiffusionPipeline`
+        """
+        for name, module in self.image_unet.named_modules():
+            if isinstance(module, DualTransformer2DModel):
+                parent_name, index = name.rsplit(".", 1)
+                index = int(index)
+                self.image_unet.get_submodule(parent_name)[index] = module.transformers[0]
+
+        self.image_unet.register_to_config(dual_cross_attention=False)
+
+    def _encode_text_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+        """
+
+        def normalize_embeddings(encoder_output):
+            embeds = self.text_encoder.text_projection(encoder_output.last_hidden_state)
+            embeds_pooled = encoder_output.text_embeds
+            embeds = embeds / torch.norm(embeds_pooled.unsqueeze(1), dim=-1, keepdim=True)
+            return embeds
+
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        prompt_embeds = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        prompt_embeds = normalize_embeddings(prompt_embeds)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens = [""] * batch_size
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds)
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def _encode_image_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+        """
+
+        def normalize_embeddings(encoder_output):
+            embeds = self.image_encoder.vision_model.post_layernorm(encoder_output.last_hidden_state)
+            embeds = self.image_encoder.visual_projection(embeds)
+            embeds_pooled = embeds[:, 0:1]
+            embeds = embeds / torch.norm(embeds_pooled, dim=-1, keepdim=True)
+            return embeds
+
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        image_input = self.image_feature_extractor(images=prompt, return_tensors="pt")
+        pixel_values = image_input.pixel_values.to(device).to(self.image_encoder.dtype)
+        image_embeddings = self.image_encoder(pixel_values)
+        image_embeddings = normalize_embeddings(image_embeddings)
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = image_embeddings.shape
+        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
+        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size
+            uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pt")
+            pixel_values = uncond_images.pixel_values.to(device).to(self.image_encoder.dtype)
+            negative_prompt_embeds = self.image_encoder(pixel_values)
+            negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds)
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and conditional embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
+
+        return image_embeddings
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, prompt, image, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, PIL.Image.Image) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` `PIL.Image` or `list` but is {type(prompt)}")
+        if not isinstance(image, str) and not isinstance(image, PIL.Image.Image) and not isinstance(image, list):
+            raise ValueError(f"`image` has to be of type `str` `PIL.Image` or `list` but is {type(image)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def set_transformer_params(self, mix_ratio: float = 0.5, condition_types: Tuple = ("text", "image")):
+        for name, module in self.image_unet.named_modules():
+            if isinstance(module, DualTransformer2DModel):
+                module.mix_ratio = mix_ratio
+
+                for i, type in enumerate(condition_types):
+                    if type == "text":
+                        module.condition_lengths[i] = self.text_encoder.config.max_position_embeddings
+                        module.transformer_index_for_condition[i] = 1  # use the second (text) transformer
+                    else:
+                        module.condition_lengths[i] = 257
+                        module.transformer_index_for_condition[i] = 0  # use the first (image) transformer
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
+        image: Union[str, List[str]],
+        text_to_image_strength: float = 0.5,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation.
+            height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionDualGuidedPipeline
+        >>> import torch
+        >>> import requests
+        >>> from io import BytesIO
+        >>> from PIL import Image
+
+        >>> # let's download an initial image
+        >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg"
+
+        >>> response = requests.get(url)
+        >>> image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> text = "a red car in the sun"
+
+        >>> pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe.remove_unused_weights()
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> text_to_image_strength = 0.75
+
+        >>> image = pipe(
+        ...     prompt=text, image=image, text_to_image_strength=text_to_image_strength, generator=generator
+        ... ).images[0]
+        >>> image.save("./car_variation.png")
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.image_unet.config.sample_size * self.vae_scale_factor
+        width = width or self.image_unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, image, height, width, callback_steps)
+
+        # 2. Define call parameters
+        prompt = [prompt] if not isinstance(prompt, list) else prompt
+        image = [image] if not isinstance(image, list) else image
+        batch_size = len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompts
+        prompt_embeds = self._encode_text_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance)
+        image_embeddings = self._encode_image_prompt(image, device, num_images_per_prompt, do_classifier_free_guidance)
+        dual_prompt_embeddings = torch.cat([prompt_embeds, image_embeddings], dim=1)
+        prompt_types = ("text", "image")
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.image_unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            dual_prompt_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Combine the attention blocks of the image and text UNets
+        self.set_transformer_params(text_to_image_strength, prompt_types)
+
+        # 8. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=dual_prompt_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
new file mode 100644
index 0000000000000000000000000000000000000000..62d3e83a4790f82f182a360b0cd0d6c20fb231e2
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
@@ -0,0 +1,402 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.utils.checkpoint
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+from ....image_processor import VaeImageProcessor
+from ....models import AutoencoderKL, UNet2DConditionModel
+from ....schedulers import KarrasDiffusionSchedulers
+from ....utils import deprecate, logging
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image variation using Versatile Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        vqvae ([`VQModel`]):
+            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
+        bert ([`LDMBertModel`]):
+            Text-encoder model based on [`~transformers.BERT`].
+        tokenizer ([`~transformers.BertTokenizer`]):
+            A `BertTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "bert->unet->vqvae"
+
+    image_feature_extractor: CLIPImageProcessor
+    image_encoder: CLIPVisionModelWithProjection
+    image_unet: UNet2DConditionModel
+    vae: AutoencoderKL
+    scheduler: KarrasDiffusionSchedulers
+
+    def __init__(
+        self,
+        image_feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection,
+        image_unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+        self.register_modules(
+            image_feature_extractor=image_feature_extractor,
+            image_encoder=image_encoder,
+            image_unet=image_unet,
+            vae=vae,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+
+        def normalize_embeddings(encoder_output):
+            embeds = self.image_encoder.vision_model.post_layernorm(encoder_output.last_hidden_state)
+            embeds = self.image_encoder.visual_projection(embeds)
+            embeds_pooled = embeds[:, 0:1]
+            embeds = embeds / torch.norm(embeds_pooled, dim=-1, keepdim=True)
+            return embeds
+
+        if isinstance(prompt, torch.Tensor) and len(prompt.shape) == 4:
+            prompt = list(prompt)
+
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        image_input = self.image_feature_extractor(images=prompt, return_tensors="pt")
+        pixel_values = image_input.pixel_values.to(device).to(self.image_encoder.dtype)
+        image_embeddings = self.image_encoder(pixel_values)
+        image_embeddings = normalize_embeddings(image_embeddings)
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = image_embeddings.shape
+        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
+        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_images: List[str]
+            if negative_prompt is None:
+                uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, PIL.Image.Image):
+                uncond_images = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_images = negative_prompt
+
+            uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pt")
+            pixel_values = uncond_images.pixel_values.to(device).to(self.image_encoder.dtype)
+            negative_prompt_embeds = self.image_encoder(pixel_values)
+            negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds)
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and conditional embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
+
+        return image_embeddings
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs
+    def check_inputs(self, image, height, width, callback_steps):
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`):
+                The image prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionImageVariationPipeline
+        >>> import torch
+        >>> import requests
+        >>> from io import BytesIO
+        >>> from PIL import Image
+
+        >>> # let's download an initial image
+        >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg"
+
+        >>> response = requests.get(url)
+        >>> image = Image.open(BytesIO(response.content)).convert("RGB")
+
+        >>> pipe = VersatileDiffusionImageVariationPipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> image = pipe(image, generator=generator).images[0]
+        >>> image.save("./car_variation.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.image_unet.config.sample_size * self.vae_scale_factor
+        width = width or self.image_unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(image, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(image, PIL.Image.Image) else len(image)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        image_embeddings = self._encode_prompt(
+            image, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.image_unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            image_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..de4c2ac9b7f49808b4070fb269bac1222ab59826
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
@@ -0,0 +1,480 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer
+
+from ....image_processor import VaeImageProcessor
+from ....models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel
+from ....schedulers import KarrasDiffusionSchedulers
+from ....utils import deprecate, logging
+from ....utils.torch_utils import randn_tensor
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from .modeling_text_unet import UNetFlatConditionModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Versatile Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        vqvae ([`VQModel`]):
+            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
+        bert ([`LDMBertModel`]):
+            Text-encoder model based on [`~transformers.BERT`].
+        tokenizer ([`~transformers.BertTokenizer`]):
+            A `BertTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "bert->unet->vqvae"
+
+    tokenizer: CLIPTokenizer
+    image_feature_extractor: CLIPImageProcessor
+    text_encoder: CLIPTextModelWithProjection
+    image_unet: UNet2DConditionModel
+    text_unet: UNetFlatConditionModel
+    vae: AutoencoderKL
+    scheduler: KarrasDiffusionSchedulers
+
+    _optional_components = ["text_unet"]
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModelWithProjection,
+        image_unet: UNet2DConditionModel,
+        text_unet: UNetFlatConditionModel,
+        vae: AutoencoderKL,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            image_unet=image_unet,
+            text_unet=text_unet,
+            vae=vae,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        if self.text_unet is not None:
+            self._swap_unet_attention_blocks()
+
+    def _swap_unet_attention_blocks(self):
+        """
+        Swap the `Transformer2DModel` blocks between the image and text UNets
+        """
+        for name, module in self.image_unet.named_modules():
+            if isinstance(module, Transformer2DModel):
+                parent_name, index = name.rsplit(".", 1)
+                index = int(index)
+                self.image_unet.get_submodule(parent_name)[index], self.text_unet.get_submodule(parent_name)[index] = (
+                    self.text_unet.get_submodule(parent_name)[index],
+                    self.image_unet.get_submodule(parent_name)[index],
+                )
+
+    def remove_unused_weights(self):
+        self.register_modules(text_unet=None)
+
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+
+        def normalize_embeddings(encoder_output):
+            embeds = self.text_encoder.text_projection(encoder_output.last_hidden_state)
+            embeds_pooled = encoder_output.text_embeds
+            embeds = embeds / torch.norm(embeds_pooled.unsqueeze(1), dim=-1, keepdim=True)
+            return embeds
+
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        prompt_embeds = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        prompt_embeds = normalize_embeddings(prompt_embeds)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds)
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation.
+            height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionTextToImagePipeline
+        >>> import torch
+
+        >>> pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe.remove_unused_weights()
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> image = pipe("an astronaut riding on a horse on mars", generator=generator).images[0]
+        >>> image.save("./astronaut.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.image_unet.config.sample_size * self.vae_scale_factor
+        width = width or self.image_unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.image_unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..070903377c7188415af0417d4839d74a8a34dc01
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py
@@ -0,0 +1,57 @@
+from typing import TYPE_CHECKING
+
+from ....utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ....utils.dummy_torch_and_transformers_objects import (
+        LearnedClassifierFreeSamplingEmbeddings,
+        VQDiffusionPipeline,
+    )
+
+    _dummy_objects.update(
+        {
+            "LearnedClassifierFreeSamplingEmbeddings": LearnedClassifierFreeSamplingEmbeddings,
+            "VQDiffusionPipeline": VQDiffusionPipeline,
+        }
+    )
+else:
+    _import_structure["pipeline_vq_diffusion"] = ["LearnedClassifierFreeSamplingEmbeddings", "VQDiffusionPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ....utils.dummy_torch_and_transformers_objects import (
+            LearnedClassifierFreeSamplingEmbeddings,
+            VQDiffusionPipeline,
+        )
+    else:
+        from .pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings, VQDiffusionPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py b/diffusers/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dee000df05f6a0196b53277bbe254c53e73e39d
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py
@@ -0,0 +1,325 @@
+# Copyright 2024 Microsoft and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ....configuration_utils import ConfigMixin, register_to_config
+from ....models import ModelMixin, Transformer2DModel, VQModel
+from ....schedulers import VQDiffusionScheduler
+from ....utils import logging
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class LearnedClassifierFreeSamplingEmbeddings(ModelMixin, ConfigMixin):
+    """
+    Utility class for storing learned text embeddings for classifier free sampling
+    """
+
+    @register_to_config
+    def __init__(self, learnable: bool, hidden_size: Optional[int] = None, length: Optional[int] = None):
+        super().__init__()
+
+        self.learnable = learnable
+
+        if self.learnable:
+            assert hidden_size is not None, "learnable=True requires `hidden_size` to be set"
+            assert length is not None, "learnable=True requires `length` to be set"
+
+            embeddings = torch.zeros(length, hidden_size)
+        else:
+            embeddings = None
+
+        self.embeddings = torch.nn.Parameter(embeddings)
+
+
+class VQDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using VQ Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vqvae ([`VQModel`]):
+            Vector Quantized Variational Auto-Encoder (VAE) model to encode and decode images to and from latent
+            representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        transformer ([`Transformer2DModel`]):
+            A conditional `Transformer2DModel` to denoise the encoded image latents.
+        scheduler ([`VQDiffusionScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+
+    vqvae: VQModel
+    text_encoder: CLIPTextModel
+    tokenizer: CLIPTokenizer
+    transformer: Transformer2DModel
+    learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings
+    scheduler: VQDiffusionScheduler
+
+    def __init__(
+        self,
+        vqvae: VQModel,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        transformer: Transformer2DModel,
+        scheduler: VQDiffusionScheduler,
+        learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vqvae=vqvae,
+            transformer=transformer,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings,
+        )
+
+    def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        prompt_embeds = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion.
+        # While CLIP does normalize the pooled output of the text transformer when combining
+        # the image and text embeddings, CLIP does not directly normalize the last hidden state.
+        #
+        # CLIP normalizing the pooled output.
+        # https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053
+        prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True)
+
+        # duplicate text embeddings for each generation per prompt
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            if self.learned_classifier_free_sampling_embeddings.learnable:
+                negative_prompt_embeds = self.learned_classifier_free_sampling_embeddings.embeddings
+                negative_prompt_embeds = negative_prompt_embeds.unsqueeze(0).repeat(batch_size, 1, 1)
+            else:
+                uncond_tokens = [""] * batch_size
+
+                max_length = text_input_ids.shape[-1]
+                uncond_input = self.tokenizer(
+                    uncond_tokens,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                negative_prompt_embeds = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+                # See comment for normalizing text embeddings
+                negative_prompt_embeds = negative_prompt_embeds / negative_prompt_embeds.norm(dim=-1, keepdim=True)
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        num_inference_steps: int = 100,
+        guidance_scale: float = 5.0,
+        truncation_rate: float = 1.0,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            truncation_rate (`float`, *optional*, defaults to 1.0 (equivalent to no truncation)):
+                Used to "truncate" the predicted classes for x_0 such that the cumulative probability for a pixel is at
+                most `truncation_rate`. The lowest probabilities that would increase the cumulative probability above
+                `truncation_rate` are set to zero.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor` of shape (batch), *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Must be valid embedding indices.If not provided, a latents tensor will be generated of
+                completely masked latent pixels.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance)
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get the initial completely masked latents unless the user supplied it
+
+        latents_shape = (batch_size, self.transformer.num_latent_pixels)
+        if latents is None:
+            mask_class = self.transformer.num_vector_embeds - 1
+            latents = torch.full(latents_shape, mask_class).to(self.device)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            if (latents < 0).any() or (latents >= self.transformer.num_vector_embeds).any():
+                raise ValueError(
+                    "Unexpected latents value(s). All latents be valid embedding indices i.e. in the range 0,"
+                    f" {self.transformer.num_vector_embeds - 1} (inclusive)."
+                )
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        sample = latents
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the sample if we are doing classifier free guidance
+            latent_model_input = torch.cat([sample] * 2) if do_classifier_free_guidance else sample
+
+            # predict the un-noised image
+            # model_output == `log_p_x_0`
+            model_output = self.transformer(latent_model_input, encoder_hidden_states=prompt_embeds, timestep=t).sample
+
+            if do_classifier_free_guidance:
+                model_output_uncond, model_output_text = model_output.chunk(2)
+                model_output = model_output_uncond + guidance_scale * (model_output_text - model_output_uncond)
+                model_output -= torch.logsumexp(model_output, dim=1, keepdim=True)
+
+            model_output = self.truncate(model_output, truncation_rate)
+
+            # remove `log(0)`'s (`-inf`s)
+            model_output = model_output.clamp(-70)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            sample = self.scheduler.step(model_output, timestep=t, sample=sample, generator=generator).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, sample)
+
+        embedding_channels = self.vqvae.config.vq_embed_dim
+        embeddings_shape = (batch_size, self.transformer.height, self.transformer.width, embedding_channels)
+        embeddings = self.vqvae.quantize.get_codebook_entry(sample, shape=embeddings_shape)
+        image = self.vqvae.decode(embeddings, force_not_quantize=True).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
+
+    def truncate(self, log_p_x_0: torch.Tensor, truncation_rate: float) -> torch.Tensor:
+        """
+        Truncates `log_p_x_0` such that for each column vector, the total cumulative probability is `truncation_rate`
+        The lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to
+        zero.
+        """
+        sorted_log_p_x_0, indices = torch.sort(log_p_x_0, 1, descending=True)
+        sorted_p_x_0 = torch.exp(sorted_log_p_x_0)
+        keep_mask = sorted_p_x_0.cumsum(dim=1) < truncation_rate
+
+        # Ensure that at least the largest probability is not zeroed out
+        all_true = torch.full_like(keep_mask[:, 0:1, :], True)
+        keep_mask = torch.cat((all_true, keep_mask), dim=1)
+        keep_mask = keep_mask[:, :-1, :]
+
+        keep_mask = keep_mask.gather(1, indices.argsort(1))
+
+        rv = log_p_x_0.clone()
+
+        rv[~keep_mask] = -torch.inf  # -inf = log(0)
+
+        return rv
diff --git a/diffusers/src/diffusers/pipelines/dit/__init__.py b/diffusers/src/diffusers/pipelines/dit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe2a94f3cba77d867f97111a41895918842adc27
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/dit/__init__.py
@@ -0,0 +1,19 @@
+from typing import TYPE_CHECKING
+
+from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
+
+
+_import_structure = {"pipeline_dit": ["DiTPipeline"]}
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_dit import DiTPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/diffusers/src/diffusers/pipelines/dit/pipeline_dit.py b/diffusers/src/diffusers/pipelines/dit/pipeline_dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..14321b5f33cfae58771d7d363dd63a37348226d4
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/dit/pipeline_dit.py
@@ -0,0 +1,236 @@
+# Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
+# William Peebles and Saining Xie
+#
+# Copyright (c) 2021 OpenAI
+# MIT License
+#
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+
+from ...models import AutoencoderKL, DiTTransformer2DModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class DiTPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image generation based on a Transformer backbone instead of a UNet.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        transformer ([`DiTTransformer2DModel`]):
+            A class conditioned `DiTTransformer2DModel` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        scheduler ([`DDIMScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+
+    model_cpu_offload_seq = "transformer->vae"
+
+    def __init__(
+        self,
+        transformer: DiTTransformer2DModel,
+        vae: AutoencoderKL,
+        scheduler: KarrasDiffusionSchedulers,
+        id2label: Optional[Dict[int, str]] = None,
+    ):
+        super().__init__()
+        self.register_modules(transformer=transformer, vae=vae, scheduler=scheduler)
+
+        # create a imagenet -> id dictionary for easier use
+        self.labels = {}
+        if id2label is not None:
+            for key, value in id2label.items():
+                for label in value.split(","):
+                    self.labels[label.lstrip().rstrip()] = int(key)
+            self.labels = dict(sorted(self.labels.items()))
+
+    def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
+        r"""
+
+        Map label strings from ImageNet to corresponding class ids.
+
+        Parameters:
+            label (`str` or `dict` of `str`):
+                Label strings to be mapped to class ids.
+
+        Returns:
+            `list` of `int`:
+                Class ids to be processed by pipeline.
+        """
+
+        if not isinstance(label, list):
+            label = list(label)
+
+        for l in label:
+            if l not in self.labels:
+                raise ValueError(
+                    f"{l} does not exist. Please make sure to select one of the following labels: \n {self.labels}."
+                )
+
+        return [self.labels[l] for l in label]
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        class_labels: List[int],
+        guidance_scale: float = 4.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        num_inference_steps: int = 50,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            class_labels (List[int]):
+                List of ImageNet class labels for the images to be generated.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            num_inference_steps (`int`, *optional*, defaults to 250):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import DiTPipeline, DPMSolverMultistepScheduler
+        >>> import torch
+
+        >>> pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
+        >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> pipe = pipe.to("cuda")
+
+        >>> # pick words from Imagenet class labels
+        >>> pipe.labels  # to print all available words
+
+        >>> # pick words that exist in ImageNet
+        >>> words = ["white shark", "umbrella"]
+
+        >>> class_ids = pipe.get_label_ids(words)
+
+        >>> generator = torch.manual_seed(33)
+        >>> output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)
+
+        >>> image = output.images[0]  # label 'white shark'
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+
+        batch_size = len(class_labels)
+        latent_size = self.transformer.config.sample_size
+        latent_channels = self.transformer.config.in_channels
+
+        latents = randn_tensor(
+            shape=(batch_size, latent_channels, latent_size, latent_size),
+            generator=generator,
+            device=self._execution_device,
+            dtype=self.transformer.dtype,
+        )
+        latent_model_input = torch.cat([latents] * 2) if guidance_scale > 1 else latents
+
+        class_labels = torch.tensor(class_labels, device=self._execution_device).reshape(-1)
+        class_null = torch.tensor([1000] * batch_size, device=self._execution_device)
+        class_labels_input = torch.cat([class_labels, class_null], 0) if guidance_scale > 1 else class_labels
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+        for t in self.progress_bar(self.scheduler.timesteps):
+            if guidance_scale > 1:
+                half = latent_model_input[: len(latent_model_input) // 2]
+                latent_model_input = torch.cat([half, half], dim=0)
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            timesteps = t
+            if not torch.is_tensor(timesteps):
+                # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                # This would be a good case for the `match` statement (Python 3.10+)
+                is_mps = latent_model_input.device.type == "mps"
+                if isinstance(timesteps, float):
+                    dtype = torch.float32 if is_mps else torch.float64
+                else:
+                    dtype = torch.int32 if is_mps else torch.int64
+                timesteps = torch.tensor([timesteps], dtype=dtype, device=latent_model_input.device)
+            elif len(timesteps.shape) == 0:
+                timesteps = timesteps[None].to(latent_model_input.device)
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            timesteps = timesteps.expand(latent_model_input.shape[0])
+            # predict noise model_output
+            noise_pred = self.transformer(
+                latent_model_input, timestep=timesteps, class_labels=class_labels_input
+            ).sample
+
+            # perform guidance
+            if guidance_scale > 1:
+                eps, rest = noise_pred[:, :latent_channels], noise_pred[:, latent_channels:]
+                cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
+
+                half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
+                eps = torch.cat([half_eps, half_eps], dim=0)
+
+                noise_pred = torch.cat([eps, rest], dim=1)
+
+            # learned sigma
+            if self.transformer.config.out_channels // 2 == latent_channels:
+                model_output, _ = torch.split(noise_pred, latent_channels, dim=1)
+            else:
+                model_output = noise_pred
+
+            # compute previous image: x_t -> x_t-1
+            latent_model_input = self.scheduler.step(model_output, t, latent_model_input).prev_sample
+
+        if guidance_scale > 1:
+            latents, _ = latent_model_input.chunk(2, dim=0)
+        else:
+            latents = latent_model_input
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        samples = self.vae.decode(latents).sample
+
+        samples = (samples / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        samples = samples.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            samples = self.numpy_to_pil(samples)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (samples,)
+
+        return ImagePipelineOutput(images=samples)
diff --git a/diffusers/src/diffusers/pipelines/flux/__init__.py b/diffusers/src/diffusers/pipelines/flux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ebf5ea6d78dbcf3f5e899d3b82b4b59b3d1ce1d
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/flux/__init__.py
@@ -0,0 +1,57 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["FluxPipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_flux"] = ["FluxPipeline"]
+    _import_structure["pipeline_flux_controlnet"] = ["FluxControlNetPipeline"]
+    _import_structure["pipeline_flux_controlnet_image_to_image"] = ["FluxControlNetImg2ImgPipeline"]
+    _import_structure["pipeline_flux_controlnet_inpainting"] = ["FluxControlNetInpaintPipeline"]
+    _import_structure["pipeline_flux_img2img"] = ["FluxImg2ImgPipeline"]
+    _import_structure["pipeline_flux_inpaint"] = ["FluxInpaintPipeline"]
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_flux import FluxPipeline
+        from .pipeline_flux_controlnet import FluxControlNetPipeline
+        from .pipeline_flux_controlnet_image_to_image import FluxControlNetImg2ImgPipeline
+        from .pipeline_flux_controlnet_inpainting import FluxControlNetInpaintPipeline
+        from .pipeline_flux_img2img import FluxImg2ImgPipeline
+        from .pipeline_flux_inpaint import FluxInpaintPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/flux/pipeline_flux.py b/diffusers/src/diffusers/pipelines/flux/pipeline_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb214885da1c9eb15bd02a84540144f08ae899d6
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/flux/pipeline_flux.py
@@ -0,0 +1,771 @@
+# Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.transformers import FluxTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import FluxPipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import FluxPipeline
+
+        >>> pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+        >>> prompt = "A cat holding a sign that says hello world"
+        >>> # Depending on the variant being used, the pipeline call will slightly vary.
+        >>> # Refer to the pipeline documentation for more details.
+        >>> image = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0]
+        >>> image.save("flux.png")
+        ```
+"""
+
+
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    The Flux pipeline for text-to-image generation.
+
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
+
+    Args:
+        transformer ([`FluxTransformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`T5TokenizerFast`):
+            Second Tokenizer of class
+            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: T5EncoderModel,
+        tokenizer_2: T5TokenizerFast,
+        transformer: FluxTransformer2DModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = 64
+
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
+
+        dtype = self.text_encoder_2.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
+
+        # Use pooled output of CLIPTextModel
+        prompt_embeds = prompt_embeds.pooler_output
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # We only use the pooled prompt output from the CLIPTextModel
+            pooled_prompt_embeds = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt_2,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+
+        return prompt_embeds, pooled_prompt_embeds, text_ids
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    @staticmethod
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
+
+    @staticmethod
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+
+        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+
+        return latents
+
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+
+        shape = (batch_size, num_channels_latents, height, width)
+
+        if latents is not None:
+            latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+            return latents.to(device=device, dtype=dtype), latent_image_ids
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+
+        return latents, latent_image_ids
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 28,
+        timesteps: List[int] = None,
+        guidance_scale: float = 3.5,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
+            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
+            images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            pooled_prompt_embeds,
+            text_ids,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+
+        # 4. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels // 4
+        latents, latent_image_ids = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 5. Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        image_seq_len = latents.shape[1]
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # handle guidance
+        if self.transformer.config.guidance_embeds:
+            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+            guidance = guidance.expand(latents.shape[0])
+        else:
+            guidance = None
+
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return FluxPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py b/diffusers/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c072c482020667f71341f72b0f8dfd02345e6b4
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py
@@ -0,0 +1,935 @@
+# Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import (
+    CLIPTextModel,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
+from ...models.transformers import FluxTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import FluxPipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers.utils import load_image
+        >>> from diffusers import FluxControlNetPipeline
+        >>> from diffusers import FluxControlNetModel
+
+        >>> controlnet_model = "InstantX/FLUX.1-dev-controlnet-canny"
+        >>> controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
+        >>> pipe = FluxControlNetPipeline.from_pretrained(
+        ...     base_model, controlnet=controlnet, torch_dtype=torch.bfloat16
+        ... )
+        >>> pipe.to("cuda")
+        >>> control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Canny/resolve/main/canny.jpg")
+        >>> prompt = "A girl in city, 25 years old, cool, futuristic"
+        >>> image = pipe(
+        ...     prompt,
+        ...     control_image=control_image,
+        ...     controlnet_conditioning_scale=0.6,
+        ...     num_inference_steps=28,
+        ...     guidance_scale=3.5,
+        ... ).images[0]
+        >>> image.save("flux.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    The Flux pipeline for text-to-image generation.
+
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
+
+    Args:
+        transformer ([`FluxTransformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`T5TokenizerFast`):
+            Second Tokenizer of class
+            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: T5EncoderModel,
+        tokenizer_2: T5TokenizerFast,
+        transformer: FluxTransformer2DModel,
+        controlnet: Union[
+            FluxControlNetModel, List[FluxControlNetModel], Tuple[FluxControlNetModel], FluxMultiControlNetModel
+        ],
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+            controlnet=controlnet,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = 64
+
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
+
+        dtype = self.text_encoder_2.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
+
+        # Use pooled output of CLIPTextModel
+        prompt_embeds = prompt_embeds.pooler_output
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # We only use the pooled prompt output from the CLIPTextModel
+            pooled_prompt_embeds = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt_2,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+
+        return prompt_embeds, pooled_prompt_embeds, text_ids
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+
+        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+
+        return latents
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+
+        shape = (batch_size, num_channels_latents, height, width)
+
+        if latents is not None:
+            latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+            return latents.to(device=device, dtype=dtype), latent_image_ids
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+
+        return latents, latent_image_ids
+
+    # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        if isinstance(image, torch.Tensor):
+            pass
+        else:
+            image = self.image_processor.preprocess(image, height=height, width=width)
+
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 28,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        control_image: PipelineImageInput = None,
+        control_mode: Optional[Union[int, List[int]]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            control_mode (`int` or `List[int]`,, *optional*, defaults to None):
+                The control mode when applying ControlNet-Union.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
+            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
+            images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        dtype = self.transformer.dtype
+
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            pooled_prompt_embeds,
+            text_ids,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+
+        # 3. Prepare control image
+        num_channels_latents = self.transformer.config.in_channels // 4
+        if isinstance(self.controlnet, FluxControlNetModel):
+            control_image = self.prepare_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=self.vae.dtype,
+            )
+            height, width = control_image.shape[-2:]
+
+            # vae encode
+            control_image = self.vae.encode(control_image).latent_dist.sample()
+            control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+            # pack
+            height_control_image, width_control_image = control_image.shape[2:]
+            control_image = self._pack_latents(
+                control_image,
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height_control_image,
+                width_control_image,
+            )
+
+            # Here we ensure that `control_mode` has the same length as the control_image.
+            if control_mode is not None:
+                if not isinstance(control_mode, int):
+                    raise ValueError(" For `FluxControlNet`, `control_mode` should be an `int` or `None`")
+                control_mode = torch.tensor(control_mode).to(device, dtype=torch.long)
+                control_mode = control_mode.view(-1, 1).expand(control_image.shape[0], 1)
+
+        elif isinstance(self.controlnet, FluxMultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=self.vae.dtype,
+                )
+                height, width = control_image_.shape[-2:]
+
+                # vae encode
+                control_image_ = self.vae.encode(control_image_).latent_dist.sample()
+                control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+                # pack
+                height_control_image, width_control_image = control_image_.shape[2:]
+                control_image_ = self._pack_latents(
+                    control_image_,
+                    batch_size * num_images_per_prompt,
+                    num_channels_latents,
+                    height_control_image,
+                    width_control_image,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+
+            # Here we ensure that `control_mode` has the same length as the control_image.
+            if isinstance(control_mode, list) and len(control_mode) != len(control_image):
+                raise ValueError(
+                    "For Multi-ControlNet, `control_mode` must be a list of the same "
+                    + " length as the number of controlnets (control images) specified"
+                )
+            if not isinstance(control_mode, list):
+                control_mode = [control_mode] * len(control_image)
+            # set control mode
+            control_modes = []
+            for cmode in control_mode:
+                if cmode is None:
+                    cmode = -1
+                control_mode = torch.tensor(cmode).expand(control_images[0].shape[0]).to(device, dtype=torch.long)
+                control_modes.append(control_mode)
+            control_mode = control_modes
+
+        # 4. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels // 4
+        latents, latent_image_ids = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 5. Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        image_seq_len = latents.shape[1]
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+                if isinstance(self.controlnet, FluxMultiControlNetModel):
+                    use_guidance = self.controlnet.nets[0].config.guidance_embeds
+                else:
+                    use_guidance = self.controlnet.config.guidance_embeds
+
+                guidance = torch.tensor([guidance_scale], device=device) if use_guidance else None
+                guidance = guidance.expand(latents.shape[0]) if guidance is not None else None
+
+                # controlnet
+                controlnet_block_samples, controlnet_single_block_samples = self.controlnet(
+                    hidden_states=latents,
+                    controlnet_cond=control_image,
+                    controlnet_mode=control_mode,
+                    conditioning_scale=controlnet_conditioning_scale,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )
+
+                guidance = (
+                    torch.tensor([guidance_scale], device=device) if self.transformer.config.guidance_embeds else None
+                )
+                guidance = guidance.expand(latents.shape[0]) if guidance is not None else None
+
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    controlnet_block_samples=controlnet_block_samples,
+                    controlnet_single_block_samples=controlnet_single_block_samples,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return FluxPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py b/diffusers/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..deeb9e3f546a8241c4ffa091f9ca3cc229207972
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py
@@ -0,0 +1,949 @@
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import (
+    CLIPTextModel,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
+from ...models.transformers import FluxTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import FluxPipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import FluxControlNetImg2ImgPipeline, FluxControlNetModel
+        >>> from diffusers.utils import load_image
+
+        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        >>> controlnet = FluxControlNetModel.from_pretrained(
+        ...     "InstantX/FLUX.1-dev-Controlnet-Canny-alpha", torch_dtype=torch.bfloat16
+        ... )
+
+        >>> pipe = FluxControlNetImg2ImgPipeline.from_pretrained(
+        ...     "black-forest-labs/FLUX.1-schnell", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+
+        >>> pipe.text_encoder.to(torch.float16)
+        >>> pipe.controlnet.to(torch.float16)
+        >>> pipe.to("cuda")
+
+        >>> control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Canny/resolve/main/canny.jpg")
+        >>> init_image = load_image(
+        ...     "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+        ... )
+
+        >>> prompt = "A girl in city, 25 years old, cool, futuristic"
+        >>> image = pipe(
+        ...     prompt,
+        ...     image=init_image,
+        ...     control_image=control_image,
+        ...     controlnet_conditioning_scale=0.6,
+        ...     strength=0.7,
+        ...     num_inference_steps=2,
+        ...     guidance_scale=3.5,
+        ... ).images[0]
+        >>> image.save("flux_controlnet_img2img.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    The Flux controlnet pipeline for image-to-image generation.
+
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
+
+    Args:
+        transformer ([`FluxTransformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`T5TokenizerFast`):
+            Second Tokenizer of class
+            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: T5EncoderModel,
+        tokenizer_2: T5TokenizerFast,
+        transformer: FluxTransformer2DModel,
+        controlnet: Union[
+            FluxControlNetModel, List[FluxControlNetModel], Tuple[FluxControlNetModel], FluxMultiControlNetModel
+        ],
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+            controlnet=controlnet,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = 64
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
+
+        dtype = self.text_encoder_2.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
+
+        # Use pooled output of CLIPTextModel
+        prompt_embeds = prompt_embeds.pooler_output
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # We only use the pooled prompt output from the CLIPTextModel
+            pooled_prompt_embeds = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt_2,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+
+        return prompt_embeds, pooled_prompt_embeds, text_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        strength,
+        height,
+        width,
+        callback_on_step_end_tensor_inputs,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        max_sequence_length=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+
+        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+
+        return latents
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux_img2img.FluxImg2ImgPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        image,
+        timestep,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+
+        shape = (batch_size, num_channels_latents, height, width)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype), latent_image_ids
+
+        image = image.to(device=device, dtype=dtype)
+        image_latents = self._encode_vae_image(image=image, generator=generator)
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
+
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self.scheduler.scale_noise(image_latents, timestep, noise)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+        return latents, latent_image_ids
+
+    # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        if isinstance(image, torch.Tensor):
+            pass
+        else:
+            image = self.image_processor.preprocess(image, height=height, width=width)
+
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        control_image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.6,
+        num_inference_steps: int = 28,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        control_mode: Optional[Union[int, List[int]]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`.
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+                The image(s) to modify with the pipeline.
+            control_image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+                The ControlNet input condition. Image to control the generation.
+            height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            strength (`float`, *optional*, defaults to 0.6):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+            num_inference_steps (`int`, *optional*, defaults to 28):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+            control_mode (`int` or `List[int]`, *optional*):
+                The mode for the ControlNet. If multiple ControlNets are used, this should be a list.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original transformer.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or more [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to
+                make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                Additional keyword arguments to be passed to the joint attention mechanism.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising step during the inference.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function.
+            max_sequence_length (`int`, *optional*, defaults to 512):
+                The maximum length of the sequence to be generated.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
+            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
+            images.
+        """
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            strength,
+            height,
+            width,
+            callback_on_step_end_tensor_inputs,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        dtype = self.transformer.dtype
+
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            pooled_prompt_embeds,
+            text_ids,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+
+        init_image = self.image_processor.preprocess(image, height=height, width=width)
+        init_image = init_image.to(dtype=torch.float32)
+
+        num_channels_latents = self.transformer.config.in_channels // 4
+
+        if isinstance(self.controlnet, FluxControlNetModel):
+            control_image = self.prepare_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=self.vae.dtype,
+            )
+            height, width = control_image.shape[-2:]
+
+            control_image = self.vae.encode(control_image).latent_dist.sample()
+            control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+            height_control_image, width_control_image = control_image.shape[2:]
+            control_image = self._pack_latents(
+                control_image,
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height_control_image,
+                width_control_image,
+            )
+
+            if control_mode is not None:
+                control_mode = torch.tensor(control_mode).to(device, dtype=torch.long)
+                control_mode = control_mode.reshape([-1, 1])
+
+        elif isinstance(self.controlnet, FluxMultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=self.vae.dtype,
+                )
+                height, width = control_image_.shape[-2:]
+
+                control_image_ = self.vae.encode(control_image_).latent_dist.sample()
+                control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+                height_control_image, width_control_image = control_image_.shape[2:]
+                control_image_ = self._pack_latents(
+                    control_image_,
+                    batch_size * num_images_per_prompt,
+                    num_channels_latents,
+                    height_control_image,
+                    width_control_image,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+
+            control_mode_ = []
+            if isinstance(control_mode, list):
+                for cmode in control_mode:
+                    if cmode is None:
+                        control_mode_.append(-1)
+                    else:
+                        control_mode_.append(cmode)
+            control_mode = torch.tensor(control_mode_).to(device, dtype=torch.long)
+            control_mode = control_mode.reshape([-1, 1])
+
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        latents, latent_image_ids = self.prepare_latents(
+            init_image,
+            latent_timestep,
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+                guidance = (
+                    torch.tensor([guidance_scale], device=device) if self.controlnet.config.guidance_embeds else None
+                )
+                guidance = guidance.expand(latents.shape[0]) if guidance is not None else None
+
+                controlnet_block_samples, controlnet_single_block_samples = self.controlnet(
+                    hidden_states=latents,
+                    controlnet_cond=control_image,
+                    controlnet_mode=control_mode,
+                    conditioning_scale=controlnet_conditioning_scale,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )
+
+                guidance = (
+                    torch.tensor([guidance_scale], device=device) if self.transformer.config.guidance_embeds else None
+                )
+                guidance = guidance.expand(latents.shape[0]) if guidance is not None else None
+
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    controlnet_block_samples=controlnet_block_samples,
+                    controlnet_single_block_samples=controlnet_single_block_samples,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return FluxPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py b/diffusers/src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py
new file mode 100644
index 0000000000000000000000000000000000000000..e763200155f6afb8a61de68a5f67881691367340
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py
@@ -0,0 +1,1139 @@
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import (
+    CLIPTextModel,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
+from ...models.transformers import FluxTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import FluxPipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import FluxControlNetInpaintPipeline
+        >>> from diffusers.models import FluxControlNetModel
+        >>> from diffusers.utils import load_image
+
+        >>> controlnet = FluxControlNetModel.from_pretrained(
+        ...     "InstantX/FLUX.1-dev-controlnet-canny", torch_dtype=torch.float16
+        ... )
+        >>> pipe = FluxControlNetInpaintPipeline.from_pretrained(
+        ...     "black-forest-labs/FLUX.1-schnell", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> control_image = load_image(
+        ...     "https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Canny-alpha/resolve/main/canny.jpg"
+        ... )
+        >>> init_image = load_image(
+        ...     "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        ... )
+        >>> mask_image = load_image(
+        ...     "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+        ... )
+
+        >>> prompt = "A girl holding a sign that says InstantX"
+        >>> image = pipe(
+        ...     prompt,
+        ...     image=init_image,
+        ...     mask_image=mask_image,
+        ...     control_image=control_image,
+        ...     controlnet_conditioning_scale=0.7,
+        ...     strength=0.7,
+        ...     num_inference_steps=28,
+        ...     guidance_scale=3.5,
+        ... ).images[0]
+        >>> image.save("flux_controlnet_inpaint.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    The Flux controlnet pipeline for inpainting.
+
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
+
+    Args:
+        transformer ([`FluxTransformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`T5TokenizerFast`):
+            Second Tokenizer of class
+            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: T5EncoderModel,
+        tokenizer_2: T5TokenizerFast,
+        transformer: FluxTransformer2DModel,
+        controlnet: Union[
+            FluxControlNetModel, List[FluxControlNetModel], Tuple[FluxControlNetModel], FluxMultiControlNetModel
+        ],
+    ):
+        super().__init__()
+
+        self.register_modules(
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_encoder_2=text_encoder_2,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            controlnet=controlnet,
+        )
+
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor,
+            vae_latent_channels=self.vae.config.latent_channels,
+            do_normalize=False,
+            do_binarize=True,
+            do_convert_grayscale=True,
+        )
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = 64
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
+
+        dtype = self.text_encoder_2.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
+
+        # Use pooled output of CLIPTextModel
+        prompt_embeds = prompt_embeds.pooler_output
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # We only use the pooled prompt output from the CLIPTextModel
+            pooled_prompt_embeds = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt_2,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+
+        return prompt_embeds, pooled_prompt_embeds, text_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        mask_image,
+        strength,
+        height,
+        width,
+        output_type,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        padding_mask_crop=None,
+        max_sequence_length=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if padding_mask_crop is not None:
+            if not isinstance(image, PIL.Image.Image):
+                raise ValueError(
+                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                )
+            if not isinstance(mask_image, PIL.Image.Image):
+                raise ValueError(
+                    f"The mask image should be a PIL image when inpainting mask crop, but is of type"
+                    f" {type(mask_image)}."
+                )
+            if output_type != "pil":
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+
+        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+
+        return latents
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux_inpaint.FluxInpaintPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        image,
+        timestep,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+
+        shape = (batch_size, num_channels_latents, height, width)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+
+        image = image.to(device=device, dtype=dtype)
+        image_latents = self._encode_vae_image(image=image, generator=generator)
+
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
+
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = self.scheduler.scale_noise(image_latents, timestep, noise)
+        else:
+            noise = latents.to(device)
+            latents = noise
+
+        noise = self._pack_latents(noise, batch_size, num_channels_latents, height, width)
+        image_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height, width)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+        return latents, noise, image_latents, latent_image_ids
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux_inpaint.FluxInpaintPipeline.prepare_mask_latents
+    def prepare_mask_latents(
+        self,
+        mask,
+        masked_image,
+        batch_size,
+        num_channels_latents,
+        num_images_per_prompt,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+    ):
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(mask, size=(height, width))
+        mask = mask.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        if masked_image.shape[1] == 16:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = retrieve_latents(self.vae.encode(masked_image), generator=generator)
+
+        masked_image_latents = (masked_image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        masked_image_latents = self._pack_latents(
+            masked_image_latents,
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+        )
+        mask = self._pack_latents(
+            mask.repeat(1, num_channels_latents, 1, 1),
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+        )
+
+        return mask, masked_image_latents
+
+    # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        if isinstance(image, torch.Tensor):
+            pass
+        else:
+            image = self.image_processor.preprocess(image, height=height, width=width)
+
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        masked_image_latents: PipelineImageInput = None,
+        control_image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.6,
+        padding_mask_crop: Optional[int] = None,
+        timesteps: List[int] = None,
+        num_inference_steps: int = 28,
+        guidance_scale: float = 7.0,
+        control_mode: Optional[Union[int, List[int]]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`.
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+                The image(s) to inpaint.
+            mask_image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+                The mask image(s) to use for inpainting. White pixels in the mask will be repainted, while black pixels
+                will be preserved.
+            masked_image_latents (`torch.FloatTensor`, *optional*):
+                Pre-generated masked image latents.
+            control_image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+                The ControlNet input condition. Image to control the generation.
+            height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            strength (`float`, *optional*, defaults to 0.6):
+                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1.
+            padding_mask_crop (`int`, *optional*):
+                The size of the padding to use when cropping the mask.
+            num_inference_steps (`int`, *optional*, defaults to 28):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+            control_mode (`int` or `List[int]`, *optional*):
+                The mode for the ControlNet. If multiple ControlNets are used, this should be a list.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original transformer.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or more [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to
+                make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                Additional keyword arguments to be passed to the joint attention mechanism.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising step during the inference.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function.
+            max_sequence_length (`int`, *optional*, defaults to 512):
+                The maximum length of the sequence to be generated.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
+            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
+            images.
+        """
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        global_height = height
+        global_width = width
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            image,
+            mask_image,
+            strength,
+            height,
+            width,
+            output_type=output_type,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            padding_mask_crop=padding_mask_crop,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        dtype = self.transformer.dtype
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        prompt_embeds, pooled_prompt_embeds, text_ids = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+
+        # 4. Preprocess mask and image
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(
+                mask_image, global_width, global_height, pad=padding_mask_crop
+            )
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=global_height, width=global_width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        # 5. Prepare control image
+        num_channels_latents = self.transformer.config.in_channels // 4
+        if isinstance(self.controlnet, FluxControlNetModel):
+            control_image = self.prepare_image(
+                image=control_image,
+                width=height,
+                height=width,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=self.vae.dtype,
+            )
+            height, width = control_image.shape[-2:]
+
+            # vae encode
+            control_image = self.vae.encode(control_image).latent_dist.sample()
+            control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+            # pack
+            height_control_image, width_control_image = control_image.shape[2:]
+            control_image = self._pack_latents(
+                control_image,
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height_control_image,
+                width_control_image,
+            )
+
+            # set control mode
+            if control_mode is not None:
+                control_mode = torch.tensor(control_mode).to(device, dtype=torch.long)
+                control_mode = control_mode.reshape([-1, 1])
+
+        elif isinstance(self.controlnet, FluxMultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=self.vae.dtype,
+                )
+                height, width = control_image_.shape[-2:]
+
+                # vae encode
+                control_image_ = self.vae.encode(control_image_).latent_dist.sample()
+                control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+                # pack
+                height_control_image, width_control_image = control_image_.shape[2:]
+                control_image_ = self._pack_latents(
+                    control_image_,
+                    batch_size * num_images_per_prompt,
+                    num_channels_latents,
+                    height_control_image,
+                    width_control_image,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+
+            # set control mode
+            control_mode_ = []
+            if isinstance(control_mode, list):
+                for cmode in control_mode:
+                    if cmode is None:
+                        control_mode_.append(-1)
+                    else:
+                        control_mode_.append(cmode)
+            control_mode = torch.tensor(control_mode_).to(device, dtype=torch.long)
+            control_mode = control_mode.reshape([-1, 1])
+
+        # 6. Prepare timesteps
+
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        image_seq_len = (int(global_height) // self.vae_scale_factor) * (int(global_width) // self.vae_scale_factor)
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 7. Prepare latent variables
+
+        latents, noise, image_latents, latent_image_ids = self.prepare_latents(
+            init_image,
+            latent_timestep,
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            global_height,
+            global_width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 8. Prepare mask latents
+        mask_condition = self.mask_processor.preprocess(
+            mask_image, height=global_height, width=global_width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+        if masked_image_latents is None:
+            masked_image = init_image * (mask_condition < 0.5)
+        else:
+            masked_image = masked_image_latents
+
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask_condition,
+            masked_image,
+            batch_size,
+            num_channels_latents,
+            num_images_per_prompt,
+            global_height,
+            global_width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        # 9. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+                # predict the noise residual
+                if self.controlnet.config.guidance_embeds:
+                    guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+                    guidance = guidance.expand(latents.shape[0])
+                else:
+                    guidance = None
+
+                controlnet_block_samples, controlnet_single_block_samples = self.controlnet(
+                    hidden_states=latents,
+                    controlnet_cond=control_image,
+                    controlnet_mode=control_mode,
+                    conditioning_scale=controlnet_conditioning_scale,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )
+
+                if self.transformer.config.guidance_embeds:
+                    guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+                    guidance = guidance.expand(latents.shape[0])
+                else:
+                    guidance = None
+
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    controlnet_block_samples=controlnet_block_samples,
+                    controlnet_single_block_samples=controlnet_single_block_samples,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                # For inpainting, we need to apply the mask and add the masked image latents
+                init_latents_proper = image_latents
+                init_mask = mask
+
+                if i < len(timesteps) - 1:
+                    noise_timestep = timesteps[i + 1]
+                    init_latents_proper = self.scheduler.scale_noise(
+                        init_latents_proper, torch.tensor([noise_timestep]), noise
+                    )
+
+                latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                # call the callback, if provided
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        # Post-processing
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = self._unpack_latents(latents, global_height, global_width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return FluxPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/flux/pipeline_flux_img2img.py b/diffusers/src/diffusers/pipelines/flux/pipeline_flux_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..bee4f6ce52e7b5285ac0e8a3fbd64be691518281
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/flux/pipeline_flux_img2img.py
@@ -0,0 +1,844 @@
+# Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FluxLoraLoaderMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.transformers import FluxTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import FluxPipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+
+        >>> from diffusers import FluxImg2ImgPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> device = "cuda"
+        >>> pipe = FluxImg2ImgPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+        >>> pipe = pipe.to(device)
+
+        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+        >>> init_image = load_image(url).resize((1024, 1024))
+
+        >>> prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
+
+        >>> images = pipe(
+        ...     prompt=prompt, image=init_image, num_inference_steps=4, strength=0.95, guidance_scale=0.0
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
+    r"""
+    The Flux pipeline for image inpainting.
+
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
+
+    Args:
+        transformer ([`FluxTransformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`T5TokenizerFast`):
+            Second Tokenizer of class
+            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: T5EncoderModel,
+        tokenizer_2: T5TokenizerFast,
+        transformer: FluxTransformer2DModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = 64
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
+
+        dtype = self.text_encoder_2.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
+
+        # Use pooled output of CLIPTextModel
+        prompt_embeds = prompt_embeds.pooler_output
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # We only use the pooled prompt output from the CLIPTextModel
+            pooled_prompt_embeds = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt_2,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+
+        return prompt_embeds, pooled_prompt_embeds, text_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        strength,
+        height,
+        width,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+
+        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+
+        return latents
+
+    def prepare_latents(
+        self,
+        image,
+        timestep,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+
+        shape = (batch_size, num_channels_latents, height, width)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype), latent_image_ids
+
+        image = image.to(device=device, dtype=dtype)
+        image_latents = self._encode_vae_image(image=image, generator=generator)
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
+
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self.scheduler.scale_noise(image_latents, timestep, noise)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+        return latents, latent_image_ids
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.6,
+        num_inference_steps: int = 28,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
+            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
+            images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            strength,
+            height,
+            width,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        # 2. Preprocess image
+        init_image = self.image_processor.preprocess(image, height=height, width=width)
+        init_image = init_image.to(dtype=torch.float32)
+
+        # 3. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            pooled_prompt_embeds,
+            text_ids,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+
+        # 4.Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels // 4
+
+        latents, latent_image_ids = self.prepare_latents(
+            init_image,
+            latent_timestep,
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # handle guidance
+        if self.transformer.config.guidance_embeds:
+            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+            guidance = guidance.expand(latents.shape[0])
+        else:
+            guidance = None
+
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return FluxPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py b/diffusers/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..4603367002415fcc9eab77154fb6c8e4cfeb4886
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py
@@ -0,0 +1,1009 @@
+# Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FluxLoraLoaderMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.transformers import FluxTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import FluxPipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import FluxInpaintPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = FluxInpaintPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+        >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+        >>> source = load_image(img_url)
+        >>> mask = load_image(mask_url)
+        >>> image = pipe(prompt=prompt, image=source, mask_image=mask).images[0]
+        >>> image.save("flux_inpainting.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
+    r"""
+    The Flux pipeline for image inpainting.
+
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
+
+    Args:
+        transformer ([`FluxTransformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`T5TokenizerFast`):
+            Second Tokenizer of class
+            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: T5EncoderModel,
+        tokenizer_2: T5TokenizerFast,
+        transformer: FluxTransformer2DModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor,
+            vae_latent_channels=self.vae.config.latent_channels,
+            do_normalize=False,
+            do_binarize=True,
+            do_convert_grayscale=True,
+        )
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = 64
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
+
+        dtype = self.text_encoder_2.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
+
+        # Use pooled output of CLIPTextModel
+        prompt_embeds = prompt_embeds.pooler_output
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # We only use the pooled prompt output from the CLIPTextModel
+            pooled_prompt_embeds = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt_2,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+
+        return prompt_embeds, pooled_prompt_embeds, text_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        mask_image,
+        strength,
+        height,
+        width,
+        output_type,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        padding_mask_crop=None,
+        max_sequence_length=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if padding_mask_crop is not None:
+            if not isinstance(image, PIL.Image.Image):
+                raise ValueError(
+                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                )
+            if not isinstance(mask_image, PIL.Image.Image):
+                raise ValueError(
+                    f"The mask image should be a PIL image when inpainting mask crop, but is of type"
+                    f" {type(mask_image)}."
+                )
+            if output_type != "pil":
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+
+        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+
+        return latents
+
+    def prepare_latents(
+        self,
+        image,
+        timestep,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+
+        shape = (batch_size, num_channels_latents, height, width)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+
+        image = image.to(device=device, dtype=dtype)
+        image_latents = self._encode_vae_image(image=image, generator=generator)
+
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
+
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = self.scheduler.scale_noise(image_latents, timestep, noise)
+        else:
+            noise = latents.to(device)
+            latents = noise
+
+        noise = self._pack_latents(noise, batch_size, num_channels_latents, height, width)
+        image_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height, width)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+        return latents, noise, image_latents, latent_image_ids
+
+    def prepare_mask_latents(
+        self,
+        mask,
+        masked_image,
+        batch_size,
+        num_channels_latents,
+        num_images_per_prompt,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+    ):
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(mask, size=(height, width))
+        mask = mask.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        if masked_image.shape[1] == 16:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = retrieve_latents(self.vae.encode(masked_image), generator=generator)
+
+        masked_image_latents = (masked_image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        masked_image_latents = self._pack_latents(
+            masked_image_latents,
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+        )
+        mask = self._pack_latents(
+            mask.repeat(1, num_channels_latents, 1, 1),
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+        )
+
+        return mask, masked_image_latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        masked_image_latents: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        padding_mask_crop: Optional[int] = None,
+        strength: float = 0.6,
+        num_inference_steps: int = 28,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
+                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
+                color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B,
+                H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
+                1)`, or `(H, W)`.
+            mask_image_latent (`torch.Tensor`, `List[torch.Tensor]`):
+                `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask
+                latents tensor will ge generated by `mask_image`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            padding_mask_crop (`int`, *optional*, defaults to `None`):
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
+            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
+            images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            image,
+            mask_image,
+            strength,
+            height,
+            width,
+            output_type=output_type,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            padding_mask_crop=padding_mask_crop,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        # 2. Preprocess mask and image
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        # 3. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            pooled_prompt_embeds,
+            text_ids,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+
+        # 4.Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels // 4
+        num_channels_transformer = self.transformer.config.in_channels
+
+        latents, noise, image_latents, latent_image_ids = self.prepare_latents(
+            init_image,
+            latent_timestep,
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        mask_condition = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+
+        if masked_image_latents is None:
+            masked_image = init_image * (mask_condition < 0.5)
+        else:
+            masked_image = masked_image_latents
+
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask_condition,
+            masked_image,
+            batch_size,
+            num_channels_latents,
+            num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # handle guidance
+        if self.transformer.config.guidance_embeds:
+            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+            guidance = guidance.expand(latents.shape[0])
+        else:
+            guidance = None
+
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                # for 64 channel transformer only.
+                init_latents_proper = image_latents
+                init_mask = mask
+
+                if i < len(timesteps) - 1:
+                    noise_timestep = timesteps[i + 1]
+                    init_latents_proper = self.scheduler.scale_noise(
+                        init_latents_proper, torch.tensor([noise_timestep]), noise
+                    )
+
+                latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return FluxPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/flux/pipeline_output.py b/diffusers/src/diffusers/pipelines/flux/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5d98fb5bf6000eaae99ded8be09c6e6e62cfd1a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/flux/pipeline_output.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class FluxPipelineOutput(BaseOutput):
+    """
+    Output class for Stable Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
diff --git a/diffusers/src/diffusers/pipelines/free_init_utils.py b/diffusers/src/diffusers/pipelines/free_init_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fb67592ca4f8cf6a420bf7acb94f4772d963646
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/free_init_utils.py
@@ -0,0 +1,187 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Tuple, Union
+
+import torch
+import torch.fft as fft
+
+from ..utils.torch_utils import randn_tensor
+
+
+class FreeInitMixin:
+    r"""Mixin class for FreeInit."""
+
+    def enable_free_init(
+        self,
+        num_iters: int = 3,
+        use_fast_sampling: bool = False,
+        method: str = "butterworth",
+        order: int = 4,
+        spatial_stop_frequency: float = 0.25,
+        temporal_stop_frequency: float = 0.25,
+    ):
+        """Enables the FreeInit mechanism as in https://arxiv.org/abs/2312.07537.
+
+        This implementation has been adapted from the [official repository](https://github.com/TianxingWu/FreeInit).
+
+        Args:
+            num_iters (`int`, *optional*, defaults to `3`):
+                Number of FreeInit noise re-initialization iterations.
+            use_fast_sampling (`bool`, *optional*, defaults to `False`):
+                Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables the
+                "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
+            method (`str`, *optional*, defaults to `butterworth`):
+                Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the FreeInit low
+                pass filter.
+            order (`int`, *optional*, defaults to `4`):
+                Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
+                whereas lower values lead to `gaussian` method behaviour.
+            spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
+                Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in the
+                original implementation.
+            temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
+                Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in the
+                original implementation.
+        """
+        self._free_init_num_iters = num_iters
+        self._free_init_use_fast_sampling = use_fast_sampling
+        self._free_init_method = method
+        self._free_init_order = order
+        self._free_init_spatial_stop_frequency = spatial_stop_frequency
+        self._free_init_temporal_stop_frequency = temporal_stop_frequency
+
+    def disable_free_init(self):
+        """Disables the FreeInit mechanism if enabled."""
+        self._free_init_num_iters = None
+
+    @property
+    def free_init_enabled(self):
+        return hasattr(self, "_free_init_num_iters") and self._free_init_num_iters is not None
+
+    def _get_free_init_freq_filter(
+        self,
+        shape: Tuple[int, ...],
+        device: Union[str, torch.dtype],
+        filter_type: str,
+        order: float,
+        spatial_stop_frequency: float,
+        temporal_stop_frequency: float,
+    ) -> torch.Tensor:
+        r"""Returns the FreeInit filter based on filter type and other input conditions."""
+
+        time, height, width = shape[-3], shape[-2], shape[-1]
+        mask = torch.zeros(shape)
+
+        if spatial_stop_frequency == 0 or temporal_stop_frequency == 0:
+            return mask
+
+        if filter_type == "butterworth":
+
+            def retrieve_mask(x):
+                return 1 / (1 + (x / spatial_stop_frequency**2) ** order)
+        elif filter_type == "gaussian":
+
+            def retrieve_mask(x):
+                return math.exp(-1 / (2 * spatial_stop_frequency**2) * x)
+        elif filter_type == "ideal":
+
+            def retrieve_mask(x):
+                return 1 if x <= spatial_stop_frequency * 2 else 0
+        else:
+            raise NotImplementedError("`filter_type` must be one of gaussian, butterworth or ideal")
+
+        for t in range(time):
+            for h in range(height):
+                for w in range(width):
+                    d_square = (
+                        ((spatial_stop_frequency / temporal_stop_frequency) * (2 * t / time - 1)) ** 2
+                        + (2 * h / height - 1) ** 2
+                        + (2 * w / width - 1) ** 2
+                    )
+                    mask[..., t, h, w] = retrieve_mask(d_square)
+
+        return mask.to(device)
+
+    def _apply_freq_filter(self, x: torch.Tensor, noise: torch.Tensor, low_pass_filter: torch.Tensor) -> torch.Tensor:
+        r"""Noise reinitialization."""
+        # FFT
+        x_freq = fft.fftn(x, dim=(-3, -2, -1))
+        x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1))
+        noise_freq = fft.fftn(noise, dim=(-3, -2, -1))
+        noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1))
+
+        # frequency mix
+        high_pass_filter = 1 - low_pass_filter
+        x_freq_low = x_freq * low_pass_filter
+        noise_freq_high = noise_freq * high_pass_filter
+        x_freq_mixed = x_freq_low + noise_freq_high  # mix in freq domain
+
+        # IFFT
+        x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1))
+        x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real
+
+        return x_mixed
+
+    def _apply_free_init(
+        self,
+        latents: torch.Tensor,
+        free_init_iteration: int,
+        num_inference_steps: int,
+        device: torch.device,
+        dtype: torch.dtype,
+        generator: torch.Generator,
+    ):
+        if free_init_iteration == 0:
+            self._free_init_initial_noise = latents.detach().clone()
+        else:
+            latent_shape = latents.shape
+
+            free_init_filter_shape = (1, *latent_shape[1:])
+            free_init_freq_filter = self._get_free_init_freq_filter(
+                shape=free_init_filter_shape,
+                device=device,
+                filter_type=self._free_init_method,
+                order=self._free_init_order,
+                spatial_stop_frequency=self._free_init_spatial_stop_frequency,
+                temporal_stop_frequency=self._free_init_temporal_stop_frequency,
+            )
+
+            current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
+            diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long()
+
+            z_t = self.scheduler.add_noise(
+                original_samples=latents, noise=self._free_init_initial_noise, timesteps=diffuse_timesteps.to(device)
+            ).to(dtype=torch.float32)
+
+            z_rand = randn_tensor(
+                shape=latent_shape,
+                generator=generator,
+                device=device,
+                dtype=torch.float32,
+            )
+            latents = self._apply_freq_filter(z_t, z_rand, low_pass_filter=free_init_freq_filter)
+            latents = latents.to(dtype)
+
+        # Coarse-to-Fine Sampling for faster inference (can lead to lower quality)
+        if self._free_init_use_fast_sampling:
+            num_inference_steps = max(
+                1, int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1))
+            )
+
+        if num_inference_steps > 0:
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+        return latents, self.scheduler.timesteps
diff --git a/diffusers/src/diffusers/pipelines/free_noise_utils.py b/diffusers/src/diffusers/pipelines/free_noise_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc0071a494e30d135f80b611def70c3d967ecf90
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/free_noise_utils.py
@@ -0,0 +1,596 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ..models.attention import BasicTransformerBlock, FreeNoiseTransformerBlock
+from ..models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
+from ..models.transformers.transformer_2d import Transformer2DModel
+from ..models.unets.unet_motion_model import (
+    AnimateDiffTransformer3D,
+    CrossAttnDownBlockMotion,
+    DownBlockMotion,
+    UpBlockMotion,
+)
+from ..pipelines.pipeline_utils import DiffusionPipeline
+from ..utils import logging
+from ..utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class SplitInferenceModule(nn.Module):
+    r"""
+    A wrapper module class that splits inputs along a specified dimension before performing a forward pass.
+
+    This module is useful when you need to perform inference on large tensors in a memory-efficient way by breaking
+    them into smaller chunks, processing each chunk separately, and then reassembling the results.
+
+    Args:
+        module (`nn.Module`):
+            The underlying PyTorch module that will be applied to each chunk of split inputs.
+        split_size (`int`, defaults to `1`):
+            The size of each chunk after splitting the input tensor.
+        split_dim (`int`, defaults to `0`):
+            The dimension along which the input tensors are split.
+        input_kwargs_to_split (`List[str]`, defaults to `["hidden_states"]`):
+            A list of keyword arguments (strings) that represent the input tensors to be split.
+
+    Workflow:
+        1. The keyword arguments specified in `input_kwargs_to_split` are split into smaller chunks using
+        `torch.split()` along the dimension `split_dim` and with a chunk size of `split_size`.
+        2. The `module` is invoked once for each split with both the split inputs and any unchanged arguments
+        that were passed.
+        3. The output tensors from each split are concatenated back together along `split_dim` before returning.
+
+    Example:
+        ```python
+        >>> import torch
+        >>> import torch.nn as nn
+
+        >>> model = nn.Linear(1000, 1000)
+        >>> split_module = SplitInferenceModule(model, split_size=2, split_dim=0, input_kwargs_to_split=["input"])
+
+        >>> input_tensor = torch.randn(42, 1000)
+        >>> # Will split the tensor into 21 slices of shape [2, 1000].
+        >>> output = split_module(input=input_tensor)
+        ```
+
+    It is also possible to nest `SplitInferenceModule` across different split dimensions for more complex
+    multi-dimensional splitting.
+    """
+
+    def __init__(
+        self,
+        module: nn.Module,
+        split_size: int = 1,
+        split_dim: int = 0,
+        input_kwargs_to_split: List[str] = ["hidden_states"],
+    ) -> None:
+        super().__init__()
+
+        self.module = module
+        self.split_size = split_size
+        self.split_dim = split_dim
+        self.input_kwargs_to_split = set(input_kwargs_to_split)
+
+    def forward(self, *args, **kwargs) -> Union[torch.Tensor, Tuple[torch.Tensor]]:
+        r"""Forward method for the `SplitInferenceModule`.
+
+        This method processes the input by splitting specified keyword arguments along a given dimension, running the
+        underlying module on each split, and then concatenating the results. The splitting is controlled by the
+        `split_size` and `split_dim` parameters specified during initialization.
+
+        Args:
+            *args (`Any`):
+                Positional arguments that are passed directly to the `module` without modification.
+            **kwargs (`Dict[str, torch.Tensor]`):
+                Keyword arguments passed to the underlying `module`. Only keyword arguments whose names match the
+                entries in `input_kwargs_to_split` and are of type `torch.Tensor` will be split. The remaining keyword
+                arguments are passed unchanged.
+
+        Returns:
+            `Union[torch.Tensor, Tuple[torch.Tensor]]`:
+                The outputs obtained from `SplitInferenceModule` are the same as if the underlying module was inferred
+                without it.
+                - If the underlying module returns a single tensor, the result will be a single concatenated tensor
+                along the same `split_dim` after processing all splits.
+                - If the underlying module returns a tuple of tensors, each element of the tuple will be concatenated
+                along the `split_dim` across all splits, and the final result will be a tuple of concatenated tensors.
+        """
+        split_inputs = {}
+
+        # 1. Split inputs that were specified during initialization and also present in passed kwargs
+        for key in list(kwargs.keys()):
+            if key not in self.input_kwargs_to_split or not torch.is_tensor(kwargs[key]):
+                continue
+            split_inputs[key] = torch.split(kwargs[key], self.split_size, self.split_dim)
+            kwargs.pop(key)
+
+        # 2. Invoke forward pass across each split
+        results = []
+        for split_input in zip(*split_inputs.values()):
+            inputs = dict(zip(split_inputs.keys(), split_input))
+            inputs.update(kwargs)
+
+            intermediate_tensor_or_tensor_tuple = self.module(*args, **inputs)
+            results.append(intermediate_tensor_or_tensor_tuple)
+
+        # 3. Concatenate split restuls to obtain final outputs
+        if isinstance(results[0], torch.Tensor):
+            return torch.cat(results, dim=self.split_dim)
+        elif isinstance(results[0], tuple):
+            return tuple([torch.cat(x, dim=self.split_dim) for x in zip(*results)])
+        else:
+            raise ValueError(
+                "In order to use the SplitInferenceModule, it is necessary for the underlying `module` to either return a torch.Tensor or a tuple of torch.Tensor's."
+            )
+
+
+class AnimateDiffFreeNoiseMixin:
+    r"""Mixin class for [FreeNoise](https://arxiv.org/abs/2310.15169)."""
+
+    def _enable_free_noise_in_block(self, block: Union[CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion]):
+        r"""Helper function to enable FreeNoise in transformer blocks."""
+
+        for motion_module in block.motion_modules:
+            num_transformer_blocks = len(motion_module.transformer_blocks)
+
+            for i in range(num_transformer_blocks):
+                if isinstance(motion_module.transformer_blocks[i], FreeNoiseTransformerBlock):
+                    motion_module.transformer_blocks[i].set_free_noise_properties(
+                        self._free_noise_context_length,
+                        self._free_noise_context_stride,
+                        self._free_noise_weighting_scheme,
+                    )
+                else:
+                    assert isinstance(motion_module.transformer_blocks[i], BasicTransformerBlock)
+                    basic_transfomer_block = motion_module.transformer_blocks[i]
+
+                    motion_module.transformer_blocks[i] = FreeNoiseTransformerBlock(
+                        dim=basic_transfomer_block.dim,
+                        num_attention_heads=basic_transfomer_block.num_attention_heads,
+                        attention_head_dim=basic_transfomer_block.attention_head_dim,
+                        dropout=basic_transfomer_block.dropout,
+                        cross_attention_dim=basic_transfomer_block.cross_attention_dim,
+                        activation_fn=basic_transfomer_block.activation_fn,
+                        attention_bias=basic_transfomer_block.attention_bias,
+                        only_cross_attention=basic_transfomer_block.only_cross_attention,
+                        double_self_attention=basic_transfomer_block.double_self_attention,
+                        positional_embeddings=basic_transfomer_block.positional_embeddings,
+                        num_positional_embeddings=basic_transfomer_block.num_positional_embeddings,
+                        context_length=self._free_noise_context_length,
+                        context_stride=self._free_noise_context_stride,
+                        weighting_scheme=self._free_noise_weighting_scheme,
+                    ).to(device=self.device, dtype=self.dtype)
+
+                    motion_module.transformer_blocks[i].load_state_dict(
+                        basic_transfomer_block.state_dict(), strict=True
+                    )
+                    motion_module.transformer_blocks[i].set_chunk_feed_forward(
+                        basic_transfomer_block._chunk_size, basic_transfomer_block._chunk_dim
+                    )
+
+    def _disable_free_noise_in_block(self, block: Union[CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion]):
+        r"""Helper function to disable FreeNoise in transformer blocks."""
+
+        for motion_module in block.motion_modules:
+            num_transformer_blocks = len(motion_module.transformer_blocks)
+
+            for i in range(num_transformer_blocks):
+                if isinstance(motion_module.transformer_blocks[i], FreeNoiseTransformerBlock):
+                    free_noise_transfomer_block = motion_module.transformer_blocks[i]
+
+                    motion_module.transformer_blocks[i] = BasicTransformerBlock(
+                        dim=free_noise_transfomer_block.dim,
+                        num_attention_heads=free_noise_transfomer_block.num_attention_heads,
+                        attention_head_dim=free_noise_transfomer_block.attention_head_dim,
+                        dropout=free_noise_transfomer_block.dropout,
+                        cross_attention_dim=free_noise_transfomer_block.cross_attention_dim,
+                        activation_fn=free_noise_transfomer_block.activation_fn,
+                        attention_bias=free_noise_transfomer_block.attention_bias,
+                        only_cross_attention=free_noise_transfomer_block.only_cross_attention,
+                        double_self_attention=free_noise_transfomer_block.double_self_attention,
+                        positional_embeddings=free_noise_transfomer_block.positional_embeddings,
+                        num_positional_embeddings=free_noise_transfomer_block.num_positional_embeddings,
+                    ).to(device=self.device, dtype=self.dtype)
+
+                    motion_module.transformer_blocks[i].load_state_dict(
+                        free_noise_transfomer_block.state_dict(), strict=True
+                    )
+                    motion_module.transformer_blocks[i].set_chunk_feed_forward(
+                        free_noise_transfomer_block._chunk_size, free_noise_transfomer_block._chunk_dim
+                    )
+
+    def _check_inputs_free_noise(
+        self,
+        prompt,
+        negative_prompt,
+        prompt_embeds,
+        negative_prompt_embeds,
+        num_frames,
+    ) -> None:
+        if not isinstance(prompt, (str, dict)):
+            raise ValueError(f"Expected `prompt` to have type `str` or `dict` but found {type(prompt)=}")
+
+        if negative_prompt is not None:
+            if not isinstance(negative_prompt, (str, dict)):
+                raise ValueError(
+                    f"Expected `negative_prompt` to have type `str` or `dict` but found {type(negative_prompt)=}"
+                )
+
+        if prompt_embeds is not None or negative_prompt_embeds is not None:
+            raise ValueError("`prompt_embeds` and `negative_prompt_embeds` is not supported in FreeNoise yet.")
+
+        frame_indices = [isinstance(x, int) for x in prompt.keys()]
+        frame_prompts = [isinstance(x, str) for x in prompt.values()]
+        min_frame = min(list(prompt.keys()))
+        max_frame = max(list(prompt.keys()))
+
+        if not all(frame_indices):
+            raise ValueError("Expected integer keys in `prompt` dict for FreeNoise.")
+        if not all(frame_prompts):
+            raise ValueError("Expected str values in `prompt` dict for FreeNoise.")
+        if min_frame != 0:
+            raise ValueError("The minimum frame index in `prompt` dict must be 0 as a starting prompt is necessary.")
+        if max_frame >= num_frames:
+            raise ValueError(
+                f"The maximum frame index in `prompt` dict must be lesser than {num_frames=} and follow 0-based indexing."
+            )
+
+    def _encode_prompt_free_noise(
+        self,
+        prompt: Union[str, Dict[int, str]],
+        num_frames: int,
+        device: torch.device,
+        num_videos_per_prompt: int,
+        do_classifier_free_guidance: bool,
+        negative_prompt: Optional[Union[str, Dict[int, str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ) -> torch.Tensor:
+        if negative_prompt is None:
+            negative_prompt = ""
+
+        # Ensure that we have a dictionary of prompts
+        if isinstance(prompt, str):
+            prompt = {0: prompt}
+        if isinstance(negative_prompt, str):
+            negative_prompt = {0: negative_prompt}
+
+        self._check_inputs_free_noise(prompt, negative_prompt, prompt_embeds, negative_prompt_embeds, num_frames)
+
+        # Sort the prompts based on frame indices
+        prompt = dict(sorted(prompt.items()))
+        negative_prompt = dict(sorted(negative_prompt.items()))
+
+        # Ensure that we have a prompt for the last frame index
+        prompt[num_frames - 1] = prompt[list(prompt.keys())[-1]]
+        negative_prompt[num_frames - 1] = negative_prompt[list(negative_prompt.keys())[-1]]
+
+        frame_indices = list(prompt.keys())
+        frame_prompts = list(prompt.values())
+        frame_negative_indices = list(negative_prompt.keys())
+        frame_negative_prompts = list(negative_prompt.values())
+
+        # Generate and interpolate positive prompts
+        prompt_embeds, _ = self.encode_prompt(
+            prompt=frame_prompts,
+            device=device,
+            num_images_per_prompt=num_videos_per_prompt,
+            do_classifier_free_guidance=False,
+            negative_prompt=None,
+            prompt_embeds=None,
+            negative_prompt_embeds=None,
+            lora_scale=lora_scale,
+            clip_skip=clip_skip,
+        )
+
+        shape = (num_frames, *prompt_embeds.shape[1:])
+        prompt_interpolation_embeds = prompt_embeds.new_zeros(shape)
+
+        for i in range(len(frame_indices) - 1):
+            start_frame = frame_indices[i]
+            end_frame = frame_indices[i + 1]
+            start_tensor = prompt_embeds[i].unsqueeze(0)
+            end_tensor = prompt_embeds[i + 1].unsqueeze(0)
+
+            prompt_interpolation_embeds[start_frame : end_frame + 1] = self._free_noise_prompt_interpolation_callback(
+                start_frame, end_frame, start_tensor, end_tensor
+            )
+
+        # Generate and interpolate negative prompts
+        negative_prompt_embeds = None
+        negative_prompt_interpolation_embeds = None
+
+        if do_classifier_free_guidance:
+            _, negative_prompt_embeds = self.encode_prompt(
+                prompt=[""] * len(frame_negative_prompts),
+                device=device,
+                num_images_per_prompt=num_videos_per_prompt,
+                do_classifier_free_guidance=True,
+                negative_prompt=frame_negative_prompts,
+                prompt_embeds=None,
+                negative_prompt_embeds=None,
+                lora_scale=lora_scale,
+                clip_skip=clip_skip,
+            )
+
+            negative_prompt_interpolation_embeds = negative_prompt_embeds.new_zeros(shape)
+
+            for i in range(len(frame_negative_indices) - 1):
+                start_frame = frame_negative_indices[i]
+                end_frame = frame_negative_indices[i + 1]
+                start_tensor = negative_prompt_embeds[i].unsqueeze(0)
+                end_tensor = negative_prompt_embeds[i + 1].unsqueeze(0)
+
+                negative_prompt_interpolation_embeds[
+                    start_frame : end_frame + 1
+                ] = self._free_noise_prompt_interpolation_callback(start_frame, end_frame, start_tensor, end_tensor)
+
+        prompt_embeds = prompt_interpolation_embeds
+        negative_prompt_embeds = negative_prompt_interpolation_embeds
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def _prepare_latents_free_noise(
+        self,
+        batch_size: int,
+        num_channels_latents: int,
+        num_frames: int,
+        height: int,
+        width: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+    ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        context_num_frames = (
+            self._free_noise_context_length if self._free_noise_context_length == "repeat_context" else num_frames
+        )
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            context_num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            if self._free_noise_noise_type == "random":
+                return latents
+        else:
+            if latents.size(2) == num_frames:
+                return latents
+            elif latents.size(2) != self._free_noise_context_length:
+                raise ValueError(
+                    f"You have passed `latents` as a parameter to FreeNoise. The expected number of frames is either {num_frames} or {self._free_noise_context_length}, but found {latents.size(2)}"
+                )
+            latents = latents.to(device)
+
+        if self._free_noise_noise_type == "shuffle_context":
+            for i in range(self._free_noise_context_length, num_frames, self._free_noise_context_stride):
+                # ensure window is within bounds
+                window_start = max(0, i - self._free_noise_context_length)
+                window_end = min(num_frames, window_start + self._free_noise_context_stride)
+                window_length = window_end - window_start
+
+                if window_length == 0:
+                    break
+
+                indices = torch.LongTensor(list(range(window_start, window_end)))
+                shuffled_indices = indices[torch.randperm(window_length, generator=generator)]
+
+                current_start = i
+                current_end = min(num_frames, current_start + window_length)
+                if current_end == current_start + window_length:
+                    # batch of frames perfectly fits the window
+                    latents[:, :, current_start:current_end] = latents[:, :, shuffled_indices]
+                else:
+                    # handle the case where the last batch of frames does not fit perfectly with the window
+                    prefix_length = current_end - current_start
+                    shuffled_indices = shuffled_indices[:prefix_length]
+                    latents[:, :, current_start:current_end] = latents[:, :, shuffled_indices]
+
+        elif self._free_noise_noise_type == "repeat_context":
+            num_repeats = (num_frames + self._free_noise_context_length - 1) // self._free_noise_context_length
+            latents = torch.cat([latents] * num_repeats, dim=2)
+
+        latents = latents[:, :, :num_frames]
+        return latents
+
+    def _lerp(
+        self, start_index: int, end_index: int, start_tensor: torch.Tensor, end_tensor: torch.Tensor
+    ) -> torch.Tensor:
+        num_indices = end_index - start_index + 1
+        interpolated_tensors = []
+
+        for i in range(num_indices):
+            alpha = i / (num_indices - 1)
+            interpolated_tensor = (1 - alpha) * start_tensor + alpha * end_tensor
+            interpolated_tensors.append(interpolated_tensor)
+
+        interpolated_tensors = torch.cat(interpolated_tensors)
+        return interpolated_tensors
+
+    def enable_free_noise(
+        self,
+        context_length: Optional[int] = 16,
+        context_stride: int = 4,
+        weighting_scheme: str = "pyramid",
+        noise_type: str = "shuffle_context",
+        prompt_interpolation_callback: Optional[
+            Callable[[DiffusionPipeline, int, int, torch.Tensor, torch.Tensor], torch.Tensor]
+        ] = None,
+    ) -> None:
+        r"""
+        Enable long video generation using FreeNoise.
+
+        Args:
+            context_length (`int`, defaults to `16`, *optional*):
+                The number of video frames to process at once. It's recommended to set this to the maximum frames the
+                Motion Adapter was trained with (usually 16/24/32). If `None`, the default value from the motion
+                adapter config is used.
+            context_stride (`int`, *optional*):
+                Long videos are generated by processing many frames. FreeNoise processes these frames in sliding
+                windows of size `context_length`. Context stride allows you to specify how many frames to skip between
+                each window. For example, a context length of 16 and context stride of 4 would process 24 frames as:
+                    [0, 15], [4, 19], [8, 23] (0-based indexing)
+            weighting_scheme (`str`, defaults to `pyramid`):
+                Weighting scheme for averaging latents after accumulation in FreeNoise blocks. The following weighting
+                schemes are supported currently:
+                    - "flat"
+                       Performs weighting averaging with a flat weight pattern: [1, 1, 1, 1, 1].
+                    - "pyramid"
+                        Performs weighted averaging with a pyramid like weight pattern: [1, 2, 3, 2, 1].
+                    - "delayed_reverse_sawtooth"
+                        Performs weighted averaging with low weights for earlier frames and high-to-low weights for
+                        later frames: [0.01, 0.01, 3, 2, 1].
+            noise_type (`str`, defaults to "shuffle_context"):
+                Must be one of ["shuffle_context", "repeat_context", "random"].
+                    - "shuffle_context"
+                        Shuffles a fixed batch of `context_length` latents to create a final latent of size
+                        `num_frames`. This is usually the best setting for most generation scenarious. However, there
+                        might be visible repetition noticeable in the kinds of motion/animation generated.
+                    - "repeated_context"
+                        Repeats a fixed batch of `context_length` latents to create a final latent of size
+                        `num_frames`.
+                    - "random"
+                        The final latents are random without any repetition.
+        """
+
+        allowed_weighting_scheme = ["flat", "pyramid", "delayed_reverse_sawtooth"]
+        allowed_noise_type = ["shuffle_context", "repeat_context", "random"]
+
+        if context_length > self.motion_adapter.config.motion_max_seq_length:
+            logger.warning(
+                f"You have set {context_length=} which is greater than {self.motion_adapter.config.motion_max_seq_length=}. This can lead to bad generation results."
+            )
+        if weighting_scheme not in allowed_weighting_scheme:
+            raise ValueError(
+                f"The parameter `weighting_scheme` must be one of {allowed_weighting_scheme}, but got {weighting_scheme=}"
+            )
+        if noise_type not in allowed_noise_type:
+            raise ValueError(f"The parameter `noise_type` must be one of {allowed_noise_type}, but got {noise_type=}")
+
+        self._free_noise_context_length = context_length or self.motion_adapter.config.motion_max_seq_length
+        self._free_noise_context_stride = context_stride
+        self._free_noise_weighting_scheme = weighting_scheme
+        self._free_noise_noise_type = noise_type
+        self._free_noise_prompt_interpolation_callback = prompt_interpolation_callback or self._lerp
+
+        if hasattr(self.unet.mid_block, "motion_modules"):
+            blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
+        else:
+            blocks = [*self.unet.down_blocks, *self.unet.up_blocks]
+
+        for block in blocks:
+            self._enable_free_noise_in_block(block)
+
+    def disable_free_noise(self) -> None:
+        r"""Disable the FreeNoise sampling mechanism."""
+        self._free_noise_context_length = None
+
+        if hasattr(self.unet.mid_block, "motion_modules"):
+            blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
+        else:
+            blocks = [*self.unet.down_blocks, *self.unet.up_blocks]
+
+        blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
+        for block in blocks:
+            self._disable_free_noise_in_block(block)
+
+    def _enable_split_inference_motion_modules_(
+        self, motion_modules: List[AnimateDiffTransformer3D], spatial_split_size: int
+    ) -> None:
+        for motion_module in motion_modules:
+            motion_module.proj_in = SplitInferenceModule(motion_module.proj_in, spatial_split_size, 0, ["input"])
+
+            for i in range(len(motion_module.transformer_blocks)):
+                motion_module.transformer_blocks[i] = SplitInferenceModule(
+                    motion_module.transformer_blocks[i],
+                    spatial_split_size,
+                    0,
+                    ["hidden_states", "encoder_hidden_states"],
+                )
+
+            motion_module.proj_out = SplitInferenceModule(motion_module.proj_out, spatial_split_size, 0, ["input"])
+
+    def _enable_split_inference_attentions_(
+        self, attentions: List[Transformer2DModel], temporal_split_size: int
+    ) -> None:
+        for i in range(len(attentions)):
+            attentions[i] = SplitInferenceModule(
+                attentions[i], temporal_split_size, 0, ["hidden_states", "encoder_hidden_states"]
+            )
+
+    def _enable_split_inference_resnets_(self, resnets: List[ResnetBlock2D], temporal_split_size: int) -> None:
+        for i in range(len(resnets)):
+            resnets[i] = SplitInferenceModule(resnets[i], temporal_split_size, 0, ["input_tensor", "temb"])
+
+    def _enable_split_inference_samplers_(
+        self, samplers: Union[List[Downsample2D], List[Upsample2D]], temporal_split_size: int
+    ) -> None:
+        for i in range(len(samplers)):
+            samplers[i] = SplitInferenceModule(samplers[i], temporal_split_size, 0, ["hidden_states"])
+
+    def enable_free_noise_split_inference(self, spatial_split_size: int = 256, temporal_split_size: int = 16) -> None:
+        r"""
+        Enable FreeNoise memory optimizations by utilizing
+        [`~diffusers.pipelines.free_noise_utils.SplitInferenceModule`] across different intermediate modeling blocks.
+
+        Args:
+            spatial_split_size (`int`, defaults to `256`):
+                The split size across spatial dimensions for internal blocks. This is used in facilitating split
+                inference across the effective batch dimension (`[B x H x W, F, C]`) of intermediate tensors in motion
+                modeling blocks.
+            temporal_split_size (`int`, defaults to `16`):
+                The split size across temporal dimensions for internal blocks. This is used in facilitating split
+                inference across the effective batch dimension (`[B x F, H x W, C]`) of intermediate tensors in spatial
+                attention, resnets, downsampling and upsampling blocks.
+        """
+        # TODO(aryan): Discuss on what's the best way to provide more control to users
+        blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
+        for block in blocks:
+            if getattr(block, "motion_modules", None) is not None:
+                self._enable_split_inference_motion_modules_(block.motion_modules, spatial_split_size)
+            if getattr(block, "attentions", None) is not None:
+                self._enable_split_inference_attentions_(block.attentions, temporal_split_size)
+            if getattr(block, "resnets", None) is not None:
+                self._enable_split_inference_resnets_(block.resnets, temporal_split_size)
+            if getattr(block, "downsamplers", None) is not None:
+                self._enable_split_inference_samplers_(block.downsamplers, temporal_split_size)
+            if getattr(block, "upsamplers", None) is not None:
+                self._enable_split_inference_samplers_(block.upsamplers, temporal_split_size)
+
+    @property
+    def free_noise_enabled(self):
+        return hasattr(self, "_free_noise_context_length") and self._free_noise_context_length is not None
diff --git a/diffusers/src/diffusers/pipelines/hunyuandit/__init__.py b/diffusers/src/diffusers/pipelines/hunyuandit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8337399106f0585d15fa0a35f607baa2c04b203b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/hunyuandit/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_hunyuandit"] = ["HunyuanDiTPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_hunyuandit import HunyuanDiTPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py b/diffusers/src/diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py
new file mode 100644
index 0000000000000000000000000000000000000000..86089abc07b4529a93ca48a5434f78ed26950539
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py
@@ -0,0 +1,900 @@
+# Copyright 2024 HunyuanDiT Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import BertModel, BertTokenizer, CLIPImageProcessor, MT5Tokenizer, T5EncoderModel
+
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import VaeImageProcessor
+from ...models import AutoencoderKL, HunyuanDiT2DModel
+from ...models.embeddings import get_2d_rotary_pos_embed
+from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import HunyuanDiTPipeline
+
+        >>> pipe = HunyuanDiTPipeline.from_pretrained(
+        ...     "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> # You may also use English prompt as HunyuanDiT supports both English and Chinese
+        >>> # prompt = "An astronaut riding a horse"
+        >>> prompt = "一个宇航员在骑马"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+STANDARD_RATIO = np.array(
+    [
+        1.0,  # 1:1
+        4.0 / 3.0,  # 4:3
+        3.0 / 4.0,  # 3:4
+        16.0 / 9.0,  # 16:9
+        9.0 / 16.0,  # 9:16
+    ]
+)
+STANDARD_SHAPE = [
+    [(1024, 1024), (1280, 1280)],  # 1:1
+    [(1024, 768), (1152, 864), (1280, 960)],  # 4:3
+    [(768, 1024), (864, 1152), (960, 1280)],  # 3:4
+    [(1280, 768)],  # 16:9
+    [(768, 1280)],  # 9:16
+]
+STANDARD_AREA = [np.array([w * h for w, h in shapes]) for shapes in STANDARD_SHAPE]
+SUPPORTED_SHAPE = [
+    (1024, 1024),
+    (1280, 1280),  # 1:1
+    (1024, 768),
+    (1152, 864),
+    (1280, 960),  # 4:3
+    (768, 1024),
+    (864, 1152),
+    (960, 1280),  # 3:4
+    (1280, 768),  # 16:9
+    (768, 1280),  # 9:16
+]
+
+
+def map_to_standard_shapes(target_width, target_height):
+    target_ratio = target_width / target_height
+    closest_ratio_idx = np.argmin(np.abs(STANDARD_RATIO - target_ratio))
+    closest_area_idx = np.argmin(np.abs(STANDARD_AREA[closest_ratio_idx] - target_width * target_height))
+    width, height = STANDARD_SHAPE[closest_ratio_idx][closest_area_idx]
+    return width, height
+
+
+def get_resize_crop_region_for_grid(src, tgt_size):
+    th = tw = tgt_size
+    h, w = src
+
+    r = h / w
+
+    # resize
+    if r > 1:
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class HunyuanDiTPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for English/Chinese-to-image generation using HunyuanDiT.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    HunyuanDiT uses two text encoders: [mT5](https://huggingface.co/google/mt5-base) and [bilingual CLIP](fine-tuned by
+    ourselves)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. We use
+            `sdxl-vae-fp16-fix`.
+        text_encoder (Optional[`~transformers.BertModel`, `~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+            HunyuanDiT uses a fine-tuned [bilingual CLIP].
+        tokenizer (Optional[`~transformers.BertTokenizer`, `~transformers.CLIPTokenizer`]):
+            A `BertTokenizer` or `CLIPTokenizer` to tokenize text.
+        transformer ([`HunyuanDiT2DModel`]):
+            The HunyuanDiT model designed by Tencent Hunyuan.
+        text_encoder_2 (`T5EncoderModel`):
+            The mT5 embedder. Specifically, it is 't5-v1_1-xxl'.
+        tokenizer_2 (`MT5Tokenizer`):
+            The tokenizer for the mT5 embedder.
+        scheduler ([`DDPMScheduler`]):
+            A scheduler to be used in combination with HunyuanDiT to denoise the encoded image latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = [
+        "safety_checker",
+        "feature_extractor",
+        "text_encoder_2",
+        "tokenizer_2",
+        "text_encoder",
+        "tokenizer",
+    ]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "prompt_embeds_2",
+        "negative_prompt_embeds_2",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: BertModel,
+        tokenizer: BertTokenizer,
+        transformer: HunyuanDiT2DModel,
+        scheduler: DDPMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+        text_encoder_2=T5EncoderModel,
+        tokenizer_2=MT5Tokenizer,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            text_encoder_2=text_encoder_2,
+        )
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )
+
+    def encode_prompt(
+        self,
+        prompt: str,
+        device: torch.device = None,
+        dtype: torch.dtype = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        max_sequence_length: Optional[int] = None,
+        text_encoder_index: int = 0,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            dtype (`torch.dtype`):
+                torch dtype
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
+            max_sequence_length (`int`, *optional*): maximum sequence length to use for the prompt.
+            text_encoder_index (`int`, *optional*):
+                Index of the text encoder to use. `0` for clip and `1` for T5.
+        """
+        if dtype is None:
+            if self.text_encoder_2 is not None:
+                dtype = self.text_encoder_2.dtype
+            elif self.transformer is not None:
+                dtype = self.transformer.dtype
+            else:
+                dtype = None
+
+        if device is None:
+            device = self._execution_device
+
+        tokenizers = [self.tokenizer, self.tokenizer_2]
+        text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = tokenizers[text_encoder_index]
+        text_encoder = text_encoders[text_encoder_index]
+
+        if max_sequence_length is None:
+            if text_encoder_index == 0:
+                max_length = 77
+            if text_encoder_index == 1:
+                max_length = 256
+        else:
+            max_length = max_sequence_length
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_attention_mask = text_inputs.attention_mask.to(device)
+            prompt_embeds = text_encoder(
+                text_input_ids.to(device),
+                attention_mask=prompt_attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+            prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            negative_prompt_attention_mask = uncond_input.attention_mask.to(device)
+            negative_prompt_embeds = text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=negative_prompt_attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds, negative_prompt_embeds, prompt_attention_mask, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+        prompt_embeds_2=None,
+        negative_prompt_embeds_2=None,
+        prompt_attention_mask_2=None,
+        negative_prompt_attention_mask_2=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is None and prompt_embeds_2 is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds_2`. Cannot leave both `prompt` and `prompt_embeds_2` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if prompt_embeds_2 is not None and prompt_attention_mask_2 is None:
+            raise ValueError("Must provide `prompt_attention_mask_2` when specifying `prompt_embeds_2`.")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if negative_prompt_embeds_2 is not None and negative_prompt_attention_mask_2 is None:
+            raise ValueError(
+                "Must provide `negative_prompt_attention_mask_2` when specifying `negative_prompt_embeds_2`."
+            )
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if prompt_embeds_2 is not None and negative_prompt_embeds_2 is not None:
+            if prompt_embeds_2.shape != negative_prompt_embeds_2.shape:
+                raise ValueError(
+                    "`prompt_embeds_2` and `negative_prompt_embeds_2` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds_2` {prompt_embeds_2.shape} != `negative_prompt_embeds_2`"
+                    f" {negative_prompt_embeds_2.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_2: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_2: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        prompt_attention_mask_2: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask_2: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = (1024, 1024),
+        target_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        use_resolution_binning: bool = True,
+    ):
+        r"""
+        The call function to the pipeline for generation with HunyuanDiT.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`):
+                The height in pixels of the generated image.
+            width (`int`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            negative_prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
+            prompt_attention_mask_2 (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds_2` is passed directly.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
+            negative_prompt_attention_mask_2 (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds_2` is passed directly.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback_on_step_end (`Callable[[int, int, Dict], None]`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A callback function or a list of callback functions to be called at the end of each denoising step.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                A list of tensor inputs that should be passed to the callback function. If not defined, all tensor
+                inputs will be passed.
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Rescale the noise_cfg according to `guidance_rescale`. Based on findings of [Common Diffusion Noise
+                Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+            original_size (`Tuple[int, int]`, *optional*, defaults to `(1024, 1024)`):
+                The original size of the image. Used to calculate the time ids.
+            target_size (`Tuple[int, int]`, *optional*):
+                The target size of the image. Used to calculate the time ids.
+            crops_coords_top_left (`Tuple[int, int]`, *optional*, defaults to `(0, 0)`):
+                The top left coordinates of the crop. Used to calculate the time ids.
+            use_resolution_binning (`bool`, *optional*, defaults to `True`):
+                Whether to use resolution binning or not. If `True`, the input resolution will be mapped to the closest
+                standard resolution. Supported resolutions are 1024x1024, 1280x1280, 1024x768, 1152x864, 1280x960,
+                768x1024, 864x1152, 960x1280, 1280x768, and 768x1280. It is recommended to set this to `True`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. default height and width
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        height = int((height // 16) * 16)
+        width = int((width // 16) * 16)
+
+        if use_resolution_binning and (height, width) not in SUPPORTED_SHAPE:
+            width, height = map_to_standard_shapes(width, height)
+            height = int(height)
+            width = int(width)
+            logger.warning(f"Reshaped to (height, width)=({height}, {width}), Supported shapes are {SUPPORTED_SHAPE}")
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+            prompt_embeds_2,
+            negative_prompt_embeds_2,
+            prompt_attention_mask_2,
+            negative_prompt_attention_mask_2,
+            callback_on_step_end_tensor_inputs,
+        )
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            max_sequence_length=77,
+            text_encoder_index=0,
+        )
+        (
+            prompt_embeds_2,
+            negative_prompt_embeds_2,
+            prompt_attention_mask_2,
+            negative_prompt_attention_mask_2,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds_2,
+            negative_prompt_embeds=negative_prompt_embeds_2,
+            prompt_attention_mask=prompt_attention_mask_2,
+            negative_prompt_attention_mask=negative_prompt_attention_mask_2,
+            max_sequence_length=256,
+            text_encoder_index=1,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7 create image_rotary_emb, style embedding & time ids
+        grid_height = height // 8 // self.transformer.config.patch_size
+        grid_width = width // 8 // self.transformer.config.patch_size
+        base_size = 512 // 8 // self.transformer.config.patch_size
+        grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size)
+        image_rotary_emb = get_2d_rotary_pos_embed(
+            self.transformer.inner_dim // self.transformer.num_heads, grid_crops_coords, (grid_height, grid_width)
+        )
+
+        style = torch.tensor([0], device=device)
+
+        target_size = target_size or (height, width)
+        add_time_ids = list(original_size + target_size + crops_coords_top_left)
+        add_time_ids = torch.tensor([add_time_ids], dtype=prompt_embeds.dtype)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask])
+            prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
+            prompt_attention_mask_2 = torch.cat([negative_prompt_attention_mask_2, prompt_attention_mask_2])
+            add_time_ids = torch.cat([add_time_ids] * 2, dim=0)
+            style = torch.cat([style] * 2, dim=0)
+
+        prompt_embeds = prompt_embeds.to(device=device)
+        prompt_attention_mask = prompt_attention_mask.to(device=device)
+        prompt_embeds_2 = prompt_embeds_2.to(device=device)
+        prompt_attention_mask_2 = prompt_attention_mask_2.to(device=device)
+        add_time_ids = add_time_ids.to(dtype=prompt_embeds.dtype, device=device).repeat(
+            batch_size * num_images_per_prompt, 1
+        )
+        style = style.to(device=device).repeat(batch_size * num_images_per_prompt)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # expand scalar t to 1-D tensor to match the 1st dim of latent_model_input
+                t_expand = torch.tensor([t] * latent_model_input.shape[0], device=device).to(
+                    dtype=latent_model_input.dtype
+                )
+
+                # predict the noise residual
+                noise_pred = self.transformer(
+                    latent_model_input,
+                    t_expand,
+                    encoder_hidden_states=prompt_embeds,
+                    text_embedding_mask=prompt_attention_mask,
+                    encoder_hidden_states_t5=prompt_embeds_2,
+                    text_embedding_mask_t5=prompt_attention_mask_2,
+                    image_meta_size=add_time_ids,
+                    style=style,
+                    image_rotary_emb=image_rotary_emb,
+                    return_dict=False,
+                )[0]
+
+                noise_pred, _ = noise_pred.chunk(2, dim=1)
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    prompt_embeds_2 = callback_outputs.pop("prompt_embeds_2", prompt_embeds_2)
+                    negative_prompt_embeds_2 = callback_outputs.pop(
+                        "negative_prompt_embeds_2", negative_prompt_embeds_2
+                    )
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/i2vgen_xl/__init__.py b/diffusers/src/diffusers/pipelines/i2vgen_xl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b24a7e4cee7fb843b9424469a05f511adfa758de
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/i2vgen_xl/__init__.py
@@ -0,0 +1,46 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_i2vgen_xl"] = ["I2VGenXLPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_i2vgen_xl import I2VGenXLPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/diffusers/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..f528b60e6ed79287bbc4bed658ab637c852c781f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py
@@ -0,0 +1,783 @@
+# Copyright 2024 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...models import AutoencoderKL
+from ...models.unets.unet_i2vgen_xl import I2VGenXLUNet
+from ...schedulers import DDIMScheduler
+from ...utils import (
+    BaseOutput,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import I2VGenXLPipeline
+        >>> from diffusers.utils import export_to_gif, load_image
+
+        >>> pipeline = I2VGenXLPipeline.from_pretrained(
+        ...     "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16"
+        ... )
+        >>> pipeline.enable_model_cpu_offload()
+
+        >>> image_url = (
+        ...     "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+        ... )
+        >>> image = load_image(image_url).convert("RGB")
+
+        >>> prompt = "Papers were floating in the air on a table in the library"
+        >>> negative_prompt = "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"
+        >>> generator = torch.manual_seed(8888)
+
+        >>> frames = pipeline(
+        ...     prompt=prompt,
+        ...     image=image,
+        ...     num_inference_steps=50,
+        ...     negative_prompt=negative_prompt,
+        ...     guidance_scale=9.0,
+        ...     generator=generator,
+        ... ).frames[0]
+        >>> video_path = export_to_gif(frames, "i2v.gif")
+        ```
+"""
+
+
+@dataclass
+class I2VGenXLPipelineOutput(BaseOutput):
+    r"""
+     Output class for image-to-video pipeline.
+
+    Args:
+         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+             List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+             denoised
+     PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+    `(batch_size, num_frames, channels, height, width)`
+    """
+
+    frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
+
+
+class I2VGenXLPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+):
+    r"""
+    Pipeline for image-to-video generation as proposed in [I2VGenXL](https://i2vgen-xl.github.io/).
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`I2VGenXLUNet`]):
+            A [`I2VGenXLUNet`] to denoise the encoded video latents.
+        scheduler ([`DDIMScheduler`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        image_encoder: CLIPVisionModelWithProjection,
+        feature_extractor: CLIPImageProcessor,
+        unet: I2VGenXLUNet,
+        scheduler: DDIMScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        # `do_resize=False` as we do custom resizing.
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False)
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_videos_per_prompt,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_videos_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if self.do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            # Apply clip_skip to negative prompt embeds
+            if clip_skip is None:
+                negative_prompt_embeds = self.text_encoder(
+                    uncond_input.input_ids.to(device),
+                    attention_mask=attention_mask,
+                )
+                negative_prompt_embeds = negative_prompt_embeds[0]
+            else:
+                negative_prompt_embeds = self.text_encoder(
+                    uncond_input.input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                negative_prompt_embeds = negative_prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                negative_prompt_embeds = self.text_encoder.text_model.final_layer_norm(negative_prompt_embeds)
+
+        if self.do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def _encode_image(self, image, device, num_videos_per_prompt):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.video_processor.pil_to_numpy(image)
+            image = self.video_processor.numpy_to_pt(image)
+
+            # Normalize the image with CLIP training stats.
+            image = self.feature_extractor(
+                images=image,
+                do_normalize=True,
+                do_center_crop=False,
+                do_resize=False,
+                do_rescale=False,
+                return_tensors="pt",
+            ).pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeddings = self.image_encoder(image).image_embeds
+        image_embeddings = image_embeddings.unsqueeze(1)
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = image_embeddings.shape
+        image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
+        image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+
+        if self.do_classifier_free_guidance:
+            negative_image_embeddings = torch.zeros_like(image_embeddings)
+            image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])
+
+        return image_embeddings
+
+    def decode_latents(self, latents, decode_chunk_size=None):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        if decode_chunk_size is not None:
+            frames = []
+            for i in range(0, latents.shape[0], decode_chunk_size):
+                frame = self.vae.decode(latents[i : i + decode_chunk_size]).sample
+                frames.append(frame)
+            image = torch.cat(frames, dim=0)
+        else:
+            image = self.vae.decode(latents).sample
+
+        decode_shape = (batch_size, num_frames, -1) + image.shape[2:]
+        video = image[None, :].reshape(decode_shape).permute(0, 2, 1, 3, 4)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
+            )
+
+    def prepare_image_latents(
+        self,
+        image,
+        device,
+        num_frames,
+        num_videos_per_prompt,
+    ):
+        image = image.to(device=device)
+        image_latents = self.vae.encode(image).latent_dist.sample()
+        image_latents = image_latents * self.vae.config.scaling_factor
+
+        # Add frames dimension to image latents
+        image_latents = image_latents.unsqueeze(2)
+
+        # Append a position mask for each subsequent frame
+        # after the intial image latent frame
+        frame_position_mask = []
+        for frame_idx in range(num_frames - 1):
+            scale = (frame_idx + 1) / (num_frames - 1)
+            frame_position_mask.append(torch.ones_like(image_latents[:, :, :1]) * scale)
+        if frame_position_mask:
+            frame_position_mask = torch.cat(frame_position_mask, dim=2)
+            image_latents = torch.cat([image_latents, frame_position_mask], dim=2)
+
+        # duplicate image_latents for each generation per prompt, using mps friendly method
+        image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1, 1)
+
+        if self.do_classifier_free_guidance:
+            image_latents = torch.cat([image_latents] * 2)
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = 704,
+        width: Optional[int] = 1280,
+        target_fps: Optional[int] = 16,
+        num_frames: int = 16,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 9.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        eta: float = 0.0,
+        num_videos_per_prompt: Optional[int] = 1,
+        decode_chunk_size: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = 1,
+    ):
+        r"""
+        The call function to the pipeline for image-to-video generation with [`I2VGenXLPipeline`].
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
+                Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
+                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            target_fps (`int`, *optional*):
+                Frames per second. The rate at which the generated images shall be exported to a video after
+                generation. This is also used as a "micro-condition" while generation.
+            num_frames (`int`, *optional*):
+                The number of video frames to generate.
+            num_inference_steps (`int`, *optional*):
+                The number of denoising steps.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            num_videos_per_prompt (`int`, *optional*):
+                The number of images to generate per prompt.
+            decode_chunk_size (`int`, *optional*):
+                The number of frames to decode at a time. The higher the chunk size, the higher the temporal
+                consistency between frames, but also the higher the memory consumption. By default, the decoder will
+                decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
+        Examples:
+
+        Returns:
+            [`pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, image, height, width, negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        self._guidance_scale = guidance_scale
+
+        # 3.1 Encode input text prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_videos_per_prompt,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 3.2 Encode image prompt
+        # 3.2.1 Image encodings.
+        # https://github.com/ali-vilab/i2vgen-xl/blob/2539c9262ff8a2a22fa9daecbfd13f0a2dbc32d0/tools/inferences/inference_i2vgen_entrance.py#L114
+        cropped_image = _center_crop_wide(image, (width, width))
+        cropped_image = _resize_bilinear(
+            cropped_image, (self.feature_extractor.crop_size["width"], self.feature_extractor.crop_size["height"])
+        )
+        image_embeddings = self._encode_image(cropped_image, device, num_videos_per_prompt)
+
+        # 3.2.2 Image latents.
+        resized_image = _center_crop_wide(image, (width, height))
+        image = self.video_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype)
+        image_latents = self.prepare_image_latents(
+            image,
+            device=device,
+            num_frames=num_frames,
+            num_videos_per_prompt=num_videos_per_prompt,
+        )
+
+        # 3.3 Prepare additional conditions for the UNet.
+        if self.do_classifier_free_guidance:
+            fps_tensor = torch.tensor([target_fps, target_fps]).to(device)
+        else:
+            fps_tensor = torch.tensor([target_fps]).to(device)
+        fps_tensor = fps_tensor.repeat(batch_size * num_videos_per_prompt, 1).ravel()
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    fps=fps_tensor,
+                    image_latents=image_latents,
+                    image_embeddings=image_embeddings,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # reshape latents
+                batch_size, channel, frames, width, height = latents.shape
+                latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * frames, channel, width, height)
+                noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(batch_size * frames, channel, width, height)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # reshape latents back
+                latents = latents[None, :].reshape(batch_size, frames, channel, width, height).permute(0, 2, 1, 3, 4)
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        # 8. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents, decode_chunk_size=decode_chunk_size)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 9. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return I2VGenXLPipelineOutput(frames=video)
+
+
+# The following utilities are taken and adapted from
+# https://github.com/ali-vilab/i2vgen-xl/blob/main/utils/transforms.py.
+
+
+def _convert_pt_to_pil(image: Union[torch.Tensor, List[torch.Tensor]]):
+    if isinstance(image, list) and isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, 0)
+
+    if isinstance(image, torch.Tensor):
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        image_numpy = VaeImageProcessor.pt_to_numpy(image)
+        image_pil = VaeImageProcessor.numpy_to_pil(image_numpy)
+        image = image_pil
+
+    return image
+
+
+def _resize_bilinear(
+    image: Union[torch.Tensor, List[torch.Tensor], PIL.Image.Image, List[PIL.Image.Image]], resolution: Tuple[int, int]
+):
+    # First convert the images to PIL in case they are float tensors (only relevant for tests now).
+    image = _convert_pt_to_pil(image)
+
+    if isinstance(image, list):
+        image = [u.resize(resolution, PIL.Image.BILINEAR) for u in image]
+    else:
+        image = image.resize(resolution, PIL.Image.BILINEAR)
+    return image
+
+
+def _center_crop_wide(
+    image: Union[torch.Tensor, List[torch.Tensor], PIL.Image.Image, List[PIL.Image.Image]], resolution: Tuple[int, int]
+):
+    # First convert the images to PIL in case they are float tensors (only relevant for tests now).
+    image = _convert_pt_to_pil(image)
+
+    if isinstance(image, list):
+        scale = min(image[0].size[0] / resolution[0], image[0].size[1] / resolution[1])
+        image = [u.resize((round(u.width // scale), round(u.height // scale)), resample=PIL.Image.BOX) for u in image]
+
+        # center crop
+        x1 = (image[0].width - resolution[0]) // 2
+        y1 = (image[0].height - resolution[1]) // 2
+        image = [u.crop((x1, y1, x1 + resolution[0], y1 + resolution[1])) for u in image]
+        return image
+    else:
+        scale = min(image.size[0] / resolution[0], image.size[1] / resolution[1])
+        image = image.resize((round(image.width // scale), round(image.height // scale)), resample=PIL.Image.BOX)
+        x1 = (image.width - resolution[0]) // 2
+        y1 = (image.height - resolution[1]) // 2
+        image = image.crop((x1, y1, x1 + resolution[0], y1 + resolution[1]))
+        return image
diff --git a/diffusers/src/diffusers/pipelines/kandinsky/__init__.py b/diffusers/src/diffusers/pipelines/kandinsky/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..606f7b378a79489bbcbaa87db2040bd4196bbd8a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky/__init__.py
@@ -0,0 +1,66 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_kandinsky"] = ["KandinskyPipeline"]
+    _import_structure["pipeline_kandinsky_combined"] = [
+        "KandinskyCombinedPipeline",
+        "KandinskyImg2ImgCombinedPipeline",
+        "KandinskyInpaintCombinedPipeline",
+    ]
+    _import_structure["pipeline_kandinsky_img2img"] = ["KandinskyImg2ImgPipeline"]
+    _import_structure["pipeline_kandinsky_inpaint"] = ["KandinskyInpaintPipeline"]
+    _import_structure["pipeline_kandinsky_prior"] = ["KandinskyPriorPipeline", "KandinskyPriorPipelineOutput"]
+    _import_structure["text_encoder"] = ["MultilingualCLIP"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+
+    else:
+        from .pipeline_kandinsky import KandinskyPipeline
+        from .pipeline_kandinsky_combined import (
+            KandinskyCombinedPipeline,
+            KandinskyImg2ImgCombinedPipeline,
+            KandinskyInpaintCombinedPipeline,
+        )
+        from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline
+        from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline
+        from .pipeline_kandinsky_prior import KandinskyPriorPipeline, KandinskyPriorPipelineOutput
+        from .text_encoder import MultilingualCLIP
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2041e101564af12fc76b2a625b9ffafefaf9ffa
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py
@@ -0,0 +1,407 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, List, Optional, Union
+
+import torch
+from transformers import (
+    XLMRobertaTokenizer,
+)
+
+from ...models import UNet2DConditionModel, VQModel
+from ...schedulers import DDIMScheduler, DDPMScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from .text_encoder import MultilingualCLIP
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
+        >>> import torch
+
+        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/Kandinsky-2-1-prior")
+        >>> pipe_prior.to("cuda")
+
+        >>> prompt = "red cat, 4k photo"
+        >>> out = pipe_prior(prompt)
+        >>> image_emb = out.image_embeds
+        >>> negative_image_emb = out.negative_image_embeds
+
+        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
+        >>> pipe.to("cuda")
+
+        >>> image = pipe(
+        ...     prompt,
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=negative_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=100,
+        ... ).images
+
+        >>> image[0].save("cat.png")
+        ```
+"""
+
+
+def get_new_h_w(h, w, scale_factor=8):
+    new_h = h // scale_factor**2
+    if h % scale_factor**2 != 0:
+        new_h += 1
+    new_w = w // scale_factor**2
+    if w % scale_factor**2 != 0:
+        new_w += 1
+    return new_h * scale_factor, new_w * scale_factor
+
+
+class KandinskyPipeline(DiffusionPipeline):
+    """
+    Pipeline for text-to-image generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        text_encoder ([`MultilingualCLIP`]):
+            Frozen text-encoder.
+        tokenizer ([`XLMRobertaTokenizer`]):
+            Tokenizer of class
+        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->movq"
+
+    def __init__(
+        self,
+        text_encoder: MultilingualCLIP,
+        tokenizer: XLMRobertaTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, DDPMScheduler],
+        movq: VQModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+    ):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            truncation=True,
+            max_length=77,
+            return_attention_mask=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        text_input_ids = text_input_ids.to(device)
+        text_mask = text_inputs.attention_mask.to(device)
+
+        prompt_embeds, text_encoder_hidden_states = self.text_encoder(
+            input_ids=text_input_ids, attention_mask=text_mask
+        )
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=77,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            uncond_text_input_ids = uncond_input.input_ids.to(device)
+            uncond_text_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder(
+                input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask
+            )
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for text prompt, that will be used to condition the image generation.
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for negative text prompt, will be used to condition the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        device = self._execution_device
+
+        batch_size = batch_size * num_images_per_prompt
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        if isinstance(image_embeds, list):
+            image_embeds = torch.cat(image_embeds, dim=0)
+        if isinstance(negative_image_embeds, list):
+            negative_image_embeds = torch.cat(negative_image_embeds, dim=0)
+
+        if do_classifier_free_guidance:
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(
+                dtype=prompt_embeds.dtype, device=device
+            )
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps_tensor = self.scheduler.timesteps
+
+        num_channels_latents = self.unet.config.in_channels
+
+        height, width = get_new_h_w(height, width, self.movq_scale_factor)
+
+        # create initial latent
+        latents = self.prepare_latents(
+            (batch_size, num_channels_latents, height, width),
+            text_encoder_hidden_states.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds}
+            noise_pred = self.unet(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=text_encoder_hidden_states,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+
+            if do_classifier_free_guidance:
+                noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1)
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                _, variance_pred_text = variance_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)
+
+            if not (
+                hasattr(self.scheduler.config, "variance_type")
+                and self.scheduler.config.variance_type in ["learned", "learned_range"]
+            ):
+                noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                noise_pred,
+                t,
+                latents,
+                generator=generator,
+            ).prev_sample
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        # post-processing
+        image = self.movq.decode(latents, force_not_quantize=True)["sample"]
+
+        self.maybe_free_model_hooks()
+
+        if output_type not in ["pt", "np", "pil"]:
+            raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
+
+        if output_type in ["np", "pil"]:
+            image = image * 0.5 + 0.5
+            image = image.clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py b/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe9909770376faafacb534f5ed63ac68708167ad
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py
@@ -0,0 +1,817 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+    XLMRobertaTokenizer,
+)
+
+from ...models import PriorTransformer, UNet2DConditionModel, VQModel
+from ...schedulers import DDIMScheduler, DDPMScheduler, UnCLIPScheduler
+from ...utils import (
+    replace_example_docstring,
+)
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_kandinsky import KandinskyPipeline
+from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline
+from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline
+from .pipeline_kandinsky_prior import KandinskyPriorPipeline
+from .text_encoder import MultilingualCLIP
+
+
+TEXT2IMAGE_EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+
+        pipe = AutoPipelineForText2Image.from_pretrained(
+            "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+
+        prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
+
+        image = pipe(prompt=prompt, num_inference_steps=25).images[0]
+        ```
+"""
+
+IMAGE2IMAGE_EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        from diffusers import AutoPipelineForImage2Image
+        import torch
+        import requests
+        from io import BytesIO
+        from PIL import Image
+        import os
+
+        pipe = AutoPipelineForImage2Image.from_pretrained(
+            "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+
+        prompt = "A fantasy landscape, Cinematic lighting"
+        negative_prompt = "low quality, bad quality"
+
+        url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+        response = requests.get(url)
+        image = Image.open(BytesIO(response.content)).convert("RGB")
+        image.thumbnail((768, 768))
+
+        image = pipe(prompt=prompt, image=original_image, num_inference_steps=25).images[0]
+        ```
+"""
+
+INPAINT_EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        from diffusers import AutoPipelineForInpainting
+        from diffusers.utils import load_image
+        import torch
+        import numpy as np
+
+        pipe = AutoPipelineForInpainting.from_pretrained(
+            "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+
+        prompt = "A fantasy landscape, Cinematic lighting"
+        negative_prompt = "low quality, bad quality"
+
+        original_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+        )
+
+        mask = np.zeros((768, 768), dtype=np.float32)
+        # Let's mask out an area above the cat's head
+        mask[:250, 250:-250] = 1
+
+        image = pipe(prompt=prompt, image=original_image, mask_image=mask, num_inference_steps=25).images[0]
+        ```
+"""
+
+
+class KandinskyCombinedPipeline(DiffusionPipeline):
+    """
+    Combined Pipeline for text-to-image generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        text_encoder ([`MultilingualCLIP`]):
+            Frozen text-encoder.
+        tokenizer ([`XLMRobertaTokenizer`]):
+            Tokenizer of class
+        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+        prior_prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen image-encoder.
+        prior_text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        prior_tokenizer (`CLIPTokenizer`):
+             Tokenizer of class
+             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        prior_scheduler ([`UnCLIPScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+    """
+
+    _load_connected_pipes = True
+    model_cpu_offload_seq = "text_encoder->unet->movq->prior_prior->prior_image_encoder->prior_text_encoder"
+    _exclude_from_cpu_offload = ["prior_prior"]
+
+    def __init__(
+        self,
+        text_encoder: MultilingualCLIP,
+        tokenizer: XLMRobertaTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, DDPMScheduler],
+        movq: VQModel,
+        prior_prior: PriorTransformer,
+        prior_image_encoder: CLIPVisionModelWithProjection,
+        prior_text_encoder: CLIPTextModelWithProjection,
+        prior_tokenizer: CLIPTokenizer,
+        prior_scheduler: UnCLIPScheduler,
+        prior_image_processor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+            prior_prior=prior_prior,
+            prior_image_encoder=prior_image_encoder,
+            prior_text_encoder=prior_text_encoder,
+            prior_tokenizer=prior_tokenizer,
+            prior_scheduler=prior_scheduler,
+            prior_image_processor=prior_image_processor,
+        )
+        self.prior_pipe = KandinskyPriorPipeline(
+            prior=prior_prior,
+            image_encoder=prior_image_encoder,
+            text_encoder=prior_text_encoder,
+            tokenizer=prior_tokenizer,
+            scheduler=prior_scheduler,
+            image_processor=prior_image_processor,
+        )
+        self.decoder_pipe = KandinskyPipeline(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
+        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
+        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
+        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+
+    def progress_bar(self, iterable=None, total=None):
+        self.prior_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.enable_model_cpu_offload()
+
+    def set_progress_bar_config(self, **kwargs):
+        self.prior_pipe.set_progress_bar_config(**kwargs)
+        self.decoder_pipe.set_progress_bar_config(**kwargs)
+
+    @torch.no_grad()
+    @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        num_images_per_prompt: int = 1,
+        height: int = 512,
+        width: int = 512,
+        prior_guidance_scale: float = 4.0,
+        prior_num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            prior_num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        prior_outputs = self.prior_pipe(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            num_inference_steps=prior_num_inference_steps,
+            generator=generator,
+            latents=latents,
+            guidance_scale=prior_guidance_scale,
+            output_type="pt",
+            return_dict=False,
+        )
+        image_embeds = prior_outputs[0]
+        negative_image_embeds = prior_outputs[1]
+
+        prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
+
+        if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
+            prompt = (image_embeds.shape[0] // len(prompt)) * prompt
+
+        outputs = self.decoder_pipe(
+            prompt=prompt,
+            image_embeds=image_embeds,
+            negative_image_embeds=negative_image_embeds,
+            width=width,
+            height=height,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            output_type=output_type,
+            callback=callback,
+            callback_steps=callback_steps,
+            return_dict=return_dict,
+        )
+
+        self.maybe_free_model_hooks()
+
+        return outputs
+
+
+class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
+    """
+    Combined Pipeline for image-to-image generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        text_encoder ([`MultilingualCLIP`]):
+            Frozen text-encoder.
+        tokenizer ([`XLMRobertaTokenizer`]):
+            Tokenizer of class
+        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+        prior_prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen image-encoder.
+        prior_text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        prior_tokenizer (`CLIPTokenizer`):
+             Tokenizer of class
+             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        prior_scheduler ([`UnCLIPScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+    """
+
+    _load_connected_pipes = True
+    model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->" "text_encoder->unet->movq"
+    _exclude_from_cpu_offload = ["prior_prior"]
+
+    def __init__(
+        self,
+        text_encoder: MultilingualCLIP,
+        tokenizer: XLMRobertaTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, DDPMScheduler],
+        movq: VQModel,
+        prior_prior: PriorTransformer,
+        prior_image_encoder: CLIPVisionModelWithProjection,
+        prior_text_encoder: CLIPTextModelWithProjection,
+        prior_tokenizer: CLIPTokenizer,
+        prior_scheduler: UnCLIPScheduler,
+        prior_image_processor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+            prior_prior=prior_prior,
+            prior_image_encoder=prior_image_encoder,
+            prior_text_encoder=prior_text_encoder,
+            prior_tokenizer=prior_tokenizer,
+            prior_scheduler=prior_scheduler,
+            prior_image_processor=prior_image_processor,
+        )
+        self.prior_pipe = KandinskyPriorPipeline(
+            prior=prior_prior,
+            image_encoder=prior_image_encoder,
+            text_encoder=prior_text_encoder,
+            tokenizer=prior_tokenizer,
+            scheduler=prior_scheduler,
+            image_processor=prior_image_processor,
+        )
+        self.decoder_pipe = KandinskyImg2ImgPipeline(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+
+    def progress_bar(self, iterable=None, total=None):
+        self.prior_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.enable_model_cpu_offload()
+
+    def set_progress_bar_config(self, **kwargs):
+        self.prior_pipe.set_progress_bar_config(**kwargs)
+        self.decoder_pipe.set_progress_bar_config(**kwargs)
+
+    @torch.no_grad()
+    @replace_example_docstring(IMAGE2IMAGE_EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        num_images_per_prompt: int = 1,
+        strength: float = 0.3,
+        height: int = 512,
+        width: int = 512,
+        prior_guidance_scale: float = 4.0,
+        prior_num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
+                again.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            strength (`float`, *optional*, defaults to 0.3):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            prior_num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        prior_outputs = self.prior_pipe(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            num_inference_steps=prior_num_inference_steps,
+            generator=generator,
+            latents=latents,
+            guidance_scale=prior_guidance_scale,
+            output_type="pt",
+            return_dict=False,
+        )
+        image_embeds = prior_outputs[0]
+        negative_image_embeds = prior_outputs[1]
+
+        prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
+        image = [image] if isinstance(prompt, PIL.Image.Image) else image
+
+        if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
+            prompt = (image_embeds.shape[0] // len(prompt)) * prompt
+
+        if (
+            isinstance(image, (list, tuple))
+            and len(image) < image_embeds.shape[0]
+            and image_embeds.shape[0] % len(image) == 0
+        ):
+            image = (image_embeds.shape[0] // len(image)) * image
+
+        outputs = self.decoder_pipe(
+            prompt=prompt,
+            image=image,
+            image_embeds=image_embeds,
+            negative_image_embeds=negative_image_embeds,
+            strength=strength,
+            width=width,
+            height=height,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            output_type=output_type,
+            callback=callback,
+            callback_steps=callback_steps,
+            return_dict=return_dict,
+        )
+
+        self.maybe_free_model_hooks()
+
+        return outputs
+
+
+class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
+    """
+    Combined Pipeline for generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        text_encoder ([`MultilingualCLIP`]):
+            Frozen text-encoder.
+        tokenizer ([`XLMRobertaTokenizer`]):
+            Tokenizer of class
+        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+        prior_prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen image-encoder.
+        prior_text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        prior_tokenizer (`CLIPTokenizer`):
+             Tokenizer of class
+             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        prior_scheduler ([`UnCLIPScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+    """
+
+    _load_connected_pipes = True
+    model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->text_encoder->unet->movq"
+    _exclude_from_cpu_offload = ["prior_prior"]
+
+    def __init__(
+        self,
+        text_encoder: MultilingualCLIP,
+        tokenizer: XLMRobertaTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, DDPMScheduler],
+        movq: VQModel,
+        prior_prior: PriorTransformer,
+        prior_image_encoder: CLIPVisionModelWithProjection,
+        prior_text_encoder: CLIPTextModelWithProjection,
+        prior_tokenizer: CLIPTokenizer,
+        prior_scheduler: UnCLIPScheduler,
+        prior_image_processor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+            prior_prior=prior_prior,
+            prior_image_encoder=prior_image_encoder,
+            prior_text_encoder=prior_text_encoder,
+            prior_tokenizer=prior_tokenizer,
+            prior_scheduler=prior_scheduler,
+            prior_image_processor=prior_image_processor,
+        )
+        self.prior_pipe = KandinskyPriorPipeline(
+            prior=prior_prior,
+            image_encoder=prior_image_encoder,
+            text_encoder=prior_text_encoder,
+            tokenizer=prior_tokenizer,
+            scheduler=prior_scheduler,
+            image_processor=prior_image_processor,
+        )
+        self.decoder_pipe = KandinskyInpaintPipeline(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+
+    def progress_bar(self, iterable=None, total=None):
+        self.prior_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.enable_model_cpu_offload()
+
+    def set_progress_bar_config(self, **kwargs):
+        self.prior_pipe.set_progress_bar_config(**kwargs)
+        self.decoder_pipe.set_progress_bar_config(**kwargs)
+
+    @torch.no_grad()
+    @replace_example_docstring(INPAINT_EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        num_images_per_prompt: int = 1,
+        height: int = 512,
+        width: int = 512,
+        prior_guidance_scale: float = 4.0,
+        prior_num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
+                again.
+            mask_image (`np.array`):
+                Tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while
+                black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single
+                channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3,
+                so the expected shape would be `(B, H, W, 1)`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            prior_num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        prior_outputs = self.prior_pipe(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            num_inference_steps=prior_num_inference_steps,
+            generator=generator,
+            latents=latents,
+            guidance_scale=prior_guidance_scale,
+            output_type="pt",
+            return_dict=False,
+        )
+        image_embeds = prior_outputs[0]
+        negative_image_embeds = prior_outputs[1]
+
+        prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
+        image = [image] if isinstance(prompt, PIL.Image.Image) else image
+        mask_image = [mask_image] if isinstance(mask_image, PIL.Image.Image) else mask_image
+
+        if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
+            prompt = (image_embeds.shape[0] // len(prompt)) * prompt
+
+        if (
+            isinstance(image, (list, tuple))
+            and len(image) < image_embeds.shape[0]
+            and image_embeds.shape[0] % len(image) == 0
+        ):
+            image = (image_embeds.shape[0] // len(image)) * image
+
+        if (
+            isinstance(mask_image, (list, tuple))
+            and len(mask_image) < image_embeds.shape[0]
+            and image_embeds.shape[0] % len(mask_image) == 0
+        ):
+            mask_image = (image_embeds.shape[0] // len(mask_image)) * mask_image
+
+        outputs = self.decoder_pipe(
+            prompt=prompt,
+            image=image,
+            mask_image=mask_image,
+            image_embeds=image_embeds,
+            negative_image_embeds=negative_image_embeds,
+            width=width,
+            height=height,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            output_type=output_type,
+            callback=callback,
+            callback_steps=callback_steps,
+            return_dict=return_dict,
+        )
+
+        self.maybe_free_model_hooks()
+
+        return outputs
diff --git a/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef5241fee5d208ad57a1f97283dc3a24cb4cde7e
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py
@@ -0,0 +1,500 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from PIL import Image
+from transformers import (
+    XLMRobertaTokenizer,
+)
+
+from ...models import UNet2DConditionModel, VQModel
+from ...schedulers import DDIMScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from .text_encoder import MultilingualCLIP
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline
+        >>> from diffusers.utils import load_image
+        >>> import torch
+
+        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+
+        >>> prompt = "A red cartoon frog, 4k"
+        >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
+
+        >>> pipe = KandinskyImg2ImgPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> init_image = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/frog.png"
+        ... )
+
+        >>> image = pipe(
+        ...     prompt,
+        ...     image=init_image,
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=zero_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=100,
+        ...     strength=0.2,
+        ... ).images
+
+        >>> image[0].save("red_frog.png")
+        ```
+"""
+
+
+def get_new_h_w(h, w, scale_factor=8):
+    new_h = h // scale_factor**2
+    if h % scale_factor**2 != 0:
+        new_h += 1
+    new_w = w // scale_factor**2
+    if w % scale_factor**2 != 0:
+        new_w += 1
+    return new_h * scale_factor, new_w * scale_factor
+
+
+def prepare_image(pil_image, w=512, h=512):
+    pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
+    arr = np.array(pil_image.convert("RGB"))
+    arr = arr.astype(np.float32) / 127.5 - 1
+    arr = np.transpose(arr, [2, 0, 1])
+    image = torch.from_numpy(arr).unsqueeze(0)
+    return image
+
+
+class KandinskyImg2ImgPipeline(DiffusionPipeline):
+    """
+    Pipeline for image-to-image generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        text_encoder ([`MultilingualCLIP`]):
+            Frozen text-encoder.
+        tokenizer ([`XLMRobertaTokenizer`]):
+            Tokenizer of class
+        scheduler ([`DDIMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ image encoder and decoder
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->movq"
+
+    def __init__(
+        self,
+        text_encoder: MultilingualCLIP,
+        movq: VQModel,
+        tokenizer: XLMRobertaTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DDIMScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, latents, latent_timestep, shape, dtype, device, generator, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+
+        shape = latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        latents = self.add_noise(latents, noise, latent_timestep)
+        return latents
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+    ):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            truncation=True,
+            return_attention_mask=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        text_input_ids = text_input_ids.to(device)
+        text_mask = text_inputs.attention_mask.to(device)
+
+        prompt_embeds, text_encoder_hidden_states = self.text_encoder(
+            input_ids=text_input_ids, attention_mask=text_mask
+        )
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=77,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            uncond_text_input_ids = uncond_input.input_ids.to(device)
+            uncond_text_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder(
+                input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask
+            )
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+
+    #  add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32)
+        alphas = 1.0 - betas
+        alphas_cumprod = torch.cumprod(alphas, dim=0)
+        alphas_cumprod = alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+
+        return noisy_samples
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        image_embeds: torch.Tensor,
+        negative_image_embeds: torch.Tensor,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 100,
+        strength: float = 0.3,
+        guidance_scale: float = 7.0,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`torch.Tensor`, `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for text prompt, that will be used to condition the image generation.
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for negative text prompt, will be used to condition the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            strength (`float`, *optional*, defaults to 0.3):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        # 1. Define call parameters
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        device = self._execution_device
+
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 2. get text and image embeddings
+        prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        if isinstance(image_embeds, list):
+            image_embeds = torch.cat(image_embeds, dim=0)
+        if isinstance(negative_image_embeds, list):
+            negative_image_embeds = torch.cat(negative_image_embeds, dim=0)
+
+        if do_classifier_free_guidance:
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(
+                dtype=prompt_embeds.dtype, device=device
+            )
+
+        # 3. pre-processing initial image
+        if not isinstance(image, list):
+            image = [image]
+        if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image):
+            raise ValueError(
+                f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support  PIL image and pytorch tensor"
+            )
+
+        image = torch.cat([prepare_image(i, width, height) for i in image], dim=0)
+        image = image.to(dtype=prompt_embeds.dtype, device=device)
+
+        latents = self.movq.encode(image)["latents"]
+        latents = latents.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # 4. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+        timesteps_tensor, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+
+        # the formular to calculate timestep for add_noise is taken from the original kandinsky repo
+        latent_timestep = int(self.scheduler.config.num_train_timesteps * strength) - 2
+
+        latent_timestep = torch.tensor([latent_timestep] * batch_size, dtype=timesteps_tensor.dtype, device=device)
+
+        num_channels_latents = self.unet.config.in_channels
+
+        height, width = get_new_h_w(height, width, self.movq_scale_factor)
+
+        # 5. Create initial latent
+        latents = self.prepare_latents(
+            latents,
+            latent_timestep,
+            (batch_size, num_channels_latents, height, width),
+            text_encoder_hidden_states.dtype,
+            device,
+            generator,
+            self.scheduler,
+        )
+
+        # 6. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds}
+            noise_pred = self.unet(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=text_encoder_hidden_states,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+
+            if do_classifier_free_guidance:
+                noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1)
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                _, variance_pred_text = variance_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)
+
+            if not (
+                hasattr(self.scheduler.config, "variance_type")
+                and self.scheduler.config.variance_type in ["learned", "learned_range"]
+            ):
+                noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                noise_pred,
+                t,
+                latents,
+                generator=generator,
+            ).prev_sample
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        # 7. post-processing
+        image = self.movq.decode(latents, force_not_quantize=True)["sample"]
+
+        self.maybe_free_model_hooks()
+
+        if output_type not in ["pt", "np", "pil"]:
+            raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
+
+        if output_type in ["np", "pil"]:
+            image = image * 0.5 + 0.5
+            image = image.clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..778b6e314c0df998497aa6319d09017b31df657c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py
@@ -0,0 +1,635 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from copy import deepcopy
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from packaging import version
+from PIL import Image
+from transformers import (
+    XLMRobertaTokenizer,
+)
+
+from ... import __version__
+from ...models import UNet2DConditionModel, VQModel
+from ...schedulers import DDIMScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from .text_encoder import MultilingualCLIP
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline
+        >>> from diffusers.utils import load_image
+        >>> import torch
+        >>> import numpy as np
+
+        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+
+        >>> prompt = "a hat"
+        >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
+
+        >>> pipe = KandinskyInpaintPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> init_image = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... )
+
+        >>> mask = np.zeros((768, 768), dtype=np.float32)
+        >>> mask[:250, 250:-250] = 1
+
+        >>> out = pipe(
+        ...     prompt,
+        ...     image=init_image,
+        ...     mask_image=mask,
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=zero_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=50,
+        ... )
+
+        >>> image = out.images[0]
+        >>> image.save("cat_with_hat.png")
+        ```
+"""
+
+
+def get_new_h_w(h, w, scale_factor=8):
+    new_h = h // scale_factor**2
+    if h % scale_factor**2 != 0:
+        new_h += 1
+    new_w = w // scale_factor**2
+    if w % scale_factor**2 != 0:
+        new_w += 1
+    return new_h * scale_factor, new_w * scale_factor
+
+
+def prepare_mask(masks):
+    prepared_masks = []
+    for mask in masks:
+        old_mask = deepcopy(mask)
+        for i in range(mask.shape[1]):
+            for j in range(mask.shape[2]):
+                if old_mask[0][i][j] == 1:
+                    continue
+                if i != 0:
+                    mask[:, i - 1, j] = 0
+                if j != 0:
+                    mask[:, i, j - 1] = 0
+                if i != 0 and j != 0:
+                    mask[:, i - 1, j - 1] = 0
+                if i != mask.shape[1] - 1:
+                    mask[:, i + 1, j] = 0
+                if j != mask.shape[2] - 1:
+                    mask[:, i, j + 1] = 0
+                if i != mask.shape[1] - 1 and j != mask.shape[2] - 1:
+                    mask[:, i + 1, j + 1] = 0
+        prepared_masks.append(mask)
+    return torch.stack(prepared_masks, dim=0)
+
+
+def prepare_mask_and_masked_image(image, mask, height, width):
+    r"""
+    Prepares a pair (mask, image) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will
+    be converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for
+    the ``image`` and ``1`` for the ``mask``.
+
+    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
+
+    Args:
+        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
+        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
+        height (`int`, *optional*, defaults to 512):
+            The height in pixels of the generated image.
+        width (`int`, *optional*, defaults to 512):
+            The width in pixels of the generated image.
+
+
+    Raises:
+        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+            (ot the other way around).
+
+    Returns:
+        tuple[torch.Tensor]: The pair (mask, image) as ``torch.Tensor`` with 4
+            dimensions: ``batch x channels x height x width``.
+    """
+
+    if image is None:
+        raise ValueError("`image` input cannot be undefined.")
+
+    if mask is None:
+        raise ValueError("`mask_image` input cannot be undefined.")
+
+    if isinstance(image, torch.Tensor):
+        if not isinstance(mask, torch.Tensor):
+            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
+
+        # Batch single image
+        if image.ndim == 3:
+            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
+            image = image.unsqueeze(0)
+
+        # Batch and add channel dim for single mask
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0).unsqueeze(0)
+
+        # Batch single mask or add channel dim
+        if mask.ndim == 3:
+            # Single batched mask, no channel dim or single mask not batched but channel dim
+            if mask.shape[0] == 1:
+                mask = mask.unsqueeze(0)
+
+            # Batched masks no channel dim
+            else:
+                mask = mask.unsqueeze(1)
+
+        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
+
+        # Check image is in [-1, 1]
+        if image.min() < -1 or image.max() > 1:
+            raise ValueError("Image should be in [-1, 1] range")
+
+        # Check mask is in [0, 1]
+        if mask.min() < 0 or mask.max() > 1:
+            raise ValueError("Mask should be in [0, 1] range")
+
+        # Binarize mask
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+
+        # Image as float32
+        image = image.to(dtype=torch.float32)
+    elif isinstance(mask, torch.Tensor):
+        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            # resize all images w.r.t passed height an width
+            image = [i.resize((width, height), resample=Image.BICUBIC, reducing_gap=1) for i in image]
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+        # preprocess mask
+        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
+            mask = [mask]
+
+        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+            mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
+            mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+            mask = mask.astype(np.float32) / 255.0
+        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+        mask = torch.from_numpy(mask)
+
+    mask = 1 - mask
+
+    return mask, image
+
+
+class KandinskyInpaintPipeline(DiffusionPipeline):
+    """
+    Pipeline for text-guided image inpainting using Kandinsky2.1
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        text_encoder ([`MultilingualCLIP`]):
+            Frozen text-encoder.
+        tokenizer ([`XLMRobertaTokenizer`]):
+            Tokenizer of class
+        scheduler ([`DDIMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ image encoder and decoder
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->movq"
+
+    def __init__(
+        self,
+        text_encoder: MultilingualCLIP,
+        movq: VQModel,
+        tokenizer: XLMRobertaTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DDIMScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            text_encoder=text_encoder,
+            movq=movq,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+        self._warn_has_been_called = False
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+    ):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            truncation=True,
+            return_attention_mask=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        text_input_ids = text_input_ids.to(device)
+        text_mask = text_inputs.attention_mask.to(device)
+
+        prompt_embeds, text_encoder_hidden_states = self.text_encoder(
+            input_ids=text_input_ids, attention_mask=text_mask
+        )
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=77,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            uncond_text_input_ids = uncond_input.input_ids.to(device)
+            uncond_text_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder(
+                input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask
+            )
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image],
+        mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray],
+        image_embeds: torch.Tensor,
+        negative_image_embeds: torch.Tensor,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`torch.Tensor`, `PIL.Image.Image` or `np.ndarray`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            mask_image (`PIL.Image.Image`,`torch.Tensor` or `np.ndarray`):
+                `Image`, or a tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. You can pass a pytorch tensor as mask only if the
+                image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, so the
+                expected shape would be either `(B, 1, H, W,)`, `(B, H, W)`, `(1, H, W)` or `(H, W)` If image is an PIL
+                image or numpy array, mask should also be a either PIL image or numpy array. If it is a PIL image, it
+                will be converted to a single channel (luminance) before use. If it is a nummpy array, the expected
+                shape is `(H, W)`.
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for text prompt, that will be used to condition the image generation.
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for negative text prompt, will be used to condition the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        if not self._warn_has_been_called and version.parse(version.parse(__version__).base_version) < version.parse(
+            "0.23.0.dev0"
+        ):
+            logger.warning(
+                "Please note that the expected format of `mask_image` has recently been changed. "
+                "Before diffusers == 0.19.0, Kandinsky Inpainting pipelines repainted black pixels and preserved black pixels. "
+                "As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. "
+                "This way, Kandinsky's masking behavior is aligned with Stable Diffusion. "
+                "THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. "
+                "This warning will be surpressed after the first inference call and will be removed in diffusers>0.23.0"
+            )
+            self._warn_has_been_called = True
+
+        # Define call parameters
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        device = self._execution_device
+
+        batch_size = batch_size * num_images_per_prompt
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        if isinstance(image_embeds, list):
+            image_embeds = torch.cat(image_embeds, dim=0)
+        if isinstance(negative_image_embeds, list):
+            negative_image_embeds = torch.cat(negative_image_embeds, dim=0)
+
+        if do_classifier_free_guidance:
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(
+                dtype=prompt_embeds.dtype, device=device
+            )
+
+        # preprocess image and mask
+        mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width)
+
+        image = image.to(dtype=prompt_embeds.dtype, device=device)
+        image = self.movq.encode(image)["latents"]
+
+        mask_image = mask_image.to(dtype=prompt_embeds.dtype, device=device)
+
+        image_shape = tuple(image.shape[-2:])
+        mask_image = F.interpolate(
+            mask_image,
+            image_shape,
+            mode="nearest",
+        )
+        mask_image = prepare_mask(mask_image)
+        masked_image = image * mask_image
+
+        mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0)
+        masked_image = masked_image.repeat_interleave(num_images_per_prompt, dim=0)
+        if do_classifier_free_guidance:
+            mask_image = mask_image.repeat(2, 1, 1, 1)
+            masked_image = masked_image.repeat(2, 1, 1, 1)
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps_tensor = self.scheduler.timesteps
+
+        num_channels_latents = self.movq.config.latent_channels
+
+        # get h, w for latents
+        sample_height, sample_width = get_new_h_w(height, width, self.movq_scale_factor)
+
+        # create initial latent
+        latents = self.prepare_latents(
+            (batch_size, num_channels_latents, sample_height, sample_width),
+            text_encoder_hidden_states.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+
+        # Check that sizes of mask, masked image and latents match with expected
+        num_channels_mask = mask_image.shape[1]
+        num_channels_masked_image = masked_image.shape[1]
+        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                " `pipeline.unet` or your `mask_image` or `image` input."
+            )
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = torch.cat([latent_model_input, masked_image, mask_image], dim=1)
+
+            added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds}
+            noise_pred = self.unet(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=text_encoder_hidden_states,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+
+            if do_classifier_free_guidance:
+                noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1)
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                _, variance_pred_text = variance_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)
+
+            if not (
+                hasattr(self.scheduler.config, "variance_type")
+                and self.scheduler.config.variance_type in ["learned", "learned_range"]
+            ):
+                noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                noise_pred,
+                t,
+                latents,
+                generator=generator,
+            ).prev_sample
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        # post-processing
+        image = self.movq.decode(latents, force_not_quantize=True)["sample"]
+
+        self.maybe_free_model_hooks()
+
+        if output_type not in ["pt", "np", "pil"]:
+            raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
+
+        if output_type in ["np", "pil"]:
+            image = image * 0.5 + 0.5
+            image = image.clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5152d71cb6bbdad7532a8845720ea1713ae2c5f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py
@@ -0,0 +1,547 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...models import PriorTransformer
+from ...schedulers import UnCLIPScheduler
+from ...utils import (
+    BaseOutput,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
+        >>> import torch
+
+        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior")
+        >>> pipe_prior.to("cuda")
+
+        >>> prompt = "red cat, 4k photo"
+        >>> out = pipe_prior(prompt)
+        >>> image_emb = out.image_embeds
+        >>> negative_image_emb = out.negative_image_embeds
+
+        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
+        >>> pipe.to("cuda")
+
+        >>> image = pipe(
+        ...     prompt,
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=negative_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=100,
+        ... ).images
+
+        >>> image[0].save("cat.png")
+        ```
+"""
+
+EXAMPLE_INTERPOLATE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyPriorPipeline, KandinskyPipeline
+        >>> from diffusers.utils import load_image
+        >>> import PIL
+
+        >>> import torch
+        >>> from torchvision import transforms
+
+        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+
+        >>> img1 = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... )
+
+        >>> img2 = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/starry_night.jpeg"
+        ... )
+
+        >>> images_texts = ["a cat", img1, img2]
+        >>> weights = [0.3, 0.3, 0.4]
+        >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights)
+
+        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
+        >>> pipe.to("cuda")
+
+        >>> image = pipe(
+        ...     "",
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=zero_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=150,
+        ... ).images[0]
+
+        >>> image.save("starry_cat.png")
+        ```
+"""
+
+
+@dataclass
+class KandinskyPriorPipelineOutput(BaseOutput):
+    """
+    Output class for KandinskyPriorPipeline.
+
+    Args:
+        image_embeds (`torch.Tensor`)
+            clip image embeddings for text prompt
+        negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`)
+            clip image embeddings for unconditional tokens
+    """
+
+    image_embeds: Union[torch.Tensor, np.ndarray]
+    negative_image_embeds: Union[torch.Tensor, np.ndarray]
+
+
+class KandinskyPriorPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating image prior for Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen image-encoder.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        scheduler ([`UnCLIPScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+    """
+
+    _exclude_from_cpu_offload = ["prior"]
+    model_cpu_offload_seq = "text_encoder->prior"
+
+    def __init__(
+        self,
+        prior: PriorTransformer,
+        image_encoder: CLIPVisionModelWithProjection,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        scheduler: UnCLIPScheduler,
+        image_processor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            prior=prior,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            image_processor=image_processor,
+        )
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
+    def interpolate(
+        self,
+        images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]],
+        weights: List[float],
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        negative_prior_prompt: Optional[str] = None,
+        negative_prompt: str = "",
+        guidance_scale: float = 4.0,
+        device=None,
+    ):
+        """
+        Function invoked when using the prior pipeline for interpolation.
+
+        Args:
+            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`):
+                list of prompts and images to guide the image generation.
+            weights: (`List[float]`):
+                list of weights for each condition in `images_and_prompts`
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            negative_prior_prompt (`str`, *optional*):
+                The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if
+                `guidance_scale` is less than `1`).
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if
+                `guidance_scale` is less than `1`).
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+
+        Examples:
+
+        Returns:
+            [`KandinskyPriorPipelineOutput`] or `tuple`
+        """
+
+        device = device or self.device
+
+        if len(images_and_prompts) != len(weights):
+            raise ValueError(
+                f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length"
+            )
+
+        image_embeddings = []
+        for cond, weight in zip(images_and_prompts, weights):
+            if isinstance(cond, str):
+                image_emb = self(
+                    cond,
+                    num_inference_steps=num_inference_steps,
+                    num_images_per_prompt=num_images_per_prompt,
+                    generator=generator,
+                    latents=latents,
+                    negative_prompt=negative_prior_prompt,
+                    guidance_scale=guidance_scale,
+                ).image_embeds
+
+            elif isinstance(cond, (PIL.Image.Image, torch.Tensor)):
+                if isinstance(cond, PIL.Image.Image):
+                    cond = (
+                        self.image_processor(cond, return_tensors="pt")
+                        .pixel_values[0]
+                        .unsqueeze(0)
+                        .to(dtype=self.image_encoder.dtype, device=device)
+                    )
+
+                image_emb = self.image_encoder(cond)["image_embeds"]
+
+            else:
+                raise ValueError(
+                    f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor`  but is {type(cond)}"
+                )
+
+            image_embeddings.append(image_emb * weight)
+
+        image_emb = torch.cat(image_embeddings).sum(dim=0, keepdim=True)
+
+        out_zero = self(
+            negative_prompt,
+            num_inference_steps=num_inference_steps,
+            num_images_per_prompt=num_images_per_prompt,
+            generator=generator,
+            latents=latents,
+            negative_prompt=negative_prior_prompt,
+            guidance_scale=guidance_scale,
+        )
+        zero_image_emb = out_zero.negative_image_embeds if negative_prompt == "" else out_zero.image_embeds
+
+        return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb)
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def get_zero_embed(self, batch_size=1, device=None):
+        device = device or self.device
+        zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to(
+            device=device, dtype=self.image_encoder.dtype
+        )
+        zero_image_emb = self.image_encoder(zero_img)["image_embeds"]
+        zero_image_emb = zero_image_emb.repeat(batch_size, 1)
+        return zero_image_emb
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+    ):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        text_mask = text_inputs.attention_mask.bool().to(device)
+
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
+        text_encoder_output = self.text_encoder(text_input_ids.to(device))
+
+        prompt_embeds = text_encoder_output.text_embeds
+        text_encoder_hidden_states = text_encoder_output.last_hidden_state
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+            uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        guidance_scale: float = 4.0,
+        output_type: Optional[str] = "pt",
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            output_type (`str`, *optional*, defaults to `"pt"`):
+                The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"`
+                (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`KandinskyPriorPipelineOutput`] or `tuple`
+        """
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        elif not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt]
+        elif not isinstance(negative_prompt, list) and negative_prompt is not None:
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+        # if the negative prompt is defined we double the batch size to
+        # directly retrieve the negative prompt embedding
+        if negative_prompt is not None:
+            prompt = prompt + negative_prompt
+            negative_prompt = 2 * negative_prompt
+
+        device = self._execution_device
+
+        batch_size = len(prompt)
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+        prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # prior
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        prior_timesteps_tensor = self.scheduler.timesteps
+
+        embedding_dim = self.prior.config.embedding_dim
+
+        latents = self.prepare_latents(
+            (batch_size, embedding_dim),
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+
+        for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            predicted_image_embedding = self.prior(
+                latent_model_input,
+                timestep=t,
+                proj_embedding=prompt_embeds,
+                encoder_hidden_states=text_encoder_hidden_states,
+                attention_mask=text_mask,
+            ).predicted_image_embedding
+
+            if do_classifier_free_guidance:
+                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * (
+                    predicted_image_embedding_text - predicted_image_embedding_uncond
+                )
+
+            if i + 1 == prior_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = prior_timesteps_tensor[i + 1]
+
+            latents = self.scheduler.step(
+                predicted_image_embedding,
+                timestep=t,
+                sample=latents,
+                generator=generator,
+                prev_timestep=prev_timestep,
+            ).prev_sample
+
+        latents = self.prior.post_process_latents(latents)
+
+        image_embeddings = latents
+
+        # if negative prompt has been defined, we retrieve split the image embedding into two
+        if negative_prompt is None:
+            zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
+
+            self.maybe_free_model_hooks()
+        else:
+            image_embeddings, zero_embeds = image_embeddings.chunk(2)
+
+            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+                self.prior_hook.offload()
+
+        if output_type not in ["pt", "np"]:
+            raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}")
+
+        if output_type == "np":
+            image_embeddings = image_embeddings.cpu().numpy()
+            zero_embeds = zero_embeds.cpu().numpy()
+
+        if not return_dict:
+            return (image_embeddings, zero_embeds)
+
+        return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky/text_encoder.py b/diffusers/src/diffusers/pipelines/kandinsky/text_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..caa0029f00ca22818819d5b76b57ec489c6da1d6
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky/text_encoder.py
@@ -0,0 +1,27 @@
+import torch
+from transformers import PreTrainedModel, XLMRobertaConfig, XLMRobertaModel
+
+
+class MCLIPConfig(XLMRobertaConfig):
+    model_type = "M-CLIP"
+
+    def __init__(self, transformerDimSize=1024, imageDimSize=768, **kwargs):
+        self.transformerDimensions = transformerDimSize
+        self.numDims = imageDimSize
+        super().__init__(**kwargs)
+
+
+class MultilingualCLIP(PreTrainedModel):
+    config_class = MCLIPConfig
+
+    def __init__(self, config, *args, **kwargs):
+        super().__init__(config, *args, **kwargs)
+        self.transformer = XLMRobertaModel(config)
+        self.LinearTransformation = torch.nn.Linear(
+            in_features=config.transformerDimensions, out_features=config.numDims
+        )
+
+    def forward(self, input_ids, attention_mask):
+        embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0]
+        embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None]
+        return self.LinearTransformation(embs2), embs
diff --git a/diffusers/src/diffusers/pipelines/kandinsky2_2/__init__.py b/diffusers/src/diffusers/pipelines/kandinsky2_2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..67e97f161173ac8981dadf757fd8d6438307c973
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky2_2/__init__.py
@@ -0,0 +1,70 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_kandinsky2_2"] = ["KandinskyV22Pipeline"]
+    _import_structure["pipeline_kandinsky2_2_combined"] = [
+        "KandinskyV22CombinedPipeline",
+        "KandinskyV22Img2ImgCombinedPipeline",
+        "KandinskyV22InpaintCombinedPipeline",
+    ]
+    _import_structure["pipeline_kandinsky2_2_controlnet"] = ["KandinskyV22ControlnetPipeline"]
+    _import_structure["pipeline_kandinsky2_2_controlnet_img2img"] = ["KandinskyV22ControlnetImg2ImgPipeline"]
+    _import_structure["pipeline_kandinsky2_2_img2img"] = ["KandinskyV22Img2ImgPipeline"]
+    _import_structure["pipeline_kandinsky2_2_inpainting"] = ["KandinskyV22InpaintPipeline"]
+    _import_structure["pipeline_kandinsky2_2_prior"] = ["KandinskyV22PriorPipeline"]
+    _import_structure["pipeline_kandinsky2_2_prior_emb2emb"] = ["KandinskyV22PriorEmb2EmbPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_kandinsky2_2 import KandinskyV22Pipeline
+        from .pipeline_kandinsky2_2_combined import (
+            KandinskyV22CombinedPipeline,
+            KandinskyV22Img2ImgCombinedPipeline,
+            KandinskyV22InpaintCombinedPipeline,
+        )
+        from .pipeline_kandinsky2_2_controlnet import KandinskyV22ControlnetPipeline
+        from .pipeline_kandinsky2_2_controlnet_img2img import KandinskyV22ControlnetImg2ImgPipeline
+        from .pipeline_kandinsky2_2_img2img import KandinskyV22Img2ImgPipeline
+        from .pipeline_kandinsky2_2_inpainting import KandinskyV22InpaintPipeline
+        from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline
+        from .pipeline_kandinsky2_2_prior_emb2emb import KandinskyV22PriorEmb2EmbPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..471db61556f5491d985e4fbccf33457d0c5e6bdf
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py
@@ -0,0 +1,320 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Dict, List, Optional, Union
+
+import torch
+
+from ...models import UNet2DConditionModel, VQModel
+from ...schedulers import DDPMScheduler
+from ...utils import deprecate, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline
+        >>> import torch
+
+        >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior")
+        >>> pipe_prior.to("cuda")
+        >>> prompt = "red cat, 4k photo"
+        >>> out = pipe_prior(prompt)
+        >>> image_emb = out.image_embeds
+        >>> zero_image_emb = out.negative_image_embeds
+        >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder")
+        >>> pipe.to("cuda")
+        >>> image = pipe(
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=zero_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=50,
+        ... ).images
+        >>> image[0].save("cat.png")
+        ```
+"""
+
+
+def downscale_height_and_width(height, width, scale_factor=8):
+    new_height = height // scale_factor**2
+    if height % scale_factor**2 != 0:
+        new_height += 1
+    new_width = width // scale_factor**2
+    if width % scale_factor**2 != 0:
+        new_width += 1
+    return new_height * scale_factor, new_width * scale_factor
+
+
+class KandinskyV22Pipeline(DiffusionPipeline):
+    """
+    Pipeline for text-to-image generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+    """
+
+    model_cpu_offload_seq = "unet->movq"
+    _callback_tensor_inputs = ["latents", "image_embeds", "negative_image_embeds"]
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        movq: VQModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for text prompt, that will be used to condition the image generation.
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for negative text prompt, will be used to condition the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        device = self._execution_device
+
+        self._guidance_scale = guidance_scale
+
+        if isinstance(image_embeds, list):
+            image_embeds = torch.cat(image_embeds, dim=0)
+        batch_size = image_embeds.shape[0] * num_images_per_prompt
+        if isinstance(negative_image_embeds, list):
+            negative_image_embeds = torch.cat(negative_image_embeds, dim=0)
+
+        if self.do_classifier_free_guidance:
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(
+                dtype=self.unet.dtype, device=device
+            )
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        num_channels_latents = self.unet.config.in_channels
+
+        height, width = downscale_height_and_width(height, width, self.movq_scale_factor)
+
+        # create initial latent
+        latents = self.prepare_latents(
+            (batch_size, num_channels_latents, height, width),
+            image_embeds.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+
+        self._num_timesteps = len(timesteps)
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+            added_cond_kwargs = {"image_embeds": image_embeds}
+            noise_pred = self.unet(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=None,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+
+            if self.do_classifier_free_guidance:
+                noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1)
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                _, variance_pred_text = variance_pred.chunk(2)
+                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)
+
+            if not (
+                hasattr(self.scheduler.config, "variance_type")
+                and self.scheduler.config.variance_type in ["learned", "learned_range"]
+            ):
+                noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                noise_pred,
+                t,
+                latents,
+                generator=generator,
+            )[0]
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                image_embeds = callback_outputs.pop("image_embeds", image_embeds)
+                negative_image_embeds = callback_outputs.pop("negative_image_embeds", negative_image_embeds)
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        if output_type not in ["pt", "np", "pil", "latent"]:
+            raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
+
+        if not output_type == "latent":
+            # post-processing
+            image = self.movq.decode(latents, force_not_quantize=True)["sample"]
+            if output_type in ["np", "pil"]:
+                image = image * 0.5 + 0.5
+                image = image.clamp(0, 1)
+                image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            if output_type == "pil":
+                image = self.numpy_to_pil(image)
+        else:
+            image = latents
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..68334fef381195b0329f68838b47489f72f679e9
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
@@ -0,0 +1,854 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Dict, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...models import PriorTransformer, UNet2DConditionModel, VQModel
+from ...schedulers import DDPMScheduler, UnCLIPScheduler
+from ...utils import deprecate, logging, replace_example_docstring
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_kandinsky2_2 import KandinskyV22Pipeline
+from .pipeline_kandinsky2_2_img2img import KandinskyV22Img2ImgPipeline
+from .pipeline_kandinsky2_2_inpainting import KandinskyV22InpaintPipeline
+from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+TEXT2IMAGE_EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+
+        pipe = AutoPipelineForText2Image.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+
+        prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
+
+        image = pipe(prompt=prompt, num_inference_steps=25).images[0]
+        ```
+"""
+
+IMAGE2IMAGE_EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        from diffusers import AutoPipelineForImage2Image
+        import torch
+        import requests
+        from io import BytesIO
+        from PIL import Image
+        import os
+
+        pipe = AutoPipelineForImage2Image.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+
+        prompt = "A fantasy landscape, Cinematic lighting"
+        negative_prompt = "low quality, bad quality"
+
+        url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+        response = requests.get(url)
+        image = Image.open(BytesIO(response.content)).convert("RGB")
+        image.thumbnail((768, 768))
+
+        image = pipe(prompt=prompt, image=original_image, num_inference_steps=25).images[0]
+        ```
+"""
+
+INPAINT_EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        from diffusers import AutoPipelineForInpainting
+        from diffusers.utils import load_image
+        import torch
+        import numpy as np
+
+        pipe = AutoPipelineForInpainting.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+
+        prompt = "A fantasy landscape, Cinematic lighting"
+        negative_prompt = "low quality, bad quality"
+
+        original_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+        )
+
+        mask = np.zeros((768, 768), dtype=np.float32)
+        # Let's mask out an area above the cat's head
+        mask[:250, 250:-250] = 1
+
+        image = pipe(prompt=prompt, image=original_image, mask_image=mask, num_inference_steps=25).images[0]
+        ```
+"""
+
+
+class KandinskyV22CombinedPipeline(DiffusionPipeline):
+    """
+    Combined Pipeline for text-to-image generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+        prior_prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen image-encoder.
+        prior_text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        prior_tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        prior_scheduler ([`UnCLIPScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+        prior_image_processor ([`CLIPImageProcessor`]):
+            A image_processor to be used to preprocess image from clip.
+    """
+
+    model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
+    _load_connected_pipes = True
+    _exclude_from_cpu_offload = ["prior_prior"]
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        movq: VQModel,
+        prior_prior: PriorTransformer,
+        prior_image_encoder: CLIPVisionModelWithProjection,
+        prior_text_encoder: CLIPTextModelWithProjection,
+        prior_tokenizer: CLIPTokenizer,
+        prior_scheduler: UnCLIPScheduler,
+        prior_image_processor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+            prior_prior=prior_prior,
+            prior_image_encoder=prior_image_encoder,
+            prior_text_encoder=prior_text_encoder,
+            prior_tokenizer=prior_tokenizer,
+            prior_scheduler=prior_scheduler,
+            prior_image_processor=prior_image_processor,
+        )
+        self.prior_pipe = KandinskyV22PriorPipeline(
+            prior=prior_prior,
+            image_encoder=prior_image_encoder,
+            text_encoder=prior_text_encoder,
+            tokenizer=prior_tokenizer,
+            scheduler=prior_scheduler,
+            image_processor=prior_image_processor,
+        )
+        self.decoder_pipe = KandinskyV22Pipeline(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+
+    def progress_bar(self, iterable=None, total=None):
+        self.prior_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.enable_model_cpu_offload()
+
+    def set_progress_bar_config(self, **kwargs):
+        self.prior_pipe.set_progress_bar_config(**kwargs)
+        self.decoder_pipe.set_progress_bar_config(**kwargs)
+
+    @torch.no_grad()
+    @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        num_images_per_prompt: int = 1,
+        height: int = 512,
+        width: int = 512,
+        prior_guidance_scale: float = 4.0,
+        prior_num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        return_dict: bool = True,
+        prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            prior_num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            prior_callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference of the prior pipeline.
+                The function is called with the following arguments: `prior_callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
+            prior_callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the
+                list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in
+                the `._callback_tensor_inputs` attribute of your prior pipeline class.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference of the decoder pipeline.
+                The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline,
+                step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors
+                as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        prior_outputs = self.prior_pipe(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            num_inference_steps=prior_num_inference_steps,
+            generator=generator,
+            latents=latents,
+            guidance_scale=prior_guidance_scale,
+            output_type="pt",
+            return_dict=False,
+            callback_on_step_end=prior_callback_on_step_end,
+            callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs,
+        )
+        image_embeds = prior_outputs[0]
+        negative_image_embeds = prior_outputs[1]
+
+        prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
+
+        if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
+            prompt = (image_embeds.shape[0] // len(prompt)) * prompt
+
+        outputs = self.decoder_pipe(
+            image_embeds=image_embeds,
+            negative_image_embeds=negative_image_embeds,
+            width=width,
+            height=height,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            output_type=output_type,
+            callback=callback,
+            callback_steps=callback_steps,
+            return_dict=return_dict,
+            callback_on_step_end=callback_on_step_end,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+        self.maybe_free_model_hooks()
+
+        return outputs
+
+
+class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
+    """
+    Combined Pipeline for image-to-image generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+        prior_prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen image-encoder.
+        prior_text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        prior_tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        prior_scheduler ([`UnCLIPScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+        prior_image_processor ([`CLIPImageProcessor`]):
+            A image_processor to be used to preprocess image from clip.
+    """
+
+    model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
+    _load_connected_pipes = True
+    _exclude_from_cpu_offload = ["prior_prior"]
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        movq: VQModel,
+        prior_prior: PriorTransformer,
+        prior_image_encoder: CLIPVisionModelWithProjection,
+        prior_text_encoder: CLIPTextModelWithProjection,
+        prior_tokenizer: CLIPTokenizer,
+        prior_scheduler: UnCLIPScheduler,
+        prior_image_processor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+            prior_prior=prior_prior,
+            prior_image_encoder=prior_image_encoder,
+            prior_text_encoder=prior_text_encoder,
+            prior_tokenizer=prior_tokenizer,
+            prior_scheduler=prior_scheduler,
+            prior_image_processor=prior_image_processor,
+        )
+        self.prior_pipe = KandinskyV22PriorPipeline(
+            prior=prior_prior,
+            image_encoder=prior_image_encoder,
+            text_encoder=prior_text_encoder,
+            tokenizer=prior_tokenizer,
+            scheduler=prior_scheduler,
+            image_processor=prior_image_processor,
+        )
+        self.decoder_pipe = KandinskyV22Img2ImgPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        """
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+
+    def progress_bar(self, iterable=None, total=None):
+        self.prior_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.enable_model_cpu_offload()
+
+    def set_progress_bar_config(self, **kwargs):
+        self.prior_pipe.set_progress_bar_config(**kwargs)
+        self.decoder_pipe.set_progress_bar_config(**kwargs)
+
+    @torch.no_grad()
+    @replace_example_docstring(IMAGE2IMAGE_EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        strength: float = 0.3,
+        num_images_per_prompt: int = 1,
+        height: int = 512,
+        width: int = 512,
+        prior_guidance_scale: float = 4.0,
+        prior_num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        return_dict: bool = True,
+        prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
+                again.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            strength (`float`, *optional*, defaults to 0.3):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            prior_num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        prior_outputs = self.prior_pipe(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            num_inference_steps=prior_num_inference_steps,
+            generator=generator,
+            latents=latents,
+            guidance_scale=prior_guidance_scale,
+            output_type="pt",
+            return_dict=False,
+            callback_on_step_end=prior_callback_on_step_end,
+            callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs,
+        )
+        image_embeds = prior_outputs[0]
+        negative_image_embeds = prior_outputs[1]
+
+        prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
+        image = [image] if isinstance(image, PIL.Image.Image) else image
+
+        if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
+            prompt = (image_embeds.shape[0] // len(prompt)) * prompt
+
+        if (
+            isinstance(image, (list, tuple))
+            and len(image) < image_embeds.shape[0]
+            and image_embeds.shape[0] % len(image) == 0
+        ):
+            image = (image_embeds.shape[0] // len(image)) * image
+
+        outputs = self.decoder_pipe(
+            image=image,
+            image_embeds=image_embeds,
+            negative_image_embeds=negative_image_embeds,
+            width=width,
+            height=height,
+            strength=strength,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            output_type=output_type,
+            callback=callback,
+            callback_steps=callback_steps,
+            return_dict=return_dict,
+            callback_on_step_end=callback_on_step_end,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+
+        self.maybe_free_model_hooks()
+        return outputs
+
+
+class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
+    """
+    Combined Pipeline for inpainting generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+        prior_prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen image-encoder.
+        prior_text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        prior_tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        prior_scheduler ([`UnCLIPScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+        prior_image_processor ([`CLIPImageProcessor`]):
+            A image_processor to be used to preprocess image from clip.
+    """
+
+    model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
+    _load_connected_pipes = True
+    _exclude_from_cpu_offload = ["prior_prior"]
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        movq: VQModel,
+        prior_prior: PriorTransformer,
+        prior_image_encoder: CLIPVisionModelWithProjection,
+        prior_text_encoder: CLIPTextModelWithProjection,
+        prior_tokenizer: CLIPTokenizer,
+        prior_scheduler: UnCLIPScheduler,
+        prior_image_processor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+            prior_prior=prior_prior,
+            prior_image_encoder=prior_image_encoder,
+            prior_text_encoder=prior_text_encoder,
+            prior_tokenizer=prior_tokenizer,
+            prior_scheduler=prior_scheduler,
+            prior_image_processor=prior_image_processor,
+        )
+        self.prior_pipe = KandinskyV22PriorPipeline(
+            prior=prior_prior,
+            image_encoder=prior_image_encoder,
+            text_encoder=prior_text_encoder,
+            tokenizer=prior_tokenizer,
+            scheduler=prior_scheduler,
+            image_processor=prior_image_processor,
+        )
+        self.decoder_pipe = KandinskyV22InpaintPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+
+    def progress_bar(self, iterable=None, total=None):
+        self.prior_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.enable_model_cpu_offload()
+
+    def set_progress_bar_config(self, **kwargs):
+        self.prior_pipe.set_progress_bar_config(**kwargs)
+        self.decoder_pipe.set_progress_bar_config(**kwargs)
+
+    @torch.no_grad()
+    @replace_example_docstring(INPAINT_EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        num_images_per_prompt: int = 1,
+        height: int = 512,
+        width: int = 512,
+        prior_guidance_scale: float = 4.0,
+        prior_num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
+                again.
+            mask_image (`np.array`):
+                Tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while
+                black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single
+                channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3,
+                so the expected shape would be `(B, H, W, 1)`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            prior_num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            prior_callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `prior_callback_on_step_end(self: DiffusionPipeline, step: int, timestep:
+                int, callback_kwargs: Dict)`.
+            prior_callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the
+                list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in
+                the `._callback_tensor_inputs` attribute of your pipeline class.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        prior_kwargs = {}
+        if kwargs.get("prior_callback", None) is not None:
+            prior_kwargs["callback"] = kwargs.pop("prior_callback")
+            deprecate(
+                "prior_callback",
+                "1.0.0",
+                "Passing `prior_callback` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`",
+            )
+        if kwargs.get("prior_callback_steps", None) is not None:
+            deprecate(
+                "prior_callback_steps",
+                "1.0.0",
+                "Passing `prior_callback_steps` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`",
+            )
+            prior_kwargs["callback_steps"] = kwargs.pop("prior_callback_steps")
+
+        prior_outputs = self.prior_pipe(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            num_inference_steps=prior_num_inference_steps,
+            generator=generator,
+            latents=latents,
+            guidance_scale=prior_guidance_scale,
+            output_type="pt",
+            return_dict=False,
+            callback_on_step_end=prior_callback_on_step_end,
+            callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs,
+            **prior_kwargs,
+        )
+        image_embeds = prior_outputs[0]
+        negative_image_embeds = prior_outputs[1]
+
+        prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
+        image = [image] if isinstance(image, PIL.Image.Image) else image
+        mask_image = [mask_image] if isinstance(mask_image, PIL.Image.Image) else mask_image
+
+        if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
+            prompt = (image_embeds.shape[0] // len(prompt)) * prompt
+
+        if (
+            isinstance(image, (list, tuple))
+            and len(image) < image_embeds.shape[0]
+            and image_embeds.shape[0] % len(image) == 0
+        ):
+            image = (image_embeds.shape[0] // len(image)) * image
+
+        if (
+            isinstance(mask_image, (list, tuple))
+            and len(mask_image) < image_embeds.shape[0]
+            and image_embeds.shape[0] % len(mask_image) == 0
+        ):
+            mask_image = (image_embeds.shape[0] // len(mask_image)) * mask_image
+
+        outputs = self.decoder_pipe(
+            image=image,
+            mask_image=mask_image,
+            image_embeds=image_embeds,
+            negative_image_embeds=negative_image_embeds,
+            width=width,
+            height=height,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback_on_step_end=callback_on_step_end,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            **kwargs,
+        )
+        self.maybe_free_model_hooks()
+
+        return outputs
diff --git a/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..0130c3951b38b8ff404bf5747c1bb194abc351b3
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py
@@ -0,0 +1,320 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, List, Optional, Union
+
+import torch
+
+from ...models import UNet2DConditionModel, VQModel
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    logging,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> import numpy as np
+
+        >>> from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline
+        >>> from transformers import pipeline
+        >>> from diffusers.utils import load_image
+
+
+        >>> def make_hint(image, depth_estimator):
+        ...     image = depth_estimator(image)["depth"]
+        ...     image = np.array(image)
+        ...     image = image[:, :, None]
+        ...     image = np.concatenate([image, image, image], axis=2)
+        ...     detected_map = torch.from_numpy(image).float() / 255.0
+        ...     hint = detected_map.permute(2, 0, 1)
+        ...     return hint
+
+
+        >>> depth_estimator = pipeline("depth-estimation")
+
+        >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior = pipe_prior.to("cuda")
+
+        >>> pipe = KandinskyV22ControlnetPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+
+        >>> img = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... ).resize((768, 768))
+
+        >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda")
+
+        >>> prompt = "A robot, 4k photo"
+        >>> negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature"
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(43)
+
+        >>> image_emb, zero_image_emb = pipe_prior(
+        ...     prompt=prompt, negative_prompt=negative_prior_prompt, generator=generator
+        ... ).to_tuple()
+
+        >>> images = pipe(
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=zero_image_emb,
+        ...     hint=hint,
+        ...     num_inference_steps=50,
+        ...     generator=generator,
+        ...     height=768,
+        ...     width=768,
+        ... ).images
+
+        >>> images[0].save("robot_cat.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width
+def downscale_height_and_width(height, width, scale_factor=8):
+    new_height = height // scale_factor**2
+    if height % scale_factor**2 != 0:
+        new_height += 1
+    new_width = width // scale_factor**2
+    if width % scale_factor**2 != 0:
+        new_width += 1
+    return new_height * scale_factor, new_width * scale_factor
+
+
+class KandinskyV22ControlnetPipeline(DiffusionPipeline):
+    """
+    Pipeline for text-to-image generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        scheduler ([`DDIMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+    """
+
+    model_cpu_offload_seq = "unet->movq"
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        movq: VQModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        hint: torch.Tensor,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            hint (`torch.Tensor`):
+                The controlnet condition.
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for text prompt, that will be used to condition the image generation.
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for negative text prompt, will be used to condition the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        device = self._execution_device
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if isinstance(image_embeds, list):
+            image_embeds = torch.cat(image_embeds, dim=0)
+        if isinstance(negative_image_embeds, list):
+            negative_image_embeds = torch.cat(negative_image_embeds, dim=0)
+        if isinstance(hint, list):
+            hint = torch.cat(hint, dim=0)
+
+        batch_size = image_embeds.shape[0] * num_images_per_prompt
+
+        if do_classifier_free_guidance:
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            hint = hint.repeat_interleave(num_images_per_prompt, dim=0)
+
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(
+                dtype=self.unet.dtype, device=device
+            )
+            hint = torch.cat([hint, hint], dim=0).to(dtype=self.unet.dtype, device=device)
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps_tensor = self.scheduler.timesteps
+
+        num_channels_latents = self.movq.config.latent_channels
+
+        height, width = downscale_height_and_width(height, width, self.movq_scale_factor)
+
+        # create initial latent
+        latents = self.prepare_latents(
+            (batch_size, num_channels_latents, height, width),
+            image_embeds.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            added_cond_kwargs = {"image_embeds": image_embeds, "hint": hint}
+            noise_pred = self.unet(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=None,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+
+            if do_classifier_free_guidance:
+                noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1)
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                _, variance_pred_text = variance_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)
+
+            if not (
+                hasattr(self.scheduler.config, "variance_type")
+                and self.scheduler.config.variance_type in ["learned", "learned_range"]
+            ):
+                noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                noise_pred,
+                t,
+                latents,
+                generator=generator,
+            )[0]
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+        # post-processing
+        image = self.movq.decode(latents, force_not_quantize=True)["sample"]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if output_type not in ["pt", "np", "pil"]:
+            raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
+
+        if output_type in ["np", "pil"]:
+            image = image * 0.5 + 0.5
+            image = image.clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..12be1534c642c49dd04ca847b80e718969a2c3ee
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py
@@ -0,0 +1,381 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from PIL import Image
+
+from ...models import UNet2DConditionModel, VQModel
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    logging,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> import numpy as np
+
+        >>> from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline
+        >>> from transformers import pipeline
+        >>> from diffusers.utils import load_image
+
+
+        >>> def make_hint(image, depth_estimator):
+        ...     image = depth_estimator(image)["depth"]
+        ...     image = np.array(image)
+        ...     image = image[:, :, None]
+        ...     image = np.concatenate([image, image, image], axis=2)
+        ...     detected_map = torch.from_numpy(image).float() / 255.0
+        ...     hint = detected_map.permute(2, 0, 1)
+        ...     return hint
+
+
+        >>> depth_estimator = pipeline("depth-estimation")
+
+        >>> pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior = pipe_prior.to("cuda")
+
+        >>> pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> img = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... ).resize((768, 768))
+
+
+        >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda")
+
+        >>> prompt = "A robot, 4k photo"
+        >>> negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature"
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(43)
+
+        >>> img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85, generator=generator)
+        >>> negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=1, generator=generator)
+
+        >>> images = pipe(
+        ...     image=img,
+        ...     strength=0.5,
+        ...     image_embeds=img_emb.image_embeds,
+        ...     negative_image_embeds=negative_emb.image_embeds,
+        ...     hint=hint,
+        ...     num_inference_steps=50,
+        ...     generator=generator,
+        ...     height=768,
+        ...     width=768,
+        ... ).images
+
+        >>> images[0].save("robot_cat.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width
+def downscale_height_and_width(height, width, scale_factor=8):
+    new_height = height // scale_factor**2
+    if height % scale_factor**2 != 0:
+        new_height += 1
+    new_width = width // scale_factor**2
+    if width % scale_factor**2 != 0:
+        new_width += 1
+    return new_height * scale_factor, new_width * scale_factor
+
+
+# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.prepare_image
+def prepare_image(pil_image, w=512, h=512):
+    pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
+    arr = np.array(pil_image.convert("RGB"))
+    arr = arr.astype(np.float32) / 127.5 - 1
+    arr = np.transpose(arr, [2, 0, 1])
+    image = torch.from_numpy(arr).unsqueeze(0)
+    return image
+
+
+class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
+    """
+    Pipeline for image-to-image generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        scheduler ([`DDIMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+    """
+
+    model_cpu_offload_seq = "unet->movq"
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        movq: VQModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+
+    # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.KandinskyImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_img2img.KandinskyV22Img2ImgPipeline.prepare_latents
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    self.movq.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = self.movq.encode(image).latent_dist.sample(generator)
+
+            init_latents = self.movq.config.scaling_factor * init_latents
+
+        init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+        latents = init_latents
+
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        hint: torch.Tensor,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        strength: float = 0.3,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for text prompt, that will be used to condition the image generation.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
+                again.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            hint (`torch.Tensor`):
+                The controlnet condition.
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for negative text prompt, will be used to condition the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        device = self._execution_device
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if isinstance(image_embeds, list):
+            image_embeds = torch.cat(image_embeds, dim=0)
+        if isinstance(negative_image_embeds, list):
+            negative_image_embeds = torch.cat(negative_image_embeds, dim=0)
+        if isinstance(hint, list):
+            hint = torch.cat(hint, dim=0)
+
+        batch_size = image_embeds.shape[0]
+
+        if do_classifier_free_guidance:
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            hint = hint.repeat_interleave(num_images_per_prompt, dim=0)
+
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(
+                dtype=self.unet.dtype, device=device
+            )
+            hint = torch.cat([hint, hint], dim=0).to(dtype=self.unet.dtype, device=device)
+
+        if not isinstance(image, list):
+            image = [image]
+        if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image):
+            raise ValueError(
+                f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support  PIL image and pytorch tensor"
+            )
+
+        image = torch.cat([prepare_image(i, width, height) for i in image], dim=0)
+        image = image.to(dtype=image_embeds.dtype, device=device)
+
+        latents = self.movq.encode(image)["latents"]
+        latents = latents.repeat_interleave(num_images_per_prompt, dim=0)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        height, width = downscale_height_and_width(height, width, self.movq_scale_factor)
+        latents = self.prepare_latents(
+            latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator
+        )
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            added_cond_kwargs = {"image_embeds": image_embeds, "hint": hint}
+            noise_pred = self.unet(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=None,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+
+            if do_classifier_free_guidance:
+                noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1)
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                _, variance_pred_text = variance_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)
+
+            if not (
+                hasattr(self.scheduler.config, "variance_type")
+                and self.scheduler.config.variance_type in ["learned", "learned_range"]
+            ):
+                noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)
+
+            # compute the previous noisy sample x_t -> x_t-1
+
+            latents = self.scheduler.step(
+                noise_pred,
+                t,
+                latents,
+                generator=generator,
+            )[0]
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        # post-processing
+        image = self.movq.decode(latents, force_not_quantize=True)["sample"]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if output_type not in ["pt", "np", "pil"]:
+            raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
+
+        if output_type in ["np", "pil"]:
+            image = image * 0.5 + 0.5
+            image = image.clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..899273a1a7362374cb91d66366095910ab6d8403
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py
@@ -0,0 +1,399 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from PIL import Image
+
+from ...models import UNet2DConditionModel, VQModel
+from ...schedulers import DDPMScheduler
+from ...utils import deprecate, logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline
+        >>> from diffusers.utils import load_image
+        >>> import torch
+
+        >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+
+        >>> prompt = "A red cartoon frog, 4k"
+        >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
+
+        >>> pipe = KandinskyV22Img2ImgPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> init_image = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/frog.png"
+        ... )
+
+        >>> image = pipe(
+        ...     image=init_image,
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=zero_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=100,
+        ...     strength=0.2,
+        ... ).images
+
+        >>> image[0].save("red_frog.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width
+def downscale_height_and_width(height, width, scale_factor=8):
+    new_height = height // scale_factor**2
+    if height % scale_factor**2 != 0:
+        new_height += 1
+    new_width = width // scale_factor**2
+    if width % scale_factor**2 != 0:
+        new_width += 1
+    return new_height * scale_factor, new_width * scale_factor
+
+
+# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.prepare_image
+def prepare_image(pil_image, w=512, h=512):
+    pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
+    arr = np.array(pil_image.convert("RGB"))
+    arr = arr.astype(np.float32) / 127.5 - 1
+    arr = np.transpose(arr, [2, 0, 1])
+    image = torch.from_numpy(arr).unsqueeze(0)
+    return image
+
+
+class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
+    """
+    Pipeline for image-to-image generation using Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        scheduler ([`DDIMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+    """
+
+    model_cpu_offload_seq = "unet->movq"
+    _callback_tensor_inputs = ["latents", "image_embeds", "negative_image_embeds"]
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        movq: VQModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+
+    # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.KandinskyImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    self.movq.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = self.movq.encode(image).latent_dist.sample(generator)
+
+            init_latents = self.movq.config.scaling_factor * init_latents
+
+        init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+        latents = init_latents
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        strength: float = 0.3,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for text prompt, that will be used to condition the image generation.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
+                again.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for negative text prompt, will be used to condition the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        device = self._execution_device
+
+        self._guidance_scale = guidance_scale
+
+        if isinstance(image_embeds, list):
+            image_embeds = torch.cat(image_embeds, dim=0)
+        batch_size = image_embeds.shape[0]
+        if isinstance(negative_image_embeds, list):
+            negative_image_embeds = torch.cat(negative_image_embeds, dim=0)
+
+        if self.do_classifier_free_guidance:
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(
+                dtype=self.unet.dtype, device=device
+            )
+
+        if not isinstance(image, list):
+            image = [image]
+        if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image):
+            raise ValueError(
+                f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support  PIL image and pytorch tensor"
+            )
+
+        image = torch.cat([prepare_image(i, width, height) for i in image], dim=0)
+        image = image.to(dtype=image_embeds.dtype, device=device)
+
+        latents = self.movq.encode(image)["latents"]
+        latents = latents.repeat_interleave(num_images_per_prompt, dim=0)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        height, width = downscale_height_and_width(height, width, self.movq_scale_factor)
+        latents = self.prepare_latents(
+            latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator
+        )
+        self._num_timesteps = len(timesteps)
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+            added_cond_kwargs = {"image_embeds": image_embeds}
+            noise_pred = self.unet(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=None,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+
+            if self.do_classifier_free_guidance:
+                noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1)
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                _, variance_pred_text = variance_pred.chunk(2)
+                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)
+
+            if not (
+                hasattr(self.scheduler.config, "variance_type")
+                and self.scheduler.config.variance_type in ["learned", "learned_range"]
+            ):
+                noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                noise_pred,
+                t,
+                latents,
+                generator=generator,
+            )[0]
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                image_embeds = callback_outputs.pop("image_embeds", image_embeds)
+                negative_image_embeds = callback_outputs.pop("negative_image_embeds", negative_image_embeds)
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        if output_type not in ["pt", "np", "pil", "latent"]:
+            raise ValueError(
+                f"Only the output types `pt`, `pil` ,`np` and `latent` are supported not output_type={output_type}"
+            )
+
+        if not output_type == "latent":
+            # post-processing
+            image = self.movq.decode(latents, force_not_quantize=True)["sample"]
+            if output_type in ["np", "pil"]:
+                image = image * 0.5 + 0.5
+                image = image.clamp(0, 1)
+                image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            if output_type == "pil":
+                image = self.numpy_to_pil(image)
+        else:
+            image = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5ba7a0011a14058a01f53c34ef5dfe476e28708
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py
@@ -0,0 +1,556 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from copy import deepcopy
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from packaging import version
+from PIL import Image
+
+from ... import __version__
+from ...models import UNet2DConditionModel, VQModel
+from ...schedulers import DDPMScheduler
+from ...utils import deprecate, logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline
+        >>> from diffusers.utils import load_image
+        >>> import torch
+        >>> import numpy as np
+
+        >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+
+        >>> prompt = "a hat"
+        >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
+
+        >>> pipe = KandinskyV22InpaintPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> init_image = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... )
+
+        >>> mask = np.zeros((768, 768), dtype=np.float32)
+        >>> mask[:250, 250:-250] = 1
+
+        >>> out = pipe(
+        ...     image=init_image,
+        ...     mask_image=mask,
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=zero_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=50,
+        ... )
+
+        >>> image = out.images[0]
+        >>> image.save("cat_with_hat.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width
+def downscale_height_and_width(height, width, scale_factor=8):
+    new_height = height // scale_factor**2
+    if height % scale_factor**2 != 0:
+        new_height += 1
+    new_width = width // scale_factor**2
+    if width % scale_factor**2 != 0:
+        new_width += 1
+    return new_height * scale_factor, new_width * scale_factor
+
+
+# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint.prepare_mask
+def prepare_mask(masks):
+    prepared_masks = []
+    for mask in masks:
+        old_mask = deepcopy(mask)
+        for i in range(mask.shape[1]):
+            for j in range(mask.shape[2]):
+                if old_mask[0][i][j] == 1:
+                    continue
+                if i != 0:
+                    mask[:, i - 1, j] = 0
+                if j != 0:
+                    mask[:, i, j - 1] = 0
+                if i != 0 and j != 0:
+                    mask[:, i - 1, j - 1] = 0
+                if i != mask.shape[1] - 1:
+                    mask[:, i + 1, j] = 0
+                if j != mask.shape[2] - 1:
+                    mask[:, i, j + 1] = 0
+                if i != mask.shape[1] - 1 and j != mask.shape[2] - 1:
+                    mask[:, i + 1, j + 1] = 0
+        prepared_masks.append(mask)
+    return torch.stack(prepared_masks, dim=0)
+
+
+# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint.prepare_mask_and_masked_image
+def prepare_mask_and_masked_image(image, mask, height, width):
+    r"""
+    Prepares a pair (mask, image) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will
+    be converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for
+    the ``image`` and ``1`` for the ``mask``.
+
+    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
+
+    Args:
+        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
+        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
+        height (`int`, *optional*, defaults to 512):
+            The height in pixels of the generated image.
+        width (`int`, *optional*, defaults to 512):
+            The width in pixels of the generated image.
+
+
+    Raises:
+        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+            (ot the other way around).
+
+    Returns:
+        tuple[torch.Tensor]: The pair (mask, image) as ``torch.Tensor`` with 4
+            dimensions: ``batch x channels x height x width``.
+    """
+
+    if image is None:
+        raise ValueError("`image` input cannot be undefined.")
+
+    if mask is None:
+        raise ValueError("`mask_image` input cannot be undefined.")
+
+    if isinstance(image, torch.Tensor):
+        if not isinstance(mask, torch.Tensor):
+            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
+
+        # Batch single image
+        if image.ndim == 3:
+            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
+            image = image.unsqueeze(0)
+
+        # Batch and add channel dim for single mask
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0).unsqueeze(0)
+
+        # Batch single mask or add channel dim
+        if mask.ndim == 3:
+            # Single batched mask, no channel dim or single mask not batched but channel dim
+            if mask.shape[0] == 1:
+                mask = mask.unsqueeze(0)
+
+            # Batched masks no channel dim
+            else:
+                mask = mask.unsqueeze(1)
+
+        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
+
+        # Check image is in [-1, 1]
+        if image.min() < -1 or image.max() > 1:
+            raise ValueError("Image should be in [-1, 1] range")
+
+        # Check mask is in [0, 1]
+        if mask.min() < 0 or mask.max() > 1:
+            raise ValueError("Mask should be in [0, 1] range")
+
+        # Binarize mask
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+
+        # Image as float32
+        image = image.to(dtype=torch.float32)
+    elif isinstance(mask, torch.Tensor):
+        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            # resize all images w.r.t passed height an width
+            image = [i.resize((width, height), resample=Image.BICUBIC, reducing_gap=1) for i in image]
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+        # preprocess mask
+        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
+            mask = [mask]
+
+        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+            mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
+            mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+            mask = mask.astype(np.float32) / 255.0
+        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+        mask = torch.from_numpy(mask)
+
+    mask = 1 - mask
+
+    return mask, image
+
+
+class KandinskyV22InpaintPipeline(DiffusionPipeline):
+    """
+    Pipeline for text-guided image inpainting using Kandinsky2.1
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        scheduler ([`DDIMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        movq ([`VQModel`]):
+            MoVQ Decoder to generate the image from the latents.
+    """
+
+    model_cpu_offload_seq = "unet->movq"
+    _callback_tensor_inputs = ["latents", "image_embeds", "negative_image_embeds", "masked_image", "mask_image"]
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler,
+        movq: VQModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+        self._warn_has_been_called = False
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        image: Union[torch.Tensor, PIL.Image.Image],
+        mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 4.0,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for text prompt, that will be used to condition the image generation.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`np.array`):
+                Tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while
+                black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single
+                channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3,
+                so the expected shape would be `(B, H, W, 1)`.
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
+                The clip image embeddings for negative text prompt, will be used to condition the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        if not self._warn_has_been_called and version.parse(version.parse(__version__).base_version) < version.parse(
+            "0.23.0.dev0"
+        ):
+            logger.warning(
+                "Please note that the expected format of `mask_image` has recently been changed. "
+                "Before diffusers == 0.19.0, Kandinsky Inpainting pipelines repainted black pixels and preserved black pixels. "
+                "As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. "
+                "This way, Kandinsky's masking behavior is aligned with Stable Diffusion. "
+                "THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. "
+                "This warning will be surpressed after the first inference call and will be removed in diffusers>0.23.0"
+            )
+            self._warn_has_been_called = True
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        self._guidance_scale = guidance_scale
+
+        device = self._execution_device
+
+        if isinstance(image_embeds, list):
+            image_embeds = torch.cat(image_embeds, dim=0)
+        batch_size = image_embeds.shape[0] * num_images_per_prompt
+        if isinstance(negative_image_embeds, list):
+            negative_image_embeds = torch.cat(negative_image_embeds, dim=0)
+
+        if self.do_classifier_free_guidance:
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(
+                dtype=self.unet.dtype, device=device
+            )
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # preprocess image and mask
+        mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width)
+
+        image = image.to(dtype=image_embeds.dtype, device=device)
+        image = self.movq.encode(image)["latents"]
+
+        mask_image = mask_image.to(dtype=image_embeds.dtype, device=device)
+
+        image_shape = tuple(image.shape[-2:])
+        mask_image = F.interpolate(
+            mask_image,
+            image_shape,
+            mode="nearest",
+        )
+        mask_image = prepare_mask(mask_image)
+        masked_image = image * mask_image
+
+        mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0)
+        masked_image = masked_image.repeat_interleave(num_images_per_prompt, dim=0)
+        if self.do_classifier_free_guidance:
+            mask_image = mask_image.repeat(2, 1, 1, 1)
+            masked_image = masked_image.repeat(2, 1, 1, 1)
+
+        num_channels_latents = self.movq.config.latent_channels
+
+        height, width = downscale_height_and_width(height, width, self.movq_scale_factor)
+
+        # create initial latent
+        latents = self.prepare_latents(
+            (batch_size, num_channels_latents, height, width),
+            image_embeds.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+        noise = torch.clone(latents)
+
+        self._num_timesteps = len(timesteps)
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+            latent_model_input = torch.cat([latent_model_input, masked_image, mask_image], dim=1)
+
+            added_cond_kwargs = {"image_embeds": image_embeds}
+            noise_pred = self.unet(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=None,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+
+            if self.do_classifier_free_guidance:
+                noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1)
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                _, variance_pred_text = variance_pred.chunk(2)
+                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)
+
+            if not (
+                hasattr(self.scheduler.config, "variance_type")
+                and self.scheduler.config.variance_type in ["learned", "learned_range"]
+            ):
+                noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                noise_pred,
+                t,
+                latents,
+                generator=generator,
+            )[0]
+            init_latents_proper = image[:1]
+            init_mask = mask_image[:1]
+
+            if i < len(timesteps) - 1:
+                noise_timestep = timesteps[i + 1]
+                init_latents_proper = self.scheduler.add_noise(
+                    init_latents_proper, noise, torch.tensor([noise_timestep])
+                )
+
+            latents = init_mask * init_latents_proper + (1 - init_mask) * latents
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                image_embeds = callback_outputs.pop("image_embeds", image_embeds)
+                negative_image_embeds = callback_outputs.pop("negative_image_embeds", negative_image_embeds)
+                masked_image = callback_outputs.pop("masked_image", masked_image)
+                mask_image = callback_outputs.pop("mask_image", mask_image)
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        # post-processing
+        latents = mask_image[:1] * image[:1] + (1 - mask_image[:1]) * latents
+
+        if output_type not in ["pt", "np", "pil", "latent"]:
+            raise ValueError(
+                f"Only the output types `pt`, `pil`, `np` and `latent` are supported not output_type={output_type}"
+            )
+
+        if not output_type == "latent":
+            image = self.movq.decode(latents, force_not_quantize=True)["sample"]
+
+            if output_type in ["np", "pil"]:
+                image = image * 0.5 + 0.5
+                image = image.clamp(0, 1)
+                image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+            if output_type == "pil":
+                image = self.numpy_to_pil(image)
+        else:
+            image = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2134b22b40b70ab26e3225de08cead28542f34f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py
@@ -0,0 +1,549 @@
+from typing import Callable, Dict, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...models import PriorTransformer
+from ...schedulers import UnCLIPScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..kandinsky import KandinskyPriorPipelineOutput
+from ..pipeline_utils import DiffusionPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline
+        >>> import torch
+
+        >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior")
+        >>> pipe_prior.to("cuda")
+        >>> prompt = "red cat, 4k photo"
+        >>> image_emb, negative_image_emb = pipe_prior(prompt).to_tuple()
+
+        >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder")
+        >>> pipe.to("cuda")
+        >>> image = pipe(
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=negative_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=50,
+        ... ).images
+        >>> image[0].save("cat.png")
+        ```
+"""
+
+EXAMPLE_INTERPOLATE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline
+        >>> from diffusers.utils import load_image
+        >>> import PIL
+        >>> import torch
+        >>> from torchvision import transforms
+
+        >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+        >>> img1 = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... )
+        >>> img2 = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/starry_night.jpeg"
+        ... )
+        >>> images_texts = ["a cat", img1, img2]
+        >>> weights = [0.3, 0.3, 0.4]
+        >>> out = pipe_prior.interpolate(images_texts, weights)
+        >>> pipe = KandinskyV22Pipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+        >>> image = pipe(
+        ...     image_embeds=out.image_embeds,
+        ...     negative_image_embeds=out.negative_image_embeds,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=50,
+        ... ).images[0]
+        >>> image.save("starry_cat.png")
+        ```
+"""
+
+
+class KandinskyV22PriorPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating image prior for Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen image-encoder.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        scheduler ([`UnCLIPScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+        image_processor ([`CLIPImageProcessor`]):
+            A image_processor to be used to preprocess image from clip.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->prior"
+    _exclude_from_cpu_offload = ["prior"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "text_encoder_hidden_states", "text_mask"]
+
+    def __init__(
+        self,
+        prior: PriorTransformer,
+        image_encoder: CLIPVisionModelWithProjection,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        scheduler: UnCLIPScheduler,
+        image_processor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            prior=prior,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            image_processor=image_processor,
+        )
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
+    def interpolate(
+        self,
+        images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]],
+        weights: List[float],
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        negative_prior_prompt: Optional[str] = None,
+        negative_prompt: str = "",
+        guidance_scale: float = 4.0,
+        device=None,
+    ):
+        """
+        Function invoked when using the prior pipeline for interpolation.
+
+        Args:
+            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`):
+                list of prompts and images to guide the image generation.
+            weights: (`List[float]`):
+                list of weights for each condition in `images_and_prompts`
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            negative_prior_prompt (`str`, *optional*):
+                The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if
+                `guidance_scale` is less than `1`).
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if
+                `guidance_scale` is less than `1`).
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+
+        Examples:
+
+        Returns:
+            [`KandinskyPriorPipelineOutput`] or `tuple`
+        """
+
+        device = device or self.device
+
+        if len(images_and_prompts) != len(weights):
+            raise ValueError(
+                f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length"
+            )
+
+        image_embeddings = []
+        for cond, weight in zip(images_and_prompts, weights):
+            if isinstance(cond, str):
+                image_emb = self(
+                    cond,
+                    num_inference_steps=num_inference_steps,
+                    num_images_per_prompt=num_images_per_prompt,
+                    generator=generator,
+                    latents=latents,
+                    negative_prompt=negative_prior_prompt,
+                    guidance_scale=guidance_scale,
+                ).image_embeds.unsqueeze(0)
+
+            elif isinstance(cond, (PIL.Image.Image, torch.Tensor)):
+                if isinstance(cond, PIL.Image.Image):
+                    cond = (
+                        self.image_processor(cond, return_tensors="pt")
+                        .pixel_values[0]
+                        .unsqueeze(0)
+                        .to(dtype=self.image_encoder.dtype, device=device)
+                    )
+
+                image_emb = self.image_encoder(cond)["image_embeds"].repeat(num_images_per_prompt, 1).unsqueeze(0)
+
+            else:
+                raise ValueError(
+                    f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor`  but is {type(cond)}"
+                )
+
+            image_embeddings.append(image_emb * weight)
+
+        image_emb = torch.cat(image_embeddings).sum(dim=0)
+
+        out_zero = self(
+            negative_prompt,
+            num_inference_steps=num_inference_steps,
+            num_images_per_prompt=num_images_per_prompt,
+            generator=generator,
+            latents=latents,
+            negative_prompt=negative_prior_prompt,
+            guidance_scale=guidance_scale,
+        )
+        zero_image_emb = out_zero.negative_image_embeds if negative_prompt == "" else out_zero.image_embeds
+
+        return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb)
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.get_zero_embed
+    def get_zero_embed(self, batch_size=1, device=None):
+        device = device or self.device
+        zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to(
+            device=device, dtype=self.image_encoder.dtype
+        )
+        zero_image_emb = self.image_encoder(zero_img)["image_embeds"]
+        zero_image_emb = zero_image_emb.repeat(batch_size, 1)
+        return zero_image_emb
+
+    # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+    ):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        text_mask = text_inputs.attention_mask.bool().to(device)
+
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
+        text_encoder_output = self.text_encoder(text_input_ids.to(device))
+
+        prompt_embeds = text_encoder_output.text_embeds
+        text_encoder_hidden_states = text_encoder_output.last_hidden_state
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+            uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        guidance_scale: float = 4.0,
+        output_type: Optional[str] = "pt",  # pt only
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            output_type (`str`, *optional*, defaults to `"pt"`):
+                The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"`
+                (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`KandinskyPriorPipelineOutput`] or `tuple`
+        """
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        elif not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt]
+        elif not isinstance(negative_prompt, list) and negative_prompt is not None:
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+        # if the negative prompt is defined we double the batch size to
+        # directly retrieve the negative prompt embedding
+        if negative_prompt is not None:
+            prompt = prompt + negative_prompt
+            negative_prompt = 2 * negative_prompt
+
+        device = self._execution_device
+
+        batch_size = len(prompt)
+        batch_size = batch_size * num_images_per_prompt
+
+        self._guidance_scale = guidance_scale
+
+        prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
+            prompt, device, num_images_per_prompt, self.do_classifier_free_guidance, negative_prompt
+        )
+
+        # prior
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        embedding_dim = self.prior.config.embedding_dim
+
+        latents = self.prepare_latents(
+            (batch_size, embedding_dim),
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+        self._num_timesteps = len(timesteps)
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+            predicted_image_embedding = self.prior(
+                latent_model_input,
+                timestep=t,
+                proj_embedding=prompt_embeds,
+                encoder_hidden_states=text_encoder_hidden_states,
+                attention_mask=text_mask,
+            ).predicted_image_embedding
+
+            if self.do_classifier_free_guidance:
+                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = predicted_image_embedding_uncond + self.guidance_scale * (
+                    predicted_image_embedding_text - predicted_image_embedding_uncond
+                )
+
+            if i + 1 == timesteps.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = timesteps[i + 1]
+
+            latents = self.scheduler.step(
+                predicted_image_embedding,
+                timestep=t,
+                sample=latents,
+                generator=generator,
+                prev_timestep=prev_timestep,
+            ).prev_sample
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                text_encoder_hidden_states = callback_outputs.pop(
+                    "text_encoder_hidden_states", text_encoder_hidden_states
+                )
+                text_mask = callback_outputs.pop("text_mask", text_mask)
+
+        latents = self.prior.post_process_latents(latents)
+
+        image_embeddings = latents
+
+        # if negative prompt has been defined, we retrieve split the image embedding into two
+        if negative_prompt is None:
+            zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
+        else:
+            image_embeddings, zero_embeds = image_embeddings.chunk(2)
+
+        self.maybe_free_model_hooks()
+
+        if output_type not in ["pt", "np"]:
+            raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}")
+
+        if output_type == "np":
+            image_embeddings = image_embeddings.cpu().numpy()
+            zero_embeds = zero_embeds.cpu().numpy()
+
+        if not return_dict:
+            return (image_embeddings, zero_embeds)
+
+        return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec6509bb3cb59e5cc53de00a2b99f78f7abbe4ee
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py
@@ -0,0 +1,563 @@
+from typing import List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...models import PriorTransformer
+from ...schedulers import UnCLIPScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..kandinsky import KandinskyPriorPipelineOutput
+from ..pipeline_utils import DiffusionPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorEmb2EmbPipeline
+        >>> import torch
+
+        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+
+        >>> prompt = "red cat, 4k photo"
+        >>> img = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... )
+        >>> image_emb, nagative_image_emb = pipe_prior(prompt, image=img, strength=0.2).to_tuple()
+
+        >>> pipe = KandinskyPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-decoder, torch_dtype=torch.float16"
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> image = pipe(
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=negative_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=100,
+        ... ).images
+
+        >>> image[0].save("cat.png")
+        ```
+"""
+
+EXAMPLE_INTERPOLATE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22Pipeline
+        >>> from diffusers.utils import load_image
+        >>> import PIL
+
+        >>> import torch
+        >>> from torchvision import transforms
+
+        >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+
+        >>> img1 = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... )
+
+        >>> img2 = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/starry_night.jpeg"
+        ... )
+
+        >>> images_texts = ["a cat", img1, img2]
+        >>> weights = [0.3, 0.3, 0.4]
+        >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights)
+
+        >>> pipe = KandinskyV22Pipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> image = pipe(
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=zero_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=150,
+        ... ).images[0]
+
+        >>> image.save("starry_cat.png")
+        ```
+"""
+
+
+class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating image prior for Kandinsky
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen image-encoder.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        scheduler ([`UnCLIPScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->prior"
+    _exclude_from_cpu_offload = ["prior"]
+
+    def __init__(
+        self,
+        prior: PriorTransformer,
+        image_encoder: CLIPVisionModelWithProjection,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        scheduler: UnCLIPScheduler,
+        image_processor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            prior=prior,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            image_processor=image_processor,
+        )
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
+    def interpolate(
+        self,
+        images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]],
+        weights: List[float],
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        negative_prior_prompt: Optional[str] = None,
+        negative_prompt: str = "",
+        guidance_scale: float = 4.0,
+        device=None,
+    ):
+        """
+        Function invoked when using the prior pipeline for interpolation.
+
+        Args:
+            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`):
+                list of prompts and images to guide the image generation.
+            weights: (`List[float]`):
+                list of weights for each condition in `images_and_prompts`
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            negative_prior_prompt (`str`, *optional*):
+                The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if
+                `guidance_scale` is less than `1`).
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if
+                `guidance_scale` is less than `1`).
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+
+        Examples:
+
+        Returns:
+            [`KandinskyPriorPipelineOutput`] or `tuple`
+        """
+
+        device = device or self.device
+
+        if len(images_and_prompts) != len(weights):
+            raise ValueError(
+                f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length"
+            )
+
+        image_embeddings = []
+        for cond, weight in zip(images_and_prompts, weights):
+            if isinstance(cond, str):
+                image_emb = self(
+                    cond,
+                    num_inference_steps=num_inference_steps,
+                    num_images_per_prompt=num_images_per_prompt,
+                    generator=generator,
+                    latents=latents,
+                    negative_prompt=negative_prior_prompt,
+                    guidance_scale=guidance_scale,
+                ).image_embeds.unsqueeze(0)
+
+            elif isinstance(cond, (PIL.Image.Image, torch.Tensor)):
+                image_emb = self._encode_image(
+                    cond, device=device, num_images_per_prompt=num_images_per_prompt
+                ).unsqueeze(0)
+
+            else:
+                raise ValueError(
+                    f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor`  but is {type(cond)}"
+                )
+
+            image_embeddings.append(image_emb * weight)
+
+        image_emb = torch.cat(image_embeddings).sum(dim=0)
+
+        return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=torch.randn_like(image_emb))
+
+    def _encode_image(
+        self,
+        image: Union[torch.Tensor, List[PIL.Image.Image]],
+        device,
+        num_images_per_prompt,
+    ):
+        if not isinstance(image, torch.Tensor):
+            image = self.image_processor(image, return_tensors="pt").pixel_values.to(
+                dtype=self.image_encoder.dtype, device=device
+            )
+
+        image_emb = self.image_encoder(image)["image_embeds"]  # B, D
+        image_emb = image_emb.repeat_interleave(num_images_per_prompt, dim=0)
+        image_emb.to(device=device)
+
+        return image_emb
+
+    def prepare_latents(self, emb, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        emb = emb.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        init_latents = emb
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.get_zero_embed
+    def get_zero_embed(self, batch_size=1, device=None):
+        device = device or self.device
+        zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to(
+            device=device, dtype=self.image_encoder.dtype
+        )
+        zero_image_emb = self.image_encoder(zero_img)["image_embeds"]
+        zero_image_emb = zero_image_emb.repeat(batch_size, 1)
+        return zero_image_emb
+
+    # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+    ):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        text_mask = text_inputs.attention_mask.bool().to(device)
+
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
+        text_encoder_output = self.text_encoder(text_input_ids.to(device))
+
+        prompt_embeds = text_encoder_output.text_embeds
+        text_encoder_hidden_states = text_encoder_output.last_hidden_state
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+            uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.Tensor, List[torch.Tensor], PIL.Image.Image, List[PIL.Image.Image]],
+        strength: float = 0.3,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        guidance_scale: float = 4.0,
+        output_type: Optional[str] = "pt",  # pt only
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `emb`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added.
+            emb (`torch.Tensor`):
+                The image embedding.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            output_type (`str`, *optional*, defaults to `"pt"`):
+                The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"`
+                (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`KandinskyPriorPipelineOutput`] or `tuple`
+        """
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        elif not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt]
+        elif not isinstance(negative_prompt, list) and negative_prompt is not None:
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+        # if the negative prompt is defined we double the batch size to
+        # directly retrieve the negative prompt embedding
+        if negative_prompt is not None:
+            prompt = prompt + negative_prompt
+            negative_prompt = 2 * negative_prompt
+
+        device = self._execution_device
+
+        batch_size = len(prompt)
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+        prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        if not isinstance(image, List):
+            image = [image]
+
+        if isinstance(image[0], torch.Tensor):
+            image = torch.cat(image, dim=0)
+
+        if isinstance(image, torch.Tensor) and image.ndim == 2:
+            # allow user to pass image_embeds directly
+            image_embeds = image.repeat_interleave(num_images_per_prompt, dim=0)
+        elif isinstance(image, torch.Tensor) and image.ndim != 4:
+            raise ValueError(
+                f" if pass `image` as pytorch tensor, or a list of pytorch tensor, please make sure each tensor has shape [batch_size, channels, height, width], currently {image[0].unsqueeze(0).shape}"
+            )
+        else:
+            image_embeds = self._encode_image(image, device, num_images_per_prompt)
+
+        # prior
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+        latents = image_embeds
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size)
+        latents = self.prepare_latents(
+            latents,
+            latent_timestep,
+            batch_size // num_images_per_prompt,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            predicted_image_embedding = self.prior(
+                latent_model_input,
+                timestep=t,
+                proj_embedding=prompt_embeds,
+                encoder_hidden_states=text_encoder_hidden_states,
+                attention_mask=text_mask,
+            ).predicted_image_embedding
+
+            if do_classifier_free_guidance:
+                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * (
+                    predicted_image_embedding_text - predicted_image_embedding_uncond
+                )
+
+            if i + 1 == timesteps.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = timesteps[i + 1]
+
+            latents = self.scheduler.step(
+                predicted_image_embedding,
+                timestep=t,
+                sample=latents,
+                generator=generator,
+                prev_timestep=prev_timestep,
+            ).prev_sample
+
+        latents = self.prior.post_process_latents(latents)
+
+        image_embeddings = latents
+
+        # if negative prompt has been defined, we retrieve split the image embedding into two
+        if negative_prompt is None:
+            zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
+        else:
+            image_embeddings, zero_embeds = image_embeddings.chunk(2)
+
+        self.maybe_free_model_hooks()
+
+        if output_type not in ["pt", "np"]:
+            raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}")
+
+        if output_type == "np":
+            image_embeddings = image_embeddings.cpu().numpy()
+            zero_embeds = zero_embeds.cpu().numpy()
+
+        if not return_dict:
+            return (image_embeddings, zero_embeds)
+
+        return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky3/__init__.py b/diffusers/src/diffusers/pipelines/kandinsky3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8a3063141b5e62682ce3ad8e8e0d02473402665
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky3/__init__.py
@@ -0,0 +1,49 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_kandinsky3"] = ["Kandinsky3Pipeline"]
+    _import_structure["pipeline_kandinsky3_img2img"] = ["Kandinsky3Img2ImgPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_kandinsky3 import Kandinsky3Pipeline
+        from .pipeline_kandinsky3_img2img import Kandinsky3Img2ImgPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py b/diffusers/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5360632275b41b035f94d84d6156a6bafe84c613
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+import argparse
+import fnmatch
+
+from safetensors.torch import load_file
+
+from diffusers import Kandinsky3UNet
+
+
+MAPPING = {
+    "to_time_embed.1": "time_embedding.linear_1",
+    "to_time_embed.3": "time_embedding.linear_2",
+    "in_layer": "conv_in",
+    "out_layer.0": "conv_norm_out",
+    "out_layer.2": "conv_out",
+    "down_samples": "down_blocks",
+    "up_samples": "up_blocks",
+    "projection_lin": "encoder_hid_proj.projection_linear",
+    "projection_ln": "encoder_hid_proj.projection_norm",
+    "feature_pooling": "add_time_condition",
+    "to_query": "to_q",
+    "to_key": "to_k",
+    "to_value": "to_v",
+    "output_layer": "to_out.0",
+    "self_attention_block": "attentions.0",
+}
+
+DYNAMIC_MAP = {
+    "resnet_attn_blocks.*.0": "resnets_in.*",
+    "resnet_attn_blocks.*.1": ("attentions.*", 1),
+    "resnet_attn_blocks.*.2": "resnets_out.*",
+}
+# MAPPING = {}
+
+
+def convert_state_dict(unet_state_dict):
+    """
+    Args:
+    Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
+        unet_model (torch.nn.Module): The original U-Net model. unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet
+        model to match keys with.
+
+    Returns:
+        OrderedDict: The converted state dictionary.
+    """
+    # Example of renaming logic (this will vary based on your model's architecture)
+    converted_state_dict = {}
+    for key in unet_state_dict:
+        new_key = key
+        for pattern, new_pattern in MAPPING.items():
+            new_key = new_key.replace(pattern, new_pattern)
+
+        for dyn_pattern, dyn_new_pattern in DYNAMIC_MAP.items():
+            has_matched = False
+            if fnmatch.fnmatch(new_key, f"*.{dyn_pattern}.*") and not has_matched:
+                star = int(new_key.split(dyn_pattern.split(".")[0])[-1].split(".")[1])
+
+                if isinstance(dyn_new_pattern, tuple):
+                    new_star = star + dyn_new_pattern[-1]
+                    dyn_new_pattern = dyn_new_pattern[0]
+                else:
+                    new_star = star
+
+                pattern = dyn_pattern.replace("*", str(star))
+                new_pattern = dyn_new_pattern.replace("*", str(new_star))
+
+                new_key = new_key.replace(pattern, new_pattern)
+                has_matched = True
+
+        converted_state_dict[new_key] = unet_state_dict[key]
+
+    return converted_state_dict
+
+
+def main(model_path, output_path):
+    # Load your original U-Net model
+    unet_state_dict = load_file(model_path)
+
+    # Initialize your Kandinsky3UNet model
+    config = {}
+
+    # Convert the state dict
+    converted_state_dict = convert_state_dict(unet_state_dict)
+
+    unet = Kandinsky3UNet(config)
+    unet.load_state_dict(converted_state_dict)
+
+    unet.save_pretrained(output_path)
+    print(f"Converted model saved to {output_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert U-Net PyTorch model to Kandinsky3UNet format")
+    parser.add_argument("--model_path", type=str, required=True, help="Path to the original U-Net PyTorch model")
+    parser.add_argument("--output_path", type=str, required=True, help="Path to save the converted model")
+
+    args = parser.parse_args()
+    main(args.model_path, args.output_path)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py b/diffusers/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dbae2a1909ad4d3de74e9b27cfbde40ab6eae61
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py
@@ -0,0 +1,576 @@
+from typing import Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin
+from ...models import Kandinsky3UNet, VQModel
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    deprecate,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import AutoPipelineForText2Image
+        >>> import torch
+
+        >>> pipe = AutoPipelineForText2Image.from_pretrained(
+        ...     "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
+
+        >>> generator = torch.Generator(device="cpu").manual_seed(0)
+        >>> image = pipe(prompt, num_inference_steps=25, generator=generator).images[0]
+        ```
+
+"""
+
+
+def downscale_height_and_width(height, width, scale_factor=8):
+    new_height = height // scale_factor**2
+    if height % scale_factor**2 != 0:
+        new_height += 1
+    new_width = width // scale_factor**2
+    if width % scale_factor**2 != 0:
+        new_width += 1
+    return new_height * scale_factor, new_width * scale_factor
+
+
+class Kandinsky3Pipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
+    model_cpu_offload_seq = "text_encoder->unet->movq"
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "negative_attention_mask",
+        "attention_mask",
+    ]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        unet: Kandinsky3UNet,
+        scheduler: DDPMScheduler,
+        movq: VQModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, unet=unet, scheduler=scheduler, movq=movq
+        )
+
+    def process_embeds(self, embeddings, attention_mask, cut_context):
+        if cut_context:
+            embeddings[attention_mask == 0] = torch.zeros_like(embeddings[attention_mask == 0])
+            max_seq_length = attention_mask.sum(-1).max() + 1
+            embeddings = embeddings[:, :max_seq_length]
+            attention_mask = attention_mask[:, :max_seq_length]
+        return embeddings, attention_mask
+
+    @torch.no_grad()
+    def encode_prompt(
+        self,
+        prompt,
+        do_classifier_free_guidance=True,
+        num_images_per_prompt=1,
+        device=None,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        _cut_context=False,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
+            negative_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
+        """
+        if prompt is not None and negative_prompt is not None:
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        max_length = 128
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids.to(device)
+            attention_mask = text_inputs.attention_mask.to(device)
+            prompt_embeds = self.text_encoder(
+                text_input_ids,
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+            prompt_embeds, attention_mask = self.process_embeds(prompt_embeds, attention_mask, _cut_context)
+            prompt_embeds = prompt_embeds * attention_mask.unsqueeze(2)
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        attention_mask = attention_mask.repeat(num_images_per_prompt, 1)
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            if negative_prompt is not None:
+                uncond_input = self.tokenizer(
+                    uncond_tokens,
+                    padding="max_length",
+                    max_length=128,
+                    truncation=True,
+                    return_attention_mask=True,
+                    return_tensors="pt",
+                )
+                text_input_ids = uncond_input.input_ids.to(device)
+                negative_attention_mask = uncond_input.attention_mask.to(device)
+
+                negative_prompt_embeds = self.text_encoder(
+                    text_input_ids,
+                    attention_mask=negative_attention_mask,
+                )
+                negative_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds[:, : prompt_embeds.shape[1]]
+                negative_attention_mask = negative_attention_mask[:, : prompt_embeds.shape[1]]
+                negative_prompt_embeds = negative_prompt_embeds * negative_attention_mask.unsqueeze(2)
+
+            else:
+                negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+                negative_attention_mask = torch.zeros_like(attention_mask)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+            if negative_prompt_embeds.shape != prompt_embeds.shape:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+                negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+                negative_attention_mask = negative_attention_mask.repeat(num_images_per_prompt, 1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+        else:
+            negative_prompt_embeds = None
+            negative_attention_mask = None
+        return prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask
+
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def check_inputs(
+        self,
+        prompt,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        attention_mask=None,
+        negative_attention_mask=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if negative_prompt_embeds is not None and negative_attention_mask is None:
+            raise ValueError("Please provide `negative_attention_mask` along with `negative_prompt_embeds`")
+
+        if negative_prompt_embeds is not None and negative_attention_mask is not None:
+            if negative_prompt_embeds.shape[:2] != negative_attention_mask.shape:
+                raise ValueError(
+                    "`negative_prompt_embeds` and `negative_attention_mask` must have the same batch_size and token length when passed directly, but"
+                    f" got: `negative_prompt_embeds` {negative_prompt_embeds.shape[:2]} != `negative_attention_mask`"
+                    f" {negative_attention_mask.shape}."
+                )
+
+        if prompt_embeds is not None and attention_mask is None:
+            raise ValueError("Please provide `attention_mask` along with `prompt_embeds`")
+
+        if prompt_embeds is not None and attention_mask is not None:
+            if prompt_embeds.shape[:2] != attention_mask.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `attention_mask` must have the same batch_size and token length when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape[:2]} != `attention_mask`"
+                    f" {attention_mask.shape}."
+                )
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_inference_steps: int = 25,
+        guidance_scale: float = 3.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        height: Optional[int] = 1024,
+        width: Optional[int] = 1024,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        latents=None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 3.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The width in pixels of the generated image.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
+            negative_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        cut_context = True
+        device = self._execution_device
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+            attention_mask,
+            negative_attention_mask,
+        )
+
+        self._guidance_scale = guidance_scale
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask = self.encode_prompt(
+            prompt,
+            self.do_classifier_free_guidance,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            _cut_context=cut_context,
+            attention_mask=attention_mask,
+            negative_attention_mask=negative_attention_mask,
+        )
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            attention_mask = torch.cat([negative_attention_mask, attention_mask]).bool()
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latents
+        height, width = downscale_height_and_width(height, width, 8)
+
+        latents = self.prepare_latents(
+            (batch_size * num_images_per_prompt, 4, height, width),
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+
+        if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None:
+            self.text_encoder_offload_hook.offload()
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    encoder_attention_mask=attention_mask,
+                    return_dict=False,
+                )[0]
+
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+
+                    noise_pred = (guidance_scale + 1.0) * noise_pred_text - guidance_scale * noise_pred_uncond
+                    # noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred,
+                    t,
+                    latents,
+                    generator=generator,
+                ).prev_sample
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    attention_mask = callback_outputs.pop("attention_mask", attention_mask)
+                    negative_attention_mask = callback_outputs.pop("negative_attention_mask", negative_attention_mask)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+            # post-processing
+            if output_type not in ["pt", "np", "pil", "latent"]:
+                raise ValueError(
+                    f"Only the output types `pt`, `pil`, `np` and `latent` are supported not output_type={output_type}"
+                )
+
+            if not output_type == "latent":
+                image = self.movq.decode(latents, force_not_quantize=True)["sample"]
+
+                if output_type in ["np", "pil"]:
+                    image = image * 0.5 + 0.5
+                    image = image.clamp(0, 1)
+                    image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+                if output_type == "pil":
+                    image = self.numpy_to_pil(image)
+            else:
+                image = latents
+
+            self.maybe_free_model_hooks()
+
+            if not return_dict:
+                return (image,)
+
+            return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py b/diffusers/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..81c45c4fb6f87e7fced3626ad86fabe3f9ec80ad
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py
@@ -0,0 +1,643 @@
+import inspect
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL
+import PIL.Image
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin
+from ...models import Kandinsky3UNet, VQModel
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    deprecate,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import AutoPipelineForImage2Image
+        >>> from diffusers.utils import load_image
+        >>> import torch
+
+        >>> pipe = AutoPipelineForImage2Image.from_pretrained(
+        ...     "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "A painting of the inside of a subway train with tiny raccoons."
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png"
+        ... )
+
+        >>> generator = torch.Generator(device="cpu").manual_seed(0)
+        >>> image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0]
+        ```
+"""
+
+
+def downscale_height_and_width(height, width, scale_factor=8):
+    new_height = height // scale_factor**2
+    if height % scale_factor**2 != 0:
+        new_height += 1
+    new_width = width // scale_factor**2
+    if width % scale_factor**2 != 0:
+        new_width += 1
+    return new_height * scale_factor, new_width * scale_factor
+
+
+def prepare_image(pil_image):
+    arr = np.array(pil_image.convert("RGB"))
+    arr = arr.astype(np.float32) / 127.5 - 1
+    arr = np.transpose(arr, [2, 0, 1])
+    image = torch.from_numpy(arr).unsqueeze(0)
+    return image
+
+
+class Kandinsky3Img2ImgPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
+    model_cpu_offload_seq = "text_encoder->movq->unet->movq"
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "negative_attention_mask",
+        "attention_mask",
+    ]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        unet: Kandinsky3UNet,
+        scheduler: DDPMScheduler,
+        movq: VQModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, unet=unet, scheduler=scheduler, movq=movq
+        )
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+    def _process_embeds(self, embeddings, attention_mask, cut_context):
+        # return embeddings, attention_mask
+        if cut_context:
+            embeddings[attention_mask == 0] = torch.zeros_like(embeddings[attention_mask == 0])
+            max_seq_length = attention_mask.sum(-1).max() + 1
+            embeddings = embeddings[:, :max_seq_length]
+            attention_mask = attention_mask[:, :max_seq_length]
+        return embeddings, attention_mask
+
+    @torch.no_grad()
+    def encode_prompt(
+        self,
+        prompt,
+        do_classifier_free_guidance=True,
+        num_images_per_prompt=1,
+        device=None,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        _cut_context=False,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
+            negative_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
+        """
+        if prompt is not None and negative_prompt is not None:
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        max_length = 128
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids.to(device)
+            attention_mask = text_inputs.attention_mask.to(device)
+            prompt_embeds = self.text_encoder(
+                text_input_ids,
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+            prompt_embeds, attention_mask = self._process_embeds(prompt_embeds, attention_mask, _cut_context)
+            prompt_embeds = prompt_embeds * attention_mask.unsqueeze(2)
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        attention_mask = attention_mask.repeat(num_images_per_prompt, 1)
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            if negative_prompt is not None:
+                uncond_input = self.tokenizer(
+                    uncond_tokens,
+                    padding="max_length",
+                    max_length=128,
+                    truncation=True,
+                    return_attention_mask=True,
+                    return_tensors="pt",
+                )
+                text_input_ids = uncond_input.input_ids.to(device)
+                negative_attention_mask = uncond_input.attention_mask.to(device)
+
+                negative_prompt_embeds = self.text_encoder(
+                    text_input_ids,
+                    attention_mask=negative_attention_mask,
+                )
+                negative_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds[:, : prompt_embeds.shape[1]]
+                negative_attention_mask = negative_attention_mask[:, : prompt_embeds.shape[1]]
+                negative_prompt_embeds = negative_prompt_embeds * negative_attention_mask.unsqueeze(2)
+
+            else:
+                negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+                negative_attention_mask = torch.zeros_like(attention_mask)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+            if negative_prompt_embeds.shape != prompt_embeds.shape:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+                negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+                negative_attention_mask = negative_attention_mask.repeat(num_images_per_prompt, 1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+        else:
+            negative_prompt_embeds = None
+            negative_attention_mask = None
+        return prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    self.movq.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = self.movq.encode(image).latent_dist.sample(generator)
+
+            init_latents = self.movq.config.scaling_factor * init_latents
+
+        init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        attention_mask=None,
+        negative_attention_mask=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if negative_prompt_embeds is not None and negative_attention_mask is None:
+            raise ValueError("Please provide `negative_attention_mask` along with `negative_prompt_embeds`")
+
+        if negative_prompt_embeds is not None and negative_attention_mask is not None:
+            if negative_prompt_embeds.shape[:2] != negative_attention_mask.shape:
+                raise ValueError(
+                    "`negative_prompt_embeds` and `negative_attention_mask` must have the same batch_size and token length when passed directly, but"
+                    f" got: `negative_prompt_embeds` {negative_prompt_embeds.shape[:2]} != `negative_attention_mask`"
+                    f" {negative_attention_mask.shape}."
+                )
+
+        if prompt_embeds is not None and attention_mask is None:
+            raise ValueError("Please provide `attention_mask` along with `prompt_embeds`")
+
+        if prompt_embeds is not None and attention_mask is not None:
+            if prompt_embeds.shape[:2] != attention_mask.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `attention_mask` must have the same batch_size and token length when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape[:2]} != `attention_mask`"
+                    f" {attention_mask.shape}."
+                )
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None,
+        strength: float = 0.3,
+        num_inference_steps: int = 25,
+        guidance_scale: float = 3.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 3.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
+            negative_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+
+        """
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        cut_context = True
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+            attention_mask,
+            negative_attention_mask,
+        )
+
+        self._guidance_scale = guidance_scale
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask = self.encode_prompt(
+            prompt,
+            self.do_classifier_free_guidance,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            _cut_context=cut_context,
+            attention_mask=attention_mask,
+            negative_attention_mask=negative_attention_mask,
+        )
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            attention_mask = torch.cat([negative_attention_mask, attention_mask]).bool()
+        if not isinstance(image, list):
+            image = [image]
+        if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image):
+            raise ValueError(
+                f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support  PIL image and pytorch tensor"
+            )
+
+        image = torch.cat([prepare_image(i) for i in image], dim=0)
+        image = image.to(dtype=prompt_embeds.dtype, device=device)
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        # 5. Prepare latents
+        latents = self.movq.encode(image)["latents"]
+        latents = latents.repeat_interleave(num_images_per_prompt, dim=0)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        latents = self.prepare_latents(
+            latents, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+        )
+        if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None:
+            self.text_encoder_offload_hook.offload()
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    encoder_attention_mask=attention_mask,
+                )[0]
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+
+                    noise_pred = (guidance_scale + 1.0) * noise_pred_text - guidance_scale * noise_pred_uncond
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred,
+                    t,
+                    latents,
+                    generator=generator,
+                ).prev_sample
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    attention_mask = callback_outputs.pop("attention_mask", attention_mask)
+                    negative_attention_mask = callback_outputs.pop("negative_attention_mask", negative_attention_mask)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+            # post-processing
+            if output_type not in ["pt", "np", "pil", "latent"]:
+                raise ValueError(
+                    f"Only the output types `pt`, `pil`, `np` and `latent` are supported not output_type={output_type}"
+                )
+            if not output_type == "latent":
+                image = self.movq.decode(latents, force_not_quantize=True)["sample"]
+
+                if output_type in ["np", "pil"]:
+                    image = image * 0.5 + 0.5
+                    image = image.clamp(0, 1)
+                    image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+                if output_type == "pil":
+                    image = self.numpy_to_pil(image)
+            else:
+                image = latents
+
+            self.maybe_free_model_hooks()
+
+            if not return_dict:
+                return (image,)
+
+            return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kolors/__init__.py b/diffusers/src/diffusers/pipelines/kolors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..671d22e9f433802ec82ebe89fae4cef6c001064b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kolors/__init__.py
@@ -0,0 +1,54 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_sentencepiece_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()) and is_sentencepiece_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_and_sentencepiece_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_sentencepiece_objects))
+else:
+    _import_structure["pipeline_kolors"] = ["KolorsPipeline"]
+    _import_structure["pipeline_kolors_img2img"] = ["KolorsImg2ImgPipeline"]
+    _import_structure["text_encoder"] = ["ChatGLMModel"]
+    _import_structure["tokenizer"] = ["ChatGLMTokenizer"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()) and is_sentencepiece_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_and_sentencepiece_objects import *
+
+    else:
+        from .pipeline_kolors import KolorsPipeline
+        from .pipeline_kolors_img2img import KolorsImg2ImgPipeline
+        from .text_encoder import ChatGLMModel
+        from .tokenizer import ChatGLMTokenizer
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/kolors/pipeline_kolors.py b/diffusers/src/diffusers/pipelines/kolors/pipeline_kolors.py
new file mode 100644
index 0000000000000000000000000000000000000000..b682429e9744decf6859933e53c7d73bcb19f8bd
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kolors/pipeline_kolors.py
@@ -0,0 +1,1070 @@
+# Copyright 2024 Stability AI, Kwai-Kolors Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import IPAdapterMixin, StableDiffusionXLLoraLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import KolorsPipelineOutput
+from .text_encoder import ChatGLMModel
+from .tokenizer import ChatGLMTokenizer
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import KolorsPipeline
+
+        >>> pipe = KolorsPipeline.from_pretrained(
+        ...     "Kwai-Kolors/Kolors-diffusers", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = (
+        ...     "A photo of a ladybug, macro, zoom, high quality, film, holding a wooden sign with the text 'KOLORS'"
+        ... )
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin):
+    r"""
+    Pipeline for text-to-image generation using Kolors.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`ChatGLMModel`]):
+            Frozen text-encoder. Kolors uses [ChatGLM3-6B](https://huggingface.co/THUDM/chatglm3-6b).
+        tokenizer (`ChatGLMTokenizer`):
+            Tokenizer of class
+            [ChatGLMTokenizer](https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"False"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `Kwai-Kolors/Kolors-diffusers`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = [
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: ChatGLMModel,
+        tokenizer: ChatGLMTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = False,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.default_sample_size = self.unet.config.sample_size
+
+    def encode_prompt(
+        self,
+        prompt,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+        """
+        # from IPython import embed; embed(); exit()
+        device = device or self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer]
+        text_encoders = [self.text_encoder]
+
+        if prompt_embeds is None:
+            prompt_embeds_list = []
+            for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=max_sequence_length,
+                    truncation=True,
+                    return_tensors="pt",
+                ).to(device)
+                output = text_encoder(
+                    input_ids=text_inputs["input_ids"],
+                    attention_mask=text_inputs["attention_mask"],
+                    position_ids=text_inputs["position_ids"],
+                    output_hidden_states=True,
+                )
+
+                # [max_sequence_length, batch, hidden_size] -> [batch, max_sequence_length, hidden_size]
+                # clone to have a contiguous tensor
+                prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
+                # [max_sequence_length, batch, hidden_size] -> [batch, hidden_size]
+                pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()
+                bs_embed, seq_len, _ = prompt_embeds.shape
+                prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+                prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = prompt_embeds_list[0]
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            negative_prompt_embeds_list = []
+
+            for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+                uncond_input = tokenizer(
+                    uncond_tokens,
+                    padding="max_length",
+                    max_length=max_sequence_length,
+                    truncation=True,
+                    return_tensors="pt",
+                ).to(device)
+                output = text_encoder(
+                    input_ids=uncond_input["input_ids"],
+                    attention_mask=uncond_input["attention_mask"],
+                    position_ids=uncond_input["position_ids"],
+                    output_hidden_states=True,
+                )
+
+                # [max_sequence_length, batch, hidden_size] -> [batch, max_sequence_length, hidden_size]
+                # clone to have a contiguous tensor
+                negative_prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
+                # [max_sequence_length, batch, hidden_size] -> [batch, hidden_size]
+                negative_pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()
+
+                if do_classifier_free_guidance:
+                    # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+                    seq_len = negative_prompt_embeds.shape[1]
+
+                    negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+
+                    negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+                    negative_prompt_embeds = negative_prompt_embeds.view(
+                        batch_size * num_images_per_prompt, seq_len, -1
+                    )
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = negative_prompt_embeds_list[0]
+
+        bs_embed = pooled_prompt_embeds.shape[0]
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        num_inference_steps,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
+            raise ValueError(
+                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
+                f" {type(num_inference_steps)}."
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+        if max_sequence_length is not None and max_sequence_length > 256:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 256 but is {max_sequence_length}")
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [Kwai-Kolors/Kolors-diffusers](https://huggingface.co/Kwai-Kolors/Kolors-diffusers) and checkpoints
+                that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [Kwai-Kolors/Kolors-diffusers](https://huggingface.co/Kwai-Kolors/Kolors-diffusers) and checkpoints
+                that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.kolors.KolorsPipelineOutput`] instead of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.kolors.KolorsPipelineOutput`] or `tuple`: [`~pipelines.kolors.KolorsPipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            num_inference_steps,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            pooled_prompt_embeds,
+            negative_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 8.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 9. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            # unscale/denormalize the latents
+            latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return KolorsPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py b/diffusers/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..4985a80f88df49b76b7614bdb4bb05585b1942c1
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py
@@ -0,0 +1,1250 @@
+# Copyright 2024 Stability AI, Kwai-Kolors Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import IPAdapterMixin, StableDiffusionXLLoraLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import KolorsPipelineOutput
+from .text_encoder import ChatGLMModel
+from .tokenizer import ChatGLMTokenizer
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import KolorsImg2ImgPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = KolorsImg2ImgPipeline.from_pretrained(
+        ...     "Kwai-Kolors/Kolors-diffusers", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> url = (
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/kolors/bunny_source.png"
+        ... )
+
+
+        >>> init_image = load_image(url)
+        >>> prompt = "high quality image of a capybara wearing sunglasses. In the background of the image there are trees, poles, grass and other objects. At the bottom of the object there is the road., 8k, highly detailed."
+        >>> image = pipe(prompt, image=init_image).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin):
+    r"""
+    Pipeline for text-to-image generation using Kolors.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`ChatGLMModel`]):
+            Frozen text-encoder. Kolors uses [ChatGLM3-6B](https://huggingface.co/THUDM/chatglm3-6b).
+        tokenizer (`ChatGLMTokenizer`):
+            Tokenizer of class
+            [ChatGLMTokenizer](https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"False"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `Kwai-Kolors/Kolors-diffusers`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder-unet->vae"
+    _optional_components = [
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: ChatGLMModel,
+        tokenizer: ChatGLMTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = False,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.default_sample_size = self.unet.config.sample_size
+
+    # Copied from diffusers.pipelines.kolors.pipeline_kolors.KolorsPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+        """
+        # from IPython import embed; embed(); exit()
+        device = device or self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer]
+        text_encoders = [self.text_encoder]
+
+        if prompt_embeds is None:
+            prompt_embeds_list = []
+            for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=max_sequence_length,
+                    truncation=True,
+                    return_tensors="pt",
+                ).to(device)
+                output = text_encoder(
+                    input_ids=text_inputs["input_ids"],
+                    attention_mask=text_inputs["attention_mask"],
+                    position_ids=text_inputs["position_ids"],
+                    output_hidden_states=True,
+                )
+
+                # [max_sequence_length, batch, hidden_size] -> [batch, max_sequence_length, hidden_size]
+                # clone to have a contiguous tensor
+                prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
+                # [max_sequence_length, batch, hidden_size] -> [batch, hidden_size]
+                pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()
+                bs_embed, seq_len, _ = prompt_embeds.shape
+                prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+                prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = prompt_embeds_list[0]
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            negative_prompt_embeds_list = []
+
+            for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+                uncond_input = tokenizer(
+                    uncond_tokens,
+                    padding="max_length",
+                    max_length=max_sequence_length,
+                    truncation=True,
+                    return_tensors="pt",
+                ).to(device)
+                output = text_encoder(
+                    input_ids=uncond_input["input_ids"],
+                    attention_mask=uncond_input["attention_mask"],
+                    position_ids=uncond_input["position_ids"],
+                    output_hidden_states=True,
+                )
+
+                # [max_sequence_length, batch, hidden_size] -> [batch, max_sequence_length, hidden_size]
+                # clone to have a contiguous tensor
+                negative_prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
+                # [max_sequence_length, batch, hidden_size] -> [batch, hidden_size]
+                negative_pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()
+
+                if do_classifier_free_guidance:
+                    # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+                    seq_len = negative_prompt_embeds.shape[1]
+
+                    negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+
+                    negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+                    negative_prompt_embeds = negative_prompt_embeds.view(
+                        batch_size * num_images_per_prompt, seq_len, -1
+                    )
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = negative_prompt_embeds_list[0]
+
+        bs_embed = pooled_prompt_embeds.shape[0]
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        strength,
+        num_inference_steps,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
+            raise ValueError(
+                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
+                f" {type(num_inference_steps)}."
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+        if max_sequence_length is not None and max_sequence_length > 256:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 256 but is {max_sequence_length}")
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+
+            timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+            return timesteps, num_inference_steps - t_start
+
+        else:
+            # Strength is irrelevant if we directly request a timestep to start at;
+            # that is, strength is determined by the denoising_start instead.
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            t_start = len(self.scheduler.timesteps) - num_inference_steps
+            timesteps = self.scheduler.timesteps[t_start:]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start)
+            return timesteps, num_inference_steps
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.prepare_latents
+    def prepare_latents(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
+    ):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        latents_mean = latents_std = None
+        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
+
+        # Offload text encoder if `enable_model_cpu_offload` was enabled
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.text_encoder_2.to("cpu")
+            torch.cuda.empty_cache()
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                image = image.float()
+                self.vae.to(dtype=torch.float32)
+
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                    raise ValueError(
+                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+
+            init_latents = init_latents.to(dtype)
+            if latents_mean is not None and latents_std is not None:
+                latents_mean = latents_mean.to(device=device, dtype=dtype)
+                latents_std = latents_std.to(device=device, dtype=dtype)
+                init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
+            else:
+                init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        if add_noise:
+            shape = init_latents.shape
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # get latents
+            init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_start(self):
+        return self._denoising_start
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.3,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
+                The image(s) to modify with the pipeline.
+            strength (`float`, *optional*, defaults to 0.3):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. Note that in the case of
+                `denoising_start` being declared as an integer, the value of `strength` will be ignored.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [Kwai-Kolors/Kolors-diffusers](https://huggingface.co/Kwai-Kolors/Kolors-diffusers) and checkpoints
+                that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [Kwai-Kolors/Kolors-diffusers](https://huggingface.co/Kwai-Kolors/Kolors-diffusers) and checkpoints
+                that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refine Image
+                Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality).
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.kolors.KolorsPipelineOutput`] instead of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.kolors.KolorsPipelineOutput`] or `tuple`: [`~pipelines.kolors.KolorsPipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            strength,
+            num_inference_steps,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            pooled_prompt_embeds,
+            negative_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 5. Prepare timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
+        )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        add_noise = True if self.denoising_start is None else False
+
+        # 6. Prepare latent variables
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                add_noise,
+            )
+
+        # 7. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 8. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 9. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 9.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and self.denoising_start is not None
+            and denoising_value_valid(self.denoising_end)
+            and denoising_value_valid(self.denoising_start)
+            and self.denoising_start >= self.denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {self.denoising_end} when using type float."
+            )
+        elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 9.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            # unscale/denormalize the latents
+            latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return KolorsPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/kolors/pipeline_output.py b/diffusers/src/diffusers/pipelines/kolors/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..310ee7e8a89b253c55dfb47e42964310c93eb56a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kolors/pipeline_output.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class KolorsPipelineOutput(BaseOutput):
+    """
+    Output class for Kolors pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
diff --git a/diffusers/src/diffusers/pipelines/kolors/text_encoder.py b/diffusers/src/diffusers/pipelines/kolors/text_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fb6f18a907aa1efb68b2ca6f82ca477534c3a31
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kolors/text_encoder.py
@@ -0,0 +1,889 @@
+# Copyright 2024 ChatGLM3-6B Model Team, Kwai-Kolors Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import LayerNorm
+from torch.nn.utils import skip_init
+from transformers import PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import BaseModelOutputWithPast
+
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ChatGLMConfig(PretrainedConfig):
+    model_type = "chatglm"
+
+    def __init__(
+        self,
+        num_layers=28,
+        padded_vocab_size=65024,
+        hidden_size=4096,
+        ffn_hidden_size=13696,
+        kv_channels=128,
+        num_attention_heads=32,
+        seq_length=2048,
+        hidden_dropout=0.0,
+        classifier_dropout=None,
+        attention_dropout=0.0,
+        layernorm_epsilon=1e-5,
+        rmsnorm=True,
+        apply_residual_connection_post_layernorm=False,
+        post_layer_norm=True,
+        add_bias_linear=False,
+        add_qkv_bias=False,
+        bias_dropout_fusion=True,
+        multi_query_attention=False,
+        multi_query_group_num=1,
+        apply_query_key_layer_scaling=True,
+        attention_softmax_in_fp32=True,
+        fp32_residual_connection=False,
+        quantization_bit=0,
+        pre_seq_len=None,
+        prefix_projection=False,
+        **kwargs,
+    ):
+        self.num_layers = num_layers
+        self.vocab_size = padded_vocab_size
+        self.padded_vocab_size = padded_vocab_size
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.kv_channels = kv_channels
+        self.num_attention_heads = num_attention_heads
+        self.seq_length = seq_length
+        self.hidden_dropout = hidden_dropout
+        self.classifier_dropout = classifier_dropout
+        self.attention_dropout = attention_dropout
+        self.layernorm_epsilon = layernorm_epsilon
+        self.rmsnorm = rmsnorm
+        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
+        self.post_layer_norm = post_layer_norm
+        self.add_bias_linear = add_bias_linear
+        self.add_qkv_bias = add_qkv_bias
+        self.bias_dropout_fusion = bias_dropout_fusion
+        self.multi_query_attention = multi_query_attention
+        self.multi_query_group_num = multi_query_group_num
+        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.fp32_residual_connection = fp32_residual_connection
+        self.quantization_bit = quantization_bit
+        self.pre_seq_len = pre_seq_len
+        self.prefix_projection = prefix_projection
+        super().__init__(**kwargs)
+
+
+class RMSNorm(torch.nn.Module):
+    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
+        self.eps = eps
+
+    def forward(self, hidden_states: torch.Tensor):
+        input_dtype = hidden_states.dtype
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
+
+        return (self.weight * hidden_states).to(input_dtype)
+
+
+def _config_to_kwargs(args):
+    common_kwargs = {
+        "dtype": args.torch_dtype,
+    }
+    return common_kwargs
+
+
+class CoreAttention(torch.nn.Module):
+    def __init__(self, config: ChatGLMConfig, layer_number):
+        super(CoreAttention, self).__init__()
+
+        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
+        if self.apply_query_key_layer_scaling:
+            self.attention_softmax_in_fp32 = True
+        self.layer_number = max(1, layer_number)
+
+        projection_size = config.kv_channels * config.num_attention_heads
+
+        # Per attention head and per partition values.
+        self.hidden_size_per_partition = projection_size
+        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
+        self.num_attention_heads_per_partition = config.num_attention_heads
+
+        coeff = None
+        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+        if self.apply_query_key_layer_scaling:
+            coeff = self.layer_number
+            self.norm_factor *= coeff
+        self.coeff = coeff
+
+        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
+
+    def forward(self, query_layer, key_layer, value_layer, attention_mask):
+        pytorch_major_version = int(torch.__version__.split(".")[0])
+        if pytorch_major_version >= 2:
+            query_layer, key_layer, value_layer = [
+                k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]
+            ]
+            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
+                context_layer = torch.nn.functional.scaled_dot_product_attention(
+                    query_layer, key_layer, value_layer, is_causal=True
+                )
+            else:
+                if attention_mask is not None:
+                    attention_mask = ~attention_mask
+                context_layer = torch.nn.functional.scaled_dot_product_attention(
+                    query_layer, key_layer, value_layer, attention_mask
+                )
+            context_layer = context_layer.permute(2, 0, 1, 3)
+            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+            context_layer = context_layer.reshape(*new_context_layer_shape)
+        else:
+            # Raw attention scores
+
+            # [b, np, sq, sk]
+            output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
+
+            # [sq, b, np, hn] -> [sq, b * np, hn]
+            query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
+            # [sk, b, np, hn] -> [sk, b * np, hn]
+            key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+
+            # preallocting input tensor: [b * np, sq, sk]
+            matmul_input_buffer = torch.empty(
+                output_size[0] * output_size[1],
+                output_size[2],
+                output_size[3],
+                dtype=query_layer.dtype,
+                device=query_layer.device,
+            )
+
+            # Raw attention scores. [b * np, sq, sk]
+            matmul_result = torch.baddbmm(
+                matmul_input_buffer,
+                query_layer.transpose(0, 1),  # [b * np, sq, hn]
+                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+                beta=0.0,
+                alpha=(1.0 / self.norm_factor),
+            )
+
+            # change view to [b, np, sq, sk]
+            attention_scores = matmul_result.view(*output_size)
+
+            # ===========================
+            # Attention probs and dropout
+            # ===========================
+
+            # attention scores and attention mask [b, np, sq, sk]
+            if self.attention_softmax_in_fp32:
+                attention_scores = attention_scores.float()
+            if self.coeff is not None:
+                attention_scores = attention_scores * self.coeff
+            if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
+                attention_mask = torch.ones(
+                    output_size[0], 1, output_size[2], output_size[3], device=attention_scores.device, dtype=torch.bool
+                )
+                attention_mask.tril_()
+                attention_mask = ~attention_mask
+            if attention_mask is not None:
+                attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
+            attention_probs = F.softmax(attention_scores, dim=-1)
+            attention_probs = attention_probs.type_as(value_layer)
+
+            # This is actually dropping out entire tokens to attend to, which might
+            # seem a bit unusual, but is taken from the original Transformer paper.
+            attention_probs = self.attention_dropout(attention_probs)
+            # =========================
+            # Context layer. [sq, b, hp]
+            # =========================
+
+            # value_layer -> context layer.
+            # [sk, b, np, hn] --> [b, np, sq, hn]
+
+            # context layer shape: [b, np, sq, hn]
+            output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
+            # change view [sk, b * np, hn]
+            value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
+            # change view [b * np, sq, sk]
+            attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
+            # matmul: [b * np, sq, hn]
+            context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+            # change view [b, np, sq, hn]
+            context_layer = context_layer.view(*output_size)
+            # [b, np, sq, hn] --> [sq, b, np, hn]
+            context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+            # [sq, b, np, hn] --> [sq, b, hp]
+            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+            context_layer = context_layer.view(*new_context_layer_shape)
+
+        return context_layer
+
+
+def split_tensor_along_last_dim(
+    tensor: torch.Tensor,
+    num_partitions: int,
+    contiguous_split_chunks: bool = False,
+) -> List[torch.Tensor]:
+    """Split a tensor along its last dimension.
+
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
+
+    Returns:
+        A list of Tensors
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = tensor.size()[last_dim] // num_partitions
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # Note: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+
+    return tensor_list
+
+
+@torch.jit.script
+def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
+    # x: [sq, b, np, hn]
+    sq, _b, np, _hn = x.size(0), x.size(1), x.size(2), x.size(3)
+    rot_dim = rope_cache.shape[-2] * 2
+    x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
+    # truncate to support variable sizes
+    rope_cache = rope_cache[:sq]
+    xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
+    rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
+    x_out2 = torch.stack(
+        [
+            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
+            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
+        ],
+        -1,
+    )
+    x_out2 = x_out2.flatten(3)
+    return torch.cat((x_out2, x_pass), dim=-1)
+
+
+class SelfAttention(torch.nn.Module):
+    """Parallel self-attention layer abstract class.
+
+    Self-attention layer takes input with size [s, b, h] and returns output of the same size.
+    """
+
+    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
+        super(SelfAttention, self).__init__()
+        self.layer_number = max(1, layer_number)
+
+        self.projection_size = config.kv_channels * config.num_attention_heads
+
+        # Per attention head and per partition values.
+        self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
+        self.num_attention_heads_per_partition = config.num_attention_heads
+
+        self.multi_query_attention = config.multi_query_attention
+        self.qkv_hidden_size = 3 * self.projection_size
+        if self.multi_query_attention:
+            self.num_multi_query_groups_per_partition = config.multi_query_group_num
+            self.qkv_hidden_size = (
+                self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
+            )
+        self.query_key_value = nn.Linear(
+            config.hidden_size,
+            self.qkv_hidden_size,
+            bias=config.add_bias_linear or config.add_qkv_bias,
+            device=device,
+            **_config_to_kwargs(config),
+        )
+
+        self.core_attention = CoreAttention(config, self.layer_number)
+
+        # Output.
+        self.dense = nn.Linear(
+            self.projection_size,
+            config.hidden_size,
+            bias=config.add_bias_linear,
+            device=device,
+            **_config_to_kwargs(config),
+        )
+
+    def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
+        if self.multi_query_attention:
+            num_attention_heads = self.num_multi_query_groups_per_partition
+        else:
+            num_attention_heads = self.num_attention_heads_per_partition
+        return torch.empty(
+            inference_max_sequence_len,
+            batch_size,
+            num_attention_heads,
+            self.hidden_size_per_attention_head,
+            dtype=dtype,
+            device=device,
+        )
+
+    def forward(self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True):
+        # hidden_states: [sq, b, h]
+
+        # =================================================
+        # Pre-allocate memory for key-values for inference.
+        # =================================================
+        # =====================
+        # Query, Key, and Value
+        # =====================
+
+        # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
+        mixed_x_layer = self.query_key_value(hidden_states)
+
+        if self.multi_query_attention:
+            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
+                [
+                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                ],
+                dim=-1,
+            )
+            query_layer = query_layer.view(
+                query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+            )
+            key_layer = key_layer.view(
+                key_layer.size()[:-1]
+                + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
+            )
+            value_layer = value_layer.view(
+                value_layer.size()[:-1]
+                + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
+            )
+        else:
+            new_tensor_shape = mixed_x_layer.size()[:-1] + (
+                self.num_attention_heads_per_partition,
+                3 * self.hidden_size_per_attention_head,
+            )
+            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+
+            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
+            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
+
+        # apply relative positional encoding (rotary embedding)
+        if rotary_pos_emb is not None:
+            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
+            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
+
+        # adjust key and value for inference
+        if kv_cache is not None:
+            cache_k, cache_v = kv_cache
+            key_layer = torch.cat((cache_k, key_layer), dim=0)
+            value_layer = torch.cat((cache_v, value_layer), dim=0)
+        if use_cache:
+            kv_cache = (key_layer, value_layer)
+        else:
+            kv_cache = None
+
+        if self.multi_query_attention:
+            key_layer = key_layer.unsqueeze(-2)
+            key_layer = key_layer.expand(
+                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
+            )
+            key_layer = key_layer.contiguous().view(
+                key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+            )
+            value_layer = value_layer.unsqueeze(-2)
+            value_layer = value_layer.expand(
+                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
+            )
+            value_layer = value_layer.contiguous().view(
+                value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+            )
+
+        # ==================================
+        # core attention computation
+        # ==================================
+
+        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
+
+        # =================
+        # Output. [sq, b, h]
+        # =================
+
+        output = self.dense(context_layer)
+
+        return output, kv_cache
+
+
+class MLP(torch.nn.Module):
+    """MLP.
+
+    MLP will take the input with h hidden state, project it to 4*h hidden dimension, perform nonlinear transformation,
+    and project the state back into h hidden dimension.
+    """
+
+    def __init__(self, config: ChatGLMConfig, device=None):
+        super(MLP, self).__init__()
+
+        self.add_bias = config.add_bias_linear
+
+        # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
+        self.dense_h_to_4h = nn.Linear(
+            config.hidden_size,
+            config.ffn_hidden_size * 2,
+            bias=self.add_bias,
+            device=device,
+            **_config_to_kwargs(config),
+        )
+
+        def swiglu(x):
+            x = torch.chunk(x, 2, dim=-1)
+            return F.silu(x[0]) * x[1]
+
+        self.activation_func = swiglu
+
+        # Project back to h.
+        self.dense_4h_to_h = nn.Linear(
+            config.ffn_hidden_size, config.hidden_size, bias=self.add_bias, device=device, **_config_to_kwargs(config)
+        )
+
+    def forward(self, hidden_states):
+        # [s, b, 4hp]
+        intermediate_parallel = self.dense_h_to_4h(hidden_states)
+        intermediate_parallel = self.activation_func(intermediate_parallel)
+        # [s, b, h]
+        output = self.dense_4h_to_h(intermediate_parallel)
+        return output
+
+
+class GLMBlock(torch.nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an output of the same size.
+    """
+
+    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
+        super(GLMBlock, self).__init__()
+        self.layer_number = layer_number
+
+        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
+
+        self.fp32_residual_connection = config.fp32_residual_connection
+
+        LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
+        # Layernorm on the input data.
+        self.input_layernorm = LayerNormFunc(
+            config.hidden_size, eps=config.layernorm_epsilon, device=device, dtype=config.torch_dtype
+        )
+
+        # Self attention.
+        self.self_attention = SelfAttention(config, layer_number, device=device)
+        self.hidden_dropout = config.hidden_dropout
+
+        # Layernorm on the attention output
+        self.post_attention_layernorm = LayerNormFunc(
+            config.hidden_size, eps=config.layernorm_epsilon, device=device, dtype=config.torch_dtype
+        )
+
+        # MLP
+        self.mlp = MLP(config, device=device)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        rotary_pos_emb,
+        kv_cache=None,
+        use_cache=True,
+    ):
+        # hidden_states: [s, b, h]
+
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        # Self attention.
+        attention_output, kv_cache = self.self_attention(
+            layernorm_output, attention_mask, rotary_pos_emb, kv_cache=kv_cache, use_cache=use_cache
+        )
+
+        # Residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = hidden_states
+
+        layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
+        layernorm_input = residual + layernorm_input
+
+        # Layer norm post the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+
+        # MLP.
+        mlp_output = self.mlp(layernorm_output)
+
+        # Second residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = layernorm_input
+
+        output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
+        output = residual + output
+
+        return output, kv_cache
+
+
+class GLMTransformer(torch.nn.Module):
+    """Transformer class."""
+
+    def __init__(self, config: ChatGLMConfig, device=None):
+        super(GLMTransformer, self).__init__()
+
+        self.fp32_residual_connection = config.fp32_residual_connection
+        self.post_layer_norm = config.post_layer_norm
+
+        # Number of layers.
+        self.num_layers = config.num_layers
+
+        # Transformer layers.
+        def build_layer(layer_number):
+            return GLMBlock(config, layer_number, device=device)
+
+        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
+
+        if self.post_layer_norm:
+            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
+            # Final layer norm before output.
+            self.final_layernorm = LayerNormFunc(
+                config.hidden_size, eps=config.layernorm_epsilon, device=device, dtype=config.torch_dtype
+            )
+
+        self.gradient_checkpointing = False
+
+    def _get_layer(self, layer_number):
+        return self.layers[layer_number]
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        rotary_pos_emb,
+        kv_caches=None,
+        use_cache: Optional[bool] = True,
+        output_hidden_states: Optional[bool] = False,
+    ):
+        if not kv_caches:
+            kv_caches = [None for _ in range(self.num_layers)]
+        presents = () if use_cache else None
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        all_self_attentions = None
+        all_hidden_states = () if output_hidden_states else None
+        for index in range(self.num_layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer = self._get_layer(index)
+            if self.gradient_checkpointing and self.training:
+                layer_ret = torch.utils.checkpoint.checkpoint(
+                    layer, hidden_states, attention_mask, rotary_pos_emb, kv_caches[index], use_cache
+                )
+            else:
+                layer_ret = layer(
+                    hidden_states, attention_mask, rotary_pos_emb, kv_cache=kv_caches[index], use_cache=use_cache
+                )
+            hidden_states, kv_cache = layer_ret
+            if use_cache:
+                presents = presents + (kv_cache,)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # Final layer norm.
+        if self.post_layer_norm:
+            hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states, presents, all_hidden_states, all_self_attentions
+
+
+class ChatGLMPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    is_parallelizable = False
+    supports_gradient_checkpointing = True
+    config_class = ChatGLMConfig
+    base_model_prefix = "transformer"
+    _no_split_modules = ["GLMBlock"]
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights."""
+        return
+
+    def get_masks(self, input_ids, past_key_values, padding_mask=None):
+        batch_size, seq_length = input_ids.shape
+        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
+        full_attention_mask.tril_()
+        past_length = 0
+        if past_key_values:
+            past_length = past_key_values[0][0].shape[0]
+        if past_length:
+            full_attention_mask = torch.cat(
+                (torch.ones(batch_size, seq_length, past_length, device=input_ids.device), full_attention_mask), dim=-1
+            )
+        if padding_mask is not None:
+            full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
+        if not past_length and padding_mask is not None:
+            full_attention_mask -= padding_mask.unsqueeze(-1) - 1
+        full_attention_mask = (full_attention_mask < 0.5).bool()
+        full_attention_mask.unsqueeze_(1)
+        return full_attention_mask
+
+    def get_position_ids(self, input_ids, device):
+        batch_size, seq_length = input_ids.shape
+        position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
+        return position_ids
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, GLMTransformer):
+            module.gradient_checkpointing = value
+
+
+def default_init(cls, *args, **kwargs):
+    return cls(*args, **kwargs)
+
+
+class Embedding(torch.nn.Module):
+    """Language model embeddings."""
+
+    def __init__(self, config: ChatGLMConfig, device=None):
+        super(Embedding, self).__init__()
+
+        self.hidden_size = config.hidden_size
+        # Word embeddings (parallel).
+        self.word_embeddings = nn.Embedding(
+            config.padded_vocab_size, self.hidden_size, dtype=config.torch_dtype, device=device
+        )
+        self.fp32_residual_connection = config.fp32_residual_connection
+
+    def forward(self, input_ids):
+        # Embeddings.
+        words_embeddings = self.word_embeddings(input_ids)
+        embeddings = words_embeddings
+        # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
+        embeddings = embeddings.transpose(0, 1).contiguous()
+        # If the input flag for fp32 residual connection is set, convert for float.
+        if self.fp32_residual_connection:
+            embeddings = embeddings.float()
+        return embeddings
+
+
+class RotaryEmbedding(nn.Module):
+    def __init__(self, dim, original_impl=False, device=None, dtype=None):
+        super().__init__()
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        self.dim = dim
+        self.original_impl = original_impl
+
+    def forward_impl(self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000):
+        """Enhanced Transformer with Rotary Position Embedding.
+
+        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
+        transformers/rope/__init__.py. MIT License:
+        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
+        """
+        # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
+        theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))
+
+        # Create position indexes `[0, 1, ..., seq_len - 1]`
+        seq_idx = torch.arange(seq_len, dtype=torch.float, device=device)
+
+        # Calculate the product of position index and $\theta_i$
+        idx_theta = torch.outer(seq_idx, theta).float()
+
+        cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
+
+        # this is to mimic the behaviour of complex32, else we will get different results
+        if dtype in (torch.float16, torch.bfloat16, torch.int8):
+            cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
+        return cache
+
+    def forward(self, max_seq_len, offset=0):
+        return self.forward_impl(max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device)
+
+
+class PrefixEncoder(torch.nn.Module):
+    """
+    The torch.nn model to encode the prefix Input shape: (batch-size, prefix-length) Output shape: (batch-size,
+    prefix-length, 2*layers*hidden)
+    """
+
+    def __init__(self, config: ChatGLMConfig):
+        super().__init__()
+        self.prefix_projection = config.prefix_projection
+        if self.prefix_projection:
+            # Use a two-layer MLP to encode the prefix
+            kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2
+            self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
+            self.trans = torch.nn.Sequential(
+                torch.nn.Linear(kv_size, config.hidden_size),
+                torch.nn.Tanh(),
+                torch.nn.Linear(config.hidden_size, kv_size),
+            )
+        else:
+            self.embedding = torch.nn.Embedding(
+                config.pre_seq_len, config.num_layers * config.kv_channels * config.multi_query_group_num * 2
+            )
+
+    def forward(self, prefix: torch.Tensor):
+        if self.prefix_projection:
+            prefix_tokens = self.embedding(prefix)
+            past_key_values = self.trans(prefix_tokens)
+        else:
+            past_key_values = self.embedding(prefix)
+        return past_key_values
+
+
+class ChatGLMModel(ChatGLMPreTrainedModel):
+    def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
+        super().__init__(config)
+        if empty_init:
+            init_method = skip_init
+        else:
+            init_method = default_init
+        init_kwargs = {}
+        if device is not None:
+            init_kwargs["device"] = device
+        self.embedding = init_method(Embedding, config, **init_kwargs)
+        self.num_layers = config.num_layers
+        self.multi_query_group_num = config.multi_query_group_num
+        self.kv_channels = config.kv_channels
+
+        # Rotary positional embeddings
+        self.seq_length = config.seq_length
+        rotary_dim = (
+            config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
+        )
+
+        self.rotary_pos_emb = RotaryEmbedding(
+            rotary_dim // 2, original_impl=config.original_rope, device=device, dtype=config.torch_dtype
+        )
+        self.encoder = init_method(GLMTransformer, config, **init_kwargs)
+        self.output_layer = init_method(
+            nn.Linear,
+            config.hidden_size,
+            config.padded_vocab_size,
+            bias=False,
+            dtype=config.torch_dtype,
+            **init_kwargs,
+        )
+        self.pre_seq_len = config.pre_seq_len
+        self.prefix_projection = config.prefix_projection
+        if self.pre_seq_len is not None:
+            for param in self.parameters():
+                param.requires_grad = False
+            self.prefix_tokens = torch.arange(self.pre_seq_len).long()
+            self.prefix_encoder = PrefixEncoder(config)
+            self.dropout = torch.nn.Dropout(0.1)
+
+    def get_input_embeddings(self):
+        return self.embedding.word_embeddings
+
+    def get_prompt(self, batch_size, device, dtype=torch.half):
+        prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
+        past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
+        past_key_values = past_key_values.view(
+            batch_size, self.pre_seq_len, self.num_layers * 2, self.multi_query_group_num, self.kv_channels
+        )
+        # seq_len, b, nh, hidden_size
+        past_key_values = self.dropout(past_key_values)
+        past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
+        return past_key_values
+
+    def forward(
+        self,
+        input_ids,
+        position_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.BoolTensor] = None,
+        full_attention_mask: Optional[torch.BoolTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, seq_length = input_ids.shape
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embedding(input_ids)
+
+        if self.pre_seq_len is not None:
+            if past_key_values is None:
+                past_key_values = self.get_prompt(
+                    batch_size=batch_size, device=input_ids.device, dtype=inputs_embeds.dtype
+                )
+            if attention_mask is not None:
+                attention_mask = torch.cat(
+                    [attention_mask.new_ones((batch_size, self.pre_seq_len)), attention_mask], dim=-1
+                )
+
+        if full_attention_mask is None:
+            if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
+                full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
+
+        # Rotary positional embeddings
+        rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
+        if position_ids is not None:
+            rotary_pos_emb = rotary_pos_emb[position_ids]
+        else:
+            rotary_pos_emb = rotary_pos_emb[None, :seq_length]
+        rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
+
+        # Run encoder.
+        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
+            inputs_embeds,
+            full_attention_mask,
+            rotary_pos_emb=rotary_pos_emb,
+            kv_caches=past_key_values,
+            use_cache=use_cache,
+            output_hidden_states=output_hidden_states,
+        )
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
diff --git a/diffusers/src/diffusers/pipelines/kolors/tokenizer.py b/diffusers/src/diffusers/pipelines/kolors/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa241b920c97a7ca478c52ef54844e8f1742a881
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/kolors/tokenizer.py
@@ -0,0 +1,338 @@
+# Copyright 2024 ChatGLM3-6B Model Team, Kwai-Kolors Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import re
+from typing import Dict, List, Optional, Union
+
+from sentencepiece import SentencePieceProcessor
+from transformers import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
+from transformers.utils import PaddingStrategy
+
+
+class SPTokenizer:
+    def __init__(self, model_path: str):
+        # reload tokenizer
+        assert os.path.isfile(model_path), model_path
+        self.sp_model = SentencePieceProcessor(model_file=model_path)
+
+        # BOS / EOS token IDs
+        self.n_words: int = self.sp_model.vocab_size()
+        self.bos_id: int = self.sp_model.bos_id()
+        self.eos_id: int = self.sp_model.eos_id()
+        self.pad_id: int = self.sp_model.unk_id()
+        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
+
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        self.special_tokens = {}
+        self.index_special_tokens = {}
+        for token in special_tokens:
+            self.special_tokens[token] = self.n_words
+            self.index_special_tokens[self.n_words] = token
+            self.n_words += 1
+        self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens])
+
+    def tokenize(self, s: str, encode_special_tokens=False):
+        if encode_special_tokens:
+            last_index = 0
+            t = []
+            for match in re.finditer(self.role_special_token_expression, s):
+                if last_index < match.start():
+                    t.extend(self.sp_model.EncodeAsPieces(s[last_index : match.start()]))
+                t.append(s[match.start() : match.end()])
+                last_index = match.end()
+            if last_index < len(s):
+                t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
+            return t
+        else:
+            return self.sp_model.EncodeAsPieces(s)
+
+    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
+        assert isinstance(s, str)
+        t = self.sp_model.encode(s)
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+        return t
+
+    def decode(self, t: List[int]) -> str:
+        text, buffer = "", []
+        for token in t:
+            if token in self.index_special_tokens:
+                if buffer:
+                    text += self.sp_model.decode(buffer)
+                    buffer = []
+                text += self.index_special_tokens[token]
+            else:
+                buffer.append(token)
+        if buffer:
+            text += self.sp_model.decode(buffer)
+        return text
+
+    def decode_tokens(self, tokens: List[str]) -> str:
+        text = self.sp_model.DecodePieces(tokens)
+        return text
+
+    def convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        return self.sp_model.PieceToId(token)
+
+    def convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if index in self.index_special_tokens:
+            return self.index_special_tokens[index]
+        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
+            return ""
+        return self.sp_model.IdToPiece(index)
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "tokenizer.model"}
+
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+    def __init__(
+        self,
+        vocab_file,
+        padding_side="left",
+        clean_up_tokenization_spaces=False,
+        encode_special_tokens=False,
+        **kwargs,
+    ):
+        self.name = "GLMTokenizer"
+
+        self.vocab_file = vocab_file
+        self.tokenizer = SPTokenizer(vocab_file)
+        self.special_tokens = {
+            "<bos>": self.tokenizer.bos_id,
+            "<eos>": self.tokenizer.eos_id,
+            "<pad>": self.tokenizer.pad_id,
+        }
+        self.encode_special_tokens = encode_special_tokens
+        super().__init__(
+            padding_side=padding_side,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            encode_special_tokens=encode_special_tokens,
+            **kwargs,
+        )
+
+    def get_command(self, token):
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
+        return self.tokenizer.special_tokens[token]
+
+    @property
+    def unk_token(self) -> str:
+        return "<unk>"
+
+    @unk_token.setter
+    def unk_token(self, value: str):
+        self._unk_token = value
+
+    @property
+    def pad_token(self) -> str:
+        return "<unk>"
+
+    @pad_token.setter
+    def pad_token(self, value: str):
+        self._pad_token = value
+
+    @property
+    def pad_token_id(self):
+        return self.get_command("<pad>")
+
+    @property
+    def eos_token(self) -> str:
+        return "</s>"
+
+    @eos_token.setter
+    def eos_token(self, value: str):
+        self._eos_token = value
+
+    @property
+    def eos_token_id(self):
+        return self.get_command("<eos>")
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_words
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text, **kwargs):
+        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.tokenizer.convert_token_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return self.tokenizer.decode_tokens(tokens)
+
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*):
+                An optional prefix to add to the named of the saved files.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(save_directory, self.vocab_files_names["vocab_file"])
+        else:
+            vocab_file = save_directory
+
+        with open(self.vocab_file, "rb") as fin:
+            proto_str = fin.read()
+
+        with open(vocab_file, "wb") as writer:
+            writer.write(proto_str)
+
+        return (vocab_file,)
+
+    def get_prefix_tokens(self):
+        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
+        return prefix_tokens
+
+    def build_single_message(self, role, metadata, message):
+        assert role in ["system", "user", "assistant", "observation"], role
+        role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
+        message_tokens = self.tokenizer.encode(message)
+        tokens = role_tokens + message_tokens
+        return tokens
+
+    def build_chat_input(self, query, history=None, role="user"):
+        if history is None:
+            history = []
+        input_ids = []
+        for item in history:
+            content = item["content"]
+            if item["role"] == "system" and "tools" in item:
+                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
+            input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
+        input_ids.extend(self.build_single_message(role, "", query))
+        input_ids.extend([self.get_command("<|assistant|>")])
+        return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        prefix_tokens = self.get_prefix_tokens()
+        token_ids_0 = prefix_tokens + token_ids_0
+        if token_ids_1 is not None:
+            token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
+        return token_ids_0
+
+    def _pad(
+        self,
+        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        padding_side: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        assert self.padding_side == "left"
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+        seq_length = len(required_input)
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * seq_length
+
+        if "position_ids" not in encoded_inputs:
+            encoded_inputs["position_ids"] = list(range(seq_length))
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+            if "position_ids" in encoded_inputs:
+                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
+            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+
+        return encoded_inputs
diff --git a/diffusers/src/diffusers/pipelines/latent_consistency_models/__init__.py b/diffusers/src/diffusers/pipelines/latent_consistency_models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f79d3c4773f393ed689a949041d36ad77e20968
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/latent_consistency_models/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_latent_consistency_img2img"] = ["LatentConsistencyModelImg2ImgPipeline"]
+    _import_structure["pipeline_latent_consistency_text2img"] = ["LatentConsistencyModelPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_latent_consistency_img2img import LatentConsistencyModelImg2ImgPipeline
+        from .pipeline_latent_consistency_text2img import LatentConsistencyModelPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/diffusers/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd72d3c9e10e58b5c23eacca058853df10d0df65
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
@@ -0,0 +1,976 @@
+# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import LCMScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import AutoPipelineForImage2Image
+        >>> import torch
+        >>> import PIL
+
+        >>> pipe = AutoPipelineForImage2Image.from_pretrained("SimianLuo/LCM_Dreamshaper_v7")
+        >>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality.
+        >>> pipe.to(torch_device="cuda", torch_dtype=torch.float32)
+
+        >>> prompt = "High altitude snowy mountains"
+        >>> image = PIL.Image.open("./snowy_mountains.png")
+
+        >>> # Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps.
+        >>> num_inference_steps = 4
+        >>> images = pipe(
+        ...     prompt=prompt, image=image, num_inference_steps=num_inference_steps, guidance_scale=8.0
+        ... ).images
+
+        >>> images[0].save("image.png")
+        ```
+
+"""
+
+
+class LatentConsistencyModelImg2ImgPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for image-to-image generation using a latent consistency model.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Currently only
+            supports [`LCMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+        requires_safety_checker (`bool`, *optional*, defaults to `True`):
+            Whether the pipeline requires a safety checker component.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: LCMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                    raise ValueError(
+                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def check_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        strength: float,
+        callback_steps: int,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    @property
+    def do_classifier_free_guidance(self):
+        return False
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        num_inference_steps: int = 4,
+        strength: float = 0.8,
+        original_inference_steps: int = None,
+        timesteps: List[int] = None,
+        guidance_scale: float = 8.5,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            original_inference_steps (`int`, *optional*):
+                The original number of inference steps use to generate a linearly-spaced timestep schedule, from which
+                we will draw `num_inference_steps` evenly spaced timesteps from as our final timestep schedule,
+                following the Skipping-Step method in the paper (see Section 4.3). If not set this will default to the
+                scheduler's `original_inference_steps` attribute.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps on the original LCM training/distillation timestep schedule are used. Must be in descending
+                order.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+                Note that the original latent consistency models paper uses a different CFG formulation where the
+                guidance scales are decreased by 1 (so in the paper formulation CFG is enabled when `guidance_scale >
+                0`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            strength,
+            callback_steps,
+            prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        # NOTE: when a LCM is distilled from an LDM via latent consistency distillation (Algorithm 1) with guided
+        # distillation, the forward pass of the LCM learns to approximate sampling from the LDM using CFG with the
+        # unconditional prompt "" (the empty string). Due to this, LCMs currently do not support negative prompts.
+        prompt_embeds, _ = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt=None,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=None,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Encode image
+        image = self.image_processor.preprocess(image)
+
+        # 5. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            original_inference_steps=original_inference_steps,
+            strength=strength,
+        )
+
+        # 6. Prepare latent variables
+        original_inference_steps = (
+            original_inference_steps
+            if original_inference_steps is not None
+            else self.scheduler.config.original_inference_steps
+        )
+        latent_timestep = timesteps[:1]
+        if latents is None:
+            latents = self.prepare_latents(
+                image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+            )
+        bs = batch_size * num_images_per_prompt
+
+        # 6. Get Guidance Scale Embedding
+        # NOTE: We use the Imagen CFG formulation that StableDiffusionPipeline uses rather than the original LCM paper
+        # CFG formulation, so we need to subtract 1 from the input guidance_scale.
+        # LCM CFG formulation:  cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond), (cfg_scale > 0.0 using CFG)
+        w = torch.tensor(self.guidance_scale - 1).repeat(bs)
+        w_embedding = self.get_guidance_scale_embedding(w, embedding_dim=self.unet.config.time_cond_proj_dim).to(
+            device=device, dtype=latents.dtype
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)
+
+        # 7.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 8. LCM Multistep Sampling Loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                latents = latents.to(prompt_embeds.dtype)
+
+                # model prediction (v-prediction, eps, x)
+                model_pred = self.unet(
+                    latents,
+                    t,
+                    timestep_cond=w_embedding,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents, denoised = self.scheduler.step(model_pred, t, latents, **extra_step_kwargs, return_dict=False)
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    w_embedding = callback_outputs.pop("w_embedding", w_embedding)
+                    denoised = callback_outputs.pop("denoised", denoised)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        denoised = denoised.to(prompt_embeds.dtype)
+        if not output_type == "latent":
+            image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = denoised
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/diffusers/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..89cafc2877fed0ab2b26d5e6b652c291ee101730
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
@@ -0,0 +1,905 @@
+# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import LCMScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import DiffusionPipeline
+        >>> import torch
+
+        >>> pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7")
+        >>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality.
+        >>> pipe.to(torch_device="cuda", torch_dtype=torch.float32)
+
+        >>> prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
+
+        >>> # Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps.
+        >>> num_inference_steps = 4
+        >>> images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images
+        >>> images[0].save("image.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class LatentConsistencyModelPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using a latent consistency model.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Currently only
+            supports [`LCMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+        requires_safety_checker (`bool`, *optional*, defaults to `True`):
+            Whether the pipeline requires a safety checker component.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: LCMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Currently StableDiffusionPipeline.check_inputs with negative prompt stuff removed
+    def check_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        height: int,
+        width: int,
+        callback_steps: int,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    @property
+    def do_classifier_free_guidance(self):
+        return False
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 4,
+        original_inference_steps: int = None,
+        timesteps: List[int] = None,
+        guidance_scale: float = 8.5,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            original_inference_steps (`int`, *optional*):
+                The original number of inference steps use to generate a linearly-spaced timestep schedule, from which
+                we will draw `num_inference_steps` evenly spaced timesteps from as our final timestep schedule,
+                following the Skipping-Step method in the paper (see Section 4.3). If not set this will default to the
+                scheduler's `original_inference_steps` attribute.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps on the original LCM training/distillation timestep schedule are used. Must be in descending
+                order.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+                Note that the original latent consistency models paper uses a different CFG formulation where the
+                guidance scales are decreased by 1 (so in the paper formulation CFG is enabled when `guidance_scale >
+                0`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        # NOTE: when a LCM is distilled from an LDM via latent consistency distillation (Algorithm 1) with guided
+        # distillation, the forward pass of the LCM learns to approximate sampling from the LDM using CFG with the
+        # unconditional prompt "" (the empty string). Due to this, LCMs currently do not support negative prompts.
+        prompt_embeds, _ = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt=None,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=None,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, original_inference_steps=original_inference_steps
+        )
+
+        # 5. Prepare latent variable
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        bs = batch_size * num_images_per_prompt
+
+        # 6. Get Guidance Scale Embedding
+        # NOTE: We use the Imagen CFG formulation that StableDiffusionPipeline uses rather than the original LCM paper
+        # CFG formulation, so we need to subtract 1 from the input guidance_scale.
+        # LCM CFG formulation:  cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond), (cfg_scale > 0.0 using CFG)
+        w = torch.tensor(self.guidance_scale - 1).repeat(bs)
+        w_embedding = self.get_guidance_scale_embedding(w, embedding_dim=self.unet.config.time_cond_proj_dim).to(
+            device=device, dtype=latents.dtype
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)
+
+        # 7.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 8. LCM MultiStep Sampling Loop:
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                latents = latents.to(prompt_embeds.dtype)
+
+                # model prediction (v-prediction, eps, x)
+                model_pred = self.unet(
+                    latents,
+                    t,
+                    timestep_cond=w_embedding,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents, denoised = self.scheduler.step(model_pred, t, latents, **extra_step_kwargs, return_dict=False)
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    w_embedding = callback_outputs.pop("w_embedding", w_embedding)
+                    denoised = callback_outputs.pop("denoised", denoised)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        denoised = denoised.to(prompt_embeds.dtype)
+        if not output_type == "latent":
+            image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = denoised
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/latent_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/latent_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..561f96fc71dc7b4404e09571e0b7eaa4ee02fde8
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/latent_diffusion/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_latent_diffusion"] = ["LDMBertModel", "LDMTextToImagePipeline"]
+    _import_structure["pipeline_latent_diffusion_superresolution"] = ["LDMSuperResolutionPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_latent_diffusion import LDMBertModel, LDMTextToImagePipeline
+        from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/diffusers/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6f3531a883554411d06e806367387b1c4e59a0a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -0,0 +1,746 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutput
+from transformers.utils import logging
+
+from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class LDMTextToImagePipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using latent diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        vqvae ([`VQModel`]):
+            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
+        bert ([`LDMBertModel`]):
+            Text-encoder model based on [`~transformers.BERT`].
+        tokenizer ([`~transformers.BertTokenizer`]):
+            A `BertTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "bert->unet->vqvae"
+
+    def __init__(
+        self,
+        vqvae: Union[VQModel, AutoencoderKL],
+        bert: PreTrainedModel,
+        tokenizer: PreTrainedTokenizer,
+        unet: Union[UNet2DModel, UNet2DConditionModel],
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+    ):
+        super().__init__()
+        self.register_modules(vqvae=vqvae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+        self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 1.0,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[Tuple, ImagePipelineOutput]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 1.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from diffusers import DiffusionPipeline
+
+        >>> # load model and scheduler
+        >>> ldm = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> prompt = "A painting of a squirrel eating a burger"
+        >>> images = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images
+
+        >>> # save images
+        >>> for idx, image in enumerate(images):
+        ...     image.save(f"squirrel-{idx}.png")
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # get unconditional embeddings for classifier free guidance
+        if guidance_scale != 1.0:
+            uncond_input = self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=77, truncation=True, return_tensors="pt"
+            )
+            negative_prompt_embeds = self.bert(uncond_input.input_ids.to(self._execution_device))[0]
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt")
+        prompt_embeds = self.bert(text_input.input_ids.to(self._execution_device))[0]
+
+        # get the initial random noise unless the user supplied it
+        latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(
+                latents_shape, generator=generator, device=self._execution_device, dtype=prompt_embeds.dtype
+            )
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+        latents = latents.to(self._execution_device)
+
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+
+        extra_kwargs = {}
+        if accepts_eta:
+            extra_kwargs["eta"] = eta
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            if guidance_scale == 1.0:
+                # guidance_scale of 1 means no guidance
+                latents_input = latents
+                context = prompt_embeds
+            else:
+                # For classifier free guidance, we need to do two forward passes.
+                # Here we concatenate the unconditional and text embeddings into a single batch
+                # to avoid doing two forward passes
+                latents_input = torch.cat([latents] * 2)
+                context = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+            # predict the noise residual
+            noise_pred = self.unet(latents_input, t, encoder_hidden_states=context).sample
+            # perform guidance
+            if guidance_scale != 1.0:
+                noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / self.vqvae.config.scaling_factor * latents
+        image = self.vqvae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
+
+
+################################################################################
+# Code for the text transformer model
+################################################################################
+""" PyTorch LDMBERT model."""
+
+
+logger = logging.get_logger(__name__)
+
+LDMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "ldm-bert",
+    # See all LDMBert models at https://huggingface.co/models?filter=ldmbert
+]
+
+
+LDMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "ldm-bert": "https://huggingface.co/valhalla/ldm-bert/blob/main/config.json",
+}
+
+
+""" LDMBERT model configuration"""
+
+
+class LDMBertConfig(PretrainedConfig):
+    model_type = "ldmbert"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        max_position_embeddings=77,
+        encoder_layers=32,
+        encoder_ffn_dim=5120,
+        encoder_attention_heads=8,
+        head_dim=64,
+        encoder_layerdrop=0.0,
+        activation_function="gelu",
+        d_model=1280,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        use_cache=True,
+        pad_token_id=0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.head_dim = head_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->LDMBert
+class LDMBertAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        head_dim: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = False,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = head_dim
+        self.inner_dim = head_dim * num_heads
+
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+
+        self.k_proj = nn.Linear(embed_dim, self.inner_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, self.inner_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, self.inner_dim, bias=bias)
+        self.out_proj = nn.Linear(self.inner_dim, embed_dim)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.inner_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+
+class LDMBertEncoderLayer(nn.Module):
+    def __init__(self, config: LDMBertConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = LDMBertAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            head_dim=config.head_dim,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Args:
+            hidden_states (`torch.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+            attention_mask (`torch.Tensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.Tensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.bart.modeling_bart.BartPretrainedModel with Bart->LDMBert
+class LDMBertPreTrainedModel(PreTrainedModel):
+    config_class = LDMBertConfig
+    base_model_prefix = "model"
+    _supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"]
+
+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (LDMBertEncoder,)):
+            module.gradient_checkpointing = value
+
+    @property
+    def dummy_inputs(self):
+        pad_token = self.config.pad_token_id
+        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
+        dummy_inputs = {
+            "attention_mask": input_ids.ne(pad_token),
+            "input_ids": input_ids,
+        }
+        return dummy_inputs
+
+
+class LDMBertEncoder(LDMBertPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`LDMBertEncoderLayer`].
+
+    Args:
+        config: LDMBertConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: LDMBertConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+
+        embed_dim = config.d_model
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_position_embeddings
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim)
+        self.embed_positions = nn.Embedding(config.max_position_embeddings, embed_dim)
+        self.layers = nn.ModuleList([LDMBertEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layer_norm = nn.LayerNorm(embed_dim)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.BaseModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        seq_len = input_shape[1]
+        if position_ids is None:
+            position_ids = torch.arange(seq_len, dtype=torch.long, device=inputs_embeds.device).expand((1, -1))
+        embed_pos = self.embed_positions(position_ids)
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.size()[0] != (len(self.layers)):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs, output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(encoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    (head_mask[idx] if head_mask is not None else None),
+                )
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    output_attentions=output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class LDMBertModel(LDMBertPreTrainedModel):
+    _no_split_modules = []
+
+    def __init__(self, config: LDMBertConfig):
+        super().__init__(config)
+        self.model = LDMBertEncoder(config)
+        self.to_logits = nn.Linear(config.hidden_size, config.vocab_size)
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        return outputs
diff --git a/diffusers/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/diffusers/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb72b4d4eb8e387d596b22cca65c82aef0ab9e75
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
@@ -0,0 +1,189 @@
+import inspect
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.utils.checkpoint
+
+from ...models import UNet2DModel, VQModel
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ...utils import PIL_INTERPOLATION
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+class LDMSuperResolutionPipeline(DiffusionPipeline):
+    r"""
+    A pipeline for image super-resolution using latent diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        vqvae ([`VQModel`]):
+            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`],
+            [`EulerAncestralDiscreteScheduler`], [`DPMSolverMultistepScheduler`], or [`PNDMScheduler`].
+    """
+
+    def __init__(
+        self,
+        vqvae: VQModel,
+        unet: UNet2DModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+    ):
+        super().__init__()
+        self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        batch_size: Optional[int] = 1,
+        num_inference_steps: Optional[int] = 100,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ) -> Union[Tuple, ImagePipelineOutput]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image` or tensor representing an image batch to be used as the starting point for the process.
+            batch_size (`int`, *optional*, defaults to 1):
+                Number of images to generate.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> import requests
+        >>> from PIL import Image
+        >>> from io import BytesIO
+        >>> from diffusers import LDMSuperResolutionPipeline
+        >>> import torch
+
+        >>> # load model and scheduler
+        >>> pipeline = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
+        >>> pipeline = pipeline.to("cuda")
+
+        >>> # let's download an  image
+        >>> url = (
+        ...     "https://user-images.githubusercontent.com/38061659/199705896-b48e17b8-b231-47cd-a270-4ffa5a93fa3e.png"
+        ... )
+        >>> response = requests.get(url)
+        >>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> low_res_img = low_res_img.resize((128, 128))
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> upscaled_image = pipeline(low_res_img, num_inference_steps=100, eta=1).images[0]
+        >>> # save image
+        >>> upscaled_image.save("ldm_generated_image.png")
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        if isinstance(image, PIL.Image.Image):
+            batch_size = 1
+        elif isinstance(image, torch.Tensor):
+            batch_size = image.shape[0]
+        else:
+            raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}")
+
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)
+
+        height, width = image.shape[-2:]
+
+        # in_channels should be 6: 3 for latents, 3 for low resolution image
+        latents_shape = (batch_size, self.unet.config.in_channels // 2, height, width)
+        latents_dtype = next(self.unet.parameters()).dtype
+
+        latents = randn_tensor(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+
+        image = image.to(device=self.device, dtype=latents_dtype)
+
+        # set timesteps and move to the correct device
+        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        timesteps_tensor = self.scheduler.timesteps
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature.
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_kwargs = {}
+        if accepts_eta:
+            extra_kwargs["eta"] = eta
+
+        for t in self.progress_bar(timesteps_tensor):
+            # concat latents and low resolution image in the channel dimension.
+            latents_input = torch.cat([latents, image], dim=1)
+            latents_input = self.scheduler.scale_model_input(latents_input, t)
+            # predict the noise residual
+            noise_pred = self.unet(latents_input, t).sample
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample
+
+        # decode the image latents with the VQVAE
+        image = self.vqvae.decode(latents).sample
+        image = torch.clamp(image, -1.0, 1.0)
+        image = image / 2 + 0.5
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/latte/__init__.py b/diffusers/src/diffusers/pipelines/latte/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4296b42e125303c0e036b9f2deecc36bdf959de3
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/latte/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_latte"] = ["LattePipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_latte import LattePipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/latte/pipeline_latte.py b/diffusers/src/diffusers/pipelines/latte/pipeline_latte.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4bedf2e0e2a05ab057a08d27ace1cdeccce4610
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/latte/pipeline_latte.py
@@ -0,0 +1,881 @@
+# Copyright 2024 the Latte Team and The HuggingFace Team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+import inspect
+import re
+import urllib.parse as ul
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...models import AutoencoderKL, LatteTransformer3DModel
+from ...pipelines.pipeline_utils import DiffusionPipeline
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    BACKENDS_MAPPING,
+    BaseOutput,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...video_processor import VideoProcessor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import LattePipeline
+        >>> from diffusers.utils import export_to_gif
+
+        >>> # You can replace the checkpoint id with "maxin-cn/Latte-1" too.
+        >>> pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16)
+        >>> # Enable memory optimizations.
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "A small cactus with a happy face in the Sahara desert."
+        >>> videos = pipe(prompt).frames[0]
+        >>> export_to_gif(videos, "latte.gif")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+@dataclass
+class LattePipelineOutput(BaseOutput):
+    frames: torch.Tensor
+
+
+class LattePipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-video generation using Latte.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        text_encoder ([`T5EncoderModel`]):
+            Frozen text-encoder. Latte uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`T5Tokenizer`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        transformer ([`LatteTransformer3DModel`]):
+            A text conditioned `LatteTransformer3DModel` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+    """
+
+    bad_punct_regex = re.compile(r"[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\\*]{1,}")
+
+    _optional_components = ["tokenizer", "text_encoder"]
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+    ]
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        vae: AutoencoderKL,
+        transformer: LatteTransformer3DModel,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
+    def mask_text_embeddings(self, emb, mask):
+        if emb.shape[0] == 1:
+            keep_index = mask.sum().item()
+            return emb[:, :, :keep_index, :], keep_index  # 1, 120, 4096 -> 1 7 4096
+        else:
+            masked_feature = emb * mask[:, None, :, None]  # 1 120 4096
+            return masked_feature, emb.shape[2]
+
+    # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str = "",
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        clean_caption: bool = False,
+        mask_feature: bool = True,
+        dtype=None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the video generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+                Latte, this should be "".
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of video that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. For Latte, it's should be the embeddings of the "" string.
+            clean_caption (bool, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+            mask_feature: (bool, defaults to `True`):
+                If `True`, the function will mask the text embeddings.
+        """
+        embeds_initially_provided = prompt_embeds is not None and negative_prompt_embeds is not None
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        max_length = 120
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            attention_mask = text_inputs.attention_mask.to(device)
+            prompt_embeds_attention_mask = attention_mask
+
+            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+            prompt_embeds = prompt_embeds[0]
+        else:
+            prompt_embeds_attention_mask = torch.ones_like(prompt_embeds)
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.transformer is not None:
+            dtype = self.transformer.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        prompt_embeds_attention_mask = prompt_embeds_attention_mask.view(bs_embed, -1)
+        prompt_embeds_attention_mask = prompt_embeds_attention_mask.repeat(num_images_per_prompt, 1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            attention_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+        else:
+            negative_prompt_embeds = None
+
+        # Perform additional masking.
+        if mask_feature and not embeds_initially_provided:
+            prompt_embeds = prompt_embeds.unsqueeze(1)
+            masked_prompt_embeds, keep_indices = self.mask_text_embeddings(prompt_embeds, prompt_embeds_attention_mask)
+            masked_prompt_embeds = masked_prompt_embeds.squeeze(1)
+            masked_negative_prompt_embeds = (
+                negative_prompt_embeds[:, :keep_indices, :] if negative_prompt_embeds is not None else None
+            )
+
+            return masked_prompt_embeds, masked_negative_prompt_embeds
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        callback_on_step_end_tensor_inputs,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: str = "",
+        num_inference_steps: int = 50,
+        timesteps: Optional[List[int]] = None,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: int = 1,
+        video_length: int = 16,
+        height: int = 512,
+        width: int = 512,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        clean_caption: bool = True,
+        mask_feature: bool = True,
+        enable_temporal_attentions: bool = True,
+        decode_chunk_size: Optional[int] = None,
+    ) -> Union[LattePipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate videos that are closely linked to the text `prompt`,
+                usually at the expense of lower video quality.
+            video_length (`int`, *optional*, defaults to 16):
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The width in pixels of the generated video.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. For Latte this negative prompt should be "". If not provided,
+                negative_prompt_embeds will be generated from `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate video. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable[[int, int, Dict], None]`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A callback function or a list of callback functions to be called at the end of each denoising step.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                A list of tensor inputs that should be passed to the callback function. If not defined, all tensor
+                inputs will be passed.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            mask_feature (`bool` defaults to `True`): If set to `True`, the text embeddings will be masked.
+            enable_temporal_attentions (`bool`, *optional*, defaults to `True`): Whether to enable temporal attentions
+            decode_chunk_size (`int`, *optional*):
+                The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
+                expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
+                For lower memory usage, reduce `decode_chunk_size`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.latte.pipeline_latte.LattePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.latte.pipeline_latte.LattePipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images
+        """
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default
+        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else video_length
+
+        # 1. Check inputs. Raise error if not correct
+        height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        width = width or self.transformer.config.sample_size * self.vae_scale_factor
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            callback_on_step_end_tensor_inputs,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+        self._guidance_scale = guidance_scale
+        self._interrupt = False
+
+        # 2. Default height and width to transformer
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clean_caption=clean_caption,
+            mask_feature=mask_feature,
+        )
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            latent_channels,
+            video_length,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                current_timestep = t
+                if not torch.is_tensor(current_timestep):
+                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                    # This would be a good case for the `match` statement (Python 3.10+)
+                    is_mps = latent_model_input.device.type == "mps"
+                    if isinstance(current_timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
+                elif len(current_timestep.shape) == 0:
+                    current_timestep = current_timestep[None].to(latent_model_input.device)
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+                # predict noise model_output
+                noise_pred = self.transformer(
+                    latent_model_input,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep=current_timestep,
+                    enable_temporal_attentions=enable_temporal_attentions,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # use learned sigma?
+                if not (
+                    hasattr(self.scheduler.config, "variance_type")
+                    and self.scheduler.config.variance_type in ["learned", "learned_range"]
+                ):
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+
+                # compute previous video: x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if not output_type == "latents":
+            video = self.decode_latents(latents, video_length, decode_chunk_size=14)
+            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return LattePipelineOutput(frames=video)
+
+    # Similar to diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion.decode_latents
+    def decode_latents(self, latents: torch.Tensor, video_length: int, decode_chunk_size: int = 14):
+        # [batch, channels, frames, height, width] -> [batch*frames, channels, height, width]
+        latents = latents.permute(0, 2, 1, 3, 4).flatten(0, 1)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
+        accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
+
+        # decode decode_chunk_size frames at a time to avoid OOM
+        frames = []
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            num_frames_in = latents[i : i + decode_chunk_size].shape[0]
+            decode_kwargs = {}
+            if accepts_num_frames:
+                # we only pass num_frames_in if it's expected
+                decode_kwargs["num_frames"] = num_frames_in
+
+            frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample
+            frames.append(frame)
+        frames = torch.cat(frames, dim=0)
+
+        # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
+        frames = frames.reshape(-1, video_length, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        frames = frames.float()
+        return frames
diff --git a/diffusers/src/diffusers/pipelines/ledits_pp/__init__.py b/diffusers/src/diffusers/pipelines/ledits_pp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aae3b1cb18ce96c7d79d3c44bc37cf2ae4db9720
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/ledits_pp/__init__.py
@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_leditspp_stable_diffusion"] = ["LEditsPPPipelineStableDiffusion"]
+    _import_structure["pipeline_leditspp_stable_diffusion_xl"] = ["LEditsPPPipelineStableDiffusionXL"]
+
+    _import_structure["pipeline_output"] = ["LEditsPPDiffusionPipelineOutput", "LEditsPPDiffusionPipelineOutput"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_leditspp_stable_diffusion import (
+            LEditsPPDiffusionPipelineOutput,
+            LEditsPPInversionPipelineOutput,
+            LEditsPPPipelineStableDiffusion,
+        )
+        from .pipeline_leditspp_stable_diffusion_xl import LEditsPPPipelineStableDiffusionXL
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py b/diffusers/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..049b89661b115afd01af789e878773d05e8fdfa5
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py
@@ -0,0 +1,1499 @@
+import inspect
+import math
+from itertools import repeat
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention_processor import Attention, AttnProcessor
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from ...schedulers import DDIMScheduler, DPMSolverMultistepScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import LEditsPPDiffusionPipelineOutput, LEditsPPInversionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import PIL
+        >>> import requests
+        >>> import torch
+        >>> from io import BytesIO
+
+        >>> from diffusers import LEditsPPPipelineStableDiffusion
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = LEditsPPPipelineStableDiffusion.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png"
+        >>> image = load_image(img_url).convert("RGB")
+
+        >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.1)
+
+        >>> edited_image = pipe(
+        ...     editing_prompt=["cherry blossom"], edit_guidance_scale=10.0, edit_threshold=0.75
+        ... ).images[0]
+        ```
+"""
+
+
+# Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionAttendAndExcitePipeline.AttentionStore
+class LeditsAttentionStore:
+    @staticmethod
+    def get_empty_store():
+        return {"down_cross": [], "mid_cross": [], "up_cross": [], "down_self": [], "mid_self": [], "up_self": []}
+
+    def __call__(self, attn, is_cross: bool, place_in_unet: str, editing_prompts, PnP=False):
+        # attn.shape = batch_size * head_size, seq_len query, seq_len_key
+        if attn.shape[1] <= self.max_size:
+            bs = 1 + int(PnP) + editing_prompts
+            skip = 2 if PnP else 1  # skip PnP & unconditional
+            attn = torch.stack(attn.split(self.batch_size)).permute(1, 0, 2, 3)
+            source_batch_size = int(attn.shape[1] // bs)
+            self.forward(attn[:, skip * source_batch_size :], is_cross, place_in_unet)
+
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
+
+        self.step_store[key].append(attn)
+
+    def between_steps(self, store_step=True):
+        if store_step:
+            if self.average:
+                if len(self.attention_store) == 0:
+                    self.attention_store = self.step_store
+                else:
+                    for key in self.attention_store:
+                        for i in range(len(self.attention_store[key])):
+                            self.attention_store[key][i] += self.step_store[key][i]
+            else:
+                if len(self.attention_store) == 0:
+                    self.attention_store = [self.step_store]
+                else:
+                    self.attention_store.append(self.step_store)
+
+            self.cur_step += 1
+        self.step_store = self.get_empty_store()
+
+    def get_attention(self, step: int):
+        if self.average:
+            attention = {
+                key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store
+            }
+        else:
+            assert step is not None
+            attention = self.attention_store[step]
+        return attention
+
+    def aggregate_attention(
+        self, attention_maps, prompts, res: Union[int, Tuple[int]], from_where: List[str], is_cross: bool, select: int
+    ):
+        out = [[] for x in range(self.batch_size)]
+        if isinstance(res, int):
+            num_pixels = res**2
+            resolution = (res, res)
+        else:
+            num_pixels = res[0] * res[1]
+            resolution = res[:2]
+
+        for location in from_where:
+            for bs_item in attention_maps[f"{location}_{'cross' if is_cross else 'self'}"]:
+                for batch, item in enumerate(bs_item):
+                    if item.shape[1] == num_pixels:
+                        cross_maps = item.reshape(len(prompts), -1, *resolution, item.shape[-1])[select]
+                        out[batch].append(cross_maps)
+
+        out = torch.stack([torch.cat(x, dim=0) for x in out])
+        # average over heads
+        out = out.sum(1) / out.shape[1]
+        return out
+
+    def __init__(self, average: bool, batch_size=1, max_resolution=16, max_size: int = None):
+        self.step_store = self.get_empty_store()
+        self.attention_store = []
+        self.cur_step = 0
+        self.average = average
+        self.batch_size = batch_size
+        if max_size is None:
+            self.max_size = max_resolution**2
+        elif max_size is not None and max_resolution is None:
+            self.max_size = max_size
+        else:
+            raise ValueError("Only allowed to set one of max_resolution or max_size")
+
+
+# Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionAttendAndExcitePipeline.GaussianSmoothing
+class LeditsGaussianSmoothing:
+    def __init__(self, device):
+        kernel_size = [3, 3]
+        sigma = [0.5, 0.5]
+
+        # The gaussian kernel is the product of the gaussian function of each dimension.
+        kernel = 1
+        meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
+        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
+            mean = (size - 1) / 2
+            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
+
+        # Make sure sum of values in gaussian kernel equals 1.
+        kernel = kernel / torch.sum(kernel)
+
+        # Reshape to depthwise convolutional weight
+        kernel = kernel.view(1, 1, *kernel.size())
+        kernel = kernel.repeat(1, *[1] * (kernel.dim() - 1))
+
+        self.weight = kernel.to(device)
+
+    def __call__(self, input):
+        """
+        Arguments:
+        Apply gaussian filter to input.
+            input (torch.Tensor): Input to apply gaussian filter on.
+        Returns:
+            filtered (torch.Tensor): Filtered output.
+        """
+        return F.conv2d(input, weight=self.weight.to(input.dtype))
+
+
+class LEDITSCrossAttnProcessor:
+    def __init__(self, attention_store, place_in_unet, pnp, editing_prompts):
+        self.attnstore = attention_store
+        self.place_in_unet = place_in_unet
+        self.editing_prompts = editing_prompts
+        self.pnp = pnp
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states,
+        attention_mask=None,
+        temb=None,
+    ):
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        self.attnstore(
+            attention_probs,
+            is_cross=True,
+            place_in_unet=self.place_in_unet,
+            editing_prompts=self.editing_prompts,
+            PnP=self.pnp,
+        )
+
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class LEditsPPPipelineStableDiffusion(
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+):
+    """
+    Pipeline for textual image editing using LEDits++ with Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionPipeline`]. Check the superclass
+    documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular
+    device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
+            automatically be set to [`DPMSolverMultistepScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, DPMSolverMultistepScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if not isinstance(scheduler, DDIMScheduler) and not isinstance(scheduler, DPMSolverMultistepScheduler):
+            scheduler = DPMSolverMultistepScheduler.from_config(
+                scheduler.config, algorithm_type="sde-dpmsolver++", solver_order=2
+            )
+            logger.warning(
+                "This pipeline only supports DDIMScheduler and DPMSolverMultistepScheduler. "
+                "The scheduler has been changed to DPMSolverMultistepScheduler."
+            )
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+        self.inversion_steps = None
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, eta, generator=None):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        negative_prompt=None,
+        editing_prompt_embeddings=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if editing_prompt_embeddings is not None and negative_prompt_embeds is not None:
+            if editing_prompt_embeddings.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`editing_prompt_embeddings` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `editing_prompt_embeddings` {editing_prompt_embeddings.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, latents):
+        # shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+
+        # if latents.shape != shape:
+        #    raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+
+        latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_unet(self, attention_store, PnP: bool = False):
+        attn_procs = {}
+        for name in self.unet.attn_processors.keys():
+            if name.startswith("mid_block"):
+                place_in_unet = "mid"
+            elif name.startswith("up_blocks"):
+                place_in_unet = "up"
+            elif name.startswith("down_blocks"):
+                place_in_unet = "down"
+            else:
+                continue
+
+            if "attn2" in name and place_in_unet != "mid":
+                attn_procs[name] = LEDITSCrossAttnProcessor(
+                    attention_store=attention_store,
+                    place_in_unet=place_in_unet,
+                    pnp=PnP,
+                    editing_prompts=self.enabled_editing_prompts,
+                )
+            else:
+                attn_procs[name] = AttnProcessor()
+
+        self.unet.set_attn_processor(attn_procs)
+
+    def encode_prompt(
+        self,
+        device,
+        num_images_per_prompt,
+        enable_edit_guidance,
+        negative_prompt=None,
+        editing_prompt=None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        editing_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            enable_edit_guidance (`bool`):
+                whether to perform any editing or reconstruct the input image instead
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            editing_prompt (`str` or `List[str]`, *optional*):
+                Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead.
+            editing_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        batch_size = self.batch_size
+        num_edit_tokens = None
+
+        if negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but exoected"
+                    f"{batch_size} based on the input images. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = negative_prompt_embeds.dtype
+
+        negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        if enable_edit_guidance:
+            if editing_prompt_embeds is None:
+                # textual inversion: procecss multi-vector tokens if necessary
+                # if isinstance(self, TextualInversionLoaderMixin):
+                #    prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+                if isinstance(editing_prompt, str):
+                    editing_prompt = [editing_prompt]
+
+                max_length = negative_prompt_embeds.shape[1]
+                text_inputs = self.tokenizer(
+                    [x for item in editing_prompt for x in repeat(item, batch_size)],
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                    return_length=True,
+                )
+
+                num_edit_tokens = text_inputs.length - 2  # not counting startoftext and endoftext
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = self.tokenizer(
+                    [x for item in editing_prompt for x in repeat(item, batch_size)],
+                    padding="longest",
+                    return_tensors="pt",
+                ).input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = self.tokenizer.batch_decode(
+                        untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                    )
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                if (
+                    hasattr(self.text_encoder.config, "use_attention_mask")
+                    and self.text_encoder.config.use_attention_mask
+                ):
+                    attention_mask = text_inputs.attention_mask.to(device)
+                else:
+                    attention_mask = None
+
+                if clip_skip is None:
+                    editing_prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                    editing_prompt_embeds = editing_prompt_embeds[0]
+                else:
+                    editing_prompt_embeds = self.text_encoder(
+                        text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                    )
+                    # Access the `hidden_states` first, that contains a tuple of
+                    # all the hidden states from the encoder layers. Then index into
+                    # the tuple to access the hidden states from the desired layer.
+                    editing_prompt_embeds = editing_prompt_embeds[-1][-(clip_skip + 1)]
+                    # We also need to apply the final LayerNorm here to not mess with the
+                    # representations. The `last_hidden_states` that we typically use for
+                    # obtaining the final prompt representations passes through the LayerNorm
+                    # layer.
+                    editing_prompt_embeds = self.text_encoder.text_model.final_layer_norm(editing_prompt_embeds)
+
+            editing_prompt_embeds = editing_prompt_embeds.to(dtype=negative_prompt_embeds.dtype, device=device)
+
+            bs_embed_edit, seq_len, _ = editing_prompt_embeds.shape
+            editing_prompt_embeds = editing_prompt_embeds.to(dtype=negative_prompt_embeds.dtype, device=device)
+            editing_prompt_embeds = editing_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            editing_prompt_embeds = editing_prompt_embeds.view(bs_embed_edit * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+
+        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+        seq_len = negative_prompt_embeds.shape[1]
+
+        negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return editing_prompt_embeds, negative_prompt_embeds, num_edit_tokens
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        editing_prompt: Optional[Union[str, List[str]]] = None,
+        editing_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        reverse_editing_direction: Optional[Union[bool, List[bool]]] = False,
+        edit_guidance_scale: Optional[Union[float, List[float]]] = 5,
+        edit_warmup_steps: Optional[Union[int, List[int]]] = 0,
+        edit_cooldown_steps: Optional[Union[int, List[int]]] = None,
+        edit_threshold: Optional[Union[float, List[float]]] = 0.9,
+        user_mask: Optional[torch.Tensor] = None,
+        sem_guidance: Optional[List[torch.Tensor]] = None,
+        use_cross_attn_mask: bool = False,
+        use_intersect_mask: bool = True,
+        attn_store_steps: Optional[List[int]] = [],
+        store_averaged_over_steps: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for editing. The
+        [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] method has to be called beforehand. Edits will
+        always be performed for the last inverted image(s).
+
+        Args:
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a plain
+                tuple.
+            editing_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. The image is reconstructed by setting
+                `editing_prompt = None`. Guidance direction of prompt should be specified via
+                `reverse_editing_direction`.
+            editing_prompt_embeds (`torch.Tensor>`, *optional*):
+                Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should
+                be specified via `reverse_editing_direction`.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
+                Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
+            edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
+                Guidance scale for guiding the image generation. If provided as list values should correspond to
+                `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
+                Paper](https://arxiv.org/abs/2301.12247).
+            edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
+                Number of diffusion steps (for each prompt) for which guidance will not be applied.
+            edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
+                Number of diffusion steps (for each prompt) after which guidance will no longer be applied.
+            edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
+                Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
+                'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
+                Paper](https://arxiv.org/abs/2301.12247).
+            user_mask (`torch.Tensor`, *optional*):
+                User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
+                implicit masks do not meet user preferences.
+            sem_guidance (`List[torch.Tensor]`, *optional*):
+                List of pre-generated guidance vectors to be applied at generation. Length of the list has to
+                correspond to `num_inference_steps`.
+            use_cross_attn_mask (`bool`, defaults to `False`):
+                Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
+                is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
+                paper](https://arxiv.org/pdf/2311.16711.pdf).
+            use_intersect_mask (`bool`, defaults to `True`):
+                Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
+                the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
+                are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+            attn_store_steps (`List[int]`, *optional*):
+                Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
+            store_averaged_over_steps (`bool`, defaults to `True`):
+                Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
+                False, attention maps for each step are stores separately. Just for visualization purposes.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images, and the second element is a list
+            of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+            content, according to the `safety_checker`.
+        """
+
+        if self.inversion_steps is None:
+            raise ValueError(
+                "You need to invert an input image first before calling the pipeline. The `invert` method has to be called beforehand. Edits will always be performed for the last inverted image(s)."
+            )
+
+        eta = self.eta
+        num_images_per_prompt = 1
+        latents = self.init_latents
+
+        zs = self.zs
+        self.scheduler.set_timesteps(len(self.scheduler.timesteps))
+
+        if use_intersect_mask:
+            use_cross_attn_mask = True
+
+        if use_cross_attn_mask:
+            self.smoothing = LeditsGaussianSmoothing(self.device)
+
+        if user_mask is not None:
+            user_mask = user_mask.to(self.device)
+
+        org_prompt = ""
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            negative_prompt,
+            editing_prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        batch_size = self.batch_size
+
+        if editing_prompt:
+            enable_edit_guidance = True
+            if isinstance(editing_prompt, str):
+                editing_prompt = [editing_prompt]
+            self.enabled_editing_prompts = len(editing_prompt)
+        elif editing_prompt_embeds is not None:
+            enable_edit_guidance = True
+            self.enabled_editing_prompts = editing_prompt_embeds.shape[0]
+        else:
+            self.enabled_editing_prompts = 0
+            enable_edit_guidance = False
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        edit_concepts, uncond_embeddings, num_edit_tokens = self.encode_prompt(
+            editing_prompt=editing_prompt,
+            device=self.device,
+            num_images_per_prompt=num_images_per_prompt,
+            enable_edit_guidance=enable_edit_guidance,
+            negative_prompt=negative_prompt,
+            editing_prompt_embeds=editing_prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if enable_edit_guidance:
+            text_embeddings = torch.cat([uncond_embeddings, edit_concepts])
+            self.text_cross_attention_maps = [editing_prompt] if isinstance(editing_prompt, str) else editing_prompt
+        else:
+            text_embeddings = torch.cat([uncond_embeddings])
+
+        # 4. Prepare timesteps
+        # self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        timesteps = self.inversion_steps
+        t_to_idx = {int(v): k for k, v in enumerate(timesteps[-zs.shape[0] :])}
+
+        if use_cross_attn_mask:
+            self.attention_store = LeditsAttentionStore(
+                average=store_averaged_over_steps,
+                batch_size=batch_size,
+                max_size=(latents.shape[-2] / 4.0) * (latents.shape[-1] / 4.0),
+                max_resolution=None,
+            )
+            self.prepare_unet(self.attention_store, PnP=False)
+            resolution = latents.shape[-2:]
+            att_res = (int(resolution[0] / 4), int(resolution[1] / 4))
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            None,
+            None,
+            text_embeddings.dtype,
+            self.device,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(eta)
+
+        self.sem_guidance = None
+        self.activation_mask = None
+
+        # 7. Denoising loop
+        num_warmup_steps = 0
+        with self.progress_bar(total=len(timesteps)) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+
+                if enable_edit_guidance:
+                    latent_model_input = torch.cat([latents] * (1 + self.enabled_editing_prompts))
+                else:
+                    latent_model_input = latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                text_embed_input = text_embeddings
+
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embed_input).sample
+
+                noise_pred_out = noise_pred.chunk(1 + self.enabled_editing_prompts)  # [b,4, 64, 64]
+                noise_pred_uncond = noise_pred_out[0]
+                noise_pred_edit_concepts = noise_pred_out[1:]
+
+                noise_guidance_edit = torch.zeros(
+                    noise_pred_uncond.shape,
+                    device=self.device,
+                    dtype=noise_pred_uncond.dtype,
+                )
+
+                if sem_guidance is not None and len(sem_guidance) > i:
+                    noise_guidance_edit += sem_guidance[i].to(self.device)
+
+                elif enable_edit_guidance:
+                    if self.activation_mask is None:
+                        self.activation_mask = torch.zeros(
+                            (len(timesteps), len(noise_pred_edit_concepts), *noise_pred_edit_concepts[0].shape)
+                        )
+
+                    if self.sem_guidance is None:
+                        self.sem_guidance = torch.zeros((len(timesteps), *noise_pred_uncond.shape))
+
+                    for c, noise_pred_edit_concept in enumerate(noise_pred_edit_concepts):
+                        if isinstance(edit_warmup_steps, list):
+                            edit_warmup_steps_c = edit_warmup_steps[c]
+                        else:
+                            edit_warmup_steps_c = edit_warmup_steps
+                        if i < edit_warmup_steps_c:
+                            continue
+
+                        if isinstance(edit_guidance_scale, list):
+                            edit_guidance_scale_c = edit_guidance_scale[c]
+                        else:
+                            edit_guidance_scale_c = edit_guidance_scale
+
+                        if isinstance(edit_threshold, list):
+                            edit_threshold_c = edit_threshold[c]
+                        else:
+                            edit_threshold_c = edit_threshold
+                        if isinstance(reverse_editing_direction, list):
+                            reverse_editing_direction_c = reverse_editing_direction[c]
+                        else:
+                            reverse_editing_direction_c = reverse_editing_direction
+
+                        if isinstance(edit_cooldown_steps, list):
+                            edit_cooldown_steps_c = edit_cooldown_steps[c]
+                        elif edit_cooldown_steps is None:
+                            edit_cooldown_steps_c = i + 1
+                        else:
+                            edit_cooldown_steps_c = edit_cooldown_steps
+
+                        if i >= edit_cooldown_steps_c:
+                            continue
+
+                        noise_guidance_edit_tmp = noise_pred_edit_concept - noise_pred_uncond
+
+                        if reverse_editing_direction_c:
+                            noise_guidance_edit_tmp = noise_guidance_edit_tmp * -1
+
+                        noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c
+
+                        if user_mask is not None:
+                            noise_guidance_edit_tmp = noise_guidance_edit_tmp * user_mask
+
+                        if use_cross_attn_mask:
+                            out = self.attention_store.aggregate_attention(
+                                attention_maps=self.attention_store.step_store,
+                                prompts=self.text_cross_attention_maps,
+                                res=att_res,
+                                from_where=["up", "down"],
+                                is_cross=True,
+                                select=self.text_cross_attention_maps.index(editing_prompt[c]),
+                            )
+                            attn_map = out[:, :, :, 1 : 1 + num_edit_tokens[c]]  # 0 -> startoftext
+
+                            # average over all tokens
+                            if attn_map.shape[3] != num_edit_tokens[c]:
+                                raise ValueError(
+                                    f"Incorrect shape of attention_map. Expected size {num_edit_tokens[c]}, but found {attn_map.shape[3]}!"
+                                )
+
+                            attn_map = torch.sum(attn_map, dim=3)
+
+                            # gaussian_smoothing
+                            attn_map = F.pad(attn_map.unsqueeze(1), (1, 1, 1, 1), mode="reflect")
+                            attn_map = self.smoothing(attn_map).squeeze(1)
+
+                            # torch.quantile function expects float32
+                            if attn_map.dtype == torch.float32:
+                                tmp = torch.quantile(attn_map.flatten(start_dim=1), edit_threshold_c, dim=1)
+                            else:
+                                tmp = torch.quantile(
+                                    attn_map.flatten(start_dim=1).to(torch.float32), edit_threshold_c, dim=1
+                                ).to(attn_map.dtype)
+                            attn_mask = torch.where(
+                                attn_map >= tmp.unsqueeze(1).unsqueeze(1).repeat(1, *att_res), 1.0, 0.0
+                            )
+
+                            # resolution must match latent space dimension
+                            attn_mask = F.interpolate(
+                                attn_mask.unsqueeze(1),
+                                noise_guidance_edit_tmp.shape[-2:],  # 64,64
+                            ).repeat(1, 4, 1, 1)
+                            self.activation_mask[i, c] = attn_mask.detach().cpu()
+                            if not use_intersect_mask:
+                                noise_guidance_edit_tmp = noise_guidance_edit_tmp * attn_mask
+
+                        if use_intersect_mask:
+                            if t <= 800:
+                                noise_guidance_edit_tmp_quantile = torch.abs(noise_guidance_edit_tmp)
+                                noise_guidance_edit_tmp_quantile = torch.sum(
+                                    noise_guidance_edit_tmp_quantile, dim=1, keepdim=True
+                                )
+                                noise_guidance_edit_tmp_quantile = noise_guidance_edit_tmp_quantile.repeat(
+                                    1, self.unet.config.in_channels, 1, 1
+                                )
+
+                                # torch.quantile function expects float32
+                                if noise_guidance_edit_tmp_quantile.dtype == torch.float32:
+                                    tmp = torch.quantile(
+                                        noise_guidance_edit_tmp_quantile.flatten(start_dim=2),
+                                        edit_threshold_c,
+                                        dim=2,
+                                        keepdim=False,
+                                    )
+                                else:
+                                    tmp = torch.quantile(
+                                        noise_guidance_edit_tmp_quantile.flatten(start_dim=2).to(torch.float32),
+                                        edit_threshold_c,
+                                        dim=2,
+                                        keepdim=False,
+                                    ).to(noise_guidance_edit_tmp_quantile.dtype)
+
+                                intersect_mask = (
+                                    torch.where(
+                                        noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None],
+                                        torch.ones_like(noise_guidance_edit_tmp),
+                                        torch.zeros_like(noise_guidance_edit_tmp),
+                                    )
+                                    * attn_mask
+                                )
+
+                                self.activation_mask[i, c] = intersect_mask.detach().cpu()
+
+                                noise_guidance_edit_tmp = noise_guidance_edit_tmp * intersect_mask
+
+                            else:
+                                # print(f"only attention mask for step {i}")
+                                noise_guidance_edit_tmp = noise_guidance_edit_tmp * attn_mask
+
+                        elif not use_cross_attn_mask:
+                            # calculate quantile
+                            noise_guidance_edit_tmp_quantile = torch.abs(noise_guidance_edit_tmp)
+                            noise_guidance_edit_tmp_quantile = torch.sum(
+                                noise_guidance_edit_tmp_quantile, dim=1, keepdim=True
+                            )
+                            noise_guidance_edit_tmp_quantile = noise_guidance_edit_tmp_quantile.repeat(1, 4, 1, 1)
+
+                            # torch.quantile function expects float32
+                            if noise_guidance_edit_tmp_quantile.dtype == torch.float32:
+                                tmp = torch.quantile(
+                                    noise_guidance_edit_tmp_quantile.flatten(start_dim=2),
+                                    edit_threshold_c,
+                                    dim=2,
+                                    keepdim=False,
+                                )
+                            else:
+                                tmp = torch.quantile(
+                                    noise_guidance_edit_tmp_quantile.flatten(start_dim=2).to(torch.float32),
+                                    edit_threshold_c,
+                                    dim=2,
+                                    keepdim=False,
+                                ).to(noise_guidance_edit_tmp_quantile.dtype)
+
+                            self.activation_mask[i, c] = (
+                                torch.where(
+                                    noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None],
+                                    torch.ones_like(noise_guidance_edit_tmp),
+                                    torch.zeros_like(noise_guidance_edit_tmp),
+                                )
+                                .detach()
+                                .cpu()
+                            )
+
+                            noise_guidance_edit_tmp = torch.where(
+                                noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None],
+                                noise_guidance_edit_tmp,
+                                torch.zeros_like(noise_guidance_edit_tmp),
+                            )
+
+                        noise_guidance_edit += noise_guidance_edit_tmp
+
+                    self.sem_guidance[i] = noise_guidance_edit.detach().cpu()
+
+                noise_pred = noise_pred_uncond + noise_guidance_edit
+
+                if enable_edit_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(
+                        noise_pred,
+                        noise_pred_edit_concepts.mean(dim=0, keepdim=False),
+                        guidance_rescale=self.guidance_rescale,
+                    )
+
+                idx = t_to_idx[int(t)]
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, variance_noise=zs[idx], **extra_step_kwargs
+                ).prev_sample
+
+                # step callback
+                if use_cross_attn_mask:
+                    store_step = i in attn_store_steps
+                    self.attention_store.between_steps(store_step)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    # prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        # 8. Post-processing
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return LEditsPPDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    @torch.no_grad()
+    def invert(
+        self,
+        image: PipelineImageInput,
+        source_prompt: str = "",
+        source_guidance_scale: float = 3.5,
+        num_inversion_steps: int = 30,
+        skip: float = 0.15,
+        generator: Optional[torch.Generator] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        resize_mode: Optional[str] = "default",
+        crops_coords: Optional[Tuple[int, int, int, int]] = None,
+    ):
+        r"""
+        The function to the pipeline for image inversion as described by the [LEDITS++
+        Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
+        inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect
+                ratio.
+            source_prompt (`str`, defaults to `""`):
+                Prompt describing the input image that will be used for guidance during inversion. Guidance is disabled
+                if the `source_prompt` is `""`.
+            source_guidance_scale (`float`, defaults to `3.5`):
+                Strength of guidance during inversion.
+            num_inversion_steps (`int`, defaults to `30`):
+                Number of total performed inversion steps after discarding the initial `skip` steps.
+            skip (`float`, defaults to `0.15`):
+                Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
+                will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
+                deterministic.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            height (`int`, *optional*, defaults to `None`):
+                The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
+                height.
+            width (`int`, *optional*`, defaults to `None`):
+                The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
+            resize_mode (`str`, *optional*, defaults to `default`):
+                The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
+                the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will
+                resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
+                center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
+                image to fit within the specified width and height, maintaining the aspect ratio, and then center the
+                image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
+                supported for PIL image input.
+            crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
+                The crop coordinates for each image in the batch. If `None`, will not crop the image.
+
+        Returns:
+            [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
+            and respective VAE reconstruction(s).
+        """
+        # Reset attn processor, we do not want to store attn maps during inversion
+        self.unet.set_attn_processor(AttnProcessor())
+
+        self.eta = 1.0
+
+        self.scheduler.config.timestep_spacing = "leading"
+        self.scheduler.set_timesteps(int(num_inversion_steps * (1 + skip)))
+        self.inversion_steps = self.scheduler.timesteps[-num_inversion_steps:]
+        timesteps = self.inversion_steps
+
+        # 1. encode image
+        x0, resized = self.encode_image(
+            image,
+            dtype=self.text_encoder.dtype,
+            height=height,
+            width=width,
+            resize_mode=resize_mode,
+            crops_coords=crops_coords,
+        )
+        self.batch_size = x0.shape[0]
+
+        # autoencoder reconstruction
+        image_rec = self.vae.decode(x0 / self.vae.config.scaling_factor, return_dict=False, generator=generator)[0]
+        image_rec = self.image_processor.postprocess(image_rec, output_type="pil")
+
+        # 2. get embeddings
+        do_classifier_free_guidance = source_guidance_scale > 1.0
+
+        lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+
+        uncond_embedding, text_embeddings, _ = self.encode_prompt(
+            num_images_per_prompt=1,
+            device=self.device,
+            negative_prompt=None,
+            enable_edit_guidance=do_classifier_free_guidance,
+            editing_prompt=source_prompt,
+            lora_scale=lora_scale,
+            clip_skip=clip_skip,
+        )
+
+        # 3. find zs and xts
+        variance_noise_shape = (num_inversion_steps, *x0.shape)
+
+        # intermediate latents
+        t_to_idx = {int(v): k for k, v in enumerate(timesteps)}
+        xts = torch.zeros(size=variance_noise_shape, device=self.device, dtype=uncond_embedding.dtype)
+
+        for t in reversed(timesteps):
+            idx = num_inversion_steps - t_to_idx[int(t)] - 1
+            noise = randn_tensor(shape=x0.shape, generator=generator, device=self.device, dtype=x0.dtype)
+            xts[idx] = self.scheduler.add_noise(x0, noise, torch.Tensor([t]))
+        xts = torch.cat([x0.unsqueeze(0), xts], dim=0)
+
+        self.scheduler.set_timesteps(len(self.scheduler.timesteps))
+        # noise maps
+        zs = torch.zeros(size=variance_noise_shape, device=self.device, dtype=uncond_embedding.dtype)
+
+        with self.progress_bar(total=len(timesteps)) as progress_bar:
+            for t in timesteps:
+                idx = num_inversion_steps - t_to_idx[int(t)] - 1
+                # 1. predict noise residual
+                xt = xts[idx + 1]
+
+                noise_pred = self.unet(xt, timestep=t, encoder_hidden_states=uncond_embedding).sample
+
+                if not source_prompt == "":
+                    noise_pred_cond = self.unet(xt, timestep=t, encoder_hidden_states=text_embeddings).sample
+                    noise_pred = noise_pred + source_guidance_scale * (noise_pred_cond - noise_pred)
+
+                xtm1 = xts[idx]
+                z, xtm1_corrected = compute_noise(self.scheduler, xtm1, xt, t, noise_pred, self.eta)
+                zs[idx] = z
+
+                # correction to avoid error accumulation
+                xts[idx] = xtm1_corrected
+
+                progress_bar.update()
+
+        self.init_latents = xts[-1].expand(self.batch_size, -1, -1, -1)
+        zs = zs.flip(0)
+        self.zs = zs
+
+        return LEditsPPInversionPipelineOutput(images=resized, vae_reconstruction_images=image_rec)
+
+    @torch.no_grad()
+    def encode_image(self, image, dtype=None, height=None, width=None, resize_mode="default", crops_coords=None):
+        image = self.image_processor.preprocess(
+            image=image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+        resized = self.image_processor.postprocess(image=image, output_type="pil")
+
+        if max(image.shape[-2:]) > self.vae.config["sample_size"] * 1.5:
+            logger.warning(
+                "Your input images far exceed the default resolution of the underlying diffusion model. "
+                "The output images may contain severe artifacts! "
+                "Consider down-sampling the input using the `height` and `width` parameters"
+            )
+        image = image.to(dtype)
+
+        x0 = self.vae.encode(image.to(self.device)).latent_dist.mode()
+        x0 = x0.to(dtype)
+        x0 = self.vae.config.scaling_factor * x0
+        return x0, resized
+
+
+def compute_noise_ddim(scheduler, prev_latents, latents, timestep, noise_pred, eta):
+    # 1. get previous step value (=t-1)
+    prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
+
+    # 2. compute alphas, betas
+    alpha_prod_t = scheduler.alphas_cumprod[timestep]
+    alpha_prod_t_prev = (
+        scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+    )
+
+    beta_prod_t = 1 - alpha_prod_t
+
+    # 3. compute predicted original sample from predicted noise also called
+    # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+    # 4. Clip "predicted x_0"
+    if scheduler.config.clip_sample:
+        pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+    # 5. compute variance: "sigma_t(η)" -> see formula (16)
+    # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+    variance = scheduler._get_variance(timestep, prev_timestep)
+    std_dev_t = eta * variance ** (0.5)
+
+    # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred
+
+    # modifed so that updated xtm1 is returned as well (to avoid error accumulation)
+    mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+    if variance > 0.0:
+        noise = (prev_latents - mu_xt) / (variance ** (0.5) * eta)
+    else:
+        noise = torch.tensor([0.0]).to(latents.device)
+
+    return noise, mu_xt + (eta * variance**0.5) * noise
+
+
+def compute_noise_sde_dpm_pp_2nd(scheduler, prev_latents, latents, timestep, noise_pred, eta):
+    def first_order_update(model_output, sample):  # timestep, prev_timestep, sample):
+        sigma_t, sigma_s = scheduler.sigmas[scheduler.step_index + 1], scheduler.sigmas[scheduler.step_index]
+        alpha_t, sigma_t = scheduler._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s, sigma_s = scheduler._sigma_to_alpha_sigma_t(sigma_s)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
+
+        h = lambda_t - lambda_s
+
+        mu_xt = (sigma_t / sigma_s * torch.exp(-h)) * sample + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
+
+        mu_xt = scheduler.dpm_solver_first_order_update(
+            model_output=model_output, sample=sample, noise=torch.zeros_like(sample)
+        )
+
+        sigma = sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h))
+        if sigma > 0.0:
+            noise = (prev_latents - mu_xt) / sigma
+        else:
+            noise = torch.tensor([0.0]).to(sample.device)
+
+        prev_sample = mu_xt + sigma * noise
+        return noise, prev_sample
+
+    def second_order_update(model_output_list, sample):  # timestep_list, prev_timestep, sample):
+        sigma_t, sigma_s0, sigma_s1 = (
+            scheduler.sigmas[scheduler.step_index + 1],
+            scheduler.sigmas[scheduler.step_index],
+            scheduler.sigmas[scheduler.step_index - 1],
+        )
+
+        alpha_t, sigma_t = scheduler._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = scheduler._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = scheduler._sigma_to_alpha_sigma_t(sigma_s1)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+
+        mu_xt = (
+            (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+            + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+            + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
+        )
+
+        sigma = sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h))
+        if sigma > 0.0:
+            noise = (prev_latents - mu_xt) / sigma
+        else:
+            noise = torch.tensor([0.0]).to(sample.device)
+
+        prev_sample = mu_xt + sigma * noise
+
+        return noise, prev_sample
+
+    if scheduler.step_index is None:
+        scheduler._init_step_index(timestep)
+
+    model_output = scheduler.convert_model_output(model_output=noise_pred, sample=latents)
+    for i in range(scheduler.config.solver_order - 1):
+        scheduler.model_outputs[i] = scheduler.model_outputs[i + 1]
+    scheduler.model_outputs[-1] = model_output
+
+    if scheduler.lower_order_nums < 1:
+        noise, prev_sample = first_order_update(model_output, latents)
+    else:
+        noise, prev_sample = second_order_update(scheduler.model_outputs, latents)
+
+    if scheduler.lower_order_nums < scheduler.config.solver_order:
+        scheduler.lower_order_nums += 1
+
+    # upon completion increase step index by one
+    scheduler._step_index += 1
+
+    return noise, prev_sample
+
+
+def compute_noise(scheduler, *args):
+    if isinstance(scheduler, DDIMScheduler):
+        return compute_noise_ddim(scheduler, *args)
+    elif (
+        isinstance(scheduler, DPMSolverMultistepScheduler)
+        and scheduler.config.algorithm_type == "sde-dpmsolver++"
+        and scheduler.config.solver_order == 2
+    ):
+        return compute_noise_sde_dpm_pp_2nd(scheduler, *args)
+    else:
+        raise NotImplementedError
diff --git a/diffusers/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/diffusers/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..995cc15f3f933e74c223fc43aaca38260164c6d5
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
@@ -0,0 +1,1794 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention_processor import (
+    Attention,
+    AttnProcessor,
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import DDIMScheduler, DPMSolverMultistepScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import LEditsPPDiffusionPipelineOutput, LEditsPPInversionPipelineOutput
+
+
+if is_invisible_watermark_available():
+    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> import PIL
+        >>> import requests
+        >>> from io import BytesIO
+
+        >>> from diffusers import LEditsPPPipelineStableDiffusionXL
+
+        >>> pipe = LEditsPPPipelineStableDiffusionXL.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+        >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg"
+        >>> image = download_image(img_url)
+
+        >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.2)
+
+        >>> edited_image = pipe(
+        ...     editing_prompt=["tennis ball", "tomato"],
+        ...     reverse_editing_direction=[True, False],
+        ...     edit_guidance_scale=[5.0, 10.0],
+        ...     edit_threshold=[0.9, 0.85],
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LeditsAttentionStore
+class LeditsAttentionStore:
+    @staticmethod
+    def get_empty_store():
+        return {"down_cross": [], "mid_cross": [], "up_cross": [], "down_self": [], "mid_self": [], "up_self": []}
+
+    def __call__(self, attn, is_cross: bool, place_in_unet: str, editing_prompts, PnP=False):
+        # attn.shape = batch_size * head_size, seq_len query, seq_len_key
+        if attn.shape[1] <= self.max_size:
+            bs = 1 + int(PnP) + editing_prompts
+            skip = 2 if PnP else 1  # skip PnP & unconditional
+            attn = torch.stack(attn.split(self.batch_size)).permute(1, 0, 2, 3)
+            source_batch_size = int(attn.shape[1] // bs)
+            self.forward(attn[:, skip * source_batch_size :], is_cross, place_in_unet)
+
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
+
+        self.step_store[key].append(attn)
+
+    def between_steps(self, store_step=True):
+        if store_step:
+            if self.average:
+                if len(self.attention_store) == 0:
+                    self.attention_store = self.step_store
+                else:
+                    for key in self.attention_store:
+                        for i in range(len(self.attention_store[key])):
+                            self.attention_store[key][i] += self.step_store[key][i]
+            else:
+                if len(self.attention_store) == 0:
+                    self.attention_store = [self.step_store]
+                else:
+                    self.attention_store.append(self.step_store)
+
+            self.cur_step += 1
+        self.step_store = self.get_empty_store()
+
+    def get_attention(self, step: int):
+        if self.average:
+            attention = {
+                key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store
+            }
+        else:
+            assert step is not None
+            attention = self.attention_store[step]
+        return attention
+
+    def aggregate_attention(
+        self, attention_maps, prompts, res: Union[int, Tuple[int]], from_where: List[str], is_cross: bool, select: int
+    ):
+        out = [[] for x in range(self.batch_size)]
+        if isinstance(res, int):
+            num_pixels = res**2
+            resolution = (res, res)
+        else:
+            num_pixels = res[0] * res[1]
+            resolution = res[:2]
+
+        for location in from_where:
+            for bs_item in attention_maps[f"{location}_{'cross' if is_cross else 'self'}"]:
+                for batch, item in enumerate(bs_item):
+                    if item.shape[1] == num_pixels:
+                        cross_maps = item.reshape(len(prompts), -1, *resolution, item.shape[-1])[select]
+                        out[batch].append(cross_maps)
+
+        out = torch.stack([torch.cat(x, dim=0) for x in out])
+        # average over heads
+        out = out.sum(1) / out.shape[1]
+        return out
+
+    def __init__(self, average: bool, batch_size=1, max_resolution=16, max_size: int = None):
+        self.step_store = self.get_empty_store()
+        self.attention_store = []
+        self.cur_step = 0
+        self.average = average
+        self.batch_size = batch_size
+        if max_size is None:
+            self.max_size = max_resolution**2
+        elif max_size is not None and max_resolution is None:
+            self.max_size = max_size
+        else:
+            raise ValueError("Only allowed to set one of max_resolution or max_size")
+
+
+# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LeditsGaussianSmoothing
+class LeditsGaussianSmoothing:
+    def __init__(self, device):
+        kernel_size = [3, 3]
+        sigma = [0.5, 0.5]
+
+        # The gaussian kernel is the product of the gaussian function of each dimension.
+        kernel = 1
+        meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
+        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
+            mean = (size - 1) / 2
+            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
+
+        # Make sure sum of values in gaussian kernel equals 1.
+        kernel = kernel / torch.sum(kernel)
+
+        # Reshape to depthwise convolutional weight
+        kernel = kernel.view(1, 1, *kernel.size())
+        kernel = kernel.repeat(1, *[1] * (kernel.dim() - 1))
+
+        self.weight = kernel.to(device)
+
+    def __call__(self, input):
+        """
+        Arguments:
+        Apply gaussian filter to input.
+            input (torch.Tensor): Input to apply gaussian filter on.
+        Returns:
+            filtered (torch.Tensor): Filtered output.
+        """
+        return F.conv2d(input, weight=self.weight.to(input.dtype))
+
+
+# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LEDITSCrossAttnProcessor
+class LEDITSCrossAttnProcessor:
+    def __init__(self, attention_store, place_in_unet, pnp, editing_prompts):
+        self.attnstore = attention_store
+        self.place_in_unet = place_in_unet
+        self.editing_prompts = editing_prompts
+        self.pnp = pnp
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states,
+        attention_mask=None,
+        temb=None,
+    ):
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        self.attnstore(
+            attention_probs,
+            is_cross=True,
+            place_in_unet=self.place_in_unet,
+            editing_prompts=self.editing_prompts,
+            PnP=self.pnp,
+        )
+
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+
+
+class LEditsPPPipelineStableDiffusionXL(
+    DiffusionPipeline,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+):
+    """
+    Pipeline for textual image editing using LEDits++ with Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the
+    superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a
+    particular device, etc.).
+
+    In addition the pipeline inherits the following loading methods:
+        - *LoRA*: [`LEditsPPPipelineStableDiffusionXL.load_lora_weights`]
+        - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
+
+    as well as the following saving methods:
+        - *LoRA*: [`loaders.StableDiffusionXLPipeline.save_lora_weights`]
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 ([`~transformers.CLIPTokenizer`]):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
+            automatically be set to [`DPMSolverMultistepScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DPMSolverMultistepScheduler, DDIMScheduler],
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        if not isinstance(scheduler, DDIMScheduler) and not isinstance(scheduler, DPMSolverMultistepScheduler):
+            self.scheduler = DPMSolverMultistepScheduler.from_config(
+                scheduler.config, algorithm_type="sde-dpmsolver++", solver_order=2
+            )
+            logger.warning(
+                "This pipeline only supports DDIMScheduler and DPMSolverMultistepScheduler. "
+                "The scheduler has been changed to DPMSolverMultistepScheduler."
+            )
+
+        self.default_sample_size = self.unet.config.sample_size
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+        self.inversion_steps = None
+
+    def encode_prompt(
+        self,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+        enable_edit_guidance: bool = True,
+        editing_prompt: Optional[str] = None,
+        editing_prompt_embeds: Optional[torch.Tensor] = None,
+        editing_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+    ) -> object:
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead.
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            enable_edit_guidance (`bool`):
+                Whether to guide towards an editing prompt or not.
+            editing_prompt (`str` or `List[str]`, *optional*):
+                Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass
+                `editing_prompt_embeds` instead.
+            editing_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from
+                `editing_prompt` input argument.
+            editing_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt`
+                input argument.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        batch_size = self.batch_size
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+        num_edit_tokens = 0
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+
+        if negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+
+            if batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but image inversion "
+                    f" has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of the input images."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+            if zero_out_negative_prompt:
+                negative_prompt_embeds = torch.zeros_like(negative_prompt_embeds)
+                negative_pooled_prompt_embeds = torch.zeros_like(negative_pooled_prompt_embeds)
+
+        if enable_edit_guidance and editing_prompt_embeds is None:
+            editing_prompt_2 = editing_prompt
+
+            editing_prompts = [editing_prompt, editing_prompt_2]
+            edit_prompt_embeds_list = []
+
+            for editing_prompt, tokenizer, text_encoder in zip(editing_prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    editing_prompt = self.maybe_convert_prompt(editing_prompt, tokenizer)
+
+                max_length = negative_prompt_embeds.shape[1]
+                edit_concepts_input = tokenizer(
+                    # [x for item in editing_prompt for x in repeat(item, batch_size)],
+                    editing_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                    return_length=True,
+                )
+                num_edit_tokens = edit_concepts_input.length - 2
+
+                edit_concepts_embeds = text_encoder(
+                    edit_concepts_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                editing_pooled_prompt_embeds = edit_concepts_embeds[0]
+                if clip_skip is None:
+                    edit_concepts_embeds = edit_concepts_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    edit_concepts_embeds = edit_concepts_embeds.hidden_states[-(clip_skip + 2)]
+
+                edit_prompt_embeds_list.append(edit_concepts_embeds)
+
+            edit_concepts_embeds = torch.concat(edit_prompt_embeds_list, dim=-1)
+        elif not enable_edit_guidance:
+            edit_concepts_embeds = None
+            editing_pooled_prompt_embeds = None
+
+        negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        bs_embed, seq_len, _ = negative_prompt_embeds.shape
+        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+        seq_len = negative_prompt_embeds.shape[1]
+        negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if enable_edit_guidance:
+            bs_embed_edit, seq_len, _ = edit_concepts_embeds.shape
+            edit_concepts_embeds = edit_concepts_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            edit_concepts_embeds = edit_concepts_embeds.repeat(1, num_images_per_prompt, 1)
+            edit_concepts_embeds = edit_concepts_embeds.view(bs_embed_edit * num_images_per_prompt, seq_len, -1)
+
+        negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+
+        if enable_edit_guidance:
+            editing_pooled_prompt_embeds = editing_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed_edit * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return (
+            negative_prompt_embeds,
+            edit_concepts_embeds,
+            negative_pooled_prompt_embeds,
+            editing_pooled_prompt_embeds,
+            num_edit_tokens,
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, eta, generator=None):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        negative_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+    ):
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+    # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, device, latents):
+        latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    # Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LEditsPPPipelineStableDiffusion.prepare_unet
+    def prepare_unet(self, attention_store, PnP: bool = False):
+        attn_procs = {}
+        for name in self.unet.attn_processors.keys():
+            if name.startswith("mid_block"):
+                place_in_unet = "mid"
+            elif name.startswith("up_blocks"):
+                place_in_unet = "up"
+            elif name.startswith("down_blocks"):
+                place_in_unet = "down"
+            else:
+                continue
+
+            if "attn2" in name and place_in_unet != "mid":
+                attn_procs[name] = LEDITSCrossAttnProcessor(
+                    attention_store=attention_store,
+                    place_in_unet=place_in_unet,
+                    pnp=PnP,
+                    editing_prompts=self.enabled_editing_prompts,
+                )
+            else:
+                attn_procs[name] = AttnProcessor()
+
+        self.unet.set_attn_processor(attn_procs)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        denoising_end: Optional[float] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        editing_prompt: Optional[Union[str, List[str]]] = None,
+        editing_prompt_embeddings: Optional[torch.Tensor] = None,
+        editing_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        reverse_editing_direction: Optional[Union[bool, List[bool]]] = False,
+        edit_guidance_scale: Optional[Union[float, List[float]]] = 5,
+        edit_warmup_steps: Optional[Union[int, List[int]]] = 0,
+        edit_cooldown_steps: Optional[Union[int, List[int]]] = None,
+        edit_threshold: Optional[Union[float, List[float]]] = 0.9,
+        sem_guidance: Optional[List[torch.Tensor]] = None,
+        use_cross_attn_mask: bool = False,
+        use_intersect_mask: bool = False,
+        user_mask: Optional[torch.Tensor] = None,
+        attn_store_steps: Optional[List[int]] = [],
+        store_averaged_over_steps: bool = True,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for editing. The
+        [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] method has to be called beforehand. Edits
+        will always be performed for the last inverted image(s).
+
+        Args:
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.7):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            editing_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. The image is reconstructed by setting
+                `editing_prompt = None`. Guidance direction of prompt should be specified via
+                `reverse_editing_direction`.
+            editing_prompt_embeddings (`torch.Tensor`, *optional*):
+                Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input argument.
+            editing_pooled_prompt_embeddings (`torch.Tensor`, *optional*):
+                Pre-generated pooled edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
+                argument.
+            reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
+                Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
+            edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
+                Guidance scale for guiding the image generation. If provided as list values should correspond to
+                `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
+                Paper](https://arxiv.org/abs/2301.12247).
+            edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
+                Number of diffusion steps (for each prompt) for which guidance is not applied.
+            edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
+                Number of diffusion steps (for each prompt) after which guidance is no longer applied.
+            edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
+                Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
+                'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
+                Paper](https://arxiv.org/abs/2301.12247).
+            sem_guidance (`List[torch.Tensor]`, *optional*):
+                List of pre-generated guidance vectors to be applied at generation. Length of the list has to
+                correspond to `num_inference_steps`.
+            use_cross_attn_mask:
+                Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
+                is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
+                paper](https://arxiv.org/pdf/2311.16711.pdf).
+            use_intersect_mask:
+                Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
+                the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
+                are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+            user_mask:
+                User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
+                implicit masks do not meet user preferences.
+            attn_store_steps:
+                Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
+            store_averaged_over_steps:
+                Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
+                False, attention maps for each step are stores separately. Just for visualization purposes.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images.
+        """
+        if self.inversion_steps is None:
+            raise ValueError(
+                "You need to invert an input image first before calling the pipeline. The `invert` method has to be called beforehand. Edits will always be performed for the last inverted image(s)."
+            )
+
+        eta = self.eta
+        num_images_per_prompt = 1
+        latents = self.init_latents
+
+        zs = self.zs
+        self.scheduler.set_timesteps(len(self.scheduler.timesteps))
+
+        if use_intersect_mask:
+            use_cross_attn_mask = True
+
+        if use_cross_attn_mask:
+            self.smoothing = LeditsGaussianSmoothing(self.device)
+
+        if user_mask is not None:
+            user_mask = user_mask.to(self.device)
+
+        # TODO: Check inputs
+        # 1. Check inputs. Raise error if not correct
+        # self.check_inputs(
+        #    callback_steps,
+        #    negative_prompt,
+        #    negative_prompt_2,
+        #    prompt_embeds,
+        #    negative_prompt_embeds,
+        #    pooled_prompt_embeds,
+        #    negative_pooled_prompt_embeds,
+        # )
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+
+        # 2. Define call parameters
+        batch_size = self.batch_size
+
+        device = self._execution_device
+
+        if editing_prompt:
+            enable_edit_guidance = True
+            if isinstance(editing_prompt, str):
+                editing_prompt = [editing_prompt]
+            self.enabled_editing_prompts = len(editing_prompt)
+        elif editing_prompt_embeddings is not None:
+            enable_edit_guidance = True
+            self.enabled_editing_prompts = editing_prompt_embeddings.shape[0]
+        else:
+            self.enabled_editing_prompts = 0
+            enable_edit_guidance = False
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            edit_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            pooled_edit_embeds,
+            num_edit_tokens,
+        ) = self.encode_prompt(
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+            enable_edit_guidance=enable_edit_guidance,
+            editing_prompt=editing_prompt,
+            editing_prompt_embeds=editing_prompt_embeddings,
+            editing_pooled_prompt_embeds=editing_pooled_prompt_embeds,
+        )
+
+        # 4. Prepare timesteps
+        # self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+        timesteps = self.inversion_steps
+        t_to_idx = {int(v): k for k, v in enumerate(timesteps)}
+
+        if use_cross_attn_mask:
+            self.attention_store = LeditsAttentionStore(
+                average=store_averaged_over_steps,
+                batch_size=batch_size,
+                max_size=(latents.shape[-2] / 4.0) * (latents.shape[-1] / 4.0),
+                max_resolution=None,
+            )
+            self.prepare_unet(self.attention_store)
+            resolution = latents.shape[-2:]
+            att_res = (int(resolution[0] / 4), int(resolution[1] / 4))
+
+        # 5. Prepare latent variables
+        latents = self.prepare_latents(device=device, latents=latents)
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(eta)
+
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(negative_pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = negative_pooled_prompt_embeds
+        add_time_ids = self._get_add_time_ids(
+            self.size,
+            crops_coords_top_left,
+            self.size,
+            dtype=negative_pooled_prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+
+        if enable_edit_guidance:
+            prompt_embeds = torch.cat([prompt_embeds, edit_prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([add_text_embeds, pooled_edit_embeds], dim=0)
+            edit_concepts_time_ids = add_time_ids.repeat(edit_prompt_embeds.shape[0], 1)
+            add_time_ids = torch.cat([add_time_ids, edit_concepts_time_ids], dim=0)
+            self.text_cross_attention_maps = [editing_prompt] if isinstance(editing_prompt, str) else editing_prompt
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None:
+            # TODO: fix image encoding
+            image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+                image_embeds = image_embeds.to(device)
+
+        # 8. Denoising loop
+        self.sem_guidance = None
+        self.activation_mask = None
+
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 9. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=self._num_timesteps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * (1 + self.enabled_editing_prompts))
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                noise_pred_out = noise_pred.chunk(1 + self.enabled_editing_prompts)  # [b,4, 64, 64]
+                noise_pred_uncond = noise_pred_out[0]
+                noise_pred_edit_concepts = noise_pred_out[1:]
+
+                noise_guidance_edit = torch.zeros(
+                    noise_pred_uncond.shape,
+                    device=self.device,
+                    dtype=noise_pred_uncond.dtype,
+                )
+
+                if sem_guidance is not None and len(sem_guidance) > i:
+                    noise_guidance_edit += sem_guidance[i].to(self.device)
+
+                elif enable_edit_guidance:
+                    if self.activation_mask is None:
+                        self.activation_mask = torch.zeros(
+                            (len(timesteps), self.enabled_editing_prompts, *noise_pred_edit_concepts[0].shape)
+                        )
+                    if self.sem_guidance is None:
+                        self.sem_guidance = torch.zeros((len(timesteps), *noise_pred_uncond.shape))
+
+                    # noise_guidance_edit = torch.zeros_like(noise_guidance)
+                    for c, noise_pred_edit_concept in enumerate(noise_pred_edit_concepts):
+                        if isinstance(edit_warmup_steps, list):
+                            edit_warmup_steps_c = edit_warmup_steps[c]
+                        else:
+                            edit_warmup_steps_c = edit_warmup_steps
+                        if i < edit_warmup_steps_c:
+                            continue
+
+                        if isinstance(edit_guidance_scale, list):
+                            edit_guidance_scale_c = edit_guidance_scale[c]
+                        else:
+                            edit_guidance_scale_c = edit_guidance_scale
+
+                        if isinstance(edit_threshold, list):
+                            edit_threshold_c = edit_threshold[c]
+                        else:
+                            edit_threshold_c = edit_threshold
+                        if isinstance(reverse_editing_direction, list):
+                            reverse_editing_direction_c = reverse_editing_direction[c]
+                        else:
+                            reverse_editing_direction_c = reverse_editing_direction
+
+                        if isinstance(edit_cooldown_steps, list):
+                            edit_cooldown_steps_c = edit_cooldown_steps[c]
+                        elif edit_cooldown_steps is None:
+                            edit_cooldown_steps_c = i + 1
+                        else:
+                            edit_cooldown_steps_c = edit_cooldown_steps
+
+                        if i >= edit_cooldown_steps_c:
+                            continue
+
+                        noise_guidance_edit_tmp = noise_pred_edit_concept - noise_pred_uncond
+
+                        if reverse_editing_direction_c:
+                            noise_guidance_edit_tmp = noise_guidance_edit_tmp * -1
+
+                        noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c
+
+                        if user_mask is not None:
+                            noise_guidance_edit_tmp = noise_guidance_edit_tmp * user_mask
+
+                        if use_cross_attn_mask:
+                            out = self.attention_store.aggregate_attention(
+                                attention_maps=self.attention_store.step_store,
+                                prompts=self.text_cross_attention_maps,
+                                res=att_res,
+                                from_where=["up", "down"],
+                                is_cross=True,
+                                select=self.text_cross_attention_maps.index(editing_prompt[c]),
+                            )
+                            attn_map = out[:, :, :, 1 : 1 + num_edit_tokens[c]]  # 0 -> startoftext
+
+                            # average over all tokens
+                            if attn_map.shape[3] != num_edit_tokens[c]:
+                                raise ValueError(
+                                    f"Incorrect shape of attention_map. Expected size {num_edit_tokens[c]}, but found {attn_map.shape[3]}!"
+                                )
+                            attn_map = torch.sum(attn_map, dim=3)
+
+                            # gaussian_smoothing
+                            attn_map = F.pad(attn_map.unsqueeze(1), (1, 1, 1, 1), mode="reflect")
+                            attn_map = self.smoothing(attn_map).squeeze(1)
+
+                            # torch.quantile function expects float32
+                            if attn_map.dtype == torch.float32:
+                                tmp = torch.quantile(attn_map.flatten(start_dim=1), edit_threshold_c, dim=1)
+                            else:
+                                tmp = torch.quantile(
+                                    attn_map.flatten(start_dim=1).to(torch.float32), edit_threshold_c, dim=1
+                                ).to(attn_map.dtype)
+                            attn_mask = torch.where(
+                                attn_map >= tmp.unsqueeze(1).unsqueeze(1).repeat(1, *att_res), 1.0, 0.0
+                            )
+
+                            # resolution must match latent space dimension
+                            attn_mask = F.interpolate(
+                                attn_mask.unsqueeze(1),
+                                noise_guidance_edit_tmp.shape[-2:],  # 64,64
+                            ).repeat(1, 4, 1, 1)
+                            self.activation_mask[i, c] = attn_mask.detach().cpu()
+                            if not use_intersect_mask:
+                                noise_guidance_edit_tmp = noise_guidance_edit_tmp * attn_mask
+
+                        if use_intersect_mask:
+                            noise_guidance_edit_tmp_quantile = torch.abs(noise_guidance_edit_tmp)
+                            noise_guidance_edit_tmp_quantile = torch.sum(
+                                noise_guidance_edit_tmp_quantile, dim=1, keepdim=True
+                            )
+                            noise_guidance_edit_tmp_quantile = noise_guidance_edit_tmp_quantile.repeat(
+                                1, self.unet.config.in_channels, 1, 1
+                            )
+
+                            # torch.quantile function expects float32
+                            if noise_guidance_edit_tmp_quantile.dtype == torch.float32:
+                                tmp = torch.quantile(
+                                    noise_guidance_edit_tmp_quantile.flatten(start_dim=2),
+                                    edit_threshold_c,
+                                    dim=2,
+                                    keepdim=False,
+                                )
+                            else:
+                                tmp = torch.quantile(
+                                    noise_guidance_edit_tmp_quantile.flatten(start_dim=2).to(torch.float32),
+                                    edit_threshold_c,
+                                    dim=2,
+                                    keepdim=False,
+                                ).to(noise_guidance_edit_tmp_quantile.dtype)
+
+                            intersect_mask = (
+                                torch.where(
+                                    noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None],
+                                    torch.ones_like(noise_guidance_edit_tmp),
+                                    torch.zeros_like(noise_guidance_edit_tmp),
+                                )
+                                * attn_mask
+                            )
+
+                            self.activation_mask[i, c] = intersect_mask.detach().cpu()
+
+                            noise_guidance_edit_tmp = noise_guidance_edit_tmp * intersect_mask
+
+                        elif not use_cross_attn_mask:
+                            # calculate quantile
+                            noise_guidance_edit_tmp_quantile = torch.abs(noise_guidance_edit_tmp)
+                            noise_guidance_edit_tmp_quantile = torch.sum(
+                                noise_guidance_edit_tmp_quantile, dim=1, keepdim=True
+                            )
+                            noise_guidance_edit_tmp_quantile = noise_guidance_edit_tmp_quantile.repeat(1, 4, 1, 1)
+
+                            # torch.quantile function expects float32
+                            if noise_guidance_edit_tmp_quantile.dtype == torch.float32:
+                                tmp = torch.quantile(
+                                    noise_guidance_edit_tmp_quantile.flatten(start_dim=2),
+                                    edit_threshold_c,
+                                    dim=2,
+                                    keepdim=False,
+                                )
+                            else:
+                                tmp = torch.quantile(
+                                    noise_guidance_edit_tmp_quantile.flatten(start_dim=2).to(torch.float32),
+                                    edit_threshold_c,
+                                    dim=2,
+                                    keepdim=False,
+                                ).to(noise_guidance_edit_tmp_quantile.dtype)
+
+                            self.activation_mask[i, c] = (
+                                torch.where(
+                                    noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None],
+                                    torch.ones_like(noise_guidance_edit_tmp),
+                                    torch.zeros_like(noise_guidance_edit_tmp),
+                                )
+                                .detach()
+                                .cpu()
+                            )
+
+                            noise_guidance_edit_tmp = torch.where(
+                                noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None],
+                                noise_guidance_edit_tmp,
+                                torch.zeros_like(noise_guidance_edit_tmp),
+                            )
+
+                        noise_guidance_edit += noise_guidance_edit_tmp
+
+                    self.sem_guidance[i] = noise_guidance_edit.detach().cpu()
+
+                noise_pred = noise_pred_uncond + noise_guidance_edit
+
+                # compute the previous noisy sample x_t -> x_t-1
+                if enable_edit_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(
+                        noise_pred,
+                        noise_pred_edit_concepts.mean(dim=0, keepdim=False),
+                        guidance_rescale=self.guidance_rescale,
+                    )
+
+                idx = t_to_idx[int(t)]
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, variance_noise=zs[idx], **extra_step_kwargs, return_dict=False
+                )[0]
+
+                # step callback
+                if use_cross_attn_mask:
+                    store_step = i in attn_store_steps
+                    self.attention_store.between_steps(store_step)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    # negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > 0 and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return LEditsPPDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
+
+    @torch.no_grad()
+    # Modified from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LEditsPPPipelineStableDiffusion.encode_image
+    def encode_image(self, image, dtype=None, height=None, width=None, resize_mode="default", crops_coords=None):
+        image = self.image_processor.preprocess(
+            image=image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+        resized = self.image_processor.postprocess(image=image, output_type="pil")
+
+        if max(image.shape[-2:]) > self.vae.config["sample_size"] * 1.5:
+            logger.warning(
+                "Your input images far exceed the default resolution of the underlying diffusion model. "
+                "The output images may contain severe artifacts! "
+                "Consider down-sampling the input using the `height` and `width` parameters"
+            )
+        image = image.to(self.device, dtype=dtype)
+        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+        if needs_upcasting:
+            image = image.float()
+            self.upcast_vae()
+
+        x0 = self.vae.encode(image).latent_dist.mode()
+        x0 = x0.to(dtype)
+        # cast back to fp16 if needed
+        if needs_upcasting:
+            self.vae.to(dtype=torch.float16)
+
+        x0 = self.vae.config.scaling_factor * x0
+        return x0, resized
+
+    @torch.no_grad()
+    def invert(
+        self,
+        image: PipelineImageInput,
+        source_prompt: str = "",
+        source_guidance_scale=3.5,
+        negative_prompt: str = None,
+        negative_prompt_2: str = None,
+        num_inversion_steps: int = 50,
+        skip: float = 0.15,
+        generator: Optional[torch.Generator] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        num_zero_noise_steps: int = 3,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        The function to the pipeline for image inversion as described by the [LEDITS++
+        Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
+        inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect
+                ratio.
+            source_prompt (`str`, defaults to `""`):
+                Prompt describing the input image that will be used for guidance during inversion. Guidance is disabled
+                if the `source_prompt` is `""`.
+            source_guidance_scale (`float`, defaults to `3.5`):
+                Strength of guidance during inversion.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_inversion_steps (`int`, defaults to `50`):
+                Number of total performed inversion steps after discarding the initial `skip` steps.
+            skip (`float`, defaults to `0.15`):
+                Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
+                will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
+                deterministic.
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            num_zero_noise_steps (`int`, defaults to `3`):
+                Number of final diffusion steps that will not renoise the current image. If no steps are set to zero
+                SD-XL in combination with [`DPMSolverMultistepScheduler`] will produce noise artifacts.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Returns:
+            [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
+            and respective VAE reconstruction(s).
+        """
+
+        # Reset attn processor, we do not want to store attn maps during inversion
+        self.unet.set_attn_processor(AttnProcessor())
+
+        self.eta = 1.0
+
+        self.scheduler.config.timestep_spacing = "leading"
+        self.scheduler.set_timesteps(int(num_inversion_steps * (1 + skip)))
+        self.inversion_steps = self.scheduler.timesteps[-num_inversion_steps:]
+        timesteps = self.inversion_steps
+
+        num_images_per_prompt = 1
+
+        device = self._execution_device
+
+        # 0. Ensure that only uncond embedding is used if prompt = ""
+        if source_prompt == "":
+            # noise pred should only be noise_pred_uncond
+            source_guidance_scale = 0.0
+            do_classifier_free_guidance = False
+        else:
+            do_classifier_free_guidance = source_guidance_scale > 1.0
+
+        # 1. prepare image
+        x0, resized = self.encode_image(image, dtype=self.text_encoder_2.dtype)
+        width = x0.shape[2] * self.vae_scale_factor
+        height = x0.shape[3] * self.vae_scale_factor
+        self.size = (height, width)
+
+        self.batch_size = x0.shape[0]
+
+        # 2. get embeddings
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+
+        if isinstance(source_prompt, str):
+            source_prompt = [source_prompt] * self.batch_size
+
+        (
+            negative_prompt_embeds,
+            prompt_embeds,
+            negative_pooled_prompt_embeds,
+            edit_pooled_prompt_embeds,
+            _,
+        ) = self.encode_prompt(
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            editing_prompt=source_prompt,
+            lora_scale=text_encoder_lora_scale,
+            enable_edit_guidance=do_classifier_free_guidance,
+        )
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(negative_pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        # 3. Prepare added time ids & embeddings
+        add_text_embeds = negative_pooled_prompt_embeds
+        add_time_ids = self._get_add_time_ids(
+            self.size,
+            crops_coords_top_left,
+            self.size,
+            dtype=negative_prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([add_text_embeds, edit_pooled_prompt_embeds], dim=0)
+            add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
+
+        negative_prompt_embeds = negative_prompt_embeds.to(device)
+
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(self.batch_size * num_images_per_prompt, 1)
+
+        # autoencoder reconstruction
+        if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
+            self.upcast_vae()
+            x0_tmp = x0.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            image_rec = self.vae.decode(
+                x0_tmp / self.vae.config.scaling_factor, return_dict=False, generator=generator
+            )[0]
+        elif self.vae.config.force_upcast:
+            x0_tmp = x0.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            image_rec = self.vae.decode(
+                x0_tmp / self.vae.config.scaling_factor, return_dict=False, generator=generator
+            )[0]
+        else:
+            image_rec = self.vae.decode(x0 / self.vae.config.scaling_factor, return_dict=False, generator=generator)[0]
+
+        image_rec = self.image_processor.postprocess(image_rec, output_type="pil")
+
+        # 5. find zs and xts
+        variance_noise_shape = (num_inversion_steps, *x0.shape)
+
+        # intermediate latents
+        t_to_idx = {int(v): k for k, v in enumerate(timesteps)}
+        xts = torch.zeros(size=variance_noise_shape, device=self.device, dtype=negative_prompt_embeds.dtype)
+
+        for t in reversed(timesteps):
+            idx = num_inversion_steps - t_to_idx[int(t)] - 1
+            noise = randn_tensor(shape=x0.shape, generator=generator, device=self.device, dtype=x0.dtype)
+            xts[idx] = self.scheduler.add_noise(x0, noise, t.unsqueeze(0))
+        xts = torch.cat([x0.unsqueeze(0), xts], dim=0)
+
+        # noise maps
+        zs = torch.zeros(size=variance_noise_shape, device=self.device, dtype=negative_prompt_embeds.dtype)
+
+        self.scheduler.set_timesteps(len(self.scheduler.timesteps))
+
+        for t in self.progress_bar(timesteps):
+            idx = num_inversion_steps - t_to_idx[int(t)] - 1
+            # 1. predict noise residual
+            xt = xts[idx + 1]
+
+            latent_model_input = torch.cat([xt] * 2) if do_classifier_free_guidance else xt
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+            noise_pred = self.unet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=negative_prompt_embeds,
+                cross_attention_kwargs=cross_attention_kwargs,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+
+            # 2. perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_out = noise_pred.chunk(2)
+                noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1]
+                noise_pred = noise_pred_uncond + source_guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            xtm1 = xts[idx]
+            z, xtm1_corrected = compute_noise(self.scheduler, xtm1, xt, t, noise_pred, self.eta)
+            zs[idx] = z
+
+            # correction to avoid error accumulation
+            xts[idx] = xtm1_corrected
+
+        self.init_latents = xts[-1]
+        zs = zs.flip(0)
+
+        if num_zero_noise_steps > 0:
+            zs[-num_zero_noise_steps:] = torch.zeros_like(zs[-num_zero_noise_steps:])
+        self.zs = zs
+        return LEditsPPInversionPipelineOutput(images=resized, vae_reconstruction_images=image_rec)
+
+
+# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.compute_noise_ddim
+def compute_noise_ddim(scheduler, prev_latents, latents, timestep, noise_pred, eta):
+    # 1. get previous step value (=t-1)
+    prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
+
+    # 2. compute alphas, betas
+    alpha_prod_t = scheduler.alphas_cumprod[timestep]
+    alpha_prod_t_prev = (
+        scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+    )
+
+    beta_prod_t = 1 - alpha_prod_t
+
+    # 3. compute predicted original sample from predicted noise also called
+    # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+    # 4. Clip "predicted x_0"
+    if scheduler.config.clip_sample:
+        pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+    # 5. compute variance: "sigma_t(η)" -> see formula (16)
+    # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+    variance = scheduler._get_variance(timestep, prev_timestep)
+    std_dev_t = eta * variance ** (0.5)
+
+    # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred
+
+    # modifed so that updated xtm1 is returned as well (to avoid error accumulation)
+    mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+    if variance > 0.0:
+        noise = (prev_latents - mu_xt) / (variance ** (0.5) * eta)
+    else:
+        noise = torch.tensor([0.0]).to(latents.device)
+
+    return noise, mu_xt + (eta * variance**0.5) * noise
+
+
+# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.compute_noise_sde_dpm_pp_2nd
+def compute_noise_sde_dpm_pp_2nd(scheduler, prev_latents, latents, timestep, noise_pred, eta):
+    def first_order_update(model_output, sample):  # timestep, prev_timestep, sample):
+        sigma_t, sigma_s = scheduler.sigmas[scheduler.step_index + 1], scheduler.sigmas[scheduler.step_index]
+        alpha_t, sigma_t = scheduler._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s, sigma_s = scheduler._sigma_to_alpha_sigma_t(sigma_s)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
+
+        h = lambda_t - lambda_s
+
+        mu_xt = (sigma_t / sigma_s * torch.exp(-h)) * sample + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
+
+        mu_xt = scheduler.dpm_solver_first_order_update(
+            model_output=model_output, sample=sample, noise=torch.zeros_like(sample)
+        )
+
+        sigma = sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h))
+        if sigma > 0.0:
+            noise = (prev_latents - mu_xt) / sigma
+        else:
+            noise = torch.tensor([0.0]).to(sample.device)
+
+        prev_sample = mu_xt + sigma * noise
+        return noise, prev_sample
+
+    def second_order_update(model_output_list, sample):  # timestep_list, prev_timestep, sample):
+        sigma_t, sigma_s0, sigma_s1 = (
+            scheduler.sigmas[scheduler.step_index + 1],
+            scheduler.sigmas[scheduler.step_index],
+            scheduler.sigmas[scheduler.step_index - 1],
+        )
+
+        alpha_t, sigma_t = scheduler._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = scheduler._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = scheduler._sigma_to_alpha_sigma_t(sigma_s1)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+
+        mu_xt = (
+            (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+            + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+            + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
+        )
+
+        sigma = sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h))
+        if sigma > 0.0:
+            noise = (prev_latents - mu_xt) / sigma
+        else:
+            noise = torch.tensor([0.0]).to(sample.device)
+
+        prev_sample = mu_xt + sigma * noise
+
+        return noise, prev_sample
+
+    if scheduler.step_index is None:
+        scheduler._init_step_index(timestep)
+
+    model_output = scheduler.convert_model_output(model_output=noise_pred, sample=latents)
+    for i in range(scheduler.config.solver_order - 1):
+        scheduler.model_outputs[i] = scheduler.model_outputs[i + 1]
+    scheduler.model_outputs[-1] = model_output
+
+    if scheduler.lower_order_nums < 1:
+        noise, prev_sample = first_order_update(model_output, latents)
+    else:
+        noise, prev_sample = second_order_update(scheduler.model_outputs, latents)
+
+    if scheduler.lower_order_nums < scheduler.config.solver_order:
+        scheduler.lower_order_nums += 1
+
+    # upon completion increase step index by one
+    scheduler._step_index += 1
+
+    return noise, prev_sample
+
+
+# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.compute_noise
+def compute_noise(scheduler, *args):
+    if isinstance(scheduler, DDIMScheduler):
+        return compute_noise_ddim(scheduler, *args)
+    elif (
+        isinstance(scheduler, DPMSolverMultistepScheduler)
+        and scheduler.config.algorithm_type == "sde-dpmsolver++"
+        and scheduler.config.solver_order == 2
+    ):
+        return compute_noise_sde_dpm_pp_2nd(scheduler, *args)
+    else:
+        raise NotImplementedError
diff --git a/diffusers/src/diffusers/pipelines/ledits_pp/pipeline_output.py b/diffusers/src/diffusers/pipelines/ledits_pp/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..756be82b0069aa986bc0594476e9060e114a7c84
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/ledits_pp/pipeline_output.py
@@ -0,0 +1,43 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class LEditsPPDiffusionPipelineOutput(BaseOutput):
+    """
+    Output class for LEdits++ Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+        nsfw_content_detected (`List[bool]`)
+            List indicating whether the corresponding generated image contains “not-safe-for-work” (nsfw) content or
+            `None` if safety checking could not be performed.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+
+
+@dataclass
+class LEditsPPInversionPipelineOutput(BaseOutput):
+    """
+    Output class for LEdits++ Diffusion pipelines.
+
+    Args:
+        input_images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of the cropped and resized input images as PIL images of length `batch_size` or NumPy array of shape `
+            (batch_size, height, width, num_channels)`.
+        vae_reconstruction_images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape
+            ` (batch_size, height, width, num_channels)`.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    vae_reconstruction_images: Union[List[PIL.Image.Image], np.ndarray]
diff --git a/diffusers/src/diffusers/pipelines/lumina/__init__.py b/diffusers/src/diffusers/pipelines/lumina/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca13963597217f47b3922290abe90ed1c035c552
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/lumina/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_lumina"] = ["LuminaText2ImgPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_lumina import LuminaText2ImgPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/lumina/pipeline_lumina.py b/diffusers/src/diffusers/pipelines/lumina/pipeline_lumina.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bc7e8ddb954de79583fd9fff95a217159404b41
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/lumina/pipeline_lumina.py
@@ -0,0 +1,897 @@
+# Copyright 2024 Alpha-VLLM and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+import inspect
+import math
+import re
+import urllib.parse as ul
+from typing import List, Optional, Tuple, Union
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...models import AutoencoderKL
+from ...models.embeddings import get_2d_rotary_pos_embed_lumina
+from ...models.transformers.lumina_nextdit2d import LuminaNextDiT2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import LuminaText2ImgPipeline
+
+        >>> pipe = LuminaText2ImgPipeline.from_pretrained(
+        ...     "Alpha-VLLM/Lumina-Next-SFT-diffusers", torch_dtype=torch.bfloat16
+        ... )
+        >>> # Enable memory optimizations.
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "Upper body of a young woman in a Victorian-era outfit with brass goggles and leather straps. Background shows an industrial revolution cityscape with smoky skies and tall, metal structures"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class LuminaText2ImgPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Lumina-T2I.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`AutoModel`]):
+            Frozen text-encoder. Lumina-T2I uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/Alpha-VLLM/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`AutoModel`):
+            Tokenizer of class
+            [AutoModel](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel).
+        transformer ([`Transformer2DModel`]):
+            A text conditioned `Transformer2DModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    def __init__(
+        self,
+        transformer: LuminaNextDiT2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: AutoModel,
+        tokenizer: AutoTokenizer,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 8
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.max_sequence_length = 256
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )
+        self.default_image_size = self.default_sample_size * self.vae_scale_factor
+
+    def _get_gemma_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        clean_caption: Optional[bool] = False,
+        max_length: Optional[int] = None,
+    ):
+        device = device or self._execution_device
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+        text_inputs = self.tokenizer(
+            prompt,
+            pad_to_multiple_of=8,
+            max_length=self.max_sequence_length,
+            truncation=True,
+            padding=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids.to(device)
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids.to(device)
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.max_sequence_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because Gemma can only handle sequences up to"
+                f" {self.max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_attention_mask = text_inputs.attention_mask.to(device)
+        prompt_embeds = self.text_encoder(
+            text_input_ids, attention_mask=prompt_attention_mask, output_hidden_states=True
+        )
+        prompt_embeds = prompt_embeds.hidden_states[-2]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.transformer is not None:
+            dtype = self.transformer.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+        prompt_attention_mask = prompt_attention_mask.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds, prompt_attention_mask
+
+    # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+        **kwargs,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+                Lumina-T2I, this should be "".
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For Lumina-T2I, it's should be the embeddings of the "" string.
+            clean_caption (`bool`, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+            max_sequence_length (`int`, defaults to 256): Maximum sequence length to use for the prompt.
+        """
+        if device is None:
+            device = self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_embeds, prompt_attention_mask = self._get_gemma_prompt_embeds(
+                prompt=prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                clean_caption=clean_caption,
+            )
+
+        # Get negative embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt if negative_prompt is not None else ""
+
+            # Normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                negative_prompt = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            # Padding negative prompt to the same length with prompt
+            prompt_max_length = prompt_embeds.shape[1]
+            negative_text_inputs = self.tokenizer(
+                negative_prompt,
+                padding="max_length",
+                max_length=prompt_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            negative_text_input_ids = negative_text_inputs.input_ids.to(device)
+            negative_prompt_attention_mask = negative_text_inputs.attention_mask.to(device)
+            # Get the negative prompt embeddings
+            negative_prompt_embeds = self.text_encoder(
+                negative_text_input_ids,
+                attention_mask=negative_prompt_attention_mask,
+                output_hidden_states=True,
+            )
+
+            negative_dtype = self.text_encoder.dtype
+            negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+            _, seq_len, _ = negative_prompt_embeds.shape
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=negative_dtype, device=device)
+            # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(
+                batch_size * num_images_per_prompt, -1
+            )
+
+        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        width: Optional[int] = None,
+        height: Optional[int] = None,
+        num_inference_steps: int = 30,
+        timesteps: List[int] = None,
+        guidance_scale: float = 4.0,
+        negative_prompt: Union[str, List[str]] = None,
+        sigmas: List[float] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        clean_caption: bool = True,
+        max_sequence_length: int = 256,
+        scaling_watershed: Optional[float] = 1.0,
+        proportional_attn: Optional[bool] = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 30):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The width in pixels of the generated image.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For Lumina-T2I this negative prompt should be "". If not
+                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            max_sequence_length (`int` defaults to 120):
+                Maximum sequence length to use with the `prompt`.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+        )
+        cross_attention_kwargs = {}
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if proportional_attn:
+            cross_attention_kwargs["base_sequence_length"] = (self.default_image_size // 16) ** 2
+
+        scaling_factor = math.sqrt(width * height / self.default_image_size**2)
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            clean_caption=clean_caption,
+            max_sequence_length=max_sequence_length,
+        )
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat([prompt_attention_mask, negative_prompt_attention_mask], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            latent_channels,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                current_timestep = t
+                if not torch.is_tensor(current_timestep):
+                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                    # This would be a good case for the `match` statement (Python 3.10+)
+                    is_mps = latent_model_input.device.type == "mps"
+                    if isinstance(current_timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    current_timestep = torch.tensor(
+                        [current_timestep],
+                        dtype=dtype,
+                        device=latent_model_input.device,
+                    )
+                elif len(current_timestep.shape) == 0:
+                    current_timestep = current_timestep[None].to(latent_model_input.device)
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+                # reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image
+                current_timestep = 1 - current_timestep / self.scheduler.config.num_train_timesteps
+
+                # prepare image_rotary_emb for positional encoding
+                # dynamic scaling_factor for different resolution.
+                # NOTE: For `Time-aware` denosing mechanism from Lumina-Next
+                # https://arxiv.org/abs/2406.18583, Sec 2.3
+                # NOTE: We should compute different image_rotary_emb with different timestep.
+                if current_timestep[0] < scaling_watershed:
+                    linear_factor = scaling_factor
+                    ntk_factor = 1.0
+                else:
+                    linear_factor = 1.0
+                    ntk_factor = scaling_factor
+                image_rotary_emb = get_2d_rotary_pos_embed_lumina(
+                    self.transformer.head_dim,
+                    384,
+                    384,
+                    linear_factor=linear_factor,
+                    ntk_factor=ntk_factor,
+                )
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=current_timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    encoder_mask=prompt_attention_mask,
+                    image_rotary_emb=image_rotary_emb,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+                noise_pred = noise_pred.chunk(2, dim=1)[0]
+
+                # perform guidance scale
+                # NOTE: For exact reproducibility reasons, we apply classifier-free guidance on only
+                # three channels by default. The standard approach to cfg applies it to all channels.
+                # This can be done by uncommenting the following line and commenting-out the line following that.
+                # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
+                if do_classifier_free_guidance:
+                    noise_pred_eps, noise_pred_rest = noise_pred[:, :3], noise_pred[:, 3:]
+                    noise_pred_cond_eps, noise_pred_uncond_eps = torch.split(
+                        noise_pred_eps, len(noise_pred_eps) // 2, dim=0
+                    )
+                    noise_pred_half = noise_pred_uncond_eps + guidance_scale * (
+                        noise_pred_cond_eps - noise_pred_uncond_eps
+                    )
+                    noise_pred_eps = torch.cat([noise_pred_half, noise_pred_half], dim=0)
+
+                    noise_pred = torch.cat([noise_pred_eps, noise_pred_rest], dim=1)
+                    noise_pred, _ = noise_pred.chunk(2, dim=0)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                noise_pred = -noise_pred
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                progress_bar.update()
+
+        if not output_type == "latent":
+            latents = latents / self.vae.config.scaling_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        else:
+            image = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/marigold/__init__.py b/diffusers/src/diffusers/pipelines/marigold/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5ae03adfc11d7cdf3c233a5a735483e95ff4a29
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/marigold/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["marigold_image_processing"] = ["MarigoldImageProcessor"]
+    _import_structure["pipeline_marigold_depth"] = ["MarigoldDepthOutput", "MarigoldDepthPipeline"]
+    _import_structure["pipeline_marigold_normals"] = ["MarigoldNormalsOutput", "MarigoldNormalsPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .marigold_image_processing import MarigoldImageProcessor
+        from .pipeline_marigold_depth import MarigoldDepthOutput, MarigoldDepthPipeline
+        from .pipeline_marigold_normals import MarigoldNormalsOutput, MarigoldNormalsPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/marigold/marigold_image_processing.py b/diffusers/src/diffusers/pipelines/marigold/marigold_image_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..51b9983db6f6372aff58fb99e3fa8113ef4752e3
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/marigold/marigold_image_processing.py
@@ -0,0 +1,576 @@
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+import torch.nn.functional as F
+from PIL import Image
+
+from ... import ConfigMixin
+from ...configuration_utils import register_to_config
+from ...image_processor import PipelineImageInput
+from ...utils import CONFIG_NAME, logging
+from ...utils.import_utils import is_matplotlib_available
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class MarigoldImageProcessor(ConfigMixin):
+    config_name = CONFIG_NAME
+
+    @register_to_config
+    def __init__(
+        self,
+        vae_scale_factor: int = 8,
+        do_normalize: bool = True,
+        do_range_check: bool = True,
+    ):
+        super().__init__()
+
+    @staticmethod
+    def expand_tensor_or_array(images: Union[torch.Tensor, np.ndarray]) -> Union[torch.Tensor, np.ndarray]:
+        """
+        Expand a tensor or array to a specified number of images.
+        """
+        if isinstance(images, np.ndarray):
+            if images.ndim == 2:  # [H,W] -> [1,H,W,1]
+                images = images[None, ..., None]
+            if images.ndim == 3:  # [H,W,C] -> [1,H,W,C]
+                images = images[None]
+        elif isinstance(images, torch.Tensor):
+            if images.ndim == 2:  # [H,W] -> [1,1,H,W]
+                images = images[None, None]
+            elif images.ndim == 3:  # [1,H,W] -> [1,1,H,W]
+                images = images[None]
+        else:
+            raise ValueError(f"Unexpected input type: {type(images)}")
+        return images
+
+    @staticmethod
+    def pt_to_numpy(images: torch.Tensor) -> np.ndarray:
+        """
+        Convert a PyTorch tensor to a NumPy image.
+        """
+        images = images.cpu().permute(0, 2, 3, 1).float().numpy()
+        return images
+
+    @staticmethod
+    def numpy_to_pt(images: np.ndarray) -> torch.Tensor:
+        """
+        Convert a NumPy image to a PyTorch tensor.
+        """
+        if np.issubdtype(images.dtype, np.integer) and not np.issubdtype(images.dtype, np.unsignedinteger):
+            raise ValueError(f"Input image dtype={images.dtype} cannot be a signed integer.")
+        if np.issubdtype(images.dtype, np.complexfloating):
+            raise ValueError(f"Input image dtype={images.dtype} cannot be complex.")
+        if np.issubdtype(images.dtype, bool):
+            raise ValueError(f"Input image dtype={images.dtype} cannot be boolean.")
+
+        images = torch.from_numpy(images.transpose(0, 3, 1, 2))
+        return images
+
+    @staticmethod
+    def resize_antialias(
+        image: torch.Tensor, size: Tuple[int, int], mode: str, is_aa: Optional[bool] = None
+    ) -> torch.Tensor:
+        if not torch.is_tensor(image):
+            raise ValueError(f"Invalid input type={type(image)}.")
+        if not torch.is_floating_point(image):
+            raise ValueError(f"Invalid input dtype={image.dtype}.")
+        if image.dim() != 4:
+            raise ValueError(f"Invalid input dimensions; shape={image.shape}.")
+
+        antialias = is_aa and mode in ("bilinear", "bicubic")
+        image = F.interpolate(image, size, mode=mode, antialias=antialias)
+
+        return image
+
+    @staticmethod
+    def resize_to_max_edge(image: torch.Tensor, max_edge_sz: int, mode: str) -> torch.Tensor:
+        if not torch.is_tensor(image):
+            raise ValueError(f"Invalid input type={type(image)}.")
+        if not torch.is_floating_point(image):
+            raise ValueError(f"Invalid input dtype={image.dtype}.")
+        if image.dim() != 4:
+            raise ValueError(f"Invalid input dimensions; shape={image.shape}.")
+
+        h, w = image.shape[-2:]
+        max_orig = max(h, w)
+        new_h = h * max_edge_sz // max_orig
+        new_w = w * max_edge_sz // max_orig
+
+        if new_h == 0 or new_w == 0:
+            raise ValueError(f"Extreme aspect ratio of the input image: [{w} x {h}]")
+
+        image = MarigoldImageProcessor.resize_antialias(image, (new_h, new_w), mode, is_aa=True)
+
+        return image
+
+    @staticmethod
+    def pad_image(image: torch.Tensor, align: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
+        if not torch.is_tensor(image):
+            raise ValueError(f"Invalid input type={type(image)}.")
+        if not torch.is_floating_point(image):
+            raise ValueError(f"Invalid input dtype={image.dtype}.")
+        if image.dim() != 4:
+            raise ValueError(f"Invalid input dimensions; shape={image.shape}.")
+
+        h, w = image.shape[-2:]
+        ph, pw = -h % align, -w % align
+
+        image = F.pad(image, (0, pw, 0, ph), mode="replicate")
+
+        return image, (ph, pw)
+
+    @staticmethod
+    def unpad_image(image: torch.Tensor, padding: Tuple[int, int]) -> torch.Tensor:
+        if not torch.is_tensor(image):
+            raise ValueError(f"Invalid input type={type(image)}.")
+        if not torch.is_floating_point(image):
+            raise ValueError(f"Invalid input dtype={image.dtype}.")
+        if image.dim() != 4:
+            raise ValueError(f"Invalid input dimensions; shape={image.shape}.")
+
+        ph, pw = padding
+        uh = None if ph == 0 else -ph
+        uw = None if pw == 0 else -pw
+
+        image = image[:, :, :uh, :uw]
+
+        return image
+
+    @staticmethod
+    def load_image_canonical(
+        image: Union[torch.Tensor, np.ndarray, Image.Image],
+        device: torch.device = torch.device("cpu"),
+        dtype: torch.dtype = torch.float32,
+    ) -> Tuple[torch.Tensor, int]:
+        if isinstance(image, Image.Image):
+            image = np.array(image)
+
+        image_dtype_max = None
+        if isinstance(image, (np.ndarray, torch.Tensor)):
+            image = MarigoldImageProcessor.expand_tensor_or_array(image)
+            if image.ndim != 4:
+                raise ValueError("Input image is not 2-, 3-, or 4-dimensional.")
+        if isinstance(image, np.ndarray):
+            if np.issubdtype(image.dtype, np.integer) and not np.issubdtype(image.dtype, np.unsignedinteger):
+                raise ValueError(f"Input image dtype={image.dtype} cannot be a signed integer.")
+            if np.issubdtype(image.dtype, np.complexfloating):
+                raise ValueError(f"Input image dtype={image.dtype} cannot be complex.")
+            if np.issubdtype(image.dtype, bool):
+                raise ValueError(f"Input image dtype={image.dtype} cannot be boolean.")
+            if np.issubdtype(image.dtype, np.unsignedinteger):
+                image_dtype_max = np.iinfo(image.dtype).max
+                image = image.astype(np.float32)  # because torch does not have unsigned dtypes beyond torch.uint8
+            image = MarigoldImageProcessor.numpy_to_pt(image)
+
+        if torch.is_tensor(image) and not torch.is_floating_point(image) and image_dtype_max is None:
+            if image.dtype != torch.uint8:
+                raise ValueError(f"Image dtype={image.dtype} is not supported.")
+            image_dtype_max = 255
+
+        if not torch.is_tensor(image):
+            raise ValueError(f"Input type unsupported: {type(image)}.")
+
+        if image.shape[1] == 1:
+            image = image.repeat(1, 3, 1, 1)  # [N,1,H,W] -> [N,3,H,W]
+        if image.shape[1] != 3:
+            raise ValueError(f"Input image is not 1- or 3-channel: {image.shape}.")
+
+        image = image.to(device=device, dtype=dtype)
+
+        if image_dtype_max is not None:
+            image = image / image_dtype_max
+
+        return image
+
+    @staticmethod
+    def check_image_values_range(image: torch.Tensor) -> None:
+        if not torch.is_tensor(image):
+            raise ValueError(f"Invalid input type={type(image)}.")
+        if not torch.is_floating_point(image):
+            raise ValueError(f"Invalid input dtype={image.dtype}.")
+        if image.min().item() < 0.0 or image.max().item() > 1.0:
+            raise ValueError("Input image data is partially outside of the [0,1] range.")
+
+    def preprocess(
+        self,
+        image: PipelineImageInput,
+        processing_resolution: Optional[int] = None,
+        resample_method_input: str = "bilinear",
+        device: torch.device = torch.device("cpu"),
+        dtype: torch.dtype = torch.float32,
+    ):
+        if isinstance(image, list):
+            images = None
+            for i, img in enumerate(image):
+                img = self.load_image_canonical(img, device, dtype)  # [N,3,H,W]
+                if images is None:
+                    images = img
+                else:
+                    if images.shape[2:] != img.shape[2:]:
+                        raise ValueError(
+                            f"Input image[{i}] has incompatible dimensions {img.shape[2:]} with the previous images "
+                            f"{images.shape[2:]}"
+                        )
+                    images = torch.cat((images, img), dim=0)
+            image = images
+            del images
+        else:
+            image = self.load_image_canonical(image, device, dtype)  # [N,3,H,W]
+
+        original_resolution = image.shape[2:]
+
+        if self.config.do_range_check:
+            self.check_image_values_range(image)
+
+        if self.config.do_normalize:
+            image = image * 2.0 - 1.0
+
+        if processing_resolution is not None and processing_resolution > 0:
+            image = self.resize_to_max_edge(image, processing_resolution, resample_method_input)  # [N,3,PH,PW]
+
+        image, padding = self.pad_image(image, self.config.vae_scale_factor)  # [N,3,PPH,PPW]
+
+        return image, padding, original_resolution
+
+    @staticmethod
+    def colormap(
+        image: Union[np.ndarray, torch.Tensor],
+        cmap: str = "Spectral",
+        bytes: bool = False,
+        _force_method: Optional[str] = None,
+    ) -> Union[np.ndarray, torch.Tensor]:
+        """
+        Converts a monochrome image into an RGB image by applying the specified colormap. This function mimics the
+        behavior of matplotlib.colormaps, but allows the user to use the most discriminative color maps ("Spectral",
+        "binary") without having to install or import matplotlib. For all other cases, the function will attempt to use
+        the native implementation.
+
+        Args:
+            image: 2D tensor of values between 0 and 1, either as np.ndarray or torch.Tensor.
+            cmap: Colormap name.
+            bytes: Whether to return the output as uint8 or floating point image.
+            _force_method:
+                Can be used to specify whether to use the native implementation (`"matplotlib"`), the efficient custom
+                implementation of the select color maps (`"custom"`), or rely on autodetection (`None`, default).
+
+        Returns:
+            An RGB-colorized tensor corresponding to the input image.
+        """
+        if not (torch.is_tensor(image) or isinstance(image, np.ndarray)):
+            raise ValueError("Argument must be a numpy array or torch tensor.")
+        if _force_method not in (None, "matplotlib", "custom"):
+            raise ValueError("_force_method must be either `None`, `'matplotlib'` or `'custom'`.")
+
+        supported_cmaps = {
+            "binary": [
+                (1.0, 1.0, 1.0),
+                (0.0, 0.0, 0.0),
+            ],
+            "Spectral": [  # Taken from matplotlib/_cm.py
+                (0.61960784313725492, 0.003921568627450980, 0.25882352941176473),  # 0.0 -> [0]
+                (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
+                (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
+                (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
+                (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
+                (1.0, 1.0, 0.74901960784313726),
+                (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
+                (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
+                (0.4, 0.76078431372549016, 0.6470588235294118),
+                (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
+                (0.36862745098039218, 0.30980392156862746, 0.63529411764705879),  # 1.0 -> [K-1]
+            ],
+        }
+
+        def method_matplotlib(image, cmap, bytes=False):
+            if is_matplotlib_available():
+                import matplotlib
+            else:
+                return None
+
+            arg_is_pt, device = torch.is_tensor(image), None
+            if arg_is_pt:
+                image, device = image.cpu().numpy(), image.device
+
+            if cmap not in matplotlib.colormaps:
+                raise ValueError(
+                    f"Unexpected color map {cmap}; available options are: {', '.join(list(matplotlib.colormaps.keys()))}"
+                )
+
+            cmap = matplotlib.colormaps[cmap]
+            out = cmap(image, bytes=bytes)  # [?,4]
+            out = out[..., :3]  # [?,3]
+
+            if arg_is_pt:
+                out = torch.tensor(out, device=device)
+
+            return out
+
+        def method_custom(image, cmap, bytes=False):
+            arg_is_np = isinstance(image, np.ndarray)
+            if arg_is_np:
+                image = torch.tensor(image)
+            if image.dtype == torch.uint8:
+                image = image.float() / 255
+            else:
+                image = image.float()
+
+            is_cmap_reversed = cmap.endswith("_r")
+            if is_cmap_reversed:
+                cmap = cmap[:-2]
+
+            if cmap not in supported_cmaps:
+                raise ValueError(
+                    f"Only {list(supported_cmaps.keys())} color maps are available without installing matplotlib."
+                )
+
+            cmap = supported_cmaps[cmap]
+            if is_cmap_reversed:
+                cmap = cmap[::-1]
+            cmap = torch.tensor(cmap, dtype=torch.float, device=image.device)  # [K,3]
+            K = cmap.shape[0]
+
+            pos = image.clamp(min=0, max=1) * (K - 1)
+            left = pos.long()
+            right = (left + 1).clamp(max=K - 1)
+
+            d = (pos - left.float()).unsqueeze(-1)
+            left_colors = cmap[left]
+            right_colors = cmap[right]
+
+            out = (1 - d) * left_colors + d * right_colors
+
+            if bytes:
+                out = (out * 255).to(torch.uint8)
+
+            if arg_is_np:
+                out = out.numpy()
+
+            return out
+
+        if _force_method is None and torch.is_tensor(image) and cmap == "Spectral":
+            return method_custom(image, cmap, bytes)
+
+        out = None
+        if _force_method != "custom":
+            out = method_matplotlib(image, cmap, bytes)
+
+        if _force_method == "matplotlib" and out is None:
+            raise ImportError("Make sure to install matplotlib if you want to use a color map other than 'Spectral'.")
+
+        if out is None:
+            out = method_custom(image, cmap, bytes)
+
+        return out
+
+    @staticmethod
+    def visualize_depth(
+        depth: Union[
+            PIL.Image.Image,
+            np.ndarray,
+            torch.Tensor,
+            List[PIL.Image.Image],
+            List[np.ndarray],
+            List[torch.Tensor],
+        ],
+        val_min: float = 0.0,
+        val_max: float = 1.0,
+        color_map: str = "Spectral",
+    ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
+        """
+        Visualizes depth maps, such as predictions of the `MarigoldDepthPipeline`.
+
+        Args:
+            depth (`Union[PIL.Image.Image, np.ndarray, torch.Tensor, List[PIL.Image.Image], List[np.ndarray],
+                List[torch.Tensor]]`): Depth maps.
+            val_min (`float`, *optional*, defaults to `0.0`): Minimum value of the visualized depth range.
+            val_max (`float`, *optional*, defaults to `1.0`): Maximum value of the visualized depth range.
+            color_map (`str`, *optional*, defaults to `"Spectral"`): Color map used to convert a single-channel
+                      depth prediction into colored representation.
+
+        Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with depth maps visualization.
+        """
+        if val_max <= val_min:
+            raise ValueError(f"Invalid values range: [{val_min}, {val_max}].")
+
+        def visualize_depth_one(img, idx=None):
+            prefix = "Depth" + (f"[{idx}]" if idx else "")
+            if isinstance(img, PIL.Image.Image):
+                if img.mode != "I;16":
+                    raise ValueError(f"{prefix}: invalid PIL mode={img.mode}.")
+                img = np.array(img).astype(np.float32) / (2**16 - 1)
+            if isinstance(img, np.ndarray) or torch.is_tensor(img):
+                if img.ndim != 2:
+                    raise ValueError(f"{prefix}: unexpected shape={img.shape}.")
+                if isinstance(img, np.ndarray):
+                    img = torch.from_numpy(img)
+                if not torch.is_floating_point(img):
+                    raise ValueError(f"{prefix}: unexected dtype={img.dtype}.")
+            else:
+                raise ValueError(f"{prefix}: unexpected type={type(img)}.")
+            if val_min != 0.0 or val_max != 1.0:
+                img = (img - val_min) / (val_max - val_min)
+            img = MarigoldImageProcessor.colormap(img, cmap=color_map, bytes=True)  # [H,W,3]
+            img = PIL.Image.fromarray(img.cpu().numpy())
+            return img
+
+        if depth is None or isinstance(depth, list) and any(o is None for o in depth):
+            raise ValueError("Input depth is `None`")
+        if isinstance(depth, (np.ndarray, torch.Tensor)):
+            depth = MarigoldImageProcessor.expand_tensor_or_array(depth)
+            if isinstance(depth, np.ndarray):
+                depth = MarigoldImageProcessor.numpy_to_pt(depth)  # [N,H,W,1] -> [N,1,H,W]
+            if not (depth.ndim == 4 and depth.shape[1] == 1):  # [N,1,H,W]
+                raise ValueError(f"Unexpected input shape={depth.shape}, expecting [N,1,H,W].")
+            return [visualize_depth_one(img[0], idx) for idx, img in enumerate(depth)]
+        elif isinstance(depth, list):
+            return [visualize_depth_one(img, idx) for idx, img in enumerate(depth)]
+        else:
+            raise ValueError(f"Unexpected input type: {type(depth)}")
+
+    @staticmethod
+    def export_depth_to_16bit_png(
+        depth: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+        val_min: float = 0.0,
+        val_max: float = 1.0,
+    ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
+        def export_depth_to_16bit_png_one(img, idx=None):
+            prefix = "Depth" + (f"[{idx}]" if idx else "")
+            if not isinstance(img, np.ndarray) and not torch.is_tensor(img):
+                raise ValueError(f"{prefix}: unexpected type={type(img)}.")
+            if img.ndim != 2:
+                raise ValueError(f"{prefix}: unexpected shape={img.shape}.")
+            if torch.is_tensor(img):
+                img = img.cpu().numpy()
+            if not np.issubdtype(img.dtype, np.floating):
+                raise ValueError(f"{prefix}: unexected dtype={img.dtype}.")
+            if val_min != 0.0 or val_max != 1.0:
+                img = (img - val_min) / (val_max - val_min)
+            img = (img * (2**16 - 1)).astype(np.uint16)
+            img = PIL.Image.fromarray(img, mode="I;16")
+            return img
+
+        if depth is None or isinstance(depth, list) and any(o is None for o in depth):
+            raise ValueError("Input depth is `None`")
+        if isinstance(depth, (np.ndarray, torch.Tensor)):
+            depth = MarigoldImageProcessor.expand_tensor_or_array(depth)
+            if isinstance(depth, np.ndarray):
+                depth = MarigoldImageProcessor.numpy_to_pt(depth)  # [N,H,W,1] -> [N,1,H,W]
+            if not (depth.ndim == 4 and depth.shape[1] == 1):
+                raise ValueError(f"Unexpected input shape={depth.shape}, expecting [N,1,H,W].")
+            return [export_depth_to_16bit_png_one(img[0], idx) for idx, img in enumerate(depth)]
+        elif isinstance(depth, list):
+            return [export_depth_to_16bit_png_one(img, idx) for idx, img in enumerate(depth)]
+        else:
+            raise ValueError(f"Unexpected input type: {type(depth)}")
+
+    @staticmethod
+    def visualize_normals(
+        normals: Union[
+            np.ndarray,
+            torch.Tensor,
+            List[np.ndarray],
+            List[torch.Tensor],
+        ],
+        flip_x: bool = False,
+        flip_y: bool = False,
+        flip_z: bool = False,
+    ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
+        """
+        Visualizes surface normals, such as predictions of the `MarigoldNormalsPipeline`.
+
+        Args:
+            normals (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
+                Surface normals.
+            flip_x (`bool`, *optional*, defaults to `False`): Flips the X axis of the normals frame of reference.
+                      Default direction is right.
+            flip_y (`bool`, *optional*, defaults to `False`):  Flips the Y axis of the normals frame of reference.
+                      Default direction is top.
+            flip_z (`bool`, *optional*, defaults to `False`): Flips the Z axis of the normals frame of reference.
+                      Default direction is facing the observer.
+
+        Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with surface normals visualization.
+        """
+        flip_vec = None
+        if any((flip_x, flip_y, flip_z)):
+            flip_vec = torch.tensor(
+                [
+                    (-1) ** flip_x,
+                    (-1) ** flip_y,
+                    (-1) ** flip_z,
+                ],
+                dtype=torch.float32,
+            )
+
+        def visualize_normals_one(img, idx=None):
+            img = img.permute(1, 2, 0)
+            if flip_vec is not None:
+                img *= flip_vec.to(img.device)
+            img = (img + 1.0) * 0.5
+            img = (img * 255).to(dtype=torch.uint8, device="cpu").numpy()
+            img = PIL.Image.fromarray(img)
+            return img
+
+        if normals is None or isinstance(normals, list) and any(o is None for o in normals):
+            raise ValueError("Input normals is `None`")
+        if isinstance(normals, (np.ndarray, torch.Tensor)):
+            normals = MarigoldImageProcessor.expand_tensor_or_array(normals)
+            if isinstance(normals, np.ndarray):
+                normals = MarigoldImageProcessor.numpy_to_pt(normals)  # [N,3,H,W]
+            if not (normals.ndim == 4 and normals.shape[1] == 3):
+                raise ValueError(f"Unexpected input shape={normals.shape}, expecting [N,3,H,W].")
+            return [visualize_normals_one(img, idx) for idx, img in enumerate(normals)]
+        elif isinstance(normals, list):
+            return [visualize_normals_one(img, idx) for idx, img in enumerate(normals)]
+        else:
+            raise ValueError(f"Unexpected input type: {type(normals)}")
+
+    @staticmethod
+    def visualize_uncertainty(
+        uncertainty: Union[
+            np.ndarray,
+            torch.Tensor,
+            List[np.ndarray],
+            List[torch.Tensor],
+        ],
+        saturation_percentile=95,
+    ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
+        """
+        Visualizes dense uncertainties, such as produced by `MarigoldDepthPipeline` or `MarigoldNormalsPipeline`.
+
+        Args:
+            uncertainty (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
+                Uncertainty maps.
+            saturation_percentile (`int`, *optional*, defaults to `95`):
+                Specifies the percentile uncertainty value visualized with maximum intensity.
+
+        Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with uncertainty visualization.
+        """
+
+        def visualize_uncertainty_one(img, idx=None):
+            prefix = "Uncertainty" + (f"[{idx}]" if idx else "")
+            if img.min() < 0:
+                raise ValueError(f"{prefix}: unexected data range, min={img.min()}.")
+            img = img.squeeze(0).cpu().numpy()
+            saturation_value = np.percentile(img, saturation_percentile)
+            img = np.clip(img * 255 / saturation_value, 0, 255)
+            img = img.astype(np.uint8)
+            img = PIL.Image.fromarray(img)
+            return img
+
+        if uncertainty is None or isinstance(uncertainty, list) and any(o is None for o in uncertainty):
+            raise ValueError("Input uncertainty is `None`")
+        if isinstance(uncertainty, (np.ndarray, torch.Tensor)):
+            uncertainty = MarigoldImageProcessor.expand_tensor_or_array(uncertainty)
+            if isinstance(uncertainty, np.ndarray):
+                uncertainty = MarigoldImageProcessor.numpy_to_pt(uncertainty)  # [N,1,H,W]
+            if not (uncertainty.ndim == 4 and uncertainty.shape[1] == 1):
+                raise ValueError(f"Unexpected input shape={uncertainty.shape}, expecting [N,1,H,W].")
+            return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
+        elif isinstance(uncertainty, list):
+            return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
+        else:
+            raise ValueError(f"Unexpected input type: {type(uncertainty)}")
diff --git a/diffusers/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py b/diffusers/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..a602ba611ea5a860f783aae7a789b7c90f417667
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
@@ -0,0 +1,813 @@
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# More information and citation instructions are available on the
+# Marigold project website: https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import PipelineImageInput
+from ...models import (
+    AutoencoderKL,
+    UNet2DConditionModel,
+)
+from ...schedulers import (
+    DDIMScheduler,
+    LCMScheduler,
+)
+from ...utils import (
+    BaseOutput,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.import_utils import is_scipy_available
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .marigold_image_processing import MarigoldImageProcessor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+Examples:
+```py
+>>> import diffusers
+>>> import torch
+
+>>> pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+...     "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
+... ).to("cuda")
+
+>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+>>> depth = pipe(image)
+
+>>> vis = pipe.image_processor.visualize_depth(depth.prediction)
+>>> vis[0].save("einstein_depth.png")
+
+>>> depth_16bit = pipe.image_processor.export_depth_to_16bit_png(depth.prediction)
+>>> depth_16bit[0].save("einstein_depth_16bit.png")
+```
+"""
+
+
+@dataclass
+class MarigoldDepthOutput(BaseOutput):
+    """
+    Output class for Marigold monocular depth prediction pipeline.
+
+    Args:
+        prediction (`np.ndarray`, `torch.Tensor`):
+            Predicted depth maps with values in the range [0, 1]. The shape is always $numimages \times 1 \times height
+            \times width$, regardless of whether the images were passed as a 4D array or a list.
+        uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
+            \times 1 \times height \times width$.
+        latent (`None`, `torch.Tensor`):
+            Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
+            The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
+    """
+
+    prediction: Union[np.ndarray, torch.Tensor]
+    uncertainty: Union[None, np.ndarray, torch.Tensor]
+    latent: Union[None, torch.Tensor]
+
+
+class MarigoldDepthPipeline(DiffusionPipeline):
+    """
+    Pipeline for monocular depth estimation using the Marigold method: https://marigoldmonodepth.github.io.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        unet (`UNet2DConditionModel`):
+            Conditional U-Net to denoise the depth latent, conditioned on image latent.
+        vae (`AutoencoderKL`):
+            Variational Auto-Encoder (VAE) Model to encode and decode images and predictions to and from latent
+            representations.
+        scheduler (`DDIMScheduler` or `LCMScheduler`):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+        text_encoder (`CLIPTextModel`):
+            Text-encoder, for empty text embedding.
+        tokenizer (`CLIPTokenizer`):
+            CLIP tokenizer.
+        prediction_type (`str`, *optional*):
+            Type of predictions made by the model.
+        scale_invariant (`bool`, *optional*):
+            A model property specifying whether the predicted depth maps are scale-invariant. This value must be set in
+            the model config. When used together with the `shift_invariant=True` flag, the model is also called
+            "affine-invariant". NB: overriding this value is not supported.
+        shift_invariant (`bool`, *optional*):
+            A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
+            the model config. When used together with the `scale_invariant=True` flag, the model is also called
+            "affine-invariant". NB: overriding this value is not supported.
+        default_denoising_steps (`int`, *optional*):
+            The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
+            quality with the given model. This value must be set in the model config. When the pipeline is called
+            without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
+            reasonable results with various model flavors compatible with the pipeline, such as those relying on very
+            short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
+        default_processing_resolution (`int`, *optional*):
+            The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
+            the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
+            default value is used. This is required to ensure reasonable results with various model flavors trained
+            with varying optimal processing resolution values.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    supported_prediction_types = ("depth", "disparity")
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        scheduler: Union[DDIMScheduler, LCMScheduler],
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        prediction_type: Optional[str] = None,
+        scale_invariant: Optional[bool] = True,
+        shift_invariant: Optional[bool] = True,
+        default_denoising_steps: Optional[int] = None,
+        default_processing_resolution: Optional[int] = None,
+    ):
+        super().__init__()
+
+        if prediction_type not in self.supported_prediction_types:
+            logger.warning(
+                f"Potentially unsupported `prediction_type='{prediction_type}'`; values supported by the pipeline: "
+                f"{self.supported_prediction_types}."
+            )
+
+        self.register_modules(
+            unet=unet,
+            vae=vae,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+        self.register_to_config(
+            prediction_type=prediction_type,
+            scale_invariant=scale_invariant,
+            shift_invariant=shift_invariant,
+            default_denoising_steps=default_denoising_steps,
+            default_processing_resolution=default_processing_resolution,
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+        self.scale_invariant = scale_invariant
+        self.shift_invariant = shift_invariant
+        self.default_denoising_steps = default_denoising_steps
+        self.default_processing_resolution = default_processing_resolution
+
+        self.empty_text_embedding = None
+
+        self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    def check_inputs(
+        self,
+        image: PipelineImageInput,
+        num_inference_steps: int,
+        ensemble_size: int,
+        processing_resolution: int,
+        resample_method_input: str,
+        resample_method_output: str,
+        batch_size: int,
+        ensembling_kwargs: Optional[Dict[str, Any]],
+        latents: Optional[torch.Tensor],
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+        output_type: str,
+        output_uncertainty: bool,
+    ) -> int:
+        if num_inference_steps is None:
+            raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
+        if num_inference_steps < 1:
+            raise ValueError("`num_inference_steps` must be positive.")
+        if ensemble_size < 1:
+            raise ValueError("`ensemble_size` must be positive.")
+        if ensemble_size == 2:
+            logger.warning(
+                "`ensemble_size` == 2 results are similar to no ensembling (1); "
+                "consider increasing the value to at least 3."
+            )
+        if ensemble_size > 1 and (self.scale_invariant or self.shift_invariant) and not is_scipy_available():
+            raise ImportError("Make sure to install scipy if you want to use ensembling.")
+        if ensemble_size == 1 and output_uncertainty:
+            raise ValueError(
+                "Computing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` "
+                "greater than 1."
+            )
+        if processing_resolution is None:
+            raise ValueError(
+                "`processing_resolution` is not specified and could not be resolved from the model config."
+            )
+        if processing_resolution < 0:
+            raise ValueError(
+                "`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
+                "downsampled processing."
+            )
+        if processing_resolution % self.vae_scale_factor != 0:
+            raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
+        if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+            raise ValueError(
+                "`resample_method_input` takes string values compatible with PIL library: "
+                "nearest, nearest-exact, bilinear, bicubic, area."
+            )
+        if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+            raise ValueError(
+                "`resample_method_output` takes string values compatible with PIL library: "
+                "nearest, nearest-exact, bilinear, bicubic, area."
+            )
+        if batch_size < 1:
+            raise ValueError("`batch_size` must be positive.")
+        if output_type not in ["pt", "np"]:
+            raise ValueError("`output_type` must be one of `pt` or `np`.")
+        if latents is not None and generator is not None:
+            raise ValueError("`latents` and `generator` cannot be used together.")
+        if ensembling_kwargs is not None:
+            if not isinstance(ensembling_kwargs, dict):
+                raise ValueError("`ensembling_kwargs` must be a dictionary.")
+            if "reduction" in ensembling_kwargs and ensembling_kwargs["reduction"] not in ("mean", "median"):
+                raise ValueError("`ensembling_kwargs['reduction']` can be either `'mean'` or `'median'`.")
+
+        # image checks
+        num_images = 0
+        W, H = None, None
+        if not isinstance(image, list):
+            image = [image]
+        for i, img in enumerate(image):
+            if isinstance(img, np.ndarray) or torch.is_tensor(img):
+                if img.ndim not in (2, 3, 4):
+                    raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
+                H_i, W_i = img.shape[-2:]
+                N_i = 1
+                if img.ndim == 4:
+                    N_i = img.shape[0]
+            elif isinstance(img, Image.Image):
+                W_i, H_i = img.size
+                N_i = 1
+            else:
+                raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
+            if W is None:
+                W, H = W_i, H_i
+            elif (W, H) != (W_i, H_i):
+                raise ValueError(
+                    f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
+                )
+            num_images += N_i
+
+        # latents checks
+        if latents is not None:
+            if not torch.is_tensor(latents):
+                raise ValueError("`latents` must be a torch.Tensor.")
+            if latents.dim() != 4:
+                raise ValueError(f"`latents` has unsupported dimensions or shape: {latents.shape}.")
+
+            if processing_resolution > 0:
+                max_orig = max(H, W)
+                new_H = H * processing_resolution // max_orig
+                new_W = W * processing_resolution // max_orig
+                if new_H == 0 or new_W == 0:
+                    raise ValueError(f"Extreme aspect ratio of the input image: [{W} x {H}]")
+                W, H = new_W, new_H
+            w = (W + self.vae_scale_factor - 1) // self.vae_scale_factor
+            h = (H + self.vae_scale_factor - 1) // self.vae_scale_factor
+            shape_expected = (num_images * ensemble_size, self.vae.config.latent_channels, h, w)
+
+            if latents.shape != shape_expected:
+                raise ValueError(f"`latents` has unexpected shape={latents.shape} expected={shape_expected}.")
+
+        # generator checks
+        if generator is not None:
+            if isinstance(generator, list):
+                if len(generator) != num_images * ensemble_size:
+                    raise ValueError(
+                        "The number of generators must match the total number of ensemble members for all input images."
+                    )
+                if not all(g.device.type == generator[0].device.type for g in generator):
+                    raise ValueError("`generator` device placement is not consistent in the list.")
+            elif not isinstance(generator, torch.Generator):
+                raise ValueError(f"Unsupported generator type: {type(generator)}.")
+
+        return num_images
+
+    def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
+        if not hasattr(self, "_progress_bar_config"):
+            self._progress_bar_config = {}
+        elif not isinstance(self._progress_bar_config, dict):
+            raise ValueError(
+                f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
+            )
+
+        progress_bar_config = dict(**self._progress_bar_config)
+        progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
+        progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
+        if iterable is not None:
+            return tqdm(iterable, **progress_bar_config)
+        elif total is not None:
+            return tqdm(total=total, **progress_bar_config)
+        else:
+            raise ValueError("Either `total` or `iterable` has to be defined.")
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        num_inference_steps: Optional[int] = None,
+        ensemble_size: int = 1,
+        processing_resolution: Optional[int] = None,
+        match_input_resolution: bool = True,
+        resample_method_input: str = "bilinear",
+        resample_method_output: str = "bilinear",
+        batch_size: int = 1,
+        ensembling_kwargs: Optional[Dict[str, Any]] = None,
+        latents: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: str = "np",
+        output_uncertainty: bool = False,
+        output_latent: bool = False,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline.
+
+        Args:
+            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`),
+                `List[torch.Tensor]`: An input image or images used as an input for the depth estimation task. For
+                arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is possible
+                by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
+                three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
+                same width and height.
+            num_inference_steps (`int`, *optional*, defaults to `None`):
+                Number of denoising diffusion steps during inference. The default value `None` results in automatic
+                selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
+                for Marigold-LCM models.
+            ensemble_size (`int`, defaults to `1`):
+                Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
+                faster inference.
+            processing_resolution (`int`, *optional*, defaults to `None`):
+                Effective processing resolution. When set to `0`, matches the larger input image dimension. This
+                produces crisper predictions, but may also lead to the overall loss of global context. The default
+                value `None` resolves to the optimal value from the model config.
+            match_input_resolution (`bool`, *optional*, defaults to `True`):
+                When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
+                side of the output will equal to `processing_resolution`.
+            resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
+                Resampling method used to resize input images to `processing_resolution`. The accepted values are:
+                `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+            resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
+                Resampling method used to resize output predictions to match the input resolution. The accepted values
+                are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+            batch_size (`int`, *optional*, defaults to `1`):
+                Batch size; only matters when setting `ensemble_size` or passing a tensor of images.
+            ensembling_kwargs (`dict`, *optional*, defaults to `None`)
+                Extra dictionary with arguments for precise ensembling control. The following options are available:
+                - reduction (`str`, *optional*, defaults to `"median"`): Defines the ensembling function applied in
+                  every pixel location, can be either `"median"` or `"mean"`.
+                - regularizer_strength (`float`, *optional*, defaults to `0.02`): Strength of the regularizer that
+                  pulls the aligned predictions to the unit range from 0 to 1.
+                - max_iter (`int`, *optional*, defaults to `2`): Maximum number of the alignment solver steps. Refer to
+                  `scipy.optimize.minimize` function, `options` argument.
+                - tol (`float`, *optional*, defaults to `1e-3`): Alignment solver tolerance. The solver stops when the
+                  tolerance is reached.
+                - max_res (`int`, *optional*, defaults to `None`): Resolution at which the alignment is performed;
+                  `None` matches the `processing_resolution`.
+            latents (`torch.Tensor`, or `List[torch.Tensor]`, *optional*, defaults to `None`):
+                Latent noise tensors to replace the random initialization. These can be taken from the previous
+                function call's output.
+            generator (`torch.Generator`, or `List[torch.Generator]`, *optional*, defaults to `None`):
+                Random number generator object to ensure reproducibility.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
+                values are: `"np"` (numpy array) or `"pt"` (torch tensor).
+            output_uncertainty (`bool`, *optional*, defaults to `False`):
+                When enabled, the output's `uncertainty` field contains the predictive uncertainty map, provided that
+                the `ensemble_size` argument is set to a value above 2.
+            output_latent (`bool`, *optional*, defaults to `False`):
+                When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
+                within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
+                `latents` argument.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.marigold.MarigoldDepthOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.marigold.MarigoldDepthOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.marigold.MarigoldDepthOutput`] is returned, otherwise a
+                `tuple` is returned where the first element is the prediction, the second element is the uncertainty
+                (or `None`), and the third is the latent (or `None`).
+        """
+
+        # 0. Resolving variables.
+        device = self._execution_device
+        dtype = self.dtype
+
+        # Model-specific optimal default values leading to fast and reasonable results.
+        if num_inference_steps is None:
+            num_inference_steps = self.default_denoising_steps
+        if processing_resolution is None:
+            processing_resolution = self.default_processing_resolution
+
+        # 1. Check inputs.
+        num_images = self.check_inputs(
+            image,
+            num_inference_steps,
+            ensemble_size,
+            processing_resolution,
+            resample_method_input,
+            resample_method_output,
+            batch_size,
+            ensembling_kwargs,
+            latents,
+            generator,
+            output_type,
+            output_uncertainty,
+        )
+
+        # 2. Prepare empty text conditioning.
+        # Model invocation: self.tokenizer, self.text_encoder.
+        if self.empty_text_embedding is None:
+            prompt = ""
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="do_not_pad",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids.to(device)
+            self.empty_text_embedding = self.text_encoder(text_input_ids)[0]  # [1,2,1024]
+
+        # 3. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
+        # optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
+        # `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
+        # divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
+        # of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
+        # operation and leads to the most reasonable results. Using the native image resolution or any other processing
+        # resolution can lead to loss of either fine details or global context in the output predictions.
+        image, padding, original_resolution = self.image_processor.preprocess(
+            image, processing_resolution, resample_method_input, device, dtype
+        )  # [N,3,PPH,PPW]
+
+        # 4. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
+        # ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
+        # Latents of each such predictions across all input images and all ensemble members are represented in the
+        # `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
+        # into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
+        # reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
+        # code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
+        # as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
+        # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
+        # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
+        # Model invocation: self.vae.encoder.
+        image_latent, pred_latent = self.prepare_latents(
+            image, latents, generator, ensemble_size, batch_size
+        )  # [N*E,4,h,w], [N*E,4,h,w]
+
+        del image
+
+        batch_empty_text_embedding = self.empty_text_embedding.to(device=device, dtype=dtype).repeat(
+            batch_size, 1, 1
+        )  # [B,1024,2]
+
+        # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
+        # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
+        # outputs noise for the predicted modality's latent space. The number of denoising diffusion steps is defined by
+        # `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
+        # model.
+        # Model invocation: self.unet.
+        pred_latents = []
+
+        for i in self.progress_bar(
+            range(0, num_images * ensemble_size, batch_size), leave=True, desc="Marigold predictions..."
+        ):
+            batch_image_latent = image_latent[i : i + batch_size]  # [B,4,h,w]
+            batch_pred_latent = pred_latent[i : i + batch_size]  # [B,4,h,w]
+            effective_batch_size = batch_image_latent.shape[0]
+            text = batch_empty_text_embedding[:effective_batch_size]  # [B,2,1024]
+
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            for t in self.progress_bar(self.scheduler.timesteps, leave=False, desc="Diffusion steps..."):
+                batch_latent = torch.cat([batch_image_latent, batch_pred_latent], dim=1)  # [B,8,h,w]
+                noise = self.unet(batch_latent, t, encoder_hidden_states=text, return_dict=False)[0]  # [B,4,h,w]
+                batch_pred_latent = self.scheduler.step(
+                    noise, t, batch_pred_latent, generator=generator
+                ).prev_sample  # [B,4,h,w]
+
+            pred_latents.append(batch_pred_latent)
+
+        pred_latent = torch.cat(pred_latents, dim=0)  # [N*E,4,h,w]
+
+        del (
+            pred_latents,
+            image_latent,
+            batch_empty_text_embedding,
+            batch_image_latent,
+            batch_pred_latent,
+            text,
+            batch_latent,
+            noise,
+        )
+
+        # 6. Decode predictions from latent into pixel space. The resulting `N * E` predictions have shape `(PPH, PPW)`,
+        # which requires slight postprocessing. Decoding into pixel space happens in batches of size `batch_size`.
+        # Model invocation: self.vae.decoder.
+        prediction = torch.cat(
+            [
+                self.decode_prediction(pred_latent[i : i + batch_size])
+                for i in range(0, pred_latent.shape[0], batch_size)
+            ],
+            dim=0,
+        )  # [N*E,1,PPH,PPW]
+
+        if not output_latent:
+            pred_latent = None
+
+        # 7. Remove padding. The output shape is (PH, PW).
+        prediction = self.image_processor.unpad_image(prediction, padding)  # [N*E,1,PH,PW]
+
+        # 8. Ensemble and compute uncertainty (when `output_uncertainty` is set). This code treats each of the `N`
+        # groups of `E` ensemble predictions independently. For each group it computes an ensembled prediction of shape
+        # `(PH, PW)` and an optional uncertainty map of the same dimensions. After computing this pair of outputs for
+        # each group independently, it stacks them respectively into batches of `N` almost final predictions and
+        # uncertainty maps.
+        uncertainty = None
+        if ensemble_size > 1:
+            prediction = prediction.reshape(num_images, ensemble_size, *prediction.shape[1:])  # [N,E,1,PH,PW]
+            prediction = [
+                self.ensemble_depth(
+                    prediction[i],
+                    self.scale_invariant,
+                    self.shift_invariant,
+                    output_uncertainty,
+                    **(ensembling_kwargs or {}),
+                )
+                for i in range(num_images)
+            ]  # [ [[1,1,PH,PW], [1,1,PH,PW]], ... ]
+            prediction, uncertainty = zip(*prediction)  # [[1,1,PH,PW], ... ], [[1,1,PH,PW], ... ]
+            prediction = torch.cat(prediction, dim=0)  # [N,1,PH,PW]
+            if output_uncertainty:
+                uncertainty = torch.cat(uncertainty, dim=0)  # [N,1,PH,PW]
+            else:
+                uncertainty = None
+
+        # 9. If `match_input_resolution` is set, the output prediction and the uncertainty are upsampled to match the
+        # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
+        # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
+        # setting the `resample_method_output` parameter (e.g., to `"nearest"`).
+        if match_input_resolution:
+            prediction = self.image_processor.resize_antialias(
+                prediction, original_resolution, resample_method_output, is_aa=False
+            )  # [N,1,H,W]
+            if uncertainty is not None and output_uncertainty:
+                uncertainty = self.image_processor.resize_antialias(
+                    uncertainty, original_resolution, resample_method_output, is_aa=False
+                )  # [N,1,H,W]
+
+        # 10. Prepare the final outputs.
+        if output_type == "np":
+            prediction = self.image_processor.pt_to_numpy(prediction)  # [N,H,W,1]
+            if uncertainty is not None and output_uncertainty:
+                uncertainty = self.image_processor.pt_to_numpy(uncertainty)  # [N,H,W,1]
+
+        # 11. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (prediction, uncertainty, pred_latent)
+
+        return MarigoldDepthOutput(
+            prediction=prediction,
+            uncertainty=uncertainty,
+            latent=pred_latent,
+        )
+
+    def prepare_latents(
+        self,
+        image: torch.Tensor,
+        latents: Optional[torch.Tensor],
+        generator: Optional[torch.Generator],
+        ensemble_size: int,
+        batch_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        def retrieve_latents(encoder_output):
+            if hasattr(encoder_output, "latent_dist"):
+                return encoder_output.latent_dist.mode()
+            elif hasattr(encoder_output, "latents"):
+                return encoder_output.latents
+            else:
+                raise AttributeError("Could not access latents of provided encoder_output")
+
+        image_latent = torch.cat(
+            [
+                retrieve_latents(self.vae.encode(image[i : i + batch_size]))
+                for i in range(0, image.shape[0], batch_size)
+            ],
+            dim=0,
+        )  # [N,4,h,w]
+        image_latent = image_latent * self.vae.config.scaling_factor
+        image_latent = image_latent.repeat_interleave(ensemble_size, dim=0)  # [N*E,4,h,w]
+
+        pred_latent = latents
+        if pred_latent is None:
+            pred_latent = randn_tensor(
+                image_latent.shape,
+                generator=generator,
+                device=image_latent.device,
+                dtype=image_latent.dtype,
+            )  # [N*E,4,h,w]
+
+        return image_latent, pred_latent
+
+    def decode_prediction(self, pred_latent: torch.Tensor) -> torch.Tensor:
+        if pred_latent.dim() != 4 or pred_latent.shape[1] != self.vae.config.latent_channels:
+            raise ValueError(
+                f"Expecting 4D tensor of shape [B,{self.vae.config.latent_channels},H,W]; got {pred_latent.shape}."
+            )
+
+        prediction = self.vae.decode(pred_latent / self.vae.config.scaling_factor, return_dict=False)[0]  # [B,3,H,W]
+
+        prediction = prediction.mean(dim=1, keepdim=True)  # [B,1,H,W]
+        prediction = torch.clip(prediction, -1.0, 1.0)  # [B,1,H,W]
+        prediction = (prediction + 1.0) / 2.0
+
+        return prediction  # [B,1,H,W]
+
+    @staticmethod
+    def ensemble_depth(
+        depth: torch.Tensor,
+        scale_invariant: bool = True,
+        shift_invariant: bool = True,
+        output_uncertainty: bool = False,
+        reduction: str = "median",
+        regularizer_strength: float = 0.02,
+        max_iter: int = 2,
+        tol: float = 1e-3,
+        max_res: int = 1024,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Ensembles the depth maps represented by the `depth` tensor with expected shape `(B, 1, H, W)`, where B is the
+        number of ensemble members for a given prediction of size `(H x W)`. Even though the function is designed for
+        depth maps, it can also be used with disparity maps as long as the input tensor values are non-negative. The
+        alignment happens when the predictions have one or more degrees of freedom, that is when they are either
+        affine-invariant (`scale_invariant=True` and `shift_invariant=True`), or just scale-invariant (only
+        `scale_invariant=True`). For absolute predictions (`scale_invariant=False` and `shift_invariant=False`)
+        alignment is skipped and only ensembling is performed.
+
+        Args:
+            depth (`torch.Tensor`):
+                Input ensemble depth maps.
+            scale_invariant (`bool`, *optional*, defaults to `True`):
+                Whether to treat predictions as scale-invariant.
+            shift_invariant (`bool`, *optional*, defaults to `True`):
+                Whether to treat predictions as shift-invariant.
+            output_uncertainty (`bool`, *optional*, defaults to `False`):
+                Whether to output uncertainty map.
+            reduction (`str`, *optional*, defaults to `"median"`):
+                Reduction method used to ensemble aligned predictions. The accepted values are: `"mean"` and
+                `"median"`.
+            regularizer_strength (`float`, *optional*, defaults to `0.02`):
+                Strength of the regularizer that pulls the aligned predictions to the unit range from 0 to 1.
+            max_iter (`int`, *optional*, defaults to `2`):
+                Maximum number of the alignment solver steps. Refer to `scipy.optimize.minimize` function, `options`
+                argument.
+            tol (`float`, *optional*, defaults to `1e-3`):
+                Alignment solver tolerance. The solver stops when the tolerance is reached.
+            max_res (`int`, *optional*, defaults to `1024`):
+                Resolution at which the alignment is performed; `None` matches the `processing_resolution`.
+        Returns:
+            A tensor of aligned and ensembled depth maps and optionally a tensor of uncertainties of the same shape:
+            `(1, 1, H, W)`.
+        """
+        if depth.dim() != 4 or depth.shape[1] != 1:
+            raise ValueError(f"Expecting 4D tensor of shape [B,1,H,W]; got {depth.shape}.")
+        if reduction not in ("mean", "median"):
+            raise ValueError(f"Unrecognized reduction method: {reduction}.")
+        if not scale_invariant and shift_invariant:
+            raise ValueError("Pure shift-invariant ensembling is not supported.")
+
+        def init_param(depth: torch.Tensor):
+            init_min = depth.reshape(ensemble_size, -1).min(dim=1).values
+            init_max = depth.reshape(ensemble_size, -1).max(dim=1).values
+
+            if scale_invariant and shift_invariant:
+                init_s = 1.0 / (init_max - init_min).clamp(min=1e-6)
+                init_t = -init_s * init_min
+                param = torch.cat((init_s, init_t)).cpu().numpy()
+            elif scale_invariant:
+                init_s = 1.0 / init_max.clamp(min=1e-6)
+                param = init_s.cpu().numpy()
+            else:
+                raise ValueError("Unrecognized alignment.")
+
+            return param
+
+        def align(depth: torch.Tensor, param: np.ndarray) -> torch.Tensor:
+            if scale_invariant and shift_invariant:
+                s, t = np.split(param, 2)
+                s = torch.from_numpy(s).to(depth).view(ensemble_size, 1, 1, 1)
+                t = torch.from_numpy(t).to(depth).view(ensemble_size, 1, 1, 1)
+                out = depth * s + t
+            elif scale_invariant:
+                s = torch.from_numpy(param).to(depth).view(ensemble_size, 1, 1, 1)
+                out = depth * s
+            else:
+                raise ValueError("Unrecognized alignment.")
+            return out
+
+        def ensemble(
+            depth_aligned: torch.Tensor, return_uncertainty: bool = False
+        ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+            uncertainty = None
+            if reduction == "mean":
+                prediction = torch.mean(depth_aligned, dim=0, keepdim=True)
+                if return_uncertainty:
+                    uncertainty = torch.std(depth_aligned, dim=0, keepdim=True)
+            elif reduction == "median":
+                prediction = torch.median(depth_aligned, dim=0, keepdim=True).values
+                if return_uncertainty:
+                    uncertainty = torch.median(torch.abs(depth_aligned - prediction), dim=0, keepdim=True).values
+            else:
+                raise ValueError(f"Unrecognized reduction method: {reduction}.")
+            return prediction, uncertainty
+
+        def cost_fn(param: np.ndarray, depth: torch.Tensor) -> float:
+            cost = 0.0
+            depth_aligned = align(depth, param)
+
+            for i, j in torch.combinations(torch.arange(ensemble_size)):
+                diff = depth_aligned[i] - depth_aligned[j]
+                cost += (diff**2).mean().sqrt().item()
+
+            if regularizer_strength > 0:
+                prediction, _ = ensemble(depth_aligned, return_uncertainty=False)
+                err_near = (0.0 - prediction.min()).abs().item()
+                err_far = (1.0 - prediction.max()).abs().item()
+                cost += (err_near + err_far) * regularizer_strength
+
+            return cost
+
+        def compute_param(depth: torch.Tensor):
+            import scipy
+
+            depth_to_align = depth.to(torch.float32)
+            if max_res is not None and max(depth_to_align.shape[2:]) > max_res:
+                depth_to_align = MarigoldImageProcessor.resize_to_max_edge(depth_to_align, max_res, "nearest-exact")
+
+            param = init_param(depth_to_align)
+
+            res = scipy.optimize.minimize(
+                partial(cost_fn, depth=depth_to_align),
+                param,
+                method="BFGS",
+                tol=tol,
+                options={"maxiter": max_iter, "disp": False},
+            )
+
+            return res.x
+
+        requires_aligning = scale_invariant or shift_invariant
+        ensemble_size = depth.shape[0]
+
+        if requires_aligning:
+            param = compute_param(depth)
+            depth = align(depth, param)
+
+        depth, uncertainty = ensemble(depth, return_uncertainty=output_uncertainty)
+
+        depth_max = depth.max()
+        if scale_invariant and shift_invariant:
+            depth_min = depth.min()
+        elif scale_invariant:
+            depth_min = 0
+        else:
+            raise ValueError("Unrecognized alignment.")
+        depth_range = (depth_max - depth_min).clamp(min=1e-6)
+        depth = (depth - depth_min) / depth_range
+        if output_uncertainty:
+            uncertainty /= depth_range
+
+        return depth, uncertainty  # [1,1,H,W], [1,1,H,W]
diff --git a/diffusers/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py b/diffusers/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa9ad36ffc35614c889165152ecaea443773cfb8
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
@@ -0,0 +1,690 @@
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# More information and citation instructions are available on the
+# Marigold project website: https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import PipelineImageInput
+from ...models import (
+    AutoencoderKL,
+    UNet2DConditionModel,
+)
+from ...schedulers import (
+    DDIMScheduler,
+    LCMScheduler,
+)
+from ...utils import (
+    BaseOutput,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .marigold_image_processing import MarigoldImageProcessor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+Examples:
+```py
+>>> import diffusers
+>>> import torch
+
+>>> pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
+...     "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
+... ).to("cuda")
+
+>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+>>> normals = pipe(image)
+
+>>> vis = pipe.image_processor.visualize_normals(normals.prediction)
+>>> vis[0].save("einstein_normals.png")
+```
+"""
+
+
+@dataclass
+class MarigoldNormalsOutput(BaseOutput):
+    """
+    Output class for Marigold monocular normals prediction pipeline.
+
+    Args:
+        prediction (`np.ndarray`, `torch.Tensor`):
+            Predicted normals with values in the range [-1, 1]. The shape is always $numimages \times 3 \times height
+            \times width$, regardless of whether the images were passed as a 4D array or a list.
+        uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
+            \times 1 \times height \times width$.
+        latent (`None`, `torch.Tensor`):
+            Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
+            The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
+    """
+
+    prediction: Union[np.ndarray, torch.Tensor]
+    uncertainty: Union[None, np.ndarray, torch.Tensor]
+    latent: Union[None, torch.Tensor]
+
+
+class MarigoldNormalsPipeline(DiffusionPipeline):
+    """
+    Pipeline for monocular normals estimation using the Marigold method: https://marigoldmonodepth.github.io.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        unet (`UNet2DConditionModel`):
+            Conditional U-Net to denoise the normals latent, conditioned on image latent.
+        vae (`AutoencoderKL`):
+            Variational Auto-Encoder (VAE) Model to encode and decode images and predictions to and from latent
+            representations.
+        scheduler (`DDIMScheduler` or `LCMScheduler`):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+        text_encoder (`CLIPTextModel`):
+            Text-encoder, for empty text embedding.
+        tokenizer (`CLIPTokenizer`):
+            CLIP tokenizer.
+        prediction_type (`str`, *optional*):
+            Type of predictions made by the model.
+        use_full_z_range (`bool`, *optional*):
+            Whether the normals predicted by this model utilize the full range of the Z dimension, or only its positive
+            half.
+        default_denoising_steps (`int`, *optional*):
+            The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
+            quality with the given model. This value must be set in the model config. When the pipeline is called
+            without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
+            reasonable results with various model flavors compatible with the pipeline, such as those relying on very
+            short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
+        default_processing_resolution (`int`, *optional*):
+            The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
+            the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
+            default value is used. This is required to ensure reasonable results with various model flavors trained
+            with varying optimal processing resolution values.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    supported_prediction_types = ("normals",)
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        scheduler: Union[DDIMScheduler, LCMScheduler],
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        prediction_type: Optional[str] = None,
+        use_full_z_range: Optional[bool] = True,
+        default_denoising_steps: Optional[int] = None,
+        default_processing_resolution: Optional[int] = None,
+    ):
+        super().__init__()
+
+        if prediction_type not in self.supported_prediction_types:
+            logger.warning(
+                f"Potentially unsupported `prediction_type='{prediction_type}'`; values supported by the pipeline: "
+                f"{self.supported_prediction_types}."
+            )
+
+        self.register_modules(
+            unet=unet,
+            vae=vae,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+        self.register_to_config(
+            use_full_z_range=use_full_z_range,
+            default_denoising_steps=default_denoising_steps,
+            default_processing_resolution=default_processing_resolution,
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+        self.use_full_z_range = use_full_z_range
+        self.default_denoising_steps = default_denoising_steps
+        self.default_processing_resolution = default_processing_resolution
+
+        self.empty_text_embedding = None
+
+        self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    def check_inputs(
+        self,
+        image: PipelineImageInput,
+        num_inference_steps: int,
+        ensemble_size: int,
+        processing_resolution: int,
+        resample_method_input: str,
+        resample_method_output: str,
+        batch_size: int,
+        ensembling_kwargs: Optional[Dict[str, Any]],
+        latents: Optional[torch.Tensor],
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+        output_type: str,
+        output_uncertainty: bool,
+    ) -> int:
+        if num_inference_steps is None:
+            raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
+        if num_inference_steps < 1:
+            raise ValueError("`num_inference_steps` must be positive.")
+        if ensemble_size < 1:
+            raise ValueError("`ensemble_size` must be positive.")
+        if ensemble_size == 2:
+            logger.warning(
+                "`ensemble_size` == 2 results are similar to no ensembling (1); "
+                "consider increasing the value to at least 3."
+            )
+        if ensemble_size == 1 and output_uncertainty:
+            raise ValueError(
+                "Computing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` "
+                "greater than 1."
+            )
+        if processing_resolution is None:
+            raise ValueError(
+                "`processing_resolution` is not specified and could not be resolved from the model config."
+            )
+        if processing_resolution < 0:
+            raise ValueError(
+                "`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
+                "downsampled processing."
+            )
+        if processing_resolution % self.vae_scale_factor != 0:
+            raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
+        if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+            raise ValueError(
+                "`resample_method_input` takes string values compatible with PIL library: "
+                "nearest, nearest-exact, bilinear, bicubic, area."
+            )
+        if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+            raise ValueError(
+                "`resample_method_output` takes string values compatible with PIL library: "
+                "nearest, nearest-exact, bilinear, bicubic, area."
+            )
+        if batch_size < 1:
+            raise ValueError("`batch_size` must be positive.")
+        if output_type not in ["pt", "np"]:
+            raise ValueError("`output_type` must be one of `pt` or `np`.")
+        if latents is not None and generator is not None:
+            raise ValueError("`latents` and `generator` cannot be used together.")
+        if ensembling_kwargs is not None:
+            if not isinstance(ensembling_kwargs, dict):
+                raise ValueError("`ensembling_kwargs` must be a dictionary.")
+            if "reduction" in ensembling_kwargs and ensembling_kwargs["reduction"] not in ("closest", "mean"):
+                raise ValueError("`ensembling_kwargs['reduction']` can be either `'closest'` or `'mean'`.")
+
+        # image checks
+        num_images = 0
+        W, H = None, None
+        if not isinstance(image, list):
+            image = [image]
+        for i, img in enumerate(image):
+            if isinstance(img, np.ndarray) or torch.is_tensor(img):
+                if img.ndim not in (2, 3, 4):
+                    raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
+                H_i, W_i = img.shape[-2:]
+                N_i = 1
+                if img.ndim == 4:
+                    N_i = img.shape[0]
+            elif isinstance(img, Image.Image):
+                W_i, H_i = img.size
+                N_i = 1
+            else:
+                raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
+            if W is None:
+                W, H = W_i, H_i
+            elif (W, H) != (W_i, H_i):
+                raise ValueError(
+                    f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
+                )
+            num_images += N_i
+
+        # latents checks
+        if latents is not None:
+            if not torch.is_tensor(latents):
+                raise ValueError("`latents` must be a torch.Tensor.")
+            if latents.dim() != 4:
+                raise ValueError(f"`latents` has unsupported dimensions or shape: {latents.shape}.")
+
+            if processing_resolution > 0:
+                max_orig = max(H, W)
+                new_H = H * processing_resolution // max_orig
+                new_W = W * processing_resolution // max_orig
+                if new_H == 0 or new_W == 0:
+                    raise ValueError(f"Extreme aspect ratio of the input image: [{W} x {H}]")
+                W, H = new_W, new_H
+            w = (W + self.vae_scale_factor - 1) // self.vae_scale_factor
+            h = (H + self.vae_scale_factor - 1) // self.vae_scale_factor
+            shape_expected = (num_images * ensemble_size, self.vae.config.latent_channels, h, w)
+
+            if latents.shape != shape_expected:
+                raise ValueError(f"`latents` has unexpected shape={latents.shape} expected={shape_expected}.")
+
+        # generator checks
+        if generator is not None:
+            if isinstance(generator, list):
+                if len(generator) != num_images * ensemble_size:
+                    raise ValueError(
+                        "The number of generators must match the total number of ensemble members for all input images."
+                    )
+                if not all(g.device.type == generator[0].device.type for g in generator):
+                    raise ValueError("`generator` device placement is not consistent in the list.")
+            elif not isinstance(generator, torch.Generator):
+                raise ValueError(f"Unsupported generator type: {type(generator)}.")
+
+        return num_images
+
+    def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
+        if not hasattr(self, "_progress_bar_config"):
+            self._progress_bar_config = {}
+        elif not isinstance(self._progress_bar_config, dict):
+            raise ValueError(
+                f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
+            )
+
+        progress_bar_config = dict(**self._progress_bar_config)
+        progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
+        progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
+        if iterable is not None:
+            return tqdm(iterable, **progress_bar_config)
+        elif total is not None:
+            return tqdm(total=total, **progress_bar_config)
+        else:
+            raise ValueError("Either `total` or `iterable` has to be defined.")
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        num_inference_steps: Optional[int] = None,
+        ensemble_size: int = 1,
+        processing_resolution: Optional[int] = None,
+        match_input_resolution: bool = True,
+        resample_method_input: str = "bilinear",
+        resample_method_output: str = "bilinear",
+        batch_size: int = 1,
+        ensembling_kwargs: Optional[Dict[str, Any]] = None,
+        latents: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: str = "np",
+        output_uncertainty: bool = False,
+        output_latent: bool = False,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline.
+
+        Args:
+            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`),
+                `List[torch.Tensor]`: An input image or images used as an input for the normals estimation task. For
+                arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is possible
+                by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
+                three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
+                same width and height.
+            num_inference_steps (`int`, *optional*, defaults to `None`):
+                Number of denoising diffusion steps during inference. The default value `None` results in automatic
+                selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
+                for Marigold-LCM models.
+            ensemble_size (`int`, defaults to `1`):
+                Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
+                faster inference.
+            processing_resolution (`int`, *optional*, defaults to `None`):
+                Effective processing resolution. When set to `0`, matches the larger input image dimension. This
+                produces crisper predictions, but may also lead to the overall loss of global context. The default
+                value `None` resolves to the optimal value from the model config.
+            match_input_resolution (`bool`, *optional*, defaults to `True`):
+                When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
+                side of the output will equal to `processing_resolution`.
+            resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
+                Resampling method used to resize input images to `processing_resolution`. The accepted values are:
+                `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+            resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
+                Resampling method used to resize output predictions to match the input resolution. The accepted values
+                are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+            batch_size (`int`, *optional*, defaults to `1`):
+                Batch size; only matters when setting `ensemble_size` or passing a tensor of images.
+            ensembling_kwargs (`dict`, *optional*, defaults to `None`)
+                Extra dictionary with arguments for precise ensembling control. The following options are available:
+                - reduction (`str`, *optional*, defaults to `"closest"`): Defines the ensembling function applied in
+                  every pixel location, can be either `"closest"` or `"mean"`.
+            latents (`torch.Tensor`, *optional*, defaults to `None`):
+                Latent noise tensors to replace the random initialization. These can be taken from the previous
+                function call's output.
+            generator (`torch.Generator`, or `List[torch.Generator]`, *optional*, defaults to `None`):
+                Random number generator object to ensure reproducibility.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
+                values are: `"np"` (numpy array) or `"pt"` (torch tensor).
+            output_uncertainty (`bool`, *optional*, defaults to `False`):
+                When enabled, the output's `uncertainty` field contains the predictive uncertainty map, provided that
+                the `ensemble_size` argument is set to a value above 2.
+            output_latent (`bool`, *optional*, defaults to `False`):
+                When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
+                within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
+                `latents` argument.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.marigold.MarigoldDepthOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.marigold.MarigoldNormalsOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.marigold.MarigoldNormalsOutput`] is returned, otherwise a
+                `tuple` is returned where the first element is the prediction, the second element is the uncertainty
+                (or `None`), and the third is the latent (or `None`).
+        """
+
+        # 0. Resolving variables.
+        device = self._execution_device
+        dtype = self.dtype
+
+        # Model-specific optimal default values leading to fast and reasonable results.
+        if num_inference_steps is None:
+            num_inference_steps = self.default_denoising_steps
+        if processing_resolution is None:
+            processing_resolution = self.default_processing_resolution
+
+        # 1. Check inputs.
+        num_images = self.check_inputs(
+            image,
+            num_inference_steps,
+            ensemble_size,
+            processing_resolution,
+            resample_method_input,
+            resample_method_output,
+            batch_size,
+            ensembling_kwargs,
+            latents,
+            generator,
+            output_type,
+            output_uncertainty,
+        )
+
+        # 2. Prepare empty text conditioning.
+        # Model invocation: self.tokenizer, self.text_encoder.
+        if self.empty_text_embedding is None:
+            prompt = ""
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="do_not_pad",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids.to(device)
+            self.empty_text_embedding = self.text_encoder(text_input_ids)[0]  # [1,2,1024]
+
+        # 3. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
+        # optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
+        # `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
+        # divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
+        # of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
+        # operation and leads to the most reasonable results. Using the native image resolution or any other processing
+        # resolution can lead to loss of either fine details or global context in the output predictions.
+        image, padding, original_resolution = self.image_processor.preprocess(
+            image, processing_resolution, resample_method_input, device, dtype
+        )  # [N,3,PPH,PPW]
+
+        # 4. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
+        # ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
+        # Latents of each such predictions across all input images and all ensemble members are represented in the
+        # `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
+        # into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
+        # reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
+        # code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
+        # as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
+        # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
+        # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
+        # Model invocation: self.vae.encoder.
+        image_latent, pred_latent = self.prepare_latents(
+            image, latents, generator, ensemble_size, batch_size
+        )  # [N*E,4,h,w], [N*E,4,h,w]
+
+        del image
+
+        batch_empty_text_embedding = self.empty_text_embedding.to(device=device, dtype=dtype).repeat(
+            batch_size, 1, 1
+        )  # [B,1024,2]
+
+        # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
+        # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
+        # outputs noise for the predicted modality's latent space. The number of denoising diffusion steps is defined by
+        # `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
+        # model.
+        # Model invocation: self.unet.
+        pred_latents = []
+
+        for i in self.progress_bar(
+            range(0, num_images * ensemble_size, batch_size), leave=True, desc="Marigold predictions..."
+        ):
+            batch_image_latent = image_latent[i : i + batch_size]  # [B,4,h,w]
+            batch_pred_latent = pred_latent[i : i + batch_size]  # [B,4,h,w]
+            effective_batch_size = batch_image_latent.shape[0]
+            text = batch_empty_text_embedding[:effective_batch_size]  # [B,2,1024]
+
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            for t in self.progress_bar(self.scheduler.timesteps, leave=False, desc="Diffusion steps..."):
+                batch_latent = torch.cat([batch_image_latent, batch_pred_latent], dim=1)  # [B,8,h,w]
+                noise = self.unet(batch_latent, t, encoder_hidden_states=text, return_dict=False)[0]  # [B,4,h,w]
+                batch_pred_latent = self.scheduler.step(
+                    noise, t, batch_pred_latent, generator=generator
+                ).prev_sample  # [B,4,h,w]
+
+            pred_latents.append(batch_pred_latent)
+
+        pred_latent = torch.cat(pred_latents, dim=0)  # [N*E,4,h,w]
+
+        del (
+            pred_latents,
+            image_latent,
+            batch_empty_text_embedding,
+            batch_image_latent,
+            batch_pred_latent,
+            text,
+            batch_latent,
+            noise,
+        )
+
+        # 6. Decode predictions from latent into pixel space. The resulting `N * E` predictions have shape `(PPH, PPW)`,
+        # which requires slight postprocessing. Decoding into pixel space happens in batches of size `batch_size`.
+        # Model invocation: self.vae.decoder.
+        prediction = torch.cat(
+            [
+                self.decode_prediction(pred_latent[i : i + batch_size])
+                for i in range(0, pred_latent.shape[0], batch_size)
+            ],
+            dim=0,
+        )  # [N*E,3,PPH,PPW]
+
+        if not output_latent:
+            pred_latent = None
+
+        # 7. Remove padding. The output shape is (PH, PW).
+        prediction = self.image_processor.unpad_image(prediction, padding)  # [N*E,3,PH,PW]
+
+        # 8. Ensemble and compute uncertainty (when `output_uncertainty` is set). This code treats each of the `N`
+        # groups of `E` ensemble predictions independently. For each group it computes an ensembled prediction of shape
+        # `(PH, PW)` and an optional uncertainty map of the same dimensions. After computing this pair of outputs for
+        # each group independently, it stacks them respectively into batches of `N` almost final predictions and
+        # uncertainty maps.
+        uncertainty = None
+        if ensemble_size > 1:
+            prediction = prediction.reshape(num_images, ensemble_size, *prediction.shape[1:])  # [N,E,3,PH,PW]
+            prediction = [
+                self.ensemble_normals(prediction[i], output_uncertainty, **(ensembling_kwargs or {}))
+                for i in range(num_images)
+            ]  # [ [[1,3,PH,PW], [1,1,PH,PW]], ... ]
+            prediction, uncertainty = zip(*prediction)  # [[1,3,PH,PW], ... ], [[1,1,PH,PW], ... ]
+            prediction = torch.cat(prediction, dim=0)  # [N,3,PH,PW]
+            if output_uncertainty:
+                uncertainty = torch.cat(uncertainty, dim=0)  # [N,1,PH,PW]
+            else:
+                uncertainty = None
+
+        # 9. If `match_input_resolution` is set, the output prediction and the uncertainty are upsampled to match the
+        # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
+        # After upsampling, the native resolution normal maps are renormalized to unit length to reduce the artifacts.
+        # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
+        # setting the `resample_method_output` parameter (e.g., to `"nearest"`).
+        if match_input_resolution:
+            prediction = self.image_processor.resize_antialias(
+                prediction, original_resolution, resample_method_output, is_aa=False
+            )  # [N,3,H,W]
+            prediction = self.normalize_normals(prediction)  # [N,3,H,W]
+            if uncertainty is not None and output_uncertainty:
+                uncertainty = self.image_processor.resize_antialias(
+                    uncertainty, original_resolution, resample_method_output, is_aa=False
+                )  # [N,1,H,W]
+
+        # 10. Prepare the final outputs.
+        if output_type == "np":
+            prediction = self.image_processor.pt_to_numpy(prediction)  # [N,H,W,3]
+            if uncertainty is not None and output_uncertainty:
+                uncertainty = self.image_processor.pt_to_numpy(uncertainty)  # [N,H,W,1]
+
+        # 11. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (prediction, uncertainty, pred_latent)
+
+        return MarigoldNormalsOutput(
+            prediction=prediction,
+            uncertainty=uncertainty,
+            latent=pred_latent,
+        )
+
+    # Copied from diffusers.pipelines.marigold.pipeline_marigold_depth.MarigoldDepthPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        image: torch.Tensor,
+        latents: Optional[torch.Tensor],
+        generator: Optional[torch.Generator],
+        ensemble_size: int,
+        batch_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        def retrieve_latents(encoder_output):
+            if hasattr(encoder_output, "latent_dist"):
+                return encoder_output.latent_dist.mode()
+            elif hasattr(encoder_output, "latents"):
+                return encoder_output.latents
+            else:
+                raise AttributeError("Could not access latents of provided encoder_output")
+
+        image_latent = torch.cat(
+            [
+                retrieve_latents(self.vae.encode(image[i : i + batch_size]))
+                for i in range(0, image.shape[0], batch_size)
+            ],
+            dim=0,
+        )  # [N,4,h,w]
+        image_latent = image_latent * self.vae.config.scaling_factor
+        image_latent = image_latent.repeat_interleave(ensemble_size, dim=0)  # [N*E,4,h,w]
+
+        pred_latent = latents
+        if pred_latent is None:
+            pred_latent = randn_tensor(
+                image_latent.shape,
+                generator=generator,
+                device=image_latent.device,
+                dtype=image_latent.dtype,
+            )  # [N*E,4,h,w]
+
+        return image_latent, pred_latent
+
+    def decode_prediction(self, pred_latent: torch.Tensor) -> torch.Tensor:
+        if pred_latent.dim() != 4 or pred_latent.shape[1] != self.vae.config.latent_channels:
+            raise ValueError(
+                f"Expecting 4D tensor of shape [B,{self.vae.config.latent_channels},H,W]; got {pred_latent.shape}."
+            )
+
+        prediction = self.vae.decode(pred_latent / self.vae.config.scaling_factor, return_dict=False)[0]  # [B,3,H,W]
+
+        prediction = torch.clip(prediction, -1.0, 1.0)
+
+        if not self.use_full_z_range:
+            prediction[:, 2, :, :] *= 0.5
+            prediction[:, 2, :, :] += 0.5
+
+        prediction = self.normalize_normals(prediction)  # [B,3,H,W]
+
+        return prediction  # [B,3,H,W]
+
+    @staticmethod
+    def normalize_normals(normals: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+        if normals.dim() != 4 or normals.shape[1] != 3:
+            raise ValueError(f"Expecting 4D tensor of shape [B,3,H,W]; got {normals.shape}.")
+
+        norm = torch.norm(normals, dim=1, keepdim=True)
+        normals /= norm.clamp(min=eps)
+
+        return normals
+
+    @staticmethod
+    def ensemble_normals(
+        normals: torch.Tensor, output_uncertainty: bool, reduction: str = "closest"
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Ensembles the normals maps represented by the `normals` tensor with expected shape `(B, 3, H, W)`, where B is
+        the number of ensemble members for a given prediction of size `(H x W)`.
+
+        Args:
+            normals (`torch.Tensor`):
+                Input ensemble normals maps.
+            output_uncertainty (`bool`, *optional*, defaults to `False`):
+                Whether to output uncertainty map.
+            reduction (`str`, *optional*, defaults to `"closest"`):
+                Reduction method used to ensemble aligned predictions. The accepted values are: `"closest"` and
+                `"mean"`.
+
+        Returns:
+            A tensor of aligned and ensembled normals maps with shape `(1, 3, H, W)` and optionally a tensor of
+            uncertainties of shape `(1, 1, H, W)`.
+        """
+        if normals.dim() != 4 or normals.shape[1] != 3:
+            raise ValueError(f"Expecting 4D tensor of shape [B,3,H,W]; got {normals.shape}.")
+        if reduction not in ("closest", "mean"):
+            raise ValueError(f"Unrecognized reduction method: {reduction}.")
+
+        mean_normals = normals.mean(dim=0, keepdim=True)  # [1,3,H,W]
+        mean_normals = MarigoldNormalsPipeline.normalize_normals(mean_normals)  # [1,3,H,W]
+
+        sim_cos = (mean_normals * normals).sum(dim=1, keepdim=True)  # [E,1,H,W]
+        sim_cos = sim_cos.clamp(-1, 1)  # required to avoid NaN in uncertainty with fp16
+
+        uncertainty = None
+        if output_uncertainty:
+            uncertainty = sim_cos.arccos()  # [E,1,H,W]
+            uncertainty = uncertainty.mean(dim=0, keepdim=True) / np.pi  # [1,1,H,W]
+
+        if reduction == "mean":
+            return mean_normals, uncertainty  # [1,3,H,W], [1,1,H,W]
+
+        closest_indices = sim_cos.argmax(dim=0, keepdim=True)  # [1,1,H,W]
+        closest_indices = closest_indices.repeat(1, 3, 1, 1)  # [1,3,H,W]
+        closest_normals = torch.gather(normals, 0, closest_indices)  # [1,3,H,W]
+
+        return closest_normals, uncertainty  # [1,3,H,W], [1,1,H,W]
diff --git a/diffusers/src/diffusers/pipelines/musicldm/__init__.py b/diffusers/src/diffusers/pipelines/musicldm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed71eeb1d99b28f20f7cd94776c0303208620653
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/musicldm/__init__.py
@@ -0,0 +1,49 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+    is_transformers_version,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_musicldm"] = ["MusicLDMPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_musicldm import MusicLDMPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/musicldm/pipeline_musicldm.py b/diffusers/src/diffusers/pipelines/musicldm/pipeline_musicldm.py
new file mode 100644
index 0000000000000000000000000000000000000000..728635da6d4dc66ae2f8e465b20309ae654c0558
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/musicldm/pipeline_musicldm.py
@@ -0,0 +1,635 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import (
+    ClapFeatureExtractor,
+    ClapModel,
+    ClapTextModelWithProjection,
+    RobertaTokenizer,
+    RobertaTokenizerFast,
+    SpeechT5HifiGan,
+)
+
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    is_accelerate_available,
+    is_accelerate_version,
+    is_librosa_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin
+
+
+if is_librosa_available():
+    import librosa
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import MusicLDMPipeline
+        >>> import torch
+        >>> import scipy
+
+        >>> repo_id = "ucsd-reach/musicldm"
+        >>> pipe = MusicLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+        >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0]
+
+        >>> # save the audio sample as a .wav file
+        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
+        ```
+"""
+
+
+class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-audio generation using MusicLDM.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.ClapModel`]):
+            Frozen text-audio embedding model (`ClapTextModel`), specifically the
+            [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant.
+        tokenizer ([`PreTrainedTokenizer`]):
+            A [`~transformers.RobertaTokenizer`] to tokenize text.
+        feature_extractor ([`~transformers.ClapFeatureExtractor`]):
+            Feature extractor to compute mel-spectrograms from audio waveforms.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded audio latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        vocoder ([`~transformers.SpeechT5HifiGan`]):
+            Vocoder of class `SpeechT5HifiGan`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: Union[ClapTextModelWithProjection, ClapModel],
+        tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast],
+        feature_extractor: Optional[ClapFeatureExtractor],
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        vocoder: SpeechT5HifiGan,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            unet=unet,
+            scheduler=scheduler,
+            vocoder=vocoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_waveforms_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device (`torch.device`):
+                torch device
+            num_waveforms_per_prompt (`int`):
+                number of waveforms that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the audio generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            attention_mask = text_inputs.attention_mask
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLAP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_embeds = self.text_encoder.get_text_features(
+                text_input_ids.to(device),
+                attention_mask=attention_mask.to(device),
+            )
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.text_model.dtype, device=device)
+
+        (
+            bs_embed,
+            seq_len,
+        ) = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_waveforms_per_prompt)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_waveforms_per_prompt, seq_len)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            uncond_input_ids = uncond_input.input_ids.to(device)
+            attention_mask = uncond_input.attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder.get_text_features(
+                uncond_input_ids,
+                attention_mask=attention_mask,
+            )
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.text_model.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_waveforms_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_waveforms_per_prompt, seq_len)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.mel_spectrogram_to_waveform
+    def mel_spectrogram_to_waveform(self, mel_spectrogram):
+        if mel_spectrogram.dim() == 4:
+            mel_spectrogram = mel_spectrogram.squeeze(1)
+
+        waveform = self.vocoder(mel_spectrogram)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        waveform = waveform.cpu().float()
+        return waveform
+
+    # Copied from diffusers.pipelines.audioldm2.pipeline_audioldm2.AudioLDM2Pipeline.score_waveforms
+    def score_waveforms(self, text, audio, num_waveforms_per_prompt, device, dtype):
+        if not is_librosa_available():
+            logger.info(
+                "Automatic scoring of the generated audio waveforms against the input prompt text requires the "
+                "`librosa` package to resample the generated waveforms. Returning the audios in the order they were "
+                "generated. To enable automatic scoring, install `librosa` with: `pip install librosa`."
+            )
+            return audio
+        inputs = self.tokenizer(text, return_tensors="pt", padding=True)
+        resampled_audio = librosa.resample(
+            audio.numpy(), orig_sr=self.vocoder.config.sampling_rate, target_sr=self.feature_extractor.sampling_rate
+        )
+        inputs["input_features"] = self.feature_extractor(
+            list(resampled_audio), return_tensors="pt", sampling_rate=self.feature_extractor.sampling_rate
+        ).input_features.type(dtype)
+        inputs = inputs.to(device)
+
+        # compute the audio-text similarity score using the CLAP model
+        logits_per_text = self.text_encoder(**inputs).logits_per_text
+        # sort by the highest matching generations per prompt
+        indices = torch.argsort(logits_per_text, dim=1, descending=True)[:, :num_waveforms_per_prompt]
+        audio = torch.index_select(audio, 0, indices.reshape(-1).cpu())
+        return audio
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        audio_length_in_s,
+        vocoder_upsample_factor,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor
+        if audio_length_in_s < min_audio_length_in_s:
+            raise ValueError(
+                f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but "
+                f"is {audio_length_in_s}."
+            )
+
+        if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0:
+            raise ValueError(
+                f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the "
+                f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of "
+                f"{self.vae_scale_factor}."
+            )
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(self.vocoder.config.model_in_dim) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def enable_model_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        """
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu", silence_dtype_warnings=True)
+            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        model_sequence = [
+            self.text_encoder.text_model,
+            self.text_encoder.text_projection,
+            self.unet,
+            self.vae,
+            self.vocoder,
+            self.text_encoder,
+        ]
+
+        hook = None
+        for cpu_offloaded_model in model_sequence:
+            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+        # We'll offload the last model manually.
+        self.final_offload_hook = hook
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        audio_length_in_s: Optional[float] = None,
+        num_inference_steps: int = 200,
+        guidance_scale: float = 2.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_waveforms_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        output_type: Optional[str] = "np",
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
+            audio_length_in_s (`int`, *optional*, defaults to 10.24):
+                The length of the generated audio sample in seconds.
+            num_inference_steps (`int`, *optional*, defaults to 200):
+                The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 2.0):
+                A higher guidance scale value encourages the model to generate audio that is closely linked to the text
+                `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
+                The number of waveforms to generate per prompt. If `num_waveforms_per_prompt > 1`, the text encoding
+                model is a joint text-audio model ([`~transformers.ClapModel`]), and the tokenizer is a
+                `[~transformers.ClapProcessor]`, then automatic scoring will be performed between the generated outputs
+                and the input text. This scoring ranks the generated waveforms based on their cosine similarity to text
+                input in the joint text-audio embedding space.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated audio. Choose between `"np"` to return a NumPy `np.ndarray` or
+                `"pt"` to return a PyTorch `torch.Tensor` object. Set to `"latent"` to return the latent diffusion
+                model (LDM) output.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.AudioPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated audio.
+        """
+        # 0. Convert audio input length from seconds to spectrogram height
+        vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate
+
+        if audio_length_in_s is None:
+            audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor
+
+        height = int(audio_length_in_s / vocoder_upsample_factor)
+
+        original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate)
+        if height % self.vae_scale_factor != 0:
+            height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor
+            logger.info(
+                f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} "
+                f"so that it can be handled by the model. It will be cut to {audio_length_in_s} after the "
+                f"denoising process."
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            audio_length_in_s,
+            vocoder_upsample_factor,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_waveforms_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_waveforms_per_prompt,
+            num_channels_latents,
+            height,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=None,
+                    class_labels=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        self.maybe_free_model_hooks()
+
+        # 8. Post-processing
+        if not output_type == "latent":
+            latents = 1 / self.vae.config.scaling_factor * latents
+            mel_spectrogram = self.vae.decode(latents).sample
+        else:
+            return AudioPipelineOutput(audios=latents)
+
+        audio = self.mel_spectrogram_to_waveform(mel_spectrogram)
+
+        audio = audio[:, :original_waveform_length]
+
+        # 9. Automatic scoring
+        if num_waveforms_per_prompt > 1 and prompt is not None:
+            audio = self.score_waveforms(
+                text=prompt,
+                audio=audio,
+                num_waveforms_per_prompt=num_waveforms_per_prompt,
+                device=device,
+                dtype=prompt_embeds.dtype,
+            )
+
+        if output_type == "np":
+            audio = audio.numpy()
+
+        if not return_dict:
+            return (audio,)
+
+        return AudioPipelineOutput(audios=audio)
diff --git a/diffusers/src/diffusers/pipelines/onnx_utils.py b/diffusers/src/diffusers/pipelines/onnx_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..11f2241c64c837bc3d0160370214f59859b331df
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/onnx_utils.py
@@ -0,0 +1,215 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import shutil
+from pathlib import Path
+from typing import Optional, Union
+
+import numpy as np
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import validate_hf_hub_args
+
+from ..utils import ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, is_onnx_available, logging
+
+
+if is_onnx_available():
+    import onnxruntime as ort
+
+
+logger = logging.get_logger(__name__)
+
+ORT_TO_NP_TYPE = {
+    "tensor(bool)": np.bool_,
+    "tensor(int8)": np.int8,
+    "tensor(uint8)": np.uint8,
+    "tensor(int16)": np.int16,
+    "tensor(uint16)": np.uint16,
+    "tensor(int32)": np.int32,
+    "tensor(uint32)": np.uint32,
+    "tensor(int64)": np.int64,
+    "tensor(uint64)": np.uint64,
+    "tensor(float16)": np.float16,
+    "tensor(float)": np.float32,
+    "tensor(double)": np.float64,
+}
+
+
+class OnnxRuntimeModel:
+    def __init__(self, model=None, **kwargs):
+        logger.info("`diffusers.OnnxRuntimeModel` is experimental and might change in the future.")
+        self.model = model
+        self.model_save_dir = kwargs.get("model_save_dir", None)
+        self.latest_model_name = kwargs.get("latest_model_name", ONNX_WEIGHTS_NAME)
+
+    def __call__(self, **kwargs):
+        inputs = {k: np.array(v) for k, v in kwargs.items()}
+        return self.model.run(None, inputs)
+
+    @staticmethod
+    def load_model(path: Union[str, Path], provider=None, sess_options=None):
+        """
+        Loads an ONNX Inference session with an ExecutionProvider. Default provider is `CPUExecutionProvider`
+
+        Arguments:
+            path (`str` or `Path`):
+                Directory from which to load
+            provider(`str`, *optional*):
+                Onnxruntime execution provider to use for loading the model, defaults to `CPUExecutionProvider`
+        """
+        if provider is None:
+            logger.info("No onnxruntime provider specified, using CPUExecutionProvider")
+            provider = "CPUExecutionProvider"
+
+        return ort.InferenceSession(path, providers=[provider], sess_options=sess_options)
+
+    def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional[str] = None, **kwargs):
+        """
+        Save a model and its configuration file to a directory, so that it can be re-loaded using the
+        [`~optimum.onnxruntime.modeling_ort.ORTModel.from_pretrained`] class method. It will always save the
+        latest_model_name.
+
+        Arguments:
+            save_directory (`str` or `Path`):
+                Directory where to save the model file.
+            file_name(`str`, *optional*):
+                Overwrites the default model file name from `"model.onnx"` to `file_name`. This allows you to save the
+                model with a different name.
+        """
+        model_file_name = file_name if file_name is not None else ONNX_WEIGHTS_NAME
+
+        src_path = self.model_save_dir.joinpath(self.latest_model_name)
+        dst_path = Path(save_directory).joinpath(model_file_name)
+        try:
+            shutil.copyfile(src_path, dst_path)
+        except shutil.SameFileError:
+            pass
+
+        # copy external weights (for models >2GB)
+        src_path = self.model_save_dir.joinpath(ONNX_EXTERNAL_WEIGHTS_NAME)
+        if src_path.exists():
+            dst_path = Path(save_directory).joinpath(ONNX_EXTERNAL_WEIGHTS_NAME)
+            try:
+                shutil.copyfile(src_path, dst_path)
+            except shutil.SameFileError:
+                pass
+
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        **kwargs,
+    ):
+        """
+        Save a model to a directory, so that it can be re-loaded using the [`~OnnxModel.from_pretrained`] class
+        method.:
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to which to save. Will be created if it doesn't exist.
+        """
+        if os.path.isfile(save_directory):
+            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+            return
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        # saving model weights/files
+        self._save_pretrained(save_directory, **kwargs)
+
+    @classmethod
+    @validate_hf_hub_args
+    def _from_pretrained(
+        cls,
+        model_id: Union[str, Path],
+        token: Optional[Union[bool, str, None]] = None,
+        revision: Optional[Union[str, None]] = None,
+        force_download: bool = False,
+        cache_dir: Optional[str] = None,
+        file_name: Optional[str] = None,
+        provider: Optional[str] = None,
+        sess_options: Optional["ort.SessionOptions"] = None,
+        **kwargs,
+    ):
+        """
+        Load a model from a directory or the HF Hub.
+
+        Arguments:
+            model_id (`str` or `Path`):
+                Directory from which to load
+            token (`str` or `bool`):
+                Is needed to load models from a private or gated repository
+            revision (`str`):
+                Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id
+            cache_dir (`Union[str, Path]`, *optional*):
+                Path to a directory in which a downloaded pretrained model configuration should be cached if the
+                standard cache should not be used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            file_name(`str`):
+                Overwrites the default model file name from `"model.onnx"` to `file_name`. This allows you to load
+                different model files from the same repository or directory.
+            provider(`str`):
+                The ONNX runtime provider, e.g. `CPUExecutionProvider` or `CUDAExecutionProvider`.
+            kwargs (`Dict`, *optional*):
+                kwargs will be passed to the model during initialization
+        """
+        model_file_name = file_name if file_name is not None else ONNX_WEIGHTS_NAME
+        # load model from local directory
+        if os.path.isdir(model_id):
+            model = OnnxRuntimeModel.load_model(
+                Path(model_id, model_file_name).as_posix(), provider=provider, sess_options=sess_options
+            )
+            kwargs["model_save_dir"] = Path(model_id)
+        # load model from hub
+        else:
+            # download model
+            model_cache_path = hf_hub_download(
+                repo_id=model_id,
+                filename=model_file_name,
+                token=token,
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+            )
+            kwargs["model_save_dir"] = Path(model_cache_path).parent
+            kwargs["latest_model_name"] = Path(model_cache_path).name
+            model = OnnxRuntimeModel.load_model(model_cache_path, provider=provider, sess_options=sess_options)
+        return cls(model=model, **kwargs)
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(
+        cls,
+        model_id: Union[str, Path],
+        force_download: bool = True,
+        token: Optional[str] = None,
+        cache_dir: Optional[str] = None,
+        **model_kwargs,
+    ):
+        revision = None
+        if len(str(model_id).split("@")) == 2:
+            model_id, revision = model_id.split("@")
+
+        return cls._from_pretrained(
+            model_id=model_id,
+            revision=revision,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            token=token,
+            **model_kwargs,
+        )
diff --git a/diffusers/src/diffusers/pipelines/pag/__init__.py b/diffusers/src/diffusers/pipelines/pag/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8842ce911752537d69835eade836f0584476161
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/__init__.py
@@ -0,0 +1,69 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_pag_controlnet_sd"] = ["StableDiffusionControlNetPAGPipeline"]
+    _import_structure["pipeline_pag_controlnet_sd_xl"] = ["StableDiffusionXLControlNetPAGPipeline"]
+    _import_structure["pipeline_pag_controlnet_sd_xl_img2img"] = ["StableDiffusionXLControlNetPAGImg2ImgPipeline"]
+    _import_structure["pipeline_pag_hunyuandit"] = ["HunyuanDiTPAGPipeline"]
+    _import_structure["pipeline_pag_kolors"] = ["KolorsPAGPipeline"]
+    _import_structure["pipeline_pag_pixart_sigma"] = ["PixArtSigmaPAGPipeline"]
+    _import_structure["pipeline_pag_sd"] = ["StableDiffusionPAGPipeline"]
+    _import_structure["pipeline_pag_sd_3"] = ["StableDiffusion3PAGPipeline"]
+    _import_structure["pipeline_pag_sd_animatediff"] = ["AnimateDiffPAGPipeline"]
+    _import_structure["pipeline_pag_sd_xl"] = ["StableDiffusionXLPAGPipeline"]
+    _import_structure["pipeline_pag_sd_xl_img2img"] = ["StableDiffusionXLPAGImg2ImgPipeline"]
+    _import_structure["pipeline_pag_sd_xl_inpaint"] = ["StableDiffusionXLPAGInpaintPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_pag_controlnet_sd import StableDiffusionControlNetPAGPipeline
+        from .pipeline_pag_controlnet_sd_xl import StableDiffusionXLControlNetPAGPipeline
+        from .pipeline_pag_controlnet_sd_xl_img2img import StableDiffusionXLControlNetPAGImg2ImgPipeline
+        from .pipeline_pag_hunyuandit import HunyuanDiTPAGPipeline
+        from .pipeline_pag_kolors import KolorsPAGPipeline
+        from .pipeline_pag_pixart_sigma import PixArtSigmaPAGPipeline
+        from .pipeline_pag_sd import StableDiffusionPAGPipeline
+        from .pipeline_pag_sd_3 import StableDiffusion3PAGPipeline
+        from .pipeline_pag_sd_animatediff import AnimateDiffPAGPipeline
+        from .pipeline_pag_sd_xl import StableDiffusionXLPAGPipeline
+        from .pipeline_pag_sd_xl_img2img import StableDiffusionXLPAGImg2ImgPipeline
+        from .pipeline_pag_sd_xl_inpaint import StableDiffusionXLPAGInpaintPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/pag/pag_utils.py b/diffusers/src/diffusers/pipelines/pag/pag_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..728f730c9904523a48c77643dac4f3ed6a114d32
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pag_utils.py
@@ -0,0 +1,237 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from typing import Dict, List, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...models.attention_processor import (
+    Attention,
+    AttentionProcessor,
+    PAGCFGIdentitySelfAttnProcessor2_0,
+    PAGIdentitySelfAttnProcessor2_0,
+)
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class PAGMixin:
+    r"""Mixin class for [Pertubed Attention Guidance](https://arxiv.org/abs/2403.17377v1)."""
+
+    def _set_pag_attn_processor(self, pag_applied_layers, do_classifier_free_guidance):
+        r"""
+        Set the attention processor for the PAG layers.
+        """
+        pag_attn_processors = self._pag_attn_processors
+        if pag_attn_processors is None:
+            raise ValueError(
+                "No PAG attention processors have been set. Set the attention processors by calling `set_pag_applied_layers` and passing the relevant parameters."
+            )
+
+        pag_attn_proc = pag_attn_processors[0] if do_classifier_free_guidance else pag_attn_processors[1]
+
+        if hasattr(self, "unet"):
+            model: nn.Module = self.unet
+        else:
+            model: nn.Module = self.transformer
+
+        def is_self_attn(module: nn.Module) -> bool:
+            r"""
+            Check if the module is self-attention module based on its name.
+            """
+            return isinstance(module, Attention) and not module.is_cross_attention
+
+        def is_fake_integral_match(layer_id, name):
+            layer_id = layer_id.split(".")[-1]
+            name = name.split(".")[-1]
+            return layer_id.isnumeric() and name.isnumeric() and layer_id == name
+
+        for layer_id in pag_applied_layers:
+            # for each PAG layer input, we find corresponding self-attention layers in the unet model
+            target_modules = []
+
+            for name, module in model.named_modules():
+                # Identify the following simple cases:
+                #   (1) Self Attention layer existing
+                #   (2) Whether the module name matches pag layer id even partially
+                #   (3) Make sure it's not a fake integral match if the layer_id ends with a number
+                #       For example, blocks.1, blocks.10 should be differentiable if layer_id="blocks.1"
+                if (
+                    is_self_attn(module)
+                    and re.search(layer_id, name) is not None
+                    and not is_fake_integral_match(layer_id, name)
+                ):
+                    logger.debug(f"Applying PAG to layer: {name}")
+                    target_modules.append(module)
+
+            if len(target_modules) == 0:
+                raise ValueError(f"Cannot find PAG layer to set attention processor for: {layer_id}")
+
+            for module in target_modules:
+                module.processor = pag_attn_proc
+
+    def _get_pag_scale(self, t):
+        r"""
+        Get the scale factor for the perturbed attention guidance at timestep `t`.
+        """
+
+        if self.do_pag_adaptive_scaling:
+            signal_scale = self.pag_scale - self.pag_adaptive_scale * (1000 - t)
+            if signal_scale < 0:
+                signal_scale = 0
+            return signal_scale
+        else:
+            return self.pag_scale
+
+    def _apply_perturbed_attention_guidance(self, noise_pred, do_classifier_free_guidance, guidance_scale, t):
+        r"""
+        Apply perturbed attention guidance to the noise prediction.
+
+        Args:
+            noise_pred (torch.Tensor): The noise prediction tensor.
+            do_classifier_free_guidance (bool): Whether to apply classifier-free guidance.
+            guidance_scale (float): The scale factor for the guidance term.
+            t (int): The current time step.
+
+        Returns:
+            torch.Tensor: The updated noise prediction tensor after applying perturbed attention guidance.
+        """
+        pag_scale = self._get_pag_scale(t)
+        if do_classifier_free_guidance:
+            noise_pred_uncond, noise_pred_text, noise_pred_perturb = noise_pred.chunk(3)
+            noise_pred = (
+                noise_pred_uncond
+                + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                + pag_scale * (noise_pred_text - noise_pred_perturb)
+            )
+        else:
+            noise_pred_text, noise_pred_perturb = noise_pred.chunk(2)
+            noise_pred = noise_pred_text + pag_scale * (noise_pred_text - noise_pred_perturb)
+        return noise_pred
+
+    def _prepare_perturbed_attention_guidance(self, cond, uncond, do_classifier_free_guidance):
+        """
+        Prepares the perturbed attention guidance for the PAG model.
+
+        Args:
+            cond (torch.Tensor): The conditional input tensor.
+            uncond (torch.Tensor): The unconditional input tensor.
+            do_classifier_free_guidance (bool): Flag indicating whether to perform classifier-free guidance.
+
+        Returns:
+            torch.Tensor: The prepared perturbed attention guidance tensor.
+        """
+
+        cond = torch.cat([cond] * 2, dim=0)
+
+        if do_classifier_free_guidance:
+            cond = torch.cat([uncond, cond], dim=0)
+        return cond
+
+    def set_pag_applied_layers(
+        self,
+        pag_applied_layers: Union[str, List[str]],
+        pag_attn_processors: Tuple[AttentionProcessor, AttentionProcessor] = (
+            PAGCFGIdentitySelfAttnProcessor2_0(),
+            PAGIdentitySelfAttnProcessor2_0(),
+        ),
+    ):
+        r"""
+        Set the the self-attention layers to apply PAG. Raise ValueError if the input is invalid.
+
+        Args:
+            pag_applied_layers (`str` or `List[str]`):
+                One or more strings identifying the layer names, or a simple regex for matching multiple layers, where
+                PAG is to be applied. A few ways of expected usage are as follows:
+                  - Single layers specified as - "blocks.{layer_index}"
+                  - Multiple layers as a list - ["blocks.{layers_index_1}", "blocks.{layer_index_2}", ...]
+                  - Multiple layers as a block name - "mid"
+                  - Multiple layers as regex - "blocks.({layer_index_1}|{layer_index_2})"
+            pag_attn_processors:
+                (`Tuple[AttentionProcessor, AttentionProcessor]`, defaults to `(PAGCFGIdentitySelfAttnProcessor2_0(),
+                PAGIdentitySelfAttnProcessor2_0())`): A tuple of two attention processors. The first attention
+                processor is for PAG with Classifier-free guidance enabled (conditional and unconditional). The second
+                attention processor is for PAG with CFG disabled (unconditional only).
+        """
+
+        if not hasattr(self, "_pag_attn_processors"):
+            self._pag_attn_processors = None
+
+        if not isinstance(pag_applied_layers, list):
+            pag_applied_layers = [pag_applied_layers]
+        if pag_attn_processors is not None:
+            if not isinstance(pag_attn_processors, tuple) or len(pag_attn_processors) != 2:
+                raise ValueError("Expected a tuple of two attention processors")
+
+        for i in range(len(pag_applied_layers)):
+            if not isinstance(pag_applied_layers[i], str):
+                raise ValueError(
+                    f"Expected either a string or a list of string but got type {type(pag_applied_layers[i])}"
+                )
+
+        self.pag_applied_layers = pag_applied_layers
+        self._pag_attn_processors = pag_attn_processors
+
+    @property
+    def pag_scale(self) -> float:
+        r"""Get the scale factor for the perturbed attention guidance."""
+        return self._pag_scale
+
+    @property
+    def pag_adaptive_scale(self) -> float:
+        r"""Get the adaptive scale factor for the perturbed attention guidance."""
+        return self._pag_adaptive_scale
+
+    @property
+    def do_pag_adaptive_scaling(self) -> bool:
+        r"""Check if the adaptive scaling is enabled for the perturbed attention guidance."""
+        return self._pag_adaptive_scale > 0 and self._pag_scale > 0 and len(self.pag_applied_layers) > 0
+
+    @property
+    def do_perturbed_attention_guidance(self) -> bool:
+        r"""Check if the perturbed attention guidance is enabled."""
+        return self._pag_scale > 0 and len(self.pag_applied_layers) > 0
+
+    @property
+    def pag_attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of PAG attention processors: A dictionary contains all PAG attention processors used in the model
+            with the key as the name of the layer.
+        """
+
+        if self._pag_attn_processors is None:
+            return {}
+
+        valid_attn_processors = {x.__class__ for x in self._pag_attn_processors}
+
+        processors = {}
+        # We could have iterated through the self.components.items() and checked if a component is
+        # `ModelMixin` subclassed but that can include a VAE too.
+        if hasattr(self, "unet"):
+            denoiser_module = self.unet
+        elif hasattr(self, "transformer"):
+            denoiser_module = self.transformer
+        else:
+            raise ValueError("No denoiser module found.")
+
+        for name, proc in denoiser_module.attn_processors.items():
+            if proc.__class__ in valid_attn_processors:
+                processors[name] = proc
+
+        return processors
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bac883b5c999a2d12a38621f8ca7eebdd8c067f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py
@@ -0,0 +1,1329 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ..controlnet.multicontrolnet import MultiControlNetModel
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .pag_utils import PAGMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from diffusers import AutoPipelineForText2Image, ControlNetModel, UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> import cv2
+        >>> from PIL import Image
+
+        >>> # download an image
+        >>> image = load_image(
+        ...     "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+        ... )
+        >>> image = np.array(image)
+
+        >>> # get canny image
+        >>> image = cv2.Canny(image, 100, 200)
+        >>> image = image[:, :, None]
+        >>> image = np.concatenate([image, image, image], axis=2)
+        >>> canny_image = Image.fromarray(image)
+
+        >>> # load control net and stable diffusion v1-5
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+        >>> pipe = AutoPipelineForText2Image.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, enable_pag=True
+        ... )
+
+        >>> # speed up diffusion process with faster scheduler and memory optimization
+        >>> # remove following line if xformers is not installed
+        >>> pipe.enable_xformers_memory_efficient_attention()
+
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # generate image
+        >>> generator = torch.manual_seed(0)
+        >>> image = pipe(
+        ...     "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting",
+        ...     guidance_scale=7.5,
+        ...     generator=generator,
+        ...     image=canny_image,
+        ...     pag_scale=10,
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionControlNetPAGPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+    PAGMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+        pag_applied_layers: Union[str, List[str]] = "mid",
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+        self.set_pag_applied_layers(pag_applied_layers)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                transposed_image = [list(t) for t in zip(*image)]
+                if len(transposed_image) != len(self.controlnet.nets):
+                    raise ValueError(
+                        f"For multiple controlnets: if you pass`image` as a list of list, each sublist must have the same length as the number of controlnets, but the sublists in `image` got {len(transposed_image)} images and {len(self.controlnet.nets)} ControlNets."
+                    )
+                for image_ in transposed_image:
+                    self.check_image(image_, prompt, prompt_embeds)
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+            else:
+                for image_ in image:
+                    self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError(
+                        "A single batch of varying conditioning scale settings (e.g. [[1.0, 0.5], [0.2, 0.8]]) is not supported at the moment. "
+                        "The conditioning scale must be fixed across the batch."
+                    )
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single
+                ControlNet, each will be paired with each prompt in the `prompt` list. This also applies to multiple
+                ControlNets, where a list of image lists can be passed to batch for each prompt and each ControlNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            image,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
+        guess_mode = guess_mode or global_pool_conditions
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare image
+        if isinstance(controlnet, ControlNetModel):
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+            height, width = image.shape[-2:]
+        elif isinstance(controlnet, MultiControlNetModel):
+            images = []
+
+            # Nested lists as ControlNet condition
+            if isinstance(image[0], list):
+                # Transpose the nested image list
+                image = [list(t) for t in zip(*image)]
+
+            for image_ in image:
+                image_ = self.prepare_image(
+                    image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                images.append(image_)
+
+            image = images
+            height, width = image[0].shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6.5 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Add image embeds for IP-Adapter
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+            for i, image_embeds in enumerate(ip_adapter_image_embeds):
+                negative_image_embeds = None
+                if self.do_classifier_free_guidance:
+                    negative_image_embeds, image_embeds = image_embeds.chunk(2)
+
+                if self.do_perturbed_attention_guidance:
+                    image_embeds = self._prepare_perturbed_attention_guidance(
+                        image_embeds, negative_image_embeds, self.do_classifier_free_guidance
+                    )
+                elif self.do_classifier_free_guidance:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+                image_embeds = image_embeds.to(device)
+                ip_adapter_image_embeds[i] = image_embeds
+
+        added_cond_kwargs = (
+            {"image_embeds": ip_adapter_image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        controlnet_prompt_embeds = prompt_embeds
+
+        # 7.2 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        images = image if isinstance(image, list) else [image]
+        for i, single_image in enumerate(images):
+            if self.do_classifier_free_guidance:
+                single_image = single_image.chunk(2)[0]
+
+            if self.do_perturbed_attention_guidance:
+                single_image = self._prepare_perturbed_attention_guidance(
+                    single_image, single_image, self.do_classifier_free_guidance
+                )
+            elif self.do_classifier_free_guidance:
+                single_image = torch.cat([single_image] * 2)
+            single_image = single_image.to(device)
+            images[i] = single_image
+
+        image = images if isinstance(image, list) else images[0]
+
+        # 8. Denoising loop
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.unet.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        is_unet_compiled = is_compiled_module(self.unet)
+        is_controlnet_compiled = is_compiled_module(self.controlnet)
+        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Relevant thread:
+                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
+                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
+                    torch._inductor.cudagraph_mark_step_begin()
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                control_model_input = latent_model_input
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+
+                if guess_mode and self.do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    )
+                elif self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..247fc900a7b0dd5e3189f4ee2d83d227ce52b013
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py
@@ -0,0 +1,1612 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers.utils.import_utils import is_invisible_watermark_available
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from .pag_utils import PAGMixin
+
+
+if is_invisible_watermark_available():
+    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+from ..controlnet.multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from diffusers import AutoPipelineForText2Image, ControlNetModel, AutoencoderKL
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+
+        >>> import cv2
+        >>> from PIL import Image
+
+        >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+        >>> negative_prompt = "low quality, bad quality, sketches"
+
+        >>> # download an image
+        >>> image = load_image(
+        ...     "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+        ... )
+
+        >>> # initialize the models and pipeline
+        >>> controlnet_conditioning_scale = 0.5  # recommended for good generalization
+        >>> controlnet = ControlNetModel.from_pretrained(
+        ...     "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
+        ... )
+        >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+        >>> pipe = AutoPipelineForText2Image.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0",
+        ...     controlnet=controlnet,
+        ...     vae=vae,
+        ...     torch_dtype=torch.float16,
+        ...     enable_pag=True,
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # get canny image
+        >>> image = np.array(image)
+        >>> image = cv2.Canny(image, 100, 200)
+        >>> image = image[:, :, None]
+        >>> image = np.concatenate([image, image, image], axis=2)
+        >>> canny_image = Image.fromarray(image)
+
+        >>> # generate image
+        >>> image = pipe(
+        ...     prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image, pag_scale=0.3
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLControlNetPAGPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+    PAGMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]):
+            Second frozen text-encoder
+            ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        tokenizer_2 ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings should always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark](https://github.com/ShieldMnt/invisible-watermark/) library to
+            watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no
+            watermarker is used.
+    """
+
+    # leave controlnet out on purpose because it iterates with unet
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "feature_extractor",
+        "image_encoder",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        pag_applied_layers: Union[str, List[str]] = "mid",  # ["down.block_2", "up.block_1.attentions_0"], "mid"
+    ):
+        super().__init__()
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.set_pag_applied_layers(pag_applied_layers)
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2`
+                and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, pooled text embeddings are generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt
+                weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input
+                argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned containing the output images.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            image,
+            None,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            negative_pooled_prompt_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        # 3.1 Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt,
+            prompt_2,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 3.2 Encode ip_adapter_image
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare image
+        if isinstance(controlnet, ControlNetModel):
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=False,
+            )
+            height, width = image.shape[-2:]
+        elif isinstance(controlnet, MultiControlNetModel):
+            images = []
+
+            for image_ in image:
+                image_ = self.prepare_image(
+                    image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=False,
+                )
+
+                images.append(image_)
+
+            image = images
+            height, width = image[0].shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6.5 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 7.2 Prepare added time ids & embeddings
+        if isinstance(image, list):
+            original_size = original_size or image[0].shape[-2:]
+        else:
+            original_size = original_size or image.shape[-2:]
+        target_size = target_size or (height, width)
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        images = image if isinstance(image, list) else [image]
+        for i, single_image in enumerate(images):
+            if self.do_classifier_free_guidance:
+                single_image = single_image.chunk(2)[0]
+
+            if self.do_perturbed_attention_guidance:
+                single_image = self._prepare_perturbed_attention_guidance(
+                    single_image, single_image, self.do_classifier_free_guidance
+                )
+            elif self.do_classifier_free_guidance:
+                single_image = torch.cat([single_image] * 2)
+            single_image = single_image.to(device)
+            images[i] = single_image
+
+        image = images if isinstance(image, list) else images[0]
+
+        if ip_adapter_image_embeds is not None:
+            for i, image_embeds in enumerate(ip_adapter_image_embeds):
+                negative_image_embeds = None
+                if self.do_classifier_free_guidance:
+                    negative_image_embeds, image_embeds = image_embeds.chunk(2)
+
+                if self.do_perturbed_attention_guidance:
+                    image_embeds = self._prepare_perturbed_attention_guidance(
+                        image_embeds, negative_image_embeds, self.do_classifier_free_guidance
+                    )
+                elif self.do_classifier_free_guidance:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+                image_embeds = image_embeds.to(device)
+                ip_adapter_image_embeds[i] = image_embeds
+
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_text_embeds = self._prepare_perturbed_attention_guidance(
+                add_text_embeds, negative_pooled_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_time_ids = self._prepare_perturbed_attention_guidance(
+                add_time_ids, negative_add_time_ids, self.do_classifier_free_guidance
+            )
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+        added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+        controlnet_prompt_embeds = prompt_embeds
+        controlnet_added_cond_kwargs = added_cond_kwargs
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+        # 8.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.unet.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+        is_unet_compiled = is_compiled_module(self.unet)
+        is_controlnet_compiled = is_compiled_module(self.controlnet)
+        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Relevant thread:
+                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
+                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
+                    torch._inductor.cudagraph_mark_step_begin()
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                control_model_input = latent_model_input
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=False,
+                    added_cond_kwargs=controlnet_added_cond_kwargs,
+                    return_dict=False,
+                )
+
+                if ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = ip_adapter_image_embeds
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    )
+                elif self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..66398483e046ee034e9c69a8ef92b6ac99c3a240
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
@@ -0,0 +1,1685 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers.utils.import_utils import is_invisible_watermark_available
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from .pag_utils import PAGMixin
+
+
+if is_invisible_watermark_available():
+    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+from ..controlnet.multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # pip install accelerate transformers safetensors diffusers
+
+        >>> import torch
+        >>> import numpy as np
+        >>> from PIL import Image
+
+        >>> from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+        >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetPAGImg2ImgPipeline, AutoencoderKL
+        >>> from diffusers.utils import load_image
+
+
+        >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
+        >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+        >>> controlnet = ControlNetModel.from_pretrained(
+        ...     "diffusers/controlnet-depth-sdxl-1.0-small",
+        ...     variant="fp16",
+        ...     use_safetensors="True",
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionXLControlNetPAGImg2ImgPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0",
+        ...     controlnet=controlnet,
+        ...     vae=vae,
+        ...     variant="fp16",
+        ...     use_safetensors=True,
+        ...     torch_dtype=torch.float16,
+        ...     enable_pag=True,
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+
+        >>> def get_depth_map(image):
+        ...     image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
+        ...     with torch.no_grad(), torch.autocast("cuda"):
+        ...         depth_map = depth_estimator(image).predicted_depth
+
+        ...     depth_map = torch.nn.fuctional.interpolate(
+        ...         depth_map.unsqueeze(1),
+        ...         size=(1024, 1024),
+        ...         mode="bicubic",
+        ...         align_corners=False,
+        ...     )
+        ...     depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
+        ...     depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
+        ...     depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+        ...     image = torch.cat([depth_map] * 3, dim=1)
+        ...     image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
+        ...     image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
+        ...     return image
+
+
+        >>> prompt = "A robot, 4k photo"
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... ).resize((1024, 1024))
+        >>> controlnet_conditioning_scale = 0.5  # recommended for good generalization
+        >>> depth_image = get_depth_map(image)
+
+        >>> images = pipe(
+        ...     prompt,
+        ...     image=image,
+        ...     control_image=depth_image,
+        ...     strength=0.99,
+        ...     num_inference_steps=50,
+        ...     controlnet_conditioning_scale=controlnet_conditioning_scale,
+        ... ).images
+        >>> images[0].save(f"robot_cat.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class StableDiffusionXLControlNetPAGImg2ImgPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    PAGMixin,
+):
+    r"""
+    Pipeline for image-to-image generation using Stable Diffusion XL with ControlNet guidance.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
+            as a list, the outputs from each ControlNet are added together to create one combined additional
+            conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
+            Whether the `unet` requires an `aesthetic_score` condition to be passed during inference. Also see the
+            config of `stabilityai/stable-diffusion-xl-refiner-1-0`.
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "feature_extractor",
+        "image_encoder",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+        scheduler: KarrasDiffusionSchedulers,
+        requires_aesthetics_score: bool = False,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        pag_applied_layers: Union[str, List[str]] = "mid",  # ["mid"], ["down.block_1", "up.block_0.attentions_0"]
+    ):
+        super().__init__()
+
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetModel(controlnet)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+
+        self.set_pag_applied_layers(pag_applied_layers)
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl_img2img.StableDiffusionXLControlNetImg2ImgPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        strength,
+        num_inference_steps,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        controlnet_conditioning_scale=1.0,
+        control_guidance_start=0.0,
+        control_guidance_end=1.0,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+        if num_inference_steps is None:
+            raise ValueError("`num_inference_steps` cannot be None.")
+        elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
+            raise ValueError(
+                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
+                f" {type(num_inference_steps)}."
+            )
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            self.check_image(image, prompt, prompt_embeds)
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif (
+            isinstance(self.controlnet, MultiControlNetModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+        ):
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if isinstance(self.controlnet, MultiControlNetModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
+        for start, end in zip(control_guidance_start, control_guidance_end):
+            if start >= end:
+                raise ValueError(
+                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+                )
+            if start < 0.0:
+                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+            if end > 1.0:
+                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image
+    def check_image(self, image, prompt, prompt_embeds):
+        image_is_pil = isinstance(image, PIL.Image.Image)
+        image_is_tensor = isinstance(image, torch.Tensor)
+        image_is_np = isinstance(image, np.ndarray)
+        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+        if (
+            not image_is_pil
+            and not image_is_tensor
+            and not image_is_np
+            and not image_is_pil_list
+            and not image_is_tensor_list
+            and not image_is_np_list
+        ):
+            raise TypeError(
+                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+            )
+
+        if image_is_pil:
+            image_batch_size = 1
+        else:
+            image_batch_size = len(image)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.prepare_image
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.prepare_latents
+    def prepare_latents(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
+    ):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        latents_mean = latents_std = None
+        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
+
+        # Offload text encoder if `enable_model_cpu_offload` was enabled
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.text_encoder_2.to("cpu")
+            torch.cuda.empty_cache()
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                image = image.float()
+                self.vae.to(dtype=torch.float32)
+
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                    raise ValueError(
+                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+
+            init_latents = init_latents.to(dtype)
+            if latents_mean is not None and latents_std is not None:
+                latents_mean = latents_mean.to(device=device, dtype=dtype)
+                latents_std = latents_std.to(device=device, dtype=dtype)
+                init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
+            else:
+                init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        if add_noise:
+            shape = init_latents.shape
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # get latents
+            init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        negative_original_size,
+        negative_crops_coords_top_left,
+        negative_target_size,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(
+                negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+            )
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if (
+            expected_add_embed_dim > passed_add_embed_dim
+            and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+            )
+        elif (
+            expected_add_embed_dim < passed_add_embed_dim
+            and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+            )
+        elif expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+        return add_time_ids, add_neg_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        control_image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
+        guess_mode: bool = False,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The initial image will be used as the starting point for the image generation process. Can also accept
+                image latents as `image`, if passing latents directly, it will not be encoded again.
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also
+                be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
+                and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in
+                init, images must be passed as a list such that each element of the list can be correctly batched for
+                input to a single controlnet.
+            height (`int`, *optional*, defaults to the size of control_image):
+                The height in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to the size of control_image):
+                The width in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+                corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
+                you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the controlnet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the controlnet stops applying.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple` containing the output images.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            control_image,
+            strength,
+            num_inference_steps,
+            None,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+        # 3.1 Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt,
+            prompt_2,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 3.2 Encode ip_adapter_image
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare image and controlnet_conditioning_image
+        image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+
+        if isinstance(controlnet, ControlNetModel):
+            control_image = self.prepare_control_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                guess_mode=False,
+            )
+            height, width = control_image.shape[-2:]
+        elif isinstance(controlnet, MultiControlNetModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                control_image_ = self.prepare_control_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=False,
+                )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+            height, width = control_image[0].shape[-2:]
+        else:
+            assert False
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare latent variables
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                True,
+            )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Create tensor stating which controlnets to keep
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+        # 7.2 Prepare added time ids & embeddings
+        if isinstance(control_image, list):
+            original_size = original_size or control_image[0].shape[-2:]
+        else:
+            original_size = original_size or control_image.shape[-2:]
+        target_size = target_size or (height, width)
+
+        if negative_original_size is None:
+            negative_original_size = original_size
+        if negative_target_size is None:
+            negative_target_size = target_size
+        add_text_embeds = pooled_prompt_embeds
+
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            negative_original_size,
+            negative_crops_coords_top_left,
+            negative_target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+        add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        control_images = control_image if isinstance(control_image, list) else [control_image]
+        for i, single_image in enumerate(control_images):
+            if self.do_classifier_free_guidance:
+                single_image = single_image.chunk(2)[0]
+
+            if self.do_perturbed_attention_guidance:
+                single_image = self._prepare_perturbed_attention_guidance(
+                    single_image, single_image, self.do_classifier_free_guidance
+                )
+            elif self.do_classifier_free_guidance:
+                single_image = torch.cat([single_image] * 2)
+            single_image = single_image.to(device)
+            control_images[i] = single_image
+
+        control_image = control_images if isinstance(control_image, list) else control_images[0]
+
+        if ip_adapter_image_embeds is not None:
+            for i, image_embeds in enumerate(ip_adapter_image_embeds):
+                negative_image_embeds = None
+                if self.do_classifier_free_guidance:
+                    negative_image_embeds, image_embeds = image_embeds.chunk(2)
+
+                if self.do_perturbed_attention_guidance:
+                    image_embeds = self._prepare_perturbed_attention_guidance(
+                        image_embeds, negative_image_embeds, self.do_classifier_free_guidance
+                    )
+                elif self.do_classifier_free_guidance:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+                image_embeds = image_embeds.to(device)
+                ip_adapter_image_embeds[i] = image_embeds
+
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_text_embeds = self._prepare_perturbed_attention_guidance(
+                add_text_embeds, negative_pooled_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_time_ids = self._prepare_perturbed_attention_guidance(
+                add_time_ids, add_neg_time_ids, self.do_classifier_free_guidance
+            )
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+        added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+        controlnet_prompt_embeds = prompt_embeds
+        controlnet_added_cond_kwargs = added_cond_kwargs
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.unet.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                control_model_input = latent_model_input
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=control_image,
+                    conditioning_scale=cond_scale,
+                    guess_mode=False,
+                    added_cond_kwargs=controlnet_added_cond_kwargs,
+                    return_dict=False,
+                )
+
+                if ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = ip_adapter_image_embeds
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    )
+                elif self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+            return StableDiffusionXLPipelineOutput(images=image)
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_hunyuandit.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_hunyuandit.py
new file mode 100644
index 0000000000000000000000000000000000000000..63126cc5aae9bdf446aaf695ba6db07dabb891f0
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_hunyuandit.py
@@ -0,0 +1,953 @@
+# Copyright 2024 HunyuanDiT Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import BertModel, BertTokenizer, CLIPImageProcessor, MT5Tokenizer, T5EncoderModel
+
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import VaeImageProcessor
+from ...models import AutoencoderKL, HunyuanDiT2DModel
+from ...models.attention_processor import PAGCFGHunyuanAttnProcessor2_0, PAGHunyuanAttnProcessor2_0
+from ...models.embeddings import get_2d_rotary_pos_embed
+from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from ...schedulers import DDPMScheduler
+from ...utils import (
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pag_utils import PAGMixin
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import torch
+        >>> from diffusers import AutoPipelineForText2Image
+
+        >>> pipe = AutoPipelineForText2Image.from_pretrained(
+        ...     "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers",
+        ...     torch_dtype=torch.float16,
+        ...     enable_pag=True,
+        ...     pag_applied_layers=[14],
+        ... ).to("cuda")
+
+        >>> # prompt = "an astronaut riding a horse"
+        >>> prompt = "一个宇航员在骑马"
+        >>> image = pipe(prompt, guidance_scale=4, pag_scale=3).images[0]
+        ```
+"""
+
+STANDARD_RATIO = np.array(
+    [
+        1.0,  # 1:1
+        4.0 / 3.0,  # 4:3
+        3.0 / 4.0,  # 3:4
+        16.0 / 9.0,  # 16:9
+        9.0 / 16.0,  # 9:16
+    ]
+)
+STANDARD_SHAPE = [
+    [(1024, 1024), (1280, 1280)],  # 1:1
+    [(1024, 768), (1152, 864), (1280, 960)],  # 4:3
+    [(768, 1024), (864, 1152), (960, 1280)],  # 3:4
+    [(1280, 768)],  # 16:9
+    [(768, 1280)],  # 9:16
+]
+STANDARD_AREA = [np.array([w * h for w, h in shapes]) for shapes in STANDARD_SHAPE]
+SUPPORTED_SHAPE = [
+    (1024, 1024),
+    (1280, 1280),  # 1:1
+    (1024, 768),
+    (1152, 864),
+    (1280, 960),  # 4:3
+    (768, 1024),
+    (864, 1152),
+    (960, 1280),  # 3:4
+    (1280, 768),  # 16:9
+    (768, 1280),  # 9:16
+]
+
+
+def map_to_standard_shapes(target_width, target_height):
+    target_ratio = target_width / target_height
+    closest_ratio_idx = np.argmin(np.abs(STANDARD_RATIO - target_ratio))
+    closest_area_idx = np.argmin(np.abs(STANDARD_AREA[closest_ratio_idx] - target_width * target_height))
+    width, height = STANDARD_SHAPE[closest_ratio_idx][closest_area_idx]
+    return width, height
+
+
+def get_resize_crop_region_for_grid(src, tgt_size):
+    th = tw = tgt_size
+    h, w = src
+
+    r = h / w
+
+    # resize
+    if r > 1:
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class HunyuanDiTPAGPipeline(DiffusionPipeline, PAGMixin):
+    r"""
+    Pipeline for English/Chinese-to-image generation using HunyuanDiT and [Perturbed Attention
+    Guidance](https://huggingface.co/docs/diffusers/en/using-diffusers/pag).
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    HunyuanDiT uses two text encoders: [mT5](https://huggingface.co/google/mt5-base) and [bilingual CLIP](fine-tuned by
+    ourselves)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. We use
+            `sdxl-vae-fp16-fix`.
+        text_encoder (Optional[`~transformers.BertModel`, `~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+            HunyuanDiT uses a fine-tuned [bilingual CLIP].
+        tokenizer (Optional[`~transformers.BertTokenizer`, `~transformers.CLIPTokenizer`]):
+            A `BertTokenizer` or `CLIPTokenizer` to tokenize text.
+        transformer ([`HunyuanDiT2DModel`]):
+            The HunyuanDiT model designed by Tencent Hunyuan.
+        text_encoder_2 (`T5EncoderModel`):
+            The mT5 embedder. Specifically, it is 't5-v1_1-xxl'.
+        tokenizer_2 (`MT5Tokenizer`):
+            The tokenizer for the mT5 embedder.
+        scheduler ([`DDPMScheduler`]):
+            A scheduler to be used in combination with HunyuanDiT to denoise the encoded image latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = [
+        "safety_checker",
+        "feature_extractor",
+        "text_encoder_2",
+        "tokenizer_2",
+        "text_encoder",
+        "tokenizer",
+    ]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "prompt_embeds_2",
+        "negative_prompt_embeds_2",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: BertModel,
+        tokenizer: BertTokenizer,
+        transformer: HunyuanDiT2DModel,
+        scheduler: DDPMScheduler,
+        safety_checker: Optional[StableDiffusionSafetyChecker] = None,
+        feature_extractor: Optional[CLIPImageProcessor] = None,
+        requires_safety_checker: bool = True,
+        text_encoder_2: Optional[T5EncoderModel] = None,
+        tokenizer_2: Optional[MT5Tokenizer] = None,
+        pag_applied_layers: Union[str, List[str]] = "blocks.1",  # "blocks.16.attn1", "blocks.16", "16", 16
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            text_encoder_2=text_encoder_2,
+        )
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )
+
+        self.set_pag_applied_layers(
+            pag_applied_layers, pag_attn_processors=(PAGCFGHunyuanAttnProcessor2_0(), PAGHunyuanAttnProcessor2_0())
+        )
+
+    # Copied from diffusers.pipelines.hunyuandit.pipeline_hunyuandit.HunyuanDiTPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        device: torch.device = None,
+        dtype: torch.dtype = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        max_sequence_length: Optional[int] = None,
+        text_encoder_index: int = 0,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            dtype (`torch.dtype`):
+                torch dtype
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
+            max_sequence_length (`int`, *optional*): maximum sequence length to use for the prompt.
+            text_encoder_index (`int`, *optional*):
+                Index of the text encoder to use. `0` for clip and `1` for T5.
+        """
+        if dtype is None:
+            if self.text_encoder_2 is not None:
+                dtype = self.text_encoder_2.dtype
+            elif self.transformer is not None:
+                dtype = self.transformer.dtype
+            else:
+                dtype = None
+
+        if device is None:
+            device = self._execution_device
+
+        tokenizers = [self.tokenizer, self.tokenizer_2]
+        text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = tokenizers[text_encoder_index]
+        text_encoder = text_encoders[text_encoder_index]
+
+        if max_sequence_length is None:
+            if text_encoder_index == 0:
+                max_length = 77
+            if text_encoder_index == 1:
+                max_length = 256
+        else:
+            max_length = max_sequence_length
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            text_inputs = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_attention_mask = text_inputs.attention_mask.to(device)
+            prompt_embeds = text_encoder(
+                text_input_ids.to(device),
+                attention_mask=prompt_attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+            prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            negative_prompt_attention_mask = uncond_input.attention_mask.to(device)
+            negative_prompt_embeds = text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=negative_prompt_attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds, negative_prompt_embeds, prompt_attention_mask, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.hunyuandit.pipeline_hunyuandit.HunyuanDiTPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+        prompt_embeds_2=None,
+        negative_prompt_embeds_2=None,
+        prompt_attention_mask_2=None,
+        negative_prompt_attention_mask_2=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is None and prompt_embeds_2 is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds_2`. Cannot leave both `prompt` and `prompt_embeds_2` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if prompt_embeds_2 is not None and prompt_attention_mask_2 is None:
+            raise ValueError("Must provide `prompt_attention_mask_2` when specifying `prompt_embeds_2`.")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if negative_prompt_embeds_2 is not None and negative_prompt_attention_mask_2 is None:
+            raise ValueError(
+                "Must provide `negative_prompt_attention_mask_2` when specifying `negative_prompt_embeds_2`."
+            )
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if prompt_embeds_2 is not None and negative_prompt_embeds_2 is not None:
+            if prompt_embeds_2.shape != negative_prompt_embeds_2.shape:
+                raise ValueError(
+                    "`prompt_embeds_2` and `negative_prompt_embeds_2` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds_2` {prompt_embeds_2.shape} != `negative_prompt_embeds_2`"
+                    f" {negative_prompt_embeds_2.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_2: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_2: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        prompt_attention_mask_2: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask_2: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = (1024, 1024),
+        target_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        use_resolution_binning: bool = True,
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ):
+        r"""
+        The call function to the pipeline for generation with HunyuanDiT.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`):
+                The height in pixels of the generated image.
+            width (`int`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            negative_prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
+            prompt_attention_mask_2 (`torch.Tensor`, *optional*):
+                Attention mask for the prompt. Required when `prompt_embeds_2` is passed directly.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
+            negative_prompt_attention_mask_2 (`torch.Tensor`, *optional*):
+                Attention mask for the negative prompt. Required when `negative_prompt_embeds_2` is passed directly.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback_on_step_end (`Callable[[int, int, Dict], None]`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A callback function or a list of callback functions to be called at the end of each denoising step.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                A list of tensor inputs that should be passed to the callback function. If not defined, all tensor
+                inputs will be passed.
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Rescale the noise_cfg according to `guidance_rescale`. Based on findings of [Common Diffusion Noise
+                Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+            original_size (`Tuple[int, int]`, *optional*, defaults to `(1024, 1024)`):
+                The original size of the image. Used to calculate the time ids.
+            target_size (`Tuple[int, int]`, *optional*):
+                The target size of the image. Used to calculate the time ids.
+            crops_coords_top_left (`Tuple[int, int]`, *optional*, defaults to `(0, 0)`):
+                The top left coordinates of the crop. Used to calculate the time ids.
+            use_resolution_binning (`bool`, *optional*, defaults to `True`):
+                Whether to use resolution binning or not. If `True`, the input resolution will be mapped to the closest
+                standard resolution. Supported resolutions are 1024x1024, 1280x1280, 1024x768, 1152x864, 1280x960,
+                768x1024, 864x1152, 960x1280, 1280x768, and 768x1280. It is recommended to set this to `True`.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default height and width
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        height = int((height // 16) * 16)
+        width = int((width // 16) * 16)
+
+        if use_resolution_binning and (height, width) not in SUPPORTED_SHAPE:
+            width, height = map_to_standard_shapes(width, height)
+            height = int(height)
+            width = int(width)
+            logger.warning(f"Reshaped to (height, width)=({height}, {width}), Supported shapes are {SUPPORTED_SHAPE}")
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+            prompt_embeds_2,
+            negative_prompt_embeds_2,
+            prompt_attention_mask_2,
+            negative_prompt_attention_mask_2,
+            callback_on_step_end_tensor_inputs,
+        )
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._interrupt = False
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            max_sequence_length=77,
+            text_encoder_index=0,
+        )
+        (
+            prompt_embeds_2,
+            negative_prompt_embeds_2,
+            prompt_attention_mask_2,
+            negative_prompt_attention_mask_2,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds_2,
+            negative_prompt_embeds=negative_prompt_embeds_2,
+            prompt_attention_mask=prompt_attention_mask_2,
+            negative_prompt_attention_mask=negative_prompt_attention_mask_2,
+            max_sequence_length=256,
+            text_encoder_index=1,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Create image_rotary_emb, style embedding & time ids
+        grid_height = height // 8 // self.transformer.config.patch_size
+        grid_width = width // 8 // self.transformer.config.patch_size
+        base_size = 512 // 8 // self.transformer.config.patch_size
+        grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size)
+        image_rotary_emb = get_2d_rotary_pos_embed(
+            self.transformer.inner_dim // self.transformer.num_heads, grid_crops_coords, (grid_height, grid_width)
+        )
+
+        style = torch.tensor([0], device=device)
+
+        target_size = target_size or (height, width)
+        add_time_ids = list(original_size + target_size + crops_coords_top_left)
+        add_time_ids = torch.tensor([add_time_ids], dtype=prompt_embeds.dtype)
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+            prompt_attention_mask = self._prepare_perturbed_attention_guidance(
+                prompt_attention_mask, negative_prompt_attention_mask, self.do_classifier_free_guidance
+            )
+            prompt_embeds_2 = self._prepare_perturbed_attention_guidance(
+                prompt_embeds_2, negative_prompt_embeds_2, self.do_classifier_free_guidance
+            )
+            prompt_attention_mask_2 = self._prepare_perturbed_attention_guidance(
+                prompt_attention_mask_2, negative_prompt_attention_mask_2, self.do_classifier_free_guidance
+            )
+            add_time_ids = torch.cat([add_time_ids] * 3, dim=0)
+            style = torch.cat([style] * 3, dim=0)
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask])
+            prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
+            prompt_attention_mask_2 = torch.cat([negative_prompt_attention_mask_2, prompt_attention_mask_2])
+            add_time_ids = torch.cat([add_time_ids] * 2, dim=0)
+            style = torch.cat([style] * 2, dim=0)
+
+        prompt_embeds = prompt_embeds.to(device=device)
+        prompt_attention_mask = prompt_attention_mask.to(device=device)
+        prompt_embeds_2 = prompt_embeds_2.to(device=device)
+        prompt_attention_mask_2 = prompt_attention_mask_2.to(device=device)
+        add_time_ids = add_time_ids.to(dtype=prompt_embeds.dtype, device=device).repeat(
+            batch_size * num_images_per_prompt, 1
+        )
+        style = style.to(device=device).repeat(batch_size * num_images_per_prompt)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.transformer.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # expand scalar t to 1-D tensor to match the 1st dim of latent_model_input
+                t_expand = torch.tensor([t] * latent_model_input.shape[0], device=device).to(
+                    dtype=latent_model_input.dtype
+                )
+
+                # predict the noise residual
+                noise_pred = self.transformer(
+                    latent_model_input,
+                    t_expand,
+                    encoder_hidden_states=prompt_embeds,
+                    text_embedding_mask=prompt_attention_mask,
+                    encoder_hidden_states_t5=prompt_embeds_2,
+                    text_embedding_mask_t5=prompt_attention_mask_2,
+                    image_meta_size=add_time_ids,
+                    style=style,
+                    image_rotary_emb=image_rotary_emb,
+                    return_dict=False,
+                )[0]
+
+                noise_pred, _ = noise_pred.chunk(2, dim=1)
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    )
+                elif self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    prompt_embeds_2 = callback_outputs.pop("prompt_embeds_2", prompt_embeds_2)
+                    negative_prompt_embeds_2 = callback_outputs.pop(
+                        "negative_prompt_embeds_2", negative_prompt_embeds_2
+                    )
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # 9. Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.transformer.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_kolors.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_kolors.py
new file mode 100644
index 0000000000000000000000000000000000000000..3255bfdfc85f4dceec8259fbfa9191db30b730e7
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_kolors.py
@@ -0,0 +1,1136 @@
+# Copyright 2024 Stability AI, Kwai-Kolors Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import IPAdapterMixin, StableDiffusionXLLoraLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..kolors.pipeline_output import KolorsPipelineOutput
+from ..kolors.text_encoder import ChatGLMModel
+from ..kolors.tokenizer import ChatGLMTokenizer
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pag_utils import PAGMixin
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AutoPipelineForText2Image
+
+        >>> pipe = AutoPipelineForText2Image.from_pretrained(
+        ...     "Kwai-Kolors/Kolors-diffusers",
+        ...     variant="fp16",
+        ...     torch_dtype=torch.float16,
+        ...     enable_pag=True,
+        ...     pag_applied_layers=["down.block_2.attentions_1", "up.block_0.attentions_1"],
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = (
+        ...     "A photo of a ladybug, macro, zoom, high quality, film, holding a wooden sign with the text 'KOLORS'"
+        ... )
+        >>> image = pipe(prompt, guidance_scale=5.5, pag_scale=1.5).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class KolorsPAGPipeline(
+    DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin, PAGMixin
+):
+    r"""
+    Pipeline for text-to-image generation using Kolors.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`ChatGLMModel`]):
+            Frozen text-encoder. Kolors uses [ChatGLM3-6B](https://huggingface.co/THUDM/chatglm3-6b).
+        tokenizer (`ChatGLMTokenizer`):
+            Tokenizer of class
+            [ChatGLMTokenizer](https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"False"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `Kwai-Kolors/Kolors-diffusers`.
+        pag_applied_layers (`str` or `List[str]``, *optional*, defaults to `"mid"`):
+            Set the transformer attention layers where to apply the perturbed attention guidance. Can be a string or a
+            list of strings with "down", "mid", "up", a whole transformer block or specific transformer block attention
+            layers, e.g.:
+                ["mid"] ["down", "mid"] ["down", "mid", "up.block_1"] ["down", "mid", "up.block_1.attentions_0",
+                "up.block_1.attentions_1"]
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = [
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: ChatGLMModel,
+        tokenizer: ChatGLMTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = False,
+        pag_applied_layers: Union[str, List[str]] = "mid",
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.default_sample_size = self.unet.config.sample_size
+
+        self.set_pag_applied_layers(pag_applied_layers)
+
+    # Copied from diffusers.pipelines.kolors.pipeline_kolors.KolorsPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+        """
+        # from IPython import embed; embed(); exit()
+        device = device or self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer]
+        text_encoders = [self.text_encoder]
+
+        if prompt_embeds is None:
+            prompt_embeds_list = []
+            for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=max_sequence_length,
+                    truncation=True,
+                    return_tensors="pt",
+                ).to(device)
+                output = text_encoder(
+                    input_ids=text_inputs["input_ids"],
+                    attention_mask=text_inputs["attention_mask"],
+                    position_ids=text_inputs["position_ids"],
+                    output_hidden_states=True,
+                )
+
+                # [max_sequence_length, batch, hidden_size] -> [batch, max_sequence_length, hidden_size]
+                # clone to have a contiguous tensor
+                prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
+                # [max_sequence_length, batch, hidden_size] -> [batch, hidden_size]
+                pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()
+                bs_embed, seq_len, _ = prompt_embeds.shape
+                prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+                prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = prompt_embeds_list[0]
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            negative_prompt_embeds_list = []
+
+            for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+                uncond_input = tokenizer(
+                    uncond_tokens,
+                    padding="max_length",
+                    max_length=max_sequence_length,
+                    truncation=True,
+                    return_tensors="pt",
+                ).to(device)
+                output = text_encoder(
+                    input_ids=uncond_input["input_ids"],
+                    attention_mask=uncond_input["attention_mask"],
+                    position_ids=uncond_input["position_ids"],
+                    output_hidden_states=True,
+                )
+
+                # [max_sequence_length, batch, hidden_size] -> [batch, max_sequence_length, hidden_size]
+                # clone to have a contiguous tensor
+                negative_prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
+                # [max_sequence_length, batch, hidden_size] -> [batch, hidden_size]
+                negative_pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()
+
+                if do_classifier_free_guidance:
+                    # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+                    seq_len = negative_prompt_embeds.shape[1]
+
+                    negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+
+                    negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+                    negative_prompt_embeds = negative_prompt_embeds.view(
+                        batch_size * num_images_per_prompt, seq_len, -1
+                    )
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = negative_prompt_embeds_list[0]
+
+        bs_embed = pooled_prompt_embeds.shape[0]
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.kolors.pipeline_kolors.KolorsPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        num_inference_steps,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
+            raise ValueError(
+                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
+                f" {type(num_inference_steps)}."
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+        if max_sequence_length is not None and max_sequence_length > 256:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 256 but is {max_sequence_length}")
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [Kwai-Kolors/Kolors-diffusers](https://huggingface.co/Kwai-Kolors/Kolors-diffusers) and checkpoints
+                that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [Kwai-Kolors/Kolors-diffusers](https://huggingface.co/Kwai-Kolors/Kolors-diffusers) and checkpoints
+                that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.kolors.KolorsPipelineOutput`] instead of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.kolors.KolorsPipelineOutput`] or `tuple`: [`~pipelines.kolors.KolorsPipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            num_inference_steps,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            pooled_prompt_embeds,
+            negative_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._interrupt = False
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_text_embeds = self._prepare_perturbed_attention_guidance(
+                add_text_embeds, negative_pooled_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_time_ids = self._prepare_perturbed_attention_guidance(
+                add_time_ids, negative_add_time_ids, self.do_classifier_free_guidance
+            )
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            for i, image_embeds in enumerate(ip_adapter_image_embeds):
+                negative_image_embeds = None
+                if self.do_classifier_free_guidance:
+                    negative_image_embeds, image_embeds = image_embeds.chunk(2)
+
+                if self.do_perturbed_attention_guidance:
+                    image_embeds = self._prepare_perturbed_attention_guidance(
+                        image_embeds, negative_image_embeds, self.do_classifier_free_guidance
+                    )
+                elif self.do_classifier_free_guidance:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+                image_embeds = image_embeds.to(device)
+                ip_adapter_image_embeds[i] = image_embeds
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 8.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 9. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.unet.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    )
+                elif self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            # unscale/denormalize the latents
+            latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image,)
+
+        return KolorsPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e5e6cbaf5adacc6de27e1bfb44cbae4c63f3153
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py
@@ -0,0 +1,872 @@
+# Copyright 2024 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...image_processor import PixArtImageProcessor
+from ...models import AutoencoderKL, PixArtTransformer2DModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    BACKENDS_MAPPING,
+    deprecate,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ..pixart_alpha.pipeline_pixart_alpha import (
+    ASPECT_RATIO_256_BIN,
+    ASPECT_RATIO_512_BIN,
+    ASPECT_RATIO_1024_BIN,
+)
+from ..pixart_alpha.pipeline_pixart_sigma import ASPECT_RATIO_2048_BIN
+from .pag_utils import PAGMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AutoPipelineForText2Image
+
+        >>> pipe = AutoPipelineForText2Image.from_pretrained(
+        ...     "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
+        ...     torch_dtype=torch.float16,
+        ...     pag_applied_layers=["blocks.14"],
+        ...     enable_pag=True,
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "A small cactus with a happy face in the Sahara desert"
+        >>> image = pipe(prompt, pag_scale=4.0, guidance_scale=1.0).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class PixArtSigmaPAGPipeline(DiffusionPipeline, PAGMixin):
+    r"""
+    [PAG pipeline](https://huggingface.co/docs/diffusers/main/en/using-diffusers/pag) for text-to-image generation
+    using PixArt-Sigma.
+    """
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    _optional_components = ["tokenizer", "text_encoder"]
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        vae: AutoencoderKL,
+        transformer: PixArtTransformer2DModel,
+        scheduler: KarrasDiffusionSchedulers,
+        pag_applied_layers: Union[str, List[str]] = "blocks.1",  # 1st transformer block
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.set_pag_applied_layers(pag_applied_layers)
+
+    # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt with 120->300
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str = "",
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+        max_sequence_length: int = 300,
+        **kwargs,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+                PixArt-Alpha, this should be "".
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
+                string.
+            clean_caption (`bool`, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+            max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
+        """
+
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # See Section 3.1. of the paper.
+        max_length = max_sequence_length
+
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because T5 can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            prompt_attention_mask = text_inputs.attention_mask
+            prompt_attention_mask = prompt_attention_mask.to(device)
+
+            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.transformer is not None:
+            dtype = self.transformer.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
+        prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            negative_prompt_attention_mask = uncond_input.attention_mask
+            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+        else:
+            negative_prompt_embeds = None
+            negative_prompt_attention_mask = None
+
+        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        callback_steps,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: str = "",
+        num_inference_steps: int = 20,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 4.5,
+        num_images_per_prompt: Optional[int] = 1,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        clean_caption: bool = True,
+        use_resolution_binning: bool = True,
+        max_sequence_length: int = 300,
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 4.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The width in pixels of the generated image.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
+                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            use_resolution_binning (`bool` defaults to `True`):
+                If set to `True`, the requested height and width are first mapped to the closest resolutions using
+                `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
+                the requested resolution. Useful for generating non-square images.
+            max_sequence_length (`int` defaults to 300): Maximum sequence length to use with the `prompt`.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        # 1. Check inputs. Raise error if not correct
+        height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        width = width or self.transformer.config.sample_size * self.vae_scale_factor
+        if use_resolution_binning:
+            if self.transformer.config.sample_size == 256:
+                aspect_ratio_bin = ASPECT_RATIO_2048_BIN
+            elif self.transformer.config.sample_size == 128:
+                aspect_ratio_bin = ASPECT_RATIO_1024_BIN
+            elif self.transformer.config.sample_size == 64:
+                aspect_ratio_bin = ASPECT_RATIO_512_BIN
+            elif self.transformer.config.sample_size == 32:
+                aspect_ratio_bin = ASPECT_RATIO_256_BIN
+            else:
+                raise ValueError("Invalid sample size")
+            orig_height, orig_width = height, width
+            height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
+
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            callback_steps,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        )
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Default height and width to transformer
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            clean_caption=clean_caption,
+            max_sequence_length=max_sequence_length,
+        )
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, do_classifier_free_guidance
+            )
+            prompt_attention_mask = self._prepare_perturbed_attention_guidance(
+                prompt_attention_mask, negative_prompt_attention_mask, do_classifier_free_guidance
+            )
+        elif do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            latent_channels,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.transformer.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+            )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Prepare micro-conditions.
+        added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+
+        # 7. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance, perturbed-attention guidance, or both
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                current_timestep = t
+                if not torch.is_tensor(current_timestep):
+                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                    # This would be a good case for the `match` statement (Python 3.10+)
+                    is_mps = latent_model_input.device.type == "mps"
+                    if isinstance(current_timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
+                elif len(current_timestep.shape) == 0:
+                    current_timestep = current_timestep[None].to(latent_model_input.device)
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+                # predict noise model_output
+                noise_pred = self.transformer(
+                    latent_model_input,
+                    encoder_hidden_states=prompt_embeds,
+                    encoder_attention_mask=prompt_attention_mask,
+                    timestep=current_timestep,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, do_classifier_free_guidance, guidance_scale, current_timestep
+                    )
+                elif do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # learned sigma
+                if self.transformer.config.out_channels // 2 == latent_channels:
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+                else:
+                    noise_pred = noise_pred
+
+                # compute previous image: x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            if use_resolution_binning:
+                image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.transformer.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6a4f7f42c84f48876d322f32dbfe4470e028755
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd.py
@@ -0,0 +1,1050 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...configuration_utils import FrozenDict
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .pag_utils import PAGMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AutoPipelineForText2Image
+
+        >>> pipe = AutoPipelineForText2Image.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, enable_pag=True
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt, pag_scale=0.3).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionPAGPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+    PAGMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+        pag_applied_layers: Union[str, List[str]] = "mid",
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+        self.set_pag_applied_layers(pag_applied_layers)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            None,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            for i, image_embeds in enumerate(ip_adapter_image_embeds):
+                negative_image_embeds = None
+                if self.do_classifier_free_guidance:
+                    negative_image_embeds, image_embeds = image_embeds.chunk(2)
+                if self.do_perturbed_attention_guidance:
+                    image_embeds = self._prepare_perturbed_attention_guidance(
+                        image_embeds, negative_image_embeds, self.do_classifier_free_guidance
+                    )
+
+                elif self.do_classifier_free_guidance:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+                image_embeds = image_embeds.to(device)
+                ip_adapter_image_embeds[i] = image_embeds
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": ip_adapter_image_embeds}
+            if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
+            else None
+        )
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.unet.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    )
+
+                elif self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..3035509843c0305080c43ed43ec519f353907b36
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py
@@ -0,0 +1,985 @@
+# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...models.attention_processor import PAGCFGJointAttnProcessor2_0, PAGJointAttnProcessor2_0
+from ...models.autoencoders import AutoencoderKL
+from ...models.transformers import SD3Transformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from ..stable_diffusion_3.pipeline_output import StableDiffusion3PipelineOutput
+from .pag_utils import PAGMixin
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AutoPipelineForText2Image
+
+        >>> pipe = AutoPipelineForText2Image.from_pretrained(
+        ...     "stabilityai/stable-diffusion-3-medium-diffusers",
+        ...     torch_dtype=torch.float16,
+        ...     enable_pag=True,
+        ...     pag_applied_layers=["blocks.13"],
+        ... )
+        >>> pipe.to("cuda")
+        >>> prompt = "A cat holding a sign that says hello world"
+        >>> image = pipe(prompt, guidance_scale=5.0, pag_scale=0.7).images[0]
+        >>> image.save("sd3_pag.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusion3PAGPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, PAGMixin):
+    r"""
+    [PAG pipeline](https://huggingface.co/docs/diffusers/main/en/using-diffusers/pag) for text-to-image generation
+    using Stable Diffusion 3.
+
+    Args:
+        transformer ([`SD3Transformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
+            with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
+            as its dimension.
+        text_encoder_2 ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        text_encoder_3 ([`T5EncoderModel`]):
+            Frozen text-encoder. Stable Diffusion 3 uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_3 (`T5TokenizerFast`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
+
+    def __init__(
+        self,
+        transformer: SD3Transformer2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer_2: CLIPTokenizer,
+        text_encoder_3: T5EncoderModel,
+        tokenizer_3: T5TokenizerFast,
+        pag_applied_layers: Union[str, List[str]] = "blocks.1",  # 1st transformer block
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            text_encoder_3=text_encoder_3,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            tokenizer_3=tokenizer_3,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )
+
+        self.set_pag_applied_layers(
+            pag_applied_layers, pag_attn_processors=(PAGCFGJointAttnProcessor2_0(), PAGJointAttnProcessor2_0())
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 256,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        if self.text_encoder_3 is None:
+            return torch.zeros(
+                (
+                    batch_size * num_images_per_prompt,
+                    self.tokenizer_max_length,
+                    self.transformer.config.joint_attention_dim,
+                ),
+                device=device,
+                dtype=dtype,
+            )
+
+        text_inputs = self.tokenizer_3(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
+
+        dtype = self.text_encoder_3.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        clip_skip: Optional[int] = None,
+        clip_model_index: int = 0,
+    ):
+        device = device or self._execution_device
+
+        clip_tokenizers = [self.tokenizer, self.tokenizer_2]
+        clip_text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = clip_tokenizers[clip_model_index]
+        text_encoder = clip_text_encoders[clip_model_index]
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+        pooled_prompt_embeds = prompt_embeds[0]
+
+        if clip_skip is None:
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+        else:
+            prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds, pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        prompt_3: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        clip_skip: Optional[int] = None,
+        max_sequence_length: int = 256,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            prompt_3 = prompt_3 or prompt
+            prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3
+
+            prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=0,
+            )
+            prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                prompt=prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=1,
+            )
+            clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)
+
+            t5_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            clip_prompt_embeds = torch.nn.functional.pad(
+                clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+            )
+
+            prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+            pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+            negative_prompt_3 = negative_prompt_3 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+            negative_prompt_3 = (
+                batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3
+            )
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds(
+                negative_prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=0,
+            )
+            negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                negative_prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=1,
+            )
+            negative_clip_prompt_embeds = torch.cat([negative_prompt_embed, negative_prompt_2_embed], dim=-1)
+
+            t5_negative_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=negative_prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            negative_clip_prompt_embeds = torch.nn.functional.pad(
+                negative_clip_prompt_embeds,
+                (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]),
+            )
+
+            negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2)
+            negative_pooled_prompt_embeds = torch.cat(
+                [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        prompt_3,
+        height,
+        width,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        negative_prompt_3=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_3 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_3`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        elif prompt_3 is not None and (not isinstance(prompt_3, str) and not isinstance(prompt_3, list)):
+            raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_3 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        prompt_3: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 28,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                will be used instead
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used instead
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used instead
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            prompt_3,
+            height,
+            width,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale  #
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_3=prompt_3,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            device=device,
+            clip_skip=self.clip_skip,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+            pooled_prompt_embeds = self._prepare_perturbed_attention_guidance(
+                pooled_prompt_embeds, negative_pooled_prompt_embeds, self.do_classifier_free_guidance
+            )
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.transformer.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance, perturbed-attention guidance, or both
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    )
+
+                elif self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.transformer.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusion3PipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e81fa3a158c21a87e9cf2655ae767ac7d57d50b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py
@@ -0,0 +1,866 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...models.unets.unet_motion_model import MotionAdapter
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..animatediff.pipeline_output import AnimateDiffPipelineOutput
+from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pag_utils import PAGMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AnimateDiffPAGPipeline, MotionAdapter, DDIMScheduler
+        >>> from diffusers.utils import export_to_gif
+
+        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
+        >>> motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-2"
+        >>> motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id)
+        >>> scheduler = DDIMScheduler.from_pretrained(
+        ...     model_id, subfolder="scheduler", beta_schedule="linear", steps_offset=1, clip_sample=False
+        ... )
+        >>> pipe = AnimateDiffPAGPipeline.from_pretrained(
+        ...     model_id,
+        ...     motion_adapter=motion_adapter,
+        ...     scheduler=scheduler,
+        ...     pag_applied_layers=["mid"],
+        ...     torch_dtype=torch.float16,
+        ... ).to("cuda")
+
+        >>> video = pipe(
+        ...     prompt="car, futuristic cityscape with neon lights, street, no human",
+        ...     negative_prompt="low quality, bad quality",
+        ...     num_inference_steps=25,
+        ...     guidance_scale=6.0,
+        ...     pag_scale=3.0,
+        ...     generator=torch.Generator().manual_seed(42),
+        ... ).frames[0]
+
+        >>> export_to_gif(video, "animatediff_pag.gif")
+        ```
+"""
+
+
+class AnimateDiffPAGPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
+    PAGMixin,
+):
+    r"""
+    Pipeline for text-to-video generation using
+    [AnimateDiff](https://huggingface.co/docs/diffusers/en/api/pipelines/animatediff) and [Perturbed Attention
+    Guidance](https://huggingface.co/docs/diffusers/en/using-diffusers/pag).
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
+        motion_adapter ([`MotionAdapter`]):
+            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: Union[UNet2DConditionModel, UNetMotionModel],
+        motion_adapter: MotionAdapter,
+        scheduler: KarrasDiffusionSchedulers,
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        pag_applied_layers: Union[str, List[str]] = "mid_block.*attn1",  # ["mid"], ["down_blocks.1"]
+    ):
+        super().__init__()
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
+
+        self.set_pag_applied_layers(pag_applied_layers)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
+    def decode_latents(self, latents, decode_chunk_size: int = 16):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        video = []
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            batch_latents = latents[i : i + decode_chunk_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.pia.pipeline_pia.PIAPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.prepare_latents
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        # If FreeNoise is enabled, generate latents as described in Equation (7) of [FreeNoise](https://arxiv.org/abs/2310.15169)
+        if self.free_noise_enabled:
+            latents = self._prepare_latents_free_noise(
+                batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents
+            )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        num_frames: Optional[int] = 16,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_chunk_size: int = 16,
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_frames (`int`, *optional*, defaults to 16):
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+                amounts to 2 seconds of video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_videos_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            for i, image_embeds in enumerate(ip_adapter_image_embeds):
+                negative_image_embeds = None
+                if self.do_classifier_free_guidance:
+                    negative_image_embeds, image_embeds = image_embeds.chunk(2)
+                if self.do_perturbed_attention_guidance:
+                    image_embeds = self._prepare_perturbed_attention_guidance(
+                        image_embeds, negative_image_embeds, self.do_classifier_free_guidance
+                    )
+                elif self.do_classifier_free_guidance:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+                image_embeds = image_embeds.to(device)
+                ip_adapter_image_embeds[i] = image_embeds
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": ip_adapter_image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.unet.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+        for free_init_iter in range(num_free_init_iters):
+            if self.free_init_enabled:
+                latents, timesteps = self._apply_free_init(
+                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                )
+
+            self._num_timesteps = len(timesteps)
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+            # 8. Denoising loop
+            with self.progress_bar(total=self._num_timesteps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat(
+                        [latents] * (prompt_embeds.shape[0] // num_frames // latents.shape[0])
+                    )
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                    ).sample
+
+                    # perform guidance
+                    if self.do_perturbed_attention_guidance:
+                        noise_pred = self._apply_perturbed_attention_guidance(
+                            noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                        )
+                    elif self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+
+        # 9. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents, decode_chunk_size)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 10. Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..18fc06c1f9b869f35132701ca740de49b84cf3c8
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py
@@ -0,0 +1,1333 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from .pag_utils import PAGMixin
+
+
+if is_invisible_watermark_available():
+    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AutoPipelineForText2Image
+
+        >>> pipe = AutoPipelineForText2Image.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0",
+        ...     torch_dtype=torch.float16,
+        ...     enable_pag=True,
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt, pag_scale=0.3).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLPAGPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    PAGMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+        pag_applied_layers: Union[str, List[str]] = "mid",  # ["mid"],["down.block_1"],["up.block_0.attentions_0"]
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.default_sample_size = self.unet.config.sample_size
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+        self.set_pag_applied_layers(pag_applied_layers)
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            None,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._interrupt = False
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_text_embeds = self._prepare_perturbed_attention_guidance(
+                add_text_embeds, negative_pooled_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_time_ids = self._prepare_perturbed_attention_guidance(
+                add_time_ids, negative_add_time_ids, self.do_classifier_free_guidance
+            )
+
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            for i, image_embeds in enumerate(ip_adapter_image_embeds):
+                negative_image_embeds = None
+                if self.do_classifier_free_guidance:
+                    negative_image_embeds, image_embeds = image_embeds.chunk(2)
+
+                if self.do_perturbed_attention_guidance:
+                    image_embeds = self._prepare_perturbed_attention_guidance(
+                        image_embeds, negative_image_embeds, self.do_classifier_free_guidance
+                    )
+                elif self.do_classifier_free_guidance:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+                image_embeds = image_embeds.to(device)
+                ip_adapter_image_embeds[i] = image_embeds
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 8.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 9. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.unet.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance, perturbed-attention guidance, or both
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = ip_adapter_image_embeds
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    )
+
+                elif self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc85aaaca37fea9f6e8893e28ba396606880f08e
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py
@@ -0,0 +1,1532 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import PIL.Image
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from .pag_utils import PAGMixin
+
+
+if is_invisible_watermark_available():
+    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AutoPipelineForImage2Image
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = AutoPipelineForImage2Image.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-refiner-1.0",
+        ...     torch_dtype=torch.float16,
+        ...     enable_pag=True,
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
+
+        >>> init_image = load_image(url).convert("RGB")
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt, image=init_image, pag_scale=0.3).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLPAGImg2ImgPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    IPAdapterMixin,
+    PAGMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
+            Whether the `unet` requires an `aesthetic_score` condition to be passed during inference. Also see the
+            config of `stabilityai/stable-diffusion-xl-refiner-1-0`.
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        requires_aesthetics_score: bool = False,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+        pag_applied_layers: Union[str, List[str]] = "mid",  # ["mid"], ["down.block_1", "up.block_0.attentions_0"]
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+        self.set_pag_applied_layers(pag_applied_layers)
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        strength,
+        num_inference_steps,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+        if num_inference_steps is None:
+            raise ValueError("`num_inference_steps` cannot be None.")
+        elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
+            raise ValueError(
+                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
+                f" {type(num_inference_steps)}."
+            )
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+
+            timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+            return timesteps, num_inference_steps - t_start
+
+        else:
+            # Strength is irrelevant if we directly request a timestep to start at;
+            # that is, strength is determined by the denoising_start instead.
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            t_start = len(self.scheduler.timesteps) - num_inference_steps
+            timesteps = self.scheduler.timesteps[t_start:]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start)
+            return timesteps, num_inference_steps
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.prepare_latents
+    def prepare_latents(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
+    ):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        latents_mean = latents_std = None
+        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
+
+        # Offload text encoder if `enable_model_cpu_offload` was enabled
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.text_encoder_2.to("cpu")
+            torch.cuda.empty_cache()
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                image = image.float()
+                self.vae.to(dtype=torch.float32)
+
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                    raise ValueError(
+                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+
+            init_latents = init_latents.to(dtype)
+            if latents_mean is not None and latents_std is not None:
+                latents_mean = latents_mean.to(device=device, dtype=dtype)
+                latents_std = latents_std.to(device=device, dtype=dtype)
+                init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
+            else:
+                init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        if add_noise:
+            shape = init_latents.shape
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # get latents
+            init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        negative_original_size,
+        negative_crops_coords_top_left,
+        negative_target_size,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(
+                negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+            )
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if (
+            expected_add_embed_dim > passed_add_embed_dim
+            and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+            )
+        elif (
+            expected_add_embed_dim < passed_add_embed_dim
+            and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+            )
+        elif expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+        return add_time_ids, add_neg_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def denoising_start(self):
+        return self._denoising_start
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.3,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
+                The image(s) to modify with the pipeline.
+            strength (`float`, *optional*, defaults to 0.3):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. Note that in the case of
+                `denoising_start` being declared as an integer, the value of `strength` will be ignored.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refine Image
+                Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality).
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
+                denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
+                final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
+                forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refine Image
+                Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality).
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            strength,
+            num_inference_steps,
+            None,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+        self._interrupt = False
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 5. Prepare timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
+        )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        add_noise = True if self.denoising_start is None else False
+
+        # 6. Prepare latent variables
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                add_noise,
+            )
+        # 7. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 8. Prepare added time ids & embeddings
+        if negative_original_size is None:
+            negative_original_size = original_size
+        if negative_target_size is None:
+            negative_target_size = target_size
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            negative_original_size,
+            negative_crops_coords_top_left,
+            negative_target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+        add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_text_embeds = self._prepare_perturbed_attention_guidance(
+                add_text_embeds, negative_pooled_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_time_ids = self._prepare_perturbed_attention_guidance(
+                add_time_ids, add_neg_time_ids, self.do_classifier_free_guidance
+            )
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+            for i, image_embeds in enumerate(ip_adapter_image_embeds):
+                negative_image_embeds = None
+                if self.do_classifier_free_guidance:
+                    negative_image_embeds, image_embeds = image_embeds.chunk(2)
+
+                if self.do_perturbed_attention_guidance:
+                    image_embeds = self._prepare_perturbed_attention_guidance(
+                        image_embeds, negative_image_embeds, self.do_classifier_free_guidance
+                    )
+                elif self.do_classifier_free_guidance:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+                image_embeds = image_embeds.to(device)
+                ip_adapter_image_embeds[i] = image_embeds
+
+        # 9. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 9.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and self.denoising_start is not None
+            and denoising_value_valid(self.denoising_end)
+            and denoising_value_valid(self.denoising_start)
+            and self.denoising_start >= self.denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {self.denoising_end} when using type float."
+            )
+        elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 9.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.unet.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = ip_adapter_image_embeds
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    )
+                elif self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5ebf4300934c4115cd23a49b723f1fc55f23849
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
@@ -0,0 +1,1764 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import PIL.Image
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from .pag_utils import PAGMixin
+
+
+if is_invisible_watermark_available():
+    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AutoPipelineForInpainting
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = AutoPipelineForInpainting.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0",
+        ...     torch_dtype=torch.float16,
+        ...     variant="fp16",
+        ...     enable_pag=True,
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+        >>> init_image = load_image(img_url).convert("RGB")
+        >>> mask_image = load_image(mask_url).convert("RGB")
+
+        >>> prompt = "A majestic tiger sitting on a bench"
+        >>> image = pipe(
+        ...     prompt=prompt,
+        ...     image=init_image,
+        ...     mask_image=mask_image,
+        ...     num_inference_steps=50,
+        ...     strength=0.80,
+        ...     pag_scale=0.3,
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLPAGInpaintPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    PAGMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
+            Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config
+            of `stabilityai/stable-diffusion-xl-refiner-1-0`.
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+        "mask",
+        "masked_image_latents",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        requires_aesthetics_score: bool = False,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+        pag_applied_layers: Union[str, List[str]] = "mid",  # ["mid"], ["down.block_1", "up.block_0.attentions_0"]
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+        )
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+        self.set_pag_applied_layers(pag_applied_layers)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        mask_image,
+        height,
+        width,
+        strength,
+        callback_steps,
+        output_type,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        padding_mask_crop=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if padding_mask_crop is not None:
+            if not isinstance(image, PIL.Image.Image):
+                raise ValueError(
+                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                )
+            if not isinstance(mask_image, PIL.Image.Image):
+                raise ValueError(
+                    f"The mask image should be a PIL image when inpainting mask crop, but is of type"
+                    f" {type(mask_image)}."
+                )
+            if output_type != "pil":
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        add_noise=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if image.shape[1] == 4:
+            image_latents = image.to(device=device, dtype=dtype)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+        elif return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+            image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None and add_noise:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+            # if pure noise then scale the initial latents by the  Scheduler's init sigma
+            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+        elif add_noise:
+            noise = latents.to(device)
+            latents = noise * self.scheduler.init_noise_sigma
+        else:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = image_latents.to(device)
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        dtype = image.dtype
+        if self.vae.config.force_upcast:
+            image = image.float()
+            self.vae.to(dtype=torch.float32)
+
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        if self.vae.config.force_upcast:
+            self.vae.to(dtype)
+
+        image_latents = image_latents.to(dtype)
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline.prepare_mask_latents
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+
+        if masked_image is not None and masked_image.shape[1] == 4:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = None
+
+        if masked_image is not None:
+            if masked_image_latents is None:
+                masked_image = masked_image.to(device=device, dtype=dtype)
+                masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+
+            if masked_image_latents.shape[0] < batch_size:
+                if not batch_size % masked_image_latents.shape[0] == 0:
+                    raise ValueError(
+                        "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                        f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                        " Make sure the number of images that you pass is divisible by the total requested batch size."
+                    )
+                masked_image_latents = masked_image_latents.repeat(
+                    batch_size // masked_image_latents.shape[0], 1, 1, 1
+                )
+
+            masked_image_latents = (
+                torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+            )
+
+            # aligning device to prevent device errors when concating it with the latent model input
+            masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        return mask, masked_image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+
+            timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+            return timesteps, num_inference_steps - t_start
+
+        else:
+            # Strength is irrelevant if we directly request a timestep to start at;
+            # that is, strength is determined by the denoising_start instead.
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            t_start = len(self.scheduler.timesteps) - num_inference_steps
+            timesteps = self.scheduler.timesteps[t_start:]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start)
+            return timesteps, num_inference_steps
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        negative_original_size,
+        negative_crops_coords_top_left,
+        negative_target_size,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(
+                negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+            )
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if (
+            expected_add_embed_dim > passed_add_embed_dim
+            and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+            )
+        elif (
+            expected_add_embed_dim < passed_add_embed_dim
+            and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+            )
+        elif expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+        return add_time_ids, add_neg_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def denoising_start(self):
+        return self._denoising_start
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        masked_image_latents: torch.Tensor = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        padding_mask_crop: Optional[int] = None,
+        strength: float = 0.9999,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        pag_scale: float = 3.0,
+        pag_adaptive_scale: float = 0.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            padding_mask_crop (`int`, *optional*, defaults to `None`):
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
+            strength (`float`, *optional*, defaults to 0.9999):
+                Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
+                between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
+                `strength`. The number of denoising steps depends on the amount of noise initially added. When
+                `strength` is 1, added noise will be maximum and the denoising process will run for the full number of
+                iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked
+                portion of the reference `image`. Note that in the case of `denoising_start` being declared as an
+                integer, the value of `strength` will be ignored.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
+                denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
+                final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
+                forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            pag_scale (`float`, *optional*, defaults to 3.0):
+                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+                guidance will not be used.
+            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+                used.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple. `tuple. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            image,
+            mask_image,
+            height,
+            width,
+            strength,
+            None,
+            output_type,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+            padding_mask_crop,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+        self._interrupt = False
+        self._pag_scale = pag_scale
+        self._pag_adaptive_scale = pag_adaptive_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. set timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
+        )
+        # check that number of inference steps is not < 1 - as this doesn't make sense
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+
+        # 5. Preprocess mask and image
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        mask = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+
+        if masked_image_latents is not None:
+            masked_image = masked_image_latents
+        elif init_image.shape[1] == 4:
+            # if images are in latent space, we can't mask it
+            masked_image = None
+        else:
+            masked_image = init_image * (mask < 0.5)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
+        add_noise = True if self.denoising_start is None else False
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            add_noise=add_noise,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 7. Prepare mask latent variables
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            self.do_classifier_free_guidance,
+        )
+        if self.do_perturbed_attention_guidance:
+            if self.do_classifier_free_guidance:
+                mask, _ = mask.chunk(2)
+                masked_image_latents, _ = masked_image_latents.chunk(2)
+            mask = self._prepare_perturbed_attention_guidance(mask, mask, self.do_classifier_free_guidance)
+            masked_image_latents = self._prepare_perturbed_attention_guidance(
+                masked_image_latents, masked_image_latents, self.do_classifier_free_guidance
+            )
+
+        # 8. Check that sizes of mask, masked image and latents match
+        if num_channels_unet == 9:
+            # default case for runwayml/stable-diffusion-inpainting
+            num_channels_mask = mask.shape[1]
+            num_channels_masked_image = masked_image_latents.shape[1]
+            if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+                raise ValueError(
+                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    " `pipeline.unet` or your `mask_image` or `image` input."
+                )
+        elif num_channels_unet != 4:
+            raise ValueError(
+                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+            )
+        # 8.1 Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 10. Prepare added time ids & embeddings
+        if negative_original_size is None:
+            negative_original_size = original_size
+        if negative_target_size is None:
+            negative_target_size = target_size
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            negative_original_size,
+            negative_crops_coords_top_left,
+            negative_target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+        add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if self.do_perturbed_attention_guidance:
+            prompt_embeds = self._prepare_perturbed_attention_guidance(
+                prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_text_embeds = self._prepare_perturbed_attention_guidance(
+                add_text_embeds, negative_pooled_prompt_embeds, self.do_classifier_free_guidance
+            )
+            add_time_ids = self._prepare_perturbed_attention_guidance(
+                add_time_ids, add_neg_time_ids, self.do_classifier_free_guidance
+            )
+
+        elif self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+            for i, image_embeds in enumerate(ip_adapter_image_embeds):
+                negative_image_embeds = None
+                if self.do_classifier_free_guidance:
+                    negative_image_embeds, image_embeds = image_embeds.chunk(2)
+
+                if self.do_perturbed_attention_guidance:
+                    image_embeds = self._prepare_perturbed_attention_guidance(
+                        image_embeds, negative_image_embeds, self.do_classifier_free_guidance
+                    )
+                elif self.do_classifier_free_guidance:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+                image_embeds = image_embeds.to(device)
+                ip_adapter_image_embeds[i] = image_embeds
+
+        # 11. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        if (
+            self.denoising_end is not None
+            and self.denoising_start is not None
+            and denoising_value_valid(self.denoising_end)
+            and denoising_value_valid(self.denoising_start)
+            and self.denoising_start >= self.denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {self.denoising_end} when using type float."
+            )
+        elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 11.1 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        if self.do_perturbed_attention_guidance:
+            original_attn_proc = self.unet.attn_processors
+            self._set_pag_attn_processor(
+                pag_applied_layers=self.pag_applied_layers,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if num_channels_unet == 9:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = ip_adapter_image_embeds
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_perturbed_attention_guidance:
+                    noise_pred = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    )
+                elif self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if num_channels_unet == 4:
+                    init_latents_proper = image_latents
+                    if self.do_perturbed_attention_guidance:
+                        init_mask, *_ = mask.chunk(3) if self.do_classifier_free_guidance else mask.chunk(2)
+                    else:
+                        init_mask, *_ = mask.chunk(2) if self.do_classifier_free_guidance else mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+                    mask = callback_outputs.pop("mask", mask)
+                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            return StableDiffusionXLPipelineOutput(images=latents)
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        if padding_mask_crop is not None:
+            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if self.do_perturbed_attention_guidance:
+            self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/paint_by_example/__init__.py b/diffusers/src/diffusers/pipelines/paint_by_example/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aaa775f690c3d290074662c029f242df3c61e003
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/paint_by_example/__init__.py
@@ -0,0 +1,55 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional, Union
+
+import numpy as np
+import PIL
+from PIL import Image
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["image_encoder"] = ["PaintByExampleImageEncoder"]
+    _import_structure["pipeline_paint_by_example"] = ["PaintByExamplePipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .image_encoder import PaintByExampleImageEncoder
+        from .pipeline_paint_by_example import PaintByExamplePipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/paint_by_example/image_encoder.py b/diffusers/src/diffusers/pipelines/paint_by_example/image_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fd0338b1f9190e63053c086e4bbd74da9d98c54
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/paint_by_example/image_encoder.py
@@ -0,0 +1,67 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from torch import nn
+from transformers import CLIPPreTrainedModel, CLIPVisionModel
+
+from ...models.attention import BasicTransformerBlock
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class PaintByExampleImageEncoder(CLIPPreTrainedModel):
+    def __init__(self, config, proj_size=None):
+        super().__init__(config)
+        self.proj_size = proj_size or getattr(config, "projection_dim", 768)
+
+        self.model = CLIPVisionModel(config)
+        self.mapper = PaintByExampleMapper(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.proj_out = nn.Linear(config.hidden_size, self.proj_size)
+
+        # uncondition for scaling
+        self.uncond_vector = nn.Parameter(torch.randn((1, 1, self.proj_size)))
+
+    def forward(self, pixel_values, return_uncond_vector=False):
+        clip_output = self.model(pixel_values=pixel_values)
+        latent_states = clip_output.pooler_output
+        latent_states = self.mapper(latent_states[:, None])
+        latent_states = self.final_layer_norm(latent_states)
+        latent_states = self.proj_out(latent_states)
+        if return_uncond_vector:
+            return latent_states, self.uncond_vector
+
+        return latent_states
+
+
+class PaintByExampleMapper(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        num_layers = (config.num_hidden_layers + 1) // 5
+        hid_size = config.hidden_size
+        num_heads = 1
+        self.blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(hid_size, num_heads, hid_size, activation_fn="gelu", attention_bias=True)
+                for _ in range(num_layers)
+            ]
+        )
+
+    def forward(self, hidden_states):
+        for block in self.blocks:
+            hidden_states = block(hidden_states)
+
+        return hidden_states
diff --git a/diffusers/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/diffusers/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..b225fd71edf81a88ac82818816db6ffbe9cf2480
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
@@ -0,0 +1,626 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor
+
+from ...image_processor import VaeImageProcessor
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .image_encoder import PaintByExampleImageEncoder
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+def prepare_mask_and_masked_image(image, mask):
+    """
+    Prepares a pair (image, mask) to be consumed by the Paint by Example pipeline. This means that those inputs will be
+    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
+    ``image`` and ``1`` for the ``mask``.
+
+    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
+
+    Args:
+        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
+        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
+
+
+    Raises:
+        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+            (ot the other way around).
+
+    Returns:
+        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
+            dimensions: ``batch x channels x height x width``.
+    """
+    if isinstance(image, torch.Tensor):
+        if not isinstance(mask, torch.Tensor):
+            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
+
+        # Batch single image
+        if image.ndim == 3:
+            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
+            image = image.unsqueeze(0)
+
+        # Batch and add channel dim for single mask
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0).unsqueeze(0)
+
+        # Batch single mask or add channel dim
+        if mask.ndim == 3:
+            # Batched mask
+            if mask.shape[0] == image.shape[0]:
+                mask = mask.unsqueeze(1)
+            else:
+                mask = mask.unsqueeze(0)
+
+        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
+        assert mask.shape[1] == 1, "Mask image must have a single channel"
+
+        # Check image is in [-1, 1]
+        if image.min() < -1 or image.max() > 1:
+            raise ValueError("Image should be in [-1, 1] range")
+
+        # Check mask is in [0, 1]
+        if mask.min() < 0 or mask.max() > 1:
+            raise ValueError("Mask should be in [0, 1] range")
+
+        # paint-by-example inverses the mask
+        mask = 1 - mask
+
+        # Binarize mask
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+
+        # Image as float32
+        image = image.to(dtype=torch.float32)
+    elif isinstance(mask, torch.Tensor):
+        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
+    else:
+        if isinstance(image, PIL.Image.Image):
+            image = [image]
+
+        image = np.concatenate([np.array(i.convert("RGB"))[None, :] for i in image], axis=0)
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+        # preprocess mask
+        if isinstance(mask, PIL.Image.Image):
+            mask = [mask]
+
+        mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+        mask = mask.astype(np.float32) / 255.0
+
+        # paint-by-example inverses the mask
+        mask = 1 - mask
+
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+        mask = torch.from_numpy(mask)
+
+    masked_image = image * mask
+
+    return mask, masked_image
+
+
+class PaintByExamplePipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    <Tip warning={true}>
+
+    🧪 This is an experimental feature!
+
+    </Tip>
+
+    Pipeline for image-guided image inpainting using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        image_encoder ([`PaintByExampleImageEncoder`]):
+            Encodes the example input image. The `unet` is conditioned on the example image instead of a text prompt.
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+
+    """
+
+    # TODO: feature_extractor is required to encode initial images (if they are in PIL format),
+    # we should give a descriptive message if the pipeline doesn't have one.
+
+    model_cpu_offload_seq = "unet->vae"
+    _exclude_from_cpu_offload = ["image_encoder"]
+    _optional_components = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        image_encoder: PaintByExampleImageEncoder,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = False,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            image_encoder=image_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs
+    def check_inputs(self, image, height, width, callback_steps):
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        if masked_image.shape[1] == 4:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+        return mask, masked_image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeddings, negative_prompt_embeds = self.image_encoder(image, return_uncond_vector=True)
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = image_embeddings.shape
+        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
+        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, image_embeddings.shape[0], 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, 1, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
+
+        return image_embeddings
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        example_image: Union[torch.Tensor, PIL.Image.Image],
+        image: Union[torch.Tensor, PIL.Image.Image],
+        mask_image: Union[torch.Tensor, PIL.Image.Image],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            example_image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
+                An example image to guide image generation.
+            image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
+                `Image` or tensor representing an image batch to be inpainted (parts of the image are masked out with
+                `mask_image` and repainted according to `prompt`).
+            mask_image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
+                `Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted,
+                while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel
+                (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the
+                expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Example:
+
+        ```py
+        >>> import PIL
+        >>> import requests
+        >>> import torch
+        >>> from io import BytesIO
+        >>> from diffusers import PaintByExamplePipeline
+
+
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+        >>> img_url = (
+        ...     "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/image/example_1.png"
+        ... )
+        >>> mask_url = (
+        ...     "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/mask/example_1.png"
+        ... )
+        >>> example_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/reference/example_1.jpg"
+
+        >>> init_image = download_image(img_url).resize((512, 512))
+        >>> mask_image = download_image(mask_url).resize((512, 512))
+        >>> example_image = download_image(example_url).resize((512, 512))
+
+        >>> pipe = PaintByExamplePipeline.from_pretrained(
+        ...     "Fantasy-Studio/Paint-by-Example",
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
+        >>> image
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 1. Define call parameters
+        if isinstance(image, PIL.Image.Image):
+            batch_size = 1
+        elif isinstance(image, list):
+            batch_size = len(image)
+        else:
+            batch_size = image.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 2. Preprocess mask and image
+        mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+        height, width = masked_image.shape[-2:]
+
+        # 3. Check inputs
+        self.check_inputs(example_image, height, width, callback_steps)
+
+        # 4. Encode input image
+        image_embeddings = self._encode_image(
+            example_image, device, num_images_per_prompt, do_classifier_free_guidance
+        )
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            image_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Prepare mask latent variables
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            image_embeddings.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance,
+        )
+
+        # 8. Check that sizes of mask, masked image and latents match
+        num_channels_mask = mask.shape[1]
+        num_channels_masked_image = masked_image_latents.shape[1]
+        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                " `pipeline.unet` or your `mask_image` or `image` input."
+            )
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 10. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                latent_model_input = torch.cat([latent_model_input, masked_image_latents, mask], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        self.maybe_free_model_hooks()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/pia/__init__.py b/diffusers/src/diffusers/pipelines/pia/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..16e8004966e58387f7dba2f6ff3175575fe0abee
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pia/__init__.py
@@ -0,0 +1,46 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_pia"] = ["PIAPipeline", "PIAPipelineOutput"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+
+    else:
+        from .pipeline_pia import PIAPipeline, PIAPipelineOutput
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/pia/pipeline_pia.py b/diffusers/src/diffusers/pipelines/pia/pipeline_pia.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7dfcd39edce2e61b0a0eed214580719a18cac6d
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pia/pipeline_pia.py
@@ -0,0 +1,944 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...models.unets.unet_motion_model import MotionAdapter
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ...utils import (
+    USE_PEFT_BACKEND,
+    BaseOutput,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..free_init_utils import FreeInitMixin
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import EulerDiscreteScheduler, MotionAdapter, PIAPipeline
+        >>> from diffusers.utils import export_to_gif, load_image
+
+        >>> adapter = MotionAdapter.from_pretrained("openmmlab/PIA-condition-adapter")
+        >>> pipe = PIAPipeline.from_pretrained(
+        ...     "SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
+        ... )
+
+        >>> pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
+        ... )
+        >>> image = image.resize((512, 512))
+        >>> prompt = "cat in a hat"
+        >>> negative_prompt = "wrong white balance, dark, sketches, worst quality, low quality, deformed, distorted"
+        >>> generator = torch.Generator("cpu").manual_seed(0)
+        >>> output = pipe(image=image, prompt=prompt, negative_prompt=negative_prompt, generator=generator)
+        >>> frames = output.frames[0]
+        >>> export_to_gif(frames, "pia-animation.gif")
+        ```
+"""
+
+RANGE_LIST = [
+    [1.0, 0.9, 0.85, 0.85, 0.85, 0.8],  # 0 Small Motion
+    [1.0, 0.8, 0.8, 0.8, 0.79, 0.78, 0.75],  # Moderate Motion
+    [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6, 0.5, 0.5],  # Large Motion
+    [1.0, 0.9, 0.85, 0.85, 0.85, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.85, 0.85, 0.9, 1.0],  # Loop
+    [1.0, 0.8, 0.8, 0.8, 0.79, 0.78, 0.75, 0.75, 0.75, 0.75, 0.75, 0.78, 0.79, 0.8, 0.8, 1.0],  # Loop
+    [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.6, 0.5, 0.5, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 1.0],  # Loop
+    [0.5, 0.4, 0.4, 0.4, 0.35, 0.3],  # Style Transfer Candidate Small Motion
+    [0.5, 0.4, 0.4, 0.4, 0.35, 0.35, 0.3, 0.25, 0.2],  # Style Transfer Moderate Motion
+    [0.5, 0.2],  # Style Transfer Large Motion
+]
+
+
+def prepare_mask_coef_by_statistics(num_frames: int, cond_frame: int, motion_scale: int):
+    assert num_frames > 0, "video_length should be greater than 0"
+
+    assert num_frames > cond_frame, "video_length should be greater than cond_frame"
+
+    range_list = RANGE_LIST
+
+    assert motion_scale < len(range_list), f"motion_scale type{motion_scale} not implemented"
+
+    coef = range_list[motion_scale]
+    coef = coef + ([coef[-1]] * (num_frames - len(coef)))
+
+    order = [abs(i - cond_frame) for i in range(num_frames)]
+    coef = [coef[order[i]] for i in range(num_frames)]
+
+    return coef
+
+
+@dataclass
+class PIAPipelineOutput(BaseOutput):
+    r"""
+    Output class for PIAPipeline.
+
+    Args:
+        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+            Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, NumPy array of
+            shape `(batch_size, num_frames, channels, height, width, Torch tensor of shape `(batch_size, num_frames,
+            channels, height, width)`.
+    """
+
+    frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
+
+
+class PIAPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+    FreeInitMixin,
+):
+    r"""
+    Pipeline for text-to-video generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
+        motion_adapter ([`MotionAdapter`]):
+            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: Union[UNet2DConditionModel, UNetMotionModel],
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        motion_adapter: Optional[MotionAdapter] = None,
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+    ):
+        super().__init__()
+        if isinstance(unet, UNet2DConditionModel):
+            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            motion_adapter=motion_adapter,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        image = self.vae.decode(latents).sample
+        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_masked_condition(
+        self,
+        image,
+        batch_size,
+        num_channels_latents,
+        num_frames,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        motion_scale=0,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        _, _, _, scaled_height, scaled_width = shape
+
+        image = self.video_processor.preprocess(image)
+        image = image.to(device, dtype)
+
+        if isinstance(generator, list):
+            image_latent = [
+                self.vae.encode(image[k : k + 1]).latent_dist.sample(generator[k]) for k in range(batch_size)
+            ]
+            image_latent = torch.cat(image_latent, dim=0)
+        else:
+            image_latent = self.vae.encode(image).latent_dist.sample(generator)
+
+        image_latent = image_latent.to(device=device, dtype=dtype)
+        image_latent = torch.nn.functional.interpolate(image_latent, size=[scaled_height, scaled_width])
+        image_latent_padding = image_latent.clone() * self.vae.config.scaling_factor
+
+        mask = torch.zeros((batch_size, 1, num_frames, scaled_height, scaled_width)).to(device=device, dtype=dtype)
+        mask_coef = prepare_mask_coef_by_statistics(num_frames, 0, motion_scale)
+        masked_image = torch.zeros(batch_size, 4, num_frames, scaled_height, scaled_width).to(
+            device=device, dtype=self.unet.dtype
+        )
+        for f in range(num_frames):
+            mask[:, :, f, :, :] = mask_coef[f]
+            masked_image[:, :, f, :, :] = image_latent_padding.clone()
+
+        mask = torch.cat([mask] * 2) if self.do_classifier_free_guidance else mask
+        masked_image = torch.cat([masked_image] * 2) if self.do_classifier_free_guidance else masked_image
+
+        return mask, masked_image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        prompt: Union[str, List[str]] = None,
+        strength: float = 1.0,
+        num_frames: Optional[int] = 16,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        motion_scale: int = 0,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`PipelineImageInput`):
+                The input image to be used for video generation.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_frames (`int`, *optional*, defaults to 16):
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+                amounts to 2 seconds of video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            motion_scale: (`int`, *optional*, defaults to 0):
+                Parameter that controls the amount and type of motion that is added to the image. Increasing the value
+                increases the amount of motion, while specific ranges of values control the type of motion that is
+                added. Must be between 0 and 8. Set between 0-2 to only increase the amount of motion. Set between 3-5
+                to create looping motion. Set between 6-8 to perform motion with image style transfer.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is returned, otherwise a
+                `tuple` is returned where the first element is a list with the generated frames.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_videos_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_videos_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latent variables
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            4,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents=latents,
+        )
+        mask, masked_image = self.prepare_masked_condition(
+            image,
+            batch_size * num_videos_per_prompt,
+            4,
+            num_frames=num_frames,
+            height=height,
+            width=width,
+            dtype=self.unet.dtype,
+            device=device,
+            generator=generator,
+            motion_scale=motion_scale,
+        )
+        if strength < 1.0:
+            noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype)
+            latents = self.scheduler.add_noise(masked_image[0], noise, latent_timestep)
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 8. Denoising loop
+        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+        for free_init_iter in range(num_free_init_iters):
+            if self.free_init_enabled:
+                latents, timesteps = self._apply_free_init(
+                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                )
+
+            self._num_timesteps = len(timesteps)
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+            with self.progress_bar(total=self._num_timesteps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image], dim=1)
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                    ).sample
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+
+        # 9. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 10. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return PIAPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/pipeline_flax_utils.py b/diffusers/src/diffusers/pipelines/pipeline_flax_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c212873a88fbef21c8a7d528fe92efea09e7ed
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pipeline_flax_utils.py
@@ -0,0 +1,611 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+import os
+from typing import Any, Dict, List, Optional, Union
+
+import flax
+import numpy as np
+import PIL.Image
+from flax.core.frozen_dict import FrozenDict
+from huggingface_hub import create_repo, snapshot_download
+from huggingface_hub.utils import validate_hf_hub_args
+from PIL import Image
+from tqdm.auto import tqdm
+
+from ..configuration_utils import ConfigMixin
+from ..models.modeling_flax_utils import FLAX_WEIGHTS_NAME, FlaxModelMixin
+from ..schedulers.scheduling_utils_flax import SCHEDULER_CONFIG_NAME, FlaxSchedulerMixin
+from ..utils import (
+    CONFIG_NAME,
+    BaseOutput,
+    PushToHubMixin,
+    http_user_agent,
+    is_transformers_available,
+    logging,
+)
+
+
+if is_transformers_available():
+    from transformers import FlaxPreTrainedModel
+
+INDEX_FILE = "diffusion_flax_model.bin"
+
+
+logger = logging.get_logger(__name__)
+
+
+LOADABLE_CLASSES = {
+    "diffusers": {
+        "FlaxModelMixin": ["save_pretrained", "from_pretrained"],
+        "FlaxSchedulerMixin": ["save_pretrained", "from_pretrained"],
+        "FlaxDiffusionPipeline": ["save_pretrained", "from_pretrained"],
+    },
+    "transformers": {
+        "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
+        "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
+        "FlaxPreTrainedModel": ["save_pretrained", "from_pretrained"],
+        "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
+        "ProcessorMixin": ["save_pretrained", "from_pretrained"],
+        "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
+    },
+}
+
+ALL_IMPORTABLE_CLASSES = {}
+for library in LOADABLE_CLASSES:
+    ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
+
+
+def import_flax_or_no_model(module, class_name):
+    try:
+        # 1. First make sure that if a Flax object is present, import this one
+        class_obj = getattr(module, "Flax" + class_name)
+    except AttributeError:
+        # 2. If this doesn't work, it's not a model and we don't append "Flax"
+        class_obj = getattr(module, class_name)
+    except AttributeError:
+        raise ValueError(f"Neither Flax{class_name} nor {class_name} exist in {module}")
+
+    return class_obj
+
+
+@flax.struct.dataclass
+class FlaxImagePipelineOutput(BaseOutput):
+    """
+    Output class for image pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
+    r"""
+    Base class for Flax-based pipelines.
+
+    [`FlaxDiffusionPipeline`] stores all components (models, schedulers, and processors) for diffusion pipelines and
+    provides methods for loading, downloading and saving models. It also includes methods to:
+
+        - enable/disable the progress bar for the denoising iteration
+
+    Class attributes:
+
+        - **config_name** ([`str`]) -- The configuration filename that stores the class and module names of all the
+          diffusion pipeline's components.
+    """
+
+    config_name = "model_index.json"
+
+    def register_modules(self, **kwargs):
+        # import it here to avoid circular import
+        from diffusers import pipelines
+
+        for name, module in kwargs.items():
+            if module is None:
+                register_dict = {name: (None, None)}
+            else:
+                # retrieve library
+                library = module.__module__.split(".")[0]
+
+                # check if the module is a pipeline module
+                pipeline_dir = module.__module__.split(".")[-2]
+                path = module.__module__.split(".")
+                is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir)
+
+                # if library is not in LOADABLE_CLASSES, then it is a custom module.
+                # Or if it's a pipeline module, then the module is inside the pipeline
+                # folder so we set the library to module name.
+                if library not in LOADABLE_CLASSES or is_pipeline_module:
+                    library = pipeline_dir
+
+                # retrieve class_name
+                class_name = module.__class__.__name__
+
+                register_dict = {name: (library, class_name)}
+
+            # save model index config
+            self.register_to_config(**register_dict)
+
+            # set models
+            setattr(self, name, module)
+
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        params: Union[Dict, FrozenDict],
+        push_to_hub: bool = False,
+        **kwargs,
+    ):
+        # TODO: handle inference_state
+        """
+        Save all saveable variables of the pipeline to a directory. A pipeline variable can be saved and loaded if its
+        class implements both a save and loading method. The pipeline is easily reloaded using the
+        [`~FlaxDiffusionPipeline.from_pretrained`] class method.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to which to save. Will be created if it doesn't exist.
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        self.save_config(save_directory)
+
+        model_index_dict = dict(self.config)
+        model_index_dict.pop("_class_name")
+        model_index_dict.pop("_diffusers_version")
+        model_index_dict.pop("_module", None)
+
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            private = kwargs.pop("private", False)
+            create_pr = kwargs.pop("create_pr", False)
+            token = kwargs.pop("token", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
+
+        for pipeline_component_name in model_index_dict.keys():
+            sub_model = getattr(self, pipeline_component_name)
+            if sub_model is None:
+                # edge case for saving a pipeline with safety_checker=None
+                continue
+
+            model_cls = sub_model.__class__
+
+            save_method_name = None
+            # search for the model's base class in LOADABLE_CLASSES
+            for library_name, library_classes in LOADABLE_CLASSES.items():
+                library = importlib.import_module(library_name)
+                for base_class, save_load_methods in library_classes.items():
+                    class_candidate = getattr(library, base_class, None)
+                    if class_candidate is not None and issubclass(model_cls, class_candidate):
+                        # if we found a suitable base class in LOADABLE_CLASSES then grab its save method
+                        save_method_name = save_load_methods[0]
+                        break
+                if save_method_name is not None:
+                    break
+
+            save_method = getattr(sub_model, save_method_name)
+            expects_params = "params" in set(inspect.signature(save_method).parameters.keys())
+
+            if expects_params:
+                save_method(
+                    os.path.join(save_directory, pipeline_component_name), params=params[pipeline_component_name]
+                )
+            else:
+                save_method(os.path.join(save_directory, pipeline_component_name))
+
+            if push_to_hub:
+                self._upload_folder(
+                    save_directory,
+                    repo_id,
+                    token=token,
+                    commit_message=commit_message,
+                    create_pr=create_pr,
+                )
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+        r"""
+        Instantiate a Flax-based diffusion pipeline from pretrained pipeline weights.
+
+        The pipeline is set in evaluation mode (`model.eval()) by default and dropout modules are deactivated.
+
+        If you get the error message below, you need to finetune the weights for your downstream task:
+
+        ```
+        Some weights of FlaxUNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
+        ```
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *repo id* (for example `runwayml/stable-diffusion-v1-5`) of a pretrained pipeline
+                      hosted on the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      using [`~FlaxDiffusionPipeline.save_pretrained`].
+            dtype (`str` or `jnp.dtype`, *optional*):
+                Override the default `jnp.dtype` and load the model under this dtype. If `"auto"`, the dtype is
+                automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            mirror (`str`, *optional*):
+                Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (the pipeline components) of the specific pipeline
+                class. The overwritten components are passed directly to the pipelines `__init__` method.
+
+        <Tip>
+
+        To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
+        `huggingface-cli login`.
+
+        </Tip>
+
+        Examples:
+
+        ```py
+        >>> from diffusers import FlaxDiffusionPipeline
+
+        >>> # Download pipeline from huggingface.co and cache.
+        >>> # Requires to be logged in to Hugging Face hub,
+        >>> # see more in [the documentation](https://huggingface.co/docs/hub/security-tokens)
+        >>> pipeline, params = FlaxDiffusionPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5",
+        ...     variant="bf16",
+        ...     dtype=jnp.bfloat16,
+        ... )
+
+        >>> # Download pipeline, but use a different scheduler
+        >>> from diffusers import FlaxDPMSolverMultistepScheduler
+
+        >>> model_id = "runwayml/stable-diffusion-v1-5"
+        >>> dpmpp, dpmpp_state = FlaxDPMSolverMultistepScheduler.from_pretrained(
+        ...     model_id,
+        ...     subfolder="scheduler",
+        ... )
+
+        >>> dpm_pipe, dpm_params = FlaxStableDiffusionPipeline.from_pretrained(
+        ...     model_id, variant="bf16", dtype=jnp.bfloat16, scheduler=dpmpp
+        ... )
+        >>> dpm_params["scheduler"] = dpmpp_state
+        ```
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        from_pt = kwargs.pop("from_pt", False)
+        use_memory_efficient_attention = kwargs.pop("use_memory_efficient_attention", False)
+        split_head_dim = kwargs.pop("split_head_dim", False)
+        dtype = kwargs.pop("dtype", None)
+
+        # 1. Download the checkpoints and configs
+        # use snapshot download here to get it working from from_pretrained
+        if not os.path.isdir(pretrained_model_name_or_path):
+            config_dict = cls.load_config(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+            )
+            # make sure we only download sub-folders and `diffusers` filenames
+            folder_names = [k for k in config_dict.keys() if not k.startswith("_")]
+            allow_patterns = [os.path.join(k, "*") for k in folder_names]
+            allow_patterns += [FLAX_WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, CONFIG_NAME, cls.config_name]
+
+            ignore_patterns = ["*.bin", "*.safetensors"] if not from_pt else []
+            ignore_patterns += ["*.onnx", "*.onnx_data", "*.xml", "*.pb"]
+
+            if cls != FlaxDiffusionPipeline:
+                requested_pipeline_class = cls.__name__
+            else:
+                requested_pipeline_class = config_dict.get("_class_name", cls.__name__)
+                requested_pipeline_class = (
+                    requested_pipeline_class
+                    if requested_pipeline_class.startswith("Flax")
+                    else "Flax" + requested_pipeline_class
+                )
+
+            user_agent = {"pipeline_class": requested_pipeline_class}
+            user_agent = http_user_agent(user_agent)
+
+            # download all allow_patterns
+            cached_folder = snapshot_download(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                allow_patterns=allow_patterns,
+                ignore_patterns=ignore_patterns,
+                user_agent=user_agent,
+            )
+        else:
+            cached_folder = pretrained_model_name_or_path
+
+        config_dict = cls.load_config(cached_folder)
+
+        # 2. Load the pipeline class, if using custom module then load it from the hub
+        # if we load from explicit class, let's use it
+        if cls != FlaxDiffusionPipeline:
+            pipeline_class = cls
+        else:
+            diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
+            class_name = (
+                config_dict["_class_name"]
+                if config_dict["_class_name"].startswith("Flax")
+                else "Flax" + config_dict["_class_name"]
+            )
+            pipeline_class = getattr(diffusers_module, class_name)
+
+        # some modules can be passed directly to the init
+        # in this case they are already instantiated in `kwargs`
+        # extract them here
+        expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
+        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
+
+        init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
+
+        # define init kwargs
+        init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict}
+        init_kwargs = {**init_kwargs, **passed_pipe_kwargs}
+
+        # remove `null` components
+        def load_module(name, value):
+            if value[0] is None:
+                return False
+            if name in passed_class_obj and passed_class_obj[name] is None:
+                return False
+            return True
+
+        init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
+
+        # Throw nice warnings / errors for fast accelerate loading
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored."
+            )
+
+        # inference_params
+        params = {}
+
+        # import it here to avoid circular import
+        from diffusers import pipelines
+
+        # 3. Load each module in the pipeline
+        for name, (library_name, class_name) in init_dict.items():
+            if class_name is None:
+                # edge case for when the pipeline was saved with safety_checker=None
+                init_kwargs[name] = None
+                continue
+
+            is_pipeline_module = hasattr(pipelines, library_name)
+            loaded_sub_model = None
+            sub_model_should_be_defined = True
+
+            # if the model is in a pipeline module, then we load it from the pipeline
+            if name in passed_class_obj:
+                # 1. check that passed_class_obj has correct parent class
+                if not is_pipeline_module:
+                    library = importlib.import_module(library_name)
+                    class_obj = getattr(library, class_name)
+                    importable_classes = LOADABLE_CLASSES[library_name]
+                    class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
+
+                    expected_class_obj = None
+                    for class_name, class_candidate in class_candidates.items():
+                        if class_candidate is not None and issubclass(class_obj, class_candidate):
+                            expected_class_obj = class_candidate
+
+                    if not issubclass(passed_class_obj[name].__class__, expected_class_obj):
+                        raise ValueError(
+                            f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be"
+                            f" {expected_class_obj}"
+                        )
+                elif passed_class_obj[name] is None:
+                    logger.warning(
+                        f"You have passed `None` for {name} to disable its functionality in {pipeline_class}. Note"
+                        f" that this might lead to problems when using {pipeline_class} and is not recommended."
+                    )
+                    sub_model_should_be_defined = False
+                else:
+                    logger.warning(
+                        f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
+                        " has the correct type"
+                    )
+
+                # set passed class object
+                loaded_sub_model = passed_class_obj[name]
+            elif is_pipeline_module:
+                pipeline_module = getattr(pipelines, library_name)
+                class_obj = import_flax_or_no_model(pipeline_module, class_name)
+
+                importable_classes = ALL_IMPORTABLE_CLASSES
+                class_candidates = {c: class_obj for c in importable_classes.keys()}
+            else:
+                # else we just import it from the library.
+                library = importlib.import_module(library_name)
+                class_obj = import_flax_or_no_model(library, class_name)
+
+                importable_classes = LOADABLE_CLASSES[library_name]
+                class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
+
+            if loaded_sub_model is None and sub_model_should_be_defined:
+                load_method_name = None
+                for class_name, class_candidate in class_candidates.items():
+                    if class_candidate is not None and issubclass(class_obj, class_candidate):
+                        load_method_name = importable_classes[class_name][1]
+
+                load_method = getattr(class_obj, load_method_name)
+
+                # check if the module is in a subdirectory
+                if os.path.isdir(os.path.join(cached_folder, name)):
+                    loadable_folder = os.path.join(cached_folder, name)
+                else:
+                    loaded_sub_model = cached_folder
+
+                if issubclass(class_obj, FlaxModelMixin):
+                    loaded_sub_model, loaded_params = load_method(
+                        loadable_folder,
+                        from_pt=from_pt,
+                        use_memory_efficient_attention=use_memory_efficient_attention,
+                        split_head_dim=split_head_dim,
+                        dtype=dtype,
+                    )
+                    params[name] = loaded_params
+                elif is_transformers_available() and issubclass(class_obj, FlaxPreTrainedModel):
+                    if from_pt:
+                        # TODO(Suraj): Fix this in Transformers. We should be able to use `_do_init=False` here
+                        loaded_sub_model = load_method(loadable_folder, from_pt=from_pt)
+                        loaded_params = loaded_sub_model.params
+                        del loaded_sub_model._params
+                    else:
+                        loaded_sub_model, loaded_params = load_method(loadable_folder, _do_init=False)
+                    params[name] = loaded_params
+                elif issubclass(class_obj, FlaxSchedulerMixin):
+                    loaded_sub_model, scheduler_state = load_method(loadable_folder)
+                    params[name] = scheduler_state
+                else:
+                    loaded_sub_model = load_method(loadable_folder)
+
+            init_kwargs[name] = loaded_sub_model  # UNet(...), # DiffusionSchedule(...)
+
+        # 4. Potentially add passed objects if expected
+        missing_modules = set(expected_modules) - set(init_kwargs.keys())
+        passed_modules = list(passed_class_obj.keys())
+
+        if len(missing_modules) > 0 and missing_modules <= set(passed_modules):
+            for module in missing_modules:
+                init_kwargs[module] = passed_class_obj.get(module, None)
+        elif len(missing_modules) > 0:
+            passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs
+            raise ValueError(
+                f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed."
+            )
+
+        model = pipeline_class(**init_kwargs, dtype=dtype)
+        return model, params
+
+    @classmethod
+    def _get_signature_keys(cls, obj):
+        parameters = inspect.signature(obj.__init__).parameters
+        required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        expected_modules = set(required_parameters.keys()) - {"self"}
+
+        return expected_modules, optional_parameters
+
+    @property
+    def components(self) -> Dict[str, Any]:
+        r"""
+
+        The `self.components` property can be useful to run different pipelines with the same weights and
+        configurations to not have to re-allocate memory.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import (
+        ...     FlaxStableDiffusionPipeline,
+        ...     FlaxStableDiffusionImg2ImgPipeline,
+        ... )
+
+        >>> text2img = FlaxStableDiffusionPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", variant="bf16", dtype=jnp.bfloat16
+        ... )
+        >>> img2img = FlaxStableDiffusionImg2ImgPipeline(**text2img.components)
+        ```
+
+        Returns:
+            A dictionary containing all the modules needed to initialize the pipeline.
+        """
+        expected_modules, optional_parameters = self._get_signature_keys(self)
+        components = {
+            k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters
+        }
+
+        if set(components.keys()) != expected_modules:
+            raise ValueError(
+                f"{self} has been incorrectly initialized or {self.__class__} is incorrectly implemented. Expected"
+                f" {expected_modules} to be defined, but {components} are defined."
+            )
+
+        return components
+
+    @staticmethod
+    def numpy_to_pil(images):
+        """
+        Convert a NumPy image or a batch of images to a PIL image.
+        """
+        if images.ndim == 3:
+            images = images[None, ...]
+        images = (images * 255).round().astype("uint8")
+        if images.shape[-1] == 1:
+            # special case for grayscale (single channel) images
+            pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
+        else:
+            pil_images = [Image.fromarray(image) for image in images]
+
+        return pil_images
+
+    # TODO: make it compatible with jax.lax
+    def progress_bar(self, iterable):
+        if not hasattr(self, "_progress_bar_config"):
+            self._progress_bar_config = {}
+        elif not isinstance(self._progress_bar_config, dict):
+            raise ValueError(
+                f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
+            )
+
+        return tqdm(iterable, **self._progress_bar_config)
+
+    def set_progress_bar_config(self, **kwargs):
+        self._progress_bar_config = kwargs
diff --git a/diffusers/src/diffusers/pipelines/pipeline_loading_utils.py b/diffusers/src/diffusers/pipelines/pipeline_loading_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..318599f560636610c9bd812bbe311674d83ff055
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pipeline_loading_utils.py
@@ -0,0 +1,838 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import importlib
+import os
+import re
+import warnings
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+import torch
+from huggingface_hub import ModelCard, model_info
+from huggingface_hub.utils import validate_hf_hub_args
+from packaging import version
+
+from .. import __version__
+from ..utils import (
+    FLAX_WEIGHTS_NAME,
+    ONNX_EXTERNAL_WEIGHTS_NAME,
+    ONNX_WEIGHTS_NAME,
+    SAFETENSORS_WEIGHTS_NAME,
+    WEIGHTS_NAME,
+    deprecate,
+    get_class_from_dynamic_module,
+    is_accelerate_available,
+    is_peft_available,
+    is_transformers_available,
+    logging,
+)
+from ..utils.torch_utils import is_compiled_module
+
+
+if is_transformers_available():
+    import transformers
+    from transformers import PreTrainedModel
+    from transformers.utils import FLAX_WEIGHTS_NAME as TRANSFORMERS_FLAX_WEIGHTS_NAME
+    from transformers.utils import SAFE_WEIGHTS_NAME as TRANSFORMERS_SAFE_WEIGHTS_NAME
+    from transformers.utils import WEIGHTS_NAME as TRANSFORMERS_WEIGHTS_NAME
+
+if is_accelerate_available():
+    import accelerate
+    from accelerate import dispatch_model
+    from accelerate.hooks import remove_hook_from_module
+    from accelerate.utils import compute_module_sizes, get_max_memory
+
+
+INDEX_FILE = "diffusion_pytorch_model.bin"
+CUSTOM_PIPELINE_FILE_NAME = "pipeline.py"
+DUMMY_MODULES_FOLDER = "diffusers.utils"
+TRANSFORMERS_DUMMY_MODULES_FOLDER = "transformers.utils"
+CONNECTED_PIPES_KEYS = ["prior"]
+
+logger = logging.get_logger(__name__)
+
+LOADABLE_CLASSES = {
+    "diffusers": {
+        "ModelMixin": ["save_pretrained", "from_pretrained"],
+        "SchedulerMixin": ["save_pretrained", "from_pretrained"],
+        "DiffusionPipeline": ["save_pretrained", "from_pretrained"],
+        "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"],
+    },
+    "transformers": {
+        "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
+        "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
+        "PreTrainedModel": ["save_pretrained", "from_pretrained"],
+        "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
+        "ProcessorMixin": ["save_pretrained", "from_pretrained"],
+        "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
+    },
+    "onnxruntime.training": {
+        "ORTModule": ["save_pretrained", "from_pretrained"],
+    },
+}
+
+ALL_IMPORTABLE_CLASSES = {}
+for library in LOADABLE_CLASSES:
+    ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
+
+
+def is_safetensors_compatible(filenames, passed_components=None, folder_names=None) -> bool:
+    """
+    Checking for safetensors compatibility:
+    - The model is safetensors compatible only if there is a safetensors file for each model component present in
+      filenames.
+
+    Converting default pytorch serialized filenames to safetensors serialized filenames:
+    - For models from the diffusers library, just replace the ".bin" extension with ".safetensors"
+    - For models from the transformers library, the filename changes from "pytorch_model" to "model", and the ".bin"
+      extension is replaced with ".safetensors"
+    """
+    passed_components = passed_components or []
+    if folder_names is not None:
+        filenames = {f for f in filenames if os.path.split(f)[0] in folder_names}
+
+    # extract all components of the pipeline and their associated files
+    components = {}
+    for filename in filenames:
+        if not len(filename.split("/")) == 2:
+            continue
+
+        component, component_filename = filename.split("/")
+        if component in passed_components:
+            continue
+
+        components.setdefault(component, [])
+        components[component].append(component_filename)
+
+    # iterate over all files of a component
+    # check if safetensor files exist for that component
+    # if variant is provided check if the variant of the safetensors exists
+    for component, component_filenames in components.items():
+        matches = []
+        for component_filename in component_filenames:
+            filename, extension = os.path.splitext(component_filename)
+
+            match_exists = extension == ".safetensors"
+            matches.append(match_exists)
+
+        if not any(matches):
+            return False
+
+    return True
+
+
+def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLike], str]:
+    weight_names = [
+        WEIGHTS_NAME,
+        SAFETENSORS_WEIGHTS_NAME,
+        FLAX_WEIGHTS_NAME,
+        ONNX_WEIGHTS_NAME,
+        ONNX_EXTERNAL_WEIGHTS_NAME,
+    ]
+
+    if is_transformers_available():
+        weight_names += [TRANSFORMERS_WEIGHTS_NAME, TRANSFORMERS_SAFE_WEIGHTS_NAME, TRANSFORMERS_FLAX_WEIGHTS_NAME]
+
+    # model_pytorch, diffusion_model_pytorch, ...
+    weight_prefixes = [w.split(".")[0] for w in weight_names]
+    # .bin, .safetensors, ...
+    weight_suffixs = [w.split(".")[-1] for w in weight_names]
+    # -00001-of-00002
+    transformers_index_format = r"\d{5}-of-\d{5}"
+
+    if variant is not None:
+        # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetensors`
+        variant_file_re = re.compile(
+            rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$"
+        )
+        # `text_encoder/pytorch_model.bin.index.fp16.json`
+        variant_index_re = re.compile(
+            rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$"
+        )
+
+    # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetensors`
+    non_variant_file_re = re.compile(
+        rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$"
+    )
+    # `text_encoder/pytorch_model.bin.index.json`
+    non_variant_index_re = re.compile(rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json")
+
+    if variant is not None:
+        variant_weights = {f for f in filenames if variant_file_re.match(f.split("/")[-1]) is not None}
+        variant_indexes = {f for f in filenames if variant_index_re.match(f.split("/")[-1]) is not None}
+        variant_filenames = variant_weights | variant_indexes
+    else:
+        variant_filenames = set()
+
+    non_variant_weights = {f for f in filenames if non_variant_file_re.match(f.split("/")[-1]) is not None}
+    non_variant_indexes = {f for f in filenames if non_variant_index_re.match(f.split("/")[-1]) is not None}
+    non_variant_filenames = non_variant_weights | non_variant_indexes
+
+    # all variant filenames will be used by default
+    usable_filenames = set(variant_filenames)
+
+    def convert_to_variant(filename):
+        if "index" in filename:
+            variant_filename = filename.replace("index", f"index.{variant}")
+        elif re.compile(f"^(.*?){transformers_index_format}").match(filename) is not None:
+            variant_filename = f"{filename.split('-')[0]}.{variant}-{'-'.join(filename.split('-')[1:])}"
+        else:
+            variant_filename = f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}"
+        return variant_filename
+
+    for f in non_variant_filenames:
+        variant_filename = convert_to_variant(f)
+        if variant_filename not in usable_filenames:
+            usable_filenames.add(f)
+
+    return usable_filenames, variant_filenames
+
+
+@validate_hf_hub_args
+def warn_deprecated_model_variant(pretrained_model_name_or_path, token, variant, revision, model_filenames):
+    info = model_info(
+        pretrained_model_name_or_path,
+        token=token,
+        revision=None,
+    )
+    filenames = {sibling.rfilename for sibling in info.siblings}
+    comp_model_filenames, _ = variant_compatible_siblings(filenames, variant=revision)
+    comp_model_filenames = [".".join(f.split(".")[:1] + f.split(".")[2:]) for f in comp_model_filenames]
+
+    if set(model_filenames).issubset(set(comp_model_filenames)):
+        warnings.warn(
+            f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` even though you can load it via `variant=`{revision}`. Loading model variants via `revision='{revision}'` is deprecated and will be removed in diffusers v1. Please use `variant='{revision}'` instead.",
+            FutureWarning,
+        )
+    else:
+        warnings.warn(
+            f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have the required variant filenames in the 'main' branch. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {revision} files' so that the correct variant file can be added.",
+            FutureWarning,
+        )
+
+
+def _unwrap_model(model):
+    """Unwraps a model."""
+    if is_compiled_module(model):
+        model = model._orig_mod
+
+    if is_peft_available():
+        from peft import PeftModel
+
+        if isinstance(model, PeftModel):
+            model = model.base_model.model
+
+    return model
+
+
+def maybe_raise_or_warn(
+    library_name, library, class_name, importable_classes, passed_class_obj, name, is_pipeline_module
+):
+    """Simple helper method to raise or warn in case incorrect module has been passed"""
+    if not is_pipeline_module:
+        library = importlib.import_module(library_name)
+        class_obj = getattr(library, class_name)
+        class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
+
+        expected_class_obj = None
+        for class_name, class_candidate in class_candidates.items():
+            if class_candidate is not None and issubclass(class_obj, class_candidate):
+                expected_class_obj = class_candidate
+
+        # Dynamo wraps the original model in a private class.
+        # I didn't find a public API to get the original class.
+        sub_model = passed_class_obj[name]
+        unwrapped_sub_model = _unwrap_model(sub_model)
+        model_cls = unwrapped_sub_model.__class__
+
+        if not issubclass(model_cls, expected_class_obj):
+            raise ValueError(
+                f"{passed_class_obj[name]} is of type: {model_cls}, but should be" f" {expected_class_obj}"
+            )
+    else:
+        logger.warning(
+            f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
+            " has the correct type"
+        )
+
+
+def get_class_obj_and_candidates(
+    library_name, class_name, importable_classes, pipelines, is_pipeline_module, component_name=None, cache_dir=None
+):
+    """Simple helper method to retrieve class object of module as well as potential parent class objects"""
+    component_folder = os.path.join(cache_dir, component_name)
+
+    if is_pipeline_module:
+        pipeline_module = getattr(pipelines, library_name)
+
+        class_obj = getattr(pipeline_module, class_name)
+        class_candidates = {c: class_obj for c in importable_classes.keys()}
+    elif os.path.isfile(os.path.join(component_folder, library_name + ".py")):
+        # load custom component
+        class_obj = get_class_from_dynamic_module(
+            component_folder, module_file=library_name + ".py", class_name=class_name
+        )
+        class_candidates = {c: class_obj for c in importable_classes.keys()}
+    else:
+        # else we just import it from the library.
+        library = importlib.import_module(library_name)
+
+        class_obj = getattr(library, class_name)
+        class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
+
+    return class_obj, class_candidates
+
+
+def _get_custom_pipeline_class(
+    custom_pipeline,
+    repo_id=None,
+    hub_revision=None,
+    class_name=None,
+    cache_dir=None,
+    revision=None,
+):
+    if custom_pipeline.endswith(".py"):
+        path = Path(custom_pipeline)
+        # decompose into folder & file
+        file_name = path.name
+        custom_pipeline = path.parent.absolute()
+    elif repo_id is not None:
+        file_name = f"{custom_pipeline}.py"
+        custom_pipeline = repo_id
+    else:
+        file_name = CUSTOM_PIPELINE_FILE_NAME
+
+    if repo_id is not None and hub_revision is not None:
+        # if we load the pipeline code from the Hub
+        # make sure to overwrite the `revision`
+        revision = hub_revision
+
+    return get_class_from_dynamic_module(
+        custom_pipeline,
+        module_file=file_name,
+        class_name=class_name,
+        cache_dir=cache_dir,
+        revision=revision,
+    )
+
+
+def _get_pipeline_class(
+    class_obj,
+    config=None,
+    load_connected_pipeline=False,
+    custom_pipeline=None,
+    repo_id=None,
+    hub_revision=None,
+    class_name=None,
+    cache_dir=None,
+    revision=None,
+):
+    if custom_pipeline is not None:
+        return _get_custom_pipeline_class(
+            custom_pipeline,
+            repo_id=repo_id,
+            hub_revision=hub_revision,
+            class_name=class_name,
+            cache_dir=cache_dir,
+            revision=revision,
+        )
+
+    if class_obj.__name__ != "DiffusionPipeline":
+        return class_obj
+
+    diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0])
+    class_name = class_name or config["_class_name"]
+    if not class_name:
+        raise ValueError(
+            "The class name could not be found in the configuration file. Please make sure to pass the correct `class_name`."
+        )
+
+    class_name = class_name[4:] if class_name.startswith("Flax") else class_name
+
+    pipeline_cls = getattr(diffusers_module, class_name)
+
+    if load_connected_pipeline:
+        from .auto_pipeline import _get_connected_pipeline
+
+        connected_pipeline_cls = _get_connected_pipeline(pipeline_cls)
+        if connected_pipeline_cls is not None:
+            logger.info(
+                f"Loading connected pipeline {connected_pipeline_cls.__name__} instead of {pipeline_cls.__name__} as specified via `load_connected_pipeline=True`"
+            )
+        else:
+            logger.info(f"{pipeline_cls.__name__} has no connected pipeline class. Loading {pipeline_cls.__name__}.")
+
+        pipeline_cls = connected_pipeline_cls or pipeline_cls
+
+    return pipeline_cls
+
+
+def _load_empty_model(
+    library_name: str,
+    class_name: str,
+    importable_classes: List[Any],
+    pipelines: Any,
+    is_pipeline_module: bool,
+    name: str,
+    torch_dtype: Union[str, torch.dtype],
+    cached_folder: Union[str, os.PathLike],
+    **kwargs,
+):
+    # retrieve class objects.
+    class_obj, _ = get_class_obj_and_candidates(
+        library_name,
+        class_name,
+        importable_classes,
+        pipelines,
+        is_pipeline_module,
+        component_name=name,
+        cache_dir=cached_folder,
+    )
+
+    if is_transformers_available():
+        transformers_version = version.parse(version.parse(transformers.__version__).base_version)
+    else:
+        transformers_version = "N/A"
+
+    # Determine library.
+    is_transformers_model = (
+        is_transformers_available()
+        and issubclass(class_obj, PreTrainedModel)
+        and transformers_version >= version.parse("4.20.0")
+    )
+    diffusers_module = importlib.import_module(__name__.split(".")[0])
+    is_diffusers_model = issubclass(class_obj, diffusers_module.ModelMixin)
+
+    model = None
+    config_path = cached_folder
+    user_agent = {
+        "diffusers": __version__,
+        "file_type": "model",
+        "framework": "pytorch",
+    }
+
+    if is_diffusers_model:
+        # Load config and then the model on meta.
+        config, unused_kwargs, commit_hash = class_obj.load_config(
+            os.path.join(config_path, name),
+            cache_dir=cached_folder,
+            return_unused_kwargs=True,
+            return_commit_hash=True,
+            force_download=kwargs.pop("force_download", False),
+            proxies=kwargs.pop("proxies", None),
+            local_files_only=kwargs.pop("local_files_only", False),
+            token=kwargs.pop("token", None),
+            revision=kwargs.pop("revision", None),
+            subfolder=kwargs.pop("subfolder", None),
+            user_agent=user_agent,
+        )
+        with accelerate.init_empty_weights():
+            model = class_obj.from_config(config, **unused_kwargs)
+    elif is_transformers_model:
+        config_class = getattr(class_obj, "config_class", None)
+        if config_class is None:
+            raise ValueError("`config_class` cannot be None. Please double-check the model.")
+
+        config = config_class.from_pretrained(
+            cached_folder,
+            subfolder=name,
+            force_download=kwargs.pop("force_download", False),
+            proxies=kwargs.pop("proxies", None),
+            local_files_only=kwargs.pop("local_files_only", False),
+            token=kwargs.pop("token", None),
+            revision=kwargs.pop("revision", None),
+            user_agent=user_agent,
+        )
+        with accelerate.init_empty_weights():
+            model = class_obj(config)
+
+    if model is not None:
+        model = model.to(dtype=torch_dtype)
+    return model
+
+
+def _assign_components_to_devices(
+    module_sizes: Dict[str, float], device_memory: Dict[str, float], device_mapping_strategy: str = "balanced"
+):
+    device_ids = list(device_memory.keys())
+    device_cycle = device_ids + device_ids[::-1]
+    device_memory = device_memory.copy()
+
+    device_id_component_mapping = {}
+    current_device_index = 0
+    for component in module_sizes:
+        device_id = device_cycle[current_device_index % len(device_cycle)]
+        component_memory = module_sizes[component]
+        curr_device_memory = device_memory[device_id]
+
+        # If the GPU doesn't fit the current component offload to the CPU.
+        if component_memory > curr_device_memory:
+            device_id_component_mapping["cpu"] = [component]
+        else:
+            if device_id not in device_id_component_mapping:
+                device_id_component_mapping[device_id] = [component]
+            else:
+                device_id_component_mapping[device_id].append(component)
+
+            # Update the device memory.
+            device_memory[device_id] -= component_memory
+            current_device_index += 1
+
+    return device_id_component_mapping
+
+
+def _get_final_device_map(device_map, pipeline_class, passed_class_obj, init_dict, library, max_memory, **kwargs):
+    # To avoid circular import problem.
+    from diffusers import pipelines
+
+    torch_dtype = kwargs.get("torch_dtype", torch.float32)
+
+    # Load each module in the pipeline on a meta device so that we can derive the device map.
+    init_empty_modules = {}
+    for name, (library_name, class_name) in init_dict.items():
+        if class_name.startswith("Flax"):
+            raise ValueError("Flax pipelines are not supported with `device_map`.")
+
+        # Define all importable classes
+        is_pipeline_module = hasattr(pipelines, library_name)
+        importable_classes = ALL_IMPORTABLE_CLASSES
+        loaded_sub_model = None
+
+        # Use passed sub model or load class_name from library_name
+        if name in passed_class_obj:
+            # if the model is in a pipeline module, then we load it from the pipeline
+            # check that passed_class_obj has correct parent class
+            maybe_raise_or_warn(
+                library_name,
+                library,
+                class_name,
+                importable_classes,
+                passed_class_obj,
+                name,
+                is_pipeline_module,
+            )
+            with accelerate.init_empty_weights():
+                loaded_sub_model = passed_class_obj[name]
+
+        else:
+            loaded_sub_model = _load_empty_model(
+                library_name=library_name,
+                class_name=class_name,
+                importable_classes=importable_classes,
+                pipelines=pipelines,
+                is_pipeline_module=is_pipeline_module,
+                pipeline_class=pipeline_class,
+                name=name,
+                torch_dtype=torch_dtype,
+                cached_folder=kwargs.get("cached_folder", None),
+                force_download=kwargs.get("force_download", None),
+                proxies=kwargs.get("proxies", None),
+                local_files_only=kwargs.get("local_files_only", None),
+                token=kwargs.get("token", None),
+                revision=kwargs.get("revision", None),
+            )
+
+        if loaded_sub_model is not None:
+            init_empty_modules[name] = loaded_sub_model
+
+    # determine device map
+    # Obtain a sorted dictionary for mapping the model-level components
+    # to their sizes.
+    module_sizes = {
+        module_name: compute_module_sizes(module, dtype=torch_dtype)[""]
+        for module_name, module in init_empty_modules.items()
+        if isinstance(module, torch.nn.Module)
+    }
+    module_sizes = dict(sorted(module_sizes.items(), key=lambda item: item[1], reverse=True))
+
+    # Obtain maximum memory available per device (GPUs only).
+    max_memory = get_max_memory(max_memory)
+    max_memory = dict(sorted(max_memory.items(), key=lambda item: item[1], reverse=True))
+    max_memory = {k: v for k, v in max_memory.items() if k != "cpu"}
+
+    # Obtain a dictionary mapping the model-level components to the available
+    # devices based on the maximum memory and the model sizes.
+    final_device_map = None
+    if len(max_memory) > 0:
+        device_id_component_mapping = _assign_components_to_devices(
+            module_sizes, max_memory, device_mapping_strategy=device_map
+        )
+
+        # Obtain the final device map, e.g., `{"unet": 0, "text_encoder": 1, "vae": 1, ...}`
+        final_device_map = {}
+        for device_id, components in device_id_component_mapping.items():
+            for component in components:
+                final_device_map[component] = device_id
+
+    return final_device_map
+
+
+def load_sub_model(
+    library_name: str,
+    class_name: str,
+    importable_classes: List[Any],
+    pipelines: Any,
+    is_pipeline_module: bool,
+    pipeline_class: Any,
+    torch_dtype: torch.dtype,
+    provider: Any,
+    sess_options: Any,
+    device_map: Optional[Union[Dict[str, torch.device], str]],
+    max_memory: Optional[Dict[Union[int, str], Union[int, str]]],
+    offload_folder: Optional[Union[str, os.PathLike]],
+    offload_state_dict: bool,
+    model_variants: Dict[str, str],
+    name: str,
+    from_flax: bool,
+    variant: str,
+    low_cpu_mem_usage: bool,
+    cached_folder: Union[str, os.PathLike],
+):
+    """Helper method to load the module `name` from `library_name` and `class_name`"""
+
+    # retrieve class candidates
+
+    class_obj, class_candidates = get_class_obj_and_candidates(
+        library_name,
+        class_name,
+        importable_classes,
+        pipelines,
+        is_pipeline_module,
+        component_name=name,
+        cache_dir=cached_folder,
+    )
+
+    load_method_name = None
+    # retrieve load method name
+    for class_name, class_candidate in class_candidates.items():
+        if class_candidate is not None and issubclass(class_obj, class_candidate):
+            load_method_name = importable_classes[class_name][1]
+
+    # if load method name is None, then we have a dummy module -> raise Error
+    if load_method_name is None:
+        none_module = class_obj.__module__
+        is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith(
+            TRANSFORMERS_DUMMY_MODULES_FOLDER
+        )
+        if is_dummy_path and "dummy" in none_module:
+            # call class_obj for nice error message of missing requirements
+            class_obj()
+
+        raise ValueError(
+            f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have"
+            f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}."
+        )
+
+    load_method = getattr(class_obj, load_method_name)
+
+    # add kwargs to loading method
+    diffusers_module = importlib.import_module(__name__.split(".")[0])
+    loading_kwargs = {}
+    if issubclass(class_obj, torch.nn.Module):
+        loading_kwargs["torch_dtype"] = torch_dtype
+    if issubclass(class_obj, diffusers_module.OnnxRuntimeModel):
+        loading_kwargs["provider"] = provider
+        loading_kwargs["sess_options"] = sess_options
+
+    is_diffusers_model = issubclass(class_obj, diffusers_module.ModelMixin)
+
+    if is_transformers_available():
+        transformers_version = version.parse(version.parse(transformers.__version__).base_version)
+    else:
+        transformers_version = "N/A"
+
+    is_transformers_model = (
+        is_transformers_available()
+        and issubclass(class_obj, PreTrainedModel)
+        and transformers_version >= version.parse("4.20.0")
+    )
+
+    # When loading a transformers model, if the device_map is None, the weights will be initialized as opposed to diffusers.
+    # To make default loading faster we set the `low_cpu_mem_usage=low_cpu_mem_usage` flag which is `True` by default.
+    # This makes sure that the weights won't be initialized which significantly speeds up loading.
+    if is_diffusers_model or is_transformers_model:
+        loading_kwargs["device_map"] = device_map
+        loading_kwargs["max_memory"] = max_memory
+        loading_kwargs["offload_folder"] = offload_folder
+        loading_kwargs["offload_state_dict"] = offload_state_dict
+        loading_kwargs["variant"] = model_variants.pop(name, None)
+
+        if from_flax:
+            loading_kwargs["from_flax"] = True
+
+        # the following can be deleted once the minimum required `transformers` version
+        # is higher than 4.27
+        if (
+            is_transformers_model
+            and loading_kwargs["variant"] is not None
+            and transformers_version < version.parse("4.27.0")
+        ):
+            raise ImportError(
+                f"When passing `variant='{variant}'`, please make sure to upgrade your `transformers` version to at least 4.27.0.dev0"
+            )
+        elif is_transformers_model and loading_kwargs["variant"] is None:
+            loading_kwargs.pop("variant")
+
+        # if `from_flax` and model is transformer model, can currently not load with `low_cpu_mem_usage`
+        if not (from_flax and is_transformers_model):
+            loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
+        else:
+            loading_kwargs["low_cpu_mem_usage"] = False
+
+    # check if the module is in a subdirectory
+    if os.path.isdir(os.path.join(cached_folder, name)):
+        loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs)
+    else:
+        # else load from the root directory
+        loaded_sub_model = load_method(cached_folder, **loading_kwargs)
+
+    if isinstance(loaded_sub_model, torch.nn.Module) and isinstance(device_map, dict):
+        # remove hooks
+        remove_hook_from_module(loaded_sub_model, recurse=True)
+        needs_offloading_to_cpu = device_map[""] == "cpu"
+
+        if needs_offloading_to_cpu:
+            dispatch_model(
+                loaded_sub_model,
+                state_dict=loaded_sub_model.state_dict(),
+                device_map=device_map,
+                force_hooks=True,
+                main_device=0,
+            )
+        else:
+            dispatch_model(loaded_sub_model, device_map=device_map, force_hooks=True)
+
+    return loaded_sub_model
+
+
+def _fetch_class_library_tuple(module):
+    # import it here to avoid circular import
+    diffusers_module = importlib.import_module(__name__.split(".")[0])
+    pipelines = getattr(diffusers_module, "pipelines")
+
+    # register the config from the original module, not the dynamo compiled one
+    not_compiled_module = _unwrap_model(module)
+    library = not_compiled_module.__module__.split(".")[0]
+
+    # check if the module is a pipeline module
+    module_path_items = not_compiled_module.__module__.split(".")
+    pipeline_dir = module_path_items[-2] if len(module_path_items) > 2 else None
+
+    path = not_compiled_module.__module__.split(".")
+    is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir)
+
+    # if library is not in LOADABLE_CLASSES, then it is a custom module.
+    # Or if it's a pipeline module, then the module is inside the pipeline
+    # folder so we set the library to module name.
+    if is_pipeline_module:
+        library = pipeline_dir
+    elif library not in LOADABLE_CLASSES:
+        library = not_compiled_module.__module__
+
+    # retrieve class_name
+    class_name = not_compiled_module.__class__.__name__
+
+    return (library, class_name)
+
+
+def _identify_model_variants(folder: str, variant: str, config: dict) -> dict:
+    model_variants = {}
+    if variant is not None:
+        for sub_folder in os.listdir(folder):
+            folder_path = os.path.join(folder, sub_folder)
+            is_folder = os.path.isdir(folder_path) and sub_folder in config
+            variant_exists = is_folder and any(p.split(".")[1].startswith(variant) for p in os.listdir(folder_path))
+            if variant_exists:
+                model_variants[sub_folder] = variant
+    return model_variants
+
+
+def _resolve_custom_pipeline_and_cls(folder, config, custom_pipeline):
+    custom_class_name = None
+    if os.path.isfile(os.path.join(folder, f"{custom_pipeline}.py")):
+        custom_pipeline = os.path.join(folder, f"{custom_pipeline}.py")
+    elif isinstance(config["_class_name"], (list, tuple)) and os.path.isfile(
+        os.path.join(folder, f"{config['_class_name'][0]}.py")
+    ):
+        custom_pipeline = os.path.join(folder, f"{config['_class_name'][0]}.py")
+        custom_class_name = config["_class_name"][1]
+
+    return custom_pipeline, custom_class_name
+
+
+def _maybe_raise_warning_for_inpainting(pipeline_class, pretrained_model_name_or_path: str, config: dict):
+    if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse(
+        version.parse(config["_diffusers_version"]).base_version
+    ) <= version.parse("0.5.1"):
+        from diffusers import StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy
+
+        pipeline_class = StableDiffusionInpaintPipelineLegacy
+
+        deprecation_message = (
+            "You are using a legacy checkpoint for inpainting with Stable Diffusion, therefore we are loading the"
+            f" {StableDiffusionInpaintPipelineLegacy} class instead of {StableDiffusionInpaintPipeline}. For"
+            " better inpainting results, we strongly suggest using Stable Diffusion's official inpainting"
+            " checkpoint: https://huggingface.co/runwayml/stable-diffusion-inpainting instead or adapting your"
+            f" checkpoint {pretrained_model_name_or_path} to the format of"
+            " https://huggingface.co/runwayml/stable-diffusion-inpainting. Note that we do not actively maintain"
+            " the {StableDiffusionInpaintPipelineLegacy} class and will likely remove it in version 1.0.0."
+        )
+        deprecate("StableDiffusionInpaintPipelineLegacy", "1.0.0", deprecation_message, standard_warn=False)
+
+
+def _update_init_kwargs_with_connected_pipeline(
+    init_kwargs: dict, passed_pipe_kwargs: dict, passed_class_objs: dict, folder: str, **pipeline_loading_kwargs
+) -> dict:
+    from .pipeline_utils import DiffusionPipeline
+
+    modelcard = ModelCard.load(os.path.join(folder, "README.md"))
+    connected_pipes = {prefix: getattr(modelcard.data, prefix, [None])[0] for prefix in CONNECTED_PIPES_KEYS}
+
+    # We don't scheduler argument to match the existing logic:
+    # https://github.com/huggingface/diffusers/blob/867e0c919e1aa7ef8b03c8eb1460f4f875a683ae/src/diffusers/pipelines/pipeline_utils.py#L906C13-L925C14
+    pipeline_loading_kwargs_cp = pipeline_loading_kwargs.copy()
+    if pipeline_loading_kwargs_cp is not None and len(pipeline_loading_kwargs_cp) >= 1:
+        for k in pipeline_loading_kwargs:
+            if "scheduler" in k:
+                _ = pipeline_loading_kwargs_cp.pop(k)
+
+    def get_connected_passed_kwargs(prefix):
+        connected_passed_class_obj = {
+            k.replace(f"{prefix}_", ""): w for k, w in passed_class_objs.items() if k.split("_")[0] == prefix
+        }
+        connected_passed_pipe_kwargs = {
+            k.replace(f"{prefix}_", ""): w for k, w in passed_pipe_kwargs.items() if k.split("_")[0] == prefix
+        }
+
+        connected_passed_kwargs = {**connected_passed_class_obj, **connected_passed_pipe_kwargs}
+        return connected_passed_kwargs
+
+    connected_pipes = {
+        prefix: DiffusionPipeline.from_pretrained(
+            repo_id, **pipeline_loading_kwargs_cp, **get_connected_passed_kwargs(prefix)
+        )
+        for prefix, repo_id in connected_pipes.items()
+        if repo_id is not None
+    }
+
+    for prefix, connected_pipe in connected_pipes.items():
+        # add connected pipes to `init_kwargs` with <prefix>_<component_name>, e.g. "prior_text_encoder"
+        init_kwargs.update(
+            {"_".join([prefix, name]): component for name, component in connected_pipe.components.items()}
+        )
+
+    return init_kwargs
diff --git a/diffusers/src/diffusers/pipelines/pipeline_utils.py b/diffusers/src/diffusers/pipelines/pipeline_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccd1c9485d0e462285dfe42ba80108ca667a30c6
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pipeline_utils.py
@@ -0,0 +1,1944 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import fnmatch
+import importlib
+import inspect
+import os
+import re
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin
+
+import numpy as np
+import PIL.Image
+import requests
+import torch
+from huggingface_hub import (
+    ModelCard,
+    create_repo,
+    hf_hub_download,
+    model_info,
+    snapshot_download,
+)
+from huggingface_hub.utils import OfflineModeIsEnabled, validate_hf_hub_args
+from packaging import version
+from requests.exceptions import HTTPError
+from tqdm.auto import tqdm
+
+from .. import __version__
+from ..configuration_utils import ConfigMixin
+from ..models import AutoencoderKL
+from ..models.attention_processor import FusedAttnProcessor2_0
+from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, ModelMixin
+from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
+from ..utils import (
+    CONFIG_NAME,
+    DEPRECATED_REVISION_ARGS,
+    BaseOutput,
+    PushToHubMixin,
+    deprecate,
+    is_accelerate_available,
+    is_accelerate_version,
+    is_torch_npu_available,
+    is_torch_version,
+    logging,
+    numpy_to_pil,
+)
+from ..utils.hub_utils import load_or_create_model_card, populate_model_card
+from ..utils.torch_utils import is_compiled_module
+
+
+if is_torch_npu_available():
+    import torch_npu  # noqa: F401
+
+
+from .pipeline_loading_utils import (
+    ALL_IMPORTABLE_CLASSES,
+    CONNECTED_PIPES_KEYS,
+    CUSTOM_PIPELINE_FILE_NAME,
+    LOADABLE_CLASSES,
+    _fetch_class_library_tuple,
+    _get_custom_pipeline_class,
+    _get_final_device_map,
+    _get_pipeline_class,
+    _identify_model_variants,
+    _maybe_raise_warning_for_inpainting,
+    _resolve_custom_pipeline_and_cls,
+    _unwrap_model,
+    _update_init_kwargs_with_connected_pipeline,
+    is_safetensors_compatible,
+    load_sub_model,
+    maybe_raise_or_warn,
+    variant_compatible_siblings,
+    warn_deprecated_model_variant,
+)
+
+
+if is_accelerate_available():
+    import accelerate
+
+
+LIBRARIES = []
+for library in LOADABLE_CLASSES:
+    LIBRARIES.append(library)
+
+SUPPORTED_DEVICE_MAP = ["balanced"]
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class ImagePipelineOutput(BaseOutput):
+    """
+    Output class for image pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+@dataclass
+class AudioPipelineOutput(BaseOutput):
+    """
+    Output class for audio pipelines.
+
+    Args:
+        audios (`np.ndarray`)
+            List of denoised audio samples of a NumPy array of shape `(batch_size, num_channels, sample_rate)`.
+    """
+
+    audios: np.ndarray
+
+
+class DiffusionPipeline(ConfigMixin, PushToHubMixin):
+    r"""
+    Base class for all pipelines.
+
+    [`DiffusionPipeline`] stores all components (models, schedulers, and processors) for diffusion pipelines and
+    provides methods for loading, downloading and saving models. It also includes methods to:
+
+        - move all PyTorch modules to the device of your choice
+        - enable/disable the progress bar for the denoising iteration
+
+    Class attributes:
+
+        - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the
+          diffusion pipeline's components.
+        - **_optional_components** (`List[str]`) -- List of all optional components that don't have to be passed to the
+          pipeline to function (should be overridden by subclasses).
+    """
+
+    config_name = "model_index.json"
+    model_cpu_offload_seq = None
+    hf_device_map = None
+    _optional_components = []
+    _exclude_from_cpu_offload = []
+    _load_connected_pipes = False
+    _is_onnx = False
+
+    def register_modules(self, **kwargs):
+        for name, module in kwargs.items():
+            # retrieve library
+            if module is None or isinstance(module, (tuple, list)) and module[0] is None:
+                register_dict = {name: (None, None)}
+            else:
+                library, class_name = _fetch_class_library_tuple(module)
+                register_dict = {name: (library, class_name)}
+
+            # save model index config
+            self.register_to_config(**register_dict)
+
+            # set models
+            setattr(self, name, module)
+
+    def __setattr__(self, name: str, value: Any):
+        if name in self.__dict__ and hasattr(self.config, name):
+            # We need to overwrite the config if name exists in config
+            if isinstance(getattr(self.config, name), (tuple, list)):
+                if value is not None and self.config[name][0] is not None:
+                    class_library_tuple = _fetch_class_library_tuple(value)
+                else:
+                    class_library_tuple = (None, None)
+
+                self.register_to_config(**{name: class_library_tuple})
+            else:
+                self.register_to_config(**{name: value})
+
+        super().__setattr__(name, value)
+
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        safe_serialization: bool = True,
+        variant: Optional[str] = None,
+        max_shard_size: Optional[Union[int, str]] = None,
+        push_to_hub: bool = False,
+        **kwargs,
+    ):
+        """
+        Save all saveable variables of the pipeline to a directory. A pipeline variable can be saved and loaded if its
+        class implements both a save and loading method. The pipeline is easily reloaded using the
+        [`~DiffusionPipeline.from_pretrained`] class method.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save a pipeline to. Will be created if it doesn't exist.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
+            variant (`str`, *optional*):
+                If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
+            max_shard_size (`int` or `str`, defaults to `None`):
+                The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
+                lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5GB"`).
+                If expressed as an integer, the unit is bytes. Note that this limit will be decreased after a certain
+                period of time (starting from Oct 2024) to allow users to upgrade to the latest version of `diffusers`.
+                This is to establish a common default size for this argument across different libraries in the Hugging
+                Face ecosystem (`transformers`, and `accelerate`, for example).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        model_index_dict = dict(self.config)
+        model_index_dict.pop("_class_name", None)
+        model_index_dict.pop("_diffusers_version", None)
+        model_index_dict.pop("_module", None)
+        model_index_dict.pop("_name_or_path", None)
+
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            private = kwargs.pop("private", False)
+            create_pr = kwargs.pop("create_pr", False)
+            token = kwargs.pop("token", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
+
+        expected_modules, optional_kwargs = self._get_signature_keys(self)
+
+        def is_saveable_module(name, value):
+            if name not in expected_modules:
+                return False
+            if name in self._optional_components and value[0] is None:
+                return False
+            return True
+
+        model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)}
+        for pipeline_component_name in model_index_dict.keys():
+            sub_model = getattr(self, pipeline_component_name)
+            model_cls = sub_model.__class__
+
+            # Dynamo wraps the original model in a private class.
+            # I didn't find a public API to get the original class.
+            if is_compiled_module(sub_model):
+                sub_model = _unwrap_model(sub_model)
+                model_cls = sub_model.__class__
+
+            save_method_name = None
+            # search for the model's base class in LOADABLE_CLASSES
+            for library_name, library_classes in LOADABLE_CLASSES.items():
+                if library_name in sys.modules:
+                    library = importlib.import_module(library_name)
+                else:
+                    logger.info(
+                        f"{library_name} is not installed. Cannot save {pipeline_component_name} as {library_classes} from {library_name}"
+                    )
+
+                for base_class, save_load_methods in library_classes.items():
+                    class_candidate = getattr(library, base_class, None)
+                    if class_candidate is not None and issubclass(model_cls, class_candidate):
+                        # if we found a suitable base class in LOADABLE_CLASSES then grab its save method
+                        save_method_name = save_load_methods[0]
+                        break
+                if save_method_name is not None:
+                    break
+
+            if save_method_name is None:
+                logger.warning(
+                    f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved."
+                )
+                # make sure that unsaveable components are not tried to be loaded afterward
+                self.register_to_config(**{pipeline_component_name: (None, None)})
+                continue
+
+            save_method = getattr(sub_model, save_method_name)
+
+            # Call the save method with the argument safe_serialization only if it's supported
+            save_method_signature = inspect.signature(save_method)
+            save_method_accept_safe = "safe_serialization" in save_method_signature.parameters
+            save_method_accept_variant = "variant" in save_method_signature.parameters
+            save_method_accept_max_shard_size = "max_shard_size" in save_method_signature.parameters
+
+            save_kwargs = {}
+            if save_method_accept_safe:
+                save_kwargs["safe_serialization"] = safe_serialization
+            if save_method_accept_variant:
+                save_kwargs["variant"] = variant
+            if save_method_accept_max_shard_size and max_shard_size is not None:
+                # max_shard_size is expected to not be None in ModelMixin
+                save_kwargs["max_shard_size"] = max_shard_size
+
+            save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs)
+
+        # finally save the config
+        self.save_config(save_directory)
+
+        if push_to_hub:
+            # Create a new empty model card and eventually tag it
+            model_card = load_or_create_model_card(repo_id, token=token, is_pipeline=True)
+            model_card = populate_model_card(model_card)
+            model_card.save(os.path.join(save_directory, "README.md"))
+
+            self._upload_folder(
+                save_directory,
+                repo_id,
+                token=token,
+                commit_message=commit_message,
+                create_pr=create_pr,
+            )
+
+    def to(self, *args, **kwargs):
+        r"""
+        Performs Pipeline dtype and/or device conversion. A torch.dtype and torch.device are inferred from the
+        arguments of `self.to(*args, **kwargs).`
+
+        <Tip>
+
+            If the pipeline already has the correct torch.dtype and torch.device, then it is returned as is. Otherwise,
+            the returned pipeline is a copy of self with the desired torch.dtype and torch.device.
+
+        </Tip>
+
+
+        Here are the ways to call `to`:
+
+        - `to(dtype, silence_dtype_warnings=False) → DiffusionPipeline` to return a pipeline with the specified
+          [`dtype`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype)
+        - `to(device, silence_dtype_warnings=False) → DiffusionPipeline` to return a pipeline with the specified
+          [`device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device)
+        - `to(device=None, dtype=None, silence_dtype_warnings=False) → DiffusionPipeline` to return a pipeline with the
+          specified [`device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device) and
+          [`dtype`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype)
+
+        Arguments:
+            dtype (`torch.dtype`, *optional*):
+                Returns a pipeline with the specified
+                [`dtype`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype)
+            device (`torch.Device`, *optional*):
+                Returns a pipeline with the specified
+                [`device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device)
+            silence_dtype_warnings (`str`, *optional*, defaults to `False`):
+                Whether to omit warnings if the target `dtype` is not compatible with the target `device`.
+
+        Returns:
+            [`DiffusionPipeline`]: The pipeline converted to specified `dtype` and/or `dtype`.
+        """
+        dtype = kwargs.pop("dtype", None)
+        device = kwargs.pop("device", None)
+        silence_dtype_warnings = kwargs.pop("silence_dtype_warnings", False)
+
+        dtype_arg = None
+        device_arg = None
+        if len(args) == 1:
+            if isinstance(args[0], torch.dtype):
+                dtype_arg = args[0]
+            else:
+                device_arg = torch.device(args[0]) if args[0] is not None else None
+        elif len(args) == 2:
+            if isinstance(args[0], torch.dtype):
+                raise ValueError(
+                    "When passing two arguments, make sure the first corresponds to `device` and the second to `dtype`."
+                )
+            device_arg = torch.device(args[0]) if args[0] is not None else None
+            dtype_arg = args[1]
+        elif len(args) > 2:
+            raise ValueError("Please make sure to pass at most two arguments (`device` and `dtype`) `.to(...)`")
+
+        if dtype is not None and dtype_arg is not None:
+            raise ValueError(
+                "You have passed `dtype` both as an argument and as a keyword argument. Please only pass one of the two."
+            )
+
+        dtype = dtype or dtype_arg
+
+        if device is not None and device_arg is not None:
+            raise ValueError(
+                "You have passed `device` both as an argument and as a keyword argument. Please only pass one of the two."
+            )
+
+        device = device or device_arg
+
+        # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU.
+        def module_is_sequentially_offloaded(module):
+            if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
+                return False
+
+            return hasattr(module, "_hf_hook") and (
+                isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook)
+                or hasattr(module._hf_hook, "hooks")
+                and isinstance(module._hf_hook.hooks[0], accelerate.hooks.AlignDevicesHook)
+            )
+
+        def module_is_offloaded(module):
+            if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"):
+                return False
+
+            return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.CpuOffload)
+
+        # .to("cuda") would raise an error if the pipeline is sequentially offloaded, so we raise our own to make it clearer
+        pipeline_is_sequentially_offloaded = any(
+            module_is_sequentially_offloaded(module) for _, module in self.components.items()
+        )
+        if pipeline_is_sequentially_offloaded and device and torch.device(device).type == "cuda":
+            raise ValueError(
+                "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
+            )
+
+        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        if is_pipeline_device_mapped:
+            raise ValueError(
+                "It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` first and then call `to()`."
+            )
+
+        # Display a warning in this case (the operation succeeds but the benefits are lost)
+        pipeline_is_offloaded = any(module_is_offloaded(module) for _, module in self.components.items())
+        if pipeline_is_offloaded and device and torch.device(device).type == "cuda":
+            logger.warning(
+                f"It seems like you have activated model offloading by calling `enable_model_cpu_offload`, but are now manually moving the pipeline to GPU. It is strongly recommended against doing so as memory gains from offloading are likely to be lost. Offloading automatically takes care of moving the individual components {', '.join(self.components.keys())} to GPU when needed. To make sure offloading works as expected, you should consider moving the pipeline back to CPU: `pipeline.to('cpu')` or removing the move altogether if you use offloading."
+            )
+
+        module_names, _ = self._get_signature_keys(self)
+        modules = [getattr(self, n, None) for n in module_names]
+        modules = [m for m in modules if isinstance(m, torch.nn.Module)]
+
+        is_offloaded = pipeline_is_offloaded or pipeline_is_sequentially_offloaded
+        for module in modules:
+            is_loaded_in_8bit = hasattr(module, "is_loaded_in_8bit") and module.is_loaded_in_8bit
+
+            if is_loaded_in_8bit and dtype is not None:
+                logger.warning(
+                    f"The module '{module.__class__.__name__}' has been loaded in 8bit and conversion to {dtype} is not yet supported. Module is still in 8bit precision."
+                )
+
+            if is_loaded_in_8bit and device is not None:
+                logger.warning(
+                    f"The module '{module.__class__.__name__}' has been loaded in 8bit and moving it to {dtype} via `.to()` is not yet supported. Module is still on {module.device}."
+                )
+            else:
+                module.to(device, dtype)
+
+            if (
+                module.dtype == torch.float16
+                and str(device) in ["cpu"]
+                and not silence_dtype_warnings
+                and not is_offloaded
+            ):
+                logger.warning(
+                    "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It"
+                    " is not recommended to move them to `cpu` as running them will fail. Please make"
+                    " sure to use an accelerator to run the pipeline in inference, due to the lack of"
+                    " support for`float16` operations on this device in PyTorch. Please, remove the"
+                    " `torch_dtype=torch.float16` argument, or use another device for inference."
+                )
+        return self
+
+    @property
+    def device(self) -> torch.device:
+        r"""
+        Returns:
+            `torch.device`: The torch device on which the pipeline is located.
+        """
+        module_names, _ = self._get_signature_keys(self)
+        modules = [getattr(self, n, None) for n in module_names]
+        modules = [m for m in modules if isinstance(m, torch.nn.Module)]
+
+        for module in modules:
+            return module.device
+
+        return torch.device("cpu")
+
+    @property
+    def dtype(self) -> torch.dtype:
+        r"""
+        Returns:
+            `torch.dtype`: The torch dtype on which the pipeline is located.
+        """
+        module_names, _ = self._get_signature_keys(self)
+        modules = [getattr(self, n, None) for n in module_names]
+        modules = [m for m in modules if isinstance(m, torch.nn.Module)]
+
+        for module in modules:
+            return module.dtype
+
+        return torch.float32
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+        r"""
+        Instantiate a PyTorch diffusion pipeline from pretrained pipeline weights.
+
+        The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        If you get the error message below, you need to finetune the weights for your downstream task:
+
+        ```
+        Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
+        - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
+        You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+        ```
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
+                      hosted on the Hub.
+                    - A path to a *directory* (for example `./my_pipeline_directory/`) containing pipeline weights
+                      saved using
+                    [`~DiffusionPipeline.save_pretrained`].
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the
+                dtype is automatically derived from the model's weights.
+            custom_pipeline (`str`, *optional*):
+
+                <Tip warning={true}>
+
+                🧪 This is an experimental feature and may change in the future.
+
+                </Tip>
+
+                Can be either:
+
+                    - A string, the *repo id* (for example `hf-internal-testing/diffusers-dummy-pipeline`) of a custom
+                      pipeline hosted on the Hub. The repository must contain a file called pipeline.py that defines
+                      the custom pipeline.
+                    - A string, the *file name* of a community pipeline hosted on GitHub under
+                      [Community](https://github.com/huggingface/diffusers/tree/main/examples/community). Valid file
+                      names must match the file name and not the pipeline script (`clip_guided_stable_diffusion`
+                      instead of `clip_guided_stable_diffusion.py`). Community pipelines are always loaded from the
+                      current main branch of GitHub.
+                    - A path to a directory (`./my_pipeline_directory/`) containing a custom pipeline. The directory
+                      must contain a file called `pipeline.py` that defines the custom pipeline.
+
+                For more information on how to load and create custom pipelines, please have a look at [Loading and
+                Adding Custom
+                Pipelines](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview)
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            custom_revision (`str`, *optional*):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id similar to
+                `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers
+                version.
+            mirror (`str`, *optional*):
+                Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn’t need to be defined for each
+                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
+                same device.
+
+                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            max_memory (`Dict`, *optional*):
+                A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
+                each GPU and the available CPU RAM if unset.
+            offload_folder (`str` or `os.PathLike`, *optional*):
+                The path to offload weights if device_map contains the value `"disk"`.
+            offload_state_dict (`bool`, *optional*):
+                If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
+                the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
+                when there is some disk offload.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
+            use_onnx (`bool`, *optional*, defaults to `None`):
+                If set to `True`, ONNX weights will always be downloaded if present. If set to `False`, ONNX weights
+                will never be downloaded. By default `use_onnx` defaults to the `_is_onnx` class attribute which is
+                `False` for non-ONNX pipelines and `True` for ONNX pipelines. ONNX weights include both files ending
+                with `.onnx` and `.pb`.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline
+                class). The overwritten components are passed directly to the pipelines `__init__` method. See example
+                below for more information.
+            variant (`str`, *optional*):
+                Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when
+                loading `from_flax`.
+
+        <Tip>
+
+        To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with
+        `huggingface-cli login`.
+
+        </Tip>
+
+        Examples:
+
+        ```py
+        >>> from diffusers import DiffusionPipeline
+
+        >>> # Download pipeline from huggingface.co and cache.
+        >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+
+        >>> # Download pipeline that requires an authorization token
+        >>> # For more information on access tokens, please refer to this section
+        >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens)
+        >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+
+        >>> # Use a different scheduler
+        >>> from diffusers import LMSDiscreteScheduler
+
+        >>> scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config)
+        >>> pipeline.scheduler = scheduler
+        ```
+        """
+        # Copy the kwargs to re-use during loading connected pipeline.
+        kwargs_copied = kwargs.copy()
+
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        from_flax = kwargs.pop("from_flax", False)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+        custom_pipeline = kwargs.pop("custom_pipeline", None)
+        custom_revision = kwargs.pop("custom_revision", None)
+        provider = kwargs.pop("provider", None)
+        sess_options = kwargs.pop("sess_options", None)
+        device_map = kwargs.pop("device_map", None)
+        max_memory = kwargs.pop("max_memory", None)
+        offload_folder = kwargs.pop("offload_folder", None)
+        offload_state_dict = kwargs.pop("offload_state_dict", False)
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
+        variant = kwargs.pop("variant", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+        use_onnx = kwargs.pop("use_onnx", None)
+        load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)
+
+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warning(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+
+        if device_map is not None and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `device_map=None`."
+            )
+
+        if device_map is not None and not is_accelerate_available():
+            raise NotImplementedError(
+                "Using `device_map` requires the `accelerate` library. Please install it using: `pip install accelerate`."
+            )
+
+        if device_map is not None and not isinstance(device_map, str):
+            raise ValueError("`device_map` must be a string.")
+
+        if device_map is not None and device_map not in SUPPORTED_DEVICE_MAP:
+            raise NotImplementedError(
+                f"{device_map} not supported. Supported strategies are: {', '.join(SUPPORTED_DEVICE_MAP)}"
+            )
+
+        if device_map is not None and device_map in SUPPORTED_DEVICE_MAP:
+            if is_accelerate_version("<", "0.28.0"):
+                raise NotImplementedError("Device placement requires `accelerate` version `0.28.0` or later.")
+
+        if low_cpu_mem_usage is False and device_map is not None:
+            raise ValueError(
+                f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and"
+                " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
+            )
+
+        # 1. Download the checkpoints and configs
+        # use snapshot download here to get it working from from_pretrained
+        if not os.path.isdir(pretrained_model_name_or_path):
+            if pretrained_model_name_or_path.count("/") > 1:
+                raise ValueError(
+                    f'The provided pretrained_model_name_or_path "{pretrained_model_name_or_path}"'
+                    " is neither a valid local path nor a valid repo id. Please check the parameter."
+                )
+            cached_folder = cls.download(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                from_flax=from_flax,
+                use_safetensors=use_safetensors,
+                use_onnx=use_onnx,
+                custom_pipeline=custom_pipeline,
+                custom_revision=custom_revision,
+                variant=variant,
+                load_connected_pipeline=load_connected_pipeline,
+                **kwargs,
+            )
+        else:
+            cached_folder = pretrained_model_name_or_path
+
+        config_dict = cls.load_config(cached_folder)
+
+        # pop out "_ignore_files" as it is only needed for download
+        config_dict.pop("_ignore_files", None)
+
+        # 2. Define which model components should load variants
+        # We retrieve the information by matching whether variant model checkpoints exist in the subfolders.
+        # Example: `diffusion_pytorch_model.safetensors` -> `diffusion_pytorch_model.fp16.safetensors`
+        # with variant being `"fp16"`.
+        model_variants = _identify_model_variants(folder=cached_folder, variant=variant, config=config_dict)
+
+        # 3. Load the pipeline class, if using custom module then load it from the hub
+        # if we load from explicit class, let's use it
+        custom_pipeline, custom_class_name = _resolve_custom_pipeline_and_cls(
+            folder=cached_folder, config=config_dict, custom_pipeline=custom_pipeline
+        )
+        pipeline_class = _get_pipeline_class(
+            cls,
+            config=config_dict,
+            load_connected_pipeline=load_connected_pipeline,
+            custom_pipeline=custom_pipeline,
+            class_name=custom_class_name,
+            cache_dir=cache_dir,
+            revision=custom_revision,
+        )
+
+        if device_map is not None and pipeline_class._load_connected_pipes:
+            raise NotImplementedError("`device_map` is not yet supported for connected pipelines.")
+
+        # DEPRECATED: To be removed in 1.0.0
+        # we are deprecating the `StableDiffusionInpaintPipelineLegacy` pipeline which gets loaded
+        # when a user requests for a `StableDiffusionInpaintPipeline` with `diffusers` version being <= 0.5.1.
+        _maybe_raise_warning_for_inpainting(
+            pipeline_class=pipeline_class,
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
+            config=config_dict,
+        )
+
+        # 4. Define expected modules given pipeline signature
+        # and define non-None initialized modules (=`init_kwargs`)
+
+        # some modules can be passed directly to the init
+        # in this case they are already instantiated in `kwargs`
+        # extract them here
+        expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
+        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
+        init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
+
+        # define init kwargs and make sure that optional component modules are filtered out
+        init_kwargs = {
+            k: init_dict.pop(k)
+            for k in optional_kwargs
+            if k in init_dict and k not in pipeline_class._optional_components
+        }
+        init_kwargs = {**init_kwargs, **passed_pipe_kwargs}
+
+        # remove `null` components
+        def load_module(name, value):
+            if value[0] is None:
+                return False
+            if name in passed_class_obj and passed_class_obj[name] is None:
+                return False
+            return True
+
+        init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
+
+        # Special case: safety_checker must be loaded separately when using `from_flax`
+        if from_flax and "safety_checker" in init_dict and "safety_checker" not in passed_class_obj:
+            raise NotImplementedError(
+                "The safety checker cannot be automatically loaded when loading weights `from_flax`."
+                " Please, pass `safety_checker=None` to `from_pretrained`, and load the safety checker"
+                " separately if you need it."
+            )
+
+        # 5. Throw nice warnings / errors for fast accelerate loading
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored."
+            )
+
+        # import it here to avoid circular import
+        from diffusers import pipelines
+
+        # 6. device map delegation
+        final_device_map = None
+        if device_map is not None:
+            final_device_map = _get_final_device_map(
+                device_map=device_map,
+                pipeline_class=pipeline_class,
+                passed_class_obj=passed_class_obj,
+                init_dict=init_dict,
+                library=library,
+                max_memory=max_memory,
+                torch_dtype=torch_dtype,
+                cached_folder=cached_folder,
+                force_download=force_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+            )
+
+        # 7. Load each module in the pipeline
+        current_device_map = None
+        for name, (library_name, class_name) in logging.tqdm(init_dict.items(), desc="Loading pipeline components..."):
+            # 7.1 device_map shenanigans
+            if final_device_map is not None and len(final_device_map) > 0:
+                component_device = final_device_map.get(name, None)
+                if component_device is not None:
+                    current_device_map = {"": component_device}
+                else:
+                    current_device_map = None
+
+            # 7.2 - now that JAX/Flax is an official framework of the library, we might load from Flax names
+            class_name = class_name[4:] if class_name.startswith("Flax") else class_name
+
+            # 7.3 Define all importable classes
+            is_pipeline_module = hasattr(pipelines, library_name)
+            importable_classes = ALL_IMPORTABLE_CLASSES
+            loaded_sub_model = None
+
+            # 7.4 Use passed sub model or load class_name from library_name
+            if name in passed_class_obj:
+                # if the model is in a pipeline module, then we load it from the pipeline
+                # check that passed_class_obj has correct parent class
+                maybe_raise_or_warn(
+                    library_name, library, class_name, importable_classes, passed_class_obj, name, is_pipeline_module
+                )
+
+                loaded_sub_model = passed_class_obj[name]
+            else:
+                # load sub model
+                loaded_sub_model = load_sub_model(
+                    library_name=library_name,
+                    class_name=class_name,
+                    importable_classes=importable_classes,
+                    pipelines=pipelines,
+                    is_pipeline_module=is_pipeline_module,
+                    pipeline_class=pipeline_class,
+                    torch_dtype=torch_dtype,
+                    provider=provider,
+                    sess_options=sess_options,
+                    device_map=current_device_map,
+                    max_memory=max_memory,
+                    offload_folder=offload_folder,
+                    offload_state_dict=offload_state_dict,
+                    model_variants=model_variants,
+                    name=name,
+                    from_flax=from_flax,
+                    variant=variant,
+                    low_cpu_mem_usage=low_cpu_mem_usage,
+                    cached_folder=cached_folder,
+                )
+                logger.info(
+                    f"Loaded {name} as {class_name} from `{name}` subfolder of {pretrained_model_name_or_path}."
+                )
+
+            init_kwargs[name] = loaded_sub_model  # UNet(...), # DiffusionSchedule(...)
+
+        # 8. Handle connected pipelines.
+        if pipeline_class._load_connected_pipes and os.path.isfile(os.path.join(cached_folder, "README.md")):
+            init_kwargs = _update_init_kwargs_with_connected_pipeline(
+                init_kwargs=init_kwargs,
+                passed_pipe_kwargs=passed_pipe_kwargs,
+                passed_class_objs=passed_class_obj,
+                folder=cached_folder,
+                **kwargs_copied,
+            )
+
+        # 9. Potentially add passed objects if expected
+        missing_modules = set(expected_modules) - set(init_kwargs.keys())
+        passed_modules = list(passed_class_obj.keys())
+        optional_modules = pipeline_class._optional_components
+        if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules):
+            for module in missing_modules:
+                init_kwargs[module] = passed_class_obj.get(module, None)
+        elif len(missing_modules) > 0:
+            passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs
+            raise ValueError(
+                f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed."
+            )
+
+        # 10. Instantiate the pipeline
+        model = pipeline_class(**init_kwargs)
+
+        # 11. Save where the model was instantiated from
+        model.register_to_config(_name_or_path=pretrained_model_name_or_path)
+        if device_map is not None:
+            setattr(model, "hf_device_map", final_device_map)
+        return model
+
+    @property
+    def name_or_path(self) -> str:
+        return getattr(self.config, "_name_or_path", None)
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
+        Accelerate's module hooks.
+        """
+        for name, model in self.components.items():
+            if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
+                continue
+
+            if not hasattr(model, "_hf_hook"):
+                return self.device
+            for module in model.modules():
+                if (
+                    hasattr(module, "_hf_hook")
+                    and hasattr(module._hf_hook, "execution_device")
+                    and module._hf_hook.execution_device is not None
+                ):
+                    return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def remove_all_hooks(self):
+        r"""
+        Removes all hooks that were added when using `enable_sequential_cpu_offload` or `enable_model_cpu_offload`.
+        """
+        for _, model in self.components.items():
+            if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"):
+                accelerate.hooks.remove_hook_from_module(model, recurse=True)
+        self._all_hooks = []
+
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+
+        Arguments:
+            gpu_id (`int`, *optional*):
+                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
+            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
+                default to "cuda".
+        """
+        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        if is_pipeline_device_mapped:
+            raise ValueError(
+                "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
+            )
+
+        if self.model_cpu_offload_seq is None:
+            raise ValueError(
+                "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
+            )
+
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        self.remove_all_hooks()
+
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
+        self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{self._offload_gpu_id}")
+        self._offload_device = device
+
+        self.to("cpu", silence_dtype_warnings=True)
+        device_mod = getattr(torch, device.type, None)
+        if hasattr(device_mod, "empty_cache") and device_mod.is_available():
+            device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
+
+        self._all_hooks = []
+        hook = None
+        for model_str in self.model_cpu_offload_seq.split("->"):
+            model = all_model_components.pop(model_str, None)
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
+            self._all_hooks.append(hook)
+
+        # CPU offload models that are not in the seq chain unless they are explicitly excluded
+        # these models will stay on CPU until maybe_free_model_hooks is called
+        # some models cannot be in the seq chain because they are iteratively called, such as controlnet
+        for name, model in all_model_components.items():
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            if name in self._exclude_from_cpu_offload:
+                model.to(device)
+            else:
+                _, hook = cpu_offload_with_hook(model, device)
+                self._all_hooks.append(hook)
+
+    def maybe_free_model_hooks(self):
+        r"""
+        Function that offloads all components, removes all model hooks that were added when using
+        `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function
+        is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it
+        functions correctly when applying enable_model_cpu_offload.
+        """
+        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
+            # `enable_model_cpu_offload` has not be called, so silently do nothing
+            return
+
+        # make sure the model is in the same state as before calling it
+        self.enable_model_cpu_offload(device=getattr(self, "_offload_device", "cuda"))
+
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using 🤗 Accelerate, significantly reducing memory usage. When called, the state
+        dicts of all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are saved to CPU
+        and then moved to `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward`
+        method called. Offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+
+        Arguments:
+            gpu_id (`int`, *optional*):
+                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
+            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
+                default to "cuda".
+        """
+        if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
+        self.remove_all_hooks()
+
+        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        if is_pipeline_device_mapped:
+            raise ValueError(
+                "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
+            )
+
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
+        self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{self._offload_gpu_id}")
+        self._offload_device = device
+
+        if self.device.type != "cpu":
+            self.to("cpu", silence_dtype_warnings=True)
+            device_mod = getattr(torch, self.device.type, None)
+            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
+                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        for name, model in self.components.items():
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            if name in self._exclude_from_cpu_offload:
+                model.to(device)
+            else:
+                # make sure to offload buffers if not all high level weights
+                # are of type nn.Module
+                offload_buffers = len(model._parameters) > 0
+                cpu_offload(model, device, offload_buffers=offload_buffers)
+
+    def reset_device_map(self):
+        r"""
+        Resets the device maps (if any) to None.
+        """
+        if self.hf_device_map is None:
+            return
+        else:
+            self.remove_all_hooks()
+            for name, component in self.components.items():
+                if isinstance(component, torch.nn.Module):
+                    component.to("cpu")
+            self.hf_device_map = None
+
+    @classmethod
+    @validate_hf_hub_args
+    def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
+        r"""
+        Download and cache a PyTorch diffusion pipeline from pretrained pipeline weights.
+
+        Parameters:
+            pretrained_model_name (`str` or `os.PathLike`, *optional*):
+                A string, the *repository id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
+                hosted on the Hub.
+            custom_pipeline (`str`, *optional*):
+                Can be either:
+
+                    - A string, the *repository id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained
+                      pipeline hosted on the Hub. The repository must contain a file called `pipeline.py` that defines
+                      the custom pipeline.
+
+                    - A string, the *file name* of a community pipeline hosted on GitHub under
+                      [Community](https://github.com/huggingface/diffusers/tree/main/examples/community). Valid file
+                      names must match the file name and not the pipeline script (`clip_guided_stable_diffusion`
+                      instead of `clip_guided_stable_diffusion.py`). Community pipelines are always loaded from the
+                      current `main` branch of GitHub.
+
+                    - A path to a *directory* (`./my_pipeline_directory/`) containing a custom pipeline. The directory
+                      must contain a file called `pipeline.py` that defines the custom pipeline.
+
+                <Tip warning={true}>
+
+                🧪 This is an experimental feature and may change in the future.
+
+                </Tip>
+
+                For more information on how to load and create custom pipelines, take a look at [How to contribute a
+                community pipeline](https://huggingface.co/docs/diffusers/main/en/using-diffusers/contribute_pipeline).
+
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            custom_revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id similar to
+                `revision` when loading a custom pipeline from the Hub. It can be a 🤗 Diffusers version when loading a
+                custom pipeline from GitHub, otherwise it defaults to `"main"` when loading from the Hub.
+            mirror (`str`, *optional*):
+                Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
+            variant (`str`, *optional*):
+                Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when
+                loading `from_flax`.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
+            use_onnx (`bool`, *optional*, defaults to `False`):
+                If set to `True`, ONNX weights will always be downloaded if present. If set to `False`, ONNX weights
+                will never be downloaded. By default `use_onnx` defaults to the `_is_onnx` class attribute which is
+                `False` for non-ONNX pipelines and `True` for ONNX pipelines. ONNX weights include both files ending
+                with `.onnx` and `.pb`.
+            trust_remote_code (`bool`, *optional*, defaults to `False`):
+                Whether or not to allow for custom pipelines and components defined on the Hub in their own files. This
+                option should only be set to `True` for repositories you trust and in which you have read the code, as
+                it will execute code present on the Hub on your local machine.
+
+        Returns:
+            `os.PathLike`:
+                A path to the downloaded pipeline.
+
+        <Tip>
+
+        To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
+        `huggingface-cli login`.
+
+        </Tip>
+
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        from_flax = kwargs.pop("from_flax", False)
+        custom_pipeline = kwargs.pop("custom_pipeline", None)
+        custom_revision = kwargs.pop("custom_revision", None)
+        variant = kwargs.pop("variant", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+        use_onnx = kwargs.pop("use_onnx", None)
+        load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)
+        trust_remote_code = kwargs.pop("trust_remote_code", False)
+
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+
+        allow_patterns = None
+        ignore_patterns = None
+
+        model_info_call_error: Optional[Exception] = None
+        if not local_files_only:
+            try:
+                info = model_info(pretrained_model_name, token=token, revision=revision)
+            except (HTTPError, OfflineModeIsEnabled, requests.ConnectionError) as e:
+                logger.warning(f"Couldn't connect to the Hub: {e}.\nWill try to load from local cache.")
+                local_files_only = True
+                model_info_call_error = e  # save error to reraise it if model is not cached locally
+
+        if not local_files_only:
+            config_file = hf_hub_download(
+                pretrained_model_name,
+                cls.config_name,
+                cache_dir=cache_dir,
+                revision=revision,
+                proxies=proxies,
+                force_download=force_download,
+                token=token,
+            )
+
+            config_dict = cls._dict_from_json_file(config_file)
+            ignore_filenames = config_dict.pop("_ignore_files", [])
+
+            # retrieve all folder_names that contain relevant files
+            folder_names = [k for k, v in config_dict.items() if isinstance(v, list) and k != "_class_name"]
+
+            filenames = {sibling.rfilename for sibling in info.siblings}
+            model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant)
+
+            diffusers_module = importlib.import_module(__name__.split(".")[0])
+            pipelines = getattr(diffusers_module, "pipelines")
+
+            # optionally create a custom component <> custom file mapping
+            custom_components = {}
+            for component in folder_names:
+                module_candidate = config_dict[component][0]
+
+                if module_candidate is None or not isinstance(module_candidate, str):
+                    continue
+
+                # We compute candidate file path on the Hub. Do not use `os.path.join`.
+                candidate_file = f"{component}/{module_candidate}.py"
+
+                if candidate_file in filenames:
+                    custom_components[component] = module_candidate
+                elif module_candidate not in LOADABLE_CLASSES and not hasattr(pipelines, module_candidate):
+                    raise ValueError(
+                        f"{candidate_file} as defined in `model_index.json` does not exist in {pretrained_model_name} and is not a module in 'diffusers/pipelines'."
+                    )
+
+            if len(variant_filenames) == 0 and variant is not None:
+                deprecation_message = (
+                    f"You are trying to load the model files of the `variant={variant}`, but no such modeling files are available."
+                    f"The default model files: {model_filenames} will be loaded instead. Make sure to not load from `variant={variant}`"
+                    "if such variant modeling files are not available. Doing so will lead to an error in v0.24.0 as defaulting to non-variant"
+                    "modeling files is deprecated."
+                )
+                deprecate("no variant default", "0.24.0", deprecation_message, standard_warn=False)
+
+            # remove ignored filenames
+            model_filenames = set(model_filenames) - set(ignore_filenames)
+            variant_filenames = set(variant_filenames) - set(ignore_filenames)
+
+            # if the whole pipeline is cached we don't have to ping the Hub
+            if revision in DEPRECATED_REVISION_ARGS and version.parse(
+                version.parse(__version__).base_version
+            ) >= version.parse("0.22.0"):
+                warn_deprecated_model_variant(pretrained_model_name, token, variant, revision, model_filenames)
+
+            model_folder_names = {os.path.split(f)[0] for f in model_filenames if os.path.split(f)[0] in folder_names}
+
+            custom_class_name = None
+            if custom_pipeline is None and isinstance(config_dict["_class_name"], (list, tuple)):
+                custom_pipeline = config_dict["_class_name"][0]
+                custom_class_name = config_dict["_class_name"][1]
+
+            # all filenames compatible with variant will be added
+            allow_patterns = list(model_filenames)
+
+            # allow all patterns from non-model folders
+            # this enables downloading schedulers, tokenizers, ...
+            allow_patterns += [f"{k}/*" for k in folder_names if k not in model_folder_names]
+            # add custom component files
+            allow_patterns += [f"{k}/{f}.py" for k, f in custom_components.items()]
+            # add custom pipeline file
+            allow_patterns += [f"{custom_pipeline}.py"] if f"{custom_pipeline}.py" in filenames else []
+            # also allow downloading config.json files with the model
+            allow_patterns += [os.path.join(k, "config.json") for k in model_folder_names]
+
+            allow_patterns += [
+                SCHEDULER_CONFIG_NAME,
+                CONFIG_NAME,
+                cls.config_name,
+                CUSTOM_PIPELINE_FILE_NAME,
+            ]
+
+            load_pipe_from_hub = custom_pipeline is not None and f"{custom_pipeline}.py" in filenames
+            load_components_from_hub = len(custom_components) > 0
+
+            if load_pipe_from_hub and not trust_remote_code:
+                raise ValueError(
+                    f"The repository for {pretrained_model_name} contains custom code in {custom_pipeline}.py which must be executed to correctly "
+                    f"load the model. You can inspect the repository content at https://hf.co/{pretrained_model_name}/blob/main/{custom_pipeline}.py.\n"
+                    f"Please pass the argument `trust_remote_code=True` to allow custom code to be run."
+                )
+
+            if load_components_from_hub and not trust_remote_code:
+                raise ValueError(
+                    f"The repository for {pretrained_model_name} contains custom code in {'.py, '.join([os.path.join(k, v) for k,v in custom_components.items()])} which must be executed to correctly "
+                    f"load the model. You can inspect the repository content at {', '.join([f'https://hf.co/{pretrained_model_name}/{k}/{v}.py' for k,v in custom_components.items()])}.\n"
+                    f"Please pass the argument `trust_remote_code=True` to allow custom code to be run."
+                )
+
+            # retrieve passed components that should not be downloaded
+            pipeline_class = _get_pipeline_class(
+                cls,
+                config_dict,
+                load_connected_pipeline=load_connected_pipeline,
+                custom_pipeline=custom_pipeline,
+                repo_id=pretrained_model_name if load_pipe_from_hub else None,
+                hub_revision=revision,
+                class_name=custom_class_name,
+                cache_dir=cache_dir,
+                revision=custom_revision,
+            )
+            expected_components, _ = cls._get_signature_keys(pipeline_class)
+            passed_components = [k for k in expected_components if k in kwargs]
+
+            if (
+                use_safetensors
+                and not allow_pickle
+                and not is_safetensors_compatible(
+                    model_filenames, passed_components=passed_components, folder_names=model_folder_names
+                )
+            ):
+                raise EnvironmentError(
+                    f"Could not find the necessary `safetensors` weights in {model_filenames} (variant={variant})"
+                )
+            if from_flax:
+                ignore_patterns = ["*.bin", "*.safetensors", "*.onnx", "*.pb"]
+            elif use_safetensors and is_safetensors_compatible(
+                model_filenames, passed_components=passed_components, folder_names=model_folder_names
+            ):
+                ignore_patterns = ["*.bin", "*.msgpack"]
+
+                use_onnx = use_onnx if use_onnx is not None else pipeline_class._is_onnx
+                if not use_onnx:
+                    ignore_patterns += ["*.onnx", "*.pb"]
+
+                safetensors_variant_filenames = {f for f in variant_filenames if f.endswith(".safetensors")}
+                safetensors_model_filenames = {f for f in model_filenames if f.endswith(".safetensors")}
+                if (
+                    len(safetensors_variant_filenames) > 0
+                    and safetensors_model_filenames != safetensors_variant_filenames
+                ):
+                    logger.warning(
+                        f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(safetensors_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(safetensors_model_filenames - safetensors_variant_filenames)}\nIf this behavior is not expected, please check your folder structure."
+                    )
+            else:
+                ignore_patterns = ["*.safetensors", "*.msgpack"]
+
+                use_onnx = use_onnx if use_onnx is not None else pipeline_class._is_onnx
+                if not use_onnx:
+                    ignore_patterns += ["*.onnx", "*.pb"]
+
+                bin_variant_filenames = {f for f in variant_filenames if f.endswith(".bin")}
+                bin_model_filenames = {f for f in model_filenames if f.endswith(".bin")}
+                if len(bin_variant_filenames) > 0 and bin_model_filenames != bin_variant_filenames:
+                    logger.warning(
+                        f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(bin_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(bin_model_filenames - bin_variant_filenames)}\nIf this behavior is not expected, please check your folder structure."
+                    )
+
+            # Don't download any objects that are passed
+            allow_patterns = [
+                p for p in allow_patterns if not (len(p.split("/")) == 2 and p.split("/")[0] in passed_components)
+            ]
+
+            if pipeline_class._load_connected_pipes:
+                allow_patterns.append("README.md")
+
+            # Don't download index files of forbidden patterns either
+            ignore_patterns = ignore_patterns + [f"{i}.index.*json" for i in ignore_patterns]
+            re_ignore_pattern = [re.compile(fnmatch.translate(p)) for p in ignore_patterns]
+            re_allow_pattern = [re.compile(fnmatch.translate(p)) for p in allow_patterns]
+
+            expected_files = [f for f in filenames if not any(p.match(f) for p in re_ignore_pattern)]
+            expected_files = [f for f in expected_files if any(p.match(f) for p in re_allow_pattern)]
+
+            snapshot_folder = Path(config_file).parent
+            pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files)
+
+            if pipeline_is_cached and not force_download:
+                # if the pipeline is cached, we can directly return it
+                # else call snapshot_download
+                return snapshot_folder
+
+        user_agent = {"pipeline_class": cls.__name__}
+        if custom_pipeline is not None and not custom_pipeline.endswith(".py"):
+            user_agent["custom_pipeline"] = custom_pipeline
+
+        # download all allow_patterns - ignore_patterns
+        try:
+            cached_folder = snapshot_download(
+                pretrained_model_name,
+                cache_dir=cache_dir,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                allow_patterns=allow_patterns,
+                ignore_patterns=ignore_patterns,
+                user_agent=user_agent,
+            )
+
+            # retrieve pipeline class from local file
+            cls_name = cls.load_config(os.path.join(cached_folder, "model_index.json")).get("_class_name", None)
+            cls_name = cls_name[4:] if isinstance(cls_name, str) and cls_name.startswith("Flax") else cls_name
+
+            diffusers_module = importlib.import_module(__name__.split(".")[0])
+            pipeline_class = getattr(diffusers_module, cls_name, None) if isinstance(cls_name, str) else None
+
+            if pipeline_class is not None and pipeline_class._load_connected_pipes:
+                modelcard = ModelCard.load(os.path.join(cached_folder, "README.md"))
+                connected_pipes = sum([getattr(modelcard.data, k, []) for k in CONNECTED_PIPES_KEYS], [])
+                for connected_pipe_repo_id in connected_pipes:
+                    download_kwargs = {
+                        "cache_dir": cache_dir,
+                        "force_download": force_download,
+                        "proxies": proxies,
+                        "local_files_only": local_files_only,
+                        "token": token,
+                        "variant": variant,
+                        "use_safetensors": use_safetensors,
+                    }
+                    DiffusionPipeline.download(connected_pipe_repo_id, **download_kwargs)
+
+            return cached_folder
+
+        except FileNotFoundError:
+            # Means we tried to load pipeline with `local_files_only=True` but the files have not been found in local cache.
+            # This can happen in two cases:
+            # 1. If the user passed `local_files_only=True`                    => we raise the error directly
+            # 2. If we forced `local_files_only=True` when `model_info` failed => we raise the initial error
+            if model_info_call_error is None:
+                # 1. user passed `local_files_only=True`
+                raise
+            else:
+                # 2. we forced `local_files_only=True` when `model_info` failed
+                raise EnvironmentError(
+                    f"Cannot load model {pretrained_model_name}: model is not cached locally and an error occurred"
+                    " while trying to fetch metadata from the Hub. Please check out the root cause in the stacktrace"
+                    " above."
+                ) from model_info_call_error
+
+    @classmethod
+    def _get_signature_keys(cls, obj):
+        parameters = inspect.signature(obj.__init__).parameters
+        required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        expected_modules = set(required_parameters.keys()) - {"self"}
+
+        optional_names = list(optional_parameters)
+        for name in optional_names:
+            if name in cls._optional_components:
+                expected_modules.add(name)
+                optional_parameters.remove(name)
+
+        return expected_modules, optional_parameters
+
+    @classmethod
+    def _get_signature_types(cls):
+        signature_types = {}
+        for k, v in inspect.signature(cls.__init__).parameters.items():
+            if inspect.isclass(v.annotation):
+                signature_types[k] = (v.annotation,)
+            elif get_origin(v.annotation) == Union:
+                signature_types[k] = get_args(v.annotation)
+            else:
+                logger.warning(f"cannot get type annotation for Parameter {k} of {cls}.")
+        return signature_types
+
+    @property
+    def components(self) -> Dict[str, Any]:
+        r"""
+        The `self.components` property can be useful to run different pipelines with the same weights and
+        configurations without reallocating additional memory.
+
+        Returns (`dict`):
+            A dictionary containing all the modules needed to initialize the pipeline.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import (
+        ...     StableDiffusionPipeline,
+        ...     StableDiffusionImg2ImgPipeline,
+        ...     StableDiffusionInpaintPipeline,
+        ... )
+
+        >>> text2img = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
+        >>> inpaint = StableDiffusionInpaintPipeline(**text2img.components)
+        ```
+        """
+        expected_modules, optional_parameters = self._get_signature_keys(self)
+        components = {
+            k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters
+        }
+
+        if set(components.keys()) != expected_modules:
+            raise ValueError(
+                f"{self} has been incorrectly initialized or {self.__class__} is incorrectly implemented. Expected"
+                f" {expected_modules} to be defined, but {components.keys()} are defined."
+            )
+
+        return components
+
+    @staticmethod
+    def numpy_to_pil(images):
+        """
+        Convert a NumPy image or a batch of images to a PIL image.
+        """
+        return numpy_to_pil(images)
+
+    def progress_bar(self, iterable=None, total=None):
+        if not hasattr(self, "_progress_bar_config"):
+            self._progress_bar_config = {}
+        elif not isinstance(self._progress_bar_config, dict):
+            raise ValueError(
+                f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
+            )
+
+        if iterable is not None:
+            return tqdm(iterable, **self._progress_bar_config)
+        elif total is not None:
+            return tqdm(total=total, **self._progress_bar_config)
+        else:
+            raise ValueError("Either `total` or `iterable` has to be defined.")
+
+    def set_progress_bar_config(self, **kwargs):
+        self._progress_bar_config = kwargs
+
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        r"""
+        Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). When this
+        option is enabled, you should observe lower GPU memory usage and a potential speed up during inference. Speed
+        up during training is not guaranteed.
+
+        <Tip warning={true}>
+
+        ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes
+        precedent.
+
+        </Tip>
+
+        Parameters:
+            attention_op (`Callable`, *optional*):
+                Override the default `None` operator for use as `op` argument to the
+                [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention)
+                function of xFormers.
+
+        Examples:
+
+        ```py
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
+        >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
+
+        >>> pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+        >>> pipe.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
+        >>> # Workaround for not accepting attention shape using VAE for Flash Attention
+        >>> pipe.vae.enable_xformers_memory_efficient_attention(attention_op=None)
+        ```
+        """
+        self.set_use_memory_efficient_attention_xformers(True, attention_op)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
+        """
+        self.set_use_memory_efficient_attention_xformers(False)
+
+    def set_use_memory_efficient_attention_xformers(
+        self, valid: bool, attention_op: Optional[Callable] = None
+    ) -> None:
+        # Recursively walk through all the children.
+        # Any children which exposes the set_use_memory_efficient_attention_xformers method
+        # gets the message
+        def fn_recursive_set_mem_eff(module: torch.nn.Module):
+            if hasattr(module, "set_use_memory_efficient_attention_xformers"):
+                module.set_use_memory_efficient_attention_xformers(valid, attention_op)
+
+            for child in module.children():
+                fn_recursive_set_mem_eff(child)
+
+        module_names, _ = self._get_signature_keys(self)
+        modules = [getattr(self, n, None) for n in module_names]
+        modules = [m for m in modules if isinstance(m, torch.nn.Module)]
+
+        for module in modules:
+            fn_recursive_set_mem_eff(module)
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation. When this option is enabled, the attention module splits the input tensor
+        in slices to compute attention in several steps. For more than one attention head, the computation is performed
+        sequentially over each head. This is useful to save some memory in exchange for a small speed decrease.
+
+        <Tip warning={true}>
+
+        ⚠️ Don't enable attention slicing if you're already using `scaled_dot_product_attention` (SDPA) from PyTorch
+        2.0 or xFormers. These attention computations are already very memory efficient so you won't need to enable
+        this function. If you enable attention slicing with SDPA or xFormers, it can lead to serious slow downs!
+
+        </Tip>
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+
+        Examples:
+
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionPipeline
+
+        >>> pipe = StableDiffusionPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5",
+        ...     torch_dtype=torch.float16,
+        ...     use_safetensors=True,
+        ... )
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> pipe.enable_attention_slicing()
+        >>> image = pipe(prompt).images[0]
+        ```
+        """
+        self.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously called, attention is
+        computed in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def set_attention_slice(self, slice_size: Optional[int]):
+        module_names, _ = self._get_signature_keys(self)
+        modules = [getattr(self, n, None) for n in module_names]
+        modules = [m for m in modules if isinstance(m, torch.nn.Module) and hasattr(m, "set_attention_slice")]
+
+        for module in modules:
+            module.set_attention_slice(slice_size)
+
+    @classmethod
+    def from_pipe(cls, pipeline, **kwargs):
+        r"""
+        Create a new pipeline from a given pipeline. This method is useful to create a new pipeline from the existing
+        pipeline components without reallocating additional memory.
+
+        Arguments:
+            pipeline (`DiffusionPipeline`):
+                The pipeline from which to create a new pipeline.
+
+        Returns:
+            `DiffusionPipeline`:
+                A new pipeline with the same weights and configurations as `pipeline`.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import StableDiffusionPipeline, StableDiffusionSAGPipeline
+
+        >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> new_pipe = StableDiffusionSAGPipeline.from_pipe(pipe)
+        ```
+        """
+
+        original_config = dict(pipeline.config)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+
+        # derive the pipeline class to instantiate
+        custom_pipeline = kwargs.pop("custom_pipeline", None)
+        custom_revision = kwargs.pop("custom_revision", None)
+
+        if custom_pipeline is not None:
+            pipeline_class = _get_custom_pipeline_class(custom_pipeline, revision=custom_revision)
+        else:
+            pipeline_class = cls
+
+        expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
+        # true_optional_modules are optional components with default value in signature so it is ok not to pass them to `__init__`
+        # e.g. `image_encoder` for StableDiffusionPipeline
+        parameters = inspect.signature(cls.__init__).parameters
+        true_optional_modules = set(
+            {k for k, v in parameters.items() if v.default != inspect._empty and k in expected_modules}
+        )
+
+        # get the class of each component based on its type hint
+        # e.g. {"unet": UNet2DConditionModel, "text_encoder": CLIPTextMode}
+        component_types = pipeline_class._get_signature_types()
+
+        pretrained_model_name_or_path = original_config.pop("_name_or_path", None)
+        # allow users pass modules in `kwargs` to override the original pipeline's components
+        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+
+        original_class_obj = {}
+        for name, component in pipeline.components.items():
+            if name in expected_modules and name not in passed_class_obj:
+                # for model components, we will not switch over if the class does not matches the type hint in the new pipeline's signature
+                if (
+                    not isinstance(component, ModelMixin)
+                    or type(component) in component_types[name]
+                    or (component is None and name in cls._optional_components)
+                ):
+                    original_class_obj[name] = component
+                else:
+                    logger.warning(
+                        f"component {name} is not switched over to new pipeline because type does not match the expected."
+                        f" {name} is {type(component)} while the new pipeline expect {component_types[name]}."
+                        f" please pass the component of the correct type to the new pipeline. `from_pipe(..., {name}={name})`"
+                    )
+
+        # allow users pass optional kwargs to override the original pipelines config attribute
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
+        original_pipe_kwargs = {
+            k: original_config[k]
+            for k in original_config.keys()
+            if k in optional_kwargs and k not in passed_pipe_kwargs
+        }
+
+        # config attribute that were not expected by pipeline is stored as its private attribute
+        # (i.e. when the original pipeline was also instantiated with `from_pipe` from another pipeline that has this config)
+        # in this case, we will pass them as optional arguments if they can be accepted by the new pipeline
+        additional_pipe_kwargs = [
+            k[1:]
+            for k in original_config.keys()
+            if k.startswith("_") and k[1:] in optional_kwargs and k[1:] not in passed_pipe_kwargs
+        ]
+        for k in additional_pipe_kwargs:
+            original_pipe_kwargs[k] = original_config.pop(f"_{k}")
+
+        pipeline_kwargs = {
+            **passed_class_obj,
+            **original_class_obj,
+            **passed_pipe_kwargs,
+            **original_pipe_kwargs,
+            **kwargs,
+        }
+
+        # store unused config as private attribute in the new pipeline
+        unused_original_config = {
+            f"{'' if k.startswith('_') else '_'}{k}": v for k, v in original_config.items() if k not in pipeline_kwargs
+        }
+
+        missing_modules = (
+            set(expected_modules)
+            - set(pipeline._optional_components)
+            - set(pipeline_kwargs.keys())
+            - set(true_optional_modules)
+        )
+
+        if len(missing_modules) > 0:
+            raise ValueError(
+                f"Pipeline {pipeline_class} expected {expected_modules}, but only {set(list(passed_class_obj.keys()) + list(original_class_obj.keys()))} were passed"
+            )
+
+        new_pipeline = pipeline_class(**pipeline_kwargs)
+        if pretrained_model_name_or_path is not None:
+            new_pipeline.register_to_config(_name_or_path=pretrained_model_name_or_path)
+        new_pipeline.register_to_config(**unused_original_config)
+
+        if torch_dtype is not None:
+            new_pipeline.to(dtype=torch_dtype)
+
+        return new_pipeline
+
+
+class StableDiffusionMixin:
+    r"""
+    Helper for DiffusionPipeline with vae and unet.(mainly for LDM such as stable diffusion)
+    """
+
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stages where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
+        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        if not hasattr(self, "unet"):
+            raise ValueError("The pipeline must have `unet` for using FreeU.")
+        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
+
+    def disable_freeu(self):
+        """Disables the FreeU mechanism if enabled."""
+        self.unet.disable_freeu()
+
+    def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        Args:
+            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
+            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
+        """
+        self.fusing_unet = False
+        self.fusing_vae = False
+
+        if unet:
+            self.fusing_unet = True
+            self.unet.fuse_qkv_projections()
+            self.unet.set_attn_processor(FusedAttnProcessor2_0())
+
+        if vae:
+            if not isinstance(self.vae, AutoencoderKL):
+                raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
+
+            self.fusing_vae = True
+            self.vae.fuse_qkv_projections()
+            self.vae.set_attn_processor(FusedAttnProcessor2_0())
+
+    def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+        """Disable QKV projection fusion if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        Args:
+            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
+            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
+
+        """
+        if unet:
+            if not self.fusing_unet:
+                logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
+            else:
+                self.unet.unfuse_qkv_projections()
+                self.fusing_unet = False
+
+        if vae:
+            if not self.fusing_vae:
+                logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
+            else:
+                self.vae.unfuse_qkv_projections()
+                self.fusing_vae = False
diff --git a/diffusers/src/diffusers/pipelines/pixart_alpha/__init__.py b/diffusers/src/diffusers/pipelines/pixart_alpha/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cb870fc8589aeff366429c182736b2ddd6ce215
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pixart_alpha/__init__.py
@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_pixart_alpha"] = ["PixArtAlphaPipeline"]
+    _import_structure["pipeline_pixart_sigma"] = ["PixArtSigmaPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_pixart_alpha import (
+            ASPECT_RATIO_256_BIN,
+            ASPECT_RATIO_512_BIN,
+            ASPECT_RATIO_1024_BIN,
+            PixArtAlphaPipeline,
+        )
+        from .pipeline_pixart_sigma import ASPECT_RATIO_2048_BIN, PixArtSigmaPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/pixart_alpha/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/pipelines/pixart_alpha/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cde32cb165a4b5c8c80d165f7e80bc0ca6d7fb15
Binary files /dev/null and b/diffusers/src/diffusers/pipelines/pixart_alpha/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/pipelines/pixart_alpha/__pycache__/pipeline_pixart_alpha.cpython-311.pyc b/diffusers/src/diffusers/pipelines/pixart_alpha/__pycache__/pipeline_pixart_alpha.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b100e010f11eb01494c5e132e2aa24b97c0764b
Binary files /dev/null and b/diffusers/src/diffusers/pipelines/pixart_alpha/__pycache__/pipeline_pixart_alpha.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/diffusers/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
new file mode 100644
index 0000000000000000000000000000000000000000..dce25fb6bee47fe79c1891324293dfb9e1e5e21b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
@@ -0,0 +1,983 @@
+# Copyright 2024 PixArt-Alpha Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...image_processor import PixArtImageProcessor
+from ...models import AutoencoderKL, PixArtTransformer2DModel
+from ...schedulers import DPMSolverMultistepScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    deprecate,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import PixArtAlphaPipeline
+
+        >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too.
+        >>> pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
+        >>> # Enable memory optimizations.
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "A small cactus with a happy face in the Sahara desert."
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+ASPECT_RATIO_1024_BIN = {
+    "0.25": [512.0, 2048.0],
+    "0.28": [512.0, 1856.0],
+    "0.32": [576.0, 1792.0],
+    "0.33": [576.0, 1728.0],
+    "0.35": [576.0, 1664.0],
+    "0.4": [640.0, 1600.0],
+    "0.42": [640.0, 1536.0],
+    "0.48": [704.0, 1472.0],
+    "0.5": [704.0, 1408.0],
+    "0.52": [704.0, 1344.0],
+    "0.57": [768.0, 1344.0],
+    "0.6": [768.0, 1280.0],
+    "0.68": [832.0, 1216.0],
+    "0.72": [832.0, 1152.0],
+    "0.78": [896.0, 1152.0],
+    "0.82": [896.0, 1088.0],
+    "0.88": [960.0, 1088.0],
+    "0.94": [960.0, 1024.0],
+    "1.0": [1024.0, 1024.0],
+    "1.07": [1024.0, 960.0],
+    "1.13": [1088.0, 960.0],
+    "1.21": [1088.0, 896.0],
+    "1.29": [1152.0, 896.0],
+    "1.38": [1152.0, 832.0],
+    "1.46": [1216.0, 832.0],
+    "1.67": [1280.0, 768.0],
+    "1.75": [1344.0, 768.0],
+    "2.0": [1408.0, 704.0],
+    "2.09": [1472.0, 704.0],
+    "2.4": [1536.0, 640.0],
+    "2.5": [1600.0, 640.0],
+    "3.0": [1728.0, 576.0],
+    "4.0": [2048.0, 512.0],
+}
+
+ASPECT_RATIO_512_BIN = {
+    "0.25": [256.0, 1024.0],
+    "0.28": [256.0, 928.0],
+    "0.32": [288.0, 896.0],
+    "0.33": [288.0, 864.0],
+    "0.35": [288.0, 832.0],
+    "0.4": [320.0, 800.0],
+    "0.42": [320.0, 768.0],
+    "0.48": [352.0, 736.0],
+    "0.5": [352.0, 704.0],
+    "0.52": [352.0, 672.0],
+    "0.57": [384.0, 672.0],
+    "0.6": [384.0, 640.0],
+    "0.68": [416.0, 608.0],
+    "0.72": [416.0, 576.0],
+    "0.78": [448.0, 576.0],
+    "0.82": [448.0, 544.0],
+    "0.88": [480.0, 544.0],
+    "0.94": [480.0, 512.0],
+    "1.0": [512.0, 512.0],
+    "1.07": [512.0, 480.0],
+    "1.13": [544.0, 480.0],
+    "1.21": [544.0, 448.0],
+    "1.29": [576.0, 448.0],
+    "1.38": [576.0, 416.0],
+    "1.46": [608.0, 416.0],
+    "1.67": [640.0, 384.0],
+    "1.75": [672.0, 384.0],
+    "2.0": [704.0, 352.0],
+    "2.09": [736.0, 352.0],
+    "2.4": [768.0, 320.0],
+    "2.5": [800.0, 320.0],
+    "3.0": [864.0, 288.0],
+    "4.0": [1024.0, 256.0],
+}
+
+ASPECT_RATIO_256_BIN = {
+    "0.25": [128.0, 512.0],
+    "0.28": [128.0, 464.0],
+    "0.32": [144.0, 448.0],
+    "0.33": [144.0, 432.0],
+    "0.35": [144.0, 416.0],
+    "0.4": [160.0, 400.0],
+    "0.42": [160.0, 384.0],
+    "0.48": [176.0, 368.0],
+    "0.5": [176.0, 352.0],
+    "0.52": [176.0, 336.0],
+    "0.57": [192.0, 336.0],
+    "0.6": [192.0, 320.0],
+    "0.68": [208.0, 304.0],
+    "0.72": [208.0, 288.0],
+    "0.78": [224.0, 288.0],
+    "0.82": [224.0, 272.0],
+    "0.88": [240.0, 272.0],
+    "0.94": [240.0, 256.0],
+    "1.0": [256.0, 256.0],
+    "1.07": [256.0, 240.0],
+    "1.13": [272.0, 240.0],
+    "1.21": [272.0, 224.0],
+    "1.29": [288.0, 224.0],
+    "1.38": [288.0, 208.0],
+    "1.46": [304.0, 208.0],
+    "1.67": [320.0, 192.0],
+    "1.75": [336.0, 192.0],
+    "2.0": [352.0, 176.0],
+    "2.09": [368.0, 176.0],
+    "2.4": [384.0, 160.0],
+    "2.5": [400.0, 160.0],
+    "3.0": [432.0, 144.0],
+    "4.0": [512.0, 128.0],
+}
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class PixArtAlphaPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using PixArt-Alpha.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`T5EncoderModel`]):
+            Frozen text-encoder. PixArt-Alpha uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`T5Tokenizer`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        transformer ([`PixArtTransformer2DModel`]):
+            A text conditioned `PixArtTransformer2DModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    _optional_components = ["tokenizer", "text_encoder"]
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        vae: AutoencoderKL,
+        transformer: PixArtTransformer2DModel,
+        scheduler: DPMSolverMultistepScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str = "",
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+        max_sequence_length: int = 120,
+        **kwargs,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+                PixArt-Alpha, this should be "".
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
+                string.
+            clean_caption (`bool`, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+            max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt.
+        """
+
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # See Section 3.1. of the paper.
+        # print(self.text_encoder.dtype)
+        # print(self.text_encoder.shared.weight)
+        # print(self.text_encoder.shared.weight.dtype)
+        # print(type(self.text_encoder))
+        max_length = max_sequence_length
+
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because T5 can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            prompt_attention_mask = text_inputs.attention_mask
+            prompt_attention_mask = prompt_attention_mask.to(device)
+
+            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.transformer is not None:
+            dtype = self.transformer.dtype
+        else:
+            dtype = None
+        # print('before:', prompt_embeds.dtype, prompt_embeds.shape)
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        # print('after:', prompt_embeds.dtype)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
+        prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            negative_prompt_attention_mask = uncond_input.attention_mask
+            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+        else:
+            negative_prompt_embeds = None
+            negative_prompt_attention_mask = None
+
+        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        callback_steps,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: str = "",
+        num_inference_steps: int = 20,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 4.5,
+        num_images_per_prompt: Optional[int] = 1,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        clean_caption: bool = True,
+        use_resolution_binning: bool = True,
+        max_sequence_length: int = 120,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 4.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The width in pixels of the generated image.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not
+                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            use_resolution_binning (`bool` defaults to `True`):
+                If set to `True`, the requested height and width are first mapped to the closest resolutions using
+                `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
+                the requested resolution. Useful for generating non-square images.
+            max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+        # 1. Check inputs. Raise error if not correct
+        height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        width = width or self.transformer.config.sample_size * self.vae_scale_factor
+        if use_resolution_binning:
+            if self.transformer.config.sample_size == 128:
+                aspect_ratio_bin = ASPECT_RATIO_1024_BIN
+            elif self.transformer.config.sample_size == 64:
+                aspect_ratio_bin = ASPECT_RATIO_512_BIN
+            elif self.transformer.config.sample_size == 32:
+                aspect_ratio_bin = ASPECT_RATIO_256_BIN
+            else:
+                raise ValueError("Invalid sample size")
+            orig_height, orig_width = height, width
+            # height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
+            height,width = orig_height,orig_width
+            # height = int(height*(640/512))
+            # width = int(width*(640/512))
+            print(height, width)
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            callback_steps,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        )
+
+        # 2. Default height and width to transformer
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            clean_caption=clean_caption,
+            max_sequence_length=max_sequence_length,
+        )
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            latent_channels,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Prepare micro-conditions.
+        added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+        if self.transformer.config.sample_size == 128:
+            resolution = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1)
+            aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size * num_images_per_prompt, 1)
+            resolution = resolution.to(dtype=prompt_embeds.dtype, device=device)
+            aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device)
+
+            if do_classifier_free_guidance:
+                resolution = torch.cat([resolution, resolution], dim=0)
+                aspect_ratio = torch.cat([aspect_ratio, aspect_ratio], dim=0)
+
+            added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}
+
+        # 7. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                current_timestep = t
+                if not torch.is_tensor(current_timestep):
+                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                    # This would be a good case for the `match` statement (Python 3.10+)
+                    is_mps = latent_model_input.device.type == "mps"
+                    if isinstance(current_timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
+                elif len(current_timestep.shape) == 0:
+                    current_timestep = current_timestep[None].to(latent_model_input.device)
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+                # predict noise model_output
+                # added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+                # print(f"added_cond_kwargs: {added_cond_kwargs}")
+                # added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+                print(f"self.transformer.config.sample_size: {self.transformer.config.sample_size}")
+                noise_pred = self.transformer(
+                    latent_model_input,
+                    encoder_hidden_states=prompt_embeds,
+                    encoder_attention_mask=prompt_attention_mask,
+                    timestep=current_timestep,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # learned sigma
+                if self.transformer.config.out_channels // 2 == latent_channels:
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+                else:
+                    noise_pred = noise_pred
+
+                # compute previous image: x_t -> x_t-1
+                if num_inference_steps == 1:
+                    # For DMD one step sampling: https://arxiv.org/abs/2311.18828
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).pred_original_sample
+                else:
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            # print(image)
+            if use_resolution_binning:
+                image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py b/diffusers/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
new file mode 100644
index 0000000000000000000000000000000000000000..e77c7ff64aec5e016ad5ea4b2780bdbdf78e16fc
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
@@ -0,0 +1,882 @@
+# Copyright 2024 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from ...image_processor import PixArtImageProcessor
+from ...models import AutoencoderKL, PixArtTransformer2DModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    BACKENDS_MAPPING,
+    deprecate,
+    is_bs4_available,
+    is_ftfy_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from .pipeline_pixart_alpha import (
+    ASPECT_RATIO_256_BIN,
+    ASPECT_RATIO_512_BIN,
+    ASPECT_RATIO_1024_BIN,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+ASPECT_RATIO_2048_BIN = {
+    "0.25": [1024.0, 4096.0],
+    "0.26": [1024.0, 3968.0],
+    "0.27": [1024.0, 3840.0],
+    "0.28": [1024.0, 3712.0],
+    "0.32": [1152.0, 3584.0],
+    "0.33": [1152.0, 3456.0],
+    "0.35": [1152.0, 3328.0],
+    "0.4": [1280.0, 3200.0],
+    "0.42": [1280.0, 3072.0],
+    "0.48": [1408.0, 2944.0],
+    "0.5": [1408.0, 2816.0],
+    "0.52": [1408.0, 2688.0],
+    "0.57": [1536.0, 2688.0],
+    "0.6": [1536.0, 2560.0],
+    "0.68": [1664.0, 2432.0],
+    "0.72": [1664.0, 2304.0],
+    "0.78": [1792.0, 2304.0],
+    "0.82": [1792.0, 2176.0],
+    "0.88": [1920.0, 2176.0],
+    "0.94": [1920.0, 2048.0],
+    "1.0": [2048.0, 2048.0],
+    "1.07": [2048.0, 1920.0],
+    "1.13": [2176.0, 1920.0],
+    "1.21": [2176.0, 1792.0],
+    "1.29": [2304.0, 1792.0],
+    "1.38": [2304.0, 1664.0],
+    "1.46": [2432.0, 1664.0],
+    "1.67": [2560.0, 1536.0],
+    "1.75": [2688.0, 1536.0],
+    "2.0": [2816.0, 1408.0],
+    "2.09": [2944.0, 1408.0],
+    "2.4": [3072.0, 1280.0],
+    "2.5": [3200.0, 1280.0],
+    "2.89": [3328.0, 1152.0],
+    "3.0": [3456.0, 1152.0],
+    "3.11": [3584.0, 1152.0],
+    "3.62": [3712.0, 1024.0],
+    "3.75": [3840.0, 1024.0],
+    "3.88": [3968.0, 1024.0],
+    "4.0": [4096.0, 1024.0],
+}
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import PixArtSigmaPipeline
+
+        >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-Sigma-XL-2-512-MS" too.
+        >>> pipe = PixArtSigmaPipeline.from_pretrained(
+        ...     "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
+        ... )
+        >>> # Enable memory optimizations.
+        >>> # pipe.enable_model_cpu_offload()
+
+        >>> prompt = "A small cactus with a happy face in the Sahara desert."
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class PixArtSigmaPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using PixArt-Sigma.
+    """
+
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+
+    _optional_components = ["tokenizer", "text_encoder"]
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        vae: AutoencoderKL,
+        transformer: PixArtTransformer2DModel,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt with 120->300
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str = "",
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+        max_sequence_length: int = 300,
+        **kwargs,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+                PixArt-Alpha, this should be "".
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
+                string.
+            clean_caption (`bool`, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+            max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
+        """
+
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # See Section 3.1. of the paper.
+        max_length = max_sequence_length
+
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+                logger.warning(
+                    "The following part of your input was truncated because T5 can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            prompt_attention_mask = text_inputs.attention_mask
+            prompt_attention_mask = prompt_attention_mask.to(device)
+
+            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.transformer is not None:
+            dtype = self.transformer.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
+        prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
+            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            negative_prompt_attention_mask = uncond_input.attention_mask
+            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+        else:
+            negative_prompt_embeds = None
+            negative_prompt_attention_mask = None
+
+        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        callback_steps,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: str = "",
+        num_inference_steps: int = 20,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 4.5,
+        num_images_per_prompt: Optional[int] = 1,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        clean_caption: bool = True,
+        use_resolution_binning: bool = True,
+        max_sequence_length: int = 300,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 4.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The width in pixels of the generated image.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
+                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            use_resolution_binning (`bool` defaults to `True`):
+                If set to `True`, the requested height and width are first mapped to the closest resolutions using
+                `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
+                the requested resolution. Useful for generating non-square images.
+            max_sequence_length (`int` defaults to 300): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        # 1. Check inputs. Raise error if not correct
+        height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        width = width or self.transformer.config.sample_size * self.vae_scale_factor
+        if use_resolution_binning:
+            if self.transformer.config.sample_size == 256:
+                aspect_ratio_bin = ASPECT_RATIO_2048_BIN
+            elif self.transformer.config.sample_size == 128:
+                aspect_ratio_bin = ASPECT_RATIO_1024_BIN
+            elif self.transformer.config.sample_size == 64:
+                aspect_ratio_bin = ASPECT_RATIO_512_BIN
+            elif self.transformer.config.sample_size == 32:
+                aspect_ratio_bin = ASPECT_RATIO_256_BIN
+            else:
+                raise ValueError("Invalid sample size")
+            orig_height, orig_width = height, width
+            height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
+
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            callback_steps,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        )
+
+        # 2. Default height and width to transformer
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt,
+            do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            clean_caption=clean_caption,
+            max_sequence_length=max_sequence_length,
+        )
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            latent_channels,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Prepare micro-conditions.
+        added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+
+        # 7. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                current_timestep = t
+                if not torch.is_tensor(current_timestep):
+                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                    # This would be a good case for the `match` statement (Python 3.10+)
+                    is_mps = latent_model_input.device.type == "mps"
+                    if isinstance(current_timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
+                elif len(current_timestep.shape) == 0:
+                    current_timestep = current_timestep[None].to(latent_model_input.device)
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+                # predict noise model_output
+                # print(f"added_cond_kwargs: {added_cond_kwargs}")
+                # print(f"self.transformer.config.sample_size: {self.transformer.config.sample_size}")
+                noise_pred = self.transformer(
+                    latent_model_input,
+                    encoder_hidden_states=prompt_embeds,
+                    encoder_attention_mask=prompt_attention_mask,
+                    timestep=current_timestep,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # learned sigma
+                if self.transformer.config.out_channels // 2 == latent_channels:
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+                else:
+                    noise_pred = noise_pred
+
+                # compute previous image: x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            if use_resolution_binning:
+                image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..70f5b1a547c4b90e28109843ae3be2fca2e98c88
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py
@@ -0,0 +1,49 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_output"] = ["SemanticStableDiffusionPipelineOutput"]
+    _import_structure["pipeline_semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_semantic_stable_diffusion import SemanticStableDiffusionPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py b/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..34991299398115f439537b77e1f1fc8a83e0d431
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py
@@ -0,0 +1,25 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class SemanticStableDiffusionPipelineOutput(BaseOutput):
+    """
+    Output class for Stable Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+        nsfw_content_detected (`List[bool]`)
+            List indicating whether the corresponding generated image contains “not-safe-for-work” (nsfw) content or
+            `None` if safety checking could not be performed.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
diff --git a/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f83071f3e85edb7c1e1eaf61d46086b239e6651
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -0,0 +1,722 @@
+import inspect
+from itertools import repeat
+from typing import Callable, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import deprecate, logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import SemanticStableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion with latent editing.
+
+    This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionPipeline`]. Check the superclass
+    documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular
+    device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`Q16SafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: int = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        editing_prompt: Optional[Union[str, List[str]]] = None,
+        editing_prompt_embeddings: Optional[torch.Tensor] = None,
+        reverse_editing_direction: Optional[Union[bool, List[bool]]] = False,
+        edit_guidance_scale: Optional[Union[float, List[float]]] = 5,
+        edit_warmup_steps: Optional[Union[int, List[int]]] = 10,
+        edit_cooldown_steps: Optional[Union[int, List[int]]] = None,
+        edit_threshold: Optional[Union[float, List[float]]] = 0.9,
+        edit_momentum_scale: Optional[float] = 0.1,
+        edit_mom_beta: Optional[float] = 0.4,
+        edit_weights: Optional[List[float]] = None,
+        sem_guidance: Optional[List[torch.Tensor]] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            editing_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to use for semantic guidance. Semantic guidance is disabled by setting
+                `editing_prompt = None`. Guidance direction of prompt should be specified via
+                `reverse_editing_direction`.
+            editing_prompt_embeddings (`torch.Tensor`, *optional*):
+                Pre-computed embeddings to use for semantic guidance. Guidance direction of embedding should be
+                specified via `reverse_editing_direction`.
+            reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
+                Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
+            edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
+                Guidance scale for semantic guidance. If provided as a list, values should correspond to
+                `editing_prompt`.
+            edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
+                Number of diffusion steps (for each prompt) for which semantic guidance is not applied. Momentum is
+                calculated for those steps and applied once all warmup periods are over.
+            edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
+                Number of diffusion steps (for each prompt) after which semantic guidance is longer applied.
+            edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
+                Threshold of semantic guidance.
+            edit_momentum_scale (`float`, *optional*, defaults to 0.1):
+                Scale of the momentum to be added to the semantic guidance at each diffusion step. If set to 0.0,
+                momentum is disabled. Momentum is already built up during warmup (for diffusion steps smaller than
+                `sld_warmup_steps`). Momentum is only added to latent guidance once all warmup periods are finished.
+            edit_mom_beta (`float`, *optional*, defaults to 0.4):
+                Defines how semantic guidance momentum builds up. `edit_mom_beta` indicates how much of the previous
+                momentum is kept. Momentum is already built up during warmup (for diffusion steps smaller than
+                `edit_warmup_steps`).
+            edit_weights (`List[float]`, *optional*, defaults to `None`):
+                Indicates how much each individual concept should influence the overall guidance. If no weights are
+                provided all concepts are applied equally.
+            sem_guidance (`List[torch.Tensor]`, *optional*):
+                List of pre-generated guidance vectors to be applied at generation. Length of the list has to
+                correspond to `num_inference_steps`.
+
+        Examples:
+
+        ```py
+        >>> import torch
+        >>> from diffusers import SemanticStableDiffusionPipeline
+
+        >>> pipe = SemanticStableDiffusionPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> out = pipe(
+        ...     prompt="a photo of the face of a woman",
+        ...     num_images_per_prompt=1,
+        ...     guidance_scale=7,
+        ...     editing_prompt=[
+        ...         "smiling, smile",  # Concepts to apply
+        ...         "glasses, wearing glasses",
+        ...         "curls, wavy hair, curly hair",
+        ...         "beard, full beard, mustache",
+        ...     ],
+        ...     reverse_editing_direction=[
+        ...         False,
+        ...         False,
+        ...         False,
+        ...         False,
+        ...     ],  # Direction of guidance i.e. increase all concepts
+        ...     edit_warmup_steps=[10, 10, 10, 10],  # Warmup period for each concept
+        ...     edit_guidance_scale=[4, 5, 5, 5.4],  # Guidance scale for each concept
+        ...     edit_threshold=[
+        ...         0.99,
+        ...         0.975,
+        ...         0.925,
+        ...         0.96,
+        ...     ],  # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
+        ...     edit_momentum_scale=0.3,  # Momentum scale that will be added to the latent guidance
+        ...     edit_mom_beta=0.6,  # Momentum beta
+        ...     edit_weights=[1, 1, 1, 1, 1],  # Weights of the individual concepts against each other
+        ... )
+        >>> image = out.images[0]
+        ```
+
+        Returns:
+            [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`,
+                [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] is returned, otherwise a
+                `tuple` is returned where the first element is a list with the generated images and the second element
+                is a list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work"
+                (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+
+        if editing_prompt:
+            enable_edit_guidance = True
+            if isinstance(editing_prompt, str):
+                editing_prompt = [editing_prompt]
+            enabled_editing_prompts = len(editing_prompt)
+        elif editing_prompt_embeddings is not None:
+            enable_edit_guidance = True
+            enabled_editing_prompts = editing_prompt_embeddings.shape[0]
+        else:
+            enabled_editing_prompts = 0
+            enable_edit_guidance = False
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if enable_edit_guidance:
+            # get safety text embeddings
+            if editing_prompt_embeddings is None:
+                edit_concepts_input = self.tokenizer(
+                    [x for item in editing_prompt for x in repeat(item, batch_size)],
+                    padding="max_length",
+                    max_length=self.tokenizer.model_max_length,
+                    return_tensors="pt",
+                )
+
+                edit_concepts_input_ids = edit_concepts_input.input_ids
+
+                if edit_concepts_input_ids.shape[-1] > self.tokenizer.model_max_length:
+                    removed_text = self.tokenizer.batch_decode(
+                        edit_concepts_input_ids[:, self.tokenizer.model_max_length :]
+                    )
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+                    edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length]
+                edit_concepts = self.text_encoder(edit_concepts_input_ids.to(device))[0]
+            else:
+                edit_concepts = editing_prompt_embeddings.to(device).repeat(batch_size, 1, 1)
+
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            bs_embed_edit, seq_len_edit, _ = edit_concepts.shape
+            edit_concepts = edit_concepts.repeat(1, num_images_per_prompt, 1)
+            edit_concepts = edit_concepts.view(bs_embed_edit * num_images_per_prompt, seq_len_edit, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            if enable_edit_guidance:
+                text_embeddings = torch.cat([uncond_embeddings, text_embeddings, edit_concepts])
+            else:
+                text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+        # get the initial random noise unless the user supplied it
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # Initialize edit_momentum to None
+        edit_momentum = None
+
+        self.uncond_estimates = None
+        self.text_estimates = None
+        self.edit_estimates = None
+        self.sem_guidance = None
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = (
+                torch.cat([latents] * (2 + enabled_editing_prompts)) if do_classifier_free_guidance else latents
+            )
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_out = noise_pred.chunk(2 + enabled_editing_prompts)  # [b,4, 64, 64]
+                noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1]
+                noise_pred_edit_concepts = noise_pred_out[2:]
+
+                # default text guidance
+                noise_guidance = guidance_scale * (noise_pred_text - noise_pred_uncond)
+                # noise_guidance = (noise_pred_text - noise_pred_edit_concepts[0])
+
+                if self.uncond_estimates is None:
+                    self.uncond_estimates = torch.zeros((num_inference_steps + 1, *noise_pred_uncond.shape))
+                self.uncond_estimates[i] = noise_pred_uncond.detach().cpu()
+
+                if self.text_estimates is None:
+                    self.text_estimates = torch.zeros((num_inference_steps + 1, *noise_pred_text.shape))
+                self.text_estimates[i] = noise_pred_text.detach().cpu()
+
+                if self.edit_estimates is None and enable_edit_guidance:
+                    self.edit_estimates = torch.zeros(
+                        (num_inference_steps + 1, len(noise_pred_edit_concepts), *noise_pred_edit_concepts[0].shape)
+                    )
+
+                if self.sem_guidance is None:
+                    self.sem_guidance = torch.zeros((num_inference_steps + 1, *noise_pred_text.shape))
+
+                if edit_momentum is None:
+                    edit_momentum = torch.zeros_like(noise_guidance)
+
+                if enable_edit_guidance:
+                    concept_weights = torch.zeros(
+                        (len(noise_pred_edit_concepts), noise_guidance.shape[0]),
+                        device=device,
+                        dtype=noise_guidance.dtype,
+                    )
+                    noise_guidance_edit = torch.zeros(
+                        (len(noise_pred_edit_concepts), *noise_guidance.shape),
+                        device=device,
+                        dtype=noise_guidance.dtype,
+                    )
+                    # noise_guidance_edit = torch.zeros_like(noise_guidance)
+                    warmup_inds = []
+                    for c, noise_pred_edit_concept in enumerate(noise_pred_edit_concepts):
+                        self.edit_estimates[i, c] = noise_pred_edit_concept
+                        if isinstance(edit_guidance_scale, list):
+                            edit_guidance_scale_c = edit_guidance_scale[c]
+                        else:
+                            edit_guidance_scale_c = edit_guidance_scale
+
+                        if isinstance(edit_threshold, list):
+                            edit_threshold_c = edit_threshold[c]
+                        else:
+                            edit_threshold_c = edit_threshold
+                        if isinstance(reverse_editing_direction, list):
+                            reverse_editing_direction_c = reverse_editing_direction[c]
+                        else:
+                            reverse_editing_direction_c = reverse_editing_direction
+                        if edit_weights:
+                            edit_weight_c = edit_weights[c]
+                        else:
+                            edit_weight_c = 1.0
+                        if isinstance(edit_warmup_steps, list):
+                            edit_warmup_steps_c = edit_warmup_steps[c]
+                        else:
+                            edit_warmup_steps_c = edit_warmup_steps
+
+                        if isinstance(edit_cooldown_steps, list):
+                            edit_cooldown_steps_c = edit_cooldown_steps[c]
+                        elif edit_cooldown_steps is None:
+                            edit_cooldown_steps_c = i + 1
+                        else:
+                            edit_cooldown_steps_c = edit_cooldown_steps
+                        if i >= edit_warmup_steps_c:
+                            warmup_inds.append(c)
+                        if i >= edit_cooldown_steps_c:
+                            noise_guidance_edit[c, :, :, :, :] = torch.zeros_like(noise_pred_edit_concept)
+                            continue
+
+                        noise_guidance_edit_tmp = noise_pred_edit_concept - noise_pred_uncond
+                        # tmp_weights = (noise_pred_text - noise_pred_edit_concept).sum(dim=(1, 2, 3))
+                        tmp_weights = (noise_guidance - noise_pred_edit_concept).sum(dim=(1, 2, 3))
+
+                        tmp_weights = torch.full_like(tmp_weights, edit_weight_c)  # * (1 / enabled_editing_prompts)
+                        if reverse_editing_direction_c:
+                            noise_guidance_edit_tmp = noise_guidance_edit_tmp * -1
+                        concept_weights[c, :] = tmp_weights
+
+                        noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c
+
+                        # torch.quantile function expects float32
+                        if noise_guidance_edit_tmp.dtype == torch.float32:
+                            tmp = torch.quantile(
+                                torch.abs(noise_guidance_edit_tmp).flatten(start_dim=2),
+                                edit_threshold_c,
+                                dim=2,
+                                keepdim=False,
+                            )
+                        else:
+                            tmp = torch.quantile(
+                                torch.abs(noise_guidance_edit_tmp).flatten(start_dim=2).to(torch.float32),
+                                edit_threshold_c,
+                                dim=2,
+                                keepdim=False,
+                            ).to(noise_guidance_edit_tmp.dtype)
+
+                        noise_guidance_edit_tmp = torch.where(
+                            torch.abs(noise_guidance_edit_tmp) >= tmp[:, :, None, None],
+                            noise_guidance_edit_tmp,
+                            torch.zeros_like(noise_guidance_edit_tmp),
+                        )
+                        noise_guidance_edit[c, :, :, :, :] = noise_guidance_edit_tmp
+
+                        # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp
+
+                    warmup_inds = torch.tensor(warmup_inds).to(device)
+                    if len(noise_pred_edit_concepts) > warmup_inds.shape[0] > 0:
+                        concept_weights = concept_weights.to("cpu")  # Offload to cpu
+                        noise_guidance_edit = noise_guidance_edit.to("cpu")
+
+                        concept_weights_tmp = torch.index_select(concept_weights.to(device), 0, warmup_inds)
+                        concept_weights_tmp = torch.where(
+                            concept_weights_tmp < 0, torch.zeros_like(concept_weights_tmp), concept_weights_tmp
+                        )
+                        concept_weights_tmp = concept_weights_tmp / concept_weights_tmp.sum(dim=0)
+                        # concept_weights_tmp = torch.nan_to_num(concept_weights_tmp)
+
+                        noise_guidance_edit_tmp = torch.index_select(noise_guidance_edit.to(device), 0, warmup_inds)
+                        noise_guidance_edit_tmp = torch.einsum(
+                            "cb,cbijk->bijk", concept_weights_tmp, noise_guidance_edit_tmp
+                        )
+                        noise_guidance = noise_guidance + noise_guidance_edit_tmp
+
+                        self.sem_guidance[i] = noise_guidance_edit_tmp.detach().cpu()
+
+                        del noise_guidance_edit_tmp
+                        del concept_weights_tmp
+                        concept_weights = concept_weights.to(device)
+                        noise_guidance_edit = noise_guidance_edit.to(device)
+
+                    concept_weights = torch.where(
+                        concept_weights < 0, torch.zeros_like(concept_weights), concept_weights
+                    )
+
+                    concept_weights = torch.nan_to_num(concept_weights)
+
+                    noise_guidance_edit = torch.einsum("cb,cbijk->bijk", concept_weights, noise_guidance_edit)
+                    noise_guidance_edit = noise_guidance_edit.to(edit_momentum.device)
+
+                    noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum
+
+                    edit_momentum = edit_mom_beta * edit_momentum + (1 - edit_mom_beta) * noise_guidance_edit
+
+                    if warmup_inds.shape[0] == len(noise_pred_edit_concepts):
+                        noise_guidance = noise_guidance + noise_guidance_edit
+                        self.sem_guidance[i] = noise_guidance_edit.detach().cpu()
+
+                if sem_guidance is not None:
+                    edit_guidance = sem_guidance[i].to(device)
+                    noise_guidance = noise_guidance + edit_guidance
+
+                noise_pred = noise_pred_uncond + noise_guidance
+
+                # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        # 8. Post-processing
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return SemanticStableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/shap_e/__init__.py b/diffusers/src/diffusers/pipelines/shap_e/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ed563c4a51f6e627c06711b60fe3a0709ff22f7
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/shap_e/__init__.py
@@ -0,0 +1,71 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["camera"] = ["create_pan_cameras"]
+    _import_structure["pipeline_shap_e"] = ["ShapEPipeline"]
+    _import_structure["pipeline_shap_e_img2img"] = ["ShapEImg2ImgPipeline"]
+    _import_structure["renderer"] = [
+        "BoundingBoxVolume",
+        "ImportanceRaySampler",
+        "MLPNeRFModelOutput",
+        "MLPNeRSTFModel",
+        "ShapEParamsProjModel",
+        "ShapERenderer",
+        "StratifiedRaySampler",
+        "VoidNeRFModel",
+    ]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .camera import create_pan_cameras
+        from .pipeline_shap_e import ShapEPipeline
+        from .pipeline_shap_e_img2img import ShapEImg2ImgPipeline
+        from .renderer import (
+            BoundingBoxVolume,
+            ImportanceRaySampler,
+            MLPNeRFModelOutput,
+            MLPNeRSTFModel,
+            ShapEParamsProjModel,
+            ShapERenderer,
+            StratifiedRaySampler,
+            VoidNeRFModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/shap_e/camera.py b/diffusers/src/diffusers/pipelines/shap_e/camera.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4b94c3000d845dedf1dc3e45fa217b9071c5f38
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/shap_e/camera.py
@@ -0,0 +1,147 @@
+# Copyright 2024 Open AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Tuple
+
+import numpy as np
+import torch
+
+
+@dataclass
+class DifferentiableProjectiveCamera:
+    """
+    Implements a batch, differentiable, standard pinhole camera
+    """
+
+    origin: torch.Tensor  # [batch_size x 3]
+    x: torch.Tensor  # [batch_size x 3]
+    y: torch.Tensor  # [batch_size x 3]
+    z: torch.Tensor  # [batch_size x 3]
+    width: int
+    height: int
+    x_fov: float
+    y_fov: float
+    shape: Tuple[int]
+
+    def __post_init__(self):
+        assert self.x.shape[0] == self.y.shape[0] == self.z.shape[0] == self.origin.shape[0]
+        assert self.x.shape[1] == self.y.shape[1] == self.z.shape[1] == self.origin.shape[1] == 3
+        assert len(self.x.shape) == len(self.y.shape) == len(self.z.shape) == len(self.origin.shape) == 2
+
+    def resolution(self):
+        return torch.from_numpy(np.array([self.width, self.height], dtype=np.float32))
+
+    def fov(self):
+        return torch.from_numpy(np.array([self.x_fov, self.y_fov], dtype=np.float32))
+
+    def get_image_coords(self) -> torch.Tensor:
+        """
+        :return: coords of shape (width * height, 2)
+        """
+        pixel_indices = torch.arange(self.height * self.width)
+        coords = torch.stack(
+            [
+                pixel_indices % self.width,
+                torch.div(pixel_indices, self.width, rounding_mode="trunc"),
+            ],
+            axis=1,
+        )
+        return coords
+
+    @property
+    def camera_rays(self):
+        batch_size, *inner_shape = self.shape
+        inner_batch_size = int(np.prod(inner_shape))
+
+        coords = self.get_image_coords()
+        coords = torch.broadcast_to(coords.unsqueeze(0), [batch_size * inner_batch_size, *coords.shape])
+        rays = self.get_camera_rays(coords)
+
+        rays = rays.view(batch_size, inner_batch_size * self.height * self.width, 2, 3)
+
+        return rays
+
+    def get_camera_rays(self, coords: torch.Tensor) -> torch.Tensor:
+        batch_size, *shape, n_coords = coords.shape
+        assert n_coords == 2
+        assert batch_size == self.origin.shape[0]
+
+        flat = coords.view(batch_size, -1, 2)
+
+        res = self.resolution()
+        fov = self.fov()
+
+        fracs = (flat.float() / (res - 1)) * 2 - 1
+        fracs = fracs * torch.tan(fov / 2)
+
+        fracs = fracs.view(batch_size, -1, 2)
+        directions = (
+            self.z.view(batch_size, 1, 3)
+            + self.x.view(batch_size, 1, 3) * fracs[:, :, :1]
+            + self.y.view(batch_size, 1, 3) * fracs[:, :, 1:]
+        )
+        directions = directions / directions.norm(dim=-1, keepdim=True)
+        rays = torch.stack(
+            [
+                torch.broadcast_to(self.origin.view(batch_size, 1, 3), [batch_size, directions.shape[1], 3]),
+                directions,
+            ],
+            dim=2,
+        )
+        return rays.view(batch_size, *shape, 2, 3)
+
+    def resize_image(self, width: int, height: int) -> "DifferentiableProjectiveCamera":
+        """
+        Creates a new camera for the resized view assuming the aspect ratio does not change.
+        """
+        assert width * self.height == height * self.width, "The aspect ratio should not change."
+        return DifferentiableProjectiveCamera(
+            origin=self.origin,
+            x=self.x,
+            y=self.y,
+            z=self.z,
+            width=width,
+            height=height,
+            x_fov=self.x_fov,
+            y_fov=self.y_fov,
+        )
+
+
+def create_pan_cameras(size: int) -> DifferentiableProjectiveCamera:
+    origins = []
+    xs = []
+    ys = []
+    zs = []
+    for theta in np.linspace(0, 2 * np.pi, num=20):
+        z = np.array([np.sin(theta), np.cos(theta), -0.5])
+        z /= np.sqrt(np.sum(z**2))
+        origin = -z * 4
+        x = np.array([np.cos(theta), -np.sin(theta), 0.0])
+        y = np.cross(z, x)
+        origins.append(origin)
+        xs.append(x)
+        ys.append(y)
+        zs.append(z)
+    return DifferentiableProjectiveCamera(
+        origin=torch.from_numpy(np.stack(origins, axis=0)).float(),
+        x=torch.from_numpy(np.stack(xs, axis=0)).float(),
+        y=torch.from_numpy(np.stack(ys, axis=0)).float(),
+        z=torch.from_numpy(np.stack(zs, axis=0)).float(),
+        width=size,
+        height=size,
+        x_fov=0.7,
+        y_fov=0.7,
+        shape=(1, len(xs)),
+    )
diff --git a/diffusers/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/diffusers/src/diffusers/pipelines/shap_e/pipeline_shap_e.py
new file mode 100644
index 0000000000000000000000000000000000000000..f87f28e06c4ab9c490f42207b9441398857d3614
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/shap_e/pipeline_shap_e.py
@@ -0,0 +1,334 @@
+# Copyright 2024 Open AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+
+from ...models import PriorTransformer
+from ...schedulers import HeunDiscreteScheduler
+from ...utils import (
+    BaseOutput,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .renderer import ShapERenderer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
+        >>> from diffusers.utils import export_to_gif
+
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        >>> repo = "openai/shap-e"
+        >>> pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16)
+        >>> pipe = pipe.to(device)
+
+        >>> guidance_scale = 15.0
+        >>> prompt = "a shark"
+
+        >>> images = pipe(
+        ...     prompt,
+        ...     guidance_scale=guidance_scale,
+        ...     num_inference_steps=64,
+        ...     frame_size=256,
+        ... ).images
+
+        >>> gif_path = export_to_gif(images[0], "shark_3d.gif")
+        ```
+"""
+
+
+@dataclass
+class ShapEPipelineOutput(BaseOutput):
+    """
+    Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].
+
+    Args:
+        images (`torch.Tensor`)
+            A list of images for 3D rendering.
+    """
+
+    images: Union[List[List[PIL.Image.Image]], List[List[np.ndarray]]]
+
+
+class ShapEPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating latent representation of a 3D asset and rendering with the NeRF method.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+             A `CLIPTokenizer` to tokenize text.
+        scheduler ([`HeunDiscreteScheduler`]):
+            A scheduler to be used in combination with the `prior` model to generate image embedding.
+        shap_e_renderer ([`ShapERenderer`]):
+            Shap-E renderer projects the generated latents into parameters of a MLP to create 3D objects with the NeRF
+            rendering method.
+    """
+
+    model_cpu_offload_seq = "text_encoder->prior"
+    _exclude_from_cpu_offload = ["shap_e_renderer"]
+
+    def __init__(
+        self,
+        prior: PriorTransformer,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        scheduler: HeunDiscreteScheduler,
+        shap_e_renderer: ShapERenderer,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            prior=prior,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            shap_e_renderer=shap_e_renderer,
+        )
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+    ):
+        len(prompt) if isinstance(prompt, list) else 1
+
+        # YiYi Notes: set pad_token_id to be 0, not sure why I can't set in the config file
+        self.tokenizer.pad_token_id = 0
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        text_encoder_output = self.text_encoder(text_input_ids.to(device))
+        prompt_embeds = text_encoder_output.text_embeds
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        # in Shap-E it normalize the prompt_embeds and then later rescale it
+        prompt_embeds = prompt_embeds / torch.linalg.norm(prompt_embeds, dim=-1, keepdim=True)
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # Rescale the features to have unit variance
+        prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds
+
+        return prompt_embeds
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: str,
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        guidance_scale: float = 4.0,
+        frame_size: int = 64,
+        output_type: Optional[str] = "pil",  # pil, np, latent, mesh
+        return_dict: bool = True,
+    ):
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            frame_size (`int`, *optional*, default to 64):
+                The width and height of each image frame of the generated 3D output.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`), `"latent"` (`torch.Tensor`), or mesh ([`MeshDecoderOutput`]).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain
+                tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images.
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        device = self._execution_device
+
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+        prompt_embeds = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance)
+
+        # prior
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        num_embeddings = self.prior.config.num_embeddings
+        embedding_dim = self.prior.config.embedding_dim
+
+        latents = self.prepare_latents(
+            (batch_size, num_embeddings * embedding_dim),
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+
+        # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim
+        latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim)
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            noise_pred = self.prior(
+                scaled_model_input,
+                timestep=t,
+                proj_embedding=prompt_embeds,
+            ).predicted_image_embedding
+
+            # remove the variance
+            noise_pred, _ = noise_pred.split(
+                scaled_model_input.shape[2], dim=2
+            )  # batch_size, num_embeddings, embedding_dim
+
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+
+            latents = self.scheduler.step(
+                noise_pred,
+                timestep=t,
+                sample=latents,
+            ).prev_sample
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if output_type not in ["np", "pil", "latent", "mesh"]:
+            raise ValueError(
+                f"Only the output types `pil`, `np`, `latent` and `mesh` are supported not output_type={output_type}"
+            )
+
+        if output_type == "latent":
+            return ShapEPipelineOutput(images=latents)
+
+        images = []
+        if output_type == "mesh":
+            for i, latent in enumerate(latents):
+                mesh = self.shap_e_renderer.decode_to_mesh(
+                    latent[None, :],
+                    device,
+                )
+                images.append(mesh)
+
+        else:
+            # np, pil
+            for i, latent in enumerate(latents):
+                image = self.shap_e_renderer.decode_to_image(
+                    latent[None, :],
+                    device,
+                    size=frame_size,
+                )
+                images.append(image)
+
+            images = torch.stack(images)
+
+            images = images.cpu().numpy()
+
+            if output_type == "pil":
+                images = [self.numpy_to_pil(image) for image in images]
+
+        if not return_dict:
+            return (images,)
+
+        return ShapEPipelineOutput(images=images)
diff --git a/diffusers/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/diffusers/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cc145e4c3e294c785c1787dca3811c549f0fedd
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
@@ -0,0 +1,321 @@
+# Copyright 2024 Open AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPVisionModel
+
+from ...models import PriorTransformer
+from ...schedulers import HeunDiscreteScheduler
+from ...utils import (
+    BaseOutput,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .renderer import ShapERenderer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from PIL import Image
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
+        >>> from diffusers.utils import export_to_gif, load_image
+
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        >>> repo = "openai/shap-e-img2img"
+        >>> pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16)
+        >>> pipe = pipe.to(device)
+
+        >>> guidance_scale = 3.0
+        >>> image_url = "https://hf.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi.png"
+        >>> image = load_image(image_url).convert("RGB")
+
+        >>> images = pipe(
+        ...     image,
+        ...     guidance_scale=guidance_scale,
+        ...     num_inference_steps=64,
+        ...     frame_size=256,
+        ... ).images
+
+        >>> gif_path = export_to_gif(images[0], "corgi_3d.gif")
+        ```
+"""
+
+
+@dataclass
+class ShapEPipelineOutput(BaseOutput):
+    """
+    Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].
+
+    Args:
+        images (`torch.Tensor`)
+            A list of images for 3D rendering.
+    """
+
+    images: Union[PIL.Image.Image, np.ndarray]
+
+
+class ShapEImg2ImgPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating latent representation of a 3D asset and rendering with the NeRF method from an image.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        image_encoder ([`~transformers.CLIPVisionModel`]):
+            Frozen image-encoder.
+        image_processor ([`~transformers.CLIPImageProcessor`]):
+             A `CLIPImageProcessor` to process images.
+        scheduler ([`HeunDiscreteScheduler`]):
+            A scheduler to be used in combination with the `prior` model to generate image embedding.
+        shap_e_renderer ([`ShapERenderer`]):
+            Shap-E renderer projects the generated latents into parameters of a MLP to create 3D objects with the NeRF
+            rendering method.
+    """
+
+    model_cpu_offload_seq = "image_encoder->prior"
+    _exclude_from_cpu_offload = ["shap_e_renderer"]
+
+    def __init__(
+        self,
+        prior: PriorTransformer,
+        image_encoder: CLIPVisionModel,
+        image_processor: CLIPImageProcessor,
+        scheduler: HeunDiscreteScheduler,
+        shap_e_renderer: ShapERenderer,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            prior=prior,
+            image_encoder=image_encoder,
+            image_processor=image_processor,
+            scheduler=scheduler,
+            shap_e_renderer=shap_e_renderer,
+        )
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def _encode_image(
+        self,
+        image,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+    ):
+        if isinstance(image, List) and isinstance(image[0], torch.Tensor):
+            image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)
+
+        if not isinstance(image, torch.Tensor):
+            image = self.image_processor(image, return_tensors="pt").pixel_values[0].unsqueeze(0)
+
+        image = image.to(dtype=self.image_encoder.dtype, device=device)
+
+        image_embeds = self.image_encoder(image)["last_hidden_state"]
+        image_embeds = image_embeds[:, 1:, :].contiguous()  # batch_size, dim, 256
+
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.zeros_like(image_embeds)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+        return image_embeds
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: Union[PIL.Image.Image, List[PIL.Image.Image]],
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        guidance_scale: float = 4.0,
+        frame_size: int = 64,
+        output_type: Optional[str] = "pil",  # pil, np, latent, mesh
+        return_dict: bool = True,
+    ):
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
+                latents as image, but if passing latents directly it is not encoded again.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            frame_size (`int`, *optional*, default to 64):
+                The width and height of each image frame of the generated 3D output.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`), `"latent"` (`torch.Tensor`), or mesh ([`MeshDecoderOutput`]).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain
+                tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images.
+        """
+
+        if isinstance(image, PIL.Image.Image):
+            batch_size = 1
+        elif isinstance(image, torch.Tensor):
+            batch_size = image.shape[0]
+        elif isinstance(image, list) and isinstance(image[0], (torch.Tensor, PIL.Image.Image)):
+            batch_size = len(image)
+        else:
+            raise ValueError(
+                f"`image` has to be of type `PIL.Image.Image`, `torch.Tensor`, `List[PIL.Image.Image]` or `List[torch.Tensor]` but is {type(image)}"
+            )
+
+        device = self._execution_device
+
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+        image_embeds = self._encode_image(image, device, num_images_per_prompt, do_classifier_free_guidance)
+
+        # prior
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        num_embeddings = self.prior.config.num_embeddings
+        embedding_dim = self.prior.config.embedding_dim
+        if latents is None:
+            latents = self.prepare_latents(
+                (batch_size, num_embeddings * embedding_dim),
+                image_embeds.dtype,
+                device,
+                generator,
+                latents,
+                self.scheduler,
+            )
+
+        # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim
+        latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim)
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            noise_pred = self.prior(
+                scaled_model_input,
+                timestep=t,
+                proj_embedding=image_embeds,
+            ).predicted_image_embedding
+
+            # remove the variance
+            noise_pred, _ = noise_pred.split(
+                scaled_model_input.shape[2], dim=2
+            )  # batch_size, num_embeddings, embedding_dim
+
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+
+            latents = self.scheduler.step(
+                noise_pred,
+                timestep=t,
+                sample=latents,
+            ).prev_sample
+
+        if output_type not in ["np", "pil", "latent", "mesh"]:
+            raise ValueError(
+                f"Only the output types `pil`, `np`, `latent` and `mesh` are supported not output_type={output_type}"
+            )
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if output_type == "latent":
+            return ShapEPipelineOutput(images=latents)
+
+        images = []
+        if output_type == "mesh":
+            for i, latent in enumerate(latents):
+                mesh = self.shap_e_renderer.decode_to_mesh(
+                    latent[None, :],
+                    device,
+                )
+                images.append(mesh)
+
+        else:
+            # np, pil
+            for i, latent in enumerate(latents):
+                image = self.shap_e_renderer.decode_to_image(
+                    latent[None, :],
+                    device,
+                    size=frame_size,
+                )
+                images.append(image)
+
+            images = torch.stack(images)
+
+            images = images.cpu().numpy()
+
+            if output_type == "pil":
+                images = [self.numpy_to_pil(image) for image in images]
+
+        if not return_dict:
+            return (images,)
+
+        return ShapEPipelineOutput(images=images)
diff --git a/diffusers/src/diffusers/pipelines/shap_e/renderer.py b/diffusers/src/diffusers/pipelines/shap_e/renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d9f9d9b2ab1e923a79c72f21f2d0c5dd7456515
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/shap_e/renderer.py
@@ -0,0 +1,1050 @@
+# Copyright 2024 Open AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models import ModelMixin
+from ...utils import BaseOutput
+from .camera import create_pan_cameras
+
+
+def sample_pmf(pmf: torch.Tensor, n_samples: int) -> torch.Tensor:
+    r"""
+    Sample from the given discrete probability distribution with replacement.
+
+    The i-th bin is assumed to have mass pmf[i].
+
+    Args:
+        pmf: [batch_size, *shape, n_samples, 1] where (pmf.sum(dim=-2) == 1).all()
+        n_samples: number of samples
+
+    Return:
+        indices sampled with replacement
+    """
+
+    *shape, support_size, last_dim = pmf.shape
+    assert last_dim == 1
+
+    cdf = torch.cumsum(pmf.view(-1, support_size), dim=1)
+    inds = torch.searchsorted(cdf, torch.rand(cdf.shape[0], n_samples, device=cdf.device))
+
+    return inds.view(*shape, n_samples, 1).clamp(0, support_size - 1)
+
+
+def posenc_nerf(x: torch.Tensor, min_deg: int = 0, max_deg: int = 15) -> torch.Tensor:
+    """
+    Concatenate x and its positional encodings, following NeRF.
+
+    Reference: https://arxiv.org/pdf/2210.04628.pdf
+    """
+    if min_deg == max_deg:
+        return x
+
+    scales = 2.0 ** torch.arange(min_deg, max_deg, dtype=x.dtype, device=x.device)
+    *shape, dim = x.shape
+    xb = (x.reshape(-1, 1, dim) * scales.view(1, -1, 1)).reshape(*shape, -1)
+    assert xb.shape[-1] == dim * (max_deg - min_deg)
+    emb = torch.cat([xb, xb + math.pi / 2.0], axis=-1).sin()
+    return torch.cat([x, emb], dim=-1)
+
+
+def encode_position(position):
+    return posenc_nerf(position, min_deg=0, max_deg=15)
+
+
+def encode_direction(position, direction=None):
+    if direction is None:
+        return torch.zeros_like(posenc_nerf(position, min_deg=0, max_deg=8))
+    else:
+        return posenc_nerf(direction, min_deg=0, max_deg=8)
+
+
+def _sanitize_name(x: str) -> str:
+    return x.replace(".", "__")
+
+
+def integrate_samples(volume_range, ts, density, channels):
+    r"""
+    Function integrating the model output.
+
+    Args:
+        volume_range: Specifies the integral range [t0, t1]
+        ts: timesteps
+        density: torch.Tensor [batch_size, *shape, n_samples, 1]
+        channels: torch.Tensor [batch_size, *shape, n_samples, n_channels]
+    returns:
+        channels: integrated rgb output weights: torch.Tensor [batch_size, *shape, n_samples, 1] (density
+        *transmittance)[i] weight for each rgb output at [..., i, :]. transmittance: transmittance of this volume
+    )
+    """
+
+    # 1. Calculate the weights
+    _, _, dt = volume_range.partition(ts)
+    ddensity = density * dt
+
+    mass = torch.cumsum(ddensity, dim=-2)
+    transmittance = torch.exp(-mass[..., -1, :])
+
+    alphas = 1.0 - torch.exp(-ddensity)
+    Ts = torch.exp(torch.cat([torch.zeros_like(mass[..., :1, :]), -mass[..., :-1, :]], dim=-2))
+    # This is the probability of light hitting and reflecting off of
+    # something at depth [..., i, :].
+    weights = alphas * Ts
+
+    # 2. Integrate channels
+    channels = torch.sum(channels * weights, dim=-2)
+
+    return channels, weights, transmittance
+
+
+def volume_query_points(volume, grid_size):
+    indices = torch.arange(grid_size**3, device=volume.bbox_min.device)
+    zs = indices % grid_size
+    ys = torch.div(indices, grid_size, rounding_mode="trunc") % grid_size
+    xs = torch.div(indices, grid_size**2, rounding_mode="trunc") % grid_size
+    combined = torch.stack([xs, ys, zs], dim=1)
+    return (combined.float() / (grid_size - 1)) * (volume.bbox_max - volume.bbox_min) + volume.bbox_min
+
+
+def _convert_srgb_to_linear(u: torch.Tensor):
+    return torch.where(u <= 0.04045, u / 12.92, ((u + 0.055) / 1.055) ** 2.4)
+
+
+def _create_flat_edge_indices(
+    flat_cube_indices: torch.Tensor,
+    grid_size: Tuple[int, int, int],
+):
+    num_xs = (grid_size[0] - 1) * grid_size[1] * grid_size[2]
+    y_offset = num_xs
+    num_ys = grid_size[0] * (grid_size[1] - 1) * grid_size[2]
+    z_offset = num_xs + num_ys
+    return torch.stack(
+        [
+            # Edges spanning x-axis.
+            flat_cube_indices[:, 0] * grid_size[1] * grid_size[2]
+            + flat_cube_indices[:, 1] * grid_size[2]
+            + flat_cube_indices[:, 2],
+            flat_cube_indices[:, 0] * grid_size[1] * grid_size[2]
+            + (flat_cube_indices[:, 1] + 1) * grid_size[2]
+            + flat_cube_indices[:, 2],
+            flat_cube_indices[:, 0] * grid_size[1] * grid_size[2]
+            + flat_cube_indices[:, 1] * grid_size[2]
+            + flat_cube_indices[:, 2]
+            + 1,
+            flat_cube_indices[:, 0] * grid_size[1] * grid_size[2]
+            + (flat_cube_indices[:, 1] + 1) * grid_size[2]
+            + flat_cube_indices[:, 2]
+            + 1,
+            # Edges spanning y-axis.
+            (
+                y_offset
+                + flat_cube_indices[:, 0] * (grid_size[1] - 1) * grid_size[2]
+                + flat_cube_indices[:, 1] * grid_size[2]
+                + flat_cube_indices[:, 2]
+            ),
+            (
+                y_offset
+                + (flat_cube_indices[:, 0] + 1) * (grid_size[1] - 1) * grid_size[2]
+                + flat_cube_indices[:, 1] * grid_size[2]
+                + flat_cube_indices[:, 2]
+            ),
+            (
+                y_offset
+                + flat_cube_indices[:, 0] * (grid_size[1] - 1) * grid_size[2]
+                + flat_cube_indices[:, 1] * grid_size[2]
+                + flat_cube_indices[:, 2]
+                + 1
+            ),
+            (
+                y_offset
+                + (flat_cube_indices[:, 0] + 1) * (grid_size[1] - 1) * grid_size[2]
+                + flat_cube_indices[:, 1] * grid_size[2]
+                + flat_cube_indices[:, 2]
+                + 1
+            ),
+            # Edges spanning z-axis.
+            (
+                z_offset
+                + flat_cube_indices[:, 0] * grid_size[1] * (grid_size[2] - 1)
+                + flat_cube_indices[:, 1] * (grid_size[2] - 1)
+                + flat_cube_indices[:, 2]
+            ),
+            (
+                z_offset
+                + (flat_cube_indices[:, 0] + 1) * grid_size[1] * (grid_size[2] - 1)
+                + flat_cube_indices[:, 1] * (grid_size[2] - 1)
+                + flat_cube_indices[:, 2]
+            ),
+            (
+                z_offset
+                + flat_cube_indices[:, 0] * grid_size[1] * (grid_size[2] - 1)
+                + (flat_cube_indices[:, 1] + 1) * (grid_size[2] - 1)
+                + flat_cube_indices[:, 2]
+            ),
+            (
+                z_offset
+                + (flat_cube_indices[:, 0] + 1) * grid_size[1] * (grid_size[2] - 1)
+                + (flat_cube_indices[:, 1] + 1) * (grid_size[2] - 1)
+                + flat_cube_indices[:, 2]
+            ),
+        ],
+        dim=-1,
+    )
+
+
+class VoidNeRFModel(nn.Module):
+    """
+    Implements the default empty space model where all queries are rendered as background.
+    """
+
+    def __init__(self, background, channel_scale=255.0):
+        super().__init__()
+        background = nn.Parameter(torch.from_numpy(np.array(background)).to(dtype=torch.float32) / channel_scale)
+
+        self.register_buffer("background", background)
+
+    def forward(self, position):
+        background = self.background[None].to(position.device)
+
+        shape = position.shape[:-1]
+        ones = [1] * (len(shape) - 1)
+        n_channels = background.shape[-1]
+        background = torch.broadcast_to(background.view(background.shape[0], *ones, n_channels), [*shape, n_channels])
+
+        return background
+
+
+@dataclass
+class VolumeRange:
+    t0: torch.Tensor
+    t1: torch.Tensor
+    intersected: torch.Tensor
+
+    def __post_init__(self):
+        assert self.t0.shape == self.t1.shape == self.intersected.shape
+
+    def partition(self, ts):
+        """
+        Partitions t0 and t1 into n_samples intervals.
+
+        Args:
+            ts: [batch_size, *shape, n_samples, 1]
+
+        Return:
+
+            lower: [batch_size, *shape, n_samples, 1] upper: [batch_size, *shape, n_samples, 1] delta: [batch_size,
+            *shape, n_samples, 1]
+
+        where
+            ts \\in [lower, upper] deltas = upper - lower
+        """
+
+        mids = (ts[..., 1:, :] + ts[..., :-1, :]) * 0.5
+        lower = torch.cat([self.t0[..., None, :], mids], dim=-2)
+        upper = torch.cat([mids, self.t1[..., None, :]], dim=-2)
+        delta = upper - lower
+        assert lower.shape == upper.shape == delta.shape == ts.shape
+        return lower, upper, delta
+
+
+class BoundingBoxVolume(nn.Module):
+    """
+    Axis-aligned bounding box defined by the two opposite corners.
+    """
+
+    def __init__(
+        self,
+        *,
+        bbox_min,
+        bbox_max,
+        min_dist: float = 0.0,
+        min_t_range: float = 1e-3,
+    ):
+        """
+        Args:
+            bbox_min: the left/bottommost corner of the bounding box
+            bbox_max: the other corner of the bounding box
+            min_dist: all rays should start at least this distance away from the origin.
+        """
+        super().__init__()
+
+        self.min_dist = min_dist
+        self.min_t_range = min_t_range
+
+        self.bbox_min = torch.tensor(bbox_min)
+        self.bbox_max = torch.tensor(bbox_max)
+        self.bbox = torch.stack([self.bbox_min, self.bbox_max])
+        assert self.bbox.shape == (2, 3)
+        assert min_dist >= 0.0
+        assert min_t_range > 0.0
+
+    def intersect(
+        self,
+        origin: torch.Tensor,
+        direction: torch.Tensor,
+        t0_lower: Optional[torch.Tensor] = None,
+        epsilon=1e-6,
+    ):
+        """
+        Args:
+            origin: [batch_size, *shape, 3]
+            direction: [batch_size, *shape, 3]
+            t0_lower: Optional [batch_size, *shape, 1] lower bound of t0 when intersecting this volume.
+            params: Optional meta parameters in case Volume is parametric
+            epsilon: to stabilize calculations
+
+        Return:
+            A tuple of (t0, t1, intersected) where each has a shape [batch_size, *shape, 1]. If a ray intersects with
+            the volume, `o + td` is in the volume for all t in [t0, t1]. If the volume is bounded, t1 is guaranteed to
+            be on the boundary of the volume.
+        """
+
+        batch_size, *shape, _ = origin.shape
+        ones = [1] * len(shape)
+        bbox = self.bbox.view(1, *ones, 2, 3).to(origin.device)
+
+        def _safe_divide(a, b, epsilon=1e-6):
+            return a / torch.where(b < 0, b - epsilon, b + epsilon)
+
+        ts = _safe_divide(bbox - origin[..., None, :], direction[..., None, :], epsilon=epsilon)
+
+        # Cases to think about:
+        #
+        #   1. t1 <= t0: the ray does not pass through the AABB.
+        #   2. t0 < t1 <= 0: the ray intersects but the BB is behind the origin.
+        #   3. t0 <= 0 <= t1: the ray starts from inside the BB
+        #   4. 0 <= t0 < t1: the ray is not inside and intersects with the BB twice.
+        #
+        # 1 and 4 are clearly handled from t0 < t1 below.
+        # Making t0 at least min_dist (>= 0) takes care of 2 and 3.
+        t0 = ts.min(dim=-2).values.max(dim=-1, keepdim=True).values.clamp(self.min_dist)
+        t1 = ts.max(dim=-2).values.min(dim=-1, keepdim=True).values
+        assert t0.shape == t1.shape == (batch_size, *shape, 1)
+        if t0_lower is not None:
+            assert t0.shape == t0_lower.shape
+            t0 = torch.maximum(t0, t0_lower)
+
+        intersected = t0 + self.min_t_range < t1
+        t0 = torch.where(intersected, t0, torch.zeros_like(t0))
+        t1 = torch.where(intersected, t1, torch.ones_like(t1))
+
+        return VolumeRange(t0=t0, t1=t1, intersected=intersected)
+
+
+class StratifiedRaySampler(nn.Module):
+    """
+    Instead of fixed intervals, a sample is drawn uniformly at random from each interval.
+    """
+
+    def __init__(self, depth_mode: str = "linear"):
+        """
+        :param depth_mode: linear samples ts linearly in depth. harmonic ensures
+            closer points are sampled more densely.
+        """
+        self.depth_mode = depth_mode
+        assert self.depth_mode in ("linear", "geometric", "harmonic")
+
+    def sample(
+        self,
+        t0: torch.Tensor,
+        t1: torch.Tensor,
+        n_samples: int,
+        epsilon: float = 1e-3,
+    ) -> torch.Tensor:
+        """
+        Args:
+            t0: start time has shape [batch_size, *shape, 1]
+            t1: finish time has shape [batch_size, *shape, 1]
+            n_samples: number of ts to sample
+        Return:
+            sampled ts of shape [batch_size, *shape, n_samples, 1]
+        """
+        ones = [1] * (len(t0.shape) - 1)
+        ts = torch.linspace(0, 1, n_samples).view(*ones, n_samples).to(t0.dtype).to(t0.device)
+
+        if self.depth_mode == "linear":
+            ts = t0 * (1.0 - ts) + t1 * ts
+        elif self.depth_mode == "geometric":
+            ts = (t0.clamp(epsilon).log() * (1.0 - ts) + t1.clamp(epsilon).log() * ts).exp()
+        elif self.depth_mode == "harmonic":
+            # The original NeRF recommends this interpolation scheme for
+            # spherical scenes, but there could be some weird edge cases when
+            # the observer crosses from the inner to outer volume.
+            ts = 1.0 / (1.0 / t0.clamp(epsilon) * (1.0 - ts) + 1.0 / t1.clamp(epsilon) * ts)
+
+        mids = 0.5 * (ts[..., 1:] + ts[..., :-1])
+        upper = torch.cat([mids, t1], dim=-1)
+        lower = torch.cat([t0, mids], dim=-1)
+        # yiyi notes: add a random seed here for testing, don't forget to remove
+        torch.manual_seed(0)
+        t_rand = torch.rand_like(ts)
+
+        ts = lower + (upper - lower) * t_rand
+        return ts.unsqueeze(-1)
+
+
+class ImportanceRaySampler(nn.Module):
+    """
+    Given the initial estimate of densities, this samples more from regions/bins expected to have objects.
+    """
+
+    def __init__(
+        self,
+        volume_range: VolumeRange,
+        ts: torch.Tensor,
+        weights: torch.Tensor,
+        blur_pool: bool = False,
+        alpha: float = 1e-5,
+    ):
+        """
+        Args:
+            volume_range: the range in which a ray intersects the given volume.
+            ts: earlier samples from the coarse rendering step
+            weights: discretized version of density * transmittance
+            blur_pool: if true, use 2-tap max + 2-tap blur filter from mip-NeRF.
+            alpha: small value to add to weights.
+        """
+        self.volume_range = volume_range
+        self.ts = ts.clone().detach()
+        self.weights = weights.clone().detach()
+        self.blur_pool = blur_pool
+        self.alpha = alpha
+
+    @torch.no_grad()
+    def sample(self, t0: torch.Tensor, t1: torch.Tensor, n_samples: int) -> torch.Tensor:
+        """
+        Args:
+            t0: start time has shape [batch_size, *shape, 1]
+            t1: finish time has shape [batch_size, *shape, 1]
+            n_samples: number of ts to sample
+        Return:
+            sampled ts of shape [batch_size, *shape, n_samples, 1]
+        """
+        lower, upper, _ = self.volume_range.partition(self.ts)
+
+        batch_size, *shape, n_coarse_samples, _ = self.ts.shape
+
+        weights = self.weights
+        if self.blur_pool:
+            padded = torch.cat([weights[..., :1, :], weights, weights[..., -1:, :]], dim=-2)
+            maxes = torch.maximum(padded[..., :-1, :], padded[..., 1:, :])
+            weights = 0.5 * (maxes[..., :-1, :] + maxes[..., 1:, :])
+        weights = weights + self.alpha
+        pmf = weights / weights.sum(dim=-2, keepdim=True)
+        inds = sample_pmf(pmf, n_samples)
+        assert inds.shape == (batch_size, *shape, n_samples, 1)
+        assert (inds >= 0).all() and (inds < n_coarse_samples).all()
+
+        t_rand = torch.rand(inds.shape, device=inds.device)
+        lower_ = torch.gather(lower, -2, inds)
+        upper_ = torch.gather(upper, -2, inds)
+
+        ts = lower_ + (upper_ - lower_) * t_rand
+        ts = torch.sort(ts, dim=-2).values
+        return ts
+
+
+@dataclass
+class MeshDecoderOutput(BaseOutput):
+    """
+    A 3D triangle mesh with optional data at the vertices and faces.
+
+    Args:
+        verts (`torch.Tensor` of shape `(N, 3)`):
+            array of vertext coordinates
+        faces (`torch.Tensor` of shape `(N, 3)`):
+            array of triangles, pointing to indices in verts.
+        vertext_channels (Dict):
+            vertext coordinates for each color channel
+    """
+
+    verts: torch.Tensor
+    faces: torch.Tensor
+    vertex_channels: Dict[str, torch.Tensor]
+
+
+class MeshDecoder(nn.Module):
+    """
+    Construct meshes from Signed distance functions (SDFs) using marching cubes method
+    """
+
+    def __init__(self):
+        super().__init__()
+        cases = torch.zeros(256, 5, 3, dtype=torch.long)
+        masks = torch.zeros(256, 5, dtype=torch.bool)
+
+        self.register_buffer("cases", cases)
+        self.register_buffer("masks", masks)
+
+    def forward(self, field: torch.Tensor, min_point: torch.Tensor, size: torch.Tensor):
+        """
+        For a signed distance field, produce a mesh using marching cubes.
+
+        :param field: a 3D tensor of field values, where negative values correspond
+                    to the outside of the shape. The dimensions correspond to the x, y, and z directions, respectively.
+        :param min_point: a tensor of shape [3] containing the point corresponding
+                        to (0, 0, 0) in the field.
+        :param size: a tensor of shape [3] containing the per-axis distance from the
+                    (0, 0, 0) field corner and the (-1, -1, -1) field corner.
+        """
+        assert len(field.shape) == 3, "input must be a 3D scalar field"
+        dev = field.device
+
+        cases = self.cases.to(dev)
+        masks = self.masks.to(dev)
+
+        min_point = min_point.to(dev)
+        size = size.to(dev)
+
+        grid_size = field.shape
+        grid_size_tensor = torch.tensor(grid_size).to(size)
+
+        # Create bitmasks between 0 and 255 (inclusive) indicating the state
+        # of the eight corners of each cube.
+        bitmasks = (field > 0).to(torch.uint8)
+        bitmasks = bitmasks[:-1, :, :] | (bitmasks[1:, :, :] << 1)
+        bitmasks = bitmasks[:, :-1, :] | (bitmasks[:, 1:, :] << 2)
+        bitmasks = bitmasks[:, :, :-1] | (bitmasks[:, :, 1:] << 4)
+
+        # Compute corner coordinates across the entire grid.
+        corner_coords = torch.empty(*grid_size, 3, device=dev, dtype=field.dtype)
+        corner_coords[range(grid_size[0]), :, :, 0] = torch.arange(grid_size[0], device=dev, dtype=field.dtype)[
+            :, None, None
+        ]
+        corner_coords[:, range(grid_size[1]), :, 1] = torch.arange(grid_size[1], device=dev, dtype=field.dtype)[
+            :, None
+        ]
+        corner_coords[:, :, range(grid_size[2]), 2] = torch.arange(grid_size[2], device=dev, dtype=field.dtype)
+
+        # Compute all vertices across all edges in the grid, even though we will
+        # throw some out later. We have (X-1)*Y*Z + X*(Y-1)*Z + X*Y*(Z-1) vertices.
+        # These are all midpoints, and don't account for interpolation (which is
+        # done later based on the used edge midpoints).
+        edge_midpoints = torch.cat(
+            [
+                ((corner_coords[:-1] + corner_coords[1:]) / 2).reshape(-1, 3),
+                ((corner_coords[:, :-1] + corner_coords[:, 1:]) / 2).reshape(-1, 3),
+                ((corner_coords[:, :, :-1] + corner_coords[:, :, 1:]) / 2).reshape(-1, 3),
+            ],
+            dim=0,
+        )
+
+        # Create a flat array of [X, Y, Z] indices for each cube.
+        cube_indices = torch.zeros(
+            grid_size[0] - 1, grid_size[1] - 1, grid_size[2] - 1, 3, device=dev, dtype=torch.long
+        )
+        cube_indices[range(grid_size[0] - 1), :, :, 0] = torch.arange(grid_size[0] - 1, device=dev)[:, None, None]
+        cube_indices[:, range(grid_size[1] - 1), :, 1] = torch.arange(grid_size[1] - 1, device=dev)[:, None]
+        cube_indices[:, :, range(grid_size[2] - 1), 2] = torch.arange(grid_size[2] - 1, device=dev)
+        flat_cube_indices = cube_indices.reshape(-1, 3)
+
+        # Create a flat array mapping each cube to 12 global edge indices.
+        edge_indices = _create_flat_edge_indices(flat_cube_indices, grid_size)
+
+        # Apply the LUT to figure out the triangles.
+        flat_bitmasks = bitmasks.reshape(-1).long()  # must cast to long for indexing to believe this not a mask
+        local_tris = cases[flat_bitmasks]
+        local_masks = masks[flat_bitmasks]
+        # Compute the global edge indices for the triangles.
+        global_tris = torch.gather(edge_indices, 1, local_tris.reshape(local_tris.shape[0], -1)).reshape(
+            local_tris.shape
+        )
+        # Select the used triangles for each cube.
+        selected_tris = global_tris.reshape(-1, 3)[local_masks.reshape(-1)]
+
+        # Now we have a bunch of indices into the full list of possible vertices,
+        # but we want to reduce this list to only the used vertices.
+        used_vertex_indices = torch.unique(selected_tris.view(-1))
+        used_edge_midpoints = edge_midpoints[used_vertex_indices]
+        old_index_to_new_index = torch.zeros(len(edge_midpoints), device=dev, dtype=torch.long)
+        old_index_to_new_index[used_vertex_indices] = torch.arange(
+            len(used_vertex_indices), device=dev, dtype=torch.long
+        )
+
+        # Rewrite the triangles to use the new indices
+        faces = torch.gather(old_index_to_new_index, 0, selected_tris.view(-1)).reshape(selected_tris.shape)
+
+        # Compute the actual interpolated coordinates corresponding to edge midpoints.
+        v1 = torch.floor(used_edge_midpoints).to(torch.long)
+        v2 = torch.ceil(used_edge_midpoints).to(torch.long)
+        s1 = field[v1[:, 0], v1[:, 1], v1[:, 2]]
+        s2 = field[v2[:, 0], v2[:, 1], v2[:, 2]]
+        p1 = (v1.float() / (grid_size_tensor - 1)) * size + min_point
+        p2 = (v2.float() / (grid_size_tensor - 1)) * size + min_point
+        # The signs of s1 and s2 should be different. We want to find
+        # t such that t*s2 + (1-t)*s1 = 0.
+        t = (s1 / (s1 - s2))[:, None]
+        verts = t * p2 + (1 - t) * p1
+
+        return MeshDecoderOutput(verts=verts, faces=faces, vertex_channels=None)
+
+
+@dataclass
+class MLPNeRFModelOutput(BaseOutput):
+    density: torch.Tensor
+    signed_distance: torch.Tensor
+    channels: torch.Tensor
+    ts: torch.Tensor
+
+
+class MLPNeRSTFModel(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(
+        self,
+        d_hidden: int = 256,
+        n_output: int = 12,
+        n_hidden_layers: int = 6,
+        act_fn: str = "swish",
+        insert_direction_at: int = 4,
+    ):
+        super().__init__()
+
+        # Instantiate the MLP
+
+        # Find out the dimension of encoded position and direction
+        dummy = torch.eye(1, 3)
+        d_posenc_pos = encode_position(position=dummy).shape[-1]
+        d_posenc_dir = encode_direction(position=dummy).shape[-1]
+
+        mlp_widths = [d_hidden] * n_hidden_layers
+        input_widths = [d_posenc_pos] + mlp_widths
+        output_widths = mlp_widths + [n_output]
+
+        if insert_direction_at is not None:
+            input_widths[insert_direction_at] += d_posenc_dir
+
+        self.mlp = nn.ModuleList([nn.Linear(d_in, d_out) for d_in, d_out in zip(input_widths, output_widths)])
+
+        if act_fn == "swish":
+            # self.activation = swish
+            # yiyi testing:
+            self.activation = lambda x: F.silu(x)
+        else:
+            raise ValueError(f"Unsupported activation function {act_fn}")
+
+        self.sdf_activation = torch.tanh
+        self.density_activation = torch.nn.functional.relu
+        self.channel_activation = torch.sigmoid
+
+    def map_indices_to_keys(self, output):
+        h_map = {
+            "sdf": (0, 1),
+            "density_coarse": (1, 2),
+            "density_fine": (2, 3),
+            "stf": (3, 6),
+            "nerf_coarse": (6, 9),
+            "nerf_fine": (9, 12),
+        }
+
+        mapped_output = {k: output[..., start:end] for k, (start, end) in h_map.items()}
+
+        return mapped_output
+
+    def forward(self, *, position, direction, ts, nerf_level="coarse", rendering_mode="nerf"):
+        h = encode_position(position)
+
+        h_preact = h
+        h_directionless = None
+        for i, layer in enumerate(self.mlp):
+            if i == self.config.insert_direction_at:  # 4 in the config
+                h_directionless = h_preact
+                h_direction = encode_direction(position, direction=direction)
+                h = torch.cat([h, h_direction], dim=-1)
+
+            h = layer(h)
+
+            h_preact = h
+
+            if i < len(self.mlp) - 1:
+                h = self.activation(h)
+
+        h_final = h
+        if h_directionless is None:
+            h_directionless = h_preact
+
+        activation = self.map_indices_to_keys(h_final)
+
+        if nerf_level == "coarse":
+            h_density = activation["density_coarse"]
+        else:
+            h_density = activation["density_fine"]
+
+        if rendering_mode == "nerf":
+            if nerf_level == "coarse":
+                h_channels = activation["nerf_coarse"]
+            else:
+                h_channels = activation["nerf_fine"]
+
+        elif rendering_mode == "stf":
+            h_channels = activation["stf"]
+
+        density = self.density_activation(h_density)
+        signed_distance = self.sdf_activation(activation["sdf"])
+        channels = self.channel_activation(h_channels)
+
+        # yiyi notes: I think signed_distance is not used
+        return MLPNeRFModelOutput(density=density, signed_distance=signed_distance, channels=channels, ts=ts)
+
+
+class ChannelsProj(nn.Module):
+    def __init__(
+        self,
+        *,
+        vectors: int,
+        channels: int,
+        d_latent: int,
+    ):
+        super().__init__()
+        self.proj = nn.Linear(d_latent, vectors * channels)
+        self.norm = nn.LayerNorm(channels)
+        self.d_latent = d_latent
+        self.vectors = vectors
+        self.channels = channels
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_bvd = x
+        w_vcd = self.proj.weight.view(self.vectors, self.channels, self.d_latent)
+        b_vc = self.proj.bias.view(1, self.vectors, self.channels)
+        h = torch.einsum("bvd,vcd->bvc", x_bvd, w_vcd)
+        h = self.norm(h)
+
+        h = h + b_vc
+        return h
+
+
+class ShapEParamsProjModel(ModelMixin, ConfigMixin):
+    """
+    project the latent representation of a 3D asset to obtain weights of a multi-layer perceptron (MLP).
+
+    For more details, see the original paper:
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        *,
+        param_names: Tuple[str] = (
+            "nerstf.mlp.0.weight",
+            "nerstf.mlp.1.weight",
+            "nerstf.mlp.2.weight",
+            "nerstf.mlp.3.weight",
+        ),
+        param_shapes: Tuple[Tuple[int]] = (
+            (256, 93),
+            (256, 256),
+            (256, 256),
+            (256, 256),
+        ),
+        d_latent: int = 1024,
+    ):
+        super().__init__()
+
+        # check inputs
+        if len(param_names) != len(param_shapes):
+            raise ValueError("Must provide same number of `param_names` as `param_shapes`")
+        self.projections = nn.ModuleDict({})
+        for k, (vectors, channels) in zip(param_names, param_shapes):
+            self.projections[_sanitize_name(k)] = ChannelsProj(
+                vectors=vectors,
+                channels=channels,
+                d_latent=d_latent,
+            )
+
+    def forward(self, x: torch.Tensor):
+        out = {}
+        start = 0
+        for k, shape in zip(self.config.param_names, self.config.param_shapes):
+            vectors, _ = shape
+            end = start + vectors
+            x_bvd = x[:, start:end]
+            out[k] = self.projections[_sanitize_name(k)](x_bvd).reshape(len(x), *shape)
+            start = end
+        return out
+
+
+class ShapERenderer(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(
+        self,
+        *,
+        param_names: Tuple[str] = (
+            "nerstf.mlp.0.weight",
+            "nerstf.mlp.1.weight",
+            "nerstf.mlp.2.weight",
+            "nerstf.mlp.3.weight",
+        ),
+        param_shapes: Tuple[Tuple[int]] = (
+            (256, 93),
+            (256, 256),
+            (256, 256),
+            (256, 256),
+        ),
+        d_latent: int = 1024,
+        d_hidden: int = 256,
+        n_output: int = 12,
+        n_hidden_layers: int = 6,
+        act_fn: str = "swish",
+        insert_direction_at: int = 4,
+        background: Tuple[float] = (
+            255.0,
+            255.0,
+            255.0,
+        ),
+    ):
+        super().__init__()
+
+        self.params_proj = ShapEParamsProjModel(
+            param_names=param_names,
+            param_shapes=param_shapes,
+            d_latent=d_latent,
+        )
+        self.mlp = MLPNeRSTFModel(d_hidden, n_output, n_hidden_layers, act_fn, insert_direction_at)
+        self.void = VoidNeRFModel(background=background, channel_scale=255.0)
+        self.volume = BoundingBoxVolume(bbox_max=[1.0, 1.0, 1.0], bbox_min=[-1.0, -1.0, -1.0])
+        self.mesh_decoder = MeshDecoder()
+
+    @torch.no_grad()
+    def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with_direction=False):
+        """
+        Perform volumetric rendering over a partition of possible t's in the union of rendering volumes (written below
+        with some abuse of notations)
+
+            C(r) := sum(
+                transmittance(t[i]) * integrate(
+                    lambda t: density(t) * channels(t) * transmittance(t), [t[i], t[i + 1]],
+                ) for i in range(len(parts))
+            ) + transmittance(t[-1]) * void_model(t[-1]).channels
+
+        where
+
+        1) transmittance(s) := exp(-integrate(density, [t[0], s])) calculates the probability of light passing through
+        the volume specified by [t[0], s]. (transmittance of 1 means light can pass freely) 2) density and channels are
+        obtained by evaluating the appropriate part.model at time t. 3) [t[i], t[i + 1]] is defined as the range of t
+        where the ray intersects (parts[i].volume \\ union(part.volume for part in parts[:i])) at the surface of the
+        shell (if bounded). If the ray does not intersect, the integral over this segment is evaluated as 0 and
+        transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1],
+        math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty).
+
+        Args:
+            rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples:
+            number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including
+
+        :return: A tuple of
+            - `channels`
+            - A importance samplers for additional fine-grained rendering
+            - raw model output
+        """
+        origin, direction = rays[..., 0, :], rays[..., 1, :]
+
+        # Integrate over [t[i], t[i + 1]]
+
+        # 1 Intersect the rays with the current volume and sample ts to integrate along.
+        vrange = self.volume.intersect(origin, direction, t0_lower=None)
+        ts = sampler.sample(vrange.t0, vrange.t1, n_samples)
+        ts = ts.to(rays.dtype)
+
+        if prev_model_out is not None:
+            # Append the previous ts now before fprop because previous
+            # rendering used a different model and we can't reuse the output.
+            ts = torch.sort(torch.cat([ts, prev_model_out.ts], dim=-2), dim=-2).values
+
+        batch_size, *_shape, _t0_dim = vrange.t0.shape
+        _, *ts_shape, _ts_dim = ts.shape
+
+        # 2. Get the points along the ray and query the model
+        directions = torch.broadcast_to(direction.unsqueeze(-2), [batch_size, *ts_shape, 3])
+        positions = origin.unsqueeze(-2) + ts * directions
+
+        directions = directions.to(self.mlp.dtype)
+        positions = positions.to(self.mlp.dtype)
+
+        optional_directions = directions if render_with_direction else None
+
+        model_out = self.mlp(
+            position=positions,
+            direction=optional_directions,
+            ts=ts,
+            nerf_level="coarse" if prev_model_out is None else "fine",
+        )
+
+        # 3. Integrate the model results
+        channels, weights, transmittance = integrate_samples(
+            vrange, model_out.ts, model_out.density, model_out.channels
+        )
+
+        # 4. Clean up results that do not intersect with the volume.
+        transmittance = torch.where(vrange.intersected, transmittance, torch.ones_like(transmittance))
+        channels = torch.where(vrange.intersected, channels, torch.zeros_like(channels))
+        # 5. integration to infinity (e.g. [t[-1], math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty).
+        channels = channels + transmittance * self.void(origin)
+
+        weighted_sampler = ImportanceRaySampler(vrange, ts=model_out.ts, weights=weights)
+
+        return channels, weighted_sampler, model_out
+
+    @torch.no_grad()
+    def decode_to_image(
+        self,
+        latents,
+        device,
+        size: int = 64,
+        ray_batch_size: int = 4096,
+        n_coarse_samples=64,
+        n_fine_samples=128,
+    ):
+        # project the parameters from the generated latents
+        projected_params = self.params_proj(latents)
+
+        # update the mlp layers of the renderer
+        for name, param in self.mlp.state_dict().items():
+            if f"nerstf.{name}" in projected_params.keys():
+                param.copy_(projected_params[f"nerstf.{name}"].squeeze(0))
+
+        # create cameras object
+        camera = create_pan_cameras(size)
+        rays = camera.camera_rays
+        rays = rays.to(device)
+        n_batches = rays.shape[1] // ray_batch_size
+
+        coarse_sampler = StratifiedRaySampler()
+
+        images = []
+
+        for idx in range(n_batches):
+            rays_batch = rays[:, idx * ray_batch_size : (idx + 1) * ray_batch_size]
+
+            # render rays with coarse, stratified samples.
+            _, fine_sampler, coarse_model_out = self.render_rays(rays_batch, coarse_sampler, n_coarse_samples)
+            # Then, render with additional importance-weighted ray samples.
+            channels, _, _ = self.render_rays(
+                rays_batch, fine_sampler, n_fine_samples, prev_model_out=coarse_model_out
+            )
+
+            images.append(channels)
+
+        images = torch.cat(images, dim=1)
+        images = images.view(*camera.shape, camera.height, camera.width, -1).squeeze(0)
+
+        return images
+
+    @torch.no_grad()
+    def decode_to_mesh(
+        self,
+        latents,
+        device,
+        grid_size: int = 128,
+        query_batch_size: int = 4096,
+        texture_channels: Tuple = ("R", "G", "B"),
+    ):
+        # 1. project the parameters from the generated latents
+        projected_params = self.params_proj(latents)
+
+        # 2. update the mlp layers of the renderer
+        for name, param in self.mlp.state_dict().items():
+            if f"nerstf.{name}" in projected_params.keys():
+                param.copy_(projected_params[f"nerstf.{name}"].squeeze(0))
+
+        # 3. decoding with STF rendering
+        # 3.1 query the SDF values at vertices along a regular 128**3 grid
+
+        query_points = volume_query_points(self.volume, grid_size)
+        query_positions = query_points[None].repeat(1, 1, 1).to(device=device, dtype=self.mlp.dtype)
+
+        fields = []
+
+        for idx in range(0, query_positions.shape[1], query_batch_size):
+            query_batch = query_positions[:, idx : idx + query_batch_size]
+
+            model_out = self.mlp(
+                position=query_batch, direction=None, ts=None, nerf_level="fine", rendering_mode="stf"
+            )
+            fields.append(model_out.signed_distance)
+
+        # predicted SDF values
+        fields = torch.cat(fields, dim=1)
+        fields = fields.float()
+
+        assert (
+            len(fields.shape) == 3 and fields.shape[-1] == 1
+        ), f"expected [meta_batch x inner_batch] SDF results, but got {fields.shape}"
+
+        fields = fields.reshape(1, *([grid_size] * 3))
+
+        # create grid 128 x 128 x 128
+        # - force a negative border around the SDFs to close off all the models.
+        full_grid = torch.zeros(
+            1,
+            grid_size + 2,
+            grid_size + 2,
+            grid_size + 2,
+            device=fields.device,
+            dtype=fields.dtype,
+        )
+        full_grid.fill_(-1.0)
+        full_grid[:, 1:-1, 1:-1, 1:-1] = fields
+        fields = full_grid
+
+        # apply a differentiable implementation of Marching Cubes to construct meshs
+        raw_meshes = []
+        mesh_mask = []
+
+        for field in fields:
+            raw_mesh = self.mesh_decoder(field, self.volume.bbox_min, self.volume.bbox_max - self.volume.bbox_min)
+            mesh_mask.append(True)
+            raw_meshes.append(raw_mesh)
+
+        mesh_mask = torch.tensor(mesh_mask, device=fields.device)
+        max_vertices = max(len(m.verts) for m in raw_meshes)
+
+        # 3.2. query the texture color head at each vertex of the resulting mesh.
+        texture_query_positions = torch.stack(
+            [m.verts[torch.arange(0, max_vertices) % len(m.verts)] for m in raw_meshes],
+            dim=0,
+        )
+        texture_query_positions = texture_query_positions.to(device=device, dtype=self.mlp.dtype)
+
+        textures = []
+
+        for idx in range(0, texture_query_positions.shape[1], query_batch_size):
+            query_batch = texture_query_positions[:, idx : idx + query_batch_size]
+
+            texture_model_out = self.mlp(
+                position=query_batch, direction=None, ts=None, nerf_level="fine", rendering_mode="stf"
+            )
+            textures.append(texture_model_out.channels)
+
+        # predict texture color
+        textures = torch.cat(textures, dim=1)
+
+        textures = _convert_srgb_to_linear(textures)
+        textures = textures.float()
+
+        # 3.3 augument the mesh with texture data
+        assert len(textures.shape) == 3 and textures.shape[-1] == len(
+            texture_channels
+        ), f"expected [meta_batch x inner_batch x texture_channels] field results, but got {textures.shape}"
+
+        for m, texture in zip(raw_meshes, textures):
+            texture = texture[: len(m.verts)]
+            m.vertex_channels = dict(zip(texture_channels, texture.unbind(-1)))
+
+        return raw_meshes[0]
diff --git a/diffusers/src/diffusers/pipelines/stable_audio/__init__.py b/diffusers/src/diffusers/pipelines/stable_audio/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfdd419ae9914e64c7fdcf7c152ac308b75d75d2
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_audio/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+    is_transformers_version,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["modeling_stable_audio"] = ["StableAudioProjectionModel"]
+    _import_structure["pipeline_stable_audio"] = ["StableAudioPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+
+    else:
+        from .modeling_stable_audio import StableAudioProjectionModel
+        from .pipeline_stable_audio import StableAudioPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_audio/modeling_stable_audio.py b/diffusers/src/diffusers/pipelines/stable_audio/modeling_stable_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8f8a705de21ae4486318f21da87cb44f0737368
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_audio/modeling_stable_audio.py
@@ -0,0 +1,158 @@
+# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from math import pi
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models.modeling_utils import ModelMixin
+from ...utils import BaseOutput, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class StableAudioPositionalEmbedding(nn.Module):
+    """Used for continuous time"""
+
+    def __init__(self, dim: int):
+        super().__init__()
+        assert (dim % 2) == 0
+        half_dim = dim // 2
+        self.weights = nn.Parameter(torch.randn(half_dim))
+
+    def forward(self, times: torch.Tensor) -> torch.Tensor:
+        times = times[..., None]
+        freqs = times * self.weights[None] * 2 * pi
+        fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
+        fouriered = torch.cat((times, fouriered), dim=-1)
+        return fouriered
+
+
+@dataclass
+class StableAudioProjectionModelOutput(BaseOutput):
+    """
+    Args:
+    Class for StableAudio projection layer's outputs.
+        text_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states obtained by linearly projecting the hidden-states for the text encoder.
+        seconds_start_hidden_states (`torch.Tensor` of shape `(batch_size, 1, hidden_size)`, *optional*):
+            Sequence of hidden-states obtained by linearly projecting the audio start hidden states.
+        seconds_end_hidden_states (`torch.Tensor` of shape `(batch_size, 1, hidden_size)`, *optional*):
+            Sequence of hidden-states obtained by linearly projecting the audio end hidden states.
+    """
+
+    text_hidden_states: Optional[torch.Tensor] = None
+    seconds_start_hidden_states: Optional[torch.Tensor] = None
+    seconds_end_hidden_states: Optional[torch.Tensor] = None
+
+
+class StableAudioNumberConditioner(nn.Module):
+    """
+    A simple linear projection model to map numbers to a latent space.
+
+    Args:
+        number_embedding_dim (`int`):
+            Dimensionality of the number embeddings.
+        min_value (`int`):
+            The minimum value of the seconds number conditioning modules.
+        max_value (`int`):
+            The maximum value of the seconds number conditioning modules
+        internal_dim (`int`):
+            Dimensionality of the intermediate number hidden states.
+    """
+
+    def __init__(
+        self,
+        number_embedding_dim,
+        min_value,
+        max_value,
+        internal_dim: Optional[int] = 256,
+    ):
+        super().__init__()
+        self.time_positional_embedding = nn.Sequential(
+            StableAudioPositionalEmbedding(internal_dim),
+            nn.Linear(in_features=internal_dim + 1, out_features=number_embedding_dim),
+        )
+
+        self.number_embedding_dim = number_embedding_dim
+        self.min_value = min_value
+        self.max_value = max_value
+
+    def forward(
+        self,
+        floats: torch.Tensor,
+    ):
+        floats = floats.clamp(self.min_value, self.max_value)
+
+        normalized_floats = (floats - self.min_value) / (self.max_value - self.min_value)
+
+        # Cast floats to same type as embedder
+        embedder_dtype = next(self.time_positional_embedding.parameters()).dtype
+        normalized_floats = normalized_floats.to(embedder_dtype)
+
+        embedding = self.time_positional_embedding(normalized_floats)
+        float_embeds = embedding.view(-1, 1, self.number_embedding_dim)
+
+        return float_embeds
+
+
+class StableAudioProjectionModel(ModelMixin, ConfigMixin):
+    """
+    A simple linear projection model to map the conditioning values to a shared latent space.
+
+    Args:
+        text_encoder_dim (`int`):
+            Dimensionality of the text embeddings from the text encoder (T5).
+        conditioning_dim (`int`):
+            Dimensionality of the output conditioning tensors.
+        min_value (`int`):
+            The minimum value of the seconds number conditioning modules.
+        max_value (`int`):
+            The maximum value of the seconds number conditioning modules
+    """
+
+    @register_to_config
+    def __init__(self, text_encoder_dim, conditioning_dim, min_value, max_value):
+        super().__init__()
+        self.text_projection = (
+            nn.Identity() if conditioning_dim == text_encoder_dim else nn.Linear(text_encoder_dim, conditioning_dim)
+        )
+        self.start_number_conditioner = StableAudioNumberConditioner(conditioning_dim, min_value, max_value)
+        self.end_number_conditioner = StableAudioNumberConditioner(conditioning_dim, min_value, max_value)
+
+    def forward(
+        self,
+        text_hidden_states: Optional[torch.Tensor] = None,
+        start_seconds: Optional[torch.Tensor] = None,
+        end_seconds: Optional[torch.Tensor] = None,
+    ):
+        text_hidden_states = (
+            text_hidden_states if text_hidden_states is None else self.text_projection(text_hidden_states)
+        )
+        seconds_start_hidden_states = (
+            start_seconds if start_seconds is None else self.start_number_conditioner(start_seconds)
+        )
+        seconds_end_hidden_states = end_seconds if end_seconds is None else self.end_number_conditioner(end_seconds)
+
+        return StableAudioProjectionModelOutput(
+            text_hidden_states=text_hidden_states,
+            seconds_start_hidden_states=seconds_start_hidden_states,
+            seconds_end_hidden_states=seconds_end_hidden_states,
+        )
diff --git a/diffusers/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py b/diffusers/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fe082d88957dc4cf29c2f11c9a2d01fbba3252b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py
@@ -0,0 +1,745 @@
+# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+from transformers import (
+    T5EncoderModel,
+    T5Tokenizer,
+    T5TokenizerFast,
+)
+
+from ...models import AutoencoderOobleck, StableAudioDiTModel
+from ...models.embeddings import get_1d_rotary_pos_embed
+from ...schedulers import EDMDPMSolverMultistepScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+from .modeling_stable_audio import StableAudioProjectionModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import scipy
+        >>> import torch
+        >>> import soundfile as sf
+        >>> from diffusers import StableAudioPipeline
+
+        >>> repo_id = "stabilityai/stable-audio-open-1.0"
+        >>> pipe = StableAudioPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> # define the prompts
+        >>> prompt = "The sound of a hammer hitting a wooden surface."
+        >>> negative_prompt = "Low quality."
+
+        >>> # set the seed for generator
+        >>> generator = torch.Generator("cuda").manual_seed(0)
+
+        >>> # run the generation
+        >>> audio = pipe(
+        ...     prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     num_inference_steps=200,
+        ...     audio_end_in_s=10.0,
+        ...     num_waveforms_per_prompt=3,
+        ...     generator=generator,
+        ... ).audios
+
+        >>> output = audio[0].T.float().cpu().numpy()
+        >>> sf.write("hammer.wav", output, pipe.vae.sampling_rate)
+        ```
+"""
+
+
+class StableAudioPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-audio generation using StableAudio.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderOobleck`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.T5EncoderModel`]):
+            Frozen text-encoder. StableAudio uses the encoder of
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [google-t5/t5-base](https://huggingface.co/google-t5/t5-base) variant.
+        projection_model ([`StableAudioProjectionModel`]):
+            A trained model used to linearly project the hidden-states from the text encoder model and the start and
+            end seconds. The projected hidden-states from the encoder and the conditional seconds are concatenated to
+            give the input to the transformer model.
+        tokenizer ([`~transformers.T5Tokenizer`]):
+            Tokenizer to tokenize text for the frozen text-encoder.
+        transformer ([`StableAudioDiTModel`]):
+            A `StableAudioDiTModel` to denoise the encoded audio latents.
+        scheduler ([`EDMDPMSolverMultistepScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded audio latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->projection_model->transformer->vae"
+
+    def __init__(
+        self,
+        vae: AutoencoderOobleck,
+        text_encoder: T5EncoderModel,
+        projection_model: StableAudioProjectionModel,
+        tokenizer: Union[T5Tokenizer, T5TokenizerFast],
+        transformer: StableAudioDiTModel,
+        scheduler: EDMDPMSolverMultistepScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            projection_model=projection_model,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.rotary_embed_dim = self.transformer.config.attention_head_dim // 2
+
+    # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        negative_attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # 1. Tokenize text
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            attention_mask = text_inputs.attention_mask
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    f"The following part of your input was truncated because {self.text_encoder.config.model_type} can "
+                    f"only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            text_input_ids = text_input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+
+            # 2. Text encoder forward
+            self.text_encoder.eval()
+            prompt_embeds = self.text_encoder(
+                text_input_ids,
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        if do_classifier_free_guidance and negative_prompt is not None:
+            uncond_tokens: List[str]
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # 1. Tokenize text
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            uncond_input_ids = uncond_input.input_ids.to(device)
+            negative_attention_mask = uncond_input.attention_mask.to(device)
+
+            # 2. Text encoder forward
+            self.text_encoder.eval()
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input_ids,
+                attention_mask=negative_attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+            if negative_attention_mask is not None:
+                # set the masked tokens to the null embed
+                negative_prompt_embeds = torch.where(
+                    negative_attention_mask.to(torch.bool).unsqueeze(2), negative_prompt_embeds, 0.0
+                )
+
+        # 3. Project prompt_embeds and negative_prompt_embeds
+        if do_classifier_free_guidance and negative_prompt_embeds is not None:
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the negative and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            if attention_mask is not None and negative_attention_mask is None:
+                negative_attention_mask = torch.ones_like(attention_mask)
+            elif attention_mask is None and negative_attention_mask is not None:
+                attention_mask = torch.ones_like(negative_attention_mask)
+
+            if attention_mask is not None:
+                attention_mask = torch.cat([negative_attention_mask, attention_mask])
+
+        prompt_embeds = self.projection_model(
+            text_hidden_states=prompt_embeds,
+        ).text_hidden_states
+        if attention_mask is not None:
+            prompt_embeds = prompt_embeds * attention_mask.unsqueeze(-1).to(prompt_embeds.dtype)
+            prompt_embeds = prompt_embeds * attention_mask.unsqueeze(-1).to(prompt_embeds.dtype)
+
+        return prompt_embeds
+
+    def encode_duration(
+        self,
+        audio_start_in_s,
+        audio_end_in_s,
+        device,
+        do_classifier_free_guidance,
+        batch_size,
+    ):
+        audio_start_in_s = audio_start_in_s if isinstance(audio_start_in_s, list) else [audio_start_in_s]
+        audio_end_in_s = audio_end_in_s if isinstance(audio_end_in_s, list) else [audio_end_in_s]
+
+        if len(audio_start_in_s) == 1:
+            audio_start_in_s = audio_start_in_s * batch_size
+        if len(audio_end_in_s) == 1:
+            audio_end_in_s = audio_end_in_s * batch_size
+
+        # Cast the inputs to floats
+        audio_start_in_s = [float(x) for x in audio_start_in_s]
+        audio_start_in_s = torch.tensor(audio_start_in_s).to(device)
+
+        audio_end_in_s = [float(x) for x in audio_end_in_s]
+        audio_end_in_s = torch.tensor(audio_end_in_s).to(device)
+
+        projection_output = self.projection_model(
+            start_seconds=audio_start_in_s,
+            end_seconds=audio_end_in_s,
+        )
+        seconds_start_hidden_states = projection_output.seconds_start_hidden_states
+        seconds_end_hidden_states = projection_output.seconds_end_hidden_states
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we repeat the audio hidden states to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            seconds_start_hidden_states = torch.cat([seconds_start_hidden_states, seconds_start_hidden_states], dim=0)
+            seconds_end_hidden_states = torch.cat([seconds_end_hidden_states, seconds_end_hidden_states], dim=0)
+
+        return seconds_start_hidden_states, seconds_end_hidden_states
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        audio_start_in_s,
+        audio_end_in_s,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        attention_mask=None,
+        negative_attention_mask=None,
+        initial_audio_waveforms=None,
+        initial_audio_sampling_rate=None,
+    ):
+        if audio_end_in_s < audio_start_in_s:
+            raise ValueError(
+                f"`audio_end_in_s={audio_end_in_s}' must be higher than 'audio_start_in_s={audio_start_in_s}` but "
+            )
+
+        if (
+            audio_start_in_s < self.projection_model.config.min_value
+            or audio_start_in_s > self.projection_model.config.max_value
+        ):
+            raise ValueError(
+                f"`audio_start_in_s` must be greater than or equal to {self.projection_model.config.min_value}, and lower than or equal to {self.projection_model.config.max_value} but "
+                f"is {audio_start_in_s}."
+            )
+
+        if (
+            audio_end_in_s < self.projection_model.config.min_value
+            or audio_end_in_s > self.projection_model.config.max_value
+        ):
+            raise ValueError(
+                f"`audio_end_in_s` must be greater than or equal to {self.projection_model.config.min_value}, and lower than or equal to {self.projection_model.config.max_value} but "
+                f"is {audio_end_in_s}."
+            )
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and (prompt_embeds is None):
+            raise ValueError(
+                "Provide either `prompt`, or `prompt_embeds`. Cannot leave"
+                "`prompt` undefined without specifying `prompt_embeds`."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if attention_mask is not None and attention_mask.shape != prompt_embeds.shape[:2]:
+                raise ValueError(
+                    "`attention_mask should have the same batch size and sequence length as `prompt_embeds`, but got:"
+                    f"`attention_mask: {attention_mask.shape} != `prompt_embeds` {prompt_embeds.shape}"
+                )
+
+        if initial_audio_sampling_rate is None and initial_audio_waveforms is not None:
+            raise ValueError(
+                "`initial_audio_waveforms' is provided but the sampling rate is not. Make sure to pass `initial_audio_sampling_rate`."
+            )
+
+        if initial_audio_sampling_rate is not None and initial_audio_sampling_rate != self.vae.sampling_rate:
+            raise ValueError(
+                f"`initial_audio_sampling_rate` must be {self.vae.hop_length}' but is `{initial_audio_sampling_rate}`."
+                "Make sure to resample the `initial_audio_waveforms` and to correct the sampling rate. "
+            )
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_vae,
+        sample_size,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        initial_audio_waveforms=None,
+        num_waveforms_per_prompt=None,
+        audio_channels=None,
+    ):
+        shape = (batch_size, num_channels_vae, sample_size)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # encode the initial audio for use by the model
+        if initial_audio_waveforms is not None:
+            # check dimension
+            if initial_audio_waveforms.ndim == 2:
+                initial_audio_waveforms = initial_audio_waveforms.unsqueeze(1)
+            elif initial_audio_waveforms.ndim != 3:
+                raise ValueError(
+                    f"`initial_audio_waveforms` must be of shape `(batch_size, num_channels, audio_length)` or `(batch_size, audio_length)` but has `{initial_audio_waveforms.ndim}` dimensions"
+                )
+
+            audio_vae_length = self.transformer.config.sample_size * self.vae.hop_length
+            audio_shape = (batch_size // num_waveforms_per_prompt, audio_channels, audio_vae_length)
+
+            # check num_channels
+            if initial_audio_waveforms.shape[1] == 1 and audio_channels == 2:
+                initial_audio_waveforms = initial_audio_waveforms.repeat(1, 2, 1)
+            elif initial_audio_waveforms.shape[1] == 2 and audio_channels == 1:
+                initial_audio_waveforms = initial_audio_waveforms.mean(1, keepdim=True)
+
+            if initial_audio_waveforms.shape[:2] != audio_shape[:2]:
+                raise ValueError(
+                    f"`initial_audio_waveforms` must be of shape `(batch_size, num_channels, audio_length)` or `(batch_size, audio_length)` but is of shape `{initial_audio_waveforms.shape}`"
+                )
+
+            # crop or pad
+            audio_length = initial_audio_waveforms.shape[-1]
+            if audio_length < audio_vae_length:
+                logger.warning(
+                    f"The provided input waveform is shorter ({audio_length}) than the required audio length ({audio_vae_length}) of the model and will thus be padded."
+                )
+            elif audio_length > audio_vae_length:
+                logger.warning(
+                    f"The provided input waveform is longer ({audio_length}) than the required audio length ({audio_vae_length}) of the model and will thus be cropped."
+                )
+
+            audio = initial_audio_waveforms.new_zeros(audio_shape)
+            audio[:, :, : min(audio_length, audio_vae_length)] = initial_audio_waveforms[:, :, :audio_vae_length]
+
+            encoded_audio = self.vae.encode(audio).latent_dist.sample(generator)
+            encoded_audio = encoded_audio.repeat((num_waveforms_per_prompt, 1, 1))
+            latents = encoded_audio + latents
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        audio_end_in_s: Optional[float] = None,
+        audio_start_in_s: Optional[float] = 0.0,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 7.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_waveforms_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        initial_audio_waveforms: Optional[torch.Tensor] = None,
+        initial_audio_sampling_rate: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        negative_attention_mask: Optional[torch.LongTensor] = None,
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        output_type: Optional[str] = "pt",
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
+            audio_end_in_s (`float`, *optional*, defaults to 47.55):
+                Audio end index in seconds.
+            audio_start_in_s (`float`, *optional*, defaults to 0):
+                Audio start index in seconds.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                A higher guidance scale value encourages the model to generate audio that is closely linked to the text
+                `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
+                The number of waveforms to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for audio
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            initial_audio_waveforms (`torch.Tensor`, *optional*):
+                Optional initial audio waveforms to use as the initial audio waveform for generation. Must be of shape
+                `(batch_size, num_channels, audio_length)` or `(batch_size, audio_length)`, where `batch_size`
+                corresponds to the number of prompts passed to the model.
+            initial_audio_sampling_rate (`int`, *optional*):
+                Sampling rate of the `initial_audio_waveforms`, if they are provided. Must be the same as the model.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-computed text embeddings from the text encoder model. Can be used to easily tweak text inputs,
+                *e.g.* prompt weighting. If not provided, text embeddings will be computed from `prompt` input
+                argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-computed negative text embeddings from the text encoder model. Can be used to easily tweak text
+                inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
+                `negative_prompt` input argument.
+            attention_mask (`torch.LongTensor`, *optional*):
+                Pre-computed attention mask to be applied to the `prompt_embeds`. If not provided, attention mask will
+                be computed from `prompt` input argument.
+            negative_attention_mask (`torch.LongTensor`, *optional*):
+                Pre-computed attention mask to be applied to the `negative_text_audio_duration_embeds`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            output_type (`str`, *optional*, defaults to `"pt"`):
+                The output format of the generated audio. Choose between `"np"` to return a NumPy `np.ndarray` or
+                `"pt"` to return a PyTorch `torch.Tensor` object. Set to `"latent"` to return the latent diffusion
+                model (LDM) output.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated audio.
+        """
+        # 0. Convert audio input length from seconds to latent length
+        downsample_ratio = self.vae.hop_length
+
+        max_audio_length_in_s = self.transformer.config.sample_size * downsample_ratio / self.vae.config.sampling_rate
+        if audio_end_in_s is None:
+            audio_end_in_s = max_audio_length_in_s
+
+        if audio_end_in_s - audio_start_in_s > max_audio_length_in_s:
+            raise ValueError(
+                f"The total audio length requested ({audio_end_in_s-audio_start_in_s}s) is longer than the model maximum possible length ({max_audio_length_in_s}). Make sure that 'audio_end_in_s-audio_start_in_s<={max_audio_length_in_s}'."
+            )
+
+        waveform_start = int(audio_start_in_s * self.vae.config.sampling_rate)
+        waveform_end = int(audio_end_in_s * self.vae.config.sampling_rate)
+        waveform_length = int(self.transformer.config.sample_size)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            audio_start_in_s,
+            audio_end_in_s,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            attention_mask,
+            negative_attention_mask,
+            initial_audio_waveforms,
+            initial_audio_sampling_rate,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            attention_mask,
+            negative_attention_mask,
+        )
+
+        # Encode duration
+        seconds_start_hidden_states, seconds_end_hidden_states = self.encode_duration(
+            audio_start_in_s,
+            audio_end_in_s,
+            device,
+            do_classifier_free_guidance and (negative_prompt is not None or negative_prompt_embeds is not None),
+            batch_size,
+        )
+
+        # Create text_audio_duration_embeds and audio_duration_embeds
+        text_audio_duration_embeds = torch.cat(
+            [prompt_embeds, seconds_start_hidden_states, seconds_end_hidden_states], dim=1
+        )
+
+        audio_duration_embeds = torch.cat([seconds_start_hidden_states, seconds_end_hidden_states], dim=2)
+
+        # In case of classifier free guidance without negative prompt, we need to create unconditional embeddings and
+        # to concatenate it to the embeddings
+        if do_classifier_free_guidance and negative_prompt_embeds is None and negative_prompt is None:
+            negative_text_audio_duration_embeds = torch.zeros_like(
+                text_audio_duration_embeds, device=text_audio_duration_embeds.device
+            )
+            text_audio_duration_embeds = torch.cat(
+                [negative_text_audio_duration_embeds, text_audio_duration_embeds], dim=0
+            )
+            audio_duration_embeds = torch.cat([audio_duration_embeds, audio_duration_embeds], dim=0)
+
+        bs_embed, seq_len, hidden_size = text_audio_duration_embeds.shape
+        # duplicate audio_duration_embeds and text_audio_duration_embeds for each generation per prompt, using mps friendly method
+        text_audio_duration_embeds = text_audio_duration_embeds.repeat(1, num_waveforms_per_prompt, 1)
+        text_audio_duration_embeds = text_audio_duration_embeds.view(
+            bs_embed * num_waveforms_per_prompt, seq_len, hidden_size
+        )
+
+        audio_duration_embeds = audio_duration_embeds.repeat(1, num_waveforms_per_prompt, 1)
+        audio_duration_embeds = audio_duration_embeds.view(
+            bs_embed * num_waveforms_per_prompt, -1, audio_duration_embeds.shape[-1]
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_vae = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_waveforms_per_prompt,
+            num_channels_vae,
+            waveform_length,
+            text_audio_duration_embeds.dtype,
+            device,
+            generator,
+            latents,
+            initial_audio_waveforms,
+            num_waveforms_per_prompt,
+            audio_channels=self.vae.config.audio_channels,
+        )
+
+        # 6. Prepare extra step kwargs
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare rotary positional embedding
+        rotary_embedding = get_1d_rotary_pos_embed(
+            self.rotary_embed_dim,
+            latents.shape[2] + audio_duration_embeds.shape[1],
+            use_real=True,
+            repeat_interleave_real=False,
+        )
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.transformer(
+                    latent_model_input,
+                    t.unsqueeze(0),
+                    encoder_hidden_states=text_audio_duration_embeds,
+                    global_hidden_states=audio_duration_embeds,
+                    rotary_embedding=rotary_embedding,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 9. Post-processing
+        if not output_type == "latent":
+            audio = self.vae.decode(latents).sample
+        else:
+            return AudioPipelineOutput(audios=latents)
+
+        audio = audio[:, :, waveform_start:waveform_end]
+
+        if output_type == "np":
+            audio = audio.cpu().float().numpy()
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (audio,)
+
+        return AudioPipelineOutput(audios=audio)
diff --git a/diffusers/src/diffusers/pipelines/stable_cascade/__init__.py b/diffusers/src/diffusers/pipelines/stable_cascade/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5270cb94af01fd94ed6c8e76c243a86cad8ec348
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_cascade/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_cascade"] = ["StableCascadeDecoderPipeline"]
+    _import_structure["pipeline_stable_cascade_combined"] = ["StableCascadeCombinedPipeline"]
+    _import_structure["pipeline_stable_cascade_prior"] = ["StableCascadePriorPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_stable_cascade import StableCascadeDecoderPipeline
+        from .pipeline_stable_cascade_combined import StableCascadeCombinedPipeline
+        from .pipeline_stable_cascade_prior import StableCascadePriorPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/diffusers/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py
new file mode 100644
index 0000000000000000000000000000000000000000..111ccc40c5a551997f7ca9f83f44d68e2c606717
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py
@@ -0,0 +1,528 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...models import StableCascadeUNet
+from ...schedulers import DDPMWuerstchenScheduler
+from ...utils import is_torch_version, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ..wuerstchen.modeling_paella_vq_model import PaellaVQModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableCascadePriorPipeline, StableCascadeDecoderPipeline
+
+        >>> prior_pipe = StableCascadePriorPipeline.from_pretrained(
+        ...     "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16
+        ... ).to("cuda")
+        >>> gen_pipe = StableCascadeDecoderPipeline.from_pretrain(
+        ...     "stabilityai/stable-cascade", torch_dtype=torch.float16
+        ... ).to("cuda")
+
+        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
+        >>> prior_output = pipe(prompt)
+        >>> images = gen_pipe(prior_output.image_embeddings, prompt=prompt)
+        ```
+"""
+
+
+class StableCascadeDecoderPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating images from the Stable Cascade model.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        tokenizer (`CLIPTokenizer`):
+            The CLIP tokenizer.
+        text_encoder (`CLIPTextModel`):
+            The CLIP text encoder.
+        decoder ([`StableCascadeUNet`]):
+            The Stable Cascade decoder unet.
+        vqgan ([`PaellaVQModel`]):
+            The VQGAN model.
+        scheduler ([`DDPMWuerstchenScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+        latent_dim_scale (float, `optional`, defaults to 10.67):
+            Multiplier to determine the VQ latent space size from the image embeddings. If the image embeddings are
+            height=24 and width=24, the VQ latent shape needs to be height=int(24*10.67)=256 and
+            width=int(24*10.67)=256 in order to match the training conditions.
+    """
+
+    unet_name = "decoder"
+    text_encoder_name = "text_encoder"
+    model_cpu_offload_seq = "text_encoder->decoder->vqgan"
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds_pooled",
+        "negative_prompt_embeds",
+        "image_embeddings",
+    ]
+
+    def __init__(
+        self,
+        decoder: StableCascadeUNet,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModel,
+        scheduler: DDPMWuerstchenScheduler,
+        vqgan: PaellaVQModel,
+        latent_dim_scale: float = 10.67,
+    ) -> None:
+        super().__init__()
+        self.register_modules(
+            decoder=decoder,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            scheduler=scheduler,
+            vqgan=vqgan,
+        )
+        self.register_to_config(latent_dim_scale=latent_dim_scale)
+
+    def prepare_latents(
+        self, batch_size, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, scheduler
+    ):
+        _, channels, height, width = image_embeddings.shape
+        latents_shape = (
+            batch_size * num_images_per_prompt,
+            4,
+            int(height * self.config.latent_dim_scale),
+            int(width * self.config.latent_dim_scale),
+        )
+
+        if latents is None:
+            latents = randn_tensor(latents_shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def encode_prompt(
+        self,
+        device,
+        batch_size,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        prompt=None,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
+    ):
+        if prompt_embeds is None:
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            attention_mask = text_inputs.attention_mask
+
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+                attention_mask = attention_mask[:, : self.tokenizer.model_max_length]
+
+            text_encoder_output = self.text_encoder(
+                text_input_ids.to(device), attention_mask=attention_mask.to(device), output_hidden_states=True
+            )
+            prompt_embeds = text_encoder_output.hidden_states[-1]
+            if prompt_embeds_pooled is None:
+                prompt_embeds_pooled = text_encoder_output.text_embeds.unsqueeze(1)
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+        prompt_embeds_pooled = prompt_embeds_pooled.to(dtype=self.text_encoder.dtype, device=device)
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        prompt_embeds_pooled = prompt_embeds_pooled.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if negative_prompt_embeds is None and do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=uncond_input.attention_mask.to(device),
+                output_hidden_states=True,
+            )
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.hidden_states[-1]
+            negative_prompt_embeds_pooled = negative_prompt_embeds_text_encoder_output.text_embeds.unsqueeze(1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            seq_len = negative_prompt_embeds_pooled.shape[1]
+            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.to(
+                dtype=self.text_encoder.dtype, device=device
+            )
+            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            # done duplicates
+
+        return prompt_embeds, prompt_embeds_pooled, negative_prompt_embeds, negative_prompt_embeds_pooled
+
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    def get_timestep_ratio_conditioning(self, t, alphas_cumprod):
+        s = torch.tensor([0.008])
+        clamp_range = [0, 1]
+        min_var = torch.cos(s / (1 + s) * torch.pi * 0.5) ** 2
+        var = alphas_cumprod[t]
+        var = var.clamp(*clamp_range)
+        s, min_var = s.to(var.device), min_var.to(var.device)
+        ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
+        return ratio
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image_embeddings: Union[torch.Tensor, List[torch.Tensor]],
+        prompt: Union[str, List[str]] = None,
+        num_inference_steps: int = 10,
+        guidance_scale: float = 0.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image_embedding (`torch.Tensor` or `List[torch.Tensor]`):
+                Image Embeddings either extracted from an image or generated by a Prior Model.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            num_inference_steps (`int`, *optional*, defaults to 12):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 0.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `decoder_guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
+                `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely
+                linked to the text `prompt`, usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `decoder_guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_embeds_pooled (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_prompt_embeds_pooled (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
+                input argument.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True,
+            otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image
+            embeddings.
+        """
+
+        # 0. Define commonly used variables
+        device = self._execution_device
+        dtype = self.decoder.dtype
+        self._guidance_scale = guidance_scale
+        if is_torch_version("<", "2.2.0") and dtype == torch.bfloat16:
+            raise ValueError("`StableCascadeDecoderPipeline` requires torch>=2.2.0 when using `torch.bfloat16` dtype.")
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+        if isinstance(image_embeddings, list):
+            image_embeddings = torch.cat(image_embeddings, dim=0)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Compute the effective number of images per prompt
+        # We must account for the fact that the image embeddings from the prior can be generated with num_images_per_prompt > 1
+        # This results in a case where a single prompt is associated with multiple image embeddings
+        # Divide the number of image embeddings by the batch size to determine if this is the case.
+        num_images_per_prompt = num_images_per_prompt * (image_embeddings.shape[0] // batch_size)
+
+        # 2. Encode caption
+        if prompt_embeds is None and negative_prompt_embeds is None:
+            _, prompt_embeds_pooled, _, negative_prompt_embeds_pooled = self.encode_prompt(
+                prompt=prompt,
+                device=device,
+                batch_size=batch_size,
+                num_images_per_prompt=num_images_per_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                negative_prompt=negative_prompt,
+                prompt_embeds=prompt_embeds,
+                prompt_embeds_pooled=prompt_embeds_pooled,
+                negative_prompt_embeds=negative_prompt_embeds,
+                negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
+            )
+
+        # The pooled embeds from the prior are pooled again before being passed to the decoder
+        prompt_embeds_pooled = (
+            torch.cat([prompt_embeds_pooled, negative_prompt_embeds_pooled])
+            if self.do_classifier_free_guidance
+            else prompt_embeds_pooled
+        )
+        effnet = (
+            torch.cat([image_embeddings, torch.zeros_like(image_embeddings)])
+            if self.do_classifier_free_guidance
+            else image_embeddings
+        )
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latents
+        latents = self.prepare_latents(
+            batch_size, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
+        )
+
+        if isinstance(self.scheduler, DDPMWuerstchenScheduler):
+            timesteps = timesteps[:-1]
+        else:
+            if hasattr(self.scheduler.config, "clip_sample") and self.scheduler.config.clip_sample:
+                self.scheduler.config.clip_sample = False  # disample sample clipping
+                logger.warning(" set `clip_sample` to be False")
+
+        # 6. Run denoising loop
+        if hasattr(self.scheduler, "betas"):
+            alphas = 1.0 - self.scheduler.betas
+            alphas_cumprod = torch.cumprod(alphas, dim=0)
+        else:
+            alphas_cumprod = []
+
+        self._num_timesteps = len(timesteps)
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            if not isinstance(self.scheduler, DDPMWuerstchenScheduler):
+                if len(alphas_cumprod) > 0:
+                    timestep_ratio = self.get_timestep_ratio_conditioning(t.long().cpu(), alphas_cumprod)
+                    timestep_ratio = timestep_ratio.expand(latents.size(0)).to(dtype).to(device)
+                else:
+                    timestep_ratio = t.float().div(self.scheduler.timesteps[-1]).expand(latents.size(0)).to(dtype)
+            else:
+                timestep_ratio = t.expand(latents.size(0)).to(dtype)
+
+            # 7. Denoise latents
+            predicted_latents = self.decoder(
+                sample=torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents,
+                timestep_ratio=torch.cat([timestep_ratio] * 2) if self.do_classifier_free_guidance else timestep_ratio,
+                clip_text_pooled=prompt_embeds_pooled,
+                effnet=effnet,
+                return_dict=False,
+            )[0]
+
+            # 8. Check for classifier free guidance and apply it
+            if self.do_classifier_free_guidance:
+                predicted_latents_text, predicted_latents_uncond = predicted_latents.chunk(2)
+                predicted_latents = torch.lerp(predicted_latents_uncond, predicted_latents_text, self.guidance_scale)
+
+            # 9. Renoise latents to next timestep
+            if not isinstance(self.scheduler, DDPMWuerstchenScheduler):
+                timestep_ratio = t
+            latents = self.scheduler.step(
+                model_output=predicted_latents,
+                timestep=timestep_ratio,
+                sample=latents,
+                generator=generator,
+            ).prev_sample
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+        if output_type not in ["pt", "np", "pil", "latent"]:
+            raise ValueError(
+                f"Only the output types `pt`, `np`, `pil` and `latent` are supported not output_type={output_type}"
+            )
+
+        if not output_type == "latent":
+            # 10. Scale and decode the image latents with vq-vae
+            latents = self.vqgan.config.scale_factor * latents
+            images = self.vqgan.decode(latents).sample.clamp(0, 1)
+            if output_type == "np":
+                images = images.permute(0, 2, 3, 1).cpu().float().numpy()  # float() as bfloat16-> numpy doesnt work
+            elif output_type == "pil":
+                images = images.permute(0, 2, 3, 1).cpu().float().numpy()  # float() as bfloat16-> numpy doesnt work
+                images = self.numpy_to_pil(images)
+        else:
+            images = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return images
+        return ImagePipelineOutput(images)
diff --git a/diffusers/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/diffusers/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..6724b60cc424aa3b4557d522b08af4302dec2da0
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py
@@ -0,0 +1,315 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Dict, List, Optional, Union
+
+import PIL
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...models import StableCascadeUNet
+from ...schedulers import DDPMWuerstchenScheduler
+from ...utils import is_torch_version, replace_example_docstring
+from ..pipeline_utils import DiffusionPipeline
+from ..wuerstchen.modeling_paella_vq_model import PaellaVQModel
+from .pipeline_stable_cascade import StableCascadeDecoderPipeline
+from .pipeline_stable_cascade_prior import StableCascadePriorPipeline
+
+
+TEXT2IMAGE_EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableCascadeCombinedPipeline
+
+        >>> pipe = StableCascadeCombinedPipeline.from_pretrained(
+        ...     "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
+        >>> images = pipe(prompt=prompt)
+        ```
+"""
+
+
+class StableCascadeCombinedPipeline(DiffusionPipeline):
+    """
+    Combined Pipeline for text-to-image generation using Stable Cascade.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        tokenizer (`CLIPTokenizer`):
+            The decoder tokenizer to be used for text inputs.
+        text_encoder (`CLIPTextModel`):
+            The decoder text encoder to be used for text inputs.
+        decoder (`StableCascadeUNet`):
+            The decoder model to be used for decoder image generation pipeline.
+        scheduler (`DDPMWuerstchenScheduler`):
+            The scheduler to be used for decoder image generation pipeline.
+        vqgan (`PaellaVQModel`):
+            The VQGAN model to be used for decoder image generation pipeline.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        prior_prior (`StableCascadeUNet`):
+            The prior model to be used for prior pipeline.
+        prior_scheduler (`DDPMWuerstchenScheduler`):
+            The scheduler to be used for prior pipeline.
+    """
+
+    _load_connected_pipes = True
+    _optional_components = ["prior_feature_extractor", "prior_image_encoder"]
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModel,
+        decoder: StableCascadeUNet,
+        scheduler: DDPMWuerstchenScheduler,
+        vqgan: PaellaVQModel,
+        prior_prior: StableCascadeUNet,
+        prior_text_encoder: CLIPTextModel,
+        prior_tokenizer: CLIPTokenizer,
+        prior_scheduler: DDPMWuerstchenScheduler,
+        prior_feature_extractor: Optional[CLIPImageProcessor] = None,
+        prior_image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            decoder=decoder,
+            scheduler=scheduler,
+            vqgan=vqgan,
+            prior_text_encoder=prior_text_encoder,
+            prior_tokenizer=prior_tokenizer,
+            prior_prior=prior_prior,
+            prior_scheduler=prior_scheduler,
+            prior_feature_extractor=prior_feature_extractor,
+            prior_image_encoder=prior_image_encoder,
+        )
+        self.prior_pipe = StableCascadePriorPipeline(
+            prior=prior_prior,
+            text_encoder=prior_text_encoder,
+            tokenizer=prior_tokenizer,
+            scheduler=prior_scheduler,
+            image_encoder=prior_image_encoder,
+            feature_extractor=prior_feature_extractor,
+        )
+        self.decoder_pipe = StableCascadeDecoderPipeline(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            decoder=decoder,
+            scheduler=scheduler,
+            vqgan=vqgan,
+        )
+
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        """
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
+        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
+        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
+        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+
+    def progress_bar(self, iterable=None, total=None):
+        self.prior_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
+
+    def set_progress_bar_config(self, **kwargs):
+        self.prior_pipe.set_progress_bar_config(**kwargs)
+        self.decoder_pipe.set_progress_bar_config(**kwargs)
+
+    @torch.no_grad()
+    @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        images: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None,
+        height: int = 512,
+        width: int = 512,
+        prior_num_inference_steps: int = 60,
+        prior_guidance_scale: float = 4.0,
+        num_inference_steps: int = 12,
+        decoder_guidance_scale: float = 0.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation for the prior and decoder.
+            images (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, *optional*):
+                The images to guide the image generation for the prior.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, text embeddings will be generated from `prompt` input argument.
+            prompt_embeds_pooled (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.*
+                prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            negative_prompt_embeds_pooled (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.*
+                prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `prior_guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
+                `prior_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked
+                to the text `prompt`, usually at the expense of lower image quality.
+            prior_num_inference_steps (`Union[int, Dict[float, int]]`, *optional*, defaults to 60):
+                The number of prior denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. For more specific timestep spacing, you can pass customized
+                `prior_timesteps`
+            num_inference_steps (`int`, *optional*, defaults to 12):
+                The number of decoder denoising steps. More denoising steps usually lead to a higher quality image at
+                the expense of slower inference. For more specific timestep spacing, you can pass customized
+                `timesteps`
+            decoder_guidance_scale (`float`, *optional*, defaults to 0.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            prior_callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `prior_callback_on_step_end(self: DiffusionPipeline, step: int, timestep:
+                int, callback_kwargs: Dict)`.
+            prior_callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the
+                list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in
+                the `._callback_tensor_inputs` attribute of your pipeline class.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True,
+            otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        dtype = self.decoder_pipe.decoder.dtype
+        if is_torch_version("<", "2.2.0") and dtype == torch.bfloat16:
+            raise ValueError(
+                "`StableCascadeCombinedPipeline` requires torch>=2.2.0 when using `torch.bfloat16` dtype."
+            )
+
+        prior_outputs = self.prior_pipe(
+            prompt=prompt if prompt_embeds is None else None,
+            images=images,
+            height=height,
+            width=width,
+            num_inference_steps=prior_num_inference_steps,
+            guidance_scale=prior_guidance_scale,
+            negative_prompt=negative_prompt if negative_prompt_embeds is None else None,
+            prompt_embeds=prompt_embeds,
+            prompt_embeds_pooled=prompt_embeds_pooled,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
+            num_images_per_prompt=num_images_per_prompt,
+            generator=generator,
+            latents=latents,
+            output_type="pt",
+            return_dict=True,
+            callback_on_step_end=prior_callback_on_step_end,
+            callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs,
+        )
+        image_embeddings = prior_outputs.image_embeddings
+        prompt_embeds = prior_outputs.get("prompt_embeds", None)
+        prompt_embeds_pooled = prior_outputs.get("prompt_embeds_pooled", None)
+        negative_prompt_embeds = prior_outputs.get("negative_prompt_embeds", None)
+        negative_prompt_embeds_pooled = prior_outputs.get("negative_prompt_embeds_pooled", None)
+
+        outputs = self.decoder_pipe(
+            image_embeddings=image_embeddings,
+            prompt=prompt if prompt_embeds is None else None,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=decoder_guidance_scale,
+            negative_prompt=negative_prompt if negative_prompt_embeds is None else None,
+            prompt_embeds=prompt_embeds,
+            prompt_embeds_pooled=prompt_embeds_pooled,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
+            generator=generator,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback_on_step_end=callback_on_step_end,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+
+        return outputs
diff --git a/diffusers/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py b/diffusers/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..058dbf6b0797fce082528a2eae32e3221cb37c3c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py
@@ -0,0 +1,639 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from math import ceil
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...models import StableCascadeUNet
+from ...schedulers import DDPMWuerstchenScheduler
+from ...utils import BaseOutput, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+DEFAULT_STAGE_C_TIMESTEPS = list(np.linspace(1.0, 2 / 3, 20)) + list(np.linspace(2 / 3, 0.0, 11))[1:]
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableCascadePriorPipeline
+
+        >>> prior_pipe = StableCascadePriorPipeline.from_pretrained(
+        ...     "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16
+        ... ).to("cuda")
+
+        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
+        >>> prior_output = pipe(prompt)
+        ```
+"""
+
+
+@dataclass
+class StableCascadePriorPipelineOutput(BaseOutput):
+    """
+    Output class for WuerstchenPriorPipeline.
+
+    Args:
+        image_embeddings (`torch.Tensor` or `np.ndarray`)
+            Prior image embeddings for text prompt
+        prompt_embeds (`torch.Tensor`):
+            Text embeddings for the prompt.
+        negative_prompt_embeds (`torch.Tensor`):
+            Text embeddings for the negative prompt.
+    """
+
+    image_embeddings: Union[torch.Tensor, np.ndarray]
+    prompt_embeds: Union[torch.Tensor, np.ndarray]
+    prompt_embeds_pooled: Union[torch.Tensor, np.ndarray]
+    negative_prompt_embeds: Union[torch.Tensor, np.ndarray]
+    negative_prompt_embeds_pooled: Union[torch.Tensor, np.ndarray]
+
+
+class StableCascadePriorPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating image prior for Stable Cascade.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        prior ([`StableCascadeUNet`]):
+            The Stable Cascade prior to approximate the image embedding from the text and/or image embedding.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder
+            ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        scheduler ([`DDPMWuerstchenScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+        resolution_multiple ('float', *optional*, defaults to 42.67):
+            Default resolution for multiple images generated.
+    """
+
+    unet_name = "prior"
+    text_encoder_name = "text_encoder"
+    model_cpu_offload_seq = "image_encoder->text_encoder->prior"
+    _optional_components = ["image_encoder", "feature_extractor"]
+    _callback_tensor_inputs = ["latents", "text_encoder_hidden_states", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModelWithProjection,
+        prior: StableCascadeUNet,
+        scheduler: DDPMWuerstchenScheduler,
+        resolution_multiple: float = 42.67,
+        feature_extractor: Optional[CLIPImageProcessor] = None,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+    ) -> None:
+        super().__init__()
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            prior=prior,
+            scheduler=scheduler,
+        )
+        self.register_to_config(resolution_multiple=resolution_multiple)
+
+    def prepare_latents(
+        self, batch_size, height, width, num_images_per_prompt, dtype, device, generator, latents, scheduler
+    ):
+        latent_shape = (
+            num_images_per_prompt * batch_size,
+            self.prior.config.in_channels,
+            ceil(height / self.config.resolution_multiple),
+            ceil(width / self.config.resolution_multiple),
+        )
+
+        if latents is None:
+            latents = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != latent_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latent_shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def encode_prompt(
+        self,
+        device,
+        batch_size,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        prompt=None,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
+    ):
+        if prompt_embeds is None:
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            attention_mask = text_inputs.attention_mask
+
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+                attention_mask = attention_mask[:, : self.tokenizer.model_max_length]
+
+            text_encoder_output = self.text_encoder(
+                text_input_ids.to(device), attention_mask=attention_mask.to(device), output_hidden_states=True
+            )
+            prompt_embeds = text_encoder_output.hidden_states[-1]
+            if prompt_embeds_pooled is None:
+                prompt_embeds_pooled = text_encoder_output.text_embeds.unsqueeze(1)
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+        prompt_embeds_pooled = prompt_embeds_pooled.to(dtype=self.text_encoder.dtype, device=device)
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        prompt_embeds_pooled = prompt_embeds_pooled.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if negative_prompt_embeds is None and do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=uncond_input.attention_mask.to(device),
+                output_hidden_states=True,
+            )
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.hidden_states[-1]
+            negative_prompt_embeds_pooled = negative_prompt_embeds_text_encoder_output.text_embeds.unsqueeze(1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            seq_len = negative_prompt_embeds_pooled.shape[1]
+            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.to(
+                dtype=self.text_encoder.dtype, device=device
+            )
+            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            # done duplicates
+
+        return prompt_embeds, prompt_embeds_pooled, negative_prompt_embeds, negative_prompt_embeds_pooled
+
+    def encode_image(self, images, device, dtype, batch_size, num_images_per_prompt):
+        image_embeds = []
+        for image in images:
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+            image = image.to(device=device, dtype=dtype)
+            image_embed = self.image_encoder(image).image_embeds.unsqueeze(1)
+            image_embeds.append(image_embed)
+        image_embeds = torch.cat(image_embeds, dim=1)
+
+        image_embeds = image_embeds.repeat(batch_size * num_images_per_prompt, 1, 1)
+        negative_image_embeds = torch.zeros_like(image_embeds)
+
+        return image_embeds, negative_image_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        images=None,
+        image_embeds=None,
+        negative_prompt=None,
+        prompt_embeds=None,
+        prompt_embeds_pooled=None,
+        negative_prompt_embeds=None,
+        negative_prompt_embeds_pooled=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and prompt_embeds_pooled is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `prompt_embeds_pooled` must also be provided. Make sure to generate `prompt_embeds_pooled` from the same text encoder that was used to generate `prompt_embeds`"
+            )
+
+        if negative_prompt_embeds is not None and negative_prompt_embeds_pooled is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_pooled` must also be provided. Make sure to generate `prompt_embeds_pooled` from the same text encoder that was used to generate `prompt_embeds`"
+            )
+
+        if prompt_embeds_pooled is not None and negative_prompt_embeds_pooled is not None:
+            if prompt_embeds_pooled.shape != negative_prompt_embeds_pooled.shape:
+                raise ValueError(
+                    "`prompt_embeds_pooled` and `negative_prompt_embeds_pooled` must have the same shape when passed"
+                    f"directly, but got: `prompt_embeds_pooled` {prompt_embeds_pooled.shape} !="
+                    f"`negative_prompt_embeds_pooled` {negative_prompt_embeds_pooled.shape}."
+                )
+
+        if image_embeds is not None and images is not None:
+            raise ValueError(
+                f"Cannot forward both `images`: {images} and `image_embeds`: {image_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+
+        if images:
+            for i, image in enumerate(images):
+                if not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image):
+                    raise TypeError(
+                        f"'images' must contain images of type 'torch.Tensor' or 'PIL.Image.Image, but got"
+                        f"{type(image)} for image number {i}."
+                    )
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    def get_timestep_ratio_conditioning(self, t, alphas_cumprod):
+        s = torch.tensor([0.008])
+        clamp_range = [0, 1]
+        min_var = torch.cos(s / (1 + s) * torch.pi * 0.5) ** 2
+        var = alphas_cumprod[t]
+        var = var.clamp(*clamp_range)
+        s, min_var = s.to(var.device), min_var.to(var.device)
+        ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
+        return ratio
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        images: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None,
+        height: int = 1024,
+        width: int = 1024,
+        num_inference_steps: int = 20,
+        timesteps: List[float] = None,
+        guidance_scale: float = 4.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pt",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 1024):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 1024):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 60):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 8.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `decoder_guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
+                `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely
+                linked to the text `prompt`, usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `decoder_guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_embeds_pooled (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_prompt_embeds_pooled (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
+                input argument.
+            image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If
+                not provided, image embeddings will be generated from `image` input argument if existing.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if `return_dict` is
+            True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image
+            embeddings.
+        """
+
+        # 0. Define commonly used variables
+        device = self._execution_device
+        dtype = next(self.prior.parameters()).dtype
+        self._guidance_scale = guidance_scale
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            images=images,
+            image_embeds=image_embeds,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            prompt_embeds_pooled=prompt_embeds_pooled,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+
+        # 2. Encode caption + images
+        (
+            prompt_embeds,
+            prompt_embeds_pooled,
+            negative_prompt_embeds,
+            negative_prompt_embeds_pooled,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            batch_size=batch_size,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            prompt_embeds_pooled=prompt_embeds_pooled,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
+        )
+
+        if images is not None:
+            image_embeds_pooled, uncond_image_embeds_pooled = self.encode_image(
+                images=images,
+                device=device,
+                dtype=dtype,
+                batch_size=batch_size,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+        elif image_embeds is not None:
+            image_embeds_pooled = image_embeds.repeat(batch_size * num_images_per_prompt, 1, 1)
+            uncond_image_embeds_pooled = torch.zeros_like(image_embeds_pooled)
+        else:
+            image_embeds_pooled = torch.zeros(
+                batch_size * num_images_per_prompt,
+                1,
+                self.prior.config.clip_image_in_channels,
+                device=device,
+                dtype=dtype,
+            )
+            uncond_image_embeds_pooled = torch.zeros(
+                batch_size * num_images_per_prompt,
+                1,
+                self.prior.config.clip_image_in_channels,
+                device=device,
+                dtype=dtype,
+            )
+
+        if self.do_classifier_free_guidance:
+            image_embeds = torch.cat([image_embeds_pooled, uncond_image_embeds_pooled], dim=0)
+        else:
+            image_embeds = image_embeds_pooled
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        text_encoder_hidden_states = (
+            torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds
+        )
+        text_encoder_pooled = (
+            torch.cat([prompt_embeds_pooled, negative_prompt_embeds_pooled])
+            if negative_prompt_embeds is not None
+            else prompt_embeds_pooled
+        )
+
+        # 4. Prepare and set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latents
+        latents = self.prepare_latents(
+            batch_size, height, width, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
+        )
+
+        if isinstance(self.scheduler, DDPMWuerstchenScheduler):
+            timesteps = timesteps[:-1]
+        else:
+            if hasattr(self.scheduler.config, "clip_sample") and self.scheduler.config.clip_sample:
+                self.scheduler.config.clip_sample = False  # disample sample clipping
+                logger.warning(" set `clip_sample` to be False")
+        # 6. Run denoising loop
+        if hasattr(self.scheduler, "betas"):
+            alphas = 1.0 - self.scheduler.betas
+            alphas_cumprod = torch.cumprod(alphas, dim=0)
+        else:
+            alphas_cumprod = []
+
+        self._num_timesteps = len(timesteps)
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            if not isinstance(self.scheduler, DDPMWuerstchenScheduler):
+                if len(alphas_cumprod) > 0:
+                    timestep_ratio = self.get_timestep_ratio_conditioning(t.long().cpu(), alphas_cumprod)
+                    timestep_ratio = timestep_ratio.expand(latents.size(0)).to(dtype).to(device)
+                else:
+                    timestep_ratio = t.float().div(self.scheduler.timesteps[-1]).expand(latents.size(0)).to(dtype)
+            else:
+                timestep_ratio = t.expand(latents.size(0)).to(dtype)
+            # 7. Denoise image embeddings
+            predicted_image_embedding = self.prior(
+                sample=torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents,
+                timestep_ratio=torch.cat([timestep_ratio] * 2) if self.do_classifier_free_guidance else timestep_ratio,
+                clip_text_pooled=text_encoder_pooled,
+                clip_text=text_encoder_hidden_states,
+                clip_img=image_embeds,
+                return_dict=False,
+            )[0]
+
+            # 8. Check for classifier free guidance and apply it
+            if self.do_classifier_free_guidance:
+                predicted_image_embedding_text, predicted_image_embedding_uncond = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = torch.lerp(
+                    predicted_image_embedding_uncond, predicted_image_embedding_text, self.guidance_scale
+                )
+
+            # 9. Renoise latents to next timestep
+            if not isinstance(self.scheduler, DDPMWuerstchenScheduler):
+                timestep_ratio = t
+            latents = self.scheduler.step(
+                model_output=predicted_image_embedding, timestep=timestep_ratio, sample=latents, generator=generator
+            ).prev_sample
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if output_type == "np":
+            latents = latents.cpu().float().numpy()  # float() as bfloat16-> numpy doesnt work
+            prompt_embeds = prompt_embeds.cpu().float().numpy()  # float() as bfloat16-> numpy doesnt work
+            negative_prompt_embeds = (
+                negative_prompt_embeds.cpu().float().numpy() if negative_prompt_embeds is not None else None
+            )  # float() as bfloat16-> numpy doesnt work
+
+        if not return_dict:
+            return (
+                latents,
+                prompt_embeds,
+                prompt_embeds_pooled,
+                negative_prompt_embeds,
+                negative_prompt_embeds_pooled,
+            )
+
+        return StableCascadePriorPipelineOutput(
+            image_embeddings=latents,
+            prompt_embeds=prompt_embeds,
+            prompt_embeds_pooled=prompt_embeds_pooled,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
+        )
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/README.md b/diffusers/src/diffusers/pipelines/stable_diffusion/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5b229fddadd5d55134162b796dea2feecf20503c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/README.md
@@ -0,0 +1,176 @@
+# Stable Diffusion
+
+## Overview
+
+Stable Diffusion was proposed in [Stable Diffusion Announcement](https://stability.ai/blog/stable-diffusion-announcement) by Patrick Esser and Robin Rombach and the Stability AI team.
+
+The summary of the model is the following:
+
+*Stable Diffusion is a text-to-image model that will empower billions of people to create stunning art within seconds. It is a breakthrough in speed and quality meaning that it can run on consumer GPUs. You can see some of the amazing output that has been created by this model without pre or post-processing on this page. The model itself builds upon the work of the team at CompVis and Runway in their widely used latent diffusion model combined with insights from the conditional diffusion models by our lead generative AI developer Katherine Crowson, Dall-E 2 by Open AI, Imagen by Google Brain and many others. We are delighted that AI media generation is a cooperative field and hope it can continue this way to bring the gift of creativity to all.*
+
+## Tips:
+
+- Stable Diffusion has the same architecture as [Latent Diffusion](https://arxiv.org/abs/2112.10752) but uses a frozen CLIP Text Encoder instead of training the text encoder jointly with the diffusion model.
+- An in-detail explanation of the Stable Diffusion model can be found under [Stable Diffusion with 🧨 Diffusers](https://huggingface.co/blog/stable_diffusion).
+- If you don't want to rely on the Hugging Face Hub and having to pass a authentication token, you can
+download the weights with `git lfs install; git clone https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5` and instead pass the local path to the cloned folder to `from_pretrained` as shown below.
+- Stable Diffusion can work with a variety of different samplers as is shown below.
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_stable_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [pipeline_stable_diffusion_img2img](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [pipeline_stable_diffusion_inpaint](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+
+## Examples:
+
+### Using Stable Diffusion without being logged into the Hub.
+
+If you want to download the model weights using a single Python line, you need to be logged in via `huggingface-cli login`.
+
+```python
+from diffusers import DiffusionPipeline
+
+pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+```
+
+This however can make it difficult to build applications on top of `diffusers` as you will always have to pass the token around. A potential way to solve this issue is by downloading the weights to a local path `"./stable-diffusion-v1-5"`:
+
+```
+git lfs install
+git clone https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
+```
+
+and simply passing the local path to `from_pretrained`:
+
+```python
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
+```
+
+### Text-to-Image with default PLMS scheduler
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+
+image.save("astronaut_rides_horse.png")
+```
+
+### Text-to-Image with DDIM scheduler
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline, DDIMScheduler
+
+scheduler =  DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    scheduler=scheduler,
+).to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+
+image.save("astronaut_rides_horse.png")
+```
+
+### Text-to-Image with K-LMS scheduler
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
+
+lms = LMSDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    scheduler=lms,
+).to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+
+image.save("astronaut_rides_horse.png")
+```
+
+### CycleDiffusion using Stable Diffusion and DDIM scheduler
+
+```python
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from diffusers import CycleDiffusionPipeline, DDIMScheduler
+
+
+# load the scheduler. CycleDiffusion only supports stochastic schedulers.
+
+# load the pipeline
+# make sure you're logged in with `huggingface-cli login`
+model_id_or_path = "CompVis/stable-diffusion-v1-4"
+scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
+pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
+
+# let's download an initial image
+url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+init_image.save("horse.png")
+
+# let's specify a prompt
+source_prompt = "An astronaut riding a horse"
+prompt = "An astronaut riding an elephant"
+
+# call the pipeline
+image = pipe(
+    prompt=prompt,
+    source_prompt=source_prompt,
+    image=init_image,
+    num_inference_steps=100,
+    eta=0.1,
+    strength=0.8,
+    guidance_scale=2,
+    source_guidance_scale=1,
+).images[0]
+
+image.save("horse_to_elephant.png")
+
+# let's try another example
+# See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion
+url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+init_image.save("black.png")
+
+source_prompt = "A black colored car"
+prompt = "A blue colored car"
+
+# call the pipeline
+torch.manual_seed(0)
+image = pipe(
+    prompt=prompt,
+    source_prompt=source_prompt,
+    image=init_image,
+    num_inference_steps=100,
+    eta=0.1,
+    strength=0.85,
+    guidance_scale=3,
+    source_guidance_scale=1,
+).images[0]
+
+image.save("black_to_blue.png")
+```
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ce08aec66ca688e65623e40bdbaf459cc7283b2
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -0,0 +1,202 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_k_diffusion_available,
+    is_k_diffusion_version,
+    is_onnx_available,
+    is_torch_available,
+    is_transformers_available,
+    is_transformers_version,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["StableDiffusionPipelineOutput"]}
+
+if is_transformers_available() and is_flax_available():
+    _import_structure["pipeline_output"].extend(["FlaxStableDiffusionPipelineOutput"])
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["clip_image_project_model"] = ["CLIPImageProjection"]
+    _import_structure["pipeline_cycle_diffusion"] = ["CycleDiffusionPipeline"]
+    _import_structure["pipeline_stable_diffusion"] = ["StableDiffusionPipeline"]
+    _import_structure["pipeline_stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
+    _import_structure["pipeline_stable_diffusion_gligen"] = ["StableDiffusionGLIGENPipeline"]
+    _import_structure["pipeline_stable_diffusion_gligen_text_image"] = ["StableDiffusionGLIGENTextImagePipeline"]
+    _import_structure["pipeline_stable_diffusion_img2img"] = ["StableDiffusionImg2ImgPipeline"]
+    _import_structure["pipeline_stable_diffusion_inpaint"] = ["StableDiffusionInpaintPipeline"]
+    _import_structure["pipeline_stable_diffusion_inpaint_legacy"] = ["StableDiffusionInpaintPipelineLegacy"]
+    _import_structure["pipeline_stable_diffusion_instruct_pix2pix"] = ["StableDiffusionInstructPix2PixPipeline"]
+    _import_structure["pipeline_stable_diffusion_latent_upscale"] = ["StableDiffusionLatentUpscalePipeline"]
+    _import_structure["pipeline_stable_diffusion_model_editing"] = ["StableDiffusionModelEditingPipeline"]
+    _import_structure["pipeline_stable_diffusion_paradigms"] = ["StableDiffusionParadigmsPipeline"]
+    _import_structure["pipeline_stable_diffusion_upscale"] = ["StableDiffusionUpscalePipeline"]
+    _import_structure["pipeline_stable_unclip"] = ["StableUnCLIPPipeline"]
+    _import_structure["pipeline_stable_unclip_img2img"] = ["StableUnCLIPImg2ImgPipeline"]
+    _import_structure["safety_checker"] = ["StableDiffusionSafetyChecker"]
+    _import_structure["stable_unclip_image_normalizer"] = ["StableUnCLIPImageNormalizer"]
+try:
+    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils.dummy_torch_and_transformers_objects import (
+        StableDiffusionImageVariationPipeline,
+    )
+
+    _dummy_objects.update({"StableDiffusionImageVariationPipeline": StableDiffusionImageVariationPipeline})
+else:
+    _import_structure["pipeline_stable_diffusion_image_variation"] = ["StableDiffusionImageVariationPipeline"]
+try:
+    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.26.0")):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils.dummy_torch_and_transformers_objects import (
+        StableDiffusionDepth2ImgPipeline,
+    )
+
+    _dummy_objects.update(
+        {
+            "StableDiffusionDepth2ImgPipeline": StableDiffusionDepth2ImgPipeline,
+        }
+    )
+else:
+    _import_structure["pipeline_stable_diffusion_depth2img"] = ["StableDiffusionDepth2ImgPipeline"]
+
+try:
+    if not (is_transformers_available() and is_onnx_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_onnx_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_onnx_objects))
+else:
+    _import_structure["pipeline_onnx_stable_diffusion"] = [
+        "OnnxStableDiffusionPipeline",
+        "StableDiffusionOnnxPipeline",
+    ]
+    _import_structure["pipeline_onnx_stable_diffusion_img2img"] = ["OnnxStableDiffusionImg2ImgPipeline"]
+    _import_structure["pipeline_onnx_stable_diffusion_inpaint"] = ["OnnxStableDiffusionInpaintPipeline"]
+    _import_structure["pipeline_onnx_stable_diffusion_inpaint_legacy"] = ["OnnxStableDiffusionInpaintPipelineLegacy"]
+    _import_structure["pipeline_onnx_stable_diffusion_upscale"] = ["OnnxStableDiffusionUpscalePipeline"]
+
+if is_transformers_available() and is_flax_available():
+    from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState
+
+    _additional_imports.update({"PNDMSchedulerState": PNDMSchedulerState})
+    _import_structure["pipeline_flax_stable_diffusion"] = ["FlaxStableDiffusionPipeline"]
+    _import_structure["pipeline_flax_stable_diffusion_img2img"] = ["FlaxStableDiffusionImg2ImgPipeline"]
+    _import_structure["pipeline_flax_stable_diffusion_inpaint"] = ["FlaxStableDiffusionInpaintPipeline"]
+    _import_structure["safety_checker_flax"] = ["FlaxStableDiffusionSafetyChecker"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+
+    else:
+        from .clip_image_project_model import CLIPImageProjection
+        from .pipeline_stable_diffusion import (
+            StableDiffusionPipeline,
+            StableDiffusionPipelineOutput,
+        )
+        from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
+        from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
+        from .pipeline_stable_diffusion_instruct_pix2pix import (
+            StableDiffusionInstructPix2PixPipeline,
+        )
+        from .pipeline_stable_diffusion_latent_upscale import (
+            StableDiffusionLatentUpscalePipeline,
+        )
+        from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline
+        from .pipeline_stable_unclip import StableUnCLIPPipeline
+        from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline
+        from .safety_checker import StableDiffusionSafetyChecker
+        from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import (
+            StableDiffusionImageVariationPipeline,
+        )
+    else:
+        from .pipeline_stable_diffusion_image_variation import (
+            StableDiffusionImageVariationPipeline,
+        )
+
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.26.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import StableDiffusionDepth2ImgPipeline
+    else:
+        from .pipeline_stable_diffusion_depth2img import (
+            StableDiffusionDepth2ImgPipeline,
+        )
+
+    try:
+        if not (is_transformers_available() and is_onnx_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_onnx_objects import *
+    else:
+        from .pipeline_onnx_stable_diffusion import (
+            OnnxStableDiffusionPipeline,
+            StableDiffusionOnnxPipeline,
+        )
+        from .pipeline_onnx_stable_diffusion_img2img import (
+            OnnxStableDiffusionImg2ImgPipeline,
+        )
+        from .pipeline_onnx_stable_diffusion_inpaint import (
+            OnnxStableDiffusionInpaintPipeline,
+        )
+        from .pipeline_onnx_stable_diffusion_upscale import (
+            OnnxStableDiffusionUpscalePipeline,
+        )
+
+    try:
+        if not (is_transformers_available() and is_flax_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_flax_objects import *
+    else:
+        from .pipeline_flax_stable_diffusion import FlaxStableDiffusionPipeline
+        from .pipeline_flax_stable_diffusion_img2img import (
+            FlaxStableDiffusionImg2ImgPipeline,
+        )
+        from .pipeline_flax_stable_diffusion_inpaint import (
+            FlaxStableDiffusionInpaintPipeline,
+        )
+        from .pipeline_output import FlaxStableDiffusionPipelineOutput
+        from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py b/diffusers/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..71f9d9714e6b8b7772888d959ae1ae24fae37f77
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The GLIGEN Authors and HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models.modeling_utils import ModelMixin
+
+
+class CLIPImageProjection(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(self, hidden_size: int = 768):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.project = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+
+    def forward(self, x):
+        return self.project(x)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/diffusers/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..53dc98aea69898f53847265a3e0a06e31173f100
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
@@ -0,0 +1,1869 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Conversion script for the Stable Diffusion checkpoints."""
+
+import re
+from contextlib import nullcontext
+from io import BytesIO
+from typing import Dict, Optional, Union
+
+import requests
+import torch
+import yaml
+from transformers import (
+    AutoFeatureExtractor,
+    BertTokenizerFast,
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from ...models import (
+    AutoencoderKL,
+    ControlNetModel,
+    PriorTransformer,
+    UNet2DConditionModel,
+)
+from ...schedulers import (
+    DDIMScheduler,
+    DDPMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UnCLIPScheduler,
+)
+from ...utils import is_accelerate_available, logging
+from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
+from ..paint_by_example import PaintByExampleImageEncoder
+from ..pipeline_utils import DiffusionPipeline
+from .safety_checker import StableDiffusionSafetyChecker
+from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+    from accelerate.utils import set_module_tensor_to_device
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "to_q.weight")
+        new_item = new_item.replace("q.bias", "to_q.bias")
+
+        new_item = new_item.replace("k.weight", "to_k.weight")
+        new_item = new_item.replace("k.bias", "to_k.bias")
+
+        new_item = new_item.replace("v.weight", "to_v.weight")
+        new_item = new_item.replace("v.bias", "to_v.bias")
+
+        new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
+        new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        is_attn_weight = "proj_attn.weight" in new_path or ("attentions" in new_path and "to_" in new_path)
+        shape = old_checkpoint[path["old"]].shape
+        if is_attn_weight and len(shape) == 3:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        elif is_attn_weight and len(shape) == 4:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    if controlnet:
+        unet_params = original_config["model"]["params"]["control_stage_config"]["params"]
+    else:
+        if (
+            "unet_config" in original_config["model"]["params"]
+            and original_config["model"]["params"]["unet_config"] is not None
+        ):
+            unet_params = original_config["model"]["params"]["unet_config"]["params"]
+        else:
+            unet_params = original_config["model"]["params"]["network_config"]["params"]
+
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+
+    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    if unet_params["transformer_depth"] is not None:
+        transformer_layers_per_block = (
+            unet_params["transformer_depth"]
+            if isinstance(unet_params["transformer_depth"], int)
+            else list(unet_params["transformer_depth"])
+        )
+    else:
+        transformer_layers_per_block = 1
+
+    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)
+
+    head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None
+    use_linear_projection = (
+        unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False
+    )
+    if use_linear_projection:
+        # stable diffusion 2-base-512 and 2-768
+        if head_dim is None:
+            head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"]
+            head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])]
+
+    class_embed_type = None
+    addition_embed_type = None
+    addition_time_embed_dim = None
+    projection_class_embeddings_input_dim = None
+    context_dim = None
+
+    if unet_params["context_dim"] is not None:
+        context_dim = (
+            unet_params["context_dim"]
+            if isinstance(unet_params["context_dim"], int)
+            else unet_params["context_dim"][0]
+        )
+
+    if "num_classes" in unet_params:
+        if unet_params["num_classes"] == "sequential":
+            if context_dim in [2048, 1280]:
+                # SDXL
+                addition_embed_type = "text_time"
+                addition_time_embed_dim = 256
+            else:
+                class_embed_type = "projection"
+            assert "adm_in_channels" in unet_params
+            projection_class_embeddings_input_dim = unet_params["adm_in_channels"]
+
+    config = {
+        "sample_size": image_size // vae_scale_factor,
+        "in_channels": unet_params["in_channels"],
+        "down_block_types": tuple(down_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params["num_res_blocks"],
+        "cross_attention_dim": context_dim,
+        "attention_head_dim": head_dim,
+        "use_linear_projection": use_linear_projection,
+        "class_embed_type": class_embed_type,
+        "addition_embed_type": addition_embed_type,
+        "addition_time_embed_dim": addition_time_embed_dim,
+        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
+        "transformer_layers_per_block": transformer_layers_per_block,
+    }
+
+    if "disable_self_attentions" in unet_params:
+        config["only_cross_attention"] = unet_params["disable_self_attentions"]
+
+    if "num_classes" in unet_params and isinstance(unet_params["num_classes"], int):
+        config["num_class_embeds"] = unet_params["num_classes"]
+
+    if controlnet:
+        config["conditioning_channels"] = unet_params["hint_channels"]
+    else:
+        config["out_channels"] = unet_params["out_channels"]
+        config["up_block_types"] = tuple(up_block_types)
+
+    return config
+
+
+def create_vae_diffusers_config(original_config, image_size: int):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+    _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"]
+
+    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    config = {
+        "sample_size": image_size,
+        "in_channels": vae_params["in_channels"],
+        "out_channels": vae_params["out_ch"],
+        "down_block_types": tuple(down_block_types),
+        "up_block_types": tuple(up_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "latent_channels": vae_params["z_channels"],
+        "layers_per_block": vae_params["num_res_blocks"],
+    }
+    return config
+
+
+def create_diffusers_schedular(original_config):
+    schedular = DDIMScheduler(
+        num_train_timesteps=original_config["model"]["params"]["timesteps"],
+        beta_start=original_config["model"]["params"]["linear_start"],
+        beta_end=original_config["model"]["params"]["linear_end"],
+        beta_schedule="scaled_linear",
+    )
+    return schedular
+
+
+def create_ldm_bert_config(original_config):
+    bert_params = original_config["model"]["params"]["cond_stage_config"]["params"]
+    config = LDMBertConfig(
+        d_model=bert_params.n_embed,
+        encoder_layers=bert_params.n_layer,
+        encoder_ffn_dim=bert_params.n_embed * 4,
+    )
+    return config
+
+
+def convert_ldm_unet_checkpoint(
+    checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False
+):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+
+    if skip_extract_state_dict:
+        unet_state_dict = checkpoint
+    else:
+        # extract state_dict for UNet
+        unet_state_dict = {}
+        keys = list(checkpoint.keys())
+
+        if controlnet:
+            unet_key = "control_model."
+        else:
+            unet_key = "model.diffusion_model."
+
+        # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+        if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+            logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
+            logger.warning(
+                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+            )
+            for key in keys:
+                if key.startswith("model.diffusion_model"):
+                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+        else:
+            if sum(k.startswith("model_ema") for k in keys) > 100:
+                logger.warning(
+                    "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                    " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+                )
+
+            for key in keys:
+                if key.startswith(unet_key):
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    if config["class_embed_type"] is None:
+        # No parameters to port
+        ...
+    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+    else:
+        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+
+    if config["addition_embed_type"] == "text_time":
+        new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+
+    # Relevant to StableDiffusionUpscalePipeline
+    if "num_class_embeds" in config:
+        if (config["num_class_embeds"] is not None) and ("label_emb.weight" in unet_state_dict):
+            new_checkpoint["class_embedding.weight"] = unet_state_dict["label_emb.weight"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    if not controlnet:
+        new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+        new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+        new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+        new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in sorted(output_block_list.items())}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    if controlnet:
+        # conditioning embedding
+
+        orig_index = 0
+
+        new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+
+        orig_index += 2
+
+        diffusers_index = 0
+
+        while diffusers_index < 6:
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.weight"
+            )
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.bias"
+            )
+            diffusers_index += 1
+            orig_index += 2
+
+        new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+
+        # down blocks
+        for i in range(num_input_blocks):
+            new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
+            new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
+
+        # mid block
+        new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
+        new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
+
+    return new_checkpoint
+
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    keys = list(checkpoint.keys())
+    vae_key = "first_stage_model." if any(k.startswith("first_stage_model.") for k in keys) else ""
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+def convert_ldm_bert_checkpoint(checkpoint, config):
+    def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
+        hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
+        hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
+        hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
+
+        hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
+        hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
+
+    def _copy_linear(hf_linear, pt_linear):
+        hf_linear.weight = pt_linear.weight
+        hf_linear.bias = pt_linear.bias
+
+    def _copy_layer(hf_layer, pt_layer):
+        # copy layer norms
+        _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
+        _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
+
+        # copy attn
+        _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
+
+        # copy MLP
+        pt_mlp = pt_layer[1][1]
+        _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
+        _copy_linear(hf_layer.fc2, pt_mlp.net[2])
+
+    def _copy_layers(hf_layers, pt_layers):
+        for i, hf_layer in enumerate(hf_layers):
+            if i != 0:
+                i += i
+            pt_layer = pt_layers[i : i + 2]
+            _copy_layer(hf_layer, pt_layer)
+
+    hf_model = LDMBertModel(config).eval()
+
+    # copy  embeds
+    hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
+    hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight
+
+    # copy layer norm
+    _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)
+
+    # copy hidden layers
+    _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)
+
+    _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)
+
+    return hf_model
+
+
+def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
+    if text_encoder is None:
+        config_name = "openai/clip-vit-large-patch14"
+        try:
+            config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'."
+            )
+
+        ctx = init_empty_weights if is_accelerate_available() else nullcontext
+        with ctx():
+            text_model = CLIPTextModel(config)
+    else:
+        text_model = text_encoder
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    remove_prefixes = ["cond_stage_model.transformer", "conditioner.embedders.0.transformer"]
+
+    for key in keys:
+        for prefix in remove_prefixes:
+            if key.startswith(prefix):
+                text_model_dict[key[len(prefix + ".") :]] = checkpoint[key]
+
+    if is_accelerate_available():
+        for param_name, param in text_model_dict.items():
+            set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+    else:
+        if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)):
+            text_model_dict.pop("text_model.embeddings.position_ids", None)
+
+        text_model.load_state_dict(text_model_dict)
+
+    return text_model
+
+
+textenc_conversion_lst = [
+    ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
+    ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
+    ("ln_final.weight", "text_model.final_layer_norm.weight"),
+    ("ln_final.bias", "text_model.final_layer_norm.bias"),
+    ("text_projection", "text_projection.weight"),
+]
+textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
+
+textenc_transformer_conversion_lst = [
+    # (stable-diffusion, HF Diffusers)
+    ("resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "transformer.text_model.final_layer_norm."),
+    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
+    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
+]
+protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
+textenc_pattern = re.compile("|".join(protected.keys()))
+
+
+def convert_paint_by_example_checkpoint(checkpoint, local_files_only=False):
+    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+    model = PaintByExampleImageEncoder(config)
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    for key in keys:
+        if key.startswith("cond_stage_model.transformer"):
+            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
+
+    # load clip vision
+    model.model.load_state_dict(text_model_dict)
+
+    # load mapper
+    keys_mapper = {
+        k[len("cond_stage_model.mapper.res") :]: v
+        for k, v in checkpoint.items()
+        if k.startswith("cond_stage_model.mapper")
+    }
+
+    MAPPING = {
+        "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"],
+        "attn.c_proj": ["attn1.to_out.0"],
+        "ln_1": ["norm1"],
+        "ln_2": ["norm3"],
+        "mlp.c_fc": ["ff.net.0.proj"],
+        "mlp.c_proj": ["ff.net.2"],
+    }
+
+    mapped_weights = {}
+    for key, value in keys_mapper.items():
+        prefix = key[: len("blocks.i")]
+        suffix = key.split(prefix)[-1].split(".")[-1]
+        name = key.split(prefix)[-1].split(suffix)[0][1:-1]
+        mapped_names = MAPPING[name]
+
+        num_splits = len(mapped_names)
+        for i, mapped_name in enumerate(mapped_names):
+            new_name = ".".join([prefix, mapped_name, suffix])
+            shape = value.shape[0] // num_splits
+            mapped_weights[new_name] = value[i * shape : (i + 1) * shape]
+
+    model.mapper.load_state_dict(mapped_weights)
+
+    # load final layer norm
+    model.final_layer_norm.load_state_dict(
+        {
+            "bias": checkpoint["cond_stage_model.final_ln.bias"],
+            "weight": checkpoint["cond_stage_model.final_ln.weight"],
+        }
+    )
+
+    # load final proj
+    model.proj_out.load_state_dict(
+        {
+            "bias": checkpoint["proj_out.bias"],
+            "weight": checkpoint["proj_out.weight"],
+        }
+    )
+
+    # load uncond vector
+    model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"])
+    return model
+
+
+def convert_open_clip_checkpoint(
+    checkpoint,
+    config_name,
+    prefix="cond_stage_model.model.",
+    has_projection=False,
+    local_files_only=False,
+    **config_kwargs,
+):
+    # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
+    # text_model = CLIPTextModelWithProjection.from_pretrained(
+    #    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
+    # )
+    try:
+        config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only)
+    except Exception:
+        raise ValueError(
+            f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: '{config_name}'."
+        )
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config)
+
+    keys = list(checkpoint.keys())
+
+    keys_to_ignore = []
+    if config_name == "stabilityai/stable-diffusion-2" and config.num_hidden_layers == 23:
+        # make sure to remove all keys > 22
+        keys_to_ignore += [k for k in keys if k.startswith("cond_stage_model.model.transformer.resblocks.23")]
+        keys_to_ignore += ["cond_stage_model.model.text_projection"]
+
+    text_model_dict = {}
+
+    if prefix + "text_projection" in checkpoint:
+        d_model = int(checkpoint[prefix + "text_projection"].shape[0])
+    else:
+        d_model = 1024
+
+    text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
+
+    for key in keys:
+        if key in keys_to_ignore:
+            continue
+        if key[len(prefix) :] in textenc_conversion_map:
+            if key.endswith("text_projection"):
+                value = checkpoint[key].T.contiguous()
+            else:
+                value = checkpoint[key]
+
+            text_model_dict[textenc_conversion_map[key[len(prefix) :]]] = value
+
+        if key.startswith(prefix + "transformer."):
+            new_key = key[len(prefix + "transformer.") :]
+            if new_key.endswith(".in_proj_weight"):
+                new_key = new_key[: -len(".in_proj_weight")]
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
+                text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
+                text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
+            elif new_key.endswith(".in_proj_bias"):
+                new_key = new_key[: -len(".in_proj_bias")]
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
+                text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
+                text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
+            else:
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+
+                text_model_dict[new_key] = checkpoint[key]
+
+    if is_accelerate_available():
+        for param_name, param in text_model_dict.items():
+            set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+    else:
+        if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)):
+            text_model_dict.pop("text_model.embeddings.position_ids", None)
+
+        text_model.load_state_dict(text_model_dict)
+
+    return text_model
+
+
+def stable_unclip_image_encoder(original_config, local_files_only=False):
+    """
+    Returns the image processor and clip image encoder for the img2img unclip pipeline.
+
+    We currently know of two types of stable unclip models which separately use the clip and the openclip image
+    encoders.
+    """
+
+    image_embedder_config = original_config["model"]["params"]["embedder_config"]
+
+    sd_clip_image_embedder_class = image_embedder_config["target"]
+    sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1]
+
+    if sd_clip_image_embedder_class == "ClipImageEmbedder":
+        clip_model_name = image_embedder_config.params.model
+
+        if clip_model_name == "ViT-L/14":
+            feature_extractor = CLIPImageProcessor()
+            image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+                "openai/clip-vit-large-patch14", local_files_only=local_files_only
+            )
+        else:
+            raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}")
+
+    elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder":
+        feature_extractor = CLIPImageProcessor()
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", local_files_only=local_files_only
+        )
+    else:
+        raise NotImplementedError(
+            f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}"
+        )
+
+    return feature_extractor, image_encoder
+
+
+def stable_unclip_image_noising_components(
+    original_config, clip_stats_path: Optional[str] = None, device: Optional[str] = None
+):
+    """
+    Returns the noising components for the img2img and txt2img unclip pipelines.
+
+    Converts the stability noise augmentor into
+    1. a `StableUnCLIPImageNormalizer` for holding the CLIP stats
+    2. a `DDPMScheduler` for holding the noise schedule
+
+    If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided.
+    """
+    noise_aug_config = original_config["model"]["params"]["noise_aug_config"]
+    noise_aug_class = noise_aug_config["target"]
+    noise_aug_class = noise_aug_class.split(".")[-1]
+
+    if noise_aug_class == "CLIPEmbeddingNoiseAugmentation":
+        noise_aug_config = noise_aug_config.params
+        embedding_dim = noise_aug_config.timestep_dim
+        max_noise_level = noise_aug_config.noise_schedule_config.timesteps
+        beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule
+
+        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim)
+        image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule)
+
+        if "clip_stats_path" in noise_aug_config:
+            if clip_stats_path is None:
+                raise ValueError("This stable unclip config requires a `clip_stats_path`")
+
+            clip_mean, clip_std = torch.load(clip_stats_path, map_location=device)
+            clip_mean = clip_mean[None, :]
+            clip_std = clip_std[None, :]
+
+            clip_stats_state_dict = {
+                "mean": clip_mean,
+                "std": clip_std,
+            }
+
+            image_normalizer.load_state_dict(clip_stats_state_dict)
+    else:
+        raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}")
+
+    return image_normalizer, image_noising_scheduler
+
+
+def convert_controlnet_checkpoint(
+    checkpoint,
+    original_config,
+    checkpoint_path,
+    image_size,
+    upcast_attention,
+    extract_ema,
+    use_linear_projection=None,
+    cross_attention_dim=None,
+):
+    ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)
+    ctrlnet_config["upcast_attention"] = upcast_attention
+
+    ctrlnet_config.pop("sample_size")
+
+    if use_linear_projection is not None:
+        ctrlnet_config["use_linear_projection"] = use_linear_projection
+
+    if cross_attention_dim is not None:
+        ctrlnet_config["cross_attention_dim"] = cross_attention_dim
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        controlnet = ControlNetModel(**ctrlnet_config)
+
+    # Some controlnet ckpt files are distributed independently from the rest of the
+    # model components i.e. https://huggingface.co/thibaud/controlnet-sd21/
+    if "time_embed.0.weight" in checkpoint:
+        skip_extract_state_dict = True
+    else:
+        skip_extract_state_dict = False
+
+    converted_ctrl_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint,
+        ctrlnet_config,
+        path=checkpoint_path,
+        extract_ema=extract_ema,
+        controlnet=True,
+        skip_extract_state_dict=skip_extract_state_dict,
+    )
+
+    if is_accelerate_available():
+        for param_name, param in converted_ctrl_checkpoint.items():
+            set_module_tensor_to_device(controlnet, param_name, "cpu", value=param)
+    else:
+        controlnet.load_state_dict(converted_ctrl_checkpoint)
+
+    return controlnet
+
+
+def download_from_original_stable_diffusion_ckpt(
+    checkpoint_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+    original_config_file: str = None,
+    image_size: Optional[int] = None,
+    prediction_type: str = None,
+    model_type: str = None,
+    extract_ema: bool = False,
+    scheduler_type: str = "pndm",
+    num_in_channels: Optional[int] = None,
+    upcast_attention: Optional[bool] = None,
+    device: str = None,
+    from_safetensors: bool = False,
+    stable_unclip: Optional[str] = None,
+    stable_unclip_prior: Optional[str] = None,
+    clip_stats_path: Optional[str] = None,
+    controlnet: Optional[bool] = None,
+    adapter: Optional[bool] = None,
+    load_safety_checker: bool = True,
+    safety_checker: Optional[StableDiffusionSafetyChecker] = None,
+    feature_extractor: Optional[AutoFeatureExtractor] = None,
+    pipeline_class: DiffusionPipeline = None,
+    local_files_only=False,
+    vae_path=None,
+    vae=None,
+    text_encoder=None,
+    text_encoder_2=None,
+    tokenizer=None,
+    tokenizer_2=None,
+    config_files=None,
+) -> DiffusionPipeline:
+    """
+    Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
+    config file.
+
+    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
+    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
+    recommended that you override the default values and/or supply an `original_config_file` wherever possible.
+
+    Args:
+        checkpoint_path_or_dict (`str` or `dict`): Path to `.ckpt` file, or the state dict.
+        original_config_file (`str`):
+            Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
+            inferred by looking for a key that only exists in SD2.0 models.
+        image_size (`int`, *optional*, defaults to 512):
+            The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2
+            Base. Use 768 for Stable Diffusion v2.
+        prediction_type (`str`, *optional*):
+            The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable
+            Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2.
+        num_in_channels (`int`, *optional*, defaults to None):
+            The number of input channels. If `None`, it will be automatically inferred.
+        scheduler_type (`str`, *optional*, defaults to 'pndm'):
+            Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
+            "ddim"]`.
+        model_type (`str`, *optional*, defaults to `None`):
+            The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder",
+            "FrozenCLIPEmbedder", "PaintByExample"]`.
+        is_img2img (`bool`, *optional*, defaults to `False`):
+            Whether the model should be loaded as an img2img pipeline.
+        extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
+            checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
+            `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
+            inference. Non-EMA weights are usually better to continue fine-tuning.
+        upcast_attention (`bool`, *optional*, defaults to `None`):
+            Whether the attention computation should always be upcasted. This is necessary when running stable
+            diffusion 2.1.
+        device (`str`, *optional*, defaults to `None`):
+            The device to use. Pass `None` to determine automatically.
+        from_safetensors (`str`, *optional*, defaults to `False`):
+            If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
+        load_safety_checker (`bool`, *optional*, defaults to `True`):
+            Whether to load the safety checker or not. Defaults to `True`.
+        safety_checker (`StableDiffusionSafetyChecker`, *optional*, defaults to `None`):
+            Safety checker to use. If this parameter is `None`, the function will load a new instance of
+            [StableDiffusionSafetyChecker] by itself, if needed.
+        feature_extractor (`AutoFeatureExtractor`, *optional*, defaults to `None`):
+            Feature extractor to use. If this parameter is `None`, the function will load a new instance of
+            [AutoFeatureExtractor] by itself, if needed.
+        pipeline_class (`str`, *optional*, defaults to `None`):
+            The pipeline class to use. Pass `None` to determine automatically.
+        local_files_only (`bool`, *optional*, defaults to `False`):
+            Whether or not to only look at local files (i.e., do not try to download the model).
+        vae (`AutoencoderKL`, *optional*, defaults to `None`):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. If
+            this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed.
+        text_encoder (`CLIPTextModel`, *optional*, defaults to `None`):
+            An instance of [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)
+            to use, specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)
+            variant. If this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed.
+        tokenizer (`CLIPTokenizer`, *optional*, defaults to `None`):
+            An instance of
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
+            to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by itself, if
+            needed.
+        config_files (`Dict[str, str]`, *optional*, defaults to `None`):
+            A dictionary mapping from config file names to their contents. If this parameter is `None`, the function
+            will load the config files by itself, if needed. Valid keys are:
+                - `v1`: Config file for Stable Diffusion v1
+                - `v2`: Config file for Stable Diffusion v2
+                - `xl`: Config file for Stable Diffusion XL
+                - `xl_refiner`: Config file for Stable Diffusion XL Refiner
+        return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
+    """
+
+    # import pipelines here to avoid circular import error when using from_single_file method
+    from diffusers import (
+        LDMTextToImagePipeline,
+        PaintByExamplePipeline,
+        StableDiffusionControlNetPipeline,
+        StableDiffusionInpaintPipeline,
+        StableDiffusionPipeline,
+        StableDiffusionUpscalePipeline,
+        StableDiffusionXLControlNetInpaintPipeline,
+        StableDiffusionXLImg2ImgPipeline,
+        StableDiffusionXLInpaintPipeline,
+        StableDiffusionXLPipeline,
+        StableUnCLIPImg2ImgPipeline,
+        StableUnCLIPPipeline,
+    )
+
+    if prediction_type == "v-prediction":
+        prediction_type = "v_prediction"
+
+    if isinstance(checkpoint_path_or_dict, str):
+        if from_safetensors:
+            from safetensors.torch import load_file as safe_load
+
+            checkpoint = safe_load(checkpoint_path_or_dict, device="cpu")
+        else:
+            if device is None:
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                checkpoint = torch.load(checkpoint_path_or_dict, map_location=device)
+            else:
+                checkpoint = torch.load(checkpoint_path_or_dict, map_location=device)
+    elif isinstance(checkpoint_path_or_dict, dict):
+        checkpoint = checkpoint_path_or_dict
+
+    # Sometimes models don't have the global_step item
+    if "global_step" in checkpoint:
+        global_step = checkpoint["global_step"]
+    else:
+        logger.debug("global_step key not found in model")
+        global_step = None
+
+    # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
+    # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
+    while "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    if original_config_file is None:
+        key_name_v2_1 = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
+        key_name_sd_xl_base = "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias"
+        key_name_sd_xl_refiner = "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias"
+        is_upscale = pipeline_class == StableDiffusionUpscalePipeline
+
+        config_url = None
+
+        # model_type = "v1"
+        if config_files is not None and "v1" in config_files:
+            original_config_file = config_files["v1"]
+        else:
+            config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+
+        if key_name_v2_1 in checkpoint and checkpoint[key_name_v2_1].shape[-1] == 1024:
+            # model_type = "v2"
+            if config_files is not None and "v2" in config_files:
+                original_config_file = config_files["v2"]
+            else:
+                config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"
+            if global_step == 110000:
+                # v2.1 needs to upcast attention
+                upcast_attention = True
+        elif key_name_sd_xl_base in checkpoint:
+            # only base xl has two text embedders
+            if config_files is not None and "xl" in config_files:
+                original_config_file = config_files["xl"]
+            else:
+                config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
+        elif key_name_sd_xl_refiner in checkpoint:
+            # only refiner xl has embedder and one text embedders
+            if config_files is not None and "xl_refiner" in config_files:
+                original_config_file = config_files["xl_refiner"]
+            else:
+                config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml"
+
+        if is_upscale:
+            config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml"
+
+        if config_url is not None:
+            original_config_file = BytesIO(requests.get(config_url).content)
+        else:
+            with open(original_config_file, "r") as f:
+                original_config_file = f.read()
+    else:
+        with open(original_config_file, "r") as f:
+            original_config_file = f.read()
+
+    original_config = yaml.safe_load(original_config_file)
+
+    # Convert the text model.
+    if (
+        model_type is None
+        and "cond_stage_config" in original_config["model"]["params"]
+        and original_config["model"]["params"]["cond_stage_config"] is not None
+    ):
+        model_type = original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1]
+        logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}")
+    elif model_type is None and original_config["model"]["params"]["network_config"] is not None:
+        if original_config["model"]["params"]["network_config"]["params"]["context_dim"] == 2048:
+            model_type = "SDXL"
+        else:
+            model_type = "SDXL-Refiner"
+        if image_size is None:
+            image_size = 1024
+
+    if pipeline_class is None:
+        # Check if we have a SDXL or SD model and initialize default pipeline
+        if model_type not in ["SDXL", "SDXL-Refiner"]:
+            pipeline_class = StableDiffusionPipeline if not controlnet else StableDiffusionControlNetPipeline
+        else:
+            pipeline_class = StableDiffusionXLPipeline if model_type == "SDXL" else StableDiffusionXLImg2ImgPipeline
+
+    if num_in_channels is None and pipeline_class in [
+        StableDiffusionInpaintPipeline,
+        StableDiffusionXLInpaintPipeline,
+        StableDiffusionXLControlNetInpaintPipeline,
+    ]:
+        num_in_channels = 9
+    if num_in_channels is None and pipeline_class == StableDiffusionUpscalePipeline:
+        num_in_channels = 7
+    elif num_in_channels is None:
+        num_in_channels = 4
+
+    if "unet_config" in original_config["model"]["params"]:
+        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+    elif "network_config" in original_config["model"]["params"]:
+        original_config["model"]["params"]["network_config"]["params"]["in_channels"] = num_in_channels
+
+    if (
+        "parameterization" in original_config["model"]["params"]
+        and original_config["model"]["params"]["parameterization"] == "v"
+    ):
+        if prediction_type is None:
+            # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
+            # as it relies on a brittle global step parameter here
+            prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
+        if image_size is None:
+            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
+            # as it relies on a brittle global step parameter here
+            image_size = 512 if global_step == 875000 else 768
+    else:
+        if prediction_type is None:
+            prediction_type = "epsilon"
+        if image_size is None:
+            image_size = 512
+
+    if controlnet is None and "control_stage_config" in original_config["model"]["params"]:
+        path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
+        controlnet = convert_controlnet_checkpoint(
+            checkpoint, original_config, path, image_size, upcast_attention, extract_ema
+        )
+
+    if "timesteps" in original_config["model"]["params"]:
+        num_train_timesteps = original_config["model"]["params"]["timesteps"]
+    else:
+        num_train_timesteps = 1000
+
+    if model_type in ["SDXL", "SDXL-Refiner"]:
+        scheduler_dict = {
+            "beta_schedule": "scaled_linear",
+            "beta_start": 0.00085,
+            "beta_end": 0.012,
+            "interpolation_type": "linear",
+            "num_train_timesteps": num_train_timesteps,
+            "prediction_type": "epsilon",
+            "sample_max_value": 1.0,
+            "set_alpha_to_one": False,
+            "skip_prk_steps": True,
+            "steps_offset": 1,
+            "timestep_spacing": "leading",
+        }
+        scheduler = EulerDiscreteScheduler.from_config(scheduler_dict)
+        scheduler_type = "euler"
+    else:
+        if "linear_start" in original_config["model"]["params"]:
+            beta_start = original_config["model"]["params"]["linear_start"]
+        else:
+            beta_start = 0.02
+
+        if "linear_end" in original_config["model"]["params"]:
+            beta_end = original_config["model"]["params"]["linear_end"]
+        else:
+            beta_end = 0.085
+        scheduler = DDIMScheduler(
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            beta_start=beta_start,
+            num_train_timesteps=num_train_timesteps,
+            steps_offset=1,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            prediction_type=prediction_type,
+        )
+    # make sure scheduler works correctly with DDIM
+    scheduler.register_to_config(clip_sample=False)
+
+    if scheduler_type == "pndm":
+        config = dict(scheduler.config)
+        config["skip_prk_steps"] = True
+        scheduler = PNDMScheduler.from_config(config)
+    elif scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "heun":
+        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+    elif scheduler_type == "ddim":
+        scheduler = scheduler
+    else:
+        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+
+    if pipeline_class == StableDiffusionUpscalePipeline:
+        image_size = original_config["model"]["params"]["unet_config"]["params"]["image_size"]
+
+    # Convert the UNet2DConditionModel model.
+    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    unet_config["upcast_attention"] = upcast_attention
+
+    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=path, extract_ema=extract_ema
+    )
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        unet = UNet2DConditionModel(**unet_config)
+
+    if is_accelerate_available():
+        if model_type not in ["SDXL", "SDXL-Refiner"]:  # SBM Delay this.
+            for param_name, param in converted_unet_checkpoint.items():
+                set_module_tensor_to_device(unet, param_name, "cpu", value=param)
+    else:
+        unet.load_state_dict(converted_unet_checkpoint)
+
+    # Convert the VAE model.
+    if vae_path is None and vae is None:
+        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+        if (
+            "model" in original_config
+            and "params" in original_config["model"]
+            and "scale_factor" in original_config["model"]["params"]
+        ):
+            vae_scaling_factor = original_config["model"]["params"]["scale_factor"]
+        else:
+            vae_scaling_factor = 0.18215  # default SD scaling factor
+
+        vae_config["scaling_factor"] = vae_scaling_factor
+
+        ctx = init_empty_weights if is_accelerate_available() else nullcontext
+        with ctx():
+            vae = AutoencoderKL(**vae_config)
+
+        if is_accelerate_available():
+            for param_name, param in converted_vae_checkpoint.items():
+                set_module_tensor_to_device(vae, param_name, "cpu", value=param)
+        else:
+            vae.load_state_dict(converted_vae_checkpoint)
+    elif vae is None:
+        vae = AutoencoderKL.from_pretrained(vae_path, local_files_only=local_files_only)
+
+    if model_type == "FrozenOpenCLIPEmbedder":
+        config_name = "stabilityai/stable-diffusion-2"
+        config_kwargs = {"subfolder": "text_encoder"}
+
+        if text_encoder is None:
+            text_model = convert_open_clip_checkpoint(
+                checkpoint, config_name, local_files_only=local_files_only, **config_kwargs
+            )
+        else:
+            text_model = text_encoder
+
+        try:
+            tokenizer = CLIPTokenizer.from_pretrained(
+                "stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'stabilityai/stable-diffusion-2'."
+            )
+
+        if stable_unclip is None:
+            if controlnet:
+                pipe = pipeline_class(
+                    vae=vae,
+                    text_encoder=text_model,
+                    tokenizer=tokenizer,
+                    unet=unet,
+                    scheduler=scheduler,
+                    controlnet=controlnet,
+                    safety_checker=safety_checker,
+                    feature_extractor=feature_extractor,
+                )
+                if hasattr(pipe, "requires_safety_checker"):
+                    pipe.requires_safety_checker = False
+
+            elif pipeline_class == StableDiffusionUpscalePipeline:
+                scheduler = DDIMScheduler.from_pretrained(
+                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler"
+                )
+                low_res_scheduler = DDPMScheduler.from_pretrained(
+                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler"
+                )
+
+                pipe = pipeline_class(
+                    vae=vae,
+                    text_encoder=text_model,
+                    tokenizer=tokenizer,
+                    unet=unet,
+                    scheduler=scheduler,
+                    low_res_scheduler=low_res_scheduler,
+                    safety_checker=safety_checker,
+                    feature_extractor=feature_extractor,
+                )
+
+            else:
+                pipe = pipeline_class(
+                    vae=vae,
+                    text_encoder=text_model,
+                    tokenizer=tokenizer,
+                    unet=unet,
+                    scheduler=scheduler,
+                    safety_checker=safety_checker,
+                    feature_extractor=feature_extractor,
+                )
+                if hasattr(pipe, "requires_safety_checker"):
+                    pipe.requires_safety_checker = False
+
+        else:
+            image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components(
+                original_config, clip_stats_path=clip_stats_path, device=device
+            )
+
+            if stable_unclip == "img2img":
+                feature_extractor, image_encoder = stable_unclip_image_encoder(original_config)
+
+                pipe = StableUnCLIPImg2ImgPipeline(
+                    # image encoding components
+                    feature_extractor=feature_extractor,
+                    image_encoder=image_encoder,
+                    # image noising components
+                    image_normalizer=image_normalizer,
+                    image_noising_scheduler=image_noising_scheduler,
+                    # regular denoising components
+                    tokenizer=tokenizer,
+                    text_encoder=text_model,
+                    unet=unet,
+                    scheduler=scheduler,
+                    # vae
+                    vae=vae,
+                )
+            elif stable_unclip == "txt2img":
+                if stable_unclip_prior is None or stable_unclip_prior == "karlo":
+                    karlo_model = "kakaobrain/karlo-v1-alpha"
+                    prior = PriorTransformer.from_pretrained(
+                        karlo_model, subfolder="prior", local_files_only=local_files_only
+                    )
+
+                    try:
+                        prior_tokenizer = CLIPTokenizer.from_pretrained(
+                            "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                        )
+                    except Exception:
+                        raise ValueError(
+                            f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+                        )
+                    prior_text_model = CLIPTextModelWithProjection.from_pretrained(
+                        "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                    )
+
+                    prior_scheduler = UnCLIPScheduler.from_pretrained(
+                        karlo_model, subfolder="prior_scheduler", local_files_only=local_files_only
+                    )
+                    prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
+                else:
+                    raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}")
+
+                pipe = StableUnCLIPPipeline(
+                    # prior components
+                    prior_tokenizer=prior_tokenizer,
+                    prior_text_encoder=prior_text_model,
+                    prior=prior,
+                    prior_scheduler=prior_scheduler,
+                    # image noising components
+                    image_normalizer=image_normalizer,
+                    image_noising_scheduler=image_noising_scheduler,
+                    # regular denoising components
+                    tokenizer=tokenizer,
+                    text_encoder=text_model,
+                    unet=unet,
+                    scheduler=scheduler,
+                    # vae
+                    vae=vae,
+                )
+            else:
+                raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}")
+    elif model_type == "PaintByExample":
+        vision_model = convert_paint_by_example_checkpoint(checkpoint)
+        try:
+            tokenizer = CLIPTokenizer.from_pretrained(
+                "openai/clip-vit-large-patch14", local_files_only=local_files_only
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+            )
+        try:
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the feature_extractor in the following path: 'CompVis/stable-diffusion-safety-checker'."
+            )
+        pipe = PaintByExamplePipeline(
+            vae=vae,
+            image_encoder=vision_model,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=None,
+            feature_extractor=feature_extractor,
+        )
+    elif model_type == "FrozenCLIPEmbedder":
+        text_model = convert_ldm_clip_checkpoint(
+            checkpoint, local_files_only=local_files_only, text_encoder=text_encoder
+        )
+        try:
+            tokenizer = (
+                CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+                if tokenizer is None
+                else tokenizer
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+            )
+
+        if load_safety_checker:
+            safety_checker = StableDiffusionSafetyChecker.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+
+        if controlnet:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_model,
+                tokenizer=tokenizer,
+                unet=unet,
+                controlnet=controlnet,
+                scheduler=scheduler,
+                safety_checker=safety_checker,
+                feature_extractor=feature_extractor,
+            )
+        else:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_model,
+                tokenizer=tokenizer,
+                unet=unet,
+                scheduler=scheduler,
+                safety_checker=safety_checker,
+                feature_extractor=feature_extractor,
+            )
+    elif model_type in ["SDXL", "SDXL-Refiner"]:
+        is_refiner = model_type == "SDXL-Refiner"
+
+        if (is_refiner is False) and (tokenizer is None):
+            try:
+                tokenizer = CLIPTokenizer.from_pretrained(
+                    "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                )
+            except Exception:
+                raise ValueError(
+                    f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+                )
+
+        if (is_refiner is False) and (text_encoder is None):
+            text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
+
+        if tokenizer_2 is None:
+            try:
+                tokenizer_2 = CLIPTokenizer.from_pretrained(
+                    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only
+                )
+            except Exception:
+                raise ValueError(
+                    f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'."
+                )
+
+        if text_encoder_2 is None:
+            config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+            config_kwargs = {"projection_dim": 1280}
+            prefix = "conditioner.embedders.0.model." if is_refiner else "conditioner.embedders.1.model."
+
+            text_encoder_2 = convert_open_clip_checkpoint(
+                checkpoint,
+                config_name,
+                prefix=prefix,
+                has_projection=True,
+                local_files_only=local_files_only,
+                **config_kwargs,
+            )
+
+        if is_accelerate_available():  # SBM Now move model to cpu.
+            for param_name, param in converted_unet_checkpoint.items():
+                set_module_tensor_to_device(unet, param_name, "cpu", value=param)
+
+        if controlnet:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                text_encoder_2=text_encoder_2,
+                tokenizer_2=tokenizer_2,
+                unet=unet,
+                controlnet=controlnet,
+                scheduler=scheduler,
+                force_zeros_for_empty_prompt=True,
+            )
+        elif adapter:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                text_encoder_2=text_encoder_2,
+                tokenizer_2=tokenizer_2,
+                unet=unet,
+                adapter=adapter,
+                scheduler=scheduler,
+                force_zeros_for_empty_prompt=True,
+            )
+
+        else:
+            pipeline_kwargs = {
+                "vae": vae,
+                "text_encoder": text_encoder,
+                "tokenizer": tokenizer,
+                "text_encoder_2": text_encoder_2,
+                "tokenizer_2": tokenizer_2,
+                "unet": unet,
+                "scheduler": scheduler,
+            }
+
+            if (pipeline_class == StableDiffusionXLImg2ImgPipeline) or (
+                pipeline_class == StableDiffusionXLInpaintPipeline
+            ):
+                pipeline_kwargs.update({"requires_aesthetics_score": is_refiner})
+
+            if is_refiner:
+                pipeline_kwargs.update({"force_zeros_for_empty_prompt": False})
+
+            pipe = pipeline_class(**pipeline_kwargs)
+    else:
+        text_config = create_ldm_bert_config(original_config)
+        text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only)
+        pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+    return pipe
+
+
+def download_controlnet_from_original_ckpt(
+    checkpoint_path: str,
+    original_config_file: str,
+    image_size: int = 512,
+    extract_ema: bool = False,
+    num_in_channels: Optional[int] = None,
+    upcast_attention: Optional[bool] = None,
+    device: str = None,
+    from_safetensors: bool = False,
+    use_linear_projection: Optional[bool] = None,
+    cross_attention_dim: Optional[bool] = None,
+) -> DiffusionPipeline:
+    if from_safetensors:
+        from safetensors import safe_open
+
+        checkpoint = {}
+        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                checkpoint[key] = f.get_tensor(key)
+    else:
+        if device is None:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+        else:
+            checkpoint = torch.load(checkpoint_path, map_location=device)
+
+    # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
+    # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
+    while "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    with open(original_config_file, "r") as f:
+        original_config_file = f.read()
+    original_config = yaml.safe_load(original_config_file)
+
+    if num_in_channels is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+
+    if "control_stage_config" not in original_config["model"]["params"]:
+        raise ValueError("`control_stage_config` not present in original config")
+
+    controlnet = convert_controlnet_checkpoint(
+        checkpoint,
+        original_config,
+        checkpoint_path,
+        image_size,
+        upcast_attention,
+        extract_ema,
+        use_linear_projection=use_linear_projection,
+        cross_attention_dim=cross_attention_dim,
+    )
+
+    return controlnet
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d6ffd463cc31387ccc5d45bb4b49446a571cef0
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -0,0 +1,473 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from functools import partial
+from typing import Dict, List, Optional, Union
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict
+from flax.jax_utils import unreplicate
+from flax.training.common_utils import shard
+from packaging import version
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel
+
+from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel
+from ...schedulers import (
+    FlaxDDIMScheduler,
+    FlaxDPMSolverMultistepScheduler,
+    FlaxLMSDiscreteScheduler,
+    FlaxPNDMScheduler,
+)
+from ...utils import deprecate, logging, replace_example_docstring
+from ..pipeline_flax_utils import FlaxDiffusionPipeline
+from .pipeline_output import FlaxStableDiffusionPipelineOutput
+from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Set to True to use python for loop instead of jax.fori_loop for easier debugging
+DEBUG = False
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import jax
+        >>> import numpy as np
+        >>> from flax.jax_utils import replicate
+        >>> from flax.training.common_utils import shard
+
+        >>> from diffusers import FlaxStableDiffusionPipeline
+
+        >>> pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", variant="bf16", dtype=jax.numpy.bfloat16
+        ... )
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+
+        >>> prng_seed = jax.random.PRNGKey(0)
+        >>> num_inference_steps = 50
+
+        >>> num_samples = jax.device_count()
+        >>> prompt = num_samples * [prompt]
+        >>> prompt_ids = pipeline.prepare_inputs(prompt)
+        # shard inputs and rng
+
+        >>> params = replicate(params)
+        >>> prng_seed = jax.random.split(prng_seed, jax.device_count())
+        >>> prompt_ids = shard(prompt_ids)
+
+        >>> images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
+        >>> images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
+        ```
+"""
+
+
+class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
+    r"""
+    Flax-based pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`FlaxAutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.FlaxCLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`FlaxUNet2DConditionModel`]):
+            A `FlaxUNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or
+            [`FlaxDPMSolverMultistepScheduler`].
+        safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: FlaxAutoencoderKL,
+        text_encoder: FlaxCLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: FlaxUNet2DConditionModel,
+        scheduler: Union[
+            FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
+        ],
+        safety_checker: FlaxStableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        super().__init__()
+        self.dtype = dtype
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+    def prepare_inputs(self, prompt: Union[str, List[str]]):
+        if not isinstance(prompt, (str, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="np",
+        )
+        return text_input.input_ids
+
+    def _get_has_nsfw_concepts(self, features, params):
+        has_nsfw_concepts = self.safety_checker(features, params)
+        return has_nsfw_concepts
+
+    def _run_safety_checker(self, images, safety_model_params, jit=False):
+        # safety_model_params should already be replicated when jit is True
+        pil_images = [Image.fromarray(image) for image in images]
+        features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
+
+        if jit:
+            features = shard(features)
+            has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
+            has_nsfw_concepts = unshard(has_nsfw_concepts)
+            safety_model_params = unreplicate(safety_model_params)
+        else:
+            has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
+
+        images_was_copied = False
+        for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
+            if has_nsfw_concept:
+                if not images_was_copied:
+                    images_was_copied = True
+                    images = images.copy()
+
+                images[idx] = np.zeros(images[idx].shape, dtype=np.uint8)  # black image
+
+            if any(has_nsfw_concepts):
+                warnings.warn(
+                    "Potential NSFW content was detected in one or more images. A black image will be returned"
+                    " instead. Try again with a different prompt and/or seed."
+                )
+
+        return images, has_nsfw_concepts
+
+    def _generate(
+        self,
+        prompt_ids: jnp.array,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.Array,
+        num_inference_steps: int,
+        height: int,
+        width: int,
+        guidance_scale: float,
+        latents: Optional[jnp.ndarray] = None,
+        neg_prompt_ids: Optional[jnp.ndarray] = None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # get prompt text embeddings
+        prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
+
+        # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
+        # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
+        batch_size = prompt_ids.shape[0]
+
+        max_length = prompt_ids.shape[-1]
+
+        if neg_prompt_ids is None:
+            uncond_input = self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
+            ).input_ids
+        else:
+            uncond_input = neg_prompt_ids
+        negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
+        context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        # Ensure model output will be `float32` before going into the scheduler
+        guidance_scale = jnp.array([guidance_scale], dtype=jnp.float32)
+
+        latents_shape = (
+            batch_size,
+            self.unet.config.in_channels,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if latents is None:
+            latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+        def loop_body(step, args):
+            latents, scheduler_state = args
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            latents_input = jnp.concatenate([latents] * 2)
+
+            t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+            timestep = jnp.broadcast_to(t, latents_input.shape[0])
+
+            latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet.apply(
+                {"params": params["unet"]},
+                jnp.array(latents_input),
+                jnp.array(timestep, dtype=jnp.int32),
+                encoder_hidden_states=context,
+            ).sample
+            # perform guidance
+            noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
+            return latents, scheduler_state
+
+        scheduler_state = self.scheduler.set_timesteps(
+            params["scheduler"], num_inference_steps=num_inference_steps, shape=latents.shape
+        )
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * params["scheduler"].init_noise_sigma
+
+        if DEBUG:
+            # run with python for loop
+            for i in range(num_inference_steps):
+                latents, scheduler_state = loop_body(i, (latents, scheduler_state))
+        else:
+            latents, _ = jax.lax.fori_loop(0, num_inference_steps, loop_body, (latents, scheduler_state))
+
+        # scale and decode the image latents with vae
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
+
+        image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
+        return image
+
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt_ids: jnp.array,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.Array,
+        num_inference_steps: int = 50,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        guidance_scale: Union[float, jnp.ndarray] = 7.5,
+        latents: jnp.ndarray = None,
+        neg_prompt_ids: jnp.ndarray = None,
+        return_dict: bool = True,
+        jit: bool = False,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            latents (`jnp.ndarray`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                array is generated by sampling using the supplied random `generator`.
+            jit (`bool`, defaults to `False`):
+                Whether to run `pmap` versions of the generation and safety scoring functions.
+
+                    <Tip warning={true}>
+
+                    This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a
+                    future release.
+
+                    </Tip>
+
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of
+                a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated images
+                and the second element is a list of `bool`s indicating whether the corresponding generated image
+                contains "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        if isinstance(guidance_scale, float):
+            # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
+            # shape information, as they may be sharded (when `jit` is `True`), or not.
+            guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0])
+            if len(prompt_ids.shape) > 2:
+                # Assume sharded
+                guidance_scale = guidance_scale[:, None]
+
+        if jit:
+            images = _p_generate(
+                self,
+                prompt_ids,
+                params,
+                prng_seed,
+                num_inference_steps,
+                height,
+                width,
+                guidance_scale,
+                latents,
+                neg_prompt_ids,
+            )
+        else:
+            images = self._generate(
+                prompt_ids,
+                params,
+                prng_seed,
+                num_inference_steps,
+                height,
+                width,
+                guidance_scale,
+                latents,
+                neg_prompt_ids,
+            )
+
+        if self.safety_checker is not None:
+            safety_params = params["safety_checker"]
+            images_uint8_casted = (images * 255).round().astype("uint8")
+            num_devices, batch_size = images.shape[:2]
+
+            images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
+            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
+            images = np.asarray(images).copy()
+
+            # block images
+            if any(has_nsfw_concept):
+                for i, is_nsfw in enumerate(has_nsfw_concept):
+                    if is_nsfw:
+                        images[i, 0] = np.asarray(images_uint8_casted[i])
+
+            images = images.reshape(num_devices, batch_size, height, width, 3)
+        else:
+            images = np.asarray(images)
+            has_nsfw_concept = False
+
+        if not return_dict:
+            return (images, has_nsfw_concept)
+
+        return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
+
+
+# Static argnums are pipe, num_inference_steps, height, width. A change would trigger recompilation.
+# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`).
+@partial(
+    jax.pmap,
+    in_axes=(None, 0, 0, 0, None, None, None, 0, 0, 0),
+    static_broadcasted_argnums=(0, 4, 5, 6),
+)
+def _p_generate(
+    pipe,
+    prompt_ids,
+    params,
+    prng_seed,
+    num_inference_steps,
+    height,
+    width,
+    guidance_scale,
+    latents,
+    neg_prompt_ids,
+):
+    return pipe._generate(
+        prompt_ids,
+        params,
+        prng_seed,
+        num_inference_steps,
+        height,
+        width,
+        guidance_scale,
+        latents,
+        neg_prompt_ids,
+    )
+
+
+@partial(jax.pmap, static_broadcasted_argnums=(0,))
+def _p_get_has_nsfw_concepts(pipe, features, params):
+    return pipe._get_has_nsfw_concepts(features, params)
+
+
+def unshard(x: jnp.ndarray):
+    # einops.rearrange(x, 'd b ... -> (d b) ...')
+    num_devices, batch_size = x.shape[:2]
+    rest = x.shape[2:]
+    return x.reshape(num_devices * batch_size, *rest)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..7792bc0975955ab6131e1ba34c6953149e412208
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py
@@ -0,0 +1,532 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from functools import partial
+from typing import Dict, List, Optional, Union
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict
+from flax.jax_utils import unreplicate
+from flax.training.common_utils import shard
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel
+
+from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel
+from ...schedulers import (
+    FlaxDDIMScheduler,
+    FlaxDPMSolverMultistepScheduler,
+    FlaxLMSDiscreteScheduler,
+    FlaxPNDMScheduler,
+)
+from ...utils import PIL_INTERPOLATION, logging, replace_example_docstring
+from ..pipeline_flax_utils import FlaxDiffusionPipeline
+from .pipeline_output import FlaxStableDiffusionPipelineOutput
+from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Set to True to use python for loop instead of jax.fori_loop for easier debugging
+DEBUG = False
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import jax
+        >>> import numpy as np
+        >>> import jax.numpy as jnp
+        >>> from flax.jax_utils import replicate
+        >>> from flax.training.common_utils import shard
+        >>> import requests
+        >>> from io import BytesIO
+        >>> from PIL import Image
+        >>> from diffusers import FlaxStableDiffusionImg2ImgPipeline
+
+
+        >>> def create_key(seed=0):
+        ...     return jax.random.PRNGKey(seed)
+
+
+        >>> rng = create_key(0)
+
+        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+        >>> response = requests.get(url)
+        >>> init_img = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> init_img = init_img.resize((768, 512))
+
+        >>> prompts = "A fantasy landscape, trending on artstation"
+
+        >>> pipeline, params = FlaxStableDiffusionImg2ImgPipeline.from_pretrained(
+        ...     "CompVis/stable-diffusion-v1-4",
+        ...     revision="flax",
+        ...     dtype=jnp.bfloat16,
+        ... )
+
+        >>> num_samples = jax.device_count()
+        >>> rng = jax.random.split(rng, jax.device_count())
+        >>> prompt_ids, processed_image = pipeline.prepare_inputs(
+        ...     prompt=[prompts] * num_samples, image=[init_img] * num_samples
+        ... )
+        >>> p_params = replicate(params)
+        >>> prompt_ids = shard(prompt_ids)
+        >>> processed_image = shard(processed_image)
+
+        >>> output = pipeline(
+        ...     prompt_ids=prompt_ids,
+        ...     image=processed_image,
+        ...     params=p_params,
+        ...     prng_seed=rng,
+        ...     strength=0.75,
+        ...     num_inference_steps=50,
+        ...     jit=True,
+        ...     height=512,
+        ...     width=768,
+        ... ).images
+
+        >>> output_images = pipeline.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:])))
+        ```
+"""
+
+
+class FlaxStableDiffusionImg2ImgPipeline(FlaxDiffusionPipeline):
+    r"""
+    Flax-based pipeline for text-guided image-to-image generation using Stable Diffusion.
+
+    This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`FlaxAutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.FlaxCLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`FlaxUNet2DConditionModel`]):
+            A `FlaxUNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or
+            [`FlaxDPMSolverMultistepScheduler`].
+        safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: FlaxAutoencoderKL,
+        text_encoder: FlaxCLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: FlaxUNet2DConditionModel,
+        scheduler: Union[
+            FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
+        ],
+        safety_checker: FlaxStableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        super().__init__()
+        self.dtype = dtype
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+    def prepare_inputs(self, prompt: Union[str, List[str]], image: Union[Image.Image, List[Image.Image]]):
+        if not isinstance(prompt, (str, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if not isinstance(image, (Image.Image, list)):
+            raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
+
+        if isinstance(image, Image.Image):
+            image = [image]
+
+        processed_images = jnp.concatenate([preprocess(img, jnp.float32) for img in image])
+
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="np",
+        )
+        return text_input.input_ids, processed_images
+
+    def _get_has_nsfw_concepts(self, features, params):
+        has_nsfw_concepts = self.safety_checker(features, params)
+        return has_nsfw_concepts
+
+    def _run_safety_checker(self, images, safety_model_params, jit=False):
+        # safety_model_params should already be replicated when jit is True
+        pil_images = [Image.fromarray(image) for image in images]
+        features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
+
+        if jit:
+            features = shard(features)
+            has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
+            has_nsfw_concepts = unshard(has_nsfw_concepts)
+            safety_model_params = unreplicate(safety_model_params)
+        else:
+            has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
+
+        images_was_copied = False
+        for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
+            if has_nsfw_concept:
+                if not images_was_copied:
+                    images_was_copied = True
+                    images = images.copy()
+
+                images[idx] = np.zeros(images[idx].shape, dtype=np.uint8)  # black image
+
+            if any(has_nsfw_concepts):
+                warnings.warn(
+                    "Potential NSFW content was detected in one or more images. A black image will be returned"
+                    " instead. Try again with a different prompt and/or seed."
+                )
+
+        return images, has_nsfw_concepts
+
+    def get_timestep_start(self, num_inference_steps, strength):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+
+        return t_start
+
+    def _generate(
+        self,
+        prompt_ids: jnp.ndarray,
+        image: jnp.ndarray,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.Array,
+        start_timestep: int,
+        num_inference_steps: int,
+        height: int,
+        width: int,
+        guidance_scale: float,
+        noise: Optional[jnp.ndarray] = None,
+        neg_prompt_ids: Optional[jnp.ndarray] = None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # get prompt text embeddings
+        prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
+
+        # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
+        # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
+        batch_size = prompt_ids.shape[0]
+
+        max_length = prompt_ids.shape[-1]
+
+        if neg_prompt_ids is None:
+            uncond_input = self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
+            ).input_ids
+        else:
+            uncond_input = neg_prompt_ids
+        negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
+        context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        latents_shape = (
+            batch_size,
+            self.unet.config.in_channels,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if noise is None:
+            noise = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32)
+        else:
+            if noise.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {noise.shape}, expected {latents_shape}")
+
+        # Create init_latents
+        init_latent_dist = self.vae.apply({"params": params["vae"]}, image, method=self.vae.encode).latent_dist
+        init_latents = init_latent_dist.sample(key=prng_seed).transpose((0, 3, 1, 2))
+        init_latents = self.vae.config.scaling_factor * init_latents
+
+        def loop_body(step, args):
+            latents, scheduler_state = args
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            latents_input = jnp.concatenate([latents] * 2)
+
+            t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+            timestep = jnp.broadcast_to(t, latents_input.shape[0])
+
+            latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet.apply(
+                {"params": params["unet"]},
+                jnp.array(latents_input),
+                jnp.array(timestep, dtype=jnp.int32),
+                encoder_hidden_states=context,
+            ).sample
+            # perform guidance
+            noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
+            return latents, scheduler_state
+
+        scheduler_state = self.scheduler.set_timesteps(
+            params["scheduler"], num_inference_steps=num_inference_steps, shape=latents_shape
+        )
+
+        latent_timestep = scheduler_state.timesteps[start_timestep : start_timestep + 1].repeat(batch_size)
+
+        latents = self.scheduler.add_noise(params["scheduler"], init_latents, noise, latent_timestep)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * params["scheduler"].init_noise_sigma
+
+        if DEBUG:
+            # run with python for loop
+            for i in range(start_timestep, num_inference_steps):
+                latents, scheduler_state = loop_body(i, (latents, scheduler_state))
+        else:
+            latents, _ = jax.lax.fori_loop(start_timestep, num_inference_steps, loop_body, (latents, scheduler_state))
+
+        # scale and decode the image latents with vae
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
+
+        image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
+        return image
+
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt_ids: jnp.ndarray,
+        image: jnp.ndarray,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.Array,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        guidance_scale: Union[float, jnp.ndarray] = 7.5,
+        noise: jnp.ndarray = None,
+        neg_prompt_ids: jnp.ndarray = None,
+        return_dict: bool = True,
+        jit: bool = False,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt_ids (`jnp.ndarray`):
+                The prompt or prompts to guide image generation.
+            image (`jnp.ndarray`):
+                Array representing an image batch to be used as the starting point.
+            params (`Dict` or `FrozenDict`):
+                Dictionary containing the model parameters/weights.
+            prng_seed (`jax.Array` or `jax.Array`):
+                Array containing random number generator key.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            noise (`jnp.ndarray`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. The array is generated by
+                sampling using the supplied random `generator`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of
+                a plain tuple.
+            jit (`bool`, defaults to `False`):
+                Whether to run `pmap` versions of the generation and safety scoring functions.
+
+                    <Tip warning={true}>
+
+                    This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a
+                    future release.
+
+                    </Tip>
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated images
+                and the second element is a list of `bool`s indicating whether the corresponding generated image
+                contains "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        if isinstance(guidance_scale, float):
+            # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
+            # shape information, as they may be sharded (when `jit` is `True`), or not.
+            guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0])
+            if len(prompt_ids.shape) > 2:
+                # Assume sharded
+                guidance_scale = guidance_scale[:, None]
+
+        start_timestep = self.get_timestep_start(num_inference_steps, strength)
+
+        if jit:
+            images = _p_generate(
+                self,
+                prompt_ids,
+                image,
+                params,
+                prng_seed,
+                start_timestep,
+                num_inference_steps,
+                height,
+                width,
+                guidance_scale,
+                noise,
+                neg_prompt_ids,
+            )
+        else:
+            images = self._generate(
+                prompt_ids,
+                image,
+                params,
+                prng_seed,
+                start_timestep,
+                num_inference_steps,
+                height,
+                width,
+                guidance_scale,
+                noise,
+                neg_prompt_ids,
+            )
+
+        if self.safety_checker is not None:
+            safety_params = params["safety_checker"]
+            images_uint8_casted = (images * 255).round().astype("uint8")
+            num_devices, batch_size = images.shape[:2]
+
+            images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
+            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
+            images = np.asarray(images)
+
+            # block images
+            if any(has_nsfw_concept):
+                for i, is_nsfw in enumerate(has_nsfw_concept):
+                    if is_nsfw:
+                        images[i] = np.asarray(images_uint8_casted[i])
+
+            images = images.reshape(num_devices, batch_size, height, width, 3)
+        else:
+            images = np.asarray(images)
+            has_nsfw_concept = False
+
+        if not return_dict:
+            return (images, has_nsfw_concept)
+
+        return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
+
+
+# Static argnums are pipe, start_timestep, num_inference_steps, height, width. A change would trigger recompilation.
+# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`).
+@partial(
+    jax.pmap,
+    in_axes=(None, 0, 0, 0, 0, None, None, None, None, 0, 0, 0),
+    static_broadcasted_argnums=(0, 5, 6, 7, 8),
+)
+def _p_generate(
+    pipe,
+    prompt_ids,
+    image,
+    params,
+    prng_seed,
+    start_timestep,
+    num_inference_steps,
+    height,
+    width,
+    guidance_scale,
+    noise,
+    neg_prompt_ids,
+):
+    return pipe._generate(
+        prompt_ids,
+        image,
+        params,
+        prng_seed,
+        start_timestep,
+        num_inference_steps,
+        height,
+        width,
+        guidance_scale,
+        noise,
+        neg_prompt_ids,
+    )
+
+
+@partial(jax.pmap, static_broadcasted_argnums=(0,))
+def _p_get_has_nsfw_concepts(pipe, features, params):
+    return pipe._get_has_nsfw_concepts(features, params)
+
+
+def unshard(x: jnp.ndarray):
+    # einops.rearrange(x, 'd b ... -> (d b) ...')
+    num_devices, batch_size = x.shape[:2]
+    rest = x.shape[2:]
+    return x.reshape(num_devices * batch_size, *rest)
+
+
+def preprocess(image, dtype):
+    w, h = image.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+    image = jnp.array(image).astype(dtype) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    return 2.0 * image - 1.0
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6bb0ac299b3bc382fd0a79b1d0a78e907ebd790
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py
@@ -0,0 +1,589 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from functools import partial
+from typing import Dict, List, Optional, Union
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict
+from flax.jax_utils import unreplicate
+from flax.training.common_utils import shard
+from packaging import version
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel
+
+from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel
+from ...schedulers import (
+    FlaxDDIMScheduler,
+    FlaxDPMSolverMultistepScheduler,
+    FlaxLMSDiscreteScheduler,
+    FlaxPNDMScheduler,
+)
+from ...utils import PIL_INTERPOLATION, deprecate, logging, replace_example_docstring
+from ..pipeline_flax_utils import FlaxDiffusionPipeline
+from .pipeline_output import FlaxStableDiffusionPipelineOutput
+from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Set to True to use python for loop instead of jax.fori_loop for easier debugging
+DEBUG = False
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import jax
+        >>> import numpy as np
+        >>> from flax.jax_utils import replicate
+        >>> from flax.training.common_utils import shard
+        >>> import PIL
+        >>> import requests
+        >>> from io import BytesIO
+        >>> from diffusers import FlaxStableDiffusionInpaintPipeline
+
+
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+        >>> init_image = download_image(img_url).resize((512, 512))
+        >>> mask_image = download_image(mask_url).resize((512, 512))
+
+        >>> pipeline, params = FlaxStableDiffusionInpaintPipeline.from_pretrained(
+        ...     "xvjiarui/stable-diffusion-2-inpainting"
+        ... )
+
+        >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+        >>> prng_seed = jax.random.PRNGKey(0)
+        >>> num_inference_steps = 50
+
+        >>> num_samples = jax.device_count()
+        >>> prompt = num_samples * [prompt]
+        >>> init_image = num_samples * [init_image]
+        >>> mask_image = num_samples * [mask_image]
+        >>> prompt_ids, processed_masked_images, processed_masks = pipeline.prepare_inputs(
+        ...     prompt, init_image, mask_image
+        ... )
+        # shard inputs and rng
+
+        >>> params = replicate(params)
+        >>> prng_seed = jax.random.split(prng_seed, jax.device_count())
+        >>> prompt_ids = shard(prompt_ids)
+        >>> processed_masked_images = shard(processed_masked_images)
+        >>> processed_masks = shard(processed_masks)
+
+        >>> images = pipeline(
+        ...     prompt_ids, processed_masks, processed_masked_images, params, prng_seed, num_inference_steps, jit=True
+        ... ).images
+        >>> images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
+        ```
+"""
+
+
+class FlaxStableDiffusionInpaintPipeline(FlaxDiffusionPipeline):
+    r"""
+    Flax-based pipeline for text-guided image inpainting using Stable Diffusion.
+
+    <Tip warning={true}>
+
+    🧪 This is an experimental feature!
+
+    </Tip>
+
+    This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`FlaxAutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.FlaxCLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`FlaxUNet2DConditionModel`]):
+            A `FlaxUNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or
+            [`FlaxDPMSolverMultistepScheduler`].
+        safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: FlaxAutoencoderKL,
+        text_encoder: FlaxCLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: FlaxUNet2DConditionModel,
+        scheduler: Union[
+            FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
+        ],
+        safety_checker: FlaxStableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        super().__init__()
+        self.dtype = dtype
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+    def prepare_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[Image.Image, List[Image.Image]],
+        mask: Union[Image.Image, List[Image.Image]],
+    ):
+        if not isinstance(prompt, (str, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if not isinstance(image, (Image.Image, list)):
+            raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
+
+        if isinstance(image, Image.Image):
+            image = [image]
+
+        if not isinstance(mask, (Image.Image, list)):
+            raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
+
+        if isinstance(mask, Image.Image):
+            mask = [mask]
+
+        processed_images = jnp.concatenate([preprocess_image(img, jnp.float32) for img in image])
+        processed_masks = jnp.concatenate([preprocess_mask(m, jnp.float32) for m in mask])
+        # processed_masks[processed_masks < 0.5] = 0
+        processed_masks = processed_masks.at[processed_masks < 0.5].set(0)
+        # processed_masks[processed_masks >= 0.5] = 1
+        processed_masks = processed_masks.at[processed_masks >= 0.5].set(1)
+
+        processed_masked_images = processed_images * (processed_masks < 0.5)
+
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="np",
+        )
+        return text_input.input_ids, processed_masked_images, processed_masks
+
+    def _get_has_nsfw_concepts(self, features, params):
+        has_nsfw_concepts = self.safety_checker(features, params)
+        return has_nsfw_concepts
+
+    def _run_safety_checker(self, images, safety_model_params, jit=False):
+        # safety_model_params should already be replicated when jit is True
+        pil_images = [Image.fromarray(image) for image in images]
+        features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
+
+        if jit:
+            features = shard(features)
+            has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
+            has_nsfw_concepts = unshard(has_nsfw_concepts)
+            safety_model_params = unreplicate(safety_model_params)
+        else:
+            has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
+
+        images_was_copied = False
+        for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
+            if has_nsfw_concept:
+                if not images_was_copied:
+                    images_was_copied = True
+                    images = images.copy()
+
+                images[idx] = np.zeros(images[idx].shape, dtype=np.uint8)  # black image
+
+            if any(has_nsfw_concepts):
+                warnings.warn(
+                    "Potential NSFW content was detected in one or more images. A black image will be returned"
+                    " instead. Try again with a different prompt and/or seed."
+                )
+
+        return images, has_nsfw_concepts
+
+    def _generate(
+        self,
+        prompt_ids: jnp.ndarray,
+        mask: jnp.ndarray,
+        masked_image: jnp.ndarray,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.Array,
+        num_inference_steps: int,
+        height: int,
+        width: int,
+        guidance_scale: float,
+        latents: Optional[jnp.ndarray] = None,
+        neg_prompt_ids: Optional[jnp.ndarray] = None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # get prompt text embeddings
+        prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
+
+        # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
+        # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
+        batch_size = prompt_ids.shape[0]
+
+        max_length = prompt_ids.shape[-1]
+
+        if neg_prompt_ids is None:
+            uncond_input = self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
+            ).input_ids
+        else:
+            uncond_input = neg_prompt_ids
+        negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
+        context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        latents_shape = (
+            batch_size,
+            self.vae.config.latent_channels,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if latents is None:
+            latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=self.dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+        prng_seed, mask_prng_seed = jax.random.split(prng_seed)
+
+        masked_image_latent_dist = self.vae.apply(
+            {"params": params["vae"]}, masked_image, method=self.vae.encode
+        ).latent_dist
+        masked_image_latents = masked_image_latent_dist.sample(key=mask_prng_seed).transpose((0, 3, 1, 2))
+        masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
+        del mask_prng_seed
+
+        mask = jax.image.resize(mask, (*mask.shape[:-2], *masked_image_latents.shape[-2:]), method="nearest")
+
+        # 8. Check that sizes of mask, masked image and latents match
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_mask = mask.shape[1]
+        num_channels_masked_image = masked_image_latents.shape[1]
+        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                " `pipeline.unet` or your `mask_image` or `image` input."
+            )
+
+        def loop_body(step, args):
+            latents, mask, masked_image_latents, scheduler_state = args
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            latents_input = jnp.concatenate([latents] * 2)
+            mask_input = jnp.concatenate([mask] * 2)
+            masked_image_latents_input = jnp.concatenate([masked_image_latents] * 2)
+
+            t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+            timestep = jnp.broadcast_to(t, latents_input.shape[0])
+
+            latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t)
+            # concat latents, mask, masked_image_latents in the channel dimension
+            latents_input = jnp.concatenate([latents_input, mask_input, masked_image_latents_input], axis=1)
+
+            # predict the noise residual
+            noise_pred = self.unet.apply(
+                {"params": params["unet"]},
+                jnp.array(latents_input),
+                jnp.array(timestep, dtype=jnp.int32),
+                encoder_hidden_states=context,
+            ).sample
+            # perform guidance
+            noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
+            return latents, mask, masked_image_latents, scheduler_state
+
+        scheduler_state = self.scheduler.set_timesteps(
+            params["scheduler"], num_inference_steps=num_inference_steps, shape=latents.shape
+        )
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * params["scheduler"].init_noise_sigma
+
+        if DEBUG:
+            # run with python for loop
+            for i in range(num_inference_steps):
+                latents, mask, masked_image_latents, scheduler_state = loop_body(
+                    i, (latents, mask, masked_image_latents, scheduler_state)
+                )
+        else:
+            latents, _, _, _ = jax.lax.fori_loop(
+                0, num_inference_steps, loop_body, (latents, mask, masked_image_latents, scheduler_state)
+            )
+
+        # scale and decode the image latents with vae
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
+
+        image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
+        return image
+
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt_ids: jnp.ndarray,
+        mask: jnp.ndarray,
+        masked_image: jnp.ndarray,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.Array,
+        num_inference_steps: int = 50,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        guidance_scale: Union[float, jnp.ndarray] = 7.5,
+        latents: jnp.ndarray = None,
+        neg_prompt_ids: jnp.ndarray = None,
+        return_dict: bool = True,
+        jit: bool = False,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            latents (`jnp.ndarray`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                array is generated by sampling using the supplied random `generator`.
+            jit (`bool`, defaults to `False`):
+                Whether to run `pmap` versions of the generation and safety scoring functions.
+
+                    <Tip warning={true}>
+
+                    This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a
+                    future release.
+
+                    </Tip>
+
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of
+                a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated images
+                and the second element is a list of `bool`s indicating whether the corresponding generated image
+                contains "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        masked_image = jax.image.resize(masked_image, (*masked_image.shape[:-2], height, width), method="bicubic")
+        mask = jax.image.resize(mask, (*mask.shape[:-2], height, width), method="nearest")
+
+        if isinstance(guidance_scale, float):
+            # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
+            # shape information, as they may be sharded (when `jit` is `True`), or not.
+            guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0])
+            if len(prompt_ids.shape) > 2:
+                # Assume sharded
+                guidance_scale = guidance_scale[:, None]
+
+        if jit:
+            images = _p_generate(
+                self,
+                prompt_ids,
+                mask,
+                masked_image,
+                params,
+                prng_seed,
+                num_inference_steps,
+                height,
+                width,
+                guidance_scale,
+                latents,
+                neg_prompt_ids,
+            )
+        else:
+            images = self._generate(
+                prompt_ids,
+                mask,
+                masked_image,
+                params,
+                prng_seed,
+                num_inference_steps,
+                height,
+                width,
+                guidance_scale,
+                latents,
+                neg_prompt_ids,
+            )
+
+        if self.safety_checker is not None:
+            safety_params = params["safety_checker"]
+            images_uint8_casted = (images * 255).round().astype("uint8")
+            num_devices, batch_size = images.shape[:2]
+
+            images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
+            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
+            images = np.asarray(images)
+
+            # block images
+            if any(has_nsfw_concept):
+                for i, is_nsfw in enumerate(has_nsfw_concept):
+                    if is_nsfw:
+                        images[i] = np.asarray(images_uint8_casted[i])
+
+            images = images.reshape(num_devices, batch_size, height, width, 3)
+        else:
+            images = np.asarray(images)
+            has_nsfw_concept = False
+
+        if not return_dict:
+            return (images, has_nsfw_concept)
+
+        return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
+
+
+# Static argnums are pipe, num_inference_steps, height, width. A change would trigger recompilation.
+# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`).
+@partial(
+    jax.pmap,
+    in_axes=(None, 0, 0, 0, 0, 0, None, None, None, 0, 0, 0),
+    static_broadcasted_argnums=(0, 6, 7, 8),
+)
+def _p_generate(
+    pipe,
+    prompt_ids,
+    mask,
+    masked_image,
+    params,
+    prng_seed,
+    num_inference_steps,
+    height,
+    width,
+    guidance_scale,
+    latents,
+    neg_prompt_ids,
+):
+    return pipe._generate(
+        prompt_ids,
+        mask,
+        masked_image,
+        params,
+        prng_seed,
+        num_inference_steps,
+        height,
+        width,
+        guidance_scale,
+        latents,
+        neg_prompt_ids,
+    )
+
+
+@partial(jax.pmap, static_broadcasted_argnums=(0,))
+def _p_get_has_nsfw_concepts(pipe, features, params):
+    return pipe._get_has_nsfw_concepts(features, params)
+
+
+def unshard(x: jnp.ndarray):
+    # einops.rearrange(x, 'd b ... -> (d b) ...')
+    num_devices, batch_size = x.shape[:2]
+    rest = x.shape[2:]
+    return x.reshape(num_devices * batch_size, *rest)
+
+
+def preprocess_image(image, dtype):
+    w, h = image.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+    image = jnp.array(image).astype(dtype) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask, dtype):
+    w, h = mask.size
+    w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
+    mask = mask.resize((w, h))
+    mask = jnp.array(mask.convert("L")).astype(dtype) / 255.0
+    mask = jnp.expand_dims(mask, axis=(0, 1))
+
+    return mask
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e34dcb83c010c344f5b8463a939d1932ed5f769
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -0,0 +1,487 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import CLIPImageProcessor, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ..pipeline_utils import DiffusionPipeline
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)
+
+
+class OnnxStableDiffusionPipeline(DiffusionPipeline):
+    vae_encoder: OnnxRuntimeModel
+    vae_decoder: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: OnnxRuntimeModel
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+    safety_checker: OnnxRuntimeModel
+    feature_extractor: CLIPImageProcessor
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _is_onnx = True
+
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: Optional[int],
+        do_classifier_free_guidance: bool,
+        negative_prompt: Optional[str],
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
+
+            if not np.array_equal(text_input_ids, untruncated_ids):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+
+        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int],
+        width: Optional[int],
+        callback_steps: int,
+        negative_prompt: Optional[str] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        latents: Optional[np.ndarray] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.Tensor`):
+                `Image`, or tensor representing an image batch which will be upscaled. *
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                One or a list of [numpy generator(s)](TODO) to make generation deterministic.
+            latents (`np.ndarray`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if generator is None:
+            generator = np.random
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # get the initial random noise unless the user supplied it
+        latents_dtype = prompt_embeds.dtype
+        latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
+        if latents is None:
+            latents = generator.randn(*latents_shape).astype(latents_dtype)
+        elif latents.shape != latents_shape:
+            raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        latents = latents * np.float64(self.scheduler.init_noise_sigma)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        timestep_dtype = next(
+            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+            latent_model_input = latent_model_input.cpu().numpy()
+
+            # predict the noise residual
+            timestep = np.array([t], dtype=timestep_dtype)
+            noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)
+            noise_pred = noise_pred[0]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            scheduler_output = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            )
+            latents = scheduler_output.prev_sample.numpy()
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
+        image = np.concatenate(
+            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+        )
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+class StableDiffusionOnnxPipeline(OnnxStableDiffusionPipeline):
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        deprecation_message = "Please use `OnnxStableDiffusionPipeline` instead of `StableDiffusionOnnxPipeline`."
+        deprecate("StableDiffusionOnnxPipeline", "1.0.0", deprecation_message)
+        super().__init__(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..c39409886bd9d27faf71d6e343b5e45fa8b6a51c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -0,0 +1,549 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ..pipeline_utils import DiffusionPipeline
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess with 8->64
+def preprocess(image):
+    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
+    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 64 for x in (w, h))  # resize to integer multiple of 64
+
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image to image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    vae_encoder: OnnxRuntimeModel
+    vae_decoder: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: OnnxRuntimeModel
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+    safety_checker: OnnxRuntimeModel
+    feature_extractor: CLIPImageProcessor
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _is_onnx = True
+
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: Optional[int],
+        do_classifier_free_guidance: bool,
+        negative_prompt: Optional[str],
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
+
+            if not np.array_equal(text_input_ids, untruncated_ids):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+
+        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        callback_steps: int,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[np.ndarray, PIL.Image.Image] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # check inputs. Raise error if not correct
+        self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+        # define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if generator is None:
+            generator = np.random
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        image = preprocess(image).cpu().numpy()
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        latents_dtype = prompt_embeds.dtype
+        image = image.astype(latents_dtype)
+        # encode the init image into latents and scale the latents
+        init_latents = self.vae_encoder(sample=image)[0]
+        init_latents = 0.18215 * init_latents
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = len(prompt) // init_latents.shape[0]
+            init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0)
+        elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
+            )
+        else:
+            init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0)
+
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        timesteps = self.scheduler.timesteps.numpy()[-init_timestep]
+        timesteps = np.array([timesteps] * batch_size * num_images_per_prompt)
+
+        # add noise to latents using the timesteps
+        noise = generator.randn(*init_latents.shape).astype(latents_dtype)
+        init_latents = self.scheduler.add_noise(
+            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
+        )
+        init_latents = init_latents.numpy()
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        latents = init_latents
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        timesteps = self.scheduler.timesteps[t_start:].numpy()
+
+        timestep_dtype = next(
+            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+            latent_model_input = latent_model_input.cpu().numpy()
+
+            # predict the noise residual
+            timestep = np.array([t], dtype=timestep_dtype)
+            noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[
+                0
+            ]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            scheduler_output = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            )
+            latents = scheduler_output.prev_sample.numpy()
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
+        image = np.concatenate(
+            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+        )
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # safety_checker does not support batched inputs yet
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..18d8050826cc3b755f7d42dd1903b5c80f2c6672
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -0,0 +1,563 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ..pipeline_utils import DiffusionPipeline
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+NUM_UNET_INPUT_CHANNELS = 9
+NUM_LATENT_CHANNELS = 4
+
+
+def prepare_mask_and_masked_image(image, mask, latents_shape):
+    image = np.array(image.convert("RGB").resize((latents_shape[1] * 8, latents_shape[0] * 8)))
+    image = image[None].transpose(0, 3, 1, 2)
+    image = image.astype(np.float32) / 127.5 - 1.0
+
+    image_mask = np.array(mask.convert("L").resize((latents_shape[1] * 8, latents_shape[0] * 8)))
+    masked_image = image * (image_mask < 127.5)
+
+    mask = mask.resize((latents_shape[1], latents_shape[0]), PIL_INTERPOLATION["nearest"])
+    mask = np.array(mask.convert("L"))
+    mask = mask.astype(np.float32) / 255.0
+    mask = mask[None, None]
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
+
+    return mask, masked_image
+
+
+class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    vae_encoder: OnnxRuntimeModel
+    vae_decoder: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: OnnxRuntimeModel
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+    safety_checker: OnnxRuntimeModel
+    feature_extractor: CLIPImageProcessor
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _is_onnx = True
+
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+        logger.info("`OnnxStableDiffusionInpaintPipeline` is experimental and will very likely change in the future.")
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: Optional[int],
+        do_classifier_free_guidance: bool,
+        negative_prompt: Optional[str],
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
+
+            if not np.array_equal(text_input_ids, untruncated_ids):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+
+        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int],
+        width: Optional[int],
+        callback_steps: int,
+        negative_prompt: Optional[str] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: PIL.Image.Image,
+        mask_image: PIL.Image.Image,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        latents: Optional[np.ndarray] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
+            latents (`np.ndarray`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if generator is None:
+            generator = np.random
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        num_channels_latents = NUM_LATENT_CHANNELS
+        latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
+        latents_dtype = prompt_embeds.dtype
+        if latents is None:
+            latents = generator.randn(*latents_shape).astype(latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+        # prepare mask and masked_image
+        mask, masked_image = prepare_mask_and_masked_image(image, mask_image, latents_shape[-2:])
+        mask = mask.astype(latents.dtype)
+        masked_image = masked_image.astype(latents.dtype)
+
+        masked_image_latents = self.vae_encoder(sample=masked_image)[0]
+        masked_image_latents = 0.18215 * masked_image_latents
+
+        # duplicate mask and masked_image_latents for each generation per prompt
+        mask = mask.repeat(batch_size * num_images_per_prompt, 0)
+        masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 0)
+
+        mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            np.concatenate([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        num_channels_mask = mask.shape[1]
+        num_channels_masked_image = masked_image_latents.shape[1]
+
+        unet_input_channels = NUM_UNET_INPUT_CHANNELS
+        if num_channels_latents + num_channels_mask + num_channels_masked_image != unet_input_channels:
+            raise ValueError(
+                "Incorrect configuration settings! The config of `pipeline.unet` expects"
+                f" {unet_input_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                " `pipeline.unet` or your `mask_image` or `image` input."
+            )
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * np.float64(self.scheduler.init_noise_sigma)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        timestep_dtype = next(
+            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            # concat latents, mask, masked_image_latnets in the channel dimension
+            latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+            latent_model_input = latent_model_input.cpu().numpy()
+            latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1)
+
+            # predict the noise residual
+            timestep = np.array([t], dtype=timestep_dtype)
+            noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[
+                0
+            ]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            scheduler_output = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            )
+            latents = scheduler_output.prev_sample.numpy()
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
+        image = np.concatenate(
+            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+        )
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # safety_checker does not support batched inputs yet
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd9ec57fb8792a56f7382ff6beef2a925ae897ef
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
@@ -0,0 +1,586 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
+from ...utils import deprecate, logging
+from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ..pipeline_utils import DiffusionPipeline
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)
+
+
+def preprocess(image):
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 64 for x in (w, h))  # resize to integer multiple of 32
+
+        image = [np.array(i.resize((w, h)))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+
+    return image
+
+
+class OnnxStableDiffusionUpscalePipeline(DiffusionPipeline):
+    vae: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: OnnxRuntimeModel
+    low_res_scheduler: DDPMScheduler
+    scheduler: KarrasDiffusionSchedulers
+    safety_checker: OnnxRuntimeModel
+    feature_extractor: CLIPImageProcessor
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _is_onnx = True
+
+    def __init__(
+        self,
+        vae: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: Any,
+        unet: OnnxRuntimeModel,
+        low_res_scheduler: DDPMScheduler,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: Optional[OnnxRuntimeModel] = None,
+        feature_extractor: Optional[CLIPImageProcessor] = None,
+        max_noise_level: int = 350,
+        num_latent_channels=4,
+        num_unet_input_channels=7,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            low_res_scheduler=low_res_scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(
+            max_noise_level=max_noise_level,
+            num_latent_channels=num_latent_channels,
+            num_unet_input_channels=num_unet_input_channels,
+        )
+
+    def check_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        image,
+        noise_level,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, np.ndarray)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `np.ndarray`, `PIL.Image.Image` or `list` but is {type(image)}"
+            )
+
+        # verify batch size of prompt and image are same if image is a list or tensor or numpy array
+        if isinstance(image, (list, np.ndarray)):
+            if prompt is not None and isinstance(prompt, str):
+                batch_size = 1
+            elif prompt is not None and isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                batch_size = prompt_embeds.shape[0]
+
+            if isinstance(image, list):
+                image_batch_size = len(image)
+            else:
+                image_batch_size = image.shape[0]
+            if batch_size != image_batch_size:
+                raise ValueError(
+                    f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
+                    " Please make sure that passed `prompt` matches the batch size of `image`."
+                )
+
+        # check noise level
+        if noise_level > self.config.max_noise_level:
+            raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height, width)
+        if latents is None:
+            latents = generator.randn(*shape).astype(dtype)
+        elif latents.shape != shape:
+            raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+
+        return latents
+
+    def decode_latents(self, latents):
+        latents = 1 / 0.08333 * latents
+        image = self.vae(latent_sample=latents)[0]
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+        return image
+
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: Optional[int],
+        do_classifier_free_guidance: bool,
+        negative_prompt: Optional[str],
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
+
+            if not np.array_equal(text_input_ids, untruncated_ids):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+
+        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[np.ndarray, PIL.Image.Image, List[PIL.Image.Image]],
+        num_inference_steps: int = 75,
+        guidance_scale: float = 9.0,
+        noise_level: int = 20,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[np.random.RandomState, List[np.random.RandomState]]] = None,
+        latents: Optional[np.ndarray] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            noise_level (`float`, defaults to 0.2):
+                Deteremines the amount of noise to add to the initial image before performing upscaling.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            image,
+            noise_level,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if generator is None:
+            generator = np.random
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        latents_dtype = prompt_embeds.dtype
+        image = preprocess(image).cpu().numpy()
+        height, width = image.shape[2:]
+
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            self.config.num_latent_channels,
+            height,
+            width,
+            latents_dtype,
+            generator,
+        )
+        image = image.astype(latents_dtype)
+
+        self.scheduler.set_timesteps(num_inference_steps)
+        timesteps = self.scheduler.timesteps
+
+        # Scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * np.float64(self.scheduler.init_noise_sigma)
+
+        # 5. Add noise to image
+        noise_level = np.array([noise_level]).astype(np.int64)
+        noise = generator.randn(*image.shape).astype(latents_dtype)
+
+        image = self.low_res_scheduler.add_noise(
+            torch.from_numpy(image), torch.from_numpy(noise), torch.from_numpy(noise_level)
+        )
+        image = image.numpy()
+
+        batch_multiplier = 2 if do_classifier_free_guidance else 1
+        image = np.concatenate([image] * batch_multiplier * num_images_per_prompt)
+        noise_level = np.concatenate([noise_level] * image.shape[0])
+
+        # 7. Check that sizes of image and latents match
+        num_channels_image = image.shape[1]
+        if self.config.num_latent_channels + num_channels_image != self.config.num_unet_input_channels:
+            raise ValueError(
+                "Incorrect configuration settings! The config of `pipeline.unet` expects"
+                f" {self.config.num_unet_input_channels} but received `num_channels_latents`: {self.config.num_latent_channels} +"
+                f" `num_channels_image`: {num_channels_image} "
+                f" = {self.config.num_latent_channels + num_channels_image}. Please verify the config of"
+                " `pipeline.unet` or your `image` input."
+            )
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        timestep_dtype = next(
+            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                latent_model_input = np.concatenate([latent_model_input, image], axis=1)
+
+                # timestep to tensor
+                timestep = np.array([t], dtype=timestep_dtype)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    sample=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    class_labels=noise_level,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+                ).prev_sample
+                latents = latents.numpy()
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 10. Post-processing
+        image = self.decode_latents(latents)
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_output.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fb9b1a1412d96b69144a4c2e960dcc8b75a615c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_output.py
@@ -0,0 +1,45 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput, is_flax_available
+
+
+@dataclass
+class StableDiffusionPipelineOutput(BaseOutput):
+    """
+    Output class for Stable Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+        nsfw_content_detected (`List[bool]`)
+            List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
+            `None` if safety checking could not be performed.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+
+
+if is_flax_available():
+    import flax
+
+    @flax.struct.dataclass
+    class FlaxStableDiffusionPipelineOutput(BaseOutput):
+        """
+        Output class for Flax-based Stable Diffusion pipelines.
+
+        Args:
+            images (`np.ndarray`):
+                Denoised images of array shape of `(batch_size, height, width, num_channels)`.
+            nsfw_content_detected (`List[bool]`):
+                List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content
+                or `None` if safety checking could not be performed.
+        """
+
+        images: np.ndarray
+        nsfw_content_detected: List[bool]
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2bbec7b3c3f3428ddf60d7b0ec8ce2d490a3047
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -0,0 +1,1072 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...configuration_utils import FrozenDict
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionPipeline
+
+        >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
+            else None
+        )
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..7801b0d01dffe59712da18866eb579d5d9818a1e
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -0,0 +1,875 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPTextModel, CLIPTokenizer, DPTForDepthEstimation, DPTImageProcessor
+
+from ...configuration_utils import FrozenDict
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
+def preprocess(image):
+    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
+    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin):
+    r"""
+    Pipeline for text-guided depth-based image-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "depth_mask"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        depth_estimator: DPTForDepthEstimation,
+        feature_extractor: DPTImageProcessor,
+    ):
+        super().__init__()
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            depth_estimator=depth_estimator,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        strength,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                    raise ValueError(
+                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_guidance, dtype, device):
+        if isinstance(image, PIL.Image.Image):
+            image = [image]
+        else:
+            image = list(image)
+
+        if isinstance(image[0], PIL.Image.Image):
+            width, height = image[0].size
+        elif isinstance(image[0], np.ndarray):
+            width, height = image[0].shape[:-1]
+        else:
+            height, width = image[0].shape[-2:]
+
+        if depth_map is None:
+            pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values
+            pixel_values = pixel_values.to(device=device, dtype=dtype)
+            # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16.
+            # So we use `torch.autocast` here for half precision inference.
+            if torch.backends.mps.is_available():
+                autocast_ctx = contextlib.nullcontext()
+                logger.warning(
+                    "The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16, but autocast is not yet supported on MPS."
+                )
+            else:
+                autocast_ctx = torch.autocast(device.type, dtype=dtype)
+
+            with autocast_ctx:
+                depth_map = self.depth_estimator(pixel_values).predicted_depth
+        else:
+            depth_map = depth_map.to(device=device, dtype=dtype)
+
+        depth_map = torch.nn.functional.interpolate(
+            depth_map.unsqueeze(1),
+            size=(height // self.vae_scale_factor, width // self.vae_scale_factor),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
+        depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
+        depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0
+        depth_map = depth_map.to(dtype)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if depth_map.shape[0] < batch_size:
+            repeat_by = batch_size // depth_map.shape[0]
+            depth_map = depth_map.repeat(repeat_by, 1, 1, 1)
+
+        depth_map = torch.cat([depth_map] * 2) if do_classifier_free_guidance else depth_map
+        return depth_map
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        depth_map: Optional[torch.Tensor] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image` or tensor representing an image batch to be used as the starting point. Can accept image
+                latents as `image` only if `depth_map` is not `None`.
+            depth_map (`torch.Tensor`, *optional*):
+                Depth prediction to be used as additional conditioning for the image generation process. If not
+                defined, it automatically predicts the depth with `self.depth_estimator`.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+        Examples:
+
+        ```py
+        >>> import torch
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> from diffusers import StableDiffusionDepth2ImgPipeline
+
+        >>> pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-2-depth",
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> pipe.to("cuda")
+
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> init_image = Image.open(requests.get(url, stream=True).raw)
+        >>> prompt = "two tigers"
+        >>> n_prompt = "bad, deformed, ugly, bad anotomy"
+        >>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0]
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            strength,
+            callback_steps,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+
+        if image is None:
+            raise ValueError("`image` input cannot be undefined.")
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare depth mask
+        depth_mask = self.prepare_depth_map(
+            image,
+            depth_map,
+            batch_size * num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            prompt_embeds.dtype,
+            device,
+        )
+
+        # 5. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 6. Set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 7. Prepare latent variables
+        latents = self.prepare_latents(
+            image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+        )
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                latent_model_input = torch.cat([latent_model_input, depth_mask], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    depth_mask = callback_outputs.pop("depth_mask", depth_mask)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
new file mode 100644
index 0000000000000000000000000000000000000000..93a8bd160318400cddbbdec46465fcccdc506029
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -0,0 +1,425 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import deprecate, logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class StableDiffusionImageVariationPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline to generate image variations from an input image using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
+            Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    # TODO: feature_extractor is required to encode images (if they are in PIL format),
+    # we should give a descriptive message if the pipeline doesn't have one.
+    _optional_components = ["safety_checker"]
+    model_cpu_offload_seq = "image_encoder->unet->vae"
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        image_encoder: CLIPVisionModelWithProjection,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            image_encoder=image_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeddings = self.image_encoder(image).image_embeds
+        image_embeddings = image_embeddings.unsqueeze(1)
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = image_embeddings.shape
+        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
+        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = torch.zeros_like(image_embeddings)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
+
+        return image_embeddings
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, image, height, width, callback_steps):
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
+                Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
+                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+
+        Examples:
+
+        ```py
+        from diffusers import StableDiffusionImageVariationPipeline
+        from PIL import Image
+        from io import BytesIO
+        import requests
+
+        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
+            "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
+        )
+        pipe = pipe.to("cuda")
+
+        url = "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200"
+
+        response = requests.get(url)
+        image = Image.open(BytesIO(response.content)).convert("RGB")
+
+        out = pipe(image, num_images_per_prompt=3, guidance_scale=15)
+        out["images"][0].save("result.jpg")
+        ```
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(image, height, width, callback_steps)
+
+        # 2. Define call parameters
+        if isinstance(image, PIL.Image.Image):
+            batch_size = 1
+        elif isinstance(image, list):
+            batch_size = len(image)
+        else:
+            batch_size = image.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input image
+        image_embeddings = self._encode_image(image, device, num_images_per_prompt, do_classifier_free_guidance)
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            image_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        self.maybe_free_model_hooks()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..424f0e3c56e2c1a4e73bc4e31dc34cdf4c4f78e9
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -0,0 +1,1145 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...configuration_utils import FrozenDict
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from io import BytesIO
+
+        >>> from diffusers import StableDiffusionImg2ImgPipeline
+
+        >>> device = "cuda"
+        >>> model_id_or_path = "runwayml/stable-diffusion-v1-5"
+        >>> pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
+        >>> pipe = pipe.to(device)
+
+        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+        >>> response = requests.get(url)
+        >>> init_image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> init_image = init_image.resize((768, 512))
+
+        >>> prompt = "A fantasy landscape, trending on artstation"
+
+        >>> images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+        >>> images[0].save("fantasy_landscape.png")
+        ```
+"""
+
+
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+def preprocess(image):
+    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
+    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionImg2ImgPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-guided image-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        strength,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                    raise ValueError(
+                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: int = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            strength,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 5. set timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 7.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2c5b11d34cfd39fb62b438022525f5ce41c8066
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -0,0 +1,1338 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...configuration_utils import FrozenDict
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AsymmetricAutoencoderKL, AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionInpaintPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-guided image inpainting using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`, `AsymmetricAutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "mask", "masked_image_latents"]
+
+    def __init__(
+        self,
+        vae: Union[AutoencoderKL, AsymmetricAutoencoderKL],
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration"
+                " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
+                " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
+                " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
+                " Hub, it would be very nice if you could open a Pull request for the"
+                " `scheduler/scheduler_config.json` file"
+            )
+            deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["skip_prk_steps"] = True
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        # Check shapes, assume num_channels_latents == 4, num_channels_mask == 1, num_channels_masked == 4
+        if unet.config.in_channels != 9:
+            logger.info(f"You have loaded a UNet with {unet.config.in_channels} input channels which.")
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        mask_image,
+        height,
+        width,
+        strength,
+        callback_steps,
+        output_type,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        padding_mask_crop=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if padding_mask_crop is not None:
+            if not isinstance(image, PIL.Image.Image):
+                raise ValueError(
+                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                )
+            if not isinstance(mask_image, PIL.Image.Image):
+                raise ValueError(
+                    f"The mask image should be a PIL image when inpainting mask crop, but is of type"
+                    f" {type(mask_image)}."
+                )
+            if output_type != "pil":
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 4:
+                image_latents = image
+            else:
+                image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+            # if pure noise then scale the initial latents by the  Scheduler's init sigma
+            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+        else:
+            noise = latents.to(device)
+            latents = noise * self.scheduler.init_noise_sigma
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        if masked_image.shape[1] == 4:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+        return mask, masked_image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        masked_image_latents: torch.Tensor = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        padding_mask_crop: Optional[int] = None,
+        strength: float = 1.0,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: int = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to
+                be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch
+                tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the
+                expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the
+                expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but
+                if passing latents directly it is not encoded again.
+            mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
+                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
+                color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B,
+                H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
+                1)`, or `(H, W)`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            padding_mask_crop (`int`, *optional*, defaults to `None`):
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+        Examples:
+
+        ```py
+        >>> import PIL
+        >>> import requests
+        >>> import torch
+        >>> from io import BytesIO
+
+        >>> from diffusers import StableDiffusionInpaintPipeline
+
+
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+        >>> init_image = download_image(img_url).resize((512, 512))
+        >>> mask_image = download_image(mask_url).resize((512, 512))
+
+        >>> pipe = StableDiffusionInpaintPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+        >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            image,
+            mask_image,
+            height,
+            width,
+            strength,
+            callback_steps,
+            output_type,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+            padding_mask_crop,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. set timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps=num_inference_steps, strength=strength, device=device
+        )
+        # check that number of inference steps is not < 1 - as this doesn't make sense
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+
+        # 5. Preprocess mask and image
+
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 7. Prepare mask latent variables
+        mask_condition = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+
+        if masked_image_latents is None:
+            masked_image = init_image * (mask_condition < 0.5)
+        else:
+            masked_image = masked_image_latents
+
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask_condition,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            self.do_classifier_free_guidance,
+        )
+
+        # 8. Check that sizes of mask, masked image and latents match
+        if num_channels_unet == 9:
+            # default case for runwayml/stable-diffusion-inpainting
+            num_channels_mask = mask.shape[1]
+            num_channels_masked_image = masked_image_latents.shape[1]
+            if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+                raise ValueError(
+                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    " `pipeline.unet` or your `mask_image` or `image` input."
+                )
+        elif num_channels_unet != 4:
+            raise ValueError(
+                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+            )
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 9.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 10. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if num_channels_unet == 9:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if num_channels_unet == 4:
+                    init_latents_proper = image_latents
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    mask = callback_outputs.pop("mask", mask)
+                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            condition_kwargs = {}
+            if isinstance(self.vae, AsymmetricAutoencoderKL):
+                init_image = init_image.to(device=device, dtype=masked_image_latents.dtype)
+                init_image_condition = init_image.clone()
+                init_image = self._encode_vae_image(init_image, generator=generator)
+                mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
+                condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
+            image = self.vae.decode(
+                latents / self.vae.config.scaling_factor, return_dict=False, generator=generator, **condition_kwargs
+            )[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if padding_mask_crop is not None:
+            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd89b195c77885aed4831d67bfe8a8383e0593ec
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -0,0 +1,906 @@
+# Copyright 2024 The InstructPix2Pix Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
+def preprocess(image):
+    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
+    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class StableDiffusionInstructPix2PixPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+):
+    r"""
+    Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion).
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "image_latents"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        num_inference_steps: int = 100,
+        guidance_scale: float = 7.5,
+        image_guidance_scale: float = 1.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept
+                image latents as `image`, but if passing latents directly it is not encoded again.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            image_guidance_scale (`float`, *optional*, defaults to 1.5):
+                Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
+                `image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
+                linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
+                value of at least `1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        ```py
+        >>> import PIL
+        >>> import requests
+        >>> import torch
+        >>> from io import BytesIO
+
+        >>> from diffusers import StableDiffusionInstructPix2PixPipeline
+
+
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+        >>> img_url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
+
+        >>> image = download_image(img_url).resize((512, 512))
+
+        >>> pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+        ...     "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "make the mountains snowy"
+        >>> image = pipe(prompt=prompt, image=image).images[0]
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Check inputs
+        self.check_inputs(
+            prompt,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+        self._guidance_scale = guidance_scale
+        self._image_guidance_scale = image_guidance_scale
+
+        device = self._execution_device
+
+        if image is None:
+            raise ValueError("`image` input cannot be undefined.")
+
+        # 1. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 2. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+        # 3. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 4. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare Image latents
+        image_latents = self.prepare_image_latents(
+            image,
+            batch_size,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            self.do_classifier_free_guidance,
+        )
+
+        height, width = image_latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Check that shapes of latents and image match the UNet channels
+        num_channels_image = image_latents.shape[1]
+        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_image`: {num_channels_image} "
+                f" = {num_channels_latents+num_channels_image}. Please verify the config of"
+                " `pipeline.unet` or your `image` input."
+            )
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Expand the latents if we are doing classifier free guidance.
+                # The latents are expanded 3 times because for pix2pix the guidance\
+                # is applied for both the text and the input image.
+                latent_model_input = torch.cat([latents] * 3) if self.do_classifier_free_guidance else latents
+
+                # concat latents, image_latents in the channel dimension
+                scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                scaled_latent_model_input = torch.cat([scaled_latent_model_input, image_latents], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    scaled_latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    added_cond_kwargs=added_cond_kwargs,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_text, noise_pred_image, noise_pred_uncond = noise_pred.chunk(3)
+                    noise_pred = (
+                        noise_pred_uncond
+                        + self.guidance_scale * (noise_pred_text - noise_pred_image)
+                        + self.image_guidance_scale * (noise_pred_image - noise_pred_uncond)
+                    )
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    image_latents = callback_outputs.pop("image_latents", image_latents)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+             prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_ prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+            prompt_embeds = prompt_embeds[0]
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        else:
+            prompt_embeds_dtype = self.unet.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
+            prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if do_classifier_free_guidance:
+                    single_image_embeds = torch.cat(
+                        [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+                    )
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            repeat_dims = [1]
+            image_embeds = []
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    (
+                        single_image_embeds,
+                        single_negative_image_embeds,
+                        single_negative_image_embeds,
+                    ) = single_image_embeds.chunk(3)
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                    )
+                    single_image_embeds = torch.cat(
+                        [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+                    )
+                else:
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                image_embeds.append(single_image_embeds)
+
+        return image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def check_inputs(
+        self,
+        prompt,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_image_latents(
+        self, image, batch_size, num_images_per_prompt, dtype, device, do_classifier_free_guidance, generator=None
+    ):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            image_latents = image
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax")
+
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand image_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_image_latents = torch.zeros_like(image_latents)
+            image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0)
+
+        return image_latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def image_guidance_scale(self):
+        return self._image_guidance_scale
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self.guidance_scale > 1.0 and self.image_guidance_scale >= 1.0
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffe02ae679e5d60f52ff6d46cbd983488096285d
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
@@ -0,0 +1,655 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...schedulers import EulerDiscreteScheduler
+from ...utils import deprecate, logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.preprocess
+def preprocess(image):
+    warnings.warn(
+        "The preprocess method is deprecated and will be removed in a future version. Please"
+        " use VaeImageProcessor.preprocess instead",
+        FutureWarning,
+    )
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 64 for x in (w, h))  # resize to integer multiple of 64
+
+        image = [np.array(i.resize((w, h)))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin):
+    r"""
+    Pipeline for upscaling Stable Diffusion output image resolution by a factor of 2.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A [`EulerDiscreteScheduler`] to be used in combination with `unet` to denoise the encoded image latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: EulerDiscreteScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic")
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            **kwargs,
+        )
+
+        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+        pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds])
+
+        return prompt_embeds, pooled_prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None or pooled_prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_length=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            text_encoder_out = self.text_encoder(
+                text_input_ids.to(device),
+                output_hidden_states=True,
+            )
+            prompt_embeds = text_encoder_out.hidden_states[-1]
+            pooled_prompt_embeds = text_encoder_out.pooler_output
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            if negative_prompt_embeds is None or negative_pooled_prompt_embeds is None:
+                uncond_tokens: List[str]
+                if negative_prompt is None:
+                    uncond_tokens = [""] * batch_size
+                elif type(prompt) is not type(negative_prompt):
+                    raise TypeError(
+                        f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                        f" {type(prompt)}."
+                    )
+                elif isinstance(negative_prompt, str):
+                    uncond_tokens = [negative_prompt]
+                elif batch_size != len(negative_prompt):
+                    raise ValueError(
+                        f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                        f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                        " the batch size of `prompt`."
+                    )
+                else:
+                    uncond_tokens = negative_prompt
+
+                max_length = text_input_ids.shape[-1]
+                uncond_input = self.tokenizer(
+                    uncond_tokens,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_length=True,
+                    return_tensors="pt",
+                )
+
+                uncond_encoder_out = self.text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+
+                negative_prompt_embeds = uncond_encoder_out.hidden_states[-1]
+                negative_pooled_prompt_embeds = uncond_encoder_out.pooler_output
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+    ):
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, np.ndarray)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}"
+            )
+
+        # verify batch size of prompt and image are same if image is a list or tensor
+        if isinstance(image, (list, torch.Tensor)):
+            if prompt is not None:
+                if isinstance(prompt, str):
+                    batch_size = 1
+                else:
+                    batch_size = len(prompt)
+            else:
+                batch_size = prompt_embeds.shape[0]
+
+            if isinstance(image, list):
+                image_batch_size = len(image)
+            else:
+                image_batch_size = image.shape[0] if image.ndim == 4 else 1
+            if batch_size != image_batch_size:
+                raise ValueError(
+                    f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
+                    " Please make sure that passed `prompt` matches the batch size of `image`."
+                )
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height, width)
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        num_inference_steps: int = 75,
+        guidance_scale: float = 9.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image upscaling.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image` or tensor representing an image batch to be upscaled. If it's a tensor, it can be either a
+                latent output from a Stable Diffusion model or an image tensor in the range `[-1, 1]`. It is considered
+                a `latent` if `image.shape[1]` is `4`; otherwise, it is considered to be an image representation and
+                encoded using this pipeline's `vae` encoder.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Examples:
+        ```py
+        >>> from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline
+        >>> import torch
+
+
+        >>> pipeline = StableDiffusionPipeline.from_pretrained(
+        ...     "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
+        ... )
+        >>> pipeline.to("cuda")
+
+        >>> model_id = "stabilityai/sd-x2-latent-upscaler"
+        >>> upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+        >>> upscaler.to("cuda")
+
+        >>> prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic"
+        >>> generator = torch.manual_seed(33)
+
+        >>> low_res_latents = pipeline(prompt, generator=generator, output_type="latent").images
+
+        >>> with torch.no_grad():
+        ...     image = pipeline.decode_latents(low_res_latents)
+        >>> image = pipeline.numpy_to_pil(image)[0]
+
+        >>> image.save("../images/a1.png")
+
+        >>> upscaled_image = upscaler(
+        ...     prompt=prompt,
+        ...     image=low_res_latents,
+        ...     num_inference_steps=20,
+        ...     guidance_scale=0,
+        ...     generator=generator,
+        ... ).images[0]
+
+        >>> upscaled_image.save("../images/a2.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images.
+        """
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            image,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None:
+            batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if guidance_scale == 0:
+            prompt = [""] * batch_size
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt,
+            device,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds])
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image)
+        image = image.to(dtype=prompt_embeds.dtype, device=device)
+        if image.shape[1] == 3:
+            # encode image if not in latent-space yet
+            image = retrieve_latents(self.vae.encode(image), generator=generator) * self.vae.config.scaling_factor
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        batch_multiplier = 2 if do_classifier_free_guidance else 1
+        image = image[None, :] if image.ndim == 3 else image
+        image = torch.cat([image] * batch_multiplier)
+
+        # 5. Add noise to image (set to be 0):
+        # (see below notes from the author):
+        # "the This step theoretically can make the model work better on out-of-distribution inputs, but mostly just seems to make it match the input less, so it's turned off by default."
+        noise_level = torch.tensor([0.0], dtype=torch.float32, device=device)
+        noise_level = torch.cat([noise_level] * image.shape[0])
+        inv_noise_level = (noise_level**2 + 1) ** (-0.5)
+
+        image_cond = F.interpolate(image, scale_factor=2, mode="nearest") * inv_noise_level[:, None, None, None]
+        image_cond = image_cond.to(prompt_embeds.dtype)
+
+        noise_level_embed = torch.cat(
+            [
+                torch.ones(pooled_prompt_embeds.shape[0], 64, dtype=pooled_prompt_embeds.dtype, device=device),
+                torch.zeros(pooled_prompt_embeds.shape[0], 64, dtype=pooled_prompt_embeds.dtype, device=device),
+            ],
+            dim=1,
+        )
+
+        timestep_condition = torch.cat([noise_level_embed, pooled_prompt_embeds], dim=1)
+
+        # 6. Prepare latent variables
+        height, width = image.shape[2:]
+        num_channels_latents = self.vae.config.latent_channels
+        latents = self.prepare_latents(
+            batch_size,
+            num_channels_latents,
+            height * 2,  # 2x upscale
+            width * 2,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Check that sizes of image and latents match
+        num_channels_image = image.shape[1]
+        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_image`: {num_channels_image} "
+                f" = {num_channels_latents+num_channels_image}. Please verify the config of"
+                " `pipeline.unet` or your `image` input."
+            )
+
+        # 9. Denoising loop
+        num_warmup_steps = 0
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                sigma = self.scheduler.sigmas[i]
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                scaled_model_input = torch.cat([scaled_model_input, image_cond], dim=1)
+                # preconditioning parameter based on  Karras et al. (2022) (table 1)
+                timestep = torch.log(sigma) * 0.25
+
+                noise_pred = self.unet(
+                    scaled_model_input,
+                    timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_condition,
+                ).sample
+
+                # in original repo, the output contains a variance channel that's not used
+                noise_pred = noise_pred[:, :-1]
+
+                # apply preconditioning, based on table 1 in Karras et al. (2022)
+                inv_sigma = 1 / (sigma**2 + 1)
+                noise_pred = inv_sigma * latent_model_input + self.scheduler.scale_model_input(sigma, t) * noise_pred
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cbbe17531ef7565a99ff2ce201193b3068443c8
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -0,0 +1,809 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
+from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def preprocess(image):
+    warnings.warn(
+        "The preprocess method is deprecated and will be removed in a future version. Please"
+        " use VaeImageProcessor.preprocess instead",
+        FutureWarning,
+    )
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 64 for x in (w, h))  # resize to integer multiple of 64
+
+        image = [np.array(i.resize((w, h)))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+class StableDiffusionUpscalePipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-guided image super-resolution using Stable Diffusion 2.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        low_res_scheduler ([`SchedulerMixin`]):
+            A scheduler used to add initial noise to the low resolution conditioning image. It must be an instance of
+            [`DDPMScheduler`].
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["watermarker", "safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        low_res_scheduler: DDPMScheduler,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: Optional[Any] = None,
+        feature_extractor: Optional[CLIPImageProcessor] = None,
+        watermarker: Optional[Any] = None,
+        max_noise_level: int = 350,
+    ):
+        super().__init__()
+
+        if hasattr(
+            vae, "config"
+        ):  # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate
+            is_vae_scaling_factor_set_to_0_08333 = (
+                hasattr(vae.config, "scaling_factor") and vae.config.scaling_factor == 0.08333
+            )
+            if not is_vae_scaling_factor_set_to_0_08333:
+                deprecation_message = (
+                    "The configuration file of the vae does not contain `scaling_factor` or it is set to"
+                    f" {vae.config.scaling_factor}, which seems highly unlikely. If your checkpoint is a fine-tuned"
+                    " version of `stabilityai/stable-diffusion-x4-upscaler` you should change 'scaling_factor' to"
+                    " 0.08333 Please make sure to update the config accordingly, as not doing so might lead to"
+                    " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging"
+                    " Face Hub, it would be very nice if you could open a Pull Request for the `vae/config.json` file"
+                )
+                deprecate("wrong scaling_factor", "1.0.0", deprecation_message, standard_warn=False)
+                vae.register_to_config(scaling_factor=0.08333)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            watermarker=watermarker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic")
+        self.register_to_config(max_noise_level=max_noise_level)
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, nsfw_detected, watermark_detected = self.safety_checker(
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(dtype=dtype),
+            )
+        else:
+            nsfw_detected = None
+            watermark_detected = None
+
+            if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None:
+                self.unet_offload_hook.offload()
+
+        return image, nsfw_detected, watermark_detected
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        noise_level,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, np.ndarray)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `np.ndarray`, `PIL.Image.Image` or `list` but is {type(image)}"
+            )
+
+        # verify batch size of prompt and image are same if image is a list or tensor or numpy array
+        if isinstance(image, (list, np.ndarray, torch.Tensor)):
+            if prompt is not None and isinstance(prompt, str):
+                batch_size = 1
+            elif prompt is not None and isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                batch_size = prompt_embeds.shape[0]
+
+            if isinstance(image, list):
+                image_batch_size = len(image)
+            else:
+                image_batch_size = image.shape[0]
+            if batch_size != image_batch_size:
+                raise ValueError(
+                    f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
+                    " Please make sure that passed `prompt` matches the batch size of `image`."
+                )
+
+        # check noise level
+        if noise_level > self.config.max_noise_level:
+            raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height, width)
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        num_inference_steps: int = 75,
+        guidance_scale: float = 9.0,
+        noise_level: int = 20,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: int = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image` or tensor representing an image batch to be upscaled.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Examples:
+        ```py
+        >>> import requests
+        >>> from PIL import Image
+        >>> from io import BytesIO
+        >>> from diffusers import StableDiffusionUpscalePipeline
+        >>> import torch
+
+        >>> # load model and scheduler
+        >>> model_id = "stabilityai/stable-diffusion-x4-upscaler"
+        >>> pipeline = StableDiffusionUpscalePipeline.from_pretrained(
+        ...     model_id, variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipeline = pipeline.to("cuda")
+
+        >>> # let's download an  image
+        >>> url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png"
+        >>> response = requests.get(url)
+        >>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> low_res_img = low_res_img.resize((128, 128))
+        >>> prompt = "a white cat"
+
+        >>> upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0]
+        >>> upscaled_image.save("upsampled_cat.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            image,
+            noise_level,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        if image is None:
+            raise ValueError("`image` input cannot be undefined.")
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image)
+        image = image.to(dtype=prompt_embeds.dtype, device=device)
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Add noise to image
+        noise_level = torch.tensor([noise_level], dtype=torch.long, device=device)
+        noise = randn_tensor(image.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
+        image = self.low_res_scheduler.add_noise(image, noise, noise_level)
+
+        batch_multiplier = 2 if do_classifier_free_guidance else 1
+        image = torch.cat([image] * batch_multiplier * num_images_per_prompt)
+        noise_level = torch.cat([noise_level] * image.shape[0])
+
+        # 6. Prepare latent variables
+        height, width = image.shape[2:]
+        num_channels_latents = self.vae.config.latent_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Check that sizes of image and latents match
+        num_channels_image = image.shape[1]
+        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_image`: {num_channels_image} "
+                f" = {num_channels_latents+num_channels_image}. Please verify the config of"
+                " `pipeline.unet` or your `image` input."
+            )
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                latent_model_input = torch.cat([latent_model_input, image], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=noise_level,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+
+            # Ensure latents are always the same type as the VAE
+            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+
+            image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # 11. Apply watermark
+        if output_type == "pil" and self.watermarker is not None:
+            image = self.watermarker.apply_watermark(image)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
new file mode 100644
index 0000000000000000000000000000000000000000..41811f8f2c0ee5ee538c14a174f9e42143ede5e6
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
@@ -0,0 +1,940 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+from transformers.models.clip.modeling_clip import CLIPTextModelOutput
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel
+from ...models.embeddings import get_timestep_embedding
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin
+from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableUnCLIPPipeline
+
+        >>> pipe = StableUnCLIPPipeline.from_pretrained(
+        ...     "fusing/stable-unclip-2-1-l", torch_dtype=torch.float16
+        ... )  # TODO update model path
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> images = pipe(prompt).images
+        >>> images[0].save("astronaut_horse.png")
+        ```
+"""
+
+
+class StableUnCLIPPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
+    """
+    Pipeline for text-to-image generation using stable unCLIP.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    Args:
+        prior_tokenizer ([`CLIPTokenizer`]):
+            A [`CLIPTokenizer`].
+        prior_text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen [`CLIPTextModelWithProjection`] text-encoder.
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        prior_scheduler ([`KarrasDiffusionSchedulers`]):
+            Scheduler used in the prior denoising process.
+        image_normalizer ([`StableUnCLIPImageNormalizer`]):
+            Used to normalize the predicted image embeddings before the noise is applied and un-normalize the image
+            embeddings after the noise has been applied.
+        image_noising_scheduler ([`KarrasDiffusionSchedulers`]):
+            Noise schedule for adding noise to the predicted image embeddings. The amount of noise to add is determined
+            by the `noise_level`.
+        tokenizer ([`CLIPTokenizer`]):
+            A [`CLIPTokenizer`].
+        text_encoder ([`CLIPTextModel`]):
+            Frozen [`CLIPTextModel`] text-encoder.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] to denoise the encoded image latents.
+        scheduler ([`KarrasDiffusionSchedulers`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+    """
+
+    _exclude_from_cpu_offload = ["prior", "image_normalizer"]
+    model_cpu_offload_seq = "text_encoder->prior_text_encoder->unet->vae"
+
+    # prior components
+    prior_tokenizer: CLIPTokenizer
+    prior_text_encoder: CLIPTextModelWithProjection
+    prior: PriorTransformer
+    prior_scheduler: KarrasDiffusionSchedulers
+
+    # image noising components
+    image_normalizer: StableUnCLIPImageNormalizer
+    image_noising_scheduler: KarrasDiffusionSchedulers
+
+    # regular denoising components
+    tokenizer: CLIPTokenizer
+    text_encoder: CLIPTextModel
+    unet: UNet2DConditionModel
+    scheduler: KarrasDiffusionSchedulers
+
+    vae: AutoencoderKL
+
+    def __init__(
+        self,
+        # prior components
+        prior_tokenizer: CLIPTokenizer,
+        prior_text_encoder: CLIPTextModelWithProjection,
+        prior: PriorTransformer,
+        prior_scheduler: KarrasDiffusionSchedulers,
+        # image noising components
+        image_normalizer: StableUnCLIPImageNormalizer,
+        image_noising_scheduler: KarrasDiffusionSchedulers,
+        # regular denoising components
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModelWithProjection,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        # vae
+        vae: AutoencoderKL,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            prior_tokenizer=prior_tokenizer,
+            prior_text_encoder=prior_text_encoder,
+            prior=prior,
+            prior_scheduler=prior_scheduler,
+            image_normalizer=image_normalizer,
+            image_noising_scheduler=image_noising_scheduler,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder
+    def _encode_prior_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
+        text_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        if text_model_output is None:
+            batch_size = len(prompt) if isinstance(prompt, list) else 1
+            # get prompt text embeddings
+            text_inputs = self.prior_tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.prior_tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            text_mask = text_inputs.attention_mask.bool().to(device)
+
+            untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.prior_tokenizer.batch_decode(
+                    untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}"
+                )
+                text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length]
+
+            prior_text_encoder_output = self.prior_text_encoder(text_input_ids.to(device))
+
+            prompt_embeds = prior_text_encoder_output.text_embeds
+            text_enc_hid_states = prior_text_encoder_output.last_hidden_state
+
+        else:
+            batch_size = text_model_output[0].shape[0]
+            prompt_embeds, text_enc_hid_states = text_model_output[0], text_model_output[1]
+            text_mask = text_attention_mask
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_enc_hid_states = text_enc_hid_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens = [""] * batch_size
+
+            uncond_input = self.prior_tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.prior_tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            negative_prompt_embeds_prior_text_encoder_output = self.prior_text_encoder(
+                uncond_input.input_ids.to(device)
+            )
+
+            negative_prompt_embeds = negative_prompt_embeds_prior_text_encoder_output.text_embeds
+            uncond_text_enc_hid_states = negative_prompt_embeds_prior_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_enc_hid_states.shape[1]
+            uncond_text_enc_hid_states = uncond_text_enc_hid_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_enc_hid_states = uncond_text_enc_hid_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_enc_hid_states = torch.cat([uncond_text_enc_hid_states, text_enc_hid_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_enc_hid_states, text_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs with prepare_extra_step_kwargs->prepare_prior_extra_step_kwargs, scheduler->prior_scheduler
+    def prepare_prior_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the prior_scheduler step, since not all prior_schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other prior_schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.prior_scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the prior_scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.prior_scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        noise_level,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Please make sure to define only one of the two."
+            )
+
+        if prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+
+        if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                "Provide either `negative_prompt` or `negative_prompt_embeds`. Cannot leave both `negative_prompt` and `negative_prompt_embeds` undefined."
+            )
+
+        if prompt is not None and negative_prompt is not None:
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps:
+            raise ValueError(
+                f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive."
+            )
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def noise_image_embeddings(
+        self,
+        image_embeds: torch.Tensor,
+        noise_level: int,
+        noise: Optional[torch.Tensor] = None,
+        generator: Optional[torch.Generator] = None,
+    ):
+        """
+        Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher
+        `noise_level` increases the variance in the final un-noised images.
+
+        The noise is applied in two ways:
+        1. A noise schedule is applied directly to the embeddings.
+        2. A vector of sinusoidal time embeddings are appended to the output.
+
+        In both cases, the amount of noise is controlled by the same `noise_level`.
+
+        The embeddings are normalized before the noise is applied and un-normalized after the noise is applied.
+        """
+        if noise is None:
+            noise = randn_tensor(
+                image_embeds.shape, generator=generator, device=image_embeds.device, dtype=image_embeds.dtype
+            )
+
+        noise_level = torch.tensor([noise_level] * image_embeds.shape[0], device=image_embeds.device)
+
+        self.image_normalizer.to(image_embeds.device)
+        image_embeds = self.image_normalizer.scale(image_embeds)
+
+        image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise)
+
+        image_embeds = self.image_normalizer.unscale(image_embeds)
+
+        noise_level = get_timestep_embedding(
+            timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, downscale_freq_shift=0
+        )
+
+        # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors,
+        # but we might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        noise_level = noise_level.to(image_embeds.dtype)
+
+        image_embeds = torch.cat((image_embeds, noise_level), 1)
+
+        return image_embeds
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        # regular denoising process args
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 20,
+        guidance_scale: float = 10.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        noise_level: int = 0,
+        # prior args
+        prior_num_inference_steps: int = 25,
+        prior_guidance_scale: float = 4.0,
+        prior_latents: Optional[torch.Tensor] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 20):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 10.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            noise_level (`int`, *optional*, defaults to `0`):
+                The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in
+                the final un-noised images. See [`StableUnCLIPPipeline.noise_image_embeddings`] for more details.
+            prior_num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps in the prior denoising process. More denoising steps usually lead to a
+                higher quality image at the expense of slower inference.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            prior_latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                embedding generation in the prior denoising process. Can be used to tweak the same generation with
+                different prompts. If not provided, a latents tensor is generated by sampling using the supplied random
+                `generator`.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                [`~ pipeline_utils.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning
+                a tuple, the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            height=height,
+            width=width,
+            callback_steps=callback_steps,
+            noise_level=noise_level,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        batch_size = batch_size * num_images_per_prompt
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        prior_do_classifier_free_guidance = prior_guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prior_prompt_embeds, prior_text_encoder_hidden_states, prior_text_mask = self._encode_prior_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=prior_do_classifier_free_guidance,
+        )
+
+        # 4. Prepare prior timesteps
+        self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device)
+        prior_timesteps_tensor = self.prior_scheduler.timesteps
+
+        # 5. Prepare prior latent variables
+        embedding_dim = self.prior.config.embedding_dim
+        prior_latents = self.prepare_latents(
+            (batch_size, embedding_dim),
+            prior_prompt_embeds.dtype,
+            device,
+            generator,
+            prior_latents,
+            self.prior_scheduler,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        prior_extra_step_kwargs = self.prepare_prior_extra_step_kwargs(generator, eta)
+
+        # 7. Prior denoising loop
+        for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([prior_latents] * 2) if prior_do_classifier_free_guidance else prior_latents
+            latent_model_input = self.prior_scheduler.scale_model_input(latent_model_input, t)
+
+            predicted_image_embedding = self.prior(
+                latent_model_input,
+                timestep=t,
+                proj_embedding=prior_prompt_embeds,
+                encoder_hidden_states=prior_text_encoder_hidden_states,
+                attention_mask=prior_text_mask,
+            ).predicted_image_embedding
+
+            if prior_do_classifier_free_guidance:
+                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * (
+                    predicted_image_embedding_text - predicted_image_embedding_uncond
+                )
+
+            prior_latents = self.prior_scheduler.step(
+                predicted_image_embedding,
+                timestep=t,
+                sample=prior_latents,
+                **prior_extra_step_kwargs,
+                return_dict=False,
+            )[0]
+
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, prior_latents)
+
+        prior_latents = self.prior.post_process_latents(prior_latents)
+
+        image_embeds = prior_latents
+
+        # done prior
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 8. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 9. Prepare image embeddings
+        image_embeds = self.noise_image_embeddings(
+            image_embeds=image_embeds,
+            noise_level=noise_level,
+            generator=generator,
+        )
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = torch.zeros_like(image_embeds)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeds = torch.cat([negative_prompt_embeds, image_embeds])
+
+        # 10. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 11. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        latents = self.prepare_latents(
+            shape=shape,
+            dtype=prompt_embeds.dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+            scheduler=self.scheduler,
+        )
+
+        # 12. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 13. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=prompt_embeds,
+                class_labels=image_embeds,
+                cross_attention_kwargs=cross_attention_kwargs,
+                return_dict=False,
+            )[0]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..2556d5e57b6de1fa726708c86eaefe4efdc90f37
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
@@ -0,0 +1,846 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.embeddings import get_timestep_embedding
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin
+from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from io import BytesIO
+
+        >>> from diffusers import StableUnCLIPImg2ImgPipeline
+
+        >>> pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-2-1-unclip-small", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+        >>> response = requests.get(url)
+        >>> init_image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> init_image = init_image.resize((768, 512))
+
+        >>> prompt = "A fantasy landscape, trending on artstation"
+
+        >>> images = pipe(init_image, prompt).images
+        >>> images[0].save("fantasy_landscape.png")
+        ```
+"""
+
+
+class StableUnCLIPImg2ImgPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
+    """
+    Pipeline for text-guided image-to-image generation using stable unCLIP.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    Args:
+        feature_extractor ([`CLIPImageProcessor`]):
+            Feature extractor for image pre-processing before being encoded.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            CLIP vision model for encoding images.
+        image_normalizer ([`StableUnCLIPImageNormalizer`]):
+            Used to normalize the predicted image embeddings before the noise is applied and un-normalize the image
+            embeddings after the noise has been applied.
+        image_noising_scheduler ([`KarrasDiffusionSchedulers`]):
+            Noise schedule for adding noise to the predicted image embeddings. The amount of noise to add is determined
+            by the `noise_level`.
+        tokenizer (`~transformers.CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`)].
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen [`~transformers.CLIPTextModel`] text-encoder.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet2DConditionModel`] to denoise the encoded image latents.
+        scheduler ([`KarrasDiffusionSchedulers`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _exclude_from_cpu_offload = ["image_normalizer"]
+
+    # image encoding components
+    feature_extractor: CLIPImageProcessor
+    image_encoder: CLIPVisionModelWithProjection
+
+    # image noising components
+    image_normalizer: StableUnCLIPImageNormalizer
+    image_noising_scheduler: KarrasDiffusionSchedulers
+
+    # regular denoising components
+    tokenizer: CLIPTokenizer
+    text_encoder: CLIPTextModel
+    unet: UNet2DConditionModel
+    scheduler: KarrasDiffusionSchedulers
+
+    vae: AutoencoderKL
+
+    def __init__(
+        self,
+        # image encoding components
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection,
+        # image noising components
+        image_normalizer: StableUnCLIPImageNormalizer,
+        image_noising_scheduler: KarrasDiffusionSchedulers,
+        # regular denoising components
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModel,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        # vae
+        vae: AutoencoderKL,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            image_normalizer=image_normalizer,
+            image_noising_scheduler=image_noising_scheduler,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    def _encode_image(
+        self,
+        image,
+        device,
+        batch_size,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        noise_level,
+        generator,
+        image_embeds,
+    ):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if isinstance(image, PIL.Image.Image):
+            # the image embedding should repeated so it matches the total batch size of the prompt
+            repeat_by = batch_size
+        else:
+            # assume the image input is already properly batched and just needs to be repeated so
+            # it matches the num_images_per_prompt.
+            #
+            # NOTE(will) this is probably missing a few number of side cases. I.e. batched/non-batched
+            # `image_embeds`. If those happen to be common use cases, let's think harder about
+            # what the expected dimensions of inputs should be and how we handle the encoding.
+            repeat_by = num_images_per_prompt
+
+        if image_embeds is None:
+            if not isinstance(image, torch.Tensor):
+                image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
+
+            image = image.to(device=device, dtype=dtype)
+            image_embeds = self.image_encoder(image).image_embeds
+
+        image_embeds = self.noise_image_embeddings(
+            image_embeds=image_embeds,
+            noise_level=noise_level,
+            generator=generator,
+        )
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        image_embeds = image_embeds.unsqueeze(1)
+        bs_embed, seq_len, _ = image_embeds.shape
+        image_embeds = image_embeds.repeat(1, repeat_by, 1)
+        image_embeds = image_embeds.view(bs_embed * repeat_by, seq_len, -1)
+        image_embeds = image_embeds.squeeze(1)
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = torch.zeros_like(image_embeds)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeds = torch.cat([negative_prompt_embeds, image_embeds])
+
+        return image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        height,
+        width,
+        callback_steps,
+        noise_level,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        image_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Please make sure to define only one of the two."
+            )
+
+        if prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+
+        if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                "Provide either `negative_prompt` or `negative_prompt_embeds`. Cannot leave both `negative_prompt` and `negative_prompt_embeds` undefined."
+            )
+
+        if prompt is not None and negative_prompt is not None:
+            if type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps:
+            raise ValueError(
+                f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive."
+            )
+
+        if image is not None and image_embeds is not None:
+            raise ValueError(
+                "Provide either `image` or `image_embeds`. Please make sure to define only one of the two."
+            )
+
+        if image is None and image_embeds is None:
+            raise ValueError(
+                "Provide either `image` or `image_embeds`. Cannot leave both `image` and `image_embeds` undefined."
+            )
+
+        if image is not None:
+            if (
+                not isinstance(image, torch.Tensor)
+                and not isinstance(image, PIL.Image.Image)
+                and not isinstance(image, list)
+            ):
+                raise ValueError(
+                    "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                    f" {type(image)}"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_unclip.StableUnCLIPPipeline.noise_image_embeddings
+    def noise_image_embeddings(
+        self,
+        image_embeds: torch.Tensor,
+        noise_level: int,
+        noise: Optional[torch.Tensor] = None,
+        generator: Optional[torch.Generator] = None,
+    ):
+        """
+        Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher
+        `noise_level` increases the variance in the final un-noised images.
+
+        The noise is applied in two ways:
+        1. A noise schedule is applied directly to the embeddings.
+        2. A vector of sinusoidal time embeddings are appended to the output.
+
+        In both cases, the amount of noise is controlled by the same `noise_level`.
+
+        The embeddings are normalized before the noise is applied and un-normalized after the noise is applied.
+        """
+        if noise is None:
+            noise = randn_tensor(
+                image_embeds.shape, generator=generator, device=image_embeds.device, dtype=image_embeds.dtype
+            )
+
+        noise_level = torch.tensor([noise_level] * image_embeds.shape[0], device=image_embeds.device)
+
+        self.image_normalizer.to(image_embeds.device)
+        image_embeds = self.image_normalizer.scale(image_embeds)
+
+        image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise)
+
+        image_embeds = self.image_normalizer.unscale(image_embeds)
+
+        noise_level = get_timestep_embedding(
+            timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, downscale_freq_shift=0
+        )
+
+        # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors,
+        # but we might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        noise_level = noise_level.to(image_embeds.dtype)
+
+        image_embeds = torch.cat((image_embeds, noise_level), 1)
+
+        return image_embeds
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 20,
+        guidance_scale: float = 10,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        noise_level: int = 0,
+        image_embeds: Optional[torch.Tensor] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, either `prompt_embeds` will be
+                used or prompt is initialized to `""`.
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image` or tensor representing an image batch. The image is encoded to its CLIP embedding which the
+                `unet` is conditioned on. The image is _not_ encoded by the `vae` and then used as the latents in the
+                denoising process like it is in the standard Stable Diffusion text-guided image variation process.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 20):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 10.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            noise_level (`int`, *optional*, defaults to `0`):
+                The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in
+                the final un-noised images. See [`StableUnCLIPPipeline.noise_image_embeddings`] for more details.
+            image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated CLIP embeddings to condition the `unet` on. These latents are not used in the denoising
+                process. If you want to provide pre-generated latents, pass them to `__call__` as `latents`.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                [`~ pipeline_utils.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning
+                a tuple, the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        if prompt is None and prompt_embeds is None:
+            prompt = len(image) * [""] if isinstance(image, list) else ""
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            image=image,
+            height=height,
+            width=width,
+            callback_steps=callback_steps,
+            noise_level=noise_level,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            image_embeds=image_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        batch_size = batch_size * num_images_per_prompt
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Encoder input image
+        noise_level = torch.tensor([noise_level], device=device)
+        image_embeds = self._encode_image(
+            image=image,
+            device=device,
+            batch_size=batch_size,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            noise_level=noise_level,
+            generator=generator,
+            image_embeds=image_embeds,
+        )
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        if latents is None:
+            latents = self.prepare_latents(
+                batch_size=batch_size,
+                num_channels_latents=num_channels_latents,
+                height=height,
+                width=width,
+                dtype=prompt_embeds.dtype,
+                device=device,
+                generator=generator,
+                latents=latents,
+            )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=prompt_embeds,
+                class_labels=image_embeds,
+                cross_attention_kwargs=cross_attention_kwargs,
+                return_dict=False,
+            )[0]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        # 9. Post-processing
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        else:
+            image = latents
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/diffusers/src/diffusers/pipelines/stable_diffusion/safety_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a0e86409e4a008b8cc3e91c8541aa8ad0fadb5a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/safety_checker.py
@@ -0,0 +1,126 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import CLIPConfig, CLIPVisionModel, PreTrainedModel
+
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def cosine_distance(image_embeds, text_embeds):
+    normalized_image_embeds = nn.functional.normalize(image_embeds)
+    normalized_text_embeds = nn.functional.normalize(text_embeds)
+    return torch.mm(normalized_image_embeds, normalized_text_embeds.t())
+
+
+class StableDiffusionSafetyChecker(PreTrainedModel):
+    config_class = CLIPConfig
+    main_input_name = "clip_input"
+
+    _no_split_modules = ["CLIPEncoderLayer"]
+
+    def __init__(self, config: CLIPConfig):
+        super().__init__(config)
+
+        self.vision_model = CLIPVisionModel(config.vision_config)
+        self.visual_projection = nn.Linear(config.vision_config.hidden_size, config.projection_dim, bias=False)
+
+        self.concept_embeds = nn.Parameter(torch.ones(17, config.projection_dim), requires_grad=False)
+        self.special_care_embeds = nn.Parameter(torch.ones(3, config.projection_dim), requires_grad=False)
+
+        self.concept_embeds_weights = nn.Parameter(torch.ones(17), requires_grad=False)
+        self.special_care_embeds_weights = nn.Parameter(torch.ones(3), requires_grad=False)
+
+    @torch.no_grad()
+    def forward(self, clip_input, images):
+        pooled_output = self.vision_model(clip_input)[1]  # pooled_output
+        image_embeds = self.visual_projection(pooled_output)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).cpu().float().numpy()
+        cos_dist = cosine_distance(image_embeds, self.concept_embeds).cpu().float().numpy()
+
+        result = []
+        batch_size = image_embeds.shape[0]
+        for i in range(batch_size):
+            result_img = {"special_scores": {}, "special_care": [], "concept_scores": {}, "bad_concepts": []}
+
+            # increase this value to create a stronger `nfsw` filter
+            # at the cost of increasing the possibility of filtering benign images
+            adjustment = 0.0
+
+            for concept_idx in range(len(special_cos_dist[0])):
+                concept_cos = special_cos_dist[i][concept_idx]
+                concept_threshold = self.special_care_embeds_weights[concept_idx].item()
+                result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
+                if result_img["special_scores"][concept_idx] > 0:
+                    result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]})
+                    adjustment = 0.01
+
+            for concept_idx in range(len(cos_dist[0])):
+                concept_cos = cos_dist[i][concept_idx]
+                concept_threshold = self.concept_embeds_weights[concept_idx].item()
+                result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
+                if result_img["concept_scores"][concept_idx] > 0:
+                    result_img["bad_concepts"].append(concept_idx)
+
+            result.append(result_img)
+
+        has_nsfw_concepts = [len(res["bad_concepts"]) > 0 for res in result]
+
+        for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
+            if has_nsfw_concept:
+                if torch.is_tensor(images) or torch.is_tensor(images[0]):
+                    images[idx] = torch.zeros_like(images[idx])  # black image
+                else:
+                    images[idx] = np.zeros(images[idx].shape)  # black image
+
+        if any(has_nsfw_concepts):
+            logger.warning(
+                "Potential NSFW content was detected in one or more images. A black image will be returned instead."
+                " Try again with a different prompt and/or seed."
+            )
+
+        return images, has_nsfw_concepts
+
+    @torch.no_grad()
+    def forward_onnx(self, clip_input: torch.Tensor, images: torch.Tensor):
+        pooled_output = self.vision_model(clip_input)[1]  # pooled_output
+        image_embeds = self.visual_projection(pooled_output)
+
+        special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds)
+        cos_dist = cosine_distance(image_embeds, self.concept_embeds)
+
+        # increase this value to create a stronger `nsfw` filter
+        # at the cost of increasing the possibility of filtering benign images
+        adjustment = 0.0
+
+        special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment
+        # special_scores = special_scores.round(decimals=3)
+        special_care = torch.any(special_scores > 0, dim=1)
+        special_adjustment = special_care * 0.01
+        special_adjustment = special_adjustment.unsqueeze(1).expand(-1, cos_dist.shape[1])
+
+        concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment
+        # concept_scores = concept_scores.round(decimals=3)
+        has_nsfw_concepts = torch.any(concept_scores > 0, dim=1)
+
+        images[has_nsfw_concepts] = 0.0  # black image
+
+        return images, has_nsfw_concepts
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py b/diffusers/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..571a4f2d771015784926bcbf240d1f367b4fc293
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
@@ -0,0 +1,112 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple
+
+import jax
+import jax.numpy as jnp
+from flax import linen as nn
+from flax.core.frozen_dict import FrozenDict
+from transformers import CLIPConfig, FlaxPreTrainedModel
+from transformers.models.clip.modeling_flax_clip import FlaxCLIPVisionModule
+
+
+def jax_cosine_distance(emb_1, emb_2, eps=1e-12):
+    norm_emb_1 = jnp.divide(emb_1.T, jnp.clip(jnp.linalg.norm(emb_1, axis=1), a_min=eps)).T
+    norm_emb_2 = jnp.divide(emb_2.T, jnp.clip(jnp.linalg.norm(emb_2, axis=1), a_min=eps)).T
+    return jnp.matmul(norm_emb_1, norm_emb_2.T)
+
+
+class FlaxStableDiffusionSafetyCheckerModule(nn.Module):
+    config: CLIPConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.vision_model = FlaxCLIPVisionModule(self.config.vision_config)
+        self.visual_projection = nn.Dense(self.config.projection_dim, use_bias=False, dtype=self.dtype)
+
+        self.concept_embeds = self.param("concept_embeds", jax.nn.initializers.ones, (17, self.config.projection_dim))
+        self.special_care_embeds = self.param(
+            "special_care_embeds", jax.nn.initializers.ones, (3, self.config.projection_dim)
+        )
+
+        self.concept_embeds_weights = self.param("concept_embeds_weights", jax.nn.initializers.ones, (17,))
+        self.special_care_embeds_weights = self.param("special_care_embeds_weights", jax.nn.initializers.ones, (3,))
+
+    def __call__(self, clip_input):
+        pooled_output = self.vision_model(clip_input)[1]
+        image_embeds = self.visual_projection(pooled_output)
+
+        special_cos_dist = jax_cosine_distance(image_embeds, self.special_care_embeds)
+        cos_dist = jax_cosine_distance(image_embeds, self.concept_embeds)
+
+        # increase this value to create a stronger `nfsw` filter
+        # at the cost of increasing the possibility of filtering benign image inputs
+        adjustment = 0.0
+
+        special_scores = special_cos_dist - self.special_care_embeds_weights[None, :] + adjustment
+        special_scores = jnp.round(special_scores, 3)
+        is_special_care = jnp.any(special_scores > 0, axis=1, keepdims=True)
+        # Use a lower threshold if an image has any special care concept
+        special_adjustment = is_special_care * 0.01
+
+        concept_scores = cos_dist - self.concept_embeds_weights[None, :] + special_adjustment
+        concept_scores = jnp.round(concept_scores, 3)
+        has_nsfw_concepts = jnp.any(concept_scores > 0, axis=1)
+
+        return has_nsfw_concepts
+
+
+class FlaxStableDiffusionSafetyChecker(FlaxPreTrainedModel):
+    config_class = CLIPConfig
+    main_input_name = "clip_input"
+    module_class = FlaxStableDiffusionSafetyCheckerModule
+
+    def __init__(
+        self,
+        config: CLIPConfig,
+        input_shape: Optional[Tuple] = None,
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        if input_shape is None:
+            input_shape = (1, 224, 224, 3)
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.Array, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensor
+        clip_input = jax.random.normal(rng, input_shape)
+
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        random_params = self.module.init(rngs, clip_input)["params"]
+
+        return random_params
+
+    def __call__(
+        self,
+        clip_input,
+        params: dict = None,
+    ):
+        clip_input = jnp.transpose(clip_input, (0, 2, 3, 1))
+
+        return self.module.apply(
+            {"params": params or self.params},
+            jnp.array(clip_input, dtype=jnp.float32),
+            rngs={},
+        )
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py b/diffusers/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fc6b3a3f8b0d1012a13482052adfc61a2682aba
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py
@@ -0,0 +1,57 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models.modeling_utils import ModelMixin
+
+
+class StableUnCLIPImageNormalizer(ModelMixin, ConfigMixin):
+    """
+    This class is used to hold the mean and standard deviation of the CLIP embedder used in stable unCLIP.
+
+    It is used to normalize the image embeddings before the noise is applied and un-normalize the noised image
+    embeddings.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        embedding_dim: int = 768,
+    ):
+        super().__init__()
+
+        self.mean = nn.Parameter(torch.zeros(1, embedding_dim))
+        self.std = nn.Parameter(torch.ones(1, embedding_dim))
+
+    def to(
+        self,
+        torch_device: Optional[Union[str, torch.device]] = None,
+        torch_dtype: Optional[torch.dtype] = None,
+    ):
+        self.mean = nn.Parameter(self.mean.to(torch_device).to(torch_dtype))
+        self.std = nn.Parameter(self.std.to(torch_device).to(torch_dtype))
+        return self
+
+    def scale(self, embeds):
+        embeds = (embeds - self.mean) * 1.0 / self.std
+        return embeds
+
+    def unscale(self, embeds):
+        embeds = (embeds * self.std) + self.mean
+        return embeds
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_3/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion_3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0604589a208d0c5c1270629dcad144395b750cb
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_3/__init__.py
@@ -0,0 +1,54 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["StableDiffusion3PipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_3"] = ["StableDiffusion3Pipeline"]
+    _import_structure["pipeline_stable_diffusion_3_img2img"] = ["StableDiffusion3Img2ImgPipeline"]
+    _import_structure["pipeline_stable_diffusion_3_inpaint"] = ["StableDiffusion3InpaintPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_stable_diffusion_3 import StableDiffusion3Pipeline
+        from .pipeline_stable_diffusion_3_img2img import StableDiffusion3Img2ImgPipeline
+        from .pipeline_stable_diffusion_3_inpaint import StableDiffusion3InpaintPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_output.py b/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..4655f446102a7f2db93743abdba9acc038270862
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_output.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class StableDiffusion3PipelineOutput(BaseOutput):
+    """
+    Output class for Stable Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a10f329a0af804858ff9410cc1e59d2be38d859
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
@@ -0,0 +1,935 @@
+# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.transformers import SD3Transformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import StableDiffusion3PipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusion3Pipeline
+
+        >>> pipe = StableDiffusion3Pipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+        >>> prompt = "A cat holding a sign that says hello world"
+        >>> image = pipe(prompt).images[0]
+        >>> image.save("sd3.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    Args:
+        transformer ([`SD3Transformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
+            with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
+            as its dimension.
+        text_encoder_2 ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        text_encoder_3 ([`T5EncoderModel`]):
+            Frozen text-encoder. Stable Diffusion 3 uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_3 (`T5TokenizerFast`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
+
+    def __init__(
+        self,
+        transformer: SD3Transformer2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer_2: CLIPTokenizer,
+        text_encoder_3: T5EncoderModel,
+        tokenizer_3: T5TokenizerFast,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            text_encoder_3=text_encoder_3,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            tokenizer_3=tokenizer_3,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )
+
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 256,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        if self.text_encoder_3 is None:
+            return torch.zeros(
+                (
+                    batch_size * num_images_per_prompt,
+                    self.tokenizer_max_length,
+                    self.transformer.config.joint_attention_dim,
+                ),
+                device=device,
+                dtype=dtype,
+            )
+
+        text_inputs = self.tokenizer_3(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
+
+        dtype = self.text_encoder_3.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        clip_skip: Optional[int] = None,
+        clip_model_index: int = 0,
+    ):
+        device = device or self._execution_device
+
+        clip_tokenizers = [self.tokenizer, self.tokenizer_2]
+        clip_text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = clip_tokenizers[clip_model_index]
+        text_encoder = clip_text_encoders[clip_model_index]
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+        pooled_prompt_embeds = prompt_embeds[0]
+
+        if clip_skip is None:
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+        else:
+            prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds, pooled_prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        prompt_3: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        clip_skip: Optional[int] = None,
+        max_sequence_length: int = 256,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            prompt_3 = prompt_3 or prompt
+            prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3
+
+            prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=0,
+            )
+            prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                prompt=prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=1,
+            )
+            clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)
+
+            t5_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            clip_prompt_embeds = torch.nn.functional.pad(
+                clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+            )
+
+            prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+            pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+            negative_prompt_3 = negative_prompt_3 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+            negative_prompt_3 = (
+                batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3
+            )
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds(
+                negative_prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=0,
+            )
+            negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                negative_prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=1,
+            )
+            negative_clip_prompt_embeds = torch.cat([negative_prompt_embed, negative_prompt_2_embed], dim=-1)
+
+            t5_negative_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=negative_prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            negative_clip_prompt_embeds = torch.nn.functional.pad(
+                negative_clip_prompt_embeds,
+                (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]),
+            )
+
+            negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2)
+            negative_pooled_prompt_embeds = torch.cat(
+                [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        prompt_3,
+        height,
+        width,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        negative_prompt_3=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_3 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_3`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        elif prompt_3 is not None and (not isinstance(prompt_3, str) and not isinstance(prompt_3, list)):
+            raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_3 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        prompt_3: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 28,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                will be used instead
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used instead
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used instead
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            prompt_3,
+            height,
+            width,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_3=prompt_3,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            device=device,
+            clip_skip=self.clip_skip,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusion3PipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py b/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..96d53663b8672ac5a02e9dbd0e0b6bd6151b7c05
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py
@@ -0,0 +1,967 @@
+# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, Dict, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import SD3LoraLoaderMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.transformers import SD3Transformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import StableDiffusion3PipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+
+        >>> from diffusers import AutoPipelineForImage2Image
+        >>> from diffusers.utils import load_image
+
+        >>> device = "cuda"
+        >>> model_id_or_path = "stabilityai/stable-diffusion-3-medium-diffusers"
+        >>> pipe = AutoPipelineForImage2Image.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
+        >>> pipe = pipe.to(device)
+
+        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+        >>> init_image = load_image(url).resize((1024, 1024))
+
+        >>> prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
+
+        >>> images = pipe(prompt=prompt, image=init_image, strength=0.95, guidance_scale=7.5).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
+    r"""
+    Args:
+        transformer ([`SD3Transformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
+            with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
+            as its dimension.
+        text_encoder_2 ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        text_encoder_3 ([`T5EncoderModel`]):
+            Frozen text-encoder. Stable Diffusion 3 uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_3 (`T5TokenizerFast`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
+
+    def __init__(
+        self,
+        transformer: SD3Transformer2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer_2: CLIPTokenizer,
+        text_encoder_3: T5EncoderModel,
+        tokenizer_3: T5TokenizerFast,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            text_encoder_3=text_encoder_3,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            tokenizer_3=tokenizer_3,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, vae_latent_channels=self.vae.config.latent_channels
+        )
+        self.tokenizer_max_length = self.tokenizer.model_max_length
+        self.default_sample_size = self.transformer.config.sample_size
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 256,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        if self.text_encoder_3 is None:
+            return torch.zeros(
+                (
+                    batch_size * num_images_per_prompt,
+                    self.tokenizer_max_length,
+                    self.transformer.config.joint_attention_dim,
+                ),
+                device=device,
+                dtype=dtype,
+            )
+
+        text_inputs = self.tokenizer_3(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
+
+        dtype = self.text_encoder_3.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        clip_skip: Optional[int] = None,
+        clip_model_index: int = 0,
+    ):
+        device = device or self._execution_device
+
+        clip_tokenizers = [self.tokenizer, self.tokenizer_2]
+        clip_text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = clip_tokenizers[clip_model_index]
+        text_encoder = clip_text_encoders[clip_model_index]
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+        pooled_prompt_embeds = prompt_embeds[0]
+
+        if clip_skip is None:
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+        else:
+            prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds, pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        prompt_3: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        clip_skip: Optional[int] = None,
+        max_sequence_length: int = 256,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            prompt_3 = prompt_3 or prompt
+            prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3
+
+            prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=0,
+            )
+            prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                prompt=prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=1,
+            )
+            clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)
+
+            t5_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            clip_prompt_embeds = torch.nn.functional.pad(
+                clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+            )
+
+            prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+            pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+            negative_prompt_3 = negative_prompt_3 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+            negative_prompt_3 = (
+                batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3
+            )
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds(
+                negative_prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=0,
+            )
+            negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                negative_prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=1,
+            )
+            negative_clip_prompt_embeds = torch.cat([negative_prompt_embed, negative_prompt_2_embed], dim=-1)
+
+            t5_negative_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=negative_prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            negative_clip_prompt_embeds = torch.nn.functional.pad(
+                negative_clip_prompt_embeds,
+                (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]),
+            )
+
+            negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2)
+            negative_pooled_prompt_embeds = torch.cat(
+                [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        prompt_3,
+        strength,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        negative_prompt_3=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_3 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_3`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        elif prompt_3 is not None and (not isinstance(prompt_3, str) and not isinstance(prompt_3, list)):
+            raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_3 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+        if image.shape[1] == self.vae.config.latent_channels:
+            init_latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            init_latents = (init_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.scale_noise(init_latents, timestep, noise)
+        latents = init_latents.to(device=device, dtype=dtype)
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        prompt_3: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.6,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                will be used instead
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used instead
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used instead
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            prompt_3,
+            strength,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_3=prompt_3,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            device=device,
+            clip_skip=self.clip_skip,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+        )
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+        # 3. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 5. Prepare latent variables
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                device,
+                generator,
+            )
+
+        # 6. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusion3PipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py b/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..440b6529c9cafb5b832f7a76e27312366151054c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py
@@ -0,0 +1,1201 @@
+# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.transformers import SD3Transformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import StableDiffusion3PipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusion3InpaintPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = StableDiffusion3InpaintPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+        >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+        >>> source = load_image(img_url)
+        >>> mask = load_image(mask_url)
+        >>> image = pipe(prompt=prompt, image=source, mask_image=mask).images[0]
+        >>> image.save("sd3_inpainting.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    Args:
+        transformer ([`SD3Transformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
+            with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
+            as its dimension.
+        text_encoder_2 ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        text_encoder_3 ([`T5EncoderModel`]):
+            Frozen text-encoder. Stable Diffusion 3 uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_3 (`T5TokenizerFast`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
+
+    def __init__(
+        self,
+        transformer: SD3Transformer2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer_2: CLIPTokenizer,
+        text_encoder_3: T5EncoderModel,
+        tokenizer_3: T5TokenizerFast,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            text_encoder_3=text_encoder_3,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            tokenizer_3=tokenizer_3,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, vae_latent_channels=self.vae.config.latent_channels
+        )
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor,
+            vae_latent_channels=self.vae.config.latent_channels,
+            do_normalize=False,
+            do_binarize=True,
+            do_convert_grayscale=True,
+        )
+        self.tokenizer_max_length = self.tokenizer.model_max_length
+        self.default_sample_size = self.transformer.config.sample_size
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 256,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        if self.text_encoder_3 is None:
+            return torch.zeros(
+                (
+                    batch_size * num_images_per_prompt,
+                    self.tokenizer_max_length,
+                    self.transformer.config.joint_attention_dim,
+                ),
+                device=device,
+                dtype=dtype,
+            )
+
+        text_inputs = self.tokenizer_3(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
+
+        dtype = self.text_encoder_3.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        clip_skip: Optional[int] = None,
+        clip_model_index: int = 0,
+    ):
+        device = device or self._execution_device
+
+        clip_tokenizers = [self.tokenizer, self.tokenizer_2]
+        clip_text_encoders = [self.text_encoder, self.text_encoder_2]
+
+        tokenizer = clip_tokenizers[clip_model_index]
+        text_encoder = clip_text_encoders[clip_model_index]
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+        pooled_prompt_embeds = prompt_embeds[0]
+
+        if clip_skip is None:
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+        else:
+            prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds, pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        prompt_3: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        clip_skip: Optional[int] = None,
+        max_sequence_length: int = 256,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            prompt_3 = prompt_3 or prompt
+            prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3
+
+            prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=0,
+            )
+            prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                prompt=prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=1,
+            )
+            clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)
+
+            t5_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            clip_prompt_embeds = torch.nn.functional.pad(
+                clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+            )
+
+            prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+            pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+            negative_prompt_3 = negative_prompt_3 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+            negative_prompt_3 = (
+                batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3
+            )
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds(
+                negative_prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=0,
+            )
+            negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                negative_prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=1,
+            )
+            negative_clip_prompt_embeds = torch.cat([negative_prompt_embed, negative_prompt_2_embed], dim=-1)
+
+            t5_negative_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=negative_prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+            negative_clip_prompt_embeds = torch.nn.functional.pad(
+                negative_clip_prompt_embeds,
+                (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]),
+            )
+
+            negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2)
+            negative_pooled_prompt_embeds = torch.cat(
+                [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        prompt_3,
+        strength,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        negative_prompt_3=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_3 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_3`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        elif prompt_3 is not None and (not isinstance(prompt_3, str) and not isinstance(prompt_3, list)):
+            raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_3 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 16:
+                image_latents = image
+            else:
+                image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.scale_noise(image_latents, timestep, noise)
+        else:
+            noise = latents.to(device)
+            latents = noise
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        return image_latents
+
+    def prepare_mask_latents(
+        self,
+        mask,
+        masked_image,
+        batch_size,
+        num_images_per_prompt,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        do_classifier_free_guidance,
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        if masked_image.shape[1] == 16:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = retrieve_latents(self.vae.encode(masked_image), generator=generator)
+
+        masked_image_latents = (masked_image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+        return mask, masked_image_latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        prompt_3: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        masked_image_latents: PipelineImageInput = None,
+        height: int = None,
+        width: int = None,
+        padding_mask_crop: Optional[int] = None,
+        strength: float = 0.6,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                will be used instead
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                will be used instead
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
+                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
+                color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B,
+                H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
+                1)`, or `(H, W)`.
+            mask_image_latent (`torch.Tensor`, `List[torch.Tensor]`):
+                `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask
+                latents tensor will ge generated by `mask_image`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            padding_mask_crop (`int`, *optional*, defaults to `None`):
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used instead
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used instead
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        width = width or self.transformer.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            prompt_3,
+            strength,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_3=prompt_3,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            device=device,
+            clip_skip=self.clip_skip,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+        )
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+        # 3. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        # check that number of inference steps is not < 1 - as this doesn't make sense
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+
+        # 4. Preprocess mask and image
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_transformer = self.transformer.config.in_channels
+        return_image_latents = num_channels_transformer == 16
+
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 6. Prepare mask latent variables
+        mask_condition = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+
+        if masked_image_latents is None:
+            masked_image = init_image * (mask_condition < 0.5)
+        else:
+            masked_image = masked_image_latents
+
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask_condition,
+            masked_image,
+            batch_size,
+            num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            self.do_classifier_free_guidance,
+        )
+
+        # match the inpainting pipeline and will be updated with input + mask inpainting model later
+        if num_channels_transformer == 33:
+            # default case for runwayml/stable-diffusion-inpainting
+            num_channels_mask = mask.shape[1]
+            num_channels_masked_image = masked_image_latents.shape[1]
+            if (
+                num_channels_latents + num_channels_mask + num_channels_masked_image
+                != self.transformer.config.in_channels
+            ):
+                raise ValueError(
+                    f"Incorrect configuration settings! The config of `pipeline.transformer`: {self.transformer.config} expects"
+                    f" {self.transformer.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    " `pipeline.transformer` or your `mask_image` or `image` input."
+                )
+        elif num_channels_transformer != 16:
+            raise ValueError(
+                f"The transformer {self.transformer.__class__} should have 16 input channels or 33 input channels, not {self.transformer.config.in_channels}."
+            )
+
+        # 7. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                if num_channels_transformer == 33:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                if num_channels_transformer == 16:
+                    init_latents_proper = image_latents
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.scale_noise(
+                            init_latents_proper, torch.tensor([noise_timestep]), noise
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    mask = callback_outputs.pop("mask", mask)
+                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+        else:
+            image = latents
+
+        do_denormalize = [True] * image.shape[0]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if padding_mask_crop is not None:
+            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusion3PipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cce556fceb2379be482d383e380a09836d25ce3b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py b/diffusers/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f40fa72a25c85b6f71398dfe1e5055b0b34d400
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py
@@ -0,0 +1,1097 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch.nn import functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention_processor import Attention
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionAttendAndExcitePipeline
+
+        >>> pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(
+        ...     "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
+        ... ).to("cuda")
+
+
+        >>> prompt = "a cat and a frog"
+
+        >>> # use get_indices function to find out indices of the tokens you want to alter
+        >>> pipe.get_indices(prompt)
+        {0: '<|startoftext|>', 1: 'a</w>', 2: 'cat</w>', 3: 'and</w>', 4: 'a</w>', 5: 'frog</w>', 6: '<|endoftext|>'}
+
+        >>> token_indices = [2, 5]
+        >>> seed = 6141
+        >>> generator = torch.Generator("cuda").manual_seed(seed)
+
+        >>> images = pipe(
+        ...     prompt=prompt,
+        ...     token_indices=token_indices,
+        ...     guidance_scale=7.5,
+        ...     generator=generator,
+        ...     num_inference_steps=50,
+        ...     max_iter_to_alter=25,
+        ... ).images
+
+        >>> image = images[0]
+        >>> image.save(f"../images/{prompt}_{seed}.png")
+        ```
+"""
+
+
+class AttentionStore:
+    @staticmethod
+    def get_empty_store():
+        return {"down": [], "mid": [], "up": []}
+
+    def __call__(self, attn, is_cross: bool, place_in_unet: str):
+        if self.cur_att_layer >= 0 and is_cross:
+            if attn.shape[1] == np.prod(self.attn_res):
+                self.step_store[place_in_unet].append(attn)
+
+        self.cur_att_layer += 1
+        if self.cur_att_layer == self.num_att_layers:
+            self.cur_att_layer = 0
+            self.between_steps()
+
+    def between_steps(self):
+        self.attention_store = self.step_store
+        self.step_store = self.get_empty_store()
+
+    def get_average_attention(self):
+        average_attention = self.attention_store
+        return average_attention
+
+    def aggregate_attention(self, from_where: List[str]) -> torch.Tensor:
+        """Aggregates the attention across the different layers and heads at the specified resolution."""
+        out = []
+        attention_maps = self.get_average_attention()
+        for location in from_where:
+            for item in attention_maps[location]:
+                cross_maps = item.reshape(-1, self.attn_res[0], self.attn_res[1], item.shape[-1])
+                out.append(cross_maps)
+        out = torch.cat(out, dim=0)
+        out = out.sum(0) / out.shape[0]
+        return out
+
+    def reset(self):
+        self.cur_att_layer = 0
+        self.step_store = self.get_empty_store()
+        self.attention_store = {}
+
+    def __init__(self, attn_res):
+        """
+        Initialize an empty AttentionStore :param step_index: used to visualize only a specific step in the diffusion
+        process
+        """
+        self.num_att_layers = -1
+        self.cur_att_layer = 0
+        self.step_store = self.get_empty_store()
+        self.attention_store = {}
+        self.curr_step_index = 0
+        self.attn_res = attn_res
+
+
+class AttendExciteAttnProcessor:
+    def __init__(self, attnstore, place_in_unet):
+        super().__init__()
+        self.attnstore = attnstore
+        self.place_in_unet = place_in_unet
+
+    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        query = attn.to_q(hidden_states)
+
+        is_cross = encoder_hidden_states is not None
+        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+
+        # only need to store attention maps during the Attend and Excite process
+        if attention_probs.requires_grad:
+            self.attnstore(attention_probs, is_cross, self.place_in_unet)
+
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion and Attend-and-Excite.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        indices,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        indices_is_list_ints = isinstance(indices, list) and isinstance(indices[0], int)
+        indices_is_list_list_ints = (
+            isinstance(indices, list) and isinstance(indices[0], list) and isinstance(indices[0][0], int)
+        )
+
+        if not indices_is_list_ints and not indices_is_list_list_ints:
+            raise TypeError("`indices` must be a list of ints or a list of a list of ints")
+
+        if indices_is_list_ints:
+            indices_batch_size = 1
+        elif indices_is_list_list_ints:
+            indices_batch_size = len(indices)
+
+        if prompt is not None and isinstance(prompt, str):
+            prompt_batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            prompt_batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            prompt_batch_size = prompt_embeds.shape[0]
+
+        if indices_batch_size != prompt_batch_size:
+            raise ValueError(
+                f"indices batch size must be same as prompt batch size. indices batch size: {indices_batch_size}, prompt batch size: {prompt_batch_size}"
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @staticmethod
+    def _compute_max_attention_per_index(
+        attention_maps: torch.Tensor,
+        indices: List[int],
+    ) -> List[torch.Tensor]:
+        """Computes the maximum attention value for each of the tokens we wish to alter."""
+        attention_for_text = attention_maps[:, :, 1:-1]
+        attention_for_text *= 100
+        attention_for_text = torch.nn.functional.softmax(attention_for_text, dim=-1)
+
+        # Shift indices since we removed the first token
+        indices = [index - 1 for index in indices]
+
+        # Extract the maximum values
+        max_indices_list = []
+        for i in indices:
+            image = attention_for_text[:, :, i]
+            smoothing = GaussianSmoothing().to(attention_maps.device)
+            input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode="reflect")
+            image = smoothing(input).squeeze(0).squeeze(0)
+            max_indices_list.append(image.max())
+        return max_indices_list
+
+    def _aggregate_and_get_max_attention_per_token(
+        self,
+        indices: List[int],
+    ):
+        """Aggregates the attention for each token and computes the max activation value for each token to alter."""
+        attention_maps = self.attention_store.aggregate_attention(
+            from_where=("up", "down", "mid"),
+        )
+        max_attention_per_index = self._compute_max_attention_per_index(
+            attention_maps=attention_maps,
+            indices=indices,
+        )
+        return max_attention_per_index
+
+    @staticmethod
+    def _compute_loss(max_attention_per_index: List[torch.Tensor]) -> torch.Tensor:
+        """Computes the attend-and-excite loss using the maximum attention value for each token."""
+        losses = [max(0, 1.0 - curr_max) for curr_max in max_attention_per_index]
+        loss = max(losses)
+        return loss
+
+    @staticmethod
+    def _update_latent(latents: torch.Tensor, loss: torch.Tensor, step_size: float) -> torch.Tensor:
+        """Update the latent according to the computed loss."""
+        grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents], retain_graph=True)[0]
+        latents = latents - step_size * grad_cond
+        return latents
+
+    def _perform_iterative_refinement_step(
+        self,
+        latents: torch.Tensor,
+        indices: List[int],
+        loss: torch.Tensor,
+        threshold: float,
+        text_embeddings: torch.Tensor,
+        step_size: float,
+        t: int,
+        max_refinement_steps: int = 20,
+    ):
+        """
+        Performs the iterative latent refinement introduced in the paper. Here, we continuously update the latent code
+        according to our loss objective until the given threshold is reached for all tokens.
+        """
+        iteration = 0
+        target_loss = max(0, 1.0 - threshold)
+        while loss > target_loss:
+            iteration += 1
+
+            latents = latents.clone().detach().requires_grad_(True)
+            self.unet(latents, t, encoder_hidden_states=text_embeddings).sample
+            self.unet.zero_grad()
+
+            # Get max activation value for each subject token
+            max_attention_per_index = self._aggregate_and_get_max_attention_per_token(
+                indices=indices,
+            )
+
+            loss = self._compute_loss(max_attention_per_index)
+
+            if loss != 0:
+                latents = self._update_latent(latents, loss, step_size)
+
+            logger.info(f"\t Try {iteration}. loss: {loss}")
+
+            if iteration >= max_refinement_steps:
+                logger.info(f"\t Exceeded max number of iterations ({max_refinement_steps})! ")
+                break
+
+        # Run one more time but don't compute gradients and update the latents.
+        # We just need to compute the new loss - the grad update will occur below
+        latents = latents.clone().detach().requires_grad_(True)
+        _ = self.unet(latents, t, encoder_hidden_states=text_embeddings).sample
+        self.unet.zero_grad()
+
+        # Get max activation value for each subject token
+        max_attention_per_index = self._aggregate_and_get_max_attention_per_token(
+            indices=indices,
+        )
+        loss = self._compute_loss(max_attention_per_index)
+        logger.info(f"\t Finished with loss of: {loss}")
+        return loss, latents, max_attention_per_index
+
+    def register_attention_control(self):
+        attn_procs = {}
+        cross_att_count = 0
+        for name in self.unet.attn_processors.keys():
+            if name.startswith("mid_block"):
+                place_in_unet = "mid"
+            elif name.startswith("up_blocks"):
+                place_in_unet = "up"
+            elif name.startswith("down_blocks"):
+                place_in_unet = "down"
+            else:
+                continue
+
+            cross_att_count += 1
+            attn_procs[name] = AttendExciteAttnProcessor(attnstore=self.attention_store, place_in_unet=place_in_unet)
+
+        self.unet.set_attn_processor(attn_procs)
+        self.attention_store.num_att_layers = cross_att_count
+
+    def get_indices(self, prompt: str) -> Dict[str, int]:
+        """Utility function to list the indices of the tokens you wish to alte"""
+        ids = self.tokenizer(prompt).input_ids
+        indices = {i: tok for tok, i in zip(self.tokenizer.convert_ids_to_tokens(ids), range(len(ids)))}
+        return indices
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        token_indices: Union[List[int], List[List[int]]],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: int = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        max_iter_to_alter: int = 25,
+        thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8},
+        scale_factor: int = 20,
+        attn_res: Optional[Tuple[int]] = (16, 16),
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            token_indices (`List[int]`):
+                The token indices to alter with attend-and-excite.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            max_iter_to_alter (`int`, *optional*, defaults to `25`):
+                Number of denoising steps to apply attend-and-excite. The `max_iter_to_alter` denoising steps are when
+                attend-and-excite is applied. For example, if `max_iter_to_alter` is `25` and there are a total of `30`
+                denoising steps, the first `25` denoising steps applies attend-and-excite and the last `5` will not.
+            thresholds (`dict`, *optional*, defaults to `{0: 0.05, 10: 0.5, 20: 0.8}`):
+                Dictionary defining the iterations and desired thresholds to apply iterative latent refinement in.
+            scale_factor (`int`, *optional*, default to 20):
+                Scale factor to control the step size of each attend-and-excite update.
+            attn_res (`tuple`, *optional*, default computed from width and height):
+                The 2D resolution of the semantic attention map.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            token_indices,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        if attn_res is None:
+            attn_res = int(np.ceil(width / 32)), int(np.ceil(height / 32))
+        self.attention_store = AttentionStore(attn_res)
+        original_attn_proc = self.unet.attn_processors
+        self.register_attention_control()
+
+        # default config for step size from original repo
+        scale_range = np.linspace(1.0, 0.5, len(self.scheduler.timesteps))
+        step_size = scale_factor * np.sqrt(scale_range)
+
+        text_embeddings = (
+            prompt_embeds[batch_size * num_images_per_prompt :] if do_classifier_free_guidance else prompt_embeds
+        )
+
+        if isinstance(token_indices[0], int):
+            token_indices = [token_indices]
+
+        indices = []
+
+        for ind in token_indices:
+            indices = indices + [ind] * num_images_per_prompt
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Attend and excite process
+                with torch.enable_grad():
+                    latents = latents.clone().detach().requires_grad_(True)
+                    updated_latents = []
+                    for latent, index, text_embedding in zip(latents, indices, text_embeddings):
+                        # Forward pass of denoising with text conditioning
+                        latent = latent.unsqueeze(0)
+                        text_embedding = text_embedding.unsqueeze(0)
+
+                        self.unet(
+                            latent,
+                            t,
+                            encoder_hidden_states=text_embedding,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                        ).sample
+                        self.unet.zero_grad()
+
+                        # Get max activation value for each subject token
+                        max_attention_per_index = self._aggregate_and_get_max_attention_per_token(
+                            indices=index,
+                        )
+
+                        loss = self._compute_loss(max_attention_per_index=max_attention_per_index)
+
+                        # If this is an iterative refinement step, verify we have reached the desired threshold for all
+                        if i in thresholds.keys() and loss > 1.0 - thresholds[i]:
+                            loss, latent, max_attention_per_index = self._perform_iterative_refinement_step(
+                                latents=latent,
+                                indices=index,
+                                loss=loss,
+                                threshold=thresholds[i],
+                                text_embeddings=text_embedding,
+                                step_size=step_size[i],
+                                t=t,
+                            )
+
+                        # Perform gradient update
+                        if i < max_iter_to_alter:
+                            if loss != 0:
+                                latent = self._update_latent(
+                                    latents=latent,
+                                    loss=loss,
+                                    step_size=step_size[i],
+                                )
+                            logger.info(f"Iteration {i} | Loss: {loss:0.4f}")
+
+                        updated_latents.append(latent)
+
+                    latents = torch.cat(updated_latents, dim=0)
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 8. Post-processing
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+        self.maybe_free_model_hooks()
+        # make sure to set the original attention processors back
+        self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+class GaussianSmoothing(torch.nn.Module):
+    """
+    Arguments:
+    Apply gaussian smoothing on a 1d, 2d or 3d tensor. Filtering is performed seperately for each channel in the input
+    using a depthwise convolution.
+        channels (int, sequence): Number of channels of the input tensors. Output will
+            have this number of channels as well.
+        kernel_size (int, sequence): Size of the gaussian kernel. sigma (float, sequence): Standard deviation of the
+        gaussian kernel. dim (int, optional): The number of dimensions of the data.
+            Default value is 2 (spatial).
+    """
+
+    # channels=1, kernel_size=kernel_size, sigma=sigma, dim=2
+    def __init__(
+        self,
+        channels: int = 1,
+        kernel_size: int = 3,
+        sigma: float = 0.5,
+        dim: int = 2,
+    ):
+        super().__init__()
+
+        if isinstance(kernel_size, int):
+            kernel_size = [kernel_size] * dim
+        if isinstance(sigma, float):
+            sigma = [sigma] * dim
+
+        # The gaussian kernel is the product of the
+        # gaussian function of each dimension.
+        kernel = 1
+        meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
+        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
+            mean = (size - 1) / 2
+            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
+
+        # Make sure sum of values in gaussian kernel equals 1.
+        kernel = kernel / torch.sum(kernel)
+
+        # Reshape to depthwise convolutional weight
+        kernel = kernel.view(1, 1, *kernel.size())
+        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))
+
+        self.register_buffer("weight", kernel)
+        self.groups = channels
+
+        if dim == 1:
+            self.conv = F.conv1d
+        elif dim == 2:
+            self.conv = F.conv2d
+        elif dim == 3:
+            self.conv = F.conv3d
+        else:
+            raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim))
+
+    def forward(self, input):
+        """
+        Arguments:
+        Apply gaussian filter to input.
+            input (torch.Tensor): Input to apply gaussian filter on.
+        Returns:
+            filtered (torch.Tensor): Filtered output.
+        """
+        return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2145edb96c6be124abf9e9a21b9a5e8a3f3d641
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_diffedit"] = ["StableDiffusionDiffEditPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/diffusers/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b86470dbff1ac943d314e713e3d23eba2fdd011
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py
@@ -0,0 +1,1531 @@
+# Copyright 2024 DiffEdit Authors and Pix2Pix Zero Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import DDIMInverseScheduler, KarrasDiffusionSchedulers
+from ...utils import (
+    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
+    BaseOutput,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class DiffEditInversionPipelineOutput(BaseOutput):
+    """
+    Output class for Stable Diffusion pipelines.
+
+    Args:
+        latents (`torch.Tensor`)
+            inverted latents tensor
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `num_timesteps * batch_size` or numpy array of shape `(num_timesteps,
+            batch_size, height, width, num_channels)`. PIL images or numpy array present the denoised images of the
+            diffusion pipeline.
+    """
+
+    latents: torch.Tensor
+    images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+EXAMPLE_DOC_STRING = """
+
+        ```py
+        >>> import PIL
+        >>> import requests
+        >>> import torch
+        >>> from io import BytesIO
+
+        >>> from diffusers import StableDiffusionDiffEditPipeline
+
+
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+        >>> img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png"
+
+        >>> init_image = download_image(img_url).resize((768, 768))
+
+        >>> pipeline = StableDiffusionDiffEditPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
+        ... )
+
+        >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+        >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
+        >>> pipeline.enable_model_cpu_offload()
+
+        >>> mask_prompt = "A bowl of fruits"
+        >>> prompt = "A bowl of pears"
+
+        >>> mask_image = pipeline.generate_mask(image=init_image, source_prompt=prompt, target_prompt=mask_prompt)
+        >>> image_latents = pipeline.invert(image=init_image, prompt=mask_prompt).latents
+        >>> image = pipeline(prompt=prompt, mask_image=mask_image, image_latents=image_latents).images[0]
+        ```
+"""
+
+EXAMPLE_INVERT_DOC_STRING = """
+        ```py
+        >>> import PIL
+        >>> import requests
+        >>> import torch
+        >>> from io import BytesIO
+
+        >>> from diffusers import StableDiffusionDiffEditPipeline
+
+
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+        >>> img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png"
+
+        >>> init_image = download_image(img_url).resize((768, 768))
+
+        >>> pipeline = StableDiffusionDiffEditPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
+        ... )
+
+        >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+        >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
+        >>> pipeline.enable_model_cpu_offload()
+
+        >>> prompt = "A bowl of fruits"
+
+        >>> inverted_latents = pipeline.invert(image=init_image, prompt=prompt).latents
+        ```
+"""
+
+
+def auto_corr_loss(hidden_states, generator=None):
+    reg_loss = 0.0
+    for i in range(hidden_states.shape[0]):
+        for j in range(hidden_states.shape[1]):
+            noise = hidden_states[i : i + 1, j : j + 1, :, :]
+            while True:
+                roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item()
+                reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2
+                reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2
+
+                if noise.shape[2] <= 8:
+                    break
+                noise = torch.nn.functional.avg_pool2d(noise, kernel_size=2)
+    return reg_loss
+
+
+def kl_divergence(hidden_states):
+    return hidden_states.var() + hidden_states.mean() ** 2 - 1 - torch.log(hidden_states.var() + 1e-7)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
+def preprocess(image):
+    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
+    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+
+
+def preprocess_mask(mask, batch_size: int = 1):
+    if not isinstance(mask, torch.Tensor):
+        # preprocess mask
+        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
+            mask = [mask]
+
+        if isinstance(mask, list):
+            if isinstance(mask[0], PIL.Image.Image):
+                mask = [np.array(m.convert("L")).astype(np.float32) / 255.0 for m in mask]
+            if isinstance(mask[0], np.ndarray):
+                mask = np.stack(mask, axis=0) if mask[0].ndim < 3 else np.concatenate(mask, axis=0)
+                mask = torch.from_numpy(mask)
+            elif isinstance(mask[0], torch.Tensor):
+                mask = torch.stack(mask, dim=0) if mask[0].ndim < 3 else torch.cat(mask, dim=0)
+
+    # Batch and add channel dim for single mask
+    if mask.ndim == 2:
+        mask = mask.unsqueeze(0).unsqueeze(0)
+
+    # Batch single mask or add channel dim
+    if mask.ndim == 3:
+        # Single batched mask, no channel dim or single mask not batched but channel dim
+        if mask.shape[0] == 1:
+            mask = mask.unsqueeze(0)
+
+        # Batched masks no channel dim
+        else:
+            mask = mask.unsqueeze(1)
+
+    # Check mask shape
+    if batch_size > 1:
+        if mask.shape[0] == 1:
+            mask = torch.cat([mask] * batch_size)
+        elif mask.shape[0] > 1 and mask.shape[0] != batch_size:
+            raise ValueError(
+                f"`mask_image` with batch size {mask.shape[0]} cannot be broadcasted to batch size {batch_size} "
+                f"inferred by prompt inputs"
+            )
+
+    if mask.shape[1] != 1:
+        raise ValueError(f"`mask_image` must have 1 channel, but has {mask.shape[1]} channels")
+
+    # Check mask is in [0, 1]
+    if mask.min() < 0 or mask.max() > 1:
+        raise ValueError("`mask_image` should be in [0, 1] range")
+
+    # Binarize mask
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
+
+    return mask
+
+
+class StableDiffusionDiffEditPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
+    r"""
+    <Tip warning={true}>
+
+    This is an experimental feature!
+
+    </Tip>
+
+    Pipeline for text-guided image inpainting using Stable Diffusion and DiffEdit.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading and saving methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+        inverse_scheduler ([`DDIMInverseScheduler`]):
+            A scheduler to be used in combination with `unet` to fill in the unmasked part of the input latents.
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "inverse_scheduler"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        inverse_scheduler: DDIMInverseScheduler,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration"
+                " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
+                " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
+                " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
+                " Hub, it would be very nice if you could open a Pull request for the"
+                " `scheduler/scheduler_config.json` file"
+            )
+            deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["skip_prk_steps"] = True
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            inverse_scheduler=inverse_scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def check_inputs(
+        self,
+        prompt,
+        strength,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (strength is None) or (strength is not None and (strength < 0 or strength > 1)):
+            raise ValueError(
+                f"The value of `strength` should in [0.0, 1.0] but is, but is {strength} of type {type(strength)}."
+            )
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def check_source_inputs(
+        self,
+        source_prompt=None,
+        source_negative_prompt=None,
+        source_prompt_embeds=None,
+        source_negative_prompt_embeds=None,
+    ):
+        if source_prompt is not None and source_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `source_prompt`: {source_prompt} and `source_prompt_embeds`: {source_prompt_embeds}."
+                "  Please make sure to only forward one of the two."
+            )
+        elif source_prompt is None and source_prompt_embeds is None:
+            raise ValueError(
+                "Provide either `source_image` or `source_prompt_embeds`. Cannot leave all both of the arguments undefined."
+            )
+        elif source_prompt is not None and (
+            not isinstance(source_prompt, str) and not isinstance(source_prompt, list)
+        ):
+            raise ValueError(f"`source_prompt` has to be of type `str` or `list` but is {type(source_prompt)}")
+
+        if source_negative_prompt is not None and source_negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `source_negative_prompt`: {source_negative_prompt} and `source_negative_prompt_embeds`:"
+                f" {source_negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if source_prompt_embeds is not None and source_negative_prompt_embeds is not None:
+            if source_prompt_embeds.shape != source_negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`source_prompt_embeds` and `source_negative_prompt_embeds` must have the same shape when passed"
+                    f" directly, but got: `source_prompt_embeds` {source_prompt_embeds.shape} !="
+                    f" `source_negative_prompt_embeds` {source_negative_prompt_embeds.shape}."
+                )
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    def get_inverse_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+
+        # safety for t_start overflow to prevent empty timsteps slice
+        if t_start == 0:
+            return self.inverse_scheduler.timesteps, num_inference_steps
+        timesteps = self.inverse_scheduler.timesteps[:-t_start]
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_image_latents(self, image, batch_size, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        if image.shape[1] == 4:
+            latents = image
+
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            if isinstance(generator, list):
+                latents = [
+                    self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+                ]
+                latents = torch.cat(latents, dim=0)
+            else:
+                latents = self.vae.encode(image).latent_dist.sample(generator)
+
+            latents = self.vae.config.scaling_factor * latents
+
+        if batch_size != latents.shape[0]:
+            if batch_size % latents.shape[0] == 0:
+                # expand image_latents for batch_size
+                deprecation_message = (
+                    f"You have passed {batch_size} text prompts (`prompt`), but only {latents.shape[0]} initial"
+                    " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                    " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                    " your script to pass as many initial images as text prompts to suppress this warning."
+                )
+                deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+                additional_latents_per_image = batch_size // latents.shape[0]
+                latents = torch.cat([latents] * additional_latents_per_image, dim=0)
+            else:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts."
+                )
+        else:
+            latents = torch.cat([latents], dim=0)
+
+        return latents
+
+    def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep: int):
+        pred_type = self.inverse_scheduler.config.prediction_type
+        alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep]
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        if pred_type == "epsilon":
+            return model_output
+        elif pred_type == "sample":
+            return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5)
+        elif pred_type == "v_prediction":
+            return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`"
+            )
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def generate_mask(
+        self,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        target_prompt: Optional[Union[str, List[str]]] = None,
+        target_negative_prompt: Optional[Union[str, List[str]]] = None,
+        target_prompt_embeds: Optional[torch.Tensor] = None,
+        target_negative_prompt_embeds: Optional[torch.Tensor] = None,
+        source_prompt: Optional[Union[str, List[str]]] = None,
+        source_negative_prompt: Optional[Union[str, List[str]]] = None,
+        source_prompt_embeds: Optional[torch.Tensor] = None,
+        source_negative_prompt_embeds: Optional[torch.Tensor] = None,
+        num_maps_per_mask: Optional[int] = 10,
+        mask_encode_strength: Optional[float] = 0.5,
+        mask_thresholding_ratio: Optional[float] = 3.0,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "np",
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        Generate a latent mask given a mask prompt, a target prompt, and an image.
+
+        Args:
+            image (`PIL.Image.Image`):
+                `Image` or tensor representing an image batch to be used for computing the mask.
+            target_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide semantic mask generation. If not defined, you need to pass
+                `prompt_embeds`.
+            target_negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            target_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            target_negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            source_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide semantic mask generation using DiffEdit. If not defined, you need to
+                pass `source_prompt_embeds` or `source_image` instead.
+            source_negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide semantic mask generation away from using DiffEdit. If not defined, you
+                need to pass `source_negative_prompt_embeds` or `source_image` instead.
+            source_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings to guide the semantic mask generation. Can be used to easily tweak text
+                inputs (prompt weighting). If not provided, text embeddings are generated from `source_prompt` input
+                argument.
+            source_negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings to negatively guide the semantic mask generation. Can be used to easily
+                tweak text inputs (prompt weighting). If not provided, text embeddings are generated from
+                `source_negative_prompt` input argument.
+            num_maps_per_mask (`int`, *optional*, defaults to 10):
+                The number of noise maps sampled to generate the semantic mask using DiffEdit.
+            mask_encode_strength (`float`, *optional*, defaults to 0.5):
+                The strength of the noise maps sampled to generate the semantic mask using DiffEdit. Must be between 0
+                and 1.
+            mask_thresholding_ratio (`float`, *optional*, defaults to 3.0):
+                The maximum multiple of the mean absolute difference used to clamp the semantic guidance map before
+                mask binarization.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the
+                [`~models.attention_processor.AttnProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        Returns:
+            `List[PIL.Image.Image]` or `np.array`:
+                When returning a `List[PIL.Image.Image]`, the list consists of a batch of single-channel binary images
+                with dimensions `(height // self.vae_scale_factor, width // self.vae_scale_factor)`. If it's
+                `np.array`, the shape is `(batch_size, height // self.vae_scale_factor, width //
+                self.vae_scale_factor)`.
+        """
+
+        # 1. Check inputs (Provide dummy argument for callback_steps)
+        self.check_inputs(
+            target_prompt,
+            mask_encode_strength,
+            1,
+            target_negative_prompt,
+            target_prompt_embeds,
+            target_negative_prompt_embeds,
+        )
+
+        self.check_source_inputs(
+            source_prompt,
+            source_negative_prompt,
+            source_prompt_embeds,
+            source_negative_prompt_embeds,
+        )
+
+        if (num_maps_per_mask is None) or (
+            num_maps_per_mask is not None and (not isinstance(num_maps_per_mask, int) or num_maps_per_mask <= 0)
+        ):
+            raise ValueError(
+                f"`num_maps_per_mask` has to be a positive integer but is {num_maps_per_mask} of type"
+                f" {type(num_maps_per_mask)}."
+            )
+
+        if mask_thresholding_ratio is None or mask_thresholding_ratio <= 0:
+            raise ValueError(
+                f"`mask_thresholding_ratio` has to be positive but is {mask_thresholding_ratio} of type"
+                f" {type(mask_thresholding_ratio)}."
+            )
+
+        # 2. Define call parameters
+        if target_prompt is not None and isinstance(target_prompt, str):
+            batch_size = 1
+        elif target_prompt is not None and isinstance(target_prompt, list):
+            batch_size = len(target_prompt)
+        else:
+            batch_size = target_prompt_embeds.shape[0]
+        if cross_attention_kwargs is None:
+            cross_attention_kwargs = {}
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompts
+        (cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None)
+        target_negative_prompt_embeds, target_prompt_embeds = self.encode_prompt(
+            target_prompt,
+            device,
+            num_maps_per_mask,
+            do_classifier_free_guidance,
+            target_negative_prompt,
+            prompt_embeds=target_prompt_embeds,
+            negative_prompt_embeds=target_negative_prompt_embeds,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            target_prompt_embeds = torch.cat([target_negative_prompt_embeds, target_prompt_embeds])
+
+        source_negative_prompt_embeds, source_prompt_embeds = self.encode_prompt(
+            source_prompt,
+            device,
+            num_maps_per_mask,
+            do_classifier_free_guidance,
+            source_negative_prompt,
+            prompt_embeds=source_prompt_embeds,
+            negative_prompt_embeds=source_negative_prompt_embeds,
+        )
+        if do_classifier_free_guidance:
+            source_prompt_embeds = torch.cat([source_negative_prompt_embeds, source_prompt_embeds])
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image).repeat_interleave(num_maps_per_mask, dim=0)
+
+        # 5. Set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, _ = self.get_timesteps(num_inference_steps, mask_encode_strength, device)
+        encode_timestep = timesteps[0]
+
+        # 6. Prepare image latents and add noise with specified strength
+        image_latents = self.prepare_image_latents(
+            image, batch_size * num_maps_per_mask, self.vae.dtype, device, generator
+        )
+        noise = randn_tensor(image_latents.shape, generator=generator, device=device, dtype=self.vae.dtype)
+        image_latents = self.scheduler.add_noise(image_latents, noise, encode_timestep)
+
+        latent_model_input = torch.cat([image_latents] * (4 if do_classifier_free_guidance else 2))
+        latent_model_input = self.scheduler.scale_model_input(latent_model_input, encode_timestep)
+
+        # 7. Predict the noise residual
+        prompt_embeds = torch.cat([source_prompt_embeds, target_prompt_embeds])
+        noise_pred = self.unet(
+            latent_model_input,
+            encode_timestep,
+            encoder_hidden_states=prompt_embeds,
+            cross_attention_kwargs=cross_attention_kwargs,
+        ).sample
+
+        if do_classifier_free_guidance:
+            noise_pred_neg_src, noise_pred_source, noise_pred_uncond, noise_pred_target = noise_pred.chunk(4)
+            noise_pred_source = noise_pred_neg_src + guidance_scale * (noise_pred_source - noise_pred_neg_src)
+            noise_pred_target = noise_pred_uncond + guidance_scale * (noise_pred_target - noise_pred_uncond)
+        else:
+            noise_pred_source, noise_pred_target = noise_pred.chunk(2)
+
+        # 8. Compute the mask from the absolute difference of predicted noise residuals
+        # TODO: Consider smoothing mask guidance map
+        mask_guidance_map = (
+            torch.abs(noise_pred_target - noise_pred_source)
+            .reshape(batch_size, num_maps_per_mask, *noise_pred_target.shape[-3:])
+            .mean([1, 2])
+        )
+        clamp_magnitude = mask_guidance_map.mean() * mask_thresholding_ratio
+        semantic_mask_image = mask_guidance_map.clamp(0, clamp_magnitude) / clamp_magnitude
+        semantic_mask_image = torch.where(semantic_mask_image <= 0.5, 0, 1)
+        mask_image = semantic_mask_image.cpu().numpy()
+
+        # 9. Convert to Numpy array or PIL.
+        if output_type == "pil":
+            mask_image = self.image_processor.numpy_to_pil(mask_image)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        return mask_image
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_INVERT_DOC_STRING)
+    def invert(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        num_inference_steps: int = 50,
+        inpaint_strength: float = 0.8,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        decode_latents: bool = False,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        lambda_auto_corr: float = 20.0,
+        lambda_kl: float = 20.0,
+        num_reg_steps: int = 0,
+        num_auto_corr_rolls: int = 5,
+    ):
+        r"""
+        Generate inverted latents given a prompt and image.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`PIL.Image.Image`):
+                `Image` or tensor representing an image batch to produce the inverted latents guided by `prompt`.
+            inpaint_strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent of the noising process to run latent inversion. Must be between 0 and 1. When
+                `inpaint_strength` is 1, the inversion process is run for the full number of iterations specified in
+                `num_inference_steps`. `image` is used as a reference for the inversion process, and adding more noise
+                increases `inpaint_strength`. If `inpaint_strength` is 0, no inpainting occurs.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            decode_latents (`bool`, *optional*, defaults to `False`):
+                Whether or not to decode the inverted latents into a generated image. Setting this argument to `True`
+                decodes all inverted latents for each timestep into a list of generated images.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.DiffEditInversionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the
+                [`~models.attention_processor.AttnProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            lambda_auto_corr (`float`, *optional*, defaults to 20.0):
+                Lambda parameter to control auto correction.
+            lambda_kl (`float`, *optional*, defaults to 20.0):
+                Lambda parameter to control Kullback-Leibler divergence output.
+            num_reg_steps (`int`, *optional*, defaults to 0):
+                Number of regularization loss steps.
+            num_auto_corr_rolls (`int`, *optional*, defaults to 5):
+                Number of auto correction roll steps.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.pipeline_stable_diffusion_diffedit.DiffEditInversionPipelineOutput`] or
+            `tuple`:
+                If `return_dict` is `True`,
+                [`~pipelines.stable_diffusion.pipeline_stable_diffusion_diffedit.DiffEditInversionPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is the inverted latents tensors
+                ordered by increasing noise, and the second is the corresponding decoded images if `decode_latents` is
+                `True`, otherwise `None`.
+        """
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            inpaint_strength,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        if image is None:
+            raise ValueError("`image` input cannot be undefined.")
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        if cross_attention_kwargs is None:
+            cross_attention_kwargs = {}
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 4. Prepare latent variables
+        num_images_per_prompt = 1
+        latents = self.prepare_image_latents(
+            image, batch_size * num_images_per_prompt, self.vae.dtype, device, generator
+        )
+
+        # 5. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 6. Prepare timesteps
+        self.inverse_scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_inverse_timesteps(num_inference_steps, inpaint_strength, device)
+
+        # 7. Noising loop where we obtain the intermediate noised latent image for each timestep.
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order
+        inverted_latents = []
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # regularization of the noise prediction (not in original code or paper but borrowed from Pix2PixZero)
+                if num_reg_steps > 0:
+                    with torch.enable_grad():
+                        for _ in range(num_reg_steps):
+                            if lambda_auto_corr > 0:
+                                for _ in range(num_auto_corr_rolls):
+                                    var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True)
+
+                                    # Derive epsilon from model output before regularizing to IID standard normal
+                                    var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t)
+
+                                    l_ac = auto_corr_loss(var_epsilon, generator=generator)
+                                    l_ac.backward()
+
+                                    grad = var.grad.detach() / num_auto_corr_rolls
+                                    noise_pred = noise_pred - lambda_auto_corr * grad
+
+                            if lambda_kl > 0:
+                                var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True)
+
+                                # Derive epsilon from model output before regularizing to IID standard normal
+                                var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t)
+
+                                l_kld = kl_divergence(var_epsilon)
+                                l_kld.backward()
+
+                                grad = var.grad.detach()
+                                noise_pred = noise_pred - lambda_kl * grad
+
+                            noise_pred = noise_pred.detach()
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample
+                inverted_latents.append(latents.detach().clone())
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0
+                ):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        assert len(inverted_latents) == len(timesteps)
+        latents = torch.stack(list(reversed(inverted_latents)), 1)
+
+        # 8. Post-processing
+        image = None
+        if decode_latents:
+            image = self.decode_latents(latents.flatten(0, 1))
+
+        # 9. Convert to PIL.
+        if decode_latents and output_type == "pil":
+            image = self.image_processor.numpy_to_pil(image)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (latents, image)
+
+        return DiffEditInversionPipelineOutput(latents=latents, images=image)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
+        image_latents: Union[torch.Tensor, PIL.Image.Image] = None,
+        inpaint_strength: Optional[float] = 0.8,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: int = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            mask_image (`PIL.Image.Image`):
+                `Image` or tensor representing an image batch to mask the generated image. White pixels in the mask are
+                repainted, while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, 1, H, W)`.
+            image_latents (`PIL.Image.Image` or `torch.Tensor`):
+                Partially noised image latents from the inversion process to be used as inputs for image generation.
+            inpaint_strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to inpaint the masked area. Must be between 0 and 1. When `inpaint_strength` is 1, the
+                denoising process is run on the masked area for the full number of iterations specified in
+                `num_inference_steps`. `image_latents` is used as a reference for the masked area, and adding more
+                noise to a region increases `inpaint_strength`. If `inpaint_strength` is 0, no inpainting occurs.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            inpaint_strength,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        if mask_image is None:
+            raise ValueError(
+                "`mask_image` input cannot be undefined. Use `generate_mask()` to compute `mask_image` from text prompts."
+            )
+        if image_latents is None:
+            raise ValueError(
+                "`image_latents` input cannot be undefined. Use `invert()` to compute `image_latents` from input images."
+            )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        if cross_attention_kwargs is None:
+            cross_attention_kwargs = {}
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Preprocess mask
+        mask_image = preprocess_mask(mask_image, batch_size)
+        latent_height, latent_width = mask_image.shape[-2:]
+        mask_image = torch.cat([mask_image] * num_images_per_prompt)
+        mask_image = mask_image.to(device=device, dtype=prompt_embeds.dtype)
+
+        # 5. Set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, inpaint_strength, device)
+
+        # 6. Preprocess image latents
+        if isinstance(image_latents, list) and any(isinstance(l, torch.Tensor) and l.ndim == 5 for l in image_latents):
+            image_latents = torch.cat(image_latents).detach()
+        elif isinstance(image_latents, torch.Tensor) and image_latents.ndim == 5:
+            image_latents = image_latents.detach()
+        else:
+            image_latents = self.image_processor.preprocess(image_latents).detach()
+
+        latent_shape = (self.vae.config.latent_channels, latent_height, latent_width)
+        if image_latents.shape[-3:] != latent_shape:
+            raise ValueError(
+                f"Each latent image in `image_latents` must have shape {latent_shape}, "
+                f"but has shape {image_latents.shape[-3:]}"
+            )
+        if image_latents.ndim == 4:
+            image_latents = image_latents.reshape(batch_size, len(timesteps), *latent_shape)
+        if image_latents.shape[:2] != (batch_size, len(timesteps)):
+            raise ValueError(
+                f"`image_latents` must have batch size {batch_size} with latent images from {len(timesteps)}"
+                f" timesteps, but has batch size {image_latents.shape[0]} with latent images from"
+                f" {image_latents.shape[1]} timesteps."
+            )
+        image_latents = image_latents.transpose(0, 1).repeat_interleave(num_images_per_prompt, dim=1)
+        image_latents = image_latents.to(device=device, dtype=prompt_embeds.dtype)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        latents = image_latents[0].clone()
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # mask with inverted latents from appropriate timestep - use original image latent for last step
+                latents = latents * mask_image + image_latents[i] * (1 - mask_image)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..147980cbf9e5c3418fc1854787ae37b25e4fed56
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_gligen"] = ["StableDiffusionGLIGENPipeline"]
+    _import_structure["pipeline_stable_diffusion_gligen_text_image"] = ["StableDiffusionGLIGENTextImagePipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_stable_diffusion_gligen import StableDiffusionGLIGENPipeline
+        from .pipeline_stable_diffusion_gligen_text_image import StableDiffusionGLIGENTextImagePipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/diffusers/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
new file mode 100644
index 0000000000000000000000000000000000000000..52ccd5612776caca32fd7bfec09297477f49b427
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
@@ -0,0 +1,851 @@
+# Copyright 2024 The GLIGEN Authors and HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention import GatedSelfAttentionDense
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionGLIGENPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> # Insert objects described by text at the region defined by bounding boxes
+        >>> pipe = StableDiffusionGLIGENPipeline.from_pretrained(
+        ...     "masterful/gligen-1-4-inpainting-text-box", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> input_image = load_image(
+        ...     "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/livingroom_modern.png"
+        ... )
+        >>> prompt = "a birthday cake"
+        >>> boxes = [[0.2676, 0.6088, 0.4773, 0.7183]]
+        >>> phrases = ["a birthday cake"]
+
+        >>> images = pipe(
+        ...     prompt=prompt,
+        ...     gligen_phrases=phrases,
+        ...     gligen_inpaint_image=input_image,
+        ...     gligen_boxes=boxes,
+        ...     gligen_scheduled_sampling_beta=1,
+        ...     output_type="pil",
+        ...     num_inference_steps=50,
+        ... ).images
+
+        >>> images[0].save("./gligen-1-4-inpainting-text-box.jpg")
+
+        >>> # Generate an image described by the prompt and
+        >>> # insert objects described by text at the region defined by bounding boxes
+        >>> pipe = StableDiffusionGLIGENPipeline.from_pretrained(
+        ...     "masterful/gligen-1-4-generation-text-box", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a waterfall and a modern high speed train running through the tunnel in a beautiful forest with fall foliage"
+        >>> boxes = [[0.1387, 0.2051, 0.4277, 0.7090], [0.4980, 0.4355, 0.8516, 0.7266]]
+        >>> phrases = ["a waterfall", "a modern high speed train running through the tunnel"]
+
+        >>> images = pipe(
+        ...     prompt=prompt,
+        ...     gligen_phrases=phrases,
+        ...     gligen_boxes=boxes,
+        ...     gligen_scheduled_sampling_beta=1,
+        ...     output_type="pil",
+        ...     num_inference_steps=50,
+        ... ).images
+
+        >>> images[0].save("./gligen-1-4-generation-text-box.jpg")
+        ```
+"""
+
+
+class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion with Grounded-Language-to-Image Generation (GLIGEN).
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        gligen_phrases,
+        gligen_boxes,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if len(gligen_phrases) != len(gligen_boxes):
+            raise ValueError(
+                "length of `gligen_phrases` and `gligen_boxes` has to be same, but"
+                f" got: `gligen_phrases` {len(gligen_phrases)} != `gligen_boxes` {len(gligen_boxes)}"
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def enable_fuser(self, enabled=True):
+        for module in self.unet.modules():
+            if type(module) is GatedSelfAttentionDense:
+                module.enabled = enabled
+
+    def draw_inpaint_mask_from_boxes(self, boxes, size):
+        inpaint_mask = torch.ones(size[0], size[1])
+        for box in boxes:
+            x0, x1 = box[0] * size[0], box[2] * size[0]
+            y0, y1 = box[1] * size[1], box[3] * size[1]
+            inpaint_mask[int(y0) : int(y1), int(x0) : int(x1)] = 0
+        return inpaint_mask
+
+    def crop(self, im, new_width, new_height):
+        width, height = im.size
+        left = (width - new_width) / 2
+        top = (height - new_height) / 2
+        right = (width + new_width) / 2
+        bottom = (height + new_height) / 2
+        return im.crop((left, top, right, bottom))
+
+    def target_size_center_crop(self, im, new_hw):
+        width, height = im.size
+        if width != height:
+            im = self.crop(im, min(height, width), min(height, width))
+        return im.resize((new_hw, new_hw), PIL.Image.LANCZOS)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        gligen_scheduled_sampling_beta: float = 0.3,
+        gligen_phrases: List[str] = None,
+        gligen_boxes: List[List[float]] = None,
+        gligen_inpaint_image: Optional[PIL.Image.Image] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            gligen_phrases (`List[str]`):
+                The phrases to guide what to include in each of the regions defined by the corresponding
+                `gligen_boxes`. There should only be one phrase per bounding box.
+            gligen_boxes (`List[List[float]]`):
+                The bounding boxes that identify rectangular regions of the image that are going to be filled with the
+                content described by the corresponding `gligen_phrases`. Each rectangular box is defined as a
+                `List[float]` of 4 elements `[xmin, ymin, xmax, ymax]` where each value is between [0,1].
+            gligen_inpaint_image (`PIL.Image.Image`, *optional*):
+                The input image, if provided, is inpainted with objects described by the `gligen_boxes` and
+                `gligen_phrases`. Otherwise, it is treated as a generation task on a blank input image.
+            gligen_scheduled_sampling_beta (`float`, defaults to 0.3):
+                Scheduled Sampling factor from [GLIGEN: Open-Set Grounded Text-to-Image
+                Generation](https://arxiv.org/pdf/2301.07093.pdf). Scheduled Sampling factor is only varied for
+                scheduled sampling during inference for improved quality and controllability.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            gligen_phrases,
+            gligen_boxes,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 5.1 Prepare GLIGEN variables
+        max_objs = 30
+        if len(gligen_boxes) > max_objs:
+            warnings.warn(
+                f"More that {max_objs} objects found. Only first {max_objs} objects will be processed.",
+                FutureWarning,
+            )
+            gligen_phrases = gligen_phrases[:max_objs]
+            gligen_boxes = gligen_boxes[:max_objs]
+        # prepare batched input to the GLIGENTextBoundingboxProjection (boxes, phrases, mask)
+        # Get tokens for phrases from pre-trained CLIPTokenizer
+        tokenizer_inputs = self.tokenizer(gligen_phrases, padding=True, return_tensors="pt").to(device)
+        # For the token, we use the same pre-trained text encoder
+        # to obtain its text feature
+        _text_embeddings = self.text_encoder(**tokenizer_inputs).pooler_output
+        n_objs = len(gligen_boxes)
+        # For each entity, described in phrases, is denoted with a bounding box,
+        # we represent the location information as (xmin,ymin,xmax,ymax)
+        boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype)
+        boxes[:n_objs] = torch.tensor(gligen_boxes)
+        text_embeddings = torch.zeros(
+            max_objs, self.unet.config.cross_attention_dim, device=device, dtype=self.text_encoder.dtype
+        )
+        text_embeddings[:n_objs] = _text_embeddings
+        # Generate a mask for each object that is entity described by phrases
+        masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
+        masks[:n_objs] = 1
+
+        repeat_batch = batch_size * num_images_per_prompt
+        boxes = boxes.unsqueeze(0).expand(repeat_batch, -1, -1).clone()
+        text_embeddings = text_embeddings.unsqueeze(0).expand(repeat_batch, -1, -1).clone()
+        masks = masks.unsqueeze(0).expand(repeat_batch, -1).clone()
+        if do_classifier_free_guidance:
+            repeat_batch = repeat_batch * 2
+            boxes = torch.cat([boxes] * 2)
+            text_embeddings = torch.cat([text_embeddings] * 2)
+            masks = torch.cat([masks] * 2)
+            masks[: repeat_batch // 2] = 0
+        if cross_attention_kwargs is None:
+            cross_attention_kwargs = {}
+        cross_attention_kwargs["gligen"] = {"boxes": boxes, "positive_embeddings": text_embeddings, "masks": masks}
+
+        # Prepare latent variables for GLIGEN inpainting
+        if gligen_inpaint_image is not None:
+            # if the given input image is not of the same size as expected by VAE
+            # center crop and resize the input image to expected shape
+            if gligen_inpaint_image.size != (self.vae.sample_size, self.vae.sample_size):
+                gligen_inpaint_image = self.target_size_center_crop(gligen_inpaint_image, self.vae.sample_size)
+            # Convert a single image into a batch of images with a batch size of 1
+            # The resulting shape becomes (1, C, H, W), where C is the number of channels,
+            # and H and W are the height and width of the image.
+            # scales the pixel values to a range [-1, 1]
+            gligen_inpaint_image = self.image_processor.preprocess(gligen_inpaint_image)
+            gligen_inpaint_image = gligen_inpaint_image.to(dtype=self.vae.dtype, device=self.vae.device)
+            # Run AutoEncoder to get corresponding latents
+            gligen_inpaint_latent = self.vae.encode(gligen_inpaint_image).latent_dist.sample()
+            gligen_inpaint_latent = self.vae.config.scaling_factor * gligen_inpaint_latent
+            # Generate an inpainting mask
+            # pixel value = 0, where the object is present (defined by bounding boxes above)
+            #               1, everywhere else
+            gligen_inpaint_mask = self.draw_inpaint_mask_from_boxes(gligen_boxes, gligen_inpaint_latent.shape[2:])
+            gligen_inpaint_mask = gligen_inpaint_mask.to(
+                dtype=gligen_inpaint_latent.dtype, device=gligen_inpaint_latent.device
+            )
+            gligen_inpaint_mask = gligen_inpaint_mask[None, None]
+            gligen_inpaint_mask_addition = torch.cat(
+                (gligen_inpaint_latent * gligen_inpaint_mask, gligen_inpaint_mask), dim=1
+            )
+            # Convert a single mask into a batch of masks with a batch size of 1
+            gligen_inpaint_mask_addition = gligen_inpaint_mask_addition.expand(repeat_batch, -1, -1, -1).clone()
+
+        num_grounding_steps = int(gligen_scheduled_sampling_beta * len(timesteps))
+        self.enable_fuser(True)
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Scheduled sampling
+                if i == num_grounding_steps:
+                    self.enable_fuser(False)
+
+                if latents.shape[1] != 4:
+                    latents = torch.randn_like(latents[:, :4])
+
+                if gligen_inpaint_image is not None:
+                    gligen_inpaint_latent_with_noise = (
+                        self.scheduler.add_noise(
+                            gligen_inpaint_latent, torch.randn_like(gligen_inpaint_latent), torch.tensor([t])
+                        )
+                        .expand(latents.shape[0], -1, -1, -1)
+                        .clone()
+                    )
+                    latents = gligen_inpaint_latent_with_noise * gligen_inpaint_mask + latents * (
+                        1 - gligen_inpaint_mask
+                    )
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if gligen_inpaint_image is not None:
+                    latent_model_input = torch.cat((latent_model_input, gligen_inpaint_mask_addition), dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/diffusers/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6748ad418fe5f7da297135b3c49d9f6589eeb4b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
@@ -0,0 +1,1023 @@
+# Copyright 2024 The GLIGEN Authors and HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import PIL.Image
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention import GatedSelfAttentionDense
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import USE_PEFT_BACKEND, logging, replace_example_docstring, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.clip_image_project_model import CLIPImageProjection
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionGLIGENTextImagePipeline
+        >>> from diffusers.utils import load_image
+
+        >>> # Insert objects described by image at the region defined by bounding boxes
+        >>> pipe = StableDiffusionGLIGENTextImagePipeline.from_pretrained(
+        ...     "anhnct/Gligen_Inpainting_Text_Image", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> input_image = load_image(
+        ...     "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/livingroom_modern.png"
+        ... )
+        >>> prompt = "a backpack"
+        >>> boxes = [[0.2676, 0.4088, 0.4773, 0.7183]]
+        >>> phrases = None
+        >>> gligen_image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/backpack.jpeg"
+        ... )
+
+        >>> images = pipe(
+        ...     prompt=prompt,
+        ...     gligen_phrases=phrases,
+        ...     gligen_inpaint_image=input_image,
+        ...     gligen_boxes=boxes,
+        ...     gligen_images=[gligen_image],
+        ...     gligen_scheduled_sampling_beta=1,
+        ...     output_type="pil",
+        ...     num_inference_steps=50,
+        ... ).images
+
+        >>> images[0].save("./gligen-inpainting-text-image-box.jpg")
+
+        >>> # Generate an image described by the prompt and
+        >>> # insert objects described by text and image at the region defined by bounding boxes
+        >>> pipe = StableDiffusionGLIGENTextImagePipeline.from_pretrained(
+        ...     "anhnct/Gligen_Text_Image", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a flower sitting on the beach"
+        >>> boxes = [[0.0, 0.09, 0.53, 0.76]]
+        >>> phrases = ["flower"]
+        >>> gligen_image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/pexels-pixabay-60597.jpg"
+        ... )
+
+        >>> images = pipe(
+        ...     prompt=prompt,
+        ...     gligen_phrases=phrases,
+        ...     gligen_images=[gligen_image],
+        ...     gligen_boxes=boxes,
+        ...     gligen_scheduled_sampling_beta=1,
+        ...     output_type="pil",
+        ...     num_inference_steps=50,
+        ... ).images
+
+        >>> images[0].save("./gligen-generation-text-image-box.jpg")
+
+        >>> # Generate an image described by the prompt and
+        >>> # transfer style described by image at the region defined by bounding boxes
+        >>> pipe = StableDiffusionGLIGENTextImagePipeline.from_pretrained(
+        ...     "anhnct/Gligen_Text_Image", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a dragon flying on the sky"
+        >>> boxes = [[0.4, 0.2, 1.0, 0.8], [0.0, 1.0, 0.0, 1.0]]  # Set `[0.0, 1.0, 0.0, 1.0]` for the style
+
+        >>> gligen_image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"
+        ... )
+
+        >>> gligen_placeholder = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"
+        ... )
+
+        >>> images = pipe(
+        ...     prompt=prompt,
+        ...     gligen_phrases=[
+        ...         "dragon",
+        ...         "placeholder",
+        ...     ],  # Can use any text instead of `placeholder` token, because we will use mask here
+        ...     gligen_images=[
+        ...         gligen_placeholder,
+        ...         gligen_image,
+        ...     ],  # Can use any image in gligen_placeholder, because we will use mask here
+        ...     input_phrases_mask=[1, 0],  # Set 0 for the placeholder token
+        ...     input_images_mask=[0, 1],  # Set 0 for the placeholder image
+        ...     gligen_boxes=boxes,
+        ...     gligen_scheduled_sampling_beta=1,
+        ...     output_type="pil",
+        ...     num_inference_steps=50,
+        ... ).images
+
+        >>> images[0].save("./gligen-generation-text-image-box-style-transfer.jpg")
+        ```
+"""
+
+
+class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion with Grounded-Language-to-Image Generation (GLIGEN).
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        processor ([`~transformers.CLIPProcessor`]):
+            A `CLIPProcessor` to procces reference image.
+        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
+            Frozen image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        image_project ([`CLIPImageProjection`]):
+            A `CLIPImageProjection` to project image embedding into phrases embedding space.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        processor: CLIPProcessor,
+        image_encoder: CLIPVisionModelWithProjection,
+        image_project: CLIPImageProjection,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            image_encoder=image_encoder,
+            processor=processor,
+            image_project=image_project,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def enable_fuser(self, enabled=True):
+        for module in self.unet.modules():
+            if type(module) is GatedSelfAttentionDense:
+                module.enabled = enabled
+
+    def draw_inpaint_mask_from_boxes(self, boxes, size):
+        """
+        Create an inpainting mask based on given boxes. This function generates an inpainting mask using the provided
+        boxes to mark regions that need to be inpainted.
+        """
+        inpaint_mask = torch.ones(size[0], size[1])
+        for box in boxes:
+            x0, x1 = box[0] * size[0], box[2] * size[0]
+            y0, y1 = box[1] * size[1], box[3] * size[1]
+            inpaint_mask[int(y0) : int(y1), int(x0) : int(x1)] = 0
+        return inpaint_mask
+
+    def crop(self, im, new_width, new_height):
+        """
+        Crop the input image to the specified dimensions.
+        """
+        width, height = im.size
+        left = (width - new_width) / 2
+        top = (height - new_height) / 2
+        right = (width + new_width) / 2
+        bottom = (height + new_height) / 2
+        return im.crop((left, top, right, bottom))
+
+    def target_size_center_crop(self, im, new_hw):
+        """
+        Crop and resize the image to the target size while keeping the center.
+        """
+        width, height = im.size
+        if width != height:
+            im = self.crop(im, min(height, width), min(height, width))
+        return im.resize((new_hw, new_hw), PIL.Image.LANCZOS)
+
+    def complete_mask(self, has_mask, max_objs, device):
+        """
+        Based on the input mask corresponding value `0 or 1` for each phrases and image, mask the features
+        corresponding to phrases and images.
+        """
+        mask = torch.ones(1, max_objs).type(self.text_encoder.dtype).to(device)
+        if has_mask is None:
+            return mask
+
+        if isinstance(has_mask, int):
+            return mask * has_mask
+        else:
+            for idx, value in enumerate(has_mask):
+                mask[0, idx] = value
+            return mask
+
+    def get_clip_feature(self, input, normalize_constant, device, is_image=False):
+        """
+        Get image and phrases embedding by using CLIP pretrain model. The image embedding is transformed into the
+        phrases embedding space through a projection.
+        """
+        if is_image:
+            if input is None:
+                return None
+            inputs = self.processor(images=[input], return_tensors="pt").to(device)
+            inputs["pixel_values"] = inputs["pixel_values"].to(self.image_encoder.dtype)
+
+            outputs = self.image_encoder(**inputs)
+            feature = outputs.image_embeds
+            feature = self.image_project(feature).squeeze(0)
+            feature = (feature / feature.norm()) * normalize_constant
+            feature = feature.unsqueeze(0)
+        else:
+            if input is None:
+                return None
+            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(device)
+            outputs = self.text_encoder(**inputs)
+            feature = outputs.pooler_output
+        return feature
+
+    def get_cross_attention_kwargs_with_grounded(
+        self,
+        hidden_size,
+        gligen_phrases,
+        gligen_images,
+        gligen_boxes,
+        input_phrases_mask,
+        input_images_mask,
+        repeat_batch,
+        normalize_constant,
+        max_objs,
+        device,
+    ):
+        """
+        Prepare the cross-attention kwargs containing information about the grounded input (boxes, mask, image
+        embedding, phrases embedding).
+        """
+        phrases, images = gligen_phrases, gligen_images
+        images = [None] * len(phrases) if images is None else images
+        phrases = [None] * len(images) if phrases is None else phrases
+
+        boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype)
+        masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
+        phrases_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
+        image_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
+        phrases_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype)
+        image_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype)
+
+        text_features = []
+        image_features = []
+        for phrase, image in zip(phrases, images):
+            text_features.append(self.get_clip_feature(phrase, normalize_constant, device, is_image=False))
+            image_features.append(self.get_clip_feature(image, normalize_constant, device, is_image=True))
+
+        for idx, (box, text_feature, image_feature) in enumerate(zip(gligen_boxes, text_features, image_features)):
+            boxes[idx] = torch.tensor(box)
+            masks[idx] = 1
+            if text_feature is not None:
+                phrases_embeddings[idx] = text_feature
+                phrases_masks[idx] = 1
+            if image_feature is not None:
+                image_embeddings[idx] = image_feature
+                image_masks[idx] = 1
+
+        input_phrases_mask = self.complete_mask(input_phrases_mask, max_objs, device)
+        phrases_masks = phrases_masks.unsqueeze(0).repeat(repeat_batch, 1) * input_phrases_mask
+        input_images_mask = self.complete_mask(input_images_mask, max_objs, device)
+        image_masks = image_masks.unsqueeze(0).repeat(repeat_batch, 1) * input_images_mask
+        boxes = boxes.unsqueeze(0).repeat(repeat_batch, 1, 1)
+        masks = masks.unsqueeze(0).repeat(repeat_batch, 1)
+        phrases_embeddings = phrases_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1)
+        image_embeddings = image_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1)
+
+        out = {
+            "boxes": boxes,
+            "masks": masks,
+            "phrases_masks": phrases_masks,
+            "image_masks": image_masks,
+            "phrases_embeddings": phrases_embeddings,
+            "image_embeddings": image_embeddings,
+        }
+
+        return out
+
+    def get_cross_attention_kwargs_without_grounded(self, hidden_size, repeat_batch, max_objs, device):
+        """
+        Prepare the cross-attention kwargs without information about the grounded input (boxes, mask, image embedding,
+        phrases embedding) (All are zero tensor).
+        """
+        boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype)
+        masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
+        phrases_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
+        image_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
+        phrases_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype)
+        image_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype)
+
+        out = {
+            "boxes": boxes.unsqueeze(0).repeat(repeat_batch, 1, 1),
+            "masks": masks.unsqueeze(0).repeat(repeat_batch, 1),
+            "phrases_masks": phrases_masks.unsqueeze(0).repeat(repeat_batch, 1),
+            "image_masks": image_masks.unsqueeze(0).repeat(repeat_batch, 1),
+            "phrases_embeddings": phrases_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1),
+            "image_embeddings": image_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1),
+        }
+
+        return out
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        gligen_scheduled_sampling_beta: float = 0.3,
+        gligen_phrases: List[str] = None,
+        gligen_images: List[PIL.Image.Image] = None,
+        input_phrases_mask: Union[int, List[int]] = None,
+        input_images_mask: Union[int, List[int]] = None,
+        gligen_boxes: List[List[float]] = None,
+        gligen_inpaint_image: Optional[PIL.Image.Image] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        gligen_normalize_constant: float = 28.7,
+        clip_skip: int = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            gligen_phrases (`List[str]`):
+                The phrases to guide what to include in each of the regions defined by the corresponding
+                `gligen_boxes`. There should only be one phrase per bounding box.
+            gligen_images (`List[PIL.Image.Image]`):
+                The images to guide what to include in each of the regions defined by the corresponding `gligen_boxes`.
+                There should only be one image per bounding box
+            input_phrases_mask (`int` or `List[int]`):
+                pre phrases mask input defined by the correspongding `input_phrases_mask`
+            input_images_mask (`int` or `List[int]`):
+                pre images mask input defined by the correspongding `input_images_mask`
+            gligen_boxes (`List[List[float]]`):
+                The bounding boxes that identify rectangular regions of the image that are going to be filled with the
+                content described by the corresponding `gligen_phrases`. Each rectangular box is defined as a
+                `List[float]` of 4 elements `[xmin, ymin, xmax, ymax]` where each value is between [0,1].
+            gligen_inpaint_image (`PIL.Image.Image`, *optional*):
+                The input image, if provided, is inpainted with objects described by the `gligen_boxes` and
+                `gligen_phrases`. Otherwise, it is treated as a generation task on a blank input image.
+            gligen_scheduled_sampling_beta (`float`, defaults to 0.3):
+                Scheduled Sampling factor from [GLIGEN: Open-Set Grounded Text-to-Image
+                Generation](https://arxiv.org/pdf/2301.07093.pdf). Scheduled Sampling factor is only varied for
+                scheduled sampling during inference for improved quality and controllability.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            gligen_normalize_constant (`float`, *optional*, defaults to 28.7):
+                The normalize value of the image embedding.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 5.1 Prepare GLIGEN variables
+        max_objs = 30
+        if len(gligen_boxes) > max_objs:
+            warnings.warn(
+                f"More that {max_objs} objects found. Only first {max_objs} objects will be processed.",
+                FutureWarning,
+            )
+            gligen_phrases = gligen_phrases[:max_objs]
+            gligen_boxes = gligen_boxes[:max_objs]
+            gligen_images = gligen_images[:max_objs]
+
+        repeat_batch = batch_size * num_images_per_prompt
+
+        if do_classifier_free_guidance:
+            repeat_batch = repeat_batch * 2
+
+        if cross_attention_kwargs is None:
+            cross_attention_kwargs = {}
+
+        hidden_size = prompt_embeds.shape[2]
+
+        cross_attention_kwargs["gligen"] = self.get_cross_attention_kwargs_with_grounded(
+            hidden_size=hidden_size,
+            gligen_phrases=gligen_phrases,
+            gligen_images=gligen_images,
+            gligen_boxes=gligen_boxes,
+            input_phrases_mask=input_phrases_mask,
+            input_images_mask=input_images_mask,
+            repeat_batch=repeat_batch,
+            normalize_constant=gligen_normalize_constant,
+            max_objs=max_objs,
+            device=device,
+        )
+
+        cross_attention_kwargs_without_grounded = {}
+        cross_attention_kwargs_without_grounded["gligen"] = self.get_cross_attention_kwargs_without_grounded(
+            hidden_size=hidden_size, repeat_batch=repeat_batch, max_objs=max_objs, device=device
+        )
+
+        # Prepare latent variables for GLIGEN inpainting
+        if gligen_inpaint_image is not None:
+            # if the given input image is not of the same size as expected by VAE
+            # center crop and resize the input image to expected shape
+            if gligen_inpaint_image.size != (self.vae.sample_size, self.vae.sample_size):
+                gligen_inpaint_image = self.target_size_center_crop(gligen_inpaint_image, self.vae.sample_size)
+            # Convert a single image into a batch of images with a batch size of 1
+            # The resulting shape becomes (1, C, H, W), where C is the number of channels,
+            # and H and W are the height and width of the image.
+            # scales the pixel values to a range [-1, 1]
+            gligen_inpaint_image = self.image_processor.preprocess(gligen_inpaint_image)
+            gligen_inpaint_image = gligen_inpaint_image.to(dtype=self.vae.dtype, device=self.vae.device)
+            # Run AutoEncoder to get corresponding latents
+            gligen_inpaint_latent = self.vae.encode(gligen_inpaint_image).latent_dist.sample()
+            gligen_inpaint_latent = self.vae.config.scaling_factor * gligen_inpaint_latent
+            # Generate an inpainting mask
+            # pixel value = 0, where the object is present (defined by bounding boxes above)
+            #               1, everywhere else
+            gligen_inpaint_mask = self.draw_inpaint_mask_from_boxes(gligen_boxes, gligen_inpaint_latent.shape[2:])
+            gligen_inpaint_mask = gligen_inpaint_mask.to(
+                dtype=gligen_inpaint_latent.dtype, device=gligen_inpaint_latent.device
+            )
+            gligen_inpaint_mask = gligen_inpaint_mask[None, None]
+            gligen_inpaint_mask_addition = torch.cat(
+                (gligen_inpaint_latent * gligen_inpaint_mask, gligen_inpaint_mask), dim=1
+            )
+            # Convert a single mask into a batch of masks with a batch size of 1
+            gligen_inpaint_mask_addition = gligen_inpaint_mask_addition.expand(repeat_batch, -1, -1, -1).clone()
+
+        int(gligen_scheduled_sampling_beta * len(timesteps))
+        self.enable_fuser(True)
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if latents.shape[1] != 4:
+                    latents = torch.randn_like(latents[:, :4])
+
+                if gligen_inpaint_image is not None:
+                    gligen_inpaint_latent_with_noise = (
+                        self.scheduler.add_noise(
+                            gligen_inpaint_latent, torch.randn_like(gligen_inpaint_latent), torch.tensor([t])
+                        )
+                        .expand(latents.shape[0], -1, -1, -1)
+                        .clone()
+                    )
+                    latents = gligen_inpaint_latent_with_noise * gligen_inpaint_mask + latents * (
+                        1 - gligen_inpaint_mask
+                    )
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if gligen_inpaint_image is not None:
+                    latent_model_input = torch.cat((latent_model_input, gligen_inpaint_mask_addition), dim=1)
+
+                # predict the noise residual with grounded information
+                noise_pred_with_grounding = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                ).sample
+
+                # predict the noise residual without grounded information
+                noise_pred_without_grounding = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs_without_grounded,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    # Using noise_pred_text from noise residual with grounded information and noise_pred_uncond from noise residual without grounded information
+                    _, noise_pred_text = noise_pred_with_grounding.chunk(2)
+                    noise_pred_uncond, _ = noise_pred_without_grounding.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                else:
+                    noise_pred = noise_pred_with_grounding
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eb5bf8c229b584c934e33806efa73a3727dbb5b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py
@@ -0,0 +1,62 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_k_diffusion_available,
+    is_k_diffusion_version,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (
+        is_transformers_available()
+        and is_torch_available()
+        and is_k_diffusion_available()
+        and is_k_diffusion_version(">=", "0.0.12")
+    ):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_and_k_diffusion_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_k_diffusion"] = ["StableDiffusionKDiffusionPipeline"]
+    _import_structure["pipeline_stable_diffusion_xl_k_diffusion"] = ["StableDiffusionXLKDiffusionPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (
+            is_transformers_available()
+            and is_torch_available()
+            and is_k_diffusion_available()
+            and is_k_diffusion_version(">=", "0.0.12")
+        ):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_and_k_diffusion_objects import *
+    else:
+        from .pipeline_stable_diffusion_k_diffusion import StableDiffusionKDiffusionPipeline
+        from .pipeline_stable_diffusion_xl_k_diffusion import StableDiffusionXLKDiffusionPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/diffusers/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..122701ff923f60458abad380eb614c8df7582d31
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -0,0 +1,670 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
+from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import LMSDiscreteScheduler
+from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ModelWrapper:
+    def __init__(self, model, alphas_cumprod):
+        self.model = model
+        self.alphas_cumprod = alphas_cumprod
+
+    def apply_model(self, *args, **kwargs):
+        if len(args) == 3:
+            encoder_hidden_states = args[-1]
+            args = args[:2]
+        if kwargs.get("cond", None) is not None:
+            encoder_hidden_states = kwargs.pop("cond")
+        return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample
+
+
+class StableDiffusionKDiffusionPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    <Tip warning={true}>
+
+        This is an experimental pipeline and is likely to change in the future.
+
+    </Tip>
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae,
+        text_encoder,
+        tokenizer,
+        unet,
+        scheduler,
+        safety_checker,
+        feature_extractor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        logger.info(
+            f"{self.__class__} is an experimntal pipeline and is likely to change in the future. We recommend to use"
+            " this pipeline for fast experimentation / iteration if needed, but advice to rely on existing pipelines"
+            " as defined in https://huggingface.co/docs/diffusers/api/schedulers#implemented-schedulers for"
+            " production settings."
+        )
+
+        # get correct sigmas from LMS
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        model = ModelWrapper(unet, scheduler.alphas_cumprod)
+        if scheduler.config.prediction_type == "v_prediction":
+            self.k_diffusion_model = CompVisVDenoiser(model)
+        else:
+            self.k_diffusion_model = CompVisDenoiser(model)
+
+    def set_scheduler(self, scheduler_type: str):
+        library = importlib.import_module("k_diffusion")
+        sampling = getattr(library, "sampling")
+        try:
+            self.sampler = getattr(sampling, scheduler_type)
+        except Exception:
+            valid_samplers = []
+            for s in dir(sampling):
+                if "sample_" in s:
+                    valid_samplers.append(s)
+
+            raise ValueError(f"Invalid scheduler type {scheduler_type}. Please choose one of {valid_samplers}.")
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        use_karras_sigmas: Optional[bool] = False,
+        noise_sampler_seed: Optional[int] = None,
+        clip_skip: int = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+                Use karras sigmas. For example, specifying `sample_dpmpp_2m` to `set_scheduler` will be equivalent to
+                `DPM++2M` in stable-diffusion-webui. On top of that, setting this option to True will make it `DPM++2M
+                Karras`.
+            noise_sampler_seed (`int`, *optional*, defaults to `None`):
+                The random seed to use for the noise sampler. If `None`, a random seed will be generated.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = True
+        if guidance_scale <= 1.0:
+            raise ValueError("has to use guidance_scale")
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device)
+
+        # 5. Prepare sigmas
+        if use_karras_sigmas:
+            sigma_min: float = self.k_diffusion_model.sigmas[0].item()
+            sigma_max: float = self.k_diffusion_model.sigmas[-1].item()
+            sigmas = get_sigmas_karras(n=num_inference_steps, sigma_min=sigma_min, sigma_max=sigma_max)
+        else:
+            sigmas = self.scheduler.sigmas
+        sigmas = sigmas.to(device)
+        sigmas = sigmas.to(prompt_embeds.dtype)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        latents = latents * sigmas[0]
+        self.k_diffusion_model.sigmas = self.k_diffusion_model.sigmas.to(latents.device)
+        self.k_diffusion_model.log_sigmas = self.k_diffusion_model.log_sigmas.to(latents.device)
+
+        # 7. Define model function
+        def model_fn(x, t):
+            latent_model_input = torch.cat([x] * 2)
+            t = torch.cat([t] * 2)
+
+            noise_pred = self.k_diffusion_model(latent_model_input, t, cond=prompt_embeds)
+
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            return noise_pred
+
+        # 8. Run k-diffusion solver
+        sampler_kwargs = {}
+
+        if "noise_sampler" in inspect.signature(self.sampler).parameters:
+            min_sigma, max_sigma = sigmas[sigmas > 0].min(), sigmas.max()
+            noise_sampler = BrownianTreeNoiseSampler(latents, min_sigma, max_sigma, noise_sampler_seed)
+            sampler_kwargs["noise_sampler"] = noise_sampler
+
+        if "generator" in inspect.signature(self.sampler).parameters:
+            sampler_kwargs["generator"] = generator
+
+        latents = self.sampler(model_fn, latents, sigmas, **sampler_kwargs)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py b/diffusers/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..45f814fd538f4fbe1d9b6fb8d2490e4cb169afde
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
@@ -0,0 +1,892 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+from typing import List, Optional, Tuple, Union
+
+import torch
+from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
+from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras
+from transformers import (
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+)
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers, LMSDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionXLKDiffusionPipeline
+
+        >>> pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> pipe.set_scheduler("sample_dpmpp_2m_sde")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.ModelWrapper
+class ModelWrapper:
+    def __init__(self, model, alphas_cumprod):
+        self.model = model
+        self.alphas_cumprod = alphas_cumprod
+
+    def apply_model(self, *args, **kwargs):
+        if len(args) == 3:
+            encoder_hidden_states = args[-1]
+            args = args[:2]
+        if kwargs.get("cond", None) is not None:
+            encoder_hidden_states = kwargs.pop("cond")
+        return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample
+
+
+class StableDiffusionXLKDiffusionPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL and k-diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "feature_extractor",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        force_zeros_for_empty_prompt: bool = True,
+    ):
+        super().__init__()
+
+        # get correct sigmas from LMS
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.default_sample_size = self.unet.config.sample_size
+
+        model = ModelWrapper(unet, scheduler.alphas_cumprod)
+        if scheduler.config.prediction_type == "v_prediction":
+            self.k_diffusion_model = CompVisVDenoiser(model)
+        else:
+            self.k_diffusion_model = CompVisDenoiser(model)
+
+    # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.set_scheduler
+    def set_scheduler(self, scheduler_type: str):
+        library = importlib.import_module("k_diffusion")
+        sampling = getattr(library, "sampling")
+        try:
+            self.sampler = getattr(sampling, scheduler_type)
+        except Exception:
+            valid_samplers = []
+            for s in dir(sampling):
+                if "sample_" in s:
+                    valid_samplers.append(s)
+
+            raise ValueError(f"Invalid scheduler type {scheduler_type}. Please choose one of {valid_samplers}.")
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        use_karras_sigmas: Optional[bool] = False,
+        noise_sampler_seed: Optional[int] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        )
+
+        if guidance_scale <= 1.0:
+            raise ValueError("has to use guidance_scale")
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = None
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device)
+
+        # 5. Prepare sigmas
+        if use_karras_sigmas:
+            sigma_min: float = self.k_diffusion_model.sigmas[0].item()
+            sigma_max: float = self.k_diffusion_model.sigmas[-1].item()
+            sigmas = get_sigmas_karras(n=num_inference_steps, sigma_min=sigma_min, sigma_max=sigma_max)
+        else:
+            sigmas = self.scheduler.sigmas
+        sigmas = sigmas.to(dtype=prompt_embeds.dtype, device=device)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        latents = latents * sigmas[0]
+
+        self.k_diffusion_model.sigmas = self.k_diffusion_model.sigmas.to(latents.device)
+        self.k_diffusion_model.log_sigmas = self.k_diffusion_model.log_sigmas.to(latents.device)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+        # 8. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 9. Define model function
+        def model_fn(x, t):
+            latent_model_input = torch.cat([x] * 2)
+            t = torch.cat([t] * 2)
+
+            noise_pred = self.k_diffusion_model(
+                latent_model_input,
+                t,
+                cond=prompt_embeds,
+                timestep_cond=timestep_cond,
+                added_cond_kwargs=added_cond_kwargs,
+            )
+
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            return noise_pred
+
+        # 10. Run k-diffusion solver
+        sampler_kwargs = {}
+
+        if "noise_sampler" in inspect.signature(self.sampler).parameters:
+            min_sigma, max_sigma = sigmas[sigmas > 0].min(), sigmas.max()
+            noise_sampler = BrownianTreeNoiseSampler(latents, min_sigma, max_sigma, noise_sampler_seed)
+            sampler_kwargs["noise_sampler"] = noise_sampler
+
+        if "generator" in inspect.signature(self.sampler).parameters:
+            sampler_kwargs["generator"] = generator
+
+        latents = self.sampler(model_fn, latents, sigmas, **sampler_kwargs)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dae2affddd1fd5952f454ed9cee906277dcceb16
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_ldm3d"] = ["StableDiffusionLDM3DPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/diffusers/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..251ec12d66ab5942105ef25baac98c6b6966be52
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
@@ -0,0 +1,1013 @@
+# Copyright 2024 The Intel Labs Team Authors and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput, VaeImageProcessorLDM3D
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    BaseOutput,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> from diffusers import StableDiffusionLDM3DPipeline
+
+        >>> pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c")
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> output = pipe(prompt)
+        >>> rgb_image, depth_image = output.rgb, output.depth
+        >>> rgb_image[0].save("astronaut_ldm3d_rgb.jpg")
+        >>> depth_image[0].save("astronaut_ldm3d_depth.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+@dataclass
+class LDM3DPipelineOutput(BaseOutput):
+    """
+    Output class for Stable Diffusion pipelines.
+
+    Args:
+        rgb (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+        depth (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+        nsfw_content_detected (`List[bool]`)
+            List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
+            `None` if safety checking could not be performed.
+    """
+
+    rgb: Union[List[PIL.Image.Image], np.ndarray]
+    depth: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+
+
+class StableDiffusionLDM3DPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image and 3D generation using LDM3D.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: Optional[CLIPVisionModelWithProjection],
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            rgb_feature_extractor_input = feature_extractor_input[0]
+            safety_checker_input = self.feature_extractor(rgb_feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 49,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        rgb, depth = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return ((rgb, depth), has_nsfw_concept)
+
+        return LDM3DPipelineOutput(rgb=rgb, depth=depth, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7572db7236cd6bcfd7dd032abcb29fd5f67cf1c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_panorama"] = ["StableDiffusionPanoramaPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/diffusers/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py
new file mode 100644
index 0000000000000000000000000000000000000000..96fba06f92a2b35827542bba0ae9d7dad98b7be7
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py
@@ -0,0 +1,1168 @@
+# Copyright 2024 MultiDiffusion Authors and The HuggingFace Team. All rights reserved."
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import DDIMScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionPanoramaPipeline, DDIMScheduler
+
+        >>> model_ckpt = "stabilityai/stable-diffusion-2-base"
+        >>> scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
+        >>> pipe = StableDiffusionPanoramaPipeline.from_pretrained(
+        ...     model_ckpt, scheduler=scheduler, torch_dtype=torch.float16
+        ... )
+
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of the dolomites"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionPanoramaPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using MultiDiffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DDIMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def decode_latents_with_padding(self, latents: torch.Tensor, padding: int = 8) -> torch.Tensor:
+        """
+        Decode the given latents with padding for circular inference.
+
+        Args:
+            latents (torch.Tensor): The input latents to decode.
+            padding (int, optional): The number of latents to add on each side for padding. Defaults to 8.
+
+        Returns:
+            torch.Tensor: The decoded image with padding removed.
+
+        Notes:
+            - The padding is added to remove boundary artifacts and improve the output quality.
+            - This would slightly increase the memory usage.
+            - The padding pixels are then removed from the decoded image.
+
+        """
+        latents = 1 / self.vae.config.scaling_factor * latents
+        latents_left = latents[..., :padding]
+        latents_right = latents[..., -padding:]
+        latents = torch.cat((latents_right, latents, latents_left), axis=-1)
+        image = self.vae.decode(latents, return_dict=False)[0]
+        padding_pix = self.vae_scale_factor * padding
+        image = image[..., padding_pix:-padding_pix]
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    def get_views(
+        self,
+        panorama_height: int,
+        panorama_width: int,
+        window_size: int = 64,
+        stride: int = 8,
+        circular_padding: bool = False,
+    ) -> List[Tuple[int, int, int, int]]:
+        """
+        Generates a list of views based on the given parameters. Here, we define the mappings F_i (see Eq. 7 in the
+        MultiDiffusion paper https://arxiv.org/abs/2302.08113). If panorama's height/width < window_size, num_blocks of
+        height/width should return 1.
+
+        Args:
+            panorama_height (int): The height of the panorama.
+            panorama_width (int): The width of the panorama.
+            window_size (int, optional): The size of the window. Defaults to 64.
+            stride (int, optional): The stride value. Defaults to 8.
+            circular_padding (bool, optional): Whether to apply circular padding. Defaults to False.
+
+        Returns:
+            List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains four integers
+            representing the start and end coordinates of the window in the panorama.
+
+        """
+        panorama_height /= 8
+        panorama_width /= 8
+        num_blocks_height = (panorama_height - window_size) // stride + 1 if panorama_height > window_size else 1
+        if circular_padding:
+            num_blocks_width = panorama_width // stride if panorama_width > window_size else 1
+        else:
+            num_blocks_width = (panorama_width - window_size) // stride + 1 if panorama_width > window_size else 1
+        total_num_blocks = int(num_blocks_height * num_blocks_width)
+        views = []
+        for i in range(total_num_blocks):
+            h_start = int((i // num_blocks_width) * stride)
+            h_end = h_start + window_size
+            w_start = int((i % num_blocks_width) * stride)
+            w_end = w_start + window_size
+            views.append((h_start, h_end, w_start, w_end))
+        return views
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    @property
+    def do_classifier_free_guidance(self):
+        return False
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = 512,
+        width: Optional[int] = 2048,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.5,
+        view_batch_size: int = 1,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        circular_padding: bool = False,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs: Any,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 2048):
+                The width in pixels of the generated image. The width is kept high because the pipeline is supposed
+                generate panorama-like images.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                The timesteps at which to generate the images. If not specified, then the default timestep spacing
+                strategy of the scheduler is used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            view_batch_size (`int`, *optional*, defaults to 1):
+                The batch size to denoise split views. For some GPUs with high performance, higher view batch size can
+                speedup the generation and increase the VRAM usage.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                A rescaling factor for the guidance embeddings. A value of 0.0 means no rescaling is applied.
+            circular_padding (`bool`, *optional*, defaults to `False`):
+                If set to `True`, circular padding is applied to ensure there are no stitching artifacts. Circular
+                padding allows the model to seamlessly generate a transition from the rightmost part of the image to
+                the leftmost part, maintaining consistency in a 360-degree sense.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Define panorama grid and initialize views for synthesis.
+        # prepare batch grid
+        views = self.get_views(height, width, circular_padding=circular_padding)
+        views_batch = [views[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)]
+        views_scheduler_status = [copy.deepcopy(self.scheduler.__dict__)] * len(views_batch)
+        count = torch.zeros_like(latents)
+        value = torch.zeros_like(latents)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        # 7.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 8. Denoising loop
+        # Each denoising step also includes refinement of the latents with respect to the
+        # views.
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                count.zero_()
+                value.zero_()
+
+                # generate views
+                # Here, we iterate through different spatial crops of the latents and denoise them. These
+                # denoised (latent) crops are then averaged to produce the final latent
+                # for the current timestep via MultiDiffusion. Please see Sec. 4.1 in the
+                # MultiDiffusion paper for more details: https://arxiv.org/abs/2302.08113
+                # Batch views denoise
+                for j, batch_view in enumerate(views_batch):
+                    vb_size = len(batch_view)
+                    # get the latents corresponding to the current view coordinates
+                    if circular_padding:
+                        latents_for_view = []
+                        for h_start, h_end, w_start, w_end in batch_view:
+                            if w_end > latents.shape[3]:
+                                # Add circular horizontal padding
+                                latent_view = torch.cat(
+                                    (
+                                        latents[:, :, h_start:h_end, w_start:],
+                                        latents[:, :, h_start:h_end, : w_end - latents.shape[3]],
+                                    ),
+                                    axis=-1,
+                                )
+                            else:
+                                latent_view = latents[:, :, h_start:h_end, w_start:w_end]
+                            latents_for_view.append(latent_view)
+                        latents_for_view = torch.cat(latents_for_view)
+                    else:
+                        latents_for_view = torch.cat(
+                            [
+                                latents[:, :, h_start:h_end, w_start:w_end]
+                                for h_start, h_end, w_start, w_end in batch_view
+                            ]
+                        )
+
+                    # rematch block's scheduler status
+                    self.scheduler.__dict__.update(views_scheduler_status[j])
+
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = (
+                        latents_for_view.repeat_interleave(2, dim=0)
+                        if do_classifier_free_guidance
+                        else latents_for_view
+                    )
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    # repeat prompt_embeds for batch
+                    prompt_embeds_input = torch.cat([prompt_embeds] * vb_size)
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds_input,
+                        timestep_cond=timestep_cond,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                    ).sample
+
+                    # perform guidance
+                    if do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2]
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                        noise_pred = rescale_noise_cfg(
+                            noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
+                        )
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents_denoised_batch = self.scheduler.step(
+                        noise_pred, t, latents_for_view, **extra_step_kwargs
+                    ).prev_sample
+
+                    # save views scheduler status after sample
+                    views_scheduler_status[j] = copy.deepcopy(self.scheduler.__dict__)
+
+                    # extract value from batch
+                    for latents_view_denoised, (h_start, h_end, w_start, w_end) in zip(
+                        latents_denoised_batch.chunk(vb_size), batch_view
+                    ):
+                        if circular_padding and w_end > latents.shape[3]:
+                            # Case for circular padding
+                            value[:, :, h_start:h_end, w_start:] += latents_view_denoised[
+                                :, :, h_start:h_end, : latents.shape[3] - w_start
+                            ]
+                            value[:, :, h_start:h_end, : w_end - latents.shape[3]] += latents_view_denoised[
+                                :, :, h_start:h_end, latents.shape[3] - w_start :
+                            ]
+                            count[:, :, h_start:h_end, w_start:] += 1
+                            count[:, :, h_start:h_end, : w_end - latents.shape[3]] += 1
+                        else:
+                            value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised
+                            count[:, :, h_start:h_end, w_start:w_end] += 1
+
+                # take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113
+                latents = torch.where(count > 0, value / count, value)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if output_type != "latent":
+            if circular_padding:
+                image = self.decode_latents_with_padding(latents)
+            else:
+                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_safe/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion_safe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b432b9418c46257913d81c5bf56edc0f1fa74ed1
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_safe/__init__.py
@@ -0,0 +1,99 @@
+from dataclasses import dataclass
+from enum import Enum
+from typing import TYPE_CHECKING, List, Optional, Union
+
+import numpy as np
+import PIL
+from PIL import Image
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    BaseOutput,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+@dataclass
+class SafetyConfig(object):
+    WEAK = {
+        "sld_warmup_steps": 15,
+        "sld_guidance_scale": 20,
+        "sld_threshold": 0.0,
+        "sld_momentum_scale": 0.0,
+        "sld_mom_beta": 0.0,
+    }
+    MEDIUM = {
+        "sld_warmup_steps": 10,
+        "sld_guidance_scale": 1000,
+        "sld_threshold": 0.01,
+        "sld_momentum_scale": 0.3,
+        "sld_mom_beta": 0.4,
+    }
+    STRONG = {
+        "sld_warmup_steps": 7,
+        "sld_guidance_scale": 2000,
+        "sld_threshold": 0.025,
+        "sld_momentum_scale": 0.5,
+        "sld_mom_beta": 0.7,
+    }
+    MAX = {
+        "sld_warmup_steps": 0,
+        "sld_guidance_scale": 5000,
+        "sld_threshold": 1.0,
+        "sld_momentum_scale": 0.5,
+        "sld_mom_beta": 0.7,
+    }
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {}
+
+_additional_imports.update({"SafetyConfig": SafetyConfig})
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure.update(
+        {
+            "pipeline_output": ["StableDiffusionSafePipelineOutput"],
+            "pipeline_stable_diffusion_safe": ["StableDiffusionPipelineSafe"],
+            "safety_checker": ["StableDiffusionSafetyChecker"],
+        }
+    )
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_output import StableDiffusionSafePipelineOutput
+        from .pipeline_stable_diffusion_safe import StableDiffusionPipelineSafe
+        from .safety_checker import SafeStableDiffusionSafetyChecker
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py b/diffusers/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..69a064d6638df556d3007f59daf7e767ec7c298b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py
@@ -0,0 +1,34 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import (
+    BaseOutput,
+)
+
+
+@dataclass
+class StableDiffusionSafePipelineOutput(BaseOutput):
+    """
+    Output class for Safe Stable Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+        nsfw_content_detected (`List[bool]`)
+            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, or `None` if safety checking could not be performed.
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images that were flagged by the safety checker any may contain "not-safe-for-work"
+            (nsfw) content, or `None` if no safety check was performed or no images were flagged.
+        applied_safety_concept (`str`)
+            The safety concept that was applied for safety guidance, or `None` if safety guidance was disabled
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+    unsafe_images: Optional[Union[List[PIL.Image.Image], np.ndarray]]
+    applied_safety_concept: Optional[str]
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/diffusers/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd59cf51869d3f7f20c685fd6c0cb3fcc64bc4c5
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -0,0 +1,769 @@
+import inspect
+import warnings
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...configuration_utils import FrozenDict
+from ...image_processor import PipelineImageInput
+from ...loaders import IPAdapterMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import deprecate, logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from . import StableDiffusionSafePipelineOutput
+from .safety_checker import SafeStableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAdapterMixin):
+    r"""
+    Pipeline based on the [`StableDiffusionPipeline`] for text-to-image generation using Safe Latent Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: SafeStableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+        safety_concept: Optional[str] = (
+            "an image showing hate, harassment, violence, suffering, humiliation, harm, suicide, sexual, nudity,"
+            " bodily fluids, blood, obscene gestures, illegal activity, drug use, theft, vandalism, weapons, child"
+            " abuse, brutality, cruelty"
+        )
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self._safety_text_concept = safety_concept
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    @property
+    def safety_concept(self):
+        r"""
+        Getter method for the safety concept used with SLD
+
+        Returns:
+            `str`: The text describing the safety concept
+        """
+        return self._safety_text_concept
+
+    @safety_concept.setter
+    def safety_concept(self, concept):
+        r"""
+        Setter method for the safety concept used with SLD
+
+        Args:
+            concept (`str`):
+                The text of the new safety concept
+        """
+        self._safety_text_concept = concept
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt,
+        enable_safety_guidance,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        prompt_embeds = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        prompt_embeds = prompt_embeds[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # Encode the safety concept text
+            if enable_safety_guidance:
+                safety_concept_input = self.tokenizer(
+                    [self._safety_text_concept],
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                safety_embeddings = self.text_encoder(safety_concept_input.input_ids.to(self.device))[0]
+
+                # duplicate safety embeddings for each generation per prompt, using mps friendly method
+                seq_len = safety_embeddings.shape[1]
+                safety_embeddings = safety_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+                safety_embeddings = safety_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+                # For classifier free guidance + sld, we need to do three forward passes.
+                # Here we concatenate the unconditional and text embeddings into a single batch
+                # to avoid doing three forward passes
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds, safety_embeddings])
+
+            else:
+                # For classifier free guidance, we need to do two forward passes.
+                # Here we concatenate the unconditional and text embeddings into a single batch
+                # to avoid doing two forward passes
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype, enable_safety_guidance):
+        if self.safety_checker is not None:
+            images = image.copy()
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+            flagged_images = np.zeros((2, *image.shape[1:]))
+            if any(has_nsfw_concept):
+                logger.warning(
+                    "Potential NSFW content was detected in one or more images. A black image will be returned"
+                    " instead."
+                    f"{'You may look at this images in the `unsafe_images` variable of the output at your own discretion.' if enable_safety_guidance else 'Try again with a different prompt and/or seed.'}"
+                )
+                for idx, has_nsfw_concept in enumerate(has_nsfw_concept):
+                    if has_nsfw_concept:
+                        flagged_images[idx] = images[idx]
+                        image[idx] = np.zeros(image[idx].shape)  # black image
+        else:
+            has_nsfw_concept = None
+            flagged_images = None
+        return image, has_nsfw_concept, flagged_images
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def perform_safety_guidance(
+        self,
+        enable_safety_guidance,
+        safety_momentum,
+        noise_guidance,
+        noise_pred_out,
+        i,
+        sld_guidance_scale,
+        sld_warmup_steps,
+        sld_threshold,
+        sld_momentum_scale,
+        sld_mom_beta,
+    ):
+        # Perform SLD guidance
+        if enable_safety_guidance:
+            if safety_momentum is None:
+                safety_momentum = torch.zeros_like(noise_guidance)
+            noise_pred_text, noise_pred_uncond = noise_pred_out[0], noise_pred_out[1]
+            noise_pred_safety_concept = noise_pred_out[2]
+
+            # Equation 6
+            scale = torch.clamp(torch.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, max=1.0)
+
+            # Equation 6
+            safety_concept_scale = torch.where(
+                (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, torch.zeros_like(scale), scale
+            )
+
+            # Equation 4
+            noise_guidance_safety = torch.mul((noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale)
+
+            # Equation 7
+            noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum
+
+            # Equation 8
+            safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety
+
+            if i >= sld_warmup_steps:  # Warmup
+                # Equation 3
+                noise_guidance = noise_guidance - noise_guidance_safety
+        return noise_guidance, safety_momentum
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        sld_guidance_scale: Optional[float] = 1000,
+        sld_warmup_steps: Optional[int] = 10,
+        sld_threshold: Optional[float] = 0.01,
+        sld_momentum_scale: Optional[float] = 0.3,
+        sld_mom_beta: Optional[float] = 0.4,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            sld_guidance_scale (`float`, *optional*, defaults to 1000):
+                If `sld_guidance_scale < 1`, safety guidance is disabled.
+            sld_warmup_steps (`int`, *optional*, defaults to 10):
+                Number of warmup steps for safety guidance. SLD is only be applied for diffusion steps greater than
+                `sld_warmup_steps`.
+            sld_threshold (`float`, *optional*, defaults to 0.01):
+                Threshold that separates the hyperplane between appropriate and inappropriate images.
+            sld_momentum_scale (`float`, *optional*, defaults to 0.3):
+                Scale of the SLD momentum to be added to the safety guidance at each diffusion step. If set to 0.0,
+                momentum is disabled. Momentum is built up during warmup for diffusion steps smaller than
+                `sld_warmup_steps`.
+            sld_mom_beta (`float`, *optional*, defaults to 0.4):
+                Defines how safety guidance momentum builds up. `sld_mom_beta` indicates how much of the previous
+                momentum is kept. Momentum is built up during warmup for diffusion steps smaller than
+                `sld_warmup_steps`.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+
+        Examples:
+
+        ```py
+        import torch
+        from diffusers import StableDiffusionPipelineSafe
+        from diffusers.pipelines.stable_diffusion_safe import SafetyConfig
+
+        pipeline = StableDiffusionPipelineSafe.from_pretrained(
+            "AIML-TUDA/stable-diffusion-safe", torch_dtype=torch.float16
+        ).to("cuda")
+        prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. leyendecker"
+        image = pipeline(prompt=prompt, **SafetyConfig.MEDIUM).images[0]
+        ```
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        enable_safety_guidance = sld_guidance_scale > 1.0 and do_classifier_free_guidance
+        if not enable_safety_guidance:
+            warnings.warn("Safety checker disabled!")
+
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
+            )
+            if do_classifier_free_guidance:
+                if enable_safety_guidance:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds, image_embeds])
+                else:
+                    image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, enable_safety_guidance
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
+        safety_momentum = None
+
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = (
+                    torch.cat([latents] * (3 if enable_safety_guidance else 2))
+                    if do_classifier_free_guidance
+                    else latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input, t, encoder_hidden_states=prompt_embeds, added_cond_kwargs=added_cond_kwargs
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_out = noise_pred.chunk((3 if enable_safety_guidance else 2))
+                    noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1]
+
+                    # default classifier free guidance
+                    noise_guidance = noise_pred_text - noise_pred_uncond
+
+                    # Perform SLD guidance
+                    if enable_safety_guidance:
+                        if safety_momentum is None:
+                            safety_momentum = torch.zeros_like(noise_guidance)
+                        noise_pred_safety_concept = noise_pred_out[2]
+
+                        # Equation 6
+                        scale = torch.clamp(
+                            torch.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, max=1.0
+                        )
+
+                        # Equation 6
+                        safety_concept_scale = torch.where(
+                            (noise_pred_text - noise_pred_safety_concept) >= sld_threshold,
+                            torch.zeros_like(scale),
+                            scale,
+                        )
+
+                        # Equation 4
+                        noise_guidance_safety = torch.mul(
+                            (noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale
+                        )
+
+                        # Equation 7
+                        noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum
+
+                        # Equation 8
+                        safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety
+
+                        if i >= sld_warmup_steps:  # Warmup
+                            # Equation 3
+                            noise_guidance = noise_guidance - noise_guidance_safety
+
+                    noise_pred = noise_pred_uncond + guidance_scale * noise_guidance
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 8. Post-processing
+        image = self.decode_latents(latents)
+
+        # 9. Run safety checker
+        image, has_nsfw_concept, flagged_images = self.run_safety_checker(
+            image, device, prompt_embeds.dtype, enable_safety_guidance
+        )
+
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+            if flagged_images is not None:
+                flagged_images = self.numpy_to_pil(flagged_images)
+
+        if not return_dict:
+            return (
+                image,
+                has_nsfw_concept,
+                self._safety_text_concept if enable_safety_guidance else None,
+                flagged_images,
+            )
+
+        return StableDiffusionSafePipelineOutput(
+            images=image,
+            nsfw_content_detected=has_nsfw_concept,
+            applied_safety_concept=self._safety_text_concept if enable_safety_guidance else None,
+            unsafe_images=flagged_images,
+        )
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py b/diffusers/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..338e4c65c500a902ca483c79e8311e8796cc7c03
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py
@@ -0,0 +1,109 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+from transformers import CLIPConfig, CLIPVisionModel, PreTrainedModel
+
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def cosine_distance(image_embeds, text_embeds):
+    normalized_image_embeds = nn.functional.normalize(image_embeds)
+    normalized_text_embeds = nn.functional.normalize(text_embeds)
+    return torch.mm(normalized_image_embeds, normalized_text_embeds.t())
+
+
+class SafeStableDiffusionSafetyChecker(PreTrainedModel):
+    config_class = CLIPConfig
+
+    _no_split_modules = ["CLIPEncoderLayer"]
+
+    def __init__(self, config: CLIPConfig):
+        super().__init__(config)
+
+        self.vision_model = CLIPVisionModel(config.vision_config)
+        self.visual_projection = nn.Linear(config.vision_config.hidden_size, config.projection_dim, bias=False)
+
+        self.concept_embeds = nn.Parameter(torch.ones(17, config.projection_dim), requires_grad=False)
+        self.special_care_embeds = nn.Parameter(torch.ones(3, config.projection_dim), requires_grad=False)
+
+        self.concept_embeds_weights = nn.Parameter(torch.ones(17), requires_grad=False)
+        self.special_care_embeds_weights = nn.Parameter(torch.ones(3), requires_grad=False)
+
+    @torch.no_grad()
+    def forward(self, clip_input, images):
+        pooled_output = self.vision_model(clip_input)[1]  # pooled_output
+        image_embeds = self.visual_projection(pooled_output)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).cpu().float().numpy()
+        cos_dist = cosine_distance(image_embeds, self.concept_embeds).cpu().float().numpy()
+
+        result = []
+        batch_size = image_embeds.shape[0]
+        for i in range(batch_size):
+            result_img = {"special_scores": {}, "special_care": [], "concept_scores": {}, "bad_concepts": []}
+
+            # increase this value to create a stronger `nfsw` filter
+            # at the cost of increasing the possibility of filtering benign images
+            adjustment = 0.0
+
+            for concept_idx in range(len(special_cos_dist[0])):
+                concept_cos = special_cos_dist[i][concept_idx]
+                concept_threshold = self.special_care_embeds_weights[concept_idx].item()
+                result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
+                if result_img["special_scores"][concept_idx] > 0:
+                    result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]})
+                    adjustment = 0.01
+
+            for concept_idx in range(len(cos_dist[0])):
+                concept_cos = cos_dist[i][concept_idx]
+                concept_threshold = self.concept_embeds_weights[concept_idx].item()
+                result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
+                if result_img["concept_scores"][concept_idx] > 0:
+                    result_img["bad_concepts"].append(concept_idx)
+
+            result.append(result_img)
+
+        has_nsfw_concepts = [len(res["bad_concepts"]) > 0 for res in result]
+
+        return images, has_nsfw_concepts
+
+    @torch.no_grad()
+    def forward_onnx(self, clip_input: torch.Tensor, images: torch.Tensor):
+        pooled_output = self.vision_model(clip_input)[1]  # pooled_output
+        image_embeds = self.visual_projection(pooled_output)
+
+        special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds)
+        cos_dist = cosine_distance(image_embeds, self.concept_embeds)
+
+        # increase this value to create a stronger `nsfw` filter
+        # at the cost of increasing the possibility of filtering benign images
+        adjustment = 0.0
+
+        special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment
+        # special_scores = special_scores.round(decimals=3)
+        special_care = torch.any(special_scores > 0, dim=1)
+        special_adjustment = special_care * 0.01
+        special_adjustment = special_adjustment.unsqueeze(1).expand(-1, cos_dist.shape[1])
+
+        concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment
+        # concept_scores = concept_scores.round(decimals=3)
+        has_nsfw_concepts = torch.any(concept_scores > 0, dim=1)
+
+        return images, has_nsfw_concepts
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_sag/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion_sag/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..378e0e57817f58a0a28afed5d6110f6ee3effb3a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_sag/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/diffusers/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py
new file mode 100644
index 0000000000000000000000000000000000000000..c32052d2e4d019ef7a5d305381c8063fe78a0991
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py
@@ -0,0 +1,954 @@
+# Copyright 2024 Susung Hong and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionSAGPipeline
+
+        >>> pipe = StableDiffusionSAGPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt, sag_scale=0.75).images[0]
+        ```
+"""
+
+
+# processes and stores attention probabilities
+class CrossAttnStoreProcessor:
+    def __init__(self):
+        self.attention_probs = None
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+    ):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        self.attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(self.attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+# Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input
+class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            image_embeds = ip_adapter_image_embeds
+        return image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        sag_scale: float = 0.75,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            sag_scale (`float`, *optional*, defaults to 0.75):
+                Chosen between [0, 1.0] for better quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the
+                `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # and `sag_scale` is` `s` of equation (16)
+        # of the self-attention guidance paper: https://arxiv.org/pdf/2210.00939.pdf
+        # `sag_scale = 0` means no self-attention guidance
+        do_self_attention_guidance = sag_scale > 0.0
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                do_classifier_free_guidance,
+            )
+
+            if do_classifier_free_guidance:
+                image_embeds = []
+                negative_image_embeds = []
+                for tmp_image_embeds in ip_adapter_image_embeds:
+                    single_negative_image_embeds, single_image_embeds = tmp_image_embeds.chunk(2)
+                    image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds)
+            else:
+                image_embeds = ip_adapter_image_embeds
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        if timesteps.dtype not in [torch.int16, torch.int32, torch.int64]:
+            raise ValueError(
+                f"{self.__class__.__name__} does not support using a scheduler of type {self.scheduler.__class__.__name__}. Please make sure to use one of 'DDIMScheduler, PNDMScheduler, DDPMScheduler, DEISMultistepScheduler, UniPCMultistepScheduler, DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler'."
+            )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
+
+        if do_classifier_free_guidance:
+            added_uncond_kwargs = (
+                {"image_embeds": negative_image_embeds}
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+                else None
+            )
+
+        # 7. Denoising loop
+        original_attn_proc = self.unet.attn_processors
+        store_processor = CrossAttnStoreProcessor()
+        self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+        map_size = None
+
+        def get_map_size(module, input, output):
+            nonlocal map_size
+            map_size = output[0].shape[-2:]
+
+        with self.unet.mid_block.attentions[0].register_forward_hook(get_map_size):
+            with self.progress_bar(total=num_inference_steps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    # predict the noise residual
+
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                    ).sample
+
+                    # perform guidance
+                    if do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # perform self-attention guidance with the stored self-attention map
+                    if do_self_attention_guidance:
+                        # classifier-free guidance produces two chunks of attention map
+                        # and we only use unconditional one according to equation (25)
+                        # in https://arxiv.org/pdf/2210.00939.pdf
+                        if do_classifier_free_guidance:
+                            # DDIM-like prediction of x0
+                            pred_x0 = self.pred_x0(latents, noise_pred_uncond, t)
+                            # get the stored attention maps
+                            uncond_attn, cond_attn = store_processor.attention_probs.chunk(2)
+                            # self-attention-based degrading of latents
+                            degraded_latents = self.sag_masking(
+                                pred_x0, uncond_attn, map_size, t, self.pred_epsilon(latents, noise_pred_uncond, t)
+                            )
+                            uncond_emb, _ = prompt_embeds.chunk(2)
+                            # forward and give guidance
+                            degraded_pred = self.unet(
+                                degraded_latents,
+                                t,
+                                encoder_hidden_states=uncond_emb,
+                                added_cond_kwargs=added_uncond_kwargs,
+                            ).sample
+                            noise_pred += sag_scale * (noise_pred_uncond - degraded_pred)
+                        else:
+                            # DDIM-like prediction of x0
+                            pred_x0 = self.pred_x0(latents, noise_pred, t)
+                            # get the stored attention maps
+                            cond_attn = store_processor.attention_probs
+                            # self-attention-based degrading of latents
+                            degraded_latents = self.sag_masking(
+                                pred_x0, cond_attn, map_size, t, self.pred_epsilon(latents, noise_pred, t)
+                            )
+                            # forward and give guidance
+                            degraded_pred = self.unet(
+                                degraded_latents,
+                                t,
+                                encoder_hidden_states=prompt_embeds,
+                                added_cond_kwargs=added_cond_kwargs,
+                            ).sample
+                            noise_pred += sag_scale * (noise_pred - degraded_pred)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        if callback is not None and i % callback_steps == 0:
+                            step_idx = i // getattr(self.scheduler, "order", 1)
+                            callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        self.maybe_free_model_hooks()
+        # make sure to set the original attention processors back
+        self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def sag_masking(self, original_latents, attn_map, map_size, t, eps):
+        # Same masking process as in SAG paper: https://arxiv.org/pdf/2210.00939.pdf
+        bh, hw1, hw2 = attn_map.shape
+        b, latent_channel, latent_h, latent_w = original_latents.shape
+        h = self.unet.config.attention_head_dim
+        if isinstance(h, list):
+            h = h[-1]
+
+        # Produce attention mask
+        attn_map = attn_map.reshape(b, h, hw1, hw2)
+        attn_mask = attn_map.mean(1, keepdim=False).sum(1, keepdim=False) > 1.0
+        attn_mask = (
+            attn_mask.reshape(b, map_size[0], map_size[1])
+            .unsqueeze(1)
+            .repeat(1, latent_channel, 1, 1)
+            .type(attn_map.dtype)
+        )
+        attn_mask = F.interpolate(attn_mask, (latent_h, latent_w))
+
+        # Blur according to the self-attention mask
+        degraded_latents = gaussian_blur_2d(original_latents, kernel_size=9, sigma=1.0)
+        degraded_latents = degraded_latents * attn_mask + original_latents * (1 - attn_mask)
+
+        # Noise it again to match the noise level
+        degraded_latents = self.scheduler.add_noise(degraded_latents, noise=eps, timesteps=t[None])
+
+        return degraded_latents
+
+    # Modified from diffusers.schedulers.scheduling_ddim.DDIMScheduler.step
+    # Note: there are some schedulers that clip or do not return x_0 (PNDMScheduler, DDIMScheduler, etc.)
+    def pred_x0(self, sample, model_output, timestep):
+        alpha_prod_t = self.scheduler.alphas_cumprod[timestep].to(sample.device)
+
+        beta_prod_t = 1 - alpha_prod_t
+        if self.scheduler.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.scheduler.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.scheduler.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            # predict V
+            model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`,"
+                " or `v_prediction`"
+            )
+
+        return pred_original_sample
+
+    def pred_epsilon(self, sample, model_output, timestep):
+        alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+
+        beta_prod_t = 1 - alpha_prod_t
+        if self.scheduler.config.prediction_type == "epsilon":
+            pred_eps = model_output
+        elif self.scheduler.config.prediction_type == "sample":
+            pred_eps = (sample - (alpha_prod_t**0.5) * model_output) / (beta_prod_t**0.5)
+        elif self.scheduler.config.prediction_type == "v_prediction":
+            pred_eps = (beta_prod_t**0.5) * sample + (alpha_prod_t**0.5) * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`,"
+                " or `v_prediction`"
+            )
+
+        return pred_eps
+
+
+# Gaussian blur
+def gaussian_blur_2d(img, kernel_size, sigma):
+    ksize_half = (kernel_size - 1) * 0.5
+
+    x = torch.linspace(-ksize_half, ksize_half, steps=kernel_size)
+
+    pdf = torch.exp(-0.5 * (x / sigma).pow(2))
+
+    x_kernel = pdf / pdf.sum()
+    x_kernel = x_kernel.to(device=img.device, dtype=img.dtype)
+
+    kernel2d = torch.mm(x_kernel[:, None], x_kernel[None, :])
+    kernel2d = kernel2d.expand(img.shape[-3], 1, kernel2d.shape[0], kernel2d.shape[1])
+
+    padding = [kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2]
+
+    img = F.pad(img, padding, mode="reflect")
+    img = F.conv2d(img, kernel2d, groups=img.shape[-3])
+
+    return img
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_xl/__init__.py b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8088fbcfceba205b9b908613f4ca3fdc579120e8
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/__init__.py
@@ -0,0 +1,76 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["StableDiffusionXLPipelineOutput"]}
+
+if is_transformers_available() and is_flax_available():
+    _import_structure["pipeline_output"].extend(["FlaxStableDiffusionXLPipelineOutput"])
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_xl"] = ["StableDiffusionXLPipeline"]
+    _import_structure["pipeline_stable_diffusion_xl_img2img"] = ["StableDiffusionXLImg2ImgPipeline"]
+    _import_structure["pipeline_stable_diffusion_xl_inpaint"] = ["StableDiffusionXLInpaintPipeline"]
+    _import_structure["pipeline_stable_diffusion_xl_instruct_pix2pix"] = ["StableDiffusionXLInstructPix2PixPipeline"]
+
+if is_transformers_available() and is_flax_available():
+    from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState
+
+    _additional_imports.update({"PNDMSchedulerState": PNDMSchedulerState})
+    _import_structure["pipeline_flax_stable_diffusion_xl"] = ["FlaxStableDiffusionXLPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_stable_diffusion_xl import StableDiffusionXLPipeline
+        from .pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipeline
+        from .pipeline_stable_diffusion_xl_inpaint import StableDiffusionXLInpaintPipeline
+        from .pipeline_stable_diffusion_xl_instruct_pix2pix import StableDiffusionXLInstructPix2PixPipeline
+
+    try:
+        if not (is_transformers_available() and is_flax_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_flax_objects import *
+    else:
+        from .pipeline_flax_stable_diffusion_xl import (
+            FlaxStableDiffusionXLPipeline,
+        )
+        from .pipeline_output import FlaxStableDiffusionXLPipelineOutput
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..77363b2546d72836618d4fad3d24ca572d1c8f12
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py
@@ -0,0 +1,308 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+from typing import Dict, List, Optional, Union
+
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict
+from transformers import CLIPTokenizer, FlaxCLIPTextModel
+
+from diffusers.utils import logging
+
+from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel
+from ...schedulers import (
+    FlaxDDIMScheduler,
+    FlaxDPMSolverMultistepScheduler,
+    FlaxLMSDiscreteScheduler,
+    FlaxPNDMScheduler,
+)
+from ..pipeline_flax_utils import FlaxDiffusionPipeline
+from .pipeline_output import FlaxStableDiffusionXLPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Set to True to use python for loop instead of jax.fori_loop for easier debugging
+DEBUG = False
+
+
+class FlaxStableDiffusionXLPipeline(FlaxDiffusionPipeline):
+    def __init__(
+        self,
+        text_encoder: FlaxCLIPTextModel,
+        text_encoder_2: FlaxCLIPTextModel,
+        vae: FlaxAutoencoderKL,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: FlaxUNet2DConditionModel,
+        scheduler: Union[
+            FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
+        ],
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        super().__init__()
+        self.dtype = dtype
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+    def prepare_inputs(self, prompt: Union[str, List[str]]):
+        if not isinstance(prompt, (str, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        # Assume we have the two encoders
+        inputs = []
+        for tokenizer in [self.tokenizer, self.tokenizer_2]:
+            text_inputs = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            inputs.append(text_inputs.input_ids)
+        inputs = jnp.stack(inputs, axis=1)
+        return inputs
+
+    def __call__(
+        self,
+        prompt_ids: jax.Array,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.Array,
+        num_inference_steps: int = 50,
+        guidance_scale: Union[float, jax.Array] = 7.5,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        latents: jnp.array = None,
+        neg_prompt_ids: jnp.array = None,
+        return_dict: bool = True,
+        output_type: str = None,
+        jit: bool = False,
+    ):
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        if isinstance(guidance_scale, float) and jit:
+            # Convert to a tensor so each device gets a copy.
+            guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0])
+            guidance_scale = guidance_scale[:, None]
+
+        return_latents = output_type == "latent"
+
+        if jit:
+            images = _p_generate(
+                self,
+                prompt_ids,
+                params,
+                prng_seed,
+                num_inference_steps,
+                height,
+                width,
+                guidance_scale,
+                latents,
+                neg_prompt_ids,
+                return_latents,
+            )
+        else:
+            images = self._generate(
+                prompt_ids,
+                params,
+                prng_seed,
+                num_inference_steps,
+                height,
+                width,
+                guidance_scale,
+                latents,
+                neg_prompt_ids,
+                return_latents,
+            )
+
+        if not return_dict:
+            return (images,)
+
+        return FlaxStableDiffusionXLPipelineOutput(images=images)
+
+    def get_embeddings(self, prompt_ids: jnp.array, params):
+        # We assume we have the two encoders
+
+        # bs, encoder_input, seq_length
+        te_1_inputs = prompt_ids[:, 0, :]
+        te_2_inputs = prompt_ids[:, 1, :]
+
+        prompt_embeds = self.text_encoder(te_1_inputs, params=params["text_encoder"], output_hidden_states=True)
+        prompt_embeds = prompt_embeds["hidden_states"][-2]
+        prompt_embeds_2_out = self.text_encoder_2(
+            te_2_inputs, params=params["text_encoder_2"], output_hidden_states=True
+        )
+        prompt_embeds_2 = prompt_embeds_2_out["hidden_states"][-2]
+        text_embeds = prompt_embeds_2_out["text_embeds"]
+        prompt_embeds = jnp.concatenate([prompt_embeds, prompt_embeds_2], axis=-1)
+        return prompt_embeds, text_embeds
+
+    def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, bs, dtype):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+        add_time_ids = jnp.array([add_time_ids] * bs, dtype=dtype)
+        return add_time_ids
+
+    def _generate(
+        self,
+        prompt_ids: jnp.array,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.Array,
+        num_inference_steps: int,
+        height: int,
+        width: int,
+        guidance_scale: float,
+        latents: Optional[jnp.array] = None,
+        neg_prompt_ids: Optional[jnp.array] = None,
+        return_latents=False,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # Encode input prompt
+        prompt_embeds, pooled_embeds = self.get_embeddings(prompt_ids, params)
+
+        # Get unconditional embeddings
+        batch_size = prompt_embeds.shape[0]
+        if neg_prompt_ids is None:
+            neg_prompt_embeds = jnp.zeros_like(prompt_embeds)
+            negative_pooled_embeds = jnp.zeros_like(pooled_embeds)
+        else:
+            neg_prompt_embeds, negative_pooled_embeds = self.get_embeddings(neg_prompt_ids, params)
+
+        add_time_ids = self._get_add_time_ids(
+            (height, width), (0, 0), (height, width), prompt_embeds.shape[0], dtype=prompt_embeds.dtype
+        )
+
+        prompt_embeds = jnp.concatenate([neg_prompt_embeds, prompt_embeds], axis=0)  # (2, 77, 2048)
+        add_text_embeds = jnp.concatenate([negative_pooled_embeds, pooled_embeds], axis=0)
+        add_time_ids = jnp.concatenate([add_time_ids, add_time_ids], axis=0)
+
+        # Ensure model output will be `float32` before going into the scheduler
+        guidance_scale = jnp.array([guidance_scale], dtype=jnp.float32)
+
+        # Create random latents
+        latents_shape = (
+            batch_size,
+            self.unet.config.in_channels,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if latents is None:
+            latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+        # Prepare scheduler state
+        scheduler_state = self.scheduler.set_timesteps(
+            params["scheduler"], num_inference_steps=num_inference_steps, shape=latents.shape
+        )
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * scheduler_state.init_noise_sigma
+
+        added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+        # Denoising loop
+        def loop_body(step, args):
+            latents, scheduler_state = args
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            latents_input = jnp.concatenate([latents] * 2)
+
+            t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+            timestep = jnp.broadcast_to(t, latents_input.shape[0])
+
+            latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet.apply(
+                {"params": params["unet"]},
+                jnp.array(latents_input),
+                jnp.array(timestep, dtype=jnp.int32),
+                encoder_hidden_states=prompt_embeds,
+                added_cond_kwargs=added_cond_kwargs,
+            ).sample
+            # perform guidance
+            noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
+            return latents, scheduler_state
+
+        if DEBUG:
+            # run with python for loop
+            for i in range(num_inference_steps):
+                latents, scheduler_state = loop_body(i, (latents, scheduler_state))
+        else:
+            latents, _ = jax.lax.fori_loop(0, num_inference_steps, loop_body, (latents, scheduler_state))
+
+        if return_latents:
+            return latents
+
+        # Decode latents
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
+
+        image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
+        return image
+
+
+# Static argnums are pipe, num_inference_steps, height, width, return_latents. A change would trigger recompilation.
+# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`).
+@partial(
+    jax.pmap,
+    in_axes=(None, 0, 0, 0, None, None, None, 0, 0, 0, None),
+    static_broadcasted_argnums=(0, 4, 5, 6, 10),
+)
+def _p_generate(
+    pipe,
+    prompt_ids,
+    params,
+    prng_seed,
+    num_inference_steps,
+    height,
+    width,
+    guidance_scale,
+    latents,
+    neg_prompt_ids,
+    return_latents,
+):
+    return pipe._generate(
+        prompt_ids,
+        params,
+        prng_seed,
+        num_inference_steps,
+        height,
+        width,
+        guidance_scale,
+        latents,
+        neg_prompt_ids,
+        return_latents,
+    )
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..0783f44486ee1448bd15529f745af381ee7fa69f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py
@@ -0,0 +1,37 @@
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput, is_flax_available
+
+
+@dataclass
+class StableDiffusionXLPipelineOutput(BaseOutput):
+    """
+    Output class for Stable Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+if is_flax_available():
+    import flax
+
+    @flax.struct.dataclass
+    class FlaxStableDiffusionXLPipelineOutput(BaseOutput):
+        """
+        Output class for Flax Stable Diffusion XL pipelines.
+
+        Args:
+            images (`np.ndarray`)
+                Array of shape `(batch_size, height, width, num_channels)` with images from the diffusion pipeline.
+        """
+
+        images: np.ndarray
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..94b7283cb6c3757cf82cb4f0ba390707bd65bbea
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -0,0 +1,1300 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import StableDiffusionXLPipelineOutput
+
+
+if is_invisible_watermark_available():
+    from .watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionXLPipeline
+
+        >>> pipe = StableDiffusionXLPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.default_sample_size = self.unet.config.sample_size
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 8.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 9. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..29b5e11875fc0f31f0e7795fed813d715c6e4269
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -0,0 +1,1494 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import PIL.Image
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import StableDiffusionXLPipelineOutput
+
+
+if is_invisible_watermark_available():
+    from .watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionXLImg2ImgPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
+
+        >>> init_image = load_image(url).convert("RGB")
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt, image=init_image).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLImg2ImgPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    IPAdapterMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
+            Whether the `unet` requires an `aesthetic_score` condition to be passed during inference. Also see the
+            config of `stabilityai/stable-diffusion-xl-refiner-1-0`.
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        requires_aesthetics_score: bool = False,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        strength,
+        num_inference_steps,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+        if num_inference_steps is None:
+            raise ValueError("`num_inference_steps` cannot be None.")
+        elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
+            raise ValueError(
+                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
+                f" {type(num_inference_steps)}."
+            )
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+
+            timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+            return timesteps, num_inference_steps - t_start
+
+        else:
+            # Strength is irrelevant if we directly request a timestep to start at;
+            # that is, strength is determined by the denoising_start instead.
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            t_start = len(self.scheduler.timesteps) - num_inference_steps
+            timesteps = self.scheduler.timesteps[t_start:]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start)
+            return timesteps, num_inference_steps
+
+    def prepare_latents(
+        self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
+    ):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        latents_mean = latents_std = None
+        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
+
+        # Offload text encoder if `enable_model_cpu_offload` was enabled
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.text_encoder_2.to("cpu")
+            torch.cuda.empty_cache()
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+
+        else:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                image = image.float()
+                self.vae.to(dtype=torch.float32)
+
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                    raise ValueError(
+                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+
+            init_latents = init_latents.to(dtype)
+            if latents_mean is not None and latents_std is not None:
+                latents_mean = latents_mean.to(device=device, dtype=dtype)
+                latents_std = latents_std.to(device=device, dtype=dtype)
+                init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
+            else:
+                init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        if add_noise:
+            shape = init_latents.shape
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # get latents
+            init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+        latents = init_latents
+
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        negative_original_size,
+        negative_crops_coords_top_left,
+        negative_target_size,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(
+                negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+            )
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if (
+            expected_add_embed_dim > passed_add_embed_dim
+            and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+            )
+        elif (
+            expected_add_embed_dim < passed_add_embed_dim
+            and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+            )
+        elif expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+        return add_time_ids, add_neg_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def denoising_start(self):
+        return self._denoising_start
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.3,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
+                The image(s) to modify with the pipeline.
+            strength (`float`, *optional*, defaults to 0.3):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. Note that in the case of
+                `denoising_start` being declared as an integer, the value of `strength` will be ignored.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refine Image
+                Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality).
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
+                denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
+                final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
+                forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refine Image
+                Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality).
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            strength,
+            num_inference_steps,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image)
+
+        # 5. Prepare timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
+        )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        add_noise = True if self.denoising_start is None else False
+
+        # 6. Prepare latent variables
+        if latents is None:
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                add_noise,
+            )
+        # 7. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 8. Prepare added time ids & embeddings
+        if negative_original_size is None:
+            negative_original_size = original_size
+        if negative_target_size is None:
+            negative_target_size = target_size
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            negative_original_size,
+            negative_crops_coords_top_left,
+            negative_target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 9. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 9.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and self.denoising_start is not None
+            and denoising_value_valid(self.denoising_end)
+            and denoising_value_valid(self.denoising_start)
+            and self.denoising_start >= self.denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {self.denoising_end} when using type float."
+            )
+        elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 9.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..d28a9afbfb7aad890d45b23dc3b3d4a1b55d3f2b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
@@ -0,0 +1,1732 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import StableDiffusionXLPipelineOutput
+
+
+if is_invisible_watermark_available():
+    from .watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionXLInpaintPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0",
+        ...     torch_dtype=torch.float16,
+        ...     variant="fp16",
+        ...     use_safetensors=True,
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+        >>> init_image = load_image(img_url).convert("RGB")
+        >>> mask_image = load_image(mask_url).convert("RGB")
+
+        >>> prompt = "A majestic tiger sitting on a bench"
+        >>> image = pipe(
+        ...     prompt=prompt, image=init_image, mask_image=mask_image, num_inference_steps=50, strength=0.80
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+def mask_pil_to_torch(mask, height, width):
+    # preprocess mask
+    if isinstance(mask, (PIL.Image.Image, np.ndarray)):
+        mask = [mask]
+
+    if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+        mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
+        mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+        mask = mask.astype(np.float32) / 255.0
+    elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+        mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+
+    mask = torch.from_numpy(mask)
+    return mask
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLInpaintPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    FromSingleFileMixin,
+    IPAdapterMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
+            Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config
+            of `stabilityai/stable-diffusion-xl-refiner-1-0`.
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+        "mask",
+        "masked_image_latents",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        requires_aesthetics_score: bool = False,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+        )
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        mask_image,
+        height,
+        width,
+        strength,
+        callback_steps,
+        output_type,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        padding_mask_crop=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if padding_mask_crop is not None:
+            if not isinstance(image, PIL.Image.Image):
+                raise ValueError(
+                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                )
+            if not isinstance(mask_image, PIL.Image.Image):
+                raise ValueError(
+                    f"The mask image should be a PIL image when inpainting mask crop, but is of type"
+                    f" {type(mask_image)}."
+                )
+            if output_type != "pil":
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        add_noise=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if image.shape[1] == 4:
+            image_latents = image.to(device=device, dtype=dtype)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+        elif return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+            image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None and add_noise:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+            # if pure noise then scale the initial latents by the  Scheduler's init sigma
+            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+        elif add_noise:
+            noise = latents.to(device)
+            latents = noise * self.scheduler.init_noise_sigma
+        else:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = image_latents.to(device)
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        dtype = image.dtype
+        if self.vae.config.force_upcast:
+            image = image.float()
+            self.vae.to(dtype=torch.float32)
+
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        if self.vae.config.force_upcast:
+            self.vae.to(dtype)
+
+        image_latents = image_latents.to(dtype)
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+
+        if masked_image is not None and masked_image.shape[1] == 4:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = None
+
+        if masked_image is not None:
+            if masked_image_latents is None:
+                masked_image = masked_image.to(device=device, dtype=dtype)
+                masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+
+            if masked_image_latents.shape[0] < batch_size:
+                if not batch_size % masked_image_latents.shape[0] == 0:
+                    raise ValueError(
+                        "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                        f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                        " Make sure the number of images that you pass is divisible by the total requested batch size."
+                    )
+                masked_image_latents = masked_image_latents.repeat(
+                    batch_size // masked_image_latents.shape[0], 1, 1, 1
+                )
+
+            masked_image_latents = (
+                torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+            )
+
+            # aligning device to prevent device errors when concating it with the latent model input
+            masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        return mask, masked_image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+
+            timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+            return timesteps, num_inference_steps - t_start
+
+        else:
+            # Strength is irrelevant if we directly request a timestep to start at;
+            # that is, strength is determined by the denoising_start instead.
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            t_start = len(self.scheduler.timesteps) - num_inference_steps
+            timesteps = self.scheduler.timesteps[t_start:]
+            if hasattr(self.scheduler, "set_begin_index"):
+                self.scheduler.set_begin_index(t_start)
+            return timesteps, num_inference_steps
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        negative_original_size,
+        negative_crops_coords_top_left,
+        negative_target_size,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(
+                negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+            )
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if (
+            expected_add_embed_dim > passed_add_embed_dim
+            and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+            )
+        elif (
+            expected_add_embed_dim < passed_add_embed_dim
+            and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+            )
+        elif expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+        return add_time_ids, add_neg_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def denoising_start(self):
+        return self._denoising_start
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        masked_image_latents: torch.Tensor = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        padding_mask_crop: Optional[int] = None,
+        strength: float = 0.9999,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            padding_mask_crop (`int`, *optional*, defaults to `None`):
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
+            strength (`float`, *optional*, defaults to 0.9999):
+                Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
+                between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
+                `strength`. The number of denoising steps depends on the amount of noise initially added. When
+                `strength` is 1, added noise will be maximum and the denoising process will run for the full number of
+                iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked
+                portion of the reference `image`. Note that in the case of `denoising_start` being declared as an
+                integer, the value of `strength` will be ignored.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
+                denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
+                final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
+                forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple. `tuple. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            image,
+            mask_image,
+            height,
+            width,
+            strength,
+            callback_steps,
+            output_type,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+            padding_mask_crop,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. set timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(dnv, float) and 0 < dnv < 1
+
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
+        )
+        # check that number of inference steps is not < 1 - as this doesn't make sense
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+
+        # 5. Preprocess mask and image
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        mask = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+
+        if masked_image_latents is not None:
+            masked_image = masked_image_latents
+        elif init_image.shape[1] == 4:
+            # if images are in latent space, we can't mask it
+            masked_image = None
+        else:
+            masked_image = init_image * (mask < 0.5)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
+        add_noise = True if self.denoising_start is None else False
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            add_noise=add_noise,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 7. Prepare mask latent variables
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            self.do_classifier_free_guidance,
+        )
+
+        # 8. Check that sizes of mask, masked image and latents match
+        if num_channels_unet == 9:
+            # default case for runwayml/stable-diffusion-inpainting
+            num_channels_mask = mask.shape[1]
+            num_channels_masked_image = masked_image_latents.shape[1]
+            if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+                raise ValueError(
+                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    " `pipeline.unet` or your `mask_image` or `image` input."
+                )
+        elif num_channels_unet != 4:
+            raise ValueError(
+                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+            )
+        # 8.1 Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 10. Prepare added time ids & embeddings
+        if negative_original_size is None:
+            negative_original_size = original_size
+        if negative_target_size is None:
+            negative_target_size = target_size
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            negative_original_size,
+            negative_crops_coords_top_left,
+            negative_target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 11. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        if (
+            self.denoising_end is not None
+            and self.denoising_start is not None
+            and denoising_value_valid(self.denoising_end)
+            and denoising_value_valid(self.denoising_start)
+            and self.denoising_start >= self.denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {self.denoising_end} when using type float."
+            )
+        elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 11.1 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if num_channels_unet == 9:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if num_channels_unet == 4:
+                    init_latents_proper = image_latents
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+                    mask = callback_outputs.pop("mask", mask)
+                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            return StableDiffusionXLPipelineOutput(images=latents)
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        if padding_mask_crop is not None:
+            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
new file mode 100644
index 0000000000000000000000000000000000000000..b59f2312726d0071c70317f34f14b00986c02f69
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
@@ -0,0 +1,992 @@
+# Copyright 2024 Harutatsu Akiyama and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import PIL.Image
+import torch
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import StableDiffusionXLPipelineOutput
+
+
+if is_invisible_watermark_available():
+    from .watermark import StableDiffusionXLWatermarker
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionXLInstructPix2PixPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> resolution = 768
+        >>> image = load_image(
+        ...     "https://hf.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
+        ... ).resize((resolution, resolution))
+        >>> edit_instruction = "Turn sky into a cloudy one"
+
+        >>> pipe = StableDiffusionXLInstructPix2PixPipeline.from_pretrained(
+        ...     "diffusers/sdxl-instructpix2pix-768", torch_dtype=torch.float16
+        ... ).to("cuda")
+
+        >>> edited_image = pipe(
+        ...     prompt=edit_instruction,
+        ...     image=image,
+        ...     height=resolution,
+        ...     width=resolution,
+        ...     guidance_scale=3.0,
+        ...     image_guidance_scale=1.5,
+        ...     num_inference_steps=30,
+        ... ).images[0]
+        >>> edited_image
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class StableDiffusionXLInstructPix2PixPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+):
+    r"""
+    Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
+            Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config
+            of `stabilityai/stable-diffusion-xl-refiner-1-0`.
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+        is_cosxl_edit (`bool`, *optional*):
+            When set the image latents are scaled.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+    _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+        is_cosxl_edit: Optional[bool] = False,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.default_sample_size = self.unet.config.sample_size
+        self.is_cosxl_edit = is_cosxl_edit
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(
+                    text_input_ids.to(device),
+                    output_hidden_states=True,
+                )
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                prompt_embeds = prompt_embeds.hidden_states[-2]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        prompt_embeds_dtype = self.text_encoder_2.dtype if self.text_encoder_2 is not None else self.unet.dtype
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_image_latents(
+        self, image, batch_size, num_images_per_prompt, dtype, device, do_classifier_free_guidance, generator=None
+    ):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            image_latents = image
+        else:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+            if needs_upcasting:
+                image = image.float()
+                self.upcast_vae()
+
+            image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax")
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand image_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_image_latents = torch.zeros_like(image_latents)
+            image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0)
+
+        if image_latents.dtype != self.vae.dtype:
+            image_latents = image_latents.to(dtype=self.vae.dtype)
+
+        if self.is_cosxl_edit:
+            image_latents = image_latents * self.vae.config.scaling_factor
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 100,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        image_guidance_scale: float = 1.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
+                The image(s) to modify with the pipeline.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            image_guidance_scale (`float`, *optional*, defaults to 1.5):
+                Image guidance scale is to push the generated image towards the initial image `image`. Image guidance
+                scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to
+                generate images that are closely linked to the source image `image`, usually at the expense of lower
+                image quality. This pipeline requires a value of at least `1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+        if image is None:
+            raise ValueError("`image` input cannot be undefined.")
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0 and image_guidance_scale >= 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+
+        # 4. Preprocess image
+        image = self.image_processor.preprocess(image, height=height, width=width).to(device)
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 6. Prepare Image latents
+        image_latents = self.prepare_image_latents(
+            image,
+            batch_size,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            do_classifier_free_guidance,
+        )
+
+        # 7. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 8. Check that shapes of latents and image match the UNet channels
+        num_channels_image = image_latents.shape[1]
+        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_image`: {num_channels_image} "
+                f" = {num_channels_latents + num_channels_image}. Please verify the config of"
+                " `pipeline.unet` or your `image` input."
+            )
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 10. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+
+        if do_classifier_free_guidance:
+            # The extra concat similar to how it's done in SD InstructPix2Pix.
+            prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds], dim=0)
+            add_text_embeds = torch.cat(
+                [add_text_embeds, negative_pooled_prompt_embeds, negative_pooled_prompt_embeds], dim=0
+            )
+            add_time_ids = torch.cat([add_time_ids, add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        # 11. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Expand the latents if we are doing classifier free guidance.
+                # The latents are expanded 3 times because for pix2pix the guidance
+                # is applied for both the text and the input image.
+                latent_model_input = torch.cat([latents] * 3) if do_classifier_free_guidance else latents
+
+                # concat latents, image_latents in the channel dimension
+                scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                scaled_latent_model_input = torch.cat([scaled_latent_model_input, image_latents], dim=1)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                noise_pred = self.unet(
+                    scaled_latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_text, noise_pred_image, noise_pred_uncond = noise_pred.chunk(3)
+                    noise_pred = (
+                        noise_pred_uncond
+                        + guidance_scale * (noise_pred_text - noise_pred_image)
+                        + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
+                    )
+
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            return StableDiffusionXLPipelineOutput(images=latents)
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/stable_diffusion_xl/watermark.py b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/watermark.py
new file mode 100644
index 0000000000000000000000000000000000000000..70d06bb6320d890ea90542cc7551bc42d7c59451
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_diffusion_xl/watermark.py
@@ -0,0 +1,42 @@
+import numpy as np
+import torch
+
+from ...utils import is_invisible_watermark_available
+
+
+if is_invisible_watermark_available():
+    from imwatermark import WatermarkEncoder
+
+
+# Copied from https://github.com/Stability-AI/generative-models/blob/613af104c6b85184091d42d374fef420eddb356d/scripts/demo/streamlit_helpers.py#L66
+WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110
+# bin(x)[2:] gives bits of x as str, use int to convert them to 0/1
+WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]]
+
+
+class StableDiffusionXLWatermarker:
+    def __init__(self):
+        self.watermark = WATERMARK_BITS
+        self.encoder = WatermarkEncoder()
+
+        self.encoder.set_watermark("bits", self.watermark)
+
+    def apply_watermark(self, images: torch.Tensor):
+        # can't encode images that are smaller than 256
+        if images.shape[-1] < 256:
+            return images
+
+        images = (255 * (images / 2 + 0.5)).cpu().permute(0, 2, 3, 1).float().numpy()
+
+        # Convert RGB to BGR, which is the channel order expected by the watermark encoder.
+        images = images[:, :, :, ::-1]
+
+        # Add watermark and convert BGR back to RGB
+        images = [self.encoder.encode(image, "dwtDct")[:, :, ::-1] for image in images]
+
+        images = np.array(images)
+
+        images = torch.from_numpy(images).permute(0, 3, 1, 2)
+
+        images = torch.clamp(2 * (images / 255 - 0.5), min=-1.0, max=1.0)
+        return images
diff --git a/diffusers/src/diffusers/pipelines/stable_video_diffusion/__init__.py b/diffusers/src/diffusers/pipelines/stable_video_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bd4dc78966e217d85769691b98ed8fb0b6ac05c
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_video_diffusion/__init__.py
@@ -0,0 +1,58 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    BaseOutput,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure.update(
+        {
+            "pipeline_stable_video_diffusion": [
+                "StableVideoDiffusionPipeline",
+                "StableVideoDiffusionPipelineOutput",
+            ],
+        }
+    )
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_stable_video_diffusion import (
+            StableVideoDiffusionPipeline,
+            StableVideoDiffusionPipelineOutput,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/diffusers/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..b14fdd4f8de33310491464f8cf0c68e6dd1a32e4
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
@@ -0,0 +1,726 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+from ...image_processor import PipelineImageInput
+from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
+from ...schedulers import EulerDiscreteScheduler
+from ...utils import BaseOutput, logging, replace_example_docstring
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import StableVideoDiffusionPipeline
+        >>> from diffusers.utils import load_image, export_to_video
+
+        >>> pipe = StableVideoDiffusionPipeline.from_pretrained(
+        ...     "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
+        ... )
+        >>> image = image.resize((1024, 576))
+
+        >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
+        >>> export_to_video(frames, "generated.mp4", fps=7)
+        ```
+"""
+
+
+def _append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
+    return x[(...,) + (None,) * dims_to_append]
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+@dataclass
+class StableVideoDiffusionPipelineOutput(BaseOutput):
+    r"""
+    Output class for Stable Video Diffusion pipeline.
+
+    Args:
+        frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.Tensor`]):
+            List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
+            num_frames, height, width, num_channels)`.
+    """
+
+    frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]
+
+
+class StableVideoDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline to generate video from an input image using Stable Video Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKLTemporalDecoder`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
+            Frozen CLIP image-encoder
+            ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
+        unet ([`UNetSpatioTemporalConditionModel`]):
+            A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
+        scheduler ([`EulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images.
+    """
+
+    model_cpu_offload_seq = "image_encoder->unet->vae"
+    _callback_tensor_inputs = ["latents"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKLTemporalDecoder,
+        image_encoder: CLIPVisionModelWithProjection,
+        unet: UNetSpatioTemporalConditionModel,
+        scheduler: EulerDiscreteScheduler,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            image_encoder=image_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)
+
+    def _encode_image(
+        self,
+        image: PipelineImageInput,
+        device: Union[str, torch.device],
+        num_videos_per_prompt: int,
+        do_classifier_free_guidance: bool,
+    ) -> torch.Tensor:
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.video_processor.pil_to_numpy(image)
+            image = self.video_processor.numpy_to_pt(image)
+
+            # We normalize the image before resizing to match with the original implementation.
+            # Then we unnormalize it after resizing.
+            image = image * 2.0 - 1.0
+            image = _resize_with_antialiasing(image, (224, 224))
+            image = (image + 1.0) / 2.0
+
+        # Normalize the image with for CLIP input
+        image = self.feature_extractor(
+            images=image,
+            do_normalize=True,
+            do_center_crop=False,
+            do_resize=False,
+            do_rescale=False,
+            return_tensors="pt",
+        ).pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeddings = self.image_encoder(image).image_embeds
+        image_embeddings = image_embeddings.unsqueeze(1)
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = image_embeddings.shape
+        image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
+        image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            negative_image_embeddings = torch.zeros_like(image_embeddings)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])
+
+        return image_embeddings
+
+    def _encode_vae_image(
+        self,
+        image: torch.Tensor,
+        device: Union[str, torch.device],
+        num_videos_per_prompt: int,
+        do_classifier_free_guidance: bool,
+    ):
+        image = image.to(device=device)
+        image_latents = self.vae.encode(image).latent_dist.mode()
+
+        # duplicate image_latents for each generation per prompt, using mps friendly method
+        image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
+
+        if do_classifier_free_guidance:
+            negative_image_latents = torch.zeros_like(image_latents)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            image_latents = torch.cat([negative_image_latents, image_latents])
+
+        return image_latents
+
+    def _get_add_time_ids(
+        self,
+        fps: int,
+        motion_bucket_id: int,
+        noise_aug_strength: float,
+        dtype: torch.dtype,
+        batch_size: int,
+        num_videos_per_prompt: int,
+        do_classifier_free_guidance: bool,
+    ):
+        add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
+
+        passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
+
+        if do_classifier_free_guidance:
+            add_time_ids = torch.cat([add_time_ids, add_time_ids])
+
+        return add_time_ids
+
+    def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14):
+        # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
+        latents = latents.flatten(0, 1)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
+        accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
+
+        # decode decode_chunk_size frames at a time to avoid OOM
+        frames = []
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            num_frames_in = latents[i : i + decode_chunk_size].shape[0]
+            decode_kwargs = {}
+            if accepts_num_frames:
+                # we only pass num_frames_in if it's expected
+                decode_kwargs["num_frames"] = num_frames_in
+
+            frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample
+            frames.append(frame)
+        frames = torch.cat(frames, dim=0)
+
+        # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
+        frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        frames = frames.float()
+        return frames
+
+    def check_inputs(self, image, height, width):
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_frames: int,
+        num_channels_latents: int,
+        height: int,
+        width: int,
+        dtype: torch.dtype,
+        device: Union[str, torch.device],
+        generator: torch.Generator,
+        latents: Optional[torch.Tensor] = None,
+    ):
+        shape = (
+            batch_size,
+            num_frames,
+            num_channels_latents // 2,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        if isinstance(self.guidance_scale, (int, float)):
+            return self.guidance_scale > 1
+        return self.guidance_scale.max() > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
+        height: int = 576,
+        width: int = 1024,
+        num_frames: Optional[int] = None,
+        num_inference_steps: int = 25,
+        sigmas: Optional[List[float]] = None,
+        min_guidance_scale: float = 1.0,
+        max_guidance_scale: float = 3.0,
+        fps: int = 7,
+        motion_bucket_id: int = 127,
+        noise_aug_strength: float = 0.02,
+        decode_chunk_size: Optional[int] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        return_dict: bool = True,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
+                Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
+                1]`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_frames (`int`, *optional*):
+                The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
+                `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
+            num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference. This parameter is modulated by `strength`.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            min_guidance_scale (`float`, *optional*, defaults to 1.0):
+                The minimum guidance scale. Used for the classifier free guidance with first frame.
+            max_guidance_scale (`float`, *optional*, defaults to 3.0):
+                The maximum guidance scale. Used for the classifier free guidance with last frame.
+            fps (`int`, *optional*, defaults to 7):
+                Frames per second. The rate at which the generated images shall be exported to a video after
+                generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
+            motion_bucket_id (`int`, *optional*, defaults to 127):
+                Used for conditioning the amount of motion for the generation. The higher the number the more motion
+                will be in the video.
+            noise_aug_strength (`float`, *optional*, defaults to 0.02):
+                The amount of noise added to the init image, the higher it is the less the video will look like the
+                init image. Increase it for more motion.
+            decode_chunk_size (`int`, *optional*):
+                The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
+                expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
+                For lower memory usage, reduce `decode_chunk_size`.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `pil`, `np` or `pt`.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments:
+                    `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
+                `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
+                returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.Tensor`) is
+                returned.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
+        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(image, height, width)
+
+        # 2. Define call parameters
+        if isinstance(image, PIL.Image.Image):
+            batch_size = 1
+        elif isinstance(image, list):
+            batch_size = len(image)
+        else:
+            batch_size = image.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        self._guidance_scale = max_guidance_scale
+
+        # 3. Encode input image
+        image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
+
+        # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here.
+        # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
+        fps = fps - 1
+
+        # 4. Encode input image using VAE
+        image = self.video_processor.preprocess(image, height=height, width=width).to(device)
+        noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
+        image = image + noise_aug_strength * noise
+
+        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+        if needs_upcasting:
+            self.vae.to(dtype=torch.float32)
+
+        image_latents = self._encode_vae_image(
+            image,
+            device=device,
+            num_videos_per_prompt=num_videos_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+        )
+        image_latents = image_latents.to(image_embeddings.dtype)
+
+        # cast back to fp16 if needed
+        if needs_upcasting:
+            self.vae.to(dtype=torch.float16)
+
+        # Repeat the image latents for each frame so we can concatenate them with the noise
+        # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width]
+        image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
+
+        # 5. Get Added Time IDs
+        added_time_ids = self._get_add_time_ids(
+            fps,
+            motion_bucket_id,
+            noise_aug_strength,
+            image_embeddings.dtype,
+            batch_size,
+            num_videos_per_prompt,
+            self.do_classifier_free_guidance,
+        )
+        added_time_ids = added_time_ids.to(device)
+
+        # 6. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, sigmas)
+
+        # 7. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_frames,
+            num_channels_latents,
+            height,
+            width,
+            image_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 8. Prepare guidance scale
+        guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
+        guidance_scale = guidance_scale.to(device, latents.dtype)
+        guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
+        guidance_scale = _append_dims(guidance_scale, latents.ndim)
+
+        self._guidance_scale = guidance_scale
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # Concatenate image_latents over channels dimension
+                latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=image_embeddings,
+                    added_time_ids=added_time_ids,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if not output_type == "latent":
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+            frames = self.decode_latents(latents, num_frames, decode_chunk_size)
+            frames = self.video_processor.postprocess_video(video=frames, output_type=output_type)
+        else:
+            frames = latents
+
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return frames
+
+        return StableVideoDiffusionPipelineOutput(frames=frames)
+
+
+# resizing utils
+# TODO: clean up later
+def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
+    h, w = input.shape[-2:]
+    factors = (h / size[0], w / size[1])
+
+    # First, we have to determine sigma
+    # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
+    sigmas = (
+        max((factors[0] - 1.0) / 2.0, 0.001),
+        max((factors[1] - 1.0) / 2.0, 0.001),
+    )
+
+    # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
+    # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
+    # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now
+    ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
+
+    # Make sure it is odd
+    if (ks[0] % 2) == 0:
+        ks = ks[0] + 1, ks[1]
+
+    if (ks[1] % 2) == 0:
+        ks = ks[0], ks[1] + 1
+
+    input = _gaussian_blur2d(input, ks, sigmas)
+
+    output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners)
+    return output
+
+
+def _compute_padding(kernel_size):
+    """Compute padding tuple."""
+    # 4 or 6 ints:  (padding_left, padding_right,padding_top,padding_bottom)
+    # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
+    if len(kernel_size) < 2:
+        raise AssertionError(kernel_size)
+    computed = [k - 1 for k in kernel_size]
+
+    # for even kernels we need to do asymmetric padding :(
+    out_padding = 2 * len(kernel_size) * [0]
+
+    for i in range(len(kernel_size)):
+        computed_tmp = computed[-(i + 1)]
+
+        pad_front = computed_tmp // 2
+        pad_rear = computed_tmp - pad_front
+
+        out_padding[2 * i + 0] = pad_front
+        out_padding[2 * i + 1] = pad_rear
+
+    return out_padding
+
+
+def _filter2d(input, kernel):
+    # prepare kernel
+    b, c, h, w = input.shape
+    tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype)
+
+    tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
+
+    height, width = tmp_kernel.shape[-2:]
+
+    padding_shape: List[int] = _compute_padding([height, width])
+    input = torch.nn.functional.pad(input, padding_shape, mode="reflect")
+
+    # kernel and input tensor reshape to align element-wise or batch-wise params
+    tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
+    input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))
+
+    # convolve the tensor with the kernel.
+    output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
+
+    out = output.view(b, c, h, w)
+    return out
+
+
+def _gaussian(window_size: int, sigma):
+    if isinstance(sigma, float):
+        sigma = torch.tensor([[sigma]])
+
+    batch_size = sigma.shape[0]
+
+    x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
+
+    if window_size % 2 == 0:
+        x = x + 0.5
+
+    gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
+
+    return gauss / gauss.sum(-1, keepdim=True)
+
+
+def _gaussian_blur2d(input, kernel_size, sigma):
+    if isinstance(sigma, tuple):
+        sigma = torch.tensor([sigma], dtype=input.dtype)
+    else:
+        sigma = sigma.to(dtype=input.dtype)
+
+    ky, kx = int(kernel_size[0]), int(kernel_size[1])
+    bs = sigma.shape[0]
+    kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
+    kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
+    out_x = _filter2d(input, kernel_x[..., None, :])
+    out = _filter2d(out_x, kernel_y[..., None])
+
+    return out
diff --git a/diffusers/src/diffusers/pipelines/t2i_adapter/__init__.py b/diffusers/src/diffusers/pipelines/t2i_adapter/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..08c22a2707fe55770a519db481954881c1cad26e
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/t2i_adapter/__init__.py
@@ -0,0 +1,47 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_adapter"] = ["StableDiffusionAdapterPipeline"]
+    _import_structure["pipeline_stable_diffusion_xl_adapter"] = ["StableDiffusionXLAdapterPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_stable_diffusion_adapter import StableDiffusionAdapterPipeline
+        from .pipeline_stable_diffusion_xl_adapter import StableDiffusionXLAdapterPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/diffusers/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cb7c26bb6a2a411f803a647b4eccbabf331c4e6
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
@@ -0,0 +1,942 @@
+# Copyright 2024 TencentARC and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
+    BaseOutput,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
+
+@dataclass
+class StableDiffusionAdapterPipelineOutput(BaseOutput):
+    """
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+        nsfw_content_detected (`List[bool]`)
+            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, or `None` if safety checking could not be performed.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from PIL import Image
+        >>> from diffusers.utils import load_image
+        >>> import torch
+        >>> from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
+
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png"
+        ... )
+
+        >>> color_palette = image.resize((8, 8))
+        >>> color_palette = color_palette.resize((512, 512), resample=Image.Resampling.NEAREST)
+
+        >>> adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionAdapterPipeline.from_pretrained(
+        ...     "CompVis/stable-diffusion-v1-4",
+        ...     adapter=adapter,
+        ...     torch_dtype=torch.float16,
+        ... )
+
+        >>> pipe.to("cuda")
+
+        >>> out_image = pipe(
+        ...     "At night, glowing cubes in front of the beach",
+        ...     image=color_palette,
+        ... ).images[0]
+        ```
+"""
+
+
+def _preprocess_adapter_image(image, height, width):
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])) for i in image]
+        image = [
+            i[None, ..., None] if i.ndim == 2 else i[None, ...] for i in image
+        ]  # expand [h, w] or [h, w, c] to [b, h, w, c]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        if image[0].ndim == 3:
+            image = torch.stack(image, dim=0)
+        elif image[0].ndim == 4:
+            image = torch.cat(image, dim=0)
+        else:
+            raise ValueError(
+                f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
+            )
+    return image
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
+    https://arxiv.org/abs/2302.08453
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        adapter ([`T2IAdapter`] or [`MultiAdapter`] or `List[T2IAdapter]`):
+            Provides additional conditioning to the unet during the denoising process. If you set multiple Adapter as a
+            list, the outputs from each Adapter are added together to create one combined additional conditioning.
+        adapter_weights (`List[float]`, *optional*, defaults to None):
+            List of floats representing the weight which will be multiply to each adapter's output before adding them
+            together.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->adapter->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]],
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        if isinstance(adapter, (list, tuple)):
+            adapter = MultiAdapter(adapter)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            adapter=adapter,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        image,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if isinstance(self.adapter, MultiAdapter):
+            if not isinstance(image, list):
+                raise ValueError(
+                    "MultiAdapter is enabled, but `image` is not a list. Please pass a list of images to `image`."
+                )
+
+            if len(image) != len(self.adapter.adapters):
+                raise ValueError(
+                    f"MultiAdapter requires passing the same number of images as adapters. Given {len(image)} images and {len(self.adapter.adapters)} adapters."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def _default_height_width(self, height, width, image):
+        # NOTE: It is possible that a list of images have different
+        # dimensions for each image, so just checking the first image
+        # is not _exactly_ correct, but it is simple.
+        while isinstance(image, list):
+            image = image[0]
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[-2]
+
+            # round down to nearest multiple of `self.adapter.downscale_factor`
+            height = (height // self.adapter.downscale_factor) * self.adapter.downscale_factor
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[-1]
+
+            # round down to nearest multiple of `self.adapter.downscale_factor`
+            width = (width // self.adapter.downscale_factor) * self.adapter.downscale_factor
+
+        return height, width
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        adapter_conditioning_scale: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
+                The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
+                type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be
+                accepted as an image. The control image is automatically resized to fit the output image.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the
+                residual in the original unet. If multiple adapters are specified in init, you can set the
+                corresponding scale as a list.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple. When returning a tuple, the first element is a list with the generated images, and the second
+            element is a list of `bool`s denoting whether the corresponding generated image likely represents
+            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height, width = self._default_height_width(height, width, image)
+        device = self._execution_device
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, image, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        self._guidance_scale = guidance_scale
+
+        if isinstance(self.adapter, MultiAdapter):
+            adapter_input = []
+
+            for one_image in image:
+                one_image = _preprocess_adapter_image(one_image, height, width)
+                one_image = one_image.to(device=device, dtype=self.adapter.dtype)
+                adapter_input.append(one_image)
+        else:
+            adapter_input = _preprocess_adapter_image(image, height, width)
+            adapter_input = adapter_input.to(device=device, dtype=self.adapter.dtype)
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.5 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Denoising loop
+        if isinstance(self.adapter, MultiAdapter):
+            adapter_state = self.adapter(adapter_input, adapter_conditioning_scale)
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v
+        else:
+            adapter_state = self.adapter(adapter_input)
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v * adapter_conditioning_scale
+        if num_images_per_prompt > 1:
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v.repeat(num_images_per_prompt, 1, 1, 1)
+        if self.do_classifier_free_guidance:
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = torch.cat([v] * 2, dim=0)
+
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_intrablock_additional_residuals=[state.clone() for state in adapter_state],
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+        elif output_type == "pil":
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+        else:
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionAdapterPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/diffusers/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/diffusers/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ea197e42e6212785355512692e2b098776c6cbc
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
@@ -0,0 +1,1277 @@
+# Copyright 2024 TencentARC and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ImageProjection, MultiAdapter, T2IAdapter, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import T2IAdapter, StableDiffusionXLAdapterPipeline, DDPMScheduler
+        >>> from diffusers.utils import load_image
+
+        >>> sketch_image = load_image("https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png").convert("L")
+
+        >>> model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+
+        >>> adapter = T2IAdapter.from_pretrained(
+        ...     "Adapter/t2iadapter",
+        ...     subfolder="sketch_sdxl_1.0",
+        ...     torch_dtype=torch.float16,
+        ...     adapter_type="full_adapter_xl",
+        ... )
+        >>> scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")
+
+        >>> pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
+        ...     model_id, adapter=adapter, torch_dtype=torch.float16, variant="fp16", scheduler=scheduler
+        ... ).to("cuda")
+
+        >>> generator = torch.manual_seed(42)
+        >>> sketch_image_out = pipe(
+        ...     prompt="a photo of a dog in real world, high quality",
+        ...     negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality",
+        ...     image=sketch_image,
+        ...     generator=generator,
+        ...     guidance_scale=7.5,
+        ... ).images[0]
+        ```
+"""
+
+
+def _preprocess_adapter_image(image, height, width):
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+
+    if isinstance(image[0], PIL.Image.Image):
+        image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])) for i in image]
+        image = [
+            i[None, ..., None] if i.ndim == 2 else i[None, ...] for i in image
+        ]  # expand [h, w] or [h, w, c] to [b, h, w, c]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        if image[0].ndim == 3:
+            image = torch.stack(image, dim=0)
+        elif image[0].ndim == 4:
+            image = torch.cat(image, dim=0)
+        else:
+            raise ValueError(
+                f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
+            )
+    return image
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLAdapterPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
+    https://arxiv.org/abs/2302.08453
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        adapter ([`T2IAdapter`] or [`MultiAdapter`] or `List[T2IAdapter]`):
+            Provides additional conditioning to the unet during the denoising process. If you set multiple Adapter as a
+            list, the outputs from each Adapter are added together to create one combined additional conditioning.
+        adapter_weights (`List[float]`, *optional*, defaults to None):
+            List of floats representing the weight which will be multiply to each adapter's output before adding them
+            together.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "feature_extractor",
+        "image_encoder",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]],
+        scheduler: KarrasDiffusionSchedulers,
+        force_zeros_for_empty_prompt: bool = True,
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            adapter=adapter,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.default_sample_size = self.unet.config.sample_size
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+
+                image_embeds.append(single_image_embeds[None, :])
+                if do_classifier_free_guidance:
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
+        else:
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    negative_image_embeds.append(single_negative_image_embeds)
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_adapter.StableDiffusionAdapterPipeline._default_height_width
+    def _default_height_width(self, height, width, image):
+        # NOTE: It is possible that a list of images have different
+        # dimensions for each image, so just checking the first image
+        # is not _exactly_ correct, but it is simple.
+        while isinstance(image, list):
+            image = image[0]
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[-2]
+
+            # round down to nearest multiple of `self.adapter.downscale_factor`
+            height = (height // self.adapter.downscale_factor) * self.adapter.downscale_factor
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[-1]
+
+            # round down to nearest multiple of `self.adapter.downscale_factor`
+            width = (width // self.adapter.downscale_factor) * self.adapter.downscale_factor
+
+        return height, width
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        adapter_conditioning_scale: Union[float, List[float]] = 1.0,
+        adapter_conditioning_factor: float = 1.0,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
+                The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
+                type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be
+                accepted as an image. The control image is automatically resized to fit the output image.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionAdapterPipelineOutput`]
+                instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the
+                residual in the original unet. If multiple adapters are specified in init, you can set the
+                corresponding scale as a list.
+            adapter_conditioning_factor (`float`, *optional*, defaults to 1.0):
+                The fraction of timesteps for which adapter should be applied. If `adapter_conditioning_factor` is
+                `0.0`, adapter is not applied at all. If `adapter_conditioning_factor` is `1.0`, adapter is applied for
+                all timesteps. If `adapter_conditioning_factor` is `0.5`, adapter is applied for half of the timesteps.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+
+        height, width = self._default_height_width(height, width, image)
+        device = self._execution_device
+
+        if isinstance(self.adapter, MultiAdapter):
+            adapter_input = []
+
+            for one_image in image:
+                one_image = _preprocess_adapter_image(one_image, height, width)
+                one_image = one_image.to(device=device, dtype=self.adapter.dtype)
+                adapter_input.append(one_image)
+        else:
+            adapter_input = _preprocess_adapter_image(image, height, width)
+            adapter_input = adapter_input.to(device=device, dtype=self.adapter.dtype)
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+        )
+
+        self._guidance_scale = guidance_scale
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3.1 Encode input prompt
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            clip_skip=clip_skip,
+        )
+
+        # 3.2 Encode ip_adapter_image
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6.1 Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 7. Prepare added time ids & embeddings & adapter features
+        if isinstance(self.adapter, MultiAdapter):
+            adapter_state = self.adapter(adapter_input, adapter_conditioning_scale)
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v
+        else:
+            adapter_state = self.adapter(adapter_input)
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v * adapter_conditioning_scale
+        if num_images_per_prompt > 1:
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v.repeat(num_images_per_prompt, 1, 1, 1)
+        if self.do_classifier_free_guidance:
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = torch.cat([v] * 2, dim=0)
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        # Apply denoising_end
+        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                # predict the noise residual
+                if i < int(num_inference_steps * adapter_conditioning_factor):
+                    down_intrablock_additional_residuals = [state.clone() for state in adapter_state]
+                else:
+                    down_intrablock_additional_residuals = None
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_intrablock_additional_residuals=down_intrablock_additional_residuals,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+            return StableDiffusionXLPipelineOutput(images=image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/text_to_video_synthesis/__init__.py b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d8fdb92769bb3dcb2dd7696115b16f197062262
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/__init__.py
@@ -0,0 +1,54 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_output"] = ["TextToVideoSDPipelineOutput"]
+    _import_structure["pipeline_text_to_video_synth"] = ["TextToVideoSDPipeline"]
+    _import_structure["pipeline_text_to_video_synth_img2img"] = ["VideoToVideoSDPipeline"]
+    _import_structure["pipeline_text_to_video_zero"] = ["TextToVideoZeroPipeline"]
+    _import_structure["pipeline_text_to_video_zero_sdxl"] = ["TextToVideoZeroSDXLPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_output import TextToVideoSDPipelineOutput
+        from .pipeline_text_to_video_synth import TextToVideoSDPipeline
+        from .pipeline_text_to_video_synth_img2img import VideoToVideoSDPipeline
+        from .pipeline_text_to_video_zero import TextToVideoZeroPipeline
+        from .pipeline_text_to_video_zero_sdxl import TextToVideoZeroSDXLPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..040bf0efba8465d830af5e6686bc3f544585f63f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py
@@ -0,0 +1,26 @@
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL
+import torch
+
+from ...utils import (
+    BaseOutput,
+)
+
+
+@dataclass
+class TextToVideoSDPipelineOutput(BaseOutput):
+    """
+     Output class for text-to-video pipelines.
+
+    Args:
+         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+             List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+             denoised
+     PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+    `(batch_size, num_frames, channels, height, width)`
+    """
+
+    frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
diff --git a/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdd72b97f86b62c404347478e9698b7d889a6bf1
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
@@ -0,0 +1,643 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet3DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from . import TextToVideoSDPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import TextToVideoSDPipeline
+        >>> from diffusers.utils import export_to_video
+
+        >>> pipe = TextToVideoSDPipeline.from_pretrained(
+        ...     "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"
+        ... )
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> prompt = "Spiderman is surfing"
+        >>> video_frames = pipe(prompt).frames[0]
+        >>> video_path = export_to_video(video_frames)
+        >>> video_path
+        ```
+"""
+
+
+class TextToVideoSDPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
+    r"""
+    Pipeline for text-to-video generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet3DConditionModel`]):
+            A [`UNet3DConditionModel`] to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet3DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        image = self.vae.decode(latents).sample
+        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def prepare_latents(
+        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_frames: int = 16,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 9.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated video.
+            num_frames (`int`, *optional*, defaults to 16):
+                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
+                amounts to 2 seconds of video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated video. Choose between `torch.Tensor` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Examples:
+
+        Returns:
+            [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        num_images_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            num_frames,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # reshape latents
+                bsz, channel, frames, width, height = latents.shape
+                latents = latents.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height)
+                noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # reshape latents back
+                latents = latents[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 8. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 9. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return TextToVideoSDPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..92bf1d388c13cae67fae4de20a0994d68d874049
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
@@ -0,0 +1,699 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet3DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from . import TextToVideoSDPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
+        >>> from diffusers.utils import export_to_video
+
+        >>> pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16)
+        >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> pipe.to("cuda")
+
+        >>> prompt = "spiderman running in the desert"
+        >>> video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames[0]
+        >>> # safe low-res video
+        >>> video_path = export_to_video(video_frames, output_video_path="./video_576_spiderman.mp4")
+
+        >>> # let's offload the text-to-image model
+        >>> pipe.to("cpu")
+
+        >>> # and load the image-to-image model
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     "cerspense/zeroscope_v2_XL", torch_dtype=torch.float16, revision="refs/pr/15"
+        ... )
+        >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> pipe.enable_model_cpu_offload()
+
+        >>> # The VAE consumes A LOT of memory, let's make sure we run it in sliced mode
+        >>> pipe.vae.enable_slicing()
+
+        >>> # now let's upscale it
+        >>> video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
+
+        >>> # and denoise it
+        >>> video_frames = pipe(prompt, video=video, strength=0.6).frames[0]
+        >>> video_path = export_to_video(video_frames, output_video_path="./video_1024_spiderman.mp4")
+        >>> video_path
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class VideoToVideoSDPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
+    r"""
+    Pipeline for text-guided video-to-video generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet3DConditionModel`]):
+            A [`UNet3DConditionModel`] to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet3DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+
+        batch_size, channels, num_frames, height, width = latents.shape
+        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+        image = self.vae.decode(latents).sample
+        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.float()
+        return video
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        strength,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, video, timestep, batch_size, dtype, device, generator=None):
+        video = video.to(device=device, dtype=dtype)
+
+        # change from (b, c, f, h, w) -> (b * f, c, w, h)
+        bsz, channel, frames, width, height = video.shape
+        video = video.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height)
+
+        if video.shape[1] == 4:
+            init_latents = video
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+            elif isinstance(generator, list):
+                init_latents = [
+                    retrieve_latents(self.vae.encode(video[i : i + 1]), generator=generator[i])
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(self.vae.encode(video), generator=generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `video` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        latents = latents[None, :].reshape((bsz, frames, latents.shape[1]) + latents.shape[2:]).permute(0, 2, 1, 3, 4)
+
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        video: Union[List[np.ndarray], torch.Tensor] = None,
+        strength: float = 0.6,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 15.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            video (`List[np.ndarray]` or `torch.Tensor`):
+                `video` frames or tensor representing a video batch to be used as the starting point for the process.
+                Can also accept video latents as `image`, if passing latents directly, it will not be encoded again.
+            strength (`float`, *optional*, defaults to 0.8):
+                Indicates extent to transform the reference `video`. Must be between 0 and 1. `video` is used as a
+                starting point, adding more noise to it the larger the `strength`. The number of denoising steps
+                depends on the amount of noise initially added. When `strength` is 1, added noise is maximum and the
+                denoising process runs for the full number of iterations specified in `num_inference_steps`. A value of
+                1 essentially ignores `video`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in video generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated video. Choose between `torch.Tensor` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Examples:
+
+        Returns:
+            [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+        """
+        # 0. Default height and width to unet
+        num_images_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Preprocess video
+        video = self.video_processor.preprocess_video(video)
+
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(video, latent_timestep, batch_size, prompt_embeds.dtype, device, generator)
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # reshape latents
+                bsz, channel, frames, width, height = latents.shape
+                latents = latents.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height)
+                noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # reshape latents back
+                latents = latents[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+
+        # 9. Post processing
+        if output_type == "latent":
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
+
+        # 10. Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return TextToVideoSDPipelineOutput(frames=video)
diff --git a/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
new file mode 100644
index 0000000000000000000000000000000000000000..c95c7f1b9625564757b3747546dc977b7622e577
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
@@ -0,0 +1,981 @@
+import copy
+import inspect
+from dataclasses import dataclass
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from torch.nn.functional import grid_sample
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def rearrange_0(tensor, f):
+    F, C, H, W = tensor.size()
+    tensor = torch.permute(torch.reshape(tensor, (F // f, f, C, H, W)), (0, 2, 1, 3, 4))
+    return tensor
+
+
+def rearrange_1(tensor):
+    B, C, F, H, W = tensor.size()
+    return torch.reshape(torch.permute(tensor, (0, 2, 1, 3, 4)), (B * F, C, H, W))
+
+
+def rearrange_3(tensor, f):
+    F, D, C = tensor.size()
+    return torch.reshape(tensor, (F // f, f, D, C))
+
+
+def rearrange_4(tensor):
+    B, F, D, C = tensor.size()
+    return torch.reshape(tensor, (B * F, D, C))
+
+
+class CrossFrameAttnProcessor:
+    """
+    Cross frame attention processor. Each frame attends the first frame.
+
+    Args:
+        batch_size: The number that represents actual batch size, other than the frames.
+            For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to
+            2, due to classifier-free guidance.
+    """
+
+    def __init__(self, batch_size=2):
+        self.batch_size = batch_size
+
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        query = attn.to_q(hidden_states)
+
+        is_cross_attention = encoder_hidden_states is not None
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        # Cross Frame Attention
+        if not is_cross_attention:
+            video_length = key.size()[0] // self.batch_size
+            first_frame_index = [0] * video_length
+
+            # rearrange keys to have batch and frames in the 1st and 2nd dims respectively
+            key = rearrange_3(key, video_length)
+            key = key[:, first_frame_index]
+            # rearrange values to have batch and frames in the 1st and 2nd dims respectively
+            value = rearrange_3(value, video_length)
+            value = value[:, first_frame_index]
+
+            # rearrange back to original shape
+            key = rearrange_4(key)
+            value = rearrange_4(value)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+class CrossFrameAttnProcessor2_0:
+    """
+    Cross frame attention processor with scaled_dot_product attention of Pytorch 2.0.
+
+    Args:
+        batch_size: The number that represents actual batch size, other than the frames.
+            For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to
+            2, due to classifier-free guidance.
+    """
+
+    def __init__(self, batch_size=2):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        self.batch_size = batch_size
+
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        inner_dim = hidden_states.shape[-1]
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+
+        is_cross_attention = encoder_hidden_states is not None
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        # Cross Frame Attention
+        if not is_cross_attention:
+            video_length = max(1, key.size()[0] // self.batch_size)
+            first_frame_index = [0] * video_length
+
+            # rearrange keys to have batch and frames in the 1st and 2nd dims respectively
+            key = rearrange_3(key, video_length)
+            key = key[:, first_frame_index]
+            # rearrange values to have batch and frames in the 1st and 2nd dims respectively
+            value = rearrange_3(value, video_length)
+            value = value[:, first_frame_index]
+
+            # rearrange back to original shape
+            key = rearrange_4(key)
+            value = rearrange_4(value)
+
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
+@dataclass
+class TextToVideoPipelineOutput(BaseOutput):
+    r"""
+    Output class for zero-shot text-to-video pipeline.
+
+    Args:
+        images (`[List[PIL.Image.Image]`, `np.ndarray`]):
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+        nsfw_content_detected (`[List[bool]]`):
+            List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
+            `None` if safety checking could not be performed.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+
+
+def coords_grid(batch, ht, wd, device):
+    # Adapted from https://github.com/princeton-vl/RAFT/blob/master/core/utils/utils.py
+    coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device))
+    coords = torch.stack(coords[::-1], dim=0).float()
+    return coords[None].repeat(batch, 1, 1, 1)
+
+
+def warp_single_latent(latent, reference_flow):
+    """
+    Warp latent of a single frame with given flow
+
+    Args:
+        latent: latent code of a single frame
+        reference_flow: flow which to warp the latent with
+
+    Returns:
+        warped: warped latent
+    """
+    _, _, H, W = reference_flow.size()
+    _, _, h, w = latent.size()
+    coords0 = coords_grid(1, H, W, device=latent.device).to(latent.dtype)
+
+    coords_t0 = coords0 + reference_flow
+    coords_t0[:, 0] /= W
+    coords_t0[:, 1] /= H
+
+    coords_t0 = coords_t0 * 2.0 - 1.0
+    coords_t0 = F.interpolate(coords_t0, size=(h, w), mode="bilinear")
+    coords_t0 = torch.permute(coords_t0, (0, 2, 3, 1))
+
+    warped = grid_sample(latent, coords_t0, mode="nearest", padding_mode="reflection")
+    return warped
+
+
+def create_motion_field(motion_field_strength_x, motion_field_strength_y, frame_ids, device, dtype):
+    """
+    Create translation motion field
+
+    Args:
+        motion_field_strength_x: motion strength along x-axis
+        motion_field_strength_y: motion strength along y-axis
+        frame_ids: indexes of the frames the latents of which are being processed.
+            This is needed when we perform chunk-by-chunk inference
+        device: device
+        dtype: dtype
+
+    Returns:
+
+    """
+    seq_length = len(frame_ids)
+    reference_flow = torch.zeros((seq_length, 2, 512, 512), device=device, dtype=dtype)
+    for fr_idx in range(seq_length):
+        reference_flow[fr_idx, 0, :, :] = motion_field_strength_x * (frame_ids[fr_idx])
+        reference_flow[fr_idx, 1, :, :] = motion_field_strength_y * (frame_ids[fr_idx])
+    return reference_flow
+
+
+def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_strength_y, frame_ids, latents):
+    """
+    Creates translation motion and warps the latents accordingly
+
+    Args:
+        motion_field_strength_x: motion strength along x-axis
+        motion_field_strength_y: motion strength along y-axis
+        frame_ids: indexes of the frames the latents of which are being processed.
+            This is needed when we perform chunk-by-chunk inference
+        latents: latent codes of frames
+
+    Returns:
+        warped_latents: warped latents
+    """
+    motion_field = create_motion_field(
+        motion_field_strength_x=motion_field_strength_x,
+        motion_field_strength_y=motion_field_strength_y,
+        frame_ids=frame_ids,
+        device=latents.device,
+        dtype=latents.dtype,
+    )
+    warped_latents = latents.clone().detach()
+    for i in range(len(warped_latents)):
+        warped_latents[i] = warp_single_latent(latents[i][None], motion_field[i][None])
+    return warped_latents
+
+
+class TextToVideoZeroPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
+    r"""
+    Pipeline for zero-shot text-to-video generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer (`CLIPTokenizer`):
+            A [`~transformers.CLIPTokenizer`] to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A [`UNet3DConditionModel`] to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`CLIPImageProcessor`]):
+            A [`CLIPImageProcessor`] to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    def forward_loop(self, x_t0, t0, t1, generator):
+        """
+        Perform DDPM forward process from time t0 to t1. This is the same as adding noise with corresponding variance.
+
+        Args:
+            x_t0:
+                Latent code at time t0.
+            t0:
+                Timestep at t0.
+            t1:
+                Timestamp at t1.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+
+        Returns:
+            x_t1:
+                Forward process applied to x_t0 from time t0 to t1.
+        """
+        eps = randn_tensor(x_t0.size(), generator=generator, dtype=x_t0.dtype, device=x_t0.device)
+        alpha_vec = torch.prod(self.scheduler.alphas[t0:t1])
+        x_t1 = torch.sqrt(alpha_vec) * x_t0 + torch.sqrt(1 - alpha_vec) * eps
+        return x_t1
+
+    def backward_loop(
+        self,
+        latents,
+        timesteps,
+        prompt_embeds,
+        guidance_scale,
+        callback,
+        callback_steps,
+        num_warmup_steps,
+        extra_step_kwargs,
+        cross_attention_kwargs=None,
+    ):
+        """
+        Perform backward process given list of time steps.
+
+        Args:
+            latents:
+                Latents at time timesteps[0].
+            timesteps:
+                Time steps along which to perform backward process.
+            prompt_embeds:
+                Pre-generated text embeddings.
+            guidance_scale:
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            extra_step_kwargs:
+                Extra_step_kwargs.
+            cross_attention_kwargs:
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            num_warmup_steps:
+                number of warmup steps.
+
+        Returns:
+            latents:
+                Latents of backward process output at time timesteps[-1].
+        """
+        do_classifier_free_guidance = guidance_scale > 1.0
+        num_steps = (len(timesteps) - num_warmup_steps) // self.scheduler.order
+        with self.progress_bar(total=num_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+        return latents.clone().detach()
+
+    # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        video_length: Optional[int] = 8,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        motion_field_strength_x: float = 12,
+        motion_field_strength_y: float = 12,
+        output_type: Optional[str] = "tensor",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        t0: int = 44,
+        t1: int = 47,
+        frame_ids: Optional[List[int]] = None,
+    ):
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            video_length (`int`, *optional*, defaults to 8):
+                The number of generated video frames.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in video generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated video. Choose between `"latent"` and `"np"`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a
+                [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoPipelineOutput`] instead of
+                a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            motion_field_strength_x (`float`, *optional*, defaults to 12):
+                Strength of motion in generated video along x-axis. See the [paper](https://arxiv.org/abs/2303.13439),
+                Sect. 3.3.1.
+            motion_field_strength_y (`float`, *optional*, defaults to 12):
+                Strength of motion in generated video along y-axis. See the [paper](https://arxiv.org/abs/2303.13439),
+                Sect. 3.3.1.
+            t0 (`int`, *optional*, defaults to 44):
+                Timestep t0. Should be in the range [0, num_inference_steps - 1]. See the
+                [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1.
+            t1 (`int`, *optional*, defaults to 47):
+                Timestep t0. Should be in the range [t0 + 1, num_inference_steps - 1]. See the
+                [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1.
+            frame_ids (`List[int]`, *optional*):
+                Indexes of the frames that are being generated. This is used when generating longer videos
+                chunk-by-chunk.
+
+        Returns:
+            [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoPipelineOutput`]:
+                The output contains a `ndarray` of the generated video, when `output_type` != `"latent"`, otherwise a
+                latent code of generated videos and a list of `bool`s indicating whether the corresponding generated
+                video contains "not-safe-for-work" (nsfw) content..
+        """
+        assert video_length > 0
+        if frame_ids is None:
+            frame_ids = list(range(video_length))
+        assert len(frame_ids) == video_length
+
+        assert num_videos_per_prompt == 1
+
+        # set the processor
+        original_attn_proc = self.unet.attn_processors
+        processor = (
+            CrossFrameAttnProcessor2_0(batch_size=2)
+            if hasattr(F, "scaled_dot_product_attention")
+            else CrossFrameAttnProcessor(batch_size=2)
+        )
+        self.unet.set_attn_processor(processor)
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        if isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt]
+
+        # Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # Encode input prompt
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        # Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+        # Perform the first backward process up to time T_1
+        x_1_t1 = self.backward_loop(
+            timesteps=timesteps[: -t1 - 1],
+            prompt_embeds=prompt_embeds,
+            latents=latents,
+            guidance_scale=guidance_scale,
+            callback=callback,
+            callback_steps=callback_steps,
+            extra_step_kwargs=extra_step_kwargs,
+            num_warmup_steps=num_warmup_steps,
+        )
+        scheduler_copy = copy.deepcopy(self.scheduler)
+
+        # Perform the second backward process up to time T_0
+        x_1_t0 = self.backward_loop(
+            timesteps=timesteps[-t1 - 1 : -t0 - 1],
+            prompt_embeds=prompt_embeds,
+            latents=x_1_t1,
+            guidance_scale=guidance_scale,
+            callback=callback,
+            callback_steps=callback_steps,
+            extra_step_kwargs=extra_step_kwargs,
+            num_warmup_steps=0,
+        )
+
+        # Propagate first frame latents at time T_0 to remaining frames
+        x_2k_t0 = x_1_t0.repeat(video_length - 1, 1, 1, 1)
+
+        # Add motion in latents at time T_0
+        x_2k_t0 = create_motion_field_and_warp_latents(
+            motion_field_strength_x=motion_field_strength_x,
+            motion_field_strength_y=motion_field_strength_y,
+            latents=x_2k_t0,
+            frame_ids=frame_ids[1:],
+        )
+
+        # Perform forward process up to time T_1
+        x_2k_t1 = self.forward_loop(
+            x_t0=x_2k_t0,
+            t0=timesteps[-t0 - 1].item(),
+            t1=timesteps[-t1 - 1].item(),
+            generator=generator,
+        )
+
+        # Perform backward process from time T_1 to 0
+        x_1k_t1 = torch.cat([x_1_t1, x_2k_t1])
+        b, l, d = prompt_embeds.size()
+        prompt_embeds = prompt_embeds[:, None].repeat(1, video_length, 1, 1).reshape(b * video_length, l, d)
+
+        self.scheduler = scheduler_copy
+        x_1k_0 = self.backward_loop(
+            timesteps=timesteps[-t1 - 1 :],
+            prompt_embeds=prompt_embeds,
+            latents=x_1k_t1,
+            guidance_scale=guidance_scale,
+            callback=callback,
+            callback_steps=callback_steps,
+            extra_step_kwargs=extra_step_kwargs,
+            num_warmup_steps=0,
+        )
+        latents = x_1k_0
+
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+        torch.cuda.empty_cache()
+
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+        else:
+            image = self.decode_latents(latents)
+            # Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+        # make sure to set the original attention processors back
+        self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return TextToVideoPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
diff --git a/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..c46a5ce7c084cf7342ac6a139940f4cfbb5e9b28
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
@@ -0,0 +1,1317 @@
+import copy
+import inspect
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+import torch.nn.functional as F
+from torch.nn.functional import grid_sample
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention_processor import (
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    BaseOutput,
+    is_invisible_watermark_available,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+
+
+if is_invisible_watermark_available():
+    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.rearrange_0
+def rearrange_0(tensor, f):
+    F, C, H, W = tensor.size()
+    tensor = torch.permute(torch.reshape(tensor, (F // f, f, C, H, W)), (0, 2, 1, 3, 4))
+    return tensor
+
+
+# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.rearrange_1
+def rearrange_1(tensor):
+    B, C, F, H, W = tensor.size()
+    return torch.reshape(torch.permute(tensor, (0, 2, 1, 3, 4)), (B * F, C, H, W))
+
+
+# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.rearrange_3
+def rearrange_3(tensor, f):
+    F, D, C = tensor.size()
+    return torch.reshape(tensor, (F // f, f, D, C))
+
+
+# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.rearrange_4
+def rearrange_4(tensor):
+    B, F, D, C = tensor.size()
+    return torch.reshape(tensor, (B * F, D, C))
+
+
+# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor
+class CrossFrameAttnProcessor:
+    """
+    Cross frame attention processor. Each frame attends the first frame.
+
+    Args:
+        batch_size: The number that represents actual batch size, other than the frames.
+            For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to
+            2, due to classifier-free guidance.
+    """
+
+    def __init__(self, batch_size=2):
+        self.batch_size = batch_size
+
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        query = attn.to_q(hidden_states)
+
+        is_cross_attention = encoder_hidden_states is not None
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        # Cross Frame Attention
+        if not is_cross_attention:
+            video_length = key.size()[0] // self.batch_size
+            first_frame_index = [0] * video_length
+
+            # rearrange keys to have batch and frames in the 1st and 2nd dims respectively
+            key = rearrange_3(key, video_length)
+            key = key[:, first_frame_index]
+            # rearrange values to have batch and frames in the 1st and 2nd dims respectively
+            value = rearrange_3(value, video_length)
+            value = value[:, first_frame_index]
+
+            # rearrange back to original shape
+            key = rearrange_4(key)
+            value = rearrange_4(value)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor2_0
+class CrossFrameAttnProcessor2_0:
+    """
+    Cross frame attention processor with scaled_dot_product attention of Pytorch 2.0.
+
+    Args:
+        batch_size: The number that represents actual batch size, other than the frames.
+            For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to
+            2, due to classifier-free guidance.
+    """
+
+    def __init__(self, batch_size=2):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        self.batch_size = batch_size
+
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        inner_dim = hidden_states.shape[-1]
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+
+        is_cross_attention = encoder_hidden_states is not None
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        # Cross Frame Attention
+        if not is_cross_attention:
+            video_length = max(1, key.size()[0] // self.batch_size)
+            first_frame_index = [0] * video_length
+
+            # rearrange keys to have batch and frames in the 1st and 2nd dims respectively
+            key = rearrange_3(key, video_length)
+            key = key[:, first_frame_index]
+            # rearrange values to have batch and frames in the 1st and 2nd dims respectively
+            value = rearrange_3(value, video_length)
+            value = value[:, first_frame_index]
+
+            # rearrange back to original shape
+            key = rearrange_4(key)
+            value = rearrange_4(value)
+
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
+@dataclass
+class TextToVideoSDXLPipelineOutput(BaseOutput):
+    """
+    Output class for zero-shot text-to-video pipeline.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.coords_grid
+def coords_grid(batch, ht, wd, device):
+    # Adapted from https://github.com/princeton-vl/RAFT/blob/master/core/utils/utils.py
+    coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device))
+    coords = torch.stack(coords[::-1], dim=0).float()
+    return coords[None].repeat(batch, 1, 1, 1)
+
+
+# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.warp_single_latent
+def warp_single_latent(latent, reference_flow):
+    """
+    Warp latent of a single frame with given flow
+
+    Args:
+        latent: latent code of a single frame
+        reference_flow: flow which to warp the latent with
+
+    Returns:
+        warped: warped latent
+    """
+    _, _, H, W = reference_flow.size()
+    _, _, h, w = latent.size()
+    coords0 = coords_grid(1, H, W, device=latent.device).to(latent.dtype)
+
+    coords_t0 = coords0 + reference_flow
+    coords_t0[:, 0] /= W
+    coords_t0[:, 1] /= H
+
+    coords_t0 = coords_t0 * 2.0 - 1.0
+    coords_t0 = F.interpolate(coords_t0, size=(h, w), mode="bilinear")
+    coords_t0 = torch.permute(coords_t0, (0, 2, 3, 1))
+
+    warped = grid_sample(latent, coords_t0, mode="nearest", padding_mode="reflection")
+    return warped
+
+
+# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.create_motion_field
+def create_motion_field(motion_field_strength_x, motion_field_strength_y, frame_ids, device, dtype):
+    """
+    Create translation motion field
+
+    Args:
+        motion_field_strength_x: motion strength along x-axis
+        motion_field_strength_y: motion strength along y-axis
+        frame_ids: indexes of the frames the latents of which are being processed.
+            This is needed when we perform chunk-by-chunk inference
+        device: device
+        dtype: dtype
+
+    Returns:
+
+    """
+    seq_length = len(frame_ids)
+    reference_flow = torch.zeros((seq_length, 2, 512, 512), device=device, dtype=dtype)
+    for fr_idx in range(seq_length):
+        reference_flow[fr_idx, 0, :, :] = motion_field_strength_x * (frame_ids[fr_idx])
+        reference_flow[fr_idx, 1, :, :] = motion_field_strength_y * (frame_ids[fr_idx])
+    return reference_flow
+
+
+# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.create_motion_field_and_warp_latents
+def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_strength_y, frame_ids, latents):
+    """
+    Creates translation motion and warps the latents accordingly
+
+    Args:
+        motion_field_strength_x: motion strength along x-axis
+        motion_field_strength_y: motion strength along y-axis
+        frame_ids: indexes of the frames the latents of which are being processed.
+            This is needed when we perform chunk-by-chunk inference
+        latents: latent codes of frames
+
+    Returns:
+        warped_latents: warped latents
+    """
+    motion_field = create_motion_field(
+        motion_field_strength_x=motion_field_strength_x,
+        motion_field_strength_y=motion_field_strength_y,
+        frame_ids=frame_ids,
+        device=latents.device,
+        dtype=latents.dtype,
+    )
+    warped_latents = latents.clone().detach()
+    for i in range(len(warped_latents)):
+        warped_latents[i] = warp_single_latent(latents[i][None], motion_field[i][None])
+    return warped_latents
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+class TextToVideoZeroSDXLPipeline(
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+):
+    r"""
+    Pipeline for zero-shot text-to-video generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.default_sample_size = self.unet.config.sample_size
+
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                FusedAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoZeroPipeline.forward_loop
+    def forward_loop(self, x_t0, t0, t1, generator):
+        """
+        Perform DDPM forward process from time t0 to t1. This is the same as adding noise with corresponding variance.
+
+        Args:
+            x_t0:
+                Latent code at time t0.
+            t0:
+                Timestep at t0.
+            t1:
+                Timestamp at t1.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+
+        Returns:
+            x_t1:
+                Forward process applied to x_t0 from time t0 to t1.
+        """
+        eps = randn_tensor(x_t0.size(), generator=generator, dtype=x_t0.dtype, device=x_t0.device)
+        alpha_vec = torch.prod(self.scheduler.alphas[t0:t1])
+        x_t1 = torch.sqrt(alpha_vec) * x_t0 + torch.sqrt(1 - alpha_vec) * eps
+        return x_t1
+
+    def backward_loop(
+        self,
+        latents,
+        timesteps,
+        prompt_embeds,
+        guidance_scale,
+        callback,
+        callback_steps,
+        num_warmup_steps,
+        extra_step_kwargs,
+        add_text_embeds,
+        add_time_ids,
+        cross_attention_kwargs=None,
+        guidance_rescale: float = 0.0,
+    ):
+        """
+        Perform backward process given list of time steps
+
+        Args:
+            latents:
+                Latents at time timesteps[0].
+            timesteps:
+                Time steps along which to perform backward process.
+            prompt_embeds:
+                Pre-generated text embeddings.
+            guidance_scale:
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            extra_step_kwargs:
+                Extra_step_kwargs.
+            cross_attention_kwargs:
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            num_warmup_steps:
+                number of warmup steps.
+
+        Returns:
+            latents: latents of backward process output at time timesteps[-1]
+        """
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+        num_steps = (len(timesteps) - num_warmup_steps) // self.scheduler.order
+
+        with self.progress_bar(total=num_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+        return latents.clone().detach()
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        video_length: Optional[int] = 8,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        frame_ids: Optional[List[int]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        motion_field_strength_x: float = 12,
+        motion_field_strength_y: float = 12,
+        output_type: Optional[str] = "tensor",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        t0: int = 44,
+        t1: int = 47,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            video_length (`int`, *optional*, defaults to 8):
+                The number of generated video frames.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            frame_ids (`List[int]`, *optional*):
+                Indexes of the frames that are being generated. This is used when generating longer videos
+                chunk-by-chunk.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            motion_field_strength_x (`float`, *optional*, defaults to 12):
+                Strength of motion in generated video along x-axis. See the [paper](https://arxiv.org/abs/2303.13439),
+                Sect. 3.3.1.
+            motion_field_strength_y (`float`, *optional*, defaults to 12):
+                Strength of motion in generated video along y-axis. See the [paper](https://arxiv.org/abs/2303.13439),
+                Sect. 3.3.1.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.7):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            t0 (`int`, *optional*, defaults to 44):
+                Timestep t0. Should be in the range [0, num_inference_steps - 1]. See the
+                [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1.
+            t1 (`int`, *optional*, defaults to 47):
+                Timestep t0. Should be in the range [t0 + 1, num_inference_steps - 1]. See the
+                [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1.
+
+        Returns:
+            [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoSDXLPipelineOutput`] or
+            `tuple`: [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoSDXLPipelineOutput`]
+            if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+        assert video_length > 0
+        if frame_ids is None:
+            frame_ids = list(range(video_length))
+        assert len(frame_ids) == video_length
+
+        assert num_videos_per_prompt == 1
+
+        # set the processor
+        original_attn_proc = self.unet.attn_processors
+        processor = (
+            CrossFrameAttnProcessor2_0(batch_size=2)
+            if hasattr(F, "scaled_dot_product_attention")
+            else CrossFrameAttnProcessor(batch_size=2)
+        )
+        self.unet.set_attn_processor(processor)
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        if isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt]
+
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        batch_size = (
+            1 if isinstance(prompt, str) else len(prompt) if isinstance(prompt, list) else prompt_embeds.shape[0]
+        )
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_videos_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_videos_per_prompt, 1)
+
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+        # Perform the first backward process up to time T_1
+        x_1_t1 = self.backward_loop(
+            timesteps=timesteps[: -t1 - 1],
+            prompt_embeds=prompt_embeds,
+            latents=latents,
+            guidance_scale=guidance_scale,
+            callback=callback,
+            callback_steps=callback_steps,
+            extra_step_kwargs=extra_step_kwargs,
+            num_warmup_steps=num_warmup_steps,
+            add_text_embeds=add_text_embeds,
+            add_time_ids=add_time_ids,
+        )
+
+        scheduler_copy = copy.deepcopy(self.scheduler)
+
+        # Perform the second backward process up to time T_0
+        x_1_t0 = self.backward_loop(
+            timesteps=timesteps[-t1 - 1 : -t0 - 1],
+            prompt_embeds=prompt_embeds,
+            latents=x_1_t1,
+            guidance_scale=guidance_scale,
+            callback=callback,
+            callback_steps=callback_steps,
+            extra_step_kwargs=extra_step_kwargs,
+            num_warmup_steps=0,
+            add_text_embeds=add_text_embeds,
+            add_time_ids=add_time_ids,
+        )
+
+        # Propagate first frame latents at time T_0 to remaining frames
+        x_2k_t0 = x_1_t0.repeat(video_length - 1, 1, 1, 1)
+
+        # Add motion in latents at time T_0
+        x_2k_t0 = create_motion_field_and_warp_latents(
+            motion_field_strength_x=motion_field_strength_x,
+            motion_field_strength_y=motion_field_strength_y,
+            latents=x_2k_t0,
+            frame_ids=frame_ids[1:],
+        )
+
+        # Perform forward process up to time T_1
+        x_2k_t1 = self.forward_loop(
+            x_t0=x_2k_t0,
+            t0=timesteps[-t0 - 1].to(torch.long),
+            t1=timesteps[-t1 - 1].to(torch.long),
+            generator=generator,
+        )
+
+        # Perform backward process from time T_1 to 0
+        latents = torch.cat([x_1_t1, x_2k_t1])
+
+        self.scheduler = scheduler_copy
+        timesteps = timesteps[-t1 - 1 :]
+
+        b, l, d = prompt_embeds.size()
+        prompt_embeds = prompt_embeds[:, None].repeat(1, video_length, 1, 1).reshape(b * video_length, l, d)
+
+        b, k = add_text_embeds.size()
+        add_text_embeds = add_text_embeds[:, None].repeat(1, video_length, 1).reshape(b * video_length, k)
+
+        b, k = add_time_ids.size()
+        add_time_ids = add_time_ids[:, None].repeat(1, video_length, 1).reshape(b * video_length, k)
+
+        # 7.1 Apply denoising_end
+        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        x_1k_0 = self.backward_loop(
+            timesteps=timesteps,
+            prompt_embeds=prompt_embeds,
+            latents=latents,
+            guidance_scale=guidance_scale,
+            callback=callback,
+            callback_steps=callback_steps,
+            extra_step_kwargs=extra_step_kwargs,
+            num_warmup_steps=0,
+            add_text_embeds=add_text_embeds,
+            add_time_ids=add_time_ids,
+        )
+
+        latents = x_1k_0
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+            return TextToVideoSDXLPipelineOutput(images=image)
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        self.maybe_free_model_hooks()
+        # make sure to set the original attention processors back
+        self.unet.set_attn_processor(original_attn_proc)
+
+        if not return_dict:
+            return (image,)
+
+        return TextToVideoSDXLPipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/unclip/__init__.py b/diffusers/src/diffusers/pipelines/unclip/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c89e899463beede59b8ccf02688f6168b8ee3d77
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/unclip/__init__.py
@@ -0,0 +1,52 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+    is_transformers_available,
+    is_transformers_version,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline
+
+    _dummy_objects.update(
+        {"UnCLIPImageVariationPipeline": UnCLIPImageVariationPipeline, "UnCLIPPipeline": UnCLIPPipeline}
+    )
+else:
+    _import_structure["pipeline_unclip"] = ["UnCLIPPipeline"]
+    _import_structure["pipeline_unclip_image_variation"] = ["UnCLIPImageVariationPipeline"]
+    _import_structure["text_proj"] = ["UnCLIPTextProjModel"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_unclip import UnCLIPPipeline
+        from .pipeline_unclip_image_variation import UnCLIPImageVariationPipeline
+        from .text_proj import UnCLIPTextProjModel
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/unclip/pipeline_unclip.py b/diffusers/src/diffusers/pipelines/unclip/pipeline_unclip.py
new file mode 100644
index 0000000000000000000000000000000000000000..25c6739d8720a3696652438c4a7f8a09c5a1a3ba
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/unclip/pipeline_unclip.py
@@ -0,0 +1,493 @@
+# Copyright 2024 Kakao Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch.nn import functional as F
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+from transformers.models.clip.modeling_clip import CLIPTextModelOutput
+
+from ...models import PriorTransformer, UNet2DConditionModel, UNet2DModel
+from ...schedulers import UnCLIPScheduler
+from ...utils import logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from .text_proj import UnCLIPTextProjModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class UnCLIPPipeline(DiffusionPipeline):
+    """
+    Pipeline for text-to-image generation using unCLIP.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        text_proj ([`UnCLIPTextProjModel`]):
+            Utility class to prepare and combine the embeddings before they are passed to the decoder.
+        decoder ([`UNet2DConditionModel`]):
+            The decoder to invert the image embedding into an image.
+        super_res_first ([`UNet2DModel`]):
+            Super resolution UNet. Used in all but the last step of the super resolution diffusion process.
+        super_res_last ([`UNet2DModel`]):
+            Super resolution UNet. Used in the last step of the super resolution diffusion process.
+        prior_scheduler ([`UnCLIPScheduler`]):
+            Scheduler used in the prior denoising process (a modified [`DDPMScheduler`]).
+        decoder_scheduler ([`UnCLIPScheduler`]):
+            Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]).
+        super_res_scheduler ([`UnCLIPScheduler`]):
+            Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]).
+
+    """
+
+    _exclude_from_cpu_offload = ["prior"]
+
+    prior: PriorTransformer
+    decoder: UNet2DConditionModel
+    text_proj: UnCLIPTextProjModel
+    text_encoder: CLIPTextModelWithProjection
+    tokenizer: CLIPTokenizer
+    super_res_first: UNet2DModel
+    super_res_last: UNet2DModel
+
+    prior_scheduler: UnCLIPScheduler
+    decoder_scheduler: UnCLIPScheduler
+    super_res_scheduler: UnCLIPScheduler
+
+    model_cpu_offload_seq = "text_encoder->text_proj->decoder->super_res_first->super_res_last"
+
+    def __init__(
+        self,
+        prior: PriorTransformer,
+        decoder: UNet2DConditionModel,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_proj: UnCLIPTextProjModel,
+        super_res_first: UNet2DModel,
+        super_res_last: UNet2DModel,
+        prior_scheduler: UnCLIPScheduler,
+        decoder_scheduler: UnCLIPScheduler,
+        super_res_scheduler: UnCLIPScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            prior=prior,
+            decoder=decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_proj=text_proj,
+            super_res_first=super_res_first,
+            super_res_last=super_res_last,
+            prior_scheduler=prior_scheduler,
+            decoder_scheduler=decoder_scheduler,
+            super_res_scheduler=super_res_scheduler,
+        )
+
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
+        text_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        if text_model_output is None:
+            batch_size = len(prompt) if isinstance(prompt, list) else 1
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            text_mask = text_inputs.attention_mask.bool().to(device)
+
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
+            text_encoder_output = self.text_encoder(text_input_ids.to(device))
+
+            prompt_embeds = text_encoder_output.text_embeds
+            text_enc_hid_states = text_encoder_output.last_hidden_state
+
+        else:
+            batch_size = text_model_output[0].shape[0]
+            prompt_embeds, text_enc_hid_states = text_model_output[0], text_model_output[1]
+            text_mask = text_attention_mask
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_enc_hid_states = text_enc_hid_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens = [""] * batch_size
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+            uncond_text_enc_hid_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_enc_hid_states.shape[1]
+            uncond_text_enc_hid_states = uncond_text_enc_hid_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_enc_hid_states = uncond_text_enc_hid_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_enc_hid_states = torch.cat([uncond_text_enc_hid_states, text_enc_hid_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_enc_hid_states, text_mask
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: int = 1,
+        prior_num_inference_steps: int = 25,
+        decoder_num_inference_steps: int = 25,
+        super_res_num_inference_steps: int = 7,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        prior_latents: Optional[torch.Tensor] = None,
+        decoder_latents: Optional[torch.Tensor] = None,
+        super_res_latents: Optional[torch.Tensor] = None,
+        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
+        text_attention_mask: Optional[torch.Tensor] = None,
+        prior_guidance_scale: float = 4.0,
+        decoder_guidance_scale: float = 8.0,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation. This can only be left undefined if `text_model_output`
+                and `text_attention_mask` is passed.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            prior_num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps for the prior. More denoising steps usually lead to a higher quality
+                image at the expense of slower inference.
+            decoder_num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
+                image at the expense of slower inference.
+            super_res_num_inference_steps (`int`, *optional*, defaults to 7):
+                The number of denoising steps for super resolution. More denoising steps usually lead to a higher
+                quality image at the expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prior_latents (`torch.Tensor` of shape (batch size, embeddings dimension), *optional*):
+                Pre-generated noisy latents to be used as inputs for the prior.
+            decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*):
+                Pre-generated noisy latents to be used as inputs for the decoder.
+            super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*):
+                Pre-generated noisy latents to be used as inputs for the decoder.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            decoder_guidance_scale (`float`, *optional*, defaults to 4.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            text_model_output (`CLIPTextModelOutput`, *optional*):
+                Pre-defined [`CLIPTextModel`] outputs that can be derived from the text encoder. Pre-defined text
+                outputs can be passed for tasks like text embedding interpolations. Make sure to also pass
+                `text_attention_mask` in this case. `prompt` can the be left `None`.
+            text_attention_mask (`torch.Tensor`, *optional*):
+                Pre-defined CLIP text attention mask that can be derived from the tokenizer. Pre-defined text attention
+                masks are necessary when passing `text_model_output`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+        if prompt is not None:
+            if isinstance(prompt, str):
+                batch_size = 1
+            elif isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        else:
+            batch_size = text_model_output[0].shape[0]
+
+        device = self._execution_device
+
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0
+
+        prompt_embeds, text_enc_hid_states, text_mask = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, text_model_output, text_attention_mask
+        )
+
+        # prior
+
+        self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device)
+        prior_timesteps_tensor = self.prior_scheduler.timesteps
+
+        embedding_dim = self.prior.config.embedding_dim
+
+        prior_latents = self.prepare_latents(
+            (batch_size, embedding_dim),
+            prompt_embeds.dtype,
+            device,
+            generator,
+            prior_latents,
+            self.prior_scheduler,
+        )
+
+        for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents
+
+            predicted_image_embedding = self.prior(
+                latent_model_input,
+                timestep=t,
+                proj_embedding=prompt_embeds,
+                encoder_hidden_states=text_enc_hid_states,
+                attention_mask=text_mask,
+            ).predicted_image_embedding
+
+            if do_classifier_free_guidance:
+                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * (
+                    predicted_image_embedding_text - predicted_image_embedding_uncond
+                )
+
+            if i + 1 == prior_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = prior_timesteps_tensor[i + 1]
+
+            prior_latents = self.prior_scheduler.step(
+                predicted_image_embedding,
+                timestep=t,
+                sample=prior_latents,
+                generator=generator,
+                prev_timestep=prev_timestep,
+            ).prev_sample
+
+        prior_latents = self.prior.post_process_latents(prior_latents)
+
+        image_embeddings = prior_latents
+
+        # done prior
+
+        # decoder
+
+        text_enc_hid_states, additive_clip_time_embeddings = self.text_proj(
+            image_embeddings=image_embeddings,
+            prompt_embeds=prompt_embeds,
+            text_encoder_hidden_states=text_enc_hid_states,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+
+        if device.type == "mps":
+            # HACK: MPS: There is a panic when padding bool tensors,
+            # so cast to int tensor for the pad and back to bool afterwards
+            text_mask = text_mask.type(torch.int)
+            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=1)
+            decoder_text_mask = decoder_text_mask.type(torch.bool)
+        else:
+            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=True)
+
+        self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device)
+        decoder_timesteps_tensor = self.decoder_scheduler.timesteps
+
+        num_channels_latents = self.decoder.config.in_channels
+        height = self.decoder.config.sample_size
+        width = self.decoder.config.sample_size
+
+        decoder_latents = self.prepare_latents(
+            (batch_size, num_channels_latents, height, width),
+            text_enc_hid_states.dtype,
+            device,
+            generator,
+            decoder_latents,
+            self.decoder_scheduler,
+        )
+
+        for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents
+
+            noise_pred = self.decoder(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=text_enc_hid_states,
+                class_labels=additive_clip_time_embeddings,
+                attention_mask=decoder_text_mask,
+            ).sample
+
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred_uncond, _ = noise_pred_uncond.split(latent_model_input.shape[1], dim=1)
+                noise_pred_text, predicted_variance = noise_pred_text.split(latent_model_input.shape[1], dim=1)
+                noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+
+            if i + 1 == decoder_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = decoder_timesteps_tensor[i + 1]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            decoder_latents = self.decoder_scheduler.step(
+                noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator
+            ).prev_sample
+
+        decoder_latents = decoder_latents.clamp(-1, 1)
+
+        image_small = decoder_latents
+
+        # done decoder
+
+        # super res
+
+        self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device)
+        super_res_timesteps_tensor = self.super_res_scheduler.timesteps
+
+        channels = self.super_res_first.config.in_channels // 2
+        height = self.super_res_first.config.sample_size
+        width = self.super_res_first.config.sample_size
+
+        super_res_latents = self.prepare_latents(
+            (batch_size, channels, height, width),
+            image_small.dtype,
+            device,
+            generator,
+            super_res_latents,
+            self.super_res_scheduler,
+        )
+
+        if device.type == "mps":
+            # MPS does not support many interpolations
+            image_upscaled = F.interpolate(image_small, size=[height, width])
+        else:
+            interpolate_antialias = {}
+            if "antialias" in inspect.signature(F.interpolate).parameters:
+                interpolate_antialias["antialias"] = True
+
+            image_upscaled = F.interpolate(
+                image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias
+            )
+
+        for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)):
+            # no classifier free guidance
+
+            if i == super_res_timesteps_tensor.shape[0] - 1:
+                unet = self.super_res_last
+            else:
+                unet = self.super_res_first
+
+            latent_model_input = torch.cat([super_res_latents, image_upscaled], dim=1)
+
+            noise_pred = unet(
+                sample=latent_model_input,
+                timestep=t,
+            ).sample
+
+            if i + 1 == super_res_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = super_res_timesteps_tensor[i + 1]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            super_res_latents = self.super_res_scheduler.step(
+                noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator
+            ).prev_sample
+
+        image = super_res_latents
+        # done super res
+
+        self.maybe_free_model_hooks()
+
+        # post processing
+        image = image * 0.5 + 0.5
+        image = image.clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/diffusers/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a0e7e90e4d29d12e360d76bd7846acfd280d6f3
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py
@@ -0,0 +1,420 @@
+# Copyright 2024 Kakao Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import List, Optional, Union
+
+import PIL.Image
+import torch
+from torch.nn import functional as F
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...models import UNet2DConditionModel, UNet2DModel
+from ...schedulers import UnCLIPScheduler
+from ...utils import logging
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from .text_proj import UnCLIPTextProjModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class UnCLIPImageVariationPipeline(DiffusionPipeline):
+    """
+    Pipeline to generate image variations from an input image using UnCLIP.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
+        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
+            Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        text_proj ([`UnCLIPTextProjModel`]):
+            Utility class to prepare and combine the embeddings before they are passed to the decoder.
+        decoder ([`UNet2DConditionModel`]):
+            The decoder to invert the image embedding into an image.
+        super_res_first ([`UNet2DModel`]):
+            Super resolution UNet. Used in all but the last step of the super resolution diffusion process.
+        super_res_last ([`UNet2DModel`]):
+            Super resolution UNet. Used in the last step of the super resolution diffusion process.
+        decoder_scheduler ([`UnCLIPScheduler`]):
+            Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]).
+        super_res_scheduler ([`UnCLIPScheduler`]):
+            Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]).
+    """
+
+    decoder: UNet2DConditionModel
+    text_proj: UnCLIPTextProjModel
+    text_encoder: CLIPTextModelWithProjection
+    tokenizer: CLIPTokenizer
+    feature_extractor: CLIPImageProcessor
+    image_encoder: CLIPVisionModelWithProjection
+    super_res_first: UNet2DModel
+    super_res_last: UNet2DModel
+
+    decoder_scheduler: UnCLIPScheduler
+    super_res_scheduler: UnCLIPScheduler
+    model_cpu_offload_seq = "text_encoder->image_encoder->text_proj->decoder->super_res_first->super_res_last"
+
+    def __init__(
+        self,
+        decoder: UNet2DConditionModel,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_proj: UnCLIPTextProjModel,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection,
+        super_res_first: UNet2DModel,
+        super_res_last: UNet2DModel,
+        decoder_scheduler: UnCLIPScheduler,
+        super_res_scheduler: UnCLIPScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            decoder=decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_proj=text_proj,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            super_res_first=super_res_first,
+            super_res_last=super_res_last,
+            decoder_scheduler=decoder_scheduler,
+            super_res_scheduler=super_res_scheduler,
+        )
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        text_mask = text_inputs.attention_mask.bool().to(device)
+        text_encoder_output = self.text_encoder(text_input_ids.to(device))
+
+        prompt_embeds = text_encoder_output.text_embeds
+        text_encoder_hidden_states = text_encoder_output.last_hidden_state
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_tokens = [""] * batch_size
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+            uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+
+    def _encode_image(self, image, device, num_images_per_prompt, image_embeddings: Optional[torch.Tensor] = None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if image_embeddings is None:
+            if not isinstance(image, torch.Tensor):
+                image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
+
+            image = image.to(device=device, dtype=dtype)
+            image_embeddings = self.image_encoder(image).image_embeds
+
+        image_embeddings = image_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        return image_embeddings
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor]] = None,
+        num_images_per_prompt: int = 1,
+        decoder_num_inference_steps: int = 25,
+        super_res_num_inference_steps: int = 7,
+        generator: Optional[torch.Generator] = None,
+        decoder_latents: Optional[torch.Tensor] = None,
+        super_res_latents: Optional[torch.Tensor] = None,
+        image_embeddings: Optional[torch.Tensor] = None,
+        decoder_guidance_scale: float = 8.0,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        """
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
+                `Image` or tensor representing an image batch to be used as the starting point. If you provide a
+                tensor, it needs to be compatible with the [`CLIPImageProcessor`]
+                [configuration](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
+                Can be left as `None` only when `image_embeddings` are passed.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            decoder_num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
+                image at the expense of slower inference.
+            super_res_num_inference_steps (`int`, *optional*, defaults to 7):
+                The number of denoising steps for super resolution. More denoising steps usually lead to a higher
+                quality image at the expense of slower inference.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*):
+                Pre-generated noisy latents to be used as inputs for the decoder.
+            super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*):
+                Pre-generated noisy latents to be used as inputs for the decoder.
+            decoder_guidance_scale (`float`, *optional*, defaults to 4.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            image_embeddings (`torch.Tensor`, *optional*):
+                Pre-defined image embeddings that can be derived from the image encoder. Pre-defined image embeddings
+                can be passed for tasks like image interpolations. `image` can be left as `None`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+        if image is not None:
+            if isinstance(image, PIL.Image.Image):
+                batch_size = 1
+            elif isinstance(image, list):
+                batch_size = len(image)
+            else:
+                batch_size = image.shape[0]
+        else:
+            batch_size = image_embeddings.shape[0]
+
+        prompt = [""] * batch_size
+
+        device = self._execution_device
+
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = decoder_guidance_scale > 1.0
+
+        prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance
+        )
+
+        image_embeddings = self._encode_image(image, device, num_images_per_prompt, image_embeddings)
+
+        # decoder
+        text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj(
+            image_embeddings=image_embeddings,
+            prompt_embeds=prompt_embeds,
+            text_encoder_hidden_states=text_encoder_hidden_states,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+        )
+
+        if device.type == "mps":
+            # HACK: MPS: There is a panic when padding bool tensors,
+            # so cast to int tensor for the pad and back to bool afterwards
+            text_mask = text_mask.type(torch.int)
+            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=1)
+            decoder_text_mask = decoder_text_mask.type(torch.bool)
+        else:
+            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=True)
+
+        self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device)
+        decoder_timesteps_tensor = self.decoder_scheduler.timesteps
+
+        num_channels_latents = self.decoder.config.in_channels
+        height = self.decoder.config.sample_size
+        width = self.decoder.config.sample_size
+
+        if decoder_latents is None:
+            decoder_latents = self.prepare_latents(
+                (batch_size, num_channels_latents, height, width),
+                text_encoder_hidden_states.dtype,
+                device,
+                generator,
+                decoder_latents,
+                self.decoder_scheduler,
+            )
+
+        for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents
+
+            noise_pred = self.decoder(
+                sample=latent_model_input,
+                timestep=t,
+                encoder_hidden_states=text_encoder_hidden_states,
+                class_labels=additive_clip_time_embeddings,
+                attention_mask=decoder_text_mask,
+            ).sample
+
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred_uncond, _ = noise_pred_uncond.split(latent_model_input.shape[1], dim=1)
+                noise_pred_text, predicted_variance = noise_pred_text.split(latent_model_input.shape[1], dim=1)
+                noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+
+            if i + 1 == decoder_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = decoder_timesteps_tensor[i + 1]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            decoder_latents = self.decoder_scheduler.step(
+                noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator
+            ).prev_sample
+
+        decoder_latents = decoder_latents.clamp(-1, 1)
+
+        image_small = decoder_latents
+
+        # done decoder
+
+        # super res
+
+        self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device)
+        super_res_timesteps_tensor = self.super_res_scheduler.timesteps
+
+        channels = self.super_res_first.config.in_channels // 2
+        height = self.super_res_first.config.sample_size
+        width = self.super_res_first.config.sample_size
+
+        if super_res_latents is None:
+            super_res_latents = self.prepare_latents(
+                (batch_size, channels, height, width),
+                image_small.dtype,
+                device,
+                generator,
+                super_res_latents,
+                self.super_res_scheduler,
+            )
+
+        if device.type == "mps":
+            # MPS does not support many interpolations
+            image_upscaled = F.interpolate(image_small, size=[height, width])
+        else:
+            interpolate_antialias = {}
+            if "antialias" in inspect.signature(F.interpolate).parameters:
+                interpolate_antialias["antialias"] = True
+
+            image_upscaled = F.interpolate(
+                image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias
+            )
+
+        for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)):
+            # no classifier free guidance
+
+            if i == super_res_timesteps_tensor.shape[0] - 1:
+                unet = self.super_res_last
+            else:
+                unet = self.super_res_first
+
+            latent_model_input = torch.cat([super_res_latents, image_upscaled], dim=1)
+
+            noise_pred = unet(
+                sample=latent_model_input,
+                timestep=t,
+            ).sample
+
+            if i + 1 == super_res_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = super_res_timesteps_tensor[i + 1]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            super_res_latents = self.super_res_scheduler.step(
+                noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator
+            ).prev_sample
+
+        image = super_res_latents
+
+        # done super res
+        self.maybe_free_model_hooks()
+
+        # post processing
+
+        image = image * 0.5 + 0.5
+        image = image.clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/diffusers/src/diffusers/pipelines/unclip/text_proj.py b/diffusers/src/diffusers/pipelines/unclip/text_proj.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a86d0c08a8da90ff27505c8868b030cfa10068f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/unclip/text_proj.py
@@ -0,0 +1,86 @@
+# Copyright 2024 Kakao Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models import ModelMixin
+
+
+class UnCLIPTextProjModel(ModelMixin, ConfigMixin):
+    """
+    Utility class for CLIP embeddings. Used to combine the image and text embeddings into a format usable by the
+    decoder.
+
+    For more details, see the original paper: https://arxiv.org/abs/2204.06125 section 2.1
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        *,
+        clip_extra_context_tokens: int = 4,
+        clip_embeddings_dim: int = 768,
+        time_embed_dim: int,
+        cross_attention_dim,
+    ):
+        super().__init__()
+
+        self.learned_classifier_free_guidance_embeddings = nn.Parameter(torch.zeros(clip_embeddings_dim))
+
+        # parameters for additional clip time embeddings
+        self.embedding_proj = nn.Linear(clip_embeddings_dim, time_embed_dim)
+        self.clip_image_embeddings_project_to_time_embeddings = nn.Linear(clip_embeddings_dim, time_embed_dim)
+
+        # parameters for encoder hidden states
+        self.clip_extra_context_tokens = clip_extra_context_tokens
+        self.clip_extra_context_tokens_proj = nn.Linear(
+            clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim
+        )
+        self.encoder_hidden_states_proj = nn.Linear(clip_embeddings_dim, cross_attention_dim)
+        self.text_encoder_hidden_states_norm = nn.LayerNorm(cross_attention_dim)
+
+    def forward(self, *, image_embeddings, prompt_embeds, text_encoder_hidden_states, do_classifier_free_guidance):
+        if do_classifier_free_guidance:
+            # Add the classifier free guidance embeddings to the image embeddings
+            image_embeddings_batch_size = image_embeddings.shape[0]
+            classifier_free_guidance_embeddings = self.learned_classifier_free_guidance_embeddings.unsqueeze(0)
+            classifier_free_guidance_embeddings = classifier_free_guidance_embeddings.expand(
+                image_embeddings_batch_size, -1
+            )
+            image_embeddings = torch.cat([classifier_free_guidance_embeddings, image_embeddings], dim=0)
+
+        # The image embeddings batch size and the text embeddings batch size are equal
+        assert image_embeddings.shape[0] == prompt_embeds.shape[0]
+
+        batch_size = prompt_embeds.shape[0]
+
+        # "Specifically, we modify the architecture described in Nichol et al. (2021) by projecting and
+        # adding CLIP embeddings to the existing timestep embedding, ...
+        time_projected_prompt_embeds = self.embedding_proj(prompt_embeds)
+        time_projected_image_embeddings = self.clip_image_embeddings_project_to_time_embeddings(image_embeddings)
+        additive_clip_time_embeddings = time_projected_image_embeddings + time_projected_prompt_embeds
+
+        # ... and by projecting CLIP embeddings into four
+        # extra tokens of context that are concatenated to the sequence of outputs from the GLIDE text encoder"
+        clip_extra_context_tokens = self.clip_extra_context_tokens_proj(image_embeddings)
+        clip_extra_context_tokens = clip_extra_context_tokens.reshape(batch_size, -1, self.clip_extra_context_tokens)
+        clip_extra_context_tokens = clip_extra_context_tokens.permute(0, 2, 1)
+
+        text_encoder_hidden_states = self.encoder_hidden_states_proj(text_encoder_hidden_states)
+        text_encoder_hidden_states = self.text_encoder_hidden_states_norm(text_encoder_hidden_states)
+        text_encoder_hidden_states = torch.cat([clip_extra_context_tokens, text_encoder_hidden_states], dim=1)
+
+        return text_encoder_hidden_states, additive_clip_time_embeddings
diff --git a/diffusers/src/diffusers/pipelines/unidiffuser/__init__.py b/diffusers/src/diffusers/pipelines/unidiffuser/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ac2b09a6e570087c80bc11bf1a8102dd4970b8f
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/unidiffuser/__init__.py
@@ -0,0 +1,58 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils.dummy_torch_and_transformers_objects import (
+        ImageTextPipelineOutput,
+        UniDiffuserPipeline,
+    )
+
+    _dummy_objects.update(
+        {"ImageTextPipelineOutput": ImageTextPipelineOutput, "UniDiffuserPipeline": UniDiffuserPipeline}
+    )
+else:
+    _import_structure["modeling_text_decoder"] = ["UniDiffuserTextDecoder"]
+    _import_structure["modeling_uvit"] = ["UniDiffuserModel", "UTransformer2DModel"]
+    _import_structure["pipeline_unidiffuser"] = ["ImageTextPipelineOutput", "UniDiffuserPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import (
+            ImageTextPipelineOutput,
+            UniDiffuserPipeline,
+        )
+    else:
+        from .modeling_text_decoder import UniDiffuserTextDecoder
+        from .modeling_uvit import UniDiffuserModel, UTransformer2DModel
+        from .pipeline_unidiffuser import ImageTextPipelineOutput, UniDiffuserPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py b/diffusers/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..75e5d43678d5085b4b7bdc1c4e7c0aad1309ba42
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py
@@ -0,0 +1,296 @@
+from typing import Optional
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import GPT2Config, GPT2LMHeadModel
+from transformers.modeling_utils import ModuleUtilsMixin
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models import ModelMixin
+
+
+# Modified from ClipCaptionModel in https://github.com/thu-ml/unidiffuser/blob/main/libs/caption_decoder.py
+class UniDiffuserTextDecoder(ModelMixin, ConfigMixin, ModuleUtilsMixin):
+    """
+    Text decoder model for a image-text [UniDiffuser](https://arxiv.org/pdf/2303.06555.pdf) model. This is used to
+    generate text from the UniDiffuser image-text embedding.
+
+    Parameters:
+        prefix_length (`int`):
+            Max number of prefix tokens that will be supplied to the model.
+        prefix_inner_dim (`int`):
+            The hidden size of the incoming prefix embeddings. For UniDiffuser, this would be the hidden dim of the
+            CLIP text encoder.
+        prefix_hidden_dim (`int`, *optional*):
+            Hidden dim of the MLP if we encode the prefix.
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size)..
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
+            dot-product/softmax to float() when training with mixed precision.
+    """
+
+    _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"]
+
+    @register_to_config
+    def __init__(
+        self,
+        prefix_length: int,
+        prefix_inner_dim: int,
+        prefix_hidden_dim: Optional[int] = None,
+        vocab_size: int = 50257,  # Start of GPT2 config args
+        n_positions: int = 1024,
+        n_embd: int = 768,
+        n_layer: int = 12,
+        n_head: int = 12,
+        n_inner: Optional[int] = None,
+        activation_function: str = "gelu_new",
+        resid_pdrop: float = 0.1,
+        embd_pdrop: float = 0.1,
+        attn_pdrop: float = 0.1,
+        layer_norm_epsilon: float = 1e-5,
+        initializer_range: float = 0.02,
+        scale_attn_weights: bool = True,
+        use_cache: bool = True,
+        scale_attn_by_inverse_layer_idx: bool = False,
+        reorder_and_upcast_attn: bool = False,
+    ):
+        super().__init__()
+
+        self.prefix_length = prefix_length
+
+        if prefix_inner_dim != n_embd and prefix_hidden_dim is None:
+            raise ValueError(
+                f"`prefix_hidden_dim` cannot be `None` when `prefix_inner_dim`: {prefix_hidden_dim} and"
+                f" `n_embd`: {n_embd} are not equal."
+            )
+
+        self.prefix_inner_dim = prefix_inner_dim
+        self.prefix_hidden_dim = prefix_hidden_dim
+
+        self.encode_prefix = (
+            nn.Linear(self.prefix_inner_dim, self.prefix_hidden_dim)
+            if self.prefix_hidden_dim is not None
+            else nn.Identity()
+        )
+        self.decode_prefix = (
+            nn.Linear(self.prefix_hidden_dim, n_embd) if self.prefix_hidden_dim is not None else nn.Identity()
+        )
+
+        gpt_config = GPT2Config(
+            vocab_size=vocab_size,
+            n_positions=n_positions,
+            n_embd=n_embd,
+            n_layer=n_layer,
+            n_head=n_head,
+            n_inner=n_inner,
+            activation_function=activation_function,
+            resid_pdrop=resid_pdrop,
+            embd_pdrop=embd_pdrop,
+            attn_pdrop=attn_pdrop,
+            layer_norm_epsilon=layer_norm_epsilon,
+            initializer_range=initializer_range,
+            scale_attn_weights=scale_attn_weights,
+            use_cache=use_cache,
+            scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx,
+            reorder_and_upcast_attn=reorder_and_upcast_attn,
+        )
+        self.transformer = GPT2LMHeadModel(gpt_config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        prefix_embeds: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+    ):
+        """
+        Args:
+            input_ids (`torch.Tensor` of shape `(N, max_seq_len)`):
+                Text tokens to use for inference.
+            prefix_embeds (`torch.Tensor` of shape `(N, prefix_length, 768)`):
+                Prefix embedding to preprend to the embedded tokens.
+            attention_mask (`torch.Tensor` of shape `(N, prefix_length + max_seq_len, 768)`, *optional*):
+                Attention mask for the prefix embedding.
+            labels (`torch.Tensor`, *optional*):
+                Labels to use for language modeling.
+        """
+        embedding_text = self.transformer.transformer.wte(input_ids)
+        hidden = self.encode_prefix(prefix_embeds)
+        prefix_embeds = self.decode_prefix(hidden)
+        embedding_cat = torch.cat((prefix_embeds, embedding_text), dim=1)
+
+        if labels is not None:
+            dummy_token = self.get_dummy_token(input_ids.shape[0], input_ids.device)
+            labels = torch.cat((dummy_token, input_ids), dim=1)
+        out = self.transformer(inputs_embeds=embedding_cat, labels=labels, attention_mask=attention_mask)
+        if self.prefix_hidden_dim is not None:
+            return out, hidden
+        else:
+            return out
+
+    def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor:
+        return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)
+
+    def encode(self, prefix):
+        return self.encode_prefix(prefix)
+
+    @torch.no_grad()
+    def generate_captions(self, features, eos_token_id, device):
+        """
+        Generate captions given text embedding features. Returns list[L].
+
+        Args:
+            features (`torch.Tensor` of shape `(B, L, D)`):
+                Text embedding features to generate captions from.
+            eos_token_id (`int`):
+                The token ID of the EOS token for the text decoder model.
+            device:
+                Device to perform text generation on.
+
+        Returns:
+            `List[str]`: A list of strings generated from the decoder model.
+        """
+
+        features = torch.split(features, 1, dim=0)
+        generated_tokens = []
+        generated_seq_lengths = []
+        for feature in features:
+            feature = self.decode_prefix(feature.to(device))  # back to the clip feature
+            # Only support beam search for now
+            output_tokens, seq_lengths = self.generate_beam(
+                input_embeds=feature, device=device, eos_token_id=eos_token_id
+            )
+            generated_tokens.append(output_tokens[0])
+            generated_seq_lengths.append(seq_lengths[0])
+        generated_tokens = torch.stack(generated_tokens)
+        generated_seq_lengths = torch.stack(generated_seq_lengths)
+        return generated_tokens, generated_seq_lengths
+
+    @torch.no_grad()
+    def generate_beam(
+        self,
+        input_ids=None,
+        input_embeds=None,
+        device=None,
+        beam_size: int = 5,
+        entry_length: int = 67,
+        temperature: float = 1.0,
+        eos_token_id: Optional[int] = None,
+    ):
+        """
+        Generates text using the given tokenizer and text prompt or token embedding via beam search. This
+        implementation is based on the beam search implementation from the [original UniDiffuser
+        code](https://github.com/thu-ml/unidiffuser/blob/main/libs/caption_decoder.py#L89).
+
+        Args:
+            eos_token_id (`int`, *optional*):
+                The token ID of the EOS token for the text decoder model.
+            input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+                Tokenizer indices of input sequence tokens in the vocabulary. One of `input_ids` and `input_embeds`
+                must be supplied.
+            input_embeds (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
+                An embedded representation to directly pass to the transformer as a prefix for beam search. One of
+                `input_ids` and `input_embeds` must be supplied.
+            device:
+                The device to perform beam search on.
+            beam_size (`int`, *optional*, defaults to `5`):
+                The number of best states to store during beam search.
+            entry_length (`int`, *optional*, defaults to `67`):
+                The number of iterations to run beam search.
+            temperature (`float`, *optional*, defaults to 1.0):
+                The temperature to use when performing the softmax over logits from the decoding model.
+
+        Returns:
+            `Tuple(torch.Tensor, torch.Tensor)`: A tuple of tensors where the first element is a tensor of generated
+            token sequences sorted by score in descending order, and the second element is the sequence lengths
+            corresponding to those sequences.
+        """
+        # Generates text until stop_token is reached using beam search with the desired beam size.
+        stop_token_index = eos_token_id
+        tokens = None
+        scores = None
+        seq_lengths = torch.ones(beam_size, device=device, dtype=torch.int)
+        is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
+
+        if input_embeds is not None:
+            generated = input_embeds
+        else:
+            generated = self.transformer.transformer.wte(input_ids)
+
+        for i in range(entry_length):
+            outputs = self.transformer(inputs_embeds=generated)
+            logits = outputs.logits
+            logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
+            logits = logits.softmax(-1).log()
+
+            if scores is None:
+                scores, next_tokens = logits.topk(beam_size, -1)
+                generated = generated.expand(beam_size, *generated.shape[1:])
+                next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
+                if tokens is None:
+                    tokens = next_tokens
+                else:
+                    tokens = tokens.expand(beam_size, *tokens.shape[1:])
+                    tokens = torch.cat((tokens, next_tokens), dim=1)
+            else:
+                logits[is_stopped] = -float(np.inf)
+                logits[is_stopped, 0] = 0
+                scores_sum = scores[:, None] + logits
+                seq_lengths[~is_stopped] += 1
+                scores_sum_average = scores_sum / seq_lengths[:, None]
+                scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
+                next_tokens_source = next_tokens // scores_sum.shape[1]
+                seq_lengths = seq_lengths[next_tokens_source]
+                next_tokens = next_tokens % scores_sum.shape[1]
+                next_tokens = next_tokens.unsqueeze(1)
+                tokens = tokens[next_tokens_source]
+                tokens = torch.cat((tokens, next_tokens), dim=1)
+                generated = generated[next_tokens_source]
+                scores = scores_sum_average * seq_lengths
+                is_stopped = is_stopped[next_tokens_source]
+
+            next_token_embed = self.transformer.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1)
+            generated = torch.cat((generated, next_token_embed), dim=1)
+            is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
+            if is_stopped.all():
+                break
+
+        scores = scores / seq_lengths
+        order = scores.argsort(descending=True)
+        # tokens tensors are already padded to max_seq_length
+        output_texts = [tokens[i] for i in order]
+        output_texts = torch.stack(output_texts, dim=0)
+        seq_lengths = torch.tensor([seq_lengths[i] for i in order], dtype=seq_lengths.dtype)
+        return output_texts, seq_lengths
diff --git a/diffusers/src/diffusers/pipelines/unidiffuser/modeling_uvit.py b/diffusers/src/diffusers/pipelines/unidiffuser/modeling_uvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb1514b153ce960438bed7be321329eb0df30dc6
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/unidiffuser/modeling_uvit.py
@@ -0,0 +1,1197 @@
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models import ModelMixin
+from ...models.attention import FeedForward
+from ...models.attention_processor import Attention
+from ...models.embeddings import TimestepEmbedding, Timesteps, get_2d_sincos_pos_embed
+from ...models.modeling_outputs import Transformer2DModelOutput
+from ...models.normalization import AdaLayerNorm
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        logger.warning(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect."
+        )
+
+    with torch.no_grad():
+        # Values are generated by using a truncated uniform distribution and
+        # then using the inverse CDF for the normal distribution.
+        # Get upper and lower cdf values
+        l = norm_cdf((a - mean) / std)
+        u = norm_cdf((b - mean) / std)
+
+        # Uniformly fill tensor with values from [l, u], then translate to
+        # [2l-1, 2u-1].
+        tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+        # Use inverse cdf transform for normal distribution to get truncated
+        # standard normal
+        tensor.erfinv_()
+
+        # Transform to proper mean, std
+        tensor.mul_(std * math.sqrt(2.0))
+        tensor.add_(mean)
+
+        # Clamp to ensure it's in the proper range
+        tensor.clamp_(min=a, max=b)
+        return tensor
+
+
+def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
+    # type: (torch.Tensor, float, float, float, float) -> torch.Tensor
+    r"""Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean},
+    \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for
+    generating the random values works best when :math:`a \leq \text{mean} \leq b`.
+
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    Examples:
+        >>> w = torch.empty(3, 5) >>> nn.init.trunc_normal_(w)
+    """
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+
+
+class PatchEmbed(nn.Module):
+    """2D Image to Patch Embedding"""
+
+    def __init__(
+        self,
+        height=224,
+        width=224,
+        patch_size=16,
+        in_channels=3,
+        embed_dim=768,
+        layer_norm=False,
+        flatten=True,
+        bias=True,
+        use_pos_embed=True,
+    ):
+        super().__init__()
+
+        num_patches = (height // patch_size) * (width // patch_size)
+        self.flatten = flatten
+        self.layer_norm = layer_norm
+
+        self.proj = nn.Conv2d(
+            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
+        )
+        if layer_norm:
+            self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+        else:
+            self.norm = None
+
+        self.use_pos_embed = use_pos_embed
+        if self.use_pos_embed:
+            pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
+            self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+
+    def forward(self, latent):
+        latent = self.proj(latent)
+        if self.flatten:
+            latent = latent.flatten(2).transpose(1, 2)  # BCHW -> BNC
+        if self.layer_norm:
+            latent = self.norm(latent)
+        if self.use_pos_embed:
+            return latent + self.pos_embed
+        else:
+            return latent
+
+
+class SkipBlock(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+
+        self.skip_linear = nn.Linear(2 * dim, dim)
+
+        # Use torch.nn.LayerNorm for now, following the original code
+        self.norm = nn.LayerNorm(dim)
+
+    def forward(self, x, skip):
+        x = self.skip_linear(torch.cat([x, skip], dim=-1))
+        x = self.norm(x)
+
+        return x
+
+
+# Modified to support both pre-LayerNorm and post-LayerNorm configurations
+# Don't support AdaLayerNormZero for now
+# Modified from diffusers.models.attention.BasicTransformerBlock
+class UTransformerBlock(nn.Module):
+    r"""
+    A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`):
+            Activation function to be used in feed-forward.
+        num_embeds_ada_norm (:obj: `int`, *optional*):
+            The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (:obj: `bool`, *optional*, defaults to `False`):
+            Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the query and key to float32 when performing the attention calculation.
+        norm_elementwise_affine (`bool`, *optional*):
+            Whether to use learnable per-element affine parameters during layer normalization.
+        norm_type (`str`, defaults to `"layer_norm"`):
+            The layer norm implementation to use.
+        pre_layer_norm (`bool`, *optional*):
+            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
+            as opposed to after ("post-LayerNorm"). Note that `BasicTransformerBlock` uses pre-LayerNorm, e.g.
+            `pre_layer_norm = True`.
+        final_dropout (`bool`, *optional*):
+            Whether to use a final Dropout layer after the feedforward network.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",
+        pre_layer_norm: bool = True,
+        final_dropout: bool = False,
+    ):
+        super().__init__()
+        self.only_cross_attention = only_cross_attention
+
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+
+        self.pre_layer_norm = pre_layer_norm
+
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+
+        # 1. Self-Attn
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.attn2 = None
+
+        if self.use_ada_layer_norm:
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        else:
+            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+
+        if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            self.norm2 = (
+                AdaLayerNorm(dim, num_embeds_ada_norm)
+                if self.use_ada_layer_norm
+                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+            )
+        else:
+            self.norm2 = None
+
+        # 3. Feed-forward
+        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        timestep=None,
+        cross_attention_kwargs=None,
+        class_labels=None,
+    ):
+        # Pre-LayerNorm
+        if self.pre_layer_norm:
+            if self.use_ada_layer_norm:
+                norm_hidden_states = self.norm1(hidden_states, timestep)
+            else:
+                norm_hidden_states = self.norm1(hidden_states)
+        else:
+            norm_hidden_states = hidden_states
+
+        # 1. Self-Attention
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+
+        # Post-LayerNorm
+        if not self.pre_layer_norm:
+            if self.use_ada_layer_norm:
+                attn_output = self.norm1(attn_output, timestep)
+            else:
+                attn_output = self.norm1(attn_output)
+
+        hidden_states = attn_output + hidden_states
+
+        if self.attn2 is not None:
+            # Pre-LayerNorm
+            if self.pre_layer_norm:
+                norm_hidden_states = (
+                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+                )
+            else:
+                norm_hidden_states = hidden_states
+            # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly
+            # prepare attention mask here
+
+            # 2. Cross-Attention
+            attn_output = self.attn2(
+                norm_hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                **cross_attention_kwargs,
+            )
+
+            # Post-LayerNorm
+            if not self.pre_layer_norm:
+                attn_output = self.norm2(attn_output, timestep) if self.use_ada_layer_norm else self.norm2(attn_output)
+
+            hidden_states = attn_output + hidden_states
+
+        # 3. Feed-forward
+        # Pre-LayerNorm
+        if self.pre_layer_norm:
+            norm_hidden_states = self.norm3(hidden_states)
+        else:
+            norm_hidden_states = hidden_states
+
+        ff_output = self.ff(norm_hidden_states)
+
+        # Post-LayerNorm
+        if not self.pre_layer_norm:
+            ff_output = self.norm3(ff_output)
+
+        hidden_states = ff_output + hidden_states
+
+        return hidden_states
+
+
+# Like UTransformerBlock except with LayerNorms on the residual backbone of the block
+# Modified from diffusers.models.attention.BasicTransformerBlock
+class UniDiffuserBlock(nn.Module):
+    r"""
+    A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations and puts the
+    LayerNorms on the residual backbone of the block. This matches the transformer block in the [original UniDiffuser
+    implementation](https://github.com/thu-ml/unidiffuser/blob/main/libs/uvit_multi_post_ln_v1.py#L104).
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`):
+            Activation function to be used in feed-forward.
+        num_embeds_ada_norm (:obj: `int`, *optional*):
+            The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (:obj: `bool`, *optional*, defaults to `False`):
+            Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the query and key to float() when performing the attention calculation.
+        norm_elementwise_affine (`bool`, *optional*):
+            Whether to use learnable per-element affine parameters during layer normalization.
+        norm_type (`str`, defaults to `"layer_norm"`):
+            The layer norm implementation to use.
+        pre_layer_norm (`bool`, *optional*):
+            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
+            as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
+            (`pre_layer_norm = False`).
+        final_dropout (`bool`, *optional*):
+            Whether to use a final Dropout layer after the feedforward network.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",
+        pre_layer_norm: bool = False,
+        final_dropout: bool = True,
+    ):
+        super().__init__()
+        self.only_cross_attention = only_cross_attention
+
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+
+        self.pre_layer_norm = pre_layer_norm
+
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+
+        # 1. Self-Attn
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.attn2 = None
+
+        if self.use_ada_layer_norm:
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        else:
+            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+
+        if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            self.norm2 = (
+                AdaLayerNorm(dim, num_embeds_ada_norm)
+                if self.use_ada_layer_norm
+                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+            )
+        else:
+            self.norm2 = None
+
+        # 3. Feed-forward
+        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        timestep=None,
+        cross_attention_kwargs=None,
+        class_labels=None,
+    ):
+        # Following the diffusers transformer block implementation, put the LayerNorm on the
+        # residual backbone
+        # Pre-LayerNorm
+        if self.pre_layer_norm:
+            if self.use_ada_layer_norm:
+                hidden_states = self.norm1(hidden_states, timestep)
+            else:
+                hidden_states = self.norm1(hidden_states)
+
+        # 1. Self-Attention
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+        attn_output = self.attn1(
+            hidden_states,
+            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+
+        hidden_states = attn_output + hidden_states
+
+        # Following the diffusers transformer block implementation, put the LayerNorm on the
+        # residual backbone
+        # Post-LayerNorm
+        if not self.pre_layer_norm:
+            if self.use_ada_layer_norm:
+                hidden_states = self.norm1(hidden_states, timestep)
+            else:
+                hidden_states = self.norm1(hidden_states)
+
+        if self.attn2 is not None:
+            # Pre-LayerNorm
+            if self.pre_layer_norm:
+                hidden_states = (
+                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+                )
+            # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly
+            # prepare attention mask here
+
+            # 2. Cross-Attention
+            attn_output = self.attn2(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                **cross_attention_kwargs,
+            )
+
+            hidden_states = attn_output + hidden_states
+
+            # Post-LayerNorm
+            if not self.pre_layer_norm:
+                hidden_states = (
+                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+                )
+
+        # 3. Feed-forward
+        # Pre-LayerNorm
+        if self.pre_layer_norm:
+            hidden_states = self.norm3(hidden_states)
+
+        ff_output = self.ff(hidden_states)
+
+        hidden_states = ff_output + hidden_states
+
+        # Post-LayerNorm
+        if not self.pre_layer_norm:
+            hidden_states = self.norm3(hidden_states)
+
+        return hidden_states
+
+
+# Modified from diffusers.models.transformer_2d.Transformer2DModel
+# Modify the transformer block structure to be U-Net like following U-ViT
+# Only supports patch-style input and torch.nn.LayerNorm currently
+# https://github.com/baofff/U-ViT
+class UTransformer2DModel(ModelMixin, ConfigMixin):
+    """
+    Transformer model based on the [U-ViT](https://github.com/baofff/U-ViT) architecture for image-like data. Compared
+    to [`Transformer2DModel`], this model has skip connections between transformer blocks in a "U"-shaped fashion,
+    similar to a U-Net. Supports only continuous (actual embeddings) inputs, which are embedded via a [`PatchEmbed`]
+    layer and then reshaped to (b, t, d).
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            Pass if the input is continuous. The number of channels in the input.
+        out_channels (`int`, *optional*):
+            The number of output channels; if `None`, defaults to `in_channels`.
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        norm_num_groups (`int`, *optional*, defaults to `32`):
+            The number of groups to use when performing Group Normalization.
+        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
+        attention_bias (`bool`, *optional*):
+            Configure if the TransformerBlocks' attention should contain a bias parameter.
+        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
+            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
+            `ImagePositionalEmbeddings`.
+        num_vector_embeds (`int`, *optional*):
+            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+            Includes the class for the masked latent pixel.
+        patch_size (`int`, *optional*, defaults to 2):
+            The patch size to use in the patch embedding.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
+            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
+            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
+            up to but not more than steps than `num_embeds_ada_norm`.
+        use_linear_projection (int, *optional*): TODO: Not used
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used in each
+            transformer block.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the query and key to float() when performing the attention calculation.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`.
+        block_type (`str`, *optional*, defaults to `"unidiffuser"`):
+            The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual
+            backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard
+            behavior in `diffusers`.)
+        pre_layer_norm (`bool`, *optional*):
+            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
+            as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
+            (`pre_layer_norm = False`).
+        norm_elementwise_affine (`bool`, *optional*):
+            Whether to use learnable per-element affine parameters during layer normalization.
+        use_patch_pos_embed (`bool`, *optional*):
+            Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`).
+        final_dropout (`bool`, *optional*):
+            Whether to use a final Dropout layer after the feedforward network.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        patch_size: Optional[int] = 2,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_type: str = "layer_norm",
+        block_type: str = "unidiffuser",
+        pre_layer_norm: bool = False,
+        norm_elementwise_affine: bool = True,
+        use_patch_pos_embed=False,
+        ff_final_dropout: bool = False,
+    ):
+        super().__init__()
+        self.use_linear_projection = use_linear_projection
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+
+        # 1. Input
+        # Only support patch input of shape (batch_size, num_channels, height, width) for now
+        assert in_channels is not None and patch_size is not None, "Patch input requires in_channels and patch_size."
+
+        assert sample_size is not None, "UTransformer2DModel over patched input must provide sample_size"
+
+        # 2. Define input layers
+        self.height = sample_size
+        self.width = sample_size
+
+        self.patch_size = patch_size
+        self.pos_embed = PatchEmbed(
+            height=sample_size,
+            width=sample_size,
+            patch_size=patch_size,
+            in_channels=in_channels,
+            embed_dim=inner_dim,
+            use_pos_embed=use_patch_pos_embed,
+        )
+
+        # 3. Define transformers blocks
+        # Modify this to have in_blocks ("downsample" blocks, even though we don't actually downsample), a mid_block,
+        # and out_blocks ("upsample" blocks). Like a U-Net, there are skip connections from in_blocks to out_blocks in
+        # a "U"-shaped fashion (e.g. first in_block to last out_block, etc.).
+        # Quick hack to make the transformer block type configurable
+        if block_type == "unidiffuser":
+            block_cls = UniDiffuserBlock
+        else:
+            block_cls = UTransformerBlock
+        self.transformer_in_blocks = nn.ModuleList(
+            [
+                block_cls(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                    norm_type=norm_type,
+                    pre_layer_norm=pre_layer_norm,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    final_dropout=ff_final_dropout,
+                )
+                for d in range(num_layers // 2)
+            ]
+        )
+
+        self.transformer_mid_block = block_cls(
+            inner_dim,
+            num_attention_heads,
+            attention_head_dim,
+            dropout=dropout,
+            cross_attention_dim=cross_attention_dim,
+            activation_fn=activation_fn,
+            num_embeds_ada_norm=num_embeds_ada_norm,
+            attention_bias=attention_bias,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            norm_type=norm_type,
+            pre_layer_norm=pre_layer_norm,
+            norm_elementwise_affine=norm_elementwise_affine,
+            final_dropout=ff_final_dropout,
+        )
+
+        # For each skip connection, we use a SkipBlock (concatenation + Linear + LayerNorm) to process the inputs
+        # before each transformer out_block.
+        self.transformer_out_blocks = nn.ModuleList(
+            [
+                nn.ModuleDict(
+                    {
+                        "skip": SkipBlock(
+                            inner_dim,
+                        ),
+                        "block": block_cls(
+                            inner_dim,
+                            num_attention_heads,
+                            attention_head_dim,
+                            dropout=dropout,
+                            cross_attention_dim=cross_attention_dim,
+                            activation_fn=activation_fn,
+                            num_embeds_ada_norm=num_embeds_ada_norm,
+                            attention_bias=attention_bias,
+                            only_cross_attention=only_cross_attention,
+                            upcast_attention=upcast_attention,
+                            norm_type=norm_type,
+                            pre_layer_norm=pre_layer_norm,
+                            norm_elementwise_affine=norm_elementwise_affine,
+                            final_dropout=ff_final_dropout,
+                        ),
+                    }
+                )
+                for d in range(num_layers // 2)
+            ]
+        )
+
+        # 4. Define output layers
+        self.out_channels = in_channels if out_channels is None else out_channels
+
+        # Following the UniDiffuser U-ViT implementation, we process the transformer output with
+        # a LayerNorm layer with per-element affine params
+        self.norm_out = nn.LayerNorm(inner_dim)
+
+    def forward(
+        self,
+        hidden_states,
+        encoder_hidden_states=None,
+        timestep=None,
+        class_labels=None,
+        cross_attention_kwargs=None,
+        return_dict: bool = True,
+        hidden_states_is_embedding: bool = False,
+        unpatchify: bool = True,
+    ):
+        """
+        Args:
+            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
+                When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.long`, *optional*):
+                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
+            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels
+                conditioning.
+            cross_attention_kwargs (*optional*):
+                Keyword arguments to supply to the cross attention layers, if used.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+            hidden_states_is_embedding (`bool`, *optional*, defaults to `False`):
+                Whether or not hidden_states is an embedding directly usable by the transformer. In this case we will
+                ignore input handling (e.g. continuous, vectorized, etc.) and directly feed hidden_states into the
+                transformer blocks.
+            unpatchify (`bool`, *optional*, defaults to `True`):
+                Whether to unpatchify the transformer output.
+
+        Returns:
+            [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
+            [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+        # 0. Check inputs
+
+        if not unpatchify and return_dict:
+            raise ValueError(
+                f"Cannot both define `unpatchify`: {unpatchify} and `return_dict`: {return_dict} since when"
+                f" `unpatchify` is {unpatchify} the returned output is of shape (batch_size, seq_len, hidden_dim)"
+                " rather than (batch_size, num_channels, height, width)."
+            )
+
+        # 1. Input
+        if not hidden_states_is_embedding:
+            hidden_states = self.pos_embed(hidden_states)
+
+        # 2. Blocks
+
+        # In ("downsample") blocks
+        skips = []
+        for in_block in self.transformer_in_blocks:
+            hidden_states = in_block(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                timestep=timestep,
+                cross_attention_kwargs=cross_attention_kwargs,
+                class_labels=class_labels,
+            )
+            skips.append(hidden_states)
+
+        # Mid block
+        hidden_states = self.transformer_mid_block(hidden_states)
+
+        # Out ("upsample") blocks
+        for out_block in self.transformer_out_blocks:
+            hidden_states = out_block["skip"](hidden_states, skips.pop())
+            hidden_states = out_block["block"](
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                timestep=timestep,
+                cross_attention_kwargs=cross_attention_kwargs,
+                class_labels=class_labels,
+            )
+
+        # 3. Output
+        # Don't support AdaLayerNorm for now, so no conditioning/scale/shift logic
+        hidden_states = self.norm_out(hidden_states)
+        # hidden_states = self.proj_out(hidden_states)
+
+        if unpatchify:
+            # unpatchify
+            height = width = int(hidden_states.shape[1] ** 0.5)
+            hidden_states = hidden_states.reshape(
+                shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
+            )
+            hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+            output = hidden_states.reshape(
+                shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
+            )
+        else:
+            output = hidden_states
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
+
+
+class UniDiffuserModel(ModelMixin, ConfigMixin):
+    """
+    Transformer model for a image-text [UniDiffuser](https://arxiv.org/pdf/2303.06555.pdf) model. This is a
+    modification of [`UTransformer2DModel`] with input and output heads for the VAE-embedded latent image, the
+    CLIP-embedded image, and the CLIP-embedded prompt (see paper for more details).
+
+    Parameters:
+        text_dim (`int`): The hidden dimension of the CLIP text model used to embed images.
+        clip_img_dim (`int`): The hidden dimension of the CLIP vision model used to embed prompts.
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            Pass if the input is continuous. The number of channels in the input.
+        out_channels (`int`, *optional*):
+            The number of output channels; if `None`, defaults to `in_channels`.
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        norm_num_groups (`int`, *optional*, defaults to `32`):
+            The number of groups to use when performing Group Normalization.
+        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
+        attention_bias (`bool`, *optional*):
+            Configure if the TransformerBlocks' attention should contain a bias parameter.
+        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
+            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
+            `ImagePositionalEmbeddings`.
+        num_vector_embeds (`int`, *optional*):
+            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+            Includes the class for the masked latent pixel.
+        patch_size (`int`, *optional*, defaults to 2):
+            The patch size to use in the patch embedding.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
+            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
+            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
+            up to but not more than steps than `num_embeds_ada_norm`.
+        use_linear_projection (int, *optional*): TODO: Not used
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used in each
+            transformer block.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the query and key to float32 when performing the attention calculation.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`.
+        block_type (`str`, *optional*, defaults to `"unidiffuser"`):
+            The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual
+            backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard
+            behavior in `diffusers`.)
+        pre_layer_norm (`bool`, *optional*):
+            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
+            as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
+            (`pre_layer_norm = False`).
+        norm_elementwise_affine (`bool`, *optional*):
+            Whether to use learnable per-element affine parameters during layer normalization.
+        use_patch_pos_embed (`bool`, *optional*):
+            Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`).
+        ff_final_dropout (`bool`, *optional*):
+            Whether to use a final Dropout layer after the feedforward network.
+        use_data_type_embedding (`bool`, *optional*):
+            Whether to use a data type embedding. This is only relevant for UniDiffuser-v1 style models; UniDiffuser-v1
+            is continue-trained from UniDiffuser-v0 on non-publically-available data and accepts a `data_type`
+            argument, which can either be `1` to use the weights trained on non-publically-available data or `0`
+            otherwise. This argument is subsequently embedded by the data type embedding, if used.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        text_dim: int = 768,
+        clip_img_dim: int = 512,
+        num_text_tokens: int = 77,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        patch_size: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_type: str = "layer_norm",
+        block_type: str = "unidiffuser",
+        pre_layer_norm: bool = False,
+        use_timestep_embedding=False,
+        norm_elementwise_affine: bool = True,
+        use_patch_pos_embed=False,
+        ff_final_dropout: bool = True,
+        use_data_type_embedding: bool = False,
+    ):
+        super().__init__()
+
+        # 0. Handle dimensions
+        self.inner_dim = num_attention_heads * attention_head_dim
+
+        assert sample_size is not None, "UniDiffuserModel over patched input must provide sample_size"
+        self.sample_size = sample_size
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+
+        self.patch_size = patch_size
+        # Assume image is square...
+        self.num_patches = (self.sample_size // patch_size) * (self.sample_size // patch_size)
+
+        # 1. Define input layers
+        # 1.1 Input layers for text and image input
+        # For now, only support patch input for VAE latent image input
+        self.vae_img_in = PatchEmbed(
+            height=sample_size,
+            width=sample_size,
+            patch_size=patch_size,
+            in_channels=in_channels,
+            embed_dim=self.inner_dim,
+            use_pos_embed=use_patch_pos_embed,
+        )
+        self.clip_img_in = nn.Linear(clip_img_dim, self.inner_dim)
+        self.text_in = nn.Linear(text_dim, self.inner_dim)
+
+        # 1.2. Timestep embeddings for t_img, t_text
+        self.timestep_img_proj = Timesteps(
+            self.inner_dim,
+            flip_sin_to_cos=True,
+            downscale_freq_shift=0,
+        )
+        self.timestep_img_embed = (
+            TimestepEmbedding(
+                self.inner_dim,
+                4 * self.inner_dim,
+                out_dim=self.inner_dim,
+            )
+            if use_timestep_embedding
+            else nn.Identity()
+        )
+
+        self.timestep_text_proj = Timesteps(
+            self.inner_dim,
+            flip_sin_to_cos=True,
+            downscale_freq_shift=0,
+        )
+        self.timestep_text_embed = (
+            TimestepEmbedding(
+                self.inner_dim,
+                4 * self.inner_dim,
+                out_dim=self.inner_dim,
+            )
+            if use_timestep_embedding
+            else nn.Identity()
+        )
+
+        # 1.3. Positional embedding
+        self.num_text_tokens = num_text_tokens
+        self.num_tokens = 1 + 1 + num_text_tokens + 1 + self.num_patches
+        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, self.inner_dim))
+        self.pos_embed_drop = nn.Dropout(p=dropout)
+        trunc_normal_(self.pos_embed, std=0.02)
+
+        # 1.4. Handle data type token embeddings for UniDiffuser-V1, if necessary
+        self.use_data_type_embedding = use_data_type_embedding
+        if self.use_data_type_embedding:
+            self.data_type_token_embedding = nn.Embedding(2, self.inner_dim)
+            self.data_type_pos_embed_token = nn.Parameter(torch.zeros(1, 1, self.inner_dim))
+
+        # 2. Define transformer blocks
+        self.transformer = UTransformer2DModel(
+            num_attention_heads=num_attention_heads,
+            attention_head_dim=attention_head_dim,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            num_layers=num_layers,
+            dropout=dropout,
+            norm_num_groups=norm_num_groups,
+            cross_attention_dim=cross_attention_dim,
+            attention_bias=attention_bias,
+            sample_size=sample_size,
+            num_vector_embeds=num_vector_embeds,
+            patch_size=patch_size,
+            activation_fn=activation_fn,
+            num_embeds_ada_norm=num_embeds_ada_norm,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            norm_type=norm_type,
+            block_type=block_type,
+            pre_layer_norm=pre_layer_norm,
+            norm_elementwise_affine=norm_elementwise_affine,
+            use_patch_pos_embed=use_patch_pos_embed,
+            ff_final_dropout=ff_final_dropout,
+        )
+
+        # 3. Define output layers
+        patch_dim = (patch_size**2) * out_channels
+        self.vae_img_out = nn.Linear(self.inner_dim, patch_dim)
+        self.clip_img_out = nn.Linear(self.inner_dim, clip_img_dim)
+        self.text_out = nn.Linear(self.inner_dim, text_dim)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {"pos_embed"}
+
+    def forward(
+        self,
+        latent_image_embeds: torch.Tensor,
+        image_embeds: torch.Tensor,
+        prompt_embeds: torch.Tensor,
+        timestep_img: Union[torch.Tensor, float, int],
+        timestep_text: Union[torch.Tensor, float, int],
+        data_type: Optional[Union[torch.Tensor, float, int]] = 1,
+        encoder_hidden_states=None,
+        cross_attention_kwargs=None,
+    ):
+        """
+        Args:
+            latent_image_embeds (`torch.Tensor` of shape `(batch size, latent channels, height, width)`):
+                Latent image representation from the VAE encoder.
+            image_embeds (`torch.Tensor` of shape `(batch size, 1, clip_img_dim)`):
+                CLIP-embedded image representation (unsqueezed in the first dimension).
+            prompt_embeds (`torch.Tensor` of shape `(batch size, seq_len, text_dim)`):
+                CLIP-embedded text representation.
+            timestep_img (`torch.long` or `float` or `int`):
+                Current denoising step for the image.
+            timestep_text (`torch.long` or `float` or `int`):
+                Current denoising step for the text.
+            data_type: (`torch.int` or `float` or `int`, *optional*, defaults to `1`):
+                Only used in UniDiffuser-v1-style models. Can be either `1`, to use weights trained on nonpublic data,
+                or `0` otherwise.
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            cross_attention_kwargs (*optional*):
+                Keyword arguments to supply to the cross attention layers, if used.
+
+
+        Returns:
+            `tuple`: Returns relevant parts of the model's noise prediction: the first element of the tuple is tbe VAE
+            image embedding, the second element is the CLIP image embedding, and the third element is the CLIP text
+            embedding.
+        """
+        batch_size = latent_image_embeds.shape[0]
+
+        # 1. Input
+        # 1.1. Map inputs to shape (B, N, inner_dim)
+        vae_hidden_states = self.vae_img_in(latent_image_embeds)
+        clip_hidden_states = self.clip_img_in(image_embeds)
+        text_hidden_states = self.text_in(prompt_embeds)
+
+        num_text_tokens, num_img_tokens = text_hidden_states.size(1), vae_hidden_states.size(1)
+
+        # 1.2. Encode image timesteps to single token (B, 1, inner_dim)
+        if not torch.is_tensor(timestep_img):
+            timestep_img = torch.tensor([timestep_img], dtype=torch.long, device=vae_hidden_states.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timestep_img = timestep_img * torch.ones(batch_size, dtype=timestep_img.dtype, device=timestep_img.device)
+
+        timestep_img_token = self.timestep_img_proj(timestep_img)
+        # t_img_token does not contain any weights and will always return f32 tensors
+        # but time_embedding might be fp16, so we need to cast here.
+        timestep_img_token = timestep_img_token.to(dtype=self.dtype)
+        timestep_img_token = self.timestep_img_embed(timestep_img_token)
+        timestep_img_token = timestep_img_token.unsqueeze(dim=1)
+
+        # 1.3. Encode text timesteps to single token (B, 1, inner_dim)
+        if not torch.is_tensor(timestep_text):
+            timestep_text = torch.tensor([timestep_text], dtype=torch.long, device=vae_hidden_states.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timestep_text = timestep_text * torch.ones(batch_size, dtype=timestep_text.dtype, device=timestep_text.device)
+
+        timestep_text_token = self.timestep_text_proj(timestep_text)
+        # t_text_token does not contain any weights and will always return f32 tensors
+        # but time_embedding might be fp16, so we need to cast here.
+        timestep_text_token = timestep_text_token.to(dtype=self.dtype)
+        timestep_text_token = self.timestep_text_embed(timestep_text_token)
+        timestep_text_token = timestep_text_token.unsqueeze(dim=1)
+
+        # 1.4. Concatenate all of the embeddings together.
+        if self.use_data_type_embedding:
+            assert data_type is not None, "data_type must be supplied if the model uses a data type embedding"
+            if not torch.is_tensor(data_type):
+                data_type = torch.tensor([data_type], dtype=torch.int, device=vae_hidden_states.device)
+
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            data_type = data_type * torch.ones(batch_size, dtype=data_type.dtype, device=data_type.device)
+
+            data_type_token = self.data_type_token_embedding(data_type).unsqueeze(dim=1)
+            hidden_states = torch.cat(
+                [
+                    timestep_img_token,
+                    timestep_text_token,
+                    data_type_token,
+                    text_hidden_states,
+                    clip_hidden_states,
+                    vae_hidden_states,
+                ],
+                dim=1,
+            )
+        else:
+            hidden_states = torch.cat(
+                [timestep_img_token, timestep_text_token, text_hidden_states, clip_hidden_states, vae_hidden_states],
+                dim=1,
+            )
+
+        # 1.5. Prepare the positional embeddings and add to hidden states
+        # Note: I think img_vae should always have the proper shape, so there's no need to interpolate
+        # the position embeddings.
+        if self.use_data_type_embedding:
+            pos_embed = torch.cat(
+                [self.pos_embed[:, : 1 + 1, :], self.data_type_pos_embed_token, self.pos_embed[:, 1 + 1 :, :]], dim=1
+            )
+        else:
+            pos_embed = self.pos_embed
+        hidden_states = hidden_states + pos_embed
+        hidden_states = self.pos_embed_drop(hidden_states)
+
+        # 2. Blocks
+        hidden_states = self.transformer(
+            hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            timestep=None,
+            class_labels=None,
+            cross_attention_kwargs=cross_attention_kwargs,
+            return_dict=False,
+            hidden_states_is_embedding=True,
+            unpatchify=False,
+        )[0]
+
+        # 3. Output
+        # Split out the predicted noise representation.
+        if self.use_data_type_embedding:
+            (
+                t_img_token_out,
+                t_text_token_out,
+                data_type_token_out,
+                text_out,
+                img_clip_out,
+                img_vae_out,
+            ) = hidden_states.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), dim=1)
+        else:
+            t_img_token_out, t_text_token_out, text_out, img_clip_out, img_vae_out = hidden_states.split(
+                (1, 1, num_text_tokens, 1, num_img_tokens), dim=1
+            )
+
+        img_vae_out = self.vae_img_out(img_vae_out)
+
+        # unpatchify
+        height = width = int(img_vae_out.shape[1] ** 0.5)
+        img_vae_out = img_vae_out.reshape(
+            shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
+        )
+        img_vae_out = torch.einsum("nhwpqc->nchpwq", img_vae_out)
+        img_vae_out = img_vae_out.reshape(
+            shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
+        )
+
+        img_clip_out = self.clip_img_out(img_clip_out)
+
+        text_out = self.text_out(text_out)
+
+        return img_vae_out, img_clip_out, text_out
diff --git a/diffusers/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/diffusers/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f65caf4e61061ff2ce1e71f5db947f6acc4d65a
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
@@ -0,0 +1,1420 @@
+import inspect
+from dataclasses import dataclass
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+    GPT2Tokenizer,
+)
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.outputs import BaseOutput
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .modeling_text_decoder import UniDiffuserTextDecoder
+from .modeling_uvit import UniDiffuserModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# New BaseOutput child class for joint image-text output
+@dataclass
+class ImageTextPipelineOutput(BaseOutput):
+    """
+    Output class for joint image-text pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+        text (`List[str]` or `List[List[str]]`)
+            List of generated text strings of length `batch_size` or a list of list of strings whose outer list has
+            length `batch_size`.
+    """
+
+    images: Optional[Union[List[PIL.Image.Image], np.ndarray]]
+    text: Optional[Union[List[str], List[List[str]]]]
+
+
+class UniDiffuserPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for a bimodal image-text model which supports unconditional text and image generation, text-conditioned
+    image generation, image-conditioned text generation, and joint image-text generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. This
+            is part of the UniDiffuser image representation along with the CLIP vision encoding.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        image_encoder ([`CLIPVisionModel`]):
+            A [`~transformers.CLIPVisionModel`] to encode images as part of its image representation along with the VAE
+            latent representation.
+        image_processor ([`CLIPImageProcessor`]):
+            [`~transformers.CLIPImageProcessor`] to preprocess an image before CLIP encoding it with `image_encoder`.
+        clip_tokenizer ([`CLIPTokenizer`]):
+             A [`~transformers.CLIPTokenizer`] to tokenize the prompt before encoding it with `text_encoder`.
+        text_decoder ([`UniDiffuserTextDecoder`]):
+            Frozen text decoder. This is a GPT-style model which is used to generate text from the UniDiffuser
+            embedding.
+        text_tokenizer ([`GPT2Tokenizer`]):
+            A [`~transformers.GPT2Tokenizer`] to decode text for text generation; used along with the `text_decoder`.
+        unet ([`UniDiffuserModel`]):
+            A [U-ViT](https://github.com/baofff/U-ViT) model with UNNet-style skip connections between transformer
+            layers to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image and/or text latents. The
+            original UniDiffuser paper uses the [`DPMSolverMultistepScheduler`] scheduler.
+    """
+
+    # TODO: support for moving submodules for components with enable_model_cpu_offload
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae->text_decoder"
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        image_encoder: CLIPVisionModelWithProjection,
+        clip_image_processor: CLIPImageProcessor,
+        clip_tokenizer: CLIPTokenizer,
+        text_decoder: UniDiffuserTextDecoder,
+        text_tokenizer: GPT2Tokenizer,
+        unet: UniDiffuserModel,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+
+        if text_encoder.config.hidden_size != text_decoder.prefix_inner_dim:
+            raise ValueError(
+                f"The text encoder hidden size and text decoder prefix inner dim must be the same, but"
+                f" `text_encoder.config.hidden_size`: {text_encoder.config.hidden_size} and `text_decoder.prefix_inner_dim`: {text_decoder.prefix_inner_dim}"
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            image_encoder=image_encoder,
+            clip_image_processor=clip_image_processor,
+            clip_tokenizer=clip_tokenizer,
+            text_decoder=text_decoder,
+            text_tokenizer=text_tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.num_channels_latents = vae.config.latent_channels
+        self.text_encoder_seq_len = text_encoder.config.max_position_embeddings
+        self.text_encoder_hidden_size = text_encoder.config.hidden_size
+        self.image_encoder_projection_dim = image_encoder.config.projection_dim
+        self.unet_resolution = unet.config.sample_size
+
+        self.text_intermediate_dim = self.text_encoder_hidden_size
+        if self.text_decoder.prefix_hidden_dim is not None:
+            self.text_intermediate_dim = self.text_decoder.prefix_hidden_dim
+
+        self.mode = None
+
+        # TODO: handle safety checking?
+        self.safety_checker = None
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def _infer_mode(self, prompt, prompt_embeds, image, latents, prompt_latents, vae_latents, clip_latents):
+        r"""
+        Infer the generation task ('mode') from the inputs to `__call__`. If the mode has been manually set, the set
+        mode will be used.
+        """
+        prompt_available = (prompt is not None) or (prompt_embeds is not None)
+        image_available = image is not None
+        input_available = prompt_available or image_available
+
+        prompt_latents_available = prompt_latents is not None
+        vae_latents_available = vae_latents is not None
+        clip_latents_available = clip_latents is not None
+        full_latents_available = latents is not None
+        image_latents_available = vae_latents_available and clip_latents_available
+        all_indv_latents_available = prompt_latents_available and image_latents_available
+
+        if self.mode is not None:
+            # Preferentially use the mode set by the user
+            mode = self.mode
+        elif prompt_available:
+            mode = "text2img"
+        elif image_available:
+            mode = "img2text"
+        else:
+            # Neither prompt nor image supplied, infer based on availability of latents
+            if full_latents_available or all_indv_latents_available:
+                mode = "joint"
+            elif prompt_latents_available:
+                mode = "text"
+            elif image_latents_available:
+                mode = "img"
+            else:
+                # No inputs or latents available
+                mode = "joint"
+
+        # Give warnings for ambiguous cases
+        if self.mode is None and prompt_available and image_available:
+            logger.warning(
+                f"You have supplied both a text prompt and image to the pipeline and mode has not been set manually,"
+                f" defaulting to mode '{mode}'."
+            )
+
+        if self.mode is None and not input_available:
+            if vae_latents_available != clip_latents_available:
+                # Exactly one of vae_latents and clip_latents is supplied
+                logger.warning(
+                    f"You have supplied exactly one of `vae_latents` and `clip_latents`, whereas either both or none"
+                    f" are expected to be supplied. Defaulting to mode '{mode}'."
+                )
+            elif not prompt_latents_available and not vae_latents_available and not clip_latents_available:
+                # No inputs or latents supplied
+                logger.warning(
+                    f"No inputs or latents have been supplied, and mode has not been manually set,"
+                    f" defaulting to mode '{mode}'."
+                )
+
+        return mode
+
+    # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_tiling
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_tiling
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    # Functions to manually set the mode
+    def set_text_mode(self):
+        r"""Manually set the generation mode to unconditional ("marginal") text generation."""
+        self.mode = "text"
+
+    def set_image_mode(self):
+        r"""Manually set the generation mode to unconditional ("marginal") image generation."""
+        self.mode = "img"
+
+    def set_text_to_image_mode(self):
+        r"""Manually set the generation mode to text-conditioned image generation."""
+        self.mode = "text2img"
+
+    def set_image_to_text_mode(self):
+        r"""Manually set the generation mode to image-conditioned text generation."""
+        self.mode = "img2text"
+
+    def set_joint_mode(self):
+        r"""Manually set the generation mode to unconditional joint image-text generation."""
+        self.mode = "joint"
+
+    def reset_mode(self):
+        r"""Removes a manually set mode; after calling this, the pipeline will infer the mode from inputs."""
+        self.mode = None
+
+    def _infer_batch_size(
+        self,
+        mode,
+        prompt,
+        prompt_embeds,
+        image,
+        num_images_per_prompt,
+        num_prompts_per_image,
+        latents,
+        prompt_latents,
+        vae_latents,
+        clip_latents,
+    ):
+        r"""Infers the batch size and multiplier depending on mode and supplied arguments to `__call__`."""
+        if num_images_per_prompt is None:
+            num_images_per_prompt = 1
+        if num_prompts_per_image is None:
+            num_prompts_per_image = 1
+
+        assert num_images_per_prompt > 0, "num_images_per_prompt must be a positive integer"
+        assert num_prompts_per_image > 0, "num_prompts_per_image must be a positive integer"
+
+        if mode in ["text2img"]:
+            if prompt is not None and isinstance(prompt, str):
+                batch_size = 1
+            elif prompt is not None and isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                # Either prompt or prompt_embeds must be present for text2img.
+                batch_size = prompt_embeds.shape[0]
+            multiplier = num_images_per_prompt
+        elif mode in ["img2text"]:
+            if isinstance(image, PIL.Image.Image):
+                batch_size = 1
+            else:
+                # Image must be available and type either PIL.Image.Image or torch.Tensor.
+                # Not currently supporting something like image_embeds.
+                batch_size = image.shape[0]
+            multiplier = num_prompts_per_image
+        elif mode in ["img"]:
+            if vae_latents is not None:
+                batch_size = vae_latents.shape[0]
+            elif clip_latents is not None:
+                batch_size = clip_latents.shape[0]
+            else:
+                batch_size = 1
+            multiplier = num_images_per_prompt
+        elif mode in ["text"]:
+            if prompt_latents is not None:
+                batch_size = prompt_latents.shape[0]
+            else:
+                batch_size = 1
+            multiplier = num_prompts_per_image
+        elif mode in ["joint"]:
+            if latents is not None:
+                batch_size = latents.shape[0]
+            elif prompt_latents is not None:
+                batch_size = prompt_latents.shape[0]
+            elif vae_latents is not None:
+                batch_size = vae_latents.shape[0]
+            elif clip_latents is not None:
+                batch_size = clip_latents.shape[0]
+            else:
+                batch_size = 1
+
+            if num_images_per_prompt == num_prompts_per_image:
+                multiplier = num_images_per_prompt
+            else:
+                multiplier = min(num_images_per_prompt, num_prompts_per_image)
+                logger.warning(
+                    f"You are using mode `{mode}` and `num_images_per_prompt`: {num_images_per_prompt} and"
+                    f" num_prompts_per_image: {num_prompts_per_image} are not equal. Using batch size equal to"
+                    f" `min(num_images_per_prompt, num_prompts_per_image) = {batch_size}."
+                )
+        return batch_size, multiplier
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with self.tokenizer->self.clip_tokenizer
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.clip_tokenizer)
+
+            text_inputs = self.clip_tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.clip_tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.clip_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.clip_tokenizer.batch_decode(
+                    untruncated_ids[:, self.clip_tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.clip_tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.clip_tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.clip_tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents
+    # Add num_prompts_per_image argument, sample from autoencoder moment distribution
+    def encode_image_vae_latents(
+        self,
+        image,
+        batch_size,
+        num_prompts_per_image,
+        dtype,
+        device,
+        do_classifier_free_guidance,
+        generator=None,
+    ):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        image = image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_prompts_per_image
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if isinstance(generator, list):
+            image_latents = [
+                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i])
+                * self.vae.config.scaling_factor
+                for i in range(batch_size)
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = self.vae.encode(image).latent_dist.sample(generator=generator)
+            # Scale image_latents by the VAE's scaling factor
+            image_latents = image_latents * self.vae.config.scaling_factor
+
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand image_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
+
+        if do_classifier_free_guidance:
+            uncond_image_latents = torch.zeros_like(image_latents)
+            image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0)
+
+        return image_latents
+
+    def encode_image_clip_latents(
+        self,
+        image,
+        batch_size,
+        num_prompts_per_image,
+        dtype,
+        device,
+        generator=None,
+    ):
+        # Map image to CLIP embedding.
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+
+        preprocessed_image = self.clip_image_processor.preprocess(
+            image,
+            return_tensors="pt",
+        )
+        preprocessed_image = preprocessed_image.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_prompts_per_image
+        if isinstance(generator, list):
+            image_latents = [
+                self.image_encoder(**preprocessed_image[i : i + 1]).image_embeds for i in range(batch_size)
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = self.image_encoder(**preprocessed_image).image_embeds
+
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand image_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
+                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many initial images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        return image_latents
+
+    def prepare_text_latents(
+        self, batch_size, num_images_per_prompt, seq_len, hidden_size, dtype, device, generator, latents=None
+    ):
+        # Prepare latents for the CLIP embedded prompt.
+        shape = (batch_size * num_images_per_prompt, seq_len, hidden_size)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            # latents is assumed to have shace (B, L, D)
+            latents = latents.repeat(num_images_per_prompt, 1, 1)
+            latents = latents.to(device=device, dtype=dtype)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    # Rename prepare_latents -> prepare_image_vae_latents and add num_prompts_per_image argument.
+    def prepare_image_vae_latents(
+        self,
+        batch_size,
+        num_prompts_per_image,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        shape = (
+            batch_size * num_prompts_per_image,
+            num_channels_latents,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            # latents is assumed to have shape (B, C, H, W)
+            latents = latents.repeat(num_prompts_per_image, 1, 1, 1)
+            latents = latents.to(device=device, dtype=dtype)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_image_clip_latents(
+        self, batch_size, num_prompts_per_image, clip_img_dim, dtype, device, generator, latents=None
+    ):
+        # Prepare latents for the CLIP embedded image.
+        shape = (batch_size * num_prompts_per_image, 1, clip_img_dim)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            # latents is assumed to have shape (B, L, D)
+            latents = latents.repeat(num_prompts_per_image, 1, 1)
+            latents = latents.to(device=device, dtype=dtype)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def decode_text_latents(self, text_latents, device):
+        output_token_list, seq_lengths = self.text_decoder.generate_captions(
+            text_latents, self.text_tokenizer.eos_token_id, device=device
+        )
+        output_list = output_token_list.cpu().numpy()
+        generated_text = [
+            self.text_tokenizer.decode(output[: int(length)], skip_special_tokens=True)
+            for output, length in zip(output_list, seq_lengths)
+        ]
+        return generated_text
+
+    def _split(self, x, height, width):
+        r"""
+        Splits a flattened embedding x of shape (B, C * H * W + clip_img_dim) into two tensors of shape (B, C, H, W)
+        and (B, 1, clip_img_dim)
+        """
+        batch_size = x.shape[0]
+        latent_height = height // self.vae_scale_factor
+        latent_width = width // self.vae_scale_factor
+        img_vae_dim = self.num_channels_latents * latent_height * latent_width
+
+        img_vae, img_clip = x.split([img_vae_dim, self.image_encoder_projection_dim], dim=1)
+
+        img_vae = torch.reshape(img_vae, (batch_size, self.num_channels_latents, latent_height, latent_width))
+        img_clip = torch.reshape(img_clip, (batch_size, 1, self.image_encoder_projection_dim))
+        return img_vae, img_clip
+
+    def _combine(self, img_vae, img_clip):
+        r"""
+        Combines a latent iamge img_vae of shape (B, C, H, W) and a CLIP-embedded image img_clip of shape (B, 1,
+        clip_img_dim) into a single tensor of shape (B, C * H * W + clip_img_dim).
+        """
+        img_vae = torch.reshape(img_vae, (img_vae.shape[0], -1))
+        img_clip = torch.reshape(img_clip, (img_clip.shape[0], -1))
+        return torch.concat([img_vae, img_clip], dim=-1)
+
+    def _split_joint(self, x, height, width):
+        r"""
+        Splits a flattened embedding x of shape (B, C * H * W + clip_img_dim + text_seq_len * text_dim] into (img_vae,
+        img_clip, text) where img_vae is of shape (B, C, H, W), img_clip is of shape (B, 1, clip_img_dim), and text is
+        of shape (B, text_seq_len, text_dim).
+        """
+        batch_size = x.shape[0]
+        latent_height = height // self.vae_scale_factor
+        latent_width = width // self.vae_scale_factor
+        img_vae_dim = self.num_channels_latents * latent_height * latent_width
+        text_dim = self.text_encoder_seq_len * self.text_intermediate_dim
+
+        img_vae, img_clip, text = x.split([img_vae_dim, self.image_encoder_projection_dim, text_dim], dim=1)
+
+        img_vae = torch.reshape(img_vae, (batch_size, self.num_channels_latents, latent_height, latent_width))
+        img_clip = torch.reshape(img_clip, (batch_size, 1, self.image_encoder_projection_dim))
+        text = torch.reshape(text, (batch_size, self.text_encoder_seq_len, self.text_intermediate_dim))
+        return img_vae, img_clip, text
+
+    def _combine_joint(self, img_vae, img_clip, text):
+        r"""
+        Combines a latent image img_vae of shape (B, C, H, W), a CLIP-embedded image img_clip of shape (B, L_img,
+        clip_img_dim), and a text embedding text of shape (B, L_text, text_dim) into a single embedding x of shape (B,
+        C * H * W + L_img * clip_img_dim + L_text * text_dim).
+        """
+        img_vae = torch.reshape(img_vae, (img_vae.shape[0], -1))
+        img_clip = torch.reshape(img_clip, (img_clip.shape[0], -1))
+        text = torch.reshape(text, (text.shape[0], -1))
+        return torch.concat([img_vae, img_clip, text], dim=-1)
+
+    def _get_noise_pred(
+        self,
+        mode,
+        latents,
+        t,
+        prompt_embeds,
+        img_vae,
+        img_clip,
+        max_timestep,
+        data_type,
+        guidance_scale,
+        generator,
+        device,
+        height,
+        width,
+    ):
+        r"""
+        Gets the noise prediction using the `unet` and performs classifier-free guidance, if necessary.
+        """
+        if mode == "joint":
+            # Joint text-image generation
+            img_vae_latents, img_clip_latents, text_latents = self._split_joint(latents, height, width)
+
+            img_vae_out, img_clip_out, text_out = self.unet(
+                img_vae_latents, img_clip_latents, text_latents, timestep_img=t, timestep_text=t, data_type=data_type
+            )
+
+            x_out = self._combine_joint(img_vae_out, img_clip_out, text_out)
+
+            if guidance_scale <= 1.0:
+                return x_out
+
+            # Classifier-free guidance
+            img_vae_T = randn_tensor(img_vae.shape, generator=generator, device=device, dtype=img_vae.dtype)
+            img_clip_T = randn_tensor(img_clip.shape, generator=generator, device=device, dtype=img_clip.dtype)
+            text_T = randn_tensor(prompt_embeds.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
+
+            _, _, text_out_uncond = self.unet(
+                img_vae_T, img_clip_T, text_latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type
+            )
+
+            img_vae_out_uncond, img_clip_out_uncond, _ = self.unet(
+                img_vae_latents,
+                img_clip_latents,
+                text_T,
+                timestep_img=t,
+                timestep_text=max_timestep,
+                data_type=data_type,
+            )
+
+            x_out_uncond = self._combine_joint(img_vae_out_uncond, img_clip_out_uncond, text_out_uncond)
+
+            return guidance_scale * x_out + (1.0 - guidance_scale) * x_out_uncond
+        elif mode == "text2img":
+            # Text-conditioned image generation
+            img_vae_latents, img_clip_latents = self._split(latents, height, width)
+
+            img_vae_out, img_clip_out, text_out = self.unet(
+                img_vae_latents, img_clip_latents, prompt_embeds, timestep_img=t, timestep_text=0, data_type=data_type
+            )
+
+            img_out = self._combine(img_vae_out, img_clip_out)
+
+            if guidance_scale <= 1.0:
+                return img_out
+
+            # Classifier-free guidance
+            text_T = randn_tensor(prompt_embeds.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
+
+            img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet(
+                img_vae_latents,
+                img_clip_latents,
+                text_T,
+                timestep_img=t,
+                timestep_text=max_timestep,
+                data_type=data_type,
+            )
+
+            img_out_uncond = self._combine(img_vae_out_uncond, img_clip_out_uncond)
+
+            return guidance_scale * img_out + (1.0 - guidance_scale) * img_out_uncond
+        elif mode == "img2text":
+            # Image-conditioned text generation
+            img_vae_out, img_clip_out, text_out = self.unet(
+                img_vae, img_clip, latents, timestep_img=0, timestep_text=t, data_type=data_type
+            )
+
+            if guidance_scale <= 1.0:
+                return text_out
+
+            # Classifier-free guidance
+            img_vae_T = randn_tensor(img_vae.shape, generator=generator, device=device, dtype=img_vae.dtype)
+            img_clip_T = randn_tensor(img_clip.shape, generator=generator, device=device, dtype=img_clip.dtype)
+
+            img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet(
+                img_vae_T, img_clip_T, latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type
+            )
+
+            return guidance_scale * text_out + (1.0 - guidance_scale) * text_out_uncond
+        elif mode == "text":
+            # Unconditional ("marginal") text generation (no CFG)
+            img_vae_out, img_clip_out, text_out = self.unet(
+                img_vae, img_clip, latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type
+            )
+
+            return text_out
+        elif mode == "img":
+            # Unconditional ("marginal") image generation (no CFG)
+            img_vae_latents, img_clip_latents = self._split(latents, height, width)
+
+            img_vae_out, img_clip_out, text_out = self.unet(
+                img_vae_latents,
+                img_clip_latents,
+                prompt_embeds,
+                timestep_img=t,
+                timestep_text=max_timestep,
+                data_type=data_type,
+            )
+
+            img_out = self._combine(img_vae_out, img_clip_out)
+            return img_out
+
+    def check_latents_shape(self, latents_name, latents, expected_shape):
+        latents_shape = latents.shape
+        expected_num_dims = len(expected_shape) + 1  # expected dimensions plus the batch dimension
+        expected_shape_str = ", ".join(str(dim) for dim in expected_shape)
+        if len(latents_shape) != expected_num_dims:
+            raise ValueError(
+                f"`{latents_name}` should have shape (batch_size, {expected_shape_str}), but the current shape"
+                f" {latents_shape} has {len(latents_shape)} dimensions."
+            )
+        for i in range(1, expected_num_dims):
+            if latents_shape[i] != expected_shape[i - 1]:
+                raise ValueError(
+                    f"`{latents_name}` should have shape (batch_size, {expected_shape_str}), but the current shape"
+                    f" {latents_shape} has {latents_shape[i]} != {expected_shape[i - 1]} at dimension {i}."
+                )
+
+    def check_inputs(
+        self,
+        mode,
+        prompt,
+        image,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        latents=None,
+        prompt_latents=None,
+        vae_latents=None,
+        clip_latents=None,
+    ):
+        # Check inputs before running the generative process.
+        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+            )
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if mode == "text2img":
+            if prompt is not None and prompt_embeds is not None:
+                raise ValueError(
+                    f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                    " only forward one of the two."
+                )
+            elif prompt is None and prompt_embeds is None:
+                raise ValueError(
+                    "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+                )
+            elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+                raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+            if negative_prompt is not None and negative_prompt_embeds is not None:
+                raise ValueError(
+                    f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                    f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+                )
+
+            if prompt_embeds is not None and negative_prompt_embeds is not None:
+                if prompt_embeds.shape != negative_prompt_embeds.shape:
+                    raise ValueError(
+                        "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                        f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                        f" {negative_prompt_embeds.shape}."
+                    )
+
+        if mode == "img2text":
+            if image is None:
+                raise ValueError("`img2text` mode requires an image to be provided.")
+
+        # Check provided latents
+        latent_height = height // self.vae_scale_factor
+        latent_width = width // self.vae_scale_factor
+        full_latents_available = latents is not None
+        prompt_latents_available = prompt_latents is not None
+        vae_latents_available = vae_latents is not None
+        clip_latents_available = clip_latents is not None
+
+        if full_latents_available:
+            individual_latents_available = (
+                prompt_latents is not None or vae_latents is not None or clip_latents is not None
+            )
+            if individual_latents_available:
+                logger.warning(
+                    "You have supplied both `latents` and at least one of `prompt_latents`, `vae_latents`, and"
+                    " `clip_latents`. The value of `latents` will override the value of any individually supplied latents."
+                )
+            # Check shape of full latents
+            img_vae_dim = self.num_channels_latents * latent_height * latent_width
+            text_dim = self.text_encoder_seq_len * self.text_encoder_hidden_size
+            latents_dim = img_vae_dim + self.image_encoder_projection_dim + text_dim
+            latents_expected_shape = (latents_dim,)
+            self.check_latents_shape("latents", latents, latents_expected_shape)
+
+        # Check individual latent shapes, if present
+        if prompt_latents_available:
+            prompt_latents_expected_shape = (self.text_encoder_seq_len, self.text_encoder_hidden_size)
+            self.check_latents_shape("prompt_latents", prompt_latents, prompt_latents_expected_shape)
+
+        if vae_latents_available:
+            vae_latents_expected_shape = (self.num_channels_latents, latent_height, latent_width)
+            self.check_latents_shape("vae_latents", vae_latents, vae_latents_expected_shape)
+
+        if clip_latents_available:
+            clip_latents_expected_shape = (1, self.image_encoder_projection_dim)
+            self.check_latents_shape("clip_latents", clip_latents, clip_latents_expected_shape)
+
+        if mode in ["text2img", "img"] and vae_latents_available and clip_latents_available:
+            if vae_latents.shape[0] != clip_latents.shape[0]:
+                raise ValueError(
+                    f"Both `vae_latents` and `clip_latents` are supplied, but their batch dimensions are not equal:"
+                    f" {vae_latents.shape[0]} != {clip_latents.shape[0]}."
+                )
+
+        if mode == "joint" and prompt_latents_available and vae_latents_available and clip_latents_available:
+            if prompt_latents.shape[0] != vae_latents.shape[0] or prompt_latents.shape[0] != clip_latents.shape[0]:
+                raise ValueError(
+                    f"All of `prompt_latents`, `vae_latents`, and `clip_latents` are supplied, but their batch"
+                    f" dimensions are not equal: {prompt_latents.shape[0]} != {vae_latents.shape[0]}"
+                    f" != {clip_latents.shape[0]}."
+                )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        image: Optional[Union[torch.Tensor, PIL.Image.Image]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        data_type: Optional[int] = 1,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 8.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        num_prompts_per_image: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_latents: Optional[torch.Tensor] = None,
+        vae_latents: Optional[torch.Tensor] = None,
+        clip_latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+                Required for text-conditioned image generation (`text2img`) mode.
+            image (`torch.Tensor` or `PIL.Image.Image`, *optional*):
+                `Image` or tensor representing an image batch. Required for image-conditioned text generation
+                (`img2text`) mode.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            data_type (`int`, *optional*, defaults to 1):
+                The data type (either 0 or 1). Only used if you are loading a checkpoint which supports a data type
+                embedding; this is added for compatibility with the
+                [UniDiffuser-v1](https://huggingface.co/thu-ml/unidiffuser-v1) checkpoint.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 8.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). Used in
+                text-conditioned image generation (`text2img`) mode.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt. Used in `text2img` (text-conditioned image generation) and
+                `img` mode. If the mode is joint and both `num_images_per_prompt` and `num_prompts_per_image` are
+                supplied, `min(num_images_per_prompt, num_prompts_per_image)` samples are generated.
+            num_prompts_per_image (`int`, *optional*, defaults to 1):
+                The number of prompts to generate per image. Used in `img2text` (image-conditioned text generation) and
+                `text` mode. If the mode is joint and both `num_images_per_prompt` and `num_prompts_per_image` are
+                supplied, `min(num_images_per_prompt, num_prompts_per_image)` samples are generated.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for joint
+                image-text generation. Can be used to tweak the same generation with different prompts. If not
+                provided, a latents tensor is generated by sampling using the supplied random `generator`. This assumes
+                a full set of VAE, CLIP, and text latents, if supplied, overrides the value of `prompt_latents`,
+                `vae_latents`, and `clip_latents`.
+            prompt_latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for text
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            vae_latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            clip_latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument. Used in text-conditioned
+                image generation (`text2img`) mode.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are be generated from the `negative_prompt` input argument. Used
+                in text-conditioned image generation (`text2img`) mode.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImageTextPipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Returns:
+            [`~pipelines.unidiffuser.ImageTextPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.unidiffuser.ImageTextPipelineOutput`] is returned, otherwise a
+                `tuple` is returned where the first element is a list with the generated images and the second element
+                is a list of generated texts.
+        """
+
+        # 0. Default height and width to unet
+        height = height or self.unet_resolution * self.vae_scale_factor
+        width = width or self.unet_resolution * self.vae_scale_factor
+
+        # 1. Check inputs
+        # Recalculate mode for each call to the pipeline.
+        mode = self._infer_mode(prompt, prompt_embeds, image, latents, prompt_latents, vae_latents, clip_latents)
+        self.check_inputs(
+            mode,
+            prompt,
+            image,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            latents,
+            prompt_latents,
+            vae_latents,
+            clip_latents,
+        )
+
+        # 2. Define call parameters
+        batch_size, multiplier = self._infer_batch_size(
+            mode,
+            prompt,
+            prompt_embeds,
+            image,
+            num_images_per_prompt,
+            num_prompts_per_image,
+            latents,
+            prompt_latents,
+            vae_latents,
+            clip_latents,
+        )
+        device = self._execution_device
+        reduce_text_emb_dim = self.text_intermediate_dim < self.text_encoder_hidden_size or self.mode != "text2img"
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        # Note that this differs from the formulation in the unidiffusers paper!
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # check if scheduler is in sigmas space
+        # scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas")
+
+        # 3. Encode input prompt, if available; otherwise prepare text latents
+        if latents is not None:
+            # Overwrite individual latents
+            vae_latents, clip_latents, prompt_latents = self._split_joint(latents, height, width)
+
+        if mode in ["text2img"]:
+            # 3.1. Encode input prompt, if available
+            assert prompt is not None or prompt_embeds is not None
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=multiplier,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+                negative_prompt=negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+            )
+
+            # if do_classifier_free_guidance:
+            #     prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+        else:
+            # 3.2. Prepare text latent variables, if input not available
+            prompt_embeds = self.prepare_text_latents(
+                batch_size=batch_size,
+                num_images_per_prompt=multiplier,
+                seq_len=self.text_encoder_seq_len,
+                hidden_size=self.text_encoder_hidden_size,
+                dtype=self.text_encoder.dtype,  # Should work with both full precision and mixed precision
+                device=device,
+                generator=generator,
+                latents=prompt_latents,
+            )
+
+        if reduce_text_emb_dim:
+            prompt_embeds = self.text_decoder.encode(prompt_embeds)
+
+        # 4. Encode image, if available; otherwise prepare image latents
+        if mode in ["img2text"]:
+            # 4.1. Encode images, if available
+            assert image is not None, "`img2text` requires a conditioning image"
+            # Encode image using VAE
+            image_vae = self.image_processor.preprocess(image)
+            height, width = image_vae.shape[-2:]
+            image_vae_latents = self.encode_image_vae_latents(
+                image=image_vae,
+                batch_size=batch_size,
+                num_prompts_per_image=multiplier,
+                dtype=prompt_embeds.dtype,
+                device=device,
+                do_classifier_free_guidance=False,  # Copied from InstructPix2Pix, don't use their version of CFG
+                generator=generator,
+            )
+
+            # Encode image using CLIP
+            image_clip_latents = self.encode_image_clip_latents(
+                image=image,
+                batch_size=batch_size,
+                num_prompts_per_image=multiplier,
+                dtype=prompt_embeds.dtype,
+                device=device,
+                generator=generator,
+            )
+            # (batch_size, clip_hidden_size) => (batch_size, 1, clip_hidden_size)
+            image_clip_latents = image_clip_latents.unsqueeze(1)
+        else:
+            # 4.2. Prepare image latent variables, if input not available
+            # Prepare image VAE latents in latent space
+            image_vae_latents = self.prepare_image_vae_latents(
+                batch_size=batch_size,
+                num_prompts_per_image=multiplier,
+                num_channels_latents=self.num_channels_latents,
+                height=height,
+                width=width,
+                dtype=prompt_embeds.dtype,
+                device=device,
+                generator=generator,
+                latents=vae_latents,
+            )
+
+            # Prepare image CLIP latents
+            image_clip_latents = self.prepare_image_clip_latents(
+                batch_size=batch_size,
+                num_prompts_per_image=multiplier,
+                clip_img_dim=self.image_encoder_projection_dim,
+                dtype=prompt_embeds.dtype,
+                device=device,
+                generator=generator,
+                latents=clip_latents,
+            )
+
+        # 5. Set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # max_timestep = timesteps[0]
+        max_timestep = self.scheduler.config.num_train_timesteps
+
+        # 6. Prepare latent variables
+        if mode == "joint":
+            latents = self._combine_joint(image_vae_latents, image_clip_latents, prompt_embeds)
+        elif mode in ["text2img", "img"]:
+            latents = self._combine(image_vae_latents, image_clip_latents)
+        elif mode in ["img2text", "text"]:
+            latents = prompt_embeds
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        logger.debug(f"Scheduler extra step kwargs: {extra_step_kwargs}")
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # predict the noise residual
+                # Also applies classifier-free guidance as described in the UniDiffuser paper
+                noise_pred = self._get_noise_pred(
+                    mode,
+                    latents,
+                    t,
+                    prompt_embeds,
+                    image_vae_latents,
+                    image_clip_latents,
+                    max_timestep,
+                    data_type,
+                    guidance_scale,
+                    generator,
+                    device,
+                    height,
+                    width,
+                )
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        # 9. Post-processing
+        image = None
+        text = None
+        if mode == "joint":
+            image_vae_latents, image_clip_latents, text_latents = self._split_joint(latents, height, width)
+
+            if not output_type == "latent":
+                # Map latent VAE image back to pixel space
+                image = self.vae.decode(image_vae_latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            else:
+                image = image_vae_latents
+
+            text = self.decode_text_latents(text_latents, device)
+        elif mode in ["text2img", "img"]:
+            image_vae_latents, image_clip_latents = self._split(latents, height, width)
+
+            if not output_type == "latent":
+                # Map latent VAE image back to pixel space
+                image = self.vae.decode(image_vae_latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            else:
+                image = image_vae_latents
+        elif mode in ["img2text", "text"]:
+            text_latents = latents
+            text = self.decode_text_latents(text_latents, device)
+
+        self.maybe_free_model_hooks()
+
+        # 10. Postprocess the image, if necessary
+        if image is not None:
+            do_denormalize = [True] * image.shape[0]
+            image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, text)
+
+        return ImageTextPipelineOutput(images=image, text=text)
diff --git a/diffusers/src/diffusers/pipelines/wuerstchen/__init__.py b/diffusers/src/diffusers/pipelines/wuerstchen/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddb852d1931558fe0948e81e16cf9a92fc2a114b
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/wuerstchen/__init__.py
@@ -0,0 +1,56 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["modeling_paella_vq_model"] = ["PaellaVQModel"]
+    _import_structure["modeling_wuerstchen_diffnext"] = ["WuerstchenDiffNeXt"]
+    _import_structure["modeling_wuerstchen_prior"] = ["WuerstchenPrior"]
+    _import_structure["pipeline_wuerstchen"] = ["WuerstchenDecoderPipeline"]
+    _import_structure["pipeline_wuerstchen_combined"] = ["WuerstchenCombinedPipeline"]
+    _import_structure["pipeline_wuerstchen_prior"] = ["DEFAULT_STAGE_C_TIMESTEPS", "WuerstchenPriorPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .modeling_paella_vq_model import PaellaVQModel
+        from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt
+        from .modeling_wuerstchen_prior import WuerstchenPrior
+        from .pipeline_wuerstchen import WuerstchenDecoderPipeline
+        from .pipeline_wuerstchen_combined import WuerstchenCombinedPipeline
+        from .pipeline_wuerstchen_prior import DEFAULT_STAGE_C_TIMESTEPS, WuerstchenPriorPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py b/diffusers/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2cf8cbc978c402e375d9fa038ede03a5cda4cf7
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2022 Dominic Rampas MIT License
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models.autoencoders.vae import DecoderOutput, VectorQuantizer
+from ...models.modeling_utils import ModelMixin
+from ...models.vq_model import VQEncoderOutput
+from ...utils.accelerate_utils import apply_forward_hook
+
+
+class MixingResidualBlock(nn.Module):
+    """
+    Residual block with mixing used by Paella's VQ-VAE.
+    """
+
+    def __init__(self, inp_channels, embed_dim):
+        super().__init__()
+        # depthwise
+        self.norm1 = nn.LayerNorm(inp_channels, elementwise_affine=False, eps=1e-6)
+        self.depthwise = nn.Sequential(
+            nn.ReplicationPad2d(1), nn.Conv2d(inp_channels, inp_channels, kernel_size=3, groups=inp_channels)
+        )
+
+        # channelwise
+        self.norm2 = nn.LayerNorm(inp_channels, elementwise_affine=False, eps=1e-6)
+        self.channelwise = nn.Sequential(
+            nn.Linear(inp_channels, embed_dim), nn.GELU(), nn.Linear(embed_dim, inp_channels)
+        )
+
+        self.gammas = nn.Parameter(torch.zeros(6), requires_grad=True)
+
+    def forward(self, x):
+        mods = self.gammas
+        x_temp = self.norm1(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) * (1 + mods[0]) + mods[1]
+        x = x + self.depthwise(x_temp) * mods[2]
+        x_temp = self.norm2(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) * (1 + mods[3]) + mods[4]
+        x = x + self.channelwise(x_temp.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) * mods[5]
+        return x
+
+
+class PaellaVQModel(ModelMixin, ConfigMixin):
+    r"""VQ-VAE model from Paella model.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the model (such as downloading or saving, etc.)
+
+    Parameters:
+        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
+        up_down_scale_factor (int, *optional*, defaults to 2): Up and Downscale factor of the input image.
+        levels  (int, *optional*, defaults to 2): Number of levels in the model.
+        bottleneck_blocks (int, *optional*, defaults to 12): Number of bottleneck blocks in the model.
+        embed_dim (int, *optional*, defaults to 384): Number of hidden channels in the model.
+        latent_channels (int, *optional*, defaults to 4): Number of latent channels in the VQ-VAE model.
+        num_vq_embeddings (int, *optional*, defaults to 8192): Number of codebook vectors in the VQ-VAE.
+        scale_factor (float, *optional*, defaults to 0.3764): Scaling factor of the latent space.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        up_down_scale_factor: int = 2,
+        levels: int = 2,
+        bottleneck_blocks: int = 12,
+        embed_dim: int = 384,
+        latent_channels: int = 4,
+        num_vq_embeddings: int = 8192,
+        scale_factor: float = 0.3764,
+    ):
+        super().__init__()
+
+        c_levels = [embed_dim // (2**i) for i in reversed(range(levels))]
+        # Encoder blocks
+        self.in_block = nn.Sequential(
+            nn.PixelUnshuffle(up_down_scale_factor),
+            nn.Conv2d(in_channels * up_down_scale_factor**2, c_levels[0], kernel_size=1),
+        )
+        down_blocks = []
+        for i in range(levels):
+            if i > 0:
+                down_blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
+            block = MixingResidualBlock(c_levels[i], c_levels[i] * 4)
+            down_blocks.append(block)
+        down_blocks.append(
+            nn.Sequential(
+                nn.Conv2d(c_levels[-1], latent_channels, kernel_size=1, bias=False),
+                nn.BatchNorm2d(latent_channels),  # then normalize them to have mean 0 and std 1
+            )
+        )
+        self.down_blocks = nn.Sequential(*down_blocks)
+
+        # Vector Quantizer
+        self.vquantizer = VectorQuantizer(num_vq_embeddings, vq_embed_dim=latent_channels, legacy=False, beta=0.25)
+
+        # Decoder blocks
+        up_blocks = [nn.Sequential(nn.Conv2d(latent_channels, c_levels[-1], kernel_size=1))]
+        for i in range(levels):
+            for j in range(bottleneck_blocks if i == 0 else 1):
+                block = MixingResidualBlock(c_levels[levels - 1 - i], c_levels[levels - 1 - i] * 4)
+                up_blocks.append(block)
+            if i < levels - 1:
+                up_blocks.append(
+                    nn.ConvTranspose2d(
+                        c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2, padding=1
+                    )
+                )
+        self.up_blocks = nn.Sequential(*up_blocks)
+        self.out_block = nn.Sequential(
+            nn.Conv2d(c_levels[0], out_channels * up_down_scale_factor**2, kernel_size=1),
+            nn.PixelShuffle(up_down_scale_factor),
+        )
+
+    @apply_forward_hook
+    def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput:
+        h = self.in_block(x)
+        h = self.down_blocks(h)
+
+        if not return_dict:
+            return (h,)
+
+        return VQEncoderOutput(latents=h)
+
+    @apply_forward_hook
+    def decode(
+        self, h: torch.Tensor, force_not_quantize: bool = True, return_dict: bool = True
+    ) -> Union[DecoderOutput, torch.Tensor]:
+        if not force_not_quantize:
+            quant, _, _ = self.vquantizer(h)
+        else:
+            quant = h
+
+        x = self.up_blocks(quant)
+        dec = self.out_block(x)
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    def forward(self, sample: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Args:
+            sample (`torch.Tensor`): Input sample.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+        """
+        x = sample
+        h = self.encode(x).latents
+        dec = self.decode(h).sample
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
diff --git a/diffusers/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py b/diffusers/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..73e71b3076fbca259ae76138bc4ab3d3797e2755
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py
@@ -0,0 +1,76 @@
+import torch
+import torch.nn as nn
+
+from ...models.attention_processor import Attention
+
+
+class WuerstchenLayerNorm(nn.LayerNorm):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def forward(self, x):
+        x = x.permute(0, 2, 3, 1)
+        x = super().forward(x)
+        return x.permute(0, 3, 1, 2)
+
+
+class TimestepBlock(nn.Module):
+    def __init__(self, c, c_timestep):
+        super().__init__()
+
+        self.mapper = nn.Linear(c_timestep, c * 2)
+
+    def forward(self, x, t):
+        a, b = self.mapper(t)[:, :, None, None].chunk(2, dim=1)
+        return x * (1 + a) + b
+
+
+class ResBlock(nn.Module):
+    def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0):
+        super().__init__()
+
+        self.depthwise = nn.Conv2d(c + c_skip, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c)
+        self.norm = WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6)
+        self.channelwise = nn.Sequential(
+            nn.Linear(c, c * 4), nn.GELU(), GlobalResponseNorm(c * 4), nn.Dropout(dropout), nn.Linear(c * 4, c)
+        )
+
+    def forward(self, x, x_skip=None):
+        x_res = x
+        if x_skip is not None:
+            x = torch.cat([x, x_skip], dim=1)
+        x = self.norm(self.depthwise(x)).permute(0, 2, 3, 1)
+        x = self.channelwise(x).permute(0, 3, 1, 2)
+        return x + x_res
+
+
+# from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105
+class GlobalResponseNorm(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
+        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
+
+    def forward(self, x):
+        agg_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
+        stand_div_norm = agg_norm / (agg_norm.mean(dim=-1, keepdim=True) + 1e-6)
+        return self.gamma * (x * stand_div_norm) + self.beta + x
+
+
+class AttnBlock(nn.Module):
+    def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0):
+        super().__init__()
+
+        self.self_attn = self_attn
+        self.norm = WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6)
+        self.attention = Attention(query_dim=c, heads=nhead, dim_head=c // nhead, dropout=dropout, bias=True)
+        self.kv_mapper = nn.Sequential(nn.SiLU(), nn.Linear(c_cond, c))
+
+    def forward(self, x, kv):
+        kv = self.kv_mapper(kv)
+        norm_x = self.norm(x)
+        if self.self_attn:
+            batch_size, channel, _, _ = x.shape
+            kv = torch.cat([norm_x.view(batch_size, channel, -1).transpose(1, 2), kv], dim=1)
+        x = x + self.attention(norm_x, encoder_hidden_states=kv)
+        return x
diff --git a/diffusers/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py b/diffusers/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c06cc0e7303f2ec72c6c2958b9f03eaba5f8c67
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py
@@ -0,0 +1,254 @@
+# Copyright (c) 2023 Dominic Rampas MIT License
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models.modeling_utils import ModelMixin
+from .modeling_wuerstchen_common import AttnBlock, GlobalResponseNorm, TimestepBlock, WuerstchenLayerNorm
+
+
+class WuerstchenDiffNeXt(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(
+        self,
+        c_in=4,
+        c_out=4,
+        c_r=64,
+        patch_size=2,
+        c_cond=1024,
+        c_hidden=[320, 640, 1280, 1280],
+        nhead=[-1, 10, 20, 20],
+        blocks=[4, 4, 14, 4],
+        level_config=["CT", "CTA", "CTA", "CTA"],
+        inject_effnet=[False, True, True, True],
+        effnet_embd=16,
+        clip_embd=1024,
+        kernel_size=3,
+        dropout=0.1,
+    ):
+        super().__init__()
+        self.c_r = c_r
+        self.c_cond = c_cond
+        if not isinstance(dropout, list):
+            dropout = [dropout] * len(c_hidden)
+
+        # CONDITIONING
+        self.clip_mapper = nn.Linear(clip_embd, c_cond)
+        self.effnet_mappers = nn.ModuleList(
+            [
+                nn.Conv2d(effnet_embd, c_cond, kernel_size=1) if inject else None
+                for inject in inject_effnet + list(reversed(inject_effnet))
+            ]
+        )
+        self.seq_norm = nn.LayerNorm(c_cond, elementwise_affine=False, eps=1e-6)
+
+        self.embedding = nn.Sequential(
+            nn.PixelUnshuffle(patch_size),
+            nn.Conv2d(c_in * (patch_size**2), c_hidden[0], kernel_size=1),
+            WuerstchenLayerNorm(c_hidden[0], elementwise_affine=False, eps=1e-6),
+        )
+
+        def get_block(block_type, c_hidden, nhead, c_skip=0, dropout=0):
+            if block_type == "C":
+                return ResBlockStageB(c_hidden, c_skip, kernel_size=kernel_size, dropout=dropout)
+            elif block_type == "A":
+                return AttnBlock(c_hidden, c_cond, nhead, self_attn=True, dropout=dropout)
+            elif block_type == "T":
+                return TimestepBlock(c_hidden, c_r)
+            else:
+                raise ValueError(f"Block type {block_type} not supported")
+
+        # BLOCKS
+        # -- down blocks
+        self.down_blocks = nn.ModuleList()
+        for i in range(len(c_hidden)):
+            down_block = nn.ModuleList()
+            if i > 0:
+                down_block.append(
+                    nn.Sequential(
+                        WuerstchenLayerNorm(c_hidden[i - 1], elementwise_affine=False, eps=1e-6),
+                        nn.Conv2d(c_hidden[i - 1], c_hidden[i], kernel_size=2, stride=2),
+                    )
+                )
+            for _ in range(blocks[i]):
+                for block_type in level_config[i]:
+                    c_skip = c_cond if inject_effnet[i] else 0
+                    down_block.append(get_block(block_type, c_hidden[i], nhead[i], c_skip=c_skip, dropout=dropout[i]))
+            self.down_blocks.append(down_block)
+
+        # -- up blocks
+        self.up_blocks = nn.ModuleList()
+        for i in reversed(range(len(c_hidden))):
+            up_block = nn.ModuleList()
+            for j in range(blocks[i]):
+                for k, block_type in enumerate(level_config[i]):
+                    c_skip = c_hidden[i] if i < len(c_hidden) - 1 and j == k == 0 else 0
+                    c_skip += c_cond if inject_effnet[i] else 0
+                    up_block.append(get_block(block_type, c_hidden[i], nhead[i], c_skip=c_skip, dropout=dropout[i]))
+            if i > 0:
+                up_block.append(
+                    nn.Sequential(
+                        WuerstchenLayerNorm(c_hidden[i], elementwise_affine=False, eps=1e-6),
+                        nn.ConvTranspose2d(c_hidden[i], c_hidden[i - 1], kernel_size=2, stride=2),
+                    )
+                )
+            self.up_blocks.append(up_block)
+
+        # OUTPUT
+        self.clf = nn.Sequential(
+            WuerstchenLayerNorm(c_hidden[0], elementwise_affine=False, eps=1e-6),
+            nn.Conv2d(c_hidden[0], 2 * c_out * (patch_size**2), kernel_size=1),
+            nn.PixelShuffle(patch_size),
+        )
+
+        # --- WEIGHT INIT ---
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        # General init
+        if isinstance(m, (nn.Conv2d, nn.Linear)):
+            nn.init.xavier_uniform_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+
+        for mapper in self.effnet_mappers:
+            if mapper is not None:
+                nn.init.normal_(mapper.weight, std=0.02)  # conditionings
+        nn.init.normal_(self.clip_mapper.weight, std=0.02)  # conditionings
+        nn.init.xavier_uniform_(self.embedding[1].weight, 0.02)  # inputs
+        nn.init.constant_(self.clf[1].weight, 0)  # outputs
+
+        # blocks
+        for level_block in self.down_blocks + self.up_blocks:
+            for block in level_block:
+                if isinstance(block, ResBlockStageB):
+                    block.channelwise[-1].weight.data *= np.sqrt(1 / sum(self.config.blocks))
+                elif isinstance(block, TimestepBlock):
+                    nn.init.constant_(block.mapper.weight, 0)
+
+    def gen_r_embedding(self, r, max_positions=10000):
+        r = r * max_positions
+        half_dim = self.c_r // 2
+        emb = math.log(max_positions) / (half_dim - 1)
+        emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp()
+        emb = r[:, None] * emb[None, :]
+        emb = torch.cat([emb.sin(), emb.cos()], dim=1)
+        if self.c_r % 2 == 1:  # zero pad
+            emb = nn.functional.pad(emb, (0, 1), mode="constant")
+        return emb.to(dtype=r.dtype)
+
+    def gen_c_embeddings(self, clip):
+        clip = self.clip_mapper(clip)
+        clip = self.seq_norm(clip)
+        return clip
+
+    def _down_encode(self, x, r_embed, effnet, clip=None):
+        level_outputs = []
+        for i, down_block in enumerate(self.down_blocks):
+            effnet_c = None
+            for block in down_block:
+                if isinstance(block, ResBlockStageB):
+                    if effnet_c is None and self.effnet_mappers[i] is not None:
+                        dtype = effnet.dtype
+                        effnet_c = self.effnet_mappers[i](
+                            nn.functional.interpolate(
+                                effnet.float(), size=x.shape[-2:], mode="bicubic", antialias=True, align_corners=True
+                            ).to(dtype)
+                        )
+                    skip = effnet_c if self.effnet_mappers[i] is not None else None
+                    x = block(x, skip)
+                elif isinstance(block, AttnBlock):
+                    x = block(x, clip)
+                elif isinstance(block, TimestepBlock):
+                    x = block(x, r_embed)
+                else:
+                    x = block(x)
+            level_outputs.insert(0, x)
+        return level_outputs
+
+    def _up_decode(self, level_outputs, r_embed, effnet, clip=None):
+        x = level_outputs[0]
+        for i, up_block in enumerate(self.up_blocks):
+            effnet_c = None
+            for j, block in enumerate(up_block):
+                if isinstance(block, ResBlockStageB):
+                    if effnet_c is None and self.effnet_mappers[len(self.down_blocks) + i] is not None:
+                        dtype = effnet.dtype
+                        effnet_c = self.effnet_mappers[len(self.down_blocks) + i](
+                            nn.functional.interpolate(
+                                effnet.float(), size=x.shape[-2:], mode="bicubic", antialias=True, align_corners=True
+                            ).to(dtype)
+                        )
+                    skip = level_outputs[i] if j == 0 and i > 0 else None
+                    if effnet_c is not None:
+                        if skip is not None:
+                            skip = torch.cat([skip, effnet_c], dim=1)
+                        else:
+                            skip = effnet_c
+                    x = block(x, skip)
+                elif isinstance(block, AttnBlock):
+                    x = block(x, clip)
+                elif isinstance(block, TimestepBlock):
+                    x = block(x, r_embed)
+                else:
+                    x = block(x)
+        return x
+
+    def forward(self, x, r, effnet, clip=None, x_cat=None, eps=1e-3, return_noise=True):
+        if x_cat is not None:
+            x = torch.cat([x, x_cat], dim=1)
+        # Process the conditioning embeddings
+        r_embed = self.gen_r_embedding(r)
+        if clip is not None:
+            clip = self.gen_c_embeddings(clip)
+
+        # Model Blocks
+        x_in = x
+        x = self.embedding(x)
+        level_outputs = self._down_encode(x, r_embed, effnet, clip)
+        x = self._up_decode(level_outputs, r_embed, effnet, clip)
+        a, b = self.clf(x).chunk(2, dim=1)
+        b = b.sigmoid() * (1 - eps * 2) + eps
+        if return_noise:
+            return (x_in - a) / b
+        else:
+            return a, b
+
+
+class ResBlockStageB(nn.Module):
+    def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0):
+        super().__init__()
+        self.depthwise = nn.Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c)
+        self.norm = WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6)
+        self.channelwise = nn.Sequential(
+            nn.Linear(c + c_skip, c * 4),
+            nn.GELU(),
+            GlobalResponseNorm(c * 4),
+            nn.Dropout(dropout),
+            nn.Linear(c * 4, c),
+        )
+
+    def forward(self, x, x_skip=None):
+        x_res = x
+        x = self.norm(self.depthwise(x))
+        if x_skip is not None:
+            x = torch.cat([x, x_skip], dim=1)
+        x = self.channelwise(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        return x + x_res
diff --git a/diffusers/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py b/diffusers/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..edb0c1ec45dee08ced9606f204f61d6f9a53dada
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2023 Dominic Rampas MIT License
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Dict, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin
+from ...models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from ...models.modeling_utils import ModelMixin
+from ...utils import is_torch_version
+from .modeling_wuerstchen_common import AttnBlock, ResBlock, TimestepBlock, WuerstchenLayerNorm
+
+
+class WuerstchenPrior(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
+    unet_name = "prior"
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(self, c_in=16, c=1280, c_cond=1024, c_r=64, depth=16, nhead=16, dropout=0.1):
+        super().__init__()
+
+        self.c_r = c_r
+        self.projection = nn.Conv2d(c_in, c, kernel_size=1)
+        self.cond_mapper = nn.Sequential(
+            nn.Linear(c_cond, c),
+            nn.LeakyReLU(0.2),
+            nn.Linear(c, c),
+        )
+
+        self.blocks = nn.ModuleList()
+        for _ in range(depth):
+            self.blocks.append(ResBlock(c, dropout=dropout))
+            self.blocks.append(TimestepBlock(c, c_r))
+            self.blocks.append(AttnBlock(c, c, nhead, self_attn=True, dropout=dropout))
+        self.out = nn.Sequential(
+            WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6),
+            nn.Conv2d(c, c_in * 2, kernel_size=1),
+        )
+
+        self.gradient_checkpointing = False
+        self.set_default_attn_processor()
+
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        self.gradient_checkpointing = value
+
+    def gen_r_embedding(self, r, max_positions=10000):
+        r = r * max_positions
+        half_dim = self.c_r // 2
+        emb = math.log(max_positions) / (half_dim - 1)
+        emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp()
+        emb = r[:, None] * emb[None, :]
+        emb = torch.cat([emb.sin(), emb.cos()], dim=1)
+        if self.c_r % 2 == 1:  # zero pad
+            emb = nn.functional.pad(emb, (0, 1), mode="constant")
+        return emb.to(dtype=r.dtype)
+
+    def forward(self, x, r, c):
+        x_in = x
+        x = self.projection(x)
+        c_embed = self.cond_mapper(c)
+        r_embed = self.gen_r_embedding(r)
+
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            if is_torch_version(">=", "1.11.0"):
+                for block in self.blocks:
+                    if isinstance(block, AttnBlock):
+                        x = torch.utils.checkpoint.checkpoint(
+                            create_custom_forward(block), x, c_embed, use_reentrant=False
+                        )
+                    elif isinstance(block, TimestepBlock):
+                        x = torch.utils.checkpoint.checkpoint(
+                            create_custom_forward(block), x, r_embed, use_reentrant=False
+                        )
+                    else:
+                        x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False)
+            else:
+                for block in self.blocks:
+                    if isinstance(block, AttnBlock):
+                        x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, c_embed)
+                    elif isinstance(block, TimestepBlock):
+                        x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, r_embed)
+                    else:
+                        x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x)
+        else:
+            for block in self.blocks:
+                if isinstance(block, AttnBlock):
+                    x = block(x, c_embed)
+                elif isinstance(block, TimestepBlock):
+                    x = block(x, r_embed)
+                else:
+                    x = block(x)
+        a, b = self.out(x).chunk(2, dim=1)
+        return (x_in - a) / ((1 - b).abs() + 1e-5)
diff --git a/diffusers/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/diffusers/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..b08421415b235bc3b4876e7a12149f946f4e5c06
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py
@@ -0,0 +1,438 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...schedulers import DDPMWuerstchenScheduler
+from ...utils import deprecate, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from .modeling_paella_vq_model import PaellaVQModel
+from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import WuerstchenPriorPipeline, WuerstchenDecoderPipeline
+
+        >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained(
+        ...     "warp-ai/wuerstchen-prior", torch_dtype=torch.float16
+        ... ).to("cuda")
+        >>> gen_pipe = WuerstchenDecoderPipeline.from_pretrain("warp-ai/wuerstchen", torch_dtype=torch.float16).to(
+        ...     "cuda"
+        ... )
+
+        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
+        >>> prior_output = pipe(prompt)
+        >>> images = gen_pipe(prior_output.image_embeddings, prompt=prompt)
+        ```
+"""
+
+
+class WuerstchenDecoderPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating images from the Wuerstchen model.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        tokenizer (`CLIPTokenizer`):
+            The CLIP tokenizer.
+        text_encoder (`CLIPTextModel`):
+            The CLIP text encoder.
+        decoder ([`WuerstchenDiffNeXt`]):
+            The WuerstchenDiffNeXt unet decoder.
+        vqgan ([`PaellaVQModel`]):
+            The VQGAN model.
+        scheduler ([`DDPMWuerstchenScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+        latent_dim_scale (float, `optional`, defaults to 10.67):
+            Multiplier to determine the VQ latent space size from the image embeddings. If the image embeddings are
+            height=24 and width=24, the VQ latent shape needs to be height=int(24*10.67)=256 and
+            width=int(24*10.67)=256 in order to match the training conditions.
+    """
+
+    model_cpu_offload_seq = "text_encoder->decoder->vqgan"
+    _callback_tensor_inputs = [
+        "latents",
+        "text_encoder_hidden_states",
+        "negative_prompt_embeds",
+        "image_embeddings",
+    ]
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModel,
+        decoder: WuerstchenDiffNeXt,
+        scheduler: DDPMWuerstchenScheduler,
+        vqgan: PaellaVQModel,
+        latent_dim_scale: float = 10.67,
+    ) -> None:
+        super().__init__()
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            decoder=decoder,
+            scheduler=scheduler,
+            vqgan=vqgan,
+        )
+        self.register_to_config(latent_dim_scale=latent_dim_scale)
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+    ):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        attention_mask = text_inputs.attention_mask
+
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+            attention_mask = attention_mask[:, : self.tokenizer.model_max_length]
+
+        text_encoder_output = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask.to(device))
+        text_encoder_hidden_states = text_encoder_output.last_hidden_state
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+
+        uncond_text_encoder_hidden_states = None
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(
+                uncond_input.input_ids.to(device), attention_mask=uncond_input.attention_mask.to(device)
+            )
+
+            uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+        return text_encoder_hidden_states, uncond_text_encoder_hidden_states
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image_embeddings: Union[torch.Tensor, List[torch.Tensor]],
+        prompt: Union[str, List[str]] = None,
+        num_inference_steps: int = 12,
+        timesteps: Optional[List[float]] = None,
+        guidance_scale: float = 0.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image_embedding (`torch.Tensor` or `List[torch.Tensor]`):
+                Image Embeddings either extracted from an image or generated by a Prior Model.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            num_inference_steps (`int`, *optional*, defaults to 12):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 0.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `decoder_guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
+                `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely
+                linked to the text `prompt`, usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `decoder_guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True,
+            otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image
+            embeddings.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        # 0. Define commonly used variables
+        device = self._execution_device
+        dtype = self.decoder.dtype
+        self._guidance_scale = guidance_scale
+
+        # 1. Check inputs. Raise error if not correct
+        if not isinstance(prompt, list):
+            if isinstance(prompt, str):
+                prompt = [prompt]
+            else:
+                raise TypeError(f"'prompt' must be of type 'list' or 'str', but got {type(prompt)}.")
+
+        if self.do_classifier_free_guidance:
+            if negative_prompt is not None and not isinstance(negative_prompt, list):
+                if isinstance(negative_prompt, str):
+                    negative_prompt = [negative_prompt]
+                else:
+                    raise TypeError(
+                        f"'negative_prompt' must be of type 'list' or 'str', but got {type(negative_prompt)}."
+                    )
+
+        if isinstance(image_embeddings, list):
+            image_embeddings = torch.cat(image_embeddings, dim=0)
+        if isinstance(image_embeddings, np.ndarray):
+            image_embeddings = torch.Tensor(image_embeddings, device=device).to(dtype=dtype)
+        if not isinstance(image_embeddings, torch.Tensor):
+            raise TypeError(
+                f"'image_embeddings' must be of type 'torch.Tensor' or 'np.array', but got {type(image_embeddings)}."
+            )
+
+        if not isinstance(num_inference_steps, int):
+            raise TypeError(
+                f"'num_inference_steps' must be of type 'int', but got {type(num_inference_steps)}\
+                           In Case you want to provide explicit timesteps, please use the 'timesteps' argument."
+            )
+
+        # 2. Encode caption
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            image_embeddings.size(0) * num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+        )
+        text_encoder_hidden_states = (
+            torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds
+        )
+        effnet = (
+            torch.cat([image_embeddings, torch.zeros_like(image_embeddings)])
+            if self.do_classifier_free_guidance
+            else image_embeddings
+        )
+
+        # 3. Determine latent shape of latents
+        latent_height = int(image_embeddings.size(2) * self.config.latent_dim_scale)
+        latent_width = int(image_embeddings.size(3) * self.config.latent_dim_scale)
+        latent_features_shape = (image_embeddings.size(0) * num_images_per_prompt, 4, latent_height, latent_width)
+
+        # 4. Prepare and set timesteps
+        if timesteps is not None:
+            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
+            timesteps = self.scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        else:
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latents
+        latents = self.prepare_latents(latent_features_shape, dtype, device, generator, latents, self.scheduler)
+
+        # 6. Run denoising loop
+        self._num_timesteps = len(timesteps[:-1])
+        for i, t in enumerate(self.progress_bar(timesteps[:-1])):
+            ratio = t.expand(latents.size(0)).to(dtype)
+            # 7. Denoise latents
+            predicted_latents = self.decoder(
+                torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents,
+                r=torch.cat([ratio] * 2) if self.do_classifier_free_guidance else ratio,
+                effnet=effnet,
+                clip=text_encoder_hidden_states,
+            )
+
+            # 8. Check for classifier free guidance and apply it
+            if self.do_classifier_free_guidance:
+                predicted_latents_text, predicted_latents_uncond = predicted_latents.chunk(2)
+                predicted_latents = torch.lerp(predicted_latents_uncond, predicted_latents_text, self.guidance_scale)
+
+            # 9. Renoise latents to next timestep
+            latents = self.scheduler.step(
+                model_output=predicted_latents,
+                timestep=ratio,
+                sample=latents,
+                generator=generator,
+            ).prev_sample
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                image_embeddings = callback_outputs.pop("image_embeddings", image_embeddings)
+                text_encoder_hidden_states = callback_outputs.pop(
+                    "text_encoder_hidden_states", text_encoder_hidden_states
+                )
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        if output_type not in ["pt", "np", "pil", "latent"]:
+            raise ValueError(
+                f"Only the output types `pt`, `np`, `pil` and `latent` are supported not output_type={output_type}"
+            )
+
+        if not output_type == "latent":
+            # 10. Scale and decode the image latents with vq-vae
+            latents = self.vqgan.config.scale_factor * latents
+            images = self.vqgan.decode(latents).sample.clamp(0, 1)
+            if output_type == "np":
+                images = images.permute(0, 2, 3, 1).cpu().float().numpy()
+            elif output_type == "pil":
+                images = images.permute(0, 2, 3, 1).cpu().float().numpy()
+                images = self.numpy_to_pil(images)
+        else:
+            images = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return images
+        return ImagePipelineOutput(images)
diff --git a/diffusers/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/diffusers/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..7819c8c0a0efb6308c46b8c9f15b538a7e5ece34
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py
@@ -0,0 +1,306 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...schedulers import DDPMWuerstchenScheduler
+from ...utils import deprecate, replace_example_docstring
+from ..pipeline_utils import DiffusionPipeline
+from .modeling_paella_vq_model import PaellaVQModel
+from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt
+from .modeling_wuerstchen_prior import WuerstchenPrior
+from .pipeline_wuerstchen import WuerstchenDecoderPipeline
+from .pipeline_wuerstchen_prior import WuerstchenPriorPipeline
+
+
+TEXT2IMAGE_EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusions import WuerstchenCombinedPipeline
+
+        >>> pipe = WuerstchenCombinedPipeline.from_pretrained("warp-ai/Wuerstchen", torch_dtype=torch.float16).to(
+        ...     "cuda"
+        ... )
+        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
+        >>> images = pipe(prompt=prompt)
+        ```
+"""
+
+
+class WuerstchenCombinedPipeline(DiffusionPipeline):
+    """
+    Combined Pipeline for text-to-image generation using Wuerstchen
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        tokenizer (`CLIPTokenizer`):
+            The decoder tokenizer to be used for text inputs.
+        text_encoder (`CLIPTextModel`):
+            The decoder text encoder to be used for text inputs.
+        decoder (`WuerstchenDiffNeXt`):
+            The decoder model to be used for decoder image generation pipeline.
+        scheduler (`DDPMWuerstchenScheduler`):
+            The scheduler to be used for decoder image generation pipeline.
+        vqgan (`PaellaVQModel`):
+            The VQGAN model to be used for decoder image generation pipeline.
+        prior_tokenizer (`CLIPTokenizer`):
+            The prior tokenizer to be used for text inputs.
+        prior_text_encoder (`CLIPTextModel`):
+            The prior text encoder to be used for text inputs.
+        prior_prior (`WuerstchenPrior`):
+            The prior model to be used for prior pipeline.
+        prior_scheduler (`DDPMWuerstchenScheduler`):
+            The scheduler to be used for prior pipeline.
+    """
+
+    _load_connected_pipes = True
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModel,
+        decoder: WuerstchenDiffNeXt,
+        scheduler: DDPMWuerstchenScheduler,
+        vqgan: PaellaVQModel,
+        prior_tokenizer: CLIPTokenizer,
+        prior_text_encoder: CLIPTextModel,
+        prior_prior: WuerstchenPrior,
+        prior_scheduler: DDPMWuerstchenScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            decoder=decoder,
+            scheduler=scheduler,
+            vqgan=vqgan,
+            prior_prior=prior_prior,
+            prior_text_encoder=prior_text_encoder,
+            prior_tokenizer=prior_tokenizer,
+            prior_scheduler=prior_scheduler,
+        )
+        self.prior_pipe = WuerstchenPriorPipeline(
+            prior=prior_prior,
+            text_encoder=prior_text_encoder,
+            tokenizer=prior_tokenizer,
+            scheduler=prior_scheduler,
+        )
+        self.decoder_pipe = WuerstchenDecoderPipeline(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            decoder=decoder,
+            scheduler=scheduler,
+            vqgan=vqgan,
+        )
+
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        """
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
+        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
+        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
+        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+
+    def progress_bar(self, iterable=None, total=None):
+        self.prior_pipe.progress_bar(iterable=iterable, total=total)
+        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
+
+    def set_progress_bar_config(self, **kwargs):
+        self.prior_pipe.set_progress_bar_config(**kwargs)
+        self.decoder_pipe.set_progress_bar_config(**kwargs)
+
+    @torch.no_grad()
+    @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        prior_num_inference_steps: int = 60,
+        prior_timesteps: Optional[List[float]] = None,
+        prior_guidance_scale: float = 4.0,
+        num_inference_steps: int = 12,
+        decoder_timesteps: Optional[List[float]] = None,
+        decoder_guidance_scale: float = 0.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation for the prior and decoder.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.*
+                prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `prior_guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
+                `prior_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked
+                to the text `prompt`, usually at the expense of lower image quality.
+            prior_num_inference_steps (`Union[int, Dict[float, int]]`, *optional*, defaults to 60):
+                The number of prior denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. For more specific timestep spacing, you can pass customized
+                `prior_timesteps`
+            num_inference_steps (`int`, *optional*, defaults to 12):
+                The number of decoder denoising steps. More denoising steps usually lead to a higher quality image at
+                the expense of slower inference. For more specific timestep spacing, you can pass customized
+                `timesteps`
+            prior_timesteps (`List[float]`, *optional*):
+                Custom timesteps to use for the denoising process for the prior. If not defined, equal spaced
+                `prior_num_inference_steps` timesteps are used. Must be in descending order.
+            decoder_timesteps (`List[float]`, *optional*):
+                Custom timesteps to use for the denoising process for the decoder. If not defined, equal spaced
+                `num_inference_steps` timesteps are used. Must be in descending order.
+            decoder_guidance_scale (`float`, *optional*, defaults to 0.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            prior_callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `prior_callback_on_step_end(self: DiffusionPipeline, step: int, timestep:
+                int, callback_kwargs: Dict)`.
+            prior_callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the
+                list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in
+                the `._callback_tensor_inputs` attribute of your pipeline class.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True,
+            otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        prior_kwargs = {}
+        if kwargs.get("prior_callback", None) is not None:
+            prior_kwargs["callback"] = kwargs.pop("prior_callback")
+            deprecate(
+                "prior_callback",
+                "1.0.0",
+                "Passing `prior_callback` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`",
+            )
+        if kwargs.get("prior_callback_steps", None) is not None:
+            deprecate(
+                "prior_callback_steps",
+                "1.0.0",
+                "Passing `prior_callback_steps` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`",
+            )
+            prior_kwargs["callback_steps"] = kwargs.pop("prior_callback_steps")
+
+        prior_outputs = self.prior_pipe(
+            prompt=prompt if prompt_embeds is None else None,
+            height=height,
+            width=width,
+            num_inference_steps=prior_num_inference_steps,
+            timesteps=prior_timesteps,
+            guidance_scale=prior_guidance_scale,
+            negative_prompt=negative_prompt if negative_prompt_embeds is None else None,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            num_images_per_prompt=num_images_per_prompt,
+            generator=generator,
+            latents=latents,
+            output_type="pt",
+            return_dict=False,
+            callback_on_step_end=prior_callback_on_step_end,
+            callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs,
+            **prior_kwargs,
+        )
+        image_embeddings = prior_outputs[0]
+
+        outputs = self.decoder_pipe(
+            image_embeddings=image_embeddings,
+            prompt=prompt if prompt is not None else "",
+            num_inference_steps=num_inference_steps,
+            timesteps=decoder_timesteps,
+            guidance_scale=decoder_guidance_scale,
+            negative_prompt=negative_prompt,
+            generator=generator,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback_on_step_end=callback_on_step_end,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            **kwargs,
+        )
+
+        return outputs
diff --git a/diffusers/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/diffusers/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..92223ce993a635f6603a6429305dfe2a67059321
--- /dev/null
+++ b/diffusers/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py
@@ -0,0 +1,517 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from math import ceil
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...loaders import StableDiffusionLoraLoaderMixin
+from ...schedulers import DDPMWuerstchenScheduler
+from ...utils import BaseOutput, deprecate, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .modeling_wuerstchen_prior import WuerstchenPrior
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+DEFAULT_STAGE_C_TIMESTEPS = list(np.linspace(1.0, 2 / 3, 20)) + list(np.linspace(2 / 3, 0.0, 11))[1:]
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import WuerstchenPriorPipeline
+
+        >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained(
+        ...     "warp-ai/wuerstchen-prior", torch_dtype=torch.float16
+        ... ).to("cuda")
+
+        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
+        >>> prior_output = pipe(prompt)
+        ```
+"""
+
+
+@dataclass
+class WuerstchenPriorPipelineOutput(BaseOutput):
+    """
+    Output class for WuerstchenPriorPipeline.
+
+    Args:
+        image_embeddings (`torch.Tensor` or `np.ndarray`)
+            Prior image embeddings for text prompt
+
+    """
+
+    image_embeddings: Union[torch.Tensor, np.ndarray]
+
+
+class WuerstchenPriorPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
+    """
+    Pipeline for generating image prior for Wuerstchen.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
+    Args:
+        prior ([`Prior`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        scheduler ([`DDPMWuerstchenScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+        latent_mean ('float', *optional*, defaults to 42.0):
+            Mean value for latent diffusers.
+        latent_std ('float', *optional*, defaults to 1.0):
+            Standard value for latent diffusers.
+        resolution_multiple ('float', *optional*, defaults to 42.67):
+            Default resolution for multiple images generated.
+    """
+
+    unet_name = "prior"
+    text_encoder_name = "text_encoder"
+    model_cpu_offload_seq = "text_encoder->prior"
+    _callback_tensor_inputs = ["latents", "text_encoder_hidden_states", "negative_prompt_embeds"]
+    _lora_loadable_modules = ["prior", "text_encoder"]
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModel,
+        prior: WuerstchenPrior,
+        scheduler: DDPMWuerstchenScheduler,
+        latent_mean: float = 42.0,
+        latent_std: float = 1.0,
+        resolution_multiple: float = 42.67,
+    ) -> None:
+        super().__init__()
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            prior=prior,
+            scheduler=scheduler,
+        )
+        self.register_to_config(
+            latent_mean=latent_mean, latent_std=latent_std, resolution_multiple=resolution_multiple
+        )
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def encode_prompt(
+        self,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        prompt=None,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+    ):
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            attention_mask = text_inputs.attention_mask
+
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+                attention_mask = attention_mask[:, : self.tokenizer.model_max_length]
+
+            text_encoder_output = self.text_encoder(
+                text_input_ids.to(device), attention_mask=attention_mask.to(device)
+            )
+            prompt_embeds = text_encoder_output.last_hidden_state
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if negative_prompt_embeds is None and do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(
+                uncond_input.input_ids.to(device), attention_mask=uncond_input.attention_mask.to(device)
+            )
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.last_hidden_state
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            # done duplicates
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt,
+        num_inference_steps,
+        do_classifier_free_guidance,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if not isinstance(num_inference_steps, int):
+            raise TypeError(
+                f"'num_inference_steps' must be of type 'int', but got {type(num_inference_steps)}\
+                           In Case you want to provide explicit timesteps, please use the 'timesteps' argument."
+            )
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 1024,
+        width: int = 1024,
+        num_inference_steps: int = 60,
+        timesteps: List[float] = None,
+        guidance_scale: float = 8.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pt",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 1024):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 1024):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 60):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 8.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `decoder_guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
+                `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely
+                linked to the text `prompt`, usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `decoder_guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.WuerstchenPriorPipelineOutput`] or `tuple` [`~pipelines.WuerstchenPriorPipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated image embeddings.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        # 0. Define commonly used variables
+        device = self._execution_device
+        self._guidance_scale = guidance_scale
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # 1. Check inputs. Raise error if not correct
+        if prompt is not None and not isinstance(prompt, list):
+            if isinstance(prompt, str):
+                prompt = [prompt]
+            else:
+                raise TypeError(f"'prompt' must be of type 'list' or 'str', but got {type(prompt)}.")
+
+        if self.do_classifier_free_guidance:
+            if negative_prompt is not None and not isinstance(negative_prompt, list):
+                if isinstance(negative_prompt, str):
+                    negative_prompt = [negative_prompt]
+                else:
+                    raise TypeError(
+                        f"'negative_prompt' must be of type 'list' or 'str', but got {type(negative_prompt)}."
+                    )
+
+        self.check_inputs(
+            prompt,
+            negative_prompt,
+            num_inference_steps,
+            self.do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 2. Encode caption
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        text_encoder_hidden_states = (
+            torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds
+        )
+
+        # 3. Determine latent shape of image embeddings
+        dtype = text_encoder_hidden_states.dtype
+        latent_height = ceil(height / self.config.resolution_multiple)
+        latent_width = ceil(width / self.config.resolution_multiple)
+        num_channels = self.prior.config.c_in
+        effnet_features_shape = (num_images_per_prompt * batch_size, num_channels, latent_height, latent_width)
+
+        # 4. Prepare and set timesteps
+        if timesteps is not None:
+            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
+            timesteps = self.scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        else:
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latents
+        latents = self.prepare_latents(effnet_features_shape, dtype, device, generator, latents, self.scheduler)
+
+        # 6. Run denoising loop
+        self._num_timesteps = len(timesteps[:-1])
+        for i, t in enumerate(self.progress_bar(timesteps[:-1])):
+            ratio = t.expand(latents.size(0)).to(dtype)
+
+            # 7. Denoise image embeddings
+            predicted_image_embedding = self.prior(
+                torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents,
+                r=torch.cat([ratio] * 2) if self.do_classifier_free_guidance else ratio,
+                c=text_encoder_hidden_states,
+            )
+
+            # 8. Check for classifier free guidance and apply it
+            if self.do_classifier_free_guidance:
+                predicted_image_embedding_text, predicted_image_embedding_uncond = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = torch.lerp(
+                    predicted_image_embedding_uncond, predicted_image_embedding_text, self.guidance_scale
+                )
+
+            # 9. Renoise latents to next timestep
+            latents = self.scheduler.step(
+                model_output=predicted_image_embedding,
+                timestep=ratio,
+                sample=latents,
+                generator=generator,
+            ).prev_sample
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                text_encoder_hidden_states = callback_outputs.pop(
+                    "text_encoder_hidden_states", text_encoder_hidden_states
+                )
+                negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        # 10. Denormalize the latents
+        latents = latents * self.config.latent_mean - self.config.latent_std
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if output_type == "np":
+            latents = latents.cpu().float().numpy()
+
+        if not return_dict:
+            return (latents,)
+
+        return WuerstchenPriorPipelineOutput(latents)
diff --git a/diffusers/src/diffusers/py.typed b/diffusers/src/diffusers/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/src/diffusers/schedulers/README.md b/diffusers/src/diffusers/schedulers/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..31ad27793e34783faabc222adf98691fb396a0d8
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/README.md
@@ -0,0 +1,3 @@
+# Schedulers
+
+For more information on the schedulers, please refer to the [docs](https://huggingface.co/docs/diffusers/api/schedulers/overview).
\ No newline at end of file
diff --git a/diffusers/src/diffusers/schedulers/__init__.py b/diffusers/src/diffusers/schedulers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb9088538653721f16938894cc694e55fad0b5e8
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/__init__.py
@@ -0,0 +1,221 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ..utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_scipy_available,
+    is_torch_available,
+    is_torchsde_available,
+)
+
+
+_dummy_modules = {}
+_import_structure = {}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_pt_objects  # noqa F403
+
+    _dummy_modules.update(get_objects_from_module(dummy_pt_objects))
+
+else:
+    _import_structure["deprecated"] = ["KarrasVeScheduler", "ScoreSdeVpScheduler"]
+    _import_structure["scheduling_amused"] = ["AmusedScheduler"]
+    _import_structure["scheduling_consistency_decoder"] = ["ConsistencyDecoderScheduler"]
+    _import_structure["scheduling_consistency_models"] = ["CMStochasticIterativeScheduler"]
+    _import_structure["scheduling_ddim"] = ["DDIMScheduler"]
+    _import_structure["scheduling_ddim_cogvideox"] = ["CogVideoXDDIMScheduler"]
+    _import_structure["scheduling_ddim_inverse"] = ["DDIMInverseScheduler"]
+    _import_structure["scheduling_ddim_parallel"] = ["DDIMParallelScheduler"]
+    _import_structure["scheduling_ddpm"] = ["DDPMScheduler"]
+    _import_structure["scheduling_ddpm_parallel"] = ["DDPMParallelScheduler"]
+    _import_structure["scheduling_ddpm_wuerstchen"] = ["DDPMWuerstchenScheduler"]
+    _import_structure["scheduling_deis_multistep"] = ["DEISMultistepScheduler"]
+    _import_structure["scheduling_dpm_cogvideox"] = ["CogVideoXDPMScheduler"]
+    _import_structure["scheduling_dpmsolver_multistep"] = ["DPMSolverMultistepScheduler"]
+    _import_structure["scheduling_dpmsolver_multistep_inverse"] = ["DPMSolverMultistepInverseScheduler"]
+    _import_structure["scheduling_dpmsolver_singlestep"] = ["DPMSolverSinglestepScheduler"]
+    _import_structure["scheduling_edm_dpmsolver_multistep"] = ["EDMDPMSolverMultistepScheduler"]
+    _import_structure["scheduling_edm_euler"] = ["EDMEulerScheduler"]
+    _import_structure["scheduling_euler_ancestral_discrete"] = ["EulerAncestralDiscreteScheduler"]
+    _import_structure["scheduling_euler_discrete"] = ["EulerDiscreteScheduler"]
+    _import_structure["scheduling_flow_match_euler_discrete"] = ["FlowMatchEulerDiscreteScheduler"]
+    _import_structure["scheduling_flow_match_heun_discrete"] = ["FlowMatchHeunDiscreteScheduler"]
+    _import_structure["scheduling_heun_discrete"] = ["HeunDiscreteScheduler"]
+    _import_structure["scheduling_ipndm"] = ["IPNDMScheduler"]
+    _import_structure["scheduling_k_dpm_2_ancestral_discrete"] = ["KDPM2AncestralDiscreteScheduler"]
+    _import_structure["scheduling_k_dpm_2_discrete"] = ["KDPM2DiscreteScheduler"]
+    _import_structure["scheduling_lcm"] = ["LCMScheduler"]
+    _import_structure["scheduling_pndm"] = ["PNDMScheduler"]
+    _import_structure["scheduling_repaint"] = ["RePaintScheduler"]
+    _import_structure["scheduling_sasolver"] = ["SASolverScheduler"]
+    _import_structure["scheduling_sde_ve"] = ["ScoreSdeVeScheduler"]
+    _import_structure["scheduling_tcd"] = ["TCDScheduler"]
+    _import_structure["scheduling_unclip"] = ["UnCLIPScheduler"]
+    _import_structure["scheduling_unipc_multistep"] = ["UniPCMultistepScheduler"]
+    _import_structure["scheduling_utils"] = ["AysSchedules", "KarrasDiffusionSchedulers", "SchedulerMixin"]
+    _import_structure["scheduling_vq_diffusion"] = ["VQDiffusionScheduler"]
+
+try:
+    if not is_flax_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_flax_objects  # noqa F403
+
+    _dummy_modules.update(get_objects_from_module(dummy_flax_objects))
+
+else:
+    _import_structure["scheduling_ddim_flax"] = ["FlaxDDIMScheduler"]
+    _import_structure["scheduling_ddpm_flax"] = ["FlaxDDPMScheduler"]
+    _import_structure["scheduling_dpmsolver_multistep_flax"] = ["FlaxDPMSolverMultistepScheduler"]
+    _import_structure["scheduling_euler_discrete_flax"] = ["FlaxEulerDiscreteScheduler"]
+    _import_structure["scheduling_karras_ve_flax"] = ["FlaxKarrasVeScheduler"]
+    _import_structure["scheduling_lms_discrete_flax"] = ["FlaxLMSDiscreteScheduler"]
+    _import_structure["scheduling_pndm_flax"] = ["FlaxPNDMScheduler"]
+    _import_structure["scheduling_sde_ve_flax"] = ["FlaxScoreSdeVeScheduler"]
+    _import_structure["scheduling_utils_flax"] = [
+        "FlaxKarrasDiffusionSchedulers",
+        "FlaxSchedulerMixin",
+        "FlaxSchedulerOutput",
+        "broadcast_to_shape_from_left",
+    ]
+
+
+try:
+    if not (is_torch_available() and is_scipy_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_torch_and_scipy_objects  # noqa F403
+
+    _dummy_modules.update(get_objects_from_module(dummy_torch_and_scipy_objects))
+
+else:
+    _import_structure["scheduling_lms_discrete"] = ["LMSDiscreteScheduler"]
+
+try:
+    if not (is_torch_available() and is_torchsde_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_torch_and_torchsde_objects  # noqa F403
+
+    _dummy_modules.update(get_objects_from_module(dummy_torch_and_torchsde_objects))
+
+else:
+    _import_structure["scheduling_cosine_dpmsolver_multistep"] = ["CosineDPMSolverMultistepScheduler"]
+    _import_structure["scheduling_dpmsolver_sde"] = ["DPMSolverSDEScheduler"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from ..utils import (
+        OptionalDependencyNotAvailable,
+        is_flax_available,
+        is_scipy_available,
+        is_torch_available,
+        is_torchsde_available,
+    )
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ..utils.dummy_pt_objects import *  # noqa F403
+    else:
+        from .deprecated import KarrasVeScheduler, ScoreSdeVpScheduler
+        from .scheduling_amused import AmusedScheduler
+        from .scheduling_consistency_decoder import ConsistencyDecoderScheduler
+        from .scheduling_consistency_models import CMStochasticIterativeScheduler
+        from .scheduling_ddim import DDIMScheduler
+        from .scheduling_ddim_cogvideox import CogVideoXDDIMScheduler
+        from .scheduling_ddim_inverse import DDIMInverseScheduler
+        from .scheduling_ddim_parallel import DDIMParallelScheduler
+        from .scheduling_ddpm import DDPMScheduler
+        from .scheduling_ddpm_parallel import DDPMParallelScheduler
+        from .scheduling_ddpm_wuerstchen import DDPMWuerstchenScheduler
+        from .scheduling_deis_multistep import DEISMultistepScheduler
+        from .scheduling_dpm_cogvideox import CogVideoXDPMScheduler
+        from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler
+        from .scheduling_dpmsolver_multistep_inverse import DPMSolverMultistepInverseScheduler
+        from .scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
+        from .scheduling_edm_dpmsolver_multistep import EDMDPMSolverMultistepScheduler
+        from .scheduling_edm_euler import EDMEulerScheduler
+        from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
+        from .scheduling_euler_discrete import EulerDiscreteScheduler
+        from .scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
+        from .scheduling_flow_match_heun_discrete import FlowMatchHeunDiscreteScheduler
+        from .scheduling_heun_discrete import HeunDiscreteScheduler
+        from .scheduling_ipndm import IPNDMScheduler
+        from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler
+        from .scheduling_k_dpm_2_discrete import KDPM2DiscreteScheduler
+        from .scheduling_lcm import LCMScheduler
+        from .scheduling_pndm import PNDMScheduler
+        from .scheduling_repaint import RePaintScheduler
+        from .scheduling_sasolver import SASolverScheduler
+        from .scheduling_sde_ve import ScoreSdeVeScheduler
+        from .scheduling_tcd import TCDScheduler
+        from .scheduling_unclip import UnCLIPScheduler
+        from .scheduling_unipc_multistep import UniPCMultistepScheduler
+        from .scheduling_utils import AysSchedules, KarrasDiffusionSchedulers, SchedulerMixin
+        from .scheduling_vq_diffusion import VQDiffusionScheduler
+
+    try:
+        if not is_flax_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ..utils.dummy_flax_objects import *  # noqa F403
+    else:
+        from .scheduling_ddim_flax import FlaxDDIMScheduler
+        from .scheduling_ddpm_flax import FlaxDDPMScheduler
+        from .scheduling_dpmsolver_multistep_flax import FlaxDPMSolverMultistepScheduler
+        from .scheduling_euler_discrete_flax import FlaxEulerDiscreteScheduler
+        from .scheduling_karras_ve_flax import FlaxKarrasVeScheduler
+        from .scheduling_lms_discrete_flax import FlaxLMSDiscreteScheduler
+        from .scheduling_pndm_flax import FlaxPNDMScheduler
+        from .scheduling_sde_ve_flax import FlaxScoreSdeVeScheduler
+        from .scheduling_utils_flax import (
+            FlaxKarrasDiffusionSchedulers,
+            FlaxSchedulerMixin,
+            FlaxSchedulerOutput,
+            broadcast_to_shape_from_left,
+        )
+
+    try:
+        if not (is_torch_available() and is_scipy_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ..utils.dummy_torch_and_scipy_objects import *  # noqa F403
+    else:
+        from .scheduling_lms_discrete import LMSDiscreteScheduler
+
+    try:
+        if not (is_torch_available() and is_torchsde_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ..utils.dummy_torch_and_torchsde_objects import *  # noqa F403
+    else:
+        from .scheduling_cosine_dpmsolver_multistep import CosineDPMSolverMultistepScheduler
+        from .scheduling_dpmsolver_sde import DPMSolverSDEScheduler
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+    for name, value in _dummy_modules.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f74c6039ed80f1498149b531c4f6fd2400269db
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_consistency_decoder.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_consistency_decoder.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f3dd3fddc6aa526c7d1dbfd33e9bab74060dcf3
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_consistency_decoder.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_ddim.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_ddim.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbf7dc7c47789d530785413fd5a5372463b6ff22
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_ddim.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_ddpm.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_ddpm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac814790c7f8651ea3c15cfd8282ecbc809b3aaa
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_ddpm.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_deis_multistep.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_deis_multistep.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b7a91f294db8ba083c8cd0c12d4c4c800af3154
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_deis_multistep.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_dpmsolver_multistep.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_dpmsolver_multistep.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc4dd58e69805607f562b3fe70839b3e3ae532ea
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_dpmsolver_multistep.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_dpmsolver_singlestep.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_dpmsolver_singlestep.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78e2f565fab233fbeba6c74fe948671a9337feff
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_dpmsolver_singlestep.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_edm_dpmsolver_multistep.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_edm_dpmsolver_multistep.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec6e1719ec59b3c66cce11bba2d05cce33c99b46
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_edm_dpmsolver_multistep.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_edm_euler.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_edm_euler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54d85a011a963de9c189b5f3b7050651235b47f9
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_edm_euler.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_euler_ancestral_discrete.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_euler_ancestral_discrete.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d4c3e16375ee7d923a6c0672c5e5e72402957916
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_euler_ancestral_discrete.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_euler_discrete.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_euler_discrete.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1e250c1a0fdb75160c7cc9164faaf854042e67d
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_euler_discrete.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_heun_discrete.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_heun_discrete.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67ae122d551a199e4a83eff68eab59beb46a61fd
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_heun_discrete.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_k_dpm_2_ancestral_discrete.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_k_dpm_2_ancestral_discrete.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d5f3daf2d7699edae6504e50eef2db868ffa2e20
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_k_dpm_2_ancestral_discrete.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_k_dpm_2_discrete.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_k_dpm_2_discrete.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4b71907572d69d18b831d8e4c6047284175d51d
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_k_dpm_2_discrete.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_lms_discrete.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_lms_discrete.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c884a4e0faa0eb8c966d6b8066e3a4e01677063c
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_lms_discrete.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_pndm.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_pndm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fce4729066baad80dc6a59ce9fa99d27dff59f35
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_pndm.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_unipc_multistep.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_unipc_multistep.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66cf8f8a9d1e80db3b76c8a554fb9ef7275df04d
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_unipc_multistep.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/__pycache__/scheduling_utils.cpython-311.pyc b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e78aa50e1f6d6638c239bf90b81a522c996648d
Binary files /dev/null and b/diffusers/src/diffusers/schedulers/__pycache__/scheduling_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/schedulers/deprecated/__init__.py b/diffusers/src/diffusers/schedulers/deprecated/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..479cf9bd568be320b60e87a831bf80835e9943f2
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/deprecated/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_pt_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_pt_objects))
+else:
+    _import_structure["scheduling_karras_ve"] = ["KarrasVeScheduler"]
+    _import_structure["scheduling_sde_vp"] = ["ScoreSdeVpScheduler"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_pt_objects import *  # noqa F403
+    else:
+        from .scheduling_karras_ve import KarrasVeScheduler
+        from .scheduling_sde_vp import ScoreSdeVpScheduler
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/diffusers/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py b/diffusers/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5f9bd256c2e57f03176d93d66461a3a43edecae
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py
@@ -0,0 +1,243 @@
+# Copyright 2024 NVIDIA and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ...utils.torch_utils import randn_tensor
+from ..scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class KarrasVeOutput(BaseOutput):
+    """
+    Output class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        derivative (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Derivative of predicted original image sample (x_0).
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    derivative: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+class KarrasVeScheduler(SchedulerMixin, ConfigMixin):
+    """
+    A stochastic scheduler tailored to variance-expanding models.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    <Tip>
+
+    For more details on the parameters, see [Appendix E](https://arxiv.org/abs/2206.00364). The grid search values used
+    to find the optimal `{s_noise, s_churn, s_min, s_max}` for a specific model are described in Table 5 of the paper.
+
+    </Tip>
+
+    Args:
+        sigma_min (`float`, defaults to 0.02):
+            The minimum noise magnitude.
+        sigma_max (`float`, defaults to 100):
+            The maximum noise magnitude.
+        s_noise (`float`, defaults to 1.007):
+            The amount of additional noise to counteract loss of detail during sampling. A reasonable range is [1.000,
+            1.011].
+        s_churn (`float`, defaults to 80):
+            The parameter controlling the overall amount of stochasticity. A reasonable range is [0, 100].
+        s_min (`float`, defaults to 0.05):
+            The start value of the sigma range to add noise (enable stochasticity). A reasonable range is [0, 10].
+        s_max (`float`, defaults to 50):
+            The end value of the sigma range to add noise. A reasonable range is [0.2, 80].
+    """
+
+    order = 2
+
+    @register_to_config
+    def __init__(
+        self,
+        sigma_min: float = 0.02,
+        sigma_max: float = 100,
+        s_noise: float = 1.007,
+        s_churn: float = 80,
+        s_min: float = 0.05,
+        s_max: float = 50,
+    ):
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = sigma_max
+
+        # setable values
+        self.num_inference_steps: int = None
+        self.timesteps: np.IntTensor = None
+        self.schedule: torch.Tensor = None  # sigma(t_i)
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+        timesteps = np.arange(0, self.num_inference_steps)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+        schedule = [
+            (
+                self.config.sigma_max**2
+                * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1))
+            )
+            for i in self.timesteps
+        ]
+        self.schedule = torch.tensor(schedule, dtype=torch.float32, device=device)
+
+    def add_noise_to_input(
+        self, sample: torch.Tensor, sigma: float, generator: Optional[torch.Generator] = None
+    ) -> Tuple[torch.Tensor, float]:
+        """
+        Explicit Langevin-like "churn" step of adding noise to the sample according to a `gamma_i ≥ 0` to reach a
+        higher noise level `sigma_hat = sigma_i + gamma_i*sigma_i`.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            sigma (`float`):
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+        """
+        if self.config.s_min <= sigma <= self.config.s_max:
+            gamma = min(self.config.s_churn / self.num_inference_steps, 2**0.5 - 1)
+        else:
+            gamma = 0
+
+        # sample eps ~ N(0, S_noise^2 * I)
+        eps = self.config.s_noise * randn_tensor(sample.shape, generator=generator).to(sample.device)
+        sigma_hat = sigma + gamma * sigma
+        sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps)
+
+        return sample_hat, sigma_hat
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        sigma_hat: float,
+        sigma_prev: float,
+        sample_hat: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[KarrasVeOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            sigma_hat (`float`):
+            sigma_prev (`float`):
+            sample_hat (`torch.Tensor`):
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] is returned,
+                otherwise a tuple is returned where the first element is the sample tensor.
+
+        """
+
+        pred_original_sample = sample_hat + sigma_hat * model_output
+        derivative = (sample_hat - pred_original_sample) / sigma_hat
+        sample_prev = sample_hat + (sigma_prev - sigma_hat) * derivative
+
+        if not return_dict:
+            return (sample_prev, derivative)
+
+        return KarrasVeOutput(
+            prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample
+        )
+
+    def step_correct(
+        self,
+        model_output: torch.Tensor,
+        sigma_hat: float,
+        sigma_prev: float,
+        sample_hat: torch.Tensor,
+        sample_prev: torch.Tensor,
+        derivative: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[KarrasVeOutput, Tuple]:
+        """
+        Corrects the predicted sample based on the `model_output` of the network.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            sigma_hat (`float`): TODO
+            sigma_prev (`float`): TODO
+            sample_hat (`torch.Tensor`): TODO
+            sample_prev (`torch.Tensor`): TODO
+            derivative (`torch.Tensor`): TODO
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
+
+        Returns:
+            prev_sample (TODO): updated sample in the diffusion chain. derivative (TODO): TODO
+
+        """
+        pred_original_sample = sample_prev + sigma_prev * model_output
+        derivative_corr = (sample_prev - pred_original_sample) / sigma_prev
+        sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr)
+
+        if not return_dict:
+            return (sample_prev, derivative)
+
+        return KarrasVeOutput(
+            prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample
+        )
+
+    def add_noise(self, original_samples, noise, timesteps):
+        raise NotImplementedError()
diff --git a/diffusers/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py b/diffusers/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py
new file mode 100644
index 0000000000000000000000000000000000000000..09b02cadc4008b1d80184a355a06201a85f97e8f
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py
@@ -0,0 +1,109 @@
+# Copyright 2024 Google Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch
+
+import math
+from typing import Union
+
+import torch
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils.torch_utils import randn_tensor
+from ..scheduling_utils import SchedulerMixin
+
+
+class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `ScoreSdeVpScheduler` is a variance preserving stochastic differential equation (SDE) scheduler.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 2000):
+            The number of diffusion steps to train the model.
+        beta_min (`int`, defaults to 0.1):
+        beta_max (`int`, defaults to 20):
+        sampling_eps (`int`, defaults to 1e-3):
+            The end value of sampling where timesteps decrease progressively from 1 to epsilon.
+    """
+
+    order = 1
+
+    @register_to_config
+    def __init__(self, num_train_timesteps=2000, beta_min=0.1, beta_max=20, sampling_eps=1e-3):
+        self.sigmas = None
+        self.discrete_sigmas = None
+        self.timesteps = None
+
+    def set_timesteps(self, num_inference_steps, device: Union[str, torch.device] = None):
+        """
+        Sets the continuous timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.timesteps = torch.linspace(1, self.config.sampling_eps, num_inference_steps, device=device)
+
+    def step_pred(self, score, x, t, generator=None):
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            score ():
+            x ():
+            t ():
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+        """
+        if self.timesteps is None:
+            raise ValueError(
+                "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # TODO(Patrick) better comments + non-PyTorch
+        # postprocess model score
+        log_mean_coeff = -0.25 * t**2 * (self.config.beta_max - self.config.beta_min) - 0.5 * t * self.config.beta_min
+        std = torch.sqrt(1.0 - torch.exp(2.0 * log_mean_coeff))
+        std = std.flatten()
+        while len(std.shape) < len(score.shape):
+            std = std.unsqueeze(-1)
+        score = -score / std
+
+        # compute
+        dt = -1.0 / len(self.timesteps)
+
+        beta_t = self.config.beta_min + t * (self.config.beta_max - self.config.beta_min)
+        beta_t = beta_t.flatten()
+        while len(beta_t.shape) < len(x.shape):
+            beta_t = beta_t.unsqueeze(-1)
+        drift = -0.5 * beta_t * x
+
+        diffusion = torch.sqrt(beta_t)
+        drift = drift - diffusion**2 * score
+        x_mean = x + drift * dt
+
+        # add noise
+        noise = randn_tensor(x.shape, layout=x.layout, generator=generator, device=x.device, dtype=x.dtype)
+        x = x_mean + diffusion * math.sqrt(-dt) * noise
+
+        return x, x_mean
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_amused.py b/diffusers/src/diffusers/schedulers/scheduling_amused.py
new file mode 100644
index 0000000000000000000000000000000000000000..238b8d8691715a83105df815c0ebcad748092ec1
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_amused.py
@@ -0,0 +1,162 @@
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import SchedulerMixin
+
+
+def gumbel_noise(t, generator=None):
+    device = generator.device if generator is not None else t.device
+    noise = torch.zeros_like(t, device=device).uniform_(0, 1, generator=generator).to(t.device)
+    return -torch.log((-torch.log(noise.clamp(1e-20))).clamp(1e-20))
+
+
+def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
+    confidence = torch.log(probs.clamp(1e-20)) + temperature * gumbel_noise(probs, generator=generator)
+    sorted_confidence = torch.sort(confidence, dim=-1).values
+    cut_off = torch.gather(sorted_confidence, 1, mask_len.long())
+    masking = confidence < cut_off
+    return masking
+
+
+@dataclass
+class AmusedSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: torch.Tensor = None
+
+
+class AmusedScheduler(SchedulerMixin, ConfigMixin):
+    order = 1
+
+    temperatures: torch.Tensor
+
+    @register_to_config
+    def __init__(
+        self,
+        mask_token_id: int,
+        masking_schedule: str = "cosine",
+    ):
+        self.temperatures = None
+        self.timesteps = None
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
+        device: Union[str, torch.device] = None,
+    ):
+        self.timesteps = torch.arange(num_inference_steps, device=device).flip(0)
+
+        if isinstance(temperature, (tuple, list)):
+            self.temperatures = torch.linspace(temperature[0], temperature[1], num_inference_steps, device=device)
+        else:
+            self.temperatures = torch.linspace(temperature, 0.01, num_inference_steps, device=device)
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: torch.long,
+        sample: torch.LongTensor,
+        starting_mask_ratio: int = 1,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[AmusedSchedulerOutput, Tuple]:
+        two_dim_input = sample.ndim == 3 and model_output.ndim == 4
+
+        if two_dim_input:
+            batch_size, codebook_size, height, width = model_output.shape
+            sample = sample.reshape(batch_size, height * width)
+            model_output = model_output.reshape(batch_size, codebook_size, height * width).permute(0, 2, 1)
+
+        unknown_map = sample == self.config.mask_token_id
+
+        probs = model_output.softmax(dim=-1)
+
+        device = probs.device
+        probs_ = probs.to(generator.device) if generator is not None else probs  # handles when generator is on CPU
+        if probs_.device.type == "cpu" and probs_.dtype != torch.float32:
+            probs_ = probs_.float()  # multinomial is not implemented for cpu half precision
+        probs_ = probs_.reshape(-1, probs.size(-1))
+        pred_original_sample = torch.multinomial(probs_, 1, generator=generator).to(device=device)
+        pred_original_sample = pred_original_sample[:, 0].view(*probs.shape[:-1])
+        pred_original_sample = torch.where(unknown_map, pred_original_sample, sample)
+
+        if timestep == 0:
+            prev_sample = pred_original_sample
+        else:
+            seq_len = sample.shape[1]
+            step_idx = (self.timesteps == timestep).nonzero()
+            ratio = (step_idx + 1) / len(self.timesteps)
+
+            if self.config.masking_schedule == "cosine":
+                mask_ratio = torch.cos(ratio * math.pi / 2)
+            elif self.config.masking_schedule == "linear":
+                mask_ratio = 1 - ratio
+            else:
+                raise ValueError(f"unknown masking schedule {self.config.masking_schedule}")
+
+            mask_ratio = starting_mask_ratio * mask_ratio
+
+            mask_len = (seq_len * mask_ratio).floor()
+            # do not mask more than amount previously masked
+            mask_len = torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len)
+            # mask at least one
+            mask_len = torch.max(torch.tensor([1], device=model_output.device), mask_len)
+
+            selected_probs = torch.gather(probs, -1, pred_original_sample[:, :, None])[:, :, 0]
+            # Ignores the tokens given in the input by overwriting their confidence.
+            selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max)
+
+            masking = mask_by_random_topk(mask_len, selected_probs, self.temperatures[step_idx], generator)
+
+            # Masks tokens with lower confidence.
+            prev_sample = torch.where(masking, self.config.mask_token_id, pred_original_sample)
+
+        if two_dim_input:
+            prev_sample = prev_sample.reshape(batch_size, height, width)
+            pred_original_sample = pred_original_sample.reshape(batch_size, height, width)
+
+        if not return_dict:
+            return (prev_sample, pred_original_sample)
+
+        return AmusedSchedulerOutput(prev_sample, pred_original_sample)
+
+    def add_noise(self, sample, timesteps, generator=None):
+        step_idx = (self.timesteps == timesteps).nonzero()
+        ratio = (step_idx + 1) / len(self.timesteps)
+
+        if self.config.masking_schedule == "cosine":
+            mask_ratio = torch.cos(ratio * math.pi / 2)
+        elif self.config.masking_schedule == "linear":
+            mask_ratio = 1 - ratio
+        else:
+            raise ValueError(f"unknown masking schedule {self.config.masking_schedule}")
+
+        mask_indices = (
+            torch.rand(
+                sample.shape, device=generator.device if generator is not None else sample.device, generator=generator
+            ).to(sample.device)
+            < mask_ratio
+        )
+
+        masked_sample = sample.clone()
+
+        masked_sample[mask_indices] = self.config.mask_token_id
+
+        return masked_sample
diff --git a/diffusers/src/diffusers/schedulers/scheduling_consistency_decoder.py b/diffusers/src/diffusers/schedulers/scheduling_consistency_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7af018b284a59878c3cae86cec57e02d344e227
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_consistency_decoder.py
@@ -0,0 +1,180 @@
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+@dataclass
+class ConsistencyDecoderSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: torch.Tensor
+
+
+class ConsistencyDecoderScheduler(SchedulerMixin, ConfigMixin):
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1024,
+        sigma_data: float = 0.5,
+    ):
+        betas = betas_for_alpha_bar(num_train_timesteps)
+
+        alphas = 1.0 - betas
+        alphas_cumprod = torch.cumprod(alphas, dim=0)
+
+        self.sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
+
+        sigmas = torch.sqrt(1.0 / alphas_cumprod - 1)
+
+        sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / alphas_cumprod)
+
+        self.c_skip = sqrt_recip_alphas_cumprod * sigma_data**2 / (sigmas**2 + sigma_data**2)
+        self.c_out = sigmas * sigma_data / (sigmas**2 + sigma_data**2) ** 0.5
+        self.c_in = sqrt_recip_alphas_cumprod / (sigmas**2 + sigma_data**2) ** 0.5
+
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Union[str, torch.device] = None,
+    ):
+        if num_inference_steps != 2:
+            raise ValueError("Currently more than 2 inference steps are not supported.")
+
+        self.timesteps = torch.tensor([1008, 512], dtype=torch.long, device=device)
+        self.sqrt_alphas_cumprod = self.sqrt_alphas_cumprod.to(device)
+        self.sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod.to(device)
+        self.c_skip = self.c_skip.to(device)
+        self.c_out = self.c_out.to(device)
+        self.c_in = self.c_in.to(device)
+
+    @property
+    def init_noise_sigma(self):
+        return self.sqrt_one_minus_alphas_cumprod[self.timesteps[0]]
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample * self.c_in[timestep]
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[float, torch.Tensor],
+        sample: torch.Tensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[ConsistencyDecoderSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            timestep (`float`):
+                The current timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a
+                [`~schedulers.scheduling_consistency_models.ConsistencyDecoderSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_consistency_models.ConsistencyDecoderSchedulerOutput`] or `tuple`:
+                If return_dict is `True`,
+                [`~schedulers.scheduling_consistency_models.ConsistencyDecoderSchedulerOutput`] is returned, otherwise
+                a tuple is returned where the first element is the sample tensor.
+        """
+        x_0 = self.c_out[timestep] * model_output + self.c_skip[timestep] * sample
+
+        timestep_idx = torch.where(self.timesteps == timestep)[0]
+
+        if timestep_idx == len(self.timesteps) - 1:
+            prev_sample = x_0
+        else:
+            noise = randn_tensor(x_0.shape, generator=generator, dtype=x_0.dtype, device=x_0.device)
+            prev_sample = (
+                self.sqrt_alphas_cumprod[self.timesteps[timestep_idx + 1]].to(x_0.dtype) * x_0
+                + self.sqrt_one_minus_alphas_cumprod[self.timesteps[timestep_idx + 1]].to(x_0.dtype) * noise
+            )
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return ConsistencyDecoderSchedulerOutput(prev_sample=prev_sample)
diff --git a/diffusers/src/diffusers/schedulers/scheduling_consistency_models.py b/diffusers/src/diffusers/schedulers/scheduling_consistency_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..653171638ccfe53ce55913d7c2eba454b305a63d
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_consistency_models.py
@@ -0,0 +1,446 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class CMStochasticIterativeSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: torch.Tensor
+
+
+class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Multistep and onestep sampling for consistency models.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 40):
+            The number of diffusion steps to train the model.
+        sigma_min (`float`, defaults to 0.002):
+            Minimum noise magnitude in the sigma schedule. Defaults to 0.002 from the original implementation.
+        sigma_max (`float`, defaults to 80.0):
+            Maximum noise magnitude in the sigma schedule. Defaults to 80.0 from the original implementation.
+        sigma_data (`float`, defaults to 0.5):
+            The standard deviation of the data distribution from the EDM
+            [paper](https://huggingface.co/papers/2206.00364). Defaults to 0.5 from the original implementation.
+        s_noise (`float`, defaults to 1.0):
+            The amount of additional noise to counteract loss of detail during sampling. A reasonable range is [1.000,
+            1.011]. Defaults to 1.0 from the original implementation.
+        rho (`float`, defaults to 7.0):
+            The parameter for calculating the Karras sigma schedule from the EDM
+            [paper](https://huggingface.co/papers/2206.00364). Defaults to 7.0 from the original implementation.
+        clip_denoised (`bool`, defaults to `True`):
+            Whether to clip the denoised outputs to `(-1, 1)`.
+        timesteps (`List` or `np.ndarray` or `torch.Tensor`, *optional*):
+            An explicit timestep schedule that can be optionally specified. The timesteps are expected to be in
+            increasing order.
+    """
+
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 40,
+        sigma_min: float = 0.002,
+        sigma_max: float = 80.0,
+        sigma_data: float = 0.5,
+        s_noise: float = 1.0,
+        rho: float = 7.0,
+        clip_denoised: bool = True,
+    ):
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = sigma_max
+
+        ramp = np.linspace(0, 1, num_train_timesteps)
+        sigmas = self._convert_to_karras(ramp)
+        timesteps = self.sigma_to_t(sigmas)
+
+        # setable values
+        self.num_inference_steps = None
+        self.sigmas = torch.from_numpy(sigmas)
+        self.timesteps = torch.from_numpy(timesteps)
+        self.custom_timesteps = False
+        self.is_scale_input_called = False
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor:
+        """
+        Scales the consistency model input by `(sigma**2 + sigma_data**2) ** 0.5`.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`float` or `torch.Tensor`):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        # Get sigma corresponding to timestep
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+
+        sample = sample / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
+
+        self.is_scale_input_called = True
+        return sample
+
+    def sigma_to_t(self, sigmas: Union[float, np.ndarray]):
+        """
+        Gets scaled timesteps from the Karras sigmas for input to the consistency model.
+
+        Args:
+            sigmas (`float` or `np.ndarray`):
+                A single Karras sigma or an array of Karras sigmas.
+
+        Returns:
+            `float` or `np.ndarray`:
+                A scaled input timestep or scaled input timestep array.
+        """
+        if not isinstance(sigmas, np.ndarray):
+            sigmas = np.array(sigmas, dtype=np.float64)
+
+        timesteps = 1000 * 0.25 * np.log(sigmas + 1e-44)
+
+        return timesteps
+
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Union[str, torch.device] = None,
+        timesteps: Optional[List[int]] = None,
+    ):
+        """
+        Sets the timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed,
+                `num_inference_steps` must be `None`.
+        """
+        if num_inference_steps is None and timesteps is None:
+            raise ValueError("Exactly one of `num_inference_steps` or `timesteps` must be supplied.")
+
+        if num_inference_steps is not None and timesteps is not None:
+            raise ValueError("Can only pass one of `num_inference_steps` or `timesteps`.")
+
+        # Follow DDPMScheduler custom timesteps logic
+        if timesteps is not None:
+            for i in range(1, len(timesteps)):
+                if timesteps[i] >= timesteps[i - 1]:
+                    raise ValueError("`timesteps` must be in descending order.")
+
+            if timesteps[0] >= self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`timesteps` must start before `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps}."
+                )
+
+            timesteps = np.array(timesteps, dtype=np.int64)
+            self.custom_timesteps = True
+        else:
+            if num_inference_steps > self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                    f" maximal {self.config.num_train_timesteps} timesteps."
+                )
+
+            self.num_inference_steps = num_inference_steps
+
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+            self.custom_timesteps = False
+
+        # Map timesteps to Karras sigmas directly for multistep sampling
+        # See https://github.com/openai/consistency_models/blob/main/cm/karras_diffusion.py#L675
+        num_train_timesteps = self.config.num_train_timesteps
+        ramp = timesteps[::-1].copy()
+        ramp = ramp / (num_train_timesteps - 1)
+        sigmas = self._convert_to_karras(ramp)
+        timesteps = self.sigma_to_t(sigmas)
+
+        sigmas = np.concatenate([sigmas, [self.config.sigma_min]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas).to(device=device)
+
+        if str(device).startswith("mps"):
+            # mps does not support float64
+            self.timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32)
+        else:
+            self.timesteps = torch.from_numpy(timesteps).to(device=device)
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Modified _convert_to_karras implementation that takes in ramp as argument
+    def _convert_to_karras(self, ramp):
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        sigma_min: float = self.config.sigma_min
+        sigma_max: float = self.config.sigma_max
+
+        rho = self.config.rho
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    def get_scalings(self, sigma):
+        sigma_data = self.config.sigma_data
+
+        c_skip = sigma_data**2 / (sigma**2 + sigma_data**2)
+        c_out = sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5
+        return c_skip, c_out
+
+    def get_scalings_for_boundary_condition(self, sigma):
+        """
+        Gets the scalings used in the consistency model parameterization (from Appendix C of the
+        [paper](https://huggingface.co/papers/2303.01469)) to enforce boundary condition.
+
+        <Tip>
+
+        `epsilon` in the equations for `c_skip` and `c_out` is set to `sigma_min`.
+
+        </Tip>
+
+        Args:
+            sigma (`torch.Tensor`):
+                The current sigma in the Karras sigma schedule.
+
+        Returns:
+            `tuple`:
+                A two-element tuple where `c_skip` (which weights the current sample) is the first element and `c_out`
+                (which weights the consistency model output) is the second element.
+        """
+        sigma_min = self.config.sigma_min
+        sigma_data = self.config.sigma_data
+
+        c_skip = sigma_data**2 / ((sigma - sigma_min) ** 2 + sigma_data**2)
+        c_out = (sigma - sigma_min) * sigma_data / (sigma**2 + sigma_data**2) ** 0.5
+        return c_skip, c_out
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[float, torch.Tensor],
+        sample: torch.Tensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[CMStochasticIterativeSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            timestep (`float`):
+                The current timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a
+                [`~schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput`] or `tuple`:
+                If return_dict is `True`,
+                [`~schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput`] is returned,
+                otherwise a tuple is returned where the first element is the sample tensor.
+        """
+
+        if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    f" `{self.__class__}.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+
+        if not self.is_scale_input_called:
+            logger.warning(
+                "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+                "See `StableDiffusionPipeline` for a usage example."
+            )
+
+        sigma_min = self.config.sigma_min
+        sigma_max = self.config.sigma_max
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # sigma_next corresponds to next_t in original implementation
+        sigma = self.sigmas[self.step_index]
+        if self.step_index + 1 < self.config.num_train_timesteps:
+            sigma_next = self.sigmas[self.step_index + 1]
+        else:
+            # Set sigma_next to sigma_min
+            sigma_next = self.sigmas[-1]
+
+        # Get scalings for boundary conditions
+        c_skip, c_out = self.get_scalings_for_boundary_condition(sigma)
+
+        # 1. Denoise model output using boundary conditions
+        denoised = c_out * model_output + c_skip * sample
+        if self.config.clip_denoised:
+            denoised = denoised.clamp(-1, 1)
+
+        # 2. Sample z ~ N(0, s_noise^2 * I)
+        # Noise is not used for onestep sampling.
+        if len(self.timesteps) > 1:
+            noise = randn_tensor(
+                model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator
+            )
+        else:
+            noise = torch.zeros_like(model_output)
+        z = noise * self.config.s_noise
+
+        sigma_hat = sigma_next.clamp(min=sigma_min, max=sigma_max)
+
+        # 3. Return noisy sample
+        # tau = sigma_hat, eps = sigma_min
+        prev_sample = denoised + z * (sigma_hat**2 - sigma_min**2) ** 0.5
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return CMStochasticIterativeSchedulerOutput(prev_sample=prev_sample)
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py b/diffusers/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab56650dbac526f72246d720fe015e62c327a558
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py
@@ -0,0 +1,572 @@
+# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver and https://github.com/NVlabs/edm
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_dpmsolver_sde import BrownianTreeNoiseSampler
+from .scheduling_utils import SchedulerMixin, SchedulerOutput
+
+
+class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Implements a variant of `DPMSolverMultistepScheduler` with cosine schedule, proposed by Nichol and Dhariwal (2021).
+    This scheduler was used in Stable Audio Open [1].
+
+    [1] Evans, Parker, et al. "Stable Audio Open" https://arxiv.org/abs/2407.14358
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        sigma_min (`float`, *optional*, defaults to 0.3):
+            Minimum noise magnitude in the sigma schedule. This was set to 0.3 in Stable Audio Open [1].
+        sigma_max (`float`, *optional*, defaults to 500):
+            Maximum noise magnitude in the sigma schedule. This was set to 500 in Stable Audio Open [1].
+        sigma_data (`float`, *optional*, defaults to 1.0):
+            The standard deviation of the data distribution. This is set to 1.0 in Stable Audio Open [1].
+        sigma_schedule (`str`, *optional*, defaults to `exponential`):
+            Sigma schedule to compute the `sigmas`. By default, we the schedule introduced in the EDM paper
+            (https://arxiv.org/abs/2206.00364). Other acceptable value is "exponential". The exponential schedule was
+            incorporated in this model: https://huggingface.co/stabilityai/cosxl.
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        solver_order (`int`, defaults to 2):
+            The DPMSolver order which can be `1` or `2`. It is recommended to use `solver_order=2`.
+        prediction_type (`str`, defaults to `v_prediction`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        solver_type (`str`, defaults to `midpoint`):
+            Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
+            sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
+        lower_order_final (`bool`, defaults to `True`):
+            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
+            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
+        euler_at_final (`bool`, defaults to `False`):
+            Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail
+            richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
+            steps, but sometimes may result in blurring.
+        final_sigmas_type (`str`, defaults to `"zero"`):
+            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+    """
+
+    _compatibles = []
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        sigma_min: float = 0.3,
+        sigma_max: float = 500,
+        sigma_data: float = 1.0,
+        sigma_schedule: str = "exponential",
+        num_train_timesteps: int = 1000,
+        solver_order: int = 2,
+        prediction_type: str = "v_prediction",
+        rho: float = 7.0,
+        solver_type: str = "midpoint",
+        lower_order_final: bool = True,
+        euler_at_final: bool = False,
+        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+    ):
+        if solver_type not in ["midpoint", "heun"]:
+            if solver_type in ["logrho", "bh1", "bh2"]:
+                self.register_to_config(solver_type="midpoint")
+            else:
+                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
+
+        ramp = torch.linspace(0, 1, num_train_timesteps)
+        if sigma_schedule == "karras":
+            sigmas = self._compute_karras_sigmas(ramp)
+        elif sigma_schedule == "exponential":
+            sigmas = self._compute_exponential_sigmas(ramp)
+
+        self.timesteps = self.precondition_noise(sigmas)
+
+        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+
+        # setable values
+        self.num_inference_steps = None
+        self.model_outputs = [None] * solver_order
+        self.lower_order_nums = 0
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def init_noise_sigma(self):
+        # standard deviation of the initial noise distribution
+        return (self.config.sigma_max**2 + 1) ** 0.5
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_inputs
+    def precondition_inputs(self, sample, sigma):
+        c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
+        scaled_sample = sample * c_in
+        return scaled_sample
+
+    def precondition_noise(self, sigma):
+        if not isinstance(sigma, torch.Tensor):
+            sigma = torch.tensor([sigma])
+
+        return sigma.atan() / math.pi * 2
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_outputs
+    def precondition_outputs(self, sample, model_output, sigma):
+        sigma_data = self.config.sigma_data
+        c_skip = sigma_data**2 / (sigma**2 + sigma_data**2)
+
+        if self.config.prediction_type == "epsilon":
+            c_out = sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5
+        elif self.config.prediction_type == "v_prediction":
+            c_out = -sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5
+        else:
+            raise ValueError(f"Prediction type {self.config.prediction_type} is not supported.")
+
+        denoised = c_skip * sample + c_out * model_output
+
+        return denoised
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.scale_model_input
+    def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sample = self.precondition_inputs(sample, sigma)
+
+        self.is_scale_input_called = True
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+
+        self.num_inference_steps = num_inference_steps
+
+        ramp = torch.linspace(0, 1, self.num_inference_steps)
+        if self.config.sigma_schedule == "karras":
+            sigmas = self._compute_karras_sigmas(ramp)
+        elif self.config.sigma_schedule == "exponential":
+            sigmas = self._compute_exponential_sigmas(ramp)
+
+        sigmas = sigmas.to(dtype=torch.float32, device=device)
+        self.timesteps = self.precondition_noise(sigmas)
+
+        if self.config.final_sigmas_type == "sigma_min":
+            sigma_last = self.config.sigma_min
+        elif self.config.final_sigmas_type == "zero":
+            sigma_last = 0
+        else:
+            raise ValueError(
+                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+            )
+
+        self.sigmas = torch.cat([sigmas, torch.tensor([sigma_last], dtype=torch.float32, device=device)])
+
+        self.model_outputs = [
+            None,
+        ] * self.config.solver_order
+        self.lower_order_nums = 0
+
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+        # if a noise sampler is used, reinitialise it
+        self.noise_sampler = None
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas
+    def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+        sigma_min = sigma_min or self.config.sigma_min
+        sigma_max = sigma_max or self.config.sigma_max
+
+        rho = self.config.rho
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas
+    def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor:
+        """Implementation closely follows k-diffusion.
+
+        https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26
+        """
+        sigma_min = sigma_min or self.config.sigma_min
+        sigma_max = sigma_max or self.config.sigma_max
+        sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0)
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        alpha_t = torch.tensor(1)  # Inputs are pre-scaled before going into unet, so alpha_t = 1
+        sigma_t = sigma
+
+        return alpha_t, sigma_t
+
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        sample: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
+        designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
+        integral of the data prediction model.
+
+        <Tip>
+
+        The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
+        prediction and data prediction models.
+
+        </Tip>
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The converted model output.
+        """
+        sigma = self.sigmas[self.step_index]
+        x0_pred = self.precondition_outputs(sample, model_output, sigma)
+
+        return x0_pred
+
+    def dpm_solver_first_order_update(
+        self,
+        model_output: torch.Tensor,
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        One step for the first-order DPMSolver (equivalent to DDIM).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
+
+        h = lambda_t - lambda_s
+        assert noise is not None
+        x_t = (
+            (sigma_t / sigma_s * torch.exp(-h)) * sample
+            + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
+            + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+        )
+
+        return x_t
+
+    def multistep_dpm_solver_second_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        One step for the second-order multistep DPMSolver.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        sigma_t, sigma_s0, sigma_s1 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+
+        # sde-dpmsolver++
+        assert noise is not None
+        if self.config.solver_type == "midpoint":
+            x_t = (
+                (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+                + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+                + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
+                + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+            )
+        elif self.config.solver_type == "heun":
+            x_t = (
+                (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+                + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+                + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
+                + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+            )
+
+        return x_t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        index_candidates = (schedule_timesteps == timestep).nonzero()
+
+        if len(index_candidates) == 0:
+            step_index = len(self.timesteps) - 1
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        elif len(index_candidates) > 1:
+            step_index = index_candidates[1].item()
+        else:
+            step_index = index_candidates[0].item()
+
+        return step_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        """
+        Initialize the step_index counter for the scheduler.
+        """
+
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        generator=None,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the multistep DPMSolver.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Improve numerical stability for small number of steps
+        lower_order_final = (self.step_index == len(self.timesteps) - 1) and (
+            self.config.euler_at_final
+            or (self.config.lower_order_final and len(self.timesteps) < 15)
+            or self.config.final_sigmas_type == "zero"
+        )
+        lower_order_second = (
+            (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
+        )
+
+        model_output = self.convert_model_output(model_output, sample=sample)
+        for i in range(self.config.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+        self.model_outputs[-1] = model_output
+
+        if self.noise_sampler is None:
+            seed = None
+            if generator is not None:
+                seed = (
+                    [g.initial_seed() for g in generator] if isinstance(generator, list) else generator.initial_seed()
+                )
+            self.noise_sampler = BrownianTreeNoiseSampler(
+                model_output, sigma_min=self.config.sigma_min, sigma_max=self.config.sigma_max, seed=seed
+            )
+        noise = self.noise_sampler(self.sigmas[self.step_index], self.sigmas[self.step_index + 1]).to(
+            model_output.device
+        )
+
+        if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+            prev_sample = self.dpm_solver_first_order_update(model_output, sample=sample, noise=noise)
+        elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+            prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise)
+
+        if self.lower_order_nums < self.config.solver_order:
+            self.lower_order_nums += 1
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_ddim.py b/diffusers/src/diffusers/schedulers/scheduling_ddim.py
new file mode 100644
index 0000000000000000000000000000000000000000..14356eafdaea8cc7f4e165bcb4e8840468fa932c
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_ddim.py
@@ -0,0 +1,518 @@
+# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
+class DDIMSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class DDIMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
+    non-Markovian guidance.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        clip_sample: bool = True,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def _get_variance(self, timestep, prev_timestep):
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+        """
+
+        if num_inference_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        self.num_inference_steps = num_inference_steps
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
+                .round()[::-1]
+                .copy()
+                .astype(np.int64)
+            )
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'."
+            )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[DDIMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                The weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`, defaults to `False`):
+                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
+                because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
+                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
+                `use_clipped_model_output` has no effect.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+        # Ideally, read DDIM paper in-detail understanding
+
+        # Notation (<variable name> -> <name in paper>
+        # - pred_noise_t -> e_theta(x_t, t)
+        # - pred_original_sample -> f_theta(x_t, t) or x_0
+        # - std_dev_t -> sigma_t
+        # - eta -> η
+        # - pred_sample_direction -> "direction pointing to x_t"
+        # - pred_prev_sample -> "x_t-1"
+
+        # 1. get previous step value (=t-1)
+        prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        # 3. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+            pred_epsilon = model_output
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction`"
+            )
+
+        # 4. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            pred_original_sample = self._threshold_sample(pred_original_sample)
+        elif self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 5. compute variance: "sigma_t(η)" -> see formula (16)
+        # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+        variance = self._get_variance(timestep, prev_timestep)
+        std_dev_t = eta * variance ** (0.5)
+
+        if use_clipped_model_output:
+            # the pred_epsilon is always re-derived from the clipped x_0 in Glide
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+
+        # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon
+
+        # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+
+        if eta > 0:
+            if variance_noise is not None and generator is not None:
+                raise ValueError(
+                    "Cannot pass both generator and variance_noise. Please make sure that either `generator` or"
+                    " `variance_noise` stays `None`."
+                )
+
+            if variance_noise is None:
+                variance_noise = randn_tensor(
+                    model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+                )
+            variance = std_dev_t * variance_noise
+
+            prev_sample = prev_sample + variance
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_ddim_cogvideox.py b/diffusers/src/diffusers/schedulers/scheduling_ddim_cogvideox.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec5c5f3e1c5dd620c2b51d222e681b93680afccf
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_ddim_cogvideox.py
@@ -0,0 +1,449 @@
+# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
+class DDIMSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+def rescale_zero_terminal_snr(alphas_cumprod):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+
+    return alphas_bar
+
+
+class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
+    non-Markovian guidance.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.00085,
+        beta_end: float = 0.0120,
+        beta_schedule: str = "scaled_linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        clip_sample: bool = True,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        rescale_betas_zero_snr: bool = False,
+        snr_shift_scale: float = 3.0,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float64) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # Modify: SNR shift following SD3
+        self.alphas_cumprod = self.alphas_cumprod / (snr_shift_scale + (1 - snr_shift_scale) * self.alphas_cumprod)
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.alphas_cumprod = rescale_zero_terminal_snr(self.alphas_cumprod)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+
+    def _get_variance(self, timestep, prev_timestep):
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+        """
+
+        if num_inference_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        self.num_inference_steps = num_inference_steps
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
+                .round()[::-1]
+                .copy()
+                .astype(np.int64)
+            )
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'."
+            )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[DDIMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                The weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`, defaults to `False`):
+                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
+                because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
+                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
+                `use_clipped_model_output` has no effect.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+        # Ideally, read DDIM paper in-detail understanding
+
+        # Notation (<variable name> -> <name in paper>
+        # - pred_noise_t -> e_theta(x_t, t)
+        # - pred_original_sample -> f_theta(x_t, t) or x_0
+        # - std_dev_t -> sigma_t
+        # - eta -> η
+        # - pred_sample_direction -> "direction pointing to x_t"
+        # - pred_prev_sample -> "x_t-1"
+
+        # 1. get previous step value (=t-1)
+        prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        # 3. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        # To make style tests pass, commented out `pred_epsilon` as it is an unused variable
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+            # pred_epsilon = model_output
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+            # pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            # pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction`"
+            )
+
+        a_t = ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** 0.5
+        b_t = alpha_prod_t_prev**0.5 - alpha_prod_t**0.5 * a_t
+
+        prev_sample = a_t * sample + b_t * pred_original_sample
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_ddim_flax.py b/diffusers/src/diffusers/schedulers/scheduling_ddim_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..23c71a61452a9bf1bb1e17e7c0d8eb82daee4616
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_ddim_flax.py
@@ -0,0 +1,314 @@
+# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax.numpy as jnp
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import (
+    CommonSchedulerState,
+    FlaxKarrasDiffusionSchedulers,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    add_noise_common,
+    get_velocity_common,
+)
+
+
+@flax.struct.dataclass
+class DDIMSchedulerState:
+    common: CommonSchedulerState
+    final_alpha_cumprod: jnp.ndarray
+
+    # setable values
+    init_noise_sigma: jnp.ndarray
+    timesteps: jnp.ndarray
+    num_inference_steps: Optional[int] = None
+
+    @classmethod
+    def create(
+        cls,
+        common: CommonSchedulerState,
+        final_alpha_cumprod: jnp.ndarray,
+        init_noise_sigma: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ):
+        return cls(
+            common=common,
+            final_alpha_cumprod=final_alpha_cumprod,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
+
+
+@dataclass
+class FlaxDDIMSchedulerOutput(FlaxSchedulerOutput):
+    state: DDIMSchedulerState
+
+
+class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
+    """
+    Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
+    diffusion probabilistic models (DDPMs) with non-Markovian guidance.
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    For more details, see the original paper: https://arxiv.org/abs/2010.02502
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`jnp.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        clip_sample (`bool`, default `True`):
+            option to clip predicted sample between for numerical stability. The clip range is determined by
+            `clip_sample_range`.
+        clip_sample_range (`float`, default `1.0`):
+            the maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, default `True`):
+            each diffusion step uses the value of alphas product at that step and at the previous one. For the final
+            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the value of alpha at step 0.
+        steps_offset (`int`, default `0`):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
+            `v-prediction` is not supported for this scheduler.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
+    """
+
+    _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers]
+
+    dtype: jnp.dtype
+
+    @property
+    def has_state(self):
+        return True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[jnp.ndarray] = None,
+        clip_sample: bool = True,
+        clip_sample_range: float = 1.0,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> DDIMSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        final_alpha_cumprod = (
+            jnp.array(1.0, dtype=self.dtype) if self.config.set_alpha_to_one else common.alphas_cumprod[0]
+        )
+
+        # standard deviation of the initial noise distribution
+        init_noise_sigma = jnp.array(1.0, dtype=self.dtype)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+
+        return DDIMSchedulerState.create(
+            common=common,
+            final_alpha_cumprod=final_alpha_cumprod,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
+
+    def scale_model_input(
+        self, state: DDIMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
+    ) -> jnp.ndarray:
+        """
+        Args:
+            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+            sample (`jnp.ndarray`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `jnp.ndarray`: scaled input sample
+        """
+        return sample
+
+    def set_timesteps(
+        self, state: DDIMSchedulerState, num_inference_steps: int, shape: Tuple = ()
+    ) -> DDIMSchedulerState:
+        """
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            state (`DDIMSchedulerState`):
+                the `FlaxDDIMScheduler` state data class instance.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+        """
+        step_ratio = self.config.num_train_timesteps // num_inference_steps
+        # creates integer timesteps by multiplying by ratio
+        # rounding to avoid issues when num_inference_step is power of 3
+        timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1] + self.config.steps_offset
+
+        return state.replace(
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+        )
+
+    def _get_variance(self, state: DDIMSchedulerState, timestep, prev_timestep):
+        alpha_prod_t = state.common.alphas_cumprod[timestep]
+        alpha_prod_t_prev = jnp.where(
+            prev_timestep >= 0, state.common.alphas_cumprod[prev_timestep], state.final_alpha_cumprod
+        )
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    def step(
+        self,
+        state: DDIMSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+        eta: float = 0.0,
+        return_dict: bool = True,
+    ) -> Union[FlaxDDIMSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            state (`DDIMSchedulerState`): the `FlaxDDIMScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            return_dict (`bool`): option for returning tuple rather than FlaxDDIMSchedulerOutput class
+
+        Returns:
+            [`FlaxDDIMSchedulerOutput`] or `tuple`: [`FlaxDDIMSchedulerOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+        if state.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+        # Ideally, read DDIM paper in-detail understanding
+
+        # Notation (<variable name> -> <name in paper>
+        # - pred_noise_t -> e_theta(x_t, t)
+        # - pred_original_sample -> f_theta(x_t, t) or x_0
+        # - std_dev_t -> sigma_t
+        # - eta -> η
+        # - pred_sample_direction -> "direction pointing to x_t"
+        # - pred_prev_sample -> "x_t-1"
+
+        # 1. get previous step value (=t-1)
+        prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps
+
+        alphas_cumprod = state.common.alphas_cumprod
+        final_alpha_cumprod = state.final_alpha_cumprod
+
+        # 2. compute alphas, betas
+        alpha_prod_t = alphas_cumprod[timestep]
+        alpha_prod_t_prev = jnp.where(prev_timestep >= 0, alphas_cumprod[prev_timestep], final_alpha_cumprod)
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        # 3. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+            pred_epsilon = model_output
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction`"
+            )
+
+        # 4. Clip or threshold "predicted x_0"
+        if self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clip(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 4. compute variance: "sigma_t(η)" -> see formula (16)
+        # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+        variance = self._get_variance(state, timestep, prev_timestep)
+        std_dev_t = eta * variance ** (0.5)
+
+        # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon
+
+        # 6. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+
+        if not return_dict:
+            return (prev_sample, state)
+
+        return FlaxDDIMSchedulerOutput(prev_sample=prev_sample, state=state)
+
+    def add_noise(
+        self,
+        state: DDIMSchedulerState,
+        original_samples: jnp.ndarray,
+        noise: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ) -> jnp.ndarray:
+        return add_noise_common(state.common, original_samples, noise, timesteps)
+
+    def get_velocity(
+        self,
+        state: DDIMSchedulerState,
+        sample: jnp.ndarray,
+        noise: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ) -> jnp.ndarray:
+        return get_velocity_common(state.common, sample, noise, timesteps)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_ddim_inverse.py b/diffusers/src/diffusers/schedulers/scheduling_ddim_inverse.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c2352f2c828e1fc92e8a9e0403d621fe841fe8d
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_ddim_inverse.py
@@ -0,0 +1,374 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+from diffusers.utils import BaseOutput, deprecate
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
+class DDIMSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `DDIMInverseScheduler` is the reverse scheduler of [`DDIMScheduler`].
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to 0, otherwise
+            it uses the alpha value at step `num_train_timesteps - 1`.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    order = 1
+    ignore_for_config = ["kwargs"]
+    _deprecated_kwargs = ["set_alpha_to_zero"]
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        clip_sample: bool = True,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        clip_sample_range: float = 1.0,
+        timestep_spacing: str = "leading",
+        rescale_betas_zero_snr: bool = False,
+        **kwargs,
+    ):
+        if kwargs.get("set_alpha_to_zero", None) is not None:
+            deprecation_message = (
+                "The `set_alpha_to_zero` argument is deprecated. Please use `set_alpha_to_one` instead."
+            )
+            deprecate("set_alpha_to_zero", "1.0.0", deprecation_message, standard_warn=False)
+            set_alpha_to_one = kwargs["set_alpha_to_zero"]
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # At every step in inverted ddim, we are looking into the next alphas_cumprod
+        # For the initial step, there is no current alphas_cumprod, and the index is out of bounds
+        # `set_alpha_to_one` decides whether we set this parameter simply to one
+        # in this case, self.step() just output the predicted noise
+        # or whether we use the initial alpha used in training the diffusion model.
+        self.initial_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps).copy().astype(np.int64))
+
+    # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+        """
+
+        if num_inference_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        self.num_inference_steps = num_inference_steps
+
+        # "leading" and "trailing" corresponds to annotation of Table 1. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round().copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)[::-1]).astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'."
+            )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[DDIMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                The weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`, defaults to `False`):
+                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
+                because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
+                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
+                `use_clipped_model_output` has no effect.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_ddim_inverse.DDIMInverseSchedulerOutput`] or
+                `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_ddim_inverse.DDIMInverseSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ddim_inverse.DDIMInverseSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+
+        """
+        # 1. get previous step value (=t+1)
+        prev_timestep = timestep
+        timestep = min(
+            timestep - self.config.num_train_timesteps // self.num_inference_steps, self.config.num_train_timesteps - 1
+        )
+
+        # 2. compute alphas, betas
+        # change original implementation to exactly match noise levels for analogous forward process
+        alpha_prod_t = self.alphas_cumprod[timestep] if timestep >= 0 else self.initial_alpha_cumprod
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep]
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        # 3. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+            pred_epsilon = model_output
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction`"
+            )
+
+        # 4. Clip or threshold "predicted x_0"
+        if self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon
+
+        # 6. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+
+        if not return_dict:
+            return (prev_sample, pred_original_sample)
+        return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_ddim_parallel.py b/diffusers/src/diffusers/schedulers/scheduling_ddim_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cf84b694db51c3021ea51e542fc3cb103d04882
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_ddim_parallel.py
@@ -0,0 +1,643 @@
+# Copyright 2024 ParaDiGMS authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput
+class DDIMParallelSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
+    diffusion probabilistic models (DDPMs) with non-Markovian guidance.
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    For more details, see the original paper: https://arxiv.org/abs/2010.02502
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        clip_sample (`bool`, default `True`):
+            option to clip predicted sample for numerical stability.
+        clip_sample_range (`float`, default `1.0`):
+            the maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, default `True`):
+            each diffusion step uses the value of alphas product at that step and at the previous one. For the final
+            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the value of alpha at step 0.
+        steps_offset (`int`, default `0`):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, default `epsilon`, optional):
+            prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+            process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
+            https://imagen.research.google/video/paper.pdf)
+        thresholding (`bool`, default `False`):
+            whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
+            Note that the thresholding method is unsuitable for latent-space diffusion models (such as
+            stable-diffusion).
+        dynamic_thresholding_ratio (`float`, default `0.995`):
+            the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen
+            (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`.
+        sample_max_value (`float`, default `1.0`):
+            the threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, default `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample
+            Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information.
+        rescale_betas_zero_snr (`bool`, default `False`):
+            whether to rescale the betas to have zero terminal SNR (proposed by https://arxiv.org/pdf/2305.08891.pdf).
+            This can enable the model to generate very bright and dark samples instead of limiting it to samples with
+            medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+    _is_ode_scheduler = True
+
+    @register_to_config
+    # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.__init__
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        clip_sample: bool = True,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+
+    # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def _get_variance(self, timestep, prev_timestep=None):
+        if prev_timestep is None:
+            prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    def _batch_get_variance(self, t, prev_t):
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[torch.clip(prev_t, min=0)]
+        alpha_prod_t_prev[prev_t < 0] = torch.tensor(1.0)
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.set_timesteps
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+        """
+
+        if num_inference_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        self.num_inference_steps = num_inference_steps
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
+                .round()[::-1]
+                .copy()
+                .astype(np.int64)
+            )
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'."
+            )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[DDIMParallelSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                current instance of sample being created by diffusion process.
+            eta (`float`): weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
+                predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
+                `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
+                coincide with the one provided as input and `use_clipped_model_output` will have not effect.
+            generator: random number generator.
+            variance_noise (`torch.Tensor`): instead of generating noise for the variance using `generator`, we
+                can directly provide the noise for the variance itself. This is useful for methods such as
+                CycleDiffusion. (https://arxiv.org/abs/2210.05559)
+            return_dict (`bool`): option for returning tuple rather than DDIMParallelSchedulerOutput class
+
+        Returns:
+            [`~schedulers.scheduling_utils.DDIMParallelSchedulerOutput`] or `tuple`:
+            [`~schedulers.scheduling_utils.DDIMParallelSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+        # Ideally, read DDIM paper in-detail understanding
+
+        # Notation (<variable name> -> <name in paper>
+        # - pred_noise_t -> e_theta(x_t, t)
+        # - pred_original_sample -> f_theta(x_t, t) or x_0
+        # - std_dev_t -> sigma_t
+        # - eta -> η
+        # - pred_sample_direction -> "direction pointing to x_t"
+        # - pred_prev_sample -> "x_t-1"
+
+        # 1. get previous step value (=t-1)
+        prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        # 3. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+            pred_epsilon = model_output
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction`"
+            )
+
+        # 4. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            pred_original_sample = self._threshold_sample(pred_original_sample)
+        elif self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 5. compute variance: "sigma_t(η)" -> see formula (16)
+        # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+        variance = self._get_variance(timestep, prev_timestep)
+        std_dev_t = eta * variance ** (0.5)
+
+        if use_clipped_model_output:
+            # the pred_epsilon is always re-derived from the clipped x_0 in Glide
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+
+        # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon
+
+        # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+
+        if eta > 0:
+            if variance_noise is not None and generator is not None:
+                raise ValueError(
+                    "Cannot pass both generator and variance_noise. Please make sure that either `generator` or"
+                    " `variance_noise` stays `None`."
+                )
+
+            if variance_noise is None:
+                variance_noise = randn_tensor(
+                    model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+                )
+            variance = std_dev_t * variance_noise
+
+            prev_sample = prev_sample + variance
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return DDIMParallelSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+    def batch_step_no_noise(
+        self,
+        model_output: torch.Tensor,
+        timesteps: List[int],
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+    ) -> torch.Tensor:
+        """
+        Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once.
+        Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise
+        is pre-sampled by the pipeline.
+
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`): direct output from learned diffusion model.
+            timesteps (`List[int]`):
+                current discrete timesteps in the diffusion chain. This is now a list of integers.
+            sample (`torch.Tensor`):
+                current instance of sample being created by diffusion process.
+            eta (`float`): weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
+                predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
+                `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
+                coincide with the one provided as input and `use_clipped_model_output` will have not effect.
+
+        Returns:
+            `torch.Tensor`: sample tensor at previous timestep.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        assert eta == 0.0
+
+        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+        # Ideally, read DDIM paper in-detail understanding
+
+        # Notation (<variable name> -> <name in paper>
+        # - pred_noise_t -> e_theta(x_t, t)
+        # - pred_original_sample -> f_theta(x_t, t) or x_0
+        # - std_dev_t -> sigma_t
+        # - eta -> η
+        # - pred_sample_direction -> "direction pointing to x_t"
+        # - pred_prev_sample -> "x_t-1"
+
+        # 1. get previous step value (=t-1)
+        t = timesteps
+        prev_t = t - self.config.num_train_timesteps // self.num_inference_steps
+
+        t = t.view(-1, *([1] * (model_output.ndim - 1)))
+        prev_t = prev_t.view(-1, *([1] * (model_output.ndim - 1)))
+
+        # 1. compute alphas, betas
+        self.alphas_cumprod = self.alphas_cumprod.to(model_output.device)
+        self.final_alpha_cumprod = self.final_alpha_cumprod.to(model_output.device)
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[torch.clip(prev_t, min=0)]
+        alpha_prod_t_prev[prev_t < 0] = torch.tensor(1.0)
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        # 3. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+            pred_epsilon = model_output
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction`"
+            )
+
+        # 4. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            pred_original_sample = self._threshold_sample(pred_original_sample)
+        elif self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 5. compute variance: "sigma_t(η)" -> see formula (16)
+        # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+        variance = self._batch_get_variance(t, prev_t).to(model_output.device).view(*alpha_prod_t_prev.shape)
+        std_dev_t = eta * variance ** (0.5)
+
+        if use_clipped_model_output:
+            # the pred_epsilon is always re-derived from the clipped x_0 in Glide
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+
+        # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon
+
+        # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+
+        return prev_sample
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_ddpm.py b/diffusers/src/diffusers/schedulers/scheduling_ddpm.py
new file mode 100644
index 0000000000000000000000000000000000000000..81a770edf635aa41e6a7bd8b6532f378845eee1e
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_ddpm.py
@@ -0,0 +1,560 @@
+# Copyright 2024 UC Berkeley Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+
+@dataclass
+class DDPMSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class DDPMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `DDPMScheduler` explores the connections between denoising score matching and Langevin dynamics sampling.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            An array of betas to pass directly to the constructor without using `beta_start` and `beta_end`.
+        variance_type (`str`, defaults to `"fixed_small"`):
+            Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`,
+            `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        variance_type: str = "fixed_small",
+        clip_sample: bool = True,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        steps_offset: int = 0,
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        elif beta_schedule == "sigmoid":
+            # GeoDiff sigmoid schedule
+            betas = torch.linspace(-6, 6, num_train_timesteps)
+            self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        self.one = torch.tensor(1.0)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.custom_timesteps = False
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+        self.variance_type = variance_type
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Union[str, torch.device] = None,
+        timesteps: Optional[List[int]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model. If used,
+                `timesteps` must be `None`.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed,
+                `num_inference_steps` must be `None`.
+
+        """
+        if num_inference_steps is not None and timesteps is not None:
+            raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
+
+        if timesteps is not None:
+            for i in range(1, len(timesteps)):
+                if timesteps[i] >= timesteps[i - 1]:
+                    raise ValueError("`custom_timesteps` must be in descending order.")
+
+            if timesteps[0] >= self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`timesteps` must start before `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps}."
+                )
+
+            timesteps = np.array(timesteps, dtype=np.int64)
+            self.custom_timesteps = True
+        else:
+            if num_inference_steps > self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                    f" maximal {self.config.num_train_timesteps} timesteps."
+                )
+
+            self.num_inference_steps = num_inference_steps
+            self.custom_timesteps = False
+
+            # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+            if self.config.timestep_spacing == "linspace":
+                timesteps = (
+                    np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
+                    .round()[::-1]
+                    .copy()
+                    .astype(np.int64)
+                )
+            elif self.config.timestep_spacing == "leading":
+                step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+                # creates integer timesteps by multiplying by ratio
+                # casting to int to avoid issues when num_inference_step is power of 3
+                timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+                timesteps += self.config.steps_offset
+            elif self.config.timestep_spacing == "trailing":
+                step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+                # creates integer timesteps by multiplying by ratio
+                # casting to int to avoid issues when num_inference_step is power of 3
+                timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
+                timesteps -= 1
+            else:
+                raise ValueError(
+                    f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+                )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def _get_variance(self, t, predicted_variance=None, variance_type=None):
+        prev_t = self.previous_timestep(t)
+
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
+        current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev
+
+        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # and sample from it to get previous sample
+        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t
+
+        # we always take the log of variance, so clamp it to ensure it's not 0
+        variance = torch.clamp(variance, min=1e-20)
+
+        if variance_type is None:
+            variance_type = self.config.variance_type
+
+        # hacks - were probably added for training stability
+        if variance_type == "fixed_small":
+            variance = variance
+        # for rl-diffuser https://arxiv.org/abs/2205.09991
+        elif variance_type == "fixed_small_log":
+            variance = torch.log(variance)
+            variance = torch.exp(0.5 * variance)
+        elif variance_type == "fixed_large":
+            variance = current_beta_t
+        elif variance_type == "fixed_large_log":
+            # Glide max_log
+            variance = torch.log(current_beta_t)
+        elif variance_type == "learned":
+            return predicted_variance
+        elif variance_type == "learned_range":
+            min_log = torch.log(variance)
+            max_log = torch.log(current_beta_t)
+            frac = (predicted_variance + 1) / 2
+            variance = frac * max_log + (1 - frac) * min_log
+
+        return variance
+
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        generator=None,
+        return_dict: bool = True,
+    ) -> Union[DDPMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        t = timestep
+
+        prev_t = self.previous_timestep(t)
+
+        if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
+            model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
+        else:
+            predicted_variance = None
+
+        # 1. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+        current_alpha_t = alpha_prod_t / alpha_prod_t_prev
+        current_beta_t = 1 - current_alpha_t
+
+        # 2. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
+                " `v_prediction`  for the DDPMScheduler."
+            )
+
+        # 3. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            pred_original_sample = self._threshold_sample(pred_original_sample)
+        elif self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
+        current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t
+
+        # 5. Compute predicted previous sample µ_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
+
+        # 6. Add noise
+        variance = 0
+        if t > 0:
+            device = model_output.device
+            variance_noise = randn_tensor(
+                model_output.shape, generator=generator, device=device, dtype=model_output.dtype
+            )
+            if self.variance_type == "fixed_small_log":
+                variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise
+            elif self.variance_type == "learned_range":
+                variance = self._get_variance(t, predicted_variance=predicted_variance)
+                variance = torch.exp(0.5 * variance) * variance_noise
+            else:
+                variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise
+
+        pred_prev_sample = pred_prev_sample + variance
+
+        if not return_dict:
+            return (pred_prev_sample,)
+
+        return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
+
+    def previous_timestep(self, timestep):
+        if self.custom_timesteps:
+            index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
+            if index == self.timesteps.shape[0] - 1:
+                prev_t = torch.tensor(-1)
+            else:
+                prev_t = self.timesteps[index + 1]
+        else:
+            num_inference_steps = (
+                self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps
+            )
+            prev_t = timestep - self.config.num_train_timesteps // num_inference_steps
+
+        return prev_t
diff --git a/diffusers/src/diffusers/schedulers/scheduling_ddpm_flax.py b/diffusers/src/diffusers/schedulers/scheduling_ddpm_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..d06a171159ee765ff23e50ea2baa94ae62e24dd4
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_ddpm_flax.py
@@ -0,0 +1,303 @@
+# Copyright 2024 UC Berkeley Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax
+import jax.numpy as jnp
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import (
+    CommonSchedulerState,
+    FlaxKarrasDiffusionSchedulers,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    add_noise_common,
+    get_velocity_common,
+)
+
+
+@flax.struct.dataclass
+class DDPMSchedulerState:
+    common: CommonSchedulerState
+
+    # setable values
+    init_noise_sigma: jnp.ndarray
+    timesteps: jnp.ndarray
+    num_inference_steps: Optional[int] = None
+
+    @classmethod
+    def create(cls, common: CommonSchedulerState, init_noise_sigma: jnp.ndarray, timesteps: jnp.ndarray):
+        return cls(common=common, init_noise_sigma=init_noise_sigma, timesteps=timesteps)
+
+
+@dataclass
+class FlaxDDPMSchedulerOutput(FlaxSchedulerOutput):
+    state: DDPMSchedulerState
+
+
+class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
+    """
+    Denoising diffusion probabilistic models (DDPMs) explores the connections between denoising score matching and
+    Langevin dynamics sampling.
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    For more details, see the original paper: https://arxiv.org/abs/2006.11239
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        variance_type (`str`):
+            options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`,
+            `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
+        clip_sample (`bool`, default `True`):
+            option to clip predicted sample between -1 and 1 for numerical stability.
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
+            `v-prediction` is not supported for this scheduler.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
+    """
+
+    _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers]
+
+    dtype: jnp.dtype
+
+    @property
+    def has_state(self):
+        return True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[jnp.ndarray] = None,
+        variance_type: str = "fixed_small",
+        clip_sample: bool = True,
+        prediction_type: str = "epsilon",
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> DDPMSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)
+
+        # standard deviation of the initial noise distribution
+        init_noise_sigma = jnp.array(1.0, dtype=self.dtype)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+
+        return DDPMSchedulerState.create(
+            common=common,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
+
+    def scale_model_input(
+        self, state: DDPMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
+    ) -> jnp.ndarray:
+        """
+        Args:
+            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+            sample (`jnp.ndarray`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `jnp.ndarray`: scaled input sample
+        """
+        return sample
+
+    def set_timesteps(
+        self, state: DDPMSchedulerState, num_inference_steps: int, shape: Tuple = ()
+    ) -> DDPMSchedulerState:
+        """
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            state (`DDIMSchedulerState`):
+                the `FlaxDDPMScheduler` state data class instance.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+        """
+
+        step_ratio = self.config.num_train_timesteps // num_inference_steps
+        # creates integer timesteps by multiplying by ratio
+        # rounding to avoid issues when num_inference_step is power of 3
+        timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1]
+
+        return state.replace(
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+        )
+
+    def _get_variance(self, state: DDPMSchedulerState, t, predicted_variance=None, variance_type=None):
+        alpha_prod_t = state.common.alphas_cumprod[t]
+        alpha_prod_t_prev = jnp.where(t > 0, state.common.alphas_cumprod[t - 1], jnp.array(1.0, dtype=self.dtype))
+
+        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # and sample from it to get previous sample
+        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * state.common.betas[t]
+
+        if variance_type is None:
+            variance_type = self.config.variance_type
+
+        # hacks - were probably added for training stability
+        if variance_type == "fixed_small":
+            variance = jnp.clip(variance, a_min=1e-20)
+        # for rl-diffuser https://arxiv.org/abs/2205.09991
+        elif variance_type == "fixed_small_log":
+            variance = jnp.log(jnp.clip(variance, a_min=1e-20))
+        elif variance_type == "fixed_large":
+            variance = state.common.betas[t]
+        elif variance_type == "fixed_large_log":
+            # Glide max_log
+            variance = jnp.log(state.common.betas[t])
+        elif variance_type == "learned":
+            return predicted_variance
+        elif variance_type == "learned_range":
+            min_log = variance
+            max_log = state.common.betas[t]
+            frac = (predicted_variance + 1) / 2
+            variance = frac * max_log + (1 - frac) * min_log
+
+        return variance
+
+    def step(
+        self,
+        state: DDPMSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+        key: Optional[jax.Array] = None,
+        return_dict: bool = True,
+    ) -> Union[FlaxDDPMSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            state (`DDPMSchedulerState`): the `FlaxDDPMScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            key (`jax.Array`): a PRNG key.
+            return_dict (`bool`): option for returning tuple rather than FlaxDDPMSchedulerOutput class
+
+        Returns:
+            [`FlaxDDPMSchedulerOutput`] or `tuple`: [`FlaxDDPMSchedulerOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+        t = timestep
+
+        if key is None:
+            key = jax.random.key(0)
+
+        if (
+            len(model_output.shape) > 1
+            and model_output.shape[1] == sample.shape[1] * 2
+            and self.config.variance_type in ["learned", "learned_range"]
+        ):
+            model_output, predicted_variance = jnp.split(model_output, sample.shape[1], axis=1)
+        else:
+            predicted_variance = None
+
+        # 1. compute alphas, betas
+        alpha_prod_t = state.common.alphas_cumprod[t]
+        alpha_prod_t_prev = jnp.where(t > 0, state.common.alphas_cumprod[t - 1], jnp.array(1.0, dtype=self.dtype))
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        # 2. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
+                " for the FlaxDDPMScheduler."
+            )
+
+        # 3. Clip "predicted x_0"
+        if self.config.clip_sample:
+            pred_original_sample = jnp.clip(pred_original_sample, -1, 1)
+
+        # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * state.common.betas[t]) / beta_prod_t
+        current_sample_coeff = state.common.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t
+
+        # 5. Compute predicted previous sample µ_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
+
+        # 6. Add noise
+        def random_variance():
+            split_key = jax.random.split(key, num=1)[0]
+            noise = jax.random.normal(split_key, shape=model_output.shape, dtype=self.dtype)
+            return (self._get_variance(state, t, predicted_variance=predicted_variance) ** 0.5) * noise
+
+        variance = jnp.where(t > 0, random_variance(), jnp.zeros(model_output.shape, dtype=self.dtype))
+
+        pred_prev_sample = pred_prev_sample + variance
+
+        if not return_dict:
+            return (pred_prev_sample, state)
+
+        return FlaxDDPMSchedulerOutput(prev_sample=pred_prev_sample, state=state)
+
+    def add_noise(
+        self,
+        state: DDPMSchedulerState,
+        original_samples: jnp.ndarray,
+        noise: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ) -> jnp.ndarray:
+        return add_noise_common(state.common, original_samples, noise, timesteps)
+
+    def get_velocity(
+        self,
+        state: DDPMSchedulerState,
+        sample: jnp.ndarray,
+        noise: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ) -> jnp.ndarray:
+        return get_velocity_common(state.common, sample, noise, timesteps)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/diffusers/src/diffusers/schedulers/scheduling_ddpm_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dfcf3c17a2fcda2abd9916febe1aad65fed36f7
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_ddpm_parallel.py
@@ -0,0 +1,651 @@
+# Copyright 2024 ParaDiGMS authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput
+class DDPMParallelSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Denoising diffusion probabilistic models (DDPMs) explores the connections between denoising score matching and
+    Langevin dynamics sampling.
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    For more details, see the original paper: https://arxiv.org/abs/2006.11239
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, `squaredcos_cap_v2` or `sigmoid`.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        variance_type (`str`):
+            options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`,
+            `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
+        clip_sample (`bool`, default `True`):
+            option to clip predicted sample for numerical stability.
+        clip_sample_range (`float`, default `1.0`):
+            the maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        prediction_type (`str`, default `epsilon`, optional):
+            prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+            process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
+            https://imagen.research.google/video/paper.pdf)
+        thresholding (`bool`, default `False`):
+            whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
+            Note that the thresholding method is unsuitable for latent-space diffusion models (such as
+            stable-diffusion).
+        dynamic_thresholding_ratio (`float`, default `0.995`):
+            the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen
+            (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`.
+        sample_max_value (`float`, default `1.0`):
+            the threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, default `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample
+            Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information.
+        steps_offset (`int`, default `0`):
+            An offset added to the inference steps, as required by some model families.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+    _is_ode_scheduler = False
+
+    @register_to_config
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.__init__
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        variance_type: str = "fixed_small",
+        clip_sample: bool = True,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        steps_offset: int = 0,
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        elif beta_schedule == "sigmoid":
+            # GeoDiff sigmoid schedule
+            betas = torch.linspace(-6, 6, num_train_timesteps)
+            self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        self.one = torch.tensor(1.0)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.custom_timesteps = False
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+        self.variance_type = variance_type
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.scale_model_input
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.set_timesteps
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Union[str, torch.device] = None,
+        timesteps: Optional[List[int]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model. If used,
+                `timesteps` must be `None`.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed,
+                `num_inference_steps` must be `None`.
+
+        """
+        if num_inference_steps is not None and timesteps is not None:
+            raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
+
+        if timesteps is not None:
+            for i in range(1, len(timesteps)):
+                if timesteps[i] >= timesteps[i - 1]:
+                    raise ValueError("`custom_timesteps` must be in descending order.")
+
+            if timesteps[0] >= self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`timesteps` must start before `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps}."
+                )
+
+            timesteps = np.array(timesteps, dtype=np.int64)
+            self.custom_timesteps = True
+        else:
+            if num_inference_steps > self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                    f" maximal {self.config.num_train_timesteps} timesteps."
+                )
+
+            self.num_inference_steps = num_inference_steps
+            self.custom_timesteps = False
+
+            # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+            if self.config.timestep_spacing == "linspace":
+                timesteps = (
+                    np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
+                    .round()[::-1]
+                    .copy()
+                    .astype(np.int64)
+                )
+            elif self.config.timestep_spacing == "leading":
+                step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+                # creates integer timesteps by multiplying by ratio
+                # casting to int to avoid issues when num_inference_step is power of 3
+                timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+                timesteps += self.config.steps_offset
+            elif self.config.timestep_spacing == "trailing":
+                step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+                # creates integer timesteps by multiplying by ratio
+                # casting to int to avoid issues when num_inference_step is power of 3
+                timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
+                timesteps -= 1
+            else:
+                raise ValueError(
+                    f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+                )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance
+    def _get_variance(self, t, predicted_variance=None, variance_type=None):
+        prev_t = self.previous_timestep(t)
+
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
+        current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev
+
+        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # and sample from it to get previous sample
+        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t
+
+        # we always take the log of variance, so clamp it to ensure it's not 0
+        variance = torch.clamp(variance, min=1e-20)
+
+        if variance_type is None:
+            variance_type = self.config.variance_type
+
+        # hacks - were probably added for training stability
+        if variance_type == "fixed_small":
+            variance = variance
+        # for rl-diffuser https://arxiv.org/abs/2205.09991
+        elif variance_type == "fixed_small_log":
+            variance = torch.log(variance)
+            variance = torch.exp(0.5 * variance)
+        elif variance_type == "fixed_large":
+            variance = current_beta_t
+        elif variance_type == "fixed_large_log":
+            # Glide max_log
+            variance = torch.log(current_beta_t)
+        elif variance_type == "learned":
+            return predicted_variance
+        elif variance_type == "learned_range":
+            min_log = torch.log(variance)
+            max_log = torch.log(current_beta_t)
+            frac = (predicted_variance + 1) / 2
+            variance = frac * max_log + (1 - frac) * min_log
+
+        return variance
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        generator=None,
+        return_dict: bool = True,
+    ) -> Union[DDPMParallelSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                current instance of sample being created by diffusion process.
+            generator: random number generator.
+            return_dict (`bool`): option for returning tuple rather than DDPMParallelSchedulerOutput class
+
+        Returns:
+            [`~schedulers.scheduling_utils.DDPMParallelSchedulerOutput`] or `tuple`:
+            [`~schedulers.scheduling_utils.DDPMParallelSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is the sample tensor.
+
+        """
+        t = timestep
+
+        prev_t = self.previous_timestep(t)
+
+        if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
+            model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
+        else:
+            predicted_variance = None
+
+        # 1. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+        current_alpha_t = alpha_prod_t / alpha_prod_t_prev
+        current_beta_t = 1 - current_alpha_t
+
+        # 2. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
+                " `v_prediction`  for the DDPMScheduler."
+            )
+
+        # 3. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            pred_original_sample = self._threshold_sample(pred_original_sample)
+        elif self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
+        current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t
+
+        # 5. Compute predicted previous sample µ_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
+
+        # 6. Add noise
+        variance = 0
+        if t > 0:
+            device = model_output.device
+            variance_noise = randn_tensor(
+                model_output.shape, generator=generator, device=device, dtype=model_output.dtype
+            )
+            if self.variance_type == "fixed_small_log":
+                variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise
+            elif self.variance_type == "learned_range":
+                variance = self._get_variance(t, predicted_variance=predicted_variance)
+                variance = torch.exp(0.5 * variance) * variance_noise
+            else:
+                variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise
+
+        pred_prev_sample = pred_prev_sample + variance
+
+        if not return_dict:
+            return (pred_prev_sample,)
+
+        return DDPMParallelSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+
+    def batch_step_no_noise(
+        self,
+        model_output: torch.Tensor,
+        timesteps: List[int],
+        sample: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once.
+        Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise
+        is pre-sampled by the pipeline.
+
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`): direct output from learned diffusion model.
+            timesteps (`List[int]`):
+                current discrete timesteps in the diffusion chain. This is now a list of integers.
+            sample (`torch.Tensor`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `torch.Tensor`: sample tensor at previous timestep.
+        """
+        t = timesteps
+        num_inference_steps = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps
+        prev_t = t - self.config.num_train_timesteps // num_inference_steps
+
+        t = t.view(-1, *([1] * (model_output.ndim - 1)))
+        prev_t = prev_t.view(-1, *([1] * (model_output.ndim - 1)))
+
+        if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
+            model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
+        else:
+            pass
+
+        # 1. compute alphas, betas
+        self.alphas_cumprod = self.alphas_cumprod.to(model_output.device)
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[torch.clip(prev_t, min=0)]
+        alpha_prod_t_prev[prev_t < 0] = torch.tensor(1.0)
+
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+        current_alpha_t = alpha_prod_t / alpha_prod_t_prev
+        current_beta_t = 1 - current_alpha_t
+
+        # 2. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
+                " `v_prediction`  for the DDPMParallelScheduler."
+            )
+
+        # 3. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            pred_original_sample = self._threshold_sample(pred_original_sample)
+        elif self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
+        current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t
+
+        # 5. Compute predicted previous sample µ_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
+
+        return pred_prev_sample
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
+    def previous_timestep(self, timestep):
+        if self.custom_timesteps:
+            index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
+            if index == self.timesteps.shape[0] - 1:
+                prev_t = torch.tensor(-1)
+            else:
+                prev_t = self.timesteps[index + 1]
+        else:
+            num_inference_steps = (
+                self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps
+            )
+            prev_t = timestep - self.config.num_train_timesteps // num_inference_steps
+
+        return prev_t
diff --git a/diffusers/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/diffusers/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..71b5669b052864647998959d2dc48453dfdcbb73
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py
@@ -0,0 +1,230 @@
+# Copyright (c) 2022 Pablo Pernías MIT License
+# Copyright 2024 UC Berkeley Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class DDPMWuerstchenSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: torch.Tensor
+
+
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class DDPMWuerstchenScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Denoising diffusion probabilistic models (DDPMs) explores the connections between denoising score matching and
+    Langevin dynamics sampling.
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    For more details, see the original paper: https://arxiv.org/abs/2006.11239
+
+    Args:
+        scaler (`float`): ....
+        s (`float`): ....
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        scaler: float = 1.0,
+        s: float = 0.008,
+    ):
+        self.scaler = scaler
+        self.s = torch.tensor([s])
+        self._init_alpha_cumprod = torch.cos(self.s / (1 + self.s) * torch.pi * 0.5) ** 2
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+    def _alpha_cumprod(self, t, device):
+        if self.scaler > 1:
+            t = 1 - (1 - t) ** self.scaler
+        elif self.scaler < 1:
+            t = t**self.scaler
+        alpha_cumprod = torch.cos(
+            (t + self.s.to(device)) / (1 + self.s.to(device)) * torch.pi * 0.5
+        ) ** 2 / self._init_alpha_cumprod.to(device)
+        return alpha_cumprod.clamp(0.0001, 0.9999)
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `torch.Tensor`: scaled input sample
+        """
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int = None,
+        timesteps: Optional[List[int]] = None,
+        device: Union[str, torch.device] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            num_inference_steps (`Dict[float, int]`):
+                the number of diffusion steps used when generating samples with a pre-trained model. If passed, then
+                `timesteps` must be `None`.
+            device (`str` or `torch.device`, optional):
+                the device to which the timesteps are moved to. {2 / 3: 20, 0.0: 10}
+        """
+        if timesteps is None:
+            timesteps = torch.linspace(1.0, 0.0, num_inference_steps + 1, device=device)
+        if not isinstance(timesteps, torch.Tensor):
+            timesteps = torch.Tensor(timesteps).to(device)
+        self.timesteps = timesteps
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        generator=None,
+        return_dict: bool = True,
+    ) -> Union[DDPMWuerstchenSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                current instance of sample being created by diffusion process.
+            generator: random number generator.
+            return_dict (`bool`): option for returning tuple rather than DDPMWuerstchenSchedulerOutput class
+
+        Returns:
+            [`DDPMWuerstchenSchedulerOutput`] or `tuple`: [`DDPMWuerstchenSchedulerOutput`] if `return_dict` is True,
+            otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+        dtype = model_output.dtype
+        device = model_output.device
+        t = timestep
+
+        prev_t = self.previous_timestep(t)
+
+        alpha_cumprod = self._alpha_cumprod(t, device).view(t.size(0), *[1 for _ in sample.shape[1:]])
+        alpha_cumprod_prev = self._alpha_cumprod(prev_t, device).view(prev_t.size(0), *[1 for _ in sample.shape[1:]])
+        alpha = alpha_cumprod / alpha_cumprod_prev
+
+        mu = (1.0 / alpha).sqrt() * (sample - (1 - alpha) * model_output / (1 - alpha_cumprod).sqrt())
+
+        std_noise = randn_tensor(mu.shape, generator=generator, device=model_output.device, dtype=model_output.dtype)
+        std = ((1 - alpha) * (1.0 - alpha_cumprod_prev) / (1.0 - alpha_cumprod)).sqrt() * std_noise
+        pred = mu + std * (prev_t != 0).float().view(prev_t.size(0), *[1 for _ in sample.shape[1:]])
+
+        if not return_dict:
+            return (pred.to(dtype),)
+
+        return DDPMWuerstchenSchedulerOutput(prev_sample=pred.to(dtype))
+
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        device = original_samples.device
+        dtype = original_samples.dtype
+        alpha_cumprod = self._alpha_cumprod(timesteps, device=device).view(
+            timesteps.size(0), *[1 for _ in original_samples.shape[1:]]
+        )
+        noisy_samples = alpha_cumprod.sqrt() * original_samples + (1 - alpha_cumprod).sqrt() * noise
+        return noisy_samples.to(dtype=dtype)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
+
+    def previous_timestep(self, timestep):
+        index = (self.timesteps - timestep[0]).abs().argmin().item()
+        prev_t = self.timesteps[index + 1][None].expand(timestep.shape[0])
+        return prev_t
diff --git a/diffusers/src/diffusers/schedulers/scheduling_deis_multistep.py b/diffusers/src/diffusers/schedulers/scheduling_deis_multistep.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b26befac64ea8944bd46f0ea6de3c17a367c619
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_deis_multistep.py
@@ -0,0 +1,820 @@
+# Copyright 2024 FLAIR Lab and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: check https://arxiv.org/abs/2204.13902 and https://github.com/qsh-zh/deis for more info
+# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import deprecate
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `DEISMultistepScheduler` is a fast high order solver for diffusion ordinary differential equations (ODEs).
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        solver_order (`int`, defaults to 2):
+            The DEIS order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided
+            sampling, and `solver_order=3` for unconditional sampling.
+        prediction_type (`str`, defaults to `epsilon`):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        algorithm_type (`str`, defaults to `deis`):
+            The algorithm type for the solver.
+        lower_order_final (`bool`, defaults to `True`):
+            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+             Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+             the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[np.ndarray] = None,
+        solver_order: int = 2,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "deis",
+        solver_type: str = "logrho",
+        lower_order_final: bool = True,
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+        self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # settings for DEIS
+        if algorithm_type not in ["deis"]:
+            if algorithm_type in ["dpmsolver", "dpmsolver++"]:
+                self.register_to_config(algorithm_type="deis")
+            else:
+                raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}")
+
+        if solver_type not in ["logrho"]:
+            if solver_type in ["midpoint", "heun", "bh1", "bh2"]:
+                self.register_to_config(solver_type="logrho")
+            else:
+                raise NotImplementedError(f"solver type {solver_type} is not implemented for {self.__class__}")
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.model_outputs = [None] * solver_order
+        self.lower_order_nums = 0
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
+                .round()[::-1][:-1]
+                .copy()
+                .astype(np.int64)
+            )
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // (num_inference_steps + 1)
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.arange(self.config.num_train_timesteps, 0, -step_ratio).round().copy().astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        if self.config.use_karras_sigmas:
+            log_sigmas = np.log(sigmas)
+            sigmas = np.flip(sigmas).copy()
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+            sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+        else:
+            sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+            sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
+            sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
+
+        self.sigmas = torch.from_numpy(sigmas)
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
+
+        self.num_inference_steps = len(timesteps)
+
+        self.model_outputs = [
+            None,
+        ] * self.config.solver_order
+        self.lower_order_nums = 0
+
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
+        sigma_t = sigma * alpha_t
+
+        return alpha_t, sigma_t
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Convert the model output to the corresponding type the DEIS algorithm needs.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The converted model output.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError("missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma = self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        if self.config.prediction_type == "epsilon":
+            x0_pred = (sample - sigma_t * model_output) / alpha_t
+        elif self.config.prediction_type == "sample":
+            x0_pred = model_output
+        elif self.config.prediction_type == "v_prediction":
+            x0_pred = alpha_t * sample - sigma_t * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction` for the DEISMultistepScheduler."
+            )
+
+        if self.config.thresholding:
+            x0_pred = self._threshold_sample(x0_pred)
+
+        if self.config.algorithm_type == "deis":
+            return (sample - alpha_t * x0_pred) / sigma_t
+        else:
+            raise NotImplementedError("only support log-rho multistep deis now")
+
+    def deis_first_order_update(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the first-order DEIS (equivalent to DDIM).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            prev_timestep (`int`):
+                The previous discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
+
+        h = lambda_t - lambda_s
+        if self.config.algorithm_type == "deis":
+            x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output
+        else:
+            raise NotImplementedError("only support log-rho multistep deis now")
+        return x_t
+
+    def multistep_deis_second_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the second-order multistep DEIS.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s0, sigma_s1 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+
+        rho_t, rho_s0, rho_s1 = sigma_t / alpha_t, sigma_s0 / alpha_s0, sigma_s1 / alpha_s1
+
+        if self.config.algorithm_type == "deis":
+
+            def ind_fn(t, b, c):
+                # Integrate[(log(t) - log(c)) / (log(b) - log(c)), {t}]
+                return t * (-np.log(c) + np.log(t) - 1) / (np.log(b) - np.log(c))
+
+            coef1 = ind_fn(rho_t, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s0, rho_s1)
+            coef2 = ind_fn(rho_t, rho_s1, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s0)
+
+            x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1)
+            return x_t
+        else:
+            raise NotImplementedError("only support log-rho multistep deis now")
+
+    def multistep_deis_third_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the third-order multistep DEIS.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing`sample` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s0, sigma_s1, sigma_s2 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+            self.sigmas[self.step_index - 2],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+        alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2)
+
+        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+
+        rho_t, rho_s0, rho_s1, rho_s2 = (
+            sigma_t / alpha_t,
+            sigma_s0 / alpha_s0,
+            sigma_s1 / alpha_s1,
+            sigma_s2 / alpha_s2,
+        )
+
+        if self.config.algorithm_type == "deis":
+
+            def ind_fn(t, b, c, d):
+                # Integrate[(log(t) - log(c))(log(t) - log(d)) / (log(b) - log(c))(log(b) - log(d)), {t}]
+                numerator = t * (
+                    np.log(c) * (np.log(d) - np.log(t) + 1)
+                    - np.log(d) * np.log(t)
+                    + np.log(d)
+                    + np.log(t) ** 2
+                    - 2 * np.log(t)
+                    + 2
+                )
+                denominator = (np.log(b) - np.log(c)) * (np.log(b) - np.log(d))
+                return numerator / denominator
+
+            coef1 = ind_fn(rho_t, rho_s0, rho_s1, rho_s2) - ind_fn(rho_s0, rho_s0, rho_s1, rho_s2)
+            coef2 = ind_fn(rho_t, rho_s1, rho_s2, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s2, rho_s0)
+            coef3 = ind_fn(rho_t, rho_s2, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s2, rho_s0, rho_s1)
+
+            x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1 + coef3 * m2)
+
+            return x_t
+        else:
+            raise NotImplementedError("only support log-rho multistep deis now")
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        index_candidates = (schedule_timesteps == timestep).nonzero()
+
+        if len(index_candidates) == 0:
+            step_index = len(self.timesteps) - 1
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        elif len(index_candidates) > 1:
+            step_index = index_candidates[1].item()
+        else:
+            step_index = index_candidates[0].item()
+
+        return step_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        """
+        Initialize the step_index counter for the scheduler.
+        """
+
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the multistep DEIS.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        lower_order_final = (
+            (self.step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15
+        )
+        lower_order_second = (
+            (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
+        )
+
+        model_output = self.convert_model_output(model_output, sample=sample)
+        for i in range(self.config.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+        self.model_outputs[-1] = model_output
+
+        if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+            prev_sample = self.deis_first_order_update(model_output, sample=sample)
+        elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+            prev_sample = self.multistep_deis_second_order_update(self.model_outputs, sample=sample)
+        else:
+            prev_sample = self.multistep_deis_third_order_update(self.model_outputs, sample=sample)
+
+        if self.lower_order_nums < self.config.solver_order:
+            self.lower_order_nums += 1
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        noisy_samples = alpha_t * original_samples + sigma_t * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_dpm_cogvideox.py b/diffusers/src/diffusers/schedulers/scheduling_dpm_cogvideox.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a2c7be7115bc1188de1a6dc399e91668202681c
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_dpm_cogvideox.py
@@ -0,0 +1,489 @@
+# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
+class DDIMSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+def rescale_zero_terminal_snr(alphas_cumprod):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+
+    return alphas_bar
+
+
+class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
+    non-Markovian guidance.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.00085,
+        beta_end: float = 0.0120,
+        beta_schedule: str = "scaled_linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        clip_sample: bool = True,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        rescale_betas_zero_snr: bool = False,
+        snr_shift_scale: float = 3.0,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float64) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # Modify: SNR shift following SD3
+        self.alphas_cumprod = self.alphas_cumprod / (snr_shift_scale + (1 - snr_shift_scale) * self.alphas_cumprod)
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.alphas_cumprod = rescale_zero_terminal_snr(self.alphas_cumprod)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+
+    def _get_variance(self, timestep, prev_timestep):
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+        """
+
+        if num_inference_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        self.num_inference_steps = num_inference_steps
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
+                .round()[::-1]
+                .copy()
+                .astype(np.int64)
+            )
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'."
+            )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None):
+        lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log()
+        lamb_next = ((alpha_prod_t_prev / (1 - alpha_prod_t_prev)) ** 0.5).log()
+        h = lamb_next - lamb
+
+        if alpha_prod_t_back is not None:
+            lamb_previous = ((alpha_prod_t_back / (1 - alpha_prod_t_back)) ** 0.5).log()
+            h_last = lamb - lamb_previous
+            r = h_last / h
+            return h, r, lamb, lamb_next
+        else:
+            return h, None, lamb, lamb_next
+
+    def get_mult(self, h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back):
+        mult1 = ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** 0.5 * (-h).exp()
+        mult2 = (-2 * h).expm1() * alpha_prod_t_prev**0.5
+
+        if alpha_prod_t_back is not None:
+            mult3 = 1 + 1 / (2 * r)
+            mult4 = 1 / (2 * r)
+            return mult1, mult2, mult3, mult4
+        else:
+            return mult1, mult2
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        old_pred_original_sample: torch.Tensor,
+        timestep: int,
+        timestep_back: int,
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = False,
+    ) -> Union[DDIMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                The weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`, defaults to `False`):
+                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
+                because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
+                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
+                `use_clipped_model_output` has no effect.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+        # Ideally, read DDIM paper in-detail understanding
+
+        # Notation (<variable name> -> <name in paper>
+        # - pred_noise_t -> e_theta(x_t, t)
+        # - pred_original_sample -> f_theta(x_t, t) or x_0
+        # - std_dev_t -> sigma_t
+        # - eta -> η
+        # - pred_sample_direction -> "direction pointing to x_t"
+        # - pred_prev_sample -> "x_t-1"
+
+        # 1. get previous step value (=t-1)
+        prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        alpha_prod_t_back = self.alphas_cumprod[timestep_back] if timestep_back is not None else None
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        # 3. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        # To make style tests pass, commented out `pred_epsilon` as it is an unused variable
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+            # pred_epsilon = model_output
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+            # pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            # pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction`"
+            )
+
+        h, r, lamb, lamb_next = self.get_variables(alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back)
+        mult = list(self.get_mult(h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back))
+        mult_noise = (1 - alpha_prod_t_prev) ** 0.5 * (1 - (-2 * h).exp()) ** 0.5
+
+        noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype)
+        prev_sample = mult[0] * sample - mult[1] * pred_original_sample + mult_noise * noise
+
+        if old_pred_original_sample is None or prev_timestep < 0:
+            # Save a network evaluation if all noise levels are 0 or on the first step
+            return prev_sample, pred_original_sample
+        else:
+            denoised_d = mult[2] * pred_original_sample - mult[3] * old_pred_original_sample
+            noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype)
+            x_advanced = mult[0] * sample - mult[1] * denoised_d + mult_noise * noise
+
+            prev_sample = x_advanced
+
+        if not return_dict:
+            return (prev_sample, pred_original_sample)
+
+        return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
new file mode 100644
index 0000000000000000000000000000000000000000..924eefb0e98d7fb1e36e5e75e0cd9b4700e9c9ba
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -0,0 +1,1091 @@
+# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import deprecate
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `DPMSolverMultistepScheduler` is a fast dedicated high-order solver for diffusion ODEs.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        solver_order (`int`, defaults to 2):
+            The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided
+            sampling, and `solver_order=3` for unconditional sampling.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++"`.
+        algorithm_type (`str`, defaults to `dpmsolver++`):
+            Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, `sde-dpmsolver` or `sde-dpmsolver++`. The
+            `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927)
+            paper, and the `dpmsolver++` type implements the algorithms in the
+            [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
+            `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
+        solver_type (`str`, defaults to `midpoint`):
+            Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
+            sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
+        lower_order_final (`bool`, defaults to `True`):
+            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
+            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
+        euler_at_final (`bool`, defaults to `False`):
+            Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail
+            richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
+            steps, but sometimes may result in blurring.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        use_lu_lambdas (`bool`, *optional*, defaults to `False`):
+            Whether to use the uniform-logSNR for step sizes proposed by Lu's DPM-Solver in the noise schedule during
+            the sampling process. If `True`, the sigmas and time steps are determined according to a sequence of
+            `lambda(t)`.
+        final_sigmas_type (`str`, defaults to `"zero"`):
+            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+        lambda_min_clipped (`float`, defaults to `-inf`):
+            Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
+            cosine (`squaredcos_cap_v2`) noise schedule.
+        variance_type (`str`, *optional*):
+            Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
+            contains the predicted Gaussian variance.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        solver_order: int = 2,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "dpmsolver++",
+        solver_type: str = "midpoint",
+        lower_order_final: bool = True,
+        euler_at_final: bool = False,
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        use_lu_lambdas: Optional[bool] = False,
+        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+        lambda_min_clipped: float = -float("inf"),
+        variance_type: Optional[str] = None,
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
+            deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead"
+            deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0", deprecation_message)
+
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        if rescale_betas_zero_snr:
+            # Close to 0 without being 0 so first sigma is not inf
+            # FP16 smallest positive subnormal works well here
+            self.alphas_cumprod[-1] = 2**-24
+
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+        self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # settings for DPM-Solver
+        if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
+            if algorithm_type == "deis":
+                self.register_to_config(algorithm_type="dpmsolver++")
+            else:
+                raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}")
+
+        if solver_type not in ["midpoint", "heun"]:
+            if solver_type in ["logrho", "bh1", "bh2"]:
+                self.register_to_config(solver_type="midpoint")
+            else:
+                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
+
+        if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"] and final_sigmas_type == "zero":
+            raise ValueError(
+                f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please choose `sigma_min` instead."
+            )
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.model_outputs = [None] * solver_order
+        self.lower_order_nums = 0
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int = None,
+        device: Union[str, torch.device] = None,
+        timesteps: Optional[List[int]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary timesteps schedule. If `None`, timesteps will be generated
+                based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas`
+                must be `None`, and `timestep_spacing` attribute will be ignored.
+        """
+        if num_inference_steps is None and timesteps is None:
+            raise ValueError("Must pass exactly one of `num_inference_steps` or `timesteps`.")
+        if num_inference_steps is not None and timesteps is not None:
+            raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
+        if timesteps is not None and self.config.use_karras_sigmas:
+            raise ValueError("Cannot use `timesteps` with `config.use_karras_sigmas = True`")
+        if timesteps is not None and self.config.use_lu_lambdas:
+            raise ValueError("Cannot use `timesteps` with `config.use_lu_lambdas = True`")
+        if timesteps is not None and self.config.use_exponential_sigmas:
+            raise ValueError("Cannot set `timesteps` with `config.use_exponential_sigmas = True`.")
+
+        if timesteps is not None:
+            timesteps = np.array(timesteps).astype(np.int64)
+        else:
+            # Clipping the minimum of all lambda(t) for numerical stability.
+            # This is critical for cosine (squaredcos_cap_v2) noise schedule.
+            clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
+            last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item()
+
+            # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+            if self.config.timestep_spacing == "linspace":
+                timesteps = (
+                    np.linspace(0, last_timestep - 1, num_inference_steps + 1)
+                    .round()[::-1][:-1]
+                    .copy()
+                    .astype(np.int64)
+                )
+            elif self.config.timestep_spacing == "leading":
+                step_ratio = last_timestep // (num_inference_steps + 1)
+                # creates integer timesteps by multiplying by ratio
+                # casting to int to avoid issues when num_inference_step is power of 3
+                timesteps = (
+                    (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64)
+                )
+                timesteps += self.config.steps_offset
+            elif self.config.timestep_spacing == "trailing":
+                step_ratio = self.config.num_train_timesteps / num_inference_steps
+                # creates integer timesteps by multiplying by ratio
+                # casting to int to avoid issues when num_inference_step is power of 3
+                timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
+                timesteps -= 1
+            else:
+                raise ValueError(
+                    f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+                )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        log_sigmas = np.log(sigmas)
+
+        if self.config.use_karras_sigmas:
+            sigmas = np.flip(sigmas).copy()
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+        elif self.config.use_lu_lambdas:
+            lambdas = np.flip(log_sigmas.copy())
+            lambdas = self._convert_to_lu(in_lambdas=lambdas, num_inference_steps=num_inference_steps)
+            sigmas = np.exp(lambdas)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+        else:
+            sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+
+        if self.config.final_sigmas_type == "sigma_min":
+            sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
+        elif self.config.final_sigmas_type == "zero":
+            sigma_last = 0
+        else:
+            raise ValueError(
+                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+            )
+
+        sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
+
+        self.sigmas = torch.from_numpy(sigmas)
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
+
+        self.num_inference_steps = len(timesteps)
+
+        self.model_outputs = [
+            None,
+        ] * self.config.solver_order
+        self.lower_order_nums = 0
+
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
+        sigma_t = sigma * alpha_t
+
+        return alpha_t, sigma_t
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    def _convert_to_lu(self, in_lambdas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Lu et al. (2022)."""
+
+        lambda_min: float = in_lambdas[-1].item()
+        lambda_max: float = in_lambdas[0].item()
+
+        rho = 1.0  # 1.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = lambda_min ** (1 / rho)
+        max_inv_rho = lambda_max ** (1 / rho)
+        lambdas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return lambdas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
+        designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
+        integral of the data prediction model.
+
+        <Tip>
+
+        The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
+        prediction and data prediction models.
+
+        </Tip>
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The converted model output.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError("missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        # DPM-Solver++ needs to solve an integral of the data prediction model.
+        if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
+            if self.config.prediction_type == "epsilon":
+                # DPM-Solver and DPM-Solver++ only need the "mean" output.
+                if self.config.variance_type in ["learned", "learned_range"]:
+                    model_output = model_output[:, :3]
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            elif self.config.prediction_type == "sample":
+                x0_pred = model_output
+            elif self.config.prediction_type == "v_prediction":
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                x0_pred = alpha_t * sample - sigma_t * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the DPMSolverMultistepScheduler."
+                )
+
+            if self.config.thresholding:
+                x0_pred = self._threshold_sample(x0_pred)
+
+            return x0_pred
+
+        # DPM-Solver needs to solve an integral of the noise prediction model.
+        elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
+            if self.config.prediction_type == "epsilon":
+                # DPM-Solver and DPM-Solver++ only need the "mean" output.
+                if self.config.variance_type in ["learned", "learned_range"]:
+                    epsilon = model_output[:, :3]
+                else:
+                    epsilon = model_output
+            elif self.config.prediction_type == "sample":
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+            elif self.config.prediction_type == "v_prediction":
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                epsilon = alpha_t * model_output + sigma_t * sample
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the DPMSolverMultistepScheduler."
+                )
+
+            if self.config.thresholding:
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                x0_pred = (sample - sigma_t * epsilon) / alpha_t
+                x0_pred = self._threshold_sample(x0_pred)
+                epsilon = (sample - alpha_t * x0_pred) / sigma_t
+
+            return epsilon
+
+    def dpm_solver_first_order_update(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the first-order DPMSolver (equivalent to DDIM).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
+
+        h = lambda_t - lambda_s
+        if self.config.algorithm_type == "dpmsolver++":
+            x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
+        elif self.config.algorithm_type == "dpmsolver":
+            x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output
+        elif self.config.algorithm_type == "sde-dpmsolver++":
+            assert noise is not None
+            x_t = (
+                (sigma_t / sigma_s * torch.exp(-h)) * sample
+                + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
+                + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+            )
+        elif self.config.algorithm_type == "sde-dpmsolver":
+            assert noise is not None
+            x_t = (
+                (alpha_t / alpha_s) * sample
+                - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * model_output
+                + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+            )
+        return x_t
+
+    def multistep_dpm_solver_second_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the second-order multistep DPMSolver.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s0, sigma_s1 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+                )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+                )
+        elif self.config.algorithm_type == "sde-dpmsolver++":
+            assert noise is not None
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+                    + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+                    + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
+                    + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+                    + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+                    + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
+                    + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+                )
+        elif self.config.algorithm_type == "sde-dpmsolver":
+            assert noise is not None
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D1
+                    + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - 2.0 * (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+                    + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+                )
+        return x_t
+
+    def multistep_dpm_solver_third_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the third-order multistep DPMSolver.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing`sample` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s0, sigma_s1, sigma_s2 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+            self.sigmas[self.step_index - 2],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+        alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+        lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2)
+
+        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+
+        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+        r0, r1 = h_0 / h, h_1 / h
+        D0 = m0
+        D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (sigma_t / sigma_s0) * sample
+                - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+                - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+            )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (alpha_t / alpha_s0) * sample
+                - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+                - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+            )
+        return x_t
+
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        index_candidates = (schedule_timesteps == timestep).nonzero()
+
+        if len(index_candidates) == 0:
+            step_index = len(self.timesteps) - 1
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        elif len(index_candidates) > 1:
+            step_index = index_candidates[1].item()
+        else:
+            step_index = index_candidates[0].item()
+
+        return step_index
+
+    def _init_step_index(self, timestep):
+        """
+        Initialize the step_index counter for the scheduler.
+        """
+
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the multistep DPMSolver.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`LEdits++`].
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Improve numerical stability for small number of steps
+        lower_order_final = (self.step_index == len(self.timesteps) - 1) and (
+            self.config.euler_at_final
+            or (self.config.lower_order_final and len(self.timesteps) < 15)
+            or self.config.final_sigmas_type == "zero"
+        )
+        lower_order_second = (
+            (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
+        )
+
+        model_output = self.convert_model_output(model_output, sample=sample)
+        for i in range(self.config.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+        self.model_outputs[-1] = model_output
+
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+        if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None:
+            noise = randn_tensor(
+                model_output.shape, generator=generator, device=model_output.device, dtype=torch.float32
+            )
+        elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+            noise = variance_noise.to(device=model_output.device, dtype=torch.float32)
+        else:
+            noise = None
+
+        if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+            prev_sample = self.dpm_solver_first_order_update(model_output, sample=sample, noise=noise)
+        elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+            prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise)
+        else:
+            prev_sample = self.multistep_dpm_solver_third_order_update(self.model_outputs, sample=sample)
+
+        if self.lower_order_nums < self.config.solver_order:
+            self.lower_order_nums += 1
+
+        # Cast sample back to expected dtype
+        prev_sample = prev_sample.to(model_output.dtype)
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        noisy_samples = alpha_t * original_samples + sigma_t * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py b/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f48066455fb2328ee3acd882157807f684e8917
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
@@ -0,0 +1,643 @@
+# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
+
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import flax
+import jax
+import jax.numpy as jnp
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import (
+    CommonSchedulerState,
+    FlaxKarrasDiffusionSchedulers,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    add_noise_common,
+)
+
+
+@flax.struct.dataclass
+class DPMSolverMultistepSchedulerState:
+    common: CommonSchedulerState
+    alpha_t: jnp.ndarray
+    sigma_t: jnp.ndarray
+    lambda_t: jnp.ndarray
+
+    # setable values
+    init_noise_sigma: jnp.ndarray
+    timesteps: jnp.ndarray
+    num_inference_steps: Optional[int] = None
+
+    # running values
+    model_outputs: Optional[jnp.ndarray] = None
+    lower_order_nums: Optional[jnp.int32] = None
+    prev_timestep: Optional[jnp.int32] = None
+    cur_sample: Optional[jnp.ndarray] = None
+
+    @classmethod
+    def create(
+        cls,
+        common: CommonSchedulerState,
+        alpha_t: jnp.ndarray,
+        sigma_t: jnp.ndarray,
+        lambda_t: jnp.ndarray,
+        init_noise_sigma: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ):
+        return cls(
+            common=common,
+            alpha_t=alpha_t,
+            sigma_t=sigma_t,
+            lambda_t=lambda_t,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
+
+
+@dataclass
+class FlaxDPMSolverMultistepSchedulerOutput(FlaxSchedulerOutput):
+    state: DPMSolverMultistepSchedulerState
+
+
+class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
+    """
+    DPM-Solver (and the improved version DPM-Solver++) is a fast dedicated high-order solver for diffusion ODEs with
+    the convergence order guarantee. Empirically, sampling by DPM-Solver with only 20 steps can generate high-quality
+    samples, and it can generate quite good samples even in only 10 steps.
+
+    For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095
+
+    Currently, we support the multistep DPM-Solver for both noise prediction models and data prediction models. We
+    recommend to use `solver_order=2` for guided sampling, and `solver_order=3` for unconditional sampling.
+
+    We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space
+    diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic
+    thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models (such as
+    stable-diffusion).
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        solver_order (`int`, default `2`):
+            the order of DPM-Solver; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided
+            sampling, and `solver_order=3` for unconditional sampling.
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the data / `x0`. One of `epsilon`, `sample`,
+            or `v-prediction`.
+        thresholding (`bool`, default `False`):
+            whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
+            For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to
+            use the dynamic thresholding. Note that the thresholding method is unsuitable for latent-space diffusion
+            models (such as stable-diffusion).
+        dynamic_thresholding_ratio (`float`, default `0.995`):
+            the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen
+            (https://arxiv.org/abs/2205.11487).
+        sample_max_value (`float`, default `1.0`):
+            the threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++`.
+        algorithm_type (`str`, default `dpmsolver++`):
+            the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the
+            algorithms in https://arxiv.org/abs/2206.00927, and the `dpmsolver++` type implements the algorithms in
+            https://arxiv.org/abs/2211.01095. We recommend to use `dpmsolver++` with `solver_order=2` for guided
+            sampling (e.g. stable-diffusion).
+        solver_type (`str`, default `midpoint`):
+            the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects
+            the sample quality, especially for small number of steps. We empirically find that `midpoint` solvers are
+            slightly better, so we recommend to use the `midpoint` type.
+        lower_order_final (`bool`, default `True`):
+            whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically
+            find this trick can stabilize the sampling of DPM-Solver for steps < 15, especially for steps <= 10.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
+    """
+
+    _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers]
+
+    dtype: jnp.dtype
+
+    @property
+    def has_state(self):
+        return True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[jnp.ndarray] = None,
+        solver_order: int = 2,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "dpmsolver++",
+        solver_type: str = "midpoint",
+        lower_order_final: bool = True,
+        timestep_spacing: str = "linspace",
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> DPMSolverMultistepSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)
+
+        # Currently we only support VP-type noise schedule
+        alpha_t = jnp.sqrt(common.alphas_cumprod)
+        sigma_t = jnp.sqrt(1 - common.alphas_cumprod)
+        lambda_t = jnp.log(alpha_t) - jnp.log(sigma_t)
+
+        # settings for DPM-Solver
+        if self.config.algorithm_type not in ["dpmsolver", "dpmsolver++"]:
+            raise NotImplementedError(f"{self.config.algorithm_type} is not implemented for {self.__class__}")
+        if self.config.solver_type not in ["midpoint", "heun"]:
+            raise NotImplementedError(f"{self.config.solver_type} is not implemented for {self.__class__}")
+
+        # standard deviation of the initial noise distribution
+        init_noise_sigma = jnp.array(1.0, dtype=self.dtype)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+
+        return DPMSolverMultistepSchedulerState.create(
+            common=common,
+            alpha_t=alpha_t,
+            sigma_t=sigma_t,
+            lambda_t=lambda_t,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
+
+    def set_timesteps(
+        self, state: DPMSolverMultistepSchedulerState, num_inference_steps: int, shape: Tuple
+    ) -> DPMSolverMultistepSchedulerState:
+        """
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            state (`DPMSolverMultistepSchedulerState`):
+                the `FlaxDPMSolverMultistepScheduler` state data class instance.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+            shape (`Tuple`):
+                the shape of the samples to be generated.
+        """
+        last_timestep = self.config.num_train_timesteps
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                jnp.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].astype(jnp.int32)
+            )
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = last_timestep // (num_inference_steps + 1)
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (
+                (jnp.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(jnp.int32)
+            )
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = jnp.arange(last_timestep, 0, -step_ratio).round().copy().astype(jnp.int32)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        # initial running values
+
+        model_outputs = jnp.zeros((self.config.solver_order,) + shape, dtype=self.dtype)
+        lower_order_nums = jnp.int32(0)
+        prev_timestep = jnp.int32(-1)
+        cur_sample = jnp.zeros(shape, dtype=self.dtype)
+
+        return state.replace(
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            model_outputs=model_outputs,
+            lower_order_nums=lower_order_nums,
+            prev_timestep=prev_timestep,
+            cur_sample=cur_sample,
+        )
+
+    def convert_model_output(
+        self,
+        state: DPMSolverMultistepSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+    ) -> jnp.ndarray:
+        """
+        Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
+
+        DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to
+        discretize an integral of the data prediction model. So we need to first convert the model output to the
+        corresponding type to match the algorithm.
+
+        Note that the algorithm type and the model type is decoupled. That is to say, we can use either DPM-Solver or
+        DPM-Solver++ for both noise prediction model and data prediction model.
+
+        Args:
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `jnp.ndarray`: the converted model output.
+        """
+        # DPM-Solver++ needs to solve an integral of the data prediction model.
+        if self.config.algorithm_type == "dpmsolver++":
+            if self.config.prediction_type == "epsilon":
+                alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep]
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            elif self.config.prediction_type == "sample":
+                x0_pred = model_output
+            elif self.config.prediction_type == "v_prediction":
+                alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep]
+                x0_pred = alpha_t * sample - sigma_t * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, "
+                    " or `v_prediction` for the FlaxDPMSolverMultistepScheduler."
+                )
+
+            if self.config.thresholding:
+                # Dynamic thresholding in https://arxiv.org/abs/2205.11487
+                dynamic_max_val = jnp.percentile(
+                    jnp.abs(x0_pred), self.config.dynamic_thresholding_ratio, axis=tuple(range(1, x0_pred.ndim))
+                )
+                dynamic_max_val = jnp.maximum(
+                    dynamic_max_val, self.config.sample_max_value * jnp.ones_like(dynamic_max_val)
+                )
+                x0_pred = jnp.clip(x0_pred, -dynamic_max_val, dynamic_max_val) / dynamic_max_val
+            return x0_pred
+        # DPM-Solver needs to solve an integral of the noise prediction model.
+        elif self.config.algorithm_type == "dpmsolver":
+            if self.config.prediction_type == "epsilon":
+                return model_output
+            elif self.config.prediction_type == "sample":
+                alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep]
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+                return epsilon
+            elif self.config.prediction_type == "v_prediction":
+                alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep]
+                epsilon = alpha_t * model_output + sigma_t * sample
+                return epsilon
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, "
+                    " or `v_prediction` for the FlaxDPMSolverMultistepScheduler."
+                )
+
+    def dpm_solver_first_order_update(
+        self,
+        state: DPMSolverMultistepSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        prev_timestep: int,
+        sample: jnp.ndarray,
+    ) -> jnp.ndarray:
+        """
+        One step for the first-order DPM-Solver (equivalent to DDIM).
+
+        See https://arxiv.org/abs/2206.00927 for the detailed derivation.
+
+        Args:
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `jnp.ndarray`: the sample tensor at the previous timestep.
+        """
+        t, s0 = prev_timestep, timestep
+        m0 = model_output
+        lambda_t, lambda_s = state.lambda_t[t], state.lambda_t[s0]
+        alpha_t, alpha_s = state.alpha_t[t], state.alpha_t[s0]
+        sigma_t, sigma_s = state.sigma_t[t], state.sigma_t[s0]
+        h = lambda_t - lambda_s
+        if self.config.algorithm_type == "dpmsolver++":
+            x_t = (sigma_t / sigma_s) * sample - (alpha_t * (jnp.exp(-h) - 1.0)) * m0
+        elif self.config.algorithm_type == "dpmsolver":
+            x_t = (alpha_t / alpha_s) * sample - (sigma_t * (jnp.exp(h) - 1.0)) * m0
+        return x_t
+
+    def multistep_dpm_solver_second_order_update(
+        self,
+        state: DPMSolverMultistepSchedulerState,
+        model_output_list: jnp.ndarray,
+        timestep_list: List[int],
+        prev_timestep: int,
+        sample: jnp.ndarray,
+    ) -> jnp.ndarray:
+        """
+        One step for the second-order multistep DPM-Solver.
+
+        Args:
+            model_output_list (`List[jnp.ndarray]`):
+                direct outputs from learned diffusion model at current and latter timesteps.
+            timestep (`int`): current and latter discrete timestep in the diffusion chain.
+            prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `jnp.ndarray`: the sample tensor at the previous timestep.
+        """
+        t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+        lambda_t, lambda_s0, lambda_s1 = state.lambda_t[t], state.lambda_t[s0], state.lambda_t[s1]
+        alpha_t, alpha_s0 = state.alpha_t[t], state.alpha_t[s0]
+        sigma_t, sigma_s0 = state.sigma_t[t], state.sigma_t[s0]
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (jnp.exp(-h) - 1.0)) * D0
+                    - 0.5 * (alpha_t * (jnp.exp(-h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (jnp.exp(-h) - 1.0)) * D0
+                    + (alpha_t * ((jnp.exp(-h) - 1.0) / h + 1.0)) * D1
+                )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - (sigma_t * (jnp.exp(h) - 1.0)) * D0
+                    - 0.5 * (sigma_t * (jnp.exp(h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - (sigma_t * (jnp.exp(h) - 1.0)) * D0
+                    - (sigma_t * ((jnp.exp(h) - 1.0) / h - 1.0)) * D1
+                )
+        return x_t
+
+    def multistep_dpm_solver_third_order_update(
+        self,
+        state: DPMSolverMultistepSchedulerState,
+        model_output_list: jnp.ndarray,
+        timestep_list: List[int],
+        prev_timestep: int,
+        sample: jnp.ndarray,
+    ) -> jnp.ndarray:
+        """
+        One step for the third-order multistep DPM-Solver.
+
+        Args:
+            model_output_list (`List[jnp.ndarray]`):
+                direct outputs from learned diffusion model at current and latter timesteps.
+            timestep (`int`): current and latter discrete timestep in the diffusion chain.
+            prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `jnp.ndarray`: the sample tensor at the previous timestep.
+        """
+        t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
+        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+        lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
+            state.lambda_t[t],
+            state.lambda_t[s0],
+            state.lambda_t[s1],
+            state.lambda_t[s2],
+        )
+        alpha_t, alpha_s0 = state.alpha_t[t], state.alpha_t[s0]
+        sigma_t, sigma_s0 = state.sigma_t[t], state.sigma_t[s0]
+        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+        r0, r1 = h_0 / h, h_1 / h
+        D0 = m0
+        D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (sigma_t / sigma_s0) * sample
+                - (alpha_t * (jnp.exp(-h) - 1.0)) * D0
+                + (alpha_t * ((jnp.exp(-h) - 1.0) / h + 1.0)) * D1
+                - (alpha_t * ((jnp.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+            )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (alpha_t / alpha_s0) * sample
+                - (sigma_t * (jnp.exp(h) - 1.0)) * D0
+                - (sigma_t * ((jnp.exp(h) - 1.0) / h - 1.0)) * D1
+                - (sigma_t * ((jnp.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+            )
+        return x_t
+
+    def step(
+        self,
+        state: DPMSolverMultistepSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+        return_dict: bool = True,
+    ) -> Union[FlaxDPMSolverMultistepSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by DPM-Solver. Core function to propagate the diffusion process
+        from the learned model outputs (most often the predicted noise).
+
+        Args:
+            state (`DPMSolverMultistepSchedulerState`):
+                the `FlaxDPMSolverMultistepScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            return_dict (`bool`): option for returning tuple rather than FlaxDPMSolverMultistepSchedulerOutput class
+
+        Returns:
+            [`FlaxDPMSolverMultistepSchedulerOutput`] or `tuple`: [`FlaxDPMSolverMultistepSchedulerOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+        if state.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        (step_index,) = jnp.where(state.timesteps == timestep, size=1)
+        step_index = step_index[0]
+
+        prev_timestep = jax.lax.select(step_index == len(state.timesteps) - 1, 0, state.timesteps[step_index + 1])
+
+        model_output = self.convert_model_output(state, model_output, timestep, sample)
+
+        model_outputs_new = jnp.roll(state.model_outputs, -1, axis=0)
+        model_outputs_new = model_outputs_new.at[-1].set(model_output)
+        state = state.replace(
+            model_outputs=model_outputs_new,
+            prev_timestep=prev_timestep,
+            cur_sample=sample,
+        )
+
+        def step_1(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+            return self.dpm_solver_first_order_update(
+                state,
+                state.model_outputs[-1],
+                state.timesteps[step_index],
+                state.prev_timestep,
+                state.cur_sample,
+            )
+
+        def step_23(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+            def step_2(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+                timestep_list = jnp.array([state.timesteps[step_index - 1], state.timesteps[step_index]])
+                return self.multistep_dpm_solver_second_order_update(
+                    state,
+                    state.model_outputs,
+                    timestep_list,
+                    state.prev_timestep,
+                    state.cur_sample,
+                )
+
+            def step_3(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+                timestep_list = jnp.array(
+                    [
+                        state.timesteps[step_index - 2],
+                        state.timesteps[step_index - 1],
+                        state.timesteps[step_index],
+                    ]
+                )
+                return self.multistep_dpm_solver_third_order_update(
+                    state,
+                    state.model_outputs,
+                    timestep_list,
+                    state.prev_timestep,
+                    state.cur_sample,
+                )
+
+            step_2_output = step_2(state)
+            step_3_output = step_3(state)
+
+            if self.config.solver_order == 2:
+                return step_2_output
+            elif self.config.lower_order_final and len(state.timesteps) < 15:
+                return jax.lax.select(
+                    state.lower_order_nums < 2,
+                    step_2_output,
+                    jax.lax.select(
+                        step_index == len(state.timesteps) - 2,
+                        step_2_output,
+                        step_3_output,
+                    ),
+                )
+            else:
+                return jax.lax.select(
+                    state.lower_order_nums < 2,
+                    step_2_output,
+                    step_3_output,
+                )
+
+        step_1_output = step_1(state)
+        step_23_output = step_23(state)
+
+        if self.config.solver_order == 1:
+            prev_sample = step_1_output
+
+        elif self.config.lower_order_final and len(state.timesteps) < 15:
+            prev_sample = jax.lax.select(
+                state.lower_order_nums < 1,
+                step_1_output,
+                jax.lax.select(
+                    step_index == len(state.timesteps) - 1,
+                    step_1_output,
+                    step_23_output,
+                ),
+            )
+
+        else:
+            prev_sample = jax.lax.select(
+                state.lower_order_nums < 1,
+                step_1_output,
+                step_23_output,
+            )
+
+        state = state.replace(
+            lower_order_nums=jnp.minimum(state.lower_order_nums + 1, self.config.solver_order),
+        )
+
+        if not return_dict:
+            return (prev_sample, state)
+
+        return FlaxDPMSolverMultistepSchedulerOutput(prev_sample=prev_sample, state=state)
+
+    def scale_model_input(
+        self, state: DPMSolverMultistepSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
+    ) -> jnp.ndarray:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            state (`DPMSolverMultistepSchedulerState`):
+                the `FlaxDPMSolverMultistepScheduler` state data class instance.
+            sample (`jnp.ndarray`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `jnp.ndarray`: scaled input sample
+        """
+        return sample
+
+    def add_noise(
+        self,
+        state: DPMSolverMultistepSchedulerState,
+        original_samples: jnp.ndarray,
+        noise: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ) -> jnp.ndarray:
+        return add_noise_common(state.common, original_samples, noise, timesteps)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f024b8c4c751416b1336e8472e145ab5fb9efb0
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py
@@ -0,0 +1,952 @@
+# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import deprecate
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `DPMSolverMultistepInverseScheduler` is the reverse scheduler of [`DPMSolverMultistepScheduler`].
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        solver_order (`int`, defaults to 2):
+            The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided
+            sampling, and `solver_order=3` for unconditional sampling.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++"`.
+        algorithm_type (`str`, defaults to `dpmsolver++`):
+            Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, `sde-dpmsolver` or `sde-dpmsolver++`. The
+            `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927)
+            paper, and the `dpmsolver++` type implements the algorithms in the
+            [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
+            `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
+        solver_type (`str`, defaults to `midpoint`):
+            Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
+            sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
+        lower_order_final (`bool`, defaults to `True`):
+            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
+            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
+        euler_at_final (`bool`, defaults to `False`):
+            Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail
+            richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
+            steps, but sometimes may result in blurring.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        lambda_min_clipped (`float`, defaults to `-inf`):
+            Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
+            cosine (`squaredcos_cap_v2`) noise schedule.
+        variance_type (`str`, *optional*):
+            Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
+            contains the predicted Gaussian variance.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        solver_order: int = 2,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "dpmsolver++",
+        solver_type: str = "midpoint",
+        lower_order_final: bool = True,
+        euler_at_final: bool = False,
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        lambda_min_clipped: float = -float("inf"),
+        variance_type: Optional[str] = None,
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
+            deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead"
+            deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0", deprecation_message)
+
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+        self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # settings for DPM-Solver
+        if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
+            if algorithm_type == "deis":
+                self.register_to_config(algorithm_type="dpmsolver++")
+            else:
+                raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}")
+
+        if solver_type not in ["midpoint", "heun"]:
+            if solver_type in ["logrho", "bh1", "bh2"]:
+                self.register_to_config(solver_type="midpoint")
+            else:
+                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32).copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.model_outputs = [None] * solver_order
+        self.lower_order_nums = 0
+        self._step_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+        self.use_karras_sigmas = use_karras_sigmas
+        self.use_exponential_sigmas = use_exponential_sigmas
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        # Clipping the minimum of all lambda(t) for numerical stability.
+        # This is critical for cosine (squaredcos_cap_v2) noise schedule.
+        clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped).item()
+        self.noisiest_timestep = self.config.num_train_timesteps - 1 - clipped_idx
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                np.linspace(0, self.noisiest_timestep, num_inference_steps + 1).round()[:-1].copy().astype(np.int64)
+            )
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = (self.noisiest_timestep + 1) // (num_inference_steps + 1)
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[:-1].copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.arange(self.noisiest_timestep + 1, 0, -step_ratio).round()[::-1].copy().astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', "
+                "'leading' or 'trailing'."
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        log_sigmas = np.log(sigmas)
+
+        if self.config.use_karras_sigmas:
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+            timesteps = timesteps.copy().astype(np.int64)
+            sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+        else:
+            sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+            sigma_max = (
+                (1 - self.alphas_cumprod[self.noisiest_timestep]) / self.alphas_cumprod[self.noisiest_timestep]
+            ) ** 0.5
+            sigmas = np.concatenate([sigmas, [sigma_max]]).astype(np.float32)
+
+        self.sigmas = torch.from_numpy(sigmas)
+
+        # when num_inference_steps == num_train_timesteps, we can end up with
+        # duplicates in timesteps.
+        _, unique_indices = np.unique(timesteps, return_index=True)
+        timesteps = timesteps[np.sort(unique_indices)]
+
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
+
+        self.num_inference_steps = len(timesteps)
+
+        self.model_outputs = [
+            None,
+        ] * self.config.solver_order
+        self.lower_order_nums = 0
+
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
+        sigma_t = sigma * alpha_t
+
+        return alpha_t, sigma_t
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.convert_model_output
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
+        designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
+        integral of the data prediction model.
+
+        <Tip>
+
+        The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
+        prediction and data prediction models.
+
+        </Tip>
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The converted model output.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError("missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        # DPM-Solver++ needs to solve an integral of the data prediction model.
+        if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
+            if self.config.prediction_type == "epsilon":
+                # DPM-Solver and DPM-Solver++ only need the "mean" output.
+                if self.config.variance_type in ["learned", "learned_range"]:
+                    model_output = model_output[:, :3]
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            elif self.config.prediction_type == "sample":
+                x0_pred = model_output
+            elif self.config.prediction_type == "v_prediction":
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                x0_pred = alpha_t * sample - sigma_t * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the DPMSolverMultistepScheduler."
+                )
+
+            if self.config.thresholding:
+                x0_pred = self._threshold_sample(x0_pred)
+
+            return x0_pred
+
+        # DPM-Solver needs to solve an integral of the noise prediction model.
+        elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
+            if self.config.prediction_type == "epsilon":
+                # DPM-Solver and DPM-Solver++ only need the "mean" output.
+                if self.config.variance_type in ["learned", "learned_range"]:
+                    epsilon = model_output[:, :3]
+                else:
+                    epsilon = model_output
+            elif self.config.prediction_type == "sample":
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+            elif self.config.prediction_type == "v_prediction":
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                epsilon = alpha_t * model_output + sigma_t * sample
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the DPMSolverMultistepScheduler."
+                )
+
+            if self.config.thresholding:
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                x0_pred = (sample - sigma_t * epsilon) / alpha_t
+                x0_pred = self._threshold_sample(x0_pred)
+                epsilon = (sample - alpha_t * x0_pred) / sigma_t
+
+            return epsilon
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.dpm_solver_first_order_update
+    def dpm_solver_first_order_update(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the first-order DPMSolver (equivalent to DDIM).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
+
+        h = lambda_t - lambda_s
+        if self.config.algorithm_type == "dpmsolver++":
+            x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
+        elif self.config.algorithm_type == "dpmsolver":
+            x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output
+        elif self.config.algorithm_type == "sde-dpmsolver++":
+            assert noise is not None
+            x_t = (
+                (sigma_t / sigma_s * torch.exp(-h)) * sample
+                + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
+                + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+            )
+        elif self.config.algorithm_type == "sde-dpmsolver":
+            assert noise is not None
+            x_t = (
+                (alpha_t / alpha_s) * sample
+                - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * model_output
+                + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+            )
+        return x_t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_second_order_update
+    def multistep_dpm_solver_second_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the second-order multistep DPMSolver.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s0, sigma_s1 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+                )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+                )
+        elif self.config.algorithm_type == "sde-dpmsolver++":
+            assert noise is not None
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+                    + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+                    + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
+                    + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+                    + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+                    + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
+                    + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+                )
+        elif self.config.algorithm_type == "sde-dpmsolver":
+            assert noise is not None
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D1
+                    + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - 2.0 * (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+                    + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+                )
+        return x_t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_third_order_update
+    def multistep_dpm_solver_third_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the third-order multistep DPMSolver.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing`sample` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s0, sigma_s1, sigma_s2 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+            self.sigmas[self.step_index - 2],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+        alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+        lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2)
+
+        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+
+        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+        r0, r1 = h_0 / h, h_1 / h
+        D0 = m0
+        D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (sigma_t / sigma_s0) * sample
+                - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+                - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+            )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (alpha_t / alpha_s0) * sample
+                - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+                - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+            )
+        return x_t
+
+    def _init_step_index(self, timestep):
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+
+        index_candidates = (self.timesteps == timestep).nonzero()
+
+        if len(index_candidates) == 0:
+            step_index = len(self.timesteps) - 1
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        elif len(index_candidates) > 1:
+            step_index = index_candidates[1].item()
+        else:
+            step_index = index_candidates[0].item()
+
+        self._step_index = step_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the multistep DPMSolver.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Improve numerical stability for small number of steps
+        lower_order_final = (self.step_index == len(self.timesteps) - 1) and (
+            self.config.euler_at_final or (self.config.lower_order_final and len(self.timesteps) < 15)
+        )
+        lower_order_second = (
+            (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
+        )
+
+        model_output = self.convert_model_output(model_output, sample=sample)
+        for i in range(self.config.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+        self.model_outputs[-1] = model_output
+
+        if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None:
+            noise = randn_tensor(
+                model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+            )
+        elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+            noise = variance_noise
+        else:
+            noise = None
+
+        if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+            prev_sample = self.dpm_solver_first_order_update(model_output, sample=sample, noise=noise)
+        elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+            prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise)
+        else:
+            prev_sample = self.multistep_dpm_solver_third_order_update(self.model_outputs, sample=sample)
+
+        if self.lower_order_nums < self.config.solver_order:
+            self.lower_order_nums += 1
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.scale_model_input
+    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        step_indices = []
+        for timestep in timesteps:
+            index_candidates = (schedule_timesteps == timestep).nonzero()
+            if len(index_candidates) == 0:
+                step_index = len(schedule_timesteps) - 1
+            elif len(index_candidates) > 1:
+                step_index = index_candidates[1].item()
+            else:
+                step_index = index_candidates[0].item()
+            step_indices.append(step_index)
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        noisy_samples = alpha_t * original_samples + sigma_t * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_sde.py
new file mode 100644
index 0000000000000000000000000000000000000000..3748de63388a711d9902a10befb0e29c35fd8e05
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_sde.py
@@ -0,0 +1,604 @@
+# Copyright 2024 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torchsde
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+class BatchedBrownianTree:
+    """A wrapper around torchsde.BrownianTree that enables batches of entropy."""
+
+    def __init__(self, x, t0, t1, seed=None, **kwargs):
+        t0, t1, self.sign = self.sort(t0, t1)
+        w0 = kwargs.get("w0", torch.zeros_like(x))
+        if seed is None:
+            seed = torch.randint(0, 2**63 - 1, []).item()
+        self.batched = True
+        try:
+            assert len(seed) == x.shape[0]
+            w0 = w0[0]
+        except TypeError:
+            seed = [seed]
+            self.batched = False
+        self.trees = [
+            torchsde.BrownianInterval(
+                t0=t0,
+                t1=t1,
+                size=w0.shape,
+                dtype=w0.dtype,
+                device=w0.device,
+                entropy=s,
+                tol=1e-6,
+                pool_size=24,
+                halfway_tree=True,
+            )
+            for s in seed
+        ]
+
+    @staticmethod
+    def sort(a, b):
+        return (a, b, 1) if a < b else (b, a, -1)
+
+    def __call__(self, t0, t1):
+        t0, t1, sign = self.sort(t0, t1)
+        w = torch.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign)
+        return w if self.batched else w[0]
+
+
+class BrownianTreeNoiseSampler:
+    """A noise sampler backed by a torchsde.BrownianTree.
+
+    Args:
+        x (Tensor): The tensor whose shape, device and dtype to use to generate
+            random samples.
+        sigma_min (float): The low end of the valid interval.
+        sigma_max (float): The high end of the valid interval.
+        seed (int or List[int]): The random seed. If a list of seeds is
+            supplied instead of a single integer, then the noise sampler will use one BrownianTree per batch item, each
+            with its own seed.
+        transform (callable): A function that maps sigma to the sampler's
+            internal timestep.
+    """
+
+    def __init__(self, x, sigma_min, sigma_max, seed=None, transform=lambda x: x):
+        self.transform = transform
+        t0, t1 = self.transform(torch.as_tensor(sigma_min)), self.transform(torch.as_tensor(sigma_max))
+        self.tree = BatchedBrownianTree(x, t0, t1, seed)
+
+    def __call__(self, sigma, sigma_next):
+        t0, t1 = self.transform(torch.as_tensor(sigma)), self.transform(torch.as_tensor(sigma_next))
+        return self.tree(t0, t1) / (t1 - t0).abs().sqrt()
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
+    """
+    DPMSolverSDEScheduler implements the stochastic sampler from the [Elucidating the Design Space of Diffusion-Based
+    Generative Models](https://huggingface.co/papers/2206.00364) paper.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.00085):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.012):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        noise_sampler_seed (`int`, *optional*, defaults to `None`):
+            The random seed to use for the noise sampler. If `None`, a random seed is generated.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 2
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.00085,  # sensible defaults
+        beta_end: float = 0.012,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        prediction_type: str = "epsilon",
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        noise_sampler_seed: Optional[int] = None,
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        #  set all values
+        self.set_timesteps(num_train_timesteps, None, num_train_timesteps)
+        self.use_karras_sigmas = use_karras_sigmas
+        self.noise_sampler = None
+        self.noise_sampler_seed = noise_sampler_seed
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    @property
+    def init_noise_sigma(self):
+        # standard deviation of the initial noise distribution
+        if self.config.timestep_spacing in ["linspace", "trailing"]:
+            return self.sigmas.max()
+
+        return (self.sigmas.max() ** 2 + 1) ** 0.5
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_model_input(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[float, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sigma_input = sigma if self.state_in_first_order else self.mid_point_sigma
+        sample = sample / ((sigma_input**2 + 1) ** 0.5)
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        device: Union[str, torch.device] = None,
+        num_train_timesteps: Optional[int] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(float)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        log_sigmas = np.log(sigmas)
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+
+        if self.config.use_karras_sigmas:
+            sigmas = self._convert_to_karras(in_sigmas=sigmas)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+
+        second_order_timesteps = self._second_order_timesteps(sigmas, log_sigmas)
+
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+        sigmas = torch.from_numpy(sigmas).to(device=device)
+        self.sigmas = torch.cat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]])
+
+        timesteps = torch.from_numpy(timesteps)
+        second_order_timesteps = torch.from_numpy(second_order_timesteps)
+        timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)])
+        timesteps[1::2] = second_order_timesteps
+
+        if str(device).startswith("mps"):
+            # mps does not support float64
+            self.timesteps = timesteps.to(device, dtype=torch.float32)
+        else:
+            self.timesteps = timesteps.to(device=device)
+
+        # empty first order variables
+        self.sample = None
+        self.mid_point_sigma = None
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+        self.noise_sampler = None
+
+    def _second_order_timesteps(self, sigmas, log_sigmas):
+        def sigma_fn(_t):
+            return np.exp(-_t)
+
+        def t_fn(_sigma):
+            return -np.log(_sigma)
+
+        midpoint_ratio = 0.5
+        t = t_fn(sigmas)
+        delta_time = np.diff(t)
+        t_proposed = t[:-1] + delta_time * midpoint_ratio
+        sig_proposed = sigma_fn(t_proposed)
+        timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sig_proposed])
+        return timesteps
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        sigma_min: float = in_sigmas[-1].item()
+        sigma_max: float = in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, self.num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    @property
+    def state_in_first_order(self):
+        return self.sample is None
+
+    def step(
+        self,
+        model_output: Union[torch.Tensor, np.ndarray],
+        timestep: Union[float, torch.Tensor],
+        sample: Union[torch.Tensor, np.ndarray],
+        return_dict: bool = True,
+        s_noise: float = 1.0,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor` or `np.ndarray`):
+                The direct output from learned diffusion model.
+            timestep (`float` or `torch.Tensor`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor` or `np.ndarray`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
+            s_noise (`float`, *optional*, defaults to 1.0):
+                Scaling factor for noise added to the sample.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Create a noise sampler if it hasn't been created yet
+        if self.noise_sampler is None:
+            min_sigma, max_sigma = self.sigmas[self.sigmas > 0].min(), self.sigmas.max()
+            self.noise_sampler = BrownianTreeNoiseSampler(sample, min_sigma, max_sigma, self.noise_sampler_seed)
+
+        # Define functions to compute sigma and t from each other
+        def sigma_fn(_t: torch.Tensor) -> torch.Tensor:
+            return _t.neg().exp()
+
+        def t_fn(_sigma: torch.Tensor) -> torch.Tensor:
+            return _sigma.log().neg()
+
+        if self.state_in_first_order:
+            sigma = self.sigmas[self.step_index]
+            sigma_next = self.sigmas[self.step_index + 1]
+        else:
+            # 2nd order
+            sigma = self.sigmas[self.step_index - 1]
+            sigma_next = self.sigmas[self.step_index]
+
+        # Set the midpoint and step size for the current step
+        midpoint_ratio = 0.5
+        t, t_next = t_fn(sigma), t_fn(sigma_next)
+        delta_time = t_next - t
+        t_proposed = t + delta_time * midpoint_ratio
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        if self.config.prediction_type == "epsilon":
+            sigma_input = sigma if self.state_in_first_order else sigma_fn(t_proposed)
+            pred_original_sample = sample - sigma_input * model_output
+        elif self.config.prediction_type == "v_prediction":
+            sigma_input = sigma if self.state_in_first_order else sigma_fn(t_proposed)
+            pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + (
+                sample / (sigma_input**2 + 1)
+            )
+        elif self.config.prediction_type == "sample":
+            raise NotImplementedError("prediction_type not implemented yet: sample")
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )
+
+        if sigma_next == 0:
+            derivative = (sample - pred_original_sample) / sigma
+            dt = sigma_next - sigma
+            prev_sample = sample + derivative * dt
+        else:
+            if self.state_in_first_order:
+                t_next = t_proposed
+            else:
+                sample = self.sample
+
+            sigma_from = sigma_fn(t)
+            sigma_to = sigma_fn(t_next)
+            sigma_up = min(sigma_to, (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5)
+            sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
+            ancestral_t = t_fn(sigma_down)
+            prev_sample = (sigma_fn(ancestral_t) / sigma_fn(t)) * sample - (
+                t - ancestral_t
+            ).expm1() * pred_original_sample
+            prev_sample = prev_sample + self.noise_sampler(sigma_fn(t), sigma_fn(t_next)) * s_noise * sigma_up
+
+            if self.state_in_first_order:
+                # store for 2nd order step
+                self.sample = sample
+                self.mid_point_sigma = sigma_fn(t_next)
+            else:
+                # free for "first order mode"
+                self.sample = None
+                self.mid_point_sigma = None
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
new file mode 100644
index 0000000000000000000000000000000000000000..353baf08e81dfe6b92fd62d73de4906bbfcf083d
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
@@ -0,0 +1,1081 @@
+# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import deprecate, logging
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `DPMSolverSinglestepScheduler` is a fast dedicated high-order solver for diffusion ODEs.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        solver_order (`int`, defaults to 2):
+            The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided
+            sampling, and `solver_order=3` for unconditional sampling.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++"`.
+        algorithm_type (`str`, defaults to `dpmsolver++`):
+            Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++` or `sde-dpmsolver++`. The `dpmsolver`
+            type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) paper, and the
+            `dpmsolver++` type implements the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095)
+            paper. It is recommended to use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided
+            sampling like in Stable Diffusion.
+        solver_type (`str`, defaults to `midpoint`):
+            Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
+            sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
+        lower_order_final (`bool`, defaults to `True`):
+            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
+            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        final_sigmas_type (`str`, *optional*, defaults to `"zero"`):
+            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+        lambda_min_clipped (`float`, defaults to `-inf`):
+            Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
+            cosine (`squaredcos_cap_v2`) noise schedule.
+        variance_type (`str`, *optional*):
+            Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
+            contains the predicted Gaussian variance.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[np.ndarray] = None,
+        solver_order: int = 2,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "dpmsolver++",
+        solver_type: str = "midpoint",
+        lower_order_final: bool = False,
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+        lambda_min_clipped: float = -float("inf"),
+        variance_type: Optional[str] = None,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if algorithm_type == "dpmsolver":
+            deprecation_message = "algorithm_type `dpmsolver` is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead"
+            deprecate("algorithm_types=dpmsolver", "1.0.0", deprecation_message)
+
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+        self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # settings for DPM-Solver
+        if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver++"]:
+            if algorithm_type == "deis":
+                self.register_to_config(algorithm_type="dpmsolver++")
+            else:
+                raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}")
+        if solver_type not in ["midpoint", "heun"]:
+            if solver_type in ["logrho", "bh1", "bh2"]:
+                self.register_to_config(solver_type="midpoint")
+            else:
+                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
+
+        if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"] and final_sigmas_type == "zero":
+            raise ValueError(
+                f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please chooose `sigma_min` instead."
+            )
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.model_outputs = [None] * solver_order
+        self.sample = None
+        self.order_list = self.get_order_list(num_train_timesteps)
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    def get_order_list(self, num_inference_steps: int) -> List[int]:
+        """
+        Computes the solver order at each time step.
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+        """
+        steps = num_inference_steps
+        order = self.config.solver_order
+        if order > 3:
+            raise ValueError("Order > 3 is not supported by this scheduler")
+        if self.config.lower_order_final:
+            if order == 3:
+                if steps % 3 == 0:
+                    orders = [1, 2, 3] * (steps // 3 - 1) + [1, 2] + [1]
+                elif steps % 3 == 1:
+                    orders = [1, 2, 3] * (steps // 3) + [1]
+                else:
+                    orders = [1, 2, 3] * (steps // 3) + [1, 2]
+            elif order == 2:
+                if steps % 2 == 0:
+                    orders = [1, 2] * (steps // 2 - 1) + [1, 1]
+                else:
+                    orders = [1, 2] * (steps // 2) + [1]
+            elif order == 1:
+                orders = [1] * steps
+        else:
+            if order == 3:
+                orders = [1, 2, 3] * (steps // 3)
+            elif order == 2:
+                orders = [1, 2] * (steps // 2)
+            elif order == 1:
+                orders = [1] * steps
+        return orders
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int = None,
+        device: Union[str, torch.device] = None,
+        timesteps: Optional[List[int]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of equal spacing between timesteps schedule is used. If `timesteps` is
+                passed, `num_inference_steps` must be `None`.
+        """
+        if num_inference_steps is None and timesteps is None:
+            raise ValueError("Must pass exactly one of  `num_inference_steps` or `timesteps`.")
+        if num_inference_steps is not None and timesteps is not None:
+            raise ValueError("Must pass exactly one of  `num_inference_steps` or `timesteps`.")
+        if timesteps is not None and self.config.use_karras_sigmas:
+            raise ValueError("Cannot use `timesteps` when `config.use_karras_sigmas=True`.")
+        if timesteps is not None and self.config.use_exponential_sigmas:
+            raise ValueError("Cannot set `timesteps` with `config.use_exponential_sigmas = True`.")
+
+        num_inference_steps = num_inference_steps or len(timesteps)
+        self.num_inference_steps = num_inference_steps
+
+        if timesteps is not None:
+            timesteps = np.array(timesteps).astype(np.int64)
+        else:
+            # Clipping the minimum of all lambda(t) for numerical stability.
+            # This is critical for cosine (squaredcos_cap_v2) noise schedule.
+            clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
+            timesteps = (
+                np.linspace(0, self.config.num_train_timesteps - 1 - clipped_idx, num_inference_steps + 1)
+                .round()[::-1][:-1]
+                .copy()
+                .astype(np.int64)
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        if self.config.use_karras_sigmas:
+            log_sigmas = np.log(sigmas)
+            sigmas = np.flip(sigmas).copy()
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+        else:
+            sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+
+        if self.config.final_sigmas_type == "sigma_min":
+            sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
+        elif self.config.final_sigmas_type == "zero":
+            sigma_last = 0
+        else:
+            raise ValueError(
+                f" `final_sigmas_type` must be one of `sigma_min` or `zero`, but got {self.config.final_sigmas_type}"
+            )
+        sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
+
+        self.sigmas = torch.from_numpy(sigmas).to(device=device)
+
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
+        self.model_outputs = [None] * self.config.solver_order
+        self.sample = None
+
+        if not self.config.lower_order_final and num_inference_steps % self.config.solver_order != 0:
+            logger.warning(
+                "Changing scheduler {self.config} to have `lower_order_final` set to True to handle uneven amount of inference steps. Please make sure to always use an even number of `num_inference steps when using `lower_order_final=False`."
+            )
+            self.register_to_config(lower_order_final=True)
+
+        if not self.config.lower_order_final and self.config.final_sigmas_type == "zero":
+            logger.warning(
+                " `last_sigmas_type='zero'` is not supported for `lower_order_final=False`. Changing scheduler {self.config} to have `lower_order_final` set to True."
+            )
+            self.register_to_config(lower_order_final=True)
+
+        self.order_list = self.get_order_list(num_inference_steps)
+
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
+        sigma_t = sigma * alpha_t
+
+        return alpha_t, sigma_t
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
+        designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
+        integral of the data prediction model.
+
+        <Tip>
+
+        The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
+        prediction and data prediction models.
+
+        </Tip>
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The converted model output.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError("missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+        # DPM-Solver++ needs to solve an integral of the data prediction model.
+        if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
+            if self.config.prediction_type == "epsilon":
+                # DPM-Solver and DPM-Solver++ only need the "mean" output.
+                if self.config.variance_type in ["learned", "learned_range"]:
+                    model_output = model_output[:, :3]
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            elif self.config.prediction_type == "sample":
+                x0_pred = model_output
+            elif self.config.prediction_type == "v_prediction":
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                x0_pred = alpha_t * sample - sigma_t * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the DPMSolverSinglestepScheduler."
+                )
+
+            if self.config.thresholding:
+                x0_pred = self._threshold_sample(x0_pred)
+
+            return x0_pred
+
+        # DPM-Solver needs to solve an integral of the noise prediction model.
+        elif self.config.algorithm_type == "dpmsolver":
+            if self.config.prediction_type == "epsilon":
+                # DPM-Solver and DPM-Solver++ only need the "mean" output.
+                if self.config.variance_type in ["learned", "learned_range"]:
+                    epsilon = model_output[:, :3]
+                else:
+                    epsilon = model_output
+            elif self.config.prediction_type == "sample":
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+            elif self.config.prediction_type == "v_prediction":
+                sigma = self.sigmas[self.step_index]
+                alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+                epsilon = alpha_t * model_output + sigma_t * sample
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the DPMSolverSinglestepScheduler."
+                )
+
+            if self.config.thresholding:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = (sample - sigma_t * epsilon) / alpha_t
+                x0_pred = self._threshold_sample(x0_pred)
+                epsilon = (sample - alpha_t * x0_pred) / sigma_t
+
+            return epsilon
+
+    def dpm_solver_first_order_update(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the first-order DPMSolver (equivalent to DDIM).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            prev_timestep (`int`):
+                The previous discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+        sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
+        h = lambda_t - lambda_s
+        if self.config.algorithm_type == "dpmsolver++":
+            x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
+        elif self.config.algorithm_type == "dpmsolver":
+            x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output
+        elif self.config.algorithm_type == "sde-dpmsolver++":
+            assert noise is not None
+            x_t = (
+                (sigma_t / sigma_s * torch.exp(-h)) * sample
+                + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
+                + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+            )
+        return x_t
+
+    def singlestep_dpm_solver_second_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the second-order singlestep DPMSolver that computes the solution at time `prev_timestep` from the
+        time `timestep_list[-2]`.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            timestep (`int`):
+                The current and latter discrete timestep in the diffusion chain.
+            prev_timestep (`int`):
+                The previous discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+        sigma_t, sigma_s0, sigma_s1 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+
+        h, h_0 = lambda_t - lambda_s1, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m1, (1.0 / r0) * (m0 - m1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s1) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s1) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+                )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (alpha_t / alpha_s1) * sample
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (alpha_t / alpha_s1) * sample
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+                )
+        elif self.config.algorithm_type == "sde-dpmsolver++":
+            assert noise is not None
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s1 * torch.exp(-h)) * sample
+                    + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+                    + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
+                    + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s1 * torch.exp(-h)) * sample
+                    + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+                    + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
+                    + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+                )
+        return x_t
+
+    def singlestep_dpm_solver_third_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the third-order singlestep DPMSolver that computes the solution at time `prev_timestep` from the
+        time `timestep_list[-3]`.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            timestep (`int`):
+                The current and latter discrete timestep in the diffusion chain.
+            prev_timestep (`int`):
+                The previous discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing`sample` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s0, sigma_s1, sigma_s2 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+            self.sigmas[self.step_index - 2],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+        alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+        lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2)
+
+        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+
+        h, h_0, h_1 = lambda_t - lambda_s2, lambda_s0 - lambda_s2, lambda_s1 - lambda_s2
+        r0, r1 = h_0 / h, h_1 / h
+        D0 = m2
+        D1_0, D1_1 = (1.0 / r1) * (m1 - m2), (1.0 / r0) * (m0 - m2)
+        D1 = (r0 * D1_0 - r1 * D1_1) / (r0 - r1)
+        D2 = 2.0 * (D1_1 - D1_0) / (r0 - r1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s2) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1_1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s2) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+                    - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+                )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (alpha_t / alpha_s2) * sample
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1_1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (alpha_t / alpha_s2) * sample
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+                    - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+                )
+        return x_t
+
+    def singlestep_dpm_solver_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        order: int = None,
+        noise: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the singlestep DPMSolver.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            timestep (`int`):
+                The current and latter discrete timestep in the diffusion chain.
+            prev_timestep (`int`):
+                The previous discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by diffusion process.
+            order (`int`):
+                The solver order at this step.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(" missing`sample` as a required keyward argument")
+        if order is None:
+            if len(args) > 3:
+                order = args[3]
+            else:
+                raise ValueError(" missing `order` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if order == 1:
+            return self.dpm_solver_first_order_update(model_output_list[-1], sample=sample, noise=noise)
+        elif order == 2:
+            return self.singlestep_dpm_solver_second_order_update(model_output_list, sample=sample, noise=noise)
+        elif order == 3:
+            return self.singlestep_dpm_solver_third_order_update(model_output_list, sample=sample)
+        else:
+            raise ValueError(f"Order must be 1, 2, 3, got {order}")
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        index_candidates = (schedule_timesteps == timestep).nonzero()
+
+        if len(index_candidates) == 0:
+            step_index = len(self.timesteps) - 1
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        elif len(index_candidates) > 1:
+            step_index = index_candidates[1].item()
+        else:
+            step_index = index_candidates[0].item()
+
+        return step_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        """
+        Initialize the step_index counter for the scheduler.
+        """
+
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        generator=None,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the singlestep DPMSolver.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        model_output = self.convert_model_output(model_output, sample=sample)
+        for i in range(self.config.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+        self.model_outputs[-1] = model_output
+
+        if self.config.algorithm_type == "sde-dpmsolver++":
+            noise = randn_tensor(
+                model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+            )
+        else:
+            noise = None
+
+        order = self.order_list[self.step_index]
+
+        #  For img2img denoising might start with order>1 which is not possible
+        #  In this case make sure that the first two steps are both order=1
+        while self.model_outputs[-order] is None:
+            order -= 1
+
+        # For single-step solvers, we use the initial value at each time with order = 1.
+        if order == 1:
+            self.sample = sample
+
+        prev_sample = self.singlestep_dpm_solver_update(
+            self.model_outputs, sample=self.sample, order=order, noise=noise
+        )
+
+        # upon completion increase step index by one, noise=noise
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        noisy_samples = alpha_t * original_samples + sigma_t * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/diffusers/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
new file mode 100644
index 0000000000000000000000000000000000000000..c49e8e9a191a85040250a428c817b7201433ec87
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
@@ -0,0 +1,707 @@
+# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver and https://github.com/NVlabs/edm
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin, SchedulerOutput
+
+
+class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Implements DPMSolverMultistepScheduler in EDM formulation as presented in Karras et al. 2022 [1].
+    `EDMDPMSolverMultistepScheduler` is a fast dedicated high-order solver for diffusion ODEs.
+
+    [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models."
+    https://arxiv.org/abs/2206.00364
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        sigma_min (`float`, *optional*, defaults to 0.002):
+            Minimum noise magnitude in the sigma schedule. This was set to 0.002 in the EDM paper [1]; a reasonable
+            range is [0, 10].
+        sigma_max (`float`, *optional*, defaults to 80.0):
+            Maximum noise magnitude in the sigma schedule. This was set to 80.0 in the EDM paper [1]; a reasonable
+            range is [0.2, 80.0].
+        sigma_data (`float`, *optional*, defaults to 0.5):
+            The standard deviation of the data distribution. This is set to 0.5 in the EDM paper [1].
+        sigma_schedule (`str`, *optional*, defaults to `karras`):
+            Sigma schedule to compute the `sigmas`. By default, we the schedule introduced in the EDM paper
+            (https://arxiv.org/abs/2206.00364). Other acceptable value is "exponential". The exponential schedule was
+            incorporated in this model: https://huggingface.co/stabilityai/cosxl.
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        solver_order (`int`, defaults to 2):
+            The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided
+            sampling, and `solver_order=3` for unconditional sampling.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++"`.
+        algorithm_type (`str`, defaults to `dpmsolver++`):
+            Algorithm type for the solver; can be `dpmsolver++` or `sde-dpmsolver++`. The `dpmsolver++` type implements
+            the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to
+            use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
+        solver_type (`str`, defaults to `midpoint`):
+            Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
+            sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
+        lower_order_final (`bool`, defaults to `True`):
+            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
+            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
+        euler_at_final (`bool`, defaults to `False`):
+            Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail
+            richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
+            steps, but sometimes may result in blurring.
+        final_sigmas_type (`str`, defaults to `"zero"`):
+            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+    """
+
+    _compatibles = []
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        sigma_min: float = 0.002,
+        sigma_max: float = 80.0,
+        sigma_data: float = 0.5,
+        sigma_schedule: str = "karras",
+        num_train_timesteps: int = 1000,
+        prediction_type: str = "epsilon",
+        rho: float = 7.0,
+        solver_order: int = 2,
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "dpmsolver++",
+        solver_type: str = "midpoint",
+        lower_order_final: bool = True,
+        euler_at_final: bool = False,
+        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+    ):
+        # settings for DPM-Solver
+        if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"]:
+            if algorithm_type == "deis":
+                self.register_to_config(algorithm_type="dpmsolver++")
+            else:
+                raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}")
+
+        if solver_type not in ["midpoint", "heun"]:
+            if solver_type in ["logrho", "bh1", "bh2"]:
+                self.register_to_config(solver_type="midpoint")
+            else:
+                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
+
+        if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"] and final_sigmas_type == "zero":
+            raise ValueError(
+                f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please choose `sigma_min` instead."
+            )
+
+        ramp = torch.linspace(0, 1, num_train_timesteps)
+        if sigma_schedule == "karras":
+            sigmas = self._compute_karras_sigmas(ramp)
+        elif sigma_schedule == "exponential":
+            sigmas = self._compute_exponential_sigmas(ramp)
+
+        self.timesteps = self.precondition_noise(sigmas)
+
+        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+
+        # setable values
+        self.num_inference_steps = None
+        self.model_outputs = [None] * solver_order
+        self.lower_order_nums = 0
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def init_noise_sigma(self):
+        # standard deviation of the initial noise distribution
+        return (self.config.sigma_max**2 + 1) ** 0.5
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_inputs
+    def precondition_inputs(self, sample, sigma):
+        c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
+        scaled_sample = sample * c_in
+        return scaled_sample
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_noise
+    def precondition_noise(self, sigma):
+        if not isinstance(sigma, torch.Tensor):
+            sigma = torch.tensor([sigma])
+
+        c_noise = 0.25 * torch.log(sigma)
+
+        return c_noise
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_outputs
+    def precondition_outputs(self, sample, model_output, sigma):
+        sigma_data = self.config.sigma_data
+        c_skip = sigma_data**2 / (sigma**2 + sigma_data**2)
+
+        if self.config.prediction_type == "epsilon":
+            c_out = sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5
+        elif self.config.prediction_type == "v_prediction":
+            c_out = -sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5
+        else:
+            raise ValueError(f"Prediction type {self.config.prediction_type} is not supported.")
+
+        denoised = c_skip * sample + c_out * model_output
+
+        return denoised
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.scale_model_input
+    def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sample = self.precondition_inputs(sample, sigma)
+
+        self.is_scale_input_called = True
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+
+        self.num_inference_steps = num_inference_steps
+
+        ramp = torch.linspace(0, 1, self.num_inference_steps)
+        if self.config.sigma_schedule == "karras":
+            sigmas = self._compute_karras_sigmas(ramp)
+        elif self.config.sigma_schedule == "exponential":
+            sigmas = self._compute_exponential_sigmas(ramp)
+
+        sigmas = sigmas.to(dtype=torch.float32, device=device)
+        self.timesteps = self.precondition_noise(sigmas)
+
+        if self.config.final_sigmas_type == "sigma_min":
+            sigma_last = self.config.sigma_min
+        elif self.config.final_sigmas_type == "zero":
+            sigma_last = 0
+        else:
+            raise ValueError(
+                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+            )
+
+        self.sigmas = torch.cat([sigmas, torch.tensor([sigma_last], dtype=torch.float32, device=device)])
+
+        self.model_outputs = [
+            None,
+        ] * self.config.solver_order
+        self.lower_order_nums = 0
+
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas
+    def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+        sigma_min = sigma_min or self.config.sigma_min
+        sigma_max = sigma_max or self.config.sigma_max
+
+        rho = self.config.rho
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas
+    def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor:
+        """Implementation closely follows k-diffusion.
+
+        https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26
+        """
+        sigma_min = sigma_min or self.config.sigma_min
+        sigma_max = sigma_max or self.config.sigma_max
+        sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0)
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        alpha_t = torch.tensor(1)  # Inputs are pre-scaled before going into unet, so alpha_t = 1
+        sigma_t = sigma
+
+        return alpha_t, sigma_t
+
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        sample: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
+        designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
+        integral of the data prediction model.
+
+        <Tip>
+
+        The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
+        prediction and data prediction models.
+
+        </Tip>
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The converted model output.
+        """
+        sigma = self.sigmas[self.step_index]
+        x0_pred = self.precondition_outputs(sample, model_output, sigma)
+
+        if self.config.thresholding:
+            x0_pred = self._threshold_sample(x0_pred)
+
+        return x0_pred
+
+    def dpm_solver_first_order_update(
+        self,
+        model_output: torch.Tensor,
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        One step for the first-order DPMSolver (equivalent to DDIM).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
+
+        h = lambda_t - lambda_s
+        if self.config.algorithm_type == "dpmsolver++":
+            x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
+        elif self.config.algorithm_type == "sde-dpmsolver++":
+            assert noise is not None
+            x_t = (
+                (sigma_t / sigma_s * torch.exp(-h)) * sample
+                + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
+                + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+            )
+
+        return x_t
+
+    def multistep_dpm_solver_second_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        One step for the second-order multistep DPMSolver.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        sigma_t, sigma_s0, sigma_s1 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+                )
+        elif self.config.algorithm_type == "sde-dpmsolver++":
+            assert noise is not None
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+                    + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+                    + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
+                    + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+                    + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+                    + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
+                    + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+                )
+
+        return x_t
+
+    def multistep_dpm_solver_third_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        sample: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        One step for the third-order multistep DPMSolver.
+
+        Args:
+            model_output_list (`List[torch.Tensor]`):
+                The direct outputs from learned diffusion model at current and latter timesteps.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        sigma_t, sigma_s0, sigma_s1, sigma_s2 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+            self.sigmas[self.step_index - 2],
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+        alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+        lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2)
+
+        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+
+        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+        r0, r1 = h_0 / h, h_1 / h
+        D0 = m0
+        D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (sigma_t / sigma_s0) * sample
+                - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+                - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+            )
+
+        return x_t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        index_candidates = (schedule_timesteps == timestep).nonzero()
+
+        if len(index_candidates) == 0:
+            step_index = len(self.timesteps) - 1
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        elif len(index_candidates) > 1:
+            step_index = index_candidates[1].item()
+        else:
+            step_index = index_candidates[0].item()
+
+        return step_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        """
+        Initialize the step_index counter for the scheduler.
+        """
+
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        generator=None,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the multistep DPMSolver.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Improve numerical stability for small number of steps
+        lower_order_final = (self.step_index == len(self.timesteps) - 1) and (
+            self.config.euler_at_final
+            or (self.config.lower_order_final and len(self.timesteps) < 15)
+            or self.config.final_sigmas_type == "zero"
+        )
+        lower_order_second = (
+            (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
+        )
+
+        model_output = self.convert_model_output(model_output, sample=sample)
+        for i in range(self.config.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+        self.model_outputs[-1] = model_output
+
+        if self.config.algorithm_type == "sde-dpmsolver++":
+            noise = randn_tensor(
+                model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+            )
+        else:
+            noise = None
+
+        if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+            prev_sample = self.dpm_solver_first_order_update(model_output, sample=sample, noise=noise)
+        elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+            prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise)
+        else:
+            prev_sample = self.multistep_dpm_solver_third_order_update(self.model_outputs, sample=sample)
+
+        if self.lower_order_nums < self.config.solver_order:
+            self.lower_order_nums += 1
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_edm_euler.py b/diffusers/src/diffusers/schedulers/scheduling_edm_euler.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b823c0d281b3eba295072c5cc4b48e2e90b12c0
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_edm_euler.py
@@ -0,0 +1,402 @@
+# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerDiscrete
+class EDMEulerSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Implements the Euler scheduler in EDM formulation as presented in Karras et al. 2022 [1].
+
+    [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models."
+    https://arxiv.org/abs/2206.00364
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        sigma_min (`float`, *optional*, defaults to 0.002):
+            Minimum noise magnitude in the sigma schedule. This was set to 0.002 in the EDM paper [1]; a reasonable
+            range is [0, 10].
+        sigma_max (`float`, *optional*, defaults to 80.0):
+            Maximum noise magnitude in the sigma schedule. This was set to 80.0 in the EDM paper [1]; a reasonable
+            range is [0.2, 80.0].
+        sigma_data (`float`, *optional*, defaults to 0.5):
+            The standard deviation of the data distribution. This is set to 0.5 in the EDM paper [1].
+        sigma_schedule (`str`, *optional*, defaults to `karras`):
+            Sigma schedule to compute the `sigmas`. By default, we the schedule introduced in the EDM paper
+            (https://arxiv.org/abs/2206.00364). Other acceptable value is "exponential". The exponential schedule was
+            incorporated in this model: https://huggingface.co/stabilityai/cosxl.
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        rho (`float`, *optional*, defaults to 7.0):
+            The rho parameter used for calculating the Karras sigma schedule, which is set to 7.0 in the EDM paper [1].
+    """
+
+    _compatibles = []
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        sigma_min: float = 0.002,
+        sigma_max: float = 80.0,
+        sigma_data: float = 0.5,
+        sigma_schedule: str = "karras",
+        num_train_timesteps: int = 1000,
+        prediction_type: str = "epsilon",
+        rho: float = 7.0,
+    ):
+        if sigma_schedule not in ["karras", "exponential"]:
+            raise ValueError(f"Wrong value for provided for `{sigma_schedule=}`.`")
+
+        # setable values
+        self.num_inference_steps = None
+
+        ramp = torch.linspace(0, 1, num_train_timesteps)
+        if sigma_schedule == "karras":
+            sigmas = self._compute_karras_sigmas(ramp)
+        elif sigma_schedule == "exponential":
+            sigmas = self._compute_exponential_sigmas(ramp)
+
+        self.timesteps = self.precondition_noise(sigmas)
+
+        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+
+        self.is_scale_input_called = False
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def init_noise_sigma(self):
+        # standard deviation of the initial noise distribution
+        return (self.config.sigma_max**2 + 1) ** 0.5
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def precondition_inputs(self, sample, sigma):
+        c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
+        scaled_sample = sample * c_in
+        return scaled_sample
+
+    def precondition_noise(self, sigma):
+        if not isinstance(sigma, torch.Tensor):
+            sigma = torch.tensor([sigma])
+
+        c_noise = 0.25 * torch.log(sigma)
+
+        return c_noise
+
+    def precondition_outputs(self, sample, model_output, sigma):
+        sigma_data = self.config.sigma_data
+        c_skip = sigma_data**2 / (sigma**2 + sigma_data**2)
+
+        if self.config.prediction_type == "epsilon":
+            c_out = sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5
+        elif self.config.prediction_type == "v_prediction":
+            c_out = -sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5
+        else:
+            raise ValueError(f"Prediction type {self.config.prediction_type} is not supported.")
+
+        denoised = c_skip * sample + c_out * model_output
+
+        return denoised
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sample = self.precondition_inputs(sample, sigma)
+
+        self.is_scale_input_called = True
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        ramp = torch.linspace(0, 1, self.num_inference_steps)
+        if self.config.sigma_schedule == "karras":
+            sigmas = self._compute_karras_sigmas(ramp)
+        elif self.config.sigma_schedule == "exponential":
+            sigmas = self._compute_exponential_sigmas(ramp)
+
+        sigmas = sigmas.to(dtype=torch.float32, device=device)
+        self.timesteps = self.precondition_noise(sigmas)
+
+        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
+    def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+        sigma_min = sigma_min or self.config.sigma_min
+        sigma_max = sigma_max or self.config.sigma_max
+
+        rho = self.config.rho
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor:
+        """Implementation closely follows k-diffusion.
+
+        https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26
+        """
+        sigma_min = sigma_min or self.config.sigma_min
+        sigma_max = sigma_max or self.config.sigma_max
+        sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0)
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[float, torch.Tensor],
+        sample: torch.Tensor,
+        s_churn: float = 0.0,
+        s_tmin: float = 0.0,
+        s_tmax: float = float("inf"),
+        s_noise: float = 1.0,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[EDMEulerSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            s_churn (`float`):
+            s_tmin  (`float`):
+            s_tmax  (`float`):
+            s_noise (`float`, defaults to 1.0):
+                Scaling factor for noise added to the sample.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or tuple.
+
+        Returns:
+            [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+
+        if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `EDMEulerScheduler.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+
+        if not self.is_scale_input_called:
+            logger.warning(
+                "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+                "See `StableDiffusionPipeline` for a usage example."
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+
+        sigma = self.sigmas[self.step_index]
+
+        gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
+
+        noise = randn_tensor(
+            model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator
+        )
+
+        eps = noise * s_noise
+        sigma_hat = sigma * (gamma + 1)
+
+        if gamma > 0:
+            sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        pred_original_sample = self.precondition_outputs(sample, model_output, sigma_hat)
+
+        # 2. Convert to an ODE derivative
+        derivative = (sample - pred_original_sample) / sigma_hat
+
+        dt = self.sigmas[self.step_index + 1] - sigma_hat
+
+        prev_sample = sample + derivative * dt
+
+        # Cast sample back to model compatible dtype
+        prev_sample = prev_sample.to(model_output.dtype)
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return EDMEulerSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/diffusers/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..485e919e9cc548f0a9be6b9ddc04c210a22545cb
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -0,0 +1,479 @@
+# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerAncestralDiscrete
+class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Ancestral sampling with Euler method steps.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        prediction_type: str = "epsilon",
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        if rescale_betas_zero_snr:
+            # Close to 0 without being 0 so first sigma is not inf
+            # FP16 smallest positive subnormal works well here
+            self.alphas_cumprod[-1] = 2**-24
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas)
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.is_scale_input_called = False
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def init_noise_sigma(self):
+        # standard deviation of the initial noise distribution
+        if self.config.timestep_spacing in ["linspace", "trailing"]:
+            return self.sigmas.max()
+
+        return (self.sigmas.max() ** 2 + 1) ** 0.5
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sample = sample / ((sigma**2 + 1) ** 0.5)
+        self.is_scale_input_called = True
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[
+                ::-1
+            ].copy()
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas).to(device=device)
+
+        self.timesteps = torch.from_numpy(timesteps).to(device=device)
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[float, torch.Tensor],
+        sample: torch.Tensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a
+                [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
+
+        Returns:
+            [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
+                If return_dict is `True`,
+                [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned,
+                otherwise a tuple is returned where the first element is the sample tensor.
+
+        """
+
+        if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+
+        if not self.is_scale_input_called:
+            logger.warning(
+                "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+                "See `StableDiffusionPipeline` for a usage example."
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = sample - sigma * model_output
+        elif self.config.prediction_type == "v_prediction":
+            # * c_out + input * c_skip
+            pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
+        elif self.config.prediction_type == "sample":
+            raise NotImplementedError("prediction_type not implemented yet: sample")
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )
+
+        sigma_from = self.sigmas[self.step_index]
+        sigma_to = self.sigmas[self.step_index + 1]
+        sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
+        sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
+
+        # 2. Convert to an ODE derivative
+        derivative = (sample - pred_original_sample) / sigma
+
+        dt = sigma_down - sigma
+
+        prev_sample = sample + derivative * dt
+
+        device = model_output.device
+        noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator)
+
+        prev_sample = prev_sample + noise * sigma_up
+
+        # Cast sample back to model compatible dtype
+        prev_sample = prev_sample.to(model_output.dtype)
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return EulerAncestralDiscreteSchedulerOutput(
+            prev_sample=prev_sample, pred_original_sample=pred_original_sample
+        )
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_euler_discrete.py b/diffusers/src/diffusers/schedulers/scheduling_euler_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c39583356ad50aca04f44483eb478dff8e9c0e4
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -0,0 +1,755 @@
+# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, is_scipy_available, logging
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+
+if is_scipy_available():
+    import scipy.stats
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerDiscrete
+class EulerDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Euler scheduler.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        interpolation_type(`str`, defaults to `"linear"`, *optional*):
+            The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be on of
+            `"linear"` or `"log_linear"`.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        use_beta_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
+            Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+        final_sigmas_type (`str`, defaults to `"zero"`):
+            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        prediction_type: str = "epsilon",
+        interpolation_type: str = "linear",
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        use_beta_sigmas: Optional[bool] = False,
+        sigma_min: Optional[float] = None,
+        sigma_max: Optional[float] = None,
+        timestep_spacing: str = "linspace",
+        timestep_type: str = "discrete",  # can be "discrete" or "continuous"
+        steps_offset: int = 0,
+        rescale_betas_zero_snr: bool = False,
+        final_sigmas_type: str = "zero",  # can be "zero" or "sigma_min"
+    ):
+        if self.config.use_beta_sigmas and not is_scipy_available():
+            raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
+        if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError(
+                "Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used."
+            )
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        if rescale_betas_zero_snr:
+            # Close to 0 without being 0 so first sigma is not inf
+            # FP16 smallest positive subnormal works well here
+            self.alphas_cumprod[-1] = 2**-24
+
+        sigmas = (((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5).flip(0)
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
+        timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
+
+        # setable values
+        self.num_inference_steps = None
+
+        # TODO: Support the full EDM scalings for all prediction types and timestep types
+        if timestep_type == "continuous" and prediction_type == "v_prediction":
+            self.timesteps = torch.Tensor([0.25 * sigma.log() for sigma in sigmas])
+        else:
+            self.timesteps = timesteps
+
+        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+
+        self.is_scale_input_called = False
+        self.use_karras_sigmas = use_karras_sigmas
+        self.use_exponential_sigmas = use_exponential_sigmas
+        self.use_beta_sigmas = use_beta_sigmas
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def init_noise_sigma(self):
+        # standard deviation of the initial noise distribution
+        max_sigma = max(self.sigmas) if isinstance(self.sigmas, list) else self.sigmas.max()
+        if self.config.timestep_spacing in ["linspace", "trailing"]:
+            return max_sigma
+
+        return (max_sigma**2 + 1) ** 0.5
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sample = sample / ((sigma**2 + 1) ** 0.5)
+
+        self.is_scale_input_called = True
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int = None,
+        device: Union[str, torch.device] = None,
+        timesteps: Optional[List[int]] = None,
+        sigmas: Optional[List[float]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary timesteps schedule. If `None`, timesteps will be generated
+                based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas`
+                must be `None`, and `timestep_spacing` attribute will be ignored.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas used to support arbitrary timesteps schedule schedule. If `None`, timesteps and sigmas
+                will be generated based on the relevant scheduler attributes. If `sigmas` is passed,
+                `num_inference_steps` and `timesteps` must be `None`, and the timesteps will be generated based on the
+                custom sigmas schedule.
+        """
+
+        if timesteps is not None and sigmas is not None:
+            raise ValueError("Only one of `timesteps` or `sigmas` should be set.")
+        if num_inference_steps is None and timesteps is None and sigmas is None:
+            raise ValueError("Must pass exactly one of `num_inference_steps` or `timesteps` or `sigmas.")
+        if num_inference_steps is not None and (timesteps is not None or sigmas is not None):
+            raise ValueError("Can only pass one of `num_inference_steps` or `timesteps` or `sigmas`.")
+        if timesteps is not None and self.config.use_karras_sigmas:
+            raise ValueError("Cannot set `timesteps` with `config.use_karras_sigmas = True`.")
+        if timesteps is not None and self.config.use_exponential_sigmas:
+            raise ValueError("Cannot set `timesteps` with `config.use_exponential_sigmas = True`.")
+        if timesteps is not None and self.config.use_beta_sigmas:
+            raise ValueError("Cannot set `timesteps` with `config.use_beta_sigmas = True`.")
+        if (
+            timesteps is not None
+            and self.config.timestep_type == "continuous"
+            and self.config.prediction_type == "v_prediction"
+        ):
+            raise ValueError(
+                "Cannot set `timesteps` with `config.timestep_type = 'continuous'` and `config.prediction_type = 'v_prediction'`."
+            )
+
+        if num_inference_steps is None:
+            num_inference_steps = len(timesteps) if timesteps is not None else len(sigmas) - 1
+        self.num_inference_steps = num_inference_steps
+
+        if sigmas is not None:
+            log_sigmas = np.log(np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5))
+            sigmas = np.array(sigmas).astype(np.float32)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas[:-1]])
+
+        else:
+            if timesteps is not None:
+                timesteps = np.array(timesteps).astype(np.float32)
+            else:
+                # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+                if self.config.timestep_spacing == "linspace":
+                    timesteps = np.linspace(
+                        0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=np.float32
+                    )[::-1].copy()
+                elif self.config.timestep_spacing == "leading":
+                    step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+                    # creates integer timesteps by multiplying by ratio
+                    # casting to int to avoid issues when num_inference_step is power of 3
+                    timesteps = (
+                        (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32)
+                    )
+                    timesteps += self.config.steps_offset
+                elif self.config.timestep_spacing == "trailing":
+                    step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+                    # creates integer timesteps by multiplying by ratio
+                    # casting to int to avoid issues when num_inference_step is power of 3
+                    timesteps = (
+                        (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32)
+                    )
+                    timesteps -= 1
+                else:
+                    raise ValueError(
+                        f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+                    )
+
+            sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+            log_sigmas = np.log(sigmas)
+            if self.config.interpolation_type == "linear":
+                sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+            elif self.config.interpolation_type == "log_linear":
+                sigmas = torch.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp().numpy()
+            else:
+                raise ValueError(
+                    f"{self.config.interpolation_type} is not implemented. Please specify interpolation_type to either"
+                    " 'linear' or 'log_linear'"
+                )
+
+            if self.config.use_karras_sigmas:
+                sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+                timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+
+            elif self.config.use_exponential_sigmas:
+                sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+                timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+
+            elif self.config.use_beta_sigmas:
+                sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+                timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+
+            if self.config.final_sigmas_type == "sigma_min":
+                sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
+            elif self.config.final_sigmas_type == "zero":
+                sigma_last = 0
+            else:
+                raise ValueError(
+                    f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+                )
+
+            sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
+
+        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
+
+        # TODO: Support the full EDM scalings for all prediction types and timestep types
+        if self.config.timestep_type == "continuous" and self.config.prediction_type == "v_prediction":
+            self.timesteps = torch.Tensor([0.25 * sigma.log() for sigma in sigmas[:-1]]).to(device=device)
+        else:
+            self.timesteps = torch.from_numpy(timesteps.astype(np.float32)).to(device=device)
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L26
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    def _convert_to_beta(
+        self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
+    ) -> torch.Tensor:
+        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.Tensor(
+            [
+                sigma_min + (ppf * (sigma_max - sigma_min))
+                for ppf in [
+                    scipy.stats.beta.ppf(timestep, alpha, beta)
+                    for timestep in 1 - np.linspace(0, 1, num_inference_steps)
+                ]
+            ]
+        )
+        return sigmas
+
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[float, torch.Tensor],
+        sample: torch.Tensor,
+        s_churn: float = 0.0,
+        s_tmin: float = 0.0,
+        s_tmax: float = float("inf"),
+        s_noise: float = 1.0,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[EulerDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            s_churn (`float`):
+            s_tmin  (`float`):
+            s_tmax  (`float`):
+            s_noise (`float`, defaults to 1.0):
+                Scaling factor for noise added to the sample.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
+                tuple.
+
+        Returns:
+            [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+
+        if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+
+        if not self.is_scale_input_called:
+            logger.warning(
+                "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+                "See `StableDiffusionPipeline` for a usage example."
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+
+        sigma = self.sigmas[self.step_index]
+
+        gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
+
+        noise = randn_tensor(
+            model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator
+        )
+
+        eps = noise * s_noise
+        sigma_hat = sigma * (gamma + 1)
+
+        if gamma > 0:
+            sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        # NOTE: "original_sample" should not be an expected prediction_type but is left in for
+        # backwards compatibility
+        if self.config.prediction_type == "original_sample" or self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.config.prediction_type == "epsilon":
+            pred_original_sample = sample - sigma_hat * model_output
+        elif self.config.prediction_type == "v_prediction":
+            # denoised = model_output * c_out + input * c_skip
+            pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )
+
+        # 2. Convert to an ODE derivative
+        derivative = (sample - pred_original_sample) / sigma_hat
+
+        dt = self.sigmas[self.step_index + 1] - sigma_hat
+
+        prev_sample = sample + derivative * dt
+
+        # Cast sample back to model compatible dtype
+        prev_sample = prev_sample.to(model_output.dtype)
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
+        if (
+            isinstance(timesteps, int)
+            or isinstance(timesteps, torch.IntTensor)
+            or isinstance(timesteps, torch.LongTensor)
+        ):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `EulerDiscreteScheduler.get_velocity()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+
+        if sample.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32)
+            timesteps = timesteps.to(sample.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(sample.device)
+            timesteps = timesteps.to(sample.device)
+
+        step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        alphas_cumprod = self.alphas_cumprod.to(sample)
+        sqrt_alpha_prod = alphas_cumprod[step_indices] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[step_indices]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_euler_discrete_flax.py b/diffusers/src/diffusers/schedulers/scheduling_euler_discrete_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..55b0c2460a81d1d9ca01fa93a0f0061066a7fe63
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_euler_discrete_flax.py
@@ -0,0 +1,265 @@
+# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax.numpy as jnp
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import (
+    CommonSchedulerState,
+    FlaxKarrasDiffusionSchedulers,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    broadcast_to_shape_from_left,
+)
+
+
+@flax.struct.dataclass
+class EulerDiscreteSchedulerState:
+    common: CommonSchedulerState
+
+    # setable values
+    init_noise_sigma: jnp.ndarray
+    timesteps: jnp.ndarray
+    sigmas: jnp.ndarray
+    num_inference_steps: Optional[int] = None
+
+    @classmethod
+    def create(
+        cls, common: CommonSchedulerState, init_noise_sigma: jnp.ndarray, timesteps: jnp.ndarray, sigmas: jnp.ndarray
+    ):
+        return cls(common=common, init_noise_sigma=init_noise_sigma, timesteps=timesteps, sigmas=sigmas)
+
+
+@dataclass
+class FlaxEulerDiscreteSchedulerOutput(FlaxSchedulerOutput):
+    state: EulerDiscreteSchedulerState
+
+
+class FlaxEulerDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
+    """
+    Euler scheduler (Algorithm 2) from Karras et al. (2022) https://arxiv.org/abs/2206.00364. . Based on the original
+    k-diffusion implementation by Katherine Crowson:
+    https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51
+
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`jnp.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        prediction_type (`str`, default `epsilon`, optional):
+            prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+            process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
+            https://imagen.research.google/video/paper.pdf)
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
+    """
+
+    _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers]
+
+    dtype: jnp.dtype
+
+    @property
+    def has_state(self):
+        return True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[jnp.ndarray] = None,
+        prediction_type: str = "epsilon",
+        timestep_spacing: str = "linspace",
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> EulerDiscreteSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+        sigmas = ((1 - common.alphas_cumprod) / common.alphas_cumprod) ** 0.5
+        sigmas = jnp.interp(timesteps, jnp.arange(0, len(sigmas)), sigmas)
+        sigmas = jnp.concatenate([sigmas, jnp.array([0.0], dtype=self.dtype)])
+
+        # standard deviation of the initial noise distribution
+        if self.config.timestep_spacing in ["linspace", "trailing"]:
+            init_noise_sigma = sigmas.max()
+        else:
+            init_noise_sigma = (sigmas.max() ** 2 + 1) ** 0.5
+
+        return EulerDiscreteSchedulerState.create(
+            common=common,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+            sigmas=sigmas,
+        )
+
+    def scale_model_input(self, state: EulerDiscreteSchedulerState, sample: jnp.ndarray, timestep: int) -> jnp.ndarray:
+        """
+        Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+        Args:
+            state (`EulerDiscreteSchedulerState`):
+                the `FlaxEulerDiscreteScheduler` state data class instance.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            timestep (`int`):
+                current discrete timestep in the diffusion chain.
+
+        Returns:
+            `jnp.ndarray`: scaled input sample
+        """
+        (step_index,) = jnp.where(state.timesteps == timestep, size=1)
+        step_index = step_index[0]
+
+        sigma = state.sigmas[step_index]
+        sample = sample / ((sigma**2 + 1) ** 0.5)
+        return sample
+
+    def set_timesteps(
+        self, state: EulerDiscreteSchedulerState, num_inference_steps: int, shape: Tuple = ()
+    ) -> EulerDiscreteSchedulerState:
+        """
+        Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            state (`EulerDiscreteSchedulerState`):
+                the `FlaxEulerDiscreteScheduler` state data class instance.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+        """
+
+        if self.config.timestep_spacing == "linspace":
+            timesteps = jnp.linspace(self.config.num_train_timesteps - 1, 0, num_inference_steps, dtype=self.dtype)
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // num_inference_steps
+            timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float)
+            timesteps += 1
+        else:
+            raise ValueError(
+                f"timestep_spacing must be one of ['linspace', 'leading'], got {self.config.timestep_spacing}"
+            )
+
+        sigmas = ((1 - state.common.alphas_cumprod) / state.common.alphas_cumprod) ** 0.5
+        sigmas = jnp.interp(timesteps, jnp.arange(0, len(sigmas)), sigmas)
+        sigmas = jnp.concatenate([sigmas, jnp.array([0.0], dtype=self.dtype)])
+
+        # standard deviation of the initial noise distribution
+        if self.config.timestep_spacing in ["linspace", "trailing"]:
+            init_noise_sigma = sigmas.max()
+        else:
+            init_noise_sigma = (sigmas.max() ** 2 + 1) ** 0.5
+
+        return state.replace(
+            timesteps=timesteps,
+            sigmas=sigmas,
+            num_inference_steps=num_inference_steps,
+            init_noise_sigma=init_noise_sigma,
+        )
+
+    def step(
+        self,
+        state: EulerDiscreteSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+        return_dict: bool = True,
+    ) -> Union[FlaxEulerDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            state (`EulerDiscreteSchedulerState`):
+                the `FlaxEulerDiscreteScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            order: coefficient for multi-step inference.
+            return_dict (`bool`): option for returning tuple rather than FlaxEulerDiscreteScheduler class
+
+        Returns:
+            [`FlaxEulerDiscreteScheduler`] or `tuple`: [`FlaxEulerDiscreteScheduler`] if `return_dict` is True,
+            otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+        if state.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        (step_index,) = jnp.where(state.timesteps == timestep, size=1)
+        step_index = step_index[0]
+
+        sigma = state.sigmas[step_index]
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = sample - sigma * model_output
+        elif self.config.prediction_type == "v_prediction":
+            # * c_out + input * c_skip
+            pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )
+
+        # 2. Convert to an ODE derivative
+        derivative = (sample - pred_original_sample) / sigma
+
+        # dt = sigma_down - sigma
+        dt = state.sigmas[step_index + 1] - sigma
+
+        prev_sample = sample + derivative * dt
+
+        if not return_dict:
+            return (prev_sample, state)
+
+        return FlaxEulerDiscreteSchedulerOutput(prev_sample=prev_sample, state=state)
+
+    def add_noise(
+        self,
+        state: EulerDiscreteSchedulerState,
+        original_samples: jnp.ndarray,
+        noise: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ) -> jnp.ndarray:
+        sigma = state.sigmas[timesteps].flatten()
+        sigma = broadcast_to_shape_from_left(sigma, noise.shape)
+
+        noisy_samples = original_samples + noise * sigma
+
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/diffusers/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..937cae2e47f5af731de7b4af1929aff29cd29a54
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
@@ -0,0 +1,311 @@
+# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: torch.FloatTensor
+
+
+class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Euler scheduler.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        shift (`float`, defaults to 1.0):
+            The shift value for the timestep schedule.
+    """
+
+    _compatibles = []
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        shift: float = 1.0,
+        use_dynamic_shifting=False,
+        base_shift: Optional[float] = 0.5,
+        max_shift: Optional[float] = 1.15,
+        base_image_seq_len: Optional[int] = 256,
+        max_image_seq_len: Optional[int] = 4096,
+    ):
+        timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
+
+        sigmas = timesteps / num_train_timesteps
+        if not use_dynamic_shifting:
+            # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution
+            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
+
+        self.timesteps = sigmas * num_train_timesteps
+
+        self._step_index = None
+        self._begin_index = None
+
+        self.sigmas = sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+        self.sigma_min = self.sigmas[-1].item()
+        self.sigma_max = self.sigmas[0].item()
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_noise(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        noise: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        """
+        Forward process in flow-matching
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.FloatTensor`:
+                A scaled input sample.
+        """
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype)
+
+        if sample.device.type == "mps" and torch.is_floating_point(timestep):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32)
+            timestep = timestep.to(sample.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(sample.device)
+            timestep = timestep.to(sample.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timestep.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timestep.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(sample.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        sample = sigma * noise + (1.0 - sigma) * sample
+
+        return sample
+
+    def _sigma_to_t(self, sigma):
+        return sigma * self.config.num_train_timesteps
+
+    def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
+        return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int = None,
+        device: Union[str, torch.device] = None,
+        sigmas: Optional[List[float]] = None,
+        mu: Optional[float] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+
+        if self.config.use_dynamic_shifting and mu is None:
+            raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`")
+
+        if sigmas is None:
+            self.num_inference_steps = num_inference_steps
+            timesteps = np.linspace(
+                self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps
+            )
+
+            sigmas = timesteps / self.config.num_train_timesteps
+
+        if self.config.use_dynamic_shifting:
+            sigmas = self.time_shift(mu, 1.0, sigmas)
+        else:
+            sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas)
+
+        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
+        timesteps = sigmas * self.config.num_train_timesteps
+
+        self.timesteps = timesteps.to(device=device)
+        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+
+        self._step_index = None
+        self._begin_index = None
+
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        sample: torch.FloatTensor,
+        s_churn: float = 0.0,
+        s_tmin: float = 0.0,
+        s_tmax: float = float("inf"),
+        s_noise: float = 1.0,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            s_churn (`float`):
+            s_tmin  (`float`):
+            s_tmax  (`float`):
+            s_noise (`float`, defaults to 1.0):
+                Scaling factor for noise added to the sample.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
+                tuple.
+
+        Returns:
+            [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+
+        if (
+            isinstance(timestep, int)
+            or isinstance(timestep, torch.IntTensor)
+            or isinstance(timestep, torch.LongTensor)
+        ):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+
+        sigma = self.sigmas[self.step_index]
+        sigma_next = self.sigmas[self.step_index + 1]
+
+        prev_sample = sample + (sigma_next - sigma) * model_output
+
+        # Cast sample back to model compatible dtype
+        prev_sample = prev_sample.to(model_output.dtype)
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_flow_match_heun_discrete.py b/diffusers/src/diffusers/schedulers/scheduling_flow_match_heun_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9a3ca2d4b0ad9062baafb9749cc3fe4a35401ea
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_flow_match_heun_discrete.py
@@ -0,0 +1,321 @@
+# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class FlowMatchHeunDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: torch.FloatTensor
+
+
+class FlowMatchHeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Heun scheduler.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        shift (`float`, defaults to 1.0):
+            The shift value for the timestep schedule.
+    """
+
+    _compatibles = []
+    order = 2
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        shift: float = 1.0,
+    ):
+        timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
+
+        sigmas = timesteps / num_train_timesteps
+        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
+
+        self.timesteps = sigmas * num_train_timesteps
+
+        self._step_index = None
+        self._begin_index = None
+
+        self.sigmas = sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+        self.sigma_min = self.sigmas[-1].item()
+        self.sigma_max = self.sigmas[0].item()
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_noise(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        noise: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        """
+        Forward process in flow-matching
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.FloatTensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sample = sigma * noise + (1.0 - sigma) * sample
+
+        return sample
+
+    def _sigma_to_t(self, sigma):
+        return sigma * self.config.num_train_timesteps
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        timesteps = np.linspace(
+            self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps
+        )
+
+        sigmas = timesteps / self.config.num_train_timesteps
+        sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas)
+        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
+
+        timesteps = sigmas * self.config.num_train_timesteps
+        timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)])
+        self.timesteps = timesteps.to(device=device)
+
+        sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+        self.sigmas = torch.cat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]])
+
+        # empty dt and derivative
+        self.prev_derivative = None
+        self.dt = None
+
+        self._step_index = None
+        self._begin_index = None
+
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    @property
+    def state_in_first_order(self):
+        return self.dt is None
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        sample: torch.FloatTensor,
+        s_churn: float = 0.0,
+        s_tmin: float = 0.0,
+        s_tmax: float = float("inf"),
+        s_noise: float = 1.0,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[FlowMatchHeunDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            s_churn (`float`):
+            s_tmin  (`float`):
+            s_tmax  (`float`):
+            s_noise (`float`, defaults to 1.0):
+                Scaling factor for noise added to the sample.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_Heun_discrete.HeunDiscreteSchedulerOutput`] or
+                tuple.
+
+        Returns:
+            [`~schedulers.scheduling_Heun_discrete.HeunDiscreteSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_Heun_discrete.HeunDiscreteSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+
+        if (
+            isinstance(timestep, int)
+            or isinstance(timestep, torch.IntTensor)
+            or isinstance(timestep, torch.LongTensor)
+        ):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `HeunDiscreteScheduler.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+
+        if self.state_in_first_order:
+            sigma = self.sigmas[self.step_index]
+            sigma_next = self.sigmas[self.step_index + 1]
+        else:
+            # 2nd order / Heun's method
+            sigma = self.sigmas[self.step_index - 1]
+            sigma_next = self.sigmas[self.step_index]
+
+        gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
+
+        noise = randn_tensor(
+            model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator
+        )
+
+        eps = noise * s_noise
+        sigma_hat = sigma * (gamma + 1)
+
+        if gamma > 0:
+            sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
+
+        if self.state_in_first_order:
+            # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+            denoised = sample - model_output * sigma
+            # 2. convert to an ODE derivative for 1st order
+            derivative = (sample - denoised) / sigma_hat
+            # 3. Delta timestep
+            dt = sigma_next - sigma_hat
+
+            # store for 2nd order step
+            self.prev_derivative = derivative
+            self.dt = dt
+            self.sample = sample
+        else:
+            # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+            denoised = sample - model_output * sigma_next
+            # 2. 2nd order / Heun's method
+            derivative = (sample - denoised) / sigma_next
+            derivative = 0.5 * (self.prev_derivative + derivative)
+
+            # 3. take prev timestep & sample
+            dt = self.dt
+            sample = self.sample
+
+            # free dt and derivative
+            # Note, this puts the scheduler in "first order mode"
+            self.prev_derivative = None
+            self.dt = None
+            self.sample = None
+
+        prev_sample = sample + derivative * dt
+        # Cast sample back to model compatible dtype
+        prev_sample = prev_sample.to(model_output.dtype)
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return FlowMatchHeunDiscreteSchedulerOutput(prev_sample=prev_sample)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_heun_discrete.py b/diffusers/src/diffusers/schedulers/scheduling_heun_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..efcfdeb1d5ef49fc3156d123463203ed6bcfaa8c
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_heun_discrete.py
@@ -0,0 +1,536 @@
+# Copyright 2024 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Scheduler with Heun steps for discrete beta schedules.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 2
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.00085,  # sensible defaults
+        beta_end: float = 0.012,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        prediction_type: str = "epsilon",
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        clip_sample: Optional[bool] = False,
+        clip_sample_range: float = 1.0,
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="cosine")
+        elif beta_schedule == "exp":
+            self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="exp")
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        #  set all values
+        self.set_timesteps(num_train_timesteps, None, num_train_timesteps)
+        self.use_karras_sigmas = use_karras_sigmas
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    @property
+    def init_noise_sigma(self):
+        # standard deviation of the initial noise distribution
+        if self.config.timestep_spacing in ["linspace", "trailing"]:
+            return self.sigmas.max()
+
+        return (self.sigmas.max() ** 2 + 1) ** 0.5
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_model_input(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[float, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sample = sample / ((sigma**2 + 1) ** 0.5)
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Union[str, torch.device] = None,
+        num_train_timesteps: Optional[int] = None,
+        timesteps: Optional[List[int]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            num_train_timesteps (`int`, *optional*):
+                The number of diffusion steps used when training the model. If `None`, the default
+                `num_train_timesteps` attribute is used.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, timesteps will be
+                generated based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps`
+                must be `None`, and `timestep_spacing` attribute will be ignored.
+        """
+        if num_inference_steps is None and timesteps is None:
+            raise ValueError("Must pass exactly one of `num_inference_steps` or `custom_timesteps`.")
+        if num_inference_steps is not None and timesteps is not None:
+            raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
+        if timesteps is not None and self.config.use_karras_sigmas:
+            raise ValueError("Cannot use `timesteps` with `config.use_karras_sigmas = True`")
+        if timesteps is not None and self.config.use_exponential_sigmas:
+            raise ValueError("Cannot set `timesteps` with `config.use_exponential_sigmas = True`.")
+
+        num_inference_steps = num_inference_steps or len(timesteps)
+        self.num_inference_steps = num_inference_steps
+        num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps
+
+        if timesteps is not None:
+            timesteps = np.array(timesteps, dtype=np.float32)
+        else:
+            # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+            if self.config.timestep_spacing == "linspace":
+                timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[::-1].copy()
+            elif self.config.timestep_spacing == "leading":
+                step_ratio = num_train_timesteps // self.num_inference_steps
+                # creates integer timesteps by multiplying by ratio
+                # casting to int to avoid issues when num_inference_step is power of 3
+                timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32)
+                timesteps += self.config.steps_offset
+            elif self.config.timestep_spacing == "trailing":
+                step_ratio = num_train_timesteps / self.num_inference_steps
+                # creates integer timesteps by multiplying by ratio
+                # casting to int to avoid issues when num_inference_step is power of 3
+                timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32)
+                timesteps -= 1
+            else:
+                raise ValueError(
+                    f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+                )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        log_sigmas = np.log(sigmas)
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+
+        if self.config.use_karras_sigmas:
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+        sigmas = torch.from_numpy(sigmas).to(device=device)
+        self.sigmas = torch.cat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]])
+
+        timesteps = torch.from_numpy(timesteps)
+        timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)])
+
+        self.timesteps = timesteps.to(device=device)
+
+        # empty dt and derivative
+        self.prev_derivative = None
+        self.dt = None
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    @property
+    def state_in_first_order(self):
+        return self.dt is None
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: Union[torch.Tensor, np.ndarray],
+        timestep: Union[float, torch.Tensor],
+        sample: Union[torch.Tensor, np.ndarray],
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        if self.state_in_first_order:
+            sigma = self.sigmas[self.step_index]
+            sigma_next = self.sigmas[self.step_index + 1]
+        else:
+            # 2nd order / Heun's method
+            sigma = self.sigmas[self.step_index - 1]
+            sigma_next = self.sigmas[self.step_index]
+
+        # currently only gamma=0 is supported. This usually works best anyways.
+        # We can support gamma in the future but then need to scale the timestep before
+        # passing it to the model which requires a change in API
+        gamma = 0
+        sigma_hat = sigma * (gamma + 1)  # Note: sigma_hat == sigma for now
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        if self.config.prediction_type == "epsilon":
+            sigma_input = sigma_hat if self.state_in_first_order else sigma_next
+            pred_original_sample = sample - sigma_input * model_output
+        elif self.config.prediction_type == "v_prediction":
+            sigma_input = sigma_hat if self.state_in_first_order else sigma_next
+            pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + (
+                sample / (sigma_input**2 + 1)
+            )
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )
+
+        if self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        if self.state_in_first_order:
+            # 2. Convert to an ODE derivative for 1st order
+            derivative = (sample - pred_original_sample) / sigma_hat
+            # 3. delta timestep
+            dt = sigma_next - sigma_hat
+
+            # store for 2nd order step
+            self.prev_derivative = derivative
+            self.dt = dt
+            self.sample = sample
+        else:
+            # 2. 2nd order / Heun's method
+            derivative = (sample - pred_original_sample) / sigma_next
+            derivative = (self.prev_derivative + derivative) / 2
+
+            # 3. take prev timestep & sample
+            dt = self.dt
+            sample = self.sample
+
+            # free dt and derivative
+            # Note, this puts the scheduler in "first order mode"
+            self.prev_derivative = None
+            self.dt = None
+            self.sample = None
+
+        prev_sample = sample + derivative * dt
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_ipndm.py b/diffusers/src/diffusers/schedulers/scheduling_ipndm.py
new file mode 100644
index 0000000000000000000000000000000000000000..28f349ae21147acbb670c4e383749174bcb92d53
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_ipndm.py
@@ -0,0 +1,224 @@
+# Copyright 2024 Zhejiang University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import SchedulerMixin, SchedulerOutput
+
+
+class IPNDMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    A fourth-order Improved Pseudo Linear Multistep scheduler.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+    """
+
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self, num_train_timesteps: int = 1000, trained_betas: Optional[Union[np.ndarray, List[float]]] = None
+    ):
+        # set `betas`, `alphas`, `timesteps`
+        self.set_timesteps(num_train_timesteps)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # For now we only support F-PNDM, i.e. the runge-kutta method
+        # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf
+        # mainly at formula (9), (12), (13) and the Algorithm 2.
+        self.pndm_order = 4
+
+        # running values
+        self.ets = []
+        self._step_index = None
+        self._begin_index = None
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+        steps = torch.linspace(1, 0, num_inference_steps + 1)[:-1]
+        steps = torch.cat([steps, torch.tensor([0.0])])
+
+        if self.config.trained_betas is not None:
+            self.betas = torch.tensor(self.config.trained_betas, dtype=torch.float32)
+        else:
+            self.betas = torch.sin(steps * math.pi / 2) ** 2
+
+        self.alphas = (1.0 - self.betas**2) ** 0.5
+
+        timesteps = (torch.atan2(self.betas, self.alphas) / math.pi * 2)[:-1]
+        self.timesteps = timesteps.to(device)
+
+        self.ets = []
+        self._step_index = None
+        self._begin_index = None
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the linear multistep method. It performs one forward pass multiple times to approximate the solution.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        timestep_index = self.step_index
+        prev_timestep_index = self.step_index + 1
+
+        ets = sample * self.betas[timestep_index] + model_output * self.alphas[timestep_index]
+        self.ets.append(ets)
+
+        if len(self.ets) == 1:
+            ets = self.ets[-1]
+        elif len(self.ets) == 2:
+            ets = (3 * self.ets[-1] - self.ets[-2]) / 2
+        elif len(self.ets) == 3:
+            ets = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
+        else:
+            ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4])
+
+        prev_sample = self._get_prev_sample(sample, timestep_index, prev_timestep_index, ets)
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, ets):
+        alpha = self.alphas[timestep_index]
+        sigma = self.betas[timestep_index]
+
+        next_alpha = self.alphas[prev_timestep_index]
+        next_sigma = self.betas[prev_timestep_index]
+
+        pred = (sample - sigma * ets) / max(alpha, 1e-8)
+        prev_sample = next_alpha * pred + ets * next_sigma
+
+        return prev_sample
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/diffusers/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..038aa19603ea8d5bf78e9931c76d364838a2d88d
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py
@@ -0,0 +1,542 @@
+# Copyright 2024 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    KDPM2DiscreteScheduler with ancestral sampling is inspired by the DPMSolver2 and Algorithm 2 from the [Elucidating
+    the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.00085):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.012):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 2
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.00085,  # sensible defaults
+        beta_end: float = 0.012,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        prediction_type: str = "epsilon",
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        #  set all values
+        self.set_timesteps(num_train_timesteps, None, num_train_timesteps)
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def init_noise_sigma(self):
+        # standard deviation of the initial noise distribution
+        if self.config.timestep_spacing in ["linspace", "trailing"]:
+            return self.sigmas.max()
+
+        return (self.sigmas.max() ** 2 + 1) ** 0.5
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_model_input(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[float, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        if self.state_in_first_order:
+            sigma = self.sigmas[self.step_index]
+        else:
+            sigma = self.sigmas_interpol[self.step_index - 1]
+
+        sample = sample / ((sigma**2 + 1) ** 0.5)
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        device: Union[str, torch.device] = None,
+        num_train_timesteps: Optional[int] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[::-1].copy()
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        log_sigmas = np.log(sigmas)
+
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+
+        if self.config.use_karras_sigmas:
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+
+        self.log_sigmas = torch.from_numpy(log_sigmas).to(device)
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+        sigmas = torch.from_numpy(sigmas).to(device=device)
+
+        # compute up and down sigmas
+        sigmas_next = sigmas.roll(-1)
+        sigmas_next[-1] = 0.0
+        sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas**2) ** 0.5
+        sigmas_down = (sigmas_next**2 - sigmas_up**2) ** 0.5
+        sigmas_down[-1] = 0.0
+
+        # compute interpolated sigmas
+        sigmas_interpol = sigmas.log().lerp(sigmas_down.log(), 0.5).exp()
+        sigmas_interpol[-2:] = 0.0
+
+        # set sigmas
+        self.sigmas = torch.cat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]])
+        self.sigmas_interpol = torch.cat(
+            [sigmas_interpol[:1], sigmas_interpol[1:].repeat_interleave(2), sigmas_interpol[-1:]]
+        )
+        self.sigmas_up = torch.cat([sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:]])
+        self.sigmas_down = torch.cat([sigmas_down[:1], sigmas_down[1:].repeat_interleave(2), sigmas_down[-1:]])
+
+        if str(device).startswith("mps"):
+            timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32)
+        else:
+            timesteps = torch.from_numpy(timesteps).to(device)
+
+        sigmas_interpol = sigmas_interpol.cpu()
+        log_sigmas = self.log_sigmas.cpu()
+        timesteps_interpol = np.array(
+            [self._sigma_to_t(sigma_interpol, log_sigmas) for sigma_interpol in sigmas_interpol]
+        )
+
+        timesteps_interpol = torch.from_numpy(timesteps_interpol).to(device, dtype=timesteps.dtype)
+        interleaved_timesteps = torch.stack((timesteps_interpol[:-2, None], timesteps[1:, None]), dim=-1).flatten()
+
+        self.timesteps = torch.cat([timesteps[:1], interleaved_timesteps])
+
+        self.sample = None
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    @property
+    def state_in_first_order(self):
+        return self.sample is None
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: Union[torch.Tensor, np.ndarray],
+        timestep: Union[float, torch.Tensor],
+        sample: Union[torch.Tensor, np.ndarray],
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ddim.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        if self.state_in_first_order:
+            sigma = self.sigmas[self.step_index]
+            sigma_interpol = self.sigmas_interpol[self.step_index]
+            sigma_up = self.sigmas_up[self.step_index]
+            sigma_down = self.sigmas_down[self.step_index - 1]
+        else:
+            # 2nd order / KPDM2's method
+            sigma = self.sigmas[self.step_index - 1]
+            sigma_interpol = self.sigmas_interpol[self.step_index - 1]
+            sigma_up = self.sigmas_up[self.step_index - 1]
+            sigma_down = self.sigmas_down[self.step_index - 1]
+
+        # currently only gamma=0 is supported. This usually works best anyways.
+        # We can support gamma in the future but then need to scale the timestep before
+        # passing it to the model which requires a change in API
+        gamma = 0
+        sigma_hat = sigma * (gamma + 1)  # Note: sigma_hat == sigma for now
+
+        device = model_output.device
+        noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator)
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        if self.config.prediction_type == "epsilon":
+            sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol
+            pred_original_sample = sample - sigma_input * model_output
+        elif self.config.prediction_type == "v_prediction":
+            sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol
+            pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + (
+                sample / (sigma_input**2 + 1)
+            )
+        elif self.config.prediction_type == "sample":
+            raise NotImplementedError("prediction_type not implemented yet: sample")
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )
+
+        if self.state_in_first_order:
+            # 2. Convert to an ODE derivative for 1st order
+            derivative = (sample - pred_original_sample) / sigma_hat
+            # 3. delta timestep
+            dt = sigma_interpol - sigma_hat
+
+            # store for 2nd order step
+            self.sample = sample
+            self.dt = dt
+            prev_sample = sample + derivative * dt
+        else:
+            # DPM-Solver-2
+            # 2. Convert to an ODE derivative for 2nd order
+            derivative = (sample - pred_original_sample) / sigma_interpol
+            # 3. delta timestep
+            dt = sigma_down - sigma_hat
+
+            sample = self.sample
+            self.sample = None
+
+            prev_sample = sample + derivative * dt
+            prev_sample = prev_sample + noise * sigma_up
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/diffusers/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fbf66832668aae634655d1e25d7d22737a3a2a5
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py
@@ -0,0 +1,517 @@
+# Copyright 2024 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    KDPM2DiscreteScheduler is inspired by the DPMSolver2 and Algorithm 2 from the [Elucidating the Design Space of
+    Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.00085):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.012):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 2
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.00085,  # sensible defaults
+        beta_end: float = 0.012,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        prediction_type: str = "epsilon",
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        #  set all values
+        self.set_timesteps(num_train_timesteps, None, num_train_timesteps)
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def init_noise_sigma(self):
+        # standard deviation of the initial noise distribution
+        if self.config.timestep_spacing in ["linspace", "trailing"]:
+            return self.sigmas.max()
+
+        return (self.sigmas.max() ** 2 + 1) ** 0.5
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_model_input(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[float, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        if self.state_in_first_order:
+            sigma = self.sigmas[self.step_index]
+        else:
+            sigma = self.sigmas_interpol[self.step_index]
+
+        sample = sample / ((sigma**2 + 1) ** 0.5)
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        device: Union[str, torch.device] = None,
+        num_train_timesteps: Optional[int] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[::-1].copy()
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        log_sigmas = np.log(sigmas)
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+
+        if self.config.use_karras_sigmas:
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+
+        self.log_sigmas = torch.from_numpy(log_sigmas).to(device=device)
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+        sigmas = torch.from_numpy(sigmas).to(device=device)
+
+        # interpolate sigmas
+        sigmas_interpol = sigmas.log().lerp(sigmas.roll(1).log(), 0.5).exp()
+
+        self.sigmas = torch.cat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]])
+        self.sigmas_interpol = torch.cat(
+            [sigmas_interpol[:1], sigmas_interpol[1:].repeat_interleave(2), sigmas_interpol[-1:]]
+        )
+
+        timesteps = torch.from_numpy(timesteps).to(device)
+
+        # interpolate timesteps
+        sigmas_interpol = sigmas_interpol.cpu()
+        log_sigmas = self.log_sigmas.cpu()
+        timesteps_interpol = np.array(
+            [self._sigma_to_t(sigma_interpol, log_sigmas) for sigma_interpol in sigmas_interpol]
+        )
+        timesteps_interpol = torch.from_numpy(timesteps_interpol).to(device, dtype=timesteps.dtype)
+        interleaved_timesteps = torch.stack((timesteps_interpol[1:-1, None], timesteps[1:, None]), dim=-1).flatten()
+
+        self.timesteps = torch.cat([timesteps[:1], interleaved_timesteps])
+
+        self.sample = None
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def state_in_first_order(self):
+        return self.sample is None
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    def step(
+        self,
+        model_output: Union[torch.Tensor, np.ndarray],
+        timestep: Union[float, torch.Tensor],
+        sample: Union[torch.Tensor, np.ndarray],
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        if self.state_in_first_order:
+            sigma = self.sigmas[self.step_index]
+            sigma_interpol = self.sigmas_interpol[self.step_index + 1]
+            sigma_next = self.sigmas[self.step_index + 1]
+        else:
+            # 2nd order / KDPM2's method
+            sigma = self.sigmas[self.step_index - 1]
+            sigma_interpol = self.sigmas_interpol[self.step_index]
+            sigma_next = self.sigmas[self.step_index]
+
+        # currently only gamma=0 is supported. This usually works best anyways.
+        # We can support gamma in the future but then need to scale the timestep before
+        # passing it to the model which requires a change in API
+        gamma = 0
+        sigma_hat = sigma * (gamma + 1)  # Note: sigma_hat == sigma for now
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        if self.config.prediction_type == "epsilon":
+            sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol
+            pred_original_sample = sample - sigma_input * model_output
+        elif self.config.prediction_type == "v_prediction":
+            sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol
+            pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + (
+                sample / (sigma_input**2 + 1)
+            )
+        elif self.config.prediction_type == "sample":
+            raise NotImplementedError("prediction_type not implemented yet: sample")
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )
+
+        if self.state_in_first_order:
+            # 2. Convert to an ODE derivative for 1st order
+            derivative = (sample - pred_original_sample) / sigma_hat
+            # 3. delta timestep
+            dt = sigma_interpol - sigma_hat
+
+            # store for 2nd order step
+            self.sample = sample
+        else:
+            # DPM-Solver-2
+            # 2. Convert to an ODE derivative for 2nd order
+            derivative = (sample - pred_original_sample) / sigma_interpol
+
+            # 3. delta timestep
+            dt = sigma_next - sigma_hat
+
+            sample = self.sample
+            self.sample = None
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        prev_sample = sample + derivative * dt
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_karras_ve_flax.py b/diffusers/src/diffusers/schedulers/scheduling_karras_ve_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d387b53ac3ea6ab13bd7dd3c91972d6b427e3cb
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_karras_ve_flax.py
@@ -0,0 +1,238 @@
+# Copyright 2024 NVIDIA and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax
+import jax.numpy as jnp
+from jax import random
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils_flax import FlaxSchedulerMixin
+
+
+@flax.struct.dataclass
+class KarrasVeSchedulerState:
+    # setable values
+    num_inference_steps: Optional[int] = None
+    timesteps: Optional[jnp.ndarray] = None
+    schedule: Optional[jnp.ndarray] = None  # sigma(t_i)
+
+    @classmethod
+    def create(cls):
+        return cls()
+
+
+@dataclass
+class FlaxKarrasVeOutput(BaseOutput):
+    """
+    Output class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        derivative (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images):
+            Derivative of predicted original image sample (x_0).
+        state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class.
+    """
+
+    prev_sample: jnp.ndarray
+    derivative: jnp.ndarray
+    state: KarrasVeSchedulerState
+
+
+class FlaxKarrasVeScheduler(FlaxSchedulerMixin, ConfigMixin):
+    """
+    Stochastic sampling from Karras et al. [1] tailored to the Variance-Expanding (VE) models [2]. Use Algorithm 2 and
+    the VE column of Table 1 from [1] for reference.
+
+    [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models."
+    https://arxiv.org/abs/2206.00364 [2] Song, Yang, et al. "Score-based generative modeling through stochastic
+    differential equations." https://arxiv.org/abs/2011.13456
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    For more details on the parameters, see the original paper's Appendix E.: "Elucidating the Design Space of
+    Diffusion-Based Generative Models." https://arxiv.org/abs/2206.00364. The grid search values used to find the
+    optimal {s_noise, s_churn, s_min, s_max} for a specific model are described in Table 5 of the paper.
+
+    Args:
+        sigma_min (`float`): minimum noise magnitude
+        sigma_max (`float`): maximum noise magnitude
+        s_noise (`float`): the amount of additional noise to counteract loss of detail during sampling.
+            A reasonable range is [1.000, 1.011].
+        s_churn (`float`): the parameter controlling the overall amount of stochasticity.
+            A reasonable range is [0, 100].
+        s_min (`float`): the start value of the sigma range where we add noise (enable stochasticity).
+            A reasonable range is [0, 10].
+        s_max (`float`): the end value of the sigma range where we add noise.
+            A reasonable range is [0.2, 80].
+    """
+
+    @property
+    def has_state(self):
+        return True
+
+    @register_to_config
+    def __init__(
+        self,
+        sigma_min: float = 0.02,
+        sigma_max: float = 100,
+        s_noise: float = 1.007,
+        s_churn: float = 80,
+        s_min: float = 0.05,
+        s_max: float = 50,
+    ):
+        pass
+
+    def create_state(self):
+        return KarrasVeSchedulerState.create()
+
+    def set_timesteps(
+        self, state: KarrasVeSchedulerState, num_inference_steps: int, shape: Tuple = ()
+    ) -> KarrasVeSchedulerState:
+        """
+        Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            state (`KarrasVeSchedulerState`):
+                the `FlaxKarrasVeScheduler` state data class.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+
+        """
+        timesteps = jnp.arange(0, num_inference_steps)[::-1].copy()
+        schedule = [
+            (
+                self.config.sigma_max**2
+                * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1))
+            )
+            for i in timesteps
+        ]
+
+        return state.replace(
+            num_inference_steps=num_inference_steps,
+            schedule=jnp.array(schedule, dtype=jnp.float32),
+            timesteps=timesteps,
+        )
+
+    def add_noise_to_input(
+        self,
+        state: KarrasVeSchedulerState,
+        sample: jnp.ndarray,
+        sigma: float,
+        key: jax.Array,
+    ) -> Tuple[jnp.ndarray, float]:
+        """
+        Explicit Langevin-like "churn" step of adding noise to the sample according to a factor gamma_i ≥ 0 to reach a
+        higher noise level sigma_hat = sigma_i + gamma_i*sigma_i.
+
+        TODO Args:
+        """
+        if self.config.s_min <= sigma <= self.config.s_max:
+            gamma = min(self.config.s_churn / state.num_inference_steps, 2**0.5 - 1)
+        else:
+            gamma = 0
+
+        # sample eps ~ N(0, S_noise^2 * I)
+        key = random.split(key, num=1)
+        eps = self.config.s_noise * random.normal(key=key, shape=sample.shape)
+        sigma_hat = sigma + gamma * sigma
+        sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps)
+
+        return sample_hat, sigma_hat
+
+    def step(
+        self,
+        state: KarrasVeSchedulerState,
+        model_output: jnp.ndarray,
+        sigma_hat: float,
+        sigma_prev: float,
+        sample_hat: jnp.ndarray,
+        return_dict: bool = True,
+    ) -> Union[FlaxKarrasVeOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class.
+            model_output (`torch.Tensor` or `np.ndarray`): direct output from learned diffusion model.
+            sigma_hat (`float`): TODO
+            sigma_prev (`float`): TODO
+            sample_hat (`torch.Tensor` or `np.ndarray`): TODO
+            return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class
+
+        Returns:
+            [`~schedulers.scheduling_karras_ve_flax.FlaxKarrasVeOutput`] or `tuple`: Updated sample in the diffusion
+            chain and derivative. [`~schedulers.scheduling_karras_ve_flax.FlaxKarrasVeOutput`] if `return_dict` is
+            True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+        """
+
+        pred_original_sample = sample_hat + sigma_hat * model_output
+        derivative = (sample_hat - pred_original_sample) / sigma_hat
+        sample_prev = sample_hat + (sigma_prev - sigma_hat) * derivative
+
+        if not return_dict:
+            return (sample_prev, derivative, state)
+
+        return FlaxKarrasVeOutput(prev_sample=sample_prev, derivative=derivative, state=state)
+
+    def step_correct(
+        self,
+        state: KarrasVeSchedulerState,
+        model_output: jnp.ndarray,
+        sigma_hat: float,
+        sigma_prev: float,
+        sample_hat: jnp.ndarray,
+        sample_prev: jnp.ndarray,
+        derivative: jnp.ndarray,
+        return_dict: bool = True,
+    ) -> Union[FlaxKarrasVeOutput, Tuple]:
+        """
+        Correct the predicted sample based on the output model_output of the network. TODO complete description
+
+        Args:
+            state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class.
+            model_output (`torch.Tensor` or `np.ndarray`): direct output from learned diffusion model.
+            sigma_hat (`float`): TODO
+            sigma_prev (`float`): TODO
+            sample_hat (`torch.Tensor` or `np.ndarray`): TODO
+            sample_prev (`torch.Tensor` or `np.ndarray`): TODO
+            derivative (`torch.Tensor` or `np.ndarray`): TODO
+            return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class
+
+        Returns:
+            prev_sample (TODO): updated sample in the diffusion chain. derivative (TODO): TODO
+
+        """
+        pred_original_sample = sample_prev + sigma_prev * model_output
+        derivative_corr = (sample_prev - pred_original_sample) / sigma_prev
+        sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr)
+
+        if not return_dict:
+            return (sample_prev, derivative, state)
+
+        return FlaxKarrasVeOutput(prev_sample=sample_prev, derivative=derivative, state=state)
+
+    def add_noise(self, state: KarrasVeSchedulerState, original_samples, noise, timesteps):
+        raise NotImplementedError()
diff --git a/diffusers/src/diffusers/schedulers/scheduling_lcm.py b/diffusers/src/diffusers/schedulers/scheduling_lcm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1aa09ab1723c1f4d48cc4625323db0e10080d6d
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_lcm.py
@@ -0,0 +1,658 @@
+# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class LCMSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    denoised: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class LCMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
+    non-Markovian guidance.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. [`~ConfigMixin`] takes care of storing all config
+    attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can be
+    accessed via `scheduler.config.num_train_timesteps`. [`SchedulerMixin`] provides general loading and saving
+    functionality via the [`SchedulerMixin.save_pretrained`] and [`~SchedulerMixin.from_pretrained`] functions.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        original_inference_steps (`int`, *optional*, defaults to 50):
+            The default number of inference steps used to generate a linearly-spaced timestep schedule, from which we
+            will ultimately take `num_inference_steps` evenly spaced timesteps to form the final timestep schedule.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        timestep_scaling (`float`, defaults to 10.0):
+            The factor the timesteps will be multiplied by when calculating the consistency model boundary conditions
+            `c_skip` and `c_out`. Increasing this will decrease the approximation error (although the approximation
+            error at the default of `10.0` is already pretty small).
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.00085,
+        beta_end: float = 0.012,
+        beta_schedule: str = "scaled_linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        original_inference_steps: int = 50,
+        clip_sample: bool = False,
+        clip_sample_range: float = 1.0,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        timestep_scaling: float = 10.0,
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+        self.custom_timesteps = False
+
+        self._step_index = None
+        self._begin_index = None
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    @property
+    def step_index(self):
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Union[str, torch.device] = None,
+        original_inference_steps: Optional[int] = None,
+        timesteps: Optional[List[int]] = None,
+        strength: int = 1.0,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`, *optional*):
+                The number of diffusion steps used when generating samples with a pre-trained model. If used,
+                `timesteps` must be `None`.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            original_inference_steps (`int`, *optional*):
+                The original number of inference steps, which will be used to generate a linearly-spaced timestep
+                schedule (which is different from the standard `diffusers` implementation). We will then take
+                `num_inference_steps` timesteps from this schedule, evenly spaced in terms of indices, and use that as
+                our final timestep schedule. If not set, this will default to the `original_inference_steps` attribute.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of equal spacing between timesteps on the training/distillation timestep
+                schedule is used. If `timesteps` is passed, `num_inference_steps` must be `None`.
+        """
+        # 0. Check inputs
+        if num_inference_steps is None and timesteps is None:
+            raise ValueError("Must pass exactly one of `num_inference_steps` or `custom_timesteps`.")
+
+        if num_inference_steps is not None and timesteps is not None:
+            raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
+
+        # 1. Calculate the LCM original training/distillation timestep schedule.
+        original_steps = (
+            original_inference_steps if original_inference_steps is not None else self.config.original_inference_steps
+        )
+
+        if original_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`original_steps`: {original_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        # LCM Timesteps Setting
+        # The skipping step parameter k from the paper.
+        k = self.config.num_train_timesteps // original_steps
+        # LCM Training/Distillation Steps Schedule
+        # Currently, only a linearly-spaced schedule is supported (same as in the LCM distillation scripts).
+        lcm_origin_timesteps = np.asarray(list(range(1, int(original_steps * strength) + 1))) * k - 1
+
+        # 2. Calculate the LCM inference timestep schedule.
+        if timesteps is not None:
+            # 2.1 Handle custom timestep schedules.
+            train_timesteps = set(lcm_origin_timesteps)
+            non_train_timesteps = []
+            for i in range(1, len(timesteps)):
+                if timesteps[i] >= timesteps[i - 1]:
+                    raise ValueError("`custom_timesteps` must be in descending order.")
+
+                if timesteps[i] not in train_timesteps:
+                    non_train_timesteps.append(timesteps[i])
+
+            if timesteps[0] >= self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`timesteps` must start before `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps}."
+                )
+
+            # Raise warning if timestep schedule does not start with self.config.num_train_timesteps - 1
+            if strength == 1.0 and timesteps[0] != self.config.num_train_timesteps - 1:
+                logger.warning(
+                    f"The first timestep on the custom timestep schedule is {timesteps[0]}, not"
+                    f" `self.config.num_train_timesteps - 1`: {self.config.num_train_timesteps - 1}. You may get"
+                    f" unexpected results when using this timestep schedule."
+                )
+
+            # Raise warning if custom timestep schedule contains timesteps not on original timestep schedule
+            if non_train_timesteps:
+                logger.warning(
+                    f"The custom timestep schedule contains the following timesteps which are not on the original"
+                    f" training/distillation timestep schedule: {non_train_timesteps}. You may get unexpected results"
+                    f" when using this timestep schedule."
+                )
+
+            # Raise warning if custom timestep schedule is longer than original_steps
+            if len(timesteps) > original_steps:
+                logger.warning(
+                    f"The number of timesteps in the custom timestep schedule is {len(timesteps)}, which exceeds the"
+                    f" the length of the timestep schedule used for training: {original_steps}. You may get some"
+                    f" unexpected results when using this timestep schedule."
+                )
+
+            timesteps = np.array(timesteps, dtype=np.int64)
+            self.num_inference_steps = len(timesteps)
+            self.custom_timesteps = True
+
+            # Apply strength (e.g. for img2img pipelines) (see StableDiffusionImg2ImgPipeline.get_timesteps)
+            init_timestep = min(int(self.num_inference_steps * strength), self.num_inference_steps)
+            t_start = max(self.num_inference_steps - init_timestep, 0)
+            timesteps = timesteps[t_start * self.order :]
+            # TODO: also reset self.num_inference_steps?
+        else:
+            # 2.2 Create the "standard" LCM inference timestep schedule.
+            if num_inference_steps > self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                    f" maximal {self.config.num_train_timesteps} timesteps."
+                )
+
+            skipping_step = len(lcm_origin_timesteps) // num_inference_steps
+
+            if skipping_step < 1:
+                raise ValueError(
+                    f"The combination of `original_steps x strength`: {original_steps} x {strength} is smaller than `num_inference_steps`: {num_inference_steps}. Make sure to either reduce `num_inference_steps` to a value smaller than {int(original_steps * strength)} or increase `strength` to a value higher than {float(num_inference_steps / original_steps)}."
+                )
+
+            self.num_inference_steps = num_inference_steps
+
+            if num_inference_steps > original_steps:
+                raise ValueError(
+                    f"`num_inference_steps`: {num_inference_steps} cannot be larger than `original_inference_steps`:"
+                    f" {original_steps} because the final timestep schedule will be a subset of the"
+                    f" `original_inference_steps`-sized initial timestep schedule."
+                )
+
+            # LCM Inference Steps Schedule
+            lcm_origin_timesteps = lcm_origin_timesteps[::-1].copy()
+            # Select (approximately) evenly spaced indices from lcm_origin_timesteps.
+            inference_indices = np.linspace(0, len(lcm_origin_timesteps), num=num_inference_steps, endpoint=False)
+            inference_indices = np.floor(inference_indices).astype(np.int64)
+            timesteps = lcm_origin_timesteps[inference_indices]
+
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.long)
+
+        self._step_index = None
+        self._begin_index = None
+
+    def get_scalings_for_boundary_condition_discrete(self, timestep):
+        self.sigma_data = 0.5  # Default: 0.5
+        scaled_timestep = timestep * self.config.timestep_scaling
+
+        c_skip = self.sigma_data**2 / (scaled_timestep**2 + self.sigma_data**2)
+        c_out = scaled_timestep / (scaled_timestep**2 + self.sigma_data**2) ** 0.5
+        return c_skip, c_out
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[LCMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
+        Returns:
+            [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # 1. get previous step value
+        prev_step_index = self.step_index + 1
+        if prev_step_index < len(self.timesteps):
+            prev_timestep = self.timesteps[prev_step_index]
+        else:
+            prev_timestep = timestep
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        # 3. Get scalings for boundary conditions
+        c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
+
+        # 4. Compute the predicted original sample x_0 based on the model parameterization
+        if self.config.prediction_type == "epsilon":  # noise-prediction
+            predicted_original_sample = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
+        elif self.config.prediction_type == "sample":  # x-prediction
+            predicted_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":  # v-prediction
+            predicted_original_sample = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
+                " `v_prediction` for `LCMScheduler`."
+            )
+
+        # 5. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            predicted_original_sample = self._threshold_sample(predicted_original_sample)
+        elif self.config.clip_sample:
+            predicted_original_sample = predicted_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 6. Denoise model output using boundary conditions
+        denoised = c_out * predicted_original_sample + c_skip * sample
+
+        # 7. Sample and inject noise z ~ N(0, I) for MultiStep Inference
+        # Noise is not used on the final timestep of the timestep schedule.
+        # This also means that noise is not used for one-step sampling.
+        if self.step_index != self.num_inference_steps - 1:
+            noise = randn_tensor(
+                model_output.shape, generator=generator, device=model_output.device, dtype=denoised.dtype
+            )
+            prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
+        else:
+            prev_sample = denoised
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample, denoised)
+
+        return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
+    def previous_timestep(self, timestep):
+        if self.custom_timesteps:
+            index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
+            if index == self.timesteps.shape[0] - 1:
+                prev_t = torch.tensor(-1)
+            else:
+                prev_t = self.timesteps[index + 1]
+        else:
+            num_inference_steps = (
+                self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps
+            )
+            prev_t = timestep - self.config.num_train_timesteps // num_inference_steps
+
+        return prev_t
diff --git a/diffusers/src/diffusers/schedulers/scheduling_lms_discrete.py b/diffusers/src/diffusers/schedulers/scheduling_lms_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ef8ffb0dcbf9ff41b8d42242d47a97b93eba12d
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import warnings
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from scipy import integrate
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->LMSDiscrete
+class LMSDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    A linear multistep scheduler for discrete beta schedules.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        prediction_type: str = "epsilon",
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas)
+
+        # setable values
+        self.num_inference_steps = None
+        self.use_karras_sigmas = use_karras_sigmas
+        self.set_timesteps(num_train_timesteps, None)
+        self.derivatives = []
+        self.is_scale_input_called = False
+
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def init_noise_sigma(self):
+        # standard deviation of the initial noise distribution
+        if self.config.timestep_spacing in ["linspace", "trailing"]:
+            return self.sigmas.max()
+
+        return (self.sigmas.max() ** 2 + 1) ** 0.5
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`float` or `torch.Tensor`):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sample = sample / ((sigma**2 + 1) ** 0.5)
+        self.is_scale_input_called = True
+        return sample
+
+    def get_lms_coefficient(self, order, t, current_order):
+        """
+        Compute the linear multistep coefficient.
+
+        Args:
+            order ():
+            t ():
+            current_order ():
+        """
+
+        def lms_derivative(tau):
+            prod = 1.0
+            for k in range(order):
+                if current_order == k:
+                    continue
+                prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k])
+            return prod
+
+        integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
+
+        return integrated_coeff
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[
+                ::-1
+            ].copy()
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        log_sigmas = np.log(sigmas)
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+
+        if self.config.use_karras_sigmas:
+            sigmas = self._convert_to_karras(in_sigmas=sigmas)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+
+        self.sigmas = torch.from_numpy(sigmas).to(device=device)
+        self.timesteps = torch.from_numpy(timesteps).to(device=device)
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+        self.derivatives = []
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        sigma_min: float = in_sigmas[-1].item()
+        sigma_max: float = in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, self.num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[float, torch.Tensor],
+        sample: torch.Tensor,
+        order: int = 4,
+        return_dict: bool = True,
+    ) -> Union[LMSDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`float` or `torch.Tensor`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            order (`int`, defaults to 4):
+                The order of the linear multistep method.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if not self.is_scale_input_called:
+            warnings.warn(
+                "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+                "See `StableDiffusionPipeline` for a usage example."
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = sample - sigma * model_output
+        elif self.config.prediction_type == "v_prediction":
+            # * c_out + input * c_skip
+            pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )
+
+        # 2. Convert to an ODE derivative
+        derivative = (sample - pred_original_sample) / sigma
+        self.derivatives.append(derivative)
+        if len(self.derivatives) > order:
+            self.derivatives.pop(0)
+
+        # 3. Compute linear multistep coefficients
+        order = min(self.step_index + 1, order)
+        lms_coeffs = [self.get_lms_coefficient(order, self.step_index, curr_order) for curr_order in range(order)]
+
+        # 4. Compute previous sample based on the derivatives path
+        prev_sample = sample + sum(
+            coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives))
+        )
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return LMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_lms_discrete_flax.py b/diffusers/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1169cc90a7bd53d2ede80584e09647e5f800ccf
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
@@ -0,0 +1,283 @@
+# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax.numpy as jnp
+from scipy import integrate
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import (
+    CommonSchedulerState,
+    FlaxKarrasDiffusionSchedulers,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    broadcast_to_shape_from_left,
+)
+
+
+@flax.struct.dataclass
+class LMSDiscreteSchedulerState:
+    common: CommonSchedulerState
+
+    # setable values
+    init_noise_sigma: jnp.ndarray
+    timesteps: jnp.ndarray
+    sigmas: jnp.ndarray
+    num_inference_steps: Optional[int] = None
+
+    # running values
+    derivatives: Optional[jnp.ndarray] = None
+
+    @classmethod
+    def create(
+        cls, common: CommonSchedulerState, init_noise_sigma: jnp.ndarray, timesteps: jnp.ndarray, sigmas: jnp.ndarray
+    ):
+        return cls(common=common, init_noise_sigma=init_noise_sigma, timesteps=timesteps, sigmas=sigmas)
+
+
+@dataclass
+class FlaxLMSSchedulerOutput(FlaxSchedulerOutput):
+    state: LMSDiscreteSchedulerState
+
+
+class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
+    """
+    Linear Multistep Scheduler for discrete beta schedules. Based on the original k-diffusion implementation by
+    Katherine Crowson:
+    https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`jnp.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        prediction_type (`str`, default `epsilon`, optional):
+            prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+            process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
+            https://imagen.research.google/video/paper.pdf)
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
+    """
+
+    _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers]
+
+    dtype: jnp.dtype
+
+    @property
+    def has_state(self):
+        return True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[jnp.ndarray] = None,
+        prediction_type: str = "epsilon",
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> LMSDiscreteSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+        sigmas = ((1 - common.alphas_cumprod) / common.alphas_cumprod) ** 0.5
+
+        # standard deviation of the initial noise distribution
+        init_noise_sigma = sigmas.max()
+
+        return LMSDiscreteSchedulerState.create(
+            common=common,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+            sigmas=sigmas,
+        )
+
+    def scale_model_input(self, state: LMSDiscreteSchedulerState, sample: jnp.ndarray, timestep: int) -> jnp.ndarray:
+        """
+        Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm.
+
+        Args:
+            state (`LMSDiscreteSchedulerState`):
+                the `FlaxLMSDiscreteScheduler` state data class instance.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            timestep (`int`):
+                current discrete timestep in the diffusion chain.
+
+        Returns:
+            `jnp.ndarray`: scaled input sample
+        """
+        (step_index,) = jnp.where(state.timesteps == timestep, size=1)
+        step_index = step_index[0]
+
+        sigma = state.sigmas[step_index]
+        sample = sample / ((sigma**2 + 1) ** 0.5)
+        return sample
+
+    def get_lms_coefficient(self, state: LMSDiscreteSchedulerState, order, t, current_order):
+        """
+        Compute a linear multistep coefficient.
+
+        Args:
+            order (TODO):
+            t (TODO):
+            current_order (TODO):
+        """
+
+        def lms_derivative(tau):
+            prod = 1.0
+            for k in range(order):
+                if current_order == k:
+                    continue
+                prod *= (tau - state.sigmas[t - k]) / (state.sigmas[t - current_order] - state.sigmas[t - k])
+            return prod
+
+        integrated_coeff = integrate.quad(lms_derivative, state.sigmas[t], state.sigmas[t + 1], epsrel=1e-4)[0]
+
+        return integrated_coeff
+
+    def set_timesteps(
+        self, state: LMSDiscreteSchedulerState, num_inference_steps: int, shape: Tuple = ()
+    ) -> LMSDiscreteSchedulerState:
+        """
+        Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            state (`LMSDiscreteSchedulerState`):
+                the `FlaxLMSDiscreteScheduler` state data class instance.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+        """
+
+        timesteps = jnp.linspace(self.config.num_train_timesteps - 1, 0, num_inference_steps, dtype=self.dtype)
+
+        low_idx = jnp.floor(timesteps).astype(jnp.int32)
+        high_idx = jnp.ceil(timesteps).astype(jnp.int32)
+
+        frac = jnp.mod(timesteps, 1.0)
+
+        sigmas = ((1 - state.common.alphas_cumprod) / state.common.alphas_cumprod) ** 0.5
+        sigmas = (1 - frac) * sigmas[low_idx] + frac * sigmas[high_idx]
+        sigmas = jnp.concatenate([sigmas, jnp.array([0.0], dtype=self.dtype)])
+
+        timesteps = timesteps.astype(jnp.int32)
+
+        # initial running values
+        derivatives = jnp.zeros((0,) + shape, dtype=self.dtype)
+
+        return state.replace(
+            timesteps=timesteps,
+            sigmas=sigmas,
+            num_inference_steps=num_inference_steps,
+            derivatives=derivatives,
+        )
+
+    def step(
+        self,
+        state: LMSDiscreteSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+        order: int = 4,
+        return_dict: bool = True,
+    ) -> Union[FlaxLMSSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            state (`LMSDiscreteSchedulerState`): the `FlaxLMSDiscreteScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            order: coefficient for multi-step inference.
+            return_dict (`bool`): option for returning tuple rather than FlaxLMSSchedulerOutput class
+
+        Returns:
+            [`FlaxLMSSchedulerOutput`] or `tuple`: [`FlaxLMSSchedulerOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+        if state.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        sigma = state.sigmas[timestep]
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = sample - sigma * model_output
+        elif self.config.prediction_type == "v_prediction":
+            # * c_out + input * c_skip
+            pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )
+
+        # 2. Convert to an ODE derivative
+        derivative = (sample - pred_original_sample) / sigma
+        state = state.replace(derivatives=jnp.append(state.derivatives, derivative))
+        if len(state.derivatives) > order:
+            state = state.replace(derivatives=jnp.delete(state.derivatives, 0))
+
+        # 3. Compute linear multistep coefficients
+        order = min(timestep + 1, order)
+        lms_coeffs = [self.get_lms_coefficient(state, order, timestep, curr_order) for curr_order in range(order)]
+
+        # 4. Compute previous sample based on the derivatives path
+        prev_sample = sample + sum(
+            coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(state.derivatives))
+        )
+
+        if not return_dict:
+            return (prev_sample, state)
+
+        return FlaxLMSSchedulerOutput(prev_sample=prev_sample, state=state)
+
+    def add_noise(
+        self,
+        state: LMSDiscreteSchedulerState,
+        original_samples: jnp.ndarray,
+        noise: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ) -> jnp.ndarray:
+        sigma = state.sigmas[timesteps].flatten()
+        sigma = broadcast_to_shape_from_left(sigma, noise.shape)
+
+        noisy_samples = original_samples + noise * sigma
+
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_pndm.py b/diffusers/src/diffusers/schedulers/scheduling_pndm.py
new file mode 100644
index 0000000000000000000000000000000000000000..a05e71c3c2255e984a275bd9a69ac4e9f456ae1e
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_pndm.py
@@ -0,0 +1,476 @@
+# Copyright 2024 Zhejiang University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class PNDMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `PNDMScheduler` uses pseudo numerical methods for diffusion models such as the Runge-Kutta and linear multi-step
+    method.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        skip_prk_steps (`bool`, defaults to `False`):
+            Allows the scheduler to skip the Runge-Kutta steps defined in the original paper as being required before
+            PLMS steps.
+        set_alpha_to_one (`bool`, defaults to `False`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process)
+            or `v_prediction` (see section 2.4 of [Imagen Video](https://imagen.research.google/video/paper.pdf)
+            paper).
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        skip_prk_steps: bool = False,
+        set_alpha_to_one: bool = False,
+        prediction_type: str = "epsilon",
+        timestep_spacing: str = "leading",
+        steps_offset: int = 0,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # For now we only support F-PNDM, i.e. the runge-kutta method
+        # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf
+        # mainly at formula (9), (12), (13) and the Algorithm 2.
+        self.pndm_order = 4
+
+        # running values
+        self.cur_model_output = 0
+        self.counter = 0
+        self.cur_sample = None
+        self.ets = []
+
+        # setable values
+        self.num_inference_steps = None
+        self._timesteps = np.arange(0, num_train_timesteps)[::-1].copy()
+        self.prk_timesteps = None
+        self.plms_timesteps = None
+        self.timesteps = None
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+
+        self.num_inference_steps = num_inference_steps
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            self._timesteps = (
+                np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps).round().astype(np.int64)
+            )
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()
+            self._timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            self._timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio))[::-1].astype(
+                np.int64
+            )
+            self._timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        if self.config.skip_prk_steps:
+            # for some models like stable diffusion the prk steps can/should be skipped to
+            # produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation
+            # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51
+            self.prk_timesteps = np.array([])
+            self.plms_timesteps = np.concatenate([self._timesteps[:-1], self._timesteps[-2:-1], self._timesteps[-1:]])[
+                ::-1
+            ].copy()
+        else:
+            prk_timesteps = np.array(self._timesteps[-self.pndm_order :]).repeat(2) + np.tile(
+                np.array([0, self.config.num_train_timesteps // num_inference_steps // 2]), self.pndm_order
+            )
+            self.prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy()
+            self.plms_timesteps = self._timesteps[:-3][
+                ::-1
+            ].copy()  # we copy to avoid having negative strides which are not supported by torch.from_numpy
+
+        timesteps = np.concatenate([self.prk_timesteps, self.plms_timesteps]).astype(np.int64)
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+        self.ets = []
+        self.counter = 0
+        self.cur_model_output = 0
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise), and calls [`~PNDMScheduler.step_prk`]
+        or [`~PNDMScheduler.step_plms`] depending on the internal variable `counter`.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.counter < len(self.prk_timesteps) and not self.config.skip_prk_steps:
+            return self.step_prk(model_output=model_output, timestep=timestep, sample=sample, return_dict=return_dict)
+        else:
+            return self.step_plms(model_output=model_output, timestep=timestep, sample=sample, return_dict=return_dict)
+
+    def step_prk(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the Runge-Kutta method. It performs four forward passes to approximate the solution to the differential
+        equation.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        diff_to_prev = 0 if self.counter % 2 else self.config.num_train_timesteps // self.num_inference_steps // 2
+        prev_timestep = timestep - diff_to_prev
+        timestep = self.prk_timesteps[self.counter // 4 * 4]
+
+        if self.counter % 4 == 0:
+            self.cur_model_output += 1 / 6 * model_output
+            self.ets.append(model_output)
+            self.cur_sample = sample
+        elif (self.counter - 1) % 4 == 0:
+            self.cur_model_output += 1 / 3 * model_output
+        elif (self.counter - 2) % 4 == 0:
+            self.cur_model_output += 1 / 3 * model_output
+        elif (self.counter - 3) % 4 == 0:
+            model_output = self.cur_model_output + 1 / 6 * model_output
+            self.cur_model_output = 0
+
+        # cur_sample should not be `None`
+        cur_sample = self.cur_sample if self.cur_sample is not None else sample
+
+        prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, model_output)
+        self.counter += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def step_plms(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the linear multistep method. It performs one forward pass multiple times to approximate the solution.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if not self.config.skip_prk_steps and len(self.ets) < 3:
+            raise ValueError(
+                f"{self.__class__} can only be run AFTER scheduler has been run "
+                "in 'prk' mode for at least 12 iterations "
+                "See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py "
+                "for more information."
+            )
+
+        prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+        if self.counter != 1:
+            self.ets = self.ets[-3:]
+            self.ets.append(model_output)
+        else:
+            prev_timestep = timestep
+            timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps
+
+        if len(self.ets) == 1 and self.counter == 0:
+            model_output = model_output
+            self.cur_sample = sample
+        elif len(self.ets) == 1 and self.counter == 1:
+            model_output = (model_output + self.ets[-1]) / 2
+            sample = self.cur_sample
+            self.cur_sample = None
+        elif len(self.ets) == 2:
+            model_output = (3 * self.ets[-1] - self.ets[-2]) / 2
+        elif len(self.ets) == 3:
+            model_output = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
+        else:
+            model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4])
+
+        prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output)
+        self.counter += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
+        # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf
+        # this function computes x_(t−δ) using the formula of (9)
+        # Note that x_t needs to be added to both sides of the equation
+
+        # Notation (<variable name> -> <name in paper>
+        # alpha_prod_t -> α_t
+        # alpha_prod_t_prev -> α_(t−δ)
+        # beta_prod_t -> (1 - α_t)
+        # beta_prod_t_prev -> (1 - α_(t−δ))
+        # sample -> x_t
+        # model_output -> e_θ(x_t, t)
+        # prev_sample -> x_(t−δ)
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        if self.config.prediction_type == "v_prediction":
+            model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        elif self.config.prediction_type != "epsilon":
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`"
+            )
+
+        # corresponds to (α_(t−δ) - α_t) divided by
+        # denominator of x_t in formula (9) and plus 1
+        # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) =
+        # sqrt(α_(t−δ)) / sqrt(α_t))
+        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
+
+        # corresponds to denominator of e_θ(x_t, t) in formula (9)
+        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
+            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
+        ) ** (0.5)
+
+        # full formula (9)
+        prev_sample = (
+            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
+        )
+
+        return prev_sample
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_pndm_flax.py b/diffusers/src/diffusers/schedulers/scheduling_pndm_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ac3ba5ca1ba1f4b4ffcdc7d0261c5877a4dddd0
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_pndm_flax.py
@@ -0,0 +1,509 @@
+# Copyright 2024 Zhejiang University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax
+import jax.numpy as jnp
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import (
+    CommonSchedulerState,
+    FlaxKarrasDiffusionSchedulers,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    add_noise_common,
+)
+
+
+@flax.struct.dataclass
+class PNDMSchedulerState:
+    common: CommonSchedulerState
+    final_alpha_cumprod: jnp.ndarray
+
+    # setable values
+    init_noise_sigma: jnp.ndarray
+    timesteps: jnp.ndarray
+    num_inference_steps: Optional[int] = None
+    prk_timesteps: Optional[jnp.ndarray] = None
+    plms_timesteps: Optional[jnp.ndarray] = None
+
+    # running values
+    cur_model_output: Optional[jnp.ndarray] = None
+    counter: Optional[jnp.int32] = None
+    cur_sample: Optional[jnp.ndarray] = None
+    ets: Optional[jnp.ndarray] = None
+
+    @classmethod
+    def create(
+        cls,
+        common: CommonSchedulerState,
+        final_alpha_cumprod: jnp.ndarray,
+        init_noise_sigma: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ):
+        return cls(
+            common=common,
+            final_alpha_cumprod=final_alpha_cumprod,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
+
+
+@dataclass
+class FlaxPNDMSchedulerOutput(FlaxSchedulerOutput):
+    state: PNDMSchedulerState
+
+
+class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
+    """
+    Pseudo numerical methods for diffusion models (PNDM) proposes using more advanced ODE integration techniques,
+    namely Runge-Kutta method and a linear multi-step method.
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    For more details, see the original paper: https://arxiv.org/abs/2202.09778
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`jnp.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        skip_prk_steps (`bool`):
+            allows the scheduler to skip the Runge-Kutta steps that are defined in the original paper as being required
+            before plms steps; defaults to `False`.
+        set_alpha_to_one (`bool`, default `False`):
+            each diffusion step uses the value of alphas product at that step and at the previous one. For the final
+            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the value of alpha at step 0.
+        steps_offset (`int`, default `0`):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, default `epsilon`, optional):
+            prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+            process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
+            https://imagen.research.google/video/paper.pdf)
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
+    """
+
+    _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers]
+
+    dtype: jnp.dtype
+    pndm_order: int
+
+    @property
+    def has_state(self):
+        return True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[jnp.ndarray] = None,
+        skip_prk_steps: bool = False,
+        set_alpha_to_one: bool = False,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        dtype: jnp.dtype = jnp.float32,
+    ):
+        self.dtype = dtype
+
+        # For now we only support F-PNDM, i.e. the runge-kutta method
+        # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf
+        # mainly at formula (9), (12), (13) and the Algorithm 2.
+        self.pndm_order = 4
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> PNDMSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        final_alpha_cumprod = (
+            jnp.array(1.0, dtype=self.dtype) if self.config.set_alpha_to_one else common.alphas_cumprod[0]
+        )
+
+        # standard deviation of the initial noise distribution
+        init_noise_sigma = jnp.array(1.0, dtype=self.dtype)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+
+        return PNDMSchedulerState.create(
+            common=common,
+            final_alpha_cumprod=final_alpha_cumprod,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
+
+    def set_timesteps(self, state: PNDMSchedulerState, num_inference_steps: int, shape: Tuple) -> PNDMSchedulerState:
+        """
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            state (`PNDMSchedulerState`):
+                the `FlaxPNDMScheduler` state data class instance.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+            shape (`Tuple`):
+                the shape of the samples to be generated.
+        """
+
+        step_ratio = self.config.num_train_timesteps // num_inference_steps
+        # creates integer timesteps by multiplying by ratio
+        # rounding to avoid issues when num_inference_step is power of 3
+        _timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round() + self.config.steps_offset
+
+        if self.config.skip_prk_steps:
+            # for some models like stable diffusion the prk steps can/should be skipped to
+            # produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation
+            # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51
+
+            prk_timesteps = jnp.array([], dtype=jnp.int32)
+            plms_timesteps = jnp.concatenate([_timesteps[:-1], _timesteps[-2:-1], _timesteps[-1:]])[::-1]
+
+        else:
+            prk_timesteps = _timesteps[-self.pndm_order :].repeat(2) + jnp.tile(
+                jnp.array([0, self.config.num_train_timesteps // num_inference_steps // 2], dtype=jnp.int32),
+                self.pndm_order,
+            )
+
+            prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1]
+            plms_timesteps = _timesteps[:-3][::-1]
+
+        timesteps = jnp.concatenate([prk_timesteps, plms_timesteps])
+
+        # initial running values
+
+        cur_model_output = jnp.zeros(shape, dtype=self.dtype)
+        counter = jnp.int32(0)
+        cur_sample = jnp.zeros(shape, dtype=self.dtype)
+        ets = jnp.zeros((4,) + shape, dtype=self.dtype)
+
+        return state.replace(
+            timesteps=timesteps,
+            num_inference_steps=num_inference_steps,
+            prk_timesteps=prk_timesteps,
+            plms_timesteps=plms_timesteps,
+            cur_model_output=cur_model_output,
+            counter=counter,
+            cur_sample=cur_sample,
+            ets=ets,
+        )
+
+    def scale_model_input(
+        self, state: PNDMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
+    ) -> jnp.ndarray:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+            sample (`jnp.ndarray`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `jnp.ndarray`: scaled input sample
+        """
+        return sample
+
+    def step(
+        self,
+        state: PNDMSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+        return_dict: bool = True,
+    ) -> Union[FlaxPNDMSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        This function calls `step_prk()` or `step_plms()` depending on the internal variable `counter`.
+
+        Args:
+            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            return_dict (`bool`): option for returning tuple rather than FlaxPNDMSchedulerOutput class
+
+        Returns:
+            [`FlaxPNDMSchedulerOutput`] or `tuple`: [`FlaxPNDMSchedulerOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+
+        if state.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.config.skip_prk_steps:
+            prev_sample, state = self.step_plms(state, model_output, timestep, sample)
+        else:
+            prk_prev_sample, prk_state = self.step_prk(state, model_output, timestep, sample)
+            plms_prev_sample, plms_state = self.step_plms(state, model_output, timestep, sample)
+
+            cond = state.counter < len(state.prk_timesteps)
+
+            prev_sample = jax.lax.select(cond, prk_prev_sample, plms_prev_sample)
+
+            state = state.replace(
+                cur_model_output=jax.lax.select(cond, prk_state.cur_model_output, plms_state.cur_model_output),
+                ets=jax.lax.select(cond, prk_state.ets, plms_state.ets),
+                cur_sample=jax.lax.select(cond, prk_state.cur_sample, plms_state.cur_sample),
+                counter=jax.lax.select(cond, prk_state.counter, plms_state.counter),
+            )
+
+        if not return_dict:
+            return (prev_sample, state)
+
+        return FlaxPNDMSchedulerOutput(prev_sample=prev_sample, state=state)
+
+    def step_prk(
+        self,
+        state: PNDMSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+    ) -> Union[FlaxPNDMSchedulerOutput, Tuple]:
+        """
+        Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the
+        solution to the differential equation.
+
+        Args:
+            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            return_dict (`bool`): option for returning tuple rather than FlaxPNDMSchedulerOutput class
+
+        Returns:
+            [`FlaxPNDMSchedulerOutput`] or `tuple`: [`FlaxPNDMSchedulerOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+
+        if state.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        diff_to_prev = jnp.where(
+            state.counter % 2, 0, self.config.num_train_timesteps // state.num_inference_steps // 2
+        )
+        prev_timestep = timestep - diff_to_prev
+        timestep = state.prk_timesteps[state.counter // 4 * 4]
+
+        model_output = jax.lax.select(
+            (state.counter % 4) != 3,
+            model_output,  # remainder 0, 1, 2
+            state.cur_model_output + 1 / 6 * model_output,  # remainder 3
+        )
+
+        state = state.replace(
+            cur_model_output=jax.lax.select_n(
+                state.counter % 4,
+                state.cur_model_output + 1 / 6 * model_output,  # remainder 0
+                state.cur_model_output + 1 / 3 * model_output,  # remainder 1
+                state.cur_model_output + 1 / 3 * model_output,  # remainder 2
+                jnp.zeros_like(state.cur_model_output),  # remainder 3
+            ),
+            ets=jax.lax.select(
+                (state.counter % 4) == 0,
+                state.ets.at[0:3].set(state.ets[1:4]).at[3].set(model_output),  # remainder 0
+                state.ets,  # remainder 1, 2, 3
+            ),
+            cur_sample=jax.lax.select(
+                (state.counter % 4) == 0,
+                sample,  # remainder 0
+                state.cur_sample,  # remainder 1, 2, 3
+            ),
+        )
+
+        cur_sample = state.cur_sample
+        prev_sample = self._get_prev_sample(state, cur_sample, timestep, prev_timestep, model_output)
+        state = state.replace(counter=state.counter + 1)
+
+        return (prev_sample, state)
+
+    def step_plms(
+        self,
+        state: PNDMSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+    ) -> Union[FlaxPNDMSchedulerOutput, Tuple]:
+        """
+        Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple
+        times to approximate the solution.
+
+        Args:
+            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            return_dict (`bool`): option for returning tuple rather than FlaxPNDMSchedulerOutput class
+
+        Returns:
+            [`FlaxPNDMSchedulerOutput`] or `tuple`: [`FlaxPNDMSchedulerOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+
+        if state.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # NOTE: There is no way to check in the jitted runtime if the prk mode was ran before
+
+        prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps
+        prev_timestep = jnp.where(prev_timestep > 0, prev_timestep, 0)
+
+        # Reference:
+        # if state.counter != 1:
+        #     state.ets.append(model_output)
+        # else:
+        #     prev_timestep = timestep
+        #     timestep = timestep + self.config.num_train_timesteps // state.num_inference_steps
+
+        prev_timestep = jnp.where(state.counter == 1, timestep, prev_timestep)
+        timestep = jnp.where(
+            state.counter == 1, timestep + self.config.num_train_timesteps // state.num_inference_steps, timestep
+        )
+
+        # Reference:
+        # if len(state.ets) == 1 and state.counter == 0:
+        #     model_output = model_output
+        #     state.cur_sample = sample
+        # elif len(state.ets) == 1 and state.counter == 1:
+        #     model_output = (model_output + state.ets[-1]) / 2
+        #     sample = state.cur_sample
+        #     state.cur_sample = None
+        # elif len(state.ets) == 2:
+        #     model_output = (3 * state.ets[-1] - state.ets[-2]) / 2
+        # elif len(state.ets) == 3:
+        #     model_output = (23 * state.ets[-1] - 16 * state.ets[-2] + 5 * state.ets[-3]) / 12
+        # else:
+        #     model_output = (1 / 24) * (55 * state.ets[-1] - 59 * state.ets[-2] + 37 * state.ets[-3] - 9 * state.ets[-4])
+
+        state = state.replace(
+            ets=jax.lax.select(
+                state.counter != 1,
+                state.ets.at[0:3].set(state.ets[1:4]).at[3].set(model_output),  # counter != 1
+                state.ets,  # counter 1
+            ),
+            cur_sample=jax.lax.select(
+                state.counter != 1,
+                sample,  # counter != 1
+                state.cur_sample,  # counter 1
+            ),
+        )
+
+        state = state.replace(
+            cur_model_output=jax.lax.select_n(
+                jnp.clip(state.counter, 0, 4),
+                model_output,  # counter 0
+                (model_output + state.ets[-1]) / 2,  # counter 1
+                (3 * state.ets[-1] - state.ets[-2]) / 2,  # counter 2
+                (23 * state.ets[-1] - 16 * state.ets[-2] + 5 * state.ets[-3]) / 12,  # counter 3
+                (1 / 24)
+                * (55 * state.ets[-1] - 59 * state.ets[-2] + 37 * state.ets[-3] - 9 * state.ets[-4]),  # counter >= 4
+            ),
+        )
+
+        sample = state.cur_sample
+        model_output = state.cur_model_output
+        prev_sample = self._get_prev_sample(state, sample, timestep, prev_timestep, model_output)
+        state = state.replace(counter=state.counter + 1)
+
+        return (prev_sample, state)
+
+    def _get_prev_sample(self, state: PNDMSchedulerState, sample, timestep, prev_timestep, model_output):
+        # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf
+        # this function computes x_(t−δ) using the formula of (9)
+        # Note that x_t needs to be added to both sides of the equation
+
+        # Notation (<variable name> -> <name in paper>
+        # alpha_prod_t -> α_t
+        # alpha_prod_t_prev -> α_(t−δ)
+        # beta_prod_t -> (1 - α_t)
+        # beta_prod_t_prev -> (1 - α_(t−δ))
+        # sample -> x_t
+        # model_output -> e_θ(x_t, t)
+        # prev_sample -> x_(t−δ)
+        alpha_prod_t = state.common.alphas_cumprod[timestep]
+        alpha_prod_t_prev = jnp.where(
+            prev_timestep >= 0, state.common.alphas_cumprod[prev_timestep], state.final_alpha_cumprod
+        )
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        if self.config.prediction_type == "v_prediction":
+            model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        elif self.config.prediction_type != "epsilon":
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`"
+            )
+
+        # corresponds to (α_(t−δ) - α_t) divided by
+        # denominator of x_t in formula (9) and plus 1
+        # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) =
+        # sqrt(α_(t−δ)) / sqrt(α_t))
+        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
+
+        # corresponds to denominator of e_θ(x_t, t) in formula (9)
+        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
+            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
+        ) ** (0.5)
+
+        # full formula (9)
+        prev_sample = (
+            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
+        )
+
+        return prev_sample
+
+    def add_noise(
+        self,
+        state: PNDMSchedulerState,
+        original_samples: jnp.ndarray,
+        noise: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ) -> jnp.ndarray:
+        return add_noise_common(state.common, original_samples, noise, timesteps)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_repaint.py b/diffusers/src/diffusers/schedulers/scheduling_repaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..97665bb5277b35bf19663ee12a59a1daa00a9edb
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_repaint.py
@@ -0,0 +1,361 @@
+# Copyright 2024 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class RePaintSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample (x_{0}) based on the model output from
+             the current timestep. `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: torch.Tensor
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class RePaintScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `RePaintScheduler` is a scheduler for DDPM inpainting inside a given mask.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, `squaredcos_cap_v2`, or `sigmoid`.
+        eta (`float`):
+            The weight of noise for added noise in diffusion step. If its value is between 0.0 and 1.0 it corresponds
+            to the DDIM scheduler, and if its value is between -0.0 and 1.0 it corresponds to the DDPM scheduler.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample between -1 and 1 for numerical stability.
+
+    """
+
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        eta: float = 0.0,
+        trained_betas: Optional[np.ndarray] = None,
+        clip_sample: bool = True,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.from_numpy(trained_betas)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        elif beta_schedule == "sigmoid":
+            # GeoDiff sigmoid schedule
+            betas = torch.linspace(-6, 6, num_train_timesteps)
+            self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        self.one = torch.tensor(1.0)
+
+        self.final_alpha_cumprod = torch.tensor(1.0)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+        self.eta = eta
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        jump_length: int = 10,
+        jump_n_sample: int = 10,
+        device: Union[str, torch.device] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model. If used,
+                `timesteps` must be `None`.
+            jump_length (`int`, defaults to 10):
+                The number of steps taken forward in time before going backward in time for a single jump (“j” in
+                RePaint paper). Take a look at Figure 9 and 10 in the paper.
+            jump_n_sample (`int`, defaults to 10):
+                The number of times to make a forward time jump for a given chosen time sample. Take a look at Figure 9
+                and 10 in the paper.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+
+        """
+        num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps)
+        self.num_inference_steps = num_inference_steps
+
+        timesteps = []
+
+        jumps = {}
+        for j in range(0, num_inference_steps - jump_length, jump_length):
+            jumps[j] = jump_n_sample - 1
+
+        t = num_inference_steps
+        while t >= 1:
+            t = t - 1
+            timesteps.append(t)
+
+            if jumps.get(t, 0) > 0:
+                jumps[t] = jumps[t] - 1
+                for _ in range(jump_length):
+                    t = t + 1
+                    timesteps.append(t)
+
+        timesteps = np.array(timesteps) * (self.config.num_train_timesteps // self.num_inference_steps)
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def _get_variance(self, t):
+        prev_timestep = t - self.config.num_train_timesteps // self.num_inference_steps
+
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        # For t > 0, compute predicted variance βt (see formula (6) and (7) from
+        # https://arxiv.org/pdf/2006.11239.pdf) and sample from it to get
+        # previous sample x_{t-1} ~ N(pred_prev_sample, variance) == add
+        # variance to pred_sample
+        # Is equivalent to formula (16) in https://arxiv.org/pdf/2010.02502.pdf
+        # without eta.
+        # variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        original_image: torch.Tensor,
+        mask: torch.Tensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[RePaintSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            original_image (`torch.Tensor`):
+                The original image to inpaint on.
+            mask (`torch.Tensor`):
+                The mask where a value of 0.0 indicates which part of the original image to inpaint.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_repaint.RePaintSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_repaint.RePaintSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_repaint.RePaintSchedulerOutput`] is returned,
+                otherwise a tuple is returned where the first element is the sample tensor.
+
+        """
+        t = timestep
+        prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+        # 1. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+
+        # 2. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
+
+        # 3. Clip "predicted x_0"
+        if self.config.clip_sample:
+            pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+        # We choose to follow RePaint Algorithm 1 to get x_{t-1}, however we
+        # substitute formula (7) in the algorithm coming from DDPM paper
+        # (formula (4) Algorithm 2 - Sampling) with formula (12) from DDIM paper.
+        # DDIM schedule gives the same results as DDPM with eta = 1.0
+        # Noise is being reused in 7. and 8., but no impact on quality has
+        # been observed.
+
+        # 5. Add noise
+        device = model_output.device
+        noise = randn_tensor(model_output.shape, generator=generator, device=device, dtype=model_output.dtype)
+        std_dev_t = self.eta * self._get_variance(timestep) ** 0.5
+
+        variance = 0
+        if t > 0 and self.eta > 0:
+            variance = std_dev_t * noise
+
+        # 6. compute "direction pointing to x_t" of formula (12)
+        # from https://arxiv.org/pdf/2010.02502.pdf
+        pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * model_output
+
+        # 7. compute x_{t-1} of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        prev_unknown_part = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction + variance
+
+        # 8. Algorithm 1 Line 5 https://arxiv.org/pdf/2201.09865.pdf
+        prev_known_part = (alpha_prod_t_prev**0.5) * original_image + ((1 - alpha_prod_t_prev) ** 0.5) * noise
+
+        # 9. Algorithm 1 Line 8 https://arxiv.org/pdf/2201.09865.pdf
+        pred_prev_sample = mask * prev_known_part + (1.0 - mask) * prev_unknown_part
+
+        if not return_dict:
+            return (
+                pred_prev_sample,
+                pred_original_sample,
+            )
+
+        return RePaintSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+
+    def undo_step(self, sample, timestep, generator=None):
+        n = self.config.num_train_timesteps // self.num_inference_steps
+
+        for i in range(n):
+            beta = self.betas[timestep + i]
+            if sample.device.type == "mps":
+                # randn does not work reproducibly on mps
+                noise = randn_tensor(sample.shape, dtype=sample.dtype, generator=generator)
+                noise = noise.to(sample.device)
+            else:
+                noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype)
+
+            # 10. Algorithm 1 Line 10 https://arxiv.org/pdf/2201.09865.pdf
+            sample = (1 - beta) ** 0.5 * sample + beta**0.5 * noise
+
+        return sample
+
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.")
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_sasolver.py b/diffusers/src/diffusers/schedulers/scheduling_sasolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad79c69fc714b9a60f6b88279b31f9b9ce634b26
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_sasolver.py
@@ -0,0 +1,1155 @@
+# Copyright 2024 Shuchen Xue, etc. in University of Chinese Academy of Sciences Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: check https://arxiv.org/abs/2309.05019
+# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+
+import math
+from typing import Callable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import deprecate
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class SASolverScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `SASolverScheduler` is a fast dedicated high-order solver for diffusion SDEs.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        predictor_order (`int`, defaults to 2):
+            The predictor order which can be `1` or `2` or `3` or '4'. It is recommended to use `predictor_order=2` for
+            guided sampling, and `predictor_order=3` for unconditional sampling.
+        corrector_order (`int`, defaults to 2):
+            The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for
+            guided sampling, and `corrector_order=3` for unconditional sampling.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        tau_func (`Callable`, *optional*):
+            Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`.
+            SA-Solver will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample
+            from vanilla diffusion SDE if tau_func is set to `lambda t: 1`. For more details, please check
+            https://arxiv.org/abs/2309.05019
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++"`.
+        algorithm_type (`str`, defaults to `data_prediction`):
+            Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use
+            `data_prediction` with `solver_order=2` for guided sampling like in Stable Diffusion.
+        lower_order_final (`bool`, defaults to `True`):
+            Whether to use lower-order solvers in the final steps. Default = True.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        lambda_min_clipped (`float`, defaults to `-inf`):
+            Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
+            cosine (`squaredcos_cap_v2`) noise schedule.
+        variance_type (`str`, *optional*):
+            Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
+            contains the predicted Gaussian variance.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        predictor_order: int = 2,
+        corrector_order: int = 2,
+        prediction_type: str = "epsilon",
+        tau_func: Optional[Callable] = None,
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "data_prediction",
+        lower_order_final: bool = True,
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        lambda_min_clipped: float = -float("inf"),
+        variance_type: Optional[str] = None,
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = (
+                torch.linspace(
+                    beta_start**0.5,
+                    beta_end**0.5,
+                    num_train_timesteps,
+                    dtype=torch.float32,
+                )
+                ** 2
+            )
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+        self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        if algorithm_type not in ["data_prediction", "noise_prediction"]:
+            raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}")
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.timestep_list = [None] * max(predictor_order, corrector_order - 1)
+        self.model_outputs = [None] * max(predictor_order, corrector_order - 1)
+
+        if tau_func is None:
+            self.tau_func = lambda t: 1 if t >= 200 and t <= 800 else 0
+        else:
+            self.tau_func = tau_func
+        self.predict_x0 = algorithm_type == "data_prediction"
+        self.lower_order_nums = 0
+        self.last_sample = None
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        # Clipping the minimum of all lambda(t) for numerical stability.
+        # This is critical for cosine (squaredcos_cap_v2) noise schedule.
+        clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
+        last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item()
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].copy().astype(np.int64)
+            )
+
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = last_timestep // (num_inference_steps + 1)
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        if self.config.use_karras_sigmas:
+            log_sigmas = np.log(sigmas)
+            sigmas = np.flip(sigmas).copy()
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+            sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+        else:
+            sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+            sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
+            sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
+
+        self.sigmas = torch.from_numpy(sigmas)
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
+
+        self.num_inference_steps = len(timesteps)
+        self.model_outputs = [
+            None,
+        ] * max(self.config.predictor_order, self.config.corrector_order - 1)
+        self.lower_order_nums = 0
+        self.last_sample = None
+
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
+        sigma_t = sigma * alpha_t
+
+        return alpha_t, sigma_t
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs.
+        Noise_prediction is designed to discretize an integral of the noise prediction model, and data_prediction is
+        designed to discretize an integral of the data prediction model.
+
+        <Tip>
+
+        The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both
+        noise prediction and data prediction models.
+
+        </Tip>
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The converted model output.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError("missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma = self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        # SA-Solver_data_prediction needs to solve an integral of the data prediction model.
+        if self.config.algorithm_type in ["data_prediction"]:
+            if self.config.prediction_type == "epsilon":
+                # SA-Solver only needs the "mean" output.
+                if self.config.variance_type in ["learned", "learned_range"]:
+                    model_output = model_output[:, :3]
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            elif self.config.prediction_type == "sample":
+                x0_pred = model_output
+            elif self.config.prediction_type == "v_prediction":
+                x0_pred = alpha_t * sample - sigma_t * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the SASolverScheduler."
+                )
+
+            if self.config.thresholding:
+                x0_pred = self._threshold_sample(x0_pred)
+
+            return x0_pred
+
+        # SA-Solver_noise_prediction needs to solve an integral of the noise prediction model.
+        elif self.config.algorithm_type in ["noise_prediction"]:
+            if self.config.prediction_type == "epsilon":
+                # SA-Solver only needs the "mean" output.
+                if self.config.variance_type in ["learned", "learned_range"]:
+                    epsilon = model_output[:, :3]
+                else:
+                    epsilon = model_output
+            elif self.config.prediction_type == "sample":
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+            elif self.config.prediction_type == "v_prediction":
+                epsilon = alpha_t * model_output + sigma_t * sample
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the SASolverScheduler."
+                )
+
+            if self.config.thresholding:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = (sample - sigma_t * epsilon) / alpha_t
+                x0_pred = self._threshold_sample(x0_pred)
+                epsilon = (sample - alpha_t * x0_pred) / sigma_t
+
+            return epsilon
+
+    def get_coefficients_exponential_negative(self, order, interval_start, interval_end):
+        """
+        Calculate the integral of exp(-x) * x^order dx from interval_start to interval_end
+        """
+        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"
+
+        if order == 0:
+            return torch.exp(-interval_end) * (torch.exp(interval_end - interval_start) - 1)
+        elif order == 1:
+            return torch.exp(-interval_end) * (
+                (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1)
+            )
+        elif order == 2:
+            return torch.exp(-interval_end) * (
+                (interval_start**2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start)
+                - (interval_end**2 + 2 * interval_end + 2)
+            )
+        elif order == 3:
+            return torch.exp(-interval_end) * (
+                (interval_start**3 + 3 * interval_start**2 + 6 * interval_start + 6)
+                * torch.exp(interval_end - interval_start)
+                - (interval_end**3 + 3 * interval_end**2 + 6 * interval_end + 6)
+            )
+
+    def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
+        """
+        Calculate the integral of exp(x(1+tau^2)) * x^order dx from interval_start to interval_end
+        """
+        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"
+
+        # after change of variable(cov)
+        interval_end_cov = (1 + tau**2) * interval_end
+        interval_start_cov = (1 + tau**2) * interval_start
+
+        if order == 0:
+            return (
+                torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / (1 + tau**2)
+            )
+        elif order == 1:
+            return (
+                torch.exp(interval_end_cov)
+                * (
+                    (interval_end_cov - 1)
+                    - (interval_start_cov - 1) * torch.exp(-(interval_end_cov - interval_start_cov))
+                )
+                / ((1 + tau**2) ** 2)
+            )
+        elif order == 2:
+            return (
+                torch.exp(interval_end_cov)
+                * (
+                    (interval_end_cov**2 - 2 * interval_end_cov + 2)
+                    - (interval_start_cov**2 - 2 * interval_start_cov + 2)
+                    * torch.exp(-(interval_end_cov - interval_start_cov))
+                )
+                / ((1 + tau**2) ** 3)
+            )
+        elif order == 3:
+            return (
+                torch.exp(interval_end_cov)
+                * (
+                    (interval_end_cov**3 - 3 * interval_end_cov**2 + 6 * interval_end_cov - 6)
+                    - (interval_start_cov**3 - 3 * interval_start_cov**2 + 6 * interval_start_cov - 6)
+                    * torch.exp(-(interval_end_cov - interval_start_cov))
+                )
+                / ((1 + tau**2) ** 4)
+            )
+
+    def lagrange_polynomial_coefficient(self, order, lambda_list):
+        """
+        Calculate the coefficient of lagrange polynomial
+        """
+
+        assert order in [0, 1, 2, 3]
+        assert order == len(lambda_list) - 1
+        if order == 0:
+            return [[1]]
+        elif order == 1:
+            return [
+                [
+                    1 / (lambda_list[0] - lambda_list[1]),
+                    -lambda_list[1] / (lambda_list[0] - lambda_list[1]),
+                ],
+                [
+                    1 / (lambda_list[1] - lambda_list[0]),
+                    -lambda_list[0] / (lambda_list[1] - lambda_list[0]),
+                ],
+            ]
+        elif order == 2:
+            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2])
+            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2])
+            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1])
+            return [
+                [
+                    1 / denominator1,
+                    (-lambda_list[1] - lambda_list[2]) / denominator1,
+                    lambda_list[1] * lambda_list[2] / denominator1,
+                ],
+                [
+                    1 / denominator2,
+                    (-lambda_list[0] - lambda_list[2]) / denominator2,
+                    lambda_list[0] * lambda_list[2] / denominator2,
+                ],
+                [
+                    1 / denominator3,
+                    (-lambda_list[0] - lambda_list[1]) / denominator3,
+                    lambda_list[0] * lambda_list[1] / denominator3,
+                ],
+            ]
+        elif order == 3:
+            denominator1 = (
+                (lambda_list[0] - lambda_list[1])
+                * (lambda_list[0] - lambda_list[2])
+                * (lambda_list[0] - lambda_list[3])
+            )
+            denominator2 = (
+                (lambda_list[1] - lambda_list[0])
+                * (lambda_list[1] - lambda_list[2])
+                * (lambda_list[1] - lambda_list[3])
+            )
+            denominator3 = (
+                (lambda_list[2] - lambda_list[0])
+                * (lambda_list[2] - lambda_list[1])
+                * (lambda_list[2] - lambda_list[3])
+            )
+            denominator4 = (
+                (lambda_list[3] - lambda_list[0])
+                * (lambda_list[3] - lambda_list[1])
+                * (lambda_list[3] - lambda_list[2])
+            )
+            return [
+                [
+                    1 / denominator1,
+                    (-lambda_list[1] - lambda_list[2] - lambda_list[3]) / denominator1,
+                    (
+                        lambda_list[1] * lambda_list[2]
+                        + lambda_list[1] * lambda_list[3]
+                        + lambda_list[2] * lambda_list[3]
+                    )
+                    / denominator1,
+                    (-lambda_list[1] * lambda_list[2] * lambda_list[3]) / denominator1,
+                ],
+                [
+                    1 / denominator2,
+                    (-lambda_list[0] - lambda_list[2] - lambda_list[3]) / denominator2,
+                    (
+                        lambda_list[0] * lambda_list[2]
+                        + lambda_list[0] * lambda_list[3]
+                        + lambda_list[2] * lambda_list[3]
+                    )
+                    / denominator2,
+                    (-lambda_list[0] * lambda_list[2] * lambda_list[3]) / denominator2,
+                ],
+                [
+                    1 / denominator3,
+                    (-lambda_list[0] - lambda_list[1] - lambda_list[3]) / denominator3,
+                    (
+                        lambda_list[0] * lambda_list[1]
+                        + lambda_list[0] * lambda_list[3]
+                        + lambda_list[1] * lambda_list[3]
+                    )
+                    / denominator3,
+                    (-lambda_list[0] * lambda_list[1] * lambda_list[3]) / denominator3,
+                ],
+                [
+                    1 / denominator4,
+                    (-lambda_list[0] - lambda_list[1] - lambda_list[2]) / denominator4,
+                    (
+                        lambda_list[0] * lambda_list[1]
+                        + lambda_list[0] * lambda_list[2]
+                        + lambda_list[1] * lambda_list[2]
+                    )
+                    / denominator4,
+                    (-lambda_list[0] * lambda_list[1] * lambda_list[2]) / denominator4,
+                ],
+            ]
+
+    def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
+        assert order in [1, 2, 3, 4]
+        assert order == len(lambda_list), "the length of lambda list must be equal to the order"
+        coefficients = []
+        lagrange_coefficient = self.lagrange_polynomial_coefficient(order - 1, lambda_list)
+        for i in range(order):
+            coefficient = 0
+            for j in range(order):
+                if self.predict_x0:
+                    coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_positive(
+                        order - 1 - j, interval_start, interval_end, tau
+                    )
+                else:
+                    coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_negative(
+                        order - 1 - j, interval_start, interval_end
+                    )
+            coefficients.append(coefficient)
+        assert len(coefficients) == order, "the length of coefficients does not match the order"
+        return coefficients
+
+    def stochastic_adams_bashforth_update(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor,
+        noise: torch.Tensor,
+        order: int,
+        tau: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the SA-Predictor.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model at the current timestep.
+            prev_timestep (`int`):
+                The previous discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            order (`int`):
+                The order of SA-Predictor at this timestep.
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if noise is None:
+            if len(args) > 2:
+                noise = args[2]
+            else:
+                raise ValueError(" missing `noise` as a required keyward argument")
+        if order is None:
+            if len(args) > 3:
+                order = args[3]
+            else:
+                raise ValueError(" missing `order` as a required keyward argument")
+        if tau is None:
+            if len(args) > 4:
+                tau = args[4]
+            else:
+                raise ValueError(" missing `tau` as a required keyward argument")
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+        model_output_list = self.model_outputs
+        sigma_t, sigma_s0 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+        )
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+
+        gradient_part = torch.zeros_like(sample)
+        h = lambda_t - lambda_s0
+        lambda_list = []
+
+        for i in range(order):
+            si = self.step_index - i
+            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
+            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
+            lambda_list.append(lambda_si)
+
+        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)
+
+        x = sample
+
+        if self.predict_x0:
+            if (
+                order == 2
+            ):  ## if order = 2 we do a modification that does not influence the convergence order similar to unipc. Note: This is used only for few steps sampling.
+                # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
+                # ODE case
+                # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
+                # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
+                temp_sigma = self.sigmas[self.step_index - 1]
+                temp_alpha_s, temp_sigma_s = self._sigma_to_alpha_sigma_t(temp_sigma)
+                temp_lambda_s = torch.log(temp_alpha_s) - torch.log(temp_sigma_s)
+                gradient_coefficients[0] += (
+                    1.0
+                    * torch.exp((1 + tau**2) * lambda_t)
+                    * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2))
+                    / (lambda_s0 - temp_lambda_s)
+                )
+                gradient_coefficients[1] -= (
+                    1.0
+                    * torch.exp((1 + tau**2) * lambda_t)
+                    * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2))
+                    / (lambda_s0 - temp_lambda_s)
+                )
+
+        for i in range(order):
+            if self.predict_x0:
+                gradient_part += (
+                    (1 + tau**2)
+                    * sigma_t
+                    * torch.exp(-(tau**2) * lambda_t)
+                    * gradient_coefficients[i]
+                    * model_output_list[-(i + 1)]
+                )
+            else:
+                gradient_part += -(1 + tau**2) * alpha_t * gradient_coefficients[i] * model_output_list[-(i + 1)]
+
+        if self.predict_x0:
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * noise
+        else:
+            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
+
+        if self.predict_x0:
+            x_t = torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
+        else:
+            x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part
+
+        x_t = x_t.to(x.dtype)
+        return x_t
+
+    def stochastic_adams_moulton_update(
+        self,
+        this_model_output: torch.Tensor,
+        *args,
+        last_sample: torch.Tensor,
+        last_noise: torch.Tensor,
+        this_sample: torch.Tensor,
+        order: int,
+        tau: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the SA-Corrector.
+
+        Args:
+            this_model_output (`torch.Tensor`):
+                The model outputs at `x_t`.
+            this_timestep (`int`):
+                The current timestep `t`.
+            last_sample (`torch.Tensor`):
+                The generated sample before the last predictor `x_{t-1}`.
+            this_sample (`torch.Tensor`):
+                The generated sample after the last predictor `x_{t}`.
+            order (`int`):
+                The order of SA-Corrector at this step.
+
+        Returns:
+            `torch.Tensor`:
+                The corrected sample tensor at the current timestep.
+        """
+
+        this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None)
+        if last_sample is None:
+            if len(args) > 1:
+                last_sample = args[1]
+            else:
+                raise ValueError(" missing`last_sample` as a required keyward argument")
+        if last_noise is None:
+            if len(args) > 2:
+                last_noise = args[2]
+            else:
+                raise ValueError(" missing`last_noise` as a required keyward argument")
+        if this_sample is None:
+            if len(args) > 3:
+                this_sample = args[3]
+            else:
+                raise ValueError(" missing`this_sample` as a required keyward argument")
+        if order is None:
+            if len(args) > 4:
+                order = args[4]
+            else:
+                raise ValueError(" missing`order` as a required keyward argument")
+        if tau is None:
+            if len(args) > 5:
+                tau = args[5]
+            else:
+                raise ValueError(" missing`tau` as a required keyward argument")
+        if this_timestep is not None:
+            deprecate(
+                "this_timestep",
+                "1.0.0",
+                "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        model_output_list = self.model_outputs
+        sigma_t, sigma_s0 = (
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+        )
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        gradient_part = torch.zeros_like(this_sample)
+        h = lambda_t - lambda_s0
+        lambda_list = []
+        for i in range(order):
+            si = self.step_index - i
+            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
+            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
+            lambda_list.append(lambda_si)
+
+        model_prev_list = model_output_list + [this_model_output]
+
+        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)
+
+        x = last_sample
+
+        if self.predict_x0:
+            if (
+                order == 2
+            ):  ## if order = 2 we do a modification that does not influence the convergence order similar to UniPC. Note: This is used only for few steps sampling.
+                # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
+                # ODE case
+                # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h)
+                # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h)
+                gradient_coefficients[0] += (
+                    1.0
+                    * torch.exp((1 + tau**2) * lambda_t)
+                    * (h / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2 * h))
+                )
+                gradient_coefficients[1] -= (
+                    1.0
+                    * torch.exp((1 + tau**2) * lambda_t)
+                    * (h / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2 * h))
+                )
+
+        for i in range(order):
+            if self.predict_x0:
+                gradient_part += (
+                    (1 + tau**2)
+                    * sigma_t
+                    * torch.exp(-(tau**2) * lambda_t)
+                    * gradient_coefficients[i]
+                    * model_prev_list[-(i + 1)]
+                )
+            else:
+                gradient_part += -(1 + tau**2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]
+
+        if self.predict_x0:
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * last_noise
+        else:
+            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * last_noise
+
+        if self.predict_x0:
+            x_t = torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
+        else:
+            x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part
+
+        x_t = x_t.to(x.dtype)
+        return x_t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        index_candidates = (schedule_timesteps == timestep).nonzero()
+
+        if len(index_candidates) == 0:
+            step_index = len(self.timesteps) - 1
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        elif len(index_candidates) > 1:
+            step_index = index_candidates[1].item()
+        else:
+            step_index = index_candidates[0].item()
+
+        return step_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        """
+        Initialize the step_index counter for the scheduler.
+        """
+
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        generator=None,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the SA-Solver.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        use_corrector = self.step_index > 0 and self.last_sample is not None
+
+        model_output_convert = self.convert_model_output(model_output, sample=sample)
+
+        if use_corrector:
+            current_tau = self.tau_func(self.timestep_list[-1])
+            sample = self.stochastic_adams_moulton_update(
+                this_model_output=model_output_convert,
+                last_sample=self.last_sample,
+                last_noise=self.last_noise,
+                this_sample=sample,
+                order=self.this_corrector_order,
+                tau=current_tau,
+            )
+
+        for i in range(max(self.config.predictor_order, self.config.corrector_order - 1) - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+            self.timestep_list[i] = self.timestep_list[i + 1]
+
+        self.model_outputs[-1] = model_output_convert
+        self.timestep_list[-1] = timestep
+
+        noise = randn_tensor(
+            model_output.shape,
+            generator=generator,
+            device=model_output.device,
+            dtype=model_output.dtype,
+        )
+
+        if self.config.lower_order_final:
+            this_predictor_order = min(self.config.predictor_order, len(self.timesteps) - self.step_index)
+            this_corrector_order = min(self.config.corrector_order, len(self.timesteps) - self.step_index + 1)
+        else:
+            this_predictor_order = self.config.predictor_order
+            this_corrector_order = self.config.corrector_order
+
+        self.this_predictor_order = min(this_predictor_order, self.lower_order_nums + 1)  # warmup for multistep
+        self.this_corrector_order = min(this_corrector_order, self.lower_order_nums + 2)  # warmup for multistep
+        assert self.this_predictor_order > 0
+        assert self.this_corrector_order > 0
+
+        self.last_sample = sample
+        self.last_noise = noise
+
+        current_tau = self.tau_func(self.timestep_list[-1])
+        prev_sample = self.stochastic_adams_bashforth_update(
+            model_output=model_output_convert,
+            sample=sample,
+            noise=noise,
+            order=self.this_predictor_order,
+            tau=current_tau,
+        )
+
+        if self.lower_order_nums < max(self.config.predictor_order, self.config.corrector_order - 1):
+            self.lower_order_nums += 1
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_sde_ve.py b/diffusers/src/diffusers/schedulers/scheduling_sde_ve.py
new file mode 100644
index 0000000000000000000000000000000000000000..cedfbf7d6ad5c48075eeb18b67aa3ce36edc54eb
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_sde_ve.py
@@ -0,0 +1,301 @@
+# Copyright 2024 Google Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin, SchedulerOutput
+
+
+@dataclass
+class SdeVeOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        prev_sample_mean (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Mean averaged `prev_sample` over previous timesteps.
+    """
+
+    prev_sample: torch.Tensor
+    prev_sample_mean: torch.Tensor
+
+
+class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `ScoreSdeVeScheduler` is a variance exploding stochastic differential equation (SDE) scheduler.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        snr (`float`, defaults to 0.15):
+            A coefficient weighting the step from the `model_output` sample (from the network) to the random noise.
+        sigma_min (`float`, defaults to 0.01):
+            The initial noise scale for the sigma sequence in the sampling procedure. The minimum sigma should mirror
+            the distribution of the data.
+        sigma_max (`float`, defaults to 1348.0):
+            The maximum value used for the range of continuous timesteps passed into the model.
+        sampling_eps (`float`, defaults to 1e-5):
+            The end value of sampling where timesteps decrease progressively from 1 to epsilon.
+        correct_steps (`int`, defaults to 1):
+            The number of correction steps performed on a produced sample.
+    """
+
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 2000,
+        snr: float = 0.15,
+        sigma_min: float = 0.01,
+        sigma_max: float = 1348.0,
+        sampling_eps: float = 1e-5,
+        correct_steps: int = 1,
+    ):
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = sigma_max
+
+        # setable values
+        self.timesteps = None
+
+        self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps)
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def set_timesteps(
+        self, num_inference_steps: int, sampling_eps: float = None, device: Union[str, torch.device] = None
+    ):
+        """
+        Sets the continuous timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            sampling_eps (`float`, *optional*):
+                The final timestep value (overrides value given during scheduler instantiation).
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+
+        """
+        sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps
+
+        self.timesteps = torch.linspace(1, sampling_eps, num_inference_steps, device=device)
+
+    def set_sigmas(
+        self, num_inference_steps: int, sigma_min: float = None, sigma_max: float = None, sampling_eps: float = None
+    ):
+        """
+        Sets the noise scales used for the diffusion chain (to be run before inference). The sigmas control the weight
+        of the `drift` and `diffusion` components of the sample update.
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            sigma_min (`float`, optional):
+                The initial noise scale value (overrides value given during scheduler instantiation).
+            sigma_max (`float`, optional):
+                The final noise scale value (overrides value given during scheduler instantiation).
+            sampling_eps (`float`, optional):
+                The final timestep value (overrides value given during scheduler instantiation).
+
+        """
+        sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min
+        sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max
+        sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps
+        if self.timesteps is None:
+            self.set_timesteps(num_inference_steps, sampling_eps)
+
+        self.sigmas = sigma_min * (sigma_max / sigma_min) ** (self.timesteps / sampling_eps)
+        self.discrete_sigmas = torch.exp(torch.linspace(math.log(sigma_min), math.log(sigma_max), num_inference_steps))
+        self.sigmas = torch.tensor([sigma_min * (sigma_max / sigma_min) ** t for t in self.timesteps])
+
+    def get_adjacent_sigma(self, timesteps, t):
+        return torch.where(
+            timesteps == 0,
+            torch.zeros_like(t.to(timesteps.device)),
+            self.discrete_sigmas[timesteps - 1].to(timesteps.device),
+        )
+
+    def step_pred(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[SdeVeOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_sde_ve.SdeVeOutput`] is returned, otherwise a tuple
+                is returned where the first element is the sample tensor.
+
+        """
+        if self.timesteps is None:
+            raise ValueError(
+                "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        timestep = timestep * torch.ones(
+            sample.shape[0], device=sample.device
+        )  # torch.repeat_interleave(timestep, sample.shape[0])
+        timesteps = (timestep * (len(self.timesteps) - 1)).long()
+
+        # mps requires indices to be in the same device, so we use cpu as is the default with cuda
+        timesteps = timesteps.to(self.discrete_sigmas.device)
+
+        sigma = self.discrete_sigmas[timesteps].to(sample.device)
+        adjacent_sigma = self.get_adjacent_sigma(timesteps, timestep).to(sample.device)
+        drift = torch.zeros_like(sample)
+        diffusion = (sigma**2 - adjacent_sigma**2) ** 0.5
+
+        # equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x)
+        # also equation 47 shows the analog from SDE models to ancestral sampling methods
+        diffusion = diffusion.flatten()
+        while len(diffusion.shape) < len(sample.shape):
+            diffusion = diffusion.unsqueeze(-1)
+        drift = drift - diffusion**2 * model_output
+
+        #  equation 6: sample noise for the diffusion term of
+        noise = randn_tensor(
+            sample.shape, layout=sample.layout, generator=generator, device=sample.device, dtype=sample.dtype
+        )
+        prev_sample_mean = sample - drift  # subtract because `dt` is a small negative timestep
+        # TODO is the variable diffusion the correct scaling term for the noise?
+        prev_sample = prev_sample_mean + diffusion * noise  # add impact of diffusion field g
+
+        if not return_dict:
+            return (prev_sample, prev_sample_mean)
+
+        return SdeVeOutput(prev_sample=prev_sample, prev_sample_mean=prev_sample_mean)
+
+    def step_correct(
+        self,
+        model_output: torch.Tensor,
+        sample: torch.Tensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Correct the predicted sample based on the `model_output` of the network. This is often run repeatedly after
+        making the prediction for the previous timestep.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_sde_ve.SdeVeOutput`] is returned, otherwise a tuple
+                is returned where the first element is the sample tensor.
+
+        """
+        if self.timesteps is None:
+            raise ValueError(
+                "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # For small batch sizes, the paper "suggest replacing norm(z) with sqrt(d), where d is the dim. of z"
+        # sample noise for correction
+        noise = randn_tensor(sample.shape, layout=sample.layout, generator=generator).to(sample.device)
+
+        # compute step size from the model_output, the noise, and the snr
+        grad_norm = torch.norm(model_output.reshape(model_output.shape[0], -1), dim=-1).mean()
+        noise_norm = torch.norm(noise.reshape(noise.shape[0], -1), dim=-1).mean()
+        step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2
+        step_size = step_size * torch.ones(sample.shape[0]).to(sample.device)
+        # self.repeat_scalar(step_size, sample.shape[0])
+
+        # compute corrected sample: model_output term and noise term
+        step_size = step_size.flatten()
+        while len(step_size.shape) < len(sample.shape):
+            step_size = step_size.unsqueeze(-1)
+        prev_sample_mean = sample + step_size * model_output
+        prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        timesteps = timesteps.to(original_samples.device)
+        sigmas = self.discrete_sigmas.to(original_samples.device)[timesteps]
+        noise = (
+            noise * sigmas[:, None, None, None]
+            if noise is not None
+            else torch.randn_like(original_samples) * sigmas[:, None, None, None]
+        )
+        noisy_samples = noise + original_samples
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_sde_ve_flax.py b/diffusers/src/diffusers/schedulers/scheduling_sde_ve_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a8d45d4acbcfb6dd922aa86058aee8bb30893b5
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_sde_ve_flax.py
@@ -0,0 +1,280 @@
+# Copyright 2024 Google Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax
+import jax.numpy as jnp
+from jax import random
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+
+
+@flax.struct.dataclass
+class ScoreSdeVeSchedulerState:
+    # setable values
+    timesteps: Optional[jnp.ndarray] = None
+    discrete_sigmas: Optional[jnp.ndarray] = None
+    sigmas: Optional[jnp.ndarray] = None
+
+    @classmethod
+    def create(cls):
+        return cls()
+
+
+@dataclass
+class FlaxSdeVeOutput(FlaxSchedulerOutput):
+    """
+    Output class for the ScoreSdeVeScheduler's step function output.
+
+    Args:
+        state (`ScoreSdeVeSchedulerState`):
+        prev_sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        prev_sample_mean (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images):
+            Mean averaged `prev_sample`. Same as `prev_sample`, only mean-averaged over previous timesteps.
+    """
+
+    state: ScoreSdeVeSchedulerState
+    prev_sample: jnp.ndarray
+    prev_sample_mean: Optional[jnp.ndarray] = None
+
+
+class FlaxScoreSdeVeScheduler(FlaxSchedulerMixin, ConfigMixin):
+    """
+    The variance exploding stochastic differential equation (SDE) scheduler.
+
+    For more information, see the original paper: https://arxiv.org/abs/2011.13456
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        snr (`float`):
+            coefficient weighting the step from the model_output sample (from the network) to the random noise.
+        sigma_min (`float`):
+                initial noise scale for sigma sequence in sampling procedure. The minimum sigma should mirror the
+                distribution of the data.
+        sigma_max (`float`): maximum value used for the range of continuous timesteps passed into the model.
+        sampling_eps (`float`): the end value of sampling, where timesteps decrease progressively from 1 to
+        epsilon.
+        correct_steps (`int`): number of correction steps performed on a produced sample.
+    """
+
+    @property
+    def has_state(self):
+        return True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 2000,
+        snr: float = 0.15,
+        sigma_min: float = 0.01,
+        sigma_max: float = 1348.0,
+        sampling_eps: float = 1e-5,
+        correct_steps: int = 1,
+    ):
+        pass
+
+    def create_state(self):
+        state = ScoreSdeVeSchedulerState.create()
+        return self.set_sigmas(
+            state,
+            self.config.num_train_timesteps,
+            self.config.sigma_min,
+            self.config.sigma_max,
+            self.config.sampling_eps,
+        )
+
+    def set_timesteps(
+        self, state: ScoreSdeVeSchedulerState, num_inference_steps: int, shape: Tuple = (), sampling_eps: float = None
+    ) -> ScoreSdeVeSchedulerState:
+        """
+        Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+            sampling_eps (`float`, optional):
+                final timestep value (overrides value given at Scheduler instantiation).
+
+        """
+        sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps
+
+        timesteps = jnp.linspace(1, sampling_eps, num_inference_steps)
+        return state.replace(timesteps=timesteps)
+
+    def set_sigmas(
+        self,
+        state: ScoreSdeVeSchedulerState,
+        num_inference_steps: int,
+        sigma_min: float = None,
+        sigma_max: float = None,
+        sampling_eps: float = None,
+    ) -> ScoreSdeVeSchedulerState:
+        """
+        Sets the noise scales used for the diffusion chain. Supporting function to be run before inference.
+
+        The sigmas control the weight of the `drift` and `diffusion` components of sample update.
+
+        Args:
+            state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+            sigma_min (`float`, optional):
+                initial noise scale value (overrides value given at Scheduler instantiation).
+            sigma_max (`float`, optional):
+                final noise scale value (overrides value given at Scheduler instantiation).
+            sampling_eps (`float`, optional):
+                final timestep value (overrides value given at Scheduler instantiation).
+        """
+        sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min
+        sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max
+        sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps
+        if state.timesteps is None:
+            state = self.set_timesteps(state, num_inference_steps, sampling_eps)
+
+        discrete_sigmas = jnp.exp(jnp.linspace(jnp.log(sigma_min), jnp.log(sigma_max), num_inference_steps))
+        sigmas = jnp.array([sigma_min * (sigma_max / sigma_min) ** t for t in state.timesteps])
+
+        return state.replace(discrete_sigmas=discrete_sigmas, sigmas=sigmas)
+
+    def get_adjacent_sigma(self, state, timesteps, t):
+        return jnp.where(timesteps == 0, jnp.zeros_like(t), state.discrete_sigmas[timesteps - 1])
+
+    def step_pred(
+        self,
+        state: ScoreSdeVeSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+        key: jax.Array,
+        return_dict: bool = True,
+    ) -> Union[FlaxSdeVeOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            generator: random number generator.
+            return_dict (`bool`): option for returning tuple rather than FlaxSdeVeOutput class
+
+        Returns:
+            [`FlaxSdeVeOutput`] or `tuple`: [`FlaxSdeVeOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+
+        """
+        if state.timesteps is None:
+            raise ValueError(
+                "`state.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        timestep = timestep * jnp.ones(
+            sample.shape[0],
+        )
+        timesteps = (timestep * (len(state.timesteps) - 1)).long()
+
+        sigma = state.discrete_sigmas[timesteps]
+        adjacent_sigma = self.get_adjacent_sigma(state, timesteps, timestep)
+        drift = jnp.zeros_like(sample)
+        diffusion = (sigma**2 - adjacent_sigma**2) ** 0.5
+
+        # equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x)
+        # also equation 47 shows the analog from SDE models to ancestral sampling methods
+        diffusion = diffusion.flatten()
+        diffusion = broadcast_to_shape_from_left(diffusion, sample.shape)
+        drift = drift - diffusion**2 * model_output
+
+        #  equation 6: sample noise for the diffusion term of
+        key = random.split(key, num=1)
+        noise = random.normal(key=key, shape=sample.shape)
+        prev_sample_mean = sample - drift  # subtract because `dt` is a small negative timestep
+        # TODO is the variable diffusion the correct scaling term for the noise?
+        prev_sample = prev_sample_mean + diffusion * noise  # add impact of diffusion field g
+
+        if not return_dict:
+            return (prev_sample, prev_sample_mean, state)
+
+        return FlaxSdeVeOutput(prev_sample=prev_sample, prev_sample_mean=prev_sample_mean, state=state)
+
+    def step_correct(
+        self,
+        state: ScoreSdeVeSchedulerState,
+        model_output: jnp.ndarray,
+        sample: jnp.ndarray,
+        key: jax.Array,
+        return_dict: bool = True,
+    ) -> Union[FlaxSdeVeOutput, Tuple]:
+        """
+        Correct the predicted sample based on the output model_output of the network. This is often run repeatedly
+        after making the prediction for the previous timestep.
+
+        Args:
+            state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            generator: random number generator.
+            return_dict (`bool`): option for returning tuple rather than FlaxSdeVeOutput class
+
+        Returns:
+            [`FlaxSdeVeOutput`] or `tuple`: [`FlaxSdeVeOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+
+        """
+        if state.timesteps is None:
+            raise ValueError(
+                "`state.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # For small batch sizes, the paper "suggest replacing norm(z) with sqrt(d), where d is the dim. of z"
+        # sample noise for correction
+        key = random.split(key, num=1)
+        noise = random.normal(key=key, shape=sample.shape)
+
+        # compute step size from the model_output, the noise, and the snr
+        grad_norm = jnp.linalg.norm(model_output)
+        noise_norm = jnp.linalg.norm(noise)
+        step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2
+        step_size = step_size * jnp.ones(sample.shape[0])
+
+        # compute corrected sample: model_output term and noise term
+        step_size = step_size.flatten()
+        step_size = broadcast_to_shape_from_left(step_size, sample.shape)
+        prev_sample_mean = sample + step_size * model_output
+        prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise
+
+        if not return_dict:
+            return (prev_sample, state)
+
+        return FlaxSdeVeOutput(prev_sample=prev_sample, state=state)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_tcd.py b/diffusers/src/diffusers/schedulers/scheduling_tcd.py
new file mode 100644
index 0000000000000000000000000000000000000000..580224404c540abd31cb0300bb4848485b3a87c9
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_tcd.py
@@ -0,0 +1,695 @@
+# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..schedulers.scheduling_utils import SchedulerMixin
+from ..utils import BaseOutput, logging
+from ..utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class TCDSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_noised_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted noised sample `(x_{s})` based on the model output from the current timestep.
+    """
+
+    prev_sample: torch.Tensor
+    pred_noised_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class TCDScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `TCDScheduler` incorporates the `Strategic Stochastic Sampling` introduced by the paper `Trajectory Consistency
+    Distillation`, extending the original Multistep Consistency Sampling to enable unrestricted trajectory traversal.
+
+    This code is based on the official repo of TCD(https://github.com/jabir-zheng/TCD).
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. [`~ConfigMixin`] takes care of storing all config
+    attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can be
+    accessed via `scheduler.config.num_train_timesteps`. [`SchedulerMixin`] provides general loading and saving
+    functionality via the [`SchedulerMixin.save_pretrained`] and [`~SchedulerMixin.from_pretrained`] functions.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        original_inference_steps (`int`, *optional*, defaults to 50):
+            The default number of inference steps used to generate a linearly-spaced timestep schedule, from which we
+            will ultimately take `num_inference_steps` evenly spaced timesteps to form the final timestep schedule.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        timestep_scaling (`float`, defaults to 10.0):
+            The factor the timesteps will be multiplied by when calculating the consistency model boundary conditions
+            `c_skip` and `c_out`. Increasing this will decrease the approximation error (although the approximation
+            error at the default of `10.0` is already pretty small).
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.00085,
+        beta_end: float = 0.012,
+        beta_schedule: str = "scaled_linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        original_inference_steps: int = 50,
+        clip_sample: bool = False,
+        clip_sample_range: float = 1.0,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        timestep_scaling: float = 10.0,
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+        self.custom_timesteps = False
+
+        self._step_index = None
+        self._begin_index = None
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    @property
+    def step_index(self):
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler._get_variance
+    def _get_variance(self, timestep, prev_timestep):
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Union[str, torch.device] = None,
+        original_inference_steps: Optional[int] = None,
+        timesteps: Optional[List[int]] = None,
+        strength: float = 1.0,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`, *optional*):
+                The number of diffusion steps used when generating samples with a pre-trained model. If used,
+                `timesteps` must be `None`.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            original_inference_steps (`int`, *optional*):
+                The original number of inference steps, which will be used to generate a linearly-spaced timestep
+                schedule (which is different from the standard `diffusers` implementation). We will then take
+                `num_inference_steps` timesteps from this schedule, evenly spaced in terms of indices, and use that as
+                our final timestep schedule. If not set, this will default to the `original_inference_steps` attribute.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of equal spacing between timesteps on the training/distillation timestep
+                schedule is used. If `timesteps` is passed, `num_inference_steps` must be `None`.
+            strength (`float`, *optional*, defaults to 1.0):
+                Used to determine the number of timesteps used for inference when using img2img, inpaint, etc.
+        """
+        # 0. Check inputs
+        if num_inference_steps is None and timesteps is None:
+            raise ValueError("Must pass exactly one of `num_inference_steps` or `custom_timesteps`.")
+
+        if num_inference_steps is not None and timesteps is not None:
+            raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
+
+        # 1. Calculate the TCD original training/distillation timestep schedule.
+        original_steps = (
+            original_inference_steps if original_inference_steps is not None else self.config.original_inference_steps
+        )
+
+        if original_inference_steps is None:
+            # default option, timesteps align with discrete inference steps
+            if original_steps > self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`original_steps`: {original_steps} cannot be larger than `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                    f" maximal {self.config.num_train_timesteps} timesteps."
+                )
+            # TCD Timesteps Setting
+            # The skipping step parameter k from the paper.
+            k = self.config.num_train_timesteps // original_steps
+            # TCD Training/Distillation Steps Schedule
+            tcd_origin_timesteps = np.asarray(list(range(1, int(original_steps * strength) + 1))) * k - 1
+        else:
+            # customised option, sampled timesteps can be any arbitrary value
+            tcd_origin_timesteps = np.asarray(list(range(0, int(self.config.num_train_timesteps * strength))))
+
+        # 2. Calculate the TCD inference timestep schedule.
+        if timesteps is not None:
+            # 2.1 Handle custom timestep schedules.
+            train_timesteps = set(tcd_origin_timesteps)
+            non_train_timesteps = []
+            for i in range(1, len(timesteps)):
+                if timesteps[i] >= timesteps[i - 1]:
+                    raise ValueError("`custom_timesteps` must be in descending order.")
+
+                if timesteps[i] not in train_timesteps:
+                    non_train_timesteps.append(timesteps[i])
+
+            if timesteps[0] >= self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`timesteps` must start before `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps}."
+                )
+
+            # Raise warning if timestep schedule does not start with self.config.num_train_timesteps - 1
+            if strength == 1.0 and timesteps[0] != self.config.num_train_timesteps - 1:
+                logger.warning(
+                    f"The first timestep on the custom timestep schedule is {timesteps[0]}, not"
+                    f" `self.config.num_train_timesteps - 1`: {self.config.num_train_timesteps - 1}. You may get"
+                    f" unexpected results when using this timestep schedule."
+                )
+
+            # Raise warning if custom timestep schedule contains timesteps not on original timestep schedule
+            if non_train_timesteps:
+                logger.warning(
+                    f"The custom timestep schedule contains the following timesteps which are not on the original"
+                    f" training/distillation timestep schedule: {non_train_timesteps}. You may get unexpected results"
+                    f" when using this timestep schedule."
+                )
+
+            # Raise warning if custom timestep schedule is longer than original_steps
+            if original_steps is not None:
+                if len(timesteps) > original_steps:
+                    logger.warning(
+                        f"The number of timesteps in the custom timestep schedule is {len(timesteps)}, which exceeds the"
+                        f" the length of the timestep schedule used for training: {original_steps}. You may get some"
+                        f" unexpected results when using this timestep schedule."
+                    )
+            else:
+                if len(timesteps) > self.config.num_train_timesteps:
+                    logger.warning(
+                        f"The number of timesteps in the custom timestep schedule is {len(timesteps)}, which exceeds the"
+                        f" the length of the timestep schedule used for training: {self.config.num_train_timesteps}. You may get some"
+                        f" unexpected results when using this timestep schedule."
+                    )
+
+            timesteps = np.array(timesteps, dtype=np.int64)
+            self.num_inference_steps = len(timesteps)
+            self.custom_timesteps = True
+
+            # Apply strength (e.g. for img2img pipelines) (see StableDiffusionImg2ImgPipeline.get_timesteps)
+            init_timestep = min(int(self.num_inference_steps * strength), self.num_inference_steps)
+            t_start = max(self.num_inference_steps - init_timestep, 0)
+            timesteps = timesteps[t_start * self.order :]
+            # TODO: also reset self.num_inference_steps?
+        else:
+            # 2.2 Create the "standard" TCD inference timestep schedule.
+            if num_inference_steps > self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                    f" maximal {self.config.num_train_timesteps} timesteps."
+                )
+
+            if original_steps is not None:
+                skipping_step = len(tcd_origin_timesteps) // num_inference_steps
+
+                if skipping_step < 1:
+                    raise ValueError(
+                        f"The combination of `original_steps x strength`: {original_steps} x {strength} is smaller than `num_inference_steps`: {num_inference_steps}. Make sure to either reduce `num_inference_steps` to a value smaller than {int(original_steps * strength)} or increase `strength` to a value higher than {float(num_inference_steps / original_steps)}."
+                    )
+
+            self.num_inference_steps = num_inference_steps
+
+            if original_steps is not None:
+                if num_inference_steps > original_steps:
+                    raise ValueError(
+                        f"`num_inference_steps`: {num_inference_steps} cannot be larger than `original_inference_steps`:"
+                        f" {original_steps} because the final timestep schedule will be a subset of the"
+                        f" `original_inference_steps`-sized initial timestep schedule."
+                    )
+            else:
+                if num_inference_steps > self.config.num_train_timesteps:
+                    raise ValueError(
+                        f"`num_inference_steps`: {num_inference_steps} cannot be larger than `num_train_timesteps`:"
+                        f" {self.config.num_train_timesteps} because the final timestep schedule will be a subset of the"
+                        f" `num_train_timesteps`-sized initial timestep schedule."
+                    )
+
+            # TCD Inference Steps Schedule
+            tcd_origin_timesteps = tcd_origin_timesteps[::-1].copy()
+            # Select (approximately) evenly spaced indices from tcd_origin_timesteps.
+            inference_indices = np.linspace(0, len(tcd_origin_timesteps), num=num_inference_steps, endpoint=False)
+            inference_indices = np.floor(inference_indices).astype(np.int64)
+            timesteps = tcd_origin_timesteps[inference_indices]
+
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.long)
+
+        self._step_index = None
+        self._begin_index = None
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        eta: float = 0.3,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[TCDSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every
+                step. When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic
+                sampling.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_tcd.TCDSchedulerOutput`] or `tuple`.
+        Returns:
+            [`~schedulers.scheduling_utils.TCDSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_tcd.TCDSchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        assert 0 <= eta <= 1.0, "gamma must be less than or equal to 1.0"
+
+        # 1. get previous step value
+        prev_step_index = self.step_index + 1
+        if prev_step_index < len(self.timesteps):
+            prev_timestep = self.timesteps[prev_step_index]
+        else:
+            prev_timestep = torch.tensor(0)
+
+        timestep_s = torch.floor((1 - eta) * prev_timestep).to(dtype=torch.long)
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        beta_prod_t = 1 - alpha_prod_t
+
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+        alpha_prod_s = self.alphas_cumprod[timestep_s]
+        beta_prod_s = 1 - alpha_prod_s
+
+        # 3. Compute the predicted noised sample x_s based on the model parameterization
+        if self.config.prediction_type == "epsilon":  # noise-prediction
+            pred_original_sample = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
+            pred_epsilon = model_output
+            pred_noised_sample = alpha_prod_s.sqrt() * pred_original_sample + beta_prod_s.sqrt() * pred_epsilon
+        elif self.config.prediction_type == "sample":  # x-prediction
+            pred_original_sample = model_output
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+            pred_noised_sample = alpha_prod_s.sqrt() * pred_original_sample + beta_prod_s.sqrt() * pred_epsilon
+        elif self.config.prediction_type == "v_prediction":  # v-prediction
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+            pred_noised_sample = alpha_prod_s.sqrt() * pred_original_sample + beta_prod_s.sqrt() * pred_epsilon
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
+                " `v_prediction` for `TCDScheduler`."
+            )
+
+        # 4. Sample and inject noise z ~ N(0, I) for MultiStep Inference
+        # Noise is not used on the final timestep of the timestep schedule.
+        # This also means that noise is not used for one-step sampling.
+        # Eta (referred to as "gamma" in the paper) was introduced to control the stochasticity in every step.
+        # When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic sampling.
+        if eta > 0:
+            if self.step_index != self.num_inference_steps - 1:
+                noise = randn_tensor(
+                    model_output.shape, generator=generator, device=model_output.device, dtype=pred_noised_sample.dtype
+                )
+                prev_sample = (alpha_prod_t_prev / alpha_prod_s).sqrt() * pred_noised_sample + (
+                    1 - alpha_prod_t_prev / alpha_prod_s
+                ).sqrt() * noise
+            else:
+                prev_sample = pred_noised_sample
+        else:
+            prev_sample = pred_noised_sample
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample, pred_noised_sample)
+
+        return TCDSchedulerOutput(prev_sample=prev_sample, pred_noised_sample=pred_noised_sample)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+
+    def __len__(self):
+        return self.config.num_train_timesteps
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
+    def previous_timestep(self, timestep):
+        if self.custom_timesteps:
+            index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
+            if index == self.timesteps.shape[0] - 1:
+                prev_t = torch.tensor(-1)
+            else:
+                prev_t = self.timesteps[index + 1]
+        else:
+            num_inference_steps = (
+                self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps
+            )
+            prev_t = timestep - self.config.num_train_timesteps // num_inference_steps
+
+        return prev_t
diff --git a/diffusers/src/diffusers/schedulers/scheduling_unclip.py b/diffusers/src/diffusers/schedulers/scheduling_unclip.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e1580290f2274a36e89dd119bd4fd991cccf71f
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_unclip.py
@@ -0,0 +1,352 @@
+# Copyright 2024 Kakao Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->UnCLIP
+class UnCLIPSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class UnCLIPScheduler(SchedulerMixin, ConfigMixin):
+    """
+    NOTE: do not use this scheduler. The DDPM scheduler has been updated to support the changes made here. This
+    scheduler will be removed and replaced with DDPM.
+
+    This is a modified DDPM Scheduler specifically for the karlo unCLIP model.
+
+    This scheduler has some minor variations in how it calculates the learned range variance and dynamically
+    re-calculates betas based off the timesteps it is skipping.
+
+    The scheduler also uses a slightly different step ratio when computing timesteps to use for inference.
+
+    See [`~DDPMScheduler`] for more information on DDPM scheduling
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        variance_type (`str`):
+            options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small_log`
+            or `learned_range`.
+        clip_sample (`bool`, default `True`):
+            option to clip predicted sample between `-clip_sample_range` and `clip_sample_range` for numerical
+            stability.
+        clip_sample_range (`float`, default `1.0`):
+            The range to clip the sample between. See `clip_sample`.
+        prediction_type (`str`, default `epsilon`, optional):
+            prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process)
+            or `sample` (directly predicting the noisy sample`)
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        variance_type: str = "fixed_small_log",
+        clip_sample: bool = True,
+        clip_sample_range: Optional[float] = 1.0,
+        prediction_type: str = "epsilon",
+        beta_schedule: str = "squaredcos_cap_v2",
+    ):
+        if beta_schedule != "squaredcos_cap_v2":
+            raise ValueError("UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'")
+
+        self.betas = betas_for_alpha_bar(num_train_timesteps)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        self.one = torch.tensor(1.0)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+        self.variance_type = variance_type
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `torch.Tensor`: scaled input sample
+        """
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Note that this scheduler uses a slightly different step ratio than the other diffusers schedulers. The
+        different step ratio is to mimic the original karlo implementation and does not affect the quality or accuracy
+        of the results.
+
+        Args:
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+        """
+        self.num_inference_steps = num_inference_steps
+        step_ratio = (self.config.num_train_timesteps - 1) / (self.num_inference_steps - 1)
+        timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None):
+        if prev_timestep is None:
+            prev_timestep = t - 1
+
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        if prev_timestep == t - 1:
+            beta = self.betas[t]
+        else:
+            beta = 1 - alpha_prod_t / alpha_prod_t_prev
+
+        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # and sample from it to get previous sample
+        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+        variance = beta_prod_t_prev / beta_prod_t * beta
+
+        if variance_type is None:
+            variance_type = self.config.variance_type
+
+        # hacks - were probably added for training stability
+        if variance_type == "fixed_small_log":
+            variance = torch.log(torch.clamp(variance, min=1e-20))
+            variance = torch.exp(0.5 * variance)
+        elif variance_type == "learned_range":
+            # NOTE difference with DDPM scheduler
+            min_log = variance.log()
+            max_log = beta.log()
+
+            frac = (predicted_variance + 1) / 2
+            variance = frac * max_log + (1 - frac) * min_log
+
+        return variance
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        prev_timestep: Optional[int] = None,
+        generator=None,
+        return_dict: bool = True,
+    ) -> Union[UnCLIPSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                current instance of sample being created by diffusion process.
+            prev_timestep (`int`, *optional*): The previous timestep to predict the previous sample at.
+                Used to dynamically compute beta. If not given, `t-1` is used and the pre-computed beta is used.
+            generator: random number generator.
+            return_dict (`bool`): option for returning tuple rather than UnCLIPSchedulerOutput class
+
+        Returns:
+            [`~schedulers.scheduling_utils.UnCLIPSchedulerOutput`] or `tuple`:
+            [`~schedulers.scheduling_utils.UnCLIPSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+
+        """
+        t = timestep
+
+        if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type == "learned_range":
+            model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
+        else:
+            predicted_variance = None
+
+        # 1. compute alphas, betas
+        if prev_timestep is None:
+            prev_timestep = t - 1
+
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        if prev_timestep == t - 1:
+            beta = self.betas[t]
+            alpha = self.alphas[t]
+        else:
+            beta = 1 - alpha_prod_t / alpha_prod_t_prev
+            alpha = 1 - beta
+
+        # 2. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `sample`"
+                " for the UnCLIPScheduler."
+            )
+
+        # 3. Clip "predicted x_0"
+        if self.config.clip_sample:
+            pred_original_sample = torch.clamp(
+                pred_original_sample, -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * beta) / beta_prod_t
+        current_sample_coeff = alpha ** (0.5) * beta_prod_t_prev / beta_prod_t
+
+        # 5. Compute predicted previous sample µ_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
+
+        # 6. Add noise
+        variance = 0
+        if t > 0:
+            variance_noise = randn_tensor(
+                model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device
+            )
+
+            variance = self._get_variance(
+                t,
+                predicted_variance=predicted_variance,
+                prev_timestep=prev_timestep,
+            )
+
+            if self.variance_type == "fixed_small_log":
+                variance = variance
+            elif self.variance_type == "learned_range":
+                variance = (0.5 * variance).exp()
+            else:
+                raise ValueError(
+                    f"variance_type given as {self.variance_type} must be one of `fixed_small_log` or `learned_range`"
+                    " for the UnCLIPScheduler."
+                )
+
+            variance = variance * variance_noise
+
+        pred_prev_sample = pred_prev_sample + variance
+
+        if not return_dict:
+            return (pred_prev_sample,)
+
+        return UnCLIPSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
+        # for the subsequent add_noise calls
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
diff --git a/diffusers/src/diffusers/schedulers/scheduling_unipc_multistep.py b/diffusers/src/diffusers/schedulers/scheduling_unipc_multistep.py
new file mode 100644
index 0000000000000000000000000000000000000000..78cf0b6d16a74482d8580a510109d66c513347cd
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_unipc_multistep.py
@@ -0,0 +1,984 @@
+# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: check https://arxiv.org/abs/2302.04867 and https://github.com/wl-zhao/UniPC for more info
+# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import deprecate
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `UniPCMultistepScheduler` is a training-free framework designed for the fast sampling of diffusion models.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        solver_order (`int`, default `2`):
+            The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1`
+            due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for
+            unconditional sampling.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`.
+        predict_x0 (`bool`, defaults to `True`):
+            Whether to use the updating algorithm on the predicted x0.
+        solver_type (`str`, default `bh2`):
+            Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2`
+            otherwise.
+        lower_order_final (`bool`, default `True`):
+            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
+            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
+        disable_corrector (`list`, default `[]`):
+            Decides which step to disable the corrector to mitigate the misalignment between `epsilon_theta(x_t, c)`
+            and `epsilon_theta(x_t^c, c)` which can influence convergence for a large guidance scale. Corrector is
+            usually disabled during the first few steps.
+        solver_p (`SchedulerMixin`, default `None`):
+            Any other scheduler that if specified, the algorithm becomes `solver_p + UniC`.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        final_sigmas_type (`str`, defaults to `"zero"`):
+            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        solver_order: int = 2,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        predict_x0: bool = True,
+        solver_type: str = "bh2",
+        lower_order_final: bool = True,
+        disable_corrector: List[int] = [],
+        solver_p: SchedulerMixin = None,
+        use_karras_sigmas: Optional[bool] = False,
+        use_exponential_sigmas: Optional[bool] = False,
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if sum([self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+            raise ValueError("Only one of `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.")
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        if rescale_betas_zero_snr:
+            # Close to 0 without being 0 so first sigma is not inf
+            # FP16 smallest positive subnormal works well here
+            self.alphas_cumprod[-1] = 2**-24
+
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+        self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        if solver_type not in ["bh1", "bh2"]:
+            if solver_type in ["midpoint", "heun", "logrho"]:
+                self.register_to_config(solver_type="bh2")
+            else:
+                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
+
+        self.predict_x0 = predict_x0
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.model_outputs = [None] * solver_order
+        self.timestep_list = [None] * solver_order
+        self.lower_order_nums = 0
+        self.disable_corrector = disable_corrector
+        self.solver_p = solver_p
+        self.last_sample = None
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
+                .round()[::-1][:-1]
+                .copy()
+                .astype(np.int64)
+            )
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // (num_inference_steps + 1)
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.arange(self.config.num_train_timesteps, 0, -step_ratio).round().copy().astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        if self.config.use_karras_sigmas:
+            log_sigmas = np.log(sigmas)
+            sigmas = np.flip(sigmas).copy()
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+            if self.config.final_sigmas_type == "sigma_min":
+                sigma_last = sigmas[-1]
+            elif self.config.final_sigmas_type == "zero":
+                sigma_last = 0
+            else:
+                raise ValueError(
+                    f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+                )
+            sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
+        elif self.config.use_exponential_sigmas:
+            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+        else:
+            sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+            if self.config.final_sigmas_type == "sigma_min":
+                sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
+            elif self.config.final_sigmas_type == "zero":
+                sigma_last = 0
+            else:
+                raise ValueError(
+                    f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+                )
+            sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
+
+        self.sigmas = torch.from_numpy(sigmas)
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
+
+        self.num_inference_steps = len(timesteps)
+
+        self.model_outputs = [
+            None,
+        ] * self.config.solver_order
+        self.lower_order_nums = 0
+        self.last_sample = None
+        if self.solver_p:
+            self.solver_p.set_timesteps(self.num_inference_steps, device=device)
+
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
+        sigma_t = sigma * alpha_t
+
+        return alpha_t, sigma_t
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
+    def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """Constructs an exponential noise schedule."""
+
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
+
+        sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps).exp()
+        return sigmas
+
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Convert the model output to the corresponding type the UniPC algorithm needs.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The converted model output.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError("missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma = self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+
+        if self.predict_x0:
+            if self.config.prediction_type == "epsilon":
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            elif self.config.prediction_type == "sample":
+                x0_pred = model_output
+            elif self.config.prediction_type == "v_prediction":
+                x0_pred = alpha_t * sample - sigma_t * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the UniPCMultistepScheduler."
+                )
+
+            if self.config.thresholding:
+                x0_pred = self._threshold_sample(x0_pred)
+
+            return x0_pred
+        else:
+            if self.config.prediction_type == "epsilon":
+                return model_output
+            elif self.config.prediction_type == "sample":
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+                return epsilon
+            elif self.config.prediction_type == "v_prediction":
+                epsilon = alpha_t * model_output + sigma_t * sample
+                return epsilon
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the UniPCMultistepScheduler."
+                )
+
+    def multistep_uni_p_bh_update(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        order: int = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model at the current timestep.
+            prev_timestep (`int`):
+                The previous discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            order (`int`):
+                The order of UniP at this timestep (corresponds to the *p* in UniPC-p).
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if order is None:
+            if len(args) > 2:
+                order = args[2]
+            else:
+                raise ValueError(" missing `order` as a required keyward argument")
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+        model_output_list = self.model_outputs
+
+        s0 = self.timestep_list[-1]
+        m0 = model_output_list[-1]
+        x = sample
+
+        if self.solver_p:
+            x_t = self.solver_p.step(model_output, s0, x).prev_sample
+            return x_t
+
+        sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+
+        h = lambda_t - lambda_s0
+        device = sample.device
+
+        rks = []
+        D1s = []
+        for i in range(1, order):
+            si = self.step_index - i
+            mi = model_output_list[-(i + 1)]
+            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
+            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
+            rk = (lambda_si - lambda_s0) / h
+            rks.append(rk)
+            D1s.append((mi - m0) / rk)
+
+        rks.append(1.0)
+        rks = torch.tensor(rks, device=device)
+
+        R = []
+        b = []
+
+        hh = -h if self.predict_x0 else h
+        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
+        h_phi_k = h_phi_1 / hh - 1
+
+        factorial_i = 1
+
+        if self.config.solver_type == "bh1":
+            B_h = hh
+        elif self.config.solver_type == "bh2":
+            B_h = torch.expm1(hh)
+        else:
+            raise NotImplementedError()
+
+        for i in range(1, order + 1):
+            R.append(torch.pow(rks, i - 1))
+            b.append(h_phi_k * factorial_i / B_h)
+            factorial_i *= i + 1
+            h_phi_k = h_phi_k / hh - 1 / factorial_i
+
+        R = torch.stack(R)
+        b = torch.tensor(b, device=device)
+
+        if len(D1s) > 0:
+            D1s = torch.stack(D1s, dim=1)  # (B, K)
+            # for order 2, we use a simplified version
+            if order == 2:
+                rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device)
+            else:
+                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1]).to(device).to(x.dtype)
+        else:
+            D1s = None
+
+        if self.predict_x0:
+            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
+            if D1s is not None:
+                pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)
+            else:
+                pred_res = 0
+            x_t = x_t_ - alpha_t * B_h * pred_res
+        else:
+            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
+            if D1s is not None:
+                pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)
+            else:
+                pred_res = 0
+            x_t = x_t_ - sigma_t * B_h * pred_res
+
+        x_t = x_t.to(x.dtype)
+        return x_t
+
+    def multistep_uni_c_bh_update(
+        self,
+        this_model_output: torch.Tensor,
+        *args,
+        last_sample: torch.Tensor = None,
+        this_sample: torch.Tensor = None,
+        order: int = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the UniC (B(h) version).
+
+        Args:
+            this_model_output (`torch.Tensor`):
+                The model outputs at `x_t`.
+            this_timestep (`int`):
+                The current timestep `t`.
+            last_sample (`torch.Tensor`):
+                The generated sample before the last predictor `x_{t-1}`.
+            this_sample (`torch.Tensor`):
+                The generated sample after the last predictor `x_{t}`.
+            order (`int`):
+                The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`.
+
+        Returns:
+            `torch.Tensor`:
+                The corrected sample tensor at the current timestep.
+        """
+        this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None)
+        if last_sample is None:
+            if len(args) > 1:
+                last_sample = args[1]
+            else:
+                raise ValueError(" missing`last_sample` as a required keyward argument")
+        if this_sample is None:
+            if len(args) > 2:
+                this_sample = args[2]
+            else:
+                raise ValueError(" missing`this_sample` as a required keyward argument")
+        if order is None:
+            if len(args) > 3:
+                order = args[3]
+            else:
+                raise ValueError(" missing`order` as a required keyward argument")
+        if this_timestep is not None:
+            deprecate(
+                "this_timestep",
+                "1.0.0",
+                "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        model_output_list = self.model_outputs
+
+        m0 = model_output_list[-1]
+        x = last_sample
+        x_t = this_sample
+        model_t = this_model_output
+
+        sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[self.step_index - 1]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+
+        h = lambda_t - lambda_s0
+        device = this_sample.device
+
+        rks = []
+        D1s = []
+        for i in range(1, order):
+            si = self.step_index - (i + 1)
+            mi = model_output_list[-(i + 1)]
+            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
+            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
+            rk = (lambda_si - lambda_s0) / h
+            rks.append(rk)
+            D1s.append((mi - m0) / rk)
+
+        rks.append(1.0)
+        rks = torch.tensor(rks, device=device)
+
+        R = []
+        b = []
+
+        hh = -h if self.predict_x0 else h
+        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
+        h_phi_k = h_phi_1 / hh - 1
+
+        factorial_i = 1
+
+        if self.config.solver_type == "bh1":
+            B_h = hh
+        elif self.config.solver_type == "bh2":
+            B_h = torch.expm1(hh)
+        else:
+            raise NotImplementedError()
+
+        for i in range(1, order + 1):
+            R.append(torch.pow(rks, i - 1))
+            b.append(h_phi_k * factorial_i / B_h)
+            factorial_i *= i + 1
+            h_phi_k = h_phi_k / hh - 1 / factorial_i
+
+        R = torch.stack(R)
+        b = torch.tensor(b, device=device)
+
+        if len(D1s) > 0:
+            D1s = torch.stack(D1s, dim=1)
+        else:
+            D1s = None
+
+        # for order 1, we use a simplified version
+        if order == 1:
+            rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
+        else:
+            rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype)
+
+        if self.predict_x0:
+            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
+            if D1s is not None:
+                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
+            else:
+                corr_res = 0
+            D1_t = model_t - m0
+            x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
+        else:
+            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
+            if D1s is not None:
+                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
+            else:
+                corr_res = 0
+            D1_t = model_t - m0
+            x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
+        x_t = x_t.to(x.dtype)
+        return x_t
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        index_candidates = (schedule_timesteps == timestep).nonzero()
+
+        if len(index_candidates) == 0:
+            step_index = len(self.timesteps) - 1
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        elif len(index_candidates) > 1:
+            step_index = index_candidates[1].item()
+        else:
+            step_index = index_candidates[0].item()
+
+        return step_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        """
+        Initialize the step_index counter for the scheduler.
+        """
+
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the multistep UniPC.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        use_corrector = (
+            self.step_index > 0 and self.step_index - 1 not in self.disable_corrector and self.last_sample is not None
+        )
+
+        model_output_convert = self.convert_model_output(model_output, sample=sample)
+        if use_corrector:
+            sample = self.multistep_uni_c_bh_update(
+                this_model_output=model_output_convert,
+                last_sample=self.last_sample,
+                this_sample=sample,
+                order=self.this_order,
+            )
+
+        for i in range(self.config.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+            self.timestep_list[i] = self.timestep_list[i + 1]
+
+        self.model_outputs[-1] = model_output_convert
+        self.timestep_list[-1] = timestep
+
+        if self.config.lower_order_final:
+            this_order = min(self.config.solver_order, len(self.timesteps) - self.step_index)
+        else:
+            this_order = self.config.solver_order
+
+        self.this_order = min(this_order, self.lower_order_nums + 1)  # warmup for multistep
+        assert self.this_order > 0
+
+        self.last_sample = sample
+        prev_sample = self.multistep_uni_p_bh_update(
+            model_output=model_output,  # pass the original non-converted model output, in case solver-p is used
+            sample=sample,
+            order=self.this_order,
+        )
+
+        if self.lower_order_nums < self.config.solver_order:
+            self.lower_order_nums += 1
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        noisy_samples = alpha_t * original_samples + sigma_t * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/diffusers/src/diffusers/schedulers/scheduling_utils.py b/diffusers/src/diffusers/schedulers/scheduling_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f20224b19009eed6c7e34cd308f4978210f5c466
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_utils.py
@@ -0,0 +1,193 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+import os
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional, Union
+
+import torch
+from huggingface_hub.utils import validate_hf_hub_args
+
+from ..utils import BaseOutput, PushToHubMixin
+
+
+SCHEDULER_CONFIG_NAME = "scheduler_config.json"
+
+
+# NOTE: We make this type an enum because it simplifies usage in docs and prevents
+# circular imports when used for `_compatibles` within the schedulers module.
+# When it's used as a type in pipelines, it really is a Union because the actual
+# scheduler instance is passed in.
+class KarrasDiffusionSchedulers(Enum):
+    DDIMScheduler = 1
+    DDPMScheduler = 2
+    PNDMScheduler = 3
+    LMSDiscreteScheduler = 4
+    EulerDiscreteScheduler = 5
+    HeunDiscreteScheduler = 6
+    EulerAncestralDiscreteScheduler = 7
+    DPMSolverMultistepScheduler = 8
+    DPMSolverSinglestepScheduler = 9
+    KDPM2DiscreteScheduler = 10
+    KDPM2AncestralDiscreteScheduler = 11
+    DEISMultistepScheduler = 12
+    UniPCMultistepScheduler = 13
+    DPMSolverSDEScheduler = 14
+    EDMEulerScheduler = 15
+
+
+AysSchedules = {
+    "StableDiffusionTimesteps": [999, 850, 736, 645, 545, 455, 343, 233, 124, 24],
+    "StableDiffusionSigmas": [14.615, 6.475, 3.861, 2.697, 1.886, 1.396, 0.963, 0.652, 0.399, 0.152, 0.0],
+    "StableDiffusionXLTimesteps": [999, 845, 730, 587, 443, 310, 193, 116, 53, 13],
+    "StableDiffusionXLSigmas": [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0],
+    "StableDiffusionVideoSigmas": [700.00, 54.5, 15.886, 7.977, 4.248, 1.789, 0.981, 0.403, 0.173, 0.034, 0.0],
+}
+
+
+@dataclass
+class SchedulerOutput(BaseOutput):
+    """
+    Base class for the output of a scheduler's `step` function.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: torch.Tensor
+
+
+class SchedulerMixin(PushToHubMixin):
+    """
+    Base class for all schedulers.
+
+    [`SchedulerMixin`] contains common functions shared by all schedulers such as general loading and saving
+    functionalities.
+
+    [`ConfigMixin`] takes care of storing the configuration attributes (like `num_train_timesteps`) that are passed to
+    the scheduler's `__init__` function, and the attributes can be accessed by `scheduler.config.num_train_timesteps`.
+
+    Class attributes:
+        - **_compatibles** (`List[str]`) -- A list of scheduler classes that are compatible with the parent scheduler
+          class. Use [`~ConfigMixin.from_config`] to load a different compatible scheduler class (should be overridden
+          by parent class).
+    """
+
+    config_name = SCHEDULER_CONFIG_NAME
+    _compatibles = []
+    has_compatibles = True
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
+        subfolder: Optional[str] = None,
+        return_unused_kwargs=False,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a scheduler from a pre-defined JSON configuration file in a local directory or Hub repository.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the scheduler
+                      configuration saved with [`~SchedulerMixin.save_pretrained`].
+            subfolder (`str`, *optional*):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                Whether kwargs that are not consumed by the Python class should be returned or not.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only(`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+
+        <Tip>
+
+        To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
+        `huggingface-cli login`. You can also activate the special
+        ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
+        firewalled environment.
+
+        </Tip>
+
+        """
+        config, kwargs, commit_hash = cls.load_config(
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
+            subfolder=subfolder,
+            return_unused_kwargs=True,
+            return_commit_hash=True,
+            **kwargs,
+        )
+        return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs)
+
+    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
+        """
+        Save a scheduler configuration object to a directory so that it can be reloaded using the
+        [`~SchedulerMixin.from_pretrained`] class method.
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the configuration JSON file will be saved (will be created if it does not exist).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
+
+    @property
+    def compatibles(self):
+        """
+        Returns all schedulers that are compatible with this scheduler
+
+        Returns:
+            `List[SchedulerMixin]`: List of compatible schedulers
+        """
+        return self._get_compatibles()
+
+    @classmethod
+    def _get_compatibles(cls):
+        compatible_classes_str = list(set([cls.__name__] + cls._compatibles))
+        diffusers_library = importlib.import_module(__name__.split(".")[0])
+        compatible_classes = [
+            getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c)
+        ]
+        return compatible_classes
diff --git a/diffusers/src/diffusers/schedulers/scheduling_utils_flax.py b/diffusers/src/diffusers/schedulers/scheduling_utils_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae11baf9ea1b4d3d0e67cb77ea7dc01eca1511bf
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_utils_flax.py
@@ -0,0 +1,291 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+import math
+import os
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional, Tuple, Union
+
+import flax
+import jax.numpy as jnp
+from huggingface_hub.utils import validate_hf_hub_args
+
+from ..utils import BaseOutput, PushToHubMixin
+
+
+SCHEDULER_CONFIG_NAME = "scheduler_config.json"
+
+
+# NOTE: We make this type an enum because it simplifies usage in docs and prevents
+# circular imports when used for `_compatibles` within the schedulers module.
+# When it's used as a type in pipelines, it really is a Union because the actual
+# scheduler instance is passed in.
+class FlaxKarrasDiffusionSchedulers(Enum):
+    FlaxDDIMScheduler = 1
+    FlaxDDPMScheduler = 2
+    FlaxPNDMScheduler = 3
+    FlaxLMSDiscreteScheduler = 4
+    FlaxDPMSolverMultistepScheduler = 5
+    FlaxEulerDiscreteScheduler = 6
+
+
+@dataclass
+class FlaxSchedulerOutput(BaseOutput):
+    """
+    Base class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: jnp.ndarray
+
+
+class FlaxSchedulerMixin(PushToHubMixin):
+    """
+    Mixin containing common functions for the schedulers.
+
+    Class attributes:
+        - **_compatibles** (`List[str]`) -- A list of classes that are compatible with the parent class, so that
+          `from_config` can be used from a class different than the one used to save the config (should be overridden
+          by parent class).
+    """
+
+    config_name = SCHEDULER_CONFIG_NAME
+    ignore_for_config = ["dtype"]
+    _compatibles = []
+    has_compatibles = True
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
+        subfolder: Optional[str] = None,
+        return_unused_kwargs=False,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a Scheduler class from a pre-defined JSON-file.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
+                      organization name, like `google/ddpm-celebahq-256`.
+                    - A path to a *directory* containing model weights saved using [`~SchedulerMixin.save_pretrained`],
+                      e.g., `./my_model_directory/`.
+            subfolder (`str`, *optional*):
+                In case the relevant files are located inside a subfolder of the model repo (either remote in
+                huggingface.co or downloaded locally), you can specify the folder name here.
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                Whether kwargs that are not consumed by the Python class should be returned or not.
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory in which a downloaded pretrained model configuration should be cached if the
+                standard cache should not be used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only(`bool`, *optional*, defaults to `False`):
+                Whether or not to only look at local files (i.e., do not try to download the model).
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+                when running `transformers-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                identifier allowed by git.
+
+        <Tip>
+
+         It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
+         models](https://huggingface.co/docs/hub/models-gated#gated-models).
+
+        </Tip>
+
+        <Tip>
+
+        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
+        use this method in a firewalled environment.
+
+        </Tip>
+
+        """
+        config, kwargs = cls.load_config(
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
+            subfolder=subfolder,
+            return_unused_kwargs=True,
+            **kwargs,
+        )
+        scheduler, unused_kwargs = cls.from_config(config, return_unused_kwargs=True, **kwargs)
+
+        if hasattr(scheduler, "create_state") and getattr(scheduler, "has_state", False):
+            state = scheduler.create_state()
+
+        if return_unused_kwargs:
+            return scheduler, state, unused_kwargs
+
+        return scheduler, state
+
+    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
+        """
+        Save a scheduler configuration object to the directory `save_directory`, so that it can be re-loaded using the
+        [`~FlaxSchedulerMixin.from_pretrained`] class method.
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the configuration JSON file will be saved (will be created if it does not exist).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
+
+    @property
+    def compatibles(self):
+        """
+        Returns all schedulers that are compatible with this scheduler
+
+        Returns:
+            `List[SchedulerMixin]`: List of compatible schedulers
+        """
+        return self._get_compatibles()
+
+    @classmethod
+    def _get_compatibles(cls):
+        compatible_classes_str = list(set([cls.__name__] + cls._compatibles))
+        diffusers_library = importlib.import_module(__name__.split(".")[0])
+        compatible_classes = [
+            getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c)
+        ]
+        return compatible_classes
+
+
+def broadcast_to_shape_from_left(x: jnp.ndarray, shape: Tuple[int]) -> jnp.ndarray:
+    assert len(shape) >= x.ndim
+    return jnp.broadcast_to(x.reshape(x.shape + (1,) * (len(shape) - x.ndim)), shape)
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999, dtype=jnp.float32) -> jnp.ndarray:
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+
+    Returns:
+        betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+
+    def alpha_bar(time_step):
+        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return jnp.array(betas, dtype=dtype)
+
+
+@flax.struct.dataclass
+class CommonSchedulerState:
+    alphas: jnp.ndarray
+    betas: jnp.ndarray
+    alphas_cumprod: jnp.ndarray
+
+    @classmethod
+    def create(cls, scheduler):
+        config = scheduler.config
+
+        if config.trained_betas is not None:
+            betas = jnp.asarray(config.trained_betas, dtype=scheduler.dtype)
+        elif config.beta_schedule == "linear":
+            betas = jnp.linspace(config.beta_start, config.beta_end, config.num_train_timesteps, dtype=scheduler.dtype)
+        elif config.beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            betas = (
+                jnp.linspace(
+                    config.beta_start**0.5, config.beta_end**0.5, config.num_train_timesteps, dtype=scheduler.dtype
+                )
+                ** 2
+            )
+        elif config.beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            betas = betas_for_alpha_bar(config.num_train_timesteps, dtype=scheduler.dtype)
+        else:
+            raise NotImplementedError(
+                f"beta_schedule {config.beta_schedule} is not implemented for scheduler {scheduler.__class__.__name__}"
+            )
+
+        alphas = 1.0 - betas
+
+        alphas_cumprod = jnp.cumprod(alphas, axis=0)
+
+        return cls(
+            alphas=alphas,
+            betas=betas,
+            alphas_cumprod=alphas_cumprod,
+        )
+
+
+def get_sqrt_alpha_prod(
+    state: CommonSchedulerState, original_samples: jnp.ndarray, noise: jnp.ndarray, timesteps: jnp.ndarray
+):
+    alphas_cumprod = state.alphas_cumprod
+
+    sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+    sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+    sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
+
+    sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+    sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+    sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
+
+    return sqrt_alpha_prod, sqrt_one_minus_alpha_prod
+
+
+def add_noise_common(
+    state: CommonSchedulerState, original_samples: jnp.ndarray, noise: jnp.ndarray, timesteps: jnp.ndarray
+):
+    sqrt_alpha_prod, sqrt_one_minus_alpha_prod = get_sqrt_alpha_prod(state, original_samples, noise, timesteps)
+    noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+    return noisy_samples
+
+
+def get_velocity_common(state: CommonSchedulerState, sample: jnp.ndarray, noise: jnp.ndarray, timesteps: jnp.ndarray):
+    sqrt_alpha_prod, sqrt_one_minus_alpha_prod = get_sqrt_alpha_prod(state, sample, noise, timesteps)
+    velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+    return velocity
diff --git a/diffusers/src/diffusers/schedulers/scheduling_vq_diffusion.py b/diffusers/src/diffusers/schedulers/scheduling_vq_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd8d255fa9016621a4ffa03f39e019983c643445
--- /dev/null
+++ b/diffusers/src/diffusers/schedulers/scheduling_vq_diffusion.py
@@ -0,0 +1,467 @@
+# Copyright 2024 Microsoft and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class VQDiffusionSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+            Computed sample x_{t-1} of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: torch.LongTensor
+
+
+def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.Tensor:
+    """
+    Convert batch of vector of class indices into batch of log onehot vectors
+
+    Args:
+        x (`torch.LongTensor` of shape `(batch size, vector length)`):
+            Batch of class indices
+
+        num_classes (`int`):
+            number of classes to be used for the onehot vectors
+
+    Returns:
+        `torch.Tensor` of shape `(batch size, num classes, vector length)`:
+            Log onehot vectors
+    """
+    x_onehot = F.one_hot(x, num_classes)
+    x_onehot = x_onehot.permute(0, 2, 1)
+    log_x = torch.log(x_onehot.float().clamp(min=1e-30))
+    return log_x
+
+
+def gumbel_noised(logits: torch.Tensor, generator: Optional[torch.Generator]) -> torch.Tensor:
+    """
+    Apply gumbel noise to `logits`
+    """
+    uniform = torch.rand(logits.shape, device=logits.device, generator=generator)
+    gumbel_noise = -torch.log(-torch.log(uniform + 1e-30) + 1e-30)
+    noised = gumbel_noise + logits
+    return noised
+
+
+def alpha_schedules(num_diffusion_timesteps: int, alpha_cum_start=0.99999, alpha_cum_end=0.000009):
+    """
+    Cumulative and non-cumulative alpha schedules.
+
+    See section 4.1.
+    """
+    att = (
+        np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (alpha_cum_end - alpha_cum_start)
+        + alpha_cum_start
+    )
+    att = np.concatenate(([1], att))
+    at = att[1:] / att[:-1]
+    att = np.concatenate((att[1:], [1]))
+    return at, att
+
+
+def gamma_schedules(num_diffusion_timesteps: int, gamma_cum_start=0.000009, gamma_cum_end=0.99999):
+    """
+    Cumulative and non-cumulative gamma schedules.
+
+    See section 4.1.
+    """
+    ctt = (
+        np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (gamma_cum_end - gamma_cum_start)
+        + gamma_cum_start
+    )
+    ctt = np.concatenate(([0], ctt))
+    one_minus_ctt = 1 - ctt
+    one_minus_ct = one_minus_ctt[1:] / one_minus_ctt[:-1]
+    ct = 1 - one_minus_ct
+    ctt = np.concatenate((ctt[1:], [0]))
+    return ct, ctt
+
+
+class VQDiffusionScheduler(SchedulerMixin, ConfigMixin):
+    """
+    A scheduler for vector quantized diffusion.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_vec_classes (`int`):
+            The number of classes of the vector embeddings of the latent pixels. Includes the class for the masked
+            latent pixel.
+        num_train_timesteps (`int`, defaults to 100):
+            The number of diffusion steps to train the model.
+        alpha_cum_start (`float`, defaults to 0.99999):
+            The starting cumulative alpha value.
+        alpha_cum_end (`float`, defaults to 0.00009):
+            The ending cumulative alpha value.
+        gamma_cum_start (`float`, defaults to 0.00009):
+            The starting cumulative gamma value.
+        gamma_cum_end (`float`, defaults to 0.99999):
+            The ending cumulative gamma value.
+    """
+
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_vec_classes: int,
+        num_train_timesteps: int = 100,
+        alpha_cum_start: float = 0.99999,
+        alpha_cum_end: float = 0.000009,
+        gamma_cum_start: float = 0.000009,
+        gamma_cum_end: float = 0.99999,
+    ):
+        self.num_embed = num_vec_classes
+
+        # By convention, the index for the mask class is the last class index
+        self.mask_class = self.num_embed - 1
+
+        at, att = alpha_schedules(num_train_timesteps, alpha_cum_start=alpha_cum_start, alpha_cum_end=alpha_cum_end)
+        ct, ctt = gamma_schedules(num_train_timesteps, gamma_cum_start=gamma_cum_start, gamma_cum_end=gamma_cum_end)
+
+        num_non_mask_classes = self.num_embed - 1
+        bt = (1 - at - ct) / num_non_mask_classes
+        btt = (1 - att - ctt) / num_non_mask_classes
+
+        at = torch.tensor(at.astype("float64"))
+        bt = torch.tensor(bt.astype("float64"))
+        ct = torch.tensor(ct.astype("float64"))
+        log_at = torch.log(at)
+        log_bt = torch.log(bt)
+        log_ct = torch.log(ct)
+
+        att = torch.tensor(att.astype("float64"))
+        btt = torch.tensor(btt.astype("float64"))
+        ctt = torch.tensor(ctt.astype("float64"))
+        log_cumprod_at = torch.log(att)
+        log_cumprod_bt = torch.log(btt)
+        log_cumprod_ct = torch.log(ctt)
+
+        self.log_at = log_at.float()
+        self.log_bt = log_bt.float()
+        self.log_ct = log_ct.float()
+        self.log_cumprod_at = log_cumprod_at.float()
+        self.log_cumprod_bt = log_cumprod_bt.float()
+        self.log_cumprod_ct = log_cumprod_ct.float()
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps and diffusion process parameters (alpha, beta, gamma) should be moved
+                to.
+        """
+        self.num_inference_steps = num_inference_steps
+        timesteps = np.arange(0, self.num_inference_steps)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+        self.log_at = self.log_at.to(device)
+        self.log_bt = self.log_bt.to(device)
+        self.log_ct = self.log_ct.to(device)
+        self.log_cumprod_at = self.log_cumprod_at.to(device)
+        self.log_cumprod_bt = self.log_cumprod_bt.to(device)
+        self.log_cumprod_ct = self.log_cumprod_ct.to(device)
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: torch.long,
+        sample: torch.LongTensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[VQDiffusionSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by the reverse transition distribution. See
+        [`~VQDiffusionScheduler.q_posterior`] for more details about how the distribution is computer.
+
+        Args:
+            log_p_x_0: (`torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`):
+                The log probabilities for the predicted classes of the initial latent pixels. Does not include a
+                prediction for the masked class as the initial unnoised image cannot be masked.
+            t (`torch.long`):
+                The timestep that determines which transition matrices are used.
+            x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+                The classes of each latent pixel at time `t`.
+            generator (`torch.Generator`, or `None`):
+                A random number generator for the noise applied to `p(x_{t-1} | x_t)` before it is sampled from.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput`] or
+                `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+        if timestep == 0:
+            log_p_x_t_min_1 = model_output
+        else:
+            log_p_x_t_min_1 = self.q_posterior(model_output, sample, timestep)
+
+        log_p_x_t_min_1 = gumbel_noised(log_p_x_t_min_1, generator)
+
+        x_t_min_1 = log_p_x_t_min_1.argmax(dim=1)
+
+        if not return_dict:
+            return (x_t_min_1,)
+
+        return VQDiffusionSchedulerOutput(prev_sample=x_t_min_1)
+
+    def q_posterior(self, log_p_x_0, x_t, t):
+        """
+        Calculates the log probabilities for the predicted classes of the image at timestep `t-1`:
+
+        ```
+        p(x_{t-1} | x_t) = sum( q(x_t | x_{t-1}) * q(x_{t-1} | x_0) * p(x_0) / q(x_t | x_0) )
+        ```
+
+        Args:
+            log_p_x_0 (`torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`):
+                The log probabilities for the predicted classes of the initial latent pixels. Does not include a
+                prediction for the masked class as the initial unnoised image cannot be masked.
+            x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+                The classes of each latent pixel at time `t`.
+            t (`torch.Long`):
+                The timestep that determines which transition matrix is used.
+
+        Returns:
+            `torch.Tensor` of shape `(batch size, num classes, num latent pixels)`:
+                The log probabilities for the predicted classes of the image at timestep `t-1`.
+        """
+        log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed)
+
+        log_q_x_t_given_x_0 = self.log_Q_t_transitioning_to_known_class(
+            t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True
+        )
+
+        log_q_t_given_x_t_min_1 = self.log_Q_t_transitioning_to_known_class(
+            t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False
+        )
+
+        # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0)          ...      p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0)
+        #               .                    .                                   .
+        #               .                            .                           .
+        #               .                                      .                 .
+        # p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1})  ...      p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1})
+        q = log_p_x_0 - log_q_x_t_given_x_0
+
+        # sum_0 = p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + ... + p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}), ... ,
+        # sum_n = p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + ... + p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1})
+        q_log_sum_exp = torch.logsumexp(q, dim=1, keepdim=True)
+
+        # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0          ...      p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n
+        #                        .                             .                                   .
+        #                        .                                     .                           .
+        #                        .                                               .                 .
+        # p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0  ...      p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n
+        q = q - q_log_sum_exp
+
+        # (p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}          ...      (p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}
+        #                                         .                                                .                                              .
+        #                                         .                                                        .                                      .
+        #                                         .                                                                  .                            .
+        # (p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}  ...      (p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}
+        # c_cumulative_{t-1}                                                                                 ...      c_cumulative_{t-1}
+        q = self.apply_cumulative_transitions(q, t - 1)
+
+        # ((p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_0) * sum_0              ...      ((p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_0) * sum_n
+        #                                                            .                                                                 .                                              .
+        #                                                            .                                                                         .                                      .
+        #                                                            .                                                                                   .                            .
+        # ((p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_{k-1}) * sum_0  ...      ((p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_{k-1}) * sum_n
+        # c_cumulative_{t-1} * q(x_t | x_{t-1}=C_k) * sum_0                                                                                       ...      c_cumulative_{t-1} * q(x_t | x_{t-1}=C_k) * sum_0
+        log_p_x_t_min_1 = q + log_q_t_given_x_t_min_1 + q_log_sum_exp
+
+        # For each column, there are two possible cases.
+        #
+        # Where:
+        # - sum(p_n(x_0))) is summing over all classes for x_0
+        # - C_i is the class transitioning from (not to be confused with c_t and c_cumulative_t being used for gamma's)
+        # - C_j is the class transitioning to
+        #
+        # 1. x_t is masked i.e. x_t = c_k
+        #
+        # Simplifying the expression, the column vector is:
+        #                                                      .
+        #                                                      .
+        #                                                      .
+        # (c_t / c_cumulative_t) * (a_cumulative_{t-1} * p_n(x_0 = C_i | x_t) + b_cumulative_{t-1} * sum(p_n(x_0)))
+        #                                                      .
+        #                                                      .
+        #                                                      .
+        # (c_cumulative_{t-1} / c_cumulative_t) * sum(p_n(x_0))
+        #
+        # From equation (11) stated in terms of forward probabilities, the last row is trivially verified.
+        #
+        # For the other rows, we can state the equation as ...
+        #
+        # (c_t / c_cumulative_t) * [b_cumulative_{t-1} * p(x_0=c_0) + ... + (a_cumulative_{t-1} + b_cumulative_{t-1}) * p(x_0=C_i) + ... + b_cumulative_{k-1} * p(x_0=c_{k-1})]
+        #
+        # This verifies the other rows.
+        #
+        # 2. x_t is not masked
+        #
+        # Simplifying the expression, there are two cases for the rows of the column vector, where C_j = C_i and where C_j != C_i:
+        #                                                      .
+        #                                                      .
+        #                                                      .
+        # C_j != C_i:        b_t * ((b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_0) + ... + ((a_cumulative_{t-1} + b_cumulative_{t-1}) / b_cumulative_t) * p_n(x_0 = C_i) + ... + (b_cumulative_{t-1} / (a_cumulative_t + b_cumulative_t)) * p_n(c_0=C_j) + ... + (b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_{k-1}))
+        #                                                      .
+        #                                                      .
+        #                                                      .
+        # C_j = C_i: (a_t + b_t) * ((b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_0) + ... + ((a_cumulative_{t-1} + b_cumulative_{t-1}) / (a_cumulative_t + b_cumulative_t)) * p_n(x_0 = C_i = C_j) + ... + (b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_{k-1}))
+        #                                                      .
+        #                                                      .
+        #                                                      .
+        # 0
+        #
+        # The last row is trivially verified. The other rows can be verified by directly expanding equation (11) stated in terms of forward probabilities.
+        return log_p_x_t_min_1
+
+    def log_Q_t_transitioning_to_known_class(
+        self, *, t: torch.int, x_t: torch.LongTensor, log_onehot_x_t: torch.Tensor, cumulative: bool
+    ):
+        """
+        Calculates the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each
+        latent pixel in `x_t`.
+
+        Args:
+            t (`torch.Long`):
+                The timestep that determines which transition matrix is used.
+            x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+                The classes of each latent pixel at time `t`.
+            log_onehot_x_t (`torch.Tensor` of shape `(batch size, num classes, num latent pixels)`):
+                The log one-hot vectors of `x_t`.
+            cumulative (`bool`):
+                If cumulative is `False`, the single step transition matrix `t-1`->`t` is used. If cumulative is
+                `True`, the cumulative transition matrix `0`->`t` is used.
+
+        Returns:
+            `torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`:
+                Each _column_ of the returned matrix is a _row_ of log probabilities of the complete probability
+                transition matrix.
+
+                When non cumulative, returns `self.num_classes - 1` rows because the initial latent pixel cannot be
+                masked.
+
+                Where:
+                - `q_n` is the probability distribution for the forward process of the `n`th latent pixel.
+                - C_0 is a class of a latent pixel embedding
+                - C_k is the class of the masked latent pixel
+
+                non-cumulative result (omitting logarithms):
+                ```
+                q_0(x_t | x_{t-1} = C_0) ... q_n(x_t | x_{t-1} = C_0)
+                          .      .                     .
+                          .               .            .
+                          .                      .     .
+                q_0(x_t | x_{t-1} = C_k) ... q_n(x_t | x_{t-1} = C_k)
+                ```
+
+                cumulative result (omitting logarithms):
+                ```
+                q_0_cumulative(x_t | x_0 = C_0)    ...  q_n_cumulative(x_t | x_0 = C_0)
+                          .               .                          .
+                          .                        .                 .
+                          .                               .          .
+                q_0_cumulative(x_t | x_0 = C_{k-1}) ... q_n_cumulative(x_t | x_0 = C_{k-1})
+                ```
+        """
+        if cumulative:
+            a = self.log_cumprod_at[t]
+            b = self.log_cumprod_bt[t]
+            c = self.log_cumprod_ct[t]
+        else:
+            a = self.log_at[t]
+            b = self.log_bt[t]
+            c = self.log_ct[t]
+
+        if not cumulative:
+            # The values in the onehot vector can also be used as the logprobs for transitioning
+            # from masked latent pixels. If we are not calculating the cumulative transitions,
+            # we need to save these vectors to be re-appended to the final matrix so the values
+            # aren't overwritten.
+            #
+            # `P(x_t!=mask|x_{t-1=mask}) = 0` and 0 will be the value of the last row of the onehot vector
+            # if x_t is not masked
+            #
+            # `P(x_t=mask|x_{t-1=mask}) = 1` and 1 will be the value of the last row of the onehot vector
+            # if x_t is masked
+            log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:, -1, :].unsqueeze(1)
+
+        # `index_to_log_onehot` will add onehot vectors for masked pixels,
+        # so the default one hot matrix has one too many rows. See the doc string
+        # for an explanation of the dimensionality of the returned matrix.
+        log_onehot_x_t = log_onehot_x_t[:, :-1, :]
+
+        # this is a cheeky trick to produce the transition probabilities using log one-hot vectors.
+        #
+        # Don't worry about what values this sets in the columns that mark transitions
+        # to masked latent pixels. They are overwrote later with the `mask_class_mask`.
+        #
+        # Looking at the below logspace formula in non-logspace, each value will evaluate to either
+        # `1 * a + b = a + b` where `log_Q_t` has the one hot value in the column
+        # or
+        # `0 * a + b = b` where `log_Q_t` has the 0 values in the column.
+        #
+        # See equation 7 for more details.
+        log_Q_t = (log_onehot_x_t + a).logaddexp(b)
+
+        # The whole column of each masked pixel is `c`
+        mask_class_mask = x_t == self.mask_class
+        mask_class_mask = mask_class_mask.unsqueeze(1).expand(-1, self.num_embed - 1, -1)
+        log_Q_t[mask_class_mask] = c
+
+        if not cumulative:
+            log_Q_t = torch.cat((log_Q_t, log_onehot_x_t_transitioning_from_masked), dim=1)
+
+        return log_Q_t
+
+    def apply_cumulative_transitions(self, q, t):
+        bsz = q.shape[0]
+        a = self.log_cumprod_at[t]
+        b = self.log_cumprod_bt[t]
+        c = self.log_cumprod_ct[t]
+
+        num_latent_pixels = q.shape[2]
+        c = c.expand(bsz, 1, num_latent_pixels)
+
+        q = (q + a).logaddexp(b)
+        q = torch.cat((q, c), dim=1)
+
+        return q
diff --git a/diffusers/src/diffusers/training_utils.py b/diffusers/src/diffusers/training_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..26d4a2a504c6e9dac13e5ab048fa16dd5efc633b
--- /dev/null
+++ b/diffusers/src/diffusers/training_utils.py
@@ -0,0 +1,610 @@
+import contextlib
+import copy
+import gc
+import math
+import random
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from .models import UNet2DConditionModel
+from .schedulers import SchedulerMixin
+from .utils import (
+    convert_state_dict_to_diffusers,
+    convert_state_dict_to_peft,
+    deprecate,
+    is_peft_available,
+    is_torch_npu_available,
+    is_torchvision_available,
+    is_transformers_available,
+)
+
+
+if is_transformers_available():
+    import transformers
+
+if is_peft_available():
+    from peft import set_peft_model_state_dict
+
+if is_torchvision_available():
+    from torchvision import transforms
+
+if is_torch_npu_available():
+    import torch_npu  # noqa: F401
+
+
+def set_seed(seed: int):
+    """
+    Args:
+    Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.
+        seed (`int`): The seed to set.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if is_torch_npu_available():
+        torch.npu.manual_seed_all(seed)
+    else:
+        torch.cuda.manual_seed_all(seed)
+        # ^^ safe to call this function even if cuda is not available
+
+
+def compute_snr(noise_scheduler, timesteps):
+    """
+    Computes SNR as per
+    https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
+    """
+    alphas_cumprod = noise_scheduler.alphas_cumprod
+    sqrt_alphas_cumprod = alphas_cumprod**0.5
+    sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
+
+    # Expand the tensors.
+    # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
+    sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
+    while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
+        sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
+    alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
+
+    sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
+    while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
+        sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
+    sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
+
+    # Compute SNR.
+    snr = (alpha / sigma) ** 2
+    return snr
+
+
+def resolve_interpolation_mode(interpolation_type: str):
+    """
+    Maps a string describing an interpolation function to the corresponding torchvision `InterpolationMode` enum. The
+    full list of supported enums is documented at
+    https://pytorch.org/vision/0.9/transforms.html#torchvision.transforms.functional.InterpolationMode.
+
+    Args:
+        interpolation_type (`str`):
+            A string describing an interpolation method. Currently, `bilinear`, `bicubic`, `box`, `nearest`,
+            `nearest_exact`, `hamming`, and `lanczos` are supported, corresponding to the supported interpolation modes
+            in torchvision.
+
+    Returns:
+        `torchvision.transforms.InterpolationMode`: an `InterpolationMode` enum used by torchvision's `resize`
+        transform.
+    """
+    if not is_torchvision_available():
+        raise ImportError(
+            "Please make sure to install `torchvision` to be able to use the `resolve_interpolation_mode()` function."
+        )
+
+    if interpolation_type == "bilinear":
+        interpolation_mode = transforms.InterpolationMode.BILINEAR
+    elif interpolation_type == "bicubic":
+        interpolation_mode = transforms.InterpolationMode.BICUBIC
+    elif interpolation_type == "box":
+        interpolation_mode = transforms.InterpolationMode.BOX
+    elif interpolation_type == "nearest":
+        interpolation_mode = transforms.InterpolationMode.NEAREST
+    elif interpolation_type == "nearest_exact":
+        interpolation_mode = transforms.InterpolationMode.NEAREST_EXACT
+    elif interpolation_type == "hamming":
+        interpolation_mode = transforms.InterpolationMode.HAMMING
+    elif interpolation_type == "lanczos":
+        interpolation_mode = transforms.InterpolationMode.LANCZOS
+    else:
+        raise ValueError(
+            f"The given interpolation mode {interpolation_type} is not supported. Currently supported interpolation"
+            f" modes are `bilinear`, `bicubic`, `box`, `nearest`, `nearest_exact`, `hamming`, and `lanczos`."
+        )
+
+    return interpolation_mode
+
+
+def compute_dream_and_update_latents(
+    unet: UNet2DConditionModel,
+    noise_scheduler: SchedulerMixin,
+    timesteps: torch.Tensor,
+    noise: torch.Tensor,
+    noisy_latents: torch.Tensor,
+    target: torch.Tensor,
+    encoder_hidden_states: torch.Tensor,
+    dream_detail_preservation: float = 1.0,
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+    """
+    Implements "DREAM (Diffusion Rectification and Estimation-Adaptive Models)" from http://arxiv.org/abs/2312.00210.
+    DREAM helps align training with sampling to help training be more efficient and accurate at the cost of an extra
+    forward step without gradients.
+
+    Args:
+        `unet`: The state unet to use to make a prediction.
+        `noise_scheduler`: The noise scheduler used to add noise for the given timestep.
+        `timesteps`: The timesteps for the noise_scheduler to user.
+        `noise`: A tensor of noise in the shape of noisy_latents.
+        `noisy_latents`: Previously noise latents from the training loop.
+        `target`: The ground-truth tensor to predict after eps is removed.
+        `encoder_hidden_states`: Text embeddings from the text model.
+        `dream_detail_preservation`: A float value that indicates detail preservation level.
+          See reference.
+
+    Returns:
+        `tuple[torch.Tensor, torch.Tensor]`: Adjusted noisy_latents and target.
+    """
+    alphas_cumprod = noise_scheduler.alphas_cumprod.to(timesteps.device)[timesteps, None, None, None]
+    sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
+
+    # The paper uses lambda = sqrt(1 - alpha) ** p, with p = 1 in their experiments.
+    dream_lambda = sqrt_one_minus_alphas_cumprod**dream_detail_preservation
+
+    pred = None
+    with torch.no_grad():
+        pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+    _noisy_latents, _target = (None, None)
+    if noise_scheduler.config.prediction_type == "epsilon":
+        predicted_noise = pred
+        delta_noise = (noise - predicted_noise).detach()
+        delta_noise.mul_(dream_lambda)
+        _noisy_latents = noisy_latents.add(sqrt_one_minus_alphas_cumprod * delta_noise)
+        _target = target.add(delta_noise)
+    elif noise_scheduler.config.prediction_type == "v_prediction":
+        raise NotImplementedError("DREAM has not been implemented for v-prediction")
+    else:
+        raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+    return _noisy_latents, _target
+
+
+def unet_lora_state_dict(unet: UNet2DConditionModel) -> Dict[str, torch.Tensor]:
+    r"""
+    Returns:
+        A state dict containing just the LoRA parameters.
+    """
+    lora_state_dict = {}
+
+    for name, module in unet.named_modules():
+        if hasattr(module, "set_lora_layer"):
+            lora_layer = getattr(module, "lora_layer")
+            if lora_layer is not None:
+                current_lora_layer_sd = lora_layer.state_dict()
+                for lora_layer_matrix_name, lora_param in current_lora_layer_sd.items():
+                    # The matrix name can either be "down" or "up".
+                    lora_state_dict[f"{name}.lora.{lora_layer_matrix_name}"] = lora_param
+
+    return lora_state_dict
+
+
+def cast_training_params(model: Union[torch.nn.Module, List[torch.nn.Module]], dtype=torch.float32):
+    if not isinstance(model, list):
+        model = [model]
+    for m in model:
+        for param in m.parameters():
+            # only upcast trainable parameters into fp32
+            if param.requires_grad:
+                param.data = param.to(dtype)
+
+
+def _set_state_dict_into_text_encoder(
+    lora_state_dict: Dict[str, torch.Tensor], prefix: str, text_encoder: torch.nn.Module
+):
+    """
+    Sets the `lora_state_dict` into `text_encoder` coming from `transformers`.
+
+    Args:
+        lora_state_dict: The state dictionary to be set.
+        prefix: String identifier to retrieve the portion of the state dict that belongs to `text_encoder`.
+        text_encoder: Where the `lora_state_dict` is to be set.
+    """
+
+    text_encoder_state_dict = {
+        f'{k.replace(prefix, "")}': v for k, v in lora_state_dict.items() if k.startswith(prefix)
+    }
+    text_encoder_state_dict = convert_state_dict_to_peft(convert_state_dict_to_diffusers(text_encoder_state_dict))
+    set_peft_model_state_dict(text_encoder, text_encoder_state_dict, adapter_name="default")
+
+
+def compute_density_for_timestep_sampling(
+    weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
+):
+    """Compute the density for sampling the timesteps when doing SD3 training.
+
+    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
+
+    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
+    """
+    if weighting_scheme == "logit_normal":
+        # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
+        u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
+        u = torch.nn.functional.sigmoid(u)
+    elif weighting_scheme == "mode":
+        u = torch.rand(size=(batch_size,), device="cpu")
+        u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
+    else:
+        u = torch.rand(size=(batch_size,), device="cpu")
+    return u
+
+
+def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
+    """Computes loss weighting scheme for SD3 training.
+
+    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
+
+    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
+    """
+    if weighting_scheme == "sigma_sqrt":
+        weighting = (sigmas**-2.0).float()
+    elif weighting_scheme == "cosmap":
+        bot = 1 - 2 * sigmas + 2 * sigmas**2
+        weighting = 2 / (math.pi * bot)
+    else:
+        weighting = torch.ones_like(sigmas)
+    return weighting
+
+
+def clear_objs_and_retain_memory(objs: List[Any]):
+    """Deletes `objs` and runs garbage collection. Then clears the cache of the available accelerator."""
+    if len(objs) >= 1:
+        for obj in objs:
+            del obj
+
+    gc.collect()
+
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    elif torch.backends.mps.is_available():
+        torch.mps.empty_cache()
+    elif is_torch_npu_available():
+        torch_npu.empty_cache()
+
+
+# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14
+class EMAModel:
+    """
+    Exponential Moving Average of models weights
+    """
+
+    def __init__(
+        self,
+        parameters: Iterable[torch.nn.Parameter],
+        decay: float = 0.9999,
+        min_decay: float = 0.0,
+        update_after_step: int = 0,
+        use_ema_warmup: bool = False,
+        inv_gamma: Union[float, int] = 1.0,
+        power: Union[float, int] = 2 / 3,
+        foreach: bool = False,
+        model_cls: Optional[Any] = None,
+        model_config: Dict[str, Any] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            parameters (Iterable[torch.nn.Parameter]): The parameters to track.
+            decay (float): The decay factor for the exponential moving average.
+            min_decay (float): The minimum decay factor for the exponential moving average.
+            update_after_step (int): The number of steps to wait before starting to update the EMA weights.
+            use_ema_warmup (bool): Whether to use EMA warmup.
+            inv_gamma (float):
+                Inverse multiplicative factor of EMA warmup. Default: 1. Only used if `use_ema_warmup` is True.
+            power (float): Exponential factor of EMA warmup. Default: 2/3. Only used if `use_ema_warmup` is True.
+            foreach (bool): Use torch._foreach functions for updating shadow parameters. Should be faster.
+            device (Optional[Union[str, torch.device]]): The device to store the EMA weights on. If None, the EMA
+                        weights will be stored on CPU.
+
+        @crowsonkb's notes on EMA Warmup:
+            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
+            to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
+            gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
+            at 215.4k steps).
+        """
+
+        if isinstance(parameters, torch.nn.Module):
+            deprecation_message = (
+                "Passing a `torch.nn.Module` to `ExponentialMovingAverage` is deprecated. "
+                "Please pass the parameters of the module instead."
+            )
+            deprecate(
+                "passing a `torch.nn.Module` to `ExponentialMovingAverage`",
+                "1.0.0",
+                deprecation_message,
+                standard_warn=False,
+            )
+            parameters = parameters.parameters()
+
+            # set use_ema_warmup to True if a torch.nn.Module is passed for backwards compatibility
+            use_ema_warmup = True
+
+        if kwargs.get("max_value", None) is not None:
+            deprecation_message = "The `max_value` argument is deprecated. Please use `decay` instead."
+            deprecate("max_value", "1.0.0", deprecation_message, standard_warn=False)
+            decay = kwargs["max_value"]
+
+        if kwargs.get("min_value", None) is not None:
+            deprecation_message = "The `min_value` argument is deprecated. Please use `min_decay` instead."
+            deprecate("min_value", "1.0.0", deprecation_message, standard_warn=False)
+            min_decay = kwargs["min_value"]
+
+        parameters = list(parameters)
+        self.shadow_params = [p.clone().detach() for p in parameters]
+
+        if kwargs.get("device", None) is not None:
+            deprecation_message = "The `device` argument is deprecated. Please use `to` instead."
+            deprecate("device", "1.0.0", deprecation_message, standard_warn=False)
+            self.to(device=kwargs["device"])
+
+        self.temp_stored_params = None
+
+        self.decay = decay
+        self.min_decay = min_decay
+        self.update_after_step = update_after_step
+        self.use_ema_warmup = use_ema_warmup
+        self.inv_gamma = inv_gamma
+        self.power = power
+        self.optimization_step = 0
+        self.cur_decay_value = None  # set in `step()`
+        self.foreach = foreach
+
+        self.model_cls = model_cls
+        self.model_config = model_config
+
+    @classmethod
+    def from_pretrained(cls, path, model_cls, foreach=False) -> "EMAModel":
+        _, ema_kwargs = model_cls.load_config(path, return_unused_kwargs=True)
+        model = model_cls.from_pretrained(path)
+
+        ema_model = cls(model.parameters(), model_cls=model_cls, model_config=model.config, foreach=foreach)
+
+        ema_model.load_state_dict(ema_kwargs)
+        return ema_model
+
+    def save_pretrained(self, path):
+        if self.model_cls is None:
+            raise ValueError("`save_pretrained` can only be used if `model_cls` was defined at __init__.")
+
+        if self.model_config is None:
+            raise ValueError("`save_pretrained` can only be used if `model_config` was defined at __init__.")
+
+        model = self.model_cls.from_config(self.model_config)
+        state_dict = self.state_dict()
+        state_dict.pop("shadow_params", None)
+
+        model.register_to_config(**state_dict)
+        self.copy_to(model.parameters())
+        model.save_pretrained(path)
+
+    def get_decay(self, optimization_step: int) -> float:
+        """
+        Compute the decay factor for the exponential moving average.
+        """
+        step = max(0, optimization_step - self.update_after_step - 1)
+
+        if step <= 0:
+            return 0.0
+
+        if self.use_ema_warmup:
+            cur_decay_value = 1 - (1 + step / self.inv_gamma) ** -self.power
+        else:
+            cur_decay_value = (1 + step) / (10 + step)
+
+        cur_decay_value = min(cur_decay_value, self.decay)
+        # make sure decay is not smaller than min_decay
+        cur_decay_value = max(cur_decay_value, self.min_decay)
+        return cur_decay_value
+
+    @torch.no_grad()
+    def step(self, parameters: Iterable[torch.nn.Parameter]):
+        if isinstance(parameters, torch.nn.Module):
+            deprecation_message = (
+                "Passing a `torch.nn.Module` to `ExponentialMovingAverage.step` is deprecated. "
+                "Please pass the parameters of the module instead."
+            )
+            deprecate(
+                "passing a `torch.nn.Module` to `ExponentialMovingAverage.step`",
+                "1.0.0",
+                deprecation_message,
+                standard_warn=False,
+            )
+            parameters = parameters.parameters()
+
+        parameters = list(parameters)
+
+        self.optimization_step += 1
+
+        # Compute the decay factor for the exponential moving average.
+        decay = self.get_decay(self.optimization_step)
+        self.cur_decay_value = decay
+        one_minus_decay = 1 - decay
+
+        context_manager = contextlib.nullcontext
+        if is_transformers_available() and transformers.integrations.deepspeed.is_deepspeed_zero3_enabled():
+            import deepspeed
+
+        if self.foreach:
+            if is_transformers_available() and transformers.integrations.deepspeed.is_deepspeed_zero3_enabled():
+                context_manager = deepspeed.zero.GatheredParameters(parameters, modifier_rank=None)
+
+            with context_manager():
+                params_grad = [param for param in parameters if param.requires_grad]
+                s_params_grad = [
+                    s_param for s_param, param in zip(self.shadow_params, parameters) if param.requires_grad
+                ]
+
+                if len(params_grad) < len(parameters):
+                    torch._foreach_copy_(
+                        [s_param for s_param, param in zip(self.shadow_params, parameters) if not param.requires_grad],
+                        [param for param in parameters if not param.requires_grad],
+                        non_blocking=True,
+                    )
+
+                torch._foreach_sub_(
+                    s_params_grad, torch._foreach_sub(s_params_grad, params_grad), alpha=one_minus_decay
+                )
+
+        else:
+            for s_param, param in zip(self.shadow_params, parameters):
+                if is_transformers_available() and transformers.integrations.deepspeed.is_deepspeed_zero3_enabled():
+                    context_manager = deepspeed.zero.GatheredParameters(param, modifier_rank=None)
+
+                with context_manager():
+                    if param.requires_grad:
+                        s_param.sub_(one_minus_decay * (s_param - param))
+                    else:
+                        s_param.copy_(param)
+
+    def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None:
+        """
+        Copy current averaged parameters into given collection of parameters.
+
+        Args:
+            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+                updated with the stored moving averages. If `None`, the parameters with which this
+                `ExponentialMovingAverage` was initialized will be used.
+        """
+        parameters = list(parameters)
+        if self.foreach:
+            torch._foreach_copy_(
+                [param.data for param in parameters],
+                [s_param.to(param.device).data for s_param, param in zip(self.shadow_params, parameters)],
+            )
+        else:
+            for s_param, param in zip(self.shadow_params, parameters):
+                param.data.copy_(s_param.to(param.device).data)
+
+    def pin_memory(self) -> None:
+        r"""
+        Move internal buffers of the ExponentialMovingAverage to pinned memory. Useful for non-blocking transfers for
+        offloading EMA params to the host.
+        """
+
+        self.shadow_params = [p.pin_memory() for p in self.shadow_params]
+
+    def to(self, device=None, dtype=None, non_blocking=False) -> None:
+        r"""Move internal buffers of the ExponentialMovingAverage to `device`.
+
+        Args:
+            device: like `device` argument to `torch.Tensor.to`
+        """
+        # .to() on the tensors handles None correctly
+        self.shadow_params = [
+            p.to(device=device, dtype=dtype, non_blocking=non_blocking)
+            if p.is_floating_point()
+            else p.to(device=device, non_blocking=non_blocking)
+            for p in self.shadow_params
+        ]
+
+    def state_dict(self) -> dict:
+        r"""
+        Returns the state of the ExponentialMovingAverage as a dict. This method is used by accelerate during
+        checkpointing to save the ema state dict.
+        """
+        # Following PyTorch conventions, references to tensors are returned:
+        # "returns a reference to the state and not its copy!" -
+        # https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict
+        return {
+            "decay": self.decay,
+            "min_decay": self.min_decay,
+            "optimization_step": self.optimization_step,
+            "update_after_step": self.update_after_step,
+            "use_ema_warmup": self.use_ema_warmup,
+            "inv_gamma": self.inv_gamma,
+            "power": self.power,
+            "shadow_params": self.shadow_params,
+        }
+
+    def store(self, parameters: Iterable[torch.nn.Parameter]) -> None:
+        r"""
+        Args:
+        Save the current parameters for restoring later.
+            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+                temporarily stored.
+        """
+        self.temp_stored_params = [param.detach().cpu().clone() for param in parameters]
+
+    def restore(self, parameters: Iterable[torch.nn.Parameter]) -> None:
+        r"""
+        Args:
+        Restore the parameters stored with the `store` method. Useful to validate the model with EMA parameters without:
+        affecting the original optimization process. Store the parameters before the `copy_to()` method. After
+        validation (or model saving), use this to restore the former parameters.
+            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+                updated with the stored parameters. If `None`, the parameters with which this
+                `ExponentialMovingAverage` was initialized will be used.
+        """
+        if self.temp_stored_params is None:
+            raise RuntimeError("This ExponentialMovingAverage has no `store()`ed weights " "to `restore()`")
+        if self.foreach:
+            torch._foreach_copy_(
+                [param.data for param in parameters], [c_param.data for c_param in self.temp_stored_params]
+            )
+        else:
+            for c_param, param in zip(self.temp_stored_params, parameters):
+                param.data.copy_(c_param.data)
+
+        # Better memory-wise.
+        self.temp_stored_params = None
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        r"""
+        Args:
+        Loads the ExponentialMovingAverage state. This method is used by accelerate during checkpointing to save the
+        ema state dict.
+            state_dict (dict): EMA state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        # deepcopy, to be consistent with module API
+        state_dict = copy.deepcopy(state_dict)
+
+        self.decay = state_dict.get("decay", self.decay)
+        if self.decay < 0.0 or self.decay > 1.0:
+            raise ValueError("Decay must be between 0 and 1")
+
+        self.min_decay = state_dict.get("min_decay", self.min_decay)
+        if not isinstance(self.min_decay, float):
+            raise ValueError("Invalid min_decay")
+
+        self.optimization_step = state_dict.get("optimization_step", self.optimization_step)
+        if not isinstance(self.optimization_step, int):
+            raise ValueError("Invalid optimization_step")
+
+        self.update_after_step = state_dict.get("update_after_step", self.update_after_step)
+        if not isinstance(self.update_after_step, int):
+            raise ValueError("Invalid update_after_step")
+
+        self.use_ema_warmup = state_dict.get("use_ema_warmup", self.use_ema_warmup)
+        if not isinstance(self.use_ema_warmup, bool):
+            raise ValueError("Invalid use_ema_warmup")
+
+        self.inv_gamma = state_dict.get("inv_gamma", self.inv_gamma)
+        if not isinstance(self.inv_gamma, (float, int)):
+            raise ValueError("Invalid inv_gamma")
+
+        self.power = state_dict.get("power", self.power)
+        if not isinstance(self.power, (float, int)):
+            raise ValueError("Invalid power")
+
+        shadow_params = state_dict.get("shadow_params", None)
+        if shadow_params is not None:
+            self.shadow_params = shadow_params
+            if not isinstance(self.shadow_params, list):
+                raise ValueError("shadow_params must be a list")
+            if not all(isinstance(p, torch.Tensor) for p in self.shadow_params):
+                raise ValueError("shadow_params must all be Tensors")
diff --git a/diffusers/src/diffusers/utils/__init__.py b/diffusers/src/diffusers/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7ea2bcc5b7ff86679965dd73d62f3618c2ed096
--- /dev/null
+++ b/diffusers/src/diffusers/utils/__init__.py
@@ -0,0 +1,134 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+from packaging import version
+
+from .. import __version__
+from .constants import (
+    CONFIG_NAME,
+    DEPRECATED_REVISION_ARGS,
+    DIFFUSERS_DYNAMIC_MODULE_NAME,
+    FLAX_WEIGHTS_NAME,
+    HF_MODULES_CACHE,
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    MIN_PEFT_VERSION,
+    ONNX_EXTERNAL_WEIGHTS_NAME,
+    ONNX_WEIGHTS_NAME,
+    SAFE_WEIGHTS_INDEX_NAME,
+    SAFETENSORS_FILE_EXTENSION,
+    SAFETENSORS_WEIGHTS_NAME,
+    USE_PEFT_BACKEND,
+    WEIGHTS_INDEX_NAME,
+    WEIGHTS_NAME,
+)
+from .deprecation_utils import deprecate
+from .doc_utils import replace_example_docstring
+from .dynamic_modules_utils import get_class_from_dynamic_module
+from .export_utils import export_to_gif, export_to_obj, export_to_ply, export_to_video
+from .hub_utils import (
+    PushToHubMixin,
+    _add_variant,
+    _get_checkpoint_shard_files,
+    _get_model_file,
+    extract_commit_hash,
+    http_user_agent,
+)
+from .import_utils import (
+    BACKENDS_MAPPING,
+    DIFFUSERS_SLOW_IMPORT,
+    ENV_VARS_TRUE_AND_AUTO_VALUES,
+    ENV_VARS_TRUE_VALUES,
+    USE_JAX,
+    USE_TF,
+    USE_TORCH,
+    DummyObject,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_accelerate_available,
+    is_accelerate_version,
+    is_bitsandbytes_available,
+    is_bs4_available,
+    is_flax_available,
+    is_ftfy_available,
+    is_google_colab,
+    is_inflect_available,
+    is_invisible_watermark_available,
+    is_k_diffusion_available,
+    is_k_diffusion_version,
+    is_librosa_available,
+    is_matplotlib_available,
+    is_note_seq_available,
+    is_onnx_available,
+    is_peft_available,
+    is_peft_version,
+    is_safetensors_available,
+    is_scipy_available,
+    is_sentencepiece_available,
+    is_tensorboard_available,
+    is_timm_available,
+    is_torch_available,
+    is_torch_npu_available,
+    is_torch_version,
+    is_torch_xla_available,
+    is_torchsde_available,
+    is_torchvision_available,
+    is_transformers_available,
+    is_transformers_version,
+    is_unidecode_available,
+    is_wandb_available,
+    is_xformers_available,
+    requires_backends,
+)
+from .loading_utils import load_image, load_video
+from .logging import get_logger
+from .outputs import BaseOutput
+from .peft_utils import (
+    check_peft_version,
+    delete_adapter_layers,
+    get_adapter_name,
+    get_peft_kwargs,
+    recurse_remove_peft_layers,
+    scale_lora_layers,
+    set_adapter_layers,
+    set_weights_and_activate_adapters,
+    unscale_lora_layers,
+)
+from .pil_utils import PIL_INTERPOLATION, make_image_grid, numpy_to_pil, pt_to_pil
+from .state_dict_utils import (
+    convert_all_state_dict_to_peft,
+    convert_state_dict_to_diffusers,
+    convert_state_dict_to_kohya,
+    convert_state_dict_to_peft,
+    convert_unet_state_dict_to_peft,
+)
+
+
+logger = get_logger(__name__)
+
+
+def check_min_version(min_version):
+    if version.parse(__version__) < version.parse(min_version):
+        if "dev" in min_version:
+            error_message = (
+                "This example requires a source install from HuggingFace diffusers (see "
+                "`https://huggingface.co/docs/diffusers/installation#install-from-source`),"
+            )
+        else:
+            error_message = f"This example requires a minimum version of {min_version},"
+        error_message += f" but the version found is {__version__}.\n"
+        raise ImportError(error_message)
diff --git a/diffusers/src/diffusers/utils/__pycache__/__init__.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6118f85db99af542ebad4cbe0a20266f59a811af
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/__init__.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/accelerate_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/accelerate_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d86c8aa63478c37bbdad34277bf2b0c320ee4213
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/accelerate_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/constants.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/constants.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..603ed1b2d7e2311188316c3cb760fd713377a40c
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/constants.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/deprecation_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/deprecation_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d39f8af725c5218238162e5e53698ab6cf4245a9
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/deprecation_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/doc_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/doc_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d857f8dc7ab40d395409ebd5fb759dc0f90e215
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/doc_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/dummy_flax_and_transformers_objects.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/dummy_flax_and_transformers_objects.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1d0b6ef70b2b9db47aa852b40361a5460fe1a2c2
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/dummy_flax_and_transformers_objects.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/dummy_flax_objects.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/dummy_flax_objects.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f48887522906422d556b9a5a9bf91a41323003d3
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/dummy_flax_objects.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/dummy_note_seq_objects.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/dummy_note_seq_objects.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4597c64a1631e4c272c1ffd3be406ee9dad81adb
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/dummy_note_seq_objects.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/dummy_onnx_objects.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/dummy_onnx_objects.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d405cd3cc1b7048cae696a5892ce952435bad17
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/dummy_onnx_objects.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/dummy_torch_and_torchsde_objects.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/dummy_torch_and_torchsde_objects.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1d54112e58557fcf1727d981c7a217d6127a15d4
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/dummy_torch_and_torchsde_objects.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/dummy_torch_and_transformers_and_k_diffusion_objects.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/dummy_torch_and_transformers_and_k_diffusion_objects.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20d8fde7dac5cc00cdce31a079cbb97c4458189a
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/dummy_torch_and_transformers_and_k_diffusion_objects.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/dummy_torch_and_transformers_and_onnx_objects.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/dummy_torch_and_transformers_and_onnx_objects.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7de1064a038dde378c2d5f0bea21e732a27d6f57
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/dummy_torch_and_transformers_and_onnx_objects.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/dummy_transformers_and_torch_and_note_seq_objects.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/dummy_transformers_and_torch_and_note_seq_objects.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a349efa0a82035d619ee3ba84cff76ffd37e70c0
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/dummy_transformers_and_torch_and_note_seq_objects.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/dynamic_modules_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/dynamic_modules_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..290a7d4467da2b924c50693650a186716782e0f9
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/dynamic_modules_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/export_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/export_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0dcb2201a8830552658b225a098248801fbc1c0d
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/export_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/hub_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/hub_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90e0059a99936ec89626dfee3ac88007f6652480
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/hub_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/import_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/import_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5b1f1aff5e0ccb8ac7c3bd8110635efa78f788f
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/import_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/loading_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/loading_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..787665fe871fc189e9f2f77e00fa6964c9b7413d
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/loading_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/logging.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/logging.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23f7292b256351cf79cf3a2ca74bd7e2ce397c06
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/logging.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/outputs.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/outputs.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3168a04a1c7a0a8a707eb8a3e173a7f15defd16
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/outputs.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/peft_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/peft_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..765fcfa8650fa101ac256283c725359038652afd
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/peft_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/pil_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/pil_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e10f507f584c2283fc54699f34b3cc1c0c39cd3
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/pil_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/state_dict_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/state_dict_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c629fff7b850225eba1903b4ba249ade11658bbf
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/state_dict_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/torch_utils.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/torch_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c3b574689067f57f66b3fd2abcb3e50dc3f3216
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/torch_utils.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/__pycache__/versions.cpython-311.pyc b/diffusers/src/diffusers/utils/__pycache__/versions.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4506be5e8b10e2f4c5adb779138e6a3305760e60
Binary files /dev/null and b/diffusers/src/diffusers/utils/__pycache__/versions.cpython-311.pyc differ
diff --git a/diffusers/src/diffusers/utils/accelerate_utils.py b/diffusers/src/diffusers/utils/accelerate_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..99a8b3a47c256cc80dbd5de7d14101dcc7a9ff9a
--- /dev/null
+++ b/diffusers/src/diffusers/utils/accelerate_utils.py
@@ -0,0 +1,48 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Accelerate utilities: Utilities related to accelerate
+"""
+
+from packaging import version
+
+from .import_utils import is_accelerate_available
+
+
+if is_accelerate_available():
+    import accelerate
+
+
+def apply_forward_hook(method):
+    """
+    Decorator that applies a registered CpuOffload hook to an arbitrary function rather than `forward`. This is useful
+    for cases where a PyTorch module provides functions other than `forward` that should trigger a move to the
+    appropriate acceleration device. This is the case for `encode` and `decode` in [`AutoencoderKL`].
+
+    This decorator looks inside the internal `_hf_hook` property to find a registered offload hook.
+
+    :param method: The method to decorate. This method should be a method of a PyTorch module.
+    """
+    if not is_accelerate_available():
+        return method
+    accelerate_version = version.parse(accelerate.__version__).base_version
+    if version.parse(accelerate_version) < version.parse("0.17.0"):
+        return method
+
+    def wrapper(self, *args, **kwargs):
+        if hasattr(self, "_hf_hook") and hasattr(self._hf_hook, "pre_forward"):
+            self._hf_hook.pre_forward(self)
+        return method(self, *args, **kwargs)
+
+    return wrapper
diff --git a/diffusers/src/diffusers/utils/constants.py b/diffusers/src/diffusers/utils/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..553ac5d1bb2794cc480e08790ad3577664c66ce3
--- /dev/null
+++ b/diffusers/src/diffusers/utils/constants.py
@@ -0,0 +1,57 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+import os
+
+from huggingface_hub.constants import HF_HOME
+from packaging import version
+
+from ..dependency_versions_check import dep_version_check
+from .import_utils import ENV_VARS_TRUE_VALUES, is_peft_available, is_transformers_available
+
+
+MIN_PEFT_VERSION = "0.6.0"
+MIN_TRANSFORMERS_VERSION = "4.34.0"
+_CHECK_PEFT = os.environ.get("_CHECK_PEFT", "1") in ENV_VARS_TRUE_VALUES
+
+
+CONFIG_NAME = "config.json"
+WEIGHTS_NAME = "diffusion_pytorch_model.bin"
+WEIGHTS_INDEX_NAME = "diffusion_pytorch_model.bin.index.json"
+FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack"
+ONNX_WEIGHTS_NAME = "model.onnx"
+SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors"
+SAFE_WEIGHTS_INDEX_NAME = "diffusion_pytorch_model.safetensors.index.json"
+SAFETENSORS_FILE_EXTENSION = "safetensors"
+ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb"
+HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
+DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules"
+HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(HF_HOME, "modules"))
+DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"]
+
+# Below should be `True` if the current version of `peft` and `transformers` are compatible with
+# PEFT backend. Will automatically fall back to PEFT backend if the correct versions of the libraries are
+# available.
+# For PEFT it is has to be greater than or equal to 0.6.0 and for transformers it has to be greater than or equal to 4.34.0.
+_required_peft_version = is_peft_available() and version.parse(
+    version.parse(importlib.metadata.version("peft")).base_version
+) >= version.parse(MIN_PEFT_VERSION)
+_required_transformers_version = is_transformers_available() and version.parse(
+    version.parse(importlib.metadata.version("transformers")).base_version
+) >= version.parse(MIN_TRANSFORMERS_VERSION)
+
+USE_PEFT_BACKEND = _required_peft_version and _required_transformers_version
+
+if USE_PEFT_BACKEND and _CHECK_PEFT:
+    dep_version_check("peft")
diff --git a/diffusers/src/diffusers/utils/deprecation_utils.py b/diffusers/src/diffusers/utils/deprecation_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f482deddd2f46b8d2e29d5229faa0e9a21f2fd98
--- /dev/null
+++ b/diffusers/src/diffusers/utils/deprecation_utils.py
@@ -0,0 +1,49 @@
+import inspect
+import warnings
+from typing import Any, Dict, Optional, Union
+
+from packaging import version
+
+
+def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn=True, stacklevel=2):
+    from .. import __version__
+
+    deprecated_kwargs = take_from
+    values = ()
+    if not isinstance(args[0], tuple):
+        args = (args,)
+
+    for attribute, version_name, message in args:
+        if version.parse(version.parse(__version__).base_version) >= version.parse(version_name):
+            raise ValueError(
+                f"The deprecation tuple {(attribute, version_name, message)} should be removed since diffusers'"
+                f" version {__version__} is >= {version_name}"
+            )
+
+        warning = None
+        if isinstance(deprecated_kwargs, dict) and attribute in deprecated_kwargs:
+            values += (deprecated_kwargs.pop(attribute),)
+            warning = f"The `{attribute}` argument is deprecated and will be removed in version {version_name}."
+        elif hasattr(deprecated_kwargs, attribute):
+            values += (getattr(deprecated_kwargs, attribute),)
+            warning = f"The `{attribute}` attribute is deprecated and will be removed in version {version_name}."
+        elif deprecated_kwargs is None:
+            warning = f"`{attribute}` is deprecated and will be removed in version {version_name}."
+
+        if warning is not None:
+            warning = warning + " " if standard_warn else ""
+            warnings.warn(warning + message, FutureWarning, stacklevel=stacklevel)
+
+    if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0:
+        call_frame = inspect.getouterframes(inspect.currentframe())[1]
+        filename = call_frame.filename
+        line_number = call_frame.lineno
+        function = call_frame.function
+        key, value = next(iter(deprecated_kwargs.items()))
+        raise TypeError(f"{function} in {filename} line {line_number-1} got an unexpected keyword argument `{key}`")
+
+    if len(values) == 0:
+        return
+    elif len(values) == 1:
+        return values[0]
+    return values
diff --git a/diffusers/src/diffusers/utils/doc_utils.py b/diffusers/src/diffusers/utils/doc_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe633e6836422f22618548b3f6d946d6b073650f
--- /dev/null
+++ b/diffusers/src/diffusers/utils/doc_utils.py
@@ -0,0 +1,39 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Doc utilities: Utilities related to documentation
+"""
+
+import re
+
+
+def replace_example_docstring(example_docstring):
+    def docstring_decorator(fn):
+        func_doc = fn.__doc__
+        lines = func_doc.split("\n")
+        i = 0
+        while i < len(lines) and re.search(r"^\s*Examples?:\s*$", lines[i]) is None:
+            i += 1
+        if i < len(lines):
+            lines[i] = example_docstring
+            func_doc = "\n".join(lines)
+        else:
+            raise ValueError(
+                f"The function {fn} should have an empty 'Examples:' in its docstring as placeholder, "
+                f"current docstring is:\n{func_doc}"
+            )
+        fn.__doc__ = func_doc
+        return fn
+
+    return docstring_decorator
diff --git a/diffusers/src/diffusers/utils/dummy_flax_and_transformers_objects.py b/diffusers/src/diffusers/utils/dummy_flax_and_transformers_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e65e5349bb0a6a0bac62cddf0ce0fad64237c68
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_flax_and_transformers_objects.py
@@ -0,0 +1,77 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class FlaxStableDiffusionControlNetPipeline(metaclass=DummyObject):
+    _backends = ["flax", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax", "transformers"])
+
+
+class FlaxStableDiffusionImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["flax", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax", "transformers"])
+
+
+class FlaxStableDiffusionInpaintPipeline(metaclass=DummyObject):
+    _backends = ["flax", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax", "transformers"])
+
+
+class FlaxStableDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["flax", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax", "transformers"])
+
+
+class FlaxStableDiffusionXLPipeline(metaclass=DummyObject):
+    _backends = ["flax", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax", "transformers"])
diff --git a/diffusers/src/diffusers/utils/dummy_flax_objects.py b/diffusers/src/diffusers/utils/dummy_flax_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fa8dbc819316e96f7483addba43f90b9d8f397b
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_flax_objects.py
@@ -0,0 +1,212 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class FlaxControlNetModel(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxModelMixin(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxUNet2DConditionModel(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxAutoencoderKL(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxDDIMScheduler(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxDDPMScheduler(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxDPMSolverMultistepScheduler(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxEulerDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxKarrasVeScheduler(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxLMSDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxPNDMScheduler(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxSchedulerMixin(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxScoreSdeVeScheduler(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
diff --git a/diffusers/src/diffusers/utils/dummy_note_seq_objects.py b/diffusers/src/diffusers/utils/dummy_note_seq_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..c02d0b015aedc37c01fb3b843bc79547aae5da68
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_note_seq_objects.py
@@ -0,0 +1,17 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class MidiProcessor(metaclass=DummyObject):
+    _backends = ["note_seq"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["note_seq"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["note_seq"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["note_seq"])
diff --git a/diffusers/src/diffusers/utils/dummy_onnx_objects.py b/diffusers/src/diffusers/utils/dummy_onnx_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..bde5f6ad0793e2d81bc638600b46ff81748d09ee
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_onnx_objects.py
@@ -0,0 +1,17 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class OnnxRuntimeModel(metaclass=DummyObject):
+    _backends = ["onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["onnx"])
diff --git a/diffusers/src/diffusers/utils/dummy_pt_objects.py b/diffusers/src/diffusers/utils/dummy_pt_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ab946ce725737a087eaf936f6b3d8bce55d8a56
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_pt_objects.py
@@ -0,0 +1,1545 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class AsymmetricAutoencoderKL(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AuraFlowTransformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AutoencoderKL(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AutoencoderKLCogVideoX(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AutoencoderKLTemporalDecoder(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AutoencoderOobleck(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AutoencoderTiny(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class CogVideoXTransformer3DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class ConsistencyDecoderVAE(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class ControlNetModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class ControlNetXSAdapter(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DiTTransformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class FluxControlNetModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class FluxMultiControlNetModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class FluxTransformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class HunyuanDiT2DControlNetModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class HunyuanDiT2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class HunyuanDiT2DMultiControlNetModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class I2VGenXLUNet(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class Kandinsky3UNet(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class LatteTransformer3DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class LuminaNextDiT2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class ModelMixin(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class MotionAdapter(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class MultiAdapter(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class PixArtTransformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class PriorTransformer(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class SD3ControlNetModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class SD3MultiControlNetModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class SD3Transformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class SparseControlNetModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class StableAudioDiTModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class T2IAdapter(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class T5FilmDecoder(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class Transformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class UNet1DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class UNet2DConditionModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class UNet2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class UNet3DConditionModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class UNetControlNetXSModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class UNetMotionModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class UNetSpatioTemporalConditionModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class UVit2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class VQModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+def get_constant_schedule(*args, **kwargs):
+    requires_backends(get_constant_schedule, ["torch"])
+
+
+def get_constant_schedule_with_warmup(*args, **kwargs):
+    requires_backends(get_constant_schedule_with_warmup, ["torch"])
+
+
+def get_cosine_schedule_with_warmup(*args, **kwargs):
+    requires_backends(get_cosine_schedule_with_warmup, ["torch"])
+
+
+def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs):
+    requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["torch"])
+
+
+def get_linear_schedule_with_warmup(*args, **kwargs):
+    requires_backends(get_linear_schedule_with_warmup, ["torch"])
+
+
+def get_polynomial_decay_schedule_with_warmup(*args, **kwargs):
+    requires_backends(get_polynomial_decay_schedule_with_warmup, ["torch"])
+
+
+def get_scheduler(*args, **kwargs):
+    requires_backends(get_scheduler, ["torch"])
+
+
+class AudioPipelineOutput(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AutoPipelineForImage2Image(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AutoPipelineForInpainting(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AutoPipelineForText2Image(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class BlipDiffusionControlNetPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class BlipDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class CLIPImageProjection(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class ConsistencyModelPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DanceDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DDIMPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DDPMPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DiTPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class ImagePipelineOutput(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class KarrasVePipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class LDMPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class LDMSuperResolutionPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class PNDMPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class RePaintPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class ScoreSdeVePipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class StableDiffusionMixin(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AmusedScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class CMStochasticIterativeScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class CogVideoXDDIMScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class CogVideoXDPMScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DDIMInverseScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DDIMParallelScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DDIMScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DDPMParallelScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DDPMScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DDPMWuerstchenScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DEISMultistepScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DPMSolverMultistepInverseScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DPMSolverMultistepScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class DPMSolverSinglestepScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class EDMDPMSolverMultistepScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class EDMEulerScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class EulerAncestralDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class EulerDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class FlowMatchEulerDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class FlowMatchHeunDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class HeunDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class IPNDMScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class KarrasVeScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class KDPM2AncestralDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class KDPM2DiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class LCMScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class PNDMScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class RePaintScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class SASolverScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class SchedulerMixin(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class ScoreSdeVeScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class TCDScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class UnCLIPScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class UniPCMultistepScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class VQDiffusionScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class EMAModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
diff --git a/diffusers/src/diffusers/utils/dummy_torch_and_librosa_objects.py b/diffusers/src/diffusers/utils/dummy_torch_and_librosa_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..2088bc4a744198284f22fe54e6f1055cf3568566
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_torch_and_librosa_objects.py
@@ -0,0 +1,32 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class AudioDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "librosa"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "librosa"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "librosa"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "librosa"])
+
+
+class Mel(metaclass=DummyObject):
+    _backends = ["torch", "librosa"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "librosa"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "librosa"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "librosa"])
diff --git a/diffusers/src/diffusers/utils/dummy_torch_and_scipy_objects.py b/diffusers/src/diffusers/utils/dummy_torch_and_scipy_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1ff25863822b04971d2c6dfdc17f5b28774cf05
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_torch_and_scipy_objects.py
@@ -0,0 +1,17 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class LMSDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch", "scipy"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "scipy"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "scipy"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "scipy"])
diff --git a/diffusers/src/diffusers/utils/dummy_torch_and_torchsde_objects.py b/diffusers/src/diffusers/utils/dummy_torch_and_torchsde_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ff14231b9cc8c88effc4d11f7b439fa16bb7f20
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_torch_and_torchsde_objects.py
@@ -0,0 +1,32 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class CosineDPMSolverMultistepScheduler(metaclass=DummyObject):
+    _backends = ["torch", "torchsde"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "torchsde"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "torchsde"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "torchsde"])
+
+
+class DPMSolverSDEScheduler(metaclass=DummyObject):
+    _backends = ["torch", "torchsde"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "torchsde"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "torchsde"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "torchsde"])
diff --git a/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py b/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ab00c54ce2d7ca78f674085f0fe294d1f05c1e0
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py
@@ -0,0 +1,32 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class StableDiffusionKDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "k_diffusion"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "k_diffusion"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "k_diffusion"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "k_diffusion"])
+
+
+class StableDiffusionXLKDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "k_diffusion"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "k_diffusion"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "k_diffusion"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "k_diffusion"])
diff --git a/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py b/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7afad8226b87292100270e3e7daad6885be0e7f
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
@@ -0,0 +1,92 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class OnnxStableDiffusionImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
+class OnnxStableDiffusionInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
+class OnnxStableDiffusionInpaintPipelineLegacy(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
+class OnnxStableDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
+class OnnxStableDiffusionUpscalePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
+class StableDiffusionOnnxPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
diff --git a/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py b/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..349db10d1fa5b8505173e1578d3c83ea17e95fbc
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py
@@ -0,0 +1,47 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class KolorsImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "sentencepiece"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "sentencepiece"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "sentencepiece"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "sentencepiece"])
+
+
+class KolorsPAGPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "sentencepiece"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "sentencepiece"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "sentencepiece"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "sentencepiece"])
+
+
+class KolorsPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "sentencepiece"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "sentencepiece"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "sentencepiece"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "sentencepiece"])
diff --git a/diffusers/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/diffusers/src/diffusers/utils/dummy_torch_and_transformers_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f22501ce7eccd8483e320ae3546306a0b313d4d
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -0,0 +1,2207 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class AltDiffusionImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AltDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AmusedImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AmusedInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AmusedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AnimateDiffControlNetPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AnimateDiffPAGPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AnimateDiffPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AnimateDiffSDXLPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AnimateDiffSparseControlNetPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AnimateDiffVideoToVideoControlNetPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AnimateDiffVideoToVideoPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AudioLDM2Pipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AudioLDM2ProjectionModel(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AudioLDM2UNet2DConditionModel(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AudioLDMPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AuraFlowPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class CLIPImageProjection(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class CogVideoXImageToVideoPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class CogVideoXPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class CogVideoXVideoToVideoPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class CycleDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class FluxControlNetImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class FluxControlNetInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class FluxControlNetPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class FluxImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class FluxInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class FluxPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class HunyuanDiTControlNetPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class HunyuanDiTPAGPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class HunyuanDiTPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class I2VGenXLPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class IFImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class IFImg2ImgSuperResolutionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class IFInpaintingPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class IFInpaintingSuperResolutionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class IFPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class IFSuperResolutionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class ImageTextPipelineOutput(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class Kandinsky3Img2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class Kandinsky3Pipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyCombinedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyImg2ImgCombinedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyInpaintCombinedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyPriorPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyV22CombinedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyV22ControlnetImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyV22ControlnetPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyV22Img2ImgCombinedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyV22Img2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyV22InpaintCombinedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyV22InpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyV22Pipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyV22PriorEmb2EmbPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class KandinskyV22PriorPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class LatentConsistencyModelImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class LatentConsistencyModelPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class LattePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class LDMTextToImagePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class LEditsPPPipelineStableDiffusion(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class LEditsPPPipelineStableDiffusionXL(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class LuminaText2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class MarigoldDepthPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class MarigoldNormalsPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class MusicLDMPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class PaintByExamplePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class PIAPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class PixArtAlphaPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class PixArtSigmaPAGPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class PixArtSigmaPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class SemanticStableDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class ShapEImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class ShapEPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableAudioPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableAudioProjectionModel(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableCascadeCombinedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableCascadeDecoderPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableCascadePriorPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusion3ControlNetPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusion3Img2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusion3InpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusion3PAGPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusion3Pipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionAdapterPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionAttendAndExcitePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionControlNetImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionControlNetInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionControlNetPAGPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionControlNetPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionControlNetXSPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionDepth2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionDiffEditPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionGLIGENPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionGLIGENTextImagePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionImageVariationPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionInpaintPipelineLegacy(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionInstructPix2PixPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionLatentUpscalePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionLDM3DPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionModelEditingPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionPAGPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionPanoramaPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionParadigmsPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionPipelineSafe(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionPix2PixZeroPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionSAGPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionUpscalePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLAdapterPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLControlNetImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLControlNetInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLControlNetPAGImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLControlNetPAGPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLControlNetPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLControlNetXSPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLInstructPix2PixPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLPAGImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLPAGInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLPAGPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionXLPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableUnCLIPImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableUnCLIPPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class StableVideoDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class TextToVideoSDPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class TextToVideoZeroPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class TextToVideoZeroSDXLPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class UnCLIPImageVariationPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class UnCLIPPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class UniDiffuserModel(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class UniDiffuserPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class UniDiffuserTextDecoder(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class VersatileDiffusionDualGuidedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class VersatileDiffusionImageVariationPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class VersatileDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class VersatileDiffusionTextToImagePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class VideoToVideoSDPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class VQDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class WuerstchenCombinedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class WuerstchenDecoderPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class WuerstchenPriorPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
diff --git a/diffusers/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py b/diffusers/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbde04e33f0abd86d12f3dee048a4f0585c9f19d
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py
@@ -0,0 +1,17 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class SpectrogramDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["transformers", "torch", "note_seq"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["transformers", "torch", "note_seq"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["transformers", "torch", "note_seq"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["transformers", "torch", "note_seq"])
diff --git a/diffusers/src/diffusers/utils/dynamic_modules_utils.py b/diffusers/src/diffusers/utils/dynamic_modules_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0cf953924ad8e6d93f96a318e7ccca038a36999
--- /dev/null
+++ b/diffusers/src/diffusers/utils/dynamic_modules_utils.py
@@ -0,0 +1,457 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities to dynamically load objects from the Hub."""
+
+import importlib
+import inspect
+import json
+import os
+import re
+import shutil
+import sys
+from pathlib import Path
+from typing import Dict, Optional, Union
+from urllib import request
+
+from huggingface_hub import hf_hub_download, model_info
+from huggingface_hub.utils import RevisionNotFoundError, validate_hf_hub_args
+from packaging import version
+
+from .. import __version__
+from . import DIFFUSERS_DYNAMIC_MODULE_NAME, HF_MODULES_CACHE, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# See https://huggingface.co/datasets/diffusers/community-pipelines-mirror
+COMMUNITY_PIPELINES_MIRROR_ID = "diffusers/community-pipelines-mirror"
+
+
+def get_diffusers_versions():
+    url = "https://pypi.org/pypi/diffusers/json"
+    releases = json.loads(request.urlopen(url).read())["releases"].keys()
+    return sorted(releases, key=lambda x: version.Version(x))
+
+
+def init_hf_modules():
+    """
+    Creates the cache directory for modules with an init, and adds it to the Python path.
+    """
+    # This function has already been executed if HF_MODULES_CACHE already is in the Python path.
+    if HF_MODULES_CACHE in sys.path:
+        return
+
+    sys.path.append(HF_MODULES_CACHE)
+    os.makedirs(HF_MODULES_CACHE, exist_ok=True)
+    init_path = Path(HF_MODULES_CACHE) / "__init__.py"
+    if not init_path.exists():
+        init_path.touch()
+
+
+def create_dynamic_module(name: Union[str, os.PathLike]):
+    """
+    Creates a dynamic module in the cache directory for modules.
+    """
+    init_hf_modules()
+    dynamic_module_path = Path(HF_MODULES_CACHE) / name
+    # If the parent module does not exist yet, recursively create it.
+    if not dynamic_module_path.parent.exists():
+        create_dynamic_module(dynamic_module_path.parent)
+    os.makedirs(dynamic_module_path, exist_ok=True)
+    init_path = dynamic_module_path / "__init__.py"
+    if not init_path.exists():
+        init_path.touch()
+
+
+def get_relative_imports(module_file):
+    """
+    Get the list of modules that are relatively imported in a module file.
+
+    Args:
+        module_file (`str` or `os.PathLike`): The module file to inspect.
+    """
+    with open(module_file, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Imports of the form `import .xxx`
+    relative_imports = re.findall(r"^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE)
+    # Imports of the form `from .xxx import yyy`
+    relative_imports += re.findall(r"^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE)
+    # Unique-ify
+    return list(set(relative_imports))
+
+
+def get_relative_import_files(module_file):
+    """
+    Get the list of all files that are needed for a given module. Note that this function recurses through the relative
+    imports (if a imports b and b imports c, it will return module files for b and c).
+
+    Args:
+        module_file (`str` or `os.PathLike`): The module file to inspect.
+    """
+    no_change = False
+    files_to_check = [module_file]
+    all_relative_imports = []
+
+    # Let's recurse through all relative imports
+    while not no_change:
+        new_imports = []
+        for f in files_to_check:
+            new_imports.extend(get_relative_imports(f))
+
+        module_path = Path(module_file).parent
+        new_import_files = [str(module_path / m) for m in new_imports]
+        new_import_files = [f for f in new_import_files if f not in all_relative_imports]
+        files_to_check = [f"{f}.py" for f in new_import_files]
+
+        no_change = len(new_import_files) == 0
+        all_relative_imports.extend(files_to_check)
+
+    return all_relative_imports
+
+
+def check_imports(filename):
+    """
+    Check if the current Python environment contains all the libraries that are imported in a file.
+    """
+    with open(filename, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Imports of the form `import xxx`
+    imports = re.findall(r"^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE)
+    # Imports of the form `from xxx import yyy`
+    imports += re.findall(r"^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE)
+    # Only keep the top-level module
+    imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")]
+
+    # Unique-ify and test we got them all
+    imports = list(set(imports))
+    missing_packages = []
+    for imp in imports:
+        try:
+            importlib.import_module(imp)
+        except ImportError:
+            missing_packages.append(imp)
+
+    if len(missing_packages) > 0:
+        raise ImportError(
+            "This modeling file requires the following packages that were not found in your environment: "
+            f"{', '.join(missing_packages)}. Run `pip install {' '.join(missing_packages)}`"
+        )
+
+    return get_relative_imports(filename)
+
+
+def get_class_in_module(class_name, module_path):
+    """
+    Import a module on the cache directory for modules and extract a class from it.
+    """
+    module_path = module_path.replace(os.path.sep, ".")
+    module = importlib.import_module(module_path)
+
+    if class_name is None:
+        return find_pipeline_class(module)
+    return getattr(module, class_name)
+
+
+def find_pipeline_class(loaded_module):
+    """
+    Retrieve pipeline class that inherits from `DiffusionPipeline`. Note that there has to be exactly one class
+    inheriting from `DiffusionPipeline`.
+    """
+    from ..pipelines import DiffusionPipeline
+
+    cls_members = dict(inspect.getmembers(loaded_module, inspect.isclass))
+
+    pipeline_class = None
+    for cls_name, cls in cls_members.items():
+        if (
+            cls_name != DiffusionPipeline.__name__
+            and issubclass(cls, DiffusionPipeline)
+            and cls.__module__.split(".")[0] != "diffusers"
+        ):
+            if pipeline_class is not None:
+                raise ValueError(
+                    f"Multiple classes that inherit from {DiffusionPipeline.__name__} have been found:"
+                    f" {pipeline_class.__name__}, and {cls_name}. Please make sure to define only one in"
+                    f" {loaded_module}."
+                )
+            pipeline_class = cls
+
+    return pipeline_class
+
+
+@validate_hf_hub_args
+def get_cached_module_file(
+    pretrained_model_name_or_path: Union[str, os.PathLike],
+    module_file: str,
+    cache_dir: Optional[Union[str, os.PathLike]] = None,
+    force_download: bool = False,
+    proxies: Optional[Dict[str, str]] = None,
+    token: Optional[Union[bool, str]] = None,
+    revision: Optional[str] = None,
+    local_files_only: bool = False,
+):
+    """
+    Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached
+    Transformers module.
+
+    Args:
+        pretrained_model_name_or_path (`str` or `os.PathLike`):
+            This can be either:
+
+            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
+              under a user or organization name, like `dbmdz/bert-base-german-cased`.
+            - a path to a *directory* containing a configuration file saved using the
+              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+
+        module_file (`str`):
+            The name of the module file containing the class to look for.
+        cache_dir (`str` or `os.PathLike`, *optional*):
+            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
+            cache should not be used.
+        force_download (`bool`, *optional*, defaults to `False`):
+            Whether or not to force to (re-)download the configuration files and override the cached versions if they
+            exist.
+        proxies (`Dict[str, str]`, *optional*):
+            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+        token (`str` or *bool*, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+            when running `transformers-cli login` (stored in `~/.huggingface`).
+        revision (`str`, *optional*, defaults to `"main"`):
+            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+            identifier allowed by git.
+        local_files_only (`bool`, *optional*, defaults to `False`):
+            If `True`, will only try to load the tokenizer configuration from local files.
+
+    <Tip>
+
+    You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private or
+    [gated models](https://huggingface.co/docs/hub/models-gated#gated-models).
+
+    </Tip>
+
+    Returns:
+        `str`: The path to the module inside the cache.
+    """
+    # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file.
+    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+
+    module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file)
+
+    if os.path.isfile(module_file_or_url):
+        resolved_module_file = module_file_or_url
+        submodule = "local"
+    elif pretrained_model_name_or_path.count("/") == 0:
+        available_versions = get_diffusers_versions()
+        # cut ".dev0"
+        latest_version = "v" + ".".join(__version__.split(".")[:3])
+
+        # retrieve github version that matches
+        if revision is None:
+            revision = latest_version if latest_version[1:] in available_versions else "main"
+            logger.info(f"Defaulting to latest_version: {revision}.")
+        elif revision in available_versions:
+            revision = f"v{revision}"
+        elif revision == "main":
+            revision = revision
+        else:
+            raise ValueError(
+                f"`custom_revision`: {revision} does not exist. Please make sure to choose one of"
+                f" {', '.join(available_versions + ['main'])}."
+            )
+
+        try:
+            resolved_module_file = hf_hub_download(
+                repo_id=COMMUNITY_PIPELINES_MIRROR_ID,
+                repo_type="dataset",
+                filename=f"{revision}/{pretrained_model_name_or_path}.py",
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+            )
+            submodule = "git"
+            module_file = pretrained_model_name_or_path + ".py"
+        except RevisionNotFoundError as e:
+            raise EnvironmentError(
+                f"Revision '{revision}' not found in the community pipelines mirror. Check available revisions on"
+                " https://huggingface.co/datasets/diffusers/community-pipelines-mirror/tree/main."
+                " If you don't find the revision you are looking for, please open an issue on https://github.com/huggingface/diffusers/issues."
+            ) from e
+        except EnvironmentError:
+            logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
+            raise
+    else:
+        try:
+            # Load from URL or cache if already cached
+            resolved_module_file = hf_hub_download(
+                pretrained_model_name_or_path,
+                module_file,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+            )
+            submodule = os.path.join("local", "--".join(pretrained_model_name_or_path.split("/")))
+        except EnvironmentError:
+            logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
+            raise
+
+    # Check we have all the requirements in our environment
+    modules_needed = check_imports(resolved_module_file)
+
+    # Now we move the module inside our cached dynamic modules.
+    full_submodule = DIFFUSERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule
+    create_dynamic_module(full_submodule)
+    submodule_path = Path(HF_MODULES_CACHE) / full_submodule
+    if submodule == "local" or submodule == "git":
+        # We always copy local files (we could hash the file to see if there was a change, and give them the name of
+        # that hash, to only copy when there is a modification but it seems overkill for now).
+        # The only reason we do the copy is to avoid putting too many folders in sys.path.
+        shutil.copy(resolved_module_file, submodule_path / module_file)
+        for module_needed in modules_needed:
+            if len(module_needed.split(".")) == 2:
+                module_needed = "/".join(module_needed.split("."))
+                module_folder = module_needed.split("/")[0]
+                if not os.path.exists(submodule_path / module_folder):
+                    os.makedirs(submodule_path / module_folder)
+            module_needed = f"{module_needed}.py"
+            shutil.copy(os.path.join(pretrained_model_name_or_path, module_needed), submodule_path / module_needed)
+    else:
+        # Get the commit hash
+        # TODO: we will get this info in the etag soon, so retrieve it from there and not here.
+        commit_hash = model_info(pretrained_model_name_or_path, revision=revision, token=token).sha
+
+        # The module file will end up being placed in a subfolder with the git hash of the repo. This way we get the
+        # benefit of versioning.
+        submodule_path = submodule_path / commit_hash
+        full_submodule = full_submodule + os.path.sep + commit_hash
+        create_dynamic_module(full_submodule)
+
+        if not (submodule_path / module_file).exists():
+            if len(module_file.split("/")) == 2:
+                module_folder = module_file.split("/")[0]
+                if not os.path.exists(submodule_path / module_folder):
+                    os.makedirs(submodule_path / module_folder)
+            shutil.copy(resolved_module_file, submodule_path / module_file)
+
+        # Make sure we also have every file with relative
+        for module_needed in modules_needed:
+            if len(module_needed.split(".")) == 2:
+                module_needed = "/".join(module_needed.split("."))
+            if not (submodule_path / module_needed).exists():
+                get_cached_module_file(
+                    pretrained_model_name_or_path,
+                    f"{module_needed}.py",
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    token=token,
+                    revision=revision,
+                    local_files_only=local_files_only,
+                )
+    return os.path.join(full_submodule, module_file)
+
+
+@validate_hf_hub_args
+def get_class_from_dynamic_module(
+    pretrained_model_name_or_path: Union[str, os.PathLike],
+    module_file: str,
+    class_name: Optional[str] = None,
+    cache_dir: Optional[Union[str, os.PathLike]] = None,
+    force_download: bool = False,
+    proxies: Optional[Dict[str, str]] = None,
+    token: Optional[Union[bool, str]] = None,
+    revision: Optional[str] = None,
+    local_files_only: bool = False,
+    **kwargs,
+):
+    """
+    Extracts a class from a module file, present in the local folder or repository of a model.
+
+    <Tip warning={true}>
+
+    Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should
+    therefore only be called on trusted repos.
+
+    </Tip>
+
+    Args:
+        pretrained_model_name_or_path (`str` or `os.PathLike`):
+            This can be either:
+
+            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
+              under a user or organization name, like `dbmdz/bert-base-german-cased`.
+            - a path to a *directory* containing a configuration file saved using the
+              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+
+        module_file (`str`):
+            The name of the module file containing the class to look for.
+        class_name (`str`):
+            The name of the class to import in the module.
+        cache_dir (`str` or `os.PathLike`, *optional*):
+            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
+            cache should not be used.
+        force_download (`bool`, *optional*, defaults to `False`):
+            Whether or not to force to (re-)download the configuration files and override the cached versions if they
+            exist.
+        proxies (`Dict[str, str]`, *optional*):
+            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+        token (`str` or `bool`, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+            when running `transformers-cli login` (stored in `~/.huggingface`).
+        revision (`str`, *optional*, defaults to `"main"`):
+            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+            identifier allowed by git.
+        local_files_only (`bool`, *optional*, defaults to `False`):
+            If `True`, will only try to load the tokenizer configuration from local files.
+
+    <Tip>
+
+    You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private or
+    [gated models](https://huggingface.co/docs/hub/models-gated#gated-models).
+
+    </Tip>
+
+    Returns:
+        `type`: The class, dynamically imported from the module.
+
+    Examples:
+
+    ```python
+    # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
+    # module.
+    cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel")
+    ```"""
+    # And lastly we get the class inside our newly created module
+    final_module = get_cached_module_file(
+        pretrained_model_name_or_path,
+        module_file,
+        cache_dir=cache_dir,
+        force_download=force_download,
+        proxies=proxies,
+        token=token,
+        revision=revision,
+        local_files_only=local_files_only,
+    )
+    return get_class_in_module(class_name, final_module.replace(".py", ""))
diff --git a/diffusers/src/diffusers/utils/export_utils.py b/diffusers/src/diffusers/utils/export_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..00805433cebaa4a3898877da549f947a323b3ac7
--- /dev/null
+++ b/diffusers/src/diffusers/utils/export_utils.py
@@ -0,0 +1,184 @@
+import io
+import random
+import struct
+import tempfile
+from contextlib import contextmanager
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+import PIL.ImageOps
+
+from .import_utils import BACKENDS_MAPPING, is_imageio_available, is_opencv_available
+from .logging import get_logger
+
+
+global_rng = random.Random()
+
+logger = get_logger(__name__)
+
+
+@contextmanager
+def buffered_writer(raw_f):
+    f = io.BufferedWriter(raw_f)
+    yield f
+    f.flush()
+
+
+def export_to_gif(image: List[PIL.Image.Image], output_gif_path: str = None, fps: int = 10) -> str:
+    if output_gif_path is None:
+        output_gif_path = tempfile.NamedTemporaryFile(suffix=".gif").name
+
+    image[0].save(
+        output_gif_path,
+        save_all=True,
+        append_images=image[1:],
+        optimize=False,
+        duration=1000 // fps,
+        loop=0,
+    )
+    return output_gif_path
+
+
+def export_to_ply(mesh, output_ply_path: str = None):
+    """
+    Write a PLY file for a mesh.
+    """
+    if output_ply_path is None:
+        output_ply_path = tempfile.NamedTemporaryFile(suffix=".ply").name
+
+    coords = mesh.verts.detach().cpu().numpy()
+    faces = mesh.faces.cpu().numpy()
+    rgb = np.stack([mesh.vertex_channels[x].detach().cpu().numpy() for x in "RGB"], axis=1)
+
+    with buffered_writer(open(output_ply_path, "wb")) as f:
+        f.write(b"ply\n")
+        f.write(b"format binary_little_endian 1.0\n")
+        f.write(bytes(f"element vertex {len(coords)}\n", "ascii"))
+        f.write(b"property float x\n")
+        f.write(b"property float y\n")
+        f.write(b"property float z\n")
+        if rgb is not None:
+            f.write(b"property uchar red\n")
+            f.write(b"property uchar green\n")
+            f.write(b"property uchar blue\n")
+        if faces is not None:
+            f.write(bytes(f"element face {len(faces)}\n", "ascii"))
+            f.write(b"property list uchar int vertex_index\n")
+        f.write(b"end_header\n")
+
+        if rgb is not None:
+            rgb = (rgb * 255.499).round().astype(int)
+            vertices = [
+                (*coord, *rgb)
+                for coord, rgb in zip(
+                    coords.tolist(),
+                    rgb.tolist(),
+                )
+            ]
+            format = struct.Struct("<3f3B")
+            for item in vertices:
+                f.write(format.pack(*item))
+        else:
+            format = struct.Struct("<3f")
+            for vertex in coords.tolist():
+                f.write(format.pack(*vertex))
+
+        if faces is not None:
+            format = struct.Struct("<B3I")
+            for tri in faces.tolist():
+                f.write(format.pack(len(tri), *tri))
+
+    return output_ply_path
+
+
+def export_to_obj(mesh, output_obj_path: str = None):
+    if output_obj_path is None:
+        output_obj_path = tempfile.NamedTemporaryFile(suffix=".obj").name
+
+    verts = mesh.verts.detach().cpu().numpy()
+    faces = mesh.faces.cpu().numpy()
+
+    vertex_colors = np.stack([mesh.vertex_channels[x].detach().cpu().numpy() for x in "RGB"], axis=1)
+    vertices = [
+        "{} {} {} {} {} {}".format(*coord, *color) for coord, color in zip(verts.tolist(), vertex_colors.tolist())
+    ]
+
+    faces = ["f {} {} {}".format(str(tri[0] + 1), str(tri[1] + 1), str(tri[2] + 1)) for tri in faces.tolist()]
+
+    combined_data = ["v " + vertex for vertex in vertices] + faces
+
+    with open(output_obj_path, "w") as f:
+        f.writelines("\n".join(combined_data))
+
+
+def _legacy_export_to_video(
+    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]], output_video_path: str = None, fps: int = 10
+):
+    if is_opencv_available():
+        import cv2
+    else:
+        raise ImportError(BACKENDS_MAPPING["opencv"][1].format("export_to_video"))
+    if output_video_path is None:
+        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
+
+    if isinstance(video_frames[0], np.ndarray):
+        video_frames = [(frame * 255).astype(np.uint8) for frame in video_frames]
+
+    elif isinstance(video_frames[0], PIL.Image.Image):
+        video_frames = [np.array(frame) for frame in video_frames]
+
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    h, w, c = video_frames[0].shape
+    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=fps, frameSize=(w, h))
+    for i in range(len(video_frames)):
+        img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR)
+        video_writer.write(img)
+
+    return output_video_path
+
+
+def export_to_video(
+    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]], output_video_path: str = None, fps: int = 10
+) -> str:
+    # TODO: Dhruv. Remove by Diffusers release 0.33.0
+    # Added to prevent breaking existing code
+    if not is_imageio_available():
+        logger.warning(
+            (
+                "It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. \n"
+                "These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. \n"
+                "Support for the OpenCV backend will be deprecated in a future Diffusers version"
+            )
+        )
+        return _legacy_export_to_video(video_frames, output_video_path, fps)
+
+    if is_imageio_available():
+        import imageio
+    else:
+        raise ImportError(BACKENDS_MAPPING["imageio"][1].format("export_to_video"))
+
+    try:
+        imageio.plugins.ffmpeg.get_exe()
+    except AttributeError:
+        raise AttributeError(
+            (
+                "Found an existing imageio backend in your environment. Attempting to export video with imageio. \n"
+                "Unable to find a compatible ffmpeg installation in your environment to use with imageio. Please install via `pip install imageio-ffmpeg"
+            )
+        )
+
+    if output_video_path is None:
+        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
+
+    if isinstance(video_frames[0], np.ndarray):
+        video_frames = [(frame * 255).astype(np.uint8) for frame in video_frames]
+
+    elif isinstance(video_frames[0], PIL.Image.Image):
+        video_frames = [np.array(frame) for frame in video_frames]
+
+    with imageio.get_writer(output_video_path, fps=fps) as writer:
+        for frame in video_frames:
+            writer.append_data(frame)
+
+    return output_video_path
diff --git a/diffusers/src/diffusers/utils/hub_utils.py b/diffusers/src/diffusers/utils/hub_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cdc02e873285f6cc5d86232ec640b2217c16fbf
--- /dev/null
+++ b/diffusers/src/diffusers/utils/hub_utils.py
@@ -0,0 +1,603 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import os
+import re
+import sys
+import tempfile
+import traceback
+import warnings
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+from uuid import uuid4
+
+from huggingface_hub import (
+    ModelCard,
+    ModelCardData,
+    create_repo,
+    hf_hub_download,
+    model_info,
+    snapshot_download,
+    upload_folder,
+)
+from huggingface_hub.constants import HF_HUB_CACHE, HF_HUB_DISABLE_TELEMETRY, HF_HUB_OFFLINE
+from huggingface_hub.file_download import REGEX_COMMIT_HASH
+from huggingface_hub.utils import (
+    EntryNotFoundError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+    is_jinja_available,
+    validate_hf_hub_args,
+)
+from packaging import version
+from requests import HTTPError
+
+from .. import __version__
+from .constants import (
+    DEPRECATED_REVISION_ARGS,
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    SAFETENSORS_WEIGHTS_NAME,
+    WEIGHTS_NAME,
+)
+from .import_utils import (
+    ENV_VARS_TRUE_VALUES,
+    _flax_version,
+    _jax_version,
+    _onnxruntime_version,
+    _torch_version,
+    is_flax_available,
+    is_onnx_available,
+    is_torch_available,
+)
+from .logging import get_logger
+
+
+logger = get_logger(__name__)
+
+MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "model_card_template.md"
+SESSION_ID = uuid4().hex
+
+
+def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
+    """
+    Formats a user-agent string with basic info about a request.
+    """
+    ua = f"diffusers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}"
+    if HF_HUB_DISABLE_TELEMETRY or HF_HUB_OFFLINE:
+        return ua + "; telemetry/off"
+    if is_torch_available():
+        ua += f"; torch/{_torch_version}"
+    if is_flax_available():
+        ua += f"; jax/{_jax_version}"
+        ua += f"; flax/{_flax_version}"
+    if is_onnx_available():
+        ua += f"; onnxruntime/{_onnxruntime_version}"
+    # CI will set this value to True
+    if os.environ.get("DIFFUSERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES:
+        ua += "; is_ci/true"
+    if isinstance(user_agent, dict):
+        ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items())
+    elif isinstance(user_agent, str):
+        ua += "; " + user_agent
+    return ua
+
+
+def load_or_create_model_card(
+    repo_id_or_path: str = None,
+    token: Optional[str] = None,
+    is_pipeline: bool = False,
+    from_training: bool = False,
+    model_description: Optional[str] = None,
+    base_model: str = None,
+    prompt: Optional[str] = None,
+    license: Optional[str] = None,
+    widget: Optional[List[dict]] = None,
+    inference: Optional[bool] = None,
+) -> ModelCard:
+    """
+    Loads or creates a model card.
+
+    Args:
+        repo_id_or_path (`str`):
+            The repo id (e.g., "runwayml/stable-diffusion-v1-5") or local path where to look for the model card.
+        token (`str`, *optional*):
+            Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more
+            details.
+        is_pipeline (`bool`):
+            Boolean to indicate if we're adding tag to a [`DiffusionPipeline`].
+        from_training: (`bool`): Boolean flag to denote if the model card is being created from a training script.
+        model_description (`str`, *optional*): Model description to add to the model card. Helpful when using
+            `load_or_create_model_card` from a training script.
+        base_model (`str`): Base model identifier (e.g., "stabilityai/stable-diffusion-xl-base-1.0"). Useful
+            for DreamBooth-like training.
+        prompt (`str`, *optional*): Prompt used for training. Useful for DreamBooth-like training.
+        license: (`str`, *optional*): License of the output artifact. Helpful when using
+            `load_or_create_model_card` from a training script.
+        widget (`List[dict]`, *optional*): Widget to accompany a gallery template.
+        inference: (`bool`, optional): Whether to turn on inference widget. Helpful when using
+            `load_or_create_model_card` from a training script.
+    """
+    if not is_jinja_available():
+        raise ValueError(
+            "Modelcard rendering is based on Jinja templates."
+            " Please make sure to have `jinja` installed before using `load_or_create_model_card`."
+            " To install it, please run `pip install Jinja2`."
+        )
+
+    try:
+        # Check if the model card is present on the remote repo
+        model_card = ModelCard.load(repo_id_or_path, token=token)
+    except (EntryNotFoundError, RepositoryNotFoundError):
+        # Otherwise create a model card from template
+        if from_training:
+            model_card = ModelCard.from_template(
+                card_data=ModelCardData(  # Card metadata object that will be converted to YAML block
+                    license=license,
+                    library_name="diffusers",
+                    inference=inference,
+                    base_model=base_model,
+                    instance_prompt=prompt,
+                    widget=widget,
+                ),
+                template_path=MODEL_CARD_TEMPLATE_PATH,
+                model_description=model_description,
+            )
+        else:
+            card_data = ModelCardData()
+            component = "pipeline" if is_pipeline else "model"
+            if model_description is None:
+                model_description = f"This is the model card of a 🧨 diffusers {component} that has been pushed on the Hub. This model card has been automatically generated."
+            model_card = ModelCard.from_template(card_data, model_description=model_description)
+
+    return model_card
+
+
+def populate_model_card(model_card: ModelCard, tags: Union[str, List[str]] = None) -> ModelCard:
+    """Populates the `model_card` with library name and optional tags."""
+    if model_card.data.library_name is None:
+        model_card.data.library_name = "diffusers"
+
+    if tags is not None:
+        if isinstance(tags, str):
+            tags = [tags]
+        if model_card.data.tags is None:
+            model_card.data.tags = []
+        for tag in tags:
+            model_card.data.tags.append(tag)
+
+    return model_card
+
+
+def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str] = None):
+    """
+    Extracts the commit hash from a resolved filename toward a cache file.
+    """
+    if resolved_file is None or commit_hash is not None:
+        return commit_hash
+    resolved_file = str(Path(resolved_file).as_posix())
+    search = re.search(r"snapshots/([^/]+)/", resolved_file)
+    if search is None:
+        return None
+    commit_hash = search.groups()[0]
+    return commit_hash if REGEX_COMMIT_HASH.match(commit_hash) else None
+
+
+# Old default cache path, potentially to be migrated.
+# This logic was more or less taken from `transformers`, with the following differences:
+# - Diffusers doesn't use custom environment variables to specify the cache path.
+# - There is no need to migrate the cache format, just move the files to the new location.
+hf_cache_home = os.path.expanduser(
+    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
+)
+old_diffusers_cache = os.path.join(hf_cache_home, "diffusers")
+
+
+def move_cache(old_cache_dir: Optional[str] = None, new_cache_dir: Optional[str] = None) -> None:
+    if new_cache_dir is None:
+        new_cache_dir = HF_HUB_CACHE
+    if old_cache_dir is None:
+        old_cache_dir = old_diffusers_cache
+
+    old_cache_dir = Path(old_cache_dir).expanduser()
+    new_cache_dir = Path(new_cache_dir).expanduser()
+    for old_blob_path in old_cache_dir.glob("**/blobs/*"):
+        if old_blob_path.is_file() and not old_blob_path.is_symlink():
+            new_blob_path = new_cache_dir / old_blob_path.relative_to(old_cache_dir)
+            new_blob_path.parent.mkdir(parents=True, exist_ok=True)
+            os.replace(old_blob_path, new_blob_path)
+            try:
+                os.symlink(new_blob_path, old_blob_path)
+            except OSError:
+                logger.warning(
+                    "Could not create symlink between old cache and new cache. If you use an older version of diffusers again, files will be re-downloaded."
+                )
+    # At this point, old_cache_dir contains symlinks to the new cache (it can still be used).
+
+
+cache_version_file = os.path.join(HF_HUB_CACHE, "version_diffusers_cache.txt")
+if not os.path.isfile(cache_version_file):
+    cache_version = 0
+else:
+    with open(cache_version_file) as f:
+        try:
+            cache_version = int(f.read())
+        except ValueError:
+            cache_version = 0
+
+if cache_version < 1:
+    old_cache_is_not_empty = os.path.isdir(old_diffusers_cache) and len(os.listdir(old_diffusers_cache)) > 0
+    if old_cache_is_not_empty:
+        logger.warning(
+            "The cache for model files in Diffusers v0.14.0 has moved to a new location. Moving your "
+            "existing cached models. This is a one-time operation, you can interrupt it or run it "
+            "later by calling `diffusers.utils.hub_utils.move_cache()`."
+        )
+        try:
+            move_cache()
+        except Exception as e:
+            trace = "\n".join(traceback.format_tb(e.__traceback__))
+            logger.error(
+                f"There was a problem when trying to move your cache:\n\n{trace}\n{e.__class__.__name__}: {e}\n\nPlease "
+                "file an issue at https://github.com/huggingface/diffusers/issues/new/choose, copy paste this whole "
+                "message and we will do our best to help."
+            )
+
+if cache_version < 1:
+    try:
+        os.makedirs(HF_HUB_CACHE, exist_ok=True)
+        with open(cache_version_file, "w") as f:
+            f.write("1")
+    except Exception:
+        logger.warning(
+            f"There was a problem when trying to write in your cache folder ({HF_HUB_CACHE}). Please, ensure "
+            "the directory exists and can be written to."
+        )
+
+
+def _add_variant(weights_name: str, variant: Optional[str] = None) -> str:
+    if variant is not None:
+        splits = weights_name.split(".")
+        split_index = -2 if weights_name.endswith(".index.json") else -1
+        splits = splits[:-split_index] + [variant] + splits[-split_index:]
+        weights_name = ".".join(splits)
+
+    return weights_name
+
+
+@validate_hf_hub_args
+def _get_model_file(
+    pretrained_model_name_or_path: Union[str, Path],
+    *,
+    weights_name: str,
+    subfolder: Optional[str] = None,
+    cache_dir: Optional[str] = None,
+    force_download: bool = False,
+    proxies: Optional[Dict] = None,
+    local_files_only: bool = False,
+    token: Optional[str] = None,
+    user_agent: Optional[Union[Dict, str]] = None,
+    revision: Optional[str] = None,
+    commit_hash: Optional[str] = None,
+):
+    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+    if os.path.isfile(pretrained_model_name_or_path):
+        return pretrained_model_name_or_path
+    elif os.path.isdir(pretrained_model_name_or_path):
+        if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)):
+            # Load from a PyTorch checkpoint
+            model_file = os.path.join(pretrained_model_name_or_path, weights_name)
+            return model_file
+        elif subfolder is not None and os.path.isfile(
+            os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
+        ):
+            model_file = os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
+            return model_file
+        else:
+            raise EnvironmentError(
+                f"Error no file named {weights_name} found in directory {pretrained_model_name_or_path}."
+            )
+    else:
+        # 1. First check if deprecated way of loading from branches is used
+        if (
+            revision in DEPRECATED_REVISION_ARGS
+            and (weights_name == WEIGHTS_NAME or weights_name == SAFETENSORS_WEIGHTS_NAME)
+            and version.parse(version.parse(__version__).base_version) >= version.parse("0.22.0")
+        ):
+            try:
+                model_file = hf_hub_download(
+                    pretrained_model_name_or_path,
+                    filename=_add_variant(weights_name, revision),
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    subfolder=subfolder,
+                    revision=revision or commit_hash,
+                )
+                warnings.warn(
+                    f"Loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` is deprecated. Loading instead from `revision='main'` with `variant={revision}`. Loading model variants via `revision='{revision}'` will be removed in diffusers v1. Please use `variant='{revision}'` instead.",
+                    FutureWarning,
+                )
+                return model_file
+            except:  # noqa: E722
+                warnings.warn(
+                    f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have a {_add_variant(weights_name, revision)} file in the 'main' branch of {pretrained_model_name_or_path}. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {_add_variant(weights_name, revision)}' so that the correct variant file can be added.",
+                    FutureWarning,
+                )
+        try:
+            # 2. Load model file as usual
+            model_file = hf_hub_download(
+                pretrained_model_name_or_path,
+                filename=weights_name,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                user_agent=user_agent,
+                subfolder=subfolder,
+                revision=revision or commit_hash,
+            )
+            return model_file
+
+        except RepositoryNotFoundError as e:
+            raise EnvironmentError(
+                f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
+                "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
+                "token having permission to this repo with `token` or log in with `huggingface-cli "
+                "login`."
+            ) from e
+        except RevisionNotFoundError as e:
+            raise EnvironmentError(
+                f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
+                "this model name. Check the model page at "
+                f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+            ) from e
+        except EntryNotFoundError as e:
+            raise EnvironmentError(
+                f"{pretrained_model_name_or_path} does not appear to have a file named {weights_name}."
+            ) from e
+        except HTTPError as e:
+            raise EnvironmentError(
+                f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{e}"
+            ) from e
+        except ValueError as e:
+            raise EnvironmentError(
+                f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
+                f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
+                f" directory containing a file named {weights_name} or"
+                " \nCheckout your internet connection or see how to run the library in"
+                " offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
+            ) from e
+        except EnvironmentError as e:
+            raise EnvironmentError(
+                f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+                "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+                f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+                f"containing a file named {weights_name}"
+            ) from e
+
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/1360801a69c0b169e3efdbb0cd05d9a0e72bfb70/src/transformers/utils/hub.py#L976
+# Differences are in parallelization of shard downloads and checking if shards are present.
+
+
+def _check_if_shards_exist_locally(local_dir, subfolder, original_shard_filenames):
+    shards_path = os.path.join(local_dir, subfolder)
+    shard_filenames = [os.path.join(shards_path, f) for f in original_shard_filenames]
+    for shard_file in shard_filenames:
+        if not os.path.exists(shard_file):
+            raise ValueError(
+                f"{shards_path} does not appear to have a file named {shard_file} which is "
+                "required according to the checkpoint index."
+            )
+
+
+def _get_checkpoint_shard_files(
+    pretrained_model_name_or_path,
+    index_filename,
+    cache_dir=None,
+    proxies=None,
+    local_files_only=False,
+    token=None,
+    user_agent=None,
+    revision=None,
+    subfolder="",
+):
+    """
+    For a given model:
+
+    - download and cache all the shards of a sharded checkpoint if `pretrained_model_name_or_path` is a model ID on the
+      Hub
+    - returns the list of paths to all the shards, as well as some metadata.
+
+    For the description of each arg, see [`PreTrainedModel.from_pretrained`]. `index_filename` is the full path to the
+    index (downloaded and cached if `pretrained_model_name_or_path` is a model ID on the Hub).
+    """
+    if not os.path.isfile(index_filename):
+        raise ValueError(f"Can't find a checkpoint index ({index_filename}) in {pretrained_model_name_or_path}.")
+
+    with open(index_filename, "r") as f:
+        index = json.loads(f.read())
+
+    original_shard_filenames = sorted(set(index["weight_map"].values()))
+    sharded_metadata = index["metadata"]
+    sharded_metadata["all_checkpoint_keys"] = list(index["weight_map"].keys())
+    sharded_metadata["weight_map"] = index["weight_map"].copy()
+    shards_path = os.path.join(pretrained_model_name_or_path, subfolder)
+
+    # First, let's deal with local folder.
+    if os.path.isdir(pretrained_model_name_or_path):
+        _check_if_shards_exist_locally(
+            pretrained_model_name_or_path, subfolder=subfolder, original_shard_filenames=original_shard_filenames
+        )
+        return shards_path, sharded_metadata
+
+    # At this stage pretrained_model_name_or_path is a model identifier on the Hub
+    allow_patterns = original_shard_filenames
+    if subfolder is not None:
+        allow_patterns = [os.path.join(subfolder, p) for p in allow_patterns]
+
+    ignore_patterns = ["*.json", "*.md"]
+    if not local_files_only:
+        # `model_info` call must guarded with the above condition.
+        model_files_info = model_info(pretrained_model_name_or_path, revision=revision)
+        for shard_file in original_shard_filenames:
+            shard_file_present = any(shard_file in k.rfilename for k in model_files_info.siblings)
+            if not shard_file_present:
+                raise EnvironmentError(
+                    f"{shards_path} does not appear to have a file named {shard_file} which is "
+                    "required according to the checkpoint index."
+                )
+
+        try:
+            # Load from URL
+            cached_folder = snapshot_download(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                allow_patterns=allow_patterns,
+                ignore_patterns=ignore_patterns,
+                user_agent=user_agent,
+            )
+            if subfolder is not None:
+                cached_folder = os.path.join(cached_folder, subfolder)
+
+        # We have already dealt with RepositoryNotFoundError and RevisionNotFoundError when getting the index, so
+        # we don't have to catch them here. We have also dealt with EntryNotFoundError.
+        except HTTPError as e:
+            raise EnvironmentError(
+                f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load {pretrained_model_name_or_path}. You should try"
+                " again after checking your internet connection."
+            ) from e
+
+    # If `local_files_only=True`, `cached_folder` may not contain all the shard files.
+    elif local_files_only:
+        _check_if_shards_exist_locally(
+            local_dir=cache_dir, subfolder=subfolder, original_shard_filenames=original_shard_filenames
+        )
+        if subfolder is not None:
+            cached_folder = os.path.join(cached_folder, subfolder)
+
+    return cached_folder, sharded_metadata
+
+
+class PushToHubMixin:
+    """
+    A Mixin to push a model, scheduler, or pipeline to the Hugging Face Hub.
+    """
+
+    def _upload_folder(
+        self,
+        working_dir: Union[str, os.PathLike],
+        repo_id: str,
+        token: Optional[str] = None,
+        commit_message: Optional[str] = None,
+        create_pr: bool = False,
+    ):
+        """
+        Uploads all files in `working_dir` to `repo_id`.
+        """
+        if commit_message is None:
+            if "Model" in self.__class__.__name__:
+                commit_message = "Upload model"
+            elif "Scheduler" in self.__class__.__name__:
+                commit_message = "Upload scheduler"
+            else:
+                commit_message = f"Upload {self.__class__.__name__}"
+
+        logger.info(f"Uploading the files of {working_dir} to {repo_id}.")
+        return upload_folder(
+            repo_id=repo_id, folder_path=working_dir, token=token, commit_message=commit_message, create_pr=create_pr
+        )
+
+    def push_to_hub(
+        self,
+        repo_id: str,
+        commit_message: Optional[str] = None,
+        private: Optional[bool] = None,
+        token: Optional[str] = None,
+        create_pr: bool = False,
+        safe_serialization: bool = True,
+        variant: Optional[str] = None,
+    ) -> str:
+        """
+        Upload model, scheduler, or pipeline files to the 🤗 Hugging Face Hub.
+
+        Parameters:
+            repo_id (`str`):
+                The name of the repository you want to push your model, scheduler, or pipeline files to. It should
+                contain your organization name when pushing to an organization. `repo_id` can also be a path to a local
+                directory.
+            commit_message (`str`, *optional*):
+                Message to commit while pushing. Default to `"Upload {object}"`.
+            private (`bool`, *optional*):
+                Whether or not the repository created should be private.
+            token (`str`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. The token generated when running
+                `huggingface-cli login` (stored in `~/.huggingface`).
+            create_pr (`bool`, *optional*, defaults to `False`):
+                Whether or not to create a PR with the uploaded files or directly commit.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether or not to convert the model weights to the `safetensors` format.
+            variant (`str`, *optional*):
+                If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
+
+        Examples:
+
+        ```python
+        from diffusers import UNet2DConditionModel
+
+        unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="unet")
+
+        # Push the `unet` to your namespace with the name "my-finetuned-unet".
+        unet.push_to_hub("my-finetuned-unet")
+
+        # Push the `unet` to an organization with the name "my-finetuned-unet".
+        unet.push_to_hub("your-org/my-finetuned-unet")
+        ```
+        """
+        repo_id = create_repo(repo_id, private=private, token=token, exist_ok=True).repo_id
+
+        # Create a new empty model card and eventually tag it
+        model_card = load_or_create_model_card(repo_id, token=token)
+        model_card = populate_model_card(model_card)
+
+        # Save all files.
+        save_kwargs = {"safe_serialization": safe_serialization}
+        if "Scheduler" not in self.__class__.__name__:
+            save_kwargs.update({"variant": variant})
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            self.save_pretrained(tmpdir, **save_kwargs)
+
+            # Update model card if needed:
+            model_card.save(os.path.join(tmpdir, "README.md"))
+
+            return self._upload_folder(
+                tmpdir,
+                repo_id,
+                token=token,
+                commit_message=commit_message,
+                create_pr=create_pr,
+            )
diff --git a/diffusers/src/diffusers/utils/import_utils.py b/diffusers/src/diffusers/utils/import_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..34cc5fcc8605ffe5d5e33995f0952e263552ebbb
--- /dev/null
+++ b/diffusers/src/diffusers/utils/import_utils.py
@@ -0,0 +1,838 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Import utilities: Utilities related to imports and our lazy inits.
+"""
+
+import importlib.util
+import operator as op
+import os
+import sys
+from collections import OrderedDict
+from itertools import chain
+from types import ModuleType
+from typing import Any, Union
+
+from huggingface_hub.utils import is_jinja_available  # noqa: F401
+from packaging import version
+from packaging.version import Version, parse
+
+from . import logging
+
+
+# The package importlib_metadata is in a different place, depending on the python version.
+if sys.version_info < (3, 8):
+    import importlib_metadata
+else:
+    import importlib.metadata as importlib_metadata
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
+ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
+
+USE_TF = os.environ.get("USE_TF", "AUTO").upper()
+USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
+USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper()
+USE_SAFETENSORS = os.environ.get("USE_SAFETENSORS", "AUTO").upper()
+DIFFUSERS_SLOW_IMPORT = os.environ.get("DIFFUSERS_SLOW_IMPORT", "FALSE").upper()
+DIFFUSERS_SLOW_IMPORT = DIFFUSERS_SLOW_IMPORT in ENV_VARS_TRUE_VALUES
+
+STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
+
+_torch_version = "N/A"
+if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
+    _torch_available = importlib.util.find_spec("torch") is not None
+    if _torch_available:
+        try:
+            _torch_version = importlib_metadata.version("torch")
+            logger.info(f"PyTorch version {_torch_version} available.")
+        except importlib_metadata.PackageNotFoundError:
+            _torch_available = False
+else:
+    logger.info("Disabling PyTorch because USE_TORCH is set")
+    _torch_available = False
+
+_torch_xla_available = importlib.util.find_spec("torch_xla") is not None
+if _torch_xla_available:
+    try:
+        _torch_xla_version = importlib_metadata.version("torch_xla")
+        logger.info(f"PyTorch XLA version {_torch_xla_version} available.")
+    except ImportError:
+        _torch_xla_available = False
+
+# check whether torch_npu is available
+_torch_npu_available = importlib.util.find_spec("torch_npu") is not None
+if _torch_npu_available:
+    try:
+        _torch_npu_version = importlib_metadata.version("torch_npu")
+        logger.info(f"torch_npu version {_torch_npu_version} available.")
+    except ImportError:
+        _torch_npu_available = False
+
+_jax_version = "N/A"
+_flax_version = "N/A"
+if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
+    _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
+    if _flax_available:
+        try:
+            _jax_version = importlib_metadata.version("jax")
+            _flax_version = importlib_metadata.version("flax")
+            logger.info(f"JAX version {_jax_version}, Flax version {_flax_version} available.")
+        except importlib_metadata.PackageNotFoundError:
+            _flax_available = False
+else:
+    _flax_available = False
+
+if USE_SAFETENSORS in ENV_VARS_TRUE_AND_AUTO_VALUES:
+    _safetensors_available = importlib.util.find_spec("safetensors") is not None
+    if _safetensors_available:
+        try:
+            _safetensors_version = importlib_metadata.version("safetensors")
+            logger.info(f"Safetensors version {_safetensors_version} available.")
+        except importlib_metadata.PackageNotFoundError:
+            _safetensors_available = False
+else:
+    logger.info("Disabling Safetensors because USE_TF is set")
+    _safetensors_available = False
+
+_transformers_available = importlib.util.find_spec("transformers") is not None
+try:
+    _transformers_version = importlib_metadata.version("transformers")
+    logger.debug(f"Successfully imported transformers version {_transformers_version}")
+except importlib_metadata.PackageNotFoundError:
+    _transformers_available = False
+
+
+_inflect_available = importlib.util.find_spec("inflect") is not None
+try:
+    _inflect_version = importlib_metadata.version("inflect")
+    logger.debug(f"Successfully imported inflect version {_inflect_version}")
+except importlib_metadata.PackageNotFoundError:
+    _inflect_available = False
+
+
+_unidecode_available = importlib.util.find_spec("unidecode") is not None
+try:
+    _unidecode_version = importlib_metadata.version("unidecode")
+    logger.debug(f"Successfully imported unidecode version {_unidecode_version}")
+except importlib_metadata.PackageNotFoundError:
+    _unidecode_available = False
+
+_onnxruntime_version = "N/A"
+_onnx_available = importlib.util.find_spec("onnxruntime") is not None
+if _onnx_available:
+    candidates = (
+        "onnxruntime",
+        "onnxruntime-gpu",
+        "ort_nightly_gpu",
+        "onnxruntime-directml",
+        "onnxruntime-openvino",
+        "ort_nightly_directml",
+        "onnxruntime-rocm",
+        "onnxruntime-training",
+    )
+    _onnxruntime_version = None
+    # For the metadata, we have to look for both onnxruntime and onnxruntime-gpu
+    for pkg in candidates:
+        try:
+            _onnxruntime_version = importlib_metadata.version(pkg)
+            break
+        except importlib_metadata.PackageNotFoundError:
+            pass
+    _onnx_available = _onnxruntime_version is not None
+    if _onnx_available:
+        logger.debug(f"Successfully imported onnxruntime version {_onnxruntime_version}")
+
+# (sayakpaul): importlib.util.find_spec("opencv-python") returns None even when it's installed.
+# _opencv_available = importlib.util.find_spec("opencv-python") is not None
+try:
+    candidates = (
+        "opencv-python",
+        "opencv-contrib-python",
+        "opencv-python-headless",
+        "opencv-contrib-python-headless",
+    )
+    _opencv_version = None
+    for pkg in candidates:
+        try:
+            _opencv_version = importlib_metadata.version(pkg)
+            break
+        except importlib_metadata.PackageNotFoundError:
+            pass
+    _opencv_available = _opencv_version is not None
+    if _opencv_available:
+        logger.debug(f"Successfully imported cv2 version {_opencv_version}")
+except importlib_metadata.PackageNotFoundError:
+    _opencv_available = False
+
+_scipy_available = importlib.util.find_spec("scipy") is not None
+try:
+    _scipy_version = importlib_metadata.version("scipy")
+    logger.debug(f"Successfully imported scipy version {_scipy_version}")
+except importlib_metadata.PackageNotFoundError:
+    _scipy_available = False
+
+_librosa_available = importlib.util.find_spec("librosa") is not None
+try:
+    _librosa_version = importlib_metadata.version("librosa")
+    logger.debug(f"Successfully imported librosa version {_librosa_version}")
+except importlib_metadata.PackageNotFoundError:
+    _librosa_available = False
+
+_accelerate_available = importlib.util.find_spec("accelerate") is not None
+try:
+    _accelerate_version = importlib_metadata.version("accelerate")
+    logger.debug(f"Successfully imported accelerate version {_accelerate_version}")
+except importlib_metadata.PackageNotFoundError:
+    _accelerate_available = False
+
+_xformers_available = importlib.util.find_spec("xformers") is not None
+try:
+    _xformers_version = importlib_metadata.version("xformers")
+    if _torch_available:
+        _torch_version = importlib_metadata.version("torch")
+        if version.Version(_torch_version) < version.Version("1.12"):
+            raise ValueError("xformers is installed in your environment and requires PyTorch >= 1.12")
+
+    logger.debug(f"Successfully imported xformers version {_xformers_version}")
+except importlib_metadata.PackageNotFoundError:
+    _xformers_available = False
+
+_k_diffusion_available = importlib.util.find_spec("k_diffusion") is not None
+try:
+    _k_diffusion_version = importlib_metadata.version("k_diffusion")
+    logger.debug(f"Successfully imported k-diffusion version {_k_diffusion_version}")
+except importlib_metadata.PackageNotFoundError:
+    _k_diffusion_available = False
+
+_note_seq_available = importlib.util.find_spec("note_seq") is not None
+try:
+    _note_seq_version = importlib_metadata.version("note_seq")
+    logger.debug(f"Successfully imported note-seq version {_note_seq_version}")
+except importlib_metadata.PackageNotFoundError:
+    _note_seq_available = False
+
+_wandb_available = importlib.util.find_spec("wandb") is not None
+try:
+    _wandb_version = importlib_metadata.version("wandb")
+    logger.debug(f"Successfully imported wandb version {_wandb_version }")
+except importlib_metadata.PackageNotFoundError:
+    _wandb_available = False
+
+
+_tensorboard_available = importlib.util.find_spec("tensorboard")
+try:
+    _tensorboard_version = importlib_metadata.version("tensorboard")
+    logger.debug(f"Successfully imported tensorboard version {_tensorboard_version}")
+except importlib_metadata.PackageNotFoundError:
+    _tensorboard_available = False
+
+
+_compel_available = importlib.util.find_spec("compel")
+try:
+    _compel_version = importlib_metadata.version("compel")
+    logger.debug(f"Successfully imported compel version {_compel_version}")
+except importlib_metadata.PackageNotFoundError:
+    _compel_available = False
+
+
+_ftfy_available = importlib.util.find_spec("ftfy") is not None
+try:
+    _ftfy_version = importlib_metadata.version("ftfy")
+    logger.debug(f"Successfully imported ftfy version {_ftfy_version}")
+except importlib_metadata.PackageNotFoundError:
+    _ftfy_available = False
+
+
+_bs4_available = importlib.util.find_spec("bs4") is not None
+try:
+    # importlib metadata under different name
+    _bs4_version = importlib_metadata.version("beautifulsoup4")
+    logger.debug(f"Successfully imported ftfy version {_bs4_version}")
+except importlib_metadata.PackageNotFoundError:
+    _bs4_available = False
+
+_torchsde_available = importlib.util.find_spec("torchsde") is not None
+try:
+    _torchsde_version = importlib_metadata.version("torchsde")
+    logger.debug(f"Successfully imported torchsde version {_torchsde_version}")
+except importlib_metadata.PackageNotFoundError:
+    _torchsde_available = False
+
+_invisible_watermark_available = importlib.util.find_spec("imwatermark") is not None
+try:
+    _invisible_watermark_version = importlib_metadata.version("invisible-watermark")
+    logger.debug(f"Successfully imported invisible-watermark version {_invisible_watermark_version}")
+except importlib_metadata.PackageNotFoundError:
+    _invisible_watermark_available = False
+
+
+_peft_available = importlib.util.find_spec("peft") is not None
+try:
+    _peft_version = importlib_metadata.version("peft")
+    logger.debug(f"Successfully imported peft version {_peft_version}")
+except importlib_metadata.PackageNotFoundError:
+    _peft_available = False
+
+_torchvision_available = importlib.util.find_spec("torchvision") is not None
+try:
+    _torchvision_version = importlib_metadata.version("torchvision")
+    logger.debug(f"Successfully imported torchvision version {_torchvision_version}")
+except importlib_metadata.PackageNotFoundError:
+    _torchvision_available = False
+
+_sentencepiece_available = importlib.util.find_spec("sentencepiece") is not None
+try:
+    _sentencepiece_version = importlib_metadata.version("sentencepiece")
+    logger.info(f"Successfully imported sentencepiece version {_sentencepiece_version}")
+except importlib_metadata.PackageNotFoundError:
+    _sentencepiece_available = False
+
+_matplotlib_available = importlib.util.find_spec("matplotlib") is not None
+try:
+    _matplotlib_version = importlib_metadata.version("matplotlib")
+    logger.debug(f"Successfully imported matplotlib version {_matplotlib_version}")
+except importlib_metadata.PackageNotFoundError:
+    _matplotlib_available = False
+
+_timm_available = importlib.util.find_spec("timm") is not None
+if _timm_available:
+    try:
+        _timm_version = importlib_metadata.version("timm")
+        logger.info(f"Timm version {_timm_version} available.")
+    except importlib_metadata.PackageNotFoundError:
+        _timm_available = False
+
+
+def is_timm_available():
+    return _timm_available
+
+
+_bitsandbytes_available = importlib.util.find_spec("bitsandbytes") is not None
+try:
+    _bitsandbytes_version = importlib_metadata.version("bitsandbytes")
+    logger.debug(f"Successfully imported bitsandbytes version {_bitsandbytes_version}")
+except importlib_metadata.PackageNotFoundError:
+    _bitsandbytes_available = False
+
+_is_google_colab = "google.colab" in sys.modules or any(k.startswith("COLAB_") for k in os.environ)
+
+_imageio_available = importlib.util.find_spec("imageio") is not None
+if _imageio_available:
+    try:
+        _imageio_version = importlib_metadata.version("imageio")
+        logger.debug(f"Successfully imported imageio version {_imageio_version}")
+
+    except importlib_metadata.PackageNotFoundError:
+        _imageio_available = False
+
+
+def is_torch_available():
+    return _torch_available
+
+
+def is_torch_xla_available():
+    return _torch_xla_available
+
+
+def is_torch_npu_available():
+    return _torch_npu_available
+
+
+def is_flax_available():
+    return _flax_available
+
+
+def is_transformers_available():
+    return _transformers_available
+
+
+def is_inflect_available():
+    return _inflect_available
+
+
+def is_unidecode_available():
+    return _unidecode_available
+
+
+def is_onnx_available():
+    return _onnx_available
+
+
+def is_opencv_available():
+    return _opencv_available
+
+
+def is_scipy_available():
+    return _scipy_available
+
+
+def is_librosa_available():
+    return _librosa_available
+
+
+def is_xformers_available():
+    return _xformers_available
+
+
+def is_accelerate_available():
+    return _accelerate_available
+
+
+def is_k_diffusion_available():
+    return _k_diffusion_available
+
+
+def is_note_seq_available():
+    return _note_seq_available
+
+
+def is_wandb_available():
+    return _wandb_available
+
+
+def is_tensorboard_available():
+    return _tensorboard_available
+
+
+def is_compel_available():
+    return _compel_available
+
+
+def is_ftfy_available():
+    return _ftfy_available
+
+
+def is_bs4_available():
+    return _bs4_available
+
+
+def is_torchsde_available():
+    return _torchsde_available
+
+
+def is_invisible_watermark_available():
+    return _invisible_watermark_available
+
+
+def is_peft_available():
+    return _peft_available
+
+
+def is_torchvision_available():
+    return _torchvision_available
+
+
+def is_matplotlib_available():
+    return _matplotlib_available
+
+
+def is_safetensors_available():
+    return _safetensors_available
+
+
+def is_bitsandbytes_available():
+    return _bitsandbytes_available
+
+
+def is_google_colab():
+    return _is_google_colab
+
+
+def is_sentencepiece_available():
+    return _sentencepiece_available
+
+
+def is_imageio_available():
+    return _imageio_available
+
+
+# docstyle-ignore
+FLAX_IMPORT_ERROR = """
+{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
+installation page: https://github.com/google/flax and follow the ones that match your environment.
+"""
+
+# docstyle-ignore
+INFLECT_IMPORT_ERROR = """
+{0} requires the inflect library but it was not found in your environment. You can install it with pip: `pip install
+inflect`
+"""
+
+# docstyle-ignore
+PYTORCH_IMPORT_ERROR = """
+{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
+installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
+"""
+
+# docstyle-ignore
+ONNX_IMPORT_ERROR = """
+{0} requires the onnxruntime library but it was not found in your environment. You can install it with pip: `pip
+install onnxruntime`
+"""
+
+# docstyle-ignore
+OPENCV_IMPORT_ERROR = """
+{0} requires the OpenCV library but it was not found in your environment. You can install it with pip: `pip
+install opencv-python`
+"""
+
+# docstyle-ignore
+SCIPY_IMPORT_ERROR = """
+{0} requires the scipy library but it was not found in your environment. You can install it with pip: `pip install
+scipy`
+"""
+
+# docstyle-ignore
+LIBROSA_IMPORT_ERROR = """
+{0} requires the librosa library but it was not found in your environment.  Checkout the instructions on the
+installation page: https://librosa.org/doc/latest/install.html and follow the ones that match your environment.
+"""
+
+# docstyle-ignore
+TRANSFORMERS_IMPORT_ERROR = """
+{0} requires the transformers library but it was not found in your environment. You can install it with pip: `pip
+install transformers`
+"""
+
+# docstyle-ignore
+UNIDECODE_IMPORT_ERROR = """
+{0} requires the unidecode library but it was not found in your environment. You can install it with pip: `pip install
+Unidecode`
+"""
+
+# docstyle-ignore
+K_DIFFUSION_IMPORT_ERROR = """
+{0} requires the k-diffusion library but it was not found in your environment. You can install it with pip: `pip
+install k-diffusion`
+"""
+
+# docstyle-ignore
+NOTE_SEQ_IMPORT_ERROR = """
+{0} requires the note-seq library but it was not found in your environment. You can install it with pip: `pip
+install note-seq`
+"""
+
+# docstyle-ignore
+WANDB_IMPORT_ERROR = """
+{0} requires the wandb library but it was not found in your environment. You can install it with pip: `pip
+install wandb`
+"""
+
+# docstyle-ignore
+TENSORBOARD_IMPORT_ERROR = """
+{0} requires the tensorboard library but it was not found in your environment. You can install it with pip: `pip
+install tensorboard`
+"""
+
+
+# docstyle-ignore
+COMPEL_IMPORT_ERROR = """
+{0} requires the compel library but it was not found in your environment. You can install it with pip: `pip install compel`
+"""
+
+# docstyle-ignore
+BS4_IMPORT_ERROR = """
+{0} requires the Beautiful Soup library but it was not found in your environment. You can install it with pip:
+`pip install beautifulsoup4`. Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+FTFY_IMPORT_ERROR = """
+{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the
+installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones
+that match your environment. Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+TORCHSDE_IMPORT_ERROR = """
+{0} requires the torchsde library but it was not found in your environment. You can install it with pip: `pip install torchsde`
+"""
+
+# docstyle-ignore
+INVISIBLE_WATERMARK_IMPORT_ERROR = """
+{0} requires the invisible-watermark library but it was not found in your environment. You can install it with pip: `pip install invisible-watermark>=0.2.0`
+"""
+
+# docstyle-ignore
+PEFT_IMPORT_ERROR = """
+{0} requires the peft library but it was not found in your environment. You can install it with pip: `pip install peft`
+"""
+
+# docstyle-ignore
+SAFETENSORS_IMPORT_ERROR = """
+{0} requires the safetensors library but it was not found in your environment. You can install it with pip: `pip install safetensors`
+"""
+
+# docstyle-ignore
+SENTENCEPIECE_IMPORT_ERROR = """
+{0} requires the sentencepiece library but it was not found in your environment. You can install it with pip: `pip install sentencepiece`
+"""
+
+
+# docstyle-ignore
+BITSANDBYTES_IMPORT_ERROR = """
+{0} requires the bitsandbytes library but it was not found in your environment. You can install it with pip: `pip install bitsandbytes`
+"""
+
+# docstyle-ignore
+IMAGEIO_IMPORT_ERROR = """
+{0} requires the imageio library and ffmpeg but it was not found in your environment. You can install it with pip: `pip install imageio imageio-ffmpeg`
+"""
+
+BACKENDS_MAPPING = OrderedDict(
+    [
+        ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)),
+        ("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
+        ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)),
+        ("onnx", (is_onnx_available, ONNX_IMPORT_ERROR)),
+        ("opencv", (is_opencv_available, OPENCV_IMPORT_ERROR)),
+        ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
+        ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
+        ("transformers", (is_transformers_available, TRANSFORMERS_IMPORT_ERROR)),
+        ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)),
+        ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)),
+        ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)),
+        ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)),
+        ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)),
+        ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)),
+        ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)),
+        ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
+        ("torchsde", (is_torchsde_available, TORCHSDE_IMPORT_ERROR)),
+        ("invisible_watermark", (is_invisible_watermark_available, INVISIBLE_WATERMARK_IMPORT_ERROR)),
+        ("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
+        ("safetensors", (is_safetensors_available, SAFETENSORS_IMPORT_ERROR)),
+        ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
+        ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)),
+        ("imageio", (is_imageio_available, IMAGEIO_IMPORT_ERROR)),
+    ]
+)
+
+
+def requires_backends(obj, backends):
+    if not isinstance(backends, (list, tuple)):
+        backends = [backends]
+
+    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
+    checks = (BACKENDS_MAPPING[backend] for backend in backends)
+    failed = [msg.format(name) for available, msg in checks if not available()]
+    if failed:
+        raise ImportError("".join(failed))
+
+    if name in [
+        "VersatileDiffusionTextToImagePipeline",
+        "VersatileDiffusionPipeline",
+        "VersatileDiffusionDualGuidedPipeline",
+        "StableDiffusionImageVariationPipeline",
+        "UnCLIPPipeline",
+    ] and is_transformers_version("<", "4.25.0"):
+        raise ImportError(
+            f"You need to install `transformers>=4.25` in order to use {name}: \n```\n pip install"
+            " --upgrade transformers \n```"
+        )
+
+    if name in ["StableDiffusionDepth2ImgPipeline", "StableDiffusionPix2PixZeroPipeline"] and is_transformers_version(
+        "<", "4.26.0"
+    ):
+        raise ImportError(
+            f"You need to install `transformers>=4.26` in order to use {name}: \n```\n pip install"
+            " --upgrade transformers \n```"
+        )
+
+
+class DummyObject(type):
+    """
+    Metaclass for the dummy objects. Any class inheriting from it will return the ImportError generated by
+    `requires_backend` each time a user tries to access any method of that class.
+    """
+
+    def __getattr__(cls, key):
+        if key.startswith("_") and key not in ["_load_connected_pipes", "_is_onnx"]:
+            return super().__getattr__(cls, key)
+        requires_backends(cls, cls._backends)
+
+
+# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319
+def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
+    """
+    Args:
+    Compares a library version to some requirement using a given operation.
+        library_or_version (`str` or `packaging.version.Version`):
+            A library name or a version to check.
+        operation (`str`):
+            A string representation of an operator, such as `">"` or `"<="`.
+        requirement_version (`str`):
+            The version to compare the library version against
+    """
+    if operation not in STR_OPERATION_TO_FUNC.keys():
+        raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}")
+    operation = STR_OPERATION_TO_FUNC[operation]
+    if isinstance(library_or_version, str):
+        library_or_version = parse(importlib_metadata.version(library_or_version))
+    return operation(library_or_version, parse(requirement_version))
+
+
+# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L338
+def is_torch_version(operation: str, version: str):
+    """
+    Args:
+    Compares the current PyTorch version to a given reference with an operation.
+        operation (`str`):
+            A string representation of an operator, such as `">"` or `"<="`
+        version (`str`):
+            A string version of PyTorch
+    """
+    return compare_versions(parse(_torch_version), operation, version)
+
+
+def is_transformers_version(operation: str, version: str):
+    """
+    Args:
+    Compares the current Transformers version to a given reference with an operation.
+        operation (`str`):
+            A string representation of an operator, such as `">"` or `"<="`
+        version (`str`):
+            A version string
+    """
+    if not _transformers_available:
+        return False
+    return compare_versions(parse(_transformers_version), operation, version)
+
+
+def is_accelerate_version(operation: str, version: str):
+    """
+    Args:
+    Compares the current Accelerate version to a given reference with an operation.
+        operation (`str`):
+            A string representation of an operator, such as `">"` or `"<="`
+        version (`str`):
+            A version string
+    """
+    if not _accelerate_available:
+        return False
+    return compare_versions(parse(_accelerate_version), operation, version)
+
+
+def is_peft_version(operation: str, version: str):
+    """
+    Args:
+    Compares the current PEFT version to a given reference with an operation.
+        operation (`str`):
+            A string representation of an operator, such as `">"` or `"<="`
+        version (`str`):
+            A version string
+    """
+    if not _peft_version:
+        return False
+    return compare_versions(parse(_peft_version), operation, version)
+
+
+def is_k_diffusion_version(operation: str, version: str):
+    """
+    Args:
+    Compares the current k-diffusion version to a given reference with an operation.
+        operation (`str`):
+            A string representation of an operator, such as `">"` or `"<="`
+        version (`str`):
+            A version string
+    """
+    if not _k_diffusion_available:
+        return False
+    return compare_versions(parse(_k_diffusion_version), operation, version)
+
+
+def get_objects_from_module(module):
+    """
+    Args:
+    Returns a dict of object names and values in a module, while skipping private/internal objects
+        module (ModuleType):
+            Module to extract the objects from.
+
+    Returns:
+        dict: Dictionary of object names and corresponding values
+    """
+
+    objects = {}
+    for name in dir(module):
+        if name.startswith("_"):
+            continue
+        objects[name] = getattr(module, name)
+
+    return objects
+
+
+class OptionalDependencyNotAvailable(BaseException):
+    """An error indicating that an optional dependency of Diffusers was not found in the environment."""
+
+
+class _LazyModule(ModuleType):
+    """
+    Module class that surfaces all objects but only performs associated imports when the objects are requested.
+    """
+
+    # Very heavily inspired by optuna.integration._IntegrationModule
+    # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
+    def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None):
+        super().__init__(name)
+        self._modules = set(import_structure.keys())
+        self._class_to_module = {}
+        for key, values in import_structure.items():
+            for value in values:
+                self._class_to_module[value] = key
+        # Needed for autocompletion in an IDE
+        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
+        self.__file__ = module_file
+        self.__spec__ = module_spec
+        self.__path__ = [os.path.dirname(module_file)]
+        self._objects = {} if extra_objects is None else extra_objects
+        self._name = name
+        self._import_structure = import_structure
+
+    # Needed for autocompletion in an IDE
+    def __dir__(self):
+        result = super().__dir__()
+        # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether
+        # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir.
+        for attr in self.__all__:
+            if attr not in result:
+                result.append(attr)
+        return result
+
+    def __getattr__(self, name: str) -> Any:
+        if name in self._objects:
+            return self._objects[name]
+        if name in self._modules:
+            value = self._get_module(name)
+        elif name in self._class_to_module.keys():
+            module = self._get_module(self._class_to_module[name])
+            value = getattr(module, name)
+        else:
+            raise AttributeError(f"module {self.__name__} has no attribute {name}")
+
+        setattr(self, name, value)
+        return value
+
+    def _get_module(self, module_name: str):
+        try:
+            return importlib.import_module("." + module_name, self.__name__)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its"
+                f" traceback):\n{e}"
+            ) from e
+
+    def __reduce__(self):
+        return (self.__class__, (self._name, self.__file__, self._import_structure))
diff --git a/diffusers/src/diffusers/utils/loading_utils.py b/diffusers/src/diffusers/utils/loading_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b36664cb81ff771c284bc3919e054bd684dc342b
--- /dev/null
+++ b/diffusers/src/diffusers/utils/loading_utils.py
@@ -0,0 +1,137 @@
+import os
+import tempfile
+from typing import Callable, List, Optional, Union
+from urllib.parse import unquote, urlparse
+
+import PIL.Image
+import PIL.ImageOps
+import requests
+
+from .import_utils import BACKENDS_MAPPING, is_imageio_available
+
+
+def load_image(
+    image: Union[str, PIL.Image.Image], convert_method: Optional[Callable[[PIL.Image.Image], PIL.Image.Image]] = None
+) -> PIL.Image.Image:
+    """
+    Loads `image` to a PIL Image.
+
+    Args:
+        image (`str` or `PIL.Image.Image`):
+            The image to convert to the PIL Image format.
+        convert_method (Callable[[PIL.Image.Image], PIL.Image.Image], *optional*):
+            A conversion method to apply to the image after loading it. When set to `None` the image will be converted
+            "RGB".
+
+    Returns:
+        `PIL.Image.Image`:
+            A PIL Image.
+    """
+    if isinstance(image, str):
+        if image.startswith("http://") or image.startswith("https://"):
+            image = PIL.Image.open(requests.get(image, stream=True).raw)
+        elif os.path.isfile(image):
+            image = PIL.Image.open(image)
+        else:
+            raise ValueError(
+                f"Incorrect path or URL. URLs must start with `http://` or `https://`, and {image} is not a valid path."
+            )
+    elif isinstance(image, PIL.Image.Image):
+        image = image
+    else:
+        raise ValueError(
+            "Incorrect format used for the image. Should be a URL linking to an image, a local path, or a PIL image."
+        )
+
+    image = PIL.ImageOps.exif_transpose(image)
+
+    if convert_method is not None:
+        image = convert_method(image)
+    else:
+        image = image.convert("RGB")
+
+    return image
+
+
+def load_video(
+    video: str,
+    convert_method: Optional[Callable[[List[PIL.Image.Image]], List[PIL.Image.Image]]] = None,
+) -> List[PIL.Image.Image]:
+    """
+    Loads `video` to a list of PIL Image.
+
+    Args:
+        video (`str`):
+            A URL or Path to a video to convert to a list of PIL Image format.
+        convert_method (Callable[[List[PIL.Image.Image]], List[PIL.Image.Image]], *optional*):
+            A conversion method to apply to the video after loading it. When set to `None` the images will be converted
+            to "RGB".
+
+    Returns:
+        `List[PIL.Image.Image]`:
+            The video as a list of PIL images.
+    """
+    is_url = video.startswith("http://") or video.startswith("https://")
+    is_file = os.path.isfile(video)
+    was_tempfile_created = False
+
+    if not (is_url or is_file):
+        raise ValueError(
+            f"Incorrect path or URL. URLs must start with `http://` or `https://`, and {video} is not a valid path."
+        )
+
+    if is_url:
+        response = requests.get(video, stream=True)
+        if response.status_code != 200:
+            raise ValueError(f"Failed to download video. Status code: {response.status_code}")
+
+        parsed_url = urlparse(video)
+        file_name = os.path.basename(unquote(parsed_url.path))
+
+        suffix = os.path.splitext(file_name)[1] or ".mp4"
+        video_path = tempfile.NamedTemporaryFile(suffix=suffix, delete=False).name
+
+        was_tempfile_created = True
+
+        video_data = response.iter_content(chunk_size=8192)
+        with open(video_path, "wb") as f:
+            for chunk in video_data:
+                f.write(chunk)
+
+        video = video_path
+
+    pil_images = []
+    if video.endswith(".gif"):
+        gif = PIL.Image.open(video)
+        try:
+            while True:
+                pil_images.append(gif.copy())
+                gif.seek(gif.tell() + 1)
+        except EOFError:
+            pass
+
+    else:
+        if is_imageio_available():
+            import imageio
+        else:
+            raise ImportError(BACKENDS_MAPPING["imageio"][1].format("load_video"))
+
+        try:
+            imageio.plugins.ffmpeg.get_exe()
+        except AttributeError:
+            raise AttributeError(
+                "`Unable to find an ffmpeg installation on your machine. Please install via `pip install imageio-ffmpeg"
+            )
+
+        with imageio.get_reader(video) as reader:
+            # Read all frames
+            for frame in reader:
+                pil_images.append(PIL.Image.fromarray(frame))
+
+    if was_tempfile_created:
+        os.remove(video_path)
+
+    if convert_method is not None:
+        pil_images = convert_method(pil_images)
+
+    return pil_images
diff --git a/diffusers/src/diffusers/utils/logging.py b/diffusers/src/diffusers/utils/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f93450c410c9325aa9d0cf10262b67e4c355fda
--- /dev/null
+++ b/diffusers/src/diffusers/utils/logging.py
@@ -0,0 +1,341 @@
+# coding=utf-8
+# Copyright 2024 Optuna, Hugging Face
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Logging utilities."""
+
+import logging
+import os
+import sys
+import threading
+from logging import (
+    CRITICAL,  # NOQA
+    DEBUG,  # NOQA
+    ERROR,  # NOQA
+    FATAL,  # NOQA
+    INFO,  # NOQA
+    NOTSET,  # NOQA
+    WARN,  # NOQA
+    WARNING,  # NOQA
+)
+from typing import Dict, Optional
+
+from tqdm import auto as tqdm_lib
+
+
+_lock = threading.Lock()
+_default_handler: Optional[logging.Handler] = None
+
+log_levels = {
+    "debug": logging.DEBUG,
+    "info": logging.INFO,
+    "warning": logging.WARNING,
+    "error": logging.ERROR,
+    "critical": logging.CRITICAL,
+}
+
+_default_log_level = logging.WARNING
+
+_tqdm_active = True
+
+
+def _get_default_logging_level() -> int:
+    """
+    If DIFFUSERS_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is
+    not - fall back to `_default_log_level`
+    """
+    env_level_str = os.getenv("DIFFUSERS_VERBOSITY", None)
+    if env_level_str:
+        if env_level_str in log_levels:
+            return log_levels[env_level_str]
+        else:
+            logging.getLogger().warning(
+                f"Unknown option DIFFUSERS_VERBOSITY={env_level_str}, "
+                f"has to be one of: { ', '.join(log_levels.keys()) }"
+            )
+    return _default_log_level
+
+
+def _get_library_name() -> str:
+    return __name__.split(".")[0]
+
+
+def _get_library_root_logger() -> logging.Logger:
+    return logging.getLogger(_get_library_name())
+
+
+def _configure_library_root_logger() -> None:
+    global _default_handler
+
+    with _lock:
+        if _default_handler:
+            # This library has already configured the library root logger.
+            return
+        _default_handler = logging.StreamHandler()  # Set sys.stderr as stream.
+
+        if sys.stderr:  # only if sys.stderr exists, e.g. when not using pythonw in windows
+            _default_handler.flush = sys.stderr.flush
+
+        # Apply our default configuration to the library root logger.
+        library_root_logger = _get_library_root_logger()
+        library_root_logger.addHandler(_default_handler)
+        library_root_logger.setLevel(_get_default_logging_level())
+        library_root_logger.propagate = False
+
+
+def _reset_library_root_logger() -> None:
+    global _default_handler
+
+    with _lock:
+        if not _default_handler:
+            return
+
+        library_root_logger = _get_library_root_logger()
+        library_root_logger.removeHandler(_default_handler)
+        library_root_logger.setLevel(logging.NOTSET)
+        _default_handler = None
+
+
+def get_log_levels_dict() -> Dict[str, int]:
+    return log_levels
+
+
+def get_logger(name: Optional[str] = None) -> logging.Logger:
+    """
+    Return a logger with the specified name.
+
+    This function is not supposed to be directly accessed unless you are writing a custom diffusers module.
+    """
+
+    if name is None:
+        name = _get_library_name()
+
+    _configure_library_root_logger()
+    return logging.getLogger(name)
+
+
+def get_verbosity() -> int:
+    """
+    Return the current level for the 🤗 Diffusers' root logger as an `int`.
+
+    Returns:
+        `int`:
+            Logging level integers which can be one of:
+
+            - `50`: `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL`
+            - `40`: `diffusers.logging.ERROR`
+            - `30`: `diffusers.logging.WARNING` or `diffusers.logging.WARN`
+            - `20`: `diffusers.logging.INFO`
+            - `10`: `diffusers.logging.DEBUG`
+
+    """
+
+    _configure_library_root_logger()
+    return _get_library_root_logger().getEffectiveLevel()
+
+
+def set_verbosity(verbosity: int) -> None:
+    """
+    Set the verbosity level for the 🤗 Diffusers' root logger.
+
+    Args:
+        verbosity (`int`):
+            Logging level which can be one of:
+
+            - `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL`
+            - `diffusers.logging.ERROR`
+            - `diffusers.logging.WARNING` or `diffusers.logging.WARN`
+            - `diffusers.logging.INFO`
+            - `diffusers.logging.DEBUG`
+    """
+
+    _configure_library_root_logger()
+    _get_library_root_logger().setLevel(verbosity)
+
+
+def set_verbosity_info() -> None:
+    """Set the verbosity to the `INFO` level."""
+    return set_verbosity(INFO)
+
+
+def set_verbosity_warning() -> None:
+    """Set the verbosity to the `WARNING` level."""
+    return set_verbosity(WARNING)
+
+
+def set_verbosity_debug() -> None:
+    """Set the verbosity to the `DEBUG` level."""
+    return set_verbosity(DEBUG)
+
+
+def set_verbosity_error() -> None:
+    """Set the verbosity to the `ERROR` level."""
+    return set_verbosity(ERROR)
+
+
+def disable_default_handler() -> None:
+    """Disable the default handler of the 🤗 Diffusers' root logger."""
+
+    _configure_library_root_logger()
+
+    assert _default_handler is not None
+    _get_library_root_logger().removeHandler(_default_handler)
+
+
+def enable_default_handler() -> None:
+    """Enable the default handler of the 🤗 Diffusers' root logger."""
+
+    _configure_library_root_logger()
+
+    assert _default_handler is not None
+    _get_library_root_logger().addHandler(_default_handler)
+
+
+def add_handler(handler: logging.Handler) -> None:
+    """adds a handler to the HuggingFace Diffusers' root logger."""
+
+    _configure_library_root_logger()
+
+    assert handler is not None
+    _get_library_root_logger().addHandler(handler)
+
+
+def remove_handler(handler: logging.Handler) -> None:
+    """removes given handler from the HuggingFace Diffusers' root logger."""
+
+    _configure_library_root_logger()
+
+    assert handler is not None and handler in _get_library_root_logger().handlers
+    _get_library_root_logger().removeHandler(handler)
+
+
+def disable_propagation() -> None:
+    """
+    Disable propagation of the library log outputs. Note that log propagation is disabled by default.
+    """
+
+    _configure_library_root_logger()
+    _get_library_root_logger().propagate = False
+
+
+def enable_propagation() -> None:
+    """
+    Enable propagation of the library log outputs. Please disable the HuggingFace Diffusers' default handler to prevent
+    double logging if the root logger has been configured.
+    """
+
+    _configure_library_root_logger()
+    _get_library_root_logger().propagate = True
+
+
+def enable_explicit_format() -> None:
+    """
+    Enable explicit formatting for every 🤗 Diffusers' logger. The explicit formatter is as follows:
+    ```
+    [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE
+    ```
+    All handlers currently bound to the root logger are affected by this method.
+    """
+    handlers = _get_library_root_logger().handlers
+
+    for handler in handlers:
+        formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
+        handler.setFormatter(formatter)
+
+
+def reset_format() -> None:
+    """
+    Resets the formatting for 🤗 Diffusers' loggers.
+
+    All handlers currently bound to the root logger are affected by this method.
+    """
+    handlers = _get_library_root_logger().handlers
+
+    for handler in handlers:
+        handler.setFormatter(None)
+
+
+def warning_advice(self, *args, **kwargs) -> None:
+    """
+    This method is identical to `logger.warning()`, but if env var DIFFUSERS_NO_ADVISORY_WARNINGS=1 is set, this
+    warning will not be printed
+    """
+    no_advisory_warnings = os.getenv("DIFFUSERS_NO_ADVISORY_WARNINGS", False)
+    if no_advisory_warnings:
+        return
+    self.warning(*args, **kwargs)
+
+
+logging.Logger.warning_advice = warning_advice
+
+
+class EmptyTqdm:
+    """Dummy tqdm which doesn't do anything."""
+
+    def __init__(self, *args, **kwargs):  # pylint: disable=unused-argument
+        self._iterator = args[0] if args else None
+
+    def __iter__(self):
+        return iter(self._iterator)
+
+    def __getattr__(self, _):
+        """Return empty function."""
+
+        def empty_fn(*args, **kwargs):  # pylint: disable=unused-argument
+            return
+
+        return empty_fn
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type_, value, traceback):
+        return
+
+
+class _tqdm_cls:
+    def __call__(self, *args, **kwargs):
+        if _tqdm_active:
+            return tqdm_lib.tqdm(*args, **kwargs)
+        else:
+            return EmptyTqdm(*args, **kwargs)
+
+    def set_lock(self, *args, **kwargs):
+        self._lock = None
+        if _tqdm_active:
+            return tqdm_lib.tqdm.set_lock(*args, **kwargs)
+
+    def get_lock(self):
+        if _tqdm_active:
+            return tqdm_lib.tqdm.get_lock()
+
+
+tqdm = _tqdm_cls()
+
+
+def is_progress_bar_enabled() -> bool:
+    """Return a boolean indicating whether tqdm progress bars are enabled."""
+    global _tqdm_active
+    return bool(_tqdm_active)
+
+
+def enable_progress_bar() -> None:
+    """Enable tqdm progress bar."""
+    global _tqdm_active
+    _tqdm_active = True
+
+
+def disable_progress_bar() -> None:
+    """Disable tqdm progress bar."""
+    global _tqdm_active
+    _tqdm_active = False
diff --git a/diffusers/src/diffusers/utils/model_card_template.md b/diffusers/src/diffusers/utils/model_card_template.md
new file mode 100644
index 0000000000000000000000000000000000000000..f41b71e24e2081425d3049ae51b50036d5d28b6a
--- /dev/null
+++ b/diffusers/src/diffusers/utils/model_card_template.md
@@ -0,0 +1,24 @@
+---
+{{ card_data }}
+---
+
+<!-- This model card has been generated automatically according to the information the training script had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+{{ model_description }}
+
+## Intended uses & limitations
+
+#### How to use
+
+```python
+# TODO: add an example code snippet for running this diffusion pipeline
+```
+
+#### Limitations and bias
+
+[TODO: provide examples of latent issues and potential remediations]
+
+## Training details
+
+[TODO: describe the data used to train the model]
diff --git a/diffusers/src/diffusers/utils/outputs.py b/diffusers/src/diffusers/utils/outputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..6080a86b871aec2218471591bccac408a45d82b2
--- /dev/null
+++ b/diffusers/src/diffusers/utils/outputs.py
@@ -0,0 +1,137 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Generic utilities
+"""
+
+from collections import OrderedDict
+from dataclasses import fields, is_dataclass
+from typing import Any, Tuple
+
+import numpy as np
+
+from .import_utils import is_torch_available, is_torch_version
+
+
+def is_tensor(x) -> bool:
+    """
+    Tests if `x` is a `torch.Tensor` or `np.ndarray`.
+    """
+    if is_torch_available():
+        import torch
+
+        if isinstance(x, torch.Tensor):
+            return True
+
+    return isinstance(x, np.ndarray)
+
+
+class BaseOutput(OrderedDict):
+    """
+    Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a
+    tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular
+    Python dictionary.
+
+    <Tip warning={true}>
+
+    You can't unpack a [`BaseOutput`] directly. Use the [`~utils.BaseOutput.to_tuple`] method to convert it to a tuple
+    first.
+
+    </Tip>
+    """
+
+    def __init_subclass__(cls) -> None:
+        """Register subclasses as pytree nodes.
+
+        This is necessary to synchronize gradients when using `torch.nn.parallel.DistributedDataParallel` with
+        `static_graph=True` with modules that output `ModelOutput` subclasses.
+        """
+        if is_torch_available():
+            import torch.utils._pytree
+
+            if is_torch_version("<", "2.2"):
+                torch.utils._pytree._register_pytree_node(
+                    cls,
+                    torch.utils._pytree._dict_flatten,
+                    lambda values, context: cls(**torch.utils._pytree._dict_unflatten(values, context)),
+                )
+            else:
+                torch.utils._pytree.register_pytree_node(
+                    cls,
+                    torch.utils._pytree._dict_flatten,
+                    lambda values, context: cls(**torch.utils._pytree._dict_unflatten(values, context)),
+                )
+
+    def __post_init__(self) -> None:
+        class_fields = fields(self)
+
+        # Safety and consistency checks
+        if not len(class_fields):
+            raise ValueError(f"{self.__class__.__name__} has no fields.")
+
+        first_field = getattr(self, class_fields[0].name)
+        other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])
+
+        if other_fields_are_none and isinstance(first_field, dict):
+            for key, value in first_field.items():
+                self[key] = value
+        else:
+            for field in class_fields:
+                v = getattr(self, field.name)
+                if v is not None:
+                    self[field.name] = v
+
+    def __delitem__(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
+
+    def setdefault(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
+
+    def pop(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+
+    def update(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
+
+    def __getitem__(self, k: Any) -> Any:
+        if isinstance(k, str):
+            inner_dict = dict(self.items())
+            return inner_dict[k]
+        else:
+            return self.to_tuple()[k]
+
+    def __setattr__(self, name: Any, value: Any) -> None:
+        if name in self.keys() and value is not None:
+            # Don't call self.__setitem__ to avoid recursion errors
+            super().__setitem__(name, value)
+        super().__setattr__(name, value)
+
+    def __setitem__(self, key, value):
+        # Will raise a KeyException if needed
+        super().__setitem__(key, value)
+        # Don't call self.__setattr__ to avoid recursion errors
+        super().__setattr__(key, value)
+
+    def __reduce__(self):
+        if not is_dataclass(self):
+            return super().__reduce__()
+        callable, _args, *remaining = super().__reduce__()
+        args = tuple(getattr(self, field.name) for field in fields(self))
+        return callable, args, *remaining
+
+    def to_tuple(self) -> Tuple[Any, ...]:
+        """
+        Convert self to a tuple containing all the attributes/keys that are not `None`.
+        """
+        return tuple(self[k] for k in self.keys())
diff --git a/diffusers/src/diffusers/utils/peft_utils.py b/diffusers/src/diffusers/utils/peft_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca55192ff7ae83139b239ec7ff46d0fa79555a19
--- /dev/null
+++ b/diffusers/src/diffusers/utils/peft_utils.py
@@ -0,0 +1,295 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+PEFT utilities: Utilities related to peft library
+"""
+
+import collections
+import importlib
+from typing import Optional
+
+from packaging import version
+
+from .import_utils import is_peft_available, is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+
+def recurse_remove_peft_layers(model):
+    r"""
+    Recursively replace all instances of `LoraLayer` with corresponding new layers in `model`.
+    """
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+    has_base_layer_pattern = False
+    for module in model.modules():
+        if isinstance(module, BaseTunerLayer):
+            has_base_layer_pattern = hasattr(module, "base_layer")
+            break
+
+    if has_base_layer_pattern:
+        from peft.utils import _get_submodules
+
+        key_list = [key for key, _ in model.named_modules() if "lora" not in key]
+        for key in key_list:
+            try:
+                parent, target, target_name = _get_submodules(model, key)
+            except AttributeError:
+                continue
+            if hasattr(target, "base_layer"):
+                setattr(parent, target_name, target.get_base_layer())
+    else:
+        # This is for backwards compatibility with PEFT <= 0.6.2.
+        # TODO can be removed once that PEFT version is no longer supported.
+        from peft.tuners.lora import LoraLayer
+
+        for name, module in model.named_children():
+            if len(list(module.children())) > 0:
+                ## compound module, go inside it
+                recurse_remove_peft_layers(module)
+
+            module_replaced = False
+
+            if isinstance(module, LoraLayer) and isinstance(module, torch.nn.Linear):
+                new_module = torch.nn.Linear(
+                    module.in_features,
+                    module.out_features,
+                    bias=module.bias is not None,
+                ).to(module.weight.device)
+                new_module.weight = module.weight
+                if module.bias is not None:
+                    new_module.bias = module.bias
+
+                module_replaced = True
+            elif isinstance(module, LoraLayer) and isinstance(module, torch.nn.Conv2d):
+                new_module = torch.nn.Conv2d(
+                    module.in_channels,
+                    module.out_channels,
+                    module.kernel_size,
+                    module.stride,
+                    module.padding,
+                    module.dilation,
+                    module.groups,
+                ).to(module.weight.device)
+
+                new_module.weight = module.weight
+                if module.bias is not None:
+                    new_module.bias = module.bias
+
+                module_replaced = True
+
+            if module_replaced:
+                setattr(model, name, new_module)
+                del module
+
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+    return model
+
+
+def scale_lora_layers(model, weight):
+    """
+    Adjust the weightage given to the LoRA layers of the model.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to scale.
+        weight (`float`):
+            The weight to be given to the LoRA layers.
+    """
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+    if weight == 1.0:
+        return
+
+    for module in model.modules():
+        if isinstance(module, BaseTunerLayer):
+            module.scale_layer(weight)
+
+
+def unscale_lora_layers(model, weight: Optional[float] = None):
+    """
+    Removes the previously passed weight given to the LoRA layers of the model.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to scale.
+        weight (`float`, *optional*):
+            The weight to be given to the LoRA layers. If no scale is passed the scale of the lora layer will be
+            re-initialized to the correct value. If 0.0 is passed, we will re-initialize the scale with the correct
+            value.
+    """
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+    if weight == 1.0:
+        return
+
+    for module in model.modules():
+        if isinstance(module, BaseTunerLayer):
+            if weight is not None and weight != 0:
+                module.unscale_layer(weight)
+            elif weight is not None and weight == 0:
+                for adapter_name in module.active_adapters:
+                    # if weight == 0 unscale should re-set the scale to the original value.
+                    module.set_scale(adapter_name, 1.0)
+
+
+def get_peft_kwargs(rank_dict, network_alpha_dict, peft_state_dict, is_unet=True):
+    rank_pattern = {}
+    alpha_pattern = {}
+    r = lora_alpha = list(rank_dict.values())[0]
+
+    if len(set(rank_dict.values())) > 1:
+        # get the rank occuring the most number of times
+        r = collections.Counter(rank_dict.values()).most_common()[0][0]
+
+        # for modules with rank different from the most occuring rank, add it to the `rank_pattern`
+        rank_pattern = dict(filter(lambda x: x[1] != r, rank_dict.items()))
+        rank_pattern = {k.split(".lora_B.")[0]: v for k, v in rank_pattern.items()}
+
+    if network_alpha_dict is not None and len(network_alpha_dict) > 0:
+        if len(set(network_alpha_dict.values())) > 1:
+            # get the alpha occuring the most number of times
+            lora_alpha = collections.Counter(network_alpha_dict.values()).most_common()[0][0]
+
+            # for modules with alpha different from the most occuring alpha, add it to the `alpha_pattern`
+            alpha_pattern = dict(filter(lambda x: x[1] != lora_alpha, network_alpha_dict.items()))
+            if is_unet:
+                alpha_pattern = {
+                    ".".join(k.split(".lora_A.")[0].split(".")).replace(".alpha", ""): v
+                    for k, v in alpha_pattern.items()
+                }
+            else:
+                alpha_pattern = {".".join(k.split(".down.")[0].split(".")[:-1]): v for k, v in alpha_pattern.items()}
+        else:
+            lora_alpha = set(network_alpha_dict.values()).pop()
+
+    # layer names without the Diffusers specific
+    target_modules = list({name.split(".lora")[0] for name in peft_state_dict.keys()})
+    use_dora = any("lora_magnitude_vector" in k for k in peft_state_dict)
+
+    lora_config_kwargs = {
+        "r": r,
+        "lora_alpha": lora_alpha,
+        "rank_pattern": rank_pattern,
+        "alpha_pattern": alpha_pattern,
+        "target_modules": target_modules,
+        "use_dora": use_dora,
+    }
+    return lora_config_kwargs
+
+
+def get_adapter_name(model):
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+    for module in model.modules():
+        if isinstance(module, BaseTunerLayer):
+            return f"default_{len(module.r)}"
+    return "default_0"
+
+
+def set_adapter_layers(model, enabled=True):
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+    for module in model.modules():
+        if isinstance(module, BaseTunerLayer):
+            # The recent version of PEFT needs to call `enable_adapters` instead
+            if hasattr(module, "enable_adapters"):
+                module.enable_adapters(enabled=enabled)
+            else:
+                module.disable_adapters = not enabled
+
+
+def delete_adapter_layers(model, adapter_name):
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+    for module in model.modules():
+        if isinstance(module, BaseTunerLayer):
+            if hasattr(module, "delete_adapter"):
+                module.delete_adapter(adapter_name)
+            else:
+                raise ValueError(
+                    "The version of PEFT you are using is not compatible, please use a version that is greater than 0.6.1"
+                )
+
+    # For transformers integration - we need to pop the adapter from the config
+    if getattr(model, "_hf_peft_config_loaded", False) and hasattr(model, "peft_config"):
+        model.peft_config.pop(adapter_name, None)
+        # In case all adapters are deleted, we need to delete the config
+        # and make sure to set the flag to False
+        if len(model.peft_config) == 0:
+            del model.peft_config
+            model._hf_peft_config_loaded = None
+
+
+def set_weights_and_activate_adapters(model, adapter_names, weights):
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+    def get_module_weight(weight_for_adapter, module_name):
+        if not isinstance(weight_for_adapter, dict):
+            # If weight_for_adapter is a single number, always return it.
+            return weight_for_adapter
+
+        for layer_name, weight_ in weight_for_adapter.items():
+            if layer_name in module_name:
+                return weight_
+
+        parts = module_name.split(".")
+        # e.g. key = "down_blocks.1.attentions.0"
+        key = f"{parts[0]}.{parts[1]}.attentions.{parts[3]}"
+        block_weight = weight_for_adapter.get(key, 1.0)
+
+        return block_weight
+
+    # iterate over each adapter, make it active and set the corresponding scaling weight
+    for adapter_name, weight in zip(adapter_names, weights):
+        for module_name, module in model.named_modules():
+            if isinstance(module, BaseTunerLayer):
+                # For backward compatbility with previous PEFT versions
+                if hasattr(module, "set_adapter"):
+                    module.set_adapter(adapter_name)
+                else:
+                    module.active_adapter = adapter_name
+                module.set_scale(adapter_name, get_module_weight(weight, module_name))
+
+    # set multiple active adapters
+    for module in model.modules():
+        if isinstance(module, BaseTunerLayer):
+            # For backward compatbility with previous PEFT versions
+            if hasattr(module, "set_adapter"):
+                module.set_adapter(adapter_names)
+            else:
+                module.active_adapter = adapter_names
+
+
+def check_peft_version(min_version: str) -> None:
+    r"""
+    Checks if the version of PEFT is compatible.
+
+    Args:
+        version (`str`):
+            The version of PEFT to check against.
+    """
+    if not is_peft_available():
+        raise ValueError("PEFT is not installed. Please install it with `pip install peft`")
+
+    is_peft_version_compatible = version.parse(importlib.metadata.version("peft")) > version.parse(min_version)
+
+    if not is_peft_version_compatible:
+        raise ValueError(
+            f"The version of PEFT you are using is not compatible, please use a version that is greater"
+            f" than {min_version}"
+        )
diff --git a/diffusers/src/diffusers/utils/pil_utils.py b/diffusers/src/diffusers/utils/pil_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..76678070b697c7d87fc3691d9bc5bb3bea83c5b1
--- /dev/null
+++ b/diffusers/src/diffusers/utils/pil_utils.py
@@ -0,0 +1,67 @@
+from typing import List
+
+import PIL.Image
+import PIL.ImageOps
+from packaging import version
+from PIL import Image
+
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+
+
+def pt_to_pil(images):
+    """
+    Convert a torch image to a PIL image.
+    """
+    images = (images / 2 + 0.5).clamp(0, 1)
+    images = images.cpu().permute(0, 2, 3, 1).float().numpy()
+    images = numpy_to_pil(images)
+    return images
+
+
+def numpy_to_pil(images):
+    """
+    Convert a numpy image or a batch of images to a PIL image.
+    """
+    if images.ndim == 3:
+        images = images[None, ...]
+    images = (images * 255).round().astype("uint8")
+    if images.shape[-1] == 1:
+        # special case for grayscale (single channel) images
+        pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
+    else:
+        pil_images = [Image.fromarray(image) for image in images]
+
+    return pil_images
+
+
+def make_image_grid(images: List[PIL.Image.Image], rows: int, cols: int, resize: int = None) -> PIL.Image.Image:
+    """
+    Prepares a single grid of images. Useful for visualization purposes.
+    """
+    assert len(images) == rows * cols
+
+    if resize is not None:
+        images = [img.resize((resize, resize)) for img in images]
+
+    w, h = images[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+
+    for i, img in enumerate(images):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
diff --git a/diffusers/src/diffusers/utils/state_dict_utils.py b/diffusers/src/diffusers/utils/state_dict_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..62b114ba67e35c696ba859909a83deea39f5ef22
--- /dev/null
+++ b/diffusers/src/diffusers/utils/state_dict_utils.py
@@ -0,0 +1,335 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+State dict utilities: utility methods for converting state dicts easily
+"""
+
+import enum
+
+from .logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class StateDictType(enum.Enum):
+    """
+    The mode to use when converting state dicts.
+    """
+
+    DIFFUSERS_OLD = "diffusers_old"
+    KOHYA_SS = "kohya_ss"
+    PEFT = "peft"
+    DIFFUSERS = "diffusers"
+
+
+# We need to define a proper mapping for Unet since it uses different output keys than text encoder
+# e.g. to_q_lora -> q_proj / to_q
+UNET_TO_DIFFUSERS = {
+    ".to_out_lora.up": ".to_out.0.lora_B",
+    ".to_out_lora.down": ".to_out.0.lora_A",
+    ".to_q_lora.down": ".to_q.lora_A",
+    ".to_q_lora.up": ".to_q.lora_B",
+    ".to_k_lora.down": ".to_k.lora_A",
+    ".to_k_lora.up": ".to_k.lora_B",
+    ".to_v_lora.down": ".to_v.lora_A",
+    ".to_v_lora.up": ".to_v.lora_B",
+    ".lora.up": ".lora_B",
+    ".lora.down": ".lora_A",
+    ".to_out.lora_magnitude_vector": ".to_out.0.lora_magnitude_vector",
+}
+
+
+DIFFUSERS_TO_PEFT = {
+    ".q_proj.lora_linear_layer.up": ".q_proj.lora_B",
+    ".q_proj.lora_linear_layer.down": ".q_proj.lora_A",
+    ".k_proj.lora_linear_layer.up": ".k_proj.lora_B",
+    ".k_proj.lora_linear_layer.down": ".k_proj.lora_A",
+    ".v_proj.lora_linear_layer.up": ".v_proj.lora_B",
+    ".v_proj.lora_linear_layer.down": ".v_proj.lora_A",
+    ".out_proj.lora_linear_layer.up": ".out_proj.lora_B",
+    ".out_proj.lora_linear_layer.down": ".out_proj.lora_A",
+    ".lora_linear_layer.up": ".lora_B",
+    ".lora_linear_layer.down": ".lora_A",
+    "text_projection.lora.down.weight": "text_projection.lora_A.weight",
+    "text_projection.lora.up.weight": "text_projection.lora_B.weight",
+}
+
+DIFFUSERS_OLD_TO_PEFT = {
+    ".to_q_lora.up": ".q_proj.lora_B",
+    ".to_q_lora.down": ".q_proj.lora_A",
+    ".to_k_lora.up": ".k_proj.lora_B",
+    ".to_k_lora.down": ".k_proj.lora_A",
+    ".to_v_lora.up": ".v_proj.lora_B",
+    ".to_v_lora.down": ".v_proj.lora_A",
+    ".to_out_lora.up": ".out_proj.lora_B",
+    ".to_out_lora.down": ".out_proj.lora_A",
+    ".lora_linear_layer.up": ".lora_B",
+    ".lora_linear_layer.down": ".lora_A",
+}
+
+PEFT_TO_DIFFUSERS = {
+    ".q_proj.lora_B": ".q_proj.lora_linear_layer.up",
+    ".q_proj.lora_A": ".q_proj.lora_linear_layer.down",
+    ".k_proj.lora_B": ".k_proj.lora_linear_layer.up",
+    ".k_proj.lora_A": ".k_proj.lora_linear_layer.down",
+    ".v_proj.lora_B": ".v_proj.lora_linear_layer.up",
+    ".v_proj.lora_A": ".v_proj.lora_linear_layer.down",
+    ".out_proj.lora_B": ".out_proj.lora_linear_layer.up",
+    ".out_proj.lora_A": ".out_proj.lora_linear_layer.down",
+    "to_k.lora_A": "to_k.lora.down",
+    "to_k.lora_B": "to_k.lora.up",
+    "to_q.lora_A": "to_q.lora.down",
+    "to_q.lora_B": "to_q.lora.up",
+    "to_v.lora_A": "to_v.lora.down",
+    "to_v.lora_B": "to_v.lora.up",
+    "to_out.0.lora_A": "to_out.0.lora.down",
+    "to_out.0.lora_B": "to_out.0.lora.up",
+}
+
+DIFFUSERS_OLD_TO_DIFFUSERS = {
+    ".to_q_lora.up": ".q_proj.lora_linear_layer.up",
+    ".to_q_lora.down": ".q_proj.lora_linear_layer.down",
+    ".to_k_lora.up": ".k_proj.lora_linear_layer.up",
+    ".to_k_lora.down": ".k_proj.lora_linear_layer.down",
+    ".to_v_lora.up": ".v_proj.lora_linear_layer.up",
+    ".to_v_lora.down": ".v_proj.lora_linear_layer.down",
+    ".to_out_lora.up": ".out_proj.lora_linear_layer.up",
+    ".to_out_lora.down": ".out_proj.lora_linear_layer.down",
+    ".to_k.lora_magnitude_vector": ".k_proj.lora_magnitude_vector",
+    ".to_v.lora_magnitude_vector": ".v_proj.lora_magnitude_vector",
+    ".to_q.lora_magnitude_vector": ".q_proj.lora_magnitude_vector",
+    ".to_out.lora_magnitude_vector": ".out_proj.lora_magnitude_vector",
+}
+
+PEFT_TO_KOHYA_SS = {
+    "lora_A": "lora_down",
+    "lora_B": "lora_up",
+    # This is not a comprehensive dict as kohya format requires replacing `.` with `_` in keys,
+    # adding prefixes and adding alpha values
+    # Check `convert_state_dict_to_kohya` for more
+}
+
+PEFT_STATE_DICT_MAPPINGS = {
+    StateDictType.DIFFUSERS_OLD: DIFFUSERS_OLD_TO_PEFT,
+    StateDictType.DIFFUSERS: DIFFUSERS_TO_PEFT,
+}
+
+DIFFUSERS_STATE_DICT_MAPPINGS = {
+    StateDictType.DIFFUSERS_OLD: DIFFUSERS_OLD_TO_DIFFUSERS,
+    StateDictType.PEFT: PEFT_TO_DIFFUSERS,
+}
+
+KOHYA_STATE_DICT_MAPPINGS = {StateDictType.PEFT: PEFT_TO_KOHYA_SS}
+
+KEYS_TO_ALWAYS_REPLACE = {
+    ".processor.": ".",
+}
+
+
+def convert_state_dict(state_dict, mapping):
+    r"""
+    Simply iterates over the state dict and replaces the patterns in `mapping` with the corresponding values.
+
+    Args:
+        state_dict (`dict[str, torch.Tensor]`):
+            The state dict to convert.
+        mapping (`dict[str, str]`):
+            The mapping to use for conversion, the mapping should be a dictionary with the following structure:
+                - key: the pattern to replace
+                - value: the pattern to replace with
+
+    Returns:
+        converted_state_dict (`dict`)
+            The converted state dict.
+    """
+    converted_state_dict = {}
+    for k, v in state_dict.items():
+        # First, filter out the keys that we always want to replace
+        for pattern in KEYS_TO_ALWAYS_REPLACE.keys():
+            if pattern in k:
+                new_pattern = KEYS_TO_ALWAYS_REPLACE[pattern]
+                k = k.replace(pattern, new_pattern)
+
+        for pattern in mapping.keys():
+            if pattern in k:
+                new_pattern = mapping[pattern]
+                k = k.replace(pattern, new_pattern)
+                break
+        converted_state_dict[k] = v
+    return converted_state_dict
+
+
+def convert_state_dict_to_peft(state_dict, original_type=None, **kwargs):
+    r"""
+    Converts a state dict to the PEFT format The state dict can be from previous diffusers format (`OLD_DIFFUSERS`), or
+    new diffusers format (`DIFFUSERS`). The method only supports the conversion from diffusers old/new to PEFT for now.
+
+    Args:
+        state_dict (`dict[str, torch.Tensor]`):
+            The state dict to convert.
+        original_type (`StateDictType`, *optional*):
+            The original type of the state dict, if not provided, the method will try to infer it automatically.
+    """
+    if original_type is None:
+        # Old diffusers to PEFT
+        if any("to_out_lora" in k for k in state_dict.keys()):
+            original_type = StateDictType.DIFFUSERS_OLD
+        elif any("lora_linear_layer" in k for k in state_dict.keys()):
+            original_type = StateDictType.DIFFUSERS
+        else:
+            raise ValueError("Could not automatically infer state dict type")
+
+    if original_type not in PEFT_STATE_DICT_MAPPINGS.keys():
+        raise ValueError(f"Original type {original_type} is not supported")
+
+    mapping = PEFT_STATE_DICT_MAPPINGS[original_type]
+    return convert_state_dict(state_dict, mapping)
+
+
+def convert_state_dict_to_diffusers(state_dict, original_type=None, **kwargs):
+    r"""
+    Converts a state dict to new diffusers format. The state dict can be from previous diffusers format
+    (`OLD_DIFFUSERS`), or PEFT format (`PEFT`) or new diffusers format (`DIFFUSERS`). In the last case the method will
+    return the state dict as is.
+
+    The method only supports the conversion from diffusers old, PEFT to diffusers new for now.
+
+    Args:
+        state_dict (`dict[str, torch.Tensor]`):
+            The state dict to convert.
+        original_type (`StateDictType`, *optional*):
+            The original type of the state dict, if not provided, the method will try to infer it automatically.
+        kwargs (`dict`, *args*):
+            Additional arguments to pass to the method.
+
+            - **adapter_name**: For example, in case of PEFT, some keys will be pre-pended
+                with the adapter name, therefore needs a special handling. By default PEFT also takes care of that in
+                `get_peft_model_state_dict` method:
+                https://github.com/huggingface/peft/blob/ba0477f2985b1ba311b83459d29895c809404e99/src/peft/utils/save_and_load.py#L92
+                but we add it here in case we don't want to rely on that method.
+    """
+    peft_adapter_name = kwargs.pop("adapter_name", None)
+    if peft_adapter_name is not None:
+        peft_adapter_name = "." + peft_adapter_name
+    else:
+        peft_adapter_name = ""
+
+    if original_type is None:
+        # Old diffusers to PEFT
+        if any("to_out_lora" in k for k in state_dict.keys()):
+            original_type = StateDictType.DIFFUSERS_OLD
+        elif any(f".lora_A{peft_adapter_name}.weight" in k for k in state_dict.keys()):
+            original_type = StateDictType.PEFT
+        elif any("lora_linear_layer" in k for k in state_dict.keys()):
+            # nothing to do
+            return state_dict
+        else:
+            raise ValueError("Could not automatically infer state dict type")
+
+    if original_type not in DIFFUSERS_STATE_DICT_MAPPINGS.keys():
+        raise ValueError(f"Original type {original_type} is not supported")
+
+    mapping = DIFFUSERS_STATE_DICT_MAPPINGS[original_type]
+    return convert_state_dict(state_dict, mapping)
+
+
+def convert_unet_state_dict_to_peft(state_dict):
+    r"""
+    Converts a state dict from UNet format to diffusers format - i.e. by removing some keys
+    """
+    mapping = UNET_TO_DIFFUSERS
+    return convert_state_dict(state_dict, mapping)
+
+
+def convert_all_state_dict_to_peft(state_dict):
+    r"""
+    Attempts to first `convert_state_dict_to_peft`, and if it doesn't detect `lora_linear_layer` for a valid
+    `DIFFUSERS` LoRA for example, attempts to exclusively convert the Unet `convert_unet_state_dict_to_peft`
+    """
+    try:
+        peft_dict = convert_state_dict_to_peft(state_dict)
+    except Exception as e:
+        if str(e) == "Could not automatically infer state dict type":
+            peft_dict = convert_unet_state_dict_to_peft(state_dict)
+        else:
+            raise
+
+    if not any("lora_A" in key or "lora_B" in key for key in peft_dict.keys()):
+        raise ValueError("Your LoRA was not converted to PEFT")
+
+    return peft_dict
+
+
+def convert_state_dict_to_kohya(state_dict, original_type=None, **kwargs):
+    r"""
+    Converts a `PEFT` state dict to `Kohya` format that can be used in AUTOMATIC1111, ComfyUI, SD.Next, InvokeAI, etc.
+    The method only supports the conversion from PEFT to Kohya for now.
+
+    Args:
+        state_dict (`dict[str, torch.Tensor]`):
+            The state dict to convert.
+        original_type (`StateDictType`, *optional*):
+            The original type of the state dict, if not provided, the method will try to infer it automatically.
+        kwargs (`dict`, *args*):
+            Additional arguments to pass to the method.
+
+            - **adapter_name**: For example, in case of PEFT, some keys will be pre-pended
+                with the adapter name, therefore needs a special handling. By default PEFT also takes care of that in
+                `get_peft_model_state_dict` method:
+                https://github.com/huggingface/peft/blob/ba0477f2985b1ba311b83459d29895c809404e99/src/peft/utils/save_and_load.py#L92
+                but we add it here in case we don't want to rely on that method.
+    """
+    try:
+        import torch
+    except ImportError:
+        logger.error("Converting PEFT state dicts to Kohya requires torch to be installed.")
+        raise
+
+    peft_adapter_name = kwargs.pop("adapter_name", None)
+    if peft_adapter_name is not None:
+        peft_adapter_name = "." + peft_adapter_name
+    else:
+        peft_adapter_name = ""
+
+    if original_type is None:
+        if any(f".lora_A{peft_adapter_name}.weight" in k for k in state_dict.keys()):
+            original_type = StateDictType.PEFT
+
+    if original_type not in KOHYA_STATE_DICT_MAPPINGS.keys():
+        raise ValueError(f"Original type {original_type} is not supported")
+
+    # Use the convert_state_dict function with the appropriate mapping
+    kohya_ss_partial_state_dict = convert_state_dict(state_dict, KOHYA_STATE_DICT_MAPPINGS[StateDictType.PEFT])
+    kohya_ss_state_dict = {}
+
+    # Additional logic for replacing header, alpha parameters `.` with `_` in all keys
+    for kohya_key, weight in kohya_ss_partial_state_dict.items():
+        if "text_encoder_2." in kohya_key:
+            kohya_key = kohya_key.replace("text_encoder_2.", "lora_te2.")
+        elif "text_encoder." in kohya_key:
+            kohya_key = kohya_key.replace("text_encoder.", "lora_te1.")
+        elif "unet" in kohya_key:
+            kohya_key = kohya_key.replace("unet", "lora_unet")
+        elif "lora_magnitude_vector" in kohya_key:
+            kohya_key = kohya_key.replace("lora_magnitude_vector", "dora_scale")
+
+        kohya_key = kohya_key.replace(".", "_", kohya_key.count(".") - 2)
+        kohya_key = kohya_key.replace(peft_adapter_name, "")  # Kohya doesn't take names
+        kohya_ss_state_dict[kohya_key] = weight
+        if "lora_down" in kohya_key:
+            alpha_key = f'{kohya_key.split(".")[0]}.alpha'
+            kohya_ss_state_dict[alpha_key] = torch.tensor(len(weight))
+
+    return kohya_ss_state_dict
diff --git a/diffusers/src/diffusers/utils/testing_utils.py b/diffusers/src/diffusers/utils/testing_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..be3e9983c80ffa7c16a4666de5f624a7ffb9a591
--- /dev/null
+++ b/diffusers/src/diffusers/utils/testing_utils.py
@@ -0,0 +1,1035 @@
+import functools
+import importlib
+import inspect
+import io
+import logging
+import multiprocessing
+import os
+import random
+import re
+import struct
+import sys
+import tempfile
+import time
+import unittest
+import urllib.parse
+from contextlib import contextmanager
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import PIL.ImageOps
+import requests
+from numpy.linalg import norm
+from packaging import version
+
+from .import_utils import (
+    BACKENDS_MAPPING,
+    is_compel_available,
+    is_flax_available,
+    is_note_seq_available,
+    is_onnx_available,
+    is_opencv_available,
+    is_peft_available,
+    is_timm_available,
+    is_torch_available,
+    is_torch_version,
+    is_torchsde_available,
+    is_transformers_available,
+)
+from .logging import get_logger
+
+
+global_rng = random.Random()
+
+logger = get_logger(__name__)
+
+_required_peft_version = is_peft_available() and version.parse(
+    version.parse(importlib.metadata.version("peft")).base_version
+) > version.parse("0.5")
+_required_transformers_version = is_transformers_available() and version.parse(
+    version.parse(importlib.metadata.version("transformers")).base_version
+) > version.parse("4.33")
+
+USE_PEFT_BACKEND = _required_peft_version and _required_transformers_version
+
+if is_torch_available():
+    import torch
+
+    # Set a backend environment variable for any extra module import required for a custom accelerator
+    if "DIFFUSERS_TEST_BACKEND" in os.environ:
+        backend = os.environ["DIFFUSERS_TEST_BACKEND"]
+        try:
+            _ = importlib.import_module(backend)
+        except ModuleNotFoundError as e:
+            raise ModuleNotFoundError(
+                f"Failed to import `DIFFUSERS_TEST_BACKEND` '{backend}'! This should be the name of an installed module \
+                    to enable a specified backend.):\n{e}"
+            ) from e
+
+    if "DIFFUSERS_TEST_DEVICE" in os.environ:
+        torch_device = os.environ["DIFFUSERS_TEST_DEVICE"]
+        try:
+            # try creating device to see if provided device is valid
+            _ = torch.device(torch_device)
+        except RuntimeError as e:
+            raise RuntimeError(
+                f"Unknown testing device specified by environment variable `DIFFUSERS_TEST_DEVICE`: {torch_device}"
+            ) from e
+        logger.info(f"torch_device overrode to {torch_device}")
+    else:
+        torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+        is_torch_higher_equal_than_1_12 = version.parse(
+            version.parse(torch.__version__).base_version
+        ) >= version.parse("1.12")
+
+        if is_torch_higher_equal_than_1_12:
+            # Some builds of torch 1.12 don't have the mps backend registered. See #892 for more details
+            mps_backend_registered = hasattr(torch.backends, "mps")
+            torch_device = "mps" if (mps_backend_registered and torch.backends.mps.is_available()) else torch_device
+
+
+def torch_all_close(a, b, *args, **kwargs):
+    if not is_torch_available():
+        raise ValueError("PyTorch needs to be installed to use this function.")
+    if not torch.allclose(a, b, *args, **kwargs):
+        assert False, f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}."
+    return True
+
+
+def numpy_cosine_similarity_distance(a, b):
+    similarity = np.dot(a, b) / (norm(a) * norm(b))
+    distance = 1.0 - similarity.mean()
+
+    return distance
+
+
+def print_tensor_test(
+    tensor,
+    limit_to_slices=None,
+    max_torch_print=None,
+    filename="test_corrections.txt",
+    expected_tensor_name="expected_slice",
+):
+    if max_torch_print:
+        torch.set_printoptions(threshold=10_000)
+
+    test_name = os.environ.get("PYTEST_CURRENT_TEST")
+    if not torch.is_tensor(tensor):
+        tensor = torch.from_numpy(tensor)
+    if limit_to_slices:
+        tensor = tensor[0, -3:, -3:, -1]
+
+    tensor_str = str(tensor.detach().cpu().flatten().to(torch.float32)).replace("\n", "")
+    # format is usually:
+    # expected_slice = np.array([-0.5713, -0.3018, -0.9814, 0.04663, -0.879, 0.76, -1.734, 0.1044, 1.161])
+    output_str = tensor_str.replace("tensor", f"{expected_tensor_name} = np.array")
+    test_file, test_class, test_fn = test_name.split("::")
+    test_fn = test_fn.split()[0]
+    with open(filename, "a") as f:
+        print("::".join([test_file, test_class, test_fn, output_str]), file=f)
+
+
+def get_tests_dir(append_path=None):
+    """
+    Args:
+        append_path: optional path to append to the tests dir path
+    Return:
+        The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is
+        joined after the `tests` dir the former is provided.
+    """
+    # this function caller's __file__
+    caller__file__ = inspect.stack()[1][1]
+    tests_dir = os.path.abspath(os.path.dirname(caller__file__))
+
+    while not tests_dir.endswith("tests"):
+        tests_dir = os.path.dirname(tests_dir)
+
+    if append_path:
+        return Path(tests_dir, append_path).as_posix()
+    else:
+        return tests_dir
+
+
+# Taken from the following PR:
+# https://github.com/huggingface/accelerate/pull/1964
+def str_to_bool(value) -> int:
+    """
+    Converts a string representation of truth to `True` (1) or `False` (0). True values are `y`, `yes`, `t`, `true`,
+    `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`;
+    """
+    value = value.lower()
+    if value in ("y", "yes", "t", "true", "on", "1"):
+        return 1
+    elif value in ("n", "no", "f", "false", "off", "0"):
+        return 0
+    else:
+        raise ValueError(f"invalid truth value {value}")
+
+
+def parse_flag_from_env(key, default=False):
+    try:
+        value = os.environ[key]
+    except KeyError:
+        # KEY isn't set, default to `default`.
+        _value = default
+    else:
+        # KEY is set, convert it to True or False.
+        try:
+            _value = str_to_bool(value)
+        except ValueError:
+            # More values are supported, but let's keep the message simple.
+            raise ValueError(f"If set, {key} must be yes or no.")
+    return _value
+
+
+_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
+_run_nightly_tests = parse_flag_from_env("RUN_NIGHTLY", default=False)
+_run_compile_tests = parse_flag_from_env("RUN_COMPILE", default=False)
+
+
+def floats_tensor(shape, scale=1.0, rng=None, name=None):
+    """Creates a random float32 tensor"""
+    if rng is None:
+        rng = global_rng
+
+    total_dims = 1
+    for dim in shape:
+        total_dims *= dim
+
+    values = []
+    for _ in range(total_dims):
+        values.append(rng.random() * scale)
+
+    return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
+
+
+def slow(test_case):
+    """
+    Decorator marking a test as slow.
+
+    Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them.
+
+    """
+    return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case)
+
+
+def nightly(test_case):
+    """
+    Decorator marking a test that runs nightly in the diffusers CI.
+
+    Slow tests are skipped by default. Set the RUN_NIGHTLY environment variable to a truthy value to run them.
+
+    """
+    return unittest.skipUnless(_run_nightly_tests, "test is nightly")(test_case)
+
+
+def is_torch_compile(test_case):
+    """
+    Decorator marking a test that runs compile tests in the diffusers CI.
+
+    Compile tests are skipped by default. Set the RUN_COMPILE environment variable to a truthy value to run them.
+
+    """
+    return unittest.skipUnless(_run_compile_tests, "test is torch compile")(test_case)
+
+
+def require_torch(test_case):
+    """
+    Decorator marking a test that requires PyTorch. These tests are skipped when PyTorch isn't installed.
+    """
+    return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)
+
+
+def require_torch_2(test_case):
+    """
+    Decorator marking a test that requires PyTorch 2. These tests are skipped when it isn't installed.
+    """
+    return unittest.skipUnless(is_torch_available() and is_torch_version(">=", "2.0.0"), "test requires PyTorch 2")(
+        test_case
+    )
+
+
+def require_torch_gpu(test_case):
+    """Decorator marking a test that requires CUDA and PyTorch."""
+    return unittest.skipUnless(is_torch_available() and torch_device == "cuda", "test requires PyTorch+CUDA")(
+        test_case
+    )
+
+
+# These decorators are for accelerator-specific behaviours that are not GPU-specific
+def require_torch_accelerator(test_case):
+    """Decorator marking a test that requires an accelerator backend and PyTorch."""
+    return unittest.skipUnless(is_torch_available() and torch_device != "cpu", "test requires accelerator+PyTorch")(
+        test_case
+    )
+
+
+def require_torch_multi_gpu(test_case):
+    """
+    Decorator marking a test that requires a multi-GPU setup (in PyTorch). These tests are skipped on a machine without
+    multiple GPUs. To run *only* the multi_gpu tests, assuming all test names contain multi_gpu: $ pytest -sv ./tests
+    -k "multi_gpu"
+    """
+    if not is_torch_available():
+        return unittest.skip("test requires PyTorch")(test_case)
+
+    import torch
+
+    return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case)
+
+
+def require_torch_accelerator_with_fp16(test_case):
+    """Decorator marking a test that requires an accelerator with support for the FP16 data type."""
+    return unittest.skipUnless(_is_torch_fp16_available(torch_device), "test requires accelerator with fp16 support")(
+        test_case
+    )
+
+
+def require_torch_accelerator_with_fp64(test_case):
+    """Decorator marking a test that requires an accelerator with support for the FP64 data type."""
+    return unittest.skipUnless(_is_torch_fp64_available(torch_device), "test requires accelerator with fp64 support")(
+        test_case
+    )
+
+
+def require_torch_accelerator_with_training(test_case):
+    """Decorator marking a test that requires an accelerator with support for training."""
+    return unittest.skipUnless(
+        is_torch_available() and backend_supports_training(torch_device),
+        "test requires accelerator with training support",
+    )(test_case)
+
+
+def skip_mps(test_case):
+    """Decorator marking a test to skip if torch_device is 'mps'"""
+    return unittest.skipUnless(torch_device != "mps", "test requires non 'mps' device")(test_case)
+
+
+def require_flax(test_case):
+    """
+    Decorator marking a test that requires JAX & Flax. These tests are skipped when one / both are not installed
+    """
+    return unittest.skipUnless(is_flax_available(), "test requires JAX & Flax")(test_case)
+
+
+def require_compel(test_case):
+    """
+    Decorator marking a test that requires compel: https://github.com/damian0815/compel. These tests are skipped when
+    the library is not installed.
+    """
+    return unittest.skipUnless(is_compel_available(), "test requires compel")(test_case)
+
+
+def require_onnxruntime(test_case):
+    """
+    Decorator marking a test that requires onnxruntime. These tests are skipped when onnxruntime isn't installed.
+    """
+    return unittest.skipUnless(is_onnx_available(), "test requires onnxruntime")(test_case)
+
+
+def require_note_seq(test_case):
+    """
+    Decorator marking a test that requires note_seq. These tests are skipped when note_seq isn't installed.
+    """
+    return unittest.skipUnless(is_note_seq_available(), "test requires note_seq")(test_case)
+
+
+def require_torchsde(test_case):
+    """
+    Decorator marking a test that requires torchsde. These tests are skipped when torchsde isn't installed.
+    """
+    return unittest.skipUnless(is_torchsde_available(), "test requires torchsde")(test_case)
+
+
+def require_peft_backend(test_case):
+    """
+    Decorator marking a test that requires PEFT backend, this would require some specific versions of PEFT and
+    transformers.
+    """
+    return unittest.skipUnless(USE_PEFT_BACKEND, "test requires PEFT backend")(test_case)
+
+
+def require_timm(test_case):
+    """
+    Decorator marking a test that requires timm. These tests are skipped when timm isn't installed.
+    """
+    return unittest.skipUnless(is_timm_available(), "test requires timm")(test_case)
+
+
+def require_peft_version_greater(peft_version):
+    """
+    Decorator marking a test that requires PEFT backend with a specific version, this would require some specific
+    versions of PEFT and transformers.
+    """
+
+    def decorator(test_case):
+        correct_peft_version = is_peft_available() and version.parse(
+            version.parse(importlib.metadata.version("peft")).base_version
+        ) > version.parse(peft_version)
+        return unittest.skipUnless(
+            correct_peft_version, f"test requires PEFT backend with the version greater than {peft_version}"
+        )(test_case)
+
+    return decorator
+
+
+def require_accelerate_version_greater(accelerate_version):
+    def decorator(test_case):
+        correct_accelerate_version = is_peft_available() and version.parse(
+            version.parse(importlib.metadata.version("accelerate")).base_version
+        ) > version.parse(accelerate_version)
+        return unittest.skipUnless(
+            correct_accelerate_version, f"Test requires accelerate with the version greater than {accelerate_version}."
+        )(test_case)
+
+    return decorator
+
+
+def deprecate_after_peft_backend(test_case):
+    """
+    Decorator marking a test that will be skipped after PEFT backend
+    """
+    return unittest.skipUnless(not USE_PEFT_BACKEND, "test skipped in favor of PEFT backend")(test_case)
+
+
+def get_python_version():
+    sys_info = sys.version_info
+    major, minor = sys_info.major, sys_info.minor
+    return major, minor
+
+
+def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -> np.ndarray:
+    if isinstance(arry, str):
+        if local_path is not None:
+            # local_path can be passed to correct images of tests
+            return Path(local_path, arry.split("/")[-5], arry.split("/")[-2], arry.split("/")[-1]).as_posix()
+        elif arry.startswith("http://") or arry.startswith("https://"):
+            response = requests.get(arry)
+            response.raise_for_status()
+            arry = np.load(BytesIO(response.content))
+        elif os.path.isfile(arry):
+            arry = np.load(arry)
+        else:
+            raise ValueError(
+                f"Incorrect path or url, URLs must start with `http://` or `https://`, and {arry} is not a valid path"
+            )
+    elif isinstance(arry, np.ndarray):
+        pass
+    else:
+        raise ValueError(
+            "Incorrect format used for numpy ndarray. Should be an url linking to an image, a local path, or a"
+            " ndarray."
+        )
+
+    return arry
+
+
+def load_pt(url: str):
+    response = requests.get(url)
+    response.raise_for_status()
+    arry = torch.load(BytesIO(response.content))
+    return arry
+
+
+def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
+    """
+    Loads `image` to a PIL Image.
+
+    Args:
+        image (`str` or `PIL.Image.Image`):
+            The image to convert to the PIL Image format.
+    Returns:
+        `PIL.Image.Image`:
+            A PIL Image.
+    """
+    if isinstance(image, str):
+        if image.startswith("http://") or image.startswith("https://"):
+            image = PIL.Image.open(requests.get(image, stream=True).raw)
+        elif os.path.isfile(image):
+            image = PIL.Image.open(image)
+        else:
+            raise ValueError(
+                f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path"
+            )
+    elif isinstance(image, PIL.Image.Image):
+        image = image
+    else:
+        raise ValueError(
+            "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image."
+        )
+    image = PIL.ImageOps.exif_transpose(image)
+    image = image.convert("RGB")
+    return image
+
+
+def preprocess_image(image: PIL.Image, batch_size: int):
+    w, h = image.size
+    w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+def export_to_gif(image: List[PIL.Image.Image], output_gif_path: str = None) -> str:
+    if output_gif_path is None:
+        output_gif_path = tempfile.NamedTemporaryFile(suffix=".gif").name
+
+    image[0].save(
+        output_gif_path,
+        save_all=True,
+        append_images=image[1:],
+        optimize=False,
+        duration=100,
+        loop=0,
+    )
+    return output_gif_path
+
+
+@contextmanager
+def buffered_writer(raw_f):
+    f = io.BufferedWriter(raw_f)
+    yield f
+    f.flush()
+
+
+def export_to_ply(mesh, output_ply_path: str = None):
+    """
+    Write a PLY file for a mesh.
+    """
+    if output_ply_path is None:
+        output_ply_path = tempfile.NamedTemporaryFile(suffix=".ply").name
+
+    coords = mesh.verts.detach().cpu().numpy()
+    faces = mesh.faces.cpu().numpy()
+    rgb = np.stack([mesh.vertex_channels[x].detach().cpu().numpy() for x in "RGB"], axis=1)
+
+    with buffered_writer(open(output_ply_path, "wb")) as f:
+        f.write(b"ply\n")
+        f.write(b"format binary_little_endian 1.0\n")
+        f.write(bytes(f"element vertex {len(coords)}\n", "ascii"))
+        f.write(b"property float x\n")
+        f.write(b"property float y\n")
+        f.write(b"property float z\n")
+        if rgb is not None:
+            f.write(b"property uchar red\n")
+            f.write(b"property uchar green\n")
+            f.write(b"property uchar blue\n")
+        if faces is not None:
+            f.write(bytes(f"element face {len(faces)}\n", "ascii"))
+            f.write(b"property list uchar int vertex_index\n")
+        f.write(b"end_header\n")
+
+        if rgb is not None:
+            rgb = (rgb * 255.499).round().astype(int)
+            vertices = [
+                (*coord, *rgb)
+                for coord, rgb in zip(
+                    coords.tolist(),
+                    rgb.tolist(),
+                )
+            ]
+            format = struct.Struct("<3f3B")
+            for item in vertices:
+                f.write(format.pack(*item))
+        else:
+            format = struct.Struct("<3f")
+            for vertex in coords.tolist():
+                f.write(format.pack(*vertex))
+
+        if faces is not None:
+            format = struct.Struct("<B3I")
+            for tri in faces.tolist():
+                f.write(format.pack(len(tri), *tri))
+
+    return output_ply_path
+
+
+def export_to_obj(mesh, output_obj_path: str = None):
+    if output_obj_path is None:
+        output_obj_path = tempfile.NamedTemporaryFile(suffix=".obj").name
+
+    verts = mesh.verts.detach().cpu().numpy()
+    faces = mesh.faces.cpu().numpy()
+
+    vertex_colors = np.stack([mesh.vertex_channels[x].detach().cpu().numpy() for x in "RGB"], axis=1)
+    vertices = [
+        "{} {} {} {} {} {}".format(*coord, *color) for coord, color in zip(verts.tolist(), vertex_colors.tolist())
+    ]
+
+    faces = ["f {} {} {}".format(str(tri[0] + 1), str(tri[1] + 1), str(tri[2] + 1)) for tri in faces.tolist()]
+
+    combined_data = ["v " + vertex for vertex in vertices] + faces
+
+    with open(output_obj_path, "w") as f:
+        f.writelines("\n".join(combined_data))
+
+
+def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str:
+    if is_opencv_available():
+        import cv2
+    else:
+        raise ImportError(BACKENDS_MAPPING["opencv"][1].format("export_to_video"))
+    if output_video_path is None:
+        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
+
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    h, w, c = video_frames[0].shape
+    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=8, frameSize=(w, h))
+    for i in range(len(video_frames)):
+        img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR)
+        video_writer.write(img)
+    return output_video_path
+
+
+def load_hf_numpy(path) -> np.ndarray:
+    base_url = "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main"
+
+    if not path.startswith("http://") and not path.startswith("https://"):
+        path = os.path.join(base_url, urllib.parse.quote(path))
+
+    return load_numpy(path)
+
+
+# --- pytest conf functions --- #
+
+# to avoid multiple invocation from tests/conftest.py and examples/conftest.py - make sure it's called only once
+pytest_opt_registered = {}
+
+
+def pytest_addoption_shared(parser):
+    """
+    This function is to be called from `conftest.py` via `pytest_addoption` wrapper that has to be defined there.
+
+    It allows loading both `conftest.py` files at once without causing a failure due to adding the same `pytest`
+    option.
+
+    """
+    option = "--make-reports"
+    if option not in pytest_opt_registered:
+        parser.addoption(
+            option,
+            action="store",
+            default=False,
+            help="generate report files. The value of this option is used as a prefix to report names",
+        )
+        pytest_opt_registered[option] = 1
+
+
+def pytest_terminal_summary_main(tr, id):
+    """
+    Generate multiple reports at the end of test suite run - each report goes into a dedicated file in the current
+    directory. The report files are prefixed with the test suite name.
+
+    This function emulates --duration and -rA pytest arguments.
+
+    This function is to be called from `conftest.py` via `pytest_terminal_summary` wrapper that has to be defined
+    there.
+
+    Args:
+    - tr: `terminalreporter` passed from `conftest.py`
+    - id: unique id like `tests` or `examples` that will be incorporated into the final reports filenames - this is
+      needed as some jobs have multiple runs of pytest, so we can't have them overwrite each other.
+
+    NB: this functions taps into a private _pytest API and while unlikely, it could break should
+    pytest do internal changes - also it calls default internal methods of terminalreporter which
+    can be hijacked by various `pytest-` plugins and interfere.
+
+    """
+    from _pytest.config import create_terminal_writer
+
+    if not len(id):
+        id = "tests"
+
+    config = tr.config
+    orig_writer = config.get_terminal_writer()
+    orig_tbstyle = config.option.tbstyle
+    orig_reportchars = tr.reportchars
+
+    dir = "reports"
+    Path(dir).mkdir(parents=True, exist_ok=True)
+    report_files = {
+        k: f"{dir}/{id}_{k}.txt"
+        for k in [
+            "durations",
+            "errors",
+            "failures_long",
+            "failures_short",
+            "failures_line",
+            "passes",
+            "stats",
+            "summary_short",
+            "warnings",
+        ]
+    }
+
+    # custom durations report
+    # note: there is no need to call pytest --durations=XX to get this separate report
+    # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/runner.py#L66
+    dlist = []
+    for replist in tr.stats.values():
+        for rep in replist:
+            if hasattr(rep, "duration"):
+                dlist.append(rep)
+    if dlist:
+        dlist.sort(key=lambda x: x.duration, reverse=True)
+        with open(report_files["durations"], "w") as f:
+            durations_min = 0.05  # sec
+            f.write("slowest durations\n")
+            for i, rep in enumerate(dlist):
+                if rep.duration < durations_min:
+                    f.write(f"{len(dlist)-i} durations < {durations_min} secs were omitted")
+                    break
+                f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n")
+
+    def summary_failures_short(tr):
+        # expecting that the reports were --tb=long (default) so we chop them off here to the last frame
+        reports = tr.getreports("failed")
+        if not reports:
+            return
+        tr.write_sep("=", "FAILURES SHORT STACK")
+        for rep in reports:
+            msg = tr._getfailureheadline(rep)
+            tr.write_sep("_", msg, red=True, bold=True)
+            # chop off the optional leading extra frames, leaving only the last one
+            longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S)
+            tr._tw.line(longrepr)
+            # note: not printing out any rep.sections to keep the report short
+
+    # use ready-made report funcs, we are just hijacking the filehandle to log to a dedicated file each
+    # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/terminal.py#L814
+    # note: some pytest plugins may interfere by hijacking the default `terminalreporter` (e.g.
+    # pytest-instafail does that)
+
+    # report failures with line/short/long styles
+    config.option.tbstyle = "auto"  # full tb
+    with open(report_files["failures_long"], "w") as f:
+        tr._tw = create_terminal_writer(config, f)
+        tr.summary_failures()
+
+    # config.option.tbstyle = "short" # short tb
+    with open(report_files["failures_short"], "w") as f:
+        tr._tw = create_terminal_writer(config, f)
+        summary_failures_short(tr)
+
+    config.option.tbstyle = "line"  # one line per error
+    with open(report_files["failures_line"], "w") as f:
+        tr._tw = create_terminal_writer(config, f)
+        tr.summary_failures()
+
+    with open(report_files["errors"], "w") as f:
+        tr._tw = create_terminal_writer(config, f)
+        tr.summary_errors()
+
+    with open(report_files["warnings"], "w") as f:
+        tr._tw = create_terminal_writer(config, f)
+        tr.summary_warnings()  # normal warnings
+        tr.summary_warnings()  # final warnings
+
+    tr.reportchars = "wPpsxXEf"  # emulate -rA (used in summary_passes() and short_test_summary())
+    with open(report_files["passes"], "w") as f:
+        tr._tw = create_terminal_writer(config, f)
+        tr.summary_passes()
+
+    with open(report_files["summary_short"], "w") as f:
+        tr._tw = create_terminal_writer(config, f)
+        tr.short_test_summary()
+
+    with open(report_files["stats"], "w") as f:
+        tr._tw = create_terminal_writer(config, f)
+        tr.summary_stats()
+
+    # restore:
+    tr._tw = orig_writer
+    tr.reportchars = orig_reportchars
+    config.option.tbstyle = orig_tbstyle
+
+
+# Copied from https://github.com/huggingface/transformers/blob/000e52aec8850d3fe2f360adc6fd256e5b47fe4c/src/transformers/testing_utils.py#L1905
+def is_flaky(max_attempts: int = 5, wait_before_retry: Optional[float] = None, description: Optional[str] = None):
+    """
+    To decorate flaky tests. They will be retried on failures.
+
+    Args:
+        max_attempts (`int`, *optional*, defaults to 5):
+            The maximum number of attempts to retry the flaky test.
+        wait_before_retry (`float`, *optional*):
+            If provided, will wait that number of seconds before retrying the test.
+        description (`str`, *optional*):
+            A string to describe the situation (what / where / why is flaky, link to GH issue/PR comments, errors,
+            etc.)
+    """
+
+    def decorator(test_func_ref):
+        @functools.wraps(test_func_ref)
+        def wrapper(*args, **kwargs):
+            retry_count = 1
+
+            while retry_count < max_attempts:
+                try:
+                    return test_func_ref(*args, **kwargs)
+
+                except Exception as err:
+                    print(f"Test failed with {err} at try {retry_count}/{max_attempts}.", file=sys.stderr)
+                    if wait_before_retry is not None:
+                        time.sleep(wait_before_retry)
+                    retry_count += 1
+
+            return test_func_ref(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
+# Taken from: https://github.com/huggingface/transformers/blob/3658488ff77ff8d45101293e749263acf437f4d5/src/transformers/testing_utils.py#L1787
+def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None):
+    """
+    To run a test in a subprocess. In particular, this can avoid (GPU) memory issue.
+
+    Args:
+        test_case (`unittest.TestCase`):
+            The test that will run `target_func`.
+        target_func (`Callable`):
+            The function implementing the actual testing logic.
+        inputs (`dict`, *optional*, defaults to `None`):
+            The inputs that will be passed to `target_func` through an (input) queue.
+        timeout (`int`, *optional*, defaults to `None`):
+            The timeout (in seconds) that will be passed to the input and output queues. If not specified, the env.
+            variable `PYTEST_TIMEOUT` will be checked. If still `None`, its value will be set to `600`.
+    """
+    if timeout is None:
+        timeout = int(os.environ.get("PYTEST_TIMEOUT", 600))
+
+    start_methohd = "spawn"
+    ctx = multiprocessing.get_context(start_methohd)
+
+    input_queue = ctx.Queue(1)
+    output_queue = ctx.JoinableQueue(1)
+
+    # We can't send `unittest.TestCase` to the child, otherwise we get issues regarding pickle.
+    input_queue.put(inputs, timeout=timeout)
+
+    process = ctx.Process(target=target_func, args=(input_queue, output_queue, timeout))
+    process.start()
+    # Kill the child process if we can't get outputs from it in time: otherwise, the hanging subprocess prevents
+    # the test to exit properly.
+    try:
+        results = output_queue.get(timeout=timeout)
+        output_queue.task_done()
+    except Exception as e:
+        process.terminate()
+        test_case.fail(e)
+    process.join(timeout=timeout)
+
+    if results["error"] is not None:
+        test_case.fail(f'{results["error"]}')
+
+
+class CaptureLogger:
+    """
+    Args:
+    Context manager to capture `logging` streams
+        logger: 'logging` logger object
+    Returns:
+        The captured output is available via `self.out`
+    Example:
+    ```python
+    >>> from diffusers import logging
+    >>> from diffusers.testing_utils import CaptureLogger
+
+    >>> msg = "Testing 1, 2, 3"
+    >>> logging.set_verbosity_info()
+    >>> logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.py")
+    >>> with CaptureLogger(logger) as cl:
+    ...     logger.info(msg)
+    >>> assert cl.out, msg + "\n"
+    ```
+    """
+
+    def __init__(self, logger):
+        self.logger = logger
+        self.io = StringIO()
+        self.sh = logging.StreamHandler(self.io)
+        self.out = ""
+
+    def __enter__(self):
+        self.logger.addHandler(self.sh)
+        return self
+
+    def __exit__(self, *exc):
+        self.logger.removeHandler(self.sh)
+        self.out = self.io.getvalue()
+
+    def __repr__(self):
+        return f"captured: {self.out}\n"
+
+
+def enable_full_determinism():
+    """
+    Helper function for reproducible behavior during distributed training. See
+    - https://pytorch.org/docs/stable/notes/randomness.html for pytorch
+    """
+    #  Enable PyTorch deterministic mode. This potentially requires either the environment
+    #  variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set,
+    # depending on the CUDA version, so we set them both here
+    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+    torch.use_deterministic_algorithms(True)
+
+    # Enable CUDNN deterministic mode
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cuda.matmul.allow_tf32 = False
+
+
+def disable_full_determinism():
+    os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
+    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ""
+    torch.use_deterministic_algorithms(False)
+
+
+# Utils for custom and alternative accelerator devices
+def _is_torch_fp16_available(device):
+    if not is_torch_available():
+        return False
+
+    import torch
+
+    device = torch.device(device)
+
+    try:
+        x = torch.zeros((2, 2), dtype=torch.float16).to(device)
+        _ = torch.mul(x, x)
+        return True
+
+    except Exception as e:
+        if device.type == "cuda":
+            raise ValueError(
+                f"You have passed a device of type 'cuda' which should work with 'fp16', but 'cuda' does not seem to be correctly installed on your machine: {e}"
+            )
+
+        return False
+
+
+def _is_torch_fp64_available(device):
+    if not is_torch_available():
+        return False
+
+    import torch
+
+    device = torch.device(device)
+
+    try:
+        x = torch.zeros((2, 2), dtype=torch.float64).to(device)
+        _ = torch.mul(x, x)
+        return True
+
+    except Exception as e:
+        if device.type == "cuda":
+            raise ValueError(
+                f"You have passed a device of type 'cuda' which should work with 'fp64', but 'cuda' does not seem to be correctly installed on your machine: {e}"
+            )
+
+        return False
+
+
+# Guard these lookups for when Torch is not used - alternative accelerator support is for PyTorch
+if is_torch_available():
+    # Behaviour flags
+    BACKEND_SUPPORTS_TRAINING = {"cuda": True, "cpu": True, "mps": False, "default": True}
+
+    # Function definitions
+    BACKEND_EMPTY_CACHE = {"cuda": torch.cuda.empty_cache, "cpu": None, "mps": None, "default": None}
+    BACKEND_DEVICE_COUNT = {"cuda": torch.cuda.device_count, "cpu": lambda: 0, "mps": lambda: 0, "default": 0}
+    BACKEND_MANUAL_SEED = {"cuda": torch.cuda.manual_seed, "cpu": torch.manual_seed, "default": torch.manual_seed}
+
+
+# This dispatches a defined function according to the accelerator from the function definitions.
+def _device_agnostic_dispatch(device: str, dispatch_table: Dict[str, Callable], *args, **kwargs):
+    if device not in dispatch_table:
+        return dispatch_table["default"](*args, **kwargs)
+
+    fn = dispatch_table[device]
+
+    # Some device agnostic functions return values. Need to guard against 'None' instead at
+    # user level
+    if fn is None:
+        return None
+
+    return fn(*args, **kwargs)
+
+
+# These are callables which automatically dispatch the function specific to the accelerator
+def backend_manual_seed(device: str, seed: int):
+    return _device_agnostic_dispatch(device, BACKEND_MANUAL_SEED, seed)
+
+
+def backend_empty_cache(device: str):
+    return _device_agnostic_dispatch(device, BACKEND_EMPTY_CACHE)
+
+
+def backend_device_count(device: str):
+    return _device_agnostic_dispatch(device, BACKEND_DEVICE_COUNT)
+
+
+# These are callables which return boolean behaviour flags and can be used to specify some
+# device agnostic alternative where the feature is unsupported.
+def backend_supports_training(device: str):
+    if not is_torch_available():
+        return False
+
+    if device not in BACKEND_SUPPORTS_TRAINING:
+        device = "default"
+
+    return BACKEND_SUPPORTS_TRAINING[device]
+
+
+# Guard for when Torch is not available
+if is_torch_available():
+    # Update device function dict mapping
+    def update_mapping_from_spec(device_fn_dict: Dict[str, Callable], attribute_name: str):
+        try:
+            # Try to import the function directly
+            spec_fn = getattr(device_spec_module, attribute_name)
+            device_fn_dict[torch_device] = spec_fn
+        except AttributeError as e:
+            # If the function doesn't exist, and there is no default, throw an error
+            if "default" not in device_fn_dict:
+                raise AttributeError(
+                    f"`{attribute_name}` not found in '{device_spec_path}' and no default fallback function found."
+                ) from e
+
+    if "DIFFUSERS_TEST_DEVICE_SPEC" in os.environ:
+        device_spec_path = os.environ["DIFFUSERS_TEST_DEVICE_SPEC"]
+        if not Path(device_spec_path).is_file():
+            raise ValueError(f"Specified path to device specification file is not found. Received {device_spec_path}")
+
+        try:
+            import_name = device_spec_path[: device_spec_path.index(".py")]
+        except ValueError as e:
+            raise ValueError(f"Provided device spec file is not a Python file! Received {device_spec_path}") from e
+
+        device_spec_module = importlib.import_module(import_name)
+
+        try:
+            device_name = device_spec_module.DEVICE_NAME
+        except AttributeError:
+            raise AttributeError("Device spec file did not contain `DEVICE_NAME`")
+
+        if "DIFFUSERS_TEST_DEVICE" in os.environ and torch_device != device_name:
+            msg = f"Mismatch between environment variable `DIFFUSERS_TEST_DEVICE` '{torch_device}' and device found in spec '{device_name}'\n"
+            msg += "Either unset `DIFFUSERS_TEST_DEVICE` or ensure it matches device spec name."
+            raise ValueError(msg)
+
+        torch_device = device_name
+
+        # Add one entry here for each `BACKEND_*` dictionary.
+        update_mapping_from_spec(BACKEND_MANUAL_SEED, "MANUAL_SEED_FN")
+        update_mapping_from_spec(BACKEND_EMPTY_CACHE, "EMPTY_CACHE_FN")
+        update_mapping_from_spec(BACKEND_DEVICE_COUNT, "DEVICE_COUNT_FN")
+        update_mapping_from_spec(BACKEND_SUPPORTS_TRAINING, "SUPPORTS_TRAINING")
diff --git a/diffusers/src/diffusers/utils/torch_utils.py b/diffusers/src/diffusers/utils/torch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cf75b4fad4e17e416e5a3f72a25b1f2b3702fe1
--- /dev/null
+++ b/diffusers/src/diffusers/utils/torch_utils.py
@@ -0,0 +1,148 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+PyTorch utilities: Utilities related to PyTorch
+"""
+
+from typing import List, Optional, Tuple, Union
+
+from . import logging
+from .import_utils import is_torch_available, is_torch_version
+
+
+if is_torch_available():
+    import torch
+    from torch.fft import fftn, fftshift, ifftn, ifftshift
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+try:
+    from torch._dynamo import allow_in_graph as maybe_allow_in_graph
+except (ImportError, ModuleNotFoundError):
+
+    def maybe_allow_in_graph(cls):
+        return cls
+
+
+def randn_tensor(
+    shape: Union[Tuple, List],
+    generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None,
+    device: Optional["torch.device"] = None,
+    dtype: Optional["torch.dtype"] = None,
+    layout: Optional["torch.layout"] = None,
+):
+    """A helper function to create random tensors on the desired `device` with the desired `dtype`. When
+    passing a list of generators, you can seed each batch size individually. If CPU generators are passed, the tensor
+    is always created on the CPU.
+    """
+    # device on which tensor is created defaults to device
+    rand_device = device
+    batch_size = shape[0]
+
+    layout = layout or torch.strided
+    device = device or torch.device("cpu")
+
+    if generator is not None:
+        gen_device_type = generator.device.type if not isinstance(generator, list) else generator[0].device.type
+        if gen_device_type != device.type and gen_device_type == "cpu":
+            rand_device = "cpu"
+            if device != "mps":
+                logger.info(
+                    f"The passed generator was created on 'cpu' even though a tensor on {device} was expected."
+                    f" Tensors will be created on 'cpu' and then moved to {device}. Note that one can probably"
+                    f" slighly speed up this function by passing a generator that was created on the {device} device."
+                )
+        elif gen_device_type != device.type and gen_device_type == "cuda":
+            raise ValueError(f"Cannot generate a {device} tensor from a generator of type {gen_device_type}.")
+
+    # make sure generator list of length 1 is treated like a non-list
+    if isinstance(generator, list) and len(generator) == 1:
+        generator = generator[0]
+
+    if isinstance(generator, list):
+        shape = (1,) + shape[1:]
+        latents = [
+            torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype, layout=layout)
+            for i in range(batch_size)
+        ]
+        latents = torch.cat(latents, dim=0).to(device)
+    else:
+        latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype, layout=layout).to(device)
+
+    return latents
+
+
+def is_compiled_module(module) -> bool:
+    """Check whether the module was compiled with torch.compile()"""
+    if is_torch_version("<", "2.0.0") or not hasattr(torch, "_dynamo"):
+        return False
+    return isinstance(module, torch._dynamo.eval_frame.OptimizedModule)
+
+
+def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.Tensor":
+    """Fourier filter as introduced in FreeU (https://arxiv.org/abs/2309.11497).
+
+    This version of the method comes from here:
+    https://github.com/huggingface/diffusers/pull/5164#issuecomment-1732638706
+    """
+    x = x_in
+    B, C, H, W = x.shape
+
+    # Non-power of 2 images must be float32
+    if (W & (W - 1)) != 0 or (H & (H - 1)) != 0:
+        x = x.to(dtype=torch.float32)
+
+    # FFT
+    x_freq = fftn(x, dim=(-2, -1))
+    x_freq = fftshift(x_freq, dim=(-2, -1))
+
+    B, C, H, W = x_freq.shape
+    mask = torch.ones((B, C, H, W), device=x.device)
+
+    crow, ccol = H // 2, W // 2
+    mask[..., crow - threshold : crow + threshold, ccol - threshold : ccol + threshold] = scale
+    x_freq = x_freq * mask
+
+    # IFFT
+    x_freq = ifftshift(x_freq, dim=(-2, -1))
+    x_filtered = ifftn(x_freq, dim=(-2, -1)).real
+
+    return x_filtered.to(dtype=x_in.dtype)
+
+
+def apply_freeu(
+    resolution_idx: int, hidden_states: "torch.Tensor", res_hidden_states: "torch.Tensor", **freeu_kwargs
+) -> Tuple["torch.Tensor", "torch.Tensor"]:
+    """Applies the FreeU mechanism as introduced in https:
+    //arxiv.org/abs/2309.11497. Adapted from the official code repository: https://github.com/ChenyangSi/FreeU.
+
+    Args:
+        resolution_idx (`int`): Integer denoting the UNet block where FreeU is being applied.
+        hidden_states (`torch.Tensor`): Inputs to the underlying block.
+        res_hidden_states (`torch.Tensor`): Features from the skip block corresponding to the underlying block.
+        s1 (`float`): Scaling factor for stage 1 to attenuate the contributions of the skip features.
+        s2 (`float`): Scaling factor for stage 2 to attenuate the contributions of the skip features.
+        b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+        b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+    """
+    if resolution_idx == 0:
+        num_half_channels = hidden_states.shape[1] // 2
+        hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * freeu_kwargs["b1"]
+        res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s1"])
+    if resolution_idx == 1:
+        num_half_channels = hidden_states.shape[1] // 2
+        hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * freeu_kwargs["b2"]
+        res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s2"])
+
+    return hidden_states, res_hidden_states
diff --git a/diffusers/src/diffusers/utils/versions.py b/diffusers/src/diffusers/utils/versions.py
new file mode 100644
index 0000000000000000000000000000000000000000..945a3977ce62a9a55307862193e4be6f12c3c17f
--- /dev/null
+++ b/diffusers/src/diffusers/utils/versions.py
@@ -0,0 +1,117 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities for working with package versions
+"""
+
+import importlib.metadata
+import operator
+import re
+import sys
+from typing import Optional
+
+from packaging import version
+
+
+ops = {
+    "<": operator.lt,
+    "<=": operator.le,
+    "==": operator.eq,
+    "!=": operator.ne,
+    ">=": operator.ge,
+    ">": operator.gt,
+}
+
+
+def _compare_versions(op, got_ver, want_ver, requirement, pkg, hint):
+    if got_ver is None or want_ver is None:
+        raise ValueError(
+            f"Unable to compare versions for {requirement}: need={want_ver} found={got_ver}. This is unusual. Consider"
+            f" reinstalling {pkg}."
+        )
+    if not ops[op](version.parse(got_ver), version.parse(want_ver)):
+        raise ImportError(
+            f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}"
+        )
+
+
+def require_version(requirement: str, hint: Optional[str] = None) -> None:
+    """
+    Perform a runtime check of the dependency versions, using the exact same syntax used by pip.
+
+    The installed module version comes from the *site-packages* dir via *importlib.metadata*.
+
+    Args:
+        requirement (`str`): pip style definition, e.g.,  "tokenizers==0.9.4", "tqdm>=4.27", "numpy"
+        hint (`str`, *optional*): what suggestion to print in case of requirements not being met
+
+    Example:
+
+    ```python
+    require_version("pandas>1.1.2")
+    require_version("numpy>1.18.5", "this is important to have for whatever reason")
+    ```"""
+
+    hint = f"\n{hint}" if hint is not None else ""
+
+    # non-versioned check
+    if re.match(r"^[\w_\-\d]+$", requirement):
+        pkg, op, want_ver = requirement, None, None
+    else:
+        match = re.findall(r"^([^!=<>\s]+)([\s!=<>]{1,2}.+)", requirement)
+        if not match:
+            raise ValueError(
+                "requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but"
+                f" got {requirement}"
+            )
+        pkg, want_full = match[0]
+        want_range = want_full.split(",")  # there could be multiple requirements
+        wanted = {}
+        for w in want_range:
+            match = re.findall(r"^([\s!=<>]{1,2})(.+)", w)
+            if not match:
+                raise ValueError(
+                    "requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23,"
+                    f" but got {requirement}"
+                )
+            op, want_ver = match[0]
+            wanted[op] = want_ver
+            if op not in ops:
+                raise ValueError(f"{requirement}: need one of {list(ops.keys())}, but got {op}")
+
+    # special case
+    if pkg == "python":
+        got_ver = ".".join([str(x) for x in sys.version_info[:3]])
+        for op, want_ver in wanted.items():
+            _compare_versions(op, got_ver, want_ver, requirement, pkg, hint)
+        return
+
+    # check if any version is installed
+    try:
+        got_ver = importlib.metadata.version(pkg)
+    except importlib.metadata.PackageNotFoundError:
+        raise importlib.metadata.PackageNotFoundError(
+            f"The '{requirement}' distribution was not found and is required by this application. {hint}"
+        )
+
+    # check that the right version is installed if version number or a range was provided
+    if want_ver is not None:
+        for op, want_ver in wanted.items():
+            _compare_versions(op, got_ver, want_ver, requirement, pkg, hint)
+
+
+def require_version_core(requirement):
+    """require_version wrapper which emits a core-specific hint on failure"""
+    hint = "Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git main"
+    return require_version(requirement, hint)
diff --git a/diffusers/src/diffusers/video_processor.py b/diffusers/src/diffusers/video_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e2727b85377cd47f5f44224c5122118518ab8d9
--- /dev/null
+++ b/diffusers/src/diffusers/video_processor.py
@@ -0,0 +1,113 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL
+import torch
+
+from .image_processor import VaeImageProcessor, is_valid_image, is_valid_image_imagelist
+
+
+class VideoProcessor(VaeImageProcessor):
+    r"""Simple video processor."""
+
+    def preprocess_video(self, video, height: Optional[int] = None, width: Optional[int] = None) -> torch.Tensor:
+        r"""
+        Preprocesses input video(s).
+
+        Args:
+            video (`List[PIL.Image]`, `List[List[PIL.Image]]`, `torch.Tensor`, `np.array`, `List[torch.Tensor]`, `List[np.array]`):
+                The input video. It can be one of the following:
+                * List of the PIL images.
+                * List of list of PIL images.
+                * 4D Torch tensors (expected shape for each tensor `(num_frames, num_channels, height, width)`).
+                * 4D NumPy arrays (expected shape for each array `(num_frames, height, width, num_channels)`).
+                * List of 4D Torch tensors (expected shape for each tensor `(num_frames, num_channels, height,
+                  width)`).
+                * List of 4D NumPy arrays (expected shape for each array `(num_frames, height, width, num_channels)`).
+                * 5D NumPy arrays: expected shape for each array `(batch_size, num_frames, height, width,
+                  num_channels)`.
+                * 5D Torch tensors: expected shape for each array `(batch_size, num_frames, num_channels, height,
+                  width)`.
+            height (`int`, *optional*, defaults to `None`):
+                The height in preprocessed frames of the video. If `None`, will use the `get_default_height_width()` to
+                get default height.
+            width (`int`, *optional*`, defaults to `None`):
+                The width in preprocessed frames of the video. If `None`, will use get_default_height_width()` to get
+                the default width.
+        """
+        if isinstance(video, list) and isinstance(video[0], np.ndarray) and video[0].ndim == 5:
+            warnings.warn(
+                "Passing `video` as a list of 5d np.ndarray is deprecated."
+                "Please concatenate the list along the batch dimension and pass it as a single 5d np.ndarray",
+                FutureWarning,
+            )
+            video = np.concatenate(video, axis=0)
+        if isinstance(video, list) and isinstance(video[0], torch.Tensor) and video[0].ndim == 5:
+            warnings.warn(
+                "Passing `video` as a list of 5d torch.Tensor is deprecated."
+                "Please concatenate the list along the batch dimension and pass it as a single 5d torch.Tensor",
+                FutureWarning,
+            )
+            video = torch.cat(video, axis=0)
+
+        # ensure the input is a list of videos:
+        # - if it is a batch of videos (5d torch.Tensor or np.ndarray), it is converted to a list of videos (a list of 4d torch.Tensor or np.ndarray)
+        # - if it is is a single video, it is convereted to a list of one video.
+        if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5:
+            video = list(video)
+        elif isinstance(video, list) and is_valid_image(video[0]) or is_valid_image_imagelist(video):
+            video = [video]
+        elif isinstance(video, list) and is_valid_image_imagelist(video[0]):
+            video = video
+        else:
+            raise ValueError(
+                "Input is in incorrect format. Currently, we only support numpy.ndarray, torch.Tensor, PIL.Image.Image"
+            )
+
+        video = torch.stack([self.preprocess(img, height=height, width=width) for img in video], dim=0)
+
+        # move the number of channels before the number of frames.
+        video = video.permute(0, 2, 1, 3, 4)
+
+        return video
+
+    def postprocess_video(
+        self, video: torch.Tensor, output_type: str = "np"
+    ) -> Union[np.ndarray, torch.Tensor, List[PIL.Image.Image]]:
+        r"""
+        Converts a video tensor to a list of frames for export.
+
+        Args:
+            video (`torch.Tensor`): The video as a tensor.
+            output_type (`str`, defaults to `"np"`): Output type of the postprocessed `video` tensor.
+        """
+        batch_size = video.shape[0]
+        outputs = []
+        for batch_idx in range(batch_size):
+            batch_vid = video[batch_idx].permute(1, 0, 2, 3)
+            batch_output = self.postprocess(batch_vid, output_type)
+            outputs.append(batch_output)
+
+        if output_type == "np":
+            outputs = np.stack(outputs)
+        elif output_type == "pt":
+            outputs = torch.stack(outputs)
+        elif not output_type == "pil":
+            raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
+
+        return outputs
diff --git a/diffusers/tests/__init__.py b/diffusers/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/conftest.py b/diffusers/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..4993ed94e8f1c116f12821e9dea77520acf0e05b
--- /dev/null
+++ b/diffusers/tests/conftest.py
@@ -0,0 +1,44 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# tests directory-specific settings - this file is run automatically
+# by pytest before any tests are run
+
+import sys
+import warnings
+from os.path import abspath, dirname, join
+
+
+# allow having multiple repository checkouts and not needing to remember to rerun
+# 'pip install -e .[dev]' when switching between checkouts and running tests.
+git_repo_path = abspath(join(dirname(dirname(__file__)), "src"))
+sys.path.insert(1, git_repo_path)
+
+# silence FutureWarning warnings in tests since often we can't act on them until
+# they become normal warnings - i.e. the tests still need to test the current functionality
+warnings.simplefilter(action="ignore", category=FutureWarning)
+
+
+def pytest_addoption(parser):
+    from diffusers.utils.testing_utils import pytest_addoption_shared
+
+    pytest_addoption_shared(parser)
+
+
+def pytest_terminal_summary(terminalreporter):
+    from diffusers.utils.testing_utils import pytest_terminal_summary_main
+
+    make_reports = terminalreporter.config.getoption("--make-reports")
+    if make_reports:
+        pytest_terminal_summary_main(terminalreporter, id=make_reports)
diff --git a/diffusers/tests/fixtures/custom_pipeline/pipeline.py b/diffusers/tests/fixtures/custom_pipeline/pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..601f51b1263e8799be2d753997124a2c5af21704
--- /dev/null
+++ b/diffusers/tests/fixtures/custom_pipeline/pipeline.py
@@ -0,0 +1,101 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from diffusers import DiffusionPipeline, ImagePipelineOutput
+
+
+class CustomLocalPipeline(DiffusionPipeline):
+    r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`DDPMScheduler`], or [`DDIMScheduler`].
+    """
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        generator: Optional[torch.Generator] = None,
+        num_inference_steps: int = 50,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            eta (`float`, *optional*, defaults to 0.0):
+                The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        # Sample gaussian noise to begin loop
+        image = torch.randn(
+            (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
+            generator=generator,
+        )
+        image = image.to(self.device)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # 1. predict noise model_output
+            model_output = self.unet(image, t).sample
+
+            # 2. predict previous mean of image x_t-1 and add variance depending on eta
+            # eta corresponds to η in paper and should be between [0, 1]
+            # do x_t -> x_t-1
+            image = self.scheduler.step(model_output, t, image).prev_sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,), "This is a local test"
+
+        return ImagePipelineOutput(images=image), "This is a local test"
diff --git a/diffusers/tests/fixtures/custom_pipeline/what_ever.py b/diffusers/tests/fixtures/custom_pipeline/what_ever.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ceeb4211e376039a40bfa11da8297d250f29ab1
--- /dev/null
+++ b/diffusers/tests/fixtures/custom_pipeline/what_ever.py
@@ -0,0 +1,101 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class CustomLocalPipeline(DiffusionPipeline):
+    r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`DDPMScheduler`], or [`DDIMScheduler`].
+    """
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        generator: Optional[torch.Generator] = None,
+        num_inference_steps: int = 50,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            eta (`float`, *optional*, defaults to 0.0):
+                The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        # Sample gaussian noise to begin loop
+        image = torch.randn(
+            (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
+            generator=generator,
+        )
+        image = image.to(self.device)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # 1. predict noise model_output
+            model_output = self.unet(image, t).sample
+
+            # 2. predict previous mean of image x_t-1 and add variance depending on eta
+            # eta corresponds to η in paper and should be between [0, 1]
+            # do x_t -> x_t-1
+            image = self.scheduler.step(model_output, t, image).prev_sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,), "This is a local test"
+
+        return ImagePipelineOutput(images=image), "This is a local test"
diff --git a/diffusers/tests/fixtures/elise_format0.mid b/diffusers/tests/fixtures/elise_format0.mid
new file mode 100644
index 0000000000000000000000000000000000000000..33dbabe7ab1d4d28e43d9911255a510a8a672d77
Binary files /dev/null and b/diffusers/tests/fixtures/elise_format0.mid differ
diff --git a/diffusers/tests/lora/test_lora_layers_cogvideox.py b/diffusers/tests/lora/test_lora_layers_cogvideox.py
new file mode 100644
index 0000000000000000000000000000000000000000..c141ebc96b3ed243d26bcf02ca922e8658d2c9cd
--- /dev/null
+++ b/diffusers/tests/lora/test_lora_layers_cogvideox.py
@@ -0,0 +1,194 @@
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKLCogVideoX,
+    CogVideoXDDIMScheduler,
+    CogVideoXDPMScheduler,
+    CogVideoXPipeline,
+    CogVideoXTransformer3DModel,
+)
+from diffusers.utils.testing_utils import (
+    floats_tensor,
+    is_peft_available,
+    require_peft_backend,
+    skip_mps,
+    torch_device,
+)
+
+
+if is_peft_available():
+    pass
+
+sys.path.append(".")
+
+from utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set  # noqa: E402
+
+
+@require_peft_backend
+class CogVideoXLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
+    pipeline_class = CogVideoXPipeline
+    scheduler_cls = CogVideoXDPMScheduler
+    scheduler_kwargs = {"timestep_spacing": "trailing"}
+    scheduler_classes = [CogVideoXDDIMScheduler, CogVideoXDPMScheduler]
+
+    transformer_kwargs = {
+        "num_attention_heads": 4,
+        "attention_head_dim": 8,
+        "in_channels": 4,
+        "out_channels": 4,
+        "time_embed_dim": 2,
+        "text_embed_dim": 32,
+        "num_layers": 1,
+        "sample_width": 16,
+        "sample_height": 16,
+        "sample_frames": 9,
+        "patch_size": 2,
+        "temporal_compression_ratio": 4,
+        "max_text_seq_length": 16,
+    }
+    transformer_cls = CogVideoXTransformer3DModel
+    vae_kwargs = {
+        "in_channels": 3,
+        "out_channels": 3,
+        "down_block_types": (
+            "CogVideoXDownBlock3D",
+            "CogVideoXDownBlock3D",
+            "CogVideoXDownBlock3D",
+            "CogVideoXDownBlock3D",
+        ),
+        "up_block_types": (
+            "CogVideoXUpBlock3D",
+            "CogVideoXUpBlock3D",
+            "CogVideoXUpBlock3D",
+            "CogVideoXUpBlock3D",
+        ),
+        "block_out_channels": (8, 8, 8, 8),
+        "latent_channels": 4,
+        "layers_per_block": 1,
+        "norm_num_groups": 2,
+        "temporal_compression_ratio": 4,
+    }
+    vae_cls = AutoencoderKLCogVideoX
+    tokenizer_cls, tokenizer_id = AutoTokenizer, "hf-internal-testing/tiny-random-t5"
+    text_encoder_cls, text_encoder_id = T5EncoderModel, "hf-internal-testing/tiny-random-t5"
+
+    text_encoder_target_modules = ["q", "k", "v", "o"]
+
+    @property
+    def output_shape(self):
+        return (1, 9, 16, 16, 3)
+
+    def get_dummy_inputs(self, with_generator=True):
+        batch_size = 1
+        sequence_length = 16
+        num_channels = 4
+        num_frames = 9
+        num_latent_frames = 3  # (num_frames - 1) // temporal_compression_ratio + 1
+        sizes = (2, 2)
+
+        generator = torch.manual_seed(0)
+        noise = floats_tensor((batch_size, num_latent_frames, num_channels) + sizes)
+        input_ids = torch.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
+
+        pipeline_inputs = {
+            "prompt": "dance monkey",
+            "num_frames": num_frames,
+            "num_inference_steps": 4,
+            "guidance_scale": 6.0,
+            # Cannot reduce because convolution kernel becomes bigger than sample
+            "height": 16,
+            "width": 16,
+            "max_sequence_length": sequence_length,
+            "output_type": "np",
+        }
+        if with_generator:
+            pipeline_inputs.update({"generator": generator})
+
+        return noise, input_ids, pipeline_inputs
+
+    @skip_mps
+    def test_lora_fuse_nan(self):
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1")
+
+            self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in denoiser")
+
+            # corrupt one LoRA weight with `inf` values
+            with torch.no_grad():
+                pipe.transformer.transformer_blocks[0].attn1.to_q.lora_A["adapter-1"].weight += float("inf")
+
+            # with `safe_fusing=True` we should see an Error
+            with self.assertRaises(ValueError):
+                pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True)
+
+            # without we should not see an error, but every image will be black
+            pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False)
+
+            out = pipe(
+                "test", num_inference_steps=2, max_sequence_length=inputs["max_sequence_length"], output_type="np"
+            )[0]
+
+            self.assertTrue(np.isnan(out).all())
+
+    def test_simple_inference_with_text_lora_denoiser_fused_multi(self):
+        super().test_simple_inference_with_text_lora_denoiser_fused_multi(expected_atol=9e-3)
+
+    def test_simple_inference_with_text_denoiser_lora_unfused(self):
+        super().test_simple_inference_with_text_denoiser_lora_unfused(expected_atol=9e-3)
+
+    @unittest.skip("Not supported in CogVideoX.")
+    def test_simple_inference_with_text_denoiser_block_scale(self):
+        pass
+
+    @unittest.skip("Not supported in CogVideoX.")
+    def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self):
+        pass
+
+    @unittest.skip("Not supported in CogVideoX.")
+    def test_modify_padding_mode(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in CogVideoX.")
+    def test_simple_inference_with_partial_text_lora(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in CogVideoX.")
+    def test_simple_inference_with_text_lora(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in CogVideoX.")
+    def test_simple_inference_with_text_lora_and_scale(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in CogVideoX.")
+    def test_simple_inference_with_text_lora_fused(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in CogVideoX.")
+    def test_simple_inference_with_text_lora_save_load(self):
+        pass
diff --git a/diffusers/tests/lora/test_lora_layers_flux.py b/diffusers/tests/lora/test_lora_layers_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c336ebc3cbf398af2c0a7f23676ce32d64a55b3
--- /dev/null
+++ b/diffusers/tests/lora/test_lora_layers_flux.py
@@ -0,0 +1,249 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gc
+import os
+import sys
+import tempfile
+import unittest
+
+import numpy as np
+import safetensors.torch
+import torch
+from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel
+
+from diffusers import FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
+from diffusers.utils.testing_utils import (
+    floats_tensor,
+    is_peft_available,
+    require_peft_backend,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+
+if is_peft_available():
+    from peft.utils import get_peft_model_state_dict
+
+sys.path.append(".")
+
+from utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set  # noqa: E402
+
+
+@require_peft_backend
+class FluxLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
+    pipeline_class = FluxPipeline
+    scheduler_cls = FlowMatchEulerDiscreteScheduler()
+    scheduler_kwargs = {}
+    scheduler_classes = [FlowMatchEulerDiscreteScheduler]
+    transformer_kwargs = {
+        "patch_size": 1,
+        "in_channels": 4,
+        "num_layers": 1,
+        "num_single_layers": 1,
+        "attention_head_dim": 16,
+        "num_attention_heads": 2,
+        "joint_attention_dim": 32,
+        "pooled_projection_dim": 32,
+        "axes_dims_rope": [4, 4, 8],
+    }
+    transformer_cls = FluxTransformer2DModel
+    vae_kwargs = {
+        "sample_size": 32,
+        "in_channels": 3,
+        "out_channels": 3,
+        "block_out_channels": (4,),
+        "layers_per_block": 1,
+        "latent_channels": 1,
+        "norm_num_groups": 1,
+        "use_quant_conv": False,
+        "use_post_quant_conv": False,
+        "shift_factor": 0.0609,
+        "scaling_factor": 1.5035,
+    }
+    has_two_text_encoders = True
+    tokenizer_cls, tokenizer_id = CLIPTokenizer, "peft-internal-testing/tiny-clip-text-2"
+    tokenizer_2_cls, tokenizer_2_id = AutoTokenizer, "hf-internal-testing/tiny-random-t5"
+    text_encoder_cls, text_encoder_id = CLIPTextModel, "peft-internal-testing/tiny-clip-text-2"
+    text_encoder_2_cls, text_encoder_2_id = T5EncoderModel, "hf-internal-testing/tiny-random-t5"
+
+    @property
+    def output_shape(self):
+        return (1, 8, 8, 3)
+
+    def get_dummy_inputs(self, with_generator=True):
+        batch_size = 1
+        sequence_length = 10
+        num_channels = 4
+        sizes = (32, 32)
+
+        generator = torch.manual_seed(0)
+        noise = floats_tensor((batch_size, num_channels) + sizes)
+        input_ids = torch.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
+
+        pipeline_inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "num_inference_steps": 4,
+            "guidance_scale": 0.0,
+            "height": 8,
+            "width": 8,
+            "output_type": "np",
+        }
+        if with_generator:
+            pipeline_inputs.update({"generator": generator})
+
+        return noise, input_ids, pipeline_inputs
+
+    def test_with_alpha_in_state_dict(self):
+        components, _, denoiser_lora_config = self.get_dummy_components(FlowMatchEulerDiscreteScheduler)
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+        output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images
+        self.assertTrue(output_no_lora.shape == self.output_shape)
+
+        pipe.transformer.add_adapter(denoiser_lora_config)
+        self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer")
+
+        images_lora = pipe(**inputs, generator=torch.manual_seed(0)).images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            denoiser_state_dict = get_peft_model_state_dict(pipe.transformer)
+            self.pipeline_class.save_lora_weights(tmpdirname, transformer_lora_layers=denoiser_state_dict)
+
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
+            pipe.unload_lora_weights()
+            pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))
+
+            # modify the state dict to have alpha values following
+            # https://huggingface.co/TheLastBen/Jon_Snow_Flux_LoRA/blob/main/jon_snow.safetensors
+            state_dict_with_alpha = safetensors.torch.load_file(
+                os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")
+            )
+            alpha_dict = {}
+            for k, v in state_dict_with_alpha.items():
+                # only do for `transformer` and for the k projections -- should be enough to test.
+                if "transformer" in k and "to_k" in k and "lora_A" in k:
+                    alpha_dict[f"{k}.alpha"] = float(torch.randint(10, 100, size=()))
+            state_dict_with_alpha.update(alpha_dict)
+
+        images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0)).images
+        self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in denoiser")
+
+        pipe.unload_lora_weights()
+        pipe.load_lora_weights(state_dict_with_alpha)
+        images_lora_with_alpha = pipe(**inputs, generator=torch.manual_seed(0)).images
+
+        self.assertTrue(
+            np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3),
+            "Loading from saved checkpoints should give same results.",
+        )
+        self.assertFalse(np.allclose(images_lora_with_alpha, images_lora, atol=1e-3, rtol=1e-3))
+
+    @unittest.skip("Not supported in Flux.")
+    def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self):
+        pass
+
+    @unittest.skip("Not supported in Flux.")
+    def test_modify_padding_mode(self):
+        pass
+
+
+@slow
+@require_torch_gpu
+@require_peft_backend
+@unittest.skip("We cannot run inference on this model with the current CI hardware")
+# TODO (DN6, sayakpaul): move these tests to a beefier GPU
+class FluxLoRAIntegrationTests(unittest.TestCase):
+    """internal note: The integration slices were obtained on audace."""
+
+    num_inference_steps = 10
+    seed = 0
+
+    def setUp(self):
+        super().setUp()
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        self.pipeline = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
+
+    def tearDown(self):
+        super().tearDown()
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_flux_the_last_ben(self):
+        self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
+        self.pipeline.fuse_lora()
+        self.pipeline.unload_lora_weights()
+        self.pipeline.enable_model_cpu_offload()
+
+        prompt = "jon snow eating pizza with ketchup"
+
+        out = self.pipeline(
+            prompt,
+            num_inference_steps=self.num_inference_steps,
+            guidance_scale=4.0,
+            output_type="np",
+            generator=torch.manual_seed(self.seed),
+        ).images
+        out_slice = out[0, -3:, -3:, -1].flatten()
+        expected_slice = np.array([0.1719, 0.1719, 0.1699, 0.1719, 0.1719, 0.1738, 0.1641, 0.1621, 0.2090])
+
+        assert np.allclose(out_slice, expected_slice, atol=1e-4, rtol=1e-4)
+
+    def test_flux_kohya(self):
+        self.pipeline.load_lora_weights("Norod78/brain-slug-flux")
+        self.pipeline.fuse_lora()
+        self.pipeline.unload_lora_weights()
+        self.pipeline.enable_model_cpu_offload()
+
+        prompt = "The cat with a brain slug earring"
+        out = self.pipeline(
+            prompt,
+            num_inference_steps=self.num_inference_steps,
+            guidance_scale=4.5,
+            output_type="np",
+            generator=torch.manual_seed(self.seed),
+        ).images
+
+        out_slice = out[0, -3:, -3:, -1].flatten()
+        expected_slice = np.array([0.6367, 0.6367, 0.6328, 0.6367, 0.6328, 0.6289, 0.6367, 0.6328, 0.6484])
+
+        assert np.allclose(out_slice, expected_slice, atol=1e-4, rtol=1e-4)
+
+    def test_flux_xlabs(self):
+        self.pipeline.load_lora_weights("XLabs-AI/flux-lora-collection", weight_name="disney_lora.safetensors")
+        self.pipeline.fuse_lora()
+        self.pipeline.unload_lora_weights()
+        self.pipeline.enable_model_cpu_offload()
+
+        prompt = "A blue jay standing on a large basket of rainbow macarons, disney style"
+
+        out = self.pipeline(
+            prompt,
+            num_inference_steps=self.num_inference_steps,
+            guidance_scale=3.5,
+            output_type="np",
+            generator=torch.manual_seed(self.seed),
+        ).images
+        out_slice = out[0, -3:, -3:, -1].flatten()
+        expected_slice = np.array([0.3984, 0.4199, 0.4453, 0.4102, 0.4375, 0.4590, 0.4141, 0.4355, 0.4980])
+
+        assert np.allclose(out_slice, expected_slice, atol=1e-4, rtol=1e-4)
diff --git a/diffusers/tests/lora/test_lora_layers_sd.py b/diffusers/tests/lora/test_lora_layers_sd.py
new file mode 100644
index 0000000000000000000000000000000000000000..50187e50a912c1f4496797b7575c7044f5f28859
--- /dev/null
+++ b/diffusers/tests/lora/test_lora_layers_sd.py
@@ -0,0 +1,674 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gc
+import sys
+import unittest
+
+import numpy as np
+import torch
+import torch.nn as nn
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoPipelineForImage2Image,
+    AutoPipelineForText2Image,
+    DDIMScheduler,
+    DiffusionPipeline,
+    LCMScheduler,
+    StableDiffusionPipeline,
+)
+from diffusers.utils.import_utils import is_accelerate_available
+from diffusers.utils.testing_utils import (
+    load_image,
+    numpy_cosine_similarity_distance,
+    require_peft_backend,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+
+sys.path.append(".")
+
+from utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set  # noqa: E402
+
+
+if is_accelerate_available():
+    from accelerate.utils import release_memory
+
+
+class StableDiffusionLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
+    pipeline_class = StableDiffusionPipeline
+    scheduler_cls = DDIMScheduler
+    scheduler_kwargs = {
+        "beta_start": 0.00085,
+        "beta_end": 0.012,
+        "beta_schedule": "scaled_linear",
+        "clip_sample": False,
+        "set_alpha_to_one": False,
+        "steps_offset": 1,
+    }
+    unet_kwargs = {
+        "block_out_channels": (32, 64),
+        "layers_per_block": 2,
+        "sample_size": 32,
+        "in_channels": 4,
+        "out_channels": 4,
+        "down_block_types": ("DownBlock2D", "CrossAttnDownBlock2D"),
+        "up_block_types": ("CrossAttnUpBlock2D", "UpBlock2D"),
+        "cross_attention_dim": 32,
+    }
+    vae_kwargs = {
+        "block_out_channels": [32, 64],
+        "in_channels": 3,
+        "out_channels": 3,
+        "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
+        "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
+        "latent_channels": 4,
+    }
+    text_encoder_cls, text_encoder_id = CLIPTextModel, "peft-internal-testing/tiny-clip-text-2"
+    tokenizer_cls, tokenizer_id = CLIPTokenizer, "peft-internal-testing/tiny-clip-text-2"
+
+    @property
+    def output_shape(self):
+        return (1, 64, 64, 3)
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    # Keeping this test here makes sense because it doesn't look any integration
+    # (value assertions on logits).
+    @slow
+    @require_torch_gpu
+    def test_integration_move_lora_cpu(self):
+        path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+        lora_id = "takuma104/lora-test-text-encoder-lora-target"
+
+        pipe = StableDiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16)
+        pipe.load_lora_weights(lora_id, adapter_name="adapter-1")
+        pipe.load_lora_weights(lora_id, adapter_name="adapter-2")
+        pipe = pipe.to(torch_device)
+
+        self.assertTrue(
+            check_if_lora_correctly_set(pipe.text_encoder),
+            "Lora not correctly set in text encoder",
+        )
+
+        self.assertTrue(
+            check_if_lora_correctly_set(pipe.unet),
+            "Lora not correctly set in text encoder",
+        )
+
+        # We will offload the first adapter in CPU and check if the offloading
+        # has been performed correctly
+        pipe.set_lora_device(["adapter-1"], "cpu")
+
+        for name, module in pipe.unet.named_modules():
+            if "adapter-1" in name and not isinstance(module, (nn.Dropout, nn.Identity)):
+                self.assertTrue(module.weight.device == torch.device("cpu"))
+            elif "adapter-2" in name and not isinstance(module, (nn.Dropout, nn.Identity)):
+                self.assertTrue(module.weight.device != torch.device("cpu"))
+
+        for name, module in pipe.text_encoder.named_modules():
+            if "adapter-1" in name and not isinstance(module, (nn.Dropout, nn.Identity)):
+                self.assertTrue(module.weight.device == torch.device("cpu"))
+            elif "adapter-2" in name and not isinstance(module, (nn.Dropout, nn.Identity)):
+                self.assertTrue(module.weight.device != torch.device("cpu"))
+
+        pipe.set_lora_device(["adapter-1"], 0)
+
+        for n, m in pipe.unet.named_modules():
+            if "adapter-1" in n and not isinstance(m, (nn.Dropout, nn.Identity)):
+                self.assertTrue(m.weight.device != torch.device("cpu"))
+
+        for n, m in pipe.text_encoder.named_modules():
+            if "adapter-1" in n and not isinstance(m, (nn.Dropout, nn.Identity)):
+                self.assertTrue(m.weight.device != torch.device("cpu"))
+
+        pipe.set_lora_device(["adapter-1", "adapter-2"], torch_device)
+
+        for n, m in pipe.unet.named_modules():
+            if ("adapter-1" in n or "adapter-2" in n) and not isinstance(m, (nn.Dropout, nn.Identity)):
+                self.assertTrue(m.weight.device != torch.device("cpu"))
+
+        for n, m in pipe.text_encoder.named_modules():
+            if ("adapter-1" in n or "adapter-2" in n) and not isinstance(m, (nn.Dropout, nn.Identity)):
+                self.assertTrue(m.weight.device != torch.device("cpu"))
+
+    @slow
+    @require_torch_gpu
+    def test_integration_move_lora_dora_cpu(self):
+        from peft import LoraConfig
+
+        path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+        unet_lora_config = LoraConfig(
+            init_lora_weights="gaussian",
+            target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+            use_dora=True,
+        )
+        text_lora_config = LoraConfig(
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+            use_dora=True,
+        )
+
+        pipe = StableDiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16)
+        pipe.unet.add_adapter(unet_lora_config, "adapter-1")
+        pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+
+        self.assertTrue(
+            check_if_lora_correctly_set(pipe.text_encoder),
+            "Lora not correctly set in text encoder",
+        )
+
+        self.assertTrue(
+            check_if_lora_correctly_set(pipe.unet),
+            "Lora not correctly set in text encoder",
+        )
+
+        for name, param in pipe.unet.named_parameters():
+            if "lora_" in name:
+                self.assertEqual(param.device, torch.device("cpu"))
+
+        for name, param in pipe.text_encoder.named_parameters():
+            if "lora_" in name:
+                self.assertEqual(param.device, torch.device("cpu"))
+
+        pipe.set_lora_device(["adapter-1"], torch_device)
+
+        for name, param in pipe.unet.named_parameters():
+            if "lora_" in name:
+                self.assertNotEqual(param.device, torch.device("cpu"))
+
+        for name, param in pipe.text_encoder.named_parameters():
+            if "lora_" in name:
+                self.assertNotEqual(param.device, torch.device("cpu"))
+
+
+@slow
+@require_torch_gpu
+@require_peft_backend
+class LoraIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_integration_logits_with_scale(self):
+        path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+        lora_id = "takuma104/lora-test-text-encoder-lora-target"
+
+        pipe = StableDiffusionPipeline.from_pretrained(path, torch_dtype=torch.float32)
+        pipe.load_lora_weights(lora_id)
+        pipe = pipe.to(torch_device)
+
+        self.assertTrue(
+            check_if_lora_correctly_set(pipe.text_encoder),
+            "Lora not correctly set in text encoder",
+        )
+
+        prompt = "a red sks dog"
+
+        images = pipe(
+            prompt=prompt,
+            num_inference_steps=15,
+            cross_attention_kwargs={"scale": 0.5},
+            generator=torch.manual_seed(0),
+            output_type="np",
+        ).images
+
+        expected_slice_scale = np.array([0.307, 0.283, 0.310, 0.310, 0.300, 0.314, 0.336, 0.314, 0.321])
+        predicted_slice = images[0, -3:, -3:, -1].flatten()
+
+        max_diff = numpy_cosine_similarity_distance(expected_slice_scale, predicted_slice)
+        assert max_diff < 1e-3
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_integration_logits_no_scale(self):
+        path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+        lora_id = "takuma104/lora-test-text-encoder-lora-target"
+
+        pipe = StableDiffusionPipeline.from_pretrained(path, torch_dtype=torch.float32)
+        pipe.load_lora_weights(lora_id)
+        pipe = pipe.to(torch_device)
+
+        self.assertTrue(
+            check_if_lora_correctly_set(pipe.text_encoder),
+            "Lora not correctly set in text encoder",
+        )
+
+        prompt = "a red sks dog"
+
+        images = pipe(prompt=prompt, num_inference_steps=30, generator=torch.manual_seed(0), output_type="np").images
+
+        expected_slice_scale = np.array([0.074, 0.064, 0.073, 0.0842, 0.069, 0.0641, 0.0794, 0.076, 0.084])
+        predicted_slice = images[0, -3:, -3:, -1].flatten()
+
+        max_diff = numpy_cosine_similarity_distance(expected_slice_scale, predicted_slice)
+
+        assert max_diff < 1e-3
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_dreambooth_old_format(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        lora_model_id = "hf-internal-testing/lora_dreambooth_dog_example"
+
+        base_model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+
+        pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
+        pipe = pipe.to(torch_device)
+        pipe.load_lora_weights(lora_model_id)
+
+        images = pipe(
+            "A photo of a sks dog floating in the river", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.7207, 0.6787, 0.6010, 0.7478, 0.6838, 0.6064, 0.6984, 0.6443, 0.5785])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-4
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_dreambooth_text_encoder_new_format(self):
+        generator = torch.Generator().manual_seed(0)
+
+        lora_model_id = "hf-internal-testing/lora-trained"
+
+        base_model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+
+        pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
+        pipe = pipe.to(torch_device)
+        pipe.load_lora_weights(lora_model_id)
+
+        images = pipe("A photo of a sks dog", output_type="np", generator=generator, num_inference_steps=2).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+
+        expected = np.array([0.6628, 0.6138, 0.5390, 0.6625, 0.6130, 0.5463, 0.6166, 0.5788, 0.5359])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-4
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_a1111(self):
+        generator = torch.Generator().manual_seed(0)
+
+        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None).to(
+            torch_device
+        )
+        lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
+        lora_filename = "light_and_shadow.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-3
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_lycoris(self):
+        generator = torch.Generator().manual_seed(0)
+
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/Amixx", safety_checker=None, use_safetensors=True, variant="fp16"
+        ).to(torch_device)
+        lora_model_id = "hf-internal-testing/edgLycorisMugler-light"
+        lora_filename = "edgLycorisMugler-light.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.6463, 0.658, 0.599, 0.6542, 0.6512, 0.6213, 0.658, 0.6485, 0.6017])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-3
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_a1111_with_model_cpu_offload(self):
+        generator = torch.Generator().manual_seed(0)
+
+        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
+        pipe.enable_model_cpu_offload()
+        lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
+        lora_filename = "light_and_shadow.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-3
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_a1111_with_sequential_cpu_offload(self):
+        generator = torch.Generator().manual_seed(0)
+
+        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
+        pipe.enable_sequential_cpu_offload()
+        lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
+        lora_filename = "light_and_shadow.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-3
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_kohya_sd_v15_with_higher_dimensions(self):
+        generator = torch.Generator().manual_seed(0)
+
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
+        ).to(torch_device)
+        lora_model_id = "hf-internal-testing/urushisato-lora"
+        lora_filename = "urushisato_v15.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.7165, 0.6616, 0.5833, 0.7504, 0.6718, 0.587, 0.6871, 0.6361, 0.5694])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-3
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_vanilla_funetuning(self):
+        generator = torch.Generator().manual_seed(0)
+
+        lora_model_id = "hf-internal-testing/sd-model-finetuned-lora-t4"
+
+        base_model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+
+        pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
+        pipe = pipe.to(torch_device)
+        pipe.load_lora_weights(lora_model_id)
+
+        images = pipe("A pokemon with blue eyes.", output_type="np", generator=generator, num_inference_steps=2).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+
+        expected = np.array([0.7406, 0.699, 0.5963, 0.7493, 0.7045, 0.6096, 0.6886, 0.6388, 0.583])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-4
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_unload_kohya_lora(self):
+        generator = torch.manual_seed(0)
+        prompt = "masterpiece, best quality, mountain"
+        num_inference_steps = 2
+
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
+        ).to(torch_device)
+        initial_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        initial_images = initial_images[0, -3:, -3:, -1].flatten()
+
+        lora_model_id = "hf-internal-testing/civitai-colored-icons-lora"
+        lora_filename = "Colored_Icons_by_vizsumit.safetensors"
+
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        generator = torch.manual_seed(0)
+        lora_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        lora_images = lora_images[0, -3:, -3:, -1].flatten()
+
+        pipe.unload_lora_weights()
+        generator = torch.manual_seed(0)
+        unloaded_lora_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        unloaded_lora_images = unloaded_lora_images[0, -3:, -3:, -1].flatten()
+
+        self.assertFalse(np.allclose(initial_images, lora_images))
+        self.assertTrue(np.allclose(initial_images, unloaded_lora_images, atol=1e-3))
+
+        release_memory(pipe)
+
+    def test_load_unload_load_kohya_lora(self):
+        # This test ensures that a Kohya-style LoRA can be safely unloaded and then loaded
+        # without introducing any side-effects. Even though the test uses a Kohya-style
+        # LoRA, the underlying adapter handling mechanism is format-agnostic.
+        generator = torch.manual_seed(0)
+        prompt = "masterpiece, best quality, mountain"
+        num_inference_steps = 2
+
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
+        ).to(torch_device)
+        initial_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        initial_images = initial_images[0, -3:, -3:, -1].flatten()
+
+        lora_model_id = "hf-internal-testing/civitai-colored-icons-lora"
+        lora_filename = "Colored_Icons_by_vizsumit.safetensors"
+
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        generator = torch.manual_seed(0)
+        lora_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        lora_images = lora_images[0, -3:, -3:, -1].flatten()
+
+        pipe.unload_lora_weights()
+        generator = torch.manual_seed(0)
+        unloaded_lora_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        unloaded_lora_images = unloaded_lora_images[0, -3:, -3:, -1].flatten()
+
+        self.assertFalse(np.allclose(initial_images, lora_images))
+        self.assertTrue(np.allclose(initial_images, unloaded_lora_images, atol=1e-3))
+
+        # make sure we can load a LoRA again after unloading and they don't have
+        # any undesired effects.
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        generator = torch.manual_seed(0)
+        lora_images_again = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        lora_images_again = lora_images_again[0, -3:, -3:, -1].flatten()
+
+        self.assertTrue(np.allclose(lora_images, lora_images_again, atol=1e-3))
+        release_memory(pipe)
+
+    def test_not_empty_state_dict(self):
+        # Makes sure https://github.com/huggingface/diffusers/issues/7054 does not happen again
+        pipe = AutoPipelineForText2Image.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
+        ).to(torch_device)
+        pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+        cached_file = hf_hub_download("hf-internal-testing/lcm-lora-test-sd-v1-5", "test_lora.safetensors")
+        lcm_lora = load_file(cached_file)
+
+        pipe.load_lora_weights(lcm_lora, adapter_name="lcm")
+        self.assertTrue(lcm_lora != {})
+        release_memory(pipe)
+
+    def test_load_unload_load_state_dict(self):
+        # Makes sure https://github.com/huggingface/diffusers/issues/7054 does not happen again
+        pipe = AutoPipelineForText2Image.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
+        ).to(torch_device)
+        pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+        cached_file = hf_hub_download("hf-internal-testing/lcm-lora-test-sd-v1-5", "test_lora.safetensors")
+        lcm_lora = load_file(cached_file)
+        previous_state_dict = lcm_lora.copy()
+
+        pipe.load_lora_weights(lcm_lora, adapter_name="lcm")
+        self.assertDictEqual(lcm_lora, previous_state_dict)
+
+        pipe.unload_lora_weights()
+        pipe.load_lora_weights(lcm_lora, adapter_name="lcm")
+        self.assertDictEqual(lcm_lora, previous_state_dict)
+
+        release_memory(pipe)
+
+    def test_sdv1_5_lcm_lora(self):
+        pipe = DiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
+        )
+        pipe.to(torch_device)
+        pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        lora_model_id = "latent-consistency/lcm-lora-sdv1-5"
+        pipe.load_lora_weights(lora_model_id)
+
+        image = pipe(
+            "masterpiece, best quality, mountain", generator=generator, num_inference_steps=4, guidance_scale=0.5
+        ).images[0]
+
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_lora/sdv15_lcm_lora.png"
+        )
+
+        image_np = pipe.image_processor.pil_to_numpy(image)
+        expected_image_np = pipe.image_processor.pil_to_numpy(expected_image)
+
+        max_diff = numpy_cosine_similarity_distance(image_np.flatten(), expected_image_np.flatten())
+        assert max_diff < 1e-4
+
+        pipe.unload_lora_weights()
+
+        release_memory(pipe)
+
+    def test_sdv1_5_lcm_lora_img2img(self):
+        pipe = AutoPipelineForImage2Image.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
+        )
+        pipe.to(torch_device)
+        pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape.png"
+        )
+
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        lora_model_id = "latent-consistency/lcm-lora-sdv1-5"
+        pipe.load_lora_weights(lora_model_id)
+
+        image = pipe(
+            "snowy mountain",
+            generator=generator,
+            image=init_image,
+            strength=0.5,
+            num_inference_steps=4,
+            guidance_scale=0.5,
+        ).images[0]
+
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_lora/sdv15_lcm_lora_img2img.png"
+        )
+
+        image_np = pipe.image_processor.pil_to_numpy(image)
+        expected_image_np = pipe.image_processor.pil_to_numpy(expected_image)
+
+        max_diff = numpy_cosine_similarity_distance(image_np.flatten(), expected_image_np.flatten())
+        assert max_diff < 1e-4
+
+        pipe.unload_lora_weights()
+
+        release_memory(pipe)
+
+    def test_sd_load_civitai_empty_network_alpha(self):
+        """
+        This test simply checks that loading a LoRA with an empty network alpha works fine
+        See: https://github.com/huggingface/diffusers/issues/5606
+        """
+        pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        pipeline.enable_sequential_cpu_offload()
+        civitai_path = hf_hub_download("ybelkada/test-ahi-civitai", "ahi_lora_weights.safetensors")
+        pipeline.load_lora_weights(civitai_path, adapter_name="ahri")
+
+        images = pipeline(
+            "ahri, masterpiece, league of legends",
+            output_type="np",
+            generator=torch.manual_seed(156),
+            num_inference_steps=5,
+        ).images
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.0, 0.0, 0.0, 0.002557, 0.020954, 0.001792, 0.006581, 0.00591, 0.002995])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-3
+
+        pipeline.unload_lora_weights()
+        release_memory(pipeline)
diff --git a/diffusers/tests/lora/test_lora_layers_sd3.py b/diffusers/tests/lora/test_lora_layers_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f61c95c2fc878acb82cb3fac64a8361bfbb5b13
--- /dev/null
+++ b/diffusers/tests/lora/test_lora_layers_sd3.py
@@ -0,0 +1,110 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import unittest
+
+from transformers import AutoTokenizer, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
+
+from diffusers import FlowMatchEulerDiscreteScheduler, SD3Transformer2DModel, StableDiffusion3Pipeline
+from diffusers.utils.testing_utils import is_peft_available, require_peft_backend, require_torch_gpu, torch_device
+
+
+if is_peft_available():
+    pass
+
+sys.path.append(".")
+
+from utils import PeftLoraLoaderMixinTests  # noqa: E402
+
+
+@require_peft_backend
+class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
+    pipeline_class = StableDiffusion3Pipeline
+    scheduler_cls = FlowMatchEulerDiscreteScheduler
+    scheduler_kwargs = {}
+    scheduler_classes = [FlowMatchEulerDiscreteScheduler]
+    transformer_kwargs = {
+        "sample_size": 32,
+        "patch_size": 1,
+        "in_channels": 4,
+        "num_layers": 1,
+        "attention_head_dim": 8,
+        "num_attention_heads": 4,
+        "caption_projection_dim": 32,
+        "joint_attention_dim": 32,
+        "pooled_projection_dim": 64,
+        "out_channels": 4,
+    }
+    transformer_cls = SD3Transformer2DModel
+    vae_kwargs = {
+        "sample_size": 32,
+        "in_channels": 3,
+        "out_channels": 3,
+        "block_out_channels": (4,),
+        "layers_per_block": 1,
+        "latent_channels": 4,
+        "norm_num_groups": 1,
+        "use_quant_conv": False,
+        "use_post_quant_conv": False,
+        "shift_factor": 0.0609,
+        "scaling_factor": 1.5035,
+    }
+    has_three_text_encoders = True
+    tokenizer_cls, tokenizer_id = CLIPTokenizer, "hf-internal-testing/tiny-random-clip"
+    tokenizer_2_cls, tokenizer_2_id = CLIPTokenizer, "hf-internal-testing/tiny-random-clip"
+    tokenizer_3_cls, tokenizer_3_id = AutoTokenizer, "hf-internal-testing/tiny-random-t5"
+    text_encoder_cls, text_encoder_id = CLIPTextModelWithProjection, "hf-internal-testing/tiny-sd3-text_encoder"
+    text_encoder_2_cls, text_encoder_2_id = CLIPTextModelWithProjection, "hf-internal-testing/tiny-sd3-text_encoder-2"
+    text_encoder_3_cls, text_encoder_3_id = T5EncoderModel, "hf-internal-testing/tiny-random-t5"
+
+    @property
+    def output_shape(self):
+        return (1, 32, 32, 3)
+
+    @require_torch_gpu
+    def test_sd3_lora(self):
+        """
+        Test loading the loras that are saved with the diffusers and peft formats.
+        Related PR: https://github.com/huggingface/diffusers/pull/8584
+        """
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components[0])
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        lora_model_id = "hf-internal-testing/tiny-sd3-loras"
+
+        lora_filename = "lora_diffusers_format.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.unload_lora_weights()
+
+        lora_filename = "lora_peft_format.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+    @unittest.skip("Not supported in SD3.")
+    def test_simple_inference_with_text_denoiser_block_scale(self):
+        pass
+
+    @unittest.skip("Not supported in SD3.")
+    def test_simple_inference_with_text_denoiser_multi_adapter_block_lora(self):
+        pass
+
+    @unittest.skip("Not supported in SD3.")
+    def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self):
+        pass
+
+    @unittest.skip("Not supported in SD3.")
+    def test_modify_padding_mode(self):
+        pass
diff --git a/diffusers/tests/lora/test_lora_layers_sdxl.py b/diffusers/tests/lora/test_lora_layers_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ec7ef897485536a782d3ba6d999d2a833e743a0
--- /dev/null
+++ b/diffusers/tests/lora/test_lora_layers_sdxl.py
@@ -0,0 +1,633 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import gc
+import importlib
+import sys
+import time
+import unittest
+
+import numpy as np
+import torch
+from packaging import version
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+    ControlNetModel,
+    EulerDiscreteScheduler,
+    LCMScheduler,
+    StableDiffusionXLAdapterPipeline,
+    StableDiffusionXLControlNetPipeline,
+    StableDiffusionXLPipeline,
+    T2IAdapter,
+)
+from diffusers.utils.import_utils import is_accelerate_available
+from diffusers.utils.testing_utils import (
+    load_image,
+    nightly,
+    numpy_cosine_similarity_distance,
+    require_peft_backend,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+
+sys.path.append(".")
+
+from utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set, state_dicts_almost_equal  # noqa: E402
+
+
+if is_accelerate_available():
+    from accelerate.utils import release_memory
+
+
+class StableDiffusionXLLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
+    has_two_text_encoders = True
+    pipeline_class = StableDiffusionXLPipeline
+    scheduler_cls = EulerDiscreteScheduler
+    scheduler_kwargs = {
+        "beta_start": 0.00085,
+        "beta_end": 0.012,
+        "beta_schedule": "scaled_linear",
+        "timestep_spacing": "leading",
+        "steps_offset": 1,
+    }
+    unet_kwargs = {
+        "block_out_channels": (32, 64),
+        "layers_per_block": 2,
+        "sample_size": 32,
+        "in_channels": 4,
+        "out_channels": 4,
+        "down_block_types": ("DownBlock2D", "CrossAttnDownBlock2D"),
+        "up_block_types": ("CrossAttnUpBlock2D", "UpBlock2D"),
+        "attention_head_dim": (2, 4),
+        "use_linear_projection": True,
+        "addition_embed_type": "text_time",
+        "addition_time_embed_dim": 8,
+        "transformer_layers_per_block": (1, 2),
+        "projection_class_embeddings_input_dim": 80,  # 6 * 8 + 32
+        "cross_attention_dim": 64,
+    }
+    vae_kwargs = {
+        "block_out_channels": [32, 64],
+        "in_channels": 3,
+        "out_channels": 3,
+        "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
+        "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
+        "latent_channels": 4,
+        "sample_size": 128,
+    }
+    text_encoder_cls, text_encoder_id = CLIPTextModel, "peft-internal-testing/tiny-clip-text-2"
+    tokenizer_cls, tokenizer_id = CLIPTokenizer, "peft-internal-testing/tiny-clip-text-2"
+    text_encoder_2_cls, text_encoder_2_id = CLIPTextModelWithProjection, "peft-internal-testing/tiny-clip-text-2"
+    tokenizer_2_cls, tokenizer_2_id = CLIPTokenizer, "peft-internal-testing/tiny-clip-text-2"
+
+    @property
+    def output_shape(self):
+        return (1, 64, 64, 3)
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+
+@slow
+@require_torch_gpu
+@require_peft_backend
+class LoraSDXLIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_sdxl_1_0_lora(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.enable_model_cpu_offload()
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.4468, 0.4061, 0.4134, 0.3637, 0.3202, 0.365, 0.3786, 0.3725, 0.3535])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-4
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_sdxl_1_0_blockwise_lora(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.enable_model_cpu_offload()
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, adapter_name="offset")
+        scales = {
+            "unet": {
+                "down": {"block_1": [1.0, 1.0], "block_2": [1.0, 1.0]},
+                "mid": 1.0,
+                "up": {"block_0": [1.0, 1.0, 1.0], "block_1": [1.0, 1.0, 1.0]},
+            },
+        }
+        pipe.set_adapters(["offset"], [scales])
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([00.4468, 0.4061, 0.4134, 0.3637, 0.3202, 0.365, 0.3786, 0.3725, 0.3535])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-4
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_sdxl_lcm_lora(self):
+        pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        )
+        pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+        pipe.enable_model_cpu_offload()
+
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        lora_model_id = "latent-consistency/lcm-lora-sdxl"
+
+        pipe.load_lora_weights(lora_model_id)
+
+        image = pipe(
+            "masterpiece, best quality, mountain", generator=generator, num_inference_steps=4, guidance_scale=0.5
+        ).images[0]
+
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_lora/sdxl_lcm_lora.png"
+        )
+
+        image_np = pipe.image_processor.pil_to_numpy(image)
+        expected_image_np = pipe.image_processor.pil_to_numpy(expected_image)
+
+        max_diff = numpy_cosine_similarity_distance(image_np.flatten(), expected_image_np.flatten())
+        assert max_diff < 1e-4
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_sdxl_1_0_lora_fusion(self):
+        generator = torch.Generator().manual_seed(0)
+
+        pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        pipe.fuse_lora()
+        # We need to unload the lora weights since in the previous API `fuse_lora` led to lora weights being
+        # silently deleted - otherwise this will CPU OOM
+        pipe.unload_lora_weights()
+
+        pipe.enable_model_cpu_offload()
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        # This way we also test equivalence between LoRA fusion and the non-fusion behaviour.
+        expected = np.array([0.4468, 0.4061, 0.4134, 0.3637, 0.3202, 0.365, 0.3786, 0.3725, 0.3535])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-4
+
+        release_memory(pipe)
+
+    def test_sdxl_1_0_lora_unfusion(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.fuse_lora()
+
+        pipe.enable_model_cpu_offload()
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=3
+        ).images
+        images_with_fusion = images.flatten()
+
+        pipe.unfuse_lora()
+        generator = torch.Generator("cpu").manual_seed(0)
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=3
+        ).images
+        images_without_fusion = images.flatten()
+
+        max_diff = numpy_cosine_similarity_distance(images_with_fusion, images_without_fusion)
+        assert max_diff < 1e-4
+
+        release_memory(pipe)
+
+    def test_sdxl_1_0_lora_unfusion_effectivity(self):
+        pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.enable_model_cpu_offload()
+
+        generator = torch.Generator().manual_seed(0)
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+        original_image_slice = images[0, -3:, -3:, -1].flatten()
+
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.fuse_lora()
+
+        generator = torch.Generator().manual_seed(0)
+        _ = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        pipe.unfuse_lora()
+
+        # We need to unload the lora weights - in the old API unfuse led to unloading the adapter weights
+        pipe.unload_lora_weights()
+
+        generator = torch.Generator().manual_seed(0)
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+        images_without_fusion_slice = images[0, -3:, -3:, -1].flatten()
+
+        max_diff = numpy_cosine_similarity_distance(images_without_fusion_slice, original_image_slice)
+        assert max_diff < 1e-3
+
+        release_memory(pipe)
+
+    def test_sdxl_1_0_lora_fusion_efficiency(self):
+        generator = torch.Generator().manual_seed(0)
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+
+        pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        )
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+
+        start_time = time.time()
+        for _ in range(3):
+            pipe(
+                "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+            ).images
+        end_time = time.time()
+        elapsed_time_non_fusion = end_time - start_time
+
+        del pipe
+
+        pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        )
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, torch_dtype=torch.float16)
+        pipe.fuse_lora()
+
+        # We need to unload the lora weights since in the previous API `fuse_lora` led to lora weights being
+        # silently deleted - otherwise this will CPU OOM
+        pipe.unload_lora_weights()
+        pipe.enable_model_cpu_offload()
+
+        generator = torch.Generator().manual_seed(0)
+        start_time = time.time()
+        for _ in range(3):
+            pipe(
+                "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+            ).images
+        end_time = time.time()
+        elapsed_time_fusion = end_time - start_time
+
+        self.assertTrue(elapsed_time_fusion < elapsed_time_non_fusion)
+
+        release_memory(pipe)
+
+    def test_sdxl_1_0_last_ben(self):
+        generator = torch.Generator().manual_seed(0)
+
+        pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.enable_model_cpu_offload()
+        lora_model_id = "TheLastBen/Papercut_SDXL"
+        lora_filename = "papercut.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe("papercut.safetensors", output_type="np", generator=generator, num_inference_steps=2).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.5244, 0.4347, 0.4312, 0.4246, 0.4398, 0.4409, 0.4884, 0.4938, 0.4094])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-3
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_sdxl_1_0_fuse_unfuse_all(self):
+        pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        )
+        text_encoder_1_sd = copy.deepcopy(pipe.text_encoder.state_dict())
+        text_encoder_2_sd = copy.deepcopy(pipe.text_encoder_2.state_dict())
+        unet_sd = copy.deepcopy(pipe.unet.state_dict())
+
+        pipe.load_lora_weights(
+            "davizca87/sun-flower", weight_name="snfw3rXL-000004.safetensors", torch_dtype=torch.float16
+        )
+
+        fused_te_state_dict = pipe.text_encoder.state_dict()
+        fused_te_2_state_dict = pipe.text_encoder_2.state_dict()
+        unet_state_dict = pipe.unet.state_dict()
+
+        peft_ge_070 = version.parse(importlib.metadata.version("peft")) >= version.parse("0.7.0")
+
+        def remap_key(key, sd):
+            # some keys have moved around for PEFT >= 0.7.0, but they should still be loaded correctly
+            if (key in sd) or (not peft_ge_070):
+                return key
+
+            # instead of linear.weight, we now have linear.base_layer.weight, etc.
+            if key.endswith(".weight"):
+                key = key[:-7] + ".base_layer.weight"
+            elif key.endswith(".bias"):
+                key = key[:-5] + ".base_layer.bias"
+            return key
+
+        for key, value in text_encoder_1_sd.items():
+            key = remap_key(key, fused_te_state_dict)
+            self.assertTrue(torch.allclose(fused_te_state_dict[key], value))
+
+        for key, value in text_encoder_2_sd.items():
+            key = remap_key(key, fused_te_2_state_dict)
+            self.assertTrue(torch.allclose(fused_te_2_state_dict[key], value))
+
+        for key, value in unet_state_dict.items():
+            self.assertTrue(torch.allclose(unet_state_dict[key], value))
+
+        pipe.fuse_lora()
+        pipe.unload_lora_weights()
+
+        assert not state_dicts_almost_equal(text_encoder_1_sd, pipe.text_encoder.state_dict())
+        assert not state_dicts_almost_equal(text_encoder_2_sd, pipe.text_encoder_2.state_dict())
+        assert not state_dicts_almost_equal(unet_sd, pipe.unet.state_dict())
+
+        release_memory(pipe)
+        del unet_sd, text_encoder_1_sd, text_encoder_2_sd
+
+    def test_sdxl_1_0_lora_with_sequential_cpu_offloading(self):
+        generator = torch.Generator().manual_seed(0)
+
+        pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.enable_sequential_cpu_offload()
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
+
+        max_diff = numpy_cosine_similarity_distance(expected, images)
+        assert max_diff < 1e-3
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_controlnet_canny_lora(self):
+        controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0")
+
+        pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet
+        )
+        pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors")
+        pipe.enable_sequential_cpu_offload()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "corgi"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        )
+        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
+
+        assert images[0].shape == (768, 512, 3)
+
+        original_image = images[0, -3:, -3:, -1].flatten()
+        expected_image = np.array([0.4574, 0.4487, 0.4435, 0.5163, 0.4396, 0.4411, 0.518, 0.4465, 0.4333])
+
+        max_diff = numpy_cosine_similarity_distance(expected_image, original_image)
+        assert max_diff < 1e-4
+
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    def test_sdxl_t2i_adapter_canny_lora(self):
+        adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16).to(
+            "cpu"
+        )
+        pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            adapter=adapter,
+            torch_dtype=torch.float16,
+            variant="fp16",
+        )
+        pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors")
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "toy"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/toy_canny.png"
+        )
+
+        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
+
+        assert images[0].shape == (768, 512, 3)
+
+        image_slice = images[0, -3:, -3:, -1].flatten()
+        expected_slice = np.array([0.4284, 0.4337, 0.4319, 0.4255, 0.4329, 0.4280, 0.4338, 0.4420, 0.4226])
+        assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4
+
+    @nightly
+    def test_sequential_fuse_unfuse(self):
+        pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        )
+
+        # 1. round
+        pipe.load_lora_weights("Pclanglais/TintinIA", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.fuse_lora()
+
+        generator = torch.Generator().manual_seed(0)
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+        image_slice = images[0, -3:, -3:, -1].flatten()
+
+        pipe.unfuse_lora()
+
+        # 2. round
+        pipe.load_lora_weights("ProomptEngineer/pe-balloon-diffusion-style", torch_dtype=torch.float16)
+        pipe.fuse_lora()
+        pipe.unfuse_lora()
+
+        # 3. round
+        pipe.load_lora_weights("ostris/crayon_style_lora_sdxl", torch_dtype=torch.float16)
+        pipe.fuse_lora()
+        pipe.unfuse_lora()
+
+        # 4. back to 1st round
+        pipe.load_lora_weights("Pclanglais/TintinIA", torch_dtype=torch.float16)
+        pipe.fuse_lora()
+
+        generator = torch.Generator().manual_seed(0)
+        images_2 = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+        image_slice_2 = images_2[0, -3:, -3:, -1].flatten()
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, image_slice_2)
+        assert max_diff < 1e-3
+        pipe.unload_lora_weights()
+        release_memory(pipe)
+
+    @nightly
+    def test_integration_logits_multi_adapter(self):
+        path = "stabilityai/stable-diffusion-xl-base-1.0"
+        lora_id = "CiroN2022/toy-face"
+
+        pipe = StableDiffusionXLPipeline.from_pretrained(path, torch_dtype=torch.float16)
+        pipe.load_lora_weights(lora_id, weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
+        pipe = pipe.to(torch_device)
+
+        self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet")
+
+        prompt = "toy_face of a hacker with a hoodie"
+
+        lora_scale = 0.9
+
+        images = pipe(
+            prompt=prompt,
+            num_inference_steps=30,
+            generator=torch.manual_seed(0),
+            cross_attention_kwargs={"scale": lora_scale},
+            output_type="np",
+        ).images
+        expected_slice_scale = np.array([0.538, 0.539, 0.540, 0.540, 0.542, 0.539, 0.538, 0.541, 0.539])
+
+        predicted_slice = images[0, -3:, -3:, -1].flatten()
+        max_diff = numpy_cosine_similarity_distance(expected_slice_scale, predicted_slice)
+        assert max_diff < 1e-3
+
+        pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipe.set_adapters("pixel")
+
+        prompt = "pixel art, a hacker with a hoodie, simple, flat colors"
+        images = pipe(
+            prompt,
+            num_inference_steps=30,
+            guidance_scale=7.5,
+            cross_attention_kwargs={"scale": lora_scale},
+            generator=torch.manual_seed(0),
+            output_type="np",
+        ).images
+
+        predicted_slice = images[0, -3:, -3:, -1].flatten()
+        expected_slice_scale = np.array(
+            [0.61973065, 0.62018543, 0.62181497, 0.61933696, 0.6208608, 0.620576, 0.6200281, 0.62258327, 0.6259889]
+        )
+        max_diff = numpy_cosine_similarity_distance(expected_slice_scale, predicted_slice)
+        assert max_diff < 1e-3
+
+        # multi-adapter inference
+        pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
+        images = pipe(
+            prompt,
+            num_inference_steps=30,
+            guidance_scale=7.5,
+            cross_attention_kwargs={"scale": 1.0},
+            generator=torch.manual_seed(0),
+            output_type="np",
+        ).images
+        predicted_slice = images[0, -3:, -3:, -1].flatten()
+        expected_slice_scale = np.array([0.5888, 0.5897, 0.5946, 0.5888, 0.5935, 0.5946, 0.5857, 0.5891, 0.5909])
+        max_diff = numpy_cosine_similarity_distance(expected_slice_scale, predicted_slice)
+        assert max_diff < 1e-3
+
+        # Lora disabled
+        pipe.disable_lora()
+        images = pipe(
+            prompt,
+            num_inference_steps=30,
+            guidance_scale=7.5,
+            cross_attention_kwargs={"scale": lora_scale},
+            generator=torch.manual_seed(0),
+            output_type="np",
+        ).images
+        predicted_slice = images[0, -3:, -3:, -1].flatten()
+        expected_slice_scale = np.array([0.5456, 0.5466, 0.5487, 0.5458, 0.5469, 0.5454, 0.5446, 0.5479, 0.5487])
+        max_diff = numpy_cosine_similarity_distance(expected_slice_scale, predicted_slice)
+        assert max_diff < 1e-3
+
+    @nightly
+    def test_integration_logits_for_dora_lora(self):
+        pipeline = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipeline.load_lora_weights("hf-internal-testing/dora-trained-on-kohya")
+        pipeline.enable_model_cpu_offload()
+
+        images = pipeline(
+            "photo of ohwx dog",
+            num_inference_steps=10,
+            generator=torch.manual_seed(0),
+            output_type="np",
+        ).images
+
+        predicted_slice = images[0, -3:, -3:, -1].flatten()
+        expected_slice_scale = np.array([0.3932, 0.3742, 0.4429, 0.3737, 0.3504, 0.433, 0.3948, 0.3769, 0.4516])
+        max_diff = numpy_cosine_similarity_distance(expected_slice_scale, predicted_slice)
+        assert max_diff < 1e-3
diff --git a/diffusers/tests/lora/utils.py b/diffusers/tests/lora/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..43c45daaa322cc0b3c1ad77d158143375af84328
--- /dev/null
+++ b/diffusers/tests/lora/utils.py
@@ -0,0 +1,1696 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import os
+import tempfile
+import unittest
+from itertools import product
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    LCMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.utils.import_utils import is_peft_available
+from diffusers.utils.testing_utils import (
+    floats_tensor,
+    require_peft_backend,
+    require_peft_version_greater,
+    skip_mps,
+    torch_device,
+)
+
+
+if is_peft_available():
+    from peft import LoraConfig
+    from peft.tuners.tuners_utils import BaseTunerLayer
+    from peft.utils import get_peft_model_state_dict
+
+
+def state_dicts_almost_equal(sd1, sd2):
+    sd1 = dict(sorted(sd1.items()))
+    sd2 = dict(sorted(sd2.items()))
+
+    models_are_equal = True
+    for ten1, ten2 in zip(sd1.values(), sd2.values()):
+        if (ten1 - ten2).abs().max() > 1e-3:
+            models_are_equal = False
+
+    return models_are_equal
+
+
+def check_if_lora_correctly_set(model) -> bool:
+    """
+    Checks if the LoRA layers are correctly set with peft
+    """
+    for module in model.modules():
+        if isinstance(module, BaseTunerLayer):
+            return True
+    return False
+
+
+@require_peft_backend
+class PeftLoraLoaderMixinTests:
+    pipeline_class = None
+
+    scheduler_cls = None
+    scheduler_kwargs = None
+    scheduler_classes = [DDIMScheduler, LCMScheduler]
+
+    has_two_text_encoders = False
+    has_three_text_encoders = False
+    text_encoder_cls, text_encoder_id = None, None
+    text_encoder_2_cls, text_encoder_2_id = None, None
+    text_encoder_3_cls, text_encoder_3_id = None, None
+    tokenizer_cls, tokenizer_id = None, None
+    tokenizer_2_cls, tokenizer_2_id = None, None
+    tokenizer_3_cls, tokenizer_3_id = None, None
+
+    unet_kwargs = None
+    transformer_cls = None
+    transformer_kwargs = None
+    vae_cls = AutoencoderKL
+    vae_kwargs = None
+
+    text_encoder_target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"]
+
+    def get_dummy_components(self, scheduler_cls=None, use_dora=False):
+        if self.unet_kwargs and self.transformer_kwargs:
+            raise ValueError("Both `unet_kwargs` and `transformer_kwargs` cannot be specified.")
+        if self.has_two_text_encoders and self.has_three_text_encoders:
+            raise ValueError("Both `has_two_text_encoders` and `has_three_text_encoders` cannot be True.")
+
+        scheduler_cls = self.scheduler_cls if scheduler_cls is None else scheduler_cls
+        rank = 4
+
+        torch.manual_seed(0)
+        if self.unet_kwargs is not None:
+            unet = UNet2DConditionModel(**self.unet_kwargs)
+        else:
+            transformer = self.transformer_cls(**self.transformer_kwargs)
+
+        scheduler = scheduler_cls(**self.scheduler_kwargs)
+
+        torch.manual_seed(0)
+        vae = self.vae_cls(**self.vae_kwargs)
+
+        text_encoder = self.text_encoder_cls.from_pretrained(self.text_encoder_id)
+        tokenizer = self.tokenizer_cls.from_pretrained(self.tokenizer_id)
+
+        if self.text_encoder_2_cls is not None:
+            text_encoder_2 = self.text_encoder_2_cls.from_pretrained(self.text_encoder_2_id)
+            tokenizer_2 = self.tokenizer_2_cls.from_pretrained(self.tokenizer_2_id)
+
+        if self.text_encoder_3_cls is not None:
+            text_encoder_3 = self.text_encoder_3_cls.from_pretrained(self.text_encoder_3_id)
+            tokenizer_3 = self.tokenizer_3_cls.from_pretrained(self.tokenizer_3_id)
+
+        text_lora_config = LoraConfig(
+            r=rank,
+            lora_alpha=rank,
+            target_modules=self.text_encoder_target_modules,
+            init_lora_weights=False,
+            use_dora=use_dora,
+        )
+
+        denoiser_lora_config = LoraConfig(
+            r=rank,
+            lora_alpha=rank,
+            target_modules=["to_q", "to_k", "to_v", "to_out.0"],
+            init_lora_weights=False,
+            use_dora=use_dora,
+        )
+
+        pipeline_components = {
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        # Denoiser
+        if self.unet_kwargs is not None:
+            pipeline_components.update({"unet": unet})
+        elif self.transformer_kwargs is not None:
+            pipeline_components.update({"transformer": transformer})
+
+        # Remaining text encoders.
+        if self.text_encoder_2_cls is not None:
+            pipeline_components.update({"tokenizer_2": tokenizer_2, "text_encoder_2": text_encoder_2})
+        if self.text_encoder_3_cls is not None:
+            pipeline_components.update({"tokenizer_3": tokenizer_3, "text_encoder_3": text_encoder_3})
+
+        # Remaining stuff
+        init_params = inspect.signature(self.pipeline_class.__init__).parameters
+        if "safety_checker" in init_params:
+            pipeline_components.update({"safety_checker": None})
+        if "feature_extractor" in init_params:
+            pipeline_components.update({"feature_extractor": None})
+        if "image_encoder" in init_params:
+            pipeline_components.update({"image_encoder": None})
+
+        return pipeline_components, text_lora_config, denoiser_lora_config
+
+    @property
+    def output_shape(self):
+        raise NotImplementedError
+
+    def get_dummy_inputs(self, with_generator=True):
+        batch_size = 1
+        sequence_length = 10
+        num_channels = 4
+        sizes = (32, 32)
+
+        generator = torch.manual_seed(0)
+        noise = floats_tensor((batch_size, num_channels) + sizes)
+        input_ids = torch.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
+
+        pipeline_inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "num_inference_steps": 5,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        if with_generator:
+            pipeline_inputs.update({"generator": generator})
+
+        return noise, input_ids, pipeline_inputs
+
+    # Copied from: https://colab.research.google.com/gist/sayakpaul/df2ef6e1ae6d8c10a49d859883b10860/scratchpad.ipynb
+    def get_dummy_tokens(self):
+        max_seq_length = 77
+
+        inputs = torch.randint(2, 56, size=(1, max_seq_length), generator=torch.manual_seed(0))
+
+        prepared_inputs = {}
+        prepared_inputs["input_ids"] = inputs
+        return prepared_inputs
+
+    def test_simple_inference(self):
+        """
+        Tests a simple inference and makes sure it works as expected
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, _ = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+
+            _, _, inputs = self.get_dummy_inputs()
+            output_no_lora = pipe(**inputs)[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+    def test_simple_inference_with_text_lora(self):
+        """
+        Tests a simple inference with lora attached on the text encoder
+        and makes sure it works as expected
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, _ = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(
+                not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output"
+            )
+
+    def test_simple_inference_with_text_lora_and_scale(self):
+        """
+        Tests a simple inference with lora attached on the text encoder + scale argument
+        and makes sure it works as expected
+        """
+        call_signature_keys = inspect.signature(self.pipeline_class.__call__).parameters.keys()
+
+        # TODO(diffusers): Discuss a common naming convention across library for 1.0.0 release
+        for possible_attention_kwargs in ["cross_attention_kwargs", "joint_attention_kwargs", "attention_kwargs"]:
+            if possible_attention_kwargs in call_signature_keys:
+                attention_kwargs_name = possible_attention_kwargs
+                break
+        assert attention_kwargs_name is not None
+
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, _ = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(
+                not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output"
+            )
+
+            attention_kwargs = {attention_kwargs_name: {"scale": 0.5}}
+            output_lora_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0]
+
+            self.assertTrue(
+                not np.allclose(output_lora, output_lora_scale, atol=1e-3, rtol=1e-3),
+                "Lora + scale should change the output",
+            )
+
+            attention_kwargs = {attention_kwargs_name: {"scale": 0.0}}
+            output_lora_0_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0]
+
+            self.assertTrue(
+                np.allclose(output_no_lora, output_lora_0_scale, atol=1e-3, rtol=1e-3),
+                "Lora + 0 scale should lead to same result as no LoRA",
+            )
+
+    def test_simple_inference_with_text_lora_fused(self):
+        """
+        Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model
+        and makes sure it works as expected
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, _ = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            pipe.fuse_lora()
+            # Fusing should still keep the LoRA layers
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            ouput_fused = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertFalse(
+                np.allclose(ouput_fused, output_no_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output"
+            )
+
+    def test_simple_inference_with_text_lora_unloaded(self):
+        """
+        Tests a simple inference with lora attached to text encoder, then unloads the lora weights
+        and makes sure it works as expected
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, _ = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            pipe.unload_lora_weights()
+            # unloading should remove the LoRA layers
+            self.assertFalse(
+                check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly unloaded in text encoder"
+            )
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    self.assertFalse(
+                        check_if_lora_correctly_set(pipe.text_encoder_2),
+                        "Lora not correctly unloaded in text encoder 2",
+                    )
+
+            ouput_unloaded = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(
+                np.allclose(ouput_unloaded, output_no_lora, atol=1e-3, rtol=1e-3),
+                "Fused lora should change the output",
+            )
+
+    def test_simple_inference_with_text_lora_save_load(self):
+        """
+        Tests a simple usecase where users could use saving utilities for LoRA.
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, _ = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                text_encoder_state_dict = get_peft_model_state_dict(pipe.text_encoder)
+                if self.has_two_text_encoders or self.has_three_text_encoders:
+                    if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                        text_encoder_2_state_dict = get_peft_model_state_dict(pipe.text_encoder_2)
+
+                        self.pipeline_class.save_lora_weights(
+                            save_directory=tmpdirname,
+                            text_encoder_lora_layers=text_encoder_state_dict,
+                            text_encoder_2_lora_layers=text_encoder_2_state_dict,
+                            safe_serialization=False,
+                        )
+                else:
+                    self.pipeline_class.save_lora_weights(
+                        save_directory=tmpdirname,
+                        text_encoder_lora_layers=text_encoder_state_dict,
+                        safe_serialization=False,
+                    )
+
+                if self.has_two_text_encoders:
+                    if "text_encoder_2" not in self.pipeline_class._lora_loadable_modules:
+                        self.pipeline_class.save_lora_weights(
+                            save_directory=tmpdirname,
+                            text_encoder_lora_layers=text_encoder_state_dict,
+                            safe_serialization=False,
+                        )
+
+                self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
+                pipe.unload_lora_weights()
+
+                pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))
+
+            images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            self.assertTrue(
+                np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3),
+                "Loading from saved checkpoints should give same results.",
+            )
+
+    def test_simple_inference_with_partial_text_lora(self):
+        """
+        Tests a simple inference with lora attached on the text encoder
+        with different ranks and some adapters removed
+        and makes sure it works as expected
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, _, _ = self.get_dummy_components(scheduler_cls)
+            # Verify `StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder` handles different ranks per module (PR#8324).
+            text_lora_config = LoraConfig(
+                r=4,
+                rank_pattern={"q_proj": 1, "k_proj": 2, "v_proj": 3},
+                lora_alpha=4,
+                target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+                init_lora_weights=False,
+                use_dora=False,
+            )
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+            # Gather the state dict for the PEFT model, excluding `layers.4`, to ensure `load_lora_into_text_encoder`
+            # supports missing layers (PR#8324).
+            state_dict = {
+                f"text_encoder.{module_name}": param
+                for module_name, param in get_peft_model_state_dict(pipe.text_encoder).items()
+                if "text_model.encoder.layers.4" not in module_name
+            }
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+                    state_dict.update(
+                        {
+                            f"text_encoder_2.{module_name}": param
+                            for module_name, param in get_peft_model_state_dict(pipe.text_encoder_2).items()
+                            if "text_model.encoder.layers.4" not in module_name
+                        }
+                    )
+
+            output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(
+                not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output"
+            )
+
+            # Unload lora and load it back using the pipe.load_lora_weights machinery
+            pipe.unload_lora_weights()
+            pipe.load_lora_weights(state_dict)
+
+            output_partial_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(
+                not np.allclose(output_partial_lora, output_lora, atol=1e-3, rtol=1e-3),
+                "Removing adapters should change the output",
+            )
+
+    def test_simple_inference_save_pretrained(self):
+        """
+        Tests a simple usecase where users could use saving utilities for LoRA through save_pretrained
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, _ = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                pipe.save_pretrained(tmpdirname)
+
+                pipe_from_pretrained = self.pipeline_class.from_pretrained(tmpdirname)
+                pipe_from_pretrained.to(torch_device)
+
+            self.assertTrue(
+                check_if_lora_correctly_set(pipe_from_pretrained.text_encoder),
+                "Lora not correctly set in text encoder",
+            )
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe_from_pretrained.text_encoder_2),
+                        "Lora not correctly set in text encoder 2",
+                    )
+
+            images_lora_save_pretrained = pipe_from_pretrained(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertTrue(
+                np.allclose(images_lora, images_lora_save_pretrained, atol=1e-3, rtol=1e-3),
+                "Loading from saved checkpoints should give same results.",
+            )
+
+    def test_simple_inference_with_text_denoiser_lora_save_load(self):
+        """
+        Tests a simple usecase where users could use saving utilities for LoRA for Unet + text encoder
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                text_encoder_state_dict = (
+                    get_peft_model_state_dict(pipe.text_encoder)
+                    if "text_encoder" in self.pipeline_class._lora_loadable_modules
+                    else None
+                )
+
+                denoiser_state_dict = get_peft_model_state_dict(denoiser)
+
+                saving_kwargs = {
+                    "save_directory": tmpdirname,
+                    "safe_serialization": False,
+                }
+
+                if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                    saving_kwargs.update({"text_encoder_lora_layers": text_encoder_state_dict})
+
+                if self.unet_kwargs is not None:
+                    saving_kwargs.update({"unet_lora_layers": denoiser_state_dict})
+                else:
+                    saving_kwargs.update({"transformer_lora_layers": denoiser_state_dict})
+
+                if self.has_two_text_encoders or self.has_three_text_encoders:
+                    if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                        text_encoder_2_state_dict = get_peft_model_state_dict(pipe.text_encoder_2)
+                        saving_kwargs.update({"text_encoder_2_lora_layers": text_encoder_2_state_dict})
+
+                self.pipeline_class.save_lora_weights(**saving_kwargs)
+
+                self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
+                pipe.unload_lora_weights()
+
+                pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))
+
+            images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            self.assertTrue(
+                np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3),
+                "Loading from saved checkpoints should give same results.",
+            )
+
+    def test_simple_inference_with_text_denoiser_lora_and_scale(self):
+        """
+        Tests a simple inference with lora attached on the text encoder + Unet + scale argument
+        and makes sure it works as expected
+        """
+        call_signature_keys = inspect.signature(self.pipeline_class.__call__).parameters.keys()
+        for possible_attention_kwargs in ["cross_attention_kwargs", "joint_attention_kwargs", "attention_kwargs"]:
+            if possible_attention_kwargs in call_signature_keys:
+                attention_kwargs_name = possible_attention_kwargs
+                break
+        assert attention_kwargs_name is not None
+
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(
+                not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output"
+            )
+
+            attention_kwargs = {attention_kwargs_name: {"scale": 0.5}}
+            output_lora_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0]
+
+            self.assertTrue(
+                not np.allclose(output_lora, output_lora_scale, atol=1e-3, rtol=1e-3),
+                "Lora + scale should change the output",
+            )
+
+            attention_kwargs = {attention_kwargs_name: {"scale": 0.0}}
+            output_lora_0_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0]
+
+            self.assertTrue(
+                np.allclose(output_no_lora, output_lora_0_scale, atol=1e-3, rtol=1e-3),
+                "Lora + 0 scale should lead to same result as no LoRA",
+            )
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                self.assertTrue(
+                    pipe.text_encoder.text_model.encoder.layers[0].self_attn.q_proj.scaling["default"] == 1.0,
+                    "The scaling parameter has not been correctly restored!",
+                )
+
+    def test_simple_inference_with_text_lora_denoiser_fused(self):
+        """
+        Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model
+        and makes sure it works as expected - with unet
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules)
+
+            # Fusing should still keep the LoRA layers
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            output_fused = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertFalse(
+                np.allclose(output_fused, output_no_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output"
+            )
+
+    def test_simple_inference_with_text_denoiser_lora_unloaded(self):
+        """
+        Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights
+        and makes sure it works as expected
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            pipe.unload_lora_weights()
+            # unloading should remove the LoRA layers
+            self.assertFalse(
+                check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly unloaded in text encoder"
+            )
+            self.assertFalse(check_if_lora_correctly_set(denoiser), "Lora not correctly unloaded in denoiser")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    self.assertFalse(
+                        check_if_lora_correctly_set(pipe.text_encoder_2),
+                        "Lora not correctly unloaded in text encoder 2",
+                    )
+
+            output_unloaded = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(
+                np.allclose(output_unloaded, output_no_lora, atol=1e-3, rtol=1e-3),
+                "Fused lora should change the output",
+            )
+
+    def test_simple_inference_with_text_denoiser_lora_unfused(
+        self, expected_atol: float = 1e-3, expected_rtol: float = 1e-3
+    ):
+        """
+        Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights
+        and makes sure it works as expected
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules)
+            output_fused_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            pipe.unfuse_lora(components=self.pipeline_class._lora_loadable_modules)
+            output_unfused_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            # unloading should remove the LoRA layers
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Unfuse should still keep LoRA layers")
+
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Unfuse should still keep LoRA layers")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Unfuse should still keep LoRA layers"
+                    )
+
+            # Fuse and unfuse should lead to the same results
+            self.assertTrue(
+                np.allclose(output_fused_lora, output_unfused_lora, atol=expected_atol, rtol=expected_rtol),
+                "Fused lora should not change the output",
+            )
+
+    def test_simple_inference_with_text_denoiser_multi_adapter(self):
+        """
+        Tests a simple inference with lora attached to text encoder and unet, attaches
+        multiple adapters and set them
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-2")
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+            denoiser.add_adapter(denoiser_lora_config, "adapter-2")
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1")
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2")
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            pipe.set_adapters("adapter-1")
+            output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertFalse(
+                np.allclose(output_no_lora, output_adapter_1, atol=1e-3, rtol=1e-3),
+                "Adapter outputs should be different.",
+            )
+
+            pipe.set_adapters("adapter-2")
+            output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertFalse(
+                np.allclose(output_no_lora, output_adapter_2, atol=1e-3, rtol=1e-3),
+                "Adapter outputs should be different.",
+            )
+
+            pipe.set_adapters(["adapter-1", "adapter-2"])
+            output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertFalse(
+                np.allclose(output_no_lora, output_adapter_mixed, atol=1e-3, rtol=1e-3),
+                "Adapter outputs should be different.",
+            )
+
+            # Fuse and unfuse should lead to the same results
+            self.assertFalse(
+                np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3),
+                "Adapter 1 and 2 should give different results",
+            )
+
+            self.assertFalse(
+                np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3),
+                "Adapter 1 and mixed adapters should give different results",
+            )
+
+            self.assertFalse(
+                np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3),
+                "Adapter 2 and mixed adapters should give different results",
+            )
+
+            pipe.disable_lora()
+            output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertTrue(
+                np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3),
+                "output with no lora and output with lora disabled should give same results",
+            )
+
+    def test_wrong_adapter_name_raises_error(self):
+        scheduler_cls = self.scheduler_classes[0]
+        components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+        if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+            pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+        denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+        denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+        self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+        if self.has_two_text_encoders or self.has_three_text_encoders:
+            if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1")
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                )
+
+        with self.assertRaises(ValueError) as err_context:
+            pipe.set_adapters("test")
+
+        self.assertTrue("not in the list of present adapters" in str(err_context.exception))
+
+        # test this works.
+        pipe.set_adapters("adapter-1")
+        _ = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+    def test_simple_inference_with_text_denoiser_block_scale(self):
+        """
+        Tests a simple inference with lora attached to text encoder and unet, attaches
+        one adapter and set different weights for different blocks (i.e. block lora)
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1")
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            weights_1 = {"text_encoder": 2, "unet": {"down": 5}}
+            pipe.set_adapters("adapter-1", weights_1)
+            output_weights_1 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            weights_2 = {"unet": {"up": 5}}
+            pipe.set_adapters("adapter-1", weights_2)
+            output_weights_2 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertFalse(
+                np.allclose(output_weights_1, output_weights_2, atol=1e-3, rtol=1e-3),
+                "LoRA weights 1 and 2 should give different results",
+            )
+            self.assertFalse(
+                np.allclose(output_no_lora, output_weights_1, atol=1e-3, rtol=1e-3),
+                "No adapter and LoRA weights 1 should give different results",
+            )
+            self.assertFalse(
+                np.allclose(output_no_lora, output_weights_2, atol=1e-3, rtol=1e-3),
+                "No adapter and LoRA weights 2 should give different results",
+            )
+
+            pipe.disable_lora()
+            output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertTrue(
+                np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3),
+                "output with no lora and output with lora disabled should give same results",
+            )
+
+    def test_simple_inference_with_text_denoiser_multi_adapter_block_lora(self):
+        """
+        Tests a simple inference with lora attached to text encoder and unet, attaches
+        multiple adapters and set differnt weights for different blocks (i.e. block lora)
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-2")
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+            denoiser.add_adapter(denoiser_lora_config, "adapter-2")
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1")
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2")
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            scales_1 = {"text_encoder": 2, "unet": {"down": 5}}
+            scales_2 = {"unet": {"down": 5, "mid": 5}}
+
+            pipe.set_adapters("adapter-1", scales_1)
+            output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            pipe.set_adapters("adapter-2", scales_2)
+            output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            pipe.set_adapters(["adapter-1", "adapter-2"], [scales_1, scales_2])
+            output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            # Fuse and unfuse should lead to the same results
+            self.assertFalse(
+                np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3),
+                "Adapter 1 and 2 should give different results",
+            )
+
+            self.assertFalse(
+                np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3),
+                "Adapter 1 and mixed adapters should give different results",
+            )
+
+            self.assertFalse(
+                np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3),
+                "Adapter 2 and mixed adapters should give different results",
+            )
+
+            pipe.disable_lora()
+            output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertTrue(
+                np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3),
+                "output with no lora and output with lora disabled should give same results",
+            )
+
+            # a mismatching number of adapter_names and adapter_weights should raise an error
+            with self.assertRaises(ValueError):
+                pipe.set_adapters(["adapter-1", "adapter-2"], [scales_1])
+
+    def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self):
+        """Tests that any valid combination of lora block scales can be used in pipe.set_adapter"""
+
+        def updown_options(blocks_with_tf, layers_per_block, value):
+            """
+            Generate every possible combination for how a lora weight dict for the up/down part can be.
+            E.g. 2, {"block_1": 2}, {"block_1": [2,2,2]}, {"block_1": 2, "block_2": [2,2,2]}, ...
+            """
+            num_val = value
+            list_val = [value] * layers_per_block
+
+            node_opts = [None, num_val, list_val]
+            node_opts_foreach_block = [node_opts] * len(blocks_with_tf)
+
+            updown_opts = [num_val]
+            for nodes in product(*node_opts_foreach_block):
+                if all(n is None for n in nodes):
+                    continue
+                opt = {}
+                for b, n in zip(blocks_with_tf, nodes):
+                    if n is not None:
+                        opt["block_" + str(b)] = n
+                updown_opts.append(opt)
+            return updown_opts
+
+        def all_possible_dict_opts(unet, value):
+            """
+            Generate every possible combination for how a lora weight dict can be.
+            E.g. 2, {"unet: {"down": 2}}, {"unet: {"down": [2,2,2]}}, {"unet: {"mid": 2, "up": [2,2,2]}}, ...
+            """
+
+            down_blocks_with_tf = [i for i, d in enumerate(unet.down_blocks) if hasattr(d, "attentions")]
+            up_blocks_with_tf = [i for i, u in enumerate(unet.up_blocks) if hasattr(u, "attentions")]
+
+            layers_per_block = unet.config.layers_per_block
+
+            text_encoder_opts = [None, value]
+            text_encoder_2_opts = [None, value]
+            mid_opts = [None, value]
+            down_opts = [None] + updown_options(down_blocks_with_tf, layers_per_block, value)
+            up_opts = [None] + updown_options(up_blocks_with_tf, layers_per_block + 1, value)
+
+            opts = []
+
+            for t1, t2, d, m, u in product(text_encoder_opts, text_encoder_2_opts, down_opts, mid_opts, up_opts):
+                if all(o is None for o in (t1, t2, d, m, u)):
+                    continue
+                opt = {}
+                if t1 is not None:
+                    opt["text_encoder"] = t1
+                if t2 is not None:
+                    opt["text_encoder_2"] = t2
+                if all(o is None for o in (d, m, u)):
+                    # no unet scaling
+                    continue
+                opt["unet"] = {}
+                if d is not None:
+                    opt["unet"]["down"] = d
+                if m is not None:
+                    opt["unet"]["mid"] = m
+                if u is not None:
+                    opt["unet"]["up"] = u
+                opts.append(opt)
+
+            return opts
+
+        components, text_lora_config, denoiser_lora_config = self.get_dummy_components(self.scheduler_cls)
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+        pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+
+        denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+        denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+
+        if self.has_two_text_encoders or self.has_three_text_encoders:
+            lora_loadable_components = self.pipeline_class._lora_loadable_modules
+            if "text_encoder_2" in lora_loadable_components:
+                pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1")
+
+        for scale_dict in all_possible_dict_opts(pipe.unet, value=1234):
+            # test if lora block scales can be set with this scale_dict
+            if not self.has_two_text_encoders and "text_encoder_2" in scale_dict:
+                del scale_dict["text_encoder_2"]
+
+            pipe.set_adapters("adapter-1", scale_dict)  # test will fail if this line throws an error
+
+    def test_simple_inference_with_text_denoiser_multi_adapter_delete_adapter(self):
+        """
+        Tests a simple inference with lora attached to text encoder and unet, attaches
+        multiple adapters and set/delete them
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-2")
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+            denoiser.add_adapter(denoiser_lora_config, "adapter-2")
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1")
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2")
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            pipe.set_adapters("adapter-1")
+            output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            pipe.set_adapters("adapter-2")
+            output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            pipe.set_adapters(["adapter-1", "adapter-2"])
+            output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertFalse(
+                np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3),
+                "Adapter 1 and 2 should give different results",
+            )
+
+            self.assertFalse(
+                np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3),
+                "Adapter 1 and mixed adapters should give different results",
+            )
+
+            self.assertFalse(
+                np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3),
+                "Adapter 2 and mixed adapters should give different results",
+            )
+
+            pipe.delete_adapters("adapter-1")
+            output_deleted_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertTrue(
+                np.allclose(output_deleted_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3),
+                "Adapter 1 and 2 should give different results",
+            )
+
+            pipe.delete_adapters("adapter-2")
+            output_deleted_adapters = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertTrue(
+                np.allclose(output_no_lora, output_deleted_adapters, atol=1e-3, rtol=1e-3),
+                "output with no lora and output with lora disabled should give same results",
+            )
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-2")
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+            denoiser.add_adapter(denoiser_lora_config, "adapter-2")
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            pipe.set_adapters(["adapter-1", "adapter-2"])
+            pipe.delete_adapters(["adapter-1", "adapter-2"])
+
+            output_deleted_adapters = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertTrue(
+                np.allclose(output_no_lora, output_deleted_adapters, atol=1e-3, rtol=1e-3),
+                "output with no lora and output with lora disabled should give same results",
+            )
+
+    def test_simple_inference_with_text_denoiser_multi_adapter_weighted(self):
+        """
+        Tests a simple inference with lora attached to text encoder and unet, attaches
+        multiple adapters and set them
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-2")
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+            denoiser.add_adapter(denoiser_lora_config, "adapter-2")
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1")
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2")
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            pipe.set_adapters("adapter-1")
+            output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            pipe.set_adapters("adapter-2")
+            output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            pipe.set_adapters(["adapter-1", "adapter-2"])
+            output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            # Fuse and unfuse should lead to the same results
+            self.assertFalse(
+                np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3),
+                "Adapter 1 and 2 should give different results",
+            )
+
+            self.assertFalse(
+                np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3),
+                "Adapter 1 and mixed adapters should give different results",
+            )
+
+            self.assertFalse(
+                np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3),
+                "Adapter 2 and mixed adapters should give different results",
+            )
+
+            pipe.set_adapters(["adapter-1", "adapter-2"], [0.5, 0.6])
+            output_adapter_mixed_weighted = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertFalse(
+                np.allclose(output_adapter_mixed_weighted, output_adapter_mixed, atol=1e-3, rtol=1e-3),
+                "Weighted adapter and mixed adapter should give different results",
+            )
+
+            pipe.disable_lora()
+            output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertTrue(
+                np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3),
+                "output with no lora and output with lora disabled should give same results",
+            )
+
+    @skip_mps
+    def test_lora_fuse_nan(self):
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            # corrupt one LoRA weight with `inf` values
+            with torch.no_grad():
+                if self.unet_kwargs:
+                    pipe.unet.mid_block.attentions[0].transformer_blocks[0].attn1.to_q.lora_A[
+                        "adapter-1"
+                    ].weight += float("inf")
+                else:
+                    pipe.transformer.transformer_blocks[0].attn.to_q.lora_A["adapter-1"].weight += float("inf")
+
+            # with `safe_fusing=True` we should see an Error
+            with self.assertRaises(ValueError):
+                pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True)
+
+            # without we should not see an error, but every image will be black
+            pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False)
+            out = pipe("test", num_inference_steps=2, output_type="np")[0]
+
+            self.assertTrue(np.isnan(out).all())
+
+    def test_get_adapters(self):
+        """
+        Tests a simple usecase where we attach multiple adapters and check if the results
+        are the expected results
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+
+            adapter_names = pipe.get_active_adapters()
+            self.assertListEqual(adapter_names, ["adapter-1"])
+
+            pipe.text_encoder.add_adapter(text_lora_config, "adapter-2")
+            denoiser.add_adapter(denoiser_lora_config, "adapter-2")
+
+            adapter_names = pipe.get_active_adapters()
+            self.assertListEqual(adapter_names, ["adapter-2"])
+
+            pipe.set_adapters(["adapter-1", "adapter-2"])
+            self.assertListEqual(pipe.get_active_adapters(), ["adapter-1", "adapter-2"])
+
+    def test_get_list_adapters(self):
+        """
+        Tests a simple usecase where we attach multiple adapters and check if the results
+        are the expected results
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+
+            # 1.
+            dicts_to_be_checked = {}
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+                dicts_to_be_checked = {"text_encoder": ["adapter-1"]}
+
+            if self.unet_kwargs is not None:
+                pipe.unet.add_adapter(denoiser_lora_config, "adapter-1")
+                dicts_to_be_checked.update({"unet": ["adapter-1"]})
+            else:
+                pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1")
+                dicts_to_be_checked.update({"transformer": ["adapter-1"]})
+
+            self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked)
+
+            # 2.
+            dicts_to_be_checked = {}
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-2")
+                dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]}
+
+            if self.unet_kwargs is not None:
+                pipe.unet.add_adapter(denoiser_lora_config, "adapter-2")
+                dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2"]})
+            else:
+                pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2")
+                dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2"]})
+
+            self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked)
+
+            # 3.
+            pipe.set_adapters(["adapter-1", "adapter-2"])
+
+            dicts_to_be_checked = {}
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]}
+
+            if self.unet_kwargs is not None:
+                dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2"]})
+            else:
+                dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2"]})
+
+            self.assertDictEqual(
+                pipe.get_list_adapters(),
+                dicts_to_be_checked,
+            )
+
+            # 4.
+            dicts_to_be_checked = {}
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]}
+
+            if self.unet_kwargs is not None:
+                pipe.unet.add_adapter(denoiser_lora_config, "adapter-3")
+                dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2", "adapter-3"]})
+            else:
+                pipe.transformer.add_adapter(denoiser_lora_config, "adapter-3")
+                dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2", "adapter-3"]})
+
+            self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked)
+
+    @require_peft_version_greater(peft_version="0.6.2")
+    def test_simple_inference_with_text_lora_denoiser_fused_multi(
+        self, expected_atol: float = 1e-3, expected_rtol: float = 1e-3
+    ):
+        """
+        Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model
+        and makes sure it works as expected - with unet and multi-adapter case
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_lora.shape == self.output_shape)
+
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+
+            # Attach a second adapter
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config, "adapter-2")
+
+            denoiser.add_adapter(denoiser_lora_config, "adapter-2")
+
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1")
+                    pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2")
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            # set them to multi-adapter inference mode
+            pipe.set_adapters(["adapter-1", "adapter-2"])
+            outputs_all_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            pipe.set_adapters(["adapter-1"])
+            outputs_lora_1 = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, adapter_names=["adapter-1"])
+
+            # Fusing should still keep the LoRA layers so outpout should remain the same
+            outputs_lora_1_fused = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertTrue(
+                np.allclose(outputs_lora_1, outputs_lora_1_fused, atol=expected_atol, rtol=expected_rtol),
+                "Fused lora should not change the output",
+            )
+
+            pipe.unfuse_lora(components=self.pipeline_class._lora_loadable_modules)
+            pipe.fuse_lora(
+                components=self.pipeline_class._lora_loadable_modules, adapter_names=["adapter-2", "adapter-1"]
+            )
+
+            # Fusing should still keep the LoRA layers
+            output_all_lora_fused = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(
+                np.allclose(output_all_lora_fused, outputs_all_lora, atol=expected_atol, rtol=expected_rtol),
+                "Fused lora should not change the output",
+            )
+
+    @require_peft_version_greater(peft_version="0.9.0")
+    def test_simple_inference_with_dora(self):
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(
+                scheduler_cls, use_dora=True
+            )
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            output_no_dora_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+            self.assertTrue(output_no_dora_lora.shape == self.output_shape)
+
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
+            output_dora_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+            self.assertFalse(
+                np.allclose(output_dora_lora, output_no_dora_lora, atol=1e-3, rtol=1e-3),
+                "DoRA lora should change the output",
+            )
+
+    @unittest.skip("This is failing for now - need to investigate")
+    def test_simple_inference_with_text_denoiser_lora_unfused_torch_compile(self):
+        """
+        Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights
+        and makes sure it works as expected
+        """
+        for scheduler_cls in self.scheduler_classes:
+            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                pipe.text_encoder_2.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                )
+
+            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+            pipe.text_encoder = torch.compile(pipe.text_encoder, mode="reduce-overhead", fullgraph=True)
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                pipe.text_encoder_2 = torch.compile(pipe.text_encoder_2, mode="reduce-overhead", fullgraph=True)
+
+            # Just makes sure it works..
+            _ = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+    def test_modify_padding_mode(self):
+        def set_pad_mode(network, mode="circular"):
+            for _, module in network.named_modules():
+                if isinstance(module, torch.nn.Conv2d):
+                    module.padding_mode = mode
+
+        for scheduler_cls in self.scheduler_classes:
+            components, _, _ = self.get_dummy_components(scheduler_cls)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(torch_device)
+            pipe.set_progress_bar_config(disable=None)
+            _pad_mode = "circular"
+            set_pad_mode(pipe.vae, _pad_mode)
+            set_pad_mode(pipe.unet, _pad_mode)
+
+            _, _, inputs = self.get_dummy_inputs()
+            _ = pipe(**inputs)[0]
diff --git a/diffusers/tests/models/__init__.py b/diffusers/tests/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/models/autoencoders/__init__.py b/diffusers/tests/models/autoencoders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/models/autoencoders/test_models_vae.py b/diffusers/tests/models/autoencoders/test_models_vae.py
new file mode 100644
index 0000000000000000000000000000000000000000..0188f9121ae04d0afcf529ef5c8e96738d9d1057
--- /dev/null
+++ b/diffusers/tests/models/autoencoders/test_models_vae.py
@@ -0,0 +1,1270 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from datasets import load_dataset
+from parameterized import parameterized
+
+from diffusers import (
+    AsymmetricAutoencoderKL,
+    AutoencoderKL,
+    AutoencoderKLTemporalDecoder,
+    AutoencoderOobleck,
+    AutoencoderTiny,
+    ConsistencyDecoderVAE,
+    StableDiffusionPipeline,
+)
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.loading_utils import load_image
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    floats_tensor,
+    load_hf_numpy,
+    require_torch_accelerator,
+    require_torch_accelerator_with_fp16,
+    require_torch_accelerator_with_training,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_all_close,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+
+enable_full_determinism()
+
+
+def get_autoencoder_kl_config(block_out_channels=None, norm_num_groups=None):
+    block_out_channels = block_out_channels or [2, 4]
+    norm_num_groups = norm_num_groups or 2
+    init_dict = {
+        "block_out_channels": block_out_channels,
+        "in_channels": 3,
+        "out_channels": 3,
+        "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels),
+        "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels),
+        "latent_channels": 4,
+        "norm_num_groups": norm_num_groups,
+    }
+    return init_dict
+
+
+def get_asym_autoencoder_kl_config(block_out_channels=None, norm_num_groups=None):
+    block_out_channels = block_out_channels or [2, 4]
+    norm_num_groups = norm_num_groups or 2
+    init_dict = {
+        "in_channels": 3,
+        "out_channels": 3,
+        "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels),
+        "down_block_out_channels": block_out_channels,
+        "layers_per_down_block": 1,
+        "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels),
+        "up_block_out_channels": block_out_channels,
+        "layers_per_up_block": 1,
+        "act_fn": "silu",
+        "latent_channels": 4,
+        "norm_num_groups": norm_num_groups,
+        "sample_size": 32,
+        "scaling_factor": 0.18215,
+    }
+    return init_dict
+
+
+def get_autoencoder_tiny_config(block_out_channels=None):
+    block_out_channels = (len(block_out_channels) * [32]) if block_out_channels is not None else [32, 32]
+    init_dict = {
+        "in_channels": 3,
+        "out_channels": 3,
+        "encoder_block_out_channels": block_out_channels,
+        "decoder_block_out_channels": block_out_channels,
+        "num_encoder_blocks": [b // min(block_out_channels) for b in block_out_channels],
+        "num_decoder_blocks": [b // min(block_out_channels) for b in reversed(block_out_channels)],
+    }
+    return init_dict
+
+
+def get_consistency_vae_config(block_out_channels=None, norm_num_groups=None):
+    block_out_channels = block_out_channels or [2, 4]
+    norm_num_groups = norm_num_groups or 2
+    return {
+        "encoder_block_out_channels": block_out_channels,
+        "encoder_in_channels": 3,
+        "encoder_out_channels": 4,
+        "encoder_down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels),
+        "decoder_add_attention": False,
+        "decoder_block_out_channels": block_out_channels,
+        "decoder_down_block_types": ["ResnetDownsampleBlock2D"] * len(block_out_channels),
+        "decoder_downsample_padding": 1,
+        "decoder_in_channels": 7,
+        "decoder_layers_per_block": 1,
+        "decoder_norm_eps": 1e-05,
+        "decoder_norm_num_groups": norm_num_groups,
+        "encoder_norm_num_groups": norm_num_groups,
+        "decoder_num_train_timesteps": 1024,
+        "decoder_out_channels": 6,
+        "decoder_resnet_time_scale_shift": "scale_shift",
+        "decoder_time_embedding_type": "learned",
+        "decoder_up_block_types": ["ResnetUpsampleBlock2D"] * len(block_out_channels),
+        "scaling_factor": 1,
+        "latent_channels": 4,
+    }
+
+
+def get_autoencoder_oobleck_config(block_out_channels=None):
+    init_dict = {
+        "encoder_hidden_size": 12,
+        "decoder_channels": 12,
+        "decoder_input_channels": 6,
+        "audio_channels": 2,
+        "downsampling_ratios": [2, 4],
+        "channel_multiples": [1, 2],
+    }
+    return init_dict
+
+
+class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = AutoencoderKL
+    main_input_name = "sample"
+    base_precision = 1e-2
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+
+        return {"sample": image}
+
+    @property
+    def input_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (3, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = get_autoencoder_kl_config()
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_forward_signature(self):
+        pass
+
+    def test_training(self):
+        pass
+
+    @require_torch_accelerator_with_training
+    def test_gradient_checkpointing(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        assert not model.is_gradient_checkpointing and model.training
+
+        out = model(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model.zero_grad()
+
+        labels = torch.randn_like(out)
+        loss = (out - labels).mean()
+        loss.backward()
+
+        # re-instantiate the model now enabling gradient checkpointing
+        model_2 = self.model_class(**init_dict)
+        # clone model
+        model_2.load_state_dict(model.state_dict())
+        model_2.to(torch_device)
+        model_2.enable_gradient_checkpointing()
+
+        assert model_2.is_gradient_checkpointing and model_2.training
+
+        out_2 = model_2(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model_2.zero_grad()
+        loss_2 = (out_2 - labels).mean()
+        loss_2.backward()
+
+        # compare the output and parameters gradients
+        self.assertTrue((loss - loss_2).abs() < 1e-5)
+        named_params = dict(model.named_parameters())
+        named_params_2 = dict(model_2.named_parameters())
+        for name, param in named_params.items():
+            self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
+
+    def test_from_pretrained_hub(self):
+        model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True)
+        self.assertIsNotNone(model)
+        self.assertEqual(len(loading_info["missing_keys"]), 0)
+
+        model.to(torch_device)
+        image = model(**self.dummy_input)
+
+        assert image is not None, "Make sure output is not None"
+
+    def test_output_pretrained(self):
+        model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy")
+        model = model.to(torch_device)
+        model.eval()
+
+        # Keep generator on CPU for non-CUDA devices to compare outputs with CPU result tensors
+        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        if torch_device != "mps":
+            generator = torch.Generator(device=generator_device).manual_seed(0)
+        else:
+            generator = torch.manual_seed(0)
+
+        image = torch.randn(
+            1,
+            model.config.in_channels,
+            model.config.sample_size,
+            model.config.sample_size,
+            generator=torch.manual_seed(0),
+        )
+        image = image.to(torch_device)
+        with torch.no_grad():
+            output = model(image, sample_posterior=True, generator=generator).sample
+
+        output_slice = output[0, -1, -3:, -3:].flatten().cpu()
+
+        # Since the VAE Gaussian prior's generator is seeded on the appropriate device,
+        # the expected output slices are not the same for CPU and GPU.
+        if torch_device == "mps":
+            expected_output_slice = torch.tensor(
+                [
+                    -4.0078e-01,
+                    -3.8323e-04,
+                    -1.2681e-01,
+                    -1.1462e-01,
+                    2.0095e-01,
+                    1.0893e-01,
+                    -8.8247e-02,
+                    -3.0361e-01,
+                    -9.8644e-03,
+                ]
+            )
+        elif generator_device == "cpu":
+            expected_output_slice = torch.tensor(
+                [
+                    -0.1352,
+                    0.0878,
+                    0.0419,
+                    -0.0818,
+                    -0.1069,
+                    0.0688,
+                    -0.1458,
+                    -0.4446,
+                    -0.0026,
+                ]
+            )
+        else:
+            expected_output_slice = torch.tensor(
+                [
+                    -0.2421,
+                    0.4642,
+                    0.2507,
+                    -0.0438,
+                    0.0682,
+                    0.3160,
+                    -0.2018,
+                    -0.0727,
+                    0.2485,
+                ]
+            )
+
+        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
+
+
+class AsymmetricAutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = AsymmetricAutoencoderKL
+    main_input_name = "sample"
+    base_precision = 1e-2
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+        mask = torch.ones((batch_size, 1) + sizes).to(torch_device)
+
+        return {"sample": image, "mask": mask}
+
+    @property
+    def input_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (3, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = get_asym_autoencoder_kl_config()
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_forward_signature(self):
+        pass
+
+    def test_forward_with_norm_groups(self):
+        pass
+
+
+class AutoencoderTinyTests(ModelTesterMixin, unittest.TestCase):
+    model_class = AutoencoderTiny
+    main_input_name = "sample"
+    base_precision = 1e-2
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+
+        return {"sample": image}
+
+    @property
+    def input_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (3, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = get_autoencoder_tiny_config()
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_outputs_equivalence(self):
+        pass
+
+
+class ConsistencyDecoderVAETests(ModelTesterMixin, unittest.TestCase):
+    model_class = ConsistencyDecoderVAE
+    main_input_name = "sample"
+    base_precision = 1e-2
+    forward_requires_fresh_args = True
+
+    def inputs_dict(self, seed=None):
+        if seed is None:
+            generator = torch.Generator("cpu").manual_seed(0)
+        else:
+            generator = torch.Generator("cpu").manual_seed(seed)
+        image = randn_tensor((4, 3, 32, 32), generator=generator, device=torch.device(torch_device))
+
+        return {"sample": image, "generator": generator}
+
+    @property
+    def input_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def init_dict(self):
+        return get_consistency_vae_config()
+
+    def prepare_init_args_and_inputs_for_common(self):
+        return self.init_dict, self.inputs_dict()
+
+    @unittest.skip
+    def test_training(self):
+        ...
+
+    @unittest.skip
+    def test_ema_training(self):
+        ...
+
+
+class AutoencoderKLTemporalDecoderFastTests(ModelTesterMixin, unittest.TestCase):
+    model_class = AutoencoderKLTemporalDecoder
+    main_input_name = "sample"
+    base_precision = 1e-2
+
+    @property
+    def dummy_input(self):
+        batch_size = 3
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+        num_frames = 3
+
+        return {"sample": image, "num_frames": num_frames}
+
+    @property
+    def input_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (3, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": [32, 64],
+            "in_channels": 3,
+            "out_channels": 3,
+            "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            "latent_channels": 4,
+            "layers_per_block": 2,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_forward_signature(self):
+        pass
+
+    def test_training(self):
+        pass
+
+    @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS")
+    def test_gradient_checkpointing(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        assert not model.is_gradient_checkpointing and model.training
+
+        out = model(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model.zero_grad()
+
+        labels = torch.randn_like(out)
+        loss = (out - labels).mean()
+        loss.backward()
+
+        # re-instantiate the model now enabling gradient checkpointing
+        model_2 = self.model_class(**init_dict)
+        # clone model
+        model_2.load_state_dict(model.state_dict())
+        model_2.to(torch_device)
+        model_2.enable_gradient_checkpointing()
+
+        assert model_2.is_gradient_checkpointing and model_2.training
+
+        out_2 = model_2(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model_2.zero_grad()
+        loss_2 = (out_2 - labels).mean()
+        loss_2.backward()
+
+        # compare the output and parameters gradients
+        self.assertTrue((loss - loss_2).abs() < 1e-5)
+        named_params = dict(model.named_parameters())
+        named_params_2 = dict(model_2.named_parameters())
+        for name, param in named_params.items():
+            if "post_quant_conv" in name:
+                continue
+
+            self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
+
+
+class AutoencoderOobleckTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = AutoencoderOobleck
+    main_input_name = "sample"
+    base_precision = 1e-2
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 2
+        seq_len = 24
+
+        waveform = floats_tensor((batch_size, num_channels, seq_len)).to(torch_device)
+
+        return {"sample": waveform, "sample_posterior": False}
+
+    @property
+    def input_shape(self):
+        return (2, 24)
+
+    @property
+    def output_shape(self):
+        return (2, 24)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = get_autoencoder_oobleck_config()
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_forward_signature(self):
+        pass
+
+    def test_forward_with_norm_groups(self):
+        pass
+
+    @unittest.skip("No attention module used in this model")
+    def test_set_attn_processor_for_determinism(self):
+        return
+
+
+@slow
+class AutoencoderTinyIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def get_file_format(self, seed, shape):
+        return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
+
+    def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
+        dtype = torch.float16 if fp16 else torch.float32
+        image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
+        return image
+
+    def get_sd_vae_model(self, model_id="hf-internal-testing/taesd-diffusers", fp16=False):
+        torch_dtype = torch.float16 if fp16 else torch.float32
+
+        model = AutoencoderTiny.from_pretrained(model_id, torch_dtype=torch_dtype)
+        model.to(torch_device).eval()
+        return model
+
+    @parameterized.expand(
+        [
+            [(1, 4, 73, 97), (1, 3, 584, 776)],
+            [(1, 4, 97, 73), (1, 3, 776, 584)],
+            [(1, 4, 49, 65), (1, 3, 392, 520)],
+            [(1, 4, 65, 49), (1, 3, 520, 392)],
+            [(1, 4, 49, 49), (1, 3, 392, 392)],
+        ]
+    )
+    def test_tae_tiling(self, in_shape, out_shape):
+        model = self.get_sd_vae_model()
+        model.enable_tiling()
+        with torch.no_grad():
+            zeros = torch.zeros(in_shape).to(torch_device)
+            dec = model.decode(zeros).sample
+            assert dec.shape == out_shape
+
+    def test_stable_diffusion(self):
+        model = self.get_sd_vae_model()
+        image = self.get_sd_image(seed=33)
+
+        with torch.no_grad():
+            sample = model(image).sample
+
+        assert sample.shape == image.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor([0.0093, 0.6385, -0.1274, 0.1631, -0.1762, 0.5232, -0.3108, -0.0382])
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=3e-3)
+
+    @parameterized.expand([(True,), (False,)])
+    def test_tae_roundtrip(self, enable_tiling):
+        # load the autoencoder
+        model = self.get_sd_vae_model()
+        if enable_tiling:
+            model.enable_tiling()
+
+        # make a black image with a white square in the middle,
+        # which is large enough to split across multiple tiles
+        image = -torch.ones(1, 3, 1024, 1024, device=torch_device)
+        image[..., 256:768, 256:768] = 1.0
+
+        # round-trip the image through the autoencoder
+        with torch.no_grad():
+            sample = model(image).sample
+
+        # the autoencoder reconstruction should match original image, sorta
+        def downscale(x):
+            return torch.nn.functional.avg_pool2d(x, model.spatial_scale_factor)
+
+        assert torch_all_close(downscale(sample), downscale(image), atol=0.125)
+
+
+@slow
+class AutoencoderKLIntegrationTests(unittest.TestCase):
+    def get_file_format(self, seed, shape):
+        return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
+        dtype = torch.float16 if fp16 else torch.float32
+        image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
+        return image
+
+    def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False):
+        revision = "fp16" if fp16 else None
+        torch_dtype = torch.float16 if fp16 else torch.float32
+
+        model = AutoencoderKL.from_pretrained(
+            model_id,
+            subfolder="vae",
+            torch_dtype=torch_dtype,
+            revision=revision,
+        )
+        model.to(torch_device)
+
+        return model
+
+    def get_generator(self, seed=0):
+        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        if torch_device != "mps":
+            return torch.Generator(device=generator_device).manual_seed(seed)
+        return torch.manual_seed(seed)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [
+                33,
+                [-0.1556, 0.9848, -0.0410, -0.0642, -0.2685, 0.8381, -0.2004, -0.0700],
+                [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824],
+            ],
+            [
+                47,
+                [-0.2376, 0.1200, 0.1337, -0.4830, -0.2504, -0.0759, -0.0486, -0.4077],
+                [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131],
+            ],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps):
+        model = self.get_sd_vae_model()
+        image = self.get_sd_image(seed)
+        generator = self.get_generator(seed)
+
+        with torch.no_grad():
+            sample = model(image, generator=generator, sample_posterior=True).sample
+
+        assert sample.shape == image.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=3e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]],
+            [47, [-0.4128, -0.1320, -0.3704, 0.1965, -0.4116, -0.2332, -0.3340, 0.2247]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator_with_fp16
+    def test_stable_diffusion_fp16(self, seed, expected_slice):
+        model = self.get_sd_vae_model(fp16=True)
+        image = self.get_sd_image(seed, fp16=True)
+        generator = self.get_generator(seed)
+
+        with torch.no_grad():
+            sample = model(image, generator=generator, sample_posterior=True).sample
+
+        assert sample.shape == image.shape
+
+        output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-2)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [
+                33,
+                [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814],
+                [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824],
+            ],
+            [
+                47,
+                [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085],
+                [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131],
+            ],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps):
+        model = self.get_sd_vae_model()
+        image = self.get_sd_image(seed)
+
+        with torch.no_grad():
+            sample = model(image).sample
+
+        assert sample.shape == image.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=3e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [13, [-0.2051, -0.1803, -0.2311, -0.2114, -0.3292, -0.3574, -0.2953, -0.3323]],
+            [37, [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.4990, -0.3720, -0.4925]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator
+    @skip_mps
+    def test_stable_diffusion_decode(self, seed, expected_slice):
+        model = self.get_sd_vae_model()
+        encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64))
+
+        with torch.no_grad():
+            sample = model.decode(encoding).sample
+
+        assert list(sample.shape) == [3, 3, 512, 512]
+
+        output_slice = sample[-1, -2:, :2, -2:].flatten().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [27, [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.1930, -0.1465, -0.2039]],
+            [16, [-0.1628, -0.2134, -0.2747, -0.2642, -0.3774, -0.4404, -0.3687, -0.4277]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator_with_fp16
+    def test_stable_diffusion_decode_fp16(self, seed, expected_slice):
+        model = self.get_sd_vae_model(fp16=True)
+        encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True)
+
+        with torch.no_grad():
+            sample = model.decode(encoding).sample
+
+        assert list(sample.shape) == [3, 3, 512, 512]
+
+        output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
+
+    @parameterized.expand([(13,), (16,), (27,)])
+    @require_torch_gpu
+    @unittest.skipIf(
+        not is_xformers_available(),
+        reason="xformers is not required when using PyTorch 2.0.",
+    )
+    def test_stable_diffusion_decode_xformers_vs_2_0_fp16(self, seed):
+        model = self.get_sd_vae_model(fp16=True)
+        encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True)
+
+        with torch.no_grad():
+            sample = model.decode(encoding).sample
+
+        model.enable_xformers_memory_efficient_attention()
+        with torch.no_grad():
+            sample_2 = model.decode(encoding).sample
+
+        assert list(sample.shape) == [3, 3, 512, 512]
+
+        assert torch_all_close(sample, sample_2, atol=1e-1)
+
+    @parameterized.expand([(13,), (16,), (37,)])
+    @require_torch_gpu
+    @unittest.skipIf(
+        not is_xformers_available(),
+        reason="xformers is not required when using PyTorch 2.0.",
+    )
+    def test_stable_diffusion_decode_xformers_vs_2_0(self, seed):
+        model = self.get_sd_vae_model()
+        encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64))
+
+        with torch.no_grad():
+            sample = model.decode(encoding).sample
+
+        model.enable_xformers_memory_efficient_attention()
+        with torch.no_grad():
+            sample_2 = model.decode(encoding).sample
+
+        assert list(sample.shape) == [3, 3, 512, 512]
+
+        assert torch_all_close(sample, sample_2, atol=1e-2)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]],
+            [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion_encode_sample(self, seed, expected_slice):
+        model = self.get_sd_vae_model()
+        image = self.get_sd_image(seed)
+        generator = self.get_generator(seed)
+
+        with torch.no_grad():
+            dist = model.encode(image).latent_dist
+            sample = dist.sample(generator=generator)
+
+        assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]]
+
+        output_slice = sample[0, -1, -3:, -3:].flatten().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        tolerance = 3e-3 if torch_device != "mps" else 1e-2
+        assert torch_all_close(output_slice, expected_output_slice, atol=tolerance)
+
+
+@slow
+class AsymmetricAutoencoderKLIntegrationTests(unittest.TestCase):
+    def get_file_format(self, seed, shape):
+        return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
+        dtype = torch.float16 if fp16 else torch.float32
+        image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
+        return image
+
+    def get_sd_vae_model(self, model_id="cross-attention/asymmetric-autoencoder-kl-x-1-5", fp16=False):
+        revision = "main"
+        torch_dtype = torch.float32
+
+        model = AsymmetricAutoencoderKL.from_pretrained(
+            model_id,
+            torch_dtype=torch_dtype,
+            revision=revision,
+        )
+        model.to(torch_device).eval()
+
+        return model
+
+    def get_generator(self, seed=0):
+        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        if torch_device != "mps":
+            return torch.Generator(device=generator_device).manual_seed(seed)
+        return torch.manual_seed(seed)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [
+                33,
+                [-0.0336, 0.3011, 0.1764, 0.0087, -0.3401, 0.3645, -0.1247, 0.1205],
+                [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824],
+            ],
+            [
+                47,
+                [0.4400, 0.0543, 0.2873, 0.2946, 0.0553, 0.0839, -0.1585, 0.2529],
+                [-0.2376, 0.1168, 0.1332, -0.4840, -0.2508, -0.0791, -0.0493, -0.4089],
+            ],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps):
+        model = self.get_sd_vae_model()
+        image = self.get_sd_image(seed)
+        generator = self.get_generator(seed)
+
+        with torch.no_grad():
+            sample = model(image, generator=generator, sample_posterior=True).sample
+
+        assert sample.shape == image.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [
+                33,
+                [-0.0340, 0.2870, 0.1698, -0.0105, -0.3448, 0.3529, -0.1321, 0.1097],
+                [-0.0344, 0.2912, 0.1687, -0.0137, -0.3462, 0.3552, -0.1337, 0.1078],
+            ],
+            [
+                47,
+                [0.4397, 0.0550, 0.2873, 0.2946, 0.0567, 0.0855, -0.1580, 0.2531],
+                [0.4397, 0.0550, 0.2873, 0.2946, 0.0567, 0.0855, -0.1580, 0.2531],
+            ],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps):
+        model = self.get_sd_vae_model()
+        image = self.get_sd_image(seed)
+
+        with torch.no_grad():
+            sample = model(image).sample
+
+        assert sample.shape == image.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=3e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [13, [-0.0521, -0.2939, 0.1540, -0.1855, -0.5936, -0.3138, -0.4579, -0.2275]],
+            [37, [-0.1820, -0.4345, -0.0455, -0.2923, -0.8035, -0.5089, -0.4795, -0.3106]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator
+    @skip_mps
+    def test_stable_diffusion_decode(self, seed, expected_slice):
+        model = self.get_sd_vae_model()
+        encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64))
+
+        with torch.no_grad():
+            sample = model.decode(encoding).sample
+
+        assert list(sample.shape) == [3, 3, 512, 512]
+
+        output_slice = sample[-1, -2:, :2, -2:].flatten().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=2e-3)
+
+    @parameterized.expand([(13,), (16,), (37,)])
+    @require_torch_gpu
+    @unittest.skipIf(
+        not is_xformers_available(),
+        reason="xformers is not required when using PyTorch 2.0.",
+    )
+    def test_stable_diffusion_decode_xformers_vs_2_0(self, seed):
+        model = self.get_sd_vae_model()
+        encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64))
+
+        with torch.no_grad():
+            sample = model.decode(encoding).sample
+
+        model.enable_xformers_memory_efficient_attention()
+        with torch.no_grad():
+            sample_2 = model.decode(encoding).sample
+
+        assert list(sample.shape) == [3, 3, 512, 512]
+
+        assert torch_all_close(sample, sample_2, atol=5e-2)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]],
+            [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion_encode_sample(self, seed, expected_slice):
+        model = self.get_sd_vae_model()
+        image = self.get_sd_image(seed)
+        generator = self.get_generator(seed)
+
+        with torch.no_grad():
+            dist = model.encode(image).latent_dist
+            sample = dist.sample(generator=generator)
+
+        assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]]
+
+        output_slice = sample[0, -1, -3:, -3:].flatten().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        tolerance = 3e-3 if torch_device != "mps" else 1e-2
+        assert torch_all_close(output_slice, expected_output_slice, atol=tolerance)
+
+
+@slow
+class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @torch.no_grad()
+    def test_encode_decode(self):
+        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder")  # TODO - update
+        vae.to(torch_device)
+
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        ).resize((256, 256))
+        image = torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :].to(
+            torch_device
+        )
+
+        latent = vae.encode(image).latent_dist.mean
+
+        sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample
+
+        actual_output = sample[0, :2, :2, :2].flatten().cpu()
+        expected_output = torch.tensor([-0.0141, -0.0014, 0.0115, 0.0086, 0.1051, 0.1053, 0.1031, 0.1024])
+
+        assert torch_all_close(actual_output, expected_output, atol=5e-3)
+
+    def test_sd(self):
+        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder")  # TODO - update
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, safety_checker=None
+        )
+        pipe.to(torch_device)
+
+        out = pipe(
+            "horse",
+            num_inference_steps=2,
+            output_type="pt",
+            generator=torch.Generator("cpu").manual_seed(0),
+        ).images[0]
+
+        actual_output = out[:2, :2, :2].flatten().cpu()
+        expected_output = torch.tensor([0.7686, 0.8228, 0.6489, 0.7455, 0.8661, 0.8797, 0.8241, 0.8759])
+
+        assert torch_all_close(actual_output, expected_output, atol=5e-3)
+
+    def test_encode_decode_f16(self):
+        vae = ConsistencyDecoderVAE.from_pretrained(
+            "openai/consistency-decoder", torch_dtype=torch.float16
+        )  # TODO - update
+        vae.to(torch_device)
+
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        ).resize((256, 256))
+        image = (
+            torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :]
+            .half()
+            .to(torch_device)
+        )
+
+        latent = vae.encode(image).latent_dist.mean
+
+        sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample
+
+        actual_output = sample[0, :2, :2, :2].flatten().cpu()
+        expected_output = torch.tensor(
+            [-0.0111, -0.0125, -0.0017, -0.0007, 0.1257, 0.1465, 0.1450, 0.1471],
+            dtype=torch.float16,
+        )
+
+        assert torch_all_close(actual_output, expected_output, atol=5e-3)
+
+    def test_sd_f16(self):
+        vae = ConsistencyDecoderVAE.from_pretrained(
+            "openai/consistency-decoder", torch_dtype=torch.float16
+        )  # TODO - update
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            torch_dtype=torch.float16,
+            vae=vae,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+
+        out = pipe(
+            "horse",
+            num_inference_steps=2,
+            output_type="pt",
+            generator=torch.Generator("cpu").manual_seed(0),
+        ).images[0]
+
+        actual_output = out[:2, :2, :2].flatten().cpu()
+        expected_output = torch.tensor(
+            [0.0000, 0.0249, 0.0000, 0.0000, 0.1709, 0.2773, 0.0471, 0.1035],
+            dtype=torch.float16,
+        )
+
+        assert torch_all_close(actual_output, expected_output, atol=5e-3)
+
+    def test_vae_tiling(self):
+        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        out_1 = pipe(
+            "horse",
+            num_inference_steps=2,
+            output_type="pt",
+            generator=torch.Generator("cpu").manual_seed(0),
+        ).images[0]
+
+        # make sure tiled vae decode yields the same result
+        pipe.enable_vae_tiling()
+        out_2 = pipe(
+            "horse",
+            num_inference_steps=2,
+            output_type="pt",
+            generator=torch.Generator("cpu").manual_seed(0),
+        ).images[0]
+
+        assert torch_all_close(out_1, out_2, atol=5e-3)
+
+        # test that tiled decode works with various shapes
+        shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
+        with torch.no_grad():
+            for shape in shapes:
+                image = torch.zeros(shape, device=torch_device, dtype=pipe.vae.dtype)
+                pipe.vae.decode(image)
+
+
+@slow
+class AutoencoderOobleckIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def _load_datasamples(self, num_samples):
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+
+        return torch.nn.utils.rnn.pad_sequence(
+            [torch.from_numpy(x["array"]) for x in speech_samples], batch_first=True
+        )
+
+    def get_audio(self, audio_sample_size=2097152, fp16=False):
+        dtype = torch.float16 if fp16 else torch.float32
+        audio = self._load_datasamples(2).to(torch_device).to(dtype)
+
+        # pad / crop to audio_sample_size
+        audio = torch.nn.functional.pad(audio[:, :audio_sample_size], pad=(0, audio_sample_size - audio.shape[-1]))
+
+        # todo channel
+        audio = audio.unsqueeze(1).repeat(1, 2, 1).to(torch_device)
+
+        return audio
+
+    def get_oobleck_vae_model(self, model_id="stabilityai/stable-audio-open-1.0", fp16=False):
+        torch_dtype = torch.float16 if fp16 else torch.float32
+
+        model = AutoencoderOobleck.from_pretrained(
+            model_id,
+            subfolder="vae",
+            torch_dtype=torch_dtype,
+        )
+        model.to(torch_device)
+
+        return model
+
+    def get_generator(self, seed=0):
+        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        if torch_device != "mps":
+            return torch.Generator(device=generator_device).manual_seed(seed)
+        return torch.manual_seed(seed)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192],
+            [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion(self, seed, expected_slice, expected_mean_absolute_diff):
+        model = self.get_oobleck_vae_model()
+        audio = self.get_audio()
+        generator = self.get_generator(seed)
+
+        with torch.no_grad():
+            sample = model(audio, generator=generator, sample_posterior=True).sample
+
+        assert sample.shape == audio.shape
+        assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6
+
+        output_slice = sample[-1, 1, 5:10].cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-5)
+
+    def test_stable_diffusion_mode(self):
+        model = self.get_oobleck_vae_model()
+        audio = self.get_audio()
+
+        with torch.no_grad():
+            sample = model(audio, sample_posterior=False).sample
+
+        assert sample.shape == audio.shape
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192],
+            [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion_encode_decode(self, seed, expected_slice, expected_mean_absolute_diff):
+        model = self.get_oobleck_vae_model()
+        audio = self.get_audio()
+        generator = self.get_generator(seed)
+
+        with torch.no_grad():
+            x = audio
+            posterior = model.encode(x).latent_dist
+            z = posterior.sample(generator=generator)
+            sample = model.decode(z).sample
+
+        # (batch_size, latent_dim, sequence_length)
+        assert posterior.mean.shape == (audio.shape[0], model.config.decoder_input_channels, 1024)
+
+        assert sample.shape == audio.shape
+        assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6
+
+        output_slice = sample[-1, 1, 5:10].cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-5)
diff --git a/diffusers/tests/models/autoencoders/test_models_vae_flax.py b/diffusers/tests/models/autoencoders/test_models_vae_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fedb85eccfc73e9a0900f7bb947887da3ffe4e9
--- /dev/null
+++ b/diffusers/tests/models/autoencoders/test_models_vae_flax.py
@@ -0,0 +1,39 @@
+import unittest
+
+from diffusers import FlaxAutoencoderKL
+from diffusers.utils import is_flax_available
+from diffusers.utils.testing_utils import require_flax
+
+from ..test_modeling_common_flax import FlaxModelTesterMixin
+
+
+if is_flax_available():
+    import jax
+
+
+@require_flax
+class FlaxAutoencoderKLTests(FlaxModelTesterMixin, unittest.TestCase):
+    model_class = FlaxAutoencoderKL
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 3
+        sizes = (32, 32)
+
+        prng_key = jax.random.PRNGKey(0)
+        image = jax.random.uniform(prng_key, ((batch_size, num_channels) + sizes))
+
+        return {"sample": image, "prng_key": prng_key}
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": [32, 64],
+            "in_channels": 3,
+            "out_channels": 3,
+            "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            "latent_channels": 4,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
diff --git a/diffusers/tests/models/autoencoders/test_models_vq.py b/diffusers/tests/models/autoencoders/test_models_vq.py
new file mode 100644
index 0000000000000000000000000000000000000000..c61ae1bdf0ffdf842ca50721701e6db647c459b3
--- /dev/null
+++ b/diffusers/tests/models/autoencoders/test_models_vq.py
@@ -0,0 +1,116 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import VQModel
+from diffusers.utils.testing_utils import (
+    backend_manual_seed,
+    enable_full_determinism,
+    floats_tensor,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+
+enable_full_determinism()
+
+
+class VQModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = VQModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self, sizes=(32, 32)):
+        batch_size = 4
+        num_channels = 3
+
+        image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+
+        return {"sample": image}
+
+    @property
+    def input_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (3, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": [8, 16],
+            "norm_num_groups": 8,
+            "in_channels": 3,
+            "out_channels": 3,
+            "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            "latent_channels": 3,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_forward_signature(self):
+        pass
+
+    def test_training(self):
+        pass
+
+    def test_from_pretrained_hub(self):
+        model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True)
+        self.assertIsNotNone(model)
+        self.assertEqual(len(loading_info["missing_keys"]), 0)
+
+        model.to(torch_device)
+        image = model(**self.dummy_input)
+
+        assert image is not None, "Make sure output is not None"
+
+    def test_output_pretrained(self):
+        model = VQModel.from_pretrained("fusing/vqgan-dummy")
+        model.to(torch_device).eval()
+
+        torch.manual_seed(0)
+        backend_manual_seed(torch_device, 0)
+
+        image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+        image = image.to(torch_device)
+        with torch.no_grad():
+            output = model(image).sample
+
+        output_slice = output[0, -1, -3:, -3:].flatten().cpu()
+        # fmt: off
+        expected_output_slice = torch.tensor([-0.0153, -0.4044, -0.1880, -0.5161, -0.2418, -0.4072, -0.1612, -0.0633, -0.0143])
+        # fmt: on
+        self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
+
+    def test_loss_pretrained(self):
+        model = VQModel.from_pretrained("fusing/vqgan-dummy")
+        model.to(torch_device).eval()
+
+        torch.manual_seed(0)
+        backend_manual_seed(torch_device, 0)
+
+        image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+        image = image.to(torch_device)
+        with torch.no_grad():
+            output = model(image).commit_loss.cpu()
+        # fmt: off
+        expected_output = torch.tensor([0.1936])
+        # fmt: on
+        self.assertTrue(torch.allclose(output, expected_output, atol=1e-3))
diff --git a/diffusers/tests/models/test_activations.py b/diffusers/tests/models/test_activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e8e51453e98157a753fc178ce146849e189a5a1
--- /dev/null
+++ b/diffusers/tests/models/test_activations.py
@@ -0,0 +1,48 @@
+import unittest
+
+import torch
+from torch import nn
+
+from diffusers.models.activations import get_activation
+
+
+class ActivationsTests(unittest.TestCase):
+    def test_swish(self):
+        act = get_activation("swish")
+
+        self.assertIsInstance(act, nn.SiLU)
+
+        self.assertEqual(act(torch.tensor(-100, dtype=torch.float32)).item(), 0)
+        self.assertNotEqual(act(torch.tensor(-1, dtype=torch.float32)).item(), 0)
+        self.assertEqual(act(torch.tensor(0, dtype=torch.float32)).item(), 0)
+        self.assertEqual(act(torch.tensor(20, dtype=torch.float32)).item(), 20)
+
+    def test_silu(self):
+        act = get_activation("silu")
+
+        self.assertIsInstance(act, nn.SiLU)
+
+        self.assertEqual(act(torch.tensor(-100, dtype=torch.float32)).item(), 0)
+        self.assertNotEqual(act(torch.tensor(-1, dtype=torch.float32)).item(), 0)
+        self.assertEqual(act(torch.tensor(0, dtype=torch.float32)).item(), 0)
+        self.assertEqual(act(torch.tensor(20, dtype=torch.float32)).item(), 20)
+
+    def test_mish(self):
+        act = get_activation("mish")
+
+        self.assertIsInstance(act, nn.Mish)
+
+        self.assertEqual(act(torch.tensor(-200, dtype=torch.float32)).item(), 0)
+        self.assertNotEqual(act(torch.tensor(-1, dtype=torch.float32)).item(), 0)
+        self.assertEqual(act(torch.tensor(0, dtype=torch.float32)).item(), 0)
+        self.assertEqual(act(torch.tensor(20, dtype=torch.float32)).item(), 20)
+
+    def test_gelu(self):
+        act = get_activation("gelu")
+
+        self.assertIsInstance(act, nn.GELU)
+
+        self.assertEqual(act(torch.tensor(-100, dtype=torch.float32)).item(), 0)
+        self.assertNotEqual(act(torch.tensor(-1, dtype=torch.float32)).item(), 0)
+        self.assertEqual(act(torch.tensor(0, dtype=torch.float32)).item(), 0)
+        self.assertEqual(act(torch.tensor(20, dtype=torch.float32)).item(), 20)
diff --git a/diffusers/tests/models/test_attention_processor.py b/diffusers/tests/models/test_attention_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..2489604274b49c66e1f23b603b96c316618bbd7a
--- /dev/null
+++ b/diffusers/tests/models/test_attention_processor.py
@@ -0,0 +1,120 @@
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DiffusionPipeline
+from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor
+
+
+class AttnAddedKVProcessorTests(unittest.TestCase):
+    def get_constructor_arguments(self, only_cross_attention: bool = False):
+        query_dim = 10
+
+        if only_cross_attention:
+            cross_attention_dim = 12
+        else:
+            # when only cross attention is not set, the cross attention dim must be the same as the query dim
+            cross_attention_dim = query_dim
+
+        return {
+            "query_dim": query_dim,
+            "cross_attention_dim": cross_attention_dim,
+            "heads": 2,
+            "dim_head": 4,
+            "added_kv_proj_dim": 6,
+            "norm_num_groups": 1,
+            "only_cross_attention": only_cross_attention,
+            "processor": AttnAddedKVProcessor(),
+        }
+
+    def get_forward_arguments(self, query_dim, added_kv_proj_dim):
+        batch_size = 2
+
+        hidden_states = torch.rand(batch_size, query_dim, 3, 2)
+        encoder_hidden_states = torch.rand(batch_size, 4, added_kv_proj_dim)
+        attention_mask = None
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "attention_mask": attention_mask,
+        }
+
+    def test_only_cross_attention(self):
+        # self and cross attention
+
+        torch.manual_seed(0)
+
+        constructor_args = self.get_constructor_arguments(only_cross_attention=False)
+        attn = Attention(**constructor_args)
+
+        self.assertTrue(attn.to_k is not None)
+        self.assertTrue(attn.to_v is not None)
+
+        forward_args = self.get_forward_arguments(
+            query_dim=constructor_args["query_dim"], added_kv_proj_dim=constructor_args["added_kv_proj_dim"]
+        )
+
+        self_and_cross_attn_out = attn(**forward_args)
+
+        # only self attention
+
+        torch.manual_seed(0)
+
+        constructor_args = self.get_constructor_arguments(only_cross_attention=True)
+        attn = Attention(**constructor_args)
+
+        self.assertTrue(attn.to_k is None)
+        self.assertTrue(attn.to_v is None)
+
+        forward_args = self.get_forward_arguments(
+            query_dim=constructor_args["query_dim"], added_kv_proj_dim=constructor_args["added_kv_proj_dim"]
+        )
+
+        only_cross_attn_out = attn(**forward_args)
+
+        self.assertTrue((only_cross_attn_out != self_and_cross_attn_out).all())
+
+
+class DeprecatedAttentionBlockTests(unittest.TestCase):
+    def test_conversion_when_using_device_map(self):
+        pipe = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+
+        pre_conversion = pipe(
+            "foo",
+            num_inference_steps=2,
+            generator=torch.Generator("cpu").manual_seed(0),
+            output_type="np",
+        ).images
+
+        # the initial conversion succeeds
+        pipe = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", device_map="balanced", safety_checker=None
+        )
+
+        conversion = pipe(
+            "foo",
+            num_inference_steps=2,
+            generator=torch.Generator("cpu").manual_seed(0),
+            output_type="np",
+        ).images
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # save the converted model
+            pipe.save_pretrained(tmpdir)
+
+            # can also load the converted weights
+            pipe = DiffusionPipeline.from_pretrained(tmpdir, device_map="balanced", safety_checker=None)
+        after_conversion = pipe(
+            "foo",
+            num_inference_steps=2,
+            generator=torch.Generator("cpu").manual_seed(0),
+            output_type="np",
+        ).images
+
+        self.assertTrue(np.allclose(pre_conversion, conversion, atol=1e-3))
+        self.assertTrue(np.allclose(conversion, after_conversion, atol=1e-3))
diff --git a/diffusers/tests/models/test_layers_utils.py b/diffusers/tests/models/test_layers_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..66e142f8c66a3dce09741df636b680c08b3e8840
--- /dev/null
+++ b/diffusers/tests/models/test_layers_utils.py
@@ -0,0 +1,517 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+import numpy as np
+import torch
+from torch import nn
+
+from diffusers.models.attention import GEGLU, AdaLayerNorm, ApproximateGELU
+from diffusers.models.embeddings import get_timestep_embedding
+from diffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
+from diffusers.models.transformers.transformer_2d import Transformer2DModel
+from diffusers.utils.testing_utils import (
+    backend_manual_seed,
+    require_torch_accelerator_with_fp64,
+    torch_device,
+)
+
+
+class EmbeddingsTests(unittest.TestCase):
+    def test_timestep_embeddings(self):
+        embedding_dim = 256
+        timesteps = torch.arange(16)
+
+        t1 = get_timestep_embedding(timesteps, embedding_dim)
+
+        # first vector should always be composed only of 0's and 1's
+        assert (t1[0, : embedding_dim // 2] - 0).abs().sum() < 1e-5
+        assert (t1[0, embedding_dim // 2 :] - 1).abs().sum() < 1e-5
+
+        # last element of each vector should be one
+        assert (t1[:, -1] - 1).abs().sum() < 1e-5
+
+        # For large embeddings (e.g. 128) the frequency of every vector is higher
+        # than the previous one which means that the gradients of later vectors are
+        # ALWAYS higher than the previous ones
+        grad_mean = np.abs(np.gradient(t1, axis=-1)).mean(axis=1)
+
+        prev_grad = 0.0
+        for grad in grad_mean:
+            assert grad > prev_grad
+            prev_grad = grad
+
+    def test_timestep_flip_sin_cos(self):
+        embedding_dim = 16
+        timesteps = torch.arange(10)
+
+        t1 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=True)
+        t1 = torch.cat([t1[:, embedding_dim // 2 :], t1[:, : embedding_dim // 2]], dim=-1)
+
+        t2 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=False)
+
+        assert torch.allclose(t1.cpu(), t2.cpu(), 1e-3)
+
+    def test_timestep_downscale_freq_shift(self):
+        embedding_dim = 16
+        timesteps = torch.arange(10)
+
+        t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0)
+        t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1)
+
+        # get cosine half (vectors that are wrapped into cosine)
+        cosine_half = (t1 - t2)[:, embedding_dim // 2 :]
+
+        # cosine needs to be negative
+        assert (np.abs((cosine_half <= 0).numpy()) - 1).sum() < 1e-5
+
+    def test_sinoid_embeddings_hardcoded(self):
+        embedding_dim = 64
+        timesteps = torch.arange(128)
+
+        # standard unet, score_vde
+        t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1, flip_sin_to_cos=False)
+        # glide, ldm
+        t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0, flip_sin_to_cos=True)
+        # grad-tts
+        t3 = get_timestep_embedding(timesteps, embedding_dim, scale=1000)
+
+        assert torch.allclose(
+            t1[23:26, 47:50].flatten().cpu(),
+            torch.tensor([0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769, 0.9872]),
+            1e-3,
+        )
+        assert torch.allclose(
+            t2[23:26, 47:50].flatten().cpu(),
+            torch.tensor([0.3019, 0.2280, 0.1716, 0.3146, 0.2377, 0.1790, 0.3272, 0.2474, 0.1864]),
+            1e-3,
+        )
+        assert torch.allclose(
+            t3[23:26, 47:50].flatten().cpu(),
+            torch.tensor([-0.9801, -0.9464, -0.9349, -0.3952, 0.8887, -0.9709, 0.5299, -0.2853, -0.9927]),
+            1e-3,
+        )
+
+
+class Upsample2DBlockTests(unittest.TestCase):
+    def test_upsample_default(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 32, 32)
+        upsample = Upsample2D(channels=32, use_conv=False)
+        with torch.no_grad():
+            upsampled = upsample(sample)
+
+        assert upsampled.shape == (1, 32, 64, 64)
+        output_slice = upsampled[0, -1, -3:, -3:]
+        expected_slice = torch.tensor([-0.2173, -1.2079, -1.2079, 0.2952, 1.1254, 1.1254, 0.2952, 1.1254, 1.1254])
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_upsample_with_conv(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 32, 32)
+        upsample = Upsample2D(channels=32, use_conv=True)
+        with torch.no_grad():
+            upsampled = upsample(sample)
+
+        assert upsampled.shape == (1, 32, 64, 64)
+        output_slice = upsampled[0, -1, -3:, -3:]
+        expected_slice = torch.tensor([0.7145, 1.3773, 0.3492, 0.8448, 1.0839, -0.3341, 0.5956, 0.1250, -0.4841])
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_upsample_with_conv_out_dim(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 32, 32)
+        upsample = Upsample2D(channels=32, use_conv=True, out_channels=64)
+        with torch.no_grad():
+            upsampled = upsample(sample)
+
+        assert upsampled.shape == (1, 64, 64, 64)
+        output_slice = upsampled[0, -1, -3:, -3:]
+        expected_slice = torch.tensor([0.2703, 0.1656, -0.2538, -0.0553, -0.2984, 0.1044, 0.1155, 0.2579, 0.7755])
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_upsample_with_transpose(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 32, 32)
+        upsample = Upsample2D(channels=32, use_conv=False, use_conv_transpose=True)
+        with torch.no_grad():
+            upsampled = upsample(sample)
+
+        assert upsampled.shape == (1, 32, 64, 64)
+        output_slice = upsampled[0, -1, -3:, -3:]
+        expected_slice = torch.tensor([-0.3028, -0.1582, 0.0071, 0.0350, -0.4799, -0.1139, 0.1056, -0.1153, -0.1046])
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+
+class Downsample2DBlockTests(unittest.TestCase):
+    def test_downsample_default(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 64, 64)
+        downsample = Downsample2D(channels=32, use_conv=False)
+        with torch.no_grad():
+            downsampled = downsample(sample)
+
+        assert downsampled.shape == (1, 32, 32, 32)
+        output_slice = downsampled[0, -1, -3:, -3:]
+        expected_slice = torch.tensor([-0.0513, -0.3889, 0.0640, 0.0836, -0.5460, -0.0341, -0.0169, -0.6967, 0.1179])
+        max_diff = (output_slice.flatten() - expected_slice).abs().sum().item()
+        assert max_diff <= 1e-3
+        # assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-1)
+
+    def test_downsample_with_conv(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 64, 64)
+        downsample = Downsample2D(channels=32, use_conv=True)
+        with torch.no_grad():
+            downsampled = downsample(sample)
+
+        assert downsampled.shape == (1, 32, 32, 32)
+        output_slice = downsampled[0, -1, -3:, -3:]
+
+        expected_slice = torch.tensor(
+            [0.9267, 0.5878, 0.3337, 1.2321, -0.1191, -0.3984, -0.7532, -0.0715, -0.3913],
+        )
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_downsample_with_conv_pad1(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 64, 64)
+        downsample = Downsample2D(channels=32, use_conv=True, padding=1)
+        with torch.no_grad():
+            downsampled = downsample(sample)
+
+        assert downsampled.shape == (1, 32, 32, 32)
+        output_slice = downsampled[0, -1, -3:, -3:]
+        expected_slice = torch.tensor([0.9267, 0.5878, 0.3337, 1.2321, -0.1191, -0.3984, -0.7532, -0.0715, -0.3913])
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_downsample_with_conv_out_dim(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 64, 64)
+        downsample = Downsample2D(channels=32, use_conv=True, out_channels=16)
+        with torch.no_grad():
+            downsampled = downsample(sample)
+
+        assert downsampled.shape == (1, 16, 32, 32)
+        output_slice = downsampled[0, -1, -3:, -3:]
+        expected_slice = torch.tensor([-0.6586, 0.5985, 0.0721, 0.1256, -0.1492, 0.4436, -0.2544, 0.5021, 1.1522])
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+
+class ResnetBlock2DTests(unittest.TestCase):
+    def test_resnet_default(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 64, 64).to(torch_device)
+        temb = torch.randn(1, 128).to(torch_device)
+        resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128).to(torch_device)
+        with torch.no_grad():
+            output_tensor = resnet_block(sample, temb)
+
+        assert output_tensor.shape == (1, 32, 64, 64)
+        output_slice = output_tensor[0, -1, -3:, -3:]
+        expected_slice = torch.tensor(
+            [-1.9010, -0.2974, -0.8245, -1.3533, 0.8742, -0.9645, -2.0584, 1.3387, -0.4746], device=torch_device
+        )
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_restnet_with_use_in_shortcut(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 64, 64).to(torch_device)
+        temb = torch.randn(1, 128).to(torch_device)
+        resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, use_in_shortcut=True).to(torch_device)
+        with torch.no_grad():
+            output_tensor = resnet_block(sample, temb)
+
+        assert output_tensor.shape == (1, 32, 64, 64)
+        output_slice = output_tensor[0, -1, -3:, -3:]
+        expected_slice = torch.tensor(
+            [0.2226, -1.0791, -0.1629, 0.3659, -0.2889, -1.2376, 0.0582, 0.9206, 0.0044], device=torch_device
+        )
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_resnet_up(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 64, 64).to(torch_device)
+        temb = torch.randn(1, 128).to(torch_device)
+        resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, up=True).to(torch_device)
+        with torch.no_grad():
+            output_tensor = resnet_block(sample, temb)
+
+        assert output_tensor.shape == (1, 32, 128, 128)
+        output_slice = output_tensor[0, -1, -3:, -3:]
+        expected_slice = torch.tensor(
+            [1.2130, -0.8753, -0.9027, 1.5783, -0.5362, -0.5001, 1.0726, -0.7732, -0.4182], device=torch_device
+        )
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_resnet_down(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 64, 64).to(torch_device)
+        temb = torch.randn(1, 128).to(torch_device)
+        resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, down=True).to(torch_device)
+        with torch.no_grad():
+            output_tensor = resnet_block(sample, temb)
+
+        assert output_tensor.shape == (1, 32, 32, 32)
+        output_slice = output_tensor[0, -1, -3:, -3:]
+        expected_slice = torch.tensor(
+            [-0.3002, -0.7135, 0.1359, 0.0561, -0.7935, 0.0113, -0.1766, -0.6714, -0.0436], device=torch_device
+        )
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_restnet_with_kernel_fir(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 64, 64).to(torch_device)
+        temb = torch.randn(1, 128).to(torch_device)
+        resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="fir", down=True).to(torch_device)
+        with torch.no_grad():
+            output_tensor = resnet_block(sample, temb)
+
+        assert output_tensor.shape == (1, 32, 32, 32)
+        output_slice = output_tensor[0, -1, -3:, -3:]
+        expected_slice = torch.tensor(
+            [-0.0934, -0.5729, 0.0909, -0.2710, -0.5044, 0.0243, -0.0665, -0.5267, -0.3136], device=torch_device
+        )
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_restnet_with_kernel_sde_vp(self):
+        torch.manual_seed(0)
+        sample = torch.randn(1, 32, 64, 64).to(torch_device)
+        temb = torch.randn(1, 128).to(torch_device)
+        resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="sde_vp", down=True).to(torch_device)
+        with torch.no_grad():
+            output_tensor = resnet_block(sample, temb)
+
+        assert output_tensor.shape == (1, 32, 32, 32)
+        output_slice = output_tensor[0, -1, -3:, -3:]
+        expected_slice = torch.tensor(
+            [-0.3002, -0.7135, 0.1359, 0.0561, -0.7935, 0.0113, -0.1766, -0.6714, -0.0436], device=torch_device
+        )
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+
+class Transformer2DModelTests(unittest.TestCase):
+    def test_spatial_transformer_default(self):
+        torch.manual_seed(0)
+        backend_manual_seed(torch_device, 0)
+
+        sample = torch.randn(1, 32, 64, 64).to(torch_device)
+        spatial_transformer_block = Transformer2DModel(
+            in_channels=32,
+            num_attention_heads=1,
+            attention_head_dim=32,
+            dropout=0.0,
+            cross_attention_dim=None,
+        ).to(torch_device)
+        with torch.no_grad():
+            attention_scores = spatial_transformer_block(sample).sample
+
+        assert attention_scores.shape == (1, 32, 64, 64)
+        output_slice = attention_scores[0, -1, -3:, -3:]
+
+        expected_slice = torch.tensor(
+            [-1.9455, -0.0066, -1.3933, -1.5878, 0.5325, -0.6486, -1.8648, 0.7515, -0.9689], device=torch_device
+        )
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_spatial_transformer_cross_attention_dim(self):
+        torch.manual_seed(0)
+        backend_manual_seed(torch_device, 0)
+
+        sample = torch.randn(1, 64, 64, 64).to(torch_device)
+        spatial_transformer_block = Transformer2DModel(
+            in_channels=64,
+            num_attention_heads=2,
+            attention_head_dim=32,
+            dropout=0.0,
+            cross_attention_dim=64,
+        ).to(torch_device)
+        with torch.no_grad():
+            context = torch.randn(1, 4, 64).to(torch_device)
+            attention_scores = spatial_transformer_block(sample, context).sample
+
+        assert attention_scores.shape == (1, 64, 64, 64)
+        output_slice = attention_scores[0, -1, -3:, -3:]
+        expected_slice = torch.tensor(
+            [0.0143, -0.6909, -2.1547, -1.8893, 1.4097, 0.1359, -0.2521, -1.3359, 0.2598], device=torch_device
+        )
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_spatial_transformer_timestep(self):
+        torch.manual_seed(0)
+        backend_manual_seed(torch_device, 0)
+
+        num_embeds_ada_norm = 5
+
+        sample = torch.randn(1, 64, 64, 64).to(torch_device)
+        spatial_transformer_block = Transformer2DModel(
+            in_channels=64,
+            num_attention_heads=2,
+            attention_head_dim=32,
+            dropout=0.0,
+            cross_attention_dim=64,
+            num_embeds_ada_norm=num_embeds_ada_norm,
+        ).to(torch_device)
+        with torch.no_grad():
+            timestep_1 = torch.tensor(1, dtype=torch.long).to(torch_device)
+            timestep_2 = torch.tensor(2, dtype=torch.long).to(torch_device)
+            attention_scores_1 = spatial_transformer_block(sample, timestep=timestep_1).sample
+            attention_scores_2 = spatial_transformer_block(sample, timestep=timestep_2).sample
+
+        assert attention_scores_1.shape == (1, 64, 64, 64)
+        assert attention_scores_2.shape == (1, 64, 64, 64)
+
+        output_slice_1 = attention_scores_1[0, -1, -3:, -3:]
+        output_slice_2 = attention_scores_2[0, -1, -3:, -3:]
+
+        expected_slice = torch.tensor(
+            [-0.3923, -1.0923, -1.7144, -1.5570, 1.4154, 0.1738, -0.1157, -1.2998, -0.1703], device=torch_device
+        )
+        expected_slice_2 = torch.tensor(
+            [-0.4311, -1.1376, -1.7732, -1.5997, 1.3450, 0.0964, -0.1569, -1.3590, -0.2348], device=torch_device
+        )
+
+        assert torch.allclose(output_slice_1.flatten(), expected_slice, atol=1e-3)
+        assert torch.allclose(output_slice_2.flatten(), expected_slice_2, atol=1e-3)
+
+    def test_spatial_transformer_dropout(self):
+        torch.manual_seed(0)
+        backend_manual_seed(torch_device, 0)
+
+        sample = torch.randn(1, 32, 64, 64).to(torch_device)
+        spatial_transformer_block = (
+            Transformer2DModel(
+                in_channels=32,
+                num_attention_heads=2,
+                attention_head_dim=16,
+                dropout=0.3,
+                cross_attention_dim=None,
+            )
+            .to(torch_device)
+            .eval()
+        )
+        with torch.no_grad():
+            attention_scores = spatial_transformer_block(sample).sample
+
+        assert attention_scores.shape == (1, 32, 64, 64)
+        output_slice = attention_scores[0, -1, -3:, -3:]
+
+        expected_slice = torch.tensor(
+            [-1.9380, -0.0083, -1.3771, -1.5819, 0.5209, -0.6441, -1.8545, 0.7563, -0.9615], device=torch_device
+        )
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    @require_torch_accelerator_with_fp64
+    def test_spatial_transformer_discrete(self):
+        torch.manual_seed(0)
+        backend_manual_seed(torch_device, 0)
+
+        num_embed = 5
+
+        sample = torch.randint(0, num_embed, (1, 32)).to(torch_device)
+        spatial_transformer_block = (
+            Transformer2DModel(
+                num_attention_heads=1,
+                attention_head_dim=32,
+                num_vector_embeds=num_embed,
+                sample_size=16,
+            )
+            .to(torch_device)
+            .eval()
+        )
+
+        with torch.no_grad():
+            attention_scores = spatial_transformer_block(sample).sample
+
+        assert attention_scores.shape == (1, num_embed - 1, 32)
+
+        output_slice = attention_scores[0, -2:, -3:]
+
+        expected_slice = torch.tensor([-1.7648, -1.0241, -2.0985, -1.8035, -1.6404, -1.2098], device=torch_device)
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_spatial_transformer_default_norm_layers(self):
+        spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32)
+
+        assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == nn.LayerNorm
+        assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == nn.LayerNorm
+
+    def test_spatial_transformer_ada_norm_layers(self):
+        spatial_transformer_block = Transformer2DModel(
+            num_attention_heads=1,
+            attention_head_dim=32,
+            in_channels=32,
+            num_embeds_ada_norm=5,
+        )
+
+        assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == AdaLayerNorm
+        assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == nn.LayerNorm
+
+    def test_spatial_transformer_default_ff_layers(self):
+        spatial_transformer_block = Transformer2DModel(
+            num_attention_heads=1,
+            attention_head_dim=32,
+            in_channels=32,
+        )
+
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == GEGLU
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == nn.Dropout
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == nn.Linear
+
+        dim = 32
+        inner_dim = 128
+
+        # First dimension change
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.in_features == dim
+        # NOTE: inner_dim * 2 because GEGLU
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.out_features == inner_dim * 2
+
+        # Second dimension change
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].in_features == inner_dim
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].out_features == dim
+
+    def test_spatial_transformer_geglu_approx_ff_layers(self):
+        spatial_transformer_block = Transformer2DModel(
+            num_attention_heads=1,
+            attention_head_dim=32,
+            in_channels=32,
+            activation_fn="geglu-approximate",
+        )
+
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == ApproximateGELU
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == nn.Dropout
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == nn.Linear
+
+        dim = 32
+        inner_dim = 128
+
+        # First dimension change
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.in_features == dim
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.out_features == inner_dim
+
+        # Second dimension change
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].in_features == inner_dim
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].out_features == dim
+
+    def test_spatial_transformer_attention_bias(self):
+        spatial_transformer_block = Transformer2DModel(
+            num_attention_heads=1, attention_head_dim=32, in_channels=32, attention_bias=True
+        )
+
+        assert spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias is not None
+        assert spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias is not None
+        assert spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias is not None
diff --git a/diffusers/tests/models/test_modeling_common.py b/diffusers/tests/models/test_modeling_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..b56ac233ef2913aeebd9ad0f1efa1a214c08c40b
--- /dev/null
+++ b/diffusers/tests/models/test_modeling_common.py
@@ -0,0 +1,1069 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import json
+import os
+import tempfile
+import traceback
+import unittest
+import unittest.mock as mock
+import uuid
+from typing import Dict, List, Tuple
+
+import numpy as np
+import requests_mock
+import torch
+from accelerate.utils import compute_module_sizes
+from huggingface_hub import ModelCard, delete_repo
+from huggingface_hub.utils import is_jinja_available
+from requests.exceptions import HTTPError
+
+from diffusers.models import UNet2DConditionModel
+from diffusers.models.attention_processor import (
+    AttnProcessor,
+    AttnProcessor2_0,
+    AttnProcessorNPU,
+    XFormersAttnProcessor,
+)
+from diffusers.training_utils import EMAModel
+from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME, is_torch_npu_available, is_xformers_available, logging
+from diffusers.utils.hub_utils import _add_variant
+from diffusers.utils.testing_utils import (
+    CaptureLogger,
+    get_python_version,
+    is_torch_compile,
+    require_torch_2,
+    require_torch_accelerator_with_training,
+    require_torch_gpu,
+    require_torch_multi_gpu,
+    run_test_in_subprocess,
+    torch_device,
+)
+
+from ..others.test_utils import TOKEN, USER, is_staging_test
+
+
+def caculate_expected_num_shards(index_map_path):
+    with open(index_map_path) as f:
+        weight_map_dict = json.load(f)["weight_map"]
+    first_key = list(weight_map_dict.keys())[0]
+    weight_loc = weight_map_dict[first_key]  # e.g., diffusion_pytorch_model-00001-of-00002.safetensors
+    expected_num_shards = int(weight_loc.split("-")[-1].split(".")[0])
+    return expected_num_shards
+
+
+# Will be run via run_test_in_subprocess
+def _test_from_save_pretrained_dynamo(in_queue, out_queue, timeout):
+    error = None
+    try:
+        init_dict, model_class = in_queue.get(timeout=timeout)
+
+        model = model_class(**init_dict)
+        model.to(torch_device)
+        model = torch.compile(model)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, safe_serialization=False)
+            new_model = model_class.from_pretrained(tmpdirname)
+            new_model.to(torch_device)
+
+        assert new_model.__class__ == model_class
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+class ModelUtilsTest(unittest.TestCase):
+    def tearDown(self):
+        super().tearDown()
+
+    def test_accelerate_loading_error_message(self):
+        with self.assertRaises(ValueError) as error_context:
+            UNet2DConditionModel.from_pretrained("hf-internal-testing/stable-diffusion-broken", subfolder="unet")
+
+        # make sure that error message states what keys are missing
+        assert "conv_out.bias" in str(error_context.exception)
+
+    def test_cached_files_are_used_when_no_internet(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = {}
+        response_mock.raise_for_status.side_effect = HTTPError
+        response_mock.json.return_value = {}
+
+        # Download this model to make sure it's in the cache.
+        orig_model = UNet2DConditionModel.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet"
+        )
+
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("requests.request", return_value=response_mock):
+            # Download this model to make sure it's in the cache.
+            model = UNet2DConditionModel.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", local_files_only=True
+            )
+
+        for p1, p2 in zip(orig_model.parameters(), model.parameters()):
+            if p1.data.ne(p2.data).sum() > 0:
+                assert False, "Parameters not the same!"
+
+    @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners")
+    @unittest.skipIf(torch_device == "mps", reason="Test not supported for MPS.")
+    def test_one_request_upon_cached(self):
+        use_safetensors = False
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with requests_mock.mock(real_http=True) as m:
+                UNet2DConditionModel.from_pretrained(
+                    "hf-internal-testing/tiny-stable-diffusion-torch",
+                    subfolder="unet",
+                    cache_dir=tmpdirname,
+                    use_safetensors=use_safetensors,
+                )
+
+            download_requests = [r.method for r in m.request_history]
+            assert (
+                download_requests.count("HEAD") == 3
+            ), "3 HEAD requests one for config, one for model, and one for shard index file."
+            assert download_requests.count("GET") == 2, "2 GET requests one for config, one for model"
+
+            with requests_mock.mock(real_http=True) as m:
+                UNet2DConditionModel.from_pretrained(
+                    "hf-internal-testing/tiny-stable-diffusion-torch",
+                    subfolder="unet",
+                    cache_dir=tmpdirname,
+                    use_safetensors=use_safetensors,
+                )
+
+            cache_requests = [r.method for r in m.request_history]
+            assert (
+                "HEAD" == cache_requests[0] and len(cache_requests) == 2
+            ), "We should call only `model_info` to check for commit hash and  knowing if shard index is present."
+
+    def test_weight_overwrite(self):
+        with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(ValueError) as error_context:
+            UNet2DConditionModel.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch",
+                subfolder="unet",
+                cache_dir=tmpdirname,
+                in_channels=9,
+            )
+
+        # make sure that error message states what keys are missing
+        assert "Cannot load" in str(error_context.exception)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model = UNet2DConditionModel.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch",
+                subfolder="unet",
+                cache_dir=tmpdirname,
+                in_channels=9,
+                low_cpu_mem_usage=False,
+                ignore_mismatched_sizes=True,
+            )
+
+        assert model.config.in_channels == 9
+
+
+class UNetTesterMixin:
+    def test_forward_with_norm_groups(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["norm_num_groups"] = 16
+        init_dict["block_out_channels"] = (16, 32)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.to_tuple()[0]
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+
+class ModelTesterMixin:
+    main_input_name = None  # overwrite in model specific tester class
+    base_precision = 1e-3
+    forward_requires_fresh_args = False
+    model_split_percents = [0.5, 0.7, 0.9]
+    uses_custom_attn_processor = False
+
+    def check_device_map_is_respected(self, model, device_map):
+        for param_name, param in model.named_parameters():
+            # Find device in device_map
+            while len(param_name) > 0 and param_name not in device_map:
+                param_name = ".".join(param_name.split(".")[:-1])
+            if param_name not in device_map:
+                raise ValueError("device map is incomplete, it does not contain any device for `param_name`.")
+
+            param_device = device_map[param_name]
+            if param_device in ["cpu", "disk"]:
+                self.assertEqual(param.device, torch.device("meta"))
+            else:
+                self.assertEqual(param.device, torch.device(param_device))
+
+    def test_from_save_pretrained(self, expected_max_diff=5e-5):
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+
+        if hasattr(model, "set_default_attn_processor"):
+            model.set_default_attn_processor()
+        model.to(torch_device)
+        model.eval()
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, safe_serialization=False)
+            new_model = self.model_class.from_pretrained(tmpdirname)
+            if hasattr(new_model, "set_default_attn_processor"):
+                new_model.set_default_attn_processor()
+            new_model.to(torch_device)
+
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                image = model(**self.inputs_dict(0))
+            else:
+                image = model(**inputs_dict)
+
+            if isinstance(image, dict):
+                image = image.to_tuple()[0]
+
+            if self.forward_requires_fresh_args:
+                new_image = new_model(**self.inputs_dict(0))
+            else:
+                new_image = new_model(**inputs_dict)
+
+            if isinstance(new_image, dict):
+                new_image = new_image.to_tuple()[0]
+
+        max_diff = (image - new_image).abs().max().item()
+        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
+
+    def test_getattr_is_correct(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+
+        # save some things to test
+        model.dummy_attribute = 5
+        model.register_to_config(test_attribute=5)
+
+        logger = logging.get_logger("diffusers.models.modeling_utils")
+        # 30 for warning
+        logger.setLevel(30)
+        with CaptureLogger(logger) as cap_logger:
+            assert hasattr(model, "dummy_attribute")
+            assert getattr(model, "dummy_attribute") == 5
+            assert model.dummy_attribute == 5
+
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+        logger = logging.get_logger("diffusers.models.modeling_utils")
+        # 30 for warning
+        logger.setLevel(30)
+        with CaptureLogger(logger) as cap_logger:
+            assert hasattr(model, "save_pretrained")
+            fn = model.save_pretrained
+            fn_1 = getattr(model, "save_pretrained")
+
+            assert fn == fn_1
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+        # warning should be thrown
+        with self.assertWarns(FutureWarning):
+            assert model.test_attribute == 5
+
+        with self.assertWarns(FutureWarning):
+            assert getattr(model, "test_attribute") == 5
+
+        with self.assertRaises(AttributeError) as error:
+            model.does_not_exist
+
+        assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'"
+
+    @unittest.skipIf(
+        torch_device != "npu" or not is_torch_npu_available(),
+        reason="torch npu flash attention is only available with NPU and `torch_npu` installed",
+    )
+    def test_set_torch_npu_flash_attn_processor_determinism(self):
+        torch.use_deterministic_algorithms(False)
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        if not hasattr(model, "set_attn_processor"):
+            # If not has `set_attn_processor`, skip test
+            return
+
+        model.set_default_attn_processor()
+        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output = model(**self.inputs_dict(0))[0]
+            else:
+                output = model(**inputs_dict)[0]
+
+        model.enable_npu_flash_attention()
+        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_2 = model(**self.inputs_dict(0))[0]
+            else:
+                output_2 = model(**inputs_dict)[0]
+
+        model.set_attn_processor(AttnProcessorNPU())
+        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_3 = model(**self.inputs_dict(0))[0]
+            else:
+                output_3 = model(**inputs_dict)[0]
+
+        torch.use_deterministic_algorithms(True)
+
+        assert torch.allclose(output, output_2, atol=self.base_precision)
+        assert torch.allclose(output, output_3, atol=self.base_precision)
+        assert torch.allclose(output_2, output_3, atol=self.base_precision)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_set_xformers_attn_processor_for_determinism(self):
+        torch.use_deterministic_algorithms(False)
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        if not hasattr(model, "set_attn_processor"):
+            # If not has `set_attn_processor`, skip test
+            return
+
+        if not hasattr(model, "set_default_attn_processor"):
+            # If not has `set_attn_processor`, skip test
+            return
+
+        model.set_default_attn_processor()
+        assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output = model(**self.inputs_dict(0))[0]
+            else:
+                output = model(**inputs_dict)[0]
+
+        model.enable_xformers_memory_efficient_attention()
+        assert all(type(proc) == XFormersAttnProcessor for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_2 = model(**self.inputs_dict(0))[0]
+            else:
+                output_2 = model(**inputs_dict)[0]
+
+        model.set_attn_processor(XFormersAttnProcessor())
+        assert all(type(proc) == XFormersAttnProcessor for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_3 = model(**self.inputs_dict(0))[0]
+            else:
+                output_3 = model(**inputs_dict)[0]
+
+        torch.use_deterministic_algorithms(True)
+
+        assert torch.allclose(output, output_2, atol=self.base_precision)
+        assert torch.allclose(output, output_3, atol=self.base_precision)
+        assert torch.allclose(output_2, output_3, atol=self.base_precision)
+
+    @require_torch_gpu
+    def test_set_attn_processor_for_determinism(self):
+        if self.uses_custom_attn_processor:
+            return
+
+        torch.use_deterministic_algorithms(False)
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+
+        model.to(torch_device)
+
+        if not hasattr(model, "set_attn_processor"):
+            # If not has `set_attn_processor`, skip test
+            return
+
+        assert all(type(proc) == AttnProcessor2_0 for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_1 = model(**self.inputs_dict(0))[0]
+            else:
+                output_1 = model(**inputs_dict)[0]
+
+        model.set_default_attn_processor()
+        assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_2 = model(**self.inputs_dict(0))[0]
+            else:
+                output_2 = model(**inputs_dict)[0]
+
+        model.set_attn_processor(AttnProcessor2_0())
+        assert all(type(proc) == AttnProcessor2_0 for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_4 = model(**self.inputs_dict(0))[0]
+            else:
+                output_4 = model(**inputs_dict)[0]
+
+        model.set_attn_processor(AttnProcessor())
+        assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_5 = model(**self.inputs_dict(0))[0]
+            else:
+                output_5 = model(**inputs_dict)[0]
+
+        torch.use_deterministic_algorithms(True)
+
+        # make sure that outputs match
+        assert torch.allclose(output_2, output_1, atol=self.base_precision)
+        assert torch.allclose(output_2, output_4, atol=self.base_precision)
+        assert torch.allclose(output_2, output_5, atol=self.base_precision)
+
+    def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+
+        if hasattr(model, "set_default_attn_processor"):
+            model.set_default_attn_processor()
+
+        model.to(torch_device)
+        model.eval()
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, variant="fp16", safe_serialization=False)
+            new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16")
+            if hasattr(new_model, "set_default_attn_processor"):
+                new_model.set_default_attn_processor()
+
+            # non-variant cannot be loaded
+            with self.assertRaises(OSError) as error_context:
+                self.model_class.from_pretrained(tmpdirname)
+
+            # make sure that error message states what keys are missing
+            assert "Error no file named diffusion_pytorch_model.bin found in directory" in str(error_context.exception)
+
+            new_model.to(torch_device)
+
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                image = model(**self.inputs_dict(0))
+            else:
+                image = model(**inputs_dict)
+            if isinstance(image, dict):
+                image = image.to_tuple()[0]
+
+            if self.forward_requires_fresh_args:
+                new_image = new_model(**self.inputs_dict(0))
+            else:
+                new_image = new_model(**inputs_dict)
+
+            if isinstance(new_image, dict):
+                new_image = new_image.to_tuple()[0]
+
+        max_diff = (image - new_image).abs().max().item()
+        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
+
+    @is_torch_compile
+    @require_torch_2
+    @unittest.skipIf(
+        get_python_version == (3, 12),
+        reason="Torch Dynamo isn't yet supported for Python 3.12.",
+    )
+    def test_from_save_pretrained_dynamo(self):
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        inputs = [init_dict, self.model_class]
+        run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=inputs)
+
+    def test_from_save_pretrained_dtype(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            if torch_device == "mps" and dtype == torch.bfloat16:
+                continue
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.to(dtype)
+                model.save_pretrained(tmpdirname, safe_serialization=False)
+                new_model = self.model_class.from_pretrained(tmpdirname, low_cpu_mem_usage=True, torch_dtype=dtype)
+                assert new_model.dtype == dtype
+                new_model = self.model_class.from_pretrained(tmpdirname, low_cpu_mem_usage=False, torch_dtype=dtype)
+                assert new_model.dtype == dtype
+
+    def test_determinism(self, expected_max_diff=1e-5):
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                first = model(**self.inputs_dict(0))
+            else:
+                first = model(**inputs_dict)
+            if isinstance(first, dict):
+                first = first.to_tuple()[0]
+
+            if self.forward_requires_fresh_args:
+                second = model(**self.inputs_dict(0))
+            else:
+                second = model(**inputs_dict)
+            if isinstance(second, dict):
+                second = second.to_tuple()[0]
+
+        out_1 = first.cpu().numpy()
+        out_2 = second.cpu().numpy()
+        out_1 = out_1[~np.isnan(out_1)]
+        out_2 = out_2[~np.isnan(out_2)]
+        max_diff = np.amax(np.abs(out_1 - out_2))
+        self.assertLessEqual(max_diff, expected_max_diff)
+
+    def test_output(self, expected_output_shape=None):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.to_tuple()[0]
+
+        self.assertIsNotNone(output)
+
+        # input & output have to have the same shape
+        input_tensor = inputs_dict[self.main_input_name]
+
+        if expected_output_shape is None:
+            expected_shape = input_tensor.shape
+            self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+        else:
+            self.assertEqual(output.shape, expected_output_shape, "Input and output shapes do not match")
+
+    def test_model_from_pretrained(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        # test if the model can be loaded from the config
+        # and has all the expected shape
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, safe_serialization=False)
+            new_model = self.model_class.from_pretrained(tmpdirname)
+            new_model.to(torch_device)
+            new_model.eval()
+
+        # check if all parameters shape are the same
+        for param_name in model.state_dict().keys():
+            param_1 = model.state_dict()[param_name]
+            param_2 = new_model.state_dict()[param_name]
+            self.assertEqual(param_1.shape, param_2.shape)
+
+        with torch.no_grad():
+            output_1 = model(**inputs_dict)
+
+            if isinstance(output_1, dict):
+                output_1 = output_1.to_tuple()[0]
+
+            output_2 = new_model(**inputs_dict)
+
+            if isinstance(output_2, dict):
+                output_2 = output_2.to_tuple()[0]
+
+        self.assertEqual(output_1.shape, output_2.shape)
+
+    @require_torch_accelerator_with_training
+    def test_training(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.train()
+        output = model(**inputs_dict)
+
+        if isinstance(output, dict):
+            output = output.to_tuple()[0]
+
+        input_tensor = inputs_dict[self.main_input_name]
+        noise = torch.randn((input_tensor.shape[0],) + self.output_shape).to(torch_device)
+        loss = torch.nn.functional.mse_loss(output, noise)
+        loss.backward()
+
+    @require_torch_accelerator_with_training
+    def test_ema_training(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.train()
+        ema_model = EMAModel(model.parameters())
+
+        output = model(**inputs_dict)
+
+        if isinstance(output, dict):
+            output = output.to_tuple()[0]
+
+        input_tensor = inputs_dict[self.main_input_name]
+        noise = torch.randn((input_tensor.shape[0],) + self.output_shape).to(torch_device)
+        loss = torch.nn.functional.mse_loss(output, noise)
+        loss.backward()
+        ema_model.step(model.parameters())
+
+    def test_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            # Temporary fallback until `aten::_index_put_impl_` is implemented in mps
+            # Track progress in https://github.com/pytorch/pytorch/issues/77764
+            device = t.device
+            if device.type == "mps":
+                t = t.to("cpu")
+            t[t != t] = 0
+            return t.to(device)
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    torch.allclose(
+                        set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
+                    ),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
+                    ),
+                )
+
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                outputs_dict = model(**self.inputs_dict(0))
+                outputs_tuple = model(**self.inputs_dict(0), return_dict=False)
+            else:
+                outputs_dict = model(**inputs_dict)
+                outputs_tuple = model(**inputs_dict, return_dict=False)
+
+        recursive_check(outputs_tuple, outputs_dict)
+
+    @require_torch_accelerator_with_training
+    def test_enable_disable_gradient_checkpointing(self):
+        if not self.model_class._supports_gradient_checkpointing:
+            return  # Skip test if model does not support gradient checkpointing
+
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+
+        # at init model should have gradient checkpointing disabled
+        model = self.model_class(**init_dict)
+        self.assertFalse(model.is_gradient_checkpointing)
+
+        # check enable works
+        model.enable_gradient_checkpointing()
+        self.assertTrue(model.is_gradient_checkpointing)
+
+        # check disable works
+        model.disable_gradient_checkpointing()
+        self.assertFalse(model.is_gradient_checkpointing)
+
+    def test_deprecated_kwargs(self):
+        has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters
+        has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0
+
+        if has_kwarg_in_model_class and not has_deprecated_kwarg:
+            raise ValueError(
+                f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs"
+                " under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are"
+                " no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
+                " [<deprecated_argument>]`"
+            )
+
+        if not has_kwarg_in_model_class and has_deprecated_kwarg:
+            raise ValueError(
+                f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs"
+                " under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs` argument to"
+                f" {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument"
+                " from `_deprecated_kwargs = [<deprecated_argument>]`"
+            )
+
+    @require_torch_gpu
+    def test_cpu_offload(self):
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        if model._no_split_modules is None:
+            return
+
+        model = model.to(torch_device)
+
+        torch.manual_seed(0)
+        base_output = model(**inputs_dict)
+
+        model_size = compute_module_sizes(model)[""]
+        # We test several splits of sizes to make sure it works.
+        max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir)
+
+            for max_size in max_gpu_sizes:
+                max_memory = {0: max_size, "cpu": model_size * 2}
+                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+                # Making sure part of the model will actually end up offloaded
+                self.assertSetEqual(set(new_model.hf_device_map.values()), {0, "cpu"})
+
+                self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+                torch.manual_seed(0)
+                new_output = new_model(**inputs_dict)
+
+                self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+    @require_torch_gpu
+    def test_disk_offload_without_safetensors(self):
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        if model._no_split_modules is None:
+            return
+
+        model = model.to(torch_device)
+
+        torch.manual_seed(0)
+        base_output = model(**inputs_dict)
+
+        model_size = compute_module_sizes(model)[""]
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir, safe_serialization=False)
+
+            with self.assertRaises(ValueError):
+                max_size = int(self.model_split_percents[0] * model_size)
+                max_memory = {0: max_size, "cpu": max_size}
+                # This errors out because it's missing an offload folder
+                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+
+            max_size = int(self.model_split_percents[0] * model_size)
+            max_memory = {0: max_size, "cpu": max_size}
+            new_model = self.model_class.from_pretrained(
+                tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir
+            )
+
+            self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+            torch.manual_seed(0)
+            new_output = new_model(**inputs_dict)
+
+            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+    @require_torch_gpu
+    def test_disk_offload_with_safetensors(self):
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        if model._no_split_modules is None:
+            return
+
+        model = model.to(torch_device)
+
+        torch.manual_seed(0)
+        base_output = model(**inputs_dict)
+
+        model_size = compute_module_sizes(model)[""]
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir)
+
+            max_size = int(self.model_split_percents[0] * model_size)
+            max_memory = {0: max_size, "cpu": max_size}
+            new_model = self.model_class.from_pretrained(
+                tmp_dir, device_map="auto", offload_folder=tmp_dir, max_memory=max_memory
+            )
+
+            self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+            torch.manual_seed(0)
+            new_output = new_model(**inputs_dict)
+
+            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+    @require_torch_multi_gpu
+    def test_model_parallelism(self):
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        if model._no_split_modules is None:
+            return
+
+        model = model.to(torch_device)
+
+        torch.manual_seed(0)
+        base_output = model(**inputs_dict)
+
+        model_size = compute_module_sizes(model)[""]
+        # We test several splits of sizes to make sure it works.
+        max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir)
+
+            for max_size in max_gpu_sizes:
+                max_memory = {0: max_size, 1: model_size * 2, "cpu": model_size * 2}
+                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+                # Making sure part of the model will actually end up offloaded
+                self.assertSetEqual(set(new_model.hf_device_map.values()), {0, 1})
+
+                self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+
+                torch.manual_seed(0)
+                new_output = new_model(**inputs_dict)
+
+                self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+    @require_torch_gpu
+    def test_sharded_checkpoints(self):
+        torch.manual_seed(0)
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        model = model.to(torch_device)
+
+        base_output = model(**inputs_dict)
+
+        model_size = compute_module_sizes(model)[""]
+        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            # Now check if the right number of shards exists. First, let's get the number of shards.
+            # Since this number can be dependent on the model being tested, it's important that we calculate it
+            # instead of hardcoding it.
+            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))
+            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
+            self.assertTrue(actual_num_shards == expected_num_shards)
+
+            new_model = self.model_class.from_pretrained(tmp_dir).eval()
+            new_model = new_model.to(torch_device)
+
+            torch.manual_seed(0)
+            if "generator" in inputs_dict:
+                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            new_output = new_model(**inputs_dict)
+
+            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+    @require_torch_gpu
+    def test_sharded_checkpoints_with_variant(self):
+        torch.manual_seed(0)
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        model = model.to(torch_device)
+
+        base_output = model(**inputs_dict)
+
+        model_size = compute_module_sizes(model)[""]
+        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
+        variant = "fp16"
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # It doesn't matter if the actual model is in fp16 or not. Just adding the variant and
+            # testing if loading works with the variant when the checkpoint is sharded should be
+            # enough.
+            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB", variant=variant)
+            index_filename = _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, index_filename)))
+
+            # Now check if the right number of shards exists. First, let's get the number of shards.
+            # Since this number can be dependent on the model being tested, it's important that we calculate it
+            # instead of hardcoding it.
+            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, index_filename))
+            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
+            self.assertTrue(actual_num_shards == expected_num_shards)
+
+            new_model = self.model_class.from_pretrained(tmp_dir, variant=variant).eval()
+            new_model = new_model.to(torch_device)
+
+            torch.manual_seed(0)
+            if "generator" in inputs_dict:
+                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            new_output = new_model(**inputs_dict)
+
+            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+    @require_torch_gpu
+    def test_sharded_checkpoints_device_map(self):
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        if model._no_split_modules is None:
+            return
+        model = model.to(torch_device)
+
+        torch.manual_seed(0)
+        base_output = model(**inputs_dict)
+
+        model_size = compute_module_sizes(model)[""]
+        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            # Now check if the right number of shards exists. First, let's get the number of shards.
+            # Since this number can be dependent on the model being tested, it's important that we calculate it
+            # instead of hardcoding it.
+            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))
+            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
+            self.assertTrue(actual_num_shards == expected_num_shards)
+
+            new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto")
+
+            torch.manual_seed(0)
+            if "generator" in inputs_dict:
+                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            new_output = new_model(**inputs_dict)
+            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+
+@is_staging_test
+class ModelPushToHubTester(unittest.TestCase):
+    identifier = uuid.uuid4()
+    repo_id = f"test-model-{identifier}"
+    org_repo_id = f"valid_org/{repo_id}-org"
+
+    def test_push_to_hub(self):
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        model.push_to_hub(self.repo_id, token=TOKEN)
+
+        new_model = UNet2DConditionModel.from_pretrained(f"{USER}/{self.repo_id}")
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # Reset repo
+        delete_repo(token=TOKEN, repo_id=self.repo_id)
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, repo_id=self.repo_id, push_to_hub=True, token=TOKEN)
+
+        new_model = UNet2DConditionModel.from_pretrained(f"{USER}/{self.repo_id}")
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # Reset repo
+        delete_repo(self.repo_id, token=TOKEN)
+
+    def test_push_to_hub_in_organization(self):
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        model.push_to_hub(self.org_repo_id, token=TOKEN)
+
+        new_model = UNet2DConditionModel.from_pretrained(self.org_repo_id)
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # Reset repo
+        delete_repo(token=TOKEN, repo_id=self.org_repo_id)
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, push_to_hub=True, token=TOKEN, repo_id=self.org_repo_id)
+
+        new_model = UNet2DConditionModel.from_pretrained(self.org_repo_id)
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # Reset repo
+        delete_repo(self.org_repo_id, token=TOKEN)
+
+    @unittest.skipIf(
+        not is_jinja_available(),
+        reason="Model card tests cannot be performed without Jinja installed.",
+    )
+    def test_push_to_hub_library_name(self):
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        model.push_to_hub(self.repo_id, token=TOKEN)
+
+        model_card = ModelCard.load(f"{USER}/{self.repo_id}", token=TOKEN).data
+        assert model_card.library_name == "diffusers"
+
+        # Reset repo
+        delete_repo(self.repo_id, token=TOKEN)
diff --git a/diffusers/tests/models/test_modeling_common_flax.py b/diffusers/tests/models/test_modeling_common_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..8945aed7c93fb1e664c7b6d799f7e0a96525b1a2
--- /dev/null
+++ b/diffusers/tests/models/test_modeling_common_flax.py
@@ -0,0 +1,66 @@
+import inspect
+
+from diffusers.utils import is_flax_available
+from diffusers.utils.testing_utils import require_flax
+
+
+if is_flax_available():
+    import jax
+
+
+@require_flax
+class FlaxModelTesterMixin:
+    def test_output(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
+        variables = model.init(inputs_dict["prng_key"], inputs_dict["sample"])
+        jax.lax.stop_gradient(variables)
+
+        output = model.apply(variables, inputs_dict["sample"])
+
+        if isinstance(output, dict):
+            output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_forward_with_norm_groups(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["norm_num_groups"] = 16
+        init_dict["block_out_channels"] = (16, 32)
+
+        model = self.model_class(**init_dict)
+        variables = model.init(inputs_dict["prng_key"], inputs_dict["sample"])
+        jax.lax.stop_gradient(variables)
+
+        output = model.apply(variables, inputs_dict["sample"])
+
+        if isinstance(output, dict):
+            output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_deprecated_kwargs(self):
+        has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters
+        has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0
+
+        if has_kwarg_in_model_class and not has_deprecated_kwarg:
+            raise ValueError(
+                f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs"
+                " under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are"
+                " no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
+                " [<deprecated_argument>]`"
+            )
+
+        if not has_kwarg_in_model_class and has_deprecated_kwarg:
+            raise ValueError(
+                f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs"
+                " under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs` argument to"
+                f" {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument"
+                " from `_deprecated_kwargs = [<deprecated_argument>]`"
+            )
diff --git a/diffusers/tests/models/transformers/__init__.py b/diffusers/tests/models/transformers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/models/transformers/test_models_dit_transformer2d.py b/diffusers/tests/models/transformers/test_models_dit_transformer2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..b12cae1a88793b974fac3b6c568104e84fb30f43
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_dit_transformer2d.py
@@ -0,0 +1,95 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import DiTTransformer2DModel, Transformer2DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    slow,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class DiTTransformer2DModelTests(ModelTesterMixin, unittest.TestCase):
+    model_class = DiTTransformer2DModel
+    main_input_name = "hidden_states"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        in_channels = 4
+        sample_size = 8
+        scheduler_num_train_steps = 1000
+        num_class_labels = 4
+
+        hidden_states = floats_tensor((batch_size, in_channels, sample_size, sample_size)).to(torch_device)
+        timesteps = torch.randint(0, scheduler_num_train_steps, size=(batch_size,)).to(torch_device)
+        class_label_ids = torch.randint(0, num_class_labels, size=(batch_size,)).to(torch_device)
+
+        return {"hidden_states": hidden_states, "timestep": timesteps, "class_labels": class_label_ids}
+
+    @property
+    def input_shape(self):
+        return (4, 8, 8)
+
+    @property
+    def output_shape(self):
+        return (8, 8, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 4,
+            "out_channels": 8,
+            "activation_fn": "gelu-approximate",
+            "num_attention_heads": 2,
+            "attention_head_dim": 4,
+            "attention_bias": True,
+            "num_layers": 1,
+            "norm_type": "ada_norm_zero",
+            "num_embeds_ada_norm": 8,
+            "patch_size": 2,
+            "sample_size": 8,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        super().test_output(
+            expected_output_shape=(self.dummy_input[self.main_input_name].shape[0],) + self.output_shape
+        )
+
+    def test_correct_class_remapping_from_dict_config(self):
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = Transformer2DModel.from_config(init_dict)
+        assert isinstance(model, DiTTransformer2DModel)
+
+    def test_correct_class_remapping_from_pretrained_config(self):
+        config = DiTTransformer2DModel.load_config("facebook/DiT-XL-2-256", subfolder="transformer")
+        model = Transformer2DModel.from_config(config)
+        assert isinstance(model, DiTTransformer2DModel)
+
+    @slow
+    def test_correct_class_remapping(self):
+        model = Transformer2DModel.from_pretrained("facebook/DiT-XL-2-256", subfolder="transformer")
+        assert isinstance(model, DiTTransformer2DModel)
diff --git a/diffusers/tests/models/transformers/test_models_pixart_transformer2d.py b/diffusers/tests/models/transformers/test_models_pixart_transformer2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..30293f5d35cbd1fb1ba596e1491e48347db031fd
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_pixart_transformer2d.py
@@ -0,0 +1,108 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import PixArtTransformer2DModel, Transformer2DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    slow,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class PixArtTransformer2DModelTests(ModelTesterMixin, unittest.TestCase):
+    model_class = PixArtTransformer2DModel
+    main_input_name = "hidden_states"
+    # We override the items here because the transformer under consideration is small.
+    model_split_percents = [0.7, 0.6, 0.6]
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        in_channels = 4
+        sample_size = 8
+        scheduler_num_train_steps = 1000
+        cross_attention_dim = 8
+        seq_len = 8
+
+        hidden_states = floats_tensor((batch_size, in_channels, sample_size, sample_size)).to(torch_device)
+        timesteps = torch.randint(0, scheduler_num_train_steps, size=(batch_size,)).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, seq_len, cross_attention_dim)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": timesteps,
+            "encoder_hidden_states": encoder_hidden_states,
+            "added_cond_kwargs": {"aspect_ratio": None, "resolution": None},
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 8, 8)
+
+    @property
+    def output_shape(self):
+        return (8, 8, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "sample_size": 8,
+            "num_layers": 1,
+            "patch_size": 2,
+            "attention_head_dim": 2,
+            "num_attention_heads": 2,
+            "in_channels": 4,
+            "cross_attention_dim": 8,
+            "out_channels": 8,
+            "attention_bias": True,
+            "activation_fn": "gelu-approximate",
+            "num_embeds_ada_norm": 8,
+            "norm_type": "ada_norm_single",
+            "norm_elementwise_affine": False,
+            "norm_eps": 1e-6,
+            "use_additional_conditions": False,
+            "caption_channels": None,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        super().test_output(
+            expected_output_shape=(self.dummy_input[self.main_input_name].shape[0],) + self.output_shape
+        )
+
+    def test_correct_class_remapping_from_dict_config(self):
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = Transformer2DModel.from_config(init_dict)
+        assert isinstance(model, PixArtTransformer2DModel)
+
+    def test_correct_class_remapping_from_pretrained_config(self):
+        config = PixArtTransformer2DModel.load_config("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="transformer")
+        model = Transformer2DModel.from_config(config)
+        assert isinstance(model, PixArtTransformer2DModel)
+
+    @slow
+    def test_correct_class_remapping(self):
+        model = Transformer2DModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="transformer")
+        assert isinstance(model, PixArtTransformer2DModel)
diff --git a/diffusers/tests/models/transformers/test_models_prior.py b/diffusers/tests/models/transformers/test_models_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2ed10dfa1f661bb43e99b5c6ec59f50c06819cb
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_prior.py
@@ -0,0 +1,188 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import unittest
+
+import torch
+from parameterized import parameterized
+
+from diffusers import PriorTransformer
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    floats_tensor,
+    slow,
+    torch_all_close,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class PriorTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = PriorTransformer
+    main_input_name = "hidden_states"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        embedding_dim = 8
+        num_embeddings = 7
+
+        hidden_states = floats_tensor((batch_size, embedding_dim)).to(torch_device)
+
+        proj_embedding = floats_tensor((batch_size, embedding_dim)).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, num_embeddings, embedding_dim)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": 2,
+            "proj_embedding": proj_embedding,
+            "encoder_hidden_states": encoder_hidden_states,
+        }
+
+    def get_dummy_seed_input(self, seed=0):
+        torch.manual_seed(seed)
+        batch_size = 4
+        embedding_dim = 8
+        num_embeddings = 7
+
+        hidden_states = torch.randn((batch_size, embedding_dim)).to(torch_device)
+
+        proj_embedding = torch.randn((batch_size, embedding_dim)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, num_embeddings, embedding_dim)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": 2,
+            "proj_embedding": proj_embedding,
+            "encoder_hidden_states": encoder_hidden_states,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 8)
+
+    @property
+    def output_shape(self):
+        return (4, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "num_attention_heads": 2,
+            "attention_head_dim": 4,
+            "num_layers": 2,
+            "embedding_dim": 8,
+            "num_embeddings": 7,
+            "additional_embeddings": 4,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_from_pretrained_hub(self):
+        model, loading_info = PriorTransformer.from_pretrained(
+            "hf-internal-testing/prior-dummy", output_loading_info=True
+        )
+        self.assertIsNotNone(model)
+        self.assertEqual(len(loading_info["missing_keys"]), 0)
+
+        model.to(torch_device)
+        hidden_states = model(**self.dummy_input)[0]
+
+        assert hidden_states is not None, "Make sure output is not None"
+
+    def test_forward_signature(self):
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
+        signature = inspect.signature(model.forward)
+        # signature.parameters is an OrderedDict => so arg_names order is deterministic
+        arg_names = [*signature.parameters.keys()]
+
+        expected_arg_names = ["hidden_states", "timestep"]
+        self.assertListEqual(arg_names[:2], expected_arg_names)
+
+    def test_output_pretrained(self):
+        model = PriorTransformer.from_pretrained("hf-internal-testing/prior-dummy")
+        model = model.to(torch_device)
+
+        if hasattr(model, "set_default_attn_processor"):
+            model.set_default_attn_processor()
+
+        input = self.get_dummy_seed_input()
+
+        with torch.no_grad():
+            output = model(**input)[0]
+
+        output_slice = output[0, :5].flatten().cpu()
+        print(output_slice)
+
+        # Since the VAE Gaussian prior's generator is seeded on the appropriate device,
+        # the expected output slices are not the same for CPU and GPU.
+        expected_output_slice = torch.tensor([-1.3436, -0.2870, 0.7538, 0.4368, -0.0239])
+        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
+
+
+@slow
+class PriorTransformerIntegrationTests(unittest.TestCase):
+    def get_dummy_seed_input(self, batch_size=1, embedding_dim=768, num_embeddings=77, seed=0):
+        torch.manual_seed(seed)
+
+        hidden_states = torch.randn((batch_size, embedding_dim)).to(torch_device)
+
+        proj_embedding = torch.randn((batch_size, embedding_dim)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, num_embeddings, embedding_dim)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": 2,
+            "proj_embedding": proj_embedding,
+            "encoder_hidden_states": encoder_hidden_states,
+        }
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [13, [-0.5861, 0.1283, -0.0931, 0.0882, 0.4476, 0.1329, -0.0498, 0.0640]],
+            [37, [-0.4913, 0.0110, -0.0483, 0.0541, 0.4954, -0.0170, 0.0354, 0.1651]],
+            # fmt: on
+        ]
+    )
+    def test_kandinsky_prior(self, seed, expected_slice):
+        model = PriorTransformer.from_pretrained("kandinsky-community/kandinsky-2-1-prior", subfolder="prior")
+        model.to(torch_device)
+        input = self.get_dummy_seed_input(seed=seed)
+
+        with torch.no_grad():
+            sample = model(**input)[0]
+
+        assert list(sample.shape) == [1, 768]
+
+        output_slice = sample[0, :8].flatten().cpu()
+        print(output_slice)
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
diff --git a/diffusers/tests/models/transformers/test_models_transformer_aura_flow.py b/diffusers/tests/models/transformers/test_models_transformer_aura_flow.py
new file mode 100644
index 0000000000000000000000000000000000000000..376d8b57da4dde04e980fa9f291ca4f94315a565
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_transformer_aura_flow.py
@@ -0,0 +1,79 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import AuraFlowTransformer2DModel
+from diffusers.utils.testing_utils import enable_full_determinism, torch_device
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class AuraFlowTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = AuraFlowTransformer2DModel
+    main_input_name = "hidden_states"
+    # We override the items here because the transformer under consideration is small.
+    model_split_percents = [0.7, 0.6, 0.6]
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        height = width = embedding_dim = 32
+        sequence_length = 256
+
+        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "sample_size": 32,
+            "patch_size": 2,
+            "in_channels": 4,
+            "num_mmdit_layers": 1,
+            "num_single_dit_layers": 1,
+            "attention_head_dim": 8,
+            "num_attention_heads": 4,
+            "caption_projection_dim": 32,
+            "joint_attention_dim": 32,
+            "out_channels": 4,
+            "pos_embed_max_size": 256,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    @unittest.skip("AuraFlowTransformer2DModel uses its own dedicated attention processor. This test does not apply")
+    def test_set_attn_processor_for_determinism(self):
+        pass
diff --git a/diffusers/tests/models/transformers/test_models_transformer_cogvideox.py b/diffusers/tests/models/transformers/test_models_transformer_cogvideox.py
new file mode 100644
index 0000000000000000000000000000000000000000..6db4113cbd1b951036f45c2a0a201d5a61c33030
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_transformer_cogvideox.py
@@ -0,0 +1,83 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import CogVideoXTransformer3DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class CogVideoXTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = CogVideoXTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        num_frames = 1
+        height = 8
+        width = 8
+        embedding_dim = 8
+        sequence_length = 8
+
+        hidden_states = torch.randn((batch_size, num_frames, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
+
+    @property
+    def input_shape(self):
+        return (1, 4, 8, 8)
+
+    @property
+    def output_shape(self):
+        return (1, 4, 8, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings.
+            "num_attention_heads": 2,
+            "attention_head_dim": 8,
+            "in_channels": 4,
+            "out_channels": 4,
+            "time_embed_dim": 2,
+            "text_embed_dim": 8,
+            "num_layers": 1,
+            "sample_width": 8,
+            "sample_height": 8,
+            "sample_frames": 8,
+            "patch_size": 2,
+            "temporal_compression_ratio": 4,
+            "max_text_seq_length": 8,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
diff --git a/diffusers/tests/models/transformers/test_models_transformer_flux.py b/diffusers/tests/models/transformers/test_models_transformer_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf7a4f75707ad6e26af0e51593ed8af6f12ec9f
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_transformer_flux.py
@@ -0,0 +1,113 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import FluxTransformer2DModel
+from diffusers.utils.testing_utils import enable_full_determinism, torch_device
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class FluxTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = FluxTransformer2DModel
+    main_input_name = "hidden_states"
+    # We override the items here because the transformer under consideration is small.
+    model_split_percents = [0.7, 0.6, 0.6]
+
+    # Skip setting testing with default: AttnProcessor
+    uses_custom_attn_processor = True
+
+    @property
+    def dummy_input(self):
+        batch_size = 1
+        num_latent_channels = 4
+        num_image_channels = 3
+        height = width = 4
+        sequence_length = 48
+        embedding_dim = 32
+
+        hidden_states = torch.randn((batch_size, height * width, num_latent_channels)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        pooled_prompt_embeds = torch.randn((batch_size, embedding_dim)).to(torch_device)
+        text_ids = torch.randn((sequence_length, num_image_channels)).to(torch_device)
+        image_ids = torch.randn((height * width, num_image_channels)).to(torch_device)
+        timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "img_ids": image_ids,
+            "txt_ids": text_ids,
+            "pooled_projections": pooled_prompt_embeds,
+            "timestep": timestep,
+        }
+
+    @property
+    def input_shape(self):
+        return (16, 4)
+
+    @property
+    def output_shape(self):
+        return (16, 4)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "patch_size": 1,
+            "in_channels": 4,
+            "num_layers": 1,
+            "num_single_layers": 1,
+            "attention_head_dim": 16,
+            "num_attention_heads": 2,
+            "joint_attention_dim": 32,
+            "pooled_projection_dim": 32,
+            "axes_dims_rope": [4, 4, 8],
+        }
+
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_deprecated_inputs_img_txt_ids_3d(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output_1 = model(**inputs_dict).to_tuple()[0]
+
+        # update inputs_dict with txt_ids and img_ids as 3d tensors (deprecated)
+        text_ids_3d = inputs_dict["txt_ids"].unsqueeze(0)
+        image_ids_3d = inputs_dict["img_ids"].unsqueeze(0)
+
+        assert text_ids_3d.ndim == 3, "text_ids_3d should be a 3d tensor"
+        assert image_ids_3d.ndim == 3, "img_ids_3d should be a 3d tensor"
+
+        inputs_dict["txt_ids"] = text_ids_3d
+        inputs_dict["img_ids"] = image_ids_3d
+
+        with torch.no_grad():
+            output_2 = model(**inputs_dict).to_tuple()[0]
+
+        self.assertEqual(output_1.shape, output_2.shape)
+        self.assertTrue(
+            torch.allclose(output_1, output_2, atol=1e-5),
+            msg="output with deprecated inputs (img_ids and txt_ids as 3d torch tensors) are not equal as them as 2d inputs",
+        )
diff --git a/diffusers/tests/models/transformers/test_models_transformer_hunyuan_dit.py b/diffusers/tests/models/transformers/test_models_transformer_hunyuan_dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea05abed38d9f19218a422e43ec3f43879645d35
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_transformer_hunyuan_dit.py
@@ -0,0 +1,113 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import HunyuanDiT2DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class HunyuanDiTTests(ModelTesterMixin, unittest.TestCase):
+    model_class = HunyuanDiT2DModel
+    main_input_name = "hidden_states"
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        height = width = 8
+        embedding_dim = 8
+        sequence_length = 4
+        sequence_length_t5 = 4
+
+        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        text_embedding_mask = torch.ones(size=(batch_size, sequence_length)).to(torch_device)
+        encoder_hidden_states_t5 = torch.randn((batch_size, sequence_length_t5, embedding_dim)).to(torch_device)
+        text_embedding_mask_t5 = torch.ones(size=(batch_size, sequence_length_t5)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,), dtype=encoder_hidden_states.dtype).to(torch_device)
+
+        original_size = [1024, 1024]
+        target_size = [16, 16]
+        crops_coords_top_left = [0, 0]
+        add_time_ids = list(original_size + target_size + crops_coords_top_left)
+        add_time_ids = torch.tensor([add_time_ids, add_time_ids], dtype=encoder_hidden_states.dtype).to(torch_device)
+        style = torch.zeros(size=(batch_size,), dtype=int).to(torch_device)
+        image_rotary_emb = [
+            torch.ones(size=(1, 8), dtype=encoder_hidden_states.dtype),
+            torch.zeros(size=(1, 8), dtype=encoder_hidden_states.dtype),
+        ]
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "text_embedding_mask": text_embedding_mask,
+            "encoder_hidden_states_t5": encoder_hidden_states_t5,
+            "text_embedding_mask_t5": text_embedding_mask_t5,
+            "timestep": timestep,
+            "image_meta_size": add_time_ids,
+            "style": style,
+            "image_rotary_emb": image_rotary_emb,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 8, 8)
+
+    @property
+    def output_shape(self):
+        return (8, 8, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "sample_size": 8,
+            "patch_size": 2,
+            "in_channels": 4,
+            "num_layers": 1,
+            "attention_head_dim": 8,
+            "num_attention_heads": 2,
+            "cross_attention_dim": 8,
+            "cross_attention_dim_t5": 8,
+            "pooled_projection_dim": 4,
+            "hidden_size": 16,
+            "text_len": 4,
+            "text_len_t5": 4,
+            "activation_fn": "gelu-approximate",
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        super().test_output(
+            expected_output_shape=(self.dummy_input[self.main_input_name].shape[0],) + self.output_shape
+        )
+
+    @unittest.skip("HunyuanDIT use a custom processor HunyuanAttnProcessor2_0")
+    def test_set_xformers_attn_processor_for_determinism(self):
+        pass
+
+    @unittest.skip("HunyuanDIT use a custom processor HunyuanAttnProcessor2_0")
+    def test_set_attn_processor_for_determinism(self):
+        pass
diff --git a/diffusers/tests/models/transformers/test_models_transformer_latte.py b/diffusers/tests/models/transformers/test_models_transformer_latte.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fe0a6098045f9556a910db4b687a20b8d18e816
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_transformer_latte.py
@@ -0,0 +1,88 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import LatteTransformer3DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class LatteTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = LatteTransformer3DModel
+    main_input_name = "hidden_states"
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        num_frames = 1
+        height = width = 8
+        embedding_dim = 8
+        sequence_length = 8
+
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+            "enable_temporal_attentions": True,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 1, 8, 8)
+
+    @property
+    def output_shape(self):
+        return (8, 1, 8, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "sample_size": 8,
+            "num_layers": 1,
+            "patch_size": 2,
+            "attention_head_dim": 4,
+            "num_attention_heads": 2,
+            "caption_channels": 8,
+            "in_channels": 4,
+            "cross_attention_dim": 8,
+            "out_channels": 8,
+            "attention_bias": True,
+            "activation_fn": "gelu-approximate",
+            "num_embeds_ada_norm": 1000,
+            "norm_type": "ada_norm_single",
+            "norm_elementwise_affine": False,
+            "norm_eps": 1e-6,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        super().test_output(
+            expected_output_shape=(self.dummy_input[self.main_input_name].shape[0],) + self.output_shape
+        )
diff --git a/diffusers/tests/models/transformers/test_models_transformer_lumina.py b/diffusers/tests/models/transformers/test_models_transformer_lumina.py
new file mode 100644
index 0000000000000000000000000000000000000000..6744fb8ac84bc1ad80388ee8446e5b47874cfb53
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_transformer_lumina.py
@@ -0,0 +1,112 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import LuminaNextDiT2DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class LuminaNextDiT2DModelTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = LuminaNextDiT2DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+
+    @property
+    def dummy_input(self):
+        """
+        Args:
+            None
+        Returns:
+            Dict: Dictionary of dummy input tensors
+        """
+        batch_size = 2  # N
+        num_channels = 4  # C
+        height = width = 16  # H, W
+        embedding_dim = 32  # D
+        sequence_length = 16  # L
+
+        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.rand(size=(batch_size,)).to(torch_device)
+        encoder_mask = torch.randn(size=(batch_size, sequence_length)).to(torch_device)
+        image_rotary_emb = torch.randn((384, 384, 4)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+            "encoder_mask": encoder_mask,
+            "image_rotary_emb": image_rotary_emb,
+            "cross_attention_kwargs": {},
+        }
+
+    @property
+    def input_shape(self):
+        """
+        Args:
+            None
+        Returns:
+            Tuple: (int, int, int)
+        """
+        return (4, 16, 16)
+
+    @property
+    def output_shape(self):
+        """
+        Args:
+            None
+        Returns:
+            Tuple: (int, int, int)
+        """
+        return (4, 16, 16)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        """
+        Args:
+            None
+
+        Returns:
+            Tuple: (Dict, Dict)
+        """
+        init_dict = {
+            "sample_size": 16,
+            "patch_size": 2,
+            "in_channels": 4,
+            "hidden_size": 24,
+            "num_layers": 2,
+            "num_attention_heads": 3,
+            "num_kv_heads": 1,
+            "multiple_of": 16,
+            "ffn_dim_multiplier": None,
+            "norm_eps": 1e-5,
+            "learn_sigma": False,
+            "qk_norm": True,
+            "cross_attention_dim": 32,
+            "scaling_factor": 1.0,
+        }
+
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
diff --git a/diffusers/tests/models/transformers/test_models_transformer_sd3.py b/diffusers/tests/models/transformers/test_models_transformer_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b9084327289a7220b3ea4ffc8c405cf35513d24
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_transformer_sd3.py
@@ -0,0 +1,82 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import SD3Transformer2DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = SD3Transformer2DModel
+    main_input_name = "hidden_states"
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        height = width = embedding_dim = 32
+        pooled_embedding_dim = embedding_dim * 2
+        sequence_length = 154
+
+        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        pooled_prompt_embeds = torch.randn((batch_size, pooled_embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "pooled_projections": pooled_prompt_embeds,
+            "timestep": timestep,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "sample_size": 32,
+            "patch_size": 1,
+            "in_channels": 4,
+            "num_layers": 1,
+            "attention_head_dim": 8,
+            "num_attention_heads": 4,
+            "caption_projection_dim": 32,
+            "joint_attention_dim": 32,
+            "pooled_projection_dim": 64,
+            "out_channels": 4,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    @unittest.skip("SD3Transformer2DModel uses a dedicated attention processor. This test doesn't apply")
+    def test_set_attn_processor_for_determinism(self):
+        pass
diff --git a/diffusers/tests/models/transformers/test_models_transformer_temporal.py b/diffusers/tests/models/transformers/test_models_transformer_temporal.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b689447cf29ce78159a32bd321e8b185c7ebd44
--- /dev/null
+++ b/diffusers/tests/models/transformers/test_models_transformer_temporal.py
@@ -0,0 +1,67 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers.models.transformers import TransformerTemporalModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class TemporalTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = TransformerTemporalModel
+    main_input_name = "hidden_states"
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        height = width = 32
+
+        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "num_attention_heads": 8,
+            "attention_head_dim": 4,
+            "in_channels": 4,
+            "num_layers": 1,
+            "norm_num_groups": 1,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
diff --git a/diffusers/tests/models/unets/__init__.py b/diffusers/tests/models/unets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/models/unets/test_models_unet_1d.py b/diffusers/tests/models/unets/test_models_unet_1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f7ef3bca0853092aedbb7b12989770449a1830a
--- /dev/null
+++ b/diffusers/tests/models/unets/test_models_unet_1d.py
@@ -0,0 +1,270 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import UNet1DModel
+from diffusers.utils.testing_utils import (
+    backend_manual_seed,
+    floats_tensor,
+    slow,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+
+class UNet1DModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNet1DModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_features = 14
+        seq_len = 16
+
+        noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device)
+        time_step = torch.tensor([10] * batch_size).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step}
+
+    @property
+    def input_shape(self):
+        return (4, 14, 16)
+
+    @property
+    def output_shape(self):
+        return (4, 14, 16)
+
+    def test_ema_training(self):
+        pass
+
+    def test_training(self):
+        pass
+
+    def test_determinism(self):
+        super().test_determinism()
+
+    def test_outputs_equivalence(self):
+        super().test_outputs_equivalence()
+
+    def test_from_save_pretrained(self):
+        super().test_from_save_pretrained()
+
+    def test_from_save_pretrained_variant(self):
+        super().test_from_save_pretrained_variant()
+
+    def test_model_from_pretrained(self):
+        super().test_model_from_pretrained()
+
+    def test_output(self):
+        super().test_output()
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (8, 8, 16, 16),
+            "in_channels": 14,
+            "out_channels": 14,
+            "time_embedding_type": "positional",
+            "use_timestep_embedding": True,
+            "flip_sin_to_cos": False,
+            "freq_shift": 1.0,
+            "out_block_type": "OutConv1DBlock",
+            "mid_block_type": "MidResTemporalBlock1D",
+            "down_block_types": ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"),
+            "up_block_types": ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"),
+            "act_fn": "swish",
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_from_pretrained_hub(self):
+        model, loading_info = UNet1DModel.from_pretrained(
+            "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="unet"
+        )
+        self.assertIsNotNone(model)
+        self.assertEqual(len(loading_info["missing_keys"]), 0)
+
+        model.to(torch_device)
+        image = model(**self.dummy_input)
+
+        assert image is not None, "Make sure output is not None"
+
+    def test_output_pretrained(self):
+        model = UNet1DModel.from_pretrained("bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet")
+        torch.manual_seed(0)
+        backend_manual_seed(torch_device, 0)
+
+        num_features = model.config.in_channels
+        seq_len = 16
+        noise = torch.randn((1, seq_len, num_features)).permute(
+            0, 2, 1
+        )  # match original, we can update values and remove
+        time_step = torch.full((num_features,), 0)
+
+        with torch.no_grad():
+            output = model(noise, time_step).sample.permute(0, 2, 1)
+
+        output_slice = output[0, -3:, -3:].flatten()
+        # fmt: off
+        expected_output_slice = torch.tensor([-2.137172, 1.1426016, 0.3688687, -0.766922, 0.7303146, 0.11038864, -0.4760633, 0.13270172, 0.02591348])
+        # fmt: on
+        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-3))
+
+    def test_forward_with_norm_groups(self):
+        # Not implemented yet for this UNet
+        pass
+
+    @slow
+    def test_unet_1d_maestro(self):
+        model_id = "harmonai/maestro-150k"
+        model = UNet1DModel.from_pretrained(model_id, subfolder="unet")
+        model.to(torch_device)
+
+        sample_size = 65536
+        noise = torch.sin(torch.arange(sample_size)[None, None, :].repeat(1, 2, 1)).to(torch_device)
+        timestep = torch.tensor([1]).to(torch_device)
+
+        with torch.no_grad():
+            output = model(noise, timestep).sample
+
+        output_sum = output.abs().sum()
+        output_max = output.abs().max()
+
+        assert (output_sum - 224.0896).abs() < 0.5
+        assert (output_max - 0.0607).abs() < 4e-4
+
+
+class UNetRLModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNet1DModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_features = 14
+        seq_len = 16
+
+        noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device)
+        time_step = torch.tensor([10] * batch_size).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step}
+
+    @property
+    def input_shape(self):
+        return (4, 14, 16)
+
+    @property
+    def output_shape(self):
+        return (4, 14, 1)
+
+    def test_determinism(self):
+        super().test_determinism()
+
+    def test_outputs_equivalence(self):
+        super().test_outputs_equivalence()
+
+    def test_from_save_pretrained(self):
+        super().test_from_save_pretrained()
+
+    def test_from_save_pretrained_variant(self):
+        super().test_from_save_pretrained_variant()
+
+    def test_model_from_pretrained(self):
+        super().test_model_from_pretrained()
+
+    def test_output(self):
+        # UNetRL is a value-function is different output shape
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = torch.Size((inputs_dict["sample"].shape[0], 1))
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_ema_training(self):
+        pass
+
+    def test_training(self):
+        pass
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 14,
+            "out_channels": 14,
+            "down_block_types": ["DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"],
+            "up_block_types": [],
+            "out_block_type": "ValueFunction",
+            "mid_block_type": "ValueFunctionMidBlock1D",
+            "block_out_channels": [32, 64, 128, 256],
+            "layers_per_block": 1,
+            "downsample_each_block": True,
+            "use_timestep_embedding": True,
+            "freq_shift": 1.0,
+            "flip_sin_to_cos": False,
+            "time_embedding_type": "positional",
+            "act_fn": "mish",
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_from_pretrained_hub(self):
+        value_function, vf_loading_info = UNet1DModel.from_pretrained(
+            "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function"
+        )
+        self.assertIsNotNone(value_function)
+        self.assertEqual(len(vf_loading_info["missing_keys"]), 0)
+
+        value_function.to(torch_device)
+        image = value_function(**self.dummy_input)
+
+        assert image is not None, "Make sure output is not None"
+
+    def test_output_pretrained(self):
+        value_function, vf_loading_info = UNet1DModel.from_pretrained(
+            "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function"
+        )
+        torch.manual_seed(0)
+        backend_manual_seed(torch_device, 0)
+
+        num_features = value_function.config.in_channels
+        seq_len = 14
+        noise = torch.randn((1, seq_len, num_features)).permute(
+            0, 2, 1
+        )  # match original, we can update values and remove
+        time_step = torch.full((num_features,), 0)
+
+        with torch.no_grad():
+            output = value_function(noise, time_step).sample
+
+        # fmt: off
+        expected_output_slice = torch.tensor([165.25] * seq_len)
+        # fmt: on
+        self.assertTrue(torch.allclose(output, expected_output_slice, rtol=1e-3))
+
+    def test_forward_with_norm_groups(self):
+        # Not implemented yet for this UNet
+        pass
diff --git a/diffusers/tests/models/unets/test_models_unet_2d.py b/diffusers/tests/models/unets/test_models_unet_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f827f274224b3dbf61f50c40bad9fde4b85f5ae
--- /dev/null
+++ b/diffusers/tests/models/unets/test_models_unet_2d.py
@@ -0,0 +1,331 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import math
+import unittest
+
+import torch
+
+from diffusers import UNet2DModel
+from diffusers.utils import logging
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    require_torch_accelerator,
+    slow,
+    torch_all_close,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+
+logger = logging.get_logger(__name__)
+
+enable_full_determinism()
+
+
+class Unet2DModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNet2DModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 3
+        sizes = (32, 32)
+
+        noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor([10]).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step}
+
+    @property
+    def input_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (3, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (4, 8),
+            "norm_num_groups": 2,
+            "down_block_types": ("DownBlock2D", "AttnDownBlock2D"),
+            "up_block_types": ("AttnUpBlock2D", "UpBlock2D"),
+            "attention_head_dim": 3,
+            "out_channels": 3,
+            "in_channels": 3,
+            "layers_per_block": 2,
+            "sample_size": 32,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_mid_block_attn_groups(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["add_attention"] = True
+        init_dict["attn_norm_num_groups"] = 4
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        self.assertIsNotNone(
+            model.mid_block.attentions[0].group_norm, "Mid block Attention group norm should exist but does not."
+        )
+        self.assertEqual(
+            model.mid_block.attentions[0].group_norm.num_groups,
+            init_dict["attn_norm_num_groups"],
+            "Mid block Attention group norm does not have the expected number of groups.",
+        )
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.to_tuple()[0]
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+
+class UNetLDMModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNet2DModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 4
+        sizes = (32, 32)
+
+        noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor([10]).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step}
+
+    @property
+    def input_shape(self):
+        return (4, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "sample_size": 32,
+            "in_channels": 4,
+            "out_channels": 4,
+            "layers_per_block": 2,
+            "block_out_channels": (32, 64),
+            "attention_head_dim": 32,
+            "down_block_types": ("DownBlock2D", "DownBlock2D"),
+            "up_block_types": ("UpBlock2D", "UpBlock2D"),
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_from_pretrained_hub(self):
+        model, loading_info = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
+
+        self.assertIsNotNone(model)
+        self.assertEqual(len(loading_info["missing_keys"]), 0)
+
+        model.to(torch_device)
+        image = model(**self.dummy_input).sample
+
+        assert image is not None, "Make sure output is not None"
+
+    @require_torch_accelerator
+    def test_from_pretrained_accelerate(self):
+        model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
+        model.to(torch_device)
+        image = model(**self.dummy_input).sample
+
+        assert image is not None, "Make sure output is not None"
+
+    @require_torch_accelerator
+    def test_from_pretrained_accelerate_wont_change_results(self):
+        # by default model loading will use accelerate as `low_cpu_mem_usage=True`
+        model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
+        model_accelerate.to(torch_device)
+        model_accelerate.eval()
+
+        noise = torch.randn(
+            1,
+            model_accelerate.config.in_channels,
+            model_accelerate.config.sample_size,
+            model_accelerate.config.sample_size,
+            generator=torch.manual_seed(0),
+        )
+        noise = noise.to(torch_device)
+        time_step = torch.tensor([10] * noise.shape[0]).to(torch_device)
+
+        arr_accelerate = model_accelerate(noise, time_step)["sample"]
+
+        # two models don't need to stay in the device at the same time
+        del model_accelerate
+        torch.cuda.empty_cache()
+        gc.collect()
+
+        model_normal_load, _ = UNet2DModel.from_pretrained(
+            "fusing/unet-ldm-dummy-update", output_loading_info=True, low_cpu_mem_usage=False
+        )
+        model_normal_load.to(torch_device)
+        model_normal_load.eval()
+        arr_normal_load = model_normal_load(noise, time_step)["sample"]
+
+        assert torch_all_close(arr_accelerate, arr_normal_load, rtol=1e-3)
+
+    def test_output_pretrained(self):
+        model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update")
+        model.eval()
+        model.to(torch_device)
+
+        noise = torch.randn(
+            1,
+            model.config.in_channels,
+            model.config.sample_size,
+            model.config.sample_size,
+            generator=torch.manual_seed(0),
+        )
+        noise = noise.to(torch_device)
+        time_step = torch.tensor([10] * noise.shape[0]).to(torch_device)
+
+        with torch.no_grad():
+            output = model(noise, time_step).sample
+
+        output_slice = output[0, -1, -3:, -3:].flatten().cpu()
+        # fmt: off
+        expected_output_slice = torch.tensor([-13.3258, -20.1100, -15.9873, -17.6617, -23.0596, -17.9419, -13.3675, -16.1889, -12.3800])
+        # fmt: on
+
+        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-3))
+
+
+class NCSNppModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNet2DModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self, sizes=(32, 32)):
+        batch_size = 4
+        num_channels = 3
+
+        noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor(batch_size * [10]).to(dtype=torch.int32, device=torch_device)
+
+        return {"sample": noise, "timestep": time_step}
+
+    @property
+    def input_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (3, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": [32, 64, 64, 64],
+            "in_channels": 3,
+            "layers_per_block": 1,
+            "out_channels": 3,
+            "time_embedding_type": "fourier",
+            "norm_eps": 1e-6,
+            "mid_block_scale_factor": math.sqrt(2.0),
+            "norm_num_groups": None,
+            "down_block_types": [
+                "SkipDownBlock2D",
+                "AttnSkipDownBlock2D",
+                "SkipDownBlock2D",
+                "SkipDownBlock2D",
+            ],
+            "up_block_types": [
+                "SkipUpBlock2D",
+                "SkipUpBlock2D",
+                "AttnSkipUpBlock2D",
+                "SkipUpBlock2D",
+            ],
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    @slow
+    def test_from_pretrained_hub(self):
+        model, loading_info = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", output_loading_info=True)
+        self.assertIsNotNone(model)
+        self.assertEqual(len(loading_info["missing_keys"]), 0)
+
+        model.to(torch_device)
+        inputs = self.dummy_input
+        noise = floats_tensor((4, 3) + (256, 256)).to(torch_device)
+        inputs["sample"] = noise
+        image = model(**inputs)
+
+        assert image is not None, "Make sure output is not None"
+
+    @slow
+    def test_output_pretrained_ve_mid(self):
+        model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256")
+        model.to(torch_device)
+
+        batch_size = 4
+        num_channels = 3
+        sizes = (256, 256)
+
+        noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor(batch_size * [1e-4]).to(torch_device)
+
+        with torch.no_grad():
+            output = model(noise, time_step).sample
+
+        output_slice = output[0, -3:, -3:, -1].flatten().cpu()
+        # fmt: off
+        expected_output_slice = torch.tensor([-4836.2178, -6487.1470, -3816.8196, -7964.9302, -10966.3037, -20043.5957, 8137.0513, 2340.3328, 544.6056])
+        # fmt: on
+
+        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
+
+    def test_output_pretrained_ve_large(self):
+        model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update")
+        model.to(torch_device)
+
+        batch_size = 4
+        num_channels = 3
+        sizes = (32, 32)
+
+        noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor(batch_size * [1e-4]).to(torch_device)
+
+        with torch.no_grad():
+            output = model(noise, time_step).sample
+
+        output_slice = output[0, -3:, -3:, -1].flatten().cpu()
+        # fmt: off
+        expected_output_slice = torch.tensor([-0.0325, -0.0900, -0.0869, -0.0332, -0.0725, -0.0270, -0.0101, 0.0227, 0.0256])
+        # fmt: on
+
+        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
+
+    def test_forward_with_norm_groups(self):
+        # not required for this model
+        pass
diff --git a/diffusers/tests/models/unets/test_models_unet_2d_condition.py b/diffusers/tests/models/unets/test_models_unet_2d_condition.py
new file mode 100644
index 0000000000000000000000000000000000000000..f354950b6075ffb2d8e38364195fc30bf6289b03
--- /dev/null
+++ b/diffusers/tests/models/unets/test_models_unet_2d_condition.py
@@ -0,0 +1,1506 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import gc
+import os
+import tempfile
+import unittest
+from collections import OrderedDict
+
+import torch
+from huggingface_hub import snapshot_download
+from parameterized import parameterized
+from pytest import mark
+
+from diffusers import UNet2DConditionModel
+from diffusers.models.attention_processor import (
+    CustomDiffusionAttnProcessor,
+    IPAdapterAttnProcessor,
+    IPAdapterAttnProcessor2_0,
+)
+from diffusers.models.embeddings import ImageProjection, IPAdapterFaceIDImageProjection, IPAdapterPlusImageProjection
+from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    floats_tensor,
+    is_peft_available,
+    load_hf_numpy,
+    require_peft_backend,
+    require_torch_accelerator,
+    require_torch_accelerator_with_fp16,
+    require_torch_accelerator_with_training,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_all_close,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+
+if is_peft_available():
+    from peft import LoraConfig
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+
+logger = logging.get_logger(__name__)
+
+enable_full_determinism()
+
+
+def get_unet_lora_config():
+    rank = 4
+    unet_lora_config = LoraConfig(
+        r=rank,
+        lora_alpha=rank,
+        target_modules=["to_q", "to_k", "to_v", "to_out.0"],
+        init_lora_weights=False,
+        use_dora=False,
+    )
+    return unet_lora_config
+
+
+def check_if_lora_correctly_set(model) -> bool:
+    """
+    Checks if the LoRA layers are correctly set with peft
+    """
+    for module in model.modules():
+        if isinstance(module, BaseTunerLayer):
+            return True
+    return False
+
+
+def create_ip_adapter_state_dict(model):
+    # "ip_adapter" (cross-attention weights)
+    ip_cross_attn_state_dict = {}
+    key_id = 1
+
+    for name in model.attn_processors.keys():
+        cross_attention_dim = (
+            None if name.endswith("attn1.processor") or "motion_module" in name else model.config.cross_attention_dim
+        )
+
+        if name.startswith("mid_block"):
+            hidden_size = model.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(model.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = model.config.block_out_channels[block_id]
+
+        if cross_attention_dim is not None:
+            sd = IPAdapterAttnProcessor(
+                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0
+            ).state_dict()
+            ip_cross_attn_state_dict.update(
+                {
+                    f"{key_id}.to_k_ip.weight": sd["to_k_ip.0.weight"],
+                    f"{key_id}.to_v_ip.weight": sd["to_v_ip.0.weight"],
+                }
+            )
+
+            key_id += 2
+
+    # "image_proj" (ImageProjection layer weights)
+    cross_attention_dim = model.config["cross_attention_dim"]
+    image_projection = ImageProjection(
+        cross_attention_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, num_image_text_embeds=4
+    )
+
+    ip_image_projection_state_dict = {}
+    sd = image_projection.state_dict()
+    ip_image_projection_state_dict.update(
+        {
+            "proj.weight": sd["image_embeds.weight"],
+            "proj.bias": sd["image_embeds.bias"],
+            "norm.weight": sd["norm.weight"],
+            "norm.bias": sd["norm.bias"],
+        }
+    )
+
+    del sd
+    ip_state_dict = {}
+    ip_state_dict.update({"image_proj": ip_image_projection_state_dict, "ip_adapter": ip_cross_attn_state_dict})
+    return ip_state_dict
+
+
+def create_ip_adapter_plus_state_dict(model):
+    # "ip_adapter" (cross-attention weights)
+    ip_cross_attn_state_dict = {}
+    key_id = 1
+
+    for name in model.attn_processors.keys():
+        cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = model.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(model.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = model.config.block_out_channels[block_id]
+        if cross_attention_dim is not None:
+            sd = IPAdapterAttnProcessor(
+                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0
+            ).state_dict()
+            ip_cross_attn_state_dict.update(
+                {
+                    f"{key_id}.to_k_ip.weight": sd["to_k_ip.0.weight"],
+                    f"{key_id}.to_v_ip.weight": sd["to_v_ip.0.weight"],
+                }
+            )
+
+            key_id += 2
+
+    # "image_proj" (ImageProjection layer weights)
+    cross_attention_dim = model.config["cross_attention_dim"]
+    image_projection = IPAdapterPlusImageProjection(
+        embed_dims=cross_attention_dim, output_dims=cross_attention_dim, dim_head=32, heads=2, num_queries=4
+    )
+
+    ip_image_projection_state_dict = OrderedDict()
+    keys = [k for k in image_projection.state_dict() if "layers." in k]
+    print(keys)
+    for k, v in image_projection.state_dict().items():
+        if "2.to" in k:
+            k = k.replace("2.to", "0.to")
+        elif "layers.0.ln0" in k:
+            k = k.replace("layers.0.ln0", "layers.0.0.norm1")
+        elif "layers.0.ln1" in k:
+            k = k.replace("layers.0.ln1", "layers.0.0.norm2")
+        elif "layers.1.ln0" in k:
+            k = k.replace("layers.1.ln0", "layers.1.0.norm1")
+        elif "layers.1.ln1" in k:
+            k = k.replace("layers.1.ln1", "layers.1.0.norm2")
+        elif "layers.2.ln0" in k:
+            k = k.replace("layers.2.ln0", "layers.2.0.norm1")
+        elif "layers.2.ln1" in k:
+            k = k.replace("layers.2.ln1", "layers.2.0.norm2")
+        elif "layers.3.ln0" in k:
+            k = k.replace("layers.3.ln0", "layers.3.0.norm1")
+        elif "layers.3.ln1" in k:
+            k = k.replace("layers.3.ln1", "layers.3.0.norm2")
+        elif "to_q" in k:
+            parts = k.split(".")
+            parts[2] = "attn"
+            k = ".".join(parts)
+        elif "to_out.0" in k:
+            parts = k.split(".")
+            parts[2] = "attn"
+            k = ".".join(parts)
+            k = k.replace("to_out.0", "to_out")
+        else:
+            k = k.replace("0.ff.0", "0.1.0")
+            k = k.replace("0.ff.1.net.0.proj", "0.1.1")
+            k = k.replace("0.ff.1.net.2", "0.1.3")
+
+            k = k.replace("1.ff.0", "1.1.0")
+            k = k.replace("1.ff.1.net.0.proj", "1.1.1")
+            k = k.replace("1.ff.1.net.2", "1.1.3")
+
+            k = k.replace("2.ff.0", "2.1.0")
+            k = k.replace("2.ff.1.net.0.proj", "2.1.1")
+            k = k.replace("2.ff.1.net.2", "2.1.3")
+
+            k = k.replace("3.ff.0", "3.1.0")
+            k = k.replace("3.ff.1.net.0.proj", "3.1.1")
+            k = k.replace("3.ff.1.net.2", "3.1.3")
+
+        # if "norm_cross" in k:
+        #     ip_image_projection_state_dict[k.replace("norm_cross", "norm1")] = v
+        # elif "layer_norm" in k:
+        #     ip_image_projection_state_dict[k.replace("layer_norm", "norm2")] = v
+        if "to_k" in k:
+            parts = k.split(".")
+            parts[2] = "attn"
+            k = ".".join(parts)
+            ip_image_projection_state_dict[k.replace("to_k", "to_kv")] = torch.cat([v, v], dim=0)
+        elif "to_v" in k:
+            continue
+        else:
+            ip_image_projection_state_dict[k] = v
+
+    ip_state_dict = {}
+    ip_state_dict.update({"image_proj": ip_image_projection_state_dict, "ip_adapter": ip_cross_attn_state_dict})
+    return ip_state_dict
+
+
+def create_ip_adapter_faceid_state_dict(model):
+    # "ip_adapter" (cross-attention weights)
+    # no LoRA weights
+    ip_cross_attn_state_dict = {}
+    key_id = 1
+
+    for name in model.attn_processors.keys():
+        cross_attention_dim = (
+            None if name.endswith("attn1.processor") or "motion_module" in name else model.config.cross_attention_dim
+        )
+
+        if name.startswith("mid_block"):
+            hidden_size = model.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(model.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = model.config.block_out_channels[block_id]
+
+        if cross_attention_dim is not None:
+            sd = IPAdapterAttnProcessor(
+                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0
+            ).state_dict()
+            ip_cross_attn_state_dict.update(
+                {
+                    f"{key_id}.to_k_ip.weight": sd["to_k_ip.0.weight"],
+                    f"{key_id}.to_v_ip.weight": sd["to_v_ip.0.weight"],
+                }
+            )
+
+            key_id += 2
+
+    # "image_proj" (ImageProjection layer weights)
+    cross_attention_dim = model.config["cross_attention_dim"]
+    image_projection = IPAdapterFaceIDImageProjection(
+        cross_attention_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, mult=2, num_tokens=4
+    )
+
+    ip_image_projection_state_dict = {}
+    sd = image_projection.state_dict()
+    ip_image_projection_state_dict.update(
+        {
+            "proj.0.weight": sd["ff.net.0.proj.weight"],
+            "proj.0.bias": sd["ff.net.0.proj.bias"],
+            "proj.2.weight": sd["ff.net.2.weight"],
+            "proj.2.bias": sd["ff.net.2.bias"],
+            "norm.weight": sd["norm.weight"],
+            "norm.bias": sd["norm.bias"],
+        }
+    )
+
+    del sd
+    ip_state_dict = {}
+    ip_state_dict.update({"image_proj": ip_image_projection_state_dict, "ip_adapter": ip_cross_attn_state_dict})
+    return ip_state_dict
+
+
+def create_custom_diffusion_layers(model, mock_weights: bool = True):
+    train_kv = True
+    train_q_out = True
+    custom_diffusion_attn_procs = {}
+
+    st = model.state_dict()
+    for name, _ in model.attn_processors.items():
+        cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = model.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(model.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = model.config.block_out_channels[block_id]
+        layer_name = name.split(".processor")[0]
+        weights = {
+            "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"],
+            "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"],
+        }
+        if train_q_out:
+            weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"]
+            weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"]
+            weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"]
+        if cross_attention_dim is not None:
+            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
+                train_kv=train_kv,
+                train_q_out=train_q_out,
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+            ).to(model.device)
+            custom_diffusion_attn_procs[name].load_state_dict(weights)
+            if mock_weights:
+                # add 1 to weights to mock trained weights
+                with torch.no_grad():
+                    custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight += 1
+                    custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight += 1
+        else:
+            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
+                train_kv=False,
+                train_q_out=False,
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+            )
+    del st
+    return custom_diffusion_attn_procs
+
+
+class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNet2DConditionModel
+    main_input_name = "sample"
+    # We override the items here because the unet under consideration is small.
+    model_split_percents = [0.5, 0.3, 0.4]
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 4
+        sizes = (16, 16)
+
+        noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor([10]).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, 4, 8)).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
+
+    @property
+    def input_shape(self):
+        return (4, 16, 16)
+
+    @property
+    def output_shape(self):
+        return (4, 16, 16)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (4, 8),
+            "norm_num_groups": 4,
+            "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"),
+            "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"),
+            "cross_attention_dim": 8,
+            "attention_head_dim": 2,
+            "out_channels": 4,
+            "in_channels": 4,
+            "layers_per_block": 1,
+            "sample_size": 16,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_enable_works(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+
+        model.enable_xformers_memory_efficient_attention()
+
+        assert (
+            model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__
+            == "XFormersAttnProcessor"
+        ), "xformers is not enabled"
+
+    @require_torch_accelerator_with_training
+    def test_gradient_checkpointing(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        assert not model.is_gradient_checkpointing and model.training
+
+        out = model(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model.zero_grad()
+
+        labels = torch.randn_like(out)
+        loss = (out - labels).mean()
+        loss.backward()
+
+        # re-instantiate the model now enabling gradient checkpointing
+        model_2 = self.model_class(**init_dict)
+        # clone model
+        model_2.load_state_dict(model.state_dict())
+        model_2.to(torch_device)
+        model_2.enable_gradient_checkpointing()
+
+        assert model_2.is_gradient_checkpointing and model_2.training
+
+        out_2 = model_2(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model_2.zero_grad()
+        loss_2 = (out_2 - labels).mean()
+        loss_2.backward()
+
+        # compare the output and parameters gradients
+        self.assertTrue((loss - loss_2).abs() < 1e-5)
+        named_params = dict(model.named_parameters())
+        named_params_2 = dict(model_2.named_parameters())
+        for name, param in named_params.items():
+            self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
+
+    def test_model_with_attention_head_dim_tuple(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_model_with_use_linear_projection(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["use_linear_projection"] = True
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_model_with_cross_attention_dim_tuple(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["cross_attention_dim"] = (8, 8)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_model_with_simple_projection(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        batch_size, _, _, sample_size = inputs_dict["sample"].shape
+
+        init_dict["class_embed_type"] = "simple_projection"
+        init_dict["projection_class_embeddings_input_dim"] = sample_size
+
+        inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_model_with_class_embeddings_concat(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        batch_size, _, _, sample_size = inputs_dict["sample"].shape
+
+        init_dict["class_embed_type"] = "simple_projection"
+        init_dict["projection_class_embeddings_input_dim"] = sample_size
+        init_dict["class_embeddings_concat"] = True
+
+        inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_model_attention_slicing(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        model.set_attention_slice("auto")
+        with torch.no_grad():
+            output = model(**inputs_dict)
+        assert output is not None
+
+        model.set_attention_slice("max")
+        with torch.no_grad():
+            output = model(**inputs_dict)
+        assert output is not None
+
+        model.set_attention_slice(2)
+        with torch.no_grad():
+            output = model(**inputs_dict)
+        assert output is not None
+
+    def test_model_sliceable_head_dim(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+
+        def check_sliceable_dim_attr(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                assert isinstance(module.sliceable_head_dim, int)
+
+            for child in module.children():
+                check_sliceable_dim_attr(child)
+
+        # retrieve number of attention layers
+        for module in model.children():
+            check_sliceable_dim_attr(module)
+
+    def test_gradient_checkpointing_is_applied(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model_class_copy = copy.copy(self.model_class)
+
+        modules_with_gc_enabled = {}
+
+        # now monkey patch the following function:
+        #     def _set_gradient_checkpointing(self, module, value=False):
+        #         if hasattr(module, "gradient_checkpointing"):
+        #             module.gradient_checkpointing = value
+
+        def _set_gradient_checkpointing_new(self, module, value=False):
+            if hasattr(module, "gradient_checkpointing"):
+                module.gradient_checkpointing = value
+                modules_with_gc_enabled[module.__class__.__name__] = True
+
+        model_class_copy._set_gradient_checkpointing = _set_gradient_checkpointing_new
+
+        model = model_class_copy(**init_dict)
+        model.enable_gradient_checkpointing()
+
+        EXPECTED_SET = {
+            "CrossAttnUpBlock2D",
+            "CrossAttnDownBlock2D",
+            "UNetMidBlock2DCrossAttn",
+            "UpBlock2D",
+            "Transformer2DModel",
+            "DownBlock2D",
+        }
+
+        assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET
+        assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
+
+    def test_special_attn_proc(self):
+        class AttnEasyProc(torch.nn.Module):
+            def __init__(self, num):
+                super().__init__()
+                self.weight = torch.nn.Parameter(torch.tensor(num))
+                self.is_run = False
+                self.number = 0
+                self.counter = 0
+
+            def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, number=None):
+                batch_size, sequence_length, _ = hidden_states.shape
+                attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+                query = attn.to_q(hidden_states)
+
+                encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
+                key = attn.to_k(encoder_hidden_states)
+                value = attn.to_v(encoder_hidden_states)
+
+                query = attn.head_to_batch_dim(query)
+                key = attn.head_to_batch_dim(key)
+                value = attn.head_to_batch_dim(value)
+
+                attention_probs = attn.get_attention_scores(query, key, attention_mask)
+                hidden_states = torch.bmm(attention_probs, value)
+                hidden_states = attn.batch_to_head_dim(hidden_states)
+
+                # linear proj
+                hidden_states = attn.to_out[0](hidden_states)
+                # dropout
+                hidden_states = attn.to_out[1](hidden_states)
+
+                hidden_states += self.weight
+
+                self.is_run = True
+                self.counter += 1
+                self.number = number
+
+                return hidden_states
+
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        processor = AttnEasyProc(5.0)
+
+        model.set_attn_processor(processor)
+        model(**inputs_dict, cross_attention_kwargs={"number": 123}).sample
+
+        assert processor.counter == 8
+        assert processor.is_run
+        assert processor.number == 123
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [torch.bool],
+            [torch.long],
+            [torch.float],
+            # fmt: on
+        ]
+    )
+    def test_model_xattn_mask(self, mask_dtype):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**{**init_dict, "attention_head_dim": (8, 16), "block_out_channels": (16, 32)})
+        model.to(torch_device)
+        model.eval()
+
+        cond = inputs_dict["encoder_hidden_states"]
+        with torch.no_grad():
+            full_cond_out = model(**inputs_dict).sample
+            assert full_cond_out is not None
+
+            keepall_mask = torch.ones(*cond.shape[:-1], device=cond.device, dtype=mask_dtype)
+            full_cond_keepallmask_out = model(**{**inputs_dict, "encoder_attention_mask": keepall_mask}).sample
+            assert full_cond_keepallmask_out.allclose(
+                full_cond_out, rtol=1e-05, atol=1e-05
+            ), "a 'keep all' mask should give the same result as no mask"
+
+            trunc_cond = cond[:, :-1, :]
+            trunc_cond_out = model(**{**inputs_dict, "encoder_hidden_states": trunc_cond}).sample
+            assert not trunc_cond_out.allclose(
+                full_cond_out, rtol=1e-05, atol=1e-05
+            ), "discarding the last token from our cond should change the result"
+
+            batch, tokens, _ = cond.shape
+            mask_last = (torch.arange(tokens) < tokens - 1).expand(batch, -1).to(cond.device, mask_dtype)
+            masked_cond_out = model(**{**inputs_dict, "encoder_attention_mask": mask_last}).sample
+            assert masked_cond_out.allclose(
+                trunc_cond_out, rtol=1e-05, atol=1e-05
+            ), "masking the last token from our cond should be equivalent to truncating that token out of the condition"
+
+    # see diffusers.models.attention_processor::Attention#prepare_attention_mask
+    # note: we may not need to fix mask padding to work for stable-diffusion cross-attn masks.
+    # since the use-case (somebody passes in a too-short cross-attn mask) is pretty esoteric.
+    # maybe it's fine that this only works for the unclip use-case.
+    @mark.skip(
+        reason="we currently pad mask by target_length tokens (what unclip needs), whereas stable-diffusion's cross-attn needs to instead pad by remaining_length."
+    )
+    def test_model_xattn_padding(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**{**init_dict, "attention_head_dim": (8, 16)})
+        model.to(torch_device)
+        model.eval()
+
+        cond = inputs_dict["encoder_hidden_states"]
+        with torch.no_grad():
+            full_cond_out = model(**inputs_dict).sample
+            assert full_cond_out is not None
+
+            batch, tokens, _ = cond.shape
+            keeplast_mask = (torch.arange(tokens) == tokens - 1).expand(batch, -1).to(cond.device, torch.bool)
+            keeplast_out = model(**{**inputs_dict, "encoder_attention_mask": keeplast_mask}).sample
+            assert not keeplast_out.allclose(full_cond_out), "a 'keep last token' mask should change the result"
+
+            trunc_mask = torch.zeros(batch, tokens - 1, device=cond.device, dtype=torch.bool)
+            trunc_mask_out = model(**{**inputs_dict, "encoder_attention_mask": trunc_mask}).sample
+            assert trunc_mask_out.allclose(
+                keeplast_out
+            ), "a mask with fewer tokens than condition, will be padded with 'keep' tokens. a 'discard-all' mask missing the final token is thus equivalent to a 'keep last' mask."
+
+    def test_custom_diffusion_processors(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        with torch.no_grad():
+            sample1 = model(**inputs_dict).sample
+
+        custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False)
+
+        # make sure we can set a list of attention processors
+        model.set_attn_processor(custom_diffusion_attn_procs)
+        model.to(torch_device)
+
+        # test that attn processors can be set to itself
+        model.set_attn_processor(model.attn_processors)
+
+        with torch.no_grad():
+            sample2 = model(**inputs_dict).sample
+
+        assert (sample1 - sample2).abs().max() < 3e-3
+
+    def test_custom_diffusion_save_load(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        with torch.no_grad():
+            old_sample = model(**inputs_dict).sample
+
+        custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False)
+        model.set_attn_processor(custom_diffusion_attn_procs)
+
+        with torch.no_grad():
+            sample = model(**inputs_dict).sample
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_attn_procs(tmpdirname, safe_serialization=False)
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_custom_diffusion_weights.bin")))
+            torch.manual_seed(0)
+            new_model = self.model_class(**init_dict)
+            new_model.load_attn_procs(tmpdirname, weight_name="pytorch_custom_diffusion_weights.bin")
+            new_model.to(torch_device)
+
+        with torch.no_grad():
+            new_sample = new_model(**inputs_dict).sample
+
+        assert (sample - new_sample).abs().max() < 1e-4
+
+        # custom diffusion and no custom diffusion should be the same
+        assert (sample - old_sample).abs().max() < 3e-3
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_custom_diffusion_xformers_on_off(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False)
+        model.set_attn_processor(custom_diffusion_attn_procs)
+
+        # default
+        with torch.no_grad():
+            sample = model(**inputs_dict).sample
+
+            model.enable_xformers_memory_efficient_attention()
+            on_sample = model(**inputs_dict).sample
+
+            model.disable_xformers_memory_efficient_attention()
+            off_sample = model(**inputs_dict).sample
+
+        assert (sample - on_sample).abs().max() < 1e-4
+        assert (sample - off_sample).abs().max() < 1e-4
+
+    def test_pickle(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        with torch.no_grad():
+            sample = model(**inputs_dict).sample
+
+        sample_copy = copy.copy(sample)
+
+        assert (sample - sample_copy).abs().max() < 1e-4
+
+    def test_asymmetrical_unet(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        # Add asymmetry to configs
+        init_dict["transformer_layers_per_block"] = [[3, 2], 1]
+        init_dict["reverse_transformer_layers_per_block"] = [[3, 4], 1]
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        output = model(**inputs_dict).sample
+        expected_shape = inputs_dict["sample"].shape
+
+        # Check if input and output shapes are the same
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_ip_adapter(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        # forward pass without ip-adapter
+        with torch.no_grad():
+            sample1 = model(**inputs_dict).sample
+
+        # update inputs_dict for ip-adapter
+        batch_size = inputs_dict["encoder_hidden_states"].shape[0]
+        # for ip-adapter image_embeds has shape [batch_size, num_image, embed_dim]
+        image_embeds = floats_tensor((batch_size, 1, model.config.cross_attention_dim)).to(torch_device)
+        inputs_dict["added_cond_kwargs"] = {"image_embeds": [image_embeds]}
+
+        # make ip_adapter_1 and ip_adapter_2
+        ip_adapter_1 = create_ip_adapter_state_dict(model)
+
+        image_proj_state_dict_2 = {k: w + 1.0 for k, w in ip_adapter_1["image_proj"].items()}
+        cross_attn_state_dict_2 = {k: w + 1.0 for k, w in ip_adapter_1["ip_adapter"].items()}
+        ip_adapter_2 = {}
+        ip_adapter_2.update({"image_proj": image_proj_state_dict_2, "ip_adapter": cross_attn_state_dict_2})
+
+        # forward pass ip_adapter_1
+        model._load_ip_adapter_weights([ip_adapter_1])
+        assert model.config.encoder_hid_dim_type == "ip_image_proj"
+        assert model.encoder_hid_proj is not None
+        assert model.down_blocks[0].attentions[0].transformer_blocks[0].attn2.processor.__class__.__name__ in (
+            "IPAdapterAttnProcessor",
+            "IPAdapterAttnProcessor2_0",
+        )
+        with torch.no_grad():
+            sample2 = model(**inputs_dict).sample
+
+        # forward pass with ip_adapter_2
+        model._load_ip_adapter_weights([ip_adapter_2])
+        with torch.no_grad():
+            sample3 = model(**inputs_dict).sample
+
+        # forward pass with ip_adapter_1 again
+        model._load_ip_adapter_weights([ip_adapter_1])
+        with torch.no_grad():
+            sample4 = model(**inputs_dict).sample
+
+        # forward pass with multiple ip-adapters and multiple images
+        model._load_ip_adapter_weights([ip_adapter_1, ip_adapter_2])
+        # set the scale for ip_adapter_2 to 0 so that result should be same as only load ip_adapter_1
+        for attn_processor in model.attn_processors.values():
+            if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
+                attn_processor.scale = [1, 0]
+        image_embeds_multi = image_embeds.repeat(1, 2, 1)
+        inputs_dict["added_cond_kwargs"] = {"image_embeds": [image_embeds_multi, image_embeds_multi]}
+        with torch.no_grad():
+            sample5 = model(**inputs_dict).sample
+
+        # forward pass with single ip-adapter & single image when image_embeds is not a list and a 2-d tensor
+        image_embeds = image_embeds.squeeze(1)
+        inputs_dict["added_cond_kwargs"] = {"image_embeds": image_embeds}
+
+        model._load_ip_adapter_weights(ip_adapter_1)
+        with torch.no_grad():
+            sample6 = model(**inputs_dict).sample
+
+        assert not sample1.allclose(sample2, atol=1e-4, rtol=1e-4)
+        assert not sample2.allclose(sample3, atol=1e-4, rtol=1e-4)
+        assert sample2.allclose(sample4, atol=1e-4, rtol=1e-4)
+        assert sample2.allclose(sample5, atol=1e-4, rtol=1e-4)
+        assert sample2.allclose(sample6, atol=1e-4, rtol=1e-4)
+
+    def test_ip_adapter_plus(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        # forward pass without ip-adapter
+        with torch.no_grad():
+            sample1 = model(**inputs_dict).sample
+
+        # update inputs_dict for ip-adapter
+        batch_size = inputs_dict["encoder_hidden_states"].shape[0]
+        # for ip-adapter-plus image_embeds has shape [batch_size, num_image, sequence_length, embed_dim]
+        image_embeds = floats_tensor((batch_size, 1, 1, model.config.cross_attention_dim)).to(torch_device)
+        inputs_dict["added_cond_kwargs"] = {"image_embeds": [image_embeds]}
+
+        # make ip_adapter_1 and ip_adapter_2
+        ip_adapter_1 = create_ip_adapter_plus_state_dict(model)
+
+        image_proj_state_dict_2 = {k: w + 1.0 for k, w in ip_adapter_1["image_proj"].items()}
+        cross_attn_state_dict_2 = {k: w + 1.0 for k, w in ip_adapter_1["ip_adapter"].items()}
+        ip_adapter_2 = {}
+        ip_adapter_2.update({"image_proj": image_proj_state_dict_2, "ip_adapter": cross_attn_state_dict_2})
+
+        # forward pass ip_adapter_1
+        model._load_ip_adapter_weights([ip_adapter_1])
+        assert model.config.encoder_hid_dim_type == "ip_image_proj"
+        assert model.encoder_hid_proj is not None
+        assert model.down_blocks[0].attentions[0].transformer_blocks[0].attn2.processor.__class__.__name__ in (
+            "IPAdapterAttnProcessor",
+            "IPAdapterAttnProcessor2_0",
+        )
+        with torch.no_grad():
+            sample2 = model(**inputs_dict).sample
+
+        # forward pass with ip_adapter_2
+        model._load_ip_adapter_weights([ip_adapter_2])
+        with torch.no_grad():
+            sample3 = model(**inputs_dict).sample
+
+        # forward pass with ip_adapter_1 again
+        model._load_ip_adapter_weights([ip_adapter_1])
+        with torch.no_grad():
+            sample4 = model(**inputs_dict).sample
+
+        # forward pass with multiple ip-adapters and multiple images
+        model._load_ip_adapter_weights([ip_adapter_1, ip_adapter_2])
+        # set the scale for ip_adapter_2 to 0 so that result should be same as only load ip_adapter_1
+        for attn_processor in model.attn_processors.values():
+            if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
+                attn_processor.scale = [1, 0]
+        image_embeds_multi = image_embeds.repeat(1, 2, 1, 1)
+        inputs_dict["added_cond_kwargs"] = {"image_embeds": [image_embeds_multi, image_embeds_multi]}
+        with torch.no_grad():
+            sample5 = model(**inputs_dict).sample
+
+        # forward pass with single ip-adapter & single image when image_embeds is a 3-d tensor
+        image_embeds = image_embeds[:,].squeeze(1)
+        inputs_dict["added_cond_kwargs"] = {"image_embeds": image_embeds}
+
+        model._load_ip_adapter_weights(ip_adapter_1)
+        with torch.no_grad():
+            sample6 = model(**inputs_dict).sample
+
+        assert not sample1.allclose(sample2, atol=1e-4, rtol=1e-4)
+        assert not sample2.allclose(sample3, atol=1e-4, rtol=1e-4)
+        assert sample2.allclose(sample4, atol=1e-4, rtol=1e-4)
+        assert sample2.allclose(sample5, atol=1e-4, rtol=1e-4)
+        assert sample2.allclose(sample6, atol=1e-4, rtol=1e-4)
+
+    @require_torch_gpu
+    def test_load_sharded_checkpoint_from_hub(self):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained("hf-internal-testing/unet2d-sharded-dummy")
+        loaded_model = loaded_model.to(torch_device)
+        new_output = loaded_model(**inputs_dict)
+
+        assert loaded_model
+        assert new_output.sample.shape == (4, 4, 16, 16)
+
+    @require_torch_gpu
+    def test_load_sharded_checkpoint_from_hub_subfolder(self):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained(
+            "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet"
+        )
+        loaded_model = loaded_model.to(torch_device)
+        new_output = loaded_model(**inputs_dict)
+
+        assert loaded_model
+        assert new_output.sample.shape == (4, 4, 16, 16)
+
+    @require_torch_gpu
+    def test_load_sharded_checkpoint_from_hub_local(self):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy")
+        loaded_model = self.model_class.from_pretrained(ckpt_path, local_files_only=True)
+        loaded_model = loaded_model.to(torch_device)
+        new_output = loaded_model(**inputs_dict)
+
+        assert loaded_model
+        assert new_output.sample.shape == (4, 4, 16, 16)
+
+    @require_torch_gpu
+    def test_load_sharded_checkpoint_from_hub_local_subfolder(self):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy-subfolder")
+        loaded_model = self.model_class.from_pretrained(ckpt_path, subfolder="unet", local_files_only=True)
+        loaded_model = loaded_model.to(torch_device)
+        new_output = loaded_model(**inputs_dict)
+
+        assert loaded_model
+        assert new_output.sample.shape == (4, 4, 16, 16)
+
+    @require_torch_gpu
+    def test_load_sharded_checkpoint_device_map_from_hub(self):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained("hf-internal-testing/unet2d-sharded-dummy", device_map="auto")
+        new_output = loaded_model(**inputs_dict)
+
+        assert loaded_model
+        assert new_output.sample.shape == (4, 4, 16, 16)
+
+    @require_torch_gpu
+    def test_load_sharded_checkpoint_device_map_from_hub_subfolder(self):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained(
+            "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet", device_map="auto"
+        )
+        new_output = loaded_model(**inputs_dict)
+
+        assert loaded_model
+        assert new_output.sample.shape == (4, 4, 16, 16)
+
+    @require_torch_gpu
+    def test_load_sharded_checkpoint_device_map_from_hub_local(self):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy")
+        loaded_model = self.model_class.from_pretrained(ckpt_path, local_files_only=True, device_map="auto")
+        new_output = loaded_model(**inputs_dict)
+
+        assert loaded_model
+        assert new_output.sample.shape == (4, 4, 16, 16)
+
+    @require_torch_gpu
+    def test_load_sharded_checkpoint_device_map_from_hub_local_subfolder(self):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy-subfolder")
+        loaded_model = self.model_class.from_pretrained(
+            ckpt_path, local_files_only=True, subfolder="unet", device_map="auto"
+        )
+        new_output = loaded_model(**inputs_dict)
+
+        assert loaded_model
+        assert new_output.sample.shape == (4, 4, 16, 16)
+
+    @require_torch_gpu
+    def test_load_sharded_checkpoint_with_variant_from_hub(self):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained(
+            "hf-internal-testing/unet2d-sharded-with-variant-dummy", variant="fp16"
+        )
+        loaded_model = loaded_model.to(torch_device)
+        new_output = loaded_model(**inputs_dict)
+
+        assert loaded_model
+        assert new_output.sample.shape == (4, 4, 16, 16)
+
+    @require_peft_backend
+    def test_lora(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        # forward pass without LoRA
+        with torch.no_grad():
+            non_lora_sample = model(**inputs_dict).sample
+
+        unet_lora_config = get_unet_lora_config()
+        model.add_adapter(unet_lora_config)
+
+        assert check_if_lora_correctly_set(model), "Lora not correctly set in UNet."
+
+        # forward pass with LoRA
+        with torch.no_grad():
+            lora_sample = model(**inputs_dict).sample
+
+        assert not torch.allclose(
+            non_lora_sample, lora_sample, atol=1e-4, rtol=1e-4
+        ), "LoRA injected UNet should produce different results."
+
+    @require_peft_backend
+    def test_lora_serialization(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        # forward pass without LoRA
+        with torch.no_grad():
+            non_lora_sample = model(**inputs_dict).sample
+
+        unet_lora_config = get_unet_lora_config()
+        model.add_adapter(unet_lora_config)
+
+        assert check_if_lora_correctly_set(model), "Lora not correctly set in UNet."
+
+        # forward pass with LoRA
+        with torch.no_grad():
+            lora_sample_1 = model(**inputs_dict).sample
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_attn_procs(tmpdirname)
+            model.unload_lora()
+            model.load_attn_procs(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))
+
+            assert check_if_lora_correctly_set(model), "Lora not correctly set in UNet."
+
+            with torch.no_grad():
+                lora_sample_2 = model(**inputs_dict).sample
+
+        assert not torch.allclose(
+            non_lora_sample, lora_sample_1, atol=1e-4, rtol=1e-4
+        ), "LoRA injected UNet should produce different results."
+        assert torch.allclose(
+            lora_sample_1, lora_sample_2, atol=1e-4, rtol=1e-4
+        ), "Loading from a saved checkpoint should produce identical results."
+
+
+@slow
+class UNet2DConditionModelIntegrationTests(unittest.TestCase):
+    def get_file_format(self, seed, shape):
+        return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False):
+        dtype = torch.float16 if fp16 else torch.float32
+        image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
+        return image
+
+    def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
+        variant = "fp16" if fp16 else None
+        torch_dtype = torch.float16 if fp16 else torch.float32
+
+        model = UNet2DConditionModel.from_pretrained(
+            model_id, subfolder="unet", torch_dtype=torch_dtype, variant=variant
+        )
+        model.to(torch_device).eval()
+
+        return model
+
+    @require_torch_gpu
+    def test_set_attention_slice_auto(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        unet = self.get_unet_model()
+        unet.set_attention_slice("auto")
+
+        latents = self.get_latents(33)
+        encoder_hidden_states = self.get_encoder_hidden_states(33)
+        timestep = 1
+
+        with torch.no_grad():
+            _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+
+        assert mem_bytes < 5 * 10**9
+
+    @require_torch_gpu
+    def test_set_attention_slice_max(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        unet = self.get_unet_model()
+        unet.set_attention_slice("max")
+
+        latents = self.get_latents(33)
+        encoder_hidden_states = self.get_encoder_hidden_states(33)
+        timestep = 1
+
+        with torch.no_grad():
+            _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+
+        assert mem_bytes < 5 * 10**9
+
+    @require_torch_gpu
+    def test_set_attention_slice_int(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        unet = self.get_unet_model()
+        unet.set_attention_slice(2)
+
+        latents = self.get_latents(33)
+        encoder_hidden_states = self.get_encoder_hidden_states(33)
+        timestep = 1
+
+        with torch.no_grad():
+            _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+
+        assert mem_bytes < 5 * 10**9
+
+    @require_torch_gpu
+    def test_set_attention_slice_list(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        # there are 32 sliceable layers
+        slice_list = 16 * [2, 3]
+        unet = self.get_unet_model()
+        unet.set_attention_slice(slice_list)
+
+        latents = self.get_latents(33)
+        encoder_hidden_states = self.get_encoder_hidden_states(33)
+        timestep = 1
+
+        with torch.no_grad():
+            _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+
+        assert mem_bytes < 5 * 10**9
+
+    def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False):
+        dtype = torch.float16 if fp16 else torch.float32
+        hidden_states = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
+        return hidden_states
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, 4, [-0.4424, 0.1510, -0.1937, 0.2118, 0.3746, -0.3957, 0.0160, -0.0435]],
+            [47, 0.55, [-0.1508, 0.0379, -0.3075, 0.2540, 0.3633, -0.0821, 0.1719, -0.0207]],
+            [21, 0.89, [-0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, 0.1778]],
+            [9, 1000, [0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, -0.4424]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator_with_fp16
+    def test_compvis_sd_v1_4(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4")
+        latents = self.get_latents(seed)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed)
+
+        timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [83, 4, [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806]],
+            [17, 0.55, [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701]],
+            [8, 0.89, [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639]],
+            [3, 1000, [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator_with_fp16
+    def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True)
+        latents = self.get_latents(seed, fp16=True)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
+
+        timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, 4, [-0.4430, 0.1570, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722]],
+            [47, 0.55, [-0.1415, 0.0129, -0.3136, 0.2257, 0.3430, -0.0536, 0.2114, -0.0436]],
+            [21, 0.89, [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.1750]],
+            [9, 1000, [0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, -0.4423]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator
+    @skip_mps
+    def test_compvis_sd_v1_5(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="stable-diffusion-v1-5/stable-diffusion-v1-5")
+        latents = self.get_latents(seed)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed)
+
+        timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [83, 4, [-0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, -0.5972]],
+            [17, 0.55, [-0.1290, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, 0.0322]],
+            [8, 0.89, [-0.5283, 0.1198, 0.0870, -0.1141, 0.9189, -0.0150, 0.5474, 0.4319]],
+            [3, 1000, [-0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.0280, -1.0020]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator_with_fp16
+    def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="stable-diffusion-v1-5/stable-diffusion-v1-5", fp16=True)
+        latents = self.get_latents(seed, fp16=True)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
+
+        timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, 4, [-0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, -0.4858]],
+            [47, 0.55, [-0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, 0.9073]],
+            [21, 0.89, [0.0327, 0.4399, -0.6358, 0.3417, 0.4120, -0.5621, -0.0397, -1.0430]],
+            [9, 1000, [0.1600, 0.7303, -1.0556, -0.3515, -0.7440, -1.2037, -1.8149, -1.8931]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator
+    @skip_mps
+    def test_compvis_sd_inpaint(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="stable-diffusion-v1-5/stable-diffusion-inpainting")
+        latents = self.get_latents(seed, shape=(4, 9, 64, 64))
+        encoder_hidden_states = self.get_encoder_hidden_states(seed)
+
+        timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == (4, 4, 64, 64)
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=3e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [83, 4, [-0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, 1.1387]],
+            [17, 0.55, [0.0975, -0.2856, -0.3508, -0.4600, 0.3376, 0.2930, -0.2747, -0.7026]],
+            [8, 0.89, [-0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, -0.3486]],
+            [3, 1000, [0.4790, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, -0.9741]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator_with_fp16
+    def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="stable-diffusion-v1-5/stable-diffusion-inpainting", fp16=True)
+        latents = self.get_latents(seed, shape=(4, 9, 64, 64), fp16=True)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
+
+        timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == (4, 4, 64, 64)
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.2310]],
+            [17, 0.55, [0.1164, -0.0216, 0.0170, 0.1589, -0.3120, 0.1005, -0.0581, -0.1458]],
+            [8, 0.89, [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139]],
+            [3, 1000, [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.2340, -0.0539]],
+            # fmt: on
+        ]
+    )
+    @require_torch_accelerator_with_fp16
+    def test_stabilityai_sd_v2_fp16(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True)
+        latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True)
+
+        timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
diff --git a/diffusers/tests/models/unets/test_models_unet_2d_flax.py b/diffusers/tests/models/unets/test_models_unet_2d_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..69a0704dca9dae32a7d612b82cbedc0454a0a1b5
--- /dev/null
+++ b/diffusers/tests/models/unets/test_models_unet_2d_flax.py
@@ -0,0 +1,104 @@
+import gc
+import unittest
+
+from parameterized import parameterized
+
+from diffusers import FlaxUNet2DConditionModel
+from diffusers.utils import is_flax_available
+from diffusers.utils.testing_utils import load_hf_numpy, require_flax, slow
+
+
+if is_flax_available():
+    import jax
+    import jax.numpy as jnp
+
+
+@slow
+@require_flax
+class FlaxUNet2DConditionModelIntegrationTests(unittest.TestCase):
+    def get_file_format(self, seed, shape):
+        return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+
+    def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False):
+        dtype = jnp.bfloat16 if fp16 else jnp.float32
+        image = jnp.array(load_hf_numpy(self.get_file_format(seed, shape)), dtype=dtype)
+        return image
+
+    def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
+        dtype = jnp.bfloat16 if fp16 else jnp.float32
+        revision = "bf16" if fp16 else None
+
+        model, params = FlaxUNet2DConditionModel.from_pretrained(
+            model_id, subfolder="unet", dtype=dtype, revision=revision
+        )
+        return model, params
+
+    def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False):
+        dtype = jnp.bfloat16 if fp16 else jnp.float32
+        hidden_states = jnp.array(load_hf_numpy(self.get_file_format(seed, shape)), dtype=dtype)
+        return hidden_states
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [83, 4, [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806]],
+            [17, 0.55, [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701]],
+            [8, 0.89, [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639]],
+            [3, 1000, [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078]],
+            # fmt: on
+        ]
+    )
+    def test_compvis_sd_v1_4_flax_vs_torch_fp16(self, seed, timestep, expected_slice):
+        model, params = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True)
+        latents = self.get_latents(seed, fp16=True)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
+
+        sample = model.apply(
+            {"params": params},
+            latents,
+            jnp.array(timestep, dtype=jnp.int32),
+            encoder_hidden_states=encoder_hidden_states,
+        ).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = jnp.asarray(jax.device_get((sample[-1, -2:, -2:, :2].flatten())), dtype=jnp.float32)
+        expected_output_slice = jnp.array(expected_slice, dtype=jnp.float32)
+
+        # Found torch (float16) and flax (bfloat16) outputs to be within this tolerance, in the same hardware
+        assert jnp.allclose(output_slice, expected_output_slice, atol=1e-2)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.2310]],
+            [17, 0.55, [0.1164, -0.0216, 0.0170, 0.1589, -0.3120, 0.1005, -0.0581, -0.1458]],
+            [8, 0.89, [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139]],
+            [3, 1000, [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.2340, -0.0539]],
+            # fmt: on
+        ]
+    )
+    def test_stabilityai_sd_v2_flax_vs_torch_fp16(self, seed, timestep, expected_slice):
+        model, params = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True)
+        latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True)
+
+        sample = model.apply(
+            {"params": params},
+            latents,
+            jnp.array(timestep, dtype=jnp.int32),
+            encoder_hidden_states=encoder_hidden_states,
+        ).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = jnp.asarray(jax.device_get((sample[-1, -2:, -2:, :2].flatten())), dtype=jnp.float32)
+        expected_output_slice = jnp.array(expected_slice, dtype=jnp.float32)
+
+        # Found torch (float16) and flax (bfloat16) outputs to be within this tolerance, on the same hardware
+        assert jnp.allclose(output_slice, expected_output_slice, atol=1e-2)
diff --git a/diffusers/tests/models/unets/test_models_unet_3d_condition.py b/diffusers/tests/models/unets/test_models_unet_3d_condition.py
new file mode 100644
index 0000000000000000000000000000000000000000..e798586b696517acaa852d80bb4248d96a6e17c3
--- /dev/null
+++ b/diffusers/tests/models/unets/test_models_unet_3d_condition.py
@@ -0,0 +1,183 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers.models import ModelMixin, UNet3DConditionModel
+from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, skip_mps, torch_device
+
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+
+enable_full_determinism()
+
+logger = logging.get_logger(__name__)
+
+
+@skip_mps
+class UNet3DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNet3DConditionModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 4
+        num_frames = 4
+        sizes = (16, 16)
+
+        noise = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device)
+        time_step = torch.tensor([10]).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, 4, 8)).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
+
+    @property
+    def input_shape(self):
+        return (4, 4, 16, 16)
+
+    @property
+    def output_shape(self):
+        return (4, 4, 16, 16)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (4, 8),
+            "norm_num_groups": 4,
+            "down_block_types": (
+                "CrossAttnDownBlock3D",
+                "DownBlock3D",
+            ),
+            "up_block_types": ("UpBlock3D", "CrossAttnUpBlock3D"),
+            "cross_attention_dim": 8,
+            "attention_head_dim": 2,
+            "out_channels": 4,
+            "in_channels": 4,
+            "layers_per_block": 1,
+            "sample_size": 16,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_enable_works(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+
+        model.enable_xformers_memory_efficient_attention()
+
+        assert (
+            model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__
+            == "XFormersAttnProcessor"
+        ), "xformers is not enabled"
+
+    # Overriding to set `norm_num_groups` needs to be different for this model.
+    def test_forward_with_norm_groups(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict["block_out_channels"] = (32, 64)
+        init_dict["norm_num_groups"] = 32
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    # Overriding since the UNet3D outputs a different structure.
+    def test_determinism(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            # Warmup pass when using mps (see #372)
+            if torch_device == "mps" and isinstance(model, ModelMixin):
+                model(**self.dummy_input)
+
+            first = model(**inputs_dict)
+            if isinstance(first, dict):
+                first = first.sample
+
+            second = model(**inputs_dict)
+            if isinstance(second, dict):
+                second = second.sample
+
+        out_1 = first.cpu().numpy()
+        out_2 = second.cpu().numpy()
+        out_1 = out_1[~np.isnan(out_1)]
+        out_2 = out_2[~np.isnan(out_2)]
+        max_diff = np.amax(np.abs(out_1 - out_2))
+        self.assertLessEqual(max_diff, 1e-5)
+
+    def test_model_attention_slicing(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["block_out_channels"] = (16, 32)
+        init_dict["attention_head_dim"] = 8
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        model.set_attention_slice("auto")
+        with torch.no_grad():
+            output = model(**inputs_dict)
+        assert output is not None
+
+        model.set_attention_slice("max")
+        with torch.no_grad():
+            output = model(**inputs_dict)
+        assert output is not None
+
+        model.set_attention_slice(2)
+        with torch.no_grad():
+            output = model(**inputs_dict)
+        assert output is not None
+
+    def test_feed_forward_chunking(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict["block_out_channels"] = (32, 64)
+        init_dict["norm_num_groups"] = 32
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)[0]
+
+        model.enable_forward_chunking()
+        with torch.no_grad():
+            output_2 = model(**inputs_dict)[0]
+
+        self.assertEqual(output.shape, output_2.shape, "Shape doesn't match")
+        assert np.abs(output.cpu() - output_2.cpu()).max() < 1e-2
diff --git a/diffusers/tests/models/unets/test_models_unet_controlnetxs.py b/diffusers/tests/models/unets/test_models_unet_controlnetxs.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f3662e017504c6bd01633d39c25ec0f03f93f19
--- /dev/null
+++ b/diffusers/tests/models/unets/test_models_unet_controlnetxs.py
@@ -0,0 +1,349 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import unittest
+
+import numpy as np
+import torch
+from torch import nn
+
+from diffusers import ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
+from diffusers.utils import logging
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, is_flaky, torch_device
+
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+
+logger = logging.get_logger(__name__)
+
+enable_full_determinism()
+
+
+class UNetControlNetXSModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNetControlNetXSModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 4
+        sizes = (16, 16)
+        conditioning_image_size = (3, 32, 32)  # size of additional, unprocessed image for control-conditioning
+
+        noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor([10]).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, 4, 8)).to(torch_device)
+        controlnet_cond = floats_tensor((batch_size, *conditioning_image_size)).to(torch_device)
+        conditioning_scale = 1
+
+        return {
+            "sample": noise,
+            "timestep": time_step,
+            "encoder_hidden_states": encoder_hidden_states,
+            "controlnet_cond": controlnet_cond,
+            "conditioning_scale": conditioning_scale,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 16, 16)
+
+    @property
+    def output_shape(self):
+        return (4, 16, 16)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "sample_size": 16,
+            "down_block_types": ("DownBlock2D", "CrossAttnDownBlock2D"),
+            "up_block_types": ("CrossAttnUpBlock2D", "UpBlock2D"),
+            "block_out_channels": (4, 8),
+            "cross_attention_dim": 8,
+            "transformer_layers_per_block": 1,
+            "num_attention_heads": 2,
+            "norm_num_groups": 4,
+            "upcast_attention": False,
+            "ctrl_block_out_channels": [2, 4],
+            "ctrl_num_attention_heads": 4,
+            "ctrl_max_norm_num_groups": 2,
+            "ctrl_conditioning_embedding_out_channels": (2, 2),
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def get_dummy_unet(self):
+        """For some tests we also need the underlying UNet. For these, we'll build the UNetControlNetXSModel from the UNet and ControlNetXS-Adapter"""
+        return UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=16,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=8,
+            norm_num_groups=4,
+            use_linear_projection=True,
+        )
+
+    def get_dummy_controlnet_from_unet(self, unet, **kwargs):
+        """For some tests we also need the underlying ControlNetXS-Adapter. For these, we'll build the UNetControlNetXSModel from the UNet and ControlNetXS-Adapter"""
+        # size_ratio and conditioning_embedding_out_channels chosen to keep model small
+        return ControlNetXSAdapter.from_unet(unet, size_ratio=1, conditioning_embedding_out_channels=(2, 2), **kwargs)
+
+    def test_from_unet(self):
+        unet = self.get_dummy_unet()
+        controlnet = self.get_dummy_controlnet_from_unet(unet)
+
+        model = UNetControlNetXSModel.from_unet(unet, controlnet)
+        model_state_dict = model.state_dict()
+
+        def assert_equal_weights(module, weight_dict_prefix):
+            for param_name, param_value in module.named_parameters():
+                assert torch.equal(model_state_dict[weight_dict_prefix + "." + param_name], param_value)
+
+        # # check unet
+        # everything expect down,mid,up blocks
+        modules_from_unet = [
+            "time_embedding",
+            "conv_in",
+            "conv_norm_out",
+            "conv_out",
+        ]
+        for p in modules_from_unet:
+            assert_equal_weights(getattr(unet, p), "base_" + p)
+        optional_modules_from_unet = [
+            "class_embedding",
+            "add_time_proj",
+            "add_embedding",
+        ]
+        for p in optional_modules_from_unet:
+            if hasattr(unet, p) and getattr(unet, p) is not None:
+                assert_equal_weights(getattr(unet, p), "base_" + p)
+        # down blocks
+        assert len(unet.down_blocks) == len(model.down_blocks)
+        for i, d in enumerate(unet.down_blocks):
+            assert_equal_weights(d.resnets, f"down_blocks.{i}.base_resnets")
+            if hasattr(d, "attentions"):
+                assert_equal_weights(d.attentions, f"down_blocks.{i}.base_attentions")
+            if hasattr(d, "downsamplers") and getattr(d, "downsamplers") is not None:
+                assert_equal_weights(d.downsamplers[0], f"down_blocks.{i}.base_downsamplers")
+        # mid block
+        assert_equal_weights(unet.mid_block, "mid_block.base_midblock")
+        # up blocks
+        assert len(unet.up_blocks) == len(model.up_blocks)
+        for i, u in enumerate(unet.up_blocks):
+            assert_equal_weights(u.resnets, f"up_blocks.{i}.resnets")
+            if hasattr(u, "attentions"):
+                assert_equal_weights(u.attentions, f"up_blocks.{i}.attentions")
+            if hasattr(u, "upsamplers") and getattr(u, "upsamplers") is not None:
+                assert_equal_weights(u.upsamplers[0], f"up_blocks.{i}.upsamplers")
+
+        # # check controlnet
+        # everything expect down,mid,up blocks
+        modules_from_controlnet = {
+            "controlnet_cond_embedding": "controlnet_cond_embedding",
+            "conv_in": "ctrl_conv_in",
+            "control_to_base_for_conv_in": "control_to_base_for_conv_in",
+        }
+        optional_modules_from_controlnet = {"time_embedding": "ctrl_time_embedding"}
+        for name_in_controlnet, name_in_unetcnxs in modules_from_controlnet.items():
+            assert_equal_weights(getattr(controlnet, name_in_controlnet), name_in_unetcnxs)
+
+        for name_in_controlnet, name_in_unetcnxs in optional_modules_from_controlnet.items():
+            if hasattr(controlnet, name_in_controlnet) and getattr(controlnet, name_in_controlnet) is not None:
+                assert_equal_weights(getattr(controlnet, name_in_controlnet), name_in_unetcnxs)
+        # down blocks
+        assert len(controlnet.down_blocks) == len(model.down_blocks)
+        for i, d in enumerate(controlnet.down_blocks):
+            assert_equal_weights(d.resnets, f"down_blocks.{i}.ctrl_resnets")
+            assert_equal_weights(d.base_to_ctrl, f"down_blocks.{i}.base_to_ctrl")
+            assert_equal_weights(d.ctrl_to_base, f"down_blocks.{i}.ctrl_to_base")
+            if d.attentions is not None:
+                assert_equal_weights(d.attentions, f"down_blocks.{i}.ctrl_attentions")
+            if d.downsamplers is not None:
+                assert_equal_weights(d.downsamplers, f"down_blocks.{i}.ctrl_downsamplers")
+        # mid block
+        assert_equal_weights(controlnet.mid_block.base_to_ctrl, "mid_block.base_to_ctrl")
+        assert_equal_weights(controlnet.mid_block.midblock, "mid_block.ctrl_midblock")
+        assert_equal_weights(controlnet.mid_block.ctrl_to_base, "mid_block.ctrl_to_base")
+        # up blocks
+        assert len(controlnet.up_connections) == len(model.up_blocks)
+        for i, u in enumerate(controlnet.up_connections):
+            assert_equal_weights(u.ctrl_to_base, f"up_blocks.{i}.ctrl_to_base")
+
+    def test_freeze_unet(self):
+        def assert_frozen(module):
+            for p in module.parameters():
+                assert not p.requires_grad
+
+        def assert_unfrozen(module):
+            for p in module.parameters():
+                assert p.requires_grad
+
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = UNetControlNetXSModel(**init_dict)
+        model.freeze_unet_params()
+
+        # # check unet
+        # everything expect down,mid,up blocks
+        modules_from_unet = [
+            model.base_time_embedding,
+            model.base_conv_in,
+            model.base_conv_norm_out,
+            model.base_conv_out,
+        ]
+        for m in modules_from_unet:
+            assert_frozen(m)
+
+        optional_modules_from_unet = [
+            model.base_add_time_proj,
+            model.base_add_embedding,
+        ]
+        for m in optional_modules_from_unet:
+            if m is not None:
+                assert_frozen(m)
+
+        # down blocks
+        for i, d in enumerate(model.down_blocks):
+            assert_frozen(d.base_resnets)
+            if isinstance(d.base_attentions, nn.ModuleList):  # attentions can be list of Nones
+                assert_frozen(d.base_attentions)
+            if d.base_downsamplers is not None:
+                assert_frozen(d.base_downsamplers)
+
+        # mid block
+        assert_frozen(model.mid_block.base_midblock)
+
+        # up blocks
+        for i, u in enumerate(model.up_blocks):
+            assert_frozen(u.resnets)
+            if isinstance(u.attentions, nn.ModuleList):  # attentions can be list of Nones
+                assert_frozen(u.attentions)
+            if u.upsamplers is not None:
+                assert_frozen(u.upsamplers)
+
+        # # check controlnet
+        # everything expect down,mid,up blocks
+        modules_from_controlnet = [
+            model.controlnet_cond_embedding,
+            model.ctrl_conv_in,
+            model.control_to_base_for_conv_in,
+        ]
+        optional_modules_from_controlnet = [model.ctrl_time_embedding]
+
+        for m in modules_from_controlnet:
+            assert_unfrozen(m)
+        for m in optional_modules_from_controlnet:
+            if m is not None:
+                assert_unfrozen(m)
+
+        # down blocks
+        for d in model.down_blocks:
+            assert_unfrozen(d.ctrl_resnets)
+            assert_unfrozen(d.base_to_ctrl)
+            assert_unfrozen(d.ctrl_to_base)
+            if isinstance(d.ctrl_attentions, nn.ModuleList):  # attentions can be list of Nones
+                assert_unfrozen(d.ctrl_attentions)
+            if d.ctrl_downsamplers is not None:
+                assert_unfrozen(d.ctrl_downsamplers)
+        # mid block
+        assert_unfrozen(model.mid_block.base_to_ctrl)
+        assert_unfrozen(model.mid_block.ctrl_midblock)
+        assert_unfrozen(model.mid_block.ctrl_to_base)
+        # up blocks
+        for u in model.up_blocks:
+            assert_unfrozen(u.ctrl_to_base)
+
+    def test_gradient_checkpointing_is_applied(self):
+        model_class_copy = copy.copy(UNetControlNetXSModel)
+
+        modules_with_gc_enabled = {}
+
+        # now monkey patch the following function:
+        #     def _set_gradient_checkpointing(self, module, value=False):
+        #         if hasattr(module, "gradient_checkpointing"):
+        #             module.gradient_checkpointing = value
+
+        def _set_gradient_checkpointing_new(self, module, value=False):
+            if hasattr(module, "gradient_checkpointing"):
+                module.gradient_checkpointing = value
+                modules_with_gc_enabled[module.__class__.__name__] = True
+
+        model_class_copy._set_gradient_checkpointing = _set_gradient_checkpointing_new
+
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = model_class_copy(**init_dict)
+
+        model.enable_gradient_checkpointing()
+
+        EXPECTED_SET = {
+            "Transformer2DModel",
+            "UNetMidBlock2DCrossAttn",
+            "ControlNetXSCrossAttnDownBlock2D",
+            "ControlNetXSCrossAttnMidBlock2D",
+            "ControlNetXSCrossAttnUpBlock2D",
+        }
+
+        assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET
+        assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
+
+    @is_flaky
+    def test_forward_no_control(self):
+        unet = self.get_dummy_unet()
+        controlnet = self.get_dummy_controlnet_from_unet(unet)
+
+        model = UNetControlNetXSModel.from_unet(unet, controlnet)
+
+        unet = unet.to(torch_device)
+        model = model.to(torch_device)
+
+        input_ = self.dummy_input
+
+        control_specific_input = ["controlnet_cond", "conditioning_scale"]
+        input_for_unet = {k: v for k, v in input_.items() if k not in control_specific_input}
+
+        with torch.no_grad():
+            unet_output = unet(**input_for_unet).sample.cpu()
+            unet_controlnet_output = model(**input_, apply_control=False).sample.cpu()
+
+        assert np.abs(unet_output.flatten() - unet_controlnet_output.flatten()).max() < 3e-4
+
+    def test_time_embedding_mixing(self):
+        unet = self.get_dummy_unet()
+        controlnet = self.get_dummy_controlnet_from_unet(unet)
+        controlnet_mix_time = self.get_dummy_controlnet_from_unet(
+            unet, time_embedding_mix=0.5, learn_time_embedding=True
+        )
+
+        model = UNetControlNetXSModel.from_unet(unet, controlnet)
+        model_mix_time = UNetControlNetXSModel.from_unet(unet, controlnet_mix_time)
+
+        unet = unet.to(torch_device)
+        model = model.to(torch_device)
+        model_mix_time = model_mix_time.to(torch_device)
+
+        input_ = self.dummy_input
+
+        with torch.no_grad():
+            output = model(**input_).sample
+            output_mix_time = model_mix_time(**input_).sample
+
+        assert output.shape == output_mix_time.shape
+
+    def test_forward_with_norm_groups(self):
+        # UNetControlNetXSModel currently only supports StableDiffusion and StableDiffusion-XL, both of which have norm_num_groups fixed at 32. So we don't need to test different values for norm_num_groups.
+        pass
diff --git a/diffusers/tests/models/unets/test_models_unet_motion.py b/diffusers/tests/models/unets/test_models_unet_motion.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee05f0d9382408020d7e7bcd1e015836c08f3ec7
--- /dev/null
+++ b/diffusers/tests/models/unets/test_models_unet_motion.py
@@ -0,0 +1,341 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import os
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import MotionAdapter, UNet2DConditionModel, UNetMotionModel
+from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+
+logger = logging.get_logger(__name__)
+
+enable_full_determinism()
+
+
+class UNetMotionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNetMotionModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 4
+        num_frames = 4
+        sizes = (16, 16)
+
+        noise = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device)
+        time_step = torch.tensor([10]).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size * num_frames, 4, 16)).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
+
+    @property
+    def input_shape(self):
+        return (4, 4, 16, 16)
+
+    @property
+    def output_shape(self):
+        return (4, 4, 16, 16)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (16, 32),
+            "norm_num_groups": 16,
+            "down_block_types": ("CrossAttnDownBlockMotion", "DownBlockMotion"),
+            "up_block_types": ("UpBlockMotion", "CrossAttnUpBlockMotion"),
+            "cross_attention_dim": 16,
+            "num_attention_heads": 2,
+            "out_channels": 4,
+            "in_channels": 4,
+            "layers_per_block": 1,
+            "sample_size": 16,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_from_unet2d(self):
+        torch.manual_seed(0)
+        unet2d = UNet2DConditionModel()
+
+        torch.manual_seed(1)
+        model = self.model_class.from_unet2d(unet2d)
+        model_state_dict = model.state_dict()
+
+        for param_name, param_value in unet2d.named_parameters():
+            self.assertTrue(torch.equal(model_state_dict[param_name], param_value))
+
+    def test_freeze_unet2d(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.freeze_unet2d_params()
+
+        for param_name, param_value in model.named_parameters():
+            if "motion_modules" not in param_name:
+                self.assertFalse(param_value.requires_grad)
+
+            else:
+                self.assertTrue(param_value.requires_grad)
+
+    def test_loading_motion_adapter(self):
+        model = self.model_class()
+        adapter = MotionAdapter()
+        model.load_motion_modules(adapter)
+
+        for idx, down_block in enumerate(model.down_blocks):
+            adapter_state_dict = adapter.down_blocks[idx].motion_modules.state_dict()
+            for param_name, param_value in down_block.motion_modules.named_parameters():
+                self.assertTrue(torch.equal(adapter_state_dict[param_name], param_value))
+
+        for idx, up_block in enumerate(model.up_blocks):
+            adapter_state_dict = adapter.up_blocks[idx].motion_modules.state_dict()
+            for param_name, param_value in up_block.motion_modules.named_parameters():
+                self.assertTrue(torch.equal(adapter_state_dict[param_name], param_value))
+
+        mid_block_adapter_state_dict = adapter.mid_block.motion_modules.state_dict()
+        for param_name, param_value in model.mid_block.motion_modules.named_parameters():
+            self.assertTrue(torch.equal(mid_block_adapter_state_dict[param_name], param_value))
+
+    def test_saving_motion_modules(self):
+        torch.manual_seed(0)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_motion_modules(tmpdirname)
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "diffusion_pytorch_model.safetensors")))
+
+            adapter_loaded = MotionAdapter.from_pretrained(tmpdirname)
+
+            torch.manual_seed(0)
+            model_loaded = self.model_class(**init_dict)
+            model_loaded.load_motion_modules(adapter_loaded)
+            model_loaded.to(torch_device)
+
+        with torch.no_grad():
+            output = model(**inputs_dict)[0]
+            output_loaded = model_loaded(**inputs_dict)[0]
+
+        max_diff = (output - output_loaded).abs().max().item()
+        self.assertLessEqual(max_diff, 1e-4, "Models give different forward passes")
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_enable_works(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+
+        model.enable_xformers_memory_efficient_attention()
+
+        assert (
+            model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__
+            == "XFormersAttnProcessor"
+        ), "xformers is not enabled"
+
+    def test_gradient_checkpointing_is_applied(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model_class_copy = copy.copy(self.model_class)
+
+        modules_with_gc_enabled = {}
+
+        # now monkey patch the following function:
+        #     def _set_gradient_checkpointing(self, module, value=False):
+        #         if hasattr(module, "gradient_checkpointing"):
+        #             module.gradient_checkpointing = value
+
+        def _set_gradient_checkpointing_new(self, module, value=False):
+            if hasattr(module, "gradient_checkpointing"):
+                module.gradient_checkpointing = value
+                modules_with_gc_enabled[module.__class__.__name__] = True
+
+        model_class_copy._set_gradient_checkpointing = _set_gradient_checkpointing_new
+
+        model = model_class_copy(**init_dict)
+        model.enable_gradient_checkpointing()
+
+        EXPECTED_SET = {
+            "CrossAttnUpBlockMotion",
+            "CrossAttnDownBlockMotion",
+            "UNetMidBlockCrossAttnMotion",
+            "UpBlockMotion",
+            "Transformer2DModel",
+            "DownBlockMotion",
+        }
+
+        assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET
+        assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
+
+    def test_feed_forward_chunking(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict["block_out_channels"] = (32, 64)
+        init_dict["norm_num_groups"] = 32
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)[0]
+
+        model.enable_forward_chunking()
+        with torch.no_grad():
+            output_2 = model(**inputs_dict)[0]
+
+        self.assertEqual(output.shape, output_2.shape, "Shape doesn't match")
+        assert np.abs(output.cpu() - output_2.cpu()).max() < 1e-2
+
+    def test_pickle(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        with torch.no_grad():
+            sample = model(**inputs_dict).sample
+
+        sample_copy = copy.copy(sample)
+
+        assert (sample - sample_copy).abs().max() < 1e-4
+
+    def test_from_save_pretrained(self, expected_max_diff=5e-5):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, safe_serialization=False)
+            torch.manual_seed(0)
+            new_model = self.model_class.from_pretrained(tmpdirname)
+            new_model.to(torch_device)
+
+        with torch.no_grad():
+            image = model(**inputs_dict)
+            if isinstance(image, dict):
+                image = image.to_tuple()[0]
+
+            new_image = new_model(**inputs_dict)
+
+            if isinstance(new_image, dict):
+                new_image = new_image.to_tuple()[0]
+
+        max_diff = (image - new_image).abs().max().item()
+        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
+
+    def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, variant="fp16", safe_serialization=False)
+
+            torch.manual_seed(0)
+            new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16")
+            # non-variant cannot be loaded
+            with self.assertRaises(OSError) as error_context:
+                self.model_class.from_pretrained(tmpdirname)
+
+            # make sure that error message states what keys are missing
+            assert "Error no file named diffusion_pytorch_model.bin found in directory" in str(error_context.exception)
+
+            new_model.to(torch_device)
+
+        with torch.no_grad():
+            image = model(**inputs_dict)
+            if isinstance(image, dict):
+                image = image.to_tuple()[0]
+
+            new_image = new_model(**inputs_dict)
+
+            if isinstance(new_image, dict):
+                new_image = new_image.to_tuple()[0]
+
+        max_diff = (image - new_image).abs().max().item()
+        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
+
+    def test_forward_with_norm_groups(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["norm_num_groups"] = 16
+        init_dict["block_out_channels"] = (16, 32)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.to_tuple()[0]
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_asymmetric_motion_model(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["layers_per_block"] = (2, 3)
+        init_dict["transformer_layers_per_block"] = ((1, 2), (3, 4, 5))
+        init_dict["reverse_transformer_layers_per_block"] = ((7, 6, 7, 4), (4, 2, 2))
+
+        init_dict["temporal_transformer_layers_per_block"] = ((2, 5), (2, 3, 5))
+        init_dict["reverse_temporal_transformer_layers_per_block"] = ((5, 4, 3, 4), (3, 2, 2))
+
+        init_dict["num_attention_heads"] = (2, 4)
+        init_dict["motion_num_attention_heads"] = (4, 4)
+        init_dict["reverse_motion_num_attention_heads"] = (2, 2)
+
+        init_dict["use_motion_mid_block"] = True
+        init_dict["mid_block_layers"] = 2
+        init_dict["transformer_layers_per_mid_block"] = (1, 5)
+        init_dict["temporal_transformer_layers_per_mid_block"] = (2, 4)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.to_tuple()[0]
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
diff --git a/diffusers/tests/models/unets/test_models_unet_spatiotemporal.py b/diffusers/tests/models/unets/test_models_unet_spatiotemporal.py
new file mode 100644
index 0000000000000000000000000000000000000000..afdd3d127702556d33484baa7292c7c5075defd6
--- /dev/null
+++ b/diffusers/tests/models/unets/test_models_unet_spatiotemporal.py
@@ -0,0 +1,291 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import unittest
+
+import torch
+
+from diffusers import UNetSpatioTemporalConditionModel
+from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    skip_mps,
+    torch_all_close,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+
+logger = logging.get_logger(__name__)
+
+enable_full_determinism()
+
+
+@skip_mps
+class UNetSpatioTemporalConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNetSpatioTemporalConditionModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
+        num_frames = 2
+        num_channels = 4
+        sizes = (32, 32)
+
+        noise = floats_tensor((batch_size, num_frames, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor([10]).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, 1, 32)).to(torch_device)
+
+        return {
+            "sample": noise,
+            "timestep": time_step,
+            "encoder_hidden_states": encoder_hidden_states,
+            "added_time_ids": self._get_add_time_ids(),
+        }
+
+    @property
+    def input_shape(self):
+        return (2, 2, 4, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 32, 32)
+
+    @property
+    def fps(self):
+        return 6
+
+    @property
+    def motion_bucket_id(self):
+        return 127
+
+    @property
+    def noise_aug_strength(self):
+        return 0.02
+
+    @property
+    def addition_time_embed_dim(self):
+        return 32
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (32, 64),
+            "down_block_types": (
+                "CrossAttnDownBlockSpatioTemporal",
+                "DownBlockSpatioTemporal",
+            ),
+            "up_block_types": (
+                "UpBlockSpatioTemporal",
+                "CrossAttnUpBlockSpatioTemporal",
+            ),
+            "cross_attention_dim": 32,
+            "num_attention_heads": 8,
+            "out_channels": 4,
+            "in_channels": 4,
+            "layers_per_block": 2,
+            "sample_size": 32,
+            "projection_class_embeddings_input_dim": self.addition_time_embed_dim * 3,
+            "addition_time_embed_dim": self.addition_time_embed_dim,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def _get_add_time_ids(self, do_classifier_free_guidance=True):
+        add_time_ids = [self.fps, self.motion_bucket_id, self.noise_aug_strength]
+
+        passed_add_embed_dim = self.addition_time_embed_dim * len(add_time_ids)
+        expected_add_embed_dim = self.addition_time_embed_dim * 3
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], device=torch_device)
+        add_time_ids = add_time_ids.repeat(1, 1)
+        if do_classifier_free_guidance:
+            add_time_ids = torch.cat([add_time_ids, add_time_ids])
+
+        return add_time_ids
+
+    @unittest.skip("Number of Norm Groups is not configurable")
+    def test_forward_with_norm_groups(self):
+        pass
+
+    @unittest.skip("Deprecated functionality")
+    def test_model_attention_slicing(self):
+        pass
+
+    @unittest.skip("Not supported")
+    def test_model_with_use_linear_projection(self):
+        pass
+
+    @unittest.skip("Not supported")
+    def test_model_with_simple_projection(self):
+        pass
+
+    @unittest.skip("Not supported")
+    def test_model_with_class_embeddings_concat(self):
+        pass
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_enable_works(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+
+        model.enable_xformers_memory_efficient_attention()
+
+        assert (
+            model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__
+            == "XFormersAttnProcessor"
+        ), "xformers is not enabled"
+
+    @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS")
+    def test_gradient_checkpointing(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        assert not model.is_gradient_checkpointing and model.training
+
+        out = model(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model.zero_grad()
+
+        labels = torch.randn_like(out)
+        loss = (out - labels).mean()
+        loss.backward()
+
+        # re-instantiate the model now enabling gradient checkpointing
+        model_2 = self.model_class(**init_dict)
+        # clone model
+        model_2.load_state_dict(model.state_dict())
+        model_2.to(torch_device)
+        model_2.enable_gradient_checkpointing()
+
+        assert model_2.is_gradient_checkpointing and model_2.training
+
+        out_2 = model_2(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model_2.zero_grad()
+        loss_2 = (out_2 - labels).mean()
+        loss_2.backward()
+
+        # compare the output and parameters gradients
+        self.assertTrue((loss - loss_2).abs() < 1e-5)
+        named_params = dict(model.named_parameters())
+        named_params_2 = dict(model_2.named_parameters())
+        for name, param in named_params.items():
+            self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
+
+    def test_model_with_num_attention_heads_tuple(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["num_attention_heads"] = (8, 16)
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_model_with_cross_attention_dim_tuple(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["cross_attention_dim"] = (32, 32)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_gradient_checkpointing_is_applied(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["num_attention_heads"] = (8, 16)
+
+        model_class_copy = copy.copy(self.model_class)
+
+        modules_with_gc_enabled = {}
+
+        # now monkey patch the following function:
+        #     def _set_gradient_checkpointing(self, module, value=False):
+        #         if hasattr(module, "gradient_checkpointing"):
+        #             module.gradient_checkpointing = value
+
+        def _set_gradient_checkpointing_new(self, module, value=False):
+            if hasattr(module, "gradient_checkpointing"):
+                module.gradient_checkpointing = value
+                modules_with_gc_enabled[module.__class__.__name__] = True
+
+        model_class_copy._set_gradient_checkpointing = _set_gradient_checkpointing_new
+
+        model = model_class_copy(**init_dict)
+        model.enable_gradient_checkpointing()
+
+        EXPECTED_SET = {
+            "TransformerSpatioTemporalModel",
+            "CrossAttnDownBlockSpatioTemporal",
+            "DownBlockSpatioTemporal",
+            "UpBlockSpatioTemporal",
+            "CrossAttnUpBlockSpatioTemporal",
+            "UNetMidBlockSpatioTemporal",
+        }
+
+        assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET
+        assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
+
+    def test_pickle(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["num_attention_heads"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        with torch.no_grad():
+            sample = model(**inputs_dict).sample
+
+        sample_copy = copy.copy(sample)
+
+        assert (sample - sample_copy).abs().max() < 1e-4
diff --git a/diffusers/tests/models/unets/test_unet_2d_blocks.py b/diffusers/tests/models/unets/test_unet_2d_blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..e37199170214d6a3f4036157763c8d27f41efa72
--- /dev/null
+++ b/diffusers/tests/models/unets/test_unet_2d_blocks.py
@@ -0,0 +1,337 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+from diffusers.models.unets.unet_2d_blocks import *  # noqa F403
+from diffusers.utils.testing_utils import torch_device
+
+from .test_unet_blocks_common import UNetBlockTesterMixin
+
+
+class DownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = DownBlock2D  # noqa F405
+    block_type = "down"
+
+    def test_output(self):
+        expected_slice = [-0.0232, -0.9869, 0.8054, -0.0637, -0.1688, -1.4264, 0.4470, -1.3394, 0.0904]
+        super().test_output(expected_slice)
+
+
+class ResnetDownsampleBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = ResnetDownsampleBlock2D  # noqa F405
+    block_type = "down"
+
+    def test_output(self):
+        expected_slice = [0.0710, 0.2410, -0.7320, -1.0757, -1.1343, 0.3540, -0.0133, -0.2576, 0.0948]
+        super().test_output(expected_slice)
+
+
+class AttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = AttnDownBlock2D  # noqa F405
+    block_type = "down"
+
+    def test_output(self):
+        expected_slice = [0.0636, 0.8964, -0.6234, -1.0131, 0.0844, 0.4935, 0.3437, 0.0911, -0.2957]
+        super().test_output(expected_slice)
+
+
+class CrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = CrossAttnDownBlock2D  # noqa F405
+    block_type = "down"
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
+        init_dict["cross_attention_dim"] = 32
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        expected_slice = [0.2238, -0.7396, -0.2255, -0.3829, 0.1925, 1.1665, 0.0603, -0.7295, 0.1983]
+        super().test_output(expected_slice)
+
+
+class SimpleCrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = SimpleCrossAttnDownBlock2D  # noqa F405
+    block_type = "down"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_encoder_hidden_states=True)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
+        init_dict["cross_attention_dim"] = 32
+        return init_dict, inputs_dict
+
+    @unittest.skipIf(torch_device == "mps", "MPS result is not consistent")
+    def test_output(self):
+        expected_slice = [0.7921, -0.0992, -0.1962, -0.7695, -0.4242, 0.7804, 0.4737, 0.2765, 0.3338]
+        super().test_output(expected_slice)
+
+
+class SkipDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = SkipDownBlock2D  # noqa F405
+    block_type = "down"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_skip_sample=True)
+
+    def test_output(self):
+        expected_slice = [-0.0845, -0.2087, -0.2465, 0.0971, 0.1900, -0.0484, 0.2664, 0.4179, 0.5069]
+        super().test_output(expected_slice)
+
+
+class AttnSkipDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = AttnSkipDownBlock2D  # noqa F405
+    block_type = "down"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_skip_sample=True)
+
+    def test_output(self):
+        expected_slice = [0.5539, 0.1609, 0.4924, 0.0537, -0.1995, 0.4050, 0.0979, -0.2721, -0.0642]
+        super().test_output(expected_slice)
+
+
+class DownEncoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = DownEncoderBlock2D  # noqa F405
+    block_type = "down"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_temb=False)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 32,
+            "out_channels": 32,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        expected_slice = [1.1102, 0.5302, 0.4872, -0.0023, -0.8042, 0.0483, -0.3489, -0.5632, 0.7626]
+        super().test_output(expected_slice)
+
+
+class AttnDownEncoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = AttnDownEncoderBlock2D  # noqa F405
+    block_type = "down"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_temb=False)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 32,
+            "out_channels": 32,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        expected_slice = [0.8966, -0.1486, 0.8568, 0.8141, -0.9046, -0.1342, -0.0972, -0.7417, 0.1538]
+        super().test_output(expected_slice)
+
+
+class UNetMidBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = UNetMidBlock2D  # noqa F405
+    block_type = "mid"
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 32,
+            "temb_channels": 128,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        expected_slice = [-0.1062, 1.7248, 0.3494, 1.4569, -0.0910, -1.2421, -0.9984, 0.6736, 1.0028]
+        super().test_output(expected_slice)
+
+
+class UNetMidBlock2DCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = UNetMidBlock2DCrossAttn  # noqa F405
+    block_type = "mid"
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
+        init_dict["cross_attention_dim"] = 32
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        expected_slice = [0.0187, 2.4220, 0.4484, 1.1203, -0.6121, -1.5122, -0.8270, 0.7851, 1.8335]
+        super().test_output(expected_slice)
+
+
+class UNetMidBlock2DSimpleCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = UNetMidBlock2DSimpleCrossAttn  # noqa F405
+    block_type = "mid"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_encoder_hidden_states=True)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
+        init_dict["cross_attention_dim"] = 32
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        expected_slice = [0.7143, 1.9974, 0.5448, 1.3977, 0.1282, -1.1237, -1.4238, 0.5530, 0.8880]
+        super().test_output(expected_slice)
+
+
+class UpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = UpBlock2D  # noqa F405
+    block_type = "up"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_res_hidden_states_tuple=True)
+
+    def test_output(self):
+        expected_slice = [-0.2041, -0.4165, -0.3022, 0.0041, -0.6628, -0.7053, 0.1928, -0.0325, 0.0523]
+        super().test_output(expected_slice)
+
+
+class ResnetUpsampleBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = ResnetUpsampleBlock2D  # noqa F405
+    block_type = "up"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_res_hidden_states_tuple=True)
+
+    def test_output(self):
+        expected_slice = [0.2287, 0.3549, -0.1346, 0.4797, -0.1715, -0.9649, 0.7305, -0.5864, -0.6244]
+        super().test_output(expected_slice)
+
+
+class CrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = CrossAttnUpBlock2D  # noqa F405
+    block_type = "up"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_res_hidden_states_tuple=True)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
+        init_dict["cross_attention_dim"] = 32
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        expected_slice = [-0.1403, -0.3515, -0.0420, -0.1425, 0.3167, 0.5094, -0.2181, 0.5931, 0.5582]
+        super().test_output(expected_slice)
+
+
+class SimpleCrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = SimpleCrossAttnUpBlock2D  # noqa F405
+    block_type = "up"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_res_hidden_states_tuple=True, include_encoder_hidden_states=True)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
+        init_dict["cross_attention_dim"] = 32
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        expected_slice = [0.2645, 0.1480, 0.0909, 0.8044, -0.9758, -0.9083, 0.0994, -1.1453, -0.7402]
+        super().test_output(expected_slice)
+
+
+class AttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = AttnUpBlock2D  # noqa F405
+    block_type = "up"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_res_hidden_states_tuple=True)
+
+    @unittest.skipIf(torch_device == "mps", "MPS result is not consistent")
+    def test_output(self):
+        expected_slice = [0.0979, 0.1326, 0.0021, 0.0659, 0.2249, 0.0059, 0.1132, 0.5952, 0.1033]
+        super().test_output(expected_slice)
+
+
+class SkipUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = SkipUpBlock2D  # noqa F405
+    block_type = "up"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_res_hidden_states_tuple=True)
+
+    def test_output(self):
+        expected_slice = [-0.0893, -0.1234, -0.1506, -0.0332, 0.0123, -0.0211, 0.0566, 0.0143, 0.0362]
+        super().test_output(expected_slice)
+
+
+class AttnSkipUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = AttnSkipUpBlock2D  # noqa F405
+    block_type = "up"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_res_hidden_states_tuple=True)
+
+    def test_output(self):
+        expected_slice = [0.0361, 0.0617, 0.2787, -0.0350, 0.0342, 0.3421, -0.0843, 0.0913, 0.3015]
+        super().test_output(expected_slice)
+
+
+class UpDecoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = UpDecoderBlock2D  # noqa F405
+    block_type = "up"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_temb=False)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {"in_channels": 32, "out_channels": 32}
+
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        expected_slice = [0.4404, 0.1998, -0.9886, -0.3320, -0.3128, -0.7034, -0.6955, -0.2338, -0.3137]
+        super().test_output(expected_slice)
+
+
+class AttnUpDecoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
+    block_class = AttnUpDecoderBlock2D  # noqa F405
+    block_type = "up"
+
+    @property
+    def dummy_input(self):
+        return super().get_dummy_input(include_temb=False)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {"in_channels": 32, "out_channels": 32}
+
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_output(self):
+        expected_slice = [0.6738, 0.4491, 0.1055, 1.0710, 0.7316, 0.3339, 0.3352, 0.1023, 0.3568]
+        super().test_output(expected_slice)
diff --git a/diffusers/tests/models/unets/test_unet_blocks_common.py b/diffusers/tests/models/unets/test_unet_blocks_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..dce28c77cbb7c3e907b564a3e5062bd9f3ee6238
--- /dev/null
+++ b/diffusers/tests/models/unets/test_unet_blocks_common.py
@@ -0,0 +1,126 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Tuple
+
+import torch
+
+from diffusers.utils.testing_utils import (
+    floats_tensor,
+    require_torch,
+    require_torch_accelerator_with_training,
+    torch_all_close,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+@require_torch
+class UNetBlockTesterMixin:
+    @property
+    def dummy_input(self):
+        return self.get_dummy_input()
+
+    @property
+    def output_shape(self):
+        if self.block_type == "down":
+            return (4, 32, 16, 16)
+        elif self.block_type == "mid":
+            return (4, 32, 32, 32)
+        elif self.block_type == "up":
+            return (4, 32, 64, 64)
+
+        raise ValueError(f"'{self.block_type}' is not a supported block_type. Set it to 'up', 'mid', or 'down'.")
+
+    def get_dummy_input(
+        self,
+        include_temb=True,
+        include_res_hidden_states_tuple=False,
+        include_encoder_hidden_states=False,
+        include_skip_sample=False,
+    ):
+        batch_size = 4
+        num_channels = 32
+        sizes = (32, 32)
+
+        generator = torch.manual_seed(0)
+        device = torch.device(torch_device)
+        shape = (batch_size, num_channels) + sizes
+        hidden_states = randn_tensor(shape, generator=generator, device=device)
+        dummy_input = {"hidden_states": hidden_states}
+
+        if include_temb:
+            temb_channels = 128
+            dummy_input["temb"] = randn_tensor((batch_size, temb_channels), generator=generator, device=device)
+
+        if include_res_hidden_states_tuple:
+            generator_1 = torch.manual_seed(1)
+            dummy_input["res_hidden_states_tuple"] = (randn_tensor(shape, generator=generator_1, device=device),)
+
+        if include_encoder_hidden_states:
+            dummy_input["encoder_hidden_states"] = floats_tensor((batch_size, 32, 32)).to(torch_device)
+
+        if include_skip_sample:
+            dummy_input["skip_sample"] = randn_tensor(((batch_size, 3) + sizes), generator=generator, device=device)
+
+        return dummy_input
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 32,
+            "out_channels": 32,
+            "temb_channels": 128,
+        }
+        if self.block_type == "up":
+            init_dict["prev_output_channel"] = 32
+
+        if self.block_type == "mid":
+            init_dict.pop("out_channels")
+
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_output(self, expected_slice):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        unet_block = self.block_class(**init_dict)
+        unet_block.to(torch_device)
+        unet_block.eval()
+
+        with torch.no_grad():
+            output = unet_block(**inputs_dict)
+
+        if isinstance(output, Tuple):
+            output = output[0]
+
+        self.assertEqual(output.shape, self.output_shape)
+
+        output_slice = output[0, -1, -3:, -3:]
+        expected_slice = torch.tensor(expected_slice).to(torch_device)
+        assert torch_all_close(output_slice.flatten(), expected_slice, atol=5e-3)
+
+    @require_torch_accelerator_with_training
+    def test_training(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.block_class(**init_dict)
+        model.to(torch_device)
+        model.train()
+        output = model(**inputs_dict)
+
+        if isinstance(output, Tuple):
+            output = output[0]
+
+        device = torch.device(torch_device)
+        noise = randn_tensor(output.shape, device=device)
+        loss = torch.nn.functional.mse_loss(output, noise)
+        loss.backward()
diff --git a/diffusers/tests/others/test_check_copies.py b/diffusers/tests/others/test_check_copies.py
new file mode 100644
index 0000000000000000000000000000000000000000..5835712343c795c4237bbb20346534be3639e588
--- /dev/null
+++ b/diffusers/tests/others/test_check_copies.py
@@ -0,0 +1,117 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import shutil
+import sys
+import tempfile
+import unittest
+
+
+git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+sys.path.append(os.path.join(git_repo_path, "utils"))
+
+import check_copies  # noqa: E402
+
+
+# This is the reference code that will be used in the tests.
+# If DDPMSchedulerOutput is changed in scheduling_ddpm.py, this code needs to be manually updated.
+REFERENCE_CODE = """    \"""
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    \"""
+
+    prev_sample: torch.Tensor
+    pred_original_sample: Optional[torch.Tensor] = None
+"""
+
+
+class CopyCheckTester(unittest.TestCase):
+    def setUp(self):
+        self.diffusers_dir = tempfile.mkdtemp()
+        os.makedirs(os.path.join(self.diffusers_dir, "schedulers/"))
+        check_copies.DIFFUSERS_PATH = self.diffusers_dir
+        shutil.copy(
+            os.path.join(git_repo_path, "src/diffusers/schedulers/scheduling_ddpm.py"),
+            os.path.join(self.diffusers_dir, "schedulers/scheduling_ddpm.py"),
+        )
+
+    def tearDown(self):
+        check_copies.DIFFUSERS_PATH = "src/diffusers"
+        shutil.rmtree(self.diffusers_dir)
+
+    def check_copy_consistency(self, comment, class_name, class_code, overwrite_result=None):
+        code = comment + f"\nclass {class_name}(nn.Module):\n" + class_code
+        if overwrite_result is not None:
+            expected = comment + f"\nclass {class_name}(nn.Module):\n" + overwrite_result
+        code = check_copies.run_ruff(code)
+        fname = os.path.join(self.diffusers_dir, "new_code.py")
+        with open(fname, "w", newline="\n") as f:
+            f.write(code)
+        if overwrite_result is None:
+            self.assertTrue(len(check_copies.is_copy_consistent(fname)) == 0)
+        else:
+            check_copies.is_copy_consistent(f.name, overwrite=True)
+            with open(fname, "r") as f:
+                self.assertTrue(f.read(), expected)
+
+    def test_find_code_in_diffusers(self):
+        code = check_copies.find_code_in_diffusers("schedulers.scheduling_ddpm.DDPMSchedulerOutput")
+        self.assertEqual(code, REFERENCE_CODE)
+
+    def test_is_copy_consistent(self):
+        # Base copy consistency
+        self.check_copy_consistency(
+            "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput",
+            "DDPMSchedulerOutput",
+            REFERENCE_CODE + "\n",
+        )
+
+        # With no empty line at the end
+        self.check_copy_consistency(
+            "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput",
+            "DDPMSchedulerOutput",
+            REFERENCE_CODE,
+        )
+
+        # Copy consistency with rename
+        self.check_copy_consistency(
+            "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->Test",
+            "TestSchedulerOutput",
+            re.sub("DDPM", "Test", REFERENCE_CODE),
+        )
+
+        # Copy consistency with a really long name
+        long_class_name = "TestClassWithAReallyLongNameBecauseSomePeopleLikeThatForSomeReason"
+        self.check_copy_consistency(
+            f"# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->{long_class_name}",
+            f"{long_class_name}SchedulerOutput",
+            re.sub("Bert", long_class_name, REFERENCE_CODE),
+        )
+
+        # Copy consistency with overwrite
+        self.check_copy_consistency(
+            "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->Test",
+            "TestSchedulerOutput",
+            REFERENCE_CODE,
+            overwrite_result=re.sub("DDPM", "Test", REFERENCE_CODE),
+        )
diff --git a/diffusers/tests/others/test_check_dummies.py b/diffusers/tests/others/test_check_dummies.py
new file mode 100644
index 0000000000000000000000000000000000000000..1890ffaecd8d78752b46f9a91f4306e46bb19d57
--- /dev/null
+++ b/diffusers/tests/others/test_check_dummies.py
@@ -0,0 +1,122 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import unittest
+
+
+git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+sys.path.append(os.path.join(git_repo_path, "utils"))
+
+import check_dummies  # noqa: E402
+from check_dummies import create_dummy_files, create_dummy_object, find_backend, read_init  # noqa: E402
+
+
+# Align TRANSFORMERS_PATH in check_dummies with the current path
+check_dummies.PATH_TO_DIFFUSERS = os.path.join(git_repo_path, "src", "diffusers")
+
+
+class CheckDummiesTester(unittest.TestCase):
+    def test_find_backend(self):
+        simple_backend = find_backend("    if not is_torch_available():")
+        self.assertEqual(simple_backend, "torch")
+
+        # backend_with_underscore = find_backend("    if not is_tensorflow_text_available():")
+        # self.assertEqual(backend_with_underscore, "tensorflow_text")
+
+        double_backend = find_backend("    if not (is_torch_available() and is_transformers_available()):")
+        self.assertEqual(double_backend, "torch_and_transformers")
+
+        # double_backend_with_underscore = find_backend(
+        #    "    if not (is_sentencepiece_available() and is_tensorflow_text_available()):"
+        # )
+        # self.assertEqual(double_backend_with_underscore, "sentencepiece_and_tensorflow_text")
+
+        triple_backend = find_backend(
+            "    if not (is_torch_available() and is_transformers_available() and is_onnx_available()):"
+        )
+        self.assertEqual(triple_backend, "torch_and_transformers_and_onnx")
+
+    def test_read_init(self):
+        objects = read_init()
+        # We don't assert on the exact list of keys to allow for smooth grow of backend-specific objects
+        self.assertIn("torch", objects)
+        self.assertIn("torch_and_transformers", objects)
+        self.assertIn("flax_and_transformers", objects)
+        self.assertIn("torch_and_transformers_and_onnx", objects)
+
+        # Likewise, we can't assert on the exact content of a key
+        self.assertIn("UNet2DModel", objects["torch"])
+        self.assertIn("FlaxUNet2DConditionModel", objects["flax"])
+        self.assertIn("StableDiffusionPipeline", objects["torch_and_transformers"])
+        self.assertIn("FlaxStableDiffusionPipeline", objects["flax_and_transformers"])
+        self.assertIn("LMSDiscreteScheduler", objects["torch_and_scipy"])
+        self.assertIn("OnnxStableDiffusionPipeline", objects["torch_and_transformers_and_onnx"])
+
+    def test_create_dummy_object(self):
+        dummy_constant = create_dummy_object("CONSTANT", "'torch'")
+        self.assertEqual(dummy_constant, "\nCONSTANT = None\n")
+
+        dummy_function = create_dummy_object("function", "'torch'")
+        self.assertEqual(
+            dummy_function, "\ndef function(*args, **kwargs):\n    requires_backends(function, 'torch')\n"
+        )
+
+        expected_dummy_class = """
+class FakeClass(metaclass=DummyObject):
+    _backends = 'torch'
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, 'torch')
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, 'torch')
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, 'torch')
+"""
+        dummy_class = create_dummy_object("FakeClass", "'torch'")
+        self.assertEqual(dummy_class, expected_dummy_class)
+
+    def test_create_dummy_files(self):
+        expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+CONSTANT = None
+
+
+def function(*args, **kwargs):
+    requires_backends(function, ["torch"])
+
+
+class FakeClass(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+"""
+        dummy_files = create_dummy_files({"torch": ["CONSTANT", "function", "FakeClass"]})
+        self.assertEqual(dummy_files["torch"], expected_dummy_pytorch_file)
diff --git a/diffusers/tests/others/test_config.py b/diffusers/tests/others/test_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..664c36ac33d6b8e14e64ae4185c2d6a32da6fa5a
--- /dev/null
+++ b/diffusers/tests/others/test_config.py
@@ -0,0 +1,306 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import tempfile
+import unittest
+from pathlib import Path
+
+from diffusers import (
+    DDIMScheduler,
+    DDPMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    PNDMScheduler,
+    logging,
+)
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils.testing_utils import CaptureLogger
+
+
+class SampleObject(ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        e=[1, 3],
+    ):
+        pass
+
+
+class SampleObject2(ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        f=[1, 3],
+    ):
+        pass
+
+
+class SampleObject3(ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        e=[1, 3],
+        f=[1, 3],
+    ):
+        pass
+
+
+class SampleObject4(ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        e=[1, 5],
+        f=[5, 4],
+    ):
+        pass
+
+
+class SampleObjectPaths(ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(self, test_file_1=Path("foo/bar"), test_file_2=Path("foo bar\\bar")):
+        pass
+
+
+class ConfigTester(unittest.TestCase):
+    def test_load_not_from_mixin(self):
+        with self.assertRaises(ValueError):
+            ConfigMixin.load_config("dummy_path")
+
+    def test_register_to_config(self):
+        obj = SampleObject()
+        config = obj.config
+        assert config["a"] == 2
+        assert config["b"] == 5
+        assert config["c"] == (2, 5)
+        assert config["d"] == "for diffusion"
+        assert config["e"] == [1, 3]
+
+        # init ignore private arguments
+        obj = SampleObject(_name_or_path="lalala")
+        config = obj.config
+        assert config["a"] == 2
+        assert config["b"] == 5
+        assert config["c"] == (2, 5)
+        assert config["d"] == "for diffusion"
+        assert config["e"] == [1, 3]
+
+        # can override default
+        obj = SampleObject(c=6)
+        config = obj.config
+        assert config["a"] == 2
+        assert config["b"] == 5
+        assert config["c"] == 6
+        assert config["d"] == "for diffusion"
+        assert config["e"] == [1, 3]
+
+        # can use positional arguments.
+        obj = SampleObject(1, c=6)
+        config = obj.config
+        assert config["a"] == 1
+        assert config["b"] == 5
+        assert config["c"] == 6
+        assert config["d"] == "for diffusion"
+        assert config["e"] == [1, 3]
+
+    def test_save_load(self):
+        obj = SampleObject()
+        config = obj.config
+
+        assert config["a"] == 2
+        assert config["b"] == 5
+        assert config["c"] == (2, 5)
+        assert config["d"] == "for diffusion"
+        assert config["e"] == [1, 3]
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            obj.save_config(tmpdirname)
+            new_obj = SampleObject.from_config(SampleObject.load_config(tmpdirname))
+            new_config = new_obj.config
+
+        # unfreeze configs
+        config = dict(config)
+        new_config = dict(new_config)
+
+        assert config.pop("c") == (2, 5)  # instantiated as tuple
+        assert new_config.pop("c") == [2, 5]  # saved & loaded as list because of json
+        config.pop("_use_default_values")
+        assert config == new_config
+
+    def test_load_ddim_from_pndm(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+        # 30 for warning
+        logger.setLevel(30)
+
+        with CaptureLogger(logger) as cap_logger:
+            ddim = DDIMScheduler.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+            )
+
+        assert ddim.__class__ == DDIMScheduler
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+    def test_load_euler_from_pndm(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+        # 30 for warning
+        logger.setLevel(30)
+
+        with CaptureLogger(logger) as cap_logger:
+            euler = EulerDiscreteScheduler.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+            )
+
+        assert euler.__class__ == EulerDiscreteScheduler
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+    def test_load_euler_ancestral_from_pndm(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+        # 30 for warning
+        logger.setLevel(30)
+
+        with CaptureLogger(logger) as cap_logger:
+            euler = EulerAncestralDiscreteScheduler.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+            )
+
+        assert euler.__class__ == EulerAncestralDiscreteScheduler
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+    def test_load_pndm(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+        # 30 for warning
+        logger.setLevel(30)
+
+        with CaptureLogger(logger) as cap_logger:
+            pndm = PNDMScheduler.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+            )
+
+        assert pndm.__class__ == PNDMScheduler
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+    def test_overwrite_config_on_load(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+        # 30 for warning
+        logger.setLevel(30)
+
+        with CaptureLogger(logger) as cap_logger:
+            ddpm = DDPMScheduler.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch",
+                subfolder="scheduler",
+                prediction_type="sample",
+                beta_end=8,
+            )
+
+        with CaptureLogger(logger) as cap_logger_2:
+            ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88)
+
+        assert ddpm.__class__ == DDPMScheduler
+        assert ddpm.config.prediction_type == "sample"
+        assert ddpm.config.beta_end == 8
+        assert ddpm_2.config.beta_start == 88
+
+        # no warning should be thrown
+        assert cap_logger.out == ""
+        assert cap_logger_2.out == ""
+
+    def test_load_dpmsolver(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+        # 30 for warning
+        logger.setLevel(30)
+
+        with CaptureLogger(logger) as cap_logger:
+            dpm = DPMSolverMultistepScheduler.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+            )
+
+        assert dpm.__class__ == DPMSolverMultistepScheduler
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+    def test_use_default_values(self):
+        # let's first save a config that should be in the form
+        #    a=2,
+        #    b=5,
+        #    c=(2, 5),
+        #    d="for diffusion",
+        #    e=[1, 3],
+
+        config = SampleObject()
+
+        config_dict = {k: v for k, v in config.config.items() if not k.startswith("_")}
+
+        # make sure that default config has all keys in `_use_default_values`
+        assert set(config_dict.keys()) == set(config.config._use_default_values)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            config.save_config(tmpdirname)
+
+            # now loading it with SampleObject2 should put f into `_use_default_values`
+            config = SampleObject2.from_config(SampleObject2.load_config(tmpdirname))
+
+            assert "f" in config.config._use_default_values
+            assert config.config.f == [1, 3]
+
+        # now loading the config, should **NOT** use [1, 3] for `f`, but the default [1, 4] value
+        # **BECAUSE** it is part of `config.config._use_default_values`
+        new_config = SampleObject4.from_config(config.config)
+        assert new_config.config.f == [5, 4]
+
+        config.config._use_default_values.pop()
+        new_config_2 = SampleObject4.from_config(config.config)
+        assert new_config_2.config.f == [1, 3]
+
+        # Nevertheless "e" should still be correctly loaded to [1, 3] from SampleObject2 instead of defaulting to [1, 5]
+        assert new_config_2.config.e == [1, 3]
+
+    def test_check_path_types(self):
+        # Verify that we get a string returned from a WindowsPath or PosixPath (depending on system)
+        config = SampleObjectPaths()
+        json_string = config.to_json_string()
+        result = json.loads(json_string)
+        assert result["test_file_1"] == config.config.test_file_1.as_posix()
+        assert result["test_file_2"] == config.config.test_file_2.as_posix()
diff --git a/diffusers/tests/others/test_dependencies.py b/diffusers/tests/others/test_dependencies.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0839ef0236bfd296053212722b1a32d9a378777
--- /dev/null
+++ b/diffusers/tests/others/test_dependencies.py
@@ -0,0 +1,50 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import unittest
+from importlib import import_module
+
+
+class DependencyTester(unittest.TestCase):
+    def test_diffusers_import(self):
+        try:
+            import diffusers  # noqa: F401
+        except ImportError:
+            assert False
+
+    def test_backend_registration(self):
+        import diffusers
+        from diffusers.dependency_versions_table import deps
+
+        all_classes = inspect.getmembers(diffusers, inspect.isclass)
+
+        for cls_name, cls_module in all_classes:
+            if "dummy_" in cls_module.__module__:
+                for backend in cls_module._backends:
+                    if backend == "k_diffusion":
+                        backend = "k-diffusion"
+                    elif backend == "invisible_watermark":
+                        backend = "invisible-watermark"
+                    assert backend in deps, f"{backend} is not in the deps table!"
+
+    def test_pipeline_imports(self):
+        import diffusers
+        import diffusers.pipelines
+
+        all_classes = inspect.getmembers(diffusers, inspect.isclass)
+        for cls_name, cls_module in all_classes:
+            if hasattr(diffusers.pipelines, cls_name):
+                pipeline_folder_module = ".".join(str(cls_module.__module__).split(".")[:3])
+                _ = import_module(pipeline_folder_module, str(cls_name))
diff --git a/diffusers/tests/others/test_ema.py b/diffusers/tests/others/test_ema.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bed42b8488f981cddb06c420427adf83d81101d
--- /dev/null
+++ b/diffusers/tests/others/test_ema.py
@@ -0,0 +1,294 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+import unittest
+
+import torch
+
+from diffusers import UNet2DConditionModel
+from diffusers.training_utils import EMAModel
+from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device
+
+
+enable_full_determinism()
+
+
+class EMAModelTests(unittest.TestCase):
+    model_id = "hf-internal-testing/tiny-stable-diffusion-pipe"
+    batch_size = 1
+    prompt_length = 77
+    text_encoder_hidden_dim = 32
+    num_in_channels = 4
+    latent_height = latent_width = 64
+    generator = torch.manual_seed(0)
+
+    def get_models(self, decay=0.9999):
+        unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet")
+        unet = unet.to(torch_device)
+        ema_unet = EMAModel(unet.parameters(), decay=decay, model_cls=UNet2DConditionModel, model_config=unet.config)
+        return unet, ema_unet
+
+    def get_dummy_inputs(self):
+        noisy_latents = torch.randn(
+            self.batch_size, self.num_in_channels, self.latent_height, self.latent_width, generator=self.generator
+        ).to(torch_device)
+        timesteps = torch.randint(0, 1000, size=(self.batch_size,), generator=self.generator).to(torch_device)
+        encoder_hidden_states = torch.randn(
+            self.batch_size, self.prompt_length, self.text_encoder_hidden_dim, generator=self.generator
+        ).to(torch_device)
+        return noisy_latents, timesteps, encoder_hidden_states
+
+    def simulate_backprop(self, unet):
+        updated_state_dict = {}
+        for k, param in unet.state_dict().items():
+            updated_param = torch.randn_like(param) + (param * torch.randn_like(param))
+            updated_state_dict.update({k: updated_param})
+        unet.load_state_dict(updated_state_dict)
+        return unet
+
+    def test_optimization_steps_updated(self):
+        unet, ema_unet = self.get_models()
+        # Take the first (hypothetical) EMA step.
+        ema_unet.step(unet.parameters())
+        assert ema_unet.optimization_step == 1
+
+        # Take two more.
+        for _ in range(2):
+            ema_unet.step(unet.parameters())
+        assert ema_unet.optimization_step == 3
+
+    def test_shadow_params_not_updated(self):
+        unet, ema_unet = self.get_models()
+        # Since the `unet` is not being updated (i.e., backprop'd)
+        # there won't be any difference between the `params` of `unet`
+        # and `ema_unet` even if we call `ema_unet.step(unet.parameters())`.
+        ema_unet.step(unet.parameters())
+        orig_params = list(unet.parameters())
+        for s_param, param in zip(ema_unet.shadow_params, orig_params):
+            assert torch.allclose(s_param, param)
+
+        # The above holds true even if we call `ema.step()` multiple times since
+        # `unet` params are still not being updated.
+        for _ in range(4):
+            ema_unet.step(unet.parameters())
+        for s_param, param in zip(ema_unet.shadow_params, orig_params):
+            assert torch.allclose(s_param, param)
+
+    def test_shadow_params_updated(self):
+        unet, ema_unet = self.get_models()
+        # Here we simulate the parameter updates for `unet`. Since there might
+        # be some parameters which are initialized to zero we take extra care to
+        # initialize their values to something non-zero before the multiplication.
+        unet_pseudo_updated_step_one = self.simulate_backprop(unet)
+
+        # Take the EMA step.
+        ema_unet.step(unet_pseudo_updated_step_one.parameters())
+
+        # Now the EMA'd parameters won't be equal to the original model parameters.
+        orig_params = list(unet_pseudo_updated_step_one.parameters())
+        for s_param, param in zip(ema_unet.shadow_params, orig_params):
+            assert ~torch.allclose(s_param, param)
+
+        # Ensure this is the case when we take multiple EMA steps.
+        for _ in range(4):
+            ema_unet.step(unet.parameters())
+        for s_param, param in zip(ema_unet.shadow_params, orig_params):
+            assert ~torch.allclose(s_param, param)
+
+    def test_consecutive_shadow_params_updated(self):
+        # If we call EMA step after a backpropagation consecutively for two times,
+        # the shadow params from those two steps should be different.
+        unet, ema_unet = self.get_models()
+
+        # First backprop + EMA
+        unet_step_one = self.simulate_backprop(unet)
+        ema_unet.step(unet_step_one.parameters())
+        step_one_shadow_params = ema_unet.shadow_params
+
+        # Second backprop + EMA
+        unet_step_two = self.simulate_backprop(unet_step_one)
+        ema_unet.step(unet_step_two.parameters())
+        step_two_shadow_params = ema_unet.shadow_params
+
+        for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params):
+            assert ~torch.allclose(step_one, step_two)
+
+    def test_zero_decay(self):
+        # If there's no decay even if there are backprops, EMA steps
+        # won't take any effect i.e., the shadow params would remain the
+        # same.
+        unet, ema_unet = self.get_models(decay=0.0)
+        unet_step_one = self.simulate_backprop(unet)
+        ema_unet.step(unet_step_one.parameters())
+        step_one_shadow_params = ema_unet.shadow_params
+
+        unet_step_two = self.simulate_backprop(unet_step_one)
+        ema_unet.step(unet_step_two.parameters())
+        step_two_shadow_params = ema_unet.shadow_params
+
+        for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params):
+            assert torch.allclose(step_one, step_two)
+
+    @skip_mps
+    def test_serialization(self):
+        unet, ema_unet = self.get_models()
+        noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs()
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            ema_unet.save_pretrained(tmpdir)
+            loaded_unet = UNet2DConditionModel.from_pretrained(tmpdir, model_cls=UNet2DConditionModel)
+            loaded_unet = loaded_unet.to(unet.device)
+
+        # Since no EMA step has been performed the outputs should match.
+        output = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+        output_loaded = loaded_unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+        assert torch.allclose(output, output_loaded, atol=1e-4)
+
+
+class EMAModelTestsForeach(unittest.TestCase):
+    model_id = "hf-internal-testing/tiny-stable-diffusion-pipe"
+    batch_size = 1
+    prompt_length = 77
+    text_encoder_hidden_dim = 32
+    num_in_channels = 4
+    latent_height = latent_width = 64
+    generator = torch.manual_seed(0)
+
+    def get_models(self, decay=0.9999):
+        unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet")
+        unet = unet.to(torch_device)
+        ema_unet = EMAModel(
+            unet.parameters(), decay=decay, model_cls=UNet2DConditionModel, model_config=unet.config, foreach=True
+        )
+        return unet, ema_unet
+
+    def get_dummy_inputs(self):
+        noisy_latents = torch.randn(
+            self.batch_size, self.num_in_channels, self.latent_height, self.latent_width, generator=self.generator
+        ).to(torch_device)
+        timesteps = torch.randint(0, 1000, size=(self.batch_size,), generator=self.generator).to(torch_device)
+        encoder_hidden_states = torch.randn(
+            self.batch_size, self.prompt_length, self.text_encoder_hidden_dim, generator=self.generator
+        ).to(torch_device)
+        return noisy_latents, timesteps, encoder_hidden_states
+
+    def simulate_backprop(self, unet):
+        updated_state_dict = {}
+        for k, param in unet.state_dict().items():
+            updated_param = torch.randn_like(param) + (param * torch.randn_like(param))
+            updated_state_dict.update({k: updated_param})
+        unet.load_state_dict(updated_state_dict)
+        return unet
+
+    def test_optimization_steps_updated(self):
+        unet, ema_unet = self.get_models()
+        # Take the first (hypothetical) EMA step.
+        ema_unet.step(unet.parameters())
+        assert ema_unet.optimization_step == 1
+
+        # Take two more.
+        for _ in range(2):
+            ema_unet.step(unet.parameters())
+        assert ema_unet.optimization_step == 3
+
+    def test_shadow_params_not_updated(self):
+        unet, ema_unet = self.get_models()
+        # Since the `unet` is not being updated (i.e., backprop'd)
+        # there won't be any difference between the `params` of `unet`
+        # and `ema_unet` even if we call `ema_unet.step(unet.parameters())`.
+        ema_unet.step(unet.parameters())
+        orig_params = list(unet.parameters())
+        for s_param, param in zip(ema_unet.shadow_params, orig_params):
+            assert torch.allclose(s_param, param)
+
+        # The above holds true even if we call `ema.step()` multiple times since
+        # `unet` params are still not being updated.
+        for _ in range(4):
+            ema_unet.step(unet.parameters())
+        for s_param, param in zip(ema_unet.shadow_params, orig_params):
+            assert torch.allclose(s_param, param)
+
+    def test_shadow_params_updated(self):
+        unet, ema_unet = self.get_models()
+        # Here we simulate the parameter updates for `unet`. Since there might
+        # be some parameters which are initialized to zero we take extra care to
+        # initialize their values to something non-zero before the multiplication.
+        unet_pseudo_updated_step_one = self.simulate_backprop(unet)
+
+        # Take the EMA step.
+        ema_unet.step(unet_pseudo_updated_step_one.parameters())
+
+        # Now the EMA'd parameters won't be equal to the original model parameters.
+        orig_params = list(unet_pseudo_updated_step_one.parameters())
+        for s_param, param in zip(ema_unet.shadow_params, orig_params):
+            assert ~torch.allclose(s_param, param)
+
+        # Ensure this is the case when we take multiple EMA steps.
+        for _ in range(4):
+            ema_unet.step(unet.parameters())
+        for s_param, param in zip(ema_unet.shadow_params, orig_params):
+            assert ~torch.allclose(s_param, param)
+
+    def test_consecutive_shadow_params_updated(self):
+        # If we call EMA step after a backpropagation consecutively for two times,
+        # the shadow params from those two steps should be different.
+        unet, ema_unet = self.get_models()
+
+        # First backprop + EMA
+        unet_step_one = self.simulate_backprop(unet)
+        ema_unet.step(unet_step_one.parameters())
+        step_one_shadow_params = ema_unet.shadow_params
+
+        # Second backprop + EMA
+        unet_step_two = self.simulate_backprop(unet_step_one)
+        ema_unet.step(unet_step_two.parameters())
+        step_two_shadow_params = ema_unet.shadow_params
+
+        for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params):
+            assert ~torch.allclose(step_one, step_two)
+
+    def test_zero_decay(self):
+        # If there's no decay even if there are backprops, EMA steps
+        # won't take any effect i.e., the shadow params would remain the
+        # same.
+        unet, ema_unet = self.get_models(decay=0.0)
+        unet_step_one = self.simulate_backprop(unet)
+        ema_unet.step(unet_step_one.parameters())
+        step_one_shadow_params = ema_unet.shadow_params
+
+        unet_step_two = self.simulate_backprop(unet_step_one)
+        ema_unet.step(unet_step_two.parameters())
+        step_two_shadow_params = ema_unet.shadow_params
+
+        for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params):
+            assert torch.allclose(step_one, step_two)
+
+    @skip_mps
+    def test_serialization(self):
+        unet, ema_unet = self.get_models()
+        noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs()
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            ema_unet.save_pretrained(tmpdir)
+            loaded_unet = UNet2DConditionModel.from_pretrained(tmpdir, model_cls=UNet2DConditionModel)
+            loaded_unet = loaded_unet.to(unet.device)
+
+        # Since no EMA step has been performed the outputs should match.
+        output = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+        output_loaded = loaded_unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+        assert torch.allclose(output, output_loaded, atol=1e-4)
diff --git a/diffusers/tests/others/test_hub_utils.py b/diffusers/tests/others/test_hub_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a0c29dcb0f3a315e81cc4383197a7a5b56d7bfa
--- /dev/null
+++ b/diffusers/tests/others/test_hub_utils.py
@@ -0,0 +1,29 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+
+
+class CreateModelCardTest(unittest.TestCase):
+    def test_generate_model_card_with_library_name(self):
+        with TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "README.md"
+            file_path.write_text("---\nlibrary_name: foo\n---\nContent\n")
+            model_card = load_or_create_model_card(file_path)
+            populate_model_card(model_card)
+            assert model_card.data.library_name == "foo"
diff --git a/diffusers/tests/others/test_image_processor.py b/diffusers/tests/others/test_image_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..3397ca9e394affd980e0ff71897b11adedaf4332
--- /dev/null
+++ b/diffusers/tests/others/test_image_processor.py
@@ -0,0 +1,310 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import PIL.Image
+import torch
+
+from diffusers.image_processor import VaeImageProcessor
+
+
+class ImageProcessorTest(unittest.TestCase):
+    @property
+    def dummy_sample(self):
+        batch_size = 1
+        num_channels = 3
+        height = 8
+        width = 8
+
+        sample = torch.rand((batch_size, num_channels, height, width))
+
+        return sample
+
+    @property
+    def dummy_mask(self):
+        batch_size = 1
+        num_channels = 1
+        height = 8
+        width = 8
+
+        sample = torch.rand((batch_size, num_channels, height, width))
+
+        return sample
+
+    def to_np(self, image):
+        if isinstance(image[0], PIL.Image.Image):
+            return np.stack([np.array(i) for i in image], axis=0)
+        elif isinstance(image, torch.Tensor):
+            return image.cpu().numpy().transpose(0, 2, 3, 1)
+        return image
+
+    def test_vae_image_processor_pt(self):
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
+
+        input_pt = self.dummy_sample
+        input_np = self.to_np(input_pt)
+
+        for output_type in ["pt", "np", "pil"]:
+            out = image_processor.postprocess(
+                image_processor.preprocess(input_pt),
+                output_type=output_type,
+            )
+            out_np = self.to_np(out)
+            in_np = (input_np * 255).round() if output_type == "pil" else input_np
+            assert (
+                np.abs(in_np - out_np).max() < 1e-6
+            ), f"decoded output does not match input for output_type {output_type}"
+
+    def test_vae_image_processor_np(self):
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
+        input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1)
+
+        for output_type in ["pt", "np", "pil"]:
+            out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type)
+
+            out_np = self.to_np(out)
+            in_np = (input_np * 255).round() if output_type == "pil" else input_np
+            assert (
+                np.abs(in_np - out_np).max() < 1e-6
+            ), f"decoded output does not match input for output_type {output_type}"
+
+    def test_vae_image_processor_pil(self):
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
+
+        input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1)
+        input_pil = image_processor.numpy_to_pil(input_np)
+
+        for output_type in ["pt", "np", "pil"]:
+            out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type)
+            for i, o in zip(input_pil, out):
+                in_np = np.array(i)
+                out_np = self.to_np(out) if output_type == "pil" else (self.to_np(out) * 255).round()
+                assert (
+                    np.abs(in_np - out_np).max() < 1e-6
+                ), f"decoded output does not match input for output_type {output_type}"
+
+    def test_preprocess_input_3d(self):
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+
+        input_pt_4d = self.dummy_sample
+        input_pt_3d = input_pt_4d.squeeze(0)
+
+        out_pt_4d = image_processor.postprocess(
+            image_processor.preprocess(input_pt_4d),
+            output_type="np",
+        )
+        out_pt_3d = image_processor.postprocess(
+            image_processor.preprocess(input_pt_3d),
+            output_type="np",
+        )
+
+        input_np_4d = self.to_np(self.dummy_sample)
+        input_np_3d = input_np_4d.squeeze(0)
+
+        out_np_4d = image_processor.postprocess(
+            image_processor.preprocess(input_np_4d),
+            output_type="np",
+        )
+        out_np_3d = image_processor.postprocess(
+            image_processor.preprocess(input_np_3d),
+            output_type="np",
+        )
+
+        assert np.abs(out_pt_4d - out_pt_3d).max() < 1e-6
+        assert np.abs(out_np_4d - out_np_3d).max() < 1e-6
+
+    def test_preprocess_input_list(self):
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+
+        input_pt_4d = self.dummy_sample
+        input_pt_list = list(input_pt_4d)
+
+        out_pt_4d = image_processor.postprocess(
+            image_processor.preprocess(input_pt_4d),
+            output_type="np",
+        )
+
+        out_pt_list = image_processor.postprocess(
+            image_processor.preprocess(input_pt_list),
+            output_type="np",
+        )
+
+        input_np_4d = self.to_np(self.dummy_sample)
+        input_np_list = list(input_np_4d)
+
+        out_np_4d = image_processor.postprocess(
+            image_processor.preprocess(input_np_4d),
+            output_type="np",
+        )
+
+        out_np_list = image_processor.postprocess(
+            image_processor.preprocess(input_np_list),
+            output_type="np",
+        )
+
+        assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6
+        assert np.abs(out_np_4d - out_np_list).max() < 1e-6
+
+    def test_preprocess_input_mask_3d(self):
+        image_processor = VaeImageProcessor(
+            do_resize=False, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+        )
+
+        input_pt_4d = self.dummy_mask
+        input_pt_3d = input_pt_4d.squeeze(0)
+        input_pt_2d = input_pt_3d.squeeze(0)
+
+        out_pt_4d = image_processor.postprocess(
+            image_processor.preprocess(input_pt_4d),
+            output_type="np",
+        )
+        out_pt_3d = image_processor.postprocess(
+            image_processor.preprocess(input_pt_3d),
+            output_type="np",
+        )
+
+        out_pt_2d = image_processor.postprocess(
+            image_processor.preprocess(input_pt_2d),
+            output_type="np",
+        )
+
+        input_np_4d = self.to_np(self.dummy_mask)
+        input_np_3d = input_np_4d.squeeze(0)
+        input_np_3d_1 = input_np_4d.squeeze(-1)
+        input_np_2d = input_np_3d.squeeze(-1)
+
+        out_np_4d = image_processor.postprocess(
+            image_processor.preprocess(input_np_4d),
+            output_type="np",
+        )
+        out_np_3d = image_processor.postprocess(
+            image_processor.preprocess(input_np_3d),
+            output_type="np",
+        )
+
+        out_np_3d_1 = image_processor.postprocess(
+            image_processor.preprocess(input_np_3d_1),
+            output_type="np",
+        )
+
+        out_np_2d = image_processor.postprocess(
+            image_processor.preprocess(input_np_2d),
+            output_type="np",
+        )
+
+        assert np.abs(out_pt_4d - out_pt_3d).max() == 0
+        assert np.abs(out_pt_4d - out_pt_2d).max() == 0
+        assert np.abs(out_np_4d - out_np_3d).max() == 0
+        assert np.abs(out_np_4d - out_np_3d_1).max() == 0
+        assert np.abs(out_np_4d - out_np_2d).max() == 0
+
+    def test_preprocess_input_mask_list(self):
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=False, do_convert_grayscale=True)
+
+        input_pt_4d = self.dummy_mask
+        input_pt_3d = input_pt_4d.squeeze(0)
+        input_pt_2d = input_pt_3d.squeeze(0)
+
+        inputs_pt = [input_pt_4d, input_pt_3d, input_pt_2d]
+        inputs_pt_list = [[input_pt] for input_pt in inputs_pt]
+
+        for input_pt, input_pt_list in zip(inputs_pt, inputs_pt_list):
+            out_pt = image_processor.postprocess(
+                image_processor.preprocess(input_pt),
+                output_type="np",
+            )
+            out_pt_list = image_processor.postprocess(
+                image_processor.preprocess(input_pt_list),
+                output_type="np",
+            )
+            assert np.abs(out_pt - out_pt_list).max() < 1e-6
+
+        input_np_4d = self.to_np(self.dummy_mask)
+        input_np_3d = input_np_4d.squeeze(0)
+        input_np_2d = input_np_3d.squeeze(-1)
+
+        inputs_np = [input_np_4d, input_np_3d, input_np_2d]
+        inputs_np_list = [[input_np] for input_np in inputs_np]
+
+        for input_np, input_np_list in zip(inputs_np, inputs_np_list):
+            out_np = image_processor.postprocess(
+                image_processor.preprocess(input_np),
+                output_type="np",
+            )
+            out_np_list = image_processor.postprocess(
+                image_processor.preprocess(input_np_list),
+                output_type="np",
+            )
+            assert np.abs(out_np - out_np_list).max() < 1e-6
+
+    def test_preprocess_input_mask_3d_batch(self):
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=False, do_convert_grayscale=True)
+
+        # create a dummy mask input with batch_size 2
+        dummy_mask_batch = torch.cat([self.dummy_mask] * 2, axis=0)
+
+        # squeeze out the channel dimension
+        input_pt_3d = dummy_mask_batch.squeeze(1)
+        input_np_3d = self.to_np(dummy_mask_batch).squeeze(-1)
+
+        input_pt_3d_list = list(input_pt_3d)
+        input_np_3d_list = list(input_np_3d)
+
+        out_pt_3d = image_processor.postprocess(
+            image_processor.preprocess(input_pt_3d),
+            output_type="np",
+        )
+        out_pt_3d_list = image_processor.postprocess(
+            image_processor.preprocess(input_pt_3d_list),
+            output_type="np",
+        )
+
+        assert np.abs(out_pt_3d - out_pt_3d_list).max() < 1e-6
+
+        out_np_3d = image_processor.postprocess(
+            image_processor.preprocess(input_np_3d),
+            output_type="np",
+        )
+        out_np_3d_list = image_processor.postprocess(
+            image_processor.preprocess(input_np_3d_list),
+            output_type="np",
+        )
+
+        assert np.abs(out_np_3d - out_np_3d_list).max() < 1e-6
+
+    def test_vae_image_processor_resize_pt(self):
+        image_processor = VaeImageProcessor(do_resize=True, vae_scale_factor=1)
+        input_pt = self.dummy_sample
+        b, c, h, w = input_pt.shape
+        scale = 2
+        out_pt = image_processor.resize(image=input_pt, height=h // scale, width=w // scale)
+        exp_pt_shape = (b, c, h // scale, w // scale)
+        assert (
+            out_pt.shape == exp_pt_shape
+        ), f"resized image output shape '{out_pt.shape}' didn't match expected shape '{exp_pt_shape}'."
+
+    def test_vae_image_processor_resize_np(self):
+        image_processor = VaeImageProcessor(do_resize=True, vae_scale_factor=1)
+        input_pt = self.dummy_sample
+        b, c, h, w = input_pt.shape
+        scale = 2
+        input_np = self.to_np(input_pt)
+        out_np = image_processor.resize(image=input_np, height=h // scale, width=w // scale)
+        exp_np_shape = (b, h // scale, w // scale, c)
+        assert (
+            out_np.shape == exp_np_shape
+        ), f"resized image output shape '{out_np.shape}' didn't match expected shape '{exp_np_shape}'."
diff --git a/diffusers/tests/others/test_outputs.py b/diffusers/tests/others/test_outputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf709d93f709ec2c29bf179a9bb1431e0fc682e8
--- /dev/null
+++ b/diffusers/tests/others/test_outputs.py
@@ -0,0 +1,93 @@
+import pickle as pkl
+import unittest
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+
+from diffusers.utils.outputs import BaseOutput
+from diffusers.utils.testing_utils import require_torch
+
+
+@dataclass
+class CustomOutput(BaseOutput):
+    images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+class ConfigTester(unittest.TestCase):
+    def test_outputs_single_attribute(self):
+        outputs = CustomOutput(images=np.random.rand(1, 3, 4, 4))
+
+        # check every way of getting the attribute
+        assert isinstance(outputs.images, np.ndarray)
+        assert outputs.images.shape == (1, 3, 4, 4)
+        assert isinstance(outputs["images"], np.ndarray)
+        assert outputs["images"].shape == (1, 3, 4, 4)
+        assert isinstance(outputs[0], np.ndarray)
+        assert outputs[0].shape == (1, 3, 4, 4)
+
+        # test with a non-tensor attribute
+        outputs = CustomOutput(images=[PIL.Image.new("RGB", (4, 4))])
+
+        # check every way of getting the attribute
+        assert isinstance(outputs.images, list)
+        assert isinstance(outputs.images[0], PIL.Image.Image)
+        assert isinstance(outputs["images"], list)
+        assert isinstance(outputs["images"][0], PIL.Image.Image)
+        assert isinstance(outputs[0], list)
+        assert isinstance(outputs[0][0], PIL.Image.Image)
+
+    def test_outputs_dict_init(self):
+        # test output reinitialization with a `dict` for compatibility with `accelerate`
+        outputs = CustomOutput({"images": np.random.rand(1, 3, 4, 4)})
+
+        # check every way of getting the attribute
+        assert isinstance(outputs.images, np.ndarray)
+        assert outputs.images.shape == (1, 3, 4, 4)
+        assert isinstance(outputs["images"], np.ndarray)
+        assert outputs["images"].shape == (1, 3, 4, 4)
+        assert isinstance(outputs[0], np.ndarray)
+        assert outputs[0].shape == (1, 3, 4, 4)
+
+        # test with a non-tensor attribute
+        outputs = CustomOutput({"images": [PIL.Image.new("RGB", (4, 4))]})
+
+        # check every way of getting the attribute
+        assert isinstance(outputs.images, list)
+        assert isinstance(outputs.images[0], PIL.Image.Image)
+        assert isinstance(outputs["images"], list)
+        assert isinstance(outputs["images"][0], PIL.Image.Image)
+        assert isinstance(outputs[0], list)
+        assert isinstance(outputs[0][0], PIL.Image.Image)
+
+    def test_outputs_serialization(self):
+        outputs_orig = CustomOutput(images=[PIL.Image.new("RGB", (4, 4))])
+        serialized = pkl.dumps(outputs_orig)
+        outputs_copy = pkl.loads(serialized)
+
+        # Check original and copy are equal
+        assert dir(outputs_orig) == dir(outputs_copy)
+        assert dict(outputs_orig) == dict(outputs_copy)
+        assert vars(outputs_orig) == vars(outputs_copy)
+
+    @require_torch
+    def test_torch_pytree(self):
+        # ensure torch.utils._pytree treats ModelOutput subclasses as nodes (and not leaves)
+        # this is important for DistributedDataParallel gradient synchronization with static_graph=True
+        import torch
+        import torch.utils._pytree
+
+        data = np.random.rand(1, 3, 4, 4)
+        x = CustomOutput(images=data)
+        self.assertFalse(torch.utils._pytree._is_leaf(x))
+
+        expected_flat_outs = [data]
+        expected_tree_spec = torch.utils._pytree.TreeSpec(CustomOutput, ["images"], [torch.utils._pytree.LeafSpec()])
+
+        actual_flat_outs, actual_tree_spec = torch.utils._pytree.tree_flatten(x)
+        self.assertEqual(expected_flat_outs, actual_flat_outs)
+        self.assertEqual(expected_tree_spec, actual_tree_spec)
+
+        unflattened_x = torch.utils._pytree.tree_unflatten(actual_flat_outs, actual_tree_spec)
+        self.assertEqual(x, unflattened_x)
diff --git a/diffusers/tests/others/test_training.py b/diffusers/tests/others/test_training.py
new file mode 100644
index 0000000000000000000000000000000000000000..863ba6e10798207b17385db0154fe15ec99fdcdf
--- /dev/null
+++ b/diffusers/tests/others/test_training.py
@@ -0,0 +1,86 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import DDIMScheduler, DDPMScheduler, UNet2DModel
+from diffusers.training_utils import set_seed
+from diffusers.utils.testing_utils import slow
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class TrainingTests(unittest.TestCase):
+    def get_model_optimizer(self, resolution=32):
+        set_seed(0)
+        model = UNet2DModel(sample_size=resolution, in_channels=3, out_channels=3)
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
+        return model, optimizer
+
+    @slow
+    def test_training_step_equality(self):
+        device = "cpu"  # ensure full determinism without setting the CUBLAS_WORKSPACE_CONFIG env variable
+        ddpm_scheduler = DDPMScheduler(
+            num_train_timesteps=1000,
+            beta_start=0.0001,
+            beta_end=0.02,
+            beta_schedule="linear",
+            clip_sample=True,
+        )
+        ddim_scheduler = DDIMScheduler(
+            num_train_timesteps=1000,
+            beta_start=0.0001,
+            beta_end=0.02,
+            beta_schedule="linear",
+            clip_sample=True,
+        )
+
+        assert ddpm_scheduler.config.num_train_timesteps == ddim_scheduler.config.num_train_timesteps
+
+        # shared batches for DDPM and DDIM
+        set_seed(0)
+        clean_images = [torch.randn((4, 3, 32, 32)).clip(-1, 1).to(device) for _ in range(4)]
+        noise = [torch.randn((4, 3, 32, 32)).to(device) for _ in range(4)]
+        timesteps = [torch.randint(0, 1000, (4,)).long().to(device) for _ in range(4)]
+
+        # train with a DDPM scheduler
+        model, optimizer = self.get_model_optimizer(resolution=32)
+        model.train().to(device)
+        for i in range(4):
+            optimizer.zero_grad()
+            ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
+            ddpm_noise_pred = model(ddpm_noisy_images, timesteps[i]).sample
+            loss = torch.nn.functional.mse_loss(ddpm_noise_pred, noise[i])
+            loss.backward()
+            optimizer.step()
+        del model, optimizer
+
+        # recreate the model and optimizer, and retry with DDIM
+        model, optimizer = self.get_model_optimizer(resolution=32)
+        model.train().to(device)
+        for i in range(4):
+            optimizer.zero_grad()
+            ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
+            ddim_noise_pred = model(ddim_noisy_images, timesteps[i]).sample
+            loss = torch.nn.functional.mse_loss(ddim_noise_pred, noise[i])
+            loss.backward()
+            optimizer.step()
+        del model, optimizer
+
+        self.assertTrue(torch.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-5))
+        self.assertTrue(torch.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-5))
diff --git a/diffusers/tests/others/test_utils.py b/diffusers/tests/others/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..65c0b3ece4d20b6d3279152f8a30067f74984461
--- /dev/null
+++ b/diffusers/tests/others/test_utils.py
@@ -0,0 +1,213 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+import pytest
+
+from diffusers import __version__
+from diffusers.utils import deprecate
+from diffusers.utils.testing_utils import str_to_bool
+
+
+# Used to test the hub
+USER = "__DUMMY_TRANSFORMERS_USER__"
+ENDPOINT_STAGING = "https://hub-ci.huggingface.co"
+
+# Not critical, only usable on the sandboxed CI instance.
+TOKEN = "hf_94wBhPGp6KrrTH3KDchhKpRxZwd6dmHWLL"
+
+
+class DeprecateTester(unittest.TestCase):
+    higher_version = ".".join([str(int(__version__.split(".")[0]) + 1)] + __version__.split(".")[1:])
+    lower_version = "0.0.1"
+
+    def test_deprecate_function_arg(self):
+        kwargs = {"deprecated_arg": 4}
+
+        with self.assertWarns(FutureWarning) as warning:
+            output = deprecate("deprecated_arg", self.higher_version, "message", take_from=kwargs)
+
+        assert output == 4
+        assert (
+            str(warning.warning)
+            == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}."
+            " message"
+        )
+
+    def test_deprecate_function_arg_tuple(self):
+        kwargs = {"deprecated_arg": 4}
+
+        with self.assertWarns(FutureWarning) as warning:
+            output = deprecate(("deprecated_arg", self.higher_version, "message"), take_from=kwargs)
+
+        assert output == 4
+        assert (
+            str(warning.warning)
+            == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}."
+            " message"
+        )
+
+    def test_deprecate_function_args(self):
+        kwargs = {"deprecated_arg_1": 4, "deprecated_arg_2": 8}
+        with self.assertWarns(FutureWarning) as warning:
+            output_1, output_2 = deprecate(
+                ("deprecated_arg_1", self.higher_version, "Hey"),
+                ("deprecated_arg_2", self.higher_version, "Hey"),
+                take_from=kwargs,
+            )
+        assert output_1 == 4
+        assert output_2 == 8
+        assert (
+            str(warning.warnings[0].message)
+            == "The `deprecated_arg_1` argument is deprecated and will be removed in version"
+            f" {self.higher_version}. Hey"
+        )
+        assert (
+            str(warning.warnings[1].message)
+            == "The `deprecated_arg_2` argument is deprecated and will be removed in version"
+            f" {self.higher_version}. Hey"
+        )
+
+    def test_deprecate_function_incorrect_arg(self):
+        kwargs = {"deprecated_arg": 4}
+
+        with self.assertRaises(TypeError) as error:
+            deprecate(("wrong_arg", self.higher_version, "message"), take_from=kwargs)
+
+        assert "test_deprecate_function_incorrect_arg in" in str(error.exception)
+        assert "line" in str(error.exception)
+        assert "got an unexpected keyword argument `deprecated_arg`" in str(error.exception)
+
+    def test_deprecate_arg_no_kwarg(self):
+        with self.assertWarns(FutureWarning) as warning:
+            deprecate(("deprecated_arg", self.higher_version, "message"))
+
+        assert (
+            str(warning.warning)
+            == f"`deprecated_arg` is deprecated and will be removed in version {self.higher_version}. message"
+        )
+
+    def test_deprecate_args_no_kwarg(self):
+        with self.assertWarns(FutureWarning) as warning:
+            deprecate(
+                ("deprecated_arg_1", self.higher_version, "Hey"),
+                ("deprecated_arg_2", self.higher_version, "Hey"),
+            )
+        assert (
+            str(warning.warnings[0].message)
+            == f"`deprecated_arg_1` is deprecated and will be removed in version {self.higher_version}. Hey"
+        )
+        assert (
+            str(warning.warnings[1].message)
+            == f"`deprecated_arg_2` is deprecated and will be removed in version {self.higher_version}. Hey"
+        )
+
+    def test_deprecate_class_obj(self):
+        class Args:
+            arg = 5
+
+        with self.assertWarns(FutureWarning) as warning:
+            arg = deprecate(("arg", self.higher_version, "message"), take_from=Args())
+
+        assert arg == 5
+        assert (
+            str(warning.warning)
+            == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
+        )
+
+    def test_deprecate_class_objs(self):
+        class Args:
+            arg = 5
+            foo = 7
+
+        with self.assertWarns(FutureWarning) as warning:
+            arg_1, arg_2 = deprecate(
+                ("arg", self.higher_version, "message"),
+                ("foo", self.higher_version, "message"),
+                ("does not exist", self.higher_version, "message"),
+                take_from=Args(),
+            )
+
+        assert arg_1 == 5
+        assert arg_2 == 7
+        assert (
+            str(warning.warning)
+            == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
+        )
+        assert (
+            str(warning.warnings[0].message)
+            == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
+        )
+        assert (
+            str(warning.warnings[1].message)
+            == f"The `foo` attribute is deprecated and will be removed in version {self.higher_version}. message"
+        )
+
+    def test_deprecate_incorrect_version(self):
+        kwargs = {"deprecated_arg": 4}
+
+        with self.assertRaises(ValueError) as error:
+            deprecate(("wrong_arg", self.lower_version, "message"), take_from=kwargs)
+
+        assert (
+            str(error.exception)
+            == "The deprecation tuple ('wrong_arg', '0.0.1', 'message') should be removed since diffusers' version"
+            f" {__version__} is >= {self.lower_version}"
+        )
+
+    def test_deprecate_incorrect_no_standard_warn(self):
+        with self.assertWarns(FutureWarning) as warning:
+            deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)
+
+        assert str(warning.warning) == "This message is better!!!"
+
+    def test_deprecate_stacklevel(self):
+        with self.assertWarns(FutureWarning) as warning:
+            deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)
+        assert str(warning.warning) == "This message is better!!!"
+        assert "diffusers/tests/others/test_utils.py" in warning.filename
+
+
+def parse_flag_from_env(key, default=False):
+    try:
+        value = os.environ[key]
+    except KeyError:
+        # KEY isn't set, default to `default`.
+        _value = default
+    else:
+        # KEY is set, convert it to True or False.
+        try:
+            _value = str_to_bool(value)
+        except ValueError:
+            # More values are supported, but let's keep the message simple.
+            raise ValueError(f"If set, {key} must be yes or no.")
+    return _value
+
+
+_run_staging = parse_flag_from_env("HUGGINGFACE_CO_STAGING", default=False)
+
+
+def is_staging_test(test_case):
+    """
+    Decorator marking a test as a staging test.
+
+    Those tests will run using the staging environment of huggingface.co instead of the real model hub.
+    """
+    if not _run_staging:
+        return unittest.skip("test is staging test")(test_case)
+    else:
+        return pytest.mark.is_staging_test()(test_case)
diff --git a/diffusers/tests/others/test_video_processor.py b/diffusers/tests/others/test_video_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2fc87717f9d5a673f9e084fd9893bea623675ec
--- /dev/null
+++ b/diffusers/tests/others/test_video_processor.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import PIL.Image
+import torch
+from parameterized import parameterized
+
+from diffusers.video_processor import VideoProcessor
+
+
+np.random.seed(0)
+torch.manual_seed(0)
+
+
+class VideoProcessorTest(unittest.TestCase):
+    def get_dummy_sample(self, input_type):
+        batch_size = 1
+        num_frames = 5
+        num_channels = 3
+        height = 8
+        width = 8
+
+        def generate_image():
+            return PIL.Image.fromarray(np.random.randint(0, 256, size=(height, width, num_channels)).astype("uint8"))
+
+        def generate_4d_array():
+            return np.random.rand(num_frames, height, width, num_channels)
+
+        def generate_5d_array():
+            return np.random.rand(batch_size, num_frames, height, width, num_channels)
+
+        def generate_4d_tensor():
+            return torch.rand(num_frames, num_channels, height, width)
+
+        def generate_5d_tensor():
+            return torch.rand(batch_size, num_frames, num_channels, height, width)
+
+        if input_type == "list_images":
+            sample = [generate_image() for _ in range(num_frames)]
+        elif input_type == "list_list_images":
+            sample = [[generate_image() for _ in range(num_frames)] for _ in range(num_frames)]
+        elif input_type == "list_4d_np":
+            sample = [generate_4d_array() for _ in range(num_frames)]
+        elif input_type == "list_list_4d_np":
+            sample = [[generate_4d_array() for _ in range(num_frames)] for _ in range(num_frames)]
+        elif input_type == "list_5d_np":
+            sample = [generate_5d_array() for _ in range(num_frames)]
+        elif input_type == "5d_np":
+            sample = generate_5d_array()
+        elif input_type == "list_4d_pt":
+            sample = [generate_4d_tensor() for _ in range(num_frames)]
+        elif input_type == "list_list_4d_pt":
+            sample = [[generate_4d_tensor() for _ in range(num_frames)] for _ in range(num_frames)]
+        elif input_type == "list_5d_pt":
+            sample = [generate_5d_tensor() for _ in range(num_frames)]
+        elif input_type == "5d_pt":
+            sample = generate_5d_tensor()
+
+        return sample
+
+    def to_np(self, video):
+        # List of images.
+        if isinstance(video[0], PIL.Image.Image):
+            video = np.stack([np.array(i) for i in video], axis=0)
+
+        # List of list of images.
+        elif isinstance(video, list) and isinstance(video[0][0], PIL.Image.Image):
+            frames = []
+            for vid in video:
+                all_current_frames = np.stack([np.array(i) for i in vid], axis=0)
+                frames.append(all_current_frames)
+            video = np.stack([np.array(frame) for frame in frames], axis=0)
+
+        # List of 4d/5d {ndarrays, torch tensors}.
+        elif isinstance(video, list) and isinstance(video[0], (torch.Tensor, np.ndarray)):
+            if isinstance(video[0], np.ndarray):
+                video = np.stack(video, axis=0) if video[0].ndim == 4 else np.concatenate(video, axis=0)
+            else:
+                if video[0].ndim == 4:
+                    video = np.stack([i.cpu().numpy().transpose(0, 2, 3, 1) for i in video], axis=0)
+                elif video[0].ndim == 5:
+                    video = np.concatenate([i.cpu().numpy().transpose(0, 1, 3, 4, 2) for i in video], axis=0)
+
+        # List of list of 4d/5d {ndarrays, torch tensors}.
+        elif (
+            isinstance(video, list)
+            and isinstance(video[0], list)
+            and isinstance(video[0][0], (torch.Tensor, np.ndarray))
+        ):
+            all_frames = []
+            for list_of_videos in video:
+                temp_frames = []
+                for vid in list_of_videos:
+                    if vid.ndim == 4:
+                        current_vid_frames = np.stack(
+                            [i if isinstance(i, np.ndarray) else i.cpu().numpy().transpose(1, 2, 0) for i in vid],
+                            axis=0,
+                        )
+                    elif vid.ndim == 5:
+                        current_vid_frames = np.concatenate(
+                            [i if isinstance(i, np.ndarray) else i.cpu().numpy().transpose(0, 2, 3, 1) for i in vid],
+                            axis=0,
+                        )
+                    temp_frames.append(current_vid_frames)
+                temp_frames = np.stack(temp_frames, axis=0)
+                all_frames.append(temp_frames)
+
+            video = np.concatenate(all_frames, axis=0)
+
+        # Just 5d {ndarrays, torch tensors}.
+        elif isinstance(video, (torch.Tensor, np.ndarray)) and video.ndim == 5:
+            video = video if isinstance(video, np.ndarray) else video.cpu().numpy().transpose(0, 1, 3, 4, 2)
+
+        return video
+
+    @parameterized.expand(["list_images", "list_list_images"])
+    def test_video_processor_pil(self, input_type):
+        video_processor = VideoProcessor(do_resize=False, do_normalize=True)
+
+        input = self.get_dummy_sample(input_type=input_type)
+
+        for output_type in ["pt", "np", "pil"]:
+            out = video_processor.postprocess_video(video_processor.preprocess_video(input), output_type=output_type)
+            out_np = self.to_np(out)
+            input_np = self.to_np(input).astype("float32") / 255.0 if output_type != "pil" else self.to_np(input)
+            assert np.abs(input_np - out_np).max() < 1e-6, f"Decoded output does not match input for {output_type=}"
+
+    @parameterized.expand(["list_4d_np", "list_5d_np", "5d_np"])
+    def test_video_processor_np(self, input_type):
+        video_processor = VideoProcessor(do_resize=False, do_normalize=True)
+
+        input = self.get_dummy_sample(input_type=input_type)
+
+        for output_type in ["pt", "np", "pil"]:
+            out = video_processor.postprocess_video(video_processor.preprocess_video(input), output_type=output_type)
+            out_np = self.to_np(out)
+            input_np = (
+                (self.to_np(input) * 255.0).round().astype("uint8") if output_type == "pil" else self.to_np(input)
+            )
+            assert np.abs(input_np - out_np).max() < 1e-6, f"Decoded output does not match input for {output_type=}"
+
+    @parameterized.expand(["list_4d_pt", "list_5d_pt", "5d_pt"])
+    def test_video_processor_pt(self, input_type):
+        video_processor = VideoProcessor(do_resize=False, do_normalize=True)
+
+        input = self.get_dummy_sample(input_type=input_type)
+
+        for output_type in ["pt", "np", "pil"]:
+            out = video_processor.postprocess_video(video_processor.preprocess_video(input), output_type=output_type)
+            out_np = self.to_np(out)
+            input_np = (
+                (self.to_np(input) * 255.0).round().astype("uint8") if output_type == "pil" else self.to_np(input)
+            )
+            assert np.abs(input_np - out_np).max() < 1e-6, f"Decoded output does not match input for {output_type=}"
diff --git a/diffusers/tests/pipelines/__init__.py b/diffusers/tests/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/amused/__init__.py b/diffusers/tests/pipelines/amused/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/amused/test_amused.py b/diffusers/tests/pipelines/amused/test_amused.py
new file mode 100644
index 0000000000000000000000000000000000000000..32f3e13ad911a7d488cd1e4102183ae0938fe4d6
--- /dev/null
+++ b/diffusers/tests/pipelines/amused/test_amused.py
@@ -0,0 +1,170 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import AmusedPipeline, AmusedScheduler, UVit2DModel, VQModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class AmusedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = AmusedPipeline
+    params = TEXT_TO_IMAGE_PARAMS | {"encoder_hidden_states", "negative_encoder_hidden_states"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = UVit2DModel(
+            hidden_size=8,
+            use_bias=False,
+            hidden_dropout=0.0,
+            cond_embed_dim=8,
+            micro_cond_encode_dim=2,
+            micro_cond_embed_dim=10,
+            encoder_hidden_size=8,
+            vocab_size=32,
+            codebook_size=8,
+            in_channels=8,
+            block_out_channels=8,
+            num_res_blocks=1,
+            downsample=True,
+            upsample=True,
+            block_num_heads=1,
+            num_hidden_layers=1,
+            num_attention_heads=1,
+            attention_dropout=0.0,
+            intermediate_size=8,
+            layer_norm_eps=1e-06,
+            ln_elementwise_affine=True,
+        )
+        scheduler = AmusedScheduler(mask_token_id=31)
+        torch.manual_seed(0)
+        vqvae = VQModel(
+            act_fn="silu",
+            block_out_channels=[8],
+            down_block_types=["DownEncoderBlock2D"],
+            in_channels=3,
+            latent_channels=8,
+            layers_per_block=1,
+            norm_num_groups=8,
+            num_vq_embeddings=8,
+            out_channels=3,
+            sample_size=8,
+            up_block_types=["UpDecoderBlock2D"],
+            mid_block_add_attention=False,
+            lookup_from_codebook=True,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=8,
+            intermediate_size=8,
+            layer_norm_eps=1e-05,
+            num_attention_heads=1,
+            num_hidden_layers=1,
+            pad_token_id=1,
+            vocab_size=1000,
+            projection_dim=8,
+        )
+        text_encoder = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        components = {
+            "transformer": transformer,
+            "scheduler": scheduler,
+            "vqvae": vqvae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+            "height": 4,
+            "width": 4,
+        }
+        return inputs
+
+    def test_inference_batch_consistent(self, batch_sizes=[2]):
+        self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)
+
+    @unittest.skip("aMUSEd does not support lists of generators")
+    def test_inference_batch_single_identical(self):
+        ...
+
+
+@slow
+@require_torch_gpu
+class AmusedPipelineSlowTests(unittest.TestCase):
+    def test_amused_256(self):
+        pipe = AmusedPipeline.from_pretrained("amused/amused-256")
+        pipe.to(torch_device)
+        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.4011, 0.3992, 0.379, 0.3856, 0.3772, 0.3711, 0.3919, 0.385, 0.3625])
+        assert np.abs(image_slice - expected_slice).max() < 0.003
+
+    def test_amused_256_fp16(self):
+        pipe = AmusedPipeline.from_pretrained("amused/amused-256", variant="fp16", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.0554, 0.05129, 0.0344, 0.0452, 0.0476, 0.0271, 0.0495, 0.0527, 0.0158])
+        assert np.abs(image_slice - expected_slice).max() < 0.007
+
+    def test_amused_512(self):
+        pipe = AmusedPipeline.from_pretrained("amused/amused-512")
+        pipe.to(torch_device)
+        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.1199, 0.1171, 0.1229, 0.1188, 0.1210, 0.1147, 0.1260, 0.1346, 0.1152])
+        assert np.abs(image_slice - expected_slice).max() < 0.003
+
+    def test_amused_512_fp16(self):
+        pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.1509, 0.1492, 0.1531, 0.1485, 0.1501, 0.1465, 0.1581, 0.1690, 0.1499])
+        assert np.abs(image_slice - expected_slice).max() < 0.003
diff --git a/diffusers/tests/pipelines/amused/test_amused_img2img.py b/diffusers/tests/pipelines/amused/test_amused_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..c647a5aa304e811afb920d26b4f077f14a0e7b6a
--- /dev/null
+++ b/diffusers/tests/pipelines/amused/test_amused_img2img.py
@@ -0,0 +1,216 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import AmusedImg2ImgPipeline, AmusedScheduler, UVit2DModel, VQModel
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class AmusedImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = AmusedImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "latents"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = UVit2DModel(
+            hidden_size=8,
+            use_bias=False,
+            hidden_dropout=0.0,
+            cond_embed_dim=8,
+            micro_cond_encode_dim=2,
+            micro_cond_embed_dim=10,
+            encoder_hidden_size=8,
+            vocab_size=32,
+            codebook_size=8,
+            in_channels=8,
+            block_out_channels=8,
+            num_res_blocks=1,
+            downsample=True,
+            upsample=True,
+            block_num_heads=1,
+            num_hidden_layers=1,
+            num_attention_heads=1,
+            attention_dropout=0.0,
+            intermediate_size=8,
+            layer_norm_eps=1e-06,
+            ln_elementwise_affine=True,
+        )
+        scheduler = AmusedScheduler(mask_token_id=31)
+        torch.manual_seed(0)
+        vqvae = VQModel(
+            act_fn="silu",
+            block_out_channels=[8],
+            down_block_types=["DownEncoderBlock2D"],
+            in_channels=3,
+            latent_channels=8,
+            layers_per_block=1,
+            norm_num_groups=8,
+            num_vq_embeddings=32,
+            out_channels=3,
+            sample_size=8,
+            up_block_types=["UpDecoderBlock2D"],
+            mid_block_add_attention=False,
+            lookup_from_codebook=True,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=8,
+            intermediate_size=8,
+            layer_norm_eps=1e-05,
+            num_attention_heads=1,
+            num_hidden_layers=1,
+            pad_token_id=1,
+            vocab_size=1000,
+            projection_dim=8,
+        )
+        text_encoder = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        components = {
+            "transformer": transformer,
+            "scheduler": scheduler,
+            "vqvae": vqvae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        image = torch.full((1, 3, 4, 4), 1.0, dtype=torch.float32, device=device)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+            "image": image,
+        }
+        return inputs
+
+    def test_inference_batch_consistent(self, batch_sizes=[2]):
+        self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)
+
+    @unittest.skip("aMUSEd does not support lists of generators")
+    def test_inference_batch_single_identical(self):
+        ...
+
+
+@slow
+@require_torch_gpu
+class AmusedImg2ImgPipelineSlowTests(unittest.TestCase):
+    def test_amused_256(self):
+        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-256")
+        pipe.to(torch_device)
+        image = (
+            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
+            .resize((256, 256))
+            .convert("RGB")
+        )
+        image = pipe(
+            "winter mountains",
+            image,
+            generator=torch.Generator().manual_seed(0),
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.9993, 1.0, 0.9996, 1.0, 0.9995, 0.9925, 0.999, 0.9954, 1.0])
+        assert np.abs(image_slice - expected_slice).max() < 0.01
+
+    def test_amused_256_fp16(self):
+        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-256", torch_dtype=torch.float16, variant="fp16")
+        pipe.to(torch_device)
+        image = (
+            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
+            .resize((256, 256))
+            .convert("RGB")
+        )
+        image = pipe(
+            "winter mountains",
+            image,
+            generator=torch.Generator().manual_seed(0),
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.998, 0.998, 0.994, 0.9944, 0.996, 0.9908, 1.0, 1.0, 0.9986])
+        assert np.abs(image_slice - expected_slice).max() < 0.01
+
+    def test_amused_512(self):
+        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512")
+        pipe.to(torch_device)
+        image = (
+            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
+            .resize((512, 512))
+            .convert("RGB")
+        )
+        image = pipe(
+            "winter mountains",
+            image,
+            generator=torch.Generator().manual_seed(0),
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2809, 0.1879, 0.2027, 0.2418, 0.1852, 0.2145, 0.2484, 0.2425, 0.2317])
+        assert np.abs(image_slice - expected_slice).max() < 0.1
+
+    def test_amused_512_fp16(self):
+        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        image = (
+            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
+            .resize((512, 512))
+            .convert("RGB")
+        )
+        image = pipe(
+            "winter mountains",
+            image,
+            generator=torch.Generator().manual_seed(0),
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2795, 0.1867, 0.2028, 0.2450, 0.1856, 0.2140, 0.2473, 0.2406, 0.2313])
+        assert np.abs(image_slice - expected_slice).max() < 0.1
diff --git a/diffusers/tests/pipelines/amused/test_amused_inpaint.py b/diffusers/tests/pipelines/amused/test_amused_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a8d501450bbb1e9060dd942adb4763507d0829c
--- /dev/null
+++ b/diffusers/tests/pipelines/amused/test_amused_inpaint.py
@@ -0,0 +1,251 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import AmusedInpaintPipeline, AmusedScheduler, UVit2DModel, VQModel
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class AmusedInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = AmusedInpaintPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"}
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = UVit2DModel(
+            hidden_size=8,
+            use_bias=False,
+            hidden_dropout=0.0,
+            cond_embed_dim=8,
+            micro_cond_encode_dim=2,
+            micro_cond_embed_dim=10,
+            encoder_hidden_size=8,
+            vocab_size=32,
+            codebook_size=32,
+            in_channels=8,
+            block_out_channels=8,
+            num_res_blocks=1,
+            downsample=True,
+            upsample=True,
+            block_num_heads=1,
+            num_hidden_layers=1,
+            num_attention_heads=1,
+            attention_dropout=0.0,
+            intermediate_size=8,
+            layer_norm_eps=1e-06,
+            ln_elementwise_affine=True,
+        )
+        scheduler = AmusedScheduler(mask_token_id=31)
+        torch.manual_seed(0)
+        vqvae = VQModel(
+            act_fn="silu",
+            block_out_channels=[8],
+            down_block_types=["DownEncoderBlock2D"],
+            in_channels=3,
+            latent_channels=8,
+            layers_per_block=1,
+            norm_num_groups=8,
+            num_vq_embeddings=32,
+            out_channels=3,
+            sample_size=8,
+            up_block_types=["UpDecoderBlock2D"],
+            mid_block_add_attention=False,
+            lookup_from_codebook=True,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=8,
+            intermediate_size=8,
+            layer_norm_eps=1e-05,
+            num_attention_heads=1,
+            num_hidden_layers=1,
+            pad_token_id=1,
+            vocab_size=1000,
+            projection_dim=8,
+        )
+        text_encoder = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        components = {
+            "transformer": transformer,
+            "scheduler": scheduler,
+            "vqvae": vqvae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        image = torch.full((1, 3, 4, 4), 1.0, dtype=torch.float32, device=device)
+        mask_image = torch.full((1, 1, 4, 4), 1.0, dtype=torch.float32, device=device)
+        mask_image[0, 0, 0, 0] = 0
+        mask_image[0, 0, 0, 1] = 0
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+            "image": image,
+            "mask_image": mask_image,
+        }
+        return inputs
+
+    def test_inference_batch_consistent(self, batch_sizes=[2]):
+        self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)
+
+    @unittest.skip("aMUSEd does not support lists of generators")
+    def test_inference_batch_single_identical(self):
+        ...
+
+
+@slow
+@require_torch_gpu
+class AmusedInpaintPipelineSlowTests(unittest.TestCase):
+    def test_amused_256(self):
+        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-256")
+        pipe.to(torch_device)
+        image = (
+            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
+            .resize((256, 256))
+            .convert("RGB")
+        )
+        mask_image = (
+            load_image(
+                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
+            )
+            .resize((256, 256))
+            .convert("L")
+        )
+        image = pipe(
+            "winter mountains",
+            image,
+            mask_image,
+            generator=torch.Generator().manual_seed(0),
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.0699, 0.0716, 0.0608, 0.0715, 0.0797, 0.0638, 0.0802, 0.0924, 0.0634])
+        assert np.abs(image_slice - expected_slice).max() < 0.1
+
+    def test_amused_256_fp16(self):
+        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-256", variant="fp16", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        image = (
+            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
+            .resize((256, 256))
+            .convert("RGB")
+        )
+        mask_image = (
+            load_image(
+                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
+            )
+            .resize((256, 256))
+            .convert("L")
+        )
+        image = pipe(
+            "winter mountains",
+            image,
+            mask_image,
+            generator=torch.Generator().manual_seed(0),
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.0735, 0.0749, 0.065, 0.0739, 0.0805, 0.0667, 0.0802, 0.0923, 0.0622])
+        assert np.abs(image_slice - expected_slice).max() < 0.1
+
+    def test_amused_512(self):
+        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512")
+        pipe.to(torch_device)
+        image = (
+            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
+            .resize((512, 512))
+            .convert("RGB")
+        )
+        mask_image = (
+            load_image(
+                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
+            )
+            .resize((512, 512))
+            .convert("L")
+        )
+        image = pipe(
+            "winter mountains",
+            image,
+            mask_image,
+            generator=torch.Generator().manual_seed(0),
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0005, 0.0])
+        assert np.abs(image_slice - expected_slice).max() < 0.05
+
+    def test_amused_512_fp16(self):
+        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        image = (
+            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
+            .resize((512, 512))
+            .convert("RGB")
+        )
+        mask_image = (
+            load_image(
+                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
+            )
+            .resize((512, 512))
+            .convert("L")
+        )
+        image = pipe(
+            "winter mountains",
+            image,
+            mask_image,
+            generator=torch.Generator().manual_seed(0),
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0227, 0.0157, 0.0098, 0.0213, 0.0250, 0.0127, 0.0280, 0.0380, 0.0095])
+        assert np.abs(image_slice - expected_slice).max() < 0.003
diff --git a/diffusers/tests/pipelines/animatediff/__init__.py b/diffusers/tests/pipelines/animatediff/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/animatediff/test_animatediff.py b/diffusers/tests/pipelines/animatediff/test_animatediff.py
new file mode 100644
index 0000000000000000000000000000000000000000..54c83d6a1b68102b6954aaa73b5763d080a3b7f2
--- /dev/null
+++ b/diffusers/tests/pipelines/animatediff/test_animatediff.py
@@ -0,0 +1,604 @@
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AnimateDiffPipeline,
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    LCMScheduler,
+    MotionAdapter,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    UNetMotionModel,
+)
+from diffusers.models.attention import FreeNoiseTransformerBlock
+from diffusers.utils import is_xformers_available, logging
+from diffusers.utils.testing_utils import numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineTesterMixin,
+    SDFunctionTesterMixin,
+)
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+class AnimateDiffPipelineFastTests(
+    IPAdapterTesterMixin, SDFunctionTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
+):
+    pipeline_class = AnimateDiffPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+
+    def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            sample_size=8,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="linear",
+            clip_sample=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=block_out_channels,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
+        motion_adapter = MotionAdapter(
+            block_out_channels=block_out_channels,
+            motion_layers_per_block=2,
+            motion_norm_num_groups=2,
+            motion_num_attention_heads=4,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "motion_adapter": motion_adapter,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 7.5,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
+    def test_motion_unet_loading(self):
+        components = self.get_dummy_components()
+        pipe = AnimateDiffPipeline(**components)
+
+        assert isinstance(pipe.unet, UNetMotionModel)
+
+    @unittest.skip("Attention slicing is not enabled in this pipeline")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array(
+                [
+                    0.5216,
+                    0.5620,
+                    0.4927,
+                    0.5082,
+                    0.4786,
+                    0.5932,
+                    0.5125,
+                    0.4514,
+                    0.5315,
+                    0.4694,
+                    0.3276,
+                    0.4863,
+                    0.3920,
+                    0.3684,
+                    0.5745,
+                    0.4499,
+                    0.5081,
+                    0.5414,
+                    0.6014,
+                    0.5062,
+                    0.3630,
+                    0.5296,
+                    0.6018,
+                    0.5098,
+                    0.4948,
+                    0.5101,
+                    0.5620,
+                ]
+            )
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.5125, 0.4514, 0.5315, 0.4499, 0.5081, 0.5414, 0.4948, 0.5101, 0.5620])
+        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for components in pipe.components.values():
+            if hasattr(components, "set_default_attn_processor"):
+                components.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
+
+        assert output_batch[0].shape[0] == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+        assert max_diff < expected_max_diff
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_prompt_embeds(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs.pop("prompt")
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
+        pipe(**inputs)
+
+    def test_free_init(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        pipe.enable_free_init(
+            num_iters=2,
+            use_fast_sampling=True,
+            method="butterworth",
+            order=4,
+            spatial_stop_frequency=0.25,
+            temporal_stop_frequency=0.25,
+        )
+        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
+        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
+
+        pipe.disable_free_init()
+        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
+        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
+
+        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
+        self.assertGreater(
+            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
+        )
+        self.assertLess(
+            max_diff_disabled,
+            1e-4,
+            "Disabling of FreeInit should lead to results similar to the default pipeline results",
+        )
+
+    def test_free_init_with_schedulers(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        schedulers_to_test = [
+            DPMSolverMultistepScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                algorithm_type="dpmsolver++",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+            LCMScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+        ]
+        components.pop("scheduler")
+
+        for scheduler in schedulers_to_test:
+            components["scheduler"] = scheduler
+            pipe: AnimateDiffPipeline = self.pipeline_class(**components)
+            pipe.set_progress_bar_config(disable=None)
+            pipe.to(torch_device)
+
+            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
+
+            inputs = self.get_dummy_inputs(torch_device)
+            frames_enable_free_init = pipe(**inputs).frames[0]
+            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+
+            self.assertGreater(
+                sum_enabled,
+                1e1,
+                "Enabling of FreeInit should lead to results different from the default pipeline results",
+            )
+
+    def test_free_noise_blocks(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        pipe.enable_free_noise()
+        for block in pipe.unet.down_blocks:
+            for motion_module in block.motion_modules:
+                for transformer_block in motion_module.transformer_blocks:
+                    self.assertTrue(
+                        isinstance(transformer_block, FreeNoiseTransformerBlock),
+                        "Motion module transformer blocks must be an instance of `FreeNoiseTransformerBlock` after enabling FreeNoise.",
+                    )
+
+        pipe.disable_free_noise()
+        for block in pipe.unet.down_blocks:
+            for motion_module in block.motion_modules:
+                for transformer_block in motion_module.transformer_blocks:
+                    self.assertFalse(
+                        isinstance(transformer_block, FreeNoiseTransformerBlock),
+                        "Motion module transformer blocks must not be an instance of `FreeNoiseTransformerBlock` after disabling FreeNoise.",
+                    )
+
+    def test_free_noise(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        for context_length in [8, 9]:
+            for context_stride in [4, 6]:
+                pipe.enable_free_noise(context_length, context_stride)
+
+                inputs_enable_free_noise = self.get_dummy_inputs(torch_device)
+                frames_enable_free_noise = pipe(**inputs_enable_free_noise).frames[0]
+
+                pipe.disable_free_noise()
+
+                inputs_disable_free_noise = self.get_dummy_inputs(torch_device)
+                frames_disable_free_noise = pipe(**inputs_disable_free_noise).frames[0]
+
+                sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_noise)).sum()
+                max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_noise)).max()
+                self.assertGreater(
+                    sum_enabled,
+                    1e1,
+                    "Enabling of FreeNoise should lead to results different from the default pipeline results",
+                )
+                self.assertLess(
+                    max_diff_disabled,
+                    1e-4,
+                    "Disabling of FreeNoise should lead to results similar to the default pipeline results",
+                )
+
+    def test_free_noise_split_inference(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        pipe.enable_free_noise(8, 4)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        # Test FreeNoise with split inference memory-optimization
+        pipe.enable_free_noise_split_inference(spatial_split_size=16, temporal_split_size=4)
+
+        inputs_enable_split_inference = self.get_dummy_inputs(torch_device)
+        frames_enable_split_inference = pipe(**inputs_enable_split_inference).frames[0]
+
+        sum_split_inference = np.abs(to_np(frames_normal) - to_np(frames_enable_split_inference)).sum()
+        self.assertLess(
+            sum_split_inference,
+            1e-4,
+            "Enabling FreeNoise Split Inference memory-optimizations should lead to results similar to the default pipeline results",
+        )
+
+    def test_free_noise_multi_prompt(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        context_length = 8
+        context_stride = 4
+        pipe.enable_free_noise(context_length, context_stride)
+
+        # Make sure that pipeline works when prompt indices are within num_frames bounds
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf"}
+        inputs["num_frames"] = 16
+        pipe(**inputs).frames[0]
+
+        with self.assertRaises(ValueError):
+            # Ensure that prompt indices are within bounds
+            inputs = self.get_dummy_inputs(torch_device)
+            inputs["num_frames"] = 16
+            inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf", 42: "Error on a leaf"}
+            pipe(**inputs).frames[0]
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = pipe(**inputs).frames[0]
+        output_without_offload = (
+            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        )
+
+        pipe.enable_xformers_memory_efficient_attention()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = pipe(**inputs).frames[0]
+        output_with_offload = (
+            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
+        )
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
+
+    def test_vae_slicing(self):
+        return super().test_vae_slicing(image_count=2)
+
+
+@slow
+@require_torch_gpu
+class AnimateDiffPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_animatediff(self):
+        adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
+        pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
+        pipe = pipe.to(torch_device)
+        pipe.scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="linear",
+            steps_offset=1,
+            clip_sample=False,
+        )
+        pipe.enable_vae_slicing()
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain"
+        negative_prompt = "bad quality, worse quality"
+
+        generator = torch.Generator("cpu").manual_seed(0)
+        output = pipe(
+            prompt,
+            negative_prompt=negative_prompt,
+            num_frames=16,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=3,
+            output_type="np",
+        )
+
+        image = output.frames[0]
+        assert image.shape == (16, 512, 512, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array(
+            [
+                0.11357737,
+                0.11285847,
+                0.11180121,
+                0.11084166,
+                0.11414117,
+                0.09785956,
+                0.10742754,
+                0.10510018,
+                0.08045256,
+            ]
+        )
+        assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3
diff --git a/diffusers/tests/pipelines/animatediff/test_animatediff_controlnet.py b/diffusers/tests/pipelines/animatediff/test_animatediff_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..519d848c6dc2d3bb1d632fae7e607144d944b463
--- /dev/null
+++ b/diffusers/tests/pipelines/animatediff/test_animatediff_controlnet.py
@@ -0,0 +1,519 @@
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AnimateDiffControlNetPipeline,
+    AutoencoderKL,
+    ControlNetModel,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    LCMScheduler,
+    MotionAdapter,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    UNetMotionModel,
+)
+from diffusers.models.attention import FreeNoiseTransformerBlock
+from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineTesterMixin,
+    SDFunctionTesterMixin,
+)
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+class AnimateDiffControlNetPipelineFastTests(
+    IPAdapterTesterMixin, SDFunctionTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
+):
+    pipeline_class = AnimateDiffControlNetPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"conditioning_frames"})
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+
+    def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            sample_size=8,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="linear",
+            clip_sample=False,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            conditioning_embedding_out_channels=(8, 8),
+            norm_num_groups=1,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=block_out_channels,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        motion_adapter = MotionAdapter(
+            block_out_channels=block_out_channels,
+            motion_layers_per_block=2,
+            motion_norm_num_groups=2,
+            motion_num_attention_heads=4,
+        )
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "motion_adapter": motion_adapter,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed: int = 0, num_frames: int = 2):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        video_height = 32
+        video_width = 32
+        conditioning_frames = [Image.new("RGB", (video_width, video_height))] * num_frames
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "conditioning_frames": conditioning_frames,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "num_frames": num_frames,
+            "guidance_scale": 7.5,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
+    def test_motion_unet_loading(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        assert isinstance(pipe.unet, UNetMotionModel)
+
+    @unittest.skip("Attention slicing is not enabled in this pipeline")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array(
+                [
+                    0.6604,
+                    0.4099,
+                    0.4928,
+                    0.5706,
+                    0.5096,
+                    0.5012,
+                    0.6051,
+                    0.5169,
+                    0.5021,
+                    0.4864,
+                    0.4261,
+                    0.5779,
+                    0.5822,
+                    0.4049,
+                    0.5253,
+                    0.6160,
+                    0.4150,
+                    0.5155,
+                ]
+            )
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.6051, 0.5169, 0.5021, 0.6160, 0.4150, 0.5155])
+        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for components in pipe.components.values():
+            if hasattr(components, "set_default_attn_processor"):
+                components.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
+
+        assert output_batch[0].shape[0] == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+        assert max_diff < expected_max_diff
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_prompt_embeds(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs.pop("prompt")
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
+        pipe(**inputs)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
+    def test_free_init(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        pipe.enable_free_init(
+            num_iters=2,
+            use_fast_sampling=True,
+            method="butterworth",
+            order=4,
+            spatial_stop_frequency=0.25,
+            temporal_stop_frequency=0.25,
+        )
+        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
+        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
+
+        pipe.disable_free_init()
+        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
+        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
+
+        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
+        self.assertGreater(
+            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
+        )
+        self.assertLess(
+            max_diff_disabled,
+            1e-4,
+            "Disabling of FreeInit should lead to results similar to the default pipeline results",
+        )
+
+    def test_free_init_with_schedulers(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        schedulers_to_test = [
+            DPMSolverMultistepScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                algorithm_type="dpmsolver++",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+            LCMScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+        ]
+        components.pop("scheduler")
+
+        for scheduler in schedulers_to_test:
+            components["scheduler"] = scheduler
+            pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
+            pipe.set_progress_bar_config(disable=None)
+            pipe.to(torch_device)
+
+            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
+
+            inputs = self.get_dummy_inputs(torch_device)
+            frames_enable_free_init = pipe(**inputs).frames[0]
+            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+
+            self.assertGreater(
+                sum_enabled,
+                1e1,
+                "Enabling of FreeInit should lead to results different from the default pipeline results",
+            )
+
+    def test_free_noise_blocks(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        pipe.enable_free_noise()
+        for block in pipe.unet.down_blocks:
+            for motion_module in block.motion_modules:
+                for transformer_block in motion_module.transformer_blocks:
+                    self.assertTrue(
+                        isinstance(transformer_block, FreeNoiseTransformerBlock),
+                        "Motion module transformer blocks must be an instance of `FreeNoiseTransformerBlock` after enabling FreeNoise.",
+                    )
+
+        pipe.disable_free_noise()
+        for block in pipe.unet.down_blocks:
+            for motion_module in block.motion_modules:
+                for transformer_block in motion_module.transformer_blocks:
+                    self.assertFalse(
+                        isinstance(transformer_block, FreeNoiseTransformerBlock),
+                        "Motion module transformer blocks must not be an instance of `FreeNoiseTransformerBlock` after disabling FreeNoise.",
+                    )
+
+    def test_free_noise(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device, num_frames=16)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        for context_length in [8, 9]:
+            for context_stride in [4, 6]:
+                pipe.enable_free_noise(context_length, context_stride)
+
+                inputs_enable_free_noise = self.get_dummy_inputs(torch_device, num_frames=16)
+                frames_enable_free_noise = pipe(**inputs_enable_free_noise).frames[0]
+
+                pipe.disable_free_noise()
+
+                inputs_disable_free_noise = self.get_dummy_inputs(torch_device, num_frames=16)
+                frames_disable_free_noise = pipe(**inputs_disable_free_noise).frames[0]
+
+                sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_noise)).sum()
+                max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_noise)).max()
+                self.assertGreater(
+                    sum_enabled,
+                    1e1,
+                    "Enabling of FreeNoise should lead to results different from the default pipeline results",
+                )
+                self.assertLess(
+                    max_diff_disabled,
+                    1e-4,
+                    "Disabling of FreeNoise should lead to results similar to the default pipeline results",
+                )
+
+    def test_free_noise_multi_prompt(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        context_length = 8
+        context_stride = 4
+        pipe.enable_free_noise(context_length, context_stride)
+
+        # Make sure that pipeline works when prompt indices are within num_frames bounds
+        inputs = self.get_dummy_inputs(torch_device, num_frames=16)
+        inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf"}
+        pipe(**inputs).frames[0]
+
+        with self.assertRaises(ValueError):
+            # Ensure that prompt indices are within bounds
+            inputs = self.get_dummy_inputs(torch_device, num_frames=16)
+            inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf", 42: "Error on a leaf"}
+            pipe(**inputs).frames[0]
+
+    def test_vae_slicing(self, video_count=2):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * video_count
+        inputs["conditioning_frames"] = [inputs["conditioning_frames"]] * video_count
+        output_1 = pipe(**inputs)
+
+        # make sure sliced vae decode yields the same result
+        pipe.enable_vae_slicing()
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * video_count
+        inputs["conditioning_frames"] = [inputs["conditioning_frames"]] * video_count
+        output_2 = pipe(**inputs)
+
+        assert np.abs(output_2[0].flatten() - output_1[0].flatten()).max() < 1e-2
diff --git a/diffusers/tests/pipelines/animatediff/test_animatediff_sdxl.py b/diffusers/tests/pipelines/animatediff/test_animatediff_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..2db0139154e92ddbb08f0c83e5add02d6044ef14
--- /dev/null
+++ b/diffusers/tests/pipelines/animatediff/test_animatediff_sdxl.py
@@ -0,0 +1,307 @@
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AnimateDiffSDXLPipeline,
+    AutoencoderKL,
+    DDIMScheduler,
+    MotionAdapter,
+    UNet2DConditionModel,
+    UNetMotionModel,
+)
+from diffusers.utils import is_xformers_available, logging
+from diffusers.utils.testing_utils import torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineTesterMixin,
+    SDFunctionTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+class AnimateDiffPipelineSDXLFastTests(
+    IPAdapterTesterMixin,
+    SDFunctionTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = AnimateDiffSDXLPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64, 128),
+            layers_per_block=2,
+            time_cond_proj_dim=time_cond_proj_dim,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4, 8),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2, 4),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+            norm_num_groups=1,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="linear",
+            clip_sample=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        motion_adapter = MotionAdapter(
+            block_out_channels=(32, 64, 128),
+            motion_layers_per_block=2,
+            motion_norm_num_groups=2,
+            motion_num_attention_heads=4,
+            use_motion_mid_block=False,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "motion_adapter": motion_adapter,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_motion_unet_loading(self):
+        components = self.get_dummy_components()
+        pipe = AnimateDiffSDXLPipeline(**components)
+
+        assert isinstance(pipe.unet, UNetMotionModel)
+
+    @unittest.skip("Attention slicing is not enabled in this pipeline")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for components in pipe.components.values():
+            if hasattr(components, "set_default_attn_processor"):
+                components.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
+
+        assert output_batch[0].shape[0] == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+        assert max_diff < expected_max_diff
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_prompt_embeds(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = pipe.encode_prompt(prompt)
+
+        pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = pipe(**inputs).frames[0]
+        output_without_offload = (
+            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        )
+
+        pipe.enable_xformers_memory_efficient_attention()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = pipe(**inputs).frames[0]
+        output_with_offload = (
+            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
+        )
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
diff --git a/diffusers/tests/pipelines/animatediff/test_animatediff_sparsectrl.py b/diffusers/tests/pipelines/animatediff/test_animatediff_sparsectrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..189d6765de4fde37f773734cb0595b31b0437e8a
--- /dev/null
+++ b/diffusers/tests/pipelines/animatediff/test_animatediff_sparsectrl.py
@@ -0,0 +1,486 @@
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AnimateDiffSparseControlNetPipeline,
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    LCMScheduler,
+    MotionAdapter,
+    SparseControlNetModel,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    UNetMotionModel,
+)
+from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineTesterMixin,
+    SDFunctionTesterMixin,
+)
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+class AnimateDiffSparseControlNetPipelineFastTests(
+    IPAdapterTesterMixin, SDFunctionTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
+):
+    pipeline_class = AnimateDiffSparseControlNetPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+
+    def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            sample_size=8,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="linear",
+            clip_sample=False,
+        )
+        torch.manual_seed(0)
+        controlnet = SparseControlNetModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            in_channels=4,
+            conditioning_channels=3,
+            down_block_types=("CrossAttnDownBlockMotion", "DownBlockMotion"),
+            cross_attention_dim=cross_attention_dim,
+            conditioning_embedding_out_channels=(8, 8),
+            norm_num_groups=1,
+            use_simplified_condition_embedding=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=block_out_channels,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        motion_adapter = MotionAdapter(
+            block_out_channels=block_out_channels,
+            motion_layers_per_block=2,
+            motion_norm_num_groups=2,
+            motion_num_attention_heads=4,
+        )
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "motion_adapter": motion_adapter,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed: int = 0, num_frames: int = 2):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        video_height = 32
+        video_width = 32
+        conditioning_frames = [Image.new("RGB", (video_width, video_height))] * num_frames
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "conditioning_frames": conditioning_frames,
+            "controlnet_frame_indices": list(range(num_frames)),
+            "generator": generator,
+            "num_inference_steps": 2,
+            "num_frames": num_frames,
+            "guidance_scale": 7.5,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
+    def test_motion_unet_loading(self):
+        components = self.get_dummy_components()
+        pipe = AnimateDiffSparseControlNetPipeline(**components)
+
+        assert isinstance(pipe.unet, UNetMotionModel)
+
+    @unittest.skip("Attention slicing is not enabled in this pipeline")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array(
+                [
+                    0.6604,
+                    0.4099,
+                    0.4928,
+                    0.5706,
+                    0.5096,
+                    0.5012,
+                    0.6051,
+                    0.5169,
+                    0.5021,
+                    0.4864,
+                    0.4261,
+                    0.5779,
+                    0.5822,
+                    0.4049,
+                    0.5253,
+                    0.6160,
+                    0.4150,
+                    0.5155,
+                ]
+            )
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.6051, 0.5169, 0.5021, 0.6160, 0.4150, 0.5155])
+        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for components in pipe.components.values():
+            if hasattr(components, "set_default_attn_processor"):
+                components.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
+
+        assert output_batch[0].shape[0] == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+        assert max_diff < expected_max_diff
+
+    def test_inference_batch_single_identical_use_simplified_condition_embedding_true(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        components = self.get_dummy_components()
+
+        torch.manual_seed(0)
+        old_controlnet = components.pop("controlnet")
+        components["controlnet"] = SparseControlNetModel.from_config(
+            old_controlnet.config, conditioning_channels=4, use_simplified_condition_embedding=True
+        )
+
+        pipe = self.pipeline_class(**components)
+        for components in pipe.components.values():
+            if hasattr(components, "set_default_attn_processor"):
+                components.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
+
+        assert output_batch[0].shape[0] == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+        assert max_diff < expected_max_diff
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_prompt_embeds(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs.pop("prompt")
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
+        pipe(**inputs)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
+    def test_free_init(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        pipe.enable_free_init(
+            num_iters=2,
+            use_fast_sampling=True,
+            method="butterworth",
+            order=4,
+            spatial_stop_frequency=0.25,
+            temporal_stop_frequency=0.25,
+        )
+        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
+        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
+
+        pipe.disable_free_init()
+        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
+        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
+
+        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
+        self.assertGreater(
+            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
+        )
+        self.assertLess(
+            max_diff_disabled,
+            1e-4,
+            "Disabling of FreeInit should lead to results similar to the default pipeline results",
+        )
+
+    def test_free_init_with_schedulers(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        schedulers_to_test = [
+            DPMSolverMultistepScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                algorithm_type="dpmsolver++",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+            LCMScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+        ]
+        components.pop("scheduler")
+
+        for scheduler in schedulers_to_test:
+            components["scheduler"] = scheduler
+            pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components)
+            pipe.set_progress_bar_config(disable=None)
+            pipe.to(torch_device)
+
+            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
+
+            inputs = self.get_dummy_inputs(torch_device)
+            frames_enable_free_init = pipe(**inputs).frames[0]
+            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+
+            self.assertGreater(
+                sum_enabled,
+                1e1,
+                "Enabling of FreeInit should lead to results different from the default pipeline results",
+            )
+
+    def test_vae_slicing(self):
+        return super().test_vae_slicing(image_count=2)
diff --git a/diffusers/tests/pipelines/animatediff/test_animatediff_video2video.py b/diffusers/tests/pipelines/animatediff/test_animatediff_video2video.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3fd4c73736a4bada0120a3cadcf3ee5400923bb
--- /dev/null
+++ b/diffusers/tests/pipelines/animatediff/test_animatediff_video2video.py
@@ -0,0 +1,546 @@
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AnimateDiffVideoToVideoPipeline,
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    LCMScheduler,
+    MotionAdapter,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    UNetMotionModel,
+)
+from diffusers.models.attention import FreeNoiseTransformerBlock
+from diffusers.utils import is_xformers_available, logging
+from diffusers.utils.testing_utils import torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_PARAMS, VIDEO_TO_VIDEO_BATCH_PARAMS
+from ..test_pipelines_common import IPAdapterTesterMixin, PipelineFromPipeTesterMixin, PipelineTesterMixin
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+class AnimateDiffVideoToVideoPipelineFastTests(
+    IPAdapterTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
+):
+    pipeline_class = AnimateDiffVideoToVideoPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = VIDEO_TO_VIDEO_BATCH_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+
+    def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            sample_size=8,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="linear",
+            clip_sample=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=block_out_channels,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
+        motion_adapter = MotionAdapter(
+            block_out_channels=block_out_channels,
+            motion_layers_per_block=2,
+            motion_norm_num_groups=2,
+            motion_num_attention_heads=4,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "motion_adapter": motion_adapter,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0, num_frames: int = 2):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        video_height = 32
+        video_width = 32
+        video = [Image.new("RGB", (video_width, video_height))] * num_frames
+
+        inputs = {
+            "video": video,
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 7.5,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
+    def test_motion_unet_loading(self):
+        components = self.get_dummy_components()
+        pipe = AnimateDiffVideoToVideoPipeline(**components)
+
+        assert isinstance(pipe.unet, UNetMotionModel)
+
+    @unittest.skip("Attention slicing is not enabled in this pipeline")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array(
+                [
+                    0.5569,
+                    0.6250,
+                    0.4145,
+                    0.5613,
+                    0.5563,
+                    0.5213,
+                    0.5092,
+                    0.4950,
+                    0.4950,
+                    0.5685,
+                    0.3858,
+                    0.4864,
+                    0.6458,
+                    0.4312,
+                    0.5518,
+                    0.5608,
+                    0.4418,
+                    0.5378,
+                ]
+            )
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for components in pipe.components.values():
+            if hasattr(components, "set_default_attn_processor"):
+                components.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
+
+        assert output_batch[0].shape[0] == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+        assert max_diff < expected_max_diff
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_prompt_embeds(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs.pop("prompt")
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
+        pipe(**inputs)
+
+    def test_latent_inputs(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        sample_size = pipe.unet.config.sample_size
+        inputs["latents"] = torch.randn((1, 4, 1, sample_size, sample_size), device=torch_device)
+        inputs.pop("video")
+        pipe(**inputs)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = pipe(**inputs).frames[0]
+        output_without_offload = (
+            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        )
+
+        pipe.enable_xformers_memory_efficient_attention()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = pipe(**inputs).frames[0]
+        output_with_offload = (
+            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
+        )
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
+
+    def test_free_init(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        pipe.enable_free_init(
+            num_iters=2,
+            use_fast_sampling=True,
+            method="butterworth",
+            order=4,
+            spatial_stop_frequency=0.25,
+            temporal_stop_frequency=0.25,
+        )
+        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
+        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
+
+        pipe.disable_free_init()
+        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
+        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
+
+        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
+        self.assertGreater(
+            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
+        )
+        self.assertLess(
+            max_diff_disabled,
+            1e-4,
+            "Disabling of FreeInit should lead to results similar to the default pipeline results",
+        )
+
+    def test_free_init_with_schedulers(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        schedulers_to_test = [
+            DPMSolverMultistepScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                algorithm_type="dpmsolver++",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+            LCMScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+        ]
+        components.pop("scheduler")
+
+        for scheduler in schedulers_to_test:
+            components["scheduler"] = scheduler
+            pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components)
+            pipe.set_progress_bar_config(disable=None)
+            pipe.to(torch_device)
+
+            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
+
+            inputs = self.get_dummy_inputs(torch_device)
+            frames_enable_free_init = pipe(**inputs).frames[0]
+            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+
+            self.assertGreater(
+                sum_enabled,
+                1e1,
+                "Enabling of FreeInit should lead to results different from the default pipeline results",
+            )
+
+    def test_free_noise_blocks(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        pipe.enable_free_noise()
+        for block in pipe.unet.down_blocks:
+            for motion_module in block.motion_modules:
+                for transformer_block in motion_module.transformer_blocks:
+                    self.assertTrue(
+                        isinstance(transformer_block, FreeNoiseTransformerBlock),
+                        "Motion module transformer blocks must be an instance of `FreeNoiseTransformerBlock` after enabling FreeNoise.",
+                    )
+
+        pipe.disable_free_noise()
+        for block in pipe.unet.down_blocks:
+            for motion_module in block.motion_modules:
+                for transformer_block in motion_module.transformer_blocks:
+                    self.assertFalse(
+                        isinstance(transformer_block, FreeNoiseTransformerBlock),
+                        "Motion module transformer blocks must not be an instance of `FreeNoiseTransformerBlock` after disabling FreeNoise.",
+                    )
+
+    def test_free_noise(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device, num_frames=16)
+        inputs_normal["num_inference_steps"] = 2
+        inputs_normal["strength"] = 0.5
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        for context_length in [8, 9]:
+            for context_stride in [4, 6]:
+                pipe.enable_free_noise(context_length, context_stride)
+
+                inputs_enable_free_noise = self.get_dummy_inputs(torch_device, num_frames=16)
+                inputs_enable_free_noise["num_inference_steps"] = 2
+                inputs_enable_free_noise["strength"] = 0.5
+                frames_enable_free_noise = pipe(**inputs_enable_free_noise).frames[0]
+
+                pipe.disable_free_noise()
+                inputs_disable_free_noise = self.get_dummy_inputs(torch_device, num_frames=16)
+                inputs_disable_free_noise["num_inference_steps"] = 2
+                inputs_disable_free_noise["strength"] = 0.5
+                frames_disable_free_noise = pipe(**inputs_disable_free_noise).frames[0]
+
+                sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_noise)).sum()
+                max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_noise)).max()
+                self.assertGreater(
+                    sum_enabled,
+                    1e1,
+                    "Enabling of FreeNoise should lead to results different from the default pipeline results",
+                )
+                self.assertLess(
+                    max_diff_disabled,
+                    1e-4,
+                    "Disabling of FreeNoise should lead to results similar to the default pipeline results",
+                )
+
+    def test_free_noise_split_inference(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        pipe.enable_free_noise(8, 4)
+
+        inputs_normal = self.get_dummy_inputs(torch_device, num_frames=16)
+        inputs_normal["num_inference_steps"] = 2
+        inputs_normal["strength"] = 0.5
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        # Test FreeNoise with split inference memory-optimization
+        pipe.enable_free_noise_split_inference(spatial_split_size=16, temporal_split_size=4)
+
+        inputs_enable_split_inference = self.get_dummy_inputs(torch_device, num_frames=16)
+        inputs_enable_split_inference["num_inference_steps"] = 2
+        inputs_enable_split_inference["strength"] = 0.5
+        frames_enable_split_inference = pipe(**inputs_enable_split_inference).frames[0]
+
+        sum_split_inference = np.abs(to_np(frames_normal) - to_np(frames_enable_split_inference)).sum()
+        self.assertLess(
+            sum_split_inference,
+            1e-4,
+            "Enabling FreeNoise Split Inference memory-optimizations should lead to results similar to the default pipeline results",
+        )
+
+    def test_free_noise_multi_prompt(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        context_length = 8
+        context_stride = 4
+        pipe.enable_free_noise(context_length, context_stride)
+
+        # Make sure that pipeline works when prompt indices are within num_frames bounds
+        inputs = self.get_dummy_inputs(torch_device, num_frames=16)
+        inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf"}
+        inputs["num_inference_steps"] = 2
+        inputs["strength"] = 0.5
+        pipe(**inputs).frames[0]
+
+        with self.assertRaises(ValueError):
+            # Ensure that prompt indices are within bounds
+            inputs = self.get_dummy_inputs(torch_device, num_frames=16)
+            inputs["num_inference_steps"] = 2
+            inputs["strength"] = 0.5
+            inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf", 42: "Error on a leaf"}
+            pipe(**inputs).frames[0]
diff --git a/diffusers/tests/pipelines/animatediff/test_animatediff_video2video_controlnet.py b/diffusers/tests/pipelines/animatediff/test_animatediff_video2video_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e598e67ec11d8c9a9a1ea3c02aa5b44a3cd183f
--- /dev/null
+++ b/diffusers/tests/pipelines/animatediff/test_animatediff_video2video_controlnet.py
@@ -0,0 +1,535 @@
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AnimateDiffVideoToVideoControlNetPipeline,
+    AutoencoderKL,
+    ControlNetModel,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    LCMScheduler,
+    MotionAdapter,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    UNetMotionModel,
+)
+from diffusers.models.attention import FreeNoiseTransformerBlock
+from diffusers.utils import is_xformers_available, logging
+from diffusers.utils.testing_utils import torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_PARAMS, VIDEO_TO_VIDEO_BATCH_PARAMS
+from ..test_pipelines_common import IPAdapterTesterMixin, PipelineFromPipeTesterMixin, PipelineTesterMixin
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+class AnimateDiffVideoToVideoControlNetPipelineFastTests(
+    IPAdapterTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
+):
+    pipeline_class = AnimateDiffVideoToVideoControlNetPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = VIDEO_TO_VIDEO_BATCH_PARAMS.union({"conditioning_frames"})
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+
+    def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            sample_size=8,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="linear",
+            clip_sample=False,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            conditioning_embedding_out_channels=(8, 8),
+            norm_num_groups=1,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=block_out_channels,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
+        motion_adapter = MotionAdapter(
+            block_out_channels=block_out_channels,
+            motion_layers_per_block=2,
+            motion_norm_num_groups=2,
+            motion_num_attention_heads=4,
+        )
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "motion_adapter": motion_adapter,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0, num_frames: int = 2):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        video_height = 32
+        video_width = 32
+        video = [Image.new("RGB", (video_width, video_height))] * num_frames
+
+        video_height = 32
+        video_width = 32
+        conditioning_frames = [Image.new("RGB", (video_width, video_height))] * num_frames
+
+        inputs = {
+            "video": video,
+            "conditioning_frames": conditioning_frames,
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 7.5,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
+    def test_motion_unet_loading(self):
+        components = self.get_dummy_components()
+        pipe = AnimateDiffVideoToVideoControlNetPipeline(**components)
+
+        assert isinstance(pipe.unet, UNetMotionModel)
+
+    @unittest.skip("Attention slicing is not enabled in this pipeline")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array(
+                [
+                    0.5569,
+                    0.6250,
+                    0.4144,
+                    0.5613,
+                    0.5563,
+                    0.5213,
+                    0.5091,
+                    0.4950,
+                    0.4950,
+                    0.5684,
+                    0.3858,
+                    0.4863,
+                    0.6457,
+                    0.4311,
+                    0.5517,
+                    0.5608,
+                    0.4417,
+                    0.5377,
+                ]
+            )
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for components in pipe.components.values():
+            if hasattr(components, "set_default_attn_processor"):
+                components.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
+
+        assert output_batch[0].shape[0] == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+        assert max_diff < expected_max_diff
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_prompt_embeds(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs.pop("prompt")
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
+        pipe(**inputs)
+
+    def test_latent_inputs(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        sample_size = pipe.unet.config.sample_size
+        num_frames = len(inputs["conditioning_frames"])
+        inputs["latents"] = torch.randn((1, 4, num_frames, sample_size, sample_size), device=torch_device)
+        inputs.pop("video")
+        pipe(**inputs)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = pipe(**inputs).frames[0]
+        output_without_offload = (
+            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        )
+
+        pipe.enable_xformers_memory_efficient_attention()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = pipe(**inputs).frames[0]
+        output_with_offload = (
+            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
+        )
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
+
+    def test_free_init(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        pipe.enable_free_init(
+            num_iters=2,
+            use_fast_sampling=True,
+            method="butterworth",
+            order=4,
+            spatial_stop_frequency=0.25,
+            temporal_stop_frequency=0.25,
+        )
+        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
+        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
+
+        pipe.disable_free_init()
+        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
+        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
+
+        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
+        self.assertGreater(
+            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
+        )
+        self.assertLess(
+            max_diff_disabled,
+            1e-4,
+            "Disabling of FreeInit should lead to results similar to the default pipeline results",
+        )
+
+    def test_free_init_with_schedulers(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        schedulers_to_test = [
+            DPMSolverMultistepScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                algorithm_type="dpmsolver++",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+            LCMScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+        ]
+        components.pop("scheduler")
+
+        for scheduler in schedulers_to_test:
+            components["scheduler"] = scheduler
+            pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
+            pipe.set_progress_bar_config(disable=None)
+            pipe.to(torch_device)
+
+            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
+
+            inputs = self.get_dummy_inputs(torch_device)
+            frames_enable_free_init = pipe(**inputs).frames[0]
+            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+
+            self.assertGreater(
+                sum_enabled,
+                1e1,
+                "Enabling of FreeInit should lead to results different from the default pipeline results",
+            )
+
+    def test_free_noise_blocks(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        pipe.enable_free_noise()
+        for block in pipe.unet.down_blocks:
+            for motion_module in block.motion_modules:
+                for transformer_block in motion_module.transformer_blocks:
+                    self.assertTrue(
+                        isinstance(transformer_block, FreeNoiseTransformerBlock),
+                        "Motion module transformer blocks must be an instance of `FreeNoiseTransformerBlock` after enabling FreeNoise.",
+                    )
+
+        pipe.disable_free_noise()
+        for block in pipe.unet.down_blocks:
+            for motion_module in block.motion_modules:
+                for transformer_block in motion_module.transformer_blocks:
+                    self.assertFalse(
+                        isinstance(transformer_block, FreeNoiseTransformerBlock),
+                        "Motion module transformer blocks must not be an instance of `FreeNoiseTransformerBlock` after disabling FreeNoise.",
+                    )
+
+    def test_free_noise(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device, num_frames=16)
+        inputs_normal["num_inference_steps"] = 2
+        inputs_normal["strength"] = 0.5
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        for context_length in [8, 9]:
+            for context_stride in [4, 6]:
+                pipe.enable_free_noise(context_length, context_stride)
+
+                inputs_enable_free_noise = self.get_dummy_inputs(torch_device, num_frames=16)
+                inputs_enable_free_noise["num_inference_steps"] = 2
+                inputs_enable_free_noise["strength"] = 0.5
+                frames_enable_free_noise = pipe(**inputs_enable_free_noise).frames[0]
+
+                pipe.disable_free_noise()
+                inputs_disable_free_noise = self.get_dummy_inputs(torch_device, num_frames=16)
+                inputs_disable_free_noise["num_inference_steps"] = 2
+                inputs_disable_free_noise["strength"] = 0.5
+                frames_disable_free_noise = pipe(**inputs_disable_free_noise).frames[0]
+
+                sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_noise)).sum()
+                max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_noise)).max()
+                self.assertGreater(
+                    sum_enabled,
+                    1e1,
+                    "Enabling of FreeNoise should lead to results different from the default pipeline results",
+                )
+                self.assertLess(
+                    max_diff_disabled,
+                    1e-4,
+                    "Disabling of FreeNoise should lead to results similar to the default pipeline results",
+                )
+
+    def test_free_noise_multi_prompt(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        context_length = 8
+        context_stride = 4
+        pipe.enable_free_noise(context_length, context_stride)
+
+        # Make sure that pipeline works when prompt indices are within num_frames bounds
+        inputs = self.get_dummy_inputs(torch_device, num_frames=16)
+        inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf"}
+        inputs["num_inference_steps"] = 2
+        inputs["strength"] = 0.5
+        pipe(**inputs).frames[0]
+
+        with self.assertRaises(ValueError):
+            # Ensure that prompt indices are within bounds
+            inputs = self.get_dummy_inputs(torch_device, num_frames=16)
+            inputs["num_inference_steps"] = 2
+            inputs["strength"] = 0.5
+            inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf", 42: "Error on a leaf"}
+            pipe(**inputs).frames[0]
diff --git a/diffusers/tests/pipelines/audioldm/__init__.py b/diffusers/tests/pipelines/audioldm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/audioldm/test_audioldm.py b/diffusers/tests/pipelines/audioldm/test_audioldm.py
new file mode 100644
index 0000000000000000000000000000000000000000..eddab54a3c039486ea66f1595010e224b044662e
--- /dev/null
+++ b/diffusers/tests/pipelines/audioldm/test_audioldm.py
@@ -0,0 +1,459 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import (
+    ClapTextConfig,
+    ClapTextModelWithProjection,
+    RobertaTokenizer,
+    SpeechT5HifiGan,
+    SpeechT5HifiGanConfig,
+)
+
+from diffusers import (
+    AudioLDMPipeline,
+    AutoencoderKL,
+    DDIMScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.utils import is_xformers_available
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, torch_device
+
+from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = AudioLDMPipeline
+    params = TEXT_TO_AUDIO_PARAMS
+    batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "num_waveforms_per_prompt",
+            "generator",
+            "latents",
+            "output_type",
+            "return_dict",
+            "callback",
+            "callback_steps",
+        ]
+    )
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(8, 16),
+            layers_per_block=1,
+            norm_num_groups=8,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=(8, 16),
+            class_embed_type="simple_projection",
+            projection_class_embeddings_input_dim=8,
+            class_embeddings_concat=True,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[8, 16],
+            in_channels=1,
+            out_channels=1,
+            norm_num_groups=8,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = ClapTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=8,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=1,
+            num_hidden_layers=1,
+            pad_token_id=1,
+            vocab_size=1000,
+            projection_dim=8,
+        )
+        text_encoder = ClapTextModelWithProjection(text_encoder_config)
+        tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
+
+        vocoder_config = SpeechT5HifiGanConfig(
+            model_in_dim=8,
+            sampling_rate=16000,
+            upsample_initial_channel=16,
+            upsample_rates=[2, 2],
+            upsample_kernel_sizes=[4, 4],
+            resblock_kernel_sizes=[3, 7],
+            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
+            normalize_before=False,
+        )
+
+        vocoder = SpeechT5HifiGan(vocoder_config)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "vocoder": vocoder,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A hammer hitting a wooden surface",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+        }
+        return inputs
+
+    def test_audioldm_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDMPipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = audioldm_pipe(**inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 256
+
+        audio_slice = audio[:10]
+        expected_slice = np.array(
+            [-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033]
+        )
+
+        assert np.abs(audio_slice - expected_slice).max() < 1e-2
+
+    def test_audioldm_prompt_embeds(self):
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDMPipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = audioldm_pipe(**inputs)
+        audio_1 = output.audios[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        text_inputs = audioldm_pipe.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=audioldm_pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_inputs = text_inputs["input_ids"].to(torch_device)
+
+        prompt_embeds = audioldm_pipe.text_encoder(
+            text_inputs,
+        )
+        prompt_embeds = prompt_embeds.text_embeds
+        # additional L_2 normalization over each hidden-state
+        prompt_embeds = F.normalize(prompt_embeds, dim=-1)
+
+        inputs["prompt_embeds"] = prompt_embeds
+
+        # forward
+        output = audioldm_pipe(**inputs)
+        audio_2 = output.audios[0]
+
+        assert np.abs(audio_1 - audio_2).max() < 1e-2
+
+    def test_audioldm_negative_prompt_embeds(self):
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDMPipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = audioldm_pipe(**inputs)
+        audio_1 = output.audios[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        embeds = []
+        for p in [prompt, negative_prompt]:
+            text_inputs = audioldm_pipe.tokenizer(
+                p,
+                padding="max_length",
+                max_length=audioldm_pipe.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_inputs = text_inputs["input_ids"].to(torch_device)
+
+            text_embeds = audioldm_pipe.text_encoder(
+                text_inputs,
+            )
+            text_embeds = text_embeds.text_embeds
+            # additional L_2 normalization over each hidden-state
+            text_embeds = F.normalize(text_embeds, dim=-1)
+
+            embeds.append(text_embeds)
+
+        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
+
+        # forward
+        output = audioldm_pipe(**inputs)
+        audio_2 = output.audios[0]
+
+        assert np.abs(audio_1 - audio_2).max() < 1e-2
+
+    def test_audioldm_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
+        audioldm_pipe = AudioLDMPipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "egg cracking"
+        output = audioldm_pipe(**inputs, negative_prompt=negative_prompt)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 256
+
+        audio_slice = audio[:10]
+        expected_slice = np.array(
+            [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]
+        )
+
+        assert np.abs(audio_slice - expected_slice).max() < 1e-2
+
+    def test_audioldm_num_waveforms_per_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
+        audioldm_pipe = AudioLDMPipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A hammer hitting a wooden surface"
+
+        # test num_waveforms_per_prompt=1 (default)
+        audios = audioldm_pipe(prompt, num_inference_steps=2).audios
+
+        assert audios.shape == (1, 256)
+
+        # test num_waveforms_per_prompt=1 (default) for batch of prompts
+        batch_size = 2
+        audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios
+
+        assert audios.shape == (batch_size, 256)
+
+        # test num_waveforms_per_prompt for single prompt
+        num_waveforms_per_prompt = 2
+        audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios
+
+        assert audios.shape == (num_waveforms_per_prompt, 256)
+
+        # test num_waveforms_per_prompt for batch of prompts
+        batch_size = 2
+        audios = audioldm_pipe(
+            [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
+        ).audios
+
+        assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)
+
+    def test_audioldm_audio_length_in_s(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDMPipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+        vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate
+
+        inputs = self.get_dummy_inputs(device)
+        output = audioldm_pipe(audio_length_in_s=0.016, **inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) / vocoder_sampling_rate == 0.016
+
+        output = audioldm_pipe(audio_length_in_s=0.032, **inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) / vocoder_sampling_rate == 0.032
+
+    def test_audioldm_vocoder_model_in_dim(self):
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDMPipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        prompt = ["hey"]
+
+        output = audioldm_pipe(prompt, num_inference_steps=1)
+        audio_shape = output.audios.shape
+        assert audio_shape == (1, 256)
+
+        config = audioldm_pipe.vocoder.config
+        config.model_in_dim *= 2
+        audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device)
+        output = audioldm_pipe(prompt, num_inference_steps=1)
+        audio_shape = output.audios.shape
+        # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram
+        assert audio_shape == (1, 256)
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical()
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
+
+@nightly
+class AudioLDMPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "A hammer hitting a wooden surface",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 2.5,
+        }
+        return inputs
+
+    def test_audioldm(self):
+        audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 25
+        audio = audioldm_pipe(**inputs).audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 81920
+
+        audio_slice = audio[77230:77240]
+        expected_slice = np.array(
+            [-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315]
+        )
+        max_diff = np.abs(expected_slice - audio_slice).max()
+        assert max_diff < 1e-2
+
+
+@nightly
+class AudioLDMPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "A hammer hitting a wooden surface",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 2.5,
+        }
+        return inputs
+
+    def test_audioldm_lms(self):
+        audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
+        audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        audio = audioldm_pipe(**inputs).audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 81920
+
+        audio_slice = audio[27780:27790]
+        expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212])
+        max_diff = np.abs(expected_slice - audio_slice).max()
+        assert max_diff < 3e-2
diff --git a/diffusers/tests/pipelines/audioldm2/__init__.py b/diffusers/tests/pipelines/audioldm2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/audioldm2/test_audioldm2.py b/diffusers/tests/pipelines/audioldm2/test_audioldm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb550dd3219dbc12d2029d48118bc0a0c55cb862
--- /dev/null
+++ b/diffusers/tests/pipelines/audioldm2/test_audioldm2.py
@@ -0,0 +1,626 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    ClapAudioConfig,
+    ClapConfig,
+    ClapFeatureExtractor,
+    ClapModel,
+    ClapTextConfig,
+    GPT2Config,
+    GPT2Model,
+    RobertaTokenizer,
+    SpeechT5HifiGan,
+    SpeechT5HifiGanConfig,
+    T5Config,
+    T5EncoderModel,
+    T5Tokenizer,
+)
+
+from diffusers import (
+    AudioLDM2Pipeline,
+    AudioLDM2ProjectionModel,
+    AudioLDM2UNet2DConditionModel,
+    AutoencoderKL,
+    DDIMScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, torch_device
+
+from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = AudioLDM2Pipeline
+    params = TEXT_TO_AUDIO_PARAMS
+    batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "num_waveforms_per_prompt",
+            "generator",
+            "latents",
+            "output_type",
+            "return_dict",
+            "callback",
+            "callback_steps",
+        ]
+    )
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = AudioLDM2UNet2DConditionModel(
+            block_out_channels=(8, 16),
+            layers_per_block=1,
+            norm_num_groups=8,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=(8, 16),
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[8, 16],
+            in_channels=1,
+            out_channels=1,
+            norm_num_groups=8,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_branch_config = ClapTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=8,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=1,
+            num_hidden_layers=1,
+            pad_token_id=1,
+            vocab_size=1000,
+            projection_dim=8,
+        )
+        audio_branch_config = ClapAudioConfig(
+            spec_size=8,
+            window_size=4,
+            num_mel_bins=8,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            depths=[1, 1],
+            num_attention_heads=[1, 1],
+            num_hidden_layers=1,
+            hidden_size=192,
+            projection_dim=8,
+            patch_size=2,
+            patch_stride=2,
+            patch_embed_input_channels=4,
+        )
+        text_encoder_config = ClapConfig.from_text_audio_configs(
+            text_config=text_branch_config,
+            audio_config=audio_branch_config,
+            projection_dim=16,
+        )
+        text_encoder = ClapModel(text_encoder_config)
+        tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
+        feature_extractor = ClapFeatureExtractor.from_pretrained(
+            "hf-internal-testing/tiny-random-ClapModel", hop_length=7900
+        )
+
+        torch.manual_seed(0)
+        text_encoder_2_config = T5Config(
+            vocab_size=32100,
+            d_model=32,
+            d_ff=37,
+            d_kv=8,
+            num_heads=1,
+            num_layers=1,
+        )
+        text_encoder_2 = T5EncoderModel(text_encoder_2_config)
+        tokenizer_2 = T5Tokenizer.from_pretrained("hf-internal-testing/tiny-random-T5Model", model_max_length=77)
+
+        torch.manual_seed(0)
+        language_model_config = GPT2Config(
+            n_embd=16,
+            n_head=1,
+            n_layer=1,
+            vocab_size=1000,
+            n_ctx=99,
+            n_positions=99,
+        )
+        language_model = GPT2Model(language_model_config)
+        language_model.config.max_new_tokens = 8
+
+        torch.manual_seed(0)
+        projection_model = AudioLDM2ProjectionModel(
+            text_encoder_dim=16,
+            text_encoder_1_dim=32,
+            langauge_model_dim=16,
+        )
+
+        vocoder_config = SpeechT5HifiGanConfig(
+            model_in_dim=8,
+            sampling_rate=16000,
+            upsample_initial_channel=16,
+            upsample_rates=[2, 2],
+            upsample_kernel_sizes=[4, 4],
+            resblock_kernel_sizes=[3, 7],
+            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
+            normalize_before=False,
+        )
+
+        vocoder = SpeechT5HifiGan(vocoder_config)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "feature_extractor": feature_extractor,
+            "language_model": language_model,
+            "projection_model": projection_model,
+            "vocoder": vocoder,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A hammer hitting a wooden surface",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+        }
+        return inputs
+
+    def test_audioldm2_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDM2Pipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = audioldm_pipe(**inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 256
+
+        audio_slice = audio[:10]
+        expected_slice = np.array(
+            [
+                2.602e-03,
+                1.729e-03,
+                1.863e-03,
+                -2.219e-03,
+                -2.656e-03,
+                -2.017e-03,
+                -2.648e-03,
+                -2.115e-03,
+                -2.502e-03,
+                -2.081e-03,
+            ]
+        )
+
+        assert np.abs(audio_slice - expected_slice).max() < 1e-4
+
+    def test_audioldm2_prompt_embeds(self):
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDM2Pipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = audioldm_pipe(**inputs)
+        audio_1 = output.audios[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        text_inputs = audioldm_pipe.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=audioldm_pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_inputs = text_inputs["input_ids"].to(torch_device)
+
+        clap_prompt_embeds = audioldm_pipe.text_encoder.get_text_features(text_inputs)
+        clap_prompt_embeds = clap_prompt_embeds[:, None, :]
+
+        text_inputs = audioldm_pipe.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=True,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_inputs = text_inputs["input_ids"].to(torch_device)
+
+        t5_prompt_embeds = audioldm_pipe.text_encoder_2(
+            text_inputs,
+        )
+        t5_prompt_embeds = t5_prompt_embeds[0]
+
+        projection_embeds = audioldm_pipe.projection_model(clap_prompt_embeds, t5_prompt_embeds)[0]
+        generated_prompt_embeds = audioldm_pipe.generate_language_model(projection_embeds, max_new_tokens=8)
+
+        inputs["prompt_embeds"] = t5_prompt_embeds
+        inputs["generated_prompt_embeds"] = generated_prompt_embeds
+
+        # forward
+        output = audioldm_pipe(**inputs)
+        audio_2 = output.audios[0]
+
+        assert np.abs(audio_1 - audio_2).max() < 1e-2
+
+    def test_audioldm2_negative_prompt_embeds(self):
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDM2Pipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = audioldm_pipe(**inputs)
+        audio_1 = output.audios[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        embeds = []
+        generated_embeds = []
+        for p in [prompt, negative_prompt]:
+            text_inputs = audioldm_pipe.tokenizer(
+                p,
+                padding="max_length",
+                max_length=audioldm_pipe.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_inputs = text_inputs["input_ids"].to(torch_device)
+
+            clap_prompt_embeds = audioldm_pipe.text_encoder.get_text_features(text_inputs)
+            clap_prompt_embeds = clap_prompt_embeds[:, None, :]
+
+            text_inputs = audioldm_pipe.tokenizer_2(
+                prompt,
+                padding="max_length",
+                max_length=True if len(embeds) == 0 else embeds[0].shape[1],
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_inputs = text_inputs["input_ids"].to(torch_device)
+
+            t5_prompt_embeds = audioldm_pipe.text_encoder_2(
+                text_inputs,
+            )
+            t5_prompt_embeds = t5_prompt_embeds[0]
+
+            projection_embeds = audioldm_pipe.projection_model(clap_prompt_embeds, t5_prompt_embeds)[0]
+            generated_prompt_embeds = audioldm_pipe.generate_language_model(projection_embeds, max_new_tokens=8)
+
+            embeds.append(t5_prompt_embeds)
+            generated_embeds.append(generated_prompt_embeds)
+
+        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
+        inputs["generated_prompt_embeds"], inputs["negative_generated_prompt_embeds"] = generated_embeds
+
+        # forward
+        output = audioldm_pipe(**inputs)
+        audio_2 = output.audios[0]
+
+        assert np.abs(audio_1 - audio_2).max() < 1e-2
+
+    def test_audioldm2_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
+        audioldm_pipe = AudioLDM2Pipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "egg cracking"
+        output = audioldm_pipe(**inputs, negative_prompt=negative_prompt)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 256
+
+        audio_slice = audio[:10]
+        expected_slice = np.array(
+            [0.0026, 0.0017, 0.0018, -0.0022, -0.0026, -0.002, -0.0026, -0.0021, -0.0025, -0.0021]
+        )
+
+        assert np.abs(audio_slice - expected_slice).max() < 1e-4
+
+    def test_audioldm2_num_waveforms_per_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
+        audioldm_pipe = AudioLDM2Pipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A hammer hitting a wooden surface"
+
+        # test num_waveforms_per_prompt=1 (default)
+        audios = audioldm_pipe(prompt, num_inference_steps=2).audios
+
+        assert audios.shape == (1, 256)
+
+        # test num_waveforms_per_prompt=1 (default) for batch of prompts
+        batch_size = 2
+        audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios
+
+        assert audios.shape == (batch_size, 256)
+
+        # test num_waveforms_per_prompt for single prompt
+        num_waveforms_per_prompt = 1
+        audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios
+
+        assert audios.shape == (num_waveforms_per_prompt, 256)
+
+        # test num_waveforms_per_prompt for batch of prompts
+        batch_size = 2
+        audios = audioldm_pipe(
+            [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
+        ).audios
+
+        assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)
+
+    def test_audioldm2_audio_length_in_s(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDM2Pipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+        vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate
+
+        inputs = self.get_dummy_inputs(device)
+        output = audioldm_pipe(audio_length_in_s=0.016, **inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) / vocoder_sampling_rate == 0.016
+
+        output = audioldm_pipe(audio_length_in_s=0.032, **inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) / vocoder_sampling_rate == 0.032
+
+    def test_audioldm2_vocoder_model_in_dim(self):
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDM2Pipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        prompt = ["hey"]
+
+        output = audioldm_pipe(prompt, num_inference_steps=1)
+        audio_shape = output.audios.shape
+        assert audio_shape == (1, 256)
+
+        config = audioldm_pipe.vocoder.config
+        config.model_in_dim *= 2
+        audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device)
+        output = audioldm_pipe(prompt, num_inference_steps=1)
+        audio_shape = output.audios.shape
+        # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram
+        assert audio_shape == (1, 256)
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
+
+    @unittest.skip("Raises a not implemented error in AudioLDM2")
+    def test_xformers_attention_forwardGenerator_pass(self):
+        pass
+
+    def test_dict_tuple_outputs_equivalent(self):
+        # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=2e-4)
+
+    def test_inference_batch_single_identical(self):
+        # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
+        self._test_inference_batch_single_identical(expected_max_diff=2e-4)
+
+    def test_save_load_local(self):
+        # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
+        super().test_save_load_local(expected_max_difference=2e-4)
+
+    def test_save_load_optional_components(self):
+        # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
+        super().test_save_load_optional_components(expected_max_difference=2e-4)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # The method component.dtype returns the dtype of the first parameter registered in the model, not the
+        # dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale)
+        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
+
+        # Without the logit scale parameters, everything is float32
+        model_dtypes.pop("text_encoder")
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
+
+        # the CLAP sub-models are float32
+        model_dtypes["clap_text_branch"] = components["text_encoder"].text_model.dtype
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
+
+        # Once we send to fp16, all params are in half-precision, including the logit scale
+        pipe.to(dtype=torch.float16)
+        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        pass
+
+
+@nightly
+class AudioLDM2PipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "A hammer hitting a wooden surface",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 2.5,
+        }
+        return inputs
+
+    def get_inputs_tts(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "A men saying",
+            "transcription": "hello my name is John",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 2.5,
+        }
+        return inputs
+
+    def test_audioldm2(self):
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 25
+        audio = audioldm_pipe(**inputs).audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 81952
+
+        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
+        audio_slice = audio[17275:17285]
+        expected_slice = np.array([0.0791, 0.0666, 0.1158, 0.1227, 0.1171, -0.2880, -0.1940, -0.0283, -0.0126, 0.1127])
+        max_diff = np.abs(expected_slice - audio_slice).max()
+        assert max_diff < 1e-3
+
+    def test_audioldm2_lms(self):
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
+        audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        audio = audioldm_pipe(**inputs).audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 81952
+
+        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
+        audio_slice = audio[31390:31400]
+        expected_slice = np.array(
+            [-0.1318, -0.0577, 0.0446, -0.0573, 0.0659, 0.1074, -0.2600, 0.0080, -0.2190, -0.4301]
+        )
+        max_diff = np.abs(expected_slice - audio_slice).max()
+        assert max_diff < 1e-3
+
+    def test_audioldm2_large(self):
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-large")
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        audio = audioldm_pipe(**inputs).audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 81952
+
+        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
+        audio_slice = audio[8825:8835]
+        expected_slice = np.array(
+            [-0.1829, -0.1461, 0.0759, -0.1493, -0.1396, 0.5783, 0.3001, -0.3038, -0.0639, -0.2244]
+        )
+        max_diff = np.abs(expected_slice - audio_slice).max()
+        assert max_diff < 1e-3
+
+    def test_audioldm2_tts(self):
+        audioldm_tts_pipe = AudioLDM2Pipeline.from_pretrained("anhnct/audioldm2_gigaspeech")
+        audioldm_tts_pipe = audioldm_tts_pipe.to(torch_device)
+        audioldm_tts_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs_tts(torch_device)
+        audio = audioldm_tts_pipe(**inputs).audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 81952
+
+        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
+        audio_slice = audio[8825:8835]
+        expected_slice = np.array(
+            [-0.1829, -0.1461, 0.0759, -0.1493, -0.1396, 0.5783, 0.3001, -0.3038, -0.0639, -0.2244]
+        )
+        max_diff = np.abs(expected_slice - audio_slice).max()
+        assert max_diff < 1e-3
diff --git a/diffusers/tests/pipelines/aura_flow/__init__.py b/diffusers/tests/pipelines/aura_flow/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/aura_flow/test_pipeline_aura_flow.py b/diffusers/tests/pipelines/aura_flow/test_pipeline_aura_flow.py
new file mode 100644
index 0000000000000000000000000000000000000000..14bc588df905c1fd8832f1e003a27d4d7c36788f
--- /dev/null
+++ b/diffusers/tests/pipelines/aura_flow/test_pipeline_aura_flow.py
@@ -0,0 +1,169 @@
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, UMT5EncoderModel
+
+from diffusers import AuraFlowPipeline, AuraFlowTransformer2DModel, AutoencoderKL, FlowMatchEulerDiscreteScheduler
+from diffusers.utils.testing_utils import (
+    torch_device,
+)
+
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fusion_matches_attn_procs_length,
+    check_qkv_fusion_processors_exist,
+)
+
+
+class AuraFlowPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = AuraFlowPipeline
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "guidance_scale",
+            "negative_prompt",
+            "prompt_embeds",
+            "negative_prompt_embeds",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt"])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = AuraFlowTransformer2DModel(
+            sample_size=32,
+            patch_size=2,
+            in_channels=4,
+            num_mmdit_layers=1,
+            num_single_dit_layers=1,
+            attention_head_dim=8,
+            num_attention_heads=4,
+            caption_projection_dim=32,
+            joint_attention_dim=32,
+            out_channels=4,
+            pos_embed_max_size=256,
+        )
+
+        text_encoder = UMT5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-umt5")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=32,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "transformer": transformer,
+            "vae": vae,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "height": None,
+            "width": None,
+        }
+        return inputs
+
+    def test_aura_flow_prompt_embeds(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        output_with_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        do_classifier_free_guidance = inputs["guidance_scale"] > 1
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = pipe.encode_prompt(
+            prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            device=torch_device,
+        )
+        output_with_embeds = pipe(
+            prompt_embeds=prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            **inputs,
+        ).images[0]
+
+        max_diff = np.abs(output_with_prompt - output_with_embeds).max()
+        assert max_diff < 1e-4
+
+    def test_attention_slicing_forward_pass(self):
+        # Attention slicing needs to implemented differently for this because how single DiT and MMDiT
+        # blocks interfere with each other.
+        return
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
+        # to the pipeline level.
+        pipe.transformer.fuse_qkv_projections()
+        assert check_qkv_fusion_processors_exist(
+            pipe.transformer
+        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+        assert check_qkv_fusion_matches_attn_procs_length(
+            pipe.transformer, pipe.transformer.original_attn_processors
+        ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_fused = image[0, -3:, -3:, -1]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_disabled = image[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+    @unittest.skip("xformers attention processor does not exist for AuraFlow")
+    def test_xformers_attention_forwardGenerator_pass(self):
+        pass
diff --git a/diffusers/tests/pipelines/blipdiffusion/__init__.py b/diffusers/tests/pipelines/blipdiffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/blipdiffusion/test_blipdiffusion.py b/diffusers/tests/pipelines/blipdiffusion/test_blipdiffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e85cef6512922571200caf20265ec62475bb76e
--- /dev/null
+++ b/diffusers/tests/pipelines/blipdiffusion/test_blipdiffusion.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTokenizer
+from transformers.models.blip_2.configuration_blip_2 import Blip2Config
+from transformers.models.clip.configuration_clip import CLIPTextConfig
+
+from diffusers import AutoencoderKL, BlipDiffusionPipeline, PNDMScheduler, UNet2DConditionModel
+from diffusers.utils.testing_utils import enable_full_determinism
+from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor
+from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel
+from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class BlipDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = BlipDiffusionPipeline
+    params = [
+        "prompt",
+        "reference_image",
+        "source_subject_category",
+        "target_subject_category",
+    ]
+    batch_params = [
+        "prompt",
+        "reference_image",
+        "source_subject_category",
+        "target_subject_category",
+    ]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "num_inference_steps",
+        "neg_prompt",
+        "guidance_scale",
+        "prompt_strength",
+        "prompt_reps",
+    ]
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            vocab_size=1000,
+            hidden_size=8,
+            intermediate_size=8,
+            projection_dim=8,
+            num_hidden_layers=1,
+            num_attention_heads=1,
+            max_position_embeddings=77,
+        )
+        text_encoder = ContextCLIPTextModel(text_encoder_config)
+
+        vae = AutoencoderKL(
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownEncoderBlock2D",),
+            up_block_types=("UpDecoderBlock2D",),
+            block_out_channels=(8,),
+            norm_num_groups=8,
+            layers_per_block=1,
+            act_fn="silu",
+            latent_channels=4,
+            sample_size=8,
+        )
+
+        blip_vision_config = {
+            "hidden_size": 8,
+            "intermediate_size": 8,
+            "num_hidden_layers": 1,
+            "num_attention_heads": 1,
+            "image_size": 224,
+            "patch_size": 14,
+            "hidden_act": "quick_gelu",
+        }
+
+        blip_qformer_config = {
+            "vocab_size": 1000,
+            "hidden_size": 8,
+            "num_hidden_layers": 1,
+            "num_attention_heads": 1,
+            "intermediate_size": 8,
+            "max_position_embeddings": 512,
+            "cross_attention_frequency": 1,
+            "encoder_hidden_size": 8,
+        }
+        qformer_config = Blip2Config(
+            vision_config=blip_vision_config,
+            qformer_config=blip_qformer_config,
+            num_query_tokens=8,
+            tokenizer="hf-internal-testing/tiny-random-bert",
+        )
+        qformer = Blip2QFormerModel(qformer_config)
+
+        unet = UNet2DConditionModel(
+            block_out_channels=(8, 16),
+            norm_num_groups=8,
+            layers_per_block=1,
+            sample_size=16,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=8,
+        )
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        scheduler = PNDMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            set_alpha_to_one=False,
+            skip_prk_steps=True,
+        )
+
+        vae.eval()
+        qformer.eval()
+        text_encoder.eval()
+
+        image_processor = BlipImageProcessor()
+
+        components = {
+            "text_encoder": text_encoder,
+            "vae": vae,
+            "qformer": qformer,
+            "unet": unet,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "image_processor": image_processor,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        np.random.seed(seed)
+        reference_image = np.random.rand(32, 32, 3) * 255
+        reference_image = Image.fromarray(reference_image.astype("uint8")).convert("RGBA")
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "swimming underwater",
+            "generator": generator,
+            "reference_image": reference_image,
+            "source_subject_category": "dog",
+            "target_subject_category": "dog",
+            "height": 32,
+            "width": 32,
+            "guidance_scale": 7.5,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_blipdiffusion(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        image = pipe(**self.get_dummy_inputs(device))[0]
+        image_slice = image[0, -3:, -3:, 0]
+
+        assert image.shape == (1, 16, 16, 4)
+
+        expected_slice = np.array(
+            [0.5329548, 0.8372512, 0.33269387, 0.82096875, 0.43657133, 0.3783, 0.5953028, 0.51934963, 0.42142007]
+        )
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {image_slice.flatten()}, but got {image_slice.flatten()}"
diff --git a/diffusers/tests/pipelines/cogvideo/__init__.py b/diffusers/tests/pipelines/cogvideo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/cogvideo/test_cogvideox.py b/diffusers/tests/pipelines/cogvideo/test_cogvideox.py
new file mode 100644
index 0000000000000000000000000000000000000000..884ddfb2a95adf7df92aebe5197d535708288c8e
--- /dev/null
+++ b/diffusers/tests/pipelines/cogvideo/test_cogvideox.py
@@ -0,0 +1,359 @@
+# Copyright 2024 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import AutoencoderKLCogVideoX, CogVideoXPipeline, CogVideoXTransformer3DModel, DDIMScheduler
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fusion_matches_attn_procs_length,
+    check_qkv_fusion_processors_exist,
+    to_np,
+)
+
+
+enable_full_determinism()
+
+
+class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = CogVideoXPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = CogVideoXTransformer3DModel(
+            # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
+            # But, since we are using tiny-random-t5 here, we need the internal dim of CogVideoXTransformer3DModel
+            # to be 32. The internal dim is product of num_attention_heads and attention_head_dim
+            num_attention_heads=4,
+            attention_head_dim=8,
+            in_channels=4,
+            out_channels=4,
+            time_embed_dim=2,
+            text_embed_dim=32,  # Must match with tiny-random-t5
+            num_layers=1,
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
+            sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
+            patch_size=2,
+            temporal_compression_ratio=4,
+            max_text_seq_length=16,
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKLCogVideoX(
+            in_channels=3,
+            out_channels=3,
+            down_block_types=(
+                "CogVideoXDownBlock3D",
+                "CogVideoXDownBlock3D",
+                "CogVideoXDownBlock3D",
+                "CogVideoXDownBlock3D",
+            ),
+            up_block_types=(
+                "CogVideoXUpBlock3D",
+                "CogVideoXUpBlock3D",
+                "CogVideoXUpBlock3D",
+                "CogVideoXUpBlock3D",
+            ),
+            block_out_channels=(8, 8, 8, 8),
+            latent_channels=4,
+            layers_per_block=1,
+            norm_num_groups=2,
+            temporal_compression_ratio=4,
+        )
+
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler()
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "dance monkey",
+            "negative_prompt": "",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            # Cannot reduce because convolution kernel becomes bigger than sample
+            "height": 16,
+            "width": 16,
+            "num_frames": 8,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+
+        self.assertEqual(generated_video.shape, (8, 3, 16, 16))
+        expected_video = torch.randn(8, 3, 16, 16)
+        max_diff = np.abs(generated_video - expected_video).max()
+        self.assertLessEqual(max_diff, 1e10)
+
+    def test_callback_inputs(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
+        has_callback_step_end = "callback_on_step_end" in sig.parameters
+
+        if not (has_callback_tensor_inputs and has_callback_step_end):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_subset(pipe, i, t, callback_kwargs):
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        def callback_inputs_all(pipe, i, t, callback_kwargs):
+            for tensor_name in pipe._callback_tensor_inputs:
+                assert tensor_name in callback_kwargs
+
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        # Test passing in a subset
+        inputs["callback_on_step_end"] = callback_inputs_subset
+        inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
+        output = pipe(**inputs)[0]
+
+        # Test passing in a everything
+        inputs["callback_on_step_end"] = callback_inputs_all
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        output = pipe(**inputs)[0]
+
+        def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
+            is_last = i == (pipe.num_timesteps - 1)
+            if is_last:
+                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
+            return callback_kwargs
+
+        inputs["callback_on_step_end"] = callback_inputs_change_tensor
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() < 1e10
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-3)
+
+    def test_attention_slicing_forward_pass(
+        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
+    ):
+        if not self.test_attention_slicing:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_slicing = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=1)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing1 = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=2)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing2 = pipe(**inputs)[0]
+
+        if test_max_difference:
+            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
+            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
+            self.assertLess(
+                max(max_diff1, max_diff2),
+                expected_max_diff,
+                "Attention slicing should not affect the inference results",
+            )
+
+    def test_vae_tiling(self, expected_diff_max: float = 0.2):
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe.to("cpu")
+        pipe.set_progress_bar_config(disable=None)
+
+        # Without tiling
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        output_without_tiling = pipe(**inputs)[0]
+
+        # With tiling
+        pipe.vae.enable_tiling(
+            tile_sample_min_height=96,
+            tile_sample_min_width=96,
+            tile_overlap_factor_height=1 / 12,
+            tile_overlap_factor_width=1 / 12,
+        )
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        output_with_tiling = pipe(**inputs)[0]
+
+        self.assertLess(
+            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
+            expected_diff_max,
+            "VAE tiling should not affect the inference results",
+        )
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        frames = pipe(**inputs).frames  # [B, F, C, H, W]
+        original_image_slice = frames[0, -2:, -1, -3:, -3:]
+
+        pipe.fuse_qkv_projections()
+        assert check_qkv_fusion_processors_exist(
+            pipe.transformer
+        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+        assert check_qkv_fusion_matches_attn_procs_length(
+            pipe.transformer, pipe.transformer.original_attn_processors
+        ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        frames = pipe(**inputs).frames
+        image_slice_fused = frames[0, -2:, -1, -3:, -3:]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        frames = pipe(**inputs).frames
+        image_slice_disabled = frames[0, -2:, -1, -3:, -3:]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+
+@slow
+@require_torch_gpu
+class CogVideoXPipelineIntegrationTests(unittest.TestCase):
+    prompt = "A painting of a squirrel eating a burger."
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_cogvideox(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+        prompt = self.prompt
+
+        videos = pipe(
+            prompt=prompt,
+            height=480,
+            width=720,
+            num_frames=16,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="pt",
+        ).frames
+
+        video = videos[0]
+        expected_video = torch.randn(1, 16, 480, 720, 3).numpy()
+
+        max_diff = numpy_cosine_similarity_distance(video, expected_video)
+        assert max_diff < 1e-3, f"Max diff is too high. got {video}"
diff --git a/diffusers/tests/pipelines/cogvideo/test_cogvideox_image2video.py b/diffusers/tests/pipelines/cogvideo/test_cogvideox_image2video.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec9a5fdd153e5cde56825ba8b62b1cb69368aee1
--- /dev/null
+++ b/diffusers/tests/pipelines/cogvideo/test_cogvideox_image2video.py
@@ -0,0 +1,388 @@
+# Copyright 2024 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel, DDIMScheduler
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fusion_matches_attn_procs_length,
+    check_qkv_fusion_processors_exist,
+    to_np,
+)
+
+
+enable_full_determinism()
+
+
+class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = CogVideoXImageToVideoPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"image"})
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = CogVideoXTransformer3DModel(
+            # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
+            # But, since we are using tiny-random-t5 here, we need the internal dim of CogVideoXTransformer3DModel
+            # to be 32. The internal dim is product of num_attention_heads and attention_head_dim
+            # Note: The num_attention_heads and attention_head_dim is different from the T2V and I2V tests because
+            # attention_head_dim must be divisible by 16 for RoPE to work. We also need to maintain a product of 32 as
+            # detailed above.
+            num_attention_heads=2,
+            attention_head_dim=16,
+            in_channels=8,
+            out_channels=4,
+            time_embed_dim=2,
+            text_embed_dim=32,  # Must match with tiny-random-t5
+            num_layers=1,
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
+            sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
+            patch_size=2,
+            temporal_compression_ratio=4,
+            max_text_seq_length=16,
+            use_rotary_positional_embeddings=True,
+            use_learned_positional_embeddings=True,
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKLCogVideoX(
+            in_channels=3,
+            out_channels=3,
+            down_block_types=(
+                "CogVideoXDownBlock3D",
+                "CogVideoXDownBlock3D",
+                "CogVideoXDownBlock3D",
+                "CogVideoXDownBlock3D",
+            ),
+            up_block_types=(
+                "CogVideoXUpBlock3D",
+                "CogVideoXUpBlock3D",
+                "CogVideoXUpBlock3D",
+                "CogVideoXUpBlock3D",
+            ),
+            block_out_channels=(8, 8, 8, 8),
+            latent_channels=4,
+            layers_per_block=1,
+            norm_num_groups=2,
+            temporal_compression_ratio=4,
+        )
+
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler()
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        # Cannot reduce below 16 because convolution kernel becomes bigger than sample
+        # Cannot reduce below 32 because 3D RoPE errors out
+        image_height = 16
+        image_width = 16
+        image = Image.new("RGB", (image_width, image_height))
+        inputs = {
+            "image": image,
+            "prompt": "dance monkey",
+            "negative_prompt": "",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "height": image_height,
+            "width": image_width,
+            "num_frames": 8,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+
+        self.assertEqual(generated_video.shape, (8, 3, 16, 16))
+        expected_video = torch.randn(8, 3, 16, 16)
+        max_diff = np.abs(generated_video - expected_video).max()
+        self.assertLessEqual(max_diff, 1e10)
+
+    def test_callback_inputs(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
+        has_callback_step_end = "callback_on_step_end" in sig.parameters
+
+        if not (has_callback_tensor_inputs and has_callback_step_end):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_subset(pipe, i, t, callback_kwargs):
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        def callback_inputs_all(pipe, i, t, callback_kwargs):
+            for tensor_name in pipe._callback_tensor_inputs:
+                assert tensor_name in callback_kwargs
+
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        # Test passing in a subset
+        inputs["callback_on_step_end"] = callback_inputs_subset
+        inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
+        output = pipe(**inputs)[0]
+
+        # Test passing in a everything
+        inputs["callback_on_step_end"] = callback_inputs_all
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        output = pipe(**inputs)[0]
+
+        def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
+            is_last = i == (pipe.num_timesteps - 1)
+            if is_last:
+                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
+            return callback_kwargs
+
+        inputs["callback_on_step_end"] = callback_inputs_change_tensor
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() < 1e10
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-3)
+
+    def test_attention_slicing_forward_pass(
+        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
+    ):
+        if not self.test_attention_slicing:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_slicing = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=1)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing1 = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=2)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing2 = pipe(**inputs)[0]
+
+        if test_max_difference:
+            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
+            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
+            self.assertLess(
+                max(max_diff1, max_diff2),
+                expected_max_diff,
+                "Attention slicing should not affect the inference results",
+            )
+
+    def test_vae_tiling(self, expected_diff_max: float = 0.3):
+        # Note(aryan): Investigate why this needs a bit higher tolerance
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+
+        # The reason to modify it this way is because I2V Transformer limits the generation to resolutions used during initalization.
+        # This limitation comes from using learned positional embeddings which cannot be generated on-the-fly like sincos or RoPE embeddings.
+        # See the if-statement on "self.use_learned_positional_embeddings" in diffusers/models/embeddings.py
+        components["transformer"] = CogVideoXTransformer3DModel.from_config(
+            components["transformer"].config,
+            sample_height=16,
+            sample_width=16,
+        )
+
+        pipe = self.pipeline_class(**components)
+        pipe.to("cpu")
+        pipe.set_progress_bar_config(disable=None)
+
+        # Without tiling
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        output_without_tiling = pipe(**inputs)[0]
+
+        # With tiling
+        pipe.vae.enable_tiling(
+            tile_sample_min_height=96,
+            tile_sample_min_width=96,
+            tile_overlap_factor_height=1 / 12,
+            tile_overlap_factor_width=1 / 12,
+        )
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        output_with_tiling = pipe(**inputs)[0]
+
+        self.assertLess(
+            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
+            expected_diff_max,
+            "VAE tiling should not affect the inference results",
+        )
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        frames = pipe(**inputs).frames  # [B, F, C, H, W]
+        original_image_slice = frames[0, -2:, -1, -3:, -3:]
+
+        pipe.fuse_qkv_projections()
+        assert check_qkv_fusion_processors_exist(
+            pipe.transformer
+        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+        assert check_qkv_fusion_matches_attn_procs_length(
+            pipe.transformer, pipe.transformer.original_attn_processors
+        ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        frames = pipe(**inputs).frames
+        image_slice_fused = frames[0, -2:, -1, -3:, -3:]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        frames = pipe(**inputs).frames
+        image_slice_disabled = frames[0, -2:, -1, -3:, -3:]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+
+@unittest.skip("The model 'THUDM/CogVideoX-5b-I2V' is not public yet.")
+@slow
+@require_torch_gpu
+class CogVideoXImageToVideoPipelineIntegrationTests(unittest.TestCase):
+    prompt = "A painting of a squirrel eating a burger."
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_cogvideox(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16)
+        pipe.enable_model_cpu_offload()
+
+        prompt = self.prompt
+        image = load_image(
+            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
+        )
+
+        videos = pipe(
+            image=image,
+            prompt=prompt,
+            height=480,
+            width=720,
+            num_frames=16,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="pt",
+        ).frames
+
+        video = videos[0]
+        expected_video = torch.randn(1, 16, 480, 720, 3).numpy()
+
+        max_diff = numpy_cosine_similarity_distance(video, expected_video)
+        assert max_diff < 1e-3, f"Max diff is too high. got {video}"
diff --git a/diffusers/tests/pipelines/cogvideo/test_cogvideox_video2video.py b/diffusers/tests/pipelines/cogvideo/test_cogvideox_video2video.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d836cb5e2a4f0d1ab15a759386ce38c4b738531
--- /dev/null
+++ b/diffusers/tests/pipelines/cogvideo/test_cogvideox_video2video.py
@@ -0,0 +1,325 @@
+# Copyright 2024 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXVideoToVideoPipeline, DDIMScheduler
+from diffusers.utils.testing_utils import enable_full_determinism, torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fusion_matches_attn_procs_length,
+    check_qkv_fusion_processors_exist,
+    to_np,
+)
+
+
+enable_full_determinism()
+
+
+class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = CogVideoXVideoToVideoPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"video"})
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = CogVideoXTransformer3DModel(
+            # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
+            # But, since we are using tiny-random-t5 here, we need the internal dim of CogVideoXTransformer3DModel
+            # to be 32. The internal dim is product of num_attention_heads and attention_head_dim
+            num_attention_heads=4,
+            attention_head_dim=8,
+            in_channels=4,
+            out_channels=4,
+            time_embed_dim=2,
+            text_embed_dim=32,  # Must match with tiny-random-t5
+            num_layers=1,
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
+            sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
+            patch_size=2,
+            temporal_compression_ratio=4,
+            max_text_seq_length=16,
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKLCogVideoX(
+            in_channels=3,
+            out_channels=3,
+            down_block_types=(
+                "CogVideoXDownBlock3D",
+                "CogVideoXDownBlock3D",
+                "CogVideoXDownBlock3D",
+                "CogVideoXDownBlock3D",
+            ),
+            up_block_types=(
+                "CogVideoXUpBlock3D",
+                "CogVideoXUpBlock3D",
+                "CogVideoXUpBlock3D",
+                "CogVideoXUpBlock3D",
+            ),
+            block_out_channels=(8, 8, 8, 8),
+            latent_channels=4,
+            layers_per_block=1,
+            norm_num_groups=2,
+            temporal_compression_ratio=4,
+        )
+
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler()
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed: int = 0, num_frames: int = 8):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        video_height = 16
+        video_width = 16
+        video = [Image.new("RGB", (video_width, video_height))] * num_frames
+
+        inputs = {
+            "video": video,
+            "prompt": "dance monkey",
+            "negative_prompt": "",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "strength": 0.5,
+            "guidance_scale": 6.0,
+            # Cannot reduce because convolution kernel becomes bigger than sample
+            "height": video_height,
+            "width": video_width,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+
+        self.assertEqual(generated_video.shape, (8, 3, 16, 16))
+        expected_video = torch.randn(8, 3, 16, 16)
+        max_diff = np.abs(generated_video - expected_video).max()
+        self.assertLessEqual(max_diff, 1e10)
+
+    def test_callback_inputs(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
+        has_callback_step_end = "callback_on_step_end" in sig.parameters
+
+        if not (has_callback_tensor_inputs and has_callback_step_end):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_subset(pipe, i, t, callback_kwargs):
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        def callback_inputs_all(pipe, i, t, callback_kwargs):
+            for tensor_name in pipe._callback_tensor_inputs:
+                assert tensor_name in callback_kwargs
+
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        # Test passing in a subset
+        inputs["callback_on_step_end"] = callback_inputs_subset
+        inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
+        output = pipe(**inputs)[0]
+
+        # Test passing in a everything
+        inputs["callback_on_step_end"] = callback_inputs_all
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        output = pipe(**inputs)[0]
+
+        def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
+            is_last = i == (pipe.num_timesteps - 1)
+            if is_last:
+                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
+            return callback_kwargs
+
+        inputs["callback_on_step_end"] = callback_inputs_change_tensor
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() < 1e10
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-3)
+
+    def test_attention_slicing_forward_pass(
+        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
+    ):
+        if not self.test_attention_slicing:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_slicing = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=1)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing1 = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=2)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing2 = pipe(**inputs)[0]
+
+        if test_max_difference:
+            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
+            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
+            self.assertLess(
+                max(max_diff1, max_diff2),
+                expected_max_diff,
+                "Attention slicing should not affect the inference results",
+            )
+
+    def test_vae_tiling(self, expected_diff_max: float = 0.2):
+        # Since VideoToVideo uses both encoder and decoder tiling, there seems to be much more numerical
+        # difference. We seem to need a higher tolerance here...
+        # TODO(aryan): Look into this more deeply
+        expected_diff_max = 0.4
+
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe.to("cpu")
+        pipe.set_progress_bar_config(disable=None)
+
+        # Without tiling
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        output_without_tiling = pipe(**inputs)[0]
+
+        # With tiling
+        pipe.vae.enable_tiling(
+            tile_sample_min_height=96,
+            tile_sample_min_width=96,
+            tile_overlap_factor_height=1 / 12,
+            tile_overlap_factor_width=1 / 12,
+        )
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        output_with_tiling = pipe(**inputs)[0]
+
+        self.assertLess(
+            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
+            expected_diff_max,
+            "VAE tiling should not affect the inference results",
+        )
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        frames = pipe(**inputs).frames  # [B, F, C, H, W]
+        original_image_slice = frames[0, -2:, -1, -3:, -3:]
+
+        pipe.fuse_qkv_projections()
+        assert check_qkv_fusion_processors_exist(
+            pipe.transformer
+        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+        assert check_qkv_fusion_matches_attn_procs_length(
+            pipe.transformer, pipe.transformer.original_attn_processors
+        ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        frames = pipe(**inputs).frames
+        image_slice_fused = frames[0, -2:, -1, -3:, -3:]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        frames = pipe(**inputs).frames
+        image_slice_disabled = frames[0, -2:, -1, -3:, -3:]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
diff --git a/diffusers/tests/pipelines/consistency_models/__init__.py b/diffusers/tests/pipelines/consistency_models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/consistency_models/test_consistency_models.py b/diffusers/tests/pipelines/consistency_models/test_consistency_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..e255cb510c4290f7a7d0ef67807729b03102305d
--- /dev/null
+++ b/diffusers/tests/pipelines/consistency_models/test_consistency_models.py
@@ -0,0 +1,299 @@
+import gc
+import unittest
+
+import numpy as np
+import torch
+from torch.backends.cuda import sdp_kernel
+
+from diffusers import (
+    CMStochasticIterativeScheduler,
+    ConsistencyModelPipeline,
+    UNet2DModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    nightly,
+    require_torch_2,
+    require_torch_gpu,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class ConsistencyModelPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = ConsistencyModelPipeline
+    params = UNCONDITIONAL_IMAGE_GENERATION_PARAMS
+    batch_params = UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS
+
+    # Override required_optional_params to remove num_images_per_prompt
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "output_type",
+            "return_dict",
+            "callback",
+            "callback_steps",
+        ]
+    )
+
+    @property
+    def dummy_uncond_unet(self):
+        unet = UNet2DModel.from_pretrained(
+            "diffusers/consistency-models-test",
+            subfolder="test_unet",
+        )
+        return unet
+
+    @property
+    def dummy_cond_unet(self):
+        unet = UNet2DModel.from_pretrained(
+            "diffusers/consistency-models-test",
+            subfolder="test_unet_class_cond",
+        )
+        return unet
+
+    def get_dummy_components(self, class_cond=False):
+        if class_cond:
+            unet = self.dummy_cond_unet
+        else:
+            unet = self.dummy_uncond_unet
+
+        # Default to CM multistep sampler
+        scheduler = CMStochasticIterativeScheduler(
+            num_train_timesteps=40,
+            sigma_min=0.002,
+            sigma_max=80.0,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "batch_size": 1,
+            "num_inference_steps": None,
+            "timesteps": [22, 0],
+            "generator": generator,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    def test_consistency_model_pipeline_multistep(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = ConsistencyModelPipeline(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.3572, 0.6273, 0.4031, 0.3961, 0.4321, 0.5730, 0.5266, 0.4780, 0.5004])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_consistency_model_pipeline_multistep_class_cond(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(class_cond=True)
+        pipe = ConsistencyModelPipeline(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["class_labels"] = 0
+        image = pipe(**inputs).images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.3572, 0.6273, 0.4031, 0.3961, 0.4321, 0.5730, 0.5266, 0.4780, 0.5004])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_consistency_model_pipeline_onestep(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = ConsistencyModelPipeline(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = 1
+        inputs["timesteps"] = None
+        image = pipe(**inputs).images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.5004, 0.5004, 0.4994, 0.5008, 0.4976, 0.5018, 0.4990, 0.4982, 0.4987])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_consistency_model_pipeline_onestep_class_cond(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(class_cond=True)
+        pipe = ConsistencyModelPipeline(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = 1
+        inputs["timesteps"] = None
+        inputs["class_labels"] = 0
+        image = pipe(**inputs).images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.5004, 0.5004, 0.4994, 0.5008, 0.4976, 0.5018, 0.4990, 0.4982, 0.4987])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+
+@nightly
+@require_torch_gpu
+class ConsistencyModelPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, seed=0, get_fixed_latents=False, device="cpu", dtype=torch.float32, shape=(1, 3, 64, 64)):
+        generator = torch.manual_seed(seed)
+
+        inputs = {
+            "num_inference_steps": None,
+            "timesteps": [22, 0],
+            "class_labels": 0,
+            "generator": generator,
+            "output_type": "np",
+        }
+
+        if get_fixed_latents:
+            latents = self.get_fixed_latents(seed=seed, device=device, dtype=dtype, shape=shape)
+            inputs["latents"] = latents
+
+        return inputs
+
+    def get_fixed_latents(self, seed=0, device="cpu", dtype=torch.float32, shape=(1, 3, 64, 64)):
+        if isinstance(device, str):
+            device = torch.device(device)
+        generator = torch.Generator(device=device).manual_seed(seed)
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        return latents
+
+    def test_consistency_model_cd_multistep(self):
+        unet = UNet2DModel.from_pretrained("diffusers/consistency_models", subfolder="diffusers_cd_imagenet64_l2")
+        scheduler = CMStochasticIterativeScheduler(
+            num_train_timesteps=40,
+            sigma_min=0.002,
+            sigma_max=80.0,
+        )
+        pipe = ConsistencyModelPipeline(unet=unet, scheduler=scheduler)
+        pipe.to(torch_device=torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs()
+        image = pipe(**inputs).images
+        assert image.shape == (1, 64, 64, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        expected_slice = np.array([0.0146, 0.0158, 0.0092, 0.0086, 0.0000, 0.0000, 0.0000, 0.0000, 0.0058])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_consistency_model_cd_onestep(self):
+        unet = UNet2DModel.from_pretrained("diffusers/consistency_models", subfolder="diffusers_cd_imagenet64_l2")
+        scheduler = CMStochasticIterativeScheduler(
+            num_train_timesteps=40,
+            sigma_min=0.002,
+            sigma_max=80.0,
+        )
+        pipe = ConsistencyModelPipeline(unet=unet, scheduler=scheduler)
+        pipe.to(torch_device=torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs()
+        inputs["num_inference_steps"] = 1
+        inputs["timesteps"] = None
+        image = pipe(**inputs).images
+        assert image.shape == (1, 64, 64, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        expected_slice = np.array([0.0059, 0.0003, 0.0000, 0.0023, 0.0052, 0.0007, 0.0165, 0.0081, 0.0095])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    @require_torch_2
+    def test_consistency_model_cd_multistep_flash_attn(self):
+        unet = UNet2DModel.from_pretrained("diffusers/consistency_models", subfolder="diffusers_cd_imagenet64_l2")
+        scheduler = CMStochasticIterativeScheduler(
+            num_train_timesteps=40,
+            sigma_min=0.002,
+            sigma_max=80.0,
+        )
+        pipe = ConsistencyModelPipeline(unet=unet, scheduler=scheduler)
+        pipe.to(torch_device=torch_device, torch_dtype=torch.float16)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(get_fixed_latents=True, device=torch_device)
+        # Ensure usage of flash attention in torch 2.0
+        with sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+            image = pipe(**inputs).images
+        assert image.shape == (1, 64, 64, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        expected_slice = np.array([0.1845, 0.1371, 0.1211, 0.2035, 0.1954, 0.1323, 0.1773, 0.1593, 0.1314])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    @require_torch_2
+    def test_consistency_model_cd_onestep_flash_attn(self):
+        unet = UNet2DModel.from_pretrained("diffusers/consistency_models", subfolder="diffusers_cd_imagenet64_l2")
+        scheduler = CMStochasticIterativeScheduler(
+            num_train_timesteps=40,
+            sigma_min=0.002,
+            sigma_max=80.0,
+        )
+        pipe = ConsistencyModelPipeline(unet=unet, scheduler=scheduler)
+        pipe.to(torch_device=torch_device, torch_dtype=torch.float16)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(get_fixed_latents=True, device=torch_device)
+        inputs["num_inference_steps"] = 1
+        inputs["timesteps"] = None
+        # Ensure usage of flash attention in torch 2.0
+        with sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+            image = pipe(**inputs).images
+        assert image.shape == (1, 64, 64, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        expected_slice = np.array([0.1623, 0.2009, 0.2387, 0.1731, 0.1168, 0.1202, 0.2031, 0.1327, 0.2447])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
diff --git a/diffusers/tests/pipelines/controlnet/__init__.py b/diffusers/tests/pipelines/controlnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/controlnet/test_controlnet.py b/diffusers/tests/pipelines/controlnet/test_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..b12655d989d4ce30fca9f062f82efd764255ab6d
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet/test_controlnet.py
@@ -0,0 +1,1110 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import traceback
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    DDIMScheduler,
+    EulerDiscreteScheduler,
+    LCMScheduler,
+    StableDiffusionControlNetPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    get_python_version,
+    is_torch_compile,
+    load_image,
+    load_numpy,
+    require_torch_2,
+    require_torch_gpu,
+    run_test_in_subprocess,
+    slow,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        _ = in_queue.get(timeout=timeout)
+
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.to("cuda")
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        pipe.controlnet.to(memory_format=torch.channels_last)
+        pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+
+        output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
+        )
+        expected_image = np.resize(expected_image, (512, 512, 3))
+
+        assert np.abs(expected_image - image).max() < 1.0
+
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+class ControlNetPipelineFastTests(
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionControlNetPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            norm_num_groups=1,
+            time_cond_proj_dim=time_cond_proj_dim,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+            norm_num_groups=1,
+        )
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+        image = randn_tensor(
+            (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+            generator=generator,
+            device=torch.device(device),
+        )
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": image,
+        }
+
+        return inputs
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.5234, 0.3333, 0.1745, 0.7605, 0.6224, 0.4637, 0.6989, 0.7526, 0.4665])
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    def test_controlnet_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionControlNetPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array(
+            [0.52700454, 0.3930534, 0.25509018, 0.7132304, 0.53696585, 0.46568912, 0.7095368, 0.7059624, 0.4744786]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_controlnet_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionControlNetPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array(
+            [0.52700454, 0.3930534, 0.25509018, 0.7132304, 0.53696585, 0.46568912, 0.7095368, 0.7059624, 0.4744786]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+
+class StableDiffusionMultiControlNetPipelineFastTests(
+    IPAdapterTesterMixin, PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionControlNetPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = frozenset([])  # TO_DO: add image_params once refactored VaeImageProcessor.preprocess
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            norm_num_groups=1,
+        )
+        torch.manual_seed(0)
+
+        def init_weights(m):
+            if isinstance(m, torch.nn.Conv2d):
+                torch.nn.init.normal_(m.weight)
+                m.bias.data.fill_(1.0)
+
+        controlnet1 = ControlNetModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+            norm_num_groups=1,
+        )
+        controlnet1.controlnet_down_blocks.apply(init_weights)
+
+        torch.manual_seed(0)
+        controlnet2 = ControlNetModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+            norm_num_groups=1,
+        )
+        controlnet2.controlnet_down_blocks.apply(init_weights)
+
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        controlnet = MultiControlNetModel([controlnet1, controlnet2])
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+
+        images = [
+            randn_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                generator=generator,
+                device=torch.device(device),
+            ),
+            randn_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                generator=generator,
+                device=torch.device(device),
+            ),
+        ]
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": images,
+        }
+
+        return inputs
+
+    def test_control_guidance_switch(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+
+        scale = 10.0
+        steps = 4
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_1 = pipe(**inputs)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_2 = pipe(**inputs, control_guidance_start=0.1, control_guidance_end=0.2)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_3 = pipe(**inputs, control_guidance_start=[0.1, 0.3], control_guidance_end=[0.2, 0.7])[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_4 = pipe(**inputs, control_guidance_start=0.4, control_guidance_end=[0.5, 0.8])[0]
+
+        # make sure that all outputs are different
+        assert np.sum(np.abs(output_1 - output_2)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_3)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_4)) > 1e-3
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.2422, 0.3425, 0.4048, 0.5351, 0.3503, 0.2419, 0.4645, 0.4570, 0.3804])
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_save_pretrained_raise_not_implemented_exception(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                # save_pretrained is not implemented for Multi-ControlNet
+                pipe.save_pretrained(tmpdir)
+            except NotImplementedError:
+                pass
+
+    def test_inference_multiple_prompt_input(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionControlNetPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"], inputs["prompt"]]
+        inputs["image"] = [inputs["image"], inputs["image"]]
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        assert image.shape == (2, 64, 64, 3)
+
+        image_1, image_2 = image
+        # make sure that the outputs are different
+        assert np.sum(np.abs(image_1 - image_2)) > 1e-3
+
+        # multiple prompts, single image conditioning
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"], inputs["prompt"]]
+        output_1 = sd_pipe(**inputs)
+
+        assert np.abs(image - output_1.images).max() < 1e-3
+
+        # multiple prompts, multiple image conditioning
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"], inputs["prompt"], inputs["prompt"], inputs["prompt"]]
+        inputs["image"] = [inputs["image"], inputs["image"], inputs["image"], inputs["image"]]
+        output_2 = sd_pipe(**inputs)
+        image = output_2.images
+
+        assert image.shape == (4, 64, 64, 3)
+
+
+class StableDiffusionMultiControlNetOneModelPipelineFastTests(
+    IPAdapterTesterMixin, PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionControlNetPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = frozenset([])  # TO_DO: add image_params once refactored VaeImageProcessor.preprocess
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            norm_num_groups=1,
+        )
+        torch.manual_seed(0)
+
+        def init_weights(m):
+            if isinstance(m, torch.nn.Conv2d):
+                torch.nn.init.normal_(m.weight)
+                m.bias.data.fill_(1.0)
+
+        controlnet = ControlNetModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+            norm_num_groups=1,
+        )
+        controlnet.controlnet_down_blocks.apply(init_weights)
+
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        controlnet = MultiControlNetModel([controlnet])
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+
+        images = [
+            randn_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                generator=generator,
+                device=torch.device(device),
+            ),
+        ]
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": images,
+        }
+
+        return inputs
+
+    def test_control_guidance_switch(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+
+        scale = 10.0
+        steps = 4
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_1 = pipe(**inputs)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_2 = pipe(**inputs, control_guidance_start=0.1, control_guidance_end=0.2)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_3 = pipe(
+            **inputs,
+            control_guidance_start=[0.1],
+            control_guidance_end=[0.2],
+        )[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_4 = pipe(**inputs, control_guidance_start=0.4, control_guidance_end=[0.5])[0]
+
+        # make sure that all outputs are different
+        assert np.sum(np.abs(output_1 - output_2)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_3)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_4)) > 1e-3
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.5264, 0.3203, 0.1602, 0.8235, 0.6332, 0.4593, 0.7226, 0.7777, 0.4780])
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_save_pretrained_raise_not_implemented_exception(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                # save_pretrained is not implemented for Multi-ControlNet
+                pipe.save_pretrained(tmpdir)
+            except NotImplementedError:
+                pass
+
+
+@slow
+@require_torch_gpu
+class ControlNetPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_canny(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        )
+
+        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (768, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 9e-2
+
+    def test_depth(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "Stormtrooper's lecture"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
+        )
+
+        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 8e-1
+
+    def test_hed(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "oil painting of handsome old man, masterpiece"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed.png"
+        )
+
+        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (704, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 8e-2
+
+    def test_mlsd(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "room"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd.png"
+        )
+
+        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (704, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd_out.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 5e-2
+
+    def test_normal(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "cute toy"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal.png"
+        )
+
+        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal_out.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 5e-2
+
+    def test_openpose(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "Chef in the kitchen"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png"
+        )
+
+        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (768, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/chef_pose_out.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 8e-2
+
+    def test_scribble(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(5)
+        prompt = "bag"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble.png"
+        )
+
+        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (640, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble_out.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 8e-2
+
+    def test_seg(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(5)
+        prompt = "house"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png"
+        )
+
+        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg_out.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 8e-2
+
+    def test_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+        pipe.enable_sequential_cpu_offload()
+
+        prompt = "house"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png"
+        )
+
+        _ = pipe(
+            prompt,
+            image,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 7 GB is allocated
+        assert mem_bytes < 4 * 10**9
+
+    def test_canny_guess_mode(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = ""
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        )
+
+        output = pipe(
+            prompt,
+            image,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=3,
+            guidance_scale=3.0,
+            guess_mode=True,
+        )
+
+        image = output.images[0]
+        assert image.shape == (768, 512, 3)
+
+        image_slice = image[-3:, -3:, -1]
+        expected_slice = np.array([0.2724, 0.2846, 0.2724, 0.3843, 0.3682, 0.2736, 0.4675, 0.3862, 0.2887])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_canny_guess_mode_euler(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = ""
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        )
+
+        output = pipe(
+            prompt,
+            image,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=3,
+            guidance_scale=3.0,
+            guess_mode=True,
+        )
+
+        image = output.images[0]
+        assert image.shape == (768, 512, 3)
+
+        image_slice = image[-3:, -3:, -1]
+        expected_slice = np.array([0.1655, 0.1721, 0.1623, 0.1685, 0.1711, 0.1646, 0.1651, 0.1631, 0.1494])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    @is_torch_compile
+    @require_torch_2
+    @unittest.skipIf(
+        get_python_version == (3, 12),
+        reason="Torch Dynamo isn't yet supported for Python 3.12.",
+    )
+    def test_stable_diffusion_compile(self):
+        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
+
+    def test_v11_shuffle_global_pool_conditions(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "New York"
+        image = load_image(
+            "https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle/resolve/main/images/control.png"
+        )
+
+        output = pipe(
+            prompt,
+            image,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=3,
+            guidance_scale=7.0,
+        )
+
+        image = output.images[0]
+        assert image.shape == (512, 640, 3)
+
+        image_slice = image[-3:, -3:, -1]
+        expected_slice = np.array([0.1338, 0.1597, 0.1202, 0.1687, 0.1377, 0.1017, 0.2070, 0.1574, 0.1348])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_pose_and_canny(self):
+        controlnet_canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+        controlnet_pose = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            safety_checker=None,
+            controlnet=[controlnet_pose, controlnet_canny],
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird and Chef"
+        image_canny = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        )
+        image_pose = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png"
+        )
+
+        output = pipe(prompt, [image_pose, image_canny], generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (768, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose_canny_out.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 5e-2
diff --git a/diffusers/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py b/diffusers/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..99a238caf53ac63749320c01fc2fceb7513b87e8
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py
@@ -0,0 +1,222 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTokenizer
+from transformers.models.blip_2.configuration_blip_2 import Blip2Config
+from transformers.models.clip.configuration_clip import CLIPTextConfig
+
+from diffusers import (
+    AutoencoderKL,
+    BlipDiffusionControlNetPipeline,
+    ControlNetModel,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, torch_device
+from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor
+from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel
+from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class BlipDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = BlipDiffusionControlNetPipeline
+    params = [
+        "prompt",
+        "reference_image",
+        "source_subject_category",
+        "target_subject_category",
+        "condtioning_image",
+    ]
+    batch_params = [
+        "prompt",
+        "reference_image",
+        "source_subject_category",
+        "target_subject_category",
+        "condtioning_image",
+    ]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "num_inference_steps",
+        "neg_prompt",
+        "guidance_scale",
+        "prompt_strength",
+        "prompt_reps",
+    ]
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            vocab_size=1000,
+            hidden_size=16,
+            intermediate_size=16,
+            projection_dim=16,
+            num_hidden_layers=1,
+            num_attention_heads=1,
+            max_position_embeddings=77,
+        )
+        text_encoder = ContextCLIPTextModel(text_encoder_config)
+
+        vae = AutoencoderKL(
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownEncoderBlock2D",),
+            up_block_types=("UpDecoderBlock2D",),
+            block_out_channels=(32,),
+            layers_per_block=1,
+            act_fn="silu",
+            latent_channels=4,
+            norm_num_groups=16,
+            sample_size=16,
+        )
+
+        blip_vision_config = {
+            "hidden_size": 16,
+            "intermediate_size": 16,
+            "num_hidden_layers": 1,
+            "num_attention_heads": 1,
+            "image_size": 224,
+            "patch_size": 14,
+            "hidden_act": "quick_gelu",
+        }
+
+        blip_qformer_config = {
+            "vocab_size": 1000,
+            "hidden_size": 16,
+            "num_hidden_layers": 1,
+            "num_attention_heads": 1,
+            "intermediate_size": 16,
+            "max_position_embeddings": 512,
+            "cross_attention_frequency": 1,
+            "encoder_hidden_size": 16,
+        }
+        qformer_config = Blip2Config(
+            vision_config=blip_vision_config,
+            qformer_config=blip_qformer_config,
+            num_query_tokens=16,
+            tokenizer="hf-internal-testing/tiny-random-bert",
+        )
+        qformer = Blip2QFormerModel(qformer_config)
+
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 16),
+            layers_per_block=1,
+            norm_num_groups=4,
+            sample_size=16,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=16,
+        )
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        scheduler = PNDMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            set_alpha_to_one=False,
+            skip_prk_steps=True,
+        )
+        controlnet = ControlNetModel(
+            block_out_channels=(4, 16),
+            layers_per_block=1,
+            in_channels=4,
+            norm_num_groups=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=16,
+            conditioning_embedding_out_channels=(8, 16),
+        )
+
+        vae.eval()
+        qformer.eval()
+        text_encoder.eval()
+
+        image_processor = BlipImageProcessor()
+
+        components = {
+            "text_encoder": text_encoder,
+            "vae": vae,
+            "qformer": qformer,
+            "unet": unet,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "controlnet": controlnet,
+            "image_processor": image_processor,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        np.random.seed(seed)
+        reference_image = np.random.rand(32, 32, 3) * 255
+        reference_image = Image.fromarray(reference_image.astype("uint8")).convert("RGBA")
+        cond_image = np.random.rand(32, 32, 3) * 255
+        cond_image = Image.fromarray(cond_image.astype("uint8")).convert("RGBA")
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "swimming underwater",
+            "generator": generator,
+            "reference_image": reference_image,
+            "condtioning_image": cond_image,
+            "source_subject_category": "dog",
+            "target_subject_category": "dog",
+            "height": 32,
+            "width": 32,
+            "guidance_scale": 7.5,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.4803, 0.3865, 0.1422, 0.6119, 0.2283, 0.6365, 0.5453, 0.5205, 0.3581])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
+    def test_blipdiffusion_controlnet(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        image = pipe(**self.get_dummy_inputs(device))[0]
+        image_slice = image[0, -3:, -3:, 0]
+
+        assert image.shape == (1, 16, 16, 4)
+        expected_slice = np.array([0.7953, 0.7136, 0.6597, 0.4779, 0.7389, 0.4111, 0.5826, 0.4150, 0.8422])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
diff --git a/diffusers/tests/pipelines/controlnet/test_controlnet_img2img.py b/diffusers/tests/pipelines/controlnet/test_controlnet_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c4ae716b37ddf280876f8bc9fa5c1f48ab0e747
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet/test_controlnet_img2img.py
@@ -0,0 +1,442 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This model implementation is heavily inspired by https://github.com/haofanwang/ControlNet-for-Diffusers/
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    DDIMScheduler,
+    StableDiffusionControlNetImg2ImgPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel
+from diffusers.utils import load_image
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_numpy,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class ControlNetImg2ImgPipelineFastTests(
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionControlNetImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS.union({"control_image"})
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            norm_num_groups=1,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+            norm_num_groups=1,
+        )
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+        control_image = randn_tensor(
+            (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+            generator=generator,
+            device=torch.device(device),
+        )
+        image = floats_tensor(control_image.shape, rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": image,
+            "control_image": control_image,
+        }
+
+        return inputs
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.7096, 0.5149, 0.3571, 0.5897, 0.4715, 0.4052, 0.6098, 0.6886, 0.4213])
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+
+class StableDiffusionMultiControlNetPipelineFastTests(
+    IPAdapterTesterMixin, PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionControlNetImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = frozenset([])  # TO_DO: add image_params once refactored VaeImageProcessor.preprocess
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            norm_num_groups=1,
+        )
+        torch.manual_seed(0)
+
+        def init_weights(m):
+            if isinstance(m, torch.nn.Conv2d):
+                torch.nn.init.normal_(m.weight)
+                m.bias.data.fill_(1.0)
+
+        controlnet1 = ControlNetModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+            norm_num_groups=1,
+        )
+        controlnet1.controlnet_down_blocks.apply(init_weights)
+
+        torch.manual_seed(0)
+        controlnet2 = ControlNetModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+            norm_num_groups=1,
+        )
+        controlnet2.controlnet_down_blocks.apply(init_weights)
+
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        controlnet = MultiControlNetModel([controlnet1, controlnet2])
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+
+        control_image = [
+            randn_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                generator=generator,
+                device=torch.device(device),
+            ),
+            randn_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                generator=generator,
+                device=torch.device(device),
+            ),
+        ]
+
+        image = floats_tensor(control_image[0].shape, rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": image,
+            "control_image": control_image,
+        }
+
+        return inputs
+
+    def test_control_guidance_switch(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+
+        scale = 10.0
+        steps = 4
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_1 = pipe(**inputs)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_2 = pipe(**inputs, control_guidance_start=0.1, control_guidance_end=0.2)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_3 = pipe(**inputs, control_guidance_start=[0.1, 0.3], control_guidance_end=[0.2, 0.7])[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_4 = pipe(**inputs, control_guidance_start=0.4, control_guidance_end=[0.5, 0.8])[0]
+
+        # make sure that all outputs are different
+        assert np.sum(np.abs(output_1 - output_2)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_3)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_4)) > 1e-3
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.5293, 0.7339, 0.6642, 0.3950, 0.5212, 0.5175, 0.7002, 0.5907, 0.5182])
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_save_pretrained_raise_not_implemented_exception(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                # save_pretrained is not implemented for Multi-ControlNet
+                pipe.save_pretrained(tmpdir)
+            except NotImplementedError:
+                pass
+
+
+@slow
+@require_torch_gpu
+class ControlNetImg2ImgPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_canny(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+        pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "evil space-punk bird"
+        control_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+        image = load_image(
+            "https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png"
+        ).resize((512, 512))
+
+        output = pipe(
+            prompt,
+            image,
+            control_image=control_image,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=50,
+            strength=0.6,
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/img2img.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 9e-2
diff --git a/diffusers/tests/pipelines/controlnet/test_controlnet_inpaint.py b/diffusers/tests/pipelines/controlnet/test_controlnet_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..e49106334c2e4c2f9b2befbdeabf6d85f3c0500d
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet/test_controlnet_inpaint.py
@@ -0,0 +1,558 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This model implementation is heavily based on:
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    DDIMScheduler,
+    StableDiffusionControlNetInpaintPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel
+from diffusers.utils import load_image
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_numpy,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class ControlNetInpaintPipelineFastTests(
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionControlNetInpaintPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset({"control_image"})  # skip `image` and `mask` for now, only test for control_image
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+        )
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+        control_image = randn_tensor(
+            (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+            generator=generator,
+            device=torch.device(device),
+        )
+        init_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        init_image = init_image.cpu().permute(0, 2, 3, 1)[0]
+
+        image = Image.fromarray(np.uint8(init_image)).convert("RGB").resize((64, 64))
+        mask_image = Image.fromarray(np.uint8(init_image + 4)).convert("RGB").resize((64, 64))
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": image,
+            "mask_image": mask_image,
+            "control_image": control_image,
+        }
+
+        return inputs
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+
+class ControlNetSimpleInpaintPipelineFastTests(ControlNetInpaintPipelineFastTests):
+    pipeline_class = StableDiffusionControlNetInpaintPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset([])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+        )
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+
+class MultiControlNetInpaintPipelineFastTests(
+    PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionControlNetInpaintPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        torch.manual_seed(0)
+
+        def init_weights(m):
+            if isinstance(m, torch.nn.Conv2d):
+                torch.nn.init.normal_(m.weight)
+                m.bias.data.fill_(1.0)
+
+        controlnet1 = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+        )
+        controlnet1.controlnet_down_blocks.apply(init_weights)
+
+        torch.manual_seed(0)
+        controlnet2 = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            cross_attention_dim=32,
+            conditioning_embedding_out_channels=(16, 32),
+        )
+        controlnet2.controlnet_down_blocks.apply(init_weights)
+
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        controlnet = MultiControlNetModel([controlnet1, controlnet2])
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+
+        control_image = [
+            randn_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                generator=generator,
+                device=torch.device(device),
+            ),
+            randn_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                generator=generator,
+                device=torch.device(device),
+            ),
+        ]
+        init_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        init_image = init_image.cpu().permute(0, 2, 3, 1)[0]
+
+        image = Image.fromarray(np.uint8(init_image)).convert("RGB").resize((64, 64))
+        mask_image = Image.fromarray(np.uint8(init_image + 4)).convert("RGB").resize((64, 64))
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": image,
+            "mask_image": mask_image,
+            "control_image": control_image,
+        }
+
+        return inputs
+
+    def test_control_guidance_switch(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+
+        scale = 10.0
+        steps = 4
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_1 = pipe(**inputs)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_2 = pipe(**inputs, control_guidance_start=0.1, control_guidance_end=0.2)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_3 = pipe(**inputs, control_guidance_start=[0.1, 0.3], control_guidance_end=[0.2, 0.7])[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_4 = pipe(**inputs, control_guidance_start=0.4, control_guidance_end=[0.5, 0.8])[0]
+
+        # make sure that all outputs are different
+        assert np.sum(np.abs(output_1 - output_2)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_3)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_4)) > 1e-3
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    def test_save_pretrained_raise_not_implemented_exception(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                # save_pretrained is not implemented for Multi-ControlNet
+                pipe.save_pretrained(tmpdir)
+            except NotImplementedError:
+                pass
+
+
+@slow
+@require_torch_gpu
+class ControlNetInpaintPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_canny(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+        pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None, controlnet=controlnet
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image = load_image(
+            "https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png"
+        ).resize((512, 512))
+
+        mask_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_mask.png"
+        ).resize((512, 512))
+
+        prompt = "pitch black hole"
+
+        control_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+
+        output = pipe(
+            prompt,
+            image=image,
+            mask_image=mask_image,
+            control_image=control_image,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=3,
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/inpaint.npy"
+        )
+
+        assert np.abs(expected_image - image).max() < 9e-2
+
+    def test_inpaint(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_inpaint")
+
+        pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(33)
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png"
+        )
+        init_image = init_image.resize((512, 512))
+
+        mask_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png"
+        )
+        mask_image = mask_image.resize((512, 512))
+
+        prompt = "a handsome man with ray-ban sunglasses"
+
+        def make_inpaint_condition(image, image_mask):
+            image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
+            image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0
+
+            assert image.shape[0:1] == image_mask.shape[0:1], "image and image_mask must have the same image size"
+            image[image_mask > 0.5] = -1.0  # set as masked pixel
+            image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
+            image = torch.from_numpy(image)
+            return image
+
+        control_image = make_inpaint_condition(init_image, mask_image)
+
+        output = pipe(
+            prompt,
+            image=init_image,
+            mask_image=mask_image,
+            control_image=control_image,
+            guidance_scale=9.0,
+            eta=1.0,
+            generator=generator,
+            num_inference_steps=20,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/boy_ray_ban.npy"
+        )
+
+        assert numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten()) < 1e-2
diff --git a/diffusers/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py b/diffusers/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2c63137c99e29d137bf88dcd888f9aae7ee5f61
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py
@@ -0,0 +1,354 @@
+# coding=utf-8
+# Copyright 2024 Harutatsu Akiyama, Jinbin Bai, and HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    EulerDiscreteScheduler,
+    StableDiffusionXLControlNetInpaintPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    require_torch_gpu,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class ControlNetPipelineSDXLFastTests(
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionXLControlNetInpaintPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = frozenset(IMAGE_TO_IMAGE_IMAGE_PARAMS.union({"mask_image", "control_image"}))
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
+        {
+            "add_text_embeds",
+            "add_time_ids",
+            "mask",
+            "masked_image_latents",
+        }
+    )
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            conditioning_embedding_out_channels=(16, 32),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=32,
+            image_size=224,
+            projection_dim=32,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+
+        feature_extractor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "image_encoder": image_encoder,
+            "feature_extractor": feature_extractor,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0, img_res=64):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        # Get random floats in [0, 1] as image
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        mask_image = torch.ones_like(image)
+        controlnet_embedder_scale_factor = 2
+        control_image = (
+            floats_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                rng=random.Random(seed),
+            )
+            .to(device)
+            .cpu()
+        )
+        control_image = control_image.cpu().permute(0, 2, 3, 1)[0]
+        # Convert image and mask_image to [0, 255]
+        image = 255 * image
+        mask_image = 255 * mask_image
+        control_image = 255 * control_image
+        # Convert to PIL image
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((img_res, img_res))
+        mask_image = Image.fromarray(np.uint8(mask_image)).convert("L").resize((img_res, img_res))
+        control_image = Image.fromarray(np.uint8(control_image)).convert("RGB").resize((img_res, img_res))
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": init_image,
+            "mask_image": mask_image,
+            "control_image": control_image,
+        }
+        return inputs
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    @require_torch_gpu
+    def test_stable_diffusion_xl_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            pipe.unet.set_default_attn_processor()
+
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_stable_diffusion_xl_multi_prompts(self):
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+
+        # forward with single prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = inputs["prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "different prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+        # manually set a negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same negative_prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = inputs["negative_prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = "different negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+    def test_controlnet_sdxl_guess(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["guess_mode"] = True
+
+        output = sd_pipe(**inputs)
+        image_slice = output.images[0, -3:, -3:, -1]
+
+        expected_slice = np.array([0.5460, 0.4943, 0.4635, 0.5832, 0.5366, 0.4815, 0.6034, 0.5741, 0.4341])
+
+        # make sure that it's equal
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4
+
+    # TODO(Patrick, Sayak) - skip for now as this requires more refiner tests
+    def test_save_load_optional_components(self):
+        pass
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
diff --git a/diffusers/tests/pipelines/controlnet/test_controlnet_sdxl.py b/diffusers/tests/pipelines/controlnet/test_controlnet_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..c931391ac4d5511b19126a28f9c9efd9ab09361b
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet/test_controlnet_sdxl.py
@@ -0,0 +1,1113 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LCMScheduler,
+    StableDiffusionXLControlNetPipeline,
+    StableDiffusionXLImg2ImgPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2D
+from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_image,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLControlNetPipelineFastTests(
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionXLControlNetPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+            time_cond_proj_dim=time_cond_proj_dim,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            conditioning_embedding_out_channels=(16, 32),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        torch.manual_seed(0)
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+        image = randn_tensor(
+            (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+            generator=generator,
+            device=torch.device(device),
+        )
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": image,
+        }
+
+        return inputs
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    def test_ip_adapter(self, from_ssd1b=False, expected_pipe_slice=None):
+        if not from_ssd1b:
+            expected_pipe_slice = None
+            if torch_device == "cpu":
+                expected_pipe_slice = np.array(
+                    [0.7335, 0.5866, 0.5623, 0.6242, 0.5751, 0.5999, 0.4091, 0.4590, 0.5054]
+                )
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    @require_torch_gpu
+    def test_stable_diffusion_xl_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            pipe.unet.set_default_attn_processor()
+
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_stable_diffusion_xl_multi_prompts(self):
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+
+        # forward with single prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = inputs["prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "different prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+        # manually set a negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same negative_prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = inputs["negative_prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = "different negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+    # Copied from test_stable_diffusion_xl.py
+    def test_stable_diffusion_xl_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = 2 * [inputs["prompt"]]
+        inputs["num_images_per_prompt"] = 2
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 2 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = sd_pipe.encode_prompt(prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_controlnet_sdxl_guess(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["guess_mode"] = True
+
+        output = sd_pipe(**inputs)
+        image_slice = output.images[0, -3:, -3:, -1]
+
+        expected_slice = np.array([0.7335, 0.5866, 0.5623, 0.6242, 0.5751, 0.5999, 0.4091, 0.4590, 0.5054])
+
+        # make sure that it's equal
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4
+
+    def test_controlnet_sdxl_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLControlNetPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.7820, 0.6195, 0.6193, 0.7045, 0.6706, 0.5837, 0.4147, 0.5232, 0.4868])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    # Copied from test_stable_diffusion_xl.py:test_stable_diffusion_two_xl_mixture_of_denoiser_fast
+    # with `StableDiffusionXLControlNetPipeline` instead of `StableDiffusionXLPipeline`
+    def test_controlnet_sdxl_two_mixture_of_denoiser_fast(self):
+        components = self.get_dummy_components()
+        pipe_1 = StableDiffusionXLControlNetPipeline(**components).to(torch_device)
+        pipe_1.unet.set_default_attn_processor()
+
+        components_without_controlnet = {k: v for k, v in components.items() if k != "controlnet"}
+        pipe_2 = StableDiffusionXLImg2ImgPipeline(**components_without_controlnet).to(torch_device)
+        pipe_2.unet.set_default_attn_processor()
+
+        def assert_run_mixture(
+            num_steps,
+            split,
+            scheduler_cls_orig,
+            expected_tss,
+            num_train_timesteps=pipe_1.scheduler.config.num_train_timesteps,
+        ):
+            inputs = self.get_dummy_inputs(torch_device)
+            inputs["num_inference_steps"] = num_steps
+
+            class scheduler_cls(scheduler_cls_orig):
+                pass
+
+            pipe_1.scheduler = scheduler_cls.from_config(pipe_1.scheduler.config)
+            pipe_2.scheduler = scheduler_cls.from_config(pipe_2.scheduler.config)
+
+            # Let's retrieve the number of timesteps we want to use
+            pipe_1.scheduler.set_timesteps(num_steps)
+            expected_steps = pipe_1.scheduler.timesteps.tolist()
+
+            if pipe_1.scheduler.order == 2:
+                expected_steps_1 = list(filter(lambda ts: ts >= split, expected_tss))
+                expected_steps_2 = expected_steps_1[-1:] + list(filter(lambda ts: ts < split, expected_tss))
+                expected_steps = expected_steps_1 + expected_steps_2
+            else:
+                expected_steps_1 = list(filter(lambda ts: ts >= split, expected_tss))
+                expected_steps_2 = list(filter(lambda ts: ts < split, expected_tss))
+
+            # now we monkey patch step `done_steps`
+            # list into the step function for testing
+            done_steps = []
+            old_step = copy.copy(scheduler_cls.step)
+
+            def new_step(self, *args, **kwargs):
+                done_steps.append(args[1].cpu().item())  # args[1] is always the passed `t`
+                return old_step(self, *args, **kwargs)
+
+            scheduler_cls.step = new_step
+
+            inputs_1 = {
+                **inputs,
+                **{
+                    "denoising_end": 1.0 - (split / num_train_timesteps),
+                    "output_type": "latent",
+                },
+            }
+            latents = pipe_1(**inputs_1).images[0]
+
+            assert expected_steps_1 == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}"
+
+            inputs_2 = {
+                **inputs,
+                **{
+                    "denoising_start": 1.0 - (split / num_train_timesteps),
+                    "image": latents,
+                },
+            }
+            pipe_2(**inputs_2).images[0]
+
+            assert expected_steps_2 == done_steps[len(expected_steps_1) :]
+            assert expected_steps == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}"
+
+        steps = 10
+        for split in [300, 700]:
+            for scheduler_cls_timesteps in [
+                (EulerDiscreteScheduler, [901, 801, 701, 601, 501, 401, 301, 201, 101, 1]),
+                (
+                    HeunDiscreteScheduler,
+                    [
+                        901.0,
+                        801.0,
+                        801.0,
+                        701.0,
+                        701.0,
+                        601.0,
+                        601.0,
+                        501.0,
+                        501.0,
+                        401.0,
+                        401.0,
+                        301.0,
+                        301.0,
+                        201.0,
+                        201.0,
+                        101.0,
+                        101.0,
+                        1.0,
+                        1.0,
+                    ],
+                ),
+            ]:
+                assert_run_mixture(steps, split, scheduler_cls_timesteps[0], scheduler_cls_timesteps[1])
+
+
+class StableDiffusionXLMultiControlNetPipelineFastTests(
+    PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionXLControlNetPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = frozenset([])  # TO_DO: add image_params once refactored VaeImageProcessor.preprocess
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        torch.manual_seed(0)
+
+        def init_weights(m):
+            if isinstance(m, torch.nn.Conv2d):
+                torch.nn.init.normal_(m.weight)
+                m.bias.data.fill_(1.0)
+
+        controlnet1 = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            conditioning_embedding_out_channels=(16, 32),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        controlnet1.controlnet_down_blocks.apply(init_weights)
+
+        torch.manual_seed(0)
+        controlnet2 = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            conditioning_embedding_out_channels=(16, 32),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        controlnet2.controlnet_down_blocks.apply(init_weights)
+
+        torch.manual_seed(0)
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        controlnet = MultiControlNetModel([controlnet1, controlnet2])
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+
+        images = [
+            randn_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                generator=generator,
+                device=torch.device(device),
+            ),
+            randn_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                generator=generator,
+                device=torch.device(device),
+            ),
+        ]
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": images,
+        }
+
+        return inputs
+
+    def test_control_guidance_switch(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+
+        scale = 10.0
+        steps = 4
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_1 = pipe(**inputs)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_2 = pipe(**inputs, control_guidance_start=0.1, control_guidance_end=0.2)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_3 = pipe(**inputs, control_guidance_start=[0.1, 0.3], control_guidance_end=[0.2, 0.7])[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_4 = pipe(**inputs, control_guidance_start=0.4, control_guidance_end=[0.5, 0.8])[0]
+
+        # make sure that all outputs are different
+        assert np.sum(np.abs(output_1 - output_2)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_3)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_4)) > 1e-3
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    def test_save_load_optional_components(self):
+        return self._test_save_load_optional_components()
+
+
+class StableDiffusionXLMultiControlNetOneModelPipelineFastTests(
+    PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionXLControlNetPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = frozenset([])  # TO_DO: add image_params once refactored VaeImageProcessor.preprocess
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        torch.manual_seed(0)
+
+        def init_weights(m):
+            if isinstance(m, torch.nn.Conv2d):
+                torch.nn.init.normal_(m.weight)
+                m.bias.data.fill_(1.0)
+
+        controlnet = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            conditioning_embedding_out_channels=(16, 32),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        controlnet.controlnet_down_blocks.apply(init_weights)
+
+        torch.manual_seed(0)
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        controlnet = MultiControlNetModel([controlnet])
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+        images = [
+            randn_tensor(
+                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+                generator=generator,
+                device=torch.device(device),
+            ),
+        ]
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": images,
+        }
+
+        return inputs
+
+    def test_control_guidance_switch(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+
+        scale = 10.0
+        steps = 4
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_1 = pipe(**inputs)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_2 = pipe(**inputs, control_guidance_start=0.1, control_guidance_end=0.2)[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_3 = pipe(
+            **inputs,
+            control_guidance_start=[0.1],
+            control_guidance_end=[0.2],
+        )[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = steps
+        inputs["controlnet_conditioning_scale"] = scale
+        output_4 = pipe(**inputs, control_guidance_start=0.4, control_guidance_end=[0.5])[0]
+
+        # make sure that all outputs are different
+        assert np.sum(np.abs(output_1 - output_2)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_3)) > 1e-3
+        assert np.sum(np.abs(output_1 - output_4)) > 1e-3
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    def test_negative_conditions(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice_without_neg_cond = image[0, -3:, -3:, -1]
+
+        image = pipe(
+            **inputs,
+            negative_original_size=(512, 512),
+            negative_crops_coords_top_left=(0, 0),
+            negative_target_size=(1024, 1024),
+        ).images
+        image_slice_with_neg_cond = image[0, -3:, -3:, -1]
+
+        self.assertTrue(np.abs(image_slice_without_neg_cond - image_slice_with_neg_cond).max() > 1e-2)
+
+
+@slow
+@require_torch_gpu
+class ControlNetSDXLPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_canny(self):
+        controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0")
+
+        pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet
+        )
+        pipe.enable_sequential_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        )
+
+        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
+
+        assert images[0].shape == (768, 512, 3)
+
+        original_image = images[0, -3:, -3:, -1].flatten()
+        expected_image = np.array([0.4185, 0.4127, 0.4089, 0.4046, 0.4115, 0.4096, 0.4081, 0.4112, 0.3913])
+        assert np.allclose(original_image, expected_image, atol=1e-04)
+
+    def test_depth(self):
+        controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0")
+
+        pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet
+        )
+        pipe.enable_sequential_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "Stormtrooper's lecture"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
+        )
+
+        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
+
+        assert images[0].shape == (512, 512, 3)
+
+        original_image = images[0, -3:, -3:, -1].flatten()
+        expected_image = np.array([0.4399, 0.5112, 0.5478, 0.4314, 0.472, 0.4823, 0.4647, 0.4957, 0.4853])
+        assert np.allclose(original_image, expected_image, atol=1e-04)
+
+
+class StableDiffusionSSD1BControlNetPipelineFastTests(StableDiffusionXLControlNetPipelineFastTests):
+    def test_controlnet_sdxl_guess(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["guess_mode"] = True
+
+        output = sd_pipe(**inputs)
+        image_slice = output.images[0, -3:, -3:, -1]
+
+        expected_slice = np.array([0.7212, 0.5890, 0.5491, 0.6425, 0.5970, 0.6091, 0.4418, 0.4556, 0.5032])
+
+        # make sure that it's equal
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.7212, 0.5890, 0.5491, 0.6425, 0.5970, 0.6091, 0.4418, 0.4556, 0.5032])
+
+        return super().test_ip_adapter(from_ssd1b=True, expected_pipe_slice=expected_pipe_slice)
+
+    def test_controlnet_sdxl_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLControlNetPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.6787, 0.5117, 0.5558, 0.6963, 0.6571, 0.5928, 0.4121, 0.5468, 0.5057])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_conditioning_channels(self):
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            mid_block_type="UNetMidBlock2D",
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+            time_cond_proj_dim=None,
+        )
+
+        controlnet = ControlNetModel.from_unet(unet, conditioning_channels=4)
+        assert type(controlnet.mid_block) == UNetMidBlock2D
+        assert controlnet.conditioning_channels == 4
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            mid_block_type="UNetMidBlock2D",
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+            time_cond_proj_dim=time_cond_proj_dim,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            conditioning_embedding_out_channels=(16, 32),
+            mid_block_type="UNetMidBlock2D",
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        torch.manual_seed(0)
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
diff --git a/diffusers/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py b/diffusers/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a5976bd0ddabb89bea34060b3b208129bfb0830
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py
@@ -0,0 +1,363 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    EulerDiscreteScheduler,
+    StableDiffusionXLControlNetImg2ImgPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class ControlNetPipelineSDXLImg2ImgFastTests(
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionXLControlNetImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
+        {"add_text_embeds", "add_time_ids", "add_neg_time_ids"}
+    )
+
+    def get_dummy_components(self, skip_first_text_encoder=False):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64 if not skip_first_text_encoder else 32,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            conditioning_embedding_out_channels=(16, 32),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        torch.manual_seed(0)
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder if not skip_first_text_encoder else None,
+            "tokenizer": tokenizer if not skip_first_text_encoder else None,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "image_encoder": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        controlnet_embedder_scale_factor = 2
+        image = floats_tensor(
+            (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+            rng=random.Random(seed),
+        ).to(device)
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": image,
+            "control_image": image,
+        }
+
+        return inputs
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.6276, 0.5271, 0.5205, 0.5393, 0.5774, 0.5872, 0.5456, 0.5415, 0.5354])
+        # TODO: update after slices.p
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_stable_diffusion_xl_controlnet_img2img(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [0.5557202, 0.46418434, 0.46983826, 0.623529, 0.5557242, 0.49262643, 0.6070508, 0.5702978, 0.43777135]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_xl_controlnet_img2img_guess(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["guess_mode"] = True
+
+        output = sd_pipe(**inputs)
+        image_slice = output.images[0, -3:, -3:, -1]
+        assert output.images.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [0.5557202, 0.46418434, 0.46983826, 0.623529, 0.5557242, 0.49262643, 0.6070508, 0.5702978, 0.43777135]
+        )
+
+        # make sure that it's equal
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    # TODO(Patrick, Sayak) - skip for now as this requires more refiner tests
+    def test_save_load_optional_components(self):
+        pass
+
+    @require_torch_gpu
+    def test_stable_diffusion_xl_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            pipe.unet.set_default_attn_processor()
+
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_stable_diffusion_xl_multi_prompts(self):
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+
+        # forward with single prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = inputs["prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "different prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+        # manually set a negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same negative_prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = inputs["negative_prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = "different negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+    # Copied from test_stable_diffusion_xl.py
+    def test_stable_diffusion_xl_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = 2 * [inputs["prompt"]]
+        inputs["num_images_per_prompt"] = 2
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 2 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = sd_pipe.encode_prompt(prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
diff --git a/diffusers/tests/pipelines/controlnet/test_flax_controlnet.py b/diffusers/tests/pipelines/controlnet/test_flax_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf5564e810ef221a8bbd579dcce923702bb812a7
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet/test_flax_controlnet.py
@@ -0,0 +1,127 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+from diffusers import FlaxControlNetModel, FlaxStableDiffusionControlNetPipeline
+from diffusers.utils import is_flax_available, load_image
+from diffusers.utils.testing_utils import require_flax, slow
+
+
+if is_flax_available():
+    import jax
+    import jax.numpy as jnp
+    from flax.jax_utils import replicate
+    from flax.training.common_utils import shard
+
+
+@slow
+@require_flax
+class FlaxControlNetPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+
+    def test_canny(self):
+        controlnet, controlnet_params = FlaxControlNetModel.from_pretrained(
+            "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.bfloat16
+        )
+        pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, from_pt=True, dtype=jnp.bfloat16
+        )
+        params["controlnet"] = controlnet_params
+
+        prompts = "bird"
+        num_samples = jax.device_count()
+        prompt_ids = pipe.prepare_text_inputs([prompts] * num_samples)
+
+        canny_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        )
+        processed_image = pipe.prepare_image_inputs([canny_image] * num_samples)
+
+        rng = jax.random.PRNGKey(0)
+        rng = jax.random.split(rng, jax.device_count())
+
+        p_params = replicate(params)
+        prompt_ids = shard(prompt_ids)
+        processed_image = shard(processed_image)
+
+        images = pipe(
+            prompt_ids=prompt_ids,
+            image=processed_image,
+            params=p_params,
+            prng_seed=rng,
+            num_inference_steps=50,
+            jit=True,
+        ).images
+        assert images.shape == (jax.device_count(), 1, 768, 512, 3)
+
+        images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
+        image_slice = images[0, 253:256, 253:256, -1]
+
+        output_slice = jnp.asarray(jax.device_get(image_slice.flatten()))
+        expected_slice = jnp.array(
+            [0.167969, 0.116699, 0.081543, 0.154297, 0.132812, 0.108887, 0.169922, 0.169922, 0.205078]
+        )
+        print(f"output_slice: {output_slice}")
+        assert jnp.abs(output_slice - expected_slice).max() < 1e-2
+
+    def test_pose(self):
+        controlnet, controlnet_params = FlaxControlNetModel.from_pretrained(
+            "lllyasviel/sd-controlnet-openpose", from_pt=True, dtype=jnp.bfloat16
+        )
+        pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, from_pt=True, dtype=jnp.bfloat16
+        )
+        params["controlnet"] = controlnet_params
+
+        prompts = "Chef in the kitchen"
+        num_samples = jax.device_count()
+        prompt_ids = pipe.prepare_text_inputs([prompts] * num_samples)
+
+        pose_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png"
+        )
+        processed_image = pipe.prepare_image_inputs([pose_image] * num_samples)
+
+        rng = jax.random.PRNGKey(0)
+        rng = jax.random.split(rng, jax.device_count())
+
+        p_params = replicate(params)
+        prompt_ids = shard(prompt_ids)
+        processed_image = shard(processed_image)
+
+        images = pipe(
+            prompt_ids=prompt_ids,
+            image=processed_image,
+            params=p_params,
+            prng_seed=rng,
+            num_inference_steps=50,
+            jit=True,
+        ).images
+        assert images.shape == (jax.device_count(), 1, 768, 512, 3)
+
+        images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
+        image_slice = images[0, 253:256, 253:256, -1]
+
+        output_slice = jnp.asarray(jax.device_get(image_slice.flatten()))
+        expected_slice = jnp.array(
+            [[0.271484, 0.261719, 0.275391, 0.277344, 0.279297, 0.291016, 0.294922, 0.302734, 0.302734]]
+        )
+        print(f"output_slice: {output_slice}")
+        assert jnp.abs(output_slice - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/controlnet_flux/__init__.py b/diffusers/tests/pipelines/controlnet_flux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/controlnet_flux/test_controlnet_flux.py b/diffusers/tests/pipelines/controlnet_flux/test_controlnet_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2db28bdda35967781e02aa18ddd762bf2e03ce5
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet_flux/test_controlnet_flux.py
@@ -0,0 +1,233 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc and The InstantX Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    FluxControlNetPipeline,
+    FluxTransformer2DModel,
+)
+from diffusers.models import FluxControlNetModel
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class FluxControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = FluxControlNetPipeline
+
+    params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
+    batch_params = frozenset(["prompt"])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = FluxTransformer2DModel(
+            patch_size=1,
+            in_channels=16,
+            num_layers=1,
+            num_single_layers=1,
+            attention_head_dim=16,
+            num_attention_heads=2,
+            joint_attention_dim=32,
+            pooled_projection_dim=32,
+            axes_dims_rope=[4, 4, 8],
+        )
+
+        torch.manual_seed(0)
+        controlnet = FluxControlNetModel(
+            patch_size=1,
+            in_channels=16,
+            num_layers=1,
+            num_single_layers=1,
+            attention_head_dim=16,
+            num_attention_heads=2,
+            joint_attention_dim=32,
+            pooled_projection_dim=32,
+            axes_dims_rope=[4, 4, 8],
+        )
+
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModel(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = T5TokenizerFast.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=4,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "transformer": transformer,
+            "vae": vae,
+            "controlnet": controlnet,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        control_image = randn_tensor(
+            (1, 3, 32, 32),
+            generator=generator,
+            device=torch.device(device),
+            dtype=torch.float16,
+        )
+
+        controlnet_conditioning_scale = 0.5
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 3.5,
+            "output_type": "np",
+            "control_image": control_image,
+            "controlnet_conditioning_scale": controlnet_conditioning_scale,
+        }
+
+        return inputs
+
+    def test_controlnet_flux(self):
+        components = self.get_dummy_components()
+        flux_pipe = FluxControlNetPipeline(**components)
+        flux_pipe = flux_pipe.to(torch_device, dtype=torch.float16)
+        flux_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = flux_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+
+        expected_slice = np.array(
+            [0.7348633, 0.41333008, 0.6621094, 0.5444336, 0.47607422, 0.5859375, 0.44677734, 0.4506836, 0.40454102]
+        )
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f"Expected: {expected_slice}, got: {image_slice.flatten()}"
+
+    @unittest.skip("xFormersAttnProcessor does not work with SD3 Joint Attention")
+    def test_xformers_attention_forwardGenerator_pass(self):
+        pass
+
+
+@slow
+@require_torch_gpu
+class FluxControlNetPipelineSlowTests(unittest.TestCase):
+    pipeline_class = FluxControlNetPipeline
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_canny(self):
+        controlnet = FluxControlNetModel.from_pretrained(
+            "InstantX/FLUX.1-dev-Controlnet-Canny-alpha", torch_dtype=torch.bfloat16
+        )
+        pipe = FluxControlNetPipeline.from_pretrained(
+            "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "A girl in city, 25 years old, cool, futuristic"
+        control_image = load_image(
+            "https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Canny-alpha/resolve/main/canny.jpg"
+        )
+
+        output = pipe(
+            prompt,
+            control_image=control_image,
+            controlnet_conditioning_scale=0.6,
+            num_inference_steps=2,
+            guidance_scale=3.5,
+            output_type="np",
+            generator=generator,
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (1024, 1024, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+
+        expected_image = np.array(
+            [0.33007812, 0.33984375, 0.33984375, 0.328125, 0.34179688, 0.33984375, 0.30859375, 0.3203125, 0.3203125]
+        )
+
+        assert np.abs(original_image.flatten() - expected_image).max() < 1e-2
diff --git a/diffusers/tests/pipelines/controlnet_flux/test_controlnet_flux_img2img.py b/diffusers/tests/pipelines/controlnet_flux/test_controlnet_flux_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c0e948861f724e88abcaf9119e7e12cd1630498
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet_flux/test_controlnet_flux_img2img.py
@@ -0,0 +1,291 @@
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    FluxControlNetImg2ImgPipeline,
+    FluxControlNetModel,
+    FluxTransformer2DModel,
+)
+from diffusers.utils.testing_utils import (
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fusion_matches_attn_procs_length,
+    check_qkv_fusion_processors_exist,
+)
+
+
+class FluxControlNetImg2ImgPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = FluxControlNetImg2ImgPipeline
+    params = frozenset(
+        [
+            "prompt",
+            "image",
+            "control_image",
+            "height",
+            "width",
+            "strength",
+            "guidance_scale",
+            "controlnet_conditioning_scale",
+            "prompt_embeds",
+            "pooled_prompt_embeds",
+        ]
+    )
+    batch_params = frozenset(["prompt", "image", "control_image"])
+
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = FluxTransformer2DModel(
+            patch_size=1,
+            in_channels=4,
+            num_layers=1,
+            num_single_layers=1,
+            attention_head_dim=16,
+            num_attention_heads=2,
+            joint_attention_dim=32,
+            pooled_projection_dim=32,
+            axes_dims_rope=[4, 4, 8],
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModel(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=1,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        torch.manual_seed(0)
+        controlnet = FluxControlNetModel(
+            in_channels=4,
+            num_layers=1,
+            num_single_layers=1,
+            attention_head_dim=16,
+            num_attention_heads=2,
+            joint_attention_dim=32,
+            pooled_projection_dim=32,
+            axes_dims_rope=[4, 4, 8],
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "transformer": transformer,
+            "vae": vae,
+            "controlnet": controlnet,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        image = torch.randn(1, 3, 32, 32).to(device)
+        control_image = torch.randn(1, 3, 32, 32).to(device)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "control_image": control_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "controlnet_conditioning_scale": 1.0,
+            "strength": 0.8,
+            "height": 32,
+            "width": 32,
+            "max_sequence_length": 48,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_flux_controlnet_different_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "a different prompt"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        assert max_diff > 1e-6
+
+    def test_flux_controlnet_prompt_embeds(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        output_with_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        (prompt_embeds, pooled_prompt_embeds, text_ids) = pipe.encode_prompt(
+            prompt,
+            prompt_2=None,
+            device=torch_device,
+            max_sequence_length=inputs["max_sequence_length"],
+        )
+        output_with_embeds = pipe(
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            **inputs,
+        ).images[0]
+
+        max_diff = np.abs(output_with_prompt - output_with_embeds).max()
+        assert max_diff < 1e-4
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        pipe.transformer.fuse_qkv_projections()
+        assert check_qkv_fusion_processors_exist(
+            pipe.transformer
+        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+        assert check_qkv_fusion_matches_attn_procs_length(
+            pipe.transformer, pipe.transformer.original_attn_processors
+        ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_fused = image[0, -3:, -3:, -1]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_disabled = image[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+
+@slow
+@require_torch_gpu
+class FluxControlNetImg2ImgPipelineSlowTests(unittest.TestCase):
+    pipeline_class = FluxControlNetImg2ImgPipeline
+    repo_id = "black-forest-labs/FLUX.1-schnell"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        image = torch.randn(1, 3, 64, 64).to(device)
+        control_image = torch.randn(1, 3, 64, 64).to(device)
+
+        return {
+            "prompt": "A photo of a cat",
+            "image": image,
+            "control_image": control_image,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "controlnet_conditioning_scale": 1.0,
+            "strength": 0.8,
+            "output_type": "np",
+            "generator": generator,
+        }
+
+    @unittest.skip("We cannot run inference on this model with the current CI hardware")
+    def test_flux_controlnet_img2img_inference(self):
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.bfloat16)
+        pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+
+        image = pipe(**inputs).images[0]
+        image_slice = image[0, :10, :10]
+        expected_slice = np.array(
+            [
+                [0.36132812, 0.30004883, 0.25830078],
+                [0.36669922, 0.31103516, 0.23754883],
+                [0.34814453, 0.29248047, 0.23583984],
+                [0.35791016, 0.30981445, 0.23999023],
+                [0.36328125, 0.31274414, 0.2607422],
+                [0.37304688, 0.32177734, 0.26171875],
+                [0.3671875, 0.31933594, 0.25756836],
+                [0.36035156, 0.31103516, 0.2578125],
+                [0.3857422, 0.33789062, 0.27563477],
+                [0.3701172, 0.31982422, 0.265625],
+            ],
+            dtype=np.float32,
+        )
+
+        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
+
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/controlnet_flux/test_controlnet_flux_inpaint.py b/diffusers/tests/pipelines/controlnet_flux/test_controlnet_flux_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..d66eaaf6a76f452076b863a8243c604f0d9f84bd
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet_flux/test_controlnet_flux_inpaint.py
@@ -0,0 +1,194 @@
+import random
+import unittest
+
+import numpy as np
+import torch
+
+# torch_device,  # {{ edit_1 }} Removed unused import
+from transformers import (
+    AutoTokenizer,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+    T5EncoderModel,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    FluxControlNetInpaintPipeline,
+    FluxControlNetModel,
+    FluxTransformer2DModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class FluxControlNetInpaintPipelineTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = FluxControlNetInpaintPipeline
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "guidance_scale",
+            "prompt_embeds",
+            "pooled_prompt_embeds",
+            "image",
+            "mask_image",
+            "control_image",
+            "strength",
+            "num_inference_steps",
+            "controlnet_conditioning_scale",
+        ]
+    )
+    batch_params = frozenset(["prompt", "image", "mask_image", "control_image"])
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = FluxTransformer2DModel(
+            patch_size=1,
+            in_channels=8,
+            num_layers=1,
+            num_single_layers=1,
+            attention_head_dim=16,
+            num_attention_heads=2,
+            joint_attention_dim=32,
+            pooled_projection_dim=32,
+            axes_dims_rope=[4, 4, 8],
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModel(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=2,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        torch.manual_seed(0)
+        controlnet = FluxControlNetModel(
+            patch_size=1,
+            in_channels=8,
+            num_layers=1,
+            num_single_layers=1,
+            attention_head_dim=16,
+            num_attention_heads=2,
+            joint_attention_dim=32,
+            pooled_projection_dim=32,
+            axes_dims_rope=[4, 4, 8],
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "transformer": transformer,
+            "vae": vae,
+            "controlnet": controlnet,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        mask_image = torch.ones((1, 1, 32, 32)).to(device)
+        control_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "mask_image": mask_image,
+            "control_image": control_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 32,
+            "width": 32,
+            "max_sequence_length": 48,
+            "strength": 0.8,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_flux_controlnet_inpaint_with_num_images_per_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_images_per_prompt"] = 2
+        output = pipe(**inputs)
+        images = output.images
+
+        assert images.shape == (2, 32, 32, 3)
+
+    def test_flux_controlnet_inpaint_with_controlnet_conditioning_scale(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output_default = pipe(**inputs)
+        image_default = output_default.images
+
+        inputs["controlnet_conditioning_scale"] = 0.5
+        output_scaled = pipe(**inputs)
+        image_scaled = output_scaled.images
+
+        # Ensure that changing the controlnet_conditioning_scale produces a different output
+        assert not np.allclose(image_default, image_scaled, atol=0.01)
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
diff --git a/diffusers/tests/pipelines/controlnet_hunyuandit/__init__.py b/diffusers/tests/pipelines/controlnet_hunyuandit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py b/diffusers/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py
new file mode 100644
index 0000000000000000000000000000000000000000..5500c7bd1c81c286b644473a999529ee577e5d4d
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py
@@ -0,0 +1,350 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc and Tencent Hunyuan Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, BertModel, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    HunyuanDiT2DModel,
+    HunyuanDiTControlNetPipeline,
+)
+from diffusers.models import HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class HunyuanDiTControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = HunyuanDiTControlNetPipeline
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "guidance_scale",
+            "negative_prompt",
+            "prompt_embeds",
+            "negative_prompt_embeds",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt"])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = HunyuanDiT2DModel(
+            sample_size=16,
+            num_layers=4,
+            patch_size=2,
+            attention_head_dim=8,
+            num_attention_heads=3,
+            in_channels=4,
+            cross_attention_dim=32,
+            cross_attention_dim_t5=32,
+            pooled_projection_dim=16,
+            hidden_size=24,
+            activation_fn="gelu-approximate",
+        )
+
+        torch.manual_seed(0)
+        controlnet = HunyuanDiT2DControlNetModel(
+            sample_size=16,
+            transformer_num_layers=4,
+            patch_size=2,
+            attention_head_dim=8,
+            num_attention_heads=3,
+            in_channels=4,
+            cross_attention_dim=32,
+            cross_attention_dim_t5=32,
+            pooled_projection_dim=16,
+            hidden_size=24,
+            activation_fn="gelu-approximate",
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL()
+
+        scheduler = DDPMScheduler()
+        text_encoder = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "controlnet": controlnet,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        control_image = randn_tensor(
+            (1, 3, 16, 16),
+            generator=generator,
+            device=torch.device(device),
+            dtype=torch.float16,
+        )
+
+        controlnet_conditioning_scale = 0.5
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "control_image": control_image,
+            "controlnet_conditioning_scale": controlnet_conditioning_scale,
+        }
+
+        return inputs
+
+    def test_controlnet_hunyuandit(self):
+        components = self.get_dummy_components()
+        pipe = HunyuanDiTControlNetPipeline(**components)
+        pipe = pipe.to(torch_device, dtype=torch.float16)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+        assert image.shape == (1, 16, 16, 3)
+
+        expected_slice = np.array(
+            [0.6953125, 0.89208984, 0.59375, 0.5078125, 0.5786133, 0.6035156, 0.5839844, 0.53564453, 0.52246094]
+        )
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f"Expected: {expected_slice}, got: {image_slice.flatten()}"
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            expected_max_diff=1e-3,
+        )
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        # TODO(YiYi) need to fix later
+        pass
+
+    def test_sequential_offload_forward_pass_twice(self):
+        # TODO(YiYi) need to fix later
+        pass
+
+    def test_save_load_optional_components(self):
+        # TODO(YiYi) need to fix later
+        pass
+
+
+@slow
+@require_torch_gpu
+class HunyuanDiTControlNetPipelineSlowTests(unittest.TestCase):
+    pipeline_class = HunyuanDiTControlNetPipeline
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_canny(self):
+        controlnet = HunyuanDiT2DControlNetModel.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Canny", torch_dtype=torch.float16
+        )
+        pipe = HunyuanDiTControlNetPipeline.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "At night, an ancient Chinese-style lion statue stands in front of the hotel, its eyes gleaming as if guarding the building. The background is the hotel entrance at night, with a close-up, eye-level, and centered composition. This photo presents a realistic photographic style, embodies Chinese sculpture culture, and reveals a mysterious atmosphere."
+        n_prompt = ""
+        control_image = load_image(
+            "https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Canny/resolve/main/canny.jpg?download=true"
+        )
+
+        output = pipe(
+            prompt,
+            negative_prompt=n_prompt,
+            control_image=control_image,
+            controlnet_conditioning_scale=0.5,
+            guidance_scale=5.0,
+            num_inference_steps=2,
+            output_type="np",
+            generator=generator,
+        )
+        image = output.images[0]
+
+        assert image.shape == (1024, 1024, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+
+        expected_image = np.array(
+            [0.43652344, 0.4399414, 0.44921875, 0.45043945, 0.45703125, 0.44873047, 0.43579102, 0.44018555, 0.42578125]
+        )
+
+        assert np.abs(original_image.flatten() - expected_image).max() < 1e-2
+
+    def test_pose(self):
+        controlnet = HunyuanDiT2DControlNetModel.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Pose", torch_dtype=torch.float16
+        )
+        pipe = HunyuanDiTControlNetPipeline.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "An Asian woman, dressed in a green top, wearing a purple headscarf and a purple scarf, stands in front of a blackboard. The background is the blackboard. The photo is presented in a close-up, eye-level, and centered composition, adopting a realistic photographic style"
+        n_prompt = ""
+        control_image = load_image(
+            "https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Pose/resolve/main/pose.jpg?download=true"
+        )
+
+        output = pipe(
+            prompt,
+            negative_prompt=n_prompt,
+            control_image=control_image,
+            controlnet_conditioning_scale=0.5,
+            guidance_scale=5.0,
+            num_inference_steps=2,
+            output_type="np",
+            generator=generator,
+        )
+        image = output.images[0]
+
+        assert image.shape == (1024, 1024, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+
+        expected_image = np.array(
+            [0.4091797, 0.4177246, 0.39526367, 0.4194336, 0.40356445, 0.3857422, 0.39208984, 0.40429688, 0.37451172]
+        )
+
+        assert np.abs(original_image.flatten() - expected_image).max() < 1e-2
+
+    def test_depth(self):
+        controlnet = HunyuanDiT2DControlNetModel.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Depth", torch_dtype=torch.float16
+        )
+        pipe = HunyuanDiTControlNetPipeline.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "In the dense forest, a black and white panda sits quietly in green trees and red flowers, surrounded by mountains, rivers, and the ocean. The background is the forest in a bright environment."
+        n_prompt = ""
+        control_image = load_image(
+            "https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Depth/resolve/main/depth.jpg?download=true"
+        )
+
+        output = pipe(
+            prompt,
+            negative_prompt=n_prompt,
+            control_image=control_image,
+            controlnet_conditioning_scale=0.5,
+            guidance_scale=5.0,
+            num_inference_steps=2,
+            output_type="np",
+            generator=generator,
+        )
+        image = output.images[0]
+
+        assert image.shape == (1024, 1024, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+
+        expected_image = np.array(
+            [0.31982422, 0.32177734, 0.30126953, 0.3190918, 0.3100586, 0.31396484, 0.3232422, 0.33544922, 0.30810547]
+        )
+
+        assert np.abs(original_image.flatten() - expected_image).max() < 1e-2
+
+    def test_multi_controlnet(self):
+        controlnet = HunyuanDiT2DControlNetModel.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Canny", torch_dtype=torch.float16
+        )
+        controlnet = HunyuanDiT2DMultiControlNetModel([controlnet, controlnet])
+
+        pipe = HunyuanDiTControlNetPipeline.from_pretrained(
+            "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "At night, an ancient Chinese-style lion statue stands in front of the hotel, its eyes gleaming as if guarding the building. The background is the hotel entrance at night, with a close-up, eye-level, and centered composition. This photo presents a realistic photographic style, embodies Chinese sculpture culture, and reveals a mysterious atmosphere."
+        n_prompt = ""
+        control_image = load_image(
+            "https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Canny/resolve/main/canny.jpg?download=true"
+        )
+
+        output = pipe(
+            prompt,
+            negative_prompt=n_prompt,
+            control_image=[control_image, control_image],
+            controlnet_conditioning_scale=[0.25, 0.25],
+            guidance_scale=5.0,
+            num_inference_steps=2,
+            output_type="np",
+            generator=generator,
+        )
+        image = output.images[0]
+
+        assert image.shape == (1024, 1024, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+        expected_image = np.array(
+            [0.43652344, 0.44018555, 0.4494629, 0.44995117, 0.45654297, 0.44848633, 0.43603516, 0.4404297, 0.42626953]
+        )
+
+        assert np.abs(original_image.flatten() - expected_image).max() < 1e-2
diff --git a/diffusers/tests/pipelines/controlnet_sd3/__init__.py b/diffusers/tests/pipelines/controlnet_sd3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py b/diffusers/tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a2a0019d68b86871a338af7f915359a68d9c89e
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py
@@ -0,0 +1,201 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+    StableDiffusion3ControlNetInpaintingPipeline,
+)
+from diffusers.models import SD3ControlNetModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableDiffusion3ControlInpaintNetPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = StableDiffusion3ControlNetInpaintingPipeline
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "guidance_scale",
+            "negative_prompt",
+            "prompt_embeds",
+            "negative_prompt_embeds",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt"])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = SD3Transformer2DModel(
+            sample_size=32,
+            patch_size=1,
+            in_channels=8,
+            num_layers=4,
+            attention_head_dim=8,
+            num_attention_heads=4,
+            joint_attention_dim=32,
+            caption_projection_dim=32,
+            pooled_projection_dim=64,
+            out_channels=8,
+        )
+
+        torch.manual_seed(0)
+        controlnet = SD3ControlNetModel(
+            sample_size=32,
+            patch_size=1,
+            in_channels=8,
+            num_layers=1,
+            attention_head_dim=8,
+            num_attention_heads=4,
+            joint_attention_dim=32,
+            caption_projection_dim=32,
+            pooled_projection_dim=64,
+            out_channels=8,
+            extra_conditioning_channels=1,
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=8,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "text_encoder_3": text_encoder_3,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "tokenizer_3": tokenizer_3,
+            "transformer": transformer,
+            "vae": vae,
+            "controlnet": controlnet,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        control_image = randn_tensor(
+            (1, 3, 32, 32),
+            generator=generator,
+            device=torch.device(device),
+            dtype=torch.float16,
+        )
+
+        control_mask = randn_tensor(
+            (1, 1, 32, 32),
+            generator=generator,
+            device=torch.device(device),
+            dtype=torch.float16,
+        )
+
+        controlnet_conditioning_scale = 0.95
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 7.0,
+            "output_type": "np",
+            "control_image": control_image,
+            "control_mask": control_mask,
+            "controlnet_conditioning_scale": controlnet_conditioning_scale,
+        }
+
+        return inputs
+
+    def test_controlnet_inpaint_sd3(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusion3ControlNetInpaintingPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device, dtype=torch.float16)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+
+        expected_slice = np.array(
+            [0.51708984, 0.7421875, 0.4580078, 0.6435547, 0.65625, 0.43603516, 0.5151367, 0.65722656, 0.60839844]
+        )
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f"Expected: {expected_slice}, got: {image_slice.flatten()}"
+
+    @unittest.skip("xFormersAttnProcessor does not work with SD3 Joint Attention")
+    def test_xformers_attention_forwardGenerator_pass(self):
+        pass
diff --git a/diffusers/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py b/diffusers/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..74cb56e0337ad2b9a800260613ed62dfb231914b
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
@@ -0,0 +1,351 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc and The InstantX Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+    StableDiffusion3ControlNetPipeline,
+)
+from diffusers.models import SD3ControlNetModel, SD3MultiControlNetModel
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableDiffusion3ControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = StableDiffusion3ControlNetPipeline
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "guidance_scale",
+            "negative_prompt",
+            "prompt_embeds",
+            "negative_prompt_embeds",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt"])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = SD3Transformer2DModel(
+            sample_size=32,
+            patch_size=1,
+            in_channels=8,
+            num_layers=4,
+            attention_head_dim=8,
+            num_attention_heads=4,
+            joint_attention_dim=32,
+            caption_projection_dim=32,
+            pooled_projection_dim=64,
+            out_channels=8,
+        )
+
+        torch.manual_seed(0)
+        controlnet = SD3ControlNetModel(
+            sample_size=32,
+            patch_size=1,
+            in_channels=8,
+            num_layers=1,
+            attention_head_dim=8,
+            num_attention_heads=4,
+            joint_attention_dim=32,
+            caption_projection_dim=32,
+            pooled_projection_dim=64,
+            out_channels=8,
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=8,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "text_encoder_3": text_encoder_3,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "tokenizer_3": tokenizer_3,
+            "transformer": transformer,
+            "vae": vae,
+            "controlnet": controlnet,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        control_image = randn_tensor(
+            (1, 3, 32, 32),
+            generator=generator,
+            device=torch.device(device),
+            dtype=torch.float16,
+        )
+
+        controlnet_conditioning_scale = 0.5
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "control_image": control_image,
+            "controlnet_conditioning_scale": controlnet_conditioning_scale,
+        }
+
+        return inputs
+
+    def test_controlnet_sd3(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusion3ControlNetPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device, dtype=torch.float16)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+
+        expected_slice = np.array([0.5767, 0.7100, 0.5981, 0.5674, 0.5952, 0.4102, 0.5093, 0.5044, 0.6030])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f"Expected: {expected_slice}, got: {image_slice.flatten()}"
+
+    @unittest.skip("xFormersAttnProcessor does not work with SD3 Joint Attention")
+    def test_xformers_attention_forwardGenerator_pass(self):
+        pass
+
+
+@slow
+@require_torch_gpu
+class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
+    pipeline_class = StableDiffusion3ControlNetPipeline
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_canny(self):
+        controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16)
+        pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "Anime style illustration of a girl wearing a suit. A moon in sky. In the background we see a big rain approaching. text 'InstantX' on image"
+        n_prompt = "NSFW, nude, naked, porn, ugly"
+        control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Canny/resolve/main/canny.jpg")
+
+        output = pipe(
+            prompt,
+            negative_prompt=n_prompt,
+            control_image=control_image,
+            controlnet_conditioning_scale=0.5,
+            guidance_scale=5.0,
+            num_inference_steps=2,
+            output_type="np",
+            generator=generator,
+        )
+        image = output.images[0]
+
+        assert image.shape == (1024, 1024, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+
+        expected_image = np.array(
+            [0.20947266, 0.1574707, 0.19897461, 0.15063477, 0.1418457, 0.17285156, 0.14160156, 0.13989258, 0.30810547]
+        )
+
+        assert np.abs(original_image.flatten() - expected_image).max() < 1e-2
+
+    def test_pose(self):
+        controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Pose", torch_dtype=torch.float16)
+        pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = 'Anime style illustration of a girl wearing a suit. A moon in sky. In the background we see a big rain approaching. text "InstantX" on image'
+        n_prompt = "NSFW, nude, naked, porn, ugly"
+        control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Pose/resolve/main/pose.jpg")
+
+        output = pipe(
+            prompt,
+            negative_prompt=n_prompt,
+            control_image=control_image,
+            controlnet_conditioning_scale=0.5,
+            guidance_scale=5.0,
+            num_inference_steps=2,
+            output_type="np",
+            generator=generator,
+        )
+        image = output.images[0]
+
+        assert image.shape == (1024, 1024, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+
+        expected_image = np.array(
+            [0.8671875, 0.86621094, 0.91015625, 0.8491211, 0.87890625, 0.9140625, 0.8300781, 0.8334961, 0.8623047]
+        )
+
+        assert np.abs(original_image.flatten() - expected_image).max() < 1e-2
+
+    def test_tile(self):
+        controlnet = SD3ControlNetModel.from_pretrained("InstantX//SD3-Controlnet-Tile", torch_dtype=torch.float16)
+        pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = 'Anime style illustration of a girl wearing a suit. A moon in sky. In the background we see a big rain approaching. text "InstantX" on image'
+        n_prompt = "NSFW, nude, naked, porn, ugly"
+        control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Tile/resolve/main/tile.jpg")
+
+        output = pipe(
+            prompt,
+            negative_prompt=n_prompt,
+            control_image=control_image,
+            controlnet_conditioning_scale=0.5,
+            guidance_scale=5.0,
+            num_inference_steps=2,
+            output_type="np",
+            generator=generator,
+        )
+        image = output.images[0]
+
+        assert image.shape == (1024, 1024, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+
+        expected_image = np.array(
+            [0.6982422, 0.7011719, 0.65771484, 0.6904297, 0.7416992, 0.6904297, 0.6977539, 0.7080078, 0.6386719]
+        )
+
+        assert np.abs(original_image.flatten() - expected_image).max() < 1e-2
+
+    def test_multi_controlnet(self):
+        controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16)
+        controlnet = SD3MultiControlNetModel([controlnet, controlnet])
+
+        pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "Anime style illustration of a girl wearing a suit. A moon in sky. In the background we see a big rain approaching. text 'InstantX' on image"
+        n_prompt = "NSFW, nude, naked, porn, ugly"
+        control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Canny/resolve/main/canny.jpg")
+
+        output = pipe(
+            prompt,
+            negative_prompt=n_prompt,
+            control_image=[control_image, control_image],
+            controlnet_conditioning_scale=[0.25, 0.25],
+            guidance_scale=5.0,
+            num_inference_steps=2,
+            output_type="np",
+            generator=generator,
+        )
+        image = output.images[0]
+
+        assert image.shape == (1024, 1024, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+        expected_image = np.array(
+            [0.7451172, 0.7416992, 0.7158203, 0.7792969, 0.7607422, 0.7089844, 0.6855469, 0.71777344, 0.7314453]
+        )
+
+        assert np.abs(original_image.flatten() - expected_image).max() < 1e-2
diff --git a/diffusers/tests/pipelines/controlnet_xs/__init__.py b/diffusers/tests/pipelines/controlnet_xs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/controlnet_xs/test_controlnetxs.py b/diffusers/tests/pipelines/controlnet_xs/test_controlnetxs.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb0306741fdb35e8151457852b1f7889c7a68670
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet_xs/test_controlnetxs.py
@@ -0,0 +1,398 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import traceback
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AsymmetricAutoencoderKL,
+    AutoencoderKL,
+    AutoencoderTiny,
+    ConsistencyDecoderVAE,
+    ControlNetXSAdapter,
+    DDIMScheduler,
+    LCMScheduler,
+    StableDiffusionControlNetXSPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    is_torch_compile,
+    load_image,
+    load_numpy,
+    require_torch_2,
+    require_torch_gpu,
+    run_test_in_subprocess,
+    slow,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...models.autoencoders.test_models_vae import (
+    get_asym_autoencoder_kl_config,
+    get_autoencoder_kl_config,
+    get_autoencoder_tiny_config,
+    get_consistency_vae_config,
+)
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDFunctionTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        _ = in_queue.get(timeout=timeout)
+
+        controlnet = ControlNetXSAdapter.from_pretrained(
+            "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
+        )
+        pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-base",
+            controlnet=controlnet,
+            safety_checker=None,
+            torch_dtype=torch.float16,
+        )
+        pipe.to("cuda")
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+
+        output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
+        )
+        expected_image = np.resize(expected_image, (512, 512, 3))
+
+        assert np.abs(expected_image - image).max() < 1.0
+
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+class ControlNetXSPipelineFastTests(
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    SDFunctionTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionControlNetXSPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    test_attention_slicing = False
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=16,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=8,
+            norm_num_groups=4,
+            time_cond_proj_dim=time_cond_proj_dim,
+            use_linear_projection=True,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetXSAdapter.from_unet(
+            unet=unet,
+            size_ratio=1,
+            learn_time_embedding=True,
+            conditioning_embedding_out_channels=(2, 2),
+        )
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=8,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+        image = randn_tensor(
+            (1, 3, 8 * controlnet_embedder_scale_factor, 8 * controlnet_embedder_scale_factor),
+            generator=generator,
+            device=torch.device(device),
+        )
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "numpy",
+            "image": image,
+        }
+
+        return inputs
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    def test_controlnet_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=8)
+        sd_pipe = StableDiffusionControlNetXSPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 16, 16, 3)
+        expected_slice = np.array([0.745, 0.753, 0.767, 0.543, 0.523, 0.502, 0.314, 0.521, 0.478])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the dtype from pipe.components
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_multi_vae(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        block_out_channels = pipe.vae.config.block_out_channels
+        norm_num_groups = pipe.vae.config.norm_num_groups
+
+        vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny]
+        configs = [
+            get_autoencoder_kl_config(block_out_channels, norm_num_groups),
+            get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups),
+            get_consistency_vae_config(block_out_channels, norm_num_groups),
+            get_autoencoder_tiny_config(block_out_channels),
+        ]
+
+        out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
+
+        for vae_cls, config in zip(vae_classes, configs):
+            vae = vae_cls(**config)
+            vae = vae.to(torch_device)
+            components["vae"] = vae
+            vae_pipe = self.pipeline_class(**components)
+
+            # pipeline creates a new UNetControlNetXSModel under the hood, which aren't on device.
+            # So we need to move the new pipe to device.
+            vae_pipe.to(torch_device)
+            vae_pipe.set_progress_bar_config(disable=None)
+
+            out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
+
+            assert out_vae_np.shape == out_np.shape
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the device from pipe.components
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+
+@slow
+@require_torch_gpu
+class ControlNetXSPipelineSlowTests(unittest.TestCase):
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_canny(self):
+        controlnet = ControlNetXSAdapter.from_pretrained(
+            "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
+        )
+        pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        )
+
+        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (768, 512, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+        expected_image = np.array([0.1963, 0.229, 0.2659, 0.2109, 0.2332, 0.2827, 0.2534, 0.2422, 0.2808])
+        assert np.allclose(original_image, expected_image, atol=1e-04)
+
+    def test_depth(self):
+        controlnet = ControlNetXSAdapter.from_pretrained(
+            "UmerHA/Testing-ConrolNetXS-SD2.1-depth", torch_dtype=torch.float16
+        )
+        pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "Stormtrooper's lecture"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
+        )
+
+        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        original_image = image[-3:, -3:, -1].flatten()
+        expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941])
+        assert np.allclose(original_image, expected_image, atol=1e-04)
+
+    @is_torch_compile
+    @require_torch_2
+    def test_stable_diffusion_compile(self):
+        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
diff --git a/diffusers/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py b/diffusers/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..c940504d6c3ec269391594a43a52a165e7c47e39
--- /dev/null
+++ b/diffusers/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py
@@ -0,0 +1,425 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+    AsymmetricAutoencoderKL,
+    AutoencoderKL,
+    AutoencoderTiny,
+    ConsistencyDecoderVAE,
+    ControlNetXSAdapter,
+    EulerDiscreteScheduler,
+    StableDiffusionXLControlNetXSPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...models.autoencoders.test_models_vae import (
+    get_asym_autoencoder_kl_config,
+    get_autoencoder_kl_config,
+    get_autoencoder_tiny_config,
+    get_consistency_vae_config,
+)
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLControlNetXSPipelineFastTests(
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionXLControlNetXSPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    test_attention_slicing = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=16,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            use_linear_projection=True,
+            norm_num_groups=4,
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=56,  # 6 * 8 (addition_time_embed_dim) + 8 (cross_attention_dim)
+            cross_attention_dim=8,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetXSAdapter.from_unet(
+            unet=unet,
+            size_ratio=0.5,
+            learn_time_embedding=True,
+            conditioning_embedding_out_channels=(2, 2),
+        )
+        torch.manual_seed(0)
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=4,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=8,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "feature_extractor": None,
+        }
+        return components
+
+    # Copied from test_controlnet_sdxl.py
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+        image = randn_tensor(
+            (1, 3, 8 * controlnet_embedder_scale_factor, 8 * controlnet_embedder_scale_factor),
+            generator=generator,
+            device=torch.device(device),
+        )
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "image": image,
+        }
+
+        return inputs
+
+    # Copied from test_controlnet_sdxl.py
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    # Copied from test_controlnet_sdxl.py
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    # Copied from test_controlnet_sdxl.py
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    @require_torch_gpu
+    # Copied from test_controlnet_sdxl.py
+    def test_stable_diffusion_xl_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            pipe.unet.set_default_attn_processor()
+
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    # Copied from test_controlnet_sdxl.py
+    def test_stable_diffusion_xl_multi_prompts(self):
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+
+        # forward with single prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = inputs["prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "different prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+        # manually set a negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same negative_prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = inputs["negative_prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = "different negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+    # Copied from test_stable_diffusion_xl.py
+    def test_stable_diffusion_xl_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = 2 * [inputs["prompt"]]
+        inputs["num_images_per_prompt"] = 2
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 2 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = sd_pipe.encode_prompt(prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1.1e-4
+
+    # Copied from test_stable_diffusion_xl.py
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    # Copied from test_controlnetxs.py
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the dtype from pipe.components
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_multi_vae(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        block_out_channels = pipe.vae.config.block_out_channels
+        norm_num_groups = pipe.vae.config.norm_num_groups
+
+        vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny]
+        configs = [
+            get_autoencoder_kl_config(block_out_channels, norm_num_groups),
+            get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups),
+            get_consistency_vae_config(block_out_channels, norm_num_groups),
+            get_autoencoder_tiny_config(block_out_channels),
+        ]
+
+        out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
+
+        for vae_cls, config in zip(vae_classes, configs):
+            vae = vae_cls(**config)
+            vae = vae.to(torch_device)
+            components["vae"] = vae
+            vae_pipe = self.pipeline_class(**components)
+
+            # pipeline creates a new UNetControlNetXSModel under the hood, which aren't on device.
+            # So we need to move the new pipe to device.
+            vae_pipe.to(torch_device)
+            vae_pipe.set_progress_bar_config(disable=None)
+
+            out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
+
+            assert out_vae_np.shape == out_np.shape
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase):
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_canny(self):
+        controlnet = ControlNetXSAdapter.from_pretrained(
+            "UmerHA/Testing-ConrolNetXS-SDXL-canny", torch_dtype=torch.float16
+        )
+        pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_sequential_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        )
+
+        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
+
+        assert images[0].shape == (768, 512, 3)
+
+        original_image = images[0, -3:, -3:, -1].flatten()
+        expected_image = np.array([0.3202, 0.3151, 0.3328, 0.3172, 0.337, 0.3381, 0.3378, 0.3389, 0.3224])
+        assert np.allclose(original_image, expected_image, atol=1e-04)
+
+    def test_depth(self):
+        controlnet = ControlNetXSAdapter.from_pretrained(
+            "UmerHA/Testing-ConrolNetXS-SDXL-depth", torch_dtype=torch.float16
+        )
+        pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.enable_sequential_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "Stormtrooper's lecture"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
+        )
+
+        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
+
+        assert images[0].shape == (512, 512, 3)
+
+        original_image = images[0, -3:, -3:, -1].flatten()
+        expected_image = np.array([0.5448, 0.5437, 0.5426, 0.5543, 0.553, 0.5475, 0.5595, 0.5602, 0.5529])
+        assert np.allclose(original_image, expected_image, atol=1e-04)
diff --git a/diffusers/tests/pipelines/dance_diffusion/__init__.py b/diffusers/tests/pipelines/dance_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f60c0b421f3fce737a2ee3aaab270b2409396e9
--- /dev/null
+++ b/diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -0,0 +1,167 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, skip_mps, torch_device
+
+from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = DanceDiffusionPipeline
+    params = UNCONDITIONAL_AUDIO_GENERATION_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {
+        "callback",
+        "latents",
+        "callback_steps",
+        "output_type",
+        "num_images_per_prompt",
+    }
+    batch_params = UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS
+    test_attention_slicing = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet1DModel(
+            block_out_channels=(32, 32, 64),
+            extra_in_channels=16,
+            sample_size=512,
+            sample_rate=16_000,
+            in_channels=2,
+            out_channels=2,
+            flip_sin_to_cos=True,
+            use_timestep_embedding=False,
+            time_embedding_type="fourier",
+            mid_block_type="UNetMidBlock1D",
+            down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
+            up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
+        )
+        scheduler = IPNDMScheduler()
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "batch_size": 1,
+            "generator": generator,
+            "num_inference_steps": 4,
+        }
+        return inputs
+
+    def test_dance_diffusion(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = DanceDiffusionPipeline(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = pipe(**inputs)
+        audio = output.audios
+
+        audio_slice = audio[0, -3:, -3:]
+
+        assert audio.shape == (1, 2, components["unet"].sample_size)
+        expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000])
+        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
+
+    @skip_mps
+    def test_save_load_local(self):
+        return super().test_save_load_local()
+
+    @skip_mps
+    def test_dict_tuple_outputs_equivalent(self):
+        return super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
+
+    @skip_mps
+    def test_save_load_optional_components(self):
+        return super().test_save_load_optional_components()
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        return super().test_attention_slicing_forward_pass()
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+
+@nightly
+@require_torch_gpu
+class PipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_dance_diffusion(self):
+        device = torch_device
+
+        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k")
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
+        audio = output.audios
+
+        audio_slice = audio[0, -3:, -3:]
+
+        assert audio.shape == (1, 2, pipe.unet.config.sample_size)
+        expected_slice = np.array([-0.0192, -0.0231, -0.0318, -0.0059, 0.0002, -0.0020])
+
+        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_dance_diffusion_fp16(self):
+        device = torch_device
+
+        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
+        audio = output.audios
+
+        audio_slice = audio[0, -3:, -3:]
+
+        assert audio.shape == (1, 2, pipe.unet.config.sample_size)
+        expected_slice = np.array([-0.0367, -0.0488, -0.0771, -0.0525, -0.0444, -0.0341])
+
+        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/ddim/__init__.py b/diffusers/tests/pipelines/ddim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/ddim/test_ddim.py b/diffusers/tests/pipelines/ddim/test_ddim.py
new file mode 100644
index 0000000000000000000000000000000000000000..2078a592ceca09e68cacdf1c44d76a0063c52c49
--- /dev/null
+++ b/diffusers/tests/pipelines/ddim/test_ddim.py
@@ -0,0 +1,142 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device
+
+from ..pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = DDIMPipeline
+    params = UNCONDITIONAL_IMAGE_GENERATION_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {
+        "num_images_per_prompt",
+        "latents",
+        "callback",
+        "callback_steps",
+    }
+    batch_params = UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DModel(
+            block_out_channels=(4, 8),
+            layers_per_block=1,
+            norm_num_groups=4,
+            sample_size=8,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        scheduler = DDIMScheduler()
+        components = {"unet": unet, "scheduler": scheduler}
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "batch_size": 1,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (1, 8, 8, 3))
+        expected_slice = np.array([0.0, 9.979e-01, 0.0, 9.999e-01, 9.986e-01, 9.991e-01, 7.106e-04, 0.0, 0.0])
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
+
+    def test_save_load_local(self):
+        super().test_save_load_local(expected_max_difference=3e-3)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+
+@slow
+@require_torch_gpu
+class DDIMPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_cifar10(self):
+        model_id = "google/ddpm-cifar10-32"
+
+        unet = UNet2DModel.from_pretrained(model_id)
+        scheduler = DDIMScheduler()
+
+        ddim = DDIMPipeline(unet=unet, scheduler=scheduler)
+        ddim.to(torch_device)
+        ddim.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = ddim(generator=generator, eta=0.0, output_type="np").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.1723, 0.1617, 0.1600, 0.1626, 0.1497, 0.1513, 0.1505, 0.1442, 0.1453])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_inference_ema_bedroom(self):
+        model_id = "google/ddpm-ema-bedroom-256"
+
+        unet = UNet2DModel.from_pretrained(model_id)
+        scheduler = DDIMScheduler.from_pretrained(model_id)
+
+        ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = ddpm(generator=generator, output_type="np").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.0060, 0.0201, 0.0344, 0.0024, 0.0018, 0.0002, 0.0022, 0.0000, 0.0069])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/ddpm/__init__.py b/diffusers/tests/pipelines/ddpm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/ddpm/test_ddpm.py b/diffusers/tests/pipelines/ddpm/test_ddpm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6d0821da4c221d1b178630fb20c29e192f116d1
--- /dev/null
+++ b/diffusers/tests/pipelines/ddpm/test_ddpm.py
@@ -0,0 +1,110 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device
+
+
+enable_full_determinism()
+
+
+class DDPMPipelineFastTests(unittest.TestCase):
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(4, 8),
+            layers_per_block=1,
+            norm_num_groups=4,
+            sample_size=8,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    def test_fast_inference(self):
+        device = "cpu"
+        unet = self.dummy_uncond_unet
+        scheduler = DDPMScheduler()
+
+        ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
+        ddpm.to(device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image = ddpm(generator=generator, num_inference_steps=2, output_type="np").images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = ddpm(generator=generator, num_inference_steps=2, output_type="np", return_dict=False)[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 8, 8, 3)
+        expected_slice = np.array([0.0, 0.9996672, 0.00329116, 1.0, 0.9995991, 1.0, 0.0060907, 0.00115037, 0.0])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_inference_predict_sample(self):
+        unet = self.dummy_uncond_unet
+        scheduler = DDPMScheduler(prediction_type="sample")
+
+        ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = ddpm(generator=generator, num_inference_steps=2, output_type="np").images
+
+        generator = torch.manual_seed(0)
+        image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="np")[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_eps_slice = image_eps[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 8, 8, 3)
+        tolerance = 1e-2 if torch_device != "mps" else 3e-2
+        assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance
+
+
+@slow
+@require_torch_gpu
+class DDPMPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_cifar10(self):
+        model_id = "google/ddpm-cifar10-32"
+
+        unet = UNet2DModel.from_pretrained(model_id)
+        scheduler = DDPMScheduler.from_pretrained(model_id)
+
+        ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = ddpm(generator=generator, output_type="np").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4200, 0.3588, 0.1939, 0.3847, 0.3382, 0.2647, 0.4155, 0.3582, 0.3385])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/deepfloyd_if/__init__.py b/diffusers/tests/pipelines/deepfloyd_if/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..094254a6187595da9f35378a15a68cd2d4aa29f7
--- /dev/null
+++ b/diffusers/tests/pipelines/deepfloyd_if/__init__.py
@@ -0,0 +1,272 @@
+import tempfile
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import DDPMScheduler, UNet2DConditionModel
+from diffusers.models.attention_processor import AttnAddedKVProcessor
+from diffusers.pipelines.deepfloyd_if import IFWatermarker
+from diffusers.utils.testing_utils import torch_device
+
+from ..test_pipelines_common import to_np
+
+
+# WARN: the hf-internal-testing/tiny-random-t5 text encoder has some non-determinism in the `save_load` tests.
+
+
+class IFPipelineTesterMixin:
+    def _get_dummy_components(self):
+        torch.manual_seed(0)
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            sample_size=32,
+            layers_per_block=1,
+            block_out_channels=[32, 64],
+            down_block_types=[
+                "ResnetDownsampleBlock2D",
+                "SimpleCrossAttnDownBlock2D",
+            ],
+            mid_block_type="UNetMidBlock2DSimpleCrossAttn",
+            up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"],
+            in_channels=3,
+            out_channels=6,
+            cross_attention_dim=32,
+            encoder_hid_dim=32,
+            attention_head_dim=8,
+            addition_embed_type="text",
+            addition_embed_type_num_heads=2,
+            cross_attention_norm="group_norm",
+            resnet_time_scale_shift="scale_shift",
+            act_fn="gelu",
+        )
+        unet.set_attn_processor(AttnAddedKVProcessor())  # For reproducibility tests
+
+        torch.manual_seed(0)
+        scheduler = DDPMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="squaredcos_cap_v2",
+            beta_start=0.0001,
+            beta_end=0.02,
+            thresholding=True,
+            dynamic_thresholding_ratio=0.95,
+            sample_max_value=1.0,
+            prediction_type="epsilon",
+            variance_type="learned_range",
+        )
+
+        torch.manual_seed(0)
+        watermarker = IFWatermarker()
+
+        return {
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "unet": unet,
+            "scheduler": scheduler,
+            "watermarker": watermarker,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+
+    def _get_superresolution_dummy_components(self):
+        torch.manual_seed(0)
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            sample_size=32,
+            layers_per_block=[1, 2],
+            block_out_channels=[32, 64],
+            down_block_types=[
+                "ResnetDownsampleBlock2D",
+                "SimpleCrossAttnDownBlock2D",
+            ],
+            mid_block_type="UNetMidBlock2DSimpleCrossAttn",
+            up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"],
+            in_channels=6,
+            out_channels=6,
+            cross_attention_dim=32,
+            encoder_hid_dim=32,
+            attention_head_dim=8,
+            addition_embed_type="text",
+            addition_embed_type_num_heads=2,
+            cross_attention_norm="group_norm",
+            resnet_time_scale_shift="scale_shift",
+            act_fn="gelu",
+            class_embed_type="timestep",
+            mid_block_scale_factor=1.414,
+            time_embedding_act_fn="gelu",
+            time_embedding_dim=32,
+        )
+        unet.set_attn_processor(AttnAddedKVProcessor())  # For reproducibility tests
+
+        torch.manual_seed(0)
+        scheduler = DDPMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="squaredcos_cap_v2",
+            beta_start=0.0001,
+            beta_end=0.02,
+            thresholding=True,
+            dynamic_thresholding_ratio=0.95,
+            sample_max_value=1.0,
+            prediction_type="epsilon",
+            variance_type="learned_range",
+        )
+
+        torch.manual_seed(0)
+        image_noising_scheduler = DDPMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="squaredcos_cap_v2",
+            beta_start=0.0001,
+            beta_end=0.02,
+        )
+
+        torch.manual_seed(0)
+        watermarker = IFWatermarker()
+
+        return {
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "unet": unet,
+            "scheduler": scheduler,
+            "image_noising_scheduler": image_noising_scheduler,
+            "watermarker": watermarker,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+
+    # this test is modified from the base class because if pipelines set the text encoder
+    # as optional with the intention that the user is allowed to encode the prompt once
+    # and then pass the embeddings directly to the pipeline. The base class test uses
+    # the unmodified arguments from `self.get_dummy_inputs` which will pass the unencoded
+    # prompt to the pipeline when the text encoder is set to None, throwing an error.
+    # So we make the test reflect the intended usage of setting the text encoder to None.
+    def _test_save_load_optional_components(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = inputs["prompt"]
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        if "image" in inputs:
+            image = inputs["image"]
+        else:
+            image = None
+
+        if "mask_image" in inputs:
+            mask_image = inputs["mask_image"]
+        else:
+            mask_image = None
+
+        if "original_image" in inputs:
+            original_image = inputs["original_image"]
+        else:
+            original_image = None
+
+        prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(prompt)
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+        }
+
+        if image is not None:
+            inputs["image"] = image
+
+        if mask_image is not None:
+            inputs["mask_image"] = mask_image
+
+        if original_image is not None:
+            inputs["original_image"] = original_image
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor())  # For reproducibility tests
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+        }
+
+        if image is not None:
+            inputs["image"] = image
+
+        if mask_image is not None:
+            inputs["mask_image"] = mask_image
+
+        if original_image is not None:
+            inputs["original_image"] = original_image
+
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, 1e-4)
+
+    # Modified from `PipelineTesterMixin` to set the attn processor as it's not serialized.
+    # This should be handled in the base test and then this method can be removed.
+    def _test_save_load_local(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor())  # For reproducibility tests
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, 1e-4)
diff --git a/diffusers/tests/pipelines/deepfloyd_if/test_if.py b/diffusers/tests/pipelines/deepfloyd_if/test_if.py
new file mode 100644
index 0000000000000000000000000000000000000000..0818665ea113276a058eac810f65f89bfd17bbee
--- /dev/null
+++ b/diffusers/tests/pipelines/deepfloyd_if/test_if.py
@@ -0,0 +1,126 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import torch
+
+from diffusers import (
+    IFPipeline,
+)
+from diffusers.models.attention_processor import AttnAddedKVProcessor
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, skip_mps, slow, torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+from . import IFPipelineTesterMixin
+
+
+@skip_mps
+class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
+    pipeline_class = IFPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"width", "height", "latents"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+
+    def get_dummy_components(self):
+        return self._get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_save_load_float16(self):
+        # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
+        super().test_save_load_float16(expected_max_diff=1e-1)
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
+
+    def test_save_load_local(self):
+        self._test_save_load_local()
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            expected_max_diff=1e-2,
+        )
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
+
+@slow
+@require_torch_gpu
+class IFPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_if_text_to_image(self):
+        pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
+        pipe.unet.set_attn_processor(AttnAddedKVProcessor())
+        pipe.enable_model_cpu_offload()
+
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe(
+            prompt="anime turtle",
+            num_inference_steps=2,
+            generator=generator,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes < 12 * 10**9
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if.npy"
+        )
+        assert_mean_pixel_difference(image, expected_image)
+        pipe.remove_all_hooks()
diff --git a/diffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py b/diffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..b71cb05e50aecc03abd11e5ec841873cb4158722
--- /dev/null
+++ b/diffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import torch
+
+from diffusers import IFImg2ImgPipeline
+from diffusers.models.attention_processor import AttnAddedKVProcessor
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+from . import IFPipelineTesterMixin
+
+
+@skip_mps
+class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
+    pipeline_class = IFImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+
+    def get_dummy_components(self):
+        return self._get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_save_load_float16(self):
+        # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
+        super().test_save_load_float16(expected_max_diff=1e-1)
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
+
+    def test_save_load_local(self):
+        self._test_save_load_local()
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            expected_max_diff=1e-2,
+        )
+
+
+@slow
+@require_torch_gpu
+class IFImg2ImgPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_if_img2img(self):
+        pipe = IFImg2ImgPipeline.from_pretrained(
+            "DeepFloyd/IF-I-L-v1.0",
+            variant="fp16",
+            torch_dtype=torch.float16,
+        )
+        pipe.unet.set_attn_processor(AttnAddedKVProcessor())
+        pipe.enable_model_cpu_offload()
+
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe(
+            prompt="anime turtle",
+            image=image,
+            num_inference_steps=2,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes < 12 * 10**9
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img.npy"
+        )
+        assert_mean_pixel_difference(image, expected_image)
+
+        pipe.remove_all_hooks()
diff --git a/diffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/diffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc0cf9826b6227c1304ea8280817df97c5868609
--- /dev/null
+++ b/diffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
@@ -0,0 +1,146 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import torch
+
+from diffusers import IFImg2ImgSuperResolutionPipeline
+from diffusers.models.attention_processor import AttnAddedKVProcessor
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+from . import IFPipelineTesterMixin
+
+
+@skip_mps
+class IFImg2ImgSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
+    pipeline_class = IFImg2ImgSuperResolutionPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"original_image"})
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+
+    def get_dummy_components(self):
+        return self._get_superresolution_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        original_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = floats_tensor((1, 3, 16, 16), rng=random.Random(seed)).to(device)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "original_image": original_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_save_load_float16(self):
+        # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
+        super().test_save_load_float16(expected_max_diff=1e-1)
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
+
+    def test_save_load_local(self):
+        self._test_save_load_local()
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            expected_max_diff=1e-2,
+        )
+
+
+@slow
+@require_torch_gpu
+class IFImg2ImgSuperResolutionPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_if_img2img_superresolution(self):
+        pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained(
+            "DeepFloyd/IF-II-L-v1.0",
+            variant="fp16",
+            torch_dtype=torch.float16,
+        )
+        pipe.unet.set_attn_processor(AttnAddedKVProcessor())
+        pipe.enable_model_cpu_offload()
+
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device)
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
+
+        output = pipe(
+            prompt="anime turtle",
+            image=image,
+            original_image=original_image,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (256, 256, 3)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes < 12 * 10**9
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img_superresolution_stage_II.npy"
+        )
+        assert_mean_pixel_difference(image, expected_image)
+
+        pipe.remove_all_hooks()
diff --git a/diffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/diffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py
new file mode 100644
index 0000000000000000000000000000000000000000..df0cecd8c3072f0aec98bc253c427cca0230ce6b
--- /dev/null
+++ b/diffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py
@@ -0,0 +1,139 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import torch
+
+from diffusers import IFInpaintingPipeline
+from diffusers.models.attention_processor import AttnAddedKVProcessor
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+from . import IFPipelineTesterMixin
+
+
+@skip_mps
+class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
+    pipeline_class = IFInpaintingPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"}
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+
+    def get_dummy_components(self):
+        return self._get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        mask_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_save_load_float16(self):
+        # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
+        super().test_save_load_float16(expected_max_diff=1e-1)
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
+
+    def test_save_load_local(self):
+        self._test_save_load_local()
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            expected_max_diff=1e-2,
+        )
+
+
+@slow
+@require_torch_gpu
+class IFInpaintingPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_if_inpainting(self):
+        pipe = IFInpaintingPipeline.from_pretrained(
+            "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
+        )
+        pipe.unet.set_attn_processor(AttnAddedKVProcessor())
+        pipe.enable_model_cpu_offload()
+
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
+        mask_image = floats_tensor((1, 3, 64, 64), rng=random.Random(1)).to(torch_device)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe(
+            prompt="anime prompts",
+            image=image,
+            mask_image=mask_image,
+            num_inference_steps=2,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes < 12 * 10**9
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting.npy"
+        )
+        assert_mean_pixel_difference(image, expected_image)
+        pipe.remove_all_hooks()
diff --git a/diffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/diffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e9f647732892319c76d3b24ea127676683d2ea3
--- /dev/null
+++ b/diffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
@@ -0,0 +1,149 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import torch
+
+from diffusers import IFInpaintingSuperResolutionPipeline
+from diffusers.models.attention_processor import AttnAddedKVProcessor
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+from . import IFPipelineTesterMixin
+
+
+@skip_mps
+class IFInpaintingSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
+    pipeline_class = IFInpaintingSuperResolutionPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"}
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union({"original_image"})
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+
+    def get_dummy_components(self):
+        return self._get_superresolution_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = floats_tensor((1, 3, 16, 16), rng=random.Random(seed)).to(device)
+        original_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        mask_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "original_image": original_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_save_load_float16(self):
+        # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
+        super().test_save_load_float16(expected_max_diff=1e-1)
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
+
+    def test_save_load_local(self):
+        self._test_save_load_local()
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            expected_max_diff=1e-2,
+        )
+
+
+@slow
+@require_torch_gpu
+class IFInpaintingSuperResolutionPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_if_inpainting_superresolution(self):
+        pipe = IFInpaintingSuperResolutionPipeline.from_pretrained(
+            "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16
+        )
+        pipe.unet.set_attn_processor(AttnAddedKVProcessor())
+        pipe.enable_model_cpu_offload()
+
+        # Super resolution test
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
+        original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device)
+        mask_image = floats_tensor((1, 3, 256, 256), rng=random.Random(1)).to(torch_device)
+
+        output = pipe(
+            prompt="anime turtle",
+            image=image,
+            original_image=original_image,
+            mask_image=mask_image,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (256, 256, 3)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes < 12 * 10**9
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting_superresolution_stage_II.npy"
+        )
+        assert_mean_pixel_difference(image, expected_image)
+
+        pipe.remove_all_hooks()
diff --git a/diffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/diffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e3c8c6e0e152218e86c8a52d3409b7f7a30a319
--- /dev/null
+++ b/diffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py
@@ -0,0 +1,136 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import torch
+
+from diffusers import IFSuperResolutionPipeline
+from diffusers.models.attention_processor import AttnAddedKVProcessor
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device
+
+from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+from . import IFPipelineTesterMixin
+
+
+@skip_mps
+class IFSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
+    pipeline_class = IFSuperResolutionPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+
+    def get_dummy_components(self):
+        return self._get_superresolution_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_save_load_float16(self):
+        # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
+        super().test_save_load_float16(expected_max_diff=1e-1)
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
+
+    def test_save_load_local(self):
+        self._test_save_load_local()
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            expected_max_diff=1e-2,
+        )
+
+
+@slow
+@require_torch_gpu
+class IFSuperResolutionPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_if_superresolution(self):
+        pipe = IFSuperResolutionPipeline.from_pretrained(
+            "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16
+        )
+        pipe.unet.set_attn_processor(AttnAddedKVProcessor())
+        pipe.enable_model_cpu_offload()
+
+        # Super resolution test
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe(
+            prompt="anime turtle",
+            image=image,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (256, 256, 3)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes < 12 * 10**9
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_superresolution_stage_II.npy"
+        )
+        assert_mean_pixel_difference(image, expected_image)
+
+        pipe.remove_all_hooks()
diff --git a/diffusers/tests/pipelines/dit/__init__.py b/diffusers/tests/pipelines/dit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/dit/test_dit.py b/diffusers/tests/pipelines/dit/test_dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..30883ac4a63dac6c0f62131b0a04f93bef5aba8e
--- /dev/null
+++ b/diffusers/tests/pipelines/dit/test_dit.py
@@ -0,0 +1,156 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DiTTransformer2DModel, DPMSolverMultistepScheduler
+from diffusers.utils import is_xformers_available
+from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, nightly, require_torch_gpu, torch_device
+
+from ..pipeline_params import (
+    CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS,
+    CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = DiTPipeline
+    params = CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {
+        "latents",
+        "num_images_per_prompt",
+        "callback",
+        "callback_steps",
+    }
+    batch_params = CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = DiTTransformer2DModel(
+            sample_size=16,
+            num_layers=2,
+            patch_size=4,
+            attention_head_dim=8,
+            num_attention_heads=2,
+            in_channels=4,
+            out_channels=8,
+            attention_bias=True,
+            activation_fn="gelu-approximate",
+            num_embeds_ada_norm=1000,
+            norm_type="ada_norm_zero",
+            norm_elementwise_affine=False,
+        )
+        vae = AutoencoderKL()
+        scheduler = DDIMScheduler()
+        components = {"transformer": transformer.eval(), "vae": vae.eval(), "scheduler": scheduler}
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "class_labels": [1],
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (1, 16, 16, 3))
+        expected_slice = np.array([0.2946, 0.6601, 0.4329, 0.3296, 0.4144, 0.5319, 0.7273, 0.5013, 0.4457])
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
+
+@nightly
+@require_torch_gpu
+class DiTPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_dit_256(self):
+        generator = torch.manual_seed(0)
+
+        pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256")
+        pipe.to("cuda")
+
+        words = ["vase", "umbrella", "white shark", "white wolf"]
+        ids = pipe.get_label_ids(words)
+
+        images = pipe(ids, generator=generator, num_inference_steps=40, output_type="np").images
+
+        for word, image in zip(words, images):
+            expected_image = load_numpy(
+                f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy"
+            )
+            assert np.abs((expected_image - image).max()) < 1e-2
+
+    def test_dit_512(self):
+        pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512")
+        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+        pipe.to("cuda")
+
+        words = ["vase", "umbrella"]
+        ids = pipe.get_label_ids(words)
+
+        generator = torch.manual_seed(0)
+        images = pipe(ids, generator=generator, num_inference_steps=25, output_type="np").images
+
+        for word, image in zip(words, images):
+            expected_image = load_numpy(
+                "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+                f"/dit/{word}_512.npy"
+            )
+
+            assert np.abs((expected_image - image).max()) < 1e-1
diff --git a/diffusers/tests/pipelines/flux/__init__.py b/diffusers/tests/pipelines/flux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/flux/test_pipeline_flux.py b/diffusers/tests/pipelines/flux/test_pipeline_flux.py
new file mode 100644
index 0000000000000000000000000000000000000000..4caff40302616e982bf9bcb17c371b3bd1fa74c5
--- /dev/null
+++ b/diffusers/tests/pipelines/flux/test_pipeline_flux.py
@@ -0,0 +1,251 @@
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
+
+from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
+from diffusers.utils.testing_utils import (
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fusion_matches_attn_procs_length,
+    check_qkv_fusion_processors_exist,
+)
+
+
+class FluxPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = FluxPipeline
+    params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
+    batch_params = frozenset(["prompt"])
+
+    # there is no xformers processor for Flux
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = FluxTransformer2DModel(
+            patch_size=1,
+            in_channels=4,
+            num_layers=1,
+            num_single_layers=1,
+            attention_head_dim=16,
+            num_attention_heads=2,
+            joint_attention_dim=32,
+            pooled_projection_dim=32,
+            axes_dims_rope=[4, 4, 8],
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModel(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=1,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "transformer": transformer,
+            "vae": vae,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "max_sequence_length": 48,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_flux_different_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "a different prompt"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        # For some reasons, they don't show large differences
+        assert max_diff > 1e-6
+
+    def test_flux_prompt_embeds(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        output_with_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        (prompt_embeds, pooled_prompt_embeds, text_ids) = pipe.encode_prompt(
+            prompt,
+            prompt_2=None,
+            device=torch_device,
+            max_sequence_length=inputs["max_sequence_length"],
+        )
+        output_with_embeds = pipe(
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            **inputs,
+        ).images[0]
+
+        max_diff = np.abs(output_with_prompt - output_with_embeds).max()
+        assert max_diff < 1e-4
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
+        # to the pipeline level.
+        pipe.transformer.fuse_qkv_projections()
+        assert check_qkv_fusion_processors_exist(
+            pipe.transformer
+        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+        assert check_qkv_fusion_matches_attn_procs_length(
+            pipe.transformer, pipe.transformer.original_attn_processors
+        ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_fused = image[0, -3:, -3:, -1]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_disabled = image[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+
+@slow
+@require_torch_gpu
+class FluxPipelineSlowTests(unittest.TestCase):
+    pipeline_class = FluxPipeline
+    repo_id = "black-forest-labs/FLUX.1-schnell"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        return {
+            "prompt": "A photo of a cat",
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "generator": generator,
+        }
+
+    # TODO: Dhruv. Move large model tests to a dedicated runner)
+    @unittest.skip("We cannot run inference on this model with the current CI hardware")
+    def test_flux_inference(self):
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.bfloat16)
+        pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+
+        image = pipe(**inputs).images[0]
+        image_slice = image[0, :10, :10]
+        expected_slice = np.array(
+            [
+                [0.36132812, 0.30004883, 0.25830078],
+                [0.36669922, 0.31103516, 0.23754883],
+                [0.34814453, 0.29248047, 0.23583984],
+                [0.35791016, 0.30981445, 0.23999023],
+                [0.36328125, 0.31274414, 0.2607422],
+                [0.37304688, 0.32177734, 0.26171875],
+                [0.3671875, 0.31933594, 0.25756836],
+                [0.36035156, 0.31103516, 0.2578125],
+                [0.3857422, 0.33789062, 0.27563477],
+                [0.3701172, 0.31982422, 0.265625],
+            ],
+            dtype=np.float32,
+        )
+
+        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
+
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/flux/test_pipeline_flux_img2img.py b/diffusers/tests/pipelines/flux/test_pipeline_flux_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..a038b17258122289ed4c4fbd582829be9069cf45
--- /dev/null
+++ b/diffusers/tests/pipelines/flux/test_pipeline_flux_img2img.py
@@ -0,0 +1,149 @@
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
+
+from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxImg2ImgPipeline, FluxTransformer2DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class FluxImg2ImgPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = FluxImg2ImgPipeline
+    params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
+    batch_params = frozenset(["prompt"])
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = FluxTransformer2DModel(
+            patch_size=1,
+            in_channels=4,
+            num_layers=1,
+            num_single_layers=1,
+            attention_head_dim=16,
+            num_attention_heads=2,
+            joint_attention_dim=32,
+            pooled_projection_dim=32,
+            axes_dims_rope=[4, 4, 8],
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModel(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=1,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "transformer": transformer,
+            "vae": vae,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "max_sequence_length": 48,
+            "strength": 0.8,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_flux_different_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "a different prompt"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        # For some reasons, they don't show large differences
+        assert max_diff > 1e-6
+
+    def test_flux_prompt_embeds(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        output_with_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        (prompt_embeds, pooled_prompt_embeds, text_ids) = pipe.encode_prompt(
+            prompt,
+            prompt_2=None,
+            device=torch_device,
+            max_sequence_length=inputs["max_sequence_length"],
+        )
+        output_with_embeds = pipe(
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            **inputs,
+        ).images[0]
+
+        max_diff = np.abs(output_with_prompt - output_with_embeds).max()
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/flux/test_pipeline_flux_inpaint.py b/diffusers/tests/pipelines/flux/test_pipeline_flux_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac2eb1fa261b1396a0d115f94c832d2a0b83cad8
--- /dev/null
+++ b/diffusers/tests/pipelines/flux/test_pipeline_flux_inpaint.py
@@ -0,0 +1,151 @@
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
+
+from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxInpaintPipeline, FluxTransformer2DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class FluxInpaintPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = FluxInpaintPipeline
+    params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
+    batch_params = frozenset(["prompt"])
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = FluxTransformer2DModel(
+            patch_size=1,
+            in_channels=8,
+            num_layers=1,
+            num_single_layers=1,
+            attention_head_dim=16,
+            num_attention_heads=2,
+            joint_attention_dim=32,
+            pooled_projection_dim=32,
+            axes_dims_rope=[4, 4, 8],
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModel(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=2,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "transformer": transformer,
+            "vae": vae,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        mask_image = torch.ones((1, 1, 32, 32)).to(device)
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 32,
+            "width": 32,
+            "max_sequence_length": 48,
+            "strength": 0.8,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_flux_inpaint_different_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "a different prompt"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        # For some reasons, they don't show large differences
+        assert max_diff > 1e-6
+
+    def test_flux_inpaint_prompt_embeds(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        output_with_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        (prompt_embeds, pooled_prompt_embeds, text_ids) = pipe.encode_prompt(
+            prompt,
+            prompt_2=None,
+            device=torch_device,
+            max_sequence_length=inputs["max_sequence_length"],
+        )
+        output_with_embeds = pipe(
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            **inputs,
+        ).images[0]
+
+        max_diff = np.abs(output_with_prompt - output_with_embeds).max()
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/hunyuan_dit/__init__.py b/diffusers/tests/pipelines/hunyuan_dit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py b/diffusers/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..653cb41e4bc4f58806955d1374dd7de2d8b4b9c8
--- /dev/null
+++ b/diffusers/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py
@@ -0,0 +1,335 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, BertModel, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    HunyuanDiT2DModel,
+    HunyuanDiTPipeline,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fusion_matches_attn_procs_length,
+    check_qkv_fusion_processors_exist,
+    to_np,
+)
+
+
+enable_full_determinism()
+
+
+class HunyuanDiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = HunyuanDiTPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    required_optional_params = PipelineTesterMixin.required_optional_params
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = HunyuanDiT2DModel(
+            sample_size=16,
+            num_layers=2,
+            patch_size=2,
+            attention_head_dim=8,
+            num_attention_heads=3,
+            in_channels=4,
+            cross_attention_dim=32,
+            cross_attention_dim_t5=32,
+            pooled_projection_dim=16,
+            hidden_size=24,
+            activation_fn="gelu-approximate",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL()
+
+        scheduler = DDPMScheduler()
+        text_encoder = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "use_resolution_binning": False,
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (1, 16, 16, 3))
+        expected_slice = np.array(
+            [0.56939435, 0.34541583, 0.35915792, 0.46489206, 0.38775963, 0.45004836, 0.5957267, 0.59481275, 0.33287364]
+        )
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        # TODO(YiYi) need to fix later
+        pass
+
+    def test_sequential_offload_forward_pass_twice(self):
+        # TODO(YiYi) need to fix later
+        pass
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            expected_max_diff=1e-3,
+        )
+
+    def test_save_load_optional_components(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = inputs["prompt"]
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        ) = pipe.encode_prompt(prompt, device=torch_device, dtype=torch.float32, text_encoder_index=0)
+
+        (
+            prompt_embeds_2,
+            negative_prompt_embeds_2,
+            prompt_attention_mask_2,
+            negative_prompt_attention_mask_2,
+        ) = pipe.encode_prompt(
+            prompt,
+            device=torch_device,
+            dtype=torch.float32,
+            text_encoder_index=1,
+        )
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attention_mask,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": negative_prompt_attention_mask,
+            "prompt_embeds_2": prompt_embeds_2,
+            "prompt_attention_mask_2": prompt_attention_mask_2,
+            "negative_prompt_embeds_2": negative_prompt_embeds_2,
+            "negative_prompt_attention_mask_2": negative_prompt_attention_mask_2,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "use_resolution_binning": False,
+        }
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attention_mask,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": negative_prompt_attention_mask,
+            "prompt_embeds_2": prompt_embeds_2,
+            "prompt_attention_mask_2": prompt_attention_mask_2,
+            "negative_prompt_embeds_2": negative_prompt_embeds_2,
+            "negative_prompt_attention_mask_2": negative_prompt_attention_mask_2,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "use_resolution_binning": False,
+        }
+
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, 1e-4)
+
+    def test_feed_forward_chunking(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_no_chunking = image[0, -3:, -3:, -1]
+
+        pipe.transformer.enable_forward_chunking(chunk_size=1, dim=0)
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_chunking = image[0, -3:, -3:, -1]
+
+        max_diff = np.abs(to_np(image_slice_no_chunking) - to_np(image_slice_chunking)).max()
+        self.assertLess(max_diff, 1e-4)
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["return_dict"] = False
+        image = pipe(**inputs)[0]
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        pipe.transformer.fuse_qkv_projections()
+        # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
+        # to the pipeline level.
+        pipe.transformer.fuse_qkv_projections()
+        assert check_qkv_fusion_processors_exist(
+            pipe.transformer
+        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+        assert check_qkv_fusion_matches_attn_procs_length(
+            pipe.transformer, pipe.transformer.original_attn_processors
+        ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["return_dict"] = False
+        image_fused = pipe(**inputs)[0]
+        image_slice_fused = image_fused[0, -3:, -3:, -1]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        inputs["return_dict"] = False
+        image_disabled = pipe(**inputs)[0]
+        image_slice_disabled = image_disabled[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+
+@slow
+@require_torch_gpu
+class HunyuanDiTPipelineIntegrationTests(unittest.TestCase):
+    prompt = "一个宇航员在骑马"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_hunyuan_dit_1024(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = HunyuanDiTPipeline.from_pretrained(
+            "XCLiu/HunyuanDiT-0523", revision="refs/pr/2", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        prompt = self.prompt
+
+        image = pipe(
+            prompt=prompt, height=1024, width=1024, generator=generator, num_inference_steps=2, output_type="np"
+        ).images
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array(
+            [0.48388672, 0.33789062, 0.30737305, 0.47875977, 0.25097656, 0.30029297, 0.4440918, 0.26953125, 0.30078125]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
+        assert max_diff < 1e-3, f"Max diff is too high. got {image_slice.flatten()}"
diff --git a/diffusers/tests/pipelines/i2vgen_xl/__init__.py b/diffusers/tests/pipelines/i2vgen_xl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/diffusers/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..592ebd35f4a940ef16bf696d4e02dd94451f22f9
--- /dev/null
+++ b/diffusers/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
@@ -0,0 +1,268 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    I2VGenXLPipeline,
+)
+from diffusers.models.unets import I2VGenXLUNet
+from diffusers.utils import is_xformers_available, load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin
+
+
+enable_full_determinism()
+
+
+@skip_mps
+class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = I2VGenXLPipeline
+    params = frozenset(["prompt", "negative_prompt", "image"])
+    batch_params = frozenset(["prompt", "negative_prompt", "image", "generator"])
+    # No `output_type`.
+    required_optional_params = frozenset(["num_inference_steps", "generator", "latents", "return_dict"])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+
+        torch.manual_seed(0)
+        unet = I2VGenXLUNet(
+            block_out_channels=(4, 8),
+            layers_per_block=1,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
+            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
+            cross_attention_dim=4,
+            attention_head_dim=4,
+            num_attention_heads=None,
+            norm_num_groups=2,
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=(8,),
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=32,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=4,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+        vision_encoder_config = CLIPVisionConfig(
+            hidden_size=4,
+            projection_dim=4,
+            num_hidden_layers=2,
+            num_attention_heads=2,
+            image_size=32,
+            intermediate_size=16,
+            patch_size=1,
+        )
+        image_encoder = CLIPVisionModelWithProjection(vision_encoder_config)
+
+        torch.manual_seed(0)
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "image_encoder": image_encoder,
+            "tokenizer": tokenizer,
+            "feature_extractor": feature_extractor,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": input_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "pt",
+            "num_frames": 4,
+            "width": 32,
+            "height": 32,
+        }
+        return inputs
+
+    def test_text_to_video_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["output_type"] = "np"
+        frames = pipe(**inputs).frames
+
+        image_slice = frames[0][0][-3:, -3:, -1]
+
+        assert frames[0][0].shape == (32, 32, 3)
+        expected_slice = np.array([0.5146, 0.6525, 0.6032, 0.5204, 0.5675, 0.4125, 0.3016, 0.5172, 0.4095])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_save_load_local(self):
+        super().test_save_load_local(expected_max_difference=0.006)
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=0.008)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=0.008)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=0.008)
+
+    @unittest.skip("Deprecated functionality")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=0.008)
+
+    def test_model_cpu_offload_forward_pass(self):
+        super().test_model_cpu_offload_forward_pass(expected_max_diff=0.008)
+
+    def test_num_videos_per_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["output_type"] = "np"
+        frames = pipe(**inputs, num_videos_per_prompt=2).frames
+
+        assert frames.shape == (2, 4, 32, 32, 3)
+        assert frames[0][0].shape == (32, 32, 3)
+
+        image_slice = frames[0][0][-3:, -3:, -1]
+        expected_slice = np.array([0.5146, 0.6525, 0.6032, 0.5204, 0.5675, 0.4125, 0.3016, 0.5172, 0.4095])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@slow
+@require_torch_gpu
+class I2VGenXLPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_i2vgen_xl(self):
+        pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
+        )
+
+        generator = torch.Generator("cpu").manual_seed(0)
+        num_frames = 3
+
+        output = pipe(
+            image=image,
+            prompt="my cat",
+            num_frames=num_frames,
+            generator=generator,
+            num_inference_steps=3,
+            output_type="np",
+        )
+
+        image = output.frames[0]
+        assert image.shape == (num_frames, 704, 1280, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.5482, 0.6244, 0.6274, 0.4584, 0.5935, 0.5937, 0.4579, 0.5767, 0.5892])
+        assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3
diff --git a/diffusers/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/diffusers/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8180a3bc27f651809374829f3c6d5b77479e0e8
--- /dev/null
+++ b/diffusers/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py
@@ -0,0 +1,698 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    StableDiffusionImg2ImgPipeline,
+    StableDiffusionInpaintPipeline,
+    StableDiffusionPipeline,
+    StableDiffusionXLImg2ImgPipeline,
+    StableDiffusionXLInpaintPipeline,
+    StableDiffusionXLPipeline,
+)
+from diffusers.image_processor import IPAdapterMaskProcessor
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    is_flaky,
+    load_pt,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+
+enable_full_determinism()
+
+
+class IPAdapterNightlyTestsMixin(unittest.TestCase):
+    dtype = torch.float16
+
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_image_encoder(self, repo_id, subfolder):
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            repo_id, subfolder=subfolder, torch_dtype=self.dtype
+        ).to(torch_device)
+        return image_encoder
+
+    def get_image_processor(self, repo_id):
+        image_processor = CLIPImageProcessor.from_pretrained(repo_id)
+        return image_processor
+
+    def get_dummy_inputs(
+        self, for_image_to_image=False, for_inpainting=False, for_sdxl=False, for_masks=False, for_instant_style=False
+    ):
+        image = load_image(
+            "https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png"
+        )
+        if for_sdxl:
+            image = image.resize((1024, 1024))
+
+        input_kwargs = {
+            "prompt": "best quality, high quality",
+            "negative_prompt": "monochrome, lowres, bad anatomy, worst quality, low quality",
+            "num_inference_steps": 5,
+            "generator": torch.Generator(device="cpu").manual_seed(33),
+            "ip_adapter_image": image,
+            "output_type": "np",
+        }
+        if for_image_to_image:
+            image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg")
+            ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png")
+
+            if for_sdxl:
+                image = image.resize((1024, 1024))
+                ip_image = ip_image.resize((1024, 1024))
+
+            input_kwargs.update({"image": image, "ip_adapter_image": ip_image})
+
+        elif for_inpainting:
+            image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/inpaint_image.png")
+            mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/mask.png")
+            ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/girl.png")
+
+            if for_sdxl:
+                image = image.resize((1024, 1024))
+                mask = mask.resize((1024, 1024))
+                ip_image = ip_image.resize((1024, 1024))
+
+            input_kwargs.update({"image": image, "mask_image": mask, "ip_adapter_image": ip_image})
+
+        elif for_masks:
+            face_image1 = load_image(
+                "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl1.png"
+            )
+            face_image2 = load_image(
+                "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl2.png"
+            )
+            mask1 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask1.png")
+            mask2 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask2.png")
+            input_kwargs.update(
+                {
+                    "ip_adapter_image": [[face_image1], [face_image2]],
+                    "cross_attention_kwargs": {"ip_adapter_masks": [mask1, mask2]},
+                }
+            )
+
+        elif for_instant_style:
+            composition_mask = load_image(
+                "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/1024_whole_mask.png"
+            )
+            female_mask = load_image(
+                "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter_None_20240321125641_mask.png"
+            )
+            male_mask = load_image(
+                "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter_None_20240321125344_mask.png"
+            )
+            background_mask = load_image(
+                "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter_6_20240321130722_mask.png"
+            )
+            ip_composition_image = load_image(
+                "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321125152.png"
+            )
+            ip_female_style = load_image(
+                "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321125625.png"
+            )
+            ip_male_style = load_image(
+                "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321125329.png"
+            )
+            ip_background = load_image(
+                "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321130643.png"
+            )
+            input_kwargs.update(
+                {
+                    "ip_adapter_image": [ip_composition_image, [ip_female_style, ip_male_style, ip_background]],
+                    "cross_attention_kwargs": {
+                        "ip_adapter_masks": [[composition_mask], [female_mask, male_mask, background_mask]]
+                    },
+                }
+            )
+
+        return input_kwargs
+
+
+@slow
+@require_torch_gpu
+class IPAdapterSDIntegrationTests(IPAdapterNightlyTestsMixin):
+    def test_text_to_image(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            image_encoder=image_encoder,
+            safety_checker=None,
+            torch_dtype=self.dtype,
+        )
+        pipeline.to(torch_device)
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+
+        inputs = self.get_dummy_inputs()
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array([0.80810547, 0.88183594, 0.9296875, 0.9189453, 0.9848633, 1.0, 0.97021484, 1.0, 1.0])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.bin")
+
+        inputs = self.get_dummy_inputs()
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array(
+            [0.30444336, 0.26513672, 0.22436523, 0.2758789, 0.25585938, 0.20751953, 0.25390625, 0.24633789, 0.21923828]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+    def test_image_to_image(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        pipeline = StableDiffusionImg2ImgPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            image_encoder=image_encoder,
+            safety_checker=None,
+            torch_dtype=self.dtype,
+        )
+        pipeline.to(torch_device)
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+
+        inputs = self.get_dummy_inputs(for_image_to_image=True)
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array(
+            [0.22167969, 0.21875, 0.21728516, 0.22607422, 0.21948242, 0.23925781, 0.22387695, 0.25268555, 0.2722168]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.bin")
+
+        inputs = self.get_dummy_inputs(for_image_to_image=True)
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array(
+            [0.35913086, 0.265625, 0.26367188, 0.24658203, 0.19750977, 0.39990234, 0.15258789, 0.20336914, 0.5517578]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+    def test_inpainting(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        pipeline = StableDiffusionInpaintPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            image_encoder=image_encoder,
+            safety_checker=None,
+            torch_dtype=self.dtype,
+        )
+        pipeline.to(torch_device)
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+
+        inputs = self.get_dummy_inputs(for_inpainting=True)
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array(
+            [0.27148438, 0.24047852, 0.22167969, 0.23217773, 0.21118164, 0.21142578, 0.21875, 0.20751953, 0.20019531]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.bin")
+
+        inputs = self.get_dummy_inputs(for_inpainting=True)
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+    def test_text_to_image_model_cpu_offload(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            image_encoder=image_encoder,
+            safety_checker=None,
+            torch_dtype=self.dtype,
+        )
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+        pipeline.to(torch_device)
+
+        inputs = self.get_dummy_inputs()
+        output_without_offload = pipeline(**inputs).images
+
+        pipeline.enable_model_cpu_offload()
+        inputs = self.get_dummy_inputs()
+        output_with_offload = pipeline(**inputs).images
+        max_diff = np.abs(output_with_offload - output_without_offload).max()
+        self.assertLess(max_diff, 1e-3, "CPU offloading should not affect the inference results")
+
+        offloaded_modules = [
+            v
+            for k, v in pipeline.components.items()
+            if isinstance(v, torch.nn.Module) and k not in pipeline._exclude_from_cpu_offload
+        ]
+        (
+            self.assertTrue(all(v.device.type == "cpu" for v in offloaded_modules)),
+            f"Not offloaded: {[v for v in offloaded_modules if v.device.type != 'cpu']}",
+        )
+
+    def test_text_to_image_full_face(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            image_encoder=image_encoder,
+            safety_checker=None,
+            torch_dtype=self.dtype,
+        )
+        pipeline.to(torch_device)
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin")
+        pipeline.set_ip_adapter_scale(0.7)
+
+        inputs = self.get_dummy_inputs()
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+        expected_slice = np.array([0.1704, 0.1296, 0.1272, 0.2212, 0.1514, 0.1479, 0.4172, 0.4263, 0.4360])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+    def test_unload(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            image_encoder=image_encoder,
+            safety_checker=None,
+            torch_dtype=self.dtype,
+        )
+        before_processors = [attn_proc.__class__ for attn_proc in pipeline.unet.attn_processors.values()]
+        pipeline.to(torch_device)
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+        pipeline.set_ip_adapter_scale(0.7)
+
+        pipeline.unload_ip_adapter()
+
+        assert getattr(pipeline, "image_encoder") is None
+        assert getattr(pipeline, "feature_extractor") is not None
+        after_processors = [attn_proc.__class__ for attn_proc in pipeline.unet.attn_processors.values()]
+
+        assert before_processors == after_processors
+
+    @is_flaky
+    def test_multi(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            image_encoder=image_encoder,
+            safety_checker=None,
+            torch_dtype=self.dtype,
+        )
+        pipeline.to(torch_device)
+        pipeline.load_ip_adapter(
+            "h94/IP-Adapter", subfolder="models", weight_name=["ip-adapter_sd15.bin", "ip-adapter-plus_sd15.bin"]
+        )
+        pipeline.set_ip_adapter_scale([0.7, 0.3])
+
+        inputs = self.get_dummy_inputs()
+        ip_adapter_image = inputs["ip_adapter_image"]
+        inputs["ip_adapter_image"] = [ip_adapter_image, [ip_adapter_image] * 2]
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+        expected_slice = np.array([0.5234, 0.5352, 0.5625, 0.5713, 0.5947, 0.6206, 0.5786, 0.6187, 0.6494])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+    def test_text_to_image_face_id(self):
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, torch_dtype=self.dtype
+        )
+        pipeline.to(torch_device)
+        pipeline.load_ip_adapter(
+            "h94/IP-Adapter-FaceID",
+            subfolder=None,
+            weight_name="ip-adapter-faceid_sd15.bin",
+            image_encoder_folder=None,
+        )
+        pipeline.set_ip_adapter_scale(0.7)
+
+        inputs = self.get_dummy_inputs()
+        id_embeds = load_pt("https://huggingface.co/datasets/fabiorigano/testing-images/resolve/main/ai_face2.ipadpt")[
+            0
+        ]
+        id_embeds = id_embeds.reshape((2, 1, 1, 512))
+        inputs["ip_adapter_image_embeds"] = [id_embeds]
+        inputs["ip_adapter_image"] = None
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array([0.3237, 0.3186, 0.3406, 0.3154, 0.2942, 0.3220, 0.3188, 0.3528, 0.3242])
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+
+@slow
+@require_torch_gpu
+class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
+    def test_text_to_image_sdxl(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder")
+        feature_extractor = self.get_image_processor("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            torch_dtype=self.dtype,
+        )
+        pipeline.enable_model_cpu_offload()
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
+
+        inputs = self.get_dummy_inputs()
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array(
+            [
+                0.09630299,
+                0.09551358,
+                0.08480701,
+                0.09070173,
+                0.09437338,
+                0.09264627,
+                0.08883232,
+                0.09287417,
+                0.09197289,
+            ]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            torch_dtype=self.dtype,
+        )
+        pipeline.to(torch_device)
+        pipeline.load_ip_adapter(
+            "h94/IP-Adapter",
+            subfolder="sdxl_models",
+            weight_name="ip-adapter-plus_sdxl_vit-h.bin",
+        )
+
+        inputs = self.get_dummy_inputs()
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array([0.0596, 0.0539, 0.0459, 0.0580, 0.0560, 0.0548, 0.0501, 0.0563, 0.0500])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+    def test_image_to_image_sdxl(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder")
+        feature_extractor = self.get_image_processor("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+
+        pipeline = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            torch_dtype=self.dtype,
+        )
+        pipeline.enable_model_cpu_offload()
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
+
+        inputs = self.get_dummy_inputs(for_image_to_image=True)
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array(
+            [
+                0.06513795,
+                0.07009393,
+                0.07234055,
+                0.07426041,
+                0.07002589,
+                0.06415862,
+                0.07827643,
+                0.07962808,
+                0.07411247,
+            ]
+        )
+
+        assert np.allclose(image_slice, expected_slice, atol=1e-3)
+
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        feature_extractor = self.get_image_processor("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+
+        pipeline = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            torch_dtype=self.dtype,
+        )
+        pipeline.to(torch_device)
+        pipeline.load_ip_adapter(
+            "h94/IP-Adapter",
+            subfolder="sdxl_models",
+            weight_name="ip-adapter-plus_sdxl_vit-h.bin",
+        )
+
+        inputs = self.get_dummy_inputs(for_image_to_image=True)
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array(
+            [
+                0.07126552,
+                0.07025367,
+                0.07348302,
+                0.07580167,
+                0.07467338,
+                0.06918576,
+                0.07480252,
+                0.08279955,
+                0.08547315,
+            ]
+        )
+
+        assert np.allclose(image_slice, expected_slice, atol=1e-3)
+
+    def test_inpainting_sdxl(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder")
+        feature_extractor = self.get_image_processor("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+
+        pipeline = StableDiffusionXLInpaintPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            torch_dtype=self.dtype,
+        )
+        pipeline.enable_model_cpu_offload()
+        pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
+
+        inputs = self.get_dummy_inputs(for_inpainting=True)
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+        image_slice.tolist()
+
+        expected_slice = np.array(
+            [0.14181179, 0.1493012, 0.14283323, 0.14602411, 0.14915377, 0.15015268, 0.14725655, 0.15009224, 0.15164584]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        feature_extractor = self.get_image_processor("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+
+        pipeline = StableDiffusionXLInpaintPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            torch_dtype=self.dtype,
+        )
+        pipeline.to(torch_device)
+        pipeline.load_ip_adapter(
+            "h94/IP-Adapter",
+            subfolder="sdxl_models",
+            weight_name="ip-adapter-plus_sdxl_vit-h.bin",
+        )
+
+        inputs = self.get_dummy_inputs(for_inpainting=True)
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+        image_slice.tolist()
+
+        expected_slice = np.array([0.1398, 0.1476, 0.1407, 0.1442, 0.1470, 0.1480, 0.1449, 0.1481, 0.1494])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+    def test_ip_adapter_mask(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            image_encoder=image_encoder,
+            torch_dtype=self.dtype,
+        )
+        pipeline.enable_model_cpu_offload()
+        pipeline.load_ip_adapter(
+            "h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus-face_sdxl_vit-h.safetensors"
+        )
+        pipeline.set_ip_adapter_scale(0.7)
+
+        inputs = self.get_dummy_inputs(for_masks=True)
+        mask = inputs["cross_attention_kwargs"]["ip_adapter_masks"][0]
+        processor = IPAdapterMaskProcessor()
+        mask = processor.preprocess(mask)
+        inputs["cross_attention_kwargs"]["ip_adapter_masks"] = mask
+        inputs["ip_adapter_image"] = inputs["ip_adapter_image"][0]
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+        expected_slice = np.array(
+            [0.7307304, 0.73450166, 0.73731124, 0.7377061, 0.7318013, 0.73720926, 0.74746597, 0.7409929, 0.74074936]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+    def test_ip_adapter_multiple_masks(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            image_encoder=image_encoder,
+            torch_dtype=self.dtype,
+        )
+        pipeline.enable_model_cpu_offload()
+        pipeline.load_ip_adapter(
+            "h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] * 2
+        )
+        pipeline.set_ip_adapter_scale([0.7] * 2)
+
+        inputs = self.get_dummy_inputs(for_masks=True)
+        masks = inputs["cross_attention_kwargs"]["ip_adapter_masks"]
+        processor = IPAdapterMaskProcessor()
+        masks = processor.preprocess(masks)
+        inputs["cross_attention_kwargs"]["ip_adapter_masks"] = masks
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+        expected_slice = np.array(
+            [0.79474676, 0.7977683, 0.8013954, 0.7988008, 0.7970615, 0.8029355, 0.80614823, 0.8050743, 0.80627424]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+    def test_instant_style_multiple_masks(self):
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            "h94/IP-Adapter", subfolder="models/image_encoder", torch_dtype=torch.float16
+        )
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, image_encoder=image_encoder, variant="fp16"
+        )
+        pipeline.enable_model_cpu_offload()
+
+        pipeline.load_ip_adapter(
+            ["ostris/ip-composition-adapter", "h94/IP-Adapter"],
+            subfolder=["", "sdxl_models"],
+            weight_name=[
+                "ip_plus_composition_sdxl.safetensors",
+                "ip-adapter_sdxl_vit-h.safetensors",
+            ],
+            image_encoder_folder=None,
+        )
+        scale_1 = {
+            "down": [[0.0, 0.0, 1.0]],
+            "mid": [[0.0, 0.0, 1.0]],
+            "up": {"block_0": [[0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 1.0]], "block_1": [[0.0, 0.0, 1.0]]},
+        }
+        pipeline.set_ip_adapter_scale([1.0, scale_1])
+
+        inputs = self.get_dummy_inputs(for_instant_style=True)
+        processor = IPAdapterMaskProcessor()
+        masks1 = inputs["cross_attention_kwargs"]["ip_adapter_masks"][0]
+        masks2 = inputs["cross_attention_kwargs"]["ip_adapter_masks"][1]
+        masks1 = processor.preprocess(masks1, height=1024, width=1024)
+        masks2 = processor.preprocess(masks2, height=1024, width=1024)
+        masks2 = masks2.reshape(1, masks2.shape[0], masks2.shape[2], masks2.shape[3])
+        inputs["cross_attention_kwargs"]["ip_adapter_masks"] = [masks1, masks2]
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+
+        expected_slice = np.array([0.2323, 0.1026, 0.1338, 0.0638, 0.0662, 0.0000, 0.0000, 0.0000, 0.0199])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
+
+    def test_ip_adapter_multiple_masks_one_adapter(self):
+        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            image_encoder=image_encoder,
+            torch_dtype=self.dtype,
+        )
+        pipeline.enable_model_cpu_offload()
+        pipeline.load_ip_adapter(
+            "h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"]
+        )
+        pipeline.set_ip_adapter_scale([[0.7, 0.7]])
+
+        inputs = self.get_dummy_inputs(for_masks=True)
+        masks = inputs["cross_attention_kwargs"]["ip_adapter_masks"]
+        processor = IPAdapterMaskProcessor()
+        masks = processor.preprocess(masks)
+        masks = masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])
+        inputs["cross_attention_kwargs"]["ip_adapter_masks"] = [masks]
+        ip_images = inputs["ip_adapter_image"]
+        inputs["ip_adapter_image"] = [[image[0] for image in ip_images]]
+        images = pipeline(**inputs).images
+        image_slice = images[0, :3, :3, -1].flatten()
+        expected_slice = np.array(
+            [0.79474676, 0.7977683, 0.8013954, 0.7988008, 0.7970615, 0.8029355, 0.80614823, 0.8050743, 0.80627424]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 5e-4
diff --git a/diffusers/tests/pipelines/kandinsky/__init__.py b/diffusers/tests/pipelines/kandinsky/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/kandinsky/test_kandinsky.py b/diffusers/tests/pipelines/kandinsky/test_kandinsky.py
new file mode 100644
index 0000000000000000000000000000000000000000..8553ed96e9e187311100fddad4d8d72a17ad1a7f
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky/test_kandinsky.py
@@ -0,0 +1,329 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import XLMRobertaTokenizerFast
+
+from diffusers import DDIMScheduler, KandinskyPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel
+from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_numpy,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 32
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/tiny-random-mclip-base")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = MCLIPConfig(
+            numDims=self.cross_attention_dim,
+            transformerDimensions=self.text_embedder_hidden_size,
+            hidden_size=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            vocab_size=1005,
+        )
+
+        text_encoder = MultilingualCLIP(config)
+        text_encoder = text_encoder.eval()
+
+        return text_encoder
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 4,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "text_image",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "text_image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        scheduler = DDIMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="linear",
+            beta_start=0.00085,
+            beta_end=0.012,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+            prediction_type="epsilon",
+            thresholding=False,
+        )
+
+        components = {
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device)
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyPipeline
+    params = [
+        "prompt",
+        "image_embeds",
+        "negative_image_embeds",
+    ]
+    batch_params = ["prompt", "negative_prompt", "image_embeds", "negative_image_embeds"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummy = Dummies()
+        return dummy.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummy = Dummies()
+        return dummy.get_dummy_inputs(device=device, seed=seed)
+
+    def test_kandinsky(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([1.0000, 1.0000, 0.2766, 1.0000, 0.5447, 0.1737, 1.0000, 0.4316, 0.9024])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+
+@slow
+@require_torch_gpu
+class KandinskyPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_text2img(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinsky/kandinsky_text2img_cat_fp16.npy"
+        )
+
+        pipe_prior = KandinskyPriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.to(torch_device)
+
+        pipeline = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
+        pipeline.to(torch_device)
+        pipeline.set_progress_bar_config(disable=None)
+
+        prompt = "red cat, 4k photo"
+
+        generator = torch.Generator(device="cuda").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=5,
+            negative_prompt="",
+        ).to_tuple()
+
+        generator = torch.Generator(device="cuda").manual_seed(0)
+        output = pipeline(
+            prompt,
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            generator=generator,
+            num_inference_steps=100,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
diff --git a/diffusers/tests/pipelines/kandinsky/test_kandinsky_combined.py b/diffusers/tests/pipelines/kandinsky/test_kandinsky_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..607a47e08e58e8d9ba47b3b6a62b7dbf56328eb5
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky/test_kandinsky_combined.py
@@ -0,0 +1,365 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from diffusers import KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyInpaintCombinedPipeline
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin
+from .test_kandinsky import Dummies
+from .test_kandinsky_img2img import Dummies as Img2ImgDummies
+from .test_kandinsky_inpaint import Dummies as InpaintDummies
+from .test_kandinsky_prior import Dummies as PriorDummies
+
+
+enable_full_determinism()
+
+
+class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyCombinedPipeline
+    params = [
+        "prompt",
+    ]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = True
+
+    def get_dummy_components(self):
+        dummy = Dummies()
+        prior_dummy = PriorDummies()
+        components = dummy.get_dummy_components()
+
+        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        prior_dummy = PriorDummies()
+        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
+        inputs.update(
+            {
+                "height": 64,
+                "width": 64,
+            }
+        )
+        return inputs
+
+    def test_kandinsky(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.2893, 0.1464, 0.4603, 0.3529, 0.4612, 0.7701, 0.4027, 0.3051, 0.5155])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=2e-1)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
+
+
+class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyImg2ImgCombinedPipeline
+    params = ["prompt", "image"]
+    batch_params = ["prompt", "negative_prompt", "image"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummy = Img2ImgDummies()
+        prior_dummy = PriorDummies()
+        components = dummy.get_dummy_components()
+
+        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        prior_dummy = PriorDummies()
+        dummy = Img2ImgDummies()
+        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
+        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))
+        inputs.pop("image_embeds")
+        inputs.pop("negative_image_embeds")
+        return inputs
+
+    def test_kandinsky(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.4852, 0.4136, 0.4539, 0.4781, 0.4680, 0.5217, 0.4973, 0.4089, 0.4977])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=5e-4)
+
+
+class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyInpaintCombinedPipeline
+    params = ["prompt", "image", "mask_image"]
+    batch_params = ["prompt", "negative_prompt", "image", "mask_image"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummy = InpaintDummies()
+        prior_dummy = PriorDummies()
+        components = dummy.get_dummy_components()
+
+        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        prior_dummy = PriorDummies()
+        dummy = InpaintDummies()
+        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
+        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))
+        inputs.pop("image_embeds")
+        inputs.pop("negative_image_embeds")
+        return inputs
+
+    def test_kandinsky(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        print(image_from_tuple_slice)
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.0320, 0.0860, 0.4013, 0.0518, 0.2484, 0.5847, 0.4411, 0.2321, 0.4593])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    @unittest.skip("Difference between FP16 and FP32 too large on CI")
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=5e-4)
+
+    def test_save_load_local(self):
+        super().test_save_load_local(expected_max_difference=5e-3)
diff --git a/diffusers/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/diffusers/tests/pipelines/kandinsky/test_kandinsky_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea289c5ccd71d71b96a4f5edffc3bf62a4e68125
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky/test_kandinsky_img2img.py
@@ -0,0 +1,429 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import XLMRobertaTokenizerFast
+
+from diffusers import (
+    DDIMScheduler,
+    DDPMScheduler,
+    KandinskyImg2ImgPipeline,
+    KandinskyPriorPipeline,
+    UNet2DConditionModel,
+    VQModel,
+)
+from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    nightly,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 32
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/tiny-random-mclip-base")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = MCLIPConfig(
+            numDims=self.cross_attention_dim,
+            transformerDimensions=self.text_embedder_hidden_size,
+            hidden_size=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            vocab_size=1005,
+        )
+
+        text_encoder = MultilingualCLIP(config)
+        text_encoder = text_encoder.eval()
+
+        return text_encoder
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 4,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "text_image",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "text_image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        ddim_config = {
+            "num_train_timesteps": 1000,
+            "beta_schedule": "linear",
+            "beta_start": 0.00085,
+            "beta_end": 0.012,
+            "clip_sample": False,
+            "set_alpha_to_one": False,
+            "steps_offset": 0,
+            "prediction_type": "epsilon",
+            "thresholding": False,
+        }
+
+        scheduler = DDIMScheduler(**ddim_config)
+
+        components = {
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device)
+        # create init_image
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "image": init_image,
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "num_inference_steps": 10,
+            "guidance_scale": 7.0,
+            "strength": 0.2,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyImg2ImgPipeline
+    params = ["prompt", "image_embeds", "negative_image_embeds", "image"]
+    batch_params = [
+        "prompt",
+        "negative_prompt",
+        "image_embeds",
+        "negative_image_embeds",
+        "image",
+    ]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "strength",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
+    def test_kandinsky_img2img(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.5816, 0.5872, 0.4634, 0.5982, 0.4767, 0.4710, 0.4669, 0.4717, 0.4966])
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
+
+
+@slow
+@require_torch_gpu
+class KandinskyImg2ImgPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_img2img(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinsky/kandinsky_img2img_frog.npy"
+        )
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+        )
+        prompt = "A red cartoon frog, 4k"
+
+        pipe_prior = KandinskyPriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.to(torch_device)
+
+        pipeline = KandinskyImg2ImgPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=5,
+            negative_prompt="",
+        ).to_tuple()
+
+        output = pipeline(
+            prompt,
+            image=init_image,
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            generator=generator,
+            num_inference_steps=100,
+            height=768,
+            width=768,
+            strength=0.2,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
+
+
+@nightly
+@require_torch_gpu
+class KandinskyImg2ImgPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_img2img_ddpm(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinsky/kandinsky_img2img_ddpm_frog.npy"
+        )
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/frog.png"
+        )
+        prompt = "A red cartoon frog, 4k"
+
+        pipe_prior = KandinskyPriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.to(torch_device)
+
+        scheduler = DDPMScheduler.from_pretrained("kandinsky-community/kandinsky-2-1", subfolder="ddpm_scheduler")
+        pipeline = KandinskyImg2ImgPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-1", scheduler=scheduler, torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=5,
+            negative_prompt="",
+        ).to_tuple()
+
+        output = pipeline(
+            prompt,
+            image=init_image,
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            generator=generator,
+            num_inference_steps=100,
+            height=768,
+            width=768,
+            strength=0.2,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
diff --git a/diffusers/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/diffusers/tests/pipelines/kandinsky/test_kandinsky_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..740046678744e483e601852947d02819aae9d808
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky/test_kandinsky_inpaint.py
@@ -0,0 +1,362 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import XLMRobertaTokenizerFast
+
+from diffusers import DDIMScheduler, KandinskyInpaintPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel
+from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    nightly,
+    require_torch_gpu,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 32
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/tiny-random-mclip-base")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = MCLIPConfig(
+            numDims=self.cross_attention_dim,
+            transformerDimensions=self.text_embedder_hidden_size,
+            hidden_size=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            vocab_size=1005,
+        )
+
+        text_encoder = MultilingualCLIP(config)
+        text_encoder = text_encoder.eval()
+
+        return text_encoder
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 9,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "text_image",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "text_image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        scheduler = DDIMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="linear",
+            beta_start=0.00085,
+            beta_end=0.012,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+            prediction_type="epsilon",
+            thresholding=False,
+        )
+
+        components = {
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device)
+        # create init_image
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
+        # create mask
+        mask = np.zeros((64, 64), dtype=np.float32)
+        mask[:32, :32] = 1
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "image": init_image,
+            "mask_image": mask,
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "num_inference_steps": 2,
+            "guidance_scale": 4.0,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyInpaintPipeline
+    params = ["prompt", "image_embeds", "negative_image_embeds", "image", "mask_image"]
+    batch_params = [
+        "prompt",
+        "negative_prompt",
+        "image_embeds",
+        "negative_image_embeds",
+        "image",
+        "mask_image",
+    ]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
+    def test_kandinsky_inpaint(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.8222, 0.8896, 0.4373, 0.8088, 0.4905, 0.2609, 0.6816, 0.4291, 0.5129])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+
+@nightly
+@require_torch_gpu
+class KandinskyInpaintPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_inpaint(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinsky/kandinsky_inpaint_cat_with_hat_fp16.npy"
+        )
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+        )
+        mask = np.zeros((768, 768), dtype=np.float32)
+        mask[:250, 250:-250] = 1
+
+        prompt = "a hat"
+
+        pipe_prior = KandinskyPriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.to(torch_device)
+
+        pipeline = KandinskyInpaintPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=5,
+            negative_prompt="",
+        ).to_tuple()
+
+        output = pipeline(
+            prompt,
+            image=init_image,
+            mask_image=mask,
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            generator=generator,
+            num_inference_steps=100,
+            height=768,
+            width=768,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
diff --git a/diffusers/tests/pipelines/kandinsky/test_kandinsky_prior.py b/diffusers/tests/pipelines/kandinsky/test_kandinsky_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f42447bd9d52c4f9c5a891adb820f369a1de2b0
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky/test_kandinsky_prior.py
@@ -0,0 +1,238 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import KandinskyPriorPipeline, PriorTransformer, UnCLIPScheduler
+from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config)
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "embedding_dim": self.text_embedder_hidden_size,
+            "num_layers": 1,
+        }
+
+        model = PriorTransformer(**model_kwargs)
+        # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0
+        model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape))
+        return model
+
+    @property
+    def dummy_image_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=self.text_embedder_hidden_size,
+            image_size=224,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        model = CLIPVisionModelWithProjection(config)
+        return model
+
+    @property
+    def dummy_image_processor(self):
+        image_processor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        return image_processor
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        image_encoder = self.dummy_image_encoder
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        image_processor = self.dummy_image_processor
+
+        scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="sample",
+            num_train_timesteps=1000,
+            clip_sample=True,
+            clip_sample_range=10.0,
+        )
+
+        components = {
+            "prior": prior,
+            "image_encoder": image_encoder,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "image_processor": image_processor,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "generator": generator,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyPriorPipeline
+    params = ["prompt"]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "generator",
+        "num_inference_steps",
+        "latents",
+        "negative_prompt",
+        "guidance_scale",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummy = Dummies()
+        return dummy.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummy = Dummies()
+        return dummy.get_dummy_inputs(device=device, seed=seed)
+
+    def test_kandinsky_prior(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.image_embeds
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -10:]
+
+        image_from_tuple_slice = image_from_tuple[0, -10:]
+
+        assert image.shape == (1, 32)
+
+        expected_slice = np.array(
+            [-0.5948, 0.1875, -0.1523, -1.1995, -1.4061, -0.6367, -1.4607, -0.6406, 0.8793, -0.3891]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+        test_mean_pixel_difference = False
+
+        self._test_attention_slicing_forward_pass(
+            test_max_difference=test_max_difference,
+            test_mean_pixel_difference=test_mean_pixel_difference,
+        )
diff --git a/diffusers/tests/pipelines/kandinsky2_2/__init__.py b/diffusers/tests/pipelines/kandinsky2_2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky.py b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbd9166efadae0ca9148d59b83871a472b6e382d
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky.py
@@ -0,0 +1,277 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DDIMScheduler, KandinskyV22Pipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_numpy,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 32
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 4,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "image",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        scheduler = DDIMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="linear",
+            beta_start=0.00085,
+            beta_end=0.012,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+            prediction_type="epsilon",
+            thresholding=False,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
+            device
+        )
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22Pipeline
+    params = [
+        "image_embeds",
+        "negative_image_embeds",
+    ]
+    batch_params = ["image_embeds", "negative_image_embeds"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    callback_cfg_params = ["image_embds"]
+    test_xformers_attention = False
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def test_kandinsky(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.3420, 0.9505, 0.3919, 1.0000, 0.5188, 0.3109, 0.6139, 0.5624, 0.6811])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+
+@slow
+@require_torch_gpu
+class KandinskyV22PipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_text2img(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/kandinskyv22_text2img_cat_fp16.npy"
+        )
+
+        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.enable_model_cpu_offload()
+
+        pipeline = KandinskyV22Pipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        )
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        prompt = "red cat, 4k photo"
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=3,
+            negative_prompt="",
+        ).to_tuple()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipeline(
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            generator=generator,
+            num_inference_steps=3,
+            output_type="np",
+        )
+        image = output.images[0]
+        assert image.shape == (512, 512, 3)
+
+        max_diff = numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten())
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbba0831397b3087dfe6521d64e0ddd45329c09e
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
@@ -0,0 +1,406 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from diffusers import (
+    KandinskyV22CombinedPipeline,
+    KandinskyV22Img2ImgCombinedPipeline,
+    KandinskyV22InpaintCombinedPipeline,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin
+from .test_kandinsky import Dummies
+from .test_kandinsky_img2img import Dummies as Img2ImgDummies
+from .test_kandinsky_inpaint import Dummies as InpaintDummies
+from .test_kandinsky_prior import Dummies as PriorDummies
+
+
+enable_full_determinism()
+
+
+class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22CombinedPipeline
+    params = [
+        "prompt",
+    ]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = True
+    callback_cfg_params = ["image_embds"]
+
+    def get_dummy_components(self):
+        dummy = Dummies()
+        prior_dummy = PriorDummies()
+        components = dummy.get_dummy_components()
+
+        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        prior_dummy = PriorDummies()
+        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
+        inputs.update(
+            {
+                "height": 64,
+                "width": 64,
+            }
+        )
+        return inputs
+
+    def test_kandinsky(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.3076, 0.2729, 0.5668, 0.0522, 0.3384, 0.7028, 0.4908, 0.3659, 0.6243])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
+
+    def test_model_cpu_offload_forward_pass(self):
+        super().test_model_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
+    def test_save_load_local(self):
+        super().test_save_load_local(expected_max_difference=5e-3)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=5e-3)
+
+    def test_callback_inputs(self):
+        pass
+
+    def test_callback_cfg(self):
+        pass
+
+
+class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22Img2ImgCombinedPipeline
+    params = ["prompt", "image"]
+    batch_params = ["prompt", "negative_prompt", "image"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+    callback_cfg_params = ["image_embds"]
+
+    def get_dummy_components(self):
+        dummy = Img2ImgDummies()
+        prior_dummy = PriorDummies()
+        components = dummy.get_dummy_components()
+
+        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        prior_dummy = PriorDummies()
+        dummy = Img2ImgDummies()
+        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
+        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))
+        inputs.pop("image_embeds")
+        inputs.pop("negative_image_embeds")
+        return inputs
+
+    def test_kandinsky(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.4445, 0.4287, 0.4596, 0.3919, 0.3730, 0.5039, 0.4834, 0.4269, 0.5521])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=2e-1)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
+
+    def test_model_cpu_offload_forward_pass(self):
+        super().test_model_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=5e-4)
+
+    def save_load_local(self):
+        super().test_save_load_local(expected_max_difference=5e-3)
+
+    def test_callback_inputs(self):
+        pass
+
+    def test_callback_cfg(self):
+        pass
+
+
+class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22InpaintCombinedPipeline
+    params = ["prompt", "image", "mask_image"]
+    batch_params = ["prompt", "negative_prompt", "image", "mask_image"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummy = InpaintDummies()
+        prior_dummy = PriorDummies()
+        components = dummy.get_dummy_components()
+
+        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        prior_dummy = PriorDummies()
+        dummy = InpaintDummies()
+        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
+        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))
+        inputs.pop("image_embeds")
+        inputs.pop("negative_image_embeds")
+        return inputs
+
+    def test_kandinsky(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.5039, 0.4926, 0.4898, 0.4978, 0.4838, 0.4942, 0.4738, 0.4702, 0.4816])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
+
+    def test_model_cpu_offload_forward_pass(self):
+        super().test_model_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
+    def test_save_load_local(self):
+        super().test_save_load_local(expected_max_difference=5e-3)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=5e-4)
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
+    def test_callback_inputs(self):
+        pass
+
+    def test_callback_cfg(self):
+        pass
diff --git a/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f3219e0d69e5a7168cdf80f0ef0a272582bd424
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py
@@ -0,0 +1,292 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    DDIMScheduler,
+    KandinskyV22ControlnetPipeline,
+    KandinskyV22PriorPipeline,
+    UNet2DConditionModel,
+    VQModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    nightly,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class KandinskyV22ControlnetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22ControlnetPipeline
+    params = ["image_embeds", "negative_image_embeds", "hint"]
+    batch_params = ["image_embeds", "negative_image_embeds", "hint"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 8,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "image_hint",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 32, 64, 64],
+            "down_block_types": [
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "AttnDownEncoderBlock2D",
+            ],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": ["AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        scheduler = DDIMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="linear",
+            beta_start=0.00085,
+            beta_end=0.012,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+            prediction_type="epsilon",
+            thresholding=False,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
+            device
+        )
+
+        # create hint
+        hint = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "hint": hint,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_kandinsky_controlnet(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [0.6959826, 0.868279, 0.7558092, 0.68769467, 0.85805804, 0.65977496, 0.44885302, 0.5959111, 0.4251595]
+        )
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=5e-4)
+
+
+@nightly
+@require_torch_gpu
+class KandinskyV22ControlnetPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_controlnet(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/kandinskyv22_controlnet_robotcat_fp16.npy"
+        )
+
+        hint = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/hint_image_cat.png"
+        )
+        hint = torch.from_numpy(np.array(hint)).float() / 255.0
+        hint = hint.permute(2, 0, 1).unsqueeze(0)
+
+        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.enable_model_cpu_offload()
+
+        pipeline = KandinskyV22ControlnetPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
+        )
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        prompt = "A robot, 4k photo"
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=2,
+            negative_prompt="",
+        ).to_tuple()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipeline(
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            hint=hint,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        max_diff = numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten())
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..20944aa3d6f8e663f466f4baa55a4fc48e7b9ec5
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py
@@ -0,0 +1,312 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+
+from diffusers import (
+    DDIMScheduler,
+    KandinskyV22ControlnetImg2ImgPipeline,
+    KandinskyV22PriorEmb2EmbPipeline,
+    UNet2DConditionModel,
+    VQModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    nightly,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class KandinskyV22ControlnetImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22ControlnetImg2ImgPipeline
+    params = ["image_embeds", "negative_image_embeds", "image", "hint"]
+    batch_params = ["image_embeds", "negative_image_embeds", "image", "hint"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "strength",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 8,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "image_hint",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 32, 64, 64],
+            "down_block_types": [
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "AttnDownEncoderBlock2D",
+            ],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": ["AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        ddim_config = {
+            "num_train_timesteps": 1000,
+            "beta_schedule": "linear",
+            "beta_start": 0.00085,
+            "beta_end": 0.012,
+            "clip_sample": False,
+            "set_alpha_to_one": False,
+            "steps_offset": 0,
+            "prediction_type": "epsilon",
+            "thresholding": False,
+        }
+
+        scheduler = DDIMScheduler(**ddim_config)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
+            device
+        )
+        # create init_image
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
+        # create hint
+        hint = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": init_image,
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "hint": hint,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "num_inference_steps": 10,
+            "guidance_scale": 7.0,
+            "strength": 0.2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_kandinsky_controlnet_img2img(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [0.54985034, 0.55509365, 0.52561504, 0.5570494, 0.5593818, 0.5263979, 0.50285643, 0.5069846, 0.51196736]
+        )
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1.75e-3)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=2e-1)
+
+
+@nightly
+@require_torch_gpu
+class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_controlnet_img2img(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/kandinskyv22_controlnet_img2img_robotcat_fp16.npy"
+        )
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+        )
+        init_image = init_image.resize((512, 512))
+
+        hint = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/hint_image_cat.png"
+        )
+        hint = torch.from_numpy(np.array(hint)).float() / 255.0
+        hint = hint.permute(2, 0, 1).unsqueeze(0)
+
+        prompt = "A robot, 4k photo"
+
+        pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.enable_model_cpu_offload()
+
+        pipeline = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
+        )
+        pipeline.enable_model_cpu_offload()
+
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            image=init_image,
+            strength=0.85,
+            generator=generator,
+            negative_prompt="",
+            num_inference_steps=5,
+        ).to_tuple()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipeline(
+            image=init_image,
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            hint=hint,
+            generator=generator,
+            num_inference_steps=5,
+            height=512,
+            width=512,
+            strength=0.5,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        max_diff = numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten())
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..26d8b45cf90007f23c23b5ec820d1b1b265c2e14
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py
@@ -0,0 +1,303 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+
+from diffusers import (
+    DDIMScheduler,
+    KandinskyV22Img2ImgPipeline,
+    KandinskyV22PriorPipeline,
+    UNet2DConditionModel,
+    VQModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 32
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 4,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "image",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        ddim_config = {
+            "num_train_timesteps": 1000,
+            "beta_schedule": "linear",
+            "beta_start": 0.00085,
+            "beta_end": 0.012,
+            "clip_sample": False,
+            "set_alpha_to_one": False,
+            "steps_offset": 0,
+            "prediction_type": "epsilon",
+            "thresholding": False,
+        }
+
+        scheduler = DDIMScheduler(**ddim_config)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
+            device
+        )
+        # create init_image
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": init_image,
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "num_inference_steps": 10,
+            "guidance_scale": 7.0,
+            "strength": 0.2,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22Img2ImgPipeline
+    params = ["image_embeds", "negative_image_embeds", "image"]
+    batch_params = [
+        "image_embeds",
+        "negative_image_embeds",
+        "image",
+    ]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "strength",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+    callback_cfg_params = ["image_embeds"]
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
+    def test_kandinsky_img2img(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.5712, 0.5443, 0.4725, 0.6195, 0.5184, 0.4651, 0.4473, 0.4590, 0.5016])
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=2e-1)
+
+
+@slow
+@require_torch_gpu
+class KandinskyV22Img2ImgPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_img2img(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/kandinskyv22_img2img_frog.npy"
+        )
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+        )
+        prompt = "A red cartoon frog, 4k"
+
+        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.enable_model_cpu_offload()
+
+        pipeline = KandinskyV22Img2ImgPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        )
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=5,
+            negative_prompt="",
+        ).to_tuple()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipeline(
+            image=init_image,
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            generator=generator,
+            num_inference_steps=5,
+            height=768,
+            width=768,
+            strength=0.2,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+
+        max_diff = numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten())
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..25cf4bbed456bf7c86b8987a9e2fa0abe24176ba
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
@@ -0,0 +1,360 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+
+from diffusers import (
+    DDIMScheduler,
+    KandinskyV22InpaintPipeline,
+    KandinskyV22PriorPipeline,
+    UNet2DConditionModel,
+    VQModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    is_flaky,
+    load_image,
+    load_numpy,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 32
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 9,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "image",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        scheduler = DDIMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="linear",
+            beta_start=0.00085,
+            beta_end=0.012,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+            prediction_type="epsilon",
+            thresholding=False,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
+            device
+        )
+        # create init_image
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
+        # create mask
+        mask = np.zeros((64, 64), dtype=np.float32)
+        mask[:32, :32] = 1
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": init_image,
+            "mask_image": mask,
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "num_inference_steps": 2,
+            "guidance_scale": 4.0,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22InpaintPipeline
+    params = ["image_embeds", "negative_image_embeds", "image", "mask_image"]
+    batch_params = [
+        "image_embeds",
+        "negative_image_embeds",
+        "image",
+        "mask_image",
+    ]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+    callback_cfg_params = ["image_embeds", "masked_image", "mask_image"]
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
+    def test_kandinsky_inpaint(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [0.50775903, 0.49527195, 0.48824543, 0.50192237, 0.48644906, 0.49373814, 0.4780598, 0.47234827, 0.48327848]
+        )
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+    @is_flaky()
+    def test_model_cpu_offload_forward_pass(self):
+        super().test_inference_batch_single_identical(expected_max_diff=8e-4)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=5e-4)
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
+    # override default test because we need to zero out mask too in order to make sure final latent is all zero
+    def test_callback_inputs(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_test(pipe, i, t, callback_kwargs):
+            missing_callback_inputs = set()
+            for v in pipe._callback_tensor_inputs:
+                if v not in callback_kwargs:
+                    missing_callback_inputs.add(v)
+            self.assertTrue(
+                len(missing_callback_inputs) == 0, f"Missing callback tensor inputs: {missing_callback_inputs}"
+            )
+            last_i = pipe.num_timesteps - 1
+            if i == last_i:
+                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
+                callback_kwargs["mask_image"] = torch.zeros_like(callback_kwargs["mask_image"])
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["callback_on_step_end"] = callback_inputs_test
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        inputs["output_type"] = "latent"
+
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() == 0
+
+
+@slow
+@require_torch_gpu
+class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_inpaint(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/kandinskyv22_inpaint_cat_with_hat_fp16.npy"
+        )
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+        )
+        mask = np.zeros((768, 768), dtype=np.float32)
+        mask[:250, 250:-250] = 1
+
+        prompt = "a hat"
+
+        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.to(torch_device)
+
+        pipeline = KandinskyV22InpaintPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=2,
+            negative_prompt="",
+        ).to_tuple()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipeline(
+            image=init_image,
+            mask_image=mask,
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            generator=generator,
+            num_inference_steps=2,
+            height=768,
+            width=768,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+
+        max_diff = numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten())
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..be0bc238d4daea0f7d46469c6cf802f2551ca13b
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py
@@ -0,0 +1,279 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import KandinskyV22PriorPipeline, PriorTransformer, UnCLIPScheduler
+from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config)
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "embedding_dim": self.text_embedder_hidden_size,
+            "num_layers": 1,
+        }
+
+        model = PriorTransformer(**model_kwargs)
+        # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0
+        model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape))
+        return model
+
+    @property
+    def dummy_image_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=self.text_embedder_hidden_size,
+            image_size=224,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        model = CLIPVisionModelWithProjection(config)
+        return model
+
+    @property
+    def dummy_image_processor(self):
+        image_processor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        return image_processor
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        image_encoder = self.dummy_image_encoder
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        image_processor = self.dummy_image_processor
+
+        scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="sample",
+            num_train_timesteps=1000,
+            clip_sample=True,
+            clip_sample_range=10.0,
+        )
+
+        components = {
+            "prior": prior,
+            "image_encoder": image_encoder,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "image_processor": image_processor,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "generator": generator,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22PriorPipeline
+    params = ["prompt"]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "generator",
+        "num_inference_steps",
+        "latents",
+        "negative_prompt",
+        "guidance_scale",
+        "output_type",
+        "return_dict",
+    ]
+    callback_cfg_params = ["prompt_embeds", "text_encoder_hidden_states", "text_mask"]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
+    def test_kandinsky_prior(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.image_embeds
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -10:]
+
+        image_from_tuple_slice = image_from_tuple[0, -10:]
+
+        assert image.shape == (1, 32)
+
+        expected_slice = np.array(
+            [-0.5948, 0.1875, -0.1523, -1.1995, -1.4061, -0.6367, -1.4607, -0.6406, 0.8793, -0.3891]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+        test_mean_pixel_difference = False
+
+        self._test_attention_slicing_forward_pass(
+            test_max_difference=test_max_difference,
+            test_mean_pixel_difference=test_mean_pixel_difference,
+        )
+
+    # override default test because no output_type "latent", use "pt" instead
+    def test_callback_inputs(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+
+        if not ("callback_on_step_end_tensor_inputs" in sig.parameters and "callback_on_step_end" in sig.parameters):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_test(pipe, i, t, callback_kwargs):
+            missing_callback_inputs = set()
+            for v in pipe._callback_tensor_inputs:
+                if v not in callback_kwargs:
+                    missing_callback_inputs.add(v)
+            self.assertTrue(
+                len(missing_callback_inputs) == 0, f"Missing callback tensor inputs: {missing_callback_inputs}"
+            )
+            last_i = pipe.num_timesteps - 1
+            if i == last_i:
+                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["callback_on_step_end"] = callback_inputs_test
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        inputs["num_inference_steps"] = 2
+        inputs["output_type"] = "pt"
+
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() == 0
diff --git a/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py
new file mode 100644
index 0000000000000000000000000000000000000000..e898824e2d17df56d2fe2f97d7bafe4e9e6605d1
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py
@@ -0,0 +1,242 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from torch import nn
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    skip_mps,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class KandinskyV22PriorEmb2EmbPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22PriorEmb2EmbPipeline
+    params = ["prompt", "image"]
+    batch_params = ["prompt", "image"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "strength",
+        "generator",
+        "num_inference_steps",
+        "negative_prompt",
+        "guidance_scale",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config)
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "embedding_dim": self.text_embedder_hidden_size,
+            "num_layers": 1,
+        }
+
+        model = PriorTransformer(**model_kwargs)
+        # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0
+        model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape))
+        return model
+
+    @property
+    def dummy_image_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=self.text_embedder_hidden_size,
+            image_size=224,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        model = CLIPVisionModelWithProjection(config)
+        return model
+
+    @property
+    def dummy_image_processor(self):
+        image_processor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        return image_processor
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        image_encoder = self.dummy_image_encoder
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        image_processor = self.dummy_image_processor
+
+        scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="sample",
+            num_train_timesteps=1000,
+            clip_sample=True,
+            clip_sample_range=10.0,
+        )
+
+        components = {
+            "prior": prior,
+            "image_encoder": image_encoder,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "image_processor": image_processor,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
+
+        inputs = {
+            "prompt": "horse",
+            "image": init_image,
+            "strength": 0.5,
+            "generator": generator,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_kandinsky_prior_emb2emb(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.image_embeds
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -10:]
+
+        image_from_tuple_slice = image_from_tuple[0, -10:]
+
+        assert image.shape == (1, 32)
+
+        expected_slice = np.array(
+            [-0.8947, 0.7225, -0.2400, -1.4224, -1.9268, -1.1454, -1.8220, -0.7972, 1.0465, -0.5207]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+        test_mean_pixel_difference = False
+
+        self._test_attention_slicing_forward_pass(
+            test_max_difference=test_max_difference,
+            test_mean_pixel_difference=test_mean_pixel_difference,
+        )
diff --git a/diffusers/tests/pipelines/kandinsky3/__init__.py b/diffusers/tests/pipelines/kandinsky3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/kandinsky3/test_kandinsky3.py b/diffusers/tests/pipelines/kandinsky3/test_kandinsky3.py
new file mode 100644
index 0000000000000000000000000000000000000000..941ef9093361b69faf407da21c9fd099401a181c
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky3/test_kandinsky3.py
@@ -0,0 +1,239 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoPipelineForImage2Image,
+    AutoPipelineForText2Image,
+    Kandinsky3Pipeline,
+    Kandinsky3UNet,
+    VQModel,
+)
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_image,
+    require_torch_gpu,
+    slow,
+)
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Kandinsky3PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = Kandinsky3Pipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS
+    test_xformers_attention = False
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = Kandinsky3UNet(
+            in_channels=4,
+            time_embedding_dim=4,
+            groups=2,
+            attention_head_dim=4,
+            layers_per_block=3,
+            block_out_channels=(32, 64),
+            cross_attention_dim=4,
+            encoder_hid_dim=32,
+        )
+        scheduler = DDPMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="squaredcos_cap_v2",
+            clip_sample=True,
+            thresholding=False,
+        )
+        torch.manual_seed(0)
+        movq = self.dummy_movq
+        torch.manual_seed(0)
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "width": 16,
+            "height": 16,
+        }
+        return inputs
+
+    def test_kandinsky3(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 16, 16, 3)
+
+        expected_slice = np.array([0.3768, 0.4373, 0.4865, 0.4890, 0.4299, 0.5122, 0.4921, 0.4924, 0.5599])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+
+@slow
+@require_torch_gpu
+class Kandinsky3PipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinskyV3(self):
+        pipe = AutoPipelineForText2Image.from_pretrained(
+            "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        image = pipe(prompt, num_inference_steps=5, generator=generator).images[0]
+
+        assert image.size == (1024, 1024)
+
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png"
+        )
+
+        image_processor = VaeImageProcessor()
+
+        image_np = image_processor.pil_to_numpy(image)
+        expected_image_np = image_processor.pil_to_numpy(expected_image)
+
+        self.assertTrue(np.allclose(image_np, expected_image_np, atol=5e-2))
+
+    def test_kandinskyV3_img2img(self):
+        pipe = AutoPipelineForImage2Image.from_pretrained(
+            "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png"
+        )
+        w, h = 512, 512
+        image = image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
+        prompt = "A painting of the inside of a subway train with tiny raccoons."
+
+        image = pipe(prompt, image=image, strength=0.75, num_inference_steps=5, generator=generator).images[0]
+
+        assert image.size == (512, 512)
+
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/i2i.png"
+        )
+
+        image_processor = VaeImageProcessor()
+
+        image_np = image_processor.pil_to_numpy(image)
+        expected_image_np = image_processor.pil_to_numpy(expected_image)
+
+        self.assertTrue(np.allclose(image_np, expected_image_np, atol=5e-2))
diff --git a/diffusers/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py b/diffusers/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c817df32e0c9baac3f2261b8f575d1c4b58eb0d
--- /dev/null
+++ b/diffusers/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py
@@ -0,0 +1,238 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoPipelineForImage2Image,
+    Kandinsky3Img2ImgPipeline,
+    Kandinsky3UNet,
+    VQModel,
+)
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = Kandinsky3Img2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS
+    test_xformers_attention = False
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "num_images_per_prompt",
+            "generator",
+            "output_type",
+            "return_dict",
+        ]
+    )
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = Kandinsky3UNet(
+            in_channels=4,
+            time_embedding_dim=4,
+            groups=2,
+            attention_head_dim=4,
+            layers_per_block=3,
+            block_out_channels=(32, 64),
+            cross_attention_dim=4,
+            encoder_hid_dim=32,
+        )
+        scheduler = DDPMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="squaredcos_cap_v2",
+            clip_sample=True,
+            thresholding=False,
+        )
+        torch.manual_seed(0)
+        movq = self.dummy_movq
+        torch.manual_seed(0)
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        # create init_image
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": init_image,
+            "generator": generator,
+            "strength": 0.75,
+            "num_inference_steps": 10,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.5762, 0.6112, 0.4150, 0.6018, 0.6167, 0.4626, 0.5426, 0.5641, 0.6536])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
+    def test_kandinsky3_img2img(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [0.576259, 0.6132097, 0.41703486, 0.603196, 0.62062526, 0.4655338, 0.5434324, 0.5660727, 0.65433365]
+        )
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+
+@slow
+@require_torch_gpu
+class Kandinsky3Img2ImgPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinskyV3_img2img(self):
+        pipe = AutoPipelineForImage2Image.from_pretrained(
+            "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png"
+        )
+        w, h = 512, 512
+        image = image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
+        prompt = "A painting of the inside of a subway train with tiny raccoons."
+
+        image = pipe(prompt, image=image, strength=0.75, num_inference_steps=5, generator=generator).images[0]
+
+        assert image.size == (512, 512)
+
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/i2i.png"
+        )
+
+        image_processor = VaeImageProcessor()
+
+        image_np = image_processor.pil_to_numpy(image)
+        expected_image_np = image_processor.pil_to_numpy(expected_image)
+
+        self.assertTrue(np.allclose(image_np, expected_image_np, atol=5e-2))
diff --git a/diffusers/tests/pipelines/kolors/__init__.py b/diffusers/tests/pipelines/kolors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/kolors/test_kolors.py b/diffusers/tests/pipelines/kolors/test_kolors.py
new file mode 100644
index 0000000000000000000000000000000000000000..de44af6d59085a624abc9499bc06442310697681
--- /dev/null
+++ b/diffusers/tests/pipelines/kolors/test_kolors.py
@@ -0,0 +1,143 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    EulerDiscreteScheduler,
+    KolorsPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.kolors import ChatGLMModel, ChatGLMTokenizer
+from diffusers.utils.testing_utils import enable_full_determinism
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class KolorsPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KolorsPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(2, 4),
+            layers_per_block=2,
+            time_cond_proj_dim=time_cond_proj_dim,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=56,
+            cross_attention_dim=8,
+            norm_num_groups=1,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
+        tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "image_encoder": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (1, 64, 64, 3))
+        expected_slice = np.array(
+            [0.26413745, 0.4425478, 0.4102801, 0.42693347, 0.52529025, 0.3867405, 0.47512037, 0.41538602, 0.43855375]
+        )
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=2e-4)
+
+    def test_save_load_float16(self):
+        super().test_save_load_float16(expected_max_diff=2e-1)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=5e-4)
diff --git a/diffusers/tests/pipelines/kolors/test_kolors_img2img.py b/diffusers/tests/pipelines/kolors/test_kolors_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..2010dbd7055a1a494bf5b1e90718da7e4693a161
--- /dev/null
+++ b/diffusers/tests/pipelines/kolors/test_kolors_img2img.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    EulerDiscreteScheduler,
+    KolorsImg2ImgPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.kolors import ChatGLMModel, ChatGLMTokenizer
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+)
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class KolorsPipelineImg2ImgFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KolorsImg2ImgPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+    # Copied from tests.pipelines.kolors.test_kolors.KolorsPipelineFastTests.get_dummy_components
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(2, 4),
+            layers_per_block=2,
+            time_cond_proj_dim=time_cond_proj_dim,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=56,
+            cross_attention_dim=8,
+            norm_num_groups=1,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
+        tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "image_encoder": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image / 2 + 0.5
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "strength": 0.8,
+        }
+
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (1, 64, 64, 3))
+        expected_slice = np.array(
+            [0.54823864, 0.43654007, 0.4886489, 0.63072854, 0.53641886, 0.4896852, 0.62123513, 0.5621531, 0.42809626]
+        )
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=7e-2)
diff --git a/diffusers/tests/pipelines/latent_consistency_models/__init__.py b/diffusers/tests/pipelines/latent_consistency_models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py b/diffusers/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..b60a4553cded6027f6cd0609b79224ddcbe2fd04
--- /dev/null
+++ b/diffusers/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py
@@ -0,0 +1,265 @@
+import gc
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    LatentConsistencyModelPipeline,
+    LCMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class LatentConsistencyModelPipelineFastTests(
+    IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = LatentConsistencyModelPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"negative_prompt", "negative_prompt_embeds"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - {"negative_prompt"}
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=1,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            norm_num_groups=2,
+            time_cond_proj_dim=32,
+        )
+        scheduler = LCMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=64,
+            layer_norm_eps=1e-05,
+            num_attention_heads=8,
+            num_hidden_layers=3,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+            "requires_safety_checker": False,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.1403, 0.5072, 0.5316, 0.1202, 0.3865, 0.4211, 0.5363, 0.3557, 0.3645])
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_lcm_onestep(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        pipe = LatentConsistencyModelPipeline(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = 1
+        output = pipe(**inputs)
+        image = output.images
+        assert image.shape == (1, 64, 64, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.1441, 0.5304, 0.5452, 0.1361, 0.4011, 0.4370, 0.5326, 0.3492, 0.3637])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_lcm_multistep(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        pipe = LatentConsistencyModelPipeline(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = pipe(**inputs)
+        image = output.images
+        assert image.shape == (1, 64, 64, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.1403, 0.5072, 0.5316, 0.1202, 0.3865, 0.4211, 0.5363, 0.3557, 0.3645])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        pipe = LatentConsistencyModelPipeline(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        output = pipe(**inputs)
+        image = output.images
+        assert image.shape == (1, 64, 64, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.1403, 0.5072, 0.5316, 0.1202, 0.3865, 0.4211, 0.5363, 0.3557, 0.3645])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=5e-4)
+
+    # skip because lcm pipeline apply cfg differently
+    def test_callback_cfg(self):
+        pass
+
+    # override default test because the final latent variable is "denoised" instead of "latents"
+    def test_callback_inputs(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+
+        if not ("callback_on_step_end_tensor_inputs" in sig.parameters and "callback_on_step_end" in sig.parameters):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_test(pipe, i, t, callback_kwargs):
+            missing_callback_inputs = set()
+            for v in pipe._callback_tensor_inputs:
+                if v not in callback_kwargs:
+                    missing_callback_inputs.add(v)
+            self.assertTrue(
+                len(missing_callback_inputs) == 0, f"Missing callback tensor inputs: {missing_callback_inputs}"
+            )
+            last_i = pipe.num_timesteps - 1
+            if i == last_i:
+                callback_kwargs["denoised"] = torch.zeros_like(callback_kwargs["denoised"])
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["callback_on_step_end"] = callback_inputs_test
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        inputs["output_type"] = "latent"
+
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() == 0
+
+
+@slow
+@require_torch_gpu
+class LatentConsistencyModelPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "a photograph of an astronaut riding a horse",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_lcm_onestep(self):
+        pipe = LatentConsistencyModelPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", safety_checker=None)
+        pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 1
+        image = pipe(**inputs).images
+        assert image.shape == (1, 512, 512, 3)
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        expected_slice = np.array([0.1025, 0.0911, 0.0984, 0.0981, 0.0901, 0.0918, 0.1055, 0.0940, 0.0730])
+        assert np.abs(image_slice - expected_slice).max() < 1e-3
+
+    def test_lcm_multistep(self):
+        pipe = LatentConsistencyModelPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", safety_checker=None)
+        pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        assert image.shape == (1, 512, 512, 3)
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        expected_slice = np.array([0.01855, 0.01855, 0.01489, 0.01392, 0.01782, 0.01465, 0.01831, 0.02539, 0.0])
+        assert np.abs(image_slice - expected_slice).max() < 1e-3
diff --git a/diffusers/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py b/diffusers/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..386e60c54ac68a9c95cbac4a21aa38d1a302c18c
--- /dev/null
+++ b/diffusers/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py
@@ -0,0 +1,283 @@
+import gc
+import inspect
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    LatentConsistencyModelImg2ImgPipeline,
+    LCMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class LatentConsistencyModelImg2ImgPipelineFastTests(
+    IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = LatentConsistencyModelImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "negative_prompt", "negative_prompt_embeds"}
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents", "negative_prompt"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=1,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            norm_num_groups=2,
+            time_cond_proj_dim=32,
+        )
+        scheduler = LCMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=64,
+            layer_norm_eps=1e-05,
+            num_attention_heads=8,
+            num_hidden_layers=3,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+            "requires_safety_checker": False,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image / 2 + 0.5
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.4003, 0.3718, 0.2863, 0.5500, 0.5587, 0.3772, 0.4617, 0.4961, 0.4417])
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_lcm_onestep(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = 1
+        output = pipe(**inputs)
+        image = output.images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.4388, 0.3717, 0.2202, 0.7213, 0.6370, 0.3664, 0.5815, 0.6080, 0.4977])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_lcm_multistep(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = pipe(**inputs)
+        image = output.images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.4150, 0.3719, 0.2479, 0.6333, 0.6024, 0.3778, 0.5036, 0.5420, 0.4678])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        output = pipe(**inputs)
+        image = output.images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.3994, 0.3471, 0.2540, 0.7030, 0.6193, 0.3645, 0.5777, 0.5850, 0.4965])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=5e-4)
+
+    # override default test because the final latent variable is "denoised" instead of "latents"
+    def test_callback_inputs(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+
+        if not ("callback_on_step_end_tensor_inputs" in sig.parameters and "callback_on_step_end" in sig.parameters):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_test(pipe, i, t, callback_kwargs):
+            missing_callback_inputs = set()
+            for v in pipe._callback_tensor_inputs:
+                if v not in callback_kwargs:
+                    missing_callback_inputs.add(v)
+            self.assertTrue(
+                len(missing_callback_inputs) == 0, f"Missing callback tensor inputs: {missing_callback_inputs}"
+            )
+            last_i = pipe.num_timesteps - 1
+            if i == last_i:
+                callback_kwargs["denoised"] = torch.zeros_like(callback_kwargs["denoised"])
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["callback_on_step_end"] = callback_inputs_test
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        inputs["output_type"] = "latent"
+
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() == 0
+
+
+@slow
+@require_torch_gpu
+class LatentConsistencyModelImg2ImgPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/sketch-mountains-input.png"
+        )
+        init_image = init_image.resize((512, 512))
+
+        inputs = {
+            "prompt": "a photograph of an astronaut riding a horse",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+            "image": init_image,
+        }
+        return inputs
+
+    def test_lcm_onestep(self):
+        pipe = LatentConsistencyModelImg2ImgPipeline.from_pretrained(
+            "SimianLuo/LCM_Dreamshaper_v7", safety_checker=None
+        )
+        pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 1
+        image = pipe(**inputs).images
+        assert image.shape == (1, 512, 512, 3)
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        expected_slice = np.array([0.3479, 0.3314, 0.3555, 0.3430, 0.3649, 0.3423, 0.3239, 0.3117, 0.3240])
+        assert np.abs(image_slice - expected_slice).max() < 1e-3
+
+    def test_lcm_multistep(self):
+        pipe = LatentConsistencyModelImg2ImgPipeline.from_pretrained(
+            "SimianLuo/LCM_Dreamshaper_v7", safety_checker=None
+        )
+        pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        assert image.shape == (1, 512, 512, 3)
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        expected_slice = np.array([0.1442, 0.1201, 0.1598, 0.1281, 0.1412, 0.1502, 0.1455, 0.1544, 0.1231])
+        assert np.abs(image_slice - expected_slice).max() < 1e-3
diff --git a/diffusers/tests/pipelines/latent_diffusion/__init__.py b/diffusers/tests/pipelines/latent_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..e751240e43b0c8150e1664ed86ff6958c54faa9e
--- /dev/null
+++ b/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py
@@ -0,0 +1,217 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_numpy,
+    nightly,
+    require_torch_gpu,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = LDMTextToImagePipeline
+    params = TEXT_TO_IMAGE_PARAMS - {
+        "negative_prompt",
+        "negative_prompt_embeds",
+        "cross_attention_kwargs",
+        "prompt_embeds",
+    }
+    required_optional_params = PipelineTesterMixin.required_optional_params - {
+        "num_images_per_prompt",
+        "callback",
+        "callback_steps",
+    }
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=(32, 64),
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
+            up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vqvae": vae,
+            "bert": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_inference_text2img(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        pipe = LDMTextToImagePipeline(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 16, 16, 3)
+        expected_slice = np.array([0.6101, 0.6156, 0.5622, 0.4895, 0.6661, 0.3804, 0.5748, 0.6136, 0.5014])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+
+@nightly
+@require_torch_gpu
+class LDMTextToImagePipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, dtype=torch.float32, seed=0):
+        generator = torch.manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_ldm_default_ddim(self):
+        pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.51825, 0.52850, 0.52543, 0.54258, 0.52304, 0.52569, 0.54363, 0.55276, 0.56878])
+        max_diff = np.abs(expected_slice - image_slice).max()
+        assert max_diff < 1e-3
+
+
+@nightly
+@require_torch_gpu
+class LDMTextToImagePipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, dtype=torch.float32, seed=0):
+        generator = torch.manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 50,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_ldm_default_ddim(self):
+        pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/ldm_text2img/ldm_large_256_ddim.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
diff --git a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b9a8ef6557293b7eac176bb452b9f3d9fb1230f
--- /dev/null
+++ b/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
@@ -0,0 +1,138 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel
+from diffusers.utils import PIL_INTERPOLATION
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    nightly,
+    require_torch,
+    torch_device,
+)
+
+
+enable_full_determinism()
+
+
+class LDMSuperResolutionPipelineFastTests(unittest.TestCase):
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=6,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    @property
+    def dummy_vq_model(self):
+        torch.manual_seed(0)
+        model = VQModel(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=3,
+        )
+        return model
+
+    def test_inference_superresolution(self):
+        device = "cpu"
+        unet = self.dummy_uncond_unet
+        scheduler = DDIMScheduler()
+        vqvae = self.dummy_vq_model
+
+        ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)
+        ldm.to(device)
+        ldm.set_progress_bar_config(disable=None)
+
+        init_image = self.dummy_image.to(device)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image = ldm(image=init_image, generator=generator, num_inference_steps=2, output_type="np").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.8678, 0.8245, 0.6381, 0.6830, 0.4385, 0.5599, 0.4641, 0.6201, 0.5150])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_inference_superresolution_fp16(self):
+        unet = self.dummy_uncond_unet
+        scheduler = DDIMScheduler()
+        vqvae = self.dummy_vq_model
+
+        # put models in fp16
+        unet = unet.half()
+        vqvae = vqvae.half()
+
+        ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)
+        ldm.to(torch_device)
+        ldm.set_progress_bar_config(disable=None)
+
+        init_image = self.dummy_image.to(torch_device)
+
+        image = ldm(init_image, num_inference_steps=2, output_type="np").images
+
+        assert image.shape == (1, 64, 64, 3)
+
+
+@nightly
+@require_torch
+class LDMSuperResolutionPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_superresolution(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/vq_diffusion/teddy_bear_pool.png"
+        )
+        init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"])
+
+        ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution")
+        ldm.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = ldm(image=init_image, generator=generator, num_inference_steps=20, output_type="np").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.7560, 0.7425, 0.7257, 0.6907])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/latte/__init__.py b/diffusers/tests/pipelines/latte/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/latte/test_latte.py b/diffusers/tests/pipelines/latte/test_latte.py
new file mode 100644
index 0000000000000000000000000000000000000000..9667ebff249dff47a13406bba517d92f2a773998
--- /dev/null
+++ b/diffusers/tests/pipelines/latte/test_latte.py
@@ -0,0 +1,303 @@
+# coding=utf-8
+# Copyright 2024 Latte Team and HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    LattePipeline,
+    LatteTransformer3DModel,
+)
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, to_np
+
+
+enable_full_determinism()
+
+
+class LattePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = LattePipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    required_optional_params = PipelineTesterMixin.required_optional_params
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = LatteTransformer3DModel(
+            sample_size=8,
+            num_layers=1,
+            patch_size=2,
+            attention_head_dim=8,
+            num_attention_heads=3,
+            caption_channels=32,
+            in_channels=4,
+            cross_attention_dim=24,
+            out_channels=8,
+            attention_bias=True,
+            activation_fn="gelu-approximate",
+            num_embeds_ada_norm=1000,
+            norm_type="ada_norm_single",
+            norm_elementwise_affine=False,
+            norm_eps=1e-6,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL()
+
+        scheduler = DDIMScheduler()
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder.eval(),
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "negative_prompt": "low quality",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "video_length": 1,
+            "output_type": "pt",
+            "clean_caption": False,
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+
+        self.assertEqual(generated_video.shape, (1, 3, 8, 8))
+        expected_video = torch.randn(1, 3, 8, 8)
+        max_diff = np.abs(generated_video - expected_video).max()
+        self.assertLessEqual(max_diff, 1e10)
+
+    def test_callback_inputs(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
+        has_callback_step_end = "callback_on_step_end" in sig.parameters
+
+        if not (has_callback_tensor_inputs and has_callback_step_end):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_subset(pipe, i, t, callback_kwargs):
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        def callback_inputs_all(pipe, i, t, callback_kwargs):
+            for tensor_name in pipe._callback_tensor_inputs:
+                assert tensor_name in callback_kwargs
+
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        # Test passing in a subset
+        inputs["callback_on_step_end"] = callback_inputs_subset
+        inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
+        output = pipe(**inputs)[0]
+
+        # Test passing in a everything
+        inputs["callback_on_step_end"] = callback_inputs_all
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        output = pipe(**inputs)[0]
+
+        def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
+            is_last = i == (pipe.num_timesteps - 1)
+            if is_last:
+                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
+            return callback_kwargs
+
+        inputs["callback_on_step_end"] = callback_inputs_change_tensor
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() < 1e10
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-3)
+
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_save_load_optional_components(self):
+        if not hasattr(self.pipeline_class, "_optional_components"):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = inputs["prompt"]
+        generator = inputs["generator"]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+        ) = pipe.encode_prompt(prompt)
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "video_length": 1,
+            "mask_feature": False,
+            "output_type": "pt",
+            "clean_caption": False,
+        }
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=False)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
+
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, 1.0)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
+
+@slow
+@require_torch_gpu
+class LattePipelineIntegrationTests(unittest.TestCase):
+    prompt = "A painting of a squirrel eating a burger."
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_latte(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+        prompt = self.prompt
+
+        videos = pipe(
+            prompt=prompt,
+            height=512,
+            width=512,
+            generator=generator,
+            num_inference_steps=2,
+            clean_caption=False,
+        ).frames
+
+        video = videos[0]
+        expected_video = torch.randn(1, 512, 512, 3).numpy()
+
+        max_diff = numpy_cosine_similarity_distance(video.flatten(), expected_video)
+        assert max_diff < 1e-3, f"Max diff is too high. got {video.flatten()}"
diff --git a/diffusers/tests/pipelines/ledits_pp/__init__.py b/diffusers/tests/pipelines/ledits_pp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py b/diffusers/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..effea2619749763d405ff56f827c85580864af5b
--- /dev/null
+++ b/diffusers/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py
@@ -0,0 +1,249 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DPMSolverMultistepScheduler,
+    LEditsPPPipelineStableDiffusion,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+
+enable_full_determinism()
+
+
+@skip_mps
+class LEditsPPPipelineStableDiffusionFastTests(unittest.TestCase):
+    pipeline_class = LEditsPPPipelineStableDiffusion
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = DPMSolverMultistepScheduler(algorithm_type="sde-dpmsolver++", solver_order=2)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "generator": generator,
+            "editing_prompt": ["wearing glasses", "sunshine"],
+            "reverse_editing_direction": [False, True],
+            "edit_guidance_scale": [10.0, 5.0],
+        }
+        return inputs
+
+    def get_dummy_inversion_inputs(self, device, seed=0):
+        images = floats_tensor((2, 3, 32, 32), rng=random.Random(0)).cpu().permute(0, 2, 3, 1)
+        images = 255 * images
+        image_1 = Image.fromarray(np.uint8(images[0])).convert("RGB")
+        image_2 = Image.fromarray(np.uint8(images[1])).convert("RGB")
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "image": [image_1, image_2],
+            "source_prompt": "",
+            "source_guidance_scale": 3.5,
+            "num_inversion_steps": 20,
+            "skip": 0.15,
+            "generator": generator,
+        }
+        return inputs
+
+    def test_ledits_pp_inversion(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = LEditsPPPipelineStableDiffusion(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inversion_inputs(device)
+        inputs["image"] = inputs["image"][0]
+        sd_pipe.invert(**inputs)
+        assert sd_pipe.init_latents.shape == (
+            1,
+            4,
+            int(32 / sd_pipe.vae_scale_factor),
+            int(32 / sd_pipe.vae_scale_factor),
+        )
+
+        latent_slice = sd_pipe.init_latents[0, -1, -3:, -3:].to(device)
+        print(latent_slice.flatten())
+        expected_slice = np.array([-0.9084, -0.0367, 0.2940, 0.0839, 0.6890, 0.2651, -0.7104, 2.1090, -0.7822])
+        assert np.abs(latent_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_ledits_pp_inversion_batch(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = LEditsPPPipelineStableDiffusion(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inversion_inputs(device)
+        sd_pipe.invert(**inputs)
+        assert sd_pipe.init_latents.shape == (
+            2,
+            4,
+            int(32 / sd_pipe.vae_scale_factor),
+            int(32 / sd_pipe.vae_scale_factor),
+        )
+
+        latent_slice = sd_pipe.init_latents[0, -1, -3:, -3:].to(device)
+        print(latent_slice.flatten())
+        expected_slice = np.array([0.2528, 0.1458, -0.2166, 0.4565, -0.5657, -1.0286, -0.9961, 0.5933, 1.1173])
+        assert np.abs(latent_slice.flatten() - expected_slice).max() < 1e-3
+
+        latent_slice = sd_pipe.init_latents[1, -1, -3:, -3:].to(device)
+        print(latent_slice.flatten())
+        expected_slice = np.array([-0.0796, 2.0583, 0.5501, 0.5358, 0.0282, -0.2803, -1.0470, 0.7023, -0.0072])
+        assert np.abs(latent_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_ledits_pp_warmup_steps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = LEditsPPPipelineStableDiffusion(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inversion_inputs = self.get_dummy_inversion_inputs(device)
+        pipe.invert(**inversion_inputs)
+
+        inputs = self.get_dummy_inputs(device)
+
+        inputs["edit_warmup_steps"] = [0, 5]
+        pipe(**inputs).images
+
+        inputs["edit_warmup_steps"] = [5, 0]
+        pipe(**inputs).images
+
+        inputs["edit_warmup_steps"] = [5, 10]
+        pipe(**inputs).images
+
+        inputs["edit_warmup_steps"] = [10, 5]
+        pipe(**inputs).images
+
+
+@slow
+@require_torch_gpu
+class LEditsPPPipelineStableDiffusionSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @classmethod
+    def setUpClass(cls):
+        raw_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png"
+        )
+        raw_image = raw_image.convert("RGB").resize((512, 512))
+        cls.raw_image = raw_image
+
+    def test_ledits_pp_editing(self):
+        pipe = LEditsPPPipelineStableDiffusion.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        _ = pipe.invert(image=self.raw_image, generator=generator)
+        generator = torch.manual_seed(0)
+        inputs = {
+            "generator": generator,
+            "editing_prompt": ["cat", "dog"],
+            "reverse_editing_direction": [True, False],
+            "edit_guidance_scale": [5.0, 5.0],
+            "edit_threshold": [0.8, 0.8],
+        }
+        reconstruction = pipe(**inputs, output_type="np").images[0]
+
+        output_slice = reconstruction[150:153, 140:143, -1]
+        output_slice = output_slice.flatten()
+        expected_slice = np.array(
+            [0.9453125, 0.93310547, 0.84521484, 0.94628906, 0.9111328, 0.80859375, 0.93847656, 0.9042969, 0.8144531]
+        )
+        assert np.abs(output_slice - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py b/diffusers/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcfd0aa51b9f8475c0fbf3046d89eb3170b58295
--- /dev/null
+++ b/diffusers/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py
@@ -0,0 +1,289 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DPMSolverMultistepScheduler,
+    LEditsPPPipelineStableDiffusionXL,
+    UNet2DConditionModel,
+)
+
+# from diffusers.image_processor import VaeImageProcessor
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+
+enable_full_determinism()
+
+
+@skip_mps
+class LEditsPPPipelineStableDiffusionXLFastTests(unittest.TestCase):
+    pipeline_class = LEditsPPPipelineStableDiffusionXL
+
+    def get_dummy_components(self, skip_first_text_encoder=False, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            time_cond_proj_dim=time_cond_proj_dim,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64 if not skip_first_text_encoder else 32,
+        )
+        scheduler = DPMSolverMultistepScheduler(algorithm_type="sde-dpmsolver++", solver_order=2)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=32,
+            image_size=224,
+            projection_dim=32,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+
+        feature_extractor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder if not skip_first_text_encoder else None,
+            "tokenizer": tokenizer if not skip_first_text_encoder else None,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "image_encoder": image_encoder,
+            "feature_extractor": feature_extractor,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "generator": generator,
+            "editing_prompt": ["wearing glasses", "sunshine"],
+            "reverse_editing_direction": [False, True],
+            "edit_guidance_scale": [10.0, 5.0],
+        }
+        return inputs
+
+    def get_dummy_inversion_inputs(self, device, seed=0):
+        images = floats_tensor((2, 3, 32, 32), rng=random.Random(0)).cpu().permute(0, 2, 3, 1)
+        images = 255 * images
+        image_1 = Image.fromarray(np.uint8(images[0])).convert("RGB")
+        image_2 = Image.fromarray(np.uint8(images[1])).convert("RGB")
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "image": [image_1, image_2],
+            "source_prompt": "",
+            "source_guidance_scale": 3.5,
+            "num_inversion_steps": 20,
+            "skip": 0.15,
+            "generator": generator,
+        }
+        return inputs
+
+    def test_ledits_pp_inversion(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = LEditsPPPipelineStableDiffusionXL(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inversion_inputs(device)
+        inputs["image"] = inputs["image"][0]
+        sd_pipe.invert(**inputs)
+        assert sd_pipe.init_latents.shape == (
+            1,
+            4,
+            int(32 / sd_pipe.vae_scale_factor),
+            int(32 / sd_pipe.vae_scale_factor),
+        )
+
+        latent_slice = sd_pipe.init_latents[0, -1, -3:, -3:].to(device)
+        expected_slice = np.array([-0.9084, -0.0367, 0.2940, 0.0839, 0.6890, 0.2651, -0.7103, 2.1090, -0.7821])
+        assert np.abs(latent_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_ledits_pp_inversion_batch(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = LEditsPPPipelineStableDiffusionXL(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inversion_inputs(device)
+        sd_pipe.invert(**inputs)
+        assert sd_pipe.init_latents.shape == (
+            2,
+            4,
+            int(32 / sd_pipe.vae_scale_factor),
+            int(32 / sd_pipe.vae_scale_factor),
+        )
+
+        latent_slice = sd_pipe.init_latents[0, -1, -3:, -3:].to(device)
+        print(latent_slice.flatten())
+        expected_slice = np.array([0.2528, 0.1458, -0.2166, 0.4565, -0.5656, -1.0286, -0.9961, 0.5933, 1.1172])
+        assert np.abs(latent_slice.flatten() - expected_slice).max() < 1e-3
+
+        latent_slice = sd_pipe.init_latents[1, -1, -3:, -3:].to(device)
+        print(latent_slice.flatten())
+        expected_slice = np.array([-0.0796, 2.0583, 0.5500, 0.5358, 0.0282, -0.2803, -1.0470, 0.7024, -0.0072])
+        print(latent_slice.flatten())
+        assert np.abs(latent_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_ledits_pp_warmup_steps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = LEditsPPPipelineStableDiffusionXL(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inversion_inputs = self.get_dummy_inversion_inputs(device)
+        inversion_inputs["image"] = inversion_inputs["image"][0]
+        pipe.invert(**inversion_inputs)
+
+        inputs = self.get_dummy_inputs(device)
+
+        inputs["edit_warmup_steps"] = [0, 5]
+        pipe(**inputs).images
+
+        inputs["edit_warmup_steps"] = [5, 0]
+        pipe(**inputs).images
+
+        inputs["edit_warmup_steps"] = [5, 10]
+        pipe(**inputs).images
+
+        inputs["edit_warmup_steps"] = [10, 5]
+        pipe(**inputs).images
+
+
+@slow
+@require_torch_gpu
+class LEditsPPPipelineStableDiffusionXLSlowTests(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        raw_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png"
+        )
+        raw_image = raw_image.convert("RGB").resize((512, 512))
+        cls.raw_image = raw_image
+
+    def test_ledits_pp_edit(self):
+        pipe = LEditsPPPipelineStableDiffusionXL.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", safety_checker=None, add_watermarker=None
+        )
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        _ = pipe.invert(image=self.raw_image, generator=generator, num_zero_noise_steps=0)
+        inputs = {
+            "generator": generator,
+            "editing_prompt": ["cat", "dog"],
+            "reverse_editing_direction": [True, False],
+            "edit_guidance_scale": [2.0, 4.0],
+            "edit_threshold": [0.8, 0.8],
+        }
+        reconstruction = pipe(**inputs, output_type="np").images[0]
+
+        output_slice = reconstruction[150:153, 140:143, -1]
+        output_slice = output_slice.flatten()
+        expected_slice = np.array(
+            [0.56419, 0.44121838, 0.2765603, 0.5708484, 0.42763475, 0.30945742, 0.5387106, 0.4735807, 0.3547244]
+        )
+        assert np.abs(output_slice - expected_slice).max() < 1e-3
diff --git a/diffusers/tests/pipelines/lumina/__init__.py b/diffusers/tests/pipelines/lumina/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/lumina/test_lumina_nextdit.py b/diffusers/tests/pipelines/lumina/test_lumina_nextdit.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fd0dbf060502bd9c8cf2b54634a8a3ecefde320
--- /dev/null
+++ b/diffusers/tests/pipelines/lumina/test_lumina_nextdit.py
@@ -0,0 +1,183 @@
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, GemmaConfig, GemmaForCausalLM
+
+from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, LuminaNextDiT2DModel, LuminaText2ImgPipeline
+from diffusers.utils.testing_utils import (
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+class LuminaText2ImgPipelinePipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = LuminaText2ImgPipeline
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "guidance_scale",
+            "negative_prompt",
+            "prompt_embeds",
+            "negative_prompt_embeds",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt"])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = LuminaNextDiT2DModel(
+            sample_size=4,
+            patch_size=2,
+            in_channels=4,
+            hidden_size=4,
+            num_layers=2,
+            num_attention_heads=1,
+            num_kv_heads=1,
+            multiple_of=16,
+            ffn_dim_multiplier=None,
+            norm_eps=1e-5,
+            learn_sigma=True,
+            qk_norm=True,
+            cross_attention_dim=8,
+            scaling_factor=1.0,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL()
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/dummy-gemma")
+
+        torch.manual_seed(0)
+        config = GemmaConfig(
+            head_dim=2,
+            hidden_size=8,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            num_key_value_heads=4,
+        )
+        text_encoder = GemmaForCausalLM(config)
+
+        components = {
+            "transformer": transformer.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder.eval(),
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_lumina_prompt_embeds(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        output_with_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        do_classifier_free_guidance = inputs["guidance_scale"] > 1
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = pipe.encode_prompt(
+            prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            device=torch_device,
+        )
+        output_with_embeds = pipe(
+            prompt_embeds=prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            **inputs,
+        ).images[0]
+
+        max_diff = np.abs(output_with_prompt - output_with_embeds).max()
+        assert max_diff < 1e-4
+
+    @unittest.skip("xformers attention processor does not exist for Lumina")
+    def test_xformers_attention_forwardGenerator_pass(self):
+        pass
+
+
+@slow
+@require_torch_gpu
+class LuminaText2ImgPipelineSlowTests(unittest.TestCase):
+    pipeline_class = LuminaText2ImgPipeline
+    repo_id = "Alpha-VLLM/Lumina-Next-SFT-diffusers"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        return {
+            "prompt": "A photo of a cat",
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "generator": generator,
+        }
+
+    def test_lumina_inference(self):
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.bfloat16)
+        pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+
+        image = pipe(**inputs).images[0]
+        image_slice = image[0, :10, :10]
+        expected_slice = np.array(
+            [
+                [0.17773438, 0.18554688, 0.22070312],
+                [0.046875, 0.06640625, 0.10351562],
+                [0.0, 0.0, 0.02148438],
+                [0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0],
+            ],
+            dtype=np.float32,
+        )
+
+        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
+
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/marigold/__init__.py b/diffusers/tests/pipelines/marigold/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/marigold/test_marigold_depth.py b/diffusers/tests/pipelines/marigold/test_marigold_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcb9adca7a7bb3b89619f274d0db8a7d39712300
--- /dev/null
+++ b/diffusers/tests/pipelines/marigold/test_marigold_depth.py
@@ -0,0 +1,462 @@
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# More information and citation instructions are available on the
+# Marigold project website: https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    AutoencoderTiny,
+    LCMScheduler,
+    MarigoldDepthPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    is_flaky,
+    load_image,
+    require_torch_gpu,
+    slow,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class MarigoldDepthPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = MarigoldDepthPipeline
+    params = frozenset(["image"])
+    batch_params = frozenset(["image"])
+    image_params = frozenset(["image"])
+    image_latents_params = frozenset(["latents"])
+    callback_cfg_params = frozenset([])
+    test_xformers_attention = False
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "output_type",
+        ]
+    )
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            time_cond_proj_dim=time_cond_proj_dim,
+            sample_size=32,
+            in_channels=8,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = LCMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            prediction_type="v_prediction",
+            set_alpha_to_one=False,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            thresholding=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "prediction_type": "depth",
+            "scale_invariant": True,
+            "shift_invariant": True,
+        }
+        return components
+
+    def get_dummy_tiny_autoencoder(self):
+        return AutoencoderTiny(in_channels=3, out_channels=3, latent_channels=4)
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image / 2 + 0.5
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": image,
+            "num_inference_steps": 1,
+            "processing_resolution": 0,
+            "generator": generator,
+            "output_type": "np",
+        }
+        return inputs
+
+    def _test_marigold_depth(
+        self,
+        generator_seed: int = 0,
+        expected_slice: np.ndarray = None,
+        atol: float = 1e-4,
+        **pipe_kwargs,
+    ):
+        device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe_inputs = self.get_dummy_inputs(device, seed=generator_seed)
+        pipe_inputs.update(**pipe_kwargs)
+
+        prediction = pipe(**pipe_inputs).prediction
+
+        prediction_slice = prediction[0, -3:, -3:, -1].flatten()
+
+        if pipe_inputs.get("match_input_resolution", True):
+            self.assertEqual(prediction.shape, (1, 32, 32, 1), "Unexpected output resolution")
+        else:
+            self.assertTrue(prediction.shape[0] == 1 and prediction.shape[3] == 1, "Unexpected output dimensions")
+            self.assertEqual(
+                max(prediction.shape[1:3]),
+                pipe_inputs.get("processing_resolution", 768),
+                "Unexpected output resolution",
+            )
+
+        self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
+
+    def test_marigold_depth_dummy_defaults(self):
+        self._test_marigold_depth(
+            expected_slice=np.array([0.4529, 0.5184, 0.4985, 0.4355, 0.4273, 0.4153, 0.5229, 0.4818, 0.4627]),
+        )
+
+    def test_marigold_depth_dummy_G0_S1_P32_E1_B1_M1(self):
+        self._test_marigold_depth(
+            generator_seed=0,
+            expected_slice=np.array([0.4529, 0.5184, 0.4985, 0.4355, 0.4273, 0.4153, 0.5229, 0.4818, 0.4627]),
+            num_inference_steps=1,
+            processing_resolution=32,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M1(self):
+        self._test_marigold_depth(
+            generator_seed=0,
+            expected_slice=np.array([0.4511, 0.4531, 0.4542, 0.5024, 0.4987, 0.4969, 0.5281, 0.5215, 0.5182]),
+            num_inference_steps=1,
+            processing_resolution=16,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G2024_S1_P32_E1_B1_M1(self):
+        self._test_marigold_depth(
+            generator_seed=2024,
+            expected_slice=np.array([0.4671, 0.4739, 0.5130, 0.4308, 0.4411, 0.4720, 0.5064, 0.4796, 0.4795]),
+            num_inference_steps=1,
+            processing_resolution=32,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G0_S2_P32_E1_B1_M1(self):
+        self._test_marigold_depth(
+            generator_seed=0,
+            expected_slice=np.array([0.4165, 0.4485, 0.4647, 0.4003, 0.4577, 0.5074, 0.5106, 0.5077, 0.5042]),
+            num_inference_steps=2,
+            processing_resolution=32,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G0_S1_P64_E1_B1_M1(self):
+        self._test_marigold_depth(
+            generator_seed=0,
+            expected_slice=np.array([0.4817, 0.5425, 0.5146, 0.5367, 0.5034, 0.4743, 0.4395, 0.4734, 0.4399]),
+            num_inference_steps=1,
+            processing_resolution=64,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    @is_flaky
+    def test_marigold_depth_dummy_G0_S1_P32_E3_B1_M1(self):
+        self._test_marigold_depth(
+            generator_seed=0,
+            expected_slice=np.array([0.3260, 0.3591, 0.2837, 0.2971, 0.2750, 0.2426, 0.4200, 0.3588, 0.3254]),
+            num_inference_steps=1,
+            processing_resolution=32,
+            ensemble_size=3,
+            ensembling_kwargs={"reduction": "mean"},
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    @is_flaky
+    def test_marigold_depth_dummy_G0_S1_P32_E4_B2_M1(self):
+        self._test_marigold_depth(
+            generator_seed=0,
+            expected_slice=np.array([0.3180, 0.4194, 0.3013, 0.2902, 0.3245, 0.2897, 0.4718, 0.4174, 0.3705]),
+            num_inference_steps=1,
+            processing_resolution=32,
+            ensemble_size=4,
+            ensembling_kwargs={"reduction": "mean"},
+            batch_size=2,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M0(self):
+        self._test_marigold_depth(
+            generator_seed=0,
+            expected_slice=np.array([0.5515, 0.4588, 0.4197, 0.4741, 0.4229, 0.4328, 0.5333, 0.5314, 0.5182]),
+            num_inference_steps=1,
+            processing_resolution=16,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=False,
+        )
+
+    def test_marigold_depth_dummy_no_num_inference_steps(self):
+        with self.assertRaises(ValueError) as e:
+            self._test_marigold_depth(
+                num_inference_steps=None,
+                expected_slice=np.array([0.0]),
+            )
+            self.assertIn("num_inference_steps", str(e))
+
+    def test_marigold_depth_dummy_no_processing_resolution(self):
+        with self.assertRaises(ValueError) as e:
+            self._test_marigold_depth(
+                processing_resolution=None,
+                expected_slice=np.array([0.0]),
+            )
+            self.assertIn("processing_resolution", str(e))
+
+
+@slow
+@require_torch_gpu
+class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def _test_marigold_depth(
+        self,
+        is_fp16: bool = True,
+        device: str = "cuda",
+        generator_seed: int = 0,
+        expected_slice: np.ndarray = None,
+        model_id: str = "prs-eth/marigold-lcm-v1-0",
+        image_url: str = "https://marigoldmonodepth.github.io/images/einstein.jpg",
+        atol: float = 1e-4,
+        **pipe_kwargs,
+    ):
+        from_pretrained_kwargs = {}
+        if is_fp16:
+            from_pretrained_kwargs["variant"] = "fp16"
+            from_pretrained_kwargs["torch_dtype"] = torch.float16
+
+        pipe = MarigoldDepthPipeline.from_pretrained(model_id, **from_pretrained_kwargs)
+        if device == "cuda":
+            pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=device).manual_seed(generator_seed)
+
+        image = load_image(image_url)
+        width, height = image.size
+
+        prediction = pipe(image, generator=generator, **pipe_kwargs).prediction
+
+        prediction_slice = prediction[0, -3:, -3:, -1].flatten()
+
+        if pipe_kwargs.get("match_input_resolution", True):
+            self.assertEqual(prediction.shape, (1, height, width, 1), "Unexpected output resolution")
+        else:
+            self.assertTrue(prediction.shape[0] == 1 and prediction.shape[3] == 1, "Unexpected output dimensions")
+            self.assertEqual(
+                max(prediction.shape[1:3]),
+                pipe_kwargs.get("processing_resolution", 768),
+                "Unexpected output resolution",
+            )
+
+        self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
+
+    def test_marigold_depth_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
+        self._test_marigold_depth(
+            is_fp16=False,
+            device="cpu",
+            generator_seed=0,
+            expected_slice=np.array([0.4323, 0.4323, 0.4323, 0.4323, 0.4323, 0.4323, 0.4323, 0.4323, 0.4323]),
+            num_inference_steps=1,
+            processing_resolution=32,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
+        self._test_marigold_depth(
+            is_fp16=False,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.1244, 0.1265, 0.1292, 0.1240, 0.1252, 0.1266, 0.1246, 0.1226, 0.1180]),
+            num_inference_steps=1,
+            processing_resolution=768,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
+        self._test_marigold_depth(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.1241, 0.1262, 0.1290, 0.1238, 0.1250, 0.1265, 0.1244, 0.1225, 0.1179]),
+            num_inference_steps=1,
+            processing_resolution=768,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
+        self._test_marigold_depth(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=2024,
+            expected_slice=np.array([0.1710, 0.1725, 0.1738, 0.1700, 0.1700, 0.1696, 0.1698, 0.1663, 0.1592]),
+            num_inference_steps=1,
+            processing_resolution=768,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
+        self._test_marigold_depth(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]),
+            num_inference_steps=2,
+            processing_resolution=768,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
+        self._test_marigold_depth(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.2683, 0.2693, 0.2698, 0.2666, 0.2632, 0.2615, 0.2656, 0.2603, 0.2573]),
+            num_inference_steps=1,
+            processing_resolution=512,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
+        self._test_marigold_depth(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.1200, 0.1215, 0.1237, 0.1193, 0.1197, 0.1202, 0.1196, 0.1166, 0.1109]),
+            num_inference_steps=1,
+            processing_resolution=768,
+            ensemble_size=3,
+            ensembling_kwargs={"reduction": "mean"},
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
+        self._test_marigold_depth(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.1121, 0.1135, 0.1155, 0.1111, 0.1115, 0.1118, 0.1111, 0.1079, 0.1019]),
+            num_inference_steps=1,
+            processing_resolution=768,
+            ensemble_size=4,
+            ensembling_kwargs={"reduction": "mean"},
+            batch_size=2,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
+        self._test_marigold_depth(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.2671, 0.2690, 0.2720, 0.2659, 0.2676, 0.2739, 0.2664, 0.2686, 0.2573]),
+            num_inference_steps=1,
+            processing_resolution=512,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=False,
+        )
diff --git a/diffusers/tests/pipelines/marigold/test_marigold_normals.py b/diffusers/tests/pipelines/marigold/test_marigold_normals.py
new file mode 100644
index 0000000000000000000000000000000000000000..c86c600be8e55863a546c6696b7c50bba18346b3
--- /dev/null
+++ b/diffusers/tests/pipelines/marigold/test_marigold_normals.py
@@ -0,0 +1,459 @@
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# More information and citation instructions are available on the
+# Marigold project website: https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    AutoencoderTiny,
+    LCMScheduler,
+    MarigoldNormalsPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    require_torch_gpu,
+    slow,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class MarigoldNormalsPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = MarigoldNormalsPipeline
+    params = frozenset(["image"])
+    batch_params = frozenset(["image"])
+    image_params = frozenset(["image"])
+    image_latents_params = frozenset(["latents"])
+    callback_cfg_params = frozenset([])
+    test_xformers_attention = False
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "output_type",
+        ]
+    )
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            time_cond_proj_dim=time_cond_proj_dim,
+            sample_size=32,
+            in_channels=8,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        torch.manual_seed(0)
+        scheduler = LCMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            prediction_type="v_prediction",
+            set_alpha_to_one=False,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            thresholding=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "prediction_type": "normals",
+            "use_full_z_range": True,
+        }
+        return components
+
+    def get_dummy_tiny_autoencoder(self):
+        return AutoencoderTiny(in_channels=3, out_channels=3, latent_channels=4)
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image / 2 + 0.5
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": image,
+            "num_inference_steps": 1,
+            "processing_resolution": 0,
+            "generator": generator,
+            "output_type": "np",
+        }
+        return inputs
+
+    def _test_marigold_normals(
+        self,
+        generator_seed: int = 0,
+        expected_slice: np.ndarray = None,
+        atol: float = 1e-4,
+        **pipe_kwargs,
+    ):
+        device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe_inputs = self.get_dummy_inputs(device, seed=generator_seed)
+        pipe_inputs.update(**pipe_kwargs)
+
+        prediction = pipe(**pipe_inputs).prediction
+
+        prediction_slice = prediction[0, -3:, -3:, -1].flatten()
+
+        if pipe_inputs.get("match_input_resolution", True):
+            self.assertEqual(prediction.shape, (1, 32, 32, 3), "Unexpected output resolution")
+        else:
+            self.assertTrue(prediction.shape[0] == 1 and prediction.shape[3] == 3, "Unexpected output dimensions")
+            self.assertEqual(
+                max(prediction.shape[1:3]),
+                pipe_inputs.get("processing_resolution", 768),
+                "Unexpected output resolution",
+            )
+
+        self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
+
+    def test_marigold_depth_dummy_defaults(self):
+        self._test_marigold_normals(
+            expected_slice=np.array([0.0967, 0.5234, 0.1448, -0.3155, -0.2550, -0.5578, 0.6854, 0.5657, -0.1263]),
+        )
+
+    def test_marigold_depth_dummy_G0_S1_P32_E1_B1_M1(self):
+        self._test_marigold_normals(
+            generator_seed=0,
+            expected_slice=np.array([0.0967, 0.5234, 0.1448, -0.3155, -0.2550, -0.5578, 0.6854, 0.5657, -0.1263]),
+            num_inference_steps=1,
+            processing_resolution=32,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M1(self):
+        self._test_marigold_normals(
+            generator_seed=0,
+            expected_slice=np.array([-0.4128, -0.5918, -0.6540, 0.2446, -0.2687, -0.4607, 0.2935, -0.0483, -0.2086]),
+            num_inference_steps=1,
+            processing_resolution=16,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G2024_S1_P32_E1_B1_M1(self):
+        self._test_marigold_normals(
+            generator_seed=2024,
+            expected_slice=np.array([0.5731, -0.7631, -0.0199, 0.1609, -0.4628, -0.7044, 0.5761, -0.3471, -0.4498]),
+            num_inference_steps=1,
+            processing_resolution=32,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G0_S2_P32_E1_B1_M1(self):
+        self._test_marigold_normals(
+            generator_seed=0,
+            expected_slice=np.array([0.1017, -0.6823, -0.2533, 0.1988, 0.3389, 0.8478, 0.7757, 0.5220, 0.8668]),
+            num_inference_steps=2,
+            processing_resolution=32,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G0_S1_P64_E1_B1_M1(self):
+        self._test_marigold_normals(
+            generator_seed=0,
+            expected_slice=np.array([-0.2391, 0.7969, 0.6224, 0.0698, 0.5669, -0.2167, -0.1362, -0.8945, -0.5501]),
+            num_inference_steps=1,
+            processing_resolution=64,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G0_S1_P32_E3_B1_M1(self):
+        self._test_marigold_normals(
+            generator_seed=0,
+            expected_slice=np.array([0.3826, -0.9634, -0.3835, 0.3514, 0.0691, -0.6182, 0.8709, 0.1590, -0.2181]),
+            num_inference_steps=1,
+            processing_resolution=32,
+            ensemble_size=3,
+            ensembling_kwargs={"reduction": "mean"},
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G0_S1_P32_E4_B2_M1(self):
+        self._test_marigold_normals(
+            generator_seed=0,
+            expected_slice=np.array([0.2500, -0.3928, -0.2415, 0.1133, 0.2357, -0.4223, 0.9967, 0.4859, -0.1282]),
+            num_inference_steps=1,
+            processing_resolution=32,
+            ensemble_size=4,
+            ensembling_kwargs={"reduction": "mean"},
+            batch_size=2,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M0(self):
+        self._test_marigold_normals(
+            generator_seed=0,
+            expected_slice=np.array([0.9588, 0.3326, -0.0825, -0.0994, -0.3534, -0.4302, 0.3562, 0.4421, -0.2086]),
+            num_inference_steps=1,
+            processing_resolution=16,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=False,
+        )
+
+    def test_marigold_depth_dummy_no_num_inference_steps(self):
+        with self.assertRaises(ValueError) as e:
+            self._test_marigold_normals(
+                num_inference_steps=None,
+                expected_slice=np.array([0.0]),
+            )
+            self.assertIn("num_inference_steps", str(e))
+
+    def test_marigold_depth_dummy_no_processing_resolution(self):
+        with self.assertRaises(ValueError) as e:
+            self._test_marigold_normals(
+                processing_resolution=None,
+                expected_slice=np.array([0.0]),
+            )
+            self.assertIn("processing_resolution", str(e))
+
+
+@slow
+@require_torch_gpu
+class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def _test_marigold_normals(
+        self,
+        is_fp16: bool = True,
+        device: str = "cuda",
+        generator_seed: int = 0,
+        expected_slice: np.ndarray = None,
+        model_id: str = "prs-eth/marigold-normals-lcm-v0-1",
+        image_url: str = "https://marigoldmonodepth.github.io/images/einstein.jpg",
+        atol: float = 1e-4,
+        **pipe_kwargs,
+    ):
+        from_pretrained_kwargs = {}
+        if is_fp16:
+            from_pretrained_kwargs["variant"] = "fp16"
+            from_pretrained_kwargs["torch_dtype"] = torch.float16
+
+        pipe = MarigoldNormalsPipeline.from_pretrained(model_id, **from_pretrained_kwargs)
+        if device == "cuda":
+            pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=device).manual_seed(generator_seed)
+
+        image = load_image(image_url)
+        width, height = image.size
+
+        prediction = pipe(image, generator=generator, **pipe_kwargs).prediction
+
+        prediction_slice = prediction[0, -3:, -3:, -1].flatten()
+
+        if pipe_kwargs.get("match_input_resolution", True):
+            self.assertEqual(prediction.shape, (1, height, width, 3), "Unexpected output resolution")
+        else:
+            self.assertTrue(prediction.shape[0] == 1 and prediction.shape[3] == 3, "Unexpected output dimensions")
+            self.assertEqual(
+                max(prediction.shape[1:3]),
+                pipe_kwargs.get("processing_resolution", 768),
+                "Unexpected output resolution",
+            )
+
+        self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
+
+    def test_marigold_normals_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
+        self._test_marigold_normals(
+            is_fp16=False,
+            device="cpu",
+            generator_seed=0,
+            expected_slice=np.array([0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971]),
+            num_inference_steps=1,
+            processing_resolution=32,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_normals_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
+        self._test_marigold_normals(
+            is_fp16=False,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.7980, 0.7952, 0.7914, 0.7931, 0.7871, 0.7816, 0.7844, 0.7710, 0.7601]),
+            num_inference_steps=1,
+            processing_resolution=768,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
+        self._test_marigold_normals(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.7979, 0.7949, 0.7915, 0.7930, 0.7871, 0.7817, 0.7842, 0.7710, 0.7603]),
+            num_inference_steps=1,
+            processing_resolution=768,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_normals_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
+        self._test_marigold_normals(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=2024,
+            expected_slice=np.array([0.8428, 0.8428, 0.8433, 0.8369, 0.8325, 0.8315, 0.8271, 0.8135, 0.8057]),
+            num_inference_steps=1,
+            processing_resolution=768,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_normals_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
+        self._test_marigold_normals(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.7095, 0.7095, 0.7104, 0.7070, 0.7051, 0.7061, 0.7017, 0.6938, 0.6914]),
+            num_inference_steps=2,
+            processing_resolution=768,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
+        self._test_marigold_normals(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.7168, 0.7163, 0.7163, 0.7080, 0.7061, 0.7046, 0.7031, 0.7007, 0.6987]),
+            num_inference_steps=1,
+            processing_resolution=512,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
+        self._test_marigold_normals(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.7114, 0.7124, 0.7144, 0.7085, 0.7070, 0.7080, 0.7051, 0.6958, 0.6924]),
+            num_inference_steps=1,
+            processing_resolution=768,
+            ensemble_size=3,
+            ensembling_kwargs={"reduction": "mean"},
+            batch_size=1,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
+        self._test_marigold_normals(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.7412, 0.7441, 0.7490, 0.7383, 0.7388, 0.7437, 0.7329, 0.7271, 0.7300]),
+            num_inference_steps=1,
+            processing_resolution=768,
+            ensemble_size=4,
+            ensembling_kwargs={"reduction": "mean"},
+            batch_size=2,
+            match_input_resolution=True,
+        )
+
+    def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
+        self._test_marigold_normals(
+            is_fp16=True,
+            device="cuda",
+            generator_seed=0,
+            expected_slice=np.array([0.7188, 0.7144, 0.7134, 0.7178, 0.7207, 0.7222, 0.7231, 0.7041, 0.6987]),
+            num_inference_steps=1,
+            processing_resolution=512,
+            ensemble_size=1,
+            batch_size=1,
+            match_input_resolution=False,
+        )
diff --git a/diffusers/tests/pipelines/musicldm/__init__.py b/diffusers/tests/pipelines/musicldm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/musicldm/test_musicldm.py b/diffusers/tests/pipelines/musicldm/test_musicldm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e51f5103933a9880b4b0d096b6107f257ea96379
--- /dev/null
+++ b/diffusers/tests/pipelines/musicldm/test_musicldm.py
@@ -0,0 +1,470 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    ClapAudioConfig,
+    ClapConfig,
+    ClapFeatureExtractor,
+    ClapModel,
+    ClapTextConfig,
+    RobertaTokenizer,
+    SpeechT5HifiGan,
+    SpeechT5HifiGanConfig,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    LMSDiscreteScheduler,
+    MusicLDMPipeline,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.utils import is_xformers_available
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+
+from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class MusicLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = MusicLDMPipeline
+    params = TEXT_TO_AUDIO_PARAMS
+    batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "num_waveforms_per_prompt",
+            "generator",
+            "latents",
+            "output_type",
+            "return_dict",
+            "callback",
+            "callback_steps",
+        ]
+    )
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=(32, 64),
+            class_embed_type="simple_projection",
+            projection_class_embeddings_input_dim=32,
+            class_embeddings_concat=True,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=1,
+            out_channels=1,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_branch_config = ClapTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=16,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        audio_branch_config = ClapAudioConfig(
+            spec_size=64,
+            window_size=4,
+            num_mel_bins=64,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            depths=[2, 2],
+            num_attention_heads=[2, 2],
+            num_hidden_layers=2,
+            hidden_size=192,
+            patch_size=2,
+            patch_stride=2,
+            patch_embed_input_channels=4,
+        )
+        text_encoder_config = ClapConfig.from_text_audio_configs(
+            text_config=text_branch_config, audio_config=audio_branch_config, projection_dim=32
+        )
+        text_encoder = ClapModel(text_encoder_config)
+        tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
+        feature_extractor = ClapFeatureExtractor.from_pretrained(
+            "hf-internal-testing/tiny-random-ClapModel", hop_length=7900
+        )
+
+        torch.manual_seed(0)
+        vocoder_config = SpeechT5HifiGanConfig(
+            model_in_dim=8,
+            sampling_rate=16000,
+            upsample_initial_channel=16,
+            upsample_rates=[2, 2],
+            upsample_kernel_sizes=[4, 4],
+            resblock_kernel_sizes=[3, 7],
+            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
+            normalize_before=False,
+        )
+
+        vocoder = SpeechT5HifiGan(vocoder_config)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "feature_extractor": feature_extractor,
+            "vocoder": vocoder,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A hammer hitting a wooden surface",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+        }
+        return inputs
+
+    def test_musicldm_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        musicldm_pipe = MusicLDMPipeline(**components)
+        musicldm_pipe = musicldm_pipe.to(torch_device)
+        musicldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = musicldm_pipe(**inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 256
+
+        audio_slice = audio[:10]
+        expected_slice = np.array(
+            [-0.0027, -0.0036, -0.0037, -0.0020, -0.0035, -0.0019, -0.0037, -0.0020, -0.0038, -0.0019]
+        )
+
+        assert np.abs(audio_slice - expected_slice).max() < 1e-4
+
+    def test_musicldm_prompt_embeds(self):
+        components = self.get_dummy_components()
+        musicldm_pipe = MusicLDMPipeline(**components)
+        musicldm_pipe = musicldm_pipe.to(torch_device)
+        musicldm_pipe = musicldm_pipe.to(torch_device)
+        musicldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = musicldm_pipe(**inputs)
+        audio_1 = output.audios[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        text_inputs = musicldm_pipe.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=musicldm_pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_inputs = text_inputs["input_ids"].to(torch_device)
+
+        prompt_embeds = musicldm_pipe.text_encoder.get_text_features(text_inputs)
+
+        inputs["prompt_embeds"] = prompt_embeds
+
+        # forward
+        output = musicldm_pipe(**inputs)
+        audio_2 = output.audios[0]
+
+        assert np.abs(audio_1 - audio_2).max() < 1e-2
+
+    def test_musicldm_negative_prompt_embeds(self):
+        components = self.get_dummy_components()
+        musicldm_pipe = MusicLDMPipeline(**components)
+        musicldm_pipe = musicldm_pipe.to(torch_device)
+        musicldm_pipe = musicldm_pipe.to(torch_device)
+        musicldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = musicldm_pipe(**inputs)
+        audio_1 = output.audios[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        embeds = []
+        for p in [prompt, negative_prompt]:
+            text_inputs = musicldm_pipe.tokenizer(
+                p,
+                padding="max_length",
+                max_length=musicldm_pipe.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_inputs = text_inputs["input_ids"].to(torch_device)
+
+            text_embeds = musicldm_pipe.text_encoder.get_text_features(
+                text_inputs,
+            )
+            embeds.append(text_embeds)
+
+        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
+
+        # forward
+        output = musicldm_pipe(**inputs)
+        audio_2 = output.audios[0]
+
+        assert np.abs(audio_1 - audio_2).max() < 1e-2
+
+    def test_musicldm_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
+        musicldm_pipe = MusicLDMPipeline(**components)
+        musicldm_pipe = musicldm_pipe.to(device)
+        musicldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "egg cracking"
+        output = musicldm_pipe(**inputs, negative_prompt=negative_prompt)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 256
+
+        audio_slice = audio[:10]
+        expected_slice = np.array(
+            [-0.0027, -0.0036, -0.0037, -0.0019, -0.0035, -0.0018, -0.0037, -0.0021, -0.0038, -0.0018]
+        )
+
+        assert np.abs(audio_slice - expected_slice).max() < 1e-4
+
+    def test_musicldm_num_waveforms_per_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
+        musicldm_pipe = MusicLDMPipeline(**components)
+        musicldm_pipe = musicldm_pipe.to(device)
+        musicldm_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A hammer hitting a wooden surface"
+
+        # test num_waveforms_per_prompt=1 (default)
+        audios = musicldm_pipe(prompt, num_inference_steps=2).audios
+
+        assert audios.shape == (1, 256)
+
+        # test num_waveforms_per_prompt=1 (default) for batch of prompts
+        batch_size = 2
+        audios = musicldm_pipe([prompt] * batch_size, num_inference_steps=2).audios
+
+        assert audios.shape == (batch_size, 256)
+
+        # test num_waveforms_per_prompt for single prompt
+        num_waveforms_per_prompt = 2
+        audios = musicldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios
+
+        assert audios.shape == (num_waveforms_per_prompt, 256)
+
+        # test num_waveforms_per_prompt for batch of prompts
+        batch_size = 2
+        audios = musicldm_pipe(
+            [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
+        ).audios
+
+        assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)
+
+    def test_musicldm_audio_length_in_s(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        musicldm_pipe = MusicLDMPipeline(**components)
+        musicldm_pipe = musicldm_pipe.to(torch_device)
+        musicldm_pipe.set_progress_bar_config(disable=None)
+        vocoder_sampling_rate = musicldm_pipe.vocoder.config.sampling_rate
+
+        inputs = self.get_dummy_inputs(device)
+        output = musicldm_pipe(audio_length_in_s=0.016, **inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) / vocoder_sampling_rate == 0.016
+
+        output = musicldm_pipe(audio_length_in_s=0.032, **inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) / vocoder_sampling_rate == 0.032
+
+    def test_musicldm_vocoder_model_in_dim(self):
+        components = self.get_dummy_components()
+        musicldm_pipe = MusicLDMPipeline(**components)
+        musicldm_pipe = musicldm_pipe.to(torch_device)
+        musicldm_pipe.set_progress_bar_config(disable=None)
+
+        prompt = ["hey"]
+
+        output = musicldm_pipe(prompt, num_inference_steps=1)
+        audio_shape = output.audios.shape
+        assert audio_shape == (1, 256)
+
+        config = musicldm_pipe.vocoder.config
+        config.model_in_dim *= 2
+        musicldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device)
+        output = musicldm_pipe(prompt, num_inference_steps=1)
+        audio_shape = output.audios.shape
+        # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram
+        assert audio_shape == (1, 256)
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical()
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # The method component.dtype returns the dtype of the first parameter registered in the model, not the
+        # dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale)
+        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
+
+        # Without the logit scale parameters, everything is float32
+        model_dtypes.pop("text_encoder")
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
+
+        # the CLAP sub-models are float32
+        model_dtypes["clap_text_branch"] = components["text_encoder"].text_model.dtype
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
+
+        # Once we send to fp16, all params are in half-precision, including the logit scale
+        pipe.to(dtype=torch.float16)
+        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
+
+
+@nightly
+@require_torch_gpu
+class MusicLDMPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "A hammer hitting a wooden surface",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 2.5,
+        }
+        return inputs
+
+    def test_musicldm(self):
+        musicldm_pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm")
+        musicldm_pipe = musicldm_pipe.to(torch_device)
+        musicldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 25
+        audio = musicldm_pipe(**inputs).audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 81952
+
+        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
+        audio_slice = audio[8680:8690]
+        expected_slice = np.array(
+            [-0.1042, -0.1068, -0.1235, -0.1387, -0.1428, -0.136, -0.1213, -0.1097, -0.0967, -0.0945]
+        )
+        max_diff = np.abs(expected_slice - audio_slice).max()
+        assert max_diff < 1e-3
+
+    def test_musicldm_lms(self):
+        musicldm_pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm")
+        musicldm_pipe.scheduler = LMSDiscreteScheduler.from_config(musicldm_pipe.scheduler.config)
+        musicldm_pipe = musicldm_pipe.to(torch_device)
+        musicldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        audio = musicldm_pipe(**inputs).audios[0]
+
+        assert audio.ndim == 1
+        assert len(audio) == 81952
+
+        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
+        audio_slice = audio[58020:58030]
+        expected_slice = np.array([0.3592, 0.3477, 0.4084, 0.4665, 0.5048, 0.5891, 0.6461, 0.5579, 0.4595, 0.4403])
+        max_diff = np.abs(expected_slice - audio_slice).max()
+        assert max_diff < 1e-3
diff --git a/diffusers/tests/pipelines/pag/__init__.py b/diffusers/tests/pipelines/pag/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/pag/test_pag_animatediff.py b/diffusers/tests/pipelines/pag/test_pag_animatediff.py
new file mode 100644
index 0000000000000000000000000000000000000000..7efe8002d17c824c5c0156a9830e566b90267243
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_animatediff.py
@@ -0,0 +1,555 @@
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AnimateDiffPAGPipeline,
+    AnimateDiffPipeline,
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    LCMScheduler,
+    MotionAdapter,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    UNetMotionModel,
+)
+from diffusers.models.attention import FreeNoiseTransformerBlock
+from diffusers.utils import is_xformers_available
+from diffusers.utils.testing_utils import torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineTesterMixin,
+    SDFunctionTesterMixin,
+)
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+class AnimateDiffPAGPipelineFastTests(
+    IPAdapterTesterMixin, SDFunctionTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
+):
+    pipeline_class = AnimateDiffPAGPipeline
+    params = TEXT_TO_IMAGE_PARAMS.union({"pag_scale", "pag_adaptive_scale"})
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+
+    def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            sample_size=8,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="linear",
+            clip_sample=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=block_out_channels,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        motion_adapter = MotionAdapter(
+            block_out_channels=block_out_channels,
+            motion_layers_per_block=2,
+            motion_norm_num_groups=2,
+            motion_num_attention_heads=4,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "motion_adapter": motion_adapter,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 7.5,
+            "pag_scale": 3.0,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
+    def test_motion_unet_loading(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        assert isinstance(pipe.unet, UNetMotionModel)
+
+    @unittest.skip("Attention slicing is not enabled in this pipeline")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array(
+                [
+                    0.5068,
+                    0.5294,
+                    0.4926,
+                    0.4810,
+                    0.4188,
+                    0.5935,
+                    0.5295,
+                    0.3947,
+                    0.5300,
+                    0.4706,
+                    0.3950,
+                    0.4737,
+                    0.4072,
+                    0.3227,
+                    0.5481,
+                    0.4864,
+                    0.4518,
+                    0.5315,
+                    0.5979,
+                    0.5374,
+                    0.3503,
+                    0.5275,
+                    0.6067,
+                    0.4914,
+                    0.5440,
+                    0.4775,
+                    0.5538,
+                ]
+            )
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.5295, 0.3947, 0.5300, 0.4864, 0.4518, 0.5315, 0.5440, 0.4775, 0.5538])
+        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_prompt_embeds(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs.pop("prompt")
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
+        pipe(**inputs)
+
+    def test_free_init(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffPAGPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        pipe.enable_free_init(
+            num_iters=2,
+            use_fast_sampling=True,
+            method="butterworth",
+            order=4,
+            spatial_stop_frequency=0.25,
+            temporal_stop_frequency=0.25,
+        )
+        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
+        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
+
+        pipe.disable_free_init()
+        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
+        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
+
+        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
+        self.assertGreater(
+            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
+        )
+        self.assertLess(
+            max_diff_disabled,
+            1e-3,
+            "Disabling of FreeInit should lead to results similar to the default pipeline results",
+        )
+
+    def test_free_init_with_schedulers(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffPAGPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        schedulers_to_test = [
+            DPMSolverMultistepScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                algorithm_type="dpmsolver++",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+            LCMScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+        ]
+        components.pop("scheduler")
+
+        for scheduler in schedulers_to_test:
+            components["scheduler"] = scheduler
+            pipe: AnimateDiffPAGPipeline = self.pipeline_class(**components)
+            pipe.set_progress_bar_config(disable=None)
+            pipe.to(torch_device)
+
+            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
+
+            inputs = self.get_dummy_inputs(torch_device)
+            frames_enable_free_init = pipe(**inputs).frames[0]
+            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+
+            self.assertGreater(
+                sum_enabled,
+                1e1,
+                "Enabling of FreeInit should lead to results different from the default pipeline results",
+            )
+
+    def test_free_noise_blocks(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffPAGPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        pipe.enable_free_noise()
+        for block in pipe.unet.down_blocks:
+            for motion_module in block.motion_modules:
+                for transformer_block in motion_module.transformer_blocks:
+                    self.assertTrue(
+                        isinstance(transformer_block, FreeNoiseTransformerBlock),
+                        "Motion module transformer blocks must be an instance of `FreeNoiseTransformerBlock` after enabling FreeNoise.",
+                    )
+
+        pipe.disable_free_noise()
+        for block in pipe.unet.down_blocks:
+            for motion_module in block.motion_modules:
+                for transformer_block in motion_module.transformer_blocks:
+                    self.assertFalse(
+                        isinstance(transformer_block, FreeNoiseTransformerBlock),
+                        "Motion module transformer blocks must not be an instance of `FreeNoiseTransformerBlock` after disabling FreeNoise.",
+                    )
+
+    def test_free_noise(self):
+        components = self.get_dummy_components()
+        pipe: AnimateDiffPAGPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        for context_length in [8, 9]:
+            for context_stride in [4, 6]:
+                pipe.enable_free_noise(context_length, context_stride)
+
+                inputs_enable_free_noise = self.get_dummy_inputs(torch_device)
+                frames_enable_free_noise = pipe(**inputs_enable_free_noise).frames[0]
+
+                pipe.disable_free_noise()
+
+                inputs_disable_free_noise = self.get_dummy_inputs(torch_device)
+                frames_disable_free_noise = pipe(**inputs_disable_free_noise).frames[0]
+
+                sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_noise)).sum()
+                max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_noise)).max()
+                self.assertGreater(
+                    sum_enabled,
+                    1e1,
+                    "Enabling of FreeNoise should lead to results different from the default pipeline results",
+                )
+                self.assertLess(
+                    max_diff_disabled,
+                    1e-4,
+                    "Disabling of FreeNoise should lead to results similar to the default pipeline results",
+                )
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = pipe(**inputs).frames[0]
+        output_without_offload = (
+            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        )
+
+        pipe.enable_xformers_memory_efficient_attention()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = pipe(**inputs).frames[0]
+        output_with_offload = (
+            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
+        )
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
+
+    def test_vae_slicing(self):
+        return super().test_vae_slicing(image_count=2)
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline (expect same output when pag is disabled)
+        components.pop("pag_applied_layers", None)
+        pipe_sd = AnimateDiffPipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).frames[0, -3:, -3:, -1]
+
+        components = self.get_dummy_components()
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).frames[0, -3:, -3:, -1]
+
+        # pag enabled
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        out_pag_enabled = pipe_pag(**inputs).frames[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_pag_applied_layers(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline
+        components.pop("pag_applied_layers", None)
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pag_applied_layers = ["mid","up","down"] should apply to all self-attention layers
+        # Note that for motion modules in AnimateDiff, both attn1 and attn2 are self-attention
+        all_self_attn_layers = [
+            k for k in pipe.unet.attn_processors.keys() if "attn1" in k or ("motion_modules" in k and "attn2" in k)
+        ]
+        original_attn_procs = pipe.unet.attn_processors
+        pag_layers = [
+            "down",
+            "mid",
+            "up",
+        ]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_layers)
+
+        # pag_applied_layers = ["mid"], or ["mid_block.0"] should apply to all self-attention layers in mid_block, i.e.
+        # mid_block.motion_modules.0.transformer_blocks.0.attn1.processor
+        # mid_block.attentions.0.transformer_blocks.0.attn1.processor
+        all_self_attn_mid_layers = [
+            "mid_block.attentions.0.transformer_blocks.0.attn1.processor",
+            "mid_block.motion_modules.0.transformer_blocks.0.attn1.processor",
+            "mid_block.motion_modules.0.transformer_blocks.0.attn2.processor",
+        ]
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block.(attentions|motion_modules)"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block.attentions.1"]
+        with self.assertRaises(ValueError):
+            pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+
+        # pag_applied_layers = "down" should apply to all self-attention layers in down_blocks
+        # down_blocks.1.(attentions|motion_modules).0.transformer_blocks.0.attn1.processor
+        # down_blocks.1.(attentions|motion_modules).0.transformer_blocks.1.attn1.processor
+        # down_blocks.1.(attentions|motion_modules).0.transformer_blocks.0.attn1.processor
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 10
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down_blocks.0"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert (len(pipe.pag_attn_processors)) == 6
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["blocks.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 10
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["motion_modules.42"]
+        with self.assertRaises(ValueError):
+            pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
diff --git a/diffusers/tests/pipelines/pag/test_pag_controlnet_sd.py b/diffusers/tests/pipelines/pag/test_pag_controlnet_sd.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a7eb6f0c67568c90b61b6e97b32e152b7b9cfd3
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_controlnet_sd.py
@@ -0,0 +1,248 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    DDIMScheduler,
+    StableDiffusionControlNetPAGPipeline,
+    StableDiffusionControlNetPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionControlNetPAGPipelineFastTests(
+    PipelineTesterMixin,
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineFromPipeTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionControlNetPAGPipeline
+    params = TEXT_TO_IMAGE_PARAMS.union({"pag_scale", "pag_adaptive_scale"})
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        # Copied from tests.pipelines.controlnet.test_controlnet_sdxl.StableDiffusionXLControlNetPipelineFastTests.get_dummy_components
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=8,
+            time_cond_proj_dim=time_cond_proj_dim,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            conditioning_embedding_out_channels=(2, 4),
+            cross_attention_dim=8,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=8,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+        image = randn_tensor(
+            (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+            generator=generator,
+            device=torch.device(device),
+        )
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "pag_scale": 3.0,
+            "output_type": "np",
+            "image": image,
+        }
+
+        return inputs
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base  pipeline (expect same output when pag is disabled)
+        pipe_sd = StableDiffusionControlNetPipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        # pag enabled
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_pag_cfg(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            64,
+            64,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+        expected_slice = np.array(
+            [0.45505235, 0.2785938, 0.16334778, 0.79689944, 0.53095645, 0.40135607, 0.7052706, 0.69065094, 0.41548574]
+        )
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
+
+    def test_pag_uncond(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["guidance_scale"] = 0.0
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            64,
+            64,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+        expected_slice = np.array(
+            [0.45127502, 0.2797252, 0.15970308, 0.7993157, 0.5414344, 0.40160775, 0.7114598, 0.69803864, 0.4217583]
+        )
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
diff --git a/diffusers/tests/pipelines/pag/test_pag_controlnet_sdxl.py b/diffusers/tests/pipelines/pag/test_pag_controlnet_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..6400cc2b7cabc4dd83d60a6895002aa5cf7ed496
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_controlnet_sdxl.py
@@ -0,0 +1,265 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    EulerDiscreteScheduler,
+    StableDiffusionXLControlNetPAGPipeline,
+    StableDiffusionXLControlNetPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLControlNetPAGPipelineFastTests(
+    PipelineTesterMixin,
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineFromPipeTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionXLControlNetPAGPipeline
+    params = TEXT_TO_IMAGE_PARAMS.union({"pag_scale", "pag_adaptive_scale"})
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        # Copied from tests.pipelines.controlnet.test_controlnet_sdxl.StableDiffusionXLControlNetPipelineFastTests.get_dummy_components
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+            time_cond_proj_dim=time_cond_proj_dim,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            conditioning_embedding_out_channels=(16, 32),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        torch.manual_seed(0)
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        controlnet_embedder_scale_factor = 2
+        image = randn_tensor(
+            (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+            generator=generator,
+            device=torch.device(device),
+        )
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "pag_scale": 3.0,
+            "output_type": "np",
+            "image": image,
+        }
+
+        return inputs
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base  pipeline (expect same output when pag is disabled)
+        pipe_sd = StableDiffusionXLControlNetPipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        # pag enabled
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    def test_pag_cfg(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            64,
+            64,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+        expected_slice = np.array([0.7036, 0.5613, 0.5526, 0.6129, 0.5610, 0.5842, 0.4228, 0.4612, 0.5017])
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
+
+    def test_pag_uncond(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["guidance_scale"] = 0.0
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            64,
+            64,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+        expected_slice = np.array([0.6888, 0.5398, 0.5603, 0.6086, 0.5541, 0.5957, 0.4332, 0.4643, 0.5154])
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
diff --git a/diffusers/tests/pipelines/pag/test_pag_controlnet_sdxl_img2img.py b/diffusers/tests/pipelines/pag/test_pag_controlnet_sdxl_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..b02f4d8b4561106a069c6e1f06ac685ac7c4fb52
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_controlnet_sdxl_img2img.py
@@ -0,0 +1,271 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    EulerDiscreteScheduler,
+    StableDiffusionXLControlNetImg2ImgPipeline,
+    StableDiffusionXLControlNetPAGImg2ImgPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLControlNetPAGImg2ImgPipelineFastTests(
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    PipelineFromPipeTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionXLControlNetPAGImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"pag_scale", "pag_adaptive_scale"})
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
+        {"add_text_embeds", "add_time_ids", "add_neg_time_ids"}
+    )
+
+    # Copied from tests.pipelines.controlnet.test_controlnet_sdxl_img2img.ControlNetPipelineSDXLImg2ImgFastTests.get_dummy_components
+    def get_dummy_components(self, skip_first_text_encoder=False):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64 if not skip_first_text_encoder else 32,
+        )
+        torch.manual_seed(0)
+        controlnet = ControlNetModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            in_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            conditioning_embedding_out_channels=(16, 32),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        torch.manual_seed(0)
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "controlnet": controlnet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder if not skip_first_text_encoder else None,
+            "tokenizer": tokenizer if not skip_first_text_encoder else None,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "image_encoder": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    # based on tests.pipelines.controlnet.test_controlnet_sdxl_img2img.ControlNetPipelineSDXLImg2ImgFastTests.get_dummy_inputs
+    # add `pag_scale` to the inputs
+    def get_dummy_inputs(self, device, seed=0):
+        controlnet_embedder_scale_factor = 2
+        image = floats_tensor(
+            (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+            rng=random.Random(seed),
+        ).to(device)
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "pag_scale": 3.0,
+            "output_type": "np",
+            "image": image,
+            "control_image": image,
+        }
+
+        return inputs
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline
+        pipe_sd = StableDiffusionXLControlNetImg2ImgPipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        # pag enable
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_save_load_optional_components(self):
+        pass
+
+    def test_pag_cfg(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            64,
+            64,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+        expected_slice = np.array(
+            [0.5562928, 0.44882968, 0.4588066, 0.63200223, 0.5694165, 0.4955688, 0.6126959, 0.57588536, 0.43827885]
+        )
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
+
+    def test_pag_uncond(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["guidance_scale"] = 0.0
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            64,
+            64,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+        expected_slice = np.array(
+            [0.5543988, 0.45614323, 0.4665692, 0.6202247, 0.5598917, 0.49621183, 0.6084159, 0.5722314, 0.43945464]
+        )
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
diff --git a/diffusers/tests/pipelines/pag/test_pag_hunyuan_dit.py b/diffusers/tests/pipelines/pag/test_pag_hunyuan_dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..db0e257760ed44ba159ea7dd53a8a317f329f21a
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_hunyuan_dit.py
@@ -0,0 +1,358 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, BertModel, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    HunyuanDiT2DModel,
+    HunyuanDiTPAGPipeline,
+    HunyuanDiTPipeline,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, to_np
+
+
+enable_full_determinism()
+
+
+class HunyuanDiTPAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = HunyuanDiTPAGPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    required_optional_params = PipelineTesterMixin.required_optional_params
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = HunyuanDiT2DModel(
+            sample_size=16,
+            num_layers=2,
+            patch_size=2,
+            attention_head_dim=8,
+            num_attention_heads=3,
+            in_channels=4,
+            cross_attention_dim=32,
+            cross_attention_dim_t5=32,
+            pooled_projection_dim=16,
+            hidden_size=24,
+            activation_fn="gelu-approximate",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL()
+
+        scheduler = DDPMScheduler()
+        text_encoder = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "use_resolution_binning": False,
+            "pag_scale": 0.0,
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (1, 16, 16, 3))
+        expected_slice = np.array(
+            [0.56939435, 0.34541583, 0.35915792, 0.46489206, 0.38775963, 0.45004836, 0.5957267, 0.59481275, 0.33287364]
+        )
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        # TODO(YiYi) need to fix later
+        pass
+
+    def test_sequential_offload_forward_pass_twice(self):
+        # TODO(YiYi) need to fix later
+        pass
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            expected_max_diff=1e-3,
+        )
+
+    def test_save_load_optional_components(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = inputs["prompt"]
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        ) = pipe.encode_prompt(prompt, device=torch_device, dtype=torch.float32, text_encoder_index=0)
+
+        (
+            prompt_embeds_2,
+            negative_prompt_embeds_2,
+            prompt_attention_mask_2,
+            negative_prompt_attention_mask_2,
+        ) = pipe.encode_prompt(
+            prompt,
+            device=torch_device,
+            dtype=torch.float32,
+            text_encoder_index=1,
+        )
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attention_mask,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": negative_prompt_attention_mask,
+            "prompt_embeds_2": prompt_embeds_2,
+            "prompt_attention_mask_2": prompt_attention_mask_2,
+            "negative_prompt_embeds_2": negative_prompt_embeds_2,
+            "negative_prompt_attention_mask_2": negative_prompt_attention_mask_2,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "use_resolution_binning": False,
+        }
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attention_mask,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": negative_prompt_attention_mask,
+            "prompt_embeds_2": prompt_embeds_2,
+            "prompt_attention_mask_2": prompt_attention_mask_2,
+            "negative_prompt_embeds_2": negative_prompt_embeds_2,
+            "negative_prompt_attention_mask_2": negative_prompt_attention_mask_2,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "use_resolution_binning": False,
+        }
+
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, 1e-4)
+
+    def test_feed_forward_chunking(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_no_chunking = image[0, -3:, -3:, -1]
+
+        pipe.transformer.enable_forward_chunking(chunk_size=1, dim=0)
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_chunking = image[0, -3:, -3:, -1]
+
+        max_diff = np.abs(to_np(image_slice_no_chunking) - to_np(image_slice_chunking)).max()
+        self.assertLess(max_diff, 1e-4)
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["return_dict"] = False
+        image = pipe(**inputs)[0]
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        pipe.transformer.fuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        inputs["return_dict"] = False
+        image_fused = pipe(**inputs)[0]
+        image_slice_fused = image_fused[0, -3:, -3:, -1]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        inputs["return_dict"] = False
+        image_disabled = pipe(**inputs)[0]
+        image_slice_disabled = image_disabled[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline (expect same output when pag is disabled)
+        pipe_sd = HunyuanDiTPipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+        components = self.get_dummy_components()
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        # pag enabled
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 3.0
+        out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_pag_applied_layers(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        all_self_attn_layers = [k for k in pipe.transformer.attn_processors.keys() if "attn1" in k]
+        original_attn_procs = pipe.transformer.attn_processors
+        pag_layers = ["blocks.0", "blocks.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_layers)
+
+        # blocks.0
+        block_0_self_attn = ["blocks.0.attn1.processor"]
+        pipe.transformer.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["blocks.0"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(block_0_self_attn)
+
+        pipe.transformer.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["blocks.0.attn1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(block_0_self_attn)
+
+        pipe.transformer.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["blocks.(0|1)"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert (len(pipe.pag_attn_processors)) == 2
+
+        pipe.transformer.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["blocks.0", r"blocks\.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 2
diff --git a/diffusers/tests/pipelines/pag/test_pag_kolors.py b/diffusers/tests/pipelines/pag/test_pag_kolors.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cfb2c3fd16a583001b8aec913eb658b0ee082dd
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_kolors.py
@@ -0,0 +1,252 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    EulerDiscreteScheduler,
+    KolorsPAGPipeline,
+    KolorsPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.kolors import ChatGLMModel, ChatGLMTokenizer
+from diffusers.utils.testing_utils import enable_full_determinism
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    PipelineFromPipeTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class KolorsPAGPipelineFastTests(
+    PipelineTesterMixin,
+    PipelineFromPipeTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = KolorsPAGPipeline
+    params = TEXT_TO_IMAGE_PARAMS.union({"pag_scale", "pag_adaptive_scale"})
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+    # Copied from tests.pipelines.kolors.test_kolors.KolorsPipelineFastTests.get_dummy_components
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(2, 4),
+            layers_per_block=2,
+            time_cond_proj_dim=time_cond_proj_dim,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=56,
+            cross_attention_dim=8,
+            norm_num_groups=1,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
+        tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "image_encoder": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "pag_scale": 0.9,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base  pipeline (expect same output when pag is disabled)
+        pipe_sd = KolorsPipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        # pag enabled
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_pag_applied_layers(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pag_applied_layers = ["mid","up","down"] should apply to all self-attention layers
+        all_self_attn_layers = [k for k in pipe.unet.attn_processors.keys() if "attn1" in k]
+        original_attn_procs = pipe.unet.attn_processors
+        pag_layers = ["mid", "down", "up"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_layers)
+
+        all_self_attn_mid_layers = [
+            "mid_block.attentions.0.transformer_blocks.0.attn1.processor",
+            "mid_block.attentions.0.transformer_blocks.1.attn1.processor",
+        ]
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block.attentions.0"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        # pag_applied_layers = ["mid.block_0.attentions_1"] does not exist in the model
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block.attentions.1"]
+        with self.assertRaises(ValueError):
+            pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+
+        # pag_applied_layers = "down" should apply to all self-attention layers in down_blocks
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 4
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down_blocks.0"]
+        with self.assertRaises(ValueError):
+            pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down_blocks.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 4
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down_blocks.1.attentions.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 2
+
+    def test_pag_inference(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            64,
+            64,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+        expected_slice = np.array(
+            [0.26030684, 0.43192005, 0.4042826, 0.4189067, 0.5181305, 0.3832534, 0.472135, 0.4145031, 0.43726248]
+        )
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=3e-3)
diff --git a/diffusers/tests/pipelines/pag/test_pag_pixart_sigma.py b/diffusers/tests/pipelines/pag/test_pag_pixart_sigma.py
new file mode 100644
index 0000000000000000000000000000000000000000..7de19e0f00fceca5b0df7f5c43f9151b6427db7b
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_pixart_sigma.py
@@ -0,0 +1,421 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    PixArtSigmaPAGPipeline,
+    PixArtSigmaPipeline,
+    PixArtTransformer2DModel,
+)
+from diffusers.utils import logging
+from diffusers.utils.testing_utils import (
+    CaptureLogger,
+    enable_full_determinism,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference, to_np
+
+
+enable_full_determinism()
+
+
+class PixArtSigmaPAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = PixArtSigmaPAGPipeline
+    params = TEXT_TO_IMAGE_PARAMS.union({"pag_scale", "pag_adaptive_scale"})
+    params = set(params)
+    params.remove("cross_attention_kwargs")
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = PixArtTransformer2DModel(
+            sample_size=8,
+            num_layers=2,
+            patch_size=2,
+            attention_head_dim=8,
+            num_attention_heads=3,
+            caption_channels=32,
+            in_channels=4,
+            cross_attention_dim=24,
+            out_channels=8,
+            attention_bias=True,
+            activation_fn="gelu-approximate",
+            num_embeds_ada_norm=1000,
+            norm_type="ada_norm_single",
+            norm_elementwise_affine=False,
+            norm_eps=1e-6,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL()
+
+        scheduler = DDIMScheduler()
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 1.0,
+            "pag_scale": 3.0,
+            "use_resolution_binning": False,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base  pipeline (expect same output when pag is disabled)
+        pipe = PixArtSigmaPipeline(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe.__class__.__name__}."
+        out = pipe(**inputs).images[0, -3:, -3:, -1]
+
+        # pag disabled with pag_scale=0.0
+        components["pag_applied_layers"] = ["blocks.1"]
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        # pag enabled
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_pag_applied_layers(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # "attn1" should apply to all self-attention layers.
+        all_self_attn_layers = [k for k in pipe.transformer.attn_processors.keys() if "attn1" in k]
+        pag_layers = ["blocks.0", "blocks.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_layers)
+
+    def test_pag_inference(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            8,
+            8,
+            3,
+        ), f"the shape of the output image should be (1, 8, 8, 3) but got {image.shape}"
+        expected_slice = np.array([0.6499, 0.3250, 0.3572, 0.6780, 0.4453, 0.4582, 0.2770, 0.5168, 0.4594])
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    # Copied from tests.pipelines.pixart_sigma.test_pixart.PixArtSigmaPipelineFastTests.test_save_load_optional_components
+    def test_save_load_optional_components(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = inputs["prompt"]
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = pipe.encode_prompt(prompt)
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attention_mask,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": negative_prompt_attention_mask,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "use_resolution_binning": False,
+        }
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, pag_applied_layers=["blocks.1"])
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attention_mask,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": negative_prompt_attention_mask,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "use_resolution_binning": False,
+        }
+
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, 1e-4)
+
+    # Because the PAG PixArt Sigma has `pag_applied_layers`.
+    # Also, we shouldn't be doing `set_default_attn_processor()` after loading
+    # the pipeline with `pag_applied_layers`.
+    def test_save_load_local(self, expected_max_difference=1e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs)[0]
+
+        logger = logging.get_logger("diffusers.pipelines.pipeline_utils")
+        logger.setLevel(diffusers.logging.INFO)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=False)
+
+            with CaptureLogger(logger) as cap_logger:
+                pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, pag_applied_layers=["blocks.1"])
+
+            for name in pipe_loaded.components.keys():
+                if name not in pipe_loaded._optional_components:
+                    assert name in str(cap_logger)
+
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, expected_max_difference)
+
+    # We shouldn't be setting `set_default_attn_processor` here.
+    def test_attention_slicing_forward_pass(
+        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_slicing = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=1)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing1 = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=2)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing2 = pipe(**inputs)[0]
+
+        if test_max_difference:
+            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
+            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
+            self.assertLess(
+                max(max_diff1, max_diff2),
+                expected_max_diff,
+                "Attention slicing should not affect the inference results",
+            )
+
+        if test_mean_pixel_difference:
+            assert_mean_pixel_difference(to_np(output_with_slicing1[0]), to_np(output_without_slicing[0]))
+            assert_mean_pixel_difference(to_np(output_with_slicing2[0]), to_np(output_without_slicing[0]))
+
+    # Because we have `pag_applied_layers` we cannot direcly apply
+    # `set_default_attn_processor`
+    def test_dict_tuple_outputs_equivalent(self, expected_slice=None, expected_max_difference=1e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        if expected_slice is None:
+            output = pipe(**self.get_dummy_inputs(generator_device))[0]
+        else:
+            output = expected_slice
+
+        output_tuple = pipe(**self.get_dummy_inputs(generator_device), return_dict=False)[0]
+
+        if expected_slice is None:
+            max_diff = np.abs(to_np(output) - to_np(output_tuple)).max()
+        else:
+            if output_tuple.ndim != 5:
+                max_diff = np.abs(to_np(output) - to_np(output_tuple)[0, -3:, -3:, -1].flatten()).max()
+            else:
+                max_diff = np.abs(to_np(output) - to_np(output_tuple)[0, -3:, -3:, -1, -1].flatten()).max()
+
+        self.assertLess(max_diff, expected_max_difference)
+
+    # Same reason as above
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
+
+        assert output_batch[0].shape[0] == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+        assert max_diff < expected_max_diff
+
+    # Because we're passing `pag_applied_layers` (type of List) in the components as well.
+    def test_components_function(self):
+        init_components = self.get_dummy_components()
+        init_components = {k: v for k, v in init_components.items() if not isinstance(v, (str, int, float, list))}
+
+        pipe = self.pipeline_class(**init_components)
+
+        self.assertTrue(hasattr(pipe, "components"))
+        self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
diff --git a/diffusers/tests/pipelines/pag/test_pag_sd.py b/diffusers/tests/pipelines/pag/test_pag_sd.py
new file mode 100644
index 0000000000000000000000000000000000000000..3979bb170e0b7619e2dfda1557776910b742c5af
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_sd.py
@@ -0,0 +1,345 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    AutoPipelineForText2Image,
+    DDIMScheduler,
+    StableDiffusionPAGPipeline,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionPAGPipelineFastTests(
+    PipelineTesterMixin,
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineFromPipeTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionPAGPipeline
+    params = TEXT_TO_IMAGE_PARAMS.union({"pag_scale", "pag_adaptive_scale"})
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        cross_attention_dim = 8
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=32,
+            time_cond_proj_dim=time_cond_proj_dim,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "pag_scale": 0.9,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base  pipeline (expect same output when pag is disabled)
+        pipe_sd = StableDiffusionPipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        # pag enabled
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_pag_applied_layers(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pag_applied_layers = ["mid","up","down"] should apply to all self-attention layers
+        all_self_attn_layers = [k for k in pipe.unet.attn_processors.keys() if "attn1" in k]
+        original_attn_procs = pipe.unet.attn_processors
+        pag_layers = [
+            "down",
+            "mid",
+            "up",
+        ]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_layers)
+
+        # pag_applied_layers = ["mid"], or ["mid.block_0"] or ["mid.block_0.attentions_0"] should apply to all self-attention layers in mid_block, i.e.
+        # mid_block.attentions.0.transformer_blocks.0.attn1.processor
+        # mid_block.attentions.0.transformer_blocks.1.attn1.processor
+        all_self_attn_mid_layers = [
+            "mid_block.attentions.0.transformer_blocks.0.attn1.processor",
+            # "mid_block.attentions.0.transformer_blocks.1.attn1.processor",
+        ]
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block.attentions.0"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        # pag_applied_layers = ["mid.block_0.attentions_1"] does not exist in the model
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block.attentions.1"]
+        with self.assertRaises(ValueError):
+            pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+
+        # pag_applied_layers = "down" should apply to all self-attention layers in down_blocks
+        # down_blocks.1.attentions.0.transformer_blocks.0.attn1.processor
+        # down_blocks.1.attentions.0.transformer_blocks.1.attn1.processor
+        # down_blocks.1.attentions.0.transformer_blocks.0.attn1.processor
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 2
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down_blocks.0"]
+        with self.assertRaises(ValueError):
+            pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down_blocks.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 2
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down_blocks.1.attentions.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 1
+
+    def test_pag_inference(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            64,
+            64,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+
+        expected_slice = np.array(
+            [0.22802538, 0.44626093, 0.48905736, 0.29633686, 0.36400637, 0.4724258, 0.4678891, 0.32260418, 0.41611585]
+        )
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
+    pipeline_class = StableDiffusionPAGPipeline
+    repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", seed=1, guidance_scale=7.0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        inputs = {
+            "prompt": "a polar bear sitting in a chair drinking a milkshake",
+            "negative_prompt": "deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": guidance_scale,
+            "pag_scale": 3.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pag_cfg(self):
+        pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = pipeline(**inputs).images
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 512, 512, 3)
+        print(image_slice.flatten())
+        expected_slice = np.array(
+            [0.58251953, 0.5722656, 0.5683594, 0.55029297, 0.52001953, 0.52001953, 0.49951172, 0.45410156, 0.50146484]
+        )
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        ), f"output is different from expected, {image_slice.flatten()}"
+
+    def test_pag_uncond(self):
+        pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device, guidance_scale=0.0)
+        image = pipeline(**inputs).images
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.5986328, 0.52441406, 0.3972168, 0.4741211, 0.34985352, 0.22705078, 0.4128418, 0.2866211, 0.31713867]
+        )
+        print(image_slice.flatten())
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        ), f"output is different from expected, {image_slice.flatten()}"
diff --git a/diffusers/tests/pipelines/pag/test_pag_sd3.py b/diffusers/tests/pipelines/pag/test_pag_sd3.py
new file mode 100644
index 0000000000000000000000000000000000000000..627d613ee20dce842442bfe2d2cb305f22cc2759
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_sd3.py
@@ -0,0 +1,296 @@
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+    StableDiffusion3PAGPipeline,
+    StableDiffusion3Pipeline,
+)
+from diffusers.utils.testing_utils import (
+    torch_device,
+)
+
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fusion_matches_attn_procs_length,
+    check_qkv_fusion_processors_exist,
+)
+
+
+class StableDiffusion3PAGPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = StableDiffusion3PAGPipeline
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "guidance_scale",
+            "negative_prompt",
+            "prompt_embeds",
+            "negative_prompt_embeds",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt"])
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = SD3Transformer2DModel(
+            sample_size=32,
+            patch_size=1,
+            in_channels=4,
+            num_layers=2,
+            attention_head_dim=8,
+            num_attention_heads=4,
+            caption_projection_dim=32,
+            joint_attention_dim=32,
+            pooled_projection_dim=64,
+            out_channels=4,
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=4,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "text_encoder_3": text_encoder_3,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "tokenizer_3": tokenizer_3,
+            "transformer": transformer,
+            "vae": vae,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "pag_scale": 0.0,
+        }
+        return inputs
+
+    def test_stable_diffusion_3_different_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "a different prompt"
+        inputs["prompt_3"] = "another different prompt"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        assert max_diff > 1e-2
+
+    def test_stable_diffusion_3_different_negative_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt_2"] = "deformed"
+        inputs["negative_prompt_3"] = "blurry"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        assert max_diff > 1e-2
+
+    def test_stable_diffusion_3_prompt_embeds(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        output_with_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        do_classifier_free_guidance = inputs["guidance_scale"] > 1
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = pipe.encode_prompt(
+            prompt,
+            prompt_2=None,
+            prompt_3=None,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            device=torch_device,
+        )
+        output_with_embeds = pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            **inputs,
+        ).images[0]
+
+        max_diff = np.abs(output_with_prompt - output_with_embeds).max()
+        assert max_diff < 1e-4
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
+        # to the pipeline level.
+        pipe.transformer.fuse_qkv_projections()
+        assert check_qkv_fusion_processors_exist(
+            pipe.transformer
+        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+        assert check_qkv_fusion_matches_attn_procs_length(
+            pipe.transformer, pipe.transformer.original_attn_processors
+        ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_fused = image[0, -3:, -3:, -1]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_disabled = image[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline (expect same output when pag is disabled)
+        pipe_sd = StableDiffusion3Pipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+        components = self.get_dummy_components()
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+
+    def test_pag_applied_layers(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        all_self_attn_layers = [k for k in pipe.transformer.attn_processors.keys() if "attn" in k]
+        original_attn_procs = pipe.transformer.attn_processors
+        pag_layers = ["blocks.0", "blocks.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_layers)
+
+        # blocks.0
+        block_0_self_attn = ["transformer_blocks.0.attn.processor"]
+        pipe.transformer.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["blocks.0"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(block_0_self_attn)
+
+        pipe.transformer.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["blocks.0.attn"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(block_0_self_attn)
+
+        pipe.transformer.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["blocks.(0|1)"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert (len(pipe.pag_attn_processors)) == 2
+
+        pipe.transformer.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["blocks.0", r"blocks\.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 2
diff --git a/diffusers/tests/pipelines/pag/test_pag_sdxl.py b/diffusers/tests/pipelines/pag/test_pag_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..58957338567785fda4f0067cd0d2da626ba56fcb
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_sdxl.py
@@ -0,0 +1,353 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    AutoPipelineForText2Image,
+    EulerDiscreteScheduler,
+    StableDiffusionXLPAGPipeline,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLPAGPipelineFastTests(
+    PipelineTesterMixin,
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineFromPipeTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionXLPAGPipeline
+    params = TEXT_TO_IMAGE_PARAMS.union({"pag_scale", "pag_adaptive_scale"})
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        # Copied from tests.pipelines.stable_diffusion_xl.test_stable_diffusion_xl.StableDiffusionXLPipelineFastTests.get_dummy_components
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(2, 4),
+            layers_per_block=2,
+            time_cond_proj_dim=time_cond_proj_dim,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+            norm_num_groups=1,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "image_encoder": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "pag_scale": 0.9,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base  pipeline (expect same output when pag is disabled)
+        pipe_sd = StableDiffusionXLPipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        # pag enabled
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    def test_pag_applied_layers(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # base pipeline
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pag_applied_layers = ["mid","up","down"] should apply to all self-attention layers
+        all_self_attn_layers = [k for k in pipe.unet.attn_processors.keys() if "attn1" in k]
+        original_attn_procs = pipe.unet.attn_processors
+        pag_layers = ["mid", "down", "up"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_layers)
+
+        # pag_applied_layers = ["mid"], or ["mid.block_0"] or ["mid.block_0.attentions_0"] should apply to all self-attention layers in mid_block, i.e.
+        # mid_block.attentions.0.transformer_blocks.0.attn1.processor
+        # mid_block.attentions.0.transformer_blocks.1.attn1.processor
+        all_self_attn_mid_layers = [
+            "mid_block.attentions.0.transformer_blocks.0.attn1.processor",
+            "mid_block.attentions.0.transformer_blocks.1.attn1.processor",
+        ]
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block.attentions.0"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert set(pipe.pag_attn_processors) == set(all_self_attn_mid_layers)
+
+        # pag_applied_layers = ["mid.block_0.attentions_1"] does not exist in the model
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["mid_block.attentions.1"]
+        with self.assertRaises(ValueError):
+            pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+
+        # pag_applied_layers = "down" should apply to all self-attention layers in down_blocks
+        # down_blocks.1.attentions.0.transformer_blocks.0.attn1.processor
+        # down_blocks.1.attentions.0.transformer_blocks.1.attn1.processor
+        # down_blocks.1.attentions.1.transformer_blocks.0.attn1.processor
+        # down_blocks.1.attentions.1.transformer_blocks.1.attn1.processor
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 4
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down_blocks.0"]
+        with self.assertRaises(ValueError):
+            pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down_blocks.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 4
+
+        pipe.unet.set_attn_processor(original_attn_procs.copy())
+        pag_layers = ["down_blocks.1.attentions.1"]
+        pipe._set_pag_attn_processor(pag_applied_layers=pag_layers, do_classifier_free_guidance=False)
+        assert len(pipe.pag_attn_processors) == 2
+
+    def test_pag_inference(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            64,
+            64,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+        expected_slice = np.array([0.5382, 0.5439, 0.4704, 0.4569, 0.5234, 0.4834, 0.5289, 0.5039, 0.4764])
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLPAGPipelineIntegrationTests(unittest.TestCase):
+    pipeline_class = StableDiffusionXLPAGPipeline
+    repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        inputs = {
+            "prompt": "a polar bear sitting in a chair drinking a milkshake",
+            "negative_prompt": "deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": guidance_scale,
+            "pag_scale": 3.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pag_cfg(self):
+        pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = pipeline(**inputs).images
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 1024, 1024, 3)
+        expected_slice = np.array(
+            [0.3123679, 0.31725878, 0.32026544, 0.327533, 0.3266391, 0.3303998, 0.33544615, 0.34181812, 0.34102726]
+        )
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        ), f"output is different from expected, {image_slice.flatten()}"
+
+    def test_pag_uncond(self):
+        pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device, guidance_scale=0.0)
+        image = pipeline(**inputs).images
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 1024, 1024, 3)
+        expected_slice = np.array(
+            [0.47400922, 0.48650584, 0.4839625, 0.4724013, 0.4890427, 0.49544555, 0.51707107, 0.54299414, 0.5224372]
+        )
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        ), f"output is different from expected, {image_slice.flatten()}"
diff --git a/diffusers/tests/pipelines/pag/test_pag_sdxl_img2img.py b/diffusers/tests/pipelines/pag/test_pag_sdxl_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e5fc5fa28b9326e4b0b18bb87b19090d810a99b
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_sdxl_img2img.py
@@ -0,0 +1,336 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    AutoPipelineForImage2Image,
+    EulerDiscreteScheduler,
+    StableDiffusionXLImg2ImgPipeline,
+    StableDiffusionXLPAGImg2ImgPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLPAGImg2ImgPipelineFastTests(
+    PipelineTesterMixin,
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineFromPipeTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionXLPAGImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS.union({"pag_scale", "pag_adaptive_scale"}) - {"height", "width"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
+        {"add_text_embeds", "add_time_ids", "add_neg_time_ids"}
+    )
+
+    #  based on tests.pipelines.stable_diffusion_xl.test_stable_diffusion_xl_img2img_pipeline.get_dummy_components
+    def get_dummy_components(
+        self, skip_first_text_encoder=False, time_cond_proj_dim=None, requires_aesthetics_score=False
+    ):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            time_cond_proj_dim=time_cond_proj_dim,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=72 if requires_aesthetics_score else 80,  # 5 * 8 + 32
+            cross_attention_dim=64 if not skip_first_text_encoder else 32,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=32,
+            image_size=224,
+            projection_dim=32,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+
+        feature_extractor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder if not skip_first_text_encoder else None,
+            "tokenizer": tokenizer if not skip_first_text_encoder else None,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "requires_aesthetics_score": requires_aesthetics_score,
+            "image_encoder": image_encoder,
+            "feature_extractor": feature_extractor,
+        }
+        return components
+
+    # based on tests.pipelines.stable_diffusion_xl.test_stable_diffusion_xl_img2img_pipeline.StableDiffusionXLImg2ImgPipelineFastTests
+    # add `pag_scale` to the inputs
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image / 2 + 0.5
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "pag_scale": 3.0,
+            "output_type": "np",
+            "strength": 0.8,
+        }
+        return inputs
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(requires_aesthetics_score=True)
+
+        # base pipeline
+        pipe_sd = StableDiffusionXLImg2ImgPipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        # pag enabled
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    def test_pag_inference(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(requires_aesthetics_score=True)
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            32,
+            32,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+        expected_slice = np.array([0.4613, 0.4902, 0.4406, 0.6788, 0.5611, 0.4529, 0.5893, 0.5975, 0.5226])
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
+    repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
+        img_url = (
+            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
+        )
+
+        init_image = load_image(img_url)
+
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        inputs = {
+            "prompt": "a dog catching a frisbee in the jungle",
+            "generator": generator,
+            "image": init_image,
+            "strength": 0.8,
+            "num_inference_steps": 3,
+            "guidance_scale": guidance_scale,
+            "pag_scale": 3.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pag_cfg(self):
+        pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = pipeline(**inputs).images
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 1024, 1024, 3)
+        expected_slice = np.array(
+            [0.20301354, 0.21078318, 0.2021082, 0.20277798, 0.20681083, 0.19562206, 0.20121682, 0.21562952, 0.21277016]
+        )
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        ), f"output is different from expected, {image_slice.flatten()}"
+
+    def test_pag_uncond(self):
+        pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device, guidance_scale=0.0)
+        image = pipeline(**inputs).images
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 1024, 1024, 3)
+        expected_slice = np.array(
+            [0.21303111, 0.22188407, 0.2124992, 0.21365267, 0.18823743, 0.17569828, 0.21113116, 0.19419771, 0.18919235]
+        )
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        ), f"output is different from expected, {image_slice.flatten()}"
diff --git a/diffusers/tests/pipelines/pag/test_pag_sdxl_inpaint.py b/diffusers/tests/pipelines/pag/test_pag_sdxl_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..efc37abd068225dbd9151d88081490f69c943fb2
--- /dev/null
+++ b/diffusers/tests/pipelines/pag/test_pag_sdxl_inpaint.py
@@ -0,0 +1,342 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    AutoPipelineForInpainting,
+    EulerDiscreteScheduler,
+    StableDiffusionXLInpaintPipeline,
+    StableDiffusionXLPAGInpaintPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLPAGInpaintPipelineFastTests(
+    PipelineTesterMixin,
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineFromPipeTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionXLPAGInpaintPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS.union({"pag_scale", "pag_adaptive_scale"})
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset([])
+    image_latents_params = frozenset([])
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
+        {"add_text_embeds", "add_time_ids", "mask", "masked_image_latents"}
+    )
+
+    # based on tests.pipelines.stable_diffusion_xl.test_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipelineFastTests.get_dummy_components
+    def get_dummy_components(
+        self, skip_first_text_encoder=False, time_cond_proj_dim=None, requires_aesthetics_score=False
+    ):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            time_cond_proj_dim=time_cond_proj_dim,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=72 if requires_aesthetics_score else 80,  # 5 * 8 + 32
+            cross_attention_dim=64 if not skip_first_text_encoder else 32,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=32,
+            image_size=224,
+            projection_dim=32,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+
+        feature_extractor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder if not skip_first_text_encoder else None,
+            "tokenizer": tokenizer if not skip_first_text_encoder else None,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "image_encoder": image_encoder,
+            "feature_extractor": feature_extractor,
+            "requires_aesthetics_score": requires_aesthetics_score,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        # create mask
+        image[8:, 8:, :] = 255
+        mask_image = Image.fromarray(np.uint8(image)).convert("L").resize((64, 64))
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": init_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "strength": 1.0,
+            "pag_scale": 0.9,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pag_disable_enable(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(requires_aesthetics_score=True)
+
+        # base pipeline
+        pipe_sd = StableDiffusionXLInpaintPipeline(**components)
+        pipe_sd = pipe_sd.to(device)
+        pipe_sd.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["pag_scale"]
+        assert (
+            "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+        ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+        out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+        # pag disabled with pag_scale=0.0
+        pipe_pag = self.pipeline_class(**components)
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["pag_scale"] = 0.0
+        out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        # pag enabled
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+        assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+        assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    def test_pag_inference(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(requires_aesthetics_score=True)
+
+        pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+        pipe_pag = pipe_pag.to(device)
+        pipe_pag.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe_pag(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (
+            1,
+            64,
+            64,
+            3,
+        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+        expected_slice = np.array([0.8366, 0.5513, 0.6105, 0.6213, 0.6957, 0.7400, 0.6614, 0.6102, 0.5239])
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLPAGInpaintPipelineIntegrationTests(unittest.TestCase):
+    repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
+        img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+        init_image = load_image(img_url).convert("RGB")
+        mask_image = load_image(mask_url).convert("RGB")
+
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        inputs = {
+            "prompt": "A majestic tiger sitting on a bench",
+            "generator": generator,
+            "image": init_image,
+            "mask_image": mask_image,
+            "strength": 0.8,
+            "num_inference_steps": 3,
+            "guidance_scale": guidance_scale,
+            "pag_scale": 3.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pag_cfg(self):
+        pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = pipeline(**inputs).images
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 1024, 1024, 3)
+        expected_slice = np.array(
+            [0.41385046, 0.39608297, 0.4360491, 0.26872507, 0.32187328, 0.4242474, 0.2603805, 0.34167895, 0.46561807]
+        )
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        ), f"output is different from expected, {image_slice.flatten()}"
+
+    def test_pag_uncond(self):
+        pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
+        pipeline.enable_model_cpu_offload()
+        pipeline.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device, guidance_scale=0.0)
+        image = pipeline(**inputs).images
+
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 1024, 1024, 3)
+        expected_slice = np.array(
+            [0.41597816, 0.39302617, 0.44287828, 0.2687074, 0.28315824, 0.40582314, 0.20877528, 0.2380802, 0.39447647]
+        )
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        ), f"output is different from expected, {image_slice.flatten()}"
diff --git a/diffusers/tests/pipelines/paint_by_example/__init__.py b/diffusers/tests/pipelines/paint_by_example/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py b/diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..c71e2d4761c2b6dcc5bc9ff5f15a5f0ad70bc9fe
--- /dev/null
+++ b/diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py
@@ -0,0 +1,226 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPVisionConfig
+
+from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel
+from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    nightly,
+    require_torch_gpu,
+    torch_device,
+)
+
+from ..pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = PaintByExamplePipeline
+    params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
+    batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset([])  # TO_DO: update the image_prams once refactored VaeImageProcessor.preprocess
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=32,
+            projection_dim=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            image_size=32,
+            patch_size=4,
+        )
+        image_encoder = PaintByExampleImageEncoder(config, proj_size=32)
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "image_encoder": image_encoder,
+            "safety_checker": None,
+            "feature_extractor": feature_extractor,
+        }
+        return components
+
+    def convert_to_pt(self, image):
+        image = np.array(image.convert("RGB"))
+        image = image[None].transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+        return image
+
+    def get_dummy_inputs(self, device="cpu", seed=0):
+        # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
+        example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "example_image": example_image,
+            "image": init_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_paint_by_example_inpaint(self):
+        components = self.get_dummy_components()
+
+        # make sure here that pndm scheduler skips prk
+        pipe = PaintByExamplePipeline(**components)
+        pipe = pipe.to("cpu")
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        output = pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4686, 0.5687, 0.4007, 0.5218, 0.5741, 0.4482, 0.4940, 0.4629, 0.4503])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_paint_by_example_image_tensor(self):
+        device = "cpu"
+        inputs = self.get_dummy_inputs()
+        inputs.pop("mask_image")
+        image = self.convert_to_pt(inputs.pop("image"))
+        mask_image = image.clamp(0, 1) / 2
+
+        # make sure here that pndm scheduler skips prk
+        pipe = PaintByExamplePipeline(**self.get_dummy_components())
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(image=image, mask_image=mask_image[:, 0], **inputs)
+        out_1 = output.images
+
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        mask_image = mask_image.cpu().permute(0, 2, 3, 1)[0]
+
+        image = Image.fromarray(np.uint8(image)).convert("RGB")
+        mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB")
+
+        output = pipe(**self.get_dummy_inputs())
+        out_2 = output.images
+
+        assert out_1.shape == (1, 64, 64, 3)
+        assert np.abs(out_1.flatten() - out_2.flatten()).max() < 5e-2
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+
+@nightly
+@require_torch_gpu
+class PaintByExamplePipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_paint_by_example(self):
+        # make sure here that pndm scheduler skips prk
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/paint_by_example/dog_in_bucket.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/paint_by_example/mask.png"
+        )
+        example_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/paint_by_example/panda.jpg"
+        )
+
+        pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(321)
+        output = pipe(
+            image=init_image,
+            mask_image=mask_image,
+            example_image=example_image,
+            generator=generator,
+            guidance_scale=5.0,
+            num_inference_steps=50,
+            output_type="np",
+        )
+
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.5290, 0.5374])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/pia/__init__.py b/diffusers/tests/pipelines/pia/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/pia/test_pia.py b/diffusers/tests/pipelines/pia/test_pia.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca558fbb83e5dd630adec645dddd9b642f6fce0d
--- /dev/null
+++ b/diffusers/tests/pipelines/pia/test_pia.py
@@ -0,0 +1,438 @@
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    LCMScheduler,
+    MotionAdapter,
+    PIAPipeline,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    UNetMotionModel,
+)
+from diffusers.utils import is_xformers_available, logging
+from diffusers.utils.testing_utils import floats_tensor, torch_device
+
+from ..test_pipelines_common import IPAdapterTesterMixin, PipelineFromPipeTesterMixin, PipelineTesterMixin
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase):
+    pipeline_class = PIAPipeline
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "guidance_scale",
+            "negative_prompt",
+            "prompt_embeds",
+            "negative_prompt_embeds",
+            "cross_attention_kwargs",
+        ]
+    )
+    batch_params = frozenset(["prompt", "image", "generator"])
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+
+    def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            sample_size=8,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="linear",
+            clip_sample=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=block_out_channels,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
+        motion_adapter = MotionAdapter(
+            block_out_channels=block_out_channels,
+            motion_layers_per_block=2,
+            motion_norm_num_groups=2,
+            motion_num_attention_heads=4,
+            conv_in_channels=9,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "motion_adapter": motion_adapter,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = floats_tensor((1, 3, 8, 8), rng=random.Random(seed)).to(device)
+        inputs = {
+            "image": image,
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 7.5,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
+    def test_motion_unet_loading(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        assert isinstance(pipe.unet, UNetMotionModel)
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array(
+                [
+                    0.5475,
+                    0.5769,
+                    0.4873,
+                    0.5064,
+                    0.4445,
+                    0.5876,
+                    0.5453,
+                    0.4102,
+                    0.5247,
+                    0.5370,
+                    0.3406,
+                    0.4322,
+                    0.3991,
+                    0.3756,
+                    0.5438,
+                    0.4780,
+                    0.5087,
+                    0.5248,
+                    0.6243,
+                    0.5506,
+                    0.3491,
+                    0.5440,
+                    0.6111,
+                    0.5122,
+                    0.5326,
+                    0.5180,
+                    0.5538,
+                ]
+            )
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.5476, 0.4092, 0.5289, 0.4755, 0.5092, 0.5186, 0.5403, 0.5287, 0.5467])
+        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
+    @unittest.skip("Attention slicing is not enabled in this pipeline")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for components in pipe.components.values():
+            if hasattr(components, "set_default_attn_processor"):
+                components.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
+
+        assert output_batch[0].shape[0] == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+        assert max_diff < expected_max_diff
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_prompt_embeds(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs.pop("prompt")
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
+        pipe(**inputs)
+
+    def test_free_init(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        pipe.enable_free_init(
+            num_iters=2,
+            use_fast_sampling=True,
+            method="butterworth",
+            order=4,
+            spatial_stop_frequency=0.25,
+            temporal_stop_frequency=0.25,
+        )
+        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
+        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
+
+        pipe.disable_free_init()
+        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
+        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
+
+        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
+        self.assertGreater(
+            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
+        )
+        self.assertLess(
+            max_diff_disabled,
+            1e-4,
+            "Disabling of FreeInit should lead to results similar to the default pipeline results",
+        )
+
+    def test_free_init_with_schedulers(self):
+        components = self.get_dummy_components()
+        pipe: PIAPipeline = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.to(torch_device)
+
+        inputs_normal = self.get_dummy_inputs(torch_device)
+        frames_normal = pipe(**inputs_normal).frames[0]
+
+        schedulers_to_test = [
+            DPMSolverMultistepScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                algorithm_type="dpmsolver++",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+            LCMScheduler.from_config(
+                components["scheduler"].config,
+                timestep_spacing="linspace",
+                beta_schedule="linear",
+                steps_offset=1,
+                clip_sample=False,
+            ),
+        ]
+        components.pop("scheduler")
+
+        for scheduler in schedulers_to_test:
+            components["scheduler"] = scheduler
+            pipe: PIAPipeline = self.pipeline_class(**components)
+            pipe.set_progress_bar_config(disable=None)
+            pipe.to(torch_device)
+
+            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
+
+            inputs = self.get_dummy_inputs(torch_device)
+            frames_enable_free_init = pipe(**inputs).frames[0]
+            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
+
+            self.assertGreater(
+                sum_enabled,
+                1e1,
+                "Enabling of FreeInit should lead to results different from the default pipeline results",
+            )
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = pipe(**inputs).frames[0]
+        output_without_offload = (
+            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        )
+
+        pipe.enable_xformers_memory_efficient_attention()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = pipe(**inputs).frames[0]
+        output_with_offload = (
+            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
+        )
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
diff --git a/diffusers/tests/pipelines/pipeline_params.py b/diffusers/tests/pipelines/pipeline_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e2c4dcdd9cb96af87754c76d6f33b84d9c4f2f0
--- /dev/null
+++ b/diffusers/tests/pipelines/pipeline_params.py
@@ -0,0 +1,129 @@
+# These are canonical sets of parameters for different types of pipelines.
+# They are set on subclasses of `PipelineTesterMixin` as `params` and
+# `batch_params`.
+#
+# If a pipeline's set of arguments has minor changes from one of the common sets
+# of arguments, do not make modifications to the existing common sets of arguments.
+# I.e. a text to image pipeline with non-configurable height and width arguments
+# should set its attribute as `params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`.
+
+TEXT_TO_IMAGE_PARAMS = frozenset(
+    [
+        "prompt",
+        "height",
+        "width",
+        "guidance_scale",
+        "negative_prompt",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "cross_attention_kwargs",
+    ]
+)
+
+TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"])
+
+TEXT_TO_IMAGE_IMAGE_PARAMS = frozenset([])
+
+IMAGE_TO_IMAGE_IMAGE_PARAMS = frozenset(["image"])
+
+IMAGE_VARIATION_PARAMS = frozenset(
+    [
+        "image",
+        "height",
+        "width",
+        "guidance_scale",
+    ]
+)
+
+IMAGE_VARIATION_BATCH_PARAMS = frozenset(["image"])
+
+TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset(
+    [
+        "prompt",
+        "image",
+        "height",
+        "width",
+        "guidance_scale",
+        "negative_prompt",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+    ]
+)
+
+TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset(["prompt", "image", "negative_prompt"])
+
+TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset(
+    [
+        # Text guided image variation with an image mask
+        "prompt",
+        "image",
+        "mask_image",
+        "height",
+        "width",
+        "guidance_scale",
+        "negative_prompt",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+    ]
+)
+
+TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["prompt", "image", "mask_image", "negative_prompt"])
+
+IMAGE_INPAINTING_PARAMS = frozenset(
+    [
+        # image variation with an image mask
+        "image",
+        "mask_image",
+        "height",
+        "width",
+        "guidance_scale",
+    ]
+)
+
+IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["image", "mask_image"])
+
+IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset(
+    [
+        "example_image",
+        "image",
+        "mask_image",
+        "height",
+        "width",
+        "guidance_scale",
+    ]
+)
+
+IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["example_image", "image", "mask_image"])
+
+CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS = frozenset(["class_labels"])
+
+CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS = frozenset(["class_labels"])
+
+UNCONDITIONAL_IMAGE_GENERATION_PARAMS = frozenset(["batch_size"])
+
+UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS = frozenset([])
+
+UNCONDITIONAL_AUDIO_GENERATION_PARAMS = frozenset(["batch_size"])
+
+UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS = frozenset([])
+
+TEXT_TO_AUDIO_PARAMS = frozenset(
+    [
+        "prompt",
+        "audio_length_in_s",
+        "guidance_scale",
+        "negative_prompt",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "cross_attention_kwargs",
+    ]
+)
+
+TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"])
+TOKENS_TO_AUDIO_GENERATION_PARAMS = frozenset(["input_tokens"])
+
+TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS = frozenset(["input_tokens"])
+
+TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS = frozenset(["prompt_embeds"])
+
+VIDEO_TO_VIDEO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt", "video"])
diff --git a/diffusers/tests/pipelines/pixart_alpha/__init__.py b/diffusers/tests/pipelines/pixart_alpha/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/pixart_alpha/test_pixart.py b/diffusers/tests/pipelines/pixart_alpha/test_pixart.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7039c61a448de7db7a628b2e802d68de686a165
--- /dev/null
+++ b/diffusers/tests/pipelines/pixart_alpha/test_pixart.py
@@ -0,0 +1,442 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    PixArtAlphaPipeline,
+    PixArtTransformer2DModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, to_np
+
+
+enable_full_determinism()
+
+
+class PixArtAlphaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = PixArtAlphaPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    required_optional_params = PipelineTesterMixin.required_optional_params
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = PixArtTransformer2DModel(
+            sample_size=8,
+            num_layers=2,
+            patch_size=2,
+            attention_head_dim=8,
+            num_attention_heads=3,
+            caption_channels=32,
+            in_channels=4,
+            cross_attention_dim=24,
+            out_channels=8,
+            attention_bias=True,
+            activation_fn="gelu-approximate",
+            num_embeds_ada_norm=1000,
+            norm_type="ada_norm_single",
+            norm_elementwise_affine=False,
+            norm_eps=1e-6,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL()
+
+        scheduler = DDIMScheduler()
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "use_resolution_binning": False,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        # TODO(PVP, Sayak) need to fix later
+        return
+
+    def test_save_load_optional_components(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = inputs["prompt"]
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = pipe.encode_prompt(prompt)
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attention_mask,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": negative_prompt_attention_mask,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "use_resolution_binning": False,
+        }
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attention_mask,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": negative_prompt_attention_mask,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "use_resolution_binning": False,
+        }
+
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, 1e-4)
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (1, 8, 8, 3))
+        expected_slice = np.array([0.6319, 0.3526, 0.3806, 0.6327, 0.4639, 0.483, 0.2583, 0.5331, 0.4852])
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_inference_non_square_images(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs, height=32, width=48).images
+        image_slice = image[0, -3:, -3:, -1]
+        self.assertEqual(image.shape, (1, 32, 48, 3))
+
+        expected_slice = np.array([0.6493, 0.537, 0.4081, 0.4762, 0.3695, 0.4711, 0.3026, 0.5218, 0.5263])
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_inference_with_embeddings_and_multiple_images(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = inputs["prompt"]
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        prompt_embeds, prompt_attn_mask, negative_prompt_embeds, neg_prompt_attn_mask = pipe.encode_prompt(prompt)
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attn_mask,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": neg_prompt_attn_mask,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "num_images_per_prompt": 2,
+            "use_resolution_binning": False,
+        }
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attn_mask,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": neg_prompt_attn_mask,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "num_images_per_prompt": 2,
+            "use_resolution_binning": False,
+        }
+
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, 1e-4)
+
+    def test_inference_with_multiple_images_per_prompt(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_images_per_prompt"] = 2
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (2, 8, 8, 3))
+        expected_slice = np.array([0.6319, 0.3526, 0.3806, 0.6327, 0.4639, 0.483, 0.2583, 0.5331, 0.4852])
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_raises_warning_for_mask_feature(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs.update({"mask_feature": True})
+
+        with self.assertWarns(FutureWarning) as warning_ctx:
+            _ = pipe(**inputs).images
+
+        assert "mask_feature" in str(warning_ctx.warning)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
+
+
+@slow
+@require_torch_gpu
+class PixArtAlphaPipelineIntegrationTests(unittest.TestCase):
+    ckpt_id_1024 = "PixArt-alpha/PixArt-XL-2-1024-MS"
+    ckpt_id_512 = "PixArt-alpha/PixArt-XL-2-512x512"
+    prompt = "A small cactus with a happy face in the Sahara desert."
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_pixart_1024(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+        prompt = self.prompt
+
+        image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.0742, 0.0835, 0.2114, 0.0295, 0.0784, 0.2361, 0.1738, 0.2251, 0.3589])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
+        self.assertLessEqual(max_diff, 1e-4)
+
+    def test_pixart_512(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+
+        prompt = self.prompt
+
+        image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.3477, 0.3882, 0.4541, 0.3413, 0.3821, 0.4463, 0.4001, 0.4409, 0.4958])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
+        self.assertLessEqual(max_diff, 1e-4)
+
+    def test_pixart_1024_without_resolution_binning(self):
+        generator = torch.manual_seed(0)
+
+        pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+
+        prompt = self.prompt
+        height, width = 1024, 768
+        num_inference_steps = 2
+
+        image = pipe(
+            prompt,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        generator = torch.manual_seed(0)
+        no_res_bin_image = pipe(
+            prompt,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+            use_resolution_binning=False,
+        ).images
+        no_res_bin_image_slice = no_res_bin_image[0, -3:, -3:, -1]
+
+        assert not np.allclose(image_slice, no_res_bin_image_slice, atol=1e-4, rtol=1e-4)
+
+    def test_pixart_512_without_resolution_binning(self):
+        generator = torch.manual_seed(0)
+
+        pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+
+        prompt = self.prompt
+        height, width = 512, 768
+        num_inference_steps = 2
+
+        image = pipe(
+            prompt,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        generator = torch.manual_seed(0)
+        no_res_bin_image = pipe(
+            prompt,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+            use_resolution_binning=False,
+        ).images
+        no_res_bin_image_slice = no_res_bin_image[0, -3:, -3:, -1]
+
+        assert not np.allclose(image_slice, no_res_bin_image_slice, atol=1e-4, rtol=1e-4)
diff --git a/diffusers/tests/pipelines/pixart_sigma/__init__.py b/diffusers/tests/pipelines/pixart_sigma/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/pixart_sigma/test_pixart.py b/diffusers/tests/pipelines/pixart_sigma/test_pixart.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92e99366ee3184f2592252dd79cdf5a401a4035
--- /dev/null
+++ b/diffusers/tests/pipelines/pixart_sigma/test_pixart.py
@@ -0,0 +1,481 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    PixArtSigmaPipeline,
+    PixArtTransformer2DModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fusion_matches_attn_procs_length,
+    check_qkv_fusion_processors_exist,
+    to_np,
+)
+
+
+enable_full_determinism()
+
+
+class PixArtSigmaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = PixArtSigmaPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    required_optional_params = PipelineTesterMixin.required_optional_params
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = PixArtTransformer2DModel(
+            sample_size=8,
+            num_layers=2,
+            patch_size=2,
+            attention_head_dim=8,
+            num_attention_heads=3,
+            caption_channels=32,
+            in_channels=4,
+            cross_attention_dim=24,
+            out_channels=8,
+            attention_bias=True,
+            activation_fn="gelu-approximate",
+            num_embeds_ada_norm=1000,
+            norm_type="ada_norm_single",
+            norm_elementwise_affine=False,
+            norm_eps=1e-6,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL()
+
+        scheduler = DDIMScheduler()
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "use_resolution_binning": False,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        # TODO(PVP, Sayak) need to fix later
+        return
+
+    def test_save_load_optional_components(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = inputs["prompt"]
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = pipe.encode_prompt(prompt)
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attention_mask,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": negative_prompt_attention_mask,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "use_resolution_binning": False,
+        }
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attention_mask,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": negative_prompt_attention_mask,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "use_resolution_binning": False,
+        }
+
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, 1e-4)
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (1, 8, 8, 3))
+        expected_slice = np.array([0.6319, 0.3526, 0.3806, 0.6327, 0.4639, 0.4830, 0.2583, 0.5331, 0.4852])
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_inference_non_square_images(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs, height=32, width=48).images
+        image_slice = image[0, -3:, -3:, -1]
+        self.assertEqual(image.shape, (1, 32, 48, 3))
+
+        expected_slice = np.array([0.6493, 0.5370, 0.4081, 0.4762, 0.3695, 0.4711, 0.3026, 0.5218, 0.5263])
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_inference_with_embeddings_and_multiple_images(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = inputs["prompt"]
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        prompt_embeds, prompt_attn_mask, negative_prompt_embeds, neg_prompt_attn_mask = pipe.encode_prompt(prompt)
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attn_mask,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": neg_prompt_attn_mask,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "num_images_per_prompt": 2,
+            "use_resolution_binning": False,
+        }
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        generator = inputs["generator"]
+        num_inference_steps = inputs["num_inference_steps"]
+        output_type = inputs["output_type"]
+
+        # inputs with prompt converted to embeddings
+        inputs = {
+            "prompt_embeds": prompt_embeds,
+            "prompt_attention_mask": prompt_attn_mask,
+            "negative_prompt": None,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "negative_prompt_attention_mask": neg_prompt_attn_mask,
+            "generator": generator,
+            "num_inference_steps": num_inference_steps,
+            "output_type": output_type,
+            "num_images_per_prompt": 2,
+            "use_resolution_binning": False,
+        }
+
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, 1e-4)
+
+    def test_inference_with_multiple_images_per_prompt(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_images_per_prompt"] = 2
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (2, 8, 8, 3))
+        expected_slice = np.array([0.6319, 0.3526, 0.3806, 0.6327, 0.4639, 0.4830, 0.2583, 0.5331, 0.4852])
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
+        # to the pipeline level.
+        pipe.transformer.fuse_qkv_projections()
+        assert check_qkv_fusion_processors_exist(
+            pipe.transformer
+        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+        assert check_qkv_fusion_matches_attn_procs_length(
+            pipe.transformer, pipe.transformer.original_attn_processors
+        ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_fused = image[0, -3:, -3:, -1]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_disabled = image[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+
+@slow
+@require_torch_gpu
+class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
+    ckpt_id_1024 = "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS"
+    ckpt_id_512 = "PixArt-alpha/PixArt-Sigma-XL-2-512-MS"
+    prompt = "A small cactus with a happy face in the Sahara desert."
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_pixart_1024(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+        prompt = self.prompt
+
+        image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.4517, 0.4446, 0.4375, 0.449, 0.4399, 0.4365, 0.4583, 0.4629, 0.4473])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
+        self.assertLessEqual(max_diff, 1e-4)
+
+    def test_pixart_512(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        transformer = PixArtTransformer2DModel.from_pretrained(
+            self.ckpt_id_512, subfolder="transformer", torch_dtype=torch.float16
+        )
+        pipe = PixArtSigmaPipeline.from_pretrained(
+            self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+
+        prompt = self.prompt
+
+        image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.0479, 0.0378, 0.0217, 0.0942, 0.064, 0.0791, 0.2073, 0.1975, 0.2017])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
+        self.assertLessEqual(max_diff, 1e-4)
+
+    def test_pixart_1024_without_resolution_binning(self):
+        generator = torch.manual_seed(0)
+
+        pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+
+        prompt = self.prompt
+        height, width = 1024, 768
+        num_inference_steps = 2
+
+        image = pipe(
+            prompt,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        generator = torch.manual_seed(0)
+        no_res_bin_image = pipe(
+            prompt,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+            use_resolution_binning=False,
+        ).images
+        no_res_bin_image_slice = no_res_bin_image[0, -3:, -3:, -1]
+
+        assert not np.allclose(image_slice, no_res_bin_image_slice, atol=1e-4, rtol=1e-4)
+
+    def test_pixart_512_without_resolution_binning(self):
+        generator = torch.manual_seed(0)
+
+        transformer = PixArtTransformer2DModel.from_pretrained(
+            self.ckpt_id_512, subfolder="transformer", torch_dtype=torch.float16
+        )
+        pipe = PixArtSigmaPipeline.from_pretrained(
+            self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()
+
+        prompt = self.prompt
+        height, width = 512, 768
+        num_inference_steps = 2
+
+        image = pipe(
+            prompt,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+        ).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        generator = torch.manual_seed(0)
+        no_res_bin_image = pipe(
+            prompt,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+            use_resolution_binning=False,
+        ).images
+        no_res_bin_image_slice = no_res_bin_image[0, -3:, -3:, -1]
+
+        assert not np.allclose(image_slice, no_res_bin_image_slice, atol=1e-4, rtol=1e-4)
diff --git a/diffusers/tests/pipelines/pndm/__init__.py b/diffusers/tests/pipelines/pndm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/pndm/test_pndm.py b/diffusers/tests/pipelines/pndm/test_pndm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5efb244919da673478e9e061e1d4ecfdd014c449
--- /dev/null
+++ b/diffusers/tests/pipelines/pndm/test_pndm.py
@@ -0,0 +1,87 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import PNDMPipeline, PNDMScheduler, UNet2DModel
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch, torch_device
+
+
+enable_full_determinism()
+
+
+class PNDMPipelineFastTests(unittest.TestCase):
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    def test_inference(self):
+        unet = self.dummy_uncond_unet
+        scheduler = PNDMScheduler()
+
+        pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
+        pndm.to(torch_device)
+        pndm.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = pndm(generator=generator, num_inference_steps=20, output_type="np").images
+
+        generator = torch.manual_seed(0)
+        image_from_tuple = pndm(generator=generator, num_inference_steps=20, output_type="np", return_dict=False)[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@nightly
+@require_torch
+class PNDMPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_cifar10(self):
+        model_id = "google/ddpm-cifar10-32"
+
+        unet = UNet2DModel.from_pretrained(model_id)
+        scheduler = PNDMScheduler()
+
+        pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
+        pndm.to(torch_device)
+        pndm.set_progress_bar_config(disable=None)
+        generator = torch.manual_seed(0)
+        image = pndm(generator=generator, output_type="np").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.1564, 0.14645, 0.1406, 0.14715, 0.12425, 0.14045, 0.13115, 0.12175, 0.125])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py b/diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..990c389a9c5f7683c4dd665589731fc90026c8a2
--- /dev/null
+++ b/diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py
@@ -0,0 +1,620 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
+from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    nightly,
+    require_torch_gpu,
+    torch_device,
+)
+
+
+enable_full_determinism()
+
+
+class SafeDiffusionPipelineFastTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_semantic_diffusion_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5753, 0.6114, 0.5001, 0.5034, 0.5470, 0.4729, 0.4971, 0.4867, 0.4867])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_semantic_diffusion_pndm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5122, 0.5712, 0.4825, 0.5053, 0.5646, 0.4769, 0.5179, 0.4894, 0.4994])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_semantic_diffusion_no_safety_checker(self):
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
+        )
+        assert isinstance(pipe, StableDiffusionPipeline)
+        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
+        assert pipe.safety_checker is None
+
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+        # check that there's no error when saving a pipeline with one of the models being None
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+        # sanity check that the pipeline still works
+        assert pipe.safety_checker is None
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_semantic_diffusion_fp16(self):
+        """Test that stable diffusion works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images
+
+        assert image.shape == (1, 64, 64, 3)
+
+
+@nightly
+@require_torch_gpu
+class SemanticDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_positive_guidance(self):
+        torch_device = "cuda"
+        pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photo of a cat"
+        edit = {
+            "editing_prompt": ["sunglasses"],
+            "reverse_editing_direction": [False],
+            "edit_warmup_steps": 10,
+            "edit_guidance_scale": 6,
+            "edit_threshold": 0.95,
+            "edit_momentum_scale": 0.5,
+            "edit_mom_beta": 0.6,
+        }
+
+        seed = 3
+        guidance_scale = 7
+
+        # no sega enabled
+        generator = torch.Generator(torch_device)
+        generator.manual_seed(seed)
+        output = pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [
+            0.34673113,
+            0.38492733,
+            0.37597352,
+            0.34086335,
+            0.35650748,
+            0.35579205,
+            0.3384763,
+            0.34340236,
+            0.3573271,
+        ]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        # with sega enabled
+        # generator = torch.manual_seed(seed)
+        generator.manual_seed(seed)
+        output = pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            **edit,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [
+            0.41887826,
+            0.37728766,
+            0.30138272,
+            0.41416335,
+            0.41664985,
+            0.36283392,
+            0.36191246,
+            0.43364465,
+            0.43001732,
+        ]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_negative_guidance(self):
+        torch_device = "cuda"
+        pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "an image of a crowded boulevard, realistic, 4k"
+        edit = {
+            "editing_prompt": "crowd, crowded, people",
+            "reverse_editing_direction": True,
+            "edit_warmup_steps": 10,
+            "edit_guidance_scale": 8.3,
+            "edit_threshold": 0.9,
+            "edit_momentum_scale": 0.5,
+            "edit_mom_beta": 0.6,
+        }
+
+        seed = 9
+        guidance_scale = 7
+
+        # no sega enabled
+        generator = torch.Generator(torch_device)
+        generator.manual_seed(seed)
+        output = pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [
+            0.43497998,
+            0.91814065,
+            0.7540739,
+            0.55580205,
+            0.8467265,
+            0.5389691,
+            0.62574506,
+            0.58897763,
+            0.50926757,
+        ]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        # with sega enabled
+        # generator = torch.manual_seed(seed)
+        generator.manual_seed(seed)
+        output = pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            **edit,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [
+            0.3089719,
+            0.30500144,
+            0.29016042,
+            0.30630964,
+            0.325687,
+            0.29419225,
+            0.2908091,
+            0.28723598,
+            0.27696294,
+        ]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_multi_cond_guidance(self):
+        torch_device = "cuda"
+        pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a castle next to a river"
+        edit = {
+            "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"],
+            "reverse_editing_direction": False,
+            "edit_warmup_steps": [15, 18],
+            "edit_guidance_scale": 6,
+            "edit_threshold": [0.9, 0.8],
+            "edit_momentum_scale": 0.5,
+            "edit_mom_beta": 0.6,
+        }
+
+        seed = 48
+        guidance_scale = 7
+
+        # no sega enabled
+        generator = torch.Generator(torch_device)
+        generator.manual_seed(seed)
+        output = pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [
+            0.75163555,
+            0.76037145,
+            0.61785,
+            0.9189673,
+            0.8627701,
+            0.85189694,
+            0.8512813,
+            0.87012076,
+            0.8312857,
+        ]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        # with sega enabled
+        # generator = torch.manual_seed(seed)
+        generator.manual_seed(seed)
+        output = pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            **edit,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [
+            0.73553365,
+            0.7537271,
+            0.74341905,
+            0.66480356,
+            0.6472925,
+            0.63039416,
+            0.64812905,
+            0.6749717,
+            0.6517102,
+        ]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_guidance_fp16(self):
+        torch_device = "cuda"
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
+        )
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photo of a cat"
+        edit = {
+            "editing_prompt": ["sunglasses"],
+            "reverse_editing_direction": [False],
+            "edit_warmup_steps": 10,
+            "edit_guidance_scale": 6,
+            "edit_threshold": 0.95,
+            "edit_momentum_scale": 0.5,
+            "edit_mom_beta": 0.6,
+        }
+
+        seed = 3
+        guidance_scale = 7
+
+        # no sega enabled
+        generator = torch.Generator(torch_device)
+        generator.manual_seed(seed)
+        output = pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [
+            0.34887695,
+            0.3876953,
+            0.375,
+            0.34423828,
+            0.3581543,
+            0.35717773,
+            0.3383789,
+            0.34570312,
+            0.359375,
+        ]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        # with sega enabled
+        # generator = torch.manual_seed(seed)
+        generator.manual_seed(seed)
+        output = pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            **edit,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [
+            0.42285156,
+            0.36914062,
+            0.29077148,
+            0.42041016,
+            0.41918945,
+            0.35498047,
+            0.3618164,
+            0.4423828,
+            0.43115234,
+        ]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/shap_e/__init__.py b/diffusers/tests/pipelines/shap_e/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/shap_e/test_shap_e.py b/diffusers/tests/pipelines/shap_e/test_shap_e.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf643fe47a605109ad8250a27beb6f3d75ef610
--- /dev/null
+++ b/diffusers/tests/pipelines/shap_e/test_shap_e.py
@@ -0,0 +1,261 @@
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEPipeline
+from diffusers.pipelines.shap_e import ShapERenderer
+from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = ShapEPipeline
+    params = ["prompt"]
+    batch_params = ["prompt"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "num_inference_steps",
+        "generator",
+        "latents",
+        "guidance_scale",
+        "frame_size",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 16
+
+    @property
+    def time_input_dim(self):
+        return 16
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def renderer_dim(self):
+        return 8
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config)
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "num_attention_heads": 2,
+            "attention_head_dim": 16,
+            "embedding_dim": self.time_input_dim,
+            "num_embeddings": 32,
+            "embedding_proj_dim": self.text_embedder_hidden_size,
+            "time_embed_dim": self.time_embed_dim,
+            "num_layers": 1,
+            "clip_embed_dim": self.time_input_dim * 2,
+            "additional_embeddings": 0,
+            "time_embed_act_fn": "gelu",
+            "norm_in_type": "layer",
+            "encoder_hid_proj_type": None,
+            "added_emb_type": None,
+        }
+
+        model = PriorTransformer(**model_kwargs)
+        return model
+
+    @property
+    def dummy_renderer(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "param_shapes": (
+                (self.renderer_dim, 93),
+                (self.renderer_dim, 8),
+                (self.renderer_dim, 8),
+                (self.renderer_dim, 8),
+            ),
+            "d_latent": self.time_input_dim,
+            "d_hidden": self.renderer_dim,
+            "n_output": 12,
+            "background": (
+                0.1,
+                0.1,
+                0.1,
+            ),
+        }
+        model = ShapERenderer(**model_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        shap_e_renderer = self.dummy_renderer
+
+        scheduler = HeunDiscreteScheduler(
+            beta_schedule="exp",
+            num_train_timesteps=1024,
+            prediction_type="sample",
+            use_karras_sigmas=True,
+            clip_sample=True,
+            clip_sample_range=1.0,
+        )
+        components = {
+            "prior": prior,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "shap_e_renderer": shap_e_renderer,
+            "scheduler": scheduler,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "generator": generator,
+            "num_inference_steps": 1,
+            "frame_size": 32,
+            "output_type": "latent",
+        }
+        return inputs
+
+    def test_shap_e(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images[0]
+        image = image.cpu().numpy()
+        image_slice = image[-3:, -3:]
+
+        assert image.shape == (32, 16)
+
+        expected_slice = np.array([-1.0000, -0.6559, 1.0000, -0.9096, -0.7252, 0.8211, -0.7647, -0.3308, 0.6462])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_inference_batch_consistent(self):
+        # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches
+        self._test_inference_batch_consistent(batch_sizes=[1, 2])
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=6e-3)
+
+    def test_num_images_per_prompt(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        batch_size = 1
+        num_images_per_prompt = 2
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        for key in inputs.keys():
+            if key in self.batch_params:
+                inputs[key] = batch_size * [inputs[key]]
+
+        images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0]
+
+        assert images.shape[0] == batch_size * num_images_per_prompt
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+    def test_save_load_local(self):
+        super().test_save_load_local(expected_max_difference=5e-3)
+
+    @unittest.skip("Key error is raised with accelerate")
+    def test_sequential_cpu_offload_forward_pass(self):
+        pass
+
+
+@nightly
+@require_torch_gpu
+class ShapEPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_shap_e(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/shap_e/test_shap_e_np_out.npy"
+        )
+        pipe = ShapEPipeline.from_pretrained("openai/shap-e")
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+
+        images = pipe(
+            "a shark",
+            generator=generator,
+            guidance_scale=15.0,
+            num_inference_steps=64,
+            frame_size=64,
+            output_type="np",
+        ).images[0]
+
+        assert images.shape == (20, 64, 64, 3)
+
+        assert_mean_pixel_difference(images, expected_image)
diff --git a/diffusers/tests/pipelines/shap_e/test_shap_e_img2img.py b/diffusers/tests/pipelines/shap_e/test_shap_e_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3661355e9dda3f4e2a1c2c231a494e7a09c0109
--- /dev/null
+++ b/diffusers/tests/pipelines/shap_e/test_shap_e_img2img.py
@@ -0,0 +1,290 @@
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel
+
+from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEImg2ImgPipeline
+from diffusers.pipelines.shap_e import ShapERenderer
+from diffusers.utils.testing_utils import (
+    floats_tensor,
+    load_image,
+    load_numpy,
+    nightly,
+    require_torch_gpu,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = ShapEImg2ImgPipeline
+    params = ["image"]
+    batch_params = ["image"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "num_inference_steps",
+        "generator",
+        "latents",
+        "guidance_scale",
+        "frame_size",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 16
+
+    @property
+    def time_input_dim(self):
+        return 16
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def renderer_dim(self):
+        return 8
+
+    @property
+    def dummy_image_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=self.text_embedder_hidden_size,
+            image_size=32,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=24,
+            num_attention_heads=2,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=1,
+        )
+
+        model = CLIPVisionModel(config)
+        return model
+
+    @property
+    def dummy_image_processor(self):
+        image_processor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        return image_processor
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "num_attention_heads": 2,
+            "attention_head_dim": 16,
+            "embedding_dim": self.time_input_dim,
+            "num_embeddings": 32,
+            "embedding_proj_dim": self.text_embedder_hidden_size,
+            "time_embed_dim": self.time_embed_dim,
+            "num_layers": 1,
+            "clip_embed_dim": self.time_input_dim * 2,
+            "additional_embeddings": 0,
+            "time_embed_act_fn": "gelu",
+            "norm_in_type": "layer",
+            "embedding_proj_norm_type": "layer",
+            "encoder_hid_proj_type": None,
+            "added_emb_type": None,
+        }
+
+        model = PriorTransformer(**model_kwargs)
+        return model
+
+    @property
+    def dummy_renderer(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "param_shapes": (
+                (self.renderer_dim, 93),
+                (self.renderer_dim, 8),
+                (self.renderer_dim, 8),
+                (self.renderer_dim, 8),
+            ),
+            "d_latent": self.time_input_dim,
+            "d_hidden": self.renderer_dim,
+            "n_output": 12,
+            "background": (
+                0.1,
+                0.1,
+                0.1,
+            ),
+        }
+        model = ShapERenderer(**model_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        image_encoder = self.dummy_image_encoder
+        image_processor = self.dummy_image_processor
+        shap_e_renderer = self.dummy_renderer
+
+        scheduler = HeunDiscreteScheduler(
+            beta_schedule="exp",
+            num_train_timesteps=1024,
+            prediction_type="sample",
+            use_karras_sigmas=True,
+            clip_sample=True,
+            clip_sample_range=1.0,
+        )
+        components = {
+            "prior": prior,
+            "image_encoder": image_encoder,
+            "image_processor": image_processor,
+            "shap_e_renderer": shap_e_renderer,
+            "scheduler": scheduler,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": input_image,
+            "generator": generator,
+            "num_inference_steps": 1,
+            "frame_size": 32,
+            "output_type": "latent",
+        }
+        return inputs
+
+    def test_shap_e(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images[0]
+        image_slice = image[-3:, -3:].cpu().numpy()
+
+        assert image.shape == (32, 16)
+
+        expected_slice = np.array(
+            [-1.0, 0.40668195, 0.57322013, -0.9469888, 0.4283227, 0.30348337, -0.81094897, 0.74555075, 0.15342723]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_inference_batch_consistent(self):
+        # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches
+        self._test_inference_batch_consistent(batch_sizes=[2])
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            batch_size=2,
+            expected_max_diff=6e-3,
+        )
+
+    def test_num_images_per_prompt(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        batch_size = 1
+        num_images_per_prompt = 2
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        for key in inputs.keys():
+            if key in self.batch_params:
+                inputs[key] = batch_size * [inputs[key]]
+
+        images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0]
+
+        assert images.shape[0] == batch_size * num_images_per_prompt
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+    def test_save_load_local(self):
+        super().test_save_load_local(expected_max_difference=5e-3)
+
+    @unittest.skip("Key error is raised with accelerate")
+    def test_sequential_cpu_offload_forward_pass(self):
+        pass
+
+
+@nightly
+@require_torch_gpu
+class ShapEImg2ImgPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_shap_e_img2img(self):
+        input_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/shap_e/corgi.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/shap_e/test_shap_e_img2img_out.npy"
+        )
+        pipe = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img")
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+
+        images = pipe(
+            input_image,
+            generator=generator,
+            guidance_scale=3.0,
+            num_inference_steps=64,
+            frame_size=64,
+            output_type="np",
+        ).images[0]
+
+        assert images.shape == (20, 64, 64, 3)
+
+        assert_mean_pixel_difference(images, expected_image)
diff --git a/diffusers/tests/pipelines/stable_audio/__init__.py b/diffusers/tests/pipelines/stable_audio/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_audio/test_stable_audio.py b/diffusers/tests/pipelines/stable_audio/test_stable_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..41ac94891c6f36859c7af009471dfa6c8038e5b2
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_audio/test_stable_audio.py
@@ -0,0 +1,462 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    T5EncoderModel,
+    T5Tokenizer,
+)
+
+from diffusers import (
+    AutoencoderOobleck,
+    CosineDPMSolverMultistepScheduler,
+    StableAudioDiTModel,
+    StableAudioPipeline,
+    StableAudioProjectionModel,
+)
+from diffusers.utils import is_xformers_available
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+
+from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableAudioPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = StableAudioPipeline
+    params = frozenset(
+        [
+            "prompt",
+            "audio_end_in_s",
+            "audio_start_in_s",
+            "guidance_scale",
+            "negative_prompt",
+            "prompt_embeds",
+            "negative_prompt_embeds",
+            "initial_audio_waveforms",
+        ]
+    )
+    batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "num_waveforms_per_prompt",
+            "generator",
+            "latents",
+            "output_type",
+            "return_dict",
+            "callback",
+            "callback_steps",
+        ]
+    )
+    # There is not xformers version of the StableAudioPipeline custom attention processor
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = StableAudioDiTModel(
+            sample_size=4,
+            in_channels=3,
+            num_layers=2,
+            attention_head_dim=4,
+            num_key_value_attention_heads=2,
+            out_channels=3,
+            cross_attention_dim=4,
+            time_proj_dim=8,
+            global_states_input_dim=8,
+            cross_attention_input_dim=4,
+        )
+        scheduler = CosineDPMSolverMultistepScheduler(
+            solver_order=2,
+            prediction_type="v_prediction",
+            sigma_data=1.0,
+            sigma_schedule="exponential",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderOobleck(
+            encoder_hidden_size=6,
+            downsampling_ratios=[1, 2],
+            decoder_channels=3,
+            decoder_input_channels=3,
+            audio_channels=2,
+            channel_multiples=[2, 4],
+            sampling_rate=4,
+        )
+        torch.manual_seed(0)
+        t5_repo_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration"
+        text_encoder = T5EncoderModel.from_pretrained(t5_repo_id)
+        tokenizer = T5Tokenizer.from_pretrained(t5_repo_id, truncation=True, model_max_length=25)
+
+        torch.manual_seed(0)
+        projection_model = StableAudioProjectionModel(
+            text_encoder_dim=text_encoder.config.d_model,
+            conditioning_dim=4,
+            min_value=0,
+            max_value=32,
+        )
+
+        components = {
+            "transformer": transformer,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "projection_model": projection_model,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A hammer hitting a wooden surface",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+        }
+        return inputs
+
+    def test_save_load_local(self):
+        # increase tolerance from 1e-4 -> 7e-3 to account for large composite model
+        super().test_save_load_local(expected_max_difference=7e-3)
+
+    def test_save_load_optional_components(self):
+        # increase tolerance from 1e-4 -> 7e-3 to account for large composite model
+        super().test_save_load_optional_components(expected_max_difference=7e-3)
+
+    def test_stable_audio_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        stable_audio_pipe = StableAudioPipeline(**components)
+        stable_audio_pipe = stable_audio_pipe.to(torch_device)
+        stable_audio_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = stable_audio_pipe(**inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 2
+        assert audio.shape == (2, 7)
+
+    def test_stable_audio_without_prompts(self):
+        components = self.get_dummy_components()
+        stable_audio_pipe = StableAudioPipeline(**components)
+        stable_audio_pipe = stable_audio_pipe.to(torch_device)
+        stable_audio_pipe = stable_audio_pipe.to(torch_device)
+        stable_audio_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = stable_audio_pipe(**inputs)
+        audio_1 = output.audios[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        text_inputs = stable_audio_pipe.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=stable_audio_pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        ).to(torch_device)
+        text_input_ids = text_inputs.input_ids
+        attention_mask = text_inputs.attention_mask
+
+        prompt_embeds = stable_audio_pipe.text_encoder(
+            text_input_ids,
+            attention_mask=attention_mask,
+        )[0]
+
+        inputs["prompt_embeds"] = prompt_embeds
+        inputs["attention_mask"] = attention_mask
+
+        # forward
+        output = stable_audio_pipe(**inputs)
+        audio_2 = output.audios[0]
+
+        assert (audio_1 - audio_2).abs().max() < 1e-2
+
+    def test_stable_audio_negative_without_prompts(self):
+        components = self.get_dummy_components()
+        stable_audio_pipe = StableAudioPipeline(**components)
+        stable_audio_pipe = stable_audio_pipe.to(torch_device)
+        stable_audio_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = stable_audio_pipe(**inputs)
+        audio_1 = output.audios[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        text_inputs = stable_audio_pipe.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=stable_audio_pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        ).to(torch_device)
+        text_input_ids = text_inputs.input_ids
+        attention_mask = text_inputs.attention_mask
+
+        prompt_embeds = stable_audio_pipe.text_encoder(
+            text_input_ids,
+            attention_mask=attention_mask,
+        )[0]
+
+        inputs["prompt_embeds"] = prompt_embeds
+        inputs["attention_mask"] = attention_mask
+
+        negative_text_inputs = stable_audio_pipe.tokenizer(
+            negative_prompt,
+            padding="max_length",
+            max_length=stable_audio_pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        ).to(torch_device)
+        negative_text_input_ids = negative_text_inputs.input_ids
+        negative_attention_mask = negative_text_inputs.attention_mask
+
+        negative_prompt_embeds = stable_audio_pipe.text_encoder(
+            negative_text_input_ids,
+            attention_mask=negative_attention_mask,
+        )[0]
+
+        inputs["negative_prompt_embeds"] = negative_prompt_embeds
+        inputs["negative_attention_mask"] = negative_attention_mask
+
+        # forward
+        output = stable_audio_pipe(**inputs)
+        audio_2 = output.audios[0]
+
+        assert (audio_1 - audio_2).abs().max() < 1e-2
+
+    def test_stable_audio_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        stable_audio_pipe = StableAudioPipeline(**components)
+        stable_audio_pipe = stable_audio_pipe.to(device)
+        stable_audio_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "egg cracking"
+        output = stable_audio_pipe(**inputs, negative_prompt=negative_prompt)
+        audio = output.audios[0]
+
+        assert audio.ndim == 2
+        assert audio.shape == (2, 7)
+
+    def test_stable_audio_num_waveforms_per_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        stable_audio_pipe = StableAudioPipeline(**components)
+        stable_audio_pipe = stable_audio_pipe.to(device)
+        stable_audio_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A hammer hitting a wooden surface"
+
+        # test num_waveforms_per_prompt=1 (default)
+        audios = stable_audio_pipe(prompt, num_inference_steps=2).audios
+
+        assert audios.shape == (1, 2, 7)
+
+        # test num_waveforms_per_prompt=1 (default) for batch of prompts
+        batch_size = 2
+        audios = stable_audio_pipe([prompt] * batch_size, num_inference_steps=2).audios
+
+        assert audios.shape == (batch_size, 2, 7)
+
+        # test num_waveforms_per_prompt for single prompt
+        num_waveforms_per_prompt = 2
+        audios = stable_audio_pipe(
+            prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
+        ).audios
+
+        assert audios.shape == (num_waveforms_per_prompt, 2, 7)
+
+        # test num_waveforms_per_prompt for batch of prompts
+        batch_size = 2
+        audios = stable_audio_pipe(
+            [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
+        ).audios
+
+        assert audios.shape == (batch_size * num_waveforms_per_prompt, 2, 7)
+
+    def test_stable_audio_audio_end_in_s(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        stable_audio_pipe = StableAudioPipeline(**components)
+        stable_audio_pipe = stable_audio_pipe.to(torch_device)
+        stable_audio_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = stable_audio_pipe(audio_end_in_s=1.5, **inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 2
+        assert audio.shape[1] / stable_audio_pipe.vae.sampling_rate == 1.5
+
+        output = stable_audio_pipe(audio_end_in_s=1.1875, **inputs)
+        audio = output.audios[0]
+
+        assert audio.ndim == 2
+        assert audio.shape[1] / stable_audio_pipe.vae.sampling_rate == 1.0
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=5e-4)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
+    def test_stable_audio_input_waveform(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        stable_audio_pipe = StableAudioPipeline(**components)
+        stable_audio_pipe = stable_audio_pipe.to(device)
+        stable_audio_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A hammer hitting a wooden surface"
+
+        initial_audio_waveforms = torch.ones((1, 5))
+
+        # test raises error when no sampling rate
+        with self.assertRaises(ValueError):
+            audios = stable_audio_pipe(
+                prompt, num_inference_steps=2, initial_audio_waveforms=initial_audio_waveforms
+            ).audios
+
+        # test raises error when wrong sampling rate
+        with self.assertRaises(ValueError):
+            audios = stable_audio_pipe(
+                prompt,
+                num_inference_steps=2,
+                initial_audio_waveforms=initial_audio_waveforms,
+                initial_audio_sampling_rate=stable_audio_pipe.vae.sampling_rate - 1,
+            ).audios
+
+        audios = stable_audio_pipe(
+            prompt,
+            num_inference_steps=2,
+            initial_audio_waveforms=initial_audio_waveforms,
+            initial_audio_sampling_rate=stable_audio_pipe.vae.sampling_rate,
+        ).audios
+        assert audios.shape == (1, 2, 7)
+
+        # test works with num_waveforms_per_prompt
+        num_waveforms_per_prompt = 2
+        audios = stable_audio_pipe(
+            prompt,
+            num_inference_steps=2,
+            num_waveforms_per_prompt=num_waveforms_per_prompt,
+            initial_audio_waveforms=initial_audio_waveforms,
+            initial_audio_sampling_rate=stable_audio_pipe.vae.sampling_rate,
+        ).audios
+
+        assert audios.shape == (num_waveforms_per_prompt, 2, 7)
+
+        # test num_waveforms_per_prompt for batch of prompts and input audio (two channels)
+        batch_size = 2
+        initial_audio_waveforms = torch.ones((batch_size, 2, 5))
+        audios = stable_audio_pipe(
+            [prompt] * batch_size,
+            num_inference_steps=2,
+            num_waveforms_per_prompt=num_waveforms_per_prompt,
+            initial_audio_waveforms=initial_audio_waveforms,
+            initial_audio_sampling_rate=stable_audio_pipe.vae.sampling_rate,
+        ).audios
+
+        assert audios.shape == (batch_size * num_waveforms_per_prompt, 2, 7)
+
+    @unittest.skip("Not supported yet")
+    def test_sequential_cpu_offload_forward_pass(self):
+        pass
+
+    @unittest.skip("Not supported yet")
+    def test_sequential_offload_forward_pass_twice(self):
+        pass
+
+
+@nightly
+@require_torch_gpu
+class StableAudioPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 64, 1024))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "A hammer hitting a wooden surface",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "audio_end_in_s": 30,
+            "guidance_scale": 2.5,
+        }
+        return inputs
+
+    def test_stable_audio(self):
+        stable_audio_pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0")
+        stable_audio_pipe = stable_audio_pipe.to(torch_device)
+        stable_audio_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 25
+        audio = stable_audio_pipe(**inputs).audios[0]
+
+        assert audio.ndim == 2
+        assert audio.shape == (2, int(inputs["audio_end_in_s"] * stable_audio_pipe.vae.sampling_rate))
+        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
+        audio_slice = audio[0, 447590:447600]
+        # fmt: off
+        expected_slice = np.array(
+            [-0.0278,  0.1096,  0.1877,  0.3178,  0.5329,  0.6990,  0.6972,  0.6186, 0.5608,  0.5060]
+        )
+         # fmt: one
+        max_diff = np.abs(expected_slice - audio_slice.detach().cpu().numpy()).max()
+        assert max_diff < 1.5e-3
diff --git a/diffusers/tests/pipelines/stable_cascade/__init__.py b/diffusers/tests/pipelines/stable_cascade/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_cascade/test_stable_cascade_combined.py b/diffusers/tests/pipelines/stable_cascade/test_stable_cascade_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..d256deed376c0597034dcf2a84ab50395a293290
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_cascade/test_stable_cascade_combined.py
@@ -0,0 +1,281 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import DDPMWuerstchenScheduler, StableCascadeCombinedPipeline
+from diffusers.models import StableCascadeUNet
+from diffusers.pipelines.wuerstchen import PaellaVQModel
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableCascadeCombinedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = StableCascadeCombinedPipeline
+    params = ["prompt"]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "prior_guidance_scale",
+        "decoder_guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "prior_num_inference_steps",
+        "output_type",
+    ]
+    test_xformers_attention = True
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "conditioning_dim": 128,
+            "block_out_channels": (128, 128),
+            "num_attention_heads": (2, 2),
+            "down_num_layers_per_block": (1, 1),
+            "up_num_layers_per_block": (1, 1),
+            "clip_image_in_channels": 768,
+            "switch_level": (False,),
+            "clip_text_in_channels": self.text_embedder_hidden_size,
+            "clip_text_pooled_in_channels": self.text_embedder_hidden_size,
+        }
+
+        model = StableCascadeUNet(**model_kwargs)
+        return model.eval()
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            projection_dim=self.text_embedder_hidden_size,
+            hidden_size=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config).eval()
+
+    @property
+    def dummy_vqgan(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "bottleneck_blocks": 1,
+            "num_vq_embeddings": 2,
+        }
+        model = PaellaVQModel(**model_kwargs)
+        return model.eval()
+
+    @property
+    def dummy_decoder(self):
+        torch.manual_seed(0)
+        model_kwargs = {
+            "in_channels": 4,
+            "out_channels": 4,
+            "conditioning_dim": 128,
+            "block_out_channels": (16, 32, 64, 128),
+            "num_attention_heads": (-1, -1, 1, 2),
+            "down_num_layers_per_block": (1, 1, 1, 1),
+            "up_num_layers_per_block": (1, 1, 1, 1),
+            "down_blocks_repeat_mappers": (1, 1, 1, 1),
+            "up_blocks_repeat_mappers": (3, 3, 2, 2),
+            "block_types_per_layer": (
+                ("SDCascadeResBlock", "SDCascadeTimestepBlock"),
+                ("SDCascadeResBlock", "SDCascadeTimestepBlock"),
+                ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
+                ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
+            ),
+            "switch_level": None,
+            "clip_text_pooled_in_channels": 32,
+            "dropout": (0.1, 0.1, 0.1, 0.1),
+        }
+
+        model = StableCascadeUNet(**model_kwargs)
+        return model.eval()
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+
+        scheduler = DDPMWuerstchenScheduler()
+        tokenizer = self.dummy_tokenizer
+        text_encoder = self.dummy_text_encoder
+        decoder = self.dummy_decoder
+        vqgan = self.dummy_vqgan
+        prior_text_encoder = self.dummy_text_encoder
+        prior_tokenizer = self.dummy_tokenizer
+
+        components = {
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "decoder": decoder,
+            "scheduler": scheduler,
+            "vqgan": vqgan,
+            "prior_text_encoder": prior_text_encoder,
+            "prior_tokenizer": prior_tokenizer,
+            "prior_prior": prior,
+            "prior_scheduler": scheduler,
+            "prior_feature_extractor": None,
+            "prior_image_encoder": None,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "generator": generator,
+            "prior_guidance_scale": 4.0,
+            "decoder_guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "prior_num_inference_steps": 2,
+            "output_type": "np",
+            "height": 128,
+            "width": 128,
+        }
+        return inputs
+
+    def test_stable_cascade(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[-3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+
+        expected_slice = np.array([0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0])
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=2e-2)
+
+    @unittest.skip(reason="fp16 not supported")
+    def test_float16_inference(self):
+        super().test_float16_inference()
+
+    @unittest.skip(reason="no callback test for combined pipeline")
+    def test_callback_inputs(self):
+        super().test_callback_inputs()
+
+    def test_stable_cascade_combined_prompt_embeds(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = StableCascadeCombinedPipeline(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A photograph of a shiba inu, wearing a hat"
+        (
+            prompt_embeds,
+            prompt_embeds_pooled,
+            negative_prompt_embeds,
+            negative_prompt_embeds_pooled,
+        ) = pipe.prior_pipe.encode_prompt(device, 1, 1, False, prompt=prompt)
+        generator = torch.Generator(device=device)
+
+        output_prompt = pipe(
+            prompt=prompt,
+            num_inference_steps=1,
+            prior_num_inference_steps=1,
+            output_type="np",
+            generator=generator.manual_seed(0),
+        )
+        output_prompt_embeds = pipe(
+            prompt=None,
+            prompt_embeds=prompt_embeds,
+            prompt_embeds_pooled=prompt_embeds_pooled,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
+            num_inference_steps=1,
+            prior_num_inference_steps=1,
+            output_type="np",
+            generator=generator.manual_seed(0),
+        )
+
+        assert np.abs(output_prompt.images - output_prompt_embeds.images).max() < 1e-5
diff --git a/diffusers/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py b/diffusers/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..07e4244e3c6829763c6eee7f290ebca3b6c21456
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py
@@ -0,0 +1,353 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import DDPMWuerstchenScheduler, StableCascadeDecoderPipeline
+from diffusers.models import StableCascadeUNet
+from diffusers.pipelines.wuerstchen import PaellaVQModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_numpy,
+    load_pt,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableCascadeDecoderPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = StableCascadeDecoderPipeline
+    params = ["prompt"]
+    batch_params = ["image_embeddings", "prompt", "negative_prompt"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "num_inference_steps",
+        "latents",
+        "negative_prompt",
+        "guidance_scale",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+    callback_cfg_params = ["image_embeddings", "text_encoder_hidden_states"]
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            projection_dim=self.text_embedder_hidden_size,
+            hidden_size=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config).eval()
+
+    @property
+    def dummy_vqgan(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "bottleneck_blocks": 1,
+            "num_vq_embeddings": 2,
+        }
+        model = PaellaVQModel(**model_kwargs)
+        return model.eval()
+
+    @property
+    def dummy_decoder(self):
+        torch.manual_seed(0)
+        model_kwargs = {
+            "in_channels": 4,
+            "out_channels": 4,
+            "conditioning_dim": 128,
+            "block_out_channels": [16, 32, 64, 128],
+            "num_attention_heads": [-1, -1, 1, 2],
+            "down_num_layers_per_block": [1, 1, 1, 1],
+            "up_num_layers_per_block": [1, 1, 1, 1],
+            "down_blocks_repeat_mappers": [1, 1, 1, 1],
+            "up_blocks_repeat_mappers": [3, 3, 2, 2],
+            "block_types_per_layer": [
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+            ],
+            "switch_level": None,
+            "clip_text_pooled_in_channels": 32,
+            "dropout": [0.1, 0.1, 0.1, 0.1],
+        }
+        model = StableCascadeUNet(**model_kwargs)
+        return model.eval()
+
+    def get_dummy_components(self):
+        decoder = self.dummy_decoder
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        vqgan = self.dummy_vqgan
+
+        scheduler = DDPMWuerstchenScheduler()
+
+        components = {
+            "decoder": decoder,
+            "vqgan": vqgan,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "latent_dim_scale": 4.0,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image_embeddings": torch.ones((1, 4, 4, 4), device=device),
+            "prompt": "horse",
+            "generator": generator,
+            "guidance_scale": 2.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_wuerstchen_decoder(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+        test_mean_pixel_difference = False
+
+        self._test_attention_slicing_forward_pass(
+            test_max_difference=test_max_difference,
+            test_mean_pixel_difference=test_mean_pixel_difference,
+        )
+
+    @unittest.skip(reason="fp16 not supported")
+    def test_float16_inference(self):
+        super().test_float16_inference()
+
+    def test_stable_cascade_decoder_prompt_embeds(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = StableCascadeDecoderPipeline(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image_embeddings = inputs["image_embeddings"]
+        prompt = "A photograph of a shiba inu, wearing a hat"
+        (
+            prompt_embeds,
+            prompt_embeds_pooled,
+            negative_prompt_embeds,
+            negative_prompt_embeds_pooled,
+        ) = pipe.encode_prompt(device, 1, 1, False, prompt=prompt)
+        generator = torch.Generator(device=device)
+
+        decoder_output_prompt = pipe(
+            image_embeddings=image_embeddings,
+            prompt=prompt,
+            num_inference_steps=1,
+            output_type="np",
+            generator=generator.manual_seed(0),
+        )
+        decoder_output_prompt_embeds = pipe(
+            image_embeddings=image_embeddings,
+            prompt=None,
+            prompt_embeds=prompt_embeds,
+            prompt_embeds_pooled=prompt_embeds_pooled,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
+            num_inference_steps=1,
+            output_type="np",
+            generator=generator.manual_seed(0),
+        )
+
+        assert np.abs(decoder_output_prompt.images - decoder_output_prompt_embeds.images).max() < 1e-5
+
+    def test_stable_cascade_decoder_single_prompt_multiple_image_embeddings(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = StableCascadeDecoderPipeline(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        prior_num_images_per_prompt = 2
+        decoder_num_images_per_prompt = 2
+        prompt = ["a cat"]
+        batch_size = len(prompt)
+
+        generator = torch.Generator(device)
+        image_embeddings = randn_tensor(
+            (batch_size * prior_num_images_per_prompt, 4, 4, 4), generator=generator.manual_seed(0)
+        )
+        decoder_output = pipe(
+            image_embeddings=image_embeddings,
+            prompt=prompt,
+            num_inference_steps=1,
+            output_type="np",
+            guidance_scale=0.0,
+            generator=generator.manual_seed(0),
+            num_images_per_prompt=decoder_num_images_per_prompt,
+        )
+
+        assert decoder_output.images.shape[0] == (
+            batch_size * prior_num_images_per_prompt * decoder_num_images_per_prompt
+        )
+
+    def test_stable_cascade_decoder_single_prompt_multiple_image_embeddings_with_guidance(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = StableCascadeDecoderPipeline(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        prior_num_images_per_prompt = 2
+        decoder_num_images_per_prompt = 2
+        prompt = ["a cat"]
+        batch_size = len(prompt)
+
+        generator = torch.Generator(device)
+        image_embeddings = randn_tensor(
+            (batch_size * prior_num_images_per_prompt, 4, 4, 4), generator=generator.manual_seed(0)
+        )
+        decoder_output = pipe(
+            image_embeddings=image_embeddings,
+            prompt=prompt,
+            num_inference_steps=1,
+            output_type="np",
+            guidance_scale=2.0,
+            generator=generator.manual_seed(0),
+            num_images_per_prompt=decoder_num_images_per_prompt,
+        )
+
+        assert decoder_output.images.shape[0] == (
+            batch_size * prior_num_images_per_prompt * decoder_num_images_per_prompt
+        )
+
+
+@slow
+@require_torch_gpu
+class StableCascadeDecoderPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_cascade_decoder(self):
+        pipe = StableCascadeDecoderPipeline.from_pretrained(
+            "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_embedding = load_pt(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_cascade/image_embedding.pt"
+        )
+
+        image = pipe(
+            prompt=prompt,
+            image_embeddings=image_embedding,
+            output_type="np",
+            num_inference_steps=2,
+            generator=generator,
+        ).images[0]
+
+        assert image.shape == (1024, 1024, 3)
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_cascade/stable_cascade_decoder_image.npy"
+        )
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/stable_cascade/test_stable_cascade_prior.py b/diffusers/tests/pipelines/stable_cascade/test_stable_cascade_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..0208224a1d808aeb84025a9f3d92666589050667
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_cascade/test_stable_cascade_prior.py
@@ -0,0 +1,313 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import DDPMWuerstchenScheduler, StableCascadePriorPipeline
+from diffusers.models import StableCascadeUNet
+from diffusers.utils.import_utils import is_peft_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_numpy,
+    numpy_cosine_similarity_distance,
+    require_peft_backend,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+
+if is_peft_available():
+    from peft import LoraConfig
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableCascadePriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = StableCascadePriorPipeline
+    params = ["prompt"]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "generator",
+        "num_inference_steps",
+        "latents",
+        "negative_prompt",
+        "guidance_scale",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+    callback_cfg_params = ["text_encoder_hidden_states"]
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config).eval()
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "conditioning_dim": 128,
+            "block_out_channels": (128, 128),
+            "num_attention_heads": (2, 2),
+            "down_num_layers_per_block": (1, 1),
+            "up_num_layers_per_block": (1, 1),
+            "switch_level": (False,),
+            "clip_image_in_channels": 768,
+            "clip_text_in_channels": self.text_embedder_hidden_size,
+            "clip_text_pooled_in_channels": self.text_embedder_hidden_size,
+            "dropout": (0.1, 0.1),
+        }
+
+        model = StableCascadeUNet(**model_kwargs)
+        return model.eval()
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+
+        scheduler = DDPMWuerstchenScheduler()
+
+        components = {
+            "prior": prior,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "generator": generator,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_wuerstchen_prior(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.image_embeddings
+
+        image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0]
+
+        image_slice = image[0, 0, 0, -10:]
+
+        image_from_tuple_slice = image_from_tuple[0, 0, 0, -10:]
+        assert image.shape == (1, 16, 24, 24)
+
+        expected_slice = np.array(
+            [94.5498, -21.9481, -117.5025, -192.8760, 38.0117, 73.4709, 38.1142, -185.5593, -47.7869, 167.2853]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-2
+
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-1)
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+        test_mean_pixel_difference = False
+
+        self._test_attention_slicing_forward_pass(
+            test_max_difference=test_max_difference,
+            test_mean_pixel_difference=test_mean_pixel_difference,
+        )
+
+    @unittest.skip(reason="fp16 not supported")
+    def test_float16_inference(self):
+        super().test_float16_inference()
+
+    def check_if_lora_correctly_set(self, model) -> bool:
+        """
+        Checks if the LoRA layers are correctly set with peft
+        """
+        for module in model.modules():
+            if isinstance(module, BaseTunerLayer):
+                return True
+        return False
+
+    def get_lora_components(self):
+        prior = self.dummy_prior
+
+        prior_lora_config = LoraConfig(
+            r=4, lora_alpha=4, target_modules=["to_q", "to_k", "to_v", "to_out.0"], init_lora_weights=False
+        )
+
+        return prior, prior_lora_config
+
+    @require_peft_backend
+    @unittest.skip(reason="no lora support for now")
+    def test_inference_with_prior_lora(self):
+        _, prior_lora_config = self.get_lora_components()
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output_no_lora = pipe(**self.get_dummy_inputs(device))
+        image_embed = output_no_lora.image_embeddings
+        self.assertTrue(image_embed.shape == (1, 16, 24, 24))
+
+        pipe.prior.add_adapter(prior_lora_config)
+        self.assertTrue(self.check_if_lora_correctly_set(pipe.prior), "Lora not correctly set in prior")
+
+        output_lora = pipe(**self.get_dummy_inputs(device))
+        lora_image_embed = output_lora.image_embeddings
+
+        self.assertTrue(image_embed.shape == lora_image_embed.shape)
+
+    def test_stable_cascade_decoder_prompt_embeds(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A photograph of a shiba inu, wearing a hat"
+        (
+            prompt_embeds,
+            prompt_embeds_pooled,
+            negative_prompt_embeds,
+            negative_prompt_embeds_pooled,
+        ) = pipe.encode_prompt(device, 1, 1, False, prompt=prompt)
+        generator = torch.Generator(device=device)
+
+        output_prompt = pipe(
+            prompt=prompt,
+            num_inference_steps=1,
+            output_type="np",
+            generator=generator.manual_seed(0),
+        )
+        output_prompt_embeds = pipe(
+            prompt=None,
+            prompt_embeds=prompt_embeds,
+            prompt_embeds_pooled=prompt_embeds_pooled,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
+            num_inference_steps=1,
+            output_type="np",
+            generator=generator.manual_seed(0),
+        )
+
+        assert np.abs(output_prompt.image_embeddings - output_prompt_embeds.image_embeddings).max() < 1e-5
+
+
+@slow
+@require_torch_gpu
+class StableCascadePriorPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_cascade_prior(self):
+        pipe = StableCascadePriorPipeline.from_pretrained(
+            "stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        output = pipe(prompt, num_inference_steps=2, output_type="np", generator=generator)
+        image_embedding = output.image_embeddings
+        expected_image_embedding = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_cascade/stable_cascade_prior_image_embeddings.npy"
+        )
+        assert image_embedding.shape == (1, 16, 24, 24)
+
+        max_diff = numpy_cosine_similarity_distance(image_embedding.flatten(), expected_image_embedding.flatten())
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/stable_diffusion/__init__.py b/diffusers/tests/pipelines/stable_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7036dee47f0646e88cdbd6d5dc7cab66b643d3a
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
@@ -0,0 +1,376 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+import unittest
+
+import numpy as np
+
+from diffusers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    OnnxStableDiffusionPipeline,
+    PNDMScheduler,
+)
+from diffusers.utils.testing_utils import is_onnx_available, nightly, require_onnxruntime, require_torch_gpu
+
+from ..test_pipelines_onnx_common import OnnxPipelineTesterMixin
+
+
+if is_onnx_available():
+    import onnxruntime as ort
+
+
+class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase):
+    hub_checkpoint = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline"
+
+    def get_dummy_inputs(self, seed=0):
+        generator = np.random.RandomState(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pipeline_default_ddim(self):
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.65072, 0.58492, 0.48219, 0.55521, 0.53180, 0.55939, 0.50697, 0.39800, 0.46455])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_pipeline_pndm(self):
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.65863, 0.59425, 0.49326, 0.56313, 0.53875, 0.56627, 0.51065, 0.39777, 0.46330])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_pipeline_lms(self):
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.53755, 0.60786, 0.47402, 0.49488, 0.51869, 0.49819, 0.47985, 0.38957, 0.44279])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_pipeline_euler(self):
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.53755, 0.60786, 0.47402, 0.49488, 0.51869, 0.49819, 0.47985, 0.38957, 0.44279])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_pipeline_euler_ancestral(self):
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.53817, 0.60812, 0.47384, 0.49530, 0.51894, 0.49814, 0.47984, 0.38958, 0.44271])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_pipeline_dpm_multistep(self):
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.53895, 0.60808, 0.47933, 0.49608, 0.51886, 0.49950, 0.48053, 0.38957, 0.44200])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_prompt_embeds(self):
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        inputs = self.get_dummy_inputs()
+        prompt = 3 * [inputs.pop("prompt")]
+
+        text_inputs = pipe.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="np",
+        )
+        text_inputs = text_inputs["input_ids"]
+
+        prompt_embeds = pipe.text_encoder(input_ids=text_inputs.astype(np.int32))[0]
+
+        inputs["prompt_embeds"] = prompt_embeds
+
+        # forward
+        output = pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_stable_diffusion_negative_prompt_embeds(self):
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        inputs = self.get_dummy_inputs()
+        prompt = 3 * [inputs.pop("prompt")]
+
+        embeds = []
+        for p in [prompt, negative_prompt]:
+            text_inputs = pipe.tokenizer(
+                p,
+                padding="max_length",
+                max_length=pipe.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            text_inputs = text_inputs["input_ids"]
+
+            embeds.append(pipe.text_encoder(input_ids=text_inputs.astype(np.int32))[0])
+
+        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
+
+        # forward
+        output = pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+
+@nightly
+@require_onnxruntime
+@require_torch_gpu
+class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase):
+    @property
+    def gpu_provider(self):
+        return (
+            "CUDAExecutionProvider",
+            {
+                "gpu_mem_limit": "15000000000",  # 15GB
+                "arena_extend_strategy": "kSameAsRequested",
+            },
+        )
+
+    @property
+    def gpu_options(self):
+        options = ort.SessionOptions()
+        options.enable_mem_pattern = False
+        return options
+
+    def test_inference_default_pndm(self):
+        # using the PNDM scheduler by default
+        sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            revision="onnx",
+            safety_checker=None,
+            feature_extractor=None,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        np.random.seed(0)
+        output = sd_pipe([prompt], guidance_scale=6.0, num_inference_steps=10, output_type="np")
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0452, 0.0390, 0.0087, 0.0350, 0.0617, 0.0364, 0.0544, 0.0523, 0.0720])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_inference_ddim(self):
+        ddim_scheduler = DDIMScheduler.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx"
+        )
+        sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            revision="onnx",
+            scheduler=ddim_scheduler,
+            safety_checker=None,
+            feature_extractor=None,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "open neural network exchange"
+        generator = np.random.RandomState(0)
+        output = sd_pipe([prompt], guidance_scale=7.5, num_inference_steps=10, generator=generator, output_type="np")
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2867, 0.1974, 0.1481, 0.7294, 0.7251, 0.6667, 0.4194, 0.5642, 0.6486])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_inference_k_lms(self):
+        lms_scheduler = LMSDiscreteScheduler.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx"
+        )
+        sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            revision="onnx",
+            scheduler=lms_scheduler,
+            safety_checker=None,
+            feature_extractor=None,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "open neural network exchange"
+        generator = np.random.RandomState(0)
+        output = sd_pipe([prompt], guidance_scale=7.5, num_inference_steps=10, generator=generator, output_type="np")
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2306, 0.1959, 0.1593, 0.6549, 0.6394, 0.5408, 0.5065, 0.6010, 0.6161])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_intermediate_state(self):
+        number_of_steps = 0
+
+        def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
+            test_callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 0:
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [-0.6772, -0.3835, -1.2456, 0.1905, -1.0974, 0.6967, -1.9353, 0.0178, 1.0167]
+                )
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+            elif step == 5:
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [-0.3351, 0.2241, -0.1837, -0.2325, -0.6577, 0.3393, -0.0241, 0.5899, 1.3875]
+                )
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+
+        test_callback_fn.has_been_called = False
+
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            revision="onnx",
+            safety_checker=None,
+            feature_extractor=None,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "Andromeda galaxy in a bottle"
+
+        generator = np.random.RandomState(0)
+        pipe(
+            prompt=prompt,
+            num_inference_steps=5,
+            guidance_scale=7.5,
+            generator=generator,
+            callback=test_callback_fn,
+            callback_steps=1,
+        )
+        assert test_callback_fn.has_been_called
+        assert number_of_steps == 6
+
+    def test_stable_diffusion_no_safety_checker(self):
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            revision="onnx",
+            safety_checker=None,
+            feature_extractor=None,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        assert isinstance(pipe, OnnxStableDiffusionPipeline)
+        assert pipe.safety_checker is None
+
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+        # check that there's no error when saving a pipeline with one of the models being None
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = OnnxStableDiffusionPipeline.from_pretrained(tmpdirname)
+
+        # sanity check that the pipeline still works
+        assert pipe.safety_checker is None
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..c73ed0f6afe8755da79a82ed999a115b353e2c22
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
@@ -0,0 +1,245 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+
+from diffusers import (
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    OnnxStableDiffusionImg2ImgPipeline,
+    PNDMScheduler,
+)
+from diffusers.utils.testing_utils import (
+    floats_tensor,
+    is_onnx_available,
+    load_image,
+    nightly,
+    require_onnxruntime,
+    require_torch_gpu,
+)
+
+from ..test_pipelines_onnx_common import OnnxPipelineTesterMixin
+
+
+if is_onnx_available():
+    import onnxruntime as ort
+
+
+class OnnxStableDiffusionImg2ImgPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase):
+    hub_checkpoint = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline"
+
+    def get_dummy_inputs(self, seed=0):
+        image = floats_tensor((1, 3, 128, 128), rng=random.Random(seed))
+        generator = np.random.RandomState(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pipeline_default_ddim(self):
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.69643, 0.58484, 0.50314, 0.58760, 0.55368, 0.59643, 0.51529, 0.41217, 0.49087])
+        assert np.abs(image_slice - expected_slice).max() < 1e-1
+
+    def test_pipeline_pndm(self):
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.61737, 0.54642, 0.53183, 0.54465, 0.52742, 0.60525, 0.49969, 0.40655, 0.48154])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_lms(self):
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        # warmup pass to apply optimizations
+        _ = pipe(**self.get_dummy_inputs())
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.52761, 0.59977, 0.49033, 0.49619, 0.54282, 0.50311, 0.47600, 0.40918, 0.45203])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_euler(self):
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.52911, 0.60004, 0.49229, 0.49805, 0.54502, 0.50680, 0.47777, 0.41028, 0.45304])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_euler_ancestral(self):
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.52911, 0.60004, 0.49229, 0.49805, 0.54502, 0.50680, 0.47777, 0.41028, 0.45304])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_dpm_multistep(self):
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.65331, 0.58277, 0.48204, 0.56059, 0.53665, 0.56235, 0.50969, 0.40009, 0.46552])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+
+@nightly
+@require_onnxruntime
+@require_torch_gpu
+class OnnxStableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
+    @property
+    def gpu_provider(self):
+        return (
+            "CUDAExecutionProvider",
+            {
+                "gpu_mem_limit": "15000000000",  # 15GB
+                "arena_extend_strategy": "kSameAsRequested",
+            },
+        )
+
+    @property
+    def gpu_options(self):
+        options = ort.SessionOptions()
+        options.enable_mem_pattern = False
+        return options
+
+    def test_inference_default_pndm(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((768, 512))
+        # using the PNDM scheduler by default
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            revision="onnx",
+            safety_checker=None,
+            feature_extractor=None,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = np.random.RandomState(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            num_inference_steps=10,
+            generator=generator,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 383:386, -1]
+
+        assert images.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.4909, 0.5059, 0.5372, 0.4623, 0.4876, 0.5049, 0.4820, 0.4956, 0.5019])
+        # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
+
+    def test_inference_k_lms(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((768, 512))
+        lms_scheduler = LMSDiscreteScheduler.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx"
+        )
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            revision="onnx",
+            scheduler=lms_scheduler,
+            safety_checker=None,
+            feature_extractor=None,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = np.random.RandomState(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            num_inference_steps=20,
+            generator=generator,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 383:386, -1]
+
+        assert images.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.8043, 0.926, 0.9581, 0.8119, 0.8954, 0.913, 0.7209, 0.7463, 0.7431])
+        # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..09048b5c0e0f9938d474fed515583e33ec06ab82
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from diffusers import LMSDiscreteScheduler, OnnxStableDiffusionInpaintPipeline
+from diffusers.utils.testing_utils import (
+    is_onnx_available,
+    load_image,
+    nightly,
+    require_onnxruntime,
+    require_torch_gpu,
+)
+
+from ..test_pipelines_onnx_common import OnnxPipelineTesterMixin
+
+
+if is_onnx_available():
+    import onnxruntime as ort
+
+
+class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase):
+    # FIXME: add fast tests
+    pass
+
+
+@nightly
+@require_onnxruntime
+@require_torch_gpu
+class OnnxStableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
+    @property
+    def gpu_provider(self):
+        return (
+            "CUDAExecutionProvider",
+            {
+                "gpu_mem_limit": "15000000000",  # 15GB
+                "arena_extend_strategy": "kSameAsRequested",
+            },
+        )
+
+    @property
+    def gpu_options(self):
+        options = ort.SessionOptions()
+        options.enable_mem_pattern = False
+        return options
+
+    def test_inference_default_pndm(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting",
+            revision="onnx",
+            safety_checker=None,
+            feature_extractor=None,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A red cat sitting on a park bench"
+
+        generator = np.random.RandomState(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            guidance_scale=7.5,
+            num_inference_steps=10,
+            generator=generator,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 255:258, -1]
+
+        assert images.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2514, 0.3007, 0.3517, 0.1790, 0.2382, 0.3167, 0.1944, 0.2273, 0.2464])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_inference_k_lms(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        lms_scheduler = LMSDiscreteScheduler.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", subfolder="scheduler", revision="onnx"
+        )
+        pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting",
+            revision="onnx",
+            scheduler=lms_scheduler,
+            safety_checker=None,
+            feature_extractor=None,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A red cat sitting on a park bench"
+
+        generator = np.random.RandomState(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            guidance_scale=7.5,
+            num_inference_steps=20,
+            generator=generator,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 255:258, -1]
+
+        assert images.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0086, 0.0077, 0.0083, 0.0093, 0.0107, 0.0139, 0.0094, 0.0097, 0.0125])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py
new file mode 100644
index 0000000000000000000000000000000000000000..2df64ad1d685323158f53793324ac6511cf77bb7
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py
@@ -0,0 +1,227 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+
+from diffusers import (
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    OnnxStableDiffusionUpscalePipeline,
+    PNDMScheduler,
+)
+from diffusers.utils.testing_utils import (
+    floats_tensor,
+    is_onnx_available,
+    load_image,
+    nightly,
+    require_onnxruntime,
+    require_torch_gpu,
+)
+
+from ..test_pipelines_onnx_common import OnnxPipelineTesterMixin
+
+
+if is_onnx_available():
+    import onnxruntime as ort
+
+
+class OnnxStableDiffusionUpscalePipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase):
+    # TODO: is there an appropriate internal test set?
+    hub_checkpoint = "ssube/stable-diffusion-x4-upscaler-onnx"
+
+    def get_dummy_inputs(self, seed=0):
+        image = floats_tensor((1, 3, 128, 128), rng=random.Random(seed))
+        generator = np.random.RandomState(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_pipeline_default_ddpm(self):
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        # started as 128, should now be 512
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.6957, 0.7002, 0.7186, 0.6881, 0.6693, 0.6910, 0.7445, 0.7274, 0.7056])
+        assert np.abs(image_slice - expected_slice).max() < 1e-1
+
+    def test_pipeline_pndm(self):
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.7349, 0.7347, 0.7034, 0.7696, 0.7876, 0.7597, 0.7916, 0.8085, 0.8036])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_dpm_multistep(self):
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.7659278, 0.76437664, 0.75579107, 0.7691116, 0.77666986, 0.7727672, 0.7758664, 0.7812226, 0.76942515]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_euler(self):
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.6974782, 0.68902093, 0.70135885, 0.7583618, 0.7804545, 0.7854912, 0.78667426, 0.78743863, 0.78070223]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_euler_ancestral(self):
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.77424496, 0.773601, 0.7645288, 0.7769598, 0.7772739, 0.7738688, 0.78187233, 0.77879584, 0.767043]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+
+@nightly
+@require_onnxruntime
+@require_torch_gpu
+class OnnxStableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
+    @property
+    def gpu_provider(self):
+        return (
+            "CUDAExecutionProvider",
+            {
+                "gpu_mem_limit": "15000000000",  # 15GB
+                "arena_extend_strategy": "kSameAsRequested",
+            },
+        )
+
+    @property
+    def gpu_options(self):
+        options = ort.SessionOptions()
+        options.enable_mem_pattern = False
+        return options
+
+    def test_inference_default_ddpm(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((128, 128))
+        # using the PNDM scheduler by default
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(
+            "ssube/stable-diffusion-x4-upscaler-onnx",
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = np.random.RandomState(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            guidance_scale=7.5,
+            num_inference_steps=10,
+            generator=generator,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 383:386, -1]
+
+        assert images.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.4883, 0.4947, 0.4980, 0.4975, 0.4982, 0.4980, 0.5000, 0.5006, 0.4972])
+        # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
+
+    def test_inference_k_lms(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((128, 128))
+        lms_scheduler = LMSDiscreteScheduler.from_pretrained(
+            "ssube/stable-diffusion-x4-upscaler-onnx", subfolder="scheduler"
+        )
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(
+            "ssube/stable-diffusion-x4-upscaler-onnx",
+            scheduler=lms_scheduler,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = np.random.RandomState(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            guidance_scale=7.5,
+            num_inference_steps=20,
+            generator=generator,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 383:386, -1]
+
+        assert images.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.50173753, 0.50223356, 0.502039, 0.50233036, 0.5023725, 0.5022601, 0.5018758, 0.50234085, 0.50241566]
+        )
+        # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..f37d598c83876519c8d3f3eea491018038f989d4
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -0,0 +1,1583 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import gc
+import tempfile
+import time
+import traceback
+import unittest
+
+import numpy as np
+import torch
+from huggingface_hub import hf_hub_download
+from transformers import (
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LCMScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    logging,
+)
+from diffusers.utils.testing_utils import (
+    CaptureLogger,
+    enable_full_determinism,
+    is_torch_compile,
+    load_image,
+    load_numpy,
+    nightly,
+    numpy_cosine_similarity_distance,
+    require_accelerate_version_greater,
+    require_torch_2,
+    require_torch_gpu,
+    require_torch_multi_gpu,
+    run_test_in_subprocess,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+
+        sd_pipe.unet.to(memory_format=torch.channels_last)
+        sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
+
+        assert np.abs(image_slice - expected_slice).max() < 5e-3
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+class StableDiffusionPipelineFastTests(
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        cross_attention_dim = 8
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=1,
+            sample_size=32,
+            time_cond_proj_dim=time_cond_proj_dim,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.1763, 0.4776, 0.4986, 0.2566, 0.3802, 0.4596, 0.5363, 0.3277, 0.3949])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.2368, 0.4900, 0.5019, 0.2723, 0.4473, 0.4578, 0.4551, 0.3532, 0.4133])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.2368, 0.4900, 0.5019, 0.2723, 0.4473, 0.4578, 0.4551, 0.3532, 0.4133])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_ays(self):
+        from diffusers.schedulers import AysSchedules
+
+        timestep_schedule = AysSchedules["StableDiffusionTimesteps"]
+        sigma_schedule = AysSchedules["StableDiffusionSigmas"]
+
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = 10
+        output = sd_pipe(**inputs).images
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = None
+        inputs["timesteps"] = timestep_schedule
+        output_ts = sd_pipe(**inputs).images
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = None
+        inputs["sigmas"] = sigma_schedule
+        output_sigmas = sd_pipe(**inputs).images
+
+        assert (
+            np.abs(output_sigmas.flatten() - output_ts.flatten()).max() < 1e-3
+        ), "ays timesteps and ays sigmas should have the same outputs"
+        assert (
+            np.abs(output.flatten() - output_ts.flatten()).max() > 1e-3
+        ), "use ays timesteps should have different outputs"
+        assert (
+            np.abs(output.flatten() - output_sigmas.flatten()).max() > 1e-3
+        ), "use ays sigmas should have different outputs"
+
+    def test_stable_diffusion_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        text_inputs = sd_pipe.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=sd_pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_inputs = text_inputs["input_ids"].to(torch_device)
+
+        prompt_embeds = sd_pipe.text_encoder(text_inputs)[0]
+
+        inputs["prompt_embeds"] = prompt_embeds
+
+        # forward
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_stable_diffusion_negative_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        embeds = []
+        for p in [prompt, negative_prompt]:
+            text_inputs = sd_pipe.tokenizer(
+                p,
+                padding="max_length",
+                max_length=sd_pipe.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_inputs = text_inputs["input_ids"].to(torch_device)
+
+            embeds.append(sd_pipe.text_encoder(text_inputs)[0])
+
+        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
+
+        # forward
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_stable_diffusion_prompt_embeds_no_text_encoder_or_tokenizer(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "this is a negative prompt"
+
+        # forward
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+        negative_prompt = "this is a negative prompt"
+
+        prompt_embeds, negative_prompt_embeds = sd_pipe.encode_prompt(
+            prompt,
+            torch_device,
+            1,
+            True,
+            negative_prompt=negative_prompt,
+            prompt_embeds=None,
+            negative_prompt_embeds=None,
+        )
+
+        inputs["prompt_embeds"] = prompt_embeds
+        inputs["negative_prompt_embeds"] = negative_prompt_embeds
+
+        sd_pipe.text_encoder = None
+        sd_pipe.tokenizer = None
+
+        # forward
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_stable_diffusion_prompt_embeds_with_plain_negative_prompt_list(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = negative_prompt
+        prompt = 3 * [inputs.pop("prompt")]
+
+        text_inputs = sd_pipe.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=sd_pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_inputs = text_inputs["input_ids"].to(torch_device)
+
+        prompt_embeds = sd_pipe.text_encoder(text_inputs)[0]
+
+        inputs["prompt_embeds"] = prompt_embeds
+
+        # forward
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_stable_diffusion_ddim_factor_8(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs, height=136, width=136)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 136, 136, 3)
+        expected_slice = np.array([0.4720, 0.5426, 0.5160, 0.3961, 0.4696, 0.4296, 0.5738, 0.5888, 0.5481])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_pndm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.1941, 0.4748, 0.4880, 0.2222, 0.4221, 0.4545, 0.5604, 0.3488, 0.3902])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_no_safety_checker(self):
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
+        )
+        assert isinstance(pipe, StableDiffusionPipeline)
+        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
+        assert pipe.safety_checker is None
+
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+        # check that there's no error when saving a pipeline with one of the models being None
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+        # sanity check that the pipeline still works
+        assert pipe.safety_checker is None
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+    def test_stable_diffusion_k_lms(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.2681, 0.4785, 0.4857, 0.2426, 0.4473, 0.4481, 0.5610, 0.3676, 0.3855])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_k_euler_ancestral(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.2682, 0.4782, 0.4855, 0.2424, 0.4472, 0.4479, 0.5612, 0.3676, 0.3854])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_k_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.2681, 0.4785, 0.4857, 0.2426, 0.4473, 0.4481, 0.5610, 0.3676, 0.3855])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_vae_slicing(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        image_count = 4
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * image_count
+        output_1 = sd_pipe(**inputs)
+
+        # make sure sliced vae decode yields the same result
+        sd_pipe.enable_vae_slicing()
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * image_count
+        output_2 = sd_pipe(**inputs)
+
+        # there is a small discrepancy at image borders vs. full batch decode
+        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3
+
+    def test_stable_diffusion_vae_tiling(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # make sure here that pndm scheduler skips prk
+        components["safety_checker"] = None
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        # Test that tiled decode at 512x512 yields the same result as the non-tiled decode
+        generator = torch.Generator(device=device).manual_seed(0)
+        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        # make sure tiled vae decode yields the same result
+        sd_pipe.enable_vae_tiling()
+        generator = torch.Generator(device=device).manual_seed(0)
+        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1
+
+        # test that tiled decode works with various shapes
+        shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
+        for shape in shapes:
+            zeros = torch.zeros(shape).to(device)
+            sd_pipe.vae.decode(zeros)
+
+    def test_stable_diffusion_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "french fries"
+        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.1907, 0.4709, 0.4858, 0.2224, 0.4223, 0.4539, 0.5606, 0.3489, 0.3900])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_long_prompt(self):
+        components = self.get_dummy_components()
+        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        do_classifier_free_guidance = True
+        negative_prompt = None
+        num_images_per_prompt = 1
+        logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
+        logger.setLevel(logging.WARNING)
+
+        prompt = 100 * "@"
+        with CaptureLogger(logger) as cap_logger:
+            negative_text_embeddings, text_embeddings = sd_pipe.encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+            if negative_text_embeddings is not None:
+                text_embeddings = torch.cat([negative_text_embeddings, text_embeddings])
+
+        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
+        assert cap_logger.out.count("@") == 25
+
+        negative_prompt = "Hello"
+        with CaptureLogger(logger) as cap_logger_2:
+            negative_text_embeddings_2, text_embeddings_2 = sd_pipe.encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+            if negative_text_embeddings_2 is not None:
+                text_embeddings_2 = torch.cat([negative_text_embeddings_2, text_embeddings_2])
+
+        assert cap_logger.out == cap_logger_2.out
+
+        prompt = 25 * "@"
+        with CaptureLogger(logger) as cap_logger_3:
+            negative_text_embeddings_3, text_embeddings_3 = sd_pipe.encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+            if negative_text_embeddings_3 is not None:
+                text_embeddings_3 = torch.cat([negative_text_embeddings_3, text_embeddings_3])
+
+        assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
+        assert text_embeddings.shape[1] == 77
+        assert cap_logger_3.out == ""
+
+    def test_stable_diffusion_height_width_opt(self):
+        components = self.get_dummy_components()
+        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "hey"
+
+        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
+        image_shape = output.images[0].shape[:2]
+        assert image_shape == (64, 64)
+
+        output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np")
+        image_shape = output.images[0].shape[:2]
+        assert image_shape == (96, 96)
+
+        config = dict(sd_pipe.unet.config)
+        config["sample_size"] = 96
+        sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device)
+        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
+        image_shape = output.images[0].shape[:2]
+        assert image_shape == (192, 192)
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    # MPS currently doesn't support ComplexFloats, which are required for freeU - see https://github.com/huggingface/diffusers/issues/7569.
+    @skip_mps
+    def test_freeu_enabled(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "hey"
+        output = sd_pipe(prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)).images
+
+        sd_pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+        output_freeu = sd_pipe(prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)).images
+
+        assert not np.allclose(
+            output[0, -3:, -3:, -1], output_freeu[0, -3:, -3:, -1]
+        ), "Enabling of FreeU should lead to different results."
+
+    def test_freeu_disabled(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "hey"
+        output = sd_pipe(prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)).images
+
+        sd_pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+        sd_pipe.disable_freeu()
+
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for upsample_block in sd_pipe.unet.up_blocks:
+            for key in freeu_keys:
+                assert getattr(upsample_block, key) is None, f"Disabling of FreeU should have set {key} to None."
+
+        output_no_freeu = sd_pipe(
+            prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)
+        ).images
+
+        assert np.allclose(
+            output[0, -3:, -3:, -1], output_no_freeu[0, -3:, -3:, -1]
+        ), "Disabling of FreeU should lead to results similar to the default pipeline results."
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        sd_pipe.fuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice_fused = image[0, -3:, -3:, -1]
+
+        sd_pipe.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice_disabled = image[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+    def test_pipeline_interrupt(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "hey"
+        num_inference_steps = 3
+
+        # store intermediate latents from the generation process
+        class PipelineState:
+            def __init__(self):
+                self.state = []
+
+            def apply(self, pipe, i, t, callback_kwargs):
+                self.state.append(callback_kwargs["latents"])
+                return callback_kwargs
+
+        pipe_state = PipelineState()
+        sd_pipe(
+            prompt,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=pipe_state.apply,
+        ).images
+
+        # interrupt generation at step index
+        interrupt_step_idx = 1
+
+        def callback_on_step_end(pipe, i, t, callback_kwargs):
+            if i == interrupt_step_idx:
+                pipe._interrupt = True
+
+            return callback_kwargs
+
+        output_interrupted = sd_pipe(
+            prompt,
+            num_inference_steps=num_inference_steps,
+            output_type="latent",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=callback_on_step_end,
+        ).images
+
+        # fetch intermediate latents at the interrupted step
+        # from the completed generation process
+        intermediate_latent = pipe_state.state[interrupt_step_idx]
+
+        # compare the intermediate latent to the output of the interrupted process
+        # they should be the same
+        assert torch.allclose(intermediate_latent, output_interrupted, atol=1e-4)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "a photograph of an astronaut riding a horse",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_1_1_pndm(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.4363, 0.4355, 0.3667, 0.4066, 0.3970, 0.3866, 0.4394, 0.4356, 0.4059])
+        assert np.abs(image_slice - expected_slice).max() < 3e-3
+
+    def test_stable_diffusion_v1_4_with_freeu(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 25
+
+        sd_pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+        image = sd_pipe(**inputs).images
+        image = image[0, -3:, -3:, -1].flatten()
+        expected_image = [0.0721, 0.0588, 0.0268, 0.0384, 0.0636, 0.0, 0.0429, 0.0344, 0.0309]
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_stable_diffusion_1_4_pndm(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.5740, 0.4784, 0.3162, 0.6358, 0.5831, 0.5505, 0.5082, 0.5631, 0.5575])
+        assert np.abs(image_slice - expected_slice).max() < 3e-3
+
+    def test_stable_diffusion_ddim(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
+        assert np.abs(image_slice - expected_slice).max() < 1e-4
+
+    def test_stable_diffusion_lms(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455])
+        assert np.abs(image_slice - expected_slice).max() < 3e-3
+
+    def test_stable_diffusion_dpm(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
+            sd_pipe.scheduler.config,
+            final_sigmas_type="sigma_min",
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000])
+        assert np.abs(image_slice - expected_slice).max() < 3e-3
+
+    def test_stable_diffusion_attention_slicing(self):
+        torch.cuda.reset_peak_memory_stats()
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
+        pipe.unet.set_default_attn_processor()
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # enable attention slicing
+        pipe.enable_attention_slicing()
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        image_sliced = pipe(**inputs).images
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+        # make sure that less than 3.75 GB is allocated
+        assert mem_bytes < 3.75 * 10**9
+
+        # disable slicing
+        pipe.disable_attention_slicing()
+        pipe.unet.set_default_attn_processor()
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        image = pipe(**inputs).images
+
+        # make sure that more than 3.75 GB is allocated
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes > 3.75 * 10**9
+        max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
+        assert max_diff < 1e-3
+
+    def test_stable_diffusion_vae_slicing(self):
+        torch.cuda.reset_peak_memory_stats()
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        # enable vae slicing
+        pipe.enable_vae_slicing()
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        inputs["prompt"] = [inputs["prompt"]] * 4
+        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
+        image_sliced = pipe(**inputs).images
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+        # make sure that less than 4 GB is allocated
+        assert mem_bytes < 4e9
+
+        # disable vae slicing
+        pipe.disable_vae_slicing()
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        inputs["prompt"] = [inputs["prompt"]] * 4
+        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
+        image = pipe(**inputs).images
+
+        # make sure that more than 4 GB is allocated
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes > 4e9
+        # There is a small discrepancy at the image borders vs. a fully batched version.
+        max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
+        assert max_diff < 1e-2
+
+    def test_stable_diffusion_vae_tiling(self):
+        torch.cuda.reset_peak_memory_stats()
+        model_id = "CompVis/stable-diffusion-v1-4"
+        pipe = StableDiffusionPipeline.from_pretrained(
+            model_id, variant="fp16", torch_dtype=torch.float16, safety_checker=None
+        )
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+        pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
+        pipe.vae = pipe.vae.to(memory_format=torch.channels_last)
+
+        prompt = "a photograph of an astronaut riding a horse"
+
+        # enable vae tiling
+        pipe.enable_vae_tiling()
+        pipe.enable_model_cpu_offload()
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output_chunked = pipe(
+            [prompt],
+            width=1024,
+            height=1024,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="np",
+        )
+        image_chunked = output_chunked.images
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+
+        # disable vae tiling
+        pipe.disable_vae_tiling()
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe(
+            [prompt],
+            width=1024,
+            height=1024,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="np",
+        )
+        image = output.images
+
+        assert mem_bytes < 1e10
+        max_diff = numpy_cosine_similarity_distance(image_chunked.flatten(), image.flatten())
+        assert max_diff < 1e-2
+
+    def test_stable_diffusion_fp16_vs_autocast(self):
+        # this test makes sure that the original model with autocast
+        # and the new model with fp16 yield the same result
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        image_fp16 = pipe(**inputs).images
+
+        with torch.autocast(torch_device):
+            inputs = self.get_inputs(torch_device)
+            image_autocast = pipe(**inputs).images
+
+        # Make sure results are close enough
+        diff = np.abs(image_fp16.flatten() - image_autocast.flatten())
+        # They ARE different since ops are not run always at the same precision
+        # however, they should be extremely close.
+        assert diff.mean() < 2e-2
+
+    def test_stable_diffusion_intermediate_state(self):
+        number_of_steps = 0
+
+        def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
+            callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 1:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582]
+                )
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
+            elif step == 2:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492]
+                )
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
+
+        callback_fn.has_been_called = False
+
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        pipe(**inputs, callback=callback_fn, callback_steps=1)
+        assert callback_fn.has_been_called
+        assert number_of_steps == inputs["num_inference_steps"]
+
+    def test_stable_diffusion_low_cpu_mem_usage(self):
+        pipeline_id = "CompVis/stable-diffusion-v1-4"
+
+        start_time = time.time()
+        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
+        pipeline_low_cpu_mem_usage.to(torch_device)
+        low_cpu_mem_usage_time = time.time() - start_time
+
+        start_time = time.time()
+        _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False)
+        normal_load_time = time.time() - start_time
+
+        assert 2 * low_cpu_mem_usage_time < normal_load_time
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        _ = pipe(**inputs)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.8 GB is allocated
+        assert mem_bytes < 2.8 * 10**9
+
+    def test_stable_diffusion_pipeline_with_model_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+
+        # Normal inference
+
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            torch_dtype=torch.float16,
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        outputs = pipe(**inputs)
+        mem_bytes = torch.cuda.max_memory_allocated()
+
+        # With model offloading
+
+        # Reload but don't move to cuda
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            torch_dtype=torch.float16,
+        )
+        pipe.unet.set_default_attn_processor()
+
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+
+        outputs_offloaded = pipe(**inputs)
+        mem_bytes_offloaded = torch.cuda.max_memory_allocated()
+
+        images = outputs.images
+        offloaded_images = outputs_offloaded.images
+
+        max_diff = numpy_cosine_similarity_distance(images.flatten(), offloaded_images.flatten())
+        assert max_diff < 1e-3
+        assert mem_bytes_offloaded < mem_bytes
+        assert mem_bytes_offloaded < 3.5 * 10**9
+        for module in pipe.text_encoder, pipe.unet, pipe.vae:
+            assert module.device == torch.device("cpu")
+
+        # With attention slicing
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe.enable_attention_slicing()
+        _ = pipe(**inputs)
+        mem_bytes_slicing = torch.cuda.max_memory_allocated()
+
+        assert mem_bytes_slicing < mem_bytes_offloaded
+        assert mem_bytes_slicing < 3 * 10**9
+
+    def test_stable_diffusion_textual_inversion(self):
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")
+
+        a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
+        a111_file_neg = hf_hub_download(
+            "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt"
+        )
+        pipe.load_textual_inversion(a111_file)
+        pipe.load_textual_inversion(a111_file_neg)
+        pipe.to("cuda")
+
+        generator = torch.Generator(device="cpu").manual_seed(1)
+
+        prompt = "An logo of a turtle in strong Style-Winter with <low-poly-hd-logos-icons>"
+        neg_prompt = "Style-Winter-neg"
+
+        image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0]
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy"
+        )
+
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 8e-1
+
+    def test_stable_diffusion_textual_inversion_with_model_cpu_offload(self):
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        pipe.enable_model_cpu_offload()
+        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")
+
+        a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
+        a111_file_neg = hf_hub_download(
+            "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt"
+        )
+        pipe.load_textual_inversion(a111_file)
+        pipe.load_textual_inversion(a111_file_neg)
+
+        generator = torch.Generator(device="cpu").manual_seed(1)
+
+        prompt = "An logo of a turtle in strong Style-Winter with <low-poly-hd-logos-icons>"
+        neg_prompt = "Style-Winter-neg"
+
+        image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0]
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy"
+        )
+
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 8e-1
+
+    def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self):
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        pipe.enable_sequential_cpu_offload()
+        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")
+
+        a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
+        a111_file_neg = hf_hub_download(
+            "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt"
+        )
+        pipe.load_textual_inversion(a111_file)
+        pipe.load_textual_inversion(a111_file_neg)
+
+        generator = torch.Generator(device="cpu").manual_seed(1)
+
+        prompt = "An logo of a turtle in strong Style-Winter with <low-poly-hd-logos-icons>"
+        neg_prompt = "Style-Winter-neg"
+
+        image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0]
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy"
+        )
+
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 8e-1
+
+    @is_torch_compile
+    @require_torch_2
+    def test_stable_diffusion_compile(self):
+        seed = 0
+        inputs = self.get_inputs(torch_device, seed=seed)
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs)
+
+    def test_stable_diffusion_lcm(self):
+        unet = UNet2DConditionModel.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", subfolder="unet")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-7", unet=unet).to(torch_device)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 6
+        inputs["output_type"] = "pil"
+
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_full/stable_diffusion_lcm.png"
+        )
+
+        image = sd_pipe.image_processor.pil_to_numpy(image)
+        expected_image = sd_pipe.image_processor.pil_to_numpy(expected_image)
+
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+
+        assert max_diff < 1e-2
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionPipelineCkptTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_download_from_hub(self):
+        ckpt_paths = [
+            "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
+            "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors",
+        ]
+
+        for ckpt_path in ckpt_paths:
+            pipe = StableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
+            pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+            pipe.to("cuda")
+
+        image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
+
+        assert image_out.shape == (512, 512, 3)
+
+    def test_download_local(self):
+        ckpt_filename = hf_hub_download(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.safetensors"
+        )
+        config_filename = hf_hub_download("stable-diffusion-v1-5/stable-diffusion-v1-5", filename="v1-inference.yaml")
+
+        pipe = StableDiffusionPipeline.from_single_file(
+            ckpt_filename, config_files={"v1": config_filename}, torch_dtype=torch.float16
+        )
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.to("cuda")
+
+        image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
+
+        assert image_out.shape == (512, 512, 3)
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "a photograph of an astronaut riding a horse",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 50,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_1_4_pndm(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_stable_diffusion_1_5_pndm(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5").to(
+            torch_device
+        )
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_stable_diffusion_ddim(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
+        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 3e-3
+
+    def test_stable_diffusion_lms(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
+        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_stable_diffusion_euler(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
+        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+
+# (sayakpaul): This test suite was run in the DGX with two GPUs (1, 2).
+@slow
+@require_torch_multi_gpu
+@require_accelerate_version_greater("0.27.0")
+class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, generator_device="cpu", seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        inputs = {
+            "prompt": "a photograph of an astronaut riding a horse",
+            "generator": generator,
+            "num_inference_steps": 50,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def get_pipeline_output_without_device_map(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
+        ).to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=True)
+        inputs = self.get_inputs()
+        no_device_map_image = sd_pipe(**inputs).images
+
+        del sd_pipe
+
+        return no_device_map_image
+
+    def test_forward_pass_balanced_device_map(self):
+        no_device_map_image = self.get_pipeline_output_without_device_map()
+
+        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
+        )
+        sd_pipe_with_device_map.set_progress_bar_config(disable=True)
+        inputs = self.get_inputs()
+        device_map_image = sd_pipe_with_device_map(**inputs).images
+
+        max_diff = np.abs(device_map_image - no_device_map_image).max()
+        assert max_diff < 1e-3
+
+    def test_components_put_in_right_devices(self):
+        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
+        )
+
+        assert len(set(sd_pipe_with_device_map.hf_device_map.values())) >= 2
+
+    def test_max_memory(self):
+        no_device_map_image = self.get_pipeline_output_without_device_map()
+
+        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            device_map="balanced",
+            max_memory={0: "1GB", 1: "1GB"},
+            torch_dtype=torch.float16,
+        )
+        sd_pipe_with_device_map.set_progress_bar_config(disable=True)
+        inputs = self.get_inputs()
+        device_map_image = sd_pipe_with_device_map(**inputs).images
+
+        max_diff = np.abs(device_map_image - no_device_map_image).max()
+        assert max_diff < 1e-3
+
+    def test_reset_device_map(self):
+        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
+        )
+        sd_pipe_with_device_map.reset_device_map()
+
+        assert sd_pipe_with_device_map.hf_device_map is None
+
+        for name, component in sd_pipe_with_device_map.components.items():
+            if isinstance(component, torch.nn.Module):
+                assert component.device.type == "cpu"
+
+    def test_reset_device_map_to(self):
+        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
+        )
+        sd_pipe_with_device_map.reset_device_map()
+
+        assert sd_pipe_with_device_map.hf_device_map is None
+
+        # Make sure `to()` can be used and the pipeline can be called.
+        pipe = sd_pipe_with_device_map.to("cuda")
+        _ = pipe("hello", num_inference_steps=2)
+
+    def test_reset_device_map_enable_model_cpu_offload(self):
+        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
+        )
+        sd_pipe_with_device_map.reset_device_map()
+
+        assert sd_pipe_with_device_map.hf_device_map is None
+
+        # Make sure `enable_model_cpu_offload()` can be used and the pipeline can be called.
+        sd_pipe_with_device_map.enable_model_cpu_offload()
+        _ = sd_pipe_with_device_map("hello", num_inference_steps=2)
+
+    def test_reset_device_map_enable_sequential_cpu_offload(self):
+        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
+        )
+        sd_pipe_with_device_map.reset_device_map()
+
+        assert sd_pipe_with_device_map.hf_device_map is None
+
+        # Make sure `enable_sequential_cpu_offload()` can be used and the pipeline can be called.
+        sd_pipe_with_device_map.enable_sequential_cpu_offload()
+        _ = sd_pipe_with_device_map("hello", num_inference_steps=2)
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ba0bb5a4a5db5b9224a9de59ab7338ca13cb4e8
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -0,0 +1,750 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import traceback
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    AutoencoderTiny,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    HeunDiscreteScheduler,
+    LCMScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionImg2ImgPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    is_torch_compile,
+    load_image,
+    load_numpy,
+    nightly,
+    require_torch_2,
+    require_torch_gpu,
+    run_test_in_subprocess,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+# Will be run via run_test_in_subprocess
+def _test_img2img_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.0606, 0.0570, 0.0805, 0.0579, 0.0628, 0.0623, 0.0843, 0.1115, 0.0806])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+class StableDiffusionImg2ImgPipelineFastTests(
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            time_cond_proj_dim=time_cond_proj_dim,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_tiny_autoencoder(self):
+        return AutoencoderTiny(in_channels=3, out_channels=3, latent_channels=4)
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image / 2 + 0.5
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_img2img_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4555, 0.3216, 0.4049, 0.4620, 0.4618, 0.4126, 0.4122, 0.4629, 0.4579])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_img2img_default_case_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.5709, 0.4614, 0.4587, 0.5978, 0.5298, 0.6910, 0.6240, 0.5212, 0.5454])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_img2img_default_case_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.5709, 0.4614, 0.4587, 0.5978, 0.5298, 0.6910, 0.6240, 0.5212, 0.5454])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_img2img_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "french fries"
+        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4593, 0.3408, 0.4232, 0.4749, 0.4476, 0.4115, 0.4357, 0.4733, 0.4663])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.4932, 0.5092, 0.5135, 0.5517, 0.5626, 0.6621, 0.6490, 0.5021, 0.5441])
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_stable_diffusion_img2img_multiple_init_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * 2
+        inputs["image"] = inputs["image"].repeat(2, 1, 1, 1)
+        image = sd_pipe(**inputs).images
+        image_slice = image[-1, -3:, -3:, -1]
+
+        assert image.shape == (2, 32, 32, 3)
+        expected_slice = np.array([0.4241, 0.5576, 0.5711, 0.4792, 0.4311, 0.5952, 0.5827, 0.5138, 0.5109])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_img2img_k_lms(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = LMSDiscreteScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
+        )
+        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4398, 0.4949, 0.4337, 0.6580, 0.5555, 0.4338, 0.5769, 0.5955, 0.5175])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_img2img_tiny_autoencoder(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
+        sd_pipe.vae = self.get_dummy_tiny_autoencoder()
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.00669, 0.00669, 0.0, 0.00693, 0.00858, 0.0, 0.00567, 0.00515, 0.00125])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    @skip_mps
+    def test_save_load_local(self):
+        return super().test_save_load_local()
+
+    @skip_mps
+    def test_dict_tuple_outputs_equivalent(self):
+        return super().test_dict_tuple_outputs_equivalent()
+
+    @skip_mps
+    def test_save_load_optional_components(self):
+        return super().test_save_load_optional_components()
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        return super().test_attention_slicing_forward_pass(expected_max_diff=5e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+    def test_pipeline_interrupt(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = "hey"
+        num_inference_steps = 3
+
+        # store intermediate latents from the generation process
+        class PipelineState:
+            def __init__(self):
+                self.state = []
+
+            def apply(self, pipe, i, t, callback_kwargs):
+                self.state.append(callback_kwargs["latents"])
+                return callback_kwargs
+
+        pipe_state = PipelineState()
+        sd_pipe(
+            prompt,
+            image=inputs["image"],
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=pipe_state.apply,
+        ).images
+
+        # interrupt generation at step index
+        interrupt_step_idx = 1
+
+        def callback_on_step_end(pipe, i, t, callback_kwargs):
+            if i == interrupt_step_idx:
+                pipe._interrupt = True
+
+            return callback_kwargs
+
+        output_interrupted = sd_pipe(
+            prompt,
+            image=inputs["image"],
+            num_inference_steps=num_inference_steps,
+            output_type="latent",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=callback_on_step_end,
+        ).images
+
+        # fetch intermediate latents at the interrupted step
+        # from the completed generation process
+        intermediate_latent = pipe_state.state[interrupt_step_idx]
+
+        # compare the intermediate latent to the output of the interrupted process
+        # they should be the same
+        assert torch.allclose(intermediate_latent, output_interrupted, atol=1e-4)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/sketch-mountains-input.png"
+        )
+        inputs = {
+            "prompt": "a fantasy landscape, concept art, high resolution",
+            "image": init_image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_img2img_default(self):
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.4300, 0.4662, 0.4930, 0.3990, 0.4307, 0.4525, 0.3719, 0.4064, 0.3923])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
+    def test_stable_diffusion_img2img_k_lms(self):
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.0389, 0.0346, 0.0415, 0.0290, 0.0218, 0.0210, 0.0408, 0.0567, 0.0271])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
+    def test_stable_diffusion_img2img_ddim(self):
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.0593, 0.0607, 0.0851, 0.0582, 0.0636, 0.0721, 0.0751, 0.0981, 0.0781])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
+    def test_stable_diffusion_img2img_intermediate_state(self):
+        number_of_steps = 0
+
+        def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
+            callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 1:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 96)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([-0.4958, 0.5107, 1.1045, 2.7539, 4.6680, 3.8320, 1.5049, 1.8633, 2.6523])
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
+            elif step == 2:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 96)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([-0.4956, 0.5078, 1.0918, 2.7520, 4.6484, 3.8125, 1.5146, 1.8633, 2.6367])
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
+
+        callback_fn.has_been_called = False
+
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        pipe(**inputs, callback=callback_fn, callback_steps=1)
+        assert callback_fn.has_been_called
+        assert number_of_steps == 2
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        _ = pipe(**inputs)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.2 GB is allocated
+        assert mem_bytes < 2.2 * 10**9
+
+    def test_stable_diffusion_pipeline_with_model_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+
+        # Normal inference
+
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            safety_checker=None,
+            torch_dtype=torch.float16,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe(**inputs)
+        mem_bytes = torch.cuda.max_memory_allocated()
+
+        # With model offloading
+
+        # Reload but don't move to cuda
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            safety_checker=None,
+            torch_dtype=torch.float16,
+        )
+
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+        _ = pipe(**inputs)
+        mem_bytes_offloaded = torch.cuda.max_memory_allocated()
+
+        assert mem_bytes_offloaded < mem_bytes
+        for module in pipe.text_encoder, pipe.unet, pipe.vae:
+            assert module.device == torch.device("cpu")
+
+    def test_img2img_2nd_order(self):
+        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        sd_pipe.scheduler = HeunDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 10
+        inputs["strength"] = 0.75
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/img2img_heun.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 5e-2
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 11
+        inputs["strength"] = 0.75
+        image_other = sd_pipe(**inputs).images[0]
+
+        mean_diff = np.abs(image - image_other).mean()
+
+        # images should be very similar
+        assert mean_diff < 5e-2
+
+    def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        # resize to resolution that is divisible by 8 but not 16 or 32
+        init_image = init_image.resize((760, 504))
+
+        model_id = "CompVis/stable-diffusion-v1-4"
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+            model_id,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = torch.manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        image_slice = image[255:258, 383:386, -1]
+
+        assert image.shape == (504, 760, 3)
+        expected_slice = np.array([0.9393, 0.9500, 0.9399, 0.9438, 0.9458, 0.9400, 0.9455, 0.9414, 0.9423])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
+
+    def test_img2img_safety_checker_works(self):
+        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 20
+        # make sure the safety checker is activated
+        inputs["prompt"] = "naked, sex, porn"
+        out = sd_pipe(**inputs)
+
+        assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}"
+        assert np.abs(out.images[0]).sum() < 1e-5  # should be all zeros
+
+    @is_torch_compile
+    @require_torch_2
+    def test_img2img_compile(self):
+        seed = 0
+        inputs = self.get_inputs(torch_device, seed=seed)
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_img2img_compile, inputs=inputs)
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/sketch-mountains-input.png"
+        )
+        inputs = {
+            "prompt": "a fantasy landscape, concept art, high resolution",
+            "image": init_image,
+            "generator": generator,
+            "num_inference_steps": 50,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_img2img_pndm(self):
+        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/stable_diffusion_1_5_pndm.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_img2img_ddim(self):
+        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/stable_diffusion_1_5_ddim.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_img2img_lms(self):
+        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/stable_diffusion_1_5_lms.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_img2img_dpm(self):
+        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 30
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/stable_diffusion_1_5_dpm.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff04ea2cfc5da0c1d340d246ed8093958b32a60e
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -0,0 +1,1110 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import traceback
+import unittest
+
+import numpy as np
+import torch
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AsymmetricAutoencoderKL,
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    LCMScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionInpaintPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    is_torch_compile,
+    load_image,
+    load_numpy,
+    nightly,
+    require_torch_2,
+    require_torch_gpu,
+    run_test_in_subprocess,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+# Will be run via run_test_in_subprocess
+def _test_inpaint_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0689, 0.0699, 0.0790, 0.0536, 0.0470, 0.0488, 0.041, 0.0508, 0.04179])
+        assert np.abs(expected_slice - image_slice).max() < 3e-3
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+class StableDiffusionInpaintPipelineFastTests(
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionInpaintPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset([])
+    # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+    image_latents_params = frozenset([])
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"mask", "masked_image_latents"})
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            time_cond_proj_dim=time_cond_proj_dim,
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0, img_res=64, output_pil=True):
+        # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
+        if output_pil:
+            # Get random floats in [0, 1] as image
+            image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+            image = image.cpu().permute(0, 2, 3, 1)[0]
+            mask_image = torch.ones_like(image)
+            # Convert image and mask_image to [0, 255]
+            image = 255 * image
+            mask_image = 255 * mask_image
+            # Convert to PIL image
+            init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((img_res, img_res))
+            mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB").resize((img_res, img_res))
+        else:
+            # Get random floats in [0, 1] as image with spatial size (img_res, img_res)
+            image = floats_tensor((1, 3, img_res, img_res), rng=random.Random(seed)).to(device)
+            # Convert image to [-1, 1]
+            init_image = 2.0 * image - 1.0
+            mask_image = torch.ones((1, 1, img_res, img_res), device=device)
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": init_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_inpaint(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4703, 0.5697, 0.3879, 0.5470, 0.6042, 0.4413, 0.5078, 0.4728, 0.4469])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4931, 0.5988, 0.4569, 0.5556, 0.6650, 0.5087, 0.5966, 0.5358, 0.5269])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4931, 0.5988, 0.4569, 0.5556, 0.6650, 0.5087, 0.5966, 0.5358, 0.5269])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_image_tensor(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        out_pil = output.images
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["image"] = torch.tensor(np.array(inputs["image"]) / 127.5 - 1).permute(2, 0, 1).unsqueeze(0)
+        inputs["mask_image"] = torch.tensor(np.array(inputs["mask_image"]) / 255).permute(2, 0, 1)[:1].unsqueeze(0)
+        output = sd_pipe(**inputs)
+        out_tensor = output.images
+
+        assert out_pil.shape == (1, 64, 64, 3)
+        assert np.abs(out_pil.flatten() - out_tensor.flatten()).max() < 5e-2
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    def test_stable_diffusion_inpaint_strength_zero_test(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+
+        # check that the pipeline raises value error when num_inference_steps is < 1
+        inputs["strength"] = 0.01
+        with self.assertRaises(ValueError):
+            sd_pipe(**inputs).images
+
+    def test_stable_diffusion_inpaint_mask_latents(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # normal mask + normal image
+        ##  `image`: pil, `mask_image``: pil, `masked_image_latents``: None
+        inputs = self.get_dummy_inputs(device)
+        inputs["strength"] = 0.9
+        out_0 = sd_pipe(**inputs).images
+
+        # image latents + mask latents
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe.image_processor.preprocess(inputs["image"]).to(sd_pipe.device)
+        mask = sd_pipe.mask_processor.preprocess(inputs["mask_image"]).to(sd_pipe.device)
+        masked_image = image * (mask < 0.5)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_latents = (
+            sd_pipe.vae.encode(image).latent_dist.sample(generator=generator) * sd_pipe.vae.config.scaling_factor
+        )
+        torch.randn((1, 4, 32, 32), generator=generator)
+        mask_latents = (
+            sd_pipe.vae.encode(masked_image).latent_dist.sample(generator=generator)
+            * sd_pipe.vae.config.scaling_factor
+        )
+        inputs["image"] = image_latents
+        inputs["masked_image_latents"] = mask_latents
+        inputs["mask_image"] = mask
+        inputs["strength"] = 0.9
+        generator = torch.Generator(device=device).manual_seed(0)
+        torch.randn((1, 4, 32, 32), generator=generator)
+        inputs["generator"] = generator
+        out_1 = sd_pipe(**inputs).images
+        assert np.abs(out_0 - out_1).max() < 1e-2
+
+    def test_pipeline_interrupt(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = "hey"
+        num_inference_steps = 3
+
+        # store intermediate latents from the generation process
+        class PipelineState:
+            def __init__(self):
+                self.state = []
+
+            def apply(self, pipe, i, t, callback_kwargs):
+                self.state.append(callback_kwargs["latents"])
+                return callback_kwargs
+
+        pipe_state = PipelineState()
+        sd_pipe(
+            prompt,
+            image=inputs["image"],
+            mask_image=inputs["mask_image"],
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=pipe_state.apply,
+        ).images
+
+        # interrupt generation at step index
+        interrupt_step_idx = 1
+
+        def callback_on_step_end(pipe, i, t, callback_kwargs):
+            if i == interrupt_step_idx:
+                pipe._interrupt = True
+
+            return callback_kwargs
+
+        output_interrupted = sd_pipe(
+            prompt,
+            image=inputs["image"],
+            mask_image=inputs["mask_image"],
+            num_inference_steps=num_inference_steps,
+            output_type="latent",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=callback_on_step_end,
+        ).images
+
+        # fetch intermediate latents at the interrupted step
+        # from the completed generation process
+        intermediate_latent = pipe_state.state[interrupt_step_idx]
+
+        # compare the intermediate latent to the output of the interrupted process
+        # they should be the same
+        assert torch.allclose(intermediate_latent, output_interrupted, atol=1e-4)
+
+    def test_ip_adapter(self, from_simple=False, expected_pipe_slice=None):
+        if not from_simple:
+            expected_pipe_slice = None
+            if torch_device == "cpu":
+                expected_pipe_slice = np.array(
+                    [0.4390, 0.5452, 0.3772, 0.5448, 0.6031, 0.4480, 0.5194, 0.4687, 0.4640]
+                )
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+
+class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipelineFastTests):
+    pipeline_class = StableDiffusionInpaintPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset([])
+    # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            time_cond_proj_dim=time_cond_proj_dim,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs_2images(self, device, seed=0, img_res=64):
+        # Get random floats in [0, 1] as image with spatial size (img_res, img_res)
+        image1 = floats_tensor((1, 3, img_res, img_res), rng=random.Random(seed)).to(device)
+        image2 = floats_tensor((1, 3, img_res, img_res), rng=random.Random(seed + 22)).to(device)
+        # Convert images to [-1, 1]
+        init_image1 = 2.0 * image1 - 1.0
+        init_image2 = 2.0 * image2 - 1.0
+
+        # empty mask
+        mask_image = torch.zeros((1, 1, img_res, img_res), device=device)
+
+        if str(device).startswith("mps"):
+            generator1 = torch.manual_seed(seed)
+            generator2 = torch.manual_seed(seed)
+        else:
+            generator1 = torch.Generator(device=device).manual_seed(seed)
+            generator2 = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": ["A painting of a squirrel eating a burger"] * 2,
+            "image": [init_image1, init_image2],
+            "mask_image": [mask_image] * 2,
+            "generator": [generator1, generator2],
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.6345, 0.5395, 0.5611, 0.5403, 0.5830, 0.5855, 0.5193, 0.5443, 0.5211])
+        return super().test_ip_adapter(from_simple=True, expected_pipe_slice=expected_pipe_slice)
+
+    def test_stable_diffusion_inpaint(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.6584, 0.5424, 0.5649, 0.5449, 0.5897, 0.6111, 0.5404, 0.5463, 0.5214])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.6240, 0.5355, 0.5649, 0.5378, 0.5374, 0.6242, 0.5132, 0.5347, 0.5396])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.6240, 0.5355, 0.5649, 0.5378, 0.5374, 0.6242, 0.5132, 0.5347, 0.5396])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_2_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # test to confirm if we pass two same image, we will get same output
+        inputs = self.get_dummy_inputs(device)
+        gen1 = torch.Generator(device=device).manual_seed(0)
+        gen2 = torch.Generator(device=device).manual_seed(0)
+        for name in ["prompt", "image", "mask_image"]:
+            inputs[name] = [inputs[name]] * 2
+        inputs["generator"] = [gen1, gen2]
+        images = sd_pipe(**inputs).images
+
+        assert images.shape == (2, 64, 64, 3)
+
+        image_slice1 = images[0, -3:, -3:, -1]
+        image_slice2 = images[1, -3:, -3:, -1]
+        assert np.abs(image_slice1.flatten() - image_slice2.flatten()).max() < 1e-4
+
+        # test to confirm that if we pass two different images, we will get different output
+        inputs = self.get_dummy_inputs_2images(device)
+        images = sd_pipe(**inputs).images
+        assert images.shape == (2, 64, 64, 3)
+
+        image_slice1 = images[0, -3:, -3:, -1]
+        image_slice2 = images[1, -3:, -3:, -1]
+        assert np.abs(image_slice1.flatten() - image_slice2.flatten()).max() > 1e-2
+
+    def test_stable_diffusion_inpaint_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device, output_pil=False)
+        half_dim = inputs["image"].shape[2] // 2
+        inputs["mask_image"][0, 0, :half_dim, :half_dim] = 0
+
+        inputs["num_inference_steps"] = 4
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [[0.6387283, 0.5564158, 0.58631873, 0.5539942, 0.5494673, 0.6461868, 0.5251618, 0.5497595, 0.5508756]]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_mask.png"
+        )
+        inputs = {
+            "prompt": "Face of a yellow cat, high resolution, sitting on a park bench",
+            "image": init_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_inpaint_ddim(self):
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0427, 0.0460, 0.0483, 0.0460, 0.0584, 0.0521, 0.1549, 0.1695, 0.1794])
+
+        assert np.abs(expected_slice - image_slice).max() < 6e-4
+
+    def test_stable_diffusion_inpaint_fp16(self):
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", torch_dtype=torch.float16, safety_checker=None
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.1509, 0.1245, 0.1672, 0.1655, 0.1519, 0.1226, 0.1462, 0.1567, 0.2451])
+        assert np.abs(expected_slice - image_slice).max() < 1e-1
+
+    def test_stable_diffusion_inpaint_pndm(self):
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0425, 0.0273, 0.0344, 0.1694, 0.1727, 0.1812, 0.3256, 0.3311, 0.3272])
+
+        assert np.abs(expected_slice - image_slice).max() < 5e-3
+
+    def test_stable_diffusion_inpaint_k_lms(self):
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.9314, 0.7575, 0.9432, 0.8885, 0.9028, 0.7298, 0.9811, 0.9667, 0.7633])
+
+        assert np.abs(expected_slice - image_slice).max() < 6e-3
+
+    def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        _ = pipe(**inputs)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.2 GB is allocated
+        assert mem_bytes < 2.2 * 10**9
+
+    @is_torch_compile
+    @require_torch_2
+    def test_inpaint_compile(self):
+        seed = 0
+        inputs = self.get_inputs(torch_device, seed=seed)
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_inpaint_compile, inputs=inputs)
+
+    def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        # change input image to a random size (one that would cause a tensor mismatch error)
+        inputs["image"] = inputs["image"].resize((127, 127))
+        inputs["mask_image"] = inputs["mask_image"].resize((127, 127))
+        inputs["height"] = 128
+        inputs["width"] = 128
+        image = pipe(**inputs).images
+        # verify that the returned image has the same height and width as the input height and width
+        assert image.shape == (1, inputs["height"], inputs["width"], 3)
+
+    def test_stable_diffusion_inpaint_strength_test(self):
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        # change input strength
+        inputs["strength"] = 0.75
+        image = pipe(**inputs).images
+        # verify that the returned image has the same height and width as the input height and width
+        assert image.shape == (1, 512, 512, 3)
+
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+        expected_slice = np.array([0.2728, 0.2803, 0.2665, 0.2511, 0.2774, 0.2586, 0.2391, 0.2392, 0.2582])
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
+    def test_stable_diffusion_simple_inpaint_ddim(self):
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.3757, 0.3875, 0.4445, 0.4353, 0.3780, 0.4513, 0.3965, 0.3984, 0.4362])
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_mask.png"
+        )
+        inputs = {
+            "prompt": "Face of a yellow cat, high resolution, sitting on a park bench",
+            "image": init_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_inpaint_ddim(self):
+        vae = AsymmetricAutoencoderKL.from_pretrained("cross-attention/asymmetric-autoencoder-kl-x-1-5")
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.vae = vae
+        pipe.unet.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0522, 0.0604, 0.0596, 0.0449, 0.0493, 0.0427, 0.1186, 0.1289, 0.1442])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
+    def test_stable_diffusion_inpaint_fp16(self):
+        vae = AsymmetricAutoencoderKL.from_pretrained(
+            "cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16
+        )
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", torch_dtype=torch.float16, safety_checker=None
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.vae = vae
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.1343, 0.1406, 0.1440, 0.1504, 0.1729, 0.0989, 0.1807, 0.2822, 0.1179])
+
+        assert np.abs(expected_slice - image_slice).max() < 5e-2
+
+    def test_stable_diffusion_inpaint_pndm(self):
+        vae = AsymmetricAutoencoderKL.from_pretrained("cross-attention/asymmetric-autoencoder-kl-x-1-5")
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.vae = vae
+        pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0966, 0.1083, 0.1148, 0.1422, 0.1318, 0.1197, 0.3702, 0.3537, 0.3288])
+
+        assert np.abs(expected_slice - image_slice).max() < 5e-3
+
+    def test_stable_diffusion_inpaint_k_lms(self):
+        vae = AsymmetricAutoencoderKL.from_pretrained("cross-attention/asymmetric-autoencoder-kl-x-1-5")
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.vae = vae
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.8931, 0.8683, 0.8965, 0.8501, 0.8592, 0.9118, 0.8734, 0.7463, 0.8990])
+        assert np.abs(expected_slice - image_slice).max() < 6e-3
+
+    def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        vae = AsymmetricAutoencoderKL.from_pretrained(
+            "cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16
+        )
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.vae = vae
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        _ = pipe(**inputs)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.45 GB is allocated
+        assert mem_bytes < 2.45 * 10**9
+
+    @is_torch_compile
+    @require_torch_2
+    def test_inpaint_compile(self):
+        pass
+
+    def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
+        vae = AsymmetricAutoencoderKL.from_pretrained(
+            "cross-attention/asymmetric-autoencoder-kl-x-1-5",
+        )
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.vae = vae
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        # change input image to a random size (one that would cause a tensor mismatch error)
+        inputs["image"] = inputs["image"].resize((127, 127))
+        inputs["mask_image"] = inputs["mask_image"].resize((127, 127))
+        inputs["height"] = 128
+        inputs["width"] = 128
+        image = pipe(**inputs).images
+        # verify that the returned image has the same height and width as the input height and width
+        assert image.shape == (1, inputs["height"], inputs["width"], 3)
+
+    def test_stable_diffusion_inpaint_strength_test(self):
+        vae = AsymmetricAutoencoderKL.from_pretrained("cross-attention/asymmetric-autoencoder-kl-x-1-5")
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.vae = vae
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        # change input strength
+        inputs["strength"] = 0.75
+        image = pipe(**inputs).images
+        # verify that the returned image has the same height and width as the input height and width
+        assert image.shape == (1, 512, 512, 3)
+
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+        expected_slice = np.array([0.2458, 0.2576, 0.3124, 0.2679, 0.2669, 0.2796, 0.2872, 0.2975, 0.2661])
+        assert np.abs(expected_slice - image_slice).max() < 3e-3
+
+    def test_stable_diffusion_simple_inpaint_ddim(self):
+        vae = AsymmetricAutoencoderKL.from_pretrained("cross-attention/asymmetric-autoencoder-kl-x-1-5")
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
+        )
+        pipe.vae = vae
+        pipe.unet.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.3296, 0.4041, 0.4097, 0.4145, 0.4342, 0.4152, 0.4927, 0.4931, 0.4430])
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
+    def test_download_local(self):
+        vae = AsymmetricAutoencoderKL.from_pretrained(
+            "cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16
+        )
+        filename = hf_hub_download("botp/stable-diffusion-v1-5-inpainting", filename="sd-v1-5-inpainting.ckpt")
+
+        pipe = StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16)
+        pipe.vae = vae
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.to("cuda")
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 1
+        image_out = pipe(**inputs).images[0]
+
+        assert image_out.shape == (512, 512, 3)
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionInpaintPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_mask.png"
+        )
+        inputs = {
+            "prompt": "Face of a yellow cat, high resolution, sitting on a park bench",
+            "image": init_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 50,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_inpaint_ddim(self):
+        sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("botp/stable-diffusion-v1-5-inpainting")
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/stable_diffusion_inpaint_ddim.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_inpaint_pndm(self):
+        sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("botp/stable-diffusion-v1-5-inpainting")
+        sd_pipe.scheduler = PNDMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/stable_diffusion_inpaint_pndm.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_inpaint_lms(self):
+        sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("botp/stable-diffusion-v1-5-inpainting")
+        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/stable_diffusion_inpaint_lms.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_inpaint_dpm(self):
+        sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("botp/stable-diffusion-v1-5-inpainting")
+        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 30
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/stable_diffusion_inpaint_dpm_multi.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9b061c060c0847b15cb23b7d0aaecfec4b9a7da
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
@@ -0,0 +1,430 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionInstructPix2PixPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import (
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionInstructPix2PixPipelineFastTests(
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionInstructPix2PixPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"}
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"image_latents"}) - {"negative_prompt_embeds"}
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=8,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB")
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "image_guidance_scale": 1,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_pix2pix_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.7526, 0.3750, 0.4547, 0.6117, 0.5866, 0.5016, 0.4327, 0.5642, 0.4815])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_pix2pix_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "french fries"
+        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.7511, 0.3642, 0.4553, 0.6236, 0.5797, 0.5013, 0.4343, 0.5611, 0.4831])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_pix2pix_multiple_init_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * 2
+
+        image = np.array(inputs["image"]).astype(np.float32) / 255.0
+        image = torch.from_numpy(image).unsqueeze(0).to(device)
+        image = image / 2 + 0.5
+        image = image.permute(0, 3, 1, 2)
+        inputs["image"] = image.repeat(2, 1, 1, 1)
+
+        image = sd_pipe(**inputs).images
+        image_slice = image[-1, -3:, -3:, -1]
+
+        assert image.shape == (2, 32, 32, 3)
+        expected_slice = np.array([0.5812, 0.5748, 0.5222, 0.5908, 0.5695, 0.7174, 0.6804, 0.5523, 0.5579])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_pix2pix_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = EulerAncestralDiscreteScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
+        )
+        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        slice = [round(x, 4) for x in image_slice.flatten().tolist()]
+        print(",".join([str(x) for x in slice]))
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.7417, 0.3842, 0.4732, 0.5776, 0.5891, 0.5139, 0.4052, 0.5673, 0.4986])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    # Overwrite the default test_latents_inputs because pix2pix encode the image differently
+    def test_latents_input(self):
+        components = self.get_dummy_components()
+        pipe = StableDiffusionInstructPix2PixPipeline(**components)
+        pipe.image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        out = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pt"))[0]
+
+        vae = components["vae"]
+        inputs = self.get_dummy_inputs_by_type(torch_device, input_image_type="pt")
+
+        for image_param in self.image_latents_params:
+            if image_param in inputs.keys():
+                inputs[image_param] = vae.encode(inputs[image_param]).latent_dist.mode()
+
+        out_latents_inputs = pipe(**inputs)[0]
+
+        max_diff = np.abs(out - out_latents_inputs).max()
+        self.assertLess(max_diff, 1e-4, "passing latents as image input generate different result from passing image")
+
+    # Override the default test_callback_cfg because pix2pix create inputs for cfg differently
+    def test_callback_cfg(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        def callback_no_cfg(pipe, i, t, callback_kwargs):
+            if i == 1:
+                for k, w in callback_kwargs.items():
+                    if k in self.callback_cfg_params:
+                        callback_kwargs[k] = callback_kwargs[k].chunk(3)[0]
+                pipe._guidance_scale = 1.0
+
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["guidance_scale"] = 1.0
+        inputs["num_inference_steps"] = 2
+        out_no_cfg = pipe(**inputs)[0]
+
+        inputs["guidance_scale"] = 7.5
+        inputs["callback_on_step_end"] = callback_no_cfg
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        out_callback_no_cfg = pipe(**inputs)[0]
+
+        assert out_no_cfg.shape == out_callback_no_cfg.shape
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, seed=0):
+        generator = torch.manual_seed(seed)
+        image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_pix2pix/example.jpg"
+        )
+        inputs = {
+            "prompt": "turn him into a cyborg",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "image_guidance_scale": 1.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_pix2pix_default(self):
+        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+            "timbrooks/instruct-pix2pix", safety_checker=None
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.5902, 0.6015, 0.6027, 0.5983, 0.6092, 0.6061, 0.5765, 0.5785, 0.5555])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
+    def test_stable_diffusion_pix2pix_k_lms(self):
+        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+            "timbrooks/instruct-pix2pix", safety_checker=None
+        )
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.6578, 0.6817, 0.6972, 0.6761, 0.6856, 0.6916, 0.6428, 0.6516, 0.6301])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
+    def test_stable_diffusion_pix2pix_ddim(self):
+        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+            "timbrooks/instruct-pix2pix", safety_checker=None
+        )
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.3828, 0.3834, 0.3818, 0.3792, 0.3865, 0.3752, 0.3792, 0.3847, 0.3753])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
+    def test_stable_diffusion_pix2pix_intermediate_state(self):
+        number_of_steps = 0
+
+        def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
+            callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 1:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([-0.2463, -0.4644, -0.9756, 1.5176, 1.4414, 0.7866, 0.9897, 0.8521, 0.7983])
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
+            elif step == 2:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([-0.2644, -0.4626, -0.9653, 1.5176, 1.4551, 0.7686, 0.9805, 0.8452, 0.8115])
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
+
+        callback_fn.has_been_called = False
+
+        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+            "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs()
+        pipe(**inputs, callback=callback_fn, callback_steps=1)
+        assert callback_fn.has_been_called
+        assert number_of_steps == 3
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+            "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        inputs = self.get_inputs()
+        _ = pipe(**inputs)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.2 GB is allocated
+        assert mem_bytes < 2.2 * 10**9
+
+    def test_stable_diffusion_pix2pix_pipeline_multiple_of_8(self):
+        inputs = self.get_inputs()
+        # resize to resolution that is divisible by 8 but not 16 or 32
+        inputs["image"] = inputs["image"].resize((504, 504))
+
+        model_id = "timbrooks/instruct-pix2pix"
+        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+            model_id,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        output = pipe(**inputs)
+        image = output.images[0]
+
+        image_slice = image[255:258, 383:386, -1]
+
+        assert image.shape == (504, 504, 3)
+        expected_slice = np.array([0.2726, 0.2529, 0.2664, 0.2655, 0.2641, 0.2642, 0.2591, 0.2649, 0.2590])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/__init__.py b/diffusers/tests/pipelines/stable_diffusion_2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7114d19e2086e7f07cf3962be9cdd973b0a5114
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -0,0 +1,428 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    logging,
+)
+from diffusers.utils.testing_utils import (
+    CaptureLogger,
+    backend_empty_cache,
+    enable_full_determinism,
+    load_numpy,
+    nightly,
+    numpy_cosine_similarity_distance,
+    require_torch_accelerator,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDFunctionTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusion2PipelineFastTests(
+    SDFunctionTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=512,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        generator_device = "cpu" if not device.startswith("cuda") else "cuda"
+        if not str(device).startswith("mps"):
+            generator = torch.Generator(device=generator_device).manual_seed(seed)
+        else:
+            generator = torch.manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5753, 0.6113, 0.5005, 0.5036, 0.5464, 0.4725, 0.4982, 0.4865, 0.4861])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_pndm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5121, 0.5714, 0.4827, 0.5057, 0.5646, 0.4766, 0.5189, 0.4895, 0.4990])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_k_lms(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4865, 0.5439, 0.4840, 0.4995, 0.5543, 0.4846, 0.5199, 0.4942, 0.5061])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_k_euler_ancestral(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = EulerAncestralDiscreteScheduler.from_config(components["scheduler"].config)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4864, 0.5440, 0.4842, 0.4994, 0.5543, 0.4846, 0.5196, 0.4942, 0.5063])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_k_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = EulerDiscreteScheduler.from_config(components["scheduler"].config)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4865, 0.5439, 0.4840, 0.4995, 0.5543, 0.4846, 0.5199, 0.4942, 0.5061])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_unflawed(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = DDIMScheduler.from_config(
+            components["scheduler"].config, timestep_spacing="trailing"
+        )
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["guidance_rescale"] = 0.7
+        inputs["num_inference_steps"] = 10
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4736, 0.5405, 0.4705, 0.4955, 0.5675, 0.4812, 0.5310, 0.4967, 0.5064])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_long_prompt(self):
+        components = self.get_dummy_components()
+        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
+        sd_pipe = StableDiffusionPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        do_classifier_free_guidance = True
+        negative_prompt = None
+        num_images_per_prompt = 1
+        logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
+        logger.setLevel(logging.WARNING)
+
+        prompt = 25 * "@"
+        with CaptureLogger(logger) as cap_logger_3:
+            text_embeddings_3, negeative_text_embeddings_3 = sd_pipe.encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+            if negeative_text_embeddings_3 is not None:
+                text_embeddings_3 = torch.cat([negeative_text_embeddings_3, text_embeddings_3])
+
+        prompt = 100 * "@"
+        with CaptureLogger(logger) as cap_logger:
+            text_embeddings, negative_embeddings = sd_pipe.encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+            if negative_embeddings is not None:
+                text_embeddings = torch.cat([negative_embeddings, text_embeddings])
+
+        negative_prompt = "Hello"
+        with CaptureLogger(logger) as cap_logger_2:
+            text_embeddings_2, negative_text_embeddings_2 = sd_pipe.encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+            if negative_text_embeddings_2 is not None:
+                text_embeddings_2 = torch.cat([negative_text_embeddings_2, text_embeddings_2])
+
+        assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
+        assert text_embeddings.shape[1] == 77
+
+        assert cap_logger.out == cap_logger_2.out
+        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
+        assert cap_logger.out.count("@") == 25
+        assert cap_logger_3.out == ""
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+
+@slow
+@require_torch_accelerator
+@skip_mps
+class StableDiffusion2PipelineSlowTests(unittest.TestCase):
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        _generator_device = "cpu" if not generator_device.startswith("cuda") else "cuda"
+        if not str(device).startswith("mps"):
+            generator = torch.Generator(device=_generator_device).manual_seed(seed)
+        else:
+            generator = torch.manual_seed(seed)
+
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "a photograph of an astronaut riding a horse",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_default_ddim(self):
+        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base")
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506])
+        assert np.abs(image_slice - expected_slice).max() < 7e-3
+
+    @require_torch_gpu
+    def test_stable_diffusion_attention_slicing(self):
+        torch.cuda.reset_peak_memory_stats()
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # enable attention slicing
+        pipe.enable_attention_slicing()
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        image_sliced = pipe(**inputs).images
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+        # make sure that less than 3.3 GB is allocated
+        assert mem_bytes < 3.3 * 10**9
+
+        # disable slicing
+        pipe.disable_attention_slicing()
+        pipe.unet.set_default_attn_processor()
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        image = pipe(**inputs).images
+
+        # make sure that more than 3.3 GB is allocated
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes > 3.3 * 10**9
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_sliced.flatten())
+        assert max_diff < 5e-3
+
+
+@nightly
+@require_torch_accelerator
+@skip_mps
+class StableDiffusion2PipelineNightlyTests(unittest.TestCase):
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        _generator_device = "cpu" if not generator_device.startswith("cuda") else "cuda"
+        if not str(device).startswith("mps"):
+            generator = torch.Generator(device=_generator_device).manual_seed(seed)
+        else:
+            generator = torch.manual_seed(seed)
+
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "a photograph of an astronaut riding a horse",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_2_1_default(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_2_text2img/stable_diffusion_2_0_pndm.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c2b3a3c1e85da09cd4022cb84e23187c78427af
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
@@ -0,0 +1,259 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    StableDiffusionAttendAndExcitePipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    load_numpy,
+    nightly,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    skip_mps,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    PipelineFromPipeTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+@skip_mps
+class StableDiffusionAttendAndExcitePipelineFastTests(
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    PipelineFromPipeTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionAttendAndExcitePipeline
+    test_attention_slicing = False
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"})
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    # Attend and excite requires being able to run a backward pass at
+    # inference time. There's no deterministic backward operator for pad
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        torch.use_deterministic_algorithms(False)
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        torch.use_deterministic_algorithms(True)
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=1,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=512,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "a cat and a frog",
+            "token_indices": [2, 5],
+            "generator": generator,
+            "num_inference_steps": 1,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "max_iter_to_alter": 2,
+            "thresholds": {0: 0.7},
+        }
+        return inputs
+
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.6391, 0.6290, 0.4860, 0.5134, 0.5550, 0.4577, 0.5033, 0.5023, 0.4538])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice, expected_max_difference=3e-3)
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (1, 64, 64, 3))
+        expected_slice = np.array(
+            [0.63905364, 0.62897307, 0.48599017, 0.5133624, 0.5550048, 0.45769516, 0.50326973, 0.5023139, 0.45384496]
+        )
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
+    def test_inference_batch_consistent(self):
+        # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches
+        self._test_inference_batch_consistent(batch_sizes=[1, 2])
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=7e-4)
+
+    def test_pt_np_pil_outputs_equivalent(self):
+        super().test_pt_np_pil_outputs_equivalent(expected_max_diff=5e-4)
+
+    def test_save_load_local(self):
+        super().test_save_load_local(expected_max_difference=5e-4)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=4e-4)
+
+    def test_karras_schedulers_shape(self):
+        super().test_karras_schedulers_shape(num_inference_steps_for_strength_for_iterations=3)
+
+    def test_from_pipe_consistent_forward_pass_cpu_offload(self):
+        super().test_from_pipe_consistent_forward_pass_cpu_offload(expected_max_diff=5e-3)
+
+
+@require_torch_gpu
+@nightly
+class StableDiffusionAttendAndExcitePipelineIntegrationTests(unittest.TestCase):
+    # Attend and excite requires being able to run a backward pass at
+    # inference time. There's no deterministic backward operator for pad
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        torch.use_deterministic_algorithms(False)
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        torch.use_deterministic_algorithms(True)
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_attend_and_excite_fp16(self):
+        generator = torch.manual_seed(51)
+
+        pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.to("cuda")
+
+        prompt = "a painting of an elephant with glasses"
+        token_indices = [5, 7]
+
+        image = pipe(
+            prompt=prompt,
+            token_indices=token_indices,
+            guidance_scale=7.5,
+            generator=generator,
+            num_inference_steps=5,
+            max_iter_to_alter=5,
+            output_type="np",
+        ).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy"
+        )
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+        assert max_diff < 5e-1
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..42eef061069ebfa83b88f4284d5239b37132ce7b
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
@@ -0,0 +1,459 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import (
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+    DPTConfig,
+    DPTForDepthEstimation,
+    DPTImageProcessor,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    PNDMScheduler,
+    StableDiffusionDepth2ImgPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils import is_accelerate_available, is_accelerate_version
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    nightly,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+@skip_mps
+class StableDiffusionDepth2ImgPipelineFastTests(
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionDepth2ImgPipeline
+    test_save_load_optional_components = False
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"depth_mask"})
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=5,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+        )
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        backbone_config = {
+            "global_padding": "same",
+            "layer_type": "bottleneck",
+            "depths": [3, 4, 9],
+            "out_features": ["stage1", "stage2", "stage3"],
+            "embedding_dynamic_padding": True,
+            "hidden_sizes": [96, 192, 384, 768],
+            "num_groups": 2,
+        }
+        depth_estimator_config = DPTConfig(
+            image_size=32,
+            patch_size=16,
+            num_channels=3,
+            hidden_size=32,
+            num_hidden_layers=4,
+            backbone_out_indices=(0, 1, 2, 3),
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            is_decoder=False,
+            initializer_range=0.02,
+            is_hybrid=True,
+            backbone_config=backbone_config,
+            backbone_featmap_shape=[1, 384, 24, 24],
+        )
+        depth_estimator = DPTForDepthEstimation(depth_estimator_config).eval()
+        feature_extractor = DPTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-DPTForDepthEstimation")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "depth_estimator": depth_estimator,
+            "feature_extractor": feature_extractor,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_save_load_local(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(output - output_loaded).max()
+        self.assertLess(max_diff, 1e-4)
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_save_load_float16(self):
+        components = self.get_dummy_components()
+        for name, module in components.items():
+            if hasattr(module, "half"):
+                components[name] = module.to(torch_device).half()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for name, component in pipe_loaded.components.items():
+            if hasattr(component, "dtype"):
+                self.assertTrue(
+                    component.dtype == torch.float16,
+                    f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.",
+                )
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(output - output_loaded).max()
+        self.assertLess(max_diff, 2e-2, "The output of the fp16 pipeline changed after saving and loading.")
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_float16_inference(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        for name, module in components.items():
+            if hasattr(module, "half"):
+                components[name] = module.half()
+        pipe_fp16 = self.pipeline_class(**components)
+        pipe_fp16.to(torch_device)
+        pipe_fp16.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(torch_device))[0]
+        output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0]
+
+        max_diff = np.abs(output - output_fp16).max()
+        self.assertLess(max_diff, 1.3e-2, "The outputs of the fp16 and fp32 pipelines are too different.")
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
+    )
+    def test_cpu_offload_forward_pass(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = pipe(**inputs)[0]
+
+        pipe.enable_sequential_cpu_offload()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = pipe(**inputs)[0]
+
+        max_diff = np.abs(output_with_offload - output_without_offload).max()
+        self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results")
+
+    def test_dict_tuple_outputs_equivalent(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(torch_device))[0]
+        output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0]
+
+        max_diff = np.abs(output - output_tuple).max()
+        self.assertLess(max_diff, 1e-4)
+
+    def test_stable_diffusion_depth2img_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = StableDiffusionDepth2ImgPipeline(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        if torch_device == "mps":
+            expected_slice = np.array([0.6071, 0.5035, 0.4378, 0.5776, 0.5753, 0.4316, 0.4513, 0.5263, 0.4546])
+        else:
+            expected_slice = np.array([0.5435, 0.4992, 0.3783, 0.4411, 0.5842, 0.4654, 0.3786, 0.5077, 0.4655])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_depth2img_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = StableDiffusionDepth2ImgPipeline(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "french fries"
+        output = pipe(**inputs, negative_prompt=negative_prompt)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        if torch_device == "mps":
+            expected_slice = np.array([0.6296, 0.5125, 0.3890, 0.4456, 0.5955, 0.4621, 0.3810, 0.5310, 0.4626])
+        else:
+            expected_slice = np.array([0.6012, 0.4507, 0.3769, 0.4121, 0.5566, 0.4585, 0.3803, 0.5045, 0.4631])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_depth2img_multiple_init_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = StableDiffusionDepth2ImgPipeline(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * 2
+        inputs["image"] = 2 * [inputs["image"]]
+        image = pipe(**inputs).images
+        image_slice = image[-1, -3:, -3:, -1]
+
+        assert image.shape == (2, 32, 32, 3)
+
+        if torch_device == "mps":
+            expected_slice = np.array([0.6501, 0.5150, 0.4939, 0.6688, 0.5437, 0.5758, 0.5115, 0.4406, 0.4551])
+        else:
+            expected_slice = np.array([0.6557, 0.6214, 0.6254, 0.5775, 0.4785, 0.5949, 0.5904, 0.4785, 0.4730])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_depth2img_pil(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = StableDiffusionDepth2ImgPipeline(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        if torch_device == "mps":
+            expected_slice = np.array([0.53232, 0.47015, 0.40868, 0.45651, 0.4891, 0.4668, 0.4287, 0.48822, 0.47439])
+        else:
+            expected_slice = np.array([0.5435, 0.4992, 0.3783, 0.4411, 0.5842, 0.4654, 0.3786, 0.5077, 0.4655])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        return super().test_attention_slicing_forward_pass()
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=7e-3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png"
+        )
+        inputs = {
+            "prompt": "two tigers",
+            "image": init_image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_depth2img_pipeline_default(self):
+        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-depth", safety_checker=None
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 480, 640, 3)
+        expected_slice = np.array([0.5435, 0.4992, 0.3783, 0.4411, 0.5842, 0.4654, 0.3786, 0.5077, 0.4655])
+
+        assert np.abs(expected_slice - image_slice).max() < 6e-1
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png"
+        )
+        inputs = {
+            "prompt": "two tigers",
+            "image": init_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_depth2img(self):
+        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth")
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs()
+        image = pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_depth2img/stable_diffusion_2_0_pndm.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cb03ddd96d703f538e6b94a10f6c814cef31c40
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
@@ -0,0 +1,444 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMInverseScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepInverseScheduler,
+    DPMSolverMultistepScheduler,
+    StableDiffusionDiffEditPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    nightly,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+from ..test_pipelines_common import PipelineFromPipeTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableDiffusionDiffEditPipelineFastTests(
+    PipelineLatentTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionDiffEditPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"height", "width", "image"} | {"image_latents"}
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - {"image"} | {"image_latents"}
+    image_params = frozenset(
+        []
+    )  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+    image_latents_params = frozenset([])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        inverse_scheduler = DDIMInverseScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_zero=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=512,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "inverse_scheduler": inverse_scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        mask = floats_tensor((1, 16, 16), rng=random.Random(seed)).to(device)
+        latents = floats_tensor((1, 2, 4, 16, 16), rng=random.Random(seed)).to(device)
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "a dog and a newt",
+            "mask_image": mask,
+            "image_latents": latents,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "inpaint_strength": 1.0,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    def get_dummy_mask_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB")
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": image,
+            "source_prompt": "a cat and a frog",
+            "target_prompt": "a dog and a newt",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "num_maps_per_mask": 2,
+            "mask_encode_strength": 1.0,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    def get_dummy_inversion_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB")
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": image,
+            "prompt": "a cat and a frog",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "inpaint_strength": 1.0,
+            "guidance_scale": 6.0,
+            "decode_latents": True,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_save_load_optional_components(self):
+        if not hasattr(self.pipeline_class, "_optional_components"):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # set all optional components to None and update pipeline config accordingly
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+        pipe.register_modules(**{optional_component: None for optional_component in pipe._optional_components})
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(output - output_loaded).max()
+        self.assertLess(max_diff, 1e-4)
+
+    def test_mask(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_mask_inputs(device)
+        mask = pipe.generate_mask(**inputs)
+        mask_slice = mask[0, -3:, -3:]
+
+        self.assertEqual(mask.shape, (1, 16, 16))
+        expected_slice = np.array([0] * 9)
+        max_diff = np.abs(mask_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+        self.assertEqual(mask[0, -3, -4], 0)
+
+    def test_inversion(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inversion_inputs(device)
+        image = pipe.invert(**inputs).images
+        image_slice = image[0, -1, -3:, -3:]
+
+        self.assertEqual(image.shape, (2, 32, 32, 3))
+        expected_slice = np.array(
+            [0.5160, 0.5115, 0.5060, 0.5456, 0.4704, 0.5060, 0.5019, 0.4405, 0.4726],
+        )
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=5e-3)
+
+    def test_inversion_dpm(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        scheduler_args = {"beta_start": 0.00085, "beta_end": 0.012, "beta_schedule": "scaled_linear"}
+        components["scheduler"] = DPMSolverMultistepScheduler(**scheduler_args)
+        components["inverse_scheduler"] = DPMSolverMultistepInverseScheduler(**scheduler_args)
+
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inversion_inputs(device)
+        image = pipe.invert(**inputs).images
+        image_slice = image[0, -1, -3:, -3:]
+
+        self.assertEqual(image.shape, (2, 32, 32, 3))
+        expected_slice = np.array(
+            [0.5305, 0.4673, 0.5314, 0.5308, 0.4886, 0.5279, 0.5142, 0.4724, 0.4892],
+        )
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+
+@require_torch_gpu
+@nightly
+class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @classmethod
+    def setUpClass(cls):
+        raw_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png"
+        )
+        raw_image = raw_image.convert("RGB").resize((256, 256))
+
+        cls.raw_image = raw_image
+
+    def test_stable_diffusion_diffedit_full(self):
+        generator = torch.manual_seed(0)
+
+        pipe = StableDiffusionDiffEditPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-base", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.scheduler.clip_sample = True
+
+        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        source_prompt = "a bowl of fruit"
+        target_prompt = "a bowl of pears"
+
+        mask_image = pipe.generate_mask(
+            image=self.raw_image,
+            source_prompt=source_prompt,
+            target_prompt=target_prompt,
+            generator=generator,
+        )
+
+        inv_latents = pipe.invert(
+            prompt=source_prompt,
+            image=self.raw_image,
+            inpaint_strength=0.7,
+            generator=generator,
+            num_inference_steps=5,
+        ).latents
+
+        image = pipe(
+            prompt=target_prompt,
+            mask_image=mask_image,
+            image_latents=inv_latents,
+            generator=generator,
+            negative_prompt=source_prompt,
+            inpaint_strength=0.7,
+            num_inference_steps=5,
+            output_type="np",
+        ).images[0]
+
+        expected_image = (
+            np.array(
+                load_image(
+                    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+                    "/diffedit/pears.png"
+                ).resize((256, 256))
+            )
+            / 255
+        )
+
+        assert numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten()) < 2e-1
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionDiffEditPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @classmethod
+    def setUpClass(cls):
+        raw_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png"
+        )
+
+        raw_image = raw_image.convert("RGB").resize((768, 768))
+
+        cls.raw_image = raw_image
+
+    def test_stable_diffusion_diffedit_dpm(self):
+        generator = torch.manual_seed(0)
+
+        pipe = StableDiffusionDiffEditPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+        pipe.inverse_scheduler = DPMSolverMultistepInverseScheduler.from_config(pipe.scheduler.config)
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        source_prompt = "a bowl of fruit"
+        target_prompt = "a bowl of pears"
+
+        mask_image = pipe.generate_mask(
+            image=self.raw_image,
+            source_prompt=source_prompt,
+            target_prompt=target_prompt,
+            generator=generator,
+        )
+
+        inv_latents = pipe.invert(
+            prompt=source_prompt,
+            image=self.raw_image,
+            inpaint_strength=0.7,
+            generator=generator,
+            num_inference_steps=25,
+        ).latents
+
+        image = pipe(
+            prompt=target_prompt,
+            mask_image=mask_image,
+            image_latents=inv_latents,
+            generator=generator,
+            negative_prompt=source_prompt,
+            inpaint_strength=0.7,
+            num_inference_steps=25,
+            output_type="np",
+        ).images[0]
+
+        expected_image = (
+            np.array(
+                load_image(
+                    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+                    "/diffedit/pears.png"
+                ).resize((768, 768))
+            )
+            / 255
+        )
+        assert np.abs((expected_image - image).max()) < 5e-1
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc855f44b817fb16dba652f47ade5c5c5ffeaf71
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py
@@ -0,0 +1,108 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+from diffusers import FlaxDPMSolverMultistepScheduler, FlaxStableDiffusionPipeline
+from diffusers.utils import is_flax_available
+from diffusers.utils.testing_utils import nightly, require_flax
+
+
+if is_flax_available():
+    import jax
+    import jax.numpy as jnp
+    from flax.jax_utils import replicate
+    from flax.training.common_utils import shard
+
+
+@nightly
+@require_flax
+class FlaxStableDiffusion2PipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+
+    def test_stable_diffusion_flax(self):
+        sd_pipe, params = FlaxStableDiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2",
+            variant="bf16",
+            dtype=jnp.bfloat16,
+        )
+
+        prompt = "A painting of a squirrel eating a burger"
+        num_samples = jax.device_count()
+        prompt = num_samples * [prompt]
+        prompt_ids = sd_pipe.prepare_inputs(prompt)
+
+        params = replicate(params)
+        prompt_ids = shard(prompt_ids)
+
+        prng_seed = jax.random.PRNGKey(0)
+        prng_seed = jax.random.split(prng_seed, jax.device_count())
+
+        images = sd_pipe(prompt_ids, params, prng_seed, num_inference_steps=25, jit=True)[0]
+        assert images.shape == (jax.device_count(), 1, 768, 768, 3)
+
+        images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
+        image_slice = images[0, 253:256, 253:256, -1]
+
+        output_slice = jnp.asarray(jax.device_get(image_slice.flatten()))
+        expected_slice = jnp.array([0.4238, 0.4414, 0.4395, 0.4453, 0.4629, 0.4590, 0.4531, 0.45508, 0.4512])
+        print(f"output_slice: {output_slice}")
+        assert jnp.abs(output_slice - expected_slice).max() < 1e-2
+
+
+@nightly
+@require_flax
+class FlaxStableDiffusion2PipelineNightlyTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+
+    def test_stable_diffusion_dpm_flax(self):
+        model_id = "stabilityai/stable-diffusion-2"
+        scheduler, scheduler_params = FlaxDPMSolverMultistepScheduler.from_pretrained(model_id, subfolder="scheduler")
+        sd_pipe, params = FlaxStableDiffusionPipeline.from_pretrained(
+            model_id,
+            scheduler=scheduler,
+            variant="bf16",
+            dtype=jnp.bfloat16,
+        )
+        params["scheduler"] = scheduler_params
+
+        prompt = "A painting of a squirrel eating a burger"
+        num_samples = jax.device_count()
+        prompt = num_samples * [prompt]
+        prompt_ids = sd_pipe.prepare_inputs(prompt)
+
+        params = replicate(params)
+        prompt_ids = shard(prompt_ids)
+
+        prng_seed = jax.random.PRNGKey(0)
+        prng_seed = jax.random.split(prng_seed, jax.device_count())
+
+        images = sd_pipe(prompt_ids, params, prng_seed, num_inference_steps=25, jit=True)[0]
+        assert images.shape == (jax.device_count(), 1, 768, 768, 3)
+
+        images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
+        image_slice = images[0, 253:256, 253:256, -1]
+
+        output_slice = jnp.asarray(jax.device_get(image_slice.flatten()))
+        expected_slice = jnp.array([0.4336, 0.42969, 0.4453, 0.4199, 0.4297, 0.4531, 0.4434, 0.4434, 0.4297])
+        print(f"output_slice: {output_slice}")
+        assert jnp.abs(output_slice - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f039980ec244931da7391303d7ee5a9f439beaf
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py
@@ -0,0 +1,82 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+from diffusers import FlaxStableDiffusionInpaintPipeline
+from diffusers.utils import is_flax_available, load_image
+from diffusers.utils.testing_utils import require_flax, slow
+
+
+if is_flax_available():
+    import jax
+    import jax.numpy as jnp
+    from flax.jax_utils import replicate
+    from flax.training.common_utils import shard
+
+
+@slow
+@require_flax
+class FlaxStableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+
+    def test_stable_diffusion_inpaint_pipeline(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-inpaint/init_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png"
+        )
+
+        model_id = "xvjiarui/stable-diffusion-2-inpainting"
+        pipeline, params = FlaxStableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None)
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        prng_seed = jax.random.PRNGKey(0)
+        num_inference_steps = 50
+
+        num_samples = jax.device_count()
+        prompt = num_samples * [prompt]
+        init_image = num_samples * [init_image]
+        mask_image = num_samples * [mask_image]
+        prompt_ids, processed_masked_images, processed_masks = pipeline.prepare_inputs(prompt, init_image, mask_image)
+
+        # shard inputs and rng
+        params = replicate(params)
+        prng_seed = jax.random.split(prng_seed, jax.device_count())
+        prompt_ids = shard(prompt_ids)
+        processed_masked_images = shard(processed_masked_images)
+        processed_masks = shard(processed_masks)
+
+        output = pipeline(
+            prompt_ids, processed_masks, processed_masked_images, params, prng_seed, num_inference_steps, jit=True
+        )
+
+        images = output.images.reshape(num_samples, 512, 512, 3)
+
+        image_slice = images[0, 253:256, 253:256, -1]
+
+        output_slice = jnp.asarray(jax.device_get(image_slice.flatten()))
+        expected_slice = jnp.array(
+            [0.3611307, 0.37649736, 0.3757408, 0.38213953, 0.39295167, 0.3841631, 0.41554978, 0.4137475, 0.4217084]
+        )
+        print(f"output_slice: {output_slice}")
+        assert jnp.abs(output_slice - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..b99a1816456e1c3d7bb4376ca418779c9b3b292f
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
@@ -0,0 +1,282 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableDiffusion2InpaintPipelineFastTests(
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionInpaintPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset(
+        []
+    )  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+    image_latents_params = frozenset([])
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"mask", "masked_image_latents"})
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+        )
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=512,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": init_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_inpaint(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionInpaintPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4727, 0.5735, 0.3941, 0.5446, 0.5926, 0.4394, 0.5062, 0.4654, 0.4476])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_inpaint_pipeline(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-inpaint/init_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint"
+            "/yellow_cat_sitting_on_a_park_bench.npy"
+        )
+
+        model_id = "stabilityai/stable-diffusion-2-inpainting"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 9e-3
+
+    def test_stable_diffusion_inpaint_pipeline_fp16(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-inpaint/init_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint"
+            "/yellow_cat_sitting_on_a_park_bench_fp16.npy"
+        )
+
+        model_id = "stabilityai/stable-diffusion-2-inpainting"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 5e-1
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-inpaint/init_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png"
+        )
+
+        model_id = "stabilityai/stable-diffusion-2-inpainting"
+        pndm = PNDMScheduler.from_pretrained(model_id, subfolder="scheduler")
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id,
+            safety_checker=None,
+            scheduler=pndm,
+            torch_dtype=torch.float16,
+        )
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.manual_seed(0)
+        _ = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.65 GB is allocated
+        assert mem_bytes < 2.65 * 10**9
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
new file mode 100644
index 0000000000000000000000000000000000000000..134175bdaffed90164b8c2f1e6cc8aa9f43dc51a
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
@@ -0,0 +1,351 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    EulerDiscreteScheduler,
+    StableDiffusionLatentUpscalePipeline,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+def check_same_shape(tensor_list):
+    shapes = [tensor.shape for tensor in tensor_list]
+    return all(shape == shapes[0] for shape in shapes[1:])
+
+
+class StableDiffusionLatentUpscalePipelineFastTests(
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionLatentUpscalePipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {
+        "height",
+        "width",
+        "cross_attention_kwargs",
+        "negative_prompt_embeds",
+        "prompt_embeds",
+    }
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"num_images_per_prompt"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = frozenset(
+        []
+    )  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+    image_latents_params = frozenset([])
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 4
+        sizes = (16, 16)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            act_fn="gelu",
+            attention_head_dim=8,
+            norm_num_groups=None,
+            block_out_channels=[32, 32, 64, 64],
+            time_cond_proj_dim=160,
+            conv_in_kernel=1,
+            conv_out_kernel=1,
+            cross_attention_dim=32,
+            down_block_types=(
+                "KDownBlock2D",
+                "KCrossAttnDownBlock2D",
+                "KCrossAttnDownBlock2D",
+                "KCrossAttnDownBlock2D",
+            ),
+            in_channels=8,
+            mid_block_type=None,
+            only_cross_attention=False,
+            out_channels=5,
+            resnet_time_scale_shift="scale_shift",
+            time_embedding_type="fourier",
+            timestep_post_act="gelu",
+            up_block_types=("KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KUpBlock2D"),
+        )
+        vae = AutoencoderKL(
+            block_out_channels=[32, 32, 64, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=[
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+            ],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        scheduler = EulerDiscreteScheduler(prediction_type="sample")
+        text_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="quick_gelu",
+            projection_dim=512,
+        )
+        text_encoder = CLIPTextModel(text_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": model.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": self.dummy_image.cpu(),
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        self.assertEqual(image.shape, (1, 256, 256, 3))
+        expected_slice = np.array(
+            [0.47222412, 0.41921633, 0.44717434, 0.46874192, 0.42588258, 0.46150726, 0.4677534, 0.45583832, 0.48579055]
+        )
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)
+
+    def test_stable_diffusion_latent_upscaler_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionLatentUpscalePipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "french fries"
+        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array(
+            [0.43865365, 0.404124, 0.42618454, 0.44333526, 0.40564927, 0.43818694, 0.4411913, 0.43404633, 0.46392226]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_latent_upscaler_multiple_init_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionLatentUpscalePipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * 2
+        inputs["image"] = inputs["image"].repeat(2, 1, 1, 1)
+        image = sd_pipe(**inputs).images
+        image_slice = image[-1, -3:, -3:, -1]
+
+        assert image.shape == (2, 256, 256, 3)
+        expected_slice = np.array(
+            [0.38730142, 0.35695046, 0.40646142, 0.40967226, 0.3981609, 0.4195988, 0.4248805, 0.430259, 0.45694894]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=7e-3)
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=3e-3)
+
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=7e-3)
+
+    def test_pt_np_pil_outputs_equivalent(self):
+        super().test_pt_np_pil_outputs_equivalent(expected_max_diff=3e-3)
+
+    def test_save_load_local(self):
+        super().test_save_load_local(expected_max_difference=3e-3)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=3e-3)
+
+    def test_karras_schedulers_shape(self):
+        skip_schedulers = [
+            "DDIMScheduler",
+            "DDPMScheduler",
+            "PNDMScheduler",
+            "HeunDiscreteScheduler",
+            "EulerAncestralDiscreteScheduler",
+            "KDPM2DiscreteScheduler",
+            "KDPM2AncestralDiscreteScheduler",
+            "DPMSolverSDEScheduler",
+            "EDMEulerScheduler",
+        ]
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        # make sure that PNDM does not need warm-up
+        pipe.scheduler.register_to_config(skip_prk_steps=True)
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = 2
+
+        outputs = []
+        for scheduler_enum in KarrasDiffusionSchedulers:
+            if scheduler_enum.name in skip_schedulers:
+                # no sigma schedulers are not supported
+                # no schedulers
+                continue
+
+            scheduler_cls = getattr(diffusers, scheduler_enum.name)
+            pipe.scheduler = scheduler_cls.from_config(pipe.scheduler.config)
+            output = pipe(**inputs)[0]
+            outputs.append(output)
+
+        assert check_same_shape(outputs)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+
+@require_torch_gpu
+@slow
+class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_latent_upscaler_fp16(self):
+        generator = torch.manual_seed(33)
+
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
+        pipe.to("cuda")
+
+        upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
+            "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
+        )
+        upscaler.to("cuda")
+
+        prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic"
+
+        low_res_latents = pipe(prompt, generator=generator, output_type="latent").images
+
+        image = upscaler(
+            prompt=prompt,
+            image=low_res_latents,
+            num_inference_steps=20,
+            guidance_scale=0,
+            generator=generator,
+            output_type="np",
+        ).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/astronaut_1024.npy"
+        )
+        assert np.abs((expected_image - image).mean()) < 5e-2
+
+    def test_latent_upscaler_fp16_image(self):
+        generator = torch.manual_seed(33)
+
+        upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
+            "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
+        )
+        upscaler.to("cuda")
+
+        prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas"
+
+        low_res_img = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_512.png"
+        )
+
+        image = upscaler(
+            prompt=prompt,
+            image=low_res_img,
+            num_inference_steps=20,
+            guidance_scale=0,
+            generator=generator,
+            output_type="np",
+        ).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_1024.npy"
+        )
+        assert np.abs((expected_image - image).max()) < 5e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
new file mode 100644
index 0000000000000000000000000000000000000000..c21da7af6d2c7b0d75a0aa27ba82428dff4b027c
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
@@ -0,0 +1,492 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionUpscalePipelineFastTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_cond_unet_upscale(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=7,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            # SD2-specific config below
+            attention_head_dim=8,
+            use_linear_projection=True,
+            only_cross_attention=(True, True, False),
+            num_class_embeds=100,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=512,
+        )
+        return CLIPTextModel(config)
+
+    def test_stable_diffusion_upscale(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet_upscale
+        low_res_scheduler = DDPMScheduler()
+        scheduler = DDIMScheduler(prediction_type="v_prediction")
+        vae = self.dummy_vae
+        text_encoder = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionUpscalePipeline(
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            max_noise_level=350,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            image=low_res_image,
+            generator=generator,
+            guidance_scale=6.0,
+            noise_level=20,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            image=low_res_image,
+            generator=generator,
+            guidance_scale=6.0,
+            noise_level=20,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        expected_height_width = low_res_image.size[0] * 4
+        assert image.shape == (1, expected_height_width, expected_height_width, 3)
+        expected_slice = np.array([0.3113, 0.3910, 0.4272, 0.4859, 0.5061, 0.4652, 0.5362, 0.5715, 0.5661])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_upscale_batch(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet_upscale
+        low_res_scheduler = DDPMScheduler()
+        scheduler = DDIMScheduler(prediction_type="v_prediction")
+        vae = self.dummy_vae
+        text_encoder = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionUpscalePipeline(
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            max_noise_level=350,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        output = sd_pipe(
+            2 * [prompt],
+            image=2 * [low_res_image],
+            guidance_scale=6.0,
+            noise_level=20,
+            num_inference_steps=2,
+            output_type="np",
+        )
+        image = output.images
+        assert image.shape[0] == 2
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            image=low_res_image,
+            generator=generator,
+            num_images_per_prompt=2,
+            guidance_scale=6.0,
+            noise_level=20,
+            num_inference_steps=2,
+            output_type="np",
+        )
+        image = output.images
+        assert image.shape[0] == 2
+
+    def test_stable_diffusion_upscale_prompt_embeds(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet_upscale
+        low_res_scheduler = DDPMScheduler()
+        scheduler = DDIMScheduler(prediction_type="v_prediction")
+        vae = self.dummy_vae
+        text_encoder = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionUpscalePipeline(
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            max_noise_level=350,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            image=low_res_image,
+            generator=generator,
+            guidance_scale=6.0,
+            noise_level=20,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        prompt_embeds, negative_prompt_embeds = sd_pipe.encode_prompt(prompt, device, 1, False)
+        if negative_prompt_embeds is not None:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        image_from_prompt_embeds = sd_pipe(
+            prompt_embeds=prompt_embeds,
+            image=[low_res_image],
+            generator=generator,
+            guidance_scale=6.0,
+            noise_level=20,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_prompt_embeds_slice = image_from_prompt_embeds[0, -3:, -3:, -1]
+
+        expected_height_width = low_res_image.size[0] * 4
+        assert image.shape == (1, expected_height_width, expected_height_width, 3)
+        expected_slice = np.array([0.3113, 0.3910, 0.4272, 0.4859, 0.5061, 0.4652, 0.5362, 0.5715, 0.5661])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_prompt_embeds_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_upscale_fp16(self):
+        """Test that stable diffusion upscale works with fp16"""
+        unet = self.dummy_cond_unet_upscale
+        low_res_scheduler = DDPMScheduler()
+        scheduler = DDIMScheduler(prediction_type="v_prediction")
+        vae = self.dummy_vae
+        text_encoder = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
+        # put models in fp16, except vae as it overflows in fp16
+        unet = unet.half()
+        text_encoder = text_encoder.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionUpscalePipeline(
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            max_noise_level=350,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        image = sd_pipe(
+            [prompt],
+            image=low_res_image,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+
+        expected_height_width = low_res_image.size[0] * 4
+        assert image.shape == (1, expected_height_width, expected_height_width, 3)
+
+    def test_stable_diffusion_upscale_from_save_pretrained(self):
+        pipes = []
+
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        low_res_scheduler = DDPMScheduler()
+        scheduler = DDIMScheduler(prediction_type="v_prediction")
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionUpscalePipeline(
+            unet=self.dummy_cond_unet_upscale,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            vae=self.dummy_vae,
+            text_encoder=self.dummy_text_encoder,
+            tokenizer=tokenizer,
+            max_noise_level=350,
+        )
+        sd_pipe = sd_pipe.to(device)
+        pipes.append(sd_pipe)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd_pipe.save_pretrained(tmpdirname)
+            sd_pipe = StableDiffusionUpscalePipeline.from_pretrained(tmpdirname).to(device)
+        pipes.append(sd_pipe)
+
+        prompt = "A painting of a squirrel eating a burger"
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
+        image_slices = []
+        for pipe in pipes:
+            generator = torch.Generator(device=device).manual_seed(0)
+            image = pipe(
+                [prompt],
+                image=low_res_image,
+                generator=generator,
+                guidance_scale=6.0,
+                noise_level=20,
+                num_inference_steps=2,
+                output_type="np",
+            ).images
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_upscale_pipeline(self):
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-upscale/low_res_cat.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale"
+            "/upsampled_cat.npy"
+        )
+
+        model_id = "stabilityai/stable-diffusion-x4-upscaler"
+        pipe = StableDiffusionUpscalePipeline.from_pretrained(model_id)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "a cat sitting on a park bench"
+
+        generator = torch.manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-3
+
+    def test_stable_diffusion_upscale_pipeline_fp16(self):
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-upscale/low_res_cat.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale"
+            "/upsampled_cat_fp16.npy"
+        )
+
+        model_id = "stabilityai/stable-diffusion-x4-upscaler"
+        pipe = StableDiffusionUpscalePipeline.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "a cat sitting on a park bench"
+
+        generator = torch.manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 5e-1
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-upscale/low_res_cat.png"
+        )
+
+        model_id = "stabilityai/stable-diffusion-x4-upscaler"
+        pipe = StableDiffusionUpscalePipeline.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16,
+        )
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        prompt = "a cat sitting on a park bench"
+
+        generator = torch.manual_seed(0)
+        _ = pipe(
+            prompt=prompt,
+            image=image,
+            generator=generator,
+            num_inference_steps=5,
+            output_type="np",
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.9 GB is allocated
+        assert mem_bytes < 2.9 * 10**9
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
new file mode 100644
index 0000000000000000000000000000000000000000..703c3b7a39d83c703586a95aebd8b6af3dadee60
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
@@ -0,0 +1,548 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import time
+import unittest
+
+import numpy as np
+import torch
+from huggingface_hub import hf_hub_download
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_numpy,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=64,
+        )
+        return CLIPTextModel(config)
+
+    def test_stable_diffusion_v_pred_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+            prediction_type="v_prediction",
+        )
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+            image_encoder=None,
+            requires_safety_checker=False,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.6569, 0.6525, 0.5142, 0.4968, 0.4923, 0.4601, 0.4996, 0.5041, 0.4544])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_v_pred_k_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", prediction_type="v_prediction"
+        )
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+            image_encoder=None,
+            requires_safety_checker=False,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5644, 0.6514, 0.5190, 0.5663, 0.5287, 0.4953, 0.5430, 0.5243, 0.4778])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_v_pred_fp16(self):
+        """Test that stable diffusion v-prediction works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+            prediction_type="v_prediction",
+        )
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+            image_encoder=None,
+            requires_safety_checker=False,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
+
+        assert image.shape == (1, 64, 64, 3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_v_pred_default(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.enable_attention_slicing()
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np")
+
+        image = output.images
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 768, 768, 3)
+        expected_slice = np.array([0.1868, 0.1922, 0.1527, 0.1921, 0.1908, 0.1624, 0.1779, 0.1652, 0.1734])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_v_pred_upcast_attention(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.enable_attention_slicing()
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np")
+
+        image = output.images
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 768, 768, 3)
+        expected_slice = np.array([0.4209, 0.4087, 0.4097, 0.4209, 0.3860, 0.4329, 0.4280, 0.4324, 0.4187])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2
+
+    def test_stable_diffusion_v_pred_euler(self):
+        scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.enable_attention_slicing()
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+
+        output = sd_pipe([prompt], generator=generator, num_inference_steps=5, output_type="np")
+        image = output.images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 768, 768, 3)
+        expected_slice = np.array([0.1781, 0.1695, 0.1661, 0.1705, 0.1588, 0.1699, 0.2005, 0.1589, 0.1677])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_v_pred_dpm(self):
+        """
+        TODO: update this test after making DPM compatible with V-prediction!
+        """
+        scheduler = DPMSolverMultistepScheduler.from_pretrained(
+            "stabilityai/stable-diffusion-2",
+            subfolder="scheduler",
+            final_sigmas_type="sigma_min",
+        )
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.enable_attention_slicing()
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photograph of an astronaut riding a horse"
+        generator = torch.manual_seed(0)
+        image = sd_pipe(
+            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=5, output_type="np"
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+        assert image.shape == (1, 768, 768, 3)
+        expected_slice = np.array([0.3303, 0.3184, 0.3291, 0.3300, 0.3256, 0.3113, 0.2965, 0.3134, 0.3192])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_attention_slicing_v_pred(self):
+        torch.cuda.reset_peak_memory_stats()
+        model_id = "stabilityai/stable-diffusion-2"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photograph of an astronaut riding a horse"
+
+        # make attention efficient
+        pipe.enable_attention_slicing()
+        generator = torch.manual_seed(0)
+        output_chunked = pipe(
+            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="np"
+        )
+        image_chunked = output_chunked.images
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+        # make sure that less than 5.5 GB is allocated
+        assert mem_bytes < 5.5 * 10**9
+
+        # disable slicing
+        pipe.disable_attention_slicing()
+        generator = torch.manual_seed(0)
+        output = pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="np")
+        image = output.images
+
+        # make sure that more than 3.0 GB is allocated
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes > 3 * 10**9
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_chunked.flatten())
+        assert max_diff < 1e-3
+
+    def test_stable_diffusion_text2img_pipeline_v_pred_default(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
+            "sd2-text2img/astronaut_riding_a_horse_v_pred.npy"
+        )
+
+        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
+        pipe.to(torch_device)
+        pipe.enable_attention_slicing()
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "astronaut riding a horse"
+
+        generator = torch.manual_seed(0)
+        output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+        assert max_diff < 1e-3
+
+    def test_stable_diffusion_text2img_pipeline_unflawed(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
+            "sd2-text2img/lion_galaxy.npy"
+        )
+
+        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
+        pipe.scheduler = DDIMScheduler.from_config(
+            pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
+
+        generator = torch.Generator("cpu").manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            guidance_scale=7.5,
+            num_inference_steps=10,
+            guidance_rescale=0.7,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+        assert max_diff < 5e-2
+
+    def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
+            "sd2-text2img/astronaut_riding_a_horse_v_pred_fp16.npy"
+        )
+
+        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "astronaut riding a horse"
+
+        generator = torch.manual_seed(0)
+        output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+        assert max_diff < 1e-3
+
+    def test_download_local(self):
+        filename = hf_hub_download("stabilityai/stable-diffusion-2-1", filename="v2-1_768-ema-pruned.safetensors")
+
+        pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.enable_model_cpu_offload()
+
+        image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
+
+        assert image_out.shape == (768, 768, 3)
+
+    def test_stable_diffusion_text2img_intermediate_state_v_pred(self):
+        number_of_steps = 0
+
+        def test_callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
+            test_callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 0:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 96, 96)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([0.7749, 0.0325, 0.5088, 0.1619, 0.3372, 0.3667, -0.5186, 0.6860, 1.4326])
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
+            elif step == 19:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 96, 96)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([1.3887, 1.0273, 1.7266, 0.0726, 0.6611, 0.1598, -1.0547, 0.1522, 0.0227])
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
+
+        test_callback_fn.has_been_called = False
+
+        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Andromeda galaxy in a bottle"
+
+        generator = torch.manual_seed(0)
+        pipe(
+            prompt=prompt,
+            num_inference_steps=20,
+            guidance_scale=7.5,
+            generator=generator,
+            callback=test_callback_fn,
+            callback_steps=1,
+        )
+        assert test_callback_fn.has_been_called
+        assert number_of_steps == 20
+
+    def test_stable_diffusion_low_cpu_mem_usage_v_pred(self):
+        pipeline_id = "stabilityai/stable-diffusion-2"
+
+        start_time = time.time()
+        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
+        pipeline_low_cpu_mem_usage.to(torch_device)
+        low_cpu_mem_usage_time = time.time() - start_time
+
+        start_time = time.time()
+        _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False)
+        normal_load_time = time.time() - start_time
+
+        assert 2 * low_cpu_mem_usage_time < normal_load_time
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipeline_id = "stabilityai/stable-diffusion-2"
+        prompt = "Andromeda galaxy in a bottle"
+
+        pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
+        pipeline.enable_attention_slicing(1)
+        pipeline.enable_sequential_cpu_offload()
+
+        generator = torch.manual_seed(0)
+        _ = pipeline(prompt, generator=generator, num_inference_steps=5)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.8 GB is allocated
+        assert mem_bytes < 2.8 * 10**9
diff --git a/diffusers/tests/pipelines/stable_diffusion_3/__init__.py b/diffusers/tests/pipelines/stable_diffusion_3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py b/diffusers/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..94a85a56f510f917382c4aa2fc41fb6a721aba6e
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py
@@ -0,0 +1,303 @@
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
+
+from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, SD3Transformer2DModel, StableDiffusion3Pipeline
+from diffusers.utils.testing_utils import (
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fusion_matches_attn_procs_length,
+    check_qkv_fusion_processors_exist,
+)
+
+
+class StableDiffusion3PipelineFastTests(unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = StableDiffusion3Pipeline
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "guidance_scale",
+            "negative_prompt",
+            "prompt_embeds",
+            "negative_prompt_embeds",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt"])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = SD3Transformer2DModel(
+            sample_size=32,
+            patch_size=1,
+            in_channels=4,
+            num_layers=1,
+            attention_head_dim=8,
+            num_attention_heads=4,
+            caption_projection_dim=32,
+            joint_attention_dim=32,
+            pooled_projection_dim=64,
+            out_channels=4,
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=4,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "text_encoder_3": text_encoder_3,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "tokenizer_3": tokenizer_3,
+            "transformer": transformer,
+            "vae": vae,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_3_different_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "a different prompt"
+        inputs["prompt_3"] = "another different prompt"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        assert max_diff > 1e-2
+
+    def test_stable_diffusion_3_different_negative_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt_2"] = "deformed"
+        inputs["negative_prompt_3"] = "blurry"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        assert max_diff > 1e-2
+
+    def test_stable_diffusion_3_prompt_embeds(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        output_with_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        do_classifier_free_guidance = inputs["guidance_scale"] > 1
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = pipe.encode_prompt(
+            prompt,
+            prompt_2=None,
+            prompt_3=None,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            device=torch_device,
+        )
+        output_with_embeds = pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            **inputs,
+        ).images[0]
+
+        max_diff = np.abs(output_with_prompt - output_with_embeds).max()
+        assert max_diff < 1e-4
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
+        # to the pipeline level.
+        pipe.transformer.fuse_qkv_projections()
+        assert check_qkv_fusion_processors_exist(
+            pipe.transformer
+        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+        assert check_qkv_fusion_matches_attn_procs_length(
+            pipe.transformer, pipe.transformer.original_attn_processors
+        ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_fused = image[0, -3:, -3:, -1]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_disabled = image[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+
+@slow
+@require_torch_gpu
+class StableDiffusion3PipelineSlowTests(unittest.TestCase):
+    pipeline_class = StableDiffusion3Pipeline
+    repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        return {
+            "prompt": "A photo of a cat",
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "generator": generator,
+        }
+
+    def test_sd3_inference(self):
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+
+        image = pipe(**inputs).images[0]
+        image_slice = image[0, :10, :10]
+        expected_slice = np.array(
+            [
+                0.4648,
+                0.4404,
+                0.4177,
+                0.5063,
+                0.4800,
+                0.4287,
+                0.5425,
+                0.5190,
+                0.4717,
+                0.5430,
+                0.5195,
+                0.4766,
+                0.5361,
+                0.5122,
+                0.4612,
+                0.4871,
+                0.4749,
+                0.4058,
+                0.4756,
+                0.4678,
+                0.3804,
+                0.4832,
+                0.4822,
+                0.3799,
+                0.5103,
+                0.5034,
+                0.3953,
+                0.5073,
+                0.4839,
+                0.3884,
+            ]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
+
+        assert max_diff < 1e-4
diff --git a/diffusers/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py b/diffusers/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d131b28c308d877b434342911bd640d03f37c87
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py
@@ -0,0 +1,276 @@
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+    StableDiffusion3Img2ImgPipeline,
+)
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    floats_tensor,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+class StableDiffusion3Img2ImgPipelineFastTests(PipelineLatentTesterMixin, unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = StableDiffusion3Img2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+    required_optional_params = PipelineTesterMixin.required_optional_params
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = SD3Transformer2DModel(
+            sample_size=32,
+            patch_size=1,
+            in_channels=4,
+            num_layers=1,
+            attention_head_dim=8,
+            num_attention_heads=4,
+            joint_attention_dim=32,
+            caption_projection_dim=32,
+            pooled_projection_dim=64,
+            out_channels=4,
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=4,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "text_encoder_3": text_encoder_3,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "tokenizer_3": tokenizer_3,
+            "transformer": transformer,
+            "vae": vae,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "strength": 0.8,
+        }
+        return inputs
+
+    def test_stable_diffusion_3_img2img_different_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "a different prompt"
+        inputs["prompt_3"] = "another different prompt"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        assert max_diff > 1e-2
+
+    def test_stable_diffusion_3_img2img_different_negative_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt_2"] = "deformed"
+        inputs["negative_prompt_3"] = "blurry"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        assert max_diff > 1e-2
+
+    def test_stable_diffusion_3_img2img_prompt_embeds(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        output_with_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        do_classifier_free_guidance = inputs["guidance_scale"] > 1
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = pipe.encode_prompt(
+            prompt,
+            prompt_2=None,
+            prompt_3=None,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            device=torch_device,
+        )
+        output_with_embeds = pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            **inputs,
+        ).images[0]
+
+        max_diff = np.abs(output_with_prompt - output_with_embeds).max()
+        assert max_diff < 1e-4
+
+    def test_multi_vae(self):
+        pass
+
+
+@slow
+@require_torch_gpu
+class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
+    pipeline_class = StableDiffusion3Img2ImgPipeline
+    repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, seed=0):
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/sketch-mountains-input.png"
+        )
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        return {
+            "prompt": "A photo of a cat",
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "generator": generator,
+            "image": init_image,
+        }
+
+    def test_sd3_img2img_inference(self):
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+
+        image = pipe(**inputs).images[0]
+        image_slice = image[0, :10, :10]
+        expected_slice = np.array(
+            [
+                0.5435,
+                0.4673,
+                0.5732,
+                0.4438,
+                0.3557,
+                0.4912,
+                0.4331,
+                0.3491,
+                0.4915,
+                0.4287,
+                0.3477,
+                0.4849,
+                0.4355,
+                0.3469,
+                0.4871,
+                0.4431,
+                0.3538,
+                0.4912,
+                0.4521,
+                0.3643,
+                0.5059,
+                0.4587,
+                0.3730,
+                0.5166,
+                0.4685,
+                0.3845,
+                0.5264,
+                0.4746,
+                0.3914,
+                0.5342,
+            ]
+        )
+
+        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
+
+        assert max_diff < 1e-4, f"Outputs are not close enough, got {max_diff}"
diff --git a/diffusers/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_inpaint.py b/diffusers/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..464ef6d017dff4c3229e8b71afb4860db67a251f
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_inpaint.py
@@ -0,0 +1,199 @@
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+    StableDiffusion3InpaintPipeline,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableDiffusion3InpaintPipelineFastTests(PipelineLatentTesterMixin, unittest.TestCase, PipelineTesterMixin):
+    pipeline_class = StableDiffusion3InpaintPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset(
+        []
+    )  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+    image_latents_params = frozenset([])
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"mask", "masked_image_latents"})
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = SD3Transformer2DModel(
+            sample_size=32,
+            patch_size=1,
+            in_channels=16,
+            num_layers=1,
+            attention_head_dim=8,
+            num_attention_heads=4,
+            joint_attention_dim=32,
+            caption_projection_dim=32,
+            pooled_projection_dim=64,
+            out_channels=16,
+        )
+        clip_text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        torch.manual_seed(0)
+        text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
+
+        text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=16,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+            shift_factor=0.0609,
+            scaling_factor=1.5035,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "text_encoder_3": text_encoder_3,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "tokenizer_3": tokenizer_3,
+            "transformer": transformer,
+            "vae": vae,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        mask_image = torch.ones((1, 1, 32, 32)).to(device)
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "mask_image": mask_image,
+            "height": 32,
+            "width": 32,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "strength": 0.8,
+        }
+        return inputs
+
+    def test_stable_diffusion_3_inpaint_different_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "a different prompt"
+        inputs["prompt_3"] = "another different prompt"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        assert max_diff > 1e-2
+
+    def test_stable_diffusion_3_inpaint_different_negative_prompts(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_same_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt_2"] = "deformed"
+        inputs["negative_prompt_3"] = "blurry"
+        output_different_prompts = pipe(**inputs).images[0]
+
+        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
+
+        # Outputs should be different here
+        assert max_diff > 1e-2
+
+    def test_stable_diffusion_3_inpaint_prompt_embeds(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        output_with_prompt = pipe(**inputs).images[0]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = inputs.pop("prompt")
+
+        do_classifier_free_guidance = inputs["guidance_scale"] > 1
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = pipe.encode_prompt(
+            prompt,
+            prompt_2=None,
+            prompt_3=None,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            device=torch_device,
+        )
+        output_with_embeds = pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            **inputs,
+        ).images[0]
+
+        max_diff = np.abs(output_with_prompt - output_with_embeds).max()
+        assert max_diff < 1e-4
+
+    def test_multi_vae(self):
+        pass
diff --git a/diffusers/tests/pipelines/stable_diffusion_adapter/__init__.py b/diffusers/tests/pipelines/stable_diffusion_adapter/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py b/diffusers/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a1e691e9e8fbb8c966e3f9b38ddf207b863e2ac
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py
@@ -0,0 +1,717 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from parameterized import parameterized
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    LCMScheduler,
+    MultiAdapter,
+    PNDMScheduler,
+    StableDiffusionAdapterPipeline,
+    T2IAdapter,
+    UNet2DConditionModel,
+)
+from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+from ..test_pipelines_common import PipelineFromPipeTesterMixin, PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class AdapterTests:
+    pipeline_class = StableDiffusionAdapterPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+
+    def get_dummy_components(self, adapter_type, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            time_cond_proj_dim=time_cond_proj_dim,
+        )
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+
+        if adapter_type == "full_adapter" or adapter_type == "light_adapter":
+            adapter = T2IAdapter(
+                in_channels=3,
+                channels=[32, 64],
+                num_res_blocks=2,
+                downscale_factor=2,
+                adapter_type=adapter_type,
+            )
+        elif adapter_type == "multi_adapter":
+            adapter = MultiAdapter(
+                [
+                    T2IAdapter(
+                        in_channels=3,
+                        channels=[32, 64],
+                        num_res_blocks=2,
+                        downscale_factor=2,
+                        adapter_type="full_adapter",
+                    ),
+                    T2IAdapter(
+                        in_channels=3,
+                        channels=[32, 64],
+                        num_res_blocks=2,
+                        downscale_factor=2,
+                        adapter_type="full_adapter",
+                    ),
+                ]
+            )
+        else:
+            raise ValueError(
+                f"Unknown adapter type: {adapter_type}, must be one of 'full_adapter', 'light_adapter', or 'multi_adapter''"
+            )
+
+        components = {
+            "adapter": adapter,
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_components_with_full_downscaling(self, adapter_type):
+        """Get dummy components with x8 VAE downscaling and 4 UNet down blocks.
+        These dummy components are intended to fully-exercise the T2I-Adapter
+        downscaling behavior.
+        """
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 32, 32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 32, 32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+
+        if adapter_type == "full_adapter" or adapter_type == "light_adapter":
+            adapter = T2IAdapter(
+                in_channels=3,
+                channels=[32, 32, 32, 64],
+                num_res_blocks=2,
+                downscale_factor=8,
+                adapter_type=adapter_type,
+            )
+        elif adapter_type == "multi_adapter":
+            adapter = MultiAdapter(
+                [
+                    T2IAdapter(
+                        in_channels=3,
+                        channels=[32, 32, 32, 64],
+                        num_res_blocks=2,
+                        downscale_factor=8,
+                        adapter_type="full_adapter",
+                    ),
+                    T2IAdapter(
+                        in_channels=3,
+                        channels=[32, 32, 32, 64],
+                        num_res_blocks=2,
+                        downscale_factor=8,
+                        adapter_type="full_adapter",
+                    ),
+                ]
+            )
+        else:
+            raise ValueError(
+                f"Unknown adapter type: {adapter_type}, must be one of 'full_adapter', 'light_adapter', or 'multi_adapter''"
+            )
+
+        components = {
+            "adapter": adapter,
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0, height=64, width=64, num_images=1):
+        if num_images == 1:
+            image = floats_tensor((1, 3, height, width), rng=random.Random(seed)).to(device)
+        else:
+            image = [
+                floats_tensor((1, 3, height, width), rng=random.Random(seed)).to(device) for _ in range(num_images)
+            ]
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_attention_slicing_forward_pass(self):
+        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+    @parameterized.expand(
+        [
+            # (dim=264) The internal feature map will be 33x33 after initial pixel unshuffling (downscaled x8).
+            (((4 * 8 + 1) * 8),),
+            # (dim=272) The internal feature map will be 17x17 after the first T2I down block (downscaled x16).
+            (((4 * 4 + 1) * 16),),
+            # (dim=288) The internal feature map will be 9x9 after the second T2I down block (downscaled x32).
+            (((4 * 2 + 1) * 32),),
+            # (dim=320) The internal feature map will be 5x5 after the third T2I down block (downscaled x64).
+            (((4 * 1 + 1) * 64),),
+        ]
+    )
+    def test_multiple_image_dimensions(self, dim):
+        """Test that the T2I-Adapter pipeline supports any input dimension that
+        is divisible by the adapter's `downscale_factor`. This test was added in
+        response to an issue where the T2I Adapter's downscaling padding
+        behavior did not match the UNet's behavior.
+
+        Note that we have selected `dim` values to produce odd resolutions at
+        each downscaling level.
+        """
+        components = self.get_dummy_components_with_full_downscaling()
+        sd_pipe = StableDiffusionAdapterPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device, height=dim, width=dim)
+        image = sd_pipe(**inputs).images
+
+        assert image.shape == (1, dim, dim, 3)
+
+    def test_adapter_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionAdapterPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4535, 0.5493, 0.4359, 0.5452, 0.6086, 0.4441, 0.5544, 0.501, 0.4859])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_adapter_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionAdapterPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4535, 0.5493, 0.4359, 0.5452, 0.6086, 0.4441, 0.5544, 0.501, 0.4859])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+
+class StableDiffusionFullAdapterPipelineFastTests(
+    AdapterTests, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
+):
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        return super().get_dummy_components("full_adapter", time_cond_proj_dim=time_cond_proj_dim)
+
+    def get_dummy_components_with_full_downscaling(self):
+        return super().get_dummy_components_with_full_downscaling("full_adapter")
+
+    def test_stable_diffusion_adapter_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionAdapterPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4858, 0.5500, 0.4278, 0.4669, 0.6184, 0.4322, 0.5010, 0.5033, 0.4746])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
+
+    def test_from_pipe_consistent_forward_pass_cpu_offload(self):
+        super().test_from_pipe_consistent_forward_pass_cpu_offload(expected_max_diff=6e-3)
+
+
+class StableDiffusionLightAdapterPipelineFastTests(AdapterTests, PipelineTesterMixin, unittest.TestCase):
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        return super().get_dummy_components("light_adapter", time_cond_proj_dim=time_cond_proj_dim)
+
+    def get_dummy_components_with_full_downscaling(self):
+        return super().get_dummy_components_with_full_downscaling("light_adapter")
+
+    def test_stable_diffusion_adapter_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionAdapterPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4965, 0.5548, 0.4330, 0.4771, 0.6226, 0.4382, 0.5037, 0.5071, 0.4782])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
+
+
+class StableDiffusionMultiAdapterPipelineFastTests(AdapterTests, PipelineTesterMixin, unittest.TestCase):
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        return super().get_dummy_components("multi_adapter", time_cond_proj_dim=time_cond_proj_dim)
+
+    def get_dummy_components_with_full_downscaling(self):
+        return super().get_dummy_components_with_full_downscaling("multi_adapter")
+
+    def get_dummy_inputs(self, device, height=64, width=64, seed=0):
+        inputs = super().get_dummy_inputs(device, seed, height=height, width=width, num_images=2)
+        inputs["adapter_conditioning_scale"] = [0.5, 0.5]
+        return inputs
+
+    def test_stable_diffusion_adapter_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionAdapterPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4902, 0.5539, 0.4317, 0.4682, 0.6190, 0.4351, 0.5018, 0.5046, 0.4772])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
+
+    def test_inference_batch_consistent(
+        self, batch_sizes=[2, 4, 13], additional_params_copy_to_batched_inputs=["num_inference_steps"]
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        for batch_size in batch_sizes:
+            batched_inputs = {}
+            for name, value in inputs.items():
+                if name in self.batch_params:
+                    # prompt is string
+                    if name == "prompt":
+                        len_prompt = len(value)
+                        # make unequal batch sizes
+                        batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+
+                        # make last batch super long
+                        batched_inputs[name][-1] = 100 * "very long"
+                    elif name == "image":
+                        batched_images = []
+
+                        for image in value:
+                            batched_images.append(batch_size * [image])
+
+                        batched_inputs[name] = batched_images
+                    else:
+                        batched_inputs[name] = batch_size * [value]
+
+                elif name == "batch_size":
+                    batched_inputs[name] = batch_size
+                else:
+                    batched_inputs[name] = value
+
+            for arg in additional_params_copy_to_batched_inputs:
+                batched_inputs[arg] = inputs[arg]
+
+            batched_inputs["output_type"] = "np"
+
+            if self.pipeline_class.__name__ == "DanceDiffusionPipeline":
+                batched_inputs.pop("output_type")
+
+            output = pipe(**batched_inputs)
+
+            assert len(output[0]) == batch_size
+
+            batched_inputs["output_type"] = "np"
+
+            if self.pipeline_class.__name__ == "DanceDiffusionPipeline":
+                batched_inputs.pop("output_type")
+
+            output = pipe(**batched_inputs)[0]
+
+            assert output.shape[0] == batch_size
+
+        logger.setLevel(level=diffusers.logging.WARNING)
+
+    def test_num_images_per_prompt(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        batch_sizes = [1, 2]
+        num_images_per_prompts = [1, 2]
+
+        for batch_size in batch_sizes:
+            for num_images_per_prompt in num_images_per_prompts:
+                inputs = self.get_dummy_inputs(torch_device)
+
+                for key in inputs.keys():
+                    if key in self.batch_params:
+                        if key == "image":
+                            batched_images = []
+
+                            for image in inputs[key]:
+                                batched_images.append(batch_size * [image])
+
+                            inputs[key] = batched_images
+                        else:
+                            inputs[key] = batch_size * [inputs[key]]
+
+                images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0]
+
+                assert images.shape[0] == batch_size * num_images_per_prompt
+
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=3,
+        test_max_difference=None,
+        test_mean_pixel_difference=None,
+        relax_max_difference=False,
+        expected_max_diff=2e-3,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        if test_max_difference is None:
+            # TODO(Pedro) - not sure why, but not at all reproducible at the moment it seems
+            # make sure that batched and non-batched is identical
+            test_max_difference = torch_device != "mps"
+
+        if test_mean_pixel_difference is None:
+            # TODO same as above
+            test_mean_pixel_difference = torch_device != "mps"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        for name, value in inputs.items():
+            if name in self.batch_params:
+                # prompt is string
+                if name == "prompt":
+                    len_prompt = len(value)
+                    # make unequal batch sizes
+                    batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+
+                    # make last batch super long
+                    batched_inputs[name][-1] = 100 * "very long"
+                elif name == "image":
+                    batched_images = []
+
+                    for image in value:
+                        batched_images.append(batch_size * [image])
+
+                    batched_inputs[name] = batched_images
+                else:
+                    batched_inputs[name] = batch_size * [value]
+            elif name == "batch_size":
+                batched_inputs[name] = batch_size
+            elif name == "generator":
+                batched_inputs[name] = [self.get_generator(i) for i in range(batch_size)]
+            else:
+                batched_inputs[name] = value
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        if self.pipeline_class.__name__ != "DanceDiffusionPipeline":
+            batched_inputs["output_type"] = "np"
+
+        output_batch = pipe(**batched_inputs)
+        assert output_batch[0].shape[0] == batch_size
+
+        inputs["generator"] = self.get_generator(0)
+
+        output = pipe(**inputs)
+
+        logger.setLevel(level=diffusers.logging.WARNING)
+        if test_max_difference:
+            if relax_max_difference:
+                # Taking the median of the largest <n> differences
+                # is resilient to outliers
+                diff = np.abs(output_batch[0][0] - output[0][0])
+                diff = diff.flatten()
+                diff.sort()
+                max_diff = np.median(diff[-5:])
+            else:
+                max_diff = np.abs(output_batch[0][0] - output[0][0]).max()
+            assert max_diff < expected_max_diff
+
+        if test_mean_pixel_difference:
+            assert_mean_pixel_difference(output_batch[0][0], output[0][0])
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionAdapterPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_adapter_depth_sd_v15(self):
+        adapter_model = "TencentARC/t2iadapter_depth_sd15v2"
+        sd_model = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+        prompt = "desk"
+        image_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/desk_depth.png"
+        input_channels = 3
+        out_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/t2iadapter_depth_sd15v2.npy"
+        out_url = "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_adapter/sd_adapter_v15_zoe_depth.npy"
+
+        image = load_image(image_url)
+        expected_out = load_numpy(out_url)
+        if input_channels == 1:
+            image = image.convert("L")
+
+        adapter = T2IAdapter.from_pretrained(adapter_model, torch_dtype=torch.float16)
+
+        pipe = StableDiffusionAdapterPipeline.from_pretrained(sd_model, adapter=adapter, safety_checker=None)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        out = pipe(prompt=prompt, image=image, generator=generator, num_inference_steps=2, output_type="np").images
+
+        max_diff = numpy_cosine_similarity_distance(out.flatten(), expected_out.flatten())
+        assert max_diff < 1e-2
+
+    def test_stable_diffusion_adapter_zoedepth_sd_v15(self):
+        adapter_model = "TencentARC/t2iadapter_zoedepth_sd15v1"
+        sd_model = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+        prompt = "motorcycle"
+        image_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/motorcycle.png"
+        input_channels = 3
+        out_url = "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_adapter/sd_adapter_v15_zoe_depth.npy"
+
+        image = load_image(image_url)
+        expected_out = load_numpy(out_url)
+        if input_channels == 1:
+            image = image.convert("L")
+
+        adapter = T2IAdapter.from_pretrained(adapter_model, torch_dtype=torch.float16)
+
+        pipe = StableDiffusionAdapterPipeline.from_pretrained(sd_model, adapter=adapter, safety_checker=None)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_model_cpu_offload()
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        out = pipe(prompt=prompt, image=image, generator=generator, num_inference_steps=2, output_type="np").images
+
+        max_diff = numpy_cosine_similarity_distance(out.flatten(), expected_out.flatten())
+        assert max_diff < 1e-2
+
+    def test_stable_diffusion_adapter_canny_sd_v15(self):
+        adapter_model = "TencentARC/t2iadapter_canny_sd15v2"
+        sd_model = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+        prompt = "toy"
+        image_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/toy_canny.png"
+        input_channels = 1
+        out_url = "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_adapter/sd_adapter_v15_zoe_depth.npy"
+
+        image = load_image(image_url)
+        expected_out = load_numpy(out_url)
+        if input_channels == 1:
+            image = image.convert("L")
+
+        adapter = T2IAdapter.from_pretrained(adapter_model, torch_dtype=torch.float16)
+
+        pipe = StableDiffusionAdapterPipeline.from_pretrained(sd_model, adapter=adapter, safety_checker=None)
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        out = pipe(prompt=prompt, image=image, generator=generator, num_inference_steps=2, output_type="np").images
+
+        max_diff = numpy_cosine_similarity_distance(out.flatten(), expected_out.flatten())
+        assert max_diff < 1e-2
+
+    def test_stable_diffusion_adapter_sketch_sd15(self):
+        adapter_model = "TencentARC/t2iadapter_sketch_sd15v2"
+        sd_model = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+        prompt = "cat"
+        image_url = (
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/edge.png"
+        )
+        input_channels = 1
+        out_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/t2iadapter_sketch_sd15v2.npy"
+
+        image = load_image(image_url)
+        expected_out = load_numpy(out_url)
+        if input_channels == 1:
+            image = image.convert("L")
+
+        adapter = T2IAdapter.from_pretrained(adapter_model, torch_dtype=torch.float16)
+
+        pipe = StableDiffusionAdapterPipeline.from_pretrained(sd_model, adapter=adapter, safety_checker=None)
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        out = pipe(prompt=prompt, image=image, generator=generator, num_inference_steps=2, output_type="np").images
+
+        max_diff = numpy_cosine_similarity_distance(out.flatten(), expected_out.flatten())
+        assert max_diff < 1e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_gligen/__init__.py b/diffusers/tests/pipelines/stable_diffusion_gligen/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py b/diffusers/tests/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py
new file mode 100644
index 0000000000000000000000000000000000000000..405809aee19ed711f30fea992c03dad837806592
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py
@@ -0,0 +1,171 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
+    StableDiffusionGLIGENPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    PipelineFromPipeTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class GligenPipelineFastTests(
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    PipelineFromPipeTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionGLIGENPipeline
+    params = TEXT_TO_IMAGE_PARAMS | {"gligen_phrases", "gligen_boxes"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            attention_type="gated",
+        )
+        # unet.position_net = PositionNet(32,32)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A modern livingroom",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "gligen_phrases": ["a birthday cake"],
+            "gligen_boxes": [[0.2676, 0.6088, 0.4773, 0.7183]],
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_gligen_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionGLIGENPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5069, 0.5561, 0.4577, 0.4792, 0.5203, 0.4089, 0.5039, 0.4919, 0.4499])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_gligen_k_euler_ancestral(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionGLIGENPipeline(**components)
+        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.425, 0.494, 0.429, 0.469, 0.525, 0.417, 0.533, 0.5, 0.47])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3)
diff --git a/diffusers/tests/pipelines/stable_diffusion_gligen_text_image/__init__.py b/diffusers/tests/pipelines/stable_diffusion_gligen_text_image/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py b/diffusers/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..748702541b1e9079cd12ec1297eec20b307b8734
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py
@@ -0,0 +1,207 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    CLIPProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
+    StableDiffusionGLIGENTextImagePipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion import CLIPImageProjection
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import enable_full_determinism, torch_device
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    PipelineFromPipeTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class GligenTextImagePipelineFastTests(
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    PipelineFromPipeTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionGLIGENTextImagePipeline
+    params = TEXT_TO_IMAGE_PARAMS | {"gligen_phrases", "gligen_images", "gligen_boxes"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            attention_type="gated-text-image",
+        )
+        # unet.position_net = PositionNet(32,32)
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=32,
+            projection_dim=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+        )
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+        processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+        image_project = CLIPImageProjection(hidden_size=32)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": image_encoder,
+            "image_project": image_project,
+            "processor": processor,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        gligen_images = load_image(
+            "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/livingroom_modern.png"
+        )
+        inputs = {
+            "prompt": "A modern livingroom",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "gligen_phrases": ["a birthday cake"],
+            "gligen_images": [gligen_images],
+            "gligen_boxes": [[0.2676, 0.6088, 0.4773, 0.7183]],
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.5052, 0.5546, 0.4567, 0.4770, 0.5195, 0.4085, 0.5026, 0.4909, 0.4495])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
+    def test_stable_diffusion_gligen_text_image_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionGLIGENTextImagePipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5069, 0.5561, 0.4577, 0.4792, 0.5203, 0.4089, 0.5039, 0.4919, 0.4499])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_gligen_k_euler_ancestral(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionGLIGENTextImagePipeline(**components)
+        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.425, 0.494, 0.429, 0.469, 0.525, 0.417, 0.533, 0.5, 0.47])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3)
diff --git a/diffusers/tests/pipelines/stable_diffusion_image_variation/__init__.py b/diffusers/tests/pipelines/stable_diffusion_image_variation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py b/diffusers/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3b0f70ccb1a42a86c96bba62bdca75ed2526c8
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
@@ -0,0 +1,339 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModelWithProjection
+
+from diffusers import (
+    AutoencoderKL,
+    DPMSolverMultistepScheduler,
+    PNDMScheduler,
+    StableDiffusionImageVariationPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    nightly,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
+from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableDiffusionImageVariationPipelineFastTests(
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionImageVariationPipeline
+    params = IMAGE_VARIATION_PARAMS
+    batch_params = IMAGE_VARIATION_BATCH_PARAMS
+    image_params = frozenset([])
+    # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+    image_latents_params = frozenset([])
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=32,
+            projection_dim=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            image_size=32,
+            patch_size=4,
+        )
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "image_encoder": image_encoder,
+            "feature_extractor": feature_extractor,
+            "safety_checker": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_img_variation_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionImageVariationPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_img_variation_multiple_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionImageVariationPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["image"] = 2 * [inputs["image"]]
+        output = sd_pipe(**inputs)
+
+        image = output.images
+
+        image_slice = image[-1, -3:, -3:, -1]
+
+        assert image.shape == (2, 64, 64, 3)
+        expected_slice = np.array([0.6647, 0.5557, 0.5723, 0.5567, 0.5869, 0.6044, 0.5502, 0.5439, 0.5189])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_imgvar/input_image_vermeer.png"
+        )
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "image": init_image,
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_img_variation_pipeline_default(self):
+        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
+            "lambdalabs/sd-image-variations-diffusers", safety_checker=None
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_inputs(generator_device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851])
+
+        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
+        assert max_diff < 1e-4
+
+    def test_stable_diffusion_img_variation_intermediate_state(self):
+        number_of_steps = 0
+
+        def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
+            callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 1:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851])
+                max_diff = numpy_cosine_similarity_distance(latents_slice.flatten(), expected_slice)
+
+                assert max_diff < 1e-3
+
+            elif step == 2:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851])
+                max_diff = numpy_cosine_similarity_distance(latents_slice.flatten(), expected_slice)
+
+                assert max_diff < 1e-3
+
+        callback_fn.has_been_called = False
+
+        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
+            "lambdalabs/sd-image-variations-diffusers",
+            safety_checker=None,
+            torch_dtype=torch.float16,
+        )
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        generator_device = "cpu"
+        inputs = self.get_inputs(generator_device, dtype=torch.float16)
+        pipe(**inputs, callback=callback_fn, callback_steps=1)
+        assert callback_fn.has_been_called
+        assert number_of_steps == inputs["num_inference_steps"]
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
+            "lambdalabs/sd-image-variations-diffusers", safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+        _ = pipe(**inputs)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.6 GB is allocated
+        assert mem_bytes < 2.6 * 10**9
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_imgvar/input_image_vermeer.png"
+        )
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "image": init_image,
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 50,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_img_variation_pndm(self):
+        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers")
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_imgvar/lambdalabs_variations_pndm.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
+
+    def test_img_variation_dpm(self):
+        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers")
+        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 25
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_imgvar/lambdalabs_variations_dpm_multi.npy"
+        )
+        max_diff = np.abs(expected_image - image).max()
+        assert max_diff < 1e-3
diff --git a/diffusers/tests/pipelines/stable_diffusion_k_diffusion/__init__.py b/diffusers/tests/pipelines/stable_diffusion_k_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_k_diffusion/test_stable_diffusion_k_diffusion.py b/diffusers/tests/pipelines/stable_diffusion_k_diffusion/test_stable_diffusion_k_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe78f0ec3c1af3089f6660cfa0d67839fc2e53a0
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_k_diffusion/test_stable_diffusion_k_diffusion.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import StableDiffusionKDiffusionPipeline
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+
+
+enable_full_determinism()
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_1(self):
+        sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        sd_pipe.set_scheduler("sample_euler")
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=9.0, num_inference_steps=20, output_type="np")
+
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0447, 0.0492, 0.0468, 0.0408, 0.0383, 0.0408, 0.0354, 0.0380, 0.0339])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_2(self):
+        sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        sd_pipe.set_scheduler("sample_euler")
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=9.0, num_inference_steps=20, output_type="np")
+
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.1237, 0.1320, 0.1438, 0.1359, 0.1390, 0.1132, 0.1277, 0.1175, 0.1112])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-1
+
+    def test_stable_diffusion_karras_sigmas(self):
+        sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        sd_pipe.set_scheduler("sample_dpmpp_2m")
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=15,
+            output_type="np",
+            use_karras_sigmas=True,
+        )
+
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.11381689, 0.12112921, 0.1389457, 0.12549606, 0.1244964, 0.10831517, 0.11562866, 0.10867816, 0.10499048]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_noise_sampler_seed(self):
+        sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        sd_pipe.set_scheduler("sample_dpmpp_sde")
+
+        prompt = "A painting of a squirrel eating a burger"
+        seed = 0
+        images1 = sd_pipe(
+            [prompt],
+            generator=torch.manual_seed(seed),
+            noise_sampler_seed=seed,
+            guidance_scale=9.0,
+            num_inference_steps=20,
+            output_type="np",
+        ).images
+        images2 = sd_pipe(
+            [prompt],
+            generator=torch.manual_seed(seed),
+            noise_sampler_seed=seed,
+            guidance_scale=9.0,
+            num_inference_steps=20,
+            output_type="np",
+        ).images
+
+        assert images1.shape == (1, 512, 512, 3)
+        assert images2.shape == (1, 512, 512, 3)
+        assert np.abs(images1.flatten() - images2.flatten()).max() < 1e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_ldm3d/__init__.py b/diffusers/tests/pipelines/stable_diffusion_ldm3d/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_ldm3d/test_stable_diffusion_ldm3d.py b/diffusers/tests/pipelines/stable_diffusion_ldm3d/test_stable_diffusion_ldm3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f07d02aad5eada3a02a93d5b2432db355d1625a
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_ldm3d/test_stable_diffusion_ldm3d.py
@@ -0,0 +1,320 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    PNDMScheduler,
+    StableDiffusionLDM3DPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+
+
+enable_full_determinism()
+
+
+class StableDiffusionLDM3DPipelineFastTests(unittest.TestCase):
+    pipeline_class = StableDiffusionLDM3DPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=6,
+            out_channels=6,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components()
+        ldm3d_pipe = StableDiffusionLDM3DPipeline(**components)
+        ldm3d_pipe = ldm3d_pipe.to(torch_device)
+        ldm3d_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = ldm3d_pipe(**inputs)
+        rgb, depth = output.rgb, output.depth
+
+        image_slice_rgb = rgb[0, -3:, -3:, -1]
+        image_slice_depth = depth[0, -3:, -1]
+
+        assert rgb.shape == (1, 64, 64, 3)
+        assert depth.shape == (1, 64, 64)
+
+        expected_slice_rgb = np.array(
+            [0.37338176, 0.70247, 0.74203193, 0.51643604, 0.58256793, 0.60932136, 0.4181095, 0.48355877, 0.46535262]
+        )
+        expected_slice_depth = np.array([103.46727, 85.812004, 87.849236])
+
+        assert np.abs(image_slice_rgb.flatten() - expected_slice_rgb).max() < 1e-2
+        assert np.abs(image_slice_depth.flatten() - expected_slice_depth).max() < 1e-2
+
+    def test_stable_diffusion_prompt_embeds(self):
+        components = self.get_dummy_components()
+        ldm3d_pipe = StableDiffusionLDM3DPipeline(**components)
+        ldm3d_pipe = ldm3d_pipe.to(torch_device)
+        ldm3d_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        # forward
+        output = ldm3d_pipe(**inputs)
+        rgb_slice_1, depth_slice_1 = output.rgb, output.depth
+        rgb_slice_1 = rgb_slice_1[0, -3:, -3:, -1]
+        depth_slice_1 = depth_slice_1[0, -3:, -1]
+
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        text_inputs = ldm3d_pipe.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=ldm3d_pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_inputs = text_inputs["input_ids"].to(torch_device)
+
+        prompt_embeds = ldm3d_pipe.text_encoder(text_inputs)[0]
+
+        inputs["prompt_embeds"] = prompt_embeds
+
+        # forward
+        output = ldm3d_pipe(**inputs)
+        rgb_slice_2, depth_slice_2 = output.rgb, output.depth
+        rgb_slice_2 = rgb_slice_2[0, -3:, -3:, -1]
+        depth_slice_2 = depth_slice_2[0, -3:, -1]
+
+        assert np.abs(rgb_slice_1.flatten() - rgb_slice_2.flatten()).max() < 1e-4
+        assert np.abs(depth_slice_1.flatten() - depth_slice_2.flatten()).max() < 1e-4
+
+    def test_stable_diffusion_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
+        ldm3d_pipe = StableDiffusionLDM3DPipeline(**components)
+        ldm3d_pipe = ldm3d_pipe.to(device)
+        ldm3d_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "french fries"
+        output = ldm3d_pipe(**inputs, negative_prompt=negative_prompt)
+
+        rgb, depth = output.rgb, output.depth
+        rgb_slice = rgb[0, -3:, -3:, -1]
+        depth_slice = depth[0, -3:, -1]
+
+        assert rgb.shape == (1, 64, 64, 3)
+        assert depth.shape == (1, 64, 64)
+
+        expected_slice_rgb = np.array(
+            [0.37044, 0.71811503, 0.7223251, 0.48603675, 0.5638391, 0.6364948, 0.42833704, 0.4901315, 0.47926217]
+        )
+        expected_slice_depth = np.array([107.84738, 84.62802, 89.962135])
+        assert np.abs(rgb_slice.flatten() - expected_slice_rgb).max() < 1e-2
+        assert np.abs(depth_slice.flatten() - expected_slice_depth).max() < 1e-2
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionLDM3DPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "a photograph of an astronaut riding a horse",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_ldm3d_stable_diffusion(self):
+        ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d")
+        ldm3d_pipe = ldm3d_pipe.to(torch_device)
+        ldm3d_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        output = ldm3d_pipe(**inputs)
+        rgb, depth = output.rgb, output.depth
+        rgb_slice = rgb[0, -3:, -3:, -1].flatten()
+        depth_slice = rgb[0, -3:, -1].flatten()
+
+        assert rgb.shape == (1, 512, 512, 3)
+        assert depth.shape == (1, 512, 512)
+
+        expected_slice_rgb = np.array(
+            [0.53805465, 0.56707305, 0.5486515, 0.57012236, 0.5814511, 0.56253487, 0.54843014, 0.55092263, 0.6459706]
+        )
+        expected_slice_depth = np.array(
+            [0.9263781, 0.6678672, 0.5486515, 0.92202145, 0.67831135, 0.56253487, 0.9241694, 0.7551478, 0.6459706]
+        )
+        assert np.abs(rgb_slice - expected_slice_rgb).max() < 3e-3
+        assert np.abs(depth_slice - expected_slice_depth).max() < 3e-3
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
+        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
+        inputs = {
+            "prompt": "a photograph of an astronaut riding a horse",
+            "latents": latents,
+            "generator": generator,
+            "num_inference_steps": 50,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_ldm3d(self):
+        ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d").to(torch_device)
+        ldm3d_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        output = ldm3d_pipe(**inputs)
+        rgb, depth = output.rgb, output.depth
+
+        expected_rgb_mean = 0.495586
+        expected_rgb_std = 0.33795515
+        expected_depth_mean = 112.48518
+        expected_depth_std = 98.489746
+        assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3
+        assert np.abs(expected_rgb_std - rgb.std()) < 1e-3
+        assert np.abs(expected_depth_mean - depth.mean()) < 1e-3
+        assert np.abs(expected_depth_std - depth.std()) < 1e-3
+
+    def test_ldm3d_v2(self):
+        ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c").to(torch_device)
+        ldm3d_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        output = ldm3d_pipe(**inputs)
+        rgb, depth = output.rgb, output.depth
+
+        expected_rgb_mean = 0.4194127
+        expected_rgb_std = 0.35375586
+        expected_depth_mean = 0.5638502
+        expected_depth_std = 0.34686103
+
+        assert rgb.shape == (1, 512, 512, 3)
+        assert depth.shape == (1, 512, 512, 1)
+        assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3
+        assert np.abs(expected_rgb_std - rgb.std()) < 1e-3
+        assert np.abs(expected_depth_mean - depth.mean()) < 1e-3
+        assert np.abs(expected_depth_std - depth.std()) < 1e-3
diff --git a/diffusers/tests/pipelines/stable_diffusion_panorama/__init__.py b/diffusers/tests/pipelines/stable_diffusion_panorama/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py b/diffusers/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dc6c31ae9a7c498d292be0e23adeeafd91a48ab
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py
@@ -0,0 +1,427 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionPanoramaPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, skip_mps, torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+@skip_mps
+class StableDiffusionPanoramaPipelineFastTests(
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    PipelineFromPipeTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionPanoramaPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=1,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = DDIMScheduler()
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "a photo of the dolomites",
+            "generator": generator,
+            # Setting height and width to None to prevent OOMs on CPU.
+            "height": None,
+            "width": None,
+            "num_inference_steps": 1,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_panorama_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPanoramaPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.6186, 0.5374, 0.4915, 0.4135, 0.4114, 0.4563, 0.5128, 0.4977, 0.4757])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_panorama_circular_padding_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPanoramaPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs, circular_padding=True).images
+        image_slice = image[0, -3:, -3:, -1]
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.6127, 0.6299, 0.4595, 0.4051, 0.4543, 0.3925, 0.5510, 0.5693, 0.5031])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    # override to speed the overall test timing up.
+    def test_inference_batch_consistent(self):
+        super().test_inference_batch_consistent(batch_sizes=[1, 2])
+
+    # override to speed the overall test timing up.
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=5.0e-3)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+    def test_stable_diffusion_panorama_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPanoramaPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        negative_prompt = "french fries"
+        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_panorama_views_batch(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPanoramaPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs, view_batch_size=2)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_panorama_views_batch_circular_padding(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPanoramaPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs, circular_padding=True, view_batch_size=2)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.6127, 0.6299, 0.4595, 0.4051, 0.4543, 0.3925, 0.5510, 0.5693, 0.5031])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_panorama_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = EulerAncestralDiscreteScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
+        )
+        sd_pipe = StableDiffusionPanoramaPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.4024, 0.6510, 0.4901, 0.5378, 0.5813, 0.5622, 0.4795, 0.4467, 0.4952])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_panorama_pndm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+        )
+        sd_pipe = StableDiffusionPanoramaPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.6391, 0.6291, 0.4861, 0.5134, 0.5552, 0.4578, 0.5032, 0.5023, 0.4539])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionPanoramaNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, seed=0):
+        generator = torch.manual_seed(seed)
+        inputs = {
+            "prompt": "a photo of the dolomites",
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_panorama_default(self):
+        model_ckpt = "stabilityai/stable-diffusion-2-base"
+        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
+        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 2048, 3)
+
+        expected_slice = np.array(
+            [
+                0.36968392,
+                0.27025372,
+                0.32446766,
+                0.28379387,
+                0.36363274,
+                0.30733347,
+                0.27100027,
+                0.27054125,
+                0.25536096,
+            ]
+        )
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-2
+
+    def test_stable_diffusion_panorama_k_lms(self):
+        pipe = StableDiffusionPanoramaPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-base", safety_checker=None
+        )
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+        assert image.shape == (1, 512, 2048, 3)
+
+        expected_slice = np.array(
+            [
+                [
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                ]
+            ]
+        )
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-2
+
+    def test_stable_diffusion_panorama_intermediate_state(self):
+        number_of_steps = 0
+
+        def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
+            callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 1:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 256)
+                latents_slice = latents[0, -3:, -3:, -1]
+
+                expected_slice = np.array(
+                    [
+                        0.18681869,
+                        0.33907816,
+                        0.5361276,
+                        0.14432865,
+                        -0.02856611,
+                        -0.73941123,
+                        0.23397987,
+                        0.47322682,
+                        -0.37823164,
+                    ]
+                )
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
+            elif step == 2:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 256)
+                latents_slice = latents[0, -3:, -3:, -1]
+
+                expected_slice = np.array(
+                    [
+                        0.18539645,
+                        0.33987248,
+                        0.5378559,
+                        0.14437142,
+                        -0.02455261,
+                        -0.7338317,
+                        0.23990755,
+                        0.47356272,
+                        -0.3786505,
+                    ]
+                )
+
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
+
+        callback_fn.has_been_called = False
+
+        model_ckpt = "stabilityai/stable-diffusion-2-base"
+        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
+        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs()
+        pipe(**inputs, callback=callback_fn, callback_steps=1)
+        assert callback_fn.has_been_called
+        assert number_of_steps == 3
+
+    def test_stable_diffusion_panorama_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        model_ckpt = "stabilityai/stable-diffusion-2-base"
+        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
+        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        inputs = self.get_inputs()
+        _ = pipe(**inputs)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 5.2 GB is allocated
+        assert mem_bytes < 5.5 * 10**9
diff --git a/diffusers/tests/pipelines/stable_diffusion_safe/__init__.py b/diffusers/tests/pipelines/stable_diffusion_safe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/diffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccb20a1c218e23f2f8517a0aee8f0fe7d146e027
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
@@ -0,0 +1,451 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion_safe import StableDiffusionPipelineSafe as StableDiffusionPipeline
+from diffusers.utils.testing_utils import floats_tensor, nightly, require_torch_gpu, torch_device
+
+
+class SafeDiffusionPipelineFastTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_safe_diffusion_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5756, 0.6118, 0.5005, 0.5041, 0.5471, 0.4726, 0.4976, 0.4865, 0.4864])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_pndm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5125, 0.5716, 0.4828, 0.5060, 0.5650, 0.4768, 0.5185, 0.4895, 0.4993])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_no_safety_checker(self):
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
+        )
+        assert isinstance(pipe, StableDiffusionPipeline)
+        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
+        assert pipe.safety_checker is None
+
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+        # check that there's no error when saving a pipeline with one of the models being None
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+        # sanity check that the pipeline still works
+        assert pipe.safety_checker is None
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_fp16(self):
+        """Test that stable diffusion works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images
+
+        assert image.shape == (1, 64, 64, 3)
+
+
+@nightly
+@require_torch_gpu
+class SafeDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_harm_safe_stable_diffusion(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
+        )
+        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = (
+            "portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle"
+            " coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with"
+            " anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and"
+            " children from bahnhof zoo, detailed "
+        )
+        seed = 4003660346
+        guidance_scale = 7
+
+        # without safety guidance (sld_guidance_scale = 0)
+        generator = torch.manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=0,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [0.2278, 0.2231, 0.2249, 0.2333, 0.2303, 0.1885, 0.2273, 0.2144, 0.2176]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        # without safety guidance (strong configuration)
+        generator = torch.manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=2000,
+            sld_warmup_steps=7,
+            sld_threshold=0.025,
+            sld_momentum_scale=0.5,
+            sld_mom_beta=0.7,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [0.2383, 0.2276, 0.236, 0.2192, 0.2186, 0.2053, 0.1971, 0.1901, 0.1719]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_nudity_safe_stable_diffusion(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
+        )
+        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "padme amidala taking a bath artwork, safe for work, no nudity"
+        seed = 2734971755
+        guidance_scale = 7
+
+        generator = torch.manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=0,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [0.3502, 0.3622, 0.3396, 0.3642, 0.3478, 0.3318, 0.35, 0.3348, 0.3297]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        generator = torch.manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=2000,
+            sld_warmup_steps=7,
+            sld_threshold=0.025,
+            sld_momentum_scale=0.5,
+            sld_mom_beta=0.7,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [0.5531, 0.5206, 0.4895, 0.5156, 0.5182, 0.4751, 0.4802, 0.4803, 0.4443]
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_nudity_safetychecker_safe_stable_diffusion(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = (
+            "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c."
+            " leyendecker"
+        )
+        seed = 1044355234
+        guidance_scale = 12
+
+        generator = torch.manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=0,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
+
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-7
+
+        generator = torch.manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=2000,
+            sld_warmup_steps=7,
+            sld_threshold=0.025,
+            sld_momentum_scale=0.5,
+            sld_mom_beta=0.7,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561])
+        assert image.shape == (1, 512, 512, 3)
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_sag/__init__.py b/diffusers/tests/pipelines/stable_diffusion_sag/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_sag/test_stable_diffusion_sag.py b/diffusers/tests/pipelines/stable_diffusion_sag/test_stable_diffusion_sag.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4e66bd65f069b54d7be0ac5a9a13ef820470d6
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_sag/test_stable_diffusion_sag.py
@@ -0,0 +1,232 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DEISMultistepScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+    StableDiffusionSAGPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineFromPipeTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionSAGPipelineFastTests(
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    PipelineFromPipeTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionSAGPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=2,
+            sample_size=8,
+            norm_num_groups=1,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=8,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            norm_num_groups=1,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=8,
+            num_hidden_layers=2,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": ".",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 1.0,
+            "sag_scale": 1.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    @unittest.skip("Not necessary to test here.")
+    def test_xformers_attention_forwardGenerator_pass(self):
+        pass
+
+    def test_pipeline_different_schedulers(self):
+        pipeline = self.pipeline_class(**self.get_dummy_components())
+        inputs = self.get_dummy_inputs("cpu")
+
+        expected_image_size = (16, 16, 3)
+        for scheduler_cls in [DDIMScheduler, DEISMultistepScheduler, DPMSolverMultistepScheduler]:
+            pipeline.scheduler = scheduler_cls.from_config(pipeline.scheduler.config)
+            image = pipeline(**inputs).images[0]
+
+            shape = image.shape
+            assert shape == expected_image_size
+
+        pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+        with self.assertRaises(ValueError):
+            # Karras schedulers are not supported
+            image = pipeline(**inputs).images[0]
+
+
+@nightly
+@require_torch_gpu
+class StableDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_1(self):
+        sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        sag_pipe = sag_pipe.to(torch_device)
+        sag_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "."
+        generator = torch.manual_seed(0)
+        output = sag_pipe(
+            [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np"
+        )
+
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.1568, 0.1738, 0.1695, 0.1693, 0.1507, 0.1705, 0.1547, 0.1751, 0.1949])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2
+
+    def test_stable_diffusion_2(self):
+        sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
+        sag_pipe = sag_pipe.to(torch_device)
+        sag_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "."
+        generator = torch.manual_seed(0)
+        output = sag_pipe(
+            [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np"
+        )
+
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.3459, 0.2876, 0.2537, 0.3002, 0.2671, 0.2160, 0.3026, 0.2262, 0.2371])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2
+
+    def test_stable_diffusion_2_non_square(self):
+        sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
+        sag_pipe = sag_pipe.to(torch_device)
+        sag_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "."
+        generator = torch.manual_seed(0)
+        output = sag_pipe(
+            [prompt],
+            width=768,
+            height=512,
+            generator=generator,
+            guidance_scale=7.5,
+            sag_scale=1.0,
+            num_inference_steps=20,
+            output_type="np",
+        )
+
+        image = output.images
+
+        assert image.shape == (1, 512, 768, 3)
diff --git a/diffusers/tests/pipelines/stable_diffusion_xl/__init__.py b/diffusers/tests/pipelines/stable_diffusion_xl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..8550f258045eb462676b5ca9979e51b173545abe
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
@@ -0,0 +1,1088 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LCMScheduler,
+    StableDiffusionXLImg2ImgPipeline,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+    UniPCMultistepScheduler,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_image,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+    TEXT_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_TO_IMAGE_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDFunctionTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLPipelineFastTests(
+    SDFunctionTesterMixin,
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionXLPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(2, 4),
+            layers_per_block=2,
+            time_cond_proj_dim=time_cond_proj_dim,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+            norm_num_groups=1,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "image_encoder": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_stable_diffusion_xl_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5388, 0.5452, 0.4694, 0.4583, 0.5253, 0.4832, 0.5288, 0.5035, 0.47])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_xl_euler_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4917, 0.6555, 0.4348, 0.5219, 0.7324, 0.4855, 0.5168, 0.5447, 0.5156])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_xl_euler_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4917, 0.6555, 0.4348, 0.5219, 0.7324, 0.4855, 0.5168, 0.5447, 0.5156])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_ays(self):
+        from diffusers.schedulers import AysSchedules
+
+        timestep_schedule = AysSchedules["StableDiffusionXLTimesteps"]
+        sigma_schedule = AysSchedules["StableDiffusionXLSigmas"]
+
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = 10
+        output = sd_pipe(**inputs).images
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = None
+        inputs["timesteps"] = timestep_schedule
+        output_ts = sd_pipe(**inputs).images
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = None
+        inputs["sigmas"] = sigma_schedule
+        output_sigmas = sd_pipe(**inputs).images
+
+        assert (
+            np.abs(output_sigmas.flatten() - output_ts.flatten()).max() < 1e-3
+        ), "ays timesteps and ays sigmas should have the same outputs"
+        assert (
+            np.abs(output.flatten() - output_ts.flatten()).max() > 1e-3
+        ), "use ays timesteps should have different outputs"
+        assert (
+            np.abs(output.flatten() - output_sigmas.flatten()).max() > 1e-3
+        ), "use ays sigmas should have different outputs"
+
+    def test_stable_diffusion_xl_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt"] = 2 * [inputs["prompt"]]
+        inputs["num_images_per_prompt"] = 2
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        prompt = 2 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = sd_pipe.encode_prompt(prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_stable_diffusion_xl_negative_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        prompt = 3 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.5388, 0.5452, 0.4694, 0.4583, 0.5253, 0.4832, 0.5288, 0.5035, 0.4766])
+
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+    @require_torch_gpu
+    def test_stable_diffusion_xl_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            pipe.unet.set_default_attn_processor()
+
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_stable_diffusion_xl_img2img_prompt_embeds_only(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            _,
+            pooled_prompt_embeds,
+            _,
+        ) = sd_pipe.encode_prompt(prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_stable_diffusion_two_xl_mixture_of_denoiser_fast(self):
+        components = self.get_dummy_components()
+        pipe_1 = StableDiffusionXLPipeline(**components).to(torch_device)
+        pipe_1.unet.set_default_attn_processor()
+        pipe_2 = StableDiffusionXLImg2ImgPipeline(**components).to(torch_device)
+        pipe_2.unet.set_default_attn_processor()
+
+        def assert_run_mixture(
+            num_steps,
+            split,
+            scheduler_cls_orig,
+            expected_tss,
+            num_train_timesteps=pipe_1.scheduler.config.num_train_timesteps,
+        ):
+            inputs = self.get_dummy_inputs(torch_device)
+            inputs["num_inference_steps"] = num_steps
+
+            class scheduler_cls(scheduler_cls_orig):
+                pass
+
+            pipe_1.scheduler = scheduler_cls.from_config(pipe_1.scheduler.config)
+            pipe_2.scheduler = scheduler_cls.from_config(pipe_2.scheduler.config)
+
+            # Let's retrieve the number of timesteps we want to use
+            pipe_1.scheduler.set_timesteps(num_steps)
+            expected_steps = pipe_1.scheduler.timesteps.tolist()
+
+            if pipe_1.scheduler.order == 2:
+                expected_steps_1 = list(filter(lambda ts: ts >= split, expected_tss))
+                expected_steps_2 = expected_steps_1[-1:] + list(filter(lambda ts: ts < split, expected_tss))
+                expected_steps = expected_steps_1 + expected_steps_2
+            else:
+                expected_steps_1 = list(filter(lambda ts: ts >= split, expected_tss))
+                expected_steps_2 = list(filter(lambda ts: ts < split, expected_tss))
+
+            # now we monkey patch step `done_steps`
+            # list into the step function for testing
+            done_steps = []
+            old_step = copy.copy(scheduler_cls.step)
+
+            def new_step(self, *args, **kwargs):
+                done_steps.append(args[1].cpu().item())  # args[1] is always the passed `t`
+                return old_step(self, *args, **kwargs)
+
+            scheduler_cls.step = new_step
+
+            inputs_1 = {
+                **inputs,
+                **{
+                    "denoising_end": 1.0 - (split / num_train_timesteps),
+                    "output_type": "latent",
+                },
+            }
+            latents = pipe_1(**inputs_1).images[0]
+
+            assert expected_steps_1 == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}"
+
+            inputs_2 = {
+                **inputs,
+                **{
+                    "denoising_start": 1.0 - (split / num_train_timesteps),
+                    "image": latents,
+                },
+            }
+            pipe_2(**inputs_2).images[0]
+
+            assert expected_steps_2 == done_steps[len(expected_steps_1) :]
+            assert expected_steps == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}"
+
+        steps = 10
+        for split in [300, 700]:
+            for scheduler_cls_timesteps in [
+                (EulerDiscreteScheduler, [901, 801, 701, 601, 501, 401, 301, 201, 101, 1]),
+                (
+                    HeunDiscreteScheduler,
+                    [
+                        901.0,
+                        801.0,
+                        801.0,
+                        701.0,
+                        701.0,
+                        601.0,
+                        601.0,
+                        501.0,
+                        501.0,
+                        401.0,
+                        401.0,
+                        301.0,
+                        301.0,
+                        201.0,
+                        201.0,
+                        101.0,
+                        101.0,
+                        1.0,
+                        1.0,
+                    ],
+                ),
+            ]:
+                assert_run_mixture(steps, split, scheduler_cls_timesteps[0], scheduler_cls_timesteps[1])
+
+    @slow
+    def test_stable_diffusion_two_xl_mixture_of_denoiser(self):
+        components = self.get_dummy_components()
+        pipe_1 = StableDiffusionXLPipeline(**components).to(torch_device)
+        pipe_1.unet.set_default_attn_processor()
+        pipe_2 = StableDiffusionXLImg2ImgPipeline(**components).to(torch_device)
+        pipe_2.unet.set_default_attn_processor()
+
+        def assert_run_mixture(
+            num_steps,
+            split,
+            scheduler_cls_orig,
+            expected_tss,
+            num_train_timesteps=pipe_1.scheduler.config.num_train_timesteps,
+        ):
+            inputs = self.get_dummy_inputs(torch_device)
+            inputs["num_inference_steps"] = num_steps
+
+            class scheduler_cls(scheduler_cls_orig):
+                pass
+
+            pipe_1.scheduler = scheduler_cls.from_config(pipe_1.scheduler.config)
+            pipe_2.scheduler = scheduler_cls.from_config(pipe_2.scheduler.config)
+
+            # Let's retrieve the number of timesteps we want to use
+            pipe_1.scheduler.set_timesteps(num_steps)
+            expected_steps = pipe_1.scheduler.timesteps.tolist()
+
+            if pipe_1.scheduler.order == 2:
+                expected_steps_1 = list(filter(lambda ts: ts >= split, expected_tss))
+                expected_steps_2 = expected_steps_1[-1:] + list(filter(lambda ts: ts < split, expected_tss))
+                expected_steps = expected_steps_1 + expected_steps_2
+            else:
+                expected_steps_1 = list(filter(lambda ts: ts >= split, expected_tss))
+                expected_steps_2 = list(filter(lambda ts: ts < split, expected_tss))
+
+            # now we monkey patch step `done_steps`
+            # list into the step function for testing
+            done_steps = []
+            old_step = copy.copy(scheduler_cls.step)
+
+            def new_step(self, *args, **kwargs):
+                done_steps.append(args[1].cpu().item())  # args[1] is always the passed `t`
+                return old_step(self, *args, **kwargs)
+
+            scheduler_cls.step = new_step
+
+            inputs_1 = {
+                **inputs,
+                **{
+                    "denoising_end": 1.0 - (split / num_train_timesteps),
+                    "output_type": "latent",
+                },
+            }
+            latents = pipe_1(**inputs_1).images[0]
+
+            assert expected_steps_1 == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}"
+
+            inputs_2 = {
+                **inputs,
+                **{
+                    "denoising_start": 1.0 - (split / num_train_timesteps),
+                    "image": latents,
+                },
+            }
+            pipe_2(**inputs_2).images[0]
+
+            assert expected_steps_2 == done_steps[len(expected_steps_1) :]
+            assert expected_steps == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}"
+
+        steps = 10
+        for split in [300, 500, 700]:
+            for scheduler_cls_timesteps in [
+                (DDIMScheduler, [901, 801, 701, 601, 501, 401, 301, 201, 101, 1]),
+                (EulerDiscreteScheduler, [901, 801, 701, 601, 501, 401, 301, 201, 101, 1]),
+                (DPMSolverMultistepScheduler, [901, 811, 721, 631, 541, 451, 361, 271, 181, 91]),
+                (UniPCMultistepScheduler, [901, 811, 721, 631, 541, 451, 361, 271, 181, 91]),
+                (
+                    HeunDiscreteScheduler,
+                    [
+                        901.0,
+                        801.0,
+                        801.0,
+                        701.0,
+                        701.0,
+                        601.0,
+                        601.0,
+                        501.0,
+                        501.0,
+                        401.0,
+                        401.0,
+                        301.0,
+                        301.0,
+                        201.0,
+                        201.0,
+                        101.0,
+                        101.0,
+                        1.0,
+                        1.0,
+                    ],
+                ),
+            ]:
+                assert_run_mixture(steps, split, scheduler_cls_timesteps[0], scheduler_cls_timesteps[1])
+
+        steps = 25
+        for split in [300, 500, 700]:
+            for scheduler_cls_timesteps in [
+                (
+                    DDIMScheduler,
+                    [
+                        961,
+                        921,
+                        881,
+                        841,
+                        801,
+                        761,
+                        721,
+                        681,
+                        641,
+                        601,
+                        561,
+                        521,
+                        481,
+                        441,
+                        401,
+                        361,
+                        321,
+                        281,
+                        241,
+                        201,
+                        161,
+                        121,
+                        81,
+                        41,
+                        1,
+                    ],
+                ),
+                (
+                    EulerDiscreteScheduler,
+                    [
+                        961.0,
+                        921.0,
+                        881.0,
+                        841.0,
+                        801.0,
+                        761.0,
+                        721.0,
+                        681.0,
+                        641.0,
+                        601.0,
+                        561.0,
+                        521.0,
+                        481.0,
+                        441.0,
+                        401.0,
+                        361.0,
+                        321.0,
+                        281.0,
+                        241.0,
+                        201.0,
+                        161.0,
+                        121.0,
+                        81.0,
+                        41.0,
+                        1.0,
+                    ],
+                ),
+                (
+                    DPMSolverMultistepScheduler,
+                    [
+                        951,
+                        913,
+                        875,
+                        837,
+                        799,
+                        761,
+                        723,
+                        685,
+                        647,
+                        609,
+                        571,
+                        533,
+                        495,
+                        457,
+                        419,
+                        381,
+                        343,
+                        305,
+                        267,
+                        229,
+                        191,
+                        153,
+                        115,
+                        77,
+                        39,
+                    ],
+                ),
+                (
+                    UniPCMultistepScheduler,
+                    [
+                        951,
+                        913,
+                        875,
+                        837,
+                        799,
+                        761,
+                        723,
+                        685,
+                        647,
+                        609,
+                        571,
+                        533,
+                        495,
+                        457,
+                        419,
+                        381,
+                        343,
+                        305,
+                        267,
+                        229,
+                        191,
+                        153,
+                        115,
+                        77,
+                        39,
+                    ],
+                ),
+                (
+                    HeunDiscreteScheduler,
+                    [
+                        961.0,
+                        921.0,
+                        921.0,
+                        881.0,
+                        881.0,
+                        841.0,
+                        841.0,
+                        801.0,
+                        801.0,
+                        761.0,
+                        761.0,
+                        721.0,
+                        721.0,
+                        681.0,
+                        681.0,
+                        641.0,
+                        641.0,
+                        601.0,
+                        601.0,
+                        561.0,
+                        561.0,
+                        521.0,
+                        521.0,
+                        481.0,
+                        481.0,
+                        441.0,
+                        441.0,
+                        401.0,
+                        401.0,
+                        361.0,
+                        361.0,
+                        321.0,
+                        321.0,
+                        281.0,
+                        281.0,
+                        241.0,
+                        241.0,
+                        201.0,
+                        201.0,
+                        161.0,
+                        161.0,
+                        121.0,
+                        121.0,
+                        81.0,
+                        81.0,
+                        41.0,
+                        41.0,
+                        1.0,
+                        1.0,
+                    ],
+                ),
+            ]:
+                assert_run_mixture(steps, split, scheduler_cls_timesteps[0], scheduler_cls_timesteps[1])
+
+    @slow
+    def test_stable_diffusion_three_xl_mixture_of_denoiser(self):
+        components = self.get_dummy_components()
+        pipe_1 = StableDiffusionXLPipeline(**components).to(torch_device)
+        pipe_1.unet.set_default_attn_processor()
+        pipe_2 = StableDiffusionXLImg2ImgPipeline(**components).to(torch_device)
+        pipe_2.unet.set_default_attn_processor()
+        pipe_3 = StableDiffusionXLImg2ImgPipeline(**components).to(torch_device)
+        pipe_3.unet.set_default_attn_processor()
+
+        def assert_run_mixture(
+            num_steps,
+            split_1,
+            split_2,
+            scheduler_cls_orig,
+            num_train_timesteps=pipe_1.scheduler.config.num_train_timesteps,
+        ):
+            inputs = self.get_dummy_inputs(torch_device)
+            inputs["num_inference_steps"] = num_steps
+
+            class scheduler_cls(scheduler_cls_orig):
+                pass
+
+            pipe_1.scheduler = scheduler_cls.from_config(pipe_1.scheduler.config)
+            pipe_2.scheduler = scheduler_cls.from_config(pipe_2.scheduler.config)
+            pipe_3.scheduler = scheduler_cls.from_config(pipe_3.scheduler.config)
+
+            # Let's retrieve the number of timesteps we want to use
+            pipe_1.scheduler.set_timesteps(num_steps)
+            expected_steps = pipe_1.scheduler.timesteps.tolist()
+
+            split_1_ts = num_train_timesteps - int(round(num_train_timesteps * split_1))
+            split_2_ts = num_train_timesteps - int(round(num_train_timesteps * split_2))
+
+            if pipe_1.scheduler.order == 2:
+                expected_steps_1 = list(filter(lambda ts: ts >= split_1_ts, expected_steps))
+                expected_steps_2 = expected_steps_1[-1:] + list(
+                    filter(lambda ts: ts >= split_2_ts and ts < split_1_ts, expected_steps)
+                )
+                expected_steps_3 = expected_steps_2[-1:] + list(filter(lambda ts: ts < split_2_ts, expected_steps))
+                expected_steps = expected_steps_1 + expected_steps_2 + expected_steps_3
+            else:
+                expected_steps_1 = list(filter(lambda ts: ts >= split_1_ts, expected_steps))
+                expected_steps_2 = list(filter(lambda ts: ts >= split_2_ts and ts < split_1_ts, expected_steps))
+                expected_steps_3 = list(filter(lambda ts: ts < split_2_ts, expected_steps))
+
+            # now we monkey patch step `done_steps`
+            # list into the step function for testing
+            done_steps = []
+            old_step = copy.copy(scheduler_cls.step)
+
+            def new_step(self, *args, **kwargs):
+                done_steps.append(args[1].cpu().item())  # args[1] is always the passed `t`
+                return old_step(self, *args, **kwargs)
+
+            scheduler_cls.step = new_step
+
+            inputs_1 = {**inputs, **{"denoising_end": split_1, "output_type": "latent"}}
+            latents = pipe_1(**inputs_1).images[0]
+
+            assert (
+                expected_steps_1 == done_steps
+            ), f"Failure with {scheduler_cls.__name__} and {num_steps} and {split_1} and {split_2}"
+
+            with self.assertRaises(ValueError) as cm:
+                inputs_2 = {
+                    **inputs,
+                    **{
+                        "denoising_start": split_2,
+                        "denoising_end": split_1,
+                        "image": latents,
+                        "output_type": "latent",
+                    },
+                }
+                pipe_2(**inputs_2).images[0]
+            assert "cannot be larger than or equal to `denoising_end`" in str(cm.exception)
+
+            inputs_2 = {
+                **inputs,
+                **{"denoising_start": split_1, "denoising_end": split_2, "image": latents, "output_type": "latent"},
+            }
+            pipe_2(**inputs_2).images[0]
+
+            assert expected_steps_2 == done_steps[len(expected_steps_1) :]
+
+            inputs_3 = {**inputs, **{"denoising_start": split_2, "image": latents}}
+            pipe_3(**inputs_3).images[0]
+
+            assert expected_steps_3 == done_steps[len(expected_steps_1) + len(expected_steps_2) :]
+            assert (
+                expected_steps == done_steps
+            ), f"Failure with {scheduler_cls.__name__} and {num_steps} and {split_1} and {split_2}"
+
+        for steps in [7, 11, 20]:
+            for split_1, split_2 in zip([0.19, 0.32], [0.81, 0.68]):
+                for scheduler_cls in [
+                    DDIMScheduler,
+                    EulerDiscreteScheduler,
+                    DPMSolverMultistepScheduler,
+                    UniPCMultistepScheduler,
+                    HeunDiscreteScheduler,
+                ]:
+                    assert_run_mixture(steps, split_1, split_2, scheduler_cls)
+
+    def test_stable_diffusion_xl_multi_prompts(self):
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+
+        # forward with single prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = inputs["prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["prompt_2"] = "different prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+        # manually set a negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same negative_prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = inputs["negative_prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = "different negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+    def test_stable_diffusion_xl_negative_conditions(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice_with_no_neg_cond = image[0, -3:, -3:, -1]
+
+        image = sd_pipe(
+            **inputs,
+            negative_original_size=(512, 512),
+            negative_crops_coords_top_left=(0, 0),
+            negative_target_size=(1024, 1024),
+        ).images
+        image_slice_with_neg_cond = image[0, -3:, -3:, -1]
+
+        self.assertTrue(np.abs(image_slice_with_no_neg_cond - image_slice_with_neg_cond).max() > 1e-2)
+
+    def test_stable_diffusion_xl_save_from_pretrained(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd_pipe.save_pretrained(tmpdirname)
+            sd_pipe = StableDiffusionXLPipeline.from_pretrained(tmpdirname).to(torch_device)
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            pipe.unet.set_default_attn_processor()
+
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+
+    def test_pipeline_interrupt(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "hey"
+        num_inference_steps = 3
+
+        # store intermediate latents from the generation process
+        class PipelineState:
+            def __init__(self):
+                self.state = []
+
+            def apply(self, pipe, i, t, callback_kwargs):
+                self.state.append(callback_kwargs["latents"])
+                return callback_kwargs
+
+        pipe_state = PipelineState()
+        sd_pipe(
+            prompt,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=pipe_state.apply,
+        ).images
+
+        # interrupt generation at step index
+        interrupt_step_idx = 1
+
+        def callback_on_step_end(pipe, i, t, callback_kwargs):
+            if i == interrupt_step_idx:
+                pipe._interrupt = True
+
+            return callback_kwargs
+
+        output_interrupted = sd_pipe(
+            prompt,
+            num_inference_steps=num_inference_steps,
+            output_type="latent",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=callback_on_step_end,
+        ).images
+
+        # fetch intermediate latents at the interrupted step
+        # from the completed generation process
+        intermediate_latent = pipe_state.state[interrupt_step_idx]
+
+        # compare the intermediate latent to the output of the interrupted process
+        # they should be the same
+        assert torch.allclose(intermediate_latent, output_interrupted, atol=1e-4)
+
+
+@slow
+class StableDiffusionXLPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_lcm(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel.from_pretrained(
+            "latent-consistency/lcm-ssd-1b", torch_dtype=torch.float16, variant="fp16"
+        )
+        sd_pipe = StableDiffusionXLPipeline.from_pretrained(
+            "segmind/SSD-1B", unet=unet, torch_dtype=torch.float16, variant="fp16"
+        ).to(torch_device)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a red car standing on the side of the street"
+
+        image = sd_pipe(
+            prompt, num_inference_steps=4, guidance_scale=8.0, generator=torch.Generator("cpu").manual_seed(0)
+        ).images[0]
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_full/stable_diffusion_ssd_1b_lcm.png"
+        )
+
+        image = sd_pipe.image_processor.pil_to_numpy(image)
+        expected_image = sd_pipe.image_processor.pil_to_numpy(expected_image)
+
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+
+        assert max_diff < 1e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..2091af9c0383052f229ab46920e005978d0e1efc
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py
@@ -0,0 +1,673 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+from parameterized import parameterized
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    EulerDiscreteScheduler,
+    LCMScheduler,
+    MultiAdapter,
+    StableDiffusionXLAdapterPipeline,
+    T2IAdapter,
+    UNet2DConditionModel,
+)
+from diffusers.utils import logging
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    assert_mean_pixel_difference,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLAdapterPipelineFastTests(
+    IPAdapterTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionXLAdapterPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+
+    def get_dummy_components(self, adapter_type="full_adapter_xl", time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+            time_cond_proj_dim=time_cond_proj_dim,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        if adapter_type == "full_adapter_xl":
+            adapter = T2IAdapter(
+                in_channels=3,
+                channels=[32, 64],
+                num_res_blocks=2,
+                downscale_factor=4,
+                adapter_type=adapter_type,
+            )
+        elif adapter_type == "multi_adapter":
+            adapter = MultiAdapter(
+                [
+                    T2IAdapter(
+                        in_channels=3,
+                        channels=[32, 64],
+                        num_res_blocks=2,
+                        downscale_factor=4,
+                        adapter_type="full_adapter_xl",
+                    ),
+                    T2IAdapter(
+                        in_channels=3,
+                        channels=[32, 64],
+                        num_res_blocks=2,
+                        downscale_factor=4,
+                        adapter_type="full_adapter_xl",
+                    ),
+                ]
+            )
+        else:
+            raise ValueError(
+                f"Unknown adapter type: {adapter_type}, must be one of 'full_adapter_xl', or 'multi_adapter''"
+            )
+
+        components = {
+            "adapter": adapter,
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            # "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_components_with_full_downscaling(self, adapter_type="full_adapter_xl"):
+        """Get dummy components with x8 VAE downscaling and 3 UNet down blocks.
+        These dummy components are intended to fully-exercise the T2I-Adapter
+        downscaling behavior.
+        """
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=2,
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=1,
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 32, 32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        if adapter_type == "full_adapter_xl":
+            adapter = T2IAdapter(
+                in_channels=3,
+                channels=[32, 32, 64],
+                num_res_blocks=2,
+                downscale_factor=16,
+                adapter_type=adapter_type,
+            )
+        elif adapter_type == "multi_adapter":
+            adapter = MultiAdapter(
+                [
+                    T2IAdapter(
+                        in_channels=3,
+                        channels=[32, 32, 64],
+                        num_res_blocks=2,
+                        downscale_factor=16,
+                        adapter_type="full_adapter_xl",
+                    ),
+                    T2IAdapter(
+                        in_channels=3,
+                        channels=[32, 32, 64],
+                        num_res_blocks=2,
+                        downscale_factor=16,
+                        adapter_type="full_adapter_xl",
+                    ),
+                ]
+            )
+        else:
+            raise ValueError(
+                f"Unknown adapter type: {adapter_type}, must be one of 'full_adapter_xl', or 'multi_adapter''"
+            )
+
+        components = {
+            "adapter": adapter,
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            # "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0, height=64, width=64, num_images=1):
+        if num_images == 1:
+            image = floats_tensor((1, 3, height, width), rng=random.Random(seed)).to(device)
+        else:
+            image = [
+                floats_tensor((1, 3, height, width), rng=random.Random(seed)).to(device) for _ in range(num_images)
+            ]
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_ip_adapter(self, from_multi=False, expected_pipe_slice=None):
+        if not from_multi:
+            expected_pipe_slice = None
+            if torch_device == "cpu":
+                expected_pipe_slice = np.array(
+                    [0.5752, 0.6155, 0.4826, 0.5111, 0.5741, 0.4678, 0.5199, 0.5231, 0.4794]
+                )
+
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_stable_diffusion_adapter_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLAdapterPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([00.5752, 0.6155, 0.4826, 0.5111, 0.5741, 0.4678, 0.5199, 0.5231, 0.4794])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
+
+    @parameterized.expand(
+        [
+            # (dim=144) The internal feature map will be 9x9 after initial pixel unshuffling (downscaled x16).
+            (((4 * 2 + 1) * 16),),
+            # (dim=160) The internal feature map will be 5x5 after the first T2I down block (downscaled x32).
+            (((4 * 1 + 1) * 32),),
+        ]
+    )
+    def test_multiple_image_dimensions(self, dim):
+        """Test that the T2I-Adapter pipeline supports any input dimension that
+        is divisible by the adapter's `downscale_factor`. This test was added in
+        response to an issue where the T2I Adapter's downscaling padding
+        behavior did not match the UNet's behavior.
+
+        Note that we have selected `dim` values to produce odd resolutions at
+        each downscaling level.
+        """
+        components = self.get_dummy_components_with_full_downscaling()
+        sd_pipe = StableDiffusionXLAdapterPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device, height=dim, width=dim)
+        image = sd_pipe(**inputs).images
+
+        assert image.shape == (1, dim, dim, 3)
+
+    @parameterized.expand(["full_adapter", "full_adapter_xl", "light_adapter"])
+    def test_total_downscale_factor(self, adapter_type):
+        """Test that the T2IAdapter correctly reports its total_downscale_factor."""
+        batch_size = 1
+        in_channels = 3
+        out_channels = [320, 640, 1280, 1280]
+        in_image_size = 512
+
+        adapter = T2IAdapter(
+            in_channels=in_channels,
+            channels=out_channels,
+            num_res_blocks=2,
+            downscale_factor=8,
+            adapter_type=adapter_type,
+        )
+        adapter.to(torch_device)
+
+        in_image = floats_tensor((batch_size, in_channels, in_image_size, in_image_size)).to(torch_device)
+
+        adapter_state = adapter(in_image)
+
+        # Assume that the last element in `adapter_state` has been downsampled the most, and check
+        # that it matches the `total_downscale_factor`.
+        expected_out_image_size = in_image_size // adapter.total_downscale_factor
+        assert adapter_state[-1].shape == (
+            batch_size,
+            out_channels[-1],
+            expected_out_image_size,
+            expected_out_image_size,
+        )
+
+    def test_save_load_optional_components(self):
+        return self._test_save_load_optional_components()
+
+    def test_adapter_sdxl_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLAdapterPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5425, 0.5385, 0.4964, 0.5045, 0.6149, 0.4974, 0.5469, 0.5332, 0.5426])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_adapter_sdxl_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLAdapterPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5425, 0.5385, 0.4964, 0.5045, 0.6149, 0.4974, 0.5469, 0.5332, 0.5426])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+
+class StableDiffusionXLMultiAdapterPipelineFastTests(
+    StableDiffusionXLAdapterPipelineFastTests, PipelineTesterMixin, unittest.TestCase
+):
+    def get_dummy_components(self, time_cond_proj_dim=None):
+        return super().get_dummy_components("multi_adapter", time_cond_proj_dim=time_cond_proj_dim)
+
+    def get_dummy_components_with_full_downscaling(self):
+        return super().get_dummy_components_with_full_downscaling("multi_adapter")
+
+    def get_dummy_inputs(self, device, seed=0, height=64, width=64):
+        inputs = super().get_dummy_inputs(device, seed, height, width, num_images=2)
+        inputs["adapter_conditioning_scale"] = [0.5, 0.5]
+        return inputs
+
+    def test_stable_diffusion_adapter_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLAdapterPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5617, 0.6081, 0.4807, 0.5071, 0.5665, 0.4614, 0.5165, 0.5164, 0.4786])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.5617, 0.6081, 0.4807, 0.5071, 0.5665, 0.4614, 0.5165, 0.5164, 0.4786])
+
+        return super().test_ip_adapter(from_multi=True, expected_pipe_slice=expected_pipe_slice)
+
+    def test_inference_batch_consistent(
+        self, batch_sizes=[2, 4, 13], additional_params_copy_to_batched_inputs=["num_inference_steps"]
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        for batch_size in batch_sizes:
+            batched_inputs = {}
+            for name, value in inputs.items():
+                if name in self.batch_params:
+                    # prompt is string
+                    if name == "prompt":
+                        len_prompt = len(value)
+                        # make unequal batch sizes
+                        batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+
+                        # make last batch super long
+                        batched_inputs[name][-1] = 100 * "very long"
+                    elif name == "image":
+                        batched_images = []
+
+                        for image in value:
+                            batched_images.append(batch_size * [image])
+
+                        batched_inputs[name] = batched_images
+                    else:
+                        batched_inputs[name] = batch_size * [value]
+
+                elif name == "batch_size":
+                    batched_inputs[name] = batch_size
+                else:
+                    batched_inputs[name] = value
+
+            for arg in additional_params_copy_to_batched_inputs:
+                batched_inputs[arg] = inputs[arg]
+
+            batched_inputs["output_type"] = "np"
+
+            output = pipe(**batched_inputs)
+
+            assert len(output[0]) == batch_size
+
+            batched_inputs["output_type"] = "np"
+
+            output = pipe(**batched_inputs)[0]
+
+            assert output.shape[0] == batch_size
+
+        logger.setLevel(level=diffusers.logging.WARNING)
+
+    def test_num_images_per_prompt(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        batch_sizes = [1, 2]
+        num_images_per_prompts = [1, 2]
+
+        for batch_size in batch_sizes:
+            for num_images_per_prompt in num_images_per_prompts:
+                inputs = self.get_dummy_inputs(torch_device)
+
+                for key in inputs.keys():
+                    if key in self.batch_params:
+                        if key == "image":
+                            batched_images = []
+
+                            for image in inputs[key]:
+                                batched_images.append(batch_size * [image])
+
+                            inputs[key] = batched_images
+                        else:
+                            inputs[key] = batch_size * [inputs[key]]
+
+                images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0]
+
+                assert images.shape[0] == batch_size * num_images_per_prompt
+
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=3,
+        test_max_difference=None,
+        test_mean_pixel_difference=None,
+        relax_max_difference=False,
+        expected_max_diff=2e-3,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        if test_max_difference is None:
+            # TODO(Pedro) - not sure why, but not at all reproducible at the moment it seems
+            # make sure that batched and non-batched is identical
+            test_max_difference = torch_device != "mps"
+
+        if test_mean_pixel_difference is None:
+            # TODO same as above
+            test_mean_pixel_difference = torch_device != "mps"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        for name, value in inputs.items():
+            if name in self.batch_params:
+                # prompt is string
+                if name == "prompt":
+                    len_prompt = len(value)
+                    # make unequal batch sizes
+                    batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+
+                    # make last batch super long
+                    batched_inputs[name][-1] = 100 * "very long"
+                elif name == "image":
+                    batched_images = []
+
+                    for image in value:
+                        batched_images.append(batch_size * [image])
+
+                    batched_inputs[name] = batched_images
+                else:
+                    batched_inputs[name] = batch_size * [value]
+            elif name == "batch_size":
+                batched_inputs[name] = batch_size
+            elif name == "generator":
+                batched_inputs[name] = [self.get_generator(i) for i in range(batch_size)]
+            else:
+                batched_inputs[name] = value
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output_batch = pipe(**batched_inputs)
+        assert output_batch[0].shape[0] == batch_size
+
+        inputs["generator"] = self.get_generator(0)
+
+        output = pipe(**inputs)
+
+        logger.setLevel(level=diffusers.logging.WARNING)
+        if test_max_difference:
+            if relax_max_difference:
+                # Taking the median of the largest <n> differences
+                # is resilient to outliers
+                diff = np.abs(output_batch[0][0] - output[0][0])
+                diff = diff.flatten()
+                diff.sort()
+                max_diff = np.median(diff[-5:])
+            else:
+                max_diff = np.abs(output_batch[0][0] - output[0][0]).max()
+            assert max_diff < expected_max_diff
+
+        if test_mean_pixel_difference:
+            assert_mean_pixel_difference(output_batch[0][0], output[0][0])
+
+    def test_adapter_sdxl_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLAdapterPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5313, 0.5375, 0.4942, 0.5021, 0.6142, 0.4968, 0.5434, 0.5311, 0.5448])
+
+        debug = [str(round(i, 4)) for i in image_slice.flatten().tolist()]
+        print(",".join(debug))
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_adapter_sdxl_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLAdapterPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        output = sd_pipe(**inputs)
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5313, 0.5375, 0.4942, 0.5021, 0.6142, 0.4968, 0.5434, 0.5311, 0.5448])
+
+        debug = [str(round(i, 4)) for i in image_slice.flatten().tolist()]
+        print(",".join(debug))
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..db0905a483107782e93dc408d1749510b3ced441
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
@@ -0,0 +1,834 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    AutoencoderTiny,
+    EDMDPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+    LCMScheduler,
+    StableDiffusionXLImg2ImgPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import (
+    IPAdapterTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLImg2ImgPipelineFastTests(
+    IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionXLImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
+        {"add_text_embeds", "add_time_ids", "add_neg_time_ids"}
+    )
+
+    def get_dummy_components(self, skip_first_text_encoder=False, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            time_cond_proj_dim=time_cond_proj_dim,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=72,  # 5 * 8 + 32
+            cross_attention_dim=64 if not skip_first_text_encoder else 32,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=32,
+            image_size=224,
+            projection_dim=32,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+
+        feature_extractor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder if not skip_first_text_encoder else None,
+            "tokenizer": tokenizer if not skip_first_text_encoder else None,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "requires_aesthetics_score": True,
+            "image_encoder": image_encoder,
+            "feature_extractor": feature_extractor,
+        }
+        return components
+
+    def get_dummy_tiny_autoencoder(self):
+        return AutoencoderTiny(in_channels=3, out_channels=3, latent_channels=4)
+
+    def test_components_function(self):
+        init_components = self.get_dummy_components()
+        init_components.pop("requires_aesthetics_score")
+        pipe = self.pipeline_class(**init_components)
+
+        self.assertTrue(hasattr(pipe, "components"))
+        self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image / 2 + 0.5
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "strength": 0.8,
+        }
+        return inputs
+
+    def test_stable_diffusion_xl_img2img_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+
+        expected_slice = np.array([0.4664, 0.4886, 0.4403, 0.6902, 0.5592, 0.4534, 0.5931, 0.5951, 0.5224])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_xl_img2img_euler_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+
+        expected_slice = np.array([0.5604, 0.4352, 0.4717, 0.5844, 0.5101, 0.6704, 0.6290, 0.5460, 0.5286])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_xl_img2img_euler_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+
+        expected_slice = np.array([0.5604, 0.4352, 0.4717, 0.5844, 0.5101, 0.6704, 0.6290, 0.5460, 0.5286])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    # TODO(Patrick, Sayak) - skip for now as this requires more refiner tests
+    def test_save_load_optional_components(self):
+        pass
+
+    def test_stable_diffusion_xl_img2img_negative_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        prompt = 3 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.5133, 0.4626, 0.4970, 0.6273, 0.5160, 0.6891, 0.6639, 0.5892, 0.5709])
+
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_stable_diffusion_xl_img2img_tiny_autoencoder(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe.vae = self.get_dummy_tiny_autoencoder()
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.0, 0.0, 0.0106, 0.0, 0.0, 0.0087, 0.0052, 0.0062, 0.0177])
+
+        assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4)
+
+    @require_torch_gpu
+    def test_stable_diffusion_xl_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            pipe.unet.set_default_attn_processor()
+
+            generator_device = "cpu"
+            inputs = self.get_dummy_inputs(generator_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_stable_diffusion_xl_multi_prompts(self):
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+
+        # forward with single prompt
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["num_inference_steps"] = 5
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same prompt duplicated
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["num_inference_steps"] = 5
+        inputs["prompt_2"] = inputs["prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different prompt
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["num_inference_steps"] = 5
+        inputs["prompt_2"] = "different prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+        # manually set a negative_prompt
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["num_inference_steps"] = 5
+        inputs["negative_prompt"] = "negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same negative_prompt duplicated
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["num_inference_steps"] = 5
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = inputs["negative_prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different negative_prompt
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["num_inference_steps"] = 5
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = "different negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+    def test_stable_diffusion_xl_img2img_negative_conditions(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice_with_no_neg_conditions = image[0, -3:, -3:, -1]
+
+        image = sd_pipe(
+            **inputs,
+            negative_original_size=(512, 512),
+            negative_crops_coords_top_left=(
+                0,
+                0,
+            ),
+            negative_target_size=(1024, 1024),
+        ).images
+        image_slice_with_neg_conditions = image[0, -3:, -3:, -1]
+
+        assert (
+            np.abs(image_slice_with_no_neg_conditions.flatten() - image_slice_with_neg_conditions.flatten()).max()
+            > 1e-4
+        )
+
+    def test_pipeline_interrupt(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = "hey"
+        num_inference_steps = 5
+
+        # store intermediate latents from the generation process
+        class PipelineState:
+            def __init__(self):
+                self.state = []
+
+            def apply(self, pipe, i, t, callback_kwargs):
+                self.state.append(callback_kwargs["latents"])
+                return callback_kwargs
+
+        pipe_state = PipelineState()
+        sd_pipe(
+            prompt,
+            image=inputs["image"],
+            strength=0.8,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=pipe_state.apply,
+        ).images
+
+        # interrupt generation at step index
+        interrupt_step_idx = 1
+
+        def callback_on_step_end(pipe, i, t, callback_kwargs):
+            if i == interrupt_step_idx:
+                pipe._interrupt = True
+
+            return callback_kwargs
+
+        output_interrupted = sd_pipe(
+            prompt,
+            image=inputs["image"],
+            strength=0.8,
+            num_inference_steps=num_inference_steps,
+            output_type="latent",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=callback_on_step_end,
+        ).images
+
+        # fetch intermediate latents at the interrupted step
+        # from the completed generation process
+        intermediate_latent = pipe_state.state[interrupt_step_idx]
+
+        # compare the intermediate latent to the output of the interrupted process
+        # they should be the same
+        assert torch.allclose(intermediate_latent, output_interrupted, atol=1e-4)
+
+
+class StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests(
+    PipelineLatentTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionXLImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=72,  # 5 * 8 + 32
+            cross_attention_dim=32,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "tokenizer": None,
+            "text_encoder": None,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "requires_aesthetics_score": True,
+            "image_encoder": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def test_components_function(self):
+        init_components = self.get_dummy_components()
+        init_components.pop("requires_aesthetics_score")
+        pipe = self.pipeline_class(**init_components)
+
+        self.assertTrue(hasattr(pipe, "components"))
+        self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image / 2 + 0.5
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "strength": 0.8,
+        }
+        return inputs
+
+    def test_stable_diffusion_xl_img2img_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+
+        expected_slice = np.array([0.4745, 0.4924, 0.4338, 0.6468, 0.5547, 0.4419, 0.5646, 0.5897, 0.5146])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    @require_torch_gpu
+    def test_stable_diffusion_xl_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            pipe.unet.set_default_attn_processor()
+
+            generator_device = "cpu"
+            inputs = self.get_dummy_inputs(generator_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_stable_diffusion_xl_img2img_negative_conditions(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice_with_no_neg_conditions = image[0, -3:, -3:, -1]
+
+        image = sd_pipe(
+            **inputs,
+            negative_original_size=(512, 512),
+            negative_crops_coords_top_left=(
+                0,
+                0,
+            ),
+            negative_target_size=(1024, 1024),
+        ).images
+        image_slice_with_neg_conditions = image[0, -3:, -3:, -1]
+
+        assert (
+            np.abs(image_slice_with_no_neg_conditions.flatten() - image_slice_with_neg_conditions.flatten()).max()
+            > 1e-4
+        )
+
+    def test_stable_diffusion_xl_img2img_negative_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        prompt = 3 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_stable_diffusion_xl_img2img_prompt_embeds_only(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        prompt = 3 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            _,
+            pooled_prompt_embeds,
+            _,
+        ) = sd_pipe.encode_prompt(prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
+
+
+@slow
+class StableDiffusionXLImg2ImgPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_xl_img2img_playground(self):
+        torch.manual_seed(0)
+        model_path = "playgroundai/playground-v2.5-1024px-aesthetic"
+
+        sd_pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+            model_path, torch_dtype=torch.float16, variant="fp16", add_watermarker=False
+        )
+
+        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.scheduler = EDMDPMSolverMultistepScheduler.from_config(
+            sd_pipe.scheduler.config, use_karras_sigmas=True
+        )
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photo of an astronaut riding a horse on mars"
+
+        url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
+
+        init_image = load_image(url).convert("RGB")
+
+        image = sd_pipe(
+            prompt,
+            num_inference_steps=30,
+            guidance_scale=8.0,
+            image=init_image,
+            height=1024,
+            width=1024,
+            output_type="np",
+        ).images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 1024, 1024, 3)
+
+        expected_slice = np.array([0.3519, 0.3149, 0.3364, 0.3505, 0.3402, 0.3371, 0.3554, 0.3495, 0.3333])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..964c7123dd321427711bc506d6075d6306daf6c5
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
@@ -0,0 +1,817 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LCMScheduler,
+    StableDiffusionXLInpaintPipeline,
+    UNet2DConditionModel,
+    UniPCMultistepScheduler,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, slow, torch_device
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLInpaintPipelineFastTests(
+    IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionXLInpaintPipeline
+    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset([])
+    # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+    image_latents_params = frozenset([])
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
+        {
+            "add_text_embeds",
+            "add_time_ids",
+            "mask",
+            "masked_image_latents",
+        }
+    )
+
+    def get_dummy_components(self, skip_first_text_encoder=False, time_cond_proj_dim=None):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            time_cond_proj_dim=time_cond_proj_dim,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=72,  # 5 * 8 + 32
+            cross_attention_dim=64 if not skip_first_text_encoder else 32,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=32,
+            image_size=224,
+            projection_dim=32,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+
+        feature_extractor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder if not skip_first_text_encoder else None,
+            "tokenizer": tokenizer if not skip_first_text_encoder else None,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "image_encoder": image_encoder,
+            "feature_extractor": feature_extractor,
+            "requires_aesthetics_score": True,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        # create mask
+        image[8:, 8:, :] = 255
+        mask_image = Image.fromarray(np.uint8(image)).convert("L").resize((64, 64))
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": init_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "strength": 1.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def get_dummy_inputs_2images(self, device, seed=0, img_res=64):
+        # Get random floats in [0, 1] as image with spatial size (img_res, img_res)
+        image1 = floats_tensor((1, 3, img_res, img_res), rng=random.Random(seed)).to(device)
+        image2 = floats_tensor((1, 3, img_res, img_res), rng=random.Random(seed + 22)).to(device)
+        # Convert images to [-1, 1]
+        init_image1 = 2.0 * image1 - 1.0
+        init_image2 = 2.0 * image2 - 1.0
+
+        # empty mask
+        mask_image = torch.zeros((1, 1, img_res, img_res), device=device)
+
+        if str(device).startswith("mps"):
+            generator1 = torch.manual_seed(seed)
+            generator2 = torch.manual_seed(seed)
+        else:
+            generator1 = torch.Generator(device=device).manual_seed(seed)
+            generator2 = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": ["A painting of a squirrel eating a burger"] * 2,
+            "image": [init_image1, init_image2],
+            "mask_image": [mask_image] * 2,
+            "generator": [generator1, generator2],
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_ip_adapter(self):
+        expected_pipe_slice = None
+        if torch_device == "cpu":
+            expected_pipe_slice = np.array([0.8274, 0.5538, 0.6141, 0.5843, 0.6865, 0.7082, 0.5861, 0.6123, 0.5344])
+
+        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
+
+    def test_components_function(self):
+        init_components = self.get_dummy_components()
+        init_components.pop("requires_aesthetics_score")
+        pipe = self.pipeline_class(**init_components)
+
+        self.assertTrue(hasattr(pipe, "components"))
+        self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
+
+    def test_stable_diffusion_xl_inpaint_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLInpaintPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.8279, 0.5673, 0.6088, 0.6156, 0.6923, 0.7347, 0.6547, 0.6108, 0.5198])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_xl_inpaint_euler_lcm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLInpaintPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.6611, 0.5569, 0.5531, 0.5471, 0.5918, 0.6393, 0.5074, 0.5468, 0.5185])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_xl_inpaint_euler_lcm_custom_timesteps(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(time_cond_proj_dim=256)
+        sd_pipe = StableDiffusionXLInpaintPipeline(**components)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        del inputs["num_inference_steps"]
+        inputs["timesteps"] = [999, 499]
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.6611, 0.5569, 0.5531, 0.5471, 0.5918, 0.6393, 0.5074, 0.5468, 0.5185])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    # TODO(Patrick, Sayak) - skip for now as this requires more refiner tests
+    def test_save_load_optional_components(self):
+        pass
+
+    def test_stable_diffusion_xl_inpaint_negative_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLInpaintPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        prompt = 3 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    @require_torch_gpu
+    def test_stable_diffusion_xl_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLInpaintPipeline(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLInpaintPipeline(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLInpaintPipeline(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            pipe.unet.set_default_attn_processor()
+
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_stable_diffusion_xl_refiner(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components(skip_first_text_encoder=True)
+
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.7540, 0.5231, 0.5833, 0.6217, 0.6339, 0.7067, 0.6507, 0.5672, 0.5030])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_two_xl_mixture_of_denoiser_fast(self):
+        components = self.get_dummy_components()
+        pipe_1 = StableDiffusionXLInpaintPipeline(**components).to(torch_device)
+        pipe_1.unet.set_default_attn_processor()
+        pipe_2 = StableDiffusionXLInpaintPipeline(**components).to(torch_device)
+        pipe_2.unet.set_default_attn_processor()
+
+        def assert_run_mixture(
+            num_steps, split, scheduler_cls_orig, num_train_timesteps=pipe_1.scheduler.config.num_train_timesteps
+        ):
+            inputs = self.get_dummy_inputs(torch_device)
+            inputs["num_inference_steps"] = num_steps
+
+            class scheduler_cls(scheduler_cls_orig):
+                pass
+
+            pipe_1.scheduler = scheduler_cls.from_config(pipe_1.scheduler.config)
+            pipe_2.scheduler = scheduler_cls.from_config(pipe_2.scheduler.config)
+
+            # Let's retrieve the number of timesteps we want to use
+            pipe_1.scheduler.set_timesteps(num_steps)
+            expected_steps = pipe_1.scheduler.timesteps.tolist()
+
+            split_ts = num_train_timesteps - int(round(num_train_timesteps * split))
+
+            if pipe_1.scheduler.order == 2:
+                expected_steps_1 = list(filter(lambda ts: ts >= split_ts, expected_steps))
+                expected_steps_2 = expected_steps_1[-1:] + list(filter(lambda ts: ts < split_ts, expected_steps))
+                expected_steps = expected_steps_1 + expected_steps_2
+            else:
+                expected_steps_1 = list(filter(lambda ts: ts >= split_ts, expected_steps))
+                expected_steps_2 = list(filter(lambda ts: ts < split_ts, expected_steps))
+
+            # now we monkey patch step `done_steps`
+            # list into the step function for testing
+            done_steps = []
+            old_step = copy.copy(scheduler_cls.step)
+
+            def new_step(self, *args, **kwargs):
+                done_steps.append(args[1].cpu().item())  # args[1] is always the passed `t`
+                return old_step(self, *args, **kwargs)
+
+            scheduler_cls.step = new_step
+
+            inputs_1 = {**inputs, **{"denoising_end": split, "output_type": "latent"}}
+            latents = pipe_1(**inputs_1).images[0]
+
+            assert expected_steps_1 == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}"
+
+            inputs_2 = {**inputs, **{"denoising_start": split, "image": latents}}
+            pipe_2(**inputs_2).images[0]
+
+            assert expected_steps_2 == done_steps[len(expected_steps_1) :]
+            assert expected_steps == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}"
+
+        for steps in [7, 20]:
+            assert_run_mixture(steps, 0.33, EulerDiscreteScheduler)
+            assert_run_mixture(steps, 0.33, HeunDiscreteScheduler)
+
+    @slow
+    def test_stable_diffusion_two_xl_mixture_of_denoiser(self):
+        components = self.get_dummy_components()
+        pipe_1 = StableDiffusionXLInpaintPipeline(**components).to(torch_device)
+        pipe_1.unet.set_default_attn_processor()
+        pipe_2 = StableDiffusionXLInpaintPipeline(**components).to(torch_device)
+        pipe_2.unet.set_default_attn_processor()
+
+        def assert_run_mixture(
+            num_steps, split, scheduler_cls_orig, num_train_timesteps=pipe_1.scheduler.config.num_train_timesteps
+        ):
+            inputs = self.get_dummy_inputs(torch_device)
+            inputs["num_inference_steps"] = num_steps
+
+            class scheduler_cls(scheduler_cls_orig):
+                pass
+
+            pipe_1.scheduler = scheduler_cls.from_config(pipe_1.scheduler.config)
+            pipe_2.scheduler = scheduler_cls.from_config(pipe_2.scheduler.config)
+
+            # Let's retrieve the number of timesteps we want to use
+            pipe_1.scheduler.set_timesteps(num_steps)
+            expected_steps = pipe_1.scheduler.timesteps.tolist()
+
+            split_ts = num_train_timesteps - int(round(num_train_timesteps * split))
+
+            if pipe_1.scheduler.order == 2:
+                expected_steps_1 = list(filter(lambda ts: ts >= split_ts, expected_steps))
+                expected_steps_2 = expected_steps_1[-1:] + list(filter(lambda ts: ts < split_ts, expected_steps))
+                expected_steps = expected_steps_1 + expected_steps_2
+            else:
+                expected_steps_1 = list(filter(lambda ts: ts >= split_ts, expected_steps))
+                expected_steps_2 = list(filter(lambda ts: ts < split_ts, expected_steps))
+
+            # now we monkey patch step `done_steps`
+            # list into the step function for testing
+            done_steps = []
+            old_step = copy.copy(scheduler_cls.step)
+
+            def new_step(self, *args, **kwargs):
+                done_steps.append(args[1].cpu().item())  # args[1] is always the passed `t`
+                return old_step(self, *args, **kwargs)
+
+            scheduler_cls.step = new_step
+
+            inputs_1 = {**inputs, **{"denoising_end": split, "output_type": "latent"}}
+            latents = pipe_1(**inputs_1).images[0]
+
+            assert expected_steps_1 == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}"
+
+            inputs_2 = {**inputs, **{"denoising_start": split, "image": latents}}
+            pipe_2(**inputs_2).images[0]
+
+            assert expected_steps_2 == done_steps[len(expected_steps_1) :]
+            assert expected_steps == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}"
+
+        for steps in [5, 8, 20]:
+            for split in [0.33, 0.49, 0.71]:
+                for scheduler_cls in [
+                    DDIMScheduler,
+                    EulerDiscreteScheduler,
+                    DPMSolverMultistepScheduler,
+                    UniPCMultistepScheduler,
+                    HeunDiscreteScheduler,
+                ]:
+                    assert_run_mixture(steps, split, scheduler_cls)
+
+    @slow
+    def test_stable_diffusion_three_xl_mixture_of_denoiser(self):
+        components = self.get_dummy_components()
+        pipe_1 = StableDiffusionXLInpaintPipeline(**components).to(torch_device)
+        pipe_1.unet.set_default_attn_processor()
+        pipe_2 = StableDiffusionXLInpaintPipeline(**components).to(torch_device)
+        pipe_2.unet.set_default_attn_processor()
+        pipe_3 = StableDiffusionXLInpaintPipeline(**components).to(torch_device)
+        pipe_3.unet.set_default_attn_processor()
+
+        def assert_run_mixture(
+            num_steps,
+            split_1,
+            split_2,
+            scheduler_cls_orig,
+            num_train_timesteps=pipe_1.scheduler.config.num_train_timesteps,
+        ):
+            inputs = self.get_dummy_inputs(torch_device)
+            inputs["num_inference_steps"] = num_steps
+
+            class scheduler_cls(scheduler_cls_orig):
+                pass
+
+            pipe_1.scheduler = scheduler_cls.from_config(pipe_1.scheduler.config)
+            pipe_2.scheduler = scheduler_cls.from_config(pipe_2.scheduler.config)
+            pipe_3.scheduler = scheduler_cls.from_config(pipe_3.scheduler.config)
+
+            # Let's retrieve the number of timesteps we want to use
+            pipe_1.scheduler.set_timesteps(num_steps)
+            expected_steps = pipe_1.scheduler.timesteps.tolist()
+
+            split_1_ts = num_train_timesteps - int(round(num_train_timesteps * split_1))
+            split_2_ts = num_train_timesteps - int(round(num_train_timesteps * split_2))
+
+            if pipe_1.scheduler.order == 2:
+                expected_steps_1 = list(filter(lambda ts: ts >= split_1_ts, expected_steps))
+                expected_steps_2 = expected_steps_1[-1:] + list(
+                    filter(lambda ts: ts >= split_2_ts and ts < split_1_ts, expected_steps)
+                )
+                expected_steps_3 = expected_steps_2[-1:] + list(filter(lambda ts: ts < split_2_ts, expected_steps))
+                expected_steps = expected_steps_1 + expected_steps_2 + expected_steps_3
+            else:
+                expected_steps_1 = list(filter(lambda ts: ts >= split_1_ts, expected_steps))
+                expected_steps_2 = list(filter(lambda ts: ts >= split_2_ts and ts < split_1_ts, expected_steps))
+                expected_steps_3 = list(filter(lambda ts: ts < split_2_ts, expected_steps))
+
+            # now we monkey patch step `done_steps`
+            # list into the step function for testing
+            done_steps = []
+            old_step = copy.copy(scheduler_cls.step)
+
+            def new_step(self, *args, **kwargs):
+                done_steps.append(args[1].cpu().item())  # args[1] is always the passed `t`
+                return old_step(self, *args, **kwargs)
+
+            scheduler_cls.step = new_step
+
+            inputs_1 = {**inputs, **{"denoising_end": split_1, "output_type": "latent"}}
+            latents = pipe_1(**inputs_1).images[0]
+
+            assert (
+                expected_steps_1 == done_steps
+            ), f"Failure with {scheduler_cls.__name__} and {num_steps} and {split_1} and {split_2}"
+
+            inputs_2 = {
+                **inputs,
+                **{"denoising_start": split_1, "denoising_end": split_2, "image": latents, "output_type": "latent"},
+            }
+            pipe_2(**inputs_2).images[0]
+
+            assert expected_steps_2 == done_steps[len(expected_steps_1) :]
+
+            inputs_3 = {**inputs, **{"denoising_start": split_2, "image": latents}}
+            pipe_3(**inputs_3).images[0]
+
+            assert expected_steps_3 == done_steps[len(expected_steps_1) + len(expected_steps_2) :]
+            assert (
+                expected_steps == done_steps
+            ), f"Failure with {scheduler_cls.__name__} and {num_steps} and {split_1} and {split_2}"
+
+        for steps in [7, 11, 20]:
+            for split_1, split_2 in zip([0.19, 0.32], [0.81, 0.68]):
+                for scheduler_cls in [
+                    DDIMScheduler,
+                    EulerDiscreteScheduler,
+                    DPMSolverMultistepScheduler,
+                    UniPCMultistepScheduler,
+                    HeunDiscreteScheduler,
+                ]:
+                    assert_run_mixture(steps, split_1, split_2, scheduler_cls)
+
+    def test_stable_diffusion_xl_multi_prompts(self):
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+
+        # forward with single prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = 5
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = 5
+        inputs["prompt_2"] = inputs["prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = 5
+        inputs["prompt_2"] = "different prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+        # manually set a negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = 5
+        inputs["negative_prompt"] = "negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with same negative_prompt duplicated
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = 5
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = inputs["negative_prompt"]
+        output = sd_pipe(**inputs)
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+        # forward with different negative_prompt
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = 5
+        inputs["negative_prompt"] = "negative prompt"
+        inputs["negative_prompt_2"] = "different negative prompt"
+        output = sd_pipe(**inputs)
+        image_slice_3 = output.images[0, -3:, -3:, -1]
+
+        # ensure the results are not equal
+        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
+
+    def test_stable_diffusion_xl_img2img_negative_conditions(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice_with_no_neg_conditions = image[0, -3:, -3:, -1]
+
+        image = sd_pipe(
+            **inputs,
+            negative_original_size=(512, 512),
+            negative_crops_coords_top_left=(
+                0,
+                0,
+            ),
+            negative_target_size=(1024, 1024),
+        ).images
+        image_slice_with_neg_conditions = image[0, -3:, -3:, -1]
+
+        assert (
+            np.abs(image_slice_with_no_neg_conditions.flatten() - image_slice_with_neg_conditions.flatten()).max()
+            > 1e-4
+        )
+
+    def test_stable_diffusion_xl_inpaint_mask_latents(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # normal mask + normal image
+        ##  `image`: pil, `mask_image``: pil, `masked_image_latents``: None
+        inputs = self.get_dummy_inputs(device)
+        inputs["strength"] = 0.9
+        out_0 = sd_pipe(**inputs).images
+
+        # image latents + mask latents
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe.image_processor.preprocess(inputs["image"]).to(sd_pipe.device)
+        mask = sd_pipe.mask_processor.preprocess(inputs["mask_image"]).to(sd_pipe.device)
+        masked_image = image * (mask < 0.5)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_latents = sd_pipe._encode_vae_image(image, generator=generator)
+        torch.randn((1, 4, 32, 32), generator=generator)
+        mask_latents = sd_pipe._encode_vae_image(masked_image, generator=generator)
+        inputs["image"] = image_latents
+        inputs["masked_image_latents"] = mask_latents
+        inputs["mask_image"] = mask
+        inputs["strength"] = 0.9
+        generator = torch.Generator(device=device).manual_seed(0)
+        torch.randn((1, 4, 32, 32), generator=generator)
+        inputs["generator"] = generator
+        out_1 = sd_pipe(**inputs).images
+        assert np.abs(out_0 - out_1).max() < 1e-2
+
+    def test_stable_diffusion_xl_inpaint_2_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # test to confirm if we pass two same image, we will get same output
+        inputs = self.get_dummy_inputs(device)
+        gen1 = torch.Generator(device=device).manual_seed(0)
+        gen2 = torch.Generator(device=device).manual_seed(0)
+        for name in ["prompt", "image", "mask_image"]:
+            inputs[name] = [inputs[name]] * 2
+        inputs["generator"] = [gen1, gen2]
+        images = sd_pipe(**inputs).images
+
+        assert images.shape == (2, 64, 64, 3)
+
+        image_slice1 = images[0, -3:, -3:, -1]
+        image_slice2 = images[1, -3:, -3:, -1]
+        assert np.abs(image_slice1.flatten() - image_slice2.flatten()).max() < 1e-4
+
+        # test to confirm that if we pass two different images, we will get different output
+        inputs = self.get_dummy_inputs_2images(device)
+        images = sd_pipe(**inputs).images
+        assert images.shape == (2, 64, 64, 3)
+
+        image_slice1 = images[0, -3:, -3:, -1]
+        image_slice2 = images[1, -3:, -3:, -1]
+        assert np.abs(image_slice1.flatten() - image_slice2.flatten()).max() > 1e-2
+
+    def test_pipeline_interrupt(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLInpaintPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        prompt = "hey"
+        num_inference_steps = 5
+
+        # store intermediate latents from the generation process
+        class PipelineState:
+            def __init__(self):
+                self.state = []
+
+            def apply(self, pipe, i, t, callback_kwargs):
+                self.state.append(callback_kwargs["latents"])
+                return callback_kwargs
+
+        pipe_state = PipelineState()
+        sd_pipe(
+            prompt,
+            image=inputs["image"],
+            mask_image=inputs["mask_image"],
+            strength=0.8,
+            num_inference_steps=num_inference_steps,
+            output_type="np",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=pipe_state.apply,
+        ).images
+
+        # interrupt generation at step index
+        interrupt_step_idx = 1
+
+        def callback_on_step_end(pipe, i, t, callback_kwargs):
+            if i == interrupt_step_idx:
+                pipe._interrupt = True
+
+            return callback_kwargs
+
+        output_interrupted = sd_pipe(
+            prompt,
+            image=inputs["image"],
+            mask_image=inputs["mask_image"],
+            strength=0.8,
+            num_inference_steps=num_inference_steps,
+            output_type="latent",
+            generator=torch.Generator("cpu").manual_seed(0),
+            callback_on_step_end=callback_on_step_end,
+        ).images
+
+        # fetch intermediate latents at the interrupted step
+        # from the completed generation process
+        intermediate_latent = pipe_state.state[interrupt_step_idx]
+
+        # compare the intermediate latent to the output of the interrupted process
+        # they should be the same
+        assert torch.allclose(intermediate_latent, output_interrupted, atol=1e-4)
diff --git a/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py
new file mode 100644
index 0000000000000000000000000000000000000000..98cecb4e0f7c429258ffd49c0cd1b1944dafb1ec
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py
@@ -0,0 +1,189 @@
+# coding=utf-8
+# Copyright 2024 Harutatsu Akiyama and HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    EulerDiscreteScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_instruct_pix2pix import (
+    StableDiffusionXLInstructPix2PixPipeline,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, torch_device
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import (
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLInstructPix2PixPipelineFastTests(
+    PipelineLatentTesterMixin,
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineTesterMixin,
+    SDXLOptionalComponentsTesterMixin,
+    unittest.TestCase,
+):
+    pipeline_class = StableDiffusionXLInstructPix2PixPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"}
+    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=8,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 5 * 8 + 32
+            cross_attention_dim=64,
+        )
+
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image / 2 + 0.5
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "image_guidance_scale": 1,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_components_function(self):
+        init_components = self.get_dummy_components()
+        pipe = self.pipeline_class(**init_components)
+
+        self.assertTrue(hasattr(pipe, "components"))
+        self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+    # Overwrite the default test_latents_inputs because pix2pix encode the image differently
+    def test_latents_input(self):
+        components = self.get_dummy_components()
+        pipe = StableDiffusionXLInstructPix2PixPipeline(**components)
+        pipe.image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        out = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pt"))[0]
+
+        vae = components["vae"]
+        inputs = self.get_dummy_inputs_by_type(torch_device, input_image_type="pt")
+
+        for image_param in self.image_latents_params:
+            if image_param in inputs.keys():
+                inputs[image_param] = vae.encode(inputs[image_param]).latent_dist.mode()
+
+        out_latents_inputs = pipe(**inputs)[0]
+
+        max_diff = np.abs(out - out_latents_inputs).max()
+        self.assertLess(max_diff, 1e-4, "passing latents as image input generate different result from passing image")
+
+    def test_cfg(self):
+        pass
+
+    def test_save_load_optional_components(self):
+        self._test_save_load_optional_components()
diff --git a/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..94ee9f0facc88c0fa30e1ddcd01012e91b185b6c
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py
@@ -0,0 +1,140 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import StableDiffusionXLKDiffusionPipeline
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLKPipelineIntegrationTests(unittest.TestCase):
+    dtype = torch.float16
+
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_xl(self):
+        sd_pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=self.dtype
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        sd_pipe.set_scheduler("sample_euler")
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=9.0,
+            num_inference_steps=2,
+            height=512,
+            width=512,
+            output_type="np",
+        )
+
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.5420, 0.5038, 0.2439, 0.5371, 0.4660, 0.1906, 0.5221, 0.4290, 0.2566])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_karras_sigmas(self):
+        sd_pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=self.dtype
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        sd_pipe.set_scheduler("sample_dpmpp_2m")
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="np",
+            use_karras_sigmas=True,
+            height=512,
+            width=512,
+        )
+
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.6418, 0.6424, 0.6462, 0.6271, 0.6314, 0.6295, 0.6249, 0.6339, 0.6335])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_noise_sampler_seed(self):
+        sd_pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=self.dtype
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        sd_pipe.set_scheduler("sample_dpmpp_sde")
+
+        prompt = "A painting of a squirrel eating a burger"
+        seed = 0
+        images1 = sd_pipe(
+            [prompt],
+            generator=torch.manual_seed(seed),
+            noise_sampler_seed=seed,
+            guidance_scale=9.0,
+            num_inference_steps=2,
+            output_type="np",
+            height=512,
+            width=512,
+        ).images
+        images2 = sd_pipe(
+            [prompt],
+            generator=torch.manual_seed(seed),
+            noise_sampler_seed=seed,
+            guidance_scale=9.0,
+            num_inference_steps=2,
+            output_type="np",
+            height=512,
+            width=512,
+        ).images
+        assert images1.shape == (1, 512, 512, 3)
+        assert images2.shape == (1, 512, 512, 3)
+        assert np.abs(images1.flatten() - images2.flatten()).max() < 1e-2
diff --git a/diffusers/tests/pipelines/stable_unclip/__init__.py b/diffusers/tests/pipelines/stable_unclip/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_unclip/test_stable_unclip.py b/diffusers/tests/pipelines/stable_unclip/test_stable_unclip.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb54d212a7861c1f0e5e39c0c81a70d2182b919e
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_unclip/test_stable_unclip.py
@@ -0,0 +1,243 @@
+import gc
+import unittest
+
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DDPMScheduler,
+    PriorTransformer,
+    StableUnCLIPPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, nightly, require_torch_gpu, torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    assert_mean_pixel_difference,
+)
+
+
+enable_full_determinism()
+
+
+class StableUnCLIPPipelineFastTests(
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableUnCLIPPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+
+    # TODO(will) Expected attn_bias.stride(1) == 0 to be true, but got false
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        embedder_hidden_size = 32
+        embedder_projection_dim = embedder_hidden_size
+
+        # prior components
+
+        torch.manual_seed(0)
+        prior_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+        prior_text_encoder = CLIPTextModelWithProjection(
+            CLIPTextConfig(
+                bos_token_id=0,
+                eos_token_id=2,
+                hidden_size=embedder_hidden_size,
+                projection_dim=embedder_projection_dim,
+                intermediate_size=37,
+                layer_norm_eps=1e-05,
+                num_attention_heads=4,
+                num_hidden_layers=5,
+                pad_token_id=1,
+                vocab_size=1000,
+            )
+        )
+
+        torch.manual_seed(0)
+        prior = PriorTransformer(
+            num_attention_heads=2,
+            attention_head_dim=12,
+            embedding_dim=embedder_projection_dim,
+            num_layers=1,
+        )
+
+        torch.manual_seed(0)
+        prior_scheduler = DDPMScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="sample",
+            num_train_timesteps=1000,
+            clip_sample=True,
+            clip_sample_range=5.0,
+            beta_schedule="squaredcos_cap_v2",
+        )
+
+        # regular denoising components
+
+        torch.manual_seed(0)
+        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size)
+        image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2")
+
+        torch.manual_seed(0)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModel(
+            CLIPTextConfig(
+                bos_token_id=0,
+                eos_token_id=2,
+                hidden_size=embedder_hidden_size,
+                projection_dim=32,
+                intermediate_size=37,
+                layer_norm_eps=1e-05,
+                num_attention_heads=4,
+                num_hidden_layers=5,
+                pad_token_id=1,
+                vocab_size=1000,
+            )
+        )
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
+            block_out_channels=(32, 64),
+            attention_head_dim=(2, 4),
+            class_embed_type="projection",
+            # The class embeddings are the noise augmented image embeddings.
+            # I.e. the image embeddings concated with the noised embeddings of the same dimension
+            projection_class_embeddings_input_dim=embedder_projection_dim * 2,
+            cross_attention_dim=embedder_hidden_size,
+            layers_per_block=1,
+            upcast_attention=True,
+            use_linear_projection=True,
+        )
+
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_schedule="scaled_linear",
+            beta_start=0.00085,
+            beta_end=0.012,
+            prediction_type="v_prediction",
+            set_alpha_to_one=False,
+            steps_offset=1,
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL()
+
+        components = {
+            # prior components
+            "prior_tokenizer": prior_tokenizer,
+            "prior_text_encoder": prior_text_encoder,
+            "prior": prior,
+            "prior_scheduler": prior_scheduler,
+            # image noising components
+            "image_normalizer": image_normalizer,
+            "image_noising_scheduler": image_noising_scheduler,
+            # regular denoising components
+            "tokenizer": tokenizer,
+            "text_encoder": text_encoder,
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "prior_num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
+    # because UnCLIP GPU undeterminism requires a looser check.
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+
+        self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference)
+
+    # Overriding PipelineTesterMixin::test_inference_batch_single_identical
+    # because UnCLIP undeterminism requires a looser check.
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
+
+
+@nightly
+@require_torch_gpu
+class StableUnCLIPPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_unclip(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_l_anime_turtle_fp16.npy"
+        )
+
+        pipe = StableUnCLIPPipeline.from_pretrained("fusing/stable-unclip-2-1-l", torch_dtype=torch.float16)
+        pipe.set_progress_bar_config(disable=None)
+        # stable unclip will oom when integration tests are run on a V100,
+        # so turn on memory savings
+        pipe.enable_attention_slicing()
+        pipe.enable_sequential_cpu_offload()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe("anime turle", generator=generator, output_type="np")
+
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
+
+    def test_stable_unclip_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe = StableUnCLIPPipeline.from_pretrained("fusing/stable-unclip-2-1-l", torch_dtype=torch.float16)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+        pipe.enable_sequential_cpu_offload()
+
+        _ = pipe(
+            "anime turtle",
+            prior_num_inference_steps=2,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 7 GB is allocated
+        assert mem_bytes < 7 * 10**9
diff --git a/diffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/diffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5cbf77615014e276c64da559466c2820f34c040
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
@@ -0,0 +1,303 @@
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableUnCLIPImg2ImgPipeline, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    nightly,
+    require_torch_gpu,
+    skip_mps,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+from ..test_pipelines_common import (
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+    assert_mean_pixel_difference,
+)
+
+
+enable_full_determinism()
+
+
+class StableUnCLIPImg2ImgPipelineFastTests(
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableUnCLIPImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = frozenset(
+        []
+    )  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+    image_latents_params = frozenset([])
+
+    def get_dummy_components(self):
+        embedder_hidden_size = 32
+        embedder_projection_dim = embedder_hidden_size
+
+        # image encoding components
+
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
+
+        torch.manual_seed(0)
+        image_encoder = CLIPVisionModelWithProjection(
+            CLIPVisionConfig(
+                hidden_size=embedder_hidden_size,
+                projection_dim=embedder_projection_dim,
+                num_hidden_layers=5,
+                num_attention_heads=4,
+                image_size=32,
+                intermediate_size=37,
+                patch_size=1,
+            )
+        )
+
+        # regular denoising components
+
+        torch.manual_seed(0)
+        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size)
+        image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2")
+
+        torch.manual_seed(0)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+        text_encoder = CLIPTextModel(
+            CLIPTextConfig(
+                bos_token_id=0,
+                eos_token_id=2,
+                hidden_size=embedder_hidden_size,
+                projection_dim=32,
+                intermediate_size=37,
+                layer_norm_eps=1e-05,
+                num_attention_heads=4,
+                num_hidden_layers=5,
+                pad_token_id=1,
+                vocab_size=1000,
+            )
+        )
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
+            block_out_channels=(32, 64),
+            attention_head_dim=(2, 4),
+            class_embed_type="projection",
+            # The class embeddings are the noise augmented image embeddings.
+            # I.e. the image embeddings concated with the noised embeddings of the same dimension
+            projection_class_embeddings_input_dim=embedder_projection_dim * 2,
+            cross_attention_dim=embedder_hidden_size,
+            layers_per_block=1,
+            upcast_attention=True,
+            use_linear_projection=True,
+        )
+
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler(
+            beta_schedule="scaled_linear",
+            beta_start=0.00085,
+            beta_end=0.012,
+            prediction_type="v_prediction",
+            set_alpha_to_one=False,
+            steps_offset=1,
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL()
+
+        components = {
+            # image encoding components
+            "feature_extractor": feature_extractor,
+            "image_encoder": image_encoder.eval(),
+            # image noising components
+            "image_normalizer": image_normalizer.eval(),
+            "image_noising_scheduler": image_noising_scheduler,
+            # regular denoising components
+            "tokenizer": tokenizer,
+            "text_encoder": text_encoder.eval(),
+            "unet": unet.eval(),
+            "scheduler": scheduler,
+            "vae": vae.eval(),
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0, pil_image=True):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+
+        if pil_image:
+            input_image = input_image * 0.5 + 0.5
+            input_image = input_image.clamp(0, 1)
+            input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy()
+            input_image = DiffusionPipeline.numpy_to_pil(input_image)[0]
+
+        return {
+            "prompt": "An anime racoon running a marathon",
+            "image": input_image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+
+    @skip_mps
+    def test_image_embeds_none(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableUnCLIPImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs.update({"image_embeds": None})
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4397, 0.7080, 0.5590, 0.4255, 0.7181, 0.5938, 0.4051, 0.3720, 0.5116])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
+    # because GPU undeterminism requires a looser check.
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device in ["cpu", "mps"]
+
+        self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference)
+
+    # Overriding PipelineTesterMixin::test_inference_batch_single_identical
+    # because undeterminism requires a looser check.
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False)
+
+
+@nightly
+@require_torch_gpu
+class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_unclip_l_img2img(self):
+        input_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png"
+        )
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_l_img2img_anime_turtle_fp16.npy"
+        )
+
+        pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+            "fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16
+        )
+        pipe.set_progress_bar_config(disable=None)
+        # stable unclip will oom when integration tests are run on a V100,
+        # so turn on memory savings
+        pipe.enable_attention_slicing()
+        pipe.enable_sequential_cpu_offload()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe(input_image, "anime turle", generator=generator, output_type="np")
+
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
+
+    def test_stable_unclip_h_img2img(self):
+        input_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png"
+        )
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_h_img2img_anime_turtle_fp16.npy"
+        )
+
+        pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+            "fusing/stable-unclip-2-1-h-img2img", torch_dtype=torch.float16
+        )
+        pipe.set_progress_bar_config(disable=None)
+        # stable unclip will oom when integration tests are run on a V100,
+        # so turn on memory savings
+        pipe.enable_attention_slicing()
+        pipe.enable_sequential_cpu_offload()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe(input_image, "anime turle", generator=generator, output_type="np")
+
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
+
+    def test_stable_unclip_img2img_pipeline_with_sequential_cpu_offloading(self):
+        input_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png"
+        )
+
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+            "fusing/stable-unclip-2-1-h-img2img", torch_dtype=torch.float16
+        )
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+        pipe.enable_sequential_cpu_offload()
+
+        _ = pipe(
+            input_image,
+            "anime turtle",
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 7 GB is allocated
+        assert mem_bytes < 7 * 10**9
diff --git a/diffusers/tests/pipelines/stable_video_diffusion/__init__.py b/diffusers/tests/pipelines/stable_video_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py b/diffusers/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..60fc21e2027b0513d211a5d1771646ebe31c1625
--- /dev/null
+++ b/diffusers/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
@@ -0,0 +1,559 @@
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+import diffusers
+from diffusers import (
+    AutoencoderKLTemporalDecoder,
+    EulerDiscreteScheduler,
+    StableVideoDiffusionPipeline,
+    UNetSpatioTemporalConditionModel,
+)
+from diffusers.utils import is_accelerate_available, is_accelerate_version, load_image, logging
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    CaptureLogger,
+    enable_full_determinism,
+    floats_tensor,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+class StableVideoDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = StableVideoDiffusionPipeline
+    params = frozenset(["image"])
+    batch_params = frozenset(["image", "generator"])
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+        ]
+    )
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNetSpatioTemporalConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=8,
+            out_channels=4,
+            down_block_types=(
+                "CrossAttnDownBlockSpatioTemporal",
+                "DownBlockSpatioTemporal",
+            ),
+            up_block_types=("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal"),
+            cross_attention_dim=32,
+            num_attention_heads=8,
+            projection_class_embeddings_input_dim=96,
+            addition_time_embed_dim=32,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            interpolation_type="linear",
+            num_train_timesteps=1000,
+            prediction_type="v_prediction",
+            sigma_max=700.0,
+            sigma_min=0.002,
+            steps_offset=1,
+            timestep_spacing="leading",
+            timestep_type="continuous",
+            trained_betas=None,
+            use_karras_sigmas=True,
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKLTemporalDecoder(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            latent_channels=4,
+        )
+
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=32,
+            projection_dim=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            image_size=32,
+            intermediate_size=37,
+            patch_size=1,
+        )
+        image_encoder = CLIPVisionModelWithProjection(config)
+
+        torch.manual_seed(0)
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
+        components = {
+            "unet": unet,
+            "image_encoder": image_encoder,
+            "scheduler": scheduler,
+            "vae": vae,
+            "feature_extractor": feature_extractor,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(0)).to(device)
+        inputs = {
+            "generator": generator,
+            "image": image,
+            "num_inference_steps": 2,
+            "output_type": "pt",
+            "min_guidance_scale": 1.0,
+            "max_guidance_scale": 2.5,
+            "num_frames": 2,
+            "height": 32,
+            "width": 32,
+        }
+        return inputs
+
+    @unittest.skip("Deprecated functionality")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    @unittest.skip("Batched inference works and outputs look correct, but the test is failing")
+    def test_inference_batch_single_identical(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for components in pipe.components.values():
+            if hasattr(components, "set_default_attn_processor"):
+                components.set_default_attn_processor()
+        pipe.to(torch_device)
+
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = torch.Generator("cpu").manual_seed(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        batched_inputs["generator"] = [torch.Generator("cpu").manual_seed(0) for i in range(batch_size)]
+        batched_inputs["image"] = torch.cat([inputs["image"]] * batch_size, dim=0)
+
+        output = pipe(**inputs).frames
+        output_batch = pipe(**batched_inputs).frames
+
+        assert len(output_batch) == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0]) - to_np(output[0])).max()
+        assert max_diff < expected_max_diff
+
+    @unittest.skip("Test is similar to test_inference_batch_single_identical")
+    def test_inference_batch_consistent(self):
+        pass
+
+    def test_np_output_type(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["output_type"] = "np"
+        output = pipe(**inputs).frames
+        self.assertTrue(isinstance(output, np.ndarray))
+        self.assertEqual(len(output.shape), 5)
+
+    def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        output = pipe(**self.get_dummy_inputs(generator_device)).frames[0]
+        output_tuple = pipe(**self.get_dummy_inputs(generator_device), return_dict=False)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_tuple)).max()
+        self.assertLess(max_diff, expected_max_difference)
+
+    @unittest.skip("Test is currently failing")
+    def test_float16_inference(self, expected_max_diff=5e-2):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        components = self.get_dummy_components()
+        pipe_fp16 = self.pipeline_class(**components)
+        for component in pipe_fp16.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe_fp16.to(torch_device, torch.float16)
+        pipe_fp16.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs).frames[0]
+
+        fp16_inputs = self.get_dummy_inputs(torch_device)
+        output_fp16 = pipe_fp16(**fp16_inputs).frames[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_fp16)).max()
+        self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.")
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_save_load_float16(self, expected_max_diff=1e-2):
+        components = self.get_dummy_components()
+        for name, module in components.items():
+            if hasattr(module, "half"):
+                components[name] = module.to(torch_device).half()
+
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs).frames[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16)
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for name, component in pipe_loaded.components.items():
+            if hasattr(component, "dtype"):
+                self.assertTrue(
+                    component.dtype == torch.float16,
+                    f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.",
+                )
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_loaded = pipe_loaded(**inputs).frames[0]
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(
+            max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
+        )
+
+    def test_save_load_optional_components(self, expected_max_difference=1e-4):
+        if not hasattr(self.pipeline_class, "_optional_components"):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output = pipe(**inputs).frames[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=False)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(generator_device)
+        output_loaded = pipe_loaded(**inputs).frames[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, expected_max_difference)
+
+    def test_save_load_local(self, expected_max_difference=9e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs).frames[0]
+
+        logger = logging.get_logger("diffusers.pipelines.pipeline_utils")
+        logger.setLevel(diffusers.logging.INFO)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=False)
+
+            with CaptureLogger(logger) as cap_logger:
+                pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+
+            for name in pipe_loaded.components.keys():
+                if name not in pipe_loaded._optional_components:
+                    assert name in str(cap_logger)
+
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_loaded = pipe_loaded(**inputs).frames[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, expected_max_difference)
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu")).frames[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [
+            component.device.type for component in pipe.components.values() if hasattr(component, "device")
+        ]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda")).frames[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
+    )
+    def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_offload = pipe(**inputs).frames[0]
+
+        pipe.enable_sequential_cpu_offload()
+
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_offload = pipe(**inputs).frames[0]
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
+    )
+    def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_offload = pipe(**inputs).frames[0]
+
+        pipe.enable_model_cpu_offload()
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_offload = pipe(**inputs).frames[0]
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+        offloaded_modules = [
+            v
+            for k, v in pipe.components.items()
+            if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
+        ]
+        (
+            self.assertTrue(all(v.device.type == "cpu" for v in offloaded_modules)),
+            f"Not offloaded: {[v for v in offloaded_modules if v.device.type != 'cpu']}",
+        )
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        expected_max_diff = 9e-4
+
+        if not self.test_xformers_attention:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = pipe(**inputs).frames[0]
+        output_without_offload = (
+            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        )
+
+        pipe.enable_xformers_memory_efficient_attention()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = pipe(**inputs).frames[0]
+        output_with_offload = (
+            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
+        )
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results")
+
+    def test_disable_cfg(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["max_guidance_scale"] = 1.0
+        output = pipe(**inputs).frames
+        self.assertEqual(len(output.shape), 5)
+
+
+@slow
+@require_torch_gpu
+class StableVideoDiffusionPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_sd_video(self):
+        pipe = StableVideoDiffusionPipeline.from_pretrained(
+            "stabilityai/stable-video-diffusion-img2vid",
+            variant="fp16",
+            torch_dtype=torch.float16,
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
+        )
+
+        generator = torch.Generator("cpu").manual_seed(0)
+        num_frames = 3
+
+        output = pipe(
+            image=image,
+            num_frames=num_frames,
+            generator=generator,
+            num_inference_steps=3,
+            output_type="np",
+        )
+
+        image = output.frames[0]
+        assert image.shape == (num_frames, 576, 1024, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.8592, 0.8645, 0.8499, 0.8722, 0.8769, 0.8421, 0.8557, 0.8528, 0.8285])
+        assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3
diff --git a/diffusers/tests/pipelines/test_pipeline_utils.py b/diffusers/tests/pipelines/test_pipeline_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..697244dcb10528c025a4ac8bd046b70a9effb594
--- /dev/null
+++ b/diffusers/tests/pipelines/test_pipeline_utils.py
@@ -0,0 +1,446 @@
+import contextlib
+import io
+import re
+import unittest
+
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AnimateDiffPipeline,
+    AnimateDiffVideoToVideoPipeline,
+    AutoencoderKL,
+    DDIMScheduler,
+    MotionAdapter,
+    StableDiffusionImg2ImgPipeline,
+    StableDiffusionInpaintPipeline,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.pipeline_utils import is_safetensors_compatible
+from diffusers.utils.testing_utils import torch_device
+
+
+class IsSafetensorsCompatibleTests(unittest.TestCase):
+    def test_all_is_compatible(self):
+        filenames = [
+            "safety_checker/pytorch_model.bin",
+            "safety_checker/model.safetensors",
+            "vae/diffusion_pytorch_model.bin",
+            "vae/diffusion_pytorch_model.safetensors",
+            "text_encoder/pytorch_model.bin",
+            "text_encoder/model.safetensors",
+            "unet/diffusion_pytorch_model.bin",
+            "unet/diffusion_pytorch_model.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_diffusers_model_is_compatible(self):
+        filenames = [
+            "unet/diffusion_pytorch_model.bin",
+            "unet/diffusion_pytorch_model.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_diffusers_model_is_not_compatible(self):
+        filenames = [
+            "safety_checker/pytorch_model.bin",
+            "safety_checker/model.safetensors",
+            "vae/diffusion_pytorch_model.bin",
+            "vae/diffusion_pytorch_model.safetensors",
+            "text_encoder/pytorch_model.bin",
+            "text_encoder/model.safetensors",
+            "unet/diffusion_pytorch_model.bin",
+            # Removed: 'unet/diffusion_pytorch_model.safetensors',
+        ]
+        self.assertFalse(is_safetensors_compatible(filenames))
+
+    def test_transformer_model_is_compatible(self):
+        filenames = [
+            "text_encoder/pytorch_model.bin",
+            "text_encoder/model.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_transformer_model_is_not_compatible(self):
+        filenames = [
+            "safety_checker/pytorch_model.bin",
+            "safety_checker/model.safetensors",
+            "vae/diffusion_pytorch_model.bin",
+            "vae/diffusion_pytorch_model.safetensors",
+            "text_encoder/pytorch_model.bin",
+            # Removed: 'text_encoder/model.safetensors',
+            "unet/diffusion_pytorch_model.bin",
+            "unet/diffusion_pytorch_model.safetensors",
+        ]
+        self.assertFalse(is_safetensors_compatible(filenames))
+
+    def test_all_is_compatible_variant(self):
+        filenames = [
+            "safety_checker/pytorch_model.fp16.bin",
+            "safety_checker/model.fp16.safetensors",
+            "vae/diffusion_pytorch_model.fp16.bin",
+            "vae/diffusion_pytorch_model.fp16.safetensors",
+            "text_encoder/pytorch_model.fp16.bin",
+            "text_encoder/model.fp16.safetensors",
+            "unet/diffusion_pytorch_model.fp16.bin",
+            "unet/diffusion_pytorch_model.fp16.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_diffusers_model_is_compatible_variant(self):
+        filenames = [
+            "unet/diffusion_pytorch_model.fp16.bin",
+            "unet/diffusion_pytorch_model.fp16.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_diffusers_model_is_compatible_variant_mixed(self):
+        filenames = [
+            "unet/diffusion_pytorch_model.bin",
+            "unet/diffusion_pytorch_model.fp16.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_diffusers_model_is_not_compatible_variant(self):
+        filenames = [
+            "safety_checker/pytorch_model.fp16.bin",
+            "safety_checker/model.fp16.safetensors",
+            "vae/diffusion_pytorch_model.fp16.bin",
+            "vae/diffusion_pytorch_model.fp16.safetensors",
+            "text_encoder/pytorch_model.fp16.bin",
+            "text_encoder/model.fp16.safetensors",
+            "unet/diffusion_pytorch_model.fp16.bin",
+            # Removed: 'unet/diffusion_pytorch_model.fp16.safetensors',
+        ]
+        self.assertFalse(is_safetensors_compatible(filenames))
+
+    def test_transformer_model_is_compatible_variant(self):
+        filenames = [
+            "text_encoder/pytorch_model.fp16.bin",
+            "text_encoder/model.fp16.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_transformer_model_is_not_compatible_variant(self):
+        filenames = [
+            "safety_checker/pytorch_model.fp16.bin",
+            "safety_checker/model.fp16.safetensors",
+            "vae/diffusion_pytorch_model.fp16.bin",
+            "vae/diffusion_pytorch_model.fp16.safetensors",
+            "text_encoder/pytorch_model.fp16.bin",
+            "unet/diffusion_pytorch_model.fp16.bin",
+            "unet/diffusion_pytorch_model.fp16.safetensors",
+        ]
+        self.assertFalse(is_safetensors_compatible(filenames))
+
+    def test_transformer_model_is_compatible_variant_extra_folder(self):
+        filenames = [
+            "safety_checker/pytorch_model.fp16.bin",
+            "safety_checker/model.fp16.safetensors",
+            "vae/diffusion_pytorch_model.fp16.bin",
+            "vae/diffusion_pytorch_model.fp16.safetensors",
+            "text_encoder/pytorch_model.fp16.bin",
+            "unet/diffusion_pytorch_model.fp16.bin",
+            "unet/diffusion_pytorch_model.fp16.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames, folder_names={"vae", "unet"}))
+
+    def test_transformer_model_is_not_compatible_variant_extra_folder(self):
+        filenames = [
+            "safety_checker/pytorch_model.fp16.bin",
+            "safety_checker/model.fp16.safetensors",
+            "vae/diffusion_pytorch_model.fp16.bin",
+            "vae/diffusion_pytorch_model.fp16.safetensors",
+            "text_encoder/pytorch_model.fp16.bin",
+            "unet/diffusion_pytorch_model.fp16.bin",
+            "unet/diffusion_pytorch_model.fp16.safetensors",
+        ]
+        self.assertFalse(is_safetensors_compatible(filenames, folder_names={"text_encoder"}))
+
+    def test_transformers_is_compatible_sharded(self):
+        filenames = [
+            "text_encoder/pytorch_model.bin",
+            "text_encoder/model-00001-of-00002.safetensors",
+            "text_encoder/model-00002-of-00002.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_transformers_is_compatible_variant_sharded(self):
+        filenames = [
+            "text_encoder/pytorch_model.bin",
+            "text_encoder/model.fp16-00001-of-00002.safetensors",
+            "text_encoder/model.fp16-00001-of-00002.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_diffusers_is_compatible_sharded(self):
+        filenames = [
+            "unet/diffusion_pytorch_model.bin",
+            "unet/diffusion_pytorch_model-00001-of-00002.safetensors",
+            "unet/diffusion_pytorch_model-00002-of-00002.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_diffusers_is_compatible_variant_sharded(self):
+        filenames = [
+            "unet/diffusion_pytorch_model.bin",
+            "unet/diffusion_pytorch_model.fp16-00001-of-00002.safetensors",
+            "unet/diffusion_pytorch_model.fp16-00001-of-00002.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+    def test_diffusers_is_compatible_only_variants(self):
+        filenames = [
+            "unet/diffusion_pytorch_model.fp16.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames))
+
+
+class ProgressBarTests(unittest.TestCase):
+    def get_dummy_components_image_generation(self):
+        cross_attention_dim = 8
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=1,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[4, 8],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def get_dummy_components_video_generation(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=block_out_channels,
+            layers_per_block=2,
+            sample_size=8,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="linear",
+            clip_sample=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=block_out_channels,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=cross_attention_dim,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
+        motion_adapter = MotionAdapter(
+            block_out_channels=block_out_channels,
+            motion_layers_per_block=2,
+            motion_norm_num_groups=2,
+            motion_num_attention_heads=4,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "motion_adapter": motion_adapter,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "feature_extractor": None,
+            "image_encoder": None,
+        }
+        return components
+
+    def test_text_to_image(self):
+        components = self.get_dummy_components_image_generation()
+        pipe = StableDiffusionPipeline(**components)
+        pipe.to(torch_device)
+
+        inputs = {"prompt": "a cute cat", "num_inference_steps": 2}
+        with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
+            _ = pipe(**inputs)
+            stderr = stderr.getvalue()
+            # we can't calculate the number of progress steps beforehand e.g. for strength-dependent img2img,
+            # so we just match "5" in "#####| 1/5 [00:01<00:00]"
+            max_steps = re.search("/(.*?) ", stderr).group(1)
+            self.assertTrue(max_steps is not None and len(max_steps) > 0)
+            self.assertTrue(
+                f"{max_steps}/{max_steps}" in stderr, "Progress bar should be enabled and stopped at the max step"
+            )
+
+        pipe.set_progress_bar_config(disable=True)
+        with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
+            _ = pipe(**inputs)
+            self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled")
+
+    def test_image_to_image(self):
+        components = self.get_dummy_components_image_generation()
+        pipe = StableDiffusionImg2ImgPipeline(**components)
+        pipe.to(torch_device)
+
+        image = Image.new("RGB", (32, 32))
+        inputs = {"prompt": "a cute cat", "num_inference_steps": 2, "strength": 0.5, "image": image}
+        with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
+            _ = pipe(**inputs)
+            stderr = stderr.getvalue()
+            # we can't calculate the number of progress steps beforehand e.g. for strength-dependent img2img,
+            # so we just match "5" in "#####| 1/5 [00:01<00:00]"
+            max_steps = re.search("/(.*?) ", stderr).group(1)
+            self.assertTrue(max_steps is not None and len(max_steps) > 0)
+            self.assertTrue(
+                f"{max_steps}/{max_steps}" in stderr, "Progress bar should be enabled and stopped at the max step"
+            )
+
+        pipe.set_progress_bar_config(disable=True)
+        with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
+            _ = pipe(**inputs)
+            self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled")
+
+    def test_inpainting(self):
+        components = self.get_dummy_components_image_generation()
+        pipe = StableDiffusionInpaintPipeline(**components)
+        pipe.to(torch_device)
+
+        image = Image.new("RGB", (32, 32))
+        mask = Image.new("RGB", (32, 32))
+        inputs = {
+            "prompt": "a cute cat",
+            "num_inference_steps": 2,
+            "strength": 0.5,
+            "image": image,
+            "mask_image": mask,
+        }
+        with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
+            _ = pipe(**inputs)
+            stderr = stderr.getvalue()
+            # we can't calculate the number of progress steps beforehand e.g. for strength-dependent img2img,
+            # so we just match "5" in "#####| 1/5 [00:01<00:00]"
+            max_steps = re.search("/(.*?) ", stderr).group(1)
+            self.assertTrue(max_steps is not None and len(max_steps) > 0)
+            self.assertTrue(
+                f"{max_steps}/{max_steps}" in stderr, "Progress bar should be enabled and stopped at the max step"
+            )
+
+        pipe.set_progress_bar_config(disable=True)
+        with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
+            _ = pipe(**inputs)
+            self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled")
+
+    def test_text_to_video(self):
+        components = self.get_dummy_components_video_generation()
+        pipe = AnimateDiffPipeline(**components)
+        pipe.to(torch_device)
+
+        inputs = {"prompt": "a cute cat", "num_inference_steps": 2, "num_frames": 2}
+        with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
+            _ = pipe(**inputs)
+            stderr = stderr.getvalue()
+            # we can't calculate the number of progress steps beforehand e.g. for strength-dependent img2img,
+            # so we just match "5" in "#####| 1/5 [00:01<00:00]"
+            max_steps = re.search("/(.*?) ", stderr).group(1)
+            self.assertTrue(max_steps is not None and len(max_steps) > 0)
+            self.assertTrue(
+                f"{max_steps}/{max_steps}" in stderr, "Progress bar should be enabled and stopped at the max step"
+            )
+
+        pipe.set_progress_bar_config(disable=True)
+        with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
+            _ = pipe(**inputs)
+            self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled")
+
+    def test_video_to_video(self):
+        components = self.get_dummy_components_video_generation()
+        pipe = AnimateDiffVideoToVideoPipeline(**components)
+        pipe.to(torch_device)
+
+        num_frames = 2
+        video = [Image.new("RGB", (32, 32))] * num_frames
+        inputs = {"prompt": "a cute cat", "num_inference_steps": 2, "video": video}
+        with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
+            _ = pipe(**inputs)
+            stderr = stderr.getvalue()
+            # we can't calculate the number of progress steps beforehand e.g. for strength-dependent img2img,
+            # so we just match "5" in "#####| 1/5 [00:01<00:00]"
+            max_steps = re.search("/(.*?) ", stderr).group(1)
+            self.assertTrue(max_steps is not None and len(max_steps) > 0)
+            self.assertTrue(
+                f"{max_steps}/{max_steps}" in stderr, "Progress bar should be enabled and stopped at the max step"
+            )
+
+        pipe.set_progress_bar_config(disable=True)
+        with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
+            _ = pipe(**inputs)
+            self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled")
diff --git a/diffusers/tests/pipelines/test_pipelines.py b/diffusers/tests/pipelines/test_pipelines.py
new file mode 100644
index 0000000000000000000000000000000000000000..c73a12a4cbf81bbc750b00e071715b3ca5bc066e
--- /dev/null
+++ b/diffusers/tests/pipelines/test_pipelines.py
@@ -0,0 +1,2016 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import json
+import os
+import random
+import shutil
+import sys
+import tempfile
+import traceback
+import unittest
+import unittest.mock as mock
+
+import numpy as np
+import PIL.Image
+import requests_mock
+import safetensors.torch
+import torch
+import torch.nn as nn
+from parameterized import parameterized
+from PIL import Image
+from requests.exceptions import HTTPError
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    ConfigMixin,
+    DDIMPipeline,
+    DDIMScheduler,
+    DDPMPipeline,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    ModelMixin,
+    PNDMScheduler,
+    StableDiffusionImg2ImgPipeline,
+    StableDiffusionInpaintPipelineLegacy,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    UNet2DModel,
+    UniPCMultistepScheduler,
+    logging,
+)
+from diffusers.pipelines.pipeline_utils import _get_pipeline_class
+from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
+from diffusers.utils import (
+    CONFIG_NAME,
+    WEIGHTS_NAME,
+)
+from diffusers.utils.testing_utils import (
+    CaptureLogger,
+    enable_full_determinism,
+    floats_tensor,
+    get_python_version,
+    get_tests_dir,
+    is_torch_compile,
+    load_numpy,
+    nightly,
+    require_compel,
+    require_flax,
+    require_onnxruntime,
+    require_torch_2,
+    require_torch_gpu,
+    run_test_in_subprocess,
+    slow,
+    torch_device,
+)
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+enable_full_determinism()
+
+
+# Will be run via run_test_in_subprocess
+def _test_from_save_pretrained_dynamo(in_queue, out_queue, timeout):
+    error = None
+    try:
+        # 1. Load models
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        model = torch.compile(model)
+        scheduler = DDPMScheduler(num_train_timesteps=10)
+
+        ddpm = DDPMPipeline(model, scheduler)
+
+        # previous diffusers versions stripped compilation off
+        # compiled modules
+        assert is_compiled_module(ddpm.unet)
+
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            ddpm.save_pretrained(tmpdirname)
+            new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)
+            new_ddpm.to(torch_device)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = ddpm(generator=generator, num_inference_steps=5, output_type="np").images
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="np").images
+
+        assert np.abs(image - new_image).max() < 1e-5, "Models don't give the same forward pass"
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+class CustomEncoder(ModelMixin, ConfigMixin):
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(3, 3)
+
+
+class CustomPipeline(DiffusionPipeline):
+    def __init__(self, encoder: CustomEncoder, scheduler: DDIMScheduler):
+        super().__init__()
+        self.register_modules(encoder=encoder, scheduler=scheduler)
+
+
+class DownloadTests(unittest.TestCase):
+    @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners")
+    def test_one_request_upon_cached(self):
+        # TODO: For some reason this test fails on MPS where no HEAD call is made.
+        if torch_device == "mps":
+            return
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with requests_mock.mock(real_http=True) as m:
+                DiffusionPipeline.download("hf-internal-testing/tiny-stable-diffusion-pipe", cache_dir=tmpdirname)
+
+            download_requests = [r.method for r in m.request_history]
+            assert download_requests.count("HEAD") == 15, "15 calls to files"
+            assert download_requests.count("GET") == 17, "15 calls to files + model_info + model_index.json"
+            assert (
+                len(download_requests) == 32
+            ), "2 calls per file (15 files) + send_telemetry, model_info and model_index.json"
+
+            with requests_mock.mock(real_http=True) as m:
+                DiffusionPipeline.download(
+                    "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname
+                )
+
+            cache_requests = [r.method for r in m.request_history]
+            assert cache_requests.count("HEAD") == 1, "model_index.json is only HEAD"
+            assert cache_requests.count("GET") == 1, "model info is only GET"
+            assert (
+                len(cache_requests) == 2
+            ), "We should call only `model_info` to check for _commit hash and `send_telemetry`"
+
+    def test_less_downloads_passed_object(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            cached_folder = DiffusionPipeline.download(
+                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname
+            )
+
+            # make sure safety checker is not downloaded
+            assert "safety_checker" not in os.listdir(cached_folder)
+
+            # make sure rest is downloaded
+            assert "unet" in os.listdir(cached_folder)
+            assert "tokenizer" in os.listdir(cached_folder)
+            assert "vae" in os.listdir(cached_folder)
+            assert "model_index.json" in os.listdir(cached_folder)
+            assert "scheduler" in os.listdir(cached_folder)
+            assert "feature_extractor" in os.listdir(cached_folder)
+
+    @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners")
+    def test_less_downloads_passed_object_calls(self):
+        # TODO: For some reason this test fails on MPS where no HEAD call is made.
+        if torch_device == "mps":
+            return
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with requests_mock.mock(real_http=True) as m:
+                DiffusionPipeline.download(
+                    "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname
+                )
+
+            download_requests = [r.method for r in m.request_history]
+            # 15 - 2 because no call to config or model file for `safety_checker`
+            assert download_requests.count("HEAD") == 13, "13 calls to files"
+            # 17 - 2 because no call to config or model file for `safety_checker`
+            assert download_requests.count("GET") == 15, "13 calls to files + model_info + model_index.json"
+            assert (
+                len(download_requests) == 28
+            ), "2 calls per file (13 files) + send_telemetry, model_info and model_index.json"
+
+            with requests_mock.mock(real_http=True) as m:
+                DiffusionPipeline.download(
+                    "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname
+                )
+
+            cache_requests = [r.method for r in m.request_history]
+            assert cache_requests.count("HEAD") == 1, "model_index.json is only HEAD"
+            assert cache_requests.count("GET") == 1, "model info is only GET"
+            assert (
+                len(cache_requests) == 2
+            ), "We should call only `model_info` to check for _commit hash and `send_telemetry`"
+
+    def test_download_only_pytorch(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # pipeline has Flax weights
+            tmpdirname = DiffusionPipeline.download(
+                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname
+            )
+
+            all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
+            files = [item for sublist in all_root_files for item in sublist]
+
+            # None of the downloaded files should be a flax file even if we have some here:
+            # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack
+            assert not any(f.endswith(".msgpack") for f in files)
+            # We need to never convert this tiny model to safetensors for this test to pass
+            assert not any(f.endswith(".safetensors") for f in files)
+
+    def test_force_safetensors_error(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # pipeline has Flax weights
+            with self.assertRaises(EnvironmentError):
+                tmpdirname = DiffusionPipeline.download(
+                    "hf-internal-testing/tiny-stable-diffusion-pipe-no-safetensors",
+                    safety_checker=None,
+                    cache_dir=tmpdirname,
+                    use_safetensors=True,
+                )
+
+    def test_download_safetensors(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # pipeline has Flax weights
+            tmpdirname = DiffusionPipeline.download(
+                "hf-internal-testing/tiny-stable-diffusion-pipe-safetensors",
+                safety_checker=None,
+                cache_dir=tmpdirname,
+            )
+
+            all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
+            files = [item for sublist in all_root_files for item in sublist]
+
+            # None of the downloaded files should be a pytorch file even if we have some here:
+            # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack
+            assert not any(f.endswith(".bin") for f in files)
+
+    def test_download_safetensors_index(self):
+        for variant in ["fp16", None]:
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                tmpdirname = DiffusionPipeline.download(
+                    "hf-internal-testing/tiny-stable-diffusion-pipe-indexes",
+                    cache_dir=tmpdirname,
+                    use_safetensors=True,
+                    variant=variant,
+                )
+
+                all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
+                files = [item for sublist in all_root_files for item in sublist]
+
+                # None of the downloaded files should be a safetensors file even if we have some here:
+                # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-indexes/tree/main/text_encoder
+                if variant is None:
+                    assert not any("fp16" in f for f in files)
+                else:
+                    model_files = [f for f in files if "safetensors" in f]
+                    assert all("fp16" in f for f in model_files)
+
+                assert len([f for f in files if ".safetensors" in f]) == 8
+                assert not any(".bin" in f for f in files)
+
+    def test_download_bin_index(self):
+        for variant in ["fp16", None]:
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                tmpdirname = DiffusionPipeline.download(
+                    "hf-internal-testing/tiny-stable-diffusion-pipe-indexes",
+                    cache_dir=tmpdirname,
+                    use_safetensors=False,
+                    variant=variant,
+                )
+
+                all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
+                files = [item for sublist in all_root_files for item in sublist]
+
+                # None of the downloaded files should be a safetensors file even if we have some here:
+                # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-indexes/tree/main/text_encoder
+                if variant is None:
+                    assert not any("fp16" in f for f in files)
+                else:
+                    model_files = [f for f in files if "bin" in f]
+                    assert all("fp16" in f for f in model_files)
+
+                assert len([f for f in files if ".bin" in f]) == 8
+                assert not any(".safetensors" in f for f in files)
+
+    def test_download_no_openvino_by_default(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tmpdirname = DiffusionPipeline.download(
+                "hf-internal-testing/tiny-stable-diffusion-open-vino",
+                cache_dir=tmpdirname,
+            )
+
+            all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
+            files = [item for sublist in all_root_files for item in sublist]
+
+            # make sure that by default no openvino weights are downloaded
+            assert all((f.endswith(".json") or f.endswith(".bin") or f.endswith(".txt")) for f in files)
+            assert not any("openvino_" in f for f in files)
+
+    def test_download_no_onnx_by_default(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tmpdirname = DiffusionPipeline.download(
+                "hf-internal-testing/tiny-stable-diffusion-xl-pipe",
+                cache_dir=tmpdirname,
+                use_safetensors=False,
+            )
+
+            all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
+            files = [item for sublist in all_root_files for item in sublist]
+
+            # make sure that by default no onnx weights are downloaded for non-ONNX pipelines
+            assert all((f.endswith(".json") or f.endswith(".bin") or f.endswith(".txt")) for f in files)
+            assert not any((f.endswith(".onnx") or f.endswith(".pb")) for f in files)
+
+    @require_onnxruntime
+    def test_download_onnx_by_default_for_onnx_pipelines(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tmpdirname = DiffusionPipeline.download(
+                "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline",
+                cache_dir=tmpdirname,
+            )
+
+            all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
+            files = [item for sublist in all_root_files for item in sublist]
+
+            # make sure that by default onnx weights are downloaded for ONNX pipelines
+            assert any((f.endswith(".json") or f.endswith(".bin") or f.endswith(".txt")) for f in files)
+            assert any((f.endswith(".onnx")) for f in files)
+            assert any((f.endswith(".pb")) for f in files)
+
+    def test_download_no_safety_checker(self):
+        prompt = "hello"
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        pipe = pipe.to(torch_device)
+        generator = torch.manual_seed(0)
+        out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="np").images
+
+        pipe_2 = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
+        pipe_2 = pipe_2.to(torch_device)
+        generator = torch.manual_seed(0)
+        out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="np").images
+
+        assert np.max(np.abs(out - out_2)) < 1e-3
+
+    def test_load_no_safety_checker_explicit_locally(self):
+        prompt = "hello"
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        pipe = pipe.to(torch_device)
+        generator = torch.manual_seed(0)
+        out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="np").images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None)
+            pipe_2 = pipe_2.to(torch_device)
+
+            generator = torch.manual_seed(0)
+
+            out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="np").images
+
+        assert np.max(np.abs(out - out_2)) < 1e-3
+
+    def test_load_no_safety_checker_default_locally(self):
+        prompt = "hello"
+        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
+        pipe = pipe.to(torch_device)
+
+        generator = torch.manual_seed(0)
+        out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="np").images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname)
+            pipe_2 = pipe_2.to(torch_device)
+
+            generator = torch.manual_seed(0)
+
+            out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="np").images
+
+        assert np.max(np.abs(out - out_2)) < 1e-3
+
+    def test_cached_files_are_used_when_no_internet(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = {}
+        response_mock.raise_for_status.side_effect = HTTPError
+        response_mock.json.return_value = {}
+
+        # Download this model to make sure it's in the cache.
+        orig_pipe = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        orig_comps = {k: v for k, v in orig_pipe.components.items() if hasattr(v, "parameters")}
+
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("requests.request", return_value=response_mock):
+            # Download this model to make sure it's in the cache.
+            pipe = DiffusionPipeline.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+            )
+            comps = {k: v for k, v in pipe.components.items() if hasattr(v, "parameters")}
+
+        for m1, m2 in zip(orig_comps.values(), comps.values()):
+            for p1, p2 in zip(m1.parameters(), m2.parameters()):
+                if p1.data.ne(p2.data).sum() > 0:
+                    assert False, "Parameters not the same!"
+
+    def test_local_files_only_are_used_when_no_internet(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = {}
+        response_mock.raise_for_status.side_effect = HTTPError
+        response_mock.json.return_value = {}
+
+        # first check that with local files only the pipeline can only be used if cached
+        with self.assertRaises(FileNotFoundError):
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                orig_pipe = DiffusionPipeline.from_pretrained(
+                    "hf-internal-testing/tiny-stable-diffusion-torch", local_files_only=True, cache_dir=tmpdirname
+                )
+
+        # now download
+        orig_pipe = DiffusionPipeline.download("hf-internal-testing/tiny-stable-diffusion-torch")
+
+        # make sure it can be loaded with local_files_only
+        orig_pipe = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", local_files_only=True
+        )
+        orig_comps = {k: v for k, v in orig_pipe.components.items() if hasattr(v, "parameters")}
+
+        # Under the mock environment we get a 500 error when trying to connect to the internet.
+        # Make sure it works local_files_only only works here!
+        with mock.patch("requests.request", return_value=response_mock):
+            # Download this model to make sure it's in the cache.
+            pipe = DiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
+            comps = {k: v for k, v in pipe.components.items() if hasattr(v, "parameters")}
+
+        for m1, m2 in zip(orig_comps.values(), comps.values()):
+            for p1, p2 in zip(m1.parameters(), m2.parameters()):
+                if p1.data.ne(p2.data).sum() > 0:
+                    assert False, "Parameters not the same!"
+
+    def test_download_from_variant_folder(self):
+        for use_safetensors in [False, True]:
+            other_format = ".bin" if use_safetensors else ".safetensors"
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                tmpdirname = StableDiffusionPipeline.download(
+                    "hf-internal-testing/stable-diffusion-all-variants",
+                    cache_dir=tmpdirname,
+                    use_safetensors=use_safetensors,
+                )
+                all_root_files = [t[-1] for t in os.walk(tmpdirname)]
+                files = [item for sublist in all_root_files for item in sublist]
+
+                # None of the downloaded files should be a variant file even if we have some here:
+                # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet
+                assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
+                assert not any(f.endswith(other_format) for f in files)
+                # no variants
+                assert not any(len(f.split(".")) == 3 for f in files)
+
+    def test_download_variant_all(self):
+        for use_safetensors in [False, True]:
+            other_format = ".bin" if use_safetensors else ".safetensors"
+            this_format = ".safetensors" if use_safetensors else ".bin"
+            variant = "fp16"
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                tmpdirname = StableDiffusionPipeline.download(
+                    "hf-internal-testing/stable-diffusion-all-variants",
+                    cache_dir=tmpdirname,
+                    variant=variant,
+                    use_safetensors=use_safetensors,
+                )
+                all_root_files = [t[-1] for t in os.walk(tmpdirname)]
+                files = [item for sublist in all_root_files for item in sublist]
+
+                # None of the downloaded files should be a non-variant file even if we have some here:
+                # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet
+                assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
+                # unet, vae, text_encoder, safety_checker
+                assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 4
+                # all checkpoints should have variant ending
+                assert not any(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files)
+                assert not any(f.endswith(other_format) for f in files)
+
+    def test_download_variant_partly(self):
+        for use_safetensors in [False, True]:
+            other_format = ".bin" if use_safetensors else ".safetensors"
+            this_format = ".safetensors" if use_safetensors else ".bin"
+            variant = "no_ema"
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                tmpdirname = StableDiffusionPipeline.download(
+                    "hf-internal-testing/stable-diffusion-all-variants",
+                    cache_dir=tmpdirname,
+                    variant=variant,
+                    use_safetensors=use_safetensors,
+                )
+                all_root_files = [t[-1] for t in os.walk(tmpdirname)]
+                files = [item for sublist in all_root_files for item in sublist]
+
+                unet_files = os.listdir(os.path.join(tmpdirname, "unet"))
+
+                # Some of the downloaded files should be a non-variant file, check:
+                # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet
+                assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
+                # only unet has "no_ema" variant
+                assert f"diffusion_pytorch_model.{variant}{this_format}" in unet_files
+                assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 1
+                # vae, safety_checker and text_encoder should have no variant
+                assert sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3
+                assert not any(f.endswith(other_format) for f in files)
+
+    def test_download_safetensors_only_variant_exists_for_model(self):
+        variant = None
+        use_safetensors = True
+
+        # text encoder is missing no variant weights, so the following can't work
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with self.assertRaises(OSError) as error_context:
+                tmpdirname = StableDiffusionPipeline.from_pretrained(
+                    "hf-internal-testing/stable-diffusion-broken-variants",
+                    cache_dir=tmpdirname,
+                    variant=variant,
+                    use_safetensors=use_safetensors,
+                )
+            assert "Error no file name" in str(error_context.exception)
+
+        # text encoder has fp16 variants so we can load it
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tmpdirname = StableDiffusionPipeline.download(
+                "hf-internal-testing/stable-diffusion-broken-variants",
+                use_safetensors=use_safetensors,
+                cache_dir=tmpdirname,
+                variant="fp16",
+            )
+            all_root_files = [t[-1] for t in os.walk(tmpdirname)]
+            files = [item for sublist in all_root_files for item in sublist]
+            # None of the downloaded files should be a non-variant file even if we have some here:
+            # https://huggingface.co/hf-internal-testing/stable-diffusion-broken-variants/tree/main/unet
+            assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
+
+    def test_download_bin_only_variant_exists_for_model(self):
+        variant = None
+        use_safetensors = False
+
+        # text encoder is missing Non-variant weights, so the following can't work
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with self.assertRaises(OSError) as error_context:
+                tmpdirname = StableDiffusionPipeline.from_pretrained(
+                    "hf-internal-testing/stable-diffusion-broken-variants",
+                    cache_dir=tmpdirname,
+                    variant=variant,
+                    use_safetensors=use_safetensors,
+                )
+            assert "Error no file name" in str(error_context.exception)
+
+        # text encoder has fp16 variants so we can load it
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tmpdirname = StableDiffusionPipeline.download(
+                "hf-internal-testing/stable-diffusion-broken-variants",
+                use_safetensors=use_safetensors,
+                cache_dir=tmpdirname,
+                variant="fp16",
+            )
+            all_root_files = [t[-1] for t in os.walk(tmpdirname)]
+            files = [item for sublist in all_root_files for item in sublist]
+            # None of the downloaded files should be a non-variant file even if we have some here:
+            # https://huggingface.co/hf-internal-testing/stable-diffusion-broken-variants/tree/main/unet
+            assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
+
+    def test_download_safetensors_variant_does_not_exist_for_model(self):
+        variant = "no_ema"
+        use_safetensors = True
+
+        # text encoder is missing no_ema variant weights, so the following can't work
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with self.assertRaises(OSError) as error_context:
+                tmpdirname = StableDiffusionPipeline.from_pretrained(
+                    "hf-internal-testing/stable-diffusion-broken-variants",
+                    cache_dir=tmpdirname,
+                    variant=variant,
+                    use_safetensors=use_safetensors,
+                )
+
+            assert "Error no file name" in str(error_context.exception)
+
+    def test_download_bin_variant_does_not_exist_for_model(self):
+        variant = "no_ema"
+        use_safetensors = False
+
+        # text encoder is missing no_ema variant weights, so the following can't work
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with self.assertRaises(OSError) as error_context:
+                tmpdirname = StableDiffusionPipeline.from_pretrained(
+                    "hf-internal-testing/stable-diffusion-broken-variants",
+                    cache_dir=tmpdirname,
+                    variant=variant,
+                    use_safetensors=use_safetensors,
+                )
+            assert "Error no file name" in str(error_context.exception)
+
+    def test_local_save_load_index(self):
+        prompt = "hello"
+        for variant in [None, "fp16"]:
+            for use_safe in [True, False]:
+                pipe = StableDiffusionPipeline.from_pretrained(
+                    "hf-internal-testing/tiny-stable-diffusion-pipe-indexes",
+                    variant=variant,
+                    use_safetensors=use_safe,
+                    safety_checker=None,
+                )
+                pipe = pipe.to(torch_device)
+                generator = torch.manual_seed(0)
+                out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="np").images
+
+                with tempfile.TemporaryDirectory() as tmpdirname:
+                    pipe.save_pretrained(tmpdirname)
+                    pipe_2 = StableDiffusionPipeline.from_pretrained(
+                        tmpdirname, safe_serialization=use_safe, variant=variant
+                    )
+                    pipe_2 = pipe_2.to(torch_device)
+
+                generator = torch.manual_seed(0)
+
+                out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="np").images
+
+                assert np.max(np.abs(out - out_2)) < 1e-3
+
+    def test_text_inversion_download(self):
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        pipe = pipe.to(torch_device)
+
+        num_tokens = len(pipe.tokenizer)
+
+        # single token load local
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            ten = {"<*>": torch.ones((32,))}
+            torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin"))
+
+            pipe.load_textual_inversion(tmpdirname)
+
+            token = pipe.tokenizer.convert_tokens_to_ids("<*>")
+            assert token == num_tokens, "Added token must be at spot `num_tokens`"
+            assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 32
+            assert pipe._maybe_convert_prompt("<*>", pipe.tokenizer) == "<*>"
+
+            prompt = "hey <*>"
+            out = pipe(prompt, num_inference_steps=1, output_type="np").images
+            assert out.shape == (1, 128, 128, 3)
+
+        # single token load local with weight name
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            ten = {"<**>": 2 * torch.ones((1, 32))}
+            torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin"))
+
+            pipe.load_textual_inversion(tmpdirname, weight_name="learned_embeds.bin")
+
+            token = pipe.tokenizer.convert_tokens_to_ids("<**>")
+            assert token == num_tokens + 1, "Added token must be at spot `num_tokens`"
+            assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64
+            assert pipe._maybe_convert_prompt("<**>", pipe.tokenizer) == "<**>"
+
+            prompt = "hey <**>"
+            out = pipe(prompt, num_inference_steps=1, output_type="np").images
+            assert out.shape == (1, 128, 128, 3)
+
+        # multi token load
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            ten = {"<***>": torch.cat([3 * torch.ones((1, 32)), 4 * torch.ones((1, 32)), 5 * torch.ones((1, 32))])}
+            torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin"))
+
+            pipe.load_textual_inversion(tmpdirname)
+
+            token = pipe.tokenizer.convert_tokens_to_ids("<***>")
+            token_1 = pipe.tokenizer.convert_tokens_to_ids("<***>_1")
+            token_2 = pipe.tokenizer.convert_tokens_to_ids("<***>_2")
+
+            assert token == num_tokens + 2, "Added token must be at spot `num_tokens`"
+            assert token_1 == num_tokens + 3, "Added token must be at spot `num_tokens`"
+            assert token_2 == num_tokens + 4, "Added token must be at spot `num_tokens`"
+            assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96
+            assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128
+            assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160
+            assert pipe._maybe_convert_prompt("<***>", pipe.tokenizer) == "<***> <***>_1 <***>_2"
+
+            prompt = "hey <***>"
+            out = pipe(prompt, num_inference_steps=1, output_type="np").images
+            assert out.shape == (1, 128, 128, 3)
+
+        # multi token load a1111
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            ten = {
+                "string_to_param": {
+                    "*": torch.cat([3 * torch.ones((1, 32)), 4 * torch.ones((1, 32)), 5 * torch.ones((1, 32))])
+                },
+                "name": "<****>",
+            }
+            torch.save(ten, os.path.join(tmpdirname, "a1111.bin"))
+
+            pipe.load_textual_inversion(tmpdirname, weight_name="a1111.bin")
+
+            token = pipe.tokenizer.convert_tokens_to_ids("<****>")
+            token_1 = pipe.tokenizer.convert_tokens_to_ids("<****>_1")
+            token_2 = pipe.tokenizer.convert_tokens_to_ids("<****>_2")
+
+            assert token == num_tokens + 5, "Added token must be at spot `num_tokens`"
+            assert token_1 == num_tokens + 6, "Added token must be at spot `num_tokens`"
+            assert token_2 == num_tokens + 7, "Added token must be at spot `num_tokens`"
+            assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96
+            assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128
+            assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160
+            assert pipe._maybe_convert_prompt("<****>", pipe.tokenizer) == "<****> <****>_1 <****>_2"
+
+            prompt = "hey <****>"
+            out = pipe(prompt, num_inference_steps=1, output_type="np").images
+            assert out.shape == (1, 128, 128, 3)
+
+        # multi embedding load
+        with tempfile.TemporaryDirectory() as tmpdirname1:
+            with tempfile.TemporaryDirectory() as tmpdirname2:
+                ten = {"<*****>": torch.ones((32,))}
+                torch.save(ten, os.path.join(tmpdirname1, "learned_embeds.bin"))
+
+                ten = {"<******>": 2 * torch.ones((1, 32))}
+                torch.save(ten, os.path.join(tmpdirname2, "learned_embeds.bin"))
+
+                pipe.load_textual_inversion([tmpdirname1, tmpdirname2])
+
+                token = pipe.tokenizer.convert_tokens_to_ids("<*****>")
+                assert token == num_tokens + 8, "Added token must be at spot `num_tokens`"
+                assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 32
+                assert pipe._maybe_convert_prompt("<*****>", pipe.tokenizer) == "<*****>"
+
+                token = pipe.tokenizer.convert_tokens_to_ids("<******>")
+                assert token == num_tokens + 9, "Added token must be at spot `num_tokens`"
+                assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64
+                assert pipe._maybe_convert_prompt("<******>", pipe.tokenizer) == "<******>"
+
+                prompt = "hey <*****> <******>"
+                out = pipe(prompt, num_inference_steps=1, output_type="np").images
+                assert out.shape == (1, 128, 128, 3)
+
+        # single token state dict load
+        ten = {"<x>": torch.ones((32,))}
+        pipe.load_textual_inversion(ten)
+
+        token = pipe.tokenizer.convert_tokens_to_ids("<x>")
+        assert token == num_tokens + 10, "Added token must be at spot `num_tokens`"
+        assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 32
+        assert pipe._maybe_convert_prompt("<x>", pipe.tokenizer) == "<x>"
+
+        prompt = "hey <x>"
+        out = pipe(prompt, num_inference_steps=1, output_type="np").images
+        assert out.shape == (1, 128, 128, 3)
+
+        # multi embedding state dict load
+        ten1 = {"<xxxxx>": torch.ones((32,))}
+        ten2 = {"<xxxxxx>": 2 * torch.ones((1, 32))}
+
+        pipe.load_textual_inversion([ten1, ten2])
+
+        token = pipe.tokenizer.convert_tokens_to_ids("<xxxxx>")
+        assert token == num_tokens + 11, "Added token must be at spot `num_tokens`"
+        assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 32
+        assert pipe._maybe_convert_prompt("<xxxxx>", pipe.tokenizer) == "<xxxxx>"
+
+        token = pipe.tokenizer.convert_tokens_to_ids("<xxxxxx>")
+        assert token == num_tokens + 12, "Added token must be at spot `num_tokens`"
+        assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64
+        assert pipe._maybe_convert_prompt("<xxxxxx>", pipe.tokenizer) == "<xxxxxx>"
+
+        prompt = "hey <xxxxx> <xxxxxx>"
+        out = pipe(prompt, num_inference_steps=1, output_type="np").images
+        assert out.shape == (1, 128, 128, 3)
+
+        # auto1111 multi-token state dict load
+        ten = {
+            "string_to_param": {
+                "*": torch.cat([3 * torch.ones((1, 32)), 4 * torch.ones((1, 32)), 5 * torch.ones((1, 32))])
+            },
+            "name": "<xxxx>",
+        }
+
+        pipe.load_textual_inversion(ten)
+
+        token = pipe.tokenizer.convert_tokens_to_ids("<xxxx>")
+        token_1 = pipe.tokenizer.convert_tokens_to_ids("<xxxx>_1")
+        token_2 = pipe.tokenizer.convert_tokens_to_ids("<xxxx>_2")
+
+        assert token == num_tokens + 13, "Added token must be at spot `num_tokens`"
+        assert token_1 == num_tokens + 14, "Added token must be at spot `num_tokens`"
+        assert token_2 == num_tokens + 15, "Added token must be at spot `num_tokens`"
+        assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96
+        assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128
+        assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160
+        assert pipe._maybe_convert_prompt("<xxxx>", pipe.tokenizer) == "<xxxx> <xxxx>_1 <xxxx>_2"
+
+        prompt = "hey <xxxx>"
+        out = pipe(prompt, num_inference_steps=1, output_type="np").images
+        assert out.shape == (1, 128, 128, 3)
+
+        # multiple references to multi embedding
+        ten = {"<cat>": torch.ones(3, 32)}
+        pipe.load_textual_inversion(ten)
+
+        assert (
+            pipe._maybe_convert_prompt("<cat> <cat>", pipe.tokenizer) == "<cat> <cat>_1 <cat>_2 <cat> <cat>_1 <cat>_2"
+        )
+
+        prompt = "hey <cat> <cat>"
+        out = pipe(prompt, num_inference_steps=1, output_type="np").images
+        assert out.shape == (1, 128, 128, 3)
+
+    def test_text_inversion_multi_tokens(self):
+        pipe1 = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        pipe1 = pipe1.to(torch_device)
+
+        token1, token2 = "<*>", "<**>"
+        ten1 = torch.ones((32,))
+        ten2 = torch.ones((32,)) * 2
+
+        num_tokens = len(pipe1.tokenizer)
+
+        pipe1.load_textual_inversion(ten1, token=token1)
+        pipe1.load_textual_inversion(ten2, token=token2)
+        emb1 = pipe1.text_encoder.get_input_embeddings().weight
+
+        pipe2 = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        pipe2 = pipe2.to(torch_device)
+        pipe2.load_textual_inversion([ten1, ten2], token=[token1, token2])
+        emb2 = pipe2.text_encoder.get_input_embeddings().weight
+
+        pipe3 = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        pipe3 = pipe3.to(torch_device)
+        pipe3.load_textual_inversion(torch.stack([ten1, ten2], dim=0), token=[token1, token2])
+        emb3 = pipe3.text_encoder.get_input_embeddings().weight
+
+        assert len(pipe1.tokenizer) == len(pipe2.tokenizer) == len(pipe3.tokenizer) == num_tokens + 2
+        assert (
+            pipe1.tokenizer.convert_tokens_to_ids(token1)
+            == pipe2.tokenizer.convert_tokens_to_ids(token1)
+            == pipe3.tokenizer.convert_tokens_to_ids(token1)
+            == num_tokens
+        )
+        assert (
+            pipe1.tokenizer.convert_tokens_to_ids(token2)
+            == pipe2.tokenizer.convert_tokens_to_ids(token2)
+            == pipe3.tokenizer.convert_tokens_to_ids(token2)
+            == num_tokens + 1
+        )
+        assert emb1[num_tokens].sum().item() == emb2[num_tokens].sum().item() == emb3[num_tokens].sum().item()
+        assert (
+            emb1[num_tokens + 1].sum().item() == emb2[num_tokens + 1].sum().item() == emb3[num_tokens + 1].sum().item()
+        )
+
+    def test_download_ignore_files(self):
+        # Check https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files/blob/72f58636e5508a218c6b3f60550dc96445547817/model_index.json#L4
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # pipeline has Flax weights
+            tmpdirname = DiffusionPipeline.download("hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files")
+            all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
+            files = [item for sublist in all_root_files for item in sublist]
+
+            # None of the downloaded files should be a pytorch file even if we have some here:
+            # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack
+            assert not any(f in ["vae/diffusion_pytorch_model.bin", "text_encoder/config.json"] for f in files)
+            assert len(files) == 14
+
+    def test_get_pipeline_class_from_flax(self):
+        flax_config = {"_class_name": "FlaxStableDiffusionPipeline"}
+        config = {"_class_name": "StableDiffusionPipeline"}
+
+        # when loading a PyTorch Pipeline from a FlaxPipeline `model_index.json`, e.g.: https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-lms-pipe/blob/7a9063578b325779f0f1967874a6771caa973cad/model_index.json#L2
+        # we need to make sure that we don't load the Flax Pipeline class, but instead the PyTorch pipeline class
+        assert _get_pipeline_class(DiffusionPipeline, flax_config) == _get_pipeline_class(DiffusionPipeline, config)
+
+
+class CustomPipelineTests(unittest.TestCase):
+    def test_load_custom_pipeline(self):
+        pipeline = DiffusionPipeline.from_pretrained(
+            "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline"
+        )
+        pipeline = pipeline.to(torch_device)
+        # NOTE that `"CustomPipeline"` is not a class that is defined in this library, but solely on the Hub
+        # under https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py#L24
+        assert pipeline.__class__.__name__ == "CustomPipeline"
+
+    def test_load_custom_github(self):
+        pipeline = DiffusionPipeline.from_pretrained(
+            "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", custom_revision="main"
+        )
+
+        # make sure that on "main" pipeline gives only ones because of: https://github.com/huggingface/diffusers/pull/1690
+        with torch.no_grad():
+            output = pipeline()
+
+        assert output.numel() == output.sum()
+
+        # hack since Python doesn't like overwriting modules: https://stackoverflow.com/questions/3105801/unload-a-module-in-python
+        # Could in the future work with hashes instead.
+        del sys.modules["diffusers_modules.git.one_step_unet"]
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", custom_revision="0.10.2"
+        )
+        with torch.no_grad():
+            output = pipeline()
+
+        assert output.numel() != output.sum()
+
+        assert pipeline.__class__.__name__ == "UnetSchedulerOneForwardPipeline"
+
+    def test_run_custom_pipeline(self):
+        pipeline = DiffusionPipeline.from_pretrained(
+            "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline"
+        )
+        pipeline = pipeline.to(torch_device)
+        images, output_str = pipeline(num_inference_steps=2, output_type="np")
+
+        assert images[0].shape == (1, 32, 32, 3)
+
+        # compare output to https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py#L102
+        assert output_str == "This is a test"
+
+    def test_remote_components(self):
+        # make sure that trust remote code has to be passed
+        with self.assertRaises(ValueError):
+            pipeline = DiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sdxl-custom-components")
+
+        # Check that only loading custom components "my_unet", "my_scheduler" works
+        pipeline = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-sdxl-custom-components", trust_remote_code=True
+        )
+
+        assert pipeline.config.unet == ("diffusers_modules.local.my_unet_model", "MyUNetModel")
+        assert pipeline.config.scheduler == ("diffusers_modules.local.my_scheduler", "MyScheduler")
+        assert pipeline.__class__.__name__ == "StableDiffusionXLPipeline"
+
+        pipeline = pipeline.to(torch_device)
+        images = pipeline("test", num_inference_steps=2, output_type="np")[0]
+
+        assert images.shape == (1, 64, 64, 3)
+
+        # Check that only loading custom components "my_unet", "my_scheduler" and explicit custom pipeline works
+        pipeline = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-sdxl-custom-components", custom_pipeline="my_pipeline", trust_remote_code=True
+        )
+
+        assert pipeline.config.unet == ("diffusers_modules.local.my_unet_model", "MyUNetModel")
+        assert pipeline.config.scheduler == ("diffusers_modules.local.my_scheduler", "MyScheduler")
+        assert pipeline.__class__.__name__ == "MyPipeline"
+
+        pipeline = pipeline.to(torch_device)
+        images = pipeline("test", num_inference_steps=2, output_type="np")[0]
+
+        assert images.shape == (1, 64, 64, 3)
+
+    def test_remote_auto_custom_pipe(self):
+        # make sure that trust remote code has to be passed
+        with self.assertRaises(ValueError):
+            pipeline = DiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sdxl-custom-all")
+
+        # Check that only loading custom components "my_unet", "my_scheduler" and auto custom pipeline works
+        pipeline = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-sdxl-custom-all", trust_remote_code=True
+        )
+
+        assert pipeline.config.unet == ("diffusers_modules.local.my_unet_model", "MyUNetModel")
+        assert pipeline.config.scheduler == ("diffusers_modules.local.my_scheduler", "MyScheduler")
+        assert pipeline.__class__.__name__ == "MyPipeline"
+
+        pipeline = pipeline.to(torch_device)
+        images = pipeline("test", num_inference_steps=2, output_type="np")[0]
+
+        assert images.shape == (1, 64, 64, 3)
+
+    def test_local_custom_pipeline_repo(self):
+        local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline")
+        pipeline = DiffusionPipeline.from_pretrained(
+            "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path
+        )
+        pipeline = pipeline.to(torch_device)
+        images, output_str = pipeline(num_inference_steps=2, output_type="np")
+
+        assert pipeline.__class__.__name__ == "CustomLocalPipeline"
+        assert images[0].shape == (1, 32, 32, 3)
+        # compare to https://github.com/huggingface/diffusers/blob/main/tests/fixtures/custom_pipeline/pipeline.py#L102
+        assert output_str == "This is a local test"
+
+    def test_local_custom_pipeline_file(self):
+        local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline")
+        local_custom_pipeline_path = os.path.join(local_custom_pipeline_path, "what_ever.py")
+        pipeline = DiffusionPipeline.from_pretrained(
+            "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path
+        )
+        pipeline = pipeline.to(torch_device)
+        images, output_str = pipeline(num_inference_steps=2, output_type="np")
+
+        assert pipeline.__class__.__name__ == "CustomLocalPipeline"
+        assert images[0].shape == (1, 32, 32, 3)
+        # compare to https://github.com/huggingface/diffusers/blob/main/tests/fixtures/custom_pipeline/pipeline.py#L102
+        assert output_str == "This is a local test"
+
+    def test_custom_model_and_pipeline(self):
+        pipe = CustomPipeline(
+            encoder=CustomEncoder(),
+            scheduler=DDIMScheduler(),
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname, safe_serialization=False)
+
+            pipe_new = CustomPipeline.from_pretrained(tmpdirname)
+            pipe_new.save_pretrained(tmpdirname)
+
+        conf_1 = dict(pipe.config)
+        conf_2 = dict(pipe_new.config)
+
+        del conf_2["_name_or_path"]
+
+        assert conf_1 == conf_2
+
+    @slow
+    @require_torch_gpu
+    def test_download_from_git(self):
+        # Because adaptive_avg_pool2d_backward_cuda
+        # does not have a deterministic implementation.
+        clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
+
+        feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
+        clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16)
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            custom_pipeline="clip_guided_stable_diffusion",
+            clip_model=clip_model,
+            feature_extractor=feature_extractor,
+            torch_dtype=torch.float16,
+        )
+        pipeline.enable_attention_slicing()
+        pipeline = pipeline.to(torch_device)
+
+        # NOTE that `"CLIPGuidedStableDiffusion"` is not a class that is defined in the pypi package of th e library, but solely on the community examples folder of GitHub under:
+        # https://github.com/huggingface/diffusers/blob/main/examples/community/clip_guided_stable_diffusion.py
+        assert pipeline.__class__.__name__ == "CLIPGuidedStableDiffusion"
+
+        image = pipeline("a prompt", num_inference_steps=2, output_type="np").images[0]
+        assert image.shape == (512, 512, 3)
+
+    def test_save_pipeline_change_config(self):
+        pipe = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = DiffusionPipeline.from_pretrained(tmpdirname)
+
+            assert pipe.scheduler.__class__.__name__ == "PNDMScheduler"
+
+        # let's make sure that changing the scheduler is correctly reflected
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+            pipe.save_pretrained(tmpdirname)
+            pipe = DiffusionPipeline.from_pretrained(tmpdirname)
+
+            assert pipe.scheduler.__class__.__name__ == "DPMSolverMultistepScheduler"
+
+
+class PipelineFastTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    def dummy_uncond_unet(self, sample_size=32):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=sample_size,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    def dummy_cond_unet(self, sample_size=32):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=sample_size,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    @parameterized.expand(
+        [
+            [DDIMScheduler, DDIMPipeline, 32],
+            [DDPMScheduler, DDPMPipeline, 32],
+            [DDIMScheduler, DDIMPipeline, (32, 64)],
+            [DDPMScheduler, DDPMPipeline, (64, 32)],
+        ]
+    )
+    def test_uncond_unet_components(self, scheduler_fn=DDPMScheduler, pipeline_fn=DDPMPipeline, sample_size=32):
+        unet = self.dummy_uncond_unet(sample_size)
+        scheduler = scheduler_fn()
+        pipeline = pipeline_fn(unet, scheduler).to(torch_device)
+
+        generator = torch.manual_seed(0)
+        out_image = pipeline(
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+        sample_size = (sample_size, sample_size) if isinstance(sample_size, int) else sample_size
+        assert out_image.shape == (1, *sample_size, 3)
+
+    def test_stable_diffusion_components(self):
+        """Test that components property works correctly"""
+        unet = self.dummy_cond_unet()
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image().cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32))
+
+        # make sure here that pndm scheduler skips prk
+        inpaint = StableDiffusionInpaintPipelineLegacy(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        ).to(torch_device)
+        img2img = StableDiffusionImg2ImgPipeline(**inpaint.components, image_encoder=None).to(torch_device)
+        text2img = StableDiffusionPipeline(**inpaint.components, image_encoder=None).to(torch_device)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        generator = torch.manual_seed(0)
+        image_inpaint = inpaint(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+            mask_image=mask_image,
+        ).images
+        image_img2img = img2img(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+        ).images
+        image_text2img = text2img(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+
+        assert image_inpaint.shape == (1, 32, 32, 3)
+        assert image_img2img.shape == (1, 32, 32, 3)
+        assert image_text2img.shape == (1, 64, 64, 3)
+
+    @require_torch_gpu
+    def test_pipe_false_offload_warn(self):
+        unet = self.dummy_cond_unet()
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+        sd.enable_model_cpu_offload()
+
+        logger = logging.get_logger("diffusers.pipelines.pipeline_utils")
+        with CaptureLogger(logger) as cap_logger:
+            sd.to("cuda")
+
+        assert "It is strongly recommended against doing so" in str(cap_logger)
+
+        sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+    def test_set_scheduler(self):
+        unet = self.dummy_cond_unet()
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+        sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, DDIMScheduler)
+        sd.scheduler = DDPMScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, DDPMScheduler)
+        sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, PNDMScheduler)
+        sd.scheduler = LMSDiscreteScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, LMSDiscreteScheduler)
+        sd.scheduler = EulerDiscreteScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, EulerDiscreteScheduler)
+        sd.scheduler = EulerAncestralDiscreteScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, EulerAncestralDiscreteScheduler)
+        sd.scheduler = DPMSolverMultistepScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, DPMSolverMultistepScheduler)
+
+    def test_set_component_to_none(self):
+        unet = self.dummy_cond_unet()
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        pipeline = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        prompt = "This is a flower"
+
+        out_image = pipeline(
+            prompt=prompt,
+            generator=generator,
+            num_inference_steps=1,
+            output_type="np",
+        ).images
+
+        pipeline.feature_extractor = None
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        out_image_2 = pipeline(
+            prompt=prompt,
+            generator=generator,
+            num_inference_steps=1,
+            output_type="np",
+        ).images
+
+        assert out_image.shape == (1, 64, 64, 3)
+        assert np.abs(out_image - out_image_2).max() < 1e-3
+
+    def test_optional_components_is_none(self):
+        unet = self.dummy_cond_unet()
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        items = {
+            "feature_extractor": self.dummy_extractor,
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": bert,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            # we don't add an image encoder
+        }
+
+        pipeline = StableDiffusionPipeline(**items)
+
+        assert sorted(pipeline.components.keys()) == sorted(["image_encoder"] + list(items.keys()))
+        assert pipeline.image_encoder is None
+
+    def test_set_scheduler_consistency(self):
+        unet = self.dummy_cond_unet()
+        pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
+        ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=pndm,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+        pndm_config = sd.scheduler.config
+        sd.scheduler = DDPMScheduler.from_config(pndm_config)
+        sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config)
+        pndm_config_2 = sd.scheduler.config
+        pndm_config_2 = {k: v for k, v in pndm_config_2.items() if k in pndm_config}
+
+        assert dict(pndm_config) == dict(pndm_config_2)
+
+        sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=ddim,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+        ddim_config = sd.scheduler.config
+        sd.scheduler = LMSDiscreteScheduler.from_config(ddim_config)
+        sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config)
+        ddim_config_2 = sd.scheduler.config
+        ddim_config_2 = {k: v for k, v in ddim_config_2.items() if k in ddim_config}
+
+        assert dict(ddim_config) == dict(ddim_config_2)
+
+    def test_save_safe_serialization(self):
+        pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipeline.save_pretrained(tmpdirname, safe_serialization=True)
+
+            # Validate that the VAE safetensor exists and are of the correct format
+            vae_path = os.path.join(tmpdirname, "vae", "diffusion_pytorch_model.safetensors")
+            assert os.path.exists(vae_path), f"Could not find {vae_path}"
+            _ = safetensors.torch.load_file(vae_path)
+
+            # Validate that the UNet safetensor exists and are of the correct format
+            unet_path = os.path.join(tmpdirname, "unet", "diffusion_pytorch_model.safetensors")
+            assert os.path.exists(unet_path), f"Could not find {unet_path}"
+            _ = safetensors.torch.load_file(unet_path)
+
+            # Validate that the text encoder safetensor exists and are of the correct format
+            text_encoder_path = os.path.join(tmpdirname, "text_encoder", "model.safetensors")
+            assert os.path.exists(text_encoder_path), f"Could not find {text_encoder_path}"
+            _ = safetensors.torch.load_file(text_encoder_path)
+
+            pipeline = StableDiffusionPipeline.from_pretrained(tmpdirname)
+            assert pipeline.unet is not None
+            assert pipeline.vae is not None
+            assert pipeline.text_encoder is not None
+            assert pipeline.scheduler is not None
+            assert pipeline.feature_extractor is not None
+
+    def test_no_pytorch_download_when_doing_safetensors(self):
+        # by default we don't download
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            _ = StableDiffusionPipeline.from_pretrained(
+                "hf-internal-testing/diffusers-stable-diffusion-tiny-all", cache_dir=tmpdirname
+            )
+
+            path = os.path.join(
+                tmpdirname,
+                "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all",
+                "snapshots",
+                "07838d72e12f9bcec1375b0482b80c1d399be843",
+                "unet",
+            )
+            # safetensors exists
+            assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors"))
+            # pytorch does not
+            assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin"))
+
+    def test_no_safetensors_download_when_doing_pytorch(self):
+        use_safetensors = False
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            _ = StableDiffusionPipeline.from_pretrained(
+                "hf-internal-testing/diffusers-stable-diffusion-tiny-all",
+                cache_dir=tmpdirname,
+                use_safetensors=use_safetensors,
+            )
+
+            path = os.path.join(
+                tmpdirname,
+                "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all",
+                "snapshots",
+                "07838d72e12f9bcec1375b0482b80c1d399be843",
+                "unet",
+            )
+            # safetensors does not exists
+            assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors"))
+            # pytorch does
+            assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin"))
+
+    def test_optional_components(self):
+        unet = self.dummy_cond_unet()
+        pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        orig_sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=pndm,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=unet,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd = orig_sd
+
+        assert sd.config.requires_safety_checker is True
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd.save_pretrained(tmpdirname)
+
+            # Test that passing None works
+            sd = StableDiffusionPipeline.from_pretrained(
+                tmpdirname, feature_extractor=None, safety_checker=None, requires_safety_checker=False
+            )
+
+            assert sd.config.requires_safety_checker is False
+            assert sd.config.safety_checker == (None, None)
+            assert sd.config.feature_extractor == (None, None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd.save_pretrained(tmpdirname)
+
+            # Test that loading previous None works
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+            assert sd.config.requires_safety_checker is False
+            assert sd.config.safety_checker == (None, None)
+            assert sd.config.feature_extractor == (None, None)
+
+            orig_sd.save_pretrained(tmpdirname)
+
+            # Test that loading without any directory works
+            shutil.rmtree(os.path.join(tmpdirname, "safety_checker"))
+            with open(os.path.join(tmpdirname, sd.config_name)) as f:
+                config = json.load(f)
+                config["safety_checker"] = [None, None]
+            with open(os.path.join(tmpdirname, sd.config_name), "w") as f:
+                json.dump(config, f)
+
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, requires_safety_checker=False)
+            sd.save_pretrained(tmpdirname)
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+            assert sd.config.requires_safety_checker is False
+            assert sd.config.safety_checker == (None, None)
+            assert sd.config.feature_extractor == (None, None)
+
+            # Test that loading from deleted model index works
+            with open(os.path.join(tmpdirname, sd.config_name)) as f:
+                config = json.load(f)
+                del config["safety_checker"]
+                del config["feature_extractor"]
+            with open(os.path.join(tmpdirname, sd.config_name), "w") as f:
+                json.dump(config, f)
+
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+            assert sd.config.requires_safety_checker is False
+            assert sd.config.safety_checker == (None, None)
+            assert sd.config.feature_extractor == (None, None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd.save_pretrained(tmpdirname)
+
+            # Test that partially loading works
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor)
+
+            assert sd.config.requires_safety_checker is False
+            assert sd.config.safety_checker == (None, None)
+            assert sd.config.feature_extractor != (None, None)
+
+            # Test that partially loading works
+            sd = StableDiffusionPipeline.from_pretrained(
+                tmpdirname,
+                feature_extractor=self.dummy_extractor,
+                safety_checker=unet,
+                requires_safety_checker=[True, True],
+            )
+
+            assert sd.config.requires_safety_checker == [True, True]
+            assert sd.config.safety_checker != (None, None)
+            assert sd.config.feature_extractor != (None, None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd.save_pretrained(tmpdirname)
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor)
+
+            assert sd.config.requires_safety_checker == [True, True]
+            assert sd.config.safety_checker != (None, None)
+            assert sd.config.feature_extractor != (None, None)
+
+    def test_name_or_path(self):
+        model_path = "hf-internal-testing/tiny-stable-diffusion-torch"
+        sd = DiffusionPipeline.from_pretrained(model_path)
+
+        assert sd.name_or_path == model_path
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd.save_pretrained(tmpdirname)
+            sd = DiffusionPipeline.from_pretrained(tmpdirname)
+
+            assert sd.name_or_path == tmpdirname
+
+    def test_error_no_variant_available(self):
+        variant = "fp16"
+        with self.assertRaises(ValueError) as error_context:
+            _ = StableDiffusionPipeline.download(
+                "hf-internal-testing/diffusers-stable-diffusion-tiny-all", variant=variant
+            )
+
+        assert "but no such modeling files are available" in str(error_context.exception)
+        assert variant in str(error_context.exception)
+
+    def test_pipe_to(self):
+        unet = self.dummy_cond_unet()
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+        device_type = torch.device(torch_device).type
+
+        sd1 = sd.to(device_type)
+        sd2 = sd.to(torch.device(device_type))
+        sd3 = sd.to(device_type, torch.float32)
+        sd4 = sd.to(device=device_type)
+        sd5 = sd.to(torch_device=device_type)
+        sd6 = sd.to(device_type, dtype=torch.float32)
+        sd7 = sd.to(device_type, torch_dtype=torch.float32)
+
+        assert sd1.device.type == device_type
+        assert sd2.device.type == device_type
+        assert sd3.device.type == device_type
+        assert sd4.device.type == device_type
+        assert sd5.device.type == device_type
+        assert sd6.device.type == device_type
+        assert sd7.device.type == device_type
+
+        sd1 = sd.to(torch.float16)
+        sd2 = sd.to(None, torch.float16)
+        sd3 = sd.to(dtype=torch.float16)
+        sd4 = sd.to(dtype=torch.float16)
+        sd5 = sd.to(None, dtype=torch.float16)
+        sd6 = sd.to(None, torch_dtype=torch.float16)
+
+        assert sd1.dtype == torch.float16
+        assert sd2.dtype == torch.float16
+        assert sd3.dtype == torch.float16
+        assert sd4.dtype == torch.float16
+        assert sd5.dtype == torch.float16
+        assert sd6.dtype == torch.float16
+
+        sd1 = sd.to(device=device_type, dtype=torch.float16)
+        sd2 = sd.to(torch_device=device_type, torch_dtype=torch.float16)
+        sd3 = sd.to(device_type, torch.float16)
+
+        assert sd1.dtype == torch.float16
+        assert sd2.dtype == torch.float16
+        assert sd3.dtype == torch.float16
+
+        assert sd1.device.type == device_type
+        assert sd2.device.type == device_type
+        assert sd3.device.type == device_type
+
+    def test_pipe_same_device_id_offload(self):
+        unet = self.dummy_cond_unet()
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+        sd.enable_model_cpu_offload(gpu_id=5)
+        assert sd._offload_gpu_id == 5
+        sd.maybe_free_model_hooks()
+        assert sd._offload_gpu_id == 5
+
+
+@slow
+@require_torch_gpu
+class PipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_smart_download(self):
+        model_id = "hf-internal-testing/unet-pipeline-dummy"
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            _ = DiffusionPipeline.from_pretrained(model_id, cache_dir=tmpdirname, force_download=True)
+            local_repo_name = "--".join(["models"] + model_id.split("/"))
+            snapshot_dir = os.path.join(tmpdirname, local_repo_name, "snapshots")
+            snapshot_dir = os.path.join(snapshot_dir, os.listdir(snapshot_dir)[0])
+
+            # inspect all downloaded files to make sure that everything is included
+            assert os.path.isfile(os.path.join(snapshot_dir, DiffusionPipeline.config_name))
+            assert os.path.isfile(os.path.join(snapshot_dir, CONFIG_NAME))
+            assert os.path.isfile(os.path.join(snapshot_dir, SCHEDULER_CONFIG_NAME))
+            assert os.path.isfile(os.path.join(snapshot_dir, WEIGHTS_NAME))
+            assert os.path.isfile(os.path.join(snapshot_dir, "scheduler", SCHEDULER_CONFIG_NAME))
+            assert os.path.isfile(os.path.join(snapshot_dir, "unet", WEIGHTS_NAME))
+            assert os.path.isfile(os.path.join(snapshot_dir, "unet", WEIGHTS_NAME))
+            # let's make sure the super large numpy file:
+            # https://huggingface.co/hf-internal-testing/unet-pipeline-dummy/blob/main/big_array.npy
+            # is not downloaded, but all the expected ones
+            assert not os.path.isfile(os.path.join(snapshot_dir, "big_array.npy"))
+
+    def test_warning_unused_kwargs(self):
+        model_id = "hf-internal-testing/unet-pipeline-dummy"
+        logger = logging.get_logger("diffusers.pipelines")
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with CaptureLogger(logger) as cap_logger:
+                DiffusionPipeline.from_pretrained(
+                    model_id,
+                    not_used=True,
+                    cache_dir=tmpdirname,
+                    force_download=True,
+                )
+
+        assert (
+            cap_logger.out.strip().split("\n")[-1]
+            == "Keyword arguments {'not_used': True} are not expected by DDPMPipeline and will be ignored."
+        )
+
+    def test_from_save_pretrained(self):
+        # 1. Load models
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        scheduler = DDPMScheduler(num_train_timesteps=10)
+
+        ddpm = DDPMPipeline(model, scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            ddpm.save_pretrained(tmpdirname)
+            new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)
+            new_ddpm.to(torch_device)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = ddpm(generator=generator, num_inference_steps=5, output_type="np").images
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="np").images
+
+        assert np.abs(image - new_image).max() < 1e-5, "Models don't give the same forward pass"
+
+    @is_torch_compile
+    @require_torch_2
+    @unittest.skipIf(
+        get_python_version == (3, 12),
+        reason="Torch Dynamo isn't yet supported for Python 3.12.",
+    )
+    def test_from_save_pretrained_dynamo(self):
+        run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)
+
+    def test_from_pretrained_hub(self):
+        model_path = "google/ddpm-cifar10-32"
+
+        scheduler = DDPMScheduler(num_train_timesteps=10)
+
+        ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler)
+        ddpm = ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler)
+        ddpm_from_hub = ddpm_from_hub.to(torch_device)
+        ddpm_from_hub.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = ddpm(generator=generator, num_inference_steps=5, output_type="np").images
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="np").images
+
+        assert np.abs(image - new_image).max() < 1e-5, "Models don't give the same forward pass"
+
+    def test_from_pretrained_hub_pass_model(self):
+        model_path = "google/ddpm-cifar10-32"
+
+        scheduler = DDPMScheduler(num_train_timesteps=10)
+
+        # pass unet into DiffusionPipeline
+        unet = UNet2DModel.from_pretrained(model_path)
+        ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(model_path, unet=unet, scheduler=scheduler)
+        ddpm_from_hub_custom_model = ddpm_from_hub_custom_model.to(torch_device)
+        ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
+
+        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler)
+        ddpm_from_hub = ddpm_from_hub.to(torch_device)
+        ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = ddpm_from_hub_custom_model(generator=generator, num_inference_steps=5, output_type="np").images
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="np").images
+
+        assert np.abs(image - new_image).max() < 1e-5, "Models don't give the same forward pass"
+
+    def test_output_format(self):
+        model_path = "google/ddpm-cifar10-32"
+
+        scheduler = DDIMScheduler.from_pretrained(model_path)
+        pipe = DDIMPipeline.from_pretrained(model_path, scheduler=scheduler)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        images = pipe(output_type="np").images
+        assert images.shape == (1, 32, 32, 3)
+        assert isinstance(images, np.ndarray)
+
+        images = pipe(output_type="pil", num_inference_steps=4).images
+        assert isinstance(images, list)
+        assert len(images) == 1
+        assert isinstance(images[0], PIL.Image.Image)
+
+        # use PIL by default
+        images = pipe(num_inference_steps=4).images
+        assert isinstance(images, list)
+        assert isinstance(images[0], PIL.Image.Image)
+
+    @require_flax
+    def test_from_flax_from_pt(self):
+        pipe_pt = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        pipe_pt.to(torch_device)
+
+        from diffusers import FlaxStableDiffusionPipeline
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe_pt.save_pretrained(tmpdirname)
+
+            pipe_flax, params = FlaxStableDiffusionPipeline.from_pretrained(
+                tmpdirname, safety_checker=None, from_pt=True
+            )
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe_flax.save_pretrained(tmpdirname, params=params)
+            pipe_pt_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None, from_flax=True)
+            pipe_pt_2.to(torch_device)
+
+        prompt = "Hello"
+
+        generator = torch.manual_seed(0)
+        image_0 = pipe_pt(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        ).images[0]
+
+        generator = torch.manual_seed(0)
+        image_1 = pipe_pt_2(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        ).images[0]
+
+        assert np.abs(image_0 - image_1).sum() < 1e-5, "Models don't give the same forward pass"
+
+    @require_compel
+    def test_weighted_prompts_compel(self):
+        from compel import Compel
+
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        pipe.enable_model_cpu_offload()
+        pipe.enable_attention_slicing()
+
+        compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
+
+        prompt = "a red cat playing with a ball{}"
+
+        prompts = [prompt.format(s) for s in ["", "++", "--"]]
+
+        prompt_embeds = compel(prompts)
+
+        generator = [torch.Generator(device="cpu").manual_seed(33) for _ in range(prompt_embeds.shape[0])]
+
+        images = pipe(
+            prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20, output_type="np"
+        ).images
+
+        for i, image in enumerate(images):
+            expected_image = load_numpy(
+                "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+                f"/compel/forest_{i}.npy"
+            )
+
+            assert np.abs(image - expected_image).max() < 3e-1
+
+
+@nightly
+@require_torch_gpu
+class PipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_ddpm_ddim_equality_batched(self):
+        seed = 0
+        model_id = "google/ddpm-cifar10-32"
+
+        unet = UNet2DModel.from_pretrained(model_id)
+        ddpm_scheduler = DDPMScheduler()
+        ddim_scheduler = DDIMScheduler()
+
+        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
+        ddim.to(torch_device)
+        ddim.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        ddpm_images = ddpm(batch_size=2, generator=generator, output_type="np").images
+
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        ddim_images = ddim(
+            batch_size=2,
+            generator=generator,
+            num_inference_steps=1000,
+            eta=1.0,
+            output_type="np",
+            use_clipped_model_output=True,  # Need this to make DDIM match DDPM
+        ).images
+
+        # the values aren't exactly equal, but the images look the same visually
+        assert np.abs(ddpm_images - ddim_images).max() < 1e-1
diff --git a/diffusers/tests/pipelines/test_pipelines_auto.py b/diffusers/tests/pipelines/test_pipelines_auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..561a9011c6ae9a988188afe2cbeb96642e19383b
--- /dev/null
+++ b/diffusers/tests/pipelines/test_pipelines_auto.py
@@ -0,0 +1,574 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+import shutil
+import unittest
+from collections import OrderedDict
+from pathlib import Path
+
+import torch
+from transformers import CLIPVisionConfig, CLIPVisionModelWithProjection
+
+from diffusers import (
+    AutoPipelineForImage2Image,
+    AutoPipelineForInpainting,
+    AutoPipelineForText2Image,
+    ControlNetModel,
+    DiffusionPipeline,
+)
+from diffusers.pipelines.auto_pipeline import (
+    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
+    AUTO_INPAINT_PIPELINES_MAPPING,
+    AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
+)
+from diffusers.utils.testing_utils import slow
+
+
+PRETRAINED_MODEL_REPO_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", "stable-diffusion-v1-5/stable-diffusion-v1-5"),
+        ("if", "DeepFloyd/IF-I-XL-v1.0"),
+        ("kandinsky", "kandinsky-community/kandinsky-2-1"),
+        ("kandinsky22", "kandinsky-community/kandinsky-2-2-decoder"),
+    ]
+)
+
+
+class AutoPipelineFastTest(unittest.TestCase):
+    @property
+    def dummy_image_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=1,
+            projection_dim=1,
+            num_hidden_layers=1,
+            num_attention_heads=1,
+            image_size=1,
+            intermediate_size=1,
+            patch_size=1,
+        )
+        return CLIPVisionModelWithProjection(config)
+
+    def test_from_pipe_consistent(self):
+        pipe = AutoPipelineForText2Image.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-pipe", requires_safety_checker=False
+        )
+        original_config = dict(pipe.config)
+
+        pipe = AutoPipelineForImage2Image.from_pipe(pipe)
+        assert dict(pipe.config) == original_config
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe)
+        assert dict(pipe.config) == original_config
+
+    def test_from_pipe_override(self):
+        pipe = AutoPipelineForText2Image.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-pipe", requires_safety_checker=False
+        )
+
+        pipe = AutoPipelineForImage2Image.from_pipe(pipe, requires_safety_checker=True)
+        assert pipe.config.requires_safety_checker is True
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe, requires_safety_checker=True)
+        assert pipe.config.requires_safety_checker is True
+
+    def test_from_pipe_consistent_sdxl(self):
+        pipe = AutoPipelineForImage2Image.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-xl-pipe",
+            requires_aesthetics_score=True,
+            force_zeros_for_empty_prompt=False,
+        )
+
+        original_config = dict(pipe.config)
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe)
+        pipe = AutoPipelineForImage2Image.from_pipe(pipe)
+
+        assert dict(pipe.config) == original_config
+
+    def test_kwargs_local_files_only(self):
+        repo = "hf-internal-testing/tiny-stable-diffusion-torch"
+        tmpdirname = DiffusionPipeline.download(repo)
+        tmpdirname = Path(tmpdirname)
+
+        # edit commit_id to so that it's not the latest commit
+        commit_id = tmpdirname.name
+        new_commit_id = commit_id + "hug"
+
+        ref_dir = tmpdirname.parent.parent / "refs/main"
+        with open(ref_dir, "w") as f:
+            f.write(new_commit_id)
+
+        new_tmpdirname = tmpdirname.parent / new_commit_id
+        os.rename(tmpdirname, new_tmpdirname)
+
+        try:
+            AutoPipelineForText2Image.from_pretrained(repo, local_files_only=True)
+        except OSError:
+            assert False, "not able to load local files"
+
+        shutil.rmtree(tmpdirname.parent.parent)
+
+    def test_from_pretrained_text2img(self):
+        repo = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
+        pipe = AutoPipelineForText2Image.from_pretrained(repo)
+        assert pipe.__class__.__name__ == "StableDiffusionXLPipeline"
+
+        controlnet = ControlNetModel.from_pretrained("hf-internal-testing/tiny-controlnet")
+        pipe_control = AutoPipelineForText2Image.from_pretrained(repo, controlnet=controlnet)
+        assert pipe_control.__class__.__name__ == "StableDiffusionXLControlNetPipeline"
+
+        pipe_pag = AutoPipelineForText2Image.from_pretrained(repo, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGPipeline"
+
+        pipe_control_pag = AutoPipelineForText2Image.from_pretrained(repo, controlnet=controlnet, enable_pag=True)
+        assert pipe_control_pag.__class__.__name__ == "StableDiffusionXLControlNetPAGPipeline"
+
+    def test_from_pipe_pag_text2img(self):
+        # test from StableDiffusionXLPipeline
+        pipe = AutoPipelineForText2Image.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
+        controlnet = ControlNetModel.from_pretrained("hf-internal-testing/tiny-controlnet")
+
+        #  - test `enable_pag` flag
+        pipe_pag = AutoPipelineForText2Image.from_pipe(pipe, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGPipeline"
+        assert "controlnet" not in pipe_pag.components
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe, enable_pag=False)
+        assert pipe.__class__.__name__ == "StableDiffusionXLPipeline"
+        assert "controlnet" not in pipe.components
+
+        #  - test `enabe_pag` + `controlnet` flag
+        pipe_control_pag = AutoPipelineForText2Image.from_pipe(pipe, controlnet=controlnet, enable_pag=True)
+        assert pipe_control_pag.__class__.__name__ == "StableDiffusionXLControlNetPAGPipeline"
+        assert "controlnet" in pipe_control_pag.components
+
+        pipe_control = AutoPipelineForText2Image.from_pipe(pipe, controlnet=controlnet, enable_pag=False)
+        assert pipe_control.__class__.__name__ == "StableDiffusionXLControlNetPipeline"
+        assert "controlnet" in pipe_control.components
+
+        pipe_pag = AutoPipelineForText2Image.from_pipe(pipe, controlnet=None, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGPipeline"
+        assert "controlnet" not in pipe_pag.components
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe, controlnet=None, enable_pag=False)
+        assert pipe.__class__.__name__ == "StableDiffusionXLPipeline"
+        assert "controlnet" not in pipe.components
+
+        # test from StableDiffusionXLControlNetPipeline
+        # - test `enable_pag` flag
+        pipe_control_pag = AutoPipelineForText2Image.from_pipe(pipe_control, enable_pag=True)
+        assert pipe_control_pag.__class__.__name__ == "StableDiffusionXLControlNetPAGPipeline"
+        assert "controlnet" in pipe_control_pag.components
+
+        pipe_control = AutoPipelineForText2Image.from_pipe(pipe_control, enable_pag=False)
+        assert pipe_control.__class__.__name__ == "StableDiffusionXLControlNetPipeline"
+        assert "controlnet" in pipe_control.components
+
+        # - test `enable_pag` + `controlnet` flag
+        pipe_control_pag = AutoPipelineForText2Image.from_pipe(pipe_control, controlnet=controlnet, enable_pag=True)
+        assert pipe_control_pag.__class__.__name__ == "StableDiffusionXLControlNetPAGPipeline"
+        assert "controlnet" in pipe_control_pag.components
+
+        pipe_control = AutoPipelineForText2Image.from_pipe(pipe_control, controlnet=controlnet, enable_pag=False)
+        assert pipe_control.__class__.__name__ == "StableDiffusionXLControlNetPipeline"
+        assert "controlnet" in pipe_control.components
+
+        pipe_pag = AutoPipelineForText2Image.from_pipe(pipe_control, controlnet=None, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGPipeline"
+        assert "controlnet" not in pipe_pag.components
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe_control, controlnet=None, enable_pag=False)
+        assert pipe.__class__.__name__ == "StableDiffusionXLPipeline"
+        assert "controlnet" not in pipe.components
+
+        # test from StableDiffusionXLControlNetPAGPipeline
+        # - test `enable_pag` flag
+        pipe_control_pag = AutoPipelineForText2Image.from_pipe(pipe_control_pag, enable_pag=True)
+        assert pipe_control_pag.__class__.__name__ == "StableDiffusionXLControlNetPAGPipeline"
+        assert "controlnet" in pipe_control_pag.components
+
+        pipe_control = AutoPipelineForText2Image.from_pipe(pipe_control_pag, enable_pag=False)
+        assert pipe_control.__class__.__name__ == "StableDiffusionXLControlNetPipeline"
+        assert "controlnet" in pipe_control.components
+
+        # - test `enable_pag` + `controlnet` flag
+        pipe_control_pag = AutoPipelineForText2Image.from_pipe(
+            pipe_control_pag, controlnet=controlnet, enable_pag=True
+        )
+        assert pipe_control_pag.__class__.__name__ == "StableDiffusionXLControlNetPAGPipeline"
+        assert "controlnet" in pipe_control_pag.components
+
+        pipe_control = AutoPipelineForText2Image.from_pipe(pipe_control_pag, controlnet=controlnet, enable_pag=False)
+        assert pipe_control.__class__.__name__ == "StableDiffusionXLControlNetPipeline"
+        assert "controlnet" in pipe_control.components
+
+        pipe_pag = AutoPipelineForText2Image.from_pipe(pipe_control_pag, controlnet=None, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGPipeline"
+        assert "controlnet" not in pipe_pag.components
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe_control_pag, controlnet=None, enable_pag=False)
+        assert pipe.__class__.__name__ == "StableDiffusionXLPipeline"
+        assert "controlnet" not in pipe.components
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe_control_pag, enable_pag=False)
+        assert pipe.__class__.__name__ == "StableDiffusionXLControlNetPipeline"
+        assert "controlnet" in pipe.components
+
+    def test_from_pretrained_img2img(self):
+        repo = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
+
+        pipe = AutoPipelineForImage2Image.from_pretrained(repo)
+        assert pipe.__class__.__name__ == "StableDiffusionXLImg2ImgPipeline"
+
+        controlnet = ControlNetModel.from_pretrained("hf-internal-testing/tiny-controlnet")
+        pipe_control = AutoPipelineForImage2Image.from_pretrained(repo, controlnet=controlnet)
+        assert pipe_control.__class__.__name__ == "StableDiffusionXLControlNetImg2ImgPipeline"
+
+        pipe_pag = AutoPipelineForImage2Image.from_pretrained(repo, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGImg2ImgPipeline"
+
+        pipe_control_pag = AutoPipelineForImage2Image.from_pretrained(repo, controlnet=controlnet, enable_pag=True)
+        assert pipe_control_pag.__class__.__name__ == "StableDiffusionXLControlNetPAGImg2ImgPipeline"
+
+    def test_from_pretrained_img2img_refiner(self):
+        repo = "hf-internal-testing/tiny-stable-diffusion-xl-refiner-pipe"
+
+        pipe = AutoPipelineForImage2Image.from_pretrained(repo)
+        assert pipe.__class__.__name__ == "StableDiffusionXLImg2ImgPipeline"
+
+        controlnet = ControlNetModel.from_pretrained("hf-internal-testing/tiny-controlnet")
+        pipe_control = AutoPipelineForImage2Image.from_pretrained(repo, controlnet=controlnet)
+        assert pipe_control.__class__.__name__ == "StableDiffusionXLControlNetImg2ImgPipeline"
+
+        pipe_pag = AutoPipelineForImage2Image.from_pretrained(repo, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGImg2ImgPipeline"
+
+        pipe_control_pag = AutoPipelineForImage2Image.from_pretrained(repo, controlnet=controlnet, enable_pag=True)
+        assert pipe_control_pag.__class__.__name__ == "StableDiffusionXLControlNetPAGImg2ImgPipeline"
+
+    def test_from_pipe_pag_img2img(self):
+        # test from tableDiffusionXLPAGImg2ImgPipeline
+        pipe = AutoPipelineForImage2Image.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
+        # - test `enable_pag` flag
+        pipe_pag = AutoPipelineForImage2Image.from_pipe(pipe, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGImg2ImgPipeline"
+
+        pipe = AutoPipelineForImage2Image.from_pipe(pipe, enable_pag=False)
+        assert pipe.__class__.__name__ == "StableDiffusionXLImg2ImgPipeline"
+
+        # testing from StableDiffusionXLPAGImg2ImgPipeline
+        # - test `enable_pag` flag
+        pipe_pag = AutoPipelineForImage2Image.from_pipe(pipe_pag, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGImg2ImgPipeline"
+
+        pipe = AutoPipelineForImage2Image.from_pipe(pipe_pag, enable_pag=False)
+        assert pipe.__class__.__name__ == "StableDiffusionXLImg2ImgPipeline"
+
+    def test_from_pretrained_inpaint(self):
+        repo = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
+
+        pipe = AutoPipelineForInpainting.from_pretrained(repo)
+        assert pipe.__class__.__name__ == "StableDiffusionXLInpaintPipeline"
+
+        pipe_pag = AutoPipelineForInpainting.from_pretrained(repo, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGInpaintPipeline"
+
+    def test_from_pretrained_inpaint_from_inpaint(self):
+        repo = "hf-internal-testing/tiny-stable-diffusion-xl-inpaint-pipe"
+
+        pipe = AutoPipelineForInpainting.from_pretrained(repo)
+        assert pipe.__class__.__name__ == "StableDiffusionXLInpaintPipeline"
+
+        # make sure you can use pag with inpaint-specific pipeline
+        pipe = AutoPipelineForInpainting.from_pretrained(repo, enable_pag=True)
+        assert pipe.__class__.__name__ == "StableDiffusionXLPAGInpaintPipeline"
+
+    def test_from_pipe_pag_inpaint(self):
+        # test from tableDiffusionXLPAGInpaintPipeline
+        pipe = AutoPipelineForInpainting.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
+        # - test `enable_pag` flag
+        pipe_pag = AutoPipelineForInpainting.from_pipe(pipe, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGInpaintPipeline"
+
+        pipe = AutoPipelineForInpainting.from_pipe(pipe, enable_pag=False)
+        assert pipe.__class__.__name__ == "StableDiffusionXLInpaintPipeline"
+
+        # testing from StableDiffusionXLPAGInpaintPipeline
+        # - test `enable_pag` flag
+        pipe_pag = AutoPipelineForInpainting.from_pipe(pipe_pag, enable_pag=True)
+        assert pipe_pag.__class__.__name__ == "StableDiffusionXLPAGInpaintPipeline"
+
+        pipe = AutoPipelineForInpainting.from_pipe(pipe_pag, enable_pag=False)
+        assert pipe.__class__.__name__ == "StableDiffusionXLInpaintPipeline"
+
+    def test_from_pipe_pag_new_task(self):
+        # for from_pipe_new_task we only need to make sure it can map to the same pipeline from a different task,
+        # i.e. no need to test `enable_pag` + `controlnet` flag because it is already tested in `test_from_pipe_pag_text2img` and `test_from_pipe_pag_inpaint`etc
+        pipe_pag_text2img = AutoPipelineForText2Image.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-xl-pipe", enable_pag=True
+        )
+
+        # text2img pag -> inpaint pag
+        pipe_pag_inpaint = AutoPipelineForInpainting.from_pipe(pipe_pag_text2img)
+        assert pipe_pag_inpaint.__class__.__name__ == "StableDiffusionXLPAGInpaintPipeline"
+        # text2img pag -> img2img pag
+        pipe_pag_img2img = AutoPipelineForImage2Image.from_pipe(pipe_pag_text2img)
+        assert pipe_pag_img2img.__class__.__name__ == "StableDiffusionXLPAGImg2ImgPipeline"
+
+        # inpaint pag -> text2img pag
+        pipe_pag_text2img = AutoPipelineForText2Image.from_pipe(pipe_pag_inpaint)
+        assert pipe_pag_text2img.__class__.__name__ == "StableDiffusionXLPAGPipeline"
+        # inpaint pag -> img2img pag
+        pipe_pag_img2img = AutoPipelineForImage2Image.from_pipe(pipe_pag_inpaint)
+        assert pipe_pag_img2img.__class__.__name__ == "StableDiffusionXLPAGImg2ImgPipeline"
+
+        # img2img pag -> text2img pag
+        pipe_pag_text2img = AutoPipelineForText2Image.from_pipe(pipe_pag_img2img)
+        assert pipe_pag_text2img.__class__.__name__ == "StableDiffusionXLPAGPipeline"
+        # img2img pag -> inpaint pag
+        pipe_pag_inpaint = AutoPipelineForInpainting.from_pipe(pipe_pag_img2img)
+        assert pipe_pag_inpaint.__class__.__name__ == "StableDiffusionXLPAGInpaintPipeline"
+
+    def test_from_pipe_controlnet_text2img(self):
+        pipe = AutoPipelineForText2Image.from_pretrained("hf-internal-testing/tiny-stable-diffusion-pipe")
+        controlnet = ControlNetModel.from_pretrained("hf-internal-testing/tiny-controlnet")
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe, controlnet=controlnet)
+        assert pipe.__class__.__name__ == "StableDiffusionControlNetPipeline"
+        assert "controlnet" in pipe.components
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe, controlnet=None)
+        assert pipe.__class__.__name__ == "StableDiffusionPipeline"
+        assert "controlnet" not in pipe.components
+
+    def test_from_pipe_controlnet_img2img(self):
+        pipe = AutoPipelineForImage2Image.from_pretrained("hf-internal-testing/tiny-stable-diffusion-pipe")
+        controlnet = ControlNetModel.from_pretrained("hf-internal-testing/tiny-controlnet")
+
+        pipe = AutoPipelineForImage2Image.from_pipe(pipe, controlnet=controlnet)
+        assert pipe.__class__.__name__ == "StableDiffusionControlNetImg2ImgPipeline"
+        assert "controlnet" in pipe.components
+
+        pipe = AutoPipelineForImage2Image.from_pipe(pipe, controlnet=None)
+        assert pipe.__class__.__name__ == "StableDiffusionImg2ImgPipeline"
+        assert "controlnet" not in pipe.components
+
+    def test_from_pipe_controlnet_inpaint(self):
+        pipe = AutoPipelineForInpainting.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
+        controlnet = ControlNetModel.from_pretrained("hf-internal-testing/tiny-controlnet")
+
+        pipe = AutoPipelineForInpainting.from_pipe(pipe, controlnet=controlnet)
+        assert pipe.__class__.__name__ == "StableDiffusionControlNetInpaintPipeline"
+        assert "controlnet" in pipe.components
+
+        pipe = AutoPipelineForInpainting.from_pipe(pipe, controlnet=None)
+        assert pipe.__class__.__name__ == "StableDiffusionInpaintPipeline"
+        assert "controlnet" not in pipe.components
+
+    def test_from_pipe_controlnet_new_task(self):
+        pipe_text2img = AutoPipelineForText2Image.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
+        controlnet = ControlNetModel.from_pretrained("hf-internal-testing/tiny-controlnet")
+
+        pipe_control_img2img = AutoPipelineForImage2Image.from_pipe(pipe_text2img, controlnet=controlnet)
+        assert pipe_control_img2img.__class__.__name__ == "StableDiffusionControlNetImg2ImgPipeline"
+        assert "controlnet" in pipe_control_img2img.components
+
+        pipe_inpaint = AutoPipelineForInpainting.from_pipe(pipe_control_img2img, controlnet=None)
+        assert pipe_inpaint.__class__.__name__ == "StableDiffusionInpaintPipeline"
+        assert "controlnet" not in pipe_inpaint.components
+
+        # testing `from_pipe` for text2img controlnet
+        ## 1. from a different controlnet pipe, without controlnet argument
+        pipe_control_text2img = AutoPipelineForText2Image.from_pipe(pipe_control_img2img)
+        assert pipe_control_text2img.__class__.__name__ == "StableDiffusionControlNetPipeline"
+        assert "controlnet" in pipe_control_text2img.components
+
+        ## 2. from a different controlnet pipe, with controlnet argument
+        pipe_control_text2img = AutoPipelineForText2Image.from_pipe(pipe_control_img2img, controlnet=controlnet)
+        assert pipe_control_text2img.__class__.__name__ == "StableDiffusionControlNetPipeline"
+        assert "controlnet" in pipe_control_text2img.components
+
+        ## 3. from same controlnet pipeline class, with a different controlnet component
+        pipe_control_text2img = AutoPipelineForText2Image.from_pipe(pipe_control_text2img, controlnet=controlnet)
+        assert pipe_control_text2img.__class__.__name__ == "StableDiffusionControlNetPipeline"
+        assert "controlnet" in pipe_control_text2img.components
+
+        # testing from_pipe for inpainting
+        ## 1. from a different controlnet pipeline class
+        pipe_control_inpaint = AutoPipelineForInpainting.from_pipe(pipe_control_img2img)
+        assert pipe_control_inpaint.__class__.__name__ == "StableDiffusionControlNetInpaintPipeline"
+        assert "controlnet" in pipe_control_inpaint.components
+
+        ## from a different controlnet pipe, with a different controlnet
+        pipe_control_inpaint = AutoPipelineForInpainting.from_pipe(pipe_control_img2img, controlnet=controlnet)
+        assert pipe_control_inpaint.__class__.__name__ == "StableDiffusionControlNetInpaintPipeline"
+        assert "controlnet" in pipe_control_inpaint.components
+
+        ## from same controlnet pipe, with a different controlnet
+        pipe_control_inpaint = AutoPipelineForInpainting.from_pipe(pipe_control_inpaint, controlnet=controlnet)
+        assert pipe_control_inpaint.__class__.__name__ == "StableDiffusionControlNetInpaintPipeline"
+        assert "controlnet" in pipe_control_inpaint.components
+
+        # testing from_pipe from img2img controlnet
+        ## from a different controlnet pipe, without controlnet argument
+        pipe_control_img2img = AutoPipelineForImage2Image.from_pipe(pipe_control_text2img)
+        assert pipe_control_img2img.__class__.__name__ == "StableDiffusionControlNetImg2ImgPipeline"
+        assert "controlnet" in pipe_control_img2img.components
+
+        # from a different controlnet pipe, with a different controlnet component
+        pipe_control_img2img = AutoPipelineForImage2Image.from_pipe(pipe_control_text2img, controlnet=controlnet)
+        assert pipe_control_img2img.__class__.__name__ == "StableDiffusionControlNetImg2ImgPipeline"
+        assert "controlnet" in pipe_control_img2img.components
+
+        # from same controlnet pipeline class, with a different controlnet
+        pipe_control_img2img = AutoPipelineForImage2Image.from_pipe(pipe_control_img2img, controlnet=controlnet)
+        assert pipe_control_img2img.__class__.__name__ == "StableDiffusionControlNetImg2ImgPipeline"
+        assert "controlnet" in pipe_control_img2img.components
+
+    def test_from_pipe_optional_components(self):
+        image_encoder = self.dummy_image_encoder
+
+        pipe = AutoPipelineForText2Image.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-pipe",
+            image_encoder=image_encoder,
+        )
+
+        pipe = AutoPipelineForImage2Image.from_pipe(pipe)
+        assert pipe.image_encoder is not None
+
+        pipe = AutoPipelineForText2Image.from_pipe(pipe, image_encoder=None)
+        assert pipe.image_encoder is None
+
+
+@slow
+class AutoPipelineIntegrationTest(unittest.TestCase):
+    def test_pipe_auto(self):
+        for model_name, model_repo in PRETRAINED_MODEL_REPO_MAPPING.items():
+            # test txt2img
+            pipe_txt2img = AutoPipelineForText2Image.from_pretrained(
+                model_repo, variant="fp16", torch_dtype=torch.float16
+            )
+            self.assertIsInstance(pipe_txt2img, AUTO_TEXT2IMAGE_PIPELINES_MAPPING[model_name])
+
+            pipe_to = AutoPipelineForText2Image.from_pipe(pipe_txt2img)
+            self.assertIsInstance(pipe_to, AUTO_TEXT2IMAGE_PIPELINES_MAPPING[model_name])
+
+            pipe_to = AutoPipelineForImage2Image.from_pipe(pipe_txt2img)
+            self.assertIsInstance(pipe_to, AUTO_IMAGE2IMAGE_PIPELINES_MAPPING[model_name])
+
+            if "kandinsky" not in model_name:
+                pipe_to = AutoPipelineForInpainting.from_pipe(pipe_txt2img)
+                self.assertIsInstance(pipe_to, AUTO_INPAINT_PIPELINES_MAPPING[model_name])
+
+            del pipe_txt2img, pipe_to
+            gc.collect()
+
+            # test img2img
+
+            pipe_img2img = AutoPipelineForImage2Image.from_pretrained(
+                model_repo, variant="fp16", torch_dtype=torch.float16
+            )
+            self.assertIsInstance(pipe_img2img, AUTO_IMAGE2IMAGE_PIPELINES_MAPPING[model_name])
+
+            pipe_to = AutoPipelineForText2Image.from_pipe(pipe_img2img)
+            self.assertIsInstance(pipe_to, AUTO_TEXT2IMAGE_PIPELINES_MAPPING[model_name])
+
+            pipe_to = AutoPipelineForImage2Image.from_pipe(pipe_img2img)
+            self.assertIsInstance(pipe_to, AUTO_IMAGE2IMAGE_PIPELINES_MAPPING[model_name])
+
+            if "kandinsky" not in model_name:
+                pipe_to = AutoPipelineForInpainting.from_pipe(pipe_img2img)
+                self.assertIsInstance(pipe_to, AUTO_INPAINT_PIPELINES_MAPPING[model_name])
+
+            del pipe_img2img, pipe_to
+            gc.collect()
+
+            # test inpaint
+
+            if "kandinsky" not in model_name:
+                pipe_inpaint = AutoPipelineForInpainting.from_pretrained(
+                    model_repo, variant="fp16", torch_dtype=torch.float16
+                )
+                self.assertIsInstance(pipe_inpaint, AUTO_INPAINT_PIPELINES_MAPPING[model_name])
+
+                pipe_to = AutoPipelineForText2Image.from_pipe(pipe_inpaint)
+                self.assertIsInstance(pipe_to, AUTO_TEXT2IMAGE_PIPELINES_MAPPING[model_name])
+
+                pipe_to = AutoPipelineForImage2Image.from_pipe(pipe_inpaint)
+                self.assertIsInstance(pipe_to, AUTO_IMAGE2IMAGE_PIPELINES_MAPPING[model_name])
+
+                pipe_to = AutoPipelineForInpainting.from_pipe(pipe_inpaint)
+                self.assertIsInstance(pipe_to, AUTO_INPAINT_PIPELINES_MAPPING[model_name])
+
+                del pipe_inpaint, pipe_to
+                gc.collect()
+
+    def test_from_pipe_consistent(self):
+        for model_name, model_repo in PRETRAINED_MODEL_REPO_MAPPING.items():
+            if model_name in ["kandinsky", "kandinsky22"]:
+                auto_pipes = [AutoPipelineForText2Image, AutoPipelineForImage2Image]
+            else:
+                auto_pipes = [AutoPipelineForText2Image, AutoPipelineForImage2Image, AutoPipelineForInpainting]
+
+            # test from_pretrained
+            for pipe_from_class in auto_pipes:
+                pipe_from = pipe_from_class.from_pretrained(model_repo, variant="fp16", torch_dtype=torch.float16)
+                pipe_from_config = dict(pipe_from.config)
+
+                for pipe_to_class in auto_pipes:
+                    pipe_to = pipe_to_class.from_pipe(pipe_from)
+                    self.assertEqual(dict(pipe_to.config), pipe_from_config)
+
+                del pipe_from, pipe_to
+                gc.collect()
+
+    def test_controlnet(self):
+        # test from_pretrained
+        model_repo = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+        controlnet_repo = "lllyasviel/sd-controlnet-canny"
+
+        controlnet = ControlNetModel.from_pretrained(controlnet_repo, torch_dtype=torch.float16)
+
+        pipe_txt2img = AutoPipelineForText2Image.from_pretrained(
+            model_repo, controlnet=controlnet, torch_dtype=torch.float16
+        )
+        self.assertIsInstance(pipe_txt2img, AUTO_TEXT2IMAGE_PIPELINES_MAPPING["stable-diffusion-controlnet"])
+
+        pipe_img2img = AutoPipelineForImage2Image.from_pretrained(
+            model_repo, controlnet=controlnet, torch_dtype=torch.float16
+        )
+        self.assertIsInstance(pipe_img2img, AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-controlnet"])
+
+        pipe_inpaint = AutoPipelineForInpainting.from_pretrained(
+            model_repo, controlnet=controlnet, torch_dtype=torch.float16
+        )
+        self.assertIsInstance(pipe_inpaint, AUTO_INPAINT_PIPELINES_MAPPING["stable-diffusion-controlnet"])
+
+        # test from_pipe
+        for pipe_from in [pipe_txt2img, pipe_img2img, pipe_inpaint]:
+            pipe_to = AutoPipelineForText2Image.from_pipe(pipe_from)
+            self.assertIsInstance(pipe_to, AUTO_TEXT2IMAGE_PIPELINES_MAPPING["stable-diffusion-controlnet"])
+            self.assertEqual(dict(pipe_to.config), dict(pipe_txt2img.config))
+
+            pipe_to = AutoPipelineForImage2Image.from_pipe(pipe_from)
+            self.assertIsInstance(pipe_to, AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-controlnet"])
+            self.assertEqual(dict(pipe_to.config), dict(pipe_img2img.config))
+
+            pipe_to = AutoPipelineForInpainting.from_pipe(pipe_from)
+            self.assertIsInstance(pipe_to, AUTO_INPAINT_PIPELINES_MAPPING["stable-diffusion-controlnet"])
+            self.assertEqual(dict(pipe_to.config), dict(pipe_inpaint.config))
diff --git a/diffusers/tests/pipelines/test_pipelines_combined.py b/diffusers/tests/pipelines/test_pipelines_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..adedd54fea40076e1cac914d91ad93ef52761a56
--- /dev/null
+++ b/diffusers/tests/pipelines/test_pipelines_combined.py
@@ -0,0 +1,128 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from huggingface_hub import ModelCard
+
+from diffusers import (
+    DDPMScheduler,
+    DiffusionPipeline,
+    KandinskyV22CombinedPipeline,
+    KandinskyV22Pipeline,
+    KandinskyV22PriorPipeline,
+)
+from diffusers.pipelines.pipeline_utils import CONNECTED_PIPES_KEYS
+
+
+def state_dicts_almost_equal(sd1, sd2):
+    sd1 = dict(sorted(sd1.items()))
+    sd2 = dict(sorted(sd2.items()))
+
+    models_are_equal = True
+    for ten1, ten2 in zip(sd1.values(), sd2.values()):
+        if (ten1 - ten2).abs().sum() > 1e-3:
+            models_are_equal = False
+
+    return models_are_equal
+
+
+class CombinedPipelineFastTest(unittest.TestCase):
+    def modelcard_has_connected_pipeline(self, model_id):
+        modelcard = ModelCard.load(model_id)
+        connected_pipes = {prefix: getattr(modelcard.data, prefix, [None])[0] for prefix in CONNECTED_PIPES_KEYS}
+        connected_pipes = {k: v for k, v in connected_pipes.items() if v is not None}
+
+        return len(connected_pipes) > 0
+
+    def test_correct_modelcard_format(self):
+        # hf-internal-testing/tiny-random-kandinsky-v22-prior has no metadata
+        assert not self.modelcard_has_connected_pipeline("hf-internal-testing/tiny-random-kandinsky-v22-prior")
+
+        # see https://huggingface.co/hf-internal-testing/tiny-random-kandinsky-v22-decoder/blob/8baff9897c6be017013e21b5c562e5a381646c7e/README.md?code=true#L2
+        assert self.modelcard_has_connected_pipeline("hf-internal-testing/tiny-random-kandinsky-v22-decoder")
+
+    def test_load_connected_checkpoint_when_specified(self):
+        pipeline_prior = DiffusionPipeline.from_pretrained("hf-internal-testing/tiny-random-kandinsky-v22-prior")
+        pipeline_prior_connected = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-random-kandinsky-v22-prior", load_connected_pipeline=True
+        )
+
+        # Passing `load_connected_pipeline` to prior is a no-op as the pipeline has no connected pipeline
+        assert pipeline_prior.__class__ == pipeline_prior_connected.__class__
+
+        pipeline = DiffusionPipeline.from_pretrained("hf-internal-testing/tiny-random-kandinsky-v22-decoder")
+        pipeline_connected = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-random-kandinsky-v22-decoder", load_connected_pipeline=True
+        )
+
+        # Passing `load_connected_pipeline` to decoder loads the combined pipeline
+        assert pipeline.__class__ != pipeline_connected.__class__
+        assert pipeline.__class__ == KandinskyV22Pipeline
+        assert pipeline_connected.__class__ == KandinskyV22CombinedPipeline
+
+        # check that loaded components match prior and decoder components
+        assert set(pipeline_connected.components.keys()) == set(
+            ["prior_" + k for k in pipeline_prior.components.keys()] + list(pipeline.components.keys())
+        )
+
+    def test_load_connected_checkpoint_default(self):
+        prior = KandinskyV22PriorPipeline.from_pretrained("hf-internal-testing/tiny-random-kandinsky-v22-prior")
+        decoder = KandinskyV22Pipeline.from_pretrained("hf-internal-testing/tiny-random-kandinsky-v22-decoder")
+
+        # check that combined pipeline loads both prior & decoder because of
+        # https://huggingface.co/hf-internal-testing/tiny-random-kandinsky-v22-decoder/blob/8baff9897c6be017013e21b5c562e5a381646c7e/README.md?code=true#L3
+        assert (
+            KandinskyV22CombinedPipeline._load_connected_pipes
+        )  # combined pipelines will download more checkpoints that just the one specified
+        pipeline = KandinskyV22CombinedPipeline.from_pretrained(
+            "hf-internal-testing/tiny-random-kandinsky-v22-decoder"
+        )
+
+        prior_comps = prior.components
+        decoder_comps = decoder.components
+        for k, component in pipeline.components.items():
+            if k.startswith("prior_"):
+                k = k[6:]
+                comp = prior_comps[k]
+            else:
+                comp = decoder_comps[k]
+
+            if isinstance(component, torch.nn.Module):
+                assert state_dicts_almost_equal(component.state_dict(), comp.state_dict())
+            elif hasattr(component, "config"):
+                assert dict(component.config) == dict(comp.config)
+            else:
+                assert component.__class__ == comp.__class__
+
+    def test_load_connected_checkpoint_with_passed_obj(self):
+        pipeline = KandinskyV22CombinedPipeline.from_pretrained(
+            "hf-internal-testing/tiny-random-kandinsky-v22-decoder"
+        )
+        prior_scheduler = DDPMScheduler.from_config(pipeline.prior_scheduler.config)
+        scheduler = DDPMScheduler.from_config(pipeline.scheduler.config)
+
+        # make sure we pass a different scheduler and prior_scheduler
+        assert pipeline.prior_scheduler.__class__ != prior_scheduler.__class__
+        assert pipeline.scheduler.__class__ != scheduler.__class__
+
+        pipeline_new = KandinskyV22CombinedPipeline.from_pretrained(
+            "hf-internal-testing/tiny-random-kandinsky-v22-decoder",
+            prior_scheduler=prior_scheduler,
+            scheduler=scheduler,
+        )
+        assert dict(pipeline_new.prior_scheduler.config) == dict(prior_scheduler.config)
+        assert dict(pipeline_new.scheduler.config) == dict(scheduler.config)
diff --git a/diffusers/tests/pipelines/test_pipelines_common.py b/diffusers/tests/pipelines/test_pipelines_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..49da08e2ca45f5edce1cb70c217fe0f7980252dd
--- /dev/null
+++ b/diffusers/tests/pipelines/test_pipelines_common.py
@@ -0,0 +1,2129 @@
+import gc
+import inspect
+import json
+import os
+import tempfile
+import unittest
+import uuid
+from typing import Any, Callable, Dict, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn as nn
+from huggingface_hub import ModelCard, delete_repo
+from huggingface_hub.utils import is_jinja_available
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+    AsymmetricAutoencoderKL,
+    AutoencoderKL,
+    AutoencoderTiny,
+    ConsistencyDecoderVAE,
+    DDIMScheduler,
+    DiffusionPipeline,
+    KolorsPipeline,
+    StableDiffusionPipeline,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import IPAdapterMixin
+from diffusers.models.attention_processor import AttnProcessor
+from diffusers.models.controlnet_xs import UNetControlNetXSModel
+from diffusers.models.unets.unet_3d_condition import UNet3DConditionModel
+from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet
+from diffusers.models.unets.unet_motion_model import UNetMotionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import logging
+from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
+from diffusers.utils.testing_utils import (
+    CaptureLogger,
+    require_torch,
+    skip_mps,
+    torch_device,
+)
+
+from ..models.autoencoders.test_models_vae import (
+    get_asym_autoencoder_kl_config,
+    get_autoencoder_kl_config,
+    get_autoencoder_tiny_config,
+    get_consistency_vae_config,
+)
+from ..models.unets.test_models_unet_2d_condition import (
+    create_ip_adapter_faceid_state_dict,
+    create_ip_adapter_state_dict,
+)
+from ..others.test_utils import TOKEN, USER, is_staging_test
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+def check_same_shape(tensor_list):
+    shapes = [tensor.shape for tensor in tensor_list]
+    return all(shape == shapes[0] for shape in shapes[1:])
+
+
+def check_qkv_fusion_matches_attn_procs_length(model, original_attn_processors):
+    current_attn_processors = model.attn_processors
+    return len(current_attn_processors) == len(original_attn_processors)
+
+
+def check_qkv_fusion_processors_exist(model):
+    current_attn_processors = model.attn_processors
+    proc_names = [v.__class__.__name__ for _, v in current_attn_processors.items()]
+    return all(p.startswith("Fused") for p in proc_names)
+
+
+class SDFunctionTesterMixin:
+    """
+    This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.
+    It provides a set of common tests for PyTorch pipeline that inherit from StableDiffusionMixin, e.g. vae_slicing, vae_tiling, freeu, etc.
+    """
+
+    def test_vae_slicing(self, image_count=4):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        # components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * image_count
+        if "image" in inputs:  # fix batch size mismatch in I2V_Gen pipeline
+            inputs["image"] = [inputs["image"]] * image_count
+        output_1 = pipe(**inputs)
+
+        # make sure sliced vae decode yields the same result
+        pipe.enable_vae_slicing()
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * image_count
+        if "image" in inputs:
+            inputs["image"] = [inputs["image"]] * image_count
+        inputs["return_dict"] = False
+        output_2 = pipe(**inputs)
+
+        assert np.abs(output_2[0].flatten() - output_1[0].flatten()).max() < 1e-2
+
+    def test_vae_tiling(self):
+        components = self.get_dummy_components()
+
+        # make sure here that pndm scheduler skips prk
+        if "safety_checker" in components:
+            components["safety_checker"] = None
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["return_dict"] = False
+
+        # Test that tiled decode at 512x512 yields the same result as the non-tiled decode
+        output_1 = pipe(**inputs)[0]
+
+        # make sure tiled vae decode yields the same result
+        pipe.enable_vae_tiling()
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["return_dict"] = False
+        output_2 = pipe(**inputs)[0]
+
+        assert np.abs(to_np(output_2) - to_np(output_1)).max() < 5e-1
+
+        # test that tiled decode works with various shapes
+        shapes = [(1, 4, 73, 97), (1, 4, 65, 49)]
+        with torch.no_grad():
+            for shape in shapes:
+                zeros = torch.zeros(shape).to(torch_device)
+                pipe.vae.decode(zeros)
+
+    # MPS currently doesn't support ComplexFloats, which are required for FreeU - see https://github.com/huggingface/diffusers/issues/7569.
+    @skip_mps
+    def test_freeu(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # Normal inference
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["return_dict"] = False
+        inputs["output_type"] = "np"
+        output = pipe(**inputs)[0]
+
+        # FreeU-enabled inference
+        pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["return_dict"] = False
+        inputs["output_type"] = "np"
+        output_freeu = pipe(**inputs)[0]
+
+        # FreeU-disabled inference
+        pipe.disable_freeu()
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for upsample_block in pipe.unet.up_blocks:
+            for key in freeu_keys:
+                assert getattr(upsample_block, key) is None, f"Disabling of FreeU should have set {key} to None."
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["return_dict"] = False
+        inputs["output_type"] = "np"
+        output_no_freeu = pipe(**inputs)[0]
+
+        assert not np.allclose(
+            output[0, -3:, -3:, -1], output_freeu[0, -3:, -3:, -1]
+        ), "Enabling of FreeU should lead to different results."
+        assert np.allclose(
+            output, output_no_freeu, atol=1e-2
+        ), f"Disabling of FreeU should lead to results similar to the default pipeline results but Max Abs Error={np.abs(output_no_freeu - output).max()}."
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["return_dict"] = False
+        image = pipe(**inputs)[0]
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        pipe.fuse_qkv_projections()
+        for _, component in pipe.components.items():
+            if (
+                isinstance(component, nn.Module)
+                and hasattr(component, "original_attn_processors")
+                and component.original_attn_processors is not None
+            ):
+                assert check_qkv_fusion_processors_exist(
+                    component
+                ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+                assert check_qkv_fusion_matches_attn_procs_length(
+                    component, component.original_attn_processors
+                ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["return_dict"] = False
+        image_fused = pipe(**inputs)[0]
+        image_slice_fused = image_fused[0, -3:, -3:, -1]
+
+        pipe.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        inputs["return_dict"] = False
+        image_disabled = pipe(**inputs)[0]
+        image_slice_disabled = image_disabled[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2
+        ), "Fusion of QKV projections shouldn't affect the outputs."
+        assert np.allclose(
+            image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+        assert np.allclose(
+            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+        ), "Original outputs should match when fused QKV projections are disabled."
+
+
+class IPAdapterTesterMixin:
+    """
+    This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.
+    It provides a set of common tests for pipelines that support IP Adapters.
+    """
+
+    def test_pipeline_signature(self):
+        parameters = inspect.signature(self.pipeline_class.__call__).parameters
+
+        assert issubclass(self.pipeline_class, IPAdapterMixin)
+        self.assertIn(
+            "ip_adapter_image",
+            parameters,
+            "`ip_adapter_image` argument must be supported by the `__call__` method",
+        )
+        self.assertIn(
+            "ip_adapter_image_embeds",
+            parameters,
+            "`ip_adapter_image_embeds` argument must be supported by the `__call__` method",
+        )
+
+    def _get_dummy_image_embeds(self, cross_attention_dim: int = 32):
+        return torch.randn((2, 1, cross_attention_dim), device=torch_device)
+
+    def _get_dummy_faceid_image_embeds(self, cross_attention_dim: int = 32):
+        return torch.randn((2, 1, 1, cross_attention_dim), device=torch_device)
+
+    def _get_dummy_masks(self, input_size: int = 64):
+        _masks = torch.zeros((1, 1, input_size, input_size), device=torch_device)
+        _masks[0, :, :, : int(input_size / 2)] = 1
+        return _masks
+
+    def _modify_inputs_for_ip_adapter_test(self, inputs: Dict[str, Any]):
+        parameters = inspect.signature(self.pipeline_class.__call__).parameters
+        if "image" in parameters.keys() and "strength" in parameters.keys():
+            inputs["num_inference_steps"] = 4
+
+        inputs["output_type"] = "np"
+        inputs["return_dict"] = False
+        return inputs
+
+    def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=None):
+        r"""Tests for IP-Adapter.
+
+        The following scenarios are tested:
+          - Single IP-Adapter with scale=0 should produce same output as no IP-Adapter.
+          - Multi IP-Adapter with scale=0 should produce same output as no IP-Adapter.
+          - Single IP-Adapter with scale!=0 should produce different output compared to no IP-Adapter.
+          - Multi IP-Adapter with scale!=0 should produce different output compared to no IP-Adapter.
+        """
+        # Raising the tolerance for this test when it's run on a CPU because we
+        # compare against static slices and that can be shaky (with a VVVV low probability).
+        expected_max_diff = 9e-4 if torch_device == "cpu" else expected_max_diff
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components).to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        cross_attention_dim = pipe.unet.config.get("cross_attention_dim", 32)
+
+        # forward pass without ip adapter
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        if expected_pipe_slice is None:
+            output_without_adapter = pipe(**inputs)[0]
+        else:
+            output_without_adapter = expected_pipe_slice
+
+        # 1. Single IP-Adapter test cases
+        adapter_state_dict = create_ip_adapter_state_dict(pipe.unet)
+        pipe.unet._load_ip_adapter_weights(adapter_state_dict)
+
+        # forward pass with single ip adapter, but scale=0 which should have no effect
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
+        pipe.set_ip_adapter_scale(0.0)
+        output_without_adapter_scale = pipe(**inputs)[0]
+        if expected_pipe_slice is not None:
+            output_without_adapter_scale = output_without_adapter_scale[0, -3:, -3:, -1].flatten()
+
+        # forward pass with single ip adapter, but with scale of adapter weights
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
+        pipe.set_ip_adapter_scale(42.0)
+        output_with_adapter_scale = pipe(**inputs)[0]
+        if expected_pipe_slice is not None:
+            output_with_adapter_scale = output_with_adapter_scale[0, -3:, -3:, -1].flatten()
+
+        max_diff_without_adapter_scale = np.abs(output_without_adapter_scale - output_without_adapter).max()
+        max_diff_with_adapter_scale = np.abs(output_with_adapter_scale - output_without_adapter).max()
+
+        self.assertLess(
+            max_diff_without_adapter_scale,
+            expected_max_diff,
+            "Output without ip-adapter must be same as normal inference",
+        )
+        self.assertGreater(
+            max_diff_with_adapter_scale, 1e-2, "Output with ip-adapter must be different from normal inference"
+        )
+
+        # 2. Multi IP-Adapter test cases
+        adapter_state_dict_1 = create_ip_adapter_state_dict(pipe.unet)
+        adapter_state_dict_2 = create_ip_adapter_state_dict(pipe.unet)
+        pipe.unet._load_ip_adapter_weights([adapter_state_dict_1, adapter_state_dict_2])
+
+        # forward pass with multi ip adapter, but scale=0 which should have no effect
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
+        pipe.set_ip_adapter_scale([0.0, 0.0])
+        output_without_multi_adapter_scale = pipe(**inputs)[0]
+        if expected_pipe_slice is not None:
+            output_without_multi_adapter_scale = output_without_multi_adapter_scale[0, -3:, -3:, -1].flatten()
+
+        # forward pass with multi ip adapter, but with scale of adapter weights
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
+        pipe.set_ip_adapter_scale([42.0, 42.0])
+        output_with_multi_adapter_scale = pipe(**inputs)[0]
+        if expected_pipe_slice is not None:
+            output_with_multi_adapter_scale = output_with_multi_adapter_scale[0, -3:, -3:, -1].flatten()
+
+        max_diff_without_multi_adapter_scale = np.abs(
+            output_without_multi_adapter_scale - output_without_adapter
+        ).max()
+        max_diff_with_multi_adapter_scale = np.abs(output_with_multi_adapter_scale - output_without_adapter).max()
+        self.assertLess(
+            max_diff_without_multi_adapter_scale,
+            expected_max_diff,
+            "Output without multi-ip-adapter must be same as normal inference",
+        )
+        self.assertGreater(
+            max_diff_with_multi_adapter_scale,
+            1e-2,
+            "Output with multi-ip-adapter scale must be different from normal inference",
+        )
+
+    def test_ip_adapter_cfg(self, expected_max_diff: float = 1e-4):
+        parameters = inspect.signature(self.pipeline_class.__call__).parameters
+
+        if "guidance_scale" not in parameters:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components).to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        cross_attention_dim = pipe.unet.config.get("cross_attention_dim", 32)
+
+        adapter_state_dict = create_ip_adapter_state_dict(pipe.unet)
+        pipe.unet._load_ip_adapter_weights(adapter_state_dict)
+        pipe.set_ip_adapter_scale(1.0)
+
+        # forward pass with CFG not applied
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)[0].unsqueeze(0)]
+        inputs["guidance_scale"] = 1.0
+        out_no_cfg = pipe(**inputs)[0]
+
+        # forward pass with CFG applied
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
+        inputs["guidance_scale"] = 7.5
+        out_cfg = pipe(**inputs)[0]
+
+        assert out_cfg.shape == out_no_cfg.shape
+
+    def test_ip_adapter_masks(self, expected_max_diff: float = 1e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components).to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        cross_attention_dim = pipe.unet.config.get("cross_attention_dim", 32)
+        sample_size = pipe.unet.config.get("sample_size", 32)
+        block_out_channels = pipe.vae.config.get("block_out_channels", [128, 256, 512, 512])
+        input_size = sample_size * (2 ** (len(block_out_channels) - 1))
+
+        # forward pass without ip adapter
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        output_without_adapter = pipe(**inputs)[0]
+        output_without_adapter = output_without_adapter[0, -3:, -3:, -1].flatten()
+
+        adapter_state_dict = create_ip_adapter_state_dict(pipe.unet)
+        pipe.unet._load_ip_adapter_weights(adapter_state_dict)
+
+        # forward pass with single ip adapter and masks, but scale=0 which should have no effect
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
+        inputs["cross_attention_kwargs"] = {"ip_adapter_masks": [self._get_dummy_masks(input_size)]}
+        pipe.set_ip_adapter_scale(0.0)
+        output_without_adapter_scale = pipe(**inputs)[0]
+        output_without_adapter_scale = output_without_adapter_scale[0, -3:, -3:, -1].flatten()
+
+        # forward pass with single ip adapter and masks, but with scale of adapter weights
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
+        inputs["cross_attention_kwargs"] = {"ip_adapter_masks": [self._get_dummy_masks(input_size)]}
+        pipe.set_ip_adapter_scale(42.0)
+        output_with_adapter_scale = pipe(**inputs)[0]
+        output_with_adapter_scale = output_with_adapter_scale[0, -3:, -3:, -1].flatten()
+
+        max_diff_without_adapter_scale = np.abs(output_without_adapter_scale - output_without_adapter).max()
+        max_diff_with_adapter_scale = np.abs(output_with_adapter_scale - output_without_adapter).max()
+
+        self.assertLess(
+            max_diff_without_adapter_scale,
+            expected_max_diff,
+            "Output without ip-adapter must be same as normal inference",
+        )
+        self.assertGreater(
+            max_diff_with_adapter_scale, 1e-3, "Output with ip-adapter must be different from normal inference"
+        )
+
+    def test_ip_adapter_faceid(self, expected_max_diff: float = 1e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components).to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        cross_attention_dim = pipe.unet.config.get("cross_attention_dim", 32)
+
+        # forward pass without ip adapter
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        output_without_adapter = pipe(**inputs)[0]
+        output_without_adapter = output_without_adapter[0, -3:, -3:, -1].flatten()
+
+        adapter_state_dict = create_ip_adapter_faceid_state_dict(pipe.unet)
+        pipe.unet._load_ip_adapter_weights(adapter_state_dict)
+
+        # forward pass with single ip adapter, but scale=0 which should have no effect
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs["ip_adapter_image_embeds"] = [self._get_dummy_faceid_image_embeds(cross_attention_dim)]
+        pipe.set_ip_adapter_scale(0.0)
+        output_without_adapter_scale = pipe(**inputs)[0]
+        output_without_adapter_scale = output_without_adapter_scale[0, -3:, -3:, -1].flatten()
+
+        # forward pass with single ip adapter, but with scale of adapter weights
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs["ip_adapter_image_embeds"] = [self._get_dummy_faceid_image_embeds(cross_attention_dim)]
+        pipe.set_ip_adapter_scale(42.0)
+        output_with_adapter_scale = pipe(**inputs)[0]
+        output_with_adapter_scale = output_with_adapter_scale[0, -3:, -3:, -1].flatten()
+
+        max_diff_without_adapter_scale = np.abs(output_without_adapter_scale - output_without_adapter).max()
+        max_diff_with_adapter_scale = np.abs(output_with_adapter_scale - output_without_adapter).max()
+
+        self.assertLess(
+            max_diff_without_adapter_scale,
+            expected_max_diff,
+            "Output without ip-adapter must be same as normal inference",
+        )
+        self.assertGreater(
+            max_diff_with_adapter_scale, 1e-3, "Output with ip-adapter must be different from normal inference"
+        )
+
+
+class PipelineLatentTesterMixin:
+    """
+    This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.
+    It provides a set of common tests for PyTorch pipeline that has vae, e.g.
+    equivalence of different input and output types, etc.
+    """
+
+    @property
+    def image_params(self) -> frozenset:
+        raise NotImplementedError(
+            "You need to set the attribute `image_params` in the child test class. "
+            "`image_params` are tested for if all accepted input image types (i.e. `pt`,`pil`,`np`) are producing same results"
+        )
+
+    @property
+    def image_latents_params(self) -> frozenset:
+        raise NotImplementedError(
+            "You need to set the attribute `image_latents_params` in the child test class. "
+            "`image_latents_params` are tested for if passing latents directly are producing same results"
+        )
+
+    def get_dummy_inputs_by_type(self, device, seed=0, input_image_type="pt", output_type="np"):
+        inputs = self.get_dummy_inputs(device, seed)
+
+        def convert_to_pt(image):
+            if isinstance(image, torch.Tensor):
+                input_image = image
+            elif isinstance(image, np.ndarray):
+                input_image = VaeImageProcessor.numpy_to_pt(image)
+            elif isinstance(image, PIL.Image.Image):
+                input_image = VaeImageProcessor.pil_to_numpy(image)
+                input_image = VaeImageProcessor.numpy_to_pt(input_image)
+            else:
+                raise ValueError(f"unsupported input_image_type {type(image)}")
+            return input_image
+
+        def convert_pt_to_type(image, input_image_type):
+            if input_image_type == "pt":
+                input_image = image
+            elif input_image_type == "np":
+                input_image = VaeImageProcessor.pt_to_numpy(image)
+            elif input_image_type == "pil":
+                input_image = VaeImageProcessor.pt_to_numpy(image)
+                input_image = VaeImageProcessor.numpy_to_pil(input_image)
+            else:
+                raise ValueError(f"unsupported input_image_type {input_image_type}.")
+            return input_image
+
+        for image_param in self.image_params:
+            if image_param in inputs.keys():
+                inputs[image_param] = convert_pt_to_type(
+                    convert_to_pt(inputs[image_param]).to(device), input_image_type
+                )
+
+        inputs["output_type"] = output_type
+
+        return inputs
+
+    def test_pt_np_pil_outputs_equivalent(self, expected_max_diff=1e-4):
+        self._test_pt_np_pil_outputs_equivalent(expected_max_diff=expected_max_diff)
+
+    def _test_pt_np_pil_outputs_equivalent(self, expected_max_diff=1e-4, input_image_type="pt"):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        output_pt = pipe(
+            **self.get_dummy_inputs_by_type(torch_device, input_image_type=input_image_type, output_type="pt")
+        )[0]
+        output_np = pipe(
+            **self.get_dummy_inputs_by_type(torch_device, input_image_type=input_image_type, output_type="np")
+        )[0]
+        output_pil = pipe(
+            **self.get_dummy_inputs_by_type(torch_device, input_image_type=input_image_type, output_type="pil")
+        )[0]
+
+        max_diff = np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max()
+        self.assertLess(
+            max_diff, expected_max_diff, "`output_type=='pt'` generate different results from `output_type=='np'`"
+        )
+
+        max_diff = np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max()
+        self.assertLess(max_diff, 2.0, "`output_type=='pil'` generate different results from `output_type=='np'`")
+
+    def test_pt_np_pil_inputs_equivalent(self):
+        if len(self.image_params) == 0:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        out_input_pt = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pt"))[0]
+        out_input_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
+        out_input_pil = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pil"))[0]
+
+        max_diff = np.abs(out_input_pt - out_input_np).max()
+        self.assertLess(max_diff, 1e-4, "`input_type=='pt'` generate different result from `input_type=='np'`")
+        max_diff = np.abs(out_input_pil - out_input_np).max()
+        self.assertLess(max_diff, 1e-2, "`input_type=='pt'` generate different result from `input_type=='np'`")
+
+    def test_latents_input(self):
+        if len(self.image_latents_params) == 0:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        out = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pt"))[0]
+
+        vae = components["vae"]
+        inputs = self.get_dummy_inputs_by_type(torch_device, input_image_type="pt")
+        generator = inputs["generator"]
+        for image_param in self.image_latents_params:
+            if image_param in inputs.keys():
+                inputs[image_param] = (
+                    vae.encode(inputs[image_param]).latent_dist.sample(generator) * vae.config.scaling_factor
+                )
+        out_latents_inputs = pipe(**inputs)[0]
+
+        max_diff = np.abs(out - out_latents_inputs).max()
+        self.assertLess(max_diff, 1e-4, "passing latents as image input generate different result from passing image")
+
+    def test_multi_vae(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        block_out_channels = pipe.vae.config.block_out_channels
+        norm_num_groups = pipe.vae.config.norm_num_groups
+
+        vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny]
+        configs = [
+            get_autoencoder_kl_config(block_out_channels, norm_num_groups),
+            get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups),
+            get_consistency_vae_config(block_out_channels, norm_num_groups),
+            get_autoencoder_tiny_config(block_out_channels),
+        ]
+
+        out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
+
+        for vae_cls, config in zip(vae_classes, configs):
+            vae = vae_cls(**config)
+            vae = vae.to(torch_device)
+            components["vae"] = vae
+            vae_pipe = self.pipeline_class(**components)
+            out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
+
+            assert out_vae_np.shape == out_np.shape
+
+
+@require_torch
+class PipelineFromPipeTesterMixin:
+    @property
+    def original_pipeline_class(self):
+        if "xl" in self.pipeline_class.__name__.lower():
+            original_pipeline_class = StableDiffusionXLPipeline
+        elif "kolors" in self.pipeline_class.__name__.lower():
+            original_pipeline_class = KolorsPipeline
+        else:
+            original_pipeline_class = StableDiffusionPipeline
+
+        return original_pipeline_class
+
+    def get_dummy_inputs_pipe(self, device, seed=0):
+        inputs = self.get_dummy_inputs(device, seed=seed)
+        inputs["output_type"] = "np"
+        inputs["return_dict"] = False
+        return inputs
+
+    def get_dummy_inputs_for_pipe_original(self, device, seed=0):
+        inputs = {}
+        for k, v in self.get_dummy_inputs_pipe(device, seed=seed).items():
+            if k in set(inspect.signature(self.original_pipeline_class.__call__).parameters.keys()):
+                inputs[k] = v
+        return inputs
+
+    def test_from_pipe_consistent_config(self):
+        if self.original_pipeline_class == StableDiffusionPipeline:
+            original_repo = "hf-internal-testing/tiny-stable-diffusion-pipe"
+            original_kwargs = {"requires_safety_checker": False}
+        elif self.original_pipeline_class == StableDiffusionXLPipeline:
+            original_repo = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
+            original_kwargs = {"requires_aesthetics_score": True, "force_zeros_for_empty_prompt": False}
+        elif self.original_pipeline_class == KolorsPipeline:
+            original_repo = "hf-internal-testing/tiny-kolors-pipe"
+            original_kwargs = {"force_zeros_for_empty_prompt": False}
+        else:
+            raise ValueError(
+                "original_pipeline_class must be either StableDiffusionPipeline or StableDiffusionXLPipeline"
+            )
+
+        # create original_pipeline_class(sd/sdxl)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd/sdxl) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd/sdxl)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
+    def test_from_pipe_consistent_forward_pass(self, expected_max_diff=1e-3):
+        components = self.get_dummy_components()
+        original_expected_modules, _ = self.original_pipeline_class._get_signature_keys(self.original_pipeline_class)
+
+        # pipeline components that are also expected to be in the original pipeline
+        original_pipe_components = {}
+        # additional components that are not in the pipeline, but expected in the original pipeline
+        original_pipe_additional_components = {}
+        # additional components that are in the pipeline, but not expected in the original pipeline
+        current_pipe_additional_components = {}
+
+        for name, component in components.items():
+            if name in original_expected_modules:
+                original_pipe_components[name] = component
+            else:
+                current_pipe_additional_components[name] = component
+        for name in original_expected_modules:
+            if name not in original_pipe_components:
+                if name in self.original_pipeline_class._optional_components:
+                    original_pipe_additional_components[name] = None
+                else:
+                    raise ValueError(f"missing required module for {self.original_pipeline_class.__class__}: {name}")
+
+        pipe_original = self.original_pipeline_class(**original_pipe_components, **original_pipe_additional_components)
+        for component in pipe_original.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe_original.to(torch_device)
+        pipe_original.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs_for_pipe_original(torch_device)
+        output_original = pipe_original(**inputs)[0]
+
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs_pipe(torch_device)
+        output = pipe(**inputs)[0]
+
+        pipe_from_original = self.pipeline_class.from_pipe(pipe_original, **current_pipe_additional_components)
+        pipe_from_original.to(torch_device)
+        pipe_from_original.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs_pipe(torch_device)
+        output_from_original = pipe_from_original(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_from_original)).max()
+        self.assertLess(
+            max_diff,
+            expected_max_diff,
+            "The outputs of the pipelines created with `from_pipe` and `__init__` are different.",
+        )
+
+        inputs = self.get_dummy_inputs_for_pipe_original(torch_device)
+        output_original_2 = pipe_original(**inputs)[0]
+
+        max_diff = np.abs(to_np(output_original) - to_np(output_original_2)).max()
+        self.assertLess(max_diff, expected_max_diff, "`from_pipe` should not change the output of original pipeline.")
+
+        for component in pipe_original.components.values():
+            if hasattr(component, "attn_processors"):
+                assert all(
+                    type(proc) == AttnProcessor for proc in component.attn_processors.values()
+                ), "`from_pipe` changed the attention processor in original pipeline."
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
+    )
+    def test_from_pipe_consistent_forward_pass_cpu_offload(self, expected_max_diff=1e-3):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs_pipe(torch_device)
+        output = pipe(**inputs)[0]
+
+        original_expected_modules, _ = self.original_pipeline_class._get_signature_keys(self.original_pipeline_class)
+        # pipeline components that are also expected to be in the original pipeline
+        original_pipe_components = {}
+        # additional components that are not in the pipeline, but expected in the original pipeline
+        original_pipe_additional_components = {}
+        # additional components that are in the pipeline, but not expected in the original pipeline
+        current_pipe_additional_components = {}
+        for name, component in components.items():
+            if name in original_expected_modules:
+                original_pipe_components[name] = component
+            else:
+                current_pipe_additional_components[name] = component
+        for name in original_expected_modules:
+            if name not in original_pipe_components:
+                if name in self.original_pipeline_class._optional_components:
+                    original_pipe_additional_components[name] = None
+                else:
+                    raise ValueError(f"missing required module for {self.original_pipeline_class.__class__}: {name}")
+
+        pipe_original = self.original_pipeline_class(**original_pipe_components, **original_pipe_additional_components)
+        for component in pipe_original.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe_original.set_progress_bar_config(disable=None)
+
+        pipe_from_original = self.pipeline_class.from_pipe(pipe_original, **current_pipe_additional_components)
+        for component in pipe_from_original.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe_from_original.enable_model_cpu_offload()
+        pipe_from_original.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs_pipe(torch_device)
+        output_from_original = pipe_from_original(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_from_original)).max()
+        self.assertLess(
+            max_diff,
+            expected_max_diff,
+            "The outputs of the pipelines created with `from_pipe` and `__init__` are different.",
+        )
+
+
+@require_torch
+class PipelineKarrasSchedulerTesterMixin:
+    """
+    This mixin is designed to be used with unittest.TestCase classes.
+    It provides a set of common tests for each PyTorch pipeline that makes use of KarrasDiffusionSchedulers
+    equivalence of dict and tuple outputs, etc.
+    """
+
+    def test_karras_schedulers_shape(
+        self, num_inference_steps_for_strength=4, num_inference_steps_for_strength_for_iterations=5
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        # make sure that PNDM does not need warm-up
+        pipe.scheduler.register_to_config(skip_prk_steps=True)
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["num_inference_steps"] = 2
+
+        if "strength" in inputs:
+            inputs["num_inference_steps"] = num_inference_steps_for_strength
+            inputs["strength"] = 0.5
+
+        outputs = []
+        for scheduler_enum in KarrasDiffusionSchedulers:
+            if "KDPM2" in scheduler_enum.name:
+                inputs["num_inference_steps"] = num_inference_steps_for_strength_for_iterations
+
+            scheduler_cls = getattr(diffusers, scheduler_enum.name)
+            pipe.scheduler = scheduler_cls.from_config(pipe.scheduler.config)
+            output = pipe(**inputs)[0]
+            outputs.append(output)
+
+            if "KDPM2" in scheduler_enum.name:
+                inputs["num_inference_steps"] = 2
+
+        assert check_same_shape(outputs)
+
+
+@require_torch
+class PipelineTesterMixin:
+    """
+    This mixin is designed to be used with unittest.TestCase classes.
+    It provides a set of common tests for each PyTorch pipeline, e.g. saving and loading the pipeline,
+    equivalence of dict and tuple outputs, etc.
+    """
+
+    # Canonical parameters that are passed to `__call__` regardless
+    # of the type of pipeline. They are always optional and have common
+    # sense default values.
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "num_images_per_prompt",
+            "generator",
+            "latents",
+            "output_type",
+            "return_dict",
+        ]
+    )
+
+    # set these parameters to False in the child class if the pipeline does not support the corresponding functionality
+    test_attention_slicing = True
+
+    test_xformers_attention = True
+
+    def get_generator(self, seed):
+        device = torch_device if torch_device != "mps" else "cpu"
+        generator = torch.Generator(device).manual_seed(seed)
+        return generator
+
+    @property
+    def pipeline_class(self) -> Union[Callable, DiffusionPipeline]:
+        raise NotImplementedError(
+            "You need to set the attribute `pipeline_class = ClassNameOfPipeline` in the child test class. "
+            "See existing pipeline tests for reference."
+        )
+
+    def get_dummy_components(self):
+        raise NotImplementedError(
+            "You need to implement `get_dummy_components(self)` in the child test class. "
+            "See existing pipeline tests for reference."
+        )
+
+    def get_dummy_inputs(self, device, seed=0):
+        raise NotImplementedError(
+            "You need to implement `get_dummy_inputs(self, device, seed)` in the child test class. "
+            "See existing pipeline tests for reference."
+        )
+
+    @property
+    def params(self) -> frozenset:
+        raise NotImplementedError(
+            "You need to set the attribute `params` in the child test class. "
+            "`params` are checked for if all values are present in `__call__`'s signature."
+            " You can set `params` using one of the common set of parameters defined in `pipeline_params.py`"
+            " e.g., `TEXT_TO_IMAGE_PARAMS` defines the common parameters used in text to  "
+            "image pipelines, including prompts and prompt embedding overrides."
+            "If your pipeline's set of arguments has minor changes from one of the common sets of arguments, "
+            "do not make modifications to the existing common sets of arguments. I.e. a text to image pipeline "
+            "with non-configurable height and width arguments should set the attribute as "
+            "`params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. "
+            "See existing pipeline tests for reference."
+        )
+
+    @property
+    def batch_params(self) -> frozenset:
+        raise NotImplementedError(
+            "You need to set the attribute `batch_params` in the child test class. "
+            "`batch_params` are the parameters required to be batched when passed to the pipeline's "
+            "`__call__` method. `pipeline_params.py` provides some common sets of parameters such as "
+            "`TEXT_TO_IMAGE_BATCH_PARAMS`, `IMAGE_VARIATION_BATCH_PARAMS`, etc... If your pipeline's "
+            "set of batch arguments has minor changes from one of the common sets of batch arguments, "
+            "do not make modifications to the existing common sets of batch arguments. I.e. a text to "
+            "image pipeline `negative_prompt` is not batched should set the attribute as "
+            "`batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - {'negative_prompt'}`. "
+            "See existing pipeline tests for reference."
+        )
+
+    @property
+    def callback_cfg_params(self) -> frozenset:
+        raise NotImplementedError(
+            "You need to set the attribute `callback_cfg_params` in the child test class that requires to run test_callback_cfg. "
+            "`callback_cfg_params` are the parameters that needs to be passed to the pipeline's callback "
+            "function when dynamically adjusting `guidance_scale`. They are variables that require special"
+            "treatment when `do_classifier_free_guidance` is `True`. `pipeline_params.py` provides some common"
+            " sets of parameters such as `TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS`. If your pipeline's "
+            "set of cfg arguments has minor changes from one of the common sets of cfg arguments, "
+            "do not make modifications to the existing common sets of cfg arguments. I.e. for inpaint pipeline, you "
+            " need to adjust batch size of `mask` and `masked_image_latents` so should set the attribute as"
+            "`callback_cfg_params = TEXT_TO_IMAGE_CFG_PARAMS.union({'mask', 'masked_image_latents'})`"
+        )
+
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test in case of CUDA runtime errors
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_save_load_local(self, expected_max_difference=5e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs)[0]
+
+        logger = logging.get_logger("diffusers.pipelines.pipeline_utils")
+        logger.setLevel(diffusers.logging.INFO)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=False)
+
+            with CaptureLogger(logger) as cap_logger:
+                pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
+
+            for name in pipe_loaded.components.keys():
+                if name not in pipe_loaded._optional_components:
+                    assert name in str(cap_logger)
+
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, expected_max_difference)
+
+    def test_pipeline_call_signature(self):
+        self.assertTrue(
+            hasattr(self.pipeline_class, "__call__"), f"{self.pipeline_class} should have a `__call__` method"
+        )
+
+        parameters = inspect.signature(self.pipeline_class.__call__).parameters
+
+        optional_parameters = set()
+
+        for k, v in parameters.items():
+            if v.default != inspect._empty:
+                optional_parameters.add(k)
+
+        parameters = set(parameters.keys())
+        parameters.remove("self")
+        parameters.discard("kwargs")  # kwargs can be added if arguments of pipeline call function are deprecated
+
+        remaining_required_parameters = set()
+
+        for param in self.params:
+            if param not in parameters:
+                remaining_required_parameters.add(param)
+
+        self.assertTrue(
+            len(remaining_required_parameters) == 0,
+            f"Required parameters not present: {remaining_required_parameters}",
+        )
+
+        remaining_required_optional_parameters = set()
+
+        for param in self.required_optional_params:
+            if param not in optional_parameters:
+                remaining_required_optional_parameters.add(param)
+
+        self.assertTrue(
+            len(remaining_required_optional_parameters) == 0,
+            f"Required optional parameters not present: {remaining_required_optional_parameters}",
+        )
+
+    def test_inference_batch_consistent(self, batch_sizes=[2]):
+        self._test_inference_batch_consistent(batch_sizes=batch_sizes)
+
+    def _test_inference_batch_consistent(
+        self, batch_sizes=[2], additional_params_copy_to_batched_inputs=["num_inference_steps"], batch_generator=True
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # prepare batched inputs
+        batched_inputs = []
+        for batch_size in batch_sizes:
+            batched_input = {}
+            batched_input.update(inputs)
+
+            for name in self.batch_params:
+                if name not in inputs:
+                    continue
+
+                value = inputs[name]
+                if name == "prompt":
+                    len_prompt = len(value)
+                    # make unequal batch sizes
+                    batched_input[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+
+                    # make last batch super long
+                    batched_input[name][-1] = 100 * "very long"
+
+                else:
+                    batched_input[name] = batch_size * [value]
+
+            if batch_generator and "generator" in inputs:
+                batched_input["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+            if "batch_size" in inputs:
+                batched_input["batch_size"] = batch_size
+
+            batched_inputs.append(batched_input)
+
+        logger.setLevel(level=diffusers.logging.WARNING)
+        for batch_size, batched_input in zip(batch_sizes, batched_inputs):
+            output = pipe(**batched_input)
+            assert len(output[0]) == batch_size
+
+    def test_inference_batch_single_identical(self, batch_size=3, expected_max_diff=1e-4):
+        self._test_inference_batch_single_identical(batch_size=batch_size, expected_max_diff=expected_max_diff)
+
+    def _test_inference_batch_single_identical(
+        self,
+        batch_size=2,
+        expected_max_diff=1e-4,
+        additional_params_copy_to_batched_inputs=["num_inference_steps"],
+    ):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for components in pipe.components.values():
+            if hasattr(components, "set_default_attn_processor"):
+                components.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        batched_inputs = {}
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
+
+        output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
+
+        assert output_batch[0].shape[0] == batch_size
+
+        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+        assert max_diff < expected_max_diff
+
+    def test_dict_tuple_outputs_equivalent(self, expected_slice=None, expected_max_difference=1e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        if expected_slice is None:
+            output = pipe(**self.get_dummy_inputs(generator_device))[0]
+        else:
+            output = expected_slice
+
+        output_tuple = pipe(**self.get_dummy_inputs(generator_device), return_dict=False)[0]
+
+        if expected_slice is None:
+            max_diff = np.abs(to_np(output) - to_np(output_tuple)).max()
+        else:
+            if output_tuple.ndim != 5:
+                max_diff = np.abs(to_np(output) - to_np(output_tuple)[0, -3:, -3:, -1].flatten()).max()
+            else:
+                max_diff = np.abs(to_np(output) - to_np(output_tuple)[0, -3:, -3:, -1, -1].flatten()).max()
+
+        self.assertLess(max_diff, expected_max_difference)
+
+    def test_components_function(self):
+        init_components = self.get_dummy_components()
+        init_components = {k: v for k, v in init_components.items() if not isinstance(v, (str, int, float))}
+
+        pipe = self.pipeline_class(**init_components)
+
+        self.assertTrue(hasattr(pipe, "components"))
+        self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_float16_inference(self, expected_max_diff=5e-2):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        components = self.get_dummy_components()
+        pipe_fp16 = self.pipeline_class(**components)
+        for component in pipe_fp16.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe_fp16.to(torch_device, torch.float16)
+        pipe_fp16.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is used inside dummy inputs
+        if "generator" in inputs:
+            inputs["generator"] = self.get_generator(0)
+
+        output = pipe(**inputs)[0]
+
+        fp16_inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is used inside dummy inputs
+        if "generator" in fp16_inputs:
+            fp16_inputs["generator"] = self.get_generator(0)
+
+        output_fp16 = pipe_fp16(**fp16_inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_fp16)).max()
+        self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.")
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_save_load_float16(self, expected_max_diff=1e-2):
+        components = self.get_dummy_components()
+        for name, module in components.items():
+            if hasattr(module, "half"):
+                components[name] = module.to(torch_device).half()
+
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16)
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for name, component in pipe_loaded.components.items():
+            if hasattr(component, "dtype"):
+                self.assertTrue(
+                    component.dtype == torch.float16,
+                    f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.",
+                )
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_loaded = pipe_loaded(**inputs)[0]
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(
+            max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
+        )
+
+    def test_save_load_optional_components(self, expected_max_difference=1e-4):
+        if not hasattr(self.pipeline_class, "_optional_components"):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # set all optional components to None
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=False)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(generator_device)
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, expected_max_difference)
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        model_devices = [component.device.type for component in components.values() if hasattr(component, "device")]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [component.device.type for component in components.values() if hasattr(component, "device")]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+        pipe.to(dtype=torch.float16)
+        model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")]
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+    def test_attention_slicing_forward_pass(self, expected_max_diff=1e-3):
+        self._test_attention_slicing_forward_pass(expected_max_diff=expected_max_diff)
+
+    def _test_attention_slicing_forward_pass(
+        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
+    ):
+        if not self.test_attention_slicing:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_slicing = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=1)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing1 = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=2)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing2 = pipe(**inputs)[0]
+
+        if test_max_difference:
+            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
+            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
+            self.assertLess(
+                max(max_diff1, max_diff2),
+                expected_max_diff,
+                "Attention slicing should not affect the inference results",
+            )
+
+        if test_mean_pixel_difference:
+            assert_mean_pixel_difference(to_np(output_with_slicing1[0]), to_np(output_without_slicing[0]))
+            assert_mean_pixel_difference(to_np(output_with_slicing2[0]), to_np(output_without_slicing[0]))
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
+    )
+    def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
+        import accelerate
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_offload = pipe(**inputs)[0]
+
+        pipe.enable_sequential_cpu_offload()
+        assert pipe._execution_device.type == "cuda"
+
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_offload = pipe(**inputs)[0]
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+
+        # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
+        offloaded_modules = {
+            k: v
+            for k, v in pipe.components.items()
+            if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
+        }
+        # 1. all offloaded modules should be saved to cpu and moved to meta device
+        self.assertTrue(
+            all(v.device.type == "meta" for v in offloaded_modules.values()),
+            f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'meta']}",
+        )
+        # 2. all offloaded modules should have hook installed
+        self.assertTrue(
+            all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+            f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+        )
+        # 3. all offloaded modules should have correct hooks installed, should be either one of these two
+        #    - `AlignDevicesHook`
+        #    - a SequentialHook` that contains `AlignDevicesHook`
+        offloaded_modules_with_incorrect_hooks = {}
+        for k, v in offloaded_modules.items():
+            if hasattr(v, "_hf_hook"):
+                if isinstance(v._hf_hook, accelerate.hooks.SequentialHook):
+                    # if it is a `SequentialHook`, we loop through its `hooks` attribute to check if it only contains `AlignDevicesHook`
+                    for hook in v._hf_hook.hooks:
+                        if not isinstance(hook, accelerate.hooks.AlignDevicesHook):
+                            offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook.hooks[0])
+                elif not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
+                    offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+
+        self.assertTrue(
+            len(offloaded_modules_with_incorrect_hooks) == 0,
+            f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}",
+        )
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
+    )
+    def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
+        import accelerate
+
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_offload = pipe(**inputs)[0]
+
+        pipe.enable_model_cpu_offload()
+        assert pipe._execution_device.type == "cuda"
+
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_offload = pipe(**inputs)[0]
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+
+        # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
+        offloaded_modules = {
+            k: v
+            for k, v in pipe.components.items()
+            if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
+        }
+        # 1. check if all offloaded modules are saved to cpu
+        self.assertTrue(
+            all(v.device.type == "cpu" for v in offloaded_modules.values()),
+            f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'cpu']}",
+        )
+        # 2. check if all offloaded modules have hooks installed
+        self.assertTrue(
+            all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+            f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+        )
+        # 3. check if all offloaded modules have correct type of hooks installed, should be `CpuOffload`
+        offloaded_modules_with_incorrect_hooks = {}
+        for k, v in offloaded_modules.items():
+            if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload):
+                offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+
+        self.assertTrue(
+            len(offloaded_modules_with_incorrect_hooks) == 0,
+            f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}",
+        )
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
+    )
+    def test_cpu_offload_forward_pass_twice(self, expected_max_diff=2e-4):
+        import accelerate
+
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.enable_model_cpu_offload()
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_offload = pipe(**inputs)[0]
+
+        pipe.enable_model_cpu_offload()
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_offload_twice = pipe(**inputs)[0]
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_with_offload_twice)).max()
+        self.assertLess(
+            max_diff, expected_max_diff, "running CPU offloading 2nd time should not affect the inference results"
+        )
+
+        # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
+        offloaded_modules = {
+            k: v
+            for k, v in pipe.components.items()
+            if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
+        }
+        # 1. check if all offloaded modules are saved to cpu
+        self.assertTrue(
+            all(v.device.type == "cpu" for v in offloaded_modules.values()),
+            f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'cpu']}",
+        )
+        # 2. check if all offloaded modules have hooks installed
+        self.assertTrue(
+            all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+            f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+        )
+        # 3. check if all offloaded modules have correct type of hooks installed, should be `CpuOffload`
+        offloaded_modules_with_incorrect_hooks = {}
+        for k, v in offloaded_modules.items():
+            if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload):
+                offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+
+        self.assertTrue(
+            len(offloaded_modules_with_incorrect_hooks) == 0,
+            f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}",
+        )
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
+    )
+    def test_sequential_offload_forward_pass_twice(self, expected_max_diff=2e-4):
+        import accelerate
+
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.enable_sequential_cpu_offload()
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_offload = pipe(**inputs)[0]
+
+        pipe.enable_sequential_cpu_offload()
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_offload_twice = pipe(**inputs)[0]
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_with_offload_twice)).max()
+        self.assertLess(
+            max_diff, expected_max_diff, "running sequential offloading second time should have the inference results"
+        )
+
+        # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
+        offloaded_modules = {
+            k: v
+            for k, v in pipe.components.items()
+            if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
+        }
+        # 1. check if all offloaded modules are moved to meta device
+        self.assertTrue(
+            all(v.device.type == "meta" for v in offloaded_modules.values()),
+            f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'meta']}",
+        )
+        # 2. check if all offloaded modules have hook installed
+        self.assertTrue(
+            all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+            f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+        )
+        # 3. check if all offloaded modules have correct hooks installed, should be either one of these two
+        #    - `AlignDevicesHook`
+        #    - a SequentialHook` that contains `AlignDevicesHook`
+        offloaded_modules_with_incorrect_hooks = {}
+        for k, v in offloaded_modules.items():
+            if hasattr(v, "_hf_hook"):
+                if isinstance(v._hf_hook, accelerate.hooks.SequentialHook):
+                    # if it is a `SequentialHook`, we loop through its `hooks` attribute to check if it only contains `AlignDevicesHook`
+                    for hook in v._hf_hook.hooks:
+                        if not isinstance(hook, accelerate.hooks.AlignDevicesHook):
+                            offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook.hooks[0])
+                elif not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
+                    offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+
+        self.assertTrue(
+            len(offloaded_modules_with_incorrect_hooks) == 0,
+            f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}",
+        )
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass()
+
+    def _test_xformers_attention_forwardGenerator_pass(
+        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-4
+    ):
+        if not self.test_xformers_attention:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = pipe(**inputs)[0]
+        output_without_offload = (
+            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        )
+
+        pipe.enable_xformers_memory_efficient_attention()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = pipe(**inputs)[0]
+        output_with_offload = (
+            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
+        )
+
+        if test_max_difference:
+            max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+            self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results")
+
+        if test_mean_pixel_difference:
+            assert_mean_pixel_difference(output_with_offload[0], output_without_offload[0])
+
+    def test_num_images_per_prompt(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+
+        if "num_images_per_prompt" not in sig.parameters:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        batch_sizes = [1, 2]
+        num_images_per_prompts = [1, 2]
+
+        for batch_size in batch_sizes:
+            for num_images_per_prompt in num_images_per_prompts:
+                inputs = self.get_dummy_inputs(torch_device)
+
+                for key in inputs.keys():
+                    if key in self.batch_params:
+                        inputs[key] = batch_size * [inputs[key]]
+
+                images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0]
+
+                assert images.shape[0] == batch_size * num_images_per_prompt
+
+    def test_cfg(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+
+        if "guidance_scale" not in sig.parameters:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        inputs["guidance_scale"] = 1.0
+        out_no_cfg = pipe(**inputs)[0]
+
+        inputs["guidance_scale"] = 7.5
+        out_cfg = pipe(**inputs)[0]
+
+        assert out_cfg.shape == out_no_cfg.shape
+
+    def test_callback_inputs(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
+        has_callback_step_end = "callback_on_step_end" in sig.parameters
+
+        if not (has_callback_tensor_inputs and has_callback_step_end):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_subset(pipe, i, t, callback_kwargs):
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        def callback_inputs_all(pipe, i, t, callback_kwargs):
+            for tensor_name in pipe._callback_tensor_inputs:
+                assert tensor_name in callback_kwargs
+
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        # Test passing in a subset
+        inputs["callback_on_step_end"] = callback_inputs_subset
+        inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
+        inputs["output_type"] = "latent"
+        output = pipe(**inputs)[0]
+
+        # Test passing in a everything
+        inputs["callback_on_step_end"] = callback_inputs_all
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        inputs["output_type"] = "latent"
+        output = pipe(**inputs)[0]
+
+        def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
+            is_last = i == (pipe.num_timesteps - 1)
+            if is_last:
+                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
+            return callback_kwargs
+
+        inputs["callback_on_step_end"] = callback_inputs_change_tensor
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        inputs["output_type"] = "latent"
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() == 0
+
+    def test_callback_cfg(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
+        has_callback_step_end = "callback_on_step_end" in sig.parameters
+
+        if not (has_callback_tensor_inputs and has_callback_step_end):
+            return
+
+        if "guidance_scale" not in sig.parameters:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_increase_guidance(pipe, i, t, callback_kwargs):
+            pipe._guidance_scale += 1.0
+
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        # use cfg guidance because some pipelines modify the shape of the latents
+        # outside of the denoising loop
+        inputs["guidance_scale"] = 2.0
+        inputs["callback_on_step_end"] = callback_increase_guidance
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        _ = pipe(**inputs)[0]
+
+        # we increase the guidance scale by 1.0 at every step
+        # check that the guidance scale is increased by the number of scheduler timesteps
+        # accounts for models that modify the number of inference steps based on strength
+        assert pipe.guidance_scale == (inputs["guidance_scale"] + pipe.num_timesteps)
+
+    def test_StableDiffusionMixin_component(self):
+        """Any pipeline that have LDMFuncMixin should have vae and unet components."""
+        if not issubclass(self.pipeline_class, StableDiffusionMixin):
+            return
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        self.assertTrue(hasattr(pipe, "vae") and isinstance(pipe.vae, (AutoencoderKL, AutoencoderTiny)))
+        self.assertTrue(
+            hasattr(pipe, "unet")
+            and isinstance(
+                pipe.unet,
+                (UNet2DConditionModel, UNet3DConditionModel, I2VGenXLUNet, UNetMotionModel, UNetControlNetXSModel),
+            )
+        )
+
+
+@is_staging_test
+class PipelinePushToHubTester(unittest.TestCase):
+    identifier = uuid.uuid4()
+    repo_id = f"test-pipeline-{identifier}"
+    org_repo_id = f"valid_org/{repo_id}-org"
+
+    def get_pipeline_components(self):
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dummy_vocab = {"<|startoftext|>": 0, "<|endoftext|>": 1, "!": 2}
+            vocab_path = os.path.join(tmpdir, "vocab.json")
+            with open(vocab_path, "w") as f:
+                json.dump(dummy_vocab, f)
+
+            merges = "Ġ t\nĠt h"
+            merges_path = os.path.join(tmpdir, "merges.txt")
+            with open(merges_path, "w") as f:
+                f.writelines(merges)
+            tokenizer = CLIPTokenizer(vocab_file=vocab_path, merges_file=merges_path)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def test_push_to_hub(self):
+        components = self.get_pipeline_components()
+        pipeline = StableDiffusionPipeline(**components)
+        pipeline.push_to_hub(self.repo_id, token=TOKEN)
+
+        new_model = UNet2DConditionModel.from_pretrained(f"{USER}/{self.repo_id}", subfolder="unet")
+        unet = components["unet"]
+        for p1, p2 in zip(unet.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # Reset repo
+        delete_repo(token=TOKEN, repo_id=self.repo_id)
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            pipeline.save_pretrained(tmp_dir, repo_id=self.repo_id, push_to_hub=True, token=TOKEN)
+
+        new_model = UNet2DConditionModel.from_pretrained(f"{USER}/{self.repo_id}", subfolder="unet")
+        for p1, p2 in zip(unet.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # Reset repo
+        delete_repo(self.repo_id, token=TOKEN)
+
+    def test_push_to_hub_in_organization(self):
+        components = self.get_pipeline_components()
+        pipeline = StableDiffusionPipeline(**components)
+        pipeline.push_to_hub(self.org_repo_id, token=TOKEN)
+
+        new_model = UNet2DConditionModel.from_pretrained(self.org_repo_id, subfolder="unet")
+        unet = components["unet"]
+        for p1, p2 in zip(unet.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # Reset repo
+        delete_repo(token=TOKEN, repo_id=self.org_repo_id)
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            pipeline.save_pretrained(tmp_dir, push_to_hub=True, token=TOKEN, repo_id=self.org_repo_id)
+
+        new_model = UNet2DConditionModel.from_pretrained(self.org_repo_id, subfolder="unet")
+        for p1, p2 in zip(unet.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # Reset repo
+        delete_repo(self.org_repo_id, token=TOKEN)
+
+    @unittest.skipIf(
+        not is_jinja_available(),
+        reason="Model card tests cannot be performed without Jinja installed.",
+    )
+    def test_push_to_hub_library_name(self):
+        components = self.get_pipeline_components()
+        pipeline = StableDiffusionPipeline(**components)
+        pipeline.push_to_hub(self.repo_id, token=TOKEN)
+
+        model_card = ModelCard.load(f"{USER}/{self.repo_id}", token=TOKEN).data
+        assert model_card.library_name == "diffusers"
+
+        # Reset repo
+        delete_repo(self.repo_id, token=TOKEN)
+
+
+# For SDXL and its derivative pipelines (such as ControlNet), we have the text encoders
+# and the tokenizers as optional components. So, we need to override the `test_save_load_optional_components()`
+# test for all such pipelines. This requires us to use a custom `encode_prompt()` function.
+class SDXLOptionalComponentsTesterMixin:
+    def encode_prompt(
+        self, tokenizers, text_encoders, prompt: str, num_images_per_prompt: int = 1, negative_prompt: str = None
+    ):
+        device = text_encoders[0].device
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        batch_size = len(prompt)
+
+        prompt_embeds_list = []
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            text_input_ids = text_inputs.input_ids
+
+            prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+            prompt_embeds_list.append(prompt_embeds)
+
+        prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        if negative_prompt is None:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        else:
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            negative_prompt_embeds_list = []
+            for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(uncond_input.input_ids.to(device), output_hidden_states=True)
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # for classifier-free guidance
+        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+        seq_len = negative_prompt_embeds.shape[1]
+
+        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+
+        # for classifier-free guidance
+        negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    def _test_save_load_optional_components(self, expected_max_difference=1e-4):
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        for optional_component in pipe._optional_components:
+            setattr(pipe, optional_component, None)
+
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+
+        tokenizer = components.pop("tokenizer")
+        tokenizer_2 = components.pop("tokenizer_2")
+        text_encoder = components.pop("text_encoder")
+        text_encoder_2 = components.pop("text_encoder_2")
+
+        tokenizers = [tokenizer, tokenizer_2] if tokenizer is not None else [tokenizer_2]
+        text_encoders = [text_encoder, text_encoder_2] if text_encoder is not None else [text_encoder_2]
+        prompt = inputs.pop("prompt")
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(tokenizers, text_encoders, prompt)
+        inputs["prompt_embeds"] = prompt_embeds
+        inputs["negative_prompt_embeds"] = negative_prompt_embeds
+        inputs["pooled_prompt_embeds"] = pooled_prompt_embeds
+        inputs["negative_pooled_prompt_embeds"] = negative_pooled_prompt_embeds
+
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for optional_component in pipe._optional_components:
+            self.assertTrue(
+                getattr(pipe_loaded, optional_component) is None,
+                f"`{optional_component}` did not stay set to None after loading.",
+            )
+
+        inputs = self.get_dummy_inputs(generator_device)
+        _ = inputs.pop("prompt")
+        inputs["prompt_embeds"] = prompt_embeds
+        inputs["negative_prompt_embeds"] = negative_prompt_embeds
+        inputs["pooled_prompt_embeds"] = pooled_prompt_embeds
+        inputs["negative_pooled_prompt_embeds"] = negative_pooled_prompt_embeds
+
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(max_diff, expected_max_difference)
+
+
+# Some models (e.g. unCLIP) are extremely likely to significantly deviate depending on which hardware is used.
+# This helper function is used to check that the image doesn't deviate on average more than 10 pixels from a
+# reference image.
+def assert_mean_pixel_difference(image, expected_image, expected_max_diff=10):
+    image = np.asarray(DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32)
+    expected_image = np.asarray(DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32)
+    avg_diff = np.abs(image - expected_image).mean()
+    assert avg_diff < expected_max_diff, f"Error image deviates {avg_diff} pixels on average"
diff --git a/diffusers/tests/pipelines/test_pipelines_flax.py b/diffusers/tests/pipelines/test_pipelines_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..efd3da4c61275fcc83cfa8feadcdb97c01af08b2
--- /dev/null
+++ b/diffusers/tests/pipelines/test_pipelines_flax.py
@@ -0,0 +1,260 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+import unittest
+
+import numpy as np
+
+from diffusers.utils import is_flax_available
+from diffusers.utils.testing_utils import require_flax, slow
+
+
+if is_flax_available():
+    import jax
+    import jax.numpy as jnp
+    from flax.jax_utils import replicate
+    from flax.training.common_utils import shard
+
+    from diffusers import FlaxDDIMScheduler, FlaxDiffusionPipeline, FlaxStableDiffusionPipeline
+
+
+@require_flax
+class DownloadTests(unittest.TestCase):
+    def test_download_only_pytorch(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # pipeline has Flax weights
+            _ = FlaxDiffusionPipeline.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname
+            )
+
+            all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots"))]
+            files = [item for sublist in all_root_files for item in sublist]
+
+            # None of the downloaded files should be a PyTorch file even if we have some here:
+            # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_pytorch_model.bin
+            assert not any(f.endswith(".bin") for f in files)
+
+
+@slow
+@require_flax
+class FlaxPipelineTests(unittest.TestCase):
+    def test_dummy_all_tpus(self):
+        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
+        )
+
+        prompt = (
+            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
+            " field, close up, split lighting, cinematic"
+        )
+
+        prng_seed = jax.random.PRNGKey(0)
+        num_inference_steps = 4
+
+        num_samples = jax.device_count()
+        prompt = num_samples * [prompt]
+        prompt_ids = pipeline.prepare_inputs(prompt)
+
+        # shard inputs and rng
+        params = replicate(params)
+        prng_seed = jax.random.split(prng_seed, num_samples)
+        prompt_ids = shard(prompt_ids)
+
+        images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
+
+        assert images.shape == (num_samples, 1, 64, 64, 3)
+        if jax.device_count() == 8:
+            assert np.abs(np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 4.1514745) < 1e-3
+            assert np.abs(np.abs(images, dtype=np.float32).sum() - 49947.875) < 5e-1
+
+        images_pil = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
+        assert len(images_pil) == num_samples
+
+    def test_stable_diffusion_v1_4(self):
+        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="flax", safety_checker=None
+        )
+
+        prompt = (
+            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
+            " field, close up, split lighting, cinematic"
+        )
+
+        prng_seed = jax.random.PRNGKey(0)
+        num_inference_steps = 50
+
+        num_samples = jax.device_count()
+        prompt = num_samples * [prompt]
+        prompt_ids = pipeline.prepare_inputs(prompt)
+
+        # shard inputs and rng
+        params = replicate(params)
+        prng_seed = jax.random.split(prng_seed, num_samples)
+        prompt_ids = shard(prompt_ids)
+
+        images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
+
+        assert images.shape == (num_samples, 1, 512, 512, 3)
+        if jax.device_count() == 8:
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.05652401)) < 1e-2
+            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2383808.2)) < 5e-1
+
+    def test_stable_diffusion_v1_4_bfloat_16(self):
+        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", variant="bf16", dtype=jnp.bfloat16, safety_checker=None
+        )
+
+        prompt = (
+            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
+            " field, close up, split lighting, cinematic"
+        )
+
+        prng_seed = jax.random.PRNGKey(0)
+        num_inference_steps = 50
+
+        num_samples = jax.device_count()
+        prompt = num_samples * [prompt]
+        prompt_ids = pipeline.prepare_inputs(prompt)
+
+        # shard inputs and rng
+        params = replicate(params)
+        prng_seed = jax.random.split(prng_seed, num_samples)
+        prompt_ids = shard(prompt_ids)
+
+        images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
+
+        assert images.shape == (num_samples, 1, 512, 512, 3)
+        if jax.device_count() == 8:
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.04003906)) < 5e-2
+            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2373516.75)) < 5e-1
+
+    def test_stable_diffusion_v1_4_bfloat_16_with_safety(self):
+        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", variant="bf16", dtype=jnp.bfloat16
+        )
+
+        prompt = (
+            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
+            " field, close up, split lighting, cinematic"
+        )
+
+        prng_seed = jax.random.PRNGKey(0)
+        num_inference_steps = 50
+
+        num_samples = jax.device_count()
+        prompt = num_samples * [prompt]
+        prompt_ids = pipeline.prepare_inputs(prompt)
+
+        # shard inputs and rng
+        params = replicate(params)
+        prng_seed = jax.random.split(prng_seed, num_samples)
+        prompt_ids = shard(prompt_ids)
+
+        images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
+
+        assert images.shape == (num_samples, 1, 512, 512, 3)
+        if jax.device_count() == 8:
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.04003906)) < 5e-2
+            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2373516.75)) < 5e-1
+
+    def test_stable_diffusion_v1_4_bfloat_16_ddim(self):
+        scheduler = FlaxDDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            set_alpha_to_one=False,
+            steps_offset=1,
+        )
+
+        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            variant="bf16",
+            dtype=jnp.bfloat16,
+            scheduler=scheduler,
+            safety_checker=None,
+        )
+        scheduler_state = scheduler.create_state()
+
+        params["scheduler"] = scheduler_state
+
+        prompt = (
+            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
+            " field, close up, split lighting, cinematic"
+        )
+
+        prng_seed = jax.random.PRNGKey(0)
+        num_inference_steps = 50
+
+        num_samples = jax.device_count()
+        prompt = num_samples * [prompt]
+        prompt_ids = pipeline.prepare_inputs(prompt)
+
+        # shard inputs and rng
+        params = replicate(params)
+        prng_seed = jax.random.split(prng_seed, num_samples)
+        prompt_ids = shard(prompt_ids)
+
+        images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
+
+        assert images.shape == (num_samples, 1, 512, 512, 3)
+        if jax.device_count() == 8:
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.045043945)) < 5e-2
+            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2347693.5)) < 5e-1
+
+    def test_jax_memory_efficient_attention(self):
+        prompt = (
+            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
+            " field, close up, split lighting, cinematic"
+        )
+
+        num_samples = jax.device_count()
+        prompt = num_samples * [prompt]
+        prng_seed = jax.random.split(jax.random.PRNGKey(0), num_samples)
+
+        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            variant="bf16",
+            dtype=jnp.bfloat16,
+            safety_checker=None,
+        )
+
+        params = replicate(params)
+        prompt_ids = pipeline.prepare_inputs(prompt)
+        prompt_ids = shard(prompt_ids)
+        images = pipeline(prompt_ids, params, prng_seed, jit=True).images
+        assert images.shape == (num_samples, 1, 512, 512, 3)
+        slice = images[2, 0, 256, 10:17, 1]
+
+        # With memory efficient attention
+        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            variant="bf16",
+            dtype=jnp.bfloat16,
+            safety_checker=None,
+            use_memory_efficient_attention=True,
+        )
+
+        params = replicate(params)
+        prompt_ids = pipeline.prepare_inputs(prompt)
+        prompt_ids = shard(prompt_ids)
+        images_eff = pipeline(prompt_ids, params, prng_seed, jit=True).images
+        assert images_eff.shape == (num_samples, 1, 512, 512, 3)
+        slice_eff = images[2, 0, 256, 10:17, 1]
+
+        # I checked the results visually and they are very similar. However, I saw that the max diff is `1` and the `sum`
+        # over the 8 images is exactly `256`, which is very suspicious. Testing a random slice for now.
+        assert abs(slice_eff - slice).max() < 1e-2
diff --git a/diffusers/tests/pipelines/test_pipelines_onnx_common.py b/diffusers/tests/pipelines/test_pipelines_onnx_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..575ecd0075318e8ec62ab7cd76bff5b0b1ca82ad
--- /dev/null
+++ b/diffusers/tests/pipelines/test_pipelines_onnx_common.py
@@ -0,0 +1,12 @@
+from diffusers.utils.testing_utils import require_onnxruntime
+
+
+@require_onnxruntime
+class OnnxPipelineTesterMixin:
+    """
+    This mixin is designed to be used with unittest.TestCase classes.
+    It provides a set of common tests for each ONNXRuntime pipeline, e.g. saving and loading the pipeline,
+    equivalence of dict and tuple outputs, etc.
+    """
+
+    pass
diff --git a/diffusers/tests/pipelines/text_to_video_synthesis/__init__.py b/diffusers/tests/pipelines/text_to_video_synthesis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/diffusers/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..bca4fdbfae646d7003bc6a7ed4e224394530ba3e
--- /dev/null
+++ b/diffusers/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
@@ -0,0 +1,222 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoSDPipeline, UNet3DConditionModel
+from diffusers.utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_numpy,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin
+
+
+enable_full_determinism()
+
+
+@skip_mps
+class TextToVideoSDPipelineFastTests(PipelineTesterMixin, SDFunctionTesterMixin, unittest.TestCase):
+    pipeline_class = TextToVideoSDPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    # No `output_type`.
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback",
+            "callback_steps",
+        ]
+    )
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet3DConditionModel(
+            block_out_channels=(8, 8),
+            layers_per_block=1,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
+            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
+            cross_attention_dim=4,
+            attention_head_dim=4,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=(8,),
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=32,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=4,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_dict_tuple_outputs_equivalent(self):
+        return super().test_dict_tuple_outputs_equivalent()
+
+    def test_text_to_video_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = TextToVideoSDPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["output_type"] = "np"
+        frames = sd_pipe(**inputs).frames
+
+        image_slice = frames[0][0][-3:, -3:, -1]
+        assert frames[0][0].shape == (32, 32, 3)
+        expected_slice = np.array([0.8093, 0.2751, 0.6976, 0.5927, 0.4616, 0.4336, 0.5094, 0.5683, 0.4796])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skipIf(torch_device != "cuda", reason="Feature isn't heavily used. Test in CUDA environment only.")
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)
+
+    # (todo): sayakpaul
+    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
+    def test_inference_batch_consistent(self):
+        pass
+
+    # (todo): sayakpaul
+    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
+    def test_inference_batch_single_identical(self):
+        pass
+
+    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
+    def test_num_images_per_prompt(self):
+        pass
+
+
+@slow
+@skip_mps
+@require_torch_gpu
+class TextToVideoSDPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_two_step_model(self):
+        expected_video = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/video_2step.npy"
+        )
+
+        pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
+        pipe = pipe.to(torch_device)
+
+        prompt = "Spiderman is surfing"
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").frames
+        assert numpy_cosine_similarity_distance(expected_video.flatten(), video_frames.flatten()) < 1e-4
+
+    def test_two_step_model_with_freeu(self):
+        expected_video = []
+
+        pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
+        pipe = pipe.to(torch_device)
+
+        prompt = "Spiderman is surfing"
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+        video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").frames
+        video = video_frames[0, 0, -3:, -3:, -1].flatten()
+
+        expected_video = [0.3643, 0.3455, 0.3831, 0.3923, 0.2978, 0.3247, 0.3278, 0.3201, 0.3475]
+
+        assert np.abs(expected_video - video).mean() < 5e-2
diff --git a/diffusers/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero.py b/diffusers/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1bf6ee52206551a12cd2e126369c53ec3086c0a
--- /dev/null
+++ b/diffusers/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero.py
@@ -0,0 +1,55 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import torch
+
+from diffusers import DDIMScheduler, TextToVideoZeroPipeline
+from diffusers.utils.testing_utils import load_pt, nightly, require_torch_gpu
+
+from ..test_pipelines_common import assert_mean_pixel_difference
+
+
+@nightly
+@require_torch_gpu
+class TextToVideoZeroPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_full_model(self):
+        model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+        pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        generator = torch.Generator(device="cuda").manual_seed(0)
+
+        prompt = "A bear is playing a guitar on Times Square"
+        result = pipe(prompt=prompt, generator=generator).images
+
+        expected_result = load_pt(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt"
+        )
+
+        assert_mean_pixel_difference(result, expected_result)
diff --git a/diffusers/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py b/diffusers/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bef0cede154022365694aeca1f957d5b50c965a
--- /dev/null
+++ b/diffusers/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py
@@ -0,0 +1,397 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoZeroSDXLPipeline, UNet2DConditionModel
+from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineFromPipeTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+def to_np(tensor):
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu().numpy()
+
+    return tensor
+
+
+class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase):
+    pipeline_class = TextToVideoZeroSDXLPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    generator_device = "cpu"
+
+    def get_dummy_components(self, seed=0):
+        torch.manual_seed(seed)
+        unet = UNet2DConditionModel(
+            block_out_channels=(2, 4),
+            layers_per_block=2,
+            sample_size=2,
+            norm_num_groups=2,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        scheduler = DDIMScheduler(
+            num_train_timesteps=1000,
+            beta_start=0.0001,
+            beta_end=0.02,
+            beta_schedule="linear",
+            trained_betas=None,
+            clip_sample=True,
+            set_alpha_to_one=True,
+            steps_offset=0,
+            prediction_type="epsilon",
+            thresholding=False,
+            dynamic_thresholding_ratio=0.995,
+            clip_sample_range=1.0,
+            sample_max_value=1.0,
+            timestep_spacing="leading",
+            rescale_betas_zero_snr=False,
+        )
+        torch.manual_seed(seed)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(seed)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "image_encoder": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A panda dancing in Antarctica",
+            "generator": generator,
+            "num_inference_steps": 5,
+            "t0": 1,
+            "t1": 3,
+            "height": 64,
+            "width": 64,
+            "video_length": 3,
+            "output_type": "np",
+        }
+        return inputs
+
+    def get_generator(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        return generator
+
+    def test_text_to_video_zero_sdxl(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+
+        inputs = self.get_dummy_inputs(self.generator_device)
+        result = pipe(**inputs).images
+
+        first_frame_slice = result[0, -3:, -3:, -1]
+        last_frame_slice = result[-1, -3:, -3:, 0]
+
+        expected_slice1 = np.array(
+            [0.6008109, 0.73051643, 0.51778656, 0.55817354, 0.45222935, 0.45998418, 0.57017255, 0.54874814, 0.47078788]
+        )
+        expected_slice2 = np.array(
+            [0.6011751, 0.47420046, 0.41660714, 0.6472957, 0.41261768, 0.5438129, 0.7401535, 0.6756011, 0.53652245]
+        )
+
+        assert np.abs(first_frame_slice.flatten() - expected_slice1).max() < 1e-2
+        assert np.abs(last_frame_slice.flatten() - expected_slice2).max() < 1e-2
+
+    @unittest.skip(
+        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
+    )
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_cfg(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+        if "guidance_scale" not in sig.parameters:
+            return
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(self.generator_device)
+
+        inputs["guidance_scale"] = 1.0
+        out_no_cfg = pipe(**inputs)[0]
+
+        inputs["guidance_scale"] = 7.5
+        out_cfg = pipe(**inputs)[0]
+
+        assert out_cfg.shape == out_no_cfg.shape
+
+    def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(self.generator_device))[0]
+        output_tuple = pipe(**self.get_dummy_inputs(self.generator_device), return_dict=False)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_tuple)).max()
+        self.assertLess(max_diff, expected_max_difference)
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_float16_inference(self, expected_max_diff=5e-2):
+        components = self.get_dummy_components()
+        for name, module in components.items():
+            if hasattr(module, "half"):
+                components[name] = module.to(torch_device).half()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        components = self.get_dummy_components()
+        pipe_fp16 = self.pipeline_class(**components)
+        pipe_fp16.to(torch_device, torch.float16)
+        pipe_fp16.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(self.generator_device)
+        # # Reset generator in case it is used inside dummy inputs
+        if "generator" in inputs:
+            inputs["generator"] = self.get_generator(self.generator_device)
+
+        output = pipe(**inputs)[0]
+
+        fp16_inputs = self.get_dummy_inputs(self.generator_device)
+        # Reset generator in case it is used inside dummy inputs
+        if "generator" in fp16_inputs:
+            fp16_inputs["generator"] = self.get_generator(self.generator_device)
+
+        output_fp16 = pipe_fp16(**fp16_inputs)[0]
+
+        max_diff = np.abs(to_np(output) - to_np(output_fp16)).max()
+        self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.")
+
+    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
+    def test_inference_batch_consistent(self):
+        pass
+
+    @unittest.skip(
+        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
+    )
+    def test_inference_batch_single_identical(self):
+        pass
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
+    )
+    def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(self.generator_device)
+        output_without_offload = pipe(**inputs)[0]
+
+        pipe.enable_model_cpu_offload()
+        inputs = self.get_dummy_inputs(self.generator_device)
+        output_with_offload = pipe(**inputs)[0]
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+
+    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
+    def test_pipeline_call_signature(self):
+        pass
+
+    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
+    def test_save_load_float16(self, expected_max_diff=1e-2):
+        components = self.get_dummy_components()
+        for name, module in components.items():
+            if hasattr(module, "half"):
+                components[name] = module.to(torch_device).half()
+
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(self.generator_device)
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16)
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        for name, component in pipe_loaded.components.items():
+            if hasattr(component, "dtype"):
+                self.assertTrue(
+                    component.dtype == torch.float16,
+                    f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.",
+                )
+
+        inputs = self.get_dummy_inputs(self.generator_device)
+        output_loaded = pipe_loaded(**inputs)[0]
+        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
+        self.assertLess(
+            max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
+        )
+
+    @unittest.skip(
+        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
+    )
+    def test_save_load_local(self):
+        pass
+
+    @unittest.skip(
+        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
+    )
+    def test_save_load_optional_components(self):
+        pass
+
+    @unittest.skip(
+        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
+    )
+    def test_sequential_cpu_offload_forward_pass(self):
+        pass
+
+    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+    def test_to_device(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.to("cpu")
+        model_devices = [component.device.type for component in components.values() if hasattr(component, "device")]
+        self.assertTrue(all(device == "cpu" for device in model_devices))
+
+        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]  # generator set to cpu
+        self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+        pipe.to("cuda")
+        model_devices = [component.device.type for component in components.values() if hasattr(component, "device")]
+        self.assertTrue(all(device == "cuda" for device in model_devices))
+
+        output_cuda = pipe(**self.get_dummy_inputs("cpu"))[0]  # generator set to cpu
+        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+    @unittest.skip(
+        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        pass
+
+
+@nightly
+@require_torch_gpu
+class TextToVideoZeroSDXLPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_full_model(self):
+        model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+        pipe = TextToVideoZeroSDXLPipeline.from_pretrained(
+            model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+        )
+        pipe.enable_model_cpu_offload()
+        pipe.enable_vae_slicing()
+
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        prompt = "A panda dancing in Antarctica"
+        result = pipe(prompt=prompt, generator=generator).images
+
+        first_frame_slice = result[0, -3:, -3:, -1]
+        last_frame_slice = result[-1, -3:, -3:, 0]
+
+        expected_slice1 = np.array([0.57, 0.57, 0.57, 0.57, 0.57, 0.56, 0.55, 0.56, 0.56])
+        expected_slice2 = np.array([0.54, 0.53, 0.53, 0.53, 0.53, 0.52, 0.53, 0.53, 0.53])
+
+        assert np.abs(first_frame_slice.flatten() - expected_slice1).max() < 1e-2
+        assert np.abs(last_frame_slice.flatten() - expected_slice2).max() < 1e-2
diff --git a/diffusers/tests/pipelines/text_to_video_synthesis/test_video_to_video.py b/diffusers/tests/pipelines/text_to_video_synthesis/test_video_to_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..34ccb09e2204640ed3f54d096df1c862d01ddd95
--- /dev/null
+++ b/diffusers/tests/pipelines/text_to_video_synthesis/test_video_to_video.py
@@ -0,0 +1,221 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    UNet3DConditionModel,
+    VideoToVideoSDPipeline,
+)
+from diffusers.utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    is_flaky,
+    nightly,
+    numpy_cosine_similarity_distance,
+    skip_mps,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+@skip_mps
+class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = VideoToVideoSDPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS.union({"video"}) - {"image", "width", "height"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"video"}) - {"image"}
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+    test_attention_slicing = False
+
+    # No `output_type`.
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback",
+            "callback_steps",
+        ]
+    )
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet3DConditionModel(
+            block_out_channels=(4, 8),
+            layers_per_block=1,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
+            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
+            cross_attention_dim=32,
+            attention_head_dim=4,
+            norm_num_groups=2,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=True,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[
+                8,
+            ],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=[
+                "DownEncoderBlock2D",
+            ],
+            up_block_types=["UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=32,
+            norm_num_groups=2,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=512,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        # 3 frames
+        video = floats_tensor((1, 3, 3, 32, 32), rng=random.Random(seed)).to(device)
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "video": video,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_text_to_video_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = VideoToVideoSDPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["output_type"] = "np"
+        frames = sd_pipe(**inputs).frames
+        image_slice = frames[0][0][-3:, -3:, -1]
+
+        assert frames[0][0].shape == (32, 32, 3)
+        expected_slice = np.array([0.6391, 0.5350, 0.5202, 0.5521, 0.5453, 0.5393, 0.6652, 0.5270, 0.5185])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    @is_flaky()
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=0.001)
+
+    @is_flaky()
+    def test_dict_tuple_outputs_equivalent(self):
+        super().test_dict_tuple_outputs_equivalent()
+
+    @is_flaky()
+    def test_save_load_local(self):
+        super().test_save_load_local()
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=5e-3)
+
+    # (todo): sayakpaul
+    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
+    def test_inference_batch_consistent(self):
+        pass
+
+    # (todo): sayakpaul
+    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
+    def test_inference_batch_single_identical(self):
+        pass
+
+    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
+    def test_num_images_per_prompt(self):
+        pass
+
+
+@nightly
+@skip_mps
+class VideoToVideoSDPipelineSlowTests(unittest.TestCase):
+    def test_two_step_model(self):
+        pipe = VideoToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+
+        # 10 frames
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        video = torch.randn((1, 10, 3, 320, 576), generator=generator)
+
+        prompt = "Spiderman is surfing"
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        video_frames = pipe(prompt, video=video, generator=generator, num_inference_steps=3, output_type="np").frames
+
+        expected_array = np.array(
+            [0.17114258, 0.13720703, 0.08886719, 0.14819336, 0.1730957, 0.24584961, 0.22021484, 0.35180664, 0.2607422]
+        )
+        output_array = video_frames[0, 0, :3, :3, 0].flatten()
+        assert numpy_cosine_similarity_distance(expected_array, output_array) < 1e-3
diff --git a/diffusers/tests/pipelines/unclip/__init__.py b/diffusers/tests/pipelines/unclip/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/unclip/test_unclip.py b/diffusers/tests/pipelines/unclip/test_unclip.py
new file mode 100644
index 0000000000000000000000000000000000000000..07590c9db458a21954becb302d8f61feabcddd48
--- /dev/null
+++ b/diffusers/tests/pipelines/unclip/test_unclip.py
@@ -0,0 +1,518 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel
+from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_numpy,
+    nightly,
+    require_torch_gpu,
+    skip_mps,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = UnCLIPPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {
+        "negative_prompt",
+        "height",
+        "width",
+        "negative_prompt_embeds",
+        "guidance_scale",
+        "prompt_embeds",
+        "cross_attention_kwargs",
+    }
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    required_optional_params = [
+        "generator",
+        "return_dict",
+        "prior_num_inference_steps",
+        "decoder_num_inference_steps",
+        "super_res_num_inference_steps",
+    ]
+    test_xformers_attention = False
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config)
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "embedding_dim": self.text_embedder_hidden_size,
+            "num_layers": 1,
+        }
+
+        model = PriorTransformer(**model_kwargs)
+        return model
+
+    @property
+    def dummy_text_proj(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "clip_embeddings_dim": self.text_embedder_hidden_size,
+            "time_embed_dim": self.time_embed_dim,
+            "cross_attention_dim": self.cross_attention_dim,
+        }
+
+        model = UnCLIPTextProjModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_decoder(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "sample_size": 32,
+            # RGB in channels
+            "in_channels": 3,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 6,
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": "identity",
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_super_res_kwargs(self):
+        return {
+            "sample_size": 64,
+            "layers_per_block": 1,
+            "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
+            "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "in_channels": 6,
+            "out_channels": 3,
+        }
+
+    @property
+    def dummy_super_res_first(self):
+        torch.manual_seed(0)
+
+        model = UNet2DModel(**self.dummy_super_res_kwargs)
+        return model
+
+    @property
+    def dummy_super_res_last(self):
+        # seeded differently to get different unet than `self.dummy_super_res_first`
+        torch.manual_seed(1)
+
+        model = UNet2DModel(**self.dummy_super_res_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        decoder = self.dummy_decoder
+        text_proj = self.dummy_text_proj
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        super_res_first = self.dummy_super_res_first
+        super_res_last = self.dummy_super_res_last
+
+        prior_scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="sample",
+            num_train_timesteps=1000,
+            clip_sample_range=5.0,
+        )
+
+        decoder_scheduler = UnCLIPScheduler(
+            variance_type="learned_range",
+            prediction_type="epsilon",
+            num_train_timesteps=1000,
+        )
+
+        super_res_scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="epsilon",
+            num_train_timesteps=1000,
+        )
+
+        components = {
+            "prior": prior,
+            "decoder": decoder,
+            "text_proj": text_proj,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "super_res_first": super_res_first,
+            "super_res_last": super_res_last,
+            "prior_scheduler": prior_scheduler,
+            "decoder_scheduler": decoder_scheduler,
+            "super_res_scheduler": super_res_scheduler,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "generator": generator,
+            "prior_num_inference_steps": 2,
+            "decoder_num_inference_steps": 2,
+            "super_res_num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_unclip(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [
+                0.9997,
+                0.9988,
+                0.0028,
+                0.9997,
+                0.9984,
+                0.9965,
+                0.0029,
+                0.9986,
+                0.0025,
+            ]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_unclip_passed_text_embed(self):
+        device = torch.device("cpu")
+
+        class DummyScheduler:
+            init_noise_sigma = 1
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        prior = components["prior"]
+        decoder = components["decoder"]
+        super_res_first = components["super_res_first"]
+        tokenizer = components["tokenizer"]
+        text_encoder = components["text_encoder"]
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        dtype = prior.dtype
+        batch_size = 1
+
+        shape = (batch_size, prior.config.embedding_dim)
+        prior_latents = pipe.prepare_latents(
+            shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
+        )
+        shape = (batch_size, decoder.config.in_channels, decoder.config.sample_size, decoder.config.sample_size)
+        decoder_latents = pipe.prepare_latents(
+            shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
+        )
+
+        shape = (
+            batch_size,
+            super_res_first.config.in_channels // 2,
+            super_res_first.config.sample_size,
+            super_res_first.config.sample_size,
+        )
+        super_res_latents = pipe.prepare_latents(
+            shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
+        )
+
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "this is a prompt example"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = pipe(
+            [prompt],
+            generator=generator,
+            prior_num_inference_steps=2,
+            decoder_num_inference_steps=2,
+            super_res_num_inference_steps=2,
+            prior_latents=prior_latents,
+            decoder_latents=decoder_latents,
+            super_res_latents=super_res_latents,
+            output_type="np",
+        )
+        image = output.images
+
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_model_output = text_encoder(text_inputs.input_ids)
+        text_attention_mask = text_inputs.attention_mask
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_text = pipe(
+            generator=generator,
+            prior_num_inference_steps=2,
+            decoder_num_inference_steps=2,
+            super_res_num_inference_steps=2,
+            prior_latents=prior_latents,
+            decoder_latents=decoder_latents,
+            super_res_latents=super_res_latents,
+            text_model_output=text_model_output,
+            text_attention_mask=text_attention_mask,
+            output_type="np",
+        )[0]
+
+        # make sure passing text embeddings manually is identical
+        assert np.abs(image - image_from_text).max() < 1e-4
+
+    # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
+    # because UnCLIP GPU undeterminism requires a looser check.
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+
+        self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference, expected_max_diff=0.01)
+
+    # Overriding PipelineTesterMixin::test_inference_batch_single_identical
+    # because UnCLIP undeterminism requires a looser check.
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        additional_params_copy_to_batched_inputs = [
+            "prior_num_inference_steps",
+            "decoder_num_inference_steps",
+            "super_res_num_inference_steps",
+        ]
+
+        self._test_inference_batch_single_identical(
+            additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=5e-3
+        )
+
+    def test_inference_batch_consistent(self):
+        additional_params_copy_to_batched_inputs = [
+            "prior_num_inference_steps",
+            "decoder_num_inference_steps",
+            "super_res_num_inference_steps",
+        ]
+
+        if torch_device == "mps":
+            # TODO: MPS errors with larger batch sizes
+            batch_sizes = [2, 3]
+            self._test_inference_batch_consistent(
+                batch_sizes=batch_sizes,
+                additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs,
+            )
+        else:
+            self._test_inference_batch_consistent(
+                additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs
+            )
+
+    @skip_mps
+    def test_dict_tuple_outputs_equivalent(self):
+        return super().test_dict_tuple_outputs_equivalent()
+
+    @skip_mps
+    def test_save_load_local(self):
+        return super().test_save_load_local(expected_max_difference=5e-3)
+
+    @skip_mps
+    def test_save_load_optional_components(self):
+        return super().test_save_load_optional_components()
+
+    @unittest.skip("UnCLIP produces very large differences in fp16 vs fp32. Test is not useful.")
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1.0)
+
+
+@nightly
+class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_unclip_karlo_cpu_fp32(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/unclip/karlo_v1_alpha_horse_cpu.npy"
+        )
+
+        pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha")
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        output = pipeline(
+            "horse",
+            num_images_per_prompt=1,
+            generator=generator,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (256, 256, 3)
+        assert np.abs(expected_image - image).max() < 1e-1
+
+
+@nightly
+@require_torch_gpu
+class UnCLIPPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_unclip_karlo(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/unclip/karlo_v1_alpha_horse_fp16.npy"
+        )
+
+        pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
+        pipeline = pipeline.to(torch_device)
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipeline(
+            "horse",
+            generator=generator,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (256, 256, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
+
+    def test_unclip_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+        pipe.enable_sequential_cpu_offload()
+
+        _ = pipe(
+            "horse",
+            num_images_per_prompt=1,
+            prior_num_inference_steps=2,
+            decoder_num_inference_steps=2,
+            super_res_num_inference_steps=2,
+            output_type="np",
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 7 GB is allocated
+        assert mem_bytes < 7 * 10**9
diff --git a/diffusers/tests/pipelines/unclip/test_unclip_image_variation.py b/diffusers/tests/pipelines/unclip/test_unclip_image_variation.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfc3acc0c0f2a83e52b22462e542c0962d8e4aed
--- /dev/null
+++ b/diffusers/tests/pipelines/unclip/test_unclip_image_variation.py
@@ -0,0 +1,537 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    DiffusionPipeline,
+    UnCLIPImageVariationPipeline,
+    UnCLIPScheduler,
+    UNet2DConditionModel,
+    UNet2DModel,
+)
+from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    nightly,
+    require_torch_gpu,
+    skip_mps,
+    torch_device,
+)
+
+from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = UnCLIPImageVariationPipeline
+    params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"}
+    batch_params = IMAGE_VARIATION_BATCH_PARAMS
+
+    required_optional_params = [
+        "generator",
+        "return_dict",
+        "decoder_num_inference_steps",
+        "super_res_num_inference_steps",
+    ]
+    test_xformers_attention = False
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config)
+
+    @property
+    def dummy_image_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            image_size=32,
+            intermediate_size=37,
+            patch_size=1,
+        )
+        return CLIPVisionModelWithProjection(config)
+
+    @property
+    def dummy_text_proj(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "clip_embeddings_dim": self.text_embedder_hidden_size,
+            "time_embed_dim": self.time_embed_dim,
+            "cross_attention_dim": self.cross_attention_dim,
+        }
+
+        model = UnCLIPTextProjModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_decoder(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "sample_size": 32,
+            # RGB in channels
+            "in_channels": 3,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 6,
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": "identity",
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_super_res_kwargs(self):
+        return {
+            "sample_size": 64,
+            "layers_per_block": 1,
+            "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
+            "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "in_channels": 6,
+            "out_channels": 3,
+        }
+
+    @property
+    def dummy_super_res_first(self):
+        torch.manual_seed(0)
+
+        model = UNet2DModel(**self.dummy_super_res_kwargs)
+        return model
+
+    @property
+    def dummy_super_res_last(self):
+        # seeded differently to get different unet than `self.dummy_super_res_first`
+        torch.manual_seed(1)
+
+        model = UNet2DModel(**self.dummy_super_res_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        decoder = self.dummy_decoder
+        text_proj = self.dummy_text_proj
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        super_res_first = self.dummy_super_res_first
+        super_res_last = self.dummy_super_res_last
+
+        decoder_scheduler = UnCLIPScheduler(
+            variance_type="learned_range",
+            prediction_type="epsilon",
+            num_train_timesteps=1000,
+        )
+
+        super_res_scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="epsilon",
+            num_train_timesteps=1000,
+        )
+
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
+
+        image_encoder = self.dummy_image_encoder
+
+        return {
+            "decoder": decoder,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_proj": text_proj,
+            "feature_extractor": feature_extractor,
+            "image_encoder": image_encoder,
+            "super_res_first": super_res_first,
+            "super_res_last": super_res_last,
+            "decoder_scheduler": decoder_scheduler,
+            "super_res_scheduler": super_res_scheduler,
+        }
+
+    def get_dummy_inputs(self, device, seed=0, pil_image=True):
+        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        if pil_image:
+            input_image = input_image * 0.5 + 0.5
+            input_image = input_image.clamp(0, 1)
+            input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy()
+            input_image = DiffusionPipeline.numpy_to_pil(input_image)[0]
+
+        return {
+            "image": input_image,
+            "generator": generator,
+            "decoder_num_inference_steps": 2,
+            "super_res_num_inference_steps": 2,
+            "output_type": "np",
+        }
+
+    def test_unclip_image_variation_input_tensor(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)
+
+        output = pipe(**pipeline_inputs)
+        image = output.images
+
+        tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)
+
+        image_from_tuple = pipe(
+            **tuple_pipeline_inputs,
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [
+                0.9997,
+                0.0002,
+                0.9997,
+                0.9997,
+                0.9969,
+                0.0023,
+                0.9997,
+                0.9969,
+                0.9970,
+            ]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_unclip_image_variation_input_image(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)
+
+        output = pipe(**pipeline_inputs)
+        image = output.images
+
+        tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)
+
+        image_from_tuple = pipe(
+            **tuple_pipeline_inputs,
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.9997, 0.0003, 0.9997, 0.9997, 0.9970, 0.0024, 0.9997, 0.9971, 0.9971])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_unclip_image_variation_input_list_images(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)
+        pipeline_inputs["image"] = [
+            pipeline_inputs["image"],
+            pipeline_inputs["image"],
+        ]
+
+        output = pipe(**pipeline_inputs)
+        image = output.images
+
+        tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)
+        tuple_pipeline_inputs["image"] = [
+            tuple_pipeline_inputs["image"],
+            tuple_pipeline_inputs["image"],
+        ]
+
+        image_from_tuple = pipe(
+            **tuple_pipeline_inputs,
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (2, 64, 64, 3)
+
+        expected_slice = np.array(
+            [
+                0.9997,
+                0.9989,
+                0.0008,
+                0.0021,
+                0.9960,
+                0.0018,
+                0.0014,
+                0.0002,
+                0.9933,
+            ]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_unclip_passed_image_embed(self):
+        device = torch.device("cpu")
+
+        class DummyScheduler:
+            init_noise_sigma = 1
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        dtype = pipe.decoder.dtype
+        batch_size = 1
+
+        shape = (
+            batch_size,
+            pipe.decoder.config.in_channels,
+            pipe.decoder.config.sample_size,
+            pipe.decoder.config.sample_size,
+        )
+        decoder_latents = pipe.prepare_latents(
+            shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
+        )
+
+        shape = (
+            batch_size,
+            pipe.super_res_first.config.in_channels // 2,
+            pipe.super_res_first.config.sample_size,
+            pipe.super_res_first.config.sample_size,
+        )
+        super_res_latents = pipe.prepare_latents(
+            shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
+        )
+
+        pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)
+
+        img_out_1 = pipe(
+            **pipeline_inputs, decoder_latents=decoder_latents, super_res_latents=super_res_latents
+        ).images
+
+        pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)
+        # Don't pass image, instead pass embedding
+        image = pipeline_inputs.pop("image")
+        image_embeddings = pipe.image_encoder(image).image_embeds
+
+        img_out_2 = pipe(
+            **pipeline_inputs,
+            decoder_latents=decoder_latents,
+            super_res_latents=super_res_latents,
+            image_embeddings=image_embeddings,
+        ).images
+
+        # make sure passing text embeddings manually is identical
+        assert np.abs(img_out_1 - img_out_2).max() < 1e-4
+
+    # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
+    # because UnCLIP GPU undeterminism requires a looser check.
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+
+        # Check is relaxed because there is not a torch 2.0 sliced attention added kv processor
+        expected_max_diff = 1e-2
+
+        self._test_attention_slicing_forward_pass(
+            test_max_difference=test_max_difference, expected_max_diff=expected_max_diff
+        )
+
+    # Overriding PipelineTesterMixin::test_inference_batch_single_identical
+    # because UnCLIP undeterminism requires a looser check.
+    @unittest.skip("UnCLIP produces very large differences. Test is not useful.")
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        additional_params_copy_to_batched_inputs = [
+            "decoder_num_inference_steps",
+            "super_res_num_inference_steps",
+        ]
+        self._test_inference_batch_single_identical(
+            additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=5e-3
+        )
+
+    def test_inference_batch_consistent(self):
+        additional_params_copy_to_batched_inputs = [
+            "decoder_num_inference_steps",
+            "super_res_num_inference_steps",
+        ]
+
+        if torch_device == "mps":
+            # TODO: MPS errors with larger batch sizes
+            batch_sizes = [2, 3]
+            self._test_inference_batch_consistent(
+                batch_sizes=batch_sizes,
+                additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs,
+            )
+        else:
+            self._test_inference_batch_consistent(
+                additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs
+            )
+
+    @skip_mps
+    def test_dict_tuple_outputs_equivalent(self):
+        return super().test_dict_tuple_outputs_equivalent()
+
+    @unittest.skip("UnCLIP produces very large difference. Test is not useful.")
+    @skip_mps
+    def test_save_load_local(self):
+        return super().test_save_load_local(expected_max_difference=4e-3)
+
+    @skip_mps
+    def test_save_load_optional_components(self):
+        return super().test_save_load_optional_components()
+
+    @unittest.skip("UnCLIP produces very large difference in fp16 vs fp32. Test is not useful.")
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1.0)
+
+
+@nightly
+@require_torch_gpu
+class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_unclip_image_variation_karlo(self):
+        input_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/unclip/karlo_v1_alpha_cat_variation_fp16.npy"
+        )
+
+        pipeline = UnCLIPImageVariationPipeline.from_pretrained(
+            "kakaobrain/karlo-v1-alpha-image-variations", torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipeline(
+            input_image,
+            generator=generator,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (256, 256, 3)
+
+        assert_mean_pixel_difference(image, expected_image, 15)
diff --git a/diffusers/tests/pipelines/unidiffuser/__init__.py b/diffusers/tests/pipelines/unidiffuser/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/unidiffuser/test_unidiffuser.py b/diffusers/tests/pipelines/unidiffuser/test_unidiffuser.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e0ba1cfb8eb16fe9cf5f0ca6849e1952b5d214c
--- /dev/null
+++ b/diffusers/tests/pipelines/unidiffuser/test_unidiffuser.py
@@ -0,0 +1,806 @@
+import gc
+import random
+import traceback
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+    GPT2Tokenizer,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    DPMSolverMultistepScheduler,
+    UniDiffuserModel,
+    UniDiffuserPipeline,
+    UniDiffuserTextDecoder,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    nightly,
+    require_torch_2,
+    require_torch_gpu,
+    run_test_in_subprocess,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+# Will be run via run_test_in_subprocess
+def _test_unidiffuser_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
+        # pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe = pipe.to(torch_device)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
+        assert np.abs(image_slice - expected_slice).max() < 1e-1
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+class UniDiffuserPipelineFastTests(
+    PipelineTesterMixin, PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
+):
+    pipeline_class = UniDiffuserPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    # vae_latents, not latents, is the argument that corresponds to VAE latent inputs
+    image_latents_params = frozenset(["vae_latents"])
+
+    def get_dummy_components(self):
+        unet = UniDiffuserModel.from_pretrained(
+            "hf-internal-testing/unidiffuser-diffusers-test",
+            subfolder="unet",
+        )
+
+        scheduler = DPMSolverMultistepScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            solver_order=3,
+        )
+
+        vae = AutoencoderKL.from_pretrained(
+            "hf-internal-testing/unidiffuser-diffusers-test",
+            subfolder="vae",
+        )
+
+        text_encoder = CLIPTextModel.from_pretrained(
+            "hf-internal-testing/unidiffuser-diffusers-test",
+            subfolder="text_encoder",
+        )
+        clip_tokenizer = CLIPTokenizer.from_pretrained(
+            "hf-internal-testing/unidiffuser-diffusers-test",
+            subfolder="clip_tokenizer",
+        )
+
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            "hf-internal-testing/unidiffuser-diffusers-test",
+            subfolder="image_encoder",
+        )
+        # From the Stable Diffusion Image Variation pipeline tests
+        clip_image_processor = CLIPImageProcessor(crop_size=32, size=32)
+        # image_processor = CLIPImageProcessor.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_tokenizer = GPT2Tokenizer.from_pretrained(
+            "hf-internal-testing/unidiffuser-diffusers-test",
+            subfolder="text_tokenizer",
+        )
+        text_decoder = UniDiffuserTextDecoder.from_pretrained(
+            "hf-internal-testing/unidiffuser-diffusers-test",
+            subfolder="text_decoder",
+        )
+
+        components = {
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "image_encoder": image_encoder,
+            "clip_image_processor": clip_image_processor,
+            "clip_tokenizer": clip_tokenizer,
+            "text_decoder": text_decoder,
+            "text_tokenizer": text_tokenizer,
+            "unet": unet,
+            "scheduler": scheduler,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB")
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "an elephant under the sea",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def get_fixed_latents(self, device, seed=0):
+        if isinstance(device, str):
+            device = torch.device(device)
+        generator = torch.Generator(device=device).manual_seed(seed)
+        # Hardcode the shapes for now.
+        prompt_latents = randn_tensor((1, 77, 32), generator=generator, device=device, dtype=torch.float32)
+        vae_latents = randn_tensor((1, 4, 16, 16), generator=generator, device=device, dtype=torch.float32)
+        clip_latents = randn_tensor((1, 1, 32), generator=generator, device=device, dtype=torch.float32)
+
+        latents = {
+            "prompt_latents": prompt_latents,
+            "vae_latents": vae_latents,
+            "clip_latents": clip_latents,
+        }
+        return latents
+
+    def get_dummy_inputs_with_latents(self, device, seed=0):
+        # image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        # image = image.cpu().permute(0, 2, 3, 1)[0]
+        # image = Image.fromarray(np.uint8(image)).convert("RGB")
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg",
+        )
+        image = image.resize((32, 32))
+        latents = self.get_fixed_latents(device, seed=seed)
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "an elephant under the sea",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "prompt_latents": latents.get("prompt_latents"),
+            "vae_latents": latents.get("vae_latents"),
+            "clip_latents": latents.get("clip_latents"),
+        }
+        return inputs
+
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.7489, 0.3722, 0.4475, 0.5630, 0.5923, 0.4992, 0.3936, 0.5844, 0.4975])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
+    def test_unidiffuser_default_joint_v0(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        unidiffuser_pipe = UniDiffuserPipeline(**components)
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'joint'
+        unidiffuser_pipe.set_joint_mode()
+        assert unidiffuser_pipe.mode == "joint"
+
+        # inputs = self.get_dummy_inputs(device)
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete prompt and image for joint inference.
+        del inputs["prompt"]
+        del inputs["image"]
+        sample = unidiffuser_pipe(**inputs)
+        image = sample.images
+        text = sample.text
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716])
+        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
+
+        expected_text_prefix = " no no no "
+        assert text[0][:10] == expected_text_prefix
+
+    def test_unidiffuser_default_joint_no_cfg_v0(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        unidiffuser_pipe = UniDiffuserPipeline(**components)
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'joint'
+        unidiffuser_pipe.set_joint_mode()
+        assert unidiffuser_pipe.mode == "joint"
+
+        # inputs = self.get_dummy_inputs(device)
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete prompt and image for joint inference.
+        del inputs["prompt"]
+        del inputs["image"]
+        # Set guidance scale to 1.0 to turn off CFG
+        inputs["guidance_scale"] = 1.0
+        sample = unidiffuser_pipe(**inputs)
+        image = sample.images
+        text = sample.text
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716])
+        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
+
+        expected_text_prefix = " no no no "
+        assert text[0][:10] == expected_text_prefix
+
+    def test_unidiffuser_default_text2img_v0(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        unidiffuser_pipe = UniDiffuserPipeline(**components)
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'text2img'
+        unidiffuser_pipe.set_text_to_image_mode()
+        assert unidiffuser_pipe.mode == "text2img"
+
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete image for text-conditioned image generation
+        del inputs["image"]
+        image = unidiffuser_pipe(**inputs).images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.5758, 0.6269, 0.6570, 0.4967, 0.4639, 0.5664, 0.5257, 0.5067, 0.5715])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_unidiffuser_default_image_0(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        unidiffuser_pipe = UniDiffuserPipeline(**components)
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'img'
+        unidiffuser_pipe.set_image_mode()
+        assert unidiffuser_pipe.mode == "img"
+
+        inputs = self.get_dummy_inputs(device)
+        # Delete prompt and image for unconditional ("marginal") text generation.
+        del inputs["prompt"]
+        del inputs["image"]
+        image = unidiffuser_pipe(**inputs).images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.5760, 0.6270, 0.6571, 0.4966, 0.4638, 0.5663, 0.5254, 0.5068, 0.5715])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_unidiffuser_default_text_v0(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        unidiffuser_pipe = UniDiffuserPipeline(**components)
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'img'
+        unidiffuser_pipe.set_text_mode()
+        assert unidiffuser_pipe.mode == "text"
+
+        inputs = self.get_dummy_inputs(device)
+        # Delete prompt and image for unconditional ("marginal") text generation.
+        del inputs["prompt"]
+        del inputs["image"]
+        text = unidiffuser_pipe(**inputs).text
+
+        expected_text_prefix = " no no no "
+        assert text[0][:10] == expected_text_prefix
+
+    def test_unidiffuser_default_img2text_v0(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        unidiffuser_pipe = UniDiffuserPipeline(**components)
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'img2text'
+        unidiffuser_pipe.set_image_to_text_mode()
+        assert unidiffuser_pipe.mode == "img2text"
+
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete text for image-conditioned text generation
+        del inputs["prompt"]
+        text = unidiffuser_pipe(**inputs).text
+
+        expected_text_prefix = " no no no "
+        assert text[0][:10] == expected_text_prefix
+
+    def test_unidiffuser_default_joint_v1(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1")
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'joint'
+        unidiffuser_pipe.set_joint_mode()
+        assert unidiffuser_pipe.mode == "joint"
+
+        # inputs = self.get_dummy_inputs(device)
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete prompt and image for joint inference.
+        del inputs["prompt"]
+        del inputs["image"]
+        inputs["data_type"] = 1
+        sample = unidiffuser_pipe(**inputs)
+        image = sample.images
+        text = sample.text
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716])
+        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
+
+        expected_text_prefix = " no no no "
+        assert text[0][:10] == expected_text_prefix
+
+    def test_unidiffuser_default_text2img_v1(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1")
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'text2img'
+        unidiffuser_pipe.set_text_to_image_mode()
+        assert unidiffuser_pipe.mode == "text2img"
+
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete image for text-conditioned image generation
+        del inputs["image"]
+        image = unidiffuser_pipe(**inputs).images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.5758, 0.6269, 0.6570, 0.4967, 0.4639, 0.5664, 0.5257, 0.5067, 0.5715])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_unidiffuser_default_img2text_v1(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1")
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'img2text'
+        unidiffuser_pipe.set_image_to_text_mode()
+        assert unidiffuser_pipe.mode == "img2text"
+
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete text for image-conditioned text generation
+        del inputs["prompt"]
+        text = unidiffuser_pipe(**inputs).text
+
+        expected_text_prefix = " no no no "
+        assert text[0][:10] == expected_text_prefix
+
+    def test_unidiffuser_text2img_multiple_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        unidiffuser_pipe = UniDiffuserPipeline(**components)
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'text2img'
+        unidiffuser_pipe.set_text_to_image_mode()
+        assert unidiffuser_pipe.mode == "text2img"
+
+        inputs = self.get_dummy_inputs(device)
+        # Delete image for text-conditioned image generation
+        del inputs["image"]
+        inputs["num_images_per_prompt"] = 2
+        inputs["num_prompts_per_image"] = 3
+        image = unidiffuser_pipe(**inputs).images
+        assert image.shape == (2, 32, 32, 3)
+
+    def test_unidiffuser_img2text_multiple_prompts(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        unidiffuser_pipe = UniDiffuserPipeline(**components)
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'img2text'
+        unidiffuser_pipe.set_image_to_text_mode()
+        assert unidiffuser_pipe.mode == "img2text"
+
+        inputs = self.get_dummy_inputs(device)
+        # Delete text for image-conditioned text generation
+        del inputs["prompt"]
+        inputs["num_images_per_prompt"] = 2
+        inputs["num_prompts_per_image"] = 3
+        text = unidiffuser_pipe(**inputs).text
+
+        assert len(text) == 3
+
+    def test_unidiffuser_text2img_multiple_images_with_latents(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        unidiffuser_pipe = UniDiffuserPipeline(**components)
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'text2img'
+        unidiffuser_pipe.set_text_to_image_mode()
+        assert unidiffuser_pipe.mode == "text2img"
+
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete image for text-conditioned image generation
+        del inputs["image"]
+        inputs["num_images_per_prompt"] = 2
+        inputs["num_prompts_per_image"] = 3
+        image = unidiffuser_pipe(**inputs).images
+        assert image.shape == (2, 32, 32, 3)
+
+    def test_unidiffuser_img2text_multiple_prompts_with_latents(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        unidiffuser_pipe = UniDiffuserPipeline(**components)
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'img2text'
+        unidiffuser_pipe.set_image_to_text_mode()
+        assert unidiffuser_pipe.mode == "img2text"
+
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete text for image-conditioned text generation
+        del inputs["prompt"]
+        inputs["num_images_per_prompt"] = 2
+        inputs["num_prompts_per_image"] = 3
+        text = unidiffuser_pipe(**inputs).text
+
+        assert len(text) == 3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=2e-4)
+
+    @require_torch_gpu
+    def test_unidiffuser_default_joint_v1_cuda_fp16(self):
+        device = "cuda"
+        unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
+            "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
+        )
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'joint'
+        unidiffuser_pipe.set_joint_mode()
+        assert unidiffuser_pipe.mode == "joint"
+
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete prompt and image for joint inference.
+        del inputs["prompt"]
+        del inputs["image"]
+        inputs["data_type"] = 1
+        sample = unidiffuser_pipe(**inputs)
+        image = sample.images
+        text = sample.text
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_img_slice = np.array([0.5049, 0.5498, 0.5854, 0.3052, 0.4460, 0.6489, 0.5122, 0.4810, 0.6138])
+        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
+
+        expected_text_prefix = '" This This'
+        assert text[0][: len(expected_text_prefix)] == expected_text_prefix
+
+    @require_torch_gpu
+    def test_unidiffuser_default_text2img_v1_cuda_fp16(self):
+        device = "cuda"
+        unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
+            "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
+        )
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'text2img'
+        unidiffuser_pipe.set_text_to_image_mode()
+        assert unidiffuser_pipe.mode == "text2img"
+
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete prompt and image for joint inference.
+        del inputs["image"]
+        inputs["data_type"] = 1
+        sample = unidiffuser_pipe(**inputs)
+        image = sample.images
+        assert image.shape == (1, 32, 32, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_img_slice = np.array([0.5054, 0.5498, 0.5854, 0.3052, 0.4458, 0.6489, 0.5122, 0.4810, 0.6138])
+        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
+
+    @require_torch_gpu
+    def test_unidiffuser_default_img2text_v1_cuda_fp16(self):
+        device = "cuda"
+        unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
+            "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
+        )
+        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe.set_progress_bar_config(disable=None)
+
+        # Set mode to 'img2text'
+        unidiffuser_pipe.set_image_to_text_mode()
+        assert unidiffuser_pipe.mode == "img2text"
+
+        inputs = self.get_dummy_inputs_with_latents(device)
+        # Delete prompt and image for joint inference.
+        del inputs["prompt"]
+        inputs["data_type"] = 1
+        text = unidiffuser_pipe(**inputs).text
+
+        expected_text_prefix = '" This This'
+        assert text[0][: len(expected_text_prefix)] == expected_text_prefix
+
+
+@nightly
+@require_torch_gpu
+class UniDiffuserPipelineSlowTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, seed=0, generate_latents=False):
+        generator = torch.manual_seed(seed)
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg"
+        )
+        inputs = {
+            "prompt": "an elephant under the sea",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 8.0,
+            "output_type": "np",
+        }
+        if generate_latents:
+            latents = self.get_fixed_latents(device, seed=seed)
+            for latent_name, latent_tensor in latents.items():
+                inputs[latent_name] = latent_tensor
+        return inputs
+
+    def get_fixed_latents(self, device, seed=0):
+        if isinstance(device, str):
+            device = torch.device(device)
+        latent_device = torch.device("cpu")
+        generator = torch.Generator(device=latent_device).manual_seed(seed)
+        # Hardcode the shapes for now.
+        prompt_latents = randn_tensor((1, 77, 768), generator=generator, device=device, dtype=torch.float32)
+        vae_latents = randn_tensor((1, 4, 64, 64), generator=generator, device=device, dtype=torch.float32)
+        clip_latents = randn_tensor((1, 1, 512), generator=generator, device=device, dtype=torch.float32)
+
+        # Move latents onto desired device.
+        prompt_latents = prompt_latents.to(device)
+        vae_latents = vae_latents.to(device)
+        clip_latents = clip_latents.to(device)
+
+        latents = {
+            "prompt_latents": prompt_latents,
+            "vae_latents": vae_latents,
+            "clip_latents": clip_latents,
+        }
+        return latents
+
+    def test_unidiffuser_default_joint_v1(self):
+        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        # inputs = self.get_dummy_inputs(device)
+        inputs = self.get_inputs(device=torch_device, generate_latents=True)
+        # Delete prompt and image for joint inference.
+        del inputs["prompt"]
+        del inputs["image"]
+        sample = pipe(**inputs)
+        image = sample.images
+        text = sample.text
+        assert image.shape == (1, 512, 512, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
+        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-1
+
+        expected_text_prefix = "a living room"
+        assert text[0][: len(expected_text_prefix)] == expected_text_prefix
+
+    def test_unidiffuser_default_text2img_v1(self):
+        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(device=torch_device, generate_latents=True)
+        del inputs["image"]
+        sample = pipe(**inputs)
+        image = sample.images
+        assert image.shape == (1, 512, 512, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.0242, 0.0103, 0.0022, 0.0129, 0.0000, 0.0090, 0.0376, 0.0508, 0.0005])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_unidiffuser_default_img2text_v1(self):
+        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(device=torch_device, generate_latents=True)
+        del inputs["prompt"]
+        sample = pipe(**inputs)
+        text = sample.text
+
+        expected_text_prefix = "An astronaut"
+        assert text[0][: len(expected_text_prefix)] == expected_text_prefix
+
+    @unittest.skip(reason="Skip torch.compile test to speed up the slow test suite.")
+    @require_torch_2
+    def test_unidiffuser_compile(self, seed=0):
+        inputs = self.get_inputs(torch_device, seed=seed, generate_latents=True)
+        # Delete prompt and image for joint inference.
+        del inputs["prompt"]
+        del inputs["image"]
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_unidiffuser_compile, inputs=inputs)
+
+
+@nightly
+@require_torch_gpu
+class UniDiffuserPipelineNightlyTests(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, seed=0, generate_latents=False):
+        generator = torch.manual_seed(seed)
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg"
+        )
+        inputs = {
+            "prompt": "an elephant under the sea",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 8.0,
+            "output_type": "np",
+        }
+        if generate_latents:
+            latents = self.get_fixed_latents(device, seed=seed)
+            for latent_name, latent_tensor in latents.items():
+                inputs[latent_name] = latent_tensor
+        return inputs
+
+    def get_fixed_latents(self, device, seed=0):
+        if isinstance(device, str):
+            device = torch.device(device)
+        latent_device = torch.device("cpu")
+        generator = torch.Generator(device=latent_device).manual_seed(seed)
+        # Hardcode the shapes for now.
+        prompt_latents = randn_tensor((1, 77, 768), generator=generator, device=device, dtype=torch.float32)
+        vae_latents = randn_tensor((1, 4, 64, 64), generator=generator, device=device, dtype=torch.float32)
+        clip_latents = randn_tensor((1, 1, 512), generator=generator, device=device, dtype=torch.float32)
+
+        # Move latents onto desired device.
+        prompt_latents = prompt_latents.to(device)
+        vae_latents = vae_latents.to(device)
+        clip_latents = clip_latents.to(device)
+
+        latents = {
+            "prompt_latents": prompt_latents,
+            "vae_latents": vae_latents,
+            "clip_latents": clip_latents,
+        }
+        return latents
+
+    def test_unidiffuser_default_joint_v1_fp16(self):
+        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        # inputs = self.get_dummy_inputs(device)
+        inputs = self.get_inputs(device=torch_device, generate_latents=True)
+        # Delete prompt and image for joint inference.
+        del inputs["prompt"]
+        del inputs["image"]
+        sample = pipe(**inputs)
+        image = sample.images
+        text = sample.text
+        assert image.shape == (1, 512, 512, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
+        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 2e-1
+
+        expected_text_prefix = "a living room"
+        assert text[0][: len(expected_text_prefix)] == expected_text_prefix
+
+    def test_unidiffuser_default_text2img_v1_fp16(self):
+        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(device=torch_device, generate_latents=True)
+        del inputs["image"]
+        sample = pipe(**inputs)
+        image = sample.images
+        assert image.shape == (1, 512, 512, 3)
+
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.0242, 0.0103, 0.0022, 0.0129, 0.0000, 0.0090, 0.0376, 0.0508, 0.0005])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_unidiffuser_default_img2text_v1_fp16(self):
+        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        inputs = self.get_inputs(device=torch_device, generate_latents=True)
+        del inputs["prompt"]
+        sample = pipe(**inputs)
+        text = sample.text
+
+        expected_text_prefix = "An astronaut"
+        assert text[0][: len(expected_text_prefix)] == expected_text_prefix
diff --git a/diffusers/tests/pipelines/wuerstchen/__init__.py b/diffusers/tests/pipelines/wuerstchen/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/pipelines/wuerstchen/test_wuerstchen_combined.py b/diffusers/tests/pipelines/wuerstchen/test_wuerstchen_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..0caed159100abf3d537f87442c7d55479643a66d
--- /dev/null
+++ b/diffusers/tests/pipelines/wuerstchen/test_wuerstchen_combined.py
@@ -0,0 +1,239 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import DDPMWuerstchenScheduler, WuerstchenCombinedPipeline
+from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt, WuerstchenPrior
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class WuerstchenCombinedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = WuerstchenCombinedPipeline
+    params = ["prompt"]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "prior_guidance_scale",
+        "decoder_guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "prior_num_inference_steps",
+        "output_type",
+    ]
+    test_xformers_attention = True
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {"c_in": 2, "c": 8, "depth": 2, "c_cond": 32, "c_r": 8, "nhead": 2}
+        model = WuerstchenPrior(**model_kwargs)
+        return model.eval()
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_prior_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config).eval()
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            projection_dim=self.text_embedder_hidden_size,
+            hidden_size=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config).eval()
+
+    @property
+    def dummy_vqgan(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "bottleneck_blocks": 1,
+            "num_vq_embeddings": 2,
+        }
+        model = PaellaVQModel(**model_kwargs)
+        return model.eval()
+
+    @property
+    def dummy_decoder(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "c_cond": self.text_embedder_hidden_size,
+            "c_hidden": [320],
+            "nhead": [-1],
+            "blocks": [4],
+            "level_config": ["CT"],
+            "clip_embd": self.text_embedder_hidden_size,
+            "inject_effnet": [False],
+        }
+
+        model = WuerstchenDiffNeXt(**model_kwargs)
+        return model.eval()
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        prior_text_encoder = self.dummy_prior_text_encoder
+
+        scheduler = DDPMWuerstchenScheduler()
+        tokenizer = self.dummy_tokenizer
+
+        text_encoder = self.dummy_text_encoder
+        decoder = self.dummy_decoder
+        vqgan = self.dummy_vqgan
+
+        components = {
+            "tokenizer": tokenizer,
+            "text_encoder": text_encoder,
+            "decoder": decoder,
+            "vqgan": vqgan,
+            "scheduler": scheduler,
+            "prior_prior": prior,
+            "prior_text_encoder": prior_text_encoder,
+            "prior_tokenizer": tokenizer,
+            "prior_scheduler": scheduler,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "generator": generator,
+            "prior_guidance_scale": 4.0,
+            "decoder_guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "prior_num_inference_steps": 2,
+            "output_type": "np",
+            "height": 128,
+            "width": 128,
+        }
+        return inputs
+
+    def test_wuerstchen(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[-3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+
+        expected_slice = np.array([0.7616304, 0.0, 1.0, 0.0, 1.0, 0.0, 0.05925313, 0.0, 0.951898])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    @unittest.skip(reason="flakey and float16 requires CUDA")
+    def test_float16_inference(self):
+        super().test_float16_inference()
+
+    def test_callback_inputs(self):
+        pass
+
+    def test_callback_cfg(self):
+        pass
diff --git a/diffusers/tests/pipelines/wuerstchen/test_wuerstchen_decoder.py b/diffusers/tests/pipelines/wuerstchen/test_wuerstchen_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..4675501387904fb9ac67bf79c2e11555559baddd
--- /dev/null
+++ b/diffusers/tests/pipelines/wuerstchen/test_wuerstchen_decoder.py
@@ -0,0 +1,188 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import DDPMWuerstchenScheduler, WuerstchenDecoderPipeline
+from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt
+from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class WuerstchenDecoderPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = WuerstchenDecoderPipeline
+    params = ["prompt"]
+    batch_params = ["image_embeddings", "prompt", "negative_prompt"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "num_inference_steps",
+        "latents",
+        "negative_prompt",
+        "guidance_scale",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+    callback_cfg_params = ["image_embeddings", "text_encoder_hidden_states"]
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            projection_dim=self.text_embedder_hidden_size,
+            hidden_size=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config).eval()
+
+    @property
+    def dummy_vqgan(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "bottleneck_blocks": 1,
+            "num_vq_embeddings": 2,
+        }
+        model = PaellaVQModel(**model_kwargs)
+        return model.eval()
+
+    @property
+    def dummy_decoder(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "c_cond": self.text_embedder_hidden_size,
+            "c_hidden": [320],
+            "nhead": [-1],
+            "blocks": [4],
+            "level_config": ["CT"],
+            "clip_embd": self.text_embedder_hidden_size,
+            "inject_effnet": [False],
+        }
+
+        model = WuerstchenDiffNeXt(**model_kwargs)
+        return model.eval()
+
+    def get_dummy_components(self):
+        decoder = self.dummy_decoder
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        vqgan = self.dummy_vqgan
+
+        scheduler = DDPMWuerstchenScheduler()
+
+        components = {
+            "decoder": decoder,
+            "vqgan": vqgan,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "latent_dim_scale": 4.0,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image_embeddings": torch.ones((1, 4, 4, 4), device=device),
+            "prompt": "horse",
+            "generator": generator,
+            "guidance_scale": 1.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_wuerstchen_decoder(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.0000, 0.0000, 0.0089, 1.0000, 1.0000, 0.3927, 1.0000, 1.0000, 1.0000])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-5)
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+        test_mean_pixel_difference = False
+
+        self._test_attention_slicing_forward_pass(
+            test_max_difference=test_max_difference,
+            test_mean_pixel_difference=test_mean_pixel_difference,
+        )
+
+    @unittest.skip(reason="bf16 not supported and requires CUDA")
+    def test_float16_inference(self):
+        super().test_float16_inference()
diff --git a/diffusers/tests/pipelines/wuerstchen/test_wuerstchen_prior.py b/diffusers/tests/pipelines/wuerstchen/test_wuerstchen_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..460004da6f0482743c085a336d858011a8126eeb
--- /dev/null
+++ b/diffusers/tests/pipelines/wuerstchen/test_wuerstchen_prior.py
@@ -0,0 +1,269 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import DDPMWuerstchenScheduler, WuerstchenPriorPipeline
+from diffusers.pipelines.wuerstchen import WuerstchenPrior
+from diffusers.utils.import_utils import is_peft_available
+from diffusers.utils.testing_utils import enable_full_determinism, require_peft_backend, skip_mps, torch_device
+
+
+if is_peft_available():
+    from peft import LoraConfig
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class WuerstchenPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = WuerstchenPriorPipeline
+    params = ["prompt"]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "generator",
+        "num_inference_steps",
+        "latents",
+        "negative_prompt",
+        "guidance_scale",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+    callback_cfg_params = ["text_encoder_hidden_states"]
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config).eval()
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "c_in": 2,
+            "c": 8,
+            "depth": 2,
+            "c_cond": 32,
+            "c_r": 8,
+            "nhead": 2,
+        }
+
+        model = WuerstchenPrior(**model_kwargs)
+        return model.eval()
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+
+        scheduler = DDPMWuerstchenScheduler()
+
+        components = {
+            "prior": prior,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "generator": generator,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_wuerstchen_prior(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.image_embeddings
+
+        image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0]
+
+        image_slice = image[0, 0, 0, -10:]
+        image_from_tuple_slice = image_from_tuple[0, 0, 0, -10:]
+        assert image.shape == (1, 2, 24, 24)
+
+        expected_slice = np.array(
+            [
+                -7172.837,
+                -3438.855,
+                -1093.312,
+                388.8835,
+                -7471.467,
+                -7998.1206,
+                -5328.259,
+                218.00089,
+                -2731.5745,
+                -8056.734,
+            ]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-2
+
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(
+            expected_max_diff=3e-1,
+        )
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+        test_mean_pixel_difference = False
+
+        self._test_attention_slicing_forward_pass(
+            test_max_difference=test_max_difference,
+            test_mean_pixel_difference=test_mean_pixel_difference,
+        )
+
+    @unittest.skip(reason="flaky for now")
+    def test_float16_inference(self):
+        super().test_float16_inference()
+
+    # override because we need to make sure latent_mean and latent_std to be 0
+    def test_callback_inputs(self):
+        components = self.get_dummy_components()
+        components["latent_mean"] = 0
+        components["latent_std"] = 0
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_test(pipe, i, t, callback_kwargs):
+            missing_callback_inputs = set()
+            for v in pipe._callback_tensor_inputs:
+                if v not in callback_kwargs:
+                    missing_callback_inputs.add(v)
+            self.assertTrue(
+                len(missing_callback_inputs) == 0, f"Missing callback tensor inputs: {missing_callback_inputs}"
+            )
+            last_i = pipe.num_timesteps - 1
+            if i == last_i:
+                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["callback_on_step_end"] = callback_inputs_test
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        inputs["output_type"] = "latent"
+
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() == 0
+
+    def check_if_lora_correctly_set(self, model) -> bool:
+        """
+        Checks if the LoRA layers are correctly set with peft
+        """
+        for module in model.modules():
+            if isinstance(module, BaseTunerLayer):
+                return True
+        return False
+
+    def get_lora_components(self):
+        prior = self.dummy_prior
+
+        prior_lora_config = LoraConfig(
+            r=4, lora_alpha=4, target_modules=["to_q", "to_k", "to_v", "to_out.0"], init_lora_weights=False
+        )
+
+        return prior, prior_lora_config
+
+    @require_peft_backend
+    def test_inference_with_prior_lora(self):
+        _, prior_lora_config = self.get_lora_components()
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output_no_lora = pipe(**self.get_dummy_inputs(device))
+        image_embed = output_no_lora.image_embeddings
+        self.assertTrue(image_embed.shape == (1, 2, 24, 24))
+
+        pipe.prior.add_adapter(prior_lora_config)
+        self.assertTrue(self.check_if_lora_correctly_set(pipe.prior), "Lora not correctly set in prior")
+
+        output_lora = pipe(**self.get_dummy_inputs(device))
+        lora_image_embed = output_lora.image_embeddings
+
+        self.assertTrue(image_embed.shape == lora_image_embed.shape)
diff --git a/diffusers/tests/schedulers/__init__.py b/diffusers/tests/schedulers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/schedulers/test_scheduler_consistency_model.py b/diffusers/tests/schedulers/test_scheduler_consistency_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f773d7db05f752d2b1e216071af5a1ca808d658
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_consistency_model.py
@@ -0,0 +1,189 @@
+import torch
+
+from diffusers import CMStochasticIterativeScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class CMStochasticIterativeSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (CMStochasticIterativeScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 201,
+            "sigma_min": 0.002,
+            "sigma_max": 80.0,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    # Override test_step_shape to add CMStochasticIterativeScheduler-specific logic regarding timesteps
+    # Problem is that we don't know two timesteps that will always be in the timestep schedule from only the scheduler
+    # config; scaled sigma_max is always in the timestep schedule, but sigma_min is in the sigma schedule while scaled
+    # sigma_min is not in the timestep schedule
+    def test_step_shape(self):
+        num_inference_steps = 10
+
+        scheduler_config = self.get_scheduler_config()
+        scheduler = self.scheduler_classes[0](**scheduler_config)
+
+        scheduler.set_timesteps(num_inference_steps)
+
+        timestep_0 = scheduler.timesteps[0]
+        timestep_1 = scheduler.timesteps[1]
+
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+
+        output_0 = scheduler.step(residual, timestep_0, sample).prev_sample
+        output_1 = scheduler.step(residual, timestep_1, sample).prev_sample
+
+        self.assertEqual(output_0.shape, sample.shape)
+        self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_clip_denoised(self):
+        for clip_denoised in [True, False]:
+            self.check_over_configs(clip_denoised=clip_denoised)
+
+    def test_full_loop_no_noise_onestep(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 1
+        scheduler.set_timesteps(num_inference_steps)
+        timesteps = scheduler.timesteps
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        for i, t in enumerate(timesteps):
+            # 1. scale model input
+            scaled_sample = scheduler.scale_model_input(sample, t)
+
+            # 2. predict noise residual
+            residual = model(scaled_sample, t)
+
+            # 3. predict previous sample x_t-1
+            pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 192.7614) < 1e-2
+        assert abs(result_mean.item() - 0.2510) < 1e-3
+
+    def test_full_loop_no_noise_multistep(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [106, 0]
+        scheduler.set_timesteps(timesteps=timesteps)
+        timesteps = scheduler.timesteps
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        for t in timesteps:
+            # 1. scale model input
+            scaled_sample = scheduler.scale_model_input(sample, t)
+
+            # 2. predict noise residual
+            residual = model(scaled_sample, t)
+
+            # 3. predict previous sample x_t-1
+            pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 347.6357) < 1e-2
+        assert abs(result_mean.item() - 0.4527) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        t_start = 8
+
+        scheduler.set_timesteps(num_inference_steps)
+        timesteps = scheduler.timesteps
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for t in timesteps:
+            # 1. scale model input
+            scaled_sample = scheduler.scale_model_input(sample, t)
+
+            # 2. predict noise residual
+            residual = model(scaled_sample, t)
+
+            # 3. predict previous sample x_t-1
+            pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 763.9186) < 1e-2, f" expected result sum 763.9186, but get {result_sum}"
+        assert abs(result_mean.item() - 0.9947) < 1e-3, f" expected result mean 0.9947, but get {result_mean}"
+
+    def test_custom_timesteps_increasing_order(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [39, 30, 12, 15, 0]
+
+        with self.assertRaises(ValueError, msg="`timesteps` must be in descending order."):
+            scheduler.set_timesteps(timesteps=timesteps)
+
+    def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [39, 30, 12, 1, 0]
+        num_inference_steps = len(timesteps)
+
+        with self.assertRaises(ValueError, msg="Can only pass one of `num_inference_steps` or `timesteps`."):
+            scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps)
+
+    def test_custom_timesteps_too_large(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [scheduler.config.num_train_timesteps]
+
+        with self.assertRaises(
+            ValueError,
+            msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}",
+        ):
+            scheduler.set_timesteps(timesteps=timesteps)
diff --git a/diffusers/tests/schedulers/test_scheduler_ddim.py b/diffusers/tests/schedulers/test_scheduler_ddim.py
new file mode 100644
index 0000000000000000000000000000000000000000..13b353a44b0885ed52eda0ce97d3ca7bd4b18ad5
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_ddim.py
@@ -0,0 +1,176 @@
+import torch
+
+from diffusers import DDIMScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class DDIMSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DDIMScheduler,)
+    forward_default_kwargs = (("eta", 0.0), ("num_inference_steps", 50))
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "clip_sample": True,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps, eta = 10, 0.0
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+
+        scheduler.set_timesteps(num_inference_steps)
+
+        for t in scheduler.timesteps:
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample, eta).prev_sample
+
+        return sample
+
+    def test_timesteps(self):
+        for timesteps in [100, 500, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_steps_offset(self):
+        for steps_offset in [0, 1]:
+            self.check_over_configs(steps_offset=steps_offset)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(steps_offset=1)
+        scheduler = scheduler_class(**scheduler_config)
+        scheduler.set_timesteps(5)
+        assert torch.equal(scheduler.timesteps, torch.LongTensor([801, 601, 401, 201, 1]))
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_clip_sample(self):
+        for clip_sample in [True, False]:
+            self.check_over_configs(clip_sample=clip_sample)
+
+    def test_timestep_spacing(self):
+        for timestep_spacing in ["trailing", "leading"]:
+            self.check_over_configs(timestep_spacing=timestep_spacing)
+
+    def test_rescale_betas_zero_snr(self):
+        for rescale_betas_zero_snr in [True, False]:
+            self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for threshold in [0.5, 1.0, 2.0]:
+            for prediction_type in ["epsilon", "v_prediction"]:
+                self.check_over_configs(
+                    thresholding=True,
+                    prediction_type=prediction_type,
+                    sample_max_value=threshold,
+                )
+
+    def test_time_indices(self):
+        for t in [1, 10, 49]:
+            self.check_over_forward(time_step=t)
+
+    def test_inference_steps(self):
+        for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]):
+            self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps)
+
+    def test_eta(self):
+        for t, eta in zip([1, 10, 49], [0.0, 0.5, 1.0]):
+            self.check_over_forward(time_step=t, eta=eta)
+
+    def test_variance(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        assert torch.sum(torch.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 172.0067) < 1e-2
+        assert abs(result_mean.item() - 0.223967) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 52.5302) < 1e-2
+        assert abs(result_mean.item() - 0.0684) < 1e-3
+
+    def test_full_loop_with_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01)
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 149.8295) < 1e-2
+        assert abs(result_mean.item() - 0.1951) < 1e-3
+
+    def test_full_loop_with_no_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01)
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 149.0784) < 1e-2
+        assert abs(result_mean.item() - 0.1941) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps, eta = 10, 0.0
+        t_start = 8
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+
+        scheduler.set_timesteps(num_inference_steps)
+
+        # add noise
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for t in timesteps:
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample, eta).prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 354.5418) < 1e-2, f" expected result sum 218.4379, but get {result_sum}"
+        assert abs(result_mean.item() - 0.4616) < 1e-3, f" expected result mean 0.2844, but get {result_mean}"
diff --git a/diffusers/tests/schedulers/test_scheduler_ddim_inverse.py b/diffusers/tests/schedulers/test_scheduler_ddim_inverse.py
new file mode 100644
index 0000000000000000000000000000000000000000..696f57644a83e2d1ce9be73b66aeb57afab7080d
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_ddim_inverse.py
@@ -0,0 +1,135 @@
+import torch
+
+from diffusers import DDIMInverseScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class DDIMInverseSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DDIMInverseScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 50),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "clip_sample": True,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+
+        scheduler.set_timesteps(num_inference_steps)
+
+        for t in scheduler.timesteps:
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        return sample
+
+    def test_timesteps(self):
+        for timesteps in [100, 500, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_steps_offset(self):
+        for steps_offset in [0, 1]:
+            self.check_over_configs(steps_offset=steps_offset)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(steps_offset=1)
+        scheduler = scheduler_class(**scheduler_config)
+        scheduler.set_timesteps(5)
+        assert torch.equal(scheduler.timesteps, torch.LongTensor([1, 201, 401, 601, 801]))
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_clip_sample(self):
+        for clip_sample in [True, False]:
+            self.check_over_configs(clip_sample=clip_sample)
+
+    def test_timestep_spacing(self):
+        for timestep_spacing in ["trailing", "leading"]:
+            self.check_over_configs(timestep_spacing=timestep_spacing)
+
+    def test_rescale_betas_zero_snr(self):
+        for rescale_betas_zero_snr in [True, False]:
+            self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for threshold in [0.5, 1.0, 2.0]:
+            for prediction_type in ["epsilon", "v_prediction"]:
+                self.check_over_configs(
+                    thresholding=True,
+                    prediction_type=prediction_type,
+                    sample_max_value=threshold,
+                )
+
+    def test_time_indices(self):
+        for t in [1, 10, 49]:
+            self.check_over_forward(time_step=t)
+
+    def test_inference_steps(self):
+        for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]):
+            self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps)
+
+    def test_add_noise_device(self):
+        pass
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 671.6816) < 1e-2
+        assert abs(result_mean.item() - 0.8746) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 1394.2185) < 1e-2
+        assert abs(result_mean.item() - 1.8154) < 1e-3
+
+    def test_full_loop_with_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01)
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 539.9622) < 1e-2
+        assert abs(result_mean.item() - 0.7031) < 1e-3
+
+    def test_full_loop_with_no_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01)
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 542.6722) < 1e-2
+        assert abs(result_mean.item() - 0.7066) < 1e-3
diff --git a/diffusers/tests/schedulers/test_scheduler_ddim_parallel.py b/diffusers/tests/schedulers/test_scheduler_ddim_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..5434d08b5628abbb79453353a7dc68e0cc0ff3b0
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_ddim_parallel.py
@@ -0,0 +1,216 @@
+# Copyright 2024 ParaDiGMS authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from diffusers import DDIMParallelScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class DDIMParallelSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DDIMParallelScheduler,)
+    forward_default_kwargs = (("eta", 0.0), ("num_inference_steps", 50))
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "clip_sample": True,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps, eta = 10, 0.0
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+
+        scheduler.set_timesteps(num_inference_steps)
+
+        for t in scheduler.timesteps:
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample, eta).prev_sample
+
+        return sample
+
+    def test_timesteps(self):
+        for timesteps in [100, 500, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_steps_offset(self):
+        for steps_offset in [0, 1]:
+            self.check_over_configs(steps_offset=steps_offset)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(steps_offset=1)
+        scheduler = scheduler_class(**scheduler_config)
+        scheduler.set_timesteps(5)
+        assert torch.equal(scheduler.timesteps, torch.LongTensor([801, 601, 401, 201, 1]))
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_clip_sample(self):
+        for clip_sample in [True, False]:
+            self.check_over_configs(clip_sample=clip_sample)
+
+    def test_timestep_spacing(self):
+        for timestep_spacing in ["trailing", "leading"]:
+            self.check_over_configs(timestep_spacing=timestep_spacing)
+
+    def test_rescale_betas_zero_snr(self):
+        for rescale_betas_zero_snr in [True, False]:
+            self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for threshold in [0.5, 1.0, 2.0]:
+            for prediction_type in ["epsilon", "v_prediction"]:
+                self.check_over_configs(
+                    thresholding=True,
+                    prediction_type=prediction_type,
+                    sample_max_value=threshold,
+                )
+
+    def test_time_indices(self):
+        for t in [1, 10, 49]:
+            self.check_over_forward(time_step=t)
+
+    def test_inference_steps(self):
+        for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]):
+            self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps)
+
+    def test_eta(self):
+        for t, eta in zip([1, 10, 49], [0.0, 0.5, 1.0]):
+            self.check_over_forward(time_step=t, eta=eta)
+
+    def test_variance(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        assert torch.sum(torch.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5
+
+    def test_batch_step_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps, eta = 10, 0.0
+        scheduler.set_timesteps(num_inference_steps)
+
+        model = self.dummy_model()
+        sample1 = self.dummy_sample_deter
+        sample2 = self.dummy_sample_deter + 0.1
+        sample3 = self.dummy_sample_deter - 0.1
+
+        per_sample_batch = sample1.shape[0]
+        samples = torch.stack([sample1, sample2, sample3], dim=0)
+        timesteps = torch.arange(num_inference_steps)[0:3, None].repeat(1, per_sample_batch)
+
+        residual = model(samples.flatten(0, 1), timesteps.flatten(0, 1))
+        pred_prev_sample = scheduler.batch_step_no_noise(residual, timesteps.flatten(0, 1), samples.flatten(0, 1), eta)
+
+        result_sum = torch.sum(torch.abs(pred_prev_sample))
+        result_mean = torch.mean(torch.abs(pred_prev_sample))
+
+        assert abs(result_sum.item() - 1147.7904) < 1e-2
+        assert abs(result_mean.item() - 0.4982) < 1e-3
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 172.0067) < 1e-2
+        assert abs(result_mean.item() - 0.223967) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 52.5302) < 1e-2
+        assert abs(result_mean.item() - 0.0684) < 1e-3
+
+    def test_full_loop_with_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01)
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 149.8295) < 1e-2
+        assert abs(result_mean.item() - 0.1951) < 1e-3
+
+    def test_full_loop_with_no_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01)
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 149.0784) < 1e-2
+        assert abs(result_mean.item() - 0.1941) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps, eta = 10, 0.0
+        t_start = 8
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+
+        scheduler.set_timesteps(num_inference_steps)
+
+        # add noise
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for t in timesteps:
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample, eta).prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 354.5418) < 1e-2, f" expected result sum 354.5418, but get {result_sum}"
+        assert abs(result_mean.item() - 0.4616) < 1e-3, f" expected result mean 0.4616, but get {result_mean}"
diff --git a/diffusers/tests/schedulers/test_scheduler_ddpm.py b/diffusers/tests/schedulers/test_scheduler_ddpm.py
new file mode 100644
index 0000000000000000000000000000000000000000..056b5d83350e4baeadf24693d2597fb185d44f46
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_ddpm.py
@@ -0,0 +1,222 @@
+import torch
+
+from diffusers import DDPMScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class DDPMSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DDPMScheduler,)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "variance_type": "fixed_small",
+            "clip_sample": True,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [1, 5, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_variance_type(self):
+        for variance in ["fixed_small", "fixed_large", "other"]:
+            self.check_over_configs(variance_type=variance)
+
+    def test_clip_sample(self):
+        for clip_sample in [True, False]:
+            self.check_over_configs(clip_sample=clip_sample)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for threshold in [0.5, 1.0, 2.0]:
+            for prediction_type in ["epsilon", "sample", "v_prediction"]:
+                self.check_over_configs(
+                    thresholding=True,
+                    prediction_type=prediction_type,
+                    sample_max_value=threshold,
+                )
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "sample", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_time_indices(self):
+        for t in [0, 500, 999]:
+            self.check_over_forward(time_step=t)
+
+    def test_variance(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        assert torch.sum(torch.abs(scheduler._get_variance(0) - 0.0)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(999) - 0.02)) < 1e-5
+
+    def test_rescale_betas_zero_snr(self):
+        for rescale_betas_zero_snr in [True, False]:
+            self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr)
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_trained_timesteps = len(scheduler)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        generator = torch.manual_seed(0)
+
+        for t in reversed(range(num_trained_timesteps)):
+            # 1. predict noise residual
+            residual = model(sample, t)
+
+            # 2. predict previous mean of sample x_t-1
+            pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+
+            # if t > 0:
+            #     noise = self.dummy_sample_deter
+            #     variance = scheduler.get_variance(t) ** (0.5) * noise
+            #
+            # sample = pred_prev_sample + variance
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 258.9606) < 1e-2
+        assert abs(result_mean.item() - 0.3372) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_trained_timesteps = len(scheduler)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        generator = torch.manual_seed(0)
+
+        for t in reversed(range(num_trained_timesteps)):
+            # 1. predict noise residual
+            residual = model(sample, t)
+
+            # 2. predict previous mean of sample x_t-1
+            pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+
+            # if t > 0:
+            #     noise = self.dummy_sample_deter
+            #     variance = scheduler.get_variance(t) ** (0.5) * noise
+            #
+            # sample = pred_prev_sample + variance
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 202.0296) < 1e-2
+        assert abs(result_mean.item() - 0.2631) < 1e-3
+
+    def test_custom_timesteps(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 1, 0]
+
+        scheduler.set_timesteps(timesteps=timesteps)
+
+        scheduler_timesteps = scheduler.timesteps
+
+        for i, timestep in enumerate(scheduler_timesteps):
+            if i == len(timesteps) - 1:
+                expected_prev_t = -1
+            else:
+                expected_prev_t = timesteps[i + 1]
+
+            prev_t = scheduler.previous_timestep(timestep)
+            prev_t = prev_t.item()
+
+            self.assertEqual(prev_t, expected_prev_t)
+
+    def test_custom_timesteps_increasing_order(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 51, 0]
+
+        with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."):
+            scheduler.set_timesteps(timesteps=timesteps)
+
+    def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 1, 0]
+        num_inference_steps = len(timesteps)
+
+        with self.assertRaises(ValueError, msg="Can only pass one of `num_inference_steps` or `custom_timesteps`."):
+            scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps)
+
+    def test_custom_timesteps_too_large(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [scheduler.config.num_train_timesteps]
+
+        with self.assertRaises(
+            ValueError,
+            msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}",
+        ):
+            scheduler.set_timesteps(timesteps=timesteps)
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_trained_timesteps = len(scheduler)
+        t_start = num_trained_timesteps - 2
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        generator = torch.manual_seed(0)
+
+        # add noise
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for t in timesteps:
+            # 1. predict noise residual
+            residual = model(sample, t)
+
+            # 2. predict previous mean of sample x_t-1
+            pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 387.9466) < 1e-2, f" expected result sum 387.9466, but get {result_sum}"
+        assert abs(result_mean.item() - 0.5051) < 1e-3, f" expected result mean 0.5051, but get {result_mean}"
diff --git a/diffusers/tests/schedulers/test_scheduler_ddpm_parallel.py b/diffusers/tests/schedulers/test_scheduler_ddpm_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..c358ad991c1d14ab6d3587b173d1252d343fedcd
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_ddpm_parallel.py
@@ -0,0 +1,251 @@
+# Copyright 2024 ParaDiGMS authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from diffusers import DDPMParallelScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class DDPMParallelSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DDPMParallelScheduler,)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "variance_type": "fixed_small",
+            "clip_sample": True,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [1, 5, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_variance_type(self):
+        for variance in ["fixed_small", "fixed_large", "other"]:
+            self.check_over_configs(variance_type=variance)
+
+    def test_clip_sample(self):
+        for clip_sample in [True, False]:
+            self.check_over_configs(clip_sample=clip_sample)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for threshold in [0.5, 1.0, 2.0]:
+            for prediction_type in ["epsilon", "sample", "v_prediction"]:
+                self.check_over_configs(
+                    thresholding=True,
+                    prediction_type=prediction_type,
+                    sample_max_value=threshold,
+                )
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "sample", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_time_indices(self):
+        for t in [0, 500, 999]:
+            self.check_over_forward(time_step=t)
+
+    def test_variance(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        assert torch.sum(torch.abs(scheduler._get_variance(0) - 0.0)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(999) - 0.02)) < 1e-5
+
+    def test_rescale_betas_zero_snr(self):
+        for rescale_betas_zero_snr in [True, False]:
+            self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr)
+
+    def test_batch_step_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_trained_timesteps = len(scheduler)
+
+        model = self.dummy_model()
+        sample1 = self.dummy_sample_deter
+        sample2 = self.dummy_sample_deter + 0.1
+        sample3 = self.dummy_sample_deter - 0.1
+
+        per_sample_batch = sample1.shape[0]
+        samples = torch.stack([sample1, sample2, sample3], dim=0)
+        timesteps = torch.arange(num_trained_timesteps)[0:3, None].repeat(1, per_sample_batch)
+
+        residual = model(samples.flatten(0, 1), timesteps.flatten(0, 1))
+        pred_prev_sample = scheduler.batch_step_no_noise(residual, timesteps.flatten(0, 1), samples.flatten(0, 1))
+
+        result_sum = torch.sum(torch.abs(pred_prev_sample))
+        result_mean = torch.mean(torch.abs(pred_prev_sample))
+
+        assert abs(result_sum.item() - 1153.1833) < 1e-2
+        assert abs(result_mean.item() - 0.5005) < 1e-3
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_trained_timesteps = len(scheduler)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        generator = torch.manual_seed(0)
+
+        for t in reversed(range(num_trained_timesteps)):
+            # 1. predict noise residual
+            residual = model(sample, t)
+
+            # 2. predict previous mean of sample x_t-1
+            pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 258.9606) < 1e-2
+        assert abs(result_mean.item() - 0.3372) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_trained_timesteps = len(scheduler)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        generator = torch.manual_seed(0)
+
+        for t in reversed(range(num_trained_timesteps)):
+            # 1. predict noise residual
+            residual = model(sample, t)
+
+            # 2. predict previous mean of sample x_t-1
+            pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 202.0296) < 1e-2
+        assert abs(result_mean.item() - 0.2631) < 1e-3
+
+    def test_custom_timesteps(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 1, 0]
+
+        scheduler.set_timesteps(timesteps=timesteps)
+
+        scheduler_timesteps = scheduler.timesteps
+
+        for i, timestep in enumerate(scheduler_timesteps):
+            if i == len(timesteps) - 1:
+                expected_prev_t = -1
+            else:
+                expected_prev_t = timesteps[i + 1]
+
+            prev_t = scheduler.previous_timestep(timestep)
+            prev_t = prev_t.item()
+
+            self.assertEqual(prev_t, expected_prev_t)
+
+    def test_custom_timesteps_increasing_order(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 51, 0]
+
+        with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."):
+            scheduler.set_timesteps(timesteps=timesteps)
+
+    def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 1, 0]
+        num_inference_steps = len(timesteps)
+
+        with self.assertRaises(ValueError, msg="Can only pass one of `num_inference_steps` or `custom_timesteps`."):
+            scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps)
+
+    def test_custom_timesteps_too_large(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [scheduler.config.num_train_timesteps]
+
+        with self.assertRaises(
+            ValueError,
+            msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}",
+        ):
+            scheduler.set_timesteps(timesteps=timesteps)
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_trained_timesteps = len(scheduler)
+        t_start = num_trained_timesteps - 2
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        generator = torch.manual_seed(0)
+
+        # add noise
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for t in timesteps:
+            # 1. predict noise residual
+            residual = model(sample, t)
+
+            # 2. predict previous mean of sample x_t-1
+            pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 387.9466) < 1e-2, f" expected result sum 387.9466, but get {result_sum}"
+        assert abs(result_mean.item() - 0.5051) < 1e-3, f" expected result mean 0.5051, but get {result_mean}"
diff --git a/diffusers/tests/schedulers/test_scheduler_deis.py b/diffusers/tests/schedulers/test_scheduler_deis.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2823a0cb47e998dfa493ecbf93bb6f3d9af16c2
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_deis.py
@@ -0,0 +1,265 @@
+import tempfile
+
+import torch
+
+from diffusers import (
+    DEISMultistepScheduler,
+    DPMSolverMultistepScheduler,
+    DPMSolverSinglestepScheduler,
+    UniPCMultistepScheduler,
+)
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class DEISMultistepSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DEISMultistepScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 25),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "solver_order": 2,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+            # copy over dummy past residuals
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                new_scheduler.set_timesteps(num_inference_steps)
+                # copy over dummy past residuals
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output, new_output = sample, sample
+            for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+                t = scheduler.timesteps[t]
+                output = scheduler.step(residual, t, output, **kwargs).prev_sample
+                new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
+
+                assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        pass
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                # copy over dummy past residuals
+                new_scheduler.set_timesteps(num_inference_steps)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, scheduler=None, **config):
+        if scheduler is None:
+            scheduler_class = self.scheduler_classes[0]
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            time_step_0 = scheduler.timesteps[5]
+            time_step_1 = scheduler.timesteps[6]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_switch(self):
+        # make sure that iterating over schedulers with same config names gives same results
+        # for defaults
+        scheduler = DEISMultistepScheduler(**self.get_scheduler_config())
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.23916) < 1e-3
+
+        scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config)
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+        scheduler = UniPCMultistepScheduler.from_config(scheduler.config)
+        scheduler = DEISMultistepScheduler.from_config(scheduler.config)
+
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.23916) < 1e-3
+
+    def test_timesteps(self):
+        for timesteps in [25, 50, 100, 999, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for order in [1, 2, 3]:
+            for solver_type in ["logrho"]:
+                for threshold in [0.5, 1.0, 2.0]:
+                    for prediction_type in ["epsilon", "sample"]:
+                        self.check_over_configs(
+                            thresholding=True,
+                            prediction_type=prediction_type,
+                            sample_max_value=threshold,
+                            algorithm_type="deis",
+                            solver_order=order,
+                            solver_type=solver_type,
+                        )
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_solver_order_and_type(self):
+        for algorithm_type in ["deis"]:
+            for solver_type in ["logrho"]:
+                for order in [1, 2, 3]:
+                    for prediction_type in ["epsilon", "sample"]:
+                        self.check_over_configs(
+                            solver_order=order,
+                            solver_type=solver_type,
+                            prediction_type=prediction_type,
+                            algorithm_type=algorithm_type,
+                        )
+                        sample = self.full_loop(
+                            solver_order=order,
+                            solver_type=solver_type,
+                            prediction_type=prediction_type,
+                            algorithm_type=algorithm_type,
+                        )
+                        assert not torch.isnan(sample).any(), "Samples have nan numbers"
+
+    def test_lower_order_final(self):
+        self.check_over_configs(lower_order_final=True)
+        self.check_over_configs(lower_order_final=False)
+
+    def test_inference_steps(self):
+        for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
+            self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.23916) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.091) < 1e-3
+
+    def test_fp16_support(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.half()
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        assert sample.dtype == torch.float16
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        t_start = 8
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        # add noise
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 315.3016) < 1e-2, f" expected result sum 315.3016, but get {result_sum}"
+        assert abs(result_mean.item() - 0.41054) < 1e-3, f" expected result mean 0.41054, but get {result_mean}"
diff --git a/diffusers/tests/schedulers/test_scheduler_dpm_multi.py b/diffusers/tests/schedulers/test_scheduler_dpm_multi.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef407eaa3dc931a99a415fc2959a86c9151a2535
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_dpm_multi.py
@@ -0,0 +1,360 @@
+import tempfile
+
+import torch
+
+from diffusers import (
+    DEISMultistepScheduler,
+    DPMSolverMultistepScheduler,
+    DPMSolverSinglestepScheduler,
+    UniPCMultistepScheduler,
+)
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class DPMSolverMultistepSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DPMSolverMultistepScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 25),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "solver_order": 2,
+            "prediction_type": "epsilon",
+            "thresholding": False,
+            "sample_max_value": 1.0,
+            "algorithm_type": "dpmsolver++",
+            "solver_type": "midpoint",
+            "lower_order_final": False,
+            "euler_at_final": False,
+            "lambda_min_clipped": -float("inf"),
+            "variance_type": None,
+            "final_sigmas_type": "sigma_min",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+            # copy over dummy past residuals
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                new_scheduler.set_timesteps(num_inference_steps)
+                # copy over dummy past residuals
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output, new_output = sample, sample
+            for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+                t = new_scheduler.timesteps[t]
+                output = scheduler.step(residual, t, output, **kwargs).prev_sample
+                new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
+
+                assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        pass
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                # copy over dummy past residuals
+                new_scheduler.set_timesteps(num_inference_steps)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            time_step = new_scheduler.timesteps[time_step]
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, scheduler=None, **config):
+        if scheduler is None:
+            scheduler_class = self.scheduler_classes[0]
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        generator = torch.manual_seed(0)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+
+        return sample
+
+    def full_loop_custom_timesteps(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        scheduler.set_timesteps(num_inference_steps)
+        timesteps = scheduler.timesteps
+        # reset the timesteps using `timesteps`
+        scheduler = scheduler_class(**scheduler_config)
+        scheduler.set_timesteps(num_inference_steps=None, timesteps=timesteps)
+
+        generator = torch.manual_seed(0)
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            time_step_0 = scheduler.timesteps[5]
+            time_step_1 = scheduler.timesteps[6]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [25, 50, 100, 999, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for order in [1, 2, 3]:
+            for solver_type in ["midpoint", "heun"]:
+                for threshold in [0.5, 1.0, 2.0]:
+                    for prediction_type in ["epsilon", "sample"]:
+                        self.check_over_configs(
+                            thresholding=True,
+                            prediction_type=prediction_type,
+                            sample_max_value=threshold,
+                            algorithm_type="dpmsolver++",
+                            solver_order=order,
+                            solver_type=solver_type,
+                        )
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_solver_order_and_type(self):
+        for algorithm_type in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
+            for solver_type in ["midpoint", "heun"]:
+                for order in [1, 2, 3]:
+                    for prediction_type in ["epsilon", "sample"]:
+                        if algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+                            if order == 3:
+                                continue
+                        else:
+                            self.check_over_configs(
+                                solver_order=order,
+                                solver_type=solver_type,
+                                prediction_type=prediction_type,
+                                algorithm_type=algorithm_type,
+                            )
+                        sample = self.full_loop(
+                            solver_order=order,
+                            solver_type=solver_type,
+                            prediction_type=prediction_type,
+                            algorithm_type=algorithm_type,
+                        )
+                        assert not torch.isnan(sample).any(), "Samples have nan numbers"
+
+    def test_lower_order_final(self):
+        self.check_over_configs(lower_order_final=True)
+        self.check_over_configs(lower_order_final=False)
+
+    def test_euler_at_final(self):
+        self.check_over_configs(euler_at_final=True)
+        self.check_over_configs(euler_at_final=False)
+
+    def test_lambda_min_clipped(self):
+        self.check_over_configs(lambda_min_clipped=-float("inf"))
+        self.check_over_configs(lambda_min_clipped=-5.1)
+
+    def test_variance_type(self):
+        self.check_over_configs(variance_type=None)
+        self.check_over_configs(variance_type="learned_range")
+
+    def test_inference_steps(self):
+        for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
+            self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
+
+    def test_rescale_betas_zero_snr(self):
+        for rescale_betas_zero_snr in [True, False]:
+            self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr)
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.3301) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        t_start = 5
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        # add noise
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 318.4111) < 1e-2, f" expected result sum 318.4111, but get {result_sum}"
+        assert abs(result_mean.item() - 0.4146) < 1e-3, f" expected result mean 0.4146, but get {result_mean}"
+
+    def test_full_loop_no_noise_thres(self):
+        sample = self.full_loop(thresholding=True, dynamic_thresholding_ratio=0.87, sample_max_value=0.5)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 1.1364) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2251) < 1e-3
+
+    def test_full_loop_with_karras_and_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2096) < 1e-3
+
+    def test_full_loop_with_lu_and_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction", use_lu_lambdas=True)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.1554) < 1e-3
+
+    def test_switch(self):
+        # make sure that iterating over schedulers with same config names gives same results
+        # for defaults
+        scheduler = DPMSolverMultistepScheduler(**self.get_scheduler_config())
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.3301) < 1e-3
+
+        scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config)
+        scheduler = UniPCMultistepScheduler.from_config(scheduler.config)
+        scheduler = DEISMultistepScheduler.from_config(scheduler.config)
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.3301) < 1e-3
+
+    def test_fp16_support(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.half()
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        assert sample.dtype == torch.float16
+
+    def test_duplicated_timesteps(self):
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            scheduler.set_timesteps(scheduler.config.num_train_timesteps)
+            assert len(scheduler.timesteps) == scheduler.num_inference_steps
+
+    def test_custom_timesteps(self):
+        for algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
+            for prediction_type in ["epsilon", "sample", "v_prediction"]:
+                for final_sigmas_type in ["sigma_min", "zero"]:
+                    sample = self.full_loop(
+                        algorithm_type=algorithm_type,
+                        prediction_type=prediction_type,
+                        final_sigmas_type=final_sigmas_type,
+                    )
+                    sample_custom_timesteps = self.full_loop_custom_timesteps(
+                        algorithm_type=algorithm_type,
+                        prediction_type=prediction_type,
+                        final_sigmas_type=final_sigmas_type,
+                    )
+                    assert (
+                        torch.sum(torch.abs(sample - sample_custom_timesteps)) < 1e-5
+                    ), f"Scheduler outputs are not identical for algorithm_type: {algorithm_type}, prediction_type: {prediction_type} and final_sigmas_type: {final_sigmas_type}"
diff --git a/diffusers/tests/schedulers/test_scheduler_dpm_multi_inverse.py b/diffusers/tests/schedulers/test_scheduler_dpm_multi_inverse.py
new file mode 100644
index 0000000000000000000000000000000000000000..014c901680e3aee1a52e4d9014bfc4d7377080ba
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_dpm_multi_inverse.py
@@ -0,0 +1,267 @@
+import tempfile
+
+import torch
+
+from diffusers import DPMSolverMultistepInverseScheduler, DPMSolverMultistepScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class DPMSolverMultistepSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DPMSolverMultistepInverseScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 25),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "solver_order": 2,
+            "prediction_type": "epsilon",
+            "thresholding": False,
+            "sample_max_value": 1.0,
+            "algorithm_type": "dpmsolver++",
+            "solver_type": "midpoint",
+            "lower_order_final": False,
+            "lambda_min_clipped": -float("inf"),
+            "variance_type": None,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+            # copy over dummy past residuals
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                new_scheduler.set_timesteps(num_inference_steps)
+                # copy over dummy past residuals
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output, new_output = sample, sample
+            for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+                t = scheduler.timesteps[t]
+                output = scheduler.step(residual, t, output, **kwargs).prev_sample
+                new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
+
+                assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        pass
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                # copy over dummy past residuals
+                new_scheduler.set_timesteps(num_inference_steps)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, scheduler=None, **config):
+        if scheduler is None:
+            scheduler_class = self.scheduler_classes[0]
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            time_step_0 = scheduler.timesteps[5]
+            time_step_1 = scheduler.timesteps[6]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [25, 50, 100, 999, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for order in [1, 2, 3]:
+            for solver_type in ["midpoint", "heun"]:
+                for threshold in [0.5, 1.0, 2.0]:
+                    for prediction_type in ["epsilon", "sample"]:
+                        self.check_over_configs(
+                            thresholding=True,
+                            prediction_type=prediction_type,
+                            sample_max_value=threshold,
+                            algorithm_type="dpmsolver++",
+                            solver_order=order,
+                            solver_type=solver_type,
+                        )
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_solver_order_and_type(self):
+        for algorithm_type in ["dpmsolver", "dpmsolver++"]:
+            for solver_type in ["midpoint", "heun"]:
+                for order in [1, 2, 3]:
+                    for prediction_type in ["epsilon", "sample"]:
+                        self.check_over_configs(
+                            solver_order=order,
+                            solver_type=solver_type,
+                            prediction_type=prediction_type,
+                            algorithm_type=algorithm_type,
+                        )
+                        sample = self.full_loop(
+                            solver_order=order,
+                            solver_type=solver_type,
+                            prediction_type=prediction_type,
+                            algorithm_type=algorithm_type,
+                        )
+                        assert not torch.isnan(sample).any(), "Samples have nan numbers"
+
+    def test_lower_order_final(self):
+        self.check_over_configs(lower_order_final=True)
+        self.check_over_configs(lower_order_final=False)
+
+    def test_lambda_min_clipped(self):
+        self.check_over_configs(lambda_min_clipped=-float("inf"))
+        self.check_over_configs(lambda_min_clipped=-5.1)
+
+    def test_variance_type(self):
+        self.check_over_configs(variance_type=None)
+        self.check_over_configs(variance_type="learned_range")
+
+    def test_timestep_spacing(self):
+        for timestep_spacing in ["trailing", "leading"]:
+            self.check_over_configs(timestep_spacing=timestep_spacing)
+
+    def test_inference_steps(self):
+        for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
+            self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.7047) < 1e-3
+
+    def test_full_loop_no_noise_thres(self):
+        sample = self.full_loop(thresholding=True, dynamic_thresholding_ratio=0.87, sample_max_value=0.5)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 19.8933) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 1.5194) < 1e-3
+
+    def test_full_loop_with_karras_and_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 1.7833) < 2e-3
+
+    def test_switch(self):
+        # make sure that iterating over schedulers with same config names gives same results
+        # for defaults
+        scheduler = DPMSolverMultistepInverseScheduler(**self.get_scheduler_config())
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.7047) < 1e-3
+
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+        scheduler = DPMSolverMultistepInverseScheduler.from_config(scheduler.config)
+
+        sample = self.full_loop(scheduler=scheduler)
+        new_result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(new_result_mean.item() - result_mean.item()) < 1e-3
+
+    def test_fp16_support(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.half()
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        assert sample.dtype == torch.float16
+
+    def test_unique_timesteps(self, **config):
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+
+            scheduler.set_timesteps(scheduler.config.num_train_timesteps)
+            assert len(scheduler.timesteps.unique()) == scheduler.num_inference_steps
diff --git a/diffusers/tests/schedulers/test_scheduler_dpm_sde.py b/diffusers/tests/schedulers/test_scheduler_dpm_sde.py
new file mode 100644
index 0000000000000000000000000000000000000000..253a0a478b415f69ffa4f6715dfa5019ec090d62
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_dpm_sde.py
@@ -0,0 +1,167 @@
+import torch
+
+from diffusers import DPMSolverSDEScheduler
+from diffusers.utils.testing_utils import require_torchsde, torch_device
+
+from .test_schedulers import SchedulerCommonTest
+
+
+@require_torchsde
+class DPMSolverSDESchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DPMSolverSDEScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "noise_sampler_seed": 0,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["mps"]:
+            assert abs(result_sum.item() - 167.47821044921875) < 1e-2
+            assert abs(result_mean.item() - 0.2178705964565277) < 1e-3
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 171.59352111816406) < 1e-2
+            assert abs(result_mean.item() - 0.22342906892299652) < 1e-3
+        else:
+            assert abs(result_sum.item() - 162.52383422851562) < 1e-2
+            assert abs(result_mean.item() - 0.211619570851326) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["mps"]:
+            assert abs(result_sum.item() - 124.77149200439453) < 1e-2
+            assert abs(result_mean.item() - 0.16226289014816284) < 1e-3
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 128.1663360595703) < 1e-2
+            assert abs(result_mean.item() - 0.16688326001167297) < 1e-3
+        else:
+            assert abs(result_sum.item() - 119.8487548828125) < 1e-2
+            assert abs(result_mean.item() - 0.1560530662536621) < 1e-3
+
+    def test_full_loop_device(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["mps"]:
+            assert abs(result_sum.item() - 167.46957397460938) < 1e-2
+            assert abs(result_mean.item() - 0.21805934607982635) < 1e-3
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 171.59353637695312) < 1e-2
+            assert abs(result_mean.item() - 0.22342908382415771) < 1e-3
+        else:
+            assert abs(result_sum.item() - 162.52383422851562) < 1e-2
+            assert abs(result_mean.item() - 0.211619570851326) < 1e-3
+
+    def test_full_loop_device_karras_sigmas(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config, use_karras_sigmas=True)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["mps"]:
+            assert abs(result_sum.item() - 176.66974135742188) < 1e-2
+            assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 177.63653564453125) < 1e-2
+            assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+        else:
+            assert abs(result_sum.item() - 170.3135223388672) < 1e-2
+            assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
diff --git a/diffusers/tests/schedulers/test_scheduler_dpm_single.py b/diffusers/tests/schedulers/test_scheduler_dpm_single.py
new file mode 100644
index 0000000000000000000000000000000000000000..873eaecd0a5c346a21b953f4622fc78c475c4fa1
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_dpm_single.py
@@ -0,0 +1,348 @@
+import tempfile
+
+import torch
+
+from diffusers import (
+    DEISMultistepScheduler,
+    DPMSolverMultistepScheduler,
+    DPMSolverSinglestepScheduler,
+    UniPCMultistepScheduler,
+)
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class DPMSolverSinglestepSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DPMSolverSinglestepScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 25),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "solver_order": 2,
+            "prediction_type": "epsilon",
+            "thresholding": False,
+            "sample_max_value": 1.0,
+            "algorithm_type": "dpmsolver++",
+            "solver_type": "midpoint",
+            "lambda_min_clipped": -float("inf"),
+            "variance_type": None,
+            "final_sigmas_type": "sigma_min",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+            # copy over dummy past residuals
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                new_scheduler.set_timesteps(num_inference_steps)
+                # copy over dummy past residuals
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output, new_output = sample, sample
+            for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+                t = scheduler.timesteps[t]
+                output = scheduler.step(residual, t, output, **kwargs).prev_sample
+                new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
+
+                assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        pass
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                # copy over dummy past residuals
+                new_scheduler.set_timesteps(num_inference_steps)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, scheduler=None, **config):
+        if scheduler is None:
+            scheduler_class = self.scheduler_classes[0]
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        return sample
+
+    def full_loop_custom_timesteps(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        scheduler.set_timesteps(num_inference_steps)
+        timesteps = scheduler.timesteps
+        # reset the timesteps using`timesteps`
+        scheduler = scheduler_class(**scheduler_config)
+        scheduler.set_timesteps(num_inference_steps=None, timesteps=timesteps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        return sample
+
+    def test_full_uneven_loop(self):
+        scheduler = DPMSolverSinglestepScheduler(**self.get_scheduler_config())
+        num_inference_steps = 50
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        # make sure that the first t is uneven
+        for i, t in enumerate(scheduler.timesteps[3:]):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2574) < 1e-3
+
+    def test_timesteps(self):
+        for timesteps in [25, 50, 100, 999, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_switch(self):
+        # make sure that iterating over schedulers with same config names gives same results
+        # for defaults
+        scheduler = DPMSolverSinglestepScheduler(**self.get_scheduler_config())
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2791) < 1e-3
+
+        scheduler = DEISMultistepScheduler.from_config(scheduler.config)
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+        scheduler = UniPCMultistepScheduler.from_config(scheduler.config)
+        scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config)
+
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2791) < 1e-3
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for order in [1, 2, 3]:
+            for solver_type in ["midpoint", "heun"]:
+                for threshold in [0.5, 1.0, 2.0]:
+                    for prediction_type in ["epsilon", "sample"]:
+                        self.check_over_configs(
+                            thresholding=True,
+                            prediction_type=prediction_type,
+                            sample_max_value=threshold,
+                            algorithm_type="dpmsolver++",
+                            solver_order=order,
+                            solver_type=solver_type,
+                        )
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_solver_order_and_type(self):
+        for algorithm_type in ["dpmsolver", "dpmsolver++", "sde-dpmsolver++"]:
+            for solver_type in ["midpoint", "heun"]:
+                for order in [1, 2, 3]:
+                    for prediction_type in ["epsilon", "sample"]:
+                        if algorithm_type == "sde-dpmsolver++":
+                            if order == 3:
+                                continue
+                        else:
+                            self.check_over_configs(
+                                solver_order=order,
+                                solver_type=solver_type,
+                                prediction_type=prediction_type,
+                                algorithm_type=algorithm_type,
+                            )
+                        sample = self.full_loop(
+                            solver_order=order,
+                            solver_type=solver_type,
+                            prediction_type=prediction_type,
+                            algorithm_type=algorithm_type,
+                        )
+                        assert not torch.isnan(sample).any(), "Samples have nan numbers"
+
+    def test_lower_order_final(self):
+        self.check_over_configs(lower_order_final=True)
+        self.check_over_configs(lower_order_final=False)
+
+    def test_lambda_min_clipped(self):
+        self.check_over_configs(lambda_min_clipped=-float("inf"))
+        self.check_over_configs(lambda_min_clipped=-5.1)
+
+    def test_variance_type(self):
+        self.check_over_configs(variance_type=None)
+        self.check_over_configs(variance_type="learned_range")
+
+    def test_inference_steps(self):
+        for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
+            self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2791) < 1e-3
+
+    def test_full_loop_with_karras(self):
+        sample = self.full_loop(use_karras_sigmas=True)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2248) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.1453) < 1e-3
+
+    def test_full_loop_with_karras_and_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.0649) < 1e-3
+
+    def test_fp16_support(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.half()
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        assert sample.dtype == torch.float16
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            time_step_0 = scheduler.timesteps[0]
+            time_step_1 = scheduler.timesteps[1]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        t_start = 5
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        # add noise
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 269.2187) < 1e-2, f" expected result sum  269.2187, but get {result_sum}"
+        assert abs(result_mean.item() - 0.3505) < 1e-3, f" expected result mean 0.3505, but get {result_mean}"
+
+    def test_custom_timesteps(self):
+        for prediction_type in ["epsilon", "sample", "v_prediction"]:
+            for lower_order_final in [True, False]:
+                for final_sigmas_type in ["sigma_min", "zero"]:
+                    sample = self.full_loop(
+                        prediction_type=prediction_type,
+                        lower_order_final=lower_order_final,
+                        final_sigmas_type=final_sigmas_type,
+                    )
+                    sample_custom_timesteps = self.full_loop_custom_timesteps(
+                        prediction_type=prediction_type,
+                        lower_order_final=lower_order_final,
+                        final_sigmas_type=final_sigmas_type,
+                    )
+                    assert (
+                        torch.sum(torch.abs(sample - sample_custom_timesteps)) < 1e-5
+                    ), f"Scheduler outputs are not identical for prediction_type: {prediction_type}, lower_order_final: {lower_order_final} and final_sigmas_type: {final_sigmas_type}"
diff --git a/diffusers/tests/schedulers/test_scheduler_edm_dpmsolver_multistep.py b/diffusers/tests/schedulers/test_scheduler_edm_dpmsolver_multistep.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5522f5991f7965d05f50a1ee62231390490f4e0
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_edm_dpmsolver_multistep.py
@@ -0,0 +1,262 @@
+import tempfile
+import unittest
+
+import torch
+
+from diffusers import (
+    EDMDPMSolverMultistepScheduler,
+)
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class EDMDPMSolverMultistepSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (EDMDPMSolverMultistepScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 25),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "sigma_min": 0.002,
+            "sigma_max": 80.0,
+            "sigma_data": 0.5,
+            "num_train_timesteps": 1000,
+            "solver_order": 2,
+            "prediction_type": "epsilon",
+            "thresholding": False,
+            "sample_max_value": 1.0,
+            "algorithm_type": "dpmsolver++",
+            "solver_type": "midpoint",
+            "lower_order_final": False,
+            "euler_at_final": False,
+            "final_sigmas_type": "sigma_min",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+            # copy over dummy past residuals
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                new_scheduler.set_timesteps(num_inference_steps)
+                # copy over dummy past residuals
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output, new_output = sample, sample
+            for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+                t = new_scheduler.timesteps[t]
+                output = scheduler.step(residual, t, output, **kwargs).prev_sample
+                new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
+
+                assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        pass
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                # copy over dummy past residuals
+                new_scheduler.set_timesteps(num_inference_steps)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            time_step = new_scheduler.timesteps[time_step]
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, scheduler=None, **config):
+        if scheduler is None:
+            scheduler_class = self.scheduler_classes[0]
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            time_step_0 = scheduler.timesteps[5]
+            time_step_1 = scheduler.timesteps[6]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [25, 50, 100, 999, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for order in [1, 2, 3]:
+            for solver_type in ["midpoint", "heun"]:
+                for threshold in [0.5, 1.0, 2.0]:
+                    for prediction_type in ["epsilon", "v_prediction"]:
+                        self.check_over_configs(
+                            thresholding=True,
+                            prediction_type=prediction_type,
+                            sample_max_value=threshold,
+                            algorithm_type="dpmsolver++",
+                            solver_order=order,
+                            solver_type=solver_type,
+                        )
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    # TODO (patil-suraj): Fix this test
+    @unittest.skip("Skip for now, as it failing currently but works with the actual model")
+    def test_solver_order_and_type(self):
+        for algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
+            for solver_type in ["midpoint", "heun"]:
+                for order in [1, 2, 3]:
+                    for prediction_type in ["epsilon", "v_prediction"]:
+                        if algorithm_type == "sde-dpmsolver++":
+                            if order == 3:
+                                continue
+                        else:
+                            self.check_over_configs(
+                                solver_order=order,
+                                solver_type=solver_type,
+                                prediction_type=prediction_type,
+                                algorithm_type=algorithm_type,
+                            )
+                        sample = self.full_loop(
+                            solver_order=order,
+                            solver_type=solver_type,
+                            prediction_type=prediction_type,
+                            algorithm_type=algorithm_type,
+                        )
+                        assert (
+                            not torch.isnan(sample).any()
+                        ), f"Samples have nan numbers, {order}, {solver_type}, {prediction_type}, {algorithm_type}"
+
+    def test_lower_order_final(self):
+        self.check_over_configs(lower_order_final=True)
+        self.check_over_configs(lower_order_final=False)
+
+    def test_euler_at_final(self):
+        self.check_over_configs(euler_at_final=True)
+        self.check_over_configs(euler_at_final=False)
+
+    def test_inference_steps(self):
+        for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
+            self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.0001) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        t_start = 5
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        # add noise
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 8.1661) < 1e-2, f" expected result sum 8.1661, but get {result_sum}"
+        assert abs(result_mean.item() - 0.0106) < 1e-3, f" expected result mean 0.0106, but get {result_mean}"
+
+    def test_full_loop_no_noise_thres(self):
+        sample = self.full_loop(thresholding=True, dynamic_thresholding_ratio=0.87, sample_max_value=0.5)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.0080) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.0092) < 1e-3
+
+    def test_duplicated_timesteps(self, **config):
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+
+            scheduler.set_timesteps(scheduler.config.num_train_timesteps)
+            assert len(scheduler.timesteps) == scheduler.num_inference_steps
+
+    def test_trained_betas(self):
+        pass
diff --git a/diffusers/tests/schedulers/test_scheduler_edm_euler.py b/diffusers/tests/schedulers/test_scheduler_edm_euler.py
new file mode 100644
index 0000000000000000000000000000000000000000..acac4b1f4caed2aff6741a40e9214adedc0f744f
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_edm_euler.py
@@ -0,0 +1,203 @@
+import inspect
+import tempfile
+import unittest
+from typing import Dict, List, Tuple
+
+import torch
+
+from diffusers import EDMEulerScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class EDMEulerSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (EDMEulerScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 10),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 256,
+            "sigma_min": 0.002,
+            "sigma_max": 80.0,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_full_loop_no_noise(self, num_inference_steps=10, seed=0):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        for i, t in enumerate(scheduler.timesteps):
+            scaled_sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(scaled_sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 34.1855) < 1e-3
+        assert abs(result_mean.item() - 0.044) < 1e-3
+
+    def test_full_loop_device(self, num_inference_steps=10, seed=0):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        for i, t in enumerate(scheduler.timesteps):
+            scaled_sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(scaled_sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 34.1855) < 1e-3
+        assert abs(result_mean.item() - 0.044) < 1e-3
+
+    # Override test_from_save_pretrained to use EDMEulerScheduler-specific logic
+    def test_from_save_pretrained(self):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+
+            scheduler.set_timesteps(num_inference_steps)
+            new_scheduler.set_timesteps(num_inference_steps)
+            timestep = scheduler.timesteps[0]
+
+            sample = self.dummy_sample
+
+            scaled_sample = scheduler.scale_model_input(sample, timestep)
+            residual = 0.1 * scaled_sample
+
+            new_scaled_sample = new_scheduler.scale_model_input(sample, timestep)
+            new_residual = 0.1 * new_scaled_sample
+
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample
+
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            new_output = new_scheduler.step(new_residual, timestep, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    # Override test_from_save_pretrained to use EDMEulerScheduler-specific logic
+    def test_step_shape(self):
+        num_inference_steps = 10
+
+        scheduler_config = self.get_scheduler_config()
+        scheduler = self.scheduler_classes[0](**scheduler_config)
+
+        scheduler.set_timesteps(num_inference_steps)
+
+        timestep_0 = scheduler.timesteps[0]
+        timestep_1 = scheduler.timesteps[1]
+
+        sample = self.dummy_sample
+        scaled_sample = scheduler.scale_model_input(sample, timestep_0)
+        residual = 0.1 * scaled_sample
+
+        output_0 = scheduler.step(residual, timestep_0, sample).prev_sample
+        output_1 = scheduler.step(residual, timestep_1, sample).prev_sample
+
+        self.assertEqual(output_0.shape, sample.shape)
+        self.assertEqual(output_0.shape, output_1.shape)
+
+    # Override test_from_save_pretrained to use EDMEulerScheduler-specific logic
+    def test_scheduler_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            t[t != t] = 0
+            return t
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    torch.allclose(
+                        set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
+                    ),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
+                    ),
+                )
+
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", 50)
+
+        timestep = 0
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            scheduler.set_timesteps(num_inference_steps)
+            timestep = scheduler.timesteps[0]
+
+            sample = self.dummy_sample
+            scaled_sample = scheduler.scale_model_input(sample, timestep)
+            residual = 0.1 * scaled_sample
+
+            # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            outputs_dict = scheduler.step(residual, timestep, sample, **kwargs)
+
+            scheduler.set_timesteps(num_inference_steps)
+
+            scaled_sample = scheduler.scale_model_input(sample, timestep)
+            residual = 0.1 * scaled_sample
+
+            # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs)
+
+            recursive_check(outputs_tuple, outputs_dict)
+
+    @unittest.skip(reason="EDMEulerScheduler does not support beta schedules.")
+    def test_trained_betas(self):
+        pass
diff --git a/diffusers/tests/schedulers/test_scheduler_euler.py b/diffusers/tests/schedulers/test_scheduler_euler.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbb49b16416596359f1ec665cd5b12d5bd51d7c6
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_euler.py
@@ -0,0 +1,265 @@
+import torch
+
+from diffusers import EulerDiscreteScheduler
+from diffusers.utils.testing_utils import torch_device
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class EulerDiscreteSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (EulerDiscreteScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_timestep_type(self):
+        timestep_types = ["discrete", "continuous"]
+        for timestep_type in timestep_types:
+            self.check_over_configs(timestep_type=timestep_type)
+
+    def test_karras_sigmas(self):
+        self.check_over_configs(use_karras_sigmas=True, sigma_min=0.02, sigma_max=700.0)
+
+    def test_rescale_betas_zero_snr(self):
+        for rescale_betas_zero_snr in [True, False]:
+            self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr)
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = self.num_inference_steps
+        scheduler.set_timesteps(num_inference_steps)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+        return sample
+
+    def full_loop_custom_timesteps(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = self.num_inference_steps
+        scheduler.set_timesteps(num_inference_steps)
+        timesteps = scheduler.timesteps
+        # reset the timesteps using `timesteps`
+        scheduler = scheduler_class(**scheduler_config)
+        scheduler.set_timesteps(num_inference_steps=None, timesteps=timesteps)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+        return sample
+
+    def full_loop_custom_sigmas(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = self.num_inference_steps
+        scheduler.set_timesteps(num_inference_steps)
+        sigmas = scheduler.sigmas
+        # reset the timesteps using `sigmas`
+        scheduler = scheduler_class(**scheduler_config)
+        scheduler.set_timesteps(num_inference_steps=None, sigmas=sigmas)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+        return sample
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 10.0807) < 1e-2
+        assert abs(result_mean.item() - 0.0131) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 0.0002) < 1e-2
+        assert abs(result_mean.item() - 2.2676e-06) < 1e-3
+
+    def test_full_loop_device(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma.cpu()
+        sample = sample.to(torch_device)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 10.0807) < 1e-2
+        assert abs(result_mean.item() - 0.0131) < 1e-3
+
+    def test_full_loop_device_karras_sigmas(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config, use_karras_sigmas=True)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma.cpu()
+        sample = sample.to(torch_device)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 124.52299499511719) < 1e-2
+        assert abs(result_mean.item() - 0.16213932633399963) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        # add noise
+        t_start = self.num_inference_steps - 2
+        noise = self.dummy_noise_deter
+        noise = noise.to(sample.device)
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 57062.9297) < 1e-2, f" expected result sum 57062.9297, but get {result_sum}"
+        assert abs(result_mean.item() - 74.3007) < 1e-3, f" expected result mean 74.3007, but get {result_mean}"
+
+    def test_custom_timesteps(self):
+        for prediction_type in ["epsilon", "sample", "v_prediction"]:
+            for interpolation_type in ["linear", "log_linear"]:
+                for final_sigmas_type in ["sigma_min", "zero"]:
+                    sample = self.full_loop(
+                        prediction_type=prediction_type,
+                        interpolation_type=interpolation_type,
+                        final_sigmas_type=final_sigmas_type,
+                    )
+                    sample_custom_timesteps = self.full_loop_custom_timesteps(
+                        prediction_type=prediction_type,
+                        interpolation_type=interpolation_type,
+                        final_sigmas_type=final_sigmas_type,
+                    )
+                    assert (
+                        torch.sum(torch.abs(sample - sample_custom_timesteps)) < 1e-5
+                    ), f"Scheduler outputs are not identical for prediction_type: {prediction_type}, interpolation_type: {interpolation_type} and final_sigmas_type: {final_sigmas_type}"
+
+    def test_custom_sigmas(self):
+        for prediction_type in ["epsilon", "sample", "v_prediction"]:
+            for final_sigmas_type in ["sigma_min", "zero"]:
+                sample = self.full_loop(
+                    prediction_type=prediction_type,
+                    final_sigmas_type=final_sigmas_type,
+                )
+                sample_custom_timesteps = self.full_loop_custom_sigmas(
+                    prediction_type=prediction_type,
+                    final_sigmas_type=final_sigmas_type,
+                )
+                assert (
+                    torch.sum(torch.abs(sample - sample_custom_timesteps)) < 1e-5
+                ), f"Scheduler outputs are not identical for prediction_type: {prediction_type} and final_sigmas_type: {final_sigmas_type}"
diff --git a/diffusers/tests/schedulers/test_scheduler_euler_ancestral.py b/diffusers/tests/schedulers/test_scheduler_euler_ancestral.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f22ab38ddafa800e3f142f98ba7d26d6c41493a
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_euler_ancestral.py
@@ -0,0 +1,156 @@
+import torch
+
+from diffusers import EulerAncestralDiscreteScheduler
+from diffusers.utils.testing_utils import torch_device
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (EulerAncestralDiscreteScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_rescale_betas_zero_snr(self):
+        for rescale_betas_zero_snr in [True, False]:
+            self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr)
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma.cpu()
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 152.3192) < 1e-2
+        assert abs(result_mean.item() - 0.1983) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 108.4439) < 1e-2
+        assert abs(result_mean.item() - 0.1412) < 1e-3
+
+    def test_full_loop_device(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma.cpu()
+        sample = sample.to(torch_device)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 152.3192) < 1e-2
+        assert abs(result_mean.item() - 0.1983) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        t_start = self.num_inference_steps - 2
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        # add noise
+        noise = self.dummy_noise_deter
+        noise = noise.to(sample.device)
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 56163.0508) < 1e-2, f" expected result sum 56163.0508, but get {result_sum}"
+        assert abs(result_mean.item() - 73.1290) < 1e-3, f" expected result mean  73.1290, but get {result_mean}"
diff --git a/diffusers/tests/schedulers/test_scheduler_flax.py b/diffusers/tests/schedulers/test_scheduler_flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2ee7e13146da7a3b68187bb1d717a7dc6100888
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_flax.py
@@ -0,0 +1,919 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import tempfile
+import unittest
+from typing import Dict, List, Tuple
+
+from diffusers import FlaxDDIMScheduler, FlaxDDPMScheduler, FlaxPNDMScheduler
+from diffusers.utils import is_flax_available
+from diffusers.utils.testing_utils import require_flax
+
+
+if is_flax_available():
+    import jax
+    import jax.numpy as jnp
+    from jax import random
+
+    jax_device = jax.default_backend()
+
+
+@require_flax
+class FlaxSchedulerCommonTest(unittest.TestCase):
+    scheduler_classes = ()
+    forward_default_kwargs = ()
+
+    @property
+    def dummy_sample(self):
+        batch_size = 4
+        num_channels = 3
+        height = 8
+        width = 8
+
+        key1, key2 = random.split(random.PRNGKey(0))
+        sample = random.uniform(key1, (batch_size, num_channels, height, width))
+
+        return sample, key2
+
+    @property
+    def dummy_sample_deter(self):
+        batch_size = 4
+        num_channels = 3
+        height = 8
+        width = 8
+
+        num_elems = batch_size * num_channels * height * width
+        sample = jnp.arange(num_elems)
+        sample = sample.reshape(num_channels, height, width, batch_size)
+        sample = sample / num_elems
+        return jnp.transpose(sample, (3, 0, 1, 2))
+
+    def get_scheduler_config(self):
+        raise NotImplementedError
+
+    def dummy_model(self):
+        def model(sample, t, *args):
+            return sample * t / (t + 1)
+
+        return model
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, key = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, time_step, sample, key, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, time_step, sample, key, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        kwargs.update(forward_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, key = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, time_step, sample, key, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, time_step, sample, key, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, key = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, 1, sample, key, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, 1, sample, key, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, key = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output_0 = scheduler.step(state, residual, 0, sample, key, **kwargs).prev_sample
+            output_1 = scheduler.step(state, residual, 1, sample, key, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_scheduler_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            return t.at[t != t].set(0)
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}."
+                    ),
+                )
+
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, key = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_dict = scheduler.step(state, residual, 0, sample, key, **kwargs)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_tuple = scheduler.step(state, residual, 0, sample, key, return_dict=False, **kwargs)
+
+            recursive_check(outputs_tuple[0], outputs_dict.prev_sample)
+
+    def test_deprecated_kwargs(self):
+        for scheduler_class in self.scheduler_classes:
+            has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters
+            has_deprecated_kwarg = len(scheduler_class._deprecated_kwargs) > 0
+
+            if has_kwarg_in_model_class and not has_deprecated_kwarg:
+                raise ValueError(
+                    f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated"
+                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if"
+                    " there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
+                    " [<deprecated_argument>]`"
+                )
+
+            if not has_kwarg_in_model_class and has_deprecated_kwarg:
+                raise ValueError(
+                    f"{scheduler_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated"
+                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs`"
+                    f" argument to {self.model_class}.__init__ if there are deprecated arguments or remove the"
+                    " deprecated argument from `_deprecated_kwargs = [<deprecated_argument>]`"
+                )
+
+
+@require_flax
+class FlaxDDPMSchedulerTest(FlaxSchedulerCommonTest):
+    scheduler_classes = (FlaxDDPMScheduler,)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "variance_type": "fixed_small",
+            "clip_sample": True,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [1, 5, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_variance_type(self):
+        for variance in ["fixed_small", "fixed_large", "other"]:
+            self.check_over_configs(variance_type=variance)
+
+    def test_clip_sample(self):
+        for clip_sample in [True, False]:
+            self.check_over_configs(clip_sample=clip_sample)
+
+    def test_time_indices(self):
+        for t in [0, 500, 999]:
+            self.check_over_forward(time_step=t)
+
+    def test_variance(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0) - 0.0)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 487) - 0.00979)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 999) - 0.02)) < 1e-5
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+
+        num_trained_timesteps = len(scheduler)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        key1, key2 = random.split(random.PRNGKey(0))
+
+        for t in reversed(range(num_trained_timesteps)):
+            # 1. predict noise residual
+            residual = model(sample, t)
+
+            # 2. predict previous mean of sample x_t-1
+            output = scheduler.step(state, residual, t, sample, key1)
+            pred_prev_sample = output.prev_sample
+            state = output.state
+            key1, key2 = random.split(key2)
+
+            # if t > 0:
+            #     noise = self.dummy_sample_deter
+            #     variance = scheduler.get_variance(t) ** (0.5) * noise
+            #
+            # sample = pred_prev_sample + variance
+            sample = pred_prev_sample
+
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        if jax_device == "tpu":
+            assert abs(result_sum - 255.0714) < 1e-2
+            assert abs(result_mean - 0.332124) < 1e-3
+        else:
+            assert abs(result_sum - 255.1113) < 1e-1
+            assert abs(result_mean - 0.332176) < 1e-3
+
+
+@require_flax
+class FlaxDDIMSchedulerTest(FlaxSchedulerCommonTest):
+    scheduler_classes = (FlaxDDIMScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 50),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+        key1, key2 = random.split(random.PRNGKey(0))
+
+        num_inference_steps = 10
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+
+        state = scheduler.set_timesteps(state, num_inference_steps)
+
+        for t in state.timesteps:
+            residual = model(sample, t)
+            output = scheduler.step(state, residual, t, sample)
+            sample = output.prev_sample
+            state = output.state
+            key1, key2 = random.split(key2)
+
+        return sample
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, time_step, sample, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, 1, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, 1, sample, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        kwargs.update(forward_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, time_step, sample, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_scheduler_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            return t.at[t != t].set(0)
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}."
+                    ),
+                )
+
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_dict = scheduler.step(state, residual, 0, sample, **kwargs)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_tuple = scheduler.step(state, residual, 0, sample, return_dict=False, **kwargs)
+
+            recursive_check(outputs_tuple[0], outputs_dict.prev_sample)
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output_0 = scheduler.step(state, residual, 0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(state, residual, 1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [100, 500, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_steps_offset(self):
+        for steps_offset in [0, 1]:
+            self.check_over_configs(steps_offset=steps_offset)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(steps_offset=1)
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+        state = scheduler.set_timesteps(state, 5)
+        assert jnp.equal(state.timesteps, jnp.array([801, 601, 401, 201, 1])).all()
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_time_indices(self):
+        for t in [1, 10, 49]:
+            self.check_over_forward(time_step=t)
+
+    def test_inference_steps(self):
+        for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]):
+            self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps)
+
+    def test_variance(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0, 0) - 0.0)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 420, 400) - 0.14771)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 980, 960) - 0.32460)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0, 0) - 0.0)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 487, 486) - 0.00979)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 999, 998) - 0.02)) < 1e-5
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        assert abs(result_sum - 172.0067) < 1e-2
+        assert abs(result_mean - 0.223967) < 1e-3
+
+    def test_full_loop_with_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01)
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        if jax_device == "tpu":
+            assert abs(result_sum - 149.8409) < 1e-2
+            assert abs(result_mean - 0.1951) < 1e-3
+        else:
+            assert abs(result_sum - 149.8295) < 1e-2
+            assert abs(result_mean - 0.1951) < 1e-3
+
+    def test_full_loop_with_no_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01)
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        if jax_device == "tpu":
+            pass
+            # FIXME: both result_sum and result_mean are nan on TPU
+            # assert jnp.isnan(result_sum)
+            # assert jnp.isnan(result_mean)
+        else:
+            assert abs(result_sum - 149.0784) < 1e-2
+            assert abs(result_mean - 0.1941) < 1e-3
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "sample", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+
+@require_flax
+class FlaxPNDMSchedulerTest(FlaxSchedulerCommonTest):
+    scheduler_classes = (FlaxPNDMScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 50),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample, _ = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05])
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+            state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+            # copy over dummy past residuals
+            state = state.replace(ets=dummy_past_residuals[:])
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps, shape=sample.shape)
+                # copy over dummy past residuals
+                new_state = new_state.replace(ets=dummy_past_residuals[:])
+
+            (prev_sample, state) = scheduler.step_prk(state, residual, time_step, sample, **kwargs)
+            (new_prev_sample, new_state) = new_scheduler.step_prk(new_state, residual, time_step, sample, **kwargs)
+
+            assert jnp.sum(jnp.abs(prev_sample - new_prev_sample)) < 1e-5, "Scheduler outputs are not identical"
+
+            output, _ = scheduler.step_plms(state, residual, time_step, sample, **kwargs)
+            new_output, _ = new_scheduler.step_plms(new_state, residual, time_step, sample, **kwargs)
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        pass
+
+    def test_scheduler_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            return t.at[t != t].set(0)
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}."
+                    ),
+                )
+
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_dict = scheduler.step(state, residual, 0, sample, **kwargs)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_tuple = scheduler.step(state, residual, 0, sample, return_dict=False, **kwargs)
+
+            recursive_check(outputs_tuple[0], outputs_dict.prev_sample)
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample, _ = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05])
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+            state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.ets = dummy_past_residuals[:]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
+                # copy over dummy past residuals
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps, shape=sample.shape)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_state.replace(ets=dummy_past_residuals[:])
+
+            output, state = scheduler.step_prk(state, residual, time_step, sample, **kwargs)
+            new_output, new_state = new_scheduler.step_prk(new_state, residual, time_step, sample, **kwargs)
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+            output, _ = scheduler.step_plms(state, residual, time_step, sample, **kwargs)
+            new_output, _ = new_scheduler.step_plms(new_state, residual, time_step, sample, **kwargs)
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+
+        for i, t in enumerate(state.prk_timesteps):
+            residual = model(sample, t)
+            sample, state = scheduler.step_prk(state, residual, t, sample)
+
+        for i, t in enumerate(state.plms_timesteps):
+            residual = model(sample, t)
+            sample, state = scheduler.step_plms(state, residual, t, sample)
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05])
+            state = state.replace(ets=dummy_past_residuals[:])
+
+            output_0, state = scheduler.step_prk(state, residual, 0, sample, **kwargs)
+            output_1, state = scheduler.step_prk(state, residual, 1, sample, **kwargs)
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+            output_0, state = scheduler.step_plms(state, residual, 0, sample, **kwargs)
+            output_1, state = scheduler.step_plms(state, residual, 1, sample, **kwargs)
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_steps_offset(self):
+        for steps_offset in [0, 1]:
+            self.check_over_configs(steps_offset=steps_offset)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(steps_offset=1)
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+        state = scheduler.set_timesteps(state, 10, shape=())
+        assert jnp.equal(
+            state.timesteps,
+            jnp.array([901, 851, 851, 801, 801, 751, 751, 701, 701, 651, 651, 601, 601, 501, 401, 301, 201, 101, 1]),
+        ).all()
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_time_indices(self):
+        for t in [1, 5, 10]:
+            self.check_over_forward(time_step=t)
+
+    def test_inference_steps(self):
+        for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]):
+            self.check_over_forward(num_inference_steps=num_inference_steps)
+
+    def test_pow_of_3_inference_steps(self):
+        # earlier version of set_timesteps() caused an error indexing alpha's with inference steps as power of 3
+        num_inference_steps = 27
+
+        for scheduler_class in self.scheduler_classes:
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+
+            # before power of 3 fix, would error on first step, so we only need to do two
+            for i, t in enumerate(state.prk_timesteps[:2]):
+                sample, state = scheduler.step_prk(state, residual, t, sample)
+
+    def test_inference_plms_no_past_residuals(self):
+        with self.assertRaises(ValueError):
+            scheduler_class = self.scheduler_classes[0]
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            scheduler.step_plms(state, self.dummy_sample, 1, self.dummy_sample).prev_sample
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        if jax_device == "tpu":
+            assert abs(result_sum - 198.1275) < 1e-2
+            assert abs(result_mean - 0.2580) < 1e-3
+        else:
+            assert abs(result_sum - 198.1318) < 1e-2
+            assert abs(result_mean - 0.2580) < 1e-3
+
+    def test_full_loop_with_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01)
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        if jax_device == "tpu":
+            assert abs(result_sum - 186.83226) < 1e-2
+            assert abs(result_mean - 0.24327) < 1e-3
+        else:
+            assert abs(result_sum - 186.9466) < 1e-2
+            assert abs(result_mean - 0.24342) < 1e-3
+
+    def test_full_loop_with_no_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01)
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        if jax_device == "tpu":
+            assert abs(result_sum - 186.83226) < 1e-2
+            assert abs(result_mean - 0.24327) < 1e-3
+        else:
+            assert abs(result_sum - 186.9482) < 1e-2
+            assert abs(result_mean - 0.2434) < 1e-3
diff --git a/diffusers/tests/schedulers/test_scheduler_heun.py b/diffusers/tests/schedulers/test_scheduler_heun.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3689ef2ea635c6c2c143f389c7c0f96b12e7134
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_heun.py
@@ -0,0 +1,221 @@
+import torch
+
+from diffusers import HeunDiscreteScheduler
+from diffusers.utils.testing_utils import torch_device
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class HeunDiscreteSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (HeunDiscreteScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear", "exp"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_clip_sample(self):
+        for clip_sample_range in [1.0, 2.0, 3.0]:
+            self.check_over_configs(clip_sample_range=clip_sample_range, clip_sample=True)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction", "sample"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = self.num_inference_steps
+        scheduler.set_timesteps(num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        return sample
+
+    def full_loop_custom_timesteps(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = self.num_inference_steps
+        scheduler.set_timesteps(num_inference_steps)
+        timesteps = scheduler.timesteps
+        timesteps = torch.cat([timesteps[:1], timesteps[1::2]])
+        # reset the timesteps using `timesteps`
+        scheduler = scheduler_class(**scheduler_config)
+        scheduler.set_timesteps(num_inference_steps=None, timesteps=timesteps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        return sample
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["cpu", "mps"]:
+            assert abs(result_sum.item() - 0.1233) < 1e-2
+            assert abs(result_mean.item() - 0.0002) < 1e-3
+        else:
+            # CUDA
+            assert abs(result_sum.item() - 0.1233) < 1e-2
+            assert abs(result_mean.item() - 0.0002) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["cpu", "mps"]:
+            assert abs(result_sum.item() - 4.6934e-07) < 1e-2
+            assert abs(result_mean.item() - 6.1112e-10) < 1e-3
+        else:
+            # CUDA
+            assert abs(result_sum.item() - 4.693428650170972e-07) < 1e-2
+            assert abs(result_mean.item() - 0.0002) < 1e-3
+
+    def test_full_loop_device(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if str(torch_device).startswith("cpu"):
+            # The following sum varies between 148 and 156 on mps. Why?
+            assert abs(result_sum.item() - 0.1233) < 1e-2
+            assert abs(result_mean.item() - 0.0002) < 1e-3
+        elif str(torch_device).startswith("mps"):
+            # Larger tolerance on mps
+            assert abs(result_mean.item() - 0.0002) < 1e-2
+        else:
+            # CUDA
+            assert abs(result_sum.item() - 0.1233) < 1e-2
+            assert abs(result_mean.item() - 0.0002) < 1e-3
+
+    def test_full_loop_device_karras_sigmas(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config, use_karras_sigmas=True)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 0.00015) < 1e-2
+        assert abs(result_mean.item() - 1.9869554535034695e-07) < 1e-2
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        t_start = self.num_inference_steps - 2
+        noise = self.dummy_noise_deter
+        noise = noise.to(torch_device)
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 75074.8906) < 1e-2, f" expected result sum 75074.8906, but get {result_sum}"
+        assert abs(result_mean.item() - 97.7538) < 1e-3, f" expected result mean 97.7538, but get {result_mean}"
+
+    def test_custom_timesteps(self):
+        for prediction_type in ["epsilon", "sample", "v_prediction"]:
+            for timestep_spacing in ["linspace", "leading"]:
+                sample = self.full_loop(
+                    prediction_type=prediction_type,
+                    timestep_spacing=timestep_spacing,
+                )
+                sample_custom_timesteps = self.full_loop_custom_timesteps(
+                    prediction_type=prediction_type,
+                    timestep_spacing=timestep_spacing,
+                )
+                assert (
+                    torch.sum(torch.abs(sample - sample_custom_timesteps)) < 1e-5
+                ), f"Scheduler outputs are not identical for prediction_type: {prediction_type}, timestep_spacing: {timestep_spacing}"
diff --git a/diffusers/tests/schedulers/test_scheduler_ipndm.py b/diffusers/tests/schedulers/test_scheduler_ipndm.py
new file mode 100644
index 0000000000000000000000000000000000000000..87c8da3ee3c15a7ca77042eb47a44710c1a76c6e
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_ipndm.py
@@ -0,0 +1,163 @@
+import tempfile
+
+import torch
+
+from diffusers import IPNDMScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class IPNDMSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (IPNDMScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 50),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {"num_train_timesteps": 1000}
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+            # copy over dummy past residuals
+            scheduler.ets = dummy_past_residuals[:]
+
+            if time_step is None:
+                time_step = scheduler.timesteps[len(scheduler.timesteps) // 2]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                new_scheduler.set_timesteps(num_inference_steps)
+                # copy over dummy past residuals
+                new_scheduler.ets = dummy_past_residuals[:]
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        pass
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.ets = dummy_past_residuals[:]
+
+            if time_step is None:
+                time_step = scheduler.timesteps[len(scheduler.timesteps) // 2]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                # copy over dummy past residuals
+                new_scheduler.set_timesteps(num_inference_steps)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_scheduler.ets = dummy_past_residuals[:]
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        scheduler._step_index = None
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]
+            scheduler.ets = dummy_past_residuals[:]
+
+            time_step_0 = scheduler.timesteps[5]
+            time_step_1 = scheduler.timesteps[6]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps, time_step=None)
+
+    def test_inference_steps(self):
+        for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]):
+            self.check_over_forward(num_inference_steps=num_inference_steps, time_step=None)
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 2540529) < 10
diff --git a/diffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py b/diffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6e8e96e084a69bf6c2def3ca9da821bfa754c6a
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py
@@ -0,0 +1,158 @@
+import torch
+
+from diffusers import KDPM2AncestralDiscreteScheduler
+from diffusers.utils.testing_utils import torch_device
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class KDPM2AncestralDiscreteSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (KDPM2AncestralDiscreteScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_full_loop_no_noise(self):
+        if torch_device == "mps":
+            return
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 13849.3877) < 1e-2
+        assert abs(result_mean.item() - 18.0331) < 5e-3
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_full_loop_with_v_prediction(self):
+        if torch_device == "mps":
+            return
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        generator = torch.manual_seed(0)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 328.9970) < 1e-2
+        assert abs(result_mean.item() - 0.4284) < 1e-3
+
+    def test_full_loop_device(self):
+        if torch_device == "mps":
+            return
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 13849.3818) < 1e-1
+        assert abs(result_mean.item() - 18.0331) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        if torch_device == "mps":
+            return
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        generator = torch.manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        # add noise
+        t_start = self.num_inference_steps - 2
+        noise = self.dummy_noise_deter
+        noise = noise.to(sample.device)
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 93087.0312) < 1e-2, f" expected result sum 93087.0312, but get {result_sum}"
+        assert abs(result_mean.item() - 121.2071) < 5e-3, f" expected result mean 121.2071, but get {result_mean}"
diff --git a/diffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py b/diffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..a992edcd95516f31e0c3840d159c98077020b549
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py
@@ -0,0 +1,166 @@
+import torch
+
+from diffusers import KDPM2DiscreteScheduler
+from diffusers.utils.testing_utils import torch_device
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class KDPM2DiscreteSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (KDPM2DiscreteScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_full_loop_with_v_prediction(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["cpu", "mps"]:
+            assert abs(result_sum.item() - 4.6934e-07) < 1e-2
+            assert abs(result_mean.item() - 6.1112e-10) < 1e-3
+        else:
+            # CUDA
+            assert abs(result_sum.item() - 4.693428650170972e-07) < 1e-2
+            assert abs(result_mean.item() - 0.0002) < 1e-3
+
+    def test_full_loop_no_noise(self):
+        if torch_device == "mps":
+            return
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["cpu", "mps"]:
+            assert abs(result_sum.item() - 20.4125) < 1e-2
+            assert abs(result_mean.item() - 0.0266) < 1e-3
+        else:
+            # CUDA
+            assert abs(result_sum.item() - 20.4125) < 1e-2
+            assert abs(result_mean.item() - 0.0266) < 1e-3
+
+    def test_full_loop_device(self):
+        if torch_device == "mps":
+            return
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if str(torch_device).startswith("cpu"):
+            # The following sum varies between 148 and 156 on mps. Why?
+            assert abs(result_sum.item() - 20.4125) < 1e-2
+            assert abs(result_mean.item() - 0.0266) < 1e-3
+        else:
+            # CUDA
+            assert abs(result_sum.item() - 20.4125) < 1e-2
+            assert abs(result_mean.item() - 0.0266) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        if torch_device == "mps":
+            return
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        # add noise
+        t_start = self.num_inference_steps - 2
+        noise = self.dummy_noise_deter
+        noise = noise.to(sample.device)
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 70408.4062) < 1e-2, f" expected result sum 70408.4062, but get {result_sum}"
+        assert abs(result_mean.item() - 91.6776) < 1e-3, f" expected result mean 91.6776, but get {result_mean}"
diff --git a/diffusers/tests/schedulers/test_scheduler_lcm.py b/diffusers/tests/schedulers/test_scheduler_lcm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2c6530faa118951eabe53226fd924b988d925ca
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_lcm.py
@@ -0,0 +1,300 @@
+import tempfile
+from typing import Dict, List, Tuple
+
+import torch
+
+from diffusers import LCMScheduler
+from diffusers.utils.testing_utils import torch_device
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class LCMSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (LCMScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 10),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.00085,
+            "beta_end": 0.0120,
+            "beta_schedule": "scaled_linear",
+            "prediction_type": "epsilon",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    @property
+    def default_valid_timestep(self):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        scheduler_config = self.get_scheduler_config()
+        scheduler = self.scheduler_classes[0](**scheduler_config)
+
+        scheduler.set_timesteps(num_inference_steps)
+        timestep = scheduler.timesteps[-1]
+        return timestep
+
+    def test_timesteps(self):
+        for timesteps in [100, 500, 1000]:
+            # 0 is not guaranteed to be in the timestep schedule, but timesteps - 1 is
+            self.check_over_configs(time_step=timesteps - 1, num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(time_step=self.default_valid_timestep, beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(time_step=self.default_valid_timestep, beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(time_step=self.default_valid_timestep, prediction_type=prediction_type)
+
+    def test_clip_sample(self):
+        for clip_sample in [True, False]:
+            self.check_over_configs(time_step=self.default_valid_timestep, clip_sample=clip_sample)
+
+    def test_thresholding(self):
+        self.check_over_configs(time_step=self.default_valid_timestep, thresholding=False)
+        for threshold in [0.5, 1.0, 2.0]:
+            for prediction_type in ["epsilon", "v_prediction"]:
+                self.check_over_configs(
+                    time_step=self.default_valid_timestep,
+                    thresholding=True,
+                    prediction_type=prediction_type,
+                    sample_max_value=threshold,
+                )
+
+    def test_time_indices(self):
+        # Get default timestep schedule.
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        scheduler_config = self.get_scheduler_config()
+        scheduler = self.scheduler_classes[0](**scheduler_config)
+
+        scheduler.set_timesteps(num_inference_steps)
+        timesteps = scheduler.timesteps
+        for t in timesteps:
+            self.check_over_forward(time_step=t)
+
+    def test_inference_steps(self):
+        # Hardcoded for now
+        for t, num_inference_steps in zip([99, 39, 39, 19], [10, 25, 26, 50]):
+            self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps)
+
+    # Override test_add_noise_device because the hardcoded num_inference_steps of 100 doesn't work
+    # for LCMScheduler under default settings
+    def test_add_noise_device(self, num_inference_steps=10):
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            sample = self.dummy_sample.to(torch_device)
+            scaled_sample = scheduler.scale_model_input(sample, 0.0)
+            self.assertEqual(sample.shape, scaled_sample.shape)
+
+            noise = torch.randn_like(scaled_sample).to(torch_device)
+            t = scheduler.timesteps[5][None]
+            noised = scheduler.add_noise(scaled_sample, noise, t)
+            self.assertEqual(noised.shape, scaled_sample.shape)
+
+    # Override test_from_save_pretrained because it hardcodes a timestep of 1
+    def test_from_save_pretrained(self):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            timestep = self.default_valid_timestep
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+
+            scheduler.set_timesteps(num_inference_steps)
+            new_scheduler.set_timesteps(num_inference_steps)
+
+            kwargs["generator"] = torch.manual_seed(0)
+            output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample
+
+            kwargs["generator"] = torch.manual_seed(0)
+            new_output = new_scheduler.step(residual, timestep, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    # Override test_step_shape because uses 0 and 1 as hardcoded timesteps
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler.set_timesteps(num_inference_steps)
+
+            timestep_0 = scheduler.timesteps[-2]
+            timestep_1 = scheduler.timesteps[-1]
+
+            output_0 = scheduler.step(residual, timestep_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, timestep_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    # Override test_set_scheduler_outputs_equivalence since it uses 0 as a hardcoded timestep
+    def test_scheduler_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            t[t != t] = 0
+            return t
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    torch.allclose(
+                        set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
+                    ),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
+                    ),
+                )
+
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", 50)
+
+        timestep = self.default_valid_timestep
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler.set_timesteps(num_inference_steps)
+            kwargs["generator"] = torch.manual_seed(0)
+            outputs_dict = scheduler.step(residual, timestep, sample, **kwargs)
+
+            scheduler.set_timesteps(num_inference_steps)
+            kwargs["generator"] = torch.manual_seed(0)
+            outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs)
+
+            recursive_check(outputs_tuple, outputs_dict)
+
+    def full_loop(self, num_inference_steps=10, seed=0, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        generator = torch.manual_seed(seed)
+
+        scheduler.set_timesteps(num_inference_steps)
+
+        for t in scheduler.timesteps:
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample, generator).prev_sample
+
+        return sample
+
+    def test_full_loop_onestep(self):
+        sample = self.full_loop(num_inference_steps=1)
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        # TODO: get expected sum and mean
+        assert abs(result_sum.item() - 18.7097) < 1e-3
+        assert abs(result_mean.item() - 0.0244) < 1e-3
+
+    def test_full_loop_multistep(self):
+        sample = self.full_loop(num_inference_steps=10)
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        # TODO: get expected sum and mean
+        assert abs(result_sum.item() - 197.7616) < 1e-3
+        assert abs(result_mean.item() - 0.2575) < 1e-3
+
+    def test_custom_timesteps(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 1, 0]
+
+        scheduler.set_timesteps(timesteps=timesteps)
+
+        scheduler_timesteps = scheduler.timesteps
+
+        for i, timestep in enumerate(scheduler_timesteps):
+            if i == len(timesteps) - 1:
+                expected_prev_t = -1
+            else:
+                expected_prev_t = timesteps[i + 1]
+
+            prev_t = scheduler.previous_timestep(timestep)
+            prev_t = prev_t.item()
+
+            self.assertEqual(prev_t, expected_prev_t)
+
+    def test_custom_timesteps_increasing_order(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 51, 0]
+
+        with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."):
+            scheduler.set_timesteps(timesteps=timesteps)
+
+    def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 1, 0]
+        num_inference_steps = len(timesteps)
+
+        with self.assertRaises(ValueError, msg="Can only pass one of `num_inference_steps` or `custom_timesteps`."):
+            scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps)
+
+    def test_custom_timesteps_too_large(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [scheduler.config.num_train_timesteps]
+
+        with self.assertRaises(
+            ValueError,
+            msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}",
+        ):
+            scheduler.set_timesteps(timesteps=timesteps)
diff --git a/diffusers/tests/schedulers/test_scheduler_lms.py b/diffusers/tests/schedulers/test_scheduler_lms.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c163ce9fe7a0ddd12cc9bd4618a4dacbf97600e
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_lms.py
@@ -0,0 +1,170 @@
+import torch
+
+from diffusers import LMSDiscreteScheduler
+from diffusers.utils.testing_utils import torch_device
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class LMSDiscreteSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (LMSDiscreteScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_time_indices(self):
+        for t in [0, 500, 800]:
+            self.check_over_forward(time_step=t)
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 1006.388) < 1e-2
+        assert abs(result_mean.item() - 1.31) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 0.0017) < 1e-2
+        assert abs(result_mean.item() - 2.2676e-06) < 1e-3
+
+    def test_full_loop_device(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma.cpu()
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 1006.388) < 1e-2
+        assert abs(result_mean.item() - 1.31) < 1e-3
+
+    def test_full_loop_device_karras_sigmas(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config, use_karras_sigmas=True)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 3812.9927) < 2e-2
+        assert abs(result_mean.item() - 4.9648) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        # add noise
+        t_start = self.num_inference_steps - 2
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 27663.6895) < 1e-2
+        assert abs(result_mean.item() - 36.0204) < 1e-3
diff --git a/diffusers/tests/schedulers/test_scheduler_pndm.py b/diffusers/tests/schedulers/test_scheduler_pndm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1519f7c7e8e113aca61c8749c3a08f6f390309f
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_pndm.py
@@ -0,0 +1,242 @@
+import tempfile
+
+import torch
+
+from diffusers import PNDMScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class PNDMSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (PNDMScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 50),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+            # copy over dummy past residuals
+            scheduler.ets = dummy_past_residuals[:]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                new_scheduler.set_timesteps(num_inference_steps)
+                # copy over dummy past residuals
+                new_scheduler.ets = dummy_past_residuals[:]
+
+            output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+            output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        pass
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.ets = dummy_past_residuals[:]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                # copy over dummy past residuals
+                new_scheduler.set_timesteps(num_inference_steps)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_scheduler.ets = dummy_past_residuals[:]
+
+            output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+            output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.prk_timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step_prk(residual, t, sample).prev_sample
+
+        for i, t in enumerate(scheduler.plms_timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step_plms(residual, t, sample).prev_sample
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]
+            scheduler.ets = dummy_past_residuals[:]
+
+            output_0 = scheduler.step_prk(residual, 0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step_prk(residual, 1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+            output_0 = scheduler.step_plms(residual, 0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step_plms(residual, 1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_steps_offset(self):
+        for steps_offset in [0, 1]:
+            self.check_over_configs(steps_offset=steps_offset)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(steps_offset=1)
+        scheduler = scheduler_class(**scheduler_config)
+        scheduler.set_timesteps(10)
+        assert torch.equal(
+            scheduler.timesteps,
+            torch.LongTensor(
+                [901, 851, 851, 801, 801, 751, 751, 701, 701, 651, 651, 601, 601, 501, 401, 301, 201, 101, 1]
+            ),
+        )
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_time_indices(self):
+        for t in [1, 5, 10]:
+            self.check_over_forward(time_step=t)
+
+    def test_inference_steps(self):
+        for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]):
+            self.check_over_forward(num_inference_steps=num_inference_steps)
+
+    def test_pow_of_3_inference_steps(self):
+        # earlier version of set_timesteps() caused an error indexing alpha's with inference steps as power of 3
+        num_inference_steps = 27
+
+        for scheduler_class in self.scheduler_classes:
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            scheduler.set_timesteps(num_inference_steps)
+
+            # before power of 3 fix, would error on first step, so we only need to do two
+            for i, t in enumerate(scheduler.prk_timesteps[:2]):
+                sample = scheduler.step_prk(residual, t, sample).prev_sample
+
+    def test_inference_plms_no_past_residuals(self):
+        with self.assertRaises(ValueError):
+            scheduler_class = self.scheduler_classes[0]
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            scheduler.step_plms(self.dummy_sample, 1, self.dummy_sample).prev_sample
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 198.1318) < 1e-2
+        assert abs(result_mean.item() - 0.2580) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 67.3986) < 1e-2
+        assert abs(result_mean.item() - 0.0878) < 1e-3
+
+    def test_full_loop_with_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01)
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 230.0399) < 1e-2
+        assert abs(result_mean.item() - 0.2995) < 1e-3
+
+    def test_full_loop_with_no_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01)
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 186.9482) < 1e-2
+        assert abs(result_mean.item() - 0.2434) < 1e-3
diff --git a/diffusers/tests/schedulers/test_scheduler_sasolver.py b/diffusers/tests/schedulers/test_scheduler_sasolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..574194632df0034da40eca325b073881c443ebb8
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_sasolver.py
@@ -0,0 +1,202 @@
+import torch
+
+from diffusers import SASolverScheduler
+from diffusers.utils.testing_utils import require_torchsde, torch_device
+
+from .test_schedulers import SchedulerCommonTest
+
+
+@require_torchsde
+class SASolverSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (SASolverScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 10),)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+            scheduler.model_outputs = dummy_past_residuals[
+                : max(
+                    scheduler.config.predictor_order,
+                    scheduler.config.corrector_order - 1,
+                )
+            ]
+
+            time_step_0 = scheduler.timesteps[5]
+            time_step_1 = scheduler.timesteps[6]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+        generator = torch.manual_seed(0)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t, generator=generator)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["cpu"]:
+            assert abs(result_sum.item() - 337.394287109375) < 1e-2
+            assert abs(result_mean.item() - 0.43931546807289124) < 1e-3
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 329.1999816894531) < 1e-2
+            assert abs(result_mean.item() - 0.4286458194255829) < 1e-3
+        else:
+            print("None")
+
+    def test_full_loop_with_v_prediction(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+        generator = torch.manual_seed(0)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t, generator=generator)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["cpu"]:
+            assert abs(result_sum.item() - 193.1467742919922) < 1e-2
+            assert abs(result_mean.item() - 0.2514931857585907) < 1e-3
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 193.4154052734375) < 1e-2
+            assert abs(result_mean.item() - 0.2518429756164551) < 1e-3
+        else:
+            print("None")
+
+    def test_full_loop_device(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+        generator = torch.manual_seed(0)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["cpu"]:
+            assert abs(result_sum.item() - 337.394287109375) < 1e-2
+            assert abs(result_mean.item() - 0.43931546807289124) < 1e-3
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 337.394287109375) < 1e-2
+            assert abs(result_mean.item() - 0.4393154978752136) < 1e-3
+        else:
+            print("None")
+
+    def test_full_loop_device_karras_sigmas(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config, use_karras_sigmas=True)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+        generator = torch.manual_seed(0)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["cpu"]:
+            assert abs(result_sum.item() - 837.2554931640625) < 1e-2
+            assert abs(result_mean.item() - 1.0901764631271362) < 1e-2
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 837.25537109375) < 1e-2
+            assert abs(result_mean.item() - 1.0901763439178467) < 1e-2
+        else:
+            print("None")
diff --git a/diffusers/tests/schedulers/test_scheduler_score_sde_ve.py b/diffusers/tests/schedulers/test_scheduler_score_sde_ve.py
new file mode 100644
index 0000000000000000000000000000000000000000..08c30f9b1e0c2ce1f7baab82f5076efabe465a69
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_score_sde_ve.py
@@ -0,0 +1,189 @@
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import ScoreSdeVeScheduler
+
+
+class ScoreSdeVeSchedulerTest(unittest.TestCase):
+    # TODO adapt with class SchedulerCommonTest (scheduler needs Numpy Integration)
+    scheduler_classes = (ScoreSdeVeScheduler,)
+    forward_default_kwargs = ()
+
+    @property
+    def dummy_sample(self):
+        batch_size = 4
+        num_channels = 3
+        height = 8
+        width = 8
+
+        sample = torch.rand((batch_size, num_channels, height, width))
+
+        return sample
+
+    @property
+    def dummy_sample_deter(self):
+        batch_size = 4
+        num_channels = 3
+        height = 8
+        width = 8
+
+        num_elems = batch_size * num_channels * height * width
+        sample = torch.arange(num_elems)
+        sample = sample.reshape(num_channels, height, width, batch_size)
+        sample = sample / num_elems
+        sample = sample.permute(3, 0, 1, 2)
+
+        return sample
+
+    def dummy_model(self):
+        def model(sample, t, *args):
+            return sample * t / (t + 1)
+
+        return model
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 2000,
+            "snr": 0.15,
+            "sigma_min": 0.01,
+            "sigma_max": 1348,
+            "sampling_eps": 1e-5,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+
+        for scheduler_class in self.scheduler_classes:
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+
+            output = scheduler.step_pred(
+                residual, time_step, sample, generator=torch.manual_seed(0), **kwargs
+            ).prev_sample
+            new_output = new_scheduler.step_pred(
+                residual, time_step, sample, generator=torch.manual_seed(0), **kwargs
+            ).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+            output = scheduler.step_correct(residual, sample, generator=torch.manual_seed(0), **kwargs).prev_sample
+            new_output = new_scheduler.step_correct(
+                residual, sample, generator=torch.manual_seed(0), **kwargs
+            ).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical"
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        kwargs.update(forward_kwargs)
+
+        for scheduler_class in self.scheduler_classes:
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+
+            output = scheduler.step_pred(
+                residual, time_step, sample, generator=torch.manual_seed(0), **kwargs
+            ).prev_sample
+            new_output = new_scheduler.step_pred(
+                residual, time_step, sample, generator=torch.manual_seed(0), **kwargs
+            ).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+            output = scheduler.step_correct(residual, sample, generator=torch.manual_seed(0), **kwargs).prev_sample
+            new_output = new_scheduler.step_correct(
+                residual, sample, generator=torch.manual_seed(0), **kwargs
+            ).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical"
+
+    def test_timesteps(self):
+        for timesteps in [10, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_sigmas(self):
+        for sigma_min, sigma_max in zip([0.0001, 0.001, 0.01], [1, 100, 1000]):
+            self.check_over_configs(sigma_min=sigma_min, sigma_max=sigma_max)
+
+    def test_time_indices(self):
+        for t in [0.1, 0.5, 0.75]:
+            self.check_over_forward(time_step=t)
+
+    def test_full_loop_no_noise(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 3
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+
+        scheduler.set_sigmas(num_inference_steps)
+        scheduler.set_timesteps(num_inference_steps)
+        generator = torch.manual_seed(0)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sigma_t = scheduler.sigmas[i]
+
+            for _ in range(scheduler.config.correct_steps):
+                with torch.no_grad():
+                    model_output = model(sample, sigma_t)
+                sample = scheduler.step_correct(model_output, sample, generator=generator, **kwargs).prev_sample
+
+            with torch.no_grad():
+                model_output = model(sample, sigma_t)
+
+            output = scheduler.step_pred(model_output, t, sample, generator=generator, **kwargs)
+            sample, _ = output.prev_sample, output.prev_sample_mean
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert np.isclose(result_sum.item(), 14372758528.0)
+        assert np.isclose(result_mean.item(), 18714530.0)
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output_0 = scheduler.step_pred(residual, 0, sample, generator=torch.manual_seed(0), **kwargs).prev_sample
+            output_1 = scheduler.step_pred(residual, 1, sample, generator=torch.manual_seed(0), **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
diff --git a/diffusers/tests/schedulers/test_scheduler_tcd.py b/diffusers/tests/schedulers/test_scheduler_tcd.py
new file mode 100644
index 0000000000000000000000000000000000000000..e95c536c7f0a279df379d872a335c1e6b30d6ccc
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_tcd.py
@@ -0,0 +1,180 @@
+import torch
+
+from diffusers import TCDScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class TCDSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (TCDScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 10),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.00085,
+            "beta_end": 0.0120,
+            "beta_schedule": "scaled_linear",
+            "prediction_type": "epsilon",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    @property
+    def default_num_inference_steps(self):
+        return 10
+
+    @property
+    def default_valid_timestep(self):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        scheduler_config = self.get_scheduler_config()
+        scheduler = self.scheduler_classes[0](**scheduler_config)
+
+        scheduler.set_timesteps(num_inference_steps)
+        timestep = scheduler.timesteps[-1]
+        return timestep
+
+    def test_timesteps(self):
+        for timesteps in [100, 500, 1000]:
+            # 0 is not guaranteed to be in the timestep schedule, but timesteps - 1 is
+            self.check_over_configs(time_step=timesteps - 1, num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(time_step=self.default_valid_timestep, beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(time_step=self.default_valid_timestep, beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(time_step=self.default_valid_timestep, prediction_type=prediction_type)
+
+    def test_clip_sample(self):
+        for clip_sample in [True, False]:
+            self.check_over_configs(time_step=self.default_valid_timestep, clip_sample=clip_sample)
+
+    def test_thresholding(self):
+        self.check_over_configs(time_step=self.default_valid_timestep, thresholding=False)
+        for threshold in [0.5, 1.0, 2.0]:
+            for prediction_type in ["epsilon", "v_prediction"]:
+                self.check_over_configs(
+                    time_step=self.default_valid_timestep,
+                    thresholding=True,
+                    prediction_type=prediction_type,
+                    sample_max_value=threshold,
+                )
+
+    def test_time_indices(self):
+        # Get default timestep schedule.
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        scheduler_config = self.get_scheduler_config()
+        scheduler = self.scheduler_classes[0](**scheduler_config)
+
+        scheduler.set_timesteps(num_inference_steps)
+        timesteps = scheduler.timesteps
+        for t in timesteps:
+            self.check_over_forward(time_step=t)
+
+    def test_inference_steps(self):
+        # Hardcoded for now
+        for t, num_inference_steps in zip([99, 39, 39, 19], [10, 25, 26, 50]):
+            self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps)
+
+    def full_loop(self, num_inference_steps=10, seed=0, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        eta = 0.0  # refer to gamma in the paper
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        generator = torch.manual_seed(seed)
+        scheduler.set_timesteps(num_inference_steps)
+
+        for t in scheduler.timesteps:
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample, eta, generator).prev_sample
+
+        return sample
+
+    def test_full_loop_onestep_deter(self):
+        sample = self.full_loop(num_inference_steps=1)
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 29.8715) < 1e-3  # 0.0778918
+        assert abs(result_mean.item() - 0.0389) < 1e-3
+
+    def test_full_loop_multistep_deter(self):
+        sample = self.full_loop(num_inference_steps=10)
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 181.2040) < 1e-3
+        assert abs(result_mean.item() - 0.2359) < 1e-3
+
+    def test_custom_timesteps(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 1, 0]
+
+        scheduler.set_timesteps(timesteps=timesteps)
+
+        scheduler_timesteps = scheduler.timesteps
+
+        for i, timestep in enumerate(scheduler_timesteps):
+            if i == len(timesteps) - 1:
+                expected_prev_t = -1
+            else:
+                expected_prev_t = timesteps[i + 1]
+
+            prev_t = scheduler.previous_timestep(timestep)
+            prev_t = prev_t.item()
+
+            self.assertEqual(prev_t, expected_prev_t)
+
+    def test_custom_timesteps_increasing_order(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 51, 0]
+
+        with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."):
+            scheduler.set_timesteps(timesteps=timesteps)
+
+    def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [100, 87, 50, 1, 0]
+        num_inference_steps = len(timesteps)
+
+        with self.assertRaises(ValueError, msg="Can only pass one of `num_inference_steps` or `custom_timesteps`."):
+            scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps)
+
+    def test_custom_timesteps_too_large(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = [scheduler.config.num_train_timesteps]
+
+        with self.assertRaises(
+            ValueError,
+            msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}",
+        ):
+            scheduler.set_timesteps(timesteps=timesteps)
diff --git a/diffusers/tests/schedulers/test_scheduler_unclip.py b/diffusers/tests/schedulers/test_scheduler_unclip.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0ce1312e79f6762bc7573c3a90e58cb33a21bad
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_unclip.py
@@ -0,0 +1,137 @@
+import torch
+
+from diffusers import UnCLIPScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+# UnCLIPScheduler is a modified DDPMScheduler with a subset of the configuration.
+class UnCLIPSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (UnCLIPScheduler,)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "variance_type": "fixed_small_log",
+            "clip_sample": True,
+            "clip_sample_range": 1.0,
+            "prediction_type": "epsilon",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [1, 5, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_variance_type(self):
+        for variance in ["fixed_small_log", "learned_range"]:
+            self.check_over_configs(variance_type=variance)
+
+    def test_clip_sample(self):
+        for clip_sample in [True, False]:
+            self.check_over_configs(clip_sample=clip_sample)
+
+    def test_clip_sample_range(self):
+        for clip_sample_range in [1, 5, 10, 20]:
+            self.check_over_configs(clip_sample_range=clip_sample_range)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "sample"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_time_indices(self):
+        for time_step in [0, 500, 999]:
+            for prev_timestep in [None, 5, 100, 250, 500, 750]:
+                if prev_timestep is not None and prev_timestep >= time_step:
+                    continue
+
+                self.check_over_forward(time_step=time_step, prev_timestep=prev_timestep)
+
+    def test_variance_fixed_small_log(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(variance_type="fixed_small_log")
+        scheduler = scheduler_class(**scheduler_config)
+
+        assert torch.sum(torch.abs(scheduler._get_variance(0) - 1.0000e-10)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(487) - 0.0549625)) < 1e-5
+        assert torch.sum(torch.abs(scheduler._get_variance(999) - 0.9994987)) < 1e-5
+
+    def test_variance_learned_range(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(variance_type="learned_range")
+        scheduler = scheduler_class(**scheduler_config)
+
+        predicted_variance = 0.5
+
+        assert scheduler._get_variance(1, predicted_variance=predicted_variance) - -10.1712790 < 1e-5
+        assert scheduler._get_variance(487, predicted_variance=predicted_variance) - -5.7998052 < 1e-5
+        assert scheduler._get_variance(999, predicted_variance=predicted_variance) - -0.0010011 < 1e-5
+
+    def test_full_loop(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        timesteps = scheduler.timesteps
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        generator = torch.manual_seed(0)
+
+        for i, t in enumerate(timesteps):
+            # 1. predict noise residual
+            residual = model(sample, t)
+
+            # 2. predict previous mean of sample x_t-1
+            pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
+
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 252.2682495) < 1e-2
+        assert abs(result_mean.item() - 0.3284743) < 1e-3
+
+    def test_full_loop_skip_timesteps(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(25)
+
+        timesteps = scheduler.timesteps
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        generator = torch.manual_seed(0)
+
+        for i, t in enumerate(timesteps):
+            # 1. predict noise residual
+            residual = model(sample, t)
+
+            if i + 1 == timesteps.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = timesteps[i + 1]
+
+            # 2. predict previous mean of sample x_t-1
+            pred_prev_sample = scheduler.step(
+                residual, t, sample, prev_timestep=prev_timestep, generator=generator
+            ).prev_sample
+
+            sample = pred_prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 258.2044983) < 1e-2
+        assert abs(result_mean.item() - 0.3362038) < 1e-3
+
+    def test_trained_betas(self):
+        pass
+
+    def test_add_noise_device(self):
+        pass
diff --git a/diffusers/tests/schedulers/test_scheduler_unipc.py b/diffusers/tests/schedulers/test_scheduler_unipc.py
new file mode 100644
index 0000000000000000000000000000000000000000..5eb4d5ceef01ad30ce575a197f48ee190f3d0f86
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_unipc.py
@@ -0,0 +1,395 @@
+import tempfile
+
+import torch
+
+from diffusers import (
+    DEISMultistepScheduler,
+    DPMSolverMultistepScheduler,
+    DPMSolverSinglestepScheduler,
+    UniPCMultistepScheduler,
+)
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class UniPCMultistepSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (UniPCMultistepScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 25),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "solver_order": 2,
+            "solver_type": "bh2",
+            "final_sigmas_type": "sigma_min",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+            # copy over dummy past residuals
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                new_scheduler.set_timesteps(num_inference_steps)
+                # copy over dummy past residuals
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output, new_output = sample, sample
+            for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+                t = scheduler.timesteps[t]
+                output = scheduler.step(residual, t, output, **kwargs).prev_sample
+                new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
+
+                assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+                # copy over dummy past residuals
+                new_scheduler.set_timesteps(num_inference_steps)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, scheduler=None, **config):
+        if scheduler is None:
+            scheduler_class = self.scheduler_classes[0]
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            time_step_0 = scheduler.timesteps[5]
+            time_step_1 = scheduler.timesteps[6]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_switch(self):
+        # make sure that iterating over schedulers with same config names gives same results
+        # for defaults
+        scheduler = UniPCMultistepScheduler(**self.get_scheduler_config())
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2464) < 1e-3
+
+        scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config)
+        scheduler = DEISMultistepScheduler.from_config(scheduler.config)
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+        scheduler = UniPCMultistepScheduler.from_config(scheduler.config)
+
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2464) < 1e-3
+
+    def test_timesteps(self):
+        for timesteps in [25, 50, 100, 999, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for order in [1, 2, 3]:
+            for solver_type in ["bh1", "bh2"]:
+                for threshold in [0.5, 1.0, 2.0]:
+                    for prediction_type in ["epsilon", "sample"]:
+                        self.check_over_configs(
+                            thresholding=True,
+                            prediction_type=prediction_type,
+                            sample_max_value=threshold,
+                            solver_order=order,
+                            solver_type=solver_type,
+                        )
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_rescale_betas_zero_snr(self):
+        for rescale_betas_zero_snr in [True, False]:
+            self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr)
+
+    def test_solver_order_and_type(self):
+        for solver_type in ["bh1", "bh2"]:
+            for order in [1, 2, 3]:
+                for prediction_type in ["epsilon", "sample"]:
+                    self.check_over_configs(
+                        solver_order=order,
+                        solver_type=solver_type,
+                        prediction_type=prediction_type,
+                    )
+                    sample = self.full_loop(
+                        solver_order=order,
+                        solver_type=solver_type,
+                        prediction_type=prediction_type,
+                    )
+                    assert not torch.isnan(sample).any(), "Samples have nan numbers"
+
+    def test_lower_order_final(self):
+        self.check_over_configs(lower_order_final=True)
+        self.check_over_configs(lower_order_final=False)
+
+    def test_inference_steps(self):
+        for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
+            self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2464) < 1e-3
+
+    def test_full_loop_with_karras(self):
+        sample = self.full_loop(use_karras_sigmas=True)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2925) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.1014) < 1e-3
+
+    def test_full_loop_with_karras_and_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.1966) < 1e-3
+
+    def test_fp16_support(self):
+        for order in [1, 2, 3]:
+            for solver_type in ["bh1", "bh2"]:
+                for prediction_type in ["epsilon", "sample", "v_prediction"]:
+                    scheduler_class = self.scheduler_classes[0]
+                    scheduler_config = self.get_scheduler_config(
+                        thresholding=True,
+                        dynamic_thresholding_ratio=0,
+                        prediction_type=prediction_type,
+                        solver_order=order,
+                        solver_type=solver_type,
+                    )
+                    scheduler = scheduler_class(**scheduler_config)
+
+                    num_inference_steps = 10
+                    model = self.dummy_model()
+                    sample = self.dummy_sample_deter.half()
+                    scheduler.set_timesteps(num_inference_steps)
+
+                    for i, t in enumerate(scheduler.timesteps):
+                        residual = model(sample, t)
+                        sample = scheduler.step(residual, t, sample).prev_sample
+
+                    assert sample.dtype == torch.float16
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        t_start = 8
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        # add noise
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 315.5757) < 1e-2, f" expected result sum 315.5757, but get {result_sum}"
+        assert abs(result_mean.item() - 0.4109) < 1e-3, f" expected result mean 0.4109, but get {result_mean}"
+
+
+class UniPCMultistepScheduler1DTest(UniPCMultistepSchedulerTest):
+    @property
+    def dummy_sample(self):
+        batch_size = 4
+        num_channels = 3
+        width = 8
+
+        sample = torch.rand((batch_size, num_channels, width))
+
+        return sample
+
+    @property
+    def dummy_noise_deter(self):
+        batch_size = 4
+        num_channels = 3
+        width = 8
+
+        num_elems = batch_size * num_channels * width
+        sample = torch.arange(num_elems).flip(-1)
+        sample = sample.reshape(num_channels, width, batch_size)
+        sample = sample / num_elems
+        sample = sample.permute(2, 0, 1)
+
+        return sample
+
+    @property
+    def dummy_sample_deter(self):
+        batch_size = 4
+        num_channels = 3
+        width = 8
+
+        num_elems = batch_size * num_channels * width
+        sample = torch.arange(num_elems)
+        sample = sample.reshape(num_channels, width, batch_size)
+        sample = sample / num_elems
+        sample = sample.permute(2, 0, 1)
+
+        return sample
+
+    def test_switch(self):
+        # make sure that iterating over schedulers with same config names gives same results
+        # for defaults
+        scheduler = UniPCMultistepScheduler(**self.get_scheduler_config())
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2441) < 1e-3
+
+        scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config)
+        scheduler = DEISMultistepScheduler.from_config(scheduler.config)
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+        scheduler = UniPCMultistepScheduler.from_config(scheduler.config)
+
+        sample = self.full_loop(scheduler=scheduler)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2441) < 1e-3
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2441) < 1e-3
+
+    def test_full_loop_with_karras(self):
+        sample = self.full_loop(use_karras_sigmas=True)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2898) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction")
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.1014) < 1e-3
+
+    def test_full_loop_with_karras_and_v_prediction(self):
+        sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True)
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.1944) < 1e-3
+
+    def test_full_loop_with_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        t_start = 8
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        # add noise
+        noise = self.dummy_noise_deter
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        sample = scheduler.add_noise(sample, noise, timesteps[:1])
+
+        for i, t in enumerate(timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 39.0870) < 1e-2, f" expected result sum 39.0870, but get {result_sum}"
+        assert abs(result_mean.item() - 0.4072) < 1e-3, f" expected result mean 0.4072, but get {result_mean}"
diff --git a/diffusers/tests/schedulers/test_scheduler_vq_diffusion.py b/diffusers/tests/schedulers/test_scheduler_vq_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..74437ad4548074a488917d3ea9b5eef4f0ac1532
--- /dev/null
+++ b/diffusers/tests/schedulers/test_scheduler_vq_diffusion.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn.functional as F
+
+from diffusers import VQDiffusionScheduler
+
+from .test_schedulers import SchedulerCommonTest
+
+
+class VQDiffusionSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (VQDiffusionScheduler,)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_vec_classes": 4097,
+            "num_train_timesteps": 100,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def dummy_sample(self, num_vec_classes):
+        batch_size = 4
+        height = 8
+        width = 8
+
+        sample = torch.randint(0, num_vec_classes, (batch_size, height * width))
+
+        return sample
+
+    @property
+    def dummy_sample_deter(self):
+        assert False
+
+    def dummy_model(self, num_vec_classes):
+        def model(sample, t, *args):
+            batch_size, num_latent_pixels = sample.shape
+            logits = torch.rand((batch_size, num_vec_classes - 1, num_latent_pixels))
+            return_value = F.log_softmax(logits.double(), dim=1).float()
+            return return_value
+
+        return model
+
+    def test_timesteps(self):
+        for timesteps in [2, 5, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_num_vec_classes(self):
+        for num_vec_classes in [5, 100, 1000, 4000]:
+            self.check_over_configs(num_vec_classes=num_vec_classes)
+
+    def test_time_indices(self):
+        for t in [0, 50, 99]:
+            self.check_over_forward(time_step=t)
+
+    def test_add_noise_device(self):
+        pass
diff --git a/diffusers/tests/schedulers/test_schedulers.py b/diffusers/tests/schedulers/test_schedulers.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc7f22d2a8e58d5440c8ff04206de32520eaffeb
--- /dev/null
+++ b/diffusers/tests/schedulers/test_schedulers.py
@@ -0,0 +1,868 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import json
+import os
+import tempfile
+import unittest
+import uuid
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+from huggingface_hub import delete_repo
+
+import diffusers
+from diffusers import (
+    CMStochasticIterativeScheduler,
+    DDIMScheduler,
+    DEISMultistepScheduler,
+    DiffusionPipeline,
+    EDMEulerScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    IPNDMScheduler,
+    LMSDiscreteScheduler,
+    UniPCMultistepScheduler,
+    VQDiffusionScheduler,
+)
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+from diffusers.utils import logging
+from diffusers.utils.testing_utils import CaptureLogger, torch_device
+
+from ..others.test_utils import TOKEN, USER, is_staging_test
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class SchedulerObject(SchedulerMixin, ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        e=[1, 3],
+    ):
+        pass
+
+
+class SchedulerObject2(SchedulerMixin, ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        f=[1, 3],
+    ):
+        pass
+
+
+class SchedulerObject3(SchedulerMixin, ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        e=[1, 3],
+        f=[1, 3],
+    ):
+        pass
+
+
+class SchedulerBaseTests(unittest.TestCase):
+    def test_save_load_from_different_config(self):
+        obj = SchedulerObject()
+
+        # mock add obj class to `diffusers`
+        setattr(diffusers, "SchedulerObject", SchedulerObject)
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            obj.save_config(tmpdirname)
+            with CaptureLogger(logger) as cap_logger_1:
+                config = SchedulerObject2.load_config(tmpdirname)
+                new_obj_1 = SchedulerObject2.from_config(config)
+
+            # now save a config parameter that is not expected
+            with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f:
+                data = json.load(f)
+                data["unexpected"] = True
+
+            with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f:
+                json.dump(data, f)
+
+            with CaptureLogger(logger) as cap_logger_2:
+                config = SchedulerObject.load_config(tmpdirname)
+                new_obj_2 = SchedulerObject.from_config(config)
+
+            with CaptureLogger(logger) as cap_logger_3:
+                config = SchedulerObject2.load_config(tmpdirname)
+                new_obj_3 = SchedulerObject2.from_config(config)
+
+        assert new_obj_1.__class__ == SchedulerObject2
+        assert new_obj_2.__class__ == SchedulerObject
+        assert new_obj_3.__class__ == SchedulerObject2
+
+        assert cap_logger_1.out == ""
+        assert (
+            cap_logger_2.out
+            == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and"
+            " will"
+            " be ignored. Please verify your config.json configuration file.\n"
+        )
+        assert cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2") == cap_logger_3.out
+
+    def test_save_load_compatible_schedulers(self):
+        SchedulerObject2._compatibles = ["SchedulerObject"]
+        SchedulerObject._compatibles = ["SchedulerObject2"]
+
+        obj = SchedulerObject()
+
+        # mock add obj class to `diffusers`
+        setattr(diffusers, "SchedulerObject", SchedulerObject)
+        setattr(diffusers, "SchedulerObject2", SchedulerObject2)
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            obj.save_config(tmpdirname)
+
+            # now save a config parameter that is expected by another class, but not origin class
+            with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f:
+                data = json.load(f)
+                data["f"] = [0, 0]
+                data["unexpected"] = True
+
+            with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f:
+                json.dump(data, f)
+
+            with CaptureLogger(logger) as cap_logger:
+                config = SchedulerObject.load_config(tmpdirname)
+                new_obj = SchedulerObject.from_config(config)
+
+        assert new_obj.__class__ == SchedulerObject
+
+        assert (
+            cap_logger.out
+            == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and"
+            " will"
+            " be ignored. Please verify your config.json configuration file.\n"
+        )
+
+    def test_save_load_from_different_config_comp_schedulers(self):
+        SchedulerObject3._compatibles = ["SchedulerObject", "SchedulerObject2"]
+        SchedulerObject2._compatibles = ["SchedulerObject", "SchedulerObject3"]
+        SchedulerObject._compatibles = ["SchedulerObject2", "SchedulerObject3"]
+
+        obj = SchedulerObject()
+
+        # mock add obj class to `diffusers`
+        setattr(diffusers, "SchedulerObject", SchedulerObject)
+        setattr(diffusers, "SchedulerObject2", SchedulerObject2)
+        setattr(diffusers, "SchedulerObject3", SchedulerObject3)
+        logger = logging.get_logger("diffusers.configuration_utils")
+        logger.setLevel(diffusers.logging.INFO)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            obj.save_config(tmpdirname)
+
+            with CaptureLogger(logger) as cap_logger_1:
+                config = SchedulerObject.load_config(tmpdirname)
+                new_obj_1 = SchedulerObject.from_config(config)
+
+            with CaptureLogger(logger) as cap_logger_2:
+                config = SchedulerObject2.load_config(tmpdirname)
+                new_obj_2 = SchedulerObject2.from_config(config)
+
+            with CaptureLogger(logger) as cap_logger_3:
+                config = SchedulerObject3.load_config(tmpdirname)
+                new_obj_3 = SchedulerObject3.from_config(config)
+
+        assert new_obj_1.__class__ == SchedulerObject
+        assert new_obj_2.__class__ == SchedulerObject2
+        assert new_obj_3.__class__ == SchedulerObject3
+
+        assert cap_logger_1.out == ""
+        assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n"
+        assert cap_logger_3.out == "{'f'} was not found in config. Values will be initialized to default values.\n"
+
+    def test_default_arguments_not_in_config(self):
+        pipe = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-pipe", torch_dtype=torch.float16
+        )
+        assert pipe.scheduler.__class__ == DDIMScheduler
+
+        # Default for DDIMScheduler
+        assert pipe.scheduler.config.timestep_spacing == "leading"
+
+        # Switch to a different one, verify we use the default for that class
+        pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+        assert pipe.scheduler.config.timestep_spacing == "linspace"
+
+        # Override with kwargs
+        pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
+        assert pipe.scheduler.config.timestep_spacing == "trailing"
+
+        # Verify overridden kwargs stick
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        assert pipe.scheduler.config.timestep_spacing == "trailing"
+
+        # And stick
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        assert pipe.scheduler.config.timestep_spacing == "trailing"
+
+    def test_default_solver_type_after_switch(self):
+        pipe = DiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-pipe", torch_dtype=torch.float16
+        )
+        assert pipe.scheduler.__class__ == DDIMScheduler
+
+        pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config)
+        assert pipe.scheduler.config.solver_type == "logrho"
+
+        # Switch to UniPC, verify the solver is the default
+        pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        assert pipe.scheduler.config.solver_type == "bh2"
+
+
+class SchedulerCommonTest(unittest.TestCase):
+    scheduler_classes = ()
+    forward_default_kwargs = ()
+
+    @property
+    def default_num_inference_steps(self):
+        return 50
+
+    @property
+    def default_timestep(self):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.get("num_inference_steps", self.default_num_inference_steps)
+
+        try:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = self.scheduler_classes[0](**scheduler_config)
+
+            scheduler.set_timesteps(num_inference_steps)
+            timestep = scheduler.timesteps[0]
+        except NotImplementedError:
+            logger.warning(
+                f"The scheduler {self.__class__.__name__} does not implement a `get_scheduler_config` method."
+                f" `default_timestep` will be set to the default value of 1."
+            )
+            timestep = 1
+
+        return timestep
+
+    # NOTE: currently taking the convention that default_timestep > default_timestep_2 (alternatively,
+    # default_timestep comes earlier in the timestep schedule than default_timestep_2)
+    @property
+    def default_timestep_2(self):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.get("num_inference_steps", self.default_num_inference_steps)
+
+        try:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = self.scheduler_classes[0](**scheduler_config)
+
+            scheduler.set_timesteps(num_inference_steps)
+            if len(scheduler.timesteps) >= 2:
+                timestep_2 = scheduler.timesteps[1]
+            else:
+                logger.warning(
+                    f"Using num_inference_steps from the scheduler testing class's default config leads to a timestep"
+                    f" scheduler of length {len(scheduler.timesteps)} < 2. The default `default_timestep_2` value of 0"
+                    f" will be used."
+                )
+                timestep_2 = 0
+        except NotImplementedError:
+            logger.warning(
+                f"The scheduler {self.__class__.__name__} does not implement a `get_scheduler_config` method."
+                f" `default_timestep_2` will be set to the default value of 0."
+            )
+            timestep_2 = 0
+
+        return timestep_2
+
+    @property
+    def dummy_sample(self):
+        batch_size = 4
+        num_channels = 3
+        height = 8
+        width = 8
+
+        sample = torch.rand((batch_size, num_channels, height, width))
+
+        return sample
+
+    @property
+    def dummy_noise_deter(self):
+        batch_size = 4
+        num_channels = 3
+        height = 8
+        width = 8
+
+        num_elems = batch_size * num_channels * height * width
+        sample = torch.arange(num_elems).flip(-1)
+        sample = sample.reshape(num_channels, height, width, batch_size)
+        sample = sample / num_elems
+        sample = sample.permute(3, 0, 1, 2)
+
+        return sample
+
+    @property
+    def dummy_sample_deter(self):
+        batch_size = 4
+        num_channels = 3
+        height = 8
+        width = 8
+
+        num_elems = batch_size * num_channels * height * width
+        sample = torch.arange(num_elems)
+        sample = sample.reshape(num_channels, height, width, batch_size)
+        sample = sample / num_elems
+        sample = sample.permute(3, 0, 1, 2)
+
+        return sample
+
+    def get_scheduler_config(self):
+        raise NotImplementedError
+
+    def dummy_model(self):
+        def model(sample, t, *args):
+            # if t is a tensor, match the number of dimensions of sample
+            if isinstance(t, torch.Tensor):
+                num_dims = len(sample.shape)
+                # pad t with 1s to match num_dims
+                t = t.reshape(-1, *(1,) * (num_dims - 1)).to(sample.device).to(sample.dtype)
+
+            return sample * t / (t + 1)
+
+        return model
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        time_step = time_step if time_step is not None else self.default_timestep
+
+        for scheduler_class in self.scheduler_classes:
+            # TODO(Suraj) - delete the following two lines once DDPM, DDIM, and PNDM have timesteps casted to float by default
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                time_step = float(time_step)
+
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+
+            if scheduler_class == CMStochasticIterativeScheduler:
+                # Get valid timestep based on sigma_max, which should always be in timestep schedule.
+                scaled_sigma_max = scheduler.sigma_to_t(scheduler.config.sigma_max)
+                time_step = scaled_sigma_max
+
+            if scheduler_class == EDMEulerScheduler:
+                time_step = scheduler.timesteps[-1]
+
+            if scheduler_class == VQDiffusionScheduler:
+                num_vec_classes = scheduler_config["num_vec_classes"]
+                sample = self.dummy_sample(num_vec_classes)
+                model = self.dummy_model(num_vec_classes)
+                residual = model(sample, time_step)
+            else:
+                sample = self.dummy_sample
+                residual = 0.1 * sample
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+                new_scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # Make sure `scale_model_input` is invoked to prevent a warning
+            if scheduler_class == CMStochasticIterativeScheduler:
+                # Get valid timestep based on sigma_max, which should always be in timestep schedule.
+                _ = scheduler.scale_model_input(sample, scaled_sigma_max)
+                _ = new_scheduler.scale_model_input(sample, scaled_sigma_max)
+            elif scheduler_class != VQDiffusionScheduler:
+                _ = scheduler.scale_model_input(sample, scheduler.timesteps[-1])
+                _ = new_scheduler.scale_model_input(sample, scheduler.timesteps[-1])
+
+            # Set the seed before step() as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        kwargs.update(forward_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        time_step = time_step if time_step is not None else self.default_timestep
+
+        for scheduler_class in self.scheduler_classes:
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                time_step = float(time_step)
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            if scheduler_class == VQDiffusionScheduler:
+                num_vec_classes = scheduler_config["num_vec_classes"]
+                sample = self.dummy_sample(num_vec_classes)
+                model = self.dummy_model(num_vec_classes)
+                residual = model(sample, time_step)
+            else:
+                sample = self.dummy_sample
+                residual = 0.1 * sample
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+                new_scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_save_pretrained(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", self.default_num_inference_steps)
+
+        for scheduler_class in self.scheduler_classes:
+            timestep = self.default_timestep
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                timestep = float(timestep)
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            if scheduler_class == CMStochasticIterativeScheduler:
+                # Get valid timestep based on sigma_max, which should always be in timestep schedule.
+                timestep = scheduler.sigma_to_t(scheduler.config.sigma_max)
+
+            if scheduler_class == VQDiffusionScheduler:
+                num_vec_classes = scheduler_config["num_vec_classes"]
+                sample = self.dummy_sample(num_vec_classes)
+                model = self.dummy_model(num_vec_classes)
+                residual = model(sample, timestep)
+            else:
+                sample = self.dummy_sample
+                residual = 0.1 * sample
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+                new_scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample
+
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            new_output = new_scheduler.step(residual, timestep, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_compatibles(self):
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+
+            scheduler = scheduler_class(**scheduler_config)
+
+            assert all(c is not None for c in scheduler.compatibles)
+
+            for comp_scheduler_cls in scheduler.compatibles:
+                comp_scheduler = comp_scheduler_cls.from_config(scheduler.config)
+                assert comp_scheduler is not None
+
+            new_scheduler = scheduler_class.from_config(comp_scheduler.config)
+
+            new_scheduler_config = {k: v for k, v in new_scheduler.config.items() if k in scheduler.config}
+            scheduler_diff = {k: v for k, v in new_scheduler.config.items() if k not in scheduler.config}
+
+            # make sure that configs are essentially identical
+            assert new_scheduler_config == dict(scheduler.config)
+
+            # make sure that only differences are for configs that are not in init
+            init_keys = inspect.signature(scheduler_class.__init__).parameters.keys()
+            assert set(scheduler_diff.keys()).intersection(set(init_keys)) == set()
+
+    def test_from_pretrained(self):
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+
+            scheduler = scheduler_class(**scheduler_config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_pretrained(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+
+            # `_use_default_values` should not exist for just saved & loaded scheduler
+            scheduler_config = dict(scheduler.config)
+            del scheduler_config["_use_default_values"]
+
+            assert scheduler_config == new_scheduler.config
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", self.default_num_inference_steps)
+
+        timestep_0 = self.default_timestep
+        timestep_1 = self.default_timestep_2
+
+        for scheduler_class in self.scheduler_classes:
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                timestep_0 = float(timestep_0)
+                timestep_1 = float(timestep_1)
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            if scheduler_class == VQDiffusionScheduler:
+                num_vec_classes = scheduler_config["num_vec_classes"]
+                sample = self.dummy_sample(num_vec_classes)
+                model = self.dummy_model(num_vec_classes)
+                residual = model(sample, timestep_0)
+            else:
+                sample = self.dummy_sample
+                residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output_0 = scheduler.step(residual, timestep_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, timestep_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_scheduler_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            t[t != t] = 0
+            return t
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    torch.allclose(
+                        set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
+                    ),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
+                    ),
+                )
+
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", self.default_num_inference_steps)
+
+        timestep = self.default_timestep
+        if len(self.scheduler_classes) > 0 and self.scheduler_classes[0] == IPNDMScheduler:
+            timestep = 1
+
+        for scheduler_class in self.scheduler_classes:
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                timestep = float(timestep)
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            if scheduler_class == CMStochasticIterativeScheduler:
+                # Get valid timestep based on sigma_max, which should always be in timestep schedule.
+                timestep = scheduler.sigma_to_t(scheduler.config.sigma_max)
+
+            if scheduler_class == VQDiffusionScheduler:
+                num_vec_classes = scheduler_config["num_vec_classes"]
+                sample = self.dummy_sample(num_vec_classes)
+                model = self.dummy_model(num_vec_classes)
+                residual = model(sample, timestep)
+            else:
+                sample = self.dummy_sample
+                residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            outputs_dict = scheduler.step(residual, timestep, sample, **kwargs)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.manual_seed(0)
+            outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs)
+
+            recursive_check(outputs_tuple, outputs_dict)
+
+    def test_scheduler_public_api(self):
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            if scheduler_class != VQDiffusionScheduler:
+                self.assertTrue(
+                    hasattr(scheduler, "init_noise_sigma"),
+                    f"{scheduler_class} does not implement a required attribute `init_noise_sigma`",
+                )
+                self.assertTrue(
+                    hasattr(scheduler, "scale_model_input"),
+                    (
+                        f"{scheduler_class} does not implement a required class method `scale_model_input(sample,"
+                        " timestep)`"
+                    ),
+                )
+            self.assertTrue(
+                hasattr(scheduler, "step"),
+                f"{scheduler_class} does not implement a required class method `step(...)`",
+            )
+
+            if scheduler_class != VQDiffusionScheduler:
+                sample = self.dummy_sample
+                if scheduler_class == CMStochasticIterativeScheduler:
+                    # Get valid timestep based on sigma_max, which should always be in timestep schedule.
+                    scaled_sigma_max = scheduler.sigma_to_t(scheduler.config.sigma_max)
+                    scaled_sample = scheduler.scale_model_input(sample, scaled_sigma_max)
+                elif scheduler_class == EDMEulerScheduler:
+                    scaled_sample = scheduler.scale_model_input(sample, scheduler.timesteps[-1])
+                else:
+                    scaled_sample = scheduler.scale_model_input(sample, 0.0)
+                self.assertEqual(sample.shape, scaled_sample.shape)
+
+    def test_add_noise_device(self):
+        for scheduler_class in self.scheduler_classes:
+            if scheduler_class == IPNDMScheduler:
+                continue
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(self.default_num_inference_steps)
+
+            sample = self.dummy_sample.to(torch_device)
+            if scheduler_class == CMStochasticIterativeScheduler:
+                # Get valid timestep based on sigma_max, which should always be in timestep schedule.
+                scaled_sigma_max = scheduler.sigma_to_t(scheduler.config.sigma_max)
+                scaled_sample = scheduler.scale_model_input(sample, scaled_sigma_max)
+            elif scheduler_class == EDMEulerScheduler:
+                scaled_sample = scheduler.scale_model_input(sample, scheduler.timesteps[-1])
+            else:
+                scaled_sample = scheduler.scale_model_input(sample, 0.0)
+            self.assertEqual(sample.shape, scaled_sample.shape)
+
+            noise = torch.randn_like(scaled_sample).to(torch_device)
+            t = scheduler.timesteps[5][None]
+            noised = scheduler.add_noise(scaled_sample, noise, t)
+            self.assertEqual(noised.shape, scaled_sample.shape)
+
+    def test_deprecated_kwargs(self):
+        for scheduler_class in self.scheduler_classes:
+            has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters
+            has_deprecated_kwarg = len(scheduler_class._deprecated_kwargs) > 0
+
+            if has_kwarg_in_model_class and not has_deprecated_kwarg:
+                raise ValueError(
+                    f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated"
+                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if"
+                    " there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
+                    " [<deprecated_argument>]`"
+                )
+
+            if not has_kwarg_in_model_class and has_deprecated_kwarg:
+                raise ValueError(
+                    f"{scheduler_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated"
+                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs`"
+                    f" argument to {self.model_class}.__init__ if there are deprecated arguments or remove the"
+                    " deprecated argument from `_deprecated_kwargs = [<deprecated_argument>]`"
+                )
+
+    def test_trained_betas(self):
+        for scheduler_class in self.scheduler_classes:
+            if scheduler_class in (VQDiffusionScheduler, CMStochasticIterativeScheduler):
+                continue
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config, trained_betas=np.array([0.1, 0.3]))
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_pretrained(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+
+            assert scheduler.betas.tolist() == new_scheduler.betas.tolist()
+
+    def test_getattr_is_correct(self):
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            # save some things to test
+            scheduler.dummy_attribute = 5
+            scheduler.register_to_config(test_attribute=5)
+
+            logger = logging.get_logger("diffusers.configuration_utils")
+            # 30 for warning
+            logger.setLevel(30)
+            with CaptureLogger(logger) as cap_logger:
+                assert hasattr(scheduler, "dummy_attribute")
+                assert getattr(scheduler, "dummy_attribute") == 5
+                assert scheduler.dummy_attribute == 5
+
+            # no warning should be thrown
+            assert cap_logger.out == ""
+
+            logger = logging.get_logger("diffusers.schedulers.scheduling_utils")
+            # 30 for warning
+            logger.setLevel(30)
+            with CaptureLogger(logger) as cap_logger:
+                assert hasattr(scheduler, "save_pretrained")
+                fn = scheduler.save_pretrained
+                fn_1 = getattr(scheduler, "save_pretrained")
+
+                assert fn == fn_1
+            # no warning should be thrown
+            assert cap_logger.out == ""
+
+            # warning should be thrown
+            with self.assertWarns(FutureWarning):
+                assert scheduler.test_attribute == 5
+
+            with self.assertWarns(FutureWarning):
+                assert getattr(scheduler, "test_attribute") == 5
+
+            with self.assertRaises(AttributeError) as error:
+                scheduler.does_not_exist
+
+            assert str(error.exception) == f"'{type(scheduler).__name__}' object has no attribute 'does_not_exist'"
+
+
+@is_staging_test
+class SchedulerPushToHubTester(unittest.TestCase):
+    identifier = uuid.uuid4()
+    repo_id = f"test-scheduler-{identifier}"
+    org_repo_id = f"valid_org/{repo_id}-org"
+
+    def test_push_to_hub(self):
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        scheduler.push_to_hub(self.repo_id, token=TOKEN)
+        scheduler_loaded = DDIMScheduler.from_pretrained(f"{USER}/{self.repo_id}")
+
+        assert type(scheduler) == type(scheduler_loaded)
+
+        # Reset repo
+        delete_repo(token=TOKEN, repo_id=self.repo_id)
+
+        # Push to hub via save_config
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            scheduler.save_config(tmp_dir, repo_id=self.repo_id, push_to_hub=True, token=TOKEN)
+
+        scheduler_loaded = DDIMScheduler.from_pretrained(f"{USER}/{self.repo_id}")
+
+        assert type(scheduler) == type(scheduler_loaded)
+
+        # Reset repo
+        delete_repo(token=TOKEN, repo_id=self.repo_id)
+
+    def test_push_to_hub_in_organization(self):
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        scheduler.push_to_hub(self.org_repo_id, token=TOKEN)
+        scheduler_loaded = DDIMScheduler.from_pretrained(self.org_repo_id)
+
+        assert type(scheduler) == type(scheduler_loaded)
+
+        # Reset repo
+        delete_repo(token=TOKEN, repo_id=self.org_repo_id)
+
+        # Push to hub via save_config
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            scheduler.save_config(tmp_dir, repo_id=self.org_repo_id, push_to_hub=True, token=TOKEN)
+
+        scheduler_loaded = DDIMScheduler.from_pretrained(self.org_repo_id)
+
+        assert type(scheduler) == type(scheduler_loaded)
+
+        # Reset repo
+        delete_repo(token=TOKEN, repo_id=self.org_repo_id)
diff --git a/diffusers/tests/single_file/__init__.py b/diffusers/tests/single_file/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusers/tests/single_file/single_file_testing_utils.py b/diffusers/tests/single_file/single_file_testing_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b89578c5a8c44adfc901c01cb10a9eaf343098d
--- /dev/null
+++ b/diffusers/tests/single_file/single_file_testing_utils.py
@@ -0,0 +1,409 @@
+import tempfile
+from io import BytesIO
+
+import requests
+import torch
+from huggingface_hub import hf_hub_download, snapshot_download
+
+from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name
+from diffusers.models.attention_processor import AttnProcessor
+from diffusers.utils.testing_utils import (
+    numpy_cosine_similarity_distance,
+    torch_device,
+)
+
+
+def download_single_file_checkpoint(repo_id, filename, tmpdir):
+    path = hf_hub_download(repo_id, filename=filename, local_dir=tmpdir)
+    return path
+
+
+def download_original_config(config_url, tmpdir):
+    original_config_file = BytesIO(requests.get(config_url).content)
+    path = f"{tmpdir}/config.yaml"
+    with open(path, "wb") as f:
+        f.write(original_config_file.read())
+
+    return path
+
+
+def download_diffusers_config(repo_id, tmpdir):
+    path = snapshot_download(
+        repo_id,
+        ignore_patterns=[
+            "**/*.ckpt",
+            "*.ckpt",
+            "**/*.bin",
+            "*.bin",
+            "**/*.pt",
+            "*.pt",
+            "**/*.safetensors",
+            "*.safetensors",
+        ],
+        allow_patterns=["**/*.json", "*.json", "*.txt", "**/*.txt"],
+        local_dir=tmpdir,
+    )
+    return path
+
+
+class SDSingleFileTesterMixin:
+    def _compare_component_configs(self, pipe, single_file_pipe):
+        for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items():
+            if param_name in ["torch_dtype", "architectures", "_name_or_path"]:
+                continue
+            assert pipe.text_encoder.config.to_dict()[param_name] == param_value
+
+        PARAMS_TO_IGNORE = [
+            "torch_dtype",
+            "_name_or_path",
+            "architectures",
+            "_use_default_values",
+            "_diffusers_version",
+        ]
+        for component_name, component in single_file_pipe.components.items():
+            if component_name in single_file_pipe._optional_components:
+                continue
+
+            # skip testing transformer based components here
+            # skip text encoders / safety checkers since they have already been tested
+            if component_name in ["text_encoder", "tokenizer", "safety_checker", "feature_extractor"]:
+                continue
+
+            assert component_name in pipe.components, f"single file {component_name} not found in pretrained pipeline"
+            assert isinstance(
+                component, pipe.components[component_name].__class__
+            ), f"single file {component.__class__.__name__} and pretrained {pipe.components[component_name].__class__.__name__} are not the same"
+
+            for param_name, param_value in component.config.items():
+                if param_name in PARAMS_TO_IGNORE:
+                    continue
+
+                # Some pretrained configs will set upcast attention to None
+                # In single file loading it defaults to the value in the class __init__ which is False
+                if param_name == "upcast_attention" and pipe.components[component_name].config[param_name] is None:
+                    pipe.components[component_name].config[param_name] = param_value
+
+                assert (
+                    pipe.components[component_name].config[param_name] == param_value
+                ), f"single file {param_name}: {param_value} differs from pretrained {pipe.components[component_name].config[param_name]}"
+
+    def test_single_file_components(self, pipe=None, single_file_pipe=None):
+        single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+            self.ckpt_path, safety_checker=None
+        )
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_components_local_files_only(self, pipe=None, single_file_pipe=None):
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+
+            single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+                local_ckpt_path, safety_checker=None, local_files_only=True
+            )
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_components_with_original_config(
+        self,
+        pipe=None,
+        single_file_pipe=None,
+    ):
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+        # Not possible to infer this value when original config is provided
+        # we just pass it in here otherwise this test will fail
+        upcast_attention = pipe.unet.config.upcast_attention
+
+        single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+            self.ckpt_path,
+            original_config=self.original_config,
+            safety_checker=None,
+            upcast_attention=upcast_attention,
+        )
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_components_with_original_config_local_files_only(
+        self,
+        pipe=None,
+        single_file_pipe=None,
+    ):
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+
+        # Not possible to infer this value when original config is provided
+        # we just pass it in here otherwise this test will fail
+        upcast_attention = pipe.unet.config.upcast_attention
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_original_config = download_original_config(self.original_config, tmpdir)
+
+            single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                original_config=local_original_config,
+                safety_checker=None,
+                upcast_attention=upcast_attention,
+                local_files_only=True,
+            )
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_format_inference_is_same_as_pretrained(self, expected_max_diff=1e-4):
+        sf_pipe = self.pipeline_class.from_single_file(self.ckpt_path, safety_checker=None)
+        sf_pipe.unet.set_attn_processor(AttnProcessor())
+        sf_pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+        image_single_file = sf_pipe(**inputs).images[0]
+
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+        pipe.unet.set_attn_processor(AttnProcessor())
+        pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images[0]
+
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten())
+
+        assert max_diff < expected_max_diff
+
+    def test_single_file_components_with_diffusers_config(
+        self,
+        pipe=None,
+        single_file_pipe=None,
+    ):
+        single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+            self.ckpt_path, config=self.repo_id, safety_checker=None
+        )
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_components_with_diffusers_config_local_files_only(
+        self,
+        pipe=None,
+        single_file_pipe=None,
+    ):
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir)
+
+            single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+                local_ckpt_path, config=local_diffusers_config, safety_checker=None, local_files_only=True
+            )
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_setting_pipeline_dtype_to_fp16(
+        self,
+        single_file_pipe=None,
+    ):
+        single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+            self.ckpt_path, torch_dtype=torch.float16
+        )
+
+        for component_name, component in single_file_pipe.components.items():
+            if not isinstance(component, torch.nn.Module):
+                continue
+
+            assert component.dtype == torch.float16
+
+
+class SDXLSingleFileTesterMixin:
+    def _compare_component_configs(self, pipe, single_file_pipe):
+        # Skip testing the text_encoder for Refiner Pipelines
+        if pipe.text_encoder:
+            for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items():
+                if param_name in ["torch_dtype", "architectures", "_name_or_path"]:
+                    continue
+                assert pipe.text_encoder.config.to_dict()[param_name] == param_value
+
+        for param_name, param_value in single_file_pipe.text_encoder_2.config.to_dict().items():
+            if param_name in ["torch_dtype", "architectures", "_name_or_path"]:
+                continue
+            assert pipe.text_encoder_2.config.to_dict()[param_name] == param_value
+
+        PARAMS_TO_IGNORE = [
+            "torch_dtype",
+            "_name_or_path",
+            "architectures",
+            "_use_default_values",
+            "_diffusers_version",
+        ]
+        for component_name, component in single_file_pipe.components.items():
+            if component_name in single_file_pipe._optional_components:
+                continue
+
+            # skip text encoders since they have already been tested
+            if component_name in ["text_encoder", "text_encoder_2", "tokenizer", "tokenizer_2"]:
+                continue
+
+            # skip safety checker if it is not present in the pipeline
+            if component_name in ["safety_checker", "feature_extractor"]:
+                continue
+
+            assert component_name in pipe.components, f"single file {component_name} not found in pretrained pipeline"
+            assert isinstance(
+                component, pipe.components[component_name].__class__
+            ), f"single file {component.__class__.__name__} and pretrained {pipe.components[component_name].__class__.__name__} are not the same"
+
+            for param_name, param_value in component.config.items():
+                if param_name in PARAMS_TO_IGNORE:
+                    continue
+
+                # Some pretrained configs will set upcast attention to None
+                # In single file loading it defaults to the value in the class __init__ which is False
+                if param_name == "upcast_attention" and pipe.components[component_name].config[param_name] is None:
+                    pipe.components[component_name].config[param_name] = param_value
+
+                assert (
+                    pipe.components[component_name].config[param_name] == param_value
+                ), f"single file {param_name}: {param_value} differs from pretrained {pipe.components[component_name].config[param_name]}"
+
+    def test_single_file_components(self, pipe=None, single_file_pipe=None):
+        single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+            self.ckpt_path, safety_checker=None
+        )
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+
+        self._compare_component_configs(
+            pipe,
+            single_file_pipe,
+        )
+
+    def test_single_file_components_local_files_only(
+        self,
+        pipe=None,
+        single_file_pipe=None,
+    ):
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+
+            single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+                local_ckpt_path, safety_checker=None, local_files_only=True
+            )
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_components_with_original_config(
+        self,
+        pipe=None,
+        single_file_pipe=None,
+    ):
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+        # Not possible to infer this value when original config is provided
+        # we just pass it in here otherwise this test will fail
+        upcast_attention = pipe.unet.config.upcast_attention
+        single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+            self.ckpt_path,
+            original_config=self.original_config,
+            safety_checker=None,
+            upcast_attention=upcast_attention,
+        )
+
+        self._compare_component_configs(
+            pipe,
+            single_file_pipe,
+        )
+
+    def test_single_file_components_with_original_config_local_files_only(
+        self,
+        pipe=None,
+        single_file_pipe=None,
+    ):
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+        # Not possible to infer this value when original config is provided
+        # we just pass it in here otherwise this test will fail
+        upcast_attention = pipe.unet.config.upcast_attention
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_original_config = download_original_config(self.original_config, tmpdir)
+
+            single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                original_config=local_original_config,
+                upcast_attention=upcast_attention,
+                safety_checker=None,
+                local_files_only=True,
+            )
+
+        self._compare_component_configs(
+            pipe,
+            single_file_pipe,
+        )
+
+    def test_single_file_components_with_diffusers_config(
+        self,
+        pipe=None,
+        single_file_pipe=None,
+    ):
+        single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+            self.ckpt_path, config=self.repo_id, safety_checker=None
+        )
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_components_with_diffusers_config_local_files_only(
+        self,
+        pipe=None,
+        single_file_pipe=None,
+    ):
+        pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir)
+
+            single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+                local_ckpt_path, config=local_diffusers_config, safety_checker=None, local_files_only=True
+            )
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_format_inference_is_same_as_pretrained(self, expected_max_diff=1e-4):
+        sf_pipe = self.pipeline_class.from_single_file(self.ckpt_path, torch_dtype=torch.float16, safety_checker=None)
+        sf_pipe.unet.set_default_attn_processor()
+        sf_pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+        image_single_file = sf_pipe(**inputs).images[0]
+
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16, safety_checker=None)
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+        image = pipe(**inputs).images[0]
+
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten())
+
+        assert max_diff < expected_max_diff
+
+    def test_single_file_setting_pipeline_dtype_to_fp16(
+        self,
+        single_file_pipe=None,
+    ):
+        single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file(
+            self.ckpt_path, torch_dtype=torch.float16
+        )
+
+        for component_name, component in single_file_pipe.components.items():
+            if not isinstance(component, torch.nn.Module):
+                continue
+
+            assert component.dtype == torch.float16
diff --git a/diffusers/tests/single_file/test_model_controlnet_single_file.py b/diffusers/tests/single_file/test_model_controlnet_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d5b790ebb4a9c2e1a7397b9258b134169f41e7d
--- /dev/null
+++ b/diffusers/tests/single_file/test_model_controlnet_single_file.py
@@ -0,0 +1,78 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import torch
+
+from diffusers import (
+    ControlNetModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+)
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class ControlNetModelSingleFileTests(unittest.TestCase):
+    model_class = ControlNetModel
+    ckpt_path = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"
+    repo_id = "lllyasviel/control_v11p_sd15_canny"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_single_file_components(self):
+        model = self.model_class.from_pretrained(self.repo_id)
+        model_single_file = self.model_class.from_single_file(self.ckpt_path)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between single file loading and pretrained loading"
+
+    def test_single_file_arguments(self):
+        model_default = self.model_class.from_single_file(self.ckpt_path)
+
+        assert model_default.config.upcast_attention is False
+        assert model_default.dtype == torch.float32
+
+        torch_dtype = torch.float16
+        upcast_attention = True
+
+        model = self.model_class.from_single_file(
+            self.ckpt_path,
+            upcast_attention=upcast_attention,
+            torch_dtype=torch_dtype,
+        )
+        assert model.config.upcast_attention == upcast_attention
+        assert model.dtype == torch_dtype
diff --git a/diffusers/tests/single_file/test_model_motion_adapter_single_file.py b/diffusers/tests/single_file/test_model_motion_adapter_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..b195f25d094bf8053224a89d7e0c42d0cbb11faa
--- /dev/null
+++ b/diffusers/tests/single_file/test_model_motion_adapter_single_file.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from diffusers import (
+    MotionAdapter,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+)
+
+
+enable_full_determinism()
+
+
+class MotionAdapterSingleFileTests(unittest.TestCase):
+    model_class = MotionAdapter
+
+    def test_single_file_components_version_v1_5(self):
+        ckpt_path = "https://huggingface.co/guoyww/animatediff/blob/main/mm_sd_v15.ckpt"
+        repo_id = "guoyww/animatediff-motion-adapter-v1-5"
+
+        model = self.model_class.from_pretrained(repo_id)
+        model_single_file = self.model_class.from_single_file(ckpt_path)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between pretrained loading and single file loading"
+
+    def test_single_file_components_version_v1_5_2(self):
+        ckpt_path = "https://huggingface.co/guoyww/animatediff/blob/main/mm_sd_v15_v2.ckpt"
+        repo_id = "guoyww/animatediff-motion-adapter-v1-5-2"
+
+        model = self.model_class.from_pretrained(repo_id)
+        model_single_file = self.model_class.from_single_file(ckpt_path)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between pretrained loading and single file loading"
+
+    def test_single_file_components_version_v1_5_3(self):
+        ckpt_path = "https://huggingface.co/guoyww/animatediff/blob/main/v3_sd15_mm.ckpt"
+        repo_id = "guoyww/animatediff-motion-adapter-v1-5-3"
+
+        model = self.model_class.from_pretrained(repo_id)
+        model_single_file = self.model_class.from_single_file(ckpt_path)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between pretrained loading and single file loading"
+
+    def test_single_file_components_version_sdxl_beta(self):
+        ckpt_path = "https://huggingface.co/guoyww/animatediff/blob/main/mm_sdxl_v10_beta.ckpt"
+        repo_id = "guoyww/animatediff-motion-adapter-sdxl-beta"
+
+        model = self.model_class.from_pretrained(repo_id)
+        model_single_file = self.model_class.from_single_file(ckpt_path)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between pretrained loading and single file loading"
diff --git a/diffusers/tests/single_file/test_model_sd_cascade_unet_single_file.py b/diffusers/tests/single_file/test_model_sd_cascade_unet_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..43253eb6d59f3aacceb3b0018dc26e843f4789d2
--- /dev/null
+++ b/diffusers/tests/single_file/test_model_sd_cascade_unet_single_file.py
@@ -0,0 +1,114 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import torch
+
+from diffusers import StableCascadeUNet
+from diffusers.utils import logging
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+)
+
+
+logger = logging.get_logger(__name__)
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableCascadeUNetSingleFileTest(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_single_file_components_stage_b(self):
+        model_single_file = StableCascadeUNet.from_single_file(
+            "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_bf16.safetensors",
+            torch_dtype=torch.bfloat16,
+        )
+        model = StableCascadeUNet.from_pretrained(
+            "stabilityai/stable-cascade", variant="bf16", subfolder="decoder", use_safetensors=True
+        )
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between single file loading and pretrained loading"
+
+    def test_single_file_components_stage_b_lite(self):
+        model_single_file = StableCascadeUNet.from_single_file(
+            "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_lite_bf16.safetensors",
+            torch_dtype=torch.bfloat16,
+        )
+        model = StableCascadeUNet.from_pretrained(
+            "stabilityai/stable-cascade", variant="bf16", subfolder="decoder_lite"
+        )
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between single file loading and pretrained loading"
+
+    def test_single_file_components_stage_c(self):
+        model_single_file = StableCascadeUNet.from_single_file(
+            "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors",
+            torch_dtype=torch.bfloat16,
+        )
+        model = StableCascadeUNet.from_pretrained(
+            "stabilityai/stable-cascade-prior", variant="bf16", subfolder="prior"
+        )
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between single file loading and pretrained loading"
+
+    def test_single_file_components_stage_c_lite(self):
+        model_single_file = StableCascadeUNet.from_single_file(
+            "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_lite_bf16.safetensors",
+            torch_dtype=torch.bfloat16,
+        )
+        model = StableCascadeUNet.from_pretrained(
+            "stabilityai/stable-cascade-prior", variant="bf16", subfolder="prior_lite"
+        )
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between single file loading and pretrained loading"
diff --git a/diffusers/tests/single_file/test_model_vae_single_file.py b/diffusers/tests/single_file/test_model_vae_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..63f2bb7574728ef0ce6782e98ca2078d9c5abb02
--- /dev/null
+++ b/diffusers/tests/single_file/test_model_vae_single_file.py
@@ -0,0 +1,117 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_hf_numpy,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class AutoencoderKLSingleFileTests(unittest.TestCase):
+    model_class = AutoencoderKL
+    ckpt_path = (
+        "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors"
+    )
+    repo_id = "stabilityai/sd-vae-ft-mse"
+    main_input_name = "sample"
+    base_precision = 1e-2
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_file_format(self, seed, shape):
+        return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
+
+    def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
+        dtype = torch.float16 if fp16 else torch.float32
+        image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
+        return image
+
+    def test_single_file_inference_same_as_pretrained(self):
+        model_1 = self.model_class.from_pretrained(self.repo_id).to(torch_device)
+        model_2 = self.model_class.from_single_file(self.ckpt_path, config=self.repo_id).to(torch_device)
+
+        image = self.get_sd_image(33)
+
+        generator = torch.Generator(torch_device)
+
+        with torch.no_grad():
+            sample_1 = model_1(image, generator=generator.manual_seed(0)).sample
+            sample_2 = model_2(image, generator=generator.manual_seed(0)).sample
+
+        assert sample_1.shape == sample_2.shape
+
+        output_slice_1 = sample_1.flatten().float().cpu()
+        output_slice_2 = sample_2.flatten().float().cpu()
+
+        assert numpy_cosine_similarity_distance(output_slice_1, output_slice_2) < 1e-4
+
+    def test_single_file_components(self):
+        model = self.model_class.from_pretrained(self.repo_id)
+        model_single_file = self.model_class.from_single_file(self.ckpt_path, config=self.repo_id)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between pretrained loading and single file loading"
+
+    def test_single_file_arguments(self):
+        model_default = self.model_class.from_single_file(self.ckpt_path, config=self.repo_id)
+
+        assert model_default.config.scaling_factor == 0.18215
+        assert model_default.config.sample_size == 256
+        assert model_default.dtype == torch.float32
+
+        scaling_factor = 2.0
+        sample_size = 512
+        torch_dtype = torch.float16
+
+        model = self.model_class.from_single_file(
+            self.ckpt_path,
+            config=self.repo_id,
+            sample_size=sample_size,
+            scaling_factor=scaling_factor,
+            torch_dtype=torch_dtype,
+        )
+        assert model.config.scaling_factor == scaling_factor
+        assert model.config.sample_size == sample_size
+        assert model.dtype == torch_dtype
diff --git a/diffusers/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..332bcfbe03b63e6f6c233151e3eef8704fb3ead2
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py
@@ -0,0 +1,196 @@
+import gc
+import tempfile
+import unittest
+
+import torch
+
+from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
+from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from .single_file_testing_utils import (
+    SDSingleFileTesterMixin,
+    download_diffusers_config,
+    download_original_config,
+    download_single_file_checkpoint,
+)
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionControlNetPipeline
+    ckpt_path = (
+        "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
+    )
+    original_config = (
+        "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+    )
+    repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/sketch-mountains-input.png"
+        )
+        control_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+        prompt = "bird"
+
+        inputs = {
+            "prompt": prompt,
+            "image": init_image,
+            "control_image": control_image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
+
+        pipe_sf = self.pipeline_class.from_single_file(
+            self.ckpt_path,
+            controlnet=controlnet,
+        )
+        pipe_sf.unet.set_default_attn_processor()
+        pipe_sf.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+        output = pipe(**inputs).images[0]
+
+        inputs = self.get_inputs(torch_device)
+        output_sf = pipe_sf(**inputs).images[0]
+
+        max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten())
+        assert max_diff < 1e-3
+
+    def test_single_file_components(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id, variant="fp16", safety_checker=None, controlnet=controlnet
+        )
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path,
+            safety_checker=None,
+            controlnet=controlnet,
+        )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weights_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weights_name, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path, controlnet=controlnet, safety_checker=None, local_files_only=True
+            )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_original_config(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, safety_checker=None, original_config=self.original_config
+        )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_original_config_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            controlnet=controlnet,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weights_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weights_name, tmpdir)
+
+            local_original_config = download_original_config(self.original_config, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                original_config=local_original_config,
+                controlnet=controlnet,
+                safety_checker=None,
+                local_files_only=True,
+            )
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_diffusers_config(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, safety_checker=None, original_config=self.original_config
+        )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_diffusers_config_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            controlnet=controlnet,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weights_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weights_name, tmpdir)
+
+            local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                config=local_diffusers_config,
+                safety_checker=None,
+                controlnet=controlnet,
+                local_files_only=True,
+            )
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_setting_pipeline_dtype_to_fp16(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16"
+        )
+        single_file_pipe = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
+        )
+        super().test_single_file_setting_pipeline_dtype_to_fp16(single_file_pipe)
diff --git a/diffusers/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0d70123b2867fb27469dc2c54001d4bc8c868f5
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py
@@ -0,0 +1,195 @@
+import gc
+import tempfile
+import unittest
+
+import torch
+
+from diffusers import ControlNetModel, StableDiffusionControlNetInpaintPipeline
+from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+)
+
+from .single_file_testing_utils import (
+    SDSingleFileTesterMixin,
+    download_diffusers_config,
+    download_original_config,
+    download_single_file_checkpoint,
+)
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionControlNetInpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionControlNetInpaintPipeline
+    ckpt_path = "https://huggingface.co/botp/stable-diffusion-v1-5-inpainting/blob/main/sd-v1-5-inpainting.ckpt"
+    original_config = "https://raw.githubusercontent.com/runwayml/stable-diffusion/main/configs/stable-diffusion/v1-inpainting-inference.yaml"
+    repo_id = "stable-diffusion-v1-5/stable-diffusion-inpainting"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self):
+        control_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+        image = load_image(
+            "https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png"
+        ).resize((512, 512))
+        mask_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_mask.png"
+        ).resize((512, 512))
+
+        inputs = {
+            "prompt": "bird",
+            "image": image,
+            "control_image": control_image,
+            "mask_image": mask_image,
+            "generator": torch.Generator(device="cpu").manual_seed(0),
+            "num_inference_steps": 3,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet, safety_checker=None)
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
+
+        pipe_sf = self.pipeline_class.from_single_file(self.ckpt_path, controlnet=controlnet, safety_checker=None)
+        pipe_sf.unet.set_default_attn_processor()
+        pipe_sf.enable_model_cpu_offload()
+
+        inputs = self.get_inputs()
+        output = pipe(**inputs).images[0]
+
+        inputs = self.get_inputs()
+        output_sf = pipe_sf(**inputs).images[0]
+
+        max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten())
+        assert max_diff < 2e-3
+
+    def test_single_file_components(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id, variant="fp16", safety_checker=None, controlnet=controlnet
+        )
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path,
+            safety_checker=None,
+            controlnet=controlnet,
+        )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None, controlnet=controlnet)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path, controlnet=controlnet, safety_checker=None, local_files_only=True
+            )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    @unittest.skip("runwayml original config repo does not exist")
+    def test_single_file_components_with_original_config(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, original_config=self.original_config
+        )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    @unittest.skip("runwayml original config repo does not exist")
+    def test_single_file_components_with_original_config_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            controlnet=controlnet,
+            safety_checker=None,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_original_config = download_original_config(self.original_config, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                original_config=local_original_config,
+                controlnet=controlnet,
+                safety_checker=None,
+                local_files_only=True,
+            )
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_diffusers_config(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path,
+            controlnet=controlnet,
+            config=self.repo_id,
+        )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_diffusers_config_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "lllyasviel/control_v11p_sd15_canny",
+            torch_dtype=torch.float16,
+            variant="fp16",
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            controlnet=controlnet,
+            safety_checker=None,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                config=local_diffusers_config,
+                controlnet=controlnet,
+                safety_checker=None,
+                local_files_only=True,
+            )
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_setting_pipeline_dtype_to_fp16(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16"
+        )
+        single_file_pipe = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
+        )
+        super().test_single_file_setting_pipeline_dtype_to_fp16(single_file_pipe)
diff --git a/diffusers/tests/single_file/test_stable_diffusion_controlnet_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_controlnet_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b5cf910b080c86943b0604894c3c726e2ca9bfb
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_controlnet_single_file.py
@@ -0,0 +1,183 @@
+import gc
+import tempfile
+import unittest
+
+import torch
+
+from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
+from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+)
+
+from .single_file_testing_utils import (
+    SDSingleFileTesterMixin,
+    download_diffusers_config,
+    download_original_config,
+    download_single_file_checkpoint,
+)
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionControlNetPipeline
+    ckpt_path = (
+        "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
+    )
+    original_config = (
+        "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+    )
+    repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self):
+        control_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+        inputs = {
+            "prompt": "bird",
+            "image": control_image,
+            "generator": torch.Generator(device="cpu").manual_seed(0),
+            "num_inference_steps": 3,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
+
+        pipe_sf = self.pipeline_class.from_single_file(
+            self.ckpt_path,
+            controlnet=controlnet,
+        )
+        pipe_sf.unet.set_default_attn_processor()
+        pipe_sf.enable_model_cpu_offload()
+
+        inputs = self.get_inputs()
+        output = pipe(**inputs).images[0]
+
+        inputs = self.get_inputs()
+        output_sf = pipe_sf(**inputs).images[0]
+
+        max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten())
+        assert max_diff < 1e-3
+
+    def test_single_file_components(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id, variant="fp16", safety_checker=None, controlnet=controlnet
+        )
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path,
+            safety_checker=None,
+            controlnet=controlnet,
+        )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path, controlnet=controlnet, local_files_only=True
+            )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_original_config(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, original_config=self.original_config
+        )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_original_config_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            controlnet=controlnet,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_original_config = download_original_config(self.original_config, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path, original_config=local_original_config, controlnet=controlnet, local_files_only=True
+            )
+            pipe_single_file.scheduler = pipe.scheduler
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_diffusers_config(self):
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16")
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, safety_checker=None, config=self.repo_id
+        )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_diffusers_config_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            controlnet=controlnet,
+            safety_checker=None,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                config=local_diffusers_config,
+                controlnet=controlnet,
+                safety_checker=None,
+                local_files_only=True,
+            )
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_setting_pipeline_dtype_to_fp16(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16"
+        )
+        single_file_pipe = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
+        )
+        super().test_single_file_setting_pipeline_dtype_to_fp16(single_file_pipe)
diff --git a/diffusers/tests/single_file/test_stable_diffusion_img2img_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_img2img_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f36f255014bcb79cfc2e02fc0ca57cae6b01e0
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_img2img_single_file.py
@@ -0,0 +1,101 @@
+import gc
+import unittest
+
+import torch
+
+from diffusers import (
+    StableDiffusionImg2ImgPipeline,
+)
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+)
+
+from .single_file_testing_utils import SDSingleFileTesterMixin
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionImg2ImgPipeline
+    ckpt_path = (
+        "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
+    )
+    original_config = (
+        "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+    )
+    repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/sketch-mountains-input.png"
+        )
+        inputs = {
+            "prompt": "a fantasy landscape, concept art, high resolution",
+            "image": init_image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusion21Img2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionImg2ImgPipeline
+    ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors"
+    original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"
+    repo_id = "stabilityai/stable-diffusion-2-1"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/sketch-mountains-input.png"
+        )
+        inputs = {
+            "prompt": "a fantasy landscape, concept art, high resolution",
+            "image": init_image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3)
diff --git a/diffusers/tests/single_file/test_stable_diffusion_inpaint_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_inpaint_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c6734a9a33e7db368cf7d39ebd4ea997a10ec70
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_inpaint_single_file.py
@@ -0,0 +1,122 @@
+import gc
+import unittest
+
+import torch
+
+from diffusers import (
+    StableDiffusionInpaintPipeline,
+)
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+)
+
+from .single_file_testing_utils import SDSingleFileTesterMixin
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionInpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionInpaintPipeline
+    ckpt_path = "https://huggingface.co/botp/stable-diffusion-v1-5-inpainting/blob/main/sd-v1-5-inpainting.ckpt"
+    original_config = "https://raw.githubusercontent.com/runwayml/stable-diffusion/main/configs/stable-diffusion/v1-inpainting-inference.yaml"
+    repo_id = "botp/stable-diffusion-v1-5-inpainting"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_mask.png"
+        )
+        inputs = {
+            "prompt": "Face of a yellow cat, high resolution, sitting on a park bench",
+            "image": init_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3)
+
+    def test_single_file_loading_4_channel_unet(self):
+        # Test loading single file inpaint with a 4 channel UNet
+        ckpt_path = "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
+        pipe = self.pipeline_class.from_single_file(ckpt_path)
+
+        assert pipe.unet.config.in_channels == 4
+
+    @unittest.skip("runwayml original config has been removed")
+    def test_single_file_components_with_original_config(self):
+        return
+
+    @unittest.skip("runwayml original config has been removed")
+    def test_single_file_components_with_original_config_local_files_only(self):
+        return
+
+
+@slow
+@require_torch_gpu
+class StableDiffusion21InpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionInpaintPipeline
+    ckpt_path = (
+        "https://huggingface.co/stabilityai/stable-diffusion-2-inpainting/blob/main/512-inpainting-ema.safetensors"
+    )
+    original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inpainting-inference.yaml"
+    repo_id = "stabilityai/stable-diffusion-2-inpainting"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_inpaint/input_bench_mask.png"
+        )
+        inputs = {
+            "prompt": "Face of a yellow cat, high resolution, sitting on a park bench",
+            "image": init_image,
+            "mask_image": mask_image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3)
diff --git a/diffusers/tests/single_file/test_stable_diffusion_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..e46e87e18c18946ae5220a5322f1412f512afd7b
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_single_file.py
@@ -0,0 +1,118 @@
+import gc
+import tempfile
+import unittest
+
+import torch
+
+from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline
+from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+)
+
+from .single_file_testing_utils import (
+    SDSingleFileTesterMixin,
+    download_original_config,
+    download_single_file_checkpoint,
+)
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionPipeline
+    ckpt_path = (
+        "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
+    )
+    original_config = (
+        "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+    )
+    repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        inputs = {
+            "prompt": "a fantasy landscape, concept art, high resolution",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3)
+
+    def test_single_file_legacy_scheduler_loading(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_original_config = download_original_config(self.original_config, tmpdir)
+
+            pipe = self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                original_config=local_original_config,
+                cache_dir=tmpdir,
+                local_files_only=True,
+                scheduler_type="euler",
+            )
+
+        # Default is PNDM for this checkpoint
+        assert isinstance(pipe.scheduler, EulerDiscreteScheduler)
+
+    def test_single_file_legacy_scaling_factor(self):
+        new_scaling_factor = 10.0
+        init_pipe = self.pipeline_class.from_single_file(self.ckpt_path)
+        pipe = self.pipeline_class.from_single_file(self.ckpt_path, scaling_factor=new_scaling_factor)
+
+        assert init_pipe.vae.config.scaling_factor != new_scaling_factor
+        assert pipe.vae.config.scaling_factor == new_scaling_factor
+
+
+@slow
+class StableDiffusion21PipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionPipeline
+    ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors"
+    original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"
+    repo_id = "stabilityai/stable-diffusion-2-1"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        inputs = {
+            "prompt": "a fantasy landscape, concept art, high resolution",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3)
diff --git a/diffusers/tests/single_file/test_stable_diffusion_upscale_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_upscale_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c26d001c2b032c3aca5220cdf7c72c5f5dae1d2
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_upscale_single_file.py
@@ -0,0 +1,68 @@
+import gc
+import unittest
+
+import torch
+
+from diffusers import (
+    StableDiffusionUpscalePipeline,
+)
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+)
+
+from .single_file_testing_utils import SDSingleFileTesterMixin
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionUpscalePipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionUpscalePipeline
+    ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/blob/main/x4-upscaler-ema.safetensors"
+    original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml"
+    repo_id = "stabilityai/stable-diffusion-x4-upscaler"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-upscale/low_res_cat.png"
+        )
+
+        prompt = "a cat sitting on a park bench"
+        pipe = StableDiffusionUpscalePipeline.from_pretrained(self.repo_id)
+        pipe.enable_model_cpu_offload()
+
+        generator = torch.Generator("cpu").manual_seed(0)
+        output = pipe(prompt=prompt, image=image, generator=generator, output_type="np", num_inference_steps=3)
+        image_from_pretrained = output.images[0]
+
+        pipe_from_single_file = StableDiffusionUpscalePipeline.from_single_file(self.ckpt_path)
+        pipe_from_single_file.enable_model_cpu_offload()
+
+        generator = torch.Generator("cpu").manual_seed(0)
+        output_from_single_file = pipe_from_single_file(
+            prompt=prompt, image=image, generator=generator, output_type="np", num_inference_steps=3
+        )
+        image_from_single_file = output_from_single_file.images[0]
+
+        assert image_from_pretrained.shape == (512, 512, 3)
+        assert image_from_single_file.shape == (512, 512, 3)
+        assert (
+            numpy_cosine_similarity_distance(image_from_pretrained.flatten(), image_from_single_file.flatten()) < 1e-3
+        )
diff --git a/diffusers/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..ead77a1d6553c3962d16fbbc97477abf85916acb
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py
@@ -0,0 +1,211 @@
+import gc
+import tempfile
+import unittest
+
+import torch
+
+from diffusers import (
+    StableDiffusionXLAdapterPipeline,
+    T2IAdapter,
+)
+from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+)
+
+from .single_file_testing_utils import (
+    SDXLSingleFileTesterMixin,
+    download_diffusers_config,
+    download_original_config,
+    download_single_file_checkpoint,
+)
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLAdapterPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin):
+    pipeline_class = StableDiffusionXLAdapterPipeline
+    ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors"
+    repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
+    original_config = (
+        "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
+    )
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self):
+        prompt = "toy"
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/toy_canny.png"
+        )
+
+        inputs = {
+            "prompt": prompt,
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16)
+        pipe_single_file = StableDiffusionXLAdapterPipeline.from_single_file(
+            self.ckpt_path,
+            adapter=adapter,
+            torch_dtype=torch.float16,
+            safety_checker=None,
+        )
+        pipe_single_file.enable_model_cpu_offload()
+        pipe_single_file.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs()
+        images_single_file = pipe_single_file(**inputs).images[0]
+
+        pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
+            self.repo_id,
+            adapter=adapter,
+            torch_dtype=torch.float16,
+            safety_checker=None,
+        )
+        pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs()
+        images = pipe(**inputs).images[0]
+
+        assert images_single_file.shape == (768, 512, 3)
+        assert images.shape == (768, 512, 3)
+
+        max_diff = numpy_cosine_similarity_distance(images.flatten(), images_single_file.flatten())
+        assert max_diff < 5e-3
+
+    def test_single_file_components(self):
+        adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16)
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            variant="fp16",
+            adapter=adapter,
+            torch_dtype=torch.float16,
+        )
+
+        pipe_single_file = self.pipeline_class.from_single_file(self.ckpt_path, safety_checker=None, adapter=adapter)
+        super().test_single_file_components(pipe, pipe_single_file)
+
+    def test_single_file_components_local_files_only(self):
+        adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16)
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            variant="fp16",
+            adapter=adapter,
+            torch_dtype=torch.float16,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+
+            single_file_pipe = self.pipeline_class.from_single_file(
+                local_ckpt_path, adapter=adapter, safety_checker=None, local_files_only=True
+            )
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_components_with_diffusers_config(self):
+        adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16)
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            variant="fp16",
+            adapter=adapter,
+            torch_dtype=torch.float16,
+            safety_checker=None,
+        )
+
+        pipe_single_file = self.pipeline_class.from_single_file(self.ckpt_path, config=self.repo_id, adapter=adapter)
+        self._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_diffusers_config_local_files_only(self):
+        adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16)
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            variant="fp16",
+            adapter=adapter,
+            torch_dtype=torch.float16,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                config=local_diffusers_config,
+                adapter=adapter,
+                safety_checker=None,
+                local_files_only=True,
+            )
+        self._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_original_config(self):
+        adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16)
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            variant="fp16",
+            adapter=adapter,
+            torch_dtype=torch.float16,
+            safety_checker=None,
+        )
+
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path, original_config=self.original_config, adapter=adapter
+        )
+        self._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_original_config_local_files_only(self):
+        adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16)
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            variant="fp16",
+            adapter=adapter,
+            torch_dtype=torch.float16,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_original_config = download_original_config(self.original_config, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                original_config=local_original_config,
+                adapter=adapter,
+                safety_checker=None,
+                local_files_only=True,
+            )
+        self._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_setting_pipeline_dtype_to_fp16(self):
+        adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16)
+
+        single_file_pipe = self.pipeline_class.from_single_file(
+            self.ckpt_path, adapter=adapter, torch_dtype=torch.float16
+        )
+        super().test_single_file_setting_pipeline_dtype_to_fp16(single_file_pipe)
diff --git a/diffusers/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..9491adf2dfa4d37e7ad749010b834e72104a2507
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py
@@ -0,0 +1,207 @@
+import gc
+import tempfile
+import unittest
+
+import torch
+
+from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
+from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from .single_file_testing_utils import (
+    SDXLSingleFileTesterMixin,
+    download_diffusers_config,
+    download_single_file_checkpoint,
+)
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin):
+    pipeline_class = StableDiffusionXLControlNetPipeline
+    ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors"
+    repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
+    original_config = (
+        "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
+    )
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
+        )
+        inputs = {
+            "prompt": "Stormtrooper's lecture",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16)
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe_single_file.unet.set_default_attn_processor()
+        pipe_single_file.enable_model_cpu_offload()
+        pipe_single_file.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        single_file_images = pipe_single_file(**inputs).images[0]
+
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet, torch_dtype=torch.float16)
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
+
+        inputs = self.get_inputs(torch_device)
+        images = pipe(**inputs).images[0]
+
+        assert images.shape == (512, 512, 3)
+        assert single_file_images.shape == (512, 512, 3)
+
+        max_diff = numpy_cosine_similarity_distance(images[0].flatten(), single_file_images[0].flatten())
+        assert max_diff < 5e-2
+
+    def test_single_file_components(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            variant="fp16",
+            controlnet=controlnet,
+            torch_dtype=torch.float16,
+        )
+
+        pipe_single_file = self.pipeline_class.from_single_file(self.ckpt_path, controlnet=controlnet)
+        super().test_single_file_components(pipe, pipe_single_file)
+
+    def test_single_file_components_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            variant="fp16",
+            controlnet=controlnet,
+            torch_dtype=torch.float16,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+
+            single_file_pipe = self.pipeline_class.from_single_file(
+                local_ckpt_path, controlnet=controlnet, safety_checker=None, local_files_only=True
+            )
+
+        self._compare_component_configs(pipe, single_file_pipe)
+
+    def test_single_file_components_with_original_config(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            variant="fp16",
+            controlnet=controlnet,
+            torch_dtype=torch.float16,
+        )
+
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path,
+            original_config=self.original_config,
+            controlnet=controlnet,
+        )
+        self._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_original_config_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            variant="fp16",
+            controlnet=controlnet,
+            torch_dtype=torch.float16,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                safety_checker=None,
+                controlnet=controlnet,
+                local_files_only=True,
+            )
+        self._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_diffusers_config(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet)
+        pipe_single_file = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, config=self.repo_id
+        )
+
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_components_with_diffusers_config_local_files_only(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
+        )
+        pipe = self.pipeline_class.from_pretrained(
+            self.repo_id,
+            controlnet=controlnet,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo_id, weight_name = _extract_repo_id_and_weights_name(self.ckpt_path)
+            local_ckpt_path = download_single_file_checkpoint(repo_id, weight_name, tmpdir)
+            local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir)
+
+            pipe_single_file = self.pipeline_class.from_single_file(
+                local_ckpt_path,
+                config=local_diffusers_config,
+                safety_checker=None,
+                controlnet=controlnet,
+                local_files_only=True,
+            )
+        super()._compare_component_configs(pipe, pipe_single_file)
+
+    def test_single_file_setting_pipeline_dtype_to_fp16(self):
+        controlnet = ControlNetModel.from_pretrained(
+            "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
+        )
+        single_file_pipe = self.pipeline_class.from_single_file(
+            self.ckpt_path, controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
+        )
+        super().test_single_file_setting_pipeline_dtype_to_fp16(single_file_pipe)
diff --git a/diffusers/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..71b57eb7c6c9bdd2d9355dee7cd3e97545596913
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py
@@ -0,0 +1,105 @@
+import gc
+import unittest
+
+import torch
+
+from diffusers import (
+    DDIMScheduler,
+    StableDiffusionXLImg2ImgPipeline,
+)
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+)
+
+from .single_file_testing_utils import SDXLSingleFileTesterMixin
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin):
+    pipeline_class = StableDiffusionXLImg2ImgPipeline
+    ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors"
+    repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
+    original_config = (
+        "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
+    )
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/sketch-mountains-input.png"
+        )
+        inputs = {
+            "prompt": "a fantasy landscape, concept art, high resolution",
+            "image": init_image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLImg2ImgRefinerPipelineSingleFileSlowTests(unittest.TestCase):
+    pipeline_class = StableDiffusionXLImg2ImgPipeline
+    ckpt_path = (
+        "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors"
+    )
+    repo_id = "stabilityai/stable-diffusion-xl-refiner-1.0"
+    original_config = (
+        "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml"
+    )
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/sketch-mountains-input.png"
+        )
+
+        pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image = pipe(
+            prompt="mountains", image=init_image, num_inference_steps=5, generator=generator, output_type="np"
+        ).images[0]
+
+        pipe_single_file = self.pipeline_class.from_single_file(self.ckpt_path, torch_dtype=torch.float16)
+        pipe_single_file.scheduler = DDIMScheduler.from_config(pipe_single_file.scheduler.config)
+        pipe_single_file.unet.set_default_attn_processor()
+        pipe_single_file.enable_model_cpu_offload()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_single_file = pipe_single_file(
+            prompt="mountains", image=init_image, num_inference_steps=5, generator=generator, output_type="np"
+        ).images[0]
+
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten())
+
+        assert max_diff < 5e-4
diff --git a/diffusers/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py b/diffusers/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ebddc8555bbf40b892384aa28ea65f15101e3d8
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py
@@ -0,0 +1,50 @@
+import gc
+import unittest
+
+import torch
+
+from diffusers import StableDiffusionXLInstructPix2PixPipeline
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+)
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLInstructPix2PixPipeline(unittest.TestCase):
+    pipeline_class = StableDiffusionXLInstructPix2PixPipeline
+    ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors"
+    original_config = None
+    repo_id = "diffusers/sdxl-instructpix2pix-768"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        inputs = {
+            "prompt": "a fantasy landscape, concept art, high resolution",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_setting_cosxl_edit(self):
+        # Default is PNDM for this checkpoint
+        pipe = self.pipeline_class.from_single_file(self.ckpt_path, config=self.repo_id, is_cosxl_edit=True)
+        assert pipe.is_cosxl_edit is True
diff --git a/diffusers/tests/single_file/test_stable_diffusion_xl_single_file.py b/diffusers/tests/single_file/test_stable_diffusion_xl_single_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..a143a35a2bbc4baf99d928b5d88c385b6753b461
--- /dev/null
+++ b/diffusers/tests/single_file/test_stable_diffusion_xl_single_file.py
@@ -0,0 +1,54 @@
+import gc
+import unittest
+
+import torch
+
+from diffusers import (
+    StableDiffusionXLPipeline,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    require_torch_gpu,
+    slow,
+)
+
+from .single_file_testing_utils import SDXLSingleFileTesterMixin
+
+
+enable_full_determinism()
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionXLPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin):
+    pipeline_class = StableDiffusionXLPipeline
+    ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors"
+    repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
+    original_config = (
+        "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
+    )
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        inputs = {
+            "prompt": "a fantasy landscape, concept art, high resolution",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "strength": 0.75,
+            "guidance_scale": 7.5,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3)
diff --git a/diffusers/utils/check_config_docstrings.py b/diffusers/utils/check_config_docstrings.py
new file mode 100644
index 0000000000000000000000000000000000000000..626a9a46857245f006312229710d1f5d0faa565f
--- /dev/null
+++ b/diffusers/utils/check_config_docstrings.py
@@ -0,0 +1,84 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+import os
+import re
+
+
+# All paths are set with the intent you should run this script from the root of the repo with the command
+# python utils/check_config_docstrings.py
+PATH_TO_TRANSFORMERS = "src/transformers"
+
+
+# This is to make sure the transformers module imported is the one in the repo.
+spec = importlib.util.spec_from_file_location(
+    "transformers",
+    os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"),
+    submodule_search_locations=[PATH_TO_TRANSFORMERS],
+)
+transformers = spec.loader.load_module()
+
+CONFIG_MAPPING = transformers.models.auto.configuration_auto.CONFIG_MAPPING
+
+# Regex pattern used to find the checkpoint mentioned in the docstring of `config_class`.
+# For example, `[bert-base-uncased](https://huggingface.co/bert-base-uncased)`
+_re_checkpoint = re.compile(r"\[(.+?)\]\((https://huggingface\.co/.+?)\)")
+
+
+CONFIG_CLASSES_TO_IGNORE_FOR_DOCSTRING_CHECKPOINT_CHECK = {
+    "CLIPConfigMixin",
+    "DecisionTransformerConfigMixin",
+    "EncoderDecoderConfigMixin",
+    "RagConfigMixin",
+    "SpeechEncoderDecoderConfigMixin",
+    "VisionEncoderDecoderConfigMixin",
+    "VisionTextDualEncoderConfigMixin",
+}
+
+
+def check_config_docstrings_have_checkpoints():
+    configs_without_checkpoint = []
+
+    for config_class in list(CONFIG_MAPPING.values()):
+        checkpoint_found = False
+
+        # source code of `config_class`
+        config_source = inspect.getsource(config_class)
+        checkpoints = _re_checkpoint.findall(config_source)
+
+        for checkpoint in checkpoints:
+            # Each `checkpoint` is a tuple of a checkpoint name and a checkpoint link.
+            # For example, `('bert-base-uncased', 'https://huggingface.co/bert-base-uncased')`
+            ckpt_name, ckpt_link = checkpoint
+
+            # verify the checkpoint name corresponds to the checkpoint link
+            ckpt_link_from_name = f"https://huggingface.co/{ckpt_name}"
+            if ckpt_link == ckpt_link_from_name:
+                checkpoint_found = True
+                break
+
+        name = config_class.__name__
+        if not checkpoint_found and name not in CONFIG_CLASSES_TO_IGNORE_FOR_DOCSTRING_CHECKPOINT_CHECK:
+            configs_without_checkpoint.append(name)
+
+    if len(configs_without_checkpoint) > 0:
+        message = "\n".join(sorted(configs_without_checkpoint))
+        raise ValueError(f"The following configurations don't contain any valid checkpoint:\n{message}")
+
+
+if __name__ == "__main__":
+    check_config_docstrings_have_checkpoints()
diff --git a/diffusers/utils/check_copies.py b/diffusers/utils/check_copies.py
new file mode 100644
index 0000000000000000000000000000000000000000..20449e790db238fa5f6aa26c064cf30bd7270bcf
--- /dev/null
+++ b/diffusers/utils/check_copies.py
@@ -0,0 +1,222 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import glob
+import os
+import re
+import subprocess
+
+
+# All paths are set with the intent you should run this script from the root of the repo with the command
+# python utils/check_copies.py
+DIFFUSERS_PATH = "src/diffusers"
+REPO_PATH = "."
+
+
+def _should_continue(line, indent):
+    return line.startswith(indent) or len(line) <= 1 or re.search(r"^\s*\)(\s*->.*:|:)\s*$", line) is not None
+
+
+def find_code_in_diffusers(object_name):
+    """Find and return the code source code of `object_name`."""
+    parts = object_name.split(".")
+    i = 0
+
+    # First let's find the module where our object lives.
+    module = parts[i]
+    while i < len(parts) and not os.path.isfile(os.path.join(DIFFUSERS_PATH, f"{module}.py")):
+        i += 1
+        if i < len(parts):
+            module = os.path.join(module, parts[i])
+    if i >= len(parts):
+        raise ValueError(f"`object_name` should begin with the name of a module of diffusers but got {object_name}.")
+
+    with open(
+        os.path.join(DIFFUSERS_PATH, f"{module}.py"),
+        "r",
+        encoding="utf-8",
+        newline="\n",
+    ) as f:
+        lines = f.readlines()
+
+    # Now let's find the class / func in the code!
+    indent = ""
+    line_index = 0
+    for name in parts[i + 1 :]:
+        while (
+            line_index < len(lines) and re.search(rf"^{indent}(class|def)\s+{name}(\(|\:)", lines[line_index]) is None
+        ):
+            line_index += 1
+        indent += "    "
+        line_index += 1
+
+    if line_index >= len(lines):
+        raise ValueError(f" {object_name} does not match any function or class in {module}.")
+
+    # We found the beginning of the class / func, now let's find the end (when the indent diminishes).
+    start_index = line_index
+    while line_index < len(lines) and _should_continue(lines[line_index], indent):
+        line_index += 1
+    # Clean up empty lines at the end (if any).
+    while len(lines[line_index - 1]) <= 1:
+        line_index -= 1
+
+    code_lines = lines[start_index:line_index]
+    return "".join(code_lines)
+
+
+_re_copy_warning = re.compile(r"^(\s*)#\s*Copied from\s+diffusers\.(\S+\.\S+)\s*($|\S.*$)")
+_re_replace_pattern = re.compile(r"^\s*(\S+)->(\S+)(\s+.*|$)")
+_re_fill_pattern = re.compile(r"<FILL\s+[^>]*>")
+
+
+def get_indent(code):
+    lines = code.split("\n")
+    idx = 0
+    while idx < len(lines) and len(lines[idx]) == 0:
+        idx += 1
+    if idx < len(lines):
+        return re.search(r"^(\s*)\S", lines[idx]).groups()[0]
+    return ""
+
+
+def run_ruff(code):
+    command = ["ruff", "format", "-", "--config", "pyproject.toml", "--silent"]
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+    stdout, _ = process.communicate(input=code.encode())
+    return stdout.decode()
+
+
+def stylify(code: str) -> str:
+    """
+    Applies the ruff part of our `make style` command to some code. This formats the code using `ruff format`.
+    As `ruff` does not provide a python api this cannot be done on the fly.
+
+    Args:
+        code (`str`): The code to format.
+
+    Returns:
+        `str`: The formatted code.
+    """
+    has_indent = len(get_indent(code)) > 0
+    if has_indent:
+        code = f"class Bla:\n{code}"
+    formatted_code = run_ruff(code)
+    return formatted_code[len("class Bla:\n") :] if has_indent else formatted_code
+
+
+def is_copy_consistent(filename, overwrite=False):
+    """
+    Check if the code commented as a copy in `filename` matches the original.
+    Return the differences or overwrites the content depending on `overwrite`.
+    """
+    with open(filename, "r", encoding="utf-8", newline="\n") as f:
+        lines = f.readlines()
+    diffs = []
+    line_index = 0
+    # Not a for loop cause `lines` is going to change (if `overwrite=True`).
+    while line_index < len(lines):
+        search = _re_copy_warning.search(lines[line_index])
+        if search is None:
+            line_index += 1
+            continue
+
+        # There is some copied code here, let's retrieve the original.
+        indent, object_name, replace_pattern = search.groups()
+        theoretical_code = find_code_in_diffusers(object_name)
+        theoretical_indent = get_indent(theoretical_code)
+
+        start_index = line_index + 1 if indent == theoretical_indent else line_index + 2
+        indent = theoretical_indent
+        line_index = start_index
+
+        # Loop to check the observed code, stop when indentation diminishes or if we see a End copy comment.
+        should_continue = True
+        while line_index < len(lines) and should_continue:
+            line_index += 1
+            if line_index >= len(lines):
+                break
+            line = lines[line_index]
+            should_continue = _should_continue(line, indent) and re.search(f"^{indent}# End copy", line) is None
+        # Clean up empty lines at the end (if any).
+        while len(lines[line_index - 1]) <= 1:
+            line_index -= 1
+
+        observed_code_lines = lines[start_index:line_index]
+        observed_code = "".join(observed_code_lines)
+
+        # Remove any nested `Copied from` comments to avoid circular copies
+        theoretical_code = [line for line in theoretical_code.split("\n") if _re_copy_warning.search(line) is None]
+        theoretical_code = "\n".join(theoretical_code)
+
+        # Before comparing, use the `replace_pattern` on the original code.
+        if len(replace_pattern) > 0:
+            patterns = replace_pattern.replace("with", "").split(",")
+            patterns = [_re_replace_pattern.search(p) for p in patterns]
+            for pattern in patterns:
+                if pattern is None:
+                    continue
+                obj1, obj2, option = pattern.groups()
+                theoretical_code = re.sub(obj1, obj2, theoretical_code)
+                if option.strip() == "all-casing":
+                    theoretical_code = re.sub(obj1.lower(), obj2.lower(), theoretical_code)
+                    theoretical_code = re.sub(obj1.upper(), obj2.upper(), theoretical_code)
+
+            # stylify after replacement. To be able to do that, we need the header (class or function definition)
+            # from the previous line
+            theoretical_code = stylify(lines[start_index - 1] + theoretical_code)
+            theoretical_code = theoretical_code[len(lines[start_index - 1]) :]
+
+        # Test for a diff and act accordingly.
+        if observed_code != theoretical_code:
+            diffs.append([object_name, start_index])
+            if overwrite:
+                lines = lines[:start_index] + [theoretical_code] + lines[line_index:]
+                line_index = start_index + 1
+
+    if overwrite and len(diffs) > 0:
+        # Warn the user a file has been modified.
+        print(f"Detected changes, rewriting {filename}.")
+        with open(filename, "w", encoding="utf-8", newline="\n") as f:
+            f.writelines(lines)
+    return diffs
+
+
+def check_copies(overwrite: bool = False):
+    all_files = glob.glob(os.path.join(DIFFUSERS_PATH, "**/*.py"), recursive=True)
+    diffs = []
+    for filename in all_files:
+        new_diffs = is_copy_consistent(filename, overwrite)
+        diffs += [f"- {filename}: copy does not match {d[0]} at line {d[1]}" for d in new_diffs]
+    if not overwrite and len(diffs) > 0:
+        diff = "\n".join(diffs)
+        raise Exception(
+            "Found the following copy inconsistencies:\n"
+            + diff
+            + "\nRun `make fix-copies` or `python utils/check_copies.py --fix_and_overwrite` to fix them."
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--fix_and_overwrite",
+        action="store_true",
+        help="Whether to fix inconsistencies.",
+    )
+    args = parser.parse_args()
+
+    check_copies(args.fix_and_overwrite)
diff --git a/diffusers/utils/check_doc_toc.py b/diffusers/utils/check_doc_toc.py
new file mode 100644
index 0000000000000000000000000000000000000000..35ded936650d4041c5d7981270cb1da41d13b479
--- /dev/null
+++ b/diffusers/utils/check_doc_toc.py
@@ -0,0 +1,158 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections import defaultdict
+
+import yaml
+
+
+PATH_TO_TOC = "docs/source/en/_toctree.yml"
+
+
+def clean_doc_toc(doc_list):
+    """
+    Cleans the table of content of the model documentation by removing duplicates and sorting models alphabetically.
+    """
+    counts = defaultdict(int)
+    overview_doc = []
+    new_doc_list = []
+    for doc in doc_list:
+        if "local" in doc:
+            counts[doc["local"]] += 1
+
+        if doc["title"].lower() == "overview":
+            overview_doc.append({"local": doc["local"], "title": doc["title"]})
+        else:
+            new_doc_list.append(doc)
+
+    doc_list = new_doc_list
+    duplicates = [key for key, value in counts.items() if value > 1]
+
+    new_doc = []
+    for duplicate_key in duplicates:
+        titles = list({doc["title"] for doc in doc_list if doc["local"] == duplicate_key})
+        if len(titles) > 1:
+            raise ValueError(
+                f"{duplicate_key} is present several times in the documentation table of content at "
+                "`docs/source/en/_toctree.yml` with different *Title* values. Choose one of those and remove the "
+                "others."
+            )
+        # Only add this once
+        new_doc.append({"local": duplicate_key, "title": titles[0]})
+
+    # Add none duplicate-keys
+    new_doc.extend([doc for doc in doc_list if "local" not in counts or counts[doc["local"]] == 1])
+    new_doc = sorted(new_doc, key=lambda s: s["title"].lower())
+
+    # "overview" gets special treatment and is always first
+    if len(overview_doc) > 1:
+        raise ValueError("{doc_list} has two 'overview' docs which is not allowed.")
+
+    overview_doc.extend(new_doc)
+
+    # Sort
+    return overview_doc
+
+
+def check_scheduler_doc(overwrite=False):
+    with open(PATH_TO_TOC, encoding="utf-8") as f:
+        content = yaml.safe_load(f.read())
+
+    # Get to the API doc
+    api_idx = 0
+    while content[api_idx]["title"] != "API":
+        api_idx += 1
+    api_doc = content[api_idx]["sections"]
+
+    # Then to the model doc
+    scheduler_idx = 0
+    while api_doc[scheduler_idx]["title"] != "Schedulers":
+        scheduler_idx += 1
+
+    scheduler_doc = api_doc[scheduler_idx]["sections"]
+    new_scheduler_doc = clean_doc_toc(scheduler_doc)
+
+    diff = False
+    if new_scheduler_doc != scheduler_doc:
+        diff = True
+        if overwrite:
+            api_doc[scheduler_idx]["sections"] = new_scheduler_doc
+
+    if diff:
+        if overwrite:
+            content[api_idx]["sections"] = api_doc
+            with open(PATH_TO_TOC, "w", encoding="utf-8") as f:
+                f.write(yaml.dump(content, allow_unicode=True))
+        else:
+            raise ValueError(
+                "The model doc part of the table of content is not properly sorted, run `make style` to fix this."
+            )
+
+
+def check_pipeline_doc(overwrite=False):
+    with open(PATH_TO_TOC, encoding="utf-8") as f:
+        content = yaml.safe_load(f.read())
+
+    # Get to the API doc
+    api_idx = 0
+    while content[api_idx]["title"] != "API":
+        api_idx += 1
+    api_doc = content[api_idx]["sections"]
+
+    # Then to the model doc
+    pipeline_idx = 0
+    while api_doc[pipeline_idx]["title"] != "Pipelines":
+        pipeline_idx += 1
+
+    diff = False
+    pipeline_docs = api_doc[pipeline_idx]["sections"]
+    new_pipeline_docs = []
+
+    # sort sub pipeline docs
+    for pipeline_doc in pipeline_docs:
+        if "section" in pipeline_doc:
+            sub_pipeline_doc = pipeline_doc["section"]
+            new_sub_pipeline_doc = clean_doc_toc(sub_pipeline_doc)
+            if overwrite:
+                pipeline_doc["section"] = new_sub_pipeline_doc
+        new_pipeline_docs.append(pipeline_doc)
+
+    # sort overall pipeline doc
+    new_pipeline_docs = clean_doc_toc(new_pipeline_docs)
+
+    if new_pipeline_docs != pipeline_docs:
+        diff = True
+        if overwrite:
+            api_doc[pipeline_idx]["sections"] = new_pipeline_docs
+
+    if diff:
+        if overwrite:
+            content[api_idx]["sections"] = api_doc
+            with open(PATH_TO_TOC, "w", encoding="utf-8") as f:
+                f.write(yaml.dump(content, allow_unicode=True))
+        else:
+            raise ValueError(
+                "The model doc part of the table of content is not properly sorted, run `make style` to fix this."
+            )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
+    args = parser.parse_args()
+
+    check_scheduler_doc(args.fix_and_overwrite)
+    check_pipeline_doc(args.fix_and_overwrite)
diff --git a/diffusers/utils/check_dummies.py b/diffusers/utils/check_dummies.py
new file mode 100644
index 0000000000000000000000000000000000000000..af99eeb05c6d5fbbd5fdca97ed2885652854d58a
--- /dev/null
+++ b/diffusers/utils/check_dummies.py
@@ -0,0 +1,175 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import re
+
+
+# All paths are set with the intent you should run this script from the root of the repo with the command
+# python utils/check_dummies.py
+PATH_TO_DIFFUSERS = "src/diffusers"
+
+# Matches is_xxx_available()
+_re_backend = re.compile(r"is\_([a-z_]*)_available\(\)")
+# Matches from xxx import bla
+_re_single_line_import = re.compile(r"\s+from\s+\S*\s+import\s+([^\(\s].*)\n")
+
+
+DUMMY_CONSTANT = """
+{0} = None
+"""
+
+DUMMY_CLASS = """
+class {0}(metaclass=DummyObject):
+    _backends = {1}
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, {1})
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, {1})
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, {1})
+"""
+
+
+DUMMY_FUNCTION = """
+def {0}(*args, **kwargs):
+    requires_backends({0}, {1})
+"""
+
+
+def find_backend(line):
+    """Find one (or multiple) backend in a code line of the init."""
+    backends = _re_backend.findall(line)
+    if len(backends) == 0:
+        return None
+
+    return "_and_".join(backends)
+
+
+def read_init():
+    """Read the init and extracts PyTorch, TensorFlow, SentencePiece and Tokenizers objects."""
+    with open(os.path.join(PATH_TO_DIFFUSERS, "__init__.py"), "r", encoding="utf-8", newline="\n") as f:
+        lines = f.readlines()
+
+    # Get to the point we do the actual imports for type checking
+    line_index = 0
+    while not lines[line_index].startswith("if TYPE_CHECKING"):
+        line_index += 1
+
+    backend_specific_objects = {}
+    # Go through the end of the file
+    while line_index < len(lines):
+        # If the line contains is_backend_available, we grab all objects associated with the `else` block
+        backend = find_backend(lines[line_index])
+        if backend is not None:
+            while not lines[line_index].startswith("    else:"):
+                line_index += 1
+            line_index += 1
+            objects = []
+            # Until we unindent, add backend objects to the list
+            while len(lines[line_index]) <= 1 or lines[line_index].startswith(" " * 8):
+                line = lines[line_index]
+                single_line_import_search = _re_single_line_import.search(line)
+                if single_line_import_search is not None:
+                    objects.extend(single_line_import_search.groups()[0].split(", "))
+                elif line.startswith(" " * 12):
+                    objects.append(line[12:-2])
+                line_index += 1
+
+            if len(objects) > 0:
+                backend_specific_objects[backend] = objects
+        else:
+            line_index += 1
+
+    return backend_specific_objects
+
+
+def create_dummy_object(name, backend_name):
+    """Create the code for the dummy object corresponding to `name`."""
+    if name.isupper():
+        return DUMMY_CONSTANT.format(name)
+    elif name.islower():
+        return DUMMY_FUNCTION.format(name, backend_name)
+    else:
+        return DUMMY_CLASS.format(name, backend_name)
+
+
+def create_dummy_files(backend_specific_objects=None):
+    """Create the content of the dummy files."""
+    if backend_specific_objects is None:
+        backend_specific_objects = read_init()
+    # For special correspondence backend to module name as used in the function requires_modulename
+    dummy_files = {}
+
+    for backend, objects in backend_specific_objects.items():
+        backend_name = "[" + ", ".join(f'"{b}"' for b in backend.split("_and_")) + "]"
+        dummy_file = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
+        dummy_file += "from ..utils import DummyObject, requires_backends\n\n"
+        dummy_file += "\n".join([create_dummy_object(o, backend_name) for o in objects])
+        dummy_files[backend] = dummy_file
+
+    return dummy_files
+
+
+def check_dummies(overwrite=False):
+    """Check if the dummy files are up to date and maybe `overwrite` with the right content."""
+    dummy_files = create_dummy_files()
+    # For special correspondence backend to shortcut as used in utils/dummy_xxx_objects.py
+    short_names = {"torch": "pt"}
+
+    # Locate actual dummy modules and read their content.
+    path = os.path.join(PATH_TO_DIFFUSERS, "utils")
+    dummy_file_paths = {
+        backend: os.path.join(path, f"dummy_{short_names.get(backend, backend)}_objects.py")
+        for backend in dummy_files.keys()
+    }
+
+    actual_dummies = {}
+    for backend, file_path in dummy_file_paths.items():
+        if os.path.isfile(file_path):
+            with open(file_path, "r", encoding="utf-8", newline="\n") as f:
+                actual_dummies[backend] = f.read()
+        else:
+            actual_dummies[backend] = ""
+
+    for backend in dummy_files.keys():
+        if dummy_files[backend] != actual_dummies[backend]:
+            if overwrite:
+                print(
+                    f"Updating diffusers.utils.dummy_{short_names.get(backend, backend)}_objects.py as the main "
+                    "__init__ has new objects."
+                )
+                with open(dummy_file_paths[backend], "w", encoding="utf-8", newline="\n") as f:
+                    f.write(dummy_files[backend])
+            else:
+                raise ValueError(
+                    "The main __init__ has objects that are not present in "
+                    f"diffusers.utils.dummy_{short_names.get(backend, backend)}_objects.py. Run `make fix-copies` "
+                    "to fix this."
+                )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
+    args = parser.parse_args()
+
+    check_dummies(args.fix_and_overwrite)
diff --git a/diffusers/utils/check_inits.py b/diffusers/utils/check_inits.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c514046afaa26b4aa2025e32790c759c6b890dd
--- /dev/null
+++ b/diffusers/utils/check_inits.py
@@ -0,0 +1,299 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import importlib.util
+import os
+import re
+from pathlib import Path
+
+
+PATH_TO_TRANSFORMERS = "src/transformers"
+
+
+# Matches is_xxx_available()
+_re_backend = re.compile(r"is\_([a-z_]*)_available()")
+# Catches a one-line _import_struct = {xxx}
+_re_one_line_import_struct = re.compile(r"^_import_structure\s+=\s+\{([^\}]+)\}")
+# Catches a line with a key-values pattern: "bla": ["foo", "bar"]
+_re_import_struct_key_value = re.compile(r'\s+"\S*":\s+\[([^\]]*)\]')
+# Catches a line if not is_foo_available
+_re_test_backend = re.compile(r"^\s*if\s+not\s+is\_[a-z_]*\_available\(\)")
+# Catches a line _import_struct["bla"].append("foo")
+_re_import_struct_add_one = re.compile(r'^\s*_import_structure\["\S*"\]\.append\("(\S*)"\)')
+# Catches a line _import_struct["bla"].extend(["foo", "bar"]) or _import_struct["bla"] = ["foo", "bar"]
+_re_import_struct_add_many = re.compile(r"^\s*_import_structure\[\S*\](?:\.extend\(|\s*=\s+)\[([^\]]*)\]")
+# Catches a line with an object between quotes and a comma:     "MyModel",
+_re_quote_object = re.compile(r'^\s+"([^"]+)",')
+# Catches a line with objects between brackets only:    ["foo", "bar"],
+_re_between_brackets = re.compile(r"^\s+\[([^\]]+)\]")
+# Catches a line with from foo import bar, bla, boo
+_re_import = re.compile(r"\s+from\s+\S*\s+import\s+([^\(\s].*)\n")
+# Catches a line with try:
+_re_try = re.compile(r"^\s*try:")
+# Catches a line with else:
+_re_else = re.compile(r"^\s*else:")
+
+
+def find_backend(line):
+    """Find one (or multiple) backend in a code line of the init."""
+    if _re_test_backend.search(line) is None:
+        return None
+    backends = [b[0] for b in _re_backend.findall(line)]
+    backends.sort()
+    return "_and_".join(backends)
+
+
+def parse_init(init_file):
+    """
+    Read an init_file and parse (per backend) the _import_structure objects defined and the TYPE_CHECKING objects
+    defined
+    """
+    with open(init_file, "r", encoding="utf-8", newline="\n") as f:
+        lines = f.readlines()
+
+    line_index = 0
+    while line_index < len(lines) and not lines[line_index].startswith("_import_structure = {"):
+        line_index += 1
+
+    # If this is a traditional init, just return.
+    if line_index >= len(lines):
+        return None
+
+    # First grab the objects without a specific backend in _import_structure
+    objects = []
+    while not lines[line_index].startswith("if TYPE_CHECKING") and find_backend(lines[line_index]) is None:
+        line = lines[line_index]
+        # If we have everything on a single line, let's deal with it.
+        if _re_one_line_import_struct.search(line):
+            content = _re_one_line_import_struct.search(line).groups()[0]
+            imports = re.findall(r"\[([^\]]+)\]", content)
+            for imp in imports:
+                objects.extend([obj[1:-1] for obj in imp.split(", ")])
+            line_index += 1
+            continue
+        single_line_import_search = _re_import_struct_key_value.search(line)
+        if single_line_import_search is not None:
+            imports = [obj[1:-1] for obj in single_line_import_search.groups()[0].split(", ") if len(obj) > 0]
+            objects.extend(imports)
+        elif line.startswith(" " * 8 + '"'):
+            objects.append(line[9:-3])
+        line_index += 1
+
+    import_dict_objects = {"none": objects}
+    # Let's continue with backend-specific objects in _import_structure
+    while not lines[line_index].startswith("if TYPE_CHECKING"):
+        # If the line is an if not is_backend_available, we grab all objects associated.
+        backend = find_backend(lines[line_index])
+        # Check if the backend declaration is inside a try block:
+        if _re_try.search(lines[line_index - 1]) is None:
+            backend = None
+
+        if backend is not None:
+            line_index += 1
+
+            # Scroll until we hit the else block of try-except-else
+            while _re_else.search(lines[line_index]) is None:
+                line_index += 1
+
+            line_index += 1
+
+            objects = []
+            # Until we unindent, add backend objects to the list
+            while len(lines[line_index]) <= 1 or lines[line_index].startswith(" " * 4):
+                line = lines[line_index]
+                if _re_import_struct_add_one.search(line) is not None:
+                    objects.append(_re_import_struct_add_one.search(line).groups()[0])
+                elif _re_import_struct_add_many.search(line) is not None:
+                    imports = _re_import_struct_add_many.search(line).groups()[0].split(", ")
+                    imports = [obj[1:-1] for obj in imports if len(obj) > 0]
+                    objects.extend(imports)
+                elif _re_between_brackets.search(line) is not None:
+                    imports = _re_between_brackets.search(line).groups()[0].split(", ")
+                    imports = [obj[1:-1] for obj in imports if len(obj) > 0]
+                    objects.extend(imports)
+                elif _re_quote_object.search(line) is not None:
+                    objects.append(_re_quote_object.search(line).groups()[0])
+                elif line.startswith(" " * 8 + '"'):
+                    objects.append(line[9:-3])
+                elif line.startswith(" " * 12 + '"'):
+                    objects.append(line[13:-3])
+                line_index += 1
+
+            import_dict_objects[backend] = objects
+        else:
+            line_index += 1
+
+    # At this stage we are in the TYPE_CHECKING part, first grab the objects without a specific backend
+    objects = []
+    while (
+        line_index < len(lines)
+        and find_backend(lines[line_index]) is None
+        and not lines[line_index].startswith("else")
+    ):
+        line = lines[line_index]
+        single_line_import_search = _re_import.search(line)
+        if single_line_import_search is not None:
+            objects.extend(single_line_import_search.groups()[0].split(", "))
+        elif line.startswith(" " * 8):
+            objects.append(line[8:-2])
+        line_index += 1
+
+    type_hint_objects = {"none": objects}
+    # Let's continue with backend-specific objects
+    while line_index < len(lines):
+        # If the line is an if is_backend_available, we grab all objects associated.
+        backend = find_backend(lines[line_index])
+        # Check if the backend declaration is inside a try block:
+        if _re_try.search(lines[line_index - 1]) is None:
+            backend = None
+
+        if backend is not None:
+            line_index += 1
+
+            # Scroll until we hit the else block of try-except-else
+            while _re_else.search(lines[line_index]) is None:
+                line_index += 1
+
+            line_index += 1
+
+            objects = []
+            # Until we unindent, add backend objects to the list
+            while len(lines[line_index]) <= 1 or lines[line_index].startswith(" " * 8):
+                line = lines[line_index]
+                single_line_import_search = _re_import.search(line)
+                if single_line_import_search is not None:
+                    objects.extend(single_line_import_search.groups()[0].split(", "))
+                elif line.startswith(" " * 12):
+                    objects.append(line[12:-2])
+                line_index += 1
+
+            type_hint_objects[backend] = objects
+        else:
+            line_index += 1
+
+    return import_dict_objects, type_hint_objects
+
+
+def analyze_results(import_dict_objects, type_hint_objects):
+    """
+    Analyze the differences between _import_structure objects and TYPE_CHECKING objects found in an init.
+    """
+
+    def find_duplicates(seq):
+        return [k for k, v in collections.Counter(seq).items() if v > 1]
+
+    if list(import_dict_objects.keys()) != list(type_hint_objects.keys()):
+        return ["Both sides of the init do not have the same backends!"]
+
+    errors = []
+    for key in import_dict_objects.keys():
+        duplicate_imports = find_duplicates(import_dict_objects[key])
+        if duplicate_imports:
+            errors.append(f"Duplicate _import_structure definitions for: {duplicate_imports}")
+        duplicate_type_hints = find_duplicates(type_hint_objects[key])
+        if duplicate_type_hints:
+            errors.append(f"Duplicate TYPE_CHECKING objects for: {duplicate_type_hints}")
+
+        if sorted(set(import_dict_objects[key])) != sorted(set(type_hint_objects[key])):
+            name = "base imports" if key == "none" else f"{key} backend"
+            errors.append(f"Differences for {name}:")
+            for a in type_hint_objects[key]:
+                if a not in import_dict_objects[key]:
+                    errors.append(f"  {a} in TYPE_HINT but not in _import_structure.")
+            for a in import_dict_objects[key]:
+                if a not in type_hint_objects[key]:
+                    errors.append(f"  {a} in _import_structure but not in TYPE_HINT.")
+    return errors
+
+
+def check_all_inits():
+    """
+    Check all inits in the transformers repo and raise an error if at least one does not define the same objects in
+    both halves.
+    """
+    failures = []
+    for root, _, files in os.walk(PATH_TO_TRANSFORMERS):
+        if "__init__.py" in files:
+            fname = os.path.join(root, "__init__.py")
+            objects = parse_init(fname)
+            if objects is not None:
+                errors = analyze_results(*objects)
+                if len(errors) > 0:
+                    errors[0] = f"Problem in {fname}, both halves do not define the same objects.\n{errors[0]}"
+                    failures.append("\n".join(errors))
+    if len(failures) > 0:
+        raise ValueError("\n\n".join(failures))
+
+
+def get_transformers_submodules():
+    """
+    Returns the list of Transformers submodules.
+    """
+    submodules = []
+    for path, directories, files in os.walk(PATH_TO_TRANSFORMERS):
+        for folder in directories:
+            # Ignore private modules
+            if folder.startswith("_"):
+                directories.remove(folder)
+                continue
+            # Ignore leftovers from branches (empty folders apart from pycache)
+            if len(list((Path(path) / folder).glob("*.py"))) == 0:
+                continue
+            short_path = str((Path(path) / folder).relative_to(PATH_TO_TRANSFORMERS))
+            submodule = short_path.replace(os.path.sep, ".")
+            submodules.append(submodule)
+        for fname in files:
+            if fname == "__init__.py":
+                continue
+            short_path = str((Path(path) / fname).relative_to(PATH_TO_TRANSFORMERS))
+            submodule = short_path.replace(".py", "").replace(os.path.sep, ".")
+            if len(submodule.split(".")) == 1:
+                submodules.append(submodule)
+    return submodules
+
+
+IGNORE_SUBMODULES = [
+    "convert_pytorch_checkpoint_to_tf2",
+    "modeling_flax_pytorch_utils",
+]
+
+
+def check_submodules():
+    # This is to make sure the transformers module imported is the one in the repo.
+    spec = importlib.util.spec_from_file_location(
+        "transformers",
+        os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"),
+        submodule_search_locations=[PATH_TO_TRANSFORMERS],
+    )
+    transformers = spec.loader.load_module()
+
+    module_not_registered = [
+        module
+        for module in get_transformers_submodules()
+        if module not in IGNORE_SUBMODULES and module not in transformers._import_structure.keys()
+    ]
+    if len(module_not_registered) > 0:
+        list_of_modules = "\n".join(f"- {module}" for module in module_not_registered)
+        raise ValueError(
+            "The following submodules are not properly registered in the main init of Transformers:\n"
+            f"{list_of_modules}\n"
+            "Make sure they appear somewhere in the keys of `_import_structure` with an empty list as value."
+        )
+
+
+if __name__ == "__main__":
+    check_all_inits()
+    check_submodules()
diff --git a/diffusers/utils/check_repo.py b/diffusers/utils/check_repo.py
new file mode 100644
index 0000000000000000000000000000000000000000..597893f267cabf72d759483c70660b32db30ec2b
--- /dev/null
+++ b/diffusers/utils/check_repo.py
@@ -0,0 +1,755 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+import os
+import re
+import warnings
+from collections import OrderedDict
+from difflib import get_close_matches
+from pathlib import Path
+
+from diffusers.models.auto import get_values
+from diffusers.utils import ENV_VARS_TRUE_VALUES, is_flax_available, is_torch_available
+
+
+# All paths are set with the intent you should run this script from the root of the repo with the command
+# python utils/check_repo.py
+PATH_TO_DIFFUSERS = "src/diffusers"
+PATH_TO_TESTS = "tests"
+PATH_TO_DOC = "docs/source/en"
+
+# Update this list with models that are supposed to be private.
+PRIVATE_MODELS = [
+    "DPRSpanPredictor",
+    "RealmBertModel",
+    "T5Stack",
+    "TFDPRSpanPredictor",
+]
+
+# Update this list for models that are not tested with a comment explaining the reason it should not be.
+# Being in this list is an exception and should **not** be the rule.
+IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
+    # models to ignore for not tested
+    "OPTDecoder",  # Building part of bigger (tested) model.
+    "DecisionTransformerGPT2Model",  # Building part of bigger (tested) model.
+    "SegformerDecodeHead",  # Building part of bigger (tested) model.
+    "PLBartEncoder",  # Building part of bigger (tested) model.
+    "PLBartDecoder",  # Building part of bigger (tested) model.
+    "PLBartDecoderWrapper",  # Building part of bigger (tested) model.
+    "BigBirdPegasusEncoder",  # Building part of bigger (tested) model.
+    "BigBirdPegasusDecoder",  # Building part of bigger (tested) model.
+    "BigBirdPegasusDecoderWrapper",  # Building part of bigger (tested) model.
+    "DetrEncoder",  # Building part of bigger (tested) model.
+    "DetrDecoder",  # Building part of bigger (tested) model.
+    "DetrDecoderWrapper",  # Building part of bigger (tested) model.
+    "M2M100Encoder",  # Building part of bigger (tested) model.
+    "M2M100Decoder",  # Building part of bigger (tested) model.
+    "Speech2TextEncoder",  # Building part of bigger (tested) model.
+    "Speech2TextDecoder",  # Building part of bigger (tested) model.
+    "LEDEncoder",  # Building part of bigger (tested) model.
+    "LEDDecoder",  # Building part of bigger (tested) model.
+    "BartDecoderWrapper",  # Building part of bigger (tested) model.
+    "BartEncoder",  # Building part of bigger (tested) model.
+    "BertLMHeadModel",  # Needs to be setup as decoder.
+    "BlenderbotSmallEncoder",  # Building part of bigger (tested) model.
+    "BlenderbotSmallDecoderWrapper",  # Building part of bigger (tested) model.
+    "BlenderbotEncoder",  # Building part of bigger (tested) model.
+    "BlenderbotDecoderWrapper",  # Building part of bigger (tested) model.
+    "MBartEncoder",  # Building part of bigger (tested) model.
+    "MBartDecoderWrapper",  # Building part of bigger (tested) model.
+    "MegatronBertLMHeadModel",  # Building part of bigger (tested) model.
+    "MegatronBertEncoder",  # Building part of bigger (tested) model.
+    "MegatronBertDecoder",  # Building part of bigger (tested) model.
+    "MegatronBertDecoderWrapper",  # Building part of bigger (tested) model.
+    "PegasusEncoder",  # Building part of bigger (tested) model.
+    "PegasusDecoderWrapper",  # Building part of bigger (tested) model.
+    "DPREncoder",  # Building part of bigger (tested) model.
+    "ProphetNetDecoderWrapper",  # Building part of bigger (tested) model.
+    "RealmBertModel",  # Building part of bigger (tested) model.
+    "RealmReader",  # Not regular model.
+    "RealmScorer",  # Not regular model.
+    "RealmForOpenQA",  # Not regular model.
+    "ReformerForMaskedLM",  # Needs to be setup as decoder.
+    "Speech2Text2DecoderWrapper",  # Building part of bigger (tested) model.
+    "TFDPREncoder",  # Building part of bigger (tested) model.
+    "TFElectraMainLayer",  # Building part of bigger (tested) model (should it be a TFModelMixin ?)
+    "TFRobertaForMultipleChoice",  # TODO: fix
+    "TrOCRDecoderWrapper",  # Building part of bigger (tested) model.
+    "SeparableConv1D",  # Building part of bigger (tested) model.
+    "FlaxBartForCausalLM",  # Building part of bigger (tested) model.
+    "FlaxBertForCausalLM",  # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM.
+    "OPTDecoderWrapper",
+]
+
+# Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't
+# trigger the common tests.
+TEST_FILES_WITH_NO_COMMON_TESTS = [
+    "models/decision_transformer/test_modeling_decision_transformer.py",
+    "models/camembert/test_modeling_camembert.py",
+    "models/mt5/test_modeling_flax_mt5.py",
+    "models/mbart/test_modeling_mbart.py",
+    "models/mt5/test_modeling_mt5.py",
+    "models/pegasus/test_modeling_pegasus.py",
+    "models/camembert/test_modeling_tf_camembert.py",
+    "models/mt5/test_modeling_tf_mt5.py",
+    "models/xlm_roberta/test_modeling_tf_xlm_roberta.py",
+    "models/xlm_roberta/test_modeling_flax_xlm_roberta.py",
+    "models/xlm_prophetnet/test_modeling_xlm_prophetnet.py",
+    "models/xlm_roberta/test_modeling_xlm_roberta.py",
+    "models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py",
+    "models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py",
+    "models/decision_transformer/test_modeling_decision_transformer.py",
+]
+
+# Update this list for models that are not in any of the auto MODEL_XXX_MAPPING. Being in this list is an exception and
+# should **not** be the rule.
+IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
+    # models to ignore for model xxx mapping
+    "DPTForDepthEstimation",
+    "DecisionTransformerGPT2Model",
+    "GLPNForDepthEstimation",
+    "ViltForQuestionAnswering",
+    "ViltForImagesAndTextClassification",
+    "ViltForImageAndTextRetrieval",
+    "ViltForMaskedLM",
+    "XGLMEncoder",
+    "XGLMDecoder",
+    "XGLMDecoderWrapper",
+    "PerceiverForMultimodalAutoencoding",
+    "PerceiverForOpticalFlow",
+    "SegformerDecodeHead",
+    "FlaxBeitForMaskedImageModeling",
+    "PLBartEncoder",
+    "PLBartDecoder",
+    "PLBartDecoderWrapper",
+    "BeitForMaskedImageModeling",
+    "CLIPTextModel",
+    "CLIPVisionModel",
+    "TFCLIPTextModel",
+    "TFCLIPVisionModel",
+    "FlaxCLIPTextModel",
+    "FlaxCLIPVisionModel",
+    "FlaxWav2Vec2ForCTC",
+    "DetrForSegmentation",
+    "DPRReader",
+    "FlaubertForQuestionAnswering",
+    "FlavaImageCodebook",
+    "FlavaTextModel",
+    "FlavaImageModel",
+    "FlavaMultimodalModel",
+    "GPT2DoubleHeadsModel",
+    "LukeForMaskedLM",
+    "LukeForEntityClassification",
+    "LukeForEntityPairClassification",
+    "LukeForEntitySpanClassification",
+    "OpenAIGPTDoubleHeadsModel",
+    "RagModel",
+    "RagSequenceForGeneration",
+    "RagTokenForGeneration",
+    "RealmEmbedder",
+    "RealmForOpenQA",
+    "RealmScorer",
+    "RealmReader",
+    "TFDPRReader",
+    "TFGPT2DoubleHeadsModel",
+    "TFOpenAIGPTDoubleHeadsModel",
+    "TFRagModel",
+    "TFRagSequenceForGeneration",
+    "TFRagTokenForGeneration",
+    "Wav2Vec2ForCTC",
+    "HubertForCTC",
+    "SEWForCTC",
+    "SEWDForCTC",
+    "XLMForQuestionAnswering",
+    "XLNetForQuestionAnswering",
+    "SeparableConv1D",
+    "VisualBertForRegionToPhraseAlignment",
+    "VisualBertForVisualReasoning",
+    "VisualBertForQuestionAnswering",
+    "VisualBertForMultipleChoice",
+    "TFWav2Vec2ForCTC",
+    "TFHubertForCTC",
+    "MaskFormerForInstanceSegmentation",
+]
+
+# Update this list for models that have multiple model types for the same
+# model doc
+MODEL_TYPE_TO_DOC_MAPPING = OrderedDict(
+    [
+        ("data2vec-text", "data2vec"),
+        ("data2vec-audio", "data2vec"),
+        ("data2vec-vision", "data2vec"),
+    ]
+)
+
+
+# This is to make sure the transformers module imported is the one in the repo.
+spec = importlib.util.spec_from_file_location(
+    "diffusers",
+    os.path.join(PATH_TO_DIFFUSERS, "__init__.py"),
+    submodule_search_locations=[PATH_TO_DIFFUSERS],
+)
+diffusers = spec.loader.load_module()
+
+
+def check_model_list():
+    """Check the model list inside the transformers library."""
+    # Get the models from the directory structure of `src/diffusers/models/`
+    models_dir = os.path.join(PATH_TO_DIFFUSERS, "models")
+    _models = []
+    for model in os.listdir(models_dir):
+        model_dir = os.path.join(models_dir, model)
+        if os.path.isdir(model_dir) and "__init__.py" in os.listdir(model_dir):
+            _models.append(model)
+
+    # Get the models from the directory structure of `src/transformers/models/`
+    models = [model for model in dir(diffusers.models) if not model.startswith("__")]
+
+    missing_models = sorted(set(_models).difference(models))
+    if missing_models:
+        raise Exception(
+            f"The following models should be included in {models_dir}/__init__.py: {','.join(missing_models)}."
+        )
+
+
+# If some modeling modules should be ignored for all checks, they should be added in the nested list
+# _ignore_modules of this function.
+def get_model_modules():
+    """Get the model modules inside the transformers library."""
+    _ignore_modules = [
+        "modeling_auto",
+        "modeling_encoder_decoder",
+        "modeling_marian",
+        "modeling_mmbt",
+        "modeling_outputs",
+        "modeling_retribert",
+        "modeling_utils",
+        "modeling_flax_auto",
+        "modeling_flax_encoder_decoder",
+        "modeling_flax_utils",
+        "modeling_speech_encoder_decoder",
+        "modeling_flax_speech_encoder_decoder",
+        "modeling_flax_vision_encoder_decoder",
+        "modeling_transfo_xl_utilities",
+        "modeling_tf_auto",
+        "modeling_tf_encoder_decoder",
+        "modeling_tf_outputs",
+        "modeling_tf_pytorch_utils",
+        "modeling_tf_utils",
+        "modeling_tf_transfo_xl_utilities",
+        "modeling_tf_vision_encoder_decoder",
+        "modeling_vision_encoder_decoder",
+    ]
+    modules = []
+    for model in dir(diffusers.models):
+        # There are some magic dunder attributes in the dir, we ignore them
+        if not model.startswith("__"):
+            model_module = getattr(diffusers.models, model)
+            for submodule in dir(model_module):
+                if submodule.startswith("modeling") and submodule not in _ignore_modules:
+                    modeling_module = getattr(model_module, submodule)
+                    if inspect.ismodule(modeling_module):
+                        modules.append(modeling_module)
+    return modules
+
+
+def get_models(module, include_pretrained=False):
+    """Get the objects in module that are models."""
+    models = []
+    model_classes = (diffusers.ModelMixin, diffusers.TFModelMixin, diffusers.FlaxModelMixin)
+    for attr_name in dir(module):
+        if not include_pretrained and ("Pretrained" in attr_name or "PreTrained" in attr_name):
+            continue
+        attr = getattr(module, attr_name)
+        if isinstance(attr, type) and issubclass(attr, model_classes) and attr.__module__ == module.__name__:
+            models.append((attr_name, attr))
+    return models
+
+
+def is_a_private_model(model):
+    """Returns True if the model should not be in the main init."""
+    if model in PRIVATE_MODELS:
+        return True
+
+    # Wrapper, Encoder and Decoder are all privates
+    if model.endswith("Wrapper"):
+        return True
+    if model.endswith("Encoder"):
+        return True
+    if model.endswith("Decoder"):
+        return True
+    return False
+
+
+def check_models_are_in_init():
+    """Checks all models defined in the library are in the main init."""
+    models_not_in_init = []
+    dir_transformers = dir(diffusers)
+    for module in get_model_modules():
+        models_not_in_init += [
+            model[0] for model in get_models(module, include_pretrained=True) if model[0] not in dir_transformers
+        ]
+
+    # Remove private models
+    models_not_in_init = [model for model in models_not_in_init if not is_a_private_model(model)]
+    if len(models_not_in_init) > 0:
+        raise Exception(f"The following models should be in the main init: {','.join(models_not_in_init)}.")
+
+
+# If some test_modeling files should be ignored when checking models are all tested, they should be added in the
+# nested list _ignore_files of this function.
+def get_model_test_files():
+    """Get the model test files.
+
+    The returned files should NOT contain the `tests` (i.e. `PATH_TO_TESTS` defined in this script). They will be
+    considered as paths relative to `tests`. A caller has to use `os.path.join(PATH_TO_TESTS, ...)` to access the files.
+    """
+
+    _ignore_files = [
+        "test_modeling_common",
+        "test_modeling_encoder_decoder",
+        "test_modeling_flax_encoder_decoder",
+        "test_modeling_flax_speech_encoder_decoder",
+        "test_modeling_marian",
+        "test_modeling_tf_common",
+        "test_modeling_tf_encoder_decoder",
+    ]
+    test_files = []
+    # Check both `PATH_TO_TESTS` and `PATH_TO_TESTS/models`
+    model_test_root = os.path.join(PATH_TO_TESTS, "models")
+    model_test_dirs = []
+    for x in os.listdir(model_test_root):
+        x = os.path.join(model_test_root, x)
+        if os.path.isdir(x):
+            model_test_dirs.append(x)
+
+    for target_dir in [PATH_TO_TESTS] + model_test_dirs:
+        for file_or_dir in os.listdir(target_dir):
+            path = os.path.join(target_dir, file_or_dir)
+            if os.path.isfile(path):
+                filename = os.path.split(path)[-1]
+                if "test_modeling" in filename and os.path.splitext(filename)[0] not in _ignore_files:
+                    file = os.path.join(*path.split(os.sep)[1:])
+                    test_files.append(file)
+
+    return test_files
+
+
+# This is a bit hacky but I didn't find a way to import the test_file as a module and read inside the tester class
+# for the all_model_classes variable.
+def find_tested_models(test_file):
+    """Parse the content of test_file to detect what's in all_model_classes"""
+    # This is a bit hacky but I didn't find a way to import the test_file as a module and read inside the class
+    with open(os.path.join(PATH_TO_TESTS, test_file), "r", encoding="utf-8", newline="\n") as f:
+        content = f.read()
+    all_models = re.findall(r"all_model_classes\s+=\s+\(\s*\(([^\)]*)\)", content)
+    # Check with one less parenthesis as well
+    all_models += re.findall(r"all_model_classes\s+=\s+\(([^\)]*)\)", content)
+    if len(all_models) > 0:
+        model_tested = []
+        for entry in all_models:
+            for line in entry.split(","):
+                name = line.strip()
+                if len(name) > 0:
+                    model_tested.append(name)
+        return model_tested
+
+
+def check_models_are_tested(module, test_file):
+    """Check models defined in module are tested in test_file."""
+    # XxxModelMixin are not tested
+    defined_models = get_models(module)
+    tested_models = find_tested_models(test_file)
+    if tested_models is None:
+        if test_file.replace(os.path.sep, "/") in TEST_FILES_WITH_NO_COMMON_TESTS:
+            return
+        return [
+            f"{test_file} should define `all_model_classes` to apply common tests to the models it tests. "
+            + "If this intentional, add the test filename to `TEST_FILES_WITH_NO_COMMON_TESTS` in the file "
+            + "`utils/check_repo.py`."
+        ]
+    failures = []
+    for model_name, _ in defined_models:
+        if model_name not in tested_models and model_name not in IGNORE_NON_TESTED:
+            failures.append(
+                f"{model_name} is defined in {module.__name__} but is not tested in "
+                + f"{os.path.join(PATH_TO_TESTS, test_file)}. Add it to the all_model_classes in that file."
+                + "If common tests should not applied to that model, add its name to `IGNORE_NON_TESTED`"
+                + "in the file `utils/check_repo.py`."
+            )
+    return failures
+
+
+def check_all_models_are_tested():
+    """Check all models are properly tested."""
+    modules = get_model_modules()
+    test_files = get_model_test_files()
+    failures = []
+    for module in modules:
+        test_file = [file for file in test_files if f"test_{module.__name__.split('.')[-1]}.py" in file]
+        if len(test_file) == 0:
+            failures.append(f"{module.__name__} does not have its corresponding test file {test_file}.")
+        elif len(test_file) > 1:
+            failures.append(f"{module.__name__} has several test files: {test_file}.")
+        else:
+            test_file = test_file[0]
+            new_failures = check_models_are_tested(module, test_file)
+            if new_failures is not None:
+                failures += new_failures
+    if len(failures) > 0:
+        raise Exception(f"There were {len(failures)} failures:\n" + "\n".join(failures))
+
+
+def get_all_auto_configured_models():
+    """Return the list of all models in at least one auto class."""
+    result = set()  # To avoid duplicates we concatenate all model classes in a set.
+    if is_torch_available():
+        for attr_name in dir(diffusers.models.auto.modeling_auto):
+            if attr_name.startswith("MODEL_") and attr_name.endswith("MAPPING_NAMES"):
+                result = result | set(get_values(getattr(diffusers.models.auto.modeling_auto, attr_name)))
+    if is_flax_available():
+        for attr_name in dir(diffusers.models.auto.modeling_flax_auto):
+            if attr_name.startswith("FLAX_MODEL_") and attr_name.endswith("MAPPING_NAMES"):
+                result = result | set(get_values(getattr(diffusers.models.auto.modeling_flax_auto, attr_name)))
+    return list(result)
+
+
+def ignore_unautoclassed(model_name):
+    """Rules to determine if `name` should be in an auto class."""
+    # Special white list
+    if model_name in IGNORE_NON_AUTO_CONFIGURED:
+        return True
+    # Encoder and Decoder should be ignored
+    if "Encoder" in model_name or "Decoder" in model_name:
+        return True
+    return False
+
+
+def check_models_are_auto_configured(module, all_auto_models):
+    """Check models defined in module are each in an auto class."""
+    defined_models = get_models(module)
+    failures = []
+    for model_name, _ in defined_models:
+        if model_name not in all_auto_models and not ignore_unautoclassed(model_name):
+            failures.append(
+                f"{model_name} is defined in {module.__name__} but is not present in any of the auto mapping. "
+                "If that is intended behavior, add its name to `IGNORE_NON_AUTO_CONFIGURED` in the file "
+                "`utils/check_repo.py`."
+            )
+    return failures
+
+
+def check_all_models_are_auto_configured():
+    """Check all models are each in an auto class."""
+    missing_backends = []
+    if not is_torch_available():
+        missing_backends.append("PyTorch")
+    if not is_flax_available():
+        missing_backends.append("Flax")
+    if len(missing_backends) > 0:
+        missing = ", ".join(missing_backends)
+        if os.getenv("TRANSFORMERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES:
+            raise Exception(
+                "Full quality checks require all backends to be installed (with `pip install -e .[dev]` in the "
+                f"Transformers repo, the following are missing: {missing}."
+            )
+        else:
+            warnings.warn(
+                "Full quality checks require all backends to be installed (with `pip install -e .[dev]` in the "
+                f"Transformers repo, the following are missing: {missing}. While it's probably fine as long as you "
+                "didn't make any change in one of those backends modeling files, you should probably execute the "
+                "command above to be on the safe side."
+            )
+    modules = get_model_modules()
+    all_auto_models = get_all_auto_configured_models()
+    failures = []
+    for module in modules:
+        new_failures = check_models_are_auto_configured(module, all_auto_models)
+        if new_failures is not None:
+            failures += new_failures
+    if len(failures) > 0:
+        raise Exception(f"There were {len(failures)} failures:\n" + "\n".join(failures))
+
+
+_re_decorator = re.compile(r"^\s*@(\S+)\s+$")
+
+
+def check_decorator_order(filename):
+    """Check that in the test file `filename` the slow decorator is always last."""
+    with open(filename, "r", encoding="utf-8", newline="\n") as f:
+        lines = f.readlines()
+    decorator_before = None
+    errors = []
+    for i, line in enumerate(lines):
+        search = _re_decorator.search(line)
+        if search is not None:
+            decorator_name = search.groups()[0]
+            if decorator_before is not None and decorator_name.startswith("parameterized"):
+                errors.append(i)
+            decorator_before = decorator_name
+        elif decorator_before is not None:
+            decorator_before = None
+    return errors
+
+
+def check_all_decorator_order():
+    """Check that in all test files, the slow decorator is always last."""
+    errors = []
+    for fname in os.listdir(PATH_TO_TESTS):
+        if fname.endswith(".py"):
+            filename = os.path.join(PATH_TO_TESTS, fname)
+            new_errors = check_decorator_order(filename)
+            errors += [f"- {filename}, line {i}" for i in new_errors]
+    if len(errors) > 0:
+        msg = "\n".join(errors)
+        raise ValueError(
+            "The parameterized decorator (and its variants) should always be first, but this is not the case in the"
+            f" following files:\n{msg}"
+        )
+
+
+def find_all_documented_objects():
+    """Parse the content of all doc files to detect which classes and functions it documents"""
+    documented_obj = []
+    for doc_file in Path(PATH_TO_DOC).glob("**/*.rst"):
+        with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
+            content = f.read()
+        raw_doc_objs = re.findall(r"(?:autoclass|autofunction):: transformers.(\S+)\s+", content)
+        documented_obj += [obj.split(".")[-1] for obj in raw_doc_objs]
+    for doc_file in Path(PATH_TO_DOC).glob("**/*.md"):
+        with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
+            content = f.read()
+        raw_doc_objs = re.findall(r"\[\[autodoc\]\]\s+(\S+)\s+", content)
+        documented_obj += [obj.split(".")[-1] for obj in raw_doc_objs]
+    return documented_obj
+
+
+# One good reason for not being documented is to be deprecated. Put in this list deprecated objects.
+DEPRECATED_OBJECTS = [
+    "AutoModelWithLMHead",
+    "BartPretrainedModel",
+    "DataCollator",
+    "DataCollatorForSOP",
+    "GlueDataset",
+    "GlueDataTrainingArguments",
+    "LineByLineTextDataset",
+    "LineByLineWithRefDataset",
+    "LineByLineWithSOPTextDataset",
+    "PretrainedBartModel",
+    "PretrainedFSMTModel",
+    "SingleSentenceClassificationProcessor",
+    "SquadDataTrainingArguments",
+    "SquadDataset",
+    "SquadExample",
+    "SquadFeatures",
+    "SquadV1Processor",
+    "SquadV2Processor",
+    "TFAutoModelWithLMHead",
+    "TFBartPretrainedModel",
+    "TextDataset",
+    "TextDatasetForNextSentencePrediction",
+    "Wav2Vec2ForMaskedLM",
+    "Wav2Vec2Tokenizer",
+    "glue_compute_metrics",
+    "glue_convert_examples_to_features",
+    "glue_output_modes",
+    "glue_processors",
+    "glue_tasks_num_labels",
+    "squad_convert_examples_to_features",
+    "xnli_compute_metrics",
+    "xnli_output_modes",
+    "xnli_processors",
+    "xnli_tasks_num_labels",
+    "TFTrainer",
+    "TFTrainingArguments",
+]
+
+# Exceptionally, some objects should not be documented after all rules passed.
+# ONLY PUT SOMETHING IN THIS LIST AS A LAST RESORT!
+UNDOCUMENTED_OBJECTS = [
+    "AddedToken",  # This is a tokenizers class.
+    "BasicTokenizer",  # Internal, should never have been in the main init.
+    "CharacterTokenizer",  # Internal, should never have been in the main init.
+    "DPRPretrainedReader",  # Like an Encoder.
+    "DummyObject",  # Just picked by mistake sometimes.
+    "MecabTokenizer",  # Internal, should never have been in the main init.
+    "ModelCard",  # Internal type.
+    "SqueezeBertModule",  # Internal building block (should have been called SqueezeBertLayer)
+    "TFDPRPretrainedReader",  # Like an Encoder.
+    "TransfoXLCorpus",  # Internal type.
+    "WordpieceTokenizer",  # Internal, should never have been in the main init.
+    "absl",  # External module
+    "add_end_docstrings",  # Internal, should never have been in the main init.
+    "add_start_docstrings",  # Internal, should never have been in the main init.
+    "cached_path",  # Internal used for downloading models.
+    "convert_tf_weight_name_to_pt_weight_name",  # Internal used to convert model weights
+    "logger",  # Internal logger
+    "logging",  # External module
+    "requires_backends",  # Internal function
+]
+
+# This list should be empty. Objects in it should get their own doc page.
+SHOULD_HAVE_THEIR_OWN_PAGE = [
+    # Benchmarks
+    "PyTorchBenchmark",
+    "PyTorchBenchmarkArguments",
+    "TensorFlowBenchmark",
+    "TensorFlowBenchmarkArguments",
+]
+
+
+def ignore_undocumented(name):
+    """Rules to determine if `name` should be undocumented."""
+    # NOT DOCUMENTED ON PURPOSE.
+    # Constants uppercase are not documented.
+    if name.isupper():
+        return True
+    # ModelMixins / Encoders / Decoders / Layers / Embeddings / Attention are not documented.
+    if (
+        name.endswith("ModelMixin")
+        or name.endswith("Decoder")
+        or name.endswith("Encoder")
+        or name.endswith("Layer")
+        or name.endswith("Embeddings")
+        or name.endswith("Attention")
+    ):
+        return True
+    # Submodules are not documented.
+    if os.path.isdir(os.path.join(PATH_TO_DIFFUSERS, name)) or os.path.isfile(
+        os.path.join(PATH_TO_DIFFUSERS, f"{name}.py")
+    ):
+        return True
+    # All load functions are not documented.
+    if name.startswith("load_tf") or name.startswith("load_pytorch"):
+        return True
+    # is_xxx_available functions are not documented.
+    if name.startswith("is_") and name.endswith("_available"):
+        return True
+    # Deprecated objects are not documented.
+    if name in DEPRECATED_OBJECTS or name in UNDOCUMENTED_OBJECTS:
+        return True
+    # MMBT model does not really work.
+    if name.startswith("MMBT"):
+        return True
+    if name in SHOULD_HAVE_THEIR_OWN_PAGE:
+        return True
+    return False
+
+
+def check_all_objects_are_documented():
+    """Check all models are properly documented."""
+    documented_objs = find_all_documented_objects()
+    modules = diffusers._modules
+    objects = [c for c in dir(diffusers) if c not in modules and not c.startswith("_")]
+    undocumented_objs = [c for c in objects if c not in documented_objs and not ignore_undocumented(c)]
+    if len(undocumented_objs) > 0:
+        raise Exception(
+            "The following objects are in the public init so should be documented:\n - "
+            + "\n - ".join(undocumented_objs)
+        )
+    check_docstrings_are_in_md()
+    check_model_type_doc_match()
+
+
+def check_model_type_doc_match():
+    """Check all doc pages have a corresponding model type."""
+    model_doc_folder = Path(PATH_TO_DOC) / "model_doc"
+    model_docs = [m.stem for m in model_doc_folder.glob("*.md")]
+
+    model_types = list(diffusers.models.auto.configuration_auto.MODEL_NAMES_MAPPING.keys())
+    model_types = [MODEL_TYPE_TO_DOC_MAPPING[m] if m in MODEL_TYPE_TO_DOC_MAPPING else m for m in model_types]
+
+    errors = []
+    for m in model_docs:
+        if m not in model_types and m != "auto":
+            close_matches = get_close_matches(m, model_types)
+            error_message = f"{m} is not a proper model identifier."
+            if len(close_matches) > 0:
+                close_matches = "/".join(close_matches)
+                error_message += f" Did you mean {close_matches}?"
+            errors.append(error_message)
+
+    if len(errors) > 0:
+        raise ValueError(
+            "Some model doc pages do not match any existing model type:\n"
+            + "\n".join(errors)
+            + "\nYou can add any missing model type to the `MODEL_NAMES_MAPPING` constant in "
+            "models/auto/configuration_auto.py."
+        )
+
+
+# Re pattern to catch :obj:`xx`, :class:`xx`, :func:`xx` or :meth:`xx`.
+_re_rst_special_words = re.compile(r":(?:obj|func|class|meth):`([^`]+)`")
+# Re pattern to catch things between double backquotes.
+_re_double_backquotes = re.compile(r"(^|[^`])``([^`]+)``([^`]|$)")
+# Re pattern to catch example introduction.
+_re_rst_example = re.compile(r"^\s*Example.*::\s*$", flags=re.MULTILINE)
+
+
+def is_rst_docstring(docstring):
+    """
+    Returns `True` if `docstring` is written in rst.
+    """
+    if _re_rst_special_words.search(docstring) is not None:
+        return True
+    if _re_double_backquotes.search(docstring) is not None:
+        return True
+    if _re_rst_example.search(docstring) is not None:
+        return True
+    return False
+
+
+def check_docstrings_are_in_md():
+    """Check all docstrings are in md"""
+    files_with_rst = []
+    for file in Path(PATH_TO_DIFFUSERS).glob("**/*.py"):
+        with open(file, "r") as f:
+            code = f.read()
+        docstrings = code.split('"""')
+
+        for idx, docstring in enumerate(docstrings):
+            if idx % 2 == 0 or not is_rst_docstring(docstring):
+                continue
+            files_with_rst.append(file)
+            break
+
+    if len(files_with_rst) > 0:
+        raise ValueError(
+            "The following files have docstrings written in rst:\n"
+            + "\n".join([f"- {f}" for f in files_with_rst])
+            + "\nTo fix this run `doc-builder convert path_to_py_file` after installing `doc-builder`\n"
+            "(`pip install git+https://github.com/huggingface/doc-builder`)"
+        )
+
+
+def check_repo_quality():
+    """Check all models are properly tested and documented."""
+    print("Checking all models are included.")
+    check_model_list()
+    print("Checking all models are public.")
+    check_models_are_in_init()
+    print("Checking all models are properly tested.")
+    check_all_decorator_order()
+    check_all_models_are_tested()
+    print("Checking all objects are properly documented.")
+    check_all_objects_are_documented()
+    print("Checking all models are in at least one auto class.")
+    check_all_models_are_auto_configured()
+
+
+if __name__ == "__main__":
+    check_repo_quality()
diff --git a/diffusers/utils/check_table.py b/diffusers/utils/check_table.py
new file mode 100644
index 0000000000000000000000000000000000000000..80fd5660bb46e623fdaee168db4db634dda9ef70
--- /dev/null
+++ b/diffusers/utils/check_table.py
@@ -0,0 +1,185 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import collections
+import importlib.util
+import os
+import re
+
+
+# All paths are set with the intent you should run this script from the root of the repo with the command
+# python utils/check_table.py
+TRANSFORMERS_PATH = "src/diffusers"
+PATH_TO_DOCS = "docs/source/en"
+REPO_PATH = "."
+
+
+def _find_text_in_file(filename, start_prompt, end_prompt):
+    """
+    Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty
+    lines.
+    """
+    with open(filename, "r", encoding="utf-8", newline="\n") as f:
+        lines = f.readlines()
+    # Find the start prompt.
+    start_index = 0
+    while not lines[start_index].startswith(start_prompt):
+        start_index += 1
+    start_index += 1
+
+    end_index = start_index
+    while not lines[end_index].startswith(end_prompt):
+        end_index += 1
+    end_index -= 1
+
+    while len(lines[start_index]) <= 1:
+        start_index += 1
+    while len(lines[end_index]) <= 1:
+        end_index -= 1
+    end_index += 1
+    return "".join(lines[start_index:end_index]), start_index, end_index, lines
+
+
+# Add here suffixes that are used to identify models, separated by |
+ALLOWED_MODEL_SUFFIXES = "Model|Encoder|Decoder|ForConditionalGeneration"
+# Regexes that match TF/Flax/PT model names.
+_re_tf_models = re.compile(r"TF(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")
+_re_flax_models = re.compile(r"Flax(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")
+# Will match any TF or Flax model too so need to be in an else branch afterthe two previous regexes.
+_re_pt_models = re.compile(r"(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")
+
+
+# This is to make sure the diffusers module imported is the one in the repo.
+spec = importlib.util.spec_from_file_location(
+    "diffusers",
+    os.path.join(TRANSFORMERS_PATH, "__init__.py"),
+    submodule_search_locations=[TRANSFORMERS_PATH],
+)
+diffusers_module = spec.loader.load_module()
+
+
+# Thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
+def camel_case_split(identifier):
+    """Split a camelcased `identifier` into words."""
+    matches = re.finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
+    return [m.group(0) for m in matches]
+
+
+def _center_text(text, width):
+    text_length = 2 if text == "✅" or text == "❌" else len(text)
+    left_indent = (width - text_length) // 2
+    right_indent = width - text_length - left_indent
+    return " " * left_indent + text + " " * right_indent
+
+
+def get_model_table_from_auto_modules():
+    """Generates an up-to-date model table from the content of the auto modules."""
+    # Dictionary model names to config.
+    config_mapping_names = diffusers_module.models.auto.configuration_auto.CONFIG_MAPPING_NAMES
+    model_name_to_config = {
+        name: config_mapping_names[code]
+        for code, name in diffusers_module.MODEL_NAMES_MAPPING.items()
+        if code in config_mapping_names
+    }
+    model_name_to_prefix = {name: config.replace("ConfigMixin", "") for name, config in model_name_to_config.items()}
+
+    # Dictionaries flagging if each model prefix has a slow/fast tokenizer, backend in PT/TF/Flax.
+    slow_tokenizers = collections.defaultdict(bool)
+    fast_tokenizers = collections.defaultdict(bool)
+    pt_models = collections.defaultdict(bool)
+    tf_models = collections.defaultdict(bool)
+    flax_models = collections.defaultdict(bool)
+
+    # Let's lookup through all diffusers object (once).
+    for attr_name in dir(diffusers_module):
+        lookup_dict = None
+        if attr_name.endswith("Tokenizer"):
+            lookup_dict = slow_tokenizers
+            attr_name = attr_name[:-9]
+        elif attr_name.endswith("TokenizerFast"):
+            lookup_dict = fast_tokenizers
+            attr_name = attr_name[:-13]
+        elif _re_tf_models.match(attr_name) is not None:
+            lookup_dict = tf_models
+            attr_name = _re_tf_models.match(attr_name).groups()[0]
+        elif _re_flax_models.match(attr_name) is not None:
+            lookup_dict = flax_models
+            attr_name = _re_flax_models.match(attr_name).groups()[0]
+        elif _re_pt_models.match(attr_name) is not None:
+            lookup_dict = pt_models
+            attr_name = _re_pt_models.match(attr_name).groups()[0]
+
+        if lookup_dict is not None:
+            while len(attr_name) > 0:
+                if attr_name in model_name_to_prefix.values():
+                    lookup_dict[attr_name] = True
+                    break
+                # Try again after removing the last word in the name
+                attr_name = "".join(camel_case_split(attr_name)[:-1])
+
+    # Let's build that table!
+    model_names = list(model_name_to_config.keys())
+    model_names.sort(key=str.lower)
+    columns = ["Model", "Tokenizer slow", "Tokenizer fast", "PyTorch support", "TensorFlow support", "Flax Support"]
+    # We'll need widths to properly display everything in the center (+2 is to leave one extra space on each side).
+    widths = [len(c) + 2 for c in columns]
+    widths[0] = max([len(name) for name in model_names]) + 2
+
+    # Build the table per se
+    table = "|" + "|".join([_center_text(c, w) for c, w in zip(columns, widths)]) + "|\n"
+    # Use ":-----:" format to center-aligned table cell texts
+    table += "|" + "|".join([":" + "-" * (w - 2) + ":" for w in widths]) + "|\n"
+
+    check = {True: "✅", False: "❌"}
+    for name in model_names:
+        prefix = model_name_to_prefix[name]
+        line = [
+            name,
+            check[slow_tokenizers[prefix]],
+            check[fast_tokenizers[prefix]],
+            check[pt_models[prefix]],
+            check[tf_models[prefix]],
+            check[flax_models[prefix]],
+        ]
+        table += "|" + "|".join([_center_text(l, w) for l, w in zip(line, widths)]) + "|\n"
+    return table
+
+
+def check_model_table(overwrite=False):
+    """Check the model table in the index.rst is consistent with the state of the lib and maybe `overwrite`."""
+    current_table, start_index, end_index, lines = _find_text_in_file(
+        filename=os.path.join(PATH_TO_DOCS, "index.md"),
+        start_prompt="<!--This table is updated automatically from the auto modules",
+        end_prompt="<!-- End table-->",
+    )
+    new_table = get_model_table_from_auto_modules()
+
+    if current_table != new_table:
+        if overwrite:
+            with open(os.path.join(PATH_TO_DOCS, "index.md"), "w", encoding="utf-8", newline="\n") as f:
+                f.writelines(lines[:start_index] + [new_table] + lines[end_index:])
+        else:
+            raise ValueError(
+                "The model table in the `index.md` has not been updated. Run `make fix-copies` to fix this."
+            )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
+    args = parser.parse_args()
+
+    check_model_table(args.fix_and_overwrite)
diff --git a/diffusers/utils/custom_init_isort.py b/diffusers/utils/custom_init_isort.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c2bb7f5d69c2629334a8c5e5f6ebacd042a2dd9
--- /dev/null
+++ b/diffusers/utils/custom_init_isort.py
@@ -0,0 +1,330 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utility that sorts the imports in the custom inits of Diffusers. Diffusers uses init files that delay the
+import of an object to when it's actually needed. This is to avoid the main init importing all models, which would
+make the line `import transformers` very slow when the user has all optional dependencies installed. The inits with
+delayed imports have two halves: one defining a dictionary `_import_structure` which maps modules to the name of the
+objects in each module, and one in `TYPE_CHECKING` which looks like a normal init for type-checkers. `isort` or `ruff`
+properly sort the second half which looks like traditionl imports, the goal of this script is to sort the first half.
+
+Use from the root of the repo with:
+
+```bash
+python utils/custom_init_isort.py
+```
+
+which will auto-sort the imports (used in `make style`).
+
+For a check only (as used in `make quality`) run:
+
+```bash
+python utils/custom_init_isort.py --check_only
+```
+"""
+
+import argparse
+import os
+import re
+from typing import Any, Callable, List, Optional
+
+
+# Path is defined with the intent you should run this script from the root of the repo.
+PATH_TO_TRANSFORMERS = "src/diffusers"
+
+# Pattern that looks at the indentation in a line.
+_re_indent = re.compile(r"^(\s*)\S")
+# Pattern that matches `"key":" and puts `key` in group 0.
+_re_direct_key = re.compile(r'^\s*"([^"]+)":')
+# Pattern that matches `_import_structure["key"]` and puts `key` in group 0.
+_re_indirect_key = re.compile(r'^\s*_import_structure\["([^"]+)"\]')
+# Pattern that matches `"key",` and puts `key` in group 0.
+_re_strip_line = re.compile(r'^\s*"([^"]+)",\s*$')
+# Pattern that matches any `[stuff]` and puts `stuff` in group 0.
+_re_bracket_content = re.compile(r"\[([^\]]+)\]")
+
+
+def get_indent(line: str) -> str:
+    """Returns the indent in  given line (as string)."""
+    search = _re_indent.search(line)
+    return "" if search is None else search.groups()[0]
+
+
+def split_code_in_indented_blocks(
+    code: str, indent_level: str = "", start_prompt: Optional[str] = None, end_prompt: Optional[str] = None
+) -> List[str]:
+    """
+    Split some code into its indented blocks, starting at a given level.
+
+    Args:
+        code (`str`): The code to split.
+        indent_level (`str`): The indent level (as string) to use for identifying the blocks to split.
+        start_prompt (`str`, *optional*): If provided, only starts splitting at the line where this text is.
+        end_prompt (`str`, *optional*): If provided, stops splitting at a line where this text is.
+
+    Warning:
+        The text before `start_prompt` or after `end_prompt` (if provided) is not ignored, just not split. The input `code`
+        can thus be retrieved by joining the result.
+
+    Returns:
+        `List[str]`: The list of blocks.
+    """
+    # Let's split the code into lines and move to start_index.
+    index = 0
+    lines = code.split("\n")
+    if start_prompt is not None:
+        while not lines[index].startswith(start_prompt):
+            index += 1
+        blocks = ["\n".join(lines[:index])]
+    else:
+        blocks = []
+
+    # This variable contains the block treated at a given time.
+    current_block = [lines[index]]
+    index += 1
+    # We split into blocks until we get to the `end_prompt` (or the end of the file).
+    while index < len(lines) and (end_prompt is None or not lines[index].startswith(end_prompt)):
+        # We have a non-empty line with the proper indent -> start of a new block
+        if len(lines[index]) > 0 and get_indent(lines[index]) == indent_level:
+            # Store the current block in the result and rest. There are two cases: the line is part of the block (like
+            # a closing parenthesis) or not.
+            if len(current_block) > 0 and get_indent(current_block[-1]).startswith(indent_level + " "):
+                # Line is part of the current block
+                current_block.append(lines[index])
+                blocks.append("\n".join(current_block))
+                if index < len(lines) - 1:
+                    current_block = [lines[index + 1]]
+                    index += 1
+                else:
+                    current_block = []
+            else:
+                # Line is not part of the current block
+                blocks.append("\n".join(current_block))
+                current_block = [lines[index]]
+        else:
+            # Just add the line to the current block
+            current_block.append(lines[index])
+        index += 1
+
+    # Adds current block if it's nonempty.
+    if len(current_block) > 0:
+        blocks.append("\n".join(current_block))
+
+    # Add final block after end_prompt if provided.
+    if end_prompt is not None and index < len(lines):
+        blocks.append("\n".join(lines[index:]))
+
+    return blocks
+
+
+def ignore_underscore_and_lowercase(key: Callable[[Any], str]) -> Callable[[Any], str]:
+    """
+    Wraps a key function (as used in a sort) to lowercase and ignore underscores.
+    """
+
+    def _inner(x):
+        return key(x).lower().replace("_", "")
+
+    return _inner
+
+
+def sort_objects(objects: List[Any], key: Optional[Callable[[Any], str]] = None) -> List[Any]:
+    """
+    Sort a list of objects following the rules of isort (all uppercased first, camel-cased second and lower-cased
+    last).
+
+    Args:
+        objects (`List[Any]`):
+            The list of objects to sort.
+        key (`Callable[[Any], str]`, *optional*):
+            A function taking an object as input and returning a string, used to sort them by alphabetical order.
+            If not provided, will default to noop (so a `key` must be provided if the `objects` are not of type string).
+
+    Returns:
+        `List[Any]`: The sorted list with the same elements as in the inputs
+    """
+
+    # If no key is provided, we use a noop.
+    def noop(x):
+        return x
+
+    if key is None:
+        key = noop
+    # Constants are all uppercase, they go first.
+    constants = [obj for obj in objects if key(obj).isupper()]
+    # Classes are not all uppercase but start with a capital, they go second.
+    classes = [obj for obj in objects if key(obj)[0].isupper() and not key(obj).isupper()]
+    # Functions begin with a lowercase, they go last.
+    functions = [obj for obj in objects if not key(obj)[0].isupper()]
+
+    # Then we sort each group.
+    key1 = ignore_underscore_and_lowercase(key)
+    return sorted(constants, key=key1) + sorted(classes, key=key1) + sorted(functions, key=key1)
+
+
+def sort_objects_in_import(import_statement: str) -> str:
+    """
+    Sorts the imports in a single import statement.
+
+    Args:
+        import_statement (`str`): The import statement in which to sort the imports.
+
+    Returns:
+        `str`: The same as the input, but with objects properly sorted.
+    """
+
+    # This inner function sort imports between [ ].
+    def _replace(match):
+        imports = match.groups()[0]
+        # If there is one import only, nothing to do.
+        if "," not in imports:
+            return f"[{imports}]"
+        keys = [part.strip().replace('"', "") for part in imports.split(",")]
+        # We will have a final empty element if the line finished with a comma.
+        if len(keys[-1]) == 0:
+            keys = keys[:-1]
+        return "[" + ", ".join([f'"{k}"' for k in sort_objects(keys)]) + "]"
+
+    lines = import_statement.split("\n")
+    if len(lines) > 3:
+        # Here we have to sort internal imports that are on several lines (one per name):
+        # key: [
+        #     "object1",
+        #     "object2",
+        #     ...
+        # ]
+
+        # We may have to ignore one or two lines on each side.
+        idx = 2 if lines[1].strip() == "[" else 1
+        keys_to_sort = [(i, _re_strip_line.search(line).groups()[0]) for i, line in enumerate(lines[idx:-idx])]
+        sorted_indices = sort_objects(keys_to_sort, key=lambda x: x[1])
+        sorted_lines = [lines[x[0] + idx] for x in sorted_indices]
+        return "\n".join(lines[:idx] + sorted_lines + lines[-idx:])
+    elif len(lines) == 3:
+        # Here we have to sort internal imports that are on one separate line:
+        # key: [
+        #     "object1", "object2", ...
+        # ]
+        if _re_bracket_content.search(lines[1]) is not None:
+            lines[1] = _re_bracket_content.sub(_replace, lines[1])
+        else:
+            keys = [part.strip().replace('"', "") for part in lines[1].split(",")]
+            # We will have a final empty element if the line finished with a comma.
+            if len(keys[-1]) == 0:
+                keys = keys[:-1]
+            lines[1] = get_indent(lines[1]) + ", ".join([f'"{k}"' for k in sort_objects(keys)])
+        return "\n".join(lines)
+    else:
+        # Finally we have to deal with imports fitting on one line
+        import_statement = _re_bracket_content.sub(_replace, import_statement)
+        return import_statement
+
+
+def sort_imports(file: str, check_only: bool = True):
+    """
+    Sort the imports defined in the `_import_structure` of a given init.
+
+    Args:
+        file (`str`): The path to the init to check/fix.
+        check_only (`bool`, *optional*, defaults to `True`): Whether or not to just check (and not auto-fix) the init.
+    """
+    with open(file, encoding="utf-8") as f:
+        code = f.read()
+
+    # If the file is not a custom init, there is nothing to do.
+    if "_import_structure" not in code:
+        return
+
+    # Blocks of indent level 0
+    main_blocks = split_code_in_indented_blocks(
+        code, start_prompt="_import_structure = {", end_prompt="if TYPE_CHECKING:"
+    )
+
+    # We ignore block 0 (everything untils start_prompt) and the last block (everything after end_prompt).
+    for block_idx in range(1, len(main_blocks) - 1):
+        # Check if the block contains some `_import_structure`s thingy to sort.
+        block = main_blocks[block_idx]
+        block_lines = block.split("\n")
+
+        # Get to the start of the imports.
+        line_idx = 0
+        while line_idx < len(block_lines) and "_import_structure" not in block_lines[line_idx]:
+            # Skip dummy import blocks
+            if "import dummy" in block_lines[line_idx]:
+                line_idx = len(block_lines)
+            else:
+                line_idx += 1
+        if line_idx >= len(block_lines):
+            continue
+
+        # Ignore beginning and last line: they don't contain anything.
+        internal_block_code = "\n".join(block_lines[line_idx:-1])
+        indent = get_indent(block_lines[1])
+        # Slit the internal block into blocks of indent level 1.
+        internal_blocks = split_code_in_indented_blocks(internal_block_code, indent_level=indent)
+        # We have two categories of import key: list or _import_structure[key].append/extend
+        pattern = _re_direct_key if "_import_structure = {" in block_lines[0] else _re_indirect_key
+        # Grab the keys, but there is a trap: some lines are empty or just comments.
+        keys = [(pattern.search(b).groups()[0] if pattern.search(b) is not None else None) for b in internal_blocks]
+        # We only sort the lines with a key.
+        keys_to_sort = [(i, key) for i, key in enumerate(keys) if key is not None]
+        sorted_indices = [x[0] for x in sorted(keys_to_sort, key=lambda x: x[1])]
+
+        # We reorder the blocks by leaving empty lines/comments as they were and reorder the rest.
+        count = 0
+        reordered_blocks = []
+        for i in range(len(internal_blocks)):
+            if keys[i] is None:
+                reordered_blocks.append(internal_blocks[i])
+            else:
+                block = sort_objects_in_import(internal_blocks[sorted_indices[count]])
+                reordered_blocks.append(block)
+                count += 1
+
+        # And we put our main block back together with its first and last line.
+        main_blocks[block_idx] = "\n".join(block_lines[:line_idx] + reordered_blocks + [block_lines[-1]])
+
+    if code != "\n".join(main_blocks):
+        if check_only:
+            return True
+        else:
+            print(f"Overwriting {file}.")
+            with open(file, "w", encoding="utf-8") as f:
+                f.write("\n".join(main_blocks))
+
+
+def sort_imports_in_all_inits(check_only=True):
+    """
+    Sort the imports defined in the `_import_structure` of all inits in the repo.
+
+    Args:
+        check_only (`bool`, *optional*, defaults to `True`): Whether or not to just check (and not auto-fix) the init.
+    """
+    failures = []
+    for root, _, files in os.walk(PATH_TO_TRANSFORMERS):
+        if "__init__.py" in files:
+            result = sort_imports(os.path.join(root, "__init__.py"), check_only=check_only)
+            if result:
+                failures = [os.path.join(root, "__init__.py")]
+    if len(failures) > 0:
+        raise ValueError(f"Would overwrite {len(failures)} files, run `make style`.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--check_only", action="store_true", help="Whether to only check or fix style.")
+    args = parser.parse_args()
+
+    sort_imports_in_all_inits(check_only=args.check_only)
diff --git a/diffusers/utils/fetch_latest_release_branch.py b/diffusers/utils/fetch_latest_release_branch.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bf578a5f58e94a3937ee3eb22ddd6ffd6ecf85e
--- /dev/null
+++ b/diffusers/utils/fetch_latest_release_branch.py
@@ -0,0 +1,68 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+from packaging.version import parse
+
+
+# GitHub repository details
+USER = "huggingface"
+REPO = "diffusers"
+
+
+def fetch_all_branches(user, repo):
+    branches = []  # List to store all branches
+    page = 1  # Start from first page
+    while True:
+        # Make a request to the GitHub API for the branches
+        response = requests.get(f"https://api.github.com/repos/{user}/{repo}/branches", params={"page": page})
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            # Add the branches from the current page to the list
+            branches.extend([branch["name"] for branch in response.json()])
+
+            # Check if there is a 'next' link for pagination
+            if "next" in response.links:
+                page += 1  # Move to the next page
+            else:
+                break  # Exit loop if there is no next page
+        else:
+            print("Failed to retrieve branches:", response.status_code)
+            break
+
+    return branches
+
+
+def main():
+    # Fetch all branches
+    branches = fetch_all_branches(USER, REPO)
+
+    # Filter branches.
+    # print(f"Total branches: {len(branches)}")
+    filtered_branches = []
+    for branch in branches:
+        if branch.startswith("v") and ("-release" in branch or "-patch" in branch):
+            filtered_branches.append(branch)
+            # print(f"Filtered: {branch}")
+
+    sorted_branches = sorted(filtered_branches, key=lambda x: parse(x.split("-")[0][1:]), reverse=True)
+    latest_branch = sorted_branches[0]
+    # print(f"Latest branch: {latest_branch}")
+    return latest_branch
+
+
+if __name__ == "__main__":
+    print(main())
diff --git a/diffusers/utils/fetch_torch_cuda_pipeline_test_matrix.py b/diffusers/utils/fetch_torch_cuda_pipeline_test_matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6a9c4b6a3bddeb62a6c86641381364362efedfb
--- /dev/null
+++ b/diffusers/utils/fetch_torch_cuda_pipeline_test_matrix.py
@@ -0,0 +1,101 @@
+import json
+import logging
+import os
+from collections import defaultdict
+from pathlib import Path
+
+from huggingface_hub import HfApi
+
+import diffusers
+
+
+PATH_TO_REPO = Path(__file__).parent.parent.resolve()
+ALWAYS_TEST_PIPELINE_MODULES = [
+    "controlnet",
+    "stable_diffusion",
+    "stable_diffusion_2",
+    "stable_diffusion_xl",
+    "stable_diffusion_adapter",
+    "deepfloyd_if",
+    "ip_adapters",
+    "kandinsky",
+    "kandinsky2_2",
+    "text_to_video_synthesis",
+    "wuerstchen",
+]
+PIPELINE_USAGE_CUTOFF = int(os.getenv("PIPELINE_USAGE_CUTOFF", 50000))
+
+logger = logging.getLogger(__name__)
+api = HfApi()
+
+
+def filter_pipelines(usage_dict, usage_cutoff=10000):
+    output = []
+    for diffusers_object, usage in usage_dict.items():
+        if usage < usage_cutoff:
+            continue
+
+        is_diffusers_pipeline = hasattr(diffusers.pipelines, diffusers_object)
+        if not is_diffusers_pipeline:
+            continue
+
+        output.append(diffusers_object)
+
+    return output
+
+
+def fetch_pipeline_objects():
+    models = api.list_models(library="diffusers")
+    downloads = defaultdict(int)
+
+    for model in models:
+        is_counted = False
+        for tag in model.tags:
+            if tag.startswith("diffusers:"):
+                is_counted = True
+                downloads[tag[len("diffusers:") :]] += model.downloads
+
+        if not is_counted:
+            downloads["other"] += model.downloads
+
+    # Remove 0 downloads
+    downloads = {k: v for k, v in downloads.items() if v > 0}
+    pipeline_objects = filter_pipelines(downloads, PIPELINE_USAGE_CUTOFF)
+
+    return pipeline_objects
+
+
+def fetch_pipeline_modules_to_test():
+    try:
+        pipeline_objects = fetch_pipeline_objects()
+    except Exception as e:
+        logger.error(e)
+        raise RuntimeError("Unable to fetch model list from HuggingFace Hub.")
+
+    test_modules = []
+    for pipeline_name in pipeline_objects:
+        module = getattr(diffusers, pipeline_name)
+
+        test_module = module.__module__.split(".")[-2].strip()
+        test_modules.append(test_module)
+
+    return test_modules
+
+
+def main():
+    test_modules = fetch_pipeline_modules_to_test()
+    test_modules.extend(ALWAYS_TEST_PIPELINE_MODULES)
+
+    # Get unique modules
+    test_modules = sorted(set(test_modules))
+    print(json.dumps(test_modules))
+
+    save_path = f"{PATH_TO_REPO}/reports"
+    os.makedirs(save_path, exist_ok=True)
+
+    with open(f"{save_path}/test-pipelines.json", "w") as f:
+        json.dump({"pipeline_test_modules": test_modules}, f)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/utils/get_modified_files.py b/diffusers/utils/get_modified_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..a252bc648be51e13cba9f6156be0483f8a7b0b86
--- /dev/null
+++ b/diffusers/utils/get_modified_files.py
@@ -0,0 +1,34 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# this script reports modified .py files under the desired list of top-level sub-dirs passed as a list of arguments, e.g.:
+#   python ./utils/get_modified_files.py utils src tests examples
+#
+# it uses git to find the forking point and which files were modified - i.e. files not under git won't be considered
+# since the output of this script is fed into Makefile commands it doesn't print a newline after the results
+
+import re
+import subprocess
+import sys
+
+
+fork_point_sha = subprocess.check_output("git merge-base main HEAD".split()).decode("utf-8")
+modified_files = subprocess.check_output(f"git diff --name-only {fork_point_sha}".split()).decode("utf-8").split()
+
+joined_dirs = "|".join(sys.argv[1:])
+regex = re.compile(rf"^({joined_dirs}).*?\.py$")
+
+relevant_modified_files = [x for x in modified_files if regex.match(x)]
+print(" ".join(relevant_modified_files), end="")
diff --git a/diffusers/utils/log_reports.py b/diffusers/utils/log_reports.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd1b258519d7b1ffc6f3a48cfe55033e3eeb2a38
--- /dev/null
+++ b/diffusers/utils/log_reports.py
@@ -0,0 +1,139 @@
+import argparse
+import json
+import os
+from datetime import date
+from pathlib import Path
+
+from slack_sdk import WebClient
+from tabulate import tabulate
+
+
+MAX_LEN_MESSAGE = 2900  # slack endpoint has a limit of 3001 characters
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--slack_channel_name", default="diffusers-ci-nightly")
+
+
+def main(slack_channel_name=None):
+    failed = []
+    passed = []
+
+    group_info = []
+
+    total_num_failed = 0
+    empty_file = False or len(list(Path().glob("*.log"))) == 0
+
+    total_empty_files = []
+
+    for log in Path().glob("*.log"):
+        section_num_failed = 0
+        i = 0
+        with open(log) as f:
+            for line in f:
+                line = json.loads(line)
+                i += 1
+                if line.get("nodeid", "") != "":
+                    test = line["nodeid"]
+                    if line.get("duration", None) is not None:
+                        duration = f'{line["duration"]:.4f}'
+                        if line.get("outcome", "") == "failed":
+                            section_num_failed += 1
+                            failed.append([test, duration, log.name.split("_")[0]])
+                            total_num_failed += 1
+                        else:
+                            passed.append([test, duration, log.name.split("_")[0]])
+            empty_file = i == 0
+        group_info.append([str(log), section_num_failed, failed])
+        total_empty_files.append(empty_file)
+        os.remove(log)
+        failed = []
+    text = (
+        "🌞 There were no failures!"
+        if not any(total_empty_files)
+        else "Something went wrong there is at least one empty file - please check GH action results."
+    )
+    no_error_payload = {
+        "type": "section",
+        "text": {
+            "type": "plain_text",
+            "text": text,
+            "emoji": True,
+        },
+    }
+
+    message = ""
+    payload = [
+        {
+            "type": "header",
+            "text": {
+                "type": "plain_text",
+                "text": "🤗 Results of the Diffusers scheduled nightly tests.",
+            },
+        },
+    ]
+    if total_num_failed > 0:
+        for i, (name, num_failed, failed_tests) in enumerate(group_info):
+            if num_failed > 0:
+                if num_failed == 1:
+                    message += f"*{name}: {num_failed} failed test*\n"
+                else:
+                    message += f"*{name}: {num_failed} failed tests*\n"
+                failed_table = []
+                for test in failed_tests:
+                    failed_table.append(test[0].split("::"))
+                failed_table = tabulate(
+                    failed_table,
+                    headers=["Test Location", "Test Case", "Test Name"],
+                    showindex="always",
+                    tablefmt="grid",
+                    maxcolwidths=[12, 12, 12],
+                )
+                message += "\n```\n" + failed_table + "\n```"
+
+            if total_empty_files[i]:
+                message += f"\n*{name}: Warning! Empty file - please check the GitHub action job *\n"
+        print(f"### {message}")
+    else:
+        payload.append(no_error_payload)
+
+    if len(message) > MAX_LEN_MESSAGE:
+        print(f"Truncating long message from {len(message)} to {MAX_LEN_MESSAGE}")
+        message = message[:MAX_LEN_MESSAGE] + "..."
+
+    if len(message) != 0:
+        md_report = {
+            "type": "section",
+            "text": {"type": "mrkdwn", "text": message},
+        }
+        payload.append(md_report)
+        action_button = {
+            "type": "section",
+            "text": {"type": "mrkdwn", "text": "*For more details:*"},
+            "accessory": {
+                "type": "button",
+                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
+                "url": f"https://github.com/huggingface/diffusers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
+            },
+        }
+        payload.append(action_button)
+
+    date_report = {
+        "type": "context",
+        "elements": [
+            {
+                "type": "plain_text",
+                "text": f"Nightly test results for {date.today()}",
+            },
+        ],
+    }
+    payload.append(date_report)
+
+    print(payload)
+
+    client = WebClient(token=os.environ.get("SLACK_API_TOKEN"))
+    client.chat_postMessage(channel=f"#{slack_channel_name}", text=message, blocks=payload)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.slack_channel_name)
diff --git a/diffusers/utils/notify_benchmarking_status.py b/diffusers/utils/notify_benchmarking_status.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9c6ab485f597672e72f54418b3437209385d6a9
--- /dev/null
+++ b/diffusers/utils/notify_benchmarking_status.py
@@ -0,0 +1,56 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+import requests
+
+
+# Configuration
+GITHUB_REPO = "huggingface/diffusers"
+GITHUB_RUN_ID = os.getenv("GITHUB_RUN_ID")
+SLACK_WEBHOOK_URL = os.getenv("SLACK_WEBHOOK_URL")
+
+
+def main(args):
+    action_url = f"https://github.com/{GITHUB_REPO}/actions/runs/{GITHUB_RUN_ID}"
+    if args.status == "success":
+        hub_path = "https://huggingface.co/datasets/diffusers/benchmarks/blob/main/collated_results.csv"
+        message = (
+            "✅ New benchmark workflow successfully run.\n"
+            f"🕸️ GitHub Action URL: {action_url}.\n"
+            f"🤗 Check out the benchmarks here: {hub_path}."
+        )
+    else:
+        message = (
+            "❌ Something wrong happened in the benchmarking workflow.\n"
+            f"Check out the GitHub Action to know more: {action_url}."
+        )
+
+    payload = {"text": message}
+    response = requests.post(SLACK_WEBHOOK_URL, json=payload)
+
+    if response.status_code == 200:
+        print("Notification sent to Slack successfully.")
+    else:
+        print("Failed to send notification to Slack.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--status", type=str, default="success", choices=["success", "failure"])
+    args = parser.parse_args()
+    main(args)
diff --git a/diffusers/utils/notify_community_pipelines_mirror.py b/diffusers/utils/notify_community_pipelines_mirror.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7d3a31c988e2376cc10766bbec08dd2404a6cd7
--- /dev/null
+++ b/diffusers/utils/notify_community_pipelines_mirror.py
@@ -0,0 +1,54 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+import requests
+
+
+# Configuration
+GITHUB_REPO = "huggingface/diffusers"
+GITHUB_RUN_ID = os.getenv("GITHUB_RUN_ID")
+SLACK_WEBHOOK_URL = os.getenv("SLACK_WEBHOOK_URL")
+PATH_IN_REPO = os.getenv("PATH_IN_REPO")
+
+
+def main(args):
+    action_url = f"https://github.com/{GITHUB_REPO}/actions/runs/{GITHUB_RUN_ID}"
+    if args.status == "success":
+        hub_path = f"https://huggingface.co/datasets/diffusers/community-pipelines-mirror/tree/main/{PATH_IN_REPO}"
+        message = (
+            "✅ Community pipelines successfully mirrored.\n"
+            f"🕸️ GitHub Action URL: {action_url}.\n"
+            f"🤗 Hub location: {hub_path}."
+        )
+    else:
+        message = f"❌ Something wrong happened. Check out the GitHub Action to know more: {action_url}."
+
+    payload = {"text": message}
+    response = requests.post(SLACK_WEBHOOK_URL, json=payload)
+
+    if response.status_code == 200:
+        print("Notification sent to Slack successfully.")
+    else:
+        print("Failed to send notification to Slack.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--status", type=str, default="success", choices=["success", "failure"])
+    args = parser.parse_args()
+    main(args)
diff --git a/diffusers/utils/notify_slack_about_release.py b/diffusers/utils/notify_slack_about_release.py
new file mode 100644
index 0000000000000000000000000000000000000000..a67dd8bd06859562f135848130ea308ec8440078
--- /dev/null
+++ b/diffusers/utils/notify_slack_about_release.py
@@ -0,0 +1,81 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import requests
+
+
+# Configuration
+LIBRARY_NAME = "diffusers"
+GITHUB_REPO = "huggingface/diffusers"
+SLACK_WEBHOOK_URL = os.getenv("SLACK_WEBHOOK_URL")
+
+
+def check_pypi_for_latest_release(library_name):
+    """Check PyPI for the latest release of the library."""
+    response = requests.get(f"https://pypi.org/pypi/{library_name}/json")
+    if response.status_code == 200:
+        data = response.json()
+        return data["info"]["version"]
+    else:
+        print("Failed to fetch library details from PyPI.")
+        return None
+
+
+def get_github_release_info(github_repo):
+    """Fetch the latest release info from GitHub."""
+    url = f"https://api.github.com/repos/{github_repo}/releases/latest"
+    response = requests.get(url)
+
+    if response.status_code == 200:
+        data = response.json()
+        return {"tag_name": data["tag_name"], "url": data["html_url"], "release_time": data["published_at"]}
+
+    else:
+        print("Failed to fetch release info from GitHub.")
+        return None
+
+
+def notify_slack(webhook_url, library_name, version, release_info):
+    """Send a notification to a Slack channel."""
+    message = (
+        f"🚀 New release for {library_name} available: version **{version}** 🎉\n"
+        f"📜 Release Notes: {release_info['url']}\n"
+        f"⏱️ Release time: {release_info['release_time']}"
+    )
+    payload = {"text": message}
+    response = requests.post(webhook_url, json=payload)
+
+    if response.status_code == 200:
+        print("Notification sent to Slack successfully.")
+    else:
+        print("Failed to send notification to Slack.")
+
+
+def main():
+    latest_version = check_pypi_for_latest_release(LIBRARY_NAME)
+    release_info = get_github_release_info(GITHUB_REPO)
+    parsed_version = release_info["tag_name"].replace("v", "")
+
+    if latest_version and release_info and latest_version == parsed_version:
+        notify_slack(SLACK_WEBHOOK_URL, LIBRARY_NAME, latest_version, release_info)
+    else:
+        print(f"{latest_version=}, {release_info=}, {parsed_version=}")
+        raise ValueError("There were some problems.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/utils/overwrite_expected_slice.py b/diffusers/utils/overwrite_expected_slice.py
new file mode 100644
index 0000000000000000000000000000000000000000..07778a05b1eefc2976dd2ed7dbd32b42b84b15e0
--- /dev/null
+++ b/diffusers/utils/overwrite_expected_slice.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from collections import defaultdict
+
+
+def overwrite_file(file, class_name, test_name, correct_line, done_test):
+    _id = f"{file}_{class_name}_{test_name}"
+    done_test[_id] += 1
+
+    with open(file, "r") as f:
+        lines = f.readlines()
+
+    class_regex = f"class {class_name}("
+    test_regex = f"{4 * ' '}def {test_name}("
+    line_begin_regex = f"{8 * ' '}{correct_line.split()[0]}"
+    another_line_begin_regex = f"{16 * ' '}{correct_line.split()[0]}"
+    in_class = False
+    in_func = False
+    in_line = False
+    insert_line = False
+    count = 0
+    spaces = 0
+
+    new_lines = []
+    for line in lines:
+        if line.startswith(class_regex):
+            in_class = True
+        elif in_class and line.startswith(test_regex):
+            in_func = True
+        elif in_class and in_func and (line.startswith(line_begin_regex) or line.startswith(another_line_begin_regex)):
+            spaces = len(line.split(correct_line.split()[0])[0])
+            count += 1
+
+            if count == done_test[_id]:
+                in_line = True
+
+        if in_class and in_func and in_line:
+            if ")" not in line:
+                continue
+            else:
+                insert_line = True
+
+        if in_class and in_func and in_line and insert_line:
+            new_lines.append(f"{spaces * ' '}{correct_line}")
+            in_class = in_func = in_line = insert_line = False
+        else:
+            new_lines.append(line)
+
+    with open(file, "w") as f:
+        for line in new_lines:
+            f.write(line)
+
+
+def main(correct, fail=None):
+    if fail is not None:
+        with open(fail, "r") as f:
+            test_failures = {l.strip() for l in f.readlines()}
+    else:
+        test_failures = None
+
+    with open(correct, "r") as f:
+        correct_lines = f.readlines()
+
+    done_tests = defaultdict(int)
+    for line in correct_lines:
+        file, class_name, test_name, correct_line = line.split("::")
+        if test_failures is None or "::".join([file, class_name, test_name]) in test_failures:
+            overwrite_file(file, class_name, test_name, correct_line, done_tests)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--correct_filename", help="filename of tests with expected result")
+    parser.add_argument("--fail_filename", help="filename of test failures", type=str, default=None)
+    args = parser.parse_args()
+
+    main(args.correct_filename, args.fail_filename)
diff --git a/diffusers/utils/print_env.py b/diffusers/utils/print_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e4495c980940530720593afc92214103551744d
--- /dev/null
+++ b/diffusers/utils/print_env.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# this script dumps information about the environment
+
+import os
+import platform
+import sys
+
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+print("Python version:", sys.version)
+
+print("OS platform:", platform.platform())
+print("OS architecture:", platform.machine())
+
+try:
+    import torch
+
+    print("Torch version:", torch.__version__)
+    print("Cuda available:", torch.cuda.is_available())
+    print("Cuda version:", torch.version.cuda)
+    print("CuDNN version:", torch.backends.cudnn.version())
+    print("Number of GPUs available:", torch.cuda.device_count())
+except ImportError:
+    print("Torch version:", None)
+
+try:
+    import transformers
+
+    print("transformers version:", transformers.__version__)
+except ImportError:
+    print("transformers version:", None)
diff --git a/diffusers/utils/release.py b/diffusers/utils/release.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0800b99fbebd9d2956b64fd92d93490e36f9a41
--- /dev/null
+++ b/diffusers/utils/release.py
@@ -0,0 +1,162 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import re
+
+import packaging.version
+
+
+PATH_TO_EXAMPLES = "examples/"
+REPLACE_PATTERNS = {
+    "examples": (re.compile(r'^check_min_version\("[^"]+"\)\s*$', re.MULTILINE), 'check_min_version("VERSION")\n'),
+    "init": (re.compile(r'^__version__\s+=\s+"([^"]+)"\s*$', re.MULTILINE), '__version__ = "VERSION"\n'),
+    "setup": (re.compile(r'^(\s*)version\s*=\s*"[^"]+",', re.MULTILINE), r'\1version="VERSION",'),
+    "doc": (re.compile(r'^(\s*)release\s*=\s*"[^"]+"$', re.MULTILINE), 'release = "VERSION"\n'),
+}
+REPLACE_FILES = {
+    "init": "src/diffusers/__init__.py",
+    "setup": "setup.py",
+}
+README_FILE = "README.md"
+
+
+def update_version_in_file(fname, version, pattern):
+    """Update the version in one file using a specific pattern."""
+    with open(fname, "r", encoding="utf-8", newline="\n") as f:
+        code = f.read()
+    re_pattern, replace = REPLACE_PATTERNS[pattern]
+    replace = replace.replace("VERSION", version)
+    code = re_pattern.sub(replace, code)
+    with open(fname, "w", encoding="utf-8", newline="\n") as f:
+        f.write(code)
+
+
+def update_version_in_examples(version):
+    """Update the version in all examples files."""
+    for folder, directories, fnames in os.walk(PATH_TO_EXAMPLES):
+        # Removing some of the folders with non-actively maintained examples from the walk
+        if "research_projects" in directories:
+            directories.remove("research_projects")
+        if "legacy" in directories:
+            directories.remove("legacy")
+        for fname in fnames:
+            if fname.endswith(".py"):
+                update_version_in_file(os.path.join(folder, fname), version, pattern="examples")
+
+
+def global_version_update(version, patch=False):
+    """Update the version in all needed files."""
+    for pattern, fname in REPLACE_FILES.items():
+        update_version_in_file(fname, version, pattern)
+    if not patch:
+        update_version_in_examples(version)
+
+
+def clean_main_ref_in_model_list():
+    """Replace the links from main doc tp stable doc in the model list of the README."""
+    # If the introduction or the conclusion of the list change, the prompts may need to be updated.
+    _start_prompt = "🤗 Transformers currently provides the following architectures"
+    _end_prompt = "1. Want to contribute a new model?"
+    with open(README_FILE, "r", encoding="utf-8", newline="\n") as f:
+        lines = f.readlines()
+
+    # Find the start of the list.
+    start_index = 0
+    while not lines[start_index].startswith(_start_prompt):
+        start_index += 1
+    start_index += 1
+
+    index = start_index
+    # Update the lines in the model list.
+    while not lines[index].startswith(_end_prompt):
+        if lines[index].startswith("1."):
+            lines[index] = lines[index].replace(
+                "https://huggingface.co/docs/diffusers/main/model_doc",
+                "https://huggingface.co/docs/diffusers/model_doc",
+            )
+        index += 1
+
+    with open(README_FILE, "w", encoding="utf-8", newline="\n") as f:
+        f.writelines(lines)
+
+
+def get_version():
+    """Reads the current version in the __init__."""
+    with open(REPLACE_FILES["init"], "r") as f:
+        code = f.read()
+    default_version = REPLACE_PATTERNS["init"][0].search(code).groups()[0]
+    return packaging.version.parse(default_version)
+
+
+def pre_release_work(patch=False):
+    """Do all the necessary pre-release steps."""
+    # First let's get the default version: base version if we are in dev, bump minor otherwise.
+    default_version = get_version()
+    if patch and default_version.is_devrelease:
+        raise ValueError("Can't create a patch version from the dev branch, checkout a released version!")
+    if default_version.is_devrelease:
+        default_version = default_version.base_version
+    elif patch:
+        default_version = f"{default_version.major}.{default_version.minor}.{default_version.micro + 1}"
+    else:
+        default_version = f"{default_version.major}.{default_version.minor + 1}.0"
+
+    # Now let's ask nicely if that's the right one.
+    version = input(f"Which version are you releasing? [{default_version}]")
+    if len(version) == 0:
+        version = default_version
+
+    print(f"Updating version to {version}.")
+    global_version_update(version, patch=patch)
+
+
+#    if not patch:
+#        print("Cleaning main README, don't forget to run `make fix-copies`.")
+#        clean_main_ref_in_model_list()
+
+
+def post_release_work():
+    """Do all the necessary post-release steps."""
+    # First let's get the current version
+    current_version = get_version()
+    dev_version = f"{current_version.major}.{current_version.minor + 1}.0.dev0"
+    current_version = current_version.base_version
+
+    # Check with the user we got that right.
+    version = input(f"Which version are we developing now? [{dev_version}]")
+    if len(version) == 0:
+        version = dev_version
+
+    print(f"Updating version to {version}.")
+    global_version_update(version)
+
+
+#    print("Cleaning main README, don't forget to run `make fix-copies`.")
+#    clean_main_ref_in_model_list()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--post_release", action="store_true", help="Whether this is pre or post release.")
+    parser.add_argument("--patch", action="store_true", help="Whether or not this is a patch release.")
+    args = parser.parse_args()
+    if not args.post_release:
+        pre_release_work(patch=args.patch)
+    elif args.patch:
+        print("Nothing to do after a patch :-)")
+    else:
+        post_release_work()
diff --git a/diffusers/utils/stale.py b/diffusers/utils/stale.py
new file mode 100644
index 0000000000000000000000000000000000000000..c01b6d5682e9c9895b62155d50d1a1cd2c516ef1
--- /dev/null
+++ b/diffusers/utils/stale.py
@@ -0,0 +1,68 @@
+# Copyright 2024 The HuggingFace Team, the AllenNLP library authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Script to close stale issue. Taken in part from the AllenNLP repository.
+https://github.com/allenai/allennlp.
+"""
+
+import os
+from datetime import datetime as dt
+from datetime import timezone
+
+from github import Github
+
+
+LABELS_TO_EXEMPT = [
+    "good first issue",
+    "good second issue",
+    "good difficult issue",
+    "enhancement",
+    "new pipeline/model",
+    "new scheduler",
+    "wip",
+]
+
+
+def main():
+    g = Github(os.environ["GITHUB_TOKEN"])
+    repo = g.get_repo("huggingface/diffusers")
+    open_issues = repo.get_issues(state="open")
+
+    for issue in open_issues:
+        labels = [label.name.lower() for label in issue.get_labels()]
+        if "stale" in labels:
+            comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
+            last_comment = comments[0] if len(comments) > 0 else None
+            if last_comment is not None and last_comment.user.login != "github-actions[bot]":
+                # Opens the issue if someone other than Stalebot commented.
+                issue.edit(state="open")
+                issue.remove_from_labels("stale")
+        elif (
+            (dt.now(timezone.utc) - issue.updated_at).days > 23
+            and (dt.now(timezone.utc) - issue.created_at).days >= 30
+            and not any(label in LABELS_TO_EXEMPT for label in labels)
+        ):
+            # Post a Stalebot notification after 23 days of inactivity.
+            issue.create_comment(
+                "This issue has been automatically marked as stale because it has not had "
+                "recent activity. If you think this still needs to be addressed "
+                "please comment on this thread.\n\nPlease note that issues that do not follow the "
+                "[contributing guidelines](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md) "
+                "are likely to be ignored."
+            )
+            issue.add_to_labels("stale")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diffusers/utils/tests_fetcher.py b/diffusers/utils/tests_fetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..abdc9fd409dbe85e3cf2f72cc32de617b2434ade
--- /dev/null
+++ b/diffusers/utils/tests_fetcher.py
@@ -0,0 +1,1128 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Welcome to tests_fetcher V2.
+
+This util is designed to fetch tests to run on a PR so that only the tests impacted by the modifications are run, and
+when too many models are being impacted, only run the tests of a subset of core models. It works like this.
+
+Stage 1: Identify the modified files. For jobs that run on the main branch, it's just the diff with the last commit.
+On a PR, this takes all the files from the branching point to the current commit (so all modifications in a PR, not
+just the last commit) but excludes modifications that are on docstrings or comments only.
+
+Stage 2: Extract the tests to run. This is done by looking at the imports in each module and test file: if module A
+imports module B, then changing module B impacts module A, so the tests using module A should be run. We thus get the
+dependencies of each model and then recursively builds the 'reverse' map of dependencies to get all modules and tests
+impacted by a given file. We then only keep the tests (and only the core models tests if there are too many modules).
+
+Caveats:
+  - This module only filters tests by files (not individual tests) so it's better to have tests for different things
+    in different files.
+  - This module assumes inits are just importing things, not really building objects, so it's better to structure
+    them this way and move objects building in separate submodules.
+
+Usage:
+
+Base use to fetch the tests in a pull request
+
+```bash
+python utils/tests_fetcher.py
+```
+
+Base use to fetch the tests on a the main branch (with diff from the last commit):
+
+```bash
+python utils/tests_fetcher.py --diff_with_last_commit
+```
+"""
+
+import argparse
+import collections
+import json
+import os
+import re
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+
+from git import Repo
+
+
+PATH_TO_REPO = Path(__file__).parent.parent.resolve()
+PATH_TO_EXAMPLES = PATH_TO_REPO / "examples"
+PATH_TO_DIFFUSERS = PATH_TO_REPO / "src/diffusers"
+PATH_TO_TESTS = PATH_TO_REPO / "tests"
+
+# Ignore fixtures in tests folder
+# Ignore lora since they are always tested
+MODULES_TO_IGNORE = ["fixtures", "lora"]
+
+IMPORTANT_PIPELINES = [
+    "controlnet",
+    "stable_diffusion",
+    "stable_diffusion_2",
+    "stable_diffusion_xl",
+    "stable_video_diffusion",
+    "deepfloyd_if",
+    "kandinsky",
+    "kandinsky2_2",
+    "text_to_video_synthesis",
+    "wuerstchen",
+]
+
+
+@contextmanager
+def checkout_commit(repo: Repo, commit_id: str):
+    """
+    Context manager that checks out a given commit when entered, but gets back to the reference it was at on exit.
+
+    Args:
+        repo (`git.Repo`): A git repository (for instance the Transformers repo).
+        commit_id (`str`): The commit reference to checkout inside the context manager.
+    """
+    current_head = repo.head.commit if repo.head.is_detached else repo.head.ref
+
+    try:
+        repo.git.checkout(commit_id)
+        yield
+
+    finally:
+        repo.git.checkout(current_head)
+
+
+def clean_code(content: str) -> str:
+    """
+    Remove docstrings, empty line or comments from some code (used to detect if a diff is real or only concern
+    comments or docstrings).
+
+    Args:
+        content (`str`): The code to clean
+
+    Returns:
+        `str`: The cleaned code.
+    """
+    # We need to deactivate autoformatting here to write escaped triple quotes (we cannot use real triple quotes or
+    # this would mess up the result if this function applied to this particular file).
+    # fmt: off
+    # Remove docstrings by splitting on triple " then triple ':
+    splits = content.split('\"\"\"')
+    content = "".join(splits[::2])
+    splits = content.split("\'\'\'")
+    # fmt: on
+    content = "".join(splits[::2])
+
+    # Remove empty lines and comments
+    lines_to_keep = []
+    for line in content.split("\n"):
+        # remove anything that is after a # sign.
+        line = re.sub("#.*$", "", line)
+        # remove white lines
+        if len(line) != 0 and not line.isspace():
+            lines_to_keep.append(line)
+    return "\n".join(lines_to_keep)
+
+
+def keep_doc_examples_only(content: str) -> str:
+    """
+    Remove everything from the code content except the doc examples (used to determined if a diff should trigger doc
+    tests or not).
+
+    Args:
+        content (`str`): The code to clean
+
+    Returns:
+        `str`: The cleaned code.
+    """
+    # Keep doc examples only by splitting on triple "`"
+    splits = content.split("```")
+    # Add leading and trailing "```" so the navigation is easier when compared to the original input `content`
+    content = "```" + "```".join(splits[1::2]) + "```"
+
+    # Remove empty lines and comments
+    lines_to_keep = []
+    for line in content.split("\n"):
+        # remove anything that is after a # sign.
+        line = re.sub("#.*$", "", line)
+        # remove white lines
+        if len(line) != 0 and not line.isspace():
+            lines_to_keep.append(line)
+    return "\n".join(lines_to_keep)
+
+
+def get_all_tests() -> List[str]:
+    """
+    Walks the `tests` folder to return a list of files/subfolders. This is used to split the tests to run when using
+    parallelism. The split is:
+
+    - folders under `tests`: (`tokenization`, `pipelines`, etc) except the subfolder `models` is excluded.
+    - folders under `tests/models`: `bert`, `gpt2`, etc.
+    - test files under `tests`: `test_modeling_common.py`, `test_tokenization_common.py`, etc.
+    """
+
+    # test folders/files directly under `tests` folder
+    tests = os.listdir(PATH_TO_TESTS)
+    tests = [f"tests/{f}" for f in tests if "__pycache__" not in f]
+    tests = sorted([f for f in tests if (PATH_TO_REPO / f).is_dir() or f.startswith("tests/test_")])
+
+    return tests
+
+
+def diff_is_docstring_only(repo: Repo, branching_point: str, filename: str) -> bool:
+    """
+    Check if the diff is only in docstrings (or comments and whitespace) in a filename.
+
+    Args:
+        repo (`git.Repo`): A git repository (for instance the Transformers repo).
+        branching_point (`str`): The commit reference of where to compare for the diff.
+        filename (`str`): The filename where we want to know if the diff isonly in docstrings/comments.
+
+    Returns:
+        `bool`: Whether the diff is docstring/comments only or not.
+    """
+    folder = Path(repo.working_dir)
+    with checkout_commit(repo, branching_point):
+        with open(folder / filename, "r", encoding="utf-8") as f:
+            old_content = f.read()
+
+    with open(folder / filename, "r", encoding="utf-8") as f:
+        new_content = f.read()
+
+    old_content_clean = clean_code(old_content)
+    new_content_clean = clean_code(new_content)
+
+    return old_content_clean == new_content_clean
+
+
+def diff_contains_doc_examples(repo: Repo, branching_point: str, filename: str) -> bool:
+    """
+    Check if the diff is only in code examples of the doc in a filename.
+
+    Args:
+        repo (`git.Repo`): A git repository (for instance the Transformers repo).
+        branching_point (`str`): The commit reference of where to compare for the diff.
+        filename (`str`): The filename where we want to know if the diff is only in codes examples.
+
+    Returns:
+        `bool`: Whether the diff is only in code examples of the doc or not.
+    """
+    folder = Path(repo.working_dir)
+    with checkout_commit(repo, branching_point):
+        with open(folder / filename, "r", encoding="utf-8") as f:
+            old_content = f.read()
+
+    with open(folder / filename, "r", encoding="utf-8") as f:
+        new_content = f.read()
+
+    old_content_clean = keep_doc_examples_only(old_content)
+    new_content_clean = keep_doc_examples_only(new_content)
+
+    return old_content_clean != new_content_clean
+
+
+def get_diff(repo: Repo, base_commit: str, commits: List[str]) -> List[str]:
+    """
+    Get the diff between a base commit and one or several commits.
+
+    Args:
+        repo (`git.Repo`):
+            A git repository (for instance the Transformers repo).
+        base_commit (`str`):
+            The commit reference of where to compare for the diff. This is the current commit, not the branching point!
+        commits (`List[str]`):
+            The list of commits with which to compare the repo at `base_commit` (so the branching point).
+
+    Returns:
+        `List[str]`: The list of Python files with a diff (files added, renamed or deleted are always returned, files
+        modified are returned if the diff in the file is not only in docstrings or comments, see
+        `diff_is_docstring_only`).
+    """
+    print("\n### DIFF ###\n")
+    code_diff = []
+    for commit in commits:
+        for diff_obj in commit.diff(base_commit):
+            # We always add new python files
+            if diff_obj.change_type == "A" and diff_obj.b_path.endswith(".py"):
+                code_diff.append(diff_obj.b_path)
+            # We check that deleted python files won't break corresponding tests.
+            elif diff_obj.change_type == "D" and diff_obj.a_path.endswith(".py"):
+                code_diff.append(diff_obj.a_path)
+            # Now for modified files
+            elif diff_obj.change_type in ["M", "R"] and diff_obj.b_path.endswith(".py"):
+                # In case of renames, we'll look at the tests using both the old and new name.
+                if diff_obj.a_path != diff_obj.b_path:
+                    code_diff.extend([diff_obj.a_path, diff_obj.b_path])
+                else:
+                    # Otherwise, we check modifications are in code and not docstrings.
+                    if diff_is_docstring_only(repo, commit, diff_obj.b_path):
+                        print(f"Ignoring diff in {diff_obj.b_path} as it only concerns docstrings or comments.")
+                    else:
+                        code_diff.append(diff_obj.a_path)
+
+    return code_diff
+
+
+def get_modified_python_files(diff_with_last_commit: bool = False) -> List[str]:
+    """
+    Return a list of python files that have been modified between:
+
+    - the current head and the main branch if `diff_with_last_commit=False` (default)
+    - the current head and its parent commit otherwise.
+
+    Returns:
+        `List[str]`: The list of Python files with a diff (files added, renamed or deleted are always returned, files
+        modified are returned if the diff in the file is not only in docstrings or comments, see
+        `diff_is_docstring_only`).
+    """
+    repo = Repo(PATH_TO_REPO)
+
+    if not diff_with_last_commit:
+        # Need to fetch refs for main using remotes when running with github actions.
+        upstream_main = repo.remotes.origin.refs.main
+
+        print(f"main is at {upstream_main.commit}")
+        print(f"Current head is at {repo.head.commit}")
+
+        branching_commits = repo.merge_base(upstream_main, repo.head)
+        for commit in branching_commits:
+            print(f"Branching commit: {commit}")
+        return get_diff(repo, repo.head.commit, branching_commits)
+    else:
+        print(f"main is at {repo.head.commit}")
+        parent_commits = repo.head.commit.parents
+        for commit in parent_commits:
+            print(f"Parent commit: {commit}")
+        return get_diff(repo, repo.head.commit, parent_commits)
+
+
+def get_diff_for_doctesting(repo: Repo, base_commit: str, commits: List[str]) -> List[str]:
+    """
+    Get the diff in doc examples between a base commit and one or several commits.
+
+    Args:
+        repo (`git.Repo`):
+            A git repository (for instance the Transformers repo).
+        base_commit (`str`):
+            The commit reference of where to compare for the diff. This is the current commit, not the branching point!
+        commits (`List[str]`):
+            The list of commits with which to compare the repo at `base_commit` (so the branching point).
+
+    Returns:
+        `List[str]`: The list of Python and Markdown files with a diff (files added or renamed are always returned, files
+        modified are returned if the diff in the file is only in doctest examples).
+    """
+    print("\n### DIFF ###\n")
+    code_diff = []
+    for commit in commits:
+        for diff_obj in commit.diff(base_commit):
+            # We only consider Python files and doc files.
+            if not diff_obj.b_path.endswith(".py") and not diff_obj.b_path.endswith(".md"):
+                continue
+            # We always add new python/md files
+            if diff_obj.change_type in ["A"]:
+                code_diff.append(diff_obj.b_path)
+            # Now for modified files
+            elif diff_obj.change_type in ["M", "R"]:
+                # In case of renames, we'll look at the tests using both the old and new name.
+                if diff_obj.a_path != diff_obj.b_path:
+                    code_diff.extend([diff_obj.a_path, diff_obj.b_path])
+                else:
+                    # Otherwise, we check modifications contain some doc example(s).
+                    if diff_contains_doc_examples(repo, commit, diff_obj.b_path):
+                        code_diff.append(diff_obj.a_path)
+                    else:
+                        print(f"Ignoring diff in {diff_obj.b_path} as it doesn't contain any doc example.")
+
+    return code_diff
+
+
+def get_all_doctest_files() -> List[str]:
+    """
+    Return the complete list of python and Markdown files on which we run doctest.
+
+    At this moment, we restrict this to only take files from `src/` or `docs/source/en/` that are not in `utils/not_doctested.txt`.
+
+    Returns:
+        `List[str]`: The complete list of Python and Markdown files on which we run doctest.
+    """
+    py_files = [str(x.relative_to(PATH_TO_REPO)) for x in PATH_TO_REPO.glob("**/*.py")]
+    md_files = [str(x.relative_to(PATH_TO_REPO)) for x in PATH_TO_REPO.glob("**/*.md")]
+    test_files_to_run = py_files + md_files
+
+    # only include files in `src` or `docs/source/en/`
+    test_files_to_run = [x for x in test_files_to_run if x.startswith(("src/", "docs/source/en/"))]
+    # not include init files
+    test_files_to_run = [x for x in test_files_to_run if not x.endswith(("__init__.py",))]
+
+    # These are files not doctested yet.
+    with open("utils/not_doctested.txt") as fp:
+        not_doctested = {x.split(" ")[0] for x in fp.read().strip().split("\n")}
+
+    # So far we don't have 100% coverage for doctest. This line will be removed once we achieve 100%.
+    test_files_to_run = [x for x in test_files_to_run if x not in not_doctested]
+
+    return sorted(test_files_to_run)
+
+
+def get_new_doctest_files(repo, base_commit, branching_commit) -> List[str]:
+    """
+    Get the list of files that were removed from "utils/not_doctested.txt", between `base_commit` and
+    `branching_commit`.
+
+    Returns:
+        `List[str]`: List of files that were removed from "utils/not_doctested.txt".
+    """
+    for diff_obj in branching_commit.diff(base_commit):
+        # Ignores all but the "utils/not_doctested.txt" file.
+        if diff_obj.a_path != "utils/not_doctested.txt":
+            continue
+        # Loads the two versions
+        folder = Path(repo.working_dir)
+        with checkout_commit(repo, branching_commit):
+            with open(folder / "utils/not_doctested.txt", "r", encoding="utf-8") as f:
+                old_content = f.read()
+        with open(folder / "utils/not_doctested.txt", "r", encoding="utf-8") as f:
+            new_content = f.read()
+        # Compute the removed lines and return them
+        removed_content = {x.split(" ")[0] for x in old_content.split("\n")} - {
+            x.split(" ")[0] for x in new_content.split("\n")
+        }
+        return sorted(removed_content)
+    return []
+
+
+def get_doctest_files(diff_with_last_commit: bool = False) -> List[str]:
+    """
+    Return a list of python and Markdown files where doc example have been modified between:
+
+    - the current head and the main branch if `diff_with_last_commit=False` (default)
+    - the current head and its parent commit otherwise.
+
+    Returns:
+        `List[str]`: The list of Python and Markdown files with a diff (files added or renamed are always returned, files
+        modified are returned if the diff in the file is only in doctest examples).
+    """
+    repo = Repo(PATH_TO_REPO)
+
+    test_files_to_run = []  # noqa
+    if not diff_with_last_commit:
+        upstream_main = repo.remotes.origin.refs.main
+        print(f"main is at {upstream_main.commit}")
+        print(f"Current head is at {repo.head.commit}")
+
+        branching_commits = repo.merge_base(upstream_main, repo.head)
+        for commit in branching_commits:
+            print(f"Branching commit: {commit}")
+        test_files_to_run = get_diff_for_doctesting(repo, repo.head.commit, branching_commits)
+    else:
+        print(f"main is at {repo.head.commit}")
+        parent_commits = repo.head.commit.parents
+        for commit in parent_commits:
+            print(f"Parent commit: {commit}")
+        test_files_to_run = get_diff_for_doctesting(repo, repo.head.commit, parent_commits)
+
+    all_test_files_to_run = get_all_doctest_files()
+
+    # Add to the test files to run any removed entry from "utils/not_doctested.txt".
+    new_test_files = get_new_doctest_files(repo, repo.head.commit, upstream_main.commit)
+    test_files_to_run = list(set(test_files_to_run + new_test_files))
+
+    # Do not run slow doctest tests on CircleCI
+    with open("utils/slow_documentation_tests.txt") as fp:
+        slow_documentation_tests = set(fp.read().strip().split("\n"))
+    test_files_to_run = [
+        x for x in test_files_to_run if x in all_test_files_to_run and x not in slow_documentation_tests
+    ]
+
+    # Make sure we did not end up with a test file that was removed
+    test_files_to_run = [f for f in test_files_to_run if (PATH_TO_REPO / f).exists()]
+
+    return sorted(test_files_to_run)
+
+
+# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line.
+# \s*from\s+(\.+\S+)\s+import\s+([^\n]+) -> Line only contains from .xxx import yyy and we catch .xxx and yyy
+# (?=\n) -> Look-ahead to a new line. We can't just put \n here or using find_all on this re will only catch every
+#           other import.
+_re_single_line_relative_imports = re.compile(r"(?:^|\n)\s*from\s+(\.+\S+)\s+import\s+([^\n]+)(?=\n)")
+# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line.
+# \s*from\s+(\.+\S+)\s+import\s+\(([^\)]+)\) -> Line continues with from .xxx import (yyy) and we catch .xxx and yyy
+# yyy will take multiple lines otherwise there wouldn't be parenthesis.
+_re_multi_line_relative_imports = re.compile(r"(?:^|\n)\s*from\s+(\.+\S+)\s+import\s+\(([^\)]+)\)")
+# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line.
+# \s*from\s+transformers(\S*)\s+import\s+([^\n]+) -> Line only contains from transformers.xxx import yyy and we catch
+#           .xxx and yyy
+# (?=\n) -> Look-ahead to a new line. We can't just put \n here or using find_all on this re will only catch every
+#           other import.
+_re_single_line_direct_imports = re.compile(r"(?:^|\n)\s*from\s+diffusers(\S*)\s+import\s+([^\n]+)(?=\n)")
+# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line.
+# \s*from\s+transformers(\S*)\s+import\s+\(([^\)]+)\) -> Line continues with from transformers.xxx import (yyy) and we
+# catch .xxx and yyy. yyy will take multiple lines otherwise there wouldn't be parenthesis.
+_re_multi_line_direct_imports = re.compile(r"(?:^|\n)\s*from\s+diffusers(\S*)\s+import\s+\(([^\)]+)\)")
+
+
+def extract_imports(module_fname: str, cache: Dict[str, List[str]] = None) -> List[str]:
+    """
+    Get the imports a given module makes.
+
+    Args:
+        module_fname (`str`):
+            The name of the file of the module where we want to look at the imports (given relative to the root of
+            the repo).
+        cache (Dictionary `str` to `List[str]`, *optional*):
+            To speed up this function if it was previously called on `module_fname`, the cache of all previously
+            computed results.
+
+    Returns:
+        `List[str]`: The list of module filenames imported in the input `module_fname` (a submodule we import from that
+        is a subfolder will give its init file).
+    """
+    if cache is not None and module_fname in cache:
+        return cache[module_fname]
+
+    with open(PATH_TO_REPO / module_fname, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Filter out all docstrings to not get imports in code examples. As before we need to deactivate formatting to
+    # keep this as escaped quotes and avoid this function failing on this file.
+    # fmt: off
+    splits = content.split('\"\"\"')
+    # fmt: on
+    content = "".join(splits[::2])
+
+    module_parts = str(module_fname).split(os.path.sep)
+    imported_modules = []
+
+    # Let's start with relative imports
+    relative_imports = _re_single_line_relative_imports.findall(content)
+    relative_imports = [
+        (mod, imp) for mod, imp in relative_imports if "# tests_ignore" not in imp and imp.strip() != "("
+    ]
+    multiline_relative_imports = _re_multi_line_relative_imports.findall(content)
+    relative_imports += [(mod, imp) for mod, imp in multiline_relative_imports if "# tests_ignore" not in imp]
+
+    # We need to remove parts of the module name depending on the depth of the relative imports.
+    for module, imports in relative_imports:
+        level = 0
+        while module.startswith("."):
+            module = module[1:]
+            level += 1
+
+        if len(module) > 0:
+            dep_parts = module_parts[: len(module_parts) - level] + module.split(".")
+        else:
+            dep_parts = module_parts[: len(module_parts) - level]
+        imported_module = os.path.sep.join(dep_parts)
+        imported_modules.append((imported_module, [imp.strip() for imp in imports.split(",")]))
+
+    # Let's continue with direct imports
+    direct_imports = _re_single_line_direct_imports.findall(content)
+    direct_imports = [(mod, imp) for mod, imp in direct_imports if "# tests_ignore" not in imp and imp.strip() != "("]
+    multiline_direct_imports = _re_multi_line_direct_imports.findall(content)
+    direct_imports += [(mod, imp) for mod, imp in multiline_direct_imports if "# tests_ignore" not in imp]
+
+    # We need to find the relative path of those imports.
+    for module, imports in direct_imports:
+        import_parts = module.split(".")[1:]  # ignore the name of the repo since we add it below.
+        dep_parts = ["src", "diffusers"] + import_parts
+        imported_module = os.path.sep.join(dep_parts)
+        imported_modules.append((imported_module, [imp.strip() for imp in imports.split(",")]))
+
+    result = []
+    # Double check we get proper modules (either a python file or a folder with an init).
+    for module_file, imports in imported_modules:
+        if (PATH_TO_REPO / f"{module_file}.py").is_file():
+            module_file = f"{module_file}.py"
+        elif (PATH_TO_REPO / module_file).is_dir() and (PATH_TO_REPO / module_file / "__init__.py").is_file():
+            module_file = os.path.sep.join([module_file, "__init__.py"])
+        imports = [imp for imp in imports if len(imp) > 0 and re.match("^[A-Za-z0-9_]*$", imp)]
+        if len(imports) > 0:
+            result.append((module_file, imports))
+
+    if cache is not None:
+        cache[module_fname] = result
+
+    return result
+
+
+def get_module_dependencies(module_fname: str, cache: Dict[str, List[str]] = None) -> List[str]:
+    """
+    Refines the result of `extract_imports` to remove subfolders and get a proper list of module filenames: if a file
+    as an import `from utils import Foo, Bar`, with `utils` being a subfolder containing many files, this will traverse
+    the `utils` init file to check where those dependencies come from: for instance the files utils/foo.py and utils/bar.py.
+
+    Warning: This presupposes that all intermediate inits are properly built (with imports from the respective
+    submodules) and work better if objects are defined in submodules and not the intermediate init (otherwise the
+    intermediate init is added, and inits usually have a lot of dependencies).
+
+    Args:
+        module_fname (`str`):
+            The name of the file of the module where we want to look at the imports (given relative to the root of
+            the repo).
+        cache (Dictionary `str` to `List[str]`, *optional*):
+            To speed up this function if it was previously called on `module_fname`, the cache of all previously
+            computed results.
+
+    Returns:
+        `List[str]`: The list of module filenames imported in the input `module_fname` (with submodule imports refined).
+    """
+    dependencies = []
+    imported_modules = extract_imports(module_fname, cache=cache)
+    # The while loop is to recursively traverse all inits we may encounter: we will add things as we go.
+    while len(imported_modules) > 0:
+        new_modules = []
+        for module, imports in imported_modules:
+            # If we end up in an __init__ we are often not actually importing from this init (except in the case where
+            # the object is fully defined in the __init__)
+            if module.endswith("__init__.py"):
+                # So we get the imports from that init then try to find where our objects come from.
+                new_imported_modules = extract_imports(module, cache=cache)
+                for new_module, new_imports in new_imported_modules:
+                    if any(i in new_imports for i in imports):
+                        if new_module not in dependencies:
+                            new_modules.append((new_module, [i for i in new_imports if i in imports]))
+                        imports = [i for i in imports if i not in new_imports]
+                if len(imports) > 0:
+                    # If there are any objects lefts, they may be a submodule
+                    path_to_module = PATH_TO_REPO / module.replace("__init__.py", "")
+                    dependencies.extend(
+                        [
+                            os.path.join(module.replace("__init__.py", ""), f"{i}.py")
+                            for i in imports
+                            if (path_to_module / f"{i}.py").is_file()
+                        ]
+                    )
+                    imports = [i for i in imports if not (path_to_module / f"{i}.py").is_file()]
+                    if len(imports) > 0:
+                        # Then if there are still objects left, they are fully defined in the init, so we keep it as a
+                        # dependency.
+                        dependencies.append(module)
+            else:
+                dependencies.append(module)
+
+        imported_modules = new_modules
+
+    return dependencies
+
+
+def create_reverse_dependency_tree() -> List[Tuple[str, str]]:
+    """
+    Create a list of all edges (a, b) which mean that modifying a impacts b with a going over all module and test files.
+    """
+    cache = {}
+    all_modules = list(PATH_TO_DIFFUSERS.glob("**/*.py")) + list(PATH_TO_TESTS.glob("**/*.py"))
+    all_modules = [str(mod.relative_to(PATH_TO_REPO)) for mod in all_modules]
+    edges = [(dep, mod) for mod in all_modules for dep in get_module_dependencies(mod, cache=cache)]
+
+    return list(set(edges))
+
+
+def get_tree_starting_at(module: str, edges: List[Tuple[str, str]]) -> List[Union[str, List[str]]]:
+    """
+    Returns the tree starting at a given module following all edges.
+
+    Args:
+        module (`str`): The module that will be the root of the subtree we want.
+        edges (`List[Tuple[str, str]]`): The list of all edges of the tree.
+
+    Returns:
+        `List[Union[str, List[str]]]`: The tree to print in the following format: [module, [list of edges
+        starting at module], [list of edges starting at the preceding level], ...]
+    """
+    vertices_seen = [module]
+    new_edges = [edge for edge in edges if edge[0] == module and edge[1] != module and "__init__.py" not in edge[1]]
+    tree = [module]
+    while len(new_edges) > 0:
+        tree.append(new_edges)
+        final_vertices = list({edge[1] for edge in new_edges})
+        vertices_seen.extend(final_vertices)
+        new_edges = [
+            edge
+            for edge in edges
+            if edge[0] in final_vertices and edge[1] not in vertices_seen and "__init__.py" not in edge[1]
+        ]
+
+    return tree
+
+
+def print_tree_deps_of(module, all_edges=None):
+    """
+    Prints the tree of modules depending on a given module.
+
+    Args:
+        module (`str`): The module that will be the root of the subtree we want.
+        all_edges (`List[Tuple[str, str]]`, *optional*):
+            The list of all edges of the tree. Will be set to `create_reverse_dependency_tree()` if not passed.
+    """
+    if all_edges is None:
+        all_edges = create_reverse_dependency_tree()
+    tree = get_tree_starting_at(module, all_edges)
+
+    # The list of lines is a list of tuples (line_to_be_printed, module)
+    # Keeping the modules lets us know where to insert each new lines in the list.
+    lines = [(tree[0], tree[0])]
+    for index in range(1, len(tree)):
+        edges = tree[index]
+        start_edges = {edge[0] for edge in edges}
+
+        for start in start_edges:
+            end_edges = {edge[1] for edge in edges if edge[0] == start}
+            # We will insert all those edges just after the line showing start.
+            pos = 0
+            while lines[pos][1] != start:
+                pos += 1
+            lines = lines[: pos + 1] + [(" " * (2 * index) + end, end) for end in end_edges] + lines[pos + 1 :]
+
+    for line in lines:
+        # We don't print the refs that where just here to help build lines.
+        print(line[0])
+
+
+def init_test_examples_dependencies() -> Tuple[Dict[str, List[str]], List[str]]:
+    """
+    The test examples do not import from the examples (which are just scripts, not modules) so we need some extra
+    care initializing the dependency map, which is the goal of this function. It initializes the dependency map for
+    example files by linking each example to the example test file for the example framework.
+
+    Returns:
+        `Tuple[Dict[str, List[str]], List[str]]`: A tuple with two elements: the initialized dependency map which is a
+        dict test example file to list of example files potentially tested by that test file, and the list of all
+        example files (to avoid recomputing it later).
+    """
+    test_example_deps = {}
+    all_examples = []
+    for framework in ["flax", "pytorch", "tensorflow"]:
+        test_files = list((PATH_TO_EXAMPLES / framework).glob("test_*.py"))
+        all_examples.extend(test_files)
+        # Remove the files at the root of examples/framework since they are not proper examples (they are either utils
+        # or example test files).
+        examples = [
+            f for f in (PATH_TO_EXAMPLES / framework).glob("**/*.py") if f.parent != PATH_TO_EXAMPLES / framework
+        ]
+        all_examples.extend(examples)
+        for test_file in test_files:
+            with open(test_file, "r", encoding="utf-8") as f:
+                content = f.read()
+            # Map all examples to the test files found in examples/framework.
+            test_example_deps[str(test_file.relative_to(PATH_TO_REPO))] = [
+                str(e.relative_to(PATH_TO_REPO)) for e in examples if e.name in content
+            ]
+            # Also map the test files to themselves.
+            test_example_deps[str(test_file.relative_to(PATH_TO_REPO))].append(
+                str(test_file.relative_to(PATH_TO_REPO))
+            )
+    return test_example_deps, all_examples
+
+
+def create_reverse_dependency_map() -> Dict[str, List[str]]:
+    """
+    Create the dependency map from module/test filename to the list of modules/tests that depend on it recursively.
+
+    Returns:
+        `Dict[str, List[str]]`: The reverse dependency map as a dictionary mapping filenames to all the filenames
+        depending on it recursively. This way the tests impacted by a change in file A are the test files in the list
+        corresponding to key A in this result.
+    """
+    cache = {}
+    # Start from the example deps init.
+    example_deps, examples = init_test_examples_dependencies()
+    # Add all modules and all tests to all examples
+    all_modules = list(PATH_TO_DIFFUSERS.glob("**/*.py")) + list(PATH_TO_TESTS.glob("**/*.py")) + examples
+    all_modules = [str(mod.relative_to(PATH_TO_REPO)) for mod in all_modules]
+    # Compute the direct dependencies of all modules.
+    direct_deps = {m: get_module_dependencies(m, cache=cache) for m in all_modules}
+    direct_deps.update(example_deps)
+
+    # This recurses the dependencies
+    something_changed = True
+    while something_changed:
+        something_changed = False
+        for m in all_modules:
+            for d in direct_deps[m]:
+                # We stop recursing at an init (cause we always end up in the main init and we don't want to add all
+                # files which the main init imports)
+                if d.endswith("__init__.py"):
+                    continue
+                if d not in direct_deps:
+                    raise ValueError(f"KeyError:{d}. From {m}")
+                new_deps = set(direct_deps[d]) - set(direct_deps[m])
+                if len(new_deps) > 0:
+                    direct_deps[m].extend(list(new_deps))
+                    something_changed = True
+
+    # Finally we can build the reverse map.
+    reverse_map = collections.defaultdict(list)
+    for m in all_modules:
+        for d in direct_deps[m]:
+            reverse_map[d].append(m)
+
+    # For inits, we don't do the reverse deps but the direct deps: if modifying an init, we want to make sure we test
+    # all the modules impacted by that init.
+    for m in [f for f in all_modules if f.endswith("__init__.py")]:
+        direct_deps = get_module_dependencies(m, cache=cache)
+        deps = sum([reverse_map[d] for d in direct_deps if not d.endswith("__init__.py")], direct_deps)
+        reverse_map[m] = list(set(deps) - {m})
+
+    return reverse_map
+
+
+def create_module_to_test_map(reverse_map: Dict[str, List[str]] = None) -> Dict[str, List[str]]:
+    """
+    Extract the tests from the reverse_dependency_map and potentially filters the model tests.
+
+    Args:
+        reverse_map (`Dict[str, List[str]]`, *optional*):
+            The reverse dependency map as created by `create_reverse_dependency_map`. Will default to the result of
+            that function if not provided.
+        filter_pipelines (`bool`, *optional*, defaults to `False`):
+            Whether or not to filter pipeline tests to only include core pipelines if a file impacts a lot of models.
+
+    Returns:
+        `Dict[str, List[str]]`: A dictionary that maps each file to the tests to execute if that file was modified.
+    """
+    if reverse_map is None:
+        reverse_map = create_reverse_dependency_map()
+
+    # Utility that tells us if a given file is a test (taking test examples into account)
+    def is_test(fname):
+        if fname.startswith("tests"):
+            return True
+        if fname.startswith("examples") and fname.split(os.path.sep)[-1].startswith("test"):
+            return True
+        return False
+
+    # Build the test map
+    test_map = {module: [f for f in deps if is_test(f)] for module, deps in reverse_map.items()}
+
+    return test_map
+
+
+def check_imports_all_exist():
+    """
+    Isn't used per se by the test fetcher but might be used later as a quality check. Putting this here for now so the
+    code is not lost. This checks all imports in a given file do exist.
+    """
+    cache = {}
+    all_modules = list(PATH_TO_DIFFUSERS.glob("**/*.py")) + list(PATH_TO_TESTS.glob("**/*.py"))
+    all_modules = [str(mod.relative_to(PATH_TO_REPO)) for mod in all_modules]
+    direct_deps = {m: get_module_dependencies(m, cache=cache) for m in all_modules}
+
+    for module, deps in direct_deps.items():
+        for dep in deps:
+            if not (PATH_TO_REPO / dep).is_file():
+                print(f"{module} has dependency on {dep} which does not exist.")
+
+
+def _print_list(l) -> str:
+    """
+    Pretty print a list of elements with one line per element and a - starting each line.
+    """
+    return "\n".join([f"- {f}" for f in l])
+
+
+def update_test_map_with_core_pipelines(json_output_file: str):
+    print(f"\n### ADD CORE PIPELINE TESTS ###\n{_print_list(IMPORTANT_PIPELINES)}")
+    with open(json_output_file, "rb") as fp:
+        test_map = json.load(fp)
+
+    # Add core pipelines as their own test group
+    test_map["core_pipelines"] = " ".join(
+        sorted([str(PATH_TO_TESTS / f"pipelines/{pipe}") for pipe in IMPORTANT_PIPELINES])
+    )
+
+    # If there are no existing pipeline tests save the map
+    if "pipelines" not in test_map:
+        with open(json_output_file, "w", encoding="UTF-8") as fp:
+            json.dump(test_map, fp, ensure_ascii=False)
+
+    pipeline_tests = test_map.pop("pipelines")
+    pipeline_tests = pipeline_tests.split(" ")
+
+    # Remove core pipeline tests from the fetched pipeline tests
+    updated_pipeline_tests = []
+    for pipe in pipeline_tests:
+        if pipe == "tests/pipelines" or Path(pipe).parts[2] in IMPORTANT_PIPELINES:
+            continue
+        updated_pipeline_tests.append(pipe)
+
+    if len(updated_pipeline_tests) > 0:
+        test_map["pipelines"] = " ".join(sorted(updated_pipeline_tests))
+
+    with open(json_output_file, "w", encoding="UTF-8") as fp:
+        json.dump(test_map, fp, ensure_ascii=False)
+
+
+def create_json_map(test_files_to_run: List[str], json_output_file: Optional[str] = None):
+    """
+    Creates a map from a list of tests to run to easily split them by category, when running parallelism of slow tests.
+
+    Args:
+        test_files_to_run (`List[str]`): The list of tests to run.
+        json_output_file (`str`): The path where to store the built json map.
+    """
+    if json_output_file is None:
+        return
+
+    test_map = {}
+    for test_file in test_files_to_run:
+        # `test_file` is a path to a test folder/file, starting with `tests/`. For example,
+        #   - `tests/models/bert/test_modeling_bert.py` or `tests/models/bert`
+        #   - `tests/trainer/test_trainer.py` or `tests/trainer`
+        #   - `tests/test_modeling_common.py`
+        names = test_file.split(os.path.sep)
+        module = names[1]
+        if module in MODULES_TO_IGNORE:
+            continue
+
+        if len(names) > 2 or not test_file.endswith(".py"):
+            # test folders under `tests` or python files under them
+            # take the part like tokenization, `pipeline`, etc. for other test categories
+            key = os.path.sep.join(names[1:2])
+        else:
+            # common test files directly under `tests/`
+            key = "common"
+
+        if key not in test_map:
+            test_map[key] = []
+        test_map[key].append(test_file)
+
+    # sort the keys & values
+    keys = sorted(test_map.keys())
+    test_map = {k: " ".join(sorted(test_map[k])) for k in keys}
+
+    with open(json_output_file, "w", encoding="UTF-8") as fp:
+        json.dump(test_map, fp, ensure_ascii=False)
+
+
+def infer_tests_to_run(
+    output_file: str,
+    diff_with_last_commit: bool = False,
+    json_output_file: Optional[str] = None,
+):
+    """
+    The main function called by the test fetcher. Determines the tests to run from the diff.
+
+    Args:
+        output_file (`str`):
+            The path where to store the summary of the test fetcher analysis. Other files will be stored in the same
+            folder:
+
+            - examples_test_list.txt: The list of examples tests to run.
+            - test_repo_utils.txt: Will indicate if the repo utils tests should be run or not.
+            - doctest_list.txt: The list of doctests to run.
+
+        diff_with_last_commit (`bool`, *optional*, defaults to `False`):
+            Whether to analyze the diff with the last commit (for use on the main branch after a PR is merged) or with
+            the branching point from main (for use on each PR).
+        filter_models (`bool`, *optional*, defaults to `True`):
+            Whether or not to filter the tests to core models only, when a file modified results in a lot of model
+            tests.
+        json_output_file (`str`, *optional*):
+            The path where to store the json file mapping categories of tests to tests to run (used for parallelism or
+            the slow tests).
+    """
+    modified_files = get_modified_python_files(diff_with_last_commit=diff_with_last_commit)
+    print(f"\n### MODIFIED FILES ###\n{_print_list(modified_files)}")
+    # Create the map that will give us all impacted modules.
+    reverse_map = create_reverse_dependency_map()
+    impacted_files = modified_files.copy()
+    for f in modified_files:
+        if f in reverse_map:
+            impacted_files.extend(reverse_map[f])
+
+    # Remove duplicates
+    impacted_files = sorted(set(impacted_files))
+    print(f"\n### IMPACTED FILES ###\n{_print_list(impacted_files)}")
+
+    # Grab the corresponding test files:
+    if any(x in modified_files for x in ["setup.py"]):
+        test_files_to_run = ["tests", "examples"]
+
+    # in order to trigger pipeline tests even if no code change at all
+    if "tests/utils/tiny_model_summary.json" in modified_files:
+        test_files_to_run = ["tests"]
+        any(f.split(os.path.sep)[0] == "utils" for f in modified_files)
+    else:
+        # All modified tests need to be run.
+        test_files_to_run = [
+            f for f in modified_files if f.startswith("tests") and f.split(os.path.sep)[-1].startswith("test")
+        ]
+        # Then we grab the corresponding test files.
+        test_map = create_module_to_test_map(reverse_map=reverse_map)
+        for f in modified_files:
+            if f in test_map:
+                test_files_to_run.extend(test_map[f])
+        test_files_to_run = sorted(set(test_files_to_run))
+        # Make sure we did not end up with a test file that was removed
+        test_files_to_run = [f for f in test_files_to_run if (PATH_TO_REPO / f).exists()]
+
+        any(f.split(os.path.sep)[0] == "utils" for f in modified_files)
+
+    examples_tests_to_run = [f for f in test_files_to_run if f.startswith("examples")]
+    test_files_to_run = [f for f in test_files_to_run if not f.startswith("examples")]
+    print(f"\n### TEST TO RUN ###\n{_print_list(test_files_to_run)}")
+    if len(test_files_to_run) > 0:
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(" ".join(test_files_to_run))
+
+        # Create a map that maps test categories to test files, i.e. `models/bert` -> [...test_modeling_bert.py, ...]
+
+        # Get all test directories (and some common test files) under `tests` and `tests/models` if `test_files_to_run`
+        # contains `tests` (i.e. when `setup.py` is changed).
+        if "tests" in test_files_to_run:
+            test_files_to_run = get_all_tests()
+
+        create_json_map(test_files_to_run, json_output_file)
+
+    print(f"\n### EXAMPLES TEST TO RUN ###\n{_print_list(examples_tests_to_run)}")
+    if len(examples_tests_to_run) > 0:
+        # We use `all` in the case `commit_flags["test_all"]` as well as in `create_circleci_config.py` for processing
+        if examples_tests_to_run == ["examples"]:
+            examples_tests_to_run = ["all"]
+        example_file = Path(output_file).parent / "examples_test_list.txt"
+        with open(example_file, "w", encoding="utf-8") as f:
+            f.write(" ".join(examples_tests_to_run))
+
+
+def filter_tests(output_file: str, filters: List[str]):
+    """
+    Reads the content of the output file and filters out all the tests in a list of given folders.
+
+    Args:
+        output_file (`str` or `os.PathLike`): The path to the output file of the tests fetcher.
+        filters (`List[str]`): A list of folders to filter.
+    """
+    if not os.path.isfile(output_file):
+        print("No test file found.")
+        return
+    with open(output_file, "r", encoding="utf-8") as f:
+        test_files = f.read().split(" ")
+
+    if len(test_files) == 0 or test_files == [""]:
+        print("No tests to filter.")
+        return
+
+    if test_files == ["tests"]:
+        test_files = [os.path.join("tests", f) for f in os.listdir("tests") if f not in ["__init__.py"] + filters]
+    else:
+        test_files = [f for f in test_files if f.split(os.path.sep)[1] not in filters]
+
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write(" ".join(test_files))
+
+
+def parse_commit_message(commit_message: str) -> Dict[str, bool]:
+    """
+    Parses the commit message to detect if a command is there to skip, force all or part of the CI.
+
+    Args:
+        commit_message (`str`): The commit message of the current commit.
+
+    Returns:
+        `Dict[str, bool]`: A dictionary of strings to bools with keys the following keys: `"skip"`,
+        `"test_all_models"` and `"test_all"`.
+    """
+    if commit_message is None:
+        return {"skip": False, "no_filter": False, "test_all": False}
+
+    command_search = re.search(r"\[([^\]]*)\]", commit_message)
+    if command_search is not None:
+        command = command_search.groups()[0]
+        command = command.lower().replace("-", " ").replace("_", " ")
+        skip = command in ["ci skip", "skip ci", "circleci skip", "skip circleci"]
+        no_filter = set(command.split(" ")) == {"no", "filter"}
+        test_all = set(command.split(" ")) == {"test", "all"}
+        return {"skip": skip, "no_filter": no_filter, "test_all": test_all}
+    else:
+        return {"skip": False, "no_filter": False, "test_all": False}
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output_file", type=str, default="test_list.txt", help="Where to store the list of tests to run"
+    )
+    parser.add_argument(
+        "--json_output_file",
+        type=str,
+        default="test_map.json",
+        help="Where to store the tests to run in a dictionary format mapping test categories to test files",
+    )
+    parser.add_argument(
+        "--diff_with_last_commit",
+        action="store_true",
+        help="To fetch the tests between the current commit and the last commit",
+    )
+    parser.add_argument(
+        "--filter_tests",
+        action="store_true",
+        help="Will filter the pipeline/repo utils tests outside of the generated list of tests.",
+    )
+    parser.add_argument(
+        "--print_dependencies_of",
+        type=str,
+        help="Will only print the tree of modules depending on the file passed.",
+        default=None,
+    )
+    parser.add_argument(
+        "--commit_message",
+        type=str,
+        help="The commit message (which could contain a command to force all tests or skip the CI).",
+        default=None,
+    )
+    args = parser.parse_args()
+    if args.print_dependencies_of is not None:
+        print_tree_deps_of(args.print_dependencies_of)
+    else:
+        repo = Repo(PATH_TO_REPO)
+        commit_message = repo.head.commit.message
+        commit_flags = parse_commit_message(commit_message)
+        if commit_flags["skip"]:
+            print("Force-skipping the CI")
+            quit()
+        if commit_flags["no_filter"]:
+            print("Running all tests fetched without filtering.")
+        if commit_flags["test_all"]:
+            print("Force-launching all tests")
+
+        diff_with_last_commit = args.diff_with_last_commit
+        if not diff_with_last_commit and not repo.head.is_detached and repo.head.ref == repo.refs.main:
+            print("main branch detected, fetching tests against last commit.")
+            diff_with_last_commit = True
+
+        if not commit_flags["test_all"]:
+            try:
+                infer_tests_to_run(
+                    args.output_file,
+                    diff_with_last_commit=diff_with_last_commit,
+                    json_output_file=args.json_output_file,
+                )
+                filter_tests(args.output_file, ["repo_utils"])
+                update_test_map_with_core_pipelines(json_output_file=args.json_output_file)
+
+            except Exception as e:
+                print(f"\nError when trying to grab the relevant tests: {e}\n\nRunning all tests.")
+                commit_flags["test_all"] = True
+
+        if commit_flags["test_all"]:
+            with open(args.output_file, "w", encoding="utf-8") as f:
+                f.write("tests")
+            example_file = Path(args.output_file).parent / "examples_test_list.txt"
+            with open(example_file, "w", encoding="utf-8") as f:
+                f.write("all")
+
+            test_files_to_run = get_all_tests()
+            create_json_map(test_files_to_run, args.json_output_file)
+            update_test_map_with_core_pipelines(json_output_file=args.json_output_file)
diff --git a/diffusers/utils/update_metadata.py b/diffusers/utils/update_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..103a2b9ab0cc1036f1eaa731f1a9500ab0396e03
--- /dev/null
+++ b/diffusers/utils/update_metadata.py
@@ -0,0 +1,126 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utility that updates the metadata of the Diffusers library in the repository `huggingface/diffusers-metadata`.
+
+Usage for an update (as used by the GitHub action `update_metadata`):
+
+```bash
+python utils/update_metadata.py
+```
+
+Script modified from:
+https://github.com/huggingface/transformers/blob/main/utils/update_metadata.py
+"""
+
+import argparse
+import os
+import tempfile
+
+import pandas as pd
+from datasets import Dataset
+from huggingface_hub import hf_hub_download, upload_folder
+
+from diffusers.pipelines.auto_pipeline import (
+    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
+    AUTO_INPAINT_PIPELINES_MAPPING,
+    AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
+)
+
+
+PIPELINE_TAG_JSON = "pipeline_tags.json"
+
+
+def get_supported_pipeline_table() -> dict:
+    """
+    Generates a dictionary containing the supported auto classes for each pipeline type,
+    using the content of the auto modules.
+    """
+    # All supported pipelines for automatic mapping.
+    all_supported_pipeline_classes = [
+        (class_name.__name__, "text-to-image", "AutoPipelineForText2Image")
+        for _, class_name in AUTO_TEXT2IMAGE_PIPELINES_MAPPING.items()
+    ]
+    all_supported_pipeline_classes += [
+        (class_name.__name__, "image-to-image", "AutoPipelineForImage2Image")
+        for _, class_name in AUTO_IMAGE2IMAGE_PIPELINES_MAPPING.items()
+    ]
+    all_supported_pipeline_classes += [
+        (class_name.__name__, "image-to-image", "AutoPipelineForInpainting")
+        for _, class_name in AUTO_INPAINT_PIPELINES_MAPPING.items()
+    ]
+    all_supported_pipeline_classes = list(set(all_supported_pipeline_classes))
+    all_supported_pipeline_classes.sort(key=lambda x: x[0])
+
+    data = {}
+    data["pipeline_class"] = [sample[0] for sample in all_supported_pipeline_classes]
+    data["pipeline_tag"] = [sample[1] for sample in all_supported_pipeline_classes]
+    data["auto_class"] = [sample[2] for sample in all_supported_pipeline_classes]
+
+    return data
+
+
+def update_metadata(commit_sha: str):
+    """
+    Update the metadata for the Diffusers repo in `huggingface/diffusers-metadata`.
+
+    Args:
+        commit_sha (`str`): The commit SHA on Diffusers corresponding to this update.
+    """
+    pipelines_table = get_supported_pipeline_table()
+    pipelines_table = pd.DataFrame(pipelines_table)
+    pipelines_dataset = Dataset.from_pandas(pipelines_table)
+
+    hub_pipeline_tags_json = hf_hub_download(
+        repo_id="huggingface/diffusers-metadata",
+        filename=PIPELINE_TAG_JSON,
+        repo_type="dataset",
+    )
+    with open(hub_pipeline_tags_json) as f:
+        hub_pipeline_tags_json = f.read()
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        pipelines_dataset.to_json(os.path.join(tmp_dir, PIPELINE_TAG_JSON))
+
+        with open(os.path.join(tmp_dir, PIPELINE_TAG_JSON)) as f:
+            pipeline_tags_json = f.read()
+
+        hub_pipeline_tags_equal = hub_pipeline_tags_json == pipeline_tags_json
+        if hub_pipeline_tags_equal:
+            print("No updates, not pushing the metadata files.")
+            return
+
+        if commit_sha is not None:
+            commit_message = (
+                f"Update with commit {commit_sha}\n\nSee: "
+                f"https://github.com/huggingface/diffusers/commit/{commit_sha}"
+            )
+        else:
+            commit_message = "Update"
+
+        upload_folder(
+            repo_id="huggingface/diffusers-metadata",
+            folder_path=tmp_dir,
+            repo_type="dataset",
+            commit_message=commit_message,
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--commit_sha", default=None, type=str, help="The sha of the commit going with this update.")
+    args = parser.parse_args()
+
+    update_metadata(args.commit_sha)
diff --git a/examples/line/example0/input.png b/examples/line/example0/input.png
new file mode 100644
index 0000000000000000000000000000000000000000..822385e68214f2cd644120fe03613625fffa407f
--- /dev/null
+++ b/examples/line/example0/input.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ea4b6995ceebcb0fd89137fd8db4810ef414e727b1647f3f6d64f6bd60aa6ff
+size 876225
diff --git a/examples/line/example0/reference_image_0.png b/examples/line/example0/reference_image_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..402cb8a120b88e7752551d34ca5908fcf56a842c
--- /dev/null
+++ b/examples/line/example0/reference_image_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8a8e2d9224dd87832fa46f56e524f26f34020a9751c30f327d3554fdcdd66b9
+size 1224885
diff --git a/examples/line/example0/reference_image_1.png b/examples/line/example0/reference_image_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..9243368e59c0dad92c04e9fa47e8b273f6a7e38a
--- /dev/null
+++ b/examples/line/example0/reference_image_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4af4fd609dd95de5253752d6e8e34187761096b0e1a6f71c65ee433dbc6a4647
+size 1135462
diff --git a/examples/line/example0/reference_image_2.png b/examples/line/example0/reference_image_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..74fe694a60dc240fae5208a36a69c376caff9408
--- /dev/null
+++ b/examples/line/example0/reference_image_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0aba47e145a704e291af4c9558601df10ff12c3566dc01f743de8244757a062
+size 1100154
diff --git a/examples/line/example1/input.png b/examples/line/example1/input.png
new file mode 100644
index 0000000000000000000000000000000000000000..3eed1deda1ff01a30e62a392513ec5a37dc41a15
--- /dev/null
+++ b/examples/line/example1/input.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8523d515b3bfec681bd5b950d05e6dec2514aa0e919cc4df854c1002624f3c01
+size 944378
diff --git a/examples/line/example1/reference_image_0.png b/examples/line/example1/reference_image_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..096def299db98cd98ffd0ff9b4a75939de5bfe8c
--- /dev/null
+++ b/examples/line/example1/reference_image_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7170d97925fa1f66ecef1dc4dba88d1e9df81db754f421043f55b4380685ae66
+size 998709
diff --git a/examples/line/example2/input.png b/examples/line/example2/input.png
new file mode 100644
index 0000000000000000000000000000000000000000..afbbba680d84e5f1733a38c09ef6fb6ae98c80d9
--- /dev/null
+++ b/examples/line/example2/input.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:774225c60774d30f33b67277cbee364cc4399a193734fccc58c6e721ba266910
+size 1287235
diff --git a/examples/line/example2/reference_image_0.png b/examples/line/example2/reference_image_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..df82c97d5f5f6a2c8d9cd592af572969f804f217
--- /dev/null
+++ b/examples/line/example2/reference_image_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0230a0eb14b05a6ba7090fe724735e238d63903e690a315b92f9aa8805151f6a
+size 1420492
diff --git a/examples/line/example2/reference_image_1.png b/examples/line/example2/reference_image_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..e431bf1f92356dd8b7d5489378c038b5139fa353
--- /dev/null
+++ b/examples/line/example2/reference_image_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4f66c5e8becdc38bb341974e0918471dacb00dc64ec44abc974cab9abd59a7e
+size 1437314
diff --git a/examples/line/example2/reference_image_2.png b/examples/line/example2/reference_image_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..2e02b78bb346b93a7225370a7d6a77ef9538f4da
--- /dev/null
+++ b/examples/line/example2/reference_image_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4c686de26db7d1fba7fb3192d7524752fd1caeb47d39f13765ae4e1dc4a51fa
+size 1408956
diff --git a/examples/line/example2/reference_image_3.png b/examples/line/example2/reference_image_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..6a8642baac6bf25c2d69e62754a375f47887b849
--- /dev/null
+++ b/examples/line/example2/reference_image_3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85575477d1f70ec649653d118b9a4474dbba756b95e5c5f4b2ccfaab8e13f1bc
+size 1351100
diff --git a/examples/line/example3/input.png b/examples/line/example3/input.png
new file mode 100644
index 0000000000000000000000000000000000000000..4a8819a22f0d54c942f792fc15cfe628bce60f9e
--- /dev/null
+++ b/examples/line/example3/input.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8a85639f85cb8b1af4c1eaa7e492658c6d32312a4b159bc2f19fb91b4d3842a
+size 1114773
diff --git a/examples/line/example3/reference_image_0.png b/examples/line/example3/reference_image_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..3c03ddacc8aaca10aa6d23edca34871af24acd01
--- /dev/null
+++ b/examples/line/example3/reference_image_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29e91526edcb799dc015a19a9ffe94c4d623e03aa25964b494b4963d584f1860
+size 1148488
diff --git a/examples/shadow/example0/input.png b/examples/shadow/example0/input.png
new file mode 100644
index 0000000000000000000000000000000000000000..afbbba680d84e5f1733a38c09ef6fb6ae98c80d9
--- /dev/null
+++ b/examples/shadow/example0/input.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:774225c60774d30f33b67277cbee364cc4399a193734fccc58c6e721ba266910
+size 1287235
diff --git a/examples/shadow/example0/reference_image_0.png b/examples/shadow/example0/reference_image_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..df82c97d5f5f6a2c8d9cd592af572969f804f217
--- /dev/null
+++ b/examples/shadow/example0/reference_image_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0230a0eb14b05a6ba7090fe724735e238d63903e690a315b92f9aa8805151f6a
+size 1420492
diff --git a/examples/shadow/example0/reference_image_1.png b/examples/shadow/example0/reference_image_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..e431bf1f92356dd8b7d5489378c038b5139fa353
--- /dev/null
+++ b/examples/shadow/example0/reference_image_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4f66c5e8becdc38bb341974e0918471dacb00dc64ec44abc974cab9abd59a7e
+size 1437314
diff --git a/examples/shadow/example0/reference_image_2.png b/examples/shadow/example0/reference_image_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..2e02b78bb346b93a7225370a7d6a77ef9538f4da
--- /dev/null
+++ b/examples/shadow/example0/reference_image_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4c686de26db7d1fba7fb3192d7524752fd1caeb47d39f13765ae4e1dc4a51fa
+size 1408956
diff --git a/examples/shadow/example0/reference_image_3.png b/examples/shadow/example0/reference_image_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..6a8642baac6bf25c2d69e62754a375f47887b849
--- /dev/null
+++ b/examples/shadow/example0/reference_image_3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85575477d1f70ec649653d118b9a4474dbba756b95e5c5f4b2ccfaab8e13f1bc
+size 1351100
diff --git a/examples/shadow/example1/input.png b/examples/shadow/example1/input.png
new file mode 100644
index 0000000000000000000000000000000000000000..6a8642baac6bf25c2d69e62754a375f47887b849
--- /dev/null
+++ b/examples/shadow/example1/input.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85575477d1f70ec649653d118b9a4474dbba756b95e5c5f4b2ccfaab8e13f1bc
+size 1351100
diff --git a/examples/shadow/example1/reference_image_0.png b/examples/shadow/example1/reference_image_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..df82c97d5f5f6a2c8d9cd592af572969f804f217
--- /dev/null
+++ b/examples/shadow/example1/reference_image_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0230a0eb14b05a6ba7090fe724735e238d63903e690a315b92f9aa8805151f6a
+size 1420492
diff --git a/examples/shadow/example1/reference_image_1.png b/examples/shadow/example1/reference_image_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..270ebb161aefb429d07e4b7c344268dcc28ed025
--- /dev/null
+++ b/examples/shadow/example1/reference_image_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfbb0699a6e669443d2022ebcd897a6b4146e5422f9b1f492a136d23dc7aad24
+size 1340874
diff --git a/examples/shadow/example1/reference_image_2.png b/examples/shadow/example1/reference_image_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..2e02b78bb346b93a7225370a7d6a77ef9538f4da
--- /dev/null
+++ b/examples/shadow/example1/reference_image_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4c686de26db7d1fba7fb3192d7524752fd1caeb47d39f13765ae4e1dc4a51fa
+size 1408956
diff --git a/examples/shadow/example1/reference_image_3.png b/examples/shadow/example1/reference_image_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..afbbba680d84e5f1733a38c09ef6fb6ae98c80d9
--- /dev/null
+++ b/examples/shadow/example1/reference_image_3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:774225c60774d30f33b67277cbee364cc4399a193734fccc58c6e721ba266910
+size 1287235
diff --git a/examples/shadow/example1/reference_image_4.png b/examples/shadow/example1/reference_image_4.png
new file mode 100644
index 0000000000000000000000000000000000000000..200e074ebcbfb3f0ee40261f3bacb00b3144081d
--- /dev/null
+++ b/examples/shadow/example1/reference_image_4.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6344bb8f8ea3763ff85e82c002ef0d77cbbaed73d34214da9f925343686a7cdb
+size 1350148
diff --git a/examples/shadow/example1/reference_image_5.png b/examples/shadow/example1/reference_image_5.png
new file mode 100644
index 0000000000000000000000000000000000000000..195862913867bb4b5b8ea1df7d829ebe3b1ee5b7
--- /dev/null
+++ b/examples/shadow/example1/reference_image_5.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e9c90c67279fe3fe24aae04b648138c8ca61d92f60852b93dfde8731f14028b
+size 1183988
diff --git a/examples/shadow/example2/input.png b/examples/shadow/example2/input.png
new file mode 100644
index 0000000000000000000000000000000000000000..f2c5d19a15d3ff452c4d3647942e1f613db79530
--- /dev/null
+++ b/examples/shadow/example2/input.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c0834bf5a2cc2abf5f647b5d7d7a81dd590293ff35d9f73b1f03b5f82544237
+size 642863
diff --git a/examples/shadow/example2/reference_image_0.png b/examples/shadow/example2/reference_image_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..6e88f6372daa248eedbd5dd21087d237c909ef89
--- /dev/null
+++ b/examples/shadow/example2/reference_image_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ca74e55996ef528ed7d9bc7a45452ba9ae38f61cd0bb4196baa3143d1084aaf
+size 722492
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..06b3d33dbd6b121913810ee920534bcc06f1d0f8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,19 @@
+-e ./diffusers
+torch==2.5.1
+torchvision==0.20.1
+accelerate==1.5.2
+datasets==3.2.0
+einops==0.8.1
+gradio==5.22.0
+gradio_client==1.8.0
+huggingface-hub==0.30.2
+matplotlib==3.10.1
+numpy==1.26.4
+opencv-python==4.11.0.86
+peft==0.15.0
+Pillow==11.1.0
+safetensors==0.5.3
+sentencepiece==0.2.0
+torchaudio==2.5.1
+tqdm==4.67.1
+transformers==4.48.3
\ No newline at end of file